aboutsummaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/802/p8023.c3
-rw-r--r--net/Makefile2
-rw-r--r--net/ax25/af_ax25.c6
-rw-r--r--net/ax25/ax25_in.c6
-rw-r--r--net/ax25/ax25_route.c19
-rw-r--r--net/bluetooth/af_bluetooth.c12
-rw-r--r--net/bluetooth/hci_core.c2
-rw-r--r--net/bluetooth/hci_event.c6
-rw-r--r--net/bluetooth/hci_sock.c2
-rw-r--r--net/bluetooth/hci_sysfs.c4
-rw-r--r--net/bluetooth/hidp/core.c4
-rw-r--r--net/bluetooth/l2cap.c98
-rw-r--r--net/bluetooth/rfcomm/core.c124
-rw-r--r--net/bluetooth/rfcomm/sock.c90
-rw-r--r--net/bluetooth/sco.c92
-rw-r--r--net/core/datagram.c21
-rw-r--r--net/core/dev.c12
-rw-r--r--net/core/dev_mcast.c3
-rw-r--r--net/core/netpoll.c18
-rw-r--r--net/core/rtnetlink.c83
-rw-r--r--net/core/skbuff.c15
-rw-r--r--net/core/sock.c3
-rw-r--r--net/core/stream.c12
-rw-r--r--net/dccp/ipv4.c38
-rw-r--r--net/dccp/proto.c3
-rw-r--r--net/decnet/af_decnet.c14
-rw-r--r--net/decnet/dn_table.c14
-rw-r--r--net/ethernet/pe2.c3
-rw-r--r--net/ieee80211/ieee80211_crypt.c153
-rw-r--r--net/ieee80211/ieee80211_crypt_ccmp.c1
-rw-r--r--net/ieee80211/ieee80211_crypt_tkip.c1
-rw-r--r--net/ieee80211/ieee80211_crypt_wep.c1
-rw-r--r--net/ieee80211/ieee80211_geo.c1
-rw-r--r--net/ieee80211/ieee80211_module.c1
-rw-r--r--net/ieee80211/ieee80211_rx.c3
-rw-r--r--net/ieee80211/ieee80211_tx.c1
-rw-r--r--net/ieee80211/ieee80211_wx.c14
-rw-r--r--net/ipv4/af_inet.c3
-rw-r--r--net/ipv4/fib_frontend.c3
-rw-r--r--net/ipv4/icmp.c6
-rw-r--r--net/ipv4/igmp.c19
-rw-r--r--net/ipv4/inet_connection_sock.c14
-rw-r--r--net/ipv4/inet_diag.c9
-rw-r--r--net/ipv4/ip_fragment.c40
-rw-r--r--net/ipv4/ip_gre.c15
-rw-r--r--net/ipv4/ip_options.c3
-rw-r--r--net/ipv4/ip_output.c15
-rw-r--r--net/ipv4/ip_sockglue.c12
-rw-r--r--net/ipv4/ipvs/ip_vs_app.c6
-rw-r--r--net/ipv4/ipvs/ip_vs_core.c7
-rw-r--r--net/ipv4/multipath_wrandom.c10
-rw-r--r--net/ipv4/netfilter/Kconfig41
-rw-r--r--net/ipv4/netfilter/Makefile6
-rw-r--r--net/ipv4/netfilter/ip_conntrack_ftp.c4
-rw-r--r--net/ipv4/netfilter/ip_conntrack_helper_pptp.c4
-rw-r--r--net/ipv4/netfilter/ip_conntrack_irc.c4
-rw-r--r--net/ipv4/netfilter/ip_conntrack_netlink.c170
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_icmp.c26
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_tcp.c19
-rw-r--r--net/ipv4/netfilter/ip_conntrack_tftp.c4
-rw-r--r--net/ipv4/netfilter/ip_nat_core.c6
-rw-r--r--net/ipv4/netfilter/ip_nat_helper_pptp.c30
-rw-r--r--net/ipv4/netfilter/ip_nat_proto_gre.c4
-rw-r--r--net/ipv4/netfilter/ip_nat_proto_unknown.c2
-rw-r--r--net/ipv4/netfilter/ip_nat_snmp_basic.c3
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c12
-rw-r--r--net/ipv4/netfilter/ipt_CONNMARK.c23
-rw-r--r--net/ipv4/netfilter/ipt_NOTRACK.c4
-rw-r--r--net/ipv4/netfilter/ipt_connbytes.c39
-rw-r--r--net/ipv4/netfilter/ipt_connmark.c10
-rw-r--r--net/ipv4/netfilter/ipt_conntrack.c96
-rw-r--r--net/ipv4/netfilter/ipt_helper.c54
-rw-r--r--net/ipv4/netfilter/ipt_state.c6
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c571
-rw-r--r--net/ipv4/netfilter/nf_conntrack_proto_icmp.c301
-rw-r--r--net/ipv4/sysctl_net_ipv4.c8
-rw-r--r--net/ipv4/tcp.c4
-rw-r--r--net/ipv4/tcp_bic.c12
-rw-r--r--net/ipv4/tcp_cong.c40
-rw-r--r--net/ipv4/tcp_highspeed.c11
-rw-r--r--net/ipv4/tcp_htcp.c13
-rw-r--r--net/ipv4/tcp_hybla.c6
-rw-r--r--net/ipv4/tcp_input.c288
-rw-r--r--net/ipv4/tcp_ipv4.c33
-rw-r--r--net/ipv4/tcp_minisocks.c7
-rw-r--r--net/ipv4/tcp_output.c61
-rw-r--r--net/ipv4/tcp_scalable.c14
-rw-r--r--net/ipv4/tcp_timer.c4
-rw-r--r--net/ipv4/tcp_vegas.c42
-rw-r--r--net/ipv4/udp.c7
-rw-r--r--net/ipv6/addrconf.c435
-rw-r--r--net/ipv6/af_inet6.c8
-rw-r--r--net/ipv6/icmp.c21
-rw-r--r--net/ipv6/ip6_fib.c54
-rw-r--r--net/ipv6/ip6_input.c5
-rw-r--r--net/ipv6/ip6_output.c21
-rw-r--r--net/ipv6/ip6_tunnel.c7
-rw-r--r--net/ipv6/ipcomp6.c3
-rw-r--r--net/ipv6/ipv6_sockglue.c5
-rw-r--r--net/ipv6/ipv6_syms.c2
-rw-r--r--net/ipv6/netfilter/Kconfig36
-rw-r--r--net/ipv6/netfilter/Makefile6
-rw-r--r--net/ipv6/netfilter/ip6t_MARK.c6
-rw-r--r--net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c556
-rw-r--r--net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c272
-rw-r--r--net/ipv6/netfilter/nf_conntrack_reasm.c897
-rw-r--r--net/ipv6/raw.c46
-rw-r--r--net/ipv6/reassembly.c41
-rw-r--r--net/ipv6/route.c8
-rw-r--r--net/ipv6/tcp_ipv6.c35
-rw-r--r--net/ipv6/udp.c25
-rw-r--r--net/irda/discovery.c3
-rw-r--r--net/irda/irias_object.c16
-rw-r--r--net/llc/af_llc.c5
-rw-r--r--net/llc/llc_c_ac.c20
-rw-r--r--net/netfilter/Kconfig74
-rw-r--r--net/netfilter/Makefile8
-rw-r--r--net/netfilter/nf_conntrack_core.c1545
-rw-r--r--net/netfilter/nf_conntrack_ftp.c698
-rw-r--r--net/netfilter/nf_conntrack_l3proto_generic.c98
-rw-r--r--net/netfilter/nf_conntrack_proto_generic.c85
-rw-r--r--net/netfilter/nf_conntrack_proto_sctp.c670
-rw-r--r--net/netfilter/nf_conntrack_proto_tcp.c1169
-rw-r--r--net/netfilter/nf_conntrack_proto_udp.c216
-rw-r--r--net/netfilter/nf_conntrack_standalone.c869
-rw-r--r--net/netfilter/nf_queue.c2
-rw-r--r--net/netfilter/nfnetlink.c19
-rw-r--r--net/netfilter/nfnetlink_log.c12
-rw-r--r--net/netfilter/nfnetlink_queue.c15
-rw-r--r--net/netlink/Makefile2
-rw-r--r--net/netlink/af_netlink.c97
-rw-r--r--net/netlink/attr.c328
-rw-r--r--net/netlink/genetlink.c579
-rw-r--r--net/rose/rose_route.c6
-rw-r--r--net/rxrpc/transport.c15
-rw-r--r--net/sched/Kconfig37
-rw-r--r--net/sched/cls_fw.c3
-rw-r--r--net/sched/cls_route.c3
-rw-r--r--net/sched/cls_rsvp.h3
-rw-r--r--net/sched/cls_tcindex.c9
-rw-r--r--net/sched/cls_u32.c4
-rw-r--r--net/sched/em_meta.c3
-rw-r--r--net/sched/ematch.c5
-rw-r--r--net/sched/sch_gred.c841
-rw-r--r--net/sched/sch_netem.c122
-rw-r--r--net/sched/sch_red.c418
-rw-r--r--net/sctp/associola.c37
-rw-r--r--net/sctp/endpointola.c26
-rw-r--r--net/sctp/input.c20
-rw-r--r--net/sctp/protocol.c6
-rw-r--r--net/sctp/sm_make_chunk.c6
-rw-r--r--net/sctp/sm_sideeffect.c6
-rw-r--r--net/sctp/sm_statefuns.c22
-rw-r--r--net/sctp/socket.c5
-rw-r--r--net/sctp/sysctl.c8
-rw-r--r--net/sctp/ulpevent.c24
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_seal.c2
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_unseal.c2
-rw-r--r--net/sunrpc/auth_gss/gss_mech_switch.c3
-rw-r--r--net/sunrpc/auth_gss/gss_spkm3_seal.c3
-rw-r--r--net/sunrpc/auth_gss/gss_spkm3_token.c3
-rw-r--r--net/sunrpc/auth_gss/gss_spkm3_unseal.c6
-rw-r--r--net/sunrpc/clnt.c32
-rw-r--r--net/sunrpc/rpc_pipe.c6
-rw-r--r--net/sunrpc/socklib.c5
-rw-r--r--net/sunrpc/svc.c21
-rw-r--r--net/sunrpc/svcsock.c10
-rw-r--r--net/sunrpc/xdr.c3
-rw-r--r--net/unix/af_unix.c2
-rw-r--r--net/wanrouter/af_wanpipe.c20
-rw-r--r--net/wanrouter/wanmain.c12
-rw-r--r--net/xfrm/xfrm_state.c12
-rw-r--r--net/xfrm/xfrm_user.c69
173 files changed, 11320 insertions, 2575 deletions
diff --git a/net/802/p8023.c b/net/802/p8023.c
index 6368d3dce44..d23e906456e 100644
--- a/net/802/p8023.c
+++ b/net/802/p8023.c
@@ -54,8 +54,7 @@ struct datalink_proto *make_8023_client(void)
*/
void destroy_8023_client(struct datalink_proto *dl)
{
- if (dl)
- kfree(dl);
+ kfree(dl);
}
EXPORT_SYMBOL(destroy_8023_client);
diff --git a/net/Makefile b/net/Makefile
index 4aa2f46d2a5..f5141b9d4f3 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -15,8 +15,8 @@ obj-$(CONFIG_NET) += $(tmp-y)
# LLC has to be linked before the files in net/802/
obj-$(CONFIG_LLC) += llc/
obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/
-obj-$(CONFIG_INET) += ipv4/
obj-$(CONFIG_NETFILTER) += netfilter/
+obj-$(CONFIG_INET) += ipv4/
obj-$(CONFIG_XFRM) += xfrm/
obj-$(CONFIG_UNIX) += unix/
ifneq ($(CONFIG_IPV6),)
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 8e37e71e34f..1b683f30265 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -1138,10 +1138,8 @@ static int ax25_connect(struct socket *sock, struct sockaddr *uaddr,
sk->sk_state = TCP_CLOSE;
sock->state = SS_UNCONNECTED;
- if (ax25->digipeat != NULL) {
- kfree(ax25->digipeat);
- ax25->digipeat = NULL;
- }
+ kfree(ax25->digipeat);
+ ax25->digipeat = NULL;
/*
* Handle digi-peaters to be used.
diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c
index 73cfc3411c4..4cf87540fb3 100644
--- a/net/ax25/ax25_in.c
+++ b/net/ax25/ax25_in.c
@@ -401,10 +401,8 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev,
}
if (dp.ndigi == 0) {
- if (ax25->digipeat != NULL) {
- kfree(ax25->digipeat);
- ax25->digipeat = NULL;
- }
+ kfree(ax25->digipeat);
+ ax25->digipeat = NULL;
} else {
/* Reverse the source SABM's path */
memcpy(ax25->digipeat, &reverse_dp, sizeof(ax25_digi));
diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c
index 26b77d97222..b1e945bd6ed 100644
--- a/net/ax25/ax25_route.c
+++ b/net/ax25/ax25_route.c
@@ -54,15 +54,13 @@ void ax25_rt_device_down(struct net_device *dev)
if (s->dev == dev) {
if (ax25_route_list == s) {
ax25_route_list = s->next;
- if (s->digipeat != NULL)
- kfree(s->digipeat);
+ kfree(s->digipeat);
kfree(s);
} else {
for (t = ax25_route_list; t != NULL; t = t->next) {
if (t->next == s) {
t->next = s->next;
- if (s->digipeat != NULL)
- kfree(s->digipeat);
+ kfree(s->digipeat);
kfree(s);
break;
}
@@ -90,10 +88,8 @@ static int ax25_rt_add(struct ax25_routes_struct *route)
while (ax25_rt != NULL) {
if (ax25cmp(&ax25_rt->callsign, &route->dest_addr) == 0 &&
ax25_rt->dev == ax25_dev->dev) {
- if (ax25_rt->digipeat != NULL) {
- kfree(ax25_rt->digipeat);
- ax25_rt->digipeat = NULL;
- }
+ kfree(ax25_rt->digipeat);
+ ax25_rt->digipeat = NULL;
if (route->digi_count != 0) {
if ((ax25_rt->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) {
write_unlock(&ax25_route_lock);
@@ -145,8 +141,7 @@ static int ax25_rt_add(struct ax25_routes_struct *route)
static void ax25_rt_destroy(ax25_route *ax25_rt)
{
if (atomic_read(&ax25_rt->ref) == 0) {
- if (ax25_rt->digipeat != NULL)
- kfree(ax25_rt->digipeat);
+ kfree(ax25_rt->digipeat);
kfree(ax25_rt);
return;
}
@@ -530,9 +525,7 @@ void __exit ax25_rt_free(void)
s = ax25_rt;
ax25_rt = ax25_rt->next;
- if (s->digipeat != NULL)
- kfree(s->digipeat);
-
+ kfree(s->digipeat);
kfree(s);
}
write_unlock(&ax25_route_lock);
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 03532062a46..ea616e3fc98 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -36,7 +36,6 @@
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/poll.h>
-#include <linux/proc_fs.h>
#include <net/sock.h>
#if defined(CONFIG_KMOD)
@@ -50,10 +49,7 @@
#define BT_DBG(D...)
#endif
-#define VERSION "2.7"
-
-struct proc_dir_entry *proc_bt;
-EXPORT_SYMBOL(proc_bt);
+#define VERSION "2.8"
/* Bluetooth sockets */
#define BT_MAX_PROTO 8
@@ -312,10 +308,6 @@ static int __init bt_init(void)
{
BT_INFO("Core ver %s", VERSION);
- proc_bt = proc_mkdir("bluetooth", NULL);
- if (proc_bt)
- proc_bt->owner = THIS_MODULE;
-
sock_register(&bt_sock_family_ops);
BT_INFO("HCI device and connection manager initialized");
@@ -334,8 +326,6 @@ static void __exit bt_exit(void)
bt_sysfs_cleanup();
sock_unregister(PF_BLUETOOTH);
-
- remove_proc_entry("bluetooth", NULL);
}
subsys_initcall(bt_init);
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index cf0df1c8c93..9106354c781 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -183,7 +183,7 @@ static void hci_reset_req(struct hci_dev *hdev, unsigned long opt)
static void hci_init_req(struct hci_dev *hdev, unsigned long opt)
{
struct sk_buff *skb;
- __u16 param;
+ __le16 param;
BT_DBG("%s %ld", hdev->name, opt);
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index b61b4e8e36f..eb64555d1fb 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -242,7 +242,7 @@ static void hci_cc_host_ctl(struct hci_dev *hdev, __u16 ocf, struct sk_buff *skb
break;
status = *((__u8 *) skb->data);
- setting = __le16_to_cpu(get_unaligned((__u16 *) sent));
+ setting = __le16_to_cpu(get_unaligned((__le16 *) sent));
if (!status && hdev->voice_setting != setting) {
hdev->voice_setting = setting;
@@ -728,7 +728,7 @@ static inline void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff
static inline void hci_num_comp_pkts_evt(struct hci_dev *hdev, struct sk_buff *skb)
{
struct hci_ev_num_comp_pkts *ev = (struct hci_ev_num_comp_pkts *) skb->data;
- __u16 *ptr;
+ __le16 *ptr;
int i;
skb_pull(skb, sizeof(*ev));
@@ -742,7 +742,7 @@ static inline void hci_num_comp_pkts_evt(struct hci_dev *hdev, struct sk_buff *s
tasklet_disable(&hdev->tx_task);
- for (i = 0, ptr = (__u16 *) skb->data; i < ev->num_hndl; i++) {
+ for (i = 0, ptr = (__le16 *) skb->data; i < ev->num_hndl; i++) {
struct hci_conn *conn;
__u16 handle, count;
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index 799e448750a..1d6d0a15c09 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -416,7 +416,7 @@ static int hci_sock_sendmsg(struct kiocb *iocb, struct socket *sock,
skb->dev = (void *) hdev;
if (bt_cb(skb)->pkt_type == HCI_COMMAND_PKT) {
- u16 opcode = __le16_to_cpu(get_unaligned((u16 *)skb->data));
+ u16 opcode = __le16_to_cpu(get_unaligned((__le16 *) skb->data));
u16 ogf = hci_opcode_ogf(opcode);
u16 ocf = hci_opcode_ocf(opcode);
diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c
index 7856bc26acc..bd7568ac87f 100644
--- a/net/bluetooth/hci_sysfs.c
+++ b/net/bluetooth/hci_sysfs.c
@@ -103,7 +103,7 @@ static void bt_release(struct class_device *cdev)
kfree(hdev);
}
-static struct class bt_class = {
+struct class bt_class = {
.name = "bluetooth",
.release = bt_release,
#ifdef CONFIG_HOTPLUG
@@ -111,6 +111,8 @@ static struct class bt_class = {
#endif
};
+EXPORT_SYMBOL_GPL(bt_class);
+
int hci_register_sysfs(struct hci_dev *hdev)
{
struct class_device *cdev = &hdev->class_dev;
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index 860444a7fc0..cdb9cfafd96 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -660,9 +660,7 @@ unlink:
failed:
up_write(&hidp_session_sem);
- if (session->input)
- kfree(session->input);
-
+ kfree(session->input);
kfree(session);
return err;
}
diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c
index 59b2dd36baa..e3bb11ca423 100644
--- a/net/bluetooth/l2cap.c
+++ b/net/bluetooth/l2cap.c
@@ -38,9 +38,8 @@
#include <linux/interrupt.h>
#include <linux/socket.h>
#include <linux/skbuff.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
#include <linux/list.h>
+#include <linux/device.h>
#include <net/sock.h>
#include <asm/system.h>
@@ -56,7 +55,7 @@
#define BT_DBG(D...)
#endif
-#define VERSION "2.7"
+#define VERSION "2.8"
static struct proto_ops l2cap_sock_ops;
@@ -2137,94 +2136,29 @@ drop:
return 0;
}
-/* ---- Proc fs support ---- */
-#ifdef CONFIG_PROC_FS
-static void *l2cap_seq_start(struct seq_file *seq, loff_t *pos)
+static ssize_t l2cap_sysfs_show(struct class *dev, char *buf)
{
struct sock *sk;
struct hlist_node *node;
- loff_t l = *pos;
+ char *str = buf;
read_lock_bh(&l2cap_sk_list.lock);
- sk_for_each(sk, node, &l2cap_sk_list.head)
- if (!l--)
- goto found;
- sk = NULL;
-found:
- return sk;
-}
+ sk_for_each(sk, node, &l2cap_sk_list.head) {
+ struct l2cap_pinfo *pi = l2cap_pi(sk);
-static void *l2cap_seq_next(struct seq_file *seq, void *e, loff_t *pos)
-{
- (*pos)++;
- return sk_next(e);
-}
+ str += sprintf(str, "%s %s %d %d 0x%4.4x 0x%4.4x %d %d 0x%x\n",
+ batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst),
+ sk->sk_state, pi->psm, pi->scid, pi->dcid, pi->imtu,
+ pi->omtu, pi->link_mode);
+ }
-static void l2cap_seq_stop(struct seq_file *seq, void *e)
-{
read_unlock_bh(&l2cap_sk_list.lock);
-}
-static int l2cap_seq_show(struct seq_file *seq, void *e)
-{
- struct sock *sk = e;
- struct l2cap_pinfo *pi = l2cap_pi(sk);
-
- seq_printf(seq, "%s %s %d %d 0x%4.4x 0x%4.4x %d %d 0x%x\n",
- batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst),
- sk->sk_state, pi->psm, pi->scid, pi->dcid, pi->imtu,
- pi->omtu, pi->link_mode);
- return 0;
+ return (str - buf);
}
-static struct seq_operations l2cap_seq_ops = {
- .start = l2cap_seq_start,
- .next = l2cap_seq_next,
- .stop = l2cap_seq_stop,
- .show = l2cap_seq_show
-};
-
-static int l2cap_seq_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &l2cap_seq_ops);
-}
-
-static struct file_operations l2cap_seq_fops = {
- .owner = THIS_MODULE,
- .open = l2cap_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
-static int __init l2cap_proc_init(void)
-{
- struct proc_dir_entry *p = create_proc_entry("l2cap", S_IRUGO, proc_bt);
- if (!p)
- return -ENOMEM;
- p->owner = THIS_MODULE;
- p->proc_fops = &l2cap_seq_fops;
- return 0;
-}
-
-static void __exit l2cap_proc_cleanup(void)
-{
- remove_proc_entry("l2cap", proc_bt);
-}
-
-#else /* CONFIG_PROC_FS */
-
-static int __init l2cap_proc_init(void)
-{
- return 0;
-}
-
-static void __exit l2cap_proc_cleanup(void)
-{
- return;
-}
-#endif /* CONFIG_PROC_FS */
+static CLASS_ATTR(l2cap, S_IRUGO, l2cap_sysfs_show, NULL);
static struct proto_ops l2cap_sock_ops = {
.family = PF_BLUETOOTH,
@@ -2266,7 +2200,7 @@ static struct hci_proto l2cap_hci_proto = {
static int __init l2cap_init(void)
{
int err;
-
+
err = proto_register(&l2cap_proto, 0);
if (err < 0)
return err;
@@ -2284,7 +2218,7 @@ static int __init l2cap_init(void)
goto error;
}
- l2cap_proc_init();
+ class_create_file(&bt_class, &class_attr_l2cap);
BT_INFO("L2CAP ver %s", VERSION);
BT_INFO("L2CAP socket layer initialized");
@@ -2298,7 +2232,7 @@ error:
static void __exit l2cap_exit(void)
{
- l2cap_proc_cleanup();
+ class_remove_file(&bt_class, &class_attr_l2cap);
if (bt_sock_unregister(BTPROTO_L2CAP) < 0)
BT_ERR("L2CAP socket unregistration failed");
diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c
index c3d56ead840..0d89d643413 100644
--- a/net/bluetooth/rfcomm/core.c
+++ b/net/bluetooth/rfcomm/core.c
@@ -35,9 +35,8 @@
#include <linux/signal.h>
#include <linux/init.h>
#include <linux/wait.h>
+#include <linux/device.h>
#include <linux/net.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
#include <net/sock.h>
#include <asm/uaccess.h>
#include <asm/unaligned.h>
@@ -47,17 +46,13 @@
#include <net/bluetooth/l2cap.h>
#include <net/bluetooth/rfcomm.h>
-#define VERSION "1.5"
+#define VERSION "1.6"
#ifndef CONFIG_BT_RFCOMM_DEBUG
#undef BT_DBG
#define BT_DBG(D...)
#endif
-#ifdef CONFIG_PROC_FS
-struct proc_dir_entry *proc_bt_rfcomm;
-#endif
-
static struct task_struct *rfcomm_thread;
static DECLARE_MUTEX(rfcomm_sem);
@@ -2001,117 +1996,32 @@ static struct hci_cb rfcomm_cb = {
.encrypt_cfm = rfcomm_encrypt_cfm
};
-/* ---- Proc fs support ---- */
-#ifdef CONFIG_PROC_FS
-static void *rfcomm_seq_start(struct seq_file *seq, loff_t *pos)
+static ssize_t rfcomm_dlc_sysfs_show(struct class *dev, char *buf)
{
struct rfcomm_session *s;
struct list_head *pp, *p;
- loff_t l = *pos;
+ char *str = buf;
rfcomm_lock();
list_for_each(p, &session_list) {
s = list_entry(p, struct rfcomm_session, list);
- list_for_each(pp, &s->dlcs)
- if (!l--) {
- seq->private = s;
- return pp;
- }
- }
- return NULL;
-}
+ list_for_each(pp, &s->dlcs) {
+ struct sock *sk = s->sock->sk;
+ struct rfcomm_dlc *d = list_entry(pp, struct rfcomm_dlc, list);
-static void *rfcomm_seq_next(struct seq_file *seq, void *e, loff_t *pos)
-{
- struct rfcomm_session *s = seq->private;
- struct list_head *pp, *p = e;
- (*pos)++;
-
- if (p->next != &s->dlcs)
- return p->next;
-
- list_for_each(p, &session_list) {
- s = list_entry(p, struct rfcomm_session, list);
- __list_for_each(pp, &s->dlcs) {
- seq->private = s;
- return pp;
+ str += sprintf(str, "%s %s %ld %d %d %d %d\n",
+ batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst),
+ d->state, d->dlci, d->mtu, d->rx_credits, d->tx_credits);
}
}
- return NULL;
-}
-static void rfcomm_seq_stop(struct seq_file *seq, void *e)
-{
rfcomm_unlock();
-}
-
-static int rfcomm_seq_show(struct seq_file *seq, void *e)
-{
- struct rfcomm_session *s = seq->private;
- struct sock *sk = s->sock->sk;
- struct rfcomm_dlc *d = list_entry(e, struct rfcomm_dlc, list);
-
- seq_printf(seq, "%s %s %ld %d %d %d %d\n",
- batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst),
- d->state, d->dlci, d->mtu, d->rx_credits, d->tx_credits);
- return 0;
-}
-
-static struct seq_operations rfcomm_seq_ops = {
- .start = rfcomm_seq_start,
- .next = rfcomm_seq_next,
- .stop = rfcomm_seq_stop,
- .show = rfcomm_seq_show
-};
-
-static int rfcomm_seq_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &rfcomm_seq_ops);
-}
-
-static struct file_operations rfcomm_seq_fops = {
- .owner = THIS_MODULE,
- .open = rfcomm_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
-static int __init rfcomm_proc_init(void)
-{
- struct proc_dir_entry *p;
-
- proc_bt_rfcomm = proc_mkdir("rfcomm", proc_bt);
- if (proc_bt_rfcomm) {
- proc_bt_rfcomm->owner = THIS_MODULE;
-
- p = create_proc_entry("dlc", S_IRUGO, proc_bt_rfcomm);
- if (p)
- p->proc_fops = &rfcomm_seq_fops;
- }
- return 0;
-}
-
-static void __exit rfcomm_proc_cleanup(void)
-{
- remove_proc_entry("dlc", proc_bt_rfcomm);
- remove_proc_entry("rfcomm", proc_bt);
+ return (str - buf);
}
-#else /* CONFIG_PROC_FS */
-
-static int __init rfcomm_proc_init(void)
-{
- return 0;
-}
-
-static void __exit rfcomm_proc_cleanup(void)
-{
- return;
-}
-#endif /* CONFIG_PROC_FS */
+static CLASS_ATTR(rfcomm_dlc, S_IRUGO, rfcomm_dlc_sysfs_show, NULL);
/* ---- Initialization ---- */
static int __init rfcomm_init(void)
@@ -2122,9 +2032,7 @@ static int __init rfcomm_init(void)
kernel_thread(rfcomm_run, NULL, CLONE_KERNEL);
- BT_INFO("RFCOMM ver %s", VERSION);
-
- rfcomm_proc_init();
+ class_create_file(&bt_class, &class_attr_rfcomm_dlc);
rfcomm_init_sockets();
@@ -2132,11 +2040,15 @@ static int __init rfcomm_init(void)
rfcomm_init_ttys();
#endif
+ BT_INFO("RFCOMM ver %s", VERSION);
+
return 0;
}
static void __exit rfcomm_exit(void)
{
+ class_remove_file(&bt_class, &class_attr_rfcomm_dlc);
+
hci_unregister_cb(&rfcomm_cb);
/* Terminate working thread.
@@ -2153,8 +2065,6 @@ static void __exit rfcomm_exit(void)
#endif
rfcomm_cleanup_sockets();
-
- rfcomm_proc_cleanup();
}
module_init(rfcomm_init);
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index a2b30f0aedb..6c34261b232 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -42,8 +42,7 @@
#include <linux/socket.h>
#include <linux/skbuff.h>
#include <linux/list.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
+#include <linux/device.h>
#include <net/sock.h>
#include <asm/system.h>
@@ -887,89 +886,26 @@ done:
return result;
}
-/* ---- Proc fs support ---- */
-#ifdef CONFIG_PROC_FS
-static void *rfcomm_seq_start(struct seq_file *seq, loff_t *pos)
+static ssize_t rfcomm_sock_sysfs_show(struct class *dev, char *buf)
{
struct sock *sk;
struct hlist_node *node;
- loff_t l = *pos;
+ char *str = buf;
read_lock_bh(&rfcomm_sk_list.lock);
- sk_for_each(sk, node, &rfcomm_sk_list.head)
- if (!l--)
- return sk;
- return NULL;
-}
-
-static void *rfcomm_seq_next(struct seq_file *seq, void *e, loff_t *pos)
-{
- struct sock *sk = e;
- (*pos)++;
- return sk_next(sk);
-}
+ sk_for_each(sk, node, &rfcomm_sk_list.head) {
+ str += sprintf(str, "%s %s %d %d\n",
+ batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst),
+ sk->sk_state, rfcomm_pi(sk)->channel);
+ }
-static void rfcomm_seq_stop(struct seq_file *seq, void *e)
-{
read_unlock_bh(&rfcomm_sk_list.lock);
-}
-static int rfcomm_seq_show(struct seq_file *seq, void *e)
-{
- struct sock *sk = e;
- seq_printf(seq, "%s %s %d %d\n",
- batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst),
- sk->sk_state, rfcomm_pi(sk)->channel);
- return 0;
-}
-
-static struct seq_operations rfcomm_seq_ops = {
- .start = rfcomm_seq_start,
- .next = rfcomm_seq_next,
- .stop = rfcomm_seq_stop,
- .show = rfcomm_seq_show
-};
-
-static int rfcomm_seq_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &rfcomm_seq_ops);
+ return (str - buf);
}
-static struct file_operations rfcomm_seq_fops = {
- .owner = THIS_MODULE,
- .open = rfcomm_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
-static int __init rfcomm_sock_proc_init(void)
-{
- struct proc_dir_entry *p = create_proc_entry("sock", S_IRUGO, proc_bt_rfcomm);
- if (!p)
- return -ENOMEM;
- p->proc_fops = &rfcomm_seq_fops;
- return 0;
-}
-
-static void __exit rfcomm_sock_proc_cleanup(void)
-{
- remove_proc_entry("sock", proc_bt_rfcomm);
-}
-
-#else /* CONFIG_PROC_FS */
-
-static int __init rfcomm_sock_proc_init(void)
-{
- return 0;
-}
-
-static void __exit rfcomm_sock_proc_cleanup(void)
-{
- return;
-}
-#endif /* CONFIG_PROC_FS */
+static CLASS_ATTR(rfcomm, S_IRUGO, rfcomm_sock_sysfs_show, NULL);
static struct proto_ops rfcomm_sock_ops = {
.family = PF_BLUETOOTH,
@@ -997,7 +933,7 @@ static struct net_proto_family rfcomm_sock_family_ops = {
.create = rfcomm_sock_create
};
-int __init rfcomm_init_sockets(void)
+int __init rfcomm_init_sockets(void)
{
int err;
@@ -1009,7 +945,7 @@ int __init rfcomm_init_sockets(void)
if (err < 0)
goto error;
- rfcomm_sock_proc_init();
+ class_create_file(&bt_class, &class_attr_rfcomm);
BT_INFO("RFCOMM socket layer initialized");
@@ -1023,7 +959,7 @@ error:
void __exit rfcomm_cleanup_sockets(void)
{
- rfcomm_sock_proc_cleanup();
+ class_remove_file(&bt_class, &class_attr_rfcomm);
if (bt_sock_unregister(BTPROTO_RFCOMM) < 0)
BT_ERR("RFCOMM socket layer unregistration failed");
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 997e42df115..9cb00dc6c08 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -38,8 +38,7 @@
#include <linux/interrupt.h>
#include <linux/socket.h>
#include <linux/skbuff.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
+#include <linux/device.h>
#include <linux/list.h>
#include <net/sock.h>
@@ -55,7 +54,7 @@
#define BT_DBG(D...)
#endif
-#define VERSION "0.4"
+#define VERSION "0.5"
static struct proto_ops sco_sock_ops;
@@ -893,91 +892,26 @@ drop:
return 0;
}
-/* ---- Proc fs support ---- */
-#ifdef CONFIG_PROC_FS
-static void *sco_seq_start(struct seq_file *seq, loff_t *pos)
+static ssize_t sco_sysfs_show(struct class *dev, char *buf)
{
struct sock *sk;
struct hlist_node *node;
- loff_t l = *pos;
+ char *str = buf;
read_lock_bh(&sco_sk_list.lock);
- sk_for_each(sk, node, &sco_sk_list.head)
- if (!l--)
- goto found;
- sk = NULL;
-found:
- return sk;
-}
-
-static void *sco_seq_next(struct seq_file *seq, void *e, loff_t *pos)
-{
- struct sock *sk = e;
- (*pos)++;
- return sk_next(sk);
-}
+ sk_for_each(sk, node, &sco_sk_list.head) {
+ str += sprintf(str, "%s %s %d\n",
+ batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst),
+ sk->sk_state);
+ }
-static void sco_seq_stop(struct seq_file *seq, void *e)
-{
read_unlock_bh(&sco_sk_list.lock);
-}
-
-static int sco_seq_show(struct seq_file *seq, void *e)
-{
- struct sock *sk = e;
- seq_printf(seq, "%s %s %d\n",
- batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst), sk->sk_state);
- return 0;
-}
-static struct seq_operations sco_seq_ops = {
- .start = sco_seq_start,
- .next = sco_seq_next,
- .stop = sco_seq_stop,
- .show = sco_seq_show
-};
-
-static int sco_seq_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &sco_seq_ops);
+ return (str - buf);
}
-static struct file_operations sco_seq_fops = {
- .owner = THIS_MODULE,
- .open = sco_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
-static int __init sco_proc_init(void)
-{
- struct proc_dir_entry *p = create_proc_entry("sco", S_IRUGO, proc_bt);
- if (!p)
- return -ENOMEM;
- p->owner = THIS_MODULE;
- p->proc_fops = &sco_seq_fops;
- return 0;
-}
-
-static void __exit sco_proc_cleanup(void)
-{
- remove_proc_entry("sco", proc_bt);
-}
-
-#else /* CONFIG_PROC_FS */
-
-static int __init sco_proc_init(void)
-{
- return 0;
-}
-
-static void __exit sco_proc_cleanup(void)
-{
- return;
-}
-#endif /* CONFIG_PROC_FS */
+static CLASS_ATTR(sco, S_IRUGO, sco_sysfs_show, NULL);
static struct proto_ops sco_sock_ops = {
.family = PF_BLUETOOTH,
@@ -1035,7 +969,7 @@ static int __init sco_init(void)
goto error;
}
- sco_proc_init();
+ class_create_file(&bt_class, &class_attr_sco);
BT_INFO("SCO (Voice Link) ver %s", VERSION);
BT_INFO("SCO socket layer initialized");
@@ -1049,7 +983,7 @@ error:
static void __exit sco_exit(void)
{
- sco_proc_cleanup();
+ class_remove_file(&bt_class, &class_attr_sco);
if (bt_sock_unregister(BTPROTO_SCO) < 0)
BT_ERR("SCO socket unregistration failed");
diff --git a/net/core/datagram.c b/net/core/datagram.c
index d219435d086..1bcfef51ac5 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -350,6 +350,20 @@ fault:
return -EFAULT;
}
+unsigned int __skb_checksum_complete(struct sk_buff *skb)
+{
+ unsigned int sum;
+
+ sum = (u16)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum));
+ if (likely(!sum)) {
+ if (unlikely(skb->ip_summed == CHECKSUM_HW))
+ netdev_rx_csum_fault(skb->dev);
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+ return sum;
+}
+EXPORT_SYMBOL(__skb_checksum_complete);
+
/**
* skb_copy_and_csum_datagram_iovec - Copy and checkum skb to user iovec.
* @skb: skbuff
@@ -363,7 +377,7 @@ fault:
* -EFAULT - fault during copy. Beware, in this case iovec
* can be modified!
*/
-int skb_copy_and_csum_datagram_iovec(const struct sk_buff *skb,
+int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb,
int hlen, struct iovec *iov)
{
unsigned int csum;
@@ -376,8 +390,7 @@ int skb_copy_and_csum_datagram_iovec(const struct sk_buff *skb,
iov++;
if (iov->iov_len < chunk) {
- if ((unsigned short)csum_fold(skb_checksum(skb, 0, chunk + hlen,
- skb->csum)))
+ if (__skb_checksum_complete(skb))
goto csum_error;
if (skb_copy_datagram_iovec(skb, hlen, iov, chunk))
goto fault;
@@ -388,6 +401,8 @@ int skb_copy_and_csum_datagram_iovec(const struct sk_buff *skb,
goto fault;
if ((unsigned short)csum_fold(csum))
goto csum_error;
+ if (unlikely(skb->ip_summed == CHECKSUM_HW))
+ netdev_rx_csum_fault(skb->dev);
iov->iov_len -= chunk;
iov->iov_base += chunk;
}
diff --git a/net/core/dev.c b/net/core/dev.c
index 8d154159527..0b48e294aaf 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1108,6 +1108,18 @@ out:
return ret;
}
+/* Take action when hardware reception checksum errors are detected. */
+#ifdef CONFIG_BUG
+void netdev_rx_csum_fault(struct net_device *dev)
+{
+ if (net_ratelimit()) {
+ printk(KERN_ERR "%s: hw csum failure.\n", dev->name);
+ dump_stack();
+ }
+}
+EXPORT_SYMBOL(netdev_rx_csum_fault);
+#endif
+
#ifdef CONFIG_HIGHMEM
/* Actually, we should eliminate this check as soon as we know, that:
* 1. IOMMU is present and allows to map all the memory.
diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c
index db098ff3cd6..cb530eef0e3 100644
--- a/net/core/dev_mcast.c
+++ b/net/core/dev_mcast.c
@@ -194,8 +194,7 @@ int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl)
done:
spin_unlock_bh(&dev->xmit_lock);
- if (dmi1)
- kfree(dmi1);
+ kfree(dmi1);
return err;
}
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 802fe11efad..49424a42a2c 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -101,16 +101,20 @@ void netpoll_queue(struct sk_buff *skb)
static int checksum_udp(struct sk_buff *skb, struct udphdr *uh,
unsigned short ulen, u32 saddr, u32 daddr)
{
- if (uh->check == 0)
+ unsigned int psum;
+
+ if (uh->check == 0 || skb->ip_summed == CHECKSUM_UNNECESSARY)
return 0;
- if (skb->ip_summed == CHECKSUM_HW)
- return csum_tcpudp_magic(
- saddr, daddr, ulen, IPPROTO_UDP, skb->csum);
+ psum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);
+
+ if (skb->ip_summed == CHECKSUM_HW &&
+ !(u16)csum_fold(csum_add(psum, skb->csum)))
+ return 0;
- skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);
+ skb->csum = psum;
- return csum_fold(skb_checksum(skb, 0, skb->len, skb->csum));
+ return __skb_checksum_complete(skb);
}
/*
@@ -489,7 +493,7 @@ int __netpoll_rx(struct sk_buff *skb)
if (ulen != len)
goto out;
- if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr) < 0)
+ if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr))
goto out;
if (np->local_ip && np->local_ip != ntohl(iph->daddr))
goto out;
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 9bed7569ce3..8700379685e 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -49,6 +49,7 @@
#include <net/udp.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
+#include <net/netlink.h>
DECLARE_MUTEX(rtnl_sem);
@@ -462,11 +463,6 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change)
netlink_broadcast(rtnl, skb, 0, RTNLGRP_LINK, GFP_KERNEL);
}
-static int rtnetlink_done(struct netlink_callback *cb)
-{
- return 0;
-}
-
/* Protected by RTNL sempahore. */
static struct rtattr **rta_buf;
static int rtattr_max;
@@ -524,8 +520,6 @@ rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp)
}
if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) {
- u32 rlen;
-
if (link->dumpit == NULL)
link = &(rtnetlink_links[PF_UNSPEC][type]);
@@ -533,14 +527,11 @@ rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp)
goto err_inval;
if ((*errp = netlink_dump_start(rtnl, skb, nlh,
- link->dumpit,
- rtnetlink_done)) != 0) {
+ link->dumpit, NULL)) != 0) {
return -1;
}
- rlen = NLMSG_ALIGN(nlh->nlmsg_len);
- if (rlen > skb->len)
- rlen = skb->len;
- skb_pull(skb, rlen);
+
+ netlink_queue_skip(nlh, skb);
return -1;
}
@@ -579,75 +570,13 @@ err_inval:
return -1;
}
-/*
- * Process one packet of messages.
- * Malformed skbs with wrong lengths of messages are discarded silently.
- */
-
-static inline int rtnetlink_rcv_skb(struct sk_buff *skb)
-{
- int err;
- struct nlmsghdr * nlh;
-
- while (skb->len >= NLMSG_SPACE(0)) {
- u32 rlen;
-
- nlh = (struct nlmsghdr *)skb->data;
- if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
- return 0;
- rlen = NLMSG_ALIGN(nlh->nlmsg_len);
- if (rlen > skb->len)
- rlen = skb->len;
- if (rtnetlink_rcv_msg(skb, nlh, &err)) {
- /* Not error, but we must interrupt processing here:
- * Note, that in this case we do not pull message
- * from skb, it will be processed later.
- */
- if (err == 0)
- return -1;
- netlink_ack(skb, nlh, err);
- } else if (nlh->nlmsg_flags&NLM_F_ACK)
- netlink_ack(skb, nlh, 0);
- skb_pull(skb, rlen);
- }
-
- return 0;
-}
-
-/*
- * rtnetlink input queue processing routine:
- * - process as much as there was in the queue upon entry.
- * - feed skbs to rtnetlink_rcv_skb, until it refuse a message,
- * that will occur, when a dump started.
- */
-
static void rtnetlink_rcv(struct sock *sk, int len)
{
- unsigned int qlen = skb_queue_len(&sk->sk_receive_queue);
+ unsigned int qlen = 0;
do {
- struct sk_buff *skb;
-
rtnl_lock();
-
- if (qlen > skb_queue_len(&sk->sk_receive_queue))
- qlen = skb_queue_len(&sk->sk_receive_queue);
-
- for (; qlen; qlen--) {
- skb = skb_dequeue(&sk->sk_receive_queue);
- if (rtnetlink_rcv_skb(skb)) {
- if (skb->len)
- skb_queue_head(&sk->sk_receive_queue,
- skb);
- else {
- kfree_skb(skb);
- qlen--;
- }
- break;
- }
- kfree_skb(skb);
- }
-
+ netlink_run_queue(sk, &qlen, &rtnetlink_rcv_msg);
up(&rtnl_sem);
netdev_run_todo();
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 95501e40100..b7d13a4fff4 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -336,6 +336,9 @@ void __kfree_skb(struct sk_buff *skb)
}
#ifdef CONFIG_NETFILTER
nf_conntrack_put(skb->nfct);
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+ nf_conntrack_put_reasm(skb->nfct_reasm);
+#endif
#ifdef CONFIG_BRIDGE_NETFILTER
nf_bridge_put(skb->nf_bridge);
#endif
@@ -414,9 +417,17 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
C(nfct);
nf_conntrack_get(skb->nfct);
C(nfctinfo);
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+ C(nfct_reasm);
+ nf_conntrack_get_reasm(skb->nfct_reasm);
+#endif
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
C(ipvs_property);
#endif
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+ C(nfct_reasm);
+ nf_conntrack_get_reasm(skb->nfct_reasm);
+#endif
#ifdef CONFIG_BRIDGE_NETFILTER
C(nf_bridge);
nf_bridge_get(skb->nf_bridge);
@@ -474,6 +485,10 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
new->nfct = old->nfct;
nf_conntrack_get(old->nfct);
new->nfctinfo = old->nfctinfo;
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+ new->nfct_reasm = old->nfct_reasm;
+ nf_conntrack_get_reasm(old->nfct_reasm);
+#endif
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
new->ipvs_property = old->ipvs_property;
#endif
diff --git a/net/core/sock.c b/net/core/sock.c
index 9602ceb3bac..13cc3be4f05 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1242,8 +1242,7 @@ static void sock_def_write_space(struct sock *sk)
static void sock_def_destruct(struct sock *sk)
{
- if (sk->sk_protinfo)
- kfree(sk->sk_protinfo);
+ kfree(sk->sk_protinfo);
}
void sk_send_sigurg(struct sock *sk)
diff --git a/net/core/stream.c b/net/core/stream.c
index ac9edfdf874..15bfd03e802 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -52,8 +52,9 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
{
struct task_struct *tsk = current;
DEFINE_WAIT(wait);
+ int done;
- while (1) {
+ do {
if (sk->sk_err)
return sock_error(sk);
if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV))
@@ -65,13 +66,12 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
sk->sk_write_pending++;
- if (sk_wait_event(sk, timeo_p,
- !((1 << sk->sk_state) &
- ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))))
- break;
+ done = sk_wait_event(sk, timeo_p,
+ !((1 << sk->sk_state) &
+ ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)));
finish_wait(sk->sk_sleep, &wait);
sk->sk_write_pending--;
- }
+ } while (!done);
return 0;
}
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 6298cf58ff9..ca03521112c 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -31,8 +31,6 @@ struct inet_hashinfo __cacheline_aligned dccp_hashinfo = {
.lhash_lock = RW_LOCK_UNLOCKED,
.lhash_users = ATOMIC_INIT(0),
.lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(dccp_hashinfo.lhash_wait),
- .portalloc_lock = SPIN_LOCK_UNLOCKED,
- .port_rover = 1024 - 1,
};
EXPORT_SYMBOL_GPL(dccp_hashinfo);
@@ -125,36 +123,15 @@ static int dccp_v4_hash_connect(struct sock *sk)
int ret;
if (snum == 0) {
- int rover;
int low = sysctl_local_port_range[0];
int high = sysctl_local_port_range[1];
int remaining = (high - low) + 1;
+ int rover = net_random() % (high - low) + low;
struct hlist_node *node;
struct inet_timewait_sock *tw = NULL;
local_bh_disable();
-
- /* TODO. Actually it is not so bad idea to remove
- * dccp_hashinfo.portalloc_lock before next submission to
- * Linus.
- * As soon as we touch this place at all it is time to think.
- *
- * Now it protects single _advisory_ variable
- * dccp_hashinfo.port_rover, hence it is mostly useless.
- * Code will work nicely if we just delete it, but
- * I am afraid in contented case it will work not better or
- * even worse: another cpu just will hit the same bucket
- * and spin there.
- * So some cpu salt could remove both contention and
- * memory pingpong. Any ideas how to do this in a nice way?
- */
- spin_lock(&dccp_hashinfo.portalloc_lock);
- rover = dccp_hashinfo.port_rover;
-
do {
- rover++;
- if ((rover < low) || (rover > high))
- rover = low;
head = &dccp_hashinfo.bhash[inet_bhashfn(rover,
dccp_hashinfo.bhash_size)];
spin_lock(&head->lock);
@@ -187,9 +164,9 @@ static int dccp_v4_hash_connect(struct sock *sk)
next_port:
spin_unlock(&head->lock);
+ if (++rover > high)
+ rover = low;
} while (--remaining > 0);
- dccp_hashinfo.port_rover = rover;
- spin_unlock(&dccp_hashinfo.portalloc_lock);
local_bh_enable();
@@ -197,9 +174,6 @@ static int dccp_v4_hash_connect(struct sock *sk)
ok:
/* All locks still held and bhs disabled */
- dccp_hashinfo.port_rover = rover;
- spin_unlock(&dccp_hashinfo.portalloc_lock);
-
inet_bind_hash(sk, tb, rover);
if (sk_unhashed(sk)) {
inet_sk(sk)->sport = htons(rover);
@@ -1289,10 +1263,8 @@ static int dccp_v4_destroy_sock(struct sock *sk)
if (inet_csk(sk)->icsk_bind_hash != NULL)
inet_put_port(&dccp_hashinfo, sk);
- if (dp->dccps_service_list != NULL) {
- kfree(dp->dccps_service_list);
- dp->dccps_service_list = NULL;
- }
+ kfree(dp->dccps_service_list);
+ dp->dccps_service_list = NULL;
ccid_hc_rx_exit(dp->dccps_hc_rx_ccid, sk);
ccid_hc_tx_exit(dp->dccps_hc_tx_ccid, sk);
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index a021c3422f6..e0ace7cbb99 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -238,8 +238,7 @@ static int dccp_setsockopt_service(struct sock *sk, const u32 service,
lock_sock(sk);
dp->dccps_service = service;
- if (dp->dccps_service_list != NULL)
- kfree(dp->dccps_service_list);
+ kfree(dp->dccps_service_list);
dp->dccps_service_list = sl;
release_sock(sk);
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 3f25cadccdd..f89e55f814d 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -1664,17 +1664,15 @@ static int dn_recvmsg(struct kiocb *iocb, struct socket *sock,
goto out;
}
- rv = dn_check_state(sk, NULL, 0, &timeo, flags);
- if (rv)
- goto out;
-
if (sk->sk_shutdown & RCV_SHUTDOWN) {
- if (!(flags & MSG_NOSIGNAL))
- send_sig(SIGPIPE, current, 0);
- rv = -EPIPE;
+ rv = 0;
goto out;
}
+ rv = dn_check_state(sk, NULL, 0, &timeo, flags);
+ if (rv)
+ goto out;
+
if (flags & ~(MSG_PEEK|MSG_OOB|MSG_WAITALL|MSG_DONTWAIT|MSG_NOSIGNAL)) {
rv = -EOPNOTSUPP;
goto out;
@@ -1928,6 +1926,8 @@ static int dn_sendmsg(struct kiocb *iocb, struct socket *sock,
if (sk->sk_shutdown & SEND_SHUTDOWN) {
err = -EPIPE;
+ if (!(flags & MSG_NOSIGNAL))
+ send_sig(SIGPIPE, current, 0);
goto out_err;
}
diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c
index eeba56f9932..6f8b5658cb4 100644
--- a/net/decnet/dn_table.c
+++ b/net/decnet/dn_table.c
@@ -784,16 +784,14 @@ struct dn_fib_table *dn_fib_get_table(int n, int create)
static void dn_fib_del_tree(int n)
{
- struct dn_fib_table *t;
+ struct dn_fib_table *t;
- write_lock(&dn_fib_tables_lock);
- t = dn_fib_tables[n];
- dn_fib_tables[n] = NULL;
- write_unlock(&dn_fib_tables_lock);
+ write_lock(&dn_fib_tables_lock);
+ t = dn_fib_tables[n];
+ dn_fib_tables[n] = NULL;
+ write_unlock(&dn_fib_tables_lock);
- if (t) {
- kfree(t);
- }
+ kfree(t);
}
struct dn_fib_table *dn_fib_empty_table(void)
diff --git a/net/ethernet/pe2.c b/net/ethernet/pe2.c
index 98a494be603..9d57b4fb644 100644
--- a/net/ethernet/pe2.c
+++ b/net/ethernet/pe2.c
@@ -32,8 +32,7 @@ struct datalink_proto *make_EII_client(void)
void destroy_EII_client(struct datalink_proto *dl)
{
- if (dl)
- kfree(dl);
+ kfree(dl);
}
EXPORT_SYMBOL(destroy_EII_client);
diff --git a/net/ieee80211/ieee80211_crypt.c b/net/ieee80211/ieee80211_crypt.c
index f3b6aa3be63..ecc9bb196ab 100644
--- a/net/ieee80211/ieee80211_crypt.c
+++ b/net/ieee80211/ieee80211_crypt.c
@@ -11,16 +11,14 @@
*
*/
-#include <linux/config.h>
-#include <linux/version.h>
+#include <linux/errno.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>
-#include <asm/string.h>
-#include <asm/errno.h>
-
+#include <linux/string.h>
#include <net/ieee80211.h>
+
MODULE_AUTHOR("Jouni Malinen");
MODULE_DESCRIPTION("HostAP crypto");
MODULE_LICENSE("GPL");
@@ -30,32 +28,20 @@ struct ieee80211_crypto_alg {
struct ieee80211_crypto_ops *ops;
};
-struct ieee80211_crypto {
- struct list_head algs;
- spinlock_t lock;
-};
-
-static struct ieee80211_crypto *hcrypt;
+static LIST_HEAD(ieee80211_crypto_algs);
+static DEFINE_SPINLOCK(ieee80211_crypto_lock);
void ieee80211_crypt_deinit_entries(struct ieee80211_device *ieee, int force)
{
- struct list_head *ptr, *n;
- struct ieee80211_crypt_data *entry;
+ struct ieee80211_crypt_data *entry, *next;
unsigned long flags;
spin_lock_irqsave(&ieee->lock, flags);
-
- if (list_empty(&ieee->crypt_deinit_list))
- goto unlock;
-
- for (ptr = ieee->crypt_deinit_list.next, n = ptr->next;
- ptr != &ieee->crypt_deinit_list; ptr = n, n = ptr->next) {
- entry = list_entry(ptr, struct ieee80211_crypt_data, list);
-
+ list_for_each_entry_safe(entry, next, &ieee->crypt_deinit_list, list) {
if (atomic_read(&entry->refcnt) != 0 && !force)
continue;
- list_del(ptr);
+ list_del(&entry->list);
if (entry->ops) {
entry->ops->deinit(entry->priv);
@@ -63,7 +49,6 @@ void ieee80211_crypt_deinit_entries(struct ieee80211_device *ieee, int force)
}
kfree(entry);
}
- unlock:
spin_unlock_irqrestore(&ieee->lock, flags);
}
@@ -126,9 +111,6 @@ int ieee80211_register_crypto_ops(struct ieee80211_crypto_ops *ops)
unsigned long flags;
struct ieee80211_crypto_alg *alg;
- if (hcrypt == NULL)
- return -1;
-
alg = kmalloc(sizeof(*alg), GFP_KERNEL);
if (alg == NULL)
return -ENOMEM;
@@ -136,9 +118,9 @@ int ieee80211_register_crypto_ops(struct ieee80211_crypto_ops *ops)
memset(alg, 0, sizeof(*alg));
alg->ops = ops;
- spin_lock_irqsave(&hcrypt->lock, flags);
- list_add(&alg->list, &hcrypt->algs);
- spin_unlock_irqrestore(&hcrypt->lock, flags);
+ spin_lock_irqsave(&ieee80211_crypto_lock, flags);
+ list_add(&alg->list, &ieee80211_crypto_algs);
+ spin_unlock_irqrestore(&ieee80211_crypto_lock, flags);
printk(KERN_DEBUG "ieee80211_crypt: registered algorithm '%s'\n",
ops->name);
@@ -148,64 +130,49 @@ int ieee80211_register_crypto_ops(struct ieee80211_crypto_ops *ops)
int ieee80211_unregister_crypto_ops(struct ieee80211_crypto_ops *ops)
{
+ struct ieee80211_crypto_alg *alg;
unsigned long flags;
- struct list_head *ptr;
- struct ieee80211_crypto_alg *del_alg = NULL;
-
- if (hcrypt == NULL)
- return -1;
-
- spin_lock_irqsave(&hcrypt->lock, flags);
- for (ptr = hcrypt->algs.next; ptr != &hcrypt->algs; ptr = ptr->next) {
- struct ieee80211_crypto_alg *alg =
- (struct ieee80211_crypto_alg *)ptr;
- if (alg->ops == ops) {
- list_del(&alg->list);
- del_alg = alg;
- break;
- }
- }
- spin_unlock_irqrestore(&hcrypt->lock, flags);
- if (del_alg) {
- printk(KERN_DEBUG "ieee80211_crypt: unregistered algorithm "
- "'%s'\n", ops->name);
- kfree(del_alg);
+ spin_lock_irqsave(&ieee80211_crypto_lock, flags);
+ list_for_each_entry(alg, &ieee80211_crypto_algs, list) {
+ if (alg->ops == ops)
+ goto found;
}
-
- return del_alg ? 0 : -1;
+ spin_unlock_irqrestore(&ieee80211_crypto_lock, flags);
+ return -EINVAL;
+
+ found:
+ printk(KERN_DEBUG "ieee80211_crypt: unregistered algorithm "
+ "'%s'\n", ops->name);
+ list_del(&alg->list);
+ spin_unlock_irqrestore(&ieee80211_crypto_lock, flags);
+ kfree(alg);
+ return 0;
}
struct ieee80211_crypto_ops *ieee80211_get_crypto_ops(const char *name)
{
+ struct ieee80211_crypto_alg *alg;
unsigned long flags;
- struct list_head *ptr;
- struct ieee80211_crypto_alg *found_alg = NULL;
-
- if (hcrypt == NULL)
- return NULL;
-
- spin_lock_irqsave(&hcrypt->lock, flags);
- for (ptr = hcrypt->algs.next; ptr != &hcrypt->algs; ptr = ptr->next) {
- struct ieee80211_crypto_alg *alg =
- (struct ieee80211_crypto_alg *)ptr;
- if (strcmp(alg->ops->name, name) == 0) {
- found_alg = alg;
- break;
- }
+
+ spin_lock_irqsave(&ieee80211_crypto_lock, flags);
+ list_for_each_entry(alg, &ieee80211_crypto_algs, list) {
+ if (strcmp(alg->ops->name, name) == 0)
+ goto found;
}
- spin_unlock_irqrestore(&hcrypt->lock, flags);
+ spin_unlock_irqrestore(&ieee80211_crypto_lock, flags);
+ return NULL;
- if (found_alg)
- return found_alg->ops;
- else
- return NULL;
+ found:
+ spin_unlock_irqrestore(&ieee80211_crypto_lock, flags);
+ return alg->ops;
}
static void *ieee80211_crypt_null_init(int keyidx)
{
return (void *)1;
}
+
static void ieee80211_crypt_null_deinit(void *priv)
{
}
@@ -214,56 +181,18 @@ static struct ieee80211_crypto_ops ieee80211_crypt_null = {
.name = "NULL",
.init = ieee80211_crypt_null_init,
.deinit = ieee80211_crypt_null_deinit,
- .encrypt_mpdu = NULL,
- .decrypt_mpdu = NULL,
- .encrypt_msdu = NULL,
- .decrypt_msdu = NULL,
- .set_key = NULL,
- .get_key = NULL,
- .extra_mpdu_prefix_len = 0,
- .extra_mpdu_postfix_len = 0,
.owner = THIS_MODULE,
};
static int __init ieee80211_crypto_init(void)
{
- int ret = -ENOMEM;
-
- hcrypt = kmalloc(sizeof(*hcrypt), GFP_KERNEL);
- if (!hcrypt)
- goto out;
-
- memset(hcrypt, 0, sizeof(*hcrypt));
- INIT_LIST_HEAD(&hcrypt->algs);
- spin_lock_init(&hcrypt->lock);
-
- ret = ieee80211_register_crypto_ops(&ieee80211_crypt_null);
- if (ret < 0) {
- kfree(hcrypt);
- hcrypt = NULL;
- }
- out:
- return ret;
+ return ieee80211_register_crypto_ops(&ieee80211_crypt_null);
}
static void __exit ieee80211_crypto_deinit(void)
{
- struct list_head *ptr, *n;
-
- if (hcrypt == NULL)
- return;
-
- for (ptr = hcrypt->algs.next, n = ptr->next; ptr != &hcrypt->algs;
- ptr = n, n = ptr->next) {
- struct ieee80211_crypto_alg *alg =
- (struct ieee80211_crypto_alg *)ptr;
- list_del(ptr);
- printk(KERN_DEBUG "ieee80211_crypt: unregistered algorithm "
- "'%s' (deinit)\n", alg->ops->name);
- kfree(alg);
- }
-
- kfree(hcrypt);
+ ieee80211_unregister_crypto_ops(&ieee80211_crypt_null);
+ BUG_ON(!list_empty(&ieee80211_crypto_algs));
}
EXPORT_SYMBOL(ieee80211_crypt_deinit_entries);
diff --git a/net/ieee80211/ieee80211_crypt_ccmp.c b/net/ieee80211/ieee80211_crypt_ccmp.c
index 05a853c1301..47022172850 100644
--- a/net/ieee80211/ieee80211_crypt_ccmp.c
+++ b/net/ieee80211/ieee80211_crypt_ccmp.c
@@ -10,7 +10,6 @@
*/
#include <linux/config.h>
-#include <linux/version.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>
diff --git a/net/ieee80211/ieee80211_crypt_tkip.c b/net/ieee80211/ieee80211_crypt_tkip.c
index 2e34f29b795..e0988320efb 100644
--- a/net/ieee80211/ieee80211_crypt_tkip.c
+++ b/net/ieee80211/ieee80211_crypt_tkip.c
@@ -10,7 +10,6 @@
*/
#include <linux/config.h>
-#include <linux/version.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>
diff --git a/net/ieee80211/ieee80211_crypt_wep.c b/net/ieee80211/ieee80211_crypt_wep.c
index 7c08ed2f262..073aebdf0f6 100644
--- a/net/ieee80211/ieee80211_crypt_wep.c
+++ b/net/ieee80211/ieee80211_crypt_wep.c
@@ -10,7 +10,6 @@
*/
#include <linux/config.h>
-#include <linux/version.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>
diff --git a/net/ieee80211/ieee80211_geo.c b/net/ieee80211/ieee80211_geo.c
index c4b54ef8f6d..610cc5cbc25 100644
--- a/net/ieee80211/ieee80211_geo.c
+++ b/net/ieee80211/ieee80211_geo.c
@@ -38,7 +38,6 @@
#include <linux/slab.h>
#include <linux/tcp.h>
#include <linux/types.h>
-#include <linux/version.h>
#include <linux/wireless.h>
#include <linux/etherdevice.h>
#include <asm/uaccess.h>
diff --git a/net/ieee80211/ieee80211_module.c b/net/ieee80211/ieee80211_module.c
index f66d792cd20..321287bc887 100644
--- a/net/ieee80211/ieee80211_module.c
+++ b/net/ieee80211/ieee80211_module.c
@@ -45,7 +45,6 @@
#include <linux/slab.h>
#include <linux/tcp.h>
#include <linux/types.h>
-#include <linux/version.h>
#include <linux/wireless.h>
#include <linux/etherdevice.h>
#include <asm/uaccess.h>
diff --git a/net/ieee80211/ieee80211_rx.c b/net/ieee80211/ieee80211_rx.c
index ce694cf5c16..03efaacbdb7 100644
--- a/net/ieee80211/ieee80211_rx.c
+++ b/net/ieee80211/ieee80211_rx.c
@@ -28,7 +28,6 @@
#include <linux/slab.h>
#include <linux/tcp.h>
#include <linux/types.h>
-#include <linux/version.h>
#include <linux/wireless.h>
#include <linux/etherdevice.h>
#include <asm/uaccess.h>
@@ -370,6 +369,7 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb,
/* Put this code here so that we avoid duplicating it in all
* Rx paths. - Jean II */
#ifdef IW_WIRELESS_SPY /* defined in iw_handler.h */
+#ifdef CONFIG_NET_RADIO
/* If spy monitoring on */
if (ieee->spy_data.spy_number > 0) {
struct iw_quality wstats;
@@ -396,6 +396,7 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb,
/* Update spy records */
wireless_spy_update(ieee->dev, hdr->addr2, &wstats);
}
+#endif /* CONFIG_NET_RADIO */
#endif /* IW_WIRELESS_SPY */
#ifdef NOT_YET
diff --git a/net/ieee80211/ieee80211_tx.c b/net/ieee80211/ieee80211_tx.c
index 95ccbadbf55..445f206e65e 100644
--- a/net/ieee80211/ieee80211_tx.c
+++ b/net/ieee80211/ieee80211_tx.c
@@ -38,7 +38,6 @@
#include <linux/slab.h>
#include <linux/tcp.h>
#include <linux/types.h>
-#include <linux/version.h>
#include <linux/wireless.h>
#include <linux/etherdevice.h>
#include <asm/uaccess.h>
diff --git a/net/ieee80211/ieee80211_wx.c b/net/ieee80211/ieee80211_wx.c
index 1ce7af9bec3..181755f2aa8 100644
--- a/net/ieee80211/ieee80211_wx.c
+++ b/net/ieee80211/ieee80211_wx.c
@@ -161,9 +161,11 @@ static inline char *ipw2100_translate_scan(struct ieee80211_device *ieee,
(ieee->perfect_rssi - ieee->worst_rssi) -
(ieee->perfect_rssi - network->stats.rssi) *
(15 * (ieee->perfect_rssi - ieee->worst_rssi) +
- 62 * (ieee->perfect_rssi - network->stats.rssi))) /
- ((ieee->perfect_rssi - ieee->worst_rssi) *
- (ieee->perfect_rssi - ieee->worst_rssi));
+ 62 * (ieee->perfect_rssi -
+ network->stats.rssi))) /
+ ((ieee->perfect_rssi -
+ ieee->worst_rssi) * (ieee->perfect_rssi -
+ ieee->worst_rssi));
if (iwe.u.qual.qual > 100)
iwe.u.qual.qual = 100;
else if (iwe.u.qual.qual < 1)
@@ -520,7 +522,8 @@ int ieee80211_wx_set_encodeext(struct ieee80211_device *ieee,
crypt = &ieee->crypt[idx];
group_key = 1;
} else {
- if (idx != 0)
+ /* some Cisco APs use idx>0 for unicast in dynamic WEP */
+ if (idx != 0 && ext->alg != IW_ENCODE_ALG_WEP)
return -EINVAL;
if (ieee->iw_mode == IW_MODE_INFRA)
crypt = &ieee->crypt[idx];
@@ -688,7 +691,8 @@ int ieee80211_wx_get_encodeext(struct ieee80211_device *ieee,
} else
idx = ieee->tx_keyidx;
- if (!ext->ext_flags & IW_ENCODE_EXT_GROUP_KEY)
+ if (!ext->ext_flags & IW_ENCODE_EXT_GROUP_KEY &&
+ ext->alg != IW_ENCODE_ALG_WEP)
if (idx != 0 || ieee->iw_mode != IW_MODE_INFRA)
return -EINVAL;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index a9d84f93442..eaa150c33b0 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -147,8 +147,7 @@ void inet_sock_destruct(struct sock *sk)
BUG_TRAP(!sk->sk_wmem_queued);
BUG_TRAP(!sk->sk_forward_alloc);
- if (inet->opt)
- kfree(inet->opt);
+ kfree(inet->opt);
dst_release(sk->sk_dst_cache);
sk_refcnt_debug_dec(sk);
}
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 990633c09df..2267c1fad87 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -266,8 +266,7 @@ int ip_rt_ioctl(unsigned int cmd, void __user *arg)
if (tb)
err = tb->tb_insert(tb, &req.rtm, &rta, &req.nlh, NULL);
}
- if (rta.rta_mx)
- kfree(rta.rta_mx);
+ kfree(rta.rta_mx);
}
rtnl_unlock();
return err;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 175e093ec56..e3eceecd049 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -934,11 +934,11 @@ int icmp_rcv(struct sk_buff *skb)
case CHECKSUM_HW:
if (!(u16)csum_fold(skb->csum))
break;
- LIMIT_NETDEBUG(KERN_DEBUG "icmp v4 hw csum failure\n");
+ /* fall through */
case CHECKSUM_NONE:
- if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0)))
+ skb->csum = 0;
+ if (__skb_checksum_complete(skb))
goto error;
- default:;
}
if (!pskb_pull(skb, sizeof(struct icmphdr)))
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index c6247fc8406..c04607b4921 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -872,11 +872,18 @@ int igmp_rcv(struct sk_buff *skb)
return 0;
}
- if (!pskb_may_pull(skb, sizeof(struct igmphdr)) ||
- (u16)csum_fold(skb_checksum(skb, 0, len, 0))) {
- in_dev_put(in_dev);
- kfree_skb(skb);
- return 0;
+ if (!pskb_may_pull(skb, sizeof(struct igmphdr)))
+ goto drop;
+
+ switch (skb->ip_summed) {
+ case CHECKSUM_HW:
+ if (!(u16)csum_fold(skb->csum))
+ break;
+ /* fall through */
+ case CHECKSUM_NONE:
+ skb->csum = 0;
+ if (__skb_checksum_complete(skb))
+ goto drop;
}
ih = skb->h.igmph;
@@ -906,6 +913,8 @@ int igmp_rcv(struct sk_buff *skb)
default:
NETDEBUG(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type);
}
+
+drop:
in_dev_put(in_dev);
kfree_skb(skb);
return 0;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 94468a76c5b..3fe021f1a56 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -78,17 +78,9 @@ int inet_csk_get_port(struct inet_hashinfo *hashinfo,
int low = sysctl_local_port_range[0];
int high = sysctl_local_port_range[1];
int remaining = (high - low) + 1;
- int rover;
+ int rover = net_random() % (high - low) + low;
- spin_lock(&hashinfo->portalloc_lock);
- if (hashinfo->port_rover < low)
- rover = low;
- else
- rover = hashinfo->port_rover;
do {
- rover++;
- if (rover > high)
- rover = low;
head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)];
spin_lock(&head->lock);
inet_bind_bucket_for_each(tb, node, &head->chain)
@@ -97,9 +89,9 @@ int inet_csk_get_port(struct inet_hashinfo *hashinfo,
break;
next:
spin_unlock(&head->lock);
+ if (++rover > high)
+ rover = low;
} while (--remaining > 0);
- hashinfo->port_rover = rover;
- spin_unlock(&hashinfo->portalloc_lock);
/* Exhausted local port range during search? It is not
* possible for us to be holding one of the bind hash
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 71f3c7350c6..39061ed53cf 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -724,12 +724,6 @@ done:
return skb->len;
}
-static int inet_diag_dump_done(struct netlink_callback *cb)
-{
- return 0;
-}
-
-
static __inline__ int
inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
{
@@ -760,8 +754,7 @@ inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
goto err_inval;
}
return netlink_dump_start(idiagnl, skb, nlh,
- inet_diag_dump,
- inet_diag_dump_done);
+ inet_diag_dump, NULL);
} else {
return inet_diag_get_exact(skb, nlh);
}
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index e7d26d9943c..8ce0ce2ee48 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -71,7 +71,7 @@ struct ipfrag_skb_cb
/* Describe an entry in the "incomplete datagrams" queue. */
struct ipq {
- struct ipq *next; /* linked list pointers */
+ struct hlist_node list;
struct list_head lru_list; /* lru list member */
u32 user;
u32 saddr;
@@ -89,7 +89,6 @@ struct ipq {
spinlock_t lock;
atomic_t refcnt;
struct timer_list timer; /* when will this queue expire? */
- struct ipq **pprev;
int iif;
struct timeval stamp;
};
@@ -99,7 +98,7 @@ struct ipq {
#define IPQ_HASHSZ 64
/* Per-bucket lock is easy to add now. */
-static struct ipq *ipq_hash[IPQ_HASHSZ];
+static struct hlist_head ipq_hash[IPQ_HASHSZ];
static DEFINE_RWLOCK(ipfrag_lock);
static u32 ipfrag_hash_rnd;
static LIST_HEAD(ipq_lru_list);
@@ -107,9 +106,7 @@ int ip_frag_nqueues = 0;
static __inline__ void __ipq_unlink(struct ipq *qp)
{
- if(qp->next)
- qp->next->pprev = qp->pprev;
- *qp->pprev = qp->next;
+ hlist_del(&qp->list);
list_del(&qp->lru_list);
ip_frag_nqueues--;
}
@@ -139,27 +136,18 @@ static void ipfrag_secret_rebuild(unsigned long dummy)
get_random_bytes(&ipfrag_hash_rnd, sizeof(u32));
for (i = 0; i < IPQ_HASHSZ; i++) {
struct ipq *q;
+ struct hlist_node *p, *n;
- q = ipq_hash[i];
- while (q) {
- struct ipq *next = q->next;
+ hlist_for_each_entry_safe(q, p, n, &ipq_hash[i], list) {
unsigned int hval = ipqhashfn(q->id, q->saddr,
q->daddr, q->protocol);
if (hval != i) {
- /* Unlink. */
- if (q->next)
- q->next->pprev = q->pprev;
- *q->pprev = q->next;
+ hlist_del(&q->list);
/* Relink to new hash chain. */
- if ((q->next = ipq_hash[hval]) != NULL)
- q->next->pprev = &q->next;
- ipq_hash[hval] = q;
- q->pprev = &ipq_hash[hval];
+ hlist_add_head(&q->list, &ipq_hash[hval]);
}
-
- q = next;
}
}
write_unlock(&ipfrag_lock);
@@ -310,14 +298,16 @@ out:
static struct ipq *ip_frag_intern(unsigned int hash, struct ipq *qp_in)
{
struct ipq *qp;
-
+#ifdef CONFIG_SMP
+ struct hlist_node *n;
+#endif
write_lock(&ipfrag_lock);
#ifdef CONFIG_SMP
/* With SMP race we have to recheck hash table, because
* such entry could be created on other cpu, while we
* promoted read lock to write lock.
*/
- for(qp = ipq_hash[hash]; qp; qp = qp->next) {
+ hlist_for_each_entry(qp, n, &ipq_hash[hash], list) {
if(qp->id == qp_in->id &&
qp->saddr == qp_in->saddr &&
qp->daddr == qp_in->daddr &&
@@ -337,10 +327,7 @@ static struct ipq *ip_frag_intern(unsigned int hash, struct ipq *qp_in)
atomic_inc(&qp->refcnt);
atomic_inc(&qp->refcnt);
- if((qp->next = ipq_hash[hash]) != NULL)
- qp->next->pprev = &qp->next;
- ipq_hash[hash] = qp;
- qp->pprev = &ipq_hash[hash];
+ hlist_add_head(&qp->list, &ipq_hash[hash]);
INIT_LIST_HEAD(&qp->lru_list);
list_add_tail(&qp->lru_list, &ipq_lru_list);
ip_frag_nqueues++;
@@ -392,9 +379,10 @@ static inline struct ipq *ip_find(struct iphdr *iph, u32 user)
__u8 protocol = iph->protocol;
unsigned int hash = ipqhashfn(id, saddr, daddr, protocol);
struct ipq *qp;
+ struct hlist_node *n;
read_lock(&ipfrag_lock);
- for(qp = ipq_hash[hash]; qp; qp = qp->next) {
+ hlist_for_each_entry(qp, n, &ipq_hash[hash], list) {
if(qp->id == id &&
qp->saddr == saddr &&
qp->daddr == daddr &&
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 896ce3f8f53..4e9c74b54b1 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -577,15 +577,16 @@ static int ipgre_rcv(struct sk_buff *skb)
goto drop_nolock;
if (flags&GRE_CSUM) {
- if (skb->ip_summed == CHECKSUM_HW) {
+ switch (skb->ip_summed) {
+ case CHECKSUM_HW:
csum = (u16)csum_fold(skb->csum);
- if (csum)
- skb->ip_summed = CHECKSUM_NONE;
- }
- if (skb->ip_summed == CHECKSUM_NONE) {
- skb->csum = skb_checksum(skb, 0, skb->len, 0);
+ if (!csum)
+ break;
+ /* fall through */
+ case CHECKSUM_NONE:
+ skb->csum = 0;
+ csum = __skb_checksum_complete(skb);
skb->ip_summed = CHECKSUM_HW;
- csum = (u16)csum_fold(skb->csum);
}
offset += 4;
}
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index bce4e875193..dbe12da8d8b 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -510,8 +510,7 @@ static int ip_options_get_finish(struct ip_options **optp,
kfree(opt);
return -EINVAL;
}
- if (*optp)
- kfree(*optp);
+ kfree(*optp);
*optp = opt;
return 0;
}
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 17758234a3e..11c2f68254f 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -353,7 +353,8 @@ packet_routed:
ip_options_build(skb, opt, inet->daddr, rt, 0);
}
- ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs);
+ ip_select_ident_more(iph, &rt->u.dst, sk,
+ (skb_shinfo(skb)->tso_segs ?: 1) - 1);
/* Add an IP checksum. */
ip_send_check(iph);
@@ -1262,10 +1263,8 @@ int ip_push_pending_frames(struct sock *sk)
out:
inet->cork.flags &= ~IPCORK_OPT;
- if (inet->cork.opt) {
- kfree(inet->cork.opt);
- inet->cork.opt = NULL;
- }
+ kfree(inet->cork.opt);
+ inet->cork.opt = NULL;
if (inet->cork.rt) {
ip_rt_put(inet->cork.rt);
inet->cork.rt = NULL;
@@ -1289,10 +1288,8 @@ void ip_flush_pending_frames(struct sock *sk)
kfree_skb(skb);
inet->cork.flags &= ~IPCORK_OPT;
- if (inet->cork.opt) {
- kfree(inet->cork.opt);
- inet->cork.opt = NULL;
- }
+ kfree(inet->cork.opt);
+ inet->cork.opt = NULL;
if (inet->cork.rt) {
ip_rt_put(inet->cork.rt);
inet->cork.rt = NULL;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 2f0b47da5b3..4f2d8725730 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -202,8 +202,7 @@ int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct s
if (ra->sk == sk) {
if (on) {
write_unlock_bh(&ip_ra_lock);
- if (new_ra)
- kfree(new_ra);
+ kfree(new_ra);
return -EADDRINUSE;
}
*rap = ra->next;
@@ -446,8 +445,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
#endif
}
opt = xchg(&inet->opt, opt);
- if (opt)
- kfree(opt);
+ kfree(opt);
break;
}
case IP_PKTINFO:
@@ -828,10 +826,8 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
err = ip_mc_msfilter(sk, msf, ifindex);
mc_msf_out:
- if (msf)
- kfree(msf);
- if (gsf)
- kfree(gsf);
+ kfree(msf);
+ kfree(gsf);
break;
}
case IP_ROUTER_ALERT:
diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c
index fc6f95aaa96..d7eb680101c 100644
--- a/net/ipv4/ipvs/ip_vs_app.c
+++ b/net/ipv4/ipvs/ip_vs_app.c
@@ -110,8 +110,7 @@ ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)
return 0;
out:
- if (inc->timeout_table)
- kfree(inc->timeout_table);
+ kfree(inc->timeout_table);
kfree(inc);
return ret;
}
@@ -136,8 +135,7 @@ ip_vs_app_inc_release(struct ip_vs_app *inc)
list_del(&inc->a_list);
- if (inc->timeout_table != NULL)
- kfree(inc->timeout_table);
+ kfree(inc->timeout_table);
kfree(inc);
}
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
index 981cc3244ef..1a0843cd58a 100644
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -1009,11 +1009,10 @@ ip_vs_in(unsigned int hooknum, struct sk_buff **pskb,
if (sysctl_ip_vs_expire_nodest_conn) {
/* try to expire the connection immediately */
ip_vs_conn_expire_now(cp);
- } else {
- /* don't restart its timer, and silently
- drop the packet. */
- __ip_vs_conn_put(cp);
}
+ /* don't restart its timer, and silently
+ drop the packet. */
+ __ip_vs_conn_put(cp);
return NF_DROP;
}
diff --git a/net/ipv4/multipath_wrandom.c b/net/ipv4/multipath_wrandom.c
index bd7d75b6abe..d34a9fa608e 100644
--- a/net/ipv4/multipath_wrandom.c
+++ b/net/ipv4/multipath_wrandom.c
@@ -207,16 +207,12 @@ static void wrandom_select_route(const struct flowi *flp,
decision = mpc->rt;
last_power = mpc->power;
- if (last_mpc)
- kfree(last_mpc);
-
+ kfree(last_mpc);
last_mpc = mpc;
}
- if (last_mpc) {
- /* concurrent __multipath_flush may lead to !last_mpc */
- kfree(last_mpc);
- }
+ /* concurrent __multipath_flush may lead to !last_mpc */
+ kfree(last_mpc);
decision->u.dst.__use++;
*rp = decision;
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 7d917e4ce1d..9d3c8b5f327 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -5,6 +5,20 @@
menu "IP: Netfilter Configuration"
depends on INET && NETFILTER
+config NF_CONNTRACK_IPV4
+ tristate "IPv4 support for new connection tracking (EXPERIMENTAL)"
+ depends on EXPERIMENTAL && NF_CONNTRACK
+ ---help---
+ Connection tracking keeps a record of what packets have passed
+ through your machine, in order to figure out how they are related
+ into connections.
+
+ This is IPv4 support on Layer 3 independent connection tracking.
+ Layer 3 independent connection tracking is experimental scheme
+ which generalize ip_conntrack to support other layer 3 protocols.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
# connection tracking, helpers and protocols
config IP_NF_CONNTRACK
tristate "Connection tracking (required for masq/NAT)"
@@ -209,8 +223,8 @@ config IP_NF_MATCH_PKTTYPE
tristate "Packet type match support"
depends on IP_NF_IPTABLES
help
- Packet type matching allows you to match a packet by
- its "class", eg. BROADCAST, MULTICAST, ...
+ Packet type matching allows you to match a packet by
+ its "class", eg. BROADCAST, MULTICAST, ...
Typical usage:
iptables -A INPUT -m pkttype --pkt-type broadcast -j LOG
@@ -317,7 +331,8 @@ config IP_NF_MATCH_TCPMSS
config IP_NF_MATCH_HELPER
tristate "Helper match support"
- depends on IP_NF_CONNTRACK && IP_NF_IPTABLES
+ depends on IP_NF_IPTABLES
+ depends on IP_NF_CONNTRACK || NF_CONNTRACK_IPV4
help
Helper matching allows you to match packets in dynamic connections
tracked by a conntrack-helper, ie. ip_conntrack_ftp
@@ -326,7 +341,8 @@ config IP_NF_MATCH_HELPER
config IP_NF_MATCH_STATE
tristate "Connection state match support"
- depends on IP_NF_CONNTRACK && IP_NF_IPTABLES
+ depends on IP_NF_IPTABLES
+ depends on IP_NF_CONNTRACK || NF_CONNTRACK_IPV4
help
Connection state matching allows you to match packets based on their
relationship to a tracked connection (ie. previous packets). This
@@ -336,7 +352,8 @@ config IP_NF_MATCH_STATE
config IP_NF_MATCH_CONNTRACK
tristate "Connection tracking match support"
- depends on IP_NF_CONNTRACK && IP_NF_IPTABLES
+ depends on IP_NF_IPTABLES
+ depends on IP_NF_CONNTRACK || NF_CONNTRACK_IPV4
help
This is a general conntrack match module, a superset of the state match.
@@ -422,7 +439,8 @@ config IP_NF_MATCH_COMMENT
config IP_NF_MATCH_CONNMARK
tristate 'Connection mark match support'
- depends on IP_NF_CONNTRACK_MARK && IP_NF_IPTABLES
+ depends on IP_NF_IPTABLES
+ depends on IP_NF_CONNTRACK_MARK || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4)
help
This option adds a `connmark' match, which allows you to match the
connection mark value previously set for the session by `CONNMARK'.
@@ -433,7 +451,8 @@ config IP_NF_MATCH_CONNMARK
config IP_NF_MATCH_CONNBYTES
tristate 'Connection byte/packet counter match support'
- depends on IP_NF_CT_ACCT && IP_NF_IPTABLES
+ depends on IP_NF_IPTABLES
+ depends on IP_NF_CT_ACCT || (NF_CT_ACCT && NF_CONNTRACK_IPV4)
help
This option adds a `connbytes' match, which allows you to match the
number of bytes and/or packets for each direction within a connection.
@@ -747,7 +766,8 @@ config IP_NF_TARGET_TTL
config IP_NF_TARGET_CONNMARK
tristate 'CONNMARK target support'
- depends on IP_NF_CONNTRACK_MARK && IP_NF_MANGLE
+ depends on IP_NF_MANGLE
+ depends on IP_NF_CONNTRACK_MARK || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4)
help
This option adds a `CONNMARK' target, which allows one to manipulate
the connection mark value. Similar to the MARK target, but
@@ -759,7 +779,8 @@ config IP_NF_TARGET_CONNMARK
config IP_NF_TARGET_CLUSTERIP
tristate "CLUSTERIP target support (EXPERIMENTAL)"
- depends on IP_NF_CONNTRACK_MARK && IP_NF_IPTABLES && EXPERIMENTAL
+ depends on IP_NF_IPTABLES && EXPERIMENTAL
+ depends on IP_NF_CONNTRACK_MARK || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4)
help
The CLUSTERIP target allows you to build load-balancing clusters of
network servers without having a dedicated load-balancing
@@ -782,7 +803,7 @@ config IP_NF_RAW
config IP_NF_TARGET_NOTRACK
tristate 'NOTRACK target support'
depends on IP_NF_RAW
- depends on IP_NF_CONNTRACK
+ depends on IP_NF_CONNTRACK || NF_CONNTRACK_IPV4
help
The NOTRACK target allows a select rule to specify
which packets *not* to enter the conntrack/NAT
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index dab4b58dd31..058c48e258f 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -103,3 +103,9 @@ obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o
obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o
obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o
+
+# objects for l3 independent conntrack
+nf_conntrack_ipv4-objs := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o
+
+# l3 independent conntrack
+obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o
diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c
index d77d6b3f5f8..59e12b02b22 100644
--- a/net/ipv4/netfilter/ip_conntrack_ftp.c
+++ b/net/ipv4/netfilter/ip_conntrack_ftp.c
@@ -29,9 +29,9 @@ static char *ftp_buffer;
static DEFINE_SPINLOCK(ip_ftp_lock);
#define MAX_PORTS 8
-static short ports[MAX_PORTS];
+static unsigned short ports[MAX_PORTS];
static int ports_c;
-module_param_array(ports, short, &ports_c, 0400);
+module_param_array(ports, ushort, &ports_c, 0400);
static int loose;
module_param(loose, int, 0600);
diff --git a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c
index 926a6684643..4108a5e12b3 100644
--- a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c
+++ b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c
@@ -270,14 +270,10 @@ exp_gre(struct ip_conntrack *master,
exp_orig->expectfn = pptp_expectfn;
exp_orig->flags = 0;
- exp_orig->dir = IP_CT_DIR_ORIGINAL;
-
/* both expectations are identical apart from tuple */
memcpy(exp_reply, exp_orig, sizeof(*exp_reply));
memcpy(&exp_reply->tuple, &exp_tuples[1], sizeof(exp_reply->tuple));
- exp_reply->dir = !exp_orig->dir;
-
if (ip_nat_pptp_hook_exp_gre)
ret = ip_nat_pptp_hook_exp_gre(exp_orig, exp_reply);
else {
diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c
index 15457415a4f..2dea1db1440 100644
--- a/net/ipv4/netfilter/ip_conntrack_irc.c
+++ b/net/ipv4/netfilter/ip_conntrack_irc.c
@@ -34,7 +34,7 @@
#include <linux/moduleparam.h>
#define MAX_PORTS 8
-static short ports[MAX_PORTS];
+static unsigned short ports[MAX_PORTS];
static int ports_c;
static int max_dcc_channels = 8;
static unsigned int dcc_timeout = 300;
@@ -52,7 +52,7 @@ EXPORT_SYMBOL_GPL(ip_nat_irc_hook);
MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
MODULE_DESCRIPTION("IRC (DCC) connection tracking helper");
MODULE_LICENSE("GPL");
-module_param_array(ports, short, &ports_c, 0400);
+module_param_array(ports, ushort, &ports_c, 0400);
MODULE_PARM_DESC(ports, "port numbers of IRC servers");
module_param(max_dcc_channels, int, 0400);
MODULE_PARM_DESC(max_dcc_channels, "max number of expected DCC channels per IRC session");
diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c
index 166e6069f12..de9f4464438 100644
--- a/net/ipv4/netfilter/ip_conntrack_netlink.c
+++ b/net/ipv4/netfilter/ip_conntrack_netlink.c
@@ -28,11 +28,8 @@
#include <linux/netlink.h>
#include <linux/spinlock.h>
#include <linux/notifier.h>
-#include <linux/rtnetlink.h>
#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/netfilter_ipv4/ip_conntrack.h>
#include <linux/netfilter_ipv4/ip_conntrack_core.h>
#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
@@ -58,14 +55,17 @@ ctnetlink_dump_tuples_proto(struct sk_buff *skb,
const struct ip_conntrack_tuple *tuple)
{
struct ip_conntrack_protocol *proto;
+ int ret = 0;
NFA_PUT(skb, CTA_PROTO_NUM, sizeof(u_int8_t), &tuple->dst.protonum);
proto = ip_conntrack_proto_find_get(tuple->dst.protonum);
- if (proto && proto->tuple_to_nfattr)
- return proto->tuple_to_nfattr(skb, tuple);
+ if (likely(proto && proto->tuple_to_nfattr)) {
+ ret = proto->tuple_to_nfattr(skb, tuple);
+ ip_conntrack_proto_put(proto);
+ }
- return 0;
+ return ret;
nfattr_failure:
return -1;
@@ -175,7 +175,7 @@ ctnetlink_dump_counters(struct sk_buff *skb, const struct ip_conntrack *ct,
{
enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG;
struct nfattr *nest_count = NFA_NEST(skb, type);
- u_int64_t tmp;
+ u_int32_t tmp;
tmp = htonl(ct->counters[dir].packets);
NFA_PUT(skb, CTA_COUNTERS32_PACKETS, sizeof(u_int32_t), &tmp);
@@ -467,7 +467,7 @@ out:
}
#endif
-static const int cta_min_ip[CTA_IP_MAX] = {
+static const size_t cta_min_ip[CTA_IP_MAX] = {
[CTA_IP_V4_SRC-1] = sizeof(u_int32_t),
[CTA_IP_V4_DST-1] = sizeof(u_int32_t),
};
@@ -479,9 +479,7 @@ ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple)
DEBUGP("entered %s\n", __FUNCTION__);
-
- if (nfattr_parse_nested(tb, CTA_IP_MAX, attr) < 0)
- goto nfattr_failure;
+ nfattr_parse_nested(tb, CTA_IP_MAX, attr);
if (nfattr_bad_size(tb, CTA_IP_MAX, cta_min_ip))
return -EINVAL;
@@ -497,12 +495,9 @@ ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple)
DEBUGP("leaving\n");
return 0;
-
-nfattr_failure:
- return -1;
}
-static const int cta_min_proto[CTA_PROTO_MAX] = {
+static const size_t cta_min_proto[CTA_PROTO_MAX] = {
[CTA_PROTO_NUM-1] = sizeof(u_int16_t),
[CTA_PROTO_SRC_PORT-1] = sizeof(u_int16_t),
[CTA_PROTO_DST_PORT-1] = sizeof(u_int16_t),
@@ -521,8 +516,7 @@ ctnetlink_parse_tuple_proto(struct nfattr *attr,
DEBUGP("entered %s\n", __FUNCTION__);
- if (nfattr_parse_nested(tb, CTA_PROTO_MAX, attr) < 0)
- goto nfattr_failure;
+ nfattr_parse_nested(tb, CTA_PROTO_MAX, attr);
if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto))
return -EINVAL;
@@ -539,9 +533,6 @@ ctnetlink_parse_tuple_proto(struct nfattr *attr,
}
return ret;
-
-nfattr_failure:
- return -1;
}
static inline int
@@ -555,8 +546,7 @@ ctnetlink_parse_tuple(struct nfattr *cda[], struct ip_conntrack_tuple *tuple,
memset(tuple, 0, sizeof(*tuple));
- if (nfattr_parse_nested(tb, CTA_TUPLE_MAX, cda[type-1]) < 0)
- goto nfattr_failure;
+ nfattr_parse_nested(tb, CTA_TUPLE_MAX, cda[type-1]);
if (!tb[CTA_TUPLE_IP-1])
return -EINVAL;
@@ -583,13 +573,10 @@ ctnetlink_parse_tuple(struct nfattr *cda[], struct ip_conntrack_tuple *tuple,
DEBUGP("leaving\n");
return 0;
-
-nfattr_failure:
- return -1;
}
#ifdef CONFIG_IP_NF_NAT_NEEDED
-static const int cta_min_protonat[CTA_PROTONAT_MAX] = {
+static const size_t cta_min_protonat[CTA_PROTONAT_MAX] = {
[CTA_PROTONAT_PORT_MIN-1] = sizeof(u_int16_t),
[CTA_PROTONAT_PORT_MAX-1] = sizeof(u_int16_t),
};
@@ -603,11 +590,10 @@ static int ctnetlink_parse_nat_proto(struct nfattr *attr,
DEBUGP("entered %s\n", __FUNCTION__);
- if (nfattr_parse_nested(tb, CTA_PROTONAT_MAX, attr) < 0)
- goto nfattr_failure;
+ nfattr_parse_nested(tb, CTA_PROTONAT_MAX, attr);
if (nfattr_bad_size(tb, CTA_PROTONAT_MAX, cta_min_protonat))
- goto nfattr_failure;
+ return -EINVAL;
npt = ip_nat_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum);
if (!npt)
@@ -626,11 +612,13 @@ static int ctnetlink_parse_nat_proto(struct nfattr *attr,
DEBUGP("leaving\n");
return 0;
-
-nfattr_failure:
- return -1;
}
+static const size_t cta_min_nat[CTA_NAT_MAX] = {
+ [CTA_NAT_MINIP-1] = sizeof(u_int32_t),
+ [CTA_NAT_MAXIP-1] = sizeof(u_int32_t),
+};
+
static inline int
ctnetlink_parse_nat(struct nfattr *cda[],
const struct ip_conntrack *ct, struct ip_nat_range *range)
@@ -642,8 +630,10 @@ ctnetlink_parse_nat(struct nfattr *cda[],
memset(range, 0, sizeof(*range));
- if (nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]) < 0)
- goto nfattr_failure;
+ nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]);
+
+ if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat))
+ return -EINVAL;
if (tb[CTA_NAT_MINIP-1])
range->min_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MINIP-1]);
@@ -665,9 +655,6 @@ ctnetlink_parse_nat(struct nfattr *cda[],
DEBUGP("leaving\n");
return 0;
-
-nfattr_failure:
- return -1;
}
#endif
@@ -678,8 +665,7 @@ ctnetlink_parse_help(struct nfattr *attr, char **helper_name)
DEBUGP("entered %s\n", __FUNCTION__);
- if (nfattr_parse_nested(tb, CTA_HELP_MAX, attr) < 0)
- goto nfattr_failure;
+ nfattr_parse_nested(tb, CTA_HELP_MAX, attr);
if (!tb[CTA_HELP_NAME-1])
return -EINVAL;
@@ -687,11 +673,16 @@ ctnetlink_parse_help(struct nfattr *attr, char **helper_name)
*helper_name = NFA_DATA(tb[CTA_HELP_NAME-1]);
return 0;
-
-nfattr_failure:
- return -1;
}
+static const size_t cta_min[CTA_MAX] = {
+ [CTA_STATUS-1] = sizeof(u_int32_t),
+ [CTA_TIMEOUT-1] = sizeof(u_int32_t),
+ [CTA_MARK-1] = sizeof(u_int32_t),
+ [CTA_USE-1] = sizeof(u_int32_t),
+ [CTA_ID-1] = sizeof(u_int32_t)
+};
+
static int
ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,
struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
@@ -703,6 +694,9 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,
DEBUGP("entered %s\n", __FUNCTION__);
+ if (nfattr_bad_size(cda, CTA_MAX, cta_min))
+ return -EINVAL;
+
if (cda[CTA_TUPLE_ORIG-1])
err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG);
else if (cda[CTA_TUPLE_REPLY-1])
@@ -785,6 +779,9 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
return 0;
}
+ if (nfattr_bad_size(cda, CTA_MAX, cta_min))
+ return -EINVAL;
+
if (cda[CTA_TUPLE_ORIG-1])
err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG);
else if (cda[CTA_TUPLE_REPLY-1])
@@ -804,7 +801,7 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
ct = tuplehash_to_ctrack(h);
err = -ENOMEM;
- skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
+ skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
if (!skb2) {
ip_conntrack_put(ct);
return -ENOMEM;
@@ -815,7 +812,7 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
IPCTNL_MSG_CT_NEW, 1, ct);
ip_conntrack_put(ct);
if (err <= 0)
- goto out;
+ goto free;
err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
if (err < 0)
@@ -824,10 +821,10 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
DEBUGP("leaving\n");
return 0;
+free:
+ kfree_skb(skb2);
out:
- if (skb2)
- kfree_skb(skb2);
- return -1;
+ return err;
}
static inline int
@@ -957,8 +954,7 @@ ctnetlink_change_protoinfo(struct ip_conntrack *ct, struct nfattr *cda[])
u_int16_t npt = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum;
int err = 0;
- if (nfattr_parse_nested(tb, CTA_PROTOINFO_MAX, attr) < 0)
- goto nfattr_failure;
+ nfattr_parse_nested(tb, CTA_PROTOINFO_MAX, attr);
proto = ip_conntrack_proto_find_get(npt);
if (!proto)
@@ -969,9 +965,6 @@ ctnetlink_change_protoinfo(struct ip_conntrack *ct, struct nfattr *cda[])
ip_conntrack_proto_put(proto);
return err;
-
-nfattr_failure:
- return -ENOMEM;
}
static int
@@ -1005,6 +998,11 @@ ctnetlink_change_conntrack(struct ip_conntrack *ct, struct nfattr *cda[])
return err;
}
+#if defined(CONFIG_IP_NF_CONNTRACK_MARK)
+ if (cda[CTA_MARK-1])
+ ct->mark = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_MARK-1]));
+#endif
+
DEBUGP("all done\n");
return 0;
}
@@ -1048,6 +1046,11 @@ ctnetlink_create_conntrack(struct nfattr *cda[],
if (ct->helper)
ip_conntrack_helper_put(ct->helper);
+#if defined(CONFIG_IP_NF_CONNTRACK_MARK)
+ if (cda[CTA_MARK-1])
+ ct->mark = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_MARK-1]));
+#endif
+
DEBUGP("conntrack with id %u inserted\n", ct->id);
return 0;
@@ -1066,6 +1069,9 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
DEBUGP("entered %s\n", __FUNCTION__);
+ if (nfattr_bad_size(cda, CTA_MAX, cta_min))
+ return -EINVAL;
+
if (cda[CTA_TUPLE_ORIG-1]) {
err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG);
if (err < 0)
@@ -1271,6 +1277,11 @@ out:
return skb->len;
}
+static const size_t cta_min_exp[CTA_EXPECT_MAX] = {
+ [CTA_EXPECT_TIMEOUT-1] = sizeof(u_int32_t),
+ [CTA_EXPECT_ID-1] = sizeof(u_int32_t)
+};
+
static int
ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
@@ -1282,6 +1293,9 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
DEBUGP("entered %s\n", __FUNCTION__);
+ if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp))
+ return -EINVAL;
+
if (nlh->nlmsg_flags & NLM_F_DUMP) {
struct nfgenmsg *msg = NLMSG_DATA(nlh);
u32 rlen;
@@ -1312,6 +1326,14 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
if (!exp)
return -ENOENT;
+ if (cda[CTA_EXPECT_ID-1]) {
+ u_int32_t id = *(u_int32_t *)NFA_DATA(cda[CTA_EXPECT_ID-1]);
+ if (exp->id != ntohl(id)) {
+ ip_conntrack_expect_put(exp);
+ return -ENOENT;
+ }
+ }
+
err = -ENOMEM;
skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
if (!skb2)
@@ -1322,21 +1344,16 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW,
1, exp);
if (err <= 0)
- goto out;
+ goto free;
ip_conntrack_expect_put(exp);
- err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
- if (err < 0)
- goto free;
-
- return err;
+ return netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
+free:
+ kfree_skb(skb2);
out:
ip_conntrack_expect_put(exp);
-free:
- if (skb2)
- kfree_skb(skb2);
return err;
}
@@ -1349,6 +1366,9 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
struct ip_conntrack_helper *h;
int err;
+ if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp))
+ return -EINVAL;
+
if (cda[CTA_EXPECT_TUPLE-1]) {
/* delete a single expect by tuple */
err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE);
@@ -1392,7 +1412,7 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
ip_conntrack_expect_put(exp);
}
}
- write_unlock(&ip_conntrack_lock);
+ write_unlock_bh(&ip_conntrack_lock);
} else {
/* This basically means we have to flush everything*/
write_lock_bh(&ip_conntrack_lock);
@@ -1478,6 +1498,9 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb,
DEBUGP("entered %s\n", __FUNCTION__);
+ if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp))
+ return -EINVAL;
+
if (!cda[CTA_EXPECT_TUPLE-1]
|| !cda[CTA_EXPECT_MASK-1]
|| !cda[CTA_EXPECT_MASTER-1])
@@ -1520,29 +1543,22 @@ static struct notifier_block ctnl_notifier_exp = {
static struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = {
[IPCTNL_MSG_CT_NEW] = { .call = ctnetlink_new_conntrack,
- .attr_count = CTA_MAX,
- .cap_required = CAP_NET_ADMIN },
+ .attr_count = CTA_MAX, },
[IPCTNL_MSG_CT_GET] = { .call = ctnetlink_get_conntrack,
- .attr_count = CTA_MAX,
- .cap_required = CAP_NET_ADMIN },
+ .attr_count = CTA_MAX, },
[IPCTNL_MSG_CT_DELETE] = { .call = ctnetlink_del_conntrack,
- .attr_count = CTA_MAX,
- .cap_required = CAP_NET_ADMIN },
+ .attr_count = CTA_MAX, },
[IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack,
- .attr_count = CTA_MAX,
- .cap_required = CAP_NET_ADMIN },
+ .attr_count = CTA_MAX, },
};
static struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = {
[IPCTNL_MSG_EXP_GET] = { .call = ctnetlink_get_expect,
- .attr_count = CTA_EXPECT_MAX,
- .cap_required = CAP_NET_ADMIN },
+ .attr_count = CTA_EXPECT_MAX, },
[IPCTNL_MSG_EXP_NEW] = { .call = ctnetlink_new_expect,
- .attr_count = CTA_EXPECT_MAX,
- .cap_required = CAP_NET_ADMIN },
+ .attr_count = CTA_EXPECT_MAX, },
[IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect,
- .attr_count = CTA_EXPECT_MAX,
- .cap_required = CAP_NET_ADMIN },
+ .attr_count = CTA_EXPECT_MAX, },
};
static struct nfnetlink_subsystem ctnl_subsys = {
@@ -1559,6 +1575,8 @@ static struct nfnetlink_subsystem ctnl_exp_subsys = {
.cb = ctnl_exp_cb,
};
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK);
+
static int __init ctnetlink_init(void)
{
int ret;
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
index 98f0015dd25..e4d6b268e8c 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
@@ -13,6 +13,7 @@
#include <linux/in.h>
#include <linux/icmp.h>
#include <linux/seq_file.h>
+#include <linux/skbuff.h>
#include <net/ip.h>
#include <net/checksum.h>
#include <linux/netfilter.h>
@@ -151,13 +152,13 @@ icmp_error_message(struct sk_buff *skb,
/* Not enough header? */
inside = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_in), &_in);
if (inside == NULL)
- return NF_ACCEPT;
+ return -NF_ACCEPT;
/* Ignore ICMP's containing fragments (shouldn't happen) */
if (inside->ip.frag_off & htons(IP_OFFSET)) {
DEBUGP("icmp_error_track: fragment of proto %u\n",
inside->ip.protocol);
- return NF_ACCEPT;
+ return -NF_ACCEPT;
}
innerproto = ip_conntrack_proto_find_get(inside->ip.protocol);
@@ -166,7 +167,7 @@ icmp_error_message(struct sk_buff *skb,
if (!ip_ct_get_tuple(&inside->ip, skb, dataoff, &origtuple, innerproto)) {
DEBUGP("icmp_error: ! get_tuple p=%u", inside->ip.protocol);
ip_conntrack_proto_put(innerproto);
- return NF_ACCEPT;
+ return -NF_ACCEPT;
}
/* Ordinarily, we'd expect the inverted tupleproto, but it's
@@ -174,7 +175,7 @@ icmp_error_message(struct sk_buff *skb,
if (!ip_ct_invert_tuple(&innertuple, &origtuple, innerproto)) {
DEBUGP("icmp_error_track: Can't invert tuple\n");
ip_conntrack_proto_put(innerproto);
- return NF_ACCEPT;
+ return -NF_ACCEPT;
}
ip_conntrack_proto_put(innerproto);
@@ -190,7 +191,7 @@ icmp_error_message(struct sk_buff *skb,
if (!h) {
DEBUGP("icmp_error_track: no match\n");
- return NF_ACCEPT;
+ return -NF_ACCEPT;
}
/* Reverse direction from that found */
if (DIRECTION(h) != IP_CT_DIR_REPLY)
@@ -230,19 +231,15 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
case CHECKSUM_HW:
if (!(u16)csum_fold(skb->csum))
break;
- if (LOG_INVALID(IPPROTO_ICMP))
- nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
- "ip_ct_icmp: bad HW ICMP checksum ");
- return -NF_ACCEPT;
+ /* fall through */
case CHECKSUM_NONE:
- if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) {
+ skb->csum = 0;
+ if (__skb_checksum_complete(skb)) {
if (LOG_INVALID(IPPROTO_ICMP))
nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
"ip_ct_icmp: bad ICMP checksum ");
return -NF_ACCEPT;
}
- default:
- break;
}
checksum_skipped:
@@ -296,7 +293,8 @@ static int icmp_nfattr_to_tuple(struct nfattr *tb[],
struct ip_conntrack_tuple *tuple)
{
if (!tb[CTA_PROTO_ICMP_TYPE-1]
- || !tb[CTA_PROTO_ICMP_CODE-1])
+ || !tb[CTA_PROTO_ICMP_CODE-1]
+ || !tb[CTA_PROTO_ICMP_ID-1])
return -1;
tuple->dst.u.icmp.type =
@@ -304,7 +302,7 @@ static int icmp_nfattr_to_tuple(struct nfattr *tb[],
tuple->dst.u.icmp.code =
*(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_CODE-1]);
tuple->src.u.icmp.id =
- *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_ID-1]);
+ *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_ICMP_ID-1]);
return 0;
}
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
index d6701cafbcc..ee3b7d6c4d2 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
@@ -357,13 +357,24 @@ nfattr_failure:
return -1;
}
+static const size_t cta_min_tcp[CTA_PROTOINFO_TCP_MAX] = {
+ [CTA_PROTOINFO_TCP_STATE-1] = sizeof(u_int8_t),
+};
+
static int nfattr_to_tcp(struct nfattr *cda[], struct ip_conntrack *ct)
{
struct nfattr *attr = cda[CTA_PROTOINFO_TCP-1];
struct nfattr *tb[CTA_PROTOINFO_TCP_MAX];
- if (nfattr_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, attr) < 0)
- goto nfattr_failure;
+ /* updates could not contain anything about the private
+ * protocol info, in that case skip the parsing */
+ if (!attr)
+ return 0;
+
+ nfattr_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, attr);
+
+ if (nfattr_bad_size(tb, CTA_PROTOINFO_TCP_MAX, cta_min_tcp))
+ return -EINVAL;
if (!tb[CTA_PROTOINFO_TCP_STATE-1])
return -EINVAL;
@@ -374,9 +385,6 @@ static int nfattr_to_tcp(struct nfattr *cda[], struct ip_conntrack *ct)
write_unlock_bh(&tcp_lock);
return 0;
-
-nfattr_failure:
- return -1;
}
#endif
@@ -813,6 +821,7 @@ static u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] =
{
[TH_SYN] = 1,
[TH_SYN|TH_ACK] = 1,
+ [TH_SYN|TH_PUSH] = 1,
[TH_SYN|TH_ACK|TH_PUSH] = 1,
[TH_RST] = 1,
[TH_RST|TH_ACK] = 1,
diff --git a/net/ipv4/netfilter/ip_conntrack_tftp.c b/net/ipv4/netfilter/ip_conntrack_tftp.c
index a78736b8525..d3c5a371f99 100644
--- a/net/ipv4/netfilter/ip_conntrack_tftp.c
+++ b/net/ipv4/netfilter/ip_conntrack_tftp.c
@@ -26,9 +26,9 @@ MODULE_DESCRIPTION("tftp connection tracking helper");
MODULE_LICENSE("GPL");
#define MAX_PORTS 8
-static short ports[MAX_PORTS];
+static unsigned short ports[MAX_PORTS];
static int ports_c;
-module_param_array(ports, short, &ports_c, 0400);
+module_param_array(ports, ushort, &ports_c, 0400);
MODULE_PARM_DESC(ports, "port numbers of tftp servers");
#if 0
diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c
index c5e3abd2467..762f4d93936 100644
--- a/net/ipv4/netfilter/ip_nat_core.c
+++ b/net/ipv4/netfilter/ip_nat_core.c
@@ -66,10 +66,8 @@ ip_nat_proto_find_get(u_int8_t protonum)
* removed until we've grabbed the reference */
preempt_disable();
p = __ip_nat_proto_find(protonum);
- if (p) {
- if (!try_module_get(p->me))
- p = &ip_nat_unknown_protocol;
- }
+ if (!try_module_get(p->me))
+ p = &ip_nat_unknown_protocol;
preempt_enable();
return p;
diff --git a/net/ipv4/netfilter/ip_nat_helper_pptp.c b/net/ipv4/netfilter/ip_nat_helper_pptp.c
index 3cdd0684d30..e546203f566 100644
--- a/net/ipv4/netfilter/ip_nat_helper_pptp.c
+++ b/net/ipv4/netfilter/ip_nat_helper_pptp.c
@@ -73,6 +73,7 @@ static void pptp_nat_expected(struct ip_conntrack *ct,
struct ip_conntrack_tuple t;
struct ip_ct_pptp_master *ct_pptp_info;
struct ip_nat_pptp *nat_pptp_info;
+ struct ip_nat_range range;
ct_pptp_info = &master->help.ct_pptp_info;
nat_pptp_info = &master->nat.help.nat_pptp_info;
@@ -110,7 +111,30 @@ static void pptp_nat_expected(struct ip_conntrack *ct,
DEBUGP("not found!\n");
}
- ip_nat_follow_master(ct, exp);
+ /* This must be a fresh one. */
+ BUG_ON(ct->status & IPS_NAT_DONE_MASK);
+
+ /* Change src to where master sends to */
+ range.flags = IP_NAT_RANGE_MAP_IPS;
+ range.min_ip = range.max_ip
+ = ct->master->tuplehash[!exp->dir].tuple.dst.ip;
+ if (exp->dir == IP_CT_DIR_ORIGINAL) {
+ range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED;
+ range.min = range.max = exp->saved_proto;
+ }
+ /* hook doesn't matter, but it has to do source manip */
+ ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING);
+
+ /* For DST manip, map port here to where it's expected. */
+ range.flags = IP_NAT_RANGE_MAP_IPS;
+ range.min_ip = range.max_ip
+ = ct->master->tuplehash[!exp->dir].tuple.src.ip;
+ if (exp->dir == IP_CT_DIR_REPLY) {
+ range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED;
+ range.min = range.max = exp->saved_proto;
+ }
+ /* hook doesn't matter, but it has to do destination manip */
+ ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING);
}
/* outbound packets == from PNS to PAC */
@@ -213,9 +237,10 @@ pptp_exp_gre(struct ip_conntrack_expect *expect_orig,
/* alter expectation for PNS->PAC direction */
invert_tuplepr(&inv_t, &expect_orig->tuple);
- expect_orig->saved_proto.gre.key = htons(nat_pptp_info->pac_call_id);
+ expect_orig->saved_proto.gre.key = htons(ct_pptp_info->pns_call_id);
expect_orig->tuple.src.u.gre.key = htons(nat_pptp_info->pns_call_id);
expect_orig->tuple.dst.u.gre.key = htons(ct_pptp_info->pac_call_id);
+ expect_orig->dir = IP_CT_DIR_ORIGINAL;
inv_t.src.ip = reply_t->src.ip;
inv_t.dst.ip = reply_t->dst.ip;
inv_t.src.u.gre.key = htons(nat_pptp_info->pac_call_id);
@@ -233,6 +258,7 @@ pptp_exp_gre(struct ip_conntrack_expect *expect_orig,
expect_reply->saved_proto.gre.key = htons(nat_pptp_info->pns_call_id);
expect_reply->tuple.src.u.gre.key = htons(nat_pptp_info->pac_call_id);
expect_reply->tuple.dst.u.gre.key = htons(ct_pptp_info->pns_call_id);
+ expect_reply->dir = IP_CT_DIR_REPLY;
inv_t.src.ip = orig_t->src.ip;
inv_t.dst.ip = orig_t->dst.ip;
inv_t.src.u.gre.key = htons(nat_pptp_info->pns_call_id);
diff --git a/net/ipv4/netfilter/ip_nat_proto_gre.c b/net/ipv4/netfilter/ip_nat_proto_gre.c
index 7c128540167..f7cad7cf1ae 100644
--- a/net/ipv4/netfilter/ip_nat_proto_gre.c
+++ b/net/ipv4/netfilter/ip_nat_proto_gre.c
@@ -139,8 +139,8 @@ gre_manip_pkt(struct sk_buff **pskb,
break;
case GRE_VERSION_PPTP:
DEBUGP("call_id -> 0x%04x\n",
- ntohl(tuple->dst.u.gre.key));
- pgreh->call_id = htons(ntohl(tuple->dst.u.gre.key));
+ ntohs(tuple->dst.u.gre.key));
+ pgreh->call_id = tuple->dst.u.gre.key;
break;
default:
DEBUGP("can't nat unknown GRE version\n");
diff --git a/net/ipv4/netfilter/ip_nat_proto_unknown.c b/net/ipv4/netfilter/ip_nat_proto_unknown.c
index 99bbef56f84..f0099a646a0 100644
--- a/net/ipv4/netfilter/ip_nat_proto_unknown.c
+++ b/net/ipv4/netfilter/ip_nat_proto_unknown.c
@@ -62,7 +62,7 @@ unknown_print_range(char *buffer, const struct ip_nat_range *range)
struct ip_nat_protocol ip_nat_unknown_protocol = {
.name = "unknown",
- .me = THIS_MODULE,
+ /* .me isn't set: getting a ref to this cannot fail. */
.manip_pkt = unknown_manip_pkt,
.in_range = unknown_in_range,
.unique_tuple = unknown_unique_tuple,
diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c
index 93b2c5111bb..8acb7ed40b4 100644
--- a/net/ipv4/netfilter/ip_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c
@@ -1161,8 +1161,7 @@ static int snmp_parse_mangle(unsigned char *msg,
if (!snmp_object_decode(&ctx, obj)) {
if (*obj) {
- if ((*obj)->id)
- kfree((*obj)->id);
+ kfree((*obj)->id);
kfree(*obj);
}
kfree(obj);
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 9bcb398fbc1..45c52d8f4d9 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -29,7 +29,7 @@
#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
-#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <net/netfilter/nf_conntrack_compat.h>
#define CLUSTERIP_VERSION "0.8"
@@ -316,14 +316,14 @@ target(struct sk_buff **pskb,
{
const struct ipt_clusterip_tgt_info *cipinfo = targinfo;
enum ip_conntrack_info ctinfo;
- struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo);
- u_int32_t hash;
+ u_int32_t *mark, hash;
/* don't need to clusterip_config_get() here, since refcount
* is only decremented by destroy() - and ip_tables guarantees
* that the ->target() function isn't called after ->destroy() */
- if (!ct) {
+ mark = nf_ct_get_mark((*pskb), &ctinfo);
+ if (mark == NULL) {
printk(KERN_ERR "CLUSTERIP: no conntrack!\n");
/* FIXME: need to drop invalid ones, since replies
* to outgoing connections of other nodes will be
@@ -346,7 +346,7 @@ target(struct sk_buff **pskb,
switch (ctinfo) {
case IP_CT_NEW:
- ct->mark = hash;
+ *mark = hash;
break;
case IP_CT_RELATED:
case IP_CT_RELATED+IP_CT_IS_REPLY:
@@ -363,7 +363,7 @@ target(struct sk_buff **pskb,
#ifdef DEBUG_CLUSTERP
DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
#endif
- DEBUGP("hash=%u ct_hash=%u ", hash, ct->mark);
+ DEBUGP("hash=%u ct_hash=%u ", hash, *mark);
if (!clusterip_responsible(cipinfo->config, hash)) {
DEBUGP("not responsible\n");
return NF_DROP;
diff --git a/net/ipv4/netfilter/ipt_CONNMARK.c b/net/ipv4/netfilter/ipt_CONNMARK.c
index 13463802133..8acac5a40a9 100644
--- a/net/ipv4/netfilter/ipt_CONNMARK.c
+++ b/net/ipv4/netfilter/ipt_CONNMARK.c
@@ -29,7 +29,7 @@ MODULE_LICENSE("GPL");
#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/netfilter_ipv4/ipt_CONNMARK.h>
-#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <net/netfilter/nf_conntrack_compat.h>
static unsigned int
target(struct sk_buff **pskb,
@@ -43,24 +43,24 @@ target(struct sk_buff **pskb,
u_int32_t diff;
u_int32_t nfmark;
u_int32_t newmark;
+ u_int32_t ctinfo;
+ u_int32_t *ctmark = nf_ct_get_mark(*pskb, &ctinfo);
- enum ip_conntrack_info ctinfo;
- struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo);
- if (ct) {
+ if (ctmark) {
switch(markinfo->mode) {
case IPT_CONNMARK_SET:
- newmark = (ct->mark & ~markinfo->mask) | markinfo->mark;
- if (newmark != ct->mark)
- ct->mark = newmark;
+ newmark = (*ctmark & ~markinfo->mask) | markinfo->mark;
+ if (newmark != *ctmark)
+ *ctmark = newmark;
break;
case IPT_CONNMARK_SAVE:
- newmark = (ct->mark & ~markinfo->mask) | ((*pskb)->nfmark & markinfo->mask);
- if (ct->mark != newmark)
- ct->mark = newmark;
+ newmark = (*ctmark & ~markinfo->mask) | ((*pskb)->nfmark & markinfo->mask);
+ if (*ctmark != newmark)
+ *ctmark = newmark;
break;
case IPT_CONNMARK_RESTORE:
nfmark = (*pskb)->nfmark;
- diff = (ct->mark ^ nfmark) & markinfo->mask;
+ diff = (*ctmark ^ nfmark) & markinfo->mask;
if (diff != 0)
(*pskb)->nfmark = nfmark ^ diff;
break;
@@ -109,6 +109,7 @@ static struct ipt_target ipt_connmark_reg = {
static int __init init(void)
{
+ need_ip_conntrack();
return ipt_register_target(&ipt_connmark_reg);
}
diff --git a/net/ipv4/netfilter/ipt_NOTRACK.c b/net/ipv4/netfilter/ipt_NOTRACK.c
index a4bb9b3bc29..e3c69d072c6 100644
--- a/net/ipv4/netfilter/ipt_NOTRACK.c
+++ b/net/ipv4/netfilter/ipt_NOTRACK.c
@@ -5,7 +5,7 @@
#include <linux/skbuff.h>
#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <net/netfilter/nf_conntrack_compat.h>
static unsigned int
target(struct sk_buff **pskb,
@@ -23,7 +23,7 @@ target(struct sk_buff **pskb,
If there is a real ct entry correspondig to this packet,
it'll hang aroun till timing out. We don't deal with it
for performance reasons. JK */
- (*pskb)->nfct = &ip_conntrack_untracked.ct_general;
+ nf_ct_untrack(*pskb);
(*pskb)->nfctinfo = IP_CT_NEW;
nf_conntrack_get((*pskb)->nfct);
diff --git a/net/ipv4/netfilter/ipt_connbytes.c b/net/ipv4/netfilter/ipt_connbytes.c
index df4a42c6da2..d68a048b717 100644
--- a/net/ipv4/netfilter/ipt_connbytes.c
+++ b/net/ipv4/netfilter/ipt_connbytes.c
@@ -10,7 +10,7 @@
*/
#include <linux/module.h>
#include <linux/skbuff.h>
-#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <net/netfilter/nf_conntrack_compat.h>
#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/netfilter_ipv4/ipt_connbytes.h>
@@ -46,60 +46,59 @@ match(const struct sk_buff *skb,
int *hotdrop)
{
const struct ipt_connbytes_info *sinfo = matchinfo;
- enum ip_conntrack_info ctinfo;
- struct ip_conntrack *ct;
u_int64_t what = 0; /* initialize to make gcc happy */
+ const struct ip_conntrack_counter *counters;
- if (!(ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo)))
+ if (!(counters = nf_ct_get_counters(skb)))
return 0; /* no match */
switch (sinfo->what) {
case IPT_CONNBYTES_PKTS:
switch (sinfo->direction) {
case IPT_CONNBYTES_DIR_ORIGINAL:
- what = ct->counters[IP_CT_DIR_ORIGINAL].packets;
+ what = counters[IP_CT_DIR_ORIGINAL].packets;
break;
case IPT_CONNBYTES_DIR_REPLY:
- what = ct->counters[IP_CT_DIR_REPLY].packets;
+ what = counters[IP_CT_DIR_REPLY].packets;
break;
case IPT_CONNBYTES_DIR_BOTH:
- what = ct->counters[IP_CT_DIR_ORIGINAL].packets;
- what += ct->counters[IP_CT_DIR_REPLY].packets;
+ what = counters[IP_CT_DIR_ORIGINAL].packets;
+ what += counters[IP_CT_DIR_REPLY].packets;
break;
}
break;
case IPT_CONNBYTES_BYTES:
switch (sinfo->direction) {
case IPT_CONNBYTES_DIR_ORIGINAL:
- what = ct->counters[IP_CT_DIR_ORIGINAL].bytes;
+ what = counters[IP_CT_DIR_ORIGINAL].bytes;
break;
case IPT_CONNBYTES_DIR_REPLY:
- what = ct->counters[IP_CT_DIR_REPLY].bytes;
+ what = counters[IP_CT_DIR_REPLY].bytes;
break;
case IPT_CONNBYTES_DIR_BOTH:
- what = ct->counters[IP_CT_DIR_ORIGINAL].bytes;
- what += ct->counters[IP_CT_DIR_REPLY].bytes;
+ what = counters[IP_CT_DIR_ORIGINAL].bytes;
+ what += counters[IP_CT_DIR_REPLY].bytes;
break;
}
break;
case IPT_CONNBYTES_AVGPKT:
switch (sinfo->direction) {
case IPT_CONNBYTES_DIR_ORIGINAL:
- what = div64_64(ct->counters[IP_CT_DIR_ORIGINAL].bytes,
- ct->counters[IP_CT_DIR_ORIGINAL].packets);
+ what = div64_64(counters[IP_CT_DIR_ORIGINAL].bytes,
+ counters[IP_CT_DIR_ORIGINAL].packets);
break;
case IPT_CONNBYTES_DIR_REPLY:
- what = div64_64(ct->counters[IP_CT_DIR_REPLY].bytes,
- ct->counters[IP_CT_DIR_REPLY].packets);
+ what = div64_64(counters[IP_CT_DIR_REPLY].bytes,
+ counters[IP_CT_DIR_REPLY].packets);
break;
case IPT_CONNBYTES_DIR_BOTH:
{
u_int64_t bytes;
u_int64_t pkts;
- bytes = ct->counters[IP_CT_DIR_ORIGINAL].bytes +
- ct->counters[IP_CT_DIR_REPLY].bytes;
- pkts = ct->counters[IP_CT_DIR_ORIGINAL].packets+
- ct->counters[IP_CT_DIR_REPLY].packets;
+ bytes = counters[IP_CT_DIR_ORIGINAL].bytes +
+ counters[IP_CT_DIR_REPLY].bytes;
+ pkts = counters[IP_CT_DIR_ORIGINAL].packets+
+ counters[IP_CT_DIR_REPLY].packets;
/* FIXME_THEORETICAL: what to do if sum
* overflows ? */
diff --git a/net/ipv4/netfilter/ipt_connmark.c b/net/ipv4/netfilter/ipt_connmark.c
index bf8de47ce00..5306ef293b9 100644
--- a/net/ipv4/netfilter/ipt_connmark.c
+++ b/net/ipv4/netfilter/ipt_connmark.c
@@ -28,7 +28,7 @@ MODULE_LICENSE("GPL");
#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/netfilter_ipv4/ipt_connmark.h>
-#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <net/netfilter/nf_conntrack_compat.h>
static int
match(const struct sk_buff *skb,
@@ -39,12 +39,12 @@ match(const struct sk_buff *skb,
int *hotdrop)
{
const struct ipt_connmark_info *info = matchinfo;
- enum ip_conntrack_info ctinfo;
- struct ip_conntrack *ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo);
- if (!ct)
+ u_int32_t ctinfo;
+ const u_int32_t *ctmark = nf_ct_get_mark(skb, &ctinfo);
+ if (!ctmark)
return 0;
- return ((ct->mark & info->mask) == info->mark) ^ info->invert;
+ return (((*ctmark) & info->mask) == info->mark) ^ info->invert;
}
static int
diff --git a/net/ipv4/netfilter/ipt_conntrack.c b/net/ipv4/netfilter/ipt_conntrack.c
index c1d22801b7c..c8d18705469 100644
--- a/net/ipv4/netfilter/ipt_conntrack.c
+++ b/net/ipv4/netfilter/ipt_conntrack.c
@@ -10,7 +10,14 @@
#include <linux/module.h>
#include <linux/skbuff.h>
+
+#if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE)
#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_tuple.h>
+#else
+#include <net/netfilter/nf_conntrack.h>
+#endif
+
#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/netfilter_ipv4/ipt_conntrack.h>
@@ -18,6 +25,8 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
MODULE_DESCRIPTION("iptables connection tracking match module");
+#if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE)
+
static int
match(const struct sk_buff *skb,
const struct net_device *in,
@@ -102,6 +111,93 @@ match(const struct sk_buff *skb,
return 1;
}
+#else /* CONFIG_IP_NF_CONNTRACK */
+static int
+match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ int *hotdrop)
+{
+ const struct ipt_conntrack_info *sinfo = matchinfo;
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ unsigned int statebit;
+
+ ct = nf_ct_get((struct sk_buff *)skb, &ctinfo);
+
+#define FWINV(bool,invflg) ((bool) ^ !!(sinfo->invflags & invflg))
+
+ if (ct == &nf_conntrack_untracked)
+ statebit = IPT_CONNTRACK_STATE_UNTRACKED;
+ else if (ct)
+ statebit = IPT_CONNTRACK_STATE_BIT(ctinfo);
+ else
+ statebit = IPT_CONNTRACK_STATE_INVALID;
+
+ if(sinfo->flags & IPT_CONNTRACK_STATE) {
+ if (ct) {
+ if(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip !=
+ ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip)
+ statebit |= IPT_CONNTRACK_STATE_SNAT;
+
+ if(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.ip !=
+ ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip)
+ statebit |= IPT_CONNTRACK_STATE_DNAT;
+ }
+
+ if (FWINV((statebit & sinfo->statemask) == 0, IPT_CONNTRACK_STATE))
+ return 0;
+ }
+
+ if(sinfo->flags & IPT_CONNTRACK_PROTO) {
+ if (!ct || FWINV(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.protonum, IPT_CONNTRACK_PROTO))
+ return 0;
+ }
+
+ if(sinfo->flags & IPT_CONNTRACK_ORIGSRC) {
+ if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip&sinfo->sipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].src.ip, IPT_CONNTRACK_ORIGSRC))
+ return 0;
+ }
+
+ if(sinfo->flags & IPT_CONNTRACK_ORIGDST) {
+ if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.ip&sinfo->dipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.ip, IPT_CONNTRACK_ORIGDST))
+ return 0;
+ }
+
+ if(sinfo->flags & IPT_CONNTRACK_REPLSRC) {
+ if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip&sinfo->sipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].src.ip, IPT_CONNTRACK_REPLSRC))
+ return 0;
+ }
+
+ if(sinfo->flags & IPT_CONNTRACK_REPLDST) {
+ if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip&sinfo->dipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].dst.ip, IPT_CONNTRACK_REPLDST))
+ return 0;
+ }
+
+ if(sinfo->flags & IPT_CONNTRACK_STATUS) {
+ if (!ct || FWINV((ct->status & sinfo->statusmask) == 0, IPT_CONNTRACK_STATUS))
+ return 0;
+ }
+
+ if(sinfo->flags & IPT_CONNTRACK_EXPIRES) {
+ unsigned long expires;
+
+ if(!ct)
+ return 0;
+
+ expires = timer_pending(&ct->timeout) ? (ct->timeout.expires - jiffies)/HZ : 0;
+
+ if (FWINV(!(expires >= sinfo->expires_min && expires <= sinfo->expires_max), IPT_CONNTRACK_EXPIRES))
+ return 0;
+ }
+
+ return 1;
+}
+
+#endif /* CONFIG_NF_IP_CONNTRACK */
+
static int check(const char *tablename,
const struct ipt_ip *ip,
void *matchinfo,
diff --git a/net/ipv4/netfilter/ipt_helper.c b/net/ipv4/netfilter/ipt_helper.c
index 3e7dd014de4..bf14e1c7798 100644
--- a/net/ipv4/netfilter/ipt_helper.c
+++ b/net/ipv4/netfilter/ipt_helper.c
@@ -13,9 +13,15 @@
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/netfilter.h>
+#if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE)
#include <linux/netfilter_ipv4/ip_conntrack.h>
#include <linux/netfilter_ipv4/ip_conntrack_core.h>
#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#else
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#endif
#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/netfilter_ipv4/ipt_helper.h>
@@ -29,6 +35,7 @@ MODULE_DESCRIPTION("iptables helper match module");
#define DEBUGP(format, args...)
#endif
+#if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE)
static int
match(const struct sk_buff *skb,
const struct net_device *in,
@@ -73,6 +80,53 @@ out_unlock:
return ret;
}
+#else /* CONFIG_IP_NF_CONNTRACK */
+
+static int
+match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ int *hotdrop)
+{
+ const struct ipt_helper_info *info = matchinfo;
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ int ret = info->invert;
+
+ ct = nf_ct_get((struct sk_buff *)skb, &ctinfo);
+ if (!ct) {
+ DEBUGP("ipt_helper: Eek! invalid conntrack?\n");
+ return ret;
+ }
+
+ if (!ct->master) {
+ DEBUGP("ipt_helper: conntrack %p has no master\n", ct);
+ return ret;
+ }
+
+ read_lock_bh(&nf_conntrack_lock);
+ if (!ct->master->helper) {
+ DEBUGP("ipt_helper: master ct %p has no helper\n",
+ exp->expectant);
+ goto out_unlock;
+ }
+
+ DEBUGP("master's name = %s , info->name = %s\n",
+ ct->master->helper->name, info->name);
+
+ if (info->name[0] == '\0')
+ ret ^= 1;
+ else
+ ret ^= !strncmp(ct->master->helper->name, info->name,
+ strlen(ct->master->helper->name));
+out_unlock:
+ read_unlock_bh(&nf_conntrack_lock);
+ return ret;
+}
+#endif
+
static int check(const char *tablename,
const struct ipt_ip *ip,
void *matchinfo,
diff --git a/net/ipv4/netfilter/ipt_state.c b/net/ipv4/netfilter/ipt_state.c
index b1511b97ea5..4d7f16b70ce 100644
--- a/net/ipv4/netfilter/ipt_state.c
+++ b/net/ipv4/netfilter/ipt_state.c
@@ -10,7 +10,7 @@
#include <linux/module.h>
#include <linux/skbuff.h>
-#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <net/netfilter/nf_conntrack_compat.h>
#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/netfilter_ipv4/ipt_state.h>
@@ -30,9 +30,9 @@ match(const struct sk_buff *skb,
enum ip_conntrack_info ctinfo;
unsigned int statebit;
- if (skb->nfct == &ip_conntrack_untracked.ct_general)
+ if (nf_ct_is_untracked(skb))
statebit = IPT_STATE_UNTRACKED;
- else if (!ip_conntrack_get(skb, &ctinfo))
+ else if (!nf_ct_get_ctinfo(skb, &ctinfo))
statebit = IPT_STATE_INVALID;
else
statebit = IPT_STATE_BIT(ctinfo);
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
new file mode 100644
index 00000000000..8202c1c0afa
--- /dev/null
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -0,0 +1,571 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
+ * - move L3 protocol dependent part to this file.
+ * 23 Mar 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
+ * - add get_features() to support various size of conntrack
+ * structures.
+ *
+ * Derived from net/ipv4/netfilter/ip_conntrack_standalone.c
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/icmp.h>
+#include <linux/sysctl.h>
+#include <net/ip.h>
+
+#include <linux/netfilter_ipv4.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_protocol.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+DECLARE_PER_CPU(struct nf_conntrack_stat, nf_conntrack_stat);
+
+static int ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
+ struct nf_conntrack_tuple *tuple)
+{
+ u_int32_t _addrs[2], *ap;
+ ap = skb_header_pointer(skb, nhoff + offsetof(struct iphdr, saddr),
+ sizeof(u_int32_t) * 2, _addrs);
+ if (ap == NULL)
+ return 0;
+
+ tuple->src.u3.ip = ap[0];
+ tuple->dst.u3.ip = ap[1];
+
+ return 1;
+}
+
+static int ipv4_invert_tuple(struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_tuple *orig)
+{
+ tuple->src.u3.ip = orig->dst.u3.ip;
+ tuple->dst.u3.ip = orig->src.u3.ip;
+
+ return 1;
+}
+
+static int ipv4_print_tuple(struct seq_file *s,
+ const struct nf_conntrack_tuple *tuple)
+{
+ return seq_printf(s, "src=%u.%u.%u.%u dst=%u.%u.%u.%u ",
+ NIPQUAD(tuple->src.u3.ip),
+ NIPQUAD(tuple->dst.u3.ip));
+}
+
+static int ipv4_print_conntrack(struct seq_file *s,
+ const struct nf_conn *conntrack)
+{
+ return 0;
+}
+
+/* Returns new sk_buff, or NULL */
+static struct sk_buff *
+nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user)
+{
+ skb_orphan(skb);
+
+ local_bh_disable();
+ skb = ip_defrag(skb, user);
+ local_bh_enable();
+
+ if (skb)
+ ip_send_check(skb->nh.iph);
+
+ return skb;
+}
+
+static int
+ipv4_prepare(struct sk_buff **pskb, unsigned int hooknum, unsigned int *dataoff,
+ u_int8_t *protonum)
+{
+ /* Never happen */
+ if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
+ if (net_ratelimit()) {
+ printk(KERN_ERR "ipv4_prepare: Frag of proto %u (hook=%u)\n",
+ (*pskb)->nh.iph->protocol, hooknum);
+ }
+ return -NF_DROP;
+ }
+
+ *dataoff = (*pskb)->nh.raw - (*pskb)->data + (*pskb)->nh.iph->ihl*4;
+ *protonum = (*pskb)->nh.iph->protocol;
+
+ return NF_ACCEPT;
+}
+
+int nat_module_is_loaded = 0;
+static u_int32_t ipv4_get_features(const struct nf_conntrack_tuple *tuple)
+{
+ if (nat_module_is_loaded)
+ return NF_CT_F_NAT;
+
+ return NF_CT_F_BASIC;
+}
+
+static unsigned int ipv4_confirm(unsigned int hooknum,
+ struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ /* We've seen it coming out the other side: confirm it */
+ return nf_conntrack_confirm(pskb);
+}
+
+static unsigned int ipv4_conntrack_help(unsigned int hooknum,
+ struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+
+ /* This is where we call the helper: as the packet goes out. */
+ ct = nf_ct_get(*pskb, &ctinfo);
+ if (ct && ct->helper) {
+ unsigned int ret;
+ ret = ct->helper->help(pskb,
+ (*pskb)->nh.raw - (*pskb)->data
+ + (*pskb)->nh.iph->ihl*4,
+ ct, ctinfo);
+ if (ret != NF_ACCEPT)
+ return ret;
+ }
+ return NF_ACCEPT;
+}
+
+static unsigned int ipv4_conntrack_defrag(unsigned int hooknum,
+ struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+#if !defined(CONFIG_IP_NF_NAT) && !defined(CONFIG_IP_NF_NAT_MODULE)
+ /* Previously seen (loopback)? Ignore. Do this before
+ fragment check. */
+ if ((*pskb)->nfct)
+ return NF_ACCEPT;
+#endif
+
+ /* Gather fragments. */
+ if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
+ *pskb = nf_ct_ipv4_gather_frags(*pskb,
+ hooknum == NF_IP_PRE_ROUTING ?
+ IP_DEFRAG_CONNTRACK_IN :
+ IP_DEFRAG_CONNTRACK_OUT);
+ if (!*pskb)
+ return NF_STOLEN;
+ }
+ return NF_ACCEPT;
+}
+
+static unsigned int ipv4_refrag(unsigned int hooknum,
+ struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ struct rtable *rt = (struct rtable *)(*pskb)->dst;
+
+ /* We've seen it coming out the other side: confirm */
+ if (ipv4_confirm(hooknum, pskb, in, out, okfn) != NF_ACCEPT)
+ return NF_DROP;
+
+ /* Local packets are never produced too large for their
+ interface. We degfragment them at LOCAL_OUT, however,
+ so we have to refragment them here. */
+ if ((*pskb)->len > dst_mtu(&rt->u.dst) &&
+ !skb_shinfo(*pskb)->tso_size) {
+ /* No hook can be after us, so this should be OK. */
+ ip_fragment(*pskb, okfn);
+ return NF_STOLEN;
+ }
+ return NF_ACCEPT;
+}
+
+static unsigned int ipv4_conntrack_in(unsigned int hooknum,
+ struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ return nf_conntrack_in(PF_INET, hooknum, pskb);
+}
+
+static unsigned int ipv4_conntrack_local(unsigned int hooknum,
+ struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ /* root is playing with raw sockets. */
+ if ((*pskb)->len < sizeof(struct iphdr)
+ || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) {
+ if (net_ratelimit())
+ printk("ipt_hook: happy cracking.\n");
+ return NF_ACCEPT;
+ }
+ return nf_conntrack_in(PF_INET, hooknum, pskb);
+}
+
+/* Connection tracking may drop packets, but never alters them, so
+ make it the first hook. */
+static struct nf_hook_ops ipv4_conntrack_defrag_ops = {
+ .hook = ipv4_conntrack_defrag,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_IP_PRE_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK_DEFRAG,
+};
+
+static struct nf_hook_ops ipv4_conntrack_in_ops = {
+ .hook = ipv4_conntrack_in,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_IP_PRE_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK,
+};
+
+static struct nf_hook_ops ipv4_conntrack_defrag_local_out_ops = {
+ .hook = ipv4_conntrack_defrag,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_IP_LOCAL_OUT,
+ .priority = NF_IP_PRI_CONNTRACK_DEFRAG,
+};
+
+static struct nf_hook_ops ipv4_conntrack_local_out_ops = {
+ .hook = ipv4_conntrack_local,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_IP_LOCAL_OUT,
+ .priority = NF_IP_PRI_CONNTRACK,
+};
+
+/* helpers */
+static struct nf_hook_ops ipv4_conntrack_helper_out_ops = {
+ .hook = ipv4_conntrack_help,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_IP_POST_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK_HELPER,
+};
+
+static struct nf_hook_ops ipv4_conntrack_helper_in_ops = {
+ .hook = ipv4_conntrack_help,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_IP_LOCAL_IN,
+ .priority = NF_IP_PRI_CONNTRACK_HELPER,
+};
+
+
+/* Refragmenter; last chance. */
+static struct nf_hook_ops ipv4_conntrack_out_ops = {
+ .hook = ipv4_refrag,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_IP_POST_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
+};
+
+static struct nf_hook_ops ipv4_conntrack_local_in_ops = {
+ .hook = ipv4_confirm,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_IP_LOCAL_IN,
+ .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
+};
+
+#ifdef CONFIG_SYSCTL
+/* From nf_conntrack_proto_icmp.c */
+extern unsigned long nf_ct_icmp_timeout;
+static struct ctl_table_header *nf_ct_ipv4_sysctl_header;
+
+static ctl_table nf_ct_sysctl_table[] = {
+ {
+ .ctl_name = NET_NF_CONNTRACK_ICMP_TIMEOUT,
+ .procname = "nf_conntrack_icmp_timeout",
+ .data = &nf_ct_icmp_timeout,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ { .ctl_name = 0 }
+};
+
+static ctl_table nf_ct_netfilter_table[] = {
+ {
+ .ctl_name = NET_NETFILTER,
+ .procname = "netfilter",
+ .mode = 0555,
+ .child = nf_ct_sysctl_table,
+ },
+ { .ctl_name = 0 }
+};
+
+static ctl_table nf_ct_net_table[] = {
+ {
+ .ctl_name = CTL_NET,
+ .procname = "net",
+ .mode = 0555,
+ .child = nf_ct_netfilter_table,
+ },
+ { .ctl_name = 0 }
+};
+#endif
+
+/* Fast function for those who don't want to parse /proc (and I don't
+ blame them). */
+/* Reversing the socket's dst/src point of view gives us the reply
+ mapping. */
+static int
+getorigdst(struct sock *sk, int optval, void __user *user, int *len)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct nf_conntrack_tuple_hash *h;
+ struct nf_conntrack_tuple tuple;
+
+ NF_CT_TUPLE_U_BLANK(&tuple);
+ tuple.src.u3.ip = inet->rcv_saddr;
+ tuple.src.u.tcp.port = inet->sport;
+ tuple.dst.u3.ip = inet->daddr;
+ tuple.dst.u.tcp.port = inet->dport;
+ tuple.src.l3num = PF_INET;
+ tuple.dst.protonum = IPPROTO_TCP;
+
+ /* We only do TCP at the moment: is there a better way? */
+ if (strcmp(sk->sk_prot->name, "TCP")) {
+ DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
+ return -ENOPROTOOPT;
+ }
+
+ if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
+ DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
+ *len, sizeof(struct sockaddr_in));
+ return -EINVAL;
+ }
+
+ h = nf_conntrack_find_get(&tuple, NULL);
+ if (h) {
+ struct sockaddr_in sin;
+ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+
+ sin.sin_family = AF_INET;
+ sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
+ .tuple.dst.u.tcp.port;
+ sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
+ .tuple.dst.u3.ip;
+
+ DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
+ NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
+ nf_ct_put(ct);
+ if (copy_to_user(user, &sin, sizeof(sin)) != 0)
+ return -EFAULT;
+ else
+ return 0;
+ }
+ DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
+ NIPQUAD(tuple.src.u3.ip), ntohs(tuple.src.u.tcp.port),
+ NIPQUAD(tuple.dst.u3.ip), ntohs(tuple.dst.u.tcp.port));
+ return -ENOENT;
+}
+
+static struct nf_sockopt_ops so_getorigdst = {
+ .pf = PF_INET,
+ .get_optmin = SO_ORIGINAL_DST,
+ .get_optmax = SO_ORIGINAL_DST+1,
+ .get = &getorigdst,
+};
+
+struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 = {
+ .l3proto = PF_INET,
+ .name = "ipv4",
+ .pkt_to_tuple = ipv4_pkt_to_tuple,
+ .invert_tuple = ipv4_invert_tuple,
+ .print_tuple = ipv4_print_tuple,
+ .print_conntrack = ipv4_print_conntrack,
+ .prepare = ipv4_prepare,
+ .get_features = ipv4_get_features,
+ .me = THIS_MODULE,
+};
+
+extern struct nf_conntrack_protocol nf_conntrack_protocol_tcp4;
+extern struct nf_conntrack_protocol nf_conntrack_protocol_udp4;
+extern struct nf_conntrack_protocol nf_conntrack_protocol_icmp;
+static int init_or_cleanup(int init)
+{
+ int ret = 0;
+
+ if (!init) goto cleanup;
+
+ ret = nf_register_sockopt(&so_getorigdst);
+ if (ret < 0) {
+ printk(KERN_ERR "Unable to register netfilter socket option\n");
+ goto cleanup_nothing;
+ }
+
+ ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_tcp4);
+ if (ret < 0) {
+ printk("nf_conntrack_ipv4: can't register tcp.\n");
+ goto cleanup_sockopt;
+ }
+
+ ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_udp4);
+ if (ret < 0) {
+ printk("nf_conntrack_ipv4: can't register udp.\n");
+ goto cleanup_tcp;
+ }
+
+ ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_icmp);
+ if (ret < 0) {
+ printk("nf_conntrack_ipv4: can't register icmp.\n");
+ goto cleanup_udp;
+ }
+
+ ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv4);
+ if (ret < 0) {
+ printk("nf_conntrack_ipv4: can't register ipv4\n");
+ goto cleanup_icmp;
+ }
+
+ ret = nf_register_hook(&ipv4_conntrack_defrag_ops);
+ if (ret < 0) {
+ printk("nf_conntrack_ipv4: can't register pre-routing defrag hook.\n");
+ goto cleanup_ipv4;
+ }
+ ret = nf_register_hook(&ipv4_conntrack_defrag_local_out_ops);
+ if (ret < 0) {
+ printk("nf_conntrack_ipv4: can't register local_out defrag hook.\n");
+ goto cleanup_defragops;
+ }
+
+ ret = nf_register_hook(&ipv4_conntrack_in_ops);
+ if (ret < 0) {
+ printk("nf_conntrack_ipv4: can't register pre-routing hook.\n");
+ goto cleanup_defraglocalops;
+ }
+
+ ret = nf_register_hook(&ipv4_conntrack_local_out_ops);
+ if (ret < 0) {
+ printk("nf_conntrack_ipv4: can't register local out hook.\n");
+ goto cleanup_inops;
+ }
+
+ ret = nf_register_hook(&ipv4_conntrack_helper_in_ops);
+ if (ret < 0) {
+ printk("nf_conntrack_ipv4: can't register local helper hook.\n");
+ goto cleanup_inandlocalops;
+ }
+
+ ret = nf_register_hook(&ipv4_conntrack_helper_out_ops);
+ if (ret < 0) {
+ printk("nf_conntrack_ipv4: can't register postrouting helper hook.\n");
+ goto cleanup_helperinops;
+ }
+
+ ret = nf_register_hook(&ipv4_conntrack_out_ops);
+ if (ret < 0) {
+ printk("nf_conntrack_ipv4: can't register post-routing hook.\n");
+ goto cleanup_helperoutops;
+ }
+
+ ret = nf_register_hook(&ipv4_conntrack_local_in_ops);
+ if (ret < 0) {
+ printk("nf_conntrack_ipv4: can't register local in hook.\n");
+ goto cleanup_inoutandlocalops;
+ }
+
+#ifdef CONFIG_SYSCTL
+ nf_ct_ipv4_sysctl_header = register_sysctl_table(nf_ct_net_table, 0);
+ if (nf_ct_ipv4_sysctl_header == NULL) {
+ printk("nf_conntrack: can't register to sysctl.\n");
+ ret = -ENOMEM;
+ goto cleanup_localinops;
+ }
+#endif
+
+ /* For use by REJECT target */
+ ip_ct_attach = __nf_conntrack_attach;
+
+ return ret;
+
+ cleanup:
+ synchronize_net();
+ ip_ct_attach = NULL;
+#ifdef CONFIG_SYSCTL
+ unregister_sysctl_table(nf_ct_ipv4_sysctl_header);
+ cleanup_localinops:
+#endif
+ nf_unregister_hook(&ipv4_conntrack_local_in_ops);
+ cleanup_inoutandlocalops:
+ nf_unregister_hook(&ipv4_conntrack_out_ops);
+ cleanup_helperoutops:
+ nf_unregister_hook(&ipv4_conntrack_helper_out_ops);
+ cleanup_helperinops:
+ nf_unregister_hook(&ipv4_conntrack_helper_in_ops);
+ cleanup_inandlocalops:
+ nf_unregister_hook(&ipv4_conntrack_local_out_ops);
+ cleanup_inops:
+ nf_unregister_hook(&ipv4_conntrack_in_ops);
+ cleanup_defraglocalops:
+ nf_unregister_hook(&ipv4_conntrack_defrag_local_out_ops);
+ cleanup_defragops:
+ nf_unregister_hook(&ipv4_conntrack_defrag_ops);
+ cleanup_ipv4:
+ nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
+ cleanup_icmp:
+ nf_conntrack_protocol_unregister(&nf_conntrack_protocol_icmp);
+ cleanup_udp:
+ nf_conntrack_protocol_unregister(&nf_conntrack_protocol_udp4);
+ cleanup_tcp:
+ nf_conntrack_protocol_unregister(&nf_conntrack_protocol_tcp4);
+ cleanup_sockopt:
+ nf_unregister_sockopt(&so_getorigdst);
+ cleanup_nothing:
+ return ret;
+}
+
+MODULE_LICENSE("GPL");
+
+static int __init init(void)
+{
+ need_nf_conntrack();
+ return init_or_cleanup(1);
+}
+
+static void __exit fini(void)
+{
+ init_or_cleanup(0);
+}
+
+module_init(init);
+module_exit(fini);
+
+void need_ip_conntrack(void)
+{
+}
+
+EXPORT_SYMBOL(need_ip_conntrack);
+EXPORT_SYMBOL(nf_ct_ipv4_gather_frags);
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
new file mode 100644
index 00000000000..7ddb5c08f7b
--- /dev/null
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -0,0 +1,301 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
+ * - enable working with Layer 3 protocol independent connection tracking.
+ *
+ * Derived from net/ipv4/netfilter/ip_conntrack_proto_icmp.c
+ */
+
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/netfilter.h>
+#include <linux/in.h>
+#include <linux/icmp.h>
+#include <linux/seq_file.h>
+#include <net/ip.h>
+#include <net/checksum.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_protocol.h>
+#include <net/netfilter/nf_conntrack_core.h>
+
+unsigned long nf_ct_icmp_timeout = 30*HZ;
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+static int icmp_pkt_to_tuple(const struct sk_buff *skb,
+ unsigned int dataoff,
+ struct nf_conntrack_tuple *tuple)
+{
+ struct icmphdr _hdr, *hp;
+
+ hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
+ if (hp == NULL)
+ return 0;
+
+ tuple->dst.u.icmp.type = hp->type;
+ tuple->src.u.icmp.id = hp->un.echo.id;
+ tuple->dst.u.icmp.code = hp->code;
+
+ return 1;
+}
+
+static int icmp_invert_tuple(struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_tuple *orig)
+{
+ /* Add 1; spaces filled with 0. */
+ static u_int8_t invmap[]
+ = { [ICMP_ECHO] = ICMP_ECHOREPLY + 1,
+ [ICMP_ECHOREPLY] = ICMP_ECHO + 1,
+ [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1,
+ [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1,
+ [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1,
+ [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1,
+ [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1,
+ [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1};
+
+ if (orig->dst.u.icmp.type >= sizeof(invmap)
+ || !invmap[orig->dst.u.icmp.type])
+ return 0;
+
+ tuple->src.u.icmp.id = orig->src.u.icmp.id;
+ tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1;
+ tuple->dst.u.icmp.code = orig->dst.u.icmp.code;
+ return 1;
+}
+
+/* Print out the per-protocol part of the tuple. */
+static int icmp_print_tuple(struct seq_file *s,
+ const struct nf_conntrack_tuple *tuple)
+{
+ return seq_printf(s, "type=%u code=%u id=%u ",
+ tuple->dst.u.icmp.type,
+ tuple->dst.u.icmp.code,
+ ntohs(tuple->src.u.icmp.id));
+}
+
+/* Print out the private part of the conntrack. */
+static int icmp_print_conntrack(struct seq_file *s,
+ const struct nf_conn *conntrack)
+{
+ return 0;
+}
+
+/* Returns verdict for packet, or -1 for invalid. */
+static int icmp_packet(struct nf_conn *ct,
+ const struct sk_buff *skb,
+ unsigned int dataoff,
+ enum ip_conntrack_info ctinfo,
+ int pf,
+ unsigned int hooknum)
+{
+ /* Try to delete connection immediately after all replies:
+ won't actually vanish as we still have skb, and del_timer
+ means this will only run once even if count hits zero twice
+ (theoretically possible with SMP) */
+ if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) {
+ if (atomic_dec_and_test(&ct->proto.icmp.count)
+ && del_timer(&ct->timeout))
+ ct->timeout.function((unsigned long)ct);
+ } else {
+ atomic_inc(&ct->proto.icmp.count);
+ nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb);
+ nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmp_timeout);
+ }
+
+ return NF_ACCEPT;
+}
+
+/* Called when a new connection for this protocol found. */
+static int icmp_new(struct nf_conn *conntrack,
+ const struct sk_buff *skb, unsigned int dataoff)
+{
+ static u_int8_t valid_new[]
+ = { [ICMP_ECHO] = 1,
+ [ICMP_TIMESTAMP] = 1,
+ [ICMP_INFO_REQUEST] = 1,
+ [ICMP_ADDRESS] = 1 };
+
+ if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new)
+ || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) {
+ /* Can't create a new ICMP `conn' with this. */
+ DEBUGP("icmp: can't create new conn with type %u\n",
+ conntrack->tuplehash[0].tuple.dst.u.icmp.type);
+ NF_CT_DUMP_TUPLE(&conntrack->tuplehash[0].tuple);
+ return 0;
+ }
+ atomic_set(&conntrack->proto.icmp.count, 0);
+ return 1;
+}
+
+extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4;
+/* Returns conntrack if it dealt with ICMP, and filled in skb fields */
+static int
+icmp_error_message(struct sk_buff *skb,
+ enum ip_conntrack_info *ctinfo,
+ unsigned int hooknum)
+{
+ struct nf_conntrack_tuple innertuple, origtuple;
+ struct {
+ struct icmphdr icmp;
+ struct iphdr ip;
+ } _in, *inside;
+ struct nf_conntrack_protocol *innerproto;
+ struct nf_conntrack_tuple_hash *h;
+ int dataoff;
+
+ NF_CT_ASSERT(skb->nfct == NULL);
+
+ /* Not enough header? */
+ inside = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_in), &_in);
+ if (inside == NULL)
+ return -NF_ACCEPT;
+
+ /* Ignore ICMP's containing fragments (shouldn't happen) */
+ if (inside->ip.frag_off & htons(IP_OFFSET)) {
+ DEBUGP("icmp_error_message: fragment of proto %u\n",
+ inside->ip.protocol);
+ return -NF_ACCEPT;
+ }
+
+ innerproto = nf_ct_find_proto(PF_INET, inside->ip.protocol);
+ dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp);
+ /* Are they talking about one of our connections? */
+ if (!nf_ct_get_tuple(skb, dataoff, dataoff + inside->ip.ihl*4, PF_INET,
+ inside->ip.protocol, &origtuple,
+ &nf_conntrack_l3proto_ipv4, innerproto)) {
+ DEBUGP("icmp_error_message: ! get_tuple p=%u",
+ inside->ip.protocol);
+ return -NF_ACCEPT;
+ }
+
+ /* Ordinarily, we'd expect the inverted tupleproto, but it's
+ been preserved inside the ICMP. */
+ if (!nf_ct_invert_tuple(&innertuple, &origtuple,
+ &nf_conntrack_l3proto_ipv4, innerproto)) {
+ DEBUGP("icmp_error_message: no match\n");
+ return -NF_ACCEPT;
+ }
+
+ *ctinfo = IP_CT_RELATED;
+
+ h = nf_conntrack_find_get(&innertuple, NULL);
+ if (!h) {
+ /* Locally generated ICMPs will match inverted if they
+ haven't been SNAT'ed yet */
+ /* FIXME: NAT code has to handle half-done double NAT --RR */
+ if (hooknum == NF_IP_LOCAL_OUT)
+ h = nf_conntrack_find_get(&origtuple, NULL);
+
+ if (!h) {
+ DEBUGP("icmp_error_message: no match\n");
+ return -NF_ACCEPT;
+ }
+
+ /* Reverse direction from that found */
+ if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
+ *ctinfo += IP_CT_IS_REPLY;
+ } else {
+ if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
+ *ctinfo += IP_CT_IS_REPLY;
+ }
+
+ /* Update skb to refer to this connection */
+ skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general;
+ skb->nfctinfo = *ctinfo;
+ return -NF_ACCEPT;
+}
+
+/* Small and modified version of icmp_rcv */
+static int
+icmp_error(struct sk_buff *skb, unsigned int dataoff,
+ enum ip_conntrack_info *ctinfo, int pf, unsigned int hooknum)
+{
+ struct icmphdr _ih, *icmph;
+
+ /* Not enough header? */
+ icmph = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_ih), &_ih);
+ if (icmph == NULL) {
+ if (LOG_INVALID(IPPROTO_ICMP))
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
+ "nf_ct_icmp: short packet ");
+ return -NF_ACCEPT;
+ }
+
+ /* See ip_conntrack_proto_tcp.c */
+ if (hooknum != NF_IP_PRE_ROUTING)
+ goto checksum_skipped;
+
+ switch (skb->ip_summed) {
+ case CHECKSUM_HW:
+ if (!(u16)csum_fold(skb->csum))
+ break;
+ if (LOG_INVALID(IPPROTO_ICMP))
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
+ "nf_ct_icmp: bad HW ICMP checksum ");
+ return -NF_ACCEPT;
+ case CHECKSUM_NONE:
+ if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) {
+ if (LOG_INVALID(IPPROTO_ICMP))
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+ NULL,
+ "nf_ct_icmp: bad ICMP checksum ");
+ return -NF_ACCEPT;
+ }
+ default:
+ break;
+ }
+
+checksum_skipped:
+ /*
+ * 18 is the highest 'known' ICMP type. Anything else is a mystery
+ *
+ * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently
+ * discarded.
+ */
+ if (icmph->type > NR_ICMP_TYPES) {
+ if (LOG_INVALID(IPPROTO_ICMP))
+ nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
+ "nf_ct_icmp: invalid ICMP type ");
+ return -NF_ACCEPT;
+ }
+
+ /* Need to track icmp error message? */
+ if (icmph->type != ICMP_DEST_UNREACH
+ && icmph->type != ICMP_SOURCE_QUENCH
+ && icmph->type != ICMP_TIME_EXCEEDED
+ && icmph->type != ICMP_PARAMETERPROB
+ && icmph->type != ICMP_REDIRECT)
+ return NF_ACCEPT;
+
+ return icmp_error_message(skb, ctinfo, hooknum);
+}
+
+struct nf_conntrack_protocol nf_conntrack_protocol_icmp =
+{
+ .list = { NULL, NULL },
+ .l3proto = PF_INET,
+ .proto = IPPROTO_ICMP,
+ .name = "icmp",
+ .pkt_to_tuple = icmp_pkt_to_tuple,
+ .invert_tuple = icmp_invert_tuple,
+ .print_tuple = icmp_print_tuple,
+ .print_conntrack = icmp_print_conntrack,
+ .packet = icmp_packet,
+ .new = icmp_new,
+ .error = icmp_error,
+ .destroy = NULL,
+ .me = NULL
+};
+
+EXPORT_SYMBOL(nf_conntrack_protocol_icmp);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 65268562351..01444a02b48 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -645,6 +645,14 @@ ctl_table ipv4_table[] = {
.proc_handler = &proc_tcp_congestion_control,
.strategy = &sysctl_tcp_congestion_control,
},
+ {
+ .ctl_name = NET_TCP_ABC,
+ .procname = "tcp_abc",
+ .data = &sysctl_tcp_abc,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
{ .ctl_name = 0 }
};
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f3f0013a958..9ac7a4f46bd 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1640,7 +1640,7 @@ int tcp_disconnect(struct sock *sk, int flags)
} else if (tcp_need_reset(old_state) ||
(tp->snd_nxt != tp->write_seq &&
(1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
- /* The last check adjusts for discrepance of Linux wrt. RFC
+ /* The last check adjusts for discrepancy of Linux wrt. RFC
* states
*/
tcp_send_active_reset(sk, gfp_any());
@@ -1669,6 +1669,7 @@ int tcp_disconnect(struct sock *sk, int flags)
tp->packets_out = 0;
tp->snd_ssthresh = 0x7fffffff;
tp->snd_cwnd_cnt = 0;
+ tp->bytes_acked = 0;
tcp_set_ca_state(sk, TCP_CA_Open);
tcp_clear_retrans(tp);
inet_csk_delack_init(sk);
@@ -2112,7 +2113,6 @@ void __init tcp_init(void)
sysctl_tcp_max_orphans >>= (3 - order);
sysctl_max_syn_backlog = 128;
}
- tcp_hashinfo.port_rover = sysctl_local_port_range[0] - 1;
sysctl_tcp_mem[0] = 768 << order;
sysctl_tcp_mem[1] = 1024 << order;
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index ae35e060904..1d0cd86621b 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -217,17 +217,15 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack,
bictcp_low_utilization(sk, data_acked);
- if (in_flight < tp->snd_cwnd)
+ if (!tcp_is_cwnd_limited(sk, in_flight))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh) {
- /* In "safe" area, increase. */
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- } else {
+ if (tp->snd_cwnd <= tp->snd_ssthresh)
+ tcp_slow_start(tp);
+ else {
bictcp_update(ca, tp->snd_cwnd);
- /* In dangerous area, increase slowly.
+ /* In dangerous area, increase slowly.
* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
*/
if (tp->snd_cwnd_cnt >= ca->cnt) {
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index bbf2d6624e8..c7cc62c8dc1 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -186,24 +186,32 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight,
{
struct tcp_sock *tp = tcp_sk(sk);
- if (in_flight < tp->snd_cwnd)
+ if (!tcp_is_cwnd_limited(sk, in_flight))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh) {
- /* In "safe" area, increase. */
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- } else {
- /* In dangerous area, increase slowly.
- * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
- */
- if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- tp->snd_cwnd_cnt = 0;
- } else
- tp->snd_cwnd_cnt++;
- }
+ /* In "safe" area, increase. */
+ if (tp->snd_cwnd <= tp->snd_ssthresh)
+ tcp_slow_start(tp);
+
+ /* In dangerous area, increase slowly. */
+ else if (sysctl_tcp_abc) {
+ /* RFC3465: Apppriate Byte Count
+ * increase once for each full cwnd acked
+ */
+ if (tp->bytes_acked >= tp->snd_cwnd*tp->mss_cache) {
+ tp->bytes_acked -= tp->snd_cwnd*tp->mss_cache;
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+ tp->snd_cwnd++;
+ }
+ } else {
+ /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */
+ if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+ tp->snd_cwnd++;
+ tp->snd_cwnd_cnt = 0;
+ } else
+ tp->snd_cwnd_cnt++;
+ }
}
EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index 6acc04bde08..63cf7e54084 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -111,18 +111,17 @@ static void hstcp_init(struct sock *sk)
}
static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt,
- u32 in_flight, int good)
+ u32 in_flight, int data_acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct hstcp *ca = inet_csk_ca(sk);
- if (in_flight < tp->snd_cwnd)
+ if (!tcp_is_cwnd_limited(sk, in_flight))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh) {
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- } else {
+ if (tp->snd_cwnd <= tp->snd_ssthresh)
+ tcp_slow_start(tp);
+ else {
/* Update AIMD parameters */
if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) {
while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd &&
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index e47b37984e9..3284cfb993e 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -207,14 +207,13 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
struct tcp_sock *tp = tcp_sk(sk);
struct htcp *ca = inet_csk_ca(sk);
- if (in_flight < tp->snd_cwnd)
+ if (!tcp_is_cwnd_limited(sk, in_flight))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh) {
- /* In "safe" area, increase. */
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- } else {
+ if (tp->snd_cwnd <= tp->snd_ssthresh)
+ tcp_slow_start(tp);
+ else {
+
measure_rtt(sk);
/* keep track of number of round-trip times since last backoff event */
@@ -224,7 +223,7 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
htcp_alpha_update(ca);
}
- /* In dangerous area, increase slowly.
+ /* In dangerous area, increase slowly.
* In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd
*/
if ((tp->snd_cwnd_cnt++ * ca->alpha)>>7 >= tp->snd_cwnd) {
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index 77add63623d..40dbb387751 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -100,12 +100,12 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
ca->minrtt = tp->srtt;
}
+ if (!tcp_is_cwnd_limited(sk, in_flight))
+ return;
+
if (!ca->hybla_en)
return tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag);
- if (in_flight < tp->snd_cwnd)
- return;
-
if (ca->rho == 0)
hybla_recalc_param(sk);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3e98b57578d..bf2e23086bc 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -42,7 +42,7 @@
* Andi Kleen : Moved open_request checking here
* and process RSTs for open_requests.
* Andi Kleen : Better prune_queue, and other fixes.
- * Andrey Savochkin: Fix RTT measurements in the presnce of
+ * Andrey Savochkin: Fix RTT measurements in the presence of
* timestamps.
* Andrey Savochkin: Check sequence numbers correctly when
* removing SACKs due to in sequence incoming
@@ -89,6 +89,7 @@ int sysctl_tcp_frto;
int sysctl_tcp_nometrics_save;
int sysctl_tcp_moderate_rcvbuf = 1;
+int sysctl_tcp_abc = 1;
#define FLAG_DATA 0x01 /* Incoming frame contained data. */
#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
@@ -223,7 +224,7 @@ static void tcp_fixup_sndbuf(struct sock *sk)
* of receiver window. Check #2.
*
* The scheme does not work when sender sends good segments opening
- * window and then starts to feed us spagetti. But it should work
+ * window and then starts to feed us spaghetti. But it should work
* in common situations. Otherwise, we have to rely on queue collapsing.
*/
@@ -233,7 +234,7 @@ static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp,
{
/* Optimize this! */
int truesize = tcp_win_from_space(skb->truesize)/2;
- int window = tcp_full_space(sk)/2;
+ int window = tcp_win_from_space(sysctl_tcp_rmem[2])/2;
while (tp->rcv_ssthresh <= window) {
if (truesize <= skb->len)
@@ -277,7 +278,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff);
/* Try to select rcvbuf so that 4 mss-sized segments
- * will fit to window and correspoding skbs will fit to our rcvbuf.
+ * will fit to window and corresponding skbs will fit to our rcvbuf.
* (was 3; 4 is minimum to allow fast retransmit to work.)
*/
while (tcp_win_from_space(rcvmem) < tp->advmss)
@@ -286,7 +287,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]);
}
-/* 4. Try to fixup all. It is made iimediately after connection enters
+/* 4. Try to fixup all. It is made immediately after connection enters
* established state.
*/
static void tcp_init_buffer_space(struct sock *sk)
@@ -326,37 +327,18 @@ static void tcp_init_buffer_space(struct sock *sk)
static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- struct sk_buff *skb;
- unsigned int app_win = tp->rcv_nxt - tp->copied_seq;
- int ofo_win = 0;
icsk->icsk_ack.quick = 0;
- skb_queue_walk(&tp->out_of_order_queue, skb) {
- ofo_win += skb->len;
- }
-
- /* If overcommit is due to out of order segments,
- * do not clamp window. Try to expand rcvbuf instead.
- */
- if (ofo_win) {
- if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
- !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
- !tcp_memory_pressure &&
- atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0])
- sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
- sysctl_tcp_rmem[2]);
+ if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
+ !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
+ !tcp_memory_pressure &&
+ atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
+ sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
+ sysctl_tcp_rmem[2]);
}
- if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
- app_win += ofo_win;
- if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf)
- app_win >>= 1;
- if (app_win > icsk->icsk_ack.rcv_mss)
- app_win -= icsk->icsk_ack.rcv_mss;
- app_win = max(app_win, 2U*tp->advmss);
-
+ if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss);
- }
}
/* Receiver "autotuning" code.
@@ -385,8 +367,8 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
* are stalled on filesystem I/O.
*
* Also, since we are only going for a minimum in the
- * non-timestamp case, we do not smoothe things out
- * else with timestamps disabled convergance takes too
+ * non-timestamp case, we do not smooth things out
+ * else with timestamps disabled convergence takes too
* long.
*/
if (!win_dep) {
@@ -395,7 +377,7 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
} else if (m < new_sample)
new_sample = m << 3;
} else {
- /* No previous mesaure. */
+ /* No previous measure. */
new_sample = m << 3;
}
@@ -524,7 +506,7 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
if (icsk->icsk_ack.ato > icsk->icsk_rto)
icsk->icsk_ack.ato = icsk->icsk_rto;
} else if (m > icsk->icsk_rto) {
- /* Too long gap. Apparently sender falled to
+ /* Too long gap. Apparently sender failed to
* restart window, so that we send ACKs quickly.
*/
tcp_incr_quickack(sk);
@@ -548,10 +530,9 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
* To save cycles in the RFC 1323 implementation it was better to break
* it up into three procedures. -- erics
*/
-static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt)
+static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
{
struct tcp_sock *tp = tcp_sk(sk);
- const struct inet_connection_sock *icsk = inet_csk(sk);
long m = mrtt; /* RTT */
/* The following amusing code comes from Jacobson's
@@ -565,7 +546,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt)
*
* Funny. This algorithm seems to be very broken.
* These formulae increase RTO, when it should be decreased, increase
- * too slowly, when it should be incresed fastly, decrease too fastly
+ * too slowly, when it should be increased quickly, decrease too quickly
* etc. I guess in BSD RTO takes ONE value, so that it is absolutely
* does not matter how to _calculate_ it. Seems, it was trap
* that VJ failed to avoid. 8)
@@ -610,9 +591,6 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt)
tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN);
tp->rtt_seq = tp->snd_nxt;
}
-
- if (icsk->icsk_ca_ops->rtt_sample)
- icsk->icsk_ca_ops->rtt_sample(sk, *usrtt);
}
/* Calculate rto without backoff. This is the second half of Van Jacobson's
@@ -629,14 +607,14 @@ static inline void tcp_set_rto(struct sock *sk)
* at least by solaris and freebsd. "Erratic ACKs" has _nothing_
* to do with delayed acks, because at cwnd>2 true delack timeout
* is invisible. Actually, Linux-2.4 also generates erratic
- * ACKs in some curcumstances.
+ * ACKs in some circumstances.
*/
inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar;
/* 2. Fixups made earlier cannot be right.
* If we do not estimate RTO correctly without them,
* all the algo is pure shit and should be replaced
- * with correct one. It is exaclty, which we pretend to do.
+ * with correct one. It is exactly, which we pretend to do.
*/
}
@@ -794,7 +772,7 @@ static void tcp_init_metrics(struct sock *sk)
* to make it more realistic.
*
* A bit of theory. RTT is time passed after "normal" sized packet
- * is sent until it is ACKed. In normal curcumstances sending small
+ * is sent until it is ACKed. In normal circumstances sending small
* packets force peer to delay ACKs and calculation is correct too.
* The algorithm is adaptive and, provided we follow specs, it
* NEVER underestimate RTT. BUT! If peer tries to make some clever
@@ -919,18 +897,32 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
int prior_fackets;
u32 lost_retrans = 0;
int flag = 0;
+ int dup_sack = 0;
int i;
if (!tp->sacked_out)
tp->fackets_out = 0;
prior_fackets = tp->fackets_out;
- for (i=0; i<num_sacks; i++, sp++) {
- struct sk_buff *skb;
- __u32 start_seq = ntohl(sp->start_seq);
- __u32 end_seq = ntohl(sp->end_seq);
- int fack_count = 0;
- int dup_sack = 0;
+ /* SACK fastpath:
+ * if the only SACK change is the increase of the end_seq of
+ * the first block then only apply that SACK block
+ * and use retrans queue hinting otherwise slowpath */
+ flag = 1;
+ for (i = 0; i< num_sacks; i++) {
+ __u32 start_seq = ntohl(sp[i].start_seq);
+ __u32 end_seq = ntohl(sp[i].end_seq);
+
+ if (i == 0){
+ if (tp->recv_sack_cache[i].start_seq != start_seq)
+ flag = 0;
+ } else {
+ if ((tp->recv_sack_cache[i].start_seq != start_seq) ||
+ (tp->recv_sack_cache[i].end_seq != end_seq))
+ flag = 0;
+ }
+ tp->recv_sack_cache[i].start_seq = start_seq;
+ tp->recv_sack_cache[i].end_seq = end_seq;
/* Check for D-SACK. */
if (i == 0) {
@@ -962,15 +954,58 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
if (before(ack, prior_snd_una - tp->max_window))
return 0;
}
+ }
+
+ if (flag)
+ num_sacks = 1;
+ else {
+ int j;
+ tp->fastpath_skb_hint = NULL;
+
+ /* order SACK blocks to allow in order walk of the retrans queue */
+ for (i = num_sacks-1; i > 0; i--) {
+ for (j = 0; j < i; j++){
+ if (after(ntohl(sp[j].start_seq),
+ ntohl(sp[j+1].start_seq))){
+ sp[j].start_seq = htonl(tp->recv_sack_cache[j+1].start_seq);
+ sp[j].end_seq = htonl(tp->recv_sack_cache[j+1].end_seq);
+ sp[j+1].start_seq = htonl(tp->recv_sack_cache[j].start_seq);
+ sp[j+1].end_seq = htonl(tp->recv_sack_cache[j].end_seq);
+ }
+
+ }
+ }
+ }
+
+ /* clear flag as used for different purpose in following code */
+ flag = 0;
+
+ for (i=0; i<num_sacks; i++, sp++) {
+ struct sk_buff *skb;
+ __u32 start_seq = ntohl(sp->start_seq);
+ __u32 end_seq = ntohl(sp->end_seq);
+ int fack_count;
+
+ /* Use SACK fastpath hint if valid */
+ if (tp->fastpath_skb_hint) {
+ skb = tp->fastpath_skb_hint;
+ fack_count = tp->fastpath_cnt_hint;
+ } else {
+ skb = sk->sk_write_queue.next;
+ fack_count = 0;
+ }
/* Event "B" in the comment above. */
if (after(end_seq, tp->high_seq))
flag |= FLAG_DATA_LOST;
- sk_stream_for_retrans_queue(skb, sk) {
+ sk_stream_for_retrans_queue_from(skb, sk) {
int in_sack, pcount;
u8 sacked;
+ tp->fastpath_skb_hint = skb;
+ tp->fastpath_cnt_hint = fack_count;
+
/* The retransmission queue is always in order, so
* we can short-circuit the walk early.
*/
@@ -1045,6 +1080,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
tp->lost_out -= tcp_skb_pcount(skb);
tp->retrans_out -= tcp_skb_pcount(skb);
+
+ /* clear lost hint */
+ tp->retransmit_skb_hint = NULL;
}
} else {
/* New sack for not retransmitted frame,
@@ -1057,6 +1095,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
if (sacked & TCPCB_LOST) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
tp->lost_out -= tcp_skb_pcount(skb);
+
+ /* clear lost hint */
+ tp->retransmit_skb_hint = NULL;
}
}
@@ -1080,6 +1121,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= tcp_skb_pcount(skb);
+ tp->retransmit_skb_hint = NULL;
}
}
}
@@ -1107,6 +1149,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= tcp_skb_pcount(skb);
+ /* clear lost hint */
+ tp->retransmit_skb_hint = NULL;
+
if (!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) {
tp->lost_out += tcp_skb_pcount(skb);
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
@@ -1214,6 +1259,8 @@ static void tcp_enter_frto_loss(struct sock *sk)
tcp_set_ca_state(sk, TCP_CA_Loss);
tp->high_seq = tp->frto_highmark;
TCP_ECN_queue_cwr(tp);
+
+ clear_all_retrans_hints(tp);
}
void tcp_clear_retrans(struct tcp_sock *tp)
@@ -1251,6 +1298,7 @@ void tcp_enter_loss(struct sock *sk, int how)
tp->snd_cwnd_cnt = 0;
tp->snd_cwnd_stamp = tcp_time_stamp;
+ tp->bytes_acked = 0;
tcp_clear_retrans(tp);
/* Push undo marker, if it was plain RTO and nothing
@@ -1279,6 +1327,8 @@ void tcp_enter_loss(struct sock *sk, int how)
tcp_set_ca_state(sk, TCP_CA_Loss);
tp->high_seq = tp->snd_nxt;
TCP_ECN_queue_cwr(tp);
+
+ clear_all_retrans_hints(tp);
}
static int tcp_check_sack_reneging(struct sock *sk)
@@ -1503,17 +1553,37 @@ static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp,
int packets, u32 high_seq)
{
struct sk_buff *skb;
- int cnt = packets;
+ int cnt;
- BUG_TRAP(cnt <= tp->packets_out);
+ BUG_TRAP(packets <= tp->packets_out);
+ if (tp->lost_skb_hint) {
+ skb = tp->lost_skb_hint;
+ cnt = tp->lost_cnt_hint;
+ } else {
+ skb = sk->sk_write_queue.next;
+ cnt = 0;
+ }
- sk_stream_for_retrans_queue(skb, sk) {
- cnt -= tcp_skb_pcount(skb);
- if (cnt < 0 || after(TCP_SKB_CB(skb)->end_seq, high_seq))
+ sk_stream_for_retrans_queue_from(skb, sk) {
+ /* TODO: do this better */
+ /* this is not the most efficient way to do this... */
+ tp->lost_skb_hint = skb;
+ tp->lost_cnt_hint = cnt;
+ cnt += tcp_skb_pcount(skb);
+ if (cnt > packets || after(TCP_SKB_CB(skb)->end_seq, high_seq))
break;
if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
+
+ /* clear xmit_retransmit_queue hints
+ * if this is beyond hint */
+ if(tp->retransmit_skb_hint != NULL &&
+ before(TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) {
+
+ tp->retransmit_skb_hint = NULL;
+ }
}
}
tcp_sync_left_out(tp);
@@ -1540,13 +1610,28 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp)
if (tcp_head_timedout(sk, tp)) {
struct sk_buff *skb;
- sk_stream_for_retrans_queue(skb, sk) {
- if (tcp_skb_timedout(sk, skb) &&
- !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
+ skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint
+ : sk->sk_write_queue.next;
+
+ sk_stream_for_retrans_queue_from(skb, sk) {
+ if (!tcp_skb_timedout(sk, skb))
+ break;
+
+ if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
+
+ /* clear xmit_retrans hint */
+ if (tp->retransmit_skb_hint &&
+ before(TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
+
+ tp->retransmit_skb_hint = NULL;
}
}
+
+ tp->scoreboard_skb_hint = skb;
+
tcp_sync_left_out(tp);
}
}
@@ -1626,6 +1711,10 @@ static void tcp_undo_cwr(struct sock *sk, const int undo)
}
tcp_moderate_cwnd(tp);
tp->snd_cwnd_stamp = tcp_time_stamp;
+
+ /* There is something screwy going on with the retrans hints after
+ an undo */
+ clear_all_retrans_hints(tp);
}
static inline int tcp_may_undo(struct tcp_sock *tp)
@@ -1709,6 +1798,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp)
sk_stream_for_retrans_queue(skb, sk) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
}
+
+ clear_all_retrans_hints(tp);
+
DBGUNDO(sk, tp, "partial loss");
tp->lost_out = 0;
tp->left_out = tp->sacked_out;
@@ -1908,6 +2000,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
TCP_ECN_queue_cwr(tp);
}
+ tp->bytes_acked = 0;
tp->snd_cwnd_cnt = 0;
tcp_set_ca_state(sk, TCP_CA_Recovery);
}
@@ -1919,9 +2012,9 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
}
/* Read draft-ietf-tcplw-high-performance before mucking
- * with this code. (Superceeds RFC1323)
+ * with this code. (Supersedes RFC1323)
*/
-static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag)
+static void tcp_ack_saw_tstamp(struct sock *sk, int flag)
{
/* RTTM Rule: A TSecr value received in a segment is used to
* update the averaged RTT measurement only if the segment
@@ -1932,7 +2025,7 @@ static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag)
* 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
*
* Changed: reset backoff as soon as we see the first valid sample.
- * If we do not, we get strongly overstimated rto. With timestamps
+ * If we do not, we get strongly overestimated rto. With timestamps
* samples are accepted even from very old segments: f.e., when rtt=1
* increases to 8, we retransmit 5 times and after 8 seconds delayed
* answer arrives rto becomes 120 seconds! If at least one of segments
@@ -1940,13 +2033,13 @@ static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag)
*/
struct tcp_sock *tp = tcp_sk(sk);
const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
- tcp_rtt_estimator(sk, seq_rtt, usrtt);
+ tcp_rtt_estimator(sk, seq_rtt);
tcp_set_rto(sk);
inet_csk(sk)->icsk_backoff = 0;
tcp_bound_rto(sk);
}
-static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag)
+static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag)
{
/* We don't have a timestamp. Can only use
* packets that are not retransmitted to determine
@@ -1960,21 +2053,21 @@ static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag
if (flag & FLAG_RETRANS_DATA_ACKED)
return;
- tcp_rtt_estimator(sk, seq_rtt, usrtt);
+ tcp_rtt_estimator(sk, seq_rtt);
tcp_set_rto(sk);
inet_csk(sk)->icsk_backoff = 0;
tcp_bound_rto(sk);
}
static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
- const s32 seq_rtt, u32 *usrtt)
+ const s32 seq_rtt)
{
const struct tcp_sock *tp = tcp_sk(sk);
/* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
- tcp_ack_saw_tstamp(sk, usrtt, flag);
+ tcp_ack_saw_tstamp(sk, flag);
else if (seq_rtt >= 0)
- tcp_ack_no_tstamp(sk, seq_rtt, usrtt, flag);
+ tcp_ack_no_tstamp(sk, seq_rtt, flag);
}
static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
@@ -2054,20 +2147,27 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
return acked;
}
+static inline u32 tcp_usrtt(const struct sk_buff *skb)
+{
+ struct timeval tv, now;
+
+ do_gettimeofday(&now);
+ skb_get_timestamp(skb, &tv);
+ return (now.tv_sec - tv.tv_sec) * 1000000 + (now.tv_usec - tv.tv_usec);
+}
/* Remove acknowledged frames from the retransmission queue. */
-static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt)
+static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
{
struct tcp_sock *tp = tcp_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
struct sk_buff *skb;
__u32 now = tcp_time_stamp;
int acked = 0;
__s32 seq_rtt = -1;
- struct timeval usnow;
u32 pkts_acked = 0;
-
- if (seq_usrtt)
- do_gettimeofday(&usnow);
+ void (*rtt_sample)(struct sock *sk, u32 usrtt)
+ = icsk->icsk_ca_ops->rtt_sample;
while ((skb = skb_peek(&sk->sk_write_queue)) &&
skb != sk->sk_send_head) {
@@ -2107,16 +2207,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
tp->retrans_out -= tcp_skb_pcount(skb);
acked |= FLAG_RETRANS_DATA_ACKED;
seq_rtt = -1;
- } else if (seq_rtt < 0)
+ } else if (seq_rtt < 0) {
seq_rtt = now - scb->when;
- if (seq_usrtt) {
- struct timeval tv;
-
- skb_get_timestamp(skb, &tv);
- *seq_usrtt = (usnow.tv_sec - tv.tv_sec) * 1000000
- + (usnow.tv_usec - tv.tv_usec);
+ if (rtt_sample)
+ (*rtt_sample)(sk, tcp_usrtt(skb));
}
-
if (sacked & TCPCB_SACKED_ACKED)
tp->sacked_out -= tcp_skb_pcount(skb);
if (sacked & TCPCB_LOST)
@@ -2126,17 +2221,20 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
!before(scb->end_seq, tp->snd_up))
tp->urg_mode = 0;
}
- } else if (seq_rtt < 0)
+ } else if (seq_rtt < 0) {
seq_rtt = now - scb->when;
+ if (rtt_sample)
+ (*rtt_sample)(sk, tcp_usrtt(skb));
+ }
tcp_dec_pcount_approx(&tp->fackets_out, skb);
tcp_packets_out_dec(tp, skb);
__skb_unlink(skb, &sk->sk_write_queue);
sk_stream_free_skb(sk, skb);
+ clear_all_retrans_hints(tp);
}
if (acked&FLAG_ACKED) {
- const struct inet_connection_sock *icsk = inet_csk(sk);
- tcp_ack_update_rtt(sk, acked, seq_rtt, seq_usrtt);
+ tcp_ack_update_rtt(sk, acked, seq_rtt);
tcp_ack_packets_out(sk, tp);
if (icsk->icsk_ca_ops->pkts_acked)
@@ -2284,7 +2382,7 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una)
}
/* F-RTO affects on two new ACKs following RTO.
- * At latest on third ACK the TCP behavor is back to normal.
+ * At latest on third ACK the TCP behavior is back to normal.
*/
tp->frto_counter = (tp->frto_counter + 1) % 3;
}
@@ -2299,7 +2397,6 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
u32 ack = TCP_SKB_CB(skb)->ack_seq;
u32 prior_in_flight;
s32 seq_rtt;
- s32 seq_usrtt = 0;
int prior_packets;
/* If the ack is newer than sent or older than previous acks
@@ -2311,6 +2408,9 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
if (before(ack, prior_snd_una))
goto old_ack;
+ if (sysctl_tcp_abc && icsk->icsk_ca_state < TCP_CA_CWR)
+ tp->bytes_acked += ack - prior_snd_una;
+
if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
/* Window is constant, pure forward advance.
* No more checks are required.
@@ -2352,14 +2452,13 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
prior_in_flight = tcp_packets_in_flight(tp);
/* See if we can take anything off of the retransmit queue. */
- flag |= tcp_clean_rtx_queue(sk, &seq_rtt,
- icsk->icsk_ca_ops->rtt_sample ? &seq_usrtt : NULL);
+ flag |= tcp_clean_rtx_queue(sk, &seq_rtt);
if (tp->frto_counter)
tcp_process_frto(sk, prior_snd_una);
if (tcp_ack_is_dubious(sk, flag)) {
- /* Advanve CWND, if state allows this. */
+ /* Advance CWND, if state allows this. */
if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag))
tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0);
tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag);
@@ -3148,7 +3247,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list,
{
struct sk_buff *skb;
- /* First, check that queue is collapsable and find
+ /* First, check that queue is collapsible and find
* the point where collapsing can be useful. */
for (skb = head; skb != tail; ) {
/* No new bits? It is possible on ofo queue. */
@@ -3456,7 +3555,7 @@ static __inline__ void tcp_ack_snd_check(struct sock *sk)
/*
* This routine is only called when we have urgent data
- * signalled. Its the 'slow' part of tcp_urg. It could be
+ * signaled. Its the 'slow' part of tcp_urg. It could be
* moved inline now as tcp_urg is only called from one
* place. We handle URGent data wrong. We have to - as
* BSD still doesn't use the correction from RFC961.
@@ -3501,7 +3600,7 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
* urgent. To do this requires some care. We cannot just ignore
* tp->copied_seq since we would read the last urgent byte again
* as data, nor can we alter copied_seq until this data arrives
- * or we break the sematics of SIOCATMARK (and thus sockatmark())
+ * or we break the semantics of SIOCATMARK (and thus sockatmark())
*
* NOTE. Double Dutch. Rendering to plain English: author of comment
* above did something sort of send("A", MSG_OOB); send("B", MSG_OOB);
@@ -3646,7 +3745,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
tp->rx_opt.saw_tstamp = 0;
/* pred_flags is 0xS?10 << 16 + snd_wnd
- * if header_predition is to be made
+ * if header_prediction is to be made
* 'S' will always be tp->tcp_header_len >> 2
* '?' will be 0 for the fast path, otherwise pred_flags is 0 to
* turn it off (when there are holes in the receive
@@ -4242,7 +4341,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
*/
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
!tp->srtt)
- tcp_ack_saw_tstamp(sk, NULL, 0);
+ tcp_ack_saw_tstamp(sk, 0);
if (tp->rx_opt.tstamp_ok)
tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
@@ -4372,6 +4471,7 @@ discard:
EXPORT_SYMBOL(sysctl_tcp_ecn);
EXPORT_SYMBOL(sysctl_tcp_reordering);
+EXPORT_SYMBOL(sysctl_tcp_abc);
EXPORT_SYMBOL(tcp_parse_options);
EXPORT_SYMBOL(tcp_rcv_established);
EXPORT_SYMBOL(tcp_rcv_state_process);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index c85819d8474..4d5021e1929 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -39,7 +39,7 @@
* request_sock handling and moved
* most of it into the af independent code.
* Added tail drop and some other bugfixes.
- * Added new listen sematics.
+ * Added new listen semantics.
* Mike McLagan : Routing by source
* Juan Jose Ciarlante: ip_dynaddr bits
* Andi Kleen: various fixes.
@@ -93,8 +93,6 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
.lhash_lock = RW_LOCK_UNLOCKED,
.lhash_users = ATOMIC_INIT(0),
.lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
- .portalloc_lock = SPIN_LOCK_UNLOCKED,
- .port_rover = 1024 - 1,
};
static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
@@ -825,8 +823,7 @@ out:
*/
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
- if (inet_rsk(req)->opt)
- kfree(inet_rsk(req)->opt);
+ kfree(inet_rsk(req)->opt);
}
static inline void syn_flood_warning(struct sk_buff *skb)
@@ -1113,24 +1110,18 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
static int tcp_v4_checksum_init(struct sk_buff *skb)
{
if (skb->ip_summed == CHECKSUM_HW) {
- skb->ip_summed = CHECKSUM_UNNECESSARY;
if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
- skb->nh.iph->daddr, skb->csum))
+ skb->nh.iph->daddr, skb->csum)) {
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
return 0;
-
- LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v4 csum failed\n");
- skb->ip_summed = CHECKSUM_NONE;
+ }
}
+
+ skb->csum = csum_tcpudp_nofold(skb->nh.iph->saddr, skb->nh.iph->daddr,
+ skb->len, IPPROTO_TCP, 0);
+
if (skb->len <= 76) {
- if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
- skb->nh.iph->daddr,
- skb_checksum(skb, 0, skb->len, 0)))
- return -1;
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- } else {
- skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
- skb->nh.iph->saddr,
- skb->nh.iph->daddr, 0);
+ return __skb_checksum_complete(skb);
}
return 0;
}
@@ -1219,10 +1210,10 @@ int tcp_v4_rcv(struct sk_buff *skb)
/* An explanation is required here, I think.
* Packet length and doff are validated by header prediction,
- * provided case of th->doff==0 is elimineted.
+ * provided case of th->doff==0 is eliminated.
* So, we defer the checks. */
if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
- tcp_v4_checksum_init(skb) < 0))
+ tcp_v4_checksum_init(skb)))
goto bad_packet;
th = skb->h.th;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index b1a63b2c6b4..1b66a2ac432 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -158,7 +158,7 @@ kill_with_rst:
/* I am shamed, but failed to make it more elegant.
* Yes, it is direct reference to IP, which is impossible
* to generalize to IPv6. Taking into account that IPv6
- * do not undertsnad recycling in any case, it not
+ * do not understand recycling in any case, it not
* a big problem in practice. --ANK */
if (tw->tw_family == AF_INET &&
tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp &&
@@ -194,7 +194,7 @@ kill_with_rst:
/* In window segment, it may be only reset or bare ack. */
if (th->rst) {
- /* This is TIME_WAIT assasination, in two flavors.
+ /* This is TIME_WAIT assassination, in two flavors.
* Oh well... nobody has a sufficient solution to this
* protocol bug yet.
*/
@@ -380,6 +380,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
*/
newtp->snd_cwnd = 2;
newtp->snd_cwnd_cnt = 0;
+ newtp->bytes_acked = 0;
newtp->frto_counter = 0;
newtp->frto_highmark = 0;
@@ -550,7 +551,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
/* RFC793 page 36: "If the connection is in any non-synchronized state ...
* and the incoming segment acknowledges something not yet
- * sent (the segment carries an unaccaptable ACK) ...
+ * sent (the segment carries an unacceptable ACK) ...
* a reset is sent."
*
* Invalid ACK: reset will be sent by listening socket
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b907456a79f..029c70dfb58 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -436,6 +436,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss
u16 flags;
BUG_ON(len > skb->len);
+
+ clear_all_retrans_hints(tp);
nsize = skb_headlen(skb) - len;
if (nsize < 0)
nsize = 0;
@@ -599,7 +601,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
for TCP options, but includes only bare TCP header.
tp->rx_opt.mss_clamp is mss negotiated at connection setup.
- It is minumum of user_mss and mss received with SYN.
+ It is minimum of user_mss and mss received with SYN.
It also does not include TCP options.
tp->pmtu_cookie is last pmtu, seen by this function.
@@ -1171,7 +1173,7 @@ u32 __tcp_select_window(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
- /* MSS for the peer's data. Previous verions used mss_clamp
+ /* MSS for the peer's data. Previous versions used mss_clamp
* here. I don't know if the value based on our guesses
* of peer's MSS is better for the performance. It's more correct
* but may be worse for the performance because of rcv_mss
@@ -1260,7 +1262,10 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
BUG_ON(tcp_skb_pcount(skb) != 1 ||
tcp_skb_pcount(next_skb) != 1);
- /* Ok. We will be able to collapse the packet. */
+ /* changing transmit queue under us so clear hints */
+ clear_all_retrans_hints(tp);
+
+ /* Ok. We will be able to collapse the packet. */
__skb_unlink(next_skb, &sk->sk_write_queue);
memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
@@ -1330,6 +1335,8 @@ void tcp_simple_retransmit(struct sock *sk)
}
}
+ clear_all_retrans_hints(tp);
+
if (!lost)
return;
@@ -1361,7 +1368,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
int err;
/* Do not sent more than we queued. 1/4 is reserved for possible
- * copying overhead: frgagmentation, tunneling, mangling etc.
+ * copying overhead: fragmentation, tunneling, mangling etc.
*/
if (atomic_read(&sk->sk_wmem_alloc) >
min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
@@ -1468,13 +1475,25 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
- int packet_cnt = tp->lost_out;
+ int packet_cnt;
+
+ if (tp->retransmit_skb_hint) {
+ skb = tp->retransmit_skb_hint;
+ packet_cnt = tp->retransmit_cnt_hint;
+ }else{
+ skb = sk->sk_write_queue.next;
+ packet_cnt = 0;
+ }
/* First pass: retransmit lost packets. */
- if (packet_cnt) {
- sk_stream_for_retrans_queue(skb, sk) {
+ if (tp->lost_out) {
+ sk_stream_for_retrans_queue_from(skb, sk) {
__u8 sacked = TCP_SKB_CB(skb)->sacked;
+ /* we could do better than to assign each time */
+ tp->retransmit_skb_hint = skb;
+ tp->retransmit_cnt_hint = packet_cnt;
+
/* Assume this retransmit will generate
* only one packet for congestion window
* calculation purposes. This works because
@@ -1485,10 +1504,12 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
return;
- if (sacked&TCPCB_LOST) {
+ if (sacked & TCPCB_LOST) {
if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
- if (tcp_retransmit_skb(sk, skb))
+ if (tcp_retransmit_skb(sk, skb)) {
+ tp->retransmit_skb_hint = NULL;
return;
+ }
if (icsk->icsk_ca_state != TCP_CA_Loss)
NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS);
else
@@ -1501,8 +1522,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
TCP_RTO_MAX);
}
- packet_cnt -= tcp_skb_pcount(skb);
- if (packet_cnt <= 0)
+ packet_cnt += tcp_skb_pcount(skb);
+ if (packet_cnt >= tp->lost_out)
break;
}
}
@@ -1528,9 +1549,18 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
if (tcp_may_send_now(sk, tp))
return;
- packet_cnt = 0;
+ if (tp->forward_skb_hint) {
+ skb = tp->forward_skb_hint;
+ packet_cnt = tp->forward_cnt_hint;
+ } else{
+ skb = sk->sk_write_queue.next;
+ packet_cnt = 0;
+ }
+
+ sk_stream_for_retrans_queue_from(skb, sk) {
+ tp->forward_cnt_hint = packet_cnt;
+ tp->forward_skb_hint = skb;
- sk_stream_for_retrans_queue(skb, sk) {
/* Similar to the retransmit loop above we
* can pretend that the retransmitted SKB
* we send out here will be composed of one
@@ -1547,8 +1577,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
continue;
/* Ok, retransmit it. */
- if (tcp_retransmit_skb(sk, skb))
+ if (tcp_retransmit_skb(sk, skb)) {
+ tp->forward_skb_hint = NULL;
break;
+ }
if (skb == skb_peek(&sk->sk_write_queue))
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
@@ -2058,3 +2090,4 @@ EXPORT_SYMBOL(tcp_connect);
EXPORT_SYMBOL(tcp_make_synack);
EXPORT_SYMBOL(tcp_simple_retransmit);
EXPORT_SYMBOL(tcp_sync_mss);
+EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor);
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 327770bf552..26d7486ee50 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -20,20 +20,20 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
u32 in_flight, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (in_flight < tp->snd_cwnd)
+
+ if (!tcp_is_cwnd_limited(sk, in_flight))
return;
- if (tp->snd_cwnd <= tp->snd_ssthresh) {
- tp->snd_cwnd++;
- } else {
+ if (tp->snd_cwnd <= tp->snd_ssthresh)
+ tcp_slow_start(tp);
+ else {
tp->snd_cwnd_cnt++;
if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){
- tp->snd_cwnd++;
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+ tp->snd_cwnd++;
tp->snd_cwnd_cnt = 0;
}
}
- tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
- tp->snd_cwnd_stamp = tcp_time_stamp;
}
static u32 tcp_scalable_ssthresh(struct sock *sk)
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 415ee47ac1c..e1880959614 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -58,7 +58,7 @@ static void tcp_write_err(struct sock *sk)
* to prevent DoS attacks. It is called when a retransmission timeout
* or zero probe timeout occurs on orphaned socket.
*
- * Criterium is still not confirmed experimentally and may change.
+ * Criteria is still not confirmed experimentally and may change.
* We kill the socket, if:
* 1. If number of orphaned sockets exceeds an administratively configured
* limit.
@@ -132,7 +132,7 @@ static int tcp_write_timeout(struct sock *sk)
hole detection. :-(
It is place to make it. It is not made. I do not want
- to make it. It is disguisting. It does not work in any
+ to make it. It is disgusting. It does not work in any
case. Let me to cite the same draft, which requires for
us to implement this:
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 93c5f92070f..b7d296a8ac6 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -236,8 +236,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
/* We don't have enough RTT samples to do the Vegas
* calculation, so we'll behave like Reno.
*/
- if (tp->snd_cwnd > tp->snd_ssthresh)
- tp->snd_cwnd++;
+ tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag);
} else {
u32 rtt, target_cwnd, diff;
@@ -275,7 +274,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
*/
diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
- if (tp->snd_cwnd < tp->snd_ssthresh) {
+ if (tp->snd_cwnd <= tp->snd_ssthresh) {
/* Slow start. */
if (diff > gamma) {
/* Going too fast. Time to slow down
@@ -295,6 +294,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
V_PARAM_SHIFT)+1);
}
+ tcp_slow_start(tp);
} else {
/* Congestion avoidance. */
u32 next_snd_cwnd;
@@ -327,37 +327,17 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
else if (next_snd_cwnd < tp->snd_cwnd)
tp->snd_cwnd--;
}
- }
- /* Wipe the slate clean for the next RTT. */
- vegas->cntRTT = 0;
- vegas->minRTT = 0x7fffffff;
+ if (tp->snd_cwnd < 2)
+ tp->snd_cwnd = 2;
+ else if (tp->snd_cwnd > tp->snd_cwnd_clamp)
+ tp->snd_cwnd = tp->snd_cwnd_clamp;
+ }
}
- /* The following code is executed for every ack we receive,
- * except for conditions checked in should_advance_cwnd()
- * before the call to tcp_cong_avoid(). Mainly this means that
- * we only execute this code if the ack actually acked some
- * data.
- */
-
- /* If we are in slow start, increase our cwnd in response to this ACK.
- * (If we are not in slow start then we are in congestion avoidance,
- * and adjust our congestion window only once per RTT. See the code
- * above.)
- */
- if (tp->snd_cwnd <= tp->snd_ssthresh)
- tp->snd_cwnd++;
-
- /* to keep cwnd from growing without bound */
- tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
-
- /* Make sure that we are never so timid as to reduce our cwnd below
- * 2 MSS.
- *
- * Going below 2 MSS would risk huge delayed ACKs from our receiver.
- */
- tp->snd_cwnd = max(tp->snd_cwnd, 2U);
+ /* Wipe the slate clean for the next RTT. */
+ vegas->cntRTT = 0;
+ vegas->minRTT = 0x7fffffff;
}
/* Extract info for Tcp socket info provided via netlink. */
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index e0bd1013cb0..2422a5f7195 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -761,7 +761,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
static __inline__ int __udp_checksum_complete(struct sk_buff *skb)
{
- return (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum));
+ return __skb_checksum_complete(skb);
}
static __inline__ int udp_checksum_complete(struct sk_buff *skb)
@@ -1100,11 +1100,8 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
if (uh->check == 0) {
skb->ip_summed = CHECKSUM_UNNECESSARY;
} else if (skb->ip_summed == CHECKSUM_HW) {
- skb->ip_summed = CHECKSUM_UNNECESSARY;
if (!udp_check(uh, ulen, saddr, daddr, skb->csum))
- return 0;
- LIMIT_NETDEBUG(KERN_DEBUG "udp v4 hw csum failure.\n");
- skb->ip_summed = CHECKSUM_NONE;
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
}
if (skb->ip_summed != CHECKSUM_UNNECESSARY)
skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 2c5f57299d6..56a09a4ac41 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -35,6 +35,9 @@
* YOSHIFUJI Hideaki @USAGI : ARCnet support
* YOSHIFUJI Hideaki @USAGI : convert /proc/net/if_inet6 to
* seq_file.
+ * YOSHIFUJI Hideaki @USAGI : improved source address
+ * selection; consider scope,
+ * status etc.
*/
#include <linux/config.h>
@@ -193,46 +196,51 @@ const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT;
#endif
const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT;
-int ipv6_addr_type(const struct in6_addr *addr)
+#define IPV6_ADDR_SCOPE_TYPE(scope) ((scope) << 16)
+
+static inline unsigned ipv6_addr_scope2type(unsigned scope)
+{
+ switch(scope) {
+ case IPV6_ADDR_SCOPE_NODELOCAL:
+ return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_NODELOCAL) |
+ IPV6_ADDR_LOOPBACK);
+ case IPV6_ADDR_SCOPE_LINKLOCAL:
+ return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL) |
+ IPV6_ADDR_LINKLOCAL);
+ case IPV6_ADDR_SCOPE_SITELOCAL:
+ return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_SITELOCAL) |
+ IPV6_ADDR_SITELOCAL);
+ }
+ return IPV6_ADDR_SCOPE_TYPE(scope);
+}
+
+int __ipv6_addr_type(const struct in6_addr *addr)
{
- int type;
u32 st;
st = addr->s6_addr32[0];
- if ((st & htonl(0xFF000000)) == htonl(0xFF000000)) {
- type = IPV6_ADDR_MULTICAST;
-
- switch((st & htonl(0x00FF0000))) {
- case __constant_htonl(0x00010000):
- type |= IPV6_ADDR_LOOPBACK;
- break;
-
- case __constant_htonl(0x00020000):
- type |= IPV6_ADDR_LINKLOCAL;
- break;
-
- case __constant_htonl(0x00050000):
- type |= IPV6_ADDR_SITELOCAL;
- break;
- };
- return type;
- }
-
- type = IPV6_ADDR_UNICAST;
-
/* Consider all addresses with the first three bits different of
- 000 and 111 as finished.
+ 000 and 111 as unicasts.
*/
if ((st & htonl(0xE0000000)) != htonl(0x00000000) &&
(st & htonl(0xE0000000)) != htonl(0xE0000000))
- return type;
-
- if ((st & htonl(0xFFC00000)) == htonl(0xFE800000))
- return (IPV6_ADDR_LINKLOCAL | type);
+ return (IPV6_ADDR_UNICAST |
+ IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL));
+ if ((st & htonl(0xFF000000)) == htonl(0xFF000000)) {
+ /* multicast */
+ /* addr-select 3.1 */
+ return (IPV6_ADDR_MULTICAST |
+ ipv6_addr_scope2type(IPV6_ADDR_MC_SCOPE(addr)));
+ }
+
+ if ((st & htonl(0xFFC00000)) == htonl(0xFE800000))
+ return (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST |
+ IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL)); /* addr-select 3.1 */
if ((st & htonl(0xFFC00000)) == htonl(0xFEC00000))
- return (IPV6_ADDR_SITELOCAL | type);
+ return (IPV6_ADDR_SITELOCAL | IPV6_ADDR_UNICAST |
+ IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_SITELOCAL)); /* addr-select 3.1 */
if ((addr->s6_addr32[0] | addr->s6_addr32[1]) == 0) {
if (addr->s6_addr32[2] == 0) {
@@ -240,24 +248,20 @@ int ipv6_addr_type(const struct in6_addr *addr)
return IPV6_ADDR_ANY;
if (addr->s6_addr32[3] == htonl(0x00000001))
- return (IPV6_ADDR_LOOPBACK | type);
+ return (IPV6_ADDR_LOOPBACK | IPV6_ADDR_UNICAST |
+ IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL)); /* addr-select 3.4 */
- return (IPV6_ADDR_COMPATv4 | type);
+ return (IPV6_ADDR_COMPATv4 | IPV6_ADDR_UNICAST |
+ IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.3 */
}
if (addr->s6_addr32[2] == htonl(0x0000ffff))
- return IPV6_ADDR_MAPPED;
+ return (IPV6_ADDR_MAPPED |
+ IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.3 */
}
- st &= htonl(0xFF000000);
- if (st == 0)
- return IPV6_ADDR_RESERVED;
- st &= htonl(0xFE000000);
- if (st == htonl(0x02000000))
- return IPV6_ADDR_RESERVED; /* for NSAP */
- if (st == htonl(0x04000000))
- return IPV6_ADDR_RESERVED; /* for IPX */
- return type;
+ return (IPV6_ADDR_RESERVED |
+ IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.4 */
}
static void addrconf_del_timer(struct inet6_ifaddr *ifp)
@@ -805,138 +809,276 @@ out:
#endif
/*
- * Choose an appropriate source address
- * should do:
- * i) get an address with an appropriate scope
- * ii) see if there is a specific route for the destination and use
- * an address of the attached interface
- * iii) don't use deprecated addresses
+ * Choose an appropriate source address (RFC3484)
*/
-static int inline ipv6_saddr_pref(const struct inet6_ifaddr *ifp, u8 invpref)
+struct ipv6_saddr_score {
+ int addr_type;
+ unsigned int attrs;
+ int matchlen;
+ unsigned int scope;
+ unsigned int rule;
+};
+
+#define IPV6_SADDR_SCORE_LOCAL 0x0001
+#define IPV6_SADDR_SCORE_PREFERRED 0x0004
+#define IPV6_SADDR_SCORE_HOA 0x0008
+#define IPV6_SADDR_SCORE_OIF 0x0010
+#define IPV6_SADDR_SCORE_LABEL 0x0020
+#define IPV6_SADDR_SCORE_PRIVACY 0x0040
+
+static int inline ipv6_saddr_preferred(int type)
{
- int pref;
- pref = ifp->flags&IFA_F_DEPRECATED ? 0 : 2;
-#ifdef CONFIG_IPV6_PRIVACY
- pref |= (ifp->flags^invpref)&IFA_F_TEMPORARY ? 0 : 1;
-#endif
- return pref;
+ if (type & (IPV6_ADDR_MAPPED|IPV6_ADDR_COMPATv4|
+ IPV6_ADDR_LOOPBACK|IPV6_ADDR_RESERVED))
+ return 1;
+ return 0;
}
-#ifdef CONFIG_IPV6_PRIVACY
-#define IPV6_GET_SADDR_MAXSCORE(score) ((score) == 3)
-#else
-#define IPV6_GET_SADDR_MAXSCORE(score) (score)
-#endif
+/* static matching label */
+static int inline ipv6_saddr_label(const struct in6_addr *addr, int type)
+{
+ /*
+ * prefix (longest match) label
+ * -----------------------------
+ * ::1/128 0
+ * ::/0 1
+ * 2002::/16 2
+ * ::/96 3
+ * ::ffff:0:0/96 4
+ */
+ if (type & IPV6_ADDR_LOOPBACK)
+ return 0;
+ else if (type & IPV6_ADDR_COMPATv4)
+ return 3;
+ else if (type & IPV6_ADDR_MAPPED)
+ return 4;
+ else if (addr->s6_addr16[0] == htons(0x2002))
+ return 2;
+ return 1;
+}
-int ipv6_dev_get_saddr(struct net_device *dev,
+int ipv6_dev_get_saddr(struct net_device *daddr_dev,
struct in6_addr *daddr, struct in6_addr *saddr)
{
- struct inet6_ifaddr *ifp = NULL;
- struct inet6_ifaddr *match = NULL;
- struct inet6_dev *idev;
- int scope;
- int err;
- int hiscore = -1, score;
+ struct ipv6_saddr_score hiscore;
+ struct inet6_ifaddr *ifa_result = NULL;
+ int daddr_type = __ipv6_addr_type(daddr);
+ int daddr_scope = __ipv6_addr_src_scope(daddr_type);
+ u32 daddr_label = ipv6_saddr_label(daddr, daddr_type);
+ struct net_device *dev;
- scope = ipv6_addr_scope(daddr);
+ memset(&hiscore, 0, sizeof(hiscore));
- /*
- * known dev
- * search dev and walk through dev addresses
- */
+ read_lock(&dev_base_lock);
+ read_lock(&addrconf_lock);
- if (dev) {
- if (dev->flags & IFF_LOOPBACK)
- scope = IFA_HOST;
+ for (dev = dev_base; dev; dev=dev->next) {
+ struct inet6_dev *idev;
+ struct inet6_ifaddr *ifa;
+
+ /* Rule 0: Candidate Source Address (section 4)
+ * - multicast and link-local destination address,
+ * the set of candidate source address MUST only
+ * include addresses assigned to interfaces
+ * belonging to the same link as the outgoing
+ * interface.
+ * (- For site-local destination addresses, the
+ * set of candidate source addresses MUST only
+ * include addresses assigned to interfaces
+ * belonging to the same site as the outgoing
+ * interface.)
+ */
+ if ((daddr_type & IPV6_ADDR_MULTICAST ||
+ daddr_scope <= IPV6_ADDR_SCOPE_LINKLOCAL) &&
+ daddr_dev && dev != daddr_dev)
+ continue;
- read_lock(&addrconf_lock);
idev = __in6_dev_get(dev);
- if (idev) {
- read_lock_bh(&idev->lock);
- for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) {
- if (ifp->scope == scope) {
- if (ifp->flags&IFA_F_TENTATIVE)
- continue;
-#ifdef CONFIG_IPV6_PRIVACY
- score = ipv6_saddr_pref(ifp, idev->cnf.use_tempaddr > 1 ? IFA_F_TEMPORARY : 0);
-#else
- score = ipv6_saddr_pref(ifp, 0);
-#endif
- if (score <= hiscore)
- continue;
+ if (!idev)
+ continue;
- if (match)
- in6_ifa_put(match);
- match = ifp;
- hiscore = score;
- in6_ifa_hold(ifp);
+ read_lock_bh(&idev->lock);
+ for (ifa = idev->addr_list; ifa; ifa = ifa->if_next) {
+ struct ipv6_saddr_score score;
- if (IPV6_GET_SADDR_MAXSCORE(score)) {
- read_unlock_bh(&idev->lock);
- read_unlock(&addrconf_lock);
- goto out;
- }
+ score.addr_type = __ipv6_addr_type(&ifa->addr);
+
+ /* Rule 0: Candidate Source Address (section 4)
+ * - In any case, anycast addresses, multicast
+ * addresses, and the unspecified address MUST
+ * NOT be included in a candidate set.
+ */
+ if (unlikely(score.addr_type == IPV6_ADDR_ANY ||
+ score.addr_type & IPV6_ADDR_MULTICAST)) {
+ LIMIT_NETDEBUG(KERN_DEBUG
+ "ADDRCONF: unspecified / multicast address"
+ "assigned as unicast address on %s",
+ dev->name);
+ continue;
+ }
+
+ score.attrs = 0;
+ score.matchlen = 0;
+ score.scope = 0;
+ score.rule = 0;
+
+ if (ifa_result == NULL) {
+ /* record it if the first available entry */
+ goto record_it;
+ }
+
+ /* Rule 1: Prefer same address */
+ if (hiscore.rule < 1) {
+ if (ipv6_addr_equal(&ifa_result->addr, daddr))
+ hiscore.attrs |= IPV6_SADDR_SCORE_LOCAL;
+ hiscore.rule++;
+ }
+ if (ipv6_addr_equal(&ifa->addr, daddr)) {
+ score.attrs |= IPV6_SADDR_SCORE_LOCAL;
+ if (!(hiscore.attrs & IPV6_SADDR_SCORE_LOCAL)) {
+ score.rule = 1;
+ goto record_it;
}
+ } else {
+ if (hiscore.attrs & IPV6_SADDR_SCORE_LOCAL)
+ continue;
}
- read_unlock_bh(&idev->lock);
- }
- read_unlock(&addrconf_lock);
- }
- if (scope == IFA_LINK)
- goto out;
+ /* Rule 2: Prefer appropriate scope */
+ if (hiscore.rule < 2) {
+ hiscore.scope = __ipv6_addr_src_scope(hiscore.addr_type);
+ hiscore.rule++;
+ }
+ score.scope = __ipv6_addr_src_scope(score.addr_type);
+ if (hiscore.scope < score.scope) {
+ if (hiscore.scope < daddr_scope) {
+ score.rule = 2;
+ goto record_it;
+ } else
+ continue;
+ } else if (score.scope < hiscore.scope) {
+ if (score.scope < daddr_scope)
+ continue;
+ else {
+ score.rule = 2;
+ goto record_it;
+ }
+ }
- /*
- * dev == NULL or search failed for specified dev
- */
+ /* Rule 3: Avoid deprecated address */
+ if (hiscore.rule < 3) {
+ if (ipv6_saddr_preferred(hiscore.addr_type) ||
+ !(ifa_result->flags & IFA_F_DEPRECATED))
+ hiscore.attrs |= IPV6_SADDR_SCORE_PREFERRED;
+ hiscore.rule++;
+ }
+ if (ipv6_saddr_preferred(score.addr_type) ||
+ !(ifa->flags & IFA_F_DEPRECATED)) {
+ score.attrs |= IPV6_SADDR_SCORE_PREFERRED;
+ if (!(hiscore.attrs & IPV6_SADDR_SCORE_PREFERRED)) {
+ score.rule = 3;
+ goto record_it;
+ }
+ } else {
+ if (hiscore.attrs & IPV6_SADDR_SCORE_PREFERRED)
+ continue;
+ }
- read_lock(&dev_base_lock);
- read_lock(&addrconf_lock);
- for (dev = dev_base; dev; dev=dev->next) {
- idev = __in6_dev_get(dev);
- if (idev) {
- read_lock_bh(&idev->lock);
- for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) {
- if (ifp->scope == scope) {
- if (ifp->flags&IFA_F_TENTATIVE)
- continue;
-#ifdef CONFIG_IPV6_PRIVACY
- score = ipv6_saddr_pref(ifp, idev->cnf.use_tempaddr > 1 ? IFA_F_TEMPORARY : 0);
-#else
- score = ipv6_saddr_pref(ifp, 0);
-#endif
- if (score <= hiscore)
- continue;
+ /* Rule 4: Prefer home address -- not implemented yet */
- if (match)
- in6_ifa_put(match);
- match = ifp;
- hiscore = score;
- in6_ifa_hold(ifp);
+ /* Rule 5: Prefer outgoing interface */
+ if (hiscore.rule < 5) {
+ if (daddr_dev == NULL ||
+ daddr_dev == ifa_result->idev->dev)
+ hiscore.attrs |= IPV6_SADDR_SCORE_OIF;
+ hiscore.rule++;
+ }
+ if (daddr_dev == NULL ||
+ daddr_dev == ifa->idev->dev) {
+ score.attrs |= IPV6_SADDR_SCORE_OIF;
+ if (!(hiscore.attrs & IPV6_SADDR_SCORE_OIF)) {
+ score.rule = 5;
+ goto record_it;
+ }
+ } else {
+ if (hiscore.attrs & IPV6_SADDR_SCORE_OIF)
+ continue;
+ }
- if (IPV6_GET_SADDR_MAXSCORE(score)) {
- read_unlock_bh(&idev->lock);
- goto out_unlock_base;
- }
+ /* Rule 6: Prefer matching label */
+ if (hiscore.rule < 6) {
+ if (ipv6_saddr_label(&ifa_result->addr, hiscore.addr_type) == daddr_label)
+ hiscore.attrs |= IPV6_SADDR_SCORE_LABEL;
+ hiscore.rule++;
+ }
+ if (ipv6_saddr_label(&ifa->addr, score.addr_type) == daddr_label) {
+ score.attrs |= IPV6_SADDR_SCORE_LABEL;
+ if (!(hiscore.attrs & IPV6_SADDR_SCORE_LABEL)) {
+ score.rule = 6;
+ goto record_it;
}
+ } else {
+ if (hiscore.attrs & IPV6_SADDR_SCORE_LABEL)
+ continue;
}
- read_unlock_bh(&idev->lock);
+
+#ifdef CONFIG_IPV6_PRIVACY
+ /* Rule 7: Prefer public address
+ * Note: prefer temprary address if use_tempaddr >= 2
+ */
+ if (hiscore.rule < 7) {
+ if ((!(ifa_result->flags & IFA_F_TEMPORARY)) ^
+ (ifa_result->idev->cnf.use_tempaddr >= 2))
+ hiscore.attrs |= IPV6_SADDR_SCORE_PRIVACY;
+ hiscore.rule++;
+ }
+ if ((!(ifa->flags & IFA_F_TEMPORARY)) ^
+ (ifa->idev->cnf.use_tempaddr >= 2)) {
+ score.attrs |= IPV6_SADDR_SCORE_PRIVACY;
+ if (!(hiscore.attrs & IPV6_SADDR_SCORE_PRIVACY)) {
+ score.rule = 7;
+ goto record_it;
+ }
+ } else {
+ if (hiscore.attrs & IPV6_SADDR_SCORE_PRIVACY)
+ continue;
+ }
+#endif
+ /* Rule 8: Use longest matching prefix */
+ if (hiscore.rule < 8) {
+ hiscore.matchlen = ipv6_addr_diff(&ifa_result->addr, daddr);
+ hiscore.rule++;
+ }
+ score.matchlen = ipv6_addr_diff(&ifa->addr, daddr);
+ if (score.matchlen > hiscore.matchlen) {
+ score.rule = 8;
+ goto record_it;
+ }
+#if 0
+ else if (score.matchlen < hiscore.matchlen)
+ continue;
+#endif
+
+ /* Final Rule: choose first available one */
+ continue;
+record_it:
+ if (ifa_result)
+ in6_ifa_put(ifa_result);
+ in6_ifa_hold(ifa);
+ ifa_result = ifa;
+ hiscore = score;
}
+ read_unlock_bh(&idev->lock);
}
-
-out_unlock_base:
read_unlock(&addrconf_lock);
read_unlock(&dev_base_lock);
-out:
- err = -EADDRNOTAVAIL;
- if (match) {
- ipv6_addr_copy(saddr, &match->addr);
- err = 0;
- in6_ifa_put(match);
- }
-
- return err;
+ if (!ifa_result)
+ return -EADDRNOTAVAIL;
+
+ ipv6_addr_copy(saddr, &ifa_result->addr);
+ in6_ifa_put(ifa_result);
+ return 0;
}
@@ -2950,8 +3092,7 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev,
nlmsg_failure:
rtattr_failure:
- if (array)
- kfree(array);
+ kfree(array);
skb_trim(skb, b - skb->data);
return -1;
}
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 4f8795af2ed..c63b8ce0e1b 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -699,12 +699,14 @@ static int __init inet6_init(void)
/* Register the family here so that the init calls below will
* be able to create sockets. (?? is this dangerous ??)
*/
- (void) sock_register(&inet6_family_ops);
+ err = sock_register(&inet6_family_ops);
+ if (err)
+ goto out_unregister_raw_proto;
/* Initialise ipv6 mibs */
err = init_ipv6_mibs();
if (err)
- goto out_unregister_raw_proto;
+ goto out_unregister_sock;
/*
* ipngwg API draft makes clear that the correct semantics
@@ -796,6 +798,8 @@ icmp_fail:
ipv6_sysctl_unregister();
#endif
cleanup_ipv6_mibs();
+out_unregister_sock:
+ sock_unregister(PF_INET6);
out_unregister_raw_proto:
proto_unregister(&rawv6_prot);
out_unregister_udp_proto:
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 23e540365a1..1bdf0fb8bf8 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -585,17 +585,16 @@ static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
daddr = &skb->nh.ipv6h->daddr;
/* Perform checksum. */
- if (skb->ip_summed == CHECKSUM_HW) {
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6,
- skb->csum)) {
- LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 hw checksum failed\n");
- skb->ip_summed = CHECKSUM_NONE;
- }
- }
- if (skb->ip_summed == CHECKSUM_NONE) {
- if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6,
- skb_checksum(skb, 0, skb->len, 0))) {
+ switch (skb->ip_summed) {
+ case CHECKSUM_HW:
+ if (!csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6,
+ skb->csum))
+ break;
+ /* fall through */
+ case CHECKSUM_NONE:
+ skb->csum = ~csum_ipv6_magic(saddr, daddr, skb->len,
+ IPPROTO_ICMPV6, 0);
+ if (__skb_checksum_complete(skb)) {
LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 checksum failed [%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x > %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x]\n",
NIP6(*saddr), NIP6(*daddr));
goto discard_it;
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 4fcc5a7acf6..1bf6d9a769e 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -127,56 +127,6 @@ static __inline__ int addr_bit_set(void *token, int fn_bit)
return htonl(1 << ((~fn_bit)&0x1F)) & addr[fn_bit>>5];
}
-/*
- * find the first different bit between two addresses
- * length of address must be a multiple of 32bits
- */
-
-static __inline__ int addr_diff(void *token1, void *token2, int addrlen)
-{
- __u32 *a1 = token1;
- __u32 *a2 = token2;
- int i;
-
- addrlen >>= 2;
-
- for (i = 0; i < addrlen; i++) {
- __u32 xb;
-
- xb = a1[i] ^ a2[i];
-
- if (xb) {
- int j = 31;
-
- xb = ntohl(xb);
-
- while ((xb & (1 << j)) == 0)
- j--;
-
- return (i * 32 + 31 - j);
- }
- }
-
- /*
- * we should *never* get to this point since that
- * would mean the addrs are equal
- *
- * However, we do get to it 8) And exacly, when
- * addresses are equal 8)
- *
- * ip route add 1111::/128 via ...
- * ip route add 1111::/64 via ...
- * and we are here.
- *
- * Ideally, this function should stop comparison
- * at prefix length. It does not, but it is still OK,
- * if returned value is greater than prefix length.
- * --ANK (980803)
- */
-
- return addrlen<<5;
-}
-
static __inline__ struct fib6_node * node_alloc(void)
{
struct fib6_node *fn;
@@ -296,11 +246,11 @@ insert_above:
/* find 1st bit in difference between the 2 addrs.
- See comment in addr_diff: bit may be an invalid value,
+ See comment in __ipv6_addr_diff: bit may be an invalid value,
but if it is >= plen, the value is ignored in any case.
*/
- bit = addr_diff(addr, &key->addr, addrlen);
+ bit = __ipv6_addr_diff(addr, &key->addr, addrlen);
/*
* (intermediate)[in]
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 6e348042693..a6026d2787d 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -176,6 +176,11 @@ resubmit:
if (ipprot->flags & INET6_PROTO_FINAL) {
struct ipv6hdr *hdr;
+ /* Free reference early: we don't need it any more,
+ and it may hold ip_conntrack module loaded
+ indefinitely. */
+ nf_reset(skb);
+
skb_postpull_rcsum(skb, skb->nh.raw,
skb->h.raw - skb->nh.raw);
hdr = skb->nh.ipv6h;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 614296a920c..c1fa693511a 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -441,9 +441,15 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
#ifdef CONFIG_NETFILTER
to->nfmark = from->nfmark;
/* Connection association is same as pre-frag packet */
+ nf_conntrack_put(to->nfct);
to->nfct = from->nfct;
nf_conntrack_get(to->nfct);
to->nfctinfo = from->nfctinfo;
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+ nf_conntrack_put_reasm(to->nfct_reasm);
+ to->nfct_reasm = from->nfct_reasm;
+ nf_conntrack_get_reasm(to->nfct_reasm);
+#endif
#ifdef CONFIG_BRIDGE_NETFILTER
nf_bridge_put(to->nf_bridge);
to->nf_bridge = from->nf_bridge;
@@ -587,8 +593,7 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
skb->next = NULL;
}
- if (tmp_hdr)
- kfree(tmp_hdr);
+ kfree(tmp_hdr);
if (err == 0) {
IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
@@ -1186,10 +1191,8 @@ int ip6_push_pending_frames(struct sock *sk)
out:
inet->cork.flags &= ~IPCORK_OPT;
- if (np->cork.opt) {
- kfree(np->cork.opt);
- np->cork.opt = NULL;
- }
+ kfree(np->cork.opt);
+ np->cork.opt = NULL;
if (np->cork.rt) {
dst_release(&np->cork.rt->u.dst);
np->cork.rt = NULL;
@@ -1214,10 +1217,8 @@ void ip6_flush_pending_frames(struct sock *sk)
inet->cork.flags &= ~IPCORK_OPT;
- if (np->cork.opt) {
- kfree(np->cork.opt);
- np->cork.opt = NULL;
- }
+ kfree(np->cork.opt);
+ np->cork.opt = NULL;
if (np->cork.rt) {
dst_release(&np->cork.rt->u.dst);
np->cork.rt = NULL;
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index cf94372d1af..e315d0f80af 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -525,6 +525,7 @@ ip6ip6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
if ((t = ip6ip6_tnl_lookup(&ipv6h->saddr, &ipv6h->daddr)) != NULL) {
if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ read_unlock(&ip6ip6_lock);
kfree_skb(skb);
return 0;
}
@@ -756,8 +757,7 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
}
ip6_tnl_dst_store(t, dst);
- if (opt)
- kfree(opt);
+ kfree(opt);
t->recursion--;
return 0;
@@ -766,8 +766,7 @@ tx_err_link_failure:
dst_link_failure(skb);
tx_err_dst_release:
dst_release(dst);
- if (opt)
- kfree(opt);
+ kfree(opt);
tx_err:
stats->tx_errors++;
stats->tx_dropped++;
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index 85bfbc69b2c..55917fb1709 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -130,8 +130,7 @@ static int ipcomp6_input(struct xfrm_state *x, struct xfrm_decap_state *decap, s
out_put_cpu:
put_cpu();
out:
- if (tmp_hdr)
- kfree(tmp_hdr);
+ kfree(tmp_hdr);
if (err)
goto error_out;
return nexthdr;
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 8567873d0dd..25757ade989 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -80,8 +80,7 @@ int ip6_ra_control(struct sock *sk, int sel, void (*destructor)(struct sock *))
if (ra->sk == sk) {
if (sel>=0) {
write_unlock_bh(&ip6_ra_lock);
- if (new_ra)
- kfree(new_ra);
+ kfree(new_ra);
return -EADDRINUSE;
}
@@ -288,7 +287,7 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname,
{
struct ipv6_txoptions *opt;
if (optlen == 0)
- optval = 0;
+ optval = NULL;
/* hop-by-hop / destination options are privileged option */
retv = -EPERM;
diff --git a/net/ipv6/ipv6_syms.c b/net/ipv6/ipv6_syms.c
index 37a4a99c9fe..16482785bdf 100644
--- a/net/ipv6/ipv6_syms.c
+++ b/net/ipv6/ipv6_syms.c
@@ -7,7 +7,7 @@
#include <net/ip6_route.h>
#include <net/xfrm.h>
-EXPORT_SYMBOL(ipv6_addr_type);
+EXPORT_SYMBOL(__ipv6_addr_type);
EXPORT_SYMBOL(icmpv6_send);
EXPORT_SYMBOL(icmpv6_statistics);
EXPORT_SYMBOL(icmpv6_err_convert);
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index bb7ccfe33f2..060d6120241 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -5,10 +5,20 @@
menu "IPv6: Netfilter Configuration (EXPERIMENTAL)"
depends on INET && IPV6 && NETFILTER && EXPERIMENTAL
-#tristate 'Connection tracking (required for masq/NAT)' CONFIG_IP6_NF_CONNTRACK
-#if [ "$CONFIG_IP6_NF_CONNTRACK" != "n" ]; then
-# dep_tristate ' FTP protocol support' CONFIG_IP6_NF_FTP $CONFIG_IP6_NF_CONNTRACK
-#fi
+config NF_CONNTRACK_IPV6
+ tristate "IPv6 support for new connection tracking (EXPERIMENTAL)"
+ depends on EXPERIMENTAL && NF_CONNTRACK
+ ---help---
+ Connection tracking keeps a record of what packets have passed
+ through your machine, in order to figure out how they are related
+ into connections.
+
+ This is IPv6 support on Layer 3 independent connection tracking.
+ Layer 3 independent connection tracking is experimental scheme
+ which generalize ip_conntrack to support other layer 3 protocols.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config IP6_NF_QUEUE
tristate "IP6 Userspace queueing via NETLINK (OBSOLETE)"
---help---
@@ -114,7 +124,6 @@ config IP6_NF_MATCH_OWNER
To compile it as a module, choose M here. If unsure, say N.
-# dep_tristate ' MAC address match support' CONFIG_IP6_NF_MATCH_MAC $CONFIG_IP6_NF_IPTABLES
config IP6_NF_MATCH_MARK
tristate "netfilter MARK match support"
depends on IP6_NF_IPTABLES
@@ -170,15 +179,6 @@ config IP6_NF_MATCH_PHYSDEV
To compile it as a module, choose M here. If unsure, say N.
-# dep_tristate ' Multiple port match support' CONFIG_IP6_NF_MATCH_MULTIPORT $CONFIG_IP6_NF_IPTABLES
-# dep_tristate ' TOS match support' CONFIG_IP6_NF_MATCH_TOS $CONFIG_IP6_NF_IPTABLES
-# if [ "$CONFIG_IP6_NF_CONNTRACK" != "n" ]; then
-# dep_tristate ' Connection state match support' CONFIG_IP6_NF_MATCH_STATE $CONFIG_IP6_NF_CONNTRACK $CONFIG_IP6_NF_IPTABLES
-# fi
-# if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
-# dep_tristate ' Unclean match support (EXPERIMENTAL)' CONFIG_IP6_NF_MATCH_UNCLEAN $CONFIG_IP6_NF_IPTABLES
-# dep_tristate ' Owner match support (EXPERIMENTAL)' CONFIG_IP6_NF_MATCH_OWNER $CONFIG_IP6_NF_IPTABLES
-# fi
# The targets
config IP6_NF_FILTER
tristate "Packet filtering"
@@ -220,12 +220,6 @@ config IP6_NF_TARGET_NFQUEUE
To compile it as a module, choose M here. If unsure, say N.
-# if [ "$CONFIG_IP6_NF_FILTER" != "n" ]; then
-# dep_tristate ' REJECT target support' CONFIG_IP6_NF_TARGET_REJECT $CONFIG_IP6_NF_FILTER
-# if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
-# dep_tristate ' MIRROR target support (EXPERIMENTAL)' CONFIG_IP6_NF_TARGET_MIRROR $CONFIG_IP6_NF_FILTER
-# fi
-# fi
config IP6_NF_MANGLE
tristate "Packet mangling"
depends on IP6_NF_IPTABLES
@@ -236,7 +230,6 @@ config IP6_NF_MANGLE
To compile it as a module, choose M here. If unsure, say N.
-# dep_tristate ' TOS target support' CONFIG_IP6_NF_TARGET_TOS $CONFIG_IP_NF_MANGLE
config IP6_NF_TARGET_MARK
tristate "MARK target support"
depends on IP6_NF_MANGLE
@@ -266,7 +259,6 @@ config IP6_NF_TARGET_HL
To compile it as a module, choose M here. If unsure, say N.
-#dep_tristate ' LOG target support' CONFIG_IP6_NF_TARGET_LOG $CONFIG_IP6_NF_IPTABLES
config IP6_NF_RAW
tristate 'raw table support (required for TRACE)'
depends on IP6_NF_IPTABLES
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index 2b2c370e8b1..9ab5b2ca1f5 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -27,3 +27,9 @@ obj-$(CONFIG_IP6_NF_TARGET_LOG) += ip6t_LOG.o
obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o
obj-$(CONFIG_IP6_NF_MATCH_HL) += ip6t_hl.o
obj-$(CONFIG_IP6_NF_TARGET_REJECT) += ip6t_REJECT.o
+
+# objects for l3 independent conntrack
+nf_conntrack_ipv6-objs := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o nf_conntrack_reasm.o
+
+# l3 independent conntrack
+obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o
diff --git a/net/ipv6/netfilter/ip6t_MARK.c b/net/ipv6/netfilter/ip6t_MARK.c
index 0c7584f9217..eab8fb864ee 100644
--- a/net/ipv6/netfilter/ip6t_MARK.c
+++ b/net/ipv6/netfilter/ip6t_MARK.c
@@ -56,9 +56,9 @@ checkentry(const char *tablename,
return 1;
}
-static struct ip6t_target ip6t_mark_reg = {
- .name = "MARK",
- .target = target,
+static struct ip6t_target ip6t_mark_reg = {
+ .name = "MARK",
+ .target = target,
.checkentry = checkentry,
.me = THIS_MODULE
};
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
new file mode 100644
index 00000000000..753a3ae8502
--- /dev/null
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -0,0 +1,556 @@
+/*
+ * Copyright (C)2004 USAGI/WIDE Project
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Author:
+ * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
+ *
+ * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
+ * - support Layer 3 protocol independent connection tracking.
+ * Based on the original ip_conntrack code which had the following
+ * copyright information:
+ * (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * 23 Mar 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
+ * - add get_features() to support various size of conntrack
+ * structures.
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/ipv6.h>
+#include <linux/in6.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/icmp.h>
+#include <linux/sysctl.h>
+#include <net/ipv6.h>
+
+#include <linux/netfilter_ipv6.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_protocol.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_core.h>
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+DECLARE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat);
+
+static int ipv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
+ struct nf_conntrack_tuple *tuple)
+{
+ u_int32_t _addrs[8], *ap;
+
+ ap = skb_header_pointer(skb, nhoff + offsetof(struct ipv6hdr, saddr),
+ sizeof(_addrs), _addrs);
+ if (ap == NULL)
+ return 0;
+
+ memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6));
+ memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6));
+
+ return 1;
+}
+
+static int ipv6_invert_tuple(struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_tuple *orig)
+{
+ memcpy(tuple->src.u3.ip6, orig->dst.u3.ip6, sizeof(tuple->src.u3.ip6));
+ memcpy(tuple->dst.u3.ip6, orig->src.u3.ip6, sizeof(tuple->dst.u3.ip6));
+
+ return 1;
+}
+
+static int ipv6_print_tuple(struct seq_file *s,
+ const struct nf_conntrack_tuple *tuple)
+{
+ return seq_printf(s, "src=%x:%x:%x:%x:%x:%x:%x:%x dst=%x:%x:%x:%x:%x:%x:%x:%x ",
+ NIP6(*((struct in6_addr *)tuple->src.u3.ip6)),
+ NIP6(*((struct in6_addr *)tuple->dst.u3.ip6)));
+}
+
+static int ipv6_print_conntrack(struct seq_file *s,
+ const struct nf_conn *conntrack)
+{
+ return 0;
+}
+
+/*
+ * Based on ipv6_skip_exthdr() in net/ipv6/exthdr.c
+ *
+ * This function parses (probably truncated) exthdr set "hdr"
+ * of length "len". "nexthdrp" initially points to some place,
+ * where type of the first header can be found.
+ *
+ * It skips all well-known exthdrs, and returns pointer to the start
+ * of unparsable area i.e. the first header with unknown type.
+ * if success, *nexthdr is updated by type/protocol of this header.
+ *
+ * NOTES: - it may return pointer pointing beyond end of packet,
+ * if the last recognized header is truncated in the middle.
+ * - if packet is truncated, so that all parsed headers are skipped,
+ * it returns -1.
+ * - if packet is fragmented, return pointer of the fragment header.
+ * - ESP is unparsable for now and considered like
+ * normal payload protocol.
+ * - Note also special handling of AUTH header. Thanks to IPsec wizards.
+ */
+
+int nf_ct_ipv6_skip_exthdr(struct sk_buff *skb, int start, u8 *nexthdrp,
+ int len)
+{
+ u8 nexthdr = *nexthdrp;
+
+ while (ipv6_ext_hdr(nexthdr)) {
+ struct ipv6_opt_hdr hdr;
+ int hdrlen;
+
+ if (len < (int)sizeof(struct ipv6_opt_hdr))
+ return -1;
+ if (nexthdr == NEXTHDR_NONE)
+ break;
+ if (nexthdr == NEXTHDR_FRAGMENT)
+ break;
+ if (skb_copy_bits(skb, start, &hdr, sizeof(hdr)))
+ BUG();
+ if (nexthdr == NEXTHDR_AUTH)
+ hdrlen = (hdr.hdrlen+2)<<2;
+ else
+ hdrlen = ipv6_optlen(&hdr);
+
+ nexthdr = hdr.nexthdr;
+ len -= hdrlen;
+ start += hdrlen;
+ }
+
+ *nexthdrp = nexthdr;
+ return start;
+}
+
+static int
+ipv6_prepare(struct sk_buff **pskb, unsigned int hooknum, unsigned int *dataoff,
+ u_int8_t *protonum)
+{
+ unsigned int extoff;
+ unsigned char pnum;
+ int protoff;
+
+ extoff = (u8*)((*pskb)->nh.ipv6h + 1) - (*pskb)->data;
+ pnum = (*pskb)->nh.ipv6h->nexthdr;
+
+ protoff = nf_ct_ipv6_skip_exthdr(*pskb, extoff, &pnum,
+ (*pskb)->len - extoff);
+
+ /*
+ * (protoff == (*pskb)->len) mean that the packet doesn't have no data
+ * except of IPv6 & ext headers. but it's tracked anyway. - YK
+ */
+ if ((protoff < 0) || (protoff > (*pskb)->len)) {
+ DEBUGP("ip6_conntrack_core: can't find proto in pkt\n");
+ NF_CT_STAT_INC(error);
+ NF_CT_STAT_INC(invalid);
+ return -NF_ACCEPT;
+ }
+
+ *dataoff = protoff;
+ *protonum = pnum;
+ return NF_ACCEPT;
+}
+
+static u_int32_t ipv6_get_features(const struct nf_conntrack_tuple *tuple)
+{
+ return NF_CT_F_BASIC;
+}
+
+static unsigned int ipv6_confirm(unsigned int hooknum,
+ struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+
+ /* This is where we call the helper: as the packet goes out. */
+ ct = nf_ct_get(*pskb, &ctinfo);
+ if (ct && ct->helper) {
+ unsigned int ret, protoff;
+ unsigned int extoff = (u8*)((*pskb)->nh.ipv6h + 1)
+ - (*pskb)->data;
+ unsigned char pnum = (*pskb)->nh.ipv6h->nexthdr;
+
+ protoff = nf_ct_ipv6_skip_exthdr(*pskb, extoff, &pnum,
+ (*pskb)->len - extoff);
+ if (protoff < 0 || protoff > (*pskb)->len ||
+ pnum == NEXTHDR_FRAGMENT) {
+ DEBUGP("proto header not found\n");
+ return NF_ACCEPT;
+ }
+
+ ret = ct->helper->help(pskb, protoff, ct, ctinfo);
+ if (ret != NF_ACCEPT)
+ return ret;
+ }
+
+ /* We've seen it coming out the other side: confirm it */
+
+ return nf_conntrack_confirm(pskb);
+}
+
+extern struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb);
+extern void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb,
+ struct net_device *in,
+ struct net_device *out,
+ int (*okfn)(struct sk_buff *));
+static unsigned int ipv6_defrag(unsigned int hooknum,
+ struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ struct sk_buff *reasm;
+
+ /* Previously seen (loopback)? */
+ if ((*pskb)->nfct)
+ return NF_ACCEPT;
+
+ reasm = nf_ct_frag6_gather(*pskb);
+
+ /* queued */
+ if (reasm == NULL)
+ return NF_STOLEN;
+
+ /* error occured or not fragmented */
+ if (reasm == *pskb)
+ return NF_ACCEPT;
+
+ nf_ct_frag6_output(hooknum, reasm, (struct net_device *)in,
+ (struct net_device *)out, okfn);
+
+ return NF_STOLEN;
+}
+
+static unsigned int ipv6_conntrack_in(unsigned int hooknum,
+ struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ struct sk_buff *reasm = (*pskb)->nfct_reasm;
+
+ /* This packet is fragmented and has reassembled packet. */
+ if (reasm) {
+ /* Reassembled packet isn't parsed yet ? */
+ if (!reasm->nfct) {
+ unsigned int ret;
+
+ ret = nf_conntrack_in(PF_INET6, hooknum, &reasm);
+ if (ret != NF_ACCEPT)
+ return ret;
+ }
+ nf_conntrack_get(reasm->nfct);
+ (*pskb)->nfct = reasm->nfct;
+ return NF_ACCEPT;
+ }
+
+ return nf_conntrack_in(PF_INET6, hooknum, pskb);
+}
+
+static unsigned int ipv6_conntrack_local(unsigned int hooknum,
+ struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ /* root is playing with raw sockets. */
+ if ((*pskb)->len < sizeof(struct ipv6hdr)) {
+ if (net_ratelimit())
+ printk("ipv6_conntrack_local: packet too short\n");
+ return NF_ACCEPT;
+ }
+ return ipv6_conntrack_in(hooknum, pskb, in, out, okfn);
+}
+
+/* Connection tracking may drop packets, but never alters them, so
+ make it the first hook. */
+static struct nf_hook_ops ipv6_conntrack_defrag_ops = {
+ .hook = ipv6_defrag,
+ .owner = THIS_MODULE,
+ .pf = PF_INET6,
+ .hooknum = NF_IP6_PRE_ROUTING,
+ .priority = NF_IP6_PRI_CONNTRACK_DEFRAG,
+};
+
+static struct nf_hook_ops ipv6_conntrack_in_ops = {
+ .hook = ipv6_conntrack_in,
+ .owner = THIS_MODULE,
+ .pf = PF_INET6,
+ .hooknum = NF_IP6_PRE_ROUTING,
+ .priority = NF_IP6_PRI_CONNTRACK,
+};
+
+static struct nf_hook_ops ipv6_conntrack_local_out_ops = {
+ .hook = ipv6_conntrack_local,
+ .owner = THIS_MODULE,
+ .pf = PF_INET6,
+ .hooknum = NF_IP6_LOCAL_OUT,
+ .priority = NF_IP6_PRI_CONNTRACK,
+};
+
+static struct nf_hook_ops ipv6_conntrack_defrag_local_out_ops = {
+ .hook = ipv6_defrag,
+ .owner = THIS_MODULE,
+ .pf = PF_INET6,
+ .hooknum = NF_IP6_LOCAL_OUT,
+ .priority = NF_IP6_PRI_CONNTRACK_DEFRAG,
+};
+
+/* Refragmenter; last chance. */
+static struct nf_hook_ops ipv6_conntrack_out_ops = {
+ .hook = ipv6_confirm,
+ .owner = THIS_MODULE,
+ .pf = PF_INET6,
+ .hooknum = NF_IP6_POST_ROUTING,
+ .priority = NF_IP6_PRI_LAST,
+};
+
+static struct nf_hook_ops ipv6_conntrack_local_in_ops = {
+ .hook = ipv6_confirm,
+ .owner = THIS_MODULE,
+ .pf = PF_INET6,
+ .hooknum = NF_IP6_LOCAL_IN,
+ .priority = NF_IP6_PRI_LAST-1,
+};
+
+#ifdef CONFIG_SYSCTL
+
+/* From nf_conntrack_proto_icmpv6.c */
+extern unsigned long nf_ct_icmpv6_timeout;
+
+/* From nf_conntrack_frag6.c */
+extern unsigned long nf_ct_frag6_timeout;
+extern unsigned int nf_ct_frag6_low_thresh;
+extern unsigned int nf_ct_frag6_high_thresh;
+
+static struct ctl_table_header *nf_ct_ipv6_sysctl_header;
+
+static ctl_table nf_ct_sysctl_table[] = {
+ {
+ .ctl_name = NET_NF_CONNTRACK_ICMPV6_TIMEOUT,
+ .procname = "nf_conntrack_icmpv6_timeout",
+ .data = &nf_ct_icmpv6_timeout,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_NF_CONNTRACK_FRAG6_TIMEOUT,
+ .procname = "nf_conntrack_frag6_timeout",
+ .data = &nf_ct_frag6_timeout,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_NF_CONNTRACK_FRAG6_LOW_THRESH,
+ .procname = "nf_conntrack_frag6_low_thresh",
+ .data = &nf_ct_frag6_low_thresh,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_NF_CONNTRACK_FRAG6_HIGH_THRESH,
+ .procname = "nf_conntrack_frag6_high_thresh",
+ .data = &nf_ct_frag6_high_thresh,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ { .ctl_name = 0 }
+};
+
+static ctl_table nf_ct_netfilter_table[] = {
+ {
+ .ctl_name = NET_NETFILTER,
+ .procname = "netfilter",
+ .mode = 0555,
+ .child = nf_ct_sysctl_table,
+ },
+ { .ctl_name = 0 }
+};
+
+static ctl_table nf_ct_net_table[] = {
+ {
+ .ctl_name = CTL_NET,
+ .procname = "net",
+ .mode = 0555,
+ .child = nf_ct_netfilter_table,
+ },
+ { .ctl_name = 0 }
+};
+#endif
+
+struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 = {
+ .l3proto = PF_INET6,
+ .name = "ipv6",
+ .pkt_to_tuple = ipv6_pkt_to_tuple,
+ .invert_tuple = ipv6_invert_tuple,
+ .print_tuple = ipv6_print_tuple,
+ .print_conntrack = ipv6_print_conntrack,
+ .prepare = ipv6_prepare,
+ .get_features = ipv6_get_features,
+ .me = THIS_MODULE,
+};
+
+extern struct nf_conntrack_protocol nf_conntrack_protocol_tcp6;
+extern struct nf_conntrack_protocol nf_conntrack_protocol_udp6;
+extern struct nf_conntrack_protocol nf_conntrack_protocol_icmpv6;
+extern int nf_ct_frag6_init(void);
+extern void nf_ct_frag6_cleanup(void);
+static int init_or_cleanup(int init)
+{
+ int ret = 0;
+
+ if (!init) goto cleanup;
+
+ ret = nf_ct_frag6_init();
+ if (ret < 0) {
+ printk("nf_conntrack_ipv6: can't initialize frag6.\n");
+ goto cleanup_nothing;
+ }
+ ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_tcp6);
+ if (ret < 0) {
+ printk("nf_conntrack_ipv6: can't register tcp.\n");
+ goto cleanup_frag6;
+ }
+
+ ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_udp6);
+ if (ret < 0) {
+ printk("nf_conntrack_ipv6: can't register udp.\n");
+ goto cleanup_tcp;
+ }
+
+ ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_icmpv6);
+ if (ret < 0) {
+ printk("nf_conntrack_ipv6: can't register icmpv6.\n");
+ goto cleanup_udp;
+ }
+
+ ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv6);
+ if (ret < 0) {
+ printk("nf_conntrack_ipv6: can't register ipv6\n");
+ goto cleanup_icmpv6;
+ }
+
+ ret = nf_register_hook(&ipv6_conntrack_defrag_ops);
+ if (ret < 0) {
+ printk("nf_conntrack_ipv6: can't register pre-routing defrag "
+ "hook.\n");
+ goto cleanup_ipv6;
+ }
+
+ ret = nf_register_hook(&ipv6_conntrack_defrag_local_out_ops);
+ if (ret < 0) {
+ printk("nf_conntrack_ipv6: can't register local_out defrag "
+ "hook.\n");
+ goto cleanup_defragops;
+ }
+
+ ret = nf_register_hook(&ipv6_conntrack_in_ops);
+ if (ret < 0) {
+ printk("nf_conntrack_ipv6: can't register pre-routing hook.\n");
+ goto cleanup_defraglocalops;
+ }
+
+ ret = nf_register_hook(&ipv6_conntrack_local_out_ops);
+ if (ret < 0) {
+ printk("nf_conntrack_ipv6: can't register local out hook.\n");
+ goto cleanup_inops;
+ }
+
+ ret = nf_register_hook(&ipv6_conntrack_out_ops);
+ if (ret < 0) {
+ printk("nf_conntrack_ipv6: can't register post-routing hook.\n");
+ goto cleanup_inandlocalops;
+ }
+
+ ret = nf_register_hook(&ipv6_conntrack_local_in_ops);
+ if (ret < 0) {
+ printk("nf_conntrack_ipv6: can't register local in hook.\n");
+ goto cleanup_inoutandlocalops;
+ }
+
+#ifdef CONFIG_SYSCTL
+ nf_ct_ipv6_sysctl_header = register_sysctl_table(nf_ct_net_table, 0);
+ if (nf_ct_ipv6_sysctl_header == NULL) {
+ printk("nf_conntrack: can't register to sysctl.\n");
+ ret = -ENOMEM;
+ goto cleanup_localinops;
+ }
+#endif
+ return ret;
+
+ cleanup:
+ synchronize_net();
+#ifdef CONFIG_SYSCTL
+ unregister_sysctl_table(nf_ct_ipv6_sysctl_header);
+ cleanup_localinops:
+#endif
+ nf_unregister_hook(&ipv6_conntrack_local_in_ops);
+ cleanup_inoutandlocalops:
+ nf_unregister_hook(&ipv6_conntrack_out_ops);
+ cleanup_inandlocalops:
+ nf_unregister_hook(&ipv6_conntrack_local_out_ops);
+ cleanup_inops:
+ nf_unregister_hook(&ipv6_conntrack_in_ops);
+ cleanup_defraglocalops:
+ nf_unregister_hook(&ipv6_conntrack_defrag_local_out_ops);
+ cleanup_defragops:
+ nf_unregister_hook(&ipv6_conntrack_defrag_ops);
+ cleanup_ipv6:
+ nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv6);
+ cleanup_icmpv6:
+ nf_conntrack_protocol_unregister(&nf_conntrack_protocol_icmpv6);
+ cleanup_udp:
+ nf_conntrack_protocol_unregister(&nf_conntrack_protocol_udp6);
+ cleanup_tcp:
+ nf_conntrack_protocol_unregister(&nf_conntrack_protocol_tcp6);
+ cleanup_frag6:
+ nf_ct_frag6_cleanup();
+ cleanup_nothing:
+ return ret;
+}
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Yasuyuki KOZAKAI @USAGI <yasuyuki.kozakai@toshiba.co.jp>");
+
+static int __init init(void)
+{
+ need_nf_conntrack();
+ return init_or_cleanup(1);
+}
+
+static void __exit fini(void)
+{
+ init_or_cleanup(0);
+}
+
+module_init(init);
+module_exit(fini);
+
+void need_ip6_conntrack(void)
+{
+}
+
+EXPORT_SYMBOL(need_ip6_conntrack);
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
new file mode 100644
index 00000000000..c0f1da5497a
--- /dev/null
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -0,0 +1,272 @@
+/*
+ * Copyright (C)2003,2004 USAGI/WIDE Project
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Author:
+ * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
+ *
+ * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
+ * - ICMPv6 tracking support. Derived from the original ip_conntrack code
+ * net/ipv4/netfilter/ip_conntrack_proto_icmp.c which had the following
+ * copyright information:
+ * (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ */
+
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/in6.h>
+#include <linux/icmpv6.h>
+#include <linux/ipv6.h>
+#include <net/ipv6.h>
+#include <net/ip6_checksum.h>
+#include <linux/seq_file.h>
+#include <linux/netfilter_ipv6.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_protocol.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/ipv6/nf_conntrack_icmpv6.h>
+
+unsigned long nf_ct_icmpv6_timeout = 30*HZ;
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+static int icmpv6_pkt_to_tuple(const struct sk_buff *skb,
+ unsigned int dataoff,
+ struct nf_conntrack_tuple *tuple)
+{
+ struct icmp6hdr _hdr, *hp;
+
+ hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
+ if (hp == NULL)
+ return 0;
+ tuple->dst.u.icmp.type = hp->icmp6_type;
+ tuple->src.u.icmp.id = hp->icmp6_identifier;
+ tuple->dst.u.icmp.code = hp->icmp6_code;
+
+ return 1;
+}
+
+static int icmpv6_invert_tuple(struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_tuple *orig)
+{
+ /* Add 1; spaces filled with 0. */
+ static u_int8_t invmap[] = {
+ [ICMPV6_ECHO_REQUEST - 128] = ICMPV6_ECHO_REPLY + 1,
+ [ICMPV6_ECHO_REPLY - 128] = ICMPV6_ECHO_REQUEST + 1,
+ [ICMPV6_NI_QUERY - 128] = ICMPV6_NI_QUERY + 1,
+ [ICMPV6_NI_REPLY - 128] = ICMPV6_NI_REPLY +1
+ };
+
+ __u8 type = orig->dst.u.icmp.type - 128;
+ if (type >= sizeof(invmap) || !invmap[type])
+ return 0;
+
+ tuple->src.u.icmp.id = orig->src.u.icmp.id;
+ tuple->dst.u.icmp.type = invmap[type] - 1;
+ tuple->dst.u.icmp.code = orig->dst.u.icmp.code;
+ return 1;
+}
+
+/* Print out the per-protocol part of the tuple. */
+static int icmpv6_print_tuple(struct seq_file *s,
+ const struct nf_conntrack_tuple *tuple)
+{
+ return seq_printf(s, "type=%u code=%u id=%u ",
+ tuple->dst.u.icmp.type,
+ tuple->dst.u.icmp.code,
+ ntohs(tuple->src.u.icmp.id));
+}
+
+/* Print out the private part of the conntrack. */
+static int icmpv6_print_conntrack(struct seq_file *s,
+ const struct nf_conn *conntrack)
+{
+ return 0;
+}
+
+/* Returns verdict for packet, or -1 for invalid. */
+static int icmpv6_packet(struct nf_conn *ct,
+ const struct sk_buff *skb,
+ unsigned int dataoff,
+ enum ip_conntrack_info ctinfo,
+ int pf,
+ unsigned int hooknum)
+{
+ /* Try to delete connection immediately after all replies:
+ won't actually vanish as we still have skb, and del_timer
+ means this will only run once even if count hits zero twice
+ (theoretically possible with SMP) */
+ if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) {
+ if (atomic_dec_and_test(&ct->proto.icmp.count)
+ && del_timer(&ct->timeout))
+ ct->timeout.function((unsigned long)ct);
+ } else {
+ atomic_inc(&ct->proto.icmp.count);
+ nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb);
+ nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmpv6_timeout);
+ }
+
+ return NF_ACCEPT;
+}
+
+/* Called when a new connection for this protocol found. */
+static int icmpv6_new(struct nf_conn *conntrack,
+ const struct sk_buff *skb,
+ unsigned int dataoff)
+{
+ static u_int8_t valid_new[] = {
+ [ICMPV6_ECHO_REQUEST - 128] = 1,
+ [ICMPV6_NI_QUERY - 128] = 1
+ };
+
+ if (conntrack->tuplehash[0].tuple.dst.u.icmp.type - 128 >= sizeof(valid_new)
+ || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type - 128]) {
+ /* Can't create a new ICMPv6 `conn' with this. */
+ DEBUGP("icmp: can't create new conn with type %u\n",
+ conntrack->tuplehash[0].tuple.dst.u.icmp.type);
+ NF_CT_DUMP_TUPLE(&conntrack->tuplehash[0].tuple);
+ return 0;
+ }
+ atomic_set(&conntrack->proto.icmp.count, 0);
+ return 1;
+}
+
+extern int
+nf_ct_ipv6_skip_exthdr(struct sk_buff *skb, int start, u8 *nexthdrp, int len);
+extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6;
+static int
+icmpv6_error_message(struct sk_buff *skb,
+ unsigned int icmp6off,
+ enum ip_conntrack_info *ctinfo,
+ unsigned int hooknum)
+{
+ struct nf_conntrack_tuple intuple, origtuple;
+ struct nf_conntrack_tuple_hash *h;
+ struct icmp6hdr _hdr, *hp;
+ unsigned int inip6off;
+ struct nf_conntrack_protocol *inproto;
+ u_int8_t inprotonum;
+ unsigned int inprotoff;
+
+ NF_CT_ASSERT(skb->nfct == NULL);
+
+ hp = skb_header_pointer(skb, icmp6off, sizeof(_hdr), &_hdr);
+ if (hp == NULL) {
+ DEBUGP("icmpv6_error: Can't get ICMPv6 hdr.\n");
+ return -NF_ACCEPT;
+ }
+
+ inip6off = icmp6off + sizeof(_hdr);
+ if (skb_copy_bits(skb, inip6off+offsetof(struct ipv6hdr, nexthdr),
+ &inprotonum, sizeof(inprotonum)) != 0) {
+ DEBUGP("icmpv6_error: Can't get nexthdr in inner IPv6 header.\n");
+ return -NF_ACCEPT;
+ }
+ inprotoff = nf_ct_ipv6_skip_exthdr(skb,
+ inip6off + sizeof(struct ipv6hdr),
+ &inprotonum,
+ skb->len - inip6off
+ - sizeof(struct ipv6hdr));
+
+ if ((inprotoff < 0) || (inprotoff > skb->len) ||
+ (inprotonum == NEXTHDR_FRAGMENT)) {
+ DEBUGP("icmpv6_error: Can't get protocol header in ICMPv6 payload.\n");
+ return -NF_ACCEPT;
+ }
+
+ inproto = nf_ct_find_proto(PF_INET6, inprotonum);
+
+ /* Are they talking about one of our connections? */
+ if (!nf_ct_get_tuple(skb, inip6off, inprotoff, PF_INET6, inprotonum,
+ &origtuple, &nf_conntrack_l3proto_ipv6, inproto)) {
+ DEBUGP("icmpv6_error: Can't get tuple\n");
+ return -NF_ACCEPT;
+ }
+
+ /* Ordinarily, we'd expect the inverted tupleproto, but it's
+ been preserved inside the ICMP. */
+ if (!nf_ct_invert_tuple(&intuple, &origtuple,
+ &nf_conntrack_l3proto_ipv6, inproto)) {
+ DEBUGP("icmpv6_error: Can't invert tuple\n");
+ return -NF_ACCEPT;
+ }
+
+ *ctinfo = IP_CT_RELATED;
+
+ h = nf_conntrack_find_get(&intuple, NULL);
+ if (!h) {
+ DEBUGP("icmpv6_error: no match\n");
+ return -NF_ACCEPT;
+ } else {
+ if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
+ *ctinfo += IP_CT_IS_REPLY;
+ }
+
+ /* Update skb to refer to this connection */
+ skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general;
+ skb->nfctinfo = *ctinfo;
+ return -NF_ACCEPT;
+}
+
+static int
+icmpv6_error(struct sk_buff *skb, unsigned int dataoff,
+ enum ip_conntrack_info *ctinfo, int pf, unsigned int hooknum)
+{
+ struct icmp6hdr _ih, *icmp6h;
+
+ icmp6h = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih);
+ if (icmp6h == NULL) {
+ if (LOG_INVALID(IPPROTO_ICMPV6))
+ nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL,
+ "nf_ct_icmpv6: short packet ");
+ return -NF_ACCEPT;
+ }
+
+ if (hooknum != NF_IP6_PRE_ROUTING)
+ goto skipped;
+
+ /* Ignore it if the checksum's bogus. */
+ if (csum_ipv6_magic(&skb->nh.ipv6h->saddr, &skb->nh.ipv6h->daddr,
+ skb->len - dataoff, IPPROTO_ICMPV6,
+ skb_checksum(skb, dataoff,
+ skb->len - dataoff, 0))) {
+ nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL,
+ "nf_ct_icmpv6: ICMPv6 checksum failed\n");
+ return -NF_ACCEPT;
+ }
+
+skipped:
+
+ /* is not error message ? */
+ if (icmp6h->icmp6_type >= 128)
+ return NF_ACCEPT;
+
+ return icmpv6_error_message(skb, dataoff, ctinfo, hooknum);
+}
+
+struct nf_conntrack_protocol nf_conntrack_protocol_icmpv6 =
+{
+ .l3proto = PF_INET6,
+ .proto = IPPROTO_ICMPV6,
+ .name = "icmpv6",
+ .pkt_to_tuple = icmpv6_pkt_to_tuple,
+ .invert_tuple = icmpv6_invert_tuple,
+ .print_tuple = icmpv6_print_tuple,
+ .print_conntrack = icmpv6_print_conntrack,
+ .packet = icmpv6_packet,
+ .new = icmpv6_new,
+ .error = icmpv6_error,
+};
+
+EXPORT_SYMBOL(nf_conntrack_protocol_icmpv6);
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
new file mode 100644
index 00000000000..c2c52af9e56
--- /dev/null
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -0,0 +1,897 @@
+/*
+ * IPv6 fragment reassembly for connection tracking
+ *
+ * Copyright (C)2004 USAGI/WIDE Project
+ *
+ * Author:
+ * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
+ *
+ * Based on: net/ipv6/reassembly.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/jiffies.h>
+#include <linux/net.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/in6.h>
+#include <linux/ipv6.h>
+#include <linux/icmpv6.h>
+#include <linux/random.h>
+#include <linux/jhash.h>
+
+#include <net/sock.h>
+#include <net/snmp.h>
+
+#include <net/ipv6.h>
+#include <net/protocol.h>
+#include <net/transp_v6.h>
+#include <net/rawv6.h>
+#include <net/ndisc.h>
+#include <net/addrconf.h>
+#include <linux/sysctl.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+#define NF_CT_FRAG6_HIGH_THRESH 262144 /* == 256*1024 */
+#define NF_CT_FRAG6_LOW_THRESH 196608 /* == 192*1024 */
+#define NF_CT_FRAG6_TIMEOUT IPV6_FRAG_TIMEOUT
+
+unsigned int nf_ct_frag6_high_thresh = 256*1024;
+unsigned int nf_ct_frag6_low_thresh = 192*1024;
+unsigned long nf_ct_frag6_timeout = IPV6_FRAG_TIMEOUT;
+
+struct nf_ct_frag6_skb_cb
+{
+ struct inet6_skb_parm h;
+ int offset;
+ struct sk_buff *orig;
+};
+
+#define NFCT_FRAG6_CB(skb) ((struct nf_ct_frag6_skb_cb*)((skb)->cb))
+
+struct nf_ct_frag6_queue
+{
+ struct nf_ct_frag6_queue *next;
+ struct list_head lru_list; /* lru list member */
+
+ __u32 id; /* fragment id */
+ struct in6_addr saddr;
+ struct in6_addr daddr;
+
+ spinlock_t lock;
+ atomic_t refcnt;
+ struct timer_list timer; /* expire timer */
+ struct sk_buff *fragments;
+ int len;
+ int meat;
+ struct timeval stamp;
+ unsigned int csum;
+ __u8 last_in; /* has first/last segment arrived? */
+#define COMPLETE 4
+#define FIRST_IN 2
+#define LAST_IN 1
+ __u16 nhoffset;
+ struct nf_ct_frag6_queue **pprev;
+};
+
+/* Hash table. */
+
+#define FRAG6Q_HASHSZ 64
+
+static struct nf_ct_frag6_queue *nf_ct_frag6_hash[FRAG6Q_HASHSZ];
+static rwlock_t nf_ct_frag6_lock = RW_LOCK_UNLOCKED;
+static u32 nf_ct_frag6_hash_rnd;
+static LIST_HEAD(nf_ct_frag6_lru_list);
+int nf_ct_frag6_nqueues = 0;
+
+static __inline__ void __fq_unlink(struct nf_ct_frag6_queue *fq)
+{
+ if (fq->next)
+ fq->next->pprev = fq->pprev;
+ *fq->pprev = fq->next;
+ list_del(&fq->lru_list);
+ nf_ct_frag6_nqueues--;
+}
+
+static __inline__ void fq_unlink(struct nf_ct_frag6_queue *fq)
+{
+ write_lock(&nf_ct_frag6_lock);
+ __fq_unlink(fq);
+ write_unlock(&nf_ct_frag6_lock);
+}
+
+static unsigned int ip6qhashfn(u32 id, struct in6_addr *saddr,
+ struct in6_addr *daddr)
+{
+ u32 a, b, c;
+
+ a = saddr->s6_addr32[0];
+ b = saddr->s6_addr32[1];
+ c = saddr->s6_addr32[2];
+
+ a += JHASH_GOLDEN_RATIO;
+ b += JHASH_GOLDEN_RATIO;
+ c += nf_ct_frag6_hash_rnd;
+ __jhash_mix(a, b, c);
+
+ a += saddr->s6_addr32[3];
+ b += daddr->s6_addr32[0];
+ c += daddr->s6_addr32[1];
+ __jhash_mix(a, b, c);
+
+ a += daddr->s6_addr32[2];
+ b += daddr->s6_addr32[3];
+ c += id;
+ __jhash_mix(a, b, c);
+
+ return c & (FRAG6Q_HASHSZ - 1);
+}
+
+static struct timer_list nf_ct_frag6_secret_timer;
+int nf_ct_frag6_secret_interval = 10 * 60 * HZ;
+
+static void nf_ct_frag6_secret_rebuild(unsigned long dummy)
+{
+ unsigned long now = jiffies;
+ int i;
+
+ write_lock(&nf_ct_frag6_lock);
+ get_random_bytes(&nf_ct_frag6_hash_rnd, sizeof(u32));
+ for (i = 0; i < FRAG6Q_HASHSZ; i++) {
+ struct nf_ct_frag6_queue *q;
+
+ q = nf_ct_frag6_hash[i];
+ while (q) {
+ struct nf_ct_frag6_queue *next = q->next;
+ unsigned int hval = ip6qhashfn(q->id,
+ &q->saddr,
+ &q->daddr);
+
+ if (hval != i) {
+ /* Unlink. */
+ if (q->next)
+ q->next->pprev = q->pprev;
+ *q->pprev = q->next;
+
+ /* Relink to new hash chain. */
+ if ((q->next = nf_ct_frag6_hash[hval]) != NULL)
+ q->next->pprev = &q->next;
+ nf_ct_frag6_hash[hval] = q;
+ q->pprev = &nf_ct_frag6_hash[hval];
+ }
+
+ q = next;
+ }
+ }
+ write_unlock(&nf_ct_frag6_lock);
+
+ mod_timer(&nf_ct_frag6_secret_timer, now + nf_ct_frag6_secret_interval);
+}
+
+atomic_t nf_ct_frag6_mem = ATOMIC_INIT(0);
+
+/* Memory Tracking Functions. */
+static inline void frag_kfree_skb(struct sk_buff *skb, unsigned int *work)
+{
+ if (work)
+ *work -= skb->truesize;
+ atomic_sub(skb->truesize, &nf_ct_frag6_mem);
+ if (NFCT_FRAG6_CB(skb)->orig)
+ kfree_skb(NFCT_FRAG6_CB(skb)->orig);
+
+ kfree_skb(skb);
+}
+
+static inline void frag_free_queue(struct nf_ct_frag6_queue *fq,
+ unsigned int *work)
+{
+ if (work)
+ *work -= sizeof(struct nf_ct_frag6_queue);
+ atomic_sub(sizeof(struct nf_ct_frag6_queue), &nf_ct_frag6_mem);
+ kfree(fq);
+}
+
+static inline struct nf_ct_frag6_queue *frag_alloc_queue(void)
+{
+ struct nf_ct_frag6_queue *fq = kmalloc(sizeof(struct nf_ct_frag6_queue), GFP_ATOMIC);
+
+ if (!fq)
+ return NULL;
+ atomic_add(sizeof(struct nf_ct_frag6_queue), &nf_ct_frag6_mem);
+ return fq;
+}
+
+/* Destruction primitives. */
+
+/* Complete destruction of fq. */
+static void nf_ct_frag6_destroy(struct nf_ct_frag6_queue *fq,
+ unsigned int *work)
+{
+ struct sk_buff *fp;
+
+ BUG_TRAP(fq->last_in&COMPLETE);
+ BUG_TRAP(del_timer(&fq->timer) == 0);
+
+ /* Release all fragment data. */
+ fp = fq->fragments;
+ while (fp) {
+ struct sk_buff *xp = fp->next;
+
+ frag_kfree_skb(fp, work);
+ fp = xp;
+ }
+
+ frag_free_queue(fq, work);
+}
+
+static __inline__ void fq_put(struct nf_ct_frag6_queue *fq, unsigned int *work)
+{
+ if (atomic_dec_and_test(&fq->refcnt))
+ nf_ct_frag6_destroy(fq, work);
+}
+
+/* Kill fq entry. It is not destroyed immediately,
+ * because caller (and someone more) holds reference count.
+ */
+static __inline__ void fq_kill(struct nf_ct_frag6_queue *fq)
+{
+ if (del_timer(&fq->timer))
+ atomic_dec(&fq->refcnt);
+
+ if (!(fq->last_in & COMPLETE)) {
+ fq_unlink(fq);
+ atomic_dec(&fq->refcnt);
+ fq->last_in |= COMPLETE;
+ }
+}
+
+static void nf_ct_frag6_evictor(void)
+{
+ struct nf_ct_frag6_queue *fq;
+ struct list_head *tmp;
+ unsigned int work;
+
+ work = atomic_read(&nf_ct_frag6_mem);
+ if (work <= nf_ct_frag6_low_thresh)
+ return;
+
+ work -= nf_ct_frag6_low_thresh;
+ while (work > 0) {
+ read_lock(&nf_ct_frag6_lock);
+ if (list_empty(&nf_ct_frag6_lru_list)) {
+ read_unlock(&nf_ct_frag6_lock);
+ return;
+ }
+ tmp = nf_ct_frag6_lru_list.next;
+ BUG_ON(tmp == NULL);
+ fq = list_entry(tmp, struct nf_ct_frag6_queue, lru_list);
+ atomic_inc(&fq->refcnt);
+ read_unlock(&nf_ct_frag6_lock);
+
+ spin_lock(&fq->lock);
+ if (!(fq->last_in&COMPLETE))
+ fq_kill(fq);
+ spin_unlock(&fq->lock);
+
+ fq_put(fq, &work);
+ }
+}
+
+static void nf_ct_frag6_expire(unsigned long data)
+{
+ struct nf_ct_frag6_queue *fq = (struct nf_ct_frag6_queue *) data;
+
+ spin_lock(&fq->lock);
+
+ if (fq->last_in & COMPLETE)
+ goto out;
+
+ fq_kill(fq);
+
+out:
+ spin_unlock(&fq->lock);
+ fq_put(fq, NULL);
+}
+
+/* Creation primitives. */
+
+
+static struct nf_ct_frag6_queue *nf_ct_frag6_intern(unsigned int hash,
+ struct nf_ct_frag6_queue *fq_in)
+{
+ struct nf_ct_frag6_queue *fq;
+
+ write_lock(&nf_ct_frag6_lock);
+#ifdef CONFIG_SMP
+ for (fq = nf_ct_frag6_hash[hash]; fq; fq = fq->next) {
+ if (fq->id == fq_in->id &&
+ !ipv6_addr_cmp(&fq_in->saddr, &fq->saddr) &&
+ !ipv6_addr_cmp(&fq_in->daddr, &fq->daddr)) {
+ atomic_inc(&fq->refcnt);
+ write_unlock(&nf_ct_frag6_lock);
+ fq_in->last_in |= COMPLETE;
+ fq_put(fq_in, NULL);
+ return fq;
+ }
+ }
+#endif
+ fq = fq_in;
+
+ if (!mod_timer(&fq->timer, jiffies + nf_ct_frag6_timeout))
+ atomic_inc(&fq->refcnt);
+
+ atomic_inc(&fq->refcnt);
+ if ((fq->next = nf_ct_frag6_hash[hash]) != NULL)
+ fq->next->pprev = &fq->next;
+ nf_ct_frag6_hash[hash] = fq;
+ fq->pprev = &nf_ct_frag6_hash[hash];
+ INIT_LIST_HEAD(&fq->lru_list);
+ list_add_tail(&fq->lru_list, &nf_ct_frag6_lru_list);
+ nf_ct_frag6_nqueues++;
+ write_unlock(&nf_ct_frag6_lock);
+ return fq;
+}
+
+
+static struct nf_ct_frag6_queue *
+nf_ct_frag6_create(unsigned int hash, u32 id, struct in6_addr *src, struct in6_addr *dst)
+{
+ struct nf_ct_frag6_queue *fq;
+
+ if ((fq = frag_alloc_queue()) == NULL) {
+ DEBUGP("Can't alloc new queue\n");
+ goto oom;
+ }
+
+ memset(fq, 0, sizeof(struct nf_ct_frag6_queue));
+
+ fq->id = id;
+ ipv6_addr_copy(&fq->saddr, src);
+ ipv6_addr_copy(&fq->daddr, dst);
+
+ init_timer(&fq->timer);
+ fq->timer.function = nf_ct_frag6_expire;
+ fq->timer.data = (long) fq;
+ fq->lock = SPIN_LOCK_UNLOCKED;
+ atomic_set(&fq->refcnt, 1);
+
+ return nf_ct_frag6_intern(hash, fq);
+
+oom:
+ return NULL;
+}
+
+static __inline__ struct nf_ct_frag6_queue *
+fq_find(u32 id, struct in6_addr *src, struct in6_addr *dst)
+{
+ struct nf_ct_frag6_queue *fq;
+ unsigned int hash = ip6qhashfn(id, src, dst);
+
+ read_lock(&nf_ct_frag6_lock);
+ for (fq = nf_ct_frag6_hash[hash]; fq; fq = fq->next) {
+ if (fq->id == id &&
+ !ipv6_addr_cmp(src, &fq->saddr) &&
+ !ipv6_addr_cmp(dst, &fq->daddr)) {
+ atomic_inc(&fq->refcnt);
+ read_unlock(&nf_ct_frag6_lock);
+ return fq;
+ }
+ }
+ read_unlock(&nf_ct_frag6_lock);
+
+ return nf_ct_frag6_create(hash, id, src, dst);
+}
+
+
+static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
+ struct frag_hdr *fhdr, int nhoff)
+{
+ struct sk_buff *prev, *next;
+ int offset, end;
+
+ if (fq->last_in & COMPLETE) {
+ DEBUGP("Allready completed\n");
+ goto err;
+ }
+
+ offset = ntohs(fhdr->frag_off) & ~0x7;
+ end = offset + (ntohs(skb->nh.ipv6h->payload_len) -
+ ((u8 *) (fhdr + 1) - (u8 *) (skb->nh.ipv6h + 1)));
+
+ if ((unsigned int)end > IPV6_MAXPLEN) {
+ DEBUGP("offset is too large.\n");
+ return -1;
+ }
+
+ if (skb->ip_summed == CHECKSUM_HW)
+ skb->csum = csum_sub(skb->csum,
+ csum_partial(skb->nh.raw,
+ (u8*)(fhdr + 1) - skb->nh.raw,
+ 0));
+
+ /* Is this the final fragment? */
+ if (!(fhdr->frag_off & htons(IP6_MF))) {
+ /* If we already have some bits beyond end
+ * or have different end, the segment is corrupted.
+ */
+ if (end < fq->len ||
+ ((fq->last_in & LAST_IN) && end != fq->len)) {
+ DEBUGP("already received last fragment\n");
+ goto err;
+ }
+ fq->last_in |= LAST_IN;
+ fq->len = end;
+ } else {
+ /* Check if the fragment is rounded to 8 bytes.
+ * Required by the RFC.
+ */
+ if (end & 0x7) {
+ /* RFC2460 says always send parameter problem in
+ * this case. -DaveM
+ */
+ DEBUGP("the end of this fragment is not rounded to 8 bytes.\n");
+ return -1;
+ }
+ if (end > fq->len) {
+ /* Some bits beyond end -> corruption. */
+ if (fq->last_in & LAST_IN) {
+ DEBUGP("last packet already reached.\n");
+ goto err;
+ }
+ fq->len = end;
+ }
+ }
+
+ if (end == offset)
+ goto err;
+
+ /* Point into the IP datagram 'data' part. */
+ if (!pskb_pull(skb, (u8 *) (fhdr + 1) - skb->data)) {
+ DEBUGP("queue: message is too short.\n");
+ goto err;
+ }
+ if (end-offset < skb->len) {
+ if (pskb_trim(skb, end - offset)) {
+ DEBUGP("Can't trim\n");
+ goto err;
+ }
+ if (skb->ip_summed != CHECKSUM_UNNECESSARY)
+ skb->ip_summed = CHECKSUM_NONE;
+ }
+
+ /* Find out which fragments are in front and at the back of us
+ * in the chain of fragments so far. We must know where to put
+ * this fragment, right?
+ */
+ prev = NULL;
+ for (next = fq->fragments; next != NULL; next = next->next) {
+ if (NFCT_FRAG6_CB(next)->offset >= offset)
+ break; /* bingo! */
+ prev = next;
+ }
+
+ /* We found where to put this one. Check for overlap with
+ * preceding fragment, and, if needed, align things so that
+ * any overlaps are eliminated.
+ */
+ if (prev) {
+ int i = (NFCT_FRAG6_CB(prev)->offset + prev->len) - offset;
+
+ if (i > 0) {
+ offset += i;
+ if (end <= offset) {
+ DEBUGP("overlap\n");
+ goto err;
+ }
+ if (!pskb_pull(skb, i)) {
+ DEBUGP("Can't pull\n");
+ goto err;
+ }
+ if (skb->ip_summed != CHECKSUM_UNNECESSARY)
+ skb->ip_summed = CHECKSUM_NONE;
+ }
+ }
+
+ /* Look for overlap with succeeding segments.
+ * If we can merge fragments, do it.
+ */
+ while (next && NFCT_FRAG6_CB(next)->offset < end) {
+ /* overlap is 'i' bytes */
+ int i = end - NFCT_FRAG6_CB(next)->offset;
+
+ if (i < next->len) {
+ /* Eat head of the next overlapped fragment
+ * and leave the loop. The next ones cannot overlap.
+ */
+ DEBUGP("Eat head of the overlapped parts.: %d", i);
+ if (!pskb_pull(next, i))
+ goto err;
+
+ /* next fragment */
+ NFCT_FRAG6_CB(next)->offset += i;
+ fq->meat -= i;
+ if (next->ip_summed != CHECKSUM_UNNECESSARY)
+ next->ip_summed = CHECKSUM_NONE;
+ break;
+ } else {
+ struct sk_buff *free_it = next;
+
+ /* Old fragmnet is completely overridden with
+ * new one drop it.
+ */
+ next = next->next;
+
+ if (prev)
+ prev->next = next;
+ else
+ fq->fragments = next;
+
+ fq->meat -= free_it->len;
+ frag_kfree_skb(free_it, NULL);
+ }
+ }
+
+ NFCT_FRAG6_CB(skb)->offset = offset;
+
+ /* Insert this fragment in the chain of fragments. */
+ skb->next = next;
+ if (prev)
+ prev->next = skb;
+ else
+ fq->fragments = skb;
+
+ skb->dev = NULL;
+ skb_get_timestamp(skb, &fq->stamp);
+ fq->meat += skb->len;
+ atomic_add(skb->truesize, &nf_ct_frag6_mem);
+
+ /* The first fragment.
+ * nhoffset is obtained from the first fragment, of course.
+ */
+ if (offset == 0) {
+ fq->nhoffset = nhoff;
+ fq->last_in |= FIRST_IN;
+ }
+ write_lock(&nf_ct_frag6_lock);
+ list_move_tail(&fq->lru_list, &nf_ct_frag6_lru_list);
+ write_unlock(&nf_ct_frag6_lock);
+ return 0;
+
+err:
+ return -1;
+}
+
+/*
+ * Check if this packet is complete.
+ * Returns NULL on failure by any reason, and pointer
+ * to current nexthdr field in reassembled frame.
+ *
+ * It is called with locked fq, and caller must check that
+ * queue is eligible for reassembly i.e. it is not COMPLETE,
+ * the last and the first frames arrived and all the bits are here.
+ */
+static struct sk_buff *
+nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev)
+{
+ struct sk_buff *fp, *op, *head = fq->fragments;
+ int payload_len;
+
+ fq_kill(fq);
+
+ BUG_TRAP(head != NULL);
+ BUG_TRAP(NFCT_FRAG6_CB(head)->offset == 0);
+
+ /* Unfragmented part is taken from the first segment. */
+ payload_len = (head->data - head->nh.raw) - sizeof(struct ipv6hdr) + fq->len - sizeof(struct frag_hdr);
+ if (payload_len > IPV6_MAXPLEN) {
+ DEBUGP("payload len is too large.\n");
+ goto out_oversize;
+ }
+
+ /* Head of list must not be cloned. */
+ if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC)) {
+ DEBUGP("skb is cloned but can't expand head");
+ goto out_oom;
+ }
+
+ /* If the first fragment is fragmented itself, we split
+ * it to two chunks: the first with data and paged part
+ * and the second, holding only fragments. */
+ if (skb_shinfo(head)->frag_list) {
+ struct sk_buff *clone;
+ int i, plen = 0;
+
+ if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL) {
+ DEBUGP("Can't alloc skb\n");
+ goto out_oom;
+ }
+ clone->next = head->next;
+ head->next = clone;
+ skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
+ skb_shinfo(head)->frag_list = NULL;
+ for (i=0; i<skb_shinfo(head)->nr_frags; i++)
+ plen += skb_shinfo(head)->frags[i].size;
+ clone->len = clone->data_len = head->data_len - plen;
+ head->data_len -= clone->len;
+ head->len -= clone->len;
+ clone->csum = 0;
+ clone->ip_summed = head->ip_summed;
+
+ NFCT_FRAG6_CB(clone)->orig = NULL;
+ atomic_add(clone->truesize, &nf_ct_frag6_mem);
+ }
+
+ /* We have to remove fragment header from datagram and to relocate
+ * header in order to calculate ICV correctly. */
+ head->nh.raw[fq->nhoffset] = head->h.raw[0];
+ memmove(head->head + sizeof(struct frag_hdr), head->head,
+ (head->data - head->head) - sizeof(struct frag_hdr));
+ head->mac.raw += sizeof(struct frag_hdr);
+ head->nh.raw += sizeof(struct frag_hdr);
+
+ skb_shinfo(head)->frag_list = head->next;
+ head->h.raw = head->data;
+ skb_push(head, head->data - head->nh.raw);
+ atomic_sub(head->truesize, &nf_ct_frag6_mem);
+
+ for (fp=head->next; fp; fp = fp->next) {
+ head->data_len += fp->len;
+ head->len += fp->len;
+ if (head->ip_summed != fp->ip_summed)
+ head->ip_summed = CHECKSUM_NONE;
+ else if (head->ip_summed == CHECKSUM_HW)
+ head->csum = csum_add(head->csum, fp->csum);
+ head->truesize += fp->truesize;
+ atomic_sub(fp->truesize, &nf_ct_frag6_mem);
+ }
+
+ head->next = NULL;
+ head->dev = dev;
+ skb_set_timestamp(head, &fq->stamp);
+ head->nh.ipv6h->payload_len = htons(payload_len);
+
+ /* Yes, and fold redundant checksum back. 8) */
+ if (head->ip_summed == CHECKSUM_HW)
+ head->csum = csum_partial(head->nh.raw, head->h.raw-head->nh.raw, head->csum);
+
+ fq->fragments = NULL;
+
+ /* all original skbs are linked into the NFCT_FRAG6_CB(head).orig */
+ fp = skb_shinfo(head)->frag_list;
+ if (NFCT_FRAG6_CB(fp)->orig == NULL)
+ /* at above code, head skb is divided into two skbs. */
+ fp = fp->next;
+
+ op = NFCT_FRAG6_CB(head)->orig;
+ for (; fp; fp = fp->next) {
+ struct sk_buff *orig = NFCT_FRAG6_CB(fp)->orig;
+
+ op->next = orig;
+ op = orig;
+ NFCT_FRAG6_CB(fp)->orig = NULL;
+ }
+
+ return head;
+
+out_oversize:
+ if (net_ratelimit())
+ printk(KERN_DEBUG "nf_ct_frag6_reasm: payload len = %d\n", payload_len);
+ goto out_fail;
+out_oom:
+ if (net_ratelimit())
+ printk(KERN_DEBUG "nf_ct_frag6_reasm: no memory for reassembly\n");
+out_fail:
+ return NULL;
+}
+
+/*
+ * find the header just before Fragment Header.
+ *
+ * if success return 0 and set ...
+ * (*prevhdrp): the value of "Next Header Field" in the header
+ * just before Fragment Header.
+ * (*prevhoff): the offset of "Next Header Field" in the header
+ * just before Fragment Header.
+ * (*fhoff) : the offset of Fragment Header.
+ *
+ * Based on ipv6_skip_hdr() in net/ipv6/exthdr.c
+ *
+ */
+static int
+find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff)
+{
+ u8 nexthdr = skb->nh.ipv6h->nexthdr;
+ u8 prev_nhoff = (u8 *)&skb->nh.ipv6h->nexthdr - skb->data;
+ int start = (u8 *)(skb->nh.ipv6h+1) - skb->data;
+ int len = skb->len - start;
+ u8 prevhdr = NEXTHDR_IPV6;
+
+ while (nexthdr != NEXTHDR_FRAGMENT) {
+ struct ipv6_opt_hdr hdr;
+ int hdrlen;
+
+ if (!ipv6_ext_hdr(nexthdr)) {
+ return -1;
+ }
+ if (len < (int)sizeof(struct ipv6_opt_hdr)) {
+ DEBUGP("too short\n");
+ return -1;
+ }
+ if (nexthdr == NEXTHDR_NONE) {
+ DEBUGP("next header is none\n");
+ return -1;
+ }
+ if (skb_copy_bits(skb, start, &hdr, sizeof(hdr)))
+ BUG();
+ if (nexthdr == NEXTHDR_AUTH)
+ hdrlen = (hdr.hdrlen+2)<<2;
+ else
+ hdrlen = ipv6_optlen(&hdr);
+
+ prevhdr = nexthdr;
+ prev_nhoff = start;
+
+ nexthdr = hdr.nexthdr;
+ len -= hdrlen;
+ start += hdrlen;
+ }
+
+ if (len < 0)
+ return -1;
+
+ *prevhdrp = prevhdr;
+ *prevhoff = prev_nhoff;
+ *fhoff = start;
+
+ return 0;
+}
+
+struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb)
+{
+ struct sk_buff *clone;
+ struct net_device *dev = skb->dev;
+ struct frag_hdr *fhdr;
+ struct nf_ct_frag6_queue *fq;
+ struct ipv6hdr *hdr;
+ int fhoff, nhoff;
+ u8 prevhdr;
+ struct sk_buff *ret_skb = NULL;
+
+ /* Jumbo payload inhibits frag. header */
+ if (skb->nh.ipv6h->payload_len == 0) {
+ DEBUGP("payload len = 0\n");
+ return skb;
+ }
+
+ if (find_prev_fhdr(skb, &prevhdr, &nhoff, &fhoff) < 0)
+ return skb;
+
+ clone = skb_clone(skb, GFP_ATOMIC);
+ if (clone == NULL) {
+ DEBUGP("Can't clone skb\n");
+ return skb;
+ }
+
+ NFCT_FRAG6_CB(clone)->orig = skb;
+
+ if (!pskb_may_pull(clone, fhoff + sizeof(*fhdr))) {
+ DEBUGP("message is too short.\n");
+ goto ret_orig;
+ }
+
+ clone->h.raw = clone->data + fhoff;
+ hdr = clone->nh.ipv6h;
+ fhdr = (struct frag_hdr *)clone->h.raw;
+
+ if (!(fhdr->frag_off & htons(0xFFF9))) {
+ DEBUGP("Invalid fragment offset\n");
+ /* It is not a fragmented frame */
+ goto ret_orig;
+ }
+
+ if (atomic_read(&nf_ct_frag6_mem) > nf_ct_frag6_high_thresh)
+ nf_ct_frag6_evictor();
+
+ fq = fq_find(fhdr->identification, &hdr->saddr, &hdr->daddr);
+ if (fq == NULL) {
+ DEBUGP("Can't find and can't create new queue\n");
+ goto ret_orig;
+ }
+
+ spin_lock(&fq->lock);
+
+ if (nf_ct_frag6_queue(fq, clone, fhdr, nhoff) < 0) {
+ spin_unlock(&fq->lock);
+ DEBUGP("Can't insert skb to queue\n");
+ fq_put(fq, NULL);
+ goto ret_orig;
+ }
+
+ if (fq->last_in == (FIRST_IN|LAST_IN) && fq->meat == fq->len) {
+ ret_skb = nf_ct_frag6_reasm(fq, dev);
+ if (ret_skb == NULL)
+ DEBUGP("Can't reassemble fragmented packets\n");
+ }
+ spin_unlock(&fq->lock);
+
+ fq_put(fq, NULL);
+ return ret_skb;
+
+ret_orig:
+ kfree_skb(clone);
+ return skb;
+}
+
+void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb,
+ struct net_device *in, struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ struct sk_buff *s, *s2;
+
+ for (s = NFCT_FRAG6_CB(skb)->orig; s;) {
+ nf_conntrack_put_reasm(s->nfct_reasm);
+ nf_conntrack_get_reasm(skb);
+ s->nfct_reasm = skb;
+
+ s2 = s->next;
+ NF_HOOK_THRESH(PF_INET6, hooknum, s, in, out, okfn,
+ NF_IP6_PRI_CONNTRACK_DEFRAG + 1);
+ s = s2;
+ }
+ nf_conntrack_put_reasm(skb);
+}
+
+int nf_ct_frag6_kfree_frags(struct sk_buff *skb)
+{
+ struct sk_buff *s, *s2;
+
+ for (s = NFCT_FRAG6_CB(skb)->orig; s; s = s2) {
+
+ s2 = s->next;
+ kfree_skb(s);
+ }
+
+ kfree_skb(skb);
+
+ return 0;
+}
+
+int nf_ct_frag6_init(void)
+{
+ nf_ct_frag6_hash_rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^
+ (jiffies ^ (jiffies >> 6)));
+
+ init_timer(&nf_ct_frag6_secret_timer);
+ nf_ct_frag6_secret_timer.function = nf_ct_frag6_secret_rebuild;
+ nf_ct_frag6_secret_timer.expires = jiffies
+ + nf_ct_frag6_secret_interval;
+ add_timer(&nf_ct_frag6_secret_timer);
+
+ return 0;
+}
+
+void nf_ct_frag6_cleanup(void)
+{
+ del_timer(&nf_ct_frag6_secret_timer);
+ nf_ct_frag6_low_thresh = 0;
+ nf_ct_frag6_evictor();
+}
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index a1265a320b1..8e9628f1c4c 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -174,8 +174,10 @@ int ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
/* Not releasing hash table! */
- if (clone)
+ if (clone) {
+ nf_reset(clone);
rawv6_rcv(sk, clone);
+ }
}
sk = __raw_v6_lookup(sk_next(sk), nexthdr, daddr, saddr,
IP6CB(skb)->iif);
@@ -296,13 +298,10 @@ void rawv6_err(struct sock *sk, struct sk_buff *skb,
static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb)
{
if ((raw6_sk(sk)->checksum || sk->sk_filter) &&
- skb->ip_summed != CHECKSUM_UNNECESSARY) {
- if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) {
- /* FIXME: increment a raw6 drops counter here */
- kfree_skb(skb);
- return 0;
- }
- skb->ip_summed = CHECKSUM_UNNECESSARY;
+ skb_checksum_complete(skb)) {
+ /* FIXME: increment a raw6 drops counter here */
+ kfree_skb(skb);
+ return 0;
}
/* Charge it to the socket. */
@@ -335,32 +334,25 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb)
if (!rp->checksum)
skb->ip_summed = CHECKSUM_UNNECESSARY;
- if (skb->ip_summed != CHECKSUM_UNNECESSARY) {
- if (skb->ip_summed == CHECKSUM_HW) {
- skb_postpull_rcsum(skb, skb->nh.raw,
- skb->h.raw - skb->nh.raw);
+ if (skb->ip_summed == CHECKSUM_HW) {
+ skb_postpull_rcsum(skb, skb->nh.raw,
+ skb->h.raw - skb->nh.raw);
+ if (!csum_ipv6_magic(&skb->nh.ipv6h->saddr,
+ &skb->nh.ipv6h->daddr,
+ skb->len, inet->num, skb->csum))
skb->ip_summed = CHECKSUM_UNNECESSARY;
- if (csum_ipv6_magic(&skb->nh.ipv6h->saddr,
- &skb->nh.ipv6h->daddr,
- skb->len, inet->num, skb->csum)) {
- LIMIT_NETDEBUG(KERN_DEBUG "raw v6 hw csum failure.\n");
- skb->ip_summed = CHECKSUM_NONE;
- }
- }
- if (skb->ip_summed == CHECKSUM_NONE)
- skb->csum = ~csum_ipv6_magic(&skb->nh.ipv6h->saddr,
- &skb->nh.ipv6h->daddr,
- skb->len, inet->num, 0);
}
+ if (skb->ip_summed != CHECKSUM_UNNECESSARY)
+ skb->csum = ~csum_ipv6_magic(&skb->nh.ipv6h->saddr,
+ &skb->nh.ipv6h->daddr,
+ skb->len, inet->num, 0);
if (inet->hdrincl) {
- if (skb->ip_summed != CHECKSUM_UNNECESSARY &&
- (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) {
+ if (skb_checksum_complete(skb)) {
/* FIXME: increment a raw6 drops counter here */
kfree_skb(skb);
return 0;
}
- skb->ip_summed = CHECKSUM_UNNECESSARY;
}
rawv6_rcv_skb(sk, skb);
@@ -405,7 +397,7 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk,
if (skb->ip_summed==CHECKSUM_UNNECESSARY) {
err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
} else if (msg->msg_flags&MSG_TRUNC) {
- if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum)))
+ if (__skb_checksum_complete(skb))
goto csum_copy_err;
err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
} else {
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index e4fe9ee484d..5d316cb72ec 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -74,7 +74,7 @@ struct ip6frag_skb_cb
struct frag_queue
{
- struct frag_queue *next;
+ struct hlist_node list;
struct list_head lru_list; /* lru list member */
__u32 id; /* fragment id */
@@ -95,14 +95,13 @@ struct frag_queue
#define FIRST_IN 2
#define LAST_IN 1
__u16 nhoffset;
- struct frag_queue **pprev;
};
/* Hash table. */
#define IP6Q_HASHSZ 64
-static struct frag_queue *ip6_frag_hash[IP6Q_HASHSZ];
+static struct hlist_head ip6_frag_hash[IP6Q_HASHSZ];
static DEFINE_RWLOCK(ip6_frag_lock);
static u32 ip6_frag_hash_rnd;
static LIST_HEAD(ip6_frag_lru_list);
@@ -110,9 +109,7 @@ int ip6_frag_nqueues = 0;
static __inline__ void __fq_unlink(struct frag_queue *fq)
{
- if(fq->next)
- fq->next->pprev = fq->pprev;
- *fq->pprev = fq->next;
+ hlist_del(&fq->list);
list_del(&fq->lru_list);
ip6_frag_nqueues--;
}
@@ -163,28 +160,21 @@ static void ip6_frag_secret_rebuild(unsigned long dummy)
get_random_bytes(&ip6_frag_hash_rnd, sizeof(u32));
for (i = 0; i < IP6Q_HASHSZ; i++) {
struct frag_queue *q;
+ struct hlist_node *p, *n;
- q = ip6_frag_hash[i];
- while (q) {
- struct frag_queue *next = q->next;
+ hlist_for_each_entry_safe(q, p, n, &ip6_frag_hash[i], list) {
unsigned int hval = ip6qhashfn(q->id,
&q->saddr,
&q->daddr);
if (hval != i) {
- /* Unlink. */
- if (q->next)
- q->next->pprev = q->pprev;
- *q->pprev = q->next;
+ hlist_del(&q->list);
/* Relink to new hash chain. */
- if ((q->next = ip6_frag_hash[hval]) != NULL)
- q->next->pprev = &q->next;
- ip6_frag_hash[hval] = q;
- q->pprev = &ip6_frag_hash[hval];
- }
+ hlist_add_head(&q->list,
+ &ip6_frag_hash[hval]);
- q = next;
+ }
}
}
write_unlock(&ip6_frag_lock);
@@ -337,10 +327,13 @@ static struct frag_queue *ip6_frag_intern(unsigned int hash,
struct frag_queue *fq_in)
{
struct frag_queue *fq;
+#ifdef CONFIG_SMP
+ struct hlist_node *n;
+#endif
write_lock(&ip6_frag_lock);
#ifdef CONFIG_SMP
- for (fq = ip6_frag_hash[hash]; fq; fq = fq->next) {
+ hlist_for_each_entry(fq, n, &ip6_frag_hash[hash], list) {
if (fq->id == fq_in->id &&
ipv6_addr_equal(&fq_in->saddr, &fq->saddr) &&
ipv6_addr_equal(&fq_in->daddr, &fq->daddr)) {
@@ -358,10 +351,7 @@ static struct frag_queue *ip6_frag_intern(unsigned int hash,
atomic_inc(&fq->refcnt);
atomic_inc(&fq->refcnt);
- if((fq->next = ip6_frag_hash[hash]) != NULL)
- fq->next->pprev = &fq->next;
- ip6_frag_hash[hash] = fq;
- fq->pprev = &ip6_frag_hash[hash];
+ hlist_add_head(&fq->list, &ip6_frag_hash[hash]);
INIT_LIST_HEAD(&fq->lru_list);
list_add_tail(&fq->lru_list, &ip6_frag_lru_list);
ip6_frag_nqueues++;
@@ -401,10 +391,11 @@ static __inline__ struct frag_queue *
fq_find(u32 id, struct in6_addr *src, struct in6_addr *dst)
{
struct frag_queue *fq;
+ struct hlist_node *n;
unsigned int hash = ip6qhashfn(id, src, dst);
read_lock(&ip6_frag_lock);
- for(fq = ip6_frag_hash[hash]; fq; fq = fq->next) {
+ hlist_for_each_entry(fq, n, &ip6_frag_hash[hash], list) {
if (fq->id == id &&
ipv6_addr_equal(src, &fq->saddr) &&
ipv6_addr_equal(dst, &fq->daddr)) {
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 227e99ed510..a7a537b5059 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1701,16 +1701,14 @@ static void fib6_dump_end(struct netlink_callback *cb)
fib6_walker_unlink(w);
kfree(w);
}
- if (cb->args[1]) {
- cb->done = (void*)cb->args[1];
- cb->args[1] = 0;
- }
+ cb->done = (void*)cb->args[1];
+ cb->args[1] = 0;
}
static int fib6_dump_done(struct netlink_callback *cb)
{
fib6_dump_end(cb);
- return cb->done(cb);
+ return cb->done ? cb->done(cb) : 0;
}
int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index d693cb988b7..62c0e5bd931 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -114,16 +114,9 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
int low = sysctl_local_port_range[0];
int high = sysctl_local_port_range[1];
int remaining = (high - low) + 1;
- int rover;
+ int rover = net_random() % (high - low) + low;
- spin_lock(&tcp_hashinfo.portalloc_lock);
- if (tcp_hashinfo.port_rover < low)
- rover = low;
- else
- rover = tcp_hashinfo.port_rover;
- do { rover++;
- if (rover > high)
- rover = low;
+ do {
head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)];
spin_lock(&head->lock);
inet_bind_bucket_for_each(tb, node, &head->chain)
@@ -132,9 +125,9 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
break;
next:
spin_unlock(&head->lock);
+ if (++rover > high)
+ rover = low;
} while (--remaining > 0);
- tcp_hashinfo.port_rover = rover;
- spin_unlock(&tcp_hashinfo.portalloc_lock);
/* Exhausted local port range during search? It is not
* possible for us to be holding one of the bind hash
@@ -1408,20 +1401,18 @@ out:
static int tcp_v6_checksum_init(struct sk_buff *skb)
{
if (skb->ip_summed == CHECKSUM_HW) {
- skb->ip_summed = CHECKSUM_UNNECESSARY;
if (!tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr,
- &skb->nh.ipv6h->daddr,skb->csum))
+ &skb->nh.ipv6h->daddr,skb->csum)) {
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
return 0;
- LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v6 csum failed\n");
+ }
}
+
+ skb->csum = ~tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr,
+ &skb->nh.ipv6h->daddr, 0);
+
if (skb->len <= 76) {
- if (tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr,
- &skb->nh.ipv6h->daddr,skb_checksum(skb, 0, skb->len, 0)))
- return -1;
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- } else {
- skb->csum = ~tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr,
- &skb->nh.ipv6h->daddr,0);
+ return __skb_checksum_complete(skb);
}
return 0;
}
@@ -1582,7 +1573,7 @@ static int tcp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
goto discard_it;
if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
- tcp_v6_checksum_init(skb) < 0))
+ tcp_v6_checksum_init(skb)))
goto bad_packet;
th = skb->h.th;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index bf9519341fd..e671153b47b 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -248,7 +248,7 @@ try_again:
err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
copied);
} else if (msg->msg_flags&MSG_TRUNC) {
- if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum)))
+ if (__skb_checksum_complete(skb))
goto csum_copy_err;
err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
copied);
@@ -363,13 +363,10 @@ static inline int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
return -1;
}
- if (skb->ip_summed != CHECKSUM_UNNECESSARY) {
- if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) {
- UDP6_INC_STATS_BH(UDP_MIB_INERRORS);
- kfree_skb(skb);
- return 0;
- }
- skb->ip_summed = CHECKSUM_UNNECESSARY;
+ if (skb_checksum_complete(skb)) {
+ UDP6_INC_STATS_BH(UDP_MIB_INERRORS);
+ kfree_skb(skb);
+ return 0;
}
if (sock_queue_rcv_skb(sk,skb)<0) {
@@ -491,13 +488,10 @@ static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
uh = skb->h.uh;
}
- if (skb->ip_summed==CHECKSUM_HW) {
+ if (skb->ip_summed == CHECKSUM_HW &&
+ !csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum))
skb->ip_summed = CHECKSUM_UNNECESSARY;
- if (csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum)) {
- LIMIT_NETDEBUG(KERN_DEBUG "udp v6 hw csum failure.\n");
- skb->ip_summed = CHECKSUM_NONE;
- }
- }
+
if (skb->ip_summed != CHECKSUM_UNNECESSARY)
skb->csum = ~csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, 0);
@@ -521,8 +515,7 @@ static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
goto discard;
- if (skb->ip_summed != CHECKSUM_UNNECESSARY &&
- (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum)))
+ if (skb_checksum_complete(skb))
goto discard;
UDP6_INC_STATS_BH(UDP_MIB_NOPORTS);
diff --git a/net/irda/discovery.c b/net/irda/discovery.c
index c4ba5fa1446..3fefc822c1c 100644
--- a/net/irda/discovery.c
+++ b/net/irda/discovery.c
@@ -194,8 +194,7 @@ void irlmp_expire_discoveries(hashbin_t *log, __u32 saddr, int force)
/* Remove it from the log */
curr = hashbin_remove_this(log, (irda_queue_t *) curr);
- if (curr)
- kfree(curr);
+ kfree(curr);
}
}
diff --git a/net/irda/irias_object.c b/net/irda/irias_object.c
index 6fec428b451..75f2666e863 100644
--- a/net/irda/irias_object.c
+++ b/net/irda/irias_object.c
@@ -122,8 +122,7 @@ static void __irias_delete_attrib(struct ias_attrib *attrib)
IRDA_ASSERT(attrib != NULL, return;);
IRDA_ASSERT(attrib->magic == IAS_ATTRIB_MAGIC, return;);
- if (attrib->name)
- kfree(attrib->name);
+ kfree(attrib->name);
irias_delete_value(attrib->value);
attrib->magic = ~IAS_ATTRIB_MAGIC;
@@ -136,8 +135,7 @@ void __irias_delete_object(struct ias_object *obj)
IRDA_ASSERT(obj != NULL, return;);
IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return;);
- if (obj->name)
- kfree(obj->name);
+ kfree(obj->name);
hashbin_delete(obj->attribs, (FREE_FUNC) __irias_delete_attrib);
@@ -562,14 +560,12 @@ void irias_delete_value(struct ias_value *value)
/* No need to deallocate */
break;
case IAS_STRING:
- /* If string, deallocate string */
- if (value->t.string != NULL)
- kfree(value->t.string);
+ /* Deallocate string */
+ kfree(value->t.string);
break;
case IAS_OCT_SEQ:
- /* If byte stream, deallocate byte stream */
- if (value->t.oct_seq != NULL)
- kfree(value->t.oct_seq);
+ /* Deallocate byte stream */
+ kfree(value->t.oct_seq);
break;
default:
IRDA_DEBUG(0, "%s(), Unknown value type!\n", __FUNCTION__);
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 59d02cbbeb9..c3f0b078345 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -116,7 +116,9 @@ static int llc_ui_send_data(struct sock* sk, struct sk_buff *skb, int noblock)
struct llc_sock* llc = llc_sk(sk);
int rc = 0;
- if (unlikely(llc_data_accept_state(llc->state) || llc->p_flag)) {
+ if (unlikely(llc_data_accept_state(llc->state) ||
+ llc->remote_busy_flag ||
+ llc->p_flag)) {
long timeout = sock_sndtimeo(sk, noblock);
rc = llc_ui_wait_for_busy_core(sk, timeout);
@@ -542,6 +544,7 @@ static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout)
if (sk_wait_event(sk, &timeout,
(sk->sk_shutdown & RCV_SHUTDOWN) ||
(!llc_data_accept_state(llc->state) &&
+ !llc->remote_busy_flag &&
!llc->p_flag)))
break;
rc = -ERESTARTSYS;
diff --git a/net/llc/llc_c_ac.c b/net/llc/llc_c_ac.c
index b0bcfb1f12d..8169f24ed33 100644
--- a/net/llc/llc_c_ac.c
+++ b/net/llc/llc_c_ac.c
@@ -866,7 +866,8 @@ int llc_conn_ac_send_ack_if_needed(struct sock *sk, struct sk_buff *skb)
llc->ack_must_be_send = 1;
llc->ack_pf = pf_bit & 1;
}
- if (((llc->vR - llc->first_pdu_Ns + 129) % 128) >= llc->npta) {
+ if (((llc->vR - llc->first_pdu_Ns + 1 + LLC_2_SEQ_NBR_MODULO)
+ % LLC_2_SEQ_NBR_MODULO) >= llc->npta) {
llc_conn_ac_send_rr_rsp_f_set_ackpf(sk, skb);
llc->ack_must_be_send = 0;
llc->ack_pf = 0;
@@ -994,8 +995,8 @@ static int llc_conn_ac_inc_npta_value(struct sock *sk, struct sk_buff *skb)
llc->dec_step = 0;
llc->dec_cntr = llc->inc_cntr = 2;
++llc->npta;
- if (llc->npta > 127)
- llc->npta = 127 ;
+ if (llc->npta > (u8) ~LLC_2_SEQ_NBR_MODULO)
+ llc->npta = (u8) ~LLC_2_SEQ_NBR_MODULO;
} else
--llc->inc_cntr;
return 0;
@@ -1065,9 +1066,10 @@ int llc_conn_ac_dec_tx_win_size(struct sock *sk, struct sk_buff *skb)
struct llc_sock *llc = llc_sk(sk);
u8 unacked_pdu = skb_queue_len(&llc->pdu_unack_q);
- llc->k -= unacked_pdu;
- if (llc->k < 2)
- llc->k = 2;
+ if (llc->k - unacked_pdu < 1)
+ llc->k = 1;
+ else
+ llc->k -= unacked_pdu;
return 0;
}
@@ -1084,8 +1086,8 @@ int llc_conn_ac_inc_tx_win_size(struct sock *sk, struct sk_buff *skb)
struct llc_sock *llc = llc_sk(sk);
llc->k += 1;
- if (llc->k > 128)
- llc->k = 128 ;
+ if (llc->k > (u8) ~LLC_2_SEQ_NBR_MODULO)
+ llc->k = (u8) ~LLC_2_SEQ_NBR_MODULO;
return 0;
}
@@ -1309,7 +1311,7 @@ int llc_conn_ac_set_vs_nr(struct sock *sk, struct sk_buff *skb)
static int llc_conn_ac_inc_vs_by_1(struct sock *sk, struct sk_buff *skb)
{
- llc_sk(sk)->vS = (llc_sk(sk)->vS + 1) % 128;
+ llc_sk(sk)->vS = (llc_sk(sk)->vS + 1) % LLC_2_SEQ_NBR_MODULO;
return 0;
}
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 8296b38bf27..a84f9221e5f 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -1,3 +1,6 @@
+menu "Core Netfilter Configuration"
+ depends on NET && NETFILTER
+
config NETFILTER_NETLINK
tristate "Netfilter netlink interface"
help
@@ -22,3 +25,74 @@ config NETFILTER_NETLINK_LOG
and is also scheduled to replace the old syslog-based ipt_LOG
and ip6t_LOG modules.
+config NF_CONNTRACK
+ tristate "Layer 3 Independent Connection tracking (EXPERIMENTAL)"
+ depends on EXPERIMENTAL && IP_NF_CONNTRACK=n
+ default n
+ ---help---
+ Connection tracking keeps a record of what packets have passed
+ through your machine, in order to figure out how they are related
+ into connections.
+
+ Layer 3 independent connection tracking is experimental scheme
+ which generalize ip_conntrack to support other layer 3 protocols.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+config NF_CT_ACCT
+ bool "Connection tracking flow accounting"
+ depends on NF_CONNTRACK
+ help
+ If this option is enabled, the connection tracking code will
+ keep per-flow packet and byte counters.
+
+ Those counters can be used for flow-based accounting or the
+ `connbytes' match.
+
+ If unsure, say `N'.
+
+config NF_CONNTRACK_MARK
+ bool 'Connection mark tracking support'
+ depends on NF_CONNTRACK
+ help
+ This option enables support for connection marks, used by the
+ `CONNMARK' target and `connmark' match. Similar to the mark value
+ of packets, but this mark value is kept in the conntrack session
+ instead of the individual packets.
+
+config NF_CONNTRACK_EVENTS
+ bool "Connection tracking events"
+ depends on NF_CONNTRACK
+ help
+ If this option is enabled, the connection tracking code will
+ provide a notifier chain that can be used by other kernel code
+ to get notified aboutchanges in the connection tracking state.
+
+ If unsure, say `N'.
+
+config NF_CT_PROTO_SCTP
+ tristate 'SCTP protocol on new connection tracking support (EXPERIMENTAL)'
+ depends on EXPERIMENTAL && NF_CONNTRACK
+ default n
+ help
+ With this option enabled, the layer 3 independent connection
+ tracking code will be able to do state tracking on SCTP connections.
+
+ If you want to compile it as a module, say M here and read
+ Documentation/modules.txt. If unsure, say `N'.
+
+config NF_CONNTRACK_FTP
+ tristate "FTP support on new connection tracking (EXPERIMENTAL)"
+ depends on EXPERIMENTAL && NF_CONNTRACK
+ help
+ Tracking FTP connections is problematic: special helpers are
+ required for tracking them, and doing masquerading and other forms
+ of Network Address Translation on them.
+
+ This is FTP support on Layer 3 independent connection tracking.
+ Layer 3 independent connection tracking is experimental scheme
+ which generalize ip_conntrack to support other layer 3 protocols.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+endmenu
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index b3b44f8b415..55f019ad2c0 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -5,3 +5,11 @@ obj-$(CONFIG_NETFILTER) = netfilter.o
obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o
obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o
obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o
+
+nf_conntrack-objs := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o
+
+obj-$(CONFIG_NF_CONNTRACK) += nf_conntrack.o
+obj-$(CONFIG_NF_CONNTRACK_FTP) += nf_conntrack_ftp.o
+
+# SCTP protocol connection tracking
+obj-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
new file mode 100644
index 00000000000..1da678303d7
--- /dev/null
+++ b/net/netfilter/nf_conntrack_core.c
@@ -0,0 +1,1545 @@
+/* Connection state tracking for netfilter. This is separated from,
+ but required by, the NAT layer; it can also be used by an iptables
+ extension. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2005 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
+ * - new API and handling of conntrack/nat helpers
+ * - now capable of multiple expectations for one master
+ * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
+ * - add usage/reference counts to ip_conntrack_expect
+ * - export ip_conntrack[_expect]_{find_get,put} functions
+ * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
+ * - generalize L3 protocol denendent part.
+ * 23 Mar 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
+ * - add support various size of conntrack structures.
+ *
+ * Derived from net/ipv4/netfilter/ip_conntrack_core.c
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/vmalloc.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/jhash.h>
+#include <linux/err.h>
+#include <linux/percpu.h>
+#include <linux/moduleparam.h>
+#include <linux/notifier.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/socket.h>
+
+/* This rwlock protects the main hash table, protocol/helper/expected
+ registrations, conntrack timers*/
+#define ASSERT_READ_LOCK(x)
+#define ASSERT_WRITE_LOCK(x)
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_protocol.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <linux/netfilter_ipv4/listhelp.h>
+
+#define NF_CONNTRACK_VERSION "0.4.1"
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+DEFINE_RWLOCK(nf_conntrack_lock);
+
+/* nf_conntrack_standalone needs this */
+atomic_t nf_conntrack_count = ATOMIC_INIT(0);
+
+void (*nf_conntrack_destroyed)(struct nf_conn *conntrack) = NULL;
+LIST_HEAD(nf_conntrack_expect_list);
+struct nf_conntrack_protocol **nf_ct_protos[PF_MAX];
+struct nf_conntrack_l3proto *nf_ct_l3protos[PF_MAX];
+static LIST_HEAD(helpers);
+unsigned int nf_conntrack_htable_size = 0;
+int nf_conntrack_max;
+struct list_head *nf_conntrack_hash;
+static kmem_cache_t *nf_conntrack_expect_cachep;
+struct nf_conn nf_conntrack_untracked;
+unsigned int nf_ct_log_invalid;
+static LIST_HEAD(unconfirmed);
+static int nf_conntrack_vmalloc;
+
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+struct notifier_block *nf_conntrack_chain;
+struct notifier_block *nf_conntrack_expect_chain;
+
+DEFINE_PER_CPU(struct nf_conntrack_ecache, nf_conntrack_ecache);
+
+/* deliver cached events and clear cache entry - must be called with locally
+ * disabled softirqs */
+static inline void
+__nf_ct_deliver_cached_events(struct nf_conntrack_ecache *ecache)
+{
+ DEBUGP("ecache: delivering events for %p\n", ecache->ct);
+ if (nf_ct_is_confirmed(ecache->ct) && !nf_ct_is_dying(ecache->ct)
+ && ecache->events)
+ notifier_call_chain(&nf_conntrack_chain, ecache->events,
+ ecache->ct);
+
+ ecache->events = 0;
+ nf_ct_put(ecache->ct);
+ ecache->ct = NULL;
+}
+
+/* Deliver all cached events for a particular conntrack. This is called
+ * by code prior to async packet handling for freeing the skb */
+void nf_ct_deliver_cached_events(const struct nf_conn *ct)
+{
+ struct nf_conntrack_ecache *ecache;
+
+ local_bh_disable();
+ ecache = &__get_cpu_var(nf_conntrack_ecache);
+ if (ecache->ct == ct)
+ __nf_ct_deliver_cached_events(ecache);
+ local_bh_enable();
+}
+
+/* Deliver cached events for old pending events, if current conntrack != old */
+void __nf_ct_event_cache_init(struct nf_conn *ct)
+{
+ struct nf_conntrack_ecache *ecache;
+
+ /* take care of delivering potentially old events */
+ ecache = &__get_cpu_var(nf_conntrack_ecache);
+ BUG_ON(ecache->ct == ct);
+ if (ecache->ct)
+ __nf_ct_deliver_cached_events(ecache);
+ /* initialize for this conntrack/packet */
+ ecache->ct = ct;
+ nf_conntrack_get(&ct->ct_general);
+}
+
+/* flush the event cache - touches other CPU's data and must not be called
+ * while packets are still passing through the code */
+static void nf_ct_event_cache_flush(void)
+{
+ struct nf_conntrack_ecache *ecache;
+ int cpu;
+
+ for_each_cpu(cpu) {
+ ecache = &per_cpu(nf_conntrack_ecache, cpu);
+ if (ecache->ct)
+ nf_ct_put(ecache->ct);
+ }
+}
+#else
+static inline void nf_ct_event_cache_flush(void) {}
+#endif /* CONFIG_NF_CONNTRACK_EVENTS */
+
+DEFINE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat);
+EXPORT_PER_CPU_SYMBOL(nf_conntrack_stat);
+
+/*
+ * This scheme offers various size of "struct nf_conn" dependent on
+ * features(helper, nat, ...)
+ */
+
+#define NF_CT_FEATURES_NAMELEN 256
+static struct {
+ /* name of slab cache. printed in /proc/slabinfo */
+ char *name;
+
+ /* size of slab cache */
+ size_t size;
+
+ /* slab cache pointer */
+ kmem_cache_t *cachep;
+
+ /* allocated slab cache + modules which uses this slab cache */
+ int use;
+
+ /* Initialization */
+ int (*init_conntrack)(struct nf_conn *, u_int32_t);
+
+} nf_ct_cache[NF_CT_F_NUM];
+
+/* protect members of nf_ct_cache except of "use" */
+DEFINE_RWLOCK(nf_ct_cache_lock);
+
+/* This avoids calling kmem_cache_create() with same name simultaneously */
+DECLARE_MUTEX(nf_ct_cache_mutex);
+
+extern struct nf_conntrack_protocol nf_conntrack_generic_protocol;
+struct nf_conntrack_protocol *
+nf_ct_find_proto(u_int16_t l3proto, u_int8_t protocol)
+{
+ if (unlikely(nf_ct_protos[l3proto] == NULL))
+ return &nf_conntrack_generic_protocol;
+
+ return nf_ct_protos[l3proto][protocol];
+}
+
+static int nf_conntrack_hash_rnd_initted;
+static unsigned int nf_conntrack_hash_rnd;
+
+static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
+ unsigned int size, unsigned int rnd)
+{
+ unsigned int a, b;
+ a = jhash((void *)tuple->src.u3.all, sizeof(tuple->src.u3.all),
+ ((tuple->src.l3num) << 16) | tuple->dst.protonum);
+ b = jhash((void *)tuple->dst.u3.all, sizeof(tuple->dst.u3.all),
+ (tuple->src.u.all << 16) | tuple->dst.u.all);
+
+ return jhash_2words(a, b, rnd) % size;
+}
+
+static inline u_int32_t hash_conntrack(const struct nf_conntrack_tuple *tuple)
+{
+ return __hash_conntrack(tuple, nf_conntrack_htable_size,
+ nf_conntrack_hash_rnd);
+}
+
+/* Initialize "struct nf_conn" which has spaces for helper */
+static int
+init_conntrack_for_helper(struct nf_conn *conntrack, u_int32_t features)
+{
+
+ conntrack->help = (union nf_conntrack_help *)
+ (((unsigned long)conntrack->data
+ + (__alignof__(union nf_conntrack_help) - 1))
+ & (~((unsigned long)(__alignof__(union nf_conntrack_help) -1))));
+ return 0;
+}
+
+int nf_conntrack_register_cache(u_int32_t features, const char *name,
+ size_t size,
+ int (*init)(struct nf_conn *, u_int32_t))
+{
+ int ret = 0;
+ char *cache_name;
+ kmem_cache_t *cachep;
+
+ DEBUGP("nf_conntrack_register_cache: features=0x%x, name=%s, size=%d\n",
+ features, name, size);
+
+ if (features < NF_CT_F_BASIC || features >= NF_CT_F_NUM) {
+ DEBUGP("nf_conntrack_register_cache: invalid features.: 0x%x\n",
+ features);
+ return -EINVAL;
+ }
+
+ down(&nf_ct_cache_mutex);
+
+ write_lock_bh(&nf_ct_cache_lock);
+ /* e.g: multiple helpers are loaded */
+ if (nf_ct_cache[features].use > 0) {
+ DEBUGP("nf_conntrack_register_cache: already resisterd.\n");
+ if ((!strncmp(nf_ct_cache[features].name, name,
+ NF_CT_FEATURES_NAMELEN))
+ && nf_ct_cache[features].size == size
+ && nf_ct_cache[features].init_conntrack == init) {
+ DEBUGP("nf_conntrack_register_cache: reusing.\n");
+ nf_ct_cache[features].use++;
+ ret = 0;
+ } else
+ ret = -EBUSY;
+
+ write_unlock_bh(&nf_ct_cache_lock);
+ up(&nf_ct_cache_mutex);
+ return ret;
+ }
+ write_unlock_bh(&nf_ct_cache_lock);
+
+ /*
+ * The memory space for name of slab cache must be alive until
+ * cache is destroyed.
+ */
+ cache_name = kmalloc(sizeof(char)*NF_CT_FEATURES_NAMELEN, GFP_ATOMIC);
+ if (cache_name == NULL) {
+ DEBUGP("nf_conntrack_register_cache: can't alloc cache_name\n");
+ ret = -ENOMEM;
+ goto out_up_mutex;
+ }
+
+ if (strlcpy(cache_name, name, NF_CT_FEATURES_NAMELEN)
+ >= NF_CT_FEATURES_NAMELEN) {
+ printk("nf_conntrack_register_cache: name too long\n");
+ ret = -EINVAL;
+ goto out_free_name;
+ }
+
+ cachep = kmem_cache_create(cache_name, size, 0, 0,
+ NULL, NULL);
+ if (!cachep) {
+ printk("nf_conntrack_register_cache: Can't create slab cache "
+ "for the features = 0x%x\n", features);
+ ret = -ENOMEM;
+ goto out_free_name;
+ }
+
+ write_lock_bh(&nf_ct_cache_lock);
+ nf_ct_cache[features].use = 1;
+ nf_ct_cache[features].size = size;
+ nf_ct_cache[features].init_conntrack = init;
+ nf_ct_cache[features].cachep = cachep;
+ nf_ct_cache[features].name = cache_name;
+ write_unlock_bh(&nf_ct_cache_lock);
+
+ goto out_up_mutex;
+
+out_free_name:
+ kfree(cache_name);
+out_up_mutex:
+ up(&nf_ct_cache_mutex);
+ return ret;
+}
+
+/* FIXME: In the current, only nf_conntrack_cleanup() can call this function. */
+void nf_conntrack_unregister_cache(u_int32_t features)
+{
+ kmem_cache_t *cachep;
+ char *name;
+
+ /*
+ * This assures that kmem_cache_create() isn't called before destroying
+ * slab cache.
+ */
+ DEBUGP("nf_conntrack_unregister_cache: 0x%04x\n", features);
+ down(&nf_ct_cache_mutex);
+
+ write_lock_bh(&nf_ct_cache_lock);
+ if (--nf_ct_cache[features].use > 0) {
+ write_unlock_bh(&nf_ct_cache_lock);
+ up(&nf_ct_cache_mutex);
+ return;
+ }
+ cachep = nf_ct_cache[features].cachep;
+ name = nf_ct_cache[features].name;
+ nf_ct_cache[features].cachep = NULL;
+ nf_ct_cache[features].name = NULL;
+ nf_ct_cache[features].init_conntrack = NULL;
+ nf_ct_cache[features].size = 0;
+ write_unlock_bh(&nf_ct_cache_lock);
+
+ synchronize_net();
+
+ kmem_cache_destroy(cachep);
+ kfree(name);
+
+ up(&nf_ct_cache_mutex);
+}
+
+int
+nf_ct_get_tuple(const struct sk_buff *skb,
+ unsigned int nhoff,
+ unsigned int dataoff,
+ u_int16_t l3num,
+ u_int8_t protonum,
+ struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_l3proto *l3proto,
+ const struct nf_conntrack_protocol *protocol)
+{
+ NF_CT_TUPLE_U_BLANK(tuple);
+
+ tuple->src.l3num = l3num;
+ if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
+ return 0;
+
+ tuple->dst.protonum = protonum;
+ tuple->dst.dir = IP_CT_DIR_ORIGINAL;
+
+ return protocol->pkt_to_tuple(skb, dataoff, tuple);
+}
+
+int
+nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
+ const struct nf_conntrack_tuple *orig,
+ const struct nf_conntrack_l3proto *l3proto,
+ const struct nf_conntrack_protocol *protocol)
+{
+ NF_CT_TUPLE_U_BLANK(inverse);
+
+ inverse->src.l3num = orig->src.l3num;
+ if (l3proto->invert_tuple(inverse, orig) == 0)
+ return 0;
+
+ inverse->dst.dir = !orig->dst.dir;
+
+ inverse->dst.protonum = orig->dst.protonum;
+ return protocol->invert_tuple(inverse, orig);
+}
+
+/* nf_conntrack_expect helper functions */
+static void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
+{
+ ASSERT_WRITE_LOCK(&nf_conntrack_lock);
+ NF_CT_ASSERT(!timer_pending(&exp->timeout));
+ list_del(&exp->list);
+ NF_CT_STAT_INC(expect_delete);
+ exp->master->expecting--;
+ nf_conntrack_expect_put(exp);
+}
+
+static void expectation_timed_out(unsigned long ul_expect)
+{
+ struct nf_conntrack_expect *exp = (void *)ul_expect;
+
+ write_lock_bh(&nf_conntrack_lock);
+ nf_ct_unlink_expect(exp);
+ write_unlock_bh(&nf_conntrack_lock);
+ nf_conntrack_expect_put(exp);
+}
+
+/* If an expectation for this connection is found, it gets delete from
+ * global list then returned. */
+static struct nf_conntrack_expect *
+find_expectation(const struct nf_conntrack_tuple *tuple)
+{
+ struct nf_conntrack_expect *i;
+
+ list_for_each_entry(i, &nf_conntrack_expect_list, list) {
+ /* If master is not in hash table yet (ie. packet hasn't left
+ this machine yet), how can other end know about expected?
+ Hence these are not the droids you are looking for (if
+ master ct never got confirmed, we'd hold a reference to it
+ and weird things would happen to future packets). */
+ if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
+ && nf_ct_is_confirmed(i->master)) {
+ if (i->flags & NF_CT_EXPECT_PERMANENT) {
+ atomic_inc(&i->use);
+ return i;
+ } else if (del_timer(&i->timeout)) {
+ nf_ct_unlink_expect(i);
+ return i;
+ }
+ }
+ }
+ return NULL;
+}
+
+/* delete all expectations for this conntrack */
+static void remove_expectations(struct nf_conn *ct)
+{
+ struct nf_conntrack_expect *i, *tmp;
+
+ /* Optimization: most connection never expect any others. */
+ if (ct->expecting == 0)
+ return;
+
+ list_for_each_entry_safe(i, tmp, &nf_conntrack_expect_list, list) {
+ if (i->master == ct && del_timer(&i->timeout)) {
+ nf_ct_unlink_expect(i);
+ nf_conntrack_expect_put(i);
+ }
+ }
+}
+
+static void
+clean_from_lists(struct nf_conn *ct)
+{
+ unsigned int ho, hr;
+
+ DEBUGP("clean_from_lists(%p)\n", ct);
+ ASSERT_WRITE_LOCK(&nf_conntrack_lock);
+
+ ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+ LIST_DELETE(&nf_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
+ LIST_DELETE(&nf_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
+
+ /* Destroy all pending expectations */
+ remove_expectations(ct);
+}
+
+static void
+destroy_conntrack(struct nf_conntrack *nfct)
+{
+ struct nf_conn *ct = (struct nf_conn *)nfct;
+ struct nf_conntrack_l3proto *l3proto;
+ struct nf_conntrack_protocol *proto;
+
+ DEBUGP("destroy_conntrack(%p)\n", ct);
+ NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
+ NF_CT_ASSERT(!timer_pending(&ct->timeout));
+
+ nf_conntrack_event(IPCT_DESTROY, ct);
+ set_bit(IPS_DYING_BIT, &ct->status);
+
+ /* To make sure we don't get any weird locking issues here:
+ * destroy_conntrack() MUST NOT be called with a write lock
+ * to nf_conntrack_lock!!! -HW */
+ l3proto = nf_ct_find_l3proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num);
+ if (l3proto && l3proto->destroy)
+ l3proto->destroy(ct);
+
+ proto = nf_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num,
+ ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
+ if (proto && proto->destroy)
+ proto->destroy(ct);
+
+ if (nf_conntrack_destroyed)
+ nf_conntrack_destroyed(ct);
+
+ write_lock_bh(&nf_conntrack_lock);
+ /* Expectations will have been removed in clean_from_lists,
+ * except TFTP can create an expectation on the first packet,
+ * before connection is in the list, so we need to clean here,
+ * too. */
+ remove_expectations(ct);
+
+ /* We overload first tuple to link into unconfirmed list. */
+ if (!nf_ct_is_confirmed(ct)) {
+ BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
+ list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
+ }
+
+ NF_CT_STAT_INC(delete);
+ write_unlock_bh(&nf_conntrack_lock);
+
+ if (ct->master)
+ nf_ct_put(ct->master);
+
+ DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
+ nf_conntrack_free(ct);
+}
+
+static void death_by_timeout(unsigned long ul_conntrack)
+{
+ struct nf_conn *ct = (void *)ul_conntrack;
+
+ write_lock_bh(&nf_conntrack_lock);
+ /* Inside lock so preempt is disabled on module removal path.
+ * Otherwise we can get spurious warnings. */
+ NF_CT_STAT_INC(delete_list);
+ clean_from_lists(ct);
+ write_unlock_bh(&nf_conntrack_lock);
+ nf_ct_put(ct);
+}
+
+static inline int
+conntrack_tuple_cmp(const struct nf_conntrack_tuple_hash *i,
+ const struct nf_conntrack_tuple *tuple,
+ const struct nf_conn *ignored_conntrack)
+{
+ ASSERT_READ_LOCK(&nf_conntrack_lock);
+ return nf_ct_tuplehash_to_ctrack(i) != ignored_conntrack
+ && nf_ct_tuple_equal(tuple, &i->tuple);
+}
+
+static struct nf_conntrack_tuple_hash *
+__nf_conntrack_find(const struct nf_conntrack_tuple *tuple,
+ const struct nf_conn *ignored_conntrack)
+{
+ struct nf_conntrack_tuple_hash *h;
+ unsigned int hash = hash_conntrack(tuple);
+
+ ASSERT_READ_LOCK(&nf_conntrack_lock);
+ list_for_each_entry(h, &nf_conntrack_hash[hash], list) {
+ if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
+ NF_CT_STAT_INC(found);
+ return h;
+ }
+ NF_CT_STAT_INC(searched);
+ }
+
+ return NULL;
+}
+
+/* Find a connection corresponding to a tuple. */
+struct nf_conntrack_tuple_hash *
+nf_conntrack_find_get(const struct nf_conntrack_tuple *tuple,
+ const struct nf_conn *ignored_conntrack)
+{
+ struct nf_conntrack_tuple_hash *h;
+
+ read_lock_bh(&nf_conntrack_lock);
+ h = __nf_conntrack_find(tuple, ignored_conntrack);
+ if (h)
+ atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
+ read_unlock_bh(&nf_conntrack_lock);
+
+ return h;
+}
+
+/* Confirm a connection given skb; places it in hash table */
+int
+__nf_conntrack_confirm(struct sk_buff **pskb)
+{
+ unsigned int hash, repl_hash;
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+
+ ct = nf_ct_get(*pskb, &ctinfo);
+
+ /* ipt_REJECT uses nf_conntrack_attach to attach related
+ ICMP/TCP RST packets in other direction. Actual packet
+ which created connection will be IP_CT_NEW or for an
+ expected connection, IP_CT_RELATED. */
+ if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+ return NF_ACCEPT;
+
+ hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+ /* We're not in hash table, and we refuse to set up related
+ connections for unconfirmed conns. But packet copies and
+ REJECT will give spurious warnings here. */
+ /* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
+
+ /* No external references means noone else could have
+ confirmed us. */
+ NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
+ DEBUGP("Confirming conntrack %p\n", ct);
+
+ write_lock_bh(&nf_conntrack_lock);
+
+ /* See if there's one in the list already, including reverse:
+ NAT could have grabbed it without realizing, since we're
+ not in the hash. If there is, we lost race. */
+ if (!LIST_FIND(&nf_conntrack_hash[hash],
+ conntrack_tuple_cmp,
+ struct nf_conntrack_tuple_hash *,
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
+ && !LIST_FIND(&nf_conntrack_hash[repl_hash],
+ conntrack_tuple_cmp,
+ struct nf_conntrack_tuple_hash *,
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
+ /* Remove from unconfirmed list */
+ list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
+
+ list_prepend(&nf_conntrack_hash[hash],
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
+ list_prepend(&nf_conntrack_hash[repl_hash],
+ &ct->tuplehash[IP_CT_DIR_REPLY]);
+ /* Timer relative to confirmation time, not original
+ setting time, otherwise we'd get timer wrap in
+ weird delay cases. */
+ ct->timeout.expires += jiffies;
+ add_timer(&ct->timeout);
+ atomic_inc(&ct->ct_general.use);
+ set_bit(IPS_CONFIRMED_BIT, &ct->status);
+ NF_CT_STAT_INC(insert);
+ write_unlock_bh(&nf_conntrack_lock);
+ if (ct->helper)
+ nf_conntrack_event_cache(IPCT_HELPER, *pskb);
+#ifdef CONFIG_NF_NAT_NEEDED
+ if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
+ test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
+ nf_conntrack_event_cache(IPCT_NATINFO, *pskb);
+#endif
+ nf_conntrack_event_cache(master_ct(ct) ?
+ IPCT_RELATED : IPCT_NEW, *pskb);
+ return NF_ACCEPT;
+ }
+
+ NF_CT_STAT_INC(insert_failed);
+ write_unlock_bh(&nf_conntrack_lock);
+ return NF_DROP;
+}
+
+/* Returns true if a connection correspondings to the tuple (required
+ for NAT). */
+int
+nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
+ const struct nf_conn *ignored_conntrack)
+{
+ struct nf_conntrack_tuple_hash *h;
+
+ read_lock_bh(&nf_conntrack_lock);
+ h = __nf_conntrack_find(tuple, ignored_conntrack);
+ read_unlock_bh(&nf_conntrack_lock);
+
+ return h != NULL;
+}
+
+/* There's a small race here where we may free a just-assured
+ connection. Too bad: we're in trouble anyway. */
+static inline int unreplied(const struct nf_conntrack_tuple_hash *i)
+{
+ return !(test_bit(IPS_ASSURED_BIT,
+ &nf_ct_tuplehash_to_ctrack(i)->status));
+}
+
+static int early_drop(struct list_head *chain)
+{
+ /* Traverse backwards: gives us oldest, which is roughly LRU */
+ struct nf_conntrack_tuple_hash *h;
+ struct nf_conn *ct = NULL;
+ int dropped = 0;
+
+ read_lock_bh(&nf_conntrack_lock);
+ h = LIST_FIND_B(chain, unreplied, struct nf_conntrack_tuple_hash *);
+ if (h) {
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ atomic_inc(&ct->ct_general.use);
+ }
+ read_unlock_bh(&nf_conntrack_lock);
+
+ if (!ct)
+ return dropped;
+
+ if (del_timer(&ct->timeout)) {
+ death_by_timeout((unsigned long)ct);
+ dropped = 1;
+ NF_CT_STAT_INC(early_drop);
+ }
+ nf_ct_put(ct);
+ return dropped;
+}
+
+static inline int helper_cmp(const struct nf_conntrack_helper *i,
+ const struct nf_conntrack_tuple *rtuple)
+{
+ return nf_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
+}
+
+static struct nf_conntrack_helper *
+nf_ct_find_helper(const struct nf_conntrack_tuple *tuple)
+{
+ return LIST_FIND(&helpers, helper_cmp,
+ struct nf_conntrack_helper *,
+ tuple);
+}
+
+static struct nf_conn *
+__nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
+ const struct nf_conntrack_tuple *repl,
+ const struct nf_conntrack_l3proto *l3proto)
+{
+ struct nf_conn *conntrack = NULL;
+ u_int32_t features = 0;
+
+ if (!nf_conntrack_hash_rnd_initted) {
+ get_random_bytes(&nf_conntrack_hash_rnd, 4);
+ nf_conntrack_hash_rnd_initted = 1;
+ }
+
+ if (nf_conntrack_max
+ && atomic_read(&nf_conntrack_count) >= nf_conntrack_max) {
+ unsigned int hash = hash_conntrack(orig);
+ /* Try dropping from this hash chain. */
+ if (!early_drop(&nf_conntrack_hash[hash])) {
+ if (net_ratelimit())
+ printk(KERN_WARNING
+ "nf_conntrack: table full, dropping"
+ " packet.\n");
+ return ERR_PTR(-ENOMEM);
+ }
+ }
+
+ /* find features needed by this conntrack. */
+ features = l3proto->get_features(orig);
+ read_lock_bh(&nf_conntrack_lock);
+ if (nf_ct_find_helper(repl) != NULL)
+ features |= NF_CT_F_HELP;
+ read_unlock_bh(&nf_conntrack_lock);
+
+ DEBUGP("nf_conntrack_alloc: features=0x%x\n", features);
+
+ read_lock_bh(&nf_ct_cache_lock);
+
+ if (!nf_ct_cache[features].use) {
+ DEBUGP("nf_conntrack_alloc: not supported features = 0x%x\n",
+ features);
+ goto out;
+ }
+
+ conntrack = kmem_cache_alloc(nf_ct_cache[features].cachep, GFP_ATOMIC);
+ if (conntrack == NULL) {
+ DEBUGP("nf_conntrack_alloc: Can't alloc conntrack from cache\n");
+ goto out;
+ }
+
+ memset(conntrack, 0, nf_ct_cache[features].size);
+ conntrack->features = features;
+ if (nf_ct_cache[features].init_conntrack &&
+ nf_ct_cache[features].init_conntrack(conntrack, features) < 0) {
+ DEBUGP("nf_conntrack_alloc: failed to init\n");
+ kmem_cache_free(nf_ct_cache[features].cachep, conntrack);
+ conntrack = NULL;
+ goto out;
+ }
+
+ atomic_set(&conntrack->ct_general.use, 1);
+ conntrack->ct_general.destroy = destroy_conntrack;
+ conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
+ conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
+ /* Don't set timer yet: wait for confirmation */
+ init_timer(&conntrack->timeout);
+ conntrack->timeout.data = (unsigned long)conntrack;
+ conntrack->timeout.function = death_by_timeout;
+
+ atomic_inc(&nf_conntrack_count);
+out:
+ read_unlock_bh(&nf_ct_cache_lock);
+ return conntrack;
+}
+
+struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
+ const struct nf_conntrack_tuple *repl)
+{
+ struct nf_conntrack_l3proto *l3proto;
+
+ l3proto = nf_ct_find_l3proto(orig->src.l3num);
+ return __nf_conntrack_alloc(orig, repl, l3proto);
+}
+
+void nf_conntrack_free(struct nf_conn *conntrack)
+{
+ u_int32_t features = conntrack->features;
+ NF_CT_ASSERT(features >= NF_CT_F_BASIC && features < NF_CT_F_NUM);
+ DEBUGP("nf_conntrack_free: features = 0x%x, conntrack=%p\n", features,
+ conntrack);
+ kmem_cache_free(nf_ct_cache[features].cachep, conntrack);
+ atomic_dec(&nf_conntrack_count);
+}
+
+/* Allocate a new conntrack: we return -ENOMEM if classification
+ failed due to stress. Otherwise it really is unclassifiable. */
+static struct nf_conntrack_tuple_hash *
+init_conntrack(const struct nf_conntrack_tuple *tuple,
+ struct nf_conntrack_l3proto *l3proto,
+ struct nf_conntrack_protocol *protocol,
+ struct sk_buff *skb,
+ unsigned int dataoff)
+{
+ struct nf_conn *conntrack;
+ struct nf_conntrack_tuple repl_tuple;
+ struct nf_conntrack_expect *exp;
+
+ if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, protocol)) {
+ DEBUGP("Can't invert tuple.\n");
+ return NULL;
+ }
+
+ conntrack = __nf_conntrack_alloc(tuple, &repl_tuple, l3proto);
+ if (conntrack == NULL || IS_ERR(conntrack)) {
+ DEBUGP("Can't allocate conntrack.\n");
+ return (struct nf_conntrack_tuple_hash *)conntrack;
+ }
+
+ if (!protocol->new(conntrack, skb, dataoff)) {
+ nf_conntrack_free(conntrack);
+ DEBUGP("init conntrack: can't track with proto module\n");
+ return NULL;
+ }
+
+ write_lock_bh(&nf_conntrack_lock);
+ exp = find_expectation(tuple);
+
+ if (exp) {
+ DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
+ conntrack, exp);
+ /* Welcome, Mr. Bond. We've been expecting you... */
+ __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
+ conntrack->master = exp->master;
+#ifdef CONFIG_NF_CONNTRACK_MARK
+ conntrack->mark = exp->master->mark;
+#endif
+ nf_conntrack_get(&conntrack->master->ct_general);
+ NF_CT_STAT_INC(expect_new);
+ } else {
+ conntrack->helper = nf_ct_find_helper(&repl_tuple);
+
+ NF_CT_STAT_INC(new);
+ }
+
+ /* Overload tuple linked list to put us in unconfirmed list. */
+ list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
+
+ write_unlock_bh(&nf_conntrack_lock);
+
+ if (exp) {
+ if (exp->expectfn)
+ exp->expectfn(conntrack, exp);
+ nf_conntrack_expect_put(exp);
+ }
+
+ return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
+}
+
+/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
+static inline struct nf_conn *
+resolve_normal_ct(struct sk_buff *skb,
+ unsigned int dataoff,
+ u_int16_t l3num,
+ u_int8_t protonum,
+ struct nf_conntrack_l3proto *l3proto,
+ struct nf_conntrack_protocol *proto,
+ int *set_reply,
+ enum ip_conntrack_info *ctinfo)
+{
+ struct nf_conntrack_tuple tuple;
+ struct nf_conntrack_tuple_hash *h;
+ struct nf_conn *ct;
+
+ if (!nf_ct_get_tuple(skb, (unsigned int)(skb->nh.raw - skb->data),
+ dataoff, l3num, protonum, &tuple, l3proto,
+ proto)) {
+ DEBUGP("resolve_normal_ct: Can't get tuple\n");
+ return NULL;
+ }
+
+ /* look for tuple match */
+ h = nf_conntrack_find_get(&tuple, NULL);
+ if (!h) {
+ h = init_conntrack(&tuple, l3proto, proto, skb, dataoff);
+ if (!h)
+ return NULL;
+ if (IS_ERR(h))
+ return (void *)h;
+ }
+ ct = nf_ct_tuplehash_to_ctrack(h);
+
+ /* It exists; we have (non-exclusive) reference. */
+ if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
+ *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
+ /* Please set reply bit if this packet OK */
+ *set_reply = 1;
+ } else {
+ /* Once we've had two way comms, always ESTABLISHED. */
+ if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
+ DEBUGP("nf_conntrack_in: normal packet for %p\n", ct);
+ *ctinfo = IP_CT_ESTABLISHED;
+ } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
+ DEBUGP("nf_conntrack_in: related packet for %p\n", ct);
+ *ctinfo = IP_CT_RELATED;
+ } else {
+ DEBUGP("nf_conntrack_in: new packet for %p\n", ct);
+ *ctinfo = IP_CT_NEW;
+ }
+ *set_reply = 0;
+ }
+ skb->nfct = &ct->ct_general;
+ skb->nfctinfo = *ctinfo;
+ return ct;
+}
+
+unsigned int
+nf_conntrack_in(int pf, unsigned int hooknum, struct sk_buff **pskb)
+{
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conntrack_l3proto *l3proto;
+ struct nf_conntrack_protocol *proto;
+ unsigned int dataoff;
+ u_int8_t protonum;
+ int set_reply = 0;
+ int ret;
+
+ /* Previously seen (loopback or untracked)? Ignore. */
+ if ((*pskb)->nfct) {
+ NF_CT_STAT_INC(ignore);
+ return NF_ACCEPT;
+ }
+
+ l3proto = nf_ct_find_l3proto((u_int16_t)pf);
+ if ((ret = l3proto->prepare(pskb, hooknum, &dataoff, &protonum)) <= 0) {
+ DEBUGP("not prepared to track yet or error occured\n");
+ return -ret;
+ }
+
+ proto = nf_ct_find_proto((u_int16_t)pf, protonum);
+
+ /* It may be an special packet, error, unclean...
+ * inverse of the return code tells to the netfilter
+ * core what to do with the packet. */
+ if (proto->error != NULL &&
+ (ret = proto->error(*pskb, dataoff, &ctinfo, pf, hooknum)) <= 0) {
+ NF_CT_STAT_INC(error);
+ NF_CT_STAT_INC(invalid);
+ return -ret;
+ }
+
+ ct = resolve_normal_ct(*pskb, dataoff, pf, protonum, l3proto, proto,
+ &set_reply, &ctinfo);
+ if (!ct) {
+ /* Not valid part of a connection */
+ NF_CT_STAT_INC(invalid);
+ return NF_ACCEPT;
+ }
+
+ if (IS_ERR(ct)) {
+ /* Too stressed to deal. */
+ NF_CT_STAT_INC(drop);
+ return NF_DROP;
+ }
+
+ NF_CT_ASSERT((*pskb)->nfct);
+
+ ret = proto->packet(ct, *pskb, dataoff, ctinfo, pf, hooknum);
+ if (ret < 0) {
+ /* Invalid: inverse of the return code tells
+ * the netfilter core what to do */
+ DEBUGP("nf_conntrack_in: Can't track with proto module\n");
+ nf_conntrack_put((*pskb)->nfct);
+ (*pskb)->nfct = NULL;
+ NF_CT_STAT_INC(invalid);
+ return -ret;
+ }
+
+ if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
+ nf_conntrack_event_cache(IPCT_STATUS, *pskb);
+
+ return ret;
+}
+
+int nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
+ const struct nf_conntrack_tuple *orig)
+{
+ return nf_ct_invert_tuple(inverse, orig,
+ nf_ct_find_l3proto(orig->src.l3num),
+ nf_ct_find_proto(orig->src.l3num,
+ orig->dst.protonum));
+}
+
+/* Would two expected things clash? */
+static inline int expect_clash(const struct nf_conntrack_expect *a,
+ const struct nf_conntrack_expect *b)
+{
+ /* Part covered by intersection of masks must be unequal,
+ otherwise they clash */
+ struct nf_conntrack_tuple intersect_mask;
+ int count;
+
+ intersect_mask.src.l3num = a->mask.src.l3num & b->mask.src.l3num;
+ intersect_mask.src.u.all = a->mask.src.u.all & b->mask.src.u.all;
+ intersect_mask.dst.u.all = a->mask.dst.u.all & b->mask.dst.u.all;
+ intersect_mask.dst.protonum = a->mask.dst.protonum
+ & b->mask.dst.protonum;
+
+ for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){
+ intersect_mask.src.u3.all[count] =
+ a->mask.src.u3.all[count] & b->mask.src.u3.all[count];
+ }
+
+ for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){
+ intersect_mask.dst.u3.all[count] =
+ a->mask.dst.u3.all[count] & b->mask.dst.u3.all[count];
+ }
+
+ return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
+}
+
+static inline int expect_matches(const struct nf_conntrack_expect *a,
+ const struct nf_conntrack_expect *b)
+{
+ return a->master == b->master
+ && nf_ct_tuple_equal(&a->tuple, &b->tuple)
+ && nf_ct_tuple_equal(&a->mask, &b->mask);
+}
+
+/* Generally a bad idea to call this: could have matched already. */
+void nf_conntrack_unexpect_related(struct nf_conntrack_expect *exp)
+{
+ struct nf_conntrack_expect *i;
+
+ write_lock_bh(&nf_conntrack_lock);
+ /* choose the the oldest expectation to evict */
+ list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) {
+ if (expect_matches(i, exp) && del_timer(&i->timeout)) {
+ nf_ct_unlink_expect(i);
+ write_unlock_bh(&nf_conntrack_lock);
+ nf_conntrack_expect_put(i);
+ return;
+ }
+ }
+ write_unlock_bh(&nf_conntrack_lock);
+}
+
+/* We don't increase the master conntrack refcount for non-fulfilled
+ * conntracks. During the conntrack destruction, the expectations are
+ * always killed before the conntrack itself */
+struct nf_conntrack_expect *nf_conntrack_expect_alloc(struct nf_conn *me)
+{
+ struct nf_conntrack_expect *new;
+
+ new = kmem_cache_alloc(nf_conntrack_expect_cachep, GFP_ATOMIC);
+ if (!new) {
+ DEBUGP("expect_related: OOM allocating expect\n");
+ return NULL;
+ }
+ new->master = me;
+ atomic_set(&new->use, 1);
+ return new;
+}
+
+void nf_conntrack_expect_put(struct nf_conntrack_expect *exp)
+{
+ if (atomic_dec_and_test(&exp->use))
+ kmem_cache_free(nf_conntrack_expect_cachep, exp);
+}
+
+static void nf_conntrack_expect_insert(struct nf_conntrack_expect *exp)
+{
+ atomic_inc(&exp->use);
+ exp->master->expecting++;
+ list_add(&exp->list, &nf_conntrack_expect_list);
+
+ init_timer(&exp->timeout);
+ exp->timeout.data = (unsigned long)exp;
+ exp->timeout.function = expectation_timed_out;
+ exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
+ add_timer(&exp->timeout);
+
+ atomic_inc(&exp->use);
+ NF_CT_STAT_INC(expect_create);
+}
+
+/* Race with expectations being used means we could have none to find; OK. */
+static void evict_oldest_expect(struct nf_conn *master)
+{
+ struct nf_conntrack_expect *i;
+
+ list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) {
+ if (i->master == master) {
+ if (del_timer(&i->timeout)) {
+ nf_ct_unlink_expect(i);
+ nf_conntrack_expect_put(i);
+ }
+ break;
+ }
+ }
+}
+
+static inline int refresh_timer(struct nf_conntrack_expect *i)
+{
+ if (!del_timer(&i->timeout))
+ return 0;
+
+ i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
+ add_timer(&i->timeout);
+ return 1;
+}
+
+int nf_conntrack_expect_related(struct nf_conntrack_expect *expect)
+{
+ struct nf_conntrack_expect *i;
+ int ret;
+
+ DEBUGP("nf_conntrack_expect_related %p\n", related_to);
+ DEBUGP("tuple: "); NF_CT_DUMP_TUPLE(&expect->tuple);
+ DEBUGP("mask: "); NF_CT_DUMP_TUPLE(&expect->mask);
+
+ write_lock_bh(&nf_conntrack_lock);
+ list_for_each_entry(i, &nf_conntrack_expect_list, list) {
+ if (expect_matches(i, expect)) {
+ /* Refresh timer: if it's dying, ignore.. */
+ if (refresh_timer(i)) {
+ ret = 0;
+ goto out;
+ }
+ } else if (expect_clash(i, expect)) {
+ ret = -EBUSY;
+ goto out;
+ }
+ }
+ /* Will be over limit? */
+ if (expect->master->helper->max_expected &&
+ expect->master->expecting >= expect->master->helper->max_expected)
+ evict_oldest_expect(expect->master);
+
+ nf_conntrack_expect_insert(expect);
+ nf_conntrack_expect_event(IPEXP_NEW, expect);
+ ret = 0;
+out:
+ write_unlock_bh(&nf_conntrack_lock);
+ return ret;
+}
+
+/* Alter reply tuple (maybe alter helper). This is for NAT, and is
+ implicitly racy: see __nf_conntrack_confirm */
+void nf_conntrack_alter_reply(struct nf_conn *conntrack,
+ const struct nf_conntrack_tuple *newreply)
+{
+ write_lock_bh(&nf_conntrack_lock);
+ /* Should be unconfirmed, so not in hash table yet */
+ NF_CT_ASSERT(!nf_ct_is_confirmed(conntrack));
+
+ DEBUGP("Altering reply tuple of %p to ", conntrack);
+ NF_CT_DUMP_TUPLE(newreply);
+
+ conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
+ if (!conntrack->master && conntrack->expecting == 0)
+ conntrack->helper = nf_ct_find_helper(newreply);
+ write_unlock_bh(&nf_conntrack_lock);
+}
+
+int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
+{
+ int ret;
+ BUG_ON(me->timeout == 0);
+
+ ret = nf_conntrack_register_cache(NF_CT_F_HELP, "nf_conntrack:help",
+ sizeof(struct nf_conn)
+ + sizeof(union nf_conntrack_help)
+ + __alignof__(union nf_conntrack_help),
+ init_conntrack_for_helper);
+ if (ret < 0) {
+ printk(KERN_ERR "nf_conntrack_helper_reigster: Unable to create slab cache for conntracks\n");
+ return ret;
+ }
+ write_lock_bh(&nf_conntrack_lock);
+ list_prepend(&helpers, me);
+ write_unlock_bh(&nf_conntrack_lock);
+
+ return 0;
+}
+
+static inline int unhelp(struct nf_conntrack_tuple_hash *i,
+ const struct nf_conntrack_helper *me)
+{
+ if (nf_ct_tuplehash_to_ctrack(i)->helper == me) {
+ nf_conntrack_event(IPCT_HELPER, nf_ct_tuplehash_to_ctrack(i));
+ nf_ct_tuplehash_to_ctrack(i)->helper = NULL;
+ }
+ return 0;
+}
+
+void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
+{
+ unsigned int i;
+ struct nf_conntrack_expect *exp, *tmp;
+
+ /* Need write lock here, to delete helper. */
+ write_lock_bh(&nf_conntrack_lock);
+ LIST_DELETE(&helpers, me);
+
+ /* Get rid of expectations */
+ list_for_each_entry_safe(exp, tmp, &nf_conntrack_expect_list, list) {
+ if (exp->master->helper == me && del_timer(&exp->timeout)) {
+ nf_ct_unlink_expect(exp);
+ nf_conntrack_expect_put(exp);
+ }
+ }
+
+ /* Get rid of expecteds, set helpers to NULL. */
+ LIST_FIND_W(&unconfirmed, unhelp, struct nf_conntrack_tuple_hash*, me);
+ for (i = 0; i < nf_conntrack_htable_size; i++)
+ LIST_FIND_W(&nf_conntrack_hash[i], unhelp,
+ struct nf_conntrack_tuple_hash *, me);
+ write_unlock_bh(&nf_conntrack_lock);
+
+ /* Someone could be still looking at the helper in a bh. */
+ synchronize_net();
+}
+
+/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
+void __nf_ct_refresh_acct(struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ const struct sk_buff *skb,
+ unsigned long extra_jiffies,
+ int do_acct)
+{
+ int event = 0;
+
+ NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
+ NF_CT_ASSERT(skb);
+
+ write_lock_bh(&nf_conntrack_lock);
+
+ /* If not in hash table, timer will not be active yet */
+ if (!nf_ct_is_confirmed(ct)) {
+ ct->timeout.expires = extra_jiffies;
+ event = IPCT_REFRESH;
+ } else {
+ /* Need del_timer for race avoidance (may already be dying). */
+ if (del_timer(&ct->timeout)) {
+ ct->timeout.expires = jiffies + extra_jiffies;
+ add_timer(&ct->timeout);
+ event = IPCT_REFRESH;
+ }
+ }
+
+#ifdef CONFIG_NF_CT_ACCT
+ if (do_acct) {
+ ct->counters[CTINFO2DIR(ctinfo)].packets++;
+ ct->counters[CTINFO2DIR(ctinfo)].bytes +=
+ skb->len - (unsigned int)(skb->nh.raw - skb->data);
+ if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
+ || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
+ event |= IPCT_COUNTER_FILLING;
+ }
+#endif
+
+ write_unlock_bh(&nf_conntrack_lock);
+
+ /* must be unlocked when calling event cache */
+ if (event)
+ nf_conntrack_event_cache(event, skb);
+}
+
+/* Used by ipt_REJECT and ip6t_REJECT. */
+void __nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
+{
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+
+ /* This ICMP is in reverse direction to the packet which caused it */
+ ct = nf_ct_get(skb, &ctinfo);
+ if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
+ ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
+ else
+ ctinfo = IP_CT_RELATED;
+
+ /* Attach to new skbuff, and increment count */
+ nskb->nfct = &ct->ct_general;
+ nskb->nfctinfo = ctinfo;
+ nf_conntrack_get(nskb->nfct);
+}
+
+static inline int
+do_iter(const struct nf_conntrack_tuple_hash *i,
+ int (*iter)(struct nf_conn *i, void *data),
+ void *data)
+{
+ return iter(nf_ct_tuplehash_to_ctrack(i), data);
+}
+
+/* Bring out ya dead! */
+static struct nf_conntrack_tuple_hash *
+get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
+ void *data, unsigned int *bucket)
+{
+ struct nf_conntrack_tuple_hash *h = NULL;
+
+ write_lock_bh(&nf_conntrack_lock);
+ for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
+ h = LIST_FIND_W(&nf_conntrack_hash[*bucket], do_iter,
+ struct nf_conntrack_tuple_hash *, iter, data);
+ if (h)
+ break;
+ }
+ if (!h)
+ h = LIST_FIND_W(&unconfirmed, do_iter,
+ struct nf_conntrack_tuple_hash *, iter, data);
+ if (h)
+ atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
+ write_unlock_bh(&nf_conntrack_lock);
+
+ return h;
+}
+
+void
+nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data)
+{
+ struct nf_conntrack_tuple_hash *h;
+ unsigned int bucket = 0;
+
+ while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
+ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+ /* Time to push up daises... */
+ if (del_timer(&ct->timeout))
+ death_by_timeout((unsigned long)ct);
+ /* ... else the timer will get him soon. */
+
+ nf_ct_put(ct);
+ }
+}
+
+static int kill_all(struct nf_conn *i, void *data)
+{
+ return 1;
+}
+
+static void free_conntrack_hash(struct list_head *hash, int vmalloced, int size)
+{
+ if (vmalloced)
+ vfree(hash);
+ else
+ free_pages((unsigned long)hash,
+ get_order(sizeof(struct list_head) * size));
+}
+
+/* Mishearing the voices in his head, our hero wonders how he's
+ supposed to kill the mall. */
+void nf_conntrack_cleanup(void)
+{
+ int i;
+
+ /* This makes sure all current packets have passed through
+ netfilter framework. Roll on, two-stage module
+ delete... */
+ synchronize_net();
+
+ nf_ct_event_cache_flush();
+ i_see_dead_people:
+ nf_ct_iterate_cleanup(kill_all, NULL);
+ if (atomic_read(&nf_conntrack_count) != 0) {
+ schedule();
+ goto i_see_dead_people;
+ }
+
+ for (i = 0; i < NF_CT_F_NUM; i++) {
+ if (nf_ct_cache[i].use == 0)
+ continue;
+
+ NF_CT_ASSERT(nf_ct_cache[i].use == 1);
+ nf_ct_cache[i].use = 1;
+ nf_conntrack_unregister_cache(i);
+ }
+ kmem_cache_destroy(nf_conntrack_expect_cachep);
+ free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
+ nf_conntrack_htable_size);
+
+ /* free l3proto protocol tables */
+ for (i = 0; i < PF_MAX; i++)
+ if (nf_ct_protos[i]) {
+ kfree(nf_ct_protos[i]);
+ nf_ct_protos[i] = NULL;
+ }
+}
+
+static struct list_head *alloc_hashtable(int size, int *vmalloced)
+{
+ struct list_head *hash;
+ unsigned int i;
+
+ *vmalloced = 0;
+ hash = (void*)__get_free_pages(GFP_KERNEL,
+ get_order(sizeof(struct list_head)
+ * size));
+ if (!hash) {
+ *vmalloced = 1;
+ printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
+ hash = vmalloc(sizeof(struct list_head) * size);
+ }
+
+ if (hash)
+ for (i = 0; i < size; i++)
+ INIT_LIST_HEAD(&hash[i]);
+
+ return hash;
+}
+
+int set_hashsize(const char *val, struct kernel_param *kp)
+{
+ int i, bucket, hashsize, vmalloced;
+ int old_vmalloced, old_size;
+ int rnd;
+ struct list_head *hash, *old_hash;
+ struct nf_conntrack_tuple_hash *h;
+
+ /* On boot, we can set this without any fancy locking. */
+ if (!nf_conntrack_htable_size)
+ return param_set_uint(val, kp);
+
+ hashsize = simple_strtol(val, NULL, 0);
+ if (!hashsize)
+ return -EINVAL;
+
+ hash = alloc_hashtable(hashsize, &vmalloced);
+ if (!hash)
+ return -ENOMEM;
+
+ /* We have to rehahs for the new table anyway, so we also can
+ * use a newrandom seed */
+ get_random_bytes(&rnd, 4);
+
+ write_lock_bh(&nf_conntrack_lock);
+ for (i = 0; i < nf_conntrack_htable_size; i++) {
+ while (!list_empty(&nf_conntrack_hash[i])) {
+ h = list_entry(nf_conntrack_hash[i].next,
+ struct nf_conntrack_tuple_hash, list);
+ list_del(&h->list);
+ bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
+ list_add_tail(&h->list, &hash[bucket]);
+ }
+ }
+ old_size = nf_conntrack_htable_size;
+ old_vmalloced = nf_conntrack_vmalloc;
+ old_hash = nf_conntrack_hash;
+
+ nf_conntrack_htable_size = hashsize;
+ nf_conntrack_vmalloc = vmalloced;
+ nf_conntrack_hash = hash;
+ nf_conntrack_hash_rnd = rnd;
+ write_unlock_bh(&nf_conntrack_lock);
+
+ free_conntrack_hash(old_hash, old_vmalloced, old_size);
+ return 0;
+}
+
+module_param_call(hashsize, set_hashsize, param_get_uint,
+ &nf_conntrack_htable_size, 0600);
+
+int __init nf_conntrack_init(void)
+{
+ unsigned int i;
+ int ret;
+
+ /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
+ * machine has 256 buckets. >= 1GB machines have 8192 buckets. */
+ if (!nf_conntrack_htable_size) {
+ nf_conntrack_htable_size
+ = (((num_physpages << PAGE_SHIFT) / 16384)
+ / sizeof(struct list_head));
+ if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
+ nf_conntrack_htable_size = 8192;
+ if (nf_conntrack_htable_size < 16)
+ nf_conntrack_htable_size = 16;
+ }
+ nf_conntrack_max = 8 * nf_conntrack_htable_size;
+
+ printk("nf_conntrack version %s (%u buckets, %d max)\n",
+ NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
+ nf_conntrack_max);
+
+ nf_conntrack_hash = alloc_hashtable(nf_conntrack_htable_size,
+ &nf_conntrack_vmalloc);
+ if (!nf_conntrack_hash) {
+ printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
+ goto err_out;
+ }
+
+ ret = nf_conntrack_register_cache(NF_CT_F_BASIC, "nf_conntrack:basic",
+ sizeof(struct nf_conn), NULL);
+ if (ret < 0) {
+ printk(KERN_ERR "Unable to create nf_conn slab cache\n");
+ goto err_free_hash;
+ }
+
+ nf_conntrack_expect_cachep = kmem_cache_create("nf_conntrack_expect",
+ sizeof(struct nf_conntrack_expect),
+ 0, 0, NULL, NULL);
+ if (!nf_conntrack_expect_cachep) {
+ printk(KERN_ERR "Unable to create nf_expect slab cache\n");
+ goto err_free_conntrack_slab;
+ }
+
+ /* Don't NEED lock here, but good form anyway. */
+ write_lock_bh(&nf_conntrack_lock);
+ for (i = 0; i < PF_MAX; i++)
+ nf_ct_l3protos[i] = &nf_conntrack_generic_l3proto;
+ write_unlock_bh(&nf_conntrack_lock);
+
+ /* Set up fake conntrack:
+ - to never be deleted, not in any hashes */
+ atomic_set(&nf_conntrack_untracked.ct_general.use, 1);
+ /* - and look it like as a confirmed connection */
+ set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status);
+
+ return ret;
+
+err_free_conntrack_slab:
+ nf_conntrack_unregister_cache(NF_CT_F_BASIC);
+err_free_hash:
+ free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
+ nf_conntrack_htable_size);
+err_out:
+ return -ENOMEM;
+}
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
new file mode 100644
index 00000000000..65080e269f2
--- /dev/null
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -0,0 +1,698 @@
+/* FTP extension for connection tracking. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
+ * - enable working with Layer 3 protocol independent connection tracking.
+ * - track EPRT and EPSV commands with IPv6 address.
+ *
+ * Derived from net/ipv4/netfilter/ip_conntrack_ftp.c
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/netfilter.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/ctype.h>
+#include <net/checksum.h>
+#include <net/tcp.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <linux/netfilter/nf_conntrack_ftp.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
+MODULE_DESCRIPTION("ftp connection tracking helper");
+
+/* This is slow, but it's simple. --RR */
+static char *ftp_buffer;
+
+static DEFINE_SPINLOCK(nf_ftp_lock);
+
+#define MAX_PORTS 8
+static u_int16_t ports[MAX_PORTS];
+static unsigned int ports_c;
+module_param_array(ports, ushort, &ports_c, 0400);
+
+static int loose;
+module_param(loose, int, 0600);
+
+unsigned int (*nf_nat_ftp_hook)(struct sk_buff **pskb,
+ enum ip_conntrack_info ctinfo,
+ enum ip_ct_ftp_type type,
+ unsigned int matchoff,
+ unsigned int matchlen,
+ struct nf_conntrack_expect *exp,
+ u32 *seq);
+EXPORT_SYMBOL_GPL(nf_nat_ftp_hook);
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+static int try_rfc959(const char *, size_t, struct nf_conntrack_man *, char);
+static int try_eprt(const char *, size_t, struct nf_conntrack_man *, char);
+static int try_epsv_response(const char *, size_t, struct nf_conntrack_man *,
+ char);
+
+static struct ftp_search {
+ enum ip_conntrack_dir dir;
+ const char *pattern;
+ size_t plen;
+ char skip;
+ char term;
+ enum ip_ct_ftp_type ftptype;
+ int (*getnum)(const char *, size_t, struct nf_conntrack_man *, char);
+} search[] = {
+ {
+ IP_CT_DIR_ORIGINAL,
+ "PORT", sizeof("PORT") - 1, ' ', '\r',
+ IP_CT_FTP_PORT,
+ try_rfc959,
+ },
+ {
+ IP_CT_DIR_REPLY,
+ "227 ", sizeof("227 ") - 1, '(', ')',
+ IP_CT_FTP_PASV,
+ try_rfc959,
+ },
+ {
+ IP_CT_DIR_ORIGINAL,
+ "EPRT", sizeof("EPRT") - 1, ' ', '\r',
+ IP_CT_FTP_EPRT,
+ try_eprt,
+ },
+ {
+ IP_CT_DIR_REPLY,
+ "229 ", sizeof("229 ") - 1, '(', ')',
+ IP_CT_FTP_EPSV,
+ try_epsv_response,
+ },
+};
+
+/* This code is based on inet_pton() in glibc-2.2.4 */
+static int
+get_ipv6_addr(const char *src, size_t dlen, struct in6_addr *dst, u_int8_t term)
+{
+ static const char xdigits[] = "0123456789abcdef";
+ u_int8_t tmp[16], *tp, *endp, *colonp;
+ int ch, saw_xdigit;
+ u_int32_t val;
+ size_t clen = 0;
+
+ tp = memset(tmp, '\0', sizeof(tmp));
+ endp = tp + sizeof(tmp);
+ colonp = NULL;
+
+ /* Leading :: requires some special handling. */
+ if (*src == ':'){
+ if (*++src != ':') {
+ DEBUGP("invalid \":\" at the head of addr\n");
+ return 0;
+ }
+ clen++;
+ }
+
+ saw_xdigit = 0;
+ val = 0;
+ while ((clen < dlen) && (*src != term)) {
+ const char *pch;
+
+ ch = tolower(*src++);
+ clen++;
+
+ pch = strchr(xdigits, ch);
+ if (pch != NULL) {
+ val <<= 4;
+ val |= (pch - xdigits);
+ if (val > 0xffff)
+ return 0;
+
+ saw_xdigit = 1;
+ continue;
+ }
+ if (ch != ':') {
+ DEBUGP("get_ipv6_addr: invalid char. \'%c\'\n", ch);
+ return 0;
+ }
+
+ if (!saw_xdigit) {
+ if (colonp) {
+ DEBUGP("invalid location of \"::\".\n");
+ return 0;
+ }
+ colonp = tp;
+ continue;
+ } else if (*src == term) {
+ DEBUGP("trancated IPv6 addr\n");
+ return 0;
+ }
+
+ if (tp + 2 > endp)
+ return 0;
+ *tp++ = (u_int8_t) (val >> 8) & 0xff;
+ *tp++ = (u_int8_t) val & 0xff;
+
+ saw_xdigit = 0;
+ val = 0;
+ continue;
+ }
+ if (saw_xdigit) {
+ if (tp + 2 > endp)
+ return 0;
+ *tp++ = (u_int8_t) (val >> 8) & 0xff;
+ *tp++ = (u_int8_t) val & 0xff;
+ }
+ if (colonp != NULL) {
+ /*
+ * Since some memmove()'s erroneously fail to handle
+ * overlapping regions, we'll do the shift by hand.
+ */
+ const int n = tp - colonp;
+ int i;
+
+ if (tp == endp)
+ return 0;
+
+ for (i = 1; i <= n; i++) {
+ endp[- i] = colonp[n - i];
+ colonp[n - i] = 0;
+ }
+ tp = endp;
+ }
+ if (tp != endp || (*src != term))
+ return 0;
+
+ memcpy(dst->s6_addr, tmp, sizeof(dst->s6_addr));
+ return clen;
+}
+
+static int try_number(const char *data, size_t dlen, u_int32_t array[],
+ int array_size, char sep, char term)
+{
+ u_int32_t i, len;
+
+ memset(array, 0, sizeof(array[0])*array_size);
+
+ /* Keep data pointing at next char. */
+ for (i = 0, len = 0; len < dlen && i < array_size; len++, data++) {
+ if (*data >= '0' && *data <= '9') {
+ array[i] = array[i]*10 + *data - '0';
+ }
+ else if (*data == sep)
+ i++;
+ else {
+ /* Unexpected character; true if it's the
+ terminator and we're finished. */
+ if (*data == term && i == array_size - 1)
+ return len;
+
+ DEBUGP("Char %u (got %u nums) `%u' unexpected\n",
+ len, i, *data);
+ return 0;
+ }
+ }
+ DEBUGP("Failed to fill %u numbers separated by %c\n", array_size, sep);
+
+ return 0;
+}
+
+/* Returns 0, or length of numbers: 192,168,1,1,5,6 */
+static int try_rfc959(const char *data, size_t dlen,
+ struct nf_conntrack_man *cmd, char term)
+{
+ int length;
+ u_int32_t array[6];
+
+ length = try_number(data, dlen, array, 6, ',', term);
+ if (length == 0)
+ return 0;
+
+ cmd->u3.ip = htonl((array[0] << 24) | (array[1] << 16) |
+ (array[2] << 8) | array[3]);
+ cmd->u.tcp.port = htons((array[4] << 8) | array[5]);
+ return length;
+}
+
+/* Grab port: number up to delimiter */
+static int get_port(const char *data, int start, size_t dlen, char delim,
+ u_int16_t *port)
+{
+ u_int16_t tmp_port = 0;
+ int i;
+
+ for (i = start; i < dlen; i++) {
+ /* Finished? */
+ if (data[i] == delim) {
+ if (tmp_port == 0)
+ break;
+ *port = htons(tmp_port);
+ DEBUGP("get_port: return %d\n", tmp_port);
+ return i + 1;
+ }
+ else if (data[i] >= '0' && data[i] <= '9')
+ tmp_port = tmp_port*10 + data[i] - '0';
+ else { /* Some other crap */
+ DEBUGP("get_port: invalid char.\n");
+ break;
+ }
+ }
+ return 0;
+}
+
+/* Returns 0, or length of numbers: |1|132.235.1.2|6275| or |2|3ffe::1|6275| */
+static int try_eprt(const char *data, size_t dlen, struct nf_conntrack_man *cmd,
+ char term)
+{
+ char delim;
+ int length;
+
+ /* First character is delimiter, then "1" for IPv4 or "2" for IPv6,
+ then delimiter again. */
+ if (dlen <= 3) {
+ DEBUGP("EPRT: too short\n");
+ return 0;
+ }
+ delim = data[0];
+ if (isdigit(delim) || delim < 33 || delim > 126 || data[2] != delim) {
+ DEBUGP("try_eprt: invalid delimitter.\n");
+ return 0;
+ }
+
+ if ((cmd->l3num == PF_INET && data[1] != '1') ||
+ (cmd->l3num == PF_INET6 && data[1] != '2')) {
+ DEBUGP("EPRT: invalid protocol number.\n");
+ return 0;
+ }
+
+ DEBUGP("EPRT: Got %c%c%c\n", delim, data[1], delim);
+
+ if (data[1] == '1') {
+ u_int32_t array[4];
+
+ /* Now we have IP address. */
+ length = try_number(data + 3, dlen - 3, array, 4, '.', delim);
+ if (length != 0)
+ cmd->u3.ip = htonl((array[0] << 24) | (array[1] << 16)
+ | (array[2] << 8) | array[3]);
+ } else {
+ /* Now we have IPv6 address. */
+ length = get_ipv6_addr(data + 3, dlen - 3,
+ (struct in6_addr *)cmd->u3.ip6, delim);
+ }
+
+ if (length == 0)
+ return 0;
+ DEBUGP("EPRT: Got IP address!\n");
+ /* Start offset includes initial "|1|", and trailing delimiter */
+ return get_port(data, 3 + length + 1, dlen, delim, &cmd->u.tcp.port);
+}
+
+/* Returns 0, or length of numbers: |||6446| */
+static int try_epsv_response(const char *data, size_t dlen,
+ struct nf_conntrack_man *cmd, char term)
+{
+ char delim;
+
+ /* Three delimiters. */
+ if (dlen <= 3) return 0;
+ delim = data[0];
+ if (isdigit(delim) || delim < 33 || delim > 126
+ || data[1] != delim || data[2] != delim)
+ return 0;
+
+ return get_port(data, 3, dlen, delim, &cmd->u.tcp.port);
+}
+
+/* Return 1 for match, 0 for accept, -1 for partial. */
+static int find_pattern(const char *data, size_t dlen,
+ const char *pattern, size_t plen,
+ char skip, char term,
+ unsigned int *numoff,
+ unsigned int *numlen,
+ struct nf_conntrack_man *cmd,
+ int (*getnum)(const char *, size_t,
+ struct nf_conntrack_man *, char))
+{
+ size_t i;
+
+ DEBUGP("find_pattern `%s': dlen = %u\n", pattern, dlen);
+ if (dlen == 0)
+ return 0;
+
+ if (dlen <= plen) {
+ /* Short packet: try for partial? */
+ if (strnicmp(data, pattern, dlen) == 0)
+ return -1;
+ else return 0;
+ }
+
+ if (strnicmp(data, pattern, plen) != 0) {
+#if 0
+ size_t i;
+
+ DEBUGP("ftp: string mismatch\n");
+ for (i = 0; i < plen; i++) {
+ DEBUGP("ftp:char %u `%c'(%u) vs `%c'(%u)\n",
+ i, data[i], data[i],
+ pattern[i], pattern[i]);
+ }
+#endif
+ return 0;
+ }
+
+ DEBUGP("Pattern matches!\n");
+ /* Now we've found the constant string, try to skip
+ to the 'skip' character */
+ for (i = plen; data[i] != skip; i++)
+ if (i == dlen - 1) return -1;
+
+ /* Skip over the last character */
+ i++;
+
+ DEBUGP("Skipped up to `%c'!\n", skip);
+
+ *numoff = i;
+ *numlen = getnum(data + i, dlen - i, cmd, term);
+ if (!*numlen)
+ return -1;
+
+ DEBUGP("Match succeeded!\n");
+ return 1;
+}
+
+/* Look up to see if we're just after a \n. */
+static int find_nl_seq(u32 seq, const struct ip_ct_ftp_master *info, int dir)
+{
+ unsigned int i;
+
+ for (i = 0; i < info->seq_aft_nl_num[dir]; i++)
+ if (info->seq_aft_nl[dir][i] == seq)
+ return 1;
+ return 0;
+}
+
+/* We don't update if it's older than what we have. */
+static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir,
+ struct sk_buff *skb)
+{
+ unsigned int i, oldest = NUM_SEQ_TO_REMEMBER;
+
+ /* Look for oldest: if we find exact match, we're done. */
+ for (i = 0; i < info->seq_aft_nl_num[dir]; i++) {
+ if (info->seq_aft_nl[dir][i] == nl_seq)
+ return;
+
+ if (oldest == info->seq_aft_nl_num[dir]
+ || before(info->seq_aft_nl[dir][i], oldest))
+ oldest = i;
+ }
+
+ if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) {
+ info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq;
+ nf_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb);
+ } else if (oldest != NUM_SEQ_TO_REMEMBER) {
+ info->seq_aft_nl[dir][oldest] = nl_seq;
+ nf_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb);
+ }
+}
+
+static int help(struct sk_buff **pskb,
+ unsigned int protoff,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo)
+{
+ unsigned int dataoff, datalen;
+ struct tcphdr _tcph, *th;
+ char *fb_ptr;
+ int ret;
+ u32 seq;
+ int dir = CTINFO2DIR(ctinfo);
+ unsigned int matchlen, matchoff;
+ struct ip_ct_ftp_master *ct_ftp_info = &ct->help->ct_ftp_info;
+ struct nf_conntrack_expect *exp;
+ struct nf_conntrack_man cmd = {};
+
+ unsigned int i;
+ int found = 0, ends_in_nl;
+
+ /* Until there's been traffic both ways, don't look in packets. */
+ if (ctinfo != IP_CT_ESTABLISHED
+ && ctinfo != IP_CT_ESTABLISHED+IP_CT_IS_REPLY) {
+ DEBUGP("ftp: Conntrackinfo = %u\n", ctinfo);
+ return NF_ACCEPT;
+ }
+
+ th = skb_header_pointer(*pskb, protoff, sizeof(_tcph), &_tcph);
+ if (th == NULL)
+ return NF_ACCEPT;
+
+ dataoff = protoff + th->doff * 4;
+ /* No data? */
+ if (dataoff >= (*pskb)->len) {
+ DEBUGP("ftp: dataoff(%u) >= skblen(%u)\n", dataoff,
+ (*pskb)->len);
+ return NF_ACCEPT;
+ }
+ datalen = (*pskb)->len - dataoff;
+
+ spin_lock_bh(&nf_ftp_lock);
+ fb_ptr = skb_header_pointer(*pskb, dataoff, datalen, ftp_buffer);
+ BUG_ON(fb_ptr == NULL);
+
+ ends_in_nl = (fb_ptr[datalen - 1] == '\n');
+ seq = ntohl(th->seq) + datalen;
+
+ /* Look up to see if we're just after a \n. */
+ if (!find_nl_seq(ntohl(th->seq), ct_ftp_info, dir)) {
+ /* Now if this ends in \n, update ftp info. */
+ DEBUGP("nf_conntrack_ftp_help: wrong seq pos %s(%u) or %s(%u)\n",
+ ct_ftp_info->seq_aft_nl_num[dir] > 0 ? "" : "(UNSET)",
+ ct_ftp_info->seq_aft_nl[dir][0],
+ ct_ftp_info->seq_aft_nl_num[dir] > 1 ? "" : "(UNSET)",
+ ct_ftp_info->seq_aft_nl[dir][1]);
+ ret = NF_ACCEPT;
+ goto out_update_nl;
+ }
+
+ /* Initialize IP/IPv6 addr to expected address (it's not mentioned
+ in EPSV responses) */
+ cmd.l3num = ct->tuplehash[dir].tuple.src.l3num;
+ memcpy(cmd.u3.all, &ct->tuplehash[dir].tuple.src.u3.all,
+ sizeof(cmd.u3.all));
+
+ for (i = 0; i < ARRAY_SIZE(search); i++) {
+ if (search[i].dir != dir) continue;
+
+ found = find_pattern(fb_ptr, datalen,
+ search[i].pattern,
+ search[i].plen,
+ search[i].skip,
+ search[i].term,
+ &matchoff, &matchlen,
+ &cmd,
+ search[i].getnum);
+ if (found) break;
+ }
+ if (found == -1) {
+ /* We don't usually drop packets. After all, this is
+ connection tracking, not packet filtering.
+ However, it is necessary for accurate tracking in
+ this case. */
+ if (net_ratelimit())
+ printk("conntrack_ftp: partial %s %u+%u\n",
+ search[i].pattern,
+ ntohl(th->seq), datalen);
+ ret = NF_DROP;
+ goto out;
+ } else if (found == 0) { /* No match */
+ ret = NF_ACCEPT;
+ goto out_update_nl;
+ }
+
+ DEBUGP("conntrack_ftp: match `%.*s' (%u bytes at %u)\n",
+ (int)matchlen, fb_ptr + matchoff,
+ matchlen, ntohl(th->seq) + matchoff);
+
+ exp = nf_conntrack_expect_alloc(ct);
+ if (exp == NULL) {
+ ret = NF_DROP;
+ goto out;
+ }
+
+ /* We refer to the reverse direction ("!dir") tuples here,
+ * because we're expecting something in the other direction.
+ * Doesn't matter unless NAT is happening. */
+ exp->tuple.dst.u3 = ct->tuplehash[!dir].tuple.dst.u3;
+
+ /* Update the ftp info */
+ if ((cmd.l3num == ct->tuplehash[dir].tuple.src.l3num) &&
+ memcmp(&cmd.u3.all, &ct->tuplehash[dir].tuple.src.u3.all,
+ sizeof(cmd.u3.all))) {
+ /* Enrico Scholz's passive FTP to partially RNAT'd ftp
+ server: it really wants us to connect to a
+ different IP address. Simply don't record it for
+ NAT. */
+ if (cmd.l3num == PF_INET) {
+ DEBUGP("conntrack_ftp: NOT RECORDING: %u,%u,%u,%u != %u.%u.%u.%u\n",
+ NIPQUAD(cmd.u3.ip),
+ NIPQUAD(ct->tuplehash[dir].tuple.src.u3.ip));
+ } else {
+ DEBUGP("conntrack_ftp: NOT RECORDING: %x:%x:%x:%x:%x:%x:%x:%x != %x:%x:%x:%x:%x:%x:%x:%x\n",
+ NIP6(*((struct in6_addr *)cmd.u3.ip6)),
+ NIP6(*((struct in6_addr *)ct->tuplehash[dir]
+ .tuple.src.u3.ip6)));
+ }
+
+ /* Thanks to Cristiano Lincoln Mattos
+ <lincoln@cesar.org.br> for reporting this potential
+ problem (DMZ machines opening holes to internal
+ networks, or the packet filter itself). */
+ if (!loose) {
+ ret = NF_ACCEPT;
+ goto out_put_expect;
+ }
+ memcpy(&exp->tuple.dst.u3, &cmd.u3.all,
+ sizeof(exp->tuple.dst.u3));
+ }
+
+ exp->tuple.src.u3 = ct->tuplehash[!dir].tuple.src.u3;
+ exp->tuple.src.l3num = cmd.l3num;
+ exp->tuple.src.u.tcp.port = 0;
+ exp->tuple.dst.u.tcp.port = cmd.u.tcp.port;
+ exp->tuple.dst.protonum = IPPROTO_TCP;
+
+ exp->mask = (struct nf_conntrack_tuple)
+ { .src = { .l3num = 0xFFFF,
+ .u = { .tcp = { 0 }},
+ },
+ .dst = { .protonum = 0xFF,
+ .u = { .tcp = { 0xFFFF }},
+ },
+ };
+ if (cmd.l3num == PF_INET) {
+ exp->mask.src.u3.ip = 0xFFFFFFFF;
+ exp->mask.dst.u3.ip = 0xFFFFFFFF;
+ } else {
+ memset(exp->mask.src.u3.ip6, 0xFF,
+ sizeof(exp->mask.src.u3.ip6));
+ memset(exp->mask.dst.u3.ip6, 0xFF,
+ sizeof(exp->mask.src.u3.ip6));
+ }
+
+ exp->expectfn = NULL;
+ exp->flags = 0;
+
+ /* Now, NAT might want to mangle the packet, and register the
+ * (possibly changed) expectation itself. */
+ if (nf_nat_ftp_hook)
+ ret = nf_nat_ftp_hook(pskb, ctinfo, search[i].ftptype,
+ matchoff, matchlen, exp, &seq);
+ else {
+ /* Can't expect this? Best to drop packet now. */
+ if (nf_conntrack_expect_related(exp) != 0)
+ ret = NF_DROP;
+ else
+ ret = NF_ACCEPT;
+ }
+
+out_put_expect:
+ nf_conntrack_expect_put(exp);
+
+out_update_nl:
+ /* Now if this ends in \n, update ftp info. Seq may have been
+ * adjusted by NAT code. */
+ if (ends_in_nl)
+ update_nl_seq(seq, ct_ftp_info, dir, *pskb);
+ out:
+ spin_unlock_bh(&nf_ftp_lock);
+ return ret;
+}
+
+static struct nf_conntrack_helper ftp[MAX_PORTS][2];
+static char ftp_names[MAX_PORTS][2][sizeof("ftp-65535")];
+
+/* don't make this __exit, since it's called from __init ! */
+static void fini(void)
+{
+ int i, j;
+ for (i = 0; i < ports_c; i++) {
+ for (j = 0; j < 2; j++) {
+ if (ftp[i][j].me == NULL)
+ continue;
+
+ DEBUGP("nf_ct_ftp: unregistering helper for pf: %d "
+ "port: %d\n",
+ ftp[i][j].tuple.src.l3num, ports[i]);
+ nf_conntrack_helper_unregister(&ftp[i][j]);
+ }
+ }
+
+ kfree(ftp_buffer);
+}
+
+static int __init init(void)
+{
+ int i, j = -1, ret = 0;
+ char *tmpname;
+
+ ftp_buffer = kmalloc(65536, GFP_KERNEL);
+ if (!ftp_buffer)
+ return -ENOMEM;
+
+ if (ports_c == 0)
+ ports[ports_c++] = FTP_PORT;
+
+ /* FIXME should be configurable whether IPv4 and IPv6 FTP connections
+ are tracked or not - YK */
+ for (i = 0; i < ports_c; i++) {
+ memset(&ftp[i], 0, sizeof(struct nf_conntrack_helper));
+
+ ftp[i][0].tuple.src.l3num = PF_INET;
+ ftp[i][1].tuple.src.l3num = PF_INET6;
+ for (j = 0; j < 2; j++) {
+ ftp[i][j].tuple.src.u.tcp.port = htons(ports[i]);
+ ftp[i][j].tuple.dst.protonum = IPPROTO_TCP;
+ ftp[i][j].mask.src.u.tcp.port = 0xFFFF;
+ ftp[i][j].mask.dst.protonum = 0xFF;
+ ftp[i][j].max_expected = 1;
+ ftp[i][j].timeout = 5 * 60; /* 5 Minutes */
+ ftp[i][j].me = THIS_MODULE;
+ ftp[i][j].help = help;
+ tmpname = &ftp_names[i][j][0];
+ if (ports[i] == FTP_PORT)
+ sprintf(tmpname, "ftp");
+ else
+ sprintf(tmpname, "ftp-%d", ports[i]);
+ ftp[i][j].name = tmpname;
+
+ DEBUGP("nf_ct_ftp: registering helper for pf: %d "
+ "port: %d\n",
+ ftp[i][j].tuple.src.l3num, ports[i]);
+ ret = nf_conntrack_helper_register(&ftp[i][j]);
+ if (ret) {
+ printk("nf_ct_ftp: failed to register helper "
+ " for pf: %d port: %d\n",
+ ftp[i][j].tuple.src.l3num, ports[i]);
+ fini();
+ return ret;
+ }
+ }
+ }
+
+ return 0;
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/netfilter/nf_conntrack_l3proto_generic.c b/net/netfilter/nf_conntrack_l3proto_generic.c
new file mode 100644
index 00000000000..7de4f06c63c
--- /dev/null
+++ b/net/netfilter/nf_conntrack_l3proto_generic.c
@@ -0,0 +1,98 @@
+/*
+ * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
+ *
+ * Based largely upon the original ip_conntrack code which
+ * had the following copyright information:
+ *
+ * (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Author:
+ * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/icmp.h>
+#include <linux/sysctl.h>
+#include <net/ip.h>
+
+#include <linux/netfilter_ipv4.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_protocol.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+DECLARE_PER_CPU(struct nf_conntrack_stat, nf_conntrack_stat);
+
+static int generic_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
+ struct nf_conntrack_tuple *tuple)
+{
+ memset(&tuple->src.u3, 0, sizeof(tuple->src.u3));
+ memset(&tuple->dst.u3, 0, sizeof(tuple->dst.u3));
+
+ return 1;
+}
+
+static int generic_invert_tuple(struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_tuple *orig)
+{
+ memset(&tuple->src.u3, 0, sizeof(tuple->src.u3));
+ memset(&tuple->dst.u3, 0, sizeof(tuple->dst.u3));
+
+ return 1;
+}
+
+static int generic_print_tuple(struct seq_file *s,
+ const struct nf_conntrack_tuple *tuple)
+{
+ return 0;
+}
+
+static int generic_print_conntrack(struct seq_file *s,
+ const struct nf_conn *conntrack)
+{
+ return 0;
+}
+
+static int
+generic_prepare(struct sk_buff **pskb, unsigned int hooknum,
+ unsigned int *dataoff, u_int8_t *protonum)
+{
+ /* Never track !!! */
+ return -NF_ACCEPT;
+}
+
+
+static u_int32_t generic_get_features(const struct nf_conntrack_tuple *tuple)
+
+{
+ return NF_CT_F_BASIC;
+}
+
+struct nf_conntrack_l3proto nf_conntrack_generic_l3proto = {
+ .l3proto = PF_UNSPEC,
+ .name = "unknown",
+ .pkt_to_tuple = generic_pkt_to_tuple,
+ .invert_tuple = generic_invert_tuple,
+ .print_tuple = generic_print_tuple,
+ .print_conntrack = generic_print_conntrack,
+ .prepare = generic_prepare,
+ .get_features = generic_get_features,
+ .me = THIS_MODULE,
+};
diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c
new file mode 100644
index 00000000000..36425f6c833
--- /dev/null
+++ b/net/netfilter/nf_conntrack_proto_generic.c
@@ -0,0 +1,85 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
+ * - enable working with L3 protocol independent connection tracking.
+ *
+ * Derived from net/ipv4/netfilter/ip_conntrack_proto_generic.c
+ */
+
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/netfilter.h>
+#include <net/netfilter/nf_conntrack_protocol.h>
+
+unsigned long nf_ct_generic_timeout = 600*HZ;
+
+static int generic_pkt_to_tuple(const struct sk_buff *skb,
+ unsigned int dataoff,
+ struct nf_conntrack_tuple *tuple)
+{
+ tuple->src.u.all = 0;
+ tuple->dst.u.all = 0;
+
+ return 1;
+}
+
+static int generic_invert_tuple(struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_tuple *orig)
+{
+ tuple->src.u.all = 0;
+ tuple->dst.u.all = 0;
+
+ return 1;
+}
+
+/* Print out the per-protocol part of the tuple. */
+static int generic_print_tuple(struct seq_file *s,
+ const struct nf_conntrack_tuple *tuple)
+{
+ return 0;
+}
+
+/* Print out the private part of the conntrack. */
+static int generic_print_conntrack(struct seq_file *s,
+ const struct nf_conn *state)
+{
+ return 0;
+}
+
+/* Returns verdict for packet, or -1 for invalid. */
+static int packet(struct nf_conn *conntrack,
+ const struct sk_buff *skb,
+ unsigned int dataoff,
+ enum ip_conntrack_info ctinfo,
+ int pf,
+ unsigned int hooknum)
+{
+ nf_ct_refresh_acct(conntrack, ctinfo, skb, nf_ct_generic_timeout);
+ return NF_ACCEPT;
+}
+
+/* Called when a new connection for this protocol found. */
+static int new(struct nf_conn *conntrack, const struct sk_buff *skb,
+ unsigned int dataoff)
+{
+ return 1;
+}
+
+struct nf_conntrack_protocol nf_conntrack_generic_protocol =
+{
+ .l3proto = PF_UNSPEC,
+ .proto = 0,
+ .name = "unknown",
+ .pkt_to_tuple = generic_pkt_to_tuple,
+ .invert_tuple = generic_invert_tuple,
+ .print_tuple = generic_print_tuple,
+ .print_conntrack = generic_print_conntrack,
+ .packet = packet,
+ .new = new,
+};
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
new file mode 100644
index 00000000000..3a600f77b4e
--- /dev/null
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -0,0 +1,670 @@
+/*
+ * Connection tracking protocol helper module for SCTP.
+ *
+ * SCTP is defined in RFC 2960. References to various sections in this code
+ * are to this RFC.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * 17 Oct 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
+ * - enable working with L3 protocol independent connection tracking.
+ *
+ * Derived from net/ipv4/ip_conntrack_sctp.c
+ */
+
+/*
+ * Added support for proc manipulation of timeouts.
+ */
+
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/sctp.h>
+#include <linux/string.h>
+#include <linux/seq_file.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_protocol.h>
+
+#if 0
+#define DEBUGP(format, ...) printk(format, ## __VA_ARGS__)
+#else
+#define DEBUGP(format, args...)
+#endif
+
+/* Protects conntrack->proto.sctp */
+static DEFINE_RWLOCK(sctp_lock);
+
+/* FIXME: Examine ipfilter's timeouts and conntrack transitions more
+ closely. They're more complex. --RR
+
+ And so for me for SCTP :D -Kiran */
+
+static const char *sctp_conntrack_names[] = {
+ "NONE",
+ "CLOSED",
+ "COOKIE_WAIT",
+ "COOKIE_ECHOED",
+ "ESTABLISHED",
+ "SHUTDOWN_SENT",
+ "SHUTDOWN_RECD",
+ "SHUTDOWN_ACK_SENT",
+};
+
+#define SECS * HZ
+#define MINS * 60 SECS
+#define HOURS * 60 MINS
+#define DAYS * 24 HOURS
+
+static unsigned long nf_ct_sctp_timeout_closed = 10 SECS;
+static unsigned long nf_ct_sctp_timeout_cookie_wait = 3 SECS;
+static unsigned long nf_ct_sctp_timeout_cookie_echoed = 3 SECS;
+static unsigned long nf_ct_sctp_timeout_established = 5 DAYS;
+static unsigned long nf_ct_sctp_timeout_shutdown_sent = 300 SECS / 1000;
+static unsigned long nf_ct_sctp_timeout_shutdown_recd = 300 SECS / 1000;
+static unsigned long nf_ct_sctp_timeout_shutdown_ack_sent = 3 SECS;
+
+static unsigned long * sctp_timeouts[]
+= { NULL, /* SCTP_CONNTRACK_NONE */
+ &nf_ct_sctp_timeout_closed, /* SCTP_CONNTRACK_CLOSED */
+ &nf_ct_sctp_timeout_cookie_wait, /* SCTP_CONNTRACK_COOKIE_WAIT */
+ &nf_ct_sctp_timeout_cookie_echoed, /* SCTP_CONNTRACK_COOKIE_ECHOED */
+ &nf_ct_sctp_timeout_established, /* SCTP_CONNTRACK_ESTABLISHED */
+ &nf_ct_sctp_timeout_shutdown_sent, /* SCTP_CONNTRACK_SHUTDOWN_SENT */
+ &nf_ct_sctp_timeout_shutdown_recd, /* SCTP_CONNTRACK_SHUTDOWN_RECD */
+ &nf_ct_sctp_timeout_shutdown_ack_sent /* SCTP_CONNTRACK_SHUTDOWN_ACK_SENT */
+ };
+
+#define sNO SCTP_CONNTRACK_NONE
+#define sCL SCTP_CONNTRACK_CLOSED
+#define sCW SCTP_CONNTRACK_COOKIE_WAIT
+#define sCE SCTP_CONNTRACK_COOKIE_ECHOED
+#define sES SCTP_CONNTRACK_ESTABLISHED
+#define sSS SCTP_CONNTRACK_SHUTDOWN_SENT
+#define sSR SCTP_CONNTRACK_SHUTDOWN_RECD
+#define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT
+#define sIV SCTP_CONNTRACK_MAX
+
+/*
+ These are the descriptions of the states:
+
+NOTE: These state names are tantalizingly similar to the states of an
+SCTP endpoint. But the interpretation of the states is a little different,
+considering that these are the states of the connection and not of an end
+point. Please note the subtleties. -Kiran
+
+NONE - Nothing so far.
+COOKIE WAIT - We have seen an INIT chunk in the original direction, or also
+ an INIT_ACK chunk in the reply direction.
+COOKIE ECHOED - We have seen a COOKIE_ECHO chunk in the original direction.
+ESTABLISHED - We have seen a COOKIE_ACK in the reply direction.
+SHUTDOWN_SENT - We have seen a SHUTDOWN chunk in the original direction.
+SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply directoin.
+SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite
+ to that of the SHUTDOWN chunk.
+CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of
+ the SHUTDOWN chunk. Connection is closed.
+*/
+
+/* TODO
+ - I have assumed that the first INIT is in the original direction.
+ This messes things when an INIT comes in the reply direction in CLOSED
+ state.
+ - Check the error type in the reply dir before transitioning from
+cookie echoed to closed.
+ - Sec 5.2.4 of RFC 2960
+ - Multi Homing support.
+*/
+
+/* SCTP conntrack state transitions */
+static enum sctp_conntrack sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = {
+ {
+/* ORIGINAL */
+/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */
+/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA},
+/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},
+/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
+/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA},
+/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA},
+/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant have Stale cookie*/
+/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA},/* 5.2.4 - Big TODO */
+/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant come in orig dir */
+/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL}
+ },
+ {
+/* REPLY */
+/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */
+/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* INIT in sCL Big TODO */
+/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},
+/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
+/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA},
+/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA},
+/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA},
+/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant come in reply dir */
+/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA},
+/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL}
+ }
+};
+
+static int sctp_pkt_to_tuple(const struct sk_buff *skb,
+ unsigned int dataoff,
+ struct nf_conntrack_tuple *tuple)
+{
+ sctp_sctphdr_t _hdr, *hp;
+
+ DEBUGP(__FUNCTION__);
+ DEBUGP("\n");
+
+ /* Actually only need first 8 bytes. */
+ hp = skb_header_pointer(skb, dataoff, 8, &_hdr);
+ if (hp == NULL)
+ return 0;
+
+ tuple->src.u.sctp.port = hp->source;
+ tuple->dst.u.sctp.port = hp->dest;
+ return 1;
+}
+
+static int sctp_invert_tuple(struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_tuple *orig)
+{
+ DEBUGP(__FUNCTION__);
+ DEBUGP("\n");
+
+ tuple->src.u.sctp.port = orig->dst.u.sctp.port;
+ tuple->dst.u.sctp.port = orig->src.u.sctp.port;
+ return 1;
+}
+
+/* Print out the per-protocol part of the tuple. */
+static int sctp_print_tuple(struct seq_file *s,
+ const struct nf_conntrack_tuple *tuple)
+{
+ DEBUGP(__FUNCTION__);
+ DEBUGP("\n");
+
+ return seq_printf(s, "sport=%hu dport=%hu ",
+ ntohs(tuple->src.u.sctp.port),
+ ntohs(tuple->dst.u.sctp.port));
+}
+
+/* Print out the private part of the conntrack. */
+static int sctp_print_conntrack(struct seq_file *s,
+ const struct nf_conn *conntrack)
+{
+ enum sctp_conntrack state;
+
+ DEBUGP(__FUNCTION__);
+ DEBUGP("\n");
+
+ read_lock_bh(&sctp_lock);
+ state = conntrack->proto.sctp.state;
+ read_unlock_bh(&sctp_lock);
+
+ return seq_printf(s, "%s ", sctp_conntrack_names[state]);
+}
+
+#define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) \
+for (offset = dataoff + sizeof(sctp_sctphdr_t), count = 0; \
+ offset < skb->len && \
+ (sch = skb_header_pointer(skb, offset, sizeof(_sch), &_sch)); \
+ offset += (htons(sch->length) + 3) & ~3, count++)
+
+/* Some validity checks to make sure the chunks are fine */
+static int do_basic_checks(struct nf_conn *conntrack,
+ const struct sk_buff *skb,
+ unsigned int dataoff,
+ char *map)
+{
+ u_int32_t offset, count;
+ sctp_chunkhdr_t _sch, *sch;
+ int flag;
+
+ DEBUGP(__FUNCTION__);
+ DEBUGP("\n");
+
+ flag = 0;
+
+ for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) {
+ DEBUGP("Chunk Num: %d Type: %d\n", count, sch->type);
+
+ if (sch->type == SCTP_CID_INIT
+ || sch->type == SCTP_CID_INIT_ACK
+ || sch->type == SCTP_CID_SHUTDOWN_COMPLETE) {
+ flag = 1;
+ }
+
+ /* Cookie Ack/Echo chunks not the first OR
+ Init / Init Ack / Shutdown compl chunks not the only chunks */
+ if ((sch->type == SCTP_CID_COOKIE_ACK
+ || sch->type == SCTP_CID_COOKIE_ECHO
+ || flag)
+ && count !=0 ) {
+ DEBUGP("Basic checks failed\n");
+ return 1;
+ }
+
+ if (map) {
+ set_bit(sch->type, (void *)map);
+ }
+ }
+
+ DEBUGP("Basic checks passed\n");
+ return 0;
+}
+
+static int new_state(enum ip_conntrack_dir dir,
+ enum sctp_conntrack cur_state,
+ int chunk_type)
+{
+ int i;
+
+ DEBUGP(__FUNCTION__);
+ DEBUGP("\n");
+
+ DEBUGP("Chunk type: %d\n", chunk_type);
+
+ switch (chunk_type) {
+ case SCTP_CID_INIT:
+ DEBUGP("SCTP_CID_INIT\n");
+ i = 0; break;
+ case SCTP_CID_INIT_ACK:
+ DEBUGP("SCTP_CID_INIT_ACK\n");
+ i = 1; break;
+ case SCTP_CID_ABORT:
+ DEBUGP("SCTP_CID_ABORT\n");
+ i = 2; break;
+ case SCTP_CID_SHUTDOWN:
+ DEBUGP("SCTP_CID_SHUTDOWN\n");
+ i = 3; break;
+ case SCTP_CID_SHUTDOWN_ACK:
+ DEBUGP("SCTP_CID_SHUTDOWN_ACK\n");
+ i = 4; break;
+ case SCTP_CID_ERROR:
+ DEBUGP("SCTP_CID_ERROR\n");
+ i = 5; break;
+ case SCTP_CID_COOKIE_ECHO:
+ DEBUGP("SCTP_CID_COOKIE_ECHO\n");
+ i = 6; break;
+ case SCTP_CID_COOKIE_ACK:
+ DEBUGP("SCTP_CID_COOKIE_ACK\n");
+ i = 7; break;
+ case SCTP_CID_SHUTDOWN_COMPLETE:
+ DEBUGP("SCTP_CID_SHUTDOWN_COMPLETE\n");
+ i = 8; break;
+ default:
+ /* Other chunks like DATA, SACK, HEARTBEAT and
+ its ACK do not cause a change in state */
+ DEBUGP("Unknown chunk type, Will stay in %s\n",
+ sctp_conntrack_names[cur_state]);
+ return cur_state;
+ }
+
+ DEBUGP("dir: %d cur_state: %s chunk_type: %d new_state: %s\n",
+ dir, sctp_conntrack_names[cur_state], chunk_type,
+ sctp_conntrack_names[sctp_conntracks[dir][i][cur_state]]);
+
+ return sctp_conntracks[dir][i][cur_state];
+}
+
+/* Returns verdict for packet, or -1 for invalid. */
+static int sctp_packet(struct nf_conn *conntrack,
+ const struct sk_buff *skb,
+ unsigned int dataoff,
+ enum ip_conntrack_info ctinfo,
+ int pf,
+ unsigned int hooknum)
+{
+ enum sctp_conntrack newconntrack, oldsctpstate;
+ sctp_sctphdr_t _sctph, *sh;
+ sctp_chunkhdr_t _sch, *sch;
+ u_int32_t offset, count;
+ char map[256 / sizeof (char)] = {0};
+
+ DEBUGP(__FUNCTION__);
+ DEBUGP("\n");
+
+ sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph);
+ if (sh == NULL)
+ return -1;
+
+ if (do_basic_checks(conntrack, skb, dataoff, map) != 0)
+ return -1;
+
+ /* Check the verification tag (Sec 8.5) */
+ if (!test_bit(SCTP_CID_INIT, (void *)map)
+ && !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, (void *)map)
+ && !test_bit(SCTP_CID_COOKIE_ECHO, (void *)map)
+ && !test_bit(SCTP_CID_ABORT, (void *)map)
+ && !test_bit(SCTP_CID_SHUTDOWN_ACK, (void *)map)
+ && (sh->vtag != conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) {
+ DEBUGP("Verification tag check failed\n");
+ return -1;
+ }
+
+ oldsctpstate = newconntrack = SCTP_CONNTRACK_MAX;
+ for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) {
+ write_lock_bh(&sctp_lock);
+
+ /* Special cases of Verification tag check (Sec 8.5.1) */
+ if (sch->type == SCTP_CID_INIT) {
+ /* Sec 8.5.1 (A) */
+ if (sh->vtag != 0) {
+ write_unlock_bh(&sctp_lock);
+ return -1;
+ }
+ } else if (sch->type == SCTP_CID_ABORT) {
+ /* Sec 8.5.1 (B) */
+ if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])
+ && !(sh->vtag == conntrack->proto.sctp.vtag
+ [1 - CTINFO2DIR(ctinfo)])) {
+ write_unlock_bh(&sctp_lock);
+ return -1;
+ }
+ } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) {
+ /* Sec 8.5.1 (C) */
+ if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])
+ && !(sh->vtag == conntrack->proto.sctp.vtag
+ [1 - CTINFO2DIR(ctinfo)]
+ && (sch->flags & 1))) {
+ write_unlock_bh(&sctp_lock);
+ return -1;
+ }
+ } else if (sch->type == SCTP_CID_COOKIE_ECHO) {
+ /* Sec 8.5.1 (D) */
+ if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) {
+ write_unlock_bh(&sctp_lock);
+ return -1;
+ }
+ }
+
+ oldsctpstate = conntrack->proto.sctp.state;
+ newconntrack = new_state(CTINFO2DIR(ctinfo), oldsctpstate, sch->type);
+
+ /* Invalid */
+ if (newconntrack == SCTP_CONNTRACK_MAX) {
+ DEBUGP("nf_conntrack_sctp: Invalid dir=%i ctype=%u conntrack=%u\n",
+ CTINFO2DIR(ctinfo), sch->type, oldsctpstate);
+ write_unlock_bh(&sctp_lock);
+ return -1;
+ }
+
+ /* If it is an INIT or an INIT ACK note down the vtag */
+ if (sch->type == SCTP_CID_INIT
+ || sch->type == SCTP_CID_INIT_ACK) {
+ sctp_inithdr_t _inithdr, *ih;
+
+ ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t),
+ sizeof(_inithdr), &_inithdr);
+ if (ih == NULL) {
+ write_unlock_bh(&sctp_lock);
+ return -1;
+ }
+ DEBUGP("Setting vtag %x for dir %d\n",
+ ih->init_tag, !CTINFO2DIR(ctinfo));
+ conntrack->proto.sctp.vtag[!CTINFO2DIR(ctinfo)] = ih->init_tag;
+ }
+
+ conntrack->proto.sctp.state = newconntrack;
+ if (oldsctpstate != newconntrack)
+ nf_conntrack_event_cache(IPCT_PROTOINFO, skb);
+ write_unlock_bh(&sctp_lock);
+ }
+
+ nf_ct_refresh_acct(conntrack, ctinfo, skb, *sctp_timeouts[newconntrack]);
+
+ if (oldsctpstate == SCTP_CONNTRACK_COOKIE_ECHOED
+ && CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY
+ && newconntrack == SCTP_CONNTRACK_ESTABLISHED) {
+ DEBUGP("Setting assured bit\n");
+ set_bit(IPS_ASSURED_BIT, &conntrack->status);
+ nf_conntrack_event_cache(IPCT_STATUS, skb);
+ }
+
+ return NF_ACCEPT;
+}
+
+/* Called when a new connection for this protocol found. */
+static int sctp_new(struct nf_conn *conntrack, const struct sk_buff *skb,
+ unsigned int dataoff)
+{
+ enum sctp_conntrack newconntrack;
+ sctp_sctphdr_t _sctph, *sh;
+ sctp_chunkhdr_t _sch, *sch;
+ u_int32_t offset, count;
+ char map[256 / sizeof (char)] = {0};
+
+ DEBUGP(__FUNCTION__);
+ DEBUGP("\n");
+
+ sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph);
+ if (sh == NULL)
+ return 0;
+
+ if (do_basic_checks(conntrack, skb, dataoff, map) != 0)
+ return 0;
+
+ /* If an OOTB packet has any of these chunks discard (Sec 8.4) */
+ if ((test_bit (SCTP_CID_ABORT, (void *)map))
+ || (test_bit (SCTP_CID_SHUTDOWN_COMPLETE, (void *)map))
+ || (test_bit (SCTP_CID_COOKIE_ACK, (void *)map))) {
+ return 0;
+ }
+
+ newconntrack = SCTP_CONNTRACK_MAX;
+ for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) {
+ /* Don't need lock here: this conntrack not in circulation yet */
+ newconntrack = new_state(IP_CT_DIR_ORIGINAL,
+ SCTP_CONNTRACK_NONE, sch->type);
+
+ /* Invalid: delete conntrack */
+ if (newconntrack == SCTP_CONNTRACK_MAX) {
+ DEBUGP("nf_conntrack_sctp: invalid new deleting.\n");
+ return 0;
+ }
+
+ /* Copy the vtag into the state info */
+ if (sch->type == SCTP_CID_INIT) {
+ if (sh->vtag == 0) {
+ sctp_inithdr_t _inithdr, *ih;
+
+ ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t),
+ sizeof(_inithdr), &_inithdr);
+ if (ih == NULL)
+ return 0;
+
+ DEBUGP("Setting vtag %x for new conn\n",
+ ih->init_tag);
+
+ conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] =
+ ih->init_tag;
+ } else {
+ /* Sec 8.5.1 (A) */
+ return 0;
+ }
+ }
+ /* If it is a shutdown ack OOTB packet, we expect a return
+ shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */
+ else {
+ DEBUGP("Setting vtag %x for new conn OOTB\n",
+ sh->vtag);
+ conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag;
+ }
+
+ conntrack->proto.sctp.state = newconntrack;
+ }
+
+ return 1;
+}
+
+struct nf_conntrack_protocol nf_conntrack_protocol_sctp4 = {
+ .l3proto = PF_INET,
+ .proto = IPPROTO_SCTP,
+ .name = "sctp",
+ .pkt_to_tuple = sctp_pkt_to_tuple,
+ .invert_tuple = sctp_invert_tuple,
+ .print_tuple = sctp_print_tuple,
+ .print_conntrack = sctp_print_conntrack,
+ .packet = sctp_packet,
+ .new = sctp_new,
+ .destroy = NULL,
+ .me = THIS_MODULE
+};
+
+struct nf_conntrack_protocol nf_conntrack_protocol_sctp6 = {
+ .l3proto = PF_INET6,
+ .proto = IPPROTO_SCTP,
+ .name = "sctp",
+ .pkt_to_tuple = sctp_pkt_to_tuple,
+ .invert_tuple = sctp_invert_tuple,
+ .print_tuple = sctp_print_tuple,
+ .print_conntrack = sctp_print_conntrack,
+ .packet = sctp_packet,
+ .new = sctp_new,
+ .destroy = NULL,
+ .me = THIS_MODULE
+};
+
+#ifdef CONFIG_SYSCTL
+static ctl_table nf_ct_sysctl_table[] = {
+ {
+ .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED,
+ .procname = "nf_conntrack_sctp_timeout_closed",
+ .data = &nf_ct_sctp_timeout_closed,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT,
+ .procname = "nf_conntrack_sctp_timeout_cookie_wait",
+ .data = &nf_ct_sctp_timeout_cookie_wait,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED,
+ .procname = "nf_conntrack_sctp_timeout_cookie_echoed",
+ .data = &nf_ct_sctp_timeout_cookie_echoed,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED,
+ .procname = "nf_conntrack_sctp_timeout_established",
+ .data = &nf_ct_sctp_timeout_established,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT,
+ .procname = "nf_conntrack_sctp_timeout_shutdown_sent",
+ .data = &nf_ct_sctp_timeout_shutdown_sent,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD,
+ .procname = "nf_conntrack_sctp_timeout_shutdown_recd",
+ .data = &nf_ct_sctp_timeout_shutdown_recd,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT,
+ .procname = "nf_conntrack_sctp_timeout_shutdown_ack_sent",
+ .data = &nf_ct_sctp_timeout_shutdown_ack_sent,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ { .ctl_name = 0 }
+};
+
+static ctl_table nf_ct_netfilter_table[] = {
+ {
+ .ctl_name = NET_NETFILTER,
+ .procname = "netfilter",
+ .mode = 0555,
+ .child = nf_ct_sysctl_table,
+ },
+ { .ctl_name = 0 }
+};
+
+static ctl_table nf_ct_net_table[] = {
+ {
+ .ctl_name = CTL_NET,
+ .procname = "net",
+ .mode = 0555,
+ .child = nf_ct_netfilter_table,
+ },
+ { .ctl_name = 0 }
+};
+
+static struct ctl_table_header *nf_ct_sysctl_header;
+#endif
+
+int __init init(void)
+{
+ int ret;
+
+ ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_sctp4);
+ if (ret) {
+ printk("nf_conntrack_proto_sctp4: protocol register failed\n");
+ goto out;
+ }
+ ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_sctp6);
+ if (ret) {
+ printk("nf_conntrack_proto_sctp6: protocol register failed\n");
+ goto cleanup_sctp4;
+ }
+
+#ifdef CONFIG_SYSCTL
+ nf_ct_sysctl_header = register_sysctl_table(nf_ct_net_table, 0);
+ if (nf_ct_sysctl_header == NULL) {
+ printk("nf_conntrack_proto_sctp: can't register to sysctl.\n");
+ goto cleanup;
+ }
+#endif
+
+ return ret;
+
+#ifdef CONFIG_SYSCTL
+ cleanup:
+ nf_conntrack_protocol_unregister(&nf_conntrack_protocol_sctp6);
+#endif
+ cleanup_sctp4:
+ nf_conntrack_protocol_unregister(&nf_conntrack_protocol_sctp4);
+ out:
+ DEBUGP("SCTP conntrack module loading %s\n",
+ ret ? "failed": "succeeded");
+ return ret;
+}
+
+void __exit fini(void)
+{
+ nf_conntrack_protocol_unregister(&nf_conntrack_protocol_sctp6);
+ nf_conntrack_protocol_unregister(&nf_conntrack_protocol_sctp4);
+#ifdef CONFIG_SYSCTL
+ unregister_sysctl_table(nf_ct_sysctl_header);
+#endif
+ DEBUGP("SCTP conntrack module unloaded\n");
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Kiran Kumar Immidi");
+MODULE_DESCRIPTION("Netfilter connection tracking protocol helper for SCTP");
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
new file mode 100644
index 00000000000..5a6fcf349bd
--- /dev/null
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -0,0 +1,1169 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>:
+ * - Real stateful connection tracking
+ * - Modified state transitions table
+ * - Window scaling support added
+ * - SACK support added
+ *
+ * Willy Tarreau:
+ * - State table bugfixes
+ * - More robust state changes
+ * - Tuning timer parameters
+ *
+ * 27 Oct 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
+ * - genelized Layer 3 protocol part.
+ *
+ * Derived from net/ipv4/netfilter/ip_conntrack_proto_tcp.c
+ *
+ * version 2.2
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/ipv6.h>
+#include <net/ip6_checksum.h>
+
+#include <net/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_protocol.h>
+
+#if 0
+#define DEBUGP printk
+#define DEBUGP_VARS
+#else
+#define DEBUGP(format, args...)
+#endif
+
+/* Protects conntrack->proto.tcp */
+static DEFINE_RWLOCK(tcp_lock);
+
+/* "Be conservative in what you do,
+ be liberal in what you accept from others."
+ If it's non-zero, we mark only out of window RST segments as INVALID. */
+int nf_ct_tcp_be_liberal = 0;
+
+/* When connection is picked up from the middle, how many packets are required
+ to pass in each direction when we assume we are in sync - if any side uses
+ window scaling, we lost the game.
+ If it is set to zero, we disable picking up already established
+ connections. */
+int nf_ct_tcp_loose = 3;
+
+/* Max number of the retransmitted packets without receiving an (acceptable)
+ ACK from the destination. If this number is reached, a shorter timer
+ will be started. */
+int nf_ct_tcp_max_retrans = 3;
+
+ /* FIXME: Examine ipfilter's timeouts and conntrack transitions more
+ closely. They're more complex. --RR */
+
+static const char *tcp_conntrack_names[] = {
+ "NONE",
+ "SYN_SENT",
+ "SYN_RECV",
+ "ESTABLISHED",
+ "FIN_WAIT",
+ "CLOSE_WAIT",
+ "LAST_ACK",
+ "TIME_WAIT",
+ "CLOSE",
+ "LISTEN"
+};
+
+#define SECS * HZ
+#define MINS * 60 SECS
+#define HOURS * 60 MINS
+#define DAYS * 24 HOURS
+
+unsigned long nf_ct_tcp_timeout_syn_sent = 2 MINS;
+unsigned long nf_ct_tcp_timeout_syn_recv = 60 SECS;
+unsigned long nf_ct_tcp_timeout_established = 5 DAYS;
+unsigned long nf_ct_tcp_timeout_fin_wait = 2 MINS;
+unsigned long nf_ct_tcp_timeout_close_wait = 60 SECS;
+unsigned long nf_ct_tcp_timeout_last_ack = 30 SECS;
+unsigned long nf_ct_tcp_timeout_time_wait = 2 MINS;
+unsigned long nf_ct_tcp_timeout_close = 10 SECS;
+
+/* RFC1122 says the R2 limit should be at least 100 seconds.
+ Linux uses 15 packets as limit, which corresponds
+ to ~13-30min depending on RTO. */
+unsigned long nf_ct_tcp_timeout_max_retrans = 5 MINS;
+
+static unsigned long * tcp_timeouts[]
+= { NULL, /* TCP_CONNTRACK_NONE */
+ &nf_ct_tcp_timeout_syn_sent, /* TCP_CONNTRACK_SYN_SENT, */
+ &nf_ct_tcp_timeout_syn_recv, /* TCP_CONNTRACK_SYN_RECV, */
+ &nf_ct_tcp_timeout_established, /* TCP_CONNTRACK_ESTABLISHED, */
+ &nf_ct_tcp_timeout_fin_wait, /* TCP_CONNTRACK_FIN_WAIT, */
+ &nf_ct_tcp_timeout_close_wait, /* TCP_CONNTRACK_CLOSE_WAIT, */
+ &nf_ct_tcp_timeout_last_ack, /* TCP_CONNTRACK_LAST_ACK, */
+ &nf_ct_tcp_timeout_time_wait, /* TCP_CONNTRACK_TIME_WAIT, */
+ &nf_ct_tcp_timeout_close, /* TCP_CONNTRACK_CLOSE, */
+ NULL, /* TCP_CONNTRACK_LISTEN */
+ };
+
+#define sNO TCP_CONNTRACK_NONE
+#define sSS TCP_CONNTRACK_SYN_SENT
+#define sSR TCP_CONNTRACK_SYN_RECV
+#define sES TCP_CONNTRACK_ESTABLISHED
+#define sFW TCP_CONNTRACK_FIN_WAIT
+#define sCW TCP_CONNTRACK_CLOSE_WAIT
+#define sLA TCP_CONNTRACK_LAST_ACK
+#define sTW TCP_CONNTRACK_TIME_WAIT
+#define sCL TCP_CONNTRACK_CLOSE
+#define sLI TCP_CONNTRACK_LISTEN
+#define sIV TCP_CONNTRACK_MAX
+#define sIG TCP_CONNTRACK_IGNORE
+
+/* What TCP flags are set from RST/SYN/FIN/ACK. */
+enum tcp_bit_set {
+ TCP_SYN_SET,
+ TCP_SYNACK_SET,
+ TCP_FIN_SET,
+ TCP_ACK_SET,
+ TCP_RST_SET,
+ TCP_NONE_SET,
+};
+
+/*
+ * The TCP state transition table needs a few words...
+ *
+ * We are the man in the middle. All the packets go through us
+ * but might get lost in transit to the destination.
+ * It is assumed that the destinations can't receive segments
+ * we haven't seen.
+ *
+ * The checked segment is in window, but our windows are *not*
+ * equivalent with the ones of the sender/receiver. We always
+ * try to guess the state of the current sender.
+ *
+ * The meaning of the states are:
+ *
+ * NONE: initial state
+ * SYN_SENT: SYN-only packet seen
+ * SYN_RECV: SYN-ACK packet seen
+ * ESTABLISHED: ACK packet seen
+ * FIN_WAIT: FIN packet seen
+ * CLOSE_WAIT: ACK seen (after FIN)
+ * LAST_ACK: FIN seen (after FIN)
+ * TIME_WAIT: last ACK seen
+ * CLOSE: closed connection
+ *
+ * LISTEN state is not used.
+ *
+ * Packets marked as IGNORED (sIG):
+ * if they may be either invalid or valid
+ * and the receiver may send back a connection
+ * closing RST or a SYN/ACK.
+ *
+ * Packets marked as INVALID (sIV):
+ * if they are invalid
+ * or we do not support the request (simultaneous open)
+ */
+static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
+ {
+/* ORIGINAL */
+/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
+/*syn*/ { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sIV },
+/*
+ * sNO -> sSS Initialize a new connection
+ * sSS -> sSS Retransmitted SYN
+ * sSR -> sIG Late retransmitted SYN?
+ * sES -> sIG Error: SYNs in window outside the SYN_SENT state
+ * are errors. Receiver will reply with RST
+ * and close the connection.
+ * Or we are not in sync and hold a dead connection.
+ * sFW -> sIG
+ * sCW -> sIG
+ * sLA -> sIG
+ * sTW -> sSS Reopened connection (RFC 1122).
+ * sCL -> sSS
+ */
+/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
+/*synack*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV },
+/*
+ * A SYN/ACK from the client is always invalid:
+ * - either it tries to set up a simultaneous open, which is
+ * not supported;
+ * - or the firewall has just been inserted between the two hosts
+ * during the session set-up. The SYN will be retransmitted
+ * by the true client (or it'll time out).
+ */
+/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
+/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
+/*
+ * sNO -> sIV Too late and no reason to do anything...
+ * sSS -> sIV Client migth not send FIN in this state:
+ * we enforce waiting for a SYN/ACK reply first.
+ * sSR -> sFW Close started.
+ * sES -> sFW
+ * sFW -> sLA FIN seen in both directions, waiting for
+ * the last ACK.
+ * Migth be a retransmitted FIN as well...
+ * sCW -> sLA
+ * sLA -> sLA Retransmitted FIN. Remain in the same state.
+ * sTW -> sTW
+ * sCL -> sCL
+ */
+/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
+/*ack*/ { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV },
+/*
+ * sNO -> sES Assumed.
+ * sSS -> sIV ACK is invalid: we haven't seen a SYN/ACK yet.
+ * sSR -> sES Established state is reached.
+ * sES -> sES :-)
+ * sFW -> sCW Normal close request answered by ACK.
+ * sCW -> sCW
+ * sLA -> sTW Last ACK detected.
+ * sTW -> sTW Retransmitted last ACK. Remain in the same state.
+ * sCL -> sCL
+ */
+/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
+/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV },
+/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
+ },
+ {
+/* REPLY */
+/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
+/*syn*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV },
+/*
+ * sNO -> sIV Never reached.
+ * sSS -> sIV Simultaneous open, not supported
+ * sSR -> sIV Simultaneous open, not supported.
+ * sES -> sIV Server may not initiate a connection.
+ * sFW -> sIV
+ * sCW -> sIV
+ * sLA -> sIV
+ * sTW -> sIV Reopened connection, but server may not do it.
+ * sCL -> sIV
+ */
+/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
+/*synack*/ { sIV, sSR, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIV },
+/*
+ * sSS -> sSR Standard open.
+ * sSR -> sSR Retransmitted SYN/ACK.
+ * sES -> sIG Late retransmitted SYN/ACK?
+ * sFW -> sIG Might be SYN/ACK answering ignored SYN
+ * sCW -> sIG
+ * sLA -> sIG
+ * sTW -> sIG
+ * sCL -> sIG
+ */
+/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
+/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
+/*
+ * sSS -> sIV Server might not send FIN in this state.
+ * sSR -> sFW Close started.
+ * sES -> sFW
+ * sFW -> sLA FIN seen in both directions.
+ * sCW -> sLA
+ * sLA -> sLA Retransmitted FIN.
+ * sTW -> sTW
+ * sCL -> sCL
+ */
+/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
+/*ack*/ { sIV, sIV, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIV },
+/*
+ * sSS -> sIV Might be a half-open connection.
+ * sSR -> sSR Might answer late resent SYN.
+ * sES -> sES :-)
+ * sFW -> sCW Normal close request answered by ACK.
+ * sCW -> sCW
+ * sLA -> sTW Last ACK detected.
+ * sTW -> sTW Retransmitted last ACK.
+ * sCL -> sCL
+ */
+/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
+/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV },
+/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
+ }
+};
+
+static int tcp_pkt_to_tuple(const struct sk_buff *skb,
+ unsigned int dataoff,
+ struct nf_conntrack_tuple *tuple)
+{
+ struct tcphdr _hdr, *hp;
+
+ /* Actually only need first 8 bytes. */
+ hp = skb_header_pointer(skb, dataoff, 8, &_hdr);
+ if (hp == NULL)
+ return 0;
+
+ tuple->src.u.tcp.port = hp->source;
+ tuple->dst.u.tcp.port = hp->dest;
+
+ return 1;
+}
+
+static int tcp_invert_tuple(struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_tuple *orig)
+{
+ tuple->src.u.tcp.port = orig->dst.u.tcp.port;
+ tuple->dst.u.tcp.port = orig->src.u.tcp.port;
+ return 1;
+}
+
+/* Print out the per-protocol part of the tuple. */
+static int tcp_print_tuple(struct seq_file *s,
+ const struct nf_conntrack_tuple *tuple)
+{
+ return seq_printf(s, "sport=%hu dport=%hu ",
+ ntohs(tuple->src.u.tcp.port),
+ ntohs(tuple->dst.u.tcp.port));
+}
+
+/* Print out the private part of the conntrack. */
+static int tcp_print_conntrack(struct seq_file *s,
+ const struct nf_conn *conntrack)
+{
+ enum tcp_conntrack state;
+
+ read_lock_bh(&tcp_lock);
+ state = conntrack->proto.tcp.state;
+ read_unlock_bh(&tcp_lock);
+
+ return seq_printf(s, "%s ", tcp_conntrack_names[state]);
+}
+
+static unsigned int get_conntrack_index(const struct tcphdr *tcph)
+{
+ if (tcph->rst) return TCP_RST_SET;
+ else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET);
+ else if (tcph->fin) return TCP_FIN_SET;
+ else if (tcph->ack) return TCP_ACK_SET;
+ else return TCP_NONE_SET;
+}
+
+/* TCP connection tracking based on 'Real Stateful TCP Packet Filtering
+ in IP Filter' by Guido van Rooij.
+
+ http://www.nluug.nl/events/sane2000/papers.html
+ http://www.iae.nl/users/guido/papers/tcp_filtering.ps.gz
+
+ The boundaries and the conditions are changed according to RFC793:
+ the packet must intersect the window (i.e. segments may be
+ after the right or before the left edge) and thus receivers may ACK
+ segments after the right edge of the window.
+
+ td_maxend = max(sack + max(win,1)) seen in reply packets
+ td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets
+ td_maxwin += seq + len - sender.td_maxend
+ if seq + len > sender.td_maxend
+ td_end = max(seq + len) seen in sent packets
+
+ I. Upper bound for valid data: seq <= sender.td_maxend
+ II. Lower bound for valid data: seq + len >= sender.td_end - receiver.td_maxwin
+ III. Upper bound for valid ack: sack <= receiver.td_end
+ IV. Lower bound for valid ack: ack >= receiver.td_end - MAXACKWINDOW
+
+ where sack is the highest right edge of sack block found in the packet.
+
+ The upper bound limit for a valid ack is not ignored -
+ we doesn't have to deal with fragments.
+*/
+
+static inline __u32 segment_seq_plus_len(__u32 seq,
+ size_t len,
+ unsigned int dataoff,
+ struct tcphdr *tcph)
+{
+ /* XXX Should I use payload length field in IP/IPv6 header ?
+ * - YK */
+ return (seq + len - dataoff - tcph->doff*4
+ + (tcph->syn ? 1 : 0) + (tcph->fin ? 1 : 0));
+}
+
+/* Fixme: what about big packets? */
+#define MAXACKWINCONST 66000
+#define MAXACKWINDOW(sender) \
+ ((sender)->td_maxwin > MAXACKWINCONST ? (sender)->td_maxwin \
+ : MAXACKWINCONST)
+
+/*
+ * Simplified tcp_parse_options routine from tcp_input.c
+ */
+static void tcp_options(const struct sk_buff *skb,
+ unsigned int dataoff,
+ struct tcphdr *tcph,
+ struct ip_ct_tcp_state *state)
+{
+ unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
+ unsigned char *ptr;
+ int length = (tcph->doff*4) - sizeof(struct tcphdr);
+
+ if (!length)
+ return;
+
+ ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
+ length, buff);
+ BUG_ON(ptr == NULL);
+
+ state->td_scale =
+ state->flags = 0;
+
+ while (length > 0) {
+ int opcode=*ptr++;
+ int opsize;
+
+ switch (opcode) {
+ case TCPOPT_EOL:
+ return;
+ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
+ length--;
+ continue;
+ default:
+ opsize=*ptr++;
+ if (opsize < 2) /* "silly options" */
+ return;
+ if (opsize > length)
+ break; /* don't parse partial options */
+
+ if (opcode == TCPOPT_SACK_PERM
+ && opsize == TCPOLEN_SACK_PERM)
+ state->flags |= IP_CT_TCP_FLAG_SACK_PERM;
+ else if (opcode == TCPOPT_WINDOW
+ && opsize == TCPOLEN_WINDOW) {
+ state->td_scale = *(u_int8_t *)ptr;
+
+ if (state->td_scale > 14) {
+ /* See RFC1323 */
+ state->td_scale = 14;
+ }
+ state->flags |=
+ IP_CT_TCP_FLAG_WINDOW_SCALE;
+ }
+ ptr += opsize - 2;
+ length -= opsize;
+ }
+ }
+}
+
+static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,
+ struct tcphdr *tcph, __u32 *sack)
+{
+ unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
+ unsigned char *ptr;
+ int length = (tcph->doff*4) - sizeof(struct tcphdr);
+ __u32 tmp;
+
+ if (!length)
+ return;
+
+ ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
+ length, buff);
+ BUG_ON(ptr == NULL);
+
+ /* Fast path for timestamp-only option */
+ if (length == TCPOLEN_TSTAMP_ALIGNED*4
+ && *(__u32 *)ptr ==
+ __constant_ntohl((TCPOPT_NOP << 24)
+ | (TCPOPT_NOP << 16)
+ | (TCPOPT_TIMESTAMP << 8)
+ | TCPOLEN_TIMESTAMP))
+ return;
+
+ while (length > 0) {
+ int opcode = *ptr++;
+ int opsize, i;
+
+ switch (opcode) {
+ case TCPOPT_EOL:
+ return;
+ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
+ length--;
+ continue;
+ default:
+ opsize = *ptr++;
+ if (opsize < 2) /* "silly options" */
+ return;
+ if (opsize > length)
+ break; /* don't parse partial options */
+
+ if (opcode == TCPOPT_SACK
+ && opsize >= (TCPOLEN_SACK_BASE
+ + TCPOLEN_SACK_PERBLOCK)
+ && !((opsize - TCPOLEN_SACK_BASE)
+ % TCPOLEN_SACK_PERBLOCK)) {
+ for (i = 0;
+ i < (opsize - TCPOLEN_SACK_BASE);
+ i += TCPOLEN_SACK_PERBLOCK) {
+ memcpy(&tmp, (__u32 *)(ptr + i) + 1,
+ sizeof(__u32));
+ tmp = ntohl(tmp);
+
+ if (after(tmp, *sack))
+ *sack = tmp;
+ }
+ return;
+ }
+ ptr += opsize - 2;
+ length -= opsize;
+ }
+ }
+}
+
+static int tcp_in_window(struct ip_ct_tcp *state,
+ enum ip_conntrack_dir dir,
+ unsigned int index,
+ const struct sk_buff *skb,
+ unsigned int dataoff,
+ struct tcphdr *tcph,
+ int pf)
+{
+ struct ip_ct_tcp_state *sender = &state->seen[dir];
+ struct ip_ct_tcp_state *receiver = &state->seen[!dir];
+ __u32 seq, ack, sack, end, win, swin;
+ int res;
+
+ /*
+ * Get the required data from the packet.
+ */
+ seq = ntohl(tcph->seq);
+ ack = sack = ntohl(tcph->ack_seq);
+ win = ntohs(tcph->window);
+ end = segment_seq_plus_len(seq, skb->len, dataoff, tcph);
+
+ if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM)
+ tcp_sack(skb, dataoff, tcph, &sack);
+
+ DEBUGP("tcp_in_window: START\n");
+ DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
+ "seq=%u ack=%u sack=%u win=%u end=%u\n",
+ NIPQUAD(iph->saddr), ntohs(tcph->source),
+ NIPQUAD(iph->daddr), ntohs(tcph->dest),
+ seq, ack, sack, win, end);
+ DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
+ "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
+ sender->td_end, sender->td_maxend, sender->td_maxwin,
+ sender->td_scale,
+ receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
+ receiver->td_scale);
+
+ if (sender->td_end == 0) {
+ /*
+ * Initialize sender data.
+ */
+ if (tcph->syn && tcph->ack) {
+ /*
+ * Outgoing SYN-ACK in reply to a SYN.
+ */
+ sender->td_end =
+ sender->td_maxend = end;
+ sender->td_maxwin = (win == 0 ? 1 : win);
+
+ tcp_options(skb, dataoff, tcph, sender);
+ /*
+ * RFC 1323:
+ * Both sides must send the Window Scale option
+ * to enable window scaling in either direction.
+ */
+ if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE
+ && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE))
+ sender->td_scale =
+ receiver->td_scale = 0;
+ } else {
+ /*
+ * We are in the middle of a connection,
+ * its history is lost for us.
+ * Let's try to use the data from the packet.
+ */
+ sender->td_end = end;
+ sender->td_maxwin = (win == 0 ? 1 : win);
+ sender->td_maxend = end + sender->td_maxwin;
+ }
+ } else if (((state->state == TCP_CONNTRACK_SYN_SENT
+ && dir == IP_CT_DIR_ORIGINAL)
+ || (state->state == TCP_CONNTRACK_SYN_RECV
+ && dir == IP_CT_DIR_REPLY))
+ && after(end, sender->td_end)) {
+ /*
+ * RFC 793: "if a TCP is reinitialized ... then it need
+ * not wait at all; it must only be sure to use sequence
+ * numbers larger than those recently used."
+ */
+ sender->td_end =
+ sender->td_maxend = end;
+ sender->td_maxwin = (win == 0 ? 1 : win);
+
+ tcp_options(skb, dataoff, tcph, sender);
+ }
+
+ if (!(tcph->ack)) {
+ /*
+ * If there is no ACK, just pretend it was set and OK.
+ */
+ ack = sack = receiver->td_end;
+ } else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) ==
+ (TCP_FLAG_ACK|TCP_FLAG_RST))
+ && (ack == 0)) {
+ /*
+ * Broken TCP stacks, that set ACK in RST packets as well
+ * with zero ack value.
+ */
+ ack = sack = receiver->td_end;
+ }
+
+ if (seq == end
+ && (!tcph->rst
+ || (seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT)))
+ /*
+ * Packets contains no data: we assume it is valid
+ * and check the ack value only.
+ * However RST segments are always validated by their
+ * SEQ number, except when seq == 0 (reset sent answering
+ * SYN.
+ */
+ seq = end = sender->td_end;
+
+ DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
+ "seq=%u ack=%u sack =%u win=%u end=%u\n",
+ NIPQUAD(iph->saddr), ntohs(tcph->source),
+ NIPQUAD(iph->daddr), ntohs(tcph->dest),
+ seq, ack, sack, win, end);
+ DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
+ "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
+ sender->td_end, sender->td_maxend, sender->td_maxwin,
+ sender->td_scale,
+ receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
+ receiver->td_scale);
+
+ DEBUGP("tcp_in_window: I=%i II=%i III=%i IV=%i\n",
+ before(seq, sender->td_maxend + 1),
+ after(end, sender->td_end - receiver->td_maxwin - 1),
+ before(sack, receiver->td_end + 1),
+ after(ack, receiver->td_end - MAXACKWINDOW(sender)));
+
+ if (sender->loose || receiver->loose ||
+ (before(seq, sender->td_maxend + 1) &&
+ after(end, sender->td_end - receiver->td_maxwin - 1) &&
+ before(sack, receiver->td_end + 1) &&
+ after(ack, receiver->td_end - MAXACKWINDOW(sender)))) {
+ /*
+ * Take into account window scaling (RFC 1323).
+ */
+ if (!tcph->syn)
+ win <<= sender->td_scale;
+
+ /*
+ * Update sender data.
+ */
+ swin = win + (sack - ack);
+ if (sender->td_maxwin < swin)
+ sender->td_maxwin = swin;
+ if (after(end, sender->td_end))
+ sender->td_end = end;
+ /*
+ * Update receiver data.
+ */
+ if (after(end, sender->td_maxend))
+ receiver->td_maxwin += end - sender->td_maxend;
+ if (after(sack + win, receiver->td_maxend - 1)) {
+ receiver->td_maxend = sack + win;
+ if (win == 0)
+ receiver->td_maxend++;
+ }
+
+ /*
+ * Check retransmissions.
+ */
+ if (index == TCP_ACK_SET) {
+ if (state->last_dir == dir
+ && state->last_seq == seq
+ && state->last_ack == ack
+ && state->last_end == end)
+ state->retrans++;
+ else {
+ state->last_dir = dir;
+ state->last_seq = seq;
+ state->last_ack = ack;
+ state->last_end = end;
+ state->retrans = 0;
+ }
+ }
+ /*
+ * Close the window of disabled window tracking :-)
+ */
+ if (sender->loose)
+ sender->loose--;
+
+ res = 1;
+ } else {
+ if (LOG_INVALID(IPPROTO_TCP))
+ nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_tcp: %s ",
+ before(seq, sender->td_maxend + 1) ?
+ after(end, sender->td_end - receiver->td_maxwin - 1) ?
+ before(sack, receiver->td_end + 1) ?
+ after(ack, receiver->td_end - MAXACKWINDOW(sender)) ? "BUG"
+ : "ACK is under the lower bound (possible overly delayed ACK)"
+ : "ACK is over the upper bound (ACKed data not seen yet)"
+ : "SEQ is under the lower bound (already ACKed data retransmitted)"
+ : "SEQ is over the upper bound (over the window of the receiver)");
+
+ res = nf_ct_tcp_be_liberal;
+ }
+
+ DEBUGP("tcp_in_window: res=%i sender end=%u maxend=%u maxwin=%u "
+ "receiver end=%u maxend=%u maxwin=%u\n",
+ res, sender->td_end, sender->td_maxend, sender->td_maxwin,
+ receiver->td_end, receiver->td_maxend, receiver->td_maxwin);
+
+ return res;
+}
+
+#ifdef CONFIG_IP_NF_NAT_NEEDED
+/* Update sender->td_end after NAT successfully mangled the packet */
+/* Caller must linearize skb at tcp header. */
+void nf_conntrack_tcp_update(struct sk_buff *skb,
+ unsigned int dataoff,
+ struct nf_conn *conntrack,
+ int dir)
+{
+ struct tcphdr *tcph = (void *)skb->data + dataoff;
+ __u32 end;
+#ifdef DEBUGP_VARS
+ struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[dir];
+ struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[!dir];
+#endif
+
+ end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, dataoff, tcph);
+
+ write_lock_bh(&tcp_lock);
+ /*
+ * We have to worry for the ack in the reply packet only...
+ */
+ if (after(end, conntrack->proto.tcp.seen[dir].td_end))
+ conntrack->proto.tcp.seen[dir].td_end = end;
+ conntrack->proto.tcp.last_end = end;
+ write_unlock_bh(&tcp_lock);
+ DEBUGP("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i "
+ "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
+ sender->td_end, sender->td_maxend, sender->td_maxwin,
+ sender->td_scale,
+ receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
+ receiver->td_scale);
+}
+
+#endif
+
+#define TH_FIN 0x01
+#define TH_SYN 0x02
+#define TH_RST 0x04
+#define TH_PUSH 0x08
+#define TH_ACK 0x10
+#define TH_URG 0x20
+#define TH_ECE 0x40
+#define TH_CWR 0x80
+
+/* table of valid flag combinations - ECE and CWR are always valid */
+static u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] =
+{
+ [TH_SYN] = 1,
+ [TH_SYN|TH_ACK] = 1,
+ [TH_SYN|TH_PUSH] = 1,
+ [TH_SYN|TH_ACK|TH_PUSH] = 1,
+ [TH_RST] = 1,
+ [TH_RST|TH_ACK] = 1,
+ [TH_RST|TH_ACK|TH_PUSH] = 1,
+ [TH_FIN|TH_ACK] = 1,
+ [TH_ACK] = 1,
+ [TH_ACK|TH_PUSH] = 1,
+ [TH_ACK|TH_URG] = 1,
+ [TH_ACK|TH_URG|TH_PUSH] = 1,
+ [TH_FIN|TH_ACK|TH_PUSH] = 1,
+ [TH_FIN|TH_ACK|TH_URG] = 1,
+ [TH_FIN|TH_ACK|TH_URG|TH_PUSH] = 1,
+};
+
+/* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */
+static int tcp_error(struct sk_buff *skb,
+ unsigned int dataoff,
+ enum ip_conntrack_info *ctinfo,
+ int pf,
+ unsigned int hooknum,
+ int(*csum)(const struct sk_buff *,unsigned int))
+{
+ struct tcphdr _tcph, *th;
+ unsigned int tcplen = skb->len - dataoff;
+ u_int8_t tcpflags;
+
+ /* Smaller that minimal TCP header? */
+ th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
+ if (th == NULL) {
+ if (LOG_INVALID(IPPROTO_TCP))
+ nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_tcp: short packet ");
+ return -NF_ACCEPT;
+ }
+
+ /* Not whole TCP header or malformed packet */
+ if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {
+ if (LOG_INVALID(IPPROTO_TCP))
+ nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_tcp: truncated/malformed packet ");
+ return -NF_ACCEPT;
+ }
+
+ /* Checksum invalid? Ignore.
+ * We skip checking packets on the outgoing path
+ * because the semantic of CHECKSUM_HW is different there
+ * and moreover root might send raw packets.
+ */
+ /* FIXME: Source route IP option packets --RR */
+ if (((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) ||
+ (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING))
+ && skb->ip_summed != CHECKSUM_UNNECESSARY
+ && csum(skb, dataoff)) {
+ if (LOG_INVALID(IPPROTO_TCP))
+ nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_tcp: bad TCP checksum ");
+ return -NF_ACCEPT;
+ }
+
+ /* Check TCP flags. */
+ tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR));
+ if (!tcp_valid_flags[tcpflags]) {
+ if (LOG_INVALID(IPPROTO_TCP))
+ nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_tcp: invalid TCP flag combination ");
+ return -NF_ACCEPT;
+ }
+
+ return NF_ACCEPT;
+}
+
+static int csum4(const struct sk_buff *skb, unsigned int dataoff)
+{
+ return csum_tcpudp_magic(skb->nh.iph->saddr, skb->nh.iph->daddr,
+ skb->len - dataoff, IPPROTO_TCP,
+ skb->ip_summed == CHECKSUM_HW ? skb->csum
+ : skb_checksum(skb, dataoff,
+ skb->len - dataoff, 0));
+}
+
+static int csum6(const struct sk_buff *skb, unsigned int dataoff)
+{
+ return csum_ipv6_magic(&skb->nh.ipv6h->saddr, &skb->nh.ipv6h->daddr,
+ skb->len - dataoff, IPPROTO_TCP,
+ skb->ip_summed == CHECKSUM_HW ? skb->csum
+ : skb_checksum(skb, dataoff, skb->len - dataoff,
+ 0));
+}
+
+static int tcp_error4(struct sk_buff *skb,
+ unsigned int dataoff,
+ enum ip_conntrack_info *ctinfo,
+ int pf,
+ unsigned int hooknum)
+{
+ return tcp_error(skb, dataoff, ctinfo, pf, hooknum, csum4);
+}
+
+static int tcp_error6(struct sk_buff *skb,
+ unsigned int dataoff,
+ enum ip_conntrack_info *ctinfo,
+ int pf,
+ unsigned int hooknum)
+{
+ return tcp_error(skb, dataoff, ctinfo, pf, hooknum, csum6);
+}
+
+/* Returns verdict for packet, or -1 for invalid. */
+static int tcp_packet(struct nf_conn *conntrack,
+ const struct sk_buff *skb,
+ unsigned int dataoff,
+ enum ip_conntrack_info ctinfo,
+ int pf,
+ unsigned int hooknum)
+{
+ enum tcp_conntrack new_state, old_state;
+ enum ip_conntrack_dir dir;
+ struct tcphdr *th, _tcph;
+ unsigned long timeout;
+ unsigned int index;
+
+ th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
+ BUG_ON(th == NULL);
+
+ write_lock_bh(&tcp_lock);
+ old_state = conntrack->proto.tcp.state;
+ dir = CTINFO2DIR(ctinfo);
+ index = get_conntrack_index(th);
+ new_state = tcp_conntracks[dir][index][old_state];
+
+ switch (new_state) {
+ case TCP_CONNTRACK_IGNORE:
+ /* Either SYN in ORIGINAL
+ * or SYN/ACK in REPLY. */
+ if (index == TCP_SYNACK_SET
+ && conntrack->proto.tcp.last_index == TCP_SYN_SET
+ && conntrack->proto.tcp.last_dir != dir
+ && ntohl(th->ack_seq) ==
+ conntrack->proto.tcp.last_end) {
+ /* This SYN/ACK acknowledges a SYN that we earlier
+ * ignored as invalid. This means that the client and
+ * the server are both in sync, while the firewall is
+ * not. We kill this session and block the SYN/ACK so
+ * that the client cannot but retransmit its SYN and
+ * thus initiate a clean new session.
+ */
+ write_unlock_bh(&tcp_lock);
+ if (LOG_INVALID(IPPROTO_TCP))
+ nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_tcp: killing out of sync session ");
+ if (del_timer(&conntrack->timeout))
+ conntrack->timeout.function((unsigned long)
+ conntrack);
+ return -NF_DROP;
+ }
+ conntrack->proto.tcp.last_index = index;
+ conntrack->proto.tcp.last_dir = dir;
+ conntrack->proto.tcp.last_seq = ntohl(th->seq);
+ conntrack->proto.tcp.last_end =
+ segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th);
+
+ write_unlock_bh(&tcp_lock);
+ if (LOG_INVALID(IPPROTO_TCP))
+ nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_tcp: invalid packed ignored ");
+ return NF_ACCEPT;
+ case TCP_CONNTRACK_MAX:
+ /* Invalid packet */
+ DEBUGP("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
+ dir, get_conntrack_index(th),
+ old_state);
+ write_unlock_bh(&tcp_lock);
+ if (LOG_INVALID(IPPROTO_TCP))
+ nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_tcp: invalid state ");
+ return -NF_ACCEPT;
+ case TCP_CONNTRACK_SYN_SENT:
+ if (old_state < TCP_CONNTRACK_TIME_WAIT)
+ break;
+ if ((conntrack->proto.tcp.seen[dir].flags &
+ IP_CT_TCP_FLAG_CLOSE_INIT)
+ || after(ntohl(th->seq),
+ conntrack->proto.tcp.seen[dir].td_end)) {
+ /* Attempt to reopen a closed connection.
+ * Delete this connection and look up again. */
+ write_unlock_bh(&tcp_lock);
+ if (del_timer(&conntrack->timeout))
+ conntrack->timeout.function((unsigned long)
+ conntrack);
+ return -NF_REPEAT;
+ } else {
+ write_unlock_bh(&tcp_lock);
+ if (LOG_INVALID(IPPROTO_TCP))
+ nf_log_packet(pf, 0, skb, NULL, NULL,
+ NULL, "nf_ct_tcp: invalid SYN");
+ return -NF_ACCEPT;
+ }
+ case TCP_CONNTRACK_CLOSE:
+ if (index == TCP_RST_SET
+ && test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)
+ && conntrack->proto.tcp.last_index == TCP_SYN_SET
+ && ntohl(th->ack_seq) == conntrack->proto.tcp.last_end) {
+ /* RST sent to invalid SYN we had let trough
+ * SYN was in window then, tear down connection.
+ * We skip window checking, because packet might ACK
+ * segments we ignored in the SYN. */
+ goto in_window;
+ }
+ /* Just fall trough */
+ default:
+ /* Keep compilers happy. */
+ break;
+ }
+
+ if (!tcp_in_window(&conntrack->proto.tcp, dir, index,
+ skb, dataoff, th, pf)) {
+ write_unlock_bh(&tcp_lock);
+ return -NF_ACCEPT;
+ }
+ in_window:
+ /* From now on we have got in-window packets */
+ conntrack->proto.tcp.last_index = index;
+
+ DEBUGP("tcp_conntracks: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
+ "syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n",
+ NIPQUAD(iph->saddr), ntohs(th->source),
+ NIPQUAD(iph->daddr), ntohs(th->dest),
+ (th->syn ? 1 : 0), (th->ack ? 1 : 0),
+ (th->fin ? 1 : 0), (th->rst ? 1 : 0),
+ old_state, new_state);
+
+ conntrack->proto.tcp.state = new_state;
+ if (old_state != new_state
+ && (new_state == TCP_CONNTRACK_FIN_WAIT
+ || new_state == TCP_CONNTRACK_CLOSE))
+ conntrack->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;
+ timeout = conntrack->proto.tcp.retrans >= nf_ct_tcp_max_retrans
+ && *tcp_timeouts[new_state] > nf_ct_tcp_timeout_max_retrans
+ ? nf_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state];
+ write_unlock_bh(&tcp_lock);
+
+ nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb);
+ if (new_state != old_state)
+ nf_conntrack_event_cache(IPCT_PROTOINFO, skb);
+
+ if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) {
+ /* If only reply is a RST, we can consider ourselves not to
+ have an established connection: this is a fairly common
+ problem case, so we can delete the conntrack
+ immediately. --RR */
+ if (th->rst) {
+ if (del_timer(&conntrack->timeout))
+ conntrack->timeout.function((unsigned long)
+ conntrack);
+ return NF_ACCEPT;
+ }
+ } else if (!test_bit(IPS_ASSURED_BIT, &conntrack->status)
+ && (old_state == TCP_CONNTRACK_SYN_RECV
+ || old_state == TCP_CONNTRACK_ESTABLISHED)
+ && new_state == TCP_CONNTRACK_ESTABLISHED) {
+ /* Set ASSURED if we see see valid ack in ESTABLISHED
+ after SYN_RECV or a valid answer for a picked up
+ connection. */
+ set_bit(IPS_ASSURED_BIT, &conntrack->status);
+ nf_conntrack_event_cache(IPCT_STATUS, skb);
+ }
+ nf_ct_refresh_acct(conntrack, ctinfo, skb, timeout);
+
+ return NF_ACCEPT;
+}
+
+/* Called when a new connection for this protocol found. */
+static int tcp_new(struct nf_conn *conntrack,
+ const struct sk_buff *skb,
+ unsigned int dataoff)
+{
+ enum tcp_conntrack new_state;
+ struct tcphdr *th, _tcph;
+#ifdef DEBUGP_VARS
+ struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[0];
+ struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[1];
+#endif
+
+ th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
+ BUG_ON(th == NULL);
+
+ /* Don't need lock here: this conntrack not in circulation yet */
+ new_state
+ = tcp_conntracks[0][get_conntrack_index(th)]
+ [TCP_CONNTRACK_NONE];
+
+ /* Invalid: delete conntrack */
+ if (new_state >= TCP_CONNTRACK_MAX) {
+ DEBUGP("nf_ct_tcp: invalid new deleting.\n");
+ return 0;
+ }
+
+ if (new_state == TCP_CONNTRACK_SYN_SENT) {
+ /* SYN packet */
+ conntrack->proto.tcp.seen[0].td_end =
+ segment_seq_plus_len(ntohl(th->seq), skb->len,
+ dataoff, th);
+ conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
+ if (conntrack->proto.tcp.seen[0].td_maxwin == 0)
+ conntrack->proto.tcp.seen[0].td_maxwin = 1;
+ conntrack->proto.tcp.seen[0].td_maxend =
+ conntrack->proto.tcp.seen[0].td_end;
+
+ tcp_options(skb, dataoff, th, &conntrack->proto.tcp.seen[0]);
+ conntrack->proto.tcp.seen[1].flags = 0;
+ conntrack->proto.tcp.seen[0].loose =
+ conntrack->proto.tcp.seen[1].loose = 0;
+ } else if (nf_ct_tcp_loose == 0) {
+ /* Don't try to pick up connections. */
+ return 0;
+ } else {
+ /*
+ * We are in the middle of a connection,
+ * its history is lost for us.
+ * Let's try to use the data from the packet.
+ */
+ conntrack->proto.tcp.seen[0].td_end =
+ segment_seq_plus_len(ntohl(th->seq), skb->len,
+ dataoff, th);
+ conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
+ if (conntrack->proto.tcp.seen[0].td_maxwin == 0)
+ conntrack->proto.tcp.seen[0].td_maxwin = 1;
+ conntrack->proto.tcp.seen[0].td_maxend =
+ conntrack->proto.tcp.seen[0].td_end +
+ conntrack->proto.tcp.seen[0].td_maxwin;
+ conntrack->proto.tcp.seen[0].td_scale = 0;
+
+ /* We assume SACK. Should we assume window scaling too? */
+ conntrack->proto.tcp.seen[0].flags =
+ conntrack->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM;
+ conntrack->proto.tcp.seen[0].loose =
+ conntrack->proto.tcp.seen[1].loose = nf_ct_tcp_loose;
+ }
+
+ conntrack->proto.tcp.seen[1].td_end = 0;
+ conntrack->proto.tcp.seen[1].td_maxend = 0;
+ conntrack->proto.tcp.seen[1].td_maxwin = 1;
+ conntrack->proto.tcp.seen[1].td_scale = 0;
+
+ /* tcp_packet will set them */
+ conntrack->proto.tcp.state = TCP_CONNTRACK_NONE;
+ conntrack->proto.tcp.last_index = TCP_NONE_SET;
+
+ DEBUGP("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i "
+ "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
+ sender->td_end, sender->td_maxend, sender->td_maxwin,
+ sender->td_scale,
+ receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
+ receiver->td_scale);
+ return 1;
+}
+
+struct nf_conntrack_protocol nf_conntrack_protocol_tcp4 =
+{
+ .l3proto = PF_INET,
+ .proto = IPPROTO_TCP,
+ .name = "tcp",
+ .pkt_to_tuple = tcp_pkt_to_tuple,
+ .invert_tuple = tcp_invert_tuple,
+ .print_tuple = tcp_print_tuple,
+ .print_conntrack = tcp_print_conntrack,
+ .packet = tcp_packet,
+ .new = tcp_new,
+ .error = tcp_error4,
+};
+
+struct nf_conntrack_protocol nf_conntrack_protocol_tcp6 =
+{
+ .l3proto = PF_INET6,
+ .proto = IPPROTO_TCP,
+ .name = "tcp",
+ .pkt_to_tuple = tcp_pkt_to_tuple,
+ .invert_tuple = tcp_invert_tuple,
+ .print_tuple = tcp_print_tuple,
+ .print_conntrack = tcp_print_conntrack,
+ .packet = tcp_packet,
+ .new = tcp_new,
+ .error = tcp_error6,
+};
+
+EXPORT_SYMBOL(nf_conntrack_protocol_tcp4);
+EXPORT_SYMBOL(nf_conntrack_protocol_tcp6);
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
new file mode 100644
index 00000000000..3cae7ce420d
--- /dev/null
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -0,0 +1,216 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
+ * - enable working with Layer 3 protocol independent connection tracking.
+ *
+ * Derived from net/ipv4/netfilter/ip_conntrack_proto_udp.c
+ */
+
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/udp.h>
+#include <linux/seq_file.h>
+#include <linux/skbuff.h>
+#include <linux/ipv6.h>
+#include <net/ip6_checksum.h>
+#include <net/checksum.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <net/netfilter/nf_conntrack_protocol.h>
+
+unsigned long nf_ct_udp_timeout = 30*HZ;
+unsigned long nf_ct_udp_timeout_stream = 180*HZ;
+
+static int udp_pkt_to_tuple(const struct sk_buff *skb,
+ unsigned int dataoff,
+ struct nf_conntrack_tuple *tuple)
+{
+ struct udphdr _hdr, *hp;
+
+ /* Actually only need first 8 bytes. */
+ hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
+ if (hp == NULL)
+ return 0;
+
+ tuple->src.u.udp.port = hp->source;
+ tuple->dst.u.udp.port = hp->dest;
+
+ return 1;
+}
+
+static int udp_invert_tuple(struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_tuple *orig)
+{
+ tuple->src.u.udp.port = orig->dst.u.udp.port;
+ tuple->dst.u.udp.port = orig->src.u.udp.port;
+ return 1;
+}
+
+/* Print out the per-protocol part of the tuple. */
+static int udp_print_tuple(struct seq_file *s,
+ const struct nf_conntrack_tuple *tuple)
+{
+ return seq_printf(s, "sport=%hu dport=%hu ",
+ ntohs(tuple->src.u.udp.port),
+ ntohs(tuple->dst.u.udp.port));
+}
+
+/* Print out the private part of the conntrack. */
+static int udp_print_conntrack(struct seq_file *s,
+ const struct nf_conn *conntrack)
+{
+ return 0;
+}
+
+/* Returns verdict for packet, and may modify conntracktype */
+static int udp_packet(struct nf_conn *conntrack,
+ const struct sk_buff *skb,
+ unsigned int dataoff,
+ enum ip_conntrack_info ctinfo,
+ int pf,
+ unsigned int hooknum)
+{
+ /* If we've seen traffic both ways, this is some kind of UDP
+ stream. Extend timeout. */
+ if (test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) {
+ nf_ct_refresh_acct(conntrack, ctinfo, skb,
+ nf_ct_udp_timeout_stream);
+ /* Also, more likely to be important, and not a probe */
+ if (!test_and_set_bit(IPS_ASSURED_BIT, &conntrack->status))
+ nf_conntrack_event_cache(IPCT_STATUS, skb);
+ } else
+ nf_ct_refresh_acct(conntrack, ctinfo, skb, nf_ct_udp_timeout);
+
+ return NF_ACCEPT;
+}
+
+/* Called when a new connection for this protocol found. */
+static int udp_new(struct nf_conn *conntrack, const struct sk_buff *skb,
+ unsigned int dataoff)
+{
+ return 1;
+}
+
+static int udp_error(struct sk_buff *skb, unsigned int dataoff,
+ enum ip_conntrack_info *ctinfo,
+ int pf,
+ unsigned int hooknum,
+ int (*csum)(const struct sk_buff *, unsigned int))
+{
+ unsigned int udplen = skb->len - dataoff;
+ struct udphdr _hdr, *hdr;
+
+ /* Header is too small? */
+ hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
+ if (hdr == NULL) {
+ if (LOG_INVALID(IPPROTO_UDP))
+ nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_udp: short packet ");
+ return -NF_ACCEPT;
+ }
+
+ /* Truncated/malformed packets */
+ if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) {
+ if (LOG_INVALID(IPPROTO_UDP))
+ nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_udp: truncated/malformed packet ");
+ return -NF_ACCEPT;
+ }
+
+ /* Packet with no checksum */
+ if (!hdr->check)
+ return NF_ACCEPT;
+
+ /* Checksum invalid? Ignore.
+ * We skip checking packets on the outgoing path
+ * because the semantic of CHECKSUM_HW is different there
+ * and moreover root might send raw packets.
+ * FIXME: Source route IP option packets --RR */
+ if (((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) ||
+ (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING))
+ && skb->ip_summed != CHECKSUM_UNNECESSARY
+ && csum(skb, dataoff)) {
+ if (LOG_INVALID(IPPROTO_UDP))
+ nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ "nf_ct_udp: bad UDP checksum ");
+ return -NF_ACCEPT;
+ }
+
+ return NF_ACCEPT;
+}
+
+static int csum4(const struct sk_buff *skb, unsigned int dataoff)
+{
+ return csum_tcpudp_magic(skb->nh.iph->saddr, skb->nh.iph->daddr,
+ skb->len - dataoff, IPPROTO_UDP,
+ skb->ip_summed == CHECKSUM_HW ? skb->csum
+ : skb_checksum(skb, dataoff,
+ skb->len - dataoff, 0));
+}
+
+static int csum6(const struct sk_buff *skb, unsigned int dataoff)
+{
+ return csum_ipv6_magic(&skb->nh.ipv6h->saddr, &skb->nh.ipv6h->daddr,
+ skb->len - dataoff, IPPROTO_UDP,
+ skb->ip_summed == CHECKSUM_HW ? skb->csum
+ : skb_checksum(skb, dataoff, skb->len - dataoff,
+ 0));
+}
+
+static int udp_error4(struct sk_buff *skb,
+ unsigned int dataoff,
+ enum ip_conntrack_info *ctinfo,
+ int pf,
+ unsigned int hooknum)
+{
+ return udp_error(skb, dataoff, ctinfo, pf, hooknum, csum4);
+}
+
+static int udp_error6(struct sk_buff *skb,
+ unsigned int dataoff,
+ enum ip_conntrack_info *ctinfo,
+ int pf,
+ unsigned int hooknum)
+{
+ return udp_error(skb, dataoff, ctinfo, pf, hooknum, csum6);
+}
+
+struct nf_conntrack_protocol nf_conntrack_protocol_udp4 =
+{
+ .l3proto = PF_INET,
+ .proto = IPPROTO_UDP,
+ .name = "udp",
+ .pkt_to_tuple = udp_pkt_to_tuple,
+ .invert_tuple = udp_invert_tuple,
+ .print_tuple = udp_print_tuple,
+ .print_conntrack = udp_print_conntrack,
+ .packet = udp_packet,
+ .new = udp_new,
+ .error = udp_error4,
+};
+
+struct nf_conntrack_protocol nf_conntrack_protocol_udp6 =
+{
+ .l3proto = PF_INET6,
+ .proto = IPPROTO_UDP,
+ .name = "udp",
+ .pkt_to_tuple = udp_pkt_to_tuple,
+ .invert_tuple = udp_invert_tuple,
+ .print_tuple = udp_print_tuple,
+ .print_conntrack = udp_print_conntrack,
+ .packet = udp_packet,
+ .new = udp_new,
+ .error = udp_error6,
+};
+
+EXPORT_SYMBOL(nf_conntrack_protocol_udp4);
+EXPORT_SYMBOL(nf_conntrack_protocol_udp6);
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
new file mode 100644
index 00000000000..5af381f9fe3
--- /dev/null
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -0,0 +1,869 @@
+/* This file contains all the functions required for the standalone
+ nf_conntrack module.
+
+ These are not required by the compatibility layer.
+*/
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
+ * - generalize L3 protocol dependent part.
+ *
+ * Derived from net/ipv4/netfilter/ip_conntrack_standalone.c
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/percpu.h>
+#include <linux/netdevice.h>
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+
+#define ASSERT_READ_LOCK(x)
+#define ASSERT_WRITE_LOCK(x)
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_protocol.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <linux/netfilter_ipv4/listhelp.h>
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+MODULE_LICENSE("GPL");
+
+extern atomic_t nf_conntrack_count;
+DECLARE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat);
+
+static int kill_l3proto(struct nf_conn *i, void *data)
+{
+ return (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num ==
+ ((struct nf_conntrack_l3proto *)data)->l3proto);
+}
+
+static int kill_proto(struct nf_conn *i, void *data)
+{
+ struct nf_conntrack_protocol *proto;
+ proto = (struct nf_conntrack_protocol *)data;
+ return (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum ==
+ proto->proto) &&
+ (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num ==
+ proto->l3proto);
+}
+
+#ifdef CONFIG_PROC_FS
+static int
+print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
+ struct nf_conntrack_l3proto *l3proto,
+ struct nf_conntrack_protocol *proto)
+{
+ return l3proto->print_tuple(s, tuple) || proto->print_tuple(s, tuple);
+}
+
+#ifdef CONFIG_NF_CT_ACCT
+static unsigned int
+seq_print_counters(struct seq_file *s,
+ const struct ip_conntrack_counter *counter)
+{
+ return seq_printf(s, "packets=%llu bytes=%llu ",
+ (unsigned long long)counter->packets,
+ (unsigned long long)counter->bytes);
+}
+#else
+#define seq_print_counters(x, y) 0
+#endif
+
+struct ct_iter_state {
+ unsigned int bucket;
+};
+
+static struct list_head *ct_get_first(struct seq_file *seq)
+{
+ struct ct_iter_state *st = seq->private;
+
+ for (st->bucket = 0;
+ st->bucket < nf_conntrack_htable_size;
+ st->bucket++) {
+ if (!list_empty(&nf_conntrack_hash[st->bucket]))
+ return nf_conntrack_hash[st->bucket].next;
+ }
+ return NULL;
+}
+
+static struct list_head *ct_get_next(struct seq_file *seq, struct list_head *head)
+{
+ struct ct_iter_state *st = seq->private;
+
+ head = head->next;
+ while (head == &nf_conntrack_hash[st->bucket]) {
+ if (++st->bucket >= nf_conntrack_htable_size)
+ return NULL;
+ head = nf_conntrack_hash[st->bucket].next;
+ }
+ return head;
+}
+
+static struct list_head *ct_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct list_head *head = ct_get_first(seq);
+
+ if (head)
+ while (pos && (head = ct_get_next(seq, head)))
+ pos--;
+ return pos ? NULL : head;
+}
+
+static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ read_lock_bh(&nf_conntrack_lock);
+ return ct_get_idx(seq, *pos);
+}
+
+static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ (*pos)++;
+ return ct_get_next(s, v);
+}
+
+static void ct_seq_stop(struct seq_file *s, void *v)
+{
+ read_unlock_bh(&nf_conntrack_lock);
+}
+
+/* return 0 on success, 1 in case of error */
+static int ct_seq_show(struct seq_file *s, void *v)
+{
+ const struct nf_conntrack_tuple_hash *hash = v;
+ const struct nf_conn *conntrack = nf_ct_tuplehash_to_ctrack(hash);
+ struct nf_conntrack_l3proto *l3proto;
+ struct nf_conntrack_protocol *proto;
+
+ ASSERT_READ_LOCK(&nf_conntrack_lock);
+ NF_CT_ASSERT(conntrack);
+
+ /* we only want to print DIR_ORIGINAL */
+ if (NF_CT_DIRECTION(hash))
+ return 0;
+
+ l3proto = nf_ct_find_l3proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
+ .tuple.src.l3num);
+
+ NF_CT_ASSERT(l3proto);
+ proto = nf_ct_find_proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
+ .tuple.src.l3num,
+ conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
+ .tuple.dst.protonum);
+ NF_CT_ASSERT(proto);
+
+ if (seq_printf(s, "%-8s %u %-8s %u %ld ",
+ l3proto->name,
+ conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num,
+ proto->name,
+ conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum,
+ timer_pending(&conntrack->timeout)
+ ? (long)(conntrack->timeout.expires - jiffies)/HZ : 0) != 0)
+ return -ENOSPC;
+
+ if (l3proto->print_conntrack(s, conntrack))
+ return -ENOSPC;
+
+ if (proto->print_conntrack(s, conntrack))
+ return -ENOSPC;
+
+ if (print_tuple(s, &conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ l3proto, proto))
+ return -ENOSPC;
+
+ if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_ORIGINAL]))
+ return -ENOSPC;
+
+ if (!(test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)))
+ if (seq_printf(s, "[UNREPLIED] "))
+ return -ENOSPC;
+
+ if (print_tuple(s, &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple,
+ l3proto, proto))
+ return -ENOSPC;
+
+ if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_REPLY]))
+ return -ENOSPC;
+
+ if (test_bit(IPS_ASSURED_BIT, &conntrack->status))
+ if (seq_printf(s, "[ASSURED] "))
+ return -ENOSPC;
+
+#if defined(CONFIG_NF_CONNTRACK_MARK)
+ if (seq_printf(s, "mark=%u ", conntrack->mark))
+ return -ENOSPC;
+#endif
+
+ if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use)))
+ return -ENOSPC;
+
+ return 0;
+}
+
+static struct seq_operations ct_seq_ops = {
+ .start = ct_seq_start,
+ .next = ct_seq_next,
+ .stop = ct_seq_stop,
+ .show = ct_seq_show
+};
+
+static int ct_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ struct ct_iter_state *st;
+ int ret;
+
+ st = kmalloc(sizeof(struct ct_iter_state), GFP_KERNEL);
+ if (st == NULL)
+ return -ENOMEM;
+ ret = seq_open(file, &ct_seq_ops);
+ if (ret)
+ goto out_free;
+ seq = file->private_data;
+ seq->private = st;
+ memset(st, 0, sizeof(struct ct_iter_state));
+ return ret;
+out_free:
+ kfree(st);
+ return ret;
+}
+
+static struct file_operations ct_file_ops = {
+ .owner = THIS_MODULE,
+ .open = ct_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+/* expects */
+static void *exp_seq_start(struct seq_file *s, loff_t *pos)
+{
+ struct list_head *e = &nf_conntrack_expect_list;
+ loff_t i;
+
+ /* strange seq_file api calls stop even if we fail,
+ * thus we need to grab lock since stop unlocks */
+ read_lock_bh(&nf_conntrack_lock);
+
+ if (list_empty(e))
+ return NULL;
+
+ for (i = 0; i <= *pos; i++) {
+ e = e->next;
+ if (e == &nf_conntrack_expect_list)
+ return NULL;
+ }
+ return e;
+}
+
+static void *exp_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ struct list_head *e = v;
+
+ ++*pos;
+ e = e->next;
+
+ if (e == &nf_conntrack_expect_list)
+ return NULL;
+
+ return e;
+}
+
+static void exp_seq_stop(struct seq_file *s, void *v)
+{
+ read_unlock_bh(&nf_conntrack_lock);
+}
+
+static int exp_seq_show(struct seq_file *s, void *v)
+{
+ struct nf_conntrack_expect *expect = v;
+
+ if (expect->timeout.function)
+ seq_printf(s, "%ld ", timer_pending(&expect->timeout)
+ ? (long)(expect->timeout.expires - jiffies)/HZ : 0);
+ else
+ seq_printf(s, "- ");
+ seq_printf(s, "l3proto = %u proto=%u ",
+ expect->tuple.src.l3num,
+ expect->tuple.dst.protonum);
+ print_tuple(s, &expect->tuple,
+ nf_ct_find_l3proto(expect->tuple.src.l3num),
+ nf_ct_find_proto(expect->tuple.src.l3num,
+ expect->tuple.dst.protonum));
+ return seq_putc(s, '\n');
+}
+
+static struct seq_operations exp_seq_ops = {
+ .start = exp_seq_start,
+ .next = exp_seq_next,
+ .stop = exp_seq_stop,
+ .show = exp_seq_show
+};
+
+static int exp_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &exp_seq_ops);
+}
+
+static struct file_operations exp_file_ops = {
+ .owner = THIS_MODULE,
+ .open = exp_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release
+};
+
+static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ int cpu;
+
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+
+ for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
+ if (!cpu_possible(cpu))
+ continue;
+ *pos = cpu + 1;
+ return &per_cpu(nf_conntrack_stat, cpu);
+ }
+
+ return NULL;
+}
+
+static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ int cpu;
+
+ for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
+ if (!cpu_possible(cpu))
+ continue;
+ *pos = cpu + 1;
+ return &per_cpu(nf_conntrack_stat, cpu);
+ }
+
+ return NULL;
+}
+
+static void ct_cpu_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static int ct_cpu_seq_show(struct seq_file *seq, void *v)
+{
+ unsigned int nr_conntracks = atomic_read(&nf_conntrack_count);
+ struct ip_conntrack_stat *st = v;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_printf(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete\n");
+ return 0;
+ }
+
+ seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
+ "%08x %08x %08x %08x %08x %08x %08x %08x \n",
+ nr_conntracks,
+ st->searched,
+ st->found,
+ st->new,
+ st->invalid,
+ st->ignore,
+ st->delete,
+ st->delete_list,
+ st->insert,
+ st->insert_failed,
+ st->drop,
+ st->early_drop,
+ st->error,
+
+ st->expect_new,
+ st->expect_create,
+ st->expect_delete
+ );
+ return 0;
+}
+
+static struct seq_operations ct_cpu_seq_ops = {
+ .start = ct_cpu_seq_start,
+ .next = ct_cpu_seq_next,
+ .stop = ct_cpu_seq_stop,
+ .show = ct_cpu_seq_show,
+};
+
+static int ct_cpu_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &ct_cpu_seq_ops);
+}
+
+static struct file_operations ct_cpu_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = ct_cpu_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+#endif /* CONFIG_PROC_FS */
+
+/* Sysctl support */
+
+#ifdef CONFIG_SYSCTL
+
+/* From nf_conntrack_core.c */
+extern int nf_conntrack_max;
+extern unsigned int nf_conntrack_htable_size;
+
+/* From nf_conntrack_proto_tcp.c */
+extern unsigned long nf_ct_tcp_timeout_syn_sent;
+extern unsigned long nf_ct_tcp_timeout_syn_recv;
+extern unsigned long nf_ct_tcp_timeout_established;
+extern unsigned long nf_ct_tcp_timeout_fin_wait;
+extern unsigned long nf_ct_tcp_timeout_close_wait;
+extern unsigned long nf_ct_tcp_timeout_last_ack;
+extern unsigned long nf_ct_tcp_timeout_time_wait;
+extern unsigned long nf_ct_tcp_timeout_close;
+extern unsigned long nf_ct_tcp_timeout_max_retrans;
+extern int nf_ct_tcp_loose;
+extern int nf_ct_tcp_be_liberal;
+extern int nf_ct_tcp_max_retrans;
+
+/* From nf_conntrack_proto_udp.c */
+extern unsigned long nf_ct_udp_timeout;
+extern unsigned long nf_ct_udp_timeout_stream;
+
+/* From nf_conntrack_proto_generic.c */
+extern unsigned long nf_ct_generic_timeout;
+
+/* Log invalid packets of a given protocol */
+static int log_invalid_proto_min = 0;
+static int log_invalid_proto_max = 255;
+
+static struct ctl_table_header *nf_ct_sysctl_header;
+
+static ctl_table nf_ct_sysctl_table[] = {
+ {
+ .ctl_name = NET_NF_CONNTRACK_MAX,
+ .procname = "nf_conntrack_max",
+ .data = &nf_conntrack_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_NF_CONNTRACK_COUNT,
+ .procname = "nf_conntrack_count",
+ .data = &nf_conntrack_count,
+ .maxlen = sizeof(int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_NF_CONNTRACK_BUCKETS,
+ .procname = "nf_conntrack_buckets",
+ .data = &nf_conntrack_htable_size,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0444,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT,
+ .procname = "nf_conntrack_tcp_timeout_syn_sent",
+ .data = &nf_ct_tcp_timeout_syn_sent,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV,
+ .procname = "nf_conntrack_tcp_timeout_syn_recv",
+ .data = &nf_ct_tcp_timeout_syn_recv,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED,
+ .procname = "nf_conntrack_tcp_timeout_established",
+ .data = &nf_ct_tcp_timeout_established,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT,
+ .procname = "nf_conntrack_tcp_timeout_fin_wait",
+ .data = &nf_ct_tcp_timeout_fin_wait,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT,
+ .procname = "nf_conntrack_tcp_timeout_close_wait",
+ .data = &nf_ct_tcp_timeout_close_wait,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK,
+ .procname = "nf_conntrack_tcp_timeout_last_ack",
+ .data = &nf_ct_tcp_timeout_last_ack,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT,
+ .procname = "nf_conntrack_tcp_timeout_time_wait",
+ .data = &nf_ct_tcp_timeout_time_wait,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE,
+ .procname = "nf_conntrack_tcp_timeout_close",
+ .data = &nf_ct_tcp_timeout_close,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_NF_CONNTRACK_UDP_TIMEOUT,
+ .procname = "nf_conntrack_udp_timeout",
+ .data = &nf_ct_udp_timeout,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM,
+ .procname = "nf_conntrack_udp_timeout_stream",
+ .data = &nf_ct_udp_timeout_stream,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_NF_CONNTRACK_GENERIC_TIMEOUT,
+ .procname = "nf_conntrack_generic_timeout",
+ .data = &nf_ct_generic_timeout,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_NF_CONNTRACK_LOG_INVALID,
+ .procname = "nf_conntrack_log_invalid",
+ .data = &nf_ct_log_invalid,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &log_invalid_proto_min,
+ .extra2 = &log_invalid_proto_max,
+ },
+ {
+ .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS,
+ .procname = "nf_conntrack_tcp_timeout_max_retrans",
+ .data = &nf_ct_tcp_timeout_max_retrans,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ },
+ {
+ .ctl_name = NET_NF_CONNTRACK_TCP_LOOSE,
+ .procname = "nf_conntrack_tcp_loose",
+ .data = &nf_ct_tcp_loose,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_NF_CONNTRACK_TCP_BE_LIBERAL,
+ .procname = "nf_conntrack_tcp_be_liberal",
+ .data = &nf_ct_tcp_be_liberal,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_NF_CONNTRACK_TCP_MAX_RETRANS,
+ .procname = "nf_conntrack_tcp_max_retrans",
+ .data = &nf_ct_tcp_max_retrans,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+
+ { .ctl_name = 0 }
+};
+
+#define NET_NF_CONNTRACK_MAX 2089
+
+static ctl_table nf_ct_netfilter_table[] = {
+ {
+ .ctl_name = NET_NETFILTER,
+ .procname = "netfilter",
+ .mode = 0555,
+ .child = nf_ct_sysctl_table,
+ },
+ {
+ .ctl_name = NET_NF_CONNTRACK_MAX,
+ .procname = "nf_conntrack_max",
+ .data = &nf_conntrack_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ { .ctl_name = 0 }
+};
+
+static ctl_table nf_ct_net_table[] = {
+ {
+ .ctl_name = CTL_NET,
+ .procname = "net",
+ .mode = 0555,
+ .child = nf_ct_netfilter_table,
+ },
+ { .ctl_name = 0 }
+};
+EXPORT_SYMBOL(nf_ct_log_invalid);
+#endif /* CONFIG_SYSCTL */
+
+static int init_or_cleanup(int init)
+{
+#ifdef CONFIG_PROC_FS
+ struct proc_dir_entry *proc, *proc_exp, *proc_stat;
+#endif
+ int ret = 0;
+
+ if (!init) goto cleanup;
+
+ ret = nf_conntrack_init();
+ if (ret < 0)
+ goto cleanup_nothing;
+
+#ifdef CONFIG_PROC_FS
+ proc = proc_net_fops_create("nf_conntrack", 0440, &ct_file_ops);
+ if (!proc) goto cleanup_init;
+
+ proc_exp = proc_net_fops_create("nf_conntrack_expect", 0440,
+ &exp_file_ops);
+ if (!proc_exp) goto cleanup_proc;
+
+ proc_stat = create_proc_entry("nf_conntrack", S_IRUGO, proc_net_stat);
+ if (!proc_stat)
+ goto cleanup_proc_exp;
+
+ proc_stat->proc_fops = &ct_cpu_seq_fops;
+ proc_stat->owner = THIS_MODULE;
+#endif
+#ifdef CONFIG_SYSCTL
+ nf_ct_sysctl_header = register_sysctl_table(nf_ct_net_table, 0);
+ if (nf_ct_sysctl_header == NULL) {
+ printk("nf_conntrack: can't register to sysctl.\n");
+ ret = -ENOMEM;
+ goto cleanup_proc_stat;
+ }
+#endif
+
+ return ret;
+
+ cleanup:
+#ifdef CONFIG_SYSCTL
+ unregister_sysctl_table(nf_ct_sysctl_header);
+ cleanup_proc_stat:
+#endif
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("nf_conntrack", proc_net_stat);
+ cleanup_proc_exp:
+ proc_net_remove("nf_conntrack_expect");
+ cleanup_proc:
+ proc_net_remove("nf_conntrack");
+ cleanup_init:
+#endif /* CNFIG_PROC_FS */
+ nf_conntrack_cleanup();
+ cleanup_nothing:
+ return ret;
+}
+
+int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto)
+{
+ int ret = 0;
+
+ write_lock_bh(&nf_conntrack_lock);
+ if (nf_ct_l3protos[proto->l3proto] != &nf_conntrack_generic_l3proto) {
+ ret = -EBUSY;
+ goto out;
+ }
+ nf_ct_l3protos[proto->l3proto] = proto;
+out:
+ write_unlock_bh(&nf_conntrack_lock);
+
+ return ret;
+}
+
+void nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto)
+{
+ write_lock_bh(&nf_conntrack_lock);
+ nf_ct_l3protos[proto->l3proto] = &nf_conntrack_generic_l3proto;
+ write_unlock_bh(&nf_conntrack_lock);
+
+ /* Somebody could be still looking at the proto in bh. */
+ synchronize_net();
+
+ /* Remove all contrack entries for this protocol */
+ nf_ct_iterate_cleanup(kill_l3proto, proto);
+}
+
+/* FIXME: Allow NULL functions and sub in pointers to generic for
+ them. --RR */
+int nf_conntrack_protocol_register(struct nf_conntrack_protocol *proto)
+{
+ int ret = 0;
+
+retry:
+ write_lock_bh(&nf_conntrack_lock);
+ if (nf_ct_protos[proto->l3proto]) {
+ if (nf_ct_protos[proto->l3proto][proto->proto]
+ != &nf_conntrack_generic_protocol) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+ } else {
+ /* l3proto may be loaded latter. */
+ struct nf_conntrack_protocol **proto_array;
+ int i;
+
+ write_unlock_bh(&nf_conntrack_lock);
+
+ proto_array = (struct nf_conntrack_protocol **)
+ kmalloc(MAX_NF_CT_PROTO *
+ sizeof(struct nf_conntrack_protocol *),
+ GFP_KERNEL);
+ if (proto_array == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ for (i = 0; i < MAX_NF_CT_PROTO; i++)
+ proto_array[i] = &nf_conntrack_generic_protocol;
+
+ write_lock_bh(&nf_conntrack_lock);
+ if (nf_ct_protos[proto->l3proto]) {
+ /* bad timing, but no problem */
+ write_unlock_bh(&nf_conntrack_lock);
+ kfree(proto_array);
+ } else {
+ nf_ct_protos[proto->l3proto] = proto_array;
+ write_unlock_bh(&nf_conntrack_lock);
+ }
+
+ /*
+ * Just once because array is never freed until unloading
+ * nf_conntrack.ko
+ */
+ goto retry;
+ }
+
+ nf_ct_protos[proto->l3proto][proto->proto] = proto;
+
+out_unlock:
+ write_unlock_bh(&nf_conntrack_lock);
+out:
+ return ret;
+}
+
+void nf_conntrack_protocol_unregister(struct nf_conntrack_protocol *proto)
+{
+ write_lock_bh(&nf_conntrack_lock);
+ nf_ct_protos[proto->l3proto][proto->proto]
+ = &nf_conntrack_generic_protocol;
+ write_unlock_bh(&nf_conntrack_lock);
+
+ /* Somebody could be still looking at the proto in bh. */
+ synchronize_net();
+
+ /* Remove all contrack entries for this protocol */
+ nf_ct_iterate_cleanup(kill_proto, proto);
+}
+
+static int __init init(void)
+{
+ return init_or_cleanup(1);
+}
+
+static void __exit fini(void)
+{
+ init_or_cleanup(0);
+}
+
+module_init(init);
+module_exit(fini);
+
+/* Some modules need us, but don't depend directly on any symbol.
+ They should call this. */
+void need_nf_conntrack(void)
+{
+}
+
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+EXPORT_SYMBOL_GPL(nf_conntrack_chain);
+EXPORT_SYMBOL_GPL(nf_conntrack_expect_chain);
+EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier);
+EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier);
+EXPORT_SYMBOL_GPL(__nf_ct_event_cache_init);
+EXPORT_PER_CPU_SYMBOL_GPL(nf_conntrack_ecache);
+EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events);
+#endif
+EXPORT_SYMBOL(nf_conntrack_l3proto_register);
+EXPORT_SYMBOL(nf_conntrack_l3proto_unregister);
+EXPORT_SYMBOL(nf_conntrack_protocol_register);
+EXPORT_SYMBOL(nf_conntrack_protocol_unregister);
+EXPORT_SYMBOL(nf_ct_invert_tuplepr);
+EXPORT_SYMBOL(nf_conntrack_alter_reply);
+EXPORT_SYMBOL(nf_conntrack_destroyed);
+EXPORT_SYMBOL(need_nf_conntrack);
+EXPORT_SYMBOL(nf_conntrack_helper_register);
+EXPORT_SYMBOL(nf_conntrack_helper_unregister);
+EXPORT_SYMBOL(nf_ct_iterate_cleanup);
+EXPORT_SYMBOL(__nf_ct_refresh_acct);
+EXPORT_SYMBOL(nf_ct_protos);
+EXPORT_SYMBOL(nf_ct_find_proto);
+EXPORT_SYMBOL(nf_ct_l3protos);
+EXPORT_SYMBOL(nf_conntrack_expect_alloc);
+EXPORT_SYMBOL(nf_conntrack_expect_put);
+EXPORT_SYMBOL(nf_conntrack_expect_related);
+EXPORT_SYMBOL(nf_conntrack_unexpect_related);
+EXPORT_SYMBOL(nf_conntrack_tuple_taken);
+EXPORT_SYMBOL(nf_conntrack_htable_size);
+EXPORT_SYMBOL(nf_conntrack_lock);
+EXPORT_SYMBOL(nf_conntrack_hash);
+EXPORT_SYMBOL(nf_conntrack_untracked);
+EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
+#ifdef CONFIG_IP_NF_NAT_NEEDED
+EXPORT_SYMBOL(nf_conntrack_tcp_update);
+#endif
+EXPORT_SYMBOL(__nf_conntrack_confirm);
+EXPORT_SYMBOL(nf_ct_get_tuple);
+EXPORT_SYMBOL(nf_ct_invert_tuple);
+EXPORT_SYMBOL(nf_conntrack_in);
+EXPORT_SYMBOL(__nf_conntrack_attach);
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index d10d552d9c4..d3a4f30a7f2 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -117,7 +117,7 @@ int nf_queue(struct sk_buff **skb,
/* QUEUE == DROP if noone is waiting, to be safe. */
read_lock(&queue_handler_lock);
- if (!queue_handler[pf]->outfn) {
+ if (!queue_handler[pf] || !queue_handler[pf]->outfn) {
read_unlock(&queue_handler_lock);
kfree_skb(*skb);
return 1;
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 4bc27a6334c..a60c59b9763 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -128,7 +128,7 @@ void __nfa_fill(struct sk_buff *skb, int attrtype, int attrlen,
memset(NFA_DATA(nfa) + attrlen, 0, NFA_ALIGN(size) - size);
}
-int nfattr_parse(struct nfattr *tb[], int maxattr, struct nfattr *nfa, int len)
+void nfattr_parse(struct nfattr *tb[], int maxattr, struct nfattr *nfa, int len)
{
memset(tb, 0, sizeof(struct nfattr *) * maxattr);
@@ -138,8 +138,6 @@ int nfattr_parse(struct nfattr *tb[], int maxattr, struct nfattr *nfa, int len)
tb[flavor-1] = nfa;
nfa = NFA_NEXT(nfa, len);
}
-
- return 0;
}
/**
@@ -225,6 +223,12 @@ static inline int nfnetlink_rcv_msg(struct sk_buff *skb,
NFNL_SUBSYS_ID(nlh->nlmsg_type),
NFNL_MSG_TYPE(nlh->nlmsg_type));
+ if (!cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN)) {
+ DEBUGP("missing CAP_NET_ADMIN\n");
+ *errp = -EPERM;
+ return -1;
+ }
+
/* Only requests are handled by kernel now. */
if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) {
DEBUGP("received non-request message\n");
@@ -250,7 +254,7 @@ static inline int nfnetlink_rcv_msg(struct sk_buff *skb,
ss = nfnetlink_get_subsys(type);
if (!ss)
#endif
- goto err_inval;
+ goto err_inval;
}
nc = nfnetlink_find_client(type, ss);
@@ -259,13 +263,6 @@ static inline int nfnetlink_rcv_msg(struct sk_buff *skb,
goto err_inval;
}
- if (nc->cap_required &&
- !cap_raised(NETLINK_CB(skb).eff_cap, nc->cap_required)) {
- DEBUGP("permission denied for type %d\n", type);
- *errp = -EPERM;
- return -1;
- }
-
{
u_int16_t attr_count =
ss->cb[NFNL_MSG_TYPE(nlh->nlmsg_type)].attr_count;
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index efcd10f996b..cba63729313 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -146,11 +146,10 @@ instance_create(u_int16_t group_num, int pid)
goto out_unlock;
}
- inst = kmalloc(sizeof(*inst), GFP_ATOMIC);
+ inst = kzalloc(sizeof(*inst), GFP_ATOMIC);
if (!inst)
goto out_unlock;
- memset(inst, 0, sizeof(*inst));
INIT_HLIST_NODE(&inst->hlist);
inst->lock = SPIN_LOCK_UNLOCKED;
/* needs to be two, since we _put() after creation */
@@ -863,11 +862,9 @@ out_put:
static struct nfnl_callback nfulnl_cb[NFULNL_MSG_MAX] = {
[NFULNL_MSG_PACKET] = { .call = nfulnl_recv_unsupp,
- .attr_count = NFULA_MAX,
- .cap_required = CAP_NET_ADMIN, },
+ .attr_count = NFULA_MAX, },
[NFULNL_MSG_CONFIG] = { .call = nfulnl_recv_config,
- .attr_count = NFULA_CFG_MAX,
- .cap_required = CAP_NET_ADMIN },
+ .attr_count = NFULA_CFG_MAX, },
};
static struct nfnetlink_subsystem nfulnl_subsys = {
@@ -962,10 +959,9 @@ static int nful_open(struct inode *inode, struct file *file)
struct iter_state *is;
int ret;
- is = kmalloc(sizeof(*is), GFP_KERNEL);
+ is = kzalloc(sizeof(*is), GFP_KERNEL);
if (!is)
return -ENOMEM;
- memset(is, 0, sizeof(*is));
ret = seq_open(file, &nful_seq_ops);
if (ret < 0)
goto out_free;
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index eaa44c49567..f28460b61e4 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -136,11 +136,10 @@ instance_create(u_int16_t queue_num, int pid)
goto out_unlock;
}
- inst = kmalloc(sizeof(*inst), GFP_ATOMIC);
+ inst = kzalloc(sizeof(*inst), GFP_ATOMIC);
if (!inst)
goto out_unlock;
- memset(inst, 0, sizeof(*inst));
inst->queue_num = queue_num;
inst->peer_pid = pid;
inst->queue_maxlen = NFQNL_QMAX_DEFAULT;
@@ -932,14 +931,11 @@ out_put:
static struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = {
[NFQNL_MSG_PACKET] = { .call = nfqnl_recv_unsupp,
- .attr_count = NFQA_MAX,
- .cap_required = CAP_NET_ADMIN },
+ .attr_count = NFQA_MAX, },
[NFQNL_MSG_VERDICT] = { .call = nfqnl_recv_verdict,
- .attr_count = NFQA_MAX,
- .cap_required = CAP_NET_ADMIN },
+ .attr_count = NFQA_MAX, },
[NFQNL_MSG_CONFIG] = { .call = nfqnl_recv_config,
- .attr_count = NFQA_CFG_MAX,
- .cap_required = CAP_NET_ADMIN },
+ .attr_count = NFQA_CFG_MAX, },
};
static struct nfnetlink_subsystem nfqnl_subsys = {
@@ -1036,10 +1032,9 @@ static int nfqnl_open(struct inode *inode, struct file *file)
struct iter_state *is;
int ret;
- is = kmalloc(sizeof(*is), GFP_KERNEL);
+ is = kzalloc(sizeof(*is), GFP_KERNEL);
if (!is)
return -ENOMEM;
- memset(is, 0, sizeof(*is));
ret = seq_open(file, &nfqnl_seq_ops);
if (ret < 0)
goto out_free;
diff --git a/net/netlink/Makefile b/net/netlink/Makefile
index 39d9c2dcd03..e3589c2de49 100644
--- a/net/netlink/Makefile
+++ b/net/netlink/Makefile
@@ -2,4 +2,4 @@
# Makefile for the netlink driver.
#
-obj-y := af_netlink.o
+obj-y := af_netlink.o attr.o genetlink.o
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 5ca283537bc..8c38ee6d255 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -58,6 +58,7 @@
#include <net/sock.h>
#include <net/scm.h>
+#include <net/netlink.h>
#define Nprintk(a...)
#define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8)
@@ -427,7 +428,8 @@ static int netlink_release(struct socket *sock)
spin_lock(&nlk->cb_lock);
if (nlk->cb) {
- nlk->cb->done(nlk->cb);
+ if (nlk->cb->done)
+ nlk->cb->done(nlk->cb);
netlink_destroy_callback(nlk->cb);
nlk->cb = NULL;
}
@@ -1322,7 +1324,8 @@ static int netlink_dump(struct sock *sk)
skb_queue_tail(&sk->sk_receive_queue, skb);
sk->sk_data_ready(sk, skb->len);
- cb->done(cb);
+ if (cb->done)
+ cb->done(cb);
nlk->cb = NULL;
spin_unlock(&nlk->cb_lock);
@@ -1409,6 +1412,94 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err)
netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
}
+static int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *,
+ struct nlmsghdr *, int *))
+{
+ unsigned int total_len;
+ struct nlmsghdr *nlh;
+ int err;
+
+ while (skb->len >= nlmsg_total_size(0)) {
+ nlh = (struct nlmsghdr *) skb->data;
+
+ if (skb->len < nlh->nlmsg_len)
+ return 0;
+
+ total_len = min(NLMSG_ALIGN(nlh->nlmsg_len), skb->len);
+
+ if (cb(skb, nlh, &err) < 0) {
+ /* Not an error, but we have to interrupt processing
+ * here. Note: that in this case we do not pull
+ * message from skb, it will be processed later.
+ */
+ if (err == 0)
+ return -1;
+ netlink_ack(skb, nlh, err);
+ } else if (nlh->nlmsg_flags & NLM_F_ACK)
+ netlink_ack(skb, nlh, 0);
+
+ skb_pull(skb, total_len);
+ }
+
+ return 0;
+}
+
+/**
+ * nelink_run_queue - Process netlink receive queue.
+ * @sk: Netlink socket containing the queue
+ * @qlen: Place to store queue length upon entry
+ * @cb: Callback function invoked for each netlink message found
+ *
+ * Processes as much as there was in the queue upon entry and invokes
+ * a callback function for each netlink message found. The callback
+ * function may refuse a message by returning a negative error code
+ * but setting the error pointer to 0 in which case this function
+ * returns with a qlen != 0.
+ *
+ * qlen must be initialized to 0 before the initial entry, afterwards
+ * the function may be called repeatedly until qlen reaches 0.
+ */
+void netlink_run_queue(struct sock *sk, unsigned int *qlen,
+ int (*cb)(struct sk_buff *, struct nlmsghdr *, int *))
+{
+ struct sk_buff *skb;
+
+ if (!*qlen || *qlen > skb_queue_len(&sk->sk_receive_queue))
+ *qlen = skb_queue_len(&sk->sk_receive_queue);
+
+ for (; *qlen; (*qlen)--) {
+ skb = skb_dequeue(&sk->sk_receive_queue);
+ if (netlink_rcv_skb(skb, cb)) {
+ if (skb->len)
+ skb_queue_head(&sk->sk_receive_queue, skb);
+ else {
+ kfree_skb(skb);
+ (*qlen)--;
+ }
+ break;
+ }
+
+ kfree_skb(skb);
+ }
+}
+
+/**
+ * netlink_queue_skip - Skip netlink message while processing queue.
+ * @nlh: Netlink message to be skipped
+ * @skb: Socket buffer containing the netlink messages.
+ *
+ * Pulls the given netlink message off the socket buffer so the next
+ * call to netlink_queue_run() will not reconsider the message.
+ */
+void netlink_queue_skip(struct nlmsghdr *nlh, struct sk_buff *skb)
+{
+ int msglen = NLMSG_ALIGN(nlh->nlmsg_len);
+
+ if (msglen > skb->len)
+ msglen = skb->len;
+
+ skb_pull(skb, msglen);
+}
#ifdef CONFIG_PROC_FS
struct nl_seq_iter {
@@ -1657,6 +1748,8 @@ out:
core_initcall(netlink_proto_init);
EXPORT_SYMBOL(netlink_ack);
+EXPORT_SYMBOL(netlink_run_queue);
+EXPORT_SYMBOL(netlink_queue_skip);
EXPORT_SYMBOL(netlink_broadcast);
EXPORT_SYMBOL(netlink_dump_start);
EXPORT_SYMBOL(netlink_kernel_create);
diff --git a/net/netlink/attr.c b/net/netlink/attr.c
new file mode 100644
index 00000000000..fffef4ab276
--- /dev/null
+++ b/net/netlink/attr.c
@@ -0,0 +1,328 @@
+/*
+ * NETLINK Netlink attributes
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/jiffies.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <net/netlink.h>
+
+static u16 nla_attr_minlen[NLA_TYPE_MAX+1] __read_mostly = {
+ [NLA_U8] = sizeof(u8),
+ [NLA_U16] = sizeof(u16),
+ [NLA_U32] = sizeof(u32),
+ [NLA_U64] = sizeof(u64),
+ [NLA_STRING] = 1,
+ [NLA_NESTED] = NLA_HDRLEN,
+};
+
+static int validate_nla(struct nlattr *nla, int maxtype,
+ struct nla_policy *policy)
+{
+ struct nla_policy *pt;
+ int minlen = 0;
+
+ if (nla->nla_type <= 0 || nla->nla_type > maxtype)
+ return 0;
+
+ pt = &policy[nla->nla_type];
+
+ BUG_ON(pt->type > NLA_TYPE_MAX);
+
+ if (pt->minlen)
+ minlen = pt->minlen;
+ else if (pt->type != NLA_UNSPEC)
+ minlen = nla_attr_minlen[pt->type];
+
+ if (pt->type == NLA_FLAG && nla_len(nla) > 0)
+ return -ERANGE;
+
+ if (nla_len(nla) < minlen)
+ return -ERANGE;
+
+ return 0;
+}
+
+/**
+ * nla_validate - Validate a stream of attributes
+ * @head: head of attribute stream
+ * @len: length of attribute stream
+ * @maxtype: maximum attribute type to be expected
+ * @policy: validation policy
+ *
+ * Validates all attributes in the specified attribute stream against the
+ * specified policy. Attributes with a type exceeding maxtype will be
+ * ignored. See documenation of struct nla_policy for more details.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int nla_validate(struct nlattr *head, int len, int maxtype,
+ struct nla_policy *policy)
+{
+ struct nlattr *nla;
+ int rem, err;
+
+ nla_for_each_attr(nla, head, len, rem) {
+ err = validate_nla(nla, maxtype, policy);
+ if (err < 0)
+ goto errout;
+ }
+
+ err = 0;
+errout:
+ return err;
+}
+
+/**
+ * nla_parse - Parse a stream of attributes into a tb buffer
+ * @tb: destination array with maxtype+1 elements
+ * @maxtype: maximum attribute type to be expected
+ * @head: head of attribute stream
+ * @len: length of attribute stream
+ *
+ * Parses a stream of attributes and stores a pointer to each attribute in
+ * the tb array accessable via the attribute type. Attributes with a type
+ * exceeding maxtype will be silently ignored for backwards compatibility
+ * reasons. policy may be set to NULL if no validation is required.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int len,
+ struct nla_policy *policy)
+{
+ struct nlattr *nla;
+ int rem, err;
+
+ memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
+
+ nla_for_each_attr(nla, head, len, rem) {
+ u16 type = nla->nla_type;
+
+ if (type > 0 && type <= maxtype) {
+ if (policy) {
+ err = validate_nla(nla, maxtype, policy);
+ if (err < 0)
+ goto errout;
+ }
+
+ tb[type] = nla;
+ }
+ }
+
+ if (unlikely(rem > 0))
+ printk(KERN_WARNING "netlink: %d bytes leftover after parsing "
+ "attributes.\n", rem);
+
+ err = 0;
+errout:
+ return err;
+}
+
+/**
+ * nla_find - Find a specific attribute in a stream of attributes
+ * @head: head of attribute stream
+ * @len: length of attribute stream
+ * @attrtype: type of attribute to look for
+ *
+ * Returns the first attribute in the stream matching the specified type.
+ */
+struct nlattr *nla_find(struct nlattr *head, int len, int attrtype)
+{
+ struct nlattr *nla;
+ int rem;
+
+ nla_for_each_attr(nla, head, len, rem)
+ if (nla->nla_type == attrtype)
+ return nla;
+
+ return NULL;
+}
+
+/**
+ * nla_strlcpy - Copy string attribute payload into a sized buffer
+ * @dst: where to copy the string to
+ * @src: attribute to copy the string from
+ * @dstsize: size of destination buffer
+ *
+ * Copies at most dstsize - 1 bytes into the destination buffer.
+ * The result is always a valid NUL-terminated string. Unlike
+ * strlcpy the destination buffer is always padded out.
+ *
+ * Returns the length of the source buffer.
+ */
+size_t nla_strlcpy(char *dst, const struct nlattr *nla, size_t dstsize)
+{
+ size_t srclen = nla_len(nla);
+ char *src = nla_data(nla);
+
+ if (srclen > 0 && src[srclen - 1] == '\0')
+ srclen--;
+
+ if (dstsize > 0) {
+ size_t len = (srclen >= dstsize) ? dstsize - 1 : srclen;
+
+ memset(dst, 0, dstsize);
+ memcpy(dst, src, len);
+ }
+
+ return srclen;
+}
+
+/**
+ * nla_memcpy - Copy a netlink attribute into another memory area
+ * @dest: where to copy to memcpy
+ * @src: netlink attribute to copy from
+ * @count: size of the destination area
+ *
+ * Note: The number of bytes copied is limited by the length of
+ * attribute's payload. memcpy
+ *
+ * Returns the number of bytes copied.
+ */
+int nla_memcpy(void *dest, struct nlattr *src, int count)
+{
+ int minlen = min_t(int, count, nla_len(src));
+
+ memcpy(dest, nla_data(src), minlen);
+
+ return minlen;
+}
+
+/**
+ * nla_memcmp - Compare an attribute with sized memory area
+ * @nla: netlink attribute
+ * @data: memory area
+ * @size: size of memory area
+ */
+int nla_memcmp(const struct nlattr *nla, const void *data,
+ size_t size)
+{
+ int d = nla_len(nla) - size;
+
+ if (d == 0)
+ d = memcmp(nla_data(nla), data, size);
+
+ return d;
+}
+
+/**
+ * nla_strcmp - Compare a string attribute against a string
+ * @nla: netlink string attribute
+ * @str: another string
+ */
+int nla_strcmp(const struct nlattr *nla, const char *str)
+{
+ int len = strlen(str) + 1;
+ int d = nla_len(nla) - len;
+
+ if (d == 0)
+ d = memcmp(nla_data(nla), str, len);
+
+ return d;
+}
+
+/**
+ * __nla_reserve - reserve room for attribute on the skb
+ * @skb: socket buffer to reserve room on
+ * @attrtype: attribute type
+ * @attrlen: length of attribute payload
+ *
+ * Adds a netlink attribute header to a socket buffer and reserves
+ * room for the payload but does not copy it.
+ *
+ * The caller is responsible to ensure that the skb provides enough
+ * tailroom for the attribute header and payload.
+ */
+struct nlattr *__nla_reserve(struct sk_buff *skb, int attrtype, int attrlen)
+{
+ struct nlattr *nla;
+
+ nla = (struct nlattr *) skb_put(skb, nla_total_size(attrlen));
+ nla->nla_type = attrtype;
+ nla->nla_len = nla_attr_size(attrlen);
+
+ memset((unsigned char *) nla + nla->nla_len, 0, nla_padlen(attrlen));
+
+ return nla;
+}
+
+/**
+ * nla_reserve - reserve room for attribute on the skb
+ * @skb: socket buffer to reserve room on
+ * @attrtype: attribute type
+ * @attrlen: length of attribute payload
+ *
+ * Adds a netlink attribute header to a socket buffer and reserves
+ * room for the payload but does not copy it.
+ *
+ * Returns NULL if the tailroom of the skb is insufficient to store
+ * the attribute header and payload.
+ */
+struct nlattr *nla_reserve(struct sk_buff *skb, int attrtype, int attrlen)
+{
+ if (unlikely(skb_tailroom(skb) < nla_total_size(attrlen)))
+ return NULL;
+
+ return __nla_reserve(skb, attrtype, attrlen);
+}
+
+/**
+ * __nla_put - Add a netlink attribute to a socket buffer
+ * @skb: socket buffer to add attribute to
+ * @attrtype: attribute type
+ * @attrlen: length of attribute payload
+ * @data: head of attribute payload
+ *
+ * The caller is responsible to ensure that the skb provides enough
+ * tailroom for the attribute header and payload.
+ */
+void __nla_put(struct sk_buff *skb, int attrtype, int attrlen,
+ const void *data)
+{
+ struct nlattr *nla;
+
+ nla = __nla_reserve(skb, attrtype, attrlen);
+ memcpy(nla_data(nla), data, attrlen);
+}
+
+
+/**
+ * nla_put - Add a netlink attribute to a socket buffer
+ * @skb: socket buffer to add attribute to
+ * @attrtype: attribute type
+ * @attrlen: length of attribute payload
+ * @data: head of attribute payload
+ *
+ * Returns -1 if the tailroom of the skb is insufficient to store
+ * the attribute header and payload.
+ */
+int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data)
+{
+ if (unlikely(skb_tailroom(skb) < nla_total_size(attrlen)))
+ return -1;
+
+ __nla_put(skb, attrtype, attrlen, data);
+ return 0;
+}
+
+
+EXPORT_SYMBOL(nla_validate);
+EXPORT_SYMBOL(nla_parse);
+EXPORT_SYMBOL(nla_find);
+EXPORT_SYMBOL(nla_strlcpy);
+EXPORT_SYMBOL(__nla_reserve);
+EXPORT_SYMBOL(nla_reserve);
+EXPORT_SYMBOL(__nla_put);
+EXPORT_SYMBOL(nla_put);
+EXPORT_SYMBOL(nla_memcpy);
+EXPORT_SYMBOL(nla_memcmp);
+EXPORT_SYMBOL(nla_strcmp);
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
new file mode 100644
index 00000000000..287cfcc5695
--- /dev/null
+++ b/net/netlink/genetlink.c
@@ -0,0 +1,579 @@
+/*
+ * NETLINK Generic Netlink Family
+ *
+ * Authors: Jamal Hadi Salim
+ * Thomas Graf <tgraf@suug.ch>
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/genetlink.h>
+
+struct sock *genl_sock = NULL;
+
+static DECLARE_MUTEX(genl_sem); /* serialization of message processing */
+
+static void genl_lock(void)
+{
+ down(&genl_sem);
+}
+
+static int genl_trylock(void)
+{
+ return down_trylock(&genl_sem);
+}
+
+static void genl_unlock(void)
+{
+ up(&genl_sem);
+
+ if (genl_sock && genl_sock->sk_receive_queue.qlen)
+ genl_sock->sk_data_ready(genl_sock, 0);
+}
+
+#define GENL_FAM_TAB_SIZE 16
+#define GENL_FAM_TAB_MASK (GENL_FAM_TAB_SIZE - 1)
+
+static struct list_head family_ht[GENL_FAM_TAB_SIZE];
+
+static int genl_ctrl_event(int event, void *data);
+
+static inline unsigned int genl_family_hash(unsigned int id)
+{
+ return id & GENL_FAM_TAB_MASK;
+}
+
+static inline struct list_head *genl_family_chain(unsigned int id)
+{
+ return &family_ht[genl_family_hash(id)];
+}
+
+static struct genl_family *genl_family_find_byid(unsigned int id)
+{
+ struct genl_family *f;
+
+ list_for_each_entry(f, genl_family_chain(id), family_list)
+ if (f->id == id)
+ return f;
+
+ return NULL;
+}
+
+static struct genl_family *genl_family_find_byname(char *name)
+{
+ struct genl_family *f;
+ int i;
+
+ for (i = 0; i < GENL_FAM_TAB_SIZE; i++)
+ list_for_each_entry(f, genl_family_chain(i), family_list)
+ if (strcmp(f->name, name) == 0)
+ return f;
+
+ return NULL;
+}
+
+static struct genl_ops *genl_get_cmd(u8 cmd, struct genl_family *family)
+{
+ struct genl_ops *ops;
+
+ list_for_each_entry(ops, &family->ops_list, ops_list)
+ if (ops->cmd == cmd)
+ return ops;
+
+ return NULL;
+}
+
+/* Of course we are going to have problems once we hit
+ * 2^16 alive types, but that can only happen by year 2K
+*/
+static inline u16 genl_generate_id(void)
+{
+ static u16 id_gen_idx;
+ int overflowed = 0;
+
+ do {
+ if (id_gen_idx == 0)
+ id_gen_idx = GENL_MIN_ID;
+
+ if (++id_gen_idx > GENL_MAX_ID) {
+ if (!overflowed) {
+ overflowed = 1;
+ id_gen_idx = 0;
+ continue;
+ } else
+ return 0;
+ }
+
+ } while (genl_family_find_byid(id_gen_idx));
+
+ return id_gen_idx;
+}
+
+/**
+ * genl_register_ops - register generic netlink operations
+ * @family: generic netlink family
+ * @ops: operations to be registered
+ *
+ * Registers the specified operations and assigns them to the specified
+ * family. Either a doit or dumpit callback must be specified or the
+ * operation will fail. Only one operation structure per command
+ * identifier may be registered.
+ *
+ * See include/net/genetlink.h for more documenation on the operations
+ * structure.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int genl_register_ops(struct genl_family *family, struct genl_ops *ops)
+{
+ int err = -EINVAL;
+
+ if (ops->dumpit == NULL && ops->doit == NULL)
+ goto errout;
+
+ if (genl_get_cmd(ops->cmd, family)) {
+ err = -EEXIST;
+ goto errout;
+ }
+
+ genl_lock();
+ list_add_tail(&ops->ops_list, &family->ops_list);
+ genl_unlock();
+
+ genl_ctrl_event(CTRL_CMD_NEWOPS, ops);
+ err = 0;
+errout:
+ return err;
+}
+
+/**
+ * genl_unregister_ops - unregister generic netlink operations
+ * @family: generic netlink family
+ * @ops: operations to be unregistered
+ *
+ * Unregisters the specified operations and unassigns them from the
+ * specified family. The operation blocks until the current message
+ * processing has finished and doesn't start again until the
+ * unregister process has finished.
+ *
+ * Note: It is not necessary to unregister all operations before
+ * unregistering the family, unregistering the family will cause
+ * all assigned operations to be unregistered automatically.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int genl_unregister_ops(struct genl_family *family, struct genl_ops *ops)
+{
+ struct genl_ops *rc;
+
+ genl_lock();
+ list_for_each_entry(rc, &family->ops_list, ops_list) {
+ if (rc == ops) {
+ list_del(&ops->ops_list);
+ genl_unlock();
+ genl_ctrl_event(CTRL_CMD_DELOPS, ops);
+ return 0;
+ }
+ }
+ genl_unlock();
+
+ return -ENOENT;
+}
+
+/**
+ * genl_register_family - register a generic netlink family
+ * @family: generic netlink family
+ *
+ * Registers the specified family after validating it first. Only one
+ * family may be registered with the same family name or identifier.
+ * The family id may equal GENL_ID_GENERATE causing an unique id to
+ * be automatically generated and assigned.
+ *
+ * Return 0 on success or a negative error code.
+ */
+int genl_register_family(struct genl_family *family)
+{
+ int err = -EINVAL;
+
+ if (family->id && family->id < GENL_MIN_ID)
+ goto errout;
+
+ if (family->id > GENL_MAX_ID)
+ goto errout;
+
+ INIT_LIST_HEAD(&family->ops_list);
+
+ genl_lock();
+
+ if (genl_family_find_byname(family->name)) {
+ err = -EEXIST;
+ goto errout_locked;
+ }
+
+ if (genl_family_find_byid(family->id)) {
+ err = -EEXIST;
+ goto errout_locked;
+ }
+
+ if (!try_module_get(family->owner)) {
+ err = -EBUSY;
+ goto errout_locked;
+ }
+
+ if (family->id == GENL_ID_GENERATE) {
+ u16 newid = genl_generate_id();
+
+ if (!newid) {
+ err = -ENOMEM;
+ goto errout_locked;
+ }
+
+ family->id = newid;
+ }
+
+ if (family->maxattr) {
+ family->attrbuf = kmalloc((family->maxattr+1) *
+ sizeof(struct nlattr *), GFP_KERNEL);
+ if (family->attrbuf == NULL) {
+ err = -ENOMEM;
+ goto errout;
+ }
+ } else
+ family->attrbuf = NULL;
+
+ list_add_tail(&family->family_list, genl_family_chain(family->id));
+ genl_unlock();
+
+ genl_ctrl_event(CTRL_CMD_NEWFAMILY, family);
+
+ return 0;
+
+errout_locked:
+ genl_unlock();
+errout:
+ return err;
+}
+
+/**
+ * genl_unregister_family - unregister generic netlink family
+ * @family: generic netlink family
+ *
+ * Unregisters the specified family.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int genl_unregister_family(struct genl_family *family)
+{
+ struct genl_family *rc;
+
+ genl_lock();
+
+ list_for_each_entry(rc, genl_family_chain(family->id), family_list) {
+ if (family->id != rc->id || strcmp(rc->name, family->name))
+ continue;
+
+ list_del(&rc->family_list);
+ INIT_LIST_HEAD(&family->ops_list);
+ genl_unlock();
+
+ module_put(family->owner);
+ kfree(family->attrbuf);
+ genl_ctrl_event(CTRL_CMD_DELFAMILY, family);
+ return 0;
+ }
+
+ genl_unlock();
+
+ return -ENOENT;
+}
+
+static inline int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
+ int *errp)
+{
+ struct genl_ops *ops;
+ struct genl_family *family;
+ struct genl_info info;
+ struct genlmsghdr *hdr = nlmsg_data(nlh);
+ int hdrlen, err = -EINVAL;
+
+ if (!(nlh->nlmsg_flags & NLM_F_REQUEST))
+ goto ignore;
+
+ if (nlh->nlmsg_type < NLMSG_MIN_TYPE)
+ goto ignore;
+
+ family = genl_family_find_byid(nlh->nlmsg_type);
+ if (family == NULL) {
+ err = -ENOENT;
+ goto errout;
+ }
+
+ hdrlen = GENL_HDRLEN + family->hdrsize;
+ if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen))
+ goto errout;
+
+ ops = genl_get_cmd(hdr->cmd, family);
+ if (ops == NULL) {
+ err = -EOPNOTSUPP;
+ goto errout;
+ }
+
+ if ((ops->flags & GENL_ADMIN_PERM) && security_netlink_recv(skb)) {
+ err = -EPERM;
+ goto errout;
+ }
+
+ if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (ops->dumpit == NULL) {
+ err = -EOPNOTSUPP;
+ goto errout;
+ }
+
+ *errp = err = netlink_dump_start(genl_sock, skb, nlh,
+ ops->dumpit, NULL);
+ if (err == 0)
+ skb_pull(skb, min(NLMSG_ALIGN(nlh->nlmsg_len),
+ skb->len));
+ return -1;
+ }
+
+ if (ops->doit == NULL) {
+ err = -EOPNOTSUPP;
+ goto errout;
+ }
+
+ if (family->attrbuf) {
+ err = nlmsg_parse(nlh, hdrlen, family->attrbuf, family->maxattr,
+ ops->policy);
+ if (err < 0)
+ goto errout;
+ }
+
+ info.snd_seq = nlh->nlmsg_seq;
+ info.snd_pid = NETLINK_CB(skb).pid;
+ info.nlhdr = nlh;
+ info.genlhdr = nlmsg_data(nlh);
+ info.userhdr = nlmsg_data(nlh) + GENL_HDRLEN;
+ info.attrs = family->attrbuf;
+
+ *errp = err = ops->doit(skb, &info);
+ return err;
+
+ignore:
+ return 0;
+
+errout:
+ *errp = err;
+ return -1;
+}
+
+static void genl_rcv(struct sock *sk, int len)
+{
+ unsigned int qlen = 0;
+
+ do {
+ if (genl_trylock())
+ return;
+ netlink_run_queue(sk, &qlen, &genl_rcv_msg);
+ genl_unlock();
+ } while (qlen && genl_sock && genl_sock->sk_receive_queue.qlen);
+}
+
+/**************************************************************************
+ * Controller
+ **************************************************************************/
+
+static int ctrl_fill_info(struct genl_family *family, u32 pid, u32 seq,
+ u32 flags, struct sk_buff *skb, u8 cmd)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(skb, pid, seq, GENL_ID_CTRL, 0, flags, cmd,
+ family->version);
+ if (hdr == NULL)
+ return -1;
+
+ NLA_PUT_STRING(skb, CTRL_ATTR_FAMILY_NAME, family->name);
+ NLA_PUT_U16(skb, CTRL_ATTR_FAMILY_ID, family->id);
+
+ return genlmsg_end(skb, hdr);
+
+nla_put_failure:
+ return genlmsg_cancel(skb, hdr);
+}
+
+static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb)
+{
+
+ int i, n = 0;
+ struct genl_family *rt;
+ int chains_to_skip = cb->args[0];
+ int fams_to_skip = cb->args[1];
+
+ for (i = 0; i < GENL_FAM_TAB_SIZE; i++) {
+ if (i < chains_to_skip)
+ continue;
+ n = 0;
+ list_for_each_entry(rt, genl_family_chain(i), family_list) {
+ if (++n < fams_to_skip)
+ continue;
+ if (ctrl_fill_info(rt, NETLINK_CB(cb->skb).pid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ skb, CTRL_CMD_NEWFAMILY) < 0)
+ goto errout;
+ }
+
+ fams_to_skip = 0;
+ }
+
+errout:
+ cb->args[0] = i;
+ cb->args[1] = n;
+
+ return skb->len;
+}
+
+static struct sk_buff *ctrl_build_msg(struct genl_family *family, u32 pid,
+ int seq, int cmd)
+{
+ struct sk_buff *skb;
+ int err;
+
+ skb = nlmsg_new(NLMSG_GOODSIZE);
+ if (skb == NULL)
+ return ERR_PTR(-ENOBUFS);
+
+ err = ctrl_fill_info(family, pid, seq, 0, skb, cmd);
+ if (err < 0) {
+ nlmsg_free(skb);
+ return ERR_PTR(err);
+ }
+
+ return skb;
+}
+
+static struct nla_policy ctrl_policy[CTRL_ATTR_MAX+1] __read_mostly = {
+ [CTRL_ATTR_FAMILY_ID] = { .type = NLA_U16 },
+ [CTRL_ATTR_FAMILY_NAME] = { .type = NLA_STRING },
+};
+
+static int ctrl_getfamily(struct sk_buff *skb, struct genl_info *info)
+{
+ struct sk_buff *msg;
+ struct genl_family *res = NULL;
+ int err = -EINVAL;
+
+ if (info->attrs[CTRL_ATTR_FAMILY_ID]) {
+ u16 id = nla_get_u16(info->attrs[CTRL_ATTR_FAMILY_ID]);
+ res = genl_family_find_byid(id);
+ }
+
+ if (info->attrs[CTRL_ATTR_FAMILY_NAME]) {
+ char name[GENL_NAMSIZ];
+
+ if (nla_strlcpy(name, info->attrs[CTRL_ATTR_FAMILY_NAME],
+ GENL_NAMSIZ) >= GENL_NAMSIZ)
+ goto errout;
+
+ res = genl_family_find_byname(name);
+ }
+
+ if (res == NULL) {
+ err = -ENOENT;
+ goto errout;
+ }
+
+ msg = ctrl_build_msg(res, info->snd_pid, info->snd_seq,
+ CTRL_CMD_NEWFAMILY);
+ if (IS_ERR(msg)) {
+ err = PTR_ERR(msg);
+ goto errout;
+ }
+
+ err = genlmsg_unicast(msg, info->snd_pid);
+errout:
+ return err;
+}
+
+static int genl_ctrl_event(int event, void *data)
+{
+ struct sk_buff *msg;
+
+ if (genl_sock == NULL)
+ return 0;
+
+ switch (event) {
+ case CTRL_CMD_NEWFAMILY:
+ case CTRL_CMD_DELFAMILY:
+ msg = ctrl_build_msg(data, 0, 0, event);
+ if (IS_ERR(msg))
+ return PTR_ERR(msg);
+
+ genlmsg_multicast(msg, 0, GENL_ID_CTRL);
+ break;
+ }
+
+ return 0;
+}
+
+static struct genl_ops genl_ctrl_ops = {
+ .cmd = CTRL_CMD_GETFAMILY,
+ .doit = ctrl_getfamily,
+ .dumpit = ctrl_dumpfamily,
+ .policy = ctrl_policy,
+};
+
+static struct genl_family genl_ctrl = {
+ .id = GENL_ID_CTRL,
+ .name = "nlctrl",
+ .version = 0x1,
+ .maxattr = CTRL_ATTR_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init genl_init(void)
+{
+ int i, err;
+
+ for (i = 0; i < GENL_FAM_TAB_SIZE; i++)
+ INIT_LIST_HEAD(&family_ht[i]);
+
+ err = genl_register_family(&genl_ctrl);
+ if (err < 0)
+ goto errout;
+
+ err = genl_register_ops(&genl_ctrl, &genl_ctrl_ops);
+ if (err < 0)
+ goto errout_register;
+
+ netlink_set_nonroot(NETLINK_GENERIC, NL_NONROOT_RECV);
+ genl_sock = netlink_kernel_create(NETLINK_GENERIC, GENL_MAX_ID,
+ genl_rcv, THIS_MODULE);
+ if (genl_sock == NULL) {
+ panic("GENL: Cannot initialize generic netlink\n");
+ return -ENOMEM;
+ }
+
+ return 0;
+
+errout_register:
+ genl_unregister_family(&genl_ctrl);
+errout:
+ panic("GENL: Cannot register controller: %d\n", err);
+ return err;
+}
+
+subsys_initcall(genl_init);
+
+EXPORT_SYMBOL(genl_sock);
+EXPORT_SYMBOL(genl_register_ops);
+EXPORT_SYMBOL(genl_unregister_ops);
+EXPORT_SYMBOL(genl_register_family);
+EXPORT_SYMBOL(genl_unregister_family);
diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c
index b18fe504301..8631b65a731 100644
--- a/net/rose/rose_route.c
+++ b/net/rose/rose_route.c
@@ -240,8 +240,7 @@ static void rose_remove_neigh(struct rose_neigh *rose_neigh)
if ((s = rose_neigh_list) == rose_neigh) {
rose_neigh_list = rose_neigh->next;
spin_unlock_bh(&rose_neigh_list_lock);
- if (rose_neigh->digipeat != NULL)
- kfree(rose_neigh->digipeat);
+ kfree(rose_neigh->digipeat);
kfree(rose_neigh);
return;
}
@@ -250,8 +249,7 @@ static void rose_remove_neigh(struct rose_neigh *rose_neigh)
if (s->next == rose_neigh) {
s->next = rose_neigh->next;
spin_unlock_bh(&rose_neigh_list_lock);
- if (rose_neigh->digipeat != NULL)
- kfree(rose_neigh->digipeat);
+ kfree(rose_neigh->digipeat);
kfree(rose_neigh);
return;
}
diff --git a/net/rxrpc/transport.c b/net/rxrpc/transport.c
index 122c086ee2d..dbe6105e83a 100644
--- a/net/rxrpc/transport.c
+++ b/net/rxrpc/transport.c
@@ -23,6 +23,7 @@
#include <linux/in.h>
#include <linux/in6.h>
#include <linux/icmp.h>
+#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/ip.h>
#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
@@ -475,15 +476,11 @@ void rxrpc_trans_receive_packet(struct rxrpc_transport *trans)
/* we'll probably need to checksum it (didn't call
* sock_recvmsg) */
- if (pkt->ip_summed != CHECKSUM_UNNECESSARY) {
- if ((unsigned short)
- csum_fold(skb_checksum(pkt, 0, pkt->len,
- pkt->csum))) {
- kfree_skb(pkt);
- rxrpc_krxiod_queue_transport(trans);
- _leave(" CSUM failed");
- return;
- }
+ if (skb_checksum_complete(pkt)) {
+ kfree_skb(pkt);
+ rxrpc_krxiod_queue_transport(trans);
+ _leave(" CSUM failed");
+ return;
}
addr = pkt->nh.iph->saddr;
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 7f34e7fd767..55cd5327fbd 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -40,9 +40,10 @@ config NET_SCHED
The available schedulers are listed in the following questions; you
can say Y to as many as you like. If unsure, say N now.
+if NET_SCHED
+
choice
prompt "Packet scheduler clock source"
- depends on NET_SCHED
default NET_SCH_CLK_JIFFIES
---help---
Packet schedulers need a monotonic clock that increments at a static
@@ -98,11 +99,9 @@ config NET_SCH_CLK_CPU
endchoice
comment "Queueing/Scheduling"
- depends on NET_SCHED
config NET_SCH_CBQ
tristate "Class Based Queueing (CBQ)"
- depends on NET_SCHED
---help---
Say Y here if you want to use the Class-Based Queueing (CBQ) packet
scheduling algorithm. This algorithm classifies the waiting packets
@@ -120,7 +119,6 @@ config NET_SCH_CBQ
config NET_SCH_HTB
tristate "Hierarchical Token Bucket (HTB)"
- depends on NET_SCHED
---help---
Say Y here if you want to use the Hierarchical Token Buckets (HTB)
packet scheduling algorithm. See
@@ -135,7 +133,6 @@ config NET_SCH_HTB
config NET_SCH_HFSC
tristate "Hierarchical Fair Service Curve (HFSC)"
- depends on NET_SCHED
---help---
Say Y here if you want to use the Hierarchical Fair Service Curve
(HFSC) packet scheduling algorithm.
@@ -145,7 +142,7 @@ config NET_SCH_HFSC
config NET_SCH_ATM
tristate "ATM Virtual Circuits (ATM)"
- depends on NET_SCHED && ATM
+ depends on ATM
---help---
Say Y here if you want to use the ATM pseudo-scheduler. This
provides a framework for invoking classifiers, which in turn
@@ -159,7 +156,6 @@ config NET_SCH_ATM
config NET_SCH_PRIO
tristate "Multi Band Priority Queueing (PRIO)"
- depends on NET_SCHED
---help---
Say Y here if you want to use an n-band priority queue packet
scheduler.
@@ -169,7 +165,6 @@ config NET_SCH_PRIO
config NET_SCH_RED
tristate "Random Early Detection (RED)"
- depends on NET_SCHED
---help---
Say Y here if you want to use the Random Early Detection (RED)
packet scheduling algorithm.
@@ -181,7 +176,6 @@ config NET_SCH_RED
config NET_SCH_SFQ
tristate "Stochastic Fairness Queueing (SFQ)"
- depends on NET_SCHED
---help---
Say Y here if you want to use the Stochastic Fairness Queueing (SFQ)
packet scheduling algorithm .
@@ -193,7 +187,6 @@ config NET_SCH_SFQ
config NET_SCH_TEQL
tristate "True Link Equalizer (TEQL)"
- depends on NET_SCHED
---help---
Say Y here if you want to use the True Link Equalizer (TLE) packet
scheduling algorithm. This queueing discipline allows the combination
@@ -206,7 +199,6 @@ config NET_SCH_TEQL
config NET_SCH_TBF
tristate "Token Bucket Filter (TBF)"
- depends on NET_SCHED
---help---
Say Y here if you want to use the Token Bucket Filter (TBF) packet
scheduling algorithm.
@@ -218,7 +210,6 @@ config NET_SCH_TBF
config NET_SCH_GRED
tristate "Generic Random Early Detection (GRED)"
- depends on NET_SCHED
---help---
Say Y here if you want to use the Generic Random Early Detection
(GRED) packet scheduling algorithm for some of your network devices
@@ -230,7 +221,6 @@ config NET_SCH_GRED
config NET_SCH_DSMARK
tristate "Differentiated Services marker (DSMARK)"
- depends on NET_SCHED
---help---
Say Y if you want to schedule packets according to the
Differentiated Services architecture proposed in RFC 2475.
@@ -242,7 +232,6 @@ config NET_SCH_DSMARK
config NET_SCH_NETEM
tristate "Network emulator (NETEM)"
- depends on NET_SCHED
---help---
Say Y if you want to emulate network delay, loss, and packet
re-ordering. This is often useful to simulate networks when
@@ -255,7 +244,6 @@ config NET_SCH_NETEM
config NET_SCH_INGRESS
tristate "Ingress Qdisc"
- depends on NET_SCHED
---help---
Say Y here if you want to use classifiers for incoming packets.
If unsure, say Y.
@@ -264,14 +252,12 @@ config NET_SCH_INGRESS
module will be called sch_ingress.
comment "Classification"
- depends on NET_SCHED
config NET_CLS
boolean
config NET_CLS_BASIC
tristate "Elementary classification (BASIC)"
- depends NET_SCHED
select NET_CLS
---help---
Say Y here if you want to be able to classify packets using
@@ -282,7 +268,6 @@ config NET_CLS_BASIC
config NET_CLS_TCINDEX
tristate "Traffic-Control Index (TCINDEX)"
- depends NET_SCHED
select NET_CLS
---help---
Say Y here if you want to be able to classify packets based on
@@ -294,7 +279,6 @@ config NET_CLS_TCINDEX
config NET_CLS_ROUTE4
tristate "Routing decision (ROUTE)"
- depends NET_SCHED
select NET_CLS_ROUTE
select NET_CLS
---help---
@@ -306,11 +290,9 @@ config NET_CLS_ROUTE4
config NET_CLS_ROUTE
bool
- default n
config NET_CLS_FW
tristate "Netfilter mark (FW)"
- depends NET_SCHED
select NET_CLS
---help---
If you say Y here, you will be able to classify packets
@@ -321,7 +303,6 @@ config NET_CLS_FW
config NET_CLS_U32
tristate "Universal 32bit comparisons w/ hashing (U32)"
- depends NET_SCHED
select NET_CLS
---help---
Say Y here to be able to classify packetes using a universal
@@ -345,7 +326,6 @@ config CLS_U32_MARK
config NET_CLS_RSVP
tristate "IPv4 Resource Reservation Protocol (RSVP)"
- depends on NET_SCHED
select NET_CLS
select NET_ESTIMATOR
---help---
@@ -361,7 +341,6 @@ config NET_CLS_RSVP
config NET_CLS_RSVP6
tristate "IPv6 Resource Reservation Protocol (RSVP6)"
- depends on NET_SCHED
select NET_CLS
select NET_ESTIMATOR
---help---
@@ -377,7 +356,6 @@ config NET_CLS_RSVP6
config NET_EMATCH
bool "Extended Matches"
- depends NET_SCHED
select NET_CLS
---help---
Say Y here if you want to use extended matches on top of classifiers
@@ -456,7 +434,7 @@ config NET_EMATCH_TEXT
config NET_CLS_ACT
bool "Actions"
- depends on EXPERIMENTAL && NET_SCHED
+ depends on EXPERIMENTAL
select NET_ESTIMATOR
---help---
Say Y here if you want to use traffic control actions. Actions
@@ -539,7 +517,7 @@ config NET_ACT_SIMP
config NET_CLS_POLICE
bool "Traffic Policing (obsolete)"
- depends on NET_SCHED && NET_CLS_ACT!=y
+ depends on NET_CLS_ACT!=y
select NET_ESTIMATOR
---help---
Say Y here if you want to do traffic policing, i.e. strict
@@ -549,7 +527,7 @@ config NET_CLS_POLICE
config NET_CLS_IND
bool "Incoming device classification"
- depends on NET_SCHED && (NET_CLS_U32 || NET_CLS_FW)
+ depends on NET_CLS_U32 || NET_CLS_FW
---help---
Say Y here to extend the u32 and fw classifier to support
classification based on the incoming device. This option is
@@ -557,11 +535,12 @@ config NET_CLS_IND
config NET_ESTIMATOR
bool "Rate estimator"
- depends on NET_SCHED
---help---
Say Y here to allow using rate estimators to estimate the current
rate-of-flow for network devices, queues, etc. This module is
automaticaly selected if needed but can be selected manually for
statstical purposes.
+endif # NET_SCHED
+
endmenu
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
index 29d8b9a4d16..75470486e40 100644
--- a/net/sched/cls_fw.c
+++ b/net/sched/cls_fw.c
@@ -298,8 +298,7 @@ static int fw_change(struct tcf_proto *tp, unsigned long base,
return 0;
errout:
- if (f)
- kfree(f);
+ kfree(f);
return err;
}
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
index 02996ac05c7..520ff716dab 100644
--- a/net/sched/cls_route.c
+++ b/net/sched/cls_route.c
@@ -525,8 +525,7 @@ reinsert:
return 0;
errout:
- if (f)
- kfree(f);
+ kfree(f);
return err;
}
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
index 006168d6937..572f06be3b0 100644
--- a/net/sched/cls_rsvp.h
+++ b/net/sched/cls_rsvp.h
@@ -555,8 +555,7 @@ insert:
goto insert;
errout:
- if (f)
- kfree(f);
+ kfree(f);
errout2:
tcf_exts_destroy(tp, &e);
return err;
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
index 404d9d83a7f..9f921174c8a 100644
--- a/net/sched/cls_tcindex.c
+++ b/net/sched/cls_tcindex.c
@@ -194,8 +194,7 @@ found:
}
tcf_unbind_filter(tp, &r->res);
tcf_exts_destroy(tp, &r->exts);
- if (f)
- kfree(f);
+ kfree(f);
return 0;
}
@@ -442,10 +441,8 @@ static void tcindex_destroy(struct tcf_proto *tp)
walker.skip = 0;
walker.fn = &tcindex_destroy_element;
tcindex_walk(tp,&walker);
- if (p->perfect)
- kfree(p->perfect);
- if (p->h)
- kfree(p->h);
+ kfree(p->perfect);
+ kfree(p->h);
kfree(p);
tp->root = NULL;
}
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 364b87d8645..2b670479dde 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -347,7 +347,7 @@ static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n)
if (n->ht_down)
n->ht_down->refcnt--;
#ifdef CONFIG_CLS_U32_PERF
- if (n && (NULL != n->pf))
+ if (n)
kfree(n->pf);
#endif
kfree(n);
@@ -680,7 +680,7 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle,
return 0;
}
#ifdef CONFIG_CLS_U32_PERF
- if (n && (NULL != n->pf))
+ if (n)
kfree(n->pf);
#endif
kfree(n);
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index cf68a59fdc5..700844d49d7 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -561,8 +561,7 @@ static int meta_var_change(struct meta_value *dst, struct rtattr *rta)
static void meta_var_destroy(struct meta_value *v)
{
- if (v->val)
- kfree((void *) v->val);
+ kfree((void *) v->val);
}
static void meta_var_apply_extras(struct meta_value *v,
diff --git a/net/sched/ematch.c b/net/sched/ematch.c
index ebfe2e7d21b..64b047c6556 100644
--- a/net/sched/ematch.c
+++ b/net/sched/ematch.c
@@ -298,6 +298,11 @@ int tcf_em_tree_validate(struct tcf_proto *tp, struct rtattr *rta,
struct tcf_ematch_tree_hdr *tree_hdr;
struct tcf_ematch *em;
+ if (!rta) {
+ memset(tree, 0, sizeof(*tree));
+ return 0;
+ }
+
if (rtattr_parse_nested(tb, TCA_EMATCH_TREE_MAX, rta) < 0)
goto errout;
diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
index 25c171c3271..29a2dd9f302 100644
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -15,247 +15,281 @@
* from Ren Liu
* - More error checks
*
- *
- *
- * For all the glorious comments look at Alexey's sch_red.c
+ * For all the glorious comments look at include/net/red.h
*/
#include <linux/config.h>
#include <linux/module.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/string.h>
-#include <linux/mm.h>
-#include <linux/socket.h>
-#include <linux/sockios.h>
-#include <linux/in.h>
-#include <linux/errno.h>
-#include <linux/interrupt.h>
-#include <linux/if_ether.h>
-#include <linux/inet.h>
#include <linux/netdevice.h>
-#include <linux/etherdevice.h>
-#include <linux/notifier.h>
-#include <net/ip.h>
-#include <net/route.h>
#include <linux/skbuff.h>
-#include <net/sock.h>
#include <net/pkt_sched.h>
+#include <net/red.h>
-#if 1 /* control */
-#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args)
-#else
-#define DPRINTK(format,args...)
-#endif
-
-#if 0 /* data */
-#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args)
-#else
-#define D2PRINTK(format,args...)
-#endif
+#define GRED_DEF_PRIO (MAX_DPs / 2)
+#define GRED_VQ_MASK (MAX_DPs - 1)
struct gred_sched_data;
struct gred_sched;
struct gred_sched_data
{
-/* Parameters */
u32 limit; /* HARD maximal queue length */
- u32 qth_min; /* Min average length threshold: A scaled */
- u32 qth_max; /* Max average length threshold: A scaled */
u32 DP; /* the drop pramaters */
- char Wlog; /* log(W) */
- char Plog; /* random number bits */
- u32 Scell_max;
- u32 Rmask;
u32 bytesin; /* bytes seen on virtualQ so far*/
u32 packetsin; /* packets seen on virtualQ so far*/
u32 backlog; /* bytes on the virtualQ */
- u32 forced; /* packets dropped for exceeding limits */
- u32 early; /* packets dropped as a warning */
- u32 other; /* packets dropped by invoking drop() */
- u32 pdrop; /* packets dropped because we exceeded physical queue limits */
- char Scell_log;
- u8 Stab[256];
- u8 prio; /* the prio of this vq */
-
-/* Variables */
- unsigned long qave; /* Average queue length: A scaled */
- int qcount; /* Packets since last random number generation */
- u32 qR; /* Cached random number */
-
- psched_time_t qidlestart; /* Start of idle period */
+ u8 prio; /* the prio of this vq */
+
+ struct red_parms parms;
+ struct red_stats stats;
+};
+
+enum {
+ GRED_WRED_MODE = 1,
+ GRED_RIO_MODE,
};
struct gred_sched
{
struct gred_sched_data *tab[MAX_DPs];
- u32 DPs;
- u32 def;
- u8 initd;
- u8 grio;
- u8 eqp;
+ unsigned long flags;
+ u32 red_flags;
+ u32 DPs;
+ u32 def;
+ struct red_parms wred_set;
};
-static int
-gred_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+static inline int gred_wred_mode(struct gred_sched *table)
{
- psched_time_t now;
- struct gred_sched_data *q=NULL;
- struct gred_sched *t= qdisc_priv(sch);
- unsigned long qave=0;
- int i=0;
+ return test_bit(GRED_WRED_MODE, &table->flags);
+}
+
+static inline void gred_enable_wred_mode(struct gred_sched *table)
+{
+ __set_bit(GRED_WRED_MODE, &table->flags);
+}
+
+static inline void gred_disable_wred_mode(struct gred_sched *table)
+{
+ __clear_bit(GRED_WRED_MODE, &table->flags);
+}
+
+static inline int gred_rio_mode(struct gred_sched *table)
+{
+ return test_bit(GRED_RIO_MODE, &table->flags);
+}
+
+static inline void gred_enable_rio_mode(struct gred_sched *table)
+{
+ __set_bit(GRED_RIO_MODE, &table->flags);
+}
+
+static inline void gred_disable_rio_mode(struct gred_sched *table)
+{
+ __clear_bit(GRED_RIO_MODE, &table->flags);
+}
+
+static inline int gred_wred_mode_check(struct Qdisc *sch)
+{
+ struct gred_sched *table = qdisc_priv(sch);
+ int i;
- if (!t->initd && skb_queue_len(&sch->q) < (sch->dev->tx_queue_len ? : 1)) {
- D2PRINTK("NO GRED Queues setup yet! Enqueued anyway\n");
- goto do_enqueue;
+ /* Really ugly O(n^2) but shouldn't be necessary too frequent. */
+ for (i = 0; i < table->DPs; i++) {
+ struct gred_sched_data *q = table->tab[i];
+ int n;
+
+ if (q == NULL)
+ continue;
+
+ for (n = 0; n < table->DPs; n++)
+ if (table->tab[n] && table->tab[n] != q &&
+ table->tab[n]->prio == q->prio)
+ return 1;
}
+ return 0;
+}
+
+static inline unsigned int gred_backlog(struct gred_sched *table,
+ struct gred_sched_data *q,
+ struct Qdisc *sch)
+{
+ if (gred_wred_mode(table))
+ return sch->qstats.backlog;
+ else
+ return q->backlog;
+}
+
+static inline u16 tc_index_to_dp(struct sk_buff *skb)
+{
+ return skb->tc_index & GRED_VQ_MASK;
+}
+
+static inline void gred_load_wred_set(struct gred_sched *table,
+ struct gred_sched_data *q)
+{
+ q->parms.qavg = table->wred_set.qavg;
+ q->parms.qidlestart = table->wred_set.qidlestart;
+}
+
+static inline void gred_store_wred_set(struct gred_sched *table,
+ struct gred_sched_data *q)
+{
+ table->wred_set.qavg = q->parms.qavg;
+}
+
+static inline int gred_use_ecn(struct gred_sched *t)
+{
+ return t->red_flags & TC_RED_ECN;
+}
- if ( ((skb->tc_index&0xf) > (t->DPs -1)) || !(q=t->tab[skb->tc_index&0xf])) {
- printk("GRED: setting to default (%d)\n ",t->def);
- if (!(q=t->tab[t->def])) {
- DPRINTK("GRED: setting to default FAILED! dropping!! "
- "(%d)\n ", t->def);
- goto drop;
+static inline int gred_use_harddrop(struct gred_sched *t)
+{
+ return t->red_flags & TC_RED_HARDDROP;
+}
+
+static int gred_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+{
+ struct gred_sched_data *q=NULL;
+ struct gred_sched *t= qdisc_priv(sch);
+ unsigned long qavg = 0;
+ u16 dp = tc_index_to_dp(skb);
+
+ if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
+ dp = t->def;
+
+ if ((q = t->tab[dp]) == NULL) {
+ /* Pass through packets not assigned to a DP
+ * if no default DP has been configured. This
+ * allows for DP flows to be left untouched.
+ */
+ if (skb_queue_len(&sch->q) < sch->dev->tx_queue_len)
+ return qdisc_enqueue_tail(skb, sch);
+ else
+ goto drop;
}
+
/* fix tc_index? --could be controvesial but needed for
requeueing */
- skb->tc_index=(skb->tc_index&0xfffffff0) | t->def;
+ skb->tc_index = (skb->tc_index & ~GRED_VQ_MASK) | dp;
}
- D2PRINTK("gred_enqueue virtualQ 0x%x classid %x backlog %d "
- "general backlog %d\n",skb->tc_index&0xf,sch->handle,q->backlog,
- sch->qstats.backlog);
- /* sum up all the qaves of prios <= to ours to get the new qave*/
- if (!t->eqp && t->grio) {
- for (i=0;i<t->DPs;i++) {
- if ((!t->tab[i]) || (i==q->DP))
- continue;
-
- if ((t->tab[i]->prio < q->prio) && (PSCHED_IS_PASTPERFECT(t->tab[i]->qidlestart)))
- qave +=t->tab[i]->qave;
+ /* sum up all the qaves of prios <= to ours to get the new qave */
+ if (!gred_wred_mode(t) && gred_rio_mode(t)) {
+ int i;
+
+ for (i = 0; i < t->DPs; i++) {
+ if (t->tab[i] && t->tab[i]->prio < q->prio &&
+ !red_is_idling(&t->tab[i]->parms))
+ qavg +=t->tab[i]->parms.qavg;
}
-
+
}
q->packetsin++;
- q->bytesin+=skb->len;
+ q->bytesin += skb->len;
- if (t->eqp && t->grio) {
- qave=0;
- q->qave=t->tab[t->def]->qave;
- q->qidlestart=t->tab[t->def]->qidlestart;
- }
+ if (gred_wred_mode(t))
+ gred_load_wred_set(t, q);
- if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) {
- long us_idle;
- PSCHED_GET_TIME(now);
- us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max);
- PSCHED_SET_PASTPERFECT(q->qidlestart);
+ q->parms.qavg = red_calc_qavg(&q->parms, gred_backlog(t, q, sch));
- q->qave >>= q->Stab[(us_idle>>q->Scell_log)&0xFF];
- } else {
- if (t->eqp) {
- q->qave += sch->qstats.backlog - (q->qave >> q->Wlog);
- } else {
- q->qave += q->backlog - (q->qave >> q->Wlog);
- }
+ if (red_is_idling(&q->parms))
+ red_end_of_idle_period(&q->parms);
- }
-
-
- if (t->eqp && t->grio)
- t->tab[t->def]->qave=q->qave;
-
- if ((q->qave+qave) < q->qth_min) {
- q->qcount = -1;
-enqueue:
- if (q->backlog + skb->len <= q->limit) {
- q->backlog += skb->len;
-do_enqueue:
- __skb_queue_tail(&sch->q, skb);
- sch->qstats.backlog += skb->len;
- sch->bstats.bytes += skb->len;
- sch->bstats.packets++;
- return 0;
- } else {
- q->pdrop++;
- }
+ if (gred_wred_mode(t))
+ gred_store_wred_set(t, q);
-drop:
- kfree_skb(skb);
- sch->qstats.drops++;
- return NET_XMIT_DROP;
- }
- if ((q->qave+qave) >= q->qth_max) {
- q->qcount = -1;
- sch->qstats.overlimits++;
- q->forced++;
- goto drop;
+ switch (red_action(&q->parms, q->parms.qavg + qavg)) {
+ case RED_DONT_MARK:
+ break;
+
+ case RED_PROB_MARK:
+ sch->qstats.overlimits++;
+ if (!gred_use_ecn(t) || !INET_ECN_set_ce(skb)) {
+ q->stats.prob_drop++;
+ goto congestion_drop;
+ }
+
+ q->stats.prob_mark++;
+ break;
+
+ case RED_HARD_MARK:
+ sch->qstats.overlimits++;
+ if (gred_use_harddrop(t) || !gred_use_ecn(t) ||
+ !INET_ECN_set_ce(skb)) {
+ q->stats.forced_drop++;
+ goto congestion_drop;
+ }
+ q->stats.forced_mark++;
+ break;
}
- if (++q->qcount) {
- if ((((qave+q->qave) - q->qth_min)>>q->Wlog)*q->qcount < q->qR)
- goto enqueue;
- q->qcount = 0;
- q->qR = net_random()&q->Rmask;
- sch->qstats.overlimits++;
- q->early++;
- goto drop;
+
+ if (q->backlog + skb->len <= q->limit) {
+ q->backlog += skb->len;
+ return qdisc_enqueue_tail(skb, sch);
}
- q->qR = net_random()&q->Rmask;
- goto enqueue;
+
+ q->stats.pdrop++;
+drop:
+ return qdisc_drop(skb, sch);
+
+congestion_drop:
+ qdisc_drop(skb, sch);
+ return NET_XMIT_CN;
}
-static int
-gred_requeue(struct sk_buff *skb, struct Qdisc* sch)
+static int gred_requeue(struct sk_buff *skb, struct Qdisc* sch)
{
+ struct gred_sched *t = qdisc_priv(sch);
struct gred_sched_data *q;
- struct gred_sched *t= qdisc_priv(sch);
- q= t->tab[(skb->tc_index&0xf)];
-/* error checking here -- probably unnecessary */
- PSCHED_SET_PASTPERFECT(q->qidlestart);
-
- __skb_queue_head(&sch->q, skb);
- sch->qstats.backlog += skb->len;
- sch->qstats.requeues++;
- q->backlog += skb->len;
- return 0;
+ u16 dp = tc_index_to_dp(skb);
+
+ if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
+ if (net_ratelimit())
+ printk(KERN_WARNING "GRED: Unable to relocate VQ 0x%x "
+ "for requeue, screwing up backlog.\n",
+ tc_index_to_dp(skb));
+ } else {
+ if (red_is_idling(&q->parms))
+ red_end_of_idle_period(&q->parms);
+ q->backlog += skb->len;
+ }
+
+ return qdisc_requeue(skb, sch);
}
-static struct sk_buff *
-gred_dequeue(struct Qdisc* sch)
+static struct sk_buff *gred_dequeue(struct Qdisc* sch)
{
struct sk_buff *skb;
- struct gred_sched_data *q;
- struct gred_sched *t= qdisc_priv(sch);
+ struct gred_sched *t = qdisc_priv(sch);
+
+ skb = qdisc_dequeue_head(sch);
- skb = __skb_dequeue(&sch->q);
if (skb) {
- sch->qstats.backlog -= skb->len;
- q= t->tab[(skb->tc_index&0xf)];
- if (q) {
- q->backlog -= skb->len;
- if (!q->backlog && !t->eqp)
- PSCHED_GET_TIME(q->qidlestart);
+ struct gred_sched_data *q;
+ u16 dp = tc_index_to_dp(skb);
+
+ if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
+ if (net_ratelimit())
+ printk(KERN_WARNING "GRED: Unable to relocate "
+ "VQ 0x%x after dequeue, screwing up "
+ "backlog.\n", tc_index_to_dp(skb));
} else {
- D2PRINTK("gred_dequeue: skb has bad tcindex %x\n",skb->tc_index&0xf);
+ q->backlog -= skb->len;
+
+ if (!q->backlog && !gred_wred_mode(t))
+ red_start_of_idle_period(&q->parms);
}
+
return skb;
}
- if (t->eqp) {
- q= t->tab[t->def];
- if (!q)
- D2PRINTK("no default VQ set: Results will be "
- "screwed up\n");
- else
- PSCHED_GET_TIME(q->qidlestart);
- }
+ if (gred_wred_mode(t) && !red_is_idling(&t->wred_set))
+ red_start_of_idle_period(&t->wred_set);
return NULL;
}
@@ -263,36 +297,34 @@ gred_dequeue(struct Qdisc* sch)
static unsigned int gred_drop(struct Qdisc* sch)
{
struct sk_buff *skb;
+ struct gred_sched *t = qdisc_priv(sch);
- struct gred_sched_data *q;
- struct gred_sched *t= qdisc_priv(sch);
-
- skb = __skb_dequeue_tail(&sch->q);
+ skb = qdisc_dequeue_tail(sch);
if (skb) {
unsigned int len = skb->len;
- sch->qstats.backlog -= len;
- sch->qstats.drops++;
- q= t->tab[(skb->tc_index&0xf)];
- if (q) {
- q->backlog -= len;
- q->other++;
- if (!q->backlog && !t->eqp)
- PSCHED_GET_TIME(q->qidlestart);
+ struct gred_sched_data *q;
+ u16 dp = tc_index_to_dp(skb);
+
+ if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
+ if (net_ratelimit())
+ printk(KERN_WARNING "GRED: Unable to relocate "
+ "VQ 0x%x while dropping, screwing up "
+ "backlog.\n", tc_index_to_dp(skb));
} else {
- D2PRINTK("gred_dequeue: skb has bad tcindex %x\n",skb->tc_index&0xf);
+ q->backlog -= len;
+ q->stats.other++;
+
+ if (!q->backlog && !gred_wred_mode(t))
+ red_start_of_idle_period(&q->parms);
}
- kfree_skb(skb);
+ qdisc_drop(skb, sch);
return len;
}
- q=t->tab[t->def];
- if (!q) {
- D2PRINTK("no default VQ set: Results might be screwed up\n");
- return 0;
- }
+ if (gred_wred_mode(t) && !red_is_idling(&t->wred_set))
+ red_start_of_idle_period(&t->wred_set);
- PSCHED_GET_TIME(q->qidlestart);
return 0;
}
@@ -300,293 +332,241 @@ static unsigned int gred_drop(struct Qdisc* sch)
static void gred_reset(struct Qdisc* sch)
{
int i;
- struct gred_sched_data *q;
- struct gred_sched *t= qdisc_priv(sch);
+ struct gred_sched *t = qdisc_priv(sch);
+
+ qdisc_reset_queue(sch);
- __skb_queue_purge(&sch->q);
+ for (i = 0; i < t->DPs; i++) {
+ struct gred_sched_data *q = t->tab[i];
- sch->qstats.backlog = 0;
+ if (!q)
+ continue;
- for (i=0;i<t->DPs;i++) {
- q= t->tab[i];
- if (!q)
- continue;
- PSCHED_SET_PASTPERFECT(q->qidlestart);
- q->qave = 0;
- q->qcount = -1;
+ red_restart(&q->parms);
q->backlog = 0;
- q->other=0;
- q->forced=0;
- q->pdrop=0;
- q->early=0;
}
}
-static int gred_change(struct Qdisc *sch, struct rtattr *opt)
+static inline void gred_destroy_vq(struct gred_sched_data *q)
+{
+ kfree(q);
+}
+
+static inline int gred_change_table_def(struct Qdisc *sch, struct rtattr *dps)
{
struct gred_sched *table = qdisc_priv(sch);
- struct gred_sched_data *q;
- struct tc_gred_qopt *ctl;
struct tc_gred_sopt *sopt;
- struct rtattr *tb[TCA_GRED_STAB];
- struct rtattr *tb2[TCA_GRED_DPS];
int i;
- if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_STAB, opt))
+ if (dps == NULL || RTA_PAYLOAD(dps) < sizeof(*sopt))
return -EINVAL;
- if (tb[TCA_GRED_PARMS-1] == 0 && tb[TCA_GRED_STAB-1] == 0) {
- rtattr_parse_nested(tb2, TCA_GRED_DPS, opt);
+ sopt = RTA_DATA(dps);
+
+ if (sopt->DPs > MAX_DPs || sopt->DPs == 0 || sopt->def_DP >= sopt->DPs)
+ return -EINVAL;
- if (tb2[TCA_GRED_DPS-1] == 0)
- return -EINVAL;
+ sch_tree_lock(sch);
+ table->DPs = sopt->DPs;
+ table->def = sopt->def_DP;
+ table->red_flags = sopt->flags;
+
+ /*
+ * Every entry point to GRED is synchronized with the above code
+ * and the DP is checked against DPs, i.e. shadowed VQs can no
+ * longer be found so we can unlock right here.
+ */
+ sch_tree_unlock(sch);
+
+ if (sopt->grio) {
+ gred_enable_rio_mode(table);
+ gred_disable_wred_mode(table);
+ if (gred_wred_mode_check(sch))
+ gred_enable_wred_mode(table);
+ } else {
+ gred_disable_rio_mode(table);
+ gred_disable_wred_mode(table);
+ }
- sopt = RTA_DATA(tb2[TCA_GRED_DPS-1]);
- table->DPs=sopt->DPs;
- table->def=sopt->def_DP;
- table->grio=sopt->grio;
- table->initd=0;
- /* probably need to clear all the table DP entries as well */
- return 0;
- }
+ for (i = table->DPs; i < MAX_DPs; i++) {
+ if (table->tab[i]) {
+ printk(KERN_WARNING "GRED: Warning: Destroying "
+ "shadowed VQ 0x%x\n", i);
+ gred_destroy_vq(table->tab[i]);
+ table->tab[i] = NULL;
+ }
+ }
+ return 0;
+}
- if (!table->DPs || tb[TCA_GRED_PARMS-1] == 0 || tb[TCA_GRED_STAB-1] == 0 ||
- RTA_PAYLOAD(tb[TCA_GRED_PARMS-1]) < sizeof(*ctl) ||
- RTA_PAYLOAD(tb[TCA_GRED_STAB-1]) < 256)
- return -EINVAL;
+static inline int gred_change_vq(struct Qdisc *sch, int dp,
+ struct tc_gred_qopt *ctl, int prio, u8 *stab)
+{
+ struct gred_sched *table = qdisc_priv(sch);
+ struct gred_sched_data *q;
- ctl = RTA_DATA(tb[TCA_GRED_PARMS-1]);
- if (ctl->DP > MAX_DPs-1 ) {
- /* misbehaving is punished! Put in the default drop probability */
- DPRINTK("\nGRED: DP %u not in the proper range fixed. New DP "
- "set to default at %d\n",ctl->DP,table->def);
- ctl->DP=table->def;
- }
-
- if (table->tab[ctl->DP] == NULL) {
- table->tab[ctl->DP]=kmalloc(sizeof(struct gred_sched_data),
- GFP_KERNEL);
- if (NULL == table->tab[ctl->DP])
+ if (table->tab[dp] == NULL) {
+ table->tab[dp] = kmalloc(sizeof(*q), GFP_KERNEL);
+ if (table->tab[dp] == NULL)
return -ENOMEM;
- memset(table->tab[ctl->DP], 0, (sizeof(struct gred_sched_data)));
- }
- q= table->tab[ctl->DP];
-
- if (table->grio) {
- if (ctl->prio <=0) {
- if (table->def && table->tab[table->def]) {
- DPRINTK("\nGRED: DP %u does not have a prio"
- "setting default to %d\n",ctl->DP,
- table->tab[table->def]->prio);
- q->prio=table->tab[table->def]->prio;
- } else {
- DPRINTK("\nGRED: DP %u does not have a prio"
- " setting default to 8\n",ctl->DP);
- q->prio=8;
- }
- } else {
- q->prio=ctl->prio;
- }
- } else {
- q->prio=8;
+ memset(table->tab[dp], 0, sizeof(*q));
}
-
- q->DP=ctl->DP;
- q->Wlog = ctl->Wlog;
- q->Plog = ctl->Plog;
+ q = table->tab[dp];
+ q->DP = dp;
+ q->prio = prio;
q->limit = ctl->limit;
- q->Scell_log = ctl->Scell_log;
- q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL;
- q->Scell_max = (255<<q->Scell_log);
- q->qth_min = ctl->qth_min<<ctl->Wlog;
- q->qth_max = ctl->qth_max<<ctl->Wlog;
- q->qave=0;
- q->backlog=0;
- q->qcount = -1;
- q->other=0;
- q->forced=0;
- q->pdrop=0;
- q->early=0;
-
- PSCHED_SET_PASTPERFECT(q->qidlestart);
- memcpy(q->Stab, RTA_DATA(tb[TCA_GRED_STAB-1]), 256);
-
- if ( table->initd && table->grio) {
- /* this looks ugly but it's not in the fast path */
- for (i=0;i<table->DPs;i++) {
- if ((!table->tab[i]) || (i==q->DP) )
- continue;
- if (table->tab[i]->prio == q->prio ){
- /* WRED mode detected */
- table->eqp=1;
- break;
- }
- }
- }
- if (!table->initd) {
- table->initd=1;
- /*
- the first entry also goes into the default until
- over-written
- */
-
- if (table->tab[table->def] == NULL) {
- table->tab[table->def]=
- kmalloc(sizeof(struct gred_sched_data), GFP_KERNEL);
- if (NULL == table->tab[table->def])
- return -ENOMEM;
-
- memset(table->tab[table->def], 0,
- (sizeof(struct gred_sched_data)));
- }
- q= table->tab[table->def];
- q->DP=table->def;
- q->Wlog = ctl->Wlog;
- q->Plog = ctl->Plog;
- q->limit = ctl->limit;
- q->Scell_log = ctl->Scell_log;
- q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL;
- q->Scell_max = (255<<q->Scell_log);
- q->qth_min = ctl->qth_min<<ctl->Wlog;
- q->qth_max = ctl->qth_max<<ctl->Wlog;
-
- if (table->grio)
- q->prio=table->tab[ctl->DP]->prio;
- else
- q->prio=8;
-
- q->qcount = -1;
- PSCHED_SET_PASTPERFECT(q->qidlestart);
- memcpy(q->Stab, RTA_DATA(tb[TCA_GRED_STAB-1]), 256);
- }
- return 0;
+ if (q->backlog == 0)
+ red_end_of_idle_period(&q->parms);
+ red_set_parms(&q->parms,
+ ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Plog,
+ ctl->Scell_log, stab);
+
+ return 0;
}
-static int gred_init(struct Qdisc *sch, struct rtattr *opt)
+static int gred_change(struct Qdisc *sch, struct rtattr *opt)
{
struct gred_sched *table = qdisc_priv(sch);
- struct tc_gred_sopt *sopt;
- struct rtattr *tb[TCA_GRED_STAB];
- struct rtattr *tb2[TCA_GRED_DPS];
+ struct tc_gred_qopt *ctl;
+ struct rtattr *tb[TCA_GRED_MAX];
+ int err = -EINVAL, prio = GRED_DEF_PRIO;
+ u8 *stab;
- if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_STAB, opt))
+ if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_MAX, opt))
return -EINVAL;
- if (tb[TCA_GRED_PARMS-1] == 0 && tb[TCA_GRED_STAB-1] == 0) {
- rtattr_parse_nested(tb2, TCA_GRED_DPS, opt);
+ if (tb[TCA_GRED_PARMS-1] == NULL && tb[TCA_GRED_STAB-1] == NULL)
+ return gred_change_table_def(sch, opt);
+
+ if (tb[TCA_GRED_PARMS-1] == NULL ||
+ RTA_PAYLOAD(tb[TCA_GRED_PARMS-1]) < sizeof(*ctl) ||
+ tb[TCA_GRED_STAB-1] == NULL ||
+ RTA_PAYLOAD(tb[TCA_GRED_STAB-1]) < 256)
+ return -EINVAL;
+
+ ctl = RTA_DATA(tb[TCA_GRED_PARMS-1]);
+ stab = RTA_DATA(tb[TCA_GRED_STAB-1]);
+
+ if (ctl->DP >= table->DPs)
+ goto errout;
- if (tb2[TCA_GRED_DPS-1] == 0)
- return -EINVAL;
+ if (gred_rio_mode(table)) {
+ if (ctl->prio == 0) {
+ int def_prio = GRED_DEF_PRIO;
- sopt = RTA_DATA(tb2[TCA_GRED_DPS-1]);
- table->DPs=sopt->DPs;
- table->def=sopt->def_DP;
- table->grio=sopt->grio;
- table->initd=0;
- return 0;
+ if (table->tab[table->def])
+ def_prio = table->tab[table->def]->prio;
+
+ printk(KERN_DEBUG "GRED: DP %u does not have a prio "
+ "setting default to %d\n", ctl->DP, def_prio);
+
+ prio = def_prio;
+ } else
+ prio = ctl->prio;
+ }
+
+ sch_tree_lock(sch);
+
+ err = gred_change_vq(sch, ctl->DP, ctl, prio, stab);
+ if (err < 0)
+ goto errout_locked;
+
+ if (gred_rio_mode(table)) {
+ gred_disable_wred_mode(table);
+ if (gred_wred_mode_check(sch))
+ gred_enable_wred_mode(table);
}
- DPRINTK("\n GRED_INIT error!\n");
- return -EINVAL;
+ err = 0;
+
+errout_locked:
+ sch_tree_unlock(sch);
+errout:
+ return err;
}
-static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
+static int gred_init(struct Qdisc *sch, struct rtattr *opt)
{
- unsigned long qave;
- struct rtattr *rta;
- struct tc_gred_qopt *opt = NULL ;
- struct tc_gred_qopt *dst;
- struct gred_sched *table = qdisc_priv(sch);
- struct gred_sched_data *q;
- int i;
- unsigned char *b = skb->tail;
+ struct rtattr *tb[TCA_GRED_MAX];
- rta = (struct rtattr*)b;
- RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
+ if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_MAX, opt))
+ return -EINVAL;
- opt=kmalloc(sizeof(struct tc_gred_qopt)*MAX_DPs, GFP_KERNEL);
+ if (tb[TCA_GRED_PARMS-1] || tb[TCA_GRED_STAB-1])
+ return -EINVAL;
- if (opt == NULL) {
- DPRINTK("gred_dump:failed to malloc for %Zd\n",
- sizeof(struct tc_gred_qopt)*MAX_DPs);
- goto rtattr_failure;
- }
+ return gred_change_table_def(sch, tb[TCA_GRED_DPS-1]);
+}
- memset(opt, 0, (sizeof(struct tc_gred_qopt))*table->DPs);
+static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct gred_sched *table = qdisc_priv(sch);
+ struct rtattr *parms, *opts = NULL;
+ int i;
+ struct tc_gred_sopt sopt = {
+ .DPs = table->DPs,
+ .def_DP = table->def,
+ .grio = gred_rio_mode(table),
+ .flags = table->red_flags,
+ };
- if (!table->initd) {
- DPRINTK("NO GRED Queues setup!\n");
- }
+ opts = RTA_NEST(skb, TCA_OPTIONS);
+ RTA_PUT(skb, TCA_GRED_DPS, sizeof(sopt), &sopt);
+ parms = RTA_NEST(skb, TCA_GRED_PARMS);
+
+ for (i = 0; i < MAX_DPs; i++) {
+ struct gred_sched_data *q = table->tab[i];
+ struct tc_gred_qopt opt;
- for (i=0;i<MAX_DPs;i++) {
- dst= &opt[i];
- q= table->tab[i];
+ memset(&opt, 0, sizeof(opt));
if (!q) {
/* hack -- fix at some point with proper message
This is how we indicate to tc that there is no VQ
at this DP */
- dst->DP=MAX_DPs+i;
- continue;
+ opt.DP = MAX_DPs + i;
+ goto append_opt;
}
- dst->limit=q->limit;
- dst->qth_min=q->qth_min>>q->Wlog;
- dst->qth_max=q->qth_max>>q->Wlog;
- dst->DP=q->DP;
- dst->backlog=q->backlog;
- if (q->qave) {
- if (table->eqp && table->grio) {
- q->qidlestart=table->tab[table->def]->qidlestart;
- q->qave=table->tab[table->def]->qave;
- }
- if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) {
- long idle;
- psched_time_t now;
- PSCHED_GET_TIME(now);
- idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max);
- qave = q->qave >> q->Stab[(idle>>q->Scell_log)&0xFF];
- dst->qave = qave >> q->Wlog;
-
- } else {
- dst->qave = q->qave >> q->Wlog;
- }
- } else {
- dst->qave = 0;
+ opt.limit = q->limit;
+ opt.DP = q->DP;
+ opt.backlog = q->backlog;
+ opt.prio = q->prio;
+ opt.qth_min = q->parms.qth_min >> q->parms.Wlog;
+ opt.qth_max = q->parms.qth_max >> q->parms.Wlog;
+ opt.Wlog = q->parms.Wlog;
+ opt.Plog = q->parms.Plog;
+ opt.Scell_log = q->parms.Scell_log;
+ opt.other = q->stats.other;
+ opt.early = q->stats.prob_drop;
+ opt.forced = q->stats.forced_drop;
+ opt.pdrop = q->stats.pdrop;
+ opt.packets = q->packetsin;
+ opt.bytesin = q->bytesin;
+
+ if (gred_wred_mode(table)) {
+ q->parms.qidlestart =
+ table->tab[table->def]->parms.qidlestart;
+ q->parms.qavg = table->tab[table->def]->parms.qavg;
}
-
-
- dst->Wlog = q->Wlog;
- dst->Plog = q->Plog;
- dst->Scell_log = q->Scell_log;
- dst->other = q->other;
- dst->forced = q->forced;
- dst->early = q->early;
- dst->pdrop = q->pdrop;
- dst->prio = q->prio;
- dst->packets=q->packetsin;
- dst->bytesin=q->bytesin;
+
+ opt.qave = red_calc_qavg(&q->parms, q->parms.qavg);
+
+append_opt:
+ RTA_APPEND(skb, sizeof(opt), &opt);
}
- RTA_PUT(skb, TCA_GRED_PARMS, sizeof(struct tc_gred_qopt)*MAX_DPs, opt);
- rta->rta_len = skb->tail - b;
+ RTA_NEST_END(skb, parms);
- kfree(opt);
- return skb->len;
+ return RTA_NEST_END(skb, opts);
rtattr_failure:
- if (opt)
- kfree(opt);
- DPRINTK("gred_dump: FAILURE!!!!\n");
-
-/* also free the opt struct here */
- skb_trim(skb, b - skb->data);
- return -1;
+ return RTA_NEST_CANCEL(skb, opts);
}
static void gred_destroy(struct Qdisc *sch)
@@ -594,15 +574,13 @@ static void gred_destroy(struct Qdisc *sch)
struct gred_sched *table = qdisc_priv(sch);
int i;
- for (i = 0;i < table->DPs; i++) {
+ for (i = 0; i < table->DPs; i++) {
if (table->tab[i])
- kfree(table->tab[i]);
+ gred_destroy_vq(table->tab[i]);
}
}
static struct Qdisc_ops gred_qdisc_ops = {
- .next = NULL,
- .cl_ops = NULL,
.id = "gred",
.priv_size = sizeof(struct gred_sched),
.enqueue = gred_enqueue,
@@ -621,10 +599,13 @@ static int __init gred_module_init(void)
{
return register_qdisc(&gred_qdisc_ops);
}
-static void __exit gred_module_exit(void)
+
+static void __exit gred_module_exit(void)
{
unregister_qdisc(&gred_qdisc_ops);
}
+
module_init(gred_module_init)
module_exit(gred_module_exit)
+
MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index bb9bf8d5003..cdc8d283791 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -25,6 +25,8 @@
#include <net/pkt_sched.h>
+#define VERSION "1.1"
+
/* Network Emulation Queuing algorithm.
====================================
@@ -185,10 +187,13 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
|| q->counter < q->gap /* inside last reordering gap */
|| q->reorder < get_crandom(&q->reorder_cor)) {
psched_time_t now;
+ psched_tdiff_t delay;
+
+ delay = tabledist(q->latency, q->jitter,
+ &q->delay_cor, q->delay_dist);
+
PSCHED_GET_TIME(now);
- PSCHED_TADD2(now, tabledist(q->latency, q->jitter,
- &q->delay_cor, q->delay_dist),
- cb->time_to_send);
+ PSCHED_TADD2(now, delay, cb->time_to_send);
++q->counter;
ret = q->qdisc->enqueue(skb, q->qdisc);
} else {
@@ -248,24 +253,31 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
const struct netem_skb_cb *cb
= (const struct netem_skb_cb *)skb->cb;
psched_time_t now;
- long delay;
/* if more time remaining? */
PSCHED_GET_TIME(now);
- delay = PSCHED_US2JIFFIE(PSCHED_TDIFF(cb->time_to_send, now));
- pr_debug("netem_run: skb=%p delay=%ld\n", skb, delay);
- if (delay <= 0) {
+
+ if (PSCHED_TLESS(cb->time_to_send, now)) {
pr_debug("netem_dequeue: return skb=%p\n", skb);
sch->q.qlen--;
sch->flags &= ~TCQ_F_THROTTLED;
return skb;
- }
+ } else {
+ psched_tdiff_t delay = PSCHED_TDIFF(cb->time_to_send, now);
+
+ if (q->qdisc->ops->requeue(skb, q->qdisc) != NET_XMIT_SUCCESS) {
+ sch->qstats.drops++;
- mod_timer(&q->timer, jiffies + delay);
- sch->flags |= TCQ_F_THROTTLED;
+ /* After this qlen is confused */
+ printk(KERN_ERR "netem: queue discpline %s could not requeue\n",
+ q->qdisc->ops->id);
- if (q->qdisc->ops->requeue(skb, q->qdisc) != 0)
- sch->qstats.drops++;
+ sch->q.qlen--;
+ }
+
+ mod_timer(&q->timer, jiffies + PSCHED_US2JIFFIE(delay));
+ sch->flags |= TCQ_F_THROTTLED;
+ }
}
return NULL;
@@ -290,11 +302,16 @@ static void netem_reset(struct Qdisc *sch)
del_timer_sync(&q->timer);
}
+/* Pass size change message down to embedded FIFO */
static int set_fifo_limit(struct Qdisc *q, int limit)
{
struct rtattr *rta;
int ret = -ENOMEM;
+ /* Hack to avoid sending change message to non-FIFO */
+ if (strncmp(q->ops->id + 1, "fifo", 4) != 0)
+ return 0;
+
rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL);
if (rta) {
rta->rta_type = RTM_NEWQDISC;
@@ -426,6 +443,84 @@ static int netem_change(struct Qdisc *sch, struct rtattr *opt)
return 0;
}
+/*
+ * Special case version of FIFO queue for use by netem.
+ * It queues in order based on timestamps in skb's
+ */
+struct fifo_sched_data {
+ u32 limit;
+};
+
+static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
+{
+ struct fifo_sched_data *q = qdisc_priv(sch);
+ struct sk_buff_head *list = &sch->q;
+ const struct netem_skb_cb *ncb
+ = (const struct netem_skb_cb *)nskb->cb;
+ struct sk_buff *skb;
+
+ if (likely(skb_queue_len(list) < q->limit)) {
+ skb_queue_reverse_walk(list, skb) {
+ const struct netem_skb_cb *cb
+ = (const struct netem_skb_cb *)skb->cb;
+
+ if (PSCHED_TLESS(cb->time_to_send, ncb->time_to_send))
+ break;
+ }
+
+ __skb_queue_after(list, skb, nskb);
+
+ sch->qstats.backlog += nskb->len;
+ sch->bstats.bytes += nskb->len;
+ sch->bstats.packets++;
+
+ return NET_XMIT_SUCCESS;
+ }
+
+ return qdisc_drop(nskb, sch);
+}
+
+static int tfifo_init(struct Qdisc *sch, struct rtattr *opt)
+{
+ struct fifo_sched_data *q = qdisc_priv(sch);
+
+ if (opt) {
+ struct tc_fifo_qopt *ctl = RTA_DATA(opt);
+ if (RTA_PAYLOAD(opt) < sizeof(*ctl))
+ return -EINVAL;
+
+ q->limit = ctl->limit;
+ } else
+ q->limit = max_t(u32, sch->dev->tx_queue_len, 1);
+
+ return 0;
+}
+
+static int tfifo_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct fifo_sched_data *q = qdisc_priv(sch);
+ struct tc_fifo_qopt opt = { .limit = q->limit };
+
+ RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
+ return skb->len;
+
+rtattr_failure:
+ return -1;
+}
+
+static struct Qdisc_ops tfifo_qdisc_ops = {
+ .id = "tfifo",
+ .priv_size = sizeof(struct fifo_sched_data),
+ .enqueue = tfifo_enqueue,
+ .dequeue = qdisc_dequeue_head,
+ .requeue = qdisc_requeue,
+ .drop = qdisc_queue_drop,
+ .init = tfifo_init,
+ .reset = qdisc_reset_queue,
+ .change = tfifo_init,
+ .dump = tfifo_dump,
+};
+
static int netem_init(struct Qdisc *sch, struct rtattr *opt)
{
struct netem_sched_data *q = qdisc_priv(sch);
@@ -438,7 +533,7 @@ static int netem_init(struct Qdisc *sch, struct rtattr *opt)
q->timer.function = netem_watchdog;
q->timer.data = (unsigned long) sch;
- q->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
+ q->qdisc = qdisc_create_dflt(sch->dev, &tfifo_qdisc_ops);
if (!q->qdisc) {
pr_debug("netem: qdisc create failed\n");
return -ENOMEM;
@@ -601,6 +696,7 @@ static struct Qdisc_ops netem_qdisc_ops = {
static int __init netem_module_init(void)
{
+ pr_info("netem: version " VERSION "\n");
return register_qdisc(&netem_qdisc_ops);
}
static void __exit netem_module_exit(void)
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 7845d045eec..dccfa44c2d7 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -9,76 +9,23 @@
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
* Changes:
- * J Hadi Salim <hadi@nortel.com> 980914: computation fixes
+ * J Hadi Salim 980914: computation fixes
* Alexey Makarenko <makar@phoenix.kharkov.ua> 990814: qave on idle link was calculated incorrectly.
- * J Hadi Salim <hadi@nortelnetworks.com> 980816: ECN support
+ * J Hadi Salim 980816: ECN support
*/
#include <linux/config.h>
#include <linux/module.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/string.h>
-#include <linux/mm.h>
-#include <linux/socket.h>
-#include <linux/sockios.h>
-#include <linux/in.h>
-#include <linux/errno.h>
-#include <linux/interrupt.h>
-#include <linux/if_ether.h>
-#include <linux/inet.h>
#include <linux/netdevice.h>
-#include <linux/etherdevice.h>
-#include <linux/notifier.h>
-#include <net/ip.h>
-#include <net/route.h>
#include <linux/skbuff.h>
-#include <net/sock.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>
-#include <net/dsfield.h>
+#include <net/red.h>
-/* Random Early Detection (RED) algorithm.
- =======================================
-
- Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways
- for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking.
-
- This file codes a "divisionless" version of RED algorithm
- as written down in Fig.17 of the paper.
-
-Short description.
-------------------
-
- When a new packet arrives we calculate the average queue length:
-
- avg = (1-W)*avg + W*current_queue_len,
-
- W is the filter time constant (chosen as 2^(-Wlog)), it controls
- the inertia of the algorithm. To allow larger bursts, W should be
- decreased.
-
- if (avg > th_max) -> packet marked (dropped).
- if (avg < th_min) -> packet passes.
- if (th_min < avg < th_max) we calculate probability:
-
- Pb = max_P * (avg - th_min)/(th_max-th_min)
-
- and mark (drop) packet with this probability.
- Pb changes from 0 (at avg==th_min) to max_P (avg==th_max).
- max_P should be small (not 1), usually 0.01..0.02 is good value.
-
- max_P is chosen as a number, so that max_P/(th_max-th_min)
- is a negative power of two in order arithmetics to contain
- only shifts.
-
-
- Parameters, settable by user:
+/* Parameters, settable by user:
-----------------------------
limit - bytes (must be > qth_max + burst)
@@ -89,243 +36,93 @@ Short description.
arbitrarily high (well, less than ram size)
Really, this limit will never be reached
if RED works correctly.
-
- qth_min - bytes (should be < qth_max/2)
- qth_max - bytes (should be at least 2*qth_min and less limit)
- Wlog - bits (<32) log(1/W).
- Plog - bits (<32)
-
- Plog is related to max_P by formula:
-
- max_P = (qth_max-qth_min)/2^Plog;
-
- F.e. if qth_max=128K and qth_min=32K, then Plog=22
- corresponds to max_P=0.02
-
- Scell_log
- Stab
-
- Lookup table for log((1-W)^(t/t_ave).
-
-
-NOTES:
-
-Upper bound on W.
------------------
-
- If you want to allow bursts of L packets of size S,
- you should choose W:
-
- L + 1 - th_min/S < (1-(1-W)^L)/W
-
- th_min/S = 32 th_min/S = 4
-
- log(W) L
- -1 33
- -2 35
- -3 39
- -4 46
- -5 57
- -6 75
- -7 101
- -8 135
- -9 190
- etc.
*/
struct red_sched_data
{
-/* Parameters */
- u32 limit; /* HARD maximal queue length */
- u32 qth_min; /* Min average length threshold: A scaled */
- u32 qth_max; /* Max average length threshold: A scaled */
- u32 Rmask;
- u32 Scell_max;
- unsigned char flags;
- char Wlog; /* log(W) */
- char Plog; /* random number bits */
- char Scell_log;
- u8 Stab[256];
-
-/* Variables */
- unsigned long qave; /* Average queue length: A scaled */
- int qcount; /* Packets since last random number generation */
- u32 qR; /* Cached random number */
-
- psched_time_t qidlestart; /* Start of idle period */
- struct tc_red_xstats st;
+ u32 limit; /* HARD maximal queue length */
+ unsigned char flags;
+ struct red_parms parms;
+ struct red_stats stats;
};
-static int red_ecn_mark(struct sk_buff *skb)
+static inline int red_use_ecn(struct red_sched_data *q)
{
- if (skb->nh.raw + 20 > skb->tail)
- return 0;
-
- switch (skb->protocol) {
- case __constant_htons(ETH_P_IP):
- if (INET_ECN_is_not_ect(skb->nh.iph->tos))
- return 0;
- IP_ECN_set_ce(skb->nh.iph);
- return 1;
- case __constant_htons(ETH_P_IPV6):
- if (INET_ECN_is_not_ect(ipv6_get_dsfield(skb->nh.ipv6h)))
- return 0;
- IP6_ECN_set_ce(skb->nh.ipv6h);
- return 1;
- default:
- return 0;
- }
+ return q->flags & TC_RED_ECN;
}
-static int
-red_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+static inline int red_use_harddrop(struct red_sched_data *q)
+{
+ return q->flags & TC_RED_HARDDROP;
+}
+
+static int red_enqueue(struct sk_buff *skb, struct Qdisc* sch)
{
struct red_sched_data *q = qdisc_priv(sch);
- psched_time_t now;
+ q->parms.qavg = red_calc_qavg(&q->parms, sch->qstats.backlog);
- if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) {
- long us_idle;
- int shift;
+ if (red_is_idling(&q->parms))
+ red_end_of_idle_period(&q->parms);
- PSCHED_GET_TIME(now);
- us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max);
- PSCHED_SET_PASTPERFECT(q->qidlestart);
+ switch (red_action(&q->parms, q->parms.qavg)) {
+ case RED_DONT_MARK:
+ break;
-/*
- The problem: ideally, average length queue recalcultion should
- be done over constant clock intervals. This is too expensive, so that
- the calculation is driven by outgoing packets.
- When the queue is idle we have to model this clock by hand.
-
- SF+VJ proposed to "generate" m = idletime/(average_pkt_size/bandwidth)
- dummy packets as a burst after idle time, i.e.
-
- q->qave *= (1-W)^m
-
- This is an apparently overcomplicated solution (f.e. we have to precompute
- a table to make this calculation in reasonable time)
- I believe that a simpler model may be used here,
- but it is field for experiments.
-*/
- shift = q->Stab[us_idle>>q->Scell_log];
-
- if (shift) {
- q->qave >>= shift;
- } else {
- /* Approximate initial part of exponent
- with linear function:
- (1-W)^m ~= 1-mW + ...
-
- Seems, it is the best solution to
- problem of too coarce exponent tabulation.
- */
-
- us_idle = (q->qave * us_idle)>>q->Scell_log;
- if (us_idle < q->qave/2)
- q->qave -= us_idle;
- else
- q->qave >>= 1;
- }
- } else {
- q->qave += sch->qstats.backlog - (q->qave >> q->Wlog);
- /* NOTE:
- q->qave is fixed point number with point at Wlog.
- The formulae above is equvalent to floating point
- version:
-
- qave = qave*(1-W) + sch->qstats.backlog*W;
- --ANK (980924)
- */
- }
+ case RED_PROB_MARK:
+ sch->qstats.overlimits++;
+ if (!red_use_ecn(q) || !INET_ECN_set_ce(skb)) {
+ q->stats.prob_drop++;
+ goto congestion_drop;
+ }
- if (q->qave < q->qth_min) {
- q->qcount = -1;
-enqueue:
- if (sch->qstats.backlog + skb->len <= q->limit) {
- __skb_queue_tail(&sch->q, skb);
- sch->qstats.backlog += skb->len;
- sch->bstats.bytes += skb->len;
- sch->bstats.packets++;
- return NET_XMIT_SUCCESS;
- } else {
- q->st.pdrop++;
- }
- kfree_skb(skb);
- sch->qstats.drops++;
- return NET_XMIT_DROP;
- }
- if (q->qave >= q->qth_max) {
- q->qcount = -1;
- sch->qstats.overlimits++;
-mark:
- if (!(q->flags&TC_RED_ECN) || !red_ecn_mark(skb)) {
- q->st.early++;
- goto drop;
- }
- q->st.marked++;
- goto enqueue;
- }
+ q->stats.prob_mark++;
+ break;
+
+ case RED_HARD_MARK:
+ sch->qstats.overlimits++;
+ if (red_use_harddrop(q) || !red_use_ecn(q) ||
+ !INET_ECN_set_ce(skb)) {
+ q->stats.forced_drop++;
+ goto congestion_drop;
+ }
- if (++q->qcount) {
- /* The formula used below causes questions.
-
- OK. qR is random number in the interval 0..Rmask
- i.e. 0..(2^Plog). If we used floating point
- arithmetics, it would be: (2^Plog)*rnd_num,
- where rnd_num is less 1.
-
- Taking into account, that qave have fixed
- point at Wlog, and Plog is related to max_P by
- max_P = (qth_max-qth_min)/2^Plog; two lines
- below have the following floating point equivalent:
-
- max_P*(qave - qth_min)/(qth_max-qth_min) < rnd/qcount
-
- Any questions? --ANK (980924)
- */
- if (((q->qave - q->qth_min)>>q->Wlog)*q->qcount < q->qR)
- goto enqueue;
- q->qcount = 0;
- q->qR = net_random()&q->Rmask;
- sch->qstats.overlimits++;
- goto mark;
+ q->stats.forced_mark++;
+ break;
}
- q->qR = net_random()&q->Rmask;
- goto enqueue;
-drop:
- kfree_skb(skb);
- sch->qstats.drops++;
+ if (sch->qstats.backlog + skb->len <= q->limit)
+ return qdisc_enqueue_tail(skb, sch);
+
+ q->stats.pdrop++;
+ return qdisc_drop(skb, sch);
+
+congestion_drop:
+ qdisc_drop(skb, sch);
return NET_XMIT_CN;
}
-static int
-red_requeue(struct sk_buff *skb, struct Qdisc* sch)
+static int red_requeue(struct sk_buff *skb, struct Qdisc* sch)
{
struct red_sched_data *q = qdisc_priv(sch);
- PSCHED_SET_PASTPERFECT(q->qidlestart);
+ if (red_is_idling(&q->parms))
+ red_end_of_idle_period(&q->parms);
- __skb_queue_head(&sch->q, skb);
- sch->qstats.backlog += skb->len;
- sch->qstats.requeues++;
- return 0;
+ return qdisc_requeue(skb, sch);
}
-static struct sk_buff *
-red_dequeue(struct Qdisc* sch)
+static struct sk_buff * red_dequeue(struct Qdisc* sch)
{
struct sk_buff *skb;
struct red_sched_data *q = qdisc_priv(sch);
- skb = __skb_dequeue(&sch->q);
- if (skb) {
- sch->qstats.backlog -= skb->len;
- return skb;
- }
- PSCHED_GET_TIME(q->qidlestart);
- return NULL;
+ skb = qdisc_dequeue_head(sch);
+
+ if (skb == NULL && !red_is_idling(&q->parms))
+ red_start_of_idle_period(&q->parms);
+
+ return skb;
}
static unsigned int red_drop(struct Qdisc* sch)
@@ -333,16 +130,17 @@ static unsigned int red_drop(struct Qdisc* sch)
struct sk_buff *skb;
struct red_sched_data *q = qdisc_priv(sch);
- skb = __skb_dequeue_tail(&sch->q);
+ skb = qdisc_dequeue_tail(sch);
if (skb) {
unsigned int len = skb->len;
- sch->qstats.backlog -= len;
- sch->qstats.drops++;
- q->st.other++;
- kfree_skb(skb);
+ q->stats.other++;
+ qdisc_drop(skb, sch);
return len;
}
- PSCHED_GET_TIME(q->qidlestart);
+
+ if (!red_is_idling(&q->parms))
+ red_start_of_idle_period(&q->parms);
+
return 0;
}
@@ -350,43 +148,38 @@ static void red_reset(struct Qdisc* sch)
{
struct red_sched_data *q = qdisc_priv(sch);
- __skb_queue_purge(&sch->q);
- sch->qstats.backlog = 0;
- PSCHED_SET_PASTPERFECT(q->qidlestart);
- q->qave = 0;
- q->qcount = -1;
+ qdisc_reset_queue(sch);
+ red_restart(&q->parms);
}
static int red_change(struct Qdisc *sch, struct rtattr *opt)
{
struct red_sched_data *q = qdisc_priv(sch);
- struct rtattr *tb[TCA_RED_STAB];
+ struct rtattr *tb[TCA_RED_MAX];
struct tc_red_qopt *ctl;
- if (opt == NULL ||
- rtattr_parse_nested(tb, TCA_RED_STAB, opt) ||
- tb[TCA_RED_PARMS-1] == 0 || tb[TCA_RED_STAB-1] == 0 ||
+ if (opt == NULL || rtattr_parse_nested(tb, TCA_RED_MAX, opt))
+ return -EINVAL;
+
+ if (tb[TCA_RED_PARMS-1] == NULL ||
RTA_PAYLOAD(tb[TCA_RED_PARMS-1]) < sizeof(*ctl) ||
- RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < 256)
+ tb[TCA_RED_STAB-1] == NULL ||
+ RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < RED_STAB_SIZE)
return -EINVAL;
ctl = RTA_DATA(tb[TCA_RED_PARMS-1]);
sch_tree_lock(sch);
q->flags = ctl->flags;
- q->Wlog = ctl->Wlog;
- q->Plog = ctl->Plog;
- q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL;
- q->Scell_log = ctl->Scell_log;
- q->Scell_max = (255<<q->Scell_log);
- q->qth_min = ctl->qth_min<<ctl->Wlog;
- q->qth_max = ctl->qth_max<<ctl->Wlog;
q->limit = ctl->limit;
- memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256);
- q->qcount = -1;
+ red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
+ ctl->Plog, ctl->Scell_log,
+ RTA_DATA(tb[TCA_RED_STAB-1]));
+
if (skb_queue_empty(&sch->q))
- PSCHED_SET_PASTPERFECT(q->qidlestart);
+ red_end_of_idle_period(&q->parms);
+
sch_tree_unlock(sch);
return 0;
}
@@ -399,39 +192,39 @@ static int red_init(struct Qdisc* sch, struct rtattr *opt)
static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct red_sched_data *q = qdisc_priv(sch);
- unsigned char *b = skb->tail;
- struct rtattr *rta;
- struct tc_red_qopt opt;
-
- rta = (struct rtattr*)b;
- RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
- opt.limit = q->limit;
- opt.qth_min = q->qth_min>>q->Wlog;
- opt.qth_max = q->qth_max>>q->Wlog;
- opt.Wlog = q->Wlog;
- opt.Plog = q->Plog;
- opt.Scell_log = q->Scell_log;
- opt.flags = q->flags;
+ struct rtattr *opts = NULL;
+ struct tc_red_qopt opt = {
+ .limit = q->limit,
+ .flags = q->flags,
+ .qth_min = q->parms.qth_min >> q->parms.Wlog,
+ .qth_max = q->parms.qth_max >> q->parms.Wlog,
+ .Wlog = q->parms.Wlog,
+ .Plog = q->parms.Plog,
+ .Scell_log = q->parms.Scell_log,
+ };
+
+ opts = RTA_NEST(skb, TCA_OPTIONS);
RTA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt);
- rta->rta_len = skb->tail - b;
-
- return skb->len;
+ return RTA_NEST_END(skb, opts);
rtattr_failure:
- skb_trim(skb, b - skb->data);
- return -1;
+ return RTA_NEST_CANCEL(skb, opts);
}
static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
struct red_sched_data *q = qdisc_priv(sch);
-
- return gnet_stats_copy_app(d, &q->st, sizeof(q->st));
+ struct tc_red_xstats st = {
+ .early = q->stats.prob_drop + q->stats.forced_drop,
+ .pdrop = q->stats.pdrop,
+ .other = q->stats.other,
+ .marked = q->stats.prob_mark + q->stats.forced_mark,
+ };
+
+ return gnet_stats_copy_app(d, &st, sizeof(st));
}
static struct Qdisc_ops red_qdisc_ops = {
- .next = NULL,
- .cl_ops = NULL,
.id = "red",
.priv_size = sizeof(struct red_sched_data),
.enqueue = red_enqueue,
@@ -450,10 +243,13 @@ static int __init red_module_init(void)
{
return register_qdisc(&red_qdisc_ops);
}
-static void __exit red_module_exit(void)
+
+static void __exit red_module_exit(void)
{
unregister_qdisc(&red_qdisc_ops);
}
+
module_init(red_module_init)
module_exit(red_module_exit)
+
MODULE_LICENSE("GPL");
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 12b0f582a66..dec68a60477 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -128,9 +128,29 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
*/
asoc->max_burst = sctp_max_burst;
- /* Copy things from the endpoint. */
+ /* initialize association timers */
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_NONE] = 0;
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = asoc->rto_initial;
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = asoc->rto_initial;
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = asoc->rto_initial;
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX] = 0;
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_T4_RTO] = 0;
+
+ /* sctpimpguide Section 2.12.2
+ * If the 'T5-shutdown-guard' timer is used, it SHOULD be set to the
+ * recommended value of 5 times 'RTO.Max'.
+ */
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD]
+ = 5 * asoc->rto_max;
+
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0;
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] =
+ SCTP_DEFAULT_TIMEOUT_SACK;
+ asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] =
+ sp->autoclose * HZ;
+
+ /* Initilizes the timers */
for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i) {
- asoc->timeouts[i] = ep->timeouts[i];
init_timer(&asoc->timers[i]);
asoc->timers[i].function = sctp_timer_events[i];
asoc->timers[i].data = (unsigned long) asoc;
@@ -157,10 +177,10 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
* RFC 6 - A SCTP receiver MUST be able to receive a minimum of
* 1500 bytes in one SCTP packet.
*/
- if (sk->sk_rcvbuf < SCTP_DEFAULT_MINWINDOW)
+ if ((sk->sk_rcvbuf/2) < SCTP_DEFAULT_MINWINDOW)
asoc->rwnd = SCTP_DEFAULT_MINWINDOW;
else
- asoc->rwnd = sk->sk_rcvbuf;
+ asoc->rwnd = sk->sk_rcvbuf/2;
asoc->a_rwnd = asoc->rwnd;
@@ -172,6 +192,9 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
/* Set the sndbuf size for transmit. */
asoc->sndbuf_used = 0;
+ /* Initialize the receive memory counter */
+ atomic_set(&asoc->rmem_alloc, 0);
+
init_waitqueue_head(&asoc->wait);
asoc->c.my_vtag = sctp_generate_tag(ep);
@@ -344,9 +367,7 @@ void sctp_association_free(struct sctp_association *asoc)
}
/* Free peer's cached cookie. */
- if (asoc->peer.cookie) {
- kfree(asoc->peer.cookie);
- }
+ kfree(asoc->peer.cookie);
/* Release the transport structures. */
list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) {
@@ -382,6 +403,8 @@ static void sctp_association_destroy(struct sctp_association *asoc)
spin_unlock_bh(&sctp_assocs_id_lock);
}
+ BUG_TRAP(!atomic_read(&asoc->rmem_alloc));
+
if (asoc->base.malloced) {
kfree(asoc);
SCTP_DBG_OBJCNT_DEC(assoc);
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index 96984f7a2d6..67bd53070ee 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -70,7 +70,6 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
struct sock *sk,
gfp_t gfp)
{
- struct sctp_sock *sp = sctp_sk(sk);
memset(ep, 0, sizeof(struct sctp_endpoint));
/* Initialize the base structure. */
@@ -100,33 +99,14 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
/* Create the lists of associations. */
INIT_LIST_HEAD(&ep->asocs);
- /* Set up the base timeout information. */
- ep->timeouts[SCTP_EVENT_TIMEOUT_NONE] = 0;
- ep->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] =
- msecs_to_jiffies(sp->rtoinfo.srto_initial);
- ep->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] =
- msecs_to_jiffies(sp->rtoinfo.srto_initial);
- ep->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] =
- msecs_to_jiffies(sp->rtoinfo.srto_initial);
- ep->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX] = 0;
- ep->timeouts[SCTP_EVENT_TIMEOUT_T4_RTO] = 0;
-
- /* sctpimpguide-05 Section 2.12.2
- * If the 'T5-shutdown-guard' timer is used, it SHOULD be set to the
- * recommended value of 5 times 'RTO.Max'.
- */
- ep->timeouts[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD]
- = 5 * msecs_to_jiffies(sp->rtoinfo.srto_max);
-
- ep->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0;
- ep->timeouts[SCTP_EVENT_TIMEOUT_SACK] = sctp_sack_timeout;
- ep->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sp->autoclose * HZ;
-
/* Use SCTP specific send buffer space queues. */
ep->sndbuf_policy = sctp_sndbuf_policy;
sk->sk_write_space = sctp_write_space;
sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
+ /* Get the receive buffer policy for this endpoint */
+ ep->rcvbuf_policy = sctp_rcvbuf_policy;
+
/* Initialize the secret key used with cookie. */
get_random_bytes(&ep->secret_key[0], SCTP_SECRET_SIZE);
ep->last_key = ep->current_key = 0;
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 28f32243397..b24ff2c1aef 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -100,21 +100,6 @@ static inline int sctp_rcv_checksum(struct sk_buff *skb)
return 0;
}
-/* The free routine for skbuffs that sctp receives */
-static void sctp_rfree(struct sk_buff *skb)
-{
- atomic_sub(sizeof(struct sctp_chunk),&skb->sk->sk_rmem_alloc);
- sock_rfree(skb);
-}
-
-/* The ownership wrapper routine to do receive buffer accounting */
-static void sctp_rcv_set_owner_r(struct sk_buff *skb, struct sock *sk)
-{
- skb_set_owner_r(skb,sk);
- skb->destructor = sctp_rfree;
- atomic_add(sizeof(struct sctp_chunk),&sk->sk_rmem_alloc);
-}
-
struct sctp_input_cb {
union {
struct inet_skb_parm h4;
@@ -217,9 +202,6 @@ int sctp_rcv(struct sk_buff *skb)
rcvr = &ep->base;
}
- if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
- goto discard_release;
-
/*
* RFC 2960, 8.4 - Handle "Out of the blue" Packets.
* An SCTP packet is called an "out of the blue" (OOTB)
@@ -256,8 +238,6 @@ int sctp_rcv(struct sk_buff *skb)
}
SCTP_INPUT_CB(skb)->chunk = chunk;
- sctp_rcv_set_owner_r(skb,sk);
-
/* Remember what endpoint is to handle this packet. */
chunk->rcvr = rcvr;
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 26de4d3e1bd..f775d78aa59 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -530,6 +530,9 @@ static void sctp_v4_get_saddr(struct sctp_association *asoc,
{
struct rtable *rt = (struct rtable *)dst;
+ if (!asoc)
+ return;
+
if (rt) {
saddr->v4.sin_family = AF_INET;
saddr->v4.sin_port = asoc->base.bind_addr.port;
@@ -1047,6 +1050,9 @@ SCTP_STATIC __init int sctp_init(void)
/* Sendbuffer growth - do per-socket accounting */
sctp_sndbuf_policy = 0;
+ /* Rcvbuffer growth - do per-socket accounting */
+ sctp_rcvbuf_policy = 0;
+
/* HB.interval - 30 seconds */
sctp_hb_interval = SCTP_DEFAULT_TIMEOUT_HEARTBEAT;
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 660c61bdf16..f9573eba5c7 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -254,8 +254,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
aiparam.adaption_ind = htonl(sp->adaption_ind);
sctp_addto_chunk(retval, sizeof(aiparam), &aiparam);
nodata:
- if (addrs.v)
- kfree(addrs.v);
+ kfree(addrs.v);
return retval;
}
@@ -347,8 +346,7 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
nomem_chunk:
kfree(cookie);
nomem_cookie:
- if (addrs.v)
- kfree(addrs.v);
+ kfree(addrs.v);
return retval;
}
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index f84173ea8ec..823947170a3 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -385,7 +385,7 @@ sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = {
NULL,
sctp_generate_t4_rto_event,
sctp_generate_t5_shutdown_guard_event,
- sctp_generate_heartbeat_event,
+ NULL,
sctp_generate_sack_event,
sctp_generate_autoclose_event,
};
@@ -689,9 +689,9 @@ static void sctp_cmd_new_state(sctp_cmd_seq_t *cmds,
* increased due to timer expirations.
*/
asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] =
- asoc->ep->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT];
+ asoc->rto_initial;
asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] =
- asoc->ep->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE];
+ asoc->rto_initial;
}
if (sctp_state(asoc, ESTABLISHED) ||
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 505c7de10c5..475bfb4972d 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -5160,6 +5160,8 @@ static int sctp_eat_data(const struct sctp_association *asoc,
sctp_verb_t deliver;
int tmp;
__u32 tsn;
+ int account_value;
+ struct sock *sk = asoc->base.sk;
data_hdr = chunk->subh.data_hdr = (sctp_datahdr_t *)chunk->skb->data;
skb_pull(chunk->skb, sizeof(sctp_datahdr_t));
@@ -5169,6 +5171,26 @@ static int sctp_eat_data(const struct sctp_association *asoc,
/* ASSERT: Now skb->data is really the user data. */
+ /*
+ * if we are established, and we have used up our receive
+ * buffer memory, drop the frame
+ */
+ if (asoc->state == SCTP_STATE_ESTABLISHED) {
+ /*
+ * If the receive buffer policy is 1, then each
+ * association can allocate up to sk_rcvbuf bytes
+ * otherwise, all the associations in aggregate
+ * may allocate up to sk_rcvbuf bytes
+ */
+ if (asoc->ep->rcvbuf_policy)
+ account_value = atomic_read(&asoc->rmem_alloc);
+ else
+ account_value = atomic_read(&sk->sk_rmem_alloc);
+
+ if (account_value > sk->sk_rcvbuf)
+ return SCTP_IERROR_IGNORE_TSN;
+ }
+
/* Process ECN based congestion.
*
* Since the chunk structure is reused for all chunks within
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index b529af5e6f2..abab81f3818 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1932,7 +1932,6 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval,
if (copy_from_user(&sp->autoclose, optval, optlen))
return -EFAULT;
- sp->ep->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sp->autoclose * HZ;
return 0;
}
@@ -5115,8 +5114,10 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
sctp_skb_for_each(skb, &oldsk->sk_receive_queue, tmp) {
event = sctp_skb2event(skb);
if (event->asoc == assoc) {
+ sock_rfree(skb);
__skb_unlink(skb, &oldsk->sk_receive_queue);
__skb_queue_tail(&newsk->sk_receive_queue, skb);
+ skb_set_owner_r(skb, newsk);
}
}
@@ -5144,8 +5145,10 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
sctp_skb_for_each(skb, &oldsp->pd_lobby, tmp) {
event = sctp_skb2event(skb);
if (event->asoc == assoc) {
+ sock_rfree(skb);
__skb_unlink(skb, &oldsp->pd_lobby);
__skb_queue_tail(queue, skb);
+ skb_set_owner_r(skb, newsk);
}
}
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index 75b28dd634f..fcd7096c953 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -121,6 +121,14 @@ static ctl_table sctp_table[] = {
.proc_handler = &proc_dointvec
},
{
+ .ctl_name = NET_SCTP_RCVBUF_POLICY,
+ .procname = "rcvbuf_policy",
+ .data = &sctp_rcvbuf_policy,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
.ctl_name = NET_SCTP_PATH_MAX_RETRANS,
.procname = "path_max_retrans",
.data = &sctp_max_retrans_path,
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index e049f41faa4..ba97f974f57 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -52,19 +52,6 @@ static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event,
struct sctp_association *asoc);
static void sctp_ulpevent_release_data(struct sctp_ulpevent *event);
-/* Stub skb destructor. */
-static void sctp_stub_rfree(struct sk_buff *skb)
-{
-/* WARNING: This function is just a warning not to use the
- * skb destructor. If the skb is shared, we may get the destructor
- * callback on some processor that does not own the sock_lock. This
- * was occuring with PACKET socket applications that were monitoring
- * our skbs. We can't take the sock_lock, because we can't risk
- * recursing if we do really own the sock lock. Instead, do all
- * of our rwnd manipulation while we own the sock_lock outright.
- */
-}
-
/* Initialize an ULP event from an given skb. */
SCTP_STATIC void sctp_ulpevent_init(struct sctp_ulpevent *event, int msg_flags)
{
@@ -111,15 +98,19 @@ static inline void sctp_ulpevent_set_owner(struct sctp_ulpevent *event,
*/
sctp_association_hold((struct sctp_association *)asoc);
skb = sctp_event2skb(event);
- skb->sk = asoc->base.sk;
event->asoc = (struct sctp_association *)asoc;
- skb->destructor = sctp_stub_rfree;
+ atomic_add(skb->truesize, &event->asoc->rmem_alloc);
+ skb_set_owner_r(skb, asoc->base.sk);
}
/* A simple destructor to give up the reference to the association. */
static inline void sctp_ulpevent_release_owner(struct sctp_ulpevent *event)
{
- sctp_association_put(event->asoc);
+ struct sctp_association *asoc = event->asoc;
+ struct sk_buff *skb = sctp_event2skb(event);
+
+ atomic_sub(skb->truesize, &asoc->rmem_alloc);
+ sctp_association_put(asoc);
}
/* Create and initialize an SCTP_ASSOC_CHANGE event.
@@ -922,7 +913,6 @@ done:
/* Free a ulpevent that has an owner. It includes releasing the reference
* to the owner, updating the rwnd in case of a DATA event and freeing the
* skb.
- * See comments in sctp_stub_rfree().
*/
void sctp_ulpevent_free(struct sctp_ulpevent *event)
{
diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c
index 13f8ae97945..d0dfdfd5e79 100644
--- a/net/sunrpc/auth_gss/gss_krb5_seal.c
+++ b/net/sunrpc/auth_gss/gss_krb5_seal.c
@@ -143,6 +143,6 @@ gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text,
return ((ctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE);
out_err:
- if (md5cksum.data) kfree(md5cksum.data);
+ kfree(md5cksum.data);
return GSS_S_FAILURE;
}
diff --git a/net/sunrpc/auth_gss/gss_krb5_unseal.c b/net/sunrpc/auth_gss/gss_krb5_unseal.c
index 2030475d98e..db055fd7d77 100644
--- a/net/sunrpc/auth_gss/gss_krb5_unseal.c
+++ b/net/sunrpc/auth_gss/gss_krb5_unseal.c
@@ -176,6 +176,6 @@ gss_verify_mic_kerberos(struct gss_ctx *gss_ctx,
ret = GSS_S_COMPLETE;
out:
- if (md5cksum.data) kfree(md5cksum.data);
+ kfree(md5cksum.data);
return ret;
}
diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c
index b048bf672da..f8bac6ccd52 100644
--- a/net/sunrpc/auth_gss/gss_mech_switch.c
+++ b/net/sunrpc/auth_gss/gss_mech_switch.c
@@ -60,8 +60,7 @@ gss_mech_free(struct gss_api_mech *gm)
for (i = 0; i < gm->gm_pf_num; i++) {
pf = &gm->gm_pfs[i];
- if (pf->auth_domain_name)
- kfree(pf->auth_domain_name);
+ kfree(pf->auth_domain_name);
pf->auth_domain_name = NULL;
}
}
diff --git a/net/sunrpc/auth_gss/gss_spkm3_seal.c b/net/sunrpc/auth_gss/gss_spkm3_seal.c
index 148201e929d..d1e12b25d6e 100644
--- a/net/sunrpc/auth_gss/gss_spkm3_seal.c
+++ b/net/sunrpc/auth_gss/gss_spkm3_seal.c
@@ -122,8 +122,7 @@ spkm3_make_token(struct spkm3_ctx *ctx,
return GSS_S_COMPLETE;
out_err:
- if (md5cksum.data)
- kfree(md5cksum.data);
+ kfree(md5cksum.data);
token->data = NULL;
token->len = 0;
return GSS_S_FAILURE;
diff --git a/net/sunrpc/auth_gss/gss_spkm3_token.c b/net/sunrpc/auth_gss/gss_spkm3_token.c
index 46c08a0710f..1f824578d77 100644
--- a/net/sunrpc/auth_gss/gss_spkm3_token.c
+++ b/net/sunrpc/auth_gss/gss_spkm3_token.c
@@ -259,8 +259,7 @@ spkm3_verify_mic_token(unsigned char **tokp, int *mic_hdrlen, unsigned char **ck
ret = GSS_S_COMPLETE;
out:
- if (spkm3_ctx_id.data)
- kfree(spkm3_ctx_id.data);
+ kfree(spkm3_ctx_id.data);
return ret;
}
diff --git a/net/sunrpc/auth_gss/gss_spkm3_unseal.c b/net/sunrpc/auth_gss/gss_spkm3_unseal.c
index c3c0d958610..241d5b30dfc 100644
--- a/net/sunrpc/auth_gss/gss_spkm3_unseal.c
+++ b/net/sunrpc/auth_gss/gss_spkm3_unseal.c
@@ -120,9 +120,7 @@ spkm3_read_token(struct spkm3_ctx *ctx,
/* XXX: need to add expiration and sequencing */
ret = GSS_S_COMPLETE;
out:
- if (md5cksum.data)
- kfree(md5cksum.data);
- if (wire_cksum.data)
- kfree(wire_cksum.data);
+ kfree(md5cksum.data);
+ kfree(wire_cksum.data);
return ret;
}
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 702ede309b0..61c3abeacca 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -55,6 +55,7 @@ static void call_bind(struct rpc_task *task);
static void call_bind_status(struct rpc_task *task);
static void call_transmit(struct rpc_task *task);
static void call_status(struct rpc_task *task);
+static void call_transmit_status(struct rpc_task *task);
static void call_refresh(struct rpc_task *task);
static void call_refreshresult(struct rpc_task *task);
static void call_timeout(struct rpc_task *task);
@@ -672,6 +673,18 @@ call_allocate(struct rpc_task *task)
rpc_exit(task, -ERESTARTSYS);
}
+static inline int
+rpc_task_need_encode(struct rpc_task *task)
+{
+ return task->tk_rqstp->rq_snd_buf.len == 0;
+}
+
+static inline void
+rpc_task_force_reencode(struct rpc_task *task)
+{
+ task->tk_rqstp->rq_snd_buf.len = 0;
+}
+
/*
* 3. Encode arguments of an RPC call
*/
@@ -867,12 +880,14 @@ call_transmit(struct rpc_task *task)
if (task->tk_status != 0)
return;
/* Encode here so that rpcsec_gss can use correct sequence number. */
- if (task->tk_rqstp->rq_bytes_sent == 0) {
+ if (rpc_task_need_encode(task)) {
+ task->tk_rqstp->rq_bytes_sent = 0;
call_encode(task);
/* Did the encode result in an error condition? */
if (task->tk_status != 0)
goto out_nosend;
}
+ task->tk_action = call_transmit_status;
xprt_transmit(task);
if (task->tk_status < 0)
return;
@@ -884,6 +899,7 @@ call_transmit(struct rpc_task *task)
out_nosend:
/* release socket write lock before attempting to handle error */
xprt_abort_transmit(task);
+ rpc_task_force_reencode(task);
}
/*
@@ -915,7 +931,6 @@ call_status(struct rpc_task *task)
break;
case -ECONNREFUSED:
case -ENOTCONN:
- req->rq_bytes_sent = 0;
if (clnt->cl_autobind)
clnt->cl_port = 0;
task->tk_action = call_bind;
@@ -937,7 +952,18 @@ call_status(struct rpc_task *task)
}
/*
- * 6a. Handle RPC timeout
+ * 6a. Handle transmission errors.
+ */
+static void
+call_transmit_status(struct rpc_task *task)
+{
+ if (task->tk_status != -EAGAIN)
+ rpc_task_force_reencode(task);
+ call_status(task);
+}
+
+/*
+ * 6b. Handle RPC timeout
* We do not release the request slot, so we keep using the
* same XID for all retransmits.
*/
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 4f188d0a5d1..81e00a6c19d 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -603,7 +603,7 @@ rpc_lookup_negative(char *path, struct nameidata *nd)
return ERR_PTR(error);
dir = nd->dentry->d_inode;
down(&dir->i_sem);
- dentry = lookup_hash(&nd->last, nd->dentry);
+ dentry = lookup_hash(nd);
if (IS_ERR(dentry))
goto out_err;
if (dentry->d_inode) {
@@ -665,7 +665,7 @@ rpc_rmdir(char *path)
return error;
dir = nd.dentry->d_inode;
down(&dir->i_sem);
- dentry = lookup_hash(&nd.last, nd.dentry);
+ dentry = lookup_hash(&nd);
if (IS_ERR(dentry)) {
error = PTR_ERR(dentry);
goto out_release;
@@ -726,7 +726,7 @@ rpc_unlink(char *path)
return error;
dir = nd.dentry->d_inode;
down(&dir->i_sem);
- dentry = lookup_hash(&nd.last, nd.dentry);
+ dentry = lookup_hash(&nd);
if (IS_ERR(dentry)) {
error = PTR_ERR(dentry);
goto out_release;
diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c
index 8f97e90f36c..eb330d4f66d 100644
--- a/net/sunrpc/socklib.c
+++ b/net/sunrpc/socklib.c
@@ -6,6 +6,9 @@
* Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
*/
+#include <linux/compiler.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
#include <linux/types.h>
#include <linux/pagemap.h>
#include <linux/udp.h>
@@ -165,6 +168,8 @@ int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
return -1;
if ((unsigned short)csum_fold(desc.csum))
return -1;
+ if (unlikely(skb->ip_summed == CHECKSUM_HW))
+ netdev_rx_csum_fault(skb->dev);
return 0;
no_checksum:
if (xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_bits) < 0)
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index e9bd91265f7..e4296c8b861 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -196,12 +196,9 @@ svc_exit_thread(struct svc_rqst *rqstp)
struct svc_serv *serv = rqstp->rq_server;
svc_release_buffer(rqstp);
- if (rqstp->rq_resp)
- kfree(rqstp->rq_resp);
- if (rqstp->rq_argp)
- kfree(rqstp->rq_argp);
- if (rqstp->rq_auth_data)
- kfree(rqstp->rq_auth_data);
+ kfree(rqstp->rq_resp);
+ kfree(rqstp->rq_argp);
+ kfree(rqstp->rq_auth_data);
kfree(rqstp);
/* Release the server */
@@ -313,6 +310,11 @@ svc_process(struct svc_serv *serv, struct svc_rqst *rqstp)
rqstp->rq_proc = proc = ntohl(svc_getu32(argv)); /* procedure number */
progp = serv->sv_program;
+
+ for (progp = serv->sv_program; progp; progp = progp->pg_next)
+ if (prog == progp->pg_prog)
+ break;
+
/*
* Decode auth data, and add verifier to reply buffer.
* We do this before anything else in order to get a decent
@@ -320,7 +322,7 @@ svc_process(struct svc_serv *serv, struct svc_rqst *rqstp)
*/
auth_res = svc_authenticate(rqstp, &auth_stat);
/* Also give the program a chance to reject this call: */
- if (auth_res == SVC_OK) {
+ if (auth_res == SVC_OK && progp) {
auth_stat = rpc_autherr_badcred;
auth_res = progp->pg_authenticate(rqstp);
}
@@ -340,10 +342,7 @@ svc_process(struct svc_serv *serv, struct svc_rqst *rqstp)
case SVC_COMPLETE:
goto sendit;
}
-
- for (progp = serv->sv_program; progp; progp = progp->pg_next)
- if (prog == progp->pg_prog)
- break;
+
if (progp == NULL)
goto err_bad_prog;
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index f16e7cdd615..c6a51911e71 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -623,12 +623,9 @@ svc_udp_recvfrom(struct svc_rqst *rqstp)
/* we can use it in-place */
rqstp->rq_arg.head[0].iov_base = skb->data + sizeof(struct udphdr);
rqstp->rq_arg.head[0].iov_len = len;
- if (skb->ip_summed != CHECKSUM_UNNECESSARY) {
- if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) {
- skb_free_datagram(svsk->sk_sk, skb);
- return 0;
- }
- skb->ip_summed = CHECKSUM_UNNECESSARY;
+ if (skb_checksum_complete(skb)) {
+ skb_free_datagram(svsk->sk_sk, skb);
+ return 0;
}
rqstp->rq_skbuff = skb;
}
@@ -1181,6 +1178,7 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout)
arg->tail[0].iov_len = 0;
try_to_freeze();
+ cond_resched();
if (signalled())
return -EINTR;
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 32df43372ee..aaf08cdd19f 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -992,8 +992,7 @@ xdr_xcode_array2(struct xdr_buf *buf, unsigned int base,
err = 0;
out:
- if (elem)
- kfree(elem);
+ kfree(elem);
if (ppages)
kunmap(*ppages);
return err;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 41feca3bef8..acc73ba8bad 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -676,7 +676,7 @@ static struct sock *unix_find_other(struct sockaddr_un *sunname, int len,
err = path_lookup(sunname->sun_path, LOOKUP_FOLLOW, &nd);
if (err)
goto fail;
- err = permission(nd.dentry->d_inode,MAY_WRITE, &nd);
+ err = vfs_permission(&nd, MAY_WRITE);
if (err)
goto put_fail;
diff --git a/net/wanrouter/af_wanpipe.c b/net/wanrouter/af_wanpipe.c
index 596cb96e5f4..59fec59b213 100644
--- a/net/wanrouter/af_wanpipe.c
+++ b/net/wanrouter/af_wanpipe.c
@@ -1099,7 +1099,7 @@ static void release_driver(struct sock *sk)
sock_reset_flag(sk, SOCK_ZAPPED);
wp = wp_sk(sk);
- if (wp && wp->mbox) {
+ if (wp) {
kfree(wp->mbox);
wp->mbox = NULL;
}
@@ -1186,10 +1186,8 @@ static void wanpipe_kill_sock_timer (unsigned long data)
return;
}
- if (wp_sk(sk)) {
- kfree(wp_sk(sk));
- wp_sk(sk) = NULL;
- }
+ kfree(wp_sk(sk));
+ wp_sk(sk) = NULL;
if (atomic_read(&sk->sk_refcnt) != 1) {
atomic_set(&sk->sk_refcnt, 1);
@@ -1219,10 +1217,8 @@ static void wanpipe_kill_sock_accept (struct sock *sk)
sk->sk_socket = NULL;
- if (wp_sk(sk)) {
- kfree(wp_sk(sk));
- wp_sk(sk) = NULL;
- }
+ kfree(wp_sk(sk));
+ wp_sk(sk) = NULL;
if (atomic_read(&sk->sk_refcnt) != 1) {
atomic_set(&sk->sk_refcnt, 1);
@@ -1243,10 +1239,8 @@ static void wanpipe_kill_sock_irq (struct sock *sk)
sk->sk_socket = NULL;
- if (wp_sk(sk)) {
- kfree(wp_sk(sk));
- wp_sk(sk) = NULL;
- }
+ kfree(wp_sk(sk));
+ wp_sk(sk) = NULL;
if (atomic_read(&sk->sk_refcnt) != 1) {
atomic_set(&sk->sk_refcnt, 1);
diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c
index 13b650ad22e..bcf7b3faa76 100644
--- a/net/wanrouter/wanmain.c
+++ b/net/wanrouter/wanmain.c
@@ -714,10 +714,8 @@ static int wanrouter_device_new_if(struct wan_device *wandev,
}
/* This code has moved from del_if() function */
- if (dev->priv) {
- kfree(dev->priv);
- dev->priv = NULL;
- }
+ kfree(dev->priv);
+ dev->priv = NULL;
#ifdef CONFIG_WANPIPE_MULTPPP
if (cnf->config_id == WANCONFIG_MPPP)
@@ -851,10 +849,8 @@ static int wanrouter_delete_interface(struct wan_device *wandev, char *name)
/* Due to new interface linking method using dev->priv,
* this code has moved from del_if() function.*/
- if (dev->priv){
- kfree(dev->priv);
- dev->priv=NULL;
- }
+ kfree(dev->priv);
+ dev->priv=NULL;
unregister_netdev(dev);
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 8b9a4747417..7cf48aa6c95 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -62,14 +62,10 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x)
{
if (del_timer(&x->timer))
BUG();
- if (x->aalg)
- kfree(x->aalg);
- if (x->ealg)
- kfree(x->ealg);
- if (x->calg)
- kfree(x->calg);
- if (x->encap)
- kfree(x->encap);
+ kfree(x->aalg);
+ kfree(x->ealg);
+ kfree(x->calg);
+ kfree(x->encap);
if (x->type) {
x->type->destructor(x);
xfrm_put_type(x->type);
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index c35336a0f71..0cdd9a07e04 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -18,7 +18,6 @@
#include <linux/string.h>
#include <linux/net.h>
#include <linux/skbuff.h>
-#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/pfkeyv2.h>
#include <linux/ipsec.h>
@@ -26,6 +25,7 @@
#include <linux/security.h>
#include <net/sock.h>
#include <net/xfrm.h>
+#include <net/netlink.h>
#include <asm/uaccess.h>
static struct sock *xfrm_nl;
@@ -948,11 +948,6 @@ static struct xfrm_link {
[XFRM_MSG_FLUSHPOLICY - XFRM_MSG_BASE] = { .doit = xfrm_flush_policy },
};
-static int xfrm_done(struct netlink_callback *cb)
-{
- return 0;
-}
-
static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp)
{
struct rtattr *xfrma[XFRMA_MAX];
@@ -984,20 +979,15 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *err
if ((type == (XFRM_MSG_GETSA - XFRM_MSG_BASE) ||
type == (XFRM_MSG_GETPOLICY - XFRM_MSG_BASE)) &&
(nlh->nlmsg_flags & NLM_F_DUMP)) {
- u32 rlen;
-
if (link->dump == NULL)
goto err_einval;
if ((*errp = netlink_dump_start(xfrm_nl, skb, nlh,
- link->dump,
- xfrm_done)) != 0) {
+ link->dump, NULL)) != 0) {
return -1;
}
- rlen = NLMSG_ALIGN(nlh->nlmsg_len);
- if (rlen > skb->len)
- rlen = skb->len;
- skb_pull(skb, rlen);
+
+ netlink_queue_skip(nlh, skb);
return -1;
}
@@ -1032,60 +1022,13 @@ err_einval:
return -1;
}
-static int xfrm_user_rcv_skb(struct sk_buff *skb)
-{
- int err;
- struct nlmsghdr *nlh;
-
- while (skb->len >= NLMSG_SPACE(0)) {
- u32 rlen;
-
- nlh = (struct nlmsghdr *) skb->data;
- if (nlh->nlmsg_len < sizeof(*nlh) ||
- skb->len < nlh->nlmsg_len)
- return 0;
- rlen = NLMSG_ALIGN(nlh->nlmsg_len);
- if (rlen > skb->len)
- rlen = skb->len;
- if (xfrm_user_rcv_msg(skb, nlh, &err) < 0) {
- if (err == 0)
- return -1;
- netlink_ack(skb, nlh, err);
- } else if (nlh->nlmsg_flags & NLM_F_ACK)
- netlink_ack(skb, nlh, 0);
- skb_pull(skb, rlen);
- }
-
- return 0;
-}
-
static void xfrm_netlink_rcv(struct sock *sk, int len)
{
- unsigned int qlen = skb_queue_len(&sk->sk_receive_queue);
+ unsigned int qlen = 0;
do {
- struct sk_buff *skb;
-
down(&xfrm_cfg_sem);
-
- if (qlen > skb_queue_len(&sk->sk_receive_queue))
- qlen = skb_queue_len(&sk->sk_receive_queue);
-
- for (; qlen; qlen--) {
- skb = skb_dequeue(&sk->sk_receive_queue);
- if (xfrm_user_rcv_skb(skb)) {
- if (skb->len)
- skb_queue_head(&sk->sk_receive_queue,
- skb);
- else {
- kfree_skb(skb);
- qlen--;
- }
- break;
- }
- kfree_skb(skb);
- }
-
+ netlink_run_queue(sk, &qlen, &xfrm_user_rcv_msg);
up(&xfrm_cfg_sem);
} while (qlen);