diff options
Diffstat (limited to 'net')
173 files changed, 11320 insertions, 2575 deletions
diff --git a/net/802/p8023.c b/net/802/p8023.c index 6368d3dce44..d23e906456e 100644 --- a/net/802/p8023.c +++ b/net/802/p8023.c @@ -54,8 +54,7 @@ struct datalink_proto *make_8023_client(void) */ void destroy_8023_client(struct datalink_proto *dl) { - if (dl) - kfree(dl); + kfree(dl); } EXPORT_SYMBOL(destroy_8023_client); diff --git a/net/Makefile b/net/Makefile index 4aa2f46d2a5..f5141b9d4f3 100644 --- a/net/Makefile +++ b/net/Makefile @@ -15,8 +15,8 @@ obj-$(CONFIG_NET) += $(tmp-y) # LLC has to be linked before the files in net/802/ obj-$(CONFIG_LLC) += llc/ obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/ -obj-$(CONFIG_INET) += ipv4/ obj-$(CONFIG_NETFILTER) += netfilter/ +obj-$(CONFIG_INET) += ipv4/ obj-$(CONFIG_XFRM) += xfrm/ obj-$(CONFIG_UNIX) += unix/ ifneq ($(CONFIG_IPV6),) diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 8e37e71e34f..1b683f30265 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1138,10 +1138,8 @@ static int ax25_connect(struct socket *sock, struct sockaddr *uaddr, sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; - if (ax25->digipeat != NULL) { - kfree(ax25->digipeat); - ax25->digipeat = NULL; - } + kfree(ax25->digipeat); + ax25->digipeat = NULL; /* * Handle digi-peaters to be used. diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c index 73cfc3411c4..4cf87540fb3 100644 --- a/net/ax25/ax25_in.c +++ b/net/ax25/ax25_in.c @@ -401,10 +401,8 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev, } if (dp.ndigi == 0) { - if (ax25->digipeat != NULL) { - kfree(ax25->digipeat); - ax25->digipeat = NULL; - } + kfree(ax25->digipeat); + ax25->digipeat = NULL; } else { /* Reverse the source SABM's path */ memcpy(ax25->digipeat, &reverse_dp, sizeof(ax25_digi)); diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c index 26b77d97222..b1e945bd6ed 100644 --- a/net/ax25/ax25_route.c +++ b/net/ax25/ax25_route.c @@ -54,15 +54,13 @@ void ax25_rt_device_down(struct net_device *dev) if (s->dev == dev) { if (ax25_route_list == s) { ax25_route_list = s->next; - if (s->digipeat != NULL) - kfree(s->digipeat); + kfree(s->digipeat); kfree(s); } else { for (t = ax25_route_list; t != NULL; t = t->next) { if (t->next == s) { t->next = s->next; - if (s->digipeat != NULL) - kfree(s->digipeat); + kfree(s->digipeat); kfree(s); break; } @@ -90,10 +88,8 @@ static int ax25_rt_add(struct ax25_routes_struct *route) while (ax25_rt != NULL) { if (ax25cmp(&ax25_rt->callsign, &route->dest_addr) == 0 && ax25_rt->dev == ax25_dev->dev) { - if (ax25_rt->digipeat != NULL) { - kfree(ax25_rt->digipeat); - ax25_rt->digipeat = NULL; - } + kfree(ax25_rt->digipeat); + ax25_rt->digipeat = NULL; if (route->digi_count != 0) { if ((ax25_rt->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { write_unlock(&ax25_route_lock); @@ -145,8 +141,7 @@ static int ax25_rt_add(struct ax25_routes_struct *route) static void ax25_rt_destroy(ax25_route *ax25_rt) { if (atomic_read(&ax25_rt->ref) == 0) { - if (ax25_rt->digipeat != NULL) - kfree(ax25_rt->digipeat); + kfree(ax25_rt->digipeat); kfree(ax25_rt); return; } @@ -530,9 +525,7 @@ void __exit ax25_rt_free(void) s = ax25_rt; ax25_rt = ax25_rt->next; - if (s->digipeat != NULL) - kfree(s->digipeat); - + kfree(s->digipeat); kfree(s); } write_unlock(&ax25_route_lock); diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 03532062a46..ea616e3fc98 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -36,7 +36,6 @@ #include <linux/skbuff.h> #include <linux/init.h> #include <linux/poll.h> -#include <linux/proc_fs.h> #include <net/sock.h> #if defined(CONFIG_KMOD) @@ -50,10 +49,7 @@ #define BT_DBG(D...) #endif -#define VERSION "2.7" - -struct proc_dir_entry *proc_bt; -EXPORT_SYMBOL(proc_bt); +#define VERSION "2.8" /* Bluetooth sockets */ #define BT_MAX_PROTO 8 @@ -312,10 +308,6 @@ static int __init bt_init(void) { BT_INFO("Core ver %s", VERSION); - proc_bt = proc_mkdir("bluetooth", NULL); - if (proc_bt) - proc_bt->owner = THIS_MODULE; - sock_register(&bt_sock_family_ops); BT_INFO("HCI device and connection manager initialized"); @@ -334,8 +326,6 @@ static void __exit bt_exit(void) bt_sysfs_cleanup(); sock_unregister(PF_BLUETOOTH); - - remove_proc_entry("bluetooth", NULL); } subsys_initcall(bt_init); diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index cf0df1c8c93..9106354c781 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -183,7 +183,7 @@ static void hci_reset_req(struct hci_dev *hdev, unsigned long opt) static void hci_init_req(struct hci_dev *hdev, unsigned long opt) { struct sk_buff *skb; - __u16 param; + __le16 param; BT_DBG("%s %ld", hdev->name, opt); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index b61b4e8e36f..eb64555d1fb 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -242,7 +242,7 @@ static void hci_cc_host_ctl(struct hci_dev *hdev, __u16 ocf, struct sk_buff *skb break; status = *((__u8 *) skb->data); - setting = __le16_to_cpu(get_unaligned((__u16 *) sent)); + setting = __le16_to_cpu(get_unaligned((__le16 *) sent)); if (!status && hdev->voice_setting != setting) { hdev->voice_setting = setting; @@ -728,7 +728,7 @@ static inline void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff static inline void hci_num_comp_pkts_evt(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_ev_num_comp_pkts *ev = (struct hci_ev_num_comp_pkts *) skb->data; - __u16 *ptr; + __le16 *ptr; int i; skb_pull(skb, sizeof(*ev)); @@ -742,7 +742,7 @@ static inline void hci_num_comp_pkts_evt(struct hci_dev *hdev, struct sk_buff *s tasklet_disable(&hdev->tx_task); - for (i = 0, ptr = (__u16 *) skb->data; i < ev->num_hndl; i++) { + for (i = 0, ptr = (__le16 *) skb->data; i < ev->num_hndl; i++) { struct hci_conn *conn; __u16 handle, count; diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 799e448750a..1d6d0a15c09 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -416,7 +416,7 @@ static int hci_sock_sendmsg(struct kiocb *iocb, struct socket *sock, skb->dev = (void *) hdev; if (bt_cb(skb)->pkt_type == HCI_COMMAND_PKT) { - u16 opcode = __le16_to_cpu(get_unaligned((u16 *)skb->data)); + u16 opcode = __le16_to_cpu(get_unaligned((__le16 *) skb->data)); u16 ogf = hci_opcode_ogf(opcode); u16 ocf = hci_opcode_ocf(opcode); diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c index 7856bc26acc..bd7568ac87f 100644 --- a/net/bluetooth/hci_sysfs.c +++ b/net/bluetooth/hci_sysfs.c @@ -103,7 +103,7 @@ static void bt_release(struct class_device *cdev) kfree(hdev); } -static struct class bt_class = { +struct class bt_class = { .name = "bluetooth", .release = bt_release, #ifdef CONFIG_HOTPLUG @@ -111,6 +111,8 @@ static struct class bt_class = { #endif }; +EXPORT_SYMBOL_GPL(bt_class); + int hci_register_sysfs(struct hci_dev *hdev) { struct class_device *cdev = &hdev->class_dev; diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 860444a7fc0..cdb9cfafd96 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -660,9 +660,7 @@ unlink: failed: up_write(&hidp_session_sem); - if (session->input) - kfree(session->input); - + kfree(session->input); kfree(session); return err; } diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c index 59b2dd36baa..e3bb11ca423 100644 --- a/net/bluetooth/l2cap.c +++ b/net/bluetooth/l2cap.c @@ -38,9 +38,8 @@ #include <linux/interrupt.h> #include <linux/socket.h> #include <linux/skbuff.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> #include <linux/list.h> +#include <linux/device.h> #include <net/sock.h> #include <asm/system.h> @@ -56,7 +55,7 @@ #define BT_DBG(D...) #endif -#define VERSION "2.7" +#define VERSION "2.8" static struct proto_ops l2cap_sock_ops; @@ -2137,94 +2136,29 @@ drop: return 0; } -/* ---- Proc fs support ---- */ -#ifdef CONFIG_PROC_FS -static void *l2cap_seq_start(struct seq_file *seq, loff_t *pos) +static ssize_t l2cap_sysfs_show(struct class *dev, char *buf) { struct sock *sk; struct hlist_node *node; - loff_t l = *pos; + char *str = buf; read_lock_bh(&l2cap_sk_list.lock); - sk_for_each(sk, node, &l2cap_sk_list.head) - if (!l--) - goto found; - sk = NULL; -found: - return sk; -} + sk_for_each(sk, node, &l2cap_sk_list.head) { + struct l2cap_pinfo *pi = l2cap_pi(sk); -static void *l2cap_seq_next(struct seq_file *seq, void *e, loff_t *pos) -{ - (*pos)++; - return sk_next(e); -} + str += sprintf(str, "%s %s %d %d 0x%4.4x 0x%4.4x %d %d 0x%x\n", + batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst), + sk->sk_state, pi->psm, pi->scid, pi->dcid, pi->imtu, + pi->omtu, pi->link_mode); + } -static void l2cap_seq_stop(struct seq_file *seq, void *e) -{ read_unlock_bh(&l2cap_sk_list.lock); -} -static int l2cap_seq_show(struct seq_file *seq, void *e) -{ - struct sock *sk = e; - struct l2cap_pinfo *pi = l2cap_pi(sk); - - seq_printf(seq, "%s %s %d %d 0x%4.4x 0x%4.4x %d %d 0x%x\n", - batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst), - sk->sk_state, pi->psm, pi->scid, pi->dcid, pi->imtu, - pi->omtu, pi->link_mode); - return 0; + return (str - buf); } -static struct seq_operations l2cap_seq_ops = { - .start = l2cap_seq_start, - .next = l2cap_seq_next, - .stop = l2cap_seq_stop, - .show = l2cap_seq_show -}; - -static int l2cap_seq_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &l2cap_seq_ops); -} - -static struct file_operations l2cap_seq_fops = { - .owner = THIS_MODULE, - .open = l2cap_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int __init l2cap_proc_init(void) -{ - struct proc_dir_entry *p = create_proc_entry("l2cap", S_IRUGO, proc_bt); - if (!p) - return -ENOMEM; - p->owner = THIS_MODULE; - p->proc_fops = &l2cap_seq_fops; - return 0; -} - -static void __exit l2cap_proc_cleanup(void) -{ - remove_proc_entry("l2cap", proc_bt); -} - -#else /* CONFIG_PROC_FS */ - -static int __init l2cap_proc_init(void) -{ - return 0; -} - -static void __exit l2cap_proc_cleanup(void) -{ - return; -} -#endif /* CONFIG_PROC_FS */ +static CLASS_ATTR(l2cap, S_IRUGO, l2cap_sysfs_show, NULL); static struct proto_ops l2cap_sock_ops = { .family = PF_BLUETOOTH, @@ -2266,7 +2200,7 @@ static struct hci_proto l2cap_hci_proto = { static int __init l2cap_init(void) { int err; - + err = proto_register(&l2cap_proto, 0); if (err < 0) return err; @@ -2284,7 +2218,7 @@ static int __init l2cap_init(void) goto error; } - l2cap_proc_init(); + class_create_file(&bt_class, &class_attr_l2cap); BT_INFO("L2CAP ver %s", VERSION); BT_INFO("L2CAP socket layer initialized"); @@ -2298,7 +2232,7 @@ error: static void __exit l2cap_exit(void) { - l2cap_proc_cleanup(); + class_remove_file(&bt_class, &class_attr_l2cap); if (bt_sock_unregister(BTPROTO_L2CAP) < 0) BT_ERR("L2CAP socket unregistration failed"); diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index c3d56ead840..0d89d643413 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -35,9 +35,8 @@ #include <linux/signal.h> #include <linux/init.h> #include <linux/wait.h> +#include <linux/device.h> #include <linux/net.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> #include <net/sock.h> #include <asm/uaccess.h> #include <asm/unaligned.h> @@ -47,17 +46,13 @@ #include <net/bluetooth/l2cap.h> #include <net/bluetooth/rfcomm.h> -#define VERSION "1.5" +#define VERSION "1.6" #ifndef CONFIG_BT_RFCOMM_DEBUG #undef BT_DBG #define BT_DBG(D...) #endif -#ifdef CONFIG_PROC_FS -struct proc_dir_entry *proc_bt_rfcomm; -#endif - static struct task_struct *rfcomm_thread; static DECLARE_MUTEX(rfcomm_sem); @@ -2001,117 +1996,32 @@ static struct hci_cb rfcomm_cb = { .encrypt_cfm = rfcomm_encrypt_cfm }; -/* ---- Proc fs support ---- */ -#ifdef CONFIG_PROC_FS -static void *rfcomm_seq_start(struct seq_file *seq, loff_t *pos) +static ssize_t rfcomm_dlc_sysfs_show(struct class *dev, char *buf) { struct rfcomm_session *s; struct list_head *pp, *p; - loff_t l = *pos; + char *str = buf; rfcomm_lock(); list_for_each(p, &session_list) { s = list_entry(p, struct rfcomm_session, list); - list_for_each(pp, &s->dlcs) - if (!l--) { - seq->private = s; - return pp; - } - } - return NULL; -} + list_for_each(pp, &s->dlcs) { + struct sock *sk = s->sock->sk; + struct rfcomm_dlc *d = list_entry(pp, struct rfcomm_dlc, list); -static void *rfcomm_seq_next(struct seq_file *seq, void *e, loff_t *pos) -{ - struct rfcomm_session *s = seq->private; - struct list_head *pp, *p = e; - (*pos)++; - - if (p->next != &s->dlcs) - return p->next; - - list_for_each(p, &session_list) { - s = list_entry(p, struct rfcomm_session, list); - __list_for_each(pp, &s->dlcs) { - seq->private = s; - return pp; + str += sprintf(str, "%s %s %ld %d %d %d %d\n", + batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst), + d->state, d->dlci, d->mtu, d->rx_credits, d->tx_credits); } } - return NULL; -} -static void rfcomm_seq_stop(struct seq_file *seq, void *e) -{ rfcomm_unlock(); -} - -static int rfcomm_seq_show(struct seq_file *seq, void *e) -{ - struct rfcomm_session *s = seq->private; - struct sock *sk = s->sock->sk; - struct rfcomm_dlc *d = list_entry(e, struct rfcomm_dlc, list); - - seq_printf(seq, "%s %s %ld %d %d %d %d\n", - batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst), - d->state, d->dlci, d->mtu, d->rx_credits, d->tx_credits); - return 0; -} - -static struct seq_operations rfcomm_seq_ops = { - .start = rfcomm_seq_start, - .next = rfcomm_seq_next, - .stop = rfcomm_seq_stop, - .show = rfcomm_seq_show -}; - -static int rfcomm_seq_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &rfcomm_seq_ops); -} - -static struct file_operations rfcomm_seq_fops = { - .owner = THIS_MODULE, - .open = rfcomm_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int __init rfcomm_proc_init(void) -{ - struct proc_dir_entry *p; - - proc_bt_rfcomm = proc_mkdir("rfcomm", proc_bt); - if (proc_bt_rfcomm) { - proc_bt_rfcomm->owner = THIS_MODULE; - - p = create_proc_entry("dlc", S_IRUGO, proc_bt_rfcomm); - if (p) - p->proc_fops = &rfcomm_seq_fops; - } - return 0; -} - -static void __exit rfcomm_proc_cleanup(void) -{ - remove_proc_entry("dlc", proc_bt_rfcomm); - remove_proc_entry("rfcomm", proc_bt); + return (str - buf); } -#else /* CONFIG_PROC_FS */ - -static int __init rfcomm_proc_init(void) -{ - return 0; -} - -static void __exit rfcomm_proc_cleanup(void) -{ - return; -} -#endif /* CONFIG_PROC_FS */ +static CLASS_ATTR(rfcomm_dlc, S_IRUGO, rfcomm_dlc_sysfs_show, NULL); /* ---- Initialization ---- */ static int __init rfcomm_init(void) @@ -2122,9 +2032,7 @@ static int __init rfcomm_init(void) kernel_thread(rfcomm_run, NULL, CLONE_KERNEL); - BT_INFO("RFCOMM ver %s", VERSION); - - rfcomm_proc_init(); + class_create_file(&bt_class, &class_attr_rfcomm_dlc); rfcomm_init_sockets(); @@ -2132,11 +2040,15 @@ static int __init rfcomm_init(void) rfcomm_init_ttys(); #endif + BT_INFO("RFCOMM ver %s", VERSION); + return 0; } static void __exit rfcomm_exit(void) { + class_remove_file(&bt_class, &class_attr_rfcomm_dlc); + hci_unregister_cb(&rfcomm_cb); /* Terminate working thread. @@ -2153,8 +2065,6 @@ static void __exit rfcomm_exit(void) #endif rfcomm_cleanup_sockets(); - - rfcomm_proc_cleanup(); } module_init(rfcomm_init); diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index a2b30f0aedb..6c34261b232 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -42,8 +42,7 @@ #include <linux/socket.h> #include <linux/skbuff.h> #include <linux/list.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> +#include <linux/device.h> #include <net/sock.h> #include <asm/system.h> @@ -887,89 +886,26 @@ done: return result; } -/* ---- Proc fs support ---- */ -#ifdef CONFIG_PROC_FS -static void *rfcomm_seq_start(struct seq_file *seq, loff_t *pos) +static ssize_t rfcomm_sock_sysfs_show(struct class *dev, char *buf) { struct sock *sk; struct hlist_node *node; - loff_t l = *pos; + char *str = buf; read_lock_bh(&rfcomm_sk_list.lock); - sk_for_each(sk, node, &rfcomm_sk_list.head) - if (!l--) - return sk; - return NULL; -} - -static void *rfcomm_seq_next(struct seq_file *seq, void *e, loff_t *pos) -{ - struct sock *sk = e; - (*pos)++; - return sk_next(sk); -} + sk_for_each(sk, node, &rfcomm_sk_list.head) { + str += sprintf(str, "%s %s %d %d\n", + batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst), + sk->sk_state, rfcomm_pi(sk)->channel); + } -static void rfcomm_seq_stop(struct seq_file *seq, void *e) -{ read_unlock_bh(&rfcomm_sk_list.lock); -} -static int rfcomm_seq_show(struct seq_file *seq, void *e) -{ - struct sock *sk = e; - seq_printf(seq, "%s %s %d %d\n", - batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst), - sk->sk_state, rfcomm_pi(sk)->channel); - return 0; -} - -static struct seq_operations rfcomm_seq_ops = { - .start = rfcomm_seq_start, - .next = rfcomm_seq_next, - .stop = rfcomm_seq_stop, - .show = rfcomm_seq_show -}; - -static int rfcomm_seq_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &rfcomm_seq_ops); + return (str - buf); } -static struct file_operations rfcomm_seq_fops = { - .owner = THIS_MODULE, - .open = rfcomm_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int __init rfcomm_sock_proc_init(void) -{ - struct proc_dir_entry *p = create_proc_entry("sock", S_IRUGO, proc_bt_rfcomm); - if (!p) - return -ENOMEM; - p->proc_fops = &rfcomm_seq_fops; - return 0; -} - -static void __exit rfcomm_sock_proc_cleanup(void) -{ - remove_proc_entry("sock", proc_bt_rfcomm); -} - -#else /* CONFIG_PROC_FS */ - -static int __init rfcomm_sock_proc_init(void) -{ - return 0; -} - -static void __exit rfcomm_sock_proc_cleanup(void) -{ - return; -} -#endif /* CONFIG_PROC_FS */ +static CLASS_ATTR(rfcomm, S_IRUGO, rfcomm_sock_sysfs_show, NULL); static struct proto_ops rfcomm_sock_ops = { .family = PF_BLUETOOTH, @@ -997,7 +933,7 @@ static struct net_proto_family rfcomm_sock_family_ops = { .create = rfcomm_sock_create }; -int __init rfcomm_init_sockets(void) +int __init rfcomm_init_sockets(void) { int err; @@ -1009,7 +945,7 @@ int __init rfcomm_init_sockets(void) if (err < 0) goto error; - rfcomm_sock_proc_init(); + class_create_file(&bt_class, &class_attr_rfcomm); BT_INFO("RFCOMM socket layer initialized"); @@ -1023,7 +959,7 @@ error: void __exit rfcomm_cleanup_sockets(void) { - rfcomm_sock_proc_cleanup(); + class_remove_file(&bt_class, &class_attr_rfcomm); if (bt_sock_unregister(BTPROTO_RFCOMM) < 0) BT_ERR("RFCOMM socket layer unregistration failed"); diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 997e42df115..9cb00dc6c08 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -38,8 +38,7 @@ #include <linux/interrupt.h> #include <linux/socket.h> #include <linux/skbuff.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> +#include <linux/device.h> #include <linux/list.h> #include <net/sock.h> @@ -55,7 +54,7 @@ #define BT_DBG(D...) #endif -#define VERSION "0.4" +#define VERSION "0.5" static struct proto_ops sco_sock_ops; @@ -893,91 +892,26 @@ drop: return 0; } -/* ---- Proc fs support ---- */ -#ifdef CONFIG_PROC_FS -static void *sco_seq_start(struct seq_file *seq, loff_t *pos) +static ssize_t sco_sysfs_show(struct class *dev, char *buf) { struct sock *sk; struct hlist_node *node; - loff_t l = *pos; + char *str = buf; read_lock_bh(&sco_sk_list.lock); - sk_for_each(sk, node, &sco_sk_list.head) - if (!l--) - goto found; - sk = NULL; -found: - return sk; -} - -static void *sco_seq_next(struct seq_file *seq, void *e, loff_t *pos) -{ - struct sock *sk = e; - (*pos)++; - return sk_next(sk); -} + sk_for_each(sk, node, &sco_sk_list.head) { + str += sprintf(str, "%s %s %d\n", + batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst), + sk->sk_state); + } -static void sco_seq_stop(struct seq_file *seq, void *e) -{ read_unlock_bh(&sco_sk_list.lock); -} - -static int sco_seq_show(struct seq_file *seq, void *e) -{ - struct sock *sk = e; - seq_printf(seq, "%s %s %d\n", - batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst), sk->sk_state); - return 0; -} -static struct seq_operations sco_seq_ops = { - .start = sco_seq_start, - .next = sco_seq_next, - .stop = sco_seq_stop, - .show = sco_seq_show -}; - -static int sco_seq_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &sco_seq_ops); + return (str - buf); } -static struct file_operations sco_seq_fops = { - .owner = THIS_MODULE, - .open = sco_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int __init sco_proc_init(void) -{ - struct proc_dir_entry *p = create_proc_entry("sco", S_IRUGO, proc_bt); - if (!p) - return -ENOMEM; - p->owner = THIS_MODULE; - p->proc_fops = &sco_seq_fops; - return 0; -} - -static void __exit sco_proc_cleanup(void) -{ - remove_proc_entry("sco", proc_bt); -} - -#else /* CONFIG_PROC_FS */ - -static int __init sco_proc_init(void) -{ - return 0; -} - -static void __exit sco_proc_cleanup(void) -{ - return; -} -#endif /* CONFIG_PROC_FS */ +static CLASS_ATTR(sco, S_IRUGO, sco_sysfs_show, NULL); static struct proto_ops sco_sock_ops = { .family = PF_BLUETOOTH, @@ -1035,7 +969,7 @@ static int __init sco_init(void) goto error; } - sco_proc_init(); + class_create_file(&bt_class, &class_attr_sco); BT_INFO("SCO (Voice Link) ver %s", VERSION); BT_INFO("SCO socket layer initialized"); @@ -1049,7 +983,7 @@ error: static void __exit sco_exit(void) { - sco_proc_cleanup(); + class_remove_file(&bt_class, &class_attr_sco); if (bt_sock_unregister(BTPROTO_SCO) < 0) BT_ERR("SCO socket unregistration failed"); diff --git a/net/core/datagram.c b/net/core/datagram.c index d219435d086..1bcfef51ac5 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -350,6 +350,20 @@ fault: return -EFAULT; } +unsigned int __skb_checksum_complete(struct sk_buff *skb) +{ + unsigned int sum; + + sum = (u16)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum)); + if (likely(!sum)) { + if (unlikely(skb->ip_summed == CHECKSUM_HW)) + netdev_rx_csum_fault(skb->dev); + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + return sum; +} +EXPORT_SYMBOL(__skb_checksum_complete); + /** * skb_copy_and_csum_datagram_iovec - Copy and checkum skb to user iovec. * @skb: skbuff @@ -363,7 +377,7 @@ fault: * -EFAULT - fault during copy. Beware, in this case iovec * can be modified! */ -int skb_copy_and_csum_datagram_iovec(const struct sk_buff *skb, +int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb, int hlen, struct iovec *iov) { unsigned int csum; @@ -376,8 +390,7 @@ int skb_copy_and_csum_datagram_iovec(const struct sk_buff *skb, iov++; if (iov->iov_len < chunk) { - if ((unsigned short)csum_fold(skb_checksum(skb, 0, chunk + hlen, - skb->csum))) + if (__skb_checksum_complete(skb)) goto csum_error; if (skb_copy_datagram_iovec(skb, hlen, iov, chunk)) goto fault; @@ -388,6 +401,8 @@ int skb_copy_and_csum_datagram_iovec(const struct sk_buff *skb, goto fault; if ((unsigned short)csum_fold(csum)) goto csum_error; + if (unlikely(skb->ip_summed == CHECKSUM_HW)) + netdev_rx_csum_fault(skb->dev); iov->iov_len -= chunk; iov->iov_base += chunk; } diff --git a/net/core/dev.c b/net/core/dev.c index 8d154159527..0b48e294aaf 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1108,6 +1108,18 @@ out: return ret; } +/* Take action when hardware reception checksum errors are detected. */ +#ifdef CONFIG_BUG +void netdev_rx_csum_fault(struct net_device *dev) +{ + if (net_ratelimit()) { + printk(KERN_ERR "%s: hw csum failure.\n", dev->name); + dump_stack(); + } +} +EXPORT_SYMBOL(netdev_rx_csum_fault); +#endif + #ifdef CONFIG_HIGHMEM /* Actually, we should eliminate this check as soon as we know, that: * 1. IOMMU is present and allows to map all the memory. diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c index db098ff3cd6..cb530eef0e3 100644 --- a/net/core/dev_mcast.c +++ b/net/core/dev_mcast.c @@ -194,8 +194,7 @@ int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl) done: spin_unlock_bh(&dev->xmit_lock); - if (dmi1) - kfree(dmi1); + kfree(dmi1); return err; } diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 802fe11efad..49424a42a2c 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -101,16 +101,20 @@ void netpoll_queue(struct sk_buff *skb) static int checksum_udp(struct sk_buff *skb, struct udphdr *uh, unsigned short ulen, u32 saddr, u32 daddr) { - if (uh->check == 0) + unsigned int psum; + + if (uh->check == 0 || skb->ip_summed == CHECKSUM_UNNECESSARY) return 0; - if (skb->ip_summed == CHECKSUM_HW) - return csum_tcpudp_magic( - saddr, daddr, ulen, IPPROTO_UDP, skb->csum); + psum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); + + if (skb->ip_summed == CHECKSUM_HW && + !(u16)csum_fold(csum_add(psum, skb->csum))) + return 0; - skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); + skb->csum = psum; - return csum_fold(skb_checksum(skb, 0, skb->len, skb->csum)); + return __skb_checksum_complete(skb); } /* @@ -489,7 +493,7 @@ int __netpoll_rx(struct sk_buff *skb) if (ulen != len) goto out; - if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr) < 0) + if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr)) goto out; if (np->local_ip && np->local_ip != ntohl(iph->daddr)) goto out; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 9bed7569ce3..8700379685e 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -49,6 +49,7 @@ #include <net/udp.h> #include <net/sock.h> #include <net/pkt_sched.h> +#include <net/netlink.h> DECLARE_MUTEX(rtnl_sem); @@ -462,11 +463,6 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change) netlink_broadcast(rtnl, skb, 0, RTNLGRP_LINK, GFP_KERNEL); } -static int rtnetlink_done(struct netlink_callback *cb) -{ - return 0; -} - /* Protected by RTNL sempahore. */ static struct rtattr **rta_buf; static int rtattr_max; @@ -524,8 +520,6 @@ rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp) } if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { - u32 rlen; - if (link->dumpit == NULL) link = &(rtnetlink_links[PF_UNSPEC][type]); @@ -533,14 +527,11 @@ rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp) goto err_inval; if ((*errp = netlink_dump_start(rtnl, skb, nlh, - link->dumpit, - rtnetlink_done)) != 0) { + link->dumpit, NULL)) != 0) { return -1; } - rlen = NLMSG_ALIGN(nlh->nlmsg_len); - if (rlen > skb->len) - rlen = skb->len; - skb_pull(skb, rlen); + + netlink_queue_skip(nlh, skb); return -1; } @@ -579,75 +570,13 @@ err_inval: return -1; } -/* - * Process one packet of messages. - * Malformed skbs with wrong lengths of messages are discarded silently. - */ - -static inline int rtnetlink_rcv_skb(struct sk_buff *skb) -{ - int err; - struct nlmsghdr * nlh; - - while (skb->len >= NLMSG_SPACE(0)) { - u32 rlen; - - nlh = (struct nlmsghdr *)skb->data; - if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) - return 0; - rlen = NLMSG_ALIGN(nlh->nlmsg_len); - if (rlen > skb->len) - rlen = skb->len; - if (rtnetlink_rcv_msg(skb, nlh, &err)) { - /* Not error, but we must interrupt processing here: - * Note, that in this case we do not pull message - * from skb, it will be processed later. - */ - if (err == 0) - return -1; - netlink_ack(skb, nlh, err); - } else if (nlh->nlmsg_flags&NLM_F_ACK) - netlink_ack(skb, nlh, 0); - skb_pull(skb, rlen); - } - - return 0; -} - -/* - * rtnetlink input queue processing routine: - * - process as much as there was in the queue upon entry. - * - feed skbs to rtnetlink_rcv_skb, until it refuse a message, - * that will occur, when a dump started. - */ - static void rtnetlink_rcv(struct sock *sk, int len) { - unsigned int qlen = skb_queue_len(&sk->sk_receive_queue); + unsigned int qlen = 0; do { - struct sk_buff *skb; - rtnl_lock(); - - if (qlen > skb_queue_len(&sk->sk_receive_queue)) - qlen = skb_queue_len(&sk->sk_receive_queue); - - for (; qlen; qlen--) { - skb = skb_dequeue(&sk->sk_receive_queue); - if (rtnetlink_rcv_skb(skb)) { - if (skb->len) - skb_queue_head(&sk->sk_receive_queue, - skb); - else { - kfree_skb(skb); - qlen--; - } - break; - } - kfree_skb(skb); - } - + netlink_run_queue(sk, &qlen, &rtnetlink_rcv_msg); up(&rtnl_sem); netdev_run_todo(); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 95501e40100..b7d13a4fff4 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -336,6 +336,9 @@ void __kfree_skb(struct sk_buff *skb) } #ifdef CONFIG_NETFILTER nf_conntrack_put(skb->nfct); +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + nf_conntrack_put_reasm(skb->nfct_reasm); +#endif #ifdef CONFIG_BRIDGE_NETFILTER nf_bridge_put(skb->nf_bridge); #endif @@ -414,9 +417,17 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) C(nfct); nf_conntrack_get(skb->nfct); C(nfctinfo); +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + C(nfct_reasm); + nf_conntrack_get_reasm(skb->nfct_reasm); +#endif #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) C(ipvs_property); #endif +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + C(nfct_reasm); + nf_conntrack_get_reasm(skb->nfct_reasm); +#endif #ifdef CONFIG_BRIDGE_NETFILTER C(nf_bridge); nf_bridge_get(skb->nf_bridge); @@ -474,6 +485,10 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->nfct = old->nfct; nf_conntrack_get(old->nfct); new->nfctinfo = old->nfctinfo; +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + new->nfct_reasm = old->nfct_reasm; + nf_conntrack_get_reasm(old->nfct_reasm); +#endif #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) new->ipvs_property = old->ipvs_property; #endif diff --git a/net/core/sock.c b/net/core/sock.c index 9602ceb3bac..13cc3be4f05 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1242,8 +1242,7 @@ static void sock_def_write_space(struct sock *sk) static void sock_def_destruct(struct sock *sk) { - if (sk->sk_protinfo) - kfree(sk->sk_protinfo); + kfree(sk->sk_protinfo); } void sk_send_sigurg(struct sock *sk) diff --git a/net/core/stream.c b/net/core/stream.c index ac9edfdf874..15bfd03e802 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -52,8 +52,9 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p) { struct task_struct *tsk = current; DEFINE_WAIT(wait); + int done; - while (1) { + do { if (sk->sk_err) return sock_error(sk); if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) @@ -65,13 +66,12 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p) prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); sk->sk_write_pending++; - if (sk_wait_event(sk, timeo_p, - !((1 << sk->sk_state) & - ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)))) - break; + done = sk_wait_event(sk, timeo_p, + !((1 << sk->sk_state) & + ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))); finish_wait(sk->sk_sleep, &wait); sk->sk_write_pending--; - } + } while (!done); return 0; } diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 6298cf58ff9..ca03521112c 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -31,8 +31,6 @@ struct inet_hashinfo __cacheline_aligned dccp_hashinfo = { .lhash_lock = RW_LOCK_UNLOCKED, .lhash_users = ATOMIC_INIT(0), .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(dccp_hashinfo.lhash_wait), - .portalloc_lock = SPIN_LOCK_UNLOCKED, - .port_rover = 1024 - 1, }; EXPORT_SYMBOL_GPL(dccp_hashinfo); @@ -125,36 +123,15 @@ static int dccp_v4_hash_connect(struct sock *sk) int ret; if (snum == 0) { - int rover; int low = sysctl_local_port_range[0]; int high = sysctl_local_port_range[1]; int remaining = (high - low) + 1; + int rover = net_random() % (high - low) + low; struct hlist_node *node; struct inet_timewait_sock *tw = NULL; local_bh_disable(); - - /* TODO. Actually it is not so bad idea to remove - * dccp_hashinfo.portalloc_lock before next submission to - * Linus. - * As soon as we touch this place at all it is time to think. - * - * Now it protects single _advisory_ variable - * dccp_hashinfo.port_rover, hence it is mostly useless. - * Code will work nicely if we just delete it, but - * I am afraid in contented case it will work not better or - * even worse: another cpu just will hit the same bucket - * and spin there. - * So some cpu salt could remove both contention and - * memory pingpong. Any ideas how to do this in a nice way? - */ - spin_lock(&dccp_hashinfo.portalloc_lock); - rover = dccp_hashinfo.port_rover; - do { - rover++; - if ((rover < low) || (rover > high)) - rover = low; head = &dccp_hashinfo.bhash[inet_bhashfn(rover, dccp_hashinfo.bhash_size)]; spin_lock(&head->lock); @@ -187,9 +164,9 @@ static int dccp_v4_hash_connect(struct sock *sk) next_port: spin_unlock(&head->lock); + if (++rover > high) + rover = low; } while (--remaining > 0); - dccp_hashinfo.port_rover = rover; - spin_unlock(&dccp_hashinfo.portalloc_lock); local_bh_enable(); @@ -197,9 +174,6 @@ static int dccp_v4_hash_connect(struct sock *sk) ok: /* All locks still held and bhs disabled */ - dccp_hashinfo.port_rover = rover; - spin_unlock(&dccp_hashinfo.portalloc_lock); - inet_bind_hash(sk, tb, rover); if (sk_unhashed(sk)) { inet_sk(sk)->sport = htons(rover); @@ -1289,10 +1263,8 @@ static int dccp_v4_destroy_sock(struct sock *sk) if (inet_csk(sk)->icsk_bind_hash != NULL) inet_put_port(&dccp_hashinfo, sk); - if (dp->dccps_service_list != NULL) { - kfree(dp->dccps_service_list); - dp->dccps_service_list = NULL; - } + kfree(dp->dccps_service_list); + dp->dccps_service_list = NULL; ccid_hc_rx_exit(dp->dccps_hc_rx_ccid, sk); ccid_hc_tx_exit(dp->dccps_hc_tx_ccid, sk); diff --git a/net/dccp/proto.c b/net/dccp/proto.c index a021c3422f6..e0ace7cbb99 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -238,8 +238,7 @@ static int dccp_setsockopt_service(struct sock *sk, const u32 service, lock_sock(sk); dp->dccps_service = service; - if (dp->dccps_service_list != NULL) - kfree(dp->dccps_service_list); + kfree(dp->dccps_service_list); dp->dccps_service_list = sl; release_sock(sk); diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 3f25cadccdd..f89e55f814d 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -1664,17 +1664,15 @@ static int dn_recvmsg(struct kiocb *iocb, struct socket *sock, goto out; } - rv = dn_check_state(sk, NULL, 0, &timeo, flags); - if (rv) - goto out; - if (sk->sk_shutdown & RCV_SHUTDOWN) { - if (!(flags & MSG_NOSIGNAL)) - send_sig(SIGPIPE, current, 0); - rv = -EPIPE; + rv = 0; goto out; } + rv = dn_check_state(sk, NULL, 0, &timeo, flags); + if (rv) + goto out; + if (flags & ~(MSG_PEEK|MSG_OOB|MSG_WAITALL|MSG_DONTWAIT|MSG_NOSIGNAL)) { rv = -EOPNOTSUPP; goto out; @@ -1928,6 +1926,8 @@ static int dn_sendmsg(struct kiocb *iocb, struct socket *sock, if (sk->sk_shutdown & SEND_SHUTDOWN) { err = -EPIPE; + if (!(flags & MSG_NOSIGNAL)) + send_sig(SIGPIPE, current, 0); goto out_err; } diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c index eeba56f9932..6f8b5658cb4 100644 --- a/net/decnet/dn_table.c +++ b/net/decnet/dn_table.c @@ -784,16 +784,14 @@ struct dn_fib_table *dn_fib_get_table(int n, int create) static void dn_fib_del_tree(int n) { - struct dn_fib_table *t; + struct dn_fib_table *t; - write_lock(&dn_fib_tables_lock); - t = dn_fib_tables[n]; - dn_fib_tables[n] = NULL; - write_unlock(&dn_fib_tables_lock); + write_lock(&dn_fib_tables_lock); + t = dn_fib_tables[n]; + dn_fib_tables[n] = NULL; + write_unlock(&dn_fib_tables_lock); - if (t) { - kfree(t); - } + kfree(t); } struct dn_fib_table *dn_fib_empty_table(void) diff --git a/net/ethernet/pe2.c b/net/ethernet/pe2.c index 98a494be603..9d57b4fb644 100644 --- a/net/ethernet/pe2.c +++ b/net/ethernet/pe2.c @@ -32,8 +32,7 @@ struct datalink_proto *make_EII_client(void) void destroy_EII_client(struct datalink_proto *dl) { - if (dl) - kfree(dl); + kfree(dl); } EXPORT_SYMBOL(destroy_EII_client); diff --git a/net/ieee80211/ieee80211_crypt.c b/net/ieee80211/ieee80211_crypt.c index f3b6aa3be63..ecc9bb196ab 100644 --- a/net/ieee80211/ieee80211_crypt.c +++ b/net/ieee80211/ieee80211_crypt.c @@ -11,16 +11,14 @@ * */ -#include <linux/config.h> -#include <linux/version.h> +#include <linux/errno.h> #include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> -#include <asm/string.h> -#include <asm/errno.h> - +#include <linux/string.h> #include <net/ieee80211.h> + MODULE_AUTHOR("Jouni Malinen"); MODULE_DESCRIPTION("HostAP crypto"); MODULE_LICENSE("GPL"); @@ -30,32 +28,20 @@ struct ieee80211_crypto_alg { struct ieee80211_crypto_ops *ops; }; -struct ieee80211_crypto { - struct list_head algs; - spinlock_t lock; -}; - -static struct ieee80211_crypto *hcrypt; +static LIST_HEAD(ieee80211_crypto_algs); +static DEFINE_SPINLOCK(ieee80211_crypto_lock); void ieee80211_crypt_deinit_entries(struct ieee80211_device *ieee, int force) { - struct list_head *ptr, *n; - struct ieee80211_crypt_data *entry; + struct ieee80211_crypt_data *entry, *next; unsigned long flags; spin_lock_irqsave(&ieee->lock, flags); - - if (list_empty(&ieee->crypt_deinit_list)) - goto unlock; - - for (ptr = ieee->crypt_deinit_list.next, n = ptr->next; - ptr != &ieee->crypt_deinit_list; ptr = n, n = ptr->next) { - entry = list_entry(ptr, struct ieee80211_crypt_data, list); - + list_for_each_entry_safe(entry, next, &ieee->crypt_deinit_list, list) { if (atomic_read(&entry->refcnt) != 0 && !force) continue; - list_del(ptr); + list_del(&entry->list); if (entry->ops) { entry->ops->deinit(entry->priv); @@ -63,7 +49,6 @@ void ieee80211_crypt_deinit_entries(struct ieee80211_device *ieee, int force) } kfree(entry); } - unlock: spin_unlock_irqrestore(&ieee->lock, flags); } @@ -126,9 +111,6 @@ int ieee80211_register_crypto_ops(struct ieee80211_crypto_ops *ops) unsigned long flags; struct ieee80211_crypto_alg *alg; - if (hcrypt == NULL) - return -1; - alg = kmalloc(sizeof(*alg), GFP_KERNEL); if (alg == NULL) return -ENOMEM; @@ -136,9 +118,9 @@ int ieee80211_register_crypto_ops(struct ieee80211_crypto_ops *ops) memset(alg, 0, sizeof(*alg)); alg->ops = ops; - spin_lock_irqsave(&hcrypt->lock, flags); - list_add(&alg->list, &hcrypt->algs); - spin_unlock_irqrestore(&hcrypt->lock, flags); + spin_lock_irqsave(&ieee80211_crypto_lock, flags); + list_add(&alg->list, &ieee80211_crypto_algs); + spin_unlock_irqrestore(&ieee80211_crypto_lock, flags); printk(KERN_DEBUG "ieee80211_crypt: registered algorithm '%s'\n", ops->name); @@ -148,64 +130,49 @@ int ieee80211_register_crypto_ops(struct ieee80211_crypto_ops *ops) int ieee80211_unregister_crypto_ops(struct ieee80211_crypto_ops *ops) { + struct ieee80211_crypto_alg *alg; unsigned long flags; - struct list_head *ptr; - struct ieee80211_crypto_alg *del_alg = NULL; - - if (hcrypt == NULL) - return -1; - - spin_lock_irqsave(&hcrypt->lock, flags); - for (ptr = hcrypt->algs.next; ptr != &hcrypt->algs; ptr = ptr->next) { - struct ieee80211_crypto_alg *alg = - (struct ieee80211_crypto_alg *)ptr; - if (alg->ops == ops) { - list_del(&alg->list); - del_alg = alg; - break; - } - } - spin_unlock_irqrestore(&hcrypt->lock, flags); - if (del_alg) { - printk(KERN_DEBUG "ieee80211_crypt: unregistered algorithm " - "'%s'\n", ops->name); - kfree(del_alg); + spin_lock_irqsave(&ieee80211_crypto_lock, flags); + list_for_each_entry(alg, &ieee80211_crypto_algs, list) { + if (alg->ops == ops) + goto found; } - - return del_alg ? 0 : -1; + spin_unlock_irqrestore(&ieee80211_crypto_lock, flags); + return -EINVAL; + + found: + printk(KERN_DEBUG "ieee80211_crypt: unregistered algorithm " + "'%s'\n", ops->name); + list_del(&alg->list); + spin_unlock_irqrestore(&ieee80211_crypto_lock, flags); + kfree(alg); + return 0; } struct ieee80211_crypto_ops *ieee80211_get_crypto_ops(const char *name) { + struct ieee80211_crypto_alg *alg; unsigned long flags; - struct list_head *ptr; - struct ieee80211_crypto_alg *found_alg = NULL; - - if (hcrypt == NULL) - return NULL; - - spin_lock_irqsave(&hcrypt->lock, flags); - for (ptr = hcrypt->algs.next; ptr != &hcrypt->algs; ptr = ptr->next) { - struct ieee80211_crypto_alg *alg = - (struct ieee80211_crypto_alg *)ptr; - if (strcmp(alg->ops->name, name) == 0) { - found_alg = alg; - break; - } + + spin_lock_irqsave(&ieee80211_crypto_lock, flags); + list_for_each_entry(alg, &ieee80211_crypto_algs, list) { + if (strcmp(alg->ops->name, name) == 0) + goto found; } - spin_unlock_irqrestore(&hcrypt->lock, flags); + spin_unlock_irqrestore(&ieee80211_crypto_lock, flags); + return NULL; - if (found_alg) - return found_alg->ops; - else - return NULL; + found: + spin_unlock_irqrestore(&ieee80211_crypto_lock, flags); + return alg->ops; } static void *ieee80211_crypt_null_init(int keyidx) { return (void *)1; } + static void ieee80211_crypt_null_deinit(void *priv) { } @@ -214,56 +181,18 @@ static struct ieee80211_crypto_ops ieee80211_crypt_null = { .name = "NULL", .init = ieee80211_crypt_null_init, .deinit = ieee80211_crypt_null_deinit, - .encrypt_mpdu = NULL, - .decrypt_mpdu = NULL, - .encrypt_msdu = NULL, - .decrypt_msdu = NULL, - .set_key = NULL, - .get_key = NULL, - .extra_mpdu_prefix_len = 0, - .extra_mpdu_postfix_len = 0, .owner = THIS_MODULE, }; static int __init ieee80211_crypto_init(void) { - int ret = -ENOMEM; - - hcrypt = kmalloc(sizeof(*hcrypt), GFP_KERNEL); - if (!hcrypt) - goto out; - - memset(hcrypt, 0, sizeof(*hcrypt)); - INIT_LIST_HEAD(&hcrypt->algs); - spin_lock_init(&hcrypt->lock); - - ret = ieee80211_register_crypto_ops(&ieee80211_crypt_null); - if (ret < 0) { - kfree(hcrypt); - hcrypt = NULL; - } - out: - return ret; + return ieee80211_register_crypto_ops(&ieee80211_crypt_null); } static void __exit ieee80211_crypto_deinit(void) { - struct list_head *ptr, *n; - - if (hcrypt == NULL) - return; - - for (ptr = hcrypt->algs.next, n = ptr->next; ptr != &hcrypt->algs; - ptr = n, n = ptr->next) { - struct ieee80211_crypto_alg *alg = - (struct ieee80211_crypto_alg *)ptr; - list_del(ptr); - printk(KERN_DEBUG "ieee80211_crypt: unregistered algorithm " - "'%s' (deinit)\n", alg->ops->name); - kfree(alg); - } - - kfree(hcrypt); + ieee80211_unregister_crypto_ops(&ieee80211_crypt_null); + BUG_ON(!list_empty(&ieee80211_crypto_algs)); } EXPORT_SYMBOL(ieee80211_crypt_deinit_entries); diff --git a/net/ieee80211/ieee80211_crypt_ccmp.c b/net/ieee80211/ieee80211_crypt_ccmp.c index 05a853c1301..47022172850 100644 --- a/net/ieee80211/ieee80211_crypt_ccmp.c +++ b/net/ieee80211/ieee80211_crypt_ccmp.c @@ -10,7 +10,6 @@ */ #include <linux/config.h> -#include <linux/version.h> #include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> diff --git a/net/ieee80211/ieee80211_crypt_tkip.c b/net/ieee80211/ieee80211_crypt_tkip.c index 2e34f29b795..e0988320efb 100644 --- a/net/ieee80211/ieee80211_crypt_tkip.c +++ b/net/ieee80211/ieee80211_crypt_tkip.c @@ -10,7 +10,6 @@ */ #include <linux/config.h> -#include <linux/version.h> #include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> diff --git a/net/ieee80211/ieee80211_crypt_wep.c b/net/ieee80211/ieee80211_crypt_wep.c index 7c08ed2f262..073aebdf0f6 100644 --- a/net/ieee80211/ieee80211_crypt_wep.c +++ b/net/ieee80211/ieee80211_crypt_wep.c @@ -10,7 +10,6 @@ */ #include <linux/config.h> -#include <linux/version.h> #include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> diff --git a/net/ieee80211/ieee80211_geo.c b/net/ieee80211/ieee80211_geo.c index c4b54ef8f6d..610cc5cbc25 100644 --- a/net/ieee80211/ieee80211_geo.c +++ b/net/ieee80211/ieee80211_geo.c @@ -38,7 +38,6 @@ #include <linux/slab.h> #include <linux/tcp.h> #include <linux/types.h> -#include <linux/version.h> #include <linux/wireless.h> #include <linux/etherdevice.h> #include <asm/uaccess.h> diff --git a/net/ieee80211/ieee80211_module.c b/net/ieee80211/ieee80211_module.c index f66d792cd20..321287bc887 100644 --- a/net/ieee80211/ieee80211_module.c +++ b/net/ieee80211/ieee80211_module.c @@ -45,7 +45,6 @@ #include <linux/slab.h> #include <linux/tcp.h> #include <linux/types.h> -#include <linux/version.h> #include <linux/wireless.h> #include <linux/etherdevice.h> #include <asm/uaccess.h> diff --git a/net/ieee80211/ieee80211_rx.c b/net/ieee80211/ieee80211_rx.c index ce694cf5c16..03efaacbdb7 100644 --- a/net/ieee80211/ieee80211_rx.c +++ b/net/ieee80211/ieee80211_rx.c @@ -28,7 +28,6 @@ #include <linux/slab.h> #include <linux/tcp.h> #include <linux/types.h> -#include <linux/version.h> #include <linux/wireless.h> #include <linux/etherdevice.h> #include <asm/uaccess.h> @@ -370,6 +369,7 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb, /* Put this code here so that we avoid duplicating it in all * Rx paths. - Jean II */ #ifdef IW_WIRELESS_SPY /* defined in iw_handler.h */ +#ifdef CONFIG_NET_RADIO /* If spy monitoring on */ if (ieee->spy_data.spy_number > 0) { struct iw_quality wstats; @@ -396,6 +396,7 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb, /* Update spy records */ wireless_spy_update(ieee->dev, hdr->addr2, &wstats); } +#endif /* CONFIG_NET_RADIO */ #endif /* IW_WIRELESS_SPY */ #ifdef NOT_YET diff --git a/net/ieee80211/ieee80211_tx.c b/net/ieee80211/ieee80211_tx.c index 95ccbadbf55..445f206e65e 100644 --- a/net/ieee80211/ieee80211_tx.c +++ b/net/ieee80211/ieee80211_tx.c @@ -38,7 +38,6 @@ #include <linux/slab.h> #include <linux/tcp.h> #include <linux/types.h> -#include <linux/version.h> #include <linux/wireless.h> #include <linux/etherdevice.h> #include <asm/uaccess.h> diff --git a/net/ieee80211/ieee80211_wx.c b/net/ieee80211/ieee80211_wx.c index 1ce7af9bec3..181755f2aa8 100644 --- a/net/ieee80211/ieee80211_wx.c +++ b/net/ieee80211/ieee80211_wx.c @@ -161,9 +161,11 @@ static inline char *ipw2100_translate_scan(struct ieee80211_device *ieee, (ieee->perfect_rssi - ieee->worst_rssi) - (ieee->perfect_rssi - network->stats.rssi) * (15 * (ieee->perfect_rssi - ieee->worst_rssi) + - 62 * (ieee->perfect_rssi - network->stats.rssi))) / - ((ieee->perfect_rssi - ieee->worst_rssi) * - (ieee->perfect_rssi - ieee->worst_rssi)); + 62 * (ieee->perfect_rssi - + network->stats.rssi))) / + ((ieee->perfect_rssi - + ieee->worst_rssi) * (ieee->perfect_rssi - + ieee->worst_rssi)); if (iwe.u.qual.qual > 100) iwe.u.qual.qual = 100; else if (iwe.u.qual.qual < 1) @@ -520,7 +522,8 @@ int ieee80211_wx_set_encodeext(struct ieee80211_device *ieee, crypt = &ieee->crypt[idx]; group_key = 1; } else { - if (idx != 0) + /* some Cisco APs use idx>0 for unicast in dynamic WEP */ + if (idx != 0 && ext->alg != IW_ENCODE_ALG_WEP) return -EINVAL; if (ieee->iw_mode == IW_MODE_INFRA) crypt = &ieee->crypt[idx]; @@ -688,7 +691,8 @@ int ieee80211_wx_get_encodeext(struct ieee80211_device *ieee, } else idx = ieee->tx_keyidx; - if (!ext->ext_flags & IW_ENCODE_EXT_GROUP_KEY) + if (!ext->ext_flags & IW_ENCODE_EXT_GROUP_KEY && + ext->alg != IW_ENCODE_ALG_WEP) if (idx != 0 || ieee->iw_mode != IW_MODE_INFRA) return -EINVAL; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index a9d84f93442..eaa150c33b0 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -147,8 +147,7 @@ void inet_sock_destruct(struct sock *sk) BUG_TRAP(!sk->sk_wmem_queued); BUG_TRAP(!sk->sk_forward_alloc); - if (inet->opt) - kfree(inet->opt); + kfree(inet->opt); dst_release(sk->sk_dst_cache); sk_refcnt_debug_dec(sk); } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 990633c09df..2267c1fad87 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -266,8 +266,7 @@ int ip_rt_ioctl(unsigned int cmd, void __user *arg) if (tb) err = tb->tb_insert(tb, &req.rtm, &rta, &req.nlh, NULL); } - if (rta.rta_mx) - kfree(rta.rta_mx); + kfree(rta.rta_mx); } rtnl_unlock(); return err; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 175e093ec56..e3eceecd049 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -934,11 +934,11 @@ int icmp_rcv(struct sk_buff *skb) case CHECKSUM_HW: if (!(u16)csum_fold(skb->csum)) break; - LIMIT_NETDEBUG(KERN_DEBUG "icmp v4 hw csum failure\n"); + /* fall through */ case CHECKSUM_NONE: - if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) + skb->csum = 0; + if (__skb_checksum_complete(skb)) goto error; - default:; } if (!pskb_pull(skb, sizeof(struct icmphdr))) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index c6247fc8406..c04607b4921 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -872,11 +872,18 @@ int igmp_rcv(struct sk_buff *skb) return 0; } - if (!pskb_may_pull(skb, sizeof(struct igmphdr)) || - (u16)csum_fold(skb_checksum(skb, 0, len, 0))) { - in_dev_put(in_dev); - kfree_skb(skb); - return 0; + if (!pskb_may_pull(skb, sizeof(struct igmphdr))) + goto drop; + + switch (skb->ip_summed) { + case CHECKSUM_HW: + if (!(u16)csum_fold(skb->csum)) + break; + /* fall through */ + case CHECKSUM_NONE: + skb->csum = 0; + if (__skb_checksum_complete(skb)) + goto drop; } ih = skb->h.igmph; @@ -906,6 +913,8 @@ int igmp_rcv(struct sk_buff *skb) default: NETDEBUG(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type); } + +drop: in_dev_put(in_dev); kfree_skb(skb); return 0; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 94468a76c5b..3fe021f1a56 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -78,17 +78,9 @@ int inet_csk_get_port(struct inet_hashinfo *hashinfo, int low = sysctl_local_port_range[0]; int high = sysctl_local_port_range[1]; int remaining = (high - low) + 1; - int rover; + int rover = net_random() % (high - low) + low; - spin_lock(&hashinfo->portalloc_lock); - if (hashinfo->port_rover < low) - rover = low; - else - rover = hashinfo->port_rover; do { - rover++; - if (rover > high) - rover = low; head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)]; spin_lock(&head->lock); inet_bind_bucket_for_each(tb, node, &head->chain) @@ -97,9 +89,9 @@ int inet_csk_get_port(struct inet_hashinfo *hashinfo, break; next: spin_unlock(&head->lock); + if (++rover > high) + rover = low; } while (--remaining > 0); - hashinfo->port_rover = rover; - spin_unlock(&hashinfo->portalloc_lock); /* Exhausted local port range during search? It is not * possible for us to be holding one of the bind hash diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 71f3c7350c6..39061ed53cf 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -724,12 +724,6 @@ done: return skb->len; } -static int inet_diag_dump_done(struct netlink_callback *cb) -{ - return 0; -} - - static __inline__ int inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { @@ -760,8 +754,7 @@ inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) goto err_inval; } return netlink_dump_start(idiagnl, skb, nlh, - inet_diag_dump, - inet_diag_dump_done); + inet_diag_dump, NULL); } else { return inet_diag_get_exact(skb, nlh); } diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index e7d26d9943c..8ce0ce2ee48 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -71,7 +71,7 @@ struct ipfrag_skb_cb /* Describe an entry in the "incomplete datagrams" queue. */ struct ipq { - struct ipq *next; /* linked list pointers */ + struct hlist_node list; struct list_head lru_list; /* lru list member */ u32 user; u32 saddr; @@ -89,7 +89,6 @@ struct ipq { spinlock_t lock; atomic_t refcnt; struct timer_list timer; /* when will this queue expire? */ - struct ipq **pprev; int iif; struct timeval stamp; }; @@ -99,7 +98,7 @@ struct ipq { #define IPQ_HASHSZ 64 /* Per-bucket lock is easy to add now. */ -static struct ipq *ipq_hash[IPQ_HASHSZ]; +static struct hlist_head ipq_hash[IPQ_HASHSZ]; static DEFINE_RWLOCK(ipfrag_lock); static u32 ipfrag_hash_rnd; static LIST_HEAD(ipq_lru_list); @@ -107,9 +106,7 @@ int ip_frag_nqueues = 0; static __inline__ void __ipq_unlink(struct ipq *qp) { - if(qp->next) - qp->next->pprev = qp->pprev; - *qp->pprev = qp->next; + hlist_del(&qp->list); list_del(&qp->lru_list); ip_frag_nqueues--; } @@ -139,27 +136,18 @@ static void ipfrag_secret_rebuild(unsigned long dummy) get_random_bytes(&ipfrag_hash_rnd, sizeof(u32)); for (i = 0; i < IPQ_HASHSZ; i++) { struct ipq *q; + struct hlist_node *p, *n; - q = ipq_hash[i]; - while (q) { - struct ipq *next = q->next; + hlist_for_each_entry_safe(q, p, n, &ipq_hash[i], list) { unsigned int hval = ipqhashfn(q->id, q->saddr, q->daddr, q->protocol); if (hval != i) { - /* Unlink. */ - if (q->next) - q->next->pprev = q->pprev; - *q->pprev = q->next; + hlist_del(&q->list); /* Relink to new hash chain. */ - if ((q->next = ipq_hash[hval]) != NULL) - q->next->pprev = &q->next; - ipq_hash[hval] = q; - q->pprev = &ipq_hash[hval]; + hlist_add_head(&q->list, &ipq_hash[hval]); } - - q = next; } } write_unlock(&ipfrag_lock); @@ -310,14 +298,16 @@ out: static struct ipq *ip_frag_intern(unsigned int hash, struct ipq *qp_in) { struct ipq *qp; - +#ifdef CONFIG_SMP + struct hlist_node *n; +#endif write_lock(&ipfrag_lock); #ifdef CONFIG_SMP /* With SMP race we have to recheck hash table, because * such entry could be created on other cpu, while we * promoted read lock to write lock. */ - for(qp = ipq_hash[hash]; qp; qp = qp->next) { + hlist_for_each_entry(qp, n, &ipq_hash[hash], list) { if(qp->id == qp_in->id && qp->saddr == qp_in->saddr && qp->daddr == qp_in->daddr && @@ -337,10 +327,7 @@ static struct ipq *ip_frag_intern(unsigned int hash, struct ipq *qp_in) atomic_inc(&qp->refcnt); atomic_inc(&qp->refcnt); - if((qp->next = ipq_hash[hash]) != NULL) - qp->next->pprev = &qp->next; - ipq_hash[hash] = qp; - qp->pprev = &ipq_hash[hash]; + hlist_add_head(&qp->list, &ipq_hash[hash]); INIT_LIST_HEAD(&qp->lru_list); list_add_tail(&qp->lru_list, &ipq_lru_list); ip_frag_nqueues++; @@ -392,9 +379,10 @@ static inline struct ipq *ip_find(struct iphdr *iph, u32 user) __u8 protocol = iph->protocol; unsigned int hash = ipqhashfn(id, saddr, daddr, protocol); struct ipq *qp; + struct hlist_node *n; read_lock(&ipfrag_lock); - for(qp = ipq_hash[hash]; qp; qp = qp->next) { + hlist_for_each_entry(qp, n, &ipq_hash[hash], list) { if(qp->id == id && qp->saddr == saddr && qp->daddr == daddr && diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 896ce3f8f53..4e9c74b54b1 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -577,15 +577,16 @@ static int ipgre_rcv(struct sk_buff *skb) goto drop_nolock; if (flags&GRE_CSUM) { - if (skb->ip_summed == CHECKSUM_HW) { + switch (skb->ip_summed) { + case CHECKSUM_HW: csum = (u16)csum_fold(skb->csum); - if (csum) - skb->ip_summed = CHECKSUM_NONE; - } - if (skb->ip_summed == CHECKSUM_NONE) { - skb->csum = skb_checksum(skb, 0, skb->len, 0); + if (!csum) + break; + /* fall through */ + case CHECKSUM_NONE: + skb->csum = 0; + csum = __skb_checksum_complete(skb); skb->ip_summed = CHECKSUM_HW; - csum = (u16)csum_fold(skb->csum); } offset += 4; } diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index bce4e875193..dbe12da8d8b 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -510,8 +510,7 @@ static int ip_options_get_finish(struct ip_options **optp, kfree(opt); return -EINVAL; } - if (*optp) - kfree(*optp); + kfree(*optp); *optp = opt; return 0; } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 17758234a3e..11c2f68254f 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -353,7 +353,8 @@ packet_routed: ip_options_build(skb, opt, inet->daddr, rt, 0); } - ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs); + ip_select_ident_more(iph, &rt->u.dst, sk, + (skb_shinfo(skb)->tso_segs ?: 1) - 1); /* Add an IP checksum. */ ip_send_check(iph); @@ -1262,10 +1263,8 @@ int ip_push_pending_frames(struct sock *sk) out: inet->cork.flags &= ~IPCORK_OPT; - if (inet->cork.opt) { - kfree(inet->cork.opt); - inet->cork.opt = NULL; - } + kfree(inet->cork.opt); + inet->cork.opt = NULL; if (inet->cork.rt) { ip_rt_put(inet->cork.rt); inet->cork.rt = NULL; @@ -1289,10 +1288,8 @@ void ip_flush_pending_frames(struct sock *sk) kfree_skb(skb); inet->cork.flags &= ~IPCORK_OPT; - if (inet->cork.opt) { - kfree(inet->cork.opt); - inet->cork.opt = NULL; - } + kfree(inet->cork.opt); + inet->cork.opt = NULL; if (inet->cork.rt) { ip_rt_put(inet->cork.rt); inet->cork.rt = NULL; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 2f0b47da5b3..4f2d8725730 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -202,8 +202,7 @@ int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct s if (ra->sk == sk) { if (on) { write_unlock_bh(&ip_ra_lock); - if (new_ra) - kfree(new_ra); + kfree(new_ra); return -EADDRINUSE; } *rap = ra->next; @@ -446,8 +445,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, #endif } opt = xchg(&inet->opt, opt); - if (opt) - kfree(opt); + kfree(opt); break; } case IP_PKTINFO: @@ -828,10 +826,8 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, err = ip_mc_msfilter(sk, msf, ifindex); mc_msf_out: - if (msf) - kfree(msf); - if (gsf) - kfree(gsf); + kfree(msf); + kfree(gsf); break; } case IP_ROUTER_ALERT: diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c index fc6f95aaa96..d7eb680101c 100644 --- a/net/ipv4/ipvs/ip_vs_app.c +++ b/net/ipv4/ipvs/ip_vs_app.c @@ -110,8 +110,7 @@ ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port) return 0; out: - if (inc->timeout_table) - kfree(inc->timeout_table); + kfree(inc->timeout_table); kfree(inc); return ret; } @@ -136,8 +135,7 @@ ip_vs_app_inc_release(struct ip_vs_app *inc) list_del(&inc->a_list); - if (inc->timeout_table != NULL) - kfree(inc->timeout_table); + kfree(inc->timeout_table); kfree(inc); } diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 981cc3244ef..1a0843cd58a 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -1009,11 +1009,10 @@ ip_vs_in(unsigned int hooknum, struct sk_buff **pskb, if (sysctl_ip_vs_expire_nodest_conn) { /* try to expire the connection immediately */ ip_vs_conn_expire_now(cp); - } else { - /* don't restart its timer, and silently - drop the packet. */ - __ip_vs_conn_put(cp); } + /* don't restart its timer, and silently + drop the packet. */ + __ip_vs_conn_put(cp); return NF_DROP; } diff --git a/net/ipv4/multipath_wrandom.c b/net/ipv4/multipath_wrandom.c index bd7d75b6abe..d34a9fa608e 100644 --- a/net/ipv4/multipath_wrandom.c +++ b/net/ipv4/multipath_wrandom.c @@ -207,16 +207,12 @@ static void wrandom_select_route(const struct flowi *flp, decision = mpc->rt; last_power = mpc->power; - if (last_mpc) - kfree(last_mpc); - + kfree(last_mpc); last_mpc = mpc; } - if (last_mpc) { - /* concurrent __multipath_flush may lead to !last_mpc */ - kfree(last_mpc); - } + /* concurrent __multipath_flush may lead to !last_mpc */ + kfree(last_mpc); decision->u.dst.__use++; *rp = decision; diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 7d917e4ce1d..9d3c8b5f327 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -5,6 +5,20 @@ menu "IP: Netfilter Configuration" depends on INET && NETFILTER +config NF_CONNTRACK_IPV4 + tristate "IPv4 support for new connection tracking (EXPERIMENTAL)" + depends on EXPERIMENTAL && NF_CONNTRACK + ---help--- + Connection tracking keeps a record of what packets have passed + through your machine, in order to figure out how they are related + into connections. + + This is IPv4 support on Layer 3 independent connection tracking. + Layer 3 independent connection tracking is experimental scheme + which generalize ip_conntrack to support other layer 3 protocols. + + To compile it as a module, choose M here. If unsure, say N. + # connection tracking, helpers and protocols config IP_NF_CONNTRACK tristate "Connection tracking (required for masq/NAT)" @@ -209,8 +223,8 @@ config IP_NF_MATCH_PKTTYPE tristate "Packet type match support" depends on IP_NF_IPTABLES help - Packet type matching allows you to match a packet by - its "class", eg. BROADCAST, MULTICAST, ... + Packet type matching allows you to match a packet by + its "class", eg. BROADCAST, MULTICAST, ... Typical usage: iptables -A INPUT -m pkttype --pkt-type broadcast -j LOG @@ -317,7 +331,8 @@ config IP_NF_MATCH_TCPMSS config IP_NF_MATCH_HELPER tristate "Helper match support" - depends on IP_NF_CONNTRACK && IP_NF_IPTABLES + depends on IP_NF_IPTABLES + depends on IP_NF_CONNTRACK || NF_CONNTRACK_IPV4 help Helper matching allows you to match packets in dynamic connections tracked by a conntrack-helper, ie. ip_conntrack_ftp @@ -326,7 +341,8 @@ config IP_NF_MATCH_HELPER config IP_NF_MATCH_STATE tristate "Connection state match support" - depends on IP_NF_CONNTRACK && IP_NF_IPTABLES + depends on IP_NF_IPTABLES + depends on IP_NF_CONNTRACK || NF_CONNTRACK_IPV4 help Connection state matching allows you to match packets based on their relationship to a tracked connection (ie. previous packets). This @@ -336,7 +352,8 @@ config IP_NF_MATCH_STATE config IP_NF_MATCH_CONNTRACK tristate "Connection tracking match support" - depends on IP_NF_CONNTRACK && IP_NF_IPTABLES + depends on IP_NF_IPTABLES + depends on IP_NF_CONNTRACK || NF_CONNTRACK_IPV4 help This is a general conntrack match module, a superset of the state match. @@ -422,7 +439,8 @@ config IP_NF_MATCH_COMMENT config IP_NF_MATCH_CONNMARK tristate 'Connection mark match support' - depends on IP_NF_CONNTRACK_MARK && IP_NF_IPTABLES + depends on IP_NF_IPTABLES + depends on IP_NF_CONNTRACK_MARK || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4) help This option adds a `connmark' match, which allows you to match the connection mark value previously set for the session by `CONNMARK'. @@ -433,7 +451,8 @@ config IP_NF_MATCH_CONNMARK config IP_NF_MATCH_CONNBYTES tristate 'Connection byte/packet counter match support' - depends on IP_NF_CT_ACCT && IP_NF_IPTABLES + depends on IP_NF_IPTABLES + depends on IP_NF_CT_ACCT || (NF_CT_ACCT && NF_CONNTRACK_IPV4) help This option adds a `connbytes' match, which allows you to match the number of bytes and/or packets for each direction within a connection. @@ -747,7 +766,8 @@ config IP_NF_TARGET_TTL config IP_NF_TARGET_CONNMARK tristate 'CONNMARK target support' - depends on IP_NF_CONNTRACK_MARK && IP_NF_MANGLE + depends on IP_NF_MANGLE + depends on IP_NF_CONNTRACK_MARK || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4) help This option adds a `CONNMARK' target, which allows one to manipulate the connection mark value. Similar to the MARK target, but @@ -759,7 +779,8 @@ config IP_NF_TARGET_CONNMARK config IP_NF_TARGET_CLUSTERIP tristate "CLUSTERIP target support (EXPERIMENTAL)" - depends on IP_NF_CONNTRACK_MARK && IP_NF_IPTABLES && EXPERIMENTAL + depends on IP_NF_IPTABLES && EXPERIMENTAL + depends on IP_NF_CONNTRACK_MARK || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4) help The CLUSTERIP target allows you to build load-balancing clusters of network servers without having a dedicated load-balancing @@ -782,7 +803,7 @@ config IP_NF_RAW config IP_NF_TARGET_NOTRACK tristate 'NOTRACK target support' depends on IP_NF_RAW - depends on IP_NF_CONNTRACK + depends on IP_NF_CONNTRACK || NF_CONNTRACK_IPV4 help The NOTRACK target allows a select rule to specify which packets *not* to enter the conntrack/NAT diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index dab4b58dd31..058c48e258f 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -103,3 +103,9 @@ obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o + +# objects for l3 independent conntrack +nf_conntrack_ipv4-objs := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o + +# l3 independent conntrack +obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c index d77d6b3f5f8..59e12b02b22 100644 --- a/net/ipv4/netfilter/ip_conntrack_ftp.c +++ b/net/ipv4/netfilter/ip_conntrack_ftp.c @@ -29,9 +29,9 @@ static char *ftp_buffer; static DEFINE_SPINLOCK(ip_ftp_lock); #define MAX_PORTS 8 -static short ports[MAX_PORTS]; +static unsigned short ports[MAX_PORTS]; static int ports_c; -module_param_array(ports, short, &ports_c, 0400); +module_param_array(ports, ushort, &ports_c, 0400); static int loose; module_param(loose, int, 0600); diff --git a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c index 926a6684643..4108a5e12b3 100644 --- a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c +++ b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c @@ -270,14 +270,10 @@ exp_gre(struct ip_conntrack *master, exp_orig->expectfn = pptp_expectfn; exp_orig->flags = 0; - exp_orig->dir = IP_CT_DIR_ORIGINAL; - /* both expectations are identical apart from tuple */ memcpy(exp_reply, exp_orig, sizeof(*exp_reply)); memcpy(&exp_reply->tuple, &exp_tuples[1], sizeof(exp_reply->tuple)); - exp_reply->dir = !exp_orig->dir; - if (ip_nat_pptp_hook_exp_gre) ret = ip_nat_pptp_hook_exp_gre(exp_orig, exp_reply); else { diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c index 15457415a4f..2dea1db1440 100644 --- a/net/ipv4/netfilter/ip_conntrack_irc.c +++ b/net/ipv4/netfilter/ip_conntrack_irc.c @@ -34,7 +34,7 @@ #include <linux/moduleparam.h> #define MAX_PORTS 8 -static short ports[MAX_PORTS]; +static unsigned short ports[MAX_PORTS]; static int ports_c; static int max_dcc_channels = 8; static unsigned int dcc_timeout = 300; @@ -52,7 +52,7 @@ EXPORT_SYMBOL_GPL(ip_nat_irc_hook); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_DESCRIPTION("IRC (DCC) connection tracking helper"); MODULE_LICENSE("GPL"); -module_param_array(ports, short, &ports_c, 0400); +module_param_array(ports, ushort, &ports_c, 0400); MODULE_PARM_DESC(ports, "port numbers of IRC servers"); module_param(max_dcc_channels, int, 0400); MODULE_PARM_DESC(max_dcc_channels, "max number of expected DCC channels per IRC session"); diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 166e6069f12..de9f4464438 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -28,11 +28,8 @@ #include <linux/netlink.h> #include <linux/spinlock.h> #include <linux/notifier.h> -#include <linux/rtnetlink.h> #include <linux/netfilter.h> -#include <linux/netfilter_ipv4.h> -#include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv4/ip_conntrack.h> #include <linux/netfilter_ipv4/ip_conntrack_core.h> #include <linux/netfilter_ipv4/ip_conntrack_helper.h> @@ -58,14 +55,17 @@ ctnetlink_dump_tuples_proto(struct sk_buff *skb, const struct ip_conntrack_tuple *tuple) { struct ip_conntrack_protocol *proto; + int ret = 0; NFA_PUT(skb, CTA_PROTO_NUM, sizeof(u_int8_t), &tuple->dst.protonum); proto = ip_conntrack_proto_find_get(tuple->dst.protonum); - if (proto && proto->tuple_to_nfattr) - return proto->tuple_to_nfattr(skb, tuple); + if (likely(proto && proto->tuple_to_nfattr)) { + ret = proto->tuple_to_nfattr(skb, tuple); + ip_conntrack_proto_put(proto); + } - return 0; + return ret; nfattr_failure: return -1; @@ -175,7 +175,7 @@ ctnetlink_dump_counters(struct sk_buff *skb, const struct ip_conntrack *ct, { enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG; struct nfattr *nest_count = NFA_NEST(skb, type); - u_int64_t tmp; + u_int32_t tmp; tmp = htonl(ct->counters[dir].packets); NFA_PUT(skb, CTA_COUNTERS32_PACKETS, sizeof(u_int32_t), &tmp); @@ -467,7 +467,7 @@ out: } #endif -static const int cta_min_ip[CTA_IP_MAX] = { +static const size_t cta_min_ip[CTA_IP_MAX] = { [CTA_IP_V4_SRC-1] = sizeof(u_int32_t), [CTA_IP_V4_DST-1] = sizeof(u_int32_t), }; @@ -479,9 +479,7 @@ ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple) DEBUGP("entered %s\n", __FUNCTION__); - - if (nfattr_parse_nested(tb, CTA_IP_MAX, attr) < 0) - goto nfattr_failure; + nfattr_parse_nested(tb, CTA_IP_MAX, attr); if (nfattr_bad_size(tb, CTA_IP_MAX, cta_min_ip)) return -EINVAL; @@ -497,12 +495,9 @@ ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple) DEBUGP("leaving\n"); return 0; - -nfattr_failure: - return -1; } -static const int cta_min_proto[CTA_PROTO_MAX] = { +static const size_t cta_min_proto[CTA_PROTO_MAX] = { [CTA_PROTO_NUM-1] = sizeof(u_int16_t), [CTA_PROTO_SRC_PORT-1] = sizeof(u_int16_t), [CTA_PROTO_DST_PORT-1] = sizeof(u_int16_t), @@ -521,8 +516,7 @@ ctnetlink_parse_tuple_proto(struct nfattr *attr, DEBUGP("entered %s\n", __FUNCTION__); - if (nfattr_parse_nested(tb, CTA_PROTO_MAX, attr) < 0) - goto nfattr_failure; + nfattr_parse_nested(tb, CTA_PROTO_MAX, attr); if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto)) return -EINVAL; @@ -539,9 +533,6 @@ ctnetlink_parse_tuple_proto(struct nfattr *attr, } return ret; - -nfattr_failure: - return -1; } static inline int @@ -555,8 +546,7 @@ ctnetlink_parse_tuple(struct nfattr *cda[], struct ip_conntrack_tuple *tuple, memset(tuple, 0, sizeof(*tuple)); - if (nfattr_parse_nested(tb, CTA_TUPLE_MAX, cda[type-1]) < 0) - goto nfattr_failure; + nfattr_parse_nested(tb, CTA_TUPLE_MAX, cda[type-1]); if (!tb[CTA_TUPLE_IP-1]) return -EINVAL; @@ -583,13 +573,10 @@ ctnetlink_parse_tuple(struct nfattr *cda[], struct ip_conntrack_tuple *tuple, DEBUGP("leaving\n"); return 0; - -nfattr_failure: - return -1; } #ifdef CONFIG_IP_NF_NAT_NEEDED -static const int cta_min_protonat[CTA_PROTONAT_MAX] = { +static const size_t cta_min_protonat[CTA_PROTONAT_MAX] = { [CTA_PROTONAT_PORT_MIN-1] = sizeof(u_int16_t), [CTA_PROTONAT_PORT_MAX-1] = sizeof(u_int16_t), }; @@ -603,11 +590,10 @@ static int ctnetlink_parse_nat_proto(struct nfattr *attr, DEBUGP("entered %s\n", __FUNCTION__); - if (nfattr_parse_nested(tb, CTA_PROTONAT_MAX, attr) < 0) - goto nfattr_failure; + nfattr_parse_nested(tb, CTA_PROTONAT_MAX, attr); if (nfattr_bad_size(tb, CTA_PROTONAT_MAX, cta_min_protonat)) - goto nfattr_failure; + return -EINVAL; npt = ip_nat_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); if (!npt) @@ -626,11 +612,13 @@ static int ctnetlink_parse_nat_proto(struct nfattr *attr, DEBUGP("leaving\n"); return 0; - -nfattr_failure: - return -1; } +static const size_t cta_min_nat[CTA_NAT_MAX] = { + [CTA_NAT_MINIP-1] = sizeof(u_int32_t), + [CTA_NAT_MAXIP-1] = sizeof(u_int32_t), +}; + static inline int ctnetlink_parse_nat(struct nfattr *cda[], const struct ip_conntrack *ct, struct ip_nat_range *range) @@ -642,8 +630,10 @@ ctnetlink_parse_nat(struct nfattr *cda[], memset(range, 0, sizeof(*range)); - if (nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]) < 0) - goto nfattr_failure; + nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]); + + if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat)) + return -EINVAL; if (tb[CTA_NAT_MINIP-1]) range->min_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MINIP-1]); @@ -665,9 +655,6 @@ ctnetlink_parse_nat(struct nfattr *cda[], DEBUGP("leaving\n"); return 0; - -nfattr_failure: - return -1; } #endif @@ -678,8 +665,7 @@ ctnetlink_parse_help(struct nfattr *attr, char **helper_name) DEBUGP("entered %s\n", __FUNCTION__); - if (nfattr_parse_nested(tb, CTA_HELP_MAX, attr) < 0) - goto nfattr_failure; + nfattr_parse_nested(tb, CTA_HELP_MAX, attr); if (!tb[CTA_HELP_NAME-1]) return -EINVAL; @@ -687,11 +673,16 @@ ctnetlink_parse_help(struct nfattr *attr, char **helper_name) *helper_name = NFA_DATA(tb[CTA_HELP_NAME-1]); return 0; - -nfattr_failure: - return -1; } +static const size_t cta_min[CTA_MAX] = { + [CTA_STATUS-1] = sizeof(u_int32_t), + [CTA_TIMEOUT-1] = sizeof(u_int32_t), + [CTA_MARK-1] = sizeof(u_int32_t), + [CTA_USE-1] = sizeof(u_int32_t), + [CTA_ID-1] = sizeof(u_int32_t) +}; + static int ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) @@ -703,6 +694,9 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, DEBUGP("entered %s\n", __FUNCTION__); + if (nfattr_bad_size(cda, CTA_MAX, cta_min)) + return -EINVAL; + if (cda[CTA_TUPLE_ORIG-1]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG); else if (cda[CTA_TUPLE_REPLY-1]) @@ -785,6 +779,9 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, return 0; } + if (nfattr_bad_size(cda, CTA_MAX, cta_min)) + return -EINVAL; + if (cda[CTA_TUPLE_ORIG-1]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG); else if (cda[CTA_TUPLE_REPLY-1]) @@ -804,7 +801,7 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, ct = tuplehash_to_ctrack(h); err = -ENOMEM; - skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb2) { ip_conntrack_put(ct); return -ENOMEM; @@ -815,7 +812,7 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, IPCTNL_MSG_CT_NEW, 1, ct); ip_conntrack_put(ct); if (err <= 0) - goto out; + goto free; err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); if (err < 0) @@ -824,10 +821,10 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, DEBUGP("leaving\n"); return 0; +free: + kfree_skb(skb2); out: - if (skb2) - kfree_skb(skb2); - return -1; + return err; } static inline int @@ -957,8 +954,7 @@ ctnetlink_change_protoinfo(struct ip_conntrack *ct, struct nfattr *cda[]) u_int16_t npt = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum; int err = 0; - if (nfattr_parse_nested(tb, CTA_PROTOINFO_MAX, attr) < 0) - goto nfattr_failure; + nfattr_parse_nested(tb, CTA_PROTOINFO_MAX, attr); proto = ip_conntrack_proto_find_get(npt); if (!proto) @@ -969,9 +965,6 @@ ctnetlink_change_protoinfo(struct ip_conntrack *ct, struct nfattr *cda[]) ip_conntrack_proto_put(proto); return err; - -nfattr_failure: - return -ENOMEM; } static int @@ -1005,6 +998,11 @@ ctnetlink_change_conntrack(struct ip_conntrack *ct, struct nfattr *cda[]) return err; } +#if defined(CONFIG_IP_NF_CONNTRACK_MARK) + if (cda[CTA_MARK-1]) + ct->mark = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_MARK-1])); +#endif + DEBUGP("all done\n"); return 0; } @@ -1048,6 +1046,11 @@ ctnetlink_create_conntrack(struct nfattr *cda[], if (ct->helper) ip_conntrack_helper_put(ct->helper); +#if defined(CONFIG_IP_NF_CONNTRACK_MARK) + if (cda[CTA_MARK-1]) + ct->mark = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_MARK-1])); +#endif + DEBUGP("conntrack with id %u inserted\n", ct->id); return 0; @@ -1066,6 +1069,9 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, DEBUGP("entered %s\n", __FUNCTION__); + if (nfattr_bad_size(cda, CTA_MAX, cta_min)) + return -EINVAL; + if (cda[CTA_TUPLE_ORIG-1]) { err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG); if (err < 0) @@ -1271,6 +1277,11 @@ out: return skb->len; } +static const size_t cta_min_exp[CTA_EXPECT_MAX] = { + [CTA_EXPECT_TIMEOUT-1] = sizeof(u_int32_t), + [CTA_EXPECT_ID-1] = sizeof(u_int32_t) +}; + static int ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) @@ -1282,6 +1293,9 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, DEBUGP("entered %s\n", __FUNCTION__); + if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp)) + return -EINVAL; + if (nlh->nlmsg_flags & NLM_F_DUMP) { struct nfgenmsg *msg = NLMSG_DATA(nlh); u32 rlen; @@ -1312,6 +1326,14 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, if (!exp) return -ENOENT; + if (cda[CTA_EXPECT_ID-1]) { + u_int32_t id = *(u_int32_t *)NFA_DATA(cda[CTA_EXPECT_ID-1]); + if (exp->id != ntohl(id)) { + ip_conntrack_expect_put(exp); + return -ENOENT; + } + } + err = -ENOMEM; skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb2) @@ -1322,21 +1344,16 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, 1, exp); if (err <= 0) - goto out; + goto free; ip_conntrack_expect_put(exp); - err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); - if (err < 0) - goto free; - - return err; + return netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); +free: + kfree_skb(skb2); out: ip_conntrack_expect_put(exp); -free: - if (skb2) - kfree_skb(skb2); return err; } @@ -1349,6 +1366,9 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, struct ip_conntrack_helper *h; int err; + if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp)) + return -EINVAL; + if (cda[CTA_EXPECT_TUPLE-1]) { /* delete a single expect by tuple */ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE); @@ -1392,7 +1412,7 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, ip_conntrack_expect_put(exp); } } - write_unlock(&ip_conntrack_lock); + write_unlock_bh(&ip_conntrack_lock); } else { /* This basically means we have to flush everything*/ write_lock_bh(&ip_conntrack_lock); @@ -1478,6 +1498,9 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb, DEBUGP("entered %s\n", __FUNCTION__); + if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp)) + return -EINVAL; + if (!cda[CTA_EXPECT_TUPLE-1] || !cda[CTA_EXPECT_MASK-1] || !cda[CTA_EXPECT_MASTER-1]) @@ -1520,29 +1543,22 @@ static struct notifier_block ctnl_notifier_exp = { static struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = { [IPCTNL_MSG_CT_NEW] = { .call = ctnetlink_new_conntrack, - .attr_count = CTA_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_MAX, }, [IPCTNL_MSG_CT_GET] = { .call = ctnetlink_get_conntrack, - .attr_count = CTA_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_MAX, }, [IPCTNL_MSG_CT_DELETE] = { .call = ctnetlink_del_conntrack, - .attr_count = CTA_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_MAX, }, [IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack, - .attr_count = CTA_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_MAX, }, }; static struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = { [IPCTNL_MSG_EXP_GET] = { .call = ctnetlink_get_expect, - .attr_count = CTA_EXPECT_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_EXPECT_MAX, }, [IPCTNL_MSG_EXP_NEW] = { .call = ctnetlink_new_expect, - .attr_count = CTA_EXPECT_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_EXPECT_MAX, }, [IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect, - .attr_count = CTA_EXPECT_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_EXPECT_MAX, }, }; static struct nfnetlink_subsystem ctnl_subsys = { @@ -1559,6 +1575,8 @@ static struct nfnetlink_subsystem ctnl_exp_subsys = { .cb = ctnl_exp_cb, }; +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK); + static int __init ctnetlink_init(void) { int ret; diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c index 98f0015dd25..e4d6b268e8c 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c @@ -13,6 +13,7 @@ #include <linux/in.h> #include <linux/icmp.h> #include <linux/seq_file.h> +#include <linux/skbuff.h> #include <net/ip.h> #include <net/checksum.h> #include <linux/netfilter.h> @@ -151,13 +152,13 @@ icmp_error_message(struct sk_buff *skb, /* Not enough header? */ inside = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_in), &_in); if (inside == NULL) - return NF_ACCEPT; + return -NF_ACCEPT; /* Ignore ICMP's containing fragments (shouldn't happen) */ if (inside->ip.frag_off & htons(IP_OFFSET)) { DEBUGP("icmp_error_track: fragment of proto %u\n", inside->ip.protocol); - return NF_ACCEPT; + return -NF_ACCEPT; } innerproto = ip_conntrack_proto_find_get(inside->ip.protocol); @@ -166,7 +167,7 @@ icmp_error_message(struct sk_buff *skb, if (!ip_ct_get_tuple(&inside->ip, skb, dataoff, &origtuple, innerproto)) { DEBUGP("icmp_error: ! get_tuple p=%u", inside->ip.protocol); ip_conntrack_proto_put(innerproto); - return NF_ACCEPT; + return -NF_ACCEPT; } /* Ordinarily, we'd expect the inverted tupleproto, but it's @@ -174,7 +175,7 @@ icmp_error_message(struct sk_buff *skb, if (!ip_ct_invert_tuple(&innertuple, &origtuple, innerproto)) { DEBUGP("icmp_error_track: Can't invert tuple\n"); ip_conntrack_proto_put(innerproto); - return NF_ACCEPT; + return -NF_ACCEPT; } ip_conntrack_proto_put(innerproto); @@ -190,7 +191,7 @@ icmp_error_message(struct sk_buff *skb, if (!h) { DEBUGP("icmp_error_track: no match\n"); - return NF_ACCEPT; + return -NF_ACCEPT; } /* Reverse direction from that found */ if (DIRECTION(h) != IP_CT_DIR_REPLY) @@ -230,19 +231,15 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, case CHECKSUM_HW: if (!(u16)csum_fold(skb->csum)) break; - if (LOG_INVALID(IPPROTO_ICMP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, - "ip_ct_icmp: bad HW ICMP checksum "); - return -NF_ACCEPT; + /* fall through */ case CHECKSUM_NONE: - if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) { + skb->csum = 0; + if (__skb_checksum_complete(skb)) { if (LOG_INVALID(IPPROTO_ICMP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_icmp: bad ICMP checksum "); return -NF_ACCEPT; } - default: - break; } checksum_skipped: @@ -296,7 +293,8 @@ static int icmp_nfattr_to_tuple(struct nfattr *tb[], struct ip_conntrack_tuple *tuple) { if (!tb[CTA_PROTO_ICMP_TYPE-1] - || !tb[CTA_PROTO_ICMP_CODE-1]) + || !tb[CTA_PROTO_ICMP_CODE-1] + || !tb[CTA_PROTO_ICMP_ID-1]) return -1; tuple->dst.u.icmp.type = @@ -304,7 +302,7 @@ static int icmp_nfattr_to_tuple(struct nfattr *tb[], tuple->dst.u.icmp.code = *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_CODE-1]); tuple->src.u.icmp.id = - *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_ID-1]); + *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_ICMP_ID-1]); return 0; } diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index d6701cafbcc..ee3b7d6c4d2 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -357,13 +357,24 @@ nfattr_failure: return -1; } +static const size_t cta_min_tcp[CTA_PROTOINFO_TCP_MAX] = { + [CTA_PROTOINFO_TCP_STATE-1] = sizeof(u_int8_t), +}; + static int nfattr_to_tcp(struct nfattr *cda[], struct ip_conntrack *ct) { struct nfattr *attr = cda[CTA_PROTOINFO_TCP-1]; struct nfattr *tb[CTA_PROTOINFO_TCP_MAX]; - if (nfattr_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, attr) < 0) - goto nfattr_failure; + /* updates could not contain anything about the private + * protocol info, in that case skip the parsing */ + if (!attr) + return 0; + + nfattr_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, attr); + + if (nfattr_bad_size(tb, CTA_PROTOINFO_TCP_MAX, cta_min_tcp)) + return -EINVAL; if (!tb[CTA_PROTOINFO_TCP_STATE-1]) return -EINVAL; @@ -374,9 +385,6 @@ static int nfattr_to_tcp(struct nfattr *cda[], struct ip_conntrack *ct) write_unlock_bh(&tcp_lock); return 0; - -nfattr_failure: - return -1; } #endif @@ -813,6 +821,7 @@ static u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] = { [TH_SYN] = 1, [TH_SYN|TH_ACK] = 1, + [TH_SYN|TH_PUSH] = 1, [TH_SYN|TH_ACK|TH_PUSH] = 1, [TH_RST] = 1, [TH_RST|TH_ACK] = 1, diff --git a/net/ipv4/netfilter/ip_conntrack_tftp.c b/net/ipv4/netfilter/ip_conntrack_tftp.c index a78736b8525..d3c5a371f99 100644 --- a/net/ipv4/netfilter/ip_conntrack_tftp.c +++ b/net/ipv4/netfilter/ip_conntrack_tftp.c @@ -26,9 +26,9 @@ MODULE_DESCRIPTION("tftp connection tracking helper"); MODULE_LICENSE("GPL"); #define MAX_PORTS 8 -static short ports[MAX_PORTS]; +static unsigned short ports[MAX_PORTS]; static int ports_c; -module_param_array(ports, short, &ports_c, 0400); +module_param_array(ports, ushort, &ports_c, 0400); MODULE_PARM_DESC(ports, "port numbers of tftp servers"); #if 0 diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c index c5e3abd2467..762f4d93936 100644 --- a/net/ipv4/netfilter/ip_nat_core.c +++ b/net/ipv4/netfilter/ip_nat_core.c @@ -66,10 +66,8 @@ ip_nat_proto_find_get(u_int8_t protonum) * removed until we've grabbed the reference */ preempt_disable(); p = __ip_nat_proto_find(protonum); - if (p) { - if (!try_module_get(p->me)) - p = &ip_nat_unknown_protocol; - } + if (!try_module_get(p->me)) + p = &ip_nat_unknown_protocol; preempt_enable(); return p; diff --git a/net/ipv4/netfilter/ip_nat_helper_pptp.c b/net/ipv4/netfilter/ip_nat_helper_pptp.c index 3cdd0684d30..e546203f566 100644 --- a/net/ipv4/netfilter/ip_nat_helper_pptp.c +++ b/net/ipv4/netfilter/ip_nat_helper_pptp.c @@ -73,6 +73,7 @@ static void pptp_nat_expected(struct ip_conntrack *ct, struct ip_conntrack_tuple t; struct ip_ct_pptp_master *ct_pptp_info; struct ip_nat_pptp *nat_pptp_info; + struct ip_nat_range range; ct_pptp_info = &master->help.ct_pptp_info; nat_pptp_info = &master->nat.help.nat_pptp_info; @@ -110,7 +111,30 @@ static void pptp_nat_expected(struct ip_conntrack *ct, DEBUGP("not found!\n"); } - ip_nat_follow_master(ct, exp); + /* This must be a fresh one. */ + BUG_ON(ct->status & IPS_NAT_DONE_MASK); + + /* Change src to where master sends to */ + range.flags = IP_NAT_RANGE_MAP_IPS; + range.min_ip = range.max_ip + = ct->master->tuplehash[!exp->dir].tuple.dst.ip; + if (exp->dir == IP_CT_DIR_ORIGINAL) { + range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED; + range.min = range.max = exp->saved_proto; + } + /* hook doesn't matter, but it has to do source manip */ + ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING); + + /* For DST manip, map port here to where it's expected. */ + range.flags = IP_NAT_RANGE_MAP_IPS; + range.min_ip = range.max_ip + = ct->master->tuplehash[!exp->dir].tuple.src.ip; + if (exp->dir == IP_CT_DIR_REPLY) { + range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED; + range.min = range.max = exp->saved_proto; + } + /* hook doesn't matter, but it has to do destination manip */ + ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING); } /* outbound packets == from PNS to PAC */ @@ -213,9 +237,10 @@ pptp_exp_gre(struct ip_conntrack_expect *expect_orig, /* alter expectation for PNS->PAC direction */ invert_tuplepr(&inv_t, &expect_orig->tuple); - expect_orig->saved_proto.gre.key = htons(nat_pptp_info->pac_call_id); + expect_orig->saved_proto.gre.key = htons(ct_pptp_info->pns_call_id); expect_orig->tuple.src.u.gre.key = htons(nat_pptp_info->pns_call_id); expect_orig->tuple.dst.u.gre.key = htons(ct_pptp_info->pac_call_id); + expect_orig->dir = IP_CT_DIR_ORIGINAL; inv_t.src.ip = reply_t->src.ip; inv_t.dst.ip = reply_t->dst.ip; inv_t.src.u.gre.key = htons(nat_pptp_info->pac_call_id); @@ -233,6 +258,7 @@ pptp_exp_gre(struct ip_conntrack_expect *expect_orig, expect_reply->saved_proto.gre.key = htons(nat_pptp_info->pns_call_id); expect_reply->tuple.src.u.gre.key = htons(nat_pptp_info->pac_call_id); expect_reply->tuple.dst.u.gre.key = htons(ct_pptp_info->pns_call_id); + expect_reply->dir = IP_CT_DIR_REPLY; inv_t.src.ip = orig_t->src.ip; inv_t.dst.ip = orig_t->dst.ip; inv_t.src.u.gre.key = htons(nat_pptp_info->pns_call_id); diff --git a/net/ipv4/netfilter/ip_nat_proto_gre.c b/net/ipv4/netfilter/ip_nat_proto_gre.c index 7c128540167..f7cad7cf1ae 100644 --- a/net/ipv4/netfilter/ip_nat_proto_gre.c +++ b/net/ipv4/netfilter/ip_nat_proto_gre.c @@ -139,8 +139,8 @@ gre_manip_pkt(struct sk_buff **pskb, break; case GRE_VERSION_PPTP: DEBUGP("call_id -> 0x%04x\n", - ntohl(tuple->dst.u.gre.key)); - pgreh->call_id = htons(ntohl(tuple->dst.u.gre.key)); + ntohs(tuple->dst.u.gre.key)); + pgreh->call_id = tuple->dst.u.gre.key; break; default: DEBUGP("can't nat unknown GRE version\n"); diff --git a/net/ipv4/netfilter/ip_nat_proto_unknown.c b/net/ipv4/netfilter/ip_nat_proto_unknown.c index 99bbef56f84..f0099a646a0 100644 --- a/net/ipv4/netfilter/ip_nat_proto_unknown.c +++ b/net/ipv4/netfilter/ip_nat_proto_unknown.c @@ -62,7 +62,7 @@ unknown_print_range(char *buffer, const struct ip_nat_range *range) struct ip_nat_protocol ip_nat_unknown_protocol = { .name = "unknown", - .me = THIS_MODULE, + /* .me isn't set: getting a ref to this cannot fail. */ .manip_pkt = unknown_manip_pkt, .in_range = unknown_in_range, .unique_tuple = unknown_unique_tuple, diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c index 93b2c5111bb..8acb7ed40b4 100644 --- a/net/ipv4/netfilter/ip_nat_snmp_basic.c +++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c @@ -1161,8 +1161,7 @@ static int snmp_parse_mangle(unsigned char *msg, if (!snmp_object_decode(&ctx, obj)) { if (*obj) { - if ((*obj)->id) - kfree((*obj)->id); + kfree((*obj)->id); kfree(*obj); } kfree(obj); diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 9bcb398fbc1..45c52d8f4d9 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -29,7 +29,7 @@ #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv4/ipt_CLUSTERIP.h> -#include <linux/netfilter_ipv4/ip_conntrack.h> +#include <net/netfilter/nf_conntrack_compat.h> #define CLUSTERIP_VERSION "0.8" @@ -316,14 +316,14 @@ target(struct sk_buff **pskb, { const struct ipt_clusterip_tgt_info *cipinfo = targinfo; enum ip_conntrack_info ctinfo; - struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo); - u_int32_t hash; + u_int32_t *mark, hash; /* don't need to clusterip_config_get() here, since refcount * is only decremented by destroy() - and ip_tables guarantees * that the ->target() function isn't called after ->destroy() */ - if (!ct) { + mark = nf_ct_get_mark((*pskb), &ctinfo); + if (mark == NULL) { printk(KERN_ERR "CLUSTERIP: no conntrack!\n"); /* FIXME: need to drop invalid ones, since replies * to outgoing connections of other nodes will be @@ -346,7 +346,7 @@ target(struct sk_buff **pskb, switch (ctinfo) { case IP_CT_NEW: - ct->mark = hash; + *mark = hash; break; case IP_CT_RELATED: case IP_CT_RELATED+IP_CT_IS_REPLY: @@ -363,7 +363,7 @@ target(struct sk_buff **pskb, #ifdef DEBUG_CLUSTERP DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); #endif - DEBUGP("hash=%u ct_hash=%u ", hash, ct->mark); + DEBUGP("hash=%u ct_hash=%u ", hash, *mark); if (!clusterip_responsible(cipinfo->config, hash)) { DEBUGP("not responsible\n"); return NF_DROP; diff --git a/net/ipv4/netfilter/ipt_CONNMARK.c b/net/ipv4/netfilter/ipt_CONNMARK.c index 13463802133..8acac5a40a9 100644 --- a/net/ipv4/netfilter/ipt_CONNMARK.c +++ b/net/ipv4/netfilter/ipt_CONNMARK.c @@ -29,7 +29,7 @@ MODULE_LICENSE("GPL"); #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv4/ipt_CONNMARK.h> -#include <linux/netfilter_ipv4/ip_conntrack.h> +#include <net/netfilter/nf_conntrack_compat.h> static unsigned int target(struct sk_buff **pskb, @@ -43,24 +43,24 @@ target(struct sk_buff **pskb, u_int32_t diff; u_int32_t nfmark; u_int32_t newmark; + u_int32_t ctinfo; + u_int32_t *ctmark = nf_ct_get_mark(*pskb, &ctinfo); - enum ip_conntrack_info ctinfo; - struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo); - if (ct) { + if (ctmark) { switch(markinfo->mode) { case IPT_CONNMARK_SET: - newmark = (ct->mark & ~markinfo->mask) | markinfo->mark; - if (newmark != ct->mark) - ct->mark = newmark; + newmark = (*ctmark & ~markinfo->mask) | markinfo->mark; + if (newmark != *ctmark) + *ctmark = newmark; break; case IPT_CONNMARK_SAVE: - newmark = (ct->mark & ~markinfo->mask) | ((*pskb)->nfmark & markinfo->mask); - if (ct->mark != newmark) - ct->mark = newmark; + newmark = (*ctmark & ~markinfo->mask) | ((*pskb)->nfmark & markinfo->mask); + if (*ctmark != newmark) + *ctmark = newmark; break; case IPT_CONNMARK_RESTORE: nfmark = (*pskb)->nfmark; - diff = (ct->mark ^ nfmark) & markinfo->mask; + diff = (*ctmark ^ nfmark) & markinfo->mask; if (diff != 0) (*pskb)->nfmark = nfmark ^ diff; break; @@ -109,6 +109,7 @@ static struct ipt_target ipt_connmark_reg = { static int __init init(void) { + need_ip_conntrack(); return ipt_register_target(&ipt_connmark_reg); } diff --git a/net/ipv4/netfilter/ipt_NOTRACK.c b/net/ipv4/netfilter/ipt_NOTRACK.c index a4bb9b3bc29..e3c69d072c6 100644 --- a/net/ipv4/netfilter/ipt_NOTRACK.c +++ b/net/ipv4/netfilter/ipt_NOTRACK.c @@ -5,7 +5,7 @@ #include <linux/skbuff.h> #include <linux/netfilter_ipv4/ip_tables.h> -#include <linux/netfilter_ipv4/ip_conntrack.h> +#include <net/netfilter/nf_conntrack_compat.h> static unsigned int target(struct sk_buff **pskb, @@ -23,7 +23,7 @@ target(struct sk_buff **pskb, If there is a real ct entry correspondig to this packet, it'll hang aroun till timing out. We don't deal with it for performance reasons. JK */ - (*pskb)->nfct = &ip_conntrack_untracked.ct_general; + nf_ct_untrack(*pskb); (*pskb)->nfctinfo = IP_CT_NEW; nf_conntrack_get((*pskb)->nfct); diff --git a/net/ipv4/netfilter/ipt_connbytes.c b/net/ipv4/netfilter/ipt_connbytes.c index df4a42c6da2..d68a048b717 100644 --- a/net/ipv4/netfilter/ipt_connbytes.c +++ b/net/ipv4/netfilter/ipt_connbytes.c @@ -10,7 +10,7 @@ */ #include <linux/module.h> #include <linux/skbuff.h> -#include <linux/netfilter_ipv4/ip_conntrack.h> +#include <net/netfilter/nf_conntrack_compat.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv4/ipt_connbytes.h> @@ -46,60 +46,59 @@ match(const struct sk_buff *skb, int *hotdrop) { const struct ipt_connbytes_info *sinfo = matchinfo; - enum ip_conntrack_info ctinfo; - struct ip_conntrack *ct; u_int64_t what = 0; /* initialize to make gcc happy */ + const struct ip_conntrack_counter *counters; - if (!(ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo))) + if (!(counters = nf_ct_get_counters(skb))) return 0; /* no match */ switch (sinfo->what) { case IPT_CONNBYTES_PKTS: switch (sinfo->direction) { case IPT_CONNBYTES_DIR_ORIGINAL: - what = ct->counters[IP_CT_DIR_ORIGINAL].packets; + what = counters[IP_CT_DIR_ORIGINAL].packets; break; case IPT_CONNBYTES_DIR_REPLY: - what = ct->counters[IP_CT_DIR_REPLY].packets; + what = counters[IP_CT_DIR_REPLY].packets; break; case IPT_CONNBYTES_DIR_BOTH: - what = ct->counters[IP_CT_DIR_ORIGINAL].packets; - what += ct->counters[IP_CT_DIR_REPLY].packets; + what = counters[IP_CT_DIR_ORIGINAL].packets; + what += counters[IP_CT_DIR_REPLY].packets; break; } break; case IPT_CONNBYTES_BYTES: switch (sinfo->direction) { case IPT_CONNBYTES_DIR_ORIGINAL: - what = ct->counters[IP_CT_DIR_ORIGINAL].bytes; + what = counters[IP_CT_DIR_ORIGINAL].bytes; break; case IPT_CONNBYTES_DIR_REPLY: - what = ct->counters[IP_CT_DIR_REPLY].bytes; + what = counters[IP_CT_DIR_REPLY].bytes; break; case IPT_CONNBYTES_DIR_BOTH: - what = ct->counters[IP_CT_DIR_ORIGINAL].bytes; - what += ct->counters[IP_CT_DIR_REPLY].bytes; + what = counters[IP_CT_DIR_ORIGINAL].bytes; + what += counters[IP_CT_DIR_REPLY].bytes; break; } break; case IPT_CONNBYTES_AVGPKT: switch (sinfo->direction) { case IPT_CONNBYTES_DIR_ORIGINAL: - what = div64_64(ct->counters[IP_CT_DIR_ORIGINAL].bytes, - ct->counters[IP_CT_DIR_ORIGINAL].packets); + what = div64_64(counters[IP_CT_DIR_ORIGINAL].bytes, + counters[IP_CT_DIR_ORIGINAL].packets); break; case IPT_CONNBYTES_DIR_REPLY: - what = div64_64(ct->counters[IP_CT_DIR_REPLY].bytes, - ct->counters[IP_CT_DIR_REPLY].packets); + what = div64_64(counters[IP_CT_DIR_REPLY].bytes, + counters[IP_CT_DIR_REPLY].packets); break; case IPT_CONNBYTES_DIR_BOTH: { u_int64_t bytes; u_int64_t pkts; - bytes = ct->counters[IP_CT_DIR_ORIGINAL].bytes + - ct->counters[IP_CT_DIR_REPLY].bytes; - pkts = ct->counters[IP_CT_DIR_ORIGINAL].packets+ - ct->counters[IP_CT_DIR_REPLY].packets; + bytes = counters[IP_CT_DIR_ORIGINAL].bytes + + counters[IP_CT_DIR_REPLY].bytes; + pkts = counters[IP_CT_DIR_ORIGINAL].packets+ + counters[IP_CT_DIR_REPLY].packets; /* FIXME_THEORETICAL: what to do if sum * overflows ? */ diff --git a/net/ipv4/netfilter/ipt_connmark.c b/net/ipv4/netfilter/ipt_connmark.c index bf8de47ce00..5306ef293b9 100644 --- a/net/ipv4/netfilter/ipt_connmark.c +++ b/net/ipv4/netfilter/ipt_connmark.c @@ -28,7 +28,7 @@ MODULE_LICENSE("GPL"); #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv4/ipt_connmark.h> -#include <linux/netfilter_ipv4/ip_conntrack.h> +#include <net/netfilter/nf_conntrack_compat.h> static int match(const struct sk_buff *skb, @@ -39,12 +39,12 @@ match(const struct sk_buff *skb, int *hotdrop) { const struct ipt_connmark_info *info = matchinfo; - enum ip_conntrack_info ctinfo; - struct ip_conntrack *ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo); - if (!ct) + u_int32_t ctinfo; + const u_int32_t *ctmark = nf_ct_get_mark(skb, &ctinfo); + if (!ctmark) return 0; - return ((ct->mark & info->mask) == info->mark) ^ info->invert; + return (((*ctmark) & info->mask) == info->mark) ^ info->invert; } static int diff --git a/net/ipv4/netfilter/ipt_conntrack.c b/net/ipv4/netfilter/ipt_conntrack.c index c1d22801b7c..c8d18705469 100644 --- a/net/ipv4/netfilter/ipt_conntrack.c +++ b/net/ipv4/netfilter/ipt_conntrack.c @@ -10,7 +10,14 @@ #include <linux/module.h> #include <linux/skbuff.h> + +#if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE) #include <linux/netfilter_ipv4/ip_conntrack.h> +#include <linux/netfilter_ipv4/ip_conntrack_tuple.h> +#else +#include <net/netfilter/nf_conntrack.h> +#endif + #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv4/ipt_conntrack.h> @@ -18,6 +25,8 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); MODULE_DESCRIPTION("iptables connection tracking match module"); +#if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE) + static int match(const struct sk_buff *skb, const struct net_device *in, @@ -102,6 +111,93 @@ match(const struct sk_buff *skb, return 1; } +#else /* CONFIG_IP_NF_CONNTRACK */ +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_conntrack_info *sinfo = matchinfo; + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + unsigned int statebit; + + ct = nf_ct_get((struct sk_buff *)skb, &ctinfo); + +#define FWINV(bool,invflg) ((bool) ^ !!(sinfo->invflags & invflg)) + + if (ct == &nf_conntrack_untracked) + statebit = IPT_CONNTRACK_STATE_UNTRACKED; + else if (ct) + statebit = IPT_CONNTRACK_STATE_BIT(ctinfo); + else + statebit = IPT_CONNTRACK_STATE_INVALID; + + if(sinfo->flags & IPT_CONNTRACK_STATE) { + if (ct) { + if(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip != + ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip) + statebit |= IPT_CONNTRACK_STATE_SNAT; + + if(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.ip != + ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip) + statebit |= IPT_CONNTRACK_STATE_DNAT; + } + + if (FWINV((statebit & sinfo->statemask) == 0, IPT_CONNTRACK_STATE)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_PROTO) { + if (!ct || FWINV(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.protonum, IPT_CONNTRACK_PROTO)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_ORIGSRC) { + if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip&sinfo->sipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].src.ip, IPT_CONNTRACK_ORIGSRC)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_ORIGDST) { + if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.ip&sinfo->dipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.ip, IPT_CONNTRACK_ORIGDST)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_REPLSRC) { + if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip&sinfo->sipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].src.ip, IPT_CONNTRACK_REPLSRC)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_REPLDST) { + if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip&sinfo->dipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].dst.ip, IPT_CONNTRACK_REPLDST)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_STATUS) { + if (!ct || FWINV((ct->status & sinfo->statusmask) == 0, IPT_CONNTRACK_STATUS)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_EXPIRES) { + unsigned long expires; + + if(!ct) + return 0; + + expires = timer_pending(&ct->timeout) ? (ct->timeout.expires - jiffies)/HZ : 0; + + if (FWINV(!(expires >= sinfo->expires_min && expires <= sinfo->expires_max), IPT_CONNTRACK_EXPIRES)) + return 0; + } + + return 1; +} + +#endif /* CONFIG_NF_IP_CONNTRACK */ + static int check(const char *tablename, const struct ipt_ip *ip, void *matchinfo, diff --git a/net/ipv4/netfilter/ipt_helper.c b/net/ipv4/netfilter/ipt_helper.c index 3e7dd014de4..bf14e1c7798 100644 --- a/net/ipv4/netfilter/ipt_helper.c +++ b/net/ipv4/netfilter/ipt_helper.c @@ -13,9 +13,15 @@ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/netfilter.h> +#if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE) #include <linux/netfilter_ipv4/ip_conntrack.h> #include <linux/netfilter_ipv4/ip_conntrack_core.h> #include <linux/netfilter_ipv4/ip_conntrack_helper.h> +#else +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_helper.h> +#endif #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv4/ipt_helper.h> @@ -29,6 +35,7 @@ MODULE_DESCRIPTION("iptables helper match module"); #define DEBUGP(format, args...) #endif +#if defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE) static int match(const struct sk_buff *skb, const struct net_device *in, @@ -73,6 +80,53 @@ out_unlock: return ret; } +#else /* CONFIG_IP_NF_CONNTRACK */ + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_helper_info *info = matchinfo; + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + int ret = info->invert; + + ct = nf_ct_get((struct sk_buff *)skb, &ctinfo); + if (!ct) { + DEBUGP("ipt_helper: Eek! invalid conntrack?\n"); + return ret; + } + + if (!ct->master) { + DEBUGP("ipt_helper: conntrack %p has no master\n", ct); + return ret; + } + + read_lock_bh(&nf_conntrack_lock); + if (!ct->master->helper) { + DEBUGP("ipt_helper: master ct %p has no helper\n", + exp->expectant); + goto out_unlock; + } + + DEBUGP("master's name = %s , info->name = %s\n", + ct->master->helper->name, info->name); + + if (info->name[0] == '\0') + ret ^= 1; + else + ret ^= !strncmp(ct->master->helper->name, info->name, + strlen(ct->master->helper->name)); +out_unlock: + read_unlock_bh(&nf_conntrack_lock); + return ret; +} +#endif + static int check(const char *tablename, const struct ipt_ip *ip, void *matchinfo, diff --git a/net/ipv4/netfilter/ipt_state.c b/net/ipv4/netfilter/ipt_state.c index b1511b97ea5..4d7f16b70ce 100644 --- a/net/ipv4/netfilter/ipt_state.c +++ b/net/ipv4/netfilter/ipt_state.c @@ -10,7 +10,7 @@ #include <linux/module.h> #include <linux/skbuff.h> -#include <linux/netfilter_ipv4/ip_conntrack.h> +#include <net/netfilter/nf_conntrack_compat.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv4/ipt_state.h> @@ -30,9 +30,9 @@ match(const struct sk_buff *skb, enum ip_conntrack_info ctinfo; unsigned int statebit; - if (skb->nfct == &ip_conntrack_untracked.ct_general) + if (nf_ct_is_untracked(skb)) statebit = IPT_STATE_UNTRACKED; - else if (!ip_conntrack_get(skb, &ctinfo)) + else if (!nf_ct_get_ctinfo(skb, &ctinfo)) statebit = IPT_STATE_INVALID; else statebit = IPT_STATE_BIT(ctinfo); diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c new file mode 100644 index 00000000000..8202c1c0afa --- /dev/null +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -0,0 +1,571 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> + * - move L3 protocol dependent part to this file. + * 23 Mar 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> + * - add get_features() to support various size of conntrack + * structures. + * + * Derived from net/ipv4/netfilter/ip_conntrack_standalone.c + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/ip.h> +#include <linux/netfilter.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/icmp.h> +#include <linux/sysctl.h> +#include <net/ip.h> + +#include <linux/netfilter_ipv4.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_protocol.h> +#include <net/netfilter/nf_conntrack_l3proto.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/ipv4/nf_conntrack_ipv4.h> + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +DECLARE_PER_CPU(struct nf_conntrack_stat, nf_conntrack_stat); + +static int ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, + struct nf_conntrack_tuple *tuple) +{ + u_int32_t _addrs[2], *ap; + ap = skb_header_pointer(skb, nhoff + offsetof(struct iphdr, saddr), + sizeof(u_int32_t) * 2, _addrs); + if (ap == NULL) + return 0; + + tuple->src.u3.ip = ap[0]; + tuple->dst.u3.ip = ap[1]; + + return 1; +} + +static int ipv4_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + tuple->src.u3.ip = orig->dst.u3.ip; + tuple->dst.u3.ip = orig->src.u3.ip; + + return 1; +} + +static int ipv4_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return seq_printf(s, "src=%u.%u.%u.%u dst=%u.%u.%u.%u ", + NIPQUAD(tuple->src.u3.ip), + NIPQUAD(tuple->dst.u3.ip)); +} + +static int ipv4_print_conntrack(struct seq_file *s, + const struct nf_conn *conntrack) +{ + return 0; +} + +/* Returns new sk_buff, or NULL */ +static struct sk_buff * +nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) +{ + skb_orphan(skb); + + local_bh_disable(); + skb = ip_defrag(skb, user); + local_bh_enable(); + + if (skb) + ip_send_check(skb->nh.iph); + + return skb; +} + +static int +ipv4_prepare(struct sk_buff **pskb, unsigned int hooknum, unsigned int *dataoff, + u_int8_t *protonum) +{ + /* Never happen */ + if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) { + if (net_ratelimit()) { + printk(KERN_ERR "ipv4_prepare: Frag of proto %u (hook=%u)\n", + (*pskb)->nh.iph->protocol, hooknum); + } + return -NF_DROP; + } + + *dataoff = (*pskb)->nh.raw - (*pskb)->data + (*pskb)->nh.iph->ihl*4; + *protonum = (*pskb)->nh.iph->protocol; + + return NF_ACCEPT; +} + +int nat_module_is_loaded = 0; +static u_int32_t ipv4_get_features(const struct nf_conntrack_tuple *tuple) +{ + if (nat_module_is_loaded) + return NF_CT_F_NAT; + + return NF_CT_F_BASIC; +} + +static unsigned int ipv4_confirm(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + /* We've seen it coming out the other side: confirm it */ + return nf_conntrack_confirm(pskb); +} + +static unsigned int ipv4_conntrack_help(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + + /* This is where we call the helper: as the packet goes out. */ + ct = nf_ct_get(*pskb, &ctinfo); + if (ct && ct->helper) { + unsigned int ret; + ret = ct->helper->help(pskb, + (*pskb)->nh.raw - (*pskb)->data + + (*pskb)->nh.iph->ihl*4, + ct, ctinfo); + if (ret != NF_ACCEPT) + return ret; + } + return NF_ACCEPT; +} + +static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ +#if !defined(CONFIG_IP_NF_NAT) && !defined(CONFIG_IP_NF_NAT_MODULE) + /* Previously seen (loopback)? Ignore. Do this before + fragment check. */ + if ((*pskb)->nfct) + return NF_ACCEPT; +#endif + + /* Gather fragments. */ + if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) { + *pskb = nf_ct_ipv4_gather_frags(*pskb, + hooknum == NF_IP_PRE_ROUTING ? + IP_DEFRAG_CONNTRACK_IN : + IP_DEFRAG_CONNTRACK_OUT); + if (!*pskb) + return NF_STOLEN; + } + return NF_ACCEPT; +} + +static unsigned int ipv4_refrag(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct rtable *rt = (struct rtable *)(*pskb)->dst; + + /* We've seen it coming out the other side: confirm */ + if (ipv4_confirm(hooknum, pskb, in, out, okfn) != NF_ACCEPT) + return NF_DROP; + + /* Local packets are never produced too large for their + interface. We degfragment them at LOCAL_OUT, however, + so we have to refragment them here. */ + if ((*pskb)->len > dst_mtu(&rt->u.dst) && + !skb_shinfo(*pskb)->tso_size) { + /* No hook can be after us, so this should be OK. */ + ip_fragment(*pskb, okfn); + return NF_STOLEN; + } + return NF_ACCEPT; +} + +static unsigned int ipv4_conntrack_in(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return nf_conntrack_in(PF_INET, hooknum, pskb); +} + +static unsigned int ipv4_conntrack_local(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + /* root is playing with raw sockets. */ + if ((*pskb)->len < sizeof(struct iphdr) + || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) { + if (net_ratelimit()) + printk("ipt_hook: happy cracking.\n"); + return NF_ACCEPT; + } + return nf_conntrack_in(PF_INET, hooknum, pskb); +} + +/* Connection tracking may drop packets, but never alters them, so + make it the first hook. */ +static struct nf_hook_ops ipv4_conntrack_defrag_ops = { + .hook = ipv4_conntrack_defrag, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_PRE_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_DEFRAG, +}; + +static struct nf_hook_ops ipv4_conntrack_in_ops = { + .hook = ipv4_conntrack_in, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_PRE_ROUTING, + .priority = NF_IP_PRI_CONNTRACK, +}; + +static struct nf_hook_ops ipv4_conntrack_defrag_local_out_ops = { + .hook = ipv4_conntrack_defrag, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_OUT, + .priority = NF_IP_PRI_CONNTRACK_DEFRAG, +}; + +static struct nf_hook_ops ipv4_conntrack_local_out_ops = { + .hook = ipv4_conntrack_local, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_OUT, + .priority = NF_IP_PRI_CONNTRACK, +}; + +/* helpers */ +static struct nf_hook_ops ipv4_conntrack_helper_out_ops = { + .hook = ipv4_conntrack_help, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_POST_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_HELPER, +}; + +static struct nf_hook_ops ipv4_conntrack_helper_in_ops = { + .hook = ipv4_conntrack_help, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_IN, + .priority = NF_IP_PRI_CONNTRACK_HELPER, +}; + + +/* Refragmenter; last chance. */ +static struct nf_hook_ops ipv4_conntrack_out_ops = { + .hook = ipv4_refrag, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_POST_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM, +}; + +static struct nf_hook_ops ipv4_conntrack_local_in_ops = { + .hook = ipv4_confirm, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_IN, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM, +}; + +#ifdef CONFIG_SYSCTL +/* From nf_conntrack_proto_icmp.c */ +extern unsigned long nf_ct_icmp_timeout; +static struct ctl_table_header *nf_ct_ipv4_sysctl_header; + +static ctl_table nf_ct_sysctl_table[] = { + { + .ctl_name = NET_NF_CONNTRACK_ICMP_TIMEOUT, + .procname = "nf_conntrack_icmp_timeout", + .data = &nf_ct_icmp_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { .ctl_name = 0 } +}; + +static ctl_table nf_ct_netfilter_table[] = { + { + .ctl_name = NET_NETFILTER, + .procname = "netfilter", + .mode = 0555, + .child = nf_ct_sysctl_table, + }, + { .ctl_name = 0 } +}; + +static ctl_table nf_ct_net_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = nf_ct_netfilter_table, + }, + { .ctl_name = 0 } +}; +#endif + +/* Fast function for those who don't want to parse /proc (and I don't + blame them). */ +/* Reversing the socket's dst/src point of view gives us the reply + mapping. */ +static int +getorigdst(struct sock *sk, int optval, void __user *user, int *len) +{ + struct inet_sock *inet = inet_sk(sk); + struct nf_conntrack_tuple_hash *h; + struct nf_conntrack_tuple tuple; + + NF_CT_TUPLE_U_BLANK(&tuple); + tuple.src.u3.ip = inet->rcv_saddr; + tuple.src.u.tcp.port = inet->sport; + tuple.dst.u3.ip = inet->daddr; + tuple.dst.u.tcp.port = inet->dport; + tuple.src.l3num = PF_INET; + tuple.dst.protonum = IPPROTO_TCP; + + /* We only do TCP at the moment: is there a better way? */ + if (strcmp(sk->sk_prot->name, "TCP")) { + DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n"); + return -ENOPROTOOPT; + } + + if ((unsigned int) *len < sizeof(struct sockaddr_in)) { + DEBUGP("SO_ORIGINAL_DST: len %u not %u\n", + *len, sizeof(struct sockaddr_in)); + return -EINVAL; + } + + h = nf_conntrack_find_get(&tuple, NULL); + if (h) { + struct sockaddr_in sin; + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + + sin.sin_family = AF_INET; + sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.u.tcp.port; + sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.u3.ip; + + DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n", + NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port)); + nf_ct_put(ct); + if (copy_to_user(user, &sin, sizeof(sin)) != 0) + return -EFAULT; + else + return 0; + } + DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n", + NIPQUAD(tuple.src.u3.ip), ntohs(tuple.src.u.tcp.port), + NIPQUAD(tuple.dst.u3.ip), ntohs(tuple.dst.u.tcp.port)); + return -ENOENT; +} + +static struct nf_sockopt_ops so_getorigdst = { + .pf = PF_INET, + .get_optmin = SO_ORIGINAL_DST, + .get_optmax = SO_ORIGINAL_DST+1, + .get = &getorigdst, +}; + +struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 = { + .l3proto = PF_INET, + .name = "ipv4", + .pkt_to_tuple = ipv4_pkt_to_tuple, + .invert_tuple = ipv4_invert_tuple, + .print_tuple = ipv4_print_tuple, + .print_conntrack = ipv4_print_conntrack, + .prepare = ipv4_prepare, + .get_features = ipv4_get_features, + .me = THIS_MODULE, +}; + +extern struct nf_conntrack_protocol nf_conntrack_protocol_tcp4; +extern struct nf_conntrack_protocol nf_conntrack_protocol_udp4; +extern struct nf_conntrack_protocol nf_conntrack_protocol_icmp; +static int init_or_cleanup(int init) +{ + int ret = 0; + + if (!init) goto cleanup; + + ret = nf_register_sockopt(&so_getorigdst); + if (ret < 0) { + printk(KERN_ERR "Unable to register netfilter socket option\n"); + goto cleanup_nothing; + } + + ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_tcp4); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register tcp.\n"); + goto cleanup_sockopt; + } + + ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_udp4); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register udp.\n"); + goto cleanup_tcp; + } + + ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_icmp); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register icmp.\n"); + goto cleanup_udp; + } + + ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv4); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register ipv4\n"); + goto cleanup_icmp; + } + + ret = nf_register_hook(&ipv4_conntrack_defrag_ops); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register pre-routing defrag hook.\n"); + goto cleanup_ipv4; + } + ret = nf_register_hook(&ipv4_conntrack_defrag_local_out_ops); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register local_out defrag hook.\n"); + goto cleanup_defragops; + } + + ret = nf_register_hook(&ipv4_conntrack_in_ops); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register pre-routing hook.\n"); + goto cleanup_defraglocalops; + } + + ret = nf_register_hook(&ipv4_conntrack_local_out_ops); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register local out hook.\n"); + goto cleanup_inops; + } + + ret = nf_register_hook(&ipv4_conntrack_helper_in_ops); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register local helper hook.\n"); + goto cleanup_inandlocalops; + } + + ret = nf_register_hook(&ipv4_conntrack_helper_out_ops); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register postrouting helper hook.\n"); + goto cleanup_helperinops; + } + + ret = nf_register_hook(&ipv4_conntrack_out_ops); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register post-routing hook.\n"); + goto cleanup_helperoutops; + } + + ret = nf_register_hook(&ipv4_conntrack_local_in_ops); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register local in hook.\n"); + goto cleanup_inoutandlocalops; + } + +#ifdef CONFIG_SYSCTL + nf_ct_ipv4_sysctl_header = register_sysctl_table(nf_ct_net_table, 0); + if (nf_ct_ipv4_sysctl_header == NULL) { + printk("nf_conntrack: can't register to sysctl.\n"); + ret = -ENOMEM; + goto cleanup_localinops; + } +#endif + + /* For use by REJECT target */ + ip_ct_attach = __nf_conntrack_attach; + + return ret; + + cleanup: + synchronize_net(); + ip_ct_attach = NULL; +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(nf_ct_ipv4_sysctl_header); + cleanup_localinops: +#endif + nf_unregister_hook(&ipv4_conntrack_local_in_ops); + cleanup_inoutandlocalops: + nf_unregister_hook(&ipv4_conntrack_out_ops); + cleanup_helperoutops: + nf_unregister_hook(&ipv4_conntrack_helper_out_ops); + cleanup_helperinops: + nf_unregister_hook(&ipv4_conntrack_helper_in_ops); + cleanup_inandlocalops: + nf_unregister_hook(&ipv4_conntrack_local_out_ops); + cleanup_inops: + nf_unregister_hook(&ipv4_conntrack_in_ops); + cleanup_defraglocalops: + nf_unregister_hook(&ipv4_conntrack_defrag_local_out_ops); + cleanup_defragops: + nf_unregister_hook(&ipv4_conntrack_defrag_ops); + cleanup_ipv4: + nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4); + cleanup_icmp: + nf_conntrack_protocol_unregister(&nf_conntrack_protocol_icmp); + cleanup_udp: + nf_conntrack_protocol_unregister(&nf_conntrack_protocol_udp4); + cleanup_tcp: + nf_conntrack_protocol_unregister(&nf_conntrack_protocol_tcp4); + cleanup_sockopt: + nf_unregister_sockopt(&so_getorigdst); + cleanup_nothing: + return ret; +} + +MODULE_LICENSE("GPL"); + +static int __init init(void) +{ + need_nf_conntrack(); + return init_or_cleanup(1); +} + +static void __exit fini(void) +{ + init_or_cleanup(0); +} + +module_init(init); +module_exit(fini); + +void need_ip_conntrack(void) +{ +} + +EXPORT_SYMBOL(need_ip_conntrack); +EXPORT_SYMBOL(nf_ct_ipv4_gather_frags); diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c new file mode 100644 index 00000000000..7ddb5c08f7b --- /dev/null +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -0,0 +1,301 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> + * - enable working with Layer 3 protocol independent connection tracking. + * + * Derived from net/ipv4/netfilter/ip_conntrack_proto_icmp.c + */ + +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/netfilter.h> +#include <linux/in.h> +#include <linux/icmp.h> +#include <linux/seq_file.h> +#include <net/ip.h> +#include <net/checksum.h> +#include <linux/netfilter_ipv4.h> +#include <net/netfilter/nf_conntrack_tuple.h> +#include <net/netfilter/nf_conntrack_protocol.h> +#include <net/netfilter/nf_conntrack_core.h> + +unsigned long nf_ct_icmp_timeout = 30*HZ; + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +static int icmp_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +{ + struct icmphdr _hdr, *hp; + + hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (hp == NULL) + return 0; + + tuple->dst.u.icmp.type = hp->type; + tuple->src.u.icmp.id = hp->un.echo.id; + tuple->dst.u.icmp.code = hp->code; + + return 1; +} + +static int icmp_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + /* Add 1; spaces filled with 0. */ + static u_int8_t invmap[] + = { [ICMP_ECHO] = ICMP_ECHOREPLY + 1, + [ICMP_ECHOREPLY] = ICMP_ECHO + 1, + [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1, + [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1, + [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1, + [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1, + [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1, + [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1}; + + if (orig->dst.u.icmp.type >= sizeof(invmap) + || !invmap[orig->dst.u.icmp.type]) + return 0; + + tuple->src.u.icmp.id = orig->src.u.icmp.id; + tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1; + tuple->dst.u.icmp.code = orig->dst.u.icmp.code; + return 1; +} + +/* Print out the per-protocol part of the tuple. */ +static int icmp_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return seq_printf(s, "type=%u code=%u id=%u ", + tuple->dst.u.icmp.type, + tuple->dst.u.icmp.code, + ntohs(tuple->src.u.icmp.id)); +} + +/* Print out the private part of the conntrack. */ +static int icmp_print_conntrack(struct seq_file *s, + const struct nf_conn *conntrack) +{ + return 0; +} + +/* Returns verdict for packet, or -1 for invalid. */ +static int icmp_packet(struct nf_conn *ct, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + int pf, + unsigned int hooknum) +{ + /* Try to delete connection immediately after all replies: + won't actually vanish as we still have skb, and del_timer + means this will only run once even if count hits zero twice + (theoretically possible with SMP) */ + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) { + if (atomic_dec_and_test(&ct->proto.icmp.count) + && del_timer(&ct->timeout)) + ct->timeout.function((unsigned long)ct); + } else { + atomic_inc(&ct->proto.icmp.count); + nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); + nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmp_timeout); + } + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static int icmp_new(struct nf_conn *conntrack, + const struct sk_buff *skb, unsigned int dataoff) +{ + static u_int8_t valid_new[] + = { [ICMP_ECHO] = 1, + [ICMP_TIMESTAMP] = 1, + [ICMP_INFO_REQUEST] = 1, + [ICMP_ADDRESS] = 1 }; + + if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) + || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) { + /* Can't create a new ICMP `conn' with this. */ + DEBUGP("icmp: can't create new conn with type %u\n", + conntrack->tuplehash[0].tuple.dst.u.icmp.type); + NF_CT_DUMP_TUPLE(&conntrack->tuplehash[0].tuple); + return 0; + } + atomic_set(&conntrack->proto.icmp.count, 0); + return 1; +} + +extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4; +/* Returns conntrack if it dealt with ICMP, and filled in skb fields */ +static int +icmp_error_message(struct sk_buff *skb, + enum ip_conntrack_info *ctinfo, + unsigned int hooknum) +{ + struct nf_conntrack_tuple innertuple, origtuple; + struct { + struct icmphdr icmp; + struct iphdr ip; + } _in, *inside; + struct nf_conntrack_protocol *innerproto; + struct nf_conntrack_tuple_hash *h; + int dataoff; + + NF_CT_ASSERT(skb->nfct == NULL); + + /* Not enough header? */ + inside = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_in), &_in); + if (inside == NULL) + return -NF_ACCEPT; + + /* Ignore ICMP's containing fragments (shouldn't happen) */ + if (inside->ip.frag_off & htons(IP_OFFSET)) { + DEBUGP("icmp_error_message: fragment of proto %u\n", + inside->ip.protocol); + return -NF_ACCEPT; + } + + innerproto = nf_ct_find_proto(PF_INET, inside->ip.protocol); + dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp); + /* Are they talking about one of our connections? */ + if (!nf_ct_get_tuple(skb, dataoff, dataoff + inside->ip.ihl*4, PF_INET, + inside->ip.protocol, &origtuple, + &nf_conntrack_l3proto_ipv4, innerproto)) { + DEBUGP("icmp_error_message: ! get_tuple p=%u", + inside->ip.protocol); + return -NF_ACCEPT; + } + + /* Ordinarily, we'd expect the inverted tupleproto, but it's + been preserved inside the ICMP. */ + if (!nf_ct_invert_tuple(&innertuple, &origtuple, + &nf_conntrack_l3proto_ipv4, innerproto)) { + DEBUGP("icmp_error_message: no match\n"); + return -NF_ACCEPT; + } + + *ctinfo = IP_CT_RELATED; + + h = nf_conntrack_find_get(&innertuple, NULL); + if (!h) { + /* Locally generated ICMPs will match inverted if they + haven't been SNAT'ed yet */ + /* FIXME: NAT code has to handle half-done double NAT --RR */ + if (hooknum == NF_IP_LOCAL_OUT) + h = nf_conntrack_find_get(&origtuple, NULL); + + if (!h) { + DEBUGP("icmp_error_message: no match\n"); + return -NF_ACCEPT; + } + + /* Reverse direction from that found */ + if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) + *ctinfo += IP_CT_IS_REPLY; + } else { + if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) + *ctinfo += IP_CT_IS_REPLY; + } + + /* Update skb to refer to this connection */ + skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general; + skb->nfctinfo = *ctinfo; + return -NF_ACCEPT; +} + +/* Small and modified version of icmp_rcv */ +static int +icmp_error(struct sk_buff *skb, unsigned int dataoff, + enum ip_conntrack_info *ctinfo, int pf, unsigned int hooknum) +{ + struct icmphdr _ih, *icmph; + + /* Not enough header? */ + icmph = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_ih), &_ih); + if (icmph == NULL) { + if (LOG_INVALID(IPPROTO_ICMP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, + "nf_ct_icmp: short packet "); + return -NF_ACCEPT; + } + + /* See ip_conntrack_proto_tcp.c */ + if (hooknum != NF_IP_PRE_ROUTING) + goto checksum_skipped; + + switch (skb->ip_summed) { + case CHECKSUM_HW: + if (!(u16)csum_fold(skb->csum)) + break; + if (LOG_INVALID(IPPROTO_ICMP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, + "nf_ct_icmp: bad HW ICMP checksum "); + return -NF_ACCEPT; + case CHECKSUM_NONE: + if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) { + if (LOG_INVALID(IPPROTO_ICMP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + NULL, + "nf_ct_icmp: bad ICMP checksum "); + return -NF_ACCEPT; + } + default: + break; + } + +checksum_skipped: + /* + * 18 is the highest 'known' ICMP type. Anything else is a mystery + * + * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently + * discarded. + */ + if (icmph->type > NR_ICMP_TYPES) { + if (LOG_INVALID(IPPROTO_ICMP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, + "nf_ct_icmp: invalid ICMP type "); + return -NF_ACCEPT; + } + + /* Need to track icmp error message? */ + if (icmph->type != ICMP_DEST_UNREACH + && icmph->type != ICMP_SOURCE_QUENCH + && icmph->type != ICMP_TIME_EXCEEDED + && icmph->type != ICMP_PARAMETERPROB + && icmph->type != ICMP_REDIRECT) + return NF_ACCEPT; + + return icmp_error_message(skb, ctinfo, hooknum); +} + +struct nf_conntrack_protocol nf_conntrack_protocol_icmp = +{ + .list = { NULL, NULL }, + .l3proto = PF_INET, + .proto = IPPROTO_ICMP, + .name = "icmp", + .pkt_to_tuple = icmp_pkt_to_tuple, + .invert_tuple = icmp_invert_tuple, + .print_tuple = icmp_print_tuple, + .print_conntrack = icmp_print_conntrack, + .packet = icmp_packet, + .new = icmp_new, + .error = icmp_error, + .destroy = NULL, + .me = NULL +}; + +EXPORT_SYMBOL(nf_conntrack_protocol_icmp); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 65268562351..01444a02b48 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -645,6 +645,14 @@ ctl_table ipv4_table[] = { .proc_handler = &proc_tcp_congestion_control, .strategy = &sysctl_tcp_congestion_control, }, + { + .ctl_name = NET_TCP_ABC, + .procname = "tcp_abc", + .data = &sysctl_tcp_abc, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = 0 } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f3f0013a958..9ac7a4f46bd 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1640,7 +1640,7 @@ int tcp_disconnect(struct sock *sk, int flags) } else if (tcp_need_reset(old_state) || (tp->snd_nxt != tp->write_seq && (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { - /* The last check adjusts for discrepance of Linux wrt. RFC + /* The last check adjusts for discrepancy of Linux wrt. RFC * states */ tcp_send_active_reset(sk, gfp_any()); @@ -1669,6 +1669,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->packets_out = 0; tp->snd_ssthresh = 0x7fffffff; tp->snd_cwnd_cnt = 0; + tp->bytes_acked = 0; tcp_set_ca_state(sk, TCP_CA_Open); tcp_clear_retrans(tp); inet_csk_delack_init(sk); @@ -2112,7 +2113,6 @@ void __init tcp_init(void) sysctl_tcp_max_orphans >>= (3 - order); sysctl_max_syn_backlog = 128; } - tcp_hashinfo.port_rover = sysctl_local_port_range[0] - 1; sysctl_tcp_mem[0] = 768 << order; sysctl_tcp_mem[1] = 1024 << order; diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index ae35e060904..1d0cd86621b 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -217,17 +217,15 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, bictcp_low_utilization(sk, data_acked); - if (in_flight < tp->snd_cwnd) + if (!tcp_is_cwnd_limited(sk, in_flight)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) { - /* In "safe" area, increase. */ - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } else { + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { bictcp_update(ca, tp->snd_cwnd); - /* In dangerous area, increase slowly. + /* In dangerous area, increase slowly. * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */ if (tp->snd_cwnd_cnt >= ca->cnt) { diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index bbf2d6624e8..c7cc62c8dc1 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -186,24 +186,32 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, { struct tcp_sock *tp = tcp_sk(sk); - if (in_flight < tp->snd_cwnd) + if (!tcp_is_cwnd_limited(sk, in_flight)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) { - /* In "safe" area, increase. */ - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } else { - /* In dangerous area, increase slowly. - * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd - */ - if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - tp->snd_cwnd_cnt = 0; - } else - tp->snd_cwnd_cnt++; - } + /* In "safe" area, increase. */ + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + + /* In dangerous area, increase slowly. */ + else if (sysctl_tcp_abc) { + /* RFC3465: Apppriate Byte Count + * increase once for each full cwnd acked + */ + if (tp->bytes_acked >= tp->snd_cwnd*tp->mss_cache) { + tp->bytes_acked -= tp->snd_cwnd*tp->mss_cache; + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } + } else { + /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */ + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } else + tp->snd_cwnd_cnt++; + } } EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 6acc04bde08..63cf7e54084 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -111,18 +111,17 @@ static void hstcp_init(struct sock *sk) } static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt, - u32 in_flight, int good) + u32 in_flight, int data_acked) { struct tcp_sock *tp = tcp_sk(sk); struct hstcp *ca = inet_csk_ca(sk); - if (in_flight < tp->snd_cwnd) + if (!tcp_is_cwnd_limited(sk, in_flight)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } else { + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { /* Update AIMD parameters */ if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) { while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd && diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index e47b37984e9..3284cfb993e 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -207,14 +207,13 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, struct tcp_sock *tp = tcp_sk(sk); struct htcp *ca = inet_csk_ca(sk); - if (in_flight < tp->snd_cwnd) + if (!tcp_is_cwnd_limited(sk, in_flight)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) { - /* In "safe" area, increase. */ - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } else { + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { + measure_rtt(sk); /* keep track of number of round-trip times since last backoff event */ @@ -224,7 +223,7 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, htcp_alpha_update(ca); } - /* In dangerous area, increase slowly. + /* In dangerous area, increase slowly. * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd */ if ((tp->snd_cwnd_cnt++ * ca->alpha)>>7 >= tp->snd_cwnd) { diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index 77add63623d..40dbb387751 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c @@ -100,12 +100,12 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt, ca->minrtt = tp->srtt; } + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + if (!ca->hybla_en) return tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag); - if (in_flight < tp->snd_cwnd) - return; - if (ca->rho == 0) hybla_recalc_param(sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3e98b57578d..bf2e23086bc 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -42,7 +42,7 @@ * Andi Kleen : Moved open_request checking here * and process RSTs for open_requests. * Andi Kleen : Better prune_queue, and other fixes. - * Andrey Savochkin: Fix RTT measurements in the presnce of + * Andrey Savochkin: Fix RTT measurements in the presence of * timestamps. * Andrey Savochkin: Check sequence numbers correctly when * removing SACKs due to in sequence incoming @@ -89,6 +89,7 @@ int sysctl_tcp_frto; int sysctl_tcp_nometrics_save; int sysctl_tcp_moderate_rcvbuf = 1; +int sysctl_tcp_abc = 1; #define FLAG_DATA 0x01 /* Incoming frame contained data. */ #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ @@ -223,7 +224,7 @@ static void tcp_fixup_sndbuf(struct sock *sk) * of receiver window. Check #2. * * The scheme does not work when sender sends good segments opening - * window and then starts to feed us spagetti. But it should work + * window and then starts to feed us spaghetti. But it should work * in common situations. Otherwise, we have to rely on queue collapsing. */ @@ -233,7 +234,7 @@ static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp, { /* Optimize this! */ int truesize = tcp_win_from_space(skb->truesize)/2; - int window = tcp_full_space(sk)/2; + int window = tcp_win_from_space(sysctl_tcp_rmem[2])/2; while (tp->rcv_ssthresh <= window) { if (truesize <= skb->len) @@ -277,7 +278,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk) int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff); /* Try to select rcvbuf so that 4 mss-sized segments - * will fit to window and correspoding skbs will fit to our rcvbuf. + * will fit to window and corresponding skbs will fit to our rcvbuf. * (was 3; 4 is minimum to allow fast retransmit to work.) */ while (tcp_win_from_space(rcvmem) < tp->advmss) @@ -286,7 +287,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk) sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]); } -/* 4. Try to fixup all. It is made iimediately after connection enters +/* 4. Try to fixup all. It is made immediately after connection enters * established state. */ static void tcp_init_buffer_space(struct sock *sk) @@ -326,37 +327,18 @@ static void tcp_init_buffer_space(struct sock *sk) static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) { struct inet_connection_sock *icsk = inet_csk(sk); - struct sk_buff *skb; - unsigned int app_win = tp->rcv_nxt - tp->copied_seq; - int ofo_win = 0; icsk->icsk_ack.quick = 0; - skb_queue_walk(&tp->out_of_order_queue, skb) { - ofo_win += skb->len; - } - - /* If overcommit is due to out of order segments, - * do not clamp window. Try to expand rcvbuf instead. - */ - if (ofo_win) { - if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && - !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && - !tcp_memory_pressure && - atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) - sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), - sysctl_tcp_rmem[2]); + if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && + !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && + !tcp_memory_pressure && + atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { + sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), + sysctl_tcp_rmem[2]); } - if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) { - app_win += ofo_win; - if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf) - app_win >>= 1; - if (app_win > icsk->icsk_ack.rcv_mss) - app_win -= icsk->icsk_ack.rcv_mss; - app_win = max(app_win, 2U*tp->advmss); - + if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss); - } } /* Receiver "autotuning" code. @@ -385,8 +367,8 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) * are stalled on filesystem I/O. * * Also, since we are only going for a minimum in the - * non-timestamp case, we do not smoothe things out - * else with timestamps disabled convergance takes too + * non-timestamp case, we do not smooth things out + * else with timestamps disabled convergence takes too * long. */ if (!win_dep) { @@ -395,7 +377,7 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) } else if (m < new_sample) new_sample = m << 3; } else { - /* No previous mesaure. */ + /* No previous measure. */ new_sample = m << 3; } @@ -524,7 +506,7 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_ if (icsk->icsk_ack.ato > icsk->icsk_rto) icsk->icsk_ack.ato = icsk->icsk_rto; } else if (m > icsk->icsk_rto) { - /* Too long gap. Apparently sender falled to + /* Too long gap. Apparently sender failed to * restart window, so that we send ACKs quickly. */ tcp_incr_quickack(sk); @@ -548,10 +530,9 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_ * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ -static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt) +static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) { struct tcp_sock *tp = tcp_sk(sk); - const struct inet_connection_sock *icsk = inet_csk(sk); long m = mrtt; /* RTT */ /* The following amusing code comes from Jacobson's @@ -565,7 +546,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt) * * Funny. This algorithm seems to be very broken. * These formulae increase RTO, when it should be decreased, increase - * too slowly, when it should be incresed fastly, decrease too fastly + * too slowly, when it should be increased quickly, decrease too quickly * etc. I guess in BSD RTO takes ONE value, so that it is absolutely * does not matter how to _calculate_ it. Seems, it was trap * that VJ failed to avoid. 8) @@ -610,9 +591,6 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt) tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN); tp->rtt_seq = tp->snd_nxt; } - - if (icsk->icsk_ca_ops->rtt_sample) - icsk->icsk_ca_ops->rtt_sample(sk, *usrtt); } /* Calculate rto without backoff. This is the second half of Van Jacobson's @@ -629,14 +607,14 @@ static inline void tcp_set_rto(struct sock *sk) * at least by solaris and freebsd. "Erratic ACKs" has _nothing_ * to do with delayed acks, because at cwnd>2 true delack timeout * is invisible. Actually, Linux-2.4 also generates erratic - * ACKs in some curcumstances. + * ACKs in some circumstances. */ inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar; /* 2. Fixups made earlier cannot be right. * If we do not estimate RTO correctly without them, * all the algo is pure shit and should be replaced - * with correct one. It is exaclty, which we pretend to do. + * with correct one. It is exactly, which we pretend to do. */ } @@ -794,7 +772,7 @@ static void tcp_init_metrics(struct sock *sk) * to make it more realistic. * * A bit of theory. RTT is time passed after "normal" sized packet - * is sent until it is ACKed. In normal curcumstances sending small + * is sent until it is ACKed. In normal circumstances sending small * packets force peer to delay ACKs and calculation is correct too. * The algorithm is adaptive and, provided we follow specs, it * NEVER underestimate RTT. BUT! If peer tries to make some clever @@ -919,18 +897,32 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ int prior_fackets; u32 lost_retrans = 0; int flag = 0; + int dup_sack = 0; int i; if (!tp->sacked_out) tp->fackets_out = 0; prior_fackets = tp->fackets_out; - for (i=0; i<num_sacks; i++, sp++) { - struct sk_buff *skb; - __u32 start_seq = ntohl(sp->start_seq); - __u32 end_seq = ntohl(sp->end_seq); - int fack_count = 0; - int dup_sack = 0; + /* SACK fastpath: + * if the only SACK change is the increase of the end_seq of + * the first block then only apply that SACK block + * and use retrans queue hinting otherwise slowpath */ + flag = 1; + for (i = 0; i< num_sacks; i++) { + __u32 start_seq = ntohl(sp[i].start_seq); + __u32 end_seq = ntohl(sp[i].end_seq); + + if (i == 0){ + if (tp->recv_sack_cache[i].start_seq != start_seq) + flag = 0; + } else { + if ((tp->recv_sack_cache[i].start_seq != start_seq) || + (tp->recv_sack_cache[i].end_seq != end_seq)) + flag = 0; + } + tp->recv_sack_cache[i].start_seq = start_seq; + tp->recv_sack_cache[i].end_seq = end_seq; /* Check for D-SACK. */ if (i == 0) { @@ -962,15 +954,58 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (before(ack, prior_snd_una - tp->max_window)) return 0; } + } + + if (flag) + num_sacks = 1; + else { + int j; + tp->fastpath_skb_hint = NULL; + + /* order SACK blocks to allow in order walk of the retrans queue */ + for (i = num_sacks-1; i > 0; i--) { + for (j = 0; j < i; j++){ + if (after(ntohl(sp[j].start_seq), + ntohl(sp[j+1].start_seq))){ + sp[j].start_seq = htonl(tp->recv_sack_cache[j+1].start_seq); + sp[j].end_seq = htonl(tp->recv_sack_cache[j+1].end_seq); + sp[j+1].start_seq = htonl(tp->recv_sack_cache[j].start_seq); + sp[j+1].end_seq = htonl(tp->recv_sack_cache[j].end_seq); + } + + } + } + } + + /* clear flag as used for different purpose in following code */ + flag = 0; + + for (i=0; i<num_sacks; i++, sp++) { + struct sk_buff *skb; + __u32 start_seq = ntohl(sp->start_seq); + __u32 end_seq = ntohl(sp->end_seq); + int fack_count; + + /* Use SACK fastpath hint if valid */ + if (tp->fastpath_skb_hint) { + skb = tp->fastpath_skb_hint; + fack_count = tp->fastpath_cnt_hint; + } else { + skb = sk->sk_write_queue.next; + fack_count = 0; + } /* Event "B" in the comment above. */ if (after(end_seq, tp->high_seq)) flag |= FLAG_DATA_LOST; - sk_stream_for_retrans_queue(skb, sk) { + sk_stream_for_retrans_queue_from(skb, sk) { int in_sack, pcount; u8 sacked; + tp->fastpath_skb_hint = skb; + tp->fastpath_cnt_hint = fack_count; + /* The retransmission queue is always in order, so * we can short-circuit the walk early. */ @@ -1045,6 +1080,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); tp->lost_out -= tcp_skb_pcount(skb); tp->retrans_out -= tcp_skb_pcount(skb); + + /* clear lost hint */ + tp->retransmit_skb_hint = NULL; } } else { /* New sack for not retransmitted frame, @@ -1057,6 +1095,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (sacked & TCPCB_LOST) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; tp->lost_out -= tcp_skb_pcount(skb); + + /* clear lost hint */ + tp->retransmit_skb_hint = NULL; } } @@ -1080,6 +1121,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); + tp->retransmit_skb_hint = NULL; } } } @@ -1107,6 +1149,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); + /* clear lost hint */ + tp->retransmit_skb_hint = NULL; + if (!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) { tp->lost_out += tcp_skb_pcount(skb); TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; @@ -1214,6 +1259,8 @@ static void tcp_enter_frto_loss(struct sock *sk) tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->frto_highmark; TCP_ECN_queue_cwr(tp); + + clear_all_retrans_hints(tp); } void tcp_clear_retrans(struct tcp_sock *tp) @@ -1251,6 +1298,7 @@ void tcp_enter_loss(struct sock *sk, int how) tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_time_stamp; + tp->bytes_acked = 0; tcp_clear_retrans(tp); /* Push undo marker, if it was plain RTO and nothing @@ -1279,6 +1327,8 @@ void tcp_enter_loss(struct sock *sk, int how) tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; TCP_ECN_queue_cwr(tp); + + clear_all_retrans_hints(tp); } static int tcp_check_sack_reneging(struct sock *sk) @@ -1503,17 +1553,37 @@ static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp, int packets, u32 high_seq) { struct sk_buff *skb; - int cnt = packets; + int cnt; - BUG_TRAP(cnt <= tp->packets_out); + BUG_TRAP(packets <= tp->packets_out); + if (tp->lost_skb_hint) { + skb = tp->lost_skb_hint; + cnt = tp->lost_cnt_hint; + } else { + skb = sk->sk_write_queue.next; + cnt = 0; + } - sk_stream_for_retrans_queue(skb, sk) { - cnt -= tcp_skb_pcount(skb); - if (cnt < 0 || after(TCP_SKB_CB(skb)->end_seq, high_seq)) + sk_stream_for_retrans_queue_from(skb, sk) { + /* TODO: do this better */ + /* this is not the most efficient way to do this... */ + tp->lost_skb_hint = skb; + tp->lost_cnt_hint = cnt; + cnt += tcp_skb_pcount(skb); + if (cnt > packets || after(TCP_SKB_CB(skb)->end_seq, high_seq)) break; if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); + + /* clear xmit_retransmit_queue hints + * if this is beyond hint */ + if(tp->retransmit_skb_hint != NULL && + before(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) { + + tp->retransmit_skb_hint = NULL; + } } } tcp_sync_left_out(tp); @@ -1540,13 +1610,28 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp) if (tcp_head_timedout(sk, tp)) { struct sk_buff *skb; - sk_stream_for_retrans_queue(skb, sk) { - if (tcp_skb_timedout(sk, skb) && - !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { + skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint + : sk->sk_write_queue.next; + + sk_stream_for_retrans_queue_from(skb, sk) { + if (!tcp_skb_timedout(sk, skb)) + break; + + if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); + + /* clear xmit_retrans hint */ + if (tp->retransmit_skb_hint && + before(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) + + tp->retransmit_skb_hint = NULL; } } + + tp->scoreboard_skb_hint = skb; + tcp_sync_left_out(tp); } } @@ -1626,6 +1711,10 @@ static void tcp_undo_cwr(struct sock *sk, const int undo) } tcp_moderate_cwnd(tp); tp->snd_cwnd_stamp = tcp_time_stamp; + + /* There is something screwy going on with the retrans hints after + an undo */ + clear_all_retrans_hints(tp); } static inline int tcp_may_undo(struct tcp_sock *tp) @@ -1709,6 +1798,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp) sk_stream_for_retrans_queue(skb, sk) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; } + + clear_all_retrans_hints(tp); + DBGUNDO(sk, tp, "partial loss"); tp->lost_out = 0; tp->left_out = tp->sacked_out; @@ -1908,6 +2000,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, TCP_ECN_queue_cwr(tp); } + tp->bytes_acked = 0; tp->snd_cwnd_cnt = 0; tcp_set_ca_state(sk, TCP_CA_Recovery); } @@ -1919,9 +2012,9 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, } /* Read draft-ietf-tcplw-high-performance before mucking - * with this code. (Superceeds RFC1323) + * with this code. (Supersedes RFC1323) */ -static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag) +static void tcp_ack_saw_tstamp(struct sock *sk, int flag) { /* RTTM Rule: A TSecr value received in a segment is used to * update the averaged RTT measurement only if the segment @@ -1932,7 +2025,7 @@ static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag) * 1998/04/10 Andrey V. Savochkin <saw@msu.ru> * * Changed: reset backoff as soon as we see the first valid sample. - * If we do not, we get strongly overstimated rto. With timestamps + * If we do not, we get strongly overestimated rto. With timestamps * samples are accepted even from very old segments: f.e., when rtt=1 * increases to 8, we retransmit 5 times and after 8 seconds delayed * answer arrives rto becomes 120 seconds! If at least one of segments @@ -1940,13 +2033,13 @@ static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag) */ struct tcp_sock *tp = tcp_sk(sk); const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; - tcp_rtt_estimator(sk, seq_rtt, usrtt); + tcp_rtt_estimator(sk, seq_rtt); tcp_set_rto(sk); inet_csk(sk)->icsk_backoff = 0; tcp_bound_rto(sk); } -static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag) +static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag) { /* We don't have a timestamp. Can only use * packets that are not retransmitted to determine @@ -1960,21 +2053,21 @@ static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag if (flag & FLAG_RETRANS_DATA_ACKED) return; - tcp_rtt_estimator(sk, seq_rtt, usrtt); + tcp_rtt_estimator(sk, seq_rtt); tcp_set_rto(sk); inet_csk(sk)->icsk_backoff = 0; tcp_bound_rto(sk); } static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, - const s32 seq_rtt, u32 *usrtt) + const s32 seq_rtt) { const struct tcp_sock *tp = tcp_sk(sk); /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) - tcp_ack_saw_tstamp(sk, usrtt, flag); + tcp_ack_saw_tstamp(sk, flag); else if (seq_rtt >= 0) - tcp_ack_no_tstamp(sk, seq_rtt, usrtt, flag); + tcp_ack_no_tstamp(sk, seq_rtt, flag); } static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, @@ -2054,20 +2147,27 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, return acked; } +static inline u32 tcp_usrtt(const struct sk_buff *skb) +{ + struct timeval tv, now; + + do_gettimeofday(&now); + skb_get_timestamp(skb, &tv); + return (now.tv_sec - tv.tv_sec) * 1000000 + (now.tv_usec - tv.tv_usec); +} /* Remove acknowledged frames from the retransmission queue. */ -static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt) +static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) { struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *skb; __u32 now = tcp_time_stamp; int acked = 0; __s32 seq_rtt = -1; - struct timeval usnow; u32 pkts_acked = 0; - - if (seq_usrtt) - do_gettimeofday(&usnow); + void (*rtt_sample)(struct sock *sk, u32 usrtt) + = icsk->icsk_ca_ops->rtt_sample; while ((skb = skb_peek(&sk->sk_write_queue)) && skb != sk->sk_send_head) { @@ -2107,16 +2207,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt tp->retrans_out -= tcp_skb_pcount(skb); acked |= FLAG_RETRANS_DATA_ACKED; seq_rtt = -1; - } else if (seq_rtt < 0) + } else if (seq_rtt < 0) { seq_rtt = now - scb->when; - if (seq_usrtt) { - struct timeval tv; - - skb_get_timestamp(skb, &tv); - *seq_usrtt = (usnow.tv_sec - tv.tv_sec) * 1000000 - + (usnow.tv_usec - tv.tv_usec); + if (rtt_sample) + (*rtt_sample)(sk, tcp_usrtt(skb)); } - if (sacked & TCPCB_SACKED_ACKED) tp->sacked_out -= tcp_skb_pcount(skb); if (sacked & TCPCB_LOST) @@ -2126,17 +2221,20 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt !before(scb->end_seq, tp->snd_up)) tp->urg_mode = 0; } - } else if (seq_rtt < 0) + } else if (seq_rtt < 0) { seq_rtt = now - scb->when; + if (rtt_sample) + (*rtt_sample)(sk, tcp_usrtt(skb)); + } tcp_dec_pcount_approx(&tp->fackets_out, skb); tcp_packets_out_dec(tp, skb); __skb_unlink(skb, &sk->sk_write_queue); sk_stream_free_skb(sk, skb); + clear_all_retrans_hints(tp); } if (acked&FLAG_ACKED) { - const struct inet_connection_sock *icsk = inet_csk(sk); - tcp_ack_update_rtt(sk, acked, seq_rtt, seq_usrtt); + tcp_ack_update_rtt(sk, acked, seq_rtt); tcp_ack_packets_out(sk, tp); if (icsk->icsk_ca_ops->pkts_acked) @@ -2284,7 +2382,7 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una) } /* F-RTO affects on two new ACKs following RTO. - * At latest on third ACK the TCP behavor is back to normal. + * At latest on third ACK the TCP behavior is back to normal. */ tp->frto_counter = (tp->frto_counter + 1) % 3; } @@ -2299,7 +2397,6 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) u32 ack = TCP_SKB_CB(skb)->ack_seq; u32 prior_in_flight; s32 seq_rtt; - s32 seq_usrtt = 0; int prior_packets; /* If the ack is newer than sent or older than previous acks @@ -2311,6 +2408,9 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) if (before(ack, prior_snd_una)) goto old_ack; + if (sysctl_tcp_abc && icsk->icsk_ca_state < TCP_CA_CWR) + tp->bytes_acked += ack - prior_snd_una; + if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) { /* Window is constant, pure forward advance. * No more checks are required. @@ -2352,14 +2452,13 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) prior_in_flight = tcp_packets_in_flight(tp); /* See if we can take anything off of the retransmit queue. */ - flag |= tcp_clean_rtx_queue(sk, &seq_rtt, - icsk->icsk_ca_ops->rtt_sample ? &seq_usrtt : NULL); + flag |= tcp_clean_rtx_queue(sk, &seq_rtt); if (tp->frto_counter) tcp_process_frto(sk, prior_snd_una); if (tcp_ack_is_dubious(sk, flag)) { - /* Advanve CWND, if state allows this. */ + /* Advance CWND, if state allows this. */ if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag)) tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0); tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); @@ -3148,7 +3247,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, { struct sk_buff *skb; - /* First, check that queue is collapsable and find + /* First, check that queue is collapsible and find * the point where collapsing can be useful. */ for (skb = head; skb != tail; ) { /* No new bits? It is possible on ofo queue. */ @@ -3456,7 +3555,7 @@ static __inline__ void tcp_ack_snd_check(struct sock *sk) /* * This routine is only called when we have urgent data - * signalled. Its the 'slow' part of tcp_urg. It could be + * signaled. Its the 'slow' part of tcp_urg. It could be * moved inline now as tcp_urg is only called from one * place. We handle URGent data wrong. We have to - as * BSD still doesn't use the correction from RFC961. @@ -3501,7 +3600,7 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th) * urgent. To do this requires some care. We cannot just ignore * tp->copied_seq since we would read the last urgent byte again * as data, nor can we alter copied_seq until this data arrives - * or we break the sematics of SIOCATMARK (and thus sockatmark()) + * or we break the semantics of SIOCATMARK (and thus sockatmark()) * * NOTE. Double Dutch. Rendering to plain English: author of comment * above did something sort of send("A", MSG_OOB); send("B", MSG_OOB); @@ -3646,7 +3745,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tp->rx_opt.saw_tstamp = 0; /* pred_flags is 0xS?10 << 16 + snd_wnd - * if header_predition is to be made + * if header_prediction is to be made * 'S' will always be tp->tcp_header_len >> 2 * '?' will be 0 for the fast path, otherwise pred_flags is 0 to * turn it off (when there are holes in the receive @@ -4242,7 +4341,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, */ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && !tp->srtt) - tcp_ack_saw_tstamp(sk, NULL, 0); + tcp_ack_saw_tstamp(sk, 0); if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; @@ -4372,6 +4471,7 @@ discard: EXPORT_SYMBOL(sysctl_tcp_ecn); EXPORT_SYMBOL(sysctl_tcp_reordering); +EXPORT_SYMBOL(sysctl_tcp_abc); EXPORT_SYMBOL(tcp_parse_options); EXPORT_SYMBOL(tcp_rcv_established); EXPORT_SYMBOL(tcp_rcv_state_process); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c85819d8474..4d5021e1929 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -39,7 +39,7 @@ * request_sock handling and moved * most of it into the af independent code. * Added tail drop and some other bugfixes. - * Added new listen sematics. + * Added new listen semantics. * Mike McLagan : Routing by source * Juan Jose Ciarlante: ip_dynaddr bits * Andi Kleen: various fixes. @@ -93,8 +93,6 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { .lhash_lock = RW_LOCK_UNLOCKED, .lhash_users = ATOMIC_INIT(0), .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait), - .portalloc_lock = SPIN_LOCK_UNLOCKED, - .port_rover = 1024 - 1, }; static int tcp_v4_get_port(struct sock *sk, unsigned short snum) @@ -825,8 +823,7 @@ out: */ static void tcp_v4_reqsk_destructor(struct request_sock *req) { - if (inet_rsk(req)->opt) - kfree(inet_rsk(req)->opt); + kfree(inet_rsk(req)->opt); } static inline void syn_flood_warning(struct sk_buff *skb) @@ -1113,24 +1110,18 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) static int tcp_v4_checksum_init(struct sk_buff *skb) { if (skb->ip_summed == CHECKSUM_HW) { - skb->ip_summed = CHECKSUM_UNNECESSARY; if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr, - skb->nh.iph->daddr, skb->csum)) + skb->nh.iph->daddr, skb->csum)) { + skb->ip_summed = CHECKSUM_UNNECESSARY; return 0; - - LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v4 csum failed\n"); - skb->ip_summed = CHECKSUM_NONE; + } } + + skb->csum = csum_tcpudp_nofold(skb->nh.iph->saddr, skb->nh.iph->daddr, + skb->len, IPPROTO_TCP, 0); + if (skb->len <= 76) { - if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr, - skb->nh.iph->daddr, - skb_checksum(skb, 0, skb->len, 0))) - return -1; - skb->ip_summed = CHECKSUM_UNNECESSARY; - } else { - skb->csum = ~tcp_v4_check(skb->h.th, skb->len, - skb->nh.iph->saddr, - skb->nh.iph->daddr, 0); + return __skb_checksum_complete(skb); } return 0; } @@ -1219,10 +1210,10 @@ int tcp_v4_rcv(struct sk_buff *skb) /* An explanation is required here, I think. * Packet length and doff are validated by header prediction, - * provided case of th->doff==0 is elimineted. + * provided case of th->doff==0 is eliminated. * So, we defer the checks. */ if ((skb->ip_summed != CHECKSUM_UNNECESSARY && - tcp_v4_checksum_init(skb) < 0)) + tcp_v4_checksum_init(skb))) goto bad_packet; th = skb->h.th; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index b1a63b2c6b4..1b66a2ac432 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -158,7 +158,7 @@ kill_with_rst: /* I am shamed, but failed to make it more elegant. * Yes, it is direct reference to IP, which is impossible * to generalize to IPv6. Taking into account that IPv6 - * do not undertsnad recycling in any case, it not + * do not understand recycling in any case, it not * a big problem in practice. --ANK */ if (tw->tw_family == AF_INET && tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && @@ -194,7 +194,7 @@ kill_with_rst: /* In window segment, it may be only reset or bare ack. */ if (th->rst) { - /* This is TIME_WAIT assasination, in two flavors. + /* This is TIME_WAIT assassination, in two flavors. * Oh well... nobody has a sufficient solution to this * protocol bug yet. */ @@ -380,6 +380,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, */ newtp->snd_cwnd = 2; newtp->snd_cwnd_cnt = 0; + newtp->bytes_acked = 0; newtp->frto_counter = 0; newtp->frto_highmark = 0; @@ -550,7 +551,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, /* RFC793 page 36: "If the connection is in any non-synchronized state ... * and the incoming segment acknowledges something not yet - * sent (the segment carries an unaccaptable ACK) ... + * sent (the segment carries an unacceptable ACK) ... * a reset is sent." * * Invalid ACK: reset will be sent by listening socket diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b907456a79f..029c70dfb58 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -436,6 +436,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss u16 flags; BUG_ON(len > skb->len); + + clear_all_retrans_hints(tp); nsize = skb_headlen(skb) - len; if (nsize < 0) nsize = 0; @@ -599,7 +601,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) for TCP options, but includes only bare TCP header. tp->rx_opt.mss_clamp is mss negotiated at connection setup. - It is minumum of user_mss and mss received with SYN. + It is minimum of user_mss and mss received with SYN. It also does not include TCP options. tp->pmtu_cookie is last pmtu, seen by this function. @@ -1171,7 +1173,7 @@ u32 __tcp_select_window(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - /* MSS for the peer's data. Previous verions used mss_clamp + /* MSS for the peer's data. Previous versions used mss_clamp * here. I don't know if the value based on our guesses * of peer's MSS is better for the performance. It's more correct * but may be worse for the performance because of rcv_mss @@ -1260,7 +1262,10 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); - /* Ok. We will be able to collapse the packet. */ + /* changing transmit queue under us so clear hints */ + clear_all_retrans_hints(tp); + + /* Ok. We will be able to collapse the packet. */ __skb_unlink(next_skb, &sk->sk_write_queue); memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size); @@ -1330,6 +1335,8 @@ void tcp_simple_retransmit(struct sock *sk) } } + clear_all_retrans_hints(tp); + if (!lost) return; @@ -1361,7 +1368,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) int err; /* Do not sent more than we queued. 1/4 is reserved for possible - * copying overhead: frgagmentation, tunneling, mangling etc. + * copying overhead: fragmentation, tunneling, mangling etc. */ if (atomic_read(&sk->sk_wmem_alloc) > min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf)) @@ -1468,13 +1475,25 @@ void tcp_xmit_retransmit_queue(struct sock *sk) const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - int packet_cnt = tp->lost_out; + int packet_cnt; + + if (tp->retransmit_skb_hint) { + skb = tp->retransmit_skb_hint; + packet_cnt = tp->retransmit_cnt_hint; + }else{ + skb = sk->sk_write_queue.next; + packet_cnt = 0; + } /* First pass: retransmit lost packets. */ - if (packet_cnt) { - sk_stream_for_retrans_queue(skb, sk) { + if (tp->lost_out) { + sk_stream_for_retrans_queue_from(skb, sk) { __u8 sacked = TCP_SKB_CB(skb)->sacked; + /* we could do better than to assign each time */ + tp->retransmit_skb_hint = skb; + tp->retransmit_cnt_hint = packet_cnt; + /* Assume this retransmit will generate * only one packet for congestion window * calculation purposes. This works because @@ -1485,10 +1504,12 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) return; - if (sacked&TCPCB_LOST) { + if (sacked & TCPCB_LOST) { if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { - if (tcp_retransmit_skb(sk, skb)) + if (tcp_retransmit_skb(sk, skb)) { + tp->retransmit_skb_hint = NULL; return; + } if (icsk->icsk_ca_state != TCP_CA_Loss) NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS); else @@ -1501,8 +1522,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk) TCP_RTO_MAX); } - packet_cnt -= tcp_skb_pcount(skb); - if (packet_cnt <= 0) + packet_cnt += tcp_skb_pcount(skb); + if (packet_cnt >= tp->lost_out) break; } } @@ -1528,9 +1549,18 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (tcp_may_send_now(sk, tp)) return; - packet_cnt = 0; + if (tp->forward_skb_hint) { + skb = tp->forward_skb_hint; + packet_cnt = tp->forward_cnt_hint; + } else{ + skb = sk->sk_write_queue.next; + packet_cnt = 0; + } + + sk_stream_for_retrans_queue_from(skb, sk) { + tp->forward_cnt_hint = packet_cnt; + tp->forward_skb_hint = skb; - sk_stream_for_retrans_queue(skb, sk) { /* Similar to the retransmit loop above we * can pretend that the retransmitted SKB * we send out here will be composed of one @@ -1547,8 +1577,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk) continue; /* Ok, retransmit it. */ - if (tcp_retransmit_skb(sk, skb)) + if (tcp_retransmit_skb(sk, skb)) { + tp->forward_skb_hint = NULL; break; + } if (skb == skb_peek(&sk->sk_write_queue)) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, @@ -2058,3 +2090,4 @@ EXPORT_SYMBOL(tcp_connect); EXPORT_SYMBOL(tcp_make_synack); EXPORT_SYMBOL(tcp_simple_retransmit); EXPORT_SYMBOL(tcp_sync_mss); +EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor); diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 327770bf552..26d7486ee50 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -20,20 +20,20 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, int flag) { struct tcp_sock *tp = tcp_sk(sk); - if (in_flight < tp->snd_cwnd) + + if (!tcp_is_cwnd_limited(sk, in_flight)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) { - tp->snd_cwnd++; - } else { + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { tp->snd_cwnd_cnt++; if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){ - tp->snd_cwnd++; + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; tp->snd_cwnd_cnt = 0; } } - tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); - tp->snd_cwnd_stamp = tcp_time_stamp; } static u32 tcp_scalable_ssthresh(struct sock *sk) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 415ee47ac1c..e1880959614 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -58,7 +58,7 @@ static void tcp_write_err(struct sock *sk) * to prevent DoS attacks. It is called when a retransmission timeout * or zero probe timeout occurs on orphaned socket. * - * Criterium is still not confirmed experimentally and may change. + * Criteria is still not confirmed experimentally and may change. * We kill the socket, if: * 1. If number of orphaned sockets exceeds an administratively configured * limit. @@ -132,7 +132,7 @@ static int tcp_write_timeout(struct sock *sk) hole detection. :-( It is place to make it. It is not made. I do not want - to make it. It is disguisting. It does not work in any + to make it. It is disgusting. It does not work in any case. Let me to cite the same draft, which requires for us to implement this: diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 93c5f92070f..b7d296a8ac6 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -236,8 +236,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, /* We don't have enough RTT samples to do the Vegas * calculation, so we'll behave like Reno. */ - if (tp->snd_cwnd > tp->snd_ssthresh) - tp->snd_cwnd++; + tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag); } else { u32 rtt, target_cwnd, diff; @@ -275,7 +274,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, */ diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; - if (tp->snd_cwnd < tp->snd_ssthresh) { + if (tp->snd_cwnd <= tp->snd_ssthresh) { /* Slow start. */ if (diff > gamma) { /* Going too fast. Time to slow down @@ -295,6 +294,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, V_PARAM_SHIFT)+1); } + tcp_slow_start(tp); } else { /* Congestion avoidance. */ u32 next_snd_cwnd; @@ -327,37 +327,17 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, else if (next_snd_cwnd < tp->snd_cwnd) tp->snd_cwnd--; } - } - /* Wipe the slate clean for the next RTT. */ - vegas->cntRTT = 0; - vegas->minRTT = 0x7fffffff; + if (tp->snd_cwnd < 2) + tp->snd_cwnd = 2; + else if (tp->snd_cwnd > tp->snd_cwnd_clamp) + tp->snd_cwnd = tp->snd_cwnd_clamp; + } } - /* The following code is executed for every ack we receive, - * except for conditions checked in should_advance_cwnd() - * before the call to tcp_cong_avoid(). Mainly this means that - * we only execute this code if the ack actually acked some - * data. - */ - - /* If we are in slow start, increase our cwnd in response to this ACK. - * (If we are not in slow start then we are in congestion avoidance, - * and adjust our congestion window only once per RTT. See the code - * above.) - */ - if (tp->snd_cwnd <= tp->snd_ssthresh) - tp->snd_cwnd++; - - /* to keep cwnd from growing without bound */ - tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); - - /* Make sure that we are never so timid as to reduce our cwnd below - * 2 MSS. - * - * Going below 2 MSS would risk huge delayed ACKs from our receiver. - */ - tp->snd_cwnd = max(tp->snd_cwnd, 2U); + /* Wipe the slate clean for the next RTT. */ + vegas->cntRTT = 0; + vegas->minRTT = 0x7fffffff; } /* Extract info for Tcp socket info provided via netlink. */ diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index e0bd1013cb0..2422a5f7195 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -761,7 +761,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) static __inline__ int __udp_checksum_complete(struct sk_buff *skb) { - return (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum)); + return __skb_checksum_complete(skb); } static __inline__ int udp_checksum_complete(struct sk_buff *skb) @@ -1100,11 +1100,8 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, if (uh->check == 0) { skb->ip_summed = CHECKSUM_UNNECESSARY; } else if (skb->ip_summed == CHECKSUM_HW) { - skb->ip_summed = CHECKSUM_UNNECESSARY; if (!udp_check(uh, ulen, saddr, daddr, skb->csum)) - return 0; - LIMIT_NETDEBUG(KERN_DEBUG "udp v4 hw csum failure.\n"); - skb->ip_summed = CHECKSUM_NONE; + skb->ip_summed = CHECKSUM_UNNECESSARY; } if (skb->ip_summed != CHECKSUM_UNNECESSARY) skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 2c5f57299d6..56a09a4ac41 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -35,6 +35,9 @@ * YOSHIFUJI Hideaki @USAGI : ARCnet support * YOSHIFUJI Hideaki @USAGI : convert /proc/net/if_inet6 to * seq_file. + * YOSHIFUJI Hideaki @USAGI : improved source address + * selection; consider scope, + * status etc. */ #include <linux/config.h> @@ -193,46 +196,51 @@ const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT; #endif const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT; -int ipv6_addr_type(const struct in6_addr *addr) +#define IPV6_ADDR_SCOPE_TYPE(scope) ((scope) << 16) + +static inline unsigned ipv6_addr_scope2type(unsigned scope) +{ + switch(scope) { + case IPV6_ADDR_SCOPE_NODELOCAL: + return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_NODELOCAL) | + IPV6_ADDR_LOOPBACK); + case IPV6_ADDR_SCOPE_LINKLOCAL: + return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL) | + IPV6_ADDR_LINKLOCAL); + case IPV6_ADDR_SCOPE_SITELOCAL: + return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_SITELOCAL) | + IPV6_ADDR_SITELOCAL); + } + return IPV6_ADDR_SCOPE_TYPE(scope); +} + +int __ipv6_addr_type(const struct in6_addr *addr) { - int type; u32 st; st = addr->s6_addr32[0]; - if ((st & htonl(0xFF000000)) == htonl(0xFF000000)) { - type = IPV6_ADDR_MULTICAST; - - switch((st & htonl(0x00FF0000))) { - case __constant_htonl(0x00010000): - type |= IPV6_ADDR_LOOPBACK; - break; - - case __constant_htonl(0x00020000): - type |= IPV6_ADDR_LINKLOCAL; - break; - - case __constant_htonl(0x00050000): - type |= IPV6_ADDR_SITELOCAL; - break; - }; - return type; - } - - type = IPV6_ADDR_UNICAST; - /* Consider all addresses with the first three bits different of - 000 and 111 as finished. + 000 and 111 as unicasts. */ if ((st & htonl(0xE0000000)) != htonl(0x00000000) && (st & htonl(0xE0000000)) != htonl(0xE0000000)) - return type; - - if ((st & htonl(0xFFC00000)) == htonl(0xFE800000)) - return (IPV6_ADDR_LINKLOCAL | type); + return (IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); + if ((st & htonl(0xFF000000)) == htonl(0xFF000000)) { + /* multicast */ + /* addr-select 3.1 */ + return (IPV6_ADDR_MULTICAST | + ipv6_addr_scope2type(IPV6_ADDR_MC_SCOPE(addr))); + } + + if ((st & htonl(0xFFC00000)) == htonl(0xFE800000)) + return (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL)); /* addr-select 3.1 */ if ((st & htonl(0xFFC00000)) == htonl(0xFEC00000)) - return (IPV6_ADDR_SITELOCAL | type); + return (IPV6_ADDR_SITELOCAL | IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_SITELOCAL)); /* addr-select 3.1 */ if ((addr->s6_addr32[0] | addr->s6_addr32[1]) == 0) { if (addr->s6_addr32[2] == 0) { @@ -240,24 +248,20 @@ int ipv6_addr_type(const struct in6_addr *addr) return IPV6_ADDR_ANY; if (addr->s6_addr32[3] == htonl(0x00000001)) - return (IPV6_ADDR_LOOPBACK | type); + return (IPV6_ADDR_LOOPBACK | IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL)); /* addr-select 3.4 */ - return (IPV6_ADDR_COMPATv4 | type); + return (IPV6_ADDR_COMPATv4 | IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.3 */ } if (addr->s6_addr32[2] == htonl(0x0000ffff)) - return IPV6_ADDR_MAPPED; + return (IPV6_ADDR_MAPPED | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.3 */ } - st &= htonl(0xFF000000); - if (st == 0) - return IPV6_ADDR_RESERVED; - st &= htonl(0xFE000000); - if (st == htonl(0x02000000)) - return IPV6_ADDR_RESERVED; /* for NSAP */ - if (st == htonl(0x04000000)) - return IPV6_ADDR_RESERVED; /* for IPX */ - return type; + return (IPV6_ADDR_RESERVED | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.4 */ } static void addrconf_del_timer(struct inet6_ifaddr *ifp) @@ -805,138 +809,276 @@ out: #endif /* - * Choose an appropriate source address - * should do: - * i) get an address with an appropriate scope - * ii) see if there is a specific route for the destination and use - * an address of the attached interface - * iii) don't use deprecated addresses + * Choose an appropriate source address (RFC3484) */ -static int inline ipv6_saddr_pref(const struct inet6_ifaddr *ifp, u8 invpref) +struct ipv6_saddr_score { + int addr_type; + unsigned int attrs; + int matchlen; + unsigned int scope; + unsigned int rule; +}; + +#define IPV6_SADDR_SCORE_LOCAL 0x0001 +#define IPV6_SADDR_SCORE_PREFERRED 0x0004 +#define IPV6_SADDR_SCORE_HOA 0x0008 +#define IPV6_SADDR_SCORE_OIF 0x0010 +#define IPV6_SADDR_SCORE_LABEL 0x0020 +#define IPV6_SADDR_SCORE_PRIVACY 0x0040 + +static int inline ipv6_saddr_preferred(int type) { - int pref; - pref = ifp->flags&IFA_F_DEPRECATED ? 0 : 2; -#ifdef CONFIG_IPV6_PRIVACY - pref |= (ifp->flags^invpref)&IFA_F_TEMPORARY ? 0 : 1; -#endif - return pref; + if (type & (IPV6_ADDR_MAPPED|IPV6_ADDR_COMPATv4| + IPV6_ADDR_LOOPBACK|IPV6_ADDR_RESERVED)) + return 1; + return 0; } -#ifdef CONFIG_IPV6_PRIVACY -#define IPV6_GET_SADDR_MAXSCORE(score) ((score) == 3) -#else -#define IPV6_GET_SADDR_MAXSCORE(score) (score) -#endif +/* static matching label */ +static int inline ipv6_saddr_label(const struct in6_addr *addr, int type) +{ + /* + * prefix (longest match) label + * ----------------------------- + * ::1/128 0 + * ::/0 1 + * 2002::/16 2 + * ::/96 3 + * ::ffff:0:0/96 4 + */ + if (type & IPV6_ADDR_LOOPBACK) + return 0; + else if (type & IPV6_ADDR_COMPATv4) + return 3; + else if (type & IPV6_ADDR_MAPPED) + return 4; + else if (addr->s6_addr16[0] == htons(0x2002)) + return 2; + return 1; +} -int ipv6_dev_get_saddr(struct net_device *dev, +int ipv6_dev_get_saddr(struct net_device *daddr_dev, struct in6_addr *daddr, struct in6_addr *saddr) { - struct inet6_ifaddr *ifp = NULL; - struct inet6_ifaddr *match = NULL; - struct inet6_dev *idev; - int scope; - int err; - int hiscore = -1, score; + struct ipv6_saddr_score hiscore; + struct inet6_ifaddr *ifa_result = NULL; + int daddr_type = __ipv6_addr_type(daddr); + int daddr_scope = __ipv6_addr_src_scope(daddr_type); + u32 daddr_label = ipv6_saddr_label(daddr, daddr_type); + struct net_device *dev; - scope = ipv6_addr_scope(daddr); + memset(&hiscore, 0, sizeof(hiscore)); - /* - * known dev - * search dev and walk through dev addresses - */ + read_lock(&dev_base_lock); + read_lock(&addrconf_lock); - if (dev) { - if (dev->flags & IFF_LOOPBACK) - scope = IFA_HOST; + for (dev = dev_base; dev; dev=dev->next) { + struct inet6_dev *idev; + struct inet6_ifaddr *ifa; + + /* Rule 0: Candidate Source Address (section 4) + * - multicast and link-local destination address, + * the set of candidate source address MUST only + * include addresses assigned to interfaces + * belonging to the same link as the outgoing + * interface. + * (- For site-local destination addresses, the + * set of candidate source addresses MUST only + * include addresses assigned to interfaces + * belonging to the same site as the outgoing + * interface.) + */ + if ((daddr_type & IPV6_ADDR_MULTICAST || + daddr_scope <= IPV6_ADDR_SCOPE_LINKLOCAL) && + daddr_dev && dev != daddr_dev) + continue; - read_lock(&addrconf_lock); idev = __in6_dev_get(dev); - if (idev) { - read_lock_bh(&idev->lock); - for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) { - if (ifp->scope == scope) { - if (ifp->flags&IFA_F_TENTATIVE) - continue; -#ifdef CONFIG_IPV6_PRIVACY - score = ipv6_saddr_pref(ifp, idev->cnf.use_tempaddr > 1 ? IFA_F_TEMPORARY : 0); -#else - score = ipv6_saddr_pref(ifp, 0); -#endif - if (score <= hiscore) - continue; + if (!idev) + continue; - if (match) - in6_ifa_put(match); - match = ifp; - hiscore = score; - in6_ifa_hold(ifp); + read_lock_bh(&idev->lock); + for (ifa = idev->addr_list; ifa; ifa = ifa->if_next) { + struct ipv6_saddr_score score; - if (IPV6_GET_SADDR_MAXSCORE(score)) { - read_unlock_bh(&idev->lock); - read_unlock(&addrconf_lock); - goto out; - } + score.addr_type = __ipv6_addr_type(&ifa->addr); + + /* Rule 0: Candidate Source Address (section 4) + * - In any case, anycast addresses, multicast + * addresses, and the unspecified address MUST + * NOT be included in a candidate set. + */ + if (unlikely(score.addr_type == IPV6_ADDR_ANY || + score.addr_type & IPV6_ADDR_MULTICAST)) { + LIMIT_NETDEBUG(KERN_DEBUG + "ADDRCONF: unspecified / multicast address" + "assigned as unicast address on %s", + dev->name); + continue; + } + + score.attrs = 0; + score.matchlen = 0; + score.scope = 0; + score.rule = 0; + + if (ifa_result == NULL) { + /* record it if the first available entry */ + goto record_it; + } + + /* Rule 1: Prefer same address */ + if (hiscore.rule < 1) { + if (ipv6_addr_equal(&ifa_result->addr, daddr)) + hiscore.attrs |= IPV6_SADDR_SCORE_LOCAL; + hiscore.rule++; + } + if (ipv6_addr_equal(&ifa->addr, daddr)) { + score.attrs |= IPV6_SADDR_SCORE_LOCAL; + if (!(hiscore.attrs & IPV6_SADDR_SCORE_LOCAL)) { + score.rule = 1; + goto record_it; } + } else { + if (hiscore.attrs & IPV6_SADDR_SCORE_LOCAL) + continue; } - read_unlock_bh(&idev->lock); - } - read_unlock(&addrconf_lock); - } - if (scope == IFA_LINK) - goto out; + /* Rule 2: Prefer appropriate scope */ + if (hiscore.rule < 2) { + hiscore.scope = __ipv6_addr_src_scope(hiscore.addr_type); + hiscore.rule++; + } + score.scope = __ipv6_addr_src_scope(score.addr_type); + if (hiscore.scope < score.scope) { + if (hiscore.scope < daddr_scope) { + score.rule = 2; + goto record_it; + } else + continue; + } else if (score.scope < hiscore.scope) { + if (score.scope < daddr_scope) + continue; + else { + score.rule = 2; + goto record_it; + } + } - /* - * dev == NULL or search failed for specified dev - */ + /* Rule 3: Avoid deprecated address */ + if (hiscore.rule < 3) { + if (ipv6_saddr_preferred(hiscore.addr_type) || + !(ifa_result->flags & IFA_F_DEPRECATED)) + hiscore.attrs |= IPV6_SADDR_SCORE_PREFERRED; + hiscore.rule++; + } + if (ipv6_saddr_preferred(score.addr_type) || + !(ifa->flags & IFA_F_DEPRECATED)) { + score.attrs |= IPV6_SADDR_SCORE_PREFERRED; + if (!(hiscore.attrs & IPV6_SADDR_SCORE_PREFERRED)) { + score.rule = 3; + goto record_it; + } + } else { + if (hiscore.attrs & IPV6_SADDR_SCORE_PREFERRED) + continue; + } - read_lock(&dev_base_lock); - read_lock(&addrconf_lock); - for (dev = dev_base; dev; dev=dev->next) { - idev = __in6_dev_get(dev); - if (idev) { - read_lock_bh(&idev->lock); - for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) { - if (ifp->scope == scope) { - if (ifp->flags&IFA_F_TENTATIVE) - continue; -#ifdef CONFIG_IPV6_PRIVACY - score = ipv6_saddr_pref(ifp, idev->cnf.use_tempaddr > 1 ? IFA_F_TEMPORARY : 0); -#else - score = ipv6_saddr_pref(ifp, 0); -#endif - if (score <= hiscore) - continue; + /* Rule 4: Prefer home address -- not implemented yet */ - if (match) - in6_ifa_put(match); - match = ifp; - hiscore = score; - in6_ifa_hold(ifp); + /* Rule 5: Prefer outgoing interface */ + if (hiscore.rule < 5) { + if (daddr_dev == NULL || + daddr_dev == ifa_result->idev->dev) + hiscore.attrs |= IPV6_SADDR_SCORE_OIF; + hiscore.rule++; + } + if (daddr_dev == NULL || + daddr_dev == ifa->idev->dev) { + score.attrs |= IPV6_SADDR_SCORE_OIF; + if (!(hiscore.attrs & IPV6_SADDR_SCORE_OIF)) { + score.rule = 5; + goto record_it; + } + } else { + if (hiscore.attrs & IPV6_SADDR_SCORE_OIF) + continue; + } - if (IPV6_GET_SADDR_MAXSCORE(score)) { - read_unlock_bh(&idev->lock); - goto out_unlock_base; - } + /* Rule 6: Prefer matching label */ + if (hiscore.rule < 6) { + if (ipv6_saddr_label(&ifa_result->addr, hiscore.addr_type) == daddr_label) + hiscore.attrs |= IPV6_SADDR_SCORE_LABEL; + hiscore.rule++; + } + if (ipv6_saddr_label(&ifa->addr, score.addr_type) == daddr_label) { + score.attrs |= IPV6_SADDR_SCORE_LABEL; + if (!(hiscore.attrs & IPV6_SADDR_SCORE_LABEL)) { + score.rule = 6; + goto record_it; } + } else { + if (hiscore.attrs & IPV6_SADDR_SCORE_LABEL) + continue; } - read_unlock_bh(&idev->lock); + +#ifdef CONFIG_IPV6_PRIVACY + /* Rule 7: Prefer public address + * Note: prefer temprary address if use_tempaddr >= 2 + */ + if (hiscore.rule < 7) { + if ((!(ifa_result->flags & IFA_F_TEMPORARY)) ^ + (ifa_result->idev->cnf.use_tempaddr >= 2)) + hiscore.attrs |= IPV6_SADDR_SCORE_PRIVACY; + hiscore.rule++; + } + if ((!(ifa->flags & IFA_F_TEMPORARY)) ^ + (ifa->idev->cnf.use_tempaddr >= 2)) { + score.attrs |= IPV6_SADDR_SCORE_PRIVACY; + if (!(hiscore.attrs & IPV6_SADDR_SCORE_PRIVACY)) { + score.rule = 7; + goto record_it; + } + } else { + if (hiscore.attrs & IPV6_SADDR_SCORE_PRIVACY) + continue; + } +#endif + /* Rule 8: Use longest matching prefix */ + if (hiscore.rule < 8) { + hiscore.matchlen = ipv6_addr_diff(&ifa_result->addr, daddr); + hiscore.rule++; + } + score.matchlen = ipv6_addr_diff(&ifa->addr, daddr); + if (score.matchlen > hiscore.matchlen) { + score.rule = 8; + goto record_it; + } +#if 0 + else if (score.matchlen < hiscore.matchlen) + continue; +#endif + + /* Final Rule: choose first available one */ + continue; +record_it: + if (ifa_result) + in6_ifa_put(ifa_result); + in6_ifa_hold(ifa); + ifa_result = ifa; + hiscore = score; } + read_unlock_bh(&idev->lock); } - -out_unlock_base: read_unlock(&addrconf_lock); read_unlock(&dev_base_lock); -out: - err = -EADDRNOTAVAIL; - if (match) { - ipv6_addr_copy(saddr, &match->addr); - err = 0; - in6_ifa_put(match); - } - - return err; + if (!ifa_result) + return -EADDRNOTAVAIL; + + ipv6_addr_copy(saddr, &ifa_result->addr); + in6_ifa_put(ifa_result); + return 0; } @@ -2950,8 +3092,7 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, nlmsg_failure: rtattr_failure: - if (array) - kfree(array); + kfree(array); skb_trim(skb, b - skb->data); return -1; } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 4f8795af2ed..c63b8ce0e1b 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -699,12 +699,14 @@ static int __init inet6_init(void) /* Register the family here so that the init calls below will * be able to create sockets. (?? is this dangerous ??) */ - (void) sock_register(&inet6_family_ops); + err = sock_register(&inet6_family_ops); + if (err) + goto out_unregister_raw_proto; /* Initialise ipv6 mibs */ err = init_ipv6_mibs(); if (err) - goto out_unregister_raw_proto; + goto out_unregister_sock; /* * ipngwg API draft makes clear that the correct semantics @@ -796,6 +798,8 @@ icmp_fail: ipv6_sysctl_unregister(); #endif cleanup_ipv6_mibs(); +out_unregister_sock: + sock_unregister(PF_INET6); out_unregister_raw_proto: proto_unregister(&rawv6_prot); out_unregister_udp_proto: diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 23e540365a1..1bdf0fb8bf8 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -585,17 +585,16 @@ static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) daddr = &skb->nh.ipv6h->daddr; /* Perform checksum. */ - if (skb->ip_summed == CHECKSUM_HW) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, - skb->csum)) { - LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 hw checksum failed\n"); - skb->ip_summed = CHECKSUM_NONE; - } - } - if (skb->ip_summed == CHECKSUM_NONE) { - if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, - skb_checksum(skb, 0, skb->len, 0))) { + switch (skb->ip_summed) { + case CHECKSUM_HW: + if (!csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, + skb->csum)) + break; + /* fall through */ + case CHECKSUM_NONE: + skb->csum = ~csum_ipv6_magic(saddr, daddr, skb->len, + IPPROTO_ICMPV6, 0); + if (__skb_checksum_complete(skb)) { LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 checksum failed [%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x > %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x]\n", NIP6(*saddr), NIP6(*daddr)); goto discard_it; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 4fcc5a7acf6..1bf6d9a769e 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -127,56 +127,6 @@ static __inline__ int addr_bit_set(void *token, int fn_bit) return htonl(1 << ((~fn_bit)&0x1F)) & addr[fn_bit>>5]; } -/* - * find the first different bit between two addresses - * length of address must be a multiple of 32bits - */ - -static __inline__ int addr_diff(void *token1, void *token2, int addrlen) -{ - __u32 *a1 = token1; - __u32 *a2 = token2; - int i; - - addrlen >>= 2; - - for (i = 0; i < addrlen; i++) { - __u32 xb; - - xb = a1[i] ^ a2[i]; - - if (xb) { - int j = 31; - - xb = ntohl(xb); - - while ((xb & (1 << j)) == 0) - j--; - - return (i * 32 + 31 - j); - } - } - - /* - * we should *never* get to this point since that - * would mean the addrs are equal - * - * However, we do get to it 8) And exacly, when - * addresses are equal 8) - * - * ip route add 1111::/128 via ... - * ip route add 1111::/64 via ... - * and we are here. - * - * Ideally, this function should stop comparison - * at prefix length. It does not, but it is still OK, - * if returned value is greater than prefix length. - * --ANK (980803) - */ - - return addrlen<<5; -} - static __inline__ struct fib6_node * node_alloc(void) { struct fib6_node *fn; @@ -296,11 +246,11 @@ insert_above: /* find 1st bit in difference between the 2 addrs. - See comment in addr_diff: bit may be an invalid value, + See comment in __ipv6_addr_diff: bit may be an invalid value, but if it is >= plen, the value is ignored in any case. */ - bit = addr_diff(addr, &key->addr, addrlen); + bit = __ipv6_addr_diff(addr, &key->addr, addrlen); /* * (intermediate)[in] diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 6e348042693..a6026d2787d 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -176,6 +176,11 @@ resubmit: if (ipprot->flags & INET6_PROTO_FINAL) { struct ipv6hdr *hdr; + /* Free reference early: we don't need it any more, + and it may hold ip_conntrack module loaded + indefinitely. */ + nf_reset(skb); + skb_postpull_rcsum(skb, skb->nh.raw, skb->h.raw - skb->nh.raw); hdr = skb->nh.ipv6h; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 614296a920c..c1fa693511a 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -441,9 +441,15 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) #ifdef CONFIG_NETFILTER to->nfmark = from->nfmark; /* Connection association is same as pre-frag packet */ + nf_conntrack_put(to->nfct); to->nfct = from->nfct; nf_conntrack_get(to->nfct); to->nfctinfo = from->nfctinfo; +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + nf_conntrack_put_reasm(to->nfct_reasm); + to->nfct_reasm = from->nfct_reasm; + nf_conntrack_get_reasm(to->nfct_reasm); +#endif #ifdef CONFIG_BRIDGE_NETFILTER nf_bridge_put(to->nf_bridge); to->nf_bridge = from->nf_bridge; @@ -587,8 +593,7 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) skb->next = NULL; } - if (tmp_hdr) - kfree(tmp_hdr); + kfree(tmp_hdr); if (err == 0) { IP6_INC_STATS(IPSTATS_MIB_FRAGOKS); @@ -1186,10 +1191,8 @@ int ip6_push_pending_frames(struct sock *sk) out: inet->cork.flags &= ~IPCORK_OPT; - if (np->cork.opt) { - kfree(np->cork.opt); - np->cork.opt = NULL; - } + kfree(np->cork.opt); + np->cork.opt = NULL; if (np->cork.rt) { dst_release(&np->cork.rt->u.dst); np->cork.rt = NULL; @@ -1214,10 +1217,8 @@ void ip6_flush_pending_frames(struct sock *sk) inet->cork.flags &= ~IPCORK_OPT; - if (np->cork.opt) { - kfree(np->cork.opt); - np->cork.opt = NULL; - } + kfree(np->cork.opt); + np->cork.opt = NULL; if (np->cork.rt) { dst_release(&np->cork.rt->u.dst); np->cork.rt = NULL; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index cf94372d1af..e315d0f80af 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -525,6 +525,7 @@ ip6ip6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) if ((t = ip6ip6_tnl_lookup(&ipv6h->saddr, &ipv6h->daddr)) != NULL) { if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { + read_unlock(&ip6ip6_lock); kfree_skb(skb); return 0; } @@ -756,8 +757,7 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) } ip6_tnl_dst_store(t, dst); - if (opt) - kfree(opt); + kfree(opt); t->recursion--; return 0; @@ -766,8 +766,7 @@ tx_err_link_failure: dst_link_failure(skb); tx_err_dst_release: dst_release(dst); - if (opt) - kfree(opt); + kfree(opt); tx_err: stats->tx_errors++; stats->tx_dropped++; diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 85bfbc69b2c..55917fb1709 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -130,8 +130,7 @@ static int ipcomp6_input(struct xfrm_state *x, struct xfrm_decap_state *decap, s out_put_cpu: put_cpu(); out: - if (tmp_hdr) - kfree(tmp_hdr); + kfree(tmp_hdr); if (err) goto error_out; return nexthdr; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 8567873d0dd..25757ade989 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -80,8 +80,7 @@ int ip6_ra_control(struct sock *sk, int sel, void (*destructor)(struct sock *)) if (ra->sk == sk) { if (sel>=0) { write_unlock_bh(&ip6_ra_lock); - if (new_ra) - kfree(new_ra); + kfree(new_ra); return -EADDRINUSE; } @@ -288,7 +287,7 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, { struct ipv6_txoptions *opt; if (optlen == 0) - optval = 0; + optval = NULL; /* hop-by-hop / destination options are privileged option */ retv = -EPERM; diff --git a/net/ipv6/ipv6_syms.c b/net/ipv6/ipv6_syms.c index 37a4a99c9fe..16482785bdf 100644 --- a/net/ipv6/ipv6_syms.c +++ b/net/ipv6/ipv6_syms.c @@ -7,7 +7,7 @@ #include <net/ip6_route.h> #include <net/xfrm.h> -EXPORT_SYMBOL(ipv6_addr_type); +EXPORT_SYMBOL(__ipv6_addr_type); EXPORT_SYMBOL(icmpv6_send); EXPORT_SYMBOL(icmpv6_statistics); EXPORT_SYMBOL(icmpv6_err_convert); diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index bb7ccfe33f2..060d6120241 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -5,10 +5,20 @@ menu "IPv6: Netfilter Configuration (EXPERIMENTAL)" depends on INET && IPV6 && NETFILTER && EXPERIMENTAL -#tristate 'Connection tracking (required for masq/NAT)' CONFIG_IP6_NF_CONNTRACK -#if [ "$CONFIG_IP6_NF_CONNTRACK" != "n" ]; then -# dep_tristate ' FTP protocol support' CONFIG_IP6_NF_FTP $CONFIG_IP6_NF_CONNTRACK -#fi +config NF_CONNTRACK_IPV6 + tristate "IPv6 support for new connection tracking (EXPERIMENTAL)" + depends on EXPERIMENTAL && NF_CONNTRACK + ---help--- + Connection tracking keeps a record of what packets have passed + through your machine, in order to figure out how they are related + into connections. + + This is IPv6 support on Layer 3 independent connection tracking. + Layer 3 independent connection tracking is experimental scheme + which generalize ip_conntrack to support other layer 3 protocols. + + To compile it as a module, choose M here. If unsure, say N. + config IP6_NF_QUEUE tristate "IP6 Userspace queueing via NETLINK (OBSOLETE)" ---help--- @@ -114,7 +124,6 @@ config IP6_NF_MATCH_OWNER To compile it as a module, choose M here. If unsure, say N. -# dep_tristate ' MAC address match support' CONFIG_IP6_NF_MATCH_MAC $CONFIG_IP6_NF_IPTABLES config IP6_NF_MATCH_MARK tristate "netfilter MARK match support" depends on IP6_NF_IPTABLES @@ -170,15 +179,6 @@ config IP6_NF_MATCH_PHYSDEV To compile it as a module, choose M here. If unsure, say N. -# dep_tristate ' Multiple port match support' CONFIG_IP6_NF_MATCH_MULTIPORT $CONFIG_IP6_NF_IPTABLES -# dep_tristate ' TOS match support' CONFIG_IP6_NF_MATCH_TOS $CONFIG_IP6_NF_IPTABLES -# if [ "$CONFIG_IP6_NF_CONNTRACK" != "n" ]; then -# dep_tristate ' Connection state match support' CONFIG_IP6_NF_MATCH_STATE $CONFIG_IP6_NF_CONNTRACK $CONFIG_IP6_NF_IPTABLES -# fi -# if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then -# dep_tristate ' Unclean match support (EXPERIMENTAL)' CONFIG_IP6_NF_MATCH_UNCLEAN $CONFIG_IP6_NF_IPTABLES -# dep_tristate ' Owner match support (EXPERIMENTAL)' CONFIG_IP6_NF_MATCH_OWNER $CONFIG_IP6_NF_IPTABLES -# fi # The targets config IP6_NF_FILTER tristate "Packet filtering" @@ -220,12 +220,6 @@ config IP6_NF_TARGET_NFQUEUE To compile it as a module, choose M here. If unsure, say N. -# if [ "$CONFIG_IP6_NF_FILTER" != "n" ]; then -# dep_tristate ' REJECT target support' CONFIG_IP6_NF_TARGET_REJECT $CONFIG_IP6_NF_FILTER -# if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then -# dep_tristate ' MIRROR target support (EXPERIMENTAL)' CONFIG_IP6_NF_TARGET_MIRROR $CONFIG_IP6_NF_FILTER -# fi -# fi config IP6_NF_MANGLE tristate "Packet mangling" depends on IP6_NF_IPTABLES @@ -236,7 +230,6 @@ config IP6_NF_MANGLE To compile it as a module, choose M here. If unsure, say N. -# dep_tristate ' TOS target support' CONFIG_IP6_NF_TARGET_TOS $CONFIG_IP_NF_MANGLE config IP6_NF_TARGET_MARK tristate "MARK target support" depends on IP6_NF_MANGLE @@ -266,7 +259,6 @@ config IP6_NF_TARGET_HL To compile it as a module, choose M here. If unsure, say N. -#dep_tristate ' LOG target support' CONFIG_IP6_NF_TARGET_LOG $CONFIG_IP6_NF_IPTABLES config IP6_NF_RAW tristate 'raw table support (required for TRACE)' depends on IP6_NF_IPTABLES diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index 2b2c370e8b1..9ab5b2ca1f5 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -27,3 +27,9 @@ obj-$(CONFIG_IP6_NF_TARGET_LOG) += ip6t_LOG.o obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o obj-$(CONFIG_IP6_NF_MATCH_HL) += ip6t_hl.o obj-$(CONFIG_IP6_NF_TARGET_REJECT) += ip6t_REJECT.o + +# objects for l3 independent conntrack +nf_conntrack_ipv6-objs := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o nf_conntrack_reasm.o + +# l3 independent conntrack +obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o diff --git a/net/ipv6/netfilter/ip6t_MARK.c b/net/ipv6/netfilter/ip6t_MARK.c index 0c7584f9217..eab8fb864ee 100644 --- a/net/ipv6/netfilter/ip6t_MARK.c +++ b/net/ipv6/netfilter/ip6t_MARK.c @@ -56,9 +56,9 @@ checkentry(const char *tablename, return 1; } -static struct ip6t_target ip6t_mark_reg = { - .name = "MARK", - .target = target, +static struct ip6t_target ip6t_mark_reg = { + .name = "MARK", + .target = target, .checkentry = checkentry, .me = THIS_MODULE }; diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c new file mode 100644 index 00000000000..753a3ae8502 --- /dev/null +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -0,0 +1,556 @@ +/* + * Copyright (C)2004 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Author: + * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> + * + * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> + * - support Layer 3 protocol independent connection tracking. + * Based on the original ip_conntrack code which had the following + * copyright information: + * (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * 23 Mar 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> + * - add get_features() to support various size of conntrack + * structures. + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/ipv6.h> +#include <linux/in6.h> +#include <linux/netfilter.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/icmp.h> +#include <linux/sysctl.h> +#include <net/ipv6.h> + +#include <linux/netfilter_ipv6.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_protocol.h> +#include <net/netfilter/nf_conntrack_l3proto.h> +#include <net/netfilter/nf_conntrack_core.h> + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +DECLARE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat); + +static int ipv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, + struct nf_conntrack_tuple *tuple) +{ + u_int32_t _addrs[8], *ap; + + ap = skb_header_pointer(skb, nhoff + offsetof(struct ipv6hdr, saddr), + sizeof(_addrs), _addrs); + if (ap == NULL) + return 0; + + memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6)); + memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6)); + + return 1; +} + +static int ipv6_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + memcpy(tuple->src.u3.ip6, orig->dst.u3.ip6, sizeof(tuple->src.u3.ip6)); + memcpy(tuple->dst.u3.ip6, orig->src.u3.ip6, sizeof(tuple->dst.u3.ip6)); + + return 1; +} + +static int ipv6_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return seq_printf(s, "src=%x:%x:%x:%x:%x:%x:%x:%x dst=%x:%x:%x:%x:%x:%x:%x:%x ", + NIP6(*((struct in6_addr *)tuple->src.u3.ip6)), + NIP6(*((struct in6_addr *)tuple->dst.u3.ip6))); +} + +static int ipv6_print_conntrack(struct seq_file *s, + const struct nf_conn *conntrack) +{ + return 0; +} + +/* + * Based on ipv6_skip_exthdr() in net/ipv6/exthdr.c + * + * This function parses (probably truncated) exthdr set "hdr" + * of length "len". "nexthdrp" initially points to some place, + * where type of the first header can be found. + * + * It skips all well-known exthdrs, and returns pointer to the start + * of unparsable area i.e. the first header with unknown type. + * if success, *nexthdr is updated by type/protocol of this header. + * + * NOTES: - it may return pointer pointing beyond end of packet, + * if the last recognized header is truncated in the middle. + * - if packet is truncated, so that all parsed headers are skipped, + * it returns -1. + * - if packet is fragmented, return pointer of the fragment header. + * - ESP is unparsable for now and considered like + * normal payload protocol. + * - Note also special handling of AUTH header. Thanks to IPsec wizards. + */ + +int nf_ct_ipv6_skip_exthdr(struct sk_buff *skb, int start, u8 *nexthdrp, + int len) +{ + u8 nexthdr = *nexthdrp; + + while (ipv6_ext_hdr(nexthdr)) { + struct ipv6_opt_hdr hdr; + int hdrlen; + + if (len < (int)sizeof(struct ipv6_opt_hdr)) + return -1; + if (nexthdr == NEXTHDR_NONE) + break; + if (nexthdr == NEXTHDR_FRAGMENT) + break; + if (skb_copy_bits(skb, start, &hdr, sizeof(hdr))) + BUG(); + if (nexthdr == NEXTHDR_AUTH) + hdrlen = (hdr.hdrlen+2)<<2; + else + hdrlen = ipv6_optlen(&hdr); + + nexthdr = hdr.nexthdr; + len -= hdrlen; + start += hdrlen; + } + + *nexthdrp = nexthdr; + return start; +} + +static int +ipv6_prepare(struct sk_buff **pskb, unsigned int hooknum, unsigned int *dataoff, + u_int8_t *protonum) +{ + unsigned int extoff; + unsigned char pnum; + int protoff; + + extoff = (u8*)((*pskb)->nh.ipv6h + 1) - (*pskb)->data; + pnum = (*pskb)->nh.ipv6h->nexthdr; + + protoff = nf_ct_ipv6_skip_exthdr(*pskb, extoff, &pnum, + (*pskb)->len - extoff); + + /* + * (protoff == (*pskb)->len) mean that the packet doesn't have no data + * except of IPv6 & ext headers. but it's tracked anyway. - YK + */ + if ((protoff < 0) || (protoff > (*pskb)->len)) { + DEBUGP("ip6_conntrack_core: can't find proto in pkt\n"); + NF_CT_STAT_INC(error); + NF_CT_STAT_INC(invalid); + return -NF_ACCEPT; + } + + *dataoff = protoff; + *protonum = pnum; + return NF_ACCEPT; +} + +static u_int32_t ipv6_get_features(const struct nf_conntrack_tuple *tuple) +{ + return NF_CT_F_BASIC; +} + +static unsigned int ipv6_confirm(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + + /* This is where we call the helper: as the packet goes out. */ + ct = nf_ct_get(*pskb, &ctinfo); + if (ct && ct->helper) { + unsigned int ret, protoff; + unsigned int extoff = (u8*)((*pskb)->nh.ipv6h + 1) + - (*pskb)->data; + unsigned char pnum = (*pskb)->nh.ipv6h->nexthdr; + + protoff = nf_ct_ipv6_skip_exthdr(*pskb, extoff, &pnum, + (*pskb)->len - extoff); + if (protoff < 0 || protoff > (*pskb)->len || + pnum == NEXTHDR_FRAGMENT) { + DEBUGP("proto header not found\n"); + return NF_ACCEPT; + } + + ret = ct->helper->help(pskb, protoff, ct, ctinfo); + if (ret != NF_ACCEPT) + return ret; + } + + /* We've seen it coming out the other side: confirm it */ + + return nf_conntrack_confirm(pskb); +} + +extern struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb); +extern void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb, + struct net_device *in, + struct net_device *out, + int (*okfn)(struct sk_buff *)); +static unsigned int ipv6_defrag(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct sk_buff *reasm; + + /* Previously seen (loopback)? */ + if ((*pskb)->nfct) + return NF_ACCEPT; + + reasm = nf_ct_frag6_gather(*pskb); + + /* queued */ + if (reasm == NULL) + return NF_STOLEN; + + /* error occured or not fragmented */ + if (reasm == *pskb) + return NF_ACCEPT; + + nf_ct_frag6_output(hooknum, reasm, (struct net_device *)in, + (struct net_device *)out, okfn); + + return NF_STOLEN; +} + +static unsigned int ipv6_conntrack_in(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct sk_buff *reasm = (*pskb)->nfct_reasm; + + /* This packet is fragmented and has reassembled packet. */ + if (reasm) { + /* Reassembled packet isn't parsed yet ? */ + if (!reasm->nfct) { + unsigned int ret; + + ret = nf_conntrack_in(PF_INET6, hooknum, &reasm); + if (ret != NF_ACCEPT) + return ret; + } + nf_conntrack_get(reasm->nfct); + (*pskb)->nfct = reasm->nfct; + return NF_ACCEPT; + } + + return nf_conntrack_in(PF_INET6, hooknum, pskb); +} + +static unsigned int ipv6_conntrack_local(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + /* root is playing with raw sockets. */ + if ((*pskb)->len < sizeof(struct ipv6hdr)) { + if (net_ratelimit()) + printk("ipv6_conntrack_local: packet too short\n"); + return NF_ACCEPT; + } + return ipv6_conntrack_in(hooknum, pskb, in, out, okfn); +} + +/* Connection tracking may drop packets, but never alters them, so + make it the first hook. */ +static struct nf_hook_ops ipv6_conntrack_defrag_ops = { + .hook = ipv6_defrag, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_IP6_PRE_ROUTING, + .priority = NF_IP6_PRI_CONNTRACK_DEFRAG, +}; + +static struct nf_hook_ops ipv6_conntrack_in_ops = { + .hook = ipv6_conntrack_in, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_IP6_PRE_ROUTING, + .priority = NF_IP6_PRI_CONNTRACK, +}; + +static struct nf_hook_ops ipv6_conntrack_local_out_ops = { + .hook = ipv6_conntrack_local, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_IP6_LOCAL_OUT, + .priority = NF_IP6_PRI_CONNTRACK, +}; + +static struct nf_hook_ops ipv6_conntrack_defrag_local_out_ops = { + .hook = ipv6_defrag, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_IP6_LOCAL_OUT, + .priority = NF_IP6_PRI_CONNTRACK_DEFRAG, +}; + +/* Refragmenter; last chance. */ +static struct nf_hook_ops ipv6_conntrack_out_ops = { + .hook = ipv6_confirm, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_IP6_POST_ROUTING, + .priority = NF_IP6_PRI_LAST, +}; + +static struct nf_hook_ops ipv6_conntrack_local_in_ops = { + .hook = ipv6_confirm, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_IP6_LOCAL_IN, + .priority = NF_IP6_PRI_LAST-1, +}; + +#ifdef CONFIG_SYSCTL + +/* From nf_conntrack_proto_icmpv6.c */ +extern unsigned long nf_ct_icmpv6_timeout; + +/* From nf_conntrack_frag6.c */ +extern unsigned long nf_ct_frag6_timeout; +extern unsigned int nf_ct_frag6_low_thresh; +extern unsigned int nf_ct_frag6_high_thresh; + +static struct ctl_table_header *nf_ct_ipv6_sysctl_header; + +static ctl_table nf_ct_sysctl_table[] = { + { + .ctl_name = NET_NF_CONNTRACK_ICMPV6_TIMEOUT, + .procname = "nf_conntrack_icmpv6_timeout", + .data = &nf_ct_icmpv6_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_FRAG6_TIMEOUT, + .procname = "nf_conntrack_frag6_timeout", + .data = &nf_ct_frag6_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_FRAG6_LOW_THRESH, + .procname = "nf_conntrack_frag6_low_thresh", + .data = &nf_ct_frag6_low_thresh, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_NF_CONNTRACK_FRAG6_HIGH_THRESH, + .procname = "nf_conntrack_frag6_high_thresh", + .data = &nf_ct_frag6_high_thresh, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = 0 } +}; + +static ctl_table nf_ct_netfilter_table[] = { + { + .ctl_name = NET_NETFILTER, + .procname = "netfilter", + .mode = 0555, + .child = nf_ct_sysctl_table, + }, + { .ctl_name = 0 } +}; + +static ctl_table nf_ct_net_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = nf_ct_netfilter_table, + }, + { .ctl_name = 0 } +}; +#endif + +struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 = { + .l3proto = PF_INET6, + .name = "ipv6", + .pkt_to_tuple = ipv6_pkt_to_tuple, + .invert_tuple = ipv6_invert_tuple, + .print_tuple = ipv6_print_tuple, + .print_conntrack = ipv6_print_conntrack, + .prepare = ipv6_prepare, + .get_features = ipv6_get_features, + .me = THIS_MODULE, +}; + +extern struct nf_conntrack_protocol nf_conntrack_protocol_tcp6; +extern struct nf_conntrack_protocol nf_conntrack_protocol_udp6; +extern struct nf_conntrack_protocol nf_conntrack_protocol_icmpv6; +extern int nf_ct_frag6_init(void); +extern void nf_ct_frag6_cleanup(void); +static int init_or_cleanup(int init) +{ + int ret = 0; + + if (!init) goto cleanup; + + ret = nf_ct_frag6_init(); + if (ret < 0) { + printk("nf_conntrack_ipv6: can't initialize frag6.\n"); + goto cleanup_nothing; + } + ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_tcp6); + if (ret < 0) { + printk("nf_conntrack_ipv6: can't register tcp.\n"); + goto cleanup_frag6; + } + + ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_udp6); + if (ret < 0) { + printk("nf_conntrack_ipv6: can't register udp.\n"); + goto cleanup_tcp; + } + + ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_icmpv6); + if (ret < 0) { + printk("nf_conntrack_ipv6: can't register icmpv6.\n"); + goto cleanup_udp; + } + + ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv6); + if (ret < 0) { + printk("nf_conntrack_ipv6: can't register ipv6\n"); + goto cleanup_icmpv6; + } + + ret = nf_register_hook(&ipv6_conntrack_defrag_ops); + if (ret < 0) { + printk("nf_conntrack_ipv6: can't register pre-routing defrag " + "hook.\n"); + goto cleanup_ipv6; + } + + ret = nf_register_hook(&ipv6_conntrack_defrag_local_out_ops); + if (ret < 0) { + printk("nf_conntrack_ipv6: can't register local_out defrag " + "hook.\n"); + goto cleanup_defragops; + } + + ret = nf_register_hook(&ipv6_conntrack_in_ops); + if (ret < 0) { + printk("nf_conntrack_ipv6: can't register pre-routing hook.\n"); + goto cleanup_defraglocalops; + } + + ret = nf_register_hook(&ipv6_conntrack_local_out_ops); + if (ret < 0) { + printk("nf_conntrack_ipv6: can't register local out hook.\n"); + goto cleanup_inops; + } + + ret = nf_register_hook(&ipv6_conntrack_out_ops); + if (ret < 0) { + printk("nf_conntrack_ipv6: can't register post-routing hook.\n"); + goto cleanup_inandlocalops; + } + + ret = nf_register_hook(&ipv6_conntrack_local_in_ops); + if (ret < 0) { + printk("nf_conntrack_ipv6: can't register local in hook.\n"); + goto cleanup_inoutandlocalops; + } + +#ifdef CONFIG_SYSCTL + nf_ct_ipv6_sysctl_header = register_sysctl_table(nf_ct_net_table, 0); + if (nf_ct_ipv6_sysctl_header == NULL) { + printk("nf_conntrack: can't register to sysctl.\n"); + ret = -ENOMEM; + goto cleanup_localinops; + } +#endif + return ret; + + cleanup: + synchronize_net(); +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(nf_ct_ipv6_sysctl_header); + cleanup_localinops: +#endif + nf_unregister_hook(&ipv6_conntrack_local_in_ops); + cleanup_inoutandlocalops: + nf_unregister_hook(&ipv6_conntrack_out_ops); + cleanup_inandlocalops: + nf_unregister_hook(&ipv6_conntrack_local_out_ops); + cleanup_inops: + nf_unregister_hook(&ipv6_conntrack_in_ops); + cleanup_defraglocalops: + nf_unregister_hook(&ipv6_conntrack_defrag_local_out_ops); + cleanup_defragops: + nf_unregister_hook(&ipv6_conntrack_defrag_ops); + cleanup_ipv6: + nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv6); + cleanup_icmpv6: + nf_conntrack_protocol_unregister(&nf_conntrack_protocol_icmpv6); + cleanup_udp: + nf_conntrack_protocol_unregister(&nf_conntrack_protocol_udp6); + cleanup_tcp: + nf_conntrack_protocol_unregister(&nf_conntrack_protocol_tcp6); + cleanup_frag6: + nf_ct_frag6_cleanup(); + cleanup_nothing: + return ret; +} + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Yasuyuki KOZAKAI @USAGI <yasuyuki.kozakai@toshiba.co.jp>"); + +static int __init init(void) +{ + need_nf_conntrack(); + return init_or_cleanup(1); +} + +static void __exit fini(void) +{ + init_or_cleanup(0); +} + +module_init(init); +module_exit(fini); + +void need_ip6_conntrack(void) +{ +} + +EXPORT_SYMBOL(need_ip6_conntrack); diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c new file mode 100644 index 00000000000..c0f1da5497a --- /dev/null +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -0,0 +1,272 @@ +/* + * Copyright (C)2003,2004 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Author: + * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> + * + * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> + * - ICMPv6 tracking support. Derived from the original ip_conntrack code + * net/ipv4/netfilter/ip_conntrack_proto_icmp.c which had the following + * copyright information: + * (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + */ + +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/in6.h> +#include <linux/icmpv6.h> +#include <linux/ipv6.h> +#include <net/ipv6.h> +#include <net/ip6_checksum.h> +#include <linux/seq_file.h> +#include <linux/netfilter_ipv6.h> +#include <net/netfilter/nf_conntrack_tuple.h> +#include <net/netfilter/nf_conntrack_protocol.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/ipv6/nf_conntrack_icmpv6.h> + +unsigned long nf_ct_icmpv6_timeout = 30*HZ; + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +static int icmpv6_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +{ + struct icmp6hdr _hdr, *hp; + + hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (hp == NULL) + return 0; + tuple->dst.u.icmp.type = hp->icmp6_type; + tuple->src.u.icmp.id = hp->icmp6_identifier; + tuple->dst.u.icmp.code = hp->icmp6_code; + + return 1; +} + +static int icmpv6_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + /* Add 1; spaces filled with 0. */ + static u_int8_t invmap[] = { + [ICMPV6_ECHO_REQUEST - 128] = ICMPV6_ECHO_REPLY + 1, + [ICMPV6_ECHO_REPLY - 128] = ICMPV6_ECHO_REQUEST + 1, + [ICMPV6_NI_QUERY - 128] = ICMPV6_NI_QUERY + 1, + [ICMPV6_NI_REPLY - 128] = ICMPV6_NI_REPLY +1 + }; + + __u8 type = orig->dst.u.icmp.type - 128; + if (type >= sizeof(invmap) || !invmap[type]) + return 0; + + tuple->src.u.icmp.id = orig->src.u.icmp.id; + tuple->dst.u.icmp.type = invmap[type] - 1; + tuple->dst.u.icmp.code = orig->dst.u.icmp.code; + return 1; +} + +/* Print out the per-protocol part of the tuple. */ +static int icmpv6_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return seq_printf(s, "type=%u code=%u id=%u ", + tuple->dst.u.icmp.type, + tuple->dst.u.icmp.code, + ntohs(tuple->src.u.icmp.id)); +} + +/* Print out the private part of the conntrack. */ +static int icmpv6_print_conntrack(struct seq_file *s, + const struct nf_conn *conntrack) +{ + return 0; +} + +/* Returns verdict for packet, or -1 for invalid. */ +static int icmpv6_packet(struct nf_conn *ct, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + int pf, + unsigned int hooknum) +{ + /* Try to delete connection immediately after all replies: + won't actually vanish as we still have skb, and del_timer + means this will only run once even if count hits zero twice + (theoretically possible with SMP) */ + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) { + if (atomic_dec_and_test(&ct->proto.icmp.count) + && del_timer(&ct->timeout)) + ct->timeout.function((unsigned long)ct); + } else { + atomic_inc(&ct->proto.icmp.count); + nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); + nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmpv6_timeout); + } + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static int icmpv6_new(struct nf_conn *conntrack, + const struct sk_buff *skb, + unsigned int dataoff) +{ + static u_int8_t valid_new[] = { + [ICMPV6_ECHO_REQUEST - 128] = 1, + [ICMPV6_NI_QUERY - 128] = 1 + }; + + if (conntrack->tuplehash[0].tuple.dst.u.icmp.type - 128 >= sizeof(valid_new) + || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type - 128]) { + /* Can't create a new ICMPv6 `conn' with this. */ + DEBUGP("icmp: can't create new conn with type %u\n", + conntrack->tuplehash[0].tuple.dst.u.icmp.type); + NF_CT_DUMP_TUPLE(&conntrack->tuplehash[0].tuple); + return 0; + } + atomic_set(&conntrack->proto.icmp.count, 0); + return 1; +} + +extern int +nf_ct_ipv6_skip_exthdr(struct sk_buff *skb, int start, u8 *nexthdrp, int len); +extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6; +static int +icmpv6_error_message(struct sk_buff *skb, + unsigned int icmp6off, + enum ip_conntrack_info *ctinfo, + unsigned int hooknum) +{ + struct nf_conntrack_tuple intuple, origtuple; + struct nf_conntrack_tuple_hash *h; + struct icmp6hdr _hdr, *hp; + unsigned int inip6off; + struct nf_conntrack_protocol *inproto; + u_int8_t inprotonum; + unsigned int inprotoff; + + NF_CT_ASSERT(skb->nfct == NULL); + + hp = skb_header_pointer(skb, icmp6off, sizeof(_hdr), &_hdr); + if (hp == NULL) { + DEBUGP("icmpv6_error: Can't get ICMPv6 hdr.\n"); + return -NF_ACCEPT; + } + + inip6off = icmp6off + sizeof(_hdr); + if (skb_copy_bits(skb, inip6off+offsetof(struct ipv6hdr, nexthdr), + &inprotonum, sizeof(inprotonum)) != 0) { + DEBUGP("icmpv6_error: Can't get nexthdr in inner IPv6 header.\n"); + return -NF_ACCEPT; + } + inprotoff = nf_ct_ipv6_skip_exthdr(skb, + inip6off + sizeof(struct ipv6hdr), + &inprotonum, + skb->len - inip6off + - sizeof(struct ipv6hdr)); + + if ((inprotoff < 0) || (inprotoff > skb->len) || + (inprotonum == NEXTHDR_FRAGMENT)) { + DEBUGP("icmpv6_error: Can't get protocol header in ICMPv6 payload.\n"); + return -NF_ACCEPT; + } + + inproto = nf_ct_find_proto(PF_INET6, inprotonum); + + /* Are they talking about one of our connections? */ + if (!nf_ct_get_tuple(skb, inip6off, inprotoff, PF_INET6, inprotonum, + &origtuple, &nf_conntrack_l3proto_ipv6, inproto)) { + DEBUGP("icmpv6_error: Can't get tuple\n"); + return -NF_ACCEPT; + } + + /* Ordinarily, we'd expect the inverted tupleproto, but it's + been preserved inside the ICMP. */ + if (!nf_ct_invert_tuple(&intuple, &origtuple, + &nf_conntrack_l3proto_ipv6, inproto)) { + DEBUGP("icmpv6_error: Can't invert tuple\n"); + return -NF_ACCEPT; + } + + *ctinfo = IP_CT_RELATED; + + h = nf_conntrack_find_get(&intuple, NULL); + if (!h) { + DEBUGP("icmpv6_error: no match\n"); + return -NF_ACCEPT; + } else { + if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) + *ctinfo += IP_CT_IS_REPLY; + } + + /* Update skb to refer to this connection */ + skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general; + skb->nfctinfo = *ctinfo; + return -NF_ACCEPT; +} + +static int +icmpv6_error(struct sk_buff *skb, unsigned int dataoff, + enum ip_conntrack_info *ctinfo, int pf, unsigned int hooknum) +{ + struct icmp6hdr _ih, *icmp6h; + + icmp6h = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih); + if (icmp6h == NULL) { + if (LOG_INVALID(IPPROTO_ICMPV6)) + nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL, + "nf_ct_icmpv6: short packet "); + return -NF_ACCEPT; + } + + if (hooknum != NF_IP6_PRE_ROUTING) + goto skipped; + + /* Ignore it if the checksum's bogus. */ + if (csum_ipv6_magic(&skb->nh.ipv6h->saddr, &skb->nh.ipv6h->daddr, + skb->len - dataoff, IPPROTO_ICMPV6, + skb_checksum(skb, dataoff, + skb->len - dataoff, 0))) { + nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL, + "nf_ct_icmpv6: ICMPv6 checksum failed\n"); + return -NF_ACCEPT; + } + +skipped: + + /* is not error message ? */ + if (icmp6h->icmp6_type >= 128) + return NF_ACCEPT; + + return icmpv6_error_message(skb, dataoff, ctinfo, hooknum); +} + +struct nf_conntrack_protocol nf_conntrack_protocol_icmpv6 = +{ + .l3proto = PF_INET6, + .proto = IPPROTO_ICMPV6, + .name = "icmpv6", + .pkt_to_tuple = icmpv6_pkt_to_tuple, + .invert_tuple = icmpv6_invert_tuple, + .print_tuple = icmpv6_print_tuple, + .print_conntrack = icmpv6_print_conntrack, + .packet = icmpv6_packet, + .new = icmpv6_new, + .error = icmpv6_error, +}; + +EXPORT_SYMBOL(nf_conntrack_protocol_icmpv6); diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c new file mode 100644 index 00000000000..c2c52af9e56 --- /dev/null +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -0,0 +1,897 @@ +/* + * IPv6 fragment reassembly for connection tracking + * + * Copyright (C)2004 USAGI/WIDE Project + * + * Author: + * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> + * + * Based on: net/ipv6/reassembly.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/jiffies.h> +#include <linux/net.h> +#include <linux/list.h> +#include <linux/netdevice.h> +#include <linux/in6.h> +#include <linux/ipv6.h> +#include <linux/icmpv6.h> +#include <linux/random.h> +#include <linux/jhash.h> + +#include <net/sock.h> +#include <net/snmp.h> + +#include <net/ipv6.h> +#include <net/protocol.h> +#include <net/transp_v6.h> +#include <net/rawv6.h> +#include <net/ndisc.h> +#include <net/addrconf.h> +#include <linux/sysctl.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv6.h> +#include <linux/kernel.h> +#include <linux/module.h> + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +#define NF_CT_FRAG6_HIGH_THRESH 262144 /* == 256*1024 */ +#define NF_CT_FRAG6_LOW_THRESH 196608 /* == 192*1024 */ +#define NF_CT_FRAG6_TIMEOUT IPV6_FRAG_TIMEOUT + +unsigned int nf_ct_frag6_high_thresh = 256*1024; +unsigned int nf_ct_frag6_low_thresh = 192*1024; +unsigned long nf_ct_frag6_timeout = IPV6_FRAG_TIMEOUT; + +struct nf_ct_frag6_skb_cb +{ + struct inet6_skb_parm h; + int offset; + struct sk_buff *orig; +}; + +#define NFCT_FRAG6_CB(skb) ((struct nf_ct_frag6_skb_cb*)((skb)->cb)) + +struct nf_ct_frag6_queue +{ + struct nf_ct_frag6_queue *next; + struct list_head lru_list; /* lru list member */ + + __u32 id; /* fragment id */ + struct in6_addr saddr; + struct in6_addr daddr; + + spinlock_t lock; + atomic_t refcnt; + struct timer_list timer; /* expire timer */ + struct sk_buff *fragments; + int len; + int meat; + struct timeval stamp; + unsigned int csum; + __u8 last_in; /* has first/last segment arrived? */ +#define COMPLETE 4 +#define FIRST_IN 2 +#define LAST_IN 1 + __u16 nhoffset; + struct nf_ct_frag6_queue **pprev; +}; + +/* Hash table. */ + +#define FRAG6Q_HASHSZ 64 + +static struct nf_ct_frag6_queue *nf_ct_frag6_hash[FRAG6Q_HASHSZ]; +static rwlock_t nf_ct_frag6_lock = RW_LOCK_UNLOCKED; +static u32 nf_ct_frag6_hash_rnd; +static LIST_HEAD(nf_ct_frag6_lru_list); +int nf_ct_frag6_nqueues = 0; + +static __inline__ void __fq_unlink(struct nf_ct_frag6_queue *fq) +{ + if (fq->next) + fq->next->pprev = fq->pprev; + *fq->pprev = fq->next; + list_del(&fq->lru_list); + nf_ct_frag6_nqueues--; +} + +static __inline__ void fq_unlink(struct nf_ct_frag6_queue *fq) +{ + write_lock(&nf_ct_frag6_lock); + __fq_unlink(fq); + write_unlock(&nf_ct_frag6_lock); +} + +static unsigned int ip6qhashfn(u32 id, struct in6_addr *saddr, + struct in6_addr *daddr) +{ + u32 a, b, c; + + a = saddr->s6_addr32[0]; + b = saddr->s6_addr32[1]; + c = saddr->s6_addr32[2]; + + a += JHASH_GOLDEN_RATIO; + b += JHASH_GOLDEN_RATIO; + c += nf_ct_frag6_hash_rnd; + __jhash_mix(a, b, c); + + a += saddr->s6_addr32[3]; + b += daddr->s6_addr32[0]; + c += daddr->s6_addr32[1]; + __jhash_mix(a, b, c); + + a += daddr->s6_addr32[2]; + b += daddr->s6_addr32[3]; + c += id; + __jhash_mix(a, b, c); + + return c & (FRAG6Q_HASHSZ - 1); +} + +static struct timer_list nf_ct_frag6_secret_timer; +int nf_ct_frag6_secret_interval = 10 * 60 * HZ; + +static void nf_ct_frag6_secret_rebuild(unsigned long dummy) +{ + unsigned long now = jiffies; + int i; + + write_lock(&nf_ct_frag6_lock); + get_random_bytes(&nf_ct_frag6_hash_rnd, sizeof(u32)); + for (i = 0; i < FRAG6Q_HASHSZ; i++) { + struct nf_ct_frag6_queue *q; + + q = nf_ct_frag6_hash[i]; + while (q) { + struct nf_ct_frag6_queue *next = q->next; + unsigned int hval = ip6qhashfn(q->id, + &q->saddr, + &q->daddr); + + if (hval != i) { + /* Unlink. */ + if (q->next) + q->next->pprev = q->pprev; + *q->pprev = q->next; + + /* Relink to new hash chain. */ + if ((q->next = nf_ct_frag6_hash[hval]) != NULL) + q->next->pprev = &q->next; + nf_ct_frag6_hash[hval] = q; + q->pprev = &nf_ct_frag6_hash[hval]; + } + + q = next; + } + } + write_unlock(&nf_ct_frag6_lock); + + mod_timer(&nf_ct_frag6_secret_timer, now + nf_ct_frag6_secret_interval); +} + +atomic_t nf_ct_frag6_mem = ATOMIC_INIT(0); + +/* Memory Tracking Functions. */ +static inline void frag_kfree_skb(struct sk_buff *skb, unsigned int *work) +{ + if (work) + *work -= skb->truesize; + atomic_sub(skb->truesize, &nf_ct_frag6_mem); + if (NFCT_FRAG6_CB(skb)->orig) + kfree_skb(NFCT_FRAG6_CB(skb)->orig); + + kfree_skb(skb); +} + +static inline void frag_free_queue(struct nf_ct_frag6_queue *fq, + unsigned int *work) +{ + if (work) + *work -= sizeof(struct nf_ct_frag6_queue); + atomic_sub(sizeof(struct nf_ct_frag6_queue), &nf_ct_frag6_mem); + kfree(fq); +} + +static inline struct nf_ct_frag6_queue *frag_alloc_queue(void) +{ + struct nf_ct_frag6_queue *fq = kmalloc(sizeof(struct nf_ct_frag6_queue), GFP_ATOMIC); + + if (!fq) + return NULL; + atomic_add(sizeof(struct nf_ct_frag6_queue), &nf_ct_frag6_mem); + return fq; +} + +/* Destruction primitives. */ + +/* Complete destruction of fq. */ +static void nf_ct_frag6_destroy(struct nf_ct_frag6_queue *fq, + unsigned int *work) +{ + struct sk_buff *fp; + + BUG_TRAP(fq->last_in&COMPLETE); + BUG_TRAP(del_timer(&fq->timer) == 0); + + /* Release all fragment data. */ + fp = fq->fragments; + while (fp) { + struct sk_buff *xp = fp->next; + + frag_kfree_skb(fp, work); + fp = xp; + } + + frag_free_queue(fq, work); +} + +static __inline__ void fq_put(struct nf_ct_frag6_queue *fq, unsigned int *work) +{ + if (atomic_dec_and_test(&fq->refcnt)) + nf_ct_frag6_destroy(fq, work); +} + +/* Kill fq entry. It is not destroyed immediately, + * because caller (and someone more) holds reference count. + */ +static __inline__ void fq_kill(struct nf_ct_frag6_queue *fq) +{ + if (del_timer(&fq->timer)) + atomic_dec(&fq->refcnt); + + if (!(fq->last_in & COMPLETE)) { + fq_unlink(fq); + atomic_dec(&fq->refcnt); + fq->last_in |= COMPLETE; + } +} + +static void nf_ct_frag6_evictor(void) +{ + struct nf_ct_frag6_queue *fq; + struct list_head *tmp; + unsigned int work; + + work = atomic_read(&nf_ct_frag6_mem); + if (work <= nf_ct_frag6_low_thresh) + return; + + work -= nf_ct_frag6_low_thresh; + while (work > 0) { + read_lock(&nf_ct_frag6_lock); + if (list_empty(&nf_ct_frag6_lru_list)) { + read_unlock(&nf_ct_frag6_lock); + return; + } + tmp = nf_ct_frag6_lru_list.next; + BUG_ON(tmp == NULL); + fq = list_entry(tmp, struct nf_ct_frag6_queue, lru_list); + atomic_inc(&fq->refcnt); + read_unlock(&nf_ct_frag6_lock); + + spin_lock(&fq->lock); + if (!(fq->last_in&COMPLETE)) + fq_kill(fq); + spin_unlock(&fq->lock); + + fq_put(fq, &work); + } +} + +static void nf_ct_frag6_expire(unsigned long data) +{ + struct nf_ct_frag6_queue *fq = (struct nf_ct_frag6_queue *) data; + + spin_lock(&fq->lock); + + if (fq->last_in & COMPLETE) + goto out; + + fq_kill(fq); + +out: + spin_unlock(&fq->lock); + fq_put(fq, NULL); +} + +/* Creation primitives. */ + + +static struct nf_ct_frag6_queue *nf_ct_frag6_intern(unsigned int hash, + struct nf_ct_frag6_queue *fq_in) +{ + struct nf_ct_frag6_queue *fq; + + write_lock(&nf_ct_frag6_lock); +#ifdef CONFIG_SMP + for (fq = nf_ct_frag6_hash[hash]; fq; fq = fq->next) { + if (fq->id == fq_in->id && + !ipv6_addr_cmp(&fq_in->saddr, &fq->saddr) && + !ipv6_addr_cmp(&fq_in->daddr, &fq->daddr)) { + atomic_inc(&fq->refcnt); + write_unlock(&nf_ct_frag6_lock); + fq_in->last_in |= COMPLETE; + fq_put(fq_in, NULL); + return fq; + } + } +#endif + fq = fq_in; + + if (!mod_timer(&fq->timer, jiffies + nf_ct_frag6_timeout)) + atomic_inc(&fq->refcnt); + + atomic_inc(&fq->refcnt); + if ((fq->next = nf_ct_frag6_hash[hash]) != NULL) + fq->next->pprev = &fq->next; + nf_ct_frag6_hash[hash] = fq; + fq->pprev = &nf_ct_frag6_hash[hash]; + INIT_LIST_HEAD(&fq->lru_list); + list_add_tail(&fq->lru_list, &nf_ct_frag6_lru_list); + nf_ct_frag6_nqueues++; + write_unlock(&nf_ct_frag6_lock); + return fq; +} + + +static struct nf_ct_frag6_queue * +nf_ct_frag6_create(unsigned int hash, u32 id, struct in6_addr *src, struct in6_addr *dst) +{ + struct nf_ct_frag6_queue *fq; + + if ((fq = frag_alloc_queue()) == NULL) { + DEBUGP("Can't alloc new queue\n"); + goto oom; + } + + memset(fq, 0, sizeof(struct nf_ct_frag6_queue)); + + fq->id = id; + ipv6_addr_copy(&fq->saddr, src); + ipv6_addr_copy(&fq->daddr, dst); + + init_timer(&fq->timer); + fq->timer.function = nf_ct_frag6_expire; + fq->timer.data = (long) fq; + fq->lock = SPIN_LOCK_UNLOCKED; + atomic_set(&fq->refcnt, 1); + + return nf_ct_frag6_intern(hash, fq); + +oom: + return NULL; +} + +static __inline__ struct nf_ct_frag6_queue * +fq_find(u32 id, struct in6_addr *src, struct in6_addr *dst) +{ + struct nf_ct_frag6_queue *fq; + unsigned int hash = ip6qhashfn(id, src, dst); + + read_lock(&nf_ct_frag6_lock); + for (fq = nf_ct_frag6_hash[hash]; fq; fq = fq->next) { + if (fq->id == id && + !ipv6_addr_cmp(src, &fq->saddr) && + !ipv6_addr_cmp(dst, &fq->daddr)) { + atomic_inc(&fq->refcnt); + read_unlock(&nf_ct_frag6_lock); + return fq; + } + } + read_unlock(&nf_ct_frag6_lock); + + return nf_ct_frag6_create(hash, id, src, dst); +} + + +static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb, + struct frag_hdr *fhdr, int nhoff) +{ + struct sk_buff *prev, *next; + int offset, end; + + if (fq->last_in & COMPLETE) { + DEBUGP("Allready completed\n"); + goto err; + } + + offset = ntohs(fhdr->frag_off) & ~0x7; + end = offset + (ntohs(skb->nh.ipv6h->payload_len) - + ((u8 *) (fhdr + 1) - (u8 *) (skb->nh.ipv6h + 1))); + + if ((unsigned int)end > IPV6_MAXPLEN) { + DEBUGP("offset is too large.\n"); + return -1; + } + + if (skb->ip_summed == CHECKSUM_HW) + skb->csum = csum_sub(skb->csum, + csum_partial(skb->nh.raw, + (u8*)(fhdr + 1) - skb->nh.raw, + 0)); + + /* Is this the final fragment? */ + if (!(fhdr->frag_off & htons(IP6_MF))) { + /* If we already have some bits beyond end + * or have different end, the segment is corrupted. + */ + if (end < fq->len || + ((fq->last_in & LAST_IN) && end != fq->len)) { + DEBUGP("already received last fragment\n"); + goto err; + } + fq->last_in |= LAST_IN; + fq->len = end; + } else { + /* Check if the fragment is rounded to 8 bytes. + * Required by the RFC. + */ + if (end & 0x7) { + /* RFC2460 says always send parameter problem in + * this case. -DaveM + */ + DEBUGP("the end of this fragment is not rounded to 8 bytes.\n"); + return -1; + } + if (end > fq->len) { + /* Some bits beyond end -> corruption. */ + if (fq->last_in & LAST_IN) { + DEBUGP("last packet already reached.\n"); + goto err; + } + fq->len = end; + } + } + + if (end == offset) + goto err; + + /* Point into the IP datagram 'data' part. */ + if (!pskb_pull(skb, (u8 *) (fhdr + 1) - skb->data)) { + DEBUGP("queue: message is too short.\n"); + goto err; + } + if (end-offset < skb->len) { + if (pskb_trim(skb, end - offset)) { + DEBUGP("Can't trim\n"); + goto err; + } + if (skb->ip_summed != CHECKSUM_UNNECESSARY) + skb->ip_summed = CHECKSUM_NONE; + } + + /* Find out which fragments are in front and at the back of us + * in the chain of fragments so far. We must know where to put + * this fragment, right? + */ + prev = NULL; + for (next = fq->fragments; next != NULL; next = next->next) { + if (NFCT_FRAG6_CB(next)->offset >= offset) + break; /* bingo! */ + prev = next; + } + + /* We found where to put this one. Check for overlap with + * preceding fragment, and, if needed, align things so that + * any overlaps are eliminated. + */ + if (prev) { + int i = (NFCT_FRAG6_CB(prev)->offset + prev->len) - offset; + + if (i > 0) { + offset += i; + if (end <= offset) { + DEBUGP("overlap\n"); + goto err; + } + if (!pskb_pull(skb, i)) { + DEBUGP("Can't pull\n"); + goto err; + } + if (skb->ip_summed != CHECKSUM_UNNECESSARY) + skb->ip_summed = CHECKSUM_NONE; + } + } + + /* Look for overlap with succeeding segments. + * If we can merge fragments, do it. + */ + while (next && NFCT_FRAG6_CB(next)->offset < end) { + /* overlap is 'i' bytes */ + int i = end - NFCT_FRAG6_CB(next)->offset; + + if (i < next->len) { + /* Eat head of the next overlapped fragment + * and leave the loop. The next ones cannot overlap. + */ + DEBUGP("Eat head of the overlapped parts.: %d", i); + if (!pskb_pull(next, i)) + goto err; + + /* next fragment */ + NFCT_FRAG6_CB(next)->offset += i; + fq->meat -= i; + if (next->ip_summed != CHECKSUM_UNNECESSARY) + next->ip_summed = CHECKSUM_NONE; + break; + } else { + struct sk_buff *free_it = next; + + /* Old fragmnet is completely overridden with + * new one drop it. + */ + next = next->next; + + if (prev) + prev->next = next; + else + fq->fragments = next; + + fq->meat -= free_it->len; + frag_kfree_skb(free_it, NULL); + } + } + + NFCT_FRAG6_CB(skb)->offset = offset; + + /* Insert this fragment in the chain of fragments. */ + skb->next = next; + if (prev) + prev->next = skb; + else + fq->fragments = skb; + + skb->dev = NULL; + skb_get_timestamp(skb, &fq->stamp); + fq->meat += skb->len; + atomic_add(skb->truesize, &nf_ct_frag6_mem); + + /* The first fragment. + * nhoffset is obtained from the first fragment, of course. + */ + if (offset == 0) { + fq->nhoffset = nhoff; + fq->last_in |= FIRST_IN; + } + write_lock(&nf_ct_frag6_lock); + list_move_tail(&fq->lru_list, &nf_ct_frag6_lru_list); + write_unlock(&nf_ct_frag6_lock); + return 0; + +err: + return -1; +} + +/* + * Check if this packet is complete. + * Returns NULL on failure by any reason, and pointer + * to current nexthdr field in reassembled frame. + * + * It is called with locked fq, and caller must check that + * queue is eligible for reassembly i.e. it is not COMPLETE, + * the last and the first frames arrived and all the bits are here. + */ +static struct sk_buff * +nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev) +{ + struct sk_buff *fp, *op, *head = fq->fragments; + int payload_len; + + fq_kill(fq); + + BUG_TRAP(head != NULL); + BUG_TRAP(NFCT_FRAG6_CB(head)->offset == 0); + + /* Unfragmented part is taken from the first segment. */ + payload_len = (head->data - head->nh.raw) - sizeof(struct ipv6hdr) + fq->len - sizeof(struct frag_hdr); + if (payload_len > IPV6_MAXPLEN) { + DEBUGP("payload len is too large.\n"); + goto out_oversize; + } + + /* Head of list must not be cloned. */ + if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC)) { + DEBUGP("skb is cloned but can't expand head"); + goto out_oom; + } + + /* If the first fragment is fragmented itself, we split + * it to two chunks: the first with data and paged part + * and the second, holding only fragments. */ + if (skb_shinfo(head)->frag_list) { + struct sk_buff *clone; + int i, plen = 0; + + if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL) { + DEBUGP("Can't alloc skb\n"); + goto out_oom; + } + clone->next = head->next; + head->next = clone; + skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; + skb_shinfo(head)->frag_list = NULL; + for (i=0; i<skb_shinfo(head)->nr_frags; i++) + plen += skb_shinfo(head)->frags[i].size; + clone->len = clone->data_len = head->data_len - plen; + head->data_len -= clone->len; + head->len -= clone->len; + clone->csum = 0; + clone->ip_summed = head->ip_summed; + + NFCT_FRAG6_CB(clone)->orig = NULL; + atomic_add(clone->truesize, &nf_ct_frag6_mem); + } + + /* We have to remove fragment header from datagram and to relocate + * header in order to calculate ICV correctly. */ + head->nh.raw[fq->nhoffset] = head->h.raw[0]; + memmove(head->head + sizeof(struct frag_hdr), head->head, + (head->data - head->head) - sizeof(struct frag_hdr)); + head->mac.raw += sizeof(struct frag_hdr); + head->nh.raw += sizeof(struct frag_hdr); + + skb_shinfo(head)->frag_list = head->next; + head->h.raw = head->data; + skb_push(head, head->data - head->nh.raw); + atomic_sub(head->truesize, &nf_ct_frag6_mem); + + for (fp=head->next; fp; fp = fp->next) { + head->data_len += fp->len; + head->len += fp->len; + if (head->ip_summed != fp->ip_summed) + head->ip_summed = CHECKSUM_NONE; + else if (head->ip_summed == CHECKSUM_HW) + head->csum = csum_add(head->csum, fp->csum); + head->truesize += fp->truesize; + atomic_sub(fp->truesize, &nf_ct_frag6_mem); + } + + head->next = NULL; + head->dev = dev; + skb_set_timestamp(head, &fq->stamp); + head->nh.ipv6h->payload_len = htons(payload_len); + + /* Yes, and fold redundant checksum back. 8) */ + if (head->ip_summed == CHECKSUM_HW) + head->csum = csum_partial(head->nh.raw, head->h.raw-head->nh.raw, head->csum); + + fq->fragments = NULL; + + /* all original skbs are linked into the NFCT_FRAG6_CB(head).orig */ + fp = skb_shinfo(head)->frag_list; + if (NFCT_FRAG6_CB(fp)->orig == NULL) + /* at above code, head skb is divided into two skbs. */ + fp = fp->next; + + op = NFCT_FRAG6_CB(head)->orig; + for (; fp; fp = fp->next) { + struct sk_buff *orig = NFCT_FRAG6_CB(fp)->orig; + + op->next = orig; + op = orig; + NFCT_FRAG6_CB(fp)->orig = NULL; + } + + return head; + +out_oversize: + if (net_ratelimit()) + printk(KERN_DEBUG "nf_ct_frag6_reasm: payload len = %d\n", payload_len); + goto out_fail; +out_oom: + if (net_ratelimit()) + printk(KERN_DEBUG "nf_ct_frag6_reasm: no memory for reassembly\n"); +out_fail: + return NULL; +} + +/* + * find the header just before Fragment Header. + * + * if success return 0 and set ... + * (*prevhdrp): the value of "Next Header Field" in the header + * just before Fragment Header. + * (*prevhoff): the offset of "Next Header Field" in the header + * just before Fragment Header. + * (*fhoff) : the offset of Fragment Header. + * + * Based on ipv6_skip_hdr() in net/ipv6/exthdr.c + * + */ +static int +find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff) +{ + u8 nexthdr = skb->nh.ipv6h->nexthdr; + u8 prev_nhoff = (u8 *)&skb->nh.ipv6h->nexthdr - skb->data; + int start = (u8 *)(skb->nh.ipv6h+1) - skb->data; + int len = skb->len - start; + u8 prevhdr = NEXTHDR_IPV6; + + while (nexthdr != NEXTHDR_FRAGMENT) { + struct ipv6_opt_hdr hdr; + int hdrlen; + + if (!ipv6_ext_hdr(nexthdr)) { + return -1; + } + if (len < (int)sizeof(struct ipv6_opt_hdr)) { + DEBUGP("too short\n"); + return -1; + } + if (nexthdr == NEXTHDR_NONE) { + DEBUGP("next header is none\n"); + return -1; + } + if (skb_copy_bits(skb, start, &hdr, sizeof(hdr))) + BUG(); + if (nexthdr == NEXTHDR_AUTH) + hdrlen = (hdr.hdrlen+2)<<2; + else + hdrlen = ipv6_optlen(&hdr); + + prevhdr = nexthdr; + prev_nhoff = start; + + nexthdr = hdr.nexthdr; + len -= hdrlen; + start += hdrlen; + } + + if (len < 0) + return -1; + + *prevhdrp = prevhdr; + *prevhoff = prev_nhoff; + *fhoff = start; + + return 0; +} + +struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb) +{ + struct sk_buff *clone; + struct net_device *dev = skb->dev; + struct frag_hdr *fhdr; + struct nf_ct_frag6_queue *fq; + struct ipv6hdr *hdr; + int fhoff, nhoff; + u8 prevhdr; + struct sk_buff *ret_skb = NULL; + + /* Jumbo payload inhibits frag. header */ + if (skb->nh.ipv6h->payload_len == 0) { + DEBUGP("payload len = 0\n"); + return skb; + } + + if (find_prev_fhdr(skb, &prevhdr, &nhoff, &fhoff) < 0) + return skb; + + clone = skb_clone(skb, GFP_ATOMIC); + if (clone == NULL) { + DEBUGP("Can't clone skb\n"); + return skb; + } + + NFCT_FRAG6_CB(clone)->orig = skb; + + if (!pskb_may_pull(clone, fhoff + sizeof(*fhdr))) { + DEBUGP("message is too short.\n"); + goto ret_orig; + } + + clone->h.raw = clone->data + fhoff; + hdr = clone->nh.ipv6h; + fhdr = (struct frag_hdr *)clone->h.raw; + + if (!(fhdr->frag_off & htons(0xFFF9))) { + DEBUGP("Invalid fragment offset\n"); + /* It is not a fragmented frame */ + goto ret_orig; + } + + if (atomic_read(&nf_ct_frag6_mem) > nf_ct_frag6_high_thresh) + nf_ct_frag6_evictor(); + + fq = fq_find(fhdr->identification, &hdr->saddr, &hdr->daddr); + if (fq == NULL) { + DEBUGP("Can't find and can't create new queue\n"); + goto ret_orig; + } + + spin_lock(&fq->lock); + + if (nf_ct_frag6_queue(fq, clone, fhdr, nhoff) < 0) { + spin_unlock(&fq->lock); + DEBUGP("Can't insert skb to queue\n"); + fq_put(fq, NULL); + goto ret_orig; + } + + if (fq->last_in == (FIRST_IN|LAST_IN) && fq->meat == fq->len) { + ret_skb = nf_ct_frag6_reasm(fq, dev); + if (ret_skb == NULL) + DEBUGP("Can't reassemble fragmented packets\n"); + } + spin_unlock(&fq->lock); + + fq_put(fq, NULL); + return ret_skb; + +ret_orig: + kfree_skb(clone); + return skb; +} + +void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb, + struct net_device *in, struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct sk_buff *s, *s2; + + for (s = NFCT_FRAG6_CB(skb)->orig; s;) { + nf_conntrack_put_reasm(s->nfct_reasm); + nf_conntrack_get_reasm(skb); + s->nfct_reasm = skb; + + s2 = s->next; + NF_HOOK_THRESH(PF_INET6, hooknum, s, in, out, okfn, + NF_IP6_PRI_CONNTRACK_DEFRAG + 1); + s = s2; + } + nf_conntrack_put_reasm(skb); +} + +int nf_ct_frag6_kfree_frags(struct sk_buff *skb) +{ + struct sk_buff *s, *s2; + + for (s = NFCT_FRAG6_CB(skb)->orig; s; s = s2) { + + s2 = s->next; + kfree_skb(s); + } + + kfree_skb(skb); + + return 0; +} + +int nf_ct_frag6_init(void) +{ + nf_ct_frag6_hash_rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^ + (jiffies ^ (jiffies >> 6))); + + init_timer(&nf_ct_frag6_secret_timer); + nf_ct_frag6_secret_timer.function = nf_ct_frag6_secret_rebuild; + nf_ct_frag6_secret_timer.expires = jiffies + + nf_ct_frag6_secret_interval; + add_timer(&nf_ct_frag6_secret_timer); + + return 0; +} + +void nf_ct_frag6_cleanup(void) +{ + del_timer(&nf_ct_frag6_secret_timer); + nf_ct_frag6_low_thresh = 0; + nf_ct_frag6_evictor(); +} diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index a1265a320b1..8e9628f1c4c 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -174,8 +174,10 @@ int ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); /* Not releasing hash table! */ - if (clone) + if (clone) { + nf_reset(clone); rawv6_rcv(sk, clone); + } } sk = __raw_v6_lookup(sk_next(sk), nexthdr, daddr, saddr, IP6CB(skb)->iif); @@ -296,13 +298,10 @@ void rawv6_err(struct sock *sk, struct sk_buff *skb, static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb) { if ((raw6_sk(sk)->checksum || sk->sk_filter) && - skb->ip_summed != CHECKSUM_UNNECESSARY) { - if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { - /* FIXME: increment a raw6 drops counter here */ - kfree_skb(skb); - return 0; - } - skb->ip_summed = CHECKSUM_UNNECESSARY; + skb_checksum_complete(skb)) { + /* FIXME: increment a raw6 drops counter here */ + kfree_skb(skb); + return 0; } /* Charge it to the socket. */ @@ -335,32 +334,25 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) if (!rp->checksum) skb->ip_summed = CHECKSUM_UNNECESSARY; - if (skb->ip_summed != CHECKSUM_UNNECESSARY) { - if (skb->ip_summed == CHECKSUM_HW) { - skb_postpull_rcsum(skb, skb->nh.raw, - skb->h.raw - skb->nh.raw); + if (skb->ip_summed == CHECKSUM_HW) { + skb_postpull_rcsum(skb, skb->nh.raw, + skb->h.raw - skb->nh.raw); + if (!csum_ipv6_magic(&skb->nh.ipv6h->saddr, + &skb->nh.ipv6h->daddr, + skb->len, inet->num, skb->csum)) skb->ip_summed = CHECKSUM_UNNECESSARY; - if (csum_ipv6_magic(&skb->nh.ipv6h->saddr, - &skb->nh.ipv6h->daddr, - skb->len, inet->num, skb->csum)) { - LIMIT_NETDEBUG(KERN_DEBUG "raw v6 hw csum failure.\n"); - skb->ip_summed = CHECKSUM_NONE; - } - } - if (skb->ip_summed == CHECKSUM_NONE) - skb->csum = ~csum_ipv6_magic(&skb->nh.ipv6h->saddr, - &skb->nh.ipv6h->daddr, - skb->len, inet->num, 0); } + if (skb->ip_summed != CHECKSUM_UNNECESSARY) + skb->csum = ~csum_ipv6_magic(&skb->nh.ipv6h->saddr, + &skb->nh.ipv6h->daddr, + skb->len, inet->num, 0); if (inet->hdrincl) { - if (skb->ip_summed != CHECKSUM_UNNECESSARY && - (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { + if (skb_checksum_complete(skb)) { /* FIXME: increment a raw6 drops counter here */ kfree_skb(skb); return 0; } - skb->ip_summed = CHECKSUM_UNNECESSARY; } rawv6_rcv_skb(sk, skb); @@ -405,7 +397,7 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk, if (skb->ip_summed==CHECKSUM_UNNECESSARY) { err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); } else if (msg->msg_flags&MSG_TRUNC) { - if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) + if (__skb_checksum_complete(skb)) goto csum_copy_err; err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); } else { diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index e4fe9ee484d..5d316cb72ec 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -74,7 +74,7 @@ struct ip6frag_skb_cb struct frag_queue { - struct frag_queue *next; + struct hlist_node list; struct list_head lru_list; /* lru list member */ __u32 id; /* fragment id */ @@ -95,14 +95,13 @@ struct frag_queue #define FIRST_IN 2 #define LAST_IN 1 __u16 nhoffset; - struct frag_queue **pprev; }; /* Hash table. */ #define IP6Q_HASHSZ 64 -static struct frag_queue *ip6_frag_hash[IP6Q_HASHSZ]; +static struct hlist_head ip6_frag_hash[IP6Q_HASHSZ]; static DEFINE_RWLOCK(ip6_frag_lock); static u32 ip6_frag_hash_rnd; static LIST_HEAD(ip6_frag_lru_list); @@ -110,9 +109,7 @@ int ip6_frag_nqueues = 0; static __inline__ void __fq_unlink(struct frag_queue *fq) { - if(fq->next) - fq->next->pprev = fq->pprev; - *fq->pprev = fq->next; + hlist_del(&fq->list); list_del(&fq->lru_list); ip6_frag_nqueues--; } @@ -163,28 +160,21 @@ static void ip6_frag_secret_rebuild(unsigned long dummy) get_random_bytes(&ip6_frag_hash_rnd, sizeof(u32)); for (i = 0; i < IP6Q_HASHSZ; i++) { struct frag_queue *q; + struct hlist_node *p, *n; - q = ip6_frag_hash[i]; - while (q) { - struct frag_queue *next = q->next; + hlist_for_each_entry_safe(q, p, n, &ip6_frag_hash[i], list) { unsigned int hval = ip6qhashfn(q->id, &q->saddr, &q->daddr); if (hval != i) { - /* Unlink. */ - if (q->next) - q->next->pprev = q->pprev; - *q->pprev = q->next; + hlist_del(&q->list); /* Relink to new hash chain. */ - if ((q->next = ip6_frag_hash[hval]) != NULL) - q->next->pprev = &q->next; - ip6_frag_hash[hval] = q; - q->pprev = &ip6_frag_hash[hval]; - } + hlist_add_head(&q->list, + &ip6_frag_hash[hval]); - q = next; + } } } write_unlock(&ip6_frag_lock); @@ -337,10 +327,13 @@ static struct frag_queue *ip6_frag_intern(unsigned int hash, struct frag_queue *fq_in) { struct frag_queue *fq; +#ifdef CONFIG_SMP + struct hlist_node *n; +#endif write_lock(&ip6_frag_lock); #ifdef CONFIG_SMP - for (fq = ip6_frag_hash[hash]; fq; fq = fq->next) { + hlist_for_each_entry(fq, n, &ip6_frag_hash[hash], list) { if (fq->id == fq_in->id && ipv6_addr_equal(&fq_in->saddr, &fq->saddr) && ipv6_addr_equal(&fq_in->daddr, &fq->daddr)) { @@ -358,10 +351,7 @@ static struct frag_queue *ip6_frag_intern(unsigned int hash, atomic_inc(&fq->refcnt); atomic_inc(&fq->refcnt); - if((fq->next = ip6_frag_hash[hash]) != NULL) - fq->next->pprev = &fq->next; - ip6_frag_hash[hash] = fq; - fq->pprev = &ip6_frag_hash[hash]; + hlist_add_head(&fq->list, &ip6_frag_hash[hash]); INIT_LIST_HEAD(&fq->lru_list); list_add_tail(&fq->lru_list, &ip6_frag_lru_list); ip6_frag_nqueues++; @@ -401,10 +391,11 @@ static __inline__ struct frag_queue * fq_find(u32 id, struct in6_addr *src, struct in6_addr *dst) { struct frag_queue *fq; + struct hlist_node *n; unsigned int hash = ip6qhashfn(id, src, dst); read_lock(&ip6_frag_lock); - for(fq = ip6_frag_hash[hash]; fq; fq = fq->next) { + hlist_for_each_entry(fq, n, &ip6_frag_hash[hash], list) { if (fq->id == id && ipv6_addr_equal(src, &fq->saddr) && ipv6_addr_equal(dst, &fq->daddr)) { diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 227e99ed510..a7a537b5059 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1701,16 +1701,14 @@ static void fib6_dump_end(struct netlink_callback *cb) fib6_walker_unlink(w); kfree(w); } - if (cb->args[1]) { - cb->done = (void*)cb->args[1]; - cb->args[1] = 0; - } + cb->done = (void*)cb->args[1]; + cb->args[1] = 0; } static int fib6_dump_done(struct netlink_callback *cb) { fib6_dump_end(cb); - return cb->done(cb); + return cb->done ? cb->done(cb) : 0; } int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d693cb988b7..62c0e5bd931 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -114,16 +114,9 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum) int low = sysctl_local_port_range[0]; int high = sysctl_local_port_range[1]; int remaining = (high - low) + 1; - int rover; + int rover = net_random() % (high - low) + low; - spin_lock(&tcp_hashinfo.portalloc_lock); - if (tcp_hashinfo.port_rover < low) - rover = low; - else - rover = tcp_hashinfo.port_rover; - do { rover++; - if (rover > high) - rover = low; + do { head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)]; spin_lock(&head->lock); inet_bind_bucket_for_each(tb, node, &head->chain) @@ -132,9 +125,9 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum) break; next: spin_unlock(&head->lock); + if (++rover > high) + rover = low; } while (--remaining > 0); - tcp_hashinfo.port_rover = rover; - spin_unlock(&tcp_hashinfo.portalloc_lock); /* Exhausted local port range during search? It is not * possible for us to be holding one of the bind hash @@ -1408,20 +1401,18 @@ out: static int tcp_v6_checksum_init(struct sk_buff *skb) { if (skb->ip_summed == CHECKSUM_HW) { - skb->ip_summed = CHECKSUM_UNNECESSARY; if (!tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, - &skb->nh.ipv6h->daddr,skb->csum)) + &skb->nh.ipv6h->daddr,skb->csum)) { + skb->ip_summed = CHECKSUM_UNNECESSARY; return 0; - LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v6 csum failed\n"); + } } + + skb->csum = ~tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, + &skb->nh.ipv6h->daddr, 0); + if (skb->len <= 76) { - if (tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, - &skb->nh.ipv6h->daddr,skb_checksum(skb, 0, skb->len, 0))) - return -1; - skb->ip_summed = CHECKSUM_UNNECESSARY; - } else { - skb->csum = ~tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, - &skb->nh.ipv6h->daddr,0); + return __skb_checksum_complete(skb); } return 0; } @@ -1582,7 +1573,7 @@ static int tcp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) goto discard_it; if ((skb->ip_summed != CHECKSUM_UNNECESSARY && - tcp_v6_checksum_init(skb) < 0)) + tcp_v6_checksum_init(skb))) goto bad_packet; th = skb->h.th; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index bf9519341fd..e671153b47b 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -248,7 +248,7 @@ try_again: err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, copied); } else if (msg->msg_flags&MSG_TRUNC) { - if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) + if (__skb_checksum_complete(skb)) goto csum_copy_err; err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, copied); @@ -363,13 +363,10 @@ static inline int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) return -1; } - if (skb->ip_summed != CHECKSUM_UNNECESSARY) { - if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { - UDP6_INC_STATS_BH(UDP_MIB_INERRORS); - kfree_skb(skb); - return 0; - } - skb->ip_summed = CHECKSUM_UNNECESSARY; + if (skb_checksum_complete(skb)) { + UDP6_INC_STATS_BH(UDP_MIB_INERRORS); + kfree_skb(skb); + return 0; } if (sock_queue_rcv_skb(sk,skb)<0) { @@ -491,13 +488,10 @@ static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) uh = skb->h.uh; } - if (skb->ip_summed==CHECKSUM_HW) { + if (skb->ip_summed == CHECKSUM_HW && + !csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum)) skb->ip_summed = CHECKSUM_UNNECESSARY; - if (csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum)) { - LIMIT_NETDEBUG(KERN_DEBUG "udp v6 hw csum failure.\n"); - skb->ip_summed = CHECKSUM_NONE; - } - } + if (skb->ip_summed != CHECKSUM_UNNECESSARY) skb->csum = ~csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, 0); @@ -521,8 +515,7 @@ static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) goto discard; - if (skb->ip_summed != CHECKSUM_UNNECESSARY && - (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) + if (skb_checksum_complete(skb)) goto discard; UDP6_INC_STATS_BH(UDP_MIB_NOPORTS); diff --git a/net/irda/discovery.c b/net/irda/discovery.c index c4ba5fa1446..3fefc822c1c 100644 --- a/net/irda/discovery.c +++ b/net/irda/discovery.c @@ -194,8 +194,7 @@ void irlmp_expire_discoveries(hashbin_t *log, __u32 saddr, int force) /* Remove it from the log */ curr = hashbin_remove_this(log, (irda_queue_t *) curr); - if (curr) - kfree(curr); + kfree(curr); } } diff --git a/net/irda/irias_object.c b/net/irda/irias_object.c index 6fec428b451..75f2666e863 100644 --- a/net/irda/irias_object.c +++ b/net/irda/irias_object.c @@ -122,8 +122,7 @@ static void __irias_delete_attrib(struct ias_attrib *attrib) IRDA_ASSERT(attrib != NULL, return;); IRDA_ASSERT(attrib->magic == IAS_ATTRIB_MAGIC, return;); - if (attrib->name) - kfree(attrib->name); + kfree(attrib->name); irias_delete_value(attrib->value); attrib->magic = ~IAS_ATTRIB_MAGIC; @@ -136,8 +135,7 @@ void __irias_delete_object(struct ias_object *obj) IRDA_ASSERT(obj != NULL, return;); IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return;); - if (obj->name) - kfree(obj->name); + kfree(obj->name); hashbin_delete(obj->attribs, (FREE_FUNC) __irias_delete_attrib); @@ -562,14 +560,12 @@ void irias_delete_value(struct ias_value *value) /* No need to deallocate */ break; case IAS_STRING: - /* If string, deallocate string */ - if (value->t.string != NULL) - kfree(value->t.string); + /* Deallocate string */ + kfree(value->t.string); break; case IAS_OCT_SEQ: - /* If byte stream, deallocate byte stream */ - if (value->t.oct_seq != NULL) - kfree(value->t.oct_seq); + /* Deallocate byte stream */ + kfree(value->t.oct_seq); break; default: IRDA_DEBUG(0, "%s(), Unknown value type!\n", __FUNCTION__); diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 59d02cbbeb9..c3f0b078345 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -116,7 +116,9 @@ static int llc_ui_send_data(struct sock* sk, struct sk_buff *skb, int noblock) struct llc_sock* llc = llc_sk(sk); int rc = 0; - if (unlikely(llc_data_accept_state(llc->state) || llc->p_flag)) { + if (unlikely(llc_data_accept_state(llc->state) || + llc->remote_busy_flag || + llc->p_flag)) { long timeout = sock_sndtimeo(sk, noblock); rc = llc_ui_wait_for_busy_core(sk, timeout); @@ -542,6 +544,7 @@ static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout) if (sk_wait_event(sk, &timeout, (sk->sk_shutdown & RCV_SHUTDOWN) || (!llc_data_accept_state(llc->state) && + !llc->remote_busy_flag && !llc->p_flag))) break; rc = -ERESTARTSYS; diff --git a/net/llc/llc_c_ac.c b/net/llc/llc_c_ac.c index b0bcfb1f12d..8169f24ed33 100644 --- a/net/llc/llc_c_ac.c +++ b/net/llc/llc_c_ac.c @@ -866,7 +866,8 @@ int llc_conn_ac_send_ack_if_needed(struct sock *sk, struct sk_buff *skb) llc->ack_must_be_send = 1; llc->ack_pf = pf_bit & 1; } - if (((llc->vR - llc->first_pdu_Ns + 129) % 128) >= llc->npta) { + if (((llc->vR - llc->first_pdu_Ns + 1 + LLC_2_SEQ_NBR_MODULO) + % LLC_2_SEQ_NBR_MODULO) >= llc->npta) { llc_conn_ac_send_rr_rsp_f_set_ackpf(sk, skb); llc->ack_must_be_send = 0; llc->ack_pf = 0; @@ -994,8 +995,8 @@ static int llc_conn_ac_inc_npta_value(struct sock *sk, struct sk_buff *skb) llc->dec_step = 0; llc->dec_cntr = llc->inc_cntr = 2; ++llc->npta; - if (llc->npta > 127) - llc->npta = 127 ; + if (llc->npta > (u8) ~LLC_2_SEQ_NBR_MODULO) + llc->npta = (u8) ~LLC_2_SEQ_NBR_MODULO; } else --llc->inc_cntr; return 0; @@ -1065,9 +1066,10 @@ int llc_conn_ac_dec_tx_win_size(struct sock *sk, struct sk_buff *skb) struct llc_sock *llc = llc_sk(sk); u8 unacked_pdu = skb_queue_len(&llc->pdu_unack_q); - llc->k -= unacked_pdu; - if (llc->k < 2) - llc->k = 2; + if (llc->k - unacked_pdu < 1) + llc->k = 1; + else + llc->k -= unacked_pdu; return 0; } @@ -1084,8 +1086,8 @@ int llc_conn_ac_inc_tx_win_size(struct sock *sk, struct sk_buff *skb) struct llc_sock *llc = llc_sk(sk); llc->k += 1; - if (llc->k > 128) - llc->k = 128 ; + if (llc->k > (u8) ~LLC_2_SEQ_NBR_MODULO) + llc->k = (u8) ~LLC_2_SEQ_NBR_MODULO; return 0; } @@ -1309,7 +1311,7 @@ int llc_conn_ac_set_vs_nr(struct sock *sk, struct sk_buff *skb) static int llc_conn_ac_inc_vs_by_1(struct sock *sk, struct sk_buff *skb) { - llc_sk(sk)->vS = (llc_sk(sk)->vS + 1) % 128; + llc_sk(sk)->vS = (llc_sk(sk)->vS + 1) % LLC_2_SEQ_NBR_MODULO; return 0; } diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 8296b38bf27..a84f9221e5f 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -1,3 +1,6 @@ +menu "Core Netfilter Configuration" + depends on NET && NETFILTER + config NETFILTER_NETLINK tristate "Netfilter netlink interface" help @@ -22,3 +25,74 @@ config NETFILTER_NETLINK_LOG and is also scheduled to replace the old syslog-based ipt_LOG and ip6t_LOG modules. +config NF_CONNTRACK + tristate "Layer 3 Independent Connection tracking (EXPERIMENTAL)" + depends on EXPERIMENTAL && IP_NF_CONNTRACK=n + default n + ---help--- + Connection tracking keeps a record of what packets have passed + through your machine, in order to figure out how they are related + into connections. + + Layer 3 independent connection tracking is experimental scheme + which generalize ip_conntrack to support other layer 3 protocols. + + To compile it as a module, choose M here. If unsure, say N. + +config NF_CT_ACCT + bool "Connection tracking flow accounting" + depends on NF_CONNTRACK + help + If this option is enabled, the connection tracking code will + keep per-flow packet and byte counters. + + Those counters can be used for flow-based accounting or the + `connbytes' match. + + If unsure, say `N'. + +config NF_CONNTRACK_MARK + bool 'Connection mark tracking support' + depends on NF_CONNTRACK + help + This option enables support for connection marks, used by the + `CONNMARK' target and `connmark' match. Similar to the mark value + of packets, but this mark value is kept in the conntrack session + instead of the individual packets. + +config NF_CONNTRACK_EVENTS + bool "Connection tracking events" + depends on NF_CONNTRACK + help + If this option is enabled, the connection tracking code will + provide a notifier chain that can be used by other kernel code + to get notified aboutchanges in the connection tracking state. + + If unsure, say `N'. + +config NF_CT_PROTO_SCTP + tristate 'SCTP protocol on new connection tracking support (EXPERIMENTAL)' + depends on EXPERIMENTAL && NF_CONNTRACK + default n + help + With this option enabled, the layer 3 independent connection + tracking code will be able to do state tracking on SCTP connections. + + If you want to compile it as a module, say M here and read + Documentation/modules.txt. If unsure, say `N'. + +config NF_CONNTRACK_FTP + tristate "FTP support on new connection tracking (EXPERIMENTAL)" + depends on EXPERIMENTAL && NF_CONNTRACK + help + Tracking FTP connections is problematic: special helpers are + required for tracking them, and doing masquerading and other forms + of Network Address Translation on them. + + This is FTP support on Layer 3 independent connection tracking. + Layer 3 independent connection tracking is experimental scheme + which generalize ip_conntrack to support other layer 3 protocols. + + To compile it as a module, choose M here. If unsure, say N. + +endmenu diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index b3b44f8b415..55f019ad2c0 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -5,3 +5,11 @@ obj-$(CONFIG_NETFILTER) = netfilter.o obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o + +nf_conntrack-objs := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o + +obj-$(CONFIG_NF_CONNTRACK) += nf_conntrack.o +obj-$(CONFIG_NF_CONNTRACK_FTP) += nf_conntrack_ftp.o + +# SCTP protocol connection tracking +obj-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c new file mode 100644 index 00000000000..1da678303d7 --- /dev/null +++ b/net/netfilter/nf_conntrack_core.c @@ -0,0 +1,1545 @@ +/* Connection state tracking for netfilter. This is separated from, + but required by, the NAT layer; it can also be used by an iptables + extension. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2005 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 23 Apr 2001: Harald Welte <laforge@gnumonks.org> + * - new API and handling of conntrack/nat helpers + * - now capable of multiple expectations for one master + * 16 Jul 2002: Harald Welte <laforge@gnumonks.org> + * - add usage/reference counts to ip_conntrack_expect + * - export ip_conntrack[_expect]_{find_get,put} functions + * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> + * - generalize L3 protocol denendent part. + * 23 Mar 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> + * - add support various size of conntrack structures. + * + * Derived from net/ipv4/netfilter/ip_conntrack_core.c + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/netfilter.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/proc_fs.h> +#include <linux/vmalloc.h> +#include <linux/stddef.h> +#include <linux/slab.h> +#include <linux/random.h> +#include <linux/jhash.h> +#include <linux/err.h> +#include <linux/percpu.h> +#include <linux/moduleparam.h> +#include <linux/notifier.h> +#include <linux/kernel.h> +#include <linux/netdevice.h> +#include <linux/socket.h> + +/* This rwlock protects the main hash table, protocol/helper/expected + registrations, conntrack timers*/ +#define ASSERT_READ_LOCK(x) +#define ASSERT_WRITE_LOCK(x) + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_l3proto.h> +#include <net/netfilter/nf_conntrack_protocol.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <linux/netfilter_ipv4/listhelp.h> + +#define NF_CONNTRACK_VERSION "0.4.1" + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +DEFINE_RWLOCK(nf_conntrack_lock); + +/* nf_conntrack_standalone needs this */ +atomic_t nf_conntrack_count = ATOMIC_INIT(0); + +void (*nf_conntrack_destroyed)(struct nf_conn *conntrack) = NULL; +LIST_HEAD(nf_conntrack_expect_list); +struct nf_conntrack_protocol **nf_ct_protos[PF_MAX]; +struct nf_conntrack_l3proto *nf_ct_l3protos[PF_MAX]; +static LIST_HEAD(helpers); +unsigned int nf_conntrack_htable_size = 0; +int nf_conntrack_max; +struct list_head *nf_conntrack_hash; +static kmem_cache_t *nf_conntrack_expect_cachep; +struct nf_conn nf_conntrack_untracked; +unsigned int nf_ct_log_invalid; +static LIST_HEAD(unconfirmed); +static int nf_conntrack_vmalloc; + +#ifdef CONFIG_NF_CONNTRACK_EVENTS +struct notifier_block *nf_conntrack_chain; +struct notifier_block *nf_conntrack_expect_chain; + +DEFINE_PER_CPU(struct nf_conntrack_ecache, nf_conntrack_ecache); + +/* deliver cached events and clear cache entry - must be called with locally + * disabled softirqs */ +static inline void +__nf_ct_deliver_cached_events(struct nf_conntrack_ecache *ecache) +{ + DEBUGP("ecache: delivering events for %p\n", ecache->ct); + if (nf_ct_is_confirmed(ecache->ct) && !nf_ct_is_dying(ecache->ct) + && ecache->events) + notifier_call_chain(&nf_conntrack_chain, ecache->events, + ecache->ct); + + ecache->events = 0; + nf_ct_put(ecache->ct); + ecache->ct = NULL; +} + +/* Deliver all cached events for a particular conntrack. This is called + * by code prior to async packet handling for freeing the skb */ +void nf_ct_deliver_cached_events(const struct nf_conn *ct) +{ + struct nf_conntrack_ecache *ecache; + + local_bh_disable(); + ecache = &__get_cpu_var(nf_conntrack_ecache); + if (ecache->ct == ct) + __nf_ct_deliver_cached_events(ecache); + local_bh_enable(); +} + +/* Deliver cached events for old pending events, if current conntrack != old */ +void __nf_ct_event_cache_init(struct nf_conn *ct) +{ + struct nf_conntrack_ecache *ecache; + + /* take care of delivering potentially old events */ + ecache = &__get_cpu_var(nf_conntrack_ecache); + BUG_ON(ecache->ct == ct); + if (ecache->ct) + __nf_ct_deliver_cached_events(ecache); + /* initialize for this conntrack/packet */ + ecache->ct = ct; + nf_conntrack_get(&ct->ct_general); +} + +/* flush the event cache - touches other CPU's data and must not be called + * while packets are still passing through the code */ +static void nf_ct_event_cache_flush(void) +{ + struct nf_conntrack_ecache *ecache; + int cpu; + + for_each_cpu(cpu) { + ecache = &per_cpu(nf_conntrack_ecache, cpu); + if (ecache->ct) + nf_ct_put(ecache->ct); + } +} +#else +static inline void nf_ct_event_cache_flush(void) {} +#endif /* CONFIG_NF_CONNTRACK_EVENTS */ + +DEFINE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat); +EXPORT_PER_CPU_SYMBOL(nf_conntrack_stat); + +/* + * This scheme offers various size of "struct nf_conn" dependent on + * features(helper, nat, ...) + */ + +#define NF_CT_FEATURES_NAMELEN 256 +static struct { + /* name of slab cache. printed in /proc/slabinfo */ + char *name; + + /* size of slab cache */ + size_t size; + + /* slab cache pointer */ + kmem_cache_t *cachep; + + /* allocated slab cache + modules which uses this slab cache */ + int use; + + /* Initialization */ + int (*init_conntrack)(struct nf_conn *, u_int32_t); + +} nf_ct_cache[NF_CT_F_NUM]; + +/* protect members of nf_ct_cache except of "use" */ +DEFINE_RWLOCK(nf_ct_cache_lock); + +/* This avoids calling kmem_cache_create() with same name simultaneously */ +DECLARE_MUTEX(nf_ct_cache_mutex); + +extern struct nf_conntrack_protocol nf_conntrack_generic_protocol; +struct nf_conntrack_protocol * +nf_ct_find_proto(u_int16_t l3proto, u_int8_t protocol) +{ + if (unlikely(nf_ct_protos[l3proto] == NULL)) + return &nf_conntrack_generic_protocol; + + return nf_ct_protos[l3proto][protocol]; +} + +static int nf_conntrack_hash_rnd_initted; +static unsigned int nf_conntrack_hash_rnd; + +static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple, + unsigned int size, unsigned int rnd) +{ + unsigned int a, b; + a = jhash((void *)tuple->src.u3.all, sizeof(tuple->src.u3.all), + ((tuple->src.l3num) << 16) | tuple->dst.protonum); + b = jhash((void *)tuple->dst.u3.all, sizeof(tuple->dst.u3.all), + (tuple->src.u.all << 16) | tuple->dst.u.all); + + return jhash_2words(a, b, rnd) % size; +} + +static inline u_int32_t hash_conntrack(const struct nf_conntrack_tuple *tuple) +{ + return __hash_conntrack(tuple, nf_conntrack_htable_size, + nf_conntrack_hash_rnd); +} + +/* Initialize "struct nf_conn" which has spaces for helper */ +static int +init_conntrack_for_helper(struct nf_conn *conntrack, u_int32_t features) +{ + + conntrack->help = (union nf_conntrack_help *) + (((unsigned long)conntrack->data + + (__alignof__(union nf_conntrack_help) - 1)) + & (~((unsigned long)(__alignof__(union nf_conntrack_help) -1)))); + return 0; +} + +int nf_conntrack_register_cache(u_int32_t features, const char *name, + size_t size, + int (*init)(struct nf_conn *, u_int32_t)) +{ + int ret = 0; + char *cache_name; + kmem_cache_t *cachep; + + DEBUGP("nf_conntrack_register_cache: features=0x%x, name=%s, size=%d\n", + features, name, size); + + if (features < NF_CT_F_BASIC || features >= NF_CT_F_NUM) { + DEBUGP("nf_conntrack_register_cache: invalid features.: 0x%x\n", + features); + return -EINVAL; + } + + down(&nf_ct_cache_mutex); + + write_lock_bh(&nf_ct_cache_lock); + /* e.g: multiple helpers are loaded */ + if (nf_ct_cache[features].use > 0) { + DEBUGP("nf_conntrack_register_cache: already resisterd.\n"); + if ((!strncmp(nf_ct_cache[features].name, name, + NF_CT_FEATURES_NAMELEN)) + && nf_ct_cache[features].size == size + && nf_ct_cache[features].init_conntrack == init) { + DEBUGP("nf_conntrack_register_cache: reusing.\n"); + nf_ct_cache[features].use++; + ret = 0; + } else + ret = -EBUSY; + + write_unlock_bh(&nf_ct_cache_lock); + up(&nf_ct_cache_mutex); + return ret; + } + write_unlock_bh(&nf_ct_cache_lock); + + /* + * The memory space for name of slab cache must be alive until + * cache is destroyed. + */ + cache_name = kmalloc(sizeof(char)*NF_CT_FEATURES_NAMELEN, GFP_ATOMIC); + if (cache_name == NULL) { + DEBUGP("nf_conntrack_register_cache: can't alloc cache_name\n"); + ret = -ENOMEM; + goto out_up_mutex; + } + + if (strlcpy(cache_name, name, NF_CT_FEATURES_NAMELEN) + >= NF_CT_FEATURES_NAMELEN) { + printk("nf_conntrack_register_cache: name too long\n"); + ret = -EINVAL; + goto out_free_name; + } + + cachep = kmem_cache_create(cache_name, size, 0, 0, + NULL, NULL); + if (!cachep) { + printk("nf_conntrack_register_cache: Can't create slab cache " + "for the features = 0x%x\n", features); + ret = -ENOMEM; + goto out_free_name; + } + + write_lock_bh(&nf_ct_cache_lock); + nf_ct_cache[features].use = 1; + nf_ct_cache[features].size = size; + nf_ct_cache[features].init_conntrack = init; + nf_ct_cache[features].cachep = cachep; + nf_ct_cache[features].name = cache_name; + write_unlock_bh(&nf_ct_cache_lock); + + goto out_up_mutex; + +out_free_name: + kfree(cache_name); +out_up_mutex: + up(&nf_ct_cache_mutex); + return ret; +} + +/* FIXME: In the current, only nf_conntrack_cleanup() can call this function. */ +void nf_conntrack_unregister_cache(u_int32_t features) +{ + kmem_cache_t *cachep; + char *name; + + /* + * This assures that kmem_cache_create() isn't called before destroying + * slab cache. + */ + DEBUGP("nf_conntrack_unregister_cache: 0x%04x\n", features); + down(&nf_ct_cache_mutex); + + write_lock_bh(&nf_ct_cache_lock); + if (--nf_ct_cache[features].use > 0) { + write_unlock_bh(&nf_ct_cache_lock); + up(&nf_ct_cache_mutex); + return; + } + cachep = nf_ct_cache[features].cachep; + name = nf_ct_cache[features].name; + nf_ct_cache[features].cachep = NULL; + nf_ct_cache[features].name = NULL; + nf_ct_cache[features].init_conntrack = NULL; + nf_ct_cache[features].size = 0; + write_unlock_bh(&nf_ct_cache_lock); + + synchronize_net(); + + kmem_cache_destroy(cachep); + kfree(name); + + up(&nf_ct_cache_mutex); +} + +int +nf_ct_get_tuple(const struct sk_buff *skb, + unsigned int nhoff, + unsigned int dataoff, + u_int16_t l3num, + u_int8_t protonum, + struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_l3proto *l3proto, + const struct nf_conntrack_protocol *protocol) +{ + NF_CT_TUPLE_U_BLANK(tuple); + + tuple->src.l3num = l3num; + if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0) + return 0; + + tuple->dst.protonum = protonum; + tuple->dst.dir = IP_CT_DIR_ORIGINAL; + + return protocol->pkt_to_tuple(skb, dataoff, tuple); +} + +int +nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, + const struct nf_conntrack_tuple *orig, + const struct nf_conntrack_l3proto *l3proto, + const struct nf_conntrack_protocol *protocol) +{ + NF_CT_TUPLE_U_BLANK(inverse); + + inverse->src.l3num = orig->src.l3num; + if (l3proto->invert_tuple(inverse, orig) == 0) + return 0; + + inverse->dst.dir = !orig->dst.dir; + + inverse->dst.protonum = orig->dst.protonum; + return protocol->invert_tuple(inverse, orig); +} + +/* nf_conntrack_expect helper functions */ +static void nf_ct_unlink_expect(struct nf_conntrack_expect *exp) +{ + ASSERT_WRITE_LOCK(&nf_conntrack_lock); + NF_CT_ASSERT(!timer_pending(&exp->timeout)); + list_del(&exp->list); + NF_CT_STAT_INC(expect_delete); + exp->master->expecting--; + nf_conntrack_expect_put(exp); +} + +static void expectation_timed_out(unsigned long ul_expect) +{ + struct nf_conntrack_expect *exp = (void *)ul_expect; + + write_lock_bh(&nf_conntrack_lock); + nf_ct_unlink_expect(exp); + write_unlock_bh(&nf_conntrack_lock); + nf_conntrack_expect_put(exp); +} + +/* If an expectation for this connection is found, it gets delete from + * global list then returned. */ +static struct nf_conntrack_expect * +find_expectation(const struct nf_conntrack_tuple *tuple) +{ + struct nf_conntrack_expect *i; + + list_for_each_entry(i, &nf_conntrack_expect_list, list) { + /* If master is not in hash table yet (ie. packet hasn't left + this machine yet), how can other end know about expected? + Hence these are not the droids you are looking for (if + master ct never got confirmed, we'd hold a reference to it + and weird things would happen to future packets). */ + if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) + && nf_ct_is_confirmed(i->master)) { + if (i->flags & NF_CT_EXPECT_PERMANENT) { + atomic_inc(&i->use); + return i; + } else if (del_timer(&i->timeout)) { + nf_ct_unlink_expect(i); + return i; + } + } + } + return NULL; +} + +/* delete all expectations for this conntrack */ +static void remove_expectations(struct nf_conn *ct) +{ + struct nf_conntrack_expect *i, *tmp; + + /* Optimization: most connection never expect any others. */ + if (ct->expecting == 0) + return; + + list_for_each_entry_safe(i, tmp, &nf_conntrack_expect_list, list) { + if (i->master == ct && del_timer(&i->timeout)) { + nf_ct_unlink_expect(i); + nf_conntrack_expect_put(i); + } + } +} + +static void +clean_from_lists(struct nf_conn *ct) +{ + unsigned int ho, hr; + + DEBUGP("clean_from_lists(%p)\n", ct); + ASSERT_WRITE_LOCK(&nf_conntrack_lock); + + ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); + LIST_DELETE(&nf_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]); + LIST_DELETE(&nf_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]); + + /* Destroy all pending expectations */ + remove_expectations(ct); +} + +static void +destroy_conntrack(struct nf_conntrack *nfct) +{ + struct nf_conn *ct = (struct nf_conn *)nfct; + struct nf_conntrack_l3proto *l3proto; + struct nf_conntrack_protocol *proto; + + DEBUGP("destroy_conntrack(%p)\n", ct); + NF_CT_ASSERT(atomic_read(&nfct->use) == 0); + NF_CT_ASSERT(!timer_pending(&ct->timeout)); + + nf_conntrack_event(IPCT_DESTROY, ct); + set_bit(IPS_DYING_BIT, &ct->status); + + /* To make sure we don't get any weird locking issues here: + * destroy_conntrack() MUST NOT be called with a write lock + * to nf_conntrack_lock!!! -HW */ + l3proto = nf_ct_find_l3proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num); + if (l3proto && l3proto->destroy) + l3proto->destroy(ct); + + proto = nf_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num, + ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum); + if (proto && proto->destroy) + proto->destroy(ct); + + if (nf_conntrack_destroyed) + nf_conntrack_destroyed(ct); + + write_lock_bh(&nf_conntrack_lock); + /* Expectations will have been removed in clean_from_lists, + * except TFTP can create an expectation on the first packet, + * before connection is in the list, so we need to clean here, + * too. */ + remove_expectations(ct); + + /* We overload first tuple to link into unconfirmed list. */ + if (!nf_ct_is_confirmed(ct)) { + BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list)); + list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); + } + + NF_CT_STAT_INC(delete); + write_unlock_bh(&nf_conntrack_lock); + + if (ct->master) + nf_ct_put(ct->master); + + DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct); + nf_conntrack_free(ct); +} + +static void death_by_timeout(unsigned long ul_conntrack) +{ + struct nf_conn *ct = (void *)ul_conntrack; + + write_lock_bh(&nf_conntrack_lock); + /* Inside lock so preempt is disabled on module removal path. + * Otherwise we can get spurious warnings. */ + NF_CT_STAT_INC(delete_list); + clean_from_lists(ct); + write_unlock_bh(&nf_conntrack_lock); + nf_ct_put(ct); +} + +static inline int +conntrack_tuple_cmp(const struct nf_conntrack_tuple_hash *i, + const struct nf_conntrack_tuple *tuple, + const struct nf_conn *ignored_conntrack) +{ + ASSERT_READ_LOCK(&nf_conntrack_lock); + return nf_ct_tuplehash_to_ctrack(i) != ignored_conntrack + && nf_ct_tuple_equal(tuple, &i->tuple); +} + +static struct nf_conntrack_tuple_hash * +__nf_conntrack_find(const struct nf_conntrack_tuple *tuple, + const struct nf_conn *ignored_conntrack) +{ + struct nf_conntrack_tuple_hash *h; + unsigned int hash = hash_conntrack(tuple); + + ASSERT_READ_LOCK(&nf_conntrack_lock); + list_for_each_entry(h, &nf_conntrack_hash[hash], list) { + if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) { + NF_CT_STAT_INC(found); + return h; + } + NF_CT_STAT_INC(searched); + } + + return NULL; +} + +/* Find a connection corresponding to a tuple. */ +struct nf_conntrack_tuple_hash * +nf_conntrack_find_get(const struct nf_conntrack_tuple *tuple, + const struct nf_conn *ignored_conntrack) +{ + struct nf_conntrack_tuple_hash *h; + + read_lock_bh(&nf_conntrack_lock); + h = __nf_conntrack_find(tuple, ignored_conntrack); + if (h) + atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use); + read_unlock_bh(&nf_conntrack_lock); + + return h; +} + +/* Confirm a connection given skb; places it in hash table */ +int +__nf_conntrack_confirm(struct sk_buff **pskb) +{ + unsigned int hash, repl_hash; + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + + ct = nf_ct_get(*pskb, &ctinfo); + + /* ipt_REJECT uses nf_conntrack_attach to attach related + ICMP/TCP RST packets in other direction. Actual packet + which created connection will be IP_CT_NEW or for an + expected connection, IP_CT_RELATED. */ + if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) + return NF_ACCEPT; + + hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); + + /* We're not in hash table, and we refuse to set up related + connections for unconfirmed conns. But packet copies and + REJECT will give spurious warnings here. */ + /* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */ + + /* No external references means noone else could have + confirmed us. */ + NF_CT_ASSERT(!nf_ct_is_confirmed(ct)); + DEBUGP("Confirming conntrack %p\n", ct); + + write_lock_bh(&nf_conntrack_lock); + + /* See if there's one in the list already, including reverse: + NAT could have grabbed it without realizing, since we're + not in the hash. If there is, we lost race. */ + if (!LIST_FIND(&nf_conntrack_hash[hash], + conntrack_tuple_cmp, + struct nf_conntrack_tuple_hash *, + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL) + && !LIST_FIND(&nf_conntrack_hash[repl_hash], + conntrack_tuple_cmp, + struct nf_conntrack_tuple_hash *, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) { + /* Remove from unconfirmed list */ + list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); + + list_prepend(&nf_conntrack_hash[hash], + &ct->tuplehash[IP_CT_DIR_ORIGINAL]); + list_prepend(&nf_conntrack_hash[repl_hash], + &ct->tuplehash[IP_CT_DIR_REPLY]); + /* Timer relative to confirmation time, not original + setting time, otherwise we'd get timer wrap in + weird delay cases. */ + ct->timeout.expires += jiffies; + add_timer(&ct->timeout); + atomic_inc(&ct->ct_general.use); + set_bit(IPS_CONFIRMED_BIT, &ct->status); + NF_CT_STAT_INC(insert); + write_unlock_bh(&nf_conntrack_lock); + if (ct->helper) + nf_conntrack_event_cache(IPCT_HELPER, *pskb); +#ifdef CONFIG_NF_NAT_NEEDED + if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) || + test_bit(IPS_DST_NAT_DONE_BIT, &ct->status)) + nf_conntrack_event_cache(IPCT_NATINFO, *pskb); +#endif + nf_conntrack_event_cache(master_ct(ct) ? + IPCT_RELATED : IPCT_NEW, *pskb); + return NF_ACCEPT; + } + + NF_CT_STAT_INC(insert_failed); + write_unlock_bh(&nf_conntrack_lock); + return NF_DROP; +} + +/* Returns true if a connection correspondings to the tuple (required + for NAT). */ +int +nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, + const struct nf_conn *ignored_conntrack) +{ + struct nf_conntrack_tuple_hash *h; + + read_lock_bh(&nf_conntrack_lock); + h = __nf_conntrack_find(tuple, ignored_conntrack); + read_unlock_bh(&nf_conntrack_lock); + + return h != NULL; +} + +/* There's a small race here where we may free a just-assured + connection. Too bad: we're in trouble anyway. */ +static inline int unreplied(const struct nf_conntrack_tuple_hash *i) +{ + return !(test_bit(IPS_ASSURED_BIT, + &nf_ct_tuplehash_to_ctrack(i)->status)); +} + +static int early_drop(struct list_head *chain) +{ + /* Traverse backwards: gives us oldest, which is roughly LRU */ + struct nf_conntrack_tuple_hash *h; + struct nf_conn *ct = NULL; + int dropped = 0; + + read_lock_bh(&nf_conntrack_lock); + h = LIST_FIND_B(chain, unreplied, struct nf_conntrack_tuple_hash *); + if (h) { + ct = nf_ct_tuplehash_to_ctrack(h); + atomic_inc(&ct->ct_general.use); + } + read_unlock_bh(&nf_conntrack_lock); + + if (!ct) + return dropped; + + if (del_timer(&ct->timeout)) { + death_by_timeout((unsigned long)ct); + dropped = 1; + NF_CT_STAT_INC(early_drop); + } + nf_ct_put(ct); + return dropped; +} + +static inline int helper_cmp(const struct nf_conntrack_helper *i, + const struct nf_conntrack_tuple *rtuple) +{ + return nf_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask); +} + +static struct nf_conntrack_helper * +nf_ct_find_helper(const struct nf_conntrack_tuple *tuple) +{ + return LIST_FIND(&helpers, helper_cmp, + struct nf_conntrack_helper *, + tuple); +} + +static struct nf_conn * +__nf_conntrack_alloc(const struct nf_conntrack_tuple *orig, + const struct nf_conntrack_tuple *repl, + const struct nf_conntrack_l3proto *l3proto) +{ + struct nf_conn *conntrack = NULL; + u_int32_t features = 0; + + if (!nf_conntrack_hash_rnd_initted) { + get_random_bytes(&nf_conntrack_hash_rnd, 4); + nf_conntrack_hash_rnd_initted = 1; + } + + if (nf_conntrack_max + && atomic_read(&nf_conntrack_count) >= nf_conntrack_max) { + unsigned int hash = hash_conntrack(orig); + /* Try dropping from this hash chain. */ + if (!early_drop(&nf_conntrack_hash[hash])) { + if (net_ratelimit()) + printk(KERN_WARNING + "nf_conntrack: table full, dropping" + " packet.\n"); + return ERR_PTR(-ENOMEM); + } + } + + /* find features needed by this conntrack. */ + features = l3proto->get_features(orig); + read_lock_bh(&nf_conntrack_lock); + if (nf_ct_find_helper(repl) != NULL) + features |= NF_CT_F_HELP; + read_unlock_bh(&nf_conntrack_lock); + + DEBUGP("nf_conntrack_alloc: features=0x%x\n", features); + + read_lock_bh(&nf_ct_cache_lock); + + if (!nf_ct_cache[features].use) { + DEBUGP("nf_conntrack_alloc: not supported features = 0x%x\n", + features); + goto out; + } + + conntrack = kmem_cache_alloc(nf_ct_cache[features].cachep, GFP_ATOMIC); + if (conntrack == NULL) { + DEBUGP("nf_conntrack_alloc: Can't alloc conntrack from cache\n"); + goto out; + } + + memset(conntrack, 0, nf_ct_cache[features].size); + conntrack->features = features; + if (nf_ct_cache[features].init_conntrack && + nf_ct_cache[features].init_conntrack(conntrack, features) < 0) { + DEBUGP("nf_conntrack_alloc: failed to init\n"); + kmem_cache_free(nf_ct_cache[features].cachep, conntrack); + conntrack = NULL; + goto out; + } + + atomic_set(&conntrack->ct_general.use, 1); + conntrack->ct_general.destroy = destroy_conntrack; + conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; + conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl; + /* Don't set timer yet: wait for confirmation */ + init_timer(&conntrack->timeout); + conntrack->timeout.data = (unsigned long)conntrack; + conntrack->timeout.function = death_by_timeout; + + atomic_inc(&nf_conntrack_count); +out: + read_unlock_bh(&nf_ct_cache_lock); + return conntrack; +} + +struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig, + const struct nf_conntrack_tuple *repl) +{ + struct nf_conntrack_l3proto *l3proto; + + l3proto = nf_ct_find_l3proto(orig->src.l3num); + return __nf_conntrack_alloc(orig, repl, l3proto); +} + +void nf_conntrack_free(struct nf_conn *conntrack) +{ + u_int32_t features = conntrack->features; + NF_CT_ASSERT(features >= NF_CT_F_BASIC && features < NF_CT_F_NUM); + DEBUGP("nf_conntrack_free: features = 0x%x, conntrack=%p\n", features, + conntrack); + kmem_cache_free(nf_ct_cache[features].cachep, conntrack); + atomic_dec(&nf_conntrack_count); +} + +/* Allocate a new conntrack: we return -ENOMEM if classification + failed due to stress. Otherwise it really is unclassifiable. */ +static struct nf_conntrack_tuple_hash * +init_conntrack(const struct nf_conntrack_tuple *tuple, + struct nf_conntrack_l3proto *l3proto, + struct nf_conntrack_protocol *protocol, + struct sk_buff *skb, + unsigned int dataoff) +{ + struct nf_conn *conntrack; + struct nf_conntrack_tuple repl_tuple; + struct nf_conntrack_expect *exp; + + if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, protocol)) { + DEBUGP("Can't invert tuple.\n"); + return NULL; + } + + conntrack = __nf_conntrack_alloc(tuple, &repl_tuple, l3proto); + if (conntrack == NULL || IS_ERR(conntrack)) { + DEBUGP("Can't allocate conntrack.\n"); + return (struct nf_conntrack_tuple_hash *)conntrack; + } + + if (!protocol->new(conntrack, skb, dataoff)) { + nf_conntrack_free(conntrack); + DEBUGP("init conntrack: can't track with proto module\n"); + return NULL; + } + + write_lock_bh(&nf_conntrack_lock); + exp = find_expectation(tuple); + + if (exp) { + DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n", + conntrack, exp); + /* Welcome, Mr. Bond. We've been expecting you... */ + __set_bit(IPS_EXPECTED_BIT, &conntrack->status); + conntrack->master = exp->master; +#ifdef CONFIG_NF_CONNTRACK_MARK + conntrack->mark = exp->master->mark; +#endif + nf_conntrack_get(&conntrack->master->ct_general); + NF_CT_STAT_INC(expect_new); + } else { + conntrack->helper = nf_ct_find_helper(&repl_tuple); + + NF_CT_STAT_INC(new); + } + + /* Overload tuple linked list to put us in unconfirmed list. */ + list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed); + + write_unlock_bh(&nf_conntrack_lock); + + if (exp) { + if (exp->expectfn) + exp->expectfn(conntrack, exp); + nf_conntrack_expect_put(exp); + } + + return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL]; +} + +/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */ +static inline struct nf_conn * +resolve_normal_ct(struct sk_buff *skb, + unsigned int dataoff, + u_int16_t l3num, + u_int8_t protonum, + struct nf_conntrack_l3proto *l3proto, + struct nf_conntrack_protocol *proto, + int *set_reply, + enum ip_conntrack_info *ctinfo) +{ + struct nf_conntrack_tuple tuple; + struct nf_conntrack_tuple_hash *h; + struct nf_conn *ct; + + if (!nf_ct_get_tuple(skb, (unsigned int)(skb->nh.raw - skb->data), + dataoff, l3num, protonum, &tuple, l3proto, + proto)) { + DEBUGP("resolve_normal_ct: Can't get tuple\n"); + return NULL; + } + + /* look for tuple match */ + h = nf_conntrack_find_get(&tuple, NULL); + if (!h) { + h = init_conntrack(&tuple, l3proto, proto, skb, dataoff); + if (!h) + return NULL; + if (IS_ERR(h)) + return (void *)h; + } + ct = nf_ct_tuplehash_to_ctrack(h); + + /* It exists; we have (non-exclusive) reference. */ + if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) { + *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY; + /* Please set reply bit if this packet OK */ + *set_reply = 1; + } else { + /* Once we've had two way comms, always ESTABLISHED. */ + if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { + DEBUGP("nf_conntrack_in: normal packet for %p\n", ct); + *ctinfo = IP_CT_ESTABLISHED; + } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) { + DEBUGP("nf_conntrack_in: related packet for %p\n", ct); + *ctinfo = IP_CT_RELATED; + } else { + DEBUGP("nf_conntrack_in: new packet for %p\n", ct); + *ctinfo = IP_CT_NEW; + } + *set_reply = 0; + } + skb->nfct = &ct->ct_general; + skb->nfctinfo = *ctinfo; + return ct; +} + +unsigned int +nf_conntrack_in(int pf, unsigned int hooknum, struct sk_buff **pskb) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + struct nf_conntrack_l3proto *l3proto; + struct nf_conntrack_protocol *proto; + unsigned int dataoff; + u_int8_t protonum; + int set_reply = 0; + int ret; + + /* Previously seen (loopback or untracked)? Ignore. */ + if ((*pskb)->nfct) { + NF_CT_STAT_INC(ignore); + return NF_ACCEPT; + } + + l3proto = nf_ct_find_l3proto((u_int16_t)pf); + if ((ret = l3proto->prepare(pskb, hooknum, &dataoff, &protonum)) <= 0) { + DEBUGP("not prepared to track yet or error occured\n"); + return -ret; + } + + proto = nf_ct_find_proto((u_int16_t)pf, protonum); + + /* It may be an special packet, error, unclean... + * inverse of the return code tells to the netfilter + * core what to do with the packet. */ + if (proto->error != NULL && + (ret = proto->error(*pskb, dataoff, &ctinfo, pf, hooknum)) <= 0) { + NF_CT_STAT_INC(error); + NF_CT_STAT_INC(invalid); + return -ret; + } + + ct = resolve_normal_ct(*pskb, dataoff, pf, protonum, l3proto, proto, + &set_reply, &ctinfo); + if (!ct) { + /* Not valid part of a connection */ + NF_CT_STAT_INC(invalid); + return NF_ACCEPT; + } + + if (IS_ERR(ct)) { + /* Too stressed to deal. */ + NF_CT_STAT_INC(drop); + return NF_DROP; + } + + NF_CT_ASSERT((*pskb)->nfct); + + ret = proto->packet(ct, *pskb, dataoff, ctinfo, pf, hooknum); + if (ret < 0) { + /* Invalid: inverse of the return code tells + * the netfilter core what to do */ + DEBUGP("nf_conntrack_in: Can't track with proto module\n"); + nf_conntrack_put((*pskb)->nfct); + (*pskb)->nfct = NULL; + NF_CT_STAT_INC(invalid); + return -ret; + } + + if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status)) + nf_conntrack_event_cache(IPCT_STATUS, *pskb); + + return ret; +} + +int nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse, + const struct nf_conntrack_tuple *orig) +{ + return nf_ct_invert_tuple(inverse, orig, + nf_ct_find_l3proto(orig->src.l3num), + nf_ct_find_proto(orig->src.l3num, + orig->dst.protonum)); +} + +/* Would two expected things clash? */ +static inline int expect_clash(const struct nf_conntrack_expect *a, + const struct nf_conntrack_expect *b) +{ + /* Part covered by intersection of masks must be unequal, + otherwise they clash */ + struct nf_conntrack_tuple intersect_mask; + int count; + + intersect_mask.src.l3num = a->mask.src.l3num & b->mask.src.l3num; + intersect_mask.src.u.all = a->mask.src.u.all & b->mask.src.u.all; + intersect_mask.dst.u.all = a->mask.dst.u.all & b->mask.dst.u.all; + intersect_mask.dst.protonum = a->mask.dst.protonum + & b->mask.dst.protonum; + + for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){ + intersect_mask.src.u3.all[count] = + a->mask.src.u3.all[count] & b->mask.src.u3.all[count]; + } + + for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){ + intersect_mask.dst.u3.all[count] = + a->mask.dst.u3.all[count] & b->mask.dst.u3.all[count]; + } + + return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask); +} + +static inline int expect_matches(const struct nf_conntrack_expect *a, + const struct nf_conntrack_expect *b) +{ + return a->master == b->master + && nf_ct_tuple_equal(&a->tuple, &b->tuple) + && nf_ct_tuple_equal(&a->mask, &b->mask); +} + +/* Generally a bad idea to call this: could have matched already. */ +void nf_conntrack_unexpect_related(struct nf_conntrack_expect *exp) +{ + struct nf_conntrack_expect *i; + + write_lock_bh(&nf_conntrack_lock); + /* choose the the oldest expectation to evict */ + list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) { + if (expect_matches(i, exp) && del_timer(&i->timeout)) { + nf_ct_unlink_expect(i); + write_unlock_bh(&nf_conntrack_lock); + nf_conntrack_expect_put(i); + return; + } + } + write_unlock_bh(&nf_conntrack_lock); +} + +/* We don't increase the master conntrack refcount for non-fulfilled + * conntracks. During the conntrack destruction, the expectations are + * always killed before the conntrack itself */ +struct nf_conntrack_expect *nf_conntrack_expect_alloc(struct nf_conn *me) +{ + struct nf_conntrack_expect *new; + + new = kmem_cache_alloc(nf_conntrack_expect_cachep, GFP_ATOMIC); + if (!new) { + DEBUGP("expect_related: OOM allocating expect\n"); + return NULL; + } + new->master = me; + atomic_set(&new->use, 1); + return new; +} + +void nf_conntrack_expect_put(struct nf_conntrack_expect *exp) +{ + if (atomic_dec_and_test(&exp->use)) + kmem_cache_free(nf_conntrack_expect_cachep, exp); +} + +static void nf_conntrack_expect_insert(struct nf_conntrack_expect *exp) +{ + atomic_inc(&exp->use); + exp->master->expecting++; + list_add(&exp->list, &nf_conntrack_expect_list); + + init_timer(&exp->timeout); + exp->timeout.data = (unsigned long)exp; + exp->timeout.function = expectation_timed_out; + exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ; + add_timer(&exp->timeout); + + atomic_inc(&exp->use); + NF_CT_STAT_INC(expect_create); +} + +/* Race with expectations being used means we could have none to find; OK. */ +static void evict_oldest_expect(struct nf_conn *master) +{ + struct nf_conntrack_expect *i; + + list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) { + if (i->master == master) { + if (del_timer(&i->timeout)) { + nf_ct_unlink_expect(i); + nf_conntrack_expect_put(i); + } + break; + } + } +} + +static inline int refresh_timer(struct nf_conntrack_expect *i) +{ + if (!del_timer(&i->timeout)) + return 0; + + i->timeout.expires = jiffies + i->master->helper->timeout*HZ; + add_timer(&i->timeout); + return 1; +} + +int nf_conntrack_expect_related(struct nf_conntrack_expect *expect) +{ + struct nf_conntrack_expect *i; + int ret; + + DEBUGP("nf_conntrack_expect_related %p\n", related_to); + DEBUGP("tuple: "); NF_CT_DUMP_TUPLE(&expect->tuple); + DEBUGP("mask: "); NF_CT_DUMP_TUPLE(&expect->mask); + + write_lock_bh(&nf_conntrack_lock); + list_for_each_entry(i, &nf_conntrack_expect_list, list) { + if (expect_matches(i, expect)) { + /* Refresh timer: if it's dying, ignore.. */ + if (refresh_timer(i)) { + ret = 0; + goto out; + } + } else if (expect_clash(i, expect)) { + ret = -EBUSY; + goto out; + } + } + /* Will be over limit? */ + if (expect->master->helper->max_expected && + expect->master->expecting >= expect->master->helper->max_expected) + evict_oldest_expect(expect->master); + + nf_conntrack_expect_insert(expect); + nf_conntrack_expect_event(IPEXP_NEW, expect); + ret = 0; +out: + write_unlock_bh(&nf_conntrack_lock); + return ret; +} + +/* Alter reply tuple (maybe alter helper). This is for NAT, and is + implicitly racy: see __nf_conntrack_confirm */ +void nf_conntrack_alter_reply(struct nf_conn *conntrack, + const struct nf_conntrack_tuple *newreply) +{ + write_lock_bh(&nf_conntrack_lock); + /* Should be unconfirmed, so not in hash table yet */ + NF_CT_ASSERT(!nf_ct_is_confirmed(conntrack)); + + DEBUGP("Altering reply tuple of %p to ", conntrack); + NF_CT_DUMP_TUPLE(newreply); + + conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; + if (!conntrack->master && conntrack->expecting == 0) + conntrack->helper = nf_ct_find_helper(newreply); + write_unlock_bh(&nf_conntrack_lock); +} + +int nf_conntrack_helper_register(struct nf_conntrack_helper *me) +{ + int ret; + BUG_ON(me->timeout == 0); + + ret = nf_conntrack_register_cache(NF_CT_F_HELP, "nf_conntrack:help", + sizeof(struct nf_conn) + + sizeof(union nf_conntrack_help) + + __alignof__(union nf_conntrack_help), + init_conntrack_for_helper); + if (ret < 0) { + printk(KERN_ERR "nf_conntrack_helper_reigster: Unable to create slab cache for conntracks\n"); + return ret; + } + write_lock_bh(&nf_conntrack_lock); + list_prepend(&helpers, me); + write_unlock_bh(&nf_conntrack_lock); + + return 0; +} + +static inline int unhelp(struct nf_conntrack_tuple_hash *i, + const struct nf_conntrack_helper *me) +{ + if (nf_ct_tuplehash_to_ctrack(i)->helper == me) { + nf_conntrack_event(IPCT_HELPER, nf_ct_tuplehash_to_ctrack(i)); + nf_ct_tuplehash_to_ctrack(i)->helper = NULL; + } + return 0; +} + +void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) +{ + unsigned int i; + struct nf_conntrack_expect *exp, *tmp; + + /* Need write lock here, to delete helper. */ + write_lock_bh(&nf_conntrack_lock); + LIST_DELETE(&helpers, me); + + /* Get rid of expectations */ + list_for_each_entry_safe(exp, tmp, &nf_conntrack_expect_list, list) { + if (exp->master->helper == me && del_timer(&exp->timeout)) { + nf_ct_unlink_expect(exp); + nf_conntrack_expect_put(exp); + } + } + + /* Get rid of expecteds, set helpers to NULL. */ + LIST_FIND_W(&unconfirmed, unhelp, struct nf_conntrack_tuple_hash*, me); + for (i = 0; i < nf_conntrack_htable_size; i++) + LIST_FIND_W(&nf_conntrack_hash[i], unhelp, + struct nf_conntrack_tuple_hash *, me); + write_unlock_bh(&nf_conntrack_lock); + + /* Someone could be still looking at the helper in a bh. */ + synchronize_net(); +} + +/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */ +void __nf_ct_refresh_acct(struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + const struct sk_buff *skb, + unsigned long extra_jiffies, + int do_acct) +{ + int event = 0; + + NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct); + NF_CT_ASSERT(skb); + + write_lock_bh(&nf_conntrack_lock); + + /* If not in hash table, timer will not be active yet */ + if (!nf_ct_is_confirmed(ct)) { + ct->timeout.expires = extra_jiffies; + event = IPCT_REFRESH; + } else { + /* Need del_timer for race avoidance (may already be dying). */ + if (del_timer(&ct->timeout)) { + ct->timeout.expires = jiffies + extra_jiffies; + add_timer(&ct->timeout); + event = IPCT_REFRESH; + } + } + +#ifdef CONFIG_NF_CT_ACCT + if (do_acct) { + ct->counters[CTINFO2DIR(ctinfo)].packets++; + ct->counters[CTINFO2DIR(ctinfo)].bytes += + skb->len - (unsigned int)(skb->nh.raw - skb->data); + if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000) + || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000)) + event |= IPCT_COUNTER_FILLING; + } +#endif + + write_unlock_bh(&nf_conntrack_lock); + + /* must be unlocked when calling event cache */ + if (event) + nf_conntrack_event_cache(event, skb); +} + +/* Used by ipt_REJECT and ip6t_REJECT. */ +void __nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + + /* This ICMP is in reverse direction to the packet which caused it */ + ct = nf_ct_get(skb, &ctinfo); + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) + ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY; + else + ctinfo = IP_CT_RELATED; + + /* Attach to new skbuff, and increment count */ + nskb->nfct = &ct->ct_general; + nskb->nfctinfo = ctinfo; + nf_conntrack_get(nskb->nfct); +} + +static inline int +do_iter(const struct nf_conntrack_tuple_hash *i, + int (*iter)(struct nf_conn *i, void *data), + void *data) +{ + return iter(nf_ct_tuplehash_to_ctrack(i), data); +} + +/* Bring out ya dead! */ +static struct nf_conntrack_tuple_hash * +get_next_corpse(int (*iter)(struct nf_conn *i, void *data), + void *data, unsigned int *bucket) +{ + struct nf_conntrack_tuple_hash *h = NULL; + + write_lock_bh(&nf_conntrack_lock); + for (; *bucket < nf_conntrack_htable_size; (*bucket)++) { + h = LIST_FIND_W(&nf_conntrack_hash[*bucket], do_iter, + struct nf_conntrack_tuple_hash *, iter, data); + if (h) + break; + } + if (!h) + h = LIST_FIND_W(&unconfirmed, do_iter, + struct nf_conntrack_tuple_hash *, iter, data); + if (h) + atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use); + write_unlock_bh(&nf_conntrack_lock); + + return h; +} + +void +nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data) +{ + struct nf_conntrack_tuple_hash *h; + unsigned int bucket = 0; + + while ((h = get_next_corpse(iter, data, &bucket)) != NULL) { + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + /* Time to push up daises... */ + if (del_timer(&ct->timeout)) + death_by_timeout((unsigned long)ct); + /* ... else the timer will get him soon. */ + + nf_ct_put(ct); + } +} + +static int kill_all(struct nf_conn *i, void *data) +{ + return 1; +} + +static void free_conntrack_hash(struct list_head *hash, int vmalloced, int size) +{ + if (vmalloced) + vfree(hash); + else + free_pages((unsigned long)hash, + get_order(sizeof(struct list_head) * size)); +} + +/* Mishearing the voices in his head, our hero wonders how he's + supposed to kill the mall. */ +void nf_conntrack_cleanup(void) +{ + int i; + + /* This makes sure all current packets have passed through + netfilter framework. Roll on, two-stage module + delete... */ + synchronize_net(); + + nf_ct_event_cache_flush(); + i_see_dead_people: + nf_ct_iterate_cleanup(kill_all, NULL); + if (atomic_read(&nf_conntrack_count) != 0) { + schedule(); + goto i_see_dead_people; + } + + for (i = 0; i < NF_CT_F_NUM; i++) { + if (nf_ct_cache[i].use == 0) + continue; + + NF_CT_ASSERT(nf_ct_cache[i].use == 1); + nf_ct_cache[i].use = 1; + nf_conntrack_unregister_cache(i); + } + kmem_cache_destroy(nf_conntrack_expect_cachep); + free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc, + nf_conntrack_htable_size); + + /* free l3proto protocol tables */ + for (i = 0; i < PF_MAX; i++) + if (nf_ct_protos[i]) { + kfree(nf_ct_protos[i]); + nf_ct_protos[i] = NULL; + } +} + +static struct list_head *alloc_hashtable(int size, int *vmalloced) +{ + struct list_head *hash; + unsigned int i; + + *vmalloced = 0; + hash = (void*)__get_free_pages(GFP_KERNEL, + get_order(sizeof(struct list_head) + * size)); + if (!hash) { + *vmalloced = 1; + printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n"); + hash = vmalloc(sizeof(struct list_head) * size); + } + + if (hash) + for (i = 0; i < size; i++) + INIT_LIST_HEAD(&hash[i]); + + return hash; +} + +int set_hashsize(const char *val, struct kernel_param *kp) +{ + int i, bucket, hashsize, vmalloced; + int old_vmalloced, old_size; + int rnd; + struct list_head *hash, *old_hash; + struct nf_conntrack_tuple_hash *h; + + /* On boot, we can set this without any fancy locking. */ + if (!nf_conntrack_htable_size) + return param_set_uint(val, kp); + + hashsize = simple_strtol(val, NULL, 0); + if (!hashsize) + return -EINVAL; + + hash = alloc_hashtable(hashsize, &vmalloced); + if (!hash) + return -ENOMEM; + + /* We have to rehahs for the new table anyway, so we also can + * use a newrandom seed */ + get_random_bytes(&rnd, 4); + + write_lock_bh(&nf_conntrack_lock); + for (i = 0; i < nf_conntrack_htable_size; i++) { + while (!list_empty(&nf_conntrack_hash[i])) { + h = list_entry(nf_conntrack_hash[i].next, + struct nf_conntrack_tuple_hash, list); + list_del(&h->list); + bucket = __hash_conntrack(&h->tuple, hashsize, rnd); + list_add_tail(&h->list, &hash[bucket]); + } + } + old_size = nf_conntrack_htable_size; + old_vmalloced = nf_conntrack_vmalloc; + old_hash = nf_conntrack_hash; + + nf_conntrack_htable_size = hashsize; + nf_conntrack_vmalloc = vmalloced; + nf_conntrack_hash = hash; + nf_conntrack_hash_rnd = rnd; + write_unlock_bh(&nf_conntrack_lock); + + free_conntrack_hash(old_hash, old_vmalloced, old_size); + return 0; +} + +module_param_call(hashsize, set_hashsize, param_get_uint, + &nf_conntrack_htable_size, 0600); + +int __init nf_conntrack_init(void) +{ + unsigned int i; + int ret; + + /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB + * machine has 256 buckets. >= 1GB machines have 8192 buckets. */ + if (!nf_conntrack_htable_size) { + nf_conntrack_htable_size + = (((num_physpages << PAGE_SHIFT) / 16384) + / sizeof(struct list_head)); + if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE)) + nf_conntrack_htable_size = 8192; + if (nf_conntrack_htable_size < 16) + nf_conntrack_htable_size = 16; + } + nf_conntrack_max = 8 * nf_conntrack_htable_size; + + printk("nf_conntrack version %s (%u buckets, %d max)\n", + NF_CONNTRACK_VERSION, nf_conntrack_htable_size, + nf_conntrack_max); + + nf_conntrack_hash = alloc_hashtable(nf_conntrack_htable_size, + &nf_conntrack_vmalloc); + if (!nf_conntrack_hash) { + printk(KERN_ERR "Unable to create nf_conntrack_hash\n"); + goto err_out; + } + + ret = nf_conntrack_register_cache(NF_CT_F_BASIC, "nf_conntrack:basic", + sizeof(struct nf_conn), NULL); + if (ret < 0) { + printk(KERN_ERR "Unable to create nf_conn slab cache\n"); + goto err_free_hash; + } + + nf_conntrack_expect_cachep = kmem_cache_create("nf_conntrack_expect", + sizeof(struct nf_conntrack_expect), + 0, 0, NULL, NULL); + if (!nf_conntrack_expect_cachep) { + printk(KERN_ERR "Unable to create nf_expect slab cache\n"); + goto err_free_conntrack_slab; + } + + /* Don't NEED lock here, but good form anyway. */ + write_lock_bh(&nf_conntrack_lock); + for (i = 0; i < PF_MAX; i++) + nf_ct_l3protos[i] = &nf_conntrack_generic_l3proto; + write_unlock_bh(&nf_conntrack_lock); + + /* Set up fake conntrack: + - to never be deleted, not in any hashes */ + atomic_set(&nf_conntrack_untracked.ct_general.use, 1); + /* - and look it like as a confirmed connection */ + set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status); + + return ret; + +err_free_conntrack_slab: + nf_conntrack_unregister_cache(NF_CT_F_BASIC); +err_free_hash: + free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc, + nf_conntrack_htable_size); +err_out: + return -ENOMEM; +} diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c new file mode 100644 index 00000000000..65080e269f2 --- /dev/null +++ b/net/netfilter/nf_conntrack_ftp.c @@ -0,0 +1,698 @@ +/* FTP extension for connection tracking. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> + * - enable working with Layer 3 protocol independent connection tracking. + * - track EPRT and EPSV commands with IPv6 address. + * + * Derived from net/ipv4/netfilter/ip_conntrack_ftp.c + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/netfilter.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/ctype.h> +#include <net/checksum.h> +#include <net/tcp.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <linux/netfilter/nf_conntrack_ftp.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>"); +MODULE_DESCRIPTION("ftp connection tracking helper"); + +/* This is slow, but it's simple. --RR */ +static char *ftp_buffer; + +static DEFINE_SPINLOCK(nf_ftp_lock); + +#define MAX_PORTS 8 +static u_int16_t ports[MAX_PORTS]; +static unsigned int ports_c; +module_param_array(ports, ushort, &ports_c, 0400); + +static int loose; +module_param(loose, int, 0600); + +unsigned int (*nf_nat_ftp_hook)(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + enum ip_ct_ftp_type type, + unsigned int matchoff, + unsigned int matchlen, + struct nf_conntrack_expect *exp, + u32 *seq); +EXPORT_SYMBOL_GPL(nf_nat_ftp_hook); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +static int try_rfc959(const char *, size_t, struct nf_conntrack_man *, char); +static int try_eprt(const char *, size_t, struct nf_conntrack_man *, char); +static int try_epsv_response(const char *, size_t, struct nf_conntrack_man *, + char); + +static struct ftp_search { + enum ip_conntrack_dir dir; + const char *pattern; + size_t plen; + char skip; + char term; + enum ip_ct_ftp_type ftptype; + int (*getnum)(const char *, size_t, struct nf_conntrack_man *, char); +} search[] = { + { + IP_CT_DIR_ORIGINAL, + "PORT", sizeof("PORT") - 1, ' ', '\r', + IP_CT_FTP_PORT, + try_rfc959, + }, + { + IP_CT_DIR_REPLY, + "227 ", sizeof("227 ") - 1, '(', ')', + IP_CT_FTP_PASV, + try_rfc959, + }, + { + IP_CT_DIR_ORIGINAL, + "EPRT", sizeof("EPRT") - 1, ' ', '\r', + IP_CT_FTP_EPRT, + try_eprt, + }, + { + IP_CT_DIR_REPLY, + "229 ", sizeof("229 ") - 1, '(', ')', + IP_CT_FTP_EPSV, + try_epsv_response, + }, +}; + +/* This code is based on inet_pton() in glibc-2.2.4 */ +static int +get_ipv6_addr(const char *src, size_t dlen, struct in6_addr *dst, u_int8_t term) +{ + static const char xdigits[] = "0123456789abcdef"; + u_int8_t tmp[16], *tp, *endp, *colonp; + int ch, saw_xdigit; + u_int32_t val; + size_t clen = 0; + + tp = memset(tmp, '\0', sizeof(tmp)); + endp = tp + sizeof(tmp); + colonp = NULL; + + /* Leading :: requires some special handling. */ + if (*src == ':'){ + if (*++src != ':') { + DEBUGP("invalid \":\" at the head of addr\n"); + return 0; + } + clen++; + } + + saw_xdigit = 0; + val = 0; + while ((clen < dlen) && (*src != term)) { + const char *pch; + + ch = tolower(*src++); + clen++; + + pch = strchr(xdigits, ch); + if (pch != NULL) { + val <<= 4; + val |= (pch - xdigits); + if (val > 0xffff) + return 0; + + saw_xdigit = 1; + continue; + } + if (ch != ':') { + DEBUGP("get_ipv6_addr: invalid char. \'%c\'\n", ch); + return 0; + } + + if (!saw_xdigit) { + if (colonp) { + DEBUGP("invalid location of \"::\".\n"); + return 0; + } + colonp = tp; + continue; + } else if (*src == term) { + DEBUGP("trancated IPv6 addr\n"); + return 0; + } + + if (tp + 2 > endp) + return 0; + *tp++ = (u_int8_t) (val >> 8) & 0xff; + *tp++ = (u_int8_t) val & 0xff; + + saw_xdigit = 0; + val = 0; + continue; + } + if (saw_xdigit) { + if (tp + 2 > endp) + return 0; + *tp++ = (u_int8_t) (val >> 8) & 0xff; + *tp++ = (u_int8_t) val & 0xff; + } + if (colonp != NULL) { + /* + * Since some memmove()'s erroneously fail to handle + * overlapping regions, we'll do the shift by hand. + */ + const int n = tp - colonp; + int i; + + if (tp == endp) + return 0; + + for (i = 1; i <= n; i++) { + endp[- i] = colonp[n - i]; + colonp[n - i] = 0; + } + tp = endp; + } + if (tp != endp || (*src != term)) + return 0; + + memcpy(dst->s6_addr, tmp, sizeof(dst->s6_addr)); + return clen; +} + +static int try_number(const char *data, size_t dlen, u_int32_t array[], + int array_size, char sep, char term) +{ + u_int32_t i, len; + + memset(array, 0, sizeof(array[0])*array_size); + + /* Keep data pointing at next char. */ + for (i = 0, len = 0; len < dlen && i < array_size; len++, data++) { + if (*data >= '0' && *data <= '9') { + array[i] = array[i]*10 + *data - '0'; + } + else if (*data == sep) + i++; + else { + /* Unexpected character; true if it's the + terminator and we're finished. */ + if (*data == term && i == array_size - 1) + return len; + + DEBUGP("Char %u (got %u nums) `%u' unexpected\n", + len, i, *data); + return 0; + } + } + DEBUGP("Failed to fill %u numbers separated by %c\n", array_size, sep); + + return 0; +} + +/* Returns 0, or length of numbers: 192,168,1,1,5,6 */ +static int try_rfc959(const char *data, size_t dlen, + struct nf_conntrack_man *cmd, char term) +{ + int length; + u_int32_t array[6]; + + length = try_number(data, dlen, array, 6, ',', term); + if (length == 0) + return 0; + + cmd->u3.ip = htonl((array[0] << 24) | (array[1] << 16) | + (array[2] << 8) | array[3]); + cmd->u.tcp.port = htons((array[4] << 8) | array[5]); + return length; +} + +/* Grab port: number up to delimiter */ +static int get_port(const char *data, int start, size_t dlen, char delim, + u_int16_t *port) +{ + u_int16_t tmp_port = 0; + int i; + + for (i = start; i < dlen; i++) { + /* Finished? */ + if (data[i] == delim) { + if (tmp_port == 0) + break; + *port = htons(tmp_port); + DEBUGP("get_port: return %d\n", tmp_port); + return i + 1; + } + else if (data[i] >= '0' && data[i] <= '9') + tmp_port = tmp_port*10 + data[i] - '0'; + else { /* Some other crap */ + DEBUGP("get_port: invalid char.\n"); + break; + } + } + return 0; +} + +/* Returns 0, or length of numbers: |1|132.235.1.2|6275| or |2|3ffe::1|6275| */ +static int try_eprt(const char *data, size_t dlen, struct nf_conntrack_man *cmd, + char term) +{ + char delim; + int length; + + /* First character is delimiter, then "1" for IPv4 or "2" for IPv6, + then delimiter again. */ + if (dlen <= 3) { + DEBUGP("EPRT: too short\n"); + return 0; + } + delim = data[0]; + if (isdigit(delim) || delim < 33 || delim > 126 || data[2] != delim) { + DEBUGP("try_eprt: invalid delimitter.\n"); + return 0; + } + + if ((cmd->l3num == PF_INET && data[1] != '1') || + (cmd->l3num == PF_INET6 && data[1] != '2')) { + DEBUGP("EPRT: invalid protocol number.\n"); + return 0; + } + + DEBUGP("EPRT: Got %c%c%c\n", delim, data[1], delim); + + if (data[1] == '1') { + u_int32_t array[4]; + + /* Now we have IP address. */ + length = try_number(data + 3, dlen - 3, array, 4, '.', delim); + if (length != 0) + cmd->u3.ip = htonl((array[0] << 24) | (array[1] << 16) + | (array[2] << 8) | array[3]); + } else { + /* Now we have IPv6 address. */ + length = get_ipv6_addr(data + 3, dlen - 3, + (struct in6_addr *)cmd->u3.ip6, delim); + } + + if (length == 0) + return 0; + DEBUGP("EPRT: Got IP address!\n"); + /* Start offset includes initial "|1|", and trailing delimiter */ + return get_port(data, 3 + length + 1, dlen, delim, &cmd->u.tcp.port); +} + +/* Returns 0, or length of numbers: |||6446| */ +static int try_epsv_response(const char *data, size_t dlen, + struct nf_conntrack_man *cmd, char term) +{ + char delim; + + /* Three delimiters. */ + if (dlen <= 3) return 0; + delim = data[0]; + if (isdigit(delim) || delim < 33 || delim > 126 + || data[1] != delim || data[2] != delim) + return 0; + + return get_port(data, 3, dlen, delim, &cmd->u.tcp.port); +} + +/* Return 1 for match, 0 for accept, -1 for partial. */ +static int find_pattern(const char *data, size_t dlen, + const char *pattern, size_t plen, + char skip, char term, + unsigned int *numoff, + unsigned int *numlen, + struct nf_conntrack_man *cmd, + int (*getnum)(const char *, size_t, + struct nf_conntrack_man *, char)) +{ + size_t i; + + DEBUGP("find_pattern `%s': dlen = %u\n", pattern, dlen); + if (dlen == 0) + return 0; + + if (dlen <= plen) { + /* Short packet: try for partial? */ + if (strnicmp(data, pattern, dlen) == 0) + return -1; + else return 0; + } + + if (strnicmp(data, pattern, plen) != 0) { +#if 0 + size_t i; + + DEBUGP("ftp: string mismatch\n"); + for (i = 0; i < plen; i++) { + DEBUGP("ftp:char %u `%c'(%u) vs `%c'(%u)\n", + i, data[i], data[i], + pattern[i], pattern[i]); + } +#endif + return 0; + } + + DEBUGP("Pattern matches!\n"); + /* Now we've found the constant string, try to skip + to the 'skip' character */ + for (i = plen; data[i] != skip; i++) + if (i == dlen - 1) return -1; + + /* Skip over the last character */ + i++; + + DEBUGP("Skipped up to `%c'!\n", skip); + + *numoff = i; + *numlen = getnum(data + i, dlen - i, cmd, term); + if (!*numlen) + return -1; + + DEBUGP("Match succeeded!\n"); + return 1; +} + +/* Look up to see if we're just after a \n. */ +static int find_nl_seq(u32 seq, const struct ip_ct_ftp_master *info, int dir) +{ + unsigned int i; + + for (i = 0; i < info->seq_aft_nl_num[dir]; i++) + if (info->seq_aft_nl[dir][i] == seq) + return 1; + return 0; +} + +/* We don't update if it's older than what we have. */ +static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir, + struct sk_buff *skb) +{ + unsigned int i, oldest = NUM_SEQ_TO_REMEMBER; + + /* Look for oldest: if we find exact match, we're done. */ + for (i = 0; i < info->seq_aft_nl_num[dir]; i++) { + if (info->seq_aft_nl[dir][i] == nl_seq) + return; + + if (oldest == info->seq_aft_nl_num[dir] + || before(info->seq_aft_nl[dir][i], oldest)) + oldest = i; + } + + if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) { + info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq; + nf_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb); + } else if (oldest != NUM_SEQ_TO_REMEMBER) { + info->seq_aft_nl[dir][oldest] = nl_seq; + nf_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb); + } +} + +static int help(struct sk_buff **pskb, + unsigned int protoff, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ + unsigned int dataoff, datalen; + struct tcphdr _tcph, *th; + char *fb_ptr; + int ret; + u32 seq; + int dir = CTINFO2DIR(ctinfo); + unsigned int matchlen, matchoff; + struct ip_ct_ftp_master *ct_ftp_info = &ct->help->ct_ftp_info; + struct nf_conntrack_expect *exp; + struct nf_conntrack_man cmd = {}; + + unsigned int i; + int found = 0, ends_in_nl; + + /* Until there's been traffic both ways, don't look in packets. */ + if (ctinfo != IP_CT_ESTABLISHED + && ctinfo != IP_CT_ESTABLISHED+IP_CT_IS_REPLY) { + DEBUGP("ftp: Conntrackinfo = %u\n", ctinfo); + return NF_ACCEPT; + } + + th = skb_header_pointer(*pskb, protoff, sizeof(_tcph), &_tcph); + if (th == NULL) + return NF_ACCEPT; + + dataoff = protoff + th->doff * 4; + /* No data? */ + if (dataoff >= (*pskb)->len) { + DEBUGP("ftp: dataoff(%u) >= skblen(%u)\n", dataoff, + (*pskb)->len); + return NF_ACCEPT; + } + datalen = (*pskb)->len - dataoff; + + spin_lock_bh(&nf_ftp_lock); + fb_ptr = skb_header_pointer(*pskb, dataoff, datalen, ftp_buffer); + BUG_ON(fb_ptr == NULL); + + ends_in_nl = (fb_ptr[datalen - 1] == '\n'); + seq = ntohl(th->seq) + datalen; + + /* Look up to see if we're just after a \n. */ + if (!find_nl_seq(ntohl(th->seq), ct_ftp_info, dir)) { + /* Now if this ends in \n, update ftp info. */ + DEBUGP("nf_conntrack_ftp_help: wrong seq pos %s(%u) or %s(%u)\n", + ct_ftp_info->seq_aft_nl_num[dir] > 0 ? "" : "(UNSET)", + ct_ftp_info->seq_aft_nl[dir][0], + ct_ftp_info->seq_aft_nl_num[dir] > 1 ? "" : "(UNSET)", + ct_ftp_info->seq_aft_nl[dir][1]); + ret = NF_ACCEPT; + goto out_update_nl; + } + + /* Initialize IP/IPv6 addr to expected address (it's not mentioned + in EPSV responses) */ + cmd.l3num = ct->tuplehash[dir].tuple.src.l3num; + memcpy(cmd.u3.all, &ct->tuplehash[dir].tuple.src.u3.all, + sizeof(cmd.u3.all)); + + for (i = 0; i < ARRAY_SIZE(search); i++) { + if (search[i].dir != dir) continue; + + found = find_pattern(fb_ptr, datalen, + search[i].pattern, + search[i].plen, + search[i].skip, + search[i].term, + &matchoff, &matchlen, + &cmd, + search[i].getnum); + if (found) break; + } + if (found == -1) { + /* We don't usually drop packets. After all, this is + connection tracking, not packet filtering. + However, it is necessary for accurate tracking in + this case. */ + if (net_ratelimit()) + printk("conntrack_ftp: partial %s %u+%u\n", + search[i].pattern, + ntohl(th->seq), datalen); + ret = NF_DROP; + goto out; + } else if (found == 0) { /* No match */ + ret = NF_ACCEPT; + goto out_update_nl; + } + + DEBUGP("conntrack_ftp: match `%.*s' (%u bytes at %u)\n", + (int)matchlen, fb_ptr + matchoff, + matchlen, ntohl(th->seq) + matchoff); + + exp = nf_conntrack_expect_alloc(ct); + if (exp == NULL) { + ret = NF_DROP; + goto out; + } + + /* We refer to the reverse direction ("!dir") tuples here, + * because we're expecting something in the other direction. + * Doesn't matter unless NAT is happening. */ + exp->tuple.dst.u3 = ct->tuplehash[!dir].tuple.dst.u3; + + /* Update the ftp info */ + if ((cmd.l3num == ct->tuplehash[dir].tuple.src.l3num) && + memcmp(&cmd.u3.all, &ct->tuplehash[dir].tuple.src.u3.all, + sizeof(cmd.u3.all))) { + /* Enrico Scholz's passive FTP to partially RNAT'd ftp + server: it really wants us to connect to a + different IP address. Simply don't record it for + NAT. */ + if (cmd.l3num == PF_INET) { + DEBUGP("conntrack_ftp: NOT RECORDING: %u,%u,%u,%u != %u.%u.%u.%u\n", + NIPQUAD(cmd.u3.ip), + NIPQUAD(ct->tuplehash[dir].tuple.src.u3.ip)); + } else { + DEBUGP("conntrack_ftp: NOT RECORDING: %x:%x:%x:%x:%x:%x:%x:%x != %x:%x:%x:%x:%x:%x:%x:%x\n", + NIP6(*((struct in6_addr *)cmd.u3.ip6)), + NIP6(*((struct in6_addr *)ct->tuplehash[dir] + .tuple.src.u3.ip6))); + } + + /* Thanks to Cristiano Lincoln Mattos + <lincoln@cesar.org.br> for reporting this potential + problem (DMZ machines opening holes to internal + networks, or the packet filter itself). */ + if (!loose) { + ret = NF_ACCEPT; + goto out_put_expect; + } + memcpy(&exp->tuple.dst.u3, &cmd.u3.all, + sizeof(exp->tuple.dst.u3)); + } + + exp->tuple.src.u3 = ct->tuplehash[!dir].tuple.src.u3; + exp->tuple.src.l3num = cmd.l3num; + exp->tuple.src.u.tcp.port = 0; + exp->tuple.dst.u.tcp.port = cmd.u.tcp.port; + exp->tuple.dst.protonum = IPPROTO_TCP; + + exp->mask = (struct nf_conntrack_tuple) + { .src = { .l3num = 0xFFFF, + .u = { .tcp = { 0 }}, + }, + .dst = { .protonum = 0xFF, + .u = { .tcp = { 0xFFFF }}, + }, + }; + if (cmd.l3num == PF_INET) { + exp->mask.src.u3.ip = 0xFFFFFFFF; + exp->mask.dst.u3.ip = 0xFFFFFFFF; + } else { + memset(exp->mask.src.u3.ip6, 0xFF, + sizeof(exp->mask.src.u3.ip6)); + memset(exp->mask.dst.u3.ip6, 0xFF, + sizeof(exp->mask.src.u3.ip6)); + } + + exp->expectfn = NULL; + exp->flags = 0; + + /* Now, NAT might want to mangle the packet, and register the + * (possibly changed) expectation itself. */ + if (nf_nat_ftp_hook) + ret = nf_nat_ftp_hook(pskb, ctinfo, search[i].ftptype, + matchoff, matchlen, exp, &seq); + else { + /* Can't expect this? Best to drop packet now. */ + if (nf_conntrack_expect_related(exp) != 0) + ret = NF_DROP; + else + ret = NF_ACCEPT; + } + +out_put_expect: + nf_conntrack_expect_put(exp); + +out_update_nl: + /* Now if this ends in \n, update ftp info. Seq may have been + * adjusted by NAT code. */ + if (ends_in_nl) + update_nl_seq(seq, ct_ftp_info, dir, *pskb); + out: + spin_unlock_bh(&nf_ftp_lock); + return ret; +} + +static struct nf_conntrack_helper ftp[MAX_PORTS][2]; +static char ftp_names[MAX_PORTS][2][sizeof("ftp-65535")]; + +/* don't make this __exit, since it's called from __init ! */ +static void fini(void) +{ + int i, j; + for (i = 0; i < ports_c; i++) { + for (j = 0; j < 2; j++) { + if (ftp[i][j].me == NULL) + continue; + + DEBUGP("nf_ct_ftp: unregistering helper for pf: %d " + "port: %d\n", + ftp[i][j].tuple.src.l3num, ports[i]); + nf_conntrack_helper_unregister(&ftp[i][j]); + } + } + + kfree(ftp_buffer); +} + +static int __init init(void) +{ + int i, j = -1, ret = 0; + char *tmpname; + + ftp_buffer = kmalloc(65536, GFP_KERNEL); + if (!ftp_buffer) + return -ENOMEM; + + if (ports_c == 0) + ports[ports_c++] = FTP_PORT; + + /* FIXME should be configurable whether IPv4 and IPv6 FTP connections + are tracked or not - YK */ + for (i = 0; i < ports_c; i++) { + memset(&ftp[i], 0, sizeof(struct nf_conntrack_helper)); + + ftp[i][0].tuple.src.l3num = PF_INET; + ftp[i][1].tuple.src.l3num = PF_INET6; + for (j = 0; j < 2; j++) { + ftp[i][j].tuple.src.u.tcp.port = htons(ports[i]); + ftp[i][j].tuple.dst.protonum = IPPROTO_TCP; + ftp[i][j].mask.src.u.tcp.port = 0xFFFF; + ftp[i][j].mask.dst.protonum = 0xFF; + ftp[i][j].max_expected = 1; + ftp[i][j].timeout = 5 * 60; /* 5 Minutes */ + ftp[i][j].me = THIS_MODULE; + ftp[i][j].help = help; + tmpname = &ftp_names[i][j][0]; + if (ports[i] == FTP_PORT) + sprintf(tmpname, "ftp"); + else + sprintf(tmpname, "ftp-%d", ports[i]); + ftp[i][j].name = tmpname; + + DEBUGP("nf_ct_ftp: registering helper for pf: %d " + "port: %d\n", + ftp[i][j].tuple.src.l3num, ports[i]); + ret = nf_conntrack_helper_register(&ftp[i][j]); + if (ret) { + printk("nf_ct_ftp: failed to register helper " + " for pf: %d port: %d\n", + ftp[i][j].tuple.src.l3num, ports[i]); + fini(); + return ret; + } + } + } + + return 0; +} + +module_init(init); +module_exit(fini); diff --git a/net/netfilter/nf_conntrack_l3proto_generic.c b/net/netfilter/nf_conntrack_l3proto_generic.c new file mode 100644 index 00000000000..7de4f06c63c --- /dev/null +++ b/net/netfilter/nf_conntrack_l3proto_generic.c @@ -0,0 +1,98 @@ +/* + * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> + * + * Based largely upon the original ip_conntrack code which + * had the following copyright information: + * + * (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Author: + * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/ip.h> +#include <linux/netfilter.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/icmp.h> +#include <linux/sysctl.h> +#include <net/ip.h> + +#include <linux/netfilter_ipv4.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_protocol.h> +#include <net/netfilter/nf_conntrack_l3proto.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/ipv4/nf_conntrack_ipv4.h> + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +DECLARE_PER_CPU(struct nf_conntrack_stat, nf_conntrack_stat); + +static int generic_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, + struct nf_conntrack_tuple *tuple) +{ + memset(&tuple->src.u3, 0, sizeof(tuple->src.u3)); + memset(&tuple->dst.u3, 0, sizeof(tuple->dst.u3)); + + return 1; +} + +static int generic_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + memset(&tuple->src.u3, 0, sizeof(tuple->src.u3)); + memset(&tuple->dst.u3, 0, sizeof(tuple->dst.u3)); + + return 1; +} + +static int generic_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return 0; +} + +static int generic_print_conntrack(struct seq_file *s, + const struct nf_conn *conntrack) +{ + return 0; +} + +static int +generic_prepare(struct sk_buff **pskb, unsigned int hooknum, + unsigned int *dataoff, u_int8_t *protonum) +{ + /* Never track !!! */ + return -NF_ACCEPT; +} + + +static u_int32_t generic_get_features(const struct nf_conntrack_tuple *tuple) + +{ + return NF_CT_F_BASIC; +} + +struct nf_conntrack_l3proto nf_conntrack_generic_l3proto = { + .l3proto = PF_UNSPEC, + .name = "unknown", + .pkt_to_tuple = generic_pkt_to_tuple, + .invert_tuple = generic_invert_tuple, + .print_tuple = generic_print_tuple, + .print_conntrack = generic_print_conntrack, + .prepare = generic_prepare, + .get_features = generic_get_features, + .me = THIS_MODULE, +}; diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c new file mode 100644 index 00000000000..36425f6c833 --- /dev/null +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -0,0 +1,85 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> + * - enable working with L3 protocol independent connection tracking. + * + * Derived from net/ipv4/netfilter/ip_conntrack_proto_generic.c + */ + +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/netfilter.h> +#include <net/netfilter/nf_conntrack_protocol.h> + +unsigned long nf_ct_generic_timeout = 600*HZ; + +static int generic_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +{ + tuple->src.u.all = 0; + tuple->dst.u.all = 0; + + return 1; +} + +static int generic_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + tuple->src.u.all = 0; + tuple->dst.u.all = 0; + + return 1; +} + +/* Print out the per-protocol part of the tuple. */ +static int generic_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return 0; +} + +/* Print out the private part of the conntrack. */ +static int generic_print_conntrack(struct seq_file *s, + const struct nf_conn *state) +{ + return 0; +} + +/* Returns verdict for packet, or -1 for invalid. */ +static int packet(struct nf_conn *conntrack, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + int pf, + unsigned int hooknum) +{ + nf_ct_refresh_acct(conntrack, ctinfo, skb, nf_ct_generic_timeout); + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static int new(struct nf_conn *conntrack, const struct sk_buff *skb, + unsigned int dataoff) +{ + return 1; +} + +struct nf_conntrack_protocol nf_conntrack_generic_protocol = +{ + .l3proto = PF_UNSPEC, + .proto = 0, + .name = "unknown", + .pkt_to_tuple = generic_pkt_to_tuple, + .invert_tuple = generic_invert_tuple, + .print_tuple = generic_print_tuple, + .print_conntrack = generic_print_conntrack, + .packet = packet, + .new = new, +}; diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c new file mode 100644 index 00000000000..3a600f77b4e --- /dev/null +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -0,0 +1,670 @@ +/* + * Connection tracking protocol helper module for SCTP. + * + * SCTP is defined in RFC 2960. References to various sections in this code + * are to this RFC. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 17 Oct 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> + * - enable working with L3 protocol independent connection tracking. + * + * Derived from net/ipv4/ip_conntrack_sctp.c + */ + +/* + * Added support for proc manipulation of timeouts. + */ + +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/netfilter.h> +#include <linux/module.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/sctp.h> +#include <linux/string.h> +#include <linux/seq_file.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_protocol.h> + +#if 0 +#define DEBUGP(format, ...) printk(format, ## __VA_ARGS__) +#else +#define DEBUGP(format, args...) +#endif + +/* Protects conntrack->proto.sctp */ +static DEFINE_RWLOCK(sctp_lock); + +/* FIXME: Examine ipfilter's timeouts and conntrack transitions more + closely. They're more complex. --RR + + And so for me for SCTP :D -Kiran */ + +static const char *sctp_conntrack_names[] = { + "NONE", + "CLOSED", + "COOKIE_WAIT", + "COOKIE_ECHOED", + "ESTABLISHED", + "SHUTDOWN_SENT", + "SHUTDOWN_RECD", + "SHUTDOWN_ACK_SENT", +}; + +#define SECS * HZ +#define MINS * 60 SECS +#define HOURS * 60 MINS +#define DAYS * 24 HOURS + +static unsigned long nf_ct_sctp_timeout_closed = 10 SECS; +static unsigned long nf_ct_sctp_timeout_cookie_wait = 3 SECS; +static unsigned long nf_ct_sctp_timeout_cookie_echoed = 3 SECS; +static unsigned long nf_ct_sctp_timeout_established = 5 DAYS; +static unsigned long nf_ct_sctp_timeout_shutdown_sent = 300 SECS / 1000; +static unsigned long nf_ct_sctp_timeout_shutdown_recd = 300 SECS / 1000; +static unsigned long nf_ct_sctp_timeout_shutdown_ack_sent = 3 SECS; + +static unsigned long * sctp_timeouts[] += { NULL, /* SCTP_CONNTRACK_NONE */ + &nf_ct_sctp_timeout_closed, /* SCTP_CONNTRACK_CLOSED */ + &nf_ct_sctp_timeout_cookie_wait, /* SCTP_CONNTRACK_COOKIE_WAIT */ + &nf_ct_sctp_timeout_cookie_echoed, /* SCTP_CONNTRACK_COOKIE_ECHOED */ + &nf_ct_sctp_timeout_established, /* SCTP_CONNTRACK_ESTABLISHED */ + &nf_ct_sctp_timeout_shutdown_sent, /* SCTP_CONNTRACK_SHUTDOWN_SENT */ + &nf_ct_sctp_timeout_shutdown_recd, /* SCTP_CONNTRACK_SHUTDOWN_RECD */ + &nf_ct_sctp_timeout_shutdown_ack_sent /* SCTP_CONNTRACK_SHUTDOWN_ACK_SENT */ + }; + +#define sNO SCTP_CONNTRACK_NONE +#define sCL SCTP_CONNTRACK_CLOSED +#define sCW SCTP_CONNTRACK_COOKIE_WAIT +#define sCE SCTP_CONNTRACK_COOKIE_ECHOED +#define sES SCTP_CONNTRACK_ESTABLISHED +#define sSS SCTP_CONNTRACK_SHUTDOWN_SENT +#define sSR SCTP_CONNTRACK_SHUTDOWN_RECD +#define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT +#define sIV SCTP_CONNTRACK_MAX + +/* + These are the descriptions of the states: + +NOTE: These state names are tantalizingly similar to the states of an +SCTP endpoint. But the interpretation of the states is a little different, +considering that these are the states of the connection and not of an end +point. Please note the subtleties. -Kiran + +NONE - Nothing so far. +COOKIE WAIT - We have seen an INIT chunk in the original direction, or also + an INIT_ACK chunk in the reply direction. +COOKIE ECHOED - We have seen a COOKIE_ECHO chunk in the original direction. +ESTABLISHED - We have seen a COOKIE_ACK in the reply direction. +SHUTDOWN_SENT - We have seen a SHUTDOWN chunk in the original direction. +SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply directoin. +SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite + to that of the SHUTDOWN chunk. +CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of + the SHUTDOWN chunk. Connection is closed. +*/ + +/* TODO + - I have assumed that the first INIT is in the original direction. + This messes things when an INIT comes in the reply direction in CLOSED + state. + - Check the error type in the reply dir before transitioning from +cookie echoed to closed. + - Sec 5.2.4 of RFC 2960 + - Multi Homing support. +*/ + +/* SCTP conntrack state transitions */ +static enum sctp_conntrack sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = { + { +/* ORIGINAL */ +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */ +/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA}, +/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA}, +/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, +/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA}, +/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA}, +/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant have Stale cookie*/ +/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA},/* 5.2.4 - Big TODO */ +/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant come in orig dir */ +/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL} + }, + { +/* REPLY */ +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */ +/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* INIT in sCL Big TODO */ +/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA}, +/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, +/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA}, +/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA}, +/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA}, +/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant come in reply dir */ +/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA}, +/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL} + } +}; + +static int sctp_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +{ + sctp_sctphdr_t _hdr, *hp; + + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + /* Actually only need first 8 bytes. */ + hp = skb_header_pointer(skb, dataoff, 8, &_hdr); + if (hp == NULL) + return 0; + + tuple->src.u.sctp.port = hp->source; + tuple->dst.u.sctp.port = hp->dest; + return 1; +} + +static int sctp_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + tuple->src.u.sctp.port = orig->dst.u.sctp.port; + tuple->dst.u.sctp.port = orig->src.u.sctp.port; + return 1; +} + +/* Print out the per-protocol part of the tuple. */ +static int sctp_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + return seq_printf(s, "sport=%hu dport=%hu ", + ntohs(tuple->src.u.sctp.port), + ntohs(tuple->dst.u.sctp.port)); +} + +/* Print out the private part of the conntrack. */ +static int sctp_print_conntrack(struct seq_file *s, + const struct nf_conn *conntrack) +{ + enum sctp_conntrack state; + + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + read_lock_bh(&sctp_lock); + state = conntrack->proto.sctp.state; + read_unlock_bh(&sctp_lock); + + return seq_printf(s, "%s ", sctp_conntrack_names[state]); +} + +#define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) \ +for (offset = dataoff + sizeof(sctp_sctphdr_t), count = 0; \ + offset < skb->len && \ + (sch = skb_header_pointer(skb, offset, sizeof(_sch), &_sch)); \ + offset += (htons(sch->length) + 3) & ~3, count++) + +/* Some validity checks to make sure the chunks are fine */ +static int do_basic_checks(struct nf_conn *conntrack, + const struct sk_buff *skb, + unsigned int dataoff, + char *map) +{ + u_int32_t offset, count; + sctp_chunkhdr_t _sch, *sch; + int flag; + + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + flag = 0; + + for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { + DEBUGP("Chunk Num: %d Type: %d\n", count, sch->type); + + if (sch->type == SCTP_CID_INIT + || sch->type == SCTP_CID_INIT_ACK + || sch->type == SCTP_CID_SHUTDOWN_COMPLETE) { + flag = 1; + } + + /* Cookie Ack/Echo chunks not the first OR + Init / Init Ack / Shutdown compl chunks not the only chunks */ + if ((sch->type == SCTP_CID_COOKIE_ACK + || sch->type == SCTP_CID_COOKIE_ECHO + || flag) + && count !=0 ) { + DEBUGP("Basic checks failed\n"); + return 1; + } + + if (map) { + set_bit(sch->type, (void *)map); + } + } + + DEBUGP("Basic checks passed\n"); + return 0; +} + +static int new_state(enum ip_conntrack_dir dir, + enum sctp_conntrack cur_state, + int chunk_type) +{ + int i; + + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + DEBUGP("Chunk type: %d\n", chunk_type); + + switch (chunk_type) { + case SCTP_CID_INIT: + DEBUGP("SCTP_CID_INIT\n"); + i = 0; break; + case SCTP_CID_INIT_ACK: + DEBUGP("SCTP_CID_INIT_ACK\n"); + i = 1; break; + case SCTP_CID_ABORT: + DEBUGP("SCTP_CID_ABORT\n"); + i = 2; break; + case SCTP_CID_SHUTDOWN: + DEBUGP("SCTP_CID_SHUTDOWN\n"); + i = 3; break; + case SCTP_CID_SHUTDOWN_ACK: + DEBUGP("SCTP_CID_SHUTDOWN_ACK\n"); + i = 4; break; + case SCTP_CID_ERROR: + DEBUGP("SCTP_CID_ERROR\n"); + i = 5; break; + case SCTP_CID_COOKIE_ECHO: + DEBUGP("SCTP_CID_COOKIE_ECHO\n"); + i = 6; break; + case SCTP_CID_COOKIE_ACK: + DEBUGP("SCTP_CID_COOKIE_ACK\n"); + i = 7; break; + case SCTP_CID_SHUTDOWN_COMPLETE: + DEBUGP("SCTP_CID_SHUTDOWN_COMPLETE\n"); + i = 8; break; + default: + /* Other chunks like DATA, SACK, HEARTBEAT and + its ACK do not cause a change in state */ + DEBUGP("Unknown chunk type, Will stay in %s\n", + sctp_conntrack_names[cur_state]); + return cur_state; + } + + DEBUGP("dir: %d cur_state: %s chunk_type: %d new_state: %s\n", + dir, sctp_conntrack_names[cur_state], chunk_type, + sctp_conntrack_names[sctp_conntracks[dir][i][cur_state]]); + + return sctp_conntracks[dir][i][cur_state]; +} + +/* Returns verdict for packet, or -1 for invalid. */ +static int sctp_packet(struct nf_conn *conntrack, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + int pf, + unsigned int hooknum) +{ + enum sctp_conntrack newconntrack, oldsctpstate; + sctp_sctphdr_t _sctph, *sh; + sctp_chunkhdr_t _sch, *sch; + u_int32_t offset, count; + char map[256 / sizeof (char)] = {0}; + + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph); + if (sh == NULL) + return -1; + + if (do_basic_checks(conntrack, skb, dataoff, map) != 0) + return -1; + + /* Check the verification tag (Sec 8.5) */ + if (!test_bit(SCTP_CID_INIT, (void *)map) + && !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, (void *)map) + && !test_bit(SCTP_CID_COOKIE_ECHO, (void *)map) + && !test_bit(SCTP_CID_ABORT, (void *)map) + && !test_bit(SCTP_CID_SHUTDOWN_ACK, (void *)map) + && (sh->vtag != conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) { + DEBUGP("Verification tag check failed\n"); + return -1; + } + + oldsctpstate = newconntrack = SCTP_CONNTRACK_MAX; + for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { + write_lock_bh(&sctp_lock); + + /* Special cases of Verification tag check (Sec 8.5.1) */ + if (sch->type == SCTP_CID_INIT) { + /* Sec 8.5.1 (A) */ + if (sh->vtag != 0) { + write_unlock_bh(&sctp_lock); + return -1; + } + } else if (sch->type == SCTP_CID_ABORT) { + /* Sec 8.5.1 (B) */ + if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)]) + && !(sh->vtag == conntrack->proto.sctp.vtag + [1 - CTINFO2DIR(ctinfo)])) { + write_unlock_bh(&sctp_lock); + return -1; + } + } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) { + /* Sec 8.5.1 (C) */ + if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)]) + && !(sh->vtag == conntrack->proto.sctp.vtag + [1 - CTINFO2DIR(ctinfo)] + && (sch->flags & 1))) { + write_unlock_bh(&sctp_lock); + return -1; + } + } else if (sch->type == SCTP_CID_COOKIE_ECHO) { + /* Sec 8.5.1 (D) */ + if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) { + write_unlock_bh(&sctp_lock); + return -1; + } + } + + oldsctpstate = conntrack->proto.sctp.state; + newconntrack = new_state(CTINFO2DIR(ctinfo), oldsctpstate, sch->type); + + /* Invalid */ + if (newconntrack == SCTP_CONNTRACK_MAX) { + DEBUGP("nf_conntrack_sctp: Invalid dir=%i ctype=%u conntrack=%u\n", + CTINFO2DIR(ctinfo), sch->type, oldsctpstate); + write_unlock_bh(&sctp_lock); + return -1; + } + + /* If it is an INIT or an INIT ACK note down the vtag */ + if (sch->type == SCTP_CID_INIT + || sch->type == SCTP_CID_INIT_ACK) { + sctp_inithdr_t _inithdr, *ih; + + ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t), + sizeof(_inithdr), &_inithdr); + if (ih == NULL) { + write_unlock_bh(&sctp_lock); + return -1; + } + DEBUGP("Setting vtag %x for dir %d\n", + ih->init_tag, !CTINFO2DIR(ctinfo)); + conntrack->proto.sctp.vtag[!CTINFO2DIR(ctinfo)] = ih->init_tag; + } + + conntrack->proto.sctp.state = newconntrack; + if (oldsctpstate != newconntrack) + nf_conntrack_event_cache(IPCT_PROTOINFO, skb); + write_unlock_bh(&sctp_lock); + } + + nf_ct_refresh_acct(conntrack, ctinfo, skb, *sctp_timeouts[newconntrack]); + + if (oldsctpstate == SCTP_CONNTRACK_COOKIE_ECHOED + && CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY + && newconntrack == SCTP_CONNTRACK_ESTABLISHED) { + DEBUGP("Setting assured bit\n"); + set_bit(IPS_ASSURED_BIT, &conntrack->status); + nf_conntrack_event_cache(IPCT_STATUS, skb); + } + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static int sctp_new(struct nf_conn *conntrack, const struct sk_buff *skb, + unsigned int dataoff) +{ + enum sctp_conntrack newconntrack; + sctp_sctphdr_t _sctph, *sh; + sctp_chunkhdr_t _sch, *sch; + u_int32_t offset, count; + char map[256 / sizeof (char)] = {0}; + + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph); + if (sh == NULL) + return 0; + + if (do_basic_checks(conntrack, skb, dataoff, map) != 0) + return 0; + + /* If an OOTB packet has any of these chunks discard (Sec 8.4) */ + if ((test_bit (SCTP_CID_ABORT, (void *)map)) + || (test_bit (SCTP_CID_SHUTDOWN_COMPLETE, (void *)map)) + || (test_bit (SCTP_CID_COOKIE_ACK, (void *)map))) { + return 0; + } + + newconntrack = SCTP_CONNTRACK_MAX; + for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { + /* Don't need lock here: this conntrack not in circulation yet */ + newconntrack = new_state(IP_CT_DIR_ORIGINAL, + SCTP_CONNTRACK_NONE, sch->type); + + /* Invalid: delete conntrack */ + if (newconntrack == SCTP_CONNTRACK_MAX) { + DEBUGP("nf_conntrack_sctp: invalid new deleting.\n"); + return 0; + } + + /* Copy the vtag into the state info */ + if (sch->type == SCTP_CID_INIT) { + if (sh->vtag == 0) { + sctp_inithdr_t _inithdr, *ih; + + ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t), + sizeof(_inithdr), &_inithdr); + if (ih == NULL) + return 0; + + DEBUGP("Setting vtag %x for new conn\n", + ih->init_tag); + + conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] = + ih->init_tag; + } else { + /* Sec 8.5.1 (A) */ + return 0; + } + } + /* If it is a shutdown ack OOTB packet, we expect a return + shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */ + else { + DEBUGP("Setting vtag %x for new conn OOTB\n", + sh->vtag); + conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag; + } + + conntrack->proto.sctp.state = newconntrack; + } + + return 1; +} + +struct nf_conntrack_protocol nf_conntrack_protocol_sctp4 = { + .l3proto = PF_INET, + .proto = IPPROTO_SCTP, + .name = "sctp", + .pkt_to_tuple = sctp_pkt_to_tuple, + .invert_tuple = sctp_invert_tuple, + .print_tuple = sctp_print_tuple, + .print_conntrack = sctp_print_conntrack, + .packet = sctp_packet, + .new = sctp_new, + .destroy = NULL, + .me = THIS_MODULE +}; + +struct nf_conntrack_protocol nf_conntrack_protocol_sctp6 = { + .l3proto = PF_INET6, + .proto = IPPROTO_SCTP, + .name = "sctp", + .pkt_to_tuple = sctp_pkt_to_tuple, + .invert_tuple = sctp_invert_tuple, + .print_tuple = sctp_print_tuple, + .print_conntrack = sctp_print_conntrack, + .packet = sctp_packet, + .new = sctp_new, + .destroy = NULL, + .me = THIS_MODULE +}; + +#ifdef CONFIG_SYSCTL +static ctl_table nf_ct_sysctl_table[] = { + { + .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED, + .procname = "nf_conntrack_sctp_timeout_closed", + .data = &nf_ct_sctp_timeout_closed, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT, + .procname = "nf_conntrack_sctp_timeout_cookie_wait", + .data = &nf_ct_sctp_timeout_cookie_wait, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED, + .procname = "nf_conntrack_sctp_timeout_cookie_echoed", + .data = &nf_ct_sctp_timeout_cookie_echoed, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED, + .procname = "nf_conntrack_sctp_timeout_established", + .data = &nf_ct_sctp_timeout_established, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT, + .procname = "nf_conntrack_sctp_timeout_shutdown_sent", + .data = &nf_ct_sctp_timeout_shutdown_sent, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD, + .procname = "nf_conntrack_sctp_timeout_shutdown_recd", + .data = &nf_ct_sctp_timeout_shutdown_recd, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT, + .procname = "nf_conntrack_sctp_timeout_shutdown_ack_sent", + .data = &nf_ct_sctp_timeout_shutdown_ack_sent, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { .ctl_name = 0 } +}; + +static ctl_table nf_ct_netfilter_table[] = { + { + .ctl_name = NET_NETFILTER, + .procname = "netfilter", + .mode = 0555, + .child = nf_ct_sysctl_table, + }, + { .ctl_name = 0 } +}; + +static ctl_table nf_ct_net_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = nf_ct_netfilter_table, + }, + { .ctl_name = 0 } +}; + +static struct ctl_table_header *nf_ct_sysctl_header; +#endif + +int __init init(void) +{ + int ret; + + ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_sctp4); + if (ret) { + printk("nf_conntrack_proto_sctp4: protocol register failed\n"); + goto out; + } + ret = nf_conntrack_protocol_register(&nf_conntrack_protocol_sctp6); + if (ret) { + printk("nf_conntrack_proto_sctp6: protocol register failed\n"); + goto cleanup_sctp4; + } + +#ifdef CONFIG_SYSCTL + nf_ct_sysctl_header = register_sysctl_table(nf_ct_net_table, 0); + if (nf_ct_sysctl_header == NULL) { + printk("nf_conntrack_proto_sctp: can't register to sysctl.\n"); + goto cleanup; + } +#endif + + return ret; + +#ifdef CONFIG_SYSCTL + cleanup: + nf_conntrack_protocol_unregister(&nf_conntrack_protocol_sctp6); +#endif + cleanup_sctp4: + nf_conntrack_protocol_unregister(&nf_conntrack_protocol_sctp4); + out: + DEBUGP("SCTP conntrack module loading %s\n", + ret ? "failed": "succeeded"); + return ret; +} + +void __exit fini(void) +{ + nf_conntrack_protocol_unregister(&nf_conntrack_protocol_sctp6); + nf_conntrack_protocol_unregister(&nf_conntrack_protocol_sctp4); +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(nf_ct_sysctl_header); +#endif + DEBUGP("SCTP conntrack module unloaded\n"); +} + +module_init(init); +module_exit(fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Kiran Kumar Immidi"); +MODULE_DESCRIPTION("Netfilter connection tracking protocol helper for SCTP"); diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c new file mode 100644 index 00000000000..5a6fcf349bd --- /dev/null +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -0,0 +1,1169 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>: + * - Real stateful connection tracking + * - Modified state transitions table + * - Window scaling support added + * - SACK support added + * + * Willy Tarreau: + * - State table bugfixes + * - More robust state changes + * - Tuning timer parameters + * + * 27 Oct 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> + * - genelized Layer 3 protocol part. + * + * Derived from net/ipv4/netfilter/ip_conntrack_proto_tcp.c + * + * version 2.2 + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/netfilter.h> +#include <linux/module.h> +#include <linux/in.h> +#include <linux/tcp.h> +#include <linux/spinlock.h> +#include <linux/skbuff.h> +#include <linux/ipv6.h> +#include <net/ip6_checksum.h> + +#include <net/tcp.h> + +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv6.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_protocol.h> + +#if 0 +#define DEBUGP printk +#define DEBUGP_VARS +#else +#define DEBUGP(format, args...) +#endif + +/* Protects conntrack->proto.tcp */ +static DEFINE_RWLOCK(tcp_lock); + +/* "Be conservative in what you do, + be liberal in what you accept from others." + If it's non-zero, we mark only out of window RST segments as INVALID. */ +int nf_ct_tcp_be_liberal = 0; + +/* When connection is picked up from the middle, how many packets are required + to pass in each direction when we assume we are in sync - if any side uses + window scaling, we lost the game. + If it is set to zero, we disable picking up already established + connections. */ +int nf_ct_tcp_loose = 3; + +/* Max number of the retransmitted packets without receiving an (acceptable) + ACK from the destination. If this number is reached, a shorter timer + will be started. */ +int nf_ct_tcp_max_retrans = 3; + + /* FIXME: Examine ipfilter's timeouts and conntrack transitions more + closely. They're more complex. --RR */ + +static const char *tcp_conntrack_names[] = { + "NONE", + "SYN_SENT", + "SYN_RECV", + "ESTABLISHED", + "FIN_WAIT", + "CLOSE_WAIT", + "LAST_ACK", + "TIME_WAIT", + "CLOSE", + "LISTEN" +}; + +#define SECS * HZ +#define MINS * 60 SECS +#define HOURS * 60 MINS +#define DAYS * 24 HOURS + +unsigned long nf_ct_tcp_timeout_syn_sent = 2 MINS; +unsigned long nf_ct_tcp_timeout_syn_recv = 60 SECS; +unsigned long nf_ct_tcp_timeout_established = 5 DAYS; +unsigned long nf_ct_tcp_timeout_fin_wait = 2 MINS; +unsigned long nf_ct_tcp_timeout_close_wait = 60 SECS; +unsigned long nf_ct_tcp_timeout_last_ack = 30 SECS; +unsigned long nf_ct_tcp_timeout_time_wait = 2 MINS; +unsigned long nf_ct_tcp_timeout_close = 10 SECS; + +/* RFC1122 says the R2 limit should be at least 100 seconds. + Linux uses 15 packets as limit, which corresponds + to ~13-30min depending on RTO. */ +unsigned long nf_ct_tcp_timeout_max_retrans = 5 MINS; + +static unsigned long * tcp_timeouts[] += { NULL, /* TCP_CONNTRACK_NONE */ + &nf_ct_tcp_timeout_syn_sent, /* TCP_CONNTRACK_SYN_SENT, */ + &nf_ct_tcp_timeout_syn_recv, /* TCP_CONNTRACK_SYN_RECV, */ + &nf_ct_tcp_timeout_established, /* TCP_CONNTRACK_ESTABLISHED, */ + &nf_ct_tcp_timeout_fin_wait, /* TCP_CONNTRACK_FIN_WAIT, */ + &nf_ct_tcp_timeout_close_wait, /* TCP_CONNTRACK_CLOSE_WAIT, */ + &nf_ct_tcp_timeout_last_ack, /* TCP_CONNTRACK_LAST_ACK, */ + &nf_ct_tcp_timeout_time_wait, /* TCP_CONNTRACK_TIME_WAIT, */ + &nf_ct_tcp_timeout_close, /* TCP_CONNTRACK_CLOSE, */ + NULL, /* TCP_CONNTRACK_LISTEN */ + }; + +#define sNO TCP_CONNTRACK_NONE +#define sSS TCP_CONNTRACK_SYN_SENT +#define sSR TCP_CONNTRACK_SYN_RECV +#define sES TCP_CONNTRACK_ESTABLISHED +#define sFW TCP_CONNTRACK_FIN_WAIT +#define sCW TCP_CONNTRACK_CLOSE_WAIT +#define sLA TCP_CONNTRACK_LAST_ACK +#define sTW TCP_CONNTRACK_TIME_WAIT +#define sCL TCP_CONNTRACK_CLOSE +#define sLI TCP_CONNTRACK_LISTEN +#define sIV TCP_CONNTRACK_MAX +#define sIG TCP_CONNTRACK_IGNORE + +/* What TCP flags are set from RST/SYN/FIN/ACK. */ +enum tcp_bit_set { + TCP_SYN_SET, + TCP_SYNACK_SET, + TCP_FIN_SET, + TCP_ACK_SET, + TCP_RST_SET, + TCP_NONE_SET, +}; + +/* + * The TCP state transition table needs a few words... + * + * We are the man in the middle. All the packets go through us + * but might get lost in transit to the destination. + * It is assumed that the destinations can't receive segments + * we haven't seen. + * + * The checked segment is in window, but our windows are *not* + * equivalent with the ones of the sender/receiver. We always + * try to guess the state of the current sender. + * + * The meaning of the states are: + * + * NONE: initial state + * SYN_SENT: SYN-only packet seen + * SYN_RECV: SYN-ACK packet seen + * ESTABLISHED: ACK packet seen + * FIN_WAIT: FIN packet seen + * CLOSE_WAIT: ACK seen (after FIN) + * LAST_ACK: FIN seen (after FIN) + * TIME_WAIT: last ACK seen + * CLOSE: closed connection + * + * LISTEN state is not used. + * + * Packets marked as IGNORED (sIG): + * if they may be either invalid or valid + * and the receiver may send back a connection + * closing RST or a SYN/ACK. + * + * Packets marked as INVALID (sIV): + * if they are invalid + * or we do not support the request (simultaneous open) + */ +static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { + { +/* ORIGINAL */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*syn*/ { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sIV }, +/* + * sNO -> sSS Initialize a new connection + * sSS -> sSS Retransmitted SYN + * sSR -> sIG Late retransmitted SYN? + * sES -> sIG Error: SYNs in window outside the SYN_SENT state + * are errors. Receiver will reply with RST + * and close the connection. + * Or we are not in sync and hold a dead connection. + * sFW -> sIG + * sCW -> sIG + * sLA -> sIG + * sTW -> sSS Reopened connection (RFC 1122). + * sCL -> sSS + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*synack*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }, +/* + * A SYN/ACK from the client is always invalid: + * - either it tries to set up a simultaneous open, which is + * not supported; + * - or the firewall has just been inserted between the two hosts + * during the session set-up. The SYN will be retransmitted + * by the true client (or it'll time out). + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, +/* + * sNO -> sIV Too late and no reason to do anything... + * sSS -> sIV Client migth not send FIN in this state: + * we enforce waiting for a SYN/ACK reply first. + * sSR -> sFW Close started. + * sES -> sFW + * sFW -> sLA FIN seen in both directions, waiting for + * the last ACK. + * Migth be a retransmitted FIN as well... + * sCW -> sLA + * sLA -> sLA Retransmitted FIN. Remain in the same state. + * sTW -> sTW + * sCL -> sCL + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*ack*/ { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV }, +/* + * sNO -> sES Assumed. + * sSS -> sIV ACK is invalid: we haven't seen a SYN/ACK yet. + * sSR -> sES Established state is reached. + * sES -> sES :-) + * sFW -> sCW Normal close request answered by ACK. + * sCW -> sCW + * sLA -> sTW Last ACK detected. + * sTW -> sTW Retransmitted last ACK. Remain in the same state. + * sCL -> sCL + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV }, +/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } + }, + { +/* REPLY */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*syn*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }, +/* + * sNO -> sIV Never reached. + * sSS -> sIV Simultaneous open, not supported + * sSR -> sIV Simultaneous open, not supported. + * sES -> sIV Server may not initiate a connection. + * sFW -> sIV + * sCW -> sIV + * sLA -> sIV + * sTW -> sIV Reopened connection, but server may not do it. + * sCL -> sIV + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*synack*/ { sIV, sSR, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIV }, +/* + * sSS -> sSR Standard open. + * sSR -> sSR Retransmitted SYN/ACK. + * sES -> sIG Late retransmitted SYN/ACK? + * sFW -> sIG Might be SYN/ACK answering ignored SYN + * sCW -> sIG + * sLA -> sIG + * sTW -> sIG + * sCL -> sIG + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, +/* + * sSS -> sIV Server might not send FIN in this state. + * sSR -> sFW Close started. + * sES -> sFW + * sFW -> sLA FIN seen in both directions. + * sCW -> sLA + * sLA -> sLA Retransmitted FIN. + * sTW -> sTW + * sCL -> sCL + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*ack*/ { sIV, sIV, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIV }, +/* + * sSS -> sIV Might be a half-open connection. + * sSR -> sSR Might answer late resent SYN. + * sES -> sES :-) + * sFW -> sCW Normal close request answered by ACK. + * sCW -> sCW + * sLA -> sTW Last ACK detected. + * sTW -> sTW Retransmitted last ACK. + * sCL -> sCL + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV }, +/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } + } +}; + +static int tcp_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +{ + struct tcphdr _hdr, *hp; + + /* Actually only need first 8 bytes. */ + hp = skb_header_pointer(skb, dataoff, 8, &_hdr); + if (hp == NULL) + return 0; + + tuple->src.u.tcp.port = hp->source; + tuple->dst.u.tcp.port = hp->dest; + + return 1; +} + +static int tcp_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + tuple->src.u.tcp.port = orig->dst.u.tcp.port; + tuple->dst.u.tcp.port = orig->src.u.tcp.port; + return 1; +} + +/* Print out the per-protocol part of the tuple. */ +static int tcp_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return seq_printf(s, "sport=%hu dport=%hu ", + ntohs(tuple->src.u.tcp.port), + ntohs(tuple->dst.u.tcp.port)); +} + +/* Print out the private part of the conntrack. */ +static int tcp_print_conntrack(struct seq_file *s, + const struct nf_conn *conntrack) +{ + enum tcp_conntrack state; + + read_lock_bh(&tcp_lock); + state = conntrack->proto.tcp.state; + read_unlock_bh(&tcp_lock); + + return seq_printf(s, "%s ", tcp_conntrack_names[state]); +} + +static unsigned int get_conntrack_index(const struct tcphdr *tcph) +{ + if (tcph->rst) return TCP_RST_SET; + else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET); + else if (tcph->fin) return TCP_FIN_SET; + else if (tcph->ack) return TCP_ACK_SET; + else return TCP_NONE_SET; +} + +/* TCP connection tracking based on 'Real Stateful TCP Packet Filtering + in IP Filter' by Guido van Rooij. + + http://www.nluug.nl/events/sane2000/papers.html + http://www.iae.nl/users/guido/papers/tcp_filtering.ps.gz + + The boundaries and the conditions are changed according to RFC793: + the packet must intersect the window (i.e. segments may be + after the right or before the left edge) and thus receivers may ACK + segments after the right edge of the window. + + td_maxend = max(sack + max(win,1)) seen in reply packets + td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets + td_maxwin += seq + len - sender.td_maxend + if seq + len > sender.td_maxend + td_end = max(seq + len) seen in sent packets + + I. Upper bound for valid data: seq <= sender.td_maxend + II. Lower bound for valid data: seq + len >= sender.td_end - receiver.td_maxwin + III. Upper bound for valid ack: sack <= receiver.td_end + IV. Lower bound for valid ack: ack >= receiver.td_end - MAXACKWINDOW + + where sack is the highest right edge of sack block found in the packet. + + The upper bound limit for a valid ack is not ignored - + we doesn't have to deal with fragments. +*/ + +static inline __u32 segment_seq_plus_len(__u32 seq, + size_t len, + unsigned int dataoff, + struct tcphdr *tcph) +{ + /* XXX Should I use payload length field in IP/IPv6 header ? + * - YK */ + return (seq + len - dataoff - tcph->doff*4 + + (tcph->syn ? 1 : 0) + (tcph->fin ? 1 : 0)); +} + +/* Fixme: what about big packets? */ +#define MAXACKWINCONST 66000 +#define MAXACKWINDOW(sender) \ + ((sender)->td_maxwin > MAXACKWINCONST ? (sender)->td_maxwin \ + : MAXACKWINCONST) + +/* + * Simplified tcp_parse_options routine from tcp_input.c + */ +static void tcp_options(const struct sk_buff *skb, + unsigned int dataoff, + struct tcphdr *tcph, + struct ip_ct_tcp_state *state) +{ + unsigned char buff[(15 * 4) - sizeof(struct tcphdr)]; + unsigned char *ptr; + int length = (tcph->doff*4) - sizeof(struct tcphdr); + + if (!length) + return; + + ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr), + length, buff); + BUG_ON(ptr == NULL); + + state->td_scale = + state->flags = 0; + + while (length > 0) { + int opcode=*ptr++; + int opsize; + + switch (opcode) { + case TCPOPT_EOL: + return; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + opsize=*ptr++; + if (opsize < 2) /* "silly options" */ + return; + if (opsize > length) + break; /* don't parse partial options */ + + if (opcode == TCPOPT_SACK_PERM + && opsize == TCPOLEN_SACK_PERM) + state->flags |= IP_CT_TCP_FLAG_SACK_PERM; + else if (opcode == TCPOPT_WINDOW + && opsize == TCPOLEN_WINDOW) { + state->td_scale = *(u_int8_t *)ptr; + + if (state->td_scale > 14) { + /* See RFC1323 */ + state->td_scale = 14; + } + state->flags |= + IP_CT_TCP_FLAG_WINDOW_SCALE; + } + ptr += opsize - 2; + length -= opsize; + } + } +} + +static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff, + struct tcphdr *tcph, __u32 *sack) +{ + unsigned char buff[(15 * 4) - sizeof(struct tcphdr)]; + unsigned char *ptr; + int length = (tcph->doff*4) - sizeof(struct tcphdr); + __u32 tmp; + + if (!length) + return; + + ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr), + length, buff); + BUG_ON(ptr == NULL); + + /* Fast path for timestamp-only option */ + if (length == TCPOLEN_TSTAMP_ALIGNED*4 + && *(__u32 *)ptr == + __constant_ntohl((TCPOPT_NOP << 24) + | (TCPOPT_NOP << 16) + | (TCPOPT_TIMESTAMP << 8) + | TCPOLEN_TIMESTAMP)) + return; + + while (length > 0) { + int opcode = *ptr++; + int opsize, i; + + switch (opcode) { + case TCPOPT_EOL: + return; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + opsize = *ptr++; + if (opsize < 2) /* "silly options" */ + return; + if (opsize > length) + break; /* don't parse partial options */ + + if (opcode == TCPOPT_SACK + && opsize >= (TCPOLEN_SACK_BASE + + TCPOLEN_SACK_PERBLOCK) + && !((opsize - TCPOLEN_SACK_BASE) + % TCPOLEN_SACK_PERBLOCK)) { + for (i = 0; + i < (opsize - TCPOLEN_SACK_BASE); + i += TCPOLEN_SACK_PERBLOCK) { + memcpy(&tmp, (__u32 *)(ptr + i) + 1, + sizeof(__u32)); + tmp = ntohl(tmp); + + if (after(tmp, *sack)) + *sack = tmp; + } + return; + } + ptr += opsize - 2; + length -= opsize; + } + } +} + +static int tcp_in_window(struct ip_ct_tcp *state, + enum ip_conntrack_dir dir, + unsigned int index, + const struct sk_buff *skb, + unsigned int dataoff, + struct tcphdr *tcph, + int pf) +{ + struct ip_ct_tcp_state *sender = &state->seen[dir]; + struct ip_ct_tcp_state *receiver = &state->seen[!dir]; + __u32 seq, ack, sack, end, win, swin; + int res; + + /* + * Get the required data from the packet. + */ + seq = ntohl(tcph->seq); + ack = sack = ntohl(tcph->ack_seq); + win = ntohs(tcph->window); + end = segment_seq_plus_len(seq, skb->len, dataoff, tcph); + + if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM) + tcp_sack(skb, dataoff, tcph, &sack); + + DEBUGP("tcp_in_window: START\n"); + DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu " + "seq=%u ack=%u sack=%u win=%u end=%u\n", + NIPQUAD(iph->saddr), ntohs(tcph->source), + NIPQUAD(iph->daddr), ntohs(tcph->dest), + seq, ack, sack, win, end); + DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " + "receiver end=%u maxend=%u maxwin=%u scale=%i\n", + sender->td_end, sender->td_maxend, sender->td_maxwin, + sender->td_scale, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin, + receiver->td_scale); + + if (sender->td_end == 0) { + /* + * Initialize sender data. + */ + if (tcph->syn && tcph->ack) { + /* + * Outgoing SYN-ACK in reply to a SYN. + */ + sender->td_end = + sender->td_maxend = end; + sender->td_maxwin = (win == 0 ? 1 : win); + + tcp_options(skb, dataoff, tcph, sender); + /* + * RFC 1323: + * Both sides must send the Window Scale option + * to enable window scaling in either direction. + */ + if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE + && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE)) + sender->td_scale = + receiver->td_scale = 0; + } else { + /* + * We are in the middle of a connection, + * its history is lost for us. + * Let's try to use the data from the packet. + */ + sender->td_end = end; + sender->td_maxwin = (win == 0 ? 1 : win); + sender->td_maxend = end + sender->td_maxwin; + } + } else if (((state->state == TCP_CONNTRACK_SYN_SENT + && dir == IP_CT_DIR_ORIGINAL) + || (state->state == TCP_CONNTRACK_SYN_RECV + && dir == IP_CT_DIR_REPLY)) + && after(end, sender->td_end)) { + /* + * RFC 793: "if a TCP is reinitialized ... then it need + * not wait at all; it must only be sure to use sequence + * numbers larger than those recently used." + */ + sender->td_end = + sender->td_maxend = end; + sender->td_maxwin = (win == 0 ? 1 : win); + + tcp_options(skb, dataoff, tcph, sender); + } + + if (!(tcph->ack)) { + /* + * If there is no ACK, just pretend it was set and OK. + */ + ack = sack = receiver->td_end; + } else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) == + (TCP_FLAG_ACK|TCP_FLAG_RST)) + && (ack == 0)) { + /* + * Broken TCP stacks, that set ACK in RST packets as well + * with zero ack value. + */ + ack = sack = receiver->td_end; + } + + if (seq == end + && (!tcph->rst + || (seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT))) + /* + * Packets contains no data: we assume it is valid + * and check the ack value only. + * However RST segments are always validated by their + * SEQ number, except when seq == 0 (reset sent answering + * SYN. + */ + seq = end = sender->td_end; + + DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu " + "seq=%u ack=%u sack =%u win=%u end=%u\n", + NIPQUAD(iph->saddr), ntohs(tcph->source), + NIPQUAD(iph->daddr), ntohs(tcph->dest), + seq, ack, sack, win, end); + DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " + "receiver end=%u maxend=%u maxwin=%u scale=%i\n", + sender->td_end, sender->td_maxend, sender->td_maxwin, + sender->td_scale, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin, + receiver->td_scale); + + DEBUGP("tcp_in_window: I=%i II=%i III=%i IV=%i\n", + before(seq, sender->td_maxend + 1), + after(end, sender->td_end - receiver->td_maxwin - 1), + before(sack, receiver->td_end + 1), + after(ack, receiver->td_end - MAXACKWINDOW(sender))); + + if (sender->loose || receiver->loose || + (before(seq, sender->td_maxend + 1) && + after(end, sender->td_end - receiver->td_maxwin - 1) && + before(sack, receiver->td_end + 1) && + after(ack, receiver->td_end - MAXACKWINDOW(sender)))) { + /* + * Take into account window scaling (RFC 1323). + */ + if (!tcph->syn) + win <<= sender->td_scale; + + /* + * Update sender data. + */ + swin = win + (sack - ack); + if (sender->td_maxwin < swin) + sender->td_maxwin = swin; + if (after(end, sender->td_end)) + sender->td_end = end; + /* + * Update receiver data. + */ + if (after(end, sender->td_maxend)) + receiver->td_maxwin += end - sender->td_maxend; + if (after(sack + win, receiver->td_maxend - 1)) { + receiver->td_maxend = sack + win; + if (win == 0) + receiver->td_maxend++; + } + + /* + * Check retransmissions. + */ + if (index == TCP_ACK_SET) { + if (state->last_dir == dir + && state->last_seq == seq + && state->last_ack == ack + && state->last_end == end) + state->retrans++; + else { + state->last_dir = dir; + state->last_seq = seq; + state->last_ack = ack; + state->last_end = end; + state->retrans = 0; + } + } + /* + * Close the window of disabled window tracking :-) + */ + if (sender->loose) + sender->loose--; + + res = 1; + } else { + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: %s ", + before(seq, sender->td_maxend + 1) ? + after(end, sender->td_end - receiver->td_maxwin - 1) ? + before(sack, receiver->td_end + 1) ? + after(ack, receiver->td_end - MAXACKWINDOW(sender)) ? "BUG" + : "ACK is under the lower bound (possible overly delayed ACK)" + : "ACK is over the upper bound (ACKed data not seen yet)" + : "SEQ is under the lower bound (already ACKed data retransmitted)" + : "SEQ is over the upper bound (over the window of the receiver)"); + + res = nf_ct_tcp_be_liberal; + } + + DEBUGP("tcp_in_window: res=%i sender end=%u maxend=%u maxwin=%u " + "receiver end=%u maxend=%u maxwin=%u\n", + res, sender->td_end, sender->td_maxend, sender->td_maxwin, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin); + + return res; +} + +#ifdef CONFIG_IP_NF_NAT_NEEDED +/* Update sender->td_end after NAT successfully mangled the packet */ +/* Caller must linearize skb at tcp header. */ +void nf_conntrack_tcp_update(struct sk_buff *skb, + unsigned int dataoff, + struct nf_conn *conntrack, + int dir) +{ + struct tcphdr *tcph = (void *)skb->data + dataoff; + __u32 end; +#ifdef DEBUGP_VARS + struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[dir]; + struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[!dir]; +#endif + + end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, dataoff, tcph); + + write_lock_bh(&tcp_lock); + /* + * We have to worry for the ack in the reply packet only... + */ + if (after(end, conntrack->proto.tcp.seen[dir].td_end)) + conntrack->proto.tcp.seen[dir].td_end = end; + conntrack->proto.tcp.last_end = end; + write_unlock_bh(&tcp_lock); + DEBUGP("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i " + "receiver end=%u maxend=%u maxwin=%u scale=%i\n", + sender->td_end, sender->td_maxend, sender->td_maxwin, + sender->td_scale, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin, + receiver->td_scale); +} + +#endif + +#define TH_FIN 0x01 +#define TH_SYN 0x02 +#define TH_RST 0x04 +#define TH_PUSH 0x08 +#define TH_ACK 0x10 +#define TH_URG 0x20 +#define TH_ECE 0x40 +#define TH_CWR 0x80 + +/* table of valid flag combinations - ECE and CWR are always valid */ +static u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] = +{ + [TH_SYN] = 1, + [TH_SYN|TH_ACK] = 1, + [TH_SYN|TH_PUSH] = 1, + [TH_SYN|TH_ACK|TH_PUSH] = 1, + [TH_RST] = 1, + [TH_RST|TH_ACK] = 1, + [TH_RST|TH_ACK|TH_PUSH] = 1, + [TH_FIN|TH_ACK] = 1, + [TH_ACK] = 1, + [TH_ACK|TH_PUSH] = 1, + [TH_ACK|TH_URG] = 1, + [TH_ACK|TH_URG|TH_PUSH] = 1, + [TH_FIN|TH_ACK|TH_PUSH] = 1, + [TH_FIN|TH_ACK|TH_URG] = 1, + [TH_FIN|TH_ACK|TH_URG|TH_PUSH] = 1, +}; + +/* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */ +static int tcp_error(struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info *ctinfo, + int pf, + unsigned int hooknum, + int(*csum)(const struct sk_buff *,unsigned int)) +{ + struct tcphdr _tcph, *th; + unsigned int tcplen = skb->len - dataoff; + u_int8_t tcpflags; + + /* Smaller that minimal TCP header? */ + th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); + if (th == NULL) { + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: short packet "); + return -NF_ACCEPT; + } + + /* Not whole TCP header or malformed packet */ + if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: truncated/malformed packet "); + return -NF_ACCEPT; + } + + /* Checksum invalid? Ignore. + * We skip checking packets on the outgoing path + * because the semantic of CHECKSUM_HW is different there + * and moreover root might send raw packets. + */ + /* FIXME: Source route IP option packets --RR */ + if (((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) || + (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING)) + && skb->ip_summed != CHECKSUM_UNNECESSARY + && csum(skb, dataoff)) { + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: bad TCP checksum "); + return -NF_ACCEPT; + } + + /* Check TCP flags. */ + tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR)); + if (!tcp_valid_flags[tcpflags]) { + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: invalid TCP flag combination "); + return -NF_ACCEPT; + } + + return NF_ACCEPT; +} + +static int csum4(const struct sk_buff *skb, unsigned int dataoff) +{ + return csum_tcpudp_magic(skb->nh.iph->saddr, skb->nh.iph->daddr, + skb->len - dataoff, IPPROTO_TCP, + skb->ip_summed == CHECKSUM_HW ? skb->csum + : skb_checksum(skb, dataoff, + skb->len - dataoff, 0)); +} + +static int csum6(const struct sk_buff *skb, unsigned int dataoff) +{ + return csum_ipv6_magic(&skb->nh.ipv6h->saddr, &skb->nh.ipv6h->daddr, + skb->len - dataoff, IPPROTO_TCP, + skb->ip_summed == CHECKSUM_HW ? skb->csum + : skb_checksum(skb, dataoff, skb->len - dataoff, + 0)); +} + +static int tcp_error4(struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info *ctinfo, + int pf, + unsigned int hooknum) +{ + return tcp_error(skb, dataoff, ctinfo, pf, hooknum, csum4); +} + +static int tcp_error6(struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info *ctinfo, + int pf, + unsigned int hooknum) +{ + return tcp_error(skb, dataoff, ctinfo, pf, hooknum, csum6); +} + +/* Returns verdict for packet, or -1 for invalid. */ +static int tcp_packet(struct nf_conn *conntrack, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + int pf, + unsigned int hooknum) +{ + enum tcp_conntrack new_state, old_state; + enum ip_conntrack_dir dir; + struct tcphdr *th, _tcph; + unsigned long timeout; + unsigned int index; + + th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); + BUG_ON(th == NULL); + + write_lock_bh(&tcp_lock); + old_state = conntrack->proto.tcp.state; + dir = CTINFO2DIR(ctinfo); + index = get_conntrack_index(th); + new_state = tcp_conntracks[dir][index][old_state]; + + switch (new_state) { + case TCP_CONNTRACK_IGNORE: + /* Either SYN in ORIGINAL + * or SYN/ACK in REPLY. */ + if (index == TCP_SYNACK_SET + && conntrack->proto.tcp.last_index == TCP_SYN_SET + && conntrack->proto.tcp.last_dir != dir + && ntohl(th->ack_seq) == + conntrack->proto.tcp.last_end) { + /* This SYN/ACK acknowledges a SYN that we earlier + * ignored as invalid. This means that the client and + * the server are both in sync, while the firewall is + * not. We kill this session and block the SYN/ACK so + * that the client cannot but retransmit its SYN and + * thus initiate a clean new session. + */ + write_unlock_bh(&tcp_lock); + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: killing out of sync session "); + if (del_timer(&conntrack->timeout)) + conntrack->timeout.function((unsigned long) + conntrack); + return -NF_DROP; + } + conntrack->proto.tcp.last_index = index; + conntrack->proto.tcp.last_dir = dir; + conntrack->proto.tcp.last_seq = ntohl(th->seq); + conntrack->proto.tcp.last_end = + segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th); + + write_unlock_bh(&tcp_lock); + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: invalid packed ignored "); + return NF_ACCEPT; + case TCP_CONNTRACK_MAX: + /* Invalid packet */ + DEBUGP("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", + dir, get_conntrack_index(th), + old_state); + write_unlock_bh(&tcp_lock); + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: invalid state "); + return -NF_ACCEPT; + case TCP_CONNTRACK_SYN_SENT: + if (old_state < TCP_CONNTRACK_TIME_WAIT) + break; + if ((conntrack->proto.tcp.seen[dir].flags & + IP_CT_TCP_FLAG_CLOSE_INIT) + || after(ntohl(th->seq), + conntrack->proto.tcp.seen[dir].td_end)) { + /* Attempt to reopen a closed connection. + * Delete this connection and look up again. */ + write_unlock_bh(&tcp_lock); + if (del_timer(&conntrack->timeout)) + conntrack->timeout.function((unsigned long) + conntrack); + return -NF_REPEAT; + } else { + write_unlock_bh(&tcp_lock); + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, + NULL, "nf_ct_tcp: invalid SYN"); + return -NF_ACCEPT; + } + case TCP_CONNTRACK_CLOSE: + if (index == TCP_RST_SET + && test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status) + && conntrack->proto.tcp.last_index == TCP_SYN_SET + && ntohl(th->ack_seq) == conntrack->proto.tcp.last_end) { + /* RST sent to invalid SYN we had let trough + * SYN was in window then, tear down connection. + * We skip window checking, because packet might ACK + * segments we ignored in the SYN. */ + goto in_window; + } + /* Just fall trough */ + default: + /* Keep compilers happy. */ + break; + } + + if (!tcp_in_window(&conntrack->proto.tcp, dir, index, + skb, dataoff, th, pf)) { + write_unlock_bh(&tcp_lock); + return -NF_ACCEPT; + } + in_window: + /* From now on we have got in-window packets */ + conntrack->proto.tcp.last_index = index; + + DEBUGP("tcp_conntracks: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu " + "syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n", + NIPQUAD(iph->saddr), ntohs(th->source), + NIPQUAD(iph->daddr), ntohs(th->dest), + (th->syn ? 1 : 0), (th->ack ? 1 : 0), + (th->fin ? 1 : 0), (th->rst ? 1 : 0), + old_state, new_state); + + conntrack->proto.tcp.state = new_state; + if (old_state != new_state + && (new_state == TCP_CONNTRACK_FIN_WAIT + || new_state == TCP_CONNTRACK_CLOSE)) + conntrack->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; + timeout = conntrack->proto.tcp.retrans >= nf_ct_tcp_max_retrans + && *tcp_timeouts[new_state] > nf_ct_tcp_timeout_max_retrans + ? nf_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state]; + write_unlock_bh(&tcp_lock); + + nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); + if (new_state != old_state) + nf_conntrack_event_cache(IPCT_PROTOINFO, skb); + + if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { + /* If only reply is a RST, we can consider ourselves not to + have an established connection: this is a fairly common + problem case, so we can delete the conntrack + immediately. --RR */ + if (th->rst) { + if (del_timer(&conntrack->timeout)) + conntrack->timeout.function((unsigned long) + conntrack); + return NF_ACCEPT; + } + } else if (!test_bit(IPS_ASSURED_BIT, &conntrack->status) + && (old_state == TCP_CONNTRACK_SYN_RECV + || old_state == TCP_CONNTRACK_ESTABLISHED) + && new_state == TCP_CONNTRACK_ESTABLISHED) { + /* Set ASSURED if we see see valid ack in ESTABLISHED + after SYN_RECV or a valid answer for a picked up + connection. */ + set_bit(IPS_ASSURED_BIT, &conntrack->status); + nf_conntrack_event_cache(IPCT_STATUS, skb); + } + nf_ct_refresh_acct(conntrack, ctinfo, skb, timeout); + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static int tcp_new(struct nf_conn *conntrack, + const struct sk_buff *skb, + unsigned int dataoff) +{ + enum tcp_conntrack new_state; + struct tcphdr *th, _tcph; +#ifdef DEBUGP_VARS + struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[0]; + struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[1]; +#endif + + th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); + BUG_ON(th == NULL); + + /* Don't need lock here: this conntrack not in circulation yet */ + new_state + = tcp_conntracks[0][get_conntrack_index(th)] + [TCP_CONNTRACK_NONE]; + + /* Invalid: delete conntrack */ + if (new_state >= TCP_CONNTRACK_MAX) { + DEBUGP("nf_ct_tcp: invalid new deleting.\n"); + return 0; + } + + if (new_state == TCP_CONNTRACK_SYN_SENT) { + /* SYN packet */ + conntrack->proto.tcp.seen[0].td_end = + segment_seq_plus_len(ntohl(th->seq), skb->len, + dataoff, th); + conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window); + if (conntrack->proto.tcp.seen[0].td_maxwin == 0) + conntrack->proto.tcp.seen[0].td_maxwin = 1; + conntrack->proto.tcp.seen[0].td_maxend = + conntrack->proto.tcp.seen[0].td_end; + + tcp_options(skb, dataoff, th, &conntrack->proto.tcp.seen[0]); + conntrack->proto.tcp.seen[1].flags = 0; + conntrack->proto.tcp.seen[0].loose = + conntrack->proto.tcp.seen[1].loose = 0; + } else if (nf_ct_tcp_loose == 0) { + /* Don't try to pick up connections. */ + return 0; + } else { + /* + * We are in the middle of a connection, + * its history is lost for us. + * Let's try to use the data from the packet. + */ + conntrack->proto.tcp.seen[0].td_end = + segment_seq_plus_len(ntohl(th->seq), skb->len, + dataoff, th); + conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window); + if (conntrack->proto.tcp.seen[0].td_maxwin == 0) + conntrack->proto.tcp.seen[0].td_maxwin = 1; + conntrack->proto.tcp.seen[0].td_maxend = + conntrack->proto.tcp.seen[0].td_end + + conntrack->proto.tcp.seen[0].td_maxwin; + conntrack->proto.tcp.seen[0].td_scale = 0; + + /* We assume SACK. Should we assume window scaling too? */ + conntrack->proto.tcp.seen[0].flags = + conntrack->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM; + conntrack->proto.tcp.seen[0].loose = + conntrack->proto.tcp.seen[1].loose = nf_ct_tcp_loose; + } + + conntrack->proto.tcp.seen[1].td_end = 0; + conntrack->proto.tcp.seen[1].td_maxend = 0; + conntrack->proto.tcp.seen[1].td_maxwin = 1; + conntrack->proto.tcp.seen[1].td_scale = 0; + + /* tcp_packet will set them */ + conntrack->proto.tcp.state = TCP_CONNTRACK_NONE; + conntrack->proto.tcp.last_index = TCP_NONE_SET; + + DEBUGP("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i " + "receiver end=%u maxend=%u maxwin=%u scale=%i\n", + sender->td_end, sender->td_maxend, sender->td_maxwin, + sender->td_scale, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin, + receiver->td_scale); + return 1; +} + +struct nf_conntrack_protocol nf_conntrack_protocol_tcp4 = +{ + .l3proto = PF_INET, + .proto = IPPROTO_TCP, + .name = "tcp", + .pkt_to_tuple = tcp_pkt_to_tuple, + .invert_tuple = tcp_invert_tuple, + .print_tuple = tcp_print_tuple, + .print_conntrack = tcp_print_conntrack, + .packet = tcp_packet, + .new = tcp_new, + .error = tcp_error4, +}; + +struct nf_conntrack_protocol nf_conntrack_protocol_tcp6 = +{ + .l3proto = PF_INET6, + .proto = IPPROTO_TCP, + .name = "tcp", + .pkt_to_tuple = tcp_pkt_to_tuple, + .invert_tuple = tcp_invert_tuple, + .print_tuple = tcp_print_tuple, + .print_conntrack = tcp_print_conntrack, + .packet = tcp_packet, + .new = tcp_new, + .error = tcp_error6, +}; + +EXPORT_SYMBOL(nf_conntrack_protocol_tcp4); +EXPORT_SYMBOL(nf_conntrack_protocol_tcp6); diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c new file mode 100644 index 00000000000..3cae7ce420d --- /dev/null +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -0,0 +1,216 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> + * - enable working with Layer 3 protocol independent connection tracking. + * + * Derived from net/ipv4/netfilter/ip_conntrack_proto_udp.c + */ + +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/udp.h> +#include <linux/seq_file.h> +#include <linux/skbuff.h> +#include <linux/ipv6.h> +#include <net/ip6_checksum.h> +#include <net/checksum.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv6.h> +#include <net/netfilter/nf_conntrack_protocol.h> + +unsigned long nf_ct_udp_timeout = 30*HZ; +unsigned long nf_ct_udp_timeout_stream = 180*HZ; + +static int udp_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +{ + struct udphdr _hdr, *hp; + + /* Actually only need first 8 bytes. */ + hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (hp == NULL) + return 0; + + tuple->src.u.udp.port = hp->source; + tuple->dst.u.udp.port = hp->dest; + + return 1; +} + +static int udp_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + tuple->src.u.udp.port = orig->dst.u.udp.port; + tuple->dst.u.udp.port = orig->src.u.udp.port; + return 1; +} + +/* Print out the per-protocol part of the tuple. */ +static int udp_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return seq_printf(s, "sport=%hu dport=%hu ", + ntohs(tuple->src.u.udp.port), + ntohs(tuple->dst.u.udp.port)); +} + +/* Print out the private part of the conntrack. */ +static int udp_print_conntrack(struct seq_file *s, + const struct nf_conn *conntrack) +{ + return 0; +} + +/* Returns verdict for packet, and may modify conntracktype */ +static int udp_packet(struct nf_conn *conntrack, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + int pf, + unsigned int hooknum) +{ + /* If we've seen traffic both ways, this is some kind of UDP + stream. Extend timeout. */ + if (test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { + nf_ct_refresh_acct(conntrack, ctinfo, skb, + nf_ct_udp_timeout_stream); + /* Also, more likely to be important, and not a probe */ + if (!test_and_set_bit(IPS_ASSURED_BIT, &conntrack->status)) + nf_conntrack_event_cache(IPCT_STATUS, skb); + } else + nf_ct_refresh_acct(conntrack, ctinfo, skb, nf_ct_udp_timeout); + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static int udp_new(struct nf_conn *conntrack, const struct sk_buff *skb, + unsigned int dataoff) +{ + return 1; +} + +static int udp_error(struct sk_buff *skb, unsigned int dataoff, + enum ip_conntrack_info *ctinfo, + int pf, + unsigned int hooknum, + int (*csum)(const struct sk_buff *, unsigned int)) +{ + unsigned int udplen = skb->len - dataoff; + struct udphdr _hdr, *hdr; + + /* Header is too small? */ + hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (hdr == NULL) { + if (LOG_INVALID(IPPROTO_UDP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_udp: short packet "); + return -NF_ACCEPT; + } + + /* Truncated/malformed packets */ + if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) { + if (LOG_INVALID(IPPROTO_UDP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_udp: truncated/malformed packet "); + return -NF_ACCEPT; + } + + /* Packet with no checksum */ + if (!hdr->check) + return NF_ACCEPT; + + /* Checksum invalid? Ignore. + * We skip checking packets on the outgoing path + * because the semantic of CHECKSUM_HW is different there + * and moreover root might send raw packets. + * FIXME: Source route IP option packets --RR */ + if (((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) || + (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING)) + && skb->ip_summed != CHECKSUM_UNNECESSARY + && csum(skb, dataoff)) { + if (LOG_INVALID(IPPROTO_UDP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_udp: bad UDP checksum "); + return -NF_ACCEPT; + } + + return NF_ACCEPT; +} + +static int csum4(const struct sk_buff *skb, unsigned int dataoff) +{ + return csum_tcpudp_magic(skb->nh.iph->saddr, skb->nh.iph->daddr, + skb->len - dataoff, IPPROTO_UDP, + skb->ip_summed == CHECKSUM_HW ? skb->csum + : skb_checksum(skb, dataoff, + skb->len - dataoff, 0)); +} + +static int csum6(const struct sk_buff *skb, unsigned int dataoff) +{ + return csum_ipv6_magic(&skb->nh.ipv6h->saddr, &skb->nh.ipv6h->daddr, + skb->len - dataoff, IPPROTO_UDP, + skb->ip_summed == CHECKSUM_HW ? skb->csum + : skb_checksum(skb, dataoff, skb->len - dataoff, + 0)); +} + +static int udp_error4(struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info *ctinfo, + int pf, + unsigned int hooknum) +{ + return udp_error(skb, dataoff, ctinfo, pf, hooknum, csum4); +} + +static int udp_error6(struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info *ctinfo, + int pf, + unsigned int hooknum) +{ + return udp_error(skb, dataoff, ctinfo, pf, hooknum, csum6); +} + +struct nf_conntrack_protocol nf_conntrack_protocol_udp4 = +{ + .l3proto = PF_INET, + .proto = IPPROTO_UDP, + .name = "udp", + .pkt_to_tuple = udp_pkt_to_tuple, + .invert_tuple = udp_invert_tuple, + .print_tuple = udp_print_tuple, + .print_conntrack = udp_print_conntrack, + .packet = udp_packet, + .new = udp_new, + .error = udp_error4, +}; + +struct nf_conntrack_protocol nf_conntrack_protocol_udp6 = +{ + .l3proto = PF_INET6, + .proto = IPPROTO_UDP, + .name = "udp", + .pkt_to_tuple = udp_pkt_to_tuple, + .invert_tuple = udp_invert_tuple, + .print_tuple = udp_print_tuple, + .print_conntrack = udp_print_conntrack, + .packet = udp_packet, + .new = udp_new, + .error = udp_error6, +}; + +EXPORT_SYMBOL(nf_conntrack_protocol_udp4); +EXPORT_SYMBOL(nf_conntrack_protocol_udp6); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c new file mode 100644 index 00000000000..5af381f9fe3 --- /dev/null +++ b/net/netfilter/nf_conntrack_standalone.c @@ -0,0 +1,869 @@ +/* This file contains all the functions required for the standalone + nf_conntrack module. + + These are not required by the compatibility layer. +*/ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> + * - generalize L3 protocol dependent part. + * + * Derived from net/ipv4/netfilter/ip_conntrack_standalone.c + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/netfilter.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/percpu.h> +#include <linux/netdevice.h> +#ifdef CONFIG_SYSCTL +#include <linux/sysctl.h> +#endif + +#define ASSERT_READ_LOCK(x) +#define ASSERT_WRITE_LOCK(x) + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_l3proto.h> +#include <net/netfilter/nf_conntrack_protocol.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <linux/netfilter_ipv4/listhelp.h> + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +MODULE_LICENSE("GPL"); + +extern atomic_t nf_conntrack_count; +DECLARE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat); + +static int kill_l3proto(struct nf_conn *i, void *data) +{ + return (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num == + ((struct nf_conntrack_l3proto *)data)->l3proto); +} + +static int kill_proto(struct nf_conn *i, void *data) +{ + struct nf_conntrack_protocol *proto; + proto = (struct nf_conntrack_protocol *)data; + return (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum == + proto->proto) && + (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num == + proto->l3proto); +} + +#ifdef CONFIG_PROC_FS +static int +print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, + struct nf_conntrack_l3proto *l3proto, + struct nf_conntrack_protocol *proto) +{ + return l3proto->print_tuple(s, tuple) || proto->print_tuple(s, tuple); +} + +#ifdef CONFIG_NF_CT_ACCT +static unsigned int +seq_print_counters(struct seq_file *s, + const struct ip_conntrack_counter *counter) +{ + return seq_printf(s, "packets=%llu bytes=%llu ", + (unsigned long long)counter->packets, + (unsigned long long)counter->bytes); +} +#else +#define seq_print_counters(x, y) 0 +#endif + +struct ct_iter_state { + unsigned int bucket; +}; + +static struct list_head *ct_get_first(struct seq_file *seq) +{ + struct ct_iter_state *st = seq->private; + + for (st->bucket = 0; + st->bucket < nf_conntrack_htable_size; + st->bucket++) { + if (!list_empty(&nf_conntrack_hash[st->bucket])) + return nf_conntrack_hash[st->bucket].next; + } + return NULL; +} + +static struct list_head *ct_get_next(struct seq_file *seq, struct list_head *head) +{ + struct ct_iter_state *st = seq->private; + + head = head->next; + while (head == &nf_conntrack_hash[st->bucket]) { + if (++st->bucket >= nf_conntrack_htable_size) + return NULL; + head = nf_conntrack_hash[st->bucket].next; + } + return head; +} + +static struct list_head *ct_get_idx(struct seq_file *seq, loff_t pos) +{ + struct list_head *head = ct_get_first(seq); + + if (head) + while (pos && (head = ct_get_next(seq, head))) + pos--; + return pos ? NULL : head; +} + +static void *ct_seq_start(struct seq_file *seq, loff_t *pos) +{ + read_lock_bh(&nf_conntrack_lock); + return ct_get_idx(seq, *pos); +} + +static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + (*pos)++; + return ct_get_next(s, v); +} + +static void ct_seq_stop(struct seq_file *s, void *v) +{ + read_unlock_bh(&nf_conntrack_lock); +} + +/* return 0 on success, 1 in case of error */ +static int ct_seq_show(struct seq_file *s, void *v) +{ + const struct nf_conntrack_tuple_hash *hash = v; + const struct nf_conn *conntrack = nf_ct_tuplehash_to_ctrack(hash); + struct nf_conntrack_l3proto *l3proto; + struct nf_conntrack_protocol *proto; + + ASSERT_READ_LOCK(&nf_conntrack_lock); + NF_CT_ASSERT(conntrack); + + /* we only want to print DIR_ORIGINAL */ + if (NF_CT_DIRECTION(hash)) + return 0; + + l3proto = nf_ct_find_l3proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.src.l3num); + + NF_CT_ASSERT(l3proto); + proto = nf_ct_find_proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.src.l3num, + conntrack->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.protonum); + NF_CT_ASSERT(proto); + + if (seq_printf(s, "%-8s %u %-8s %u %ld ", + l3proto->name, + conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num, + proto->name, + conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum, + timer_pending(&conntrack->timeout) + ? (long)(conntrack->timeout.expires - jiffies)/HZ : 0) != 0) + return -ENOSPC; + + if (l3proto->print_conntrack(s, conntrack)) + return -ENOSPC; + + if (proto->print_conntrack(s, conntrack)) + return -ENOSPC; + + if (print_tuple(s, &conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + l3proto, proto)) + return -ENOSPC; + + if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_ORIGINAL])) + return -ENOSPC; + + if (!(test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status))) + if (seq_printf(s, "[UNREPLIED] ")) + return -ENOSPC; + + if (print_tuple(s, &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple, + l3proto, proto)) + return -ENOSPC; + + if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_REPLY])) + return -ENOSPC; + + if (test_bit(IPS_ASSURED_BIT, &conntrack->status)) + if (seq_printf(s, "[ASSURED] ")) + return -ENOSPC; + +#if defined(CONFIG_NF_CONNTRACK_MARK) + if (seq_printf(s, "mark=%u ", conntrack->mark)) + return -ENOSPC; +#endif + + if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use))) + return -ENOSPC; + + return 0; +} + +static struct seq_operations ct_seq_ops = { + .start = ct_seq_start, + .next = ct_seq_next, + .stop = ct_seq_stop, + .show = ct_seq_show +}; + +static int ct_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + struct ct_iter_state *st; + int ret; + + st = kmalloc(sizeof(struct ct_iter_state), GFP_KERNEL); + if (st == NULL) + return -ENOMEM; + ret = seq_open(file, &ct_seq_ops); + if (ret) + goto out_free; + seq = file->private_data; + seq->private = st; + memset(st, 0, sizeof(struct ct_iter_state)); + return ret; +out_free: + kfree(st); + return ret; +} + +static struct file_operations ct_file_ops = { + .owner = THIS_MODULE, + .open = ct_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +/* expects */ +static void *exp_seq_start(struct seq_file *s, loff_t *pos) +{ + struct list_head *e = &nf_conntrack_expect_list; + loff_t i; + + /* strange seq_file api calls stop even if we fail, + * thus we need to grab lock since stop unlocks */ + read_lock_bh(&nf_conntrack_lock); + + if (list_empty(e)) + return NULL; + + for (i = 0; i <= *pos; i++) { + e = e->next; + if (e == &nf_conntrack_expect_list) + return NULL; + } + return e; +} + +static void *exp_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct list_head *e = v; + + ++*pos; + e = e->next; + + if (e == &nf_conntrack_expect_list) + return NULL; + + return e; +} + +static void exp_seq_stop(struct seq_file *s, void *v) +{ + read_unlock_bh(&nf_conntrack_lock); +} + +static int exp_seq_show(struct seq_file *s, void *v) +{ + struct nf_conntrack_expect *expect = v; + + if (expect->timeout.function) + seq_printf(s, "%ld ", timer_pending(&expect->timeout) + ? (long)(expect->timeout.expires - jiffies)/HZ : 0); + else + seq_printf(s, "- "); + seq_printf(s, "l3proto = %u proto=%u ", + expect->tuple.src.l3num, + expect->tuple.dst.protonum); + print_tuple(s, &expect->tuple, + nf_ct_find_l3proto(expect->tuple.src.l3num), + nf_ct_find_proto(expect->tuple.src.l3num, + expect->tuple.dst.protonum)); + return seq_putc(s, '\n'); +} + +static struct seq_operations exp_seq_ops = { + .start = exp_seq_start, + .next = exp_seq_next, + .stop = exp_seq_stop, + .show = exp_seq_show +}; + +static int exp_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &exp_seq_ops); +} + +static struct file_operations exp_file_ops = { + .owner = THIS_MODULE, + .open = exp_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) +{ + int cpu; + + if (*pos == 0) + return SEQ_START_TOKEN; + + for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu + 1; + return &per_cpu(nf_conntrack_stat, cpu); + } + + return NULL; +} + +static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + int cpu; + + for (cpu = *pos; cpu < NR_CPUS; ++cpu) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu + 1; + return &per_cpu(nf_conntrack_stat, cpu); + } + + return NULL; +} + +static void ct_cpu_seq_stop(struct seq_file *seq, void *v) +{ +} + +static int ct_cpu_seq_show(struct seq_file *seq, void *v) +{ + unsigned int nr_conntracks = atomic_read(&nf_conntrack_count); + struct ip_conntrack_stat *st = v; + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete\n"); + return 0; + } + + seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x " + "%08x %08x %08x %08x %08x %08x %08x %08x \n", + nr_conntracks, + st->searched, + st->found, + st->new, + st->invalid, + st->ignore, + st->delete, + st->delete_list, + st->insert, + st->insert_failed, + st->drop, + st->early_drop, + st->error, + + st->expect_new, + st->expect_create, + st->expect_delete + ); + return 0; +} + +static struct seq_operations ct_cpu_seq_ops = { + .start = ct_cpu_seq_start, + .next = ct_cpu_seq_next, + .stop = ct_cpu_seq_stop, + .show = ct_cpu_seq_show, +}; + +static int ct_cpu_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &ct_cpu_seq_ops); +} + +static struct file_operations ct_cpu_seq_fops = { + .owner = THIS_MODULE, + .open = ct_cpu_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; +#endif /* CONFIG_PROC_FS */ + +/* Sysctl support */ + +#ifdef CONFIG_SYSCTL + +/* From nf_conntrack_core.c */ +extern int nf_conntrack_max; +extern unsigned int nf_conntrack_htable_size; + +/* From nf_conntrack_proto_tcp.c */ +extern unsigned long nf_ct_tcp_timeout_syn_sent; +extern unsigned long nf_ct_tcp_timeout_syn_recv; +extern unsigned long nf_ct_tcp_timeout_established; +extern unsigned long nf_ct_tcp_timeout_fin_wait; +extern unsigned long nf_ct_tcp_timeout_close_wait; +extern unsigned long nf_ct_tcp_timeout_last_ack; +extern unsigned long nf_ct_tcp_timeout_time_wait; +extern unsigned long nf_ct_tcp_timeout_close; +extern unsigned long nf_ct_tcp_timeout_max_retrans; +extern int nf_ct_tcp_loose; +extern int nf_ct_tcp_be_liberal; +extern int nf_ct_tcp_max_retrans; + +/* From nf_conntrack_proto_udp.c */ +extern unsigned long nf_ct_udp_timeout; +extern unsigned long nf_ct_udp_timeout_stream; + +/* From nf_conntrack_proto_generic.c */ +extern unsigned long nf_ct_generic_timeout; + +/* Log invalid packets of a given protocol */ +static int log_invalid_proto_min = 0; +static int log_invalid_proto_max = 255; + +static struct ctl_table_header *nf_ct_sysctl_header; + +static ctl_table nf_ct_sysctl_table[] = { + { + .ctl_name = NET_NF_CONNTRACK_MAX, + .procname = "nf_conntrack_max", + .data = &nf_conntrack_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_NF_CONNTRACK_COUNT, + .procname = "nf_conntrack_count", + .data = &nf_conntrack_count, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_NF_CONNTRACK_BUCKETS, + .procname = "nf_conntrack_buckets", + .data = &nf_conntrack_htable_size, + .maxlen = sizeof(unsigned int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, + .procname = "nf_conntrack_tcp_timeout_syn_sent", + .data = &nf_ct_tcp_timeout_syn_sent, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, + .procname = "nf_conntrack_tcp_timeout_syn_recv", + .data = &nf_ct_tcp_timeout_syn_recv, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED, + .procname = "nf_conntrack_tcp_timeout_established", + .data = &nf_ct_tcp_timeout_established, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT, + .procname = "nf_conntrack_tcp_timeout_fin_wait", + .data = &nf_ct_tcp_timeout_fin_wait, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT, + .procname = "nf_conntrack_tcp_timeout_close_wait", + .data = &nf_ct_tcp_timeout_close_wait, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK, + .procname = "nf_conntrack_tcp_timeout_last_ack", + .data = &nf_ct_tcp_timeout_last_ack, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT, + .procname = "nf_conntrack_tcp_timeout_time_wait", + .data = &nf_ct_tcp_timeout_time_wait, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE, + .procname = "nf_conntrack_tcp_timeout_close", + .data = &nf_ct_tcp_timeout_close, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_UDP_TIMEOUT, + .procname = "nf_conntrack_udp_timeout", + .data = &nf_ct_udp_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM, + .procname = "nf_conntrack_udp_timeout_stream", + .data = &nf_ct_udp_timeout_stream, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_GENERIC_TIMEOUT, + .procname = "nf_conntrack_generic_timeout", + .data = &nf_ct_generic_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_LOG_INVALID, + .procname = "nf_conntrack_log_invalid", + .data = &nf_ct_log_invalid, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &log_invalid_proto_min, + .extra2 = &log_invalid_proto_max, + }, + { + .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS, + .procname = "nf_conntrack_tcp_timeout_max_retrans", + .data = &nf_ct_tcp_timeout_max_retrans, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_NF_CONNTRACK_TCP_LOOSE, + .procname = "nf_conntrack_tcp_loose", + .data = &nf_ct_tcp_loose, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_NF_CONNTRACK_TCP_BE_LIBERAL, + .procname = "nf_conntrack_tcp_be_liberal", + .data = &nf_ct_tcp_be_liberal, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_NF_CONNTRACK_TCP_MAX_RETRANS, + .procname = "nf_conntrack_tcp_max_retrans", + .data = &nf_ct_tcp_max_retrans, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + + { .ctl_name = 0 } +}; + +#define NET_NF_CONNTRACK_MAX 2089 + +static ctl_table nf_ct_netfilter_table[] = { + { + .ctl_name = NET_NETFILTER, + .procname = "netfilter", + .mode = 0555, + .child = nf_ct_sysctl_table, + }, + { + .ctl_name = NET_NF_CONNTRACK_MAX, + .procname = "nf_conntrack_max", + .data = &nf_conntrack_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = 0 } +}; + +static ctl_table nf_ct_net_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = nf_ct_netfilter_table, + }, + { .ctl_name = 0 } +}; +EXPORT_SYMBOL(nf_ct_log_invalid); +#endif /* CONFIG_SYSCTL */ + +static int init_or_cleanup(int init) +{ +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *proc, *proc_exp, *proc_stat; +#endif + int ret = 0; + + if (!init) goto cleanup; + + ret = nf_conntrack_init(); + if (ret < 0) + goto cleanup_nothing; + +#ifdef CONFIG_PROC_FS + proc = proc_net_fops_create("nf_conntrack", 0440, &ct_file_ops); + if (!proc) goto cleanup_init; + + proc_exp = proc_net_fops_create("nf_conntrack_expect", 0440, + &exp_file_ops); + if (!proc_exp) goto cleanup_proc; + + proc_stat = create_proc_entry("nf_conntrack", S_IRUGO, proc_net_stat); + if (!proc_stat) + goto cleanup_proc_exp; + + proc_stat->proc_fops = &ct_cpu_seq_fops; + proc_stat->owner = THIS_MODULE; +#endif +#ifdef CONFIG_SYSCTL + nf_ct_sysctl_header = register_sysctl_table(nf_ct_net_table, 0); + if (nf_ct_sysctl_header == NULL) { + printk("nf_conntrack: can't register to sysctl.\n"); + ret = -ENOMEM; + goto cleanup_proc_stat; + } +#endif + + return ret; + + cleanup: +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(nf_ct_sysctl_header); + cleanup_proc_stat: +#endif +#ifdef CONFIG_PROC_FS + remove_proc_entry("nf_conntrack", proc_net_stat); + cleanup_proc_exp: + proc_net_remove("nf_conntrack_expect"); + cleanup_proc: + proc_net_remove("nf_conntrack"); + cleanup_init: +#endif /* CNFIG_PROC_FS */ + nf_conntrack_cleanup(); + cleanup_nothing: + return ret; +} + +int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto) +{ + int ret = 0; + + write_lock_bh(&nf_conntrack_lock); + if (nf_ct_l3protos[proto->l3proto] != &nf_conntrack_generic_l3proto) { + ret = -EBUSY; + goto out; + } + nf_ct_l3protos[proto->l3proto] = proto; +out: + write_unlock_bh(&nf_conntrack_lock); + + return ret; +} + +void nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto) +{ + write_lock_bh(&nf_conntrack_lock); + nf_ct_l3protos[proto->l3proto] = &nf_conntrack_generic_l3proto; + write_unlock_bh(&nf_conntrack_lock); + + /* Somebody could be still looking at the proto in bh. */ + synchronize_net(); + + /* Remove all contrack entries for this protocol */ + nf_ct_iterate_cleanup(kill_l3proto, proto); +} + +/* FIXME: Allow NULL functions and sub in pointers to generic for + them. --RR */ +int nf_conntrack_protocol_register(struct nf_conntrack_protocol *proto) +{ + int ret = 0; + +retry: + write_lock_bh(&nf_conntrack_lock); + if (nf_ct_protos[proto->l3proto]) { + if (nf_ct_protos[proto->l3proto][proto->proto] + != &nf_conntrack_generic_protocol) { + ret = -EBUSY; + goto out_unlock; + } + } else { + /* l3proto may be loaded latter. */ + struct nf_conntrack_protocol **proto_array; + int i; + + write_unlock_bh(&nf_conntrack_lock); + + proto_array = (struct nf_conntrack_protocol **) + kmalloc(MAX_NF_CT_PROTO * + sizeof(struct nf_conntrack_protocol *), + GFP_KERNEL); + if (proto_array == NULL) { + ret = -ENOMEM; + goto out; + } + for (i = 0; i < MAX_NF_CT_PROTO; i++) + proto_array[i] = &nf_conntrack_generic_protocol; + + write_lock_bh(&nf_conntrack_lock); + if (nf_ct_protos[proto->l3proto]) { + /* bad timing, but no problem */ + write_unlock_bh(&nf_conntrack_lock); + kfree(proto_array); + } else { + nf_ct_protos[proto->l3proto] = proto_array; + write_unlock_bh(&nf_conntrack_lock); + } + + /* + * Just once because array is never freed until unloading + * nf_conntrack.ko + */ + goto retry; + } + + nf_ct_protos[proto->l3proto][proto->proto] = proto; + +out_unlock: + write_unlock_bh(&nf_conntrack_lock); +out: + return ret; +} + +void nf_conntrack_protocol_unregister(struct nf_conntrack_protocol *proto) +{ + write_lock_bh(&nf_conntrack_lock); + nf_ct_protos[proto->l3proto][proto->proto] + = &nf_conntrack_generic_protocol; + write_unlock_bh(&nf_conntrack_lock); + + /* Somebody could be still looking at the proto in bh. */ + synchronize_net(); + + /* Remove all contrack entries for this protocol */ + nf_ct_iterate_cleanup(kill_proto, proto); +} + +static int __init init(void) +{ + return init_or_cleanup(1); +} + +static void __exit fini(void) +{ + init_or_cleanup(0); +} + +module_init(init); +module_exit(fini); + +/* Some modules need us, but don't depend directly on any symbol. + They should call this. */ +void need_nf_conntrack(void) +{ +} + +#ifdef CONFIG_NF_CONNTRACK_EVENTS +EXPORT_SYMBOL_GPL(nf_conntrack_chain); +EXPORT_SYMBOL_GPL(nf_conntrack_expect_chain); +EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier); +EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier); +EXPORT_SYMBOL_GPL(__nf_ct_event_cache_init); +EXPORT_PER_CPU_SYMBOL_GPL(nf_conntrack_ecache); +EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events); +#endif +EXPORT_SYMBOL(nf_conntrack_l3proto_register); +EXPORT_SYMBOL(nf_conntrack_l3proto_unregister); +EXPORT_SYMBOL(nf_conntrack_protocol_register); +EXPORT_SYMBOL(nf_conntrack_protocol_unregister); +EXPORT_SYMBOL(nf_ct_invert_tuplepr); +EXPORT_SYMBOL(nf_conntrack_alter_reply); +EXPORT_SYMBOL(nf_conntrack_destroyed); +EXPORT_SYMBOL(need_nf_conntrack); +EXPORT_SYMBOL(nf_conntrack_helper_register); +EXPORT_SYMBOL(nf_conntrack_helper_unregister); +EXPORT_SYMBOL(nf_ct_iterate_cleanup); +EXPORT_SYMBOL(__nf_ct_refresh_acct); +EXPORT_SYMBOL(nf_ct_protos); +EXPORT_SYMBOL(nf_ct_find_proto); +EXPORT_SYMBOL(nf_ct_l3protos); +EXPORT_SYMBOL(nf_conntrack_expect_alloc); +EXPORT_SYMBOL(nf_conntrack_expect_put); +EXPORT_SYMBOL(nf_conntrack_expect_related); +EXPORT_SYMBOL(nf_conntrack_unexpect_related); +EXPORT_SYMBOL(nf_conntrack_tuple_taken); +EXPORT_SYMBOL(nf_conntrack_htable_size); +EXPORT_SYMBOL(nf_conntrack_lock); +EXPORT_SYMBOL(nf_conntrack_hash); +EXPORT_SYMBOL(nf_conntrack_untracked); +EXPORT_SYMBOL_GPL(nf_conntrack_find_get); +#ifdef CONFIG_IP_NF_NAT_NEEDED +EXPORT_SYMBOL(nf_conntrack_tcp_update); +#endif +EXPORT_SYMBOL(__nf_conntrack_confirm); +EXPORT_SYMBOL(nf_ct_get_tuple); +EXPORT_SYMBOL(nf_ct_invert_tuple); +EXPORT_SYMBOL(nf_conntrack_in); +EXPORT_SYMBOL(__nf_conntrack_attach); diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index d10d552d9c4..d3a4f30a7f2 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -117,7 +117,7 @@ int nf_queue(struct sk_buff **skb, /* QUEUE == DROP if noone is waiting, to be safe. */ read_lock(&queue_handler_lock); - if (!queue_handler[pf]->outfn) { + if (!queue_handler[pf] || !queue_handler[pf]->outfn) { read_unlock(&queue_handler_lock); kfree_skb(*skb); return 1; diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 4bc27a6334c..a60c59b9763 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -128,7 +128,7 @@ void __nfa_fill(struct sk_buff *skb, int attrtype, int attrlen, memset(NFA_DATA(nfa) + attrlen, 0, NFA_ALIGN(size) - size); } -int nfattr_parse(struct nfattr *tb[], int maxattr, struct nfattr *nfa, int len) +void nfattr_parse(struct nfattr *tb[], int maxattr, struct nfattr *nfa, int len) { memset(tb, 0, sizeof(struct nfattr *) * maxattr); @@ -138,8 +138,6 @@ int nfattr_parse(struct nfattr *tb[], int maxattr, struct nfattr *nfa, int len) tb[flavor-1] = nfa; nfa = NFA_NEXT(nfa, len); } - - return 0; } /** @@ -225,6 +223,12 @@ static inline int nfnetlink_rcv_msg(struct sk_buff *skb, NFNL_SUBSYS_ID(nlh->nlmsg_type), NFNL_MSG_TYPE(nlh->nlmsg_type)); + if (!cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN)) { + DEBUGP("missing CAP_NET_ADMIN\n"); + *errp = -EPERM; + return -1; + } + /* Only requests are handled by kernel now. */ if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) { DEBUGP("received non-request message\n"); @@ -250,7 +254,7 @@ static inline int nfnetlink_rcv_msg(struct sk_buff *skb, ss = nfnetlink_get_subsys(type); if (!ss) #endif - goto err_inval; + goto err_inval; } nc = nfnetlink_find_client(type, ss); @@ -259,13 +263,6 @@ static inline int nfnetlink_rcv_msg(struct sk_buff *skb, goto err_inval; } - if (nc->cap_required && - !cap_raised(NETLINK_CB(skb).eff_cap, nc->cap_required)) { - DEBUGP("permission denied for type %d\n", type); - *errp = -EPERM; - return -1; - } - { u_int16_t attr_count = ss->cb[NFNL_MSG_TYPE(nlh->nlmsg_type)].attr_count; diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index efcd10f996b..cba63729313 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -146,11 +146,10 @@ instance_create(u_int16_t group_num, int pid) goto out_unlock; } - inst = kmalloc(sizeof(*inst), GFP_ATOMIC); + inst = kzalloc(sizeof(*inst), GFP_ATOMIC); if (!inst) goto out_unlock; - memset(inst, 0, sizeof(*inst)); INIT_HLIST_NODE(&inst->hlist); inst->lock = SPIN_LOCK_UNLOCKED; /* needs to be two, since we _put() after creation */ @@ -863,11 +862,9 @@ out_put: static struct nfnl_callback nfulnl_cb[NFULNL_MSG_MAX] = { [NFULNL_MSG_PACKET] = { .call = nfulnl_recv_unsupp, - .attr_count = NFULA_MAX, - .cap_required = CAP_NET_ADMIN, }, + .attr_count = NFULA_MAX, }, [NFULNL_MSG_CONFIG] = { .call = nfulnl_recv_config, - .attr_count = NFULA_CFG_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = NFULA_CFG_MAX, }, }; static struct nfnetlink_subsystem nfulnl_subsys = { @@ -962,10 +959,9 @@ static int nful_open(struct inode *inode, struct file *file) struct iter_state *is; int ret; - is = kmalloc(sizeof(*is), GFP_KERNEL); + is = kzalloc(sizeof(*is), GFP_KERNEL); if (!is) return -ENOMEM; - memset(is, 0, sizeof(*is)); ret = seq_open(file, &nful_seq_ops); if (ret < 0) goto out_free; diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index eaa44c49567..f28460b61e4 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -136,11 +136,10 @@ instance_create(u_int16_t queue_num, int pid) goto out_unlock; } - inst = kmalloc(sizeof(*inst), GFP_ATOMIC); + inst = kzalloc(sizeof(*inst), GFP_ATOMIC); if (!inst) goto out_unlock; - memset(inst, 0, sizeof(*inst)); inst->queue_num = queue_num; inst->peer_pid = pid; inst->queue_maxlen = NFQNL_QMAX_DEFAULT; @@ -932,14 +931,11 @@ out_put: static struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = { [NFQNL_MSG_PACKET] = { .call = nfqnl_recv_unsupp, - .attr_count = NFQA_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = NFQA_MAX, }, [NFQNL_MSG_VERDICT] = { .call = nfqnl_recv_verdict, - .attr_count = NFQA_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = NFQA_MAX, }, [NFQNL_MSG_CONFIG] = { .call = nfqnl_recv_config, - .attr_count = NFQA_CFG_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = NFQA_CFG_MAX, }, }; static struct nfnetlink_subsystem nfqnl_subsys = { @@ -1036,10 +1032,9 @@ static int nfqnl_open(struct inode *inode, struct file *file) struct iter_state *is; int ret; - is = kmalloc(sizeof(*is), GFP_KERNEL); + is = kzalloc(sizeof(*is), GFP_KERNEL); if (!is) return -ENOMEM; - memset(is, 0, sizeof(*is)); ret = seq_open(file, &nfqnl_seq_ops); if (ret < 0) goto out_free; diff --git a/net/netlink/Makefile b/net/netlink/Makefile index 39d9c2dcd03..e3589c2de49 100644 --- a/net/netlink/Makefile +++ b/net/netlink/Makefile @@ -2,4 +2,4 @@ # Makefile for the netlink driver. # -obj-y := af_netlink.o +obj-y := af_netlink.o attr.o genetlink.o diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 5ca283537bc..8c38ee6d255 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -58,6 +58,7 @@ #include <net/sock.h> #include <net/scm.h> +#include <net/netlink.h> #define Nprintk(a...) #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) @@ -427,7 +428,8 @@ static int netlink_release(struct socket *sock) spin_lock(&nlk->cb_lock); if (nlk->cb) { - nlk->cb->done(nlk->cb); + if (nlk->cb->done) + nlk->cb->done(nlk->cb); netlink_destroy_callback(nlk->cb); nlk->cb = NULL; } @@ -1322,7 +1324,8 @@ static int netlink_dump(struct sock *sk) skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk, skb->len); - cb->done(cb); + if (cb->done) + cb->done(cb); nlk->cb = NULL; spin_unlock(&nlk->cb_lock); @@ -1409,6 +1412,94 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); } +static int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, + struct nlmsghdr *, int *)) +{ + unsigned int total_len; + struct nlmsghdr *nlh; + int err; + + while (skb->len >= nlmsg_total_size(0)) { + nlh = (struct nlmsghdr *) skb->data; + + if (skb->len < nlh->nlmsg_len) + return 0; + + total_len = min(NLMSG_ALIGN(nlh->nlmsg_len), skb->len); + + if (cb(skb, nlh, &err) < 0) { + /* Not an error, but we have to interrupt processing + * here. Note: that in this case we do not pull + * message from skb, it will be processed later. + */ + if (err == 0) + return -1; + netlink_ack(skb, nlh, err); + } else if (nlh->nlmsg_flags & NLM_F_ACK) + netlink_ack(skb, nlh, 0); + + skb_pull(skb, total_len); + } + + return 0; +} + +/** + * nelink_run_queue - Process netlink receive queue. + * @sk: Netlink socket containing the queue + * @qlen: Place to store queue length upon entry + * @cb: Callback function invoked for each netlink message found + * + * Processes as much as there was in the queue upon entry and invokes + * a callback function for each netlink message found. The callback + * function may refuse a message by returning a negative error code + * but setting the error pointer to 0 in which case this function + * returns with a qlen != 0. + * + * qlen must be initialized to 0 before the initial entry, afterwards + * the function may be called repeatedly until qlen reaches 0. + */ +void netlink_run_queue(struct sock *sk, unsigned int *qlen, + int (*cb)(struct sk_buff *, struct nlmsghdr *, int *)) +{ + struct sk_buff *skb; + + if (!*qlen || *qlen > skb_queue_len(&sk->sk_receive_queue)) + *qlen = skb_queue_len(&sk->sk_receive_queue); + + for (; *qlen; (*qlen)--) { + skb = skb_dequeue(&sk->sk_receive_queue); + if (netlink_rcv_skb(skb, cb)) { + if (skb->len) + skb_queue_head(&sk->sk_receive_queue, skb); + else { + kfree_skb(skb); + (*qlen)--; + } + break; + } + + kfree_skb(skb); + } +} + +/** + * netlink_queue_skip - Skip netlink message while processing queue. + * @nlh: Netlink message to be skipped + * @skb: Socket buffer containing the netlink messages. + * + * Pulls the given netlink message off the socket buffer so the next + * call to netlink_queue_run() will not reconsider the message. + */ +void netlink_queue_skip(struct nlmsghdr *nlh, struct sk_buff *skb) +{ + int msglen = NLMSG_ALIGN(nlh->nlmsg_len); + + if (msglen > skb->len) + msglen = skb->len; + + skb_pull(skb, msglen); +} #ifdef CONFIG_PROC_FS struct nl_seq_iter { @@ -1657,6 +1748,8 @@ out: core_initcall(netlink_proto_init); EXPORT_SYMBOL(netlink_ack); +EXPORT_SYMBOL(netlink_run_queue); +EXPORT_SYMBOL(netlink_queue_skip); EXPORT_SYMBOL(netlink_broadcast); EXPORT_SYMBOL(netlink_dump_start); EXPORT_SYMBOL(netlink_kernel_create); diff --git a/net/netlink/attr.c b/net/netlink/attr.c new file mode 100644 index 00000000000..fffef4ab276 --- /dev/null +++ b/net/netlink/attr.c @@ -0,0 +1,328 @@ +/* + * NETLINK Netlink attributes + * + * Authors: Thomas Graf <tgraf@suug.ch> + * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/jiffies.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/string.h> +#include <linux/types.h> +#include <net/netlink.h> + +static u16 nla_attr_minlen[NLA_TYPE_MAX+1] __read_mostly = { + [NLA_U8] = sizeof(u8), + [NLA_U16] = sizeof(u16), + [NLA_U32] = sizeof(u32), + [NLA_U64] = sizeof(u64), + [NLA_STRING] = 1, + [NLA_NESTED] = NLA_HDRLEN, +}; + +static int validate_nla(struct nlattr *nla, int maxtype, + struct nla_policy *policy) +{ + struct nla_policy *pt; + int minlen = 0; + + if (nla->nla_type <= 0 || nla->nla_type > maxtype) + return 0; + + pt = &policy[nla->nla_type]; + + BUG_ON(pt->type > NLA_TYPE_MAX); + + if (pt->minlen) + minlen = pt->minlen; + else if (pt->type != NLA_UNSPEC) + minlen = nla_attr_minlen[pt->type]; + + if (pt->type == NLA_FLAG && nla_len(nla) > 0) + return -ERANGE; + + if (nla_len(nla) < minlen) + return -ERANGE; + + return 0; +} + +/** + * nla_validate - Validate a stream of attributes + * @head: head of attribute stream + * @len: length of attribute stream + * @maxtype: maximum attribute type to be expected + * @policy: validation policy + * + * Validates all attributes in the specified attribute stream against the + * specified policy. Attributes with a type exceeding maxtype will be + * ignored. See documenation of struct nla_policy for more details. + * + * Returns 0 on success or a negative error code. + */ +int nla_validate(struct nlattr *head, int len, int maxtype, + struct nla_policy *policy) +{ + struct nlattr *nla; + int rem, err; + + nla_for_each_attr(nla, head, len, rem) { + err = validate_nla(nla, maxtype, policy); + if (err < 0) + goto errout; + } + + err = 0; +errout: + return err; +} + +/** + * nla_parse - Parse a stream of attributes into a tb buffer + * @tb: destination array with maxtype+1 elements + * @maxtype: maximum attribute type to be expected + * @head: head of attribute stream + * @len: length of attribute stream + * + * Parses a stream of attributes and stores a pointer to each attribute in + * the tb array accessable via the attribute type. Attributes with a type + * exceeding maxtype will be silently ignored for backwards compatibility + * reasons. policy may be set to NULL if no validation is required. + * + * Returns 0 on success or a negative error code. + */ +int nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int len, + struct nla_policy *policy) +{ + struct nlattr *nla; + int rem, err; + + memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); + + nla_for_each_attr(nla, head, len, rem) { + u16 type = nla->nla_type; + + if (type > 0 && type <= maxtype) { + if (policy) { + err = validate_nla(nla, maxtype, policy); + if (err < 0) + goto errout; + } + + tb[type] = nla; + } + } + + if (unlikely(rem > 0)) + printk(KERN_WARNING "netlink: %d bytes leftover after parsing " + "attributes.\n", rem); + + err = 0; +errout: + return err; +} + +/** + * nla_find - Find a specific attribute in a stream of attributes + * @head: head of attribute stream + * @len: length of attribute stream + * @attrtype: type of attribute to look for + * + * Returns the first attribute in the stream matching the specified type. + */ +struct nlattr *nla_find(struct nlattr *head, int len, int attrtype) +{ + struct nlattr *nla; + int rem; + + nla_for_each_attr(nla, head, len, rem) + if (nla->nla_type == attrtype) + return nla; + + return NULL; +} + +/** + * nla_strlcpy - Copy string attribute payload into a sized buffer + * @dst: where to copy the string to + * @src: attribute to copy the string from + * @dstsize: size of destination buffer + * + * Copies at most dstsize - 1 bytes into the destination buffer. + * The result is always a valid NUL-terminated string. Unlike + * strlcpy the destination buffer is always padded out. + * + * Returns the length of the source buffer. + */ +size_t nla_strlcpy(char *dst, const struct nlattr *nla, size_t dstsize) +{ + size_t srclen = nla_len(nla); + char *src = nla_data(nla); + + if (srclen > 0 && src[srclen - 1] == '\0') + srclen--; + + if (dstsize > 0) { + size_t len = (srclen >= dstsize) ? dstsize - 1 : srclen; + + memset(dst, 0, dstsize); + memcpy(dst, src, len); + } + + return srclen; +} + +/** + * nla_memcpy - Copy a netlink attribute into another memory area + * @dest: where to copy to memcpy + * @src: netlink attribute to copy from + * @count: size of the destination area + * + * Note: The number of bytes copied is limited by the length of + * attribute's payload. memcpy + * + * Returns the number of bytes copied. + */ +int nla_memcpy(void *dest, struct nlattr *src, int count) +{ + int minlen = min_t(int, count, nla_len(src)); + + memcpy(dest, nla_data(src), minlen); + + return minlen; +} + +/** + * nla_memcmp - Compare an attribute with sized memory area + * @nla: netlink attribute + * @data: memory area + * @size: size of memory area + */ +int nla_memcmp(const struct nlattr *nla, const void *data, + size_t size) +{ + int d = nla_len(nla) - size; + + if (d == 0) + d = memcmp(nla_data(nla), data, size); + + return d; +} + +/** + * nla_strcmp - Compare a string attribute against a string + * @nla: netlink string attribute + * @str: another string + */ +int nla_strcmp(const struct nlattr *nla, const char *str) +{ + int len = strlen(str) + 1; + int d = nla_len(nla) - len; + + if (d == 0) + d = memcmp(nla_data(nla), str, len); + + return d; +} + +/** + * __nla_reserve - reserve room for attribute on the skb + * @skb: socket buffer to reserve room on + * @attrtype: attribute type + * @attrlen: length of attribute payload + * + * Adds a netlink attribute header to a socket buffer and reserves + * room for the payload but does not copy it. + * + * The caller is responsible to ensure that the skb provides enough + * tailroom for the attribute header and payload. + */ +struct nlattr *__nla_reserve(struct sk_buff *skb, int attrtype, int attrlen) +{ + struct nlattr *nla; + + nla = (struct nlattr *) skb_put(skb, nla_total_size(attrlen)); + nla->nla_type = attrtype; + nla->nla_len = nla_attr_size(attrlen); + + memset((unsigned char *) nla + nla->nla_len, 0, nla_padlen(attrlen)); + + return nla; +} + +/** + * nla_reserve - reserve room for attribute on the skb + * @skb: socket buffer to reserve room on + * @attrtype: attribute type + * @attrlen: length of attribute payload + * + * Adds a netlink attribute header to a socket buffer and reserves + * room for the payload but does not copy it. + * + * Returns NULL if the tailroom of the skb is insufficient to store + * the attribute header and payload. + */ +struct nlattr *nla_reserve(struct sk_buff *skb, int attrtype, int attrlen) +{ + if (unlikely(skb_tailroom(skb) < nla_total_size(attrlen))) + return NULL; + + return __nla_reserve(skb, attrtype, attrlen); +} + +/** + * __nla_put - Add a netlink attribute to a socket buffer + * @skb: socket buffer to add attribute to + * @attrtype: attribute type + * @attrlen: length of attribute payload + * @data: head of attribute payload + * + * The caller is responsible to ensure that the skb provides enough + * tailroom for the attribute header and payload. + */ +void __nla_put(struct sk_buff *skb, int attrtype, int attrlen, + const void *data) +{ + struct nlattr *nla; + + nla = __nla_reserve(skb, attrtype, attrlen); + memcpy(nla_data(nla), data, attrlen); +} + + +/** + * nla_put - Add a netlink attribute to a socket buffer + * @skb: socket buffer to add attribute to + * @attrtype: attribute type + * @attrlen: length of attribute payload + * @data: head of attribute payload + * + * Returns -1 if the tailroom of the skb is insufficient to store + * the attribute header and payload. + */ +int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data) +{ + if (unlikely(skb_tailroom(skb) < nla_total_size(attrlen))) + return -1; + + __nla_put(skb, attrtype, attrlen, data); + return 0; +} + + +EXPORT_SYMBOL(nla_validate); +EXPORT_SYMBOL(nla_parse); +EXPORT_SYMBOL(nla_find); +EXPORT_SYMBOL(nla_strlcpy); +EXPORT_SYMBOL(__nla_reserve); +EXPORT_SYMBOL(nla_reserve); +EXPORT_SYMBOL(__nla_put); +EXPORT_SYMBOL(nla_put); +EXPORT_SYMBOL(nla_memcpy); +EXPORT_SYMBOL(nla_memcmp); +EXPORT_SYMBOL(nla_strcmp); diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c new file mode 100644 index 00000000000..287cfcc5695 --- /dev/null +++ b/net/netlink/genetlink.c @@ -0,0 +1,579 @@ +/* + * NETLINK Generic Netlink Family + * + * Authors: Jamal Hadi Salim + * Thomas Graf <tgraf@suug.ch> + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/string.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/genetlink.h> + +struct sock *genl_sock = NULL; + +static DECLARE_MUTEX(genl_sem); /* serialization of message processing */ + +static void genl_lock(void) +{ + down(&genl_sem); +} + +static int genl_trylock(void) +{ + return down_trylock(&genl_sem); +} + +static void genl_unlock(void) +{ + up(&genl_sem); + + if (genl_sock && genl_sock->sk_receive_queue.qlen) + genl_sock->sk_data_ready(genl_sock, 0); +} + +#define GENL_FAM_TAB_SIZE 16 +#define GENL_FAM_TAB_MASK (GENL_FAM_TAB_SIZE - 1) + +static struct list_head family_ht[GENL_FAM_TAB_SIZE]; + +static int genl_ctrl_event(int event, void *data); + +static inline unsigned int genl_family_hash(unsigned int id) +{ + return id & GENL_FAM_TAB_MASK; +} + +static inline struct list_head *genl_family_chain(unsigned int id) +{ + return &family_ht[genl_family_hash(id)]; +} + +static struct genl_family *genl_family_find_byid(unsigned int id) +{ + struct genl_family *f; + + list_for_each_entry(f, genl_family_chain(id), family_list) + if (f->id == id) + return f; + + return NULL; +} + +static struct genl_family *genl_family_find_byname(char *name) +{ + struct genl_family *f; + int i; + + for (i = 0; i < GENL_FAM_TAB_SIZE; i++) + list_for_each_entry(f, genl_family_chain(i), family_list) + if (strcmp(f->name, name) == 0) + return f; + + return NULL; +} + +static struct genl_ops *genl_get_cmd(u8 cmd, struct genl_family *family) +{ + struct genl_ops *ops; + + list_for_each_entry(ops, &family->ops_list, ops_list) + if (ops->cmd == cmd) + return ops; + + return NULL; +} + +/* Of course we are going to have problems once we hit + * 2^16 alive types, but that can only happen by year 2K +*/ +static inline u16 genl_generate_id(void) +{ + static u16 id_gen_idx; + int overflowed = 0; + + do { + if (id_gen_idx == 0) + id_gen_idx = GENL_MIN_ID; + + if (++id_gen_idx > GENL_MAX_ID) { + if (!overflowed) { + overflowed = 1; + id_gen_idx = 0; + continue; + } else + return 0; + } + + } while (genl_family_find_byid(id_gen_idx)); + + return id_gen_idx; +} + +/** + * genl_register_ops - register generic netlink operations + * @family: generic netlink family + * @ops: operations to be registered + * + * Registers the specified operations and assigns them to the specified + * family. Either a doit or dumpit callback must be specified or the + * operation will fail. Only one operation structure per command + * identifier may be registered. + * + * See include/net/genetlink.h for more documenation on the operations + * structure. + * + * Returns 0 on success or a negative error code. + */ +int genl_register_ops(struct genl_family *family, struct genl_ops *ops) +{ + int err = -EINVAL; + + if (ops->dumpit == NULL && ops->doit == NULL) + goto errout; + + if (genl_get_cmd(ops->cmd, family)) { + err = -EEXIST; + goto errout; + } + + genl_lock(); + list_add_tail(&ops->ops_list, &family->ops_list); + genl_unlock(); + + genl_ctrl_event(CTRL_CMD_NEWOPS, ops); + err = 0; +errout: + return err; +} + +/** + * genl_unregister_ops - unregister generic netlink operations + * @family: generic netlink family + * @ops: operations to be unregistered + * + * Unregisters the specified operations and unassigns them from the + * specified family. The operation blocks until the current message + * processing has finished and doesn't start again until the + * unregister process has finished. + * + * Note: It is not necessary to unregister all operations before + * unregistering the family, unregistering the family will cause + * all assigned operations to be unregistered automatically. + * + * Returns 0 on success or a negative error code. + */ +int genl_unregister_ops(struct genl_family *family, struct genl_ops *ops) +{ + struct genl_ops *rc; + + genl_lock(); + list_for_each_entry(rc, &family->ops_list, ops_list) { + if (rc == ops) { + list_del(&ops->ops_list); + genl_unlock(); + genl_ctrl_event(CTRL_CMD_DELOPS, ops); + return 0; + } + } + genl_unlock(); + + return -ENOENT; +} + +/** + * genl_register_family - register a generic netlink family + * @family: generic netlink family + * + * Registers the specified family after validating it first. Only one + * family may be registered with the same family name or identifier. + * The family id may equal GENL_ID_GENERATE causing an unique id to + * be automatically generated and assigned. + * + * Return 0 on success or a negative error code. + */ +int genl_register_family(struct genl_family *family) +{ + int err = -EINVAL; + + if (family->id && family->id < GENL_MIN_ID) + goto errout; + + if (family->id > GENL_MAX_ID) + goto errout; + + INIT_LIST_HEAD(&family->ops_list); + + genl_lock(); + + if (genl_family_find_byname(family->name)) { + err = -EEXIST; + goto errout_locked; + } + + if (genl_family_find_byid(family->id)) { + err = -EEXIST; + goto errout_locked; + } + + if (!try_module_get(family->owner)) { + err = -EBUSY; + goto errout_locked; + } + + if (family->id == GENL_ID_GENERATE) { + u16 newid = genl_generate_id(); + + if (!newid) { + err = -ENOMEM; + goto errout_locked; + } + + family->id = newid; + } + + if (family->maxattr) { + family->attrbuf = kmalloc((family->maxattr+1) * + sizeof(struct nlattr *), GFP_KERNEL); + if (family->attrbuf == NULL) { + err = -ENOMEM; + goto errout; + } + } else + family->attrbuf = NULL; + + list_add_tail(&family->family_list, genl_family_chain(family->id)); + genl_unlock(); + + genl_ctrl_event(CTRL_CMD_NEWFAMILY, family); + + return 0; + +errout_locked: + genl_unlock(); +errout: + return err; +} + +/** + * genl_unregister_family - unregister generic netlink family + * @family: generic netlink family + * + * Unregisters the specified family. + * + * Returns 0 on success or a negative error code. + */ +int genl_unregister_family(struct genl_family *family) +{ + struct genl_family *rc; + + genl_lock(); + + list_for_each_entry(rc, genl_family_chain(family->id), family_list) { + if (family->id != rc->id || strcmp(rc->name, family->name)) + continue; + + list_del(&rc->family_list); + INIT_LIST_HEAD(&family->ops_list); + genl_unlock(); + + module_put(family->owner); + kfree(family->attrbuf); + genl_ctrl_event(CTRL_CMD_DELFAMILY, family); + return 0; + } + + genl_unlock(); + + return -ENOENT; +} + +static inline int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, + int *errp) +{ + struct genl_ops *ops; + struct genl_family *family; + struct genl_info info; + struct genlmsghdr *hdr = nlmsg_data(nlh); + int hdrlen, err = -EINVAL; + + if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) + goto ignore; + + if (nlh->nlmsg_type < NLMSG_MIN_TYPE) + goto ignore; + + family = genl_family_find_byid(nlh->nlmsg_type); + if (family == NULL) { + err = -ENOENT; + goto errout; + } + + hdrlen = GENL_HDRLEN + family->hdrsize; + if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) + goto errout; + + ops = genl_get_cmd(hdr->cmd, family); + if (ops == NULL) { + err = -EOPNOTSUPP; + goto errout; + } + + if ((ops->flags & GENL_ADMIN_PERM) && security_netlink_recv(skb)) { + err = -EPERM; + goto errout; + } + + if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (ops->dumpit == NULL) { + err = -EOPNOTSUPP; + goto errout; + } + + *errp = err = netlink_dump_start(genl_sock, skb, nlh, + ops->dumpit, NULL); + if (err == 0) + skb_pull(skb, min(NLMSG_ALIGN(nlh->nlmsg_len), + skb->len)); + return -1; + } + + if (ops->doit == NULL) { + err = -EOPNOTSUPP; + goto errout; + } + + if (family->attrbuf) { + err = nlmsg_parse(nlh, hdrlen, family->attrbuf, family->maxattr, + ops->policy); + if (err < 0) + goto errout; + } + + info.snd_seq = nlh->nlmsg_seq; + info.snd_pid = NETLINK_CB(skb).pid; + info.nlhdr = nlh; + info.genlhdr = nlmsg_data(nlh); + info.userhdr = nlmsg_data(nlh) + GENL_HDRLEN; + info.attrs = family->attrbuf; + + *errp = err = ops->doit(skb, &info); + return err; + +ignore: + return 0; + +errout: + *errp = err; + return -1; +} + +static void genl_rcv(struct sock *sk, int len) +{ + unsigned int qlen = 0; + + do { + if (genl_trylock()) + return; + netlink_run_queue(sk, &qlen, &genl_rcv_msg); + genl_unlock(); + } while (qlen && genl_sock && genl_sock->sk_receive_queue.qlen); +} + +/************************************************************************** + * Controller + **************************************************************************/ + +static int ctrl_fill_info(struct genl_family *family, u32 pid, u32 seq, + u32 flags, struct sk_buff *skb, u8 cmd) +{ + void *hdr; + + hdr = genlmsg_put(skb, pid, seq, GENL_ID_CTRL, 0, flags, cmd, + family->version); + if (hdr == NULL) + return -1; + + NLA_PUT_STRING(skb, CTRL_ATTR_FAMILY_NAME, family->name); + NLA_PUT_U16(skb, CTRL_ATTR_FAMILY_ID, family->id); + + return genlmsg_end(skb, hdr); + +nla_put_failure: + return genlmsg_cancel(skb, hdr); +} + +static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb) +{ + + int i, n = 0; + struct genl_family *rt; + int chains_to_skip = cb->args[0]; + int fams_to_skip = cb->args[1]; + + for (i = 0; i < GENL_FAM_TAB_SIZE; i++) { + if (i < chains_to_skip) + continue; + n = 0; + list_for_each_entry(rt, genl_family_chain(i), family_list) { + if (++n < fams_to_skip) + continue; + if (ctrl_fill_info(rt, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + skb, CTRL_CMD_NEWFAMILY) < 0) + goto errout; + } + + fams_to_skip = 0; + } + +errout: + cb->args[0] = i; + cb->args[1] = n; + + return skb->len; +} + +static struct sk_buff *ctrl_build_msg(struct genl_family *family, u32 pid, + int seq, int cmd) +{ + struct sk_buff *skb; + int err; + + skb = nlmsg_new(NLMSG_GOODSIZE); + if (skb == NULL) + return ERR_PTR(-ENOBUFS); + + err = ctrl_fill_info(family, pid, seq, 0, skb, cmd); + if (err < 0) { + nlmsg_free(skb); + return ERR_PTR(err); + } + + return skb; +} + +static struct nla_policy ctrl_policy[CTRL_ATTR_MAX+1] __read_mostly = { + [CTRL_ATTR_FAMILY_ID] = { .type = NLA_U16 }, + [CTRL_ATTR_FAMILY_NAME] = { .type = NLA_STRING }, +}; + +static int ctrl_getfamily(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *msg; + struct genl_family *res = NULL; + int err = -EINVAL; + + if (info->attrs[CTRL_ATTR_FAMILY_ID]) { + u16 id = nla_get_u16(info->attrs[CTRL_ATTR_FAMILY_ID]); + res = genl_family_find_byid(id); + } + + if (info->attrs[CTRL_ATTR_FAMILY_NAME]) { + char name[GENL_NAMSIZ]; + + if (nla_strlcpy(name, info->attrs[CTRL_ATTR_FAMILY_NAME], + GENL_NAMSIZ) >= GENL_NAMSIZ) + goto errout; + + res = genl_family_find_byname(name); + } + + if (res == NULL) { + err = -ENOENT; + goto errout; + } + + msg = ctrl_build_msg(res, info->snd_pid, info->snd_seq, + CTRL_CMD_NEWFAMILY); + if (IS_ERR(msg)) { + err = PTR_ERR(msg); + goto errout; + } + + err = genlmsg_unicast(msg, info->snd_pid); +errout: + return err; +} + +static int genl_ctrl_event(int event, void *data) +{ + struct sk_buff *msg; + + if (genl_sock == NULL) + return 0; + + switch (event) { + case CTRL_CMD_NEWFAMILY: + case CTRL_CMD_DELFAMILY: + msg = ctrl_build_msg(data, 0, 0, event); + if (IS_ERR(msg)) + return PTR_ERR(msg); + + genlmsg_multicast(msg, 0, GENL_ID_CTRL); + break; + } + + return 0; +} + +static struct genl_ops genl_ctrl_ops = { + .cmd = CTRL_CMD_GETFAMILY, + .doit = ctrl_getfamily, + .dumpit = ctrl_dumpfamily, + .policy = ctrl_policy, +}; + +static struct genl_family genl_ctrl = { + .id = GENL_ID_CTRL, + .name = "nlctrl", + .version = 0x1, + .maxattr = CTRL_ATTR_MAX, + .owner = THIS_MODULE, +}; + +static int __init genl_init(void) +{ + int i, err; + + for (i = 0; i < GENL_FAM_TAB_SIZE; i++) + INIT_LIST_HEAD(&family_ht[i]); + + err = genl_register_family(&genl_ctrl); + if (err < 0) + goto errout; + + err = genl_register_ops(&genl_ctrl, &genl_ctrl_ops); + if (err < 0) + goto errout_register; + + netlink_set_nonroot(NETLINK_GENERIC, NL_NONROOT_RECV); + genl_sock = netlink_kernel_create(NETLINK_GENERIC, GENL_MAX_ID, + genl_rcv, THIS_MODULE); + if (genl_sock == NULL) { + panic("GENL: Cannot initialize generic netlink\n"); + return -ENOMEM; + } + + return 0; + +errout_register: + genl_unregister_family(&genl_ctrl); +errout: + panic("GENL: Cannot register controller: %d\n", err); + return err; +} + +subsys_initcall(genl_init); + +EXPORT_SYMBOL(genl_sock); +EXPORT_SYMBOL(genl_register_ops); +EXPORT_SYMBOL(genl_unregister_ops); +EXPORT_SYMBOL(genl_register_family); +EXPORT_SYMBOL(genl_unregister_family); diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index b18fe504301..8631b65a731 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -240,8 +240,7 @@ static void rose_remove_neigh(struct rose_neigh *rose_neigh) if ((s = rose_neigh_list) == rose_neigh) { rose_neigh_list = rose_neigh->next; spin_unlock_bh(&rose_neigh_list_lock); - if (rose_neigh->digipeat != NULL) - kfree(rose_neigh->digipeat); + kfree(rose_neigh->digipeat); kfree(rose_neigh); return; } @@ -250,8 +249,7 @@ static void rose_remove_neigh(struct rose_neigh *rose_neigh) if (s->next == rose_neigh) { s->next = rose_neigh->next; spin_unlock_bh(&rose_neigh_list_lock); - if (rose_neigh->digipeat != NULL) - kfree(rose_neigh->digipeat); + kfree(rose_neigh->digipeat); kfree(rose_neigh); return; } diff --git a/net/rxrpc/transport.c b/net/rxrpc/transport.c index 122c086ee2d..dbe6105e83a 100644 --- a/net/rxrpc/transport.c +++ b/net/rxrpc/transport.c @@ -23,6 +23,7 @@ #include <linux/in.h> #include <linux/in6.h> #include <linux/icmp.h> +#include <linux/skbuff.h> #include <net/sock.h> #include <net/ip.h> #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) @@ -475,15 +476,11 @@ void rxrpc_trans_receive_packet(struct rxrpc_transport *trans) /* we'll probably need to checksum it (didn't call * sock_recvmsg) */ - if (pkt->ip_summed != CHECKSUM_UNNECESSARY) { - if ((unsigned short) - csum_fold(skb_checksum(pkt, 0, pkt->len, - pkt->csum))) { - kfree_skb(pkt); - rxrpc_krxiod_queue_transport(trans); - _leave(" CSUM failed"); - return; - } + if (skb_checksum_complete(pkt)) { + kfree_skb(pkt); + rxrpc_krxiod_queue_transport(trans); + _leave(" CSUM failed"); + return; } addr = pkt->nh.iph->saddr; diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 7f34e7fd767..55cd5327fbd 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -40,9 +40,10 @@ config NET_SCHED The available schedulers are listed in the following questions; you can say Y to as many as you like. If unsure, say N now. +if NET_SCHED + choice prompt "Packet scheduler clock source" - depends on NET_SCHED default NET_SCH_CLK_JIFFIES ---help--- Packet schedulers need a monotonic clock that increments at a static @@ -98,11 +99,9 @@ config NET_SCH_CLK_CPU endchoice comment "Queueing/Scheduling" - depends on NET_SCHED config NET_SCH_CBQ tristate "Class Based Queueing (CBQ)" - depends on NET_SCHED ---help--- Say Y here if you want to use the Class-Based Queueing (CBQ) packet scheduling algorithm. This algorithm classifies the waiting packets @@ -120,7 +119,6 @@ config NET_SCH_CBQ config NET_SCH_HTB tristate "Hierarchical Token Bucket (HTB)" - depends on NET_SCHED ---help--- Say Y here if you want to use the Hierarchical Token Buckets (HTB) packet scheduling algorithm. See @@ -135,7 +133,6 @@ config NET_SCH_HTB config NET_SCH_HFSC tristate "Hierarchical Fair Service Curve (HFSC)" - depends on NET_SCHED ---help--- Say Y here if you want to use the Hierarchical Fair Service Curve (HFSC) packet scheduling algorithm. @@ -145,7 +142,7 @@ config NET_SCH_HFSC config NET_SCH_ATM tristate "ATM Virtual Circuits (ATM)" - depends on NET_SCHED && ATM + depends on ATM ---help--- Say Y here if you want to use the ATM pseudo-scheduler. This provides a framework for invoking classifiers, which in turn @@ -159,7 +156,6 @@ config NET_SCH_ATM config NET_SCH_PRIO tristate "Multi Band Priority Queueing (PRIO)" - depends on NET_SCHED ---help--- Say Y here if you want to use an n-band priority queue packet scheduler. @@ -169,7 +165,6 @@ config NET_SCH_PRIO config NET_SCH_RED tristate "Random Early Detection (RED)" - depends on NET_SCHED ---help--- Say Y here if you want to use the Random Early Detection (RED) packet scheduling algorithm. @@ -181,7 +176,6 @@ config NET_SCH_RED config NET_SCH_SFQ tristate "Stochastic Fairness Queueing (SFQ)" - depends on NET_SCHED ---help--- Say Y here if you want to use the Stochastic Fairness Queueing (SFQ) packet scheduling algorithm . @@ -193,7 +187,6 @@ config NET_SCH_SFQ config NET_SCH_TEQL tristate "True Link Equalizer (TEQL)" - depends on NET_SCHED ---help--- Say Y here if you want to use the True Link Equalizer (TLE) packet scheduling algorithm. This queueing discipline allows the combination @@ -206,7 +199,6 @@ config NET_SCH_TEQL config NET_SCH_TBF tristate "Token Bucket Filter (TBF)" - depends on NET_SCHED ---help--- Say Y here if you want to use the Token Bucket Filter (TBF) packet scheduling algorithm. @@ -218,7 +210,6 @@ config NET_SCH_TBF config NET_SCH_GRED tristate "Generic Random Early Detection (GRED)" - depends on NET_SCHED ---help--- Say Y here if you want to use the Generic Random Early Detection (GRED) packet scheduling algorithm for some of your network devices @@ -230,7 +221,6 @@ config NET_SCH_GRED config NET_SCH_DSMARK tristate "Differentiated Services marker (DSMARK)" - depends on NET_SCHED ---help--- Say Y if you want to schedule packets according to the Differentiated Services architecture proposed in RFC 2475. @@ -242,7 +232,6 @@ config NET_SCH_DSMARK config NET_SCH_NETEM tristate "Network emulator (NETEM)" - depends on NET_SCHED ---help--- Say Y if you want to emulate network delay, loss, and packet re-ordering. This is often useful to simulate networks when @@ -255,7 +244,6 @@ config NET_SCH_NETEM config NET_SCH_INGRESS tristate "Ingress Qdisc" - depends on NET_SCHED ---help--- Say Y here if you want to use classifiers for incoming packets. If unsure, say Y. @@ -264,14 +252,12 @@ config NET_SCH_INGRESS module will be called sch_ingress. comment "Classification" - depends on NET_SCHED config NET_CLS boolean config NET_CLS_BASIC tristate "Elementary classification (BASIC)" - depends NET_SCHED select NET_CLS ---help--- Say Y here if you want to be able to classify packets using @@ -282,7 +268,6 @@ config NET_CLS_BASIC config NET_CLS_TCINDEX tristate "Traffic-Control Index (TCINDEX)" - depends NET_SCHED select NET_CLS ---help--- Say Y here if you want to be able to classify packets based on @@ -294,7 +279,6 @@ config NET_CLS_TCINDEX config NET_CLS_ROUTE4 tristate "Routing decision (ROUTE)" - depends NET_SCHED select NET_CLS_ROUTE select NET_CLS ---help--- @@ -306,11 +290,9 @@ config NET_CLS_ROUTE4 config NET_CLS_ROUTE bool - default n config NET_CLS_FW tristate "Netfilter mark (FW)" - depends NET_SCHED select NET_CLS ---help--- If you say Y here, you will be able to classify packets @@ -321,7 +303,6 @@ config NET_CLS_FW config NET_CLS_U32 tristate "Universal 32bit comparisons w/ hashing (U32)" - depends NET_SCHED select NET_CLS ---help--- Say Y here to be able to classify packetes using a universal @@ -345,7 +326,6 @@ config CLS_U32_MARK config NET_CLS_RSVP tristate "IPv4 Resource Reservation Protocol (RSVP)" - depends on NET_SCHED select NET_CLS select NET_ESTIMATOR ---help--- @@ -361,7 +341,6 @@ config NET_CLS_RSVP config NET_CLS_RSVP6 tristate "IPv6 Resource Reservation Protocol (RSVP6)" - depends on NET_SCHED select NET_CLS select NET_ESTIMATOR ---help--- @@ -377,7 +356,6 @@ config NET_CLS_RSVP6 config NET_EMATCH bool "Extended Matches" - depends NET_SCHED select NET_CLS ---help--- Say Y here if you want to use extended matches on top of classifiers @@ -456,7 +434,7 @@ config NET_EMATCH_TEXT config NET_CLS_ACT bool "Actions" - depends on EXPERIMENTAL && NET_SCHED + depends on EXPERIMENTAL select NET_ESTIMATOR ---help--- Say Y here if you want to use traffic control actions. Actions @@ -539,7 +517,7 @@ config NET_ACT_SIMP config NET_CLS_POLICE bool "Traffic Policing (obsolete)" - depends on NET_SCHED && NET_CLS_ACT!=y + depends on NET_CLS_ACT!=y select NET_ESTIMATOR ---help--- Say Y here if you want to do traffic policing, i.e. strict @@ -549,7 +527,7 @@ config NET_CLS_POLICE config NET_CLS_IND bool "Incoming device classification" - depends on NET_SCHED && (NET_CLS_U32 || NET_CLS_FW) + depends on NET_CLS_U32 || NET_CLS_FW ---help--- Say Y here to extend the u32 and fw classifier to support classification based on the incoming device. This option is @@ -557,11 +535,12 @@ config NET_CLS_IND config NET_ESTIMATOR bool "Rate estimator" - depends on NET_SCHED ---help--- Say Y here to allow using rate estimators to estimate the current rate-of-flow for network devices, queues, etc. This module is automaticaly selected if needed but can be selected manually for statstical purposes. +endif # NET_SCHED + endmenu diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 29d8b9a4d16..75470486e40 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -298,8 +298,7 @@ static int fw_change(struct tcf_proto *tp, unsigned long base, return 0; errout: - if (f) - kfree(f); + kfree(f); return err; } diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 02996ac05c7..520ff716dab 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -525,8 +525,7 @@ reinsert: return 0; errout: - if (f) - kfree(f); + kfree(f); return err; } diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 006168d6937..572f06be3b0 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -555,8 +555,7 @@ insert: goto insert; errout: - if (f) - kfree(f); + kfree(f); errout2: tcf_exts_destroy(tp, &e); return err; diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 404d9d83a7f..9f921174c8a 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -194,8 +194,7 @@ found: } tcf_unbind_filter(tp, &r->res); tcf_exts_destroy(tp, &r->exts); - if (f) - kfree(f); + kfree(f); return 0; } @@ -442,10 +441,8 @@ static void tcindex_destroy(struct tcf_proto *tp) walker.skip = 0; walker.fn = &tcindex_destroy_element; tcindex_walk(tp,&walker); - if (p->perfect) - kfree(p->perfect); - if (p->h) - kfree(p->h); + kfree(p->perfect); + kfree(p->h); kfree(p); tp->root = NULL; } diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 364b87d8645..2b670479dde 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -347,7 +347,7 @@ static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n) if (n->ht_down) n->ht_down->refcnt--; #ifdef CONFIG_CLS_U32_PERF - if (n && (NULL != n->pf)) + if (n) kfree(n->pf); #endif kfree(n); @@ -680,7 +680,7 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle, return 0; } #ifdef CONFIG_CLS_U32_PERF - if (n && (NULL != n->pf)) + if (n) kfree(n->pf); #endif kfree(n); diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index cf68a59fdc5..700844d49d7 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -561,8 +561,7 @@ static int meta_var_change(struct meta_value *dst, struct rtattr *rta) static void meta_var_destroy(struct meta_value *v) { - if (v->val) - kfree((void *) v->val); + kfree((void *) v->val); } static void meta_var_apply_extras(struct meta_value *v, diff --git a/net/sched/ematch.c b/net/sched/ematch.c index ebfe2e7d21b..64b047c6556 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ -298,6 +298,11 @@ int tcf_em_tree_validate(struct tcf_proto *tp, struct rtattr *rta, struct tcf_ematch_tree_hdr *tree_hdr; struct tcf_ematch *em; + if (!rta) { + memset(tree, 0, sizeof(*tree)); + return 0; + } + if (rtattr_parse_nested(tb, TCA_EMATCH_TREE_MAX, rta) < 0) goto errout; diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 25c171c3271..29a2dd9f302 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -15,247 +15,281 @@ * from Ren Liu * - More error checks * - * - * - * For all the glorious comments look at Alexey's sch_red.c + * For all the glorious comments look at include/net/red.h */ #include <linux/config.h> #include <linux/module.h> -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> -#include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/if_ether.h> -#include <linux/inet.h> #include <linux/netdevice.h> -#include <linux/etherdevice.h> -#include <linux/notifier.h> -#include <net/ip.h> -#include <net/route.h> #include <linux/skbuff.h> -#include <net/sock.h> #include <net/pkt_sched.h> +#include <net/red.h> -#if 1 /* control */ -#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) -#else -#define DPRINTK(format,args...) -#endif - -#if 0 /* data */ -#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args) -#else -#define D2PRINTK(format,args...) -#endif +#define GRED_DEF_PRIO (MAX_DPs / 2) +#define GRED_VQ_MASK (MAX_DPs - 1) struct gred_sched_data; struct gred_sched; struct gred_sched_data { -/* Parameters */ u32 limit; /* HARD maximal queue length */ - u32 qth_min; /* Min average length threshold: A scaled */ - u32 qth_max; /* Max average length threshold: A scaled */ u32 DP; /* the drop pramaters */ - char Wlog; /* log(W) */ - char Plog; /* random number bits */ - u32 Scell_max; - u32 Rmask; u32 bytesin; /* bytes seen on virtualQ so far*/ u32 packetsin; /* packets seen on virtualQ so far*/ u32 backlog; /* bytes on the virtualQ */ - u32 forced; /* packets dropped for exceeding limits */ - u32 early; /* packets dropped as a warning */ - u32 other; /* packets dropped by invoking drop() */ - u32 pdrop; /* packets dropped because we exceeded physical queue limits */ - char Scell_log; - u8 Stab[256]; - u8 prio; /* the prio of this vq */ - -/* Variables */ - unsigned long qave; /* Average queue length: A scaled */ - int qcount; /* Packets since last random number generation */ - u32 qR; /* Cached random number */ - - psched_time_t qidlestart; /* Start of idle period */ + u8 prio; /* the prio of this vq */ + + struct red_parms parms; + struct red_stats stats; +}; + +enum { + GRED_WRED_MODE = 1, + GRED_RIO_MODE, }; struct gred_sched { struct gred_sched_data *tab[MAX_DPs]; - u32 DPs; - u32 def; - u8 initd; - u8 grio; - u8 eqp; + unsigned long flags; + u32 red_flags; + u32 DPs; + u32 def; + struct red_parms wred_set; }; -static int -gred_enqueue(struct sk_buff *skb, struct Qdisc* sch) +static inline int gred_wred_mode(struct gred_sched *table) { - psched_time_t now; - struct gred_sched_data *q=NULL; - struct gred_sched *t= qdisc_priv(sch); - unsigned long qave=0; - int i=0; + return test_bit(GRED_WRED_MODE, &table->flags); +} + +static inline void gred_enable_wred_mode(struct gred_sched *table) +{ + __set_bit(GRED_WRED_MODE, &table->flags); +} + +static inline void gred_disable_wred_mode(struct gred_sched *table) +{ + __clear_bit(GRED_WRED_MODE, &table->flags); +} + +static inline int gred_rio_mode(struct gred_sched *table) +{ + return test_bit(GRED_RIO_MODE, &table->flags); +} + +static inline void gred_enable_rio_mode(struct gred_sched *table) +{ + __set_bit(GRED_RIO_MODE, &table->flags); +} + +static inline void gred_disable_rio_mode(struct gred_sched *table) +{ + __clear_bit(GRED_RIO_MODE, &table->flags); +} + +static inline int gred_wred_mode_check(struct Qdisc *sch) +{ + struct gred_sched *table = qdisc_priv(sch); + int i; - if (!t->initd && skb_queue_len(&sch->q) < (sch->dev->tx_queue_len ? : 1)) { - D2PRINTK("NO GRED Queues setup yet! Enqueued anyway\n"); - goto do_enqueue; + /* Really ugly O(n^2) but shouldn't be necessary too frequent. */ + for (i = 0; i < table->DPs; i++) { + struct gred_sched_data *q = table->tab[i]; + int n; + + if (q == NULL) + continue; + + for (n = 0; n < table->DPs; n++) + if (table->tab[n] && table->tab[n] != q && + table->tab[n]->prio == q->prio) + return 1; } + return 0; +} + +static inline unsigned int gred_backlog(struct gred_sched *table, + struct gred_sched_data *q, + struct Qdisc *sch) +{ + if (gred_wred_mode(table)) + return sch->qstats.backlog; + else + return q->backlog; +} + +static inline u16 tc_index_to_dp(struct sk_buff *skb) +{ + return skb->tc_index & GRED_VQ_MASK; +} + +static inline void gred_load_wred_set(struct gred_sched *table, + struct gred_sched_data *q) +{ + q->parms.qavg = table->wred_set.qavg; + q->parms.qidlestart = table->wred_set.qidlestart; +} + +static inline void gred_store_wred_set(struct gred_sched *table, + struct gred_sched_data *q) +{ + table->wred_set.qavg = q->parms.qavg; +} + +static inline int gred_use_ecn(struct gred_sched *t) +{ + return t->red_flags & TC_RED_ECN; +} - if ( ((skb->tc_index&0xf) > (t->DPs -1)) || !(q=t->tab[skb->tc_index&0xf])) { - printk("GRED: setting to default (%d)\n ",t->def); - if (!(q=t->tab[t->def])) { - DPRINTK("GRED: setting to default FAILED! dropping!! " - "(%d)\n ", t->def); - goto drop; +static inline int gred_use_harddrop(struct gred_sched *t) +{ + return t->red_flags & TC_RED_HARDDROP; +} + +static int gred_enqueue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct gred_sched_data *q=NULL; + struct gred_sched *t= qdisc_priv(sch); + unsigned long qavg = 0; + u16 dp = tc_index_to_dp(skb); + + if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { + dp = t->def; + + if ((q = t->tab[dp]) == NULL) { + /* Pass through packets not assigned to a DP + * if no default DP has been configured. This + * allows for DP flows to be left untouched. + */ + if (skb_queue_len(&sch->q) < sch->dev->tx_queue_len) + return qdisc_enqueue_tail(skb, sch); + else + goto drop; } + /* fix tc_index? --could be controvesial but needed for requeueing */ - skb->tc_index=(skb->tc_index&0xfffffff0) | t->def; + skb->tc_index = (skb->tc_index & ~GRED_VQ_MASK) | dp; } - D2PRINTK("gred_enqueue virtualQ 0x%x classid %x backlog %d " - "general backlog %d\n",skb->tc_index&0xf,sch->handle,q->backlog, - sch->qstats.backlog); - /* sum up all the qaves of prios <= to ours to get the new qave*/ - if (!t->eqp && t->grio) { - for (i=0;i<t->DPs;i++) { - if ((!t->tab[i]) || (i==q->DP)) - continue; - - if ((t->tab[i]->prio < q->prio) && (PSCHED_IS_PASTPERFECT(t->tab[i]->qidlestart))) - qave +=t->tab[i]->qave; + /* sum up all the qaves of prios <= to ours to get the new qave */ + if (!gred_wred_mode(t) && gred_rio_mode(t)) { + int i; + + for (i = 0; i < t->DPs; i++) { + if (t->tab[i] && t->tab[i]->prio < q->prio && + !red_is_idling(&t->tab[i]->parms)) + qavg +=t->tab[i]->parms.qavg; } - + } q->packetsin++; - q->bytesin+=skb->len; + q->bytesin += skb->len; - if (t->eqp && t->grio) { - qave=0; - q->qave=t->tab[t->def]->qave; - q->qidlestart=t->tab[t->def]->qidlestart; - } + if (gred_wred_mode(t)) + gred_load_wred_set(t, q); - if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) { - long us_idle; - PSCHED_GET_TIME(now); - us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max); - PSCHED_SET_PASTPERFECT(q->qidlestart); + q->parms.qavg = red_calc_qavg(&q->parms, gred_backlog(t, q, sch)); - q->qave >>= q->Stab[(us_idle>>q->Scell_log)&0xFF]; - } else { - if (t->eqp) { - q->qave += sch->qstats.backlog - (q->qave >> q->Wlog); - } else { - q->qave += q->backlog - (q->qave >> q->Wlog); - } + if (red_is_idling(&q->parms)) + red_end_of_idle_period(&q->parms); - } - - - if (t->eqp && t->grio) - t->tab[t->def]->qave=q->qave; - - if ((q->qave+qave) < q->qth_min) { - q->qcount = -1; -enqueue: - if (q->backlog + skb->len <= q->limit) { - q->backlog += skb->len; -do_enqueue: - __skb_queue_tail(&sch->q, skb); - sch->qstats.backlog += skb->len; - sch->bstats.bytes += skb->len; - sch->bstats.packets++; - return 0; - } else { - q->pdrop++; - } + if (gred_wred_mode(t)) + gred_store_wred_set(t, q); -drop: - kfree_skb(skb); - sch->qstats.drops++; - return NET_XMIT_DROP; - } - if ((q->qave+qave) >= q->qth_max) { - q->qcount = -1; - sch->qstats.overlimits++; - q->forced++; - goto drop; + switch (red_action(&q->parms, q->parms.qavg + qavg)) { + case RED_DONT_MARK: + break; + + case RED_PROB_MARK: + sch->qstats.overlimits++; + if (!gred_use_ecn(t) || !INET_ECN_set_ce(skb)) { + q->stats.prob_drop++; + goto congestion_drop; + } + + q->stats.prob_mark++; + break; + + case RED_HARD_MARK: + sch->qstats.overlimits++; + if (gred_use_harddrop(t) || !gred_use_ecn(t) || + !INET_ECN_set_ce(skb)) { + q->stats.forced_drop++; + goto congestion_drop; + } + q->stats.forced_mark++; + break; } - if (++q->qcount) { - if ((((qave+q->qave) - q->qth_min)>>q->Wlog)*q->qcount < q->qR) - goto enqueue; - q->qcount = 0; - q->qR = net_random()&q->Rmask; - sch->qstats.overlimits++; - q->early++; - goto drop; + + if (q->backlog + skb->len <= q->limit) { + q->backlog += skb->len; + return qdisc_enqueue_tail(skb, sch); } - q->qR = net_random()&q->Rmask; - goto enqueue; + + q->stats.pdrop++; +drop: + return qdisc_drop(skb, sch); + +congestion_drop: + qdisc_drop(skb, sch); + return NET_XMIT_CN; } -static int -gred_requeue(struct sk_buff *skb, struct Qdisc* sch) +static int gred_requeue(struct sk_buff *skb, struct Qdisc* sch) { + struct gred_sched *t = qdisc_priv(sch); struct gred_sched_data *q; - struct gred_sched *t= qdisc_priv(sch); - q= t->tab[(skb->tc_index&0xf)]; -/* error checking here -- probably unnecessary */ - PSCHED_SET_PASTPERFECT(q->qidlestart); - - __skb_queue_head(&sch->q, skb); - sch->qstats.backlog += skb->len; - sch->qstats.requeues++; - q->backlog += skb->len; - return 0; + u16 dp = tc_index_to_dp(skb); + + if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { + if (net_ratelimit()) + printk(KERN_WARNING "GRED: Unable to relocate VQ 0x%x " + "for requeue, screwing up backlog.\n", + tc_index_to_dp(skb)); + } else { + if (red_is_idling(&q->parms)) + red_end_of_idle_period(&q->parms); + q->backlog += skb->len; + } + + return qdisc_requeue(skb, sch); } -static struct sk_buff * -gred_dequeue(struct Qdisc* sch) +static struct sk_buff *gred_dequeue(struct Qdisc* sch) { struct sk_buff *skb; - struct gred_sched_data *q; - struct gred_sched *t= qdisc_priv(sch); + struct gred_sched *t = qdisc_priv(sch); + + skb = qdisc_dequeue_head(sch); - skb = __skb_dequeue(&sch->q); if (skb) { - sch->qstats.backlog -= skb->len; - q= t->tab[(skb->tc_index&0xf)]; - if (q) { - q->backlog -= skb->len; - if (!q->backlog && !t->eqp) - PSCHED_GET_TIME(q->qidlestart); + struct gred_sched_data *q; + u16 dp = tc_index_to_dp(skb); + + if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { + if (net_ratelimit()) + printk(KERN_WARNING "GRED: Unable to relocate " + "VQ 0x%x after dequeue, screwing up " + "backlog.\n", tc_index_to_dp(skb)); } else { - D2PRINTK("gred_dequeue: skb has bad tcindex %x\n",skb->tc_index&0xf); + q->backlog -= skb->len; + + if (!q->backlog && !gred_wred_mode(t)) + red_start_of_idle_period(&q->parms); } + return skb; } - if (t->eqp) { - q= t->tab[t->def]; - if (!q) - D2PRINTK("no default VQ set: Results will be " - "screwed up\n"); - else - PSCHED_GET_TIME(q->qidlestart); - } + if (gred_wred_mode(t) && !red_is_idling(&t->wred_set)) + red_start_of_idle_period(&t->wred_set); return NULL; } @@ -263,36 +297,34 @@ gred_dequeue(struct Qdisc* sch) static unsigned int gred_drop(struct Qdisc* sch) { struct sk_buff *skb; + struct gred_sched *t = qdisc_priv(sch); - struct gred_sched_data *q; - struct gred_sched *t= qdisc_priv(sch); - - skb = __skb_dequeue_tail(&sch->q); + skb = qdisc_dequeue_tail(sch); if (skb) { unsigned int len = skb->len; - sch->qstats.backlog -= len; - sch->qstats.drops++; - q= t->tab[(skb->tc_index&0xf)]; - if (q) { - q->backlog -= len; - q->other++; - if (!q->backlog && !t->eqp) - PSCHED_GET_TIME(q->qidlestart); + struct gred_sched_data *q; + u16 dp = tc_index_to_dp(skb); + + if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { + if (net_ratelimit()) + printk(KERN_WARNING "GRED: Unable to relocate " + "VQ 0x%x while dropping, screwing up " + "backlog.\n", tc_index_to_dp(skb)); } else { - D2PRINTK("gred_dequeue: skb has bad tcindex %x\n",skb->tc_index&0xf); + q->backlog -= len; + q->stats.other++; + + if (!q->backlog && !gred_wred_mode(t)) + red_start_of_idle_period(&q->parms); } - kfree_skb(skb); + qdisc_drop(skb, sch); return len; } - q=t->tab[t->def]; - if (!q) { - D2PRINTK("no default VQ set: Results might be screwed up\n"); - return 0; - } + if (gred_wred_mode(t) && !red_is_idling(&t->wred_set)) + red_start_of_idle_period(&t->wred_set); - PSCHED_GET_TIME(q->qidlestart); return 0; } @@ -300,293 +332,241 @@ static unsigned int gred_drop(struct Qdisc* sch) static void gred_reset(struct Qdisc* sch) { int i; - struct gred_sched_data *q; - struct gred_sched *t= qdisc_priv(sch); + struct gred_sched *t = qdisc_priv(sch); + + qdisc_reset_queue(sch); - __skb_queue_purge(&sch->q); + for (i = 0; i < t->DPs; i++) { + struct gred_sched_data *q = t->tab[i]; - sch->qstats.backlog = 0; + if (!q) + continue; - for (i=0;i<t->DPs;i++) { - q= t->tab[i]; - if (!q) - continue; - PSCHED_SET_PASTPERFECT(q->qidlestart); - q->qave = 0; - q->qcount = -1; + red_restart(&q->parms); q->backlog = 0; - q->other=0; - q->forced=0; - q->pdrop=0; - q->early=0; } } -static int gred_change(struct Qdisc *sch, struct rtattr *opt) +static inline void gred_destroy_vq(struct gred_sched_data *q) +{ + kfree(q); +} + +static inline int gred_change_table_def(struct Qdisc *sch, struct rtattr *dps) { struct gred_sched *table = qdisc_priv(sch); - struct gred_sched_data *q; - struct tc_gred_qopt *ctl; struct tc_gred_sopt *sopt; - struct rtattr *tb[TCA_GRED_STAB]; - struct rtattr *tb2[TCA_GRED_DPS]; int i; - if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_STAB, opt)) + if (dps == NULL || RTA_PAYLOAD(dps) < sizeof(*sopt)) return -EINVAL; - if (tb[TCA_GRED_PARMS-1] == 0 && tb[TCA_GRED_STAB-1] == 0) { - rtattr_parse_nested(tb2, TCA_GRED_DPS, opt); + sopt = RTA_DATA(dps); + + if (sopt->DPs > MAX_DPs || sopt->DPs == 0 || sopt->def_DP >= sopt->DPs) + return -EINVAL; - if (tb2[TCA_GRED_DPS-1] == 0) - return -EINVAL; + sch_tree_lock(sch); + table->DPs = sopt->DPs; + table->def = sopt->def_DP; + table->red_flags = sopt->flags; + + /* + * Every entry point to GRED is synchronized with the above code + * and the DP is checked against DPs, i.e. shadowed VQs can no + * longer be found so we can unlock right here. + */ + sch_tree_unlock(sch); + + if (sopt->grio) { + gred_enable_rio_mode(table); + gred_disable_wred_mode(table); + if (gred_wred_mode_check(sch)) + gred_enable_wred_mode(table); + } else { + gred_disable_rio_mode(table); + gred_disable_wred_mode(table); + } - sopt = RTA_DATA(tb2[TCA_GRED_DPS-1]); - table->DPs=sopt->DPs; - table->def=sopt->def_DP; - table->grio=sopt->grio; - table->initd=0; - /* probably need to clear all the table DP entries as well */ - return 0; - } + for (i = table->DPs; i < MAX_DPs; i++) { + if (table->tab[i]) { + printk(KERN_WARNING "GRED: Warning: Destroying " + "shadowed VQ 0x%x\n", i); + gred_destroy_vq(table->tab[i]); + table->tab[i] = NULL; + } + } + return 0; +} - if (!table->DPs || tb[TCA_GRED_PARMS-1] == 0 || tb[TCA_GRED_STAB-1] == 0 || - RTA_PAYLOAD(tb[TCA_GRED_PARMS-1]) < sizeof(*ctl) || - RTA_PAYLOAD(tb[TCA_GRED_STAB-1]) < 256) - return -EINVAL; +static inline int gred_change_vq(struct Qdisc *sch, int dp, + struct tc_gred_qopt *ctl, int prio, u8 *stab) +{ + struct gred_sched *table = qdisc_priv(sch); + struct gred_sched_data *q; - ctl = RTA_DATA(tb[TCA_GRED_PARMS-1]); - if (ctl->DP > MAX_DPs-1 ) { - /* misbehaving is punished! Put in the default drop probability */ - DPRINTK("\nGRED: DP %u not in the proper range fixed. New DP " - "set to default at %d\n",ctl->DP,table->def); - ctl->DP=table->def; - } - - if (table->tab[ctl->DP] == NULL) { - table->tab[ctl->DP]=kmalloc(sizeof(struct gred_sched_data), - GFP_KERNEL); - if (NULL == table->tab[ctl->DP]) + if (table->tab[dp] == NULL) { + table->tab[dp] = kmalloc(sizeof(*q), GFP_KERNEL); + if (table->tab[dp] == NULL) return -ENOMEM; - memset(table->tab[ctl->DP], 0, (sizeof(struct gred_sched_data))); - } - q= table->tab[ctl->DP]; - - if (table->grio) { - if (ctl->prio <=0) { - if (table->def && table->tab[table->def]) { - DPRINTK("\nGRED: DP %u does not have a prio" - "setting default to %d\n",ctl->DP, - table->tab[table->def]->prio); - q->prio=table->tab[table->def]->prio; - } else { - DPRINTK("\nGRED: DP %u does not have a prio" - " setting default to 8\n",ctl->DP); - q->prio=8; - } - } else { - q->prio=ctl->prio; - } - } else { - q->prio=8; + memset(table->tab[dp], 0, sizeof(*q)); } - - q->DP=ctl->DP; - q->Wlog = ctl->Wlog; - q->Plog = ctl->Plog; + q = table->tab[dp]; + q->DP = dp; + q->prio = prio; q->limit = ctl->limit; - q->Scell_log = ctl->Scell_log; - q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL; - q->Scell_max = (255<<q->Scell_log); - q->qth_min = ctl->qth_min<<ctl->Wlog; - q->qth_max = ctl->qth_max<<ctl->Wlog; - q->qave=0; - q->backlog=0; - q->qcount = -1; - q->other=0; - q->forced=0; - q->pdrop=0; - q->early=0; - - PSCHED_SET_PASTPERFECT(q->qidlestart); - memcpy(q->Stab, RTA_DATA(tb[TCA_GRED_STAB-1]), 256); - - if ( table->initd && table->grio) { - /* this looks ugly but it's not in the fast path */ - for (i=0;i<table->DPs;i++) { - if ((!table->tab[i]) || (i==q->DP) ) - continue; - if (table->tab[i]->prio == q->prio ){ - /* WRED mode detected */ - table->eqp=1; - break; - } - } - } - if (!table->initd) { - table->initd=1; - /* - the first entry also goes into the default until - over-written - */ - - if (table->tab[table->def] == NULL) { - table->tab[table->def]= - kmalloc(sizeof(struct gred_sched_data), GFP_KERNEL); - if (NULL == table->tab[table->def]) - return -ENOMEM; - - memset(table->tab[table->def], 0, - (sizeof(struct gred_sched_data))); - } - q= table->tab[table->def]; - q->DP=table->def; - q->Wlog = ctl->Wlog; - q->Plog = ctl->Plog; - q->limit = ctl->limit; - q->Scell_log = ctl->Scell_log; - q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL; - q->Scell_max = (255<<q->Scell_log); - q->qth_min = ctl->qth_min<<ctl->Wlog; - q->qth_max = ctl->qth_max<<ctl->Wlog; - - if (table->grio) - q->prio=table->tab[ctl->DP]->prio; - else - q->prio=8; - - q->qcount = -1; - PSCHED_SET_PASTPERFECT(q->qidlestart); - memcpy(q->Stab, RTA_DATA(tb[TCA_GRED_STAB-1]), 256); - } - return 0; + if (q->backlog == 0) + red_end_of_idle_period(&q->parms); + red_set_parms(&q->parms, + ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Plog, + ctl->Scell_log, stab); + + return 0; } -static int gred_init(struct Qdisc *sch, struct rtattr *opt) +static int gred_change(struct Qdisc *sch, struct rtattr *opt) { struct gred_sched *table = qdisc_priv(sch); - struct tc_gred_sopt *sopt; - struct rtattr *tb[TCA_GRED_STAB]; - struct rtattr *tb2[TCA_GRED_DPS]; + struct tc_gred_qopt *ctl; + struct rtattr *tb[TCA_GRED_MAX]; + int err = -EINVAL, prio = GRED_DEF_PRIO; + u8 *stab; - if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_STAB, opt)) + if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_MAX, opt)) return -EINVAL; - if (tb[TCA_GRED_PARMS-1] == 0 && tb[TCA_GRED_STAB-1] == 0) { - rtattr_parse_nested(tb2, TCA_GRED_DPS, opt); + if (tb[TCA_GRED_PARMS-1] == NULL && tb[TCA_GRED_STAB-1] == NULL) + return gred_change_table_def(sch, opt); + + if (tb[TCA_GRED_PARMS-1] == NULL || + RTA_PAYLOAD(tb[TCA_GRED_PARMS-1]) < sizeof(*ctl) || + tb[TCA_GRED_STAB-1] == NULL || + RTA_PAYLOAD(tb[TCA_GRED_STAB-1]) < 256) + return -EINVAL; + + ctl = RTA_DATA(tb[TCA_GRED_PARMS-1]); + stab = RTA_DATA(tb[TCA_GRED_STAB-1]); + + if (ctl->DP >= table->DPs) + goto errout; - if (tb2[TCA_GRED_DPS-1] == 0) - return -EINVAL; + if (gred_rio_mode(table)) { + if (ctl->prio == 0) { + int def_prio = GRED_DEF_PRIO; - sopt = RTA_DATA(tb2[TCA_GRED_DPS-1]); - table->DPs=sopt->DPs; - table->def=sopt->def_DP; - table->grio=sopt->grio; - table->initd=0; - return 0; + if (table->tab[table->def]) + def_prio = table->tab[table->def]->prio; + + printk(KERN_DEBUG "GRED: DP %u does not have a prio " + "setting default to %d\n", ctl->DP, def_prio); + + prio = def_prio; + } else + prio = ctl->prio; + } + + sch_tree_lock(sch); + + err = gred_change_vq(sch, ctl->DP, ctl, prio, stab); + if (err < 0) + goto errout_locked; + + if (gred_rio_mode(table)) { + gred_disable_wred_mode(table); + if (gred_wred_mode_check(sch)) + gred_enable_wred_mode(table); } - DPRINTK("\n GRED_INIT error!\n"); - return -EINVAL; + err = 0; + +errout_locked: + sch_tree_unlock(sch); +errout: + return err; } -static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) +static int gred_init(struct Qdisc *sch, struct rtattr *opt) { - unsigned long qave; - struct rtattr *rta; - struct tc_gred_qopt *opt = NULL ; - struct tc_gred_qopt *dst; - struct gred_sched *table = qdisc_priv(sch); - struct gred_sched_data *q; - int i; - unsigned char *b = skb->tail; + struct rtattr *tb[TCA_GRED_MAX]; - rta = (struct rtattr*)b; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_MAX, opt)) + return -EINVAL; - opt=kmalloc(sizeof(struct tc_gred_qopt)*MAX_DPs, GFP_KERNEL); + if (tb[TCA_GRED_PARMS-1] || tb[TCA_GRED_STAB-1]) + return -EINVAL; - if (opt == NULL) { - DPRINTK("gred_dump:failed to malloc for %Zd\n", - sizeof(struct tc_gred_qopt)*MAX_DPs); - goto rtattr_failure; - } + return gred_change_table_def(sch, tb[TCA_GRED_DPS-1]); +} - memset(opt, 0, (sizeof(struct tc_gred_qopt))*table->DPs); +static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct gred_sched *table = qdisc_priv(sch); + struct rtattr *parms, *opts = NULL; + int i; + struct tc_gred_sopt sopt = { + .DPs = table->DPs, + .def_DP = table->def, + .grio = gred_rio_mode(table), + .flags = table->red_flags, + }; - if (!table->initd) { - DPRINTK("NO GRED Queues setup!\n"); - } + opts = RTA_NEST(skb, TCA_OPTIONS); + RTA_PUT(skb, TCA_GRED_DPS, sizeof(sopt), &sopt); + parms = RTA_NEST(skb, TCA_GRED_PARMS); + + for (i = 0; i < MAX_DPs; i++) { + struct gred_sched_data *q = table->tab[i]; + struct tc_gred_qopt opt; - for (i=0;i<MAX_DPs;i++) { - dst= &opt[i]; - q= table->tab[i]; + memset(&opt, 0, sizeof(opt)); if (!q) { /* hack -- fix at some point with proper message This is how we indicate to tc that there is no VQ at this DP */ - dst->DP=MAX_DPs+i; - continue; + opt.DP = MAX_DPs + i; + goto append_opt; } - dst->limit=q->limit; - dst->qth_min=q->qth_min>>q->Wlog; - dst->qth_max=q->qth_max>>q->Wlog; - dst->DP=q->DP; - dst->backlog=q->backlog; - if (q->qave) { - if (table->eqp && table->grio) { - q->qidlestart=table->tab[table->def]->qidlestart; - q->qave=table->tab[table->def]->qave; - } - if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) { - long idle; - psched_time_t now; - PSCHED_GET_TIME(now); - idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max); - qave = q->qave >> q->Stab[(idle>>q->Scell_log)&0xFF]; - dst->qave = qave >> q->Wlog; - - } else { - dst->qave = q->qave >> q->Wlog; - } - } else { - dst->qave = 0; + opt.limit = q->limit; + opt.DP = q->DP; + opt.backlog = q->backlog; + opt.prio = q->prio; + opt.qth_min = q->parms.qth_min >> q->parms.Wlog; + opt.qth_max = q->parms.qth_max >> q->parms.Wlog; + opt.Wlog = q->parms.Wlog; + opt.Plog = q->parms.Plog; + opt.Scell_log = q->parms.Scell_log; + opt.other = q->stats.other; + opt.early = q->stats.prob_drop; + opt.forced = q->stats.forced_drop; + opt.pdrop = q->stats.pdrop; + opt.packets = q->packetsin; + opt.bytesin = q->bytesin; + + if (gred_wred_mode(table)) { + q->parms.qidlestart = + table->tab[table->def]->parms.qidlestart; + q->parms.qavg = table->tab[table->def]->parms.qavg; } - - - dst->Wlog = q->Wlog; - dst->Plog = q->Plog; - dst->Scell_log = q->Scell_log; - dst->other = q->other; - dst->forced = q->forced; - dst->early = q->early; - dst->pdrop = q->pdrop; - dst->prio = q->prio; - dst->packets=q->packetsin; - dst->bytesin=q->bytesin; + + opt.qave = red_calc_qavg(&q->parms, q->parms.qavg); + +append_opt: + RTA_APPEND(skb, sizeof(opt), &opt); } - RTA_PUT(skb, TCA_GRED_PARMS, sizeof(struct tc_gred_qopt)*MAX_DPs, opt); - rta->rta_len = skb->tail - b; + RTA_NEST_END(skb, parms); - kfree(opt); - return skb->len; + return RTA_NEST_END(skb, opts); rtattr_failure: - if (opt) - kfree(opt); - DPRINTK("gred_dump: FAILURE!!!!\n"); - -/* also free the opt struct here */ - skb_trim(skb, b - skb->data); - return -1; + return RTA_NEST_CANCEL(skb, opts); } static void gred_destroy(struct Qdisc *sch) @@ -594,15 +574,13 @@ static void gred_destroy(struct Qdisc *sch) struct gred_sched *table = qdisc_priv(sch); int i; - for (i = 0;i < table->DPs; i++) { + for (i = 0; i < table->DPs; i++) { if (table->tab[i]) - kfree(table->tab[i]); + gred_destroy_vq(table->tab[i]); } } static struct Qdisc_ops gred_qdisc_ops = { - .next = NULL, - .cl_ops = NULL, .id = "gred", .priv_size = sizeof(struct gred_sched), .enqueue = gred_enqueue, @@ -621,10 +599,13 @@ static int __init gred_module_init(void) { return register_qdisc(&gred_qdisc_ops); } -static void __exit gred_module_exit(void) + +static void __exit gred_module_exit(void) { unregister_qdisc(&gred_qdisc_ops); } + module_init(gred_module_init) module_exit(gred_module_exit) + MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index bb9bf8d5003..cdc8d283791 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -25,6 +25,8 @@ #include <net/pkt_sched.h> +#define VERSION "1.1" + /* Network Emulation Queuing algorithm. ==================================== @@ -185,10 +187,13 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) || q->counter < q->gap /* inside last reordering gap */ || q->reorder < get_crandom(&q->reorder_cor)) { psched_time_t now; + psched_tdiff_t delay; + + delay = tabledist(q->latency, q->jitter, + &q->delay_cor, q->delay_dist); + PSCHED_GET_TIME(now); - PSCHED_TADD2(now, tabledist(q->latency, q->jitter, - &q->delay_cor, q->delay_dist), - cb->time_to_send); + PSCHED_TADD2(now, delay, cb->time_to_send); ++q->counter; ret = q->qdisc->enqueue(skb, q->qdisc); } else { @@ -248,24 +253,31 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch) const struct netem_skb_cb *cb = (const struct netem_skb_cb *)skb->cb; psched_time_t now; - long delay; /* if more time remaining? */ PSCHED_GET_TIME(now); - delay = PSCHED_US2JIFFIE(PSCHED_TDIFF(cb->time_to_send, now)); - pr_debug("netem_run: skb=%p delay=%ld\n", skb, delay); - if (delay <= 0) { + + if (PSCHED_TLESS(cb->time_to_send, now)) { pr_debug("netem_dequeue: return skb=%p\n", skb); sch->q.qlen--; sch->flags &= ~TCQ_F_THROTTLED; return skb; - } + } else { + psched_tdiff_t delay = PSCHED_TDIFF(cb->time_to_send, now); + + if (q->qdisc->ops->requeue(skb, q->qdisc) != NET_XMIT_SUCCESS) { + sch->qstats.drops++; - mod_timer(&q->timer, jiffies + delay); - sch->flags |= TCQ_F_THROTTLED; + /* After this qlen is confused */ + printk(KERN_ERR "netem: queue discpline %s could not requeue\n", + q->qdisc->ops->id); - if (q->qdisc->ops->requeue(skb, q->qdisc) != 0) - sch->qstats.drops++; + sch->q.qlen--; + } + + mod_timer(&q->timer, jiffies + PSCHED_US2JIFFIE(delay)); + sch->flags |= TCQ_F_THROTTLED; + } } return NULL; @@ -290,11 +302,16 @@ static void netem_reset(struct Qdisc *sch) del_timer_sync(&q->timer); } +/* Pass size change message down to embedded FIFO */ static int set_fifo_limit(struct Qdisc *q, int limit) { struct rtattr *rta; int ret = -ENOMEM; + /* Hack to avoid sending change message to non-FIFO */ + if (strncmp(q->ops->id + 1, "fifo", 4) != 0) + return 0; + rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL); if (rta) { rta->rta_type = RTM_NEWQDISC; @@ -426,6 +443,84 @@ static int netem_change(struct Qdisc *sch, struct rtattr *opt) return 0; } +/* + * Special case version of FIFO queue for use by netem. + * It queues in order based on timestamps in skb's + */ +struct fifo_sched_data { + u32 limit; +}; + +static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) +{ + struct fifo_sched_data *q = qdisc_priv(sch); + struct sk_buff_head *list = &sch->q; + const struct netem_skb_cb *ncb + = (const struct netem_skb_cb *)nskb->cb; + struct sk_buff *skb; + + if (likely(skb_queue_len(list) < q->limit)) { + skb_queue_reverse_walk(list, skb) { + const struct netem_skb_cb *cb + = (const struct netem_skb_cb *)skb->cb; + + if (PSCHED_TLESS(cb->time_to_send, ncb->time_to_send)) + break; + } + + __skb_queue_after(list, skb, nskb); + + sch->qstats.backlog += nskb->len; + sch->bstats.bytes += nskb->len; + sch->bstats.packets++; + + return NET_XMIT_SUCCESS; + } + + return qdisc_drop(nskb, sch); +} + +static int tfifo_init(struct Qdisc *sch, struct rtattr *opt) +{ + struct fifo_sched_data *q = qdisc_priv(sch); + + if (opt) { + struct tc_fifo_qopt *ctl = RTA_DATA(opt); + if (RTA_PAYLOAD(opt) < sizeof(*ctl)) + return -EINVAL; + + q->limit = ctl->limit; + } else + q->limit = max_t(u32, sch->dev->tx_queue_len, 1); + + return 0; +} + +static int tfifo_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct fifo_sched_data *q = qdisc_priv(sch); + struct tc_fifo_qopt opt = { .limit = q->limit }; + + RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + return skb->len; + +rtattr_failure: + return -1; +} + +static struct Qdisc_ops tfifo_qdisc_ops = { + .id = "tfifo", + .priv_size = sizeof(struct fifo_sched_data), + .enqueue = tfifo_enqueue, + .dequeue = qdisc_dequeue_head, + .requeue = qdisc_requeue, + .drop = qdisc_queue_drop, + .init = tfifo_init, + .reset = qdisc_reset_queue, + .change = tfifo_init, + .dump = tfifo_dump, +}; + static int netem_init(struct Qdisc *sch, struct rtattr *opt) { struct netem_sched_data *q = qdisc_priv(sch); @@ -438,7 +533,7 @@ static int netem_init(struct Qdisc *sch, struct rtattr *opt) q->timer.function = netem_watchdog; q->timer.data = (unsigned long) sch; - q->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); + q->qdisc = qdisc_create_dflt(sch->dev, &tfifo_qdisc_ops); if (!q->qdisc) { pr_debug("netem: qdisc create failed\n"); return -ENOMEM; @@ -601,6 +696,7 @@ static struct Qdisc_ops netem_qdisc_ops = { static int __init netem_module_init(void) { + pr_info("netem: version " VERSION "\n"); return register_qdisc(&netem_qdisc_ops); } static void __exit netem_module_exit(void) diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 7845d045eec..dccfa44c2d7 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -9,76 +9,23 @@ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * Changes: - * J Hadi Salim <hadi@nortel.com> 980914: computation fixes + * J Hadi Salim 980914: computation fixes * Alexey Makarenko <makar@phoenix.kharkov.ua> 990814: qave on idle link was calculated incorrectly. - * J Hadi Salim <hadi@nortelnetworks.com> 980816: ECN support + * J Hadi Salim 980816: ECN support */ #include <linux/config.h> #include <linux/module.h> -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> -#include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/if_ether.h> -#include <linux/inet.h> #include <linux/netdevice.h> -#include <linux/etherdevice.h> -#include <linux/notifier.h> -#include <net/ip.h> -#include <net/route.h> #include <linux/skbuff.h> -#include <net/sock.h> #include <net/pkt_sched.h> #include <net/inet_ecn.h> -#include <net/dsfield.h> +#include <net/red.h> -/* Random Early Detection (RED) algorithm. - ======================================= - - Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways - for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking. - - This file codes a "divisionless" version of RED algorithm - as written down in Fig.17 of the paper. - -Short description. ------------------- - - When a new packet arrives we calculate the average queue length: - - avg = (1-W)*avg + W*current_queue_len, - - W is the filter time constant (chosen as 2^(-Wlog)), it controls - the inertia of the algorithm. To allow larger bursts, W should be - decreased. - - if (avg > th_max) -> packet marked (dropped). - if (avg < th_min) -> packet passes. - if (th_min < avg < th_max) we calculate probability: - - Pb = max_P * (avg - th_min)/(th_max-th_min) - - and mark (drop) packet with this probability. - Pb changes from 0 (at avg==th_min) to max_P (avg==th_max). - max_P should be small (not 1), usually 0.01..0.02 is good value. - - max_P is chosen as a number, so that max_P/(th_max-th_min) - is a negative power of two in order arithmetics to contain - only shifts. - - - Parameters, settable by user: +/* Parameters, settable by user: ----------------------------- limit - bytes (must be > qth_max + burst) @@ -89,243 +36,93 @@ Short description. arbitrarily high (well, less than ram size) Really, this limit will never be reached if RED works correctly. - - qth_min - bytes (should be < qth_max/2) - qth_max - bytes (should be at least 2*qth_min and less limit) - Wlog - bits (<32) log(1/W). - Plog - bits (<32) - - Plog is related to max_P by formula: - - max_P = (qth_max-qth_min)/2^Plog; - - F.e. if qth_max=128K and qth_min=32K, then Plog=22 - corresponds to max_P=0.02 - - Scell_log - Stab - - Lookup table for log((1-W)^(t/t_ave). - - -NOTES: - -Upper bound on W. ------------------ - - If you want to allow bursts of L packets of size S, - you should choose W: - - L + 1 - th_min/S < (1-(1-W)^L)/W - - th_min/S = 32 th_min/S = 4 - - log(W) L - -1 33 - -2 35 - -3 39 - -4 46 - -5 57 - -6 75 - -7 101 - -8 135 - -9 190 - etc. */ struct red_sched_data { -/* Parameters */ - u32 limit; /* HARD maximal queue length */ - u32 qth_min; /* Min average length threshold: A scaled */ - u32 qth_max; /* Max average length threshold: A scaled */ - u32 Rmask; - u32 Scell_max; - unsigned char flags; - char Wlog; /* log(W) */ - char Plog; /* random number bits */ - char Scell_log; - u8 Stab[256]; - -/* Variables */ - unsigned long qave; /* Average queue length: A scaled */ - int qcount; /* Packets since last random number generation */ - u32 qR; /* Cached random number */ - - psched_time_t qidlestart; /* Start of idle period */ - struct tc_red_xstats st; + u32 limit; /* HARD maximal queue length */ + unsigned char flags; + struct red_parms parms; + struct red_stats stats; }; -static int red_ecn_mark(struct sk_buff *skb) +static inline int red_use_ecn(struct red_sched_data *q) { - if (skb->nh.raw + 20 > skb->tail) - return 0; - - switch (skb->protocol) { - case __constant_htons(ETH_P_IP): - if (INET_ECN_is_not_ect(skb->nh.iph->tos)) - return 0; - IP_ECN_set_ce(skb->nh.iph); - return 1; - case __constant_htons(ETH_P_IPV6): - if (INET_ECN_is_not_ect(ipv6_get_dsfield(skb->nh.ipv6h))) - return 0; - IP6_ECN_set_ce(skb->nh.ipv6h); - return 1; - default: - return 0; - } + return q->flags & TC_RED_ECN; } -static int -red_enqueue(struct sk_buff *skb, struct Qdisc* sch) +static inline int red_use_harddrop(struct red_sched_data *q) +{ + return q->flags & TC_RED_HARDDROP; +} + +static int red_enqueue(struct sk_buff *skb, struct Qdisc* sch) { struct red_sched_data *q = qdisc_priv(sch); - psched_time_t now; + q->parms.qavg = red_calc_qavg(&q->parms, sch->qstats.backlog); - if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) { - long us_idle; - int shift; + if (red_is_idling(&q->parms)) + red_end_of_idle_period(&q->parms); - PSCHED_GET_TIME(now); - us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max); - PSCHED_SET_PASTPERFECT(q->qidlestart); + switch (red_action(&q->parms, q->parms.qavg)) { + case RED_DONT_MARK: + break; -/* - The problem: ideally, average length queue recalcultion should - be done over constant clock intervals. This is too expensive, so that - the calculation is driven by outgoing packets. - When the queue is idle we have to model this clock by hand. - - SF+VJ proposed to "generate" m = idletime/(average_pkt_size/bandwidth) - dummy packets as a burst after idle time, i.e. - - q->qave *= (1-W)^m - - This is an apparently overcomplicated solution (f.e. we have to precompute - a table to make this calculation in reasonable time) - I believe that a simpler model may be used here, - but it is field for experiments. -*/ - shift = q->Stab[us_idle>>q->Scell_log]; - - if (shift) { - q->qave >>= shift; - } else { - /* Approximate initial part of exponent - with linear function: - (1-W)^m ~= 1-mW + ... - - Seems, it is the best solution to - problem of too coarce exponent tabulation. - */ - - us_idle = (q->qave * us_idle)>>q->Scell_log; - if (us_idle < q->qave/2) - q->qave -= us_idle; - else - q->qave >>= 1; - } - } else { - q->qave += sch->qstats.backlog - (q->qave >> q->Wlog); - /* NOTE: - q->qave is fixed point number with point at Wlog. - The formulae above is equvalent to floating point - version: - - qave = qave*(1-W) + sch->qstats.backlog*W; - --ANK (980924) - */ - } + case RED_PROB_MARK: + sch->qstats.overlimits++; + if (!red_use_ecn(q) || !INET_ECN_set_ce(skb)) { + q->stats.prob_drop++; + goto congestion_drop; + } - if (q->qave < q->qth_min) { - q->qcount = -1; -enqueue: - if (sch->qstats.backlog + skb->len <= q->limit) { - __skb_queue_tail(&sch->q, skb); - sch->qstats.backlog += skb->len; - sch->bstats.bytes += skb->len; - sch->bstats.packets++; - return NET_XMIT_SUCCESS; - } else { - q->st.pdrop++; - } - kfree_skb(skb); - sch->qstats.drops++; - return NET_XMIT_DROP; - } - if (q->qave >= q->qth_max) { - q->qcount = -1; - sch->qstats.overlimits++; -mark: - if (!(q->flags&TC_RED_ECN) || !red_ecn_mark(skb)) { - q->st.early++; - goto drop; - } - q->st.marked++; - goto enqueue; - } + q->stats.prob_mark++; + break; + + case RED_HARD_MARK: + sch->qstats.overlimits++; + if (red_use_harddrop(q) || !red_use_ecn(q) || + !INET_ECN_set_ce(skb)) { + q->stats.forced_drop++; + goto congestion_drop; + } - if (++q->qcount) { - /* The formula used below causes questions. - - OK. qR is random number in the interval 0..Rmask - i.e. 0..(2^Plog). If we used floating point - arithmetics, it would be: (2^Plog)*rnd_num, - where rnd_num is less 1. - - Taking into account, that qave have fixed - point at Wlog, and Plog is related to max_P by - max_P = (qth_max-qth_min)/2^Plog; two lines - below have the following floating point equivalent: - - max_P*(qave - qth_min)/(qth_max-qth_min) < rnd/qcount - - Any questions? --ANK (980924) - */ - if (((q->qave - q->qth_min)>>q->Wlog)*q->qcount < q->qR) - goto enqueue; - q->qcount = 0; - q->qR = net_random()&q->Rmask; - sch->qstats.overlimits++; - goto mark; + q->stats.forced_mark++; + break; } - q->qR = net_random()&q->Rmask; - goto enqueue; -drop: - kfree_skb(skb); - sch->qstats.drops++; + if (sch->qstats.backlog + skb->len <= q->limit) + return qdisc_enqueue_tail(skb, sch); + + q->stats.pdrop++; + return qdisc_drop(skb, sch); + +congestion_drop: + qdisc_drop(skb, sch); return NET_XMIT_CN; } -static int -red_requeue(struct sk_buff *skb, struct Qdisc* sch) +static int red_requeue(struct sk_buff *skb, struct Qdisc* sch) { struct red_sched_data *q = qdisc_priv(sch); - PSCHED_SET_PASTPERFECT(q->qidlestart); + if (red_is_idling(&q->parms)) + red_end_of_idle_period(&q->parms); - __skb_queue_head(&sch->q, skb); - sch->qstats.backlog += skb->len; - sch->qstats.requeues++; - return 0; + return qdisc_requeue(skb, sch); } -static struct sk_buff * -red_dequeue(struct Qdisc* sch) +static struct sk_buff * red_dequeue(struct Qdisc* sch) { struct sk_buff *skb; struct red_sched_data *q = qdisc_priv(sch); - skb = __skb_dequeue(&sch->q); - if (skb) { - sch->qstats.backlog -= skb->len; - return skb; - } - PSCHED_GET_TIME(q->qidlestart); - return NULL; + skb = qdisc_dequeue_head(sch); + + if (skb == NULL && !red_is_idling(&q->parms)) + red_start_of_idle_period(&q->parms); + + return skb; } static unsigned int red_drop(struct Qdisc* sch) @@ -333,16 +130,17 @@ static unsigned int red_drop(struct Qdisc* sch) struct sk_buff *skb; struct red_sched_data *q = qdisc_priv(sch); - skb = __skb_dequeue_tail(&sch->q); + skb = qdisc_dequeue_tail(sch); if (skb) { unsigned int len = skb->len; - sch->qstats.backlog -= len; - sch->qstats.drops++; - q->st.other++; - kfree_skb(skb); + q->stats.other++; + qdisc_drop(skb, sch); return len; } - PSCHED_GET_TIME(q->qidlestart); + + if (!red_is_idling(&q->parms)) + red_start_of_idle_period(&q->parms); + return 0; } @@ -350,43 +148,38 @@ static void red_reset(struct Qdisc* sch) { struct red_sched_data *q = qdisc_priv(sch); - __skb_queue_purge(&sch->q); - sch->qstats.backlog = 0; - PSCHED_SET_PASTPERFECT(q->qidlestart); - q->qave = 0; - q->qcount = -1; + qdisc_reset_queue(sch); + red_restart(&q->parms); } static int red_change(struct Qdisc *sch, struct rtattr *opt) { struct red_sched_data *q = qdisc_priv(sch); - struct rtattr *tb[TCA_RED_STAB]; + struct rtattr *tb[TCA_RED_MAX]; struct tc_red_qopt *ctl; - if (opt == NULL || - rtattr_parse_nested(tb, TCA_RED_STAB, opt) || - tb[TCA_RED_PARMS-1] == 0 || tb[TCA_RED_STAB-1] == 0 || + if (opt == NULL || rtattr_parse_nested(tb, TCA_RED_MAX, opt)) + return -EINVAL; + + if (tb[TCA_RED_PARMS-1] == NULL || RTA_PAYLOAD(tb[TCA_RED_PARMS-1]) < sizeof(*ctl) || - RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < 256) + tb[TCA_RED_STAB-1] == NULL || + RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < RED_STAB_SIZE) return -EINVAL; ctl = RTA_DATA(tb[TCA_RED_PARMS-1]); sch_tree_lock(sch); q->flags = ctl->flags; - q->Wlog = ctl->Wlog; - q->Plog = ctl->Plog; - q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL; - q->Scell_log = ctl->Scell_log; - q->Scell_max = (255<<q->Scell_log); - q->qth_min = ctl->qth_min<<ctl->Wlog; - q->qth_max = ctl->qth_max<<ctl->Wlog; q->limit = ctl->limit; - memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256); - q->qcount = -1; + red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog, + ctl->Plog, ctl->Scell_log, + RTA_DATA(tb[TCA_RED_STAB-1])); + if (skb_queue_empty(&sch->q)) - PSCHED_SET_PASTPERFECT(q->qidlestart); + red_end_of_idle_period(&q->parms); + sch_tree_unlock(sch); return 0; } @@ -399,39 +192,39 @@ static int red_init(struct Qdisc* sch, struct rtattr *opt) static int red_dump(struct Qdisc *sch, struct sk_buff *skb) { struct red_sched_data *q = qdisc_priv(sch); - unsigned char *b = skb->tail; - struct rtattr *rta; - struct tc_red_qopt opt; - - rta = (struct rtattr*)b; - RTA_PUT(skb, TCA_OPTIONS, 0, NULL); - opt.limit = q->limit; - opt.qth_min = q->qth_min>>q->Wlog; - opt.qth_max = q->qth_max>>q->Wlog; - opt.Wlog = q->Wlog; - opt.Plog = q->Plog; - opt.Scell_log = q->Scell_log; - opt.flags = q->flags; + struct rtattr *opts = NULL; + struct tc_red_qopt opt = { + .limit = q->limit, + .flags = q->flags, + .qth_min = q->parms.qth_min >> q->parms.Wlog, + .qth_max = q->parms.qth_max >> q->parms.Wlog, + .Wlog = q->parms.Wlog, + .Plog = q->parms.Plog, + .Scell_log = q->parms.Scell_log, + }; + + opts = RTA_NEST(skb, TCA_OPTIONS); RTA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt); - rta->rta_len = skb->tail - b; - - return skb->len; + return RTA_NEST_END(skb, opts); rtattr_failure: - skb_trim(skb, b - skb->data); - return -1; + return RTA_NEST_CANCEL(skb, opts); } static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct red_sched_data *q = qdisc_priv(sch); - - return gnet_stats_copy_app(d, &q->st, sizeof(q->st)); + struct tc_red_xstats st = { + .early = q->stats.prob_drop + q->stats.forced_drop, + .pdrop = q->stats.pdrop, + .other = q->stats.other, + .marked = q->stats.prob_mark + q->stats.forced_mark, + }; + + return gnet_stats_copy_app(d, &st, sizeof(st)); } static struct Qdisc_ops red_qdisc_ops = { - .next = NULL, - .cl_ops = NULL, .id = "red", .priv_size = sizeof(struct red_sched_data), .enqueue = red_enqueue, @@ -450,10 +243,13 @@ static int __init red_module_init(void) { return register_qdisc(&red_qdisc_ops); } -static void __exit red_module_exit(void) + +static void __exit red_module_exit(void) { unregister_qdisc(&red_qdisc_ops); } + module_init(red_module_init) module_exit(red_module_exit) + MODULE_LICENSE("GPL"); diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 12b0f582a66..dec68a60477 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -128,9 +128,29 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a */ asoc->max_burst = sctp_max_burst; - /* Copy things from the endpoint. */ + /* initialize association timers */ + asoc->timeouts[SCTP_EVENT_TIMEOUT_NONE] = 0; + asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = asoc->rto_initial; + asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = asoc->rto_initial; + asoc->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = asoc->rto_initial; + asoc->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX] = 0; + asoc->timeouts[SCTP_EVENT_TIMEOUT_T4_RTO] = 0; + + /* sctpimpguide Section 2.12.2 + * If the 'T5-shutdown-guard' timer is used, it SHOULD be set to the + * recommended value of 5 times 'RTO.Max'. + */ + asoc->timeouts[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD] + = 5 * asoc->rto_max; + + asoc->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0; + asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = + SCTP_DEFAULT_TIMEOUT_SACK; + asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = + sp->autoclose * HZ; + + /* Initilizes the timers */ for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i) { - asoc->timeouts[i] = ep->timeouts[i]; init_timer(&asoc->timers[i]); asoc->timers[i].function = sctp_timer_events[i]; asoc->timers[i].data = (unsigned long) asoc; @@ -157,10 +177,10 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a * RFC 6 - A SCTP receiver MUST be able to receive a minimum of * 1500 bytes in one SCTP packet. */ - if (sk->sk_rcvbuf < SCTP_DEFAULT_MINWINDOW) + if ((sk->sk_rcvbuf/2) < SCTP_DEFAULT_MINWINDOW) asoc->rwnd = SCTP_DEFAULT_MINWINDOW; else - asoc->rwnd = sk->sk_rcvbuf; + asoc->rwnd = sk->sk_rcvbuf/2; asoc->a_rwnd = asoc->rwnd; @@ -172,6 +192,9 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a /* Set the sndbuf size for transmit. */ asoc->sndbuf_used = 0; + /* Initialize the receive memory counter */ + atomic_set(&asoc->rmem_alloc, 0); + init_waitqueue_head(&asoc->wait); asoc->c.my_vtag = sctp_generate_tag(ep); @@ -344,9 +367,7 @@ void sctp_association_free(struct sctp_association *asoc) } /* Free peer's cached cookie. */ - if (asoc->peer.cookie) { - kfree(asoc->peer.cookie); - } + kfree(asoc->peer.cookie); /* Release the transport structures. */ list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) { @@ -382,6 +403,8 @@ static void sctp_association_destroy(struct sctp_association *asoc) spin_unlock_bh(&sctp_assocs_id_lock); } + BUG_TRAP(!atomic_read(&asoc->rmem_alloc)); + if (asoc->base.malloced) { kfree(asoc); SCTP_DBG_OBJCNT_DEC(assoc); diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 96984f7a2d6..67bd53070ee 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -70,7 +70,6 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, struct sock *sk, gfp_t gfp) { - struct sctp_sock *sp = sctp_sk(sk); memset(ep, 0, sizeof(struct sctp_endpoint)); /* Initialize the base structure. */ @@ -100,33 +99,14 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, /* Create the lists of associations. */ INIT_LIST_HEAD(&ep->asocs); - /* Set up the base timeout information. */ - ep->timeouts[SCTP_EVENT_TIMEOUT_NONE] = 0; - ep->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = - msecs_to_jiffies(sp->rtoinfo.srto_initial); - ep->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = - msecs_to_jiffies(sp->rtoinfo.srto_initial); - ep->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = - msecs_to_jiffies(sp->rtoinfo.srto_initial); - ep->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX] = 0; - ep->timeouts[SCTP_EVENT_TIMEOUT_T4_RTO] = 0; - - /* sctpimpguide-05 Section 2.12.2 - * If the 'T5-shutdown-guard' timer is used, it SHOULD be set to the - * recommended value of 5 times 'RTO.Max'. - */ - ep->timeouts[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD] - = 5 * msecs_to_jiffies(sp->rtoinfo.srto_max); - - ep->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0; - ep->timeouts[SCTP_EVENT_TIMEOUT_SACK] = sctp_sack_timeout; - ep->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sp->autoclose * HZ; - /* Use SCTP specific send buffer space queues. */ ep->sndbuf_policy = sctp_sndbuf_policy; sk->sk_write_space = sctp_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); + /* Get the receive buffer policy for this endpoint */ + ep->rcvbuf_policy = sctp_rcvbuf_policy; + /* Initialize the secret key used with cookie. */ get_random_bytes(&ep->secret_key[0], SCTP_SECRET_SIZE); ep->last_key = ep->current_key = 0; diff --git a/net/sctp/input.c b/net/sctp/input.c index 28f32243397..b24ff2c1aef 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -100,21 +100,6 @@ static inline int sctp_rcv_checksum(struct sk_buff *skb) return 0; } -/* The free routine for skbuffs that sctp receives */ -static void sctp_rfree(struct sk_buff *skb) -{ - atomic_sub(sizeof(struct sctp_chunk),&skb->sk->sk_rmem_alloc); - sock_rfree(skb); -} - -/* The ownership wrapper routine to do receive buffer accounting */ -static void sctp_rcv_set_owner_r(struct sk_buff *skb, struct sock *sk) -{ - skb_set_owner_r(skb,sk); - skb->destructor = sctp_rfree; - atomic_add(sizeof(struct sctp_chunk),&sk->sk_rmem_alloc); -} - struct sctp_input_cb { union { struct inet_skb_parm h4; @@ -217,9 +202,6 @@ int sctp_rcv(struct sk_buff *skb) rcvr = &ep->base; } - if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) - goto discard_release; - /* * RFC 2960, 8.4 - Handle "Out of the blue" Packets. * An SCTP packet is called an "out of the blue" (OOTB) @@ -256,8 +238,6 @@ int sctp_rcv(struct sk_buff *skb) } SCTP_INPUT_CB(skb)->chunk = chunk; - sctp_rcv_set_owner_r(skb,sk); - /* Remember what endpoint is to handle this packet. */ chunk->rcvr = rcvr; diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 26de4d3e1bd..f775d78aa59 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -530,6 +530,9 @@ static void sctp_v4_get_saddr(struct sctp_association *asoc, { struct rtable *rt = (struct rtable *)dst; + if (!asoc) + return; + if (rt) { saddr->v4.sin_family = AF_INET; saddr->v4.sin_port = asoc->base.bind_addr.port; @@ -1047,6 +1050,9 @@ SCTP_STATIC __init int sctp_init(void) /* Sendbuffer growth - do per-socket accounting */ sctp_sndbuf_policy = 0; + /* Rcvbuffer growth - do per-socket accounting */ + sctp_rcvbuf_policy = 0; + /* HB.interval - 30 seconds */ sctp_hb_interval = SCTP_DEFAULT_TIMEOUT_HEARTBEAT; diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 660c61bdf16..f9573eba5c7 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -254,8 +254,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, aiparam.adaption_ind = htonl(sp->adaption_ind); sctp_addto_chunk(retval, sizeof(aiparam), &aiparam); nodata: - if (addrs.v) - kfree(addrs.v); + kfree(addrs.v); return retval; } @@ -347,8 +346,7 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, nomem_chunk: kfree(cookie); nomem_cookie: - if (addrs.v) - kfree(addrs.v); + kfree(addrs.v); return retval; } diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index f84173ea8ec..823947170a3 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -385,7 +385,7 @@ sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = { NULL, sctp_generate_t4_rto_event, sctp_generate_t5_shutdown_guard_event, - sctp_generate_heartbeat_event, + NULL, sctp_generate_sack_event, sctp_generate_autoclose_event, }; @@ -689,9 +689,9 @@ static void sctp_cmd_new_state(sctp_cmd_seq_t *cmds, * increased due to timer expirations. */ asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = - asoc->ep->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT]; + asoc->rto_initial; asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = - asoc->ep->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE]; + asoc->rto_initial; } if (sctp_state(asoc, ESTABLISHED) || diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 505c7de10c5..475bfb4972d 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -5160,6 +5160,8 @@ static int sctp_eat_data(const struct sctp_association *asoc, sctp_verb_t deliver; int tmp; __u32 tsn; + int account_value; + struct sock *sk = asoc->base.sk; data_hdr = chunk->subh.data_hdr = (sctp_datahdr_t *)chunk->skb->data; skb_pull(chunk->skb, sizeof(sctp_datahdr_t)); @@ -5169,6 +5171,26 @@ static int sctp_eat_data(const struct sctp_association *asoc, /* ASSERT: Now skb->data is really the user data. */ + /* + * if we are established, and we have used up our receive + * buffer memory, drop the frame + */ + if (asoc->state == SCTP_STATE_ESTABLISHED) { + /* + * If the receive buffer policy is 1, then each + * association can allocate up to sk_rcvbuf bytes + * otherwise, all the associations in aggregate + * may allocate up to sk_rcvbuf bytes + */ + if (asoc->ep->rcvbuf_policy) + account_value = atomic_read(&asoc->rmem_alloc); + else + account_value = atomic_read(&sk->sk_rmem_alloc); + + if (account_value > sk->sk_rcvbuf) + return SCTP_IERROR_IGNORE_TSN; + } + /* Process ECN based congestion. * * Since the chunk structure is reused for all chunks within diff --git a/net/sctp/socket.c b/net/sctp/socket.c index b529af5e6f2..abab81f3818 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1932,7 +1932,6 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval, if (copy_from_user(&sp->autoclose, optval, optlen)) return -EFAULT; - sp->ep->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sp->autoclose * HZ; return 0; } @@ -5115,8 +5114,10 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, sctp_skb_for_each(skb, &oldsk->sk_receive_queue, tmp) { event = sctp_skb2event(skb); if (event->asoc == assoc) { + sock_rfree(skb); __skb_unlink(skb, &oldsk->sk_receive_queue); __skb_queue_tail(&newsk->sk_receive_queue, skb); + skb_set_owner_r(skb, newsk); } } @@ -5144,8 +5145,10 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, sctp_skb_for_each(skb, &oldsp->pd_lobby, tmp) { event = sctp_skb2event(skb); if (event->asoc == assoc) { + sock_rfree(skb); __skb_unlink(skb, &oldsp->pd_lobby); __skb_queue_tail(queue, skb); + skb_set_owner_r(skb, newsk); } } diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 75b28dd634f..fcd7096c953 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -121,6 +121,14 @@ static ctl_table sctp_table[] = { .proc_handler = &proc_dointvec }, { + .ctl_name = NET_SCTP_RCVBUF_POLICY, + .procname = "rcvbuf_policy", + .data = &sctp_rcvbuf_policy, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { .ctl_name = NET_SCTP_PATH_MAX_RETRANS, .procname = "path_max_retrans", .data = &sctp_max_retrans_path, diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index e049f41faa4..ba97f974f57 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -52,19 +52,6 @@ static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event, struct sctp_association *asoc); static void sctp_ulpevent_release_data(struct sctp_ulpevent *event); -/* Stub skb destructor. */ -static void sctp_stub_rfree(struct sk_buff *skb) -{ -/* WARNING: This function is just a warning not to use the - * skb destructor. If the skb is shared, we may get the destructor - * callback on some processor that does not own the sock_lock. This - * was occuring with PACKET socket applications that were monitoring - * our skbs. We can't take the sock_lock, because we can't risk - * recursing if we do really own the sock lock. Instead, do all - * of our rwnd manipulation while we own the sock_lock outright. - */ -} - /* Initialize an ULP event from an given skb. */ SCTP_STATIC void sctp_ulpevent_init(struct sctp_ulpevent *event, int msg_flags) { @@ -111,15 +98,19 @@ static inline void sctp_ulpevent_set_owner(struct sctp_ulpevent *event, */ sctp_association_hold((struct sctp_association *)asoc); skb = sctp_event2skb(event); - skb->sk = asoc->base.sk; event->asoc = (struct sctp_association *)asoc; - skb->destructor = sctp_stub_rfree; + atomic_add(skb->truesize, &event->asoc->rmem_alloc); + skb_set_owner_r(skb, asoc->base.sk); } /* A simple destructor to give up the reference to the association. */ static inline void sctp_ulpevent_release_owner(struct sctp_ulpevent *event) { - sctp_association_put(event->asoc); + struct sctp_association *asoc = event->asoc; + struct sk_buff *skb = sctp_event2skb(event); + + atomic_sub(skb->truesize, &asoc->rmem_alloc); + sctp_association_put(asoc); } /* Create and initialize an SCTP_ASSOC_CHANGE event. @@ -922,7 +913,6 @@ done: /* Free a ulpevent that has an owner. It includes releasing the reference * to the owner, updating the rwnd in case of a DATA event and freeing the * skb. - * See comments in sctp_stub_rfree(). */ void sctp_ulpevent_free(struct sctp_ulpevent *event) { diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c index 13f8ae97945..d0dfdfd5e79 100644 --- a/net/sunrpc/auth_gss/gss_krb5_seal.c +++ b/net/sunrpc/auth_gss/gss_krb5_seal.c @@ -143,6 +143,6 @@ gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text, return ((ctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE); out_err: - if (md5cksum.data) kfree(md5cksum.data); + kfree(md5cksum.data); return GSS_S_FAILURE; } diff --git a/net/sunrpc/auth_gss/gss_krb5_unseal.c b/net/sunrpc/auth_gss/gss_krb5_unseal.c index 2030475d98e..db055fd7d77 100644 --- a/net/sunrpc/auth_gss/gss_krb5_unseal.c +++ b/net/sunrpc/auth_gss/gss_krb5_unseal.c @@ -176,6 +176,6 @@ gss_verify_mic_kerberos(struct gss_ctx *gss_ctx, ret = GSS_S_COMPLETE; out: - if (md5cksum.data) kfree(md5cksum.data); + kfree(md5cksum.data); return ret; } diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index b048bf672da..f8bac6ccd52 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -60,8 +60,7 @@ gss_mech_free(struct gss_api_mech *gm) for (i = 0; i < gm->gm_pf_num; i++) { pf = &gm->gm_pfs[i]; - if (pf->auth_domain_name) - kfree(pf->auth_domain_name); + kfree(pf->auth_domain_name); pf->auth_domain_name = NULL; } } diff --git a/net/sunrpc/auth_gss/gss_spkm3_seal.c b/net/sunrpc/auth_gss/gss_spkm3_seal.c index 148201e929d..d1e12b25d6e 100644 --- a/net/sunrpc/auth_gss/gss_spkm3_seal.c +++ b/net/sunrpc/auth_gss/gss_spkm3_seal.c @@ -122,8 +122,7 @@ spkm3_make_token(struct spkm3_ctx *ctx, return GSS_S_COMPLETE; out_err: - if (md5cksum.data) - kfree(md5cksum.data); + kfree(md5cksum.data); token->data = NULL; token->len = 0; return GSS_S_FAILURE; diff --git a/net/sunrpc/auth_gss/gss_spkm3_token.c b/net/sunrpc/auth_gss/gss_spkm3_token.c index 46c08a0710f..1f824578d77 100644 --- a/net/sunrpc/auth_gss/gss_spkm3_token.c +++ b/net/sunrpc/auth_gss/gss_spkm3_token.c @@ -259,8 +259,7 @@ spkm3_verify_mic_token(unsigned char **tokp, int *mic_hdrlen, unsigned char **ck ret = GSS_S_COMPLETE; out: - if (spkm3_ctx_id.data) - kfree(spkm3_ctx_id.data); + kfree(spkm3_ctx_id.data); return ret; } diff --git a/net/sunrpc/auth_gss/gss_spkm3_unseal.c b/net/sunrpc/auth_gss/gss_spkm3_unseal.c index c3c0d958610..241d5b30dfc 100644 --- a/net/sunrpc/auth_gss/gss_spkm3_unseal.c +++ b/net/sunrpc/auth_gss/gss_spkm3_unseal.c @@ -120,9 +120,7 @@ spkm3_read_token(struct spkm3_ctx *ctx, /* XXX: need to add expiration and sequencing */ ret = GSS_S_COMPLETE; out: - if (md5cksum.data) - kfree(md5cksum.data); - if (wire_cksum.data) - kfree(wire_cksum.data); + kfree(md5cksum.data); + kfree(wire_cksum.data); return ret; } diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 702ede309b0..61c3abeacca 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -55,6 +55,7 @@ static void call_bind(struct rpc_task *task); static void call_bind_status(struct rpc_task *task); static void call_transmit(struct rpc_task *task); static void call_status(struct rpc_task *task); +static void call_transmit_status(struct rpc_task *task); static void call_refresh(struct rpc_task *task); static void call_refreshresult(struct rpc_task *task); static void call_timeout(struct rpc_task *task); @@ -672,6 +673,18 @@ call_allocate(struct rpc_task *task) rpc_exit(task, -ERESTARTSYS); } +static inline int +rpc_task_need_encode(struct rpc_task *task) +{ + return task->tk_rqstp->rq_snd_buf.len == 0; +} + +static inline void +rpc_task_force_reencode(struct rpc_task *task) +{ + task->tk_rqstp->rq_snd_buf.len = 0; +} + /* * 3. Encode arguments of an RPC call */ @@ -867,12 +880,14 @@ call_transmit(struct rpc_task *task) if (task->tk_status != 0) return; /* Encode here so that rpcsec_gss can use correct sequence number. */ - if (task->tk_rqstp->rq_bytes_sent == 0) { + if (rpc_task_need_encode(task)) { + task->tk_rqstp->rq_bytes_sent = 0; call_encode(task); /* Did the encode result in an error condition? */ if (task->tk_status != 0) goto out_nosend; } + task->tk_action = call_transmit_status; xprt_transmit(task); if (task->tk_status < 0) return; @@ -884,6 +899,7 @@ call_transmit(struct rpc_task *task) out_nosend: /* release socket write lock before attempting to handle error */ xprt_abort_transmit(task); + rpc_task_force_reencode(task); } /* @@ -915,7 +931,6 @@ call_status(struct rpc_task *task) break; case -ECONNREFUSED: case -ENOTCONN: - req->rq_bytes_sent = 0; if (clnt->cl_autobind) clnt->cl_port = 0; task->tk_action = call_bind; @@ -937,7 +952,18 @@ call_status(struct rpc_task *task) } /* - * 6a. Handle RPC timeout + * 6a. Handle transmission errors. + */ +static void +call_transmit_status(struct rpc_task *task) +{ + if (task->tk_status != -EAGAIN) + rpc_task_force_reencode(task); + call_status(task); +} + +/* + * 6b. Handle RPC timeout * We do not release the request slot, so we keep using the * same XID for all retransmits. */ diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 4f188d0a5d1..81e00a6c19d 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -603,7 +603,7 @@ rpc_lookup_negative(char *path, struct nameidata *nd) return ERR_PTR(error); dir = nd->dentry->d_inode; down(&dir->i_sem); - dentry = lookup_hash(&nd->last, nd->dentry); + dentry = lookup_hash(nd); if (IS_ERR(dentry)) goto out_err; if (dentry->d_inode) { @@ -665,7 +665,7 @@ rpc_rmdir(char *path) return error; dir = nd.dentry->d_inode; down(&dir->i_sem); - dentry = lookup_hash(&nd.last, nd.dentry); + dentry = lookup_hash(&nd); if (IS_ERR(dentry)) { error = PTR_ERR(dentry); goto out_release; @@ -726,7 +726,7 @@ rpc_unlink(char *path) return error; dir = nd.dentry->d_inode; down(&dir->i_sem); - dentry = lookup_hash(&nd.last, nd.dentry); + dentry = lookup_hash(&nd); if (IS_ERR(dentry)) { error = PTR_ERR(dentry); goto out_release; diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c index 8f97e90f36c..eb330d4f66d 100644 --- a/net/sunrpc/socklib.c +++ b/net/sunrpc/socklib.c @@ -6,6 +6,9 @@ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> */ +#include <linux/compiler.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> #include <linux/types.h> #include <linux/pagemap.h> #include <linux/udp.h> @@ -165,6 +168,8 @@ int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) return -1; if ((unsigned short)csum_fold(desc.csum)) return -1; + if (unlikely(skb->ip_summed == CHECKSUM_HW)) + netdev_rx_csum_fault(skb->dev); return 0; no_checksum: if (xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_bits) < 0) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index e9bd91265f7..e4296c8b861 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -196,12 +196,9 @@ svc_exit_thread(struct svc_rqst *rqstp) struct svc_serv *serv = rqstp->rq_server; svc_release_buffer(rqstp); - if (rqstp->rq_resp) - kfree(rqstp->rq_resp); - if (rqstp->rq_argp) - kfree(rqstp->rq_argp); - if (rqstp->rq_auth_data) - kfree(rqstp->rq_auth_data); + kfree(rqstp->rq_resp); + kfree(rqstp->rq_argp); + kfree(rqstp->rq_auth_data); kfree(rqstp); /* Release the server */ @@ -313,6 +310,11 @@ svc_process(struct svc_serv *serv, struct svc_rqst *rqstp) rqstp->rq_proc = proc = ntohl(svc_getu32(argv)); /* procedure number */ progp = serv->sv_program; + + for (progp = serv->sv_program; progp; progp = progp->pg_next) + if (prog == progp->pg_prog) + break; + /* * Decode auth data, and add verifier to reply buffer. * We do this before anything else in order to get a decent @@ -320,7 +322,7 @@ svc_process(struct svc_serv *serv, struct svc_rqst *rqstp) */ auth_res = svc_authenticate(rqstp, &auth_stat); /* Also give the program a chance to reject this call: */ - if (auth_res == SVC_OK) { + if (auth_res == SVC_OK && progp) { auth_stat = rpc_autherr_badcred; auth_res = progp->pg_authenticate(rqstp); } @@ -340,10 +342,7 @@ svc_process(struct svc_serv *serv, struct svc_rqst *rqstp) case SVC_COMPLETE: goto sendit; } - - for (progp = serv->sv_program; progp; progp = progp->pg_next) - if (prog == progp->pg_prog) - break; + if (progp == NULL) goto err_bad_prog; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index f16e7cdd615..c6a51911e71 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -623,12 +623,9 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) /* we can use it in-place */ rqstp->rq_arg.head[0].iov_base = skb->data + sizeof(struct udphdr); rqstp->rq_arg.head[0].iov_len = len; - if (skb->ip_summed != CHECKSUM_UNNECESSARY) { - if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { - skb_free_datagram(svsk->sk_sk, skb); - return 0; - } - skb->ip_summed = CHECKSUM_UNNECESSARY; + if (skb_checksum_complete(skb)) { + skb_free_datagram(svsk->sk_sk, skb); + return 0; } rqstp->rq_skbuff = skb; } @@ -1181,6 +1178,7 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout) arg->tail[0].iov_len = 0; try_to_freeze(); + cond_resched(); if (signalled()) return -EINTR; diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 32df43372ee..aaf08cdd19f 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -992,8 +992,7 @@ xdr_xcode_array2(struct xdr_buf *buf, unsigned int base, err = 0; out: - if (elem) - kfree(elem); + kfree(elem); if (ppages) kunmap(*ppages); return err; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 41feca3bef8..acc73ba8bad 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -676,7 +676,7 @@ static struct sock *unix_find_other(struct sockaddr_un *sunname, int len, err = path_lookup(sunname->sun_path, LOOKUP_FOLLOW, &nd); if (err) goto fail; - err = permission(nd.dentry->d_inode,MAY_WRITE, &nd); + err = vfs_permission(&nd, MAY_WRITE); if (err) goto put_fail; diff --git a/net/wanrouter/af_wanpipe.c b/net/wanrouter/af_wanpipe.c index 596cb96e5f4..59fec59b213 100644 --- a/net/wanrouter/af_wanpipe.c +++ b/net/wanrouter/af_wanpipe.c @@ -1099,7 +1099,7 @@ static void release_driver(struct sock *sk) sock_reset_flag(sk, SOCK_ZAPPED); wp = wp_sk(sk); - if (wp && wp->mbox) { + if (wp) { kfree(wp->mbox); wp->mbox = NULL; } @@ -1186,10 +1186,8 @@ static void wanpipe_kill_sock_timer (unsigned long data) return; } - if (wp_sk(sk)) { - kfree(wp_sk(sk)); - wp_sk(sk) = NULL; - } + kfree(wp_sk(sk)); + wp_sk(sk) = NULL; if (atomic_read(&sk->sk_refcnt) != 1) { atomic_set(&sk->sk_refcnt, 1); @@ -1219,10 +1217,8 @@ static void wanpipe_kill_sock_accept (struct sock *sk) sk->sk_socket = NULL; - if (wp_sk(sk)) { - kfree(wp_sk(sk)); - wp_sk(sk) = NULL; - } + kfree(wp_sk(sk)); + wp_sk(sk) = NULL; if (atomic_read(&sk->sk_refcnt) != 1) { atomic_set(&sk->sk_refcnt, 1); @@ -1243,10 +1239,8 @@ static void wanpipe_kill_sock_irq (struct sock *sk) sk->sk_socket = NULL; - if (wp_sk(sk)) { - kfree(wp_sk(sk)); - wp_sk(sk) = NULL; - } + kfree(wp_sk(sk)); + wp_sk(sk) = NULL; if (atomic_read(&sk->sk_refcnt) != 1) { atomic_set(&sk->sk_refcnt, 1); diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c index 13b650ad22e..bcf7b3faa76 100644 --- a/net/wanrouter/wanmain.c +++ b/net/wanrouter/wanmain.c @@ -714,10 +714,8 @@ static int wanrouter_device_new_if(struct wan_device *wandev, } /* This code has moved from del_if() function */ - if (dev->priv) { - kfree(dev->priv); - dev->priv = NULL; - } + kfree(dev->priv); + dev->priv = NULL; #ifdef CONFIG_WANPIPE_MULTPPP if (cnf->config_id == WANCONFIG_MPPP) @@ -851,10 +849,8 @@ static int wanrouter_delete_interface(struct wan_device *wandev, char *name) /* Due to new interface linking method using dev->priv, * this code has moved from del_if() function.*/ - if (dev->priv){ - kfree(dev->priv); - dev->priv=NULL; - } + kfree(dev->priv); + dev->priv=NULL; unregister_netdev(dev); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 8b9a4747417..7cf48aa6c95 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -62,14 +62,10 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x) { if (del_timer(&x->timer)) BUG(); - if (x->aalg) - kfree(x->aalg); - if (x->ealg) - kfree(x->ealg); - if (x->calg) - kfree(x->calg); - if (x->encap) - kfree(x->encap); + kfree(x->aalg); + kfree(x->ealg); + kfree(x->calg); + kfree(x->encap); if (x->type) { x->type->destructor(x); xfrm_put_type(x->type); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index c35336a0f71..0cdd9a07e04 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -18,7 +18,6 @@ #include <linux/string.h> #include <linux/net.h> #include <linux/skbuff.h> -#include <linux/netlink.h> #include <linux/rtnetlink.h> #include <linux/pfkeyv2.h> #include <linux/ipsec.h> @@ -26,6 +25,7 @@ #include <linux/security.h> #include <net/sock.h> #include <net/xfrm.h> +#include <net/netlink.h> #include <asm/uaccess.h> static struct sock *xfrm_nl; @@ -948,11 +948,6 @@ static struct xfrm_link { [XFRM_MSG_FLUSHPOLICY - XFRM_MSG_BASE] = { .doit = xfrm_flush_policy }, }; -static int xfrm_done(struct netlink_callback *cb) -{ - return 0; -} - static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp) { struct rtattr *xfrma[XFRMA_MAX]; @@ -984,20 +979,15 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *err if ((type == (XFRM_MSG_GETSA - XFRM_MSG_BASE) || type == (XFRM_MSG_GETPOLICY - XFRM_MSG_BASE)) && (nlh->nlmsg_flags & NLM_F_DUMP)) { - u32 rlen; - if (link->dump == NULL) goto err_einval; if ((*errp = netlink_dump_start(xfrm_nl, skb, nlh, - link->dump, - xfrm_done)) != 0) { + link->dump, NULL)) != 0) { return -1; } - rlen = NLMSG_ALIGN(nlh->nlmsg_len); - if (rlen > skb->len) - rlen = skb->len; - skb_pull(skb, rlen); + + netlink_queue_skip(nlh, skb); return -1; } @@ -1032,60 +1022,13 @@ err_einval: return -1; } -static int xfrm_user_rcv_skb(struct sk_buff *skb) -{ - int err; - struct nlmsghdr *nlh; - - while (skb->len >= NLMSG_SPACE(0)) { - u32 rlen; - - nlh = (struct nlmsghdr *) skb->data; - if (nlh->nlmsg_len < sizeof(*nlh) || - skb->len < nlh->nlmsg_len) - return 0; - rlen = NLMSG_ALIGN(nlh->nlmsg_len); - if (rlen > skb->len) - rlen = skb->len; - if (xfrm_user_rcv_msg(skb, nlh, &err) < 0) { - if (err == 0) - return -1; - netlink_ack(skb, nlh, err); - } else if (nlh->nlmsg_flags & NLM_F_ACK) - netlink_ack(skb, nlh, 0); - skb_pull(skb, rlen); - } - - return 0; -} - static void xfrm_netlink_rcv(struct sock *sk, int len) { - unsigned int qlen = skb_queue_len(&sk->sk_receive_queue); + unsigned int qlen = 0; do { - struct sk_buff *skb; - down(&xfrm_cfg_sem); - - if (qlen > skb_queue_len(&sk->sk_receive_queue)) - qlen = skb_queue_len(&sk->sk_receive_queue); - - for (; qlen; qlen--) { - skb = skb_dequeue(&sk->sk_receive_queue); - if (xfrm_user_rcv_skb(skb)) { - if (skb->len) - skb_queue_head(&sk->sk_receive_queue, - skb); - else { - kfree_skb(skb); - qlen--; - } - break; - } - kfree_skb(skb); - } - + netlink_run_queue(sk, &qlen, &xfrm_user_rcv_msg); up(&xfrm_cfg_sem); } while (qlen); |