From 39d48157ac1a0ff3ec81212e5451bfd1bf5f50db Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 21 Jul 2008 08:28:37 -0700 Subject: atl1: Do not wake queue before queue has been started. Based upon a bug report by Alexey Dobriyan, the patch is also tested by him and confirmed to fix the problem. Packet flow during link state events should not be done by waking and stopping the TX queue anyways, that is handled transparently by netif_carrier_{on,off}(). So, remove the netif_{wake,stop}_queue() calls in the link check code, and add the necessary netif_start_queue() call to atl1_up(). Signed-off-by: David S. Miller --- drivers/net/atlx/atl1.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/atlx/atl1.c b/drivers/net/atlx/atl1.c index 3e22e7817bc..f12e3d12474 100644 --- a/drivers/net/atlx/atl1.c +++ b/drivers/net/atlx/atl1.c @@ -1308,7 +1308,6 @@ static u32 atl1_check_link(struct atl1_adapter *adapter) dev_info(&adapter->pdev->dev, "link is down\n"); adapter->link_speed = SPEED_0; netif_carrier_off(netdev); - netif_stop_queue(netdev); } return 0; } @@ -1358,7 +1357,6 @@ static u32 atl1_check_link(struct atl1_adapter *adapter) if (!netif_carrier_ok(netdev)) { /* Link down -> Up */ netif_carrier_on(netdev); - netif_wake_queue(netdev); } return 0; } @@ -2627,6 +2625,7 @@ static s32 atl1_up(struct atl1_adapter *adapter) mod_timer(&adapter->watchdog_timer, jiffies); atlx_irq_enable(adapter); atl1_check_link(adapter); + netif_start_queue(netdev); return 0; err_up: -- cgit v1.2.3 From afc079465e991ffb7fe197d1ad80eb8140e2c341 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Mon, 21 Jul 2008 08:29:54 -0700 Subject: gianfar: do not touch net queue in adjust_link phylib callback If the net queue has not been started, we'll get this nice oops and non-working ethernet: PHY: 0:01 - Link is Up - 1000/Full ------------[ cut here ]------------ kernel BUG at net/core/dev.c:1328! Oops: Exception in kernel mode, sig: 5 [#1] MPC837x RDB Modules linked in: NIP: c02544a0 LR: c01a17d0 CTR: c01a16ac REGS: cf837e40 TRAP: 0700 Not tainted (2.6.26-05253-g14b395e) MSR: 00021032 CR: 22042044 XER: 00000000 TASK = cf819400[5] 'events/0' THREAD: cf836000 GPR00: c01a17d0 cf837ef0 cf819400 c03d8d08 cf8469a0 00000064 00000000 00000000 GPR08: c03d8d08 00000001 00000001 cf899ba0 22044044 00000000 0fffd000 00000000 GPR16: 0fff3028 0fff6cf0 00000000 0fff8390 0ff494a0 00000004 00000000 00000000 GPR24: c0361a00 00001058 cf9f6600 00009032 cf846800 cf846b80 00000001 00000014 NIP [c02544a0] __netif_schedule+0x28/0x8c LR [c01a17d0] adjust_link+0x124/0x1cc Call Trace: [cf837ef0] [c03fb3a0] 0xc03fb3a0 (unreliable) [cf837f10] [c01a17d0] adjust_link+0x124/0x1cc [cf837f40] [c01a8e28] phy_state_machine+0x2e0/0x448 [cf837f60] [c0040254] run_workqueue+0xc8/0x168 [cf837f90] [c00408d8] worker_thread+0x70/0xd0 [cf837fd0] [c0044630] kthread+0x48/0x84 [cf837ff0] [c0012610] kernel_thread+0x44/0x60 Instruction dump: 7c0803a6 4e800020 3d20c03e 9421ffe0 7c0802a6 7c681b78 39298d08 7c694a78 7d290034 90010024 bfa10014 5529d97e <0f090000> 39600002 38030024 7d200028 ---[ end trace 13dfd73ee42d0c30 ]--- Since the driver is using phylib (which is doing netif_carrier_on/off()), we should simply remove netif_tx_schedule_all() from adjust_link(). Signed-off-by: Anton Vorontsov Signed-off-by: David S. Miller --- drivers/net/gianfar.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c index 45a63172852..39b45e901be 100644 --- a/drivers/net/gianfar.c +++ b/drivers/net/gianfar.c @@ -1725,7 +1725,6 @@ static void adjust_link(struct net_device *dev) if (!priv->oldlink) { new_state = 1; priv->oldlink = 1; - netif_tx_schedule_all(dev); } } else if (priv->oldlink) { new_state = 1; -- cgit v1.2.3 From ebbdbd7c02f4f8dea84e2956aa942bd18e1ddf93 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Mon, 21 Jul 2008 08:30:36 -0700 Subject: ucc_geth: do not touch net queue in adjust_link phylib callback If the net queue has not been started, we'll get this nice oops and non-working ethernet: ------------[ cut here ]------------ Kernel BUG at c01f4648 [verbose debug info unavailable] Oops: Exception in kernel mode, sig: 5 [#1] MPC836x RDK Modules linked in: NIP: c01f4648 LR: c01c0a10 CTR: c01c08e4 REGS: cf839e40 TRAP: 0700 Not tainted (2.6.26-05254-gc7b9969) MSR: 00021032 CR: 22042044 XER: 00000000 TASK = cf828c30[4] 'events/0' THREAD: cf838000 GPR00: c01c0a10 cf839ef0 cf828c30 c035ceb0 cf8469a0 00000064 00000000 00000000 GPR08: c035ceb0 00000001 00000001 cf99c280 22044044 7ca81020 0fffc000 00000000 GPR16: 0fff2544 0fff63c0 00000000 0fff78e0 0ffa5580 00000004 00000000 00000000 GPR24: 02082000 cf9d0000 d1068000 00009032 cf846800 cf846b80 00000001 00000014 NIP [c01f4648] __netif_schedule+0x28/0x8c LR [c01c0a10] adjust_link+0x12c/0x1e4 Call Trace: [cf839ef0] [c0380f50] 0xc0380f50 (unreliable) [cf839f10] [c01c0a10] adjust_link+0x12c/0x1e4 [cf839f40] [c01c2628] phy_state_machine+0x2e0/0x448 [cf839f60] [c00425e8] run_workqueue+0xc8/0x168 [cf839f90] [c0042c6c] worker_thread+0x70/0xd0 [cf839fd0] [c0046954] kthread+0x48/0x84 [cf839ff0] [c0012488] kernel_thread+0x44/0x60 Instruction dump: 7c0803a6 4e800020 3d20c036 9421ffe0 7c0802a6 7c681b78 3929ceb0 7c694a78 7d290034 90010024 bfa10014 5529d97e <0f090000> 39600002 38030024 7d200028 ---[ end trace a57d367843bd2904 ]--- Since the driver is using phylib (which is doing netif_carrier_on/off()), we should simply remove netif_tx_schedule_all() from adjust_link(). Signed-off-by: Anton Vorontsov Signed-off-by: David S. Miller --- drivers/net/ucc_geth.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ucc_geth.c b/drivers/net/ucc_geth.c index 756ba10b79d..8f944e57fd5 100644 --- a/drivers/net/ucc_geth.c +++ b/drivers/net/ucc_geth.c @@ -1588,7 +1588,6 @@ static void adjust_link(struct net_device *dev) if (!ugeth->oldlink) { new_state = 1; ugeth->oldlink = 1; - netif_tx_schedule_all(dev); } } else if (ugeth->oldlink) { new_state = 1; -- cgit v1.2.3 From fd24c4af6e82231391fa09875ae6378fa1399f0f Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 21 Jul 2008 08:34:49 -0700 Subject: sunhme: Remove stop/wake TX queue calls in set-multicast-list handler. Based upon a bug report by Alexander Beregalov and commentary from Ben Hutchings. These are totally unnecessary, in particular because this driver's ->hard_start_xmit() handler takes the same driver spinlock that the set-multicast-list handler uses. Signed-off-by: David S. Miller --- drivers/net/sunhme.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/net/sunhme.c b/drivers/net/sunhme.c index 1aa425be306..b79d5f018f7 100644 --- a/drivers/net/sunhme.c +++ b/drivers/net/sunhme.c @@ -2377,8 +2377,6 @@ static void happy_meal_set_multicast(struct net_device *dev) spin_lock_irq(&hp->happy_lock); - netif_stop_queue(dev); - if ((dev->flags & IFF_ALLMULTI) || (dev->mc_count > 64)) { hme_write32(hp, bregs + BMAC_HTABLE0, 0xffff); hme_write32(hp, bregs + BMAC_HTABLE1, 0xffff); @@ -2410,8 +2408,6 @@ static void happy_meal_set_multicast(struct net_device *dev) hme_write32(hp, bregs + BMAC_HTABLE3, hash_table[3]); } - netif_wake_queue(dev); - spin_unlock_irq(&hp->happy_lock); } -- cgit v1.2.3 From c3ee84163e5bc0dc2e1ccf1d3fc412debca73bab Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 21 Jul 2008 09:18:07 -0700 Subject: pkt_sched: Remove unused variable skb in dev_deactivate_queue function. Removed unused variable 'skb' in the dev_deactivate_queue function Signed-off-by: Daniel Lezcano Signed-off-by: David S. Miller --- net/sched/sch_generic.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 0ddf69286f9..09dead33580 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -576,7 +576,6 @@ static void dev_deactivate_queue(struct net_device *dev, void *_qdisc_default) { struct Qdisc *qdisc_default = _qdisc_default; - struct sk_buff *skb = NULL; struct Qdisc *qdisc; qdisc = dev_queue->qdisc; @@ -588,8 +587,6 @@ static void dev_deactivate_queue(struct net_device *dev, spin_unlock_bh(qdisc_lock(qdisc)); } - - kfree_skb(skb); } static bool some_qdisc_is_running(struct net_device *dev, int lock) -- cgit v1.2.3 From b6b2fed1f4802b8fcc9d7548a8f785225d38f9a3 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 21 Jul 2008 09:48:06 -0700 Subject: net: Improve simple_tx_hash(). Based upon feedback from Eric Dumazet and Andi Kleen. Cure several deficiencies in simple_tx_hash() by using jhash + reciprocol multiply. 1) Eliminates expensive modulus operation. 2) Makes hash less attackable by using random seed. 3) Eliminates endianness hash distribution issues. Signed-off-by: David S. Miller --- net/core/dev.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 2eed17bcb2d..7e2d5274333 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -124,6 +124,8 @@ #include #include #include +#include +#include #include "net-sysfs.h" @@ -1668,34 +1670,37 @@ out_kfree_skb: * --BLG */ +static u32 simple_tx_hashrnd; +static int simple_tx_hashrnd_initialized = 0; + static u16 simple_tx_hash(struct net_device *dev, struct sk_buff *skb) { - u32 *addr, *ports, hash, ihl; + u32 addr1, addr2, ports; + u32 hash, ihl; u8 ip_proto; - int alen; + + if (unlikely(!simple_tx_hashrnd_initialized)) { + get_random_bytes(&simple_tx_hashrnd, 4); + simple_tx_hashrnd_initialized = 1; + } switch (skb->protocol) { case __constant_htons(ETH_P_IP): ip_proto = ip_hdr(skb)->protocol; - addr = &ip_hdr(skb)->saddr; + addr1 = ip_hdr(skb)->saddr; + addr2 = ip_hdr(skb)->daddr; ihl = ip_hdr(skb)->ihl; - alen = 2; break; case __constant_htons(ETH_P_IPV6): ip_proto = ipv6_hdr(skb)->nexthdr; - addr = &ipv6_hdr(skb)->saddr.s6_addr32[0]; + addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3]; + addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3]; ihl = (40 >> 2); - alen = 8; break; default: return 0; } - ports = (u32 *) (skb_network_header(skb) + (ihl * 4)); - - hash = 0; - while (alen--) - hash ^= *addr++; switch (ip_proto) { case IPPROTO_TCP: @@ -1705,14 +1710,17 @@ static u16 simple_tx_hash(struct net_device *dev, struct sk_buff *skb) case IPPROTO_AH: case IPPROTO_SCTP: case IPPROTO_UDPLITE: - hash ^= *ports; + ports = *((u32 *) (skb_network_header(skb) + (ihl * 4))); break; default: + ports = 0; break; } - return hash % dev->real_num_tx_queues; + hash = jhash_3words(addr1, addr2, ports, simple_tx_hashrnd); + + return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32); } static struct netdev_queue *dev_pick_tx(struct net_device *dev, -- cgit v1.2.3 From 867d79fb9a4d5929ad8335c896fcfe11c3b2ef14 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 21 Jul 2008 09:54:18 -0700 Subject: net: In __netif_schedule() use WARN_ON instead of BUG_ON Signed-off-by: David S. Miller --- net/core/dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 7e2d5274333..cbc34c0db37 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1327,7 +1327,8 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) void __netif_schedule(struct Qdisc *q) { - BUG_ON(q == &noop_qdisc); + if (WARN_ON_ONCE(q == &noop_qdisc)) + return; if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) { struct softnet_data *sd; -- cgit v1.2.3 From d3678b463df73f5060d7420915080e19baeb379b Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 21 Jul 2008 09:56:13 -0700 Subject: Revert "pkt_sched: Make default qdisc nonshared-multiqueue safe." This reverts commit a0c80b80e0fb48129e4e9d6a9ede914f9ff1850d. After discussions with Jamal and Herbert on netdev, we should provide at least minimal prioritization at the qdisc level even in multiqueue situations. Signed-off-by: David S. Miller --- net/sched/sch_generic.c | 99 ++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 77 insertions(+), 22 deletions(-) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 09dead33580..cb625b4d6da 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -356,44 +356,99 @@ static struct Qdisc noqueue_qdisc = { }; -static int fifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc) +static const u8 prio2band[TC_PRIO_MAX+1] = + { 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 }; + +/* 3-band FIFO queue: old style, but should be a bit faster than + generic prio+fifo combination. + */ + +#define PFIFO_FAST_BANDS 3 + +static inline struct sk_buff_head *prio2list(struct sk_buff *skb, + struct Qdisc *qdisc) +{ + struct sk_buff_head *list = qdisc_priv(qdisc); + return list + prio2band[skb->priority & TC_PRIO_MAX]; +} + +static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc) { - struct sk_buff_head *list = &qdisc->q; + struct sk_buff_head *list = prio2list(skb, qdisc); - if (skb_queue_len(list) < qdisc_dev(qdisc)->tx_queue_len) + if (skb_queue_len(list) < qdisc_dev(qdisc)->tx_queue_len) { + qdisc->q.qlen++; return __qdisc_enqueue_tail(skb, qdisc, list); + } return qdisc_drop(skb, qdisc); } -static struct sk_buff *fifo_fast_dequeue(struct Qdisc* qdisc) +static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc) { - struct sk_buff_head *list = &qdisc->q; + int prio; + struct sk_buff_head *list = qdisc_priv(qdisc); - if (!skb_queue_empty(list)) - return __qdisc_dequeue_head(qdisc, list); + for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) { + if (!skb_queue_empty(list + prio)) { + qdisc->q.qlen--; + return __qdisc_dequeue_head(qdisc, list + prio); + } + } return NULL; } -static int fifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc) +static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc) { - return __qdisc_requeue(skb, qdisc, &qdisc->q); + qdisc->q.qlen++; + return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc)); } -static void fifo_fast_reset(struct Qdisc* qdisc) +static void pfifo_fast_reset(struct Qdisc* qdisc) { - __qdisc_reset_queue(qdisc, &qdisc->q); + int prio; + struct sk_buff_head *list = qdisc_priv(qdisc); + + for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) + __qdisc_reset_queue(qdisc, list + prio); + qdisc->qstats.backlog = 0; + qdisc->q.qlen = 0; } -static struct Qdisc_ops fifo_fast_ops __read_mostly = { - .id = "fifo_fast", - .priv_size = 0, - .enqueue = fifo_fast_enqueue, - .dequeue = fifo_fast_dequeue, - .requeue = fifo_fast_requeue, - .reset = fifo_fast_reset, +static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb) +{ + struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS }; + + memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1); + NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + return skb->len; + +nla_put_failure: + return -1; +} + +static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt) +{ + int prio; + struct sk_buff_head *list = qdisc_priv(qdisc); + + for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) + skb_queue_head_init(list + prio); + + return 0; +} + +static struct Qdisc_ops pfifo_fast_ops __read_mostly = { + .id = "pfifo_fast", + .priv_size = PFIFO_FAST_BANDS * sizeof(struct sk_buff_head), + .enqueue = pfifo_fast_enqueue, + .dequeue = pfifo_fast_dequeue, + .requeue = pfifo_fast_requeue, + .init = pfifo_fast_init, + .reset = pfifo_fast_reset, + .dump = pfifo_fast_dump, .owner = THIS_MODULE, }; @@ -522,7 +577,7 @@ static void attach_one_default_qdisc(struct net_device *dev, if (dev->tx_queue_len) { qdisc = qdisc_create_dflt(dev, dev_queue, - &fifo_fast_ops, TC_H_ROOT); + &pfifo_fast_ops, TC_H_ROOT); if (!qdisc) { printk(KERN_INFO "%s: activation failed\n", dev->name); return; @@ -550,9 +605,9 @@ void dev_activate(struct net_device *dev) int need_watchdog; /* No queueing discipline is attached to device; - * create default one i.e. fifo_fast for devices, - * which need queueing and noqueue_qdisc for - * virtual interfaces. + create default one i.e. pfifo_fast for devices, + which need queueing and noqueue_qdisc for + virtual interfaces */ if (dev_all_qdisc_sleeping_noop(dev)) -- cgit v1.2.3 From ae6134bdf3197206fba95563d755d2fa50d90ddd Mon Sep 17 00:00:00 2001 From: Micah Dowty Date: Mon, 21 Jul 2008 09:59:09 -0700 Subject: hdlcdrv: Fix CRC calculation. This is a trivial patch against the hdlcdrv module that fixes its CRC calculation. The finished CRC was overwriting the first two bytes of each packet rather than being appended to the end. I've tested this with 2.6.8 and 2.6.10-rc1, but hdlcdrv hasn't changed much recently so it should work with many other kernel versions. Signed-off-by: Micah Dowty Acked-by: Thomas Sailer Signed-off-by: David S. Miller --- drivers/net/hamradio/hdlcdrv.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/hamradio/hdlcdrv.c b/drivers/net/hamradio/hdlcdrv.c index ae9629fa688..c258a0586e6 100644 --- a/drivers/net/hamradio/hdlcdrv.c +++ b/drivers/net/hamradio/hdlcdrv.c @@ -88,6 +88,7 @@ static inline void append_crc_ccitt(unsigned char *buffer, int len) { unsigned int crc = crc_ccitt(0xffff, buffer, len) ^ 0xffff; + buffer += len; *buffer++ = crc; *buffer++ = crc >> 8; } -- cgit v1.2.3 From 0dbff689c2f299e8f63911247925f2728d087688 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Mon, 21 Jul 2008 10:00:51 -0700 Subject: netfilter: nf_nat_core: eliminate useless find_appropriate_src for IP_NAT_RANGE_PROTO_RANDOM Signed-off-by: Changli Gao Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/nf_nat_core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index d2a887fc8d9..6c6a3cba8d5 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -240,12 +240,12 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, This is only required for source (ie. NAT/masq) mappings. So far, we don't do local source mappings, so multiple manips not an issue. */ - if (maniptype == IP_NAT_MANIP_SRC) { + if (maniptype == IP_NAT_MANIP_SRC && + !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) { if (find_appropriate_src(orig_tuple, tuple, range)) { pr_debug("get_unique_tuple: Found current src map\n"); - if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) - if (!nf_nat_used_tuple(tuple, ct)) - return; + if (!nf_nat_used_tuple(tuple, ct)) + return; } } -- cgit v1.2.3 From 07a7c1070ed382ad4562e3a0d453fd2001d92f7b Mon Sep 17 00:00:00 2001 From: Krzysztof Piotr Oledzki Date: Mon, 21 Jul 2008 10:01:14 -0700 Subject: netlink: add NLA_PUT_BE64 macro Add NLA_PUT_BE64 macro required for 64bit counters in netfilter Signed-off-by: Krzysztof Piotr Oledzki Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- include/net/netlink.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/net/netlink.h b/include/net/netlink.h index dfc3701dfcc..18024b8cecb 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -896,6 +896,9 @@ static inline int nla_put_msecs(struct sk_buff *skb, int attrtype, #define NLA_PUT_U64(skb, attrtype, value) \ NLA_PUT_TYPE(skb, u64, attrtype, value) +#define NLA_PUT_BE64(skb, attrtype, value) \ + NLA_PUT_TYPE(skb, __be64, attrtype, value) + #define NLA_PUT_STRING(skb, attrtype, value) \ NLA_PUT(skb, attrtype, strlen(value) + 1, value) -- cgit v1.2.3 From 584015727a3b88b46602b20077b46cd04f8b4ab3 Mon Sep 17 00:00:00 2001 From: Krzysztof Piotr Oledzki Date: Mon, 21 Jul 2008 10:01:34 -0700 Subject: netfilter: accounting rework: ct_extend + 64bit counters (v4) Initially netfilter has had 64bit counters for conntrack-based accounting, but it was changed in 2.6.14 to save memory. Unfortunately in-kernel 64bit counters are still required, for example for "connbytes" extension. However, 64bit counters waste a lot of memory and it was not possible to enable/disable it runtime. This patch: - reimplements accounting with respect to the extension infrastructure, - makes one global version of seq_print_acct() instead of two seq_print_counters(), - makes it possible to enable it at boot time (for CONFIG_SYSCTL/CONFIG_SYSFS=n), - makes it possible to enable/disable it at runtime by sysctl or sysfs, - extends counters from 32bit to 64bit, - renames ip_conntrack_counter -> nf_conn_counter, - enables accounting code unconditionally (no longer depends on CONFIG_NF_CT_ACCT), - set initial accounting enable state based on CONFIG_NF_CT_ACCT - removes buggy IPCT_COUNTER_FILLING event handling. If accounting is enabled newly created connections get additional acct extend. Old connections are not changed as it is not possible to add a ct_extend area to confirmed conntrack. Accounting is performed for all connections with acct extend regardless of a current state of "net.netfilter.nf_conntrack_acct". Signed-off-by: Krzysztof Piotr Oledzki Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- Documentation/feature-removal-schedule.txt | 10 ++ Documentation/kernel-parameters.txt | 7 ++ include/linux/netfilter/nf_conntrack_common.h | 8 +- include/linux/netfilter/nfnetlink_conntrack.h | 8 +- include/net/netfilter/nf_conntrack.h | 6 -- include/net/netfilter/nf_conntrack_acct.h | 51 ++++++++++ include/net/netfilter/nf_conntrack_extend.h | 2 + .../netfilter/nf_conntrack_l3proto_ipv4_compat.c | 18 +--- net/netfilter/Kconfig | 9 ++ net/netfilter/Makefile | 2 +- net/netfilter/nf_conntrack_acct.c | 104 +++++++++++++++++++++ net/netfilter/nf_conntrack_core.c | 39 +++++--- net/netfilter/nf_conntrack_netlink.c | 44 +++++---- net/netfilter/nf_conntrack_standalone.c | 18 +--- net/netfilter/xt_connbytes.c | 8 +- 15 files changed, 248 insertions(+), 86 deletions(-) create mode 100644 include/net/netfilter/nf_conntrack_acct.h create mode 100644 net/netfilter/nf_conntrack_acct.c diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 86334b6f823..9f73587219e 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -336,3 +336,13 @@ When: After the only user (hal) has seen a release with the patches Why: Over 1K .text/.data size reduction, data is available in other ways (ioctls) Who: Johannes Berg + +--------------------------- + +What: CONFIG_NF_CT_ACCT +When: 2.6.29 +Why: Accounting can now be enabled/disabled without kernel recompilation. + Currently used only to set a default value for a feature that is also + controlled by a kernel/module/sysfs/sysctl parameter. +Who: Krzysztof Piotr Oledzki + diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 09ad7450647..e4ef2758440 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1279,6 +1279,13 @@ and is between 256 and 4096 characters. It is defined in the file This usage is only documented in each driver source file if at all. + nf_conntrack.acct= + [NETFILTER] Enable connection tracking flow accounting + 0 to disable accounting + 1 to enable accounting + Default value depends on CONFIG_NF_CT_ACCT that is + going to be removed in 2.6.29. + nfsaddrs= [NFS] See Documentation/filesystems/nfsroot.txt. diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h index bad1eb760f6..885cbe28226 100644 --- a/include/linux/netfilter/nf_conntrack_common.h +++ b/include/linux/netfilter/nf_conntrack_common.h @@ -122,7 +122,7 @@ enum ip_conntrack_events IPCT_NATINFO_BIT = 10, IPCT_NATINFO = (1 << IPCT_NATINFO_BIT), - /* Counter highest bit has been set */ + /* Counter highest bit has been set, unused */ IPCT_COUNTER_FILLING_BIT = 11, IPCT_COUNTER_FILLING = (1 << IPCT_COUNTER_FILLING_BIT), @@ -145,12 +145,6 @@ enum ip_conntrack_expect_events { }; #ifdef __KERNEL__ -struct ip_conntrack_counter -{ - u_int32_t packets; - u_int32_t bytes; -}; - struct ip_conntrack_stat { unsigned int searched; diff --git a/include/linux/netfilter/nfnetlink_conntrack.h b/include/linux/netfilter/nfnetlink_conntrack.h index 759bc043dc6..c19595c8930 100644 --- a/include/linux/netfilter/nfnetlink_conntrack.h +++ b/include/linux/netfilter/nfnetlink_conntrack.h @@ -115,10 +115,10 @@ enum ctattr_protoinfo_sctp { enum ctattr_counters { CTA_COUNTERS_UNSPEC, - CTA_COUNTERS_PACKETS, /* old 64bit counters */ - CTA_COUNTERS_BYTES, /* old 64bit counters */ - CTA_COUNTERS32_PACKETS, - CTA_COUNTERS32_BYTES, + CTA_COUNTERS_PACKETS, /* 64bit counters */ + CTA_COUNTERS_BYTES, /* 64bit counters */ + CTA_COUNTERS32_PACKETS, /* old 32bit counters, unused */ + CTA_COUNTERS32_BYTES, /* old 32bit counters, unused */ __CTA_COUNTERS_MAX }; #define CTA_COUNTERS_MAX (__CTA_COUNTERS_MAX - 1) diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 8f5b75734dd..0741ad592da 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -88,7 +88,6 @@ struct nf_conn_help { u8 expecting[NF_CT_MAX_EXPECT_CLASSES]; }; - #include #include @@ -111,11 +110,6 @@ struct nf_conn /* Timer function; drops refcnt when it goes off. */ struct timer_list timeout; -#ifdef CONFIG_NF_CT_ACCT - /* Accounting Information (same cache line as other written members) */ - struct ip_conntrack_counter counters[IP_CT_DIR_MAX]; -#endif - #if defined(CONFIG_NF_CONNTRACK_MARK) u_int32_t mark; #endif diff --git a/include/net/netfilter/nf_conntrack_acct.h b/include/net/netfilter/nf_conntrack_acct.h new file mode 100644 index 00000000000..5d5ae55d54c --- /dev/null +++ b/include/net/netfilter/nf_conntrack_acct.h @@ -0,0 +1,51 @@ +/* + * (C) 2008 Krzysztof Piotr Oledzki + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _NF_CONNTRACK_ACCT_H +#define _NF_CONNTRACK_ACCT_H +#include +#include +#include +#include + +struct nf_conn_counter { + u_int64_t packets; + u_int64_t bytes; +}; + +extern int nf_ct_acct; + +static inline +struct nf_conn_counter *nf_conn_acct_find(const struct nf_conn *ct) +{ + return nf_ct_ext_find(ct, NF_CT_EXT_ACCT); +} + +static inline +struct nf_conn_counter *nf_ct_acct_ext_add(struct nf_conn *ct, gfp_t gfp) +{ + struct nf_conn_counter *acct; + + if (!nf_ct_acct) + return NULL; + + acct = nf_ct_ext_add(ct, NF_CT_EXT_ACCT, gfp); + if (!acct) + pr_debug("failed to add accounting extension area"); + + + return acct; +}; + +extern unsigned int +seq_print_acct(struct seq_file *s, const struct nf_conn *ct, int dir); + +extern int nf_conntrack_acct_init(void); +extern void nf_conntrack_acct_fini(void); + +#endif /* _NF_CONNTRACK_ACCT_H */ diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h index f80c0ed6d87..da8ee52613a 100644 --- a/include/net/netfilter/nf_conntrack_extend.h +++ b/include/net/netfilter/nf_conntrack_extend.h @@ -7,11 +7,13 @@ enum nf_ct_ext_id { NF_CT_EXT_HELPER, NF_CT_EXT_NAT, + NF_CT_EXT_ACCT, NF_CT_EXT_NUM, }; #define NF_CT_EXT_HELPER_TYPE struct nf_conn_help #define NF_CT_EXT_NAT_TYPE struct nf_conn_nat +#define NF_CT_EXT_ACCT_TYPE struct nf_conn_counter /* Extensions: optional stuff which isn't permanently in struct. */ struct nf_ct_ext { diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index 40a46d48249..3a020720e40 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -18,19 +18,7 @@ #include #include #include - -#ifdef CONFIG_NF_CT_ACCT -static unsigned int -seq_print_counters(struct seq_file *s, - const struct ip_conntrack_counter *counter) -{ - return seq_printf(s, "packets=%llu bytes=%llu ", - (unsigned long long)counter->packets, - (unsigned long long)counter->bytes); -} -#else -#define seq_print_counters(x, y) 0 -#endif +#include struct ct_iter_state { unsigned int bucket; @@ -127,7 +115,7 @@ static int ct_seq_show(struct seq_file *s, void *v) l3proto, l4proto)) return -ENOSPC; - if (seq_print_counters(s, &ct->counters[IP_CT_DIR_ORIGINAL])) + if (seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL)) return -ENOSPC; if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status))) @@ -138,7 +126,7 @@ static int ct_seq_show(struct seq_file *s, void *v) l3proto, l4proto)) return -ENOSPC; - if (seq_print_counters(s, &ct->counters[IP_CT_DIR_REPLY])) + if (seq_print_acct(s, ct, IP_CT_DIR_REPLY)) return -ENOSPC; if (test_bit(IPS_ASSURED_BIT, &ct->status)) diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 316c7af1d2b..ee898e74808 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -49,6 +49,15 @@ config NF_CT_ACCT Those counters can be used for flow-based accounting or the `connbytes' match. + Please note that currently this option only sets a default state. + You may change it at boot time with nf_conntrack.acct=0/1 kernel + paramater or by loading the nf_conntrack module with acct=0/1. + + You may also disable/enable it on a running system with: + sysctl net.netfilter.nf_conntrack_acct=0/1 + + This option will be removed in 2.6.29. + If unsure, say `N'. config NF_CONNTRACK_MARK diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 5c4b183f642..3bd2cc556ae 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -1,6 +1,6 @@ netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o -nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o +nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o obj-$(CONFIG_NETFILTER) = netfilter.o diff --git a/net/netfilter/nf_conntrack_acct.c b/net/netfilter/nf_conntrack_acct.c new file mode 100644 index 00000000000..59bd8b903a1 --- /dev/null +++ b/net/netfilter/nf_conntrack_acct.c @@ -0,0 +1,104 @@ +/* Accouting handling for netfilter. */ + +/* + * (C) 2008 Krzysztof Piotr Oledzki + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include + +#include +#include +#include + +#ifdef CONFIG_NF_CT_ACCT +#define NF_CT_ACCT_DEFAULT 1 +#else +#define NF_CT_ACCT_DEFAULT 0 +#endif + +int nf_ct_acct __read_mostly = NF_CT_ACCT_DEFAULT; +EXPORT_SYMBOL_GPL(nf_ct_acct); + +module_param_named(acct, nf_ct_acct, bool, 0644); +MODULE_PARM_DESC(acct, "Enable connection tracking flow accounting."); + +#ifdef CONFIG_SYSCTL +static struct ctl_table_header *acct_sysctl_header; +static struct ctl_table acct_sysctl_table[] = { + { + .ctl_name = CTL_UNNUMBERED, + .procname = "nf_conntrack_acct", + .data = &nf_ct_acct, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + {} +}; +#endif /* CONFIG_SYSCTL */ + +unsigned int +seq_print_acct(struct seq_file *s, const struct nf_conn *ct, int dir) +{ + struct nf_conn_counter *acct; + + acct = nf_conn_acct_find(ct); + if (!acct) + return 0; + + return seq_printf(s, "packets=%llu bytes=%llu ", + (unsigned long long)acct[dir].packets, + (unsigned long long)acct[dir].bytes); +}; +EXPORT_SYMBOL_GPL(seq_print_acct); + +static struct nf_ct_ext_type acct_extend __read_mostly = { + .len = sizeof(struct nf_conn_counter[IP_CT_DIR_MAX]), + .align = __alignof__(struct nf_conn_counter[IP_CT_DIR_MAX]), + .id = NF_CT_EXT_ACCT, +}; + +int nf_conntrack_acct_init(void) +{ + int ret; + +#ifdef CONFIG_NF_CT_ACCT + printk(KERN_WARNING "CONFIG_NF_CT_ACCT is deprecated and will be removed soon. Plase use\n"); + printk(KERN_WARNING "nf_conntrack.acct=1 kernel paramater, acct=1 nf_conntrack module option or\n"); + printk(KERN_WARNING "sysctl net.netfilter.nf_conntrack_acct=1 to enable it.\n"); +#endif + + ret = nf_ct_extend_register(&acct_extend); + if (ret < 0) { + printk(KERN_ERR "nf_conntrack_acct: Unable to register extension\n"); + return ret; + } + +#ifdef CONFIG_SYSCTL + acct_sysctl_header = register_sysctl_paths(nf_net_netfilter_sysctl_path, + acct_sysctl_table); + + if (!acct_sysctl_header) { + nf_ct_extend_unregister(&acct_extend); + + printk(KERN_ERR "nf_conntrack_acct: can't register to sysctl.\n"); + return -ENOMEM; + } +#endif + + return 0; +} + +void nf_conntrack_acct_fini(void) +{ +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(acct_sysctl_header); +#endif + nf_ct_extend_unregister(&acct_extend); +} diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 28d03e64200..c519d090bdb 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -37,6 +37,7 @@ #include #include #include +#include #define NF_CONNTRACK_VERSION "0.5.0" @@ -555,6 +556,8 @@ init_conntrack(const struct nf_conntrack_tuple *tuple, return NULL; } + nf_ct_acct_ext_add(ct, GFP_ATOMIC); + spin_lock_bh(&nf_conntrack_lock); exp = nf_ct_find_expectation(tuple); if (exp) { @@ -828,17 +831,16 @@ void __nf_ct_refresh_acct(struct nf_conn *ct, } acct: -#ifdef CONFIG_NF_CT_ACCT if (do_acct) { - ct->counters[CTINFO2DIR(ctinfo)].packets++; - ct->counters[CTINFO2DIR(ctinfo)].bytes += - skb->len - skb_network_offset(skb); + struct nf_conn_counter *acct; - if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000) - || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000)) - event |= IPCT_COUNTER_FILLING; + acct = nf_conn_acct_find(ct); + if (acct) { + acct[CTINFO2DIR(ctinfo)].packets++; + acct[CTINFO2DIR(ctinfo)].bytes += + skb->len - skb_network_offset(skb); + } } -#endif spin_unlock_bh(&nf_conntrack_lock); @@ -853,15 +855,19 @@ bool __nf_ct_kill_acct(struct nf_conn *ct, const struct sk_buff *skb, int do_acct) { -#ifdef CONFIG_NF_CT_ACCT if (do_acct) { + struct nf_conn_counter *acct; + spin_lock_bh(&nf_conntrack_lock); - ct->counters[CTINFO2DIR(ctinfo)].packets++; - ct->counters[CTINFO2DIR(ctinfo)].bytes += - skb->len - skb_network_offset(skb); + acct = nf_conn_acct_find(ct); + if (acct) { + acct[CTINFO2DIR(ctinfo)].packets++; + acct[CTINFO2DIR(ctinfo)].bytes += + skb->len - skb_network_offset(skb); + } spin_unlock_bh(&nf_conntrack_lock); } -#endif + if (del_timer(&ct->timeout)) { ct->timeout.function((unsigned long)ct); return true; @@ -1029,6 +1035,7 @@ void nf_conntrack_cleanup(void) nf_conntrack_proto_fini(); nf_conntrack_helper_fini(); nf_conntrack_expect_fini(); + nf_conntrack_acct_fini(); } struct hlist_head *nf_ct_alloc_hashtable(unsigned int *sizep, int *vmalloced) @@ -1168,6 +1175,10 @@ int __init nf_conntrack_init(void) if (ret < 0) goto out_fini_expect; + ret = nf_conntrack_acct_init(); + if (ret < 0) + goto out_fini_helper; + /* For use by REJECT target */ rcu_assign_pointer(ip_ct_attach, nf_conntrack_attach); rcu_assign_pointer(nf_ct_destroy, destroy_conntrack); @@ -1180,6 +1191,8 @@ int __init nf_conntrack_init(void) return ret; +out_fini_helper: + nf_conntrack_helper_fini(); out_fini_expect: nf_conntrack_expect_fini(); out_fini_proto: diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 95a7967731f..105a616c5c7 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -37,6 +37,7 @@ #include #include #include +#include #ifdef CONFIG_NF_NAT_NEEDED #include #include @@ -206,22 +207,26 @@ nla_put_failure: return -1; } -#ifdef CONFIG_NF_CT_ACCT static int ctnetlink_dump_counters(struct sk_buff *skb, const struct nf_conn *ct, enum ip_conntrack_dir dir) { enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG; struct nlattr *nest_count; + const struct nf_conn_counter *acct; + + acct = nf_conn_acct_find(ct); + if (!acct) + return 0; nest_count = nla_nest_start(skb, type | NLA_F_NESTED); if (!nest_count) goto nla_put_failure; - NLA_PUT_BE32(skb, CTA_COUNTERS32_PACKETS, - htonl(ct->counters[dir].packets)); - NLA_PUT_BE32(skb, CTA_COUNTERS32_BYTES, - htonl(ct->counters[dir].bytes)); + NLA_PUT_BE64(skb, CTA_COUNTERS_PACKETS, + cpu_to_be64(acct[dir].packets)); + NLA_PUT_BE64(skb, CTA_COUNTERS_BYTES, + cpu_to_be64(acct[dir].bytes)); nla_nest_end(skb, nest_count); @@ -230,9 +235,6 @@ ctnetlink_dump_counters(struct sk_buff *skb, const struct nf_conn *ct, nla_put_failure: return -1; } -#else -#define ctnetlink_dump_counters(a, b, c) (0) -#endif #ifdef CONFIG_NF_CONNTRACK_MARK static inline int @@ -501,11 +503,6 @@ static int ctnetlink_conntrack_event(struct notifier_block *this, goto nla_put_failure; #endif - if (events & IPCT_COUNTER_FILLING && - (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 || - ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0)) - goto nla_put_failure; - if (events & IPCT_RELATED && ctnetlink_dump_master(skb, ct) < 0) goto nla_put_failure; @@ -576,11 +573,15 @@ restart: cb->args[1] = (unsigned long)ct; goto out; } -#ifdef CONFIG_NF_CT_ACCT + if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == - IPCTNL_MSG_CT_GET_CTRZERO) - memset(&ct->counters, 0, sizeof(ct->counters)); -#endif + IPCTNL_MSG_CT_GET_CTRZERO) { + struct nf_conn_counter *acct; + + acct = nf_conn_acct_find(ct); + if (acct) + memset(acct, 0, sizeof(struct nf_conn_counter[IP_CT_DIR_MAX])); + } } if (cb->args[1]) { cb->args[1] = 0; @@ -832,14 +833,9 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, u_int8_t u3 = nfmsg->nfgen_family; int err = 0; - if (nlh->nlmsg_flags & NLM_F_DUMP) { -#ifndef CONFIG_NF_CT_ACCT - if (NFNL_MSG_TYPE(nlh->nlmsg_type) == IPCTNL_MSG_CT_GET_CTRZERO) - return -ENOTSUPP; -#endif + if (nlh->nlmsg_flags & NLM_F_DUMP) return netlink_dump_start(ctnl, skb, nlh, ctnetlink_dump_table, ctnetlink_done); - } if (cda[CTA_TUPLE_ORIG]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, u3); @@ -1152,6 +1148,8 @@ ctnetlink_create_conntrack(struct nlattr *cda[], goto err; } + nf_ct_acct_ext_add(ct, GFP_KERNEL); + #if defined(CONFIG_NF_CONNTRACK_MARK) if (cda[CTA_MARK]) ct->mark = ntohl(nla_get_be32(cda[CTA_MARK])); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 46ea542d0df..869ef9349d0 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -25,6 +25,7 @@ #include #include #include +#include MODULE_LICENSE("GPL"); @@ -38,19 +39,6 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, } EXPORT_SYMBOL_GPL(print_tuple); -#ifdef CONFIG_NF_CT_ACCT -static unsigned int -seq_print_counters(struct seq_file *s, - const struct ip_conntrack_counter *counter) -{ - return seq_printf(s, "packets=%llu bytes=%llu ", - (unsigned long long)counter->packets, - (unsigned long long)counter->bytes); -} -#else -#define seq_print_counters(x, y) 0 -#endif - struct ct_iter_state { unsigned int bucket; }; @@ -146,7 +134,7 @@ static int ct_seq_show(struct seq_file *s, void *v) l3proto, l4proto)) return -ENOSPC; - if (seq_print_counters(s, &ct->counters[IP_CT_DIR_ORIGINAL])) + if (seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL)) return -ENOSPC; if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status))) @@ -157,7 +145,7 @@ static int ct_seq_show(struct seq_file *s, void *v) l3proto, l4proto)) return -ENOSPC; - if (seq_print_counters(s, &ct->counters[IP_CT_DIR_REPLY])) + if (seq_print_acct(s, ct, IP_CT_DIR_REPLY)) return -ENOSPC; if (test_bit(IPS_ASSURED_BIT, &ct->status)) diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c index d7e8983cd37..3e39c4fe193 100644 --- a/net/netfilter/xt_connbytes.c +++ b/net/netfilter/xt_connbytes.c @@ -8,6 +8,7 @@ #include #include #include +#include MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte "); @@ -27,12 +28,15 @@ connbytes_mt(const struct sk_buff *skb, const struct net_device *in, u_int64_t what = 0; /* initialize to make gcc happy */ u_int64_t bytes = 0; u_int64_t pkts = 0; - const struct ip_conntrack_counter *counters; + const struct nf_conn_counter *counters; ct = nf_ct_get(skb, &ctinfo); if (!ct) return false; - counters = ct->counters; + + counters = nf_conn_acct_find(ct); + if (!counters) + return false; switch (sinfo->what) { case XT_CONNBYTES_PKTS: -- cgit v1.2.3 From 280763c053fee297d95b474f2c145990670371e6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 21 Jul 2008 10:02:12 -0700 Subject: netfilter: xt_time: fix time's time_mt()'s use of do_div() Fix netfilter xt_time's time_mt()'s use of do_div() on an s64 by using div_s64() instead. This was introduced by patch ee4411a1b1e0b679c99686629b5eab5a072ce49f ("[NETFILTER]: x_tables: add xt_time match"). Signed-off-by: David Howells Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/netfilter/xt_time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c index ed76baab473..9f328593287 100644 --- a/net/netfilter/xt_time.c +++ b/net/netfilter/xt_time.c @@ -173,7 +173,7 @@ time_mt(const struct sk_buff *skb, const struct net_device *in, __net_timestamp((struct sk_buff *)skb); stamp = ktime_to_ns(skb->tstamp); - do_div(stamp, NSEC_PER_SEC); + stamp = div_s64(stamp, NSEC_PER_SEC); if (info->flags & XT_TIME_LOCAL_TZ) /* Adjust for local timezone */ -- cgit v1.2.3 From 72961ecf84d67d6359a1b30f9b2a8427f13e1e71 Mon Sep 17 00:00:00 2001 From: Eric Leblond Date: Mon, 21 Jul 2008 10:02:35 -0700 Subject: netfilter: nfnetlink_log: send complete hardware header This patch adds some fields to NFLOG to be able to send the complete hardware header with all necessary informations. It sends to userspace: * the type of hardware link * the lenght of hardware header * the hardware header Signed-off-by: Eric Leblond Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/netfilter/nfnetlink_log.h | 3 +++ net/netfilter/nfnetlink_log.c | 8 ++++++++ 2 files changed, 11 insertions(+) diff --git a/include/linux/netfilter/nfnetlink_log.h b/include/linux/netfilter/nfnetlink_log.h index a8572133292..f661731f3cb 100644 --- a/include/linux/netfilter/nfnetlink_log.h +++ b/include/linux/netfilter/nfnetlink_log.h @@ -48,6 +48,9 @@ enum nfulnl_attr_type { NFULA_SEQ, /* instance-local sequence number */ NFULA_SEQ_GLOBAL, /* global sequence number */ NFULA_GID, /* group id of socket */ + NFULA_HWTYPE, /* hardware type */ + NFULA_HWHEADER, /* hardware header */ + NFULA_HWLEN, /* hardware header length */ __NFULA_MAX }; diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index b8173af8c24..9a35b57ab76 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -453,6 +453,14 @@ __build_packet_message(struct nfulnl_instance *inst, } } + if (indev && skb_mac_header_was_set(skb)) { + NLA_PUT_BE16(inst->skb, NFULA_HWTYPE, htons(skb->dev->type)); + NLA_PUT_BE16(inst->skb, NFULA_HWLEN, + htons(skb->dev->hard_header_len)); + NLA_PUT(inst->skb, NFULA_HWHEADER, skb->dev->hard_header_len, + skb_mac_header(skb)); + } + if (skb->tstamp.tv64) { struct nfulnl_msg_packet_timestamp ts; struct timeval tv = ktime_to_timeval(skb->tstamp); -- cgit v1.2.3 From db1a75bdcc1766dc7e1fae9201ae287dcbcb6c66 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Mon, 21 Jul 2008 10:02:59 -0700 Subject: netfilter: xt_TCPMSS: collapse tcpmss_reverse_mtu{4,6} into one function Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/netfilter/xt_TCPMSS.c | 42 +++++++++++++----------------------------- 1 file changed, 13 insertions(+), 29 deletions(-) diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index 217e2b68632..beb5094703c 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -147,17 +147,21 @@ tcpmss_mangle_packet(struct sk_buff *skb, return TCPOLEN_MSS; } -static u_int32_t tcpmss_reverse_mtu4(const struct iphdr *iph) +static u_int32_t tcpmss_reverse_mtu(const struct sk_buff *skb, + unsigned int family) { - struct flowi fl = { - .fl4_dst = iph->saddr, - }; + struct flowi fl = {}; const struct nf_afinfo *ai; struct rtable *rt = NULL; u_int32_t mtu = ~0U; + if (family == PF_INET) + fl.fl4_dst = ip_hdr(skb)->saddr; + else + fl.fl6_dst = ipv6_hdr(skb)->saddr; + rcu_read_lock(); - ai = nf_get_afinfo(AF_INET); + ai = nf_get_afinfo(family); if (ai != NULL) ai->route((struct dst_entry **)&rt, &fl); rcu_read_unlock(); @@ -178,7 +182,8 @@ tcpmss_tg4(struct sk_buff *skb, const struct net_device *in, __be16 newlen; int ret; - ret = tcpmss_mangle_packet(skb, targinfo, tcpmss_reverse_mtu4(iph), + ret = tcpmss_mangle_packet(skb, targinfo, + tcpmss_reverse_mtu(skb, PF_INET), iph->ihl * 4, sizeof(*iph) + sizeof(struct tcphdr)); if (ret < 0) @@ -193,28 +198,6 @@ tcpmss_tg4(struct sk_buff *skb, const struct net_device *in, } #if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) -static u_int32_t tcpmss_reverse_mtu6(const struct ipv6hdr *iph) -{ - struct flowi fl = { - .fl6_dst = iph->saddr, - }; - const struct nf_afinfo *ai; - struct rtable *rt = NULL; - u_int32_t mtu = ~0U; - - rcu_read_lock(); - ai = nf_get_afinfo(AF_INET6); - if (ai != NULL) - ai->route((struct dst_entry **)&rt, &fl); - rcu_read_unlock(); - - if (rt != NULL) { - mtu = dst_mtu(&rt->u.dst); - dst_release(&rt->u.dst); - } - return mtu; -} - static unsigned int tcpmss_tg6(struct sk_buff *skb, const struct net_device *in, const struct net_device *out, unsigned int hooknum, @@ -229,7 +212,8 @@ tcpmss_tg6(struct sk_buff *skb, const struct net_device *in, tcphoff = ipv6_skip_exthdr(skb, sizeof(*ipv6h), &nexthdr); if (tcphoff < 0) return NF_DROP; - ret = tcpmss_mangle_packet(skb, targinfo, tcpmss_reverse_mtu6(ipv6h), + ret = tcpmss_mangle_packet(skb, targinfo, + tcpmss_reverse_mtu(skb, PF_INET6), tcphoff, sizeof(*ipv6h) + sizeof(struct tcphdr)); if (ret < 0) -- cgit v1.2.3 From c71529e42ce39c167dc53430cb8f3d5634af77df Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 21 Jul 2008 10:03:23 -0700 Subject: netfilter: nf_nat_sip: c= is optional for session According to RFC2327, the connection information is optional in the session description since it can be specified in the media description instead. My provider does exactly that and does not provide any connection information in the session description. As a result the new kernel drops all invite responses. This patch makes it optional as documented. Signed-off-by: Herbert Xu Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/nf_nat_sip.c | 38 +++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c index 4334d5cabc5..14544320c54 100644 --- a/net/ipv4/netfilter/nf_nat_sip.c +++ b/net/ipv4/netfilter/nf_nat_sip.c @@ -318,11 +318,11 @@ static int mangle_content_len(struct sk_buff *skb, buffer, buflen); } -static unsigned mangle_sdp_packet(struct sk_buff *skb, const char **dptr, - unsigned int dataoff, unsigned int *datalen, - enum sdp_header_types type, - enum sdp_header_types term, - char *buffer, int buflen) +static int mangle_sdp_packet(struct sk_buff *skb, const char **dptr, + unsigned int dataoff, unsigned int *datalen, + enum sdp_header_types type, + enum sdp_header_types term, + char *buffer, int buflen) { enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); @@ -330,9 +330,9 @@ static unsigned mangle_sdp_packet(struct sk_buff *skb, const char **dptr, if (ct_sip_get_sdp_header(ct, *dptr, dataoff, *datalen, type, term, &matchoff, &matchlen) <= 0) - return 0; + return -ENOENT; return mangle_packet(skb, dptr, datalen, matchoff, matchlen, - buffer, buflen); + buffer, buflen) ? 0 : -EINVAL; } static unsigned int ip_nat_sdp_addr(struct sk_buff *skb, const char **dptr, @@ -346,8 +346,8 @@ static unsigned int ip_nat_sdp_addr(struct sk_buff *skb, const char **dptr, unsigned int buflen; buflen = sprintf(buffer, NIPQUAD_FMT, NIPQUAD(addr->ip)); - if (!mangle_sdp_packet(skb, dptr, dataoff, datalen, type, term, - buffer, buflen)) + if (mangle_sdp_packet(skb, dptr, dataoff, datalen, type, term, + buffer, buflen)) return 0; return mangle_content_len(skb, dptr, datalen); @@ -381,15 +381,27 @@ static unsigned int ip_nat_sdp_session(struct sk_buff *skb, const char **dptr, /* Mangle session description owner and contact addresses */ buflen = sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(addr->ip)); - if (!mangle_sdp_packet(skb, dptr, dataoff, datalen, + if (mangle_sdp_packet(skb, dptr, dataoff, datalen, SDP_HDR_OWNER_IP4, SDP_HDR_MEDIA, buffer, buflen)) return 0; - if (!mangle_sdp_packet(skb, dptr, dataoff, datalen, - SDP_HDR_CONNECTION_IP4, SDP_HDR_MEDIA, - buffer, buflen)) + switch (mangle_sdp_packet(skb, dptr, dataoff, datalen, + SDP_HDR_CONNECTION_IP4, SDP_HDR_MEDIA, + buffer, buflen)) { + case 0: + /* + * RFC 2327: + * + * Session description + * + * c=* (connection information - not required if included in all media) + */ + case -ENOENT: + break; + default: return 0; + } return mangle_content_len(skb, dptr, datalen); } -- cgit v1.2.3 From 5547cd0ae8b46db9a084505239294eed9b8c8e2d Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 21 Jul 2008 10:03:49 -0700 Subject: netfilter: nf_conntrack_sctp: fix sparse warnings Introduced by a258860e (netfilter: ctnetlink: add full support for SCTP to ctnetlink): net/netfilter/nf_conntrack_proto_sctp.c:483:2: warning: cast from restricted type net/netfilter/nf_conntrack_proto_sctp.c:483:2: warning: incorrect type in argument 1 (different base types) net/netfilter/nf_conntrack_proto_sctp.c:483:2: expected unsigned int [unsigned] [usertype] x net/netfilter/nf_conntrack_proto_sctp.c:483:2: got restricted unsigned int const net/netfilter/nf_conntrack_proto_sctp.c:483:2: warning: cast from restricted type net/netfilter/nf_conntrack_proto_sctp.c:483:2: warning: cast from restricted type net/netfilter/nf_conntrack_proto_sctp.c:483:2: warning: cast from restricted type net/netfilter/nf_conntrack_proto_sctp.c:483:2: warning: cast from restricted type net/netfilter/nf_conntrack_proto_sctp.c:487:2: warning: cast from restricted type net/netfilter/nf_conntrack_proto_sctp.c:487:2: warning: incorrect type in argument 1 (different base types) net/netfilter/nf_conntrack_proto_sctp.c:487:2: expected unsigned int [unsigned] [usertype] x net/netfilter/nf_conntrack_proto_sctp.c:487:2: got restricted unsigned int const net/netfilter/nf_conntrack_proto_sctp.c:487:2: warning: cast from restricted type net/netfilter/nf_conntrack_proto_sctp.c:487:2: warning: cast from restricted type net/netfilter/nf_conntrack_proto_sctp.c:487:2: warning: cast from restricted type net/netfilter/nf_conntrack_proto_sctp.c:487:2: warning: cast from restricted type net/netfilter/nf_conntrack_proto_sctp.c:532:42: warning: incorrect type in assignment (different base types) net/netfilter/nf_conntrack_proto_sctp.c:532:42: expected restricted unsigned int net/netfilter/nf_conntrack_proto_sctp.c:532:42: got unsigned int net/netfilter/nf_conntrack_proto_sctp.c:534:39: warning: incorrect type in assignment (different base types) net/netfilter/nf_conntrack_proto_sctp.c:534:39: expected restricted unsigned int net/netfilter/nf_conntrack_proto_sctp.c:534:39: got unsigned int Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/netfilter/nf_conntrack_proto_sctp.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 41183a4d2d6..30aa5b94a77 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -482,11 +482,11 @@ static int sctp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, NLA_PUT_BE32(skb, CTA_PROTOINFO_SCTP_VTAG_ORIGINAL, - htonl(ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL])); + ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL]); NLA_PUT_BE32(skb, CTA_PROTOINFO_SCTP_VTAG_REPLY, - htonl(ct->proto.sctp.vtag[IP_CT_DIR_REPLY])); + ct->proto.sctp.vtag[IP_CT_DIR_REPLY]); read_unlock_bh(&sctp_lock); @@ -530,9 +530,9 @@ static int nlattr_to_sctp(struct nlattr *cda[], struct nf_conn *ct) write_lock_bh(&sctp_lock); ct->proto.sctp.state = nla_get_u8(tb[CTA_PROTOINFO_SCTP_STATE]); ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = - ntohl(nla_get_be32(tb[CTA_PROTOINFO_SCTP_VTAG_ORIGINAL])); + nla_get_be32(tb[CTA_PROTOINFO_SCTP_VTAG_ORIGINAL]); ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = - ntohl(nla_get_be32(tb[CTA_PROTOINFO_SCTP_VTAG_REPLY])); + nla_get_be32(tb[CTA_PROTOINFO_SCTP_VTAG_REPLY]); write_unlock_bh(&sctp_lock); return 0; -- cgit v1.2.3