From c80a5cdfc5ca6533cb893154f546370da1fdb8f0 Mon Sep 17 00:00:00 2001
From: Doug Leith <doug.leith@nuim.ie>
Date: Mon, 25 May 2009 22:44:59 -0700
Subject: tcp: tcp_vegas ssthresh bugfix

This patch fixes ssthresh accounting issues in tcp_vegas when cwnd decreases

Signed-off-by: Doug Leith <doug.leith@nuim.ie>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_vegas.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index a453aac91bd..c6743eec9b7 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -158,6 +158,11 @@ void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event)
 }
 EXPORT_SYMBOL_GPL(tcp_vegas_cwnd_event);
 
+static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp)
+{
+	return  min(tp->snd_ssthresh, tp->snd_cwnd-1);
+}
+
 static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -221,11 +226,10 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
 			 */
 			diff = tp->snd_cwnd * (rtt-vegas->baseRTT) / vegas->baseRTT;
 
-			if (diff > gamma && tp->snd_ssthresh > 2 ) {
+			if (diff > gamma && tp->snd_cwnd <= tp->snd_ssthresh) {
 				/* Going too fast. Time to slow down
 				 * and switch to congestion avoidance.
 				 */
-				tp->snd_ssthresh = 2;
 
 				/* Set cwnd to match the actual rate
 				 * exactly:
@@ -235,6 +239,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
 				 * utilization.
 				 */
 				tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd+1);
+				tp->snd_ssthresh = tcp_vegas_ssthresh(tp);
 
 			} else if (tp->snd_cwnd <= tp->snd_ssthresh) {
 				/* Slow start.  */
@@ -250,6 +255,8 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
 					 * we slow down.
 					 */
 					tp->snd_cwnd--;
+					tp->snd_ssthresh
+						= tcp_vegas_ssthresh(tp);
 				} else if (diff < alpha) {
 					/* We don't have enough extra packets
 					 * in the network, so speed up.
-- 
cgit v1.2.3


From 915219441d566f1da0caa0e262be49b666159e17 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 28 May 2009 21:35:47 -0700
Subject: tcp: Use SKB queue and list helpers instead of doing it by-hand.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c       |  17 +++-----
 net/ipv4/tcp_input.c | 118 ++++++++++++++++++++++++++++++++++++---------------
 2 files changed, 90 insertions(+), 45 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0fb8b441f1f..17b89c523f9 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -439,12 +439,14 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 			 !tp->urg_data ||
 			 before(tp->urg_seq, tp->copied_seq) ||
 			 !before(tp->urg_seq, tp->rcv_nxt)) {
+			struct sk_buff *skb;
+
 			answ = tp->rcv_nxt - tp->copied_seq;
 
 			/* Subtract 1, if FIN is in queue. */
-			if (answ && !skb_queue_empty(&sk->sk_receive_queue))
-				answ -=
-		       tcp_hdr((struct sk_buff *)sk->sk_receive_queue.prev)->fin;
+			skb = skb_peek_tail(&sk->sk_receive_queue);
+			if (answ && skb)
+				answ -= tcp_hdr(skb)->fin;
 		} else
 			answ = tp->urg_seq - tp->copied_seq;
 		release_sock(sk);
@@ -1382,11 +1384,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 
 		/* Next get a buffer. */
 
-		skb = skb_peek(&sk->sk_receive_queue);
-		do {
-			if (!skb)
-				break;
-
+		skb_queue_walk(&sk->sk_receive_queue, skb) {
 			/* Now that we have two receive queues this
 			 * shouldn't happen.
 			 */
@@ -1403,8 +1401,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 			if (tcp_hdr(skb)->fin)
 				goto found_fin_ok;
 			WARN_ON(!(flags & MSG_PEEK));
-			skb = skb->next;
-		} while (skb != (struct sk_buff *)&sk->sk_receive_queue);
+		}
 
 		/* Well, if we have backlog, try to process it now yet. */
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index eeb8a92aa41..ba34a23c1bf 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4426,7 +4426,7 @@ drop:
 		}
 		__skb_queue_head(&tp->out_of_order_queue, skb);
 	} else {
-		struct sk_buff *skb1 = tp->out_of_order_queue.prev;
+		struct sk_buff *skb1 = skb_peek_tail(&tp->out_of_order_queue);
 		u32 seq = TCP_SKB_CB(skb)->seq;
 		u32 end_seq = TCP_SKB_CB(skb)->end_seq;
 
@@ -4443,15 +4443,18 @@ drop:
 		}
 
 		/* Find place to insert this segment. */
-		do {
+		while (1) {
 			if (!after(TCP_SKB_CB(skb1)->seq, seq))
 				break;
-		} while ((skb1 = skb1->prev) !=
-			 (struct sk_buff *)&tp->out_of_order_queue);
+			if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) {
+				skb1 = NULL;
+				break;
+			}
+			skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1);
+		}
 
 		/* Do skb overlap to previous one? */
-		if (skb1 != (struct sk_buff *)&tp->out_of_order_queue &&
-		    before(seq, TCP_SKB_CB(skb1)->end_seq)) {
+		if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
 			if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
 				/* All the bits are present. Drop. */
 				__kfree_skb(skb);
@@ -4463,24 +4466,41 @@ drop:
 				tcp_dsack_set(sk, seq,
 					      TCP_SKB_CB(skb1)->end_seq);
 			} else {
-				skb1 = skb1->prev;
+				if (skb_queue_is_first(&tp->out_of_order_queue,
+						       skb1))
+					skb1 = NULL;
+				else
+					skb1 = skb_queue_prev(
+						&tp->out_of_order_queue,
+						skb1);
 			}
 		}
-		__skb_queue_after(&tp->out_of_order_queue, skb1, skb);
+		if (!skb1)
+			__skb_queue_head(&tp->out_of_order_queue, skb);
+		else
+			__skb_queue_after(&tp->out_of_order_queue, skb1, skb);
 
 		/* And clean segments covered by new one as whole. */
-		while ((skb1 = skb->next) !=
-		       (struct sk_buff *)&tp->out_of_order_queue &&
-		       after(end_seq, TCP_SKB_CB(skb1)->seq)) {
-			if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
+		if (skb1 && !skb_queue_is_last(&tp->out_of_order_queue, skb1)) {
+			struct sk_buff *n;
+
+			skb1 = skb_queue_next(&tp->out_of_order_queue, skb1);
+			skb_queue_walk_from_safe(&tp->out_of_order_queue,
+						 skb1, n) {
+				if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
+					break;
+				if (before(end_seq,
+					   TCP_SKB_CB(skb1)->end_seq)) {
+					tcp_dsack_extend(sk,
+							 TCP_SKB_CB(skb1)->seq,
+							 end_seq);
+					break;
+				}
+				__skb_unlink(skb1, &tp->out_of_order_queue);
 				tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
-						 end_seq);
-				break;
+						 TCP_SKB_CB(skb1)->end_seq);
+				__kfree_skb(skb1);
 			}
-			__skb_unlink(skb1, &tp->out_of_order_queue);
-			tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
-					 TCP_SKB_CB(skb1)->end_seq);
-			__kfree_skb(skb1);
 		}
 
 add_sack:
@@ -4492,7 +4512,10 @@ add_sack:
 static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
 					struct sk_buff_head *list)
 {
-	struct sk_buff *next = skb->next;
+	struct sk_buff *next = NULL;
+
+	if (!skb_queue_is_last(list, skb))
+		next = skb_queue_next(list, skb);
 
 	__skb_unlink(skb, list);
 	__kfree_skb(skb);
@@ -4503,6 +4526,9 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
 
 /* Collapse contiguous sequence of skbs head..tail with
  * sequence numbers start..end.
+ *
+ * If tail is NULL, this means until the end of the list.
+ *
  * Segments with FIN/SYN are not collapsed (only because this
  * simplifies code)
  */
@@ -4511,15 +4537,23 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list,
 	     struct sk_buff *head, struct sk_buff *tail,
 	     u32 start, u32 end)
 {
-	struct sk_buff *skb;
+	struct sk_buff *skb, *n;
+	bool end_of_skbs;
 
 	/* First, check that queue is collapsible and find
 	 * the point where collapsing can be useful. */
-	for (skb = head; skb != tail;) {
+	skb = head;
+restart:
+	end_of_skbs = true;
+	skb_queue_walk_from_safe(list, skb, n) {
+		if (skb == tail)
+			break;
 		/* No new bits? It is possible on ofo queue. */
 		if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
 			skb = tcp_collapse_one(sk, skb, list);
-			continue;
+			if (!skb)
+				break;
+			goto restart;
 		}
 
 		/* The first skb to collapse is:
@@ -4529,16 +4563,24 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list,
 		 */
 		if (!tcp_hdr(skb)->syn && !tcp_hdr(skb)->fin &&
 		    (tcp_win_from_space(skb->truesize) > skb->len ||
-		     before(TCP_SKB_CB(skb)->seq, start) ||
-		     (skb->next != tail &&
-		      TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb->next)->seq)))
+		     before(TCP_SKB_CB(skb)->seq, start))) {
+			end_of_skbs = false;
 			break;
+		}
+
+		if (!skb_queue_is_last(list, skb)) {
+			struct sk_buff *next = skb_queue_next(list, skb);
+			if (next != tail &&
+			    TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) {
+				end_of_skbs = false;
+				break;
+			}
+		}
 
 		/* Decided to skip this, advance start seq. */
 		start = TCP_SKB_CB(skb)->end_seq;
-		skb = skb->next;
 	}
-	if (skb == tail || tcp_hdr(skb)->syn || tcp_hdr(skb)->fin)
+	if (end_of_skbs || tcp_hdr(skb)->syn || tcp_hdr(skb)->fin)
 		return;
 
 	while (before(start, end)) {
@@ -4583,7 +4625,8 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list,
 			}
 			if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
 				skb = tcp_collapse_one(sk, skb, list);
-				if (skb == tail ||
+				if (!skb ||
+				    skb == tail ||
 				    tcp_hdr(skb)->syn ||
 				    tcp_hdr(skb)->fin)
 					return;
@@ -4610,17 +4653,21 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
 	head = skb;
 
 	for (;;) {
-		skb = skb->next;
+		struct sk_buff *next = NULL;
+
+		if (!skb_queue_is_last(&tp->out_of_order_queue, skb))
+			next = skb_queue_next(&tp->out_of_order_queue, skb);
+		skb = next;
 
 		/* Segment is terminated when we see gap or when
 		 * we are at the end of all the queue. */
-		if (skb == (struct sk_buff *)&tp->out_of_order_queue ||
+		if (!skb ||
 		    after(TCP_SKB_CB(skb)->seq, end) ||
 		    before(TCP_SKB_CB(skb)->end_seq, start)) {
 			tcp_collapse(sk, &tp->out_of_order_queue,
 				     head, skb, start, end);
 			head = skb;
-			if (skb == (struct sk_buff *)&tp->out_of_order_queue)
+			if (!skb)
 				break;
 			/* Start new segment */
 			start = TCP_SKB_CB(skb)->seq;
@@ -4681,10 +4728,11 @@ static int tcp_prune_queue(struct sock *sk)
 		tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
 
 	tcp_collapse_ofo_queue(sk);
-	tcp_collapse(sk, &sk->sk_receive_queue,
-		     sk->sk_receive_queue.next,
-		     (struct sk_buff *)&sk->sk_receive_queue,
-		     tp->copied_seq, tp->rcv_nxt);
+	if (!skb_queue_empty(&sk->sk_receive_queue))
+		tcp_collapse(sk, &sk->sk_receive_queue,
+			     skb_peek(&sk->sk_receive_queue),
+			     NULL,
+			     tp->copied_seq, tp->rcv_nxt);
 	sk_mem_reclaim(sk);
 
 	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
-- 
cgit v1.2.3


From 28e72216d7e7af7050f171a87c1eecba93d01ea6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 28 May 2009 10:44:30 +0000
Subject: net: unset IFF_XMIT_DST_RELEASE in ipip_tunnel_setup()

ipip_tunnel_xmit() might need skb->dst, so tell dev_hard_start_xmit()
to no release it.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipip.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 9054139795a..bb2f1b17fbf 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -713,6 +713,7 @@ static void ipip_tunnel_setup(struct net_device *dev)
 	dev->iflink		= 0;
 	dev->addr_len		= 4;
 	dev->features		|= NETIF_F_NETNS_LOCAL;
+	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
 }
 
 static void ipip_tunnel_init(struct net_device *dev)
-- 
cgit v1.2.3


From 108bfa895cacd1a7c1767e85be105b213e5dce50 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 28 May 2009 22:35:10 +0000
Subject: net: unset IFF_XMIT_DST_RELEASE in ipgre_tunnel_setup()

ipgre_tunnel_xmit() might need skb->dst, so tell dev_hard_start_xmit()
to no release it.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index e62510d5ea5..77436e2732e 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -1238,6 +1238,7 @@ static void ipgre_tunnel_setup(struct net_device *dev)
 	dev->iflink		= 0;
 	dev->addr_len		= 4;
 	dev->features		|= NETIF_F_NETNS_LOCAL;
+	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
 }
 
 static int ipgre_tunnel_init(struct net_device *dev)
-- 
cgit v1.2.3


From 2df9001edc382c331f338f45d259feeaa740c418 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@helsinki.fi>
Date: Fri, 29 May 2009 15:02:29 -0700
Subject: tcp: fix loop in ofo handling code and reduce its complexity
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Somewhat luckily, I was looking into these parts with very fine
comb because I've made somewhat similar changes on the same
area (conflicts that arose weren't that lucky though). The loop
was very much overengineered recently in commit 915219441d566
(tcp: Use SKB queue and list helpers instead of doing it
by-hand), while it basically just wants to know if there are
skbs after 'skb'.

Also it got broken because skb1 = skb->next got translated into
skb1 = skb1->next (though abstracted) improperly. Note that
'skb1' is pointing to previous sk_buff than skb or NULL if at
head. Two things went wrong:
- We'll kfree 'skb' on the first iteration instead of the
  skbuff following 'skb' (it would require required SACK reneging
  to recover I think).
- The list head case where 'skb1' is NULL is checked too early
  and the loop won't execute whereas it previously did.

Conclusion, mostly revert the recent changes which makes the
cset very messy looking but using proper accessor in the
previous-like version.

The effective changes against the original can be viewed with:
  git-diff 915219441d566f1da0caa0e262be49b666159e17^ \
		net/ipv4/tcp_input.c | sed -n -e '57,70 p'

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 28 +++++++++++-----------------
 1 file changed, 11 insertions(+), 17 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ba34a23c1bf..2bdb0da237e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4481,26 +4481,20 @@ drop:
 			__skb_queue_after(&tp->out_of_order_queue, skb1, skb);
 
 		/* And clean segments covered by new one as whole. */
-		if (skb1 && !skb_queue_is_last(&tp->out_of_order_queue, skb1)) {
-			struct sk_buff *n;
+		while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) {
+			skb1 = skb_queue_next(&tp->out_of_order_queue, skb);
 
-			skb1 = skb_queue_next(&tp->out_of_order_queue, skb1);
-			skb_queue_walk_from_safe(&tp->out_of_order_queue,
-						 skb1, n) {
-				if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
-					break;
-				if (before(end_seq,
-					   TCP_SKB_CB(skb1)->end_seq)) {
-					tcp_dsack_extend(sk,
-							 TCP_SKB_CB(skb1)->seq,
-							 end_seq);
-					break;
-				}
-				__skb_unlink(skb1, &tp->out_of_order_queue);
+			if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
+				break;
+			if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
 				tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
-						 TCP_SKB_CB(skb1)->end_seq);
-				__kfree_skb(skb1);
+						 end_seq);
+				break;
 			}
+			__skb_unlink(skb1, &tp->out_of_order_queue);
+			tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
+					 TCP_SKB_CB(skb1)->end_seq);
+			__kfree_skb(skb1);
 		}
 
 add_sack:
-- 
cgit v1.2.3


From 4d52cfbef6266092d535237ba5a4b981458ab171 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 2 Jun 2009 00:42:16 -0700
Subject: net: ipv4/ip_sockglue.c cleanups

Pure cleanups

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_sockglue.c | 73 +++++++++++++++++++++++++-------------------------
 1 file changed, 36 insertions(+), 37 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 43c05854d75..21b0187123d 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -157,38 +157,39 @@ void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
 	/* Ordered by supposed usage frequency */
 	if (flags & 1)
 		ip_cmsg_recv_pktinfo(msg, skb);
-	if ((flags>>=1) == 0)
+	if ((flags >>= 1) == 0)
 		return;
 
 	if (flags & 1)
 		ip_cmsg_recv_ttl(msg, skb);
-	if ((flags>>=1) == 0)
+	if ((flags >>= 1) == 0)
 		return;
 
 	if (flags & 1)
 		ip_cmsg_recv_tos(msg, skb);
-	if ((flags>>=1) == 0)
+	if ((flags >>= 1) == 0)
 		return;
 
 	if (flags & 1)
 		ip_cmsg_recv_opts(msg, skb);
-	if ((flags>>=1) == 0)
+	if ((flags >>= 1) == 0)
 		return;
 
 	if (flags & 1)
 		ip_cmsg_recv_retopts(msg, skb);
-	if ((flags>>=1) == 0)
+	if ((flags >>= 1) == 0)
 		return;
 
 	if (flags & 1)
 		ip_cmsg_recv_security(msg, skb);
 
-	if ((flags>>=1) == 0)
+	if ((flags >>= 1) == 0)
 		return;
 	if (flags & 1)
 		ip_cmsg_recv_dstaddr(msg, skb);
 
 }
+EXPORT_SYMBOL(ip_cmsg_recv);
 
 int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc)
 {
@@ -203,7 +204,8 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc)
 		switch (cmsg->cmsg_type) {
 		case IP_RETOPTS:
 			err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
-			err = ip_options_get(net, &ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40);
+			err = ip_options_get(net, &ipc->opt, CMSG_DATA(cmsg),
+					     err < 40 ? err : 40);
 			if (err)
 				return err;
 			break;
@@ -238,7 +240,8 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc)
 struct ip_ra_chain *ip_ra_chain;
 DEFINE_RWLOCK(ip_ra_lock);
 
-int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct sock *))
+int ip_ra_control(struct sock *sk, unsigned char on,
+		  void (*destructor)(struct sock *))
 {
 	struct ip_ra_chain *ra, *new_ra, **rap;
 
@@ -248,7 +251,7 @@ int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct s
 	new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
 
 	write_lock_bh(&ip_ra_lock);
-	for (rap = &ip_ra_chain; (ra=*rap) != NULL; rap = &ra->next) {
+	for (rap = &ip_ra_chain; (ra = *rap) != NULL; rap = &ra->next) {
 		if (ra->sk == sk) {
 			if (on) {
 				write_unlock_bh(&ip_ra_lock);
@@ -416,7 +419,8 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len)
 	/* Reset and regenerate socket error */
 	spin_lock_bh(&sk->sk_error_queue.lock);
 	sk->sk_err = 0;
-	if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
+	skb2 = skb_peek(&sk->sk_error_queue);
+	if (skb2 != NULL) {
 		sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
 		spin_unlock_bh(&sk->sk_error_queue.lock);
 		sk->sk_error_report(sk);
@@ -431,8 +435,8 @@ out:
 
 
 /*
- *	Socket option code for IP. This is the end of the line after any TCP,UDP etc options on
- *	an IP socket.
+ *	Socket option code for IP. This is the end of the line after any
+ *	TCP,UDP etc options on an IP socket.
  */
 
 static int do_ip_setsockopt(struct sock *sk, int level,
@@ -474,7 +478,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 	switch (optname) {
 	case IP_OPTIONS:
 	{
-		struct ip_options * opt = NULL;
+		struct ip_options *opt = NULL;
 		if (optlen > 40 || optlen < 0)
 			goto e_inval;
 		err = ip_options_get_from_user(sock_net(sk), &opt,
@@ -556,9 +560,9 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 		}
 		break;
 	case IP_TTL:
-		if (optlen<1)
+		if (optlen < 1)
 			goto e_inval;
-		if (val != -1 && (val < 1 || val>255))
+		if (val != -1 && (val < 0 || val > 255))
 			goto e_inval;
 		inet->uc_ttl = val;
 		break;
@@ -570,7 +574,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 		inet->hdrincl = val ? 1 : 0;
 		break;
 	case IP_MTU_DISCOVER:
-		if (val<0 || val>3)
+		if (val < 0 || val > 3)
 			goto e_inval;
 		inet->pmtudisc = val;
 		break;
@@ -582,7 +586,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 	case IP_MULTICAST_TTL:
 		if (sk->sk_type == SOCK_STREAM)
 			goto e_inval;
-		if (optlen<1)
+		if (optlen < 1)
 			goto e_inval;
 		if (val == -1)
 			val = 1;
@@ -591,7 +595,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 		inet->mc_ttl = val;
 		break;
 	case IP_MULTICAST_LOOP:
-		if (optlen<1)
+		if (optlen < 1)
 			goto e_inval;
 		inet->mc_loop = !!val;
 		break;
@@ -613,7 +617,8 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 		} else {
 			memset(&mreq, 0, sizeof(mreq));
 			if (optlen >= sizeof(struct in_addr) &&
-			    copy_from_user(&mreq.imr_address, optval, sizeof(struct in_addr)))
+			    copy_from_user(&mreq.imr_address, optval,
+					   sizeof(struct in_addr)))
 				break;
 		}
 
@@ -677,7 +682,6 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 	}
 	case IP_MSFILTER:
 	{
-		extern int sysctl_igmp_max_msf;
 		struct ip_msfilter *msf;
 
 		if (optlen < IP_MSFILTER_SIZE(0))
@@ -831,7 +835,6 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 	}
 	case MCAST_MSFILTER:
 	{
-		extern int sysctl_igmp_max_msf;
 		struct sockaddr_in *psin;
 		struct ip_msfilter *msf = NULL;
 		struct group_filter *gsf = NULL;
@@ -849,9 +852,9 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 			break;
 		}
 		err = -EFAULT;
-		if (copy_from_user(gsf, optval, optlen)) {
+		if (copy_from_user(gsf, optval, optlen))
 			goto mc_msf_out;
-		}
+
 		/* numsrc >= (4G-140)/128 overflow in 32 bits */
 		if (gsf->gf_numsrc >= 0x1ffffff ||
 		    gsf->gf_numsrc > sysctl_igmp_max_msf) {
@@ -879,7 +882,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 		msf->imsf_fmode = gsf->gf_fmode;
 		msf->imsf_numsrc = gsf->gf_numsrc;
 		err = -EADDRNOTAVAIL;
-		for (i=0; i<gsf->gf_numsrc; ++i) {
+		for (i = 0; i < gsf->gf_numsrc; ++i) {
 			psin = (struct sockaddr_in *)&gsf->gf_slist[i];
 
 			if (psin->sin_family != AF_INET)
@@ -890,7 +893,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 		gsf = NULL;
 
 		err = ip_mc_msfilter(sk, msf, ifindex);
-	mc_msf_out:
+mc_msf_out:
 		kfree(msf);
 		kfree(gsf);
 		break;
@@ -900,7 +903,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 		break;
 
 	case IP_FREEBIND:
-		if (optlen<1)
+		if (optlen < 1)
 			goto e_inval;
 		inet->freebind = !!val;
 		break;
@@ -957,6 +960,7 @@ int ip_setsockopt(struct sock *sk, int level,
 #endif
 	return err;
 }
+EXPORT_SYMBOL(ip_setsockopt);
 
 #ifdef CONFIG_COMPAT
 int compat_ip_setsockopt(struct sock *sk, int level, int optname,
@@ -986,13 +990,12 @@ int compat_ip_setsockopt(struct sock *sk, int level, int optname,
 #endif
 	return err;
 }
-
 EXPORT_SYMBOL(compat_ip_setsockopt);
 #endif
 
 /*
- *	Get the options. Note for future reference. The GET of IP options gets the
- *	_received_ ones. The set sets the _sent_ ones.
+ *	Get the options. Note for future reference. The GET of IP options gets
+ *	the _received_ ones. The set sets the _sent_ ones.
  */
 
 static int do_ip_getsockopt(struct sock *sk, int level, int optname,
@@ -1143,7 +1146,8 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
 			return -EFAULT;
 		}
 		err = ip_mc_gsfget(sk, &gsf,
-				   (struct group_filter __user *)optval, optlen);
+				   (struct group_filter __user *)optval,
+				   optlen);
 		release_sock(sk);
 		return err;
 	}
@@ -1187,7 +1191,7 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
 	}
 	release_sock(sk);
 
-	if (len < sizeof(int) && len > 0 && val>=0 && val<=255) {
+	if (len < sizeof(int) && len > 0 && val >= 0 && val <= 255) {
 		unsigned char ucval = (unsigned char)val;
 		len = 1;
 		if (put_user(len, optlen))
@@ -1230,6 +1234,7 @@ int ip_getsockopt(struct sock *sk, int level,
 #endif
 	return err;
 }
+EXPORT_SYMBOL(ip_getsockopt);
 
 #ifdef CONFIG_COMPAT
 int compat_ip_getsockopt(struct sock *sk, int level, int optname,
@@ -1262,11 +1267,5 @@ int compat_ip_getsockopt(struct sock *sk, int level, int optname,
 #endif
 	return err;
 }
-
 EXPORT_SYMBOL(compat_ip_getsockopt);
 #endif
-
-EXPORT_SYMBOL(ip_cmsg_recv);
-
-EXPORT_SYMBOL(ip_getsockopt);
-EXPORT_SYMBOL(ip_setsockopt);
-- 
cgit v1.2.3


From f771bef98004d9d141b085d987a77d06669d4f4f Mon Sep 17 00:00:00 2001
From: Nivedita Singhvi <niv@us.ibm.com>
Date: Thu, 28 May 2009 07:00:46 +0000
Subject: ipv4: New multicast-all socket option

After some discussion offline with Christoph Lameter and David Stevens
regarding multicast behaviour in Linux, I'm submitting a slightly
modified patch from the one Christoph submitted earlier.

This patch provides a new socket option IP_MULTICAST_ALL.

In this case, default behaviour is _unchanged_ from the current
Linux standard. The socket option is set by default to provide
original behaviour. Sockets wishing to receive data only from
multicast groups they join explicitly will need to clear this
socket option.

Signed-off-by: Nivedita Singhvi <niv@us.ibm.com>
Signed-off-by: Christoph Lameter<cl@linux.com>
Acked-by: David Stevens <dlstevens@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/af_inet.c     |  1 +
 net/ipv4/igmp.c        |  2 +-
 net/ipv4/ip_sockglue.c | 11 +++++++++++
 3 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 5abee4c9744..d8736217858 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -375,6 +375,7 @@ lookup_protocol:
 	inet->uc_ttl	= -1;
 	inet->mc_loop	= 1;
 	inet->mc_ttl	= 1;
+	inet->mc_all	= 1;
 	inet->mc_index	= 0;
 	inet->mc_list	= NULL;
 
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 9eb6219af61..e6058a50379 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -2196,7 +2196,7 @@ int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif)
 			break;
 	}
 	if (!pmc)
-		return 1;
+		return inet->mc_all;
 	psl = pmc->sflist;
 	if (!psl)
 		return pmc->sfmode == MCAST_EXCLUDE;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 21b0187123d..cb49936856e 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -453,6 +453,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 			     (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) |
 			     (1<<IP_PASSSEC) | (1<<IP_TRANSPARENT))) ||
 	    optname == IP_MULTICAST_TTL ||
+	    optname == IP_MULTICAST_ALL ||
 	    optname == IP_MULTICAST_LOOP ||
 	    optname == IP_RECVORIGDSTADDR) {
 		if (optlen >= sizeof(int)) {
@@ -898,6 +899,13 @@ mc_msf_out:
 		kfree(gsf);
 		break;
 	}
+	case IP_MULTICAST_ALL:
+		if (optlen < 1)
+			goto e_inval;
+		if (val != 0 && val != 1)
+			goto e_inval;
+		inet->mc_all = val;
+		break;
 	case IP_ROUTER_ALERT:
 		err = ip_ra_control(sk, val ? 1 : 0, NULL);
 		break;
@@ -1151,6 +1159,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
 		release_sock(sk);
 		return err;
 	}
+	case IP_MULTICAST_ALL:
+		val = inet->mc_all;
+		break;
 	case IP_PKTOPTIONS:
 	{
 		struct msghdr msg;
-- 
cgit v1.2.3


From 511c3f92ad5b6d9f8f6464be1b4f85f0422be91a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 2 Jun 2009 05:14:27 +0000
Subject: net: skb->rtable accessor

Define skb_rtable(const struct sk_buff *skb) accessor to get rtable from skb

Delete skb->rtable field

Setting rtable is not allowed, just set dst instead as rtable is an alias.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/arp.c                      |  4 ++--
 net/ipv4/icmp.c                     | 10 +++++-----
 net/ipv4/igmp.c                     |  2 +-
 net/ipv4/ip_forward.c               |  2 +-
 net/ipv4/ip_gre.c                   |  4 ++--
 net/ipv4/ip_input.c                 |  2 +-
 net/ipv4/ip_options.c               | 16 ++++++++--------
 net/ipv4/ip_output.c                | 10 +++++-----
 net/ipv4/ip_sockglue.c              |  2 +-
 net/ipv4/ipip.c                     |  2 +-
 net/ipv4/ipmr.c                     |  6 +++---
 net/ipv4/netfilter/ipt_MASQUERADE.c |  2 +-
 net/ipv4/netfilter/nf_nat_helper.c  |  4 ++--
 net/ipv4/route.c                    | 37 ++++++++++++++++++++++---------------
 net/ipv4/tcp_ipv4.c                 |  4 ++--
 15 files changed, 57 insertions(+), 50 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index f11931c1838..816494f271a 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -474,7 +474,7 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb)
 		return 1;
 	}
 
-	paddr = skb->rtable->rt_gateway;
+	paddr = skb_rtable(skb)->rt_gateway;
 
 	if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr, paddr, dev))
 		return 0;
@@ -817,7 +817,7 @@ static int arp_process(struct sk_buff *skb)
 	if (arp->ar_op == htons(ARPOP_REQUEST) &&
 	    ip_route_input(skb, tip, sip, 0, dev) == 0) {
 
-		rt = skb->rtable;
+		rt = skb_rtable(skb);
 		addr_type = rt->rt_type;
 
 		if (addr_type == RTN_LOCAL) {
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 3f50807237e..94f75efae93 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -356,7 +356,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
 static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 {
 	struct ipcm_cookie ipc;
-	struct rtable *rt = skb->rtable;
+	struct rtable *rt = skb_rtable(skb);
 	struct net *net = dev_net(rt->u.dst.dev);
 	struct sock *sk;
 	struct inet_sock *inet;
@@ -416,7 +416,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 	struct iphdr *iph;
 	int room;
 	struct icmp_bxm icmp_param;
-	struct rtable *rt = skb_in->rtable;
+	struct rtable *rt = skb_rtable(skb_in);
 	struct ipcm_cookie ipc;
 	__be32 saddr;
 	u8  tos;
@@ -596,7 +596,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 					     RT_TOS(tos), rt2->u.dst.dev);
 
 			dst_release(&rt2->u.dst);
-			rt2 = skb_in->rtable;
+			rt2 = skb_rtable(skb_in);
 			skb_in->dst = odst;
 		}
 
@@ -926,7 +926,7 @@ static void icmp_address(struct sk_buff *skb)
 
 static void icmp_address_reply(struct sk_buff *skb)
 {
-	struct rtable *rt = skb->rtable;
+	struct rtable *rt = skb_rtable(skb);
 	struct net_device *dev = skb->dev;
 	struct in_device *in_dev;
 	struct in_ifaddr *ifa;
@@ -970,7 +970,7 @@ static void icmp_discard(struct sk_buff *skb)
 int icmp_rcv(struct sk_buff *skb)
 {
 	struct icmphdr *icmph;
-	struct rtable *rt = skb->rtable;
+	struct rtable *rt = skb_rtable(skb);
 	struct net *net = dev_net(rt->u.dst.dev);
 
 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index e6058a50379..afabd2758b6 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -948,7 +948,7 @@ int igmp_rcv(struct sk_buff *skb)
 	case IGMPV2_HOST_MEMBERSHIP_REPORT:
 	case IGMPV3_HOST_MEMBERSHIP_REPORT:
 		/* Is it our report looped back? */
-		if (skb->rtable->fl.iif == 0)
+		if (skb_rtable(skb)->fl.iif == 0)
 			break;
 		/* don't rely on MC router hearing unicast reports */
 		if (skb->pkt_type == PACKET_MULTICAST ||
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index df3fe50bbf0..0761cd9bbd1 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -81,7 +81,7 @@ int ip_forward(struct sk_buff *skb)
 	if (!xfrm4_route_forward(skb))
 		goto drop;
 
-	rt = skb->rtable;
+	rt = skb_rtable(skb);
 
 	if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
 		goto sr_failed;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 77436e2732e..85ddad45a91 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -602,7 +602,7 @@ static int ipgre_rcv(struct sk_buff *skb)
 #ifdef CONFIG_NET_IPGRE_BROADCAST
 		if (ipv4_is_multicast(iph->daddr)) {
 			/* Looped back packet, drop it! */
-			if (skb->rtable->fl.iif == 0)
+			if (skb_rtable(skb)->fl.iif == 0)
 				goto drop;
 			stats->multicast++;
 			skb->pkt_type = PACKET_BROADCAST;
@@ -704,7 +704,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 		}
 
 		if (skb->protocol == htons(ETH_P_IP)) {
-			rt = skb->rtable;
+			rt = skb_rtable(skb);
 			if ((dst = rt->rt_gateway) == 0)
 				goto tx_error_icmp;
 		}
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 40f6206b2aa..cea784b0aa4 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -357,7 +357,7 @@ static int ip_rcv_finish(struct sk_buff *skb)
 	if (iph->ihl > 5 && ip_rcv_options(skb))
 		goto drop;
 
-	rt = skb->rtable;
+	rt = skb_rtable(skb);
 	if (rt->rt_type == RTN_MULTICAST) {
 		IP_UPD_PO_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INMCAST,
 				skb->len);
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 2c88da6e786..7e1074ffdbd 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -102,7 +102,7 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb)
 	sptr = skb_network_header(skb);
 	dptr = dopt->__data;
 
-	daddr = skb->rtable->rt_spec_dst;
+	daddr = skb_rtable(skb)->rt_spec_dst;
 
 	if (sopt->rr) {
 		optlen  = sptr[sopt->rr+1];
@@ -257,7 +257,7 @@ int ip_options_compile(struct net *net,
 	struct rtable *rt = NULL;
 
 	if (skb != NULL) {
-		rt = skb->rtable;
+		rt = skb_rtable(skb);
 		optptr = (unsigned char *)&(ip_hdr(skb)[1]);
 	} else
 		optptr = opt->__data;
@@ -550,7 +550,7 @@ void ip_forward_options(struct sk_buff *skb)
 {
 	struct   ip_options * opt	= &(IPCB(skb)->opt);
 	unsigned char * optptr;
-	struct rtable *rt = skb->rtable;
+	struct rtable *rt = skb_rtable(skb);
 	unsigned char *raw = skb_network_header(skb);
 
 	if (opt->rr_needaddr) {
@@ -598,7 +598,7 @@ int ip_options_rcv_srr(struct sk_buff *skb)
 	__be32 nexthop;
 	struct iphdr *iph = ip_hdr(skb);
 	unsigned char *optptr = skb_network_header(skb) + opt->srr;
-	struct rtable *rt = skb->rtable;
+	struct rtable *rt = skb_rtable(skb);
 	struct rtable *rt2;
 	int err;
 
@@ -623,13 +623,13 @@ int ip_options_rcv_srr(struct sk_buff *skb)
 		}
 		memcpy(&nexthop, &optptr[srrptr-1], 4);
 
-		rt = skb->rtable;
-		skb->rtable = NULL;
+		rt = skb_rtable(skb);
+		skb->dst = NULL;
 		err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev);
-		rt2 = skb->rtable;
+		rt2 = skb_rtable(skb);
 		if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) {
 			ip_rt_put(rt2);
-			skb->rtable = rt;
+			skb->dst = &rt->u.dst;
 			return -EINVAL;
 		}
 		ip_rt_put(rt);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index ea19c37ccc0..8d845ebfcca 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -140,7 +140,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
 			  __be32 saddr, __be32 daddr, struct ip_options *opt)
 {
 	struct inet_sock *inet = inet_sk(sk);
-	struct rtable *rt = skb->rtable;
+	struct rtable *rt = skb_rtable(skb);
 	struct iphdr *iph;
 
 	/* Build the IP header. */
@@ -238,7 +238,7 @@ static int ip_finish_output(struct sk_buff *skb)
 int ip_mc_output(struct sk_buff *skb)
 {
 	struct sock *sk = skb->sk;
-	struct rtable *rt = skb->rtable;
+	struct rtable *rt = skb_rtable(skb);
 	struct net_device *dev = rt->u.dst.dev;
 
 	/*
@@ -319,7 +319,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
 	/* Skip all of this if the packet is already routed,
 	 * f.e. by something like SCTP.
 	 */
-	rt = skb->rtable;
+	rt = skb_rtable(skb);
 	if (rt != NULL)
 		goto packet_routed;
 
@@ -440,7 +440,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 	unsigned int mtu, hlen, left, len, ll_rs, pad;
 	int offset;
 	__be16 not_last_frag;
-	struct rtable *rt = skb->rtable;
+	struct rtable *rt = skb_rtable(skb);
 	int err = 0;
 
 	dev = rt->u.dst.dev;
@@ -1362,7 +1362,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
 	} replyopts;
 	struct ipcm_cookie ipc;
 	__be32 daddr;
-	struct rtable *rt = skb->rtable;
+	struct rtable *rt = skb_rtable(skb);
 
 	if (ip_options_echo(&replyopts.opt, skb))
 		return;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index cb49936856e..fc7993e9061 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -57,7 +57,7 @@
 static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
 {
 	struct in_pktinfo info;
-	struct rtable *rt = skb->rtable;
+	struct rtable *rt = skb_rtable(skb);
 
 	info.ipi_addr.s_addr = ip_hdr(skb)->daddr;
 	if (rt) {
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index bb2f1b17fbf..0c6e7bf18a4 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -416,7 +416,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	if (!dst) {
 		/* NBMA tunnel */
-		if ((rt = skb->rtable) == NULL) {
+		if ((rt = skb_rtable(skb)) == NULL) {
 			stats->tx_fifo_errors++;
 			goto tx_error;
 		}
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 13e9dd3012b..69dd058283e 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1354,7 +1354,7 @@ static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local
 	if (net->ipv4.vif_table[vif].dev != skb->dev) {
 		int true_vifi;
 
-		if (skb->rtable->fl.iif == 0) {
+		if (skb_rtable(skb)->fl.iif == 0) {
 			/* It is our own packet, looped back.
 			   Very complicated situation...
 
@@ -1430,7 +1430,7 @@ int ip_mr_input(struct sk_buff *skb)
 {
 	struct mfc_cache *cache;
 	struct net *net = dev_net(skb->dev);
-	int local = skb->rtable->rt_flags&RTCF_LOCAL;
+	int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
 
 	/* Packet is looped back after forward, it should not be
 	   forwarded second time, but still can be delivered locally.
@@ -1646,7 +1646,7 @@ int ipmr_get_route(struct net *net,
 {
 	int err;
 	struct mfc_cache *cache;
-	struct rtable *rt = skb->rtable;
+	struct rtable *rt = skb_rtable(skb);
 
 	read_lock(&mrt_lock);
 	cache = ipmr_cache_find(net, rt->rt_src, rt->rt_dst);
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index f389f60cb10..c0992c75bda 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -72,7 +72,7 @@ masquerade_tg(struct sk_buff *skb, const struct xt_target_param *par)
 		return NF_ACCEPT;
 
 	mr = par->targinfo;
-	rt = skb->rtable;
+	rt = skb_rtable(skb);
 	newsrc = inet_select_addr(par->out, rt->rt_gateway, RT_SCOPE_UNIVERSE);
 	if (!newsrc) {
 		printk("MASQUERADE: %s ate my IP address\n", par->out->name);
diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c
index cf7a42bf982..155c008626c 100644
--- a/net/ipv4/netfilter/nf_nat_helper.c
+++ b/net/ipv4/netfilter/nf_nat_helper.c
@@ -140,7 +140,7 @@ nf_nat_mangle_tcp_packet(struct sk_buff *skb,
 			 const char *rep_buffer,
 			 unsigned int rep_len)
 {
-	struct rtable *rt = skb->rtable;
+	struct rtable *rt = skb_rtable(skb);
 	struct iphdr *iph;
 	struct tcphdr *tcph;
 	int oldlen, datalen;
@@ -218,7 +218,7 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb,
 			 const char *rep_buffer,
 			 unsigned int rep_len)
 {
-	struct rtable *rt = skb->rtable;
+	struct rtable *rt = skb_rtable(skb);
 	struct iphdr *iph;
 	struct udphdr *udph;
 	int datalen, oldlen;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 28205e5bfa9..f20060ac2f0 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1064,7 +1064,8 @@ work_done:
 out:	return 0;
 }
 
-static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
+static int rt_intern_hash(unsigned hash, struct rtable *rt,
+			  struct rtable **rp, struct sk_buff *skb)
 {
 	struct rtable	*rth, **rthp;
 	unsigned long	now;
@@ -1114,7 +1115,10 @@ restart:
 			spin_unlock_bh(rt_hash_lock_addr(hash));
 
 			rt_drop(rt);
-			*rp = rth;
+			if (rp)
+				*rp = rth;
+			else
+				skb->dst = &rth->u.dst;
 			return 0;
 		}
 
@@ -1210,7 +1214,10 @@ restart:
 	rcu_assign_pointer(rt_hash_table[hash].chain, rt);
 
 	spin_unlock_bh(rt_hash_lock_addr(hash));
-	*rp = rt;
+	if (rp)
+		*rp = rt;
+	else
+		skb->dst = &rt->u.dst;
 	return 0;
 }
 
@@ -1407,7 +1414,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 							&netevent);
 
 				rt_del(hash, rth);
-				if (!rt_intern_hash(hash, rt, &rt))
+				if (!rt_intern_hash(hash, rt, &rt, NULL))
 					ip_rt_put(rt);
 				goto do_next;
 			}
@@ -1473,7 +1480,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 
 void ip_rt_send_redirect(struct sk_buff *skb)
 {
-	struct rtable *rt = skb->rtable;
+	struct rtable *rt = skb_rtable(skb);
 	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
 
 	if (!in_dev)
@@ -1521,7 +1528,7 @@ out:
 
 static int ip_error(struct sk_buff *skb)
 {
-	struct rtable *rt = skb->rtable;
+	struct rtable *rt = skb_rtable(skb);
 	unsigned long now;
 	int code;
 
@@ -1698,7 +1705,7 @@ static void ipv4_link_failure(struct sk_buff *skb)
 
 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
 
-	rt = skb->rtable;
+	rt = skb_rtable(skb);
 	if (rt)
 		dst_set_expires(&rt->u.dst, 0);
 }
@@ -1858,7 +1865,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 
 	in_dev_put(in_dev);
 	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
-	return rt_intern_hash(hash, rth, &skb->rtable);
+	return rt_intern_hash(hash, rth, NULL, skb);
 
 e_nobufs:
 	in_dev_put(in_dev);
@@ -2019,7 +2026,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
 	/* put it into the cache */
 	hash = rt_hash(daddr, saddr, fl->iif,
 		       rt_genid(dev_net(rth->u.dst.dev)));
-	return rt_intern_hash(hash, rth, &skb->rtable);
+	return rt_intern_hash(hash, rth, NULL, skb);
 }
 
 /*
@@ -2175,7 +2182,7 @@ local_input:
 	}
 	rth->rt_type	= res.type;
 	hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
-	err = rt_intern_hash(hash, rth, &skb->rtable);
+	err = rt_intern_hash(hash, rth, NULL, skb);
 	goto done;
 
 no_route:
@@ -2244,7 +2251,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 			dst_use(&rth->u.dst, jiffies);
 			RT_CACHE_STAT_INC(in_hit);
 			rcu_read_unlock();
-			skb->rtable = rth;
+			skb->dst = &rth->u.dst;
 			return 0;
 		}
 		RT_CACHE_STAT_INC(in_hlist_search);
@@ -2420,7 +2427,7 @@ static int ip_mkroute_output(struct rtable **rp,
 	if (err == 0) {
 		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
 			       rt_genid(dev_net(dev_out)));
-		err = rt_intern_hash(hash, rth, rp);
+		err = rt_intern_hash(hash, rth, rp, NULL);
 	}
 
 	return err;
@@ -2763,7 +2770,7 @@ static int rt_fill_info(struct net *net,
 			struct sk_buff *skb, u32 pid, u32 seq, int event,
 			int nowait, unsigned int flags)
 {
-	struct rtable *rt = skb->rtable;
+	struct rtable *rt = skb_rtable(skb);
 	struct rtmsg *r;
 	struct nlmsghdr *nlh;
 	long expires;
@@ -2907,7 +2914,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
 		local_bh_enable();
 
-		rt = skb->rtable;
+		rt = skb_rtable(skb);
 		if (err == 0 && rt->u.dst.error)
 			err = -rt->u.dst.error;
 	} else {
@@ -2927,7 +2934,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
 	if (err)
 		goto errout_free;
 
-	skb->rtable = rt;
+	skb->dst = &rt->u.dst;
 	if (rtm->rtm_flags & RTM_F_NOTIFY)
 		rt->rt_flags |= RTCF_NOTIFY;
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index fc79e341628..319c8852644 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -546,7 +546,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 	if (th->rst)
 		return;
 
-	if (skb->rtable->rt_type != RTN_LOCAL)
+	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
 		return;
 
 	/* Swap the send and the receive. */
@@ -1185,7 +1185,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 #endif
 
 	/* Never answer to SYNs send to broadcast or multicast */
-	if (skb->rtable->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
+	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
 		goto drop;
 
 	/* TW buckets are converted to open requests without
-- 
cgit v1.2.3


From adf30907d63893e4208dfe3f5c88ae12bc2f25d5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 2 Jun 2009 05:19:30 +0000
Subject: net: skb->dst accessors

Define three accessors to get/set dst attached to a skb

struct dst_entry *skb_dst(const struct sk_buff *skb)

void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst)

void skb_dst_drop(struct sk_buff *skb)
This one should replace occurrences of :
dst_release(skb->dst)
skb->dst = NULL;

Delete skb->dst field

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/arp.c                         |  2 +-
 net/ipv4/icmp.c                        | 10 +++++-----
 net/ipv4/igmp.c                        |  4 ++--
 net/ipv4/ip_forward.c                  |  4 ++--
 net/ipv4/ip_fragment.c                 |  2 +-
 net/ipv4/ip_gre.c                      | 23 +++++++++++------------
 net/ipv4/ip_input.c                    |  6 +++---
 net/ipv4/ip_options.c                  |  6 +++---
 net/ipv4/ip_output.c                   | 20 ++++++++++----------
 net/ipv4/ipip.c                        | 13 ++++++-------
 net/ipv4/ipmr.c                        | 13 ++++++-------
 net/ipv4/netfilter.c                   | 28 ++++++++++++++++------------
 net/ipv4/netfilter/ipt_REJECT.c        |  7 +++----
 net/ipv4/netfilter/nf_nat_standalone.c |  7 +++----
 net/ipv4/raw.c                         |  2 +-
 net/ipv4/route.c                       | 14 +++++++-------
 net/ipv4/tcp_ipv4.c                    |  4 ++--
 net/ipv4/tcp_output.c                  |  2 +-
 net/ipv4/udp.c                         |  4 ++--
 net/ipv4/xfrm4_input.c                 |  2 +-
 net/ipv4/xfrm4_mode_tunnel.c           |  4 ++--
 net/ipv4/xfrm4_output.c                |  6 +++---
 22 files changed, 91 insertions(+), 92 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 816494f271a..8a3881e28ac 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -468,7 +468,7 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb)
 	__be32 paddr;
 	struct neighbour *n;
 
-	if (!skb->dst) {
+	if (!skb_dst(skb)) {
 		printk(KERN_DEBUG "arp_find is called with dst==NULL\n");
 		kfree_skb(skb);
 		return 1;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 94f75efae93..97c410e8438 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -591,13 +591,13 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 				goto relookup_failed;
 
 			/* Ugh! */
-			odst = skb_in->dst;
+			odst = skb_dst(skb_in);
 			err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src,
 					     RT_TOS(tos), rt2->u.dst.dev);
 
 			dst_release(&rt2->u.dst);
 			rt2 = skb_rtable(skb_in);
-			skb_in->dst = odst;
+			skb_dst_set(skb_in, odst);
 		}
 
 		if (err)
@@ -659,7 +659,7 @@ static void icmp_unreach(struct sk_buff *skb)
 	u32 info = 0;
 	struct net *net;
 
-	net = dev_net(skb->dst->dev);
+	net = dev_net(skb_dst(skb)->dev);
 
 	/*
 	 *	Incomplete header ?
@@ -822,7 +822,7 @@ static void icmp_echo(struct sk_buff *skb)
 {
 	struct net *net;
 
-	net = dev_net(skb->dst->dev);
+	net = dev_net(skb_dst(skb)->dev);
 	if (!net->ipv4.sysctl_icmp_echo_ignore_all) {
 		struct icmp_bxm icmp_param;
 
@@ -873,7 +873,7 @@ static void icmp_timestamp(struct sk_buff *skb)
 out:
 	return;
 out_err:
-	ICMP_INC_STATS_BH(dev_net(skb->dst->dev), ICMP_MIB_INERRORS);
+	ICMP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS);
 	goto out;
 }
 
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index afabd2758b6..01b4284ed69 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -311,7 +311,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
 		return NULL;
 	}
 
-	skb->dst = &rt->u.dst;
+	skb_dst_set(skb, &rt->u.dst);
 	skb->dev = dev;
 
 	skb_reserve(skb, LL_RESERVED_SPACE(dev));
@@ -659,7 +659,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
 		return -1;
 	}
 
-	skb->dst = &rt->u.dst;
+	skb_dst_set(skb, &rt->u.dst);
 
 	skb_reserve(skb, LL_RESERVED_SPACE(dev));
 
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 0761cd9bbd1..a2991bc8e32 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -42,7 +42,7 @@ static int ip_forward_finish(struct sk_buff *skb)
 {
 	struct ip_options * opt	= &(IPCB(skb)->opt);
 
-	IP_INC_STATS_BH(dev_net(skb->dst->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
+	IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
 
 	if (unlikely(opt->optlen))
 		ip_forward_options(skb);
@@ -123,7 +123,7 @@ sr_failed:
 
 too_many_hops:
 	/* Tell the sender its packet died... */
-	IP_INC_STATS_BH(dev_net(skb->dst->dev), IPSTATS_MIB_INHDRERRORS);
+	IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_INHDRERRORS);
 	icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
 drop:
 	kfree_skb(skb);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 7985346653b..1f1b82475ea 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -573,7 +573,7 @@ int ip_defrag(struct sk_buff *skb, u32 user)
 	struct ipq *qp;
 	struct net *net;
 
-	net = skb->dev ? dev_net(skb->dev) : dev_net(skb->dst->dev);
+	net = skb->dev ? dev_net(skb->dev) : dev_net(skb_dst(skb)->dev);
 	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);
 
 	/* Start by cleaning up the memory. */
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 85ddad45a91..44e2a3d2359 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -643,8 +643,7 @@ static int ipgre_rcv(struct sk_buff *skb)
 		stats->rx_packets++;
 		stats->rx_bytes += len;
 		skb->dev = tunnel->dev;
-		dst_release(skb->dst);
-		skb->dst = NULL;
+		skb_dst_drop(skb);
 		nf_reset(skb);
 
 		skb_reset_network_header(skb);
@@ -698,7 +697,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 	if ((dst = tiph->daddr) == 0) {
 		/* NBMA tunnel */
 
-		if (skb->dst == NULL) {
+		if (skb_dst(skb) == NULL) {
 			stats->tx_fifo_errors++;
 			goto tx_error;
 		}
@@ -712,7 +711,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 		else if (skb->protocol == htons(ETH_P_IPV6)) {
 			struct in6_addr *addr6;
 			int addr_type;
-			struct neighbour *neigh = skb->dst->neighbour;
+			struct neighbour *neigh = skb_dst(skb)->neighbour;
 
 			if (neigh == NULL)
 				goto tx_error;
@@ -766,10 +765,10 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 	if (df)
 		mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
 	else
-		mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;
+		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
 
-	if (skb->dst)
-		skb->dst->ops->update_pmtu(skb->dst, mtu);
+	if (skb_dst(skb))
+		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
 
 	if (skb->protocol == htons(ETH_P_IP)) {
 		df |= (old_iph->frag_off&htons(IP_DF));
@@ -783,14 +782,14 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 	}
 #ifdef CONFIG_IPV6
 	else if (skb->protocol == htons(ETH_P_IPV6)) {
-		struct rt6_info *rt6 = (struct rt6_info *)skb->dst;
+		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
 
-		if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) {
+		if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
 			if ((tunnel->parms.iph.daddr &&
 			     !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
 			    rt6->rt6i_dst.plen == 128) {
 				rt6->rt6i_flags |= RTF_MODIFIED;
-				skb->dst->metrics[RTAX_MTU-1] = mtu;
+				skb_dst(skb)->metrics[RTAX_MTU-1] = mtu;
 			}
 		}
 
@@ -837,8 +836,8 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
 	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
 			      IPSKB_REROUTED);
-	dst_release(skb->dst);
-	skb->dst = &rt->u.dst;
+	skb_dst_drop(skb);
+	skb_dst_set(skb, &rt->u.dst);
 
 	/*
 	 *	Push down and install the IPIP header.
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index cea784b0aa4..490ce20faf3 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -329,7 +329,7 @@ static int ip_rcv_finish(struct sk_buff *skb)
 	 *	Initialise the virtual path cache for the packet. It describes
 	 *	how the packet travels inside Linux networking.
 	 */
-	if (skb->dst == NULL) {
+	if (skb_dst(skb) == NULL) {
 		int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
 					 skb->dev);
 		if (unlikely(err)) {
@@ -344,9 +344,9 @@ static int ip_rcv_finish(struct sk_buff *skb)
 	}
 
 #ifdef CONFIG_NET_CLS_ROUTE
-	if (unlikely(skb->dst->tclassid)) {
+	if (unlikely(skb_dst(skb)->tclassid)) {
 		struct ip_rt_acct *st = per_cpu_ptr(ip_rt_acct, smp_processor_id());
-		u32 idx = skb->dst->tclassid;
+		u32 idx = skb_dst(skb)->tclassid;
 		st[idx&0xFF].o_packets++;
 		st[idx&0xFF].o_bytes += skb->len;
 		st[(idx>>16)&0xFF].i_packets++;
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 7e1074ffdbd..94bf105ef3c 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -143,7 +143,7 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb)
 						__be32 addr;
 
 						memcpy(&addr, sptr+soffset-1, 4);
-						if (inet_addr_type(dev_net(skb->dst->dev), addr) != RTN_LOCAL) {
+						if (inet_addr_type(dev_net(skb_dst(skb)->dev), addr) != RTN_LOCAL) {
 							dopt->ts_needtime = 1;
 							soffset += 8;
 						}
@@ -624,12 +624,12 @@ int ip_options_rcv_srr(struct sk_buff *skb)
 		memcpy(&nexthop, &optptr[srrptr-1], 4);
 
 		rt = skb_rtable(skb);
-		skb->dst = NULL;
+		skb_dst_set(skb, NULL);
 		err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev);
 		rt2 = skb_rtable(skb);
 		if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) {
 			ip_rt_put(rt2);
-			skb->dst = &rt->u.dst;
+			skb_dst_set(skb, &rt->u.dst);
 			return -EINVAL;
 		}
 		ip_rt_put(rt);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 8d845ebfcca..3d6167fb2d9 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -95,7 +95,7 @@ int __ip_local_out(struct sk_buff *skb)
 
 	iph->tot_len = htons(skb->len);
 	ip_send_check(iph);
-	return nf_hook(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev,
+	return nf_hook(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
 		       dst_output);
 }
 
@@ -118,7 +118,7 @@ static int ip_dev_loopback_xmit(struct sk_buff *newskb)
 	__skb_pull(newskb, skb_network_offset(newskb));
 	newskb->pkt_type = PACKET_LOOPBACK;
 	newskb->ip_summed = CHECKSUM_UNNECESSARY;
-	WARN_ON(!newskb->dst);
+	WARN_ON(!skb_dst(newskb));
 	netif_rx(newskb);
 	return 0;
 }
@@ -176,7 +176,7 @@ EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
 
 static inline int ip_finish_output2(struct sk_buff *skb)
 {
-	struct dst_entry *dst = skb->dst;
+	struct dst_entry *dst = skb_dst(skb);
 	struct rtable *rt = (struct rtable *)dst;
 	struct net_device *dev = dst->dev;
 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
@@ -217,14 +217,14 @@ static inline int ip_skb_dst_mtu(struct sk_buff *skb)
 	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
 
 	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
-	       skb->dst->dev->mtu : dst_mtu(skb->dst);
+	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
 }
 
 static int ip_finish_output(struct sk_buff *skb)
 {
 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 	/* Policy lookup after SNAT yielded a new policy */
-	if (skb->dst->xfrm != NULL) {
+	if (skb_dst(skb)->xfrm != NULL) {
 		IPCB(skb)->flags |= IPSKB_REROUTED;
 		return dst_output(skb);
 	}
@@ -296,7 +296,7 @@ int ip_mc_output(struct sk_buff *skb)
 
 int ip_output(struct sk_buff *skb)
 {
-	struct net_device *dev = skb->dst->dev;
+	struct net_device *dev = skb_dst(skb)->dev;
 
 	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
 
@@ -355,7 +355,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
 		}
 		sk_setup_caps(sk, &rt->u.dst);
 	}
-	skb->dst = dst_clone(&rt->u.dst);
+	skb_dst_set(skb, dst_clone(&rt->u.dst));
 
 packet_routed:
 	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
@@ -401,8 +401,8 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 	to->pkt_type = from->pkt_type;
 	to->priority = from->priority;
 	to->protocol = from->protocol;
-	dst_release(to->dst);
-	to->dst = dst_clone(from->dst);
+	skb_dst_drop(to);
+	skb_dst_set(to, dst_clone(skb_dst(from)));
 	to->dev = from->dev;
 	to->mark = from->mark;
 
@@ -1294,7 +1294,7 @@ int ip_push_pending_frames(struct sock *sk)
 	 * on dst refcount
 	 */
 	inet->cork.dst = NULL;
-	skb->dst = &rt->u.dst;
+	skb_dst_set(skb, &rt->u.dst);
 
 	if (iph->protocol == IPPROTO_ICMP)
 		icmp_out_count(net, ((struct icmphdr *)
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 0c6e7bf18a4..93e2b787da2 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -370,8 +370,7 @@ static int ipip_rcv(struct sk_buff *skb)
 		tunnel->dev->stats.rx_packets++;
 		tunnel->dev->stats.rx_bytes += skb->len;
 		skb->dev = tunnel->dev;
-		dst_release(skb->dst);
-		skb->dst = NULL;
+		skb_dst_drop(skb);
 		nf_reset(skb);
 		ipip_ecn_decapsulate(iph, skb);
 		netif_rx(skb);
@@ -447,15 +446,15 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 	if (tiph->frag_off)
 		mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
 	else
-		mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;
+		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
 
 	if (mtu < 68) {
 		stats->collisions++;
 		ip_rt_put(rt);
 		goto tx_error;
 	}
-	if (skb->dst)
-		skb->dst->ops->update_pmtu(skb->dst, mtu);
+	if (skb_dst(skb))
+		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
 
 	df |= (old_iph->frag_off&htons(IP_DF));
 
@@ -502,8 +501,8 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
 	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
 			      IPSKB_REROUTED);
-	dst_release(skb->dst);
-	skb->dst = &rt->u.dst;
+	skb_dst_drop(skb);
+	skb_dst_set(skb, &rt->u.dst);
 
 	/*
 	 *	Push down and install the IPIP header.
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 69dd058283e..ffd98610446 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -651,7 +651,7 @@ static int ipmr_cache_report(struct net *net,
 	ip_hdr(skb)->protocol = 0;			/* Flag to the kernel this is a route add */
 	msg = (struct igmpmsg *)skb_network_header(skb);
 	msg->im_vif = vifi;
-	skb->dst = dst_clone(pkt->dst);
+	skb_dst_set(skb, dst_clone(skb_dst(pkt)));
 
 	/*
 	 *	Add our header
@@ -1201,7 +1201,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 	iph->protocol	=	IPPROTO_IPIP;
 	iph->ihl	=	5;
 	iph->tot_len	=	htons(skb->len);
-	ip_select_ident(iph, skb->dst, NULL);
+	ip_select_ident(iph, skb_dst(skb), NULL);
 	ip_send_check(iph);
 
 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
@@ -1212,7 +1212,7 @@ static inline int ipmr_forward_finish(struct sk_buff *skb)
 {
 	struct ip_options * opt	= &(IPCB(skb)->opt);
 
-	IP_INC_STATS_BH(dev_net(skb->dst->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
+	IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
 
 	if (unlikely(opt->optlen))
 		ip_forward_options(skb);
@@ -1290,8 +1290,8 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
 	vif->pkt_out++;
 	vif->bytes_out += skb->len;
 
-	dst_release(skb->dst);
-	skb->dst = &rt->u.dst;
+	skb_dst_drop(skb);
+	skb_dst_set(skb, &rt->u.dst);
 	ip_decrease_ttl(ip_hdr(skb));
 
 	/* FIXME: forward and output firewalls used to be called here.
@@ -1543,8 +1543,7 @@ static int __pim_rcv(struct sk_buff *skb, unsigned int pimlen)
 	skb->protocol = htons(ETH_P_IP);
 	skb->ip_summed = 0;
 	skb->pkt_type = PACKET_HOST;
-	dst_release(skb->dst);
-	skb->dst = NULL;
+	skb_dst_drop(skb);
 	reg_dev->stats.rx_bytes += skb->len;
 	reg_dev->stats.rx_packets++;
 	nf_reset(skb);
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index fdf6811c31a..1725dc0ef68 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -12,7 +12,7 @@
 /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
 int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
 {
-	struct net *net = dev_net(skb->dst->dev);
+	struct net *net = dev_net(skb_dst(skb)->dev);
 	const struct iphdr *iph = ip_hdr(skb);
 	struct rtable *rt;
 	struct flowi fl = {};
@@ -41,8 +41,8 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
 			return -1;
 
 		/* Drop old route. */
-		dst_release(skb->dst);
-		skb->dst = &rt->u.dst;
+		skb_dst_drop(skb);
+		skb_dst_set(skb, &rt->u.dst);
 	} else {
 		/* non-local src, find valid iif to satisfy
 		 * rp-filter when calling ip_route_input. */
@@ -50,7 +50,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
 		if (ip_route_output_key(net, &rt, &fl) != 0)
 			return -1;
 
-		odst = skb->dst;
+		odst = skb_dst(skb);
 		if (ip_route_input(skb, iph->daddr, iph->saddr,
 				   RT_TOS(iph->tos), rt->u.dst.dev) != 0) {
 			dst_release(&rt->u.dst);
@@ -60,18 +60,22 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
 		dst_release(odst);
 	}
 
-	if (skb->dst->error)
+	if (skb_dst(skb)->error)
 		return -1;
 
 #ifdef CONFIG_XFRM
 	if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
-	    xfrm_decode_session(skb, &fl, AF_INET) == 0)
-		if (xfrm_lookup(net, &skb->dst, &fl, skb->sk, 0))
+	    xfrm_decode_session(skb, &fl, AF_INET) == 0) {
+		struct dst_entry *dst = skb_dst(skb);
+		skb_dst_set(skb, NULL);
+		if (xfrm_lookup(net, &dst, &fl, skb->sk, 0))
 			return -1;
+		skb_dst_set(skb, dst);
+	}
 #endif
 
 	/* Change in oif may mean change in hh_len. */
-	hh_len = skb->dst->dev->hard_header_len;
+	hh_len = skb_dst(skb)->dev->hard_header_len;
 	if (skb_headroom(skb) < hh_len &&
 	    pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC))
 		return -1;
@@ -92,7 +96,7 @@ int ip_xfrm_me_harder(struct sk_buff *skb)
 	if (xfrm_decode_session(skb, &fl, AF_INET) < 0)
 		return -1;
 
-	dst = skb->dst;
+	dst = skb_dst(skb);
 	if (dst->xfrm)
 		dst = ((struct xfrm_dst *)dst)->route;
 	dst_hold(dst);
@@ -100,11 +104,11 @@ int ip_xfrm_me_harder(struct sk_buff *skb)
 	if (xfrm_lookup(dev_net(dst->dev), &dst, &fl, skb->sk, 0) < 0)
 		return -1;
 
-	dst_release(skb->dst);
-	skb->dst = dst;
+	skb_dst_drop(skb);
+	skb_dst_set(skb, dst);
 
 	/* Change in oif may mean change in hh_len. */
-	hh_len = skb->dst->dev->hard_header_len;
+	hh_len = skb_dst(skb)->dev->hard_header_len;
 	if (skb_headroom(skb) < hh_len &&
 	    pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC))
 		return -1;
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 0b4b6e0ff2b..c93ae44bff2 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -108,17 +108,16 @@ static void send_reset(struct sk_buff *oldskb, int hook)
 		addr_type = RTN_LOCAL;
 
 	/* ip_route_me_harder expects skb->dst to be set */
-	dst_hold(oldskb->dst);
-	nskb->dst = oldskb->dst;
+	skb_dst_set(nskb, dst_clone(skb_dst(oldskb)));
 
 	if (ip_route_me_harder(nskb, addr_type))
 		goto free_nskb;
 
-	niph->ttl	= dst_metric(nskb->dst, RTAX_HOPLIMIT);
+	niph->ttl	= dst_metric(skb_dst(nskb), RTAX_HOPLIMIT);
 	nskb->ip_summed = CHECKSUM_NONE;
 
 	/* "Never happens" */
-	if (nskb->len > dst_mtu(nskb->dst))
+	if (nskb->len > dst_mtu(skb_dst(nskb)))
 		goto free_nskb;
 
 	nf_ct_attach(nskb, oldskb);
diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c
index b7dd695691a..5567bd0d075 100644
--- a/net/ipv4/netfilter/nf_nat_standalone.c
+++ b/net/ipv4/netfilter/nf_nat_standalone.c
@@ -167,10 +167,9 @@ nf_nat_in(unsigned int hooknum,
 
 	ret = nf_nat_fn(hooknum, skb, in, out, okfn);
 	if (ret != NF_DROP && ret != NF_STOLEN &&
-	    daddr != ip_hdr(skb)->daddr) {
-		dst_release(skb->dst);
-		skb->dst = NULL;
-	}
+	    daddr != ip_hdr(skb)->daddr)
+		skb_dst_drop(skb);
+
 	return ret;
 }
 
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index f774651f0a4..3dc9171a272 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -343,7 +343,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
 
 	skb->priority = sk->sk_priority;
 	skb->mark = sk->sk_mark;
-	skb->dst = dst_clone(&rt->u.dst);
+	skb_dst_set(skb, dst_clone(&rt->u.dst));
 
 	skb_reset_network_header(skb);
 	iph = ip_hdr(skb);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f20060ac2f0..a849bb15d86 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1118,7 +1118,7 @@ restart:
 			if (rp)
 				*rp = rth;
 			else
-				skb->dst = &rth->u.dst;
+				skb_dst_set(skb, &rth->u.dst);
 			return 0;
 		}
 
@@ -1217,7 +1217,7 @@ restart:
 	if (rp)
 		*rp = rt;
 	else
-		skb->dst = &rt->u.dst;
+		skb_dst_set(skb, &rt->u.dst);
 	return 0;
 }
 
@@ -2251,7 +2251,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 			dst_use(&rth->u.dst, jiffies);
 			RT_CACHE_STAT_INC(in_hit);
 			rcu_read_unlock();
-			skb->dst = &rth->u.dst;
+			skb_dst_set(skb, &rth->u.dst);
 			return 0;
 		}
 		RT_CACHE_STAT_INC(in_hlist_search);
@@ -2934,7 +2934,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
 	if (err)
 		goto errout_free;
 
-	skb->dst = &rt->u.dst;
+	skb_dst_set(skb, &rt->u.dst);
 	if (rtm->rtm_flags & RTM_F_NOTIFY)
 		rt->rt_flags |= RTCF_NOTIFY;
 
@@ -2975,15 +2975,15 @@ int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
 				continue;
 			if (rt_is_expired(rt))
 				continue;
-			skb->dst = dst_clone(&rt->u.dst);
+			skb_dst_set(skb, dst_clone(&rt->u.dst));
 			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
 					 1, NLM_F_MULTI) <= 0) {
-				dst_release(xchg(&skb->dst, NULL));
+				skb_dst_drop(skb);
 				rcu_read_unlock_bh();
 				goto done;
 			}
-			dst_release(xchg(&skb->dst, NULL));
+			skb_dst_drop(skb);
 		}
 		rcu_read_unlock_bh();
 	}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 319c8852644..5a1ca2698c8 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -590,7 +590,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 
-	net = dev_net(skb->dst->dev);
+	net = dev_net(skb_dst(skb)->dev);
 	ip_send_reply(net->ipv4.tcp_sock, skb,
 		      &arg, arg.iov[0].iov_len);
 
@@ -617,7 +617,7 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 			];
 	} rep;
 	struct ip_reply_arg arg;
-	struct net *net = dev_net(skb->dst->dev);
+	struct net *net = dev_net(skb_dst(skb)->dev);
 
 	memset(&rep.th, 0, sizeof(struct tcphdr));
 	memset(&arg, 0, sizeof(arg));
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 79c39dc9b01..416fc4c2e7e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2202,7 +2202,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 	/* Reserve space for headers. */
 	skb_reserve(skb, MAX_TCP_HEADER);
 
-	skb->dst = dst_clone(dst);
+	skb_dst_set(skb, dst_clone(dst));
 
 	mss = dst_metric(dst, RTAX_ADVMSS);
 	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 7a1d1ce22e6..8f4158d7c9a 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -328,7 +328,7 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
 	if (unlikely(sk = skb_steal_sock(skb)))
 		return sk;
 	else
-		return __udp4_lib_lookup(dev_net(skb->dst->dev), iph->saddr, sport,
+		return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport,
 					 iph->daddr, dport, inet_iif(skb),
 					 udptable);
 }
@@ -1237,7 +1237,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
 	struct sock *sk;
 	struct udphdr *uh;
 	unsigned short ulen;
-	struct rtable *rt = (struct rtable*)skb->dst;
+	struct rtable *rt = skb_rtable(skb);
 	__be32 saddr, daddr;
 	struct net *net = dev_net(skb->dev);
 
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 4ec2162a437..f9f922a0ba8 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -23,7 +23,7 @@ int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb)
 
 static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb)
 {
-	if (skb->dst == NULL) {
+	if (skb_dst(skb) == NULL) {
 		const struct iphdr *iph = ip_hdr(skb);
 
 		if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index 7135279f3f8..3444f3b34ec 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -28,7 +28,7 @@ static inline void ipip_ecn_decapsulate(struct sk_buff *skb)
  */
 static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
 {
-	struct dst_entry *dst = skb->dst;
+	struct dst_entry *dst = skb_dst(skb);
 	struct iphdr *top_iph;
 	int flags;
 
@@ -41,7 +41,7 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
 	top_iph->ihl = 5;
 	top_iph->version = 4;
 
-	top_iph->protocol = xfrm_af2proto(skb->dst->ops->family);
+	top_iph->protocol = xfrm_af2proto(skb_dst(skb)->ops->family);
 
 	/* DS disclosed */
 	top_iph->tos = INET_ECN_encapsulate(XFRM_MODE_SKB_CB(skb)->tos,
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index 8c3180adddb..c908bd99bcb 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -29,7 +29,7 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb)
 	if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->local_df)
 		goto out;
 
-	dst = skb->dst;
+	dst = skb_dst(skb);
 	mtu = dst_mtu(dst);
 	if (skb->len > mtu) {
 		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
@@ -72,7 +72,7 @@ EXPORT_SYMBOL(xfrm4_prepare_output);
 static int xfrm4_output_finish(struct sk_buff *skb)
 {
 #ifdef CONFIG_NETFILTER
-	if (!skb->dst->xfrm) {
+	if (!skb_dst(skb)->xfrm) {
 		IPCB(skb)->flags |= IPSKB_REROUTED;
 		return dst_output(skb);
 	}
@@ -87,6 +87,6 @@ static int xfrm4_output_finish(struct sk_buff *skb)
 int xfrm4_output(struct sk_buff *skb)
 {
 	return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb,
-			    NULL, skb->dst->dev, xfrm4_output_finish,
+			    NULL, skb_dst(skb)->dev, xfrm4_output_finish,
 			    !(IPCB(skb)->flags & IPSKB_REROUTED));
 }
-- 
cgit v1.2.3


From 2307f866f542f3397d24f78d0efd74f4ab214a96 Mon Sep 17 00:00:00 2001
From: Rami Rosen <ramirose@gmail.com>
Date: Wed, 3 Jun 2009 21:43:26 -0700
Subject: ipv4: remove ip_mc_drop_socket() declaration from af_inet.c.

ip_mc_drop_socket() method is declared in linux/igmp.h, which
is included anyhow in af_inet.c. So there is no need for this declaration.
This patch removes it from af_inet.c.

Signed-off-by: Rami Rosen <ramirose@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/af_inet.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index d8736217858..566ea6c4321 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -116,7 +116,6 @@
 #include <linux/mroute.h>
 #endif
 
-extern void ip_mc_drop_socket(struct sock *sk);
 
 /* The inetsw table contains everything that inet_create needs to
  * build a new socket.
-- 
cgit v1.2.3


From d7fcf1a5cae2c970e9afe7192fe0c13d931247e0 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 9 Jun 2009 00:19:37 -0700
Subject: ipv4: Use frag list abstraction interfaces.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_fragment.c | 4 ++--
 net/ipv4/ip_output.c   | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 1f1b82475ea..575f9bd51cc 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -507,7 +507,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 	/* If the first fragment is fragmented itself, we split
 	 * it to two chunks: the first with data and paged part
 	 * and the second, holding only fragments. */
-	if (skb_shinfo(head)->frag_list) {
+	if (skb_has_frags(head)) {
 		struct sk_buff *clone;
 		int i, plen = 0;
 
@@ -516,7 +516,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 		clone->next = head->next;
 		head->next = clone;
 		skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
-		skb_shinfo(head)->frag_list = NULL;
+		skb_frag_list_init(head);
 		for (i=0; i<skb_shinfo(head)->nr_frags; i++)
 			plen += skb_shinfo(head)->frags[i].size;
 		clone->len = clone->data_len = head->data_len - plen;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 3d6167fb2d9..9248d2807ba 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -474,7 +474,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 	 * LATER: this step can be merged to real generation of fragments,
 	 * we can switch to copy when see the first bad fragment.
 	 */
-	if (skb_shinfo(skb)->frag_list) {
+	if (skb_has_frags(skb)) {
 		struct sk_buff *frag;
 		int first_len = skb_pagelen(skb);
 		int truesizes = 0;
@@ -485,7 +485,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 		    skb_cloned(skb))
 			goto slow_path;
 
-		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
+		skb_walk_frags(skb, frag) {
 			/* Correct geometry. */
 			if (frag->len > mtu ||
 			    ((frag->len & 7) && frag->next) ||
@@ -510,7 +510,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 		err = 0;
 		offset = 0;
 		frag = skb_shinfo(skb)->frag_list;
-		skb_shinfo(skb)->frag_list = NULL;
+		skb_frag_list_init(skb);
 		skb->data_len = first_len - skb_headlen(skb);
 		skb->truesize -= truesizes;
 		skb->len = first_len;
-- 
cgit v1.2.3


From 343a99724e4431d8618bea2eb7552e12c965425a Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 9 Jun 2009 00:23:58 -0700
Subject: netfilter: Use frag list abstraction interfaces.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/nf_nat_proto_sctp.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/nf_nat_proto_sctp.c b/net/ipv4/netfilter/nf_nat_proto_sctp.c
index 65e470bc612..6983e41c0b0 100644
--- a/net/ipv4/netfilter/nf_nat_proto_sctp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_sctp.c
@@ -33,6 +33,7 @@ sctp_manip_pkt(struct sk_buff *skb,
 	       enum nf_nat_manip_type maniptype)
 {
 	const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
+	struct sk_buff *frag;
 	sctp_sctphdr_t *hdr;
 	unsigned int hdroff = iphdroff + iph->ihl*4;
 	__be32 oldip, newip;
@@ -57,8 +58,8 @@ sctp_manip_pkt(struct sk_buff *skb,
 	}
 
 	crc32 = sctp_start_cksum((u8 *)hdr, skb_headlen(skb) - hdroff);
-	for (skb = skb_shinfo(skb)->frag_list; skb; skb = skb->next)
-		crc32 = sctp_update_cksum((u8 *)skb->data, skb_headlen(skb),
+	skb_walk_frags(skb, frag);
+		crc32 = sctp_update_cksum((u8 *)frag->data, skb_headlen(frag),
 					  crc32);
 	crc32 = sctp_end_cksum(crc32);
 	hdr->checksum = crc32;
-- 
cgit v1.2.3


From 0808dc80939b08ec215f472e17a5d8f6b148037e Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 9 Jun 2009 18:05:28 -0700
Subject: netfilter: Fix extra semi-colon in skb_walk_frags() changes.

Noticed by Jesper Dangaard Brouer

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/nf_nat_proto_sctp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/nf_nat_proto_sctp.c b/net/ipv4/netfilter/nf_nat_proto_sctp.c
index 6983e41c0b0..3fc598eeeb1 100644
--- a/net/ipv4/netfilter/nf_nat_proto_sctp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_sctp.c
@@ -58,7 +58,7 @@ sctp_manip_pkt(struct sk_buff *skb,
 	}
 
 	crc32 = sctp_start_cksum((u8 *)hdr, skb_headlen(skb) - hdroff);
-	skb_walk_frags(skb, frag);
+	skb_walk_frags(skb, frag)
 		crc32 = sctp_update_cksum((u8 *)frag->data, skb_headlen(frag),
 					  crc32);
 	crc32 = sctp_end_cksum(crc32);
-- 
cgit v1.2.3


From 2b85a34e911bf483c27cfdd124aeb1605145dc80 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 11 Jun 2009 02:55:43 -0700
Subject: net: No more expensive sock_hold()/sock_put() on each tx

One of the problem with sock memory accounting is it uses
a pair of sock_hold()/sock_put() for each transmitted packet.

This slows down bidirectional flows because the receive path
also needs to take a refcount on socket and might use a different
cpu than transmit path or transmit completion path. So these
two atomic operations also trigger cache line bounces.

We can see this in tx or tx/rx workloads (media gateways for example),
where sock_wfree() can be in top five functions in profiles.

We use this sock_hold()/sock_put() so that sock freeing
is delayed until all tx packets are completed.

As we also update sk_wmem_alloc, we could offset sk_wmem_alloc
by one unit at init time, until sk_free() is called.
Once sk_free() is called, we atomic_dec_and_test(sk_wmem_alloc)
to decrement initial offset and atomicaly check if any packets
are in flight.

skb_set_owner_w() doesnt call sock_hold() anymore

sock_wfree() doesnt call sock_put() anymore, but check if sk_wmem_alloc
reached 0 to perform the final freeing.

Drawback is that a skb->truesize error could lead to unfreeable sockets, or
even worse, prematurely calling __sk_free() on a live socket.

Nice speedups on SMP. tbench for example, going from 2691 MB/s to 2711 MB/s
on my 8 cpu dev machine, even if tbench was not really hitting sk_refcnt
contention point. 5 % speedup on a UDP transmit workload (depends
on number of flows), lowering TX completion cpu usage.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_output.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 9248d2807ba..24702628266 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -498,7 +498,6 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 
 			BUG_ON(frag->sk);
 			if (skb->sk) {
-				sock_hold(skb->sk);
 				frag->sk = skb->sk;
 				frag->destructor = sock_wfree;
 				truesizes += frag->truesize;
-- 
cgit v1.2.3