9 files changed, 155 insertions, 83 deletions
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 4286d832166..fdfb8fe8c38 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -603,23 +603,23 @@ static inline void skb_queue_head_init(struct sk_buff_head *list)
  */
 
 /**
- *	__skb_queue_head - queue a buffer at the list head
+ *	__skb_queue_after - queue a buffer at the list head
  *	@list: list to use
+ *	@prev: place after this buffer
  *	@newsk: buffer to queue
  *
- *	Queue a buffer at the start of a list. This function takes no locks
+ *	Queue a buffer int the middle of a list. This function takes no locks
  *	and you must therefore hold required locks before calling it.
  *
  *	A buffer cannot be placed on two lists at the same time.
  */
-extern void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk);
-static inline void __skb_queue_head(struct sk_buff_head *list,
-				    struct sk_buff *newsk)
+static inline void __skb_queue_after(struct sk_buff_head *list,
+				     struct sk_buff *prev,
+				     struct sk_buff *newsk)
 {
-	struct sk_buff *prev, *next;
-
+	struct sk_buff *next;
 	list->qlen++;
-	prev = (struct sk_buff *)list;
+
 	next = prev->next;
 	newsk->next = next;
 	newsk->prev = prev;
@@ -627,6 +627,23 @@ static inline void __skb_queue_head(struct sk_buff_head *list,
 }
 
 /**
+ *	__skb_queue_head - queue a buffer at the list head
+ *	@list: list to use
+ *	@newsk: buffer to queue
+ *
+ *	Queue a buffer at the start of a list. This function takes no locks
+ *	and you must therefore hold required locks before calling it.
+ *
+ *	A buffer cannot be placed on two lists at the same time.
+ */
+extern void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk);
+static inline void __skb_queue_head(struct sk_buff_head *list,
+				    struct sk_buff *newsk)
+{
+	__skb_queue_after(list, (struct sk_buff *)list, newsk);
+}
+
+/**
  *	__skb_queue_tail - queue a buffer at the list tail
  *	@list: list to use
  *	@newsk: buffer to queue
@@ -1203,6 +1220,11 @@ static inline void kunmap_skb_frag(void *vaddr)
 		     prefetch(skb->next), (skb != (struct sk_buff *)(queue));	\
 		     skb = skb->next)
 
+#define skb_queue_reverse_walk(queue, skb) \
+		for (skb = (queue)->prev;					\
+		     prefetch(skb->prev), (skb != (struct sk_buff *)(queue));	\
+		     skb = skb->prev)
+
 
 extern struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags,
 					 int noblock, int *err);
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index f50f9596834..07840baa934 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -125,9 +125,7 @@ struct inet_hashinfo {
 	rwlock_t			lhash_lock ____cacheline_aligned;
 	atomic_t			lhash_users;
 	wait_queue_head_t		lhash_wait;
-	spinlock_t			portalloc_lock;
 	kmem_cache_t			*bind_bucket_cachep;
-	int				port_rover;
 };
 
 static inline unsigned int inet_ehashfn(const __u32 laddr, const __u16 lport,
diff --git a/net/core/stream.c b/net/core/stream.c
index ac9edfdf874..15bfd03e802 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -52,8 +52,9 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
 {
 	struct task_struct *tsk = current;
 	DEFINE_WAIT(wait);
+	int done;
 
-	while (1) {
+	do {
 		if (sk->sk_err)
 			return sock_error(sk);
 		if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV))
@@ -65,13 +66,12 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
 
 		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
 		sk->sk_write_pending++;
-		if (sk_wait_event(sk, timeo_p,
-				  !((1 << sk->sk_state) & 
-				    ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))))
-			break;
+		done = sk_wait_event(sk, timeo_p,
+				     !((1 << sk->sk_state) & 
+				       ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)));
 		finish_wait(sk->sk_sleep, &wait);
 		sk->sk_write_pending--;
-	}
+	} while (!done);
 	return 0;
 }
 
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 6298cf58ff9..4b9bc81ae1a 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -31,8 +31,6 @@ struct inet_hashinfo __cacheline_aligned dccp_hashinfo = {
 	.lhash_lock	= RW_LOCK_UNLOCKED,
 	.lhash_users	= ATOMIC_INIT(0),
 	.lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(dccp_hashinfo.lhash_wait),
-	.portalloc_lock	= SPIN_LOCK_UNLOCKED,
-	.port_rover	= 1024 - 1,
 };
 
 EXPORT_SYMBOL_GPL(dccp_hashinfo);
@@ -125,36 +123,15 @@ static int dccp_v4_hash_connect(struct sock *sk)
 	int ret;
 
  	if (snum == 0) {
- 		int rover;
  		int low = sysctl_local_port_range[0];
  		int high = sysctl_local_port_range[1];
  		int remaining = (high - low) + 1;
+ 		int rover = net_random() % (high - low) + low;
 		struct hlist_node *node;
  		struct inet_timewait_sock *tw = NULL;
 
  		local_bh_disable();
-
- 		/* TODO. Actually it is not so bad idea to remove
- 		 * dccp_hashinfo.portalloc_lock before next submission to
-		 * Linus.
- 		 * As soon as we touch this place at all it is time to think.
- 		 *
- 		 * Now it protects single _advisory_ variable
-		 * dccp_hashinfo.port_rover, hence it is mostly useless.
- 		 * Code will work nicely if we just delete it, but
- 		 * I am afraid in contented case it will work not better or
- 		 * even worse: another cpu just will hit the same bucket
- 		 * and spin there.
- 		 * So some cpu salt could remove both contention and
- 		 * memory pingpong. Any ideas how to do this in a nice way?
- 		 */
- 		spin_lock(&dccp_hashinfo.portalloc_lock);
- 		rover = dccp_hashinfo.port_rover;
-
  		do {
- 			rover++;
- 			if ((rover < low) || (rover > high))
- 				rover = low;
  			head = &dccp_hashinfo.bhash[inet_bhashfn(rover,
 						    dccp_hashinfo.bhash_size)];
  			spin_lock(&head->lock);
@@ -187,9 +164,9 @@ static int dccp_v4_hash_connect(struct sock *sk)
 
  		next_port:
  			spin_unlock(&head->lock);
+ 			if (++rover > high)
+ 				rover = low;
  		} while (--remaining > 0);
- 		dccp_hashinfo.port_rover = rover;
- 		spin_unlock(&dccp_hashinfo.portalloc_lock);
 
  		local_bh_enable();
 
@@ -197,9 +174,6 @@ static int dccp_v4_hash_connect(struct sock *sk)
 
 ok:
  		/* All locks still held and bhs disabled */
- 		dccp_hashinfo.port_rover = rover;
- 		spin_unlock(&dccp_hashinfo.portalloc_lock);
-
  		inet_bind_hash(sk, tb, rover);
 		if (sk_unhashed(sk)) {
  			inet_sk(sk)->sport = htons(rover);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 94468a76c5b..3fe021f1a56 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -78,17 +78,9 @@ int inet_csk_get_port(struct inet_hashinfo *hashinfo,
 		int low = sysctl_local_port_range[0];
 		int high = sysctl_local_port_range[1];
 		int remaining = (high - low) + 1;
-		int rover;
+		int rover = net_random() % (high - low) + low;
 
-		spin_lock(&hashinfo->portalloc_lock);
-		if (hashinfo->port_rover < low)
-			rover = low;
-		else
-			rover = hashinfo->port_rover;
 		do {
-			rover++;
-			if (rover > high)
-				rover = low;
 			head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)];
 			spin_lock(&head->lock);
 			inet_bind_bucket_for_each(tb, node, &head->chain)
@@ -97,9 +89,9 @@ int inet_csk_get_port(struct inet_hashinfo *hashinfo,
 			break;
 		next:
 			spin_unlock(&head->lock);
+			if (++rover > high)
+				rover = low;
 		} while (--remaining > 0);
-		hashinfo->port_rover = rover;
-		spin_unlock(&hashinfo->portalloc_lock);
 
 		/* Exhausted local port range during search?  It is not
 		 * possible for us to be holding one of the bind hash
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f3f0013a958..72b7c22e1ea 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2112,7 +2112,6 @@ void __init tcp_init(void)
 		sysctl_tcp_max_orphans >>= (3 - order);
 		sysctl_max_syn_backlog = 128;
 	}
-	tcp_hashinfo.port_rover = sysctl_local_port_range[0] - 1;
 
 	sysctl_tcp_mem[0] =  768 << order;
 	sysctl_tcp_mem[1] = 1024 << order;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index c85819d8474..49d67cd75ed 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -93,8 +93,6 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
 	.lhash_lock	= RW_LOCK_UNLOCKED,
 	.lhash_users	= ATOMIC_INIT(0),
 	.lhash_wait	= __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
-	.portalloc_lock	= SPIN_LOCK_UNLOCKED,
-	.port_rover	= 1024 - 1,
 };
 
 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index d693cb988b7..d746d3b27ef 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -114,16 +114,9 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
 		int low = sysctl_local_port_range[0];
 		int high = sysctl_local_port_range[1];
 		int remaining = (high - low) + 1;
-		int rover;
+		int rover = net_random() % (high - low) + low;
 
-		spin_lock(&tcp_hashinfo.portalloc_lock);
-		if (tcp_hashinfo.port_rover < low)
-			rover = low;
-		else
-			rover = tcp_hashinfo.port_rover;
-		do {	rover++;
-			if (rover > high)
-				rover = low;
+		do {
 			head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)];
 			spin_lock(&head->lock);
 			inet_bind_bucket_for_each(tb, node, &head->chain)
@@ -132,9 +125,9 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
 			break;
 		next:
 			spin_unlock(&head->lock);
+			if (++rover > high)
+				rover = low;
 		} while (--remaining > 0);
-		tcp_hashinfo.port_rover = rover;
-		spin_unlock(&tcp_hashinfo.portalloc_lock);
 
 		/* Exhausted local port range during search?  It is not
 		 * possible for us to be holding one of the bind hash
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index bb9bf8d5003..cdc8d283791 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -25,6 +25,8 @@
 
 #include <net/pkt_sched.h>
 
+#define VERSION "1.1"
+
 /*	Network Emulation Queuing algorithm.
 	====================================
 
@@ -185,10 +187,13 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	    || q->counter < q->gap 	/* inside last reordering gap */
 	    || q->reorder < get_crandom(&q->reorder_cor)) {
 		psched_time_t now;
+		psched_tdiff_t delay;
+
+		delay = tabledist(q->latency, q->jitter,
+				  &q->delay_cor, q->delay_dist);
+
 		PSCHED_GET_TIME(now);
-		PSCHED_TADD2(now, tabledist(q->latency, q->jitter, 
-					    &q->delay_cor, q->delay_dist),
-			     cb->time_to_send);
+		PSCHED_TADD2(now, delay, cb->time_to_send);
 		++q->counter;
 		ret = q->qdisc->enqueue(skb, q->qdisc);
 	} else {
@@ -248,24 +253,31 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
 		const struct netem_skb_cb *cb
 			= (const struct netem_skb_cb *)skb->cb;
 		psched_time_t now;
-		long delay;
 
 		/* if more time remaining? */
 		PSCHED_GET_TIME(now);
-		delay = PSCHED_US2JIFFIE(PSCHED_TDIFF(cb->time_to_send, now));
-		pr_debug("netem_run: skb=%p delay=%ld\n", skb, delay);
-		if (delay <= 0) {
+
+		if (PSCHED_TLESS(cb->time_to_send, now)) {
 			pr_debug("netem_dequeue: return skb=%p\n", skb);
 			sch->q.qlen--;
 			sch->flags &= ~TCQ_F_THROTTLED;
 			return skb;
-		}
+		} else {
+			psched_tdiff_t delay = PSCHED_TDIFF(cb->time_to_send, now);
+
+			if (q->qdisc->ops->requeue(skb, q->qdisc) != NET_XMIT_SUCCESS) {
+				sch->qstats.drops++;
 
-		mod_timer(&q->timer, jiffies + delay);
-		sch->flags |= TCQ_F_THROTTLED;
+				/* After this qlen is confused */
+				printk(KERN_ERR "netem: queue discpline %s could not requeue\n",
+				       q->qdisc->ops->id);
 
-		if (q->qdisc->ops->requeue(skb, q->qdisc) != 0)
-			sch->qstats.drops++;
+				sch->q.qlen--;
+			}
+
+			mod_timer(&q->timer, jiffies + PSCHED_US2JIFFIE(delay));
+			sch->flags |= TCQ_F_THROTTLED;
+		}
 	}
 
 	return NULL;
@@ -290,11 +302,16 @@ static void netem_reset(struct Qdisc *sch)
 	del_timer_sync(&q->timer);
 }
 
+/* Pass size change message down to embedded FIFO */
 static int set_fifo_limit(struct Qdisc *q, int limit)
 {
         struct rtattr *rta;
 	int ret = -ENOMEM;
 
+	/* Hack to avoid sending change message to non-FIFO */
+	if (strncmp(q->ops->id + 1, "fifo", 4) != 0)
+		return 0;
+
 	rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL);
 	if (rta) {
 		rta->rta_type = RTM_NEWQDISC;
@@ -426,6 +443,84 @@ static int netem_change(struct Qdisc *sch, struct rtattr *opt)
 	return 0;
 }
 
+/*
+ * Special case version of FIFO queue for use by netem.
+ * It queues in order based on timestamps in skb's
+ */
+struct fifo_sched_data {
+	u32 limit;
+};
+
+static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
+{
+	struct fifo_sched_data *q = qdisc_priv(sch);
+	struct sk_buff_head *list = &sch->q;
+	const struct netem_skb_cb *ncb
+		= (const struct netem_skb_cb *)nskb->cb;
+	struct sk_buff *skb;
+
+	if (likely(skb_queue_len(list) < q->limit)) {
+		skb_queue_reverse_walk(list, skb) {
+			const struct netem_skb_cb *cb
+				= (const struct netem_skb_cb *)skb->cb;
+
+			if (PSCHED_TLESS(cb->time_to_send, ncb->time_to_send))
+				break;
+		}
+
+		__skb_queue_after(list, skb, nskb);
+
+		sch->qstats.backlog += nskb->len;
+		sch->bstats.bytes += nskb->len;
+		sch->bstats.packets++;
+
+		return NET_XMIT_SUCCESS;
+	}
+
+	return qdisc_drop(nskb, sch);
+}
+
+static int tfifo_init(struct Qdisc *sch, struct rtattr *opt)
+{
+	struct fifo_sched_data *q = qdisc_priv(sch);
+
+	if (opt) {
+		struct tc_fifo_qopt *ctl = RTA_DATA(opt);
+		if (RTA_PAYLOAD(opt) < sizeof(*ctl))
+			return -EINVAL;
+
+		q->limit = ctl->limit;
+	} else
+		q->limit = max_t(u32, sch->dev->tx_queue_len, 1);
+
+	return 0;
+}
+
+static int tfifo_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct fifo_sched_data *q = qdisc_priv(sch);
+	struct tc_fifo_qopt opt = { .limit = q->limit };
+
+	RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
+	return skb->len;
+
+rtattr_failure:
+	return -1;
+}
+
+static struct Qdisc_ops tfifo_qdisc_ops = {
+	.id		=	"tfifo",
+	.priv_size	=	sizeof(struct fifo_sched_data),
+	.enqueue	=	tfifo_enqueue,
+	.dequeue	=	qdisc_dequeue_head,
+	.requeue	=	qdisc_requeue,
+	.drop		=	qdisc_queue_drop,
+	.init		=	tfifo_init,
+	.reset		=	qdisc_reset_queue,
+	.change		=	tfifo_init,
+	.dump		=	tfifo_dump,
+};
+
 static int netem_init(struct Qdisc *sch, struct rtattr *opt)
 {
 	struct netem_sched_data *q = qdisc_priv(sch);
@@ -438,7 +533,7 @@ static int netem_init(struct Qdisc *sch, struct rtattr *opt)
 	q->timer.function = netem_watchdog;
 	q->timer.data = (unsigned long) sch;
 
-	q->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
+	q->qdisc = qdisc_create_dflt(sch->dev, &tfifo_qdisc_ops);
 	if (!q->qdisc) {
 		pr_debug("netem: qdisc create failed\n");
 		return -ENOMEM;
@@ -601,6 +696,7 @@ static struct Qdisc_ops netem_qdisc_ops = {
 
 static int __init netem_module_init(void)
 {
+	pr_info("netem: version " VERSION "\n");
 	return register_qdisc(&netem_qdisc_ops);
 }
 static void __exit netem_module_exit(void)