diff options
author | Dmitry Torokhov <dtor_core@ameritech.net> | 2005-07-11 00:58:04 -0500 |
---|---|---|
committer | Dmitry Torokhov <dtor_core@ameritech.net> | 2005-07-11 00:58:04 -0500 |
commit | e0d21d9cca25f424f3129649be48a63c128ed42d (patch) | |
tree | 0a7d407639876e02deef1721817615eaa8c673a3 /net | |
parent | beffbdc2211826b174c68307b1b48c93c05d7ded (diff) | |
parent | 5c23804a0941a111752fdacefe0bea2db1b4d93f (diff) |
Merge rsync://www.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
Diffstat (limited to 'net')
48 files changed, 1120 insertions, 576 deletions
diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c index 2e341de3e76..901eff7ebe7 100644 --- a/net/bluetooth/cmtp/core.c +++ b/net/bluetooth/cmtp/core.c @@ -213,7 +213,7 @@ static int cmtp_send_frame(struct cmtp_session *session, unsigned char *data, in return kernel_sendmsg(sock, &msg, &iv, 1, len); } -static int cmtp_process_transmit(struct cmtp_session *session) +static void cmtp_process_transmit(struct cmtp_session *session) { struct sk_buff *skb, *nskb; unsigned char *hdr; @@ -223,7 +223,7 @@ static int cmtp_process_transmit(struct cmtp_session *session) if (!(nskb = alloc_skb(session->mtu, GFP_ATOMIC))) { BT_ERR("Can't allocate memory for new frame"); - return -ENOMEM; + return; } while ((skb = skb_dequeue(&session->transmit))) { @@ -275,8 +275,6 @@ static int cmtp_process_transmit(struct cmtp_session *session) cmtp_send_frame(session, nskb->data, nskb->len); kfree_skb(nskb); - - return skb_queue_len(&session->transmit); } static int cmtp_session(void *arg) diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index affbc55462e..de8af5f4239 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -428,7 +428,7 @@ static int hidp_send_frame(struct socket *sock, unsigned char *data, int len) return kernel_sendmsg(sock, &msg, &iv, 1, len); } -static int hidp_process_transmit(struct hidp_session *session) +static void hidp_process_transmit(struct hidp_session *session) { struct sk_buff *skb; @@ -453,9 +453,6 @@ static int hidp_process_transmit(struct hidp_session *session) hidp_set_timer(session); kfree_skb(skb); } - - return skb_queue_len(&session->ctrl_transmit) + - skb_queue_len(&session->intr_transmit); } static int hidp_session(void *arg) diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index f3f6355a278..63a123c5c41 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -590,8 +590,11 @@ static long rfcomm_sock_data_wait(struct sock *sk, long timeo) for (;;) { set_current_state(TASK_INTERRUPTIBLE); - if (skb_queue_len(&sk->sk_receive_queue) || sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN) || - signal_pending(current) || !timeo) + if (!skb_queue_empty(&sk->sk_receive_queue) || + sk->sk_err || + (sk->sk_shutdown & RCV_SHUTDOWN) || + signal_pending(current) || + !timeo) break; set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index 6d689200bcf..6304590fd36 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -781,7 +781,7 @@ static int rfcomm_tty_chars_in_buffer(struct tty_struct *tty) BT_DBG("tty %p dev %p", tty, dev); - if (skb_queue_len(&dlc->tx_queue)) + if (!skb_queue_empty(&dlc->tx_queue)) return dlc->mtu; return 0; diff --git a/net/core/dev.c b/net/core/dev.c index 7016e0c36b3..ff9dc029233 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1127,7 +1127,7 @@ static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb) extern void skb_release_data(struct sk_buff *); /* Keep head the same: replace data */ -int __skb_linearize(struct sk_buff *skb, int gfp_mask) +int __skb_linearize(struct sk_buff *skb, unsigned int __nocast gfp_mask) { unsigned int size; u8 *data; @@ -2089,10 +2089,11 @@ void dev_set_promiscuity(struct net_device *dev, int inc) { unsigned short old_flags = dev->flags; - dev->flags |= IFF_PROMISC; if ((dev->promiscuity += inc) == 0) dev->flags &= ~IFF_PROMISC; - if (dev->flags ^ old_flags) { + else + dev->flags |= IFF_PROMISC; + if (dev->flags != old_flags) { dev_mc_upload(dev); printk(KERN_INFO "device %s %s promiscuous mode\n", dev->name, (dev->flags & IFF_PROMISC) ? "entered" : diff --git a/net/core/filter.c b/net/core/filter.c index f3b88205ace..cd91a24f972 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -36,7 +36,7 @@ #include <linux/filter.h> /* No hurry in this branch */ -static u8 *load_pointer(struct sk_buff *skb, int k) +static void *__load_pointer(struct sk_buff *skb, int k) { u8 *ptr = NULL; @@ -50,6 +50,18 @@ static u8 *load_pointer(struct sk_buff *skb, int k) return NULL; } +static inline void *load_pointer(struct sk_buff *skb, int k, + unsigned int size, void *buffer) +{ + if (k >= 0) + return skb_header_pointer(skb, k, size, buffer); + else { + if (k >= SKF_AD_OFF) + return NULL; + return __load_pointer(skb, k); + } +} + /** * sk_run_filter - run a filter on a socket * @skb: buffer to run the filter on @@ -64,15 +76,12 @@ static u8 *load_pointer(struct sk_buff *skb, int k) int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen) { - unsigned char *data = skb->data; - /* len is UNSIGNED. Byte wide insns relies only on implicit - type casts to prevent reading arbitrary memory locations. - */ - unsigned int len = skb->len-skb->data_len; struct sock_filter *fentry; /* We walk down these */ + void *ptr; u32 A = 0; /* Accumulator */ u32 X = 0; /* Index Register */ u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */ + u32 tmp; int k; int pc; @@ -168,86 +177,35 @@ int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen) case BPF_LD|BPF_W|BPF_ABS: k = fentry->k; load_w: - if (k >= 0 && (unsigned int)(k+sizeof(u32)) <= len) { - A = ntohl(*(u32*)&data[k]); + ptr = load_pointer(skb, k, 4, &tmp); + if (ptr != NULL) { + A = ntohl(*(u32 *)ptr); continue; } - if (k < 0) { - u8 *ptr; - - if (k >= SKF_AD_OFF) - break; - ptr = load_pointer(skb, k); - if (ptr) { - A = ntohl(*(u32*)ptr); - continue; - } - } else { - u32 _tmp, *p; - p = skb_header_pointer(skb, k, 4, &_tmp); - if (p != NULL) { - A = ntohl(*p); - continue; - } - } return 0; case BPF_LD|BPF_H|BPF_ABS: k = fentry->k; load_h: - if (k >= 0 && (unsigned int)(k + sizeof(u16)) <= len) { - A = ntohs(*(u16*)&data[k]); + ptr = load_pointer(skb, k, 2, &tmp); + if (ptr != NULL) { + A = ntohs(*(u16 *)ptr); continue; } - if (k < 0) { - u8 *ptr; - - if (k >= SKF_AD_OFF) - break; - ptr = load_pointer(skb, k); - if (ptr) { - A = ntohs(*(u16*)ptr); - continue; - } - } else { - u16 _tmp, *p; - p = skb_header_pointer(skb, k, 2, &_tmp); - if (p != NULL) { - A = ntohs(*p); - continue; - } - } return 0; case BPF_LD|BPF_B|BPF_ABS: k = fentry->k; load_b: - if (k >= 0 && (unsigned int)k < len) { - A = data[k]; + ptr = load_pointer(skb, k, 1, &tmp); + if (ptr != NULL) { + A = *(u8 *)ptr; continue; } - if (k < 0) { - u8 *ptr; - - if (k >= SKF_AD_OFF) - break; - ptr = load_pointer(skb, k); - if (ptr) { - A = *ptr; - continue; - } - } else { - u8 _tmp, *p; - p = skb_header_pointer(skb, k, 1, &_tmp); - if (p != NULL) { - A = *p; - continue; - } - } return 0; case BPF_LD|BPF_W|BPF_LEN: - A = len; + A = skb->len; continue; case BPF_LDX|BPF_W|BPF_LEN: - X = len; + X = skb->len; continue; case BPF_LD|BPF_W|BPF_IND: k = X + fentry->k; @@ -259,10 +217,12 @@ load_b: k = X + fentry->k; goto load_b; case BPF_LDX|BPF_B|BPF_MSH: - if (fentry->k >= len) - return 0; - X = (data[fentry->k] & 0xf) << 2; - continue; + ptr = load_pointer(skb, fentry->k, 1, &tmp); + if (ptr != NULL) { + X = (*(u8 *)ptr & 0xf) << 2; + continue; + } + return 0; case BPF_LD|BPF_IMM: A = fentry->k; continue; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index bb73b2190ec..d9f7b06fe88 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -129,7 +129,7 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here) * Buffers may only be allocated from interrupts using a @gfp_mask of * %GFP_ATOMIC. */ -struct sk_buff *alloc_skb(unsigned int size, int gfp_mask) +struct sk_buff *alloc_skb(unsigned int size, unsigned int __nocast gfp_mask) { struct sk_buff *skb; u8 *data; @@ -182,7 +182,8 @@ nodata: * %GFP_ATOMIC. */ struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp, - unsigned int size, int gfp_mask) + unsigned int size, + unsigned int __nocast gfp_mask) { struct sk_buff *skb; u8 *data; @@ -322,7 +323,7 @@ void __kfree_skb(struct sk_buff *skb) * %GFP_ATOMIC. */ -struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask) +struct sk_buff *skb_clone(struct sk_buff *skb, unsigned int __nocast gfp_mask) { struct sk_buff *n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); @@ -357,7 +358,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask) C(ip_summed); C(priority); C(protocol); - C(security); n->destructor = NULL; #ifdef CONFIG_NETFILTER C(nfmark); @@ -422,7 +422,6 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->pkt_type = old->pkt_type; new->stamp = old->stamp; new->destructor = NULL; - new->security = old->security; #ifdef CONFIG_NETFILTER new->nfmark = old->nfmark; new->nfcache = old->nfcache; @@ -462,7 +461,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) * header is going to be modified. Use pskb_copy() instead. */ -struct sk_buff *skb_copy(const struct sk_buff *skb, int gfp_mask) +struct sk_buff *skb_copy(const struct sk_buff *skb, unsigned int __nocast gfp_mask) { int headerlen = skb->data - skb->head; /* @@ -501,7 +500,7 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, int gfp_mask) * The returned buffer has a reference count of 1. */ -struct sk_buff *pskb_copy(struct sk_buff *skb, int gfp_mask) +struct sk_buff *pskb_copy(struct sk_buff *skb, unsigned int __nocast gfp_mask) { /* * Allocate the copy buffer @@ -559,7 +558,8 @@ out: * reloaded after call to this function. */ -int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, int gfp_mask) +int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, + unsigned int __nocast gfp_mask) { int i; u8 *data; @@ -649,7 +649,8 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) * only by netfilter in the cases when checksum is recalculated? --ANK */ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, - int newheadroom, int newtailroom, int gfp_mask) + int newheadroom, int newtailroom, + unsigned int __nocast gfp_mask) { /* * Allocate the copy buffer diff --git a/net/core/sock.c b/net/core/sock.c index a6ec3ada7f9..8b35ccdc2b3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -622,7 +622,8 @@ lenout: * @prot: struct proto associated with this new sock instance * @zero_it: if we should zero the newly allocated sock */ -struct sock *sk_alloc(int family, int priority, struct proto *prot, int zero_it) +struct sock *sk_alloc(int family, unsigned int __nocast priority, + struct proto *prot, int zero_it) { struct sock *sk = NULL; kmem_cache_t *slab = prot->slab; @@ -750,7 +751,8 @@ unsigned long sock_i_ino(struct sock *sk) /* * Allocate a skb from the socket's send buffer. */ -struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, int priority) +struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, + unsigned int __nocast priority) { if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { struct sk_buff * skb = alloc_skb(size, priority); @@ -765,7 +767,8 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, int /* * Allocate a skb from the socket's receive buffer. */ -struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int priority) +struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, + unsigned int __nocast priority) { if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) { struct sk_buff *skb = alloc_skb(size, priority); @@ -780,7 +783,7 @@ struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int /* * Allocate a memory block from the socket's option memory buffer. */ -void *sock_kmalloc(struct sock *sk, int size, int priority) +void *sock_kmalloc(struct sock *sk, int size, unsigned int __nocast priority) { if ((unsigned)size <= sysctl_optmem_max && atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 29bb3cd2196..96a02800cd2 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -536,7 +536,7 @@ static void dn_keepalive(struct sock *sk) * we are double checking that we are not sending too * many of these keepalive frames. */ - if (skb_queue_len(&scp->other_xmit_queue) == 0) + if (skb_queue_empty(&scp->other_xmit_queue)) dn_nsp_send_link(sk, DN_NOCHANGE, 0); } @@ -1191,7 +1191,7 @@ static unsigned int dn_poll(struct file *file, struct socket *sock, poll_table struct dn_scp *scp = DN_SK(sk); int mask = datagram_poll(file, sock, wait); - if (skb_queue_len(&scp->other_receive_queue)) + if (!skb_queue_empty(&scp->other_receive_queue)) mask |= POLLRDBAND; return mask; @@ -1214,7 +1214,7 @@ static int dn_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCATMARK: lock_sock(sk); - val = (skb_queue_len(&scp->other_receive_queue) != 0); + val = !skb_queue_empty(&scp->other_receive_queue); if (scp->state != DN_RUN) val = -ENOTCONN; release_sock(sk); @@ -1630,7 +1630,7 @@ static int dn_data_ready(struct sock *sk, struct sk_buff_head *q, int flags, int int len = 0; if (flags & MSG_OOB) - return skb_queue_len(q) ? 1 : 0; + return !skb_queue_empty(q) ? 1 : 0; while(skb != (struct sk_buff *)q) { struct dn_skb_cb *cb = DN_SKB_CB(skb); @@ -1707,7 +1707,7 @@ static int dn_recvmsg(struct kiocb *iocb, struct socket *sock, if (sk->sk_err) goto out; - if (skb_queue_len(&scp->other_receive_queue)) { + if (!skb_queue_empty(&scp->other_receive_queue)) { if (!(flags & MSG_OOB)) { msg->msg_flags |= MSG_OOB; if (!scp->other_report) { diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c index 9934b25720e..99bc061759c 100644 --- a/net/decnet/dn_fib.c +++ b/net/decnet/dn_fib.c @@ -551,7 +551,8 @@ int dn_fib_dump(struct sk_buff *skb, struct netlink_callback *cb) if (t < s_t) continue; if (t > s_t) - memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(int)); + memset(&cb->args[1], 0, + sizeof(cb->args) - sizeof(cb->args[0])); tb = dn_fib_get_table(t, 0); if (tb == NULL) continue; diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c index 42abbf3f524..8cce1fdbda9 100644 --- a/net/decnet/dn_nsp_out.c +++ b/net/decnet/dn_nsp_out.c @@ -342,7 +342,8 @@ int dn_nsp_xmit_timeout(struct sock *sk) dn_nsp_output(sk); - if (skb_queue_len(&scp->data_xmit_queue) || skb_queue_len(&scp->other_xmit_queue)) + if (!skb_queue_empty(&scp->data_xmit_queue) || + !skb_queue_empty(&scp->other_xmit_queue)) scp->persist = dn_nsp_persist(sk); return 0; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 658e7977924..ef7468376ae 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1009,6 +1009,15 @@ static int __init init_ipv4_mibs(void) static int ipv4_proc_init(void); extern void ipfrag_init(void); +/* + * IP protocol layer initialiser + */ + +static struct packet_type ip_packet_type = { + .type = __constant_htons(ETH_P_IP), + .func = ip_rcv, +}; + static int __init inet_init(void) { struct sk_buff *dummy_skb; @@ -1102,6 +1111,8 @@ static int __init inet_init(void) ipfrag_init(); + dev_add_pack(&ip_packet_type); + rc = 0; out: return rc; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index b56e88edf1b..4be234c7d8c 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -43,7 +43,7 @@ * 2 of the License, or (at your option) any later version. */ -#define VERSION "0.324" +#define VERSION "0.325" #include <linux/config.h> #include <asm/uaccess.h> @@ -136,6 +136,7 @@ struct trie_use_stats { unsigned int semantic_match_passed; unsigned int semantic_match_miss; unsigned int null_node_hit; + unsigned int resize_node_skipped; }; #endif @@ -164,8 +165,8 @@ static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n); static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull); static int tnode_child_length(struct tnode *tn); static struct node *resize(struct trie *t, struct tnode *tn); -static struct tnode *inflate(struct trie *t, struct tnode *tn); -static struct tnode *halve(struct trie *t, struct tnode *tn); +static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err); +static struct tnode *halve(struct trie *t, struct tnode *tn, int *err); static void tnode_free(struct tnode *tn); static void trie_dump_seq(struct seq_file *seq, struct trie *t); extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio); @@ -358,11 +359,32 @@ static inline void free_leaf_info(struct leaf_info *li) kfree(li); } +static struct tnode *tnode_alloc(unsigned int size) +{ + if (size <= PAGE_SIZE) { + return kmalloc(size, GFP_KERNEL); + } else { + return (struct tnode *) + __get_free_pages(GFP_KERNEL, get_order(size)); + } +} + +static void __tnode_free(struct tnode *tn) +{ + unsigned int size = sizeof(struct tnode) + + (1<<tn->bits) * sizeof(struct node *); + + if (size <= PAGE_SIZE) + kfree(tn); + else + free_pages((unsigned long)tn, get_order(size)); +} + static struct tnode* tnode_new(t_key key, int pos, int bits) { int nchildren = 1<<bits; int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *); - struct tnode *tn = kmalloc(sz, GFP_KERNEL); + struct tnode *tn = tnode_alloc(sz); if(tn) { memset(tn, 0, sz); @@ -390,7 +412,7 @@ static void tnode_free(struct tnode *tn) printk("FL %p \n", tn); } else if(IS_TNODE(tn)) { - kfree(tn); + __tnode_free(tn); if(trie_debug > 0 ) printk("FT %p \n", tn); } @@ -460,6 +482,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int w static struct node *resize(struct trie *t, struct tnode *tn) { int i; + int err = 0; if (!tn) return NULL; @@ -556,12 +579,20 @@ static struct node *resize(struct trie *t, struct tnode *tn) */ check_tnode(tn); - + + err = 0; while ((tn->full_children > 0 && 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >= inflate_threshold * tnode_child_length(tn))) { - tn = inflate(t, tn); + tn = inflate(t, tn, &err); + + if(err) { +#ifdef CONFIG_IP_FIB_TRIE_STATS + t->stats.resize_node_skipped++; +#endif + break; + } } check_tnode(tn); @@ -570,11 +601,22 @@ static struct node *resize(struct trie *t, struct tnode *tn) * Halve as long as the number of empty children in this * node is above threshold. */ + + err = 0; while (tn->bits > 1 && 100 * (tnode_child_length(tn) - tn->empty_children) < - halve_threshold * tnode_child_length(tn)) + halve_threshold * tnode_child_length(tn)) { + + tn = halve(t, tn, &err); + + if(err) { +#ifdef CONFIG_IP_FIB_TRIE_STATS + t->stats.resize_node_skipped++; +#endif + break; + } + } - tn = halve(t, tn); /* Only one child remains */ @@ -599,7 +641,7 @@ static struct node *resize(struct trie *t, struct tnode *tn) return (struct node *) tn; } -static struct tnode *inflate(struct trie *t, struct tnode *tn) +static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err) { struct tnode *inode; struct tnode *oldtnode = tn; @@ -611,8 +653,63 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1); - if (!tn) - trie_bug("tnode_new failed"); + if (!tn) { + *err = -ENOMEM; + return oldtnode; + } + + /* + * Preallocate and store tnodes before the actual work so we + * don't get into an inconsistent state if memory allocation + * fails. In case of failure we return the oldnode and inflate + * of tnode is ignored. + */ + + for(i = 0; i < olen; i++) { + struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i); + + if (inode && + IS_TNODE(inode) && + inode->pos == oldtnode->pos + oldtnode->bits && + inode->bits > 1) { + struct tnode *left, *right; + + t_key m = TKEY_GET_MASK(inode->pos, 1); + + left = tnode_new(inode->key&(~m), inode->pos + 1, + inode->bits - 1); + + if(!left) { + *err = -ENOMEM; + break; + } + + right = tnode_new(inode->key|m, inode->pos + 1, + inode->bits - 1); + + if(!right) { + *err = -ENOMEM; + break; + } + + put_child(t, tn, 2*i, (struct node *) left); + put_child(t, tn, 2*i+1, (struct node *) right); + } + } + + if(*err) { + int size = tnode_child_length(tn); + int j; + + for(j = 0; j < size; j++) + if( tn->child[j]) + tnode_free((struct tnode *)tn->child[j]); + + tnode_free(tn); + + *err = -ENOMEM; + return oldtnode; + } for(i = 0; i < olen; i++) { struct node *node = tnode_get_child(oldtnode, i); @@ -625,7 +722,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) if(IS_LEAF(node) || ((struct tnode *) node)->pos > tn->pos + tn->bits - 1) { - if(tkey_extract_bits(node->key, tn->pos + tn->bits - 1, + if(tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits, 1) == 0) put_child(t, tn, 2*i, node); else @@ -665,27 +762,22 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) * the position (inode->pos) */ - t_key m = TKEY_GET_MASK(inode->pos, 1); - /* Use the old key, but set the new significant * bit to zero. */ - left = tnode_new(inode->key&(~m), inode->pos + 1, - inode->bits - 1); - if(!left) - trie_bug("tnode_new failed"); - - - /* Use the old key, but set the new significant - * bit to one. - */ - right = tnode_new(inode->key|m, inode->pos + 1, - inode->bits - 1); + left = (struct tnode *) tnode_get_child(tn, 2*i); + put_child(t, tn, 2*i, NULL); + + if(!left) + BUG(); + + right = (struct tnode *) tnode_get_child(tn, 2*i+1); + put_child(t, tn, 2*i+1, NULL); + + if(!right) + BUG(); - if(!right) - trie_bug("tnode_new failed"); - size = tnode_child_length(left); for(j = 0; j < size; j++) { put_child(t, left, j, inode->child[j]); @@ -701,7 +793,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) return tn; } -static struct tnode *halve(struct trie *t, struct tnode *tn) +static struct tnode *halve(struct trie *t, struct tnode *tn, int *err) { struct tnode *oldtnode = tn; struct node *left, *right; @@ -712,8 +804,48 @@ static struct tnode *halve(struct trie *t, struct tnode *tn) tn=tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1); - if(!tn) - trie_bug("tnode_new failed"); + if (!tn) { + *err = -ENOMEM; + return oldtnode; + } + + /* + * Preallocate and store tnodes before the actual work so we + * don't get into an inconsistent state if memory allocation + * fails. In case of failure we return the oldnode and halve + * of tnode is ignored. + */ + + for(i = 0; i < olen; i += 2) { + left = tnode_get_child(oldtnode, i); + right = tnode_get_child(oldtnode, i+1); + + /* Two nonempty children */ + if( left && right) { + struct tnode *newBinNode = + tnode_new(left->key, tn->pos + tn->bits, 1); + + if(!newBinNode) { + *err = -ENOMEM; + break; + } + put_child(t, tn, i/2, (struct node *)newBinNode); + } + } + + if(*err) { + int size = tnode_child_length(tn); + int j; + + for(j = 0; j < size; j++) + if( tn->child[j]) + tnode_free((struct tnode *)tn->child[j]); + + tnode_free(tn); + + *err = -ENOMEM; + return oldtnode; + } for(i = 0; i < olen; i += 2) { left = tnode_get_child(oldtnode, i); @@ -730,10 +862,11 @@ static struct tnode *halve(struct trie *t, struct tnode *tn) /* Two nonempty children */ else { struct tnode *newBinNode = - tnode_new(left->key, tn->pos + tn->bits, 1); + (struct tnode *) tnode_get_child(tn, i/2); + put_child(t, tn, i/2, NULL); if(!newBinNode) - trie_bug("tnode_new failed"); + BUG(); put_child(t, newBinNode, 0, left); put_child(t, newBinNode, 1, right); @@ -2301,6 +2434,7 @@ static void collect_and_show(struct trie *t, struct seq_file *seq) seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed); seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss); seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit); + seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped); #ifdef CLEAR_STATS memset(&(t->stats), 0, sizeof(t->stats)); #endif diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index cb759484979..279f57abfec 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -970,7 +970,8 @@ int icmp_rcv(struct sk_buff *skb) * RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently * discarded if to broadcast/multicast. */ - if (icmph->type == ICMP_ECHO && + if ((icmph->type == ICMP_ECHO || + icmph->type == ICMP_TIMESTAMP) && sysctl_icmp_echo_ignore_broadcasts) { goto error; } diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 1f3183168a9..5088f90835a 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1615,9 +1615,10 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr) { int err; u32 addr = imr->imr_multiaddr.s_addr; - struct ip_mc_socklist *iml, *i; + struct ip_mc_socklist *iml=NULL, *i; struct in_device *in_dev; struct inet_sock *inet = inet_sk(sk); + int ifindex; int count = 0; if (!MULTICAST(addr)) @@ -1633,37 +1634,30 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr) goto done; } - iml = (struct ip_mc_socklist *)sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL); - err = -EADDRINUSE; + ifindex = imr->imr_ifindex; for (i = inet->mc_list; i; i = i->next) { - if (memcmp(&i->multi, imr, sizeof(*imr)) == 0) { - /* New style additions are reference counted */ - if (imr->imr_address.s_addr == 0) { - i->count++; - err = 0; - } + if (i->multi.imr_multiaddr.s_addr == addr && + i->multi.imr_ifindex == ifindex) goto done; - } count++; } err = -ENOBUFS; - if (iml == NULL || count >= sysctl_igmp_max_memberships) + if (count >= sysctl_igmp_max_memberships) + goto done; + iml = (struct ip_mc_socklist *)sock_kmalloc(sk,sizeof(*iml),GFP_KERNEL); + if (iml == NULL) goto done; + memcpy(&iml->multi, imr, sizeof(*imr)); iml->next = inet->mc_list; - iml->count = 1; iml->sflist = NULL; iml->sfmode = MCAST_EXCLUDE; inet->mc_list = iml; ip_mc_inc_group(in_dev, addr); - iml = NULL; err = 0; - done: rtnl_shunlock(); - if (iml) - sock_kfree_s(sk, iml, sizeof(*iml)); return err; } @@ -1693,30 +1687,25 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) { struct inet_sock *inet = inet_sk(sk); struct ip_mc_socklist *iml, **imlp; + struct in_device *in_dev; + u32 group = imr->imr_multiaddr.s_addr; + u32 ifindex; rtnl_lock(); + in_dev = ip_mc_find_dev(imr); + if (!in_dev) { + rtnl_unlock(); + return -ENODEV; + } + ifindex = imr->imr_ifindex; for (imlp = &inet->mc_list; (iml = *imlp) != NULL; imlp = &iml->next) { - if (iml->multi.imr_multiaddr.s_addr==imr->imr_multiaddr.s_addr && - iml->multi.imr_address.s_addr==imr->imr_address.s_addr && - (!imr->imr_ifindex || iml->multi.imr_ifindex==imr->imr_ifindex)) { - struct in_device *in_dev; - - in_dev = inetdev_by_index(iml->multi.imr_ifindex); - if (in_dev) - (void) ip_mc_leave_src(sk, iml, in_dev); - if (--iml->count) { - rtnl_unlock(); - if (in_dev) - in_dev_put(in_dev); - return 0; - } + if (iml->multi.imr_multiaddr.s_addr == group && + iml->multi.imr_ifindex == ifindex) { + (void) ip_mc_leave_src(sk, iml, in_dev); *imlp = iml->next; - if (in_dev) { - ip_mc_dec_group(in_dev, imr->imr_multiaddr.s_addr); - in_dev_put(in_dev); - } + ip_mc_dec_group(in_dev, group); rtnl_unlock(); sock_kfree_s(sk, iml, sizeof(*iml)); return 0; @@ -1736,6 +1725,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct struct in_device *in_dev = NULL; struct inet_sock *inet = inet_sk(sk); struct ip_sf_socklist *psl; + int leavegroup = 0; int i, j, rv; if (!MULTICAST(addr)) @@ -1755,15 +1745,20 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct err = -EADDRNOTAVAIL; for (pmc=inet->mc_list; pmc; pmc=pmc->next) { - if (memcmp(&pmc->multi, mreqs, 2*sizeof(__u32)) == 0) + if (pmc->multi.imr_multiaddr.s_addr == imr.imr_multiaddr.s_addr + && pmc->multi.imr_ifindex == imr.imr_ifindex) break; } - if (!pmc) /* must have a prior join */ + if (!pmc) { /* must have a prior join */ + err = -EINVAL; goto done; + } /* if a source filter was set, must be the same mode as before */ if (pmc->sflist) { - if (pmc->sfmode != omode) + if (pmc->sfmode != omode) { + err = -EINVAL; goto done; + } } else if (pmc->sfmode != omode) { /* allow mode switches for empty-set filters */ ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 0, NULL, 0); @@ -1775,7 +1770,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct psl = pmc->sflist; if (!add) { if (!psl) - goto done; + goto done; /* err = -EADDRNOTAVAIL */ rv = !0; for (i=0; i<psl->sl_count; i++) { rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr, @@ -1784,7 +1779,13 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct break; } if (rv) /* source not found */ + goto done; /* err = -EADDRNOTAVAIL */ + + /* special case - (INCLUDE, empty) == LEAVE_GROUP */ + if (psl->sl_count == 1 && omode == MCAST_INCLUDE) { + leavegroup = 1; goto done; + } /* update the interface filter */ ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1, @@ -1842,18 +1843,21 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct &mreqs->imr_sourceaddr, 1); done: rtnl_shunlock(); + if (leavegroup) + return ip_mc_leave_group(sk, &imr); return err; } int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) { - int err; + int err = 0; struct ip_mreqn imr; u32 addr = msf->imsf_multiaddr; struct ip_mc_socklist *pmc; struct in_device *in_dev; struct inet_sock *inet = inet_sk(sk); struct ip_sf_socklist *newpsl, *psl; + int leavegroup = 0; if (!MULTICAST(addr)) return -EINVAL; @@ -1872,15 +1876,22 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) err = -ENODEV; goto done; } - err = -EADDRNOTAVAIL; + + /* special case - (INCLUDE, empty) == LEAVE_GROUP */ + if (msf->imsf_fmode == MCAST_INCLUDE && msf->imsf_numsrc == 0) { + leavegroup = 1; + goto done; + } for (pmc=inet->mc_list; pmc; pmc=pmc->next) { if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr && pmc->multi.imr_ifindex == imr.imr_ifindex) break; } - if (!pmc) /* must have a prior join */ + if (!pmc) { /* must have a prior join */ + err = -EINVAL; goto done; + } if (msf->imsf_numsrc) { newpsl = (struct ip_sf_socklist *)sock_kmalloc(sk, IP_SFLSIZE(msf->imsf_numsrc), GFP_KERNEL); @@ -1909,8 +1920,11 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) 0, NULL, 0); pmc->sflist = newpsl; pmc->sfmode = msf->imsf_fmode; + err = 0; done: rtnl_shunlock(); + if (leavegroup) + err = ip_mc_leave_group(sk, &imr); return err; } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 6ce5c3292f9..9de83e6e0f1 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -389,7 +389,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->pkt_type = from->pkt_type; to->priority = from->priority; to->protocol = from->protocol; - to->security = from->security; dst_release(to->dst); to->dst = dst_clone(from->dst); to->dev = from->dev; @@ -1329,23 +1328,8 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar ip_rt_put(rt); } -/* - * IP protocol layer initialiser - */ - -static struct packet_type ip_packet_type = { - .type = __constant_htons(ETH_P_IP), - .func = ip_rcv, -}; - -/* - * IP registers the packet type and then calls the subprotocol initialisers - */ - void __init ip_init(void) { - dev_add_pack(&ip_packet_type); - ip_rt_init(); inet_initpeers(); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index f8b172f8981..fc7c481d0d7 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -677,11 +677,11 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, mreq.imr_address.s_addr = mreqs.imr_interface; mreq.imr_ifindex = 0; err = ip_mc_join_group(sk, &mreq); - if (err) + if (err && err != -EADDRINUSE) break; omode = MCAST_INCLUDE; add = 1; - } else /*IP_DROP_SOURCE_MEMBERSHIP */ { + } else /* IP_DROP_SOURCE_MEMBERSHIP */ { omode = MCAST_INCLUDE; add = 0; } @@ -754,7 +754,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, mreq.imr_address.s_addr = 0; mreq.imr_ifindex = greqs.gsr_interface; err = ip_mc_join_group(sk, &mreq); - if (err) + if (err && err != -EADDRINUSE) break; greqs.gsr_interface = mreq.imr_ifindex; omode = MCAST_INCLUDE; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 12a1cf306f6..726ea5e8180 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -54,6 +54,7 @@ * Marc Boucher : routing by fwmark * Robert Olsson : Added rt_cache statistics * Arnaldo C. Melo : Convert proc stuff to seq_file + * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -70,6 +71,7 @@ #include <linux/kernel.h> #include <linux/sched.h> #include <linux/mm.h> +#include <linux/bootmem.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> @@ -201,8 +203,37 @@ __u8 ip_tos2prio[16] = { struct rt_hash_bucket { struct rtable *chain; - spinlock_t lock; -} __attribute__((__aligned__(8))); +}; +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) +/* + * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks + * The size of this table is a power of two and depends on the number of CPUS. + */ +#if NR_CPUS >= 32 +#define RT_HASH_LOCK_SZ 4096 +#elif NR_CPUS >= 16 +#define RT_HASH_LOCK_SZ 2048 +#elif NR_CPUS >= 8 +#define RT_HASH_LOCK_SZ 1024 +#elif NR_CPUS >= 4 +#define RT_HASH_LOCK_SZ 512 +#else +#define RT_HASH_LOCK_SZ 256 +#endif + +static spinlock_t *rt_hash_locks; +# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)] +# define rt_hash_lock_init() { \ + int i; \ + rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \ + if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \ + for (i = 0; i < RT_HASH_LOCK_SZ; i++) \ + spin_lock_init(&rt_hash_locks[i]); \ + } +#else +# define rt_hash_lock_addr(slot) NULL +# define rt_hash_lock_init() +#endif static struct rt_hash_bucket *rt_hash_table; static unsigned rt_hash_mask; @@ -575,19 +606,26 @@ static struct rtable **rt_remove_balanced_route(struct rtable **chain_head, /* This runs via a timer and thus is always in BH context. */ static void rt_check_expire(unsigned long dummy) { - static int rover; - int i = rover, t; + static unsigned int rover; + unsigned int i = rover, goal; struct rtable *rth, **rthp; unsigned long now = jiffies; - - for (t = ip_rt_gc_interval << rt_hash_log; t >= 0; - t -= ip_rt_gc_timeout) { + u64 mult; + + mult = ((u64)ip_rt_gc_interval) << rt_hash_log; + if (ip_rt_gc_timeout > 1) + do_div(mult, ip_rt_gc_timeout); + goal = (unsigned int)mult; + if (goal > rt_hash_mask) goal = rt_hash_mask + 1; + for (; goal > 0; goal--) { unsigned long tmo = ip_rt_gc_timeout; i = (i + 1) & rt_hash_mask; rthp = &rt_hash_table[i].chain; - spin_lock(&rt_hash_table[i].lock); + if (*rthp == 0) + continue; + spin_lock(rt_hash_lock_addr(i)); while ((rth = *rthp) != NULL) { if (rth->u.dst.expires) { /* Entry is expired even if it is in use */ @@ -620,14 +658,14 @@ static void rt_check_expire(unsigned long dummy) rt_free(rth); #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ } - spin_unlock(&rt_hash_table[i].lock); + spin_unlock(rt_hash_lock_addr(i)); /* Fallback loop breaker. */ if (time_after(jiffies, now)) break; } rover = i; - mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval); + mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval); } /* This can run from both BH and non-BH contexts, the latter @@ -643,11 +681,11 @@ static void rt_run_flush(unsigned long dummy) get_random_bytes(&rt_hash_rnd, 4); for (i = rt_hash_mask; i >= 0; i--) { - spin_lock_bh(&rt_hash_table[i].lock); + spin_lock_bh(rt_hash_lock_addr(i)); rth = rt_hash_table[i].chain; if (rth) rt_hash_table[i].chain = NULL; - spin_unlock_bh(&rt_hash_table[i].lock); + spin_unlock_bh(rt_hash_lock_addr(i)); for (; rth; rth = next) { next = rth->u.rt_next; @@ -780,7 +818,7 @@ static int rt_garbage_collect(void) k = (k + 1) & rt_hash_mask; rthp = &rt_hash_table[k].chain; - spin_lock_bh(&rt_hash_table[k].lock); + spin_lock_bh(rt_hash_lock_addr(k)); while ((rth = *rthp) != NULL) { if (!rt_may_expire(rth, tmo, expire)) { tmo >>= 1; @@ -812,7 +850,7 @@ static int rt_garbage_collect(void) goal--; #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ } - spin_unlock_bh(&rt_hash_table[k].lock); + spin_unlock_bh(rt_hash_lock_addr(k)); if (goal <= 0) break; } @@ -882,7 +920,7 @@ restart: rthp = &rt_hash_table[hash].chain; - spin_lock_bh(&rt_hash_table[hash].lock); + spin_lock_bh(rt_hash_lock_addr(hash)); while ((rth = *rthp) != NULL) { #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED if (!(rth->u.dst.flags & DST_BALANCED) && @@ -908,7 +946,7 @@ restart: rth->u.dst.__use++; dst_hold(&rth->u.dst); rth->u.dst.lastuse = now; - spin_unlock_bh(&rt_hash_table[hash].lock); + spin_unlock_bh(rt_hash_lock_addr(hash)); rt_drop(rt); *rp = rth; @@ -949,7 +987,7 @@ restart: if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { int err = arp_bind_neighbour(&rt->u.dst); if (err) { - spin_unlock_bh(&rt_hash_table[hash].lock); + spin_unlock_bh(rt_hash_lock_addr(hash)); if (err != -ENOBUFS) { rt_drop(rt); @@ -990,7 +1028,7 @@ restart: } #endif rt_hash_table[hash].chain = rt; - spin_unlock_bh(&rt_hash_table[hash].lock); + spin_unlock_bh(rt_hash_lock_addr(hash)); *rp = rt; return 0; } @@ -1058,7 +1096,7 @@ static void rt_del(unsigned hash, struct rtable *rt) { struct rtable **rthp; - spin_lock_bh(&rt_hash_table[hash].lock); + spin_lock_bh(rt_hash_lock_addr(hash)); ip_rt_put(rt); for (rthp = &rt_hash_table[hash].chain; *rthp; rthp = &(*rthp)->u.rt_next) @@ -1067,7 +1105,7 @@ static void rt_del(unsigned hash, struct rtable *rt) rt_free(rt); break; } - spin_unlock_bh(&rt_hash_table[hash].lock); + spin_unlock_bh(rt_hash_lock_addr(hash)); } void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, @@ -3073,12 +3111,14 @@ __setup("rhash_entries=", set_rhash_entries); int __init ip_rt_init(void) { - int i, order, goal, rc = 0; + int rc = 0; rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^ (jiffies ^ (jiffies >> 7))); #ifdef CONFIG_NET_CLS_ROUTE + { + int order; for (order = 0; (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++) /* NOTHING */; @@ -3086,6 +3126,7 @@ int __init ip_rt_init(void) if (!ip_rt_acct) panic("IP: failed to allocate ip_rt_acct\n"); memset(ip_rt_acct, 0, PAGE_SIZE << order); + } #endif ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache", @@ -3096,36 +3137,19 @@ int __init ip_rt_init(void) if (!ipv4_dst_ops.kmem_cachep) panic("IP: failed to allocate ip_dst_cache\n"); - goal = num_physpages >> (26 - PAGE_SHIFT); - if (rhash_entries) - goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT; - for (order = 0; (1UL << order) < goal; order++) - /* NOTHING */; - - do { - rt_hash_mask = (1UL << order) * PAGE_SIZE / - sizeof(struct rt_hash_bucket); - while (rt_hash_mask & (rt_hash_mask - 1)) - rt_hash_mask--; - rt_hash_table = (struct rt_hash_bucket *) - __get_free_pages(GFP_ATOMIC, order); - } while (rt_hash_table == NULL && --order > 0); - - if (!rt_hash_table) - panic("Failed to allocate IP route cache hash table\n"); - - printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n", - rt_hash_mask, - (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024); - - for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++) - /* NOTHING */; - - rt_hash_mask--; - for (i = 0; i <= rt_hash_mask; i++) { - spin_lock_init(&rt_hash_table[i].lock); - rt_hash_table[i].chain = NULL; - } + rt_hash_table = (struct rt_hash_bucket *) + alloc_large_system_hash("IP route cache", + sizeof(struct rt_hash_bucket), + rhash_entries, + (num_physpages >= 128 * 1024) ? + (27 - PAGE_SHIFT) : + (29 - PAGE_SHIFT), + HASH_HIGHMEM, + &rt_hash_log, + &rt_hash_mask, + 0); + memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket)); + rt_hash_lock_init(); ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1); ip_rt_max_size = (rt_hash_mask + 1) * 16; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 882436da9a3..ddb6ce4ecff 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -615,7 +615,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse size_t psize, int flags) { struct tcp_sock *tp = tcp_sk(sk); - int mss_now; + int mss_now, size_goal; int err; ssize_t copied; long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); @@ -628,6 +628,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); + size_goal = tp->xmit_size_goal; copied = 0; err = -EPIPE; @@ -641,7 +642,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse int offset = poffset % PAGE_SIZE; int size = min_t(size_t, psize, PAGE_SIZE - offset); - if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) { + if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) { new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; @@ -652,7 +653,7 @@ new_segment: goto wait_for_memory; skb_entail(sk, tp, skb); - copy = mss_now; + copy = size_goal; } if (copy > size) @@ -693,7 +694,7 @@ new_segment: if (!(psize -= copy)) goto out; - if (skb->len != mss_now || (flags & MSG_OOB)) + if (skb->len < mss_now || (flags & MSG_OOB)) continue; if (forced_push(tp)) { @@ -713,6 +714,7 @@ wait_for_memory: goto do_error; mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); + size_goal = tp->xmit_size_goal; } out: @@ -754,15 +756,20 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, static inline int select_size(struct sock *sk, struct tcp_sock *tp) { - int tmp = tp->mss_cache_std; + int tmp = tp->mss_cache; if (sk->sk_route_caps & NETIF_F_SG) { - int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); + if (sk->sk_route_caps & NETIF_F_TSO) + tmp = 0; + else { + int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); - if (tmp >= pgbreak && - tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE) - tmp = pgbreak; + if (tmp >= pgbreak && + tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE) + tmp = pgbreak; + } } + return tmp; } @@ -773,7 +780,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; int iovlen, flags; - int mss_now; + int mss_now, size_goal; int err, copied; long timeo; @@ -792,6 +799,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); + size_goal = tp->xmit_size_goal; /* Ok commence sending. */ iovlen = msg->msg_iovlen; @@ -814,7 +822,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, skb = sk->sk_write_queue.prev; if (!sk->sk_send_head || - (copy = mss_now - skb->len) <= 0) { + (copy = size_goal - skb->len) <= 0) { new_segment: /* Allocate new segment. If the interface is SG, @@ -837,7 +845,7 @@ new_segment: skb->ip_summed = CHECKSUM_HW; skb_entail(sk, tp, skb); - copy = mss_now; + copy = size_goal; } /* Try to append data to the end of skb. */ @@ -872,11 +880,6 @@ new_segment: tcp_mark_push(tp, skb); goto new_segment; } else if (page) { - /* If page is cached, align - * offset to L1 cache boundary - */ - off = (off + L1_CACHE_BYTES - 1) & - ~(L1_CACHE_BYTES - 1); if (off == PAGE_SIZE) { put_page(page); TCP_PAGE(sk) = page = NULL; @@ -937,7 +940,7 @@ new_segment: if ((seglen -= copy) == 0 && iovlen == 0) goto out; - if (skb->len != mss_now || (flags & MSG_OOB)) + if (skb->len < mss_now || (flags & MSG_OOB)) continue; if (forced_push(tp)) { @@ -957,6 +960,7 @@ wait_for_memory: goto do_error; mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); + size_goal = tp->xmit_size_goal; } } @@ -1101,7 +1105,7 @@ static void tcp_prequeue_process(struct sock *sk) struct sk_buff *skb; struct tcp_sock *tp = tcp_sk(sk); - NET_ADD_STATS_USER(LINUX_MIB_TCPPREQUEUED, skb_queue_len(&tp->ucopy.prequeue)); + NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED); /* RX process wants to run with disabled BHs, though it is not * necessary */ @@ -1365,7 +1369,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, * is not empty. It is more elegant, but eats cycles, * unfortunately. */ - if (skb_queue_len(&tp->ucopy.prequeue)) + if (!skb_queue_empty(&tp->ucopy.prequeue)) goto do_prequeue; /* __ Set realtime policy in scheduler __ */ @@ -1390,7 +1394,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } if (tp->rcv_nxt == tp->copied_seq && - skb_queue_len(&tp->ucopy.prequeue)) { + !skb_queue_empty(&tp->ucopy.prequeue)) { do_prequeue: tcp_prequeue_process(sk); @@ -1472,7 +1476,7 @@ skip_copy: } while (len > 0); if (user_recv) { - if (skb_queue_len(&tp->ucopy.prequeue)) { + if (!skb_queue_empty(&tp->ucopy.prequeue)) { int chunk; tp->ucopy.len = copied > 0 ? len : 0; @@ -2128,7 +2132,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_rto = jiffies_to_usecs(tp->rto); info->tcpi_ato = jiffies_to_usecs(tp->ack.ato); - info->tcpi_snd_mss = tp->mss_cache_std; + info->tcpi_snd_mss = tp->mss_cache; info->tcpi_rcv_mss = tp->ack.rcv_mss; info->tcpi_unacked = tp->packets_out; @@ -2178,7 +2182,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, switch (optname) { case TCP_MAXSEG: - val = tp->mss_cache_std; + val = tp->mss_cache; if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) val = tp->rx_opt.user_mss; break; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 7bbbbc33eb4..53a8a5399f1 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -740,10 +740,10 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); if (!cwnd) { - if (tp->mss_cache_std > 1460) + if (tp->mss_cache > 1460) cwnd = 2; else - cwnd = (tp->mss_cache_std > 1095) ? 3 : 4; + cwnd = (tp->mss_cache > 1095) ? 3 : 4; } return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } @@ -914,7 +914,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (sk->sk_route_caps & NETIF_F_TSO) { sk->sk_route_caps &= ~NETIF_F_TSO; sock_set_flag(sk, SOCK_NO_LARGESEND); - tp->mss_cache = tp->mss_cache_std; + tp->mss_cache = tp->mss_cache; } if (!tp->sacked_out) @@ -1077,7 +1077,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ (IsFack(tp) || !before(lost_retrans, TCP_SKB_CB(skb)->ack_seq + tp->reordering * - tp->mss_cache_std))) { + tp->mss_cache))) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); @@ -1957,15 +1957,6 @@ static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp) } } -/* There is one downside to this scheme. Although we keep the - * ACK clock ticking, adjusting packet counters and advancing - * congestion window, we do not liberate socket send buffer - * space. - * - * Mucking with skb->truesize and sk->sk_wmem_alloc et al. - * then making a write space wakeup callback is a possible - * future enhancement. WARNING: it is not trivial to make. - */ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, __u32 now, __s32 *seq_rtt) { @@ -2047,7 +2038,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt * the other end. */ if (after(scb->end_seq, tp->snd_una)) { - if (tcp_skb_pcount(skb) > 1) + if (tcp_skb_pcount(skb) > 1 && + after(tp->snd_una, scb->seq)) acked |= tcp_tso_acked(sk, skb, now, &seq_rtt); break; @@ -2810,7 +2802,7 @@ static void tcp_sack_remove(struct tcp_sock *tp) int this_sack; /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */ - if (skb_queue_len(&tp->out_of_order_queue) == 0) { + if (skb_queue_empty(&tp->out_of_order_queue)) { tp->rx_opt.num_sacks = 0; tp->rx_opt.eff_sacks = tp->rx_opt.dsack; return; @@ -2943,13 +2935,13 @@ queue_and_out: if(th->fin) tcp_fin(skb, sk, th); - if (skb_queue_len(&tp->out_of_order_queue)) { + if (!skb_queue_empty(&tp->out_of_order_queue)) { tcp_ofo_queue(sk); /* RFC2581. 4.2. SHOULD send immediate ACK, when * gap in queue is filled. */ - if (!skb_queue_len(&tp->out_of_order_queue)) + if (skb_queue_empty(&tp->out_of_order_queue)) tp->ack.pingpong = 0; } @@ -3257,9 +3249,8 @@ static int tcp_prune_queue(struct sock *sk) * This must not ever occur. */ /* First, purge the out_of_order queue. */ - if (skb_queue_len(&tp->out_of_order_queue)) { - NET_ADD_STATS_BH(LINUX_MIB_OFOPRUNED, - skb_queue_len(&tp->out_of_order_queue)); + if (!skb_queue_empty(&tp->out_of_order_queue)) { + NET_INC_STATS_BH(LINUX_MIB_OFOPRUNED); __skb_queue_purge(&tp->out_of_order_queue); /* Reset SACK state. A conforming SACK implementation will @@ -3308,6 +3299,28 @@ void tcp_cwnd_application_limited(struct sock *sk) tp->snd_cwnd_stamp = tcp_time_stamp; } +static inline int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp) +{ + /* If the user specified a specific send buffer setting, do + * not modify it. + */ + if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) + return 0; + + /* If we are under global TCP memory pressure, do not expand. */ + if (tcp_memory_pressure) + return 0; + + /* If we are under soft global TCP memory pressure, do not expand. */ + if (atomic_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0]) + return 0; + + /* If we filled the congestion window, do not expand. */ + if (tp->packets_out >= tp->snd_cwnd) + return 0; + + return 1; +} /* When incoming ACK allowed to free some skb from write_queue, * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket @@ -3319,11 +3332,8 @@ static void tcp_new_space(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - if (tp->packets_out < tp->snd_cwnd && - !(sk->sk_userlocks & SOCK_SNDBUF_LOCK) && - !tcp_memory_pressure && - atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { - int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache_std) + + if (tcp_should_expand_sndbuf(sk, tp)) { + int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff), demanded = max_t(unsigned int, tp->snd_cwnd, tp->reordering + 1); @@ -3346,22 +3356,9 @@ static inline void tcp_check_space(struct sock *sk) } } -static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) || - tcp_packets_in_flight(tp) >= tp->snd_cwnd || - tcp_write_xmit(sk, tp->nonagle)) - tcp_check_probe_timer(sk, tp); -} - -static __inline__ void tcp_data_snd_check(struct sock *sk) +static __inline__ void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp) { - struct sk_buff *skb = sk->sk_send_head; - - if (skb != NULL) - __tcp_data_snd_check(sk, skb); + tcp_push_pending_frames(sk, tp); tcp_check_space(sk); } @@ -3655,7 +3652,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, */ tcp_ack(sk, skb, 0); __kfree_skb(skb); - tcp_data_snd_check(sk); + tcp_data_snd_check(sk, tp); return 0; } else { /* Header too small */ TCP_INC_STATS_BH(TCP_MIB_INERRS); @@ -3721,7 +3718,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { /* Well, only one small jumplet in fast path... */ tcp_ack(sk, skb, FLAG_DATA); - tcp_data_snd_check(sk); + tcp_data_snd_check(sk, tp); if (!tcp_ack_scheduled(tp)) goto no_ack; } @@ -3799,7 +3796,7 @@ step5: /* step 7: process the segment text */ tcp_data_queue(sk, skb); - tcp_data_snd_check(sk); + tcp_data_snd_check(sk, tp); tcp_ack_snd_check(sk); return 0; @@ -4109,7 +4106,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* Do step6 onward by hand. */ tcp_urg(sk, skb, th); __kfree_skb(skb); - tcp_data_snd_check(sk); + tcp_data_snd_check(sk, tp); return 0; } @@ -4300,7 +4297,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* tcp_data could move socket to TIME-WAIT */ if (sk->sk_state != TCP_CLOSE) { - tcp_data_snd_check(sk); + tcp_data_snd_check(sk, tp); tcp_ack_snd_check(sk); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ebf112347a9..62f62bb05c2 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2045,7 +2045,7 @@ static int tcp_v4_init_sock(struct sock *sk) */ tp->snd_ssthresh = 0x7fffffff; /* Infinity */ tp->snd_cwnd_clamp = ~0; - tp->mss_cache_std = tp->mss_cache = 536; + tp->mss_cache = 536; tp->reordering = sysctl_tcp_reordering; tp->ca_ops = &tcp_init_congestion_ops; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0e17c244875..e3f8ea1bfa9 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -49,7 +49,7 @@ int sysctl_tcp_retrans_collapse = 1; * will allow a single TSO frame to consume. Building TSO frames * which are too large can cause TCP streams to be bursty. */ -int sysctl_tcp_tso_win_divisor = 8; +int sysctl_tcp_tso_win_divisor = 3; static inline void update_send_head(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) @@ -140,11 +140,11 @@ static inline void tcp_event_data_sent(struct tcp_sock *tp, tp->ack.pingpong = 1; } -static __inline__ void tcp_event_ack_sent(struct sock *sk) +static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) { struct tcp_sock *tp = tcp_sk(sk); - tcp_dec_quickack_mode(tp); + tcp_dec_quickack_mode(tp, pkts); tcp_clear_xmit_timer(sk, TCP_TIME_DACK); } @@ -355,7 +355,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) tp->af_specific->send_check(sk, th, skb->len, skb); if (tcb->flags & TCPCB_FLAG_ACK) - tcp_event_ack_sent(sk); + tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); if (skb->len != tcp_header_size) tcp_event_data_sent(tp, skb, sk); @@ -403,42 +403,11 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) sk->sk_send_head = skb; } -static inline void tcp_tso_set_push(struct sk_buff *skb) -{ - /* Force push to be on for any TSO frames to workaround - * problems with busted implementations like Mac OS-X that - * hold off socket receive wakeups until push is seen. - */ - if (tcp_skb_pcount(skb) > 1) - TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; -} - -/* Send _single_ skb sitting at the send head. This function requires - * true push pending frames to setup probe timer etc. - */ -void tcp_push_one(struct sock *sk, unsigned cur_mss) +static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb = sk->sk_send_head; - if (tcp_snd_test(sk, skb, cur_mss, TCP_NAGLE_PUSH)) { - /* Send it out now. */ - TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_tso_set_push(skb); - if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) { - sk->sk_send_head = NULL; - tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; - tcp_packets_out_inc(sk, tp, skb); - return; - } - } -} - -void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (skb->len <= tp->mss_cache_std || + if (skb->len <= tp->mss_cache || !(sk->sk_route_caps & NETIF_F_TSO)) { /* Avoid the costly divide in the normal * non-TSO case. @@ -448,10 +417,10 @@ void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb) } else { unsigned int factor; - factor = skb->len + (tp->mss_cache_std - 1); - factor /= tp->mss_cache_std; + factor = skb->len + (tp->mss_cache - 1); + factor /= tp->mss_cache; skb_shinfo(skb)->tso_segs = factor; - skb_shinfo(skb)->tso_size = tp->mss_cache_std; + skb_shinfo(skb)->tso_size = tp->mss_cache; } } @@ -537,6 +506,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) } /* Link BUFF into the send queue. */ + skb_header_release(buff); __skb_append(skb, buff); return 0; @@ -657,7 +627,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) /* And store cached results */ tp->pmtu_cookie = pmtu; - tp->mss_cache = tp->mss_cache_std = mss_now; + tp->mss_cache = mss_now; return mss_now; } @@ -669,57 +639,316 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) * cannot be large. However, taking into account rare use of URG, this * is not a big flaw. */ - -unsigned int tcp_current_mss(struct sock *sk, int large) +unsigned int tcp_current_mss(struct sock *sk, int large_allowed) { struct tcp_sock *tp = tcp_sk(sk); struct dst_entry *dst = __sk_dst_get(sk); - unsigned int do_large, mss_now; + u32 mss_now; + u16 xmit_size_goal; + int doing_tso = 0; + + mss_now = tp->mss_cache; + + if (large_allowed && + (sk->sk_route_caps & NETIF_F_TSO) && + !tp->urg_mode) + doing_tso = 1; - mss_now = tp->mss_cache_std; if (dst) { u32 mtu = dst_mtu(dst); if (mtu != tp->pmtu_cookie) mss_now = tcp_sync_mss(sk, mtu); } - do_large = (large && - (sk->sk_route_caps & NETIF_F_TSO) && - !tp->urg_mode); + if (tp->rx_opt.eff_sacks) + mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + + (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); - if (do_large) { - unsigned int large_mss, factor, limit; + xmit_size_goal = mss_now; - large_mss = 65535 - tp->af_specific->net_header_len - + if (doing_tso) { + xmit_size_goal = 65535 - + tp->af_specific->net_header_len - tp->ext_header_len - tp->tcp_header_len; - if (tp->max_window && large_mss > (tp->max_window>>1)) - large_mss = max((tp->max_window>>1), - 68U - tp->tcp_header_len); + if (tp->max_window && + (xmit_size_goal > (tp->max_window >> 1))) + xmit_size_goal = max((tp->max_window >> 1), + 68U - tp->tcp_header_len); + + xmit_size_goal -= (xmit_size_goal % mss_now); + } + tp->xmit_size_goal = xmit_size_goal; - factor = large_mss / mss_now; + return mss_now; +} - /* Always keep large mss multiple of real mss, but - * do not exceed 1/tso_win_divisor of the congestion window - * so we can keep the ACK clock ticking and minimize - * bursting. - */ - limit = tp->snd_cwnd; - if (sysctl_tcp_tso_win_divisor) - limit /= sysctl_tcp_tso_win_divisor; - limit = max(1U, limit); - if (factor > limit) - factor = limit; +/* Congestion window validation. (RFC2861) */ - tp->mss_cache = mss_now * factor; +static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) +{ + __u32 packets_out = tp->packets_out; + + if (packets_out >= tp->snd_cwnd) { + /* Network is feed fully. */ + tp->snd_cwnd_used = 0; + tp->snd_cwnd_stamp = tcp_time_stamp; + } else { + /* Network starves. */ + if (tp->packets_out > tp->snd_cwnd_used) + tp->snd_cwnd_used = tp->packets_out; - mss_now = tp->mss_cache; + if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto) + tcp_cwnd_application_limited(sk); } +} - if (tp->rx_opt.eff_sacks) - mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + - (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); - return mss_now; +static unsigned int tcp_window_allows(struct tcp_sock *tp, struct sk_buff *skb, unsigned int mss_now, unsigned int cwnd) +{ + u32 window, cwnd_len; + + window = (tp->snd_una + tp->snd_wnd - TCP_SKB_CB(skb)->seq); + cwnd_len = mss_now * cwnd; + return min(window, cwnd_len); +} + +/* Can at least one segment of SKB be sent right now, according to the + * congestion window rules? If so, return how many segments are allowed. + */ +static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb) +{ + u32 in_flight, cwnd; + + /* Don't be strict about the congestion window for the final FIN. */ + if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) + return 1; + + in_flight = tcp_packets_in_flight(tp); + cwnd = tp->snd_cwnd; + if (in_flight < cwnd) + return (cwnd - in_flight); + + return 0; +} + +/* This must be invoked the first time we consider transmitting + * SKB onto the wire. + */ +static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb) +{ + int tso_segs = tcp_skb_pcount(skb); + + if (!tso_segs) { + tcp_set_skb_tso_segs(sk, skb); + tso_segs = tcp_skb_pcount(skb); + } + return tso_segs; +} + +static inline int tcp_minshall_check(const struct tcp_sock *tp) +{ + return after(tp->snd_sml,tp->snd_una) && + !after(tp->snd_sml, tp->snd_nxt); +} + +/* Return 0, if packet can be sent now without violation Nagle's rules: + * 1. It is full sized. + * 2. Or it contains FIN. (already checked by caller) + * 3. Or TCP_NODELAY was set. + * 4. Or TCP_CORK is not set, and all sent packets are ACKed. + * With Minshall's modification: all sent small packets are ACKed. + */ + +static inline int tcp_nagle_check(const struct tcp_sock *tp, + const struct sk_buff *skb, + unsigned mss_now, int nonagle) +{ + return (skb->len < mss_now && + ((nonagle&TCP_NAGLE_CORK) || + (!nonagle && + tp->packets_out && + tcp_minshall_check(tp)))); +} + +/* Return non-zero if the Nagle test allows this packet to be + * sent now. + */ +static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, + unsigned int cur_mss, int nonagle) +{ + /* Nagle rule does not apply to frames, which sit in the middle of the + * write_queue (they have no chances to get new data). + * + * This is implemented in the callers, where they modify the 'nonagle' + * argument based upon the location of SKB in the send queue. + */ + if (nonagle & TCP_NAGLE_PUSH) + return 1; + + /* Don't use the nagle rule for urgent data (or for the final FIN). */ + if (tp->urg_mode || + (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) + return 1; + + if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) + return 1; + + return 0; +} + +/* Does at least the first segment of SKB fit into the send window? */ +static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss) +{ + u32 end_seq = TCP_SKB_CB(skb)->end_seq; + + if (skb->len > cur_mss) + end_seq = TCP_SKB_CB(skb)->seq + cur_mss; + + return !after(end_seq, tp->snd_una + tp->snd_wnd); +} + +/* This checks if the data bearing packet SKB (usually sk->sk_send_head) + * should be put on the wire right now. If so, it returns the number of + * packets allowed by the congestion window. + */ +static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb, + unsigned int cur_mss, int nonagle) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned int cwnd_quota; + + tcp_init_tso_segs(sk, skb); + + if (!tcp_nagle_test(tp, skb, cur_mss, nonagle)) + return 0; + + cwnd_quota = tcp_cwnd_test(tp, skb); + if (cwnd_quota && + !tcp_snd_wnd_test(tp, skb, cur_mss)) + cwnd_quota = 0; + + return cwnd_quota; +} + +static inline int tcp_skb_is_last(const struct sock *sk, + const struct sk_buff *skb) +{ + return skb->next == (struct sk_buff *)&sk->sk_write_queue; +} + +int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp) +{ + struct sk_buff *skb = sk->sk_send_head; + + return (skb && + tcp_snd_test(sk, skb, tcp_current_mss(sk, 1), + (tcp_skb_is_last(sk, skb) ? + TCP_NAGLE_PUSH : + tp->nonagle))); +} + +/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet + * which is put after SKB on the list. It is very much like + * tcp_fragment() except that it may make several kinds of assumptions + * in order to speed up the splitting operation. In particular, we + * know that all the data is in scatter-gather pages, and that the + * packet has never been sent out before (and thus is not cloned). + */ +static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len) +{ + struct sk_buff *buff; + int nlen = skb->len - len; + u16 flags; + + /* All of a TSO frame must be composed of paged data. */ + BUG_ON(skb->len != skb->data_len); + + buff = sk_stream_alloc_pskb(sk, 0, 0, GFP_ATOMIC); + if (unlikely(buff == NULL)) + return -ENOMEM; + + buff->truesize = nlen; + skb->truesize -= nlen; + + /* Correct the sequence numbers. */ + TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; + TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; + + /* PSH and FIN should only be set in the second packet. */ + flags = TCP_SKB_CB(skb)->flags; + TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH); + TCP_SKB_CB(buff)->flags = flags; + + /* This packet was never sent out yet, so no SACK bits. */ + TCP_SKB_CB(buff)->sacked = 0; + + buff->ip_summed = skb->ip_summed = CHECKSUM_HW; + skb_split(skb, buff, len); + + /* Fix up tso_factor for both original and new SKB. */ + tcp_set_skb_tso_segs(sk, skb); + tcp_set_skb_tso_segs(sk, buff); + + /* Link BUFF into the send queue. */ + skb_header_release(buff); + __skb_append(skb, buff); + + return 0; +} + +/* Try to defer sending, if possible, in order to minimize the amount + * of TSO splitting we do. View it as a kind of TSO Nagle test. + * + * This algorithm is from John Heffner. + */ +static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) +{ + u32 send_win, cong_win, limit, in_flight; + + if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) + return 0; + + if (tp->ca_state != TCP_CA_Open) + return 0; + + in_flight = tcp_packets_in_flight(tp); + + BUG_ON(tcp_skb_pcount(skb) <= 1 || + (tp->snd_cwnd <= in_flight)); + + send_win = (tp->snd_una + tp->snd_wnd) - TCP_SKB_CB(skb)->seq; + + /* From in_flight test above, we know that cwnd > in_flight. */ + cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache; + + limit = min(send_win, cong_win); + + /* If sk_send_head can be sent fully now, just do it. */ + if (skb->len <= limit) + return 0; + + if (sysctl_tcp_tso_win_divisor) { + u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); + + /* If at least some fraction of a window is available, + * just use it. + */ + chunk /= sysctl_tcp_tso_win_divisor; + if (limit >= chunk) + return 0; + } else { + /* Different approach, try not to defer past a single + * ACK. Receiver should ACK every other full sized + * frame, so if we have space for more than 3 frames + * then send now. + */ + if (limit > tcp_max_burst(tp) * tp->mss_cache) + return 0; + } + + /* Ok, it looks like it is advisable to defer. */ + return 1; } /* This routine writes packets to the network. It advances the @@ -729,57 +958,158 @@ unsigned int tcp_current_mss(struct sock *sk, int large) * Returns 1, if no segments are in flight and we have queued segments, but * cannot send anything now because of SWS or another problem. */ -int tcp_write_xmit(struct sock *sk, int nonagle) +static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) { struct tcp_sock *tp = tcp_sk(sk); - unsigned int mss_now; + struct sk_buff *skb; + unsigned int tso_segs, sent_pkts; + int cwnd_quota; /* If we are closed, the bytes will have to remain here. * In time closedown will finish, we empty the write queue and all * will be happy. */ - if (sk->sk_state != TCP_CLOSE) { - struct sk_buff *skb; - int sent_pkts = 0; + if (unlikely(sk->sk_state == TCP_CLOSE)) + return 0; + + skb = sk->sk_send_head; + if (unlikely(!skb)) + return 0; + + tso_segs = tcp_init_tso_segs(sk, skb); + cwnd_quota = tcp_cwnd_test(tp, skb); + if (unlikely(!cwnd_quota)) + goto out; + + sent_pkts = 0; + while (likely(tcp_snd_wnd_test(tp, skb, mss_now))) { + BUG_ON(!tso_segs); + + if (tso_segs == 1) { + if (unlikely(!tcp_nagle_test(tp, skb, mss_now, + (tcp_skb_is_last(sk, skb) ? + nonagle : TCP_NAGLE_PUSH)))) + break; + } else { + if (tcp_tso_should_defer(sk, tp, skb)) + break; + } - /* Account for SACKS, we may need to fragment due to this. - * It is just like the real MSS changing on us midstream. - * We also handle things correctly when the user adds some - * IP options mid-stream. Silly to do, but cover it. - */ - mss_now = tcp_current_mss(sk, 1); - - while ((skb = sk->sk_send_head) && - tcp_snd_test(sk, skb, mss_now, - tcp_skb_is_last(sk, skb) ? nonagle : - TCP_NAGLE_PUSH)) { - if (skb->len > mss_now) { - if (tcp_fragment(sk, skb, mss_now)) + if (tso_segs > 1) { + u32 limit = tcp_window_allows(tp, skb, + mss_now, cwnd_quota); + + if (skb->len < limit) { + unsigned int trim = skb->len % mss_now; + + if (trim) + limit = skb->len - trim; + } + if (skb->len > limit) { + if (tso_fragment(sk, skb, limit)) break; } - - TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_tso_set_push(skb); - if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))) + } else if (unlikely(skb->len > mss_now)) { + if (unlikely(tcp_fragment(sk, skb, mss_now))) break; + } - /* Advance the send_head. This one is sent out. - * This call will increment packets_out. - */ - update_send_head(sk, tp, skb); + TCP_SKB_CB(skb)->when = tcp_time_stamp; + + if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))) + break; + + /* Advance the send_head. This one is sent out. + * This call will increment packets_out. + */ + update_send_head(sk, tp, skb); + + tcp_minshall_update(tp, mss_now, skb); + sent_pkts++; + + /* Do not optimize this to use tso_segs. If we chopped up + * the packet above, tso_segs will no longer be valid. + */ + cwnd_quota -= tcp_skb_pcount(skb); + + BUG_ON(cwnd_quota < 0); + if (!cwnd_quota) + break; + + skb = sk->sk_send_head; + if (!skb) + break; + tso_segs = tcp_init_tso_segs(sk, skb); + } + + if (likely(sent_pkts)) { + tcp_cwnd_validate(sk, tp); + return 0; + } +out: + return !tp->packets_out && sk->sk_send_head; +} + +/* Push out any pending frames which were held back due to + * TCP_CORK or attempt at coalescing tiny packets. + * The socket must be locked by the caller. + */ +void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp, + unsigned int cur_mss, int nonagle) +{ + struct sk_buff *skb = sk->sk_send_head; - tcp_minshall_update(tp, mss_now, skb); - sent_pkts = 1; + if (skb) { + if (tcp_write_xmit(sk, cur_mss, nonagle)) + tcp_check_probe_timer(sk, tp); + } +} + +/* Send _single_ skb sitting at the send head. This function requires + * true push pending frames to setup probe timer etc. + */ +void tcp_push_one(struct sock *sk, unsigned int mss_now) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb = sk->sk_send_head; + unsigned int tso_segs, cwnd_quota; + + BUG_ON(!skb || skb->len < mss_now); + + tso_segs = tcp_init_tso_segs(sk, skb); + cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH); + + if (likely(cwnd_quota)) { + BUG_ON(!tso_segs); + + if (tso_segs > 1) { + u32 limit = tcp_window_allows(tp, skb, + mss_now, cwnd_quota); + + if (skb->len < limit) { + unsigned int trim = skb->len % mss_now; + + if (trim) + limit = skb->len - trim; + } + if (skb->len > limit) { + if (unlikely(tso_fragment(sk, skb, limit))) + return; + } + } else if (unlikely(skb->len > mss_now)) { + if (unlikely(tcp_fragment(sk, skb, mss_now))) + return; } - if (sent_pkts) { + /* Send it out now. */ + TCP_SKB_CB(skb)->when = tcp_time_stamp; + + if (likely(!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation)))) { + update_send_head(sk, tp, skb); tcp_cwnd_validate(sk, tp); - return 0; + return; } - - return !tp->packets_out && sk->sk_send_head; } - return 0; } /* This function returns the amount that we can raise the @@ -1039,7 +1369,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (sk->sk_route_caps & NETIF_F_TSO) { sk->sk_route_caps &= ~NETIF_F_TSO; sock_set_flag(sk, SOCK_NO_LARGESEND); - tp->mss_cache = tp->mss_cache_std; } if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) @@ -1101,7 +1430,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) * is still in somebody's hands, else make a clone. */ TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_tso_set_push(skb); err = tcp_transmit_skb(sk, (skb_cloned(skb) ? pskb_copy(skb, GFP_ATOMIC): @@ -1285,7 +1613,7 @@ void tcp_send_fin(struct sock *sk) * was unread data in the receive queue. This behavior is recommended * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM */ -void tcp_send_active_reset(struct sock *sk, int priority) +void tcp_send_active_reset(struct sock *sk, unsigned int __nocast priority) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -1670,14 +1998,12 @@ int tcp_write_wakeup(struct sock *sk) if (sk->sk_route_caps & NETIF_F_TSO) { sock_set_flag(sk, SOCK_NO_LARGESEND); sk->sk_route_caps &= ~NETIF_F_TSO; - tp->mss_cache = tp->mss_cache_std; } } else if (!tcp_skb_pcount(skb)) tcp_set_skb_tso_segs(sk, skb); TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_tso_set_push(skb); err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); if (!err) { update_send_head(sk, tp, skb); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index b127b449856..0084227438c 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -231,11 +231,10 @@ static void tcp_delack_timer(unsigned long data) } tp->ack.pending &= ~TCP_ACK_TIMER; - if (skb_queue_len(&tp->ucopy.prequeue)) { + if (!skb_queue_empty(&tp->ucopy.prequeue)) { struct sk_buff *skb; - NET_ADD_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED, - skb_queue_len(&tp->ucopy.prequeue)); + NET_INC_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED); while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) sk->sk_backlog_rcv(sk, skb); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 2b193e3df49..28d9bcab097 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -774,7 +774,6 @@ static int __init inet6_init(void) if (if6_proc_init()) goto proc_if6_fail; #endif - ipv6_packet_init(); ip6_route_init(); ip6_flowlabel_init(); err = addrconf_init(); @@ -791,6 +790,8 @@ static int __init inet6_init(void) /* Init v6 transport protocols. */ udpv6_init(); tcpv6_init(); + + ipv6_packet_init(); err = 0; out: return err; @@ -798,7 +799,6 @@ out: addrconf_fail: ip6_flowlabel_cleanup(); ip6_route_cleanup(); - ipv6_packet_cleanup(); #ifdef CONFIG_PROC_FS if6_proc_exit(); proc_if6_fail: diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 06e7cdaeedc..1f2c2f9e353 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -465,7 +465,6 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->pkt_type = from->pkt_type; to->priority = from->priority; to->protocol = from->protocol; - to->security = from->security; dst_release(to->dst); to->dst = dst_clone(from->dst); to->dev = from->dev; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 562fcd14fde..29fed6e58d0 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -281,7 +281,7 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, struct in6_addr *addr) } write_unlock_bh(&ipv6_sk_mc_lock); - return -ENOENT; + return -EADDRNOTAVAIL; } static struct inet6_dev *ip6_mc_find_dev(struct in6_addr *group, int ifindex) @@ -386,12 +386,16 @@ int ip6_mc_source(int add, int omode, struct sock *sk, if (ipv6_addr_equal(&pmc->addr, group)) break; } - if (!pmc) /* must have a prior join */ + if (!pmc) { /* must have a prior join */ + err = -EINVAL; goto done; + } /* if a source filter was set, must be the same mode as before */ if (pmc->sflist) { - if (pmc->sfmode != omode) + if (pmc->sfmode != omode) { + err = -EINVAL; goto done; + } } else if (pmc->sfmode != omode) { /* allow mode switches for empty-set filters */ ip6_mc_add_src(idev, group, omode, 0, NULL, 0); @@ -402,7 +406,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk, psl = pmc->sflist; if (!add) { if (!psl) - goto done; + goto done; /* err = -EADDRNOTAVAIL */ rv = !0; for (i=0; i<psl->sl_count; i++) { rv = memcmp(&psl->sl_addr[i], source, @@ -411,7 +415,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk, break; } if (rv) /* source not found */ - goto done; + goto done; /* err = -EADDRNOTAVAIL */ /* special case - (INCLUDE, empty) == LEAVE_GROUP */ if (psl->sl_count == 1 && omode == MCAST_INCLUDE) { @@ -488,6 +492,7 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf) struct inet6_dev *idev; struct ipv6_pinfo *inet6 = inet6_sk(sk); struct ip6_sf_socklist *newpsl, *psl; + int leavegroup = 0; int i, err; group = &((struct sockaddr_in6 *)&gsf->gf_group)->sin6_addr; @@ -503,7 +508,12 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf) if (!idev) return -ENODEV; dev = idev->dev; - err = -EADDRNOTAVAIL; + + err = 0; + if (gsf->gf_fmode == MCAST_INCLUDE && gsf->gf_numsrc == 0) { + leavegroup = 1; + goto done; + } for (pmc=inet6->ipv6_mc_list; pmc; pmc=pmc->next) { if (pmc->ifindex != gsf->gf_interface) @@ -511,8 +521,10 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf) if (ipv6_addr_equal(&pmc->addr, group)) break; } - if (!pmc) /* must have a prior join */ + if (!pmc) { /* must have a prior join */ + err = -EINVAL; goto done; + } if (gsf->gf_numsrc) { newpsl = (struct ip6_sf_socklist *)sock_kmalloc(sk, IP6_SFLSIZE(gsf->gf_numsrc), GFP_ATOMIC); @@ -544,10 +556,13 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf) (void) ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0); pmc->sflist = newpsl; pmc->sfmode = gsf->gf_fmode; + err = 0; done: read_unlock_bh(&idev->lock); in6_dev_put(idev); dev_put(dev); + if (leavegroup) + err = ipv6_sock_mc_drop(sk, gsf->gf_interface, group); return err; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 9dac7fdf472..f6e288dc116 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2018,7 +2018,7 @@ static int tcp_v6_init_sock(struct sock *sk) */ tp->snd_ssthresh = 0x7fffffff; tp->snd_cwnd_clamp = ~0; - tp->mss_cache_std = tp->mss_cache = 536; + tp->mss_cache = 536; tp->reordering = sysctl_tcp_reordering; diff --git a/net/irda/irlap.c b/net/irda/irlap.c index 046ad0750e4..7029618f571 100644 --- a/net/irda/irlap.c +++ b/net/irda/irlap.c @@ -445,9 +445,8 @@ void irlap_disconnect_request(struct irlap_cb *self) IRDA_ASSERT(self->magic == LAP_MAGIC, return;); /* Don't disconnect until all data frames are successfully sent */ - if (skb_queue_len(&self->txq) > 0) { + if (!skb_queue_empty(&self->txq)) { self->disconnect_pending = TRUE; - return; } diff --git a/net/irda/irlap_event.c b/net/irda/irlap_event.c index 1cd89f5f3b7..a505b545760 100644 --- a/net/irda/irlap_event.c +++ b/net/irda/irlap_event.c @@ -191,7 +191,7 @@ static void irlap_start_poll_timer(struct irlap_cb *self, int timeout) * Send out the RR frames faster if our own transmit queue is empty, or * if the peer is busy. The effect is a much faster conversation */ - if ((skb_queue_len(&self->txq) == 0) || (self->remote_busy)) { + if (skb_queue_empty(&self->txq) || self->remote_busy) { if (self->fast_RR == TRUE) { /* * Assert that the fast poll timer has not reached the @@ -263,7 +263,7 @@ void irlap_do_event(struct irlap_cb *self, IRLAP_EVENT event, IRDA_DEBUG(2, "%s() : queue len = %d\n", __FUNCTION__, skb_queue_len(&self->txq)); - if (skb_queue_len(&self->txq)) { + if (!skb_queue_empty(&self->txq)) { /* Prevent race conditions with irlap_data_request() */ self->local_busy = TRUE; @@ -1074,7 +1074,7 @@ static int irlap_state_xmit_p(struct irlap_cb *self, IRLAP_EVENT event, #else /* CONFIG_IRDA_DYNAMIC_WINDOW */ /* Window has been adjusted for the max packet * size, so much simpler... - Jean II */ - nextfit = (skb_queue_len(&self->txq) > 0); + nextfit = !skb_queue_empty(&self->txq); #endif /* CONFIG_IRDA_DYNAMIC_WINDOW */ /* * Send data with poll bit cleared only if window > 1 @@ -1814,7 +1814,7 @@ static int irlap_state_xmit_s(struct irlap_cb *self, IRLAP_EVENT event, #else /* CONFIG_IRDA_DYNAMIC_WINDOW */ /* Window has been adjusted for the max packet * size, so much simpler... - Jean II */ - nextfit = (skb_queue_len(&self->txq) > 0); + nextfit = !skb_queue_empty(&self->txq); #endif /* CONFIG_IRDA_DYNAMIC_WINDOW */ /* * Send data with final bit cleared only if window > 1 @@ -1937,7 +1937,7 @@ static int irlap_state_nrm_s(struct irlap_cb *self, IRLAP_EVENT event, irlap_data_indication(self, skb, FALSE); /* Any pending data requests? */ - if ((skb_queue_len(&self->txq) > 0) && + if (!skb_queue_empty(&self->txq) && (self->window > 0)) { self->ack_required = TRUE; @@ -2038,7 +2038,7 @@ static int irlap_state_nrm_s(struct irlap_cb *self, IRLAP_EVENT event, /* * Any pending data requests? */ - if ((skb_queue_len(&self->txq) > 0) && + if (!skb_queue_empty(&self->txq) && (self->window > 0) && !self->remote_busy) { irlap_data_indication(self, skb, TRUE); @@ -2069,7 +2069,7 @@ static int irlap_state_nrm_s(struct irlap_cb *self, IRLAP_EVENT event, */ nr_status = irlap_validate_nr_received(self, info->nr); if (nr_status == NR_EXPECTED) { - if ((skb_queue_len( &self->txq) > 0) && + if (!skb_queue_empty(&self->txq) && (self->window > 0)) { self->remote_busy = FALSE; diff --git a/net/irda/irlap_frame.c b/net/irda/irlap_frame.c index 040abe714aa..6dafbb43b52 100644 --- a/net/irda/irlap_frame.c +++ b/net/irda/irlap_frame.c @@ -1018,11 +1018,10 @@ void irlap_resend_rejected_frames(struct irlap_cb *self, int command) /* * We can now fill the window with additional data frames */ - while (skb_queue_len( &self->txq) > 0) { + while (!skb_queue_empty(&self->txq)) { IRDA_DEBUG(0, "%s(), sending additional frames!\n", __FUNCTION__); - if ((skb_queue_len( &self->txq) > 0) && - (self->window > 0)) { + if (self->window > 0) { skb = skb_dequeue( &self->txq); IRDA_ASSERT(skb != NULL, return;); @@ -1031,8 +1030,7 @@ void irlap_resend_rejected_frames(struct irlap_cb *self, int command) * bit cleared */ if ((self->window > 1) && - skb_queue_len(&self->txq) > 0) - { + !skb_queue_empty(&self->txq)) { irlap_send_data_primary(self, skb); } else { irlap_send_data_primary_poll(self, skb); diff --git a/net/irda/irttp.c b/net/irda/irttp.c index d091ccf773b..6602d901f8b 100644 --- a/net/irda/irttp.c +++ b/net/irda/irttp.c @@ -1513,7 +1513,7 @@ int irttp_disconnect_request(struct tsap_cb *self, struct sk_buff *userdata, /* * Check if there is still data segments in the transmit queue */ - if (skb_queue_len(&self->tx_queue) > 0) { + if (!skb_queue_empty(&self->tx_queue)) { if (priority == P_HIGH) { /* * No need to send the queued data, if we are diff --git a/net/llc/llc_c_ev.c b/net/llc/llc_c_ev.c index cd130c3b72b..d5bdb53a348 100644 --- a/net/llc/llc_c_ev.c +++ b/net/llc/llc_c_ev.c @@ -84,7 +84,7 @@ static u16 llc_util_nr_inside_tx_window(struct sock *sk, u8 nr) if (llc->dev->flags & IFF_LOOPBACK) goto out; rc = 1; - if (!skb_queue_len(&llc->pdu_unack_q)) + if (skb_queue_empty(&llc->pdu_unack_q)) goto out; skb = skb_peek(&llc->pdu_unack_q); pdu = llc_pdu_sn_hdr(skb); diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index fc456a7aaec..3405fdf41b9 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -858,7 +858,7 @@ static inline void netlink_rcv_wake(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); - if (!skb_queue_len(&sk->sk_receive_queue)) + if (skb_queue_empty(&sk->sk_receive_queue)) clear_bit(0, &nlk->state); if (!test_bit(0, &nlk->state)) wake_up_interruptible(&nlk->wait); diff --git a/net/sched/Makefile b/net/sched/Makefile index 8f58cecd626..e48d0d456b3 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -4,7 +4,7 @@ obj-y := sch_generic.o -obj-$(CONFIG_NET_SCHED) += sch_api.o sch_fifo.o +obj-$(CONFIG_NET_SCHED) += sch_api.o sch_fifo.o sch_blackhole.o obj-$(CONFIG_NET_CLS) += cls_api.o obj-$(CONFIG_NET_CLS_ACT) += act_api.o obj-$(CONFIG_NET_ACT_POLICE) += police.o diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 48bb23c2a35..53d98f8d3d8 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -205,11 +205,6 @@ META_COLLECTOR(int_protocol) dst->value = skb->protocol; } -META_COLLECTOR(int_security) -{ - dst->value = skb->security; -} - META_COLLECTOR(int_pkttype) { dst->value = skb->pkt_type; @@ -524,7 +519,6 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = { [META_ID(REALDEV)] = META_FUNC(int_realdev), [META_ID(PRIORITY)] = META_FUNC(int_priority), [META_ID(PROTOCOL)] = META_FUNC(int_protocol), - [META_ID(SECURITY)] = META_FUNC(int_security), [META_ID(PKTTYPE)] = META_FUNC(int_pkttype), [META_ID(PKTLEN)] = META_FUNC(int_pktlen), [META_ID(DATALEN)] = META_FUNC(int_datalen), diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 05e6e0a799d..b9a069af4a0 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -399,10 +399,8 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp) { int err; struct rtattr *kind = tca[TCA_KIND-1]; - void *p = NULL; struct Qdisc *sch; struct Qdisc_ops *ops; - int size; ops = qdisc_lookup_ops(kind); #ifdef CONFIG_KMOD @@ -437,64 +435,55 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp) if (ops == NULL) goto err_out; - /* ensure that the Qdisc and the private data are 32-byte aligned */ - size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST); - size += ops->priv_size + QDISC_ALIGN_CONST; - - p = kmalloc(size, GFP_KERNEL); - err = -ENOBUFS; - if (!p) + sch = qdisc_alloc(dev, ops); + if (IS_ERR(sch)) { + err = PTR_ERR(sch); goto err_out2; - memset(p, 0, size); - sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST) - & ~QDISC_ALIGN_CONST); - sch->padded = (char *)sch - (char *)p; - - INIT_LIST_HEAD(&sch->list); - skb_queue_head_init(&sch->q); + } - if (handle == TC_H_INGRESS) + if (handle == TC_H_INGRESS) { sch->flags |= TCQ_F_INGRESS; - - sch->ops = ops; - sch->enqueue = ops->enqueue; - sch->dequeue = ops->dequeue; - sch->dev = dev; - dev_hold(dev); - atomic_set(&sch->refcnt, 1); - sch->stats_lock = &dev->queue_lock; - if (handle == 0) { + handle = TC_H_MAKE(TC_H_INGRESS, 0); + } else if (handle == 0) { handle = qdisc_alloc_handle(dev); err = -ENOMEM; if (handle == 0) goto err_out3; } - if (handle == TC_H_INGRESS) - sch->handle =TC_H_MAKE(TC_H_INGRESS, 0); - else - sch->handle = handle; + sch->handle = handle; if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) { +#ifdef CONFIG_NET_ESTIMATOR + if (tca[TCA_RATE-1]) { + err = gen_new_estimator(&sch->bstats, &sch->rate_est, + sch->stats_lock, + tca[TCA_RATE-1]); + if (err) { + /* + * Any broken qdiscs that would require + * a ops->reset() here? The qdisc was never + * in action so it shouldn't be necessary. + */ + if (ops->destroy) + ops->destroy(sch); + goto err_out3; + } + } +#endif qdisc_lock_tree(dev); list_add_tail(&sch->list, &dev->qdisc_list); qdisc_unlock_tree(dev); -#ifdef CONFIG_NET_ESTIMATOR - if (tca[TCA_RATE-1]) - gen_new_estimator(&sch->bstats, &sch->rate_est, - sch->stats_lock, tca[TCA_RATE-1]); -#endif return sch; } err_out3: dev_put(dev); + kfree((char *) sch - sch->padded); err_out2: module_put(ops->owner); err_out: *errp = err; - if (p) - kfree(p); return NULL; } diff --git a/net/sched/sch_blackhole.c b/net/sched/sch_blackhole.c new file mode 100644 index 00000000000..81f0b8346d1 --- /dev/null +++ b/net/sched/sch_blackhole.c @@ -0,0 +1,54 @@ +/* + * net/sched/sch_blackhole.c Black hole queue + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Thomas Graf <tgraf@suug.ch> + * + * Note: Quantum tunneling is not supported. + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <net/pkt_sched.h> + +static int blackhole_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + qdisc_drop(skb, sch); + return NET_XMIT_SUCCESS; +} + +static struct sk_buff *blackhole_dequeue(struct Qdisc *sch) +{ + return NULL; +} + +static struct Qdisc_ops blackhole_qdisc_ops = { + .id = "blackhole", + .priv_size = 0, + .enqueue = blackhole_enqueue, + .dequeue = blackhole_dequeue, + .owner = THIS_MODULE, +}; + +static int __init blackhole_module_init(void) +{ + return register_qdisc(&blackhole_qdisc_ops); +} + +static void __exit blackhole_module_exit(void) +{ + unregister_qdisc(&blackhole_qdisc_ops); +} + +module_init(blackhole_module_init) +module_exit(blackhole_module_exit) + +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 7683b34dc6a..73e218e646a 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -395,24 +395,23 @@ static struct Qdisc_ops pfifo_fast_ops = { .owner = THIS_MODULE, }; -struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops) +struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops) { void *p; struct Qdisc *sch; - int size; + unsigned int size; + int err = -ENOBUFS; /* ensure that the Qdisc and the private data are 32-byte aligned */ - size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST); - size += ops->priv_size + QDISC_ALIGN_CONST; + size = QDISC_ALIGN(sizeof(*sch)); + size += ops->priv_size + (QDISC_ALIGNTO - 1); p = kmalloc(size, GFP_KERNEL); if (!p) - return NULL; + goto errout; memset(p, 0, size); - - sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST) - & ~QDISC_ALIGN_CONST); - sch->padded = (char *)sch - (char *)p; + sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p); + sch->padded = (char *) sch - (char *) p; INIT_LIST_HEAD(&sch->list); skb_queue_head_init(&sch->q); @@ -423,11 +422,24 @@ struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops) dev_hold(dev); sch->stats_lock = &dev->queue_lock; atomic_set(&sch->refcnt, 1); + + return sch; +errout: + return ERR_PTR(-err); +} + +struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops) +{ + struct Qdisc *sch; + + sch = qdisc_alloc(dev, ops); + if (IS_ERR(sch)) + goto errout; + if (!ops->init || ops->init(sch, NULL) == 0) return sch; - dev_put(dev); - kfree(p); +errout: return NULL; } @@ -591,6 +603,7 @@ EXPORT_SYMBOL(__netdev_watchdog_up); EXPORT_SYMBOL(noop_qdisc); EXPORT_SYMBOL(noop_qdisc_ops); EXPORT_SYMBOL(qdisc_create_dflt); +EXPORT_SYMBOL(qdisc_alloc); EXPORT_SYMBOL(qdisc_destroy); EXPORT_SYMBOL(qdisc_reset); EXPORT_SYMBOL(qdisc_restart); diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 664d0e47374..7845d045eec 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -385,7 +385,7 @@ static int red_change(struct Qdisc *sch, struct rtattr *opt) memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256); q->qcount = -1; - if (skb_queue_len(&sch->q) == 0) + if (skb_queue_empty(&sch->q)) PSCHED_SET_PASTPERFECT(q->qidlestart); sch_tree_unlock(sch); return 0; diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 7ae6aa772da..4b47dd6f248 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -203,7 +203,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a */ asoc->addip_serial = asoc->c.initial_tsn; - skb_queue_head_init(&asoc->addip_chunks); + INIT_LIST_HEAD(&asoc->addip_chunk_list); /* Make an empty list of remote transport addresses. */ INIT_LIST_HEAD(&asoc->peer.transport_addr_list); diff --git a/net/sctp/input.c b/net/sctp/input.c index 339f7acfdb6..5e085e041a6 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -115,6 +115,17 @@ static void sctp_rcv_set_owner_r(struct sk_buff *skb, struct sock *sk) atomic_add(sizeof(struct sctp_chunk),&sk->sk_rmem_alloc); } +struct sctp_input_cb { + union { + struct inet_skb_parm h4; +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + struct inet6_skb_parm h6; +#endif + } header; + struct sctp_chunk *chunk; +}; +#define SCTP_INPUT_CB(__skb) ((struct sctp_input_cb *)&((__skb)->cb[0])) + /* * This is the routine which IP calls when receiving an SCTP packet. */ @@ -243,6 +254,7 @@ int sctp_rcv(struct sk_buff *skb) ret = -ENOMEM; goto discard_release; } + SCTP_INPUT_CB(skb)->chunk = chunk; sctp_rcv_set_owner_r(skb,sk); @@ -265,9 +277,9 @@ int sctp_rcv(struct sk_buff *skb) sctp_bh_lock_sock(sk); if (sock_owned_by_user(sk)) - sk_add_backlog(sk, (struct sk_buff *) chunk); + sk_add_backlog(sk, skb); else - sctp_backlog_rcv(sk, (struct sk_buff *) chunk); + sctp_backlog_rcv(sk, skb); /* Release the sock and any reference counts we took in the * lookup calls. @@ -302,14 +314,8 @@ discard_release: */ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb) { - struct sctp_chunk *chunk; - struct sctp_inq *inqueue; - - /* One day chunk will live inside the skb, but for - * now this works. - */ - chunk = (struct sctp_chunk *) skb; - inqueue = &chunk->rcvr->inqueue; + struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk; + struct sctp_inq *inqueue = &chunk->rcvr->inqueue; sctp_inq_push(inqueue, chunk); return 0; diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c index cedf4351556..2d33922c044 100644 --- a/net/sctp/inqueue.c +++ b/net/sctp/inqueue.c @@ -50,7 +50,7 @@ /* Initialize an SCTP inqueue. */ void sctp_inq_init(struct sctp_inq *queue) { - skb_queue_head_init(&queue->in); + INIT_LIST_HEAD(&queue->in_chunk_list); queue->in_progress = NULL; /* Create a task for delivering data. */ @@ -62,11 +62,13 @@ void sctp_inq_init(struct sctp_inq *queue) /* Release the memory associated with an SCTP inqueue. */ void sctp_inq_free(struct sctp_inq *queue) { - struct sctp_chunk *chunk; + struct sctp_chunk *chunk, *tmp; /* Empty the queue. */ - while ((chunk = (struct sctp_chunk *) skb_dequeue(&queue->in)) != NULL) + list_for_each_entry_safe(chunk, tmp, &queue->in_chunk_list, list) { + list_del_init(&chunk->list); sctp_chunk_free(chunk); + } /* If there is a packet which is currently being worked on, * free it as well. @@ -92,7 +94,7 @@ void sctp_inq_push(struct sctp_inq *q, struct sctp_chunk *packet) * Eventually, we should clean up inqueue to not rely * on the BH related data structures. */ - skb_queue_tail(&(q->in), (struct sk_buff *) packet); + list_add_tail(&packet->list, &q->in_chunk_list); q->immediate.func(q->immediate.data); } @@ -131,12 +133,16 @@ struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue) /* Do we need to take the next packet out of the queue to process? */ if (!chunk) { + struct list_head *entry; + /* Is the queue empty? */ - if (skb_queue_empty(&queue->in)) + if (list_empty(&queue->in_chunk_list)) return NULL; + entry = queue->in_chunk_list.next; chunk = queue->in_progress = - (struct sctp_chunk *) skb_dequeue(&queue->in); + list_entry(entry, struct sctp_chunk, list); + list_del_init(entry); /* This is the first chunk in the packet. */ chunk->singleton = 1; diff --git a/net/sctp/output.c b/net/sctp/output.c index 84b5b370b09..93137163346 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -108,7 +108,7 @@ struct sctp_packet *sctp_packet_init(struct sctp_packet *packet, packet->transport = transport; packet->source_port = sport; packet->destination_port = dport; - skb_queue_head_init(&packet->chunks); + INIT_LIST_HEAD(&packet->chunk_list); if (asoc) { struct sctp_sock *sp = sctp_sk(asoc->base.sk); overhead = sp->pf->af->net_header_len; @@ -129,12 +129,14 @@ struct sctp_packet *sctp_packet_init(struct sctp_packet *packet, /* Free a packet. */ void sctp_packet_free(struct sctp_packet *packet) { - struct sctp_chunk *chunk; + struct sctp_chunk *chunk, *tmp; SCTP_DEBUG_PRINTK("%s: packet:%p\n", __FUNCTION__, packet); - while ((chunk = (struct sctp_chunk *)__skb_dequeue(&packet->chunks)) != NULL) + list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { + list_del_init(&chunk->list); sctp_chunk_free(chunk); + } if (packet->malloced) kfree(packet); @@ -276,7 +278,7 @@ append: packet->has_sack = 1; /* It is OK to send this chunk. */ - __skb_queue_tail(&packet->chunks, (struct sk_buff *)chunk); + list_add_tail(&chunk->list, &packet->chunk_list); packet->size += chunk_len; chunk->transport = packet->transport; finish: @@ -295,7 +297,7 @@ int sctp_packet_transmit(struct sctp_packet *packet) struct sctphdr *sh; __u32 crc32; struct sk_buff *nskb; - struct sctp_chunk *chunk; + struct sctp_chunk *chunk, *tmp; struct sock *sk; int err = 0; int padding; /* How much padding do we need? */ @@ -305,11 +307,11 @@ int sctp_packet_transmit(struct sctp_packet *packet) SCTP_DEBUG_PRINTK("%s: packet:%p\n", __FUNCTION__, packet); /* Do NOT generate a chunkless packet. */ - chunk = (struct sctp_chunk *)skb_peek(&packet->chunks); - if (unlikely(!chunk)) + if (list_empty(&packet->chunk_list)) return err; /* Set up convenience variables... */ + chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list); sk = chunk->skb->sk; /* Allocate the new skb. */ @@ -370,7 +372,8 @@ int sctp_packet_transmit(struct sctp_packet *packet) * [This whole comment explains WORD_ROUND() below.] */ SCTP_DEBUG_PRINTK("***sctp_transmit_packet***\n"); - while ((chunk = (struct sctp_chunk *)__skb_dequeue(&packet->chunks)) != NULL) { + list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { + list_del_init(&chunk->list); if (sctp_chunk_is_data(chunk)) { if (!chunk->has_tsn) { @@ -511,7 +514,8 @@ err: * will get resent or dropped later. */ - while ((chunk = (struct sctp_chunk *)__skb_dequeue(&packet->chunks)) != NULL) { + list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { + list_del_init(&chunk->list); if (!sctp_chunk_is_data(chunk)) sctp_chunk_free(chunk); } diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 4eb81a1407b..efb72faba20 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -75,7 +75,7 @@ static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 sack_ctsn); static inline void sctp_outq_head_data(struct sctp_outq *q, struct sctp_chunk *ch) { - __skb_queue_head(&q->out, (struct sk_buff *)ch); + list_add(&ch->list, &q->out_chunk_list); q->out_qlen += ch->skb->len; return; } @@ -83,17 +83,22 @@ static inline void sctp_outq_head_data(struct sctp_outq *q, /* Take data from the front of the queue. */ static inline struct sctp_chunk *sctp_outq_dequeue_data(struct sctp_outq *q) { - struct sctp_chunk *ch; - ch = (struct sctp_chunk *)__skb_dequeue(&q->out); - if (ch) + struct sctp_chunk *ch = NULL; + + if (!list_empty(&q->out_chunk_list)) { + struct list_head *entry = q->out_chunk_list.next; + + ch = list_entry(entry, struct sctp_chunk, list); + list_del_init(entry); q->out_qlen -= ch->skb->len; + } return ch; } /* Add data chunk to the end of the queue. */ static inline void sctp_outq_tail_data(struct sctp_outq *q, struct sctp_chunk *ch) { - __skb_queue_tail(&q->out, (struct sk_buff *)ch); + list_add_tail(&ch->list, &q->out_chunk_list); q->out_qlen += ch->skb->len; return; } @@ -197,8 +202,8 @@ static inline int sctp_cacc_skip(struct sctp_transport *primary, void sctp_outq_init(struct sctp_association *asoc, struct sctp_outq *q) { q->asoc = asoc; - skb_queue_head_init(&q->out); - skb_queue_head_init(&q->control); + INIT_LIST_HEAD(&q->out_chunk_list); + INIT_LIST_HEAD(&q->control_chunk_list); INIT_LIST_HEAD(&q->retransmit); INIT_LIST_HEAD(&q->sacked); INIT_LIST_HEAD(&q->abandoned); @@ -217,7 +222,7 @@ void sctp_outq_teardown(struct sctp_outq *q) { struct sctp_transport *transport; struct list_head *lchunk, *pos, *temp; - struct sctp_chunk *chunk; + struct sctp_chunk *chunk, *tmp; /* Throw away unacknowledged chunks. */ list_for_each(pos, &q->asoc->peer.transport_addr_list) { @@ -269,8 +274,10 @@ void sctp_outq_teardown(struct sctp_outq *q) q->error = 0; /* Throw away any leftover control chunks. */ - while ((chunk = (struct sctp_chunk *) skb_dequeue(&q->control)) != NULL) + list_for_each_entry_safe(chunk, tmp, &q->control_chunk_list, list) { + list_del_init(&chunk->list); sctp_chunk_free(chunk); + } } /* Free the outqueue structure and any related pending chunks. */ @@ -333,7 +340,7 @@ int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk) break; }; } else { - __skb_queue_tail(&q->control, (struct sk_buff *) chunk); + list_add_tail(&chunk->list, &q->control_chunk_list); SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS); } @@ -650,10 +657,9 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout) __u16 sport = asoc->base.bind_addr.port; __u16 dport = asoc->peer.port; __u32 vtag = asoc->peer.i.init_tag; - struct sk_buff_head *queue; struct sctp_transport *transport = NULL; struct sctp_transport *new_transport; - struct sctp_chunk *chunk; + struct sctp_chunk *chunk, *tmp; sctp_xmit_t status; int error = 0; int start_timer = 0; @@ -675,8 +681,9 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout) * ... */ - queue = &q->control; - while ((chunk = (struct sctp_chunk *)skb_dequeue(queue)) != NULL) { + list_for_each_entry_safe(chunk, tmp, &q->control_chunk_list, list) { + list_del_init(&chunk->list); + /* Pick the right transport to use. */ new_transport = chunk->transport; @@ -814,8 +821,6 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout) /* Finally, transmit new packets. */ start_timer = 0; - queue = &q->out; - while ((chunk = sctp_outq_dequeue_data(q)) != NULL) { /* RFC 2960 6.5 Every DATA chunk MUST carry a valid * stream identifier. @@ -1149,8 +1154,9 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_sackhdr *sack) /* See if all chunks are acked. * Make sure the empty queue handler will get run later. */ - q->empty = skb_queue_empty(&q->out) && skb_queue_empty(&q->control) && - list_empty(&q->retransmit); + q->empty = (list_empty(&q->out_chunk_list) && + list_empty(&q->control_chunk_list) && + list_empty(&q->retransmit)); if (!q->empty) goto finish; @@ -1679,9 +1685,9 @@ static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn) if (TSN_lte(tsn, ctsn)) { list_del_init(lchunk); if (!chunk->tsn_gap_acked) { - chunk->transport->flight_size -= - sctp_data_size(chunk); - q->outstanding_bytes -= sctp_data_size(chunk); + chunk->transport->flight_size -= + sctp_data_size(chunk); + q->outstanding_bytes -= sctp_data_size(chunk); } sctp_chunk_free(chunk); } else { @@ -1729,7 +1735,7 @@ static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn) nskips, &ftsn_skip_arr[0]); if (ftsn_chunk) { - __skb_queue_tail(&q->control, (struct sk_buff *)ftsn_chunk); + list_add_tail(&ftsn_chunk->list, &q->control_chunk_list); SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS); } } diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 5baed9bb7de..773cd93fa3d 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -1003,6 +1003,7 @@ struct sctp_chunk *sctp_chunkify(struct sk_buff *skb, SCTP_DEBUG_PRINTK("chunkifying skb %p w/o an sk\n", skb); } + INIT_LIST_HEAD(&retval->list); retval->skb = skb; retval->asoc = (struct sctp_association *)asoc; retval->resent = 0; @@ -1116,8 +1117,7 @@ static void sctp_chunk_destroy(struct sctp_chunk *chunk) /* Possibly, free the chunk. */ void sctp_chunk_free(struct sctp_chunk *chunk) { - /* Make sure that we are not on any list. */ - skb_unlink((struct sk_buff *) chunk); + BUG_ON(!list_empty(&chunk->list)); list_del_init(&chunk->transmitted_list); /* Release our reference on the message tracker. */ @@ -2739,8 +2739,12 @@ int sctp_process_asconf_ack(struct sctp_association *asoc, asoc->addip_last_asconf = NULL; /* Send the next asconf chunk from the addip chunk queue. */ - asconf = (struct sctp_chunk *)__skb_dequeue(&asoc->addip_chunks); - if (asconf) { + if (!list_empty(&asoc->addip_chunk_list)) { + struct list_head *entry = asoc->addip_chunk_list.next; + asconf = list_entry(entry, struct sctp_chunk, list); + + list_del_init(entry); + /* Hold the chunk until an ASCONF_ACK is received. */ sctp_chunk_hold(asconf); if (sctp_primitive_ASCONF(asoc, asconf)) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index aad55dc3792..091a66f06a3 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -406,7 +406,7 @@ static int sctp_send_asconf(struct sctp_association *asoc, * transmission. */ if (asoc->addip_last_asconf) { - __skb_queue_tail(&asoc->addip_chunks, (struct sk_buff *)chunk); + list_add_tail(&chunk->list, &asoc->addip_chunk_list); goto out; } diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 269f217918a..3c654e06b08 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -145,8 +145,6 @@ __xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) if (test_and_set_bit(XPRT_LOCKED, &xprt->sockstate)) { if (task == xprt->snd_task) return 1; - if (task == NULL) - return 0; goto out_sleep; } if (xprt->nocong || __xprt_get_cong(xprt, task)) { diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index c420eba4876..d403e34088a 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -302,7 +302,7 @@ static void unix_write_space(struct sock *sk) * may receive messages only from that peer. */ static void unix_dgram_disconnected(struct sock *sk, struct sock *other) { - if (skb_queue_len(&sk->sk_receive_queue)) { + if (!skb_queue_empty(&sk->sk_receive_queue)) { skb_queue_purge(&sk->sk_receive_queue); wake_up_interruptible_all(&unix_sk(sk)->peer_wait); @@ -1619,7 +1619,7 @@ static long unix_stream_data_wait(struct sock * sk, long timeo) for (;;) { prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); - if (skb_queue_len(&sk->sk_receive_queue) || + if (!skb_queue_empty(&sk->sk_receive_queue) || sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN) || signal_pending(current) || |