/* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * The User Datagram Protocol (UDP). * * Version: $Id: udp.c,v 1.102 2002/02/01 22:01:04 davem Exp $ * * Authors: Ross Biro * Fred N. van Kempen, * Arnt Gulbrandsen, * Alan Cox, * Hirokazu Takahashi, * * Fixes: * Alan Cox : verify_area() calls * Alan Cox : stopped close while in use off icmp * messages. Not a fix but a botch that * for udp at least is 'valid'. * Alan Cox : Fixed icmp handling properly * Alan Cox : Correct error for oversized datagrams * Alan Cox : Tidied select() semantics. * Alan Cox : udp_err() fixed properly, also now * select and read wake correctly on errors * Alan Cox : udp_send verify_area moved to avoid mem leak * Alan Cox : UDP can count its memory * Alan Cox : send to an unknown connection causes * an ECONNREFUSED off the icmp, but * does NOT close. * Alan Cox : Switched to new sk_buff handlers. No more backlog! * Alan Cox : Using generic datagram code. Even smaller and the PEEK * bug no longer crashes it. * Fred Van Kempen : Net2e support for sk->broadcast. * Alan Cox : Uses skb_free_datagram * Alan Cox : Added get/set sockopt support. * Alan Cox : Broadcasting without option set returns EACCES. * Alan Cox : No wakeup calls. Instead we now use the callbacks. * Alan Cox : Use ip_tos and ip_ttl * Alan Cox : SNMP Mibs * Alan Cox : MSG_DONTROUTE, and 0.0.0.0 support. * Matt Dillon : UDP length checks. * Alan Cox : Smarter af_inet used properly. * Alan Cox : Use new kernel side addressing. * Alan Cox : Incorrect return on truncated datagram receive. * Arnt Gulbrandsen : New udp_send and stuff * Alan Cox : Cache last socket * Alan Cox : Route cache * Jon Peatfield : Minor efficiency fix to sendto(). * Mike Shaver : RFC1122 checks. * Alan Cox : Nonblocking error fix. * Willy Konynenberg : Transparent proxying support. * Mike McLagan : Routing by source * David S. Miller : New socket lookup architecture. * Last socket cache retained as it * does have a high hit rate. * Olaf Kirch : Don't linearise iovec on sendmsg. * Andi Kleen : Some cleanups, cache destination entry * for connect. * Vitaly E. Lavrov : Transparent proxy revived after year coma. * Melvin Smith : Check msg_name not msg_namelen in sendto(), * return ENOTCONN for unconnected sockets (POSIX) * Janos Farkas : don't deliver multi/broadcasts to a different * bound-to-device socket * Hirokazu Takahashi : HW checksumming for outgoing UDP * datagrams. * Hirokazu Takahashi : sendfile() on UDP works now. * Arnaldo C. Melo : convert /proc/net/udp to seq_file * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which * Alexey Kuznetsov: allow both IPv4 and IPv6 sockets to bind * a single port at the same time. * Derek Atkins : Add Encapulation Support * James Chapman : Add L2TP encapsulation type. * * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "udp_impl.h" /* * Snmp MIB for the UDP layer */ DEFINE_SNMP_STAT(struct udp_mib, udp_statistics) __read_mostly; EXPORT_SYMBOL(udp_statistics); DEFINE_SNMP_STAT(struct udp_mib, udp_stats_in6) __read_mostly; EXPORT_SYMBOL(udp_stats_in6); struct hlist_head udp_hash[UDP_HTABLE_SIZE]; DEFINE_RWLOCK(udp_hash_lock); int sysctl_udp_mem[3] __read_mostly; int sysctl_udp_rmem_min __read_mostly; int sysctl_udp_wmem_min __read_mostly; EXPORT_SYMBOL(sysctl_udp_mem); EXPORT_SYMBOL(sysctl_udp_rmem_min); EXPORT_SYMBOL(sysctl_udp_wmem_min); atomic_t udp_memory_allocated; EXPORT_SYMBOL(udp_memory_allocated); static inline int __udp_lib_lport_inuse(struct net *net, __u16 num, const struct hlist_head udptable[]) { struct sock *sk; struct hlist_node *node; sk_for_each(sk, node, &udptable[num & (UDP_HTABLE_SIZE - 1)]) if (sk->sk_net == net && sk->sk_hash == num) return 1; return 0; } /** * __udp_lib_get_port - UDP/-Lite port lookup for IPv4 and IPv6 * * @sk: socket struct in question * @snum: port number to look up * @udptable: hash list table, must be of UDP_HTABLE_SIZE * @saddr_comp: AF-dependent comparison of bound local IP addresses */ int __udp_lib_get_port(struct sock *sk, unsigned short snum, struct hlist_head udptable[], int (*saddr_comp)(const struct sock *sk1, const struct sock *sk2 ) ) { struct hlist_node *node; struct hlist_head *head; struct sock *sk2; int error = 1; struct net *net = sk->sk_net; write_lock_bh(&udp_hash_lock); if (!snum) { int i, low, high, remaining; unsigned rover, best, best_size_so_far; inet_get_local_port_range(&low, &high); remaining = (high - low) + 1; best_size_so_far = UINT_MAX; best = rover = net_random() % remaining + low; /* 1st pass: look for empty (or shortest) hash chain */ for (i = 0; i < UDP_HTABLE_SIZE; i++) { int size = 0; head = &udptable[rover & (UDP_HTABLE_SIZE - 1)]; if (hlist_empty(head)) goto gotit; sk_for_each(sk2, node, head) { if (++size >= best_size_so_far) goto next; } best_size_so_far = size; best = rover; next: /* fold back if end of range */ if (++rover > high) rover = low + ((rover - low) & (UDP_HTABLE_SIZE - 1)); } /* 2nd pass: find hole in shortest hash chain */ rover = best; for (i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; i++) { if (! __udp_lib_lport_inuse(net, rover, udptable)) goto gotit; rover += UDP_HTABLE_SIZE; if (rover > high) rover = low + ((rover - low) & (UDP_HTABLE_SIZE - 1)); } /* All ports in use! */ goto fail; gotit: snum = rover; } else { head = &udptable[snum & (UDP_HTABLE_SIZE - 1)]; sk_for_each(sk2, node, head) if (sk2->sk_hash == snum && sk2 != sk && sk2->sk_net == net && (!sk2->sk_reuse || !sk->sk_reuse) && (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && (*saddr_comp)(sk, sk2) ) goto fail; } inet_sk(sk)->num = snum; sk->sk_hash = snum; if (sk_unhashed(sk)) { head = &udptable[snum & (UDP_HTABLE_SIZE - 1)]; sk_add_node(sk, head); sock_prot_inuse_add(sk->sk_prot, 1); } error = 0; fail: write_unlock_bh(&udp_hash_lock); return error; } int udp_get_port(struct sock *sk, unsigned short snum, int (*scmp)(const struct sock *, const struct sock *)) { return __udp_lib_get_port(sk, snum, udp_hash, scmp); } /* * IOCTL requests applicable to the UDP protocol */ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) { switch (cmd) { case SIOCOUTQ: { int amount = atomic_read(&sk->sk_wmem_alloc); return put_user(amount, (int __user *)arg); } case SIOCINQ: { struct sk_buff *skb; unsigned long amount; amount = 0; spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); if (skb != NULL) { /* * We will only return the amount * of this packet since that is all * that will be read. */ amount = skb->len - sizeof(struct udphdr); } spin_unlock_bh(&sk->sk_receive_queue.lock); return put_user(amount, (int __user *)arg); } default: return -ENOIOCTLCMD; } return 0; } int udp_disconnect(struct sock *sk, int flags) { struct inet_sock *inet = inet_sk(sk); /* * 1003.1g - break association. */ sk->sk_state = TCP_CLOSE; inet->daddr = 0; inet->dport = 0; sk->sk_bound_dev_if = 0; if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) inet_reset_saddr(sk); if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) { sk->sk_prot->unhash(sk); inet->sport = 0; } sk_dst_reset(sk); return 0; } /* * Socket option code for UDP */ int udp_lib_setsockopt(struct sock *sk, int level, int optname, char __user *optval, int optlen, int (*push_pending_frames)(struct sock *)) { struct udp_sock *up = udp_sk(sk); int val; int err = 0; #ifdef CONFIG_IP_UDPLITE int is_udplite = IS_UDPLITE(sk); #endif if (optlencorkflag = 1; } else { up->corkflag = 0; lock_sock(sk); (*push_pending_frames)(sk); release_sock(sk); } break; case UDP_ENCAP: switch (val) { case 0: case UDP_ENCAP_ESPINUDP: case UDP_ENCAP_ESPINUDP_NON_IKE: up->encap_rcv = xfrm4_udp_encap_rcv; /* FALLTHROUGH */ case UDP_ENCAP_L2TPINUDP: up->encap_type = val; break; default: err = -ENOPROTOOPT; break; } break; #ifdef CONFIG_IP_UDPLITE /* * UDP-Lite's partial checksum coverage (RFC 3828). */ /* The sender sets actual checksum coverage length via this option. * The case coverage > packet length is handled by send module. */ case UDPLITE_SEND_CSCOV: if (!is_udplite) /* Disable the option on UDP sockets */ return -ENOPROTOOPT; if (val != 0 && val < 8) /* Illegal coverage: use default (8) */ val = 8; up->pcslen = val; up->pcflag |= UDPLITE_SEND_CC; break; /* The receiver specifies a minimum checksum coverage value. To make * sense, this should be set to at least 8 (as done below). If zero is * used, this again means full checksum coverage. */ case UDPLITE_RECV_CSCOV: if (!is_udplite) /* Disable the option on UDP sockets */ return -ENOPROTOOPT; if (val != 0 && val < 8) /* Avoid silly minimal values. */ val = 8; up->pcrlen = val; up->pcflag |= UDPLITE_RECV_CC; break; #endif default: err = -ENOPROTOOPT; break; } return err; } int udp_lib_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { struct udp_sock *up = udp_sk(sk); int val, len; if (get_user(len,optlen)) return -EFAULT; len = min_t(unsigned int, len, sizeof(int)); if (len < 0) return -EINVAL; switch (optname) { case UDP_CORK: val = up->corkflag; break; case UDP_ENCAP: val = up->encap_type; break; /* The following two cannot be changed on UDP sockets, the return is * always 0 (which corresponds to the full checksum coverage of UDP). */ case UDPLITE_SEND_CSCOV: val = up->pcslen; break; case UDPLITE_RECV_CSCOV: val = up->pcrlen; break; default: return -ENOPROTOOPT; } if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val,len)) return -EFAULT; return 0; } /** * udp_poll - wait for a UDP event. * @file - file struct * @sock - socket * @wait - poll table * * This is same as datagram poll, except for the special case of * blocking sockets. If application is using a blocking fd * and a packet with checksum error is in the queue; * then it could get return from select indicating data available * but then block when reading it. Add special case code * to work around these arguably broken applications. */ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait) { unsigned int mask = datagram_poll(file, sock, wait); struct sock *sk = sock->sk; int is_lite = IS_UDPLITE(sk); /* Check for false positives due to checksum errors */ if ( (mask & POLLRDNORM) && !(file->f_flags & O_NONBLOCK) && !(sk->sk_shutdown & RCV_SHUTDOWN)){ struct sk_buff_head *rcvq = &sk->sk_receive_queue; struct sk_buff *skb; spin_lock_bh(&rcvq->lock); while ((skb = skb_peek(rcvq)) != NULL && udp_lib_checksum_complete(skb)) { UDP_INC_STATS_BH(UDP_MIB_INERRORS, is_lite); __skb_unlink(skb, rcvq); kfree_skb(skb); } spin_unlock_bh(&rcvq->lock); /* nothing to see, move along */ if (skb == NULL) mask &= ~(POLLIN | POLLRDNORM); } return mask; } /* ------------------------------------------------------------------------ */ #ifdef CONFIG_PROC_FS static struct sock *udp_get_first(struct seq_file *seq) { struct sock *sk; struct udp_iter_state *state = seq->private; for (state->bucket = 0; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) { struct hlist_node *node; sk_for_each(sk, node, state->hashtable + state->bucket) { if (sk->sk_family == state->family) goto found; } } sk = NULL; found: return sk; } static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk) { struct udp_iter_state *state = seq->private; do { sk = sk_next(sk); try_again: ; } while (sk && sk->sk_family != state->family); if (!sk && ++state->bucket < UDP_HTABLE_SIZE) { sk = sk_head(state->hashtable + state->bucket); goto try_again; } return sk; } static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos) { struct sock *sk = udp_get_first(seq); if (sk) while (pos && (sk = udp_get_next(seq, sk)) != NULL) --pos; return pos ? NULL : sk; } static void *udp_seq_start(struct seq_file *seq, loff_t *pos) __acquires(udp_hash_lock) { read_lock(&udp_hash_lock); return *pos ? udp_get_idx(seq, *pos-1) : (void *)1; } static void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct sock *sk; if (v == (void *)1) sk = udp_get_idx(seq, 0); else sk = udp_get_next(seq, v); ++*pos; return sk; } static void udp_seq_stop(struct seq_file *seq, void *v) __releases(udp_hash_lock) { read_unlock(&udp_hash_lock); } static int udp_seq_open(struct inode *inode, struct file *file) { struct udp_seq_afinfo *afinfo = PDE(inode)->data; struct seq_file *seq; int rc = -ENOMEM; struct udp_iter_state *s = kzalloc(sizeof(*s), GFP_KERNEL); if (!s) goto out; s->family = afinfo->family; s->hashtable = afinfo->hashtable; s->seq_ops.start = udp_seq_start; s->seq_ops.next = udp_seq_next; s->seq_ops.show = afinfo->seq_show; s->seq_ops.stop = udp_seq_stop; rc = seq_open(file, &s->seq_ops); if (rc) goto out_kfree; seq = file->private_data; seq->private = s; out: return rc; out_kfree: kfree(s); goto out; } /* ------------------------------------------------------------------------ */ int udp_proc_register(struct udp_seq_afinfo *afinfo) { struct proc_dir_entry *p; int rc = 0; if (!afinfo) return -EINVAL; afinfo->seq_fops->owner = afinfo->owner; afinfo->seq_fops->open = udp_seq_open; afinfo->seq_fops->read = seq_read; afinfo->seq_fops->llseek = seq_lseek; afinfo->seq_fops->release = seq_release_private; p = proc_net_fops_create(&init_net, afinfo->name, S_IRUGO, afinfo->seq_fops); if (p) p->data = afinfo; else rc = -ENOMEM; return rc; } void udp_proc_unregister(struct udp_seq_afinfo *afinfo) { if (!afinfo) return; proc_net_remove(&init_net, afinfo->name); memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); } #endif /* CONFIG_PROC_FS */ void __init udp_init(void) { unsigned long limit; /* Set the pressure threshold up by the same strategy of TCP. It is a * fraction of global memory that is up to 1/2 at 256 MB, decreasing * toward zero with the amount of memory, with a floor of 128 pages. */ limit = min(nr_all_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT); limit = (limit * (nr_all_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11); limit = max(limit, 128UL); sysctl_udp_mem[0] = limit / 4 * 3; sysctl_udp_mem[1] = limit; sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2; sysctl_udp_rmem_min = SK_MEM_QUANTUM; sysctl_udp_wmem_min = SK_MEM_QUANTUM; } EXPORT_SYMBOL(udp_disconnect); EXPORT_SYMBOL(udp_hash); EXPORT_SYMBOL(udp_hash_lock); EXPORT_SYMBOL(udp_ioctl); EXPORT_SYMBOL(udp_get_port); EXPORT_SYMBOL(udp_lib_getsockopt); EXPORT_SYMBOL(udp_lib_setsockopt); EXPORT_SYMBOL(udp_poll); #ifdef CONFIG_PROC_FS EXPORT_SYMBOL(udp_proc_register); EXPORT_SYMBOL(udp_proc_unregister); #endif