diff options
author | Jeff Garzik <jgarzik@pobox.com> | 2005-08-30 03:48:57 -0400 |
---|---|---|
committer | Jeff Garzik <jgarzik@pobox.com> | 2005-08-30 03:48:57 -0400 |
commit | 2fcf522509cceea524b6e7ece8fd6759b682175a (patch) | |
tree | d356e87307e451cce5497ad8daeeeb047befe489 | |
parent | da61396d24e37258817e42537c482e962b4742f7 (diff) | |
parent | 1fdab81e675c6ef76a49b8aabb7eaf4be51d1b80 (diff) |
/spare/repo/libata-dev branch 'master'
613 files changed, 28597 insertions, 10732 deletions
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 8b1430b4665..0665cb12bd6 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -135,3 +135,15 @@ Why: With the 16-bit PCMCIA subsystem now behaving (almost) like a pcmciautils package available at http://kernel.org/pub/linux/utils/kernel/pcmcia/ Who: Dominik Brodowski <linux@brodo.de> + +--------------------------- + +What: ip_queue and ip6_queue (old ipv4-only and ipv6-only netfilter queue) +When: December 2005 +Why: This interface has been obsoleted by the new layer3-independent + "nfnetlink_queue". The Kernel interface is compatible, so the old + ip[6]tables "QUEUE" targets still work and will transparently handle + all packets into nfnetlink queue number 0. Userspace users will have + to link against API-compatible library on top of libnfnetlink_queue + instead of the current 'libipq'. +Who: Harald Welte <laforge@netfilter.org> diff --git a/arch/ppc/Makefile b/arch/ppc/Makefile index f9b0d778dd8..d1b6e6dcb50 100644 --- a/arch/ppc/Makefile +++ b/arch/ppc/Makefile @@ -21,11 +21,13 @@ CC := $(CC) -m32 endif LDFLAGS_vmlinux := -Ttext $(KERNELLOAD) -Bstatic -CPPFLAGS += -Iarch/$(ARCH) +CPPFLAGS += -Iarch/$(ARCH) -Iinclude3 AFLAGS += -Iarch/$(ARCH) CFLAGS += -Iarch/$(ARCH) -msoft-float -pipe \ -ffixed-r2 -mmultiple CPP = $(CC) -E $(CFLAGS) +# Temporary hack until we have migrated to asm-powerpc +LINUXINCLUDE += -Iinclude3 CHECKFLAGS += -D__powerpc__ @@ -101,6 +103,7 @@ endef archclean: $(Q)$(MAKE) $(clean)=arch/ppc/boot + $(Q)rm -rf include3 prepare: include/asm-$(ARCH)/offsets.h checkbin @@ -110,6 +113,12 @@ arch/$(ARCH)/kernel/asm-offsets.s: include/asm include/linux/version.h \ include/asm-$(ARCH)/offsets.h: arch/$(ARCH)/kernel/asm-offsets.s $(call filechk,gen-asm-offsets) +# Temporary hack until we have migrated to asm-powerpc +include/asm: include3/asm +include3/asm: + $(Q)if [ ! -d include3 ]; then mkdir -p include3; fi + $(Q)ln -fsn $(srctree)/include/asm-powerpc include3/asm + # Use the file '.tmp_gas_check' for binutils tests, as gas won't output # to stdout and these checks are run even on install targets. TOUT := .tmp_gas_check diff --git a/arch/ppc/boot/utils/addRamDisk.c b/arch/ppc/boot/utils/addRamDisk.c deleted file mode 100644 index 93400dfcce7..00000000000 --- a/arch/ppc/boot/utils/addRamDisk.c +++ /dev/null @@ -1,203 +0,0 @@ -#include <stdio.h> -#include <stdlib.h> -#include <netinet/in.h> -#include <unistd.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <string.h> - -#define ElfHeaderSize (64 * 1024) -#define ElfPages (ElfHeaderSize / 4096) -#define KERNELBASE (0xc0000000) - -void get4k(FILE *file, char *buf ) -{ - unsigned j; - unsigned num = fread(buf, 1, 4096, file); - for ( j=num; j<4096; ++j ) - buf[j] = 0; -} - -void put4k(FILE *file, char *buf ) -{ - fwrite(buf, 1, 4096, file); -} - -void death(const char *msg, FILE *fdesc, const char *fname) -{ - printf(msg); - fclose(fdesc); - unlink(fname); - exit(1); -} - -int main(int argc, char **argv) -{ - char inbuf[4096]; - FILE *ramDisk = NULL; - FILE *inputVmlinux = NULL; - FILE *outputVmlinux = NULL; - unsigned i = 0; - u_int32_t ramFileLen = 0; - u_int32_t ramLen = 0; - u_int32_t roundR = 0; - u_int32_t kernelLen = 0; - u_int32_t actualKernelLen = 0; - u_int32_t round = 0; - u_int32_t roundedKernelLen = 0; - u_int32_t ramStartOffs = 0; - u_int32_t ramPages = 0; - u_int32_t roundedKernelPages = 0; - u_int32_t hvReleaseData = 0; - u_int32_t eyeCatcher = 0xc8a5d9c4; - u_int32_t naca = 0; - u_int32_t xRamDisk = 0; - u_int32_t xRamDiskSize = 0; - if ( argc < 2 ) { - printf("Name of RAM disk file missing.\n"); - exit(1); - } - - if ( argc < 3 ) { - printf("Name of vmlinux file missing.\n"); - exit(1); - } - - if ( argc < 4 ) { - printf("Name of vmlinux output file missing.\n"); - exit(1); - } - - ramDisk = fopen(argv[1], "r"); - if ( ! ramDisk ) { - printf("RAM disk file \"%s\" failed to open.\n", argv[1]); - exit(1); - } - inputVmlinux = fopen(argv[2], "r"); - if ( ! inputVmlinux ) { - printf("vmlinux file \"%s\" failed to open.\n", argv[2]); - exit(1); - } - outputVmlinux = fopen(argv[3], "w+"); - if ( ! outputVmlinux ) { - printf("output vmlinux file \"%s\" failed to open.\n", argv[3]); - exit(1); - } - fseek(ramDisk, 0, SEEK_END); - ramFileLen = ftell(ramDisk); - fseek(ramDisk, 0, SEEK_SET); - printf("%s file size = %d\n", argv[1], ramFileLen); - - ramLen = ramFileLen; - - roundR = 4096 - (ramLen % 4096); - if ( roundR ) { - printf("Rounding RAM disk file up to a multiple of 4096, adding %d\n", roundR); - ramLen += roundR; - } - - printf("Rounded RAM disk size is %d\n", ramLen); - fseek(inputVmlinux, 0, SEEK_END); - kernelLen = ftell(inputVmlinux); - fseek(inputVmlinux, 0, SEEK_SET); - printf("kernel file size = %d\n", kernelLen); - if ( kernelLen == 0 ) { - printf("You must have a linux kernel specified as argv[2]\n"); - exit(1); - } - - actualKernelLen = kernelLen - ElfHeaderSize; - - printf("actual kernel length (minus ELF header) = %d\n", actualKernelLen); - - round = actualKernelLen % 4096; - roundedKernelLen = actualKernelLen; - if ( round ) - roundedKernelLen += (4096 - round); - - printf("actual kernel length rounded up to a 4k multiple = %d\n", roundedKernelLen); - - ramStartOffs = roundedKernelLen; - ramPages = ramLen / 4096; - - printf("RAM disk pages to copy = %d\n", ramPages); - - // Copy 64K ELF header - for (i=0; i<(ElfPages); ++i) { - get4k( inputVmlinux, inbuf ); - put4k( outputVmlinux, inbuf ); - } - - roundedKernelPages = roundedKernelLen / 4096; - - fseek(inputVmlinux, ElfHeaderSize, SEEK_SET); - - for ( i=0; i<roundedKernelPages; ++i ) { - get4k( inputVmlinux, inbuf ); - put4k( outputVmlinux, inbuf ); - } - - for ( i=0; i<ramPages; ++i ) { - get4k( ramDisk, inbuf ); - put4k( outputVmlinux, inbuf ); - } - - /* Close the input files */ - fclose(ramDisk); - fclose(inputVmlinux); - /* And flush the written output file */ - fflush(outputVmlinux); - - /* fseek to the hvReleaseData pointer */ - fseek(outputVmlinux, ElfHeaderSize + 0x24, SEEK_SET); - if (fread(&hvReleaseData, 4, 1, outputVmlinux) != 1) { - death("Could not read hvReleaseData pointer\n", outputVmlinux, argv[3]); - } - hvReleaseData = ntohl(hvReleaseData); /* Convert to native int */ - printf("hvReleaseData is at %08x\n", hvReleaseData); - - /* fseek to the hvReleaseData */ - fseek(outputVmlinux, ElfHeaderSize + hvReleaseData, SEEK_SET); - if (fread(inbuf, 0x40, 1, outputVmlinux) != 1) { - death("Could not read hvReleaseData\n", outputVmlinux, argv[3]); - } - /* Check hvReleaseData sanity */ - if (memcmp(inbuf, &eyeCatcher, 4) != 0) { - death("hvReleaseData is invalid\n", outputVmlinux, argv[3]); - } - /* Get the naca pointer */ - naca = ntohl(*((u_int32_t *) &inbuf[0x0c])) - KERNELBASE; - printf("naca is at %08x\n", naca); - - /* fseek to the naca */ - fseek(outputVmlinux, ElfHeaderSize + naca, SEEK_SET); - if (fread(inbuf, 0x18, 1, outputVmlinux) != 1) { - death("Could not read naca\n", outputVmlinux, argv[3]); - } - xRamDisk = ntohl(*((u_int32_t *) &inbuf[0x0c])); - xRamDiskSize = ntohl(*((u_int32_t *) &inbuf[0x14])); - /* Make sure a RAM disk isn't already present */ - if ((xRamDisk != 0) || (xRamDiskSize != 0)) { - death("RAM disk is already attached to this kernel\n", outputVmlinux, argv[3]); - } - /* Fill in the values */ - *((u_int32_t *) &inbuf[0x0c]) = htonl(ramStartOffs); - *((u_int32_t *) &inbuf[0x14]) = htonl(ramPages); - - /* Write out the new naca */ - fflush(outputVmlinux); - fseek(outputVmlinux, ElfHeaderSize + naca, SEEK_SET); - if (fwrite(inbuf, 0x18, 1, outputVmlinux) != 1) { - death("Could not write naca\n", outputVmlinux, argv[3]); - } - printf("RAM Disk of 0x%x pages size is attached to the kernel at offset 0x%08x\n", - ramPages, ramStartOffs); - - /* Done */ - fclose(outputVmlinux); - /* Set permission to executable */ - chmod(argv[3], S_IRUSR|S_IWUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH); - - return 0; -} - diff --git a/arch/ppc64/Kconfig b/arch/ppc64/Kconfig index 2ce87836c67..13b262f1021 100644 --- a/arch/ppc64/Kconfig +++ b/arch/ppc64/Kconfig @@ -302,12 +302,6 @@ config GENERIC_HARDIRQS bool default y -config MSCHUNKS - bool - depends on PPC_ISERIES - default y - - config PPC_RTAS bool depends on PPC_PSERIES || PPC_BPA @@ -350,13 +344,46 @@ config SECCOMP If unsure, say Y. Only embedded should say N here. +source "fs/Kconfig.binfmt" + +config HOTPLUG_CPU + bool "Support for hot-pluggable CPUs" + depends on SMP && EXPERIMENTAL && (PPC_PSERIES || PPC_PMAC) + select HOTPLUG + ---help--- + Say Y here to be able to turn CPUs off and on. + + Say N if you are unsure. + +config PROC_DEVICETREE + bool "Support for Open Firmware device tree in /proc" + depends on !PPC_ISERIES + help + This option adds a device-tree directory under /proc which contains + an image of the device tree that the kernel copies from Open + Firmware. If unsure, say Y here. + +config CMDLINE_BOOL + bool "Default bootloader kernel arguments" + depends on !PPC_ISERIES + +config CMDLINE + string "Initial kernel command string" + depends on CMDLINE_BOOL + default "console=ttyS0,9600 console=tty0 root=/dev/sda2" + help + On some platforms, there is currently no way for the boot loader to + pass arguments to the kernel. For these platforms, you can supply + some command-line options at build time by entering them here. In + most cases you will need to specify the root device here. + endmenu config ISA_DMA_API bool default y -menu "General setup" +menu "Bus Options" config ISA bool @@ -389,45 +416,12 @@ config PCI_DOMAINS bool default PCI -source "fs/Kconfig.binfmt" - source "drivers/pci/Kconfig" -config HOTPLUG_CPU - bool "Support for hot-pluggable CPUs" - depends on SMP && EXPERIMENTAL && (PPC_PSERIES || PPC_PMAC) - select HOTPLUG - ---help--- - Say Y here to be able to turn CPUs off and on. - - Say N if you are unsure. - source "drivers/pcmcia/Kconfig" source "drivers/pci/hotplug/Kconfig" -config PROC_DEVICETREE - bool "Support for Open Firmware device tree in /proc" - depends on !PPC_ISERIES - help - This option adds a device-tree directory under /proc which contains - an image of the device tree that the kernel copies from Open - Firmware. If unsure, say Y here. - -config CMDLINE_BOOL - bool "Default bootloader kernel arguments" - depends on !PPC_ISERIES - -config CMDLINE - string "Initial kernel command string" - depends on CMDLINE_BOOL - default "console=ttyS0,9600 console=tty0 root=/dev/sda2" - help - On some platforms, there is currently no way for the boot loader to - pass arguments to the kernel. For these platforms, you can supply - some command-line options at build time by entering them here. In - most cases you will need to specify the root device here. - endmenu source "net/Kconfig" diff --git a/arch/ppc64/Makefile b/arch/ppc64/Makefile index 731b8475833..6350cce82ef 100644 --- a/arch/ppc64/Makefile +++ b/arch/ppc64/Makefile @@ -55,6 +55,8 @@ LDFLAGS := -m elf64ppc LDFLAGS_vmlinux := -Bstatic -e $(KERNELLOAD) -Ttext $(KERNELLOAD) CFLAGS += -msoft-float -pipe -mminimal-toc -mtraceback=none \ -mcall-aixdesc +# Temporary hack until we have migrated to asm-powerpc +CPPFLAGS += -Iinclude3 GCC_VERSION := $(call cc-version) GCC_BROKEN_VEC := $(shell if [ $(GCC_VERSION) -lt 0400 ] ; then echo "y"; fi ;) @@ -112,6 +114,7 @@ all: $(KBUILD_IMAGE) archclean: $(Q)$(MAKE) $(clean)=$(boot) + $(Q)rm -rf include3 prepare: include/asm-ppc64/offsets.h @@ -121,6 +124,12 @@ arch/ppc64/kernel/asm-offsets.s: include/asm include/linux/version.h \ include/asm-ppc64/offsets.h: arch/ppc64/kernel/asm-offsets.s $(call filechk,gen-asm-offsets) +# Temporary hack until we have migrated to asm-powerpc +include/asm: include3/asm +include3/asm: + $(Q)if [ ! -d include3 ]; then mkdir -p include3; fi; + $(Q)ln -fsn $(srctree)/include/asm-powerpc include3/asm + define archhelp echo '* zImage - Compressed kernel image (arch/$(ARCH)/boot/zImage)' echo ' zImage.initrd- Compressed kernel image with initrd attached,' diff --git a/arch/ppc64/boot/Makefile b/arch/ppc64/boot/Makefile index 683b2d43c15..2c5f5e73d00 100644 --- a/arch/ppc64/boot/Makefile +++ b/arch/ppc64/boot/Makefile @@ -22,8 +22,8 @@ HOSTCC := gcc -BOOTCFLAGS := $(HOSTCFLAGS) $(LINUXINCLUDE) -fno-builtin -BOOTAFLAGS := -D__ASSEMBLY__ $(BOOTCFLAGS) -traditional +BOOTCFLAGS := $(HOSTCFLAGS) -fno-builtin -nostdinc -isystem $(shell $(CROSS32CC) -print-file-name=include) +BOOTAFLAGS := -D__ASSEMBLY__ $(BOOTCFLAGS) -traditional -nostdinc BOOTLFLAGS := -Ttext 0x00400000 -e _start -T $(srctree)/$(src)/zImage.lds OBJCOPYFLAGS := contents,alloc,load,readonly,data diff --git a/arch/ppc64/boot/addnote.c b/arch/ppc64/boot/addnote.c index 719663a694b..8041a9845ab 100644 --- a/arch/ppc64/boot/addnote.c +++ b/arch/ppc64/boot/addnote.c @@ -157,7 +157,7 @@ main(int ac, char **av) PUT_32BE(ns, strlen(arch) + 1); PUT_32BE(ns + 4, N_DESCR * 4); PUT_32BE(ns + 8, 0x1275); - strcpy(&buf[ns + 12], arch); + strcpy((char *) &buf[ns + 12], arch); ns += 12 + strlen(arch) + 1; for (i = 0; i < N_DESCR; ++i, ns += 4) PUT_32BE(ns, descr[i]); @@ -172,7 +172,7 @@ main(int ac, char **av) PUT_32BE(ns, strlen(rpaname) + 1); PUT_32BE(ns + 4, sizeof(rpanote)); PUT_32BE(ns + 8, 0x12759999); - strcpy(&buf[ns + 12], rpaname); + strcpy((char *) &buf[ns + 12], rpaname); ns += 12 + ROUNDUP(strlen(rpaname) + 1); for (i = 0; i < N_RPA_DESCR; ++i, ns += 4) PUT_32BE(ns, rpanote[i]); diff --git a/arch/ppc64/boot/crt0.S b/arch/ppc64/boot/crt0.S index 04d3e74cd72..3861e7f9cf1 100644 --- a/arch/ppc64/boot/crt0.S +++ b/arch/ppc64/boot/crt0.S @@ -9,7 +9,7 @@ * NOTE: this code runs in 32 bit mode and is packaged as ELF32. */ -#include <asm/ppc_asm.h> +#include "ppc_asm.h" .text .globl _start diff --git a/arch/ppc64/boot/div64.S b/arch/ppc64/boot/div64.S index 38f7e466d7d..722f360a32a 100644 --- a/arch/ppc64/boot/div64.S +++ b/arch/ppc64/boot/div64.S @@ -13,7 +13,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ -#include <asm/ppc_asm.h> +#include "ppc_asm.h" .globl __div64_32 __div64_32: diff --git a/arch/ppc64/boot/elf.h b/arch/ppc64/boot/elf.h new file mode 100644 index 00000000000..d4828fcf1cb --- /dev/null +++ b/arch/ppc64/boot/elf.h @@ -0,0 +1,149 @@ +#ifndef _PPC_BOOT_ELF_H_ +#define _PPC_BOOT_ELF_H_ + +/* 32-bit ELF base types. */ +typedef unsigned int Elf32_Addr; +typedef unsigned short Elf32_Half; +typedef unsigned int Elf32_Off; +typedef signed int Elf32_Sword; +typedef unsigned int Elf32_Word; + +/* 64-bit ELF base types. */ +typedef unsigned long long Elf64_Addr; +typedef unsigned short Elf64_Half; +typedef signed short Elf64_SHalf; +typedef unsigned long long Elf64_Off; +typedef signed int Elf64_Sword; +typedef unsigned int Elf64_Word; +typedef unsigned long long Elf64_Xword; +typedef signed long long Elf64_Sxword; + +/* These constants are for the segment types stored in the image headers */ +#define PT_NULL 0 +#define PT_LOAD 1 +#define PT_DYNAMIC 2 +#define PT_INTERP 3 +#define PT_NOTE 4 +#define PT_SHLIB 5 +#define PT_PHDR 6 +#define PT_TLS 7 /* Thread local storage segment */ +#define PT_LOOS 0x60000000 /* OS-specific */ +#define PT_HIOS 0x6fffffff /* OS-specific */ +#define PT_LOPROC 0x70000000 +#define PT_HIPROC 0x7fffffff +#define PT_GNU_EH_FRAME 0x6474e550 + +#define PT_GNU_STACK (PT_LOOS + 0x474e551) + +/* These constants define the different elf file types */ +#define ET_NONE 0 +#define ET_REL 1 +#define ET_EXEC 2 +#define ET_DYN 3 +#define ET_CORE 4 +#define ET_LOPROC 0xff00 +#define ET_HIPROC 0xffff + +/* These constants define the various ELF target machines */ +#define EM_NONE 0 +#define EM_PPC 20 /* PowerPC */ +#define EM_PPC64 21 /* PowerPC64 */ + +#define EI_NIDENT 16 + +typedef struct elf32_hdr { + unsigned char e_ident[EI_NIDENT]; + Elf32_Half e_type; + Elf32_Half e_machine; + Elf32_Word e_version; + Elf32_Addr e_entry; /* Entry point */ + Elf32_Off e_phoff; + Elf32_Off e_shoff; + Elf32_Word e_flags; + Elf32_Half e_ehsize; + Elf32_Half e_phentsize; + Elf32_Half e_phnum; + Elf32_Half e_shentsize; + Elf32_Half e_shnum; + Elf32_Half e_shstrndx; +} Elf32_Ehdr; + +typedef struct elf64_hdr { + unsigned char e_ident[16]; /* ELF "magic number" */ + Elf64_Half e_type; + Elf64_Half e_machine; + Elf64_Word e_version; + Elf64_Addr e_entry; /* Entry point virtual address */ + Elf64_Off e_phoff; /* Program header table file offset */ + Elf64_Off e_shoff; /* Section header table file offset */ + Elf64_Word e_flags; + Elf64_Half e_ehsize; + Elf64_Half e_phentsize; + Elf64_Half e_phnum; + Elf64_Half e_shentsize; + Elf64_Half e_shnum; + Elf64_Half e_shstrndx; +} Elf64_Ehdr; + +/* These constants define the permissions on sections in the program + header, p_flags. */ +#define PF_R 0x4 +#define PF_W 0x2 +#define PF_X 0x1 + +typedef struct elf32_phdr { + Elf32_Word p_type; + Elf32_Off p_offset; + Elf32_Addr p_vaddr; + Elf32_Addr p_paddr; + Elf32_Word p_filesz; + Elf32_Word p_memsz; + Elf32_Word p_flags; + Elf32_Word p_align; +} Elf32_Phdr; + +typedef struct elf64_phdr { + Elf64_Word p_type; + Elf64_Word p_flags; + Elf64_Off p_offset; /* Segment file offset */ + Elf64_Addr p_vaddr; /* Segment virtual address */ + Elf64_Addr p_paddr; /* Segment physical address */ + Elf64_Xword p_filesz; /* Segment size in file */ + Elf64_Xword p_memsz; /* Segment size in memory */ + Elf64_Xword p_align; /* Segment alignment, file & memory */ +} Elf64_Phdr; + +#define EI_MAG0 0 /* e_ident[] indexes */ +#define EI_MAG1 1 +#define EI_MAG2 2 +#define EI_MAG3 3 +#define EI_CLASS 4 +#define EI_DATA 5 +#define EI_VERSION 6 +#define EI_OSABI 7 +#define EI_PAD 8 + +#define ELFMAG0 0x7f /* EI_MAG */ +#define ELFMAG1 'E' +#define ELFMAG2 'L' +#define ELFMAG3 'F' +#define ELFMAG "\177ELF" +#define SELFMAG 4 + +#define ELFCLASSNONE 0 /* EI_CLASS */ +#define ELFCLASS32 1 +#define ELFCLASS64 2 +#define ELFCLASSNUM 3 + +#define ELFDATANONE 0 /* e_ident[EI_DATA] */ +#define ELFDATA2LSB 1 +#define ELFDATA2MSB 2 + +#define EV_NONE 0 /* e_version, EI_VERSION */ +#define EV_CURRENT 1 +#define EV_NUM 2 + +#define ELFOSABI_NONE 0 +#define ELFOSABI_LINUX 3 + +#endif /* _PPC_BOOT_ELF_H_ */ diff --git a/arch/ppc64/boot/main.c b/arch/ppc64/boot/main.c index 199d9804f61..99e68cfbe68 100644 --- a/arch/ppc64/boot/main.c +++ b/arch/ppc64/boot/main.c @@ -8,36 +8,28 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ -#include "ppc32-types.h" +#include <stdarg.h> +#include <stddef.h> +#include "elf.h" +#include "page.h" +#include "string.h" +#include "stdio.h" +#include "prom.h" #include "zlib.h" -#include <linux/elf.h> -#include <linux/string.h> -#include <asm/processor.h> -#include <asm/page.h> - -extern void *finddevice(const char *); -extern int getprop(void *, const char *, void *, int); -extern void printf(const char *fmt, ...); -extern int sprintf(char *buf, const char *fmt, ...); -void gunzip(void *, int, unsigned char *, int *); -void *claim(unsigned int, unsigned int, unsigned int); -void flush_cache(void *, unsigned long); -void pause(void); -extern void exit(void); - -unsigned long strlen(const char *s); -void *memmove(void *dest, const void *src, unsigned long n); -void *memcpy(void *dest, const void *src, unsigned long n); + +static void gunzip(void *, int, unsigned char *, int *); +extern void flush_cache(void *, unsigned long); + /* Value picked to match that used by yaboot */ #define PROG_START 0x01400000 #define RAM_END (256<<20) // Fixme: use OF */ -char *avail_ram; -char *begin_avail, *end_avail; -char *avail_high; -unsigned int heap_use; -unsigned int heap_max; +static char *avail_ram; +static char *begin_avail, *end_avail; +static char *avail_high; +static unsigned int heap_use; +static unsigned int heap_max; extern char _start[]; extern char _vmlinux_start[]; @@ -52,9 +44,9 @@ struct addr_range { unsigned long size; unsigned long memsize; }; -struct addr_range vmlinux = {0, 0, 0}; -struct addr_range vmlinuz = {0, 0, 0}; -struct addr_range initrd = {0, 0, 0}; +static struct addr_range vmlinux = {0, 0, 0}; +static struct addr_range vmlinuz = {0, 0, 0}; +static struct addr_range initrd = {0, 0, 0}; static char scratch[128<<10]; /* 128kB of scratch space for gunzip */ @@ -64,13 +56,6 @@ typedef void (*kernel_entry_t)( unsigned long, void *); -int (*prom)(void *); - -void *chosen_handle; -void *stdin; -void *stdout; -void *stderr; - #undef DEBUG static unsigned long claim_base = PROG_START; @@ -277,7 +262,7 @@ void zfree(void *x, void *addr, unsigned nb) #define DEFLATED 8 -void gunzip(void *dst, int dstlen, unsigned char *src, int *lenp) +static void gunzip(void *dst, int dstlen, unsigned char *src, int *lenp) { z_stream s; int r, i, flags; diff --git a/arch/ppc64/boot/page.h b/arch/ppc64/boot/page.h new file mode 100644 index 00000000000..14eca30fef6 --- /dev/null +++ b/arch/ppc64/boot/page.h @@ -0,0 +1,34 @@ +#ifndef _PPC_BOOT_PAGE_H +#define _PPC_BOOT_PAGE_H +/* + * Copyright (C) 2001 PPC64 Team, IBM Corp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifdef __ASSEMBLY__ +#define ASM_CONST(x) x +#else +#define __ASM_CONST(x) x##UL +#define ASM_CONST(x) __ASM_CONST(x) +#endif + +/* PAGE_SHIFT determines the page size */ +#define PAGE_SHIFT 12 +#define PAGE_SIZE (ASM_CONST(1) << PAGE_SHIFT) +#define PAGE_MASK (~(PAGE_SIZE-1)) + +/* align addr on a size boundary - adjust address up/down if needed */ +#define _ALIGN_UP(addr,size) (((addr)+((size)-1))&(~((size)-1))) +#define _ALIGN_DOWN(addr,size) ((addr)&(~((size)-1))) + +/* align addr on a size boundary - adjust address up if needed */ +#define _ALIGN(addr,size) _ALIGN_UP(addr,size) + +/* to align the pointer to the (next) page boundary */ +#define PAGE_ALIGN(addr) _ALIGN(addr, PAGE_SIZE) + +#endif /* _PPC_BOOT_PAGE_H */ diff --git a/arch/ppc64/boot/ppc32-types.h b/arch/ppc64/boot/ppc32-types.h deleted file mode 100644 index f7b8884f8f7..00000000000 --- a/arch/ppc64/boot/ppc32-types.h +++ /dev/null @@ -1,36 +0,0 @@ -#ifndef _PPC64_TYPES_H -#define _PPC64_TYPES_H - -typedef __signed__ char __s8; -typedef unsigned char __u8; - -typedef __signed__ short __s16; -typedef unsigned short __u16; - -typedef __signed__ int __s32; -typedef unsigned int __u32; - -typedef __signed__ long long __s64; -typedef unsigned long long __u64; - -typedef signed char s8; -typedef unsigned char u8; - -typedef signed short s16; -typedef unsigned short u16; - -typedef signed int s32; -typedef unsigned int u32; - -typedef signed long long s64; -typedef unsigned long long u64; - -typedef struct { - __u32 u[4]; -} __attribute((aligned(16))) __vector128; - -#define BITS_PER_LONG 32 - -typedef __vector128 vector128; - -#endif /* _PPC64_TYPES_H */ diff --git a/arch/ppc64/boot/ppc_asm.h b/arch/ppc64/boot/ppc_asm.h new file mode 100644 index 00000000000..1c2c2817f9b --- /dev/null +++ b/arch/ppc64/boot/ppc_asm.h @@ -0,0 +1,62 @@ +#ifndef _PPC64_PPC_ASM_H +#define _PPC64_PPC_ASM_H +/* + * + * Definitions used by various bits of low-level assembly code on PowerPC. + * + * Copyright (C) 1995-1999 Gary Thomas, Paul Mackerras, Cort Dougan. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* Condition Register Bit Fields */ + +#define cr0 0 +#define cr1 1 +#define cr2 2 +#define cr3 3 +#define cr4 4 +#define cr5 5 +#define cr6 6 +#define cr7 7 + + +/* General Purpose Registers (GPRs) */ + +#define r0 0 +#define r1 1 +#define r2 2 +#define r3 3 +#define r4 4 +#define r5 5 +#define r6 6 +#define r7 7 +#define r8 8 +#define r9 9 +#define r10 10 +#define r11 11 +#define r12 12 +#define r13 13 +#define r14 14 +#define r15 15 +#define r16 16 +#define r17 17 +#define r18 18 +#define r19 19 +#define r20 20 +#define r21 21 +#define r22 22 +#define r23 23 +#define r24 24 +#define r25 25 +#define r26 26 +#define r27 27 +#define r28 28 +#define r29 29 +#define r30 30 +#define r31 31 + +#endif /* _PPC64_PPC_ASM_H */ diff --git a/arch/ppc64/boot/prom.c b/arch/ppc64/boot/prom.c index 5e48b80ff5a..4bea2f4dcb0 100644 --- a/arch/ppc64/boot/prom.c +++ b/arch/ppc64/boot/prom.c @@ -7,43 +7,19 @@ * 2 of the License, or (at your option) any later version. */ #include <stdarg.h> -#include <linux/types.h> -#include <linux/string.h> -#include <linux/ctype.h> - -extern __u32 __div64_32(unsigned long long *dividend, __u32 divisor); - -/* The unnecessary pointer compare is there - * to check for type safety (n must be 64bit) - */ -# define do_div(n,base) ({ \ - __u32 __base = (base); \ - __u32 __rem; \ - (void)(((typeof((n)) *)0) == ((unsigned long long *)0)); \ - if (((n) >> 32) == 0) { \ - __rem = (__u32)(n) % __base; \ - (n) = (__u32)(n) / __base; \ - } else \ - __rem = __div64_32(&(n), __base); \ - __rem; \ - }) +#include <stddef.h> +#include "string.h" +#include "stdio.h" +#include "prom.h" int (*prom)(void *); void *chosen_handle; + void *stdin; void *stdout; void *stderr; -void exit(void); -void *finddevice(const char *name); -int getprop(void *phandle, const char *name, void *buf, int buflen); -void chrpboot(int a1, int a2, void *prom); /* in main.c */ - -int printf(char *fmt, ...); - -/* there is no convenient header to get this from... -- paulus */ -extern unsigned long strlen(const char *); int write(void *handle, void *ptr, int nb) @@ -210,107 +186,6 @@ fputs(char *str, void *f) return write(f, str, n) == n? 0: -1; } -int -readchar(void) -{ - char ch; - - for (;;) { - switch (read(stdin, &ch, 1)) { - case 1: - return ch; - case -1: - printf("read(stdin) returned -1\r\n"); - return -1; - } - } -} - -static char line[256]; -static char *lineptr; -static int lineleft; - -int -getchar(void) -{ - int c; - - if (lineleft == 0) { - lineptr = line; - for (;;) { - c = readchar(); - if (c == -1 || c == 4) - break; - if (c == '\r' || c == '\n') { - *lineptr++ = '\n'; - putchar('\n'); - break; - } - switch (c) { - case 0177: - case '\b': - if (lineptr > line) { - putchar('\b'); - putchar(' '); - putchar('\b'); - --lineptr; - } - break; - case 'U' & 0x1F: - while (lineptr > line) { - putchar('\b'); - putchar(' '); - putchar('\b'); - --lineptr; - } - break; - default: - if (lineptr >= &line[sizeof(line) - 1]) - putchar('\a'); - else { - putchar(c); - *lineptr++ = c; - } - } - } - lineleft = lineptr - line; - lineptr = line; - } - if (lineleft == 0) - return -1; - --lineleft; - return *lineptr++; -} - - - -/* String functions lifted from lib/vsprintf.c and lib/ctype.c */ -unsigned char _ctype[] = { -_C,_C,_C,_C,_C,_C,_C,_C, /* 0-7 */ -_C,_C|_S,_C|_S,_C|_S,_C|_S,_C|_S,_C,_C, /* 8-15 */ -_C,_C,_C,_C,_C,_C,_C,_C, /* 16-23 */ -_C,_C,_C,_C,_C,_C,_C,_C, /* 24-31 */ -_S|_SP,_P,_P,_P,_P,_P,_P,_P, /* 32-39 */ -_P,_P,_P,_P,_P,_P,_P,_P, /* 40-47 */ -_D,_D,_D,_D,_D,_D,_D,_D, /* 48-55 */ -_D,_D,_P,_P,_P,_P,_P,_P, /* 56-63 */ -_P,_U|_X,_U|_X,_U|_X,_U|_X,_U|_X,_U|_X,_U, /* 64-71 */ -_U,_U,_U,_U,_U,_U,_U,_U, /* 72-79 */ -_U,_U,_U,_U,_U,_U,_U,_U, /* 80-87 */ -_U,_U,_U,_P,_P,_P,_P,_P, /* 88-95 */ -_P,_L|_X,_L|_X,_L|_X,_L|_X,_L|_X,_L|_X,_L, /* 96-103 */ -_L,_L,_L,_L,_L,_L,_L,_L, /* 104-111 */ -_L,_L,_L,_L,_L,_L,_L,_L, /* 112-119 */ -_L,_L,_L,_P,_P,_P,_P,_C, /* 120-127 */ -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 128-143 */ -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 144-159 */ -_S|_SP,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P, /* 160-175 */ -_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P, /* 176-191 */ -_U,_U,_U,_U,_U,_U,_U,_U,_U,_U,_U,_U,_U,_U,_U,_U, /* 192-207 */ -_U,_U,_U,_U,_U,_U,_U,_P,_U,_U,_U,_U,_U,_U,_U,_L, /* 208-223 */ -_L,_L,_L,_L,_L,_L,_L,_L,_L,_L,_L,_L,_L,_L,_L,_L, /* 224-239 */ -_L,_L,_L,_L,_L,_L,_L,_P,_L,_L,_L,_L,_L,_L,_L,_L}; /* 240-255 */ - size_t strnlen(const char * s, size_t count) { const char *sc; @@ -320,44 +195,30 @@ size_t strnlen(const char * s, size_t count) return sc - s; } -unsigned long simple_strtoul(const char *cp,char **endp,unsigned int base) -{ - unsigned long result = 0,value; +extern unsigned int __div64_32(unsigned long long *dividend, + unsigned int divisor); - if (!base) { - base = 10; - if (*cp == '0') { - base = 8; - cp++; - if ((*cp == 'x') && isxdigit(cp[1])) { - cp++; - base = 16; - } - } - } - while (isxdigit(*cp) && - (value = isdigit(*cp) ? *cp-'0' : toupper(*cp)-'A'+10) < base) { - result = result*base + value; - cp++; - } - if (endp) - *endp = (char *)cp; - return result; -} - -long simple_strtol(const char *cp,char **endp,unsigned int base) -{ - if(*cp=='-') - return -simple_strtoul(cp+1,endp,base); - return simple_strtoul(cp,endp,base); -} +/* The unnecessary pointer compare is there + * to check for type safety (n must be 64bit) + */ +# define do_div(n,base) ({ \ + unsigned int __base = (base); \ + unsigned int __rem; \ + (void)(((typeof((n)) *)0) == ((unsigned long long *)0)); \ + if (((n) >> 32) == 0) { \ + __rem = (unsigned int)(n) % __base; \ + (n) = (unsigned int)(n) / __base; \ + } else \ + __rem = __div64_32(&(n), __base); \ + __rem; \ + }) static int skip_atoi(const char **s) { - int i=0; + int i, c; - while (isdigit(**s)) - i = i*10 + *((*s)++) - '0'; + for (i = 0; '0' <= (c = **s) && c <= '9'; ++*s) + i = i*10 + c - '0'; return i; } @@ -436,9 +297,6 @@ static char * number(char * str, unsigned long long num, int base, int size, int return str; } -/* Forward decl. needed for IP address printing stuff... */ -int sprintf(char * buf, const char *fmt, ...); - int vsprintf(char *buf, const char *fmt, va_list args) { int len; @@ -477,7 +335,7 @@ int vsprintf(char *buf, const char *fmt, va_list args) /* get field width */ field_width = -1; - if (isdigit(*fmt)) + if ('0' <= *fmt && *fmt <= '9') field_width = skip_atoi(&fmt); else if (*fmt == '*') { ++fmt; @@ -493,7 +351,7 @@ int vsprintf(char *buf, const char *fmt, va_list args) precision = -1; if (*fmt == '.') { ++fmt; - if (isdigit(*fmt)) + if ('0' <= *fmt && *fmt <= '9') precision = skip_atoi(&fmt); else if (*fmt == '*') { ++fmt; @@ -628,7 +486,7 @@ int sprintf(char * buf, const char *fmt, ...) static char sprint_buf[1024]; int -printf(char *fmt, ...) +printf(const char *fmt, ...) { va_list args; int n; diff --git a/arch/ppc64/boot/prom.h b/arch/ppc64/boot/prom.h new file mode 100644 index 00000000000..96ab5aec740 --- /dev/null +++ b/arch/ppc64/boot/prom.h @@ -0,0 +1,18 @@ +#ifndef _PPC_BOOT_PROM_H_ +#define _PPC_BOOT_PROM_H_ + +extern int (*prom) (void *); +extern void *chosen_handle; + +extern void *stdin; +extern void *stdout; +extern void *stderr; + +extern int write(void *handle, void *ptr, int nb); +extern int read(void *handle, void *ptr, int nb); +extern void exit(void); +extern void pause(void); +extern void *finddevice(const char *); +extern void *claim(unsigned long virt, unsigned long size, unsigned long align); +extern int getprop(void *phandle, const char *name, void *buf, int buflen); +#endif /* _PPC_BOOT_PROM_H_ */ diff --git a/arch/ppc64/boot/stdio.h b/arch/ppc64/boot/stdio.h new file mode 100644 index 00000000000..24bd3a8dee9 --- /dev/null +++ b/arch/ppc64/boot/stdio.h @@ -0,0 +1,16 @@ +#ifndef _PPC_BOOT_STDIO_H_ +#define _PPC_BOOT_STDIO_H_ + +extern int printf(const char *fmt, ...); + +extern int sprintf(char *buf, const char *fmt, ...); + +extern int vsprintf(char *buf, const char *fmt, va_list args); + +extern int putc(int c, void *f); +extern int putchar(int c); +extern int getchar(void); + +extern int fputs(char *str, void *f); + +#endif /* _PPC_BOOT_STDIO_H_ */ diff --git a/arch/ppc64/boot/string.S b/arch/ppc64/boot/string.S index ba5f2d21c9e..7ade87ae771 100644 --- a/arch/ppc64/boot/string.S +++ b/arch/ppc64/boot/string.S @@ -9,7 +9,7 @@ * NOTE: this code runs in 32 bit mode and is packaged as ELF32. */ -#include <asm/ppc_asm.h> +#include "ppc_asm.h" .text .globl strcpy diff --git a/arch/ppc64/boot/string.h b/arch/ppc64/boot/string.h new file mode 100644 index 00000000000..9289258bcbd --- /dev/null +++ b/arch/ppc64/boot/string.h @@ -0,0 +1,16 @@ +#ifndef _PPC_BOOT_STRING_H_ +#define _PPC_BOOT_STRING_H_ + +extern char *strcpy(char *dest, const char *src); +extern char *strncpy(char *dest, const char *src, size_t n); +extern char *strcat(char *dest, const char *src); +extern int strcmp(const char *s1, const char *s2); +extern size_t strlen(const char *s); +extern size_t strnlen(const char *s, size_t count); + +extern void *memset(void *s, int c, size_t n); +extern void *memmove(void *dest, const void *src, unsigned long n); +extern void *memcpy(void *dest, const void *src, unsigned long n); +extern int memcmp(const void *s1, const void *s2, size_t n); + +#endif /* _PPC_BOOT_STRING_H_ */ diff --git a/arch/ppc64/boot/zlib.c b/arch/ppc64/boot/zlib.c index 78837e884b8..0d910cd2079 100644 --- a/arch/ppc64/boot/zlib.c +++ b/arch/ppc64/boot/zlib.c @@ -107,7 +107,7 @@ extern void *memcpy(void *, const void *, unsigned long); /* Diagnostic functions */ #ifdef DEBUG_ZLIB -# include <stdio.h> +# include "stdio.h" # ifndef verbose # define verbose 0 # endif diff --git a/arch/ppc64/configs/g5_defconfig b/arch/ppc64/configs/g5_defconfig index ab567741e80..fc83d933028 100644 --- a/arch/ppc64/configs/g5_defconfig +++ b/arch/ppc64/configs/g5_defconfig @@ -103,10 +103,10 @@ CONFIG_PREEMPT_NONE=y # CONFIG_PREEMPT_VOLUNTARY is not set # CONFIG_PREEMPT is not set # CONFIG_PREEMPT_BKL is not set -CONFIG_HZ_100=y -# CONFIG_HZ_250 is not set +# CONFIG_HZ_100 is not set +CONFIG_HZ_250=y # CONFIG_HZ_1000 is not set -CONFIG_HZ=100 +CONFIG_HZ=250 CONFIG_GENERIC_HARDIRQS=y CONFIG_SECCOMP=y CONFIG_ISA_DMA_API=y diff --git a/arch/ppc64/configs/iSeries_defconfig b/arch/ppc64/configs/iSeries_defconfig index 394ba18b58c..013d4e0e400 100644 --- a/arch/ppc64/configs/iSeries_defconfig +++ b/arch/ppc64/configs/iSeries_defconfig @@ -94,12 +94,11 @@ CONFIG_PREEMPT_NONE=y # CONFIG_PREEMPT_VOLUNTARY is not set # CONFIG_PREEMPT is not set # CONFIG_PREEMPT_BKL is not set -CONFIG_HZ_100=y -# CONFIG_HZ_250 is not set +# CONFIG_HZ_100 is not set +CONFIG_HZ_250=y # CONFIG_HZ_1000 is not set -CONFIG_HZ=100 +CONFIG_HZ=250 CONFIG_GENERIC_HARDIRQS=y -CONFIG_MSCHUNKS=y CONFIG_LPARCFG=y CONFIG_SECCOMP=y CONFIG_ISA_DMA_API=y diff --git a/arch/ppc64/configs/maple_defconfig b/arch/ppc64/configs/maple_defconfig index 2033fe663db..dd42892cd87 100644 --- a/arch/ppc64/configs/maple_defconfig +++ b/arch/ppc64/configs/maple_defconfig @@ -103,10 +103,10 @@ CONFIG_PREEMPT_NONE=y # CONFIG_PREEMPT_VOLUNTARY is not set # CONFIG_PREEMPT is not set # CONFIG_PREEMPT_BKL is not set -CONFIG_HZ_100=y -# CONFIG_HZ_250 is not set +# CONFIG_HZ_100 is not set +CONFIG_HZ_250=y # CONFIG_HZ_1000 is not set -CONFIG_HZ=100 +CONFIG_HZ=250 CONFIG_GENERIC_HARDIRQS=y CONFIG_SECCOMP=y CONFIG_ISA_DMA_API=y diff --git a/arch/ppc64/configs/pSeries_defconfig b/arch/ppc64/configs/pSeries_defconfig index 297fd522948..29f7b80b0ef 100644 --- a/arch/ppc64/configs/pSeries_defconfig +++ b/arch/ppc64/configs/pSeries_defconfig @@ -112,10 +112,10 @@ CONFIG_PREEMPT_NONE=y # CONFIG_PREEMPT_VOLUNTARY is not set # CONFIG_PREEMPT is not set # CONFIG_PREEMPT_BKL is not set -CONFIG_HZ_100=y -# CONFIG_HZ_250 is not set +# CONFIG_HZ_100 is not set +CONFIG_HZ_250=y # CONFIG_HZ_1000 is not set -CONFIG_HZ=100 +CONFIG_HZ=250 CONFIG_EEH=y CONFIG_GENERIC_HARDIRQS=y CONFIG_PPC_RTAS=y diff --git a/arch/ppc64/defconfig b/arch/ppc64/defconfig index c361e7727b7..7cb4750bb7a 100644 --- a/arch/ppc64/defconfig +++ b/arch/ppc64/defconfig @@ -114,10 +114,10 @@ CONFIG_PREEMPT_NONE=y # CONFIG_PREEMPT_VOLUNTARY is not set # CONFIG_PREEMPT is not set # CONFIG_PREEMPT_BKL is not set -CONFIG_HZ_100=y -# CONFIG_HZ_250 is not set +# CONFIG_HZ_100 is not set +CONFIG_HZ_250=y # CONFIG_HZ_1000 is not set -CONFIG_HZ=100 +CONFIG_HZ=250 CONFIG_EEH=y CONFIG_GENERIC_HARDIRQS=y CONFIG_PPC_RTAS=y diff --git a/arch/ppc64/kernel/LparData.c b/arch/ppc64/kernel/LparData.c index 1c11031c838..0a9c23ca2f0 100644 --- a/arch/ppc64/kernel/LparData.c +++ b/arch/ppc64/kernel/LparData.c @@ -51,6 +51,17 @@ struct HvReleaseData hvReleaseData = { 0xf4, 0x4b, 0xf6, 0xf4 }, }; +/* + * The NACA. The first dword of the naca is required by the iSeries + * hypervisor to point to itVpdAreas. The hypervisor finds the NACA + * through the pointer in hvReleaseData. + */ +struct naca_struct naca = { + .xItVpdAreas = &itVpdAreas, + .xRamDisk = 0, + .xRamDiskSize = 0, +}; + extern void system_reset_iSeries(void); extern void machine_check_iSeries(void); extern void data_access_iSeries(void); @@ -214,29 +225,3 @@ struct ItVpdAreas itVpdAreas = { 0,0 } }; - -struct msChunks msChunks; -EXPORT_SYMBOL(msChunks); - -/* Depending on whether this is called from iSeries or pSeries setup - * code, the location of the msChunks struct may or may not have - * to be reloc'd, so we force the caller to do that for us by passing - * in a pointer to the structure. - */ -unsigned long -msChunks_alloc(unsigned long mem, unsigned long num_chunks, unsigned long chunk_size) -{ - unsigned long offset = reloc_offset(); - struct msChunks *_msChunks = PTRRELOC(&msChunks); - - _msChunks->num_chunks = num_chunks; - _msChunks->chunk_size = chunk_size; - _msChunks->chunk_shift = __ilog2(chunk_size); - _msChunks->chunk_mask = (1UL<<_msChunks->chunk_shift)-1; - - mem = _ALIGN(mem, sizeof(msChunks_entry)); - _msChunks->abs = (msChunks_entry *)(mem + offset); - mem += num_chunks * sizeof(msChunks_entry); - - return mem; -} diff --git a/arch/ppc64/kernel/Makefile b/arch/ppc64/kernel/Makefile index 2ecccb6b4f8..f4b3bfcc109 100644 --- a/arch/ppc64/kernel/Makefile +++ b/arch/ppc64/kernel/Makefile @@ -11,7 +11,7 @@ obj-y := setup.o entry.o traps.o irq.o idle.o dma.o \ udbg.o binfmt_elf32.o sys_ppc32.o ioctl32.o \ ptrace32.o signal32.o rtc.o init_task.o \ lmb.o cputable.o cpu_setup_power4.o idle_power4.o \ - iommu.o sysfs.o vdso.o pmc.o + iommu.o sysfs.o vdso.o pmc.o firmware.o obj-y += vdso32/ vdso64/ obj-$(CONFIG_PPC_OF) += of_device.o @@ -50,7 +50,10 @@ obj-$(CONFIG_LPARCFG) += lparcfg.o obj-$(CONFIG_HVC_CONSOLE) += hvconsole.o obj-$(CONFIG_BOOTX_TEXT) += btext.o obj-$(CONFIG_HVCS) += hvcserver.o -obj-$(CONFIG_IBMVIO) += vio.o + +vio-obj-$(CONFIG_PPC_PSERIES) += pSeries_vio.o +vio-obj-$(CONFIG_PPC_ISERIES) += iSeries_vio.o +obj-$(CONFIG_IBMVIO) += vio.o $(vio-obj-y) obj-$(CONFIG_XICS) += xics.o obj-$(CONFIG_MPIC) += mpic.o diff --git a/arch/ppc64/kernel/asm-offsets.c b/arch/ppc64/kernel/asm-offsets.c index abb9e5b5da0..17e35d0fed0 100644 --- a/arch/ppc64/kernel/asm-offsets.c +++ b/arch/ppc64/kernel/asm-offsets.c @@ -94,7 +94,8 @@ int main(void) DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr)); DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id)); #ifdef CONFIG_HUGETLB_PAGE - DEFINE(PACAHTLBSEGS, offsetof(struct paca_struct, context.htlb_segs)); + DEFINE(PACALOWHTLBAREAS, offsetof(struct paca_struct, context.low_htlb_areas)); + DEFINE(PACAHIGHHTLBAREAS, offsetof(struct paca_struct, context.high_htlb_areas)); #endif /* CONFIG_HUGETLB_PAGE */ DEFINE(PACADEFAULTDECR, offsetof(struct paca_struct, default_decr)); DEFINE(PACA_EXGEN, offsetof(struct paca_struct, exgen)); diff --git a/arch/ppc64/kernel/cputable.c b/arch/ppc64/kernel/cputable.c index 77cec42f952..4847f2ac8c9 100644 --- a/arch/ppc64/kernel/cputable.c +++ b/arch/ppc64/kernel/cputable.c @@ -5,7 +5,7 @@ * * Modifications for ppc64: * Copyright (C) 2003 Dave Engebretsen <engebret@us.ibm.com> - * + * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version @@ -60,7 +60,6 @@ struct cpu_spec cpu_specs[] = { .icache_bsize = 128, .dcache_bsize = 128, .cpu_setup = __setup_cpu_power3, - .firmware_features = COMMON_PPC64_FW, }, { /* Power3+ */ .pvr_mask = 0xffff0000, @@ -73,7 +72,6 @@ struct cpu_spec cpu_specs[] = { .icache_bsize = 128, .dcache_bsize = 128, .cpu_setup = __setup_cpu_power3, - .firmware_features = COMMON_PPC64_FW, }, { /* Northstar */ .pvr_mask = 0xffff0000, @@ -86,7 +84,6 @@ struct cpu_spec cpu_specs[] = { .icache_bsize = 128, .dcache_bsize = 128, .cpu_setup = __setup_cpu_power3, - .firmware_features = COMMON_PPC64_FW, }, { /* Pulsar */ .pvr_mask = 0xffff0000, @@ -99,7 +96,6 @@ struct cpu_spec cpu_specs[] = { .icache_bsize = 128, .dcache_bsize = 128, .cpu_setup = __setup_cpu_power3, - .firmware_features = COMMON_PPC64_FW, }, { /* I-star */ .pvr_mask = 0xffff0000, @@ -112,7 +108,6 @@ struct cpu_spec cpu_specs[] = { .icache_bsize = 128, .dcache_bsize = 128, .cpu_setup = __setup_cpu_power3, - .firmware_features = COMMON_PPC64_FW, }, { /* S-star */ .pvr_mask = 0xffff0000, @@ -125,7 +120,6 @@ struct cpu_spec cpu_specs[] = { .icache_bsize = 128, .dcache_bsize = 128, .cpu_setup = __setup_cpu_power3, - .firmware_features = COMMON_PPC64_FW, }, { /* Power4 */ .pvr_mask = 0xffff0000, @@ -138,7 +132,6 @@ struct cpu_spec cpu_specs[] = { .icache_bsize = 128, .dcache_bsize = 128, .cpu_setup = __setup_cpu_power4, - .firmware_features = COMMON_PPC64_FW, }, { /* Power4+ */ .pvr_mask = 0xffff0000, @@ -151,7 +144,6 @@ struct cpu_spec cpu_specs[] = { .icache_bsize = 128, .dcache_bsize = 128, .cpu_setup = __setup_cpu_power4, - .firmware_features = COMMON_PPC64_FW, }, { /* PPC970 */ .pvr_mask = 0xffff0000, @@ -166,7 +158,6 @@ struct cpu_spec cpu_specs[] = { .icache_bsize = 128, .dcache_bsize = 128, .cpu_setup = __setup_cpu_ppc970, - .firmware_features = COMMON_PPC64_FW, }, { /* PPC970FX */ .pvr_mask = 0xffff0000, @@ -181,7 +172,6 @@ struct cpu_spec cpu_specs[] = { .icache_bsize = 128, .dcache_bsize = 128, .cpu_setup = __setup_cpu_ppc970, - .firmware_features = COMMON_PPC64_FW, }, { /* PPC970MP */ .pvr_mask = 0xffff0000, @@ -196,7 +186,6 @@ struct cpu_spec cpu_specs[] = { .icache_bsize = 128, .dcache_bsize = 128, .cpu_setup = __setup_cpu_ppc970, - .firmware_features = COMMON_PPC64_FW, }, { /* Power5 */ .pvr_mask = 0xffff0000, @@ -211,7 +200,6 @@ struct cpu_spec cpu_specs[] = { .icache_bsize = 128, .dcache_bsize = 128, .cpu_setup = __setup_cpu_power4, - .firmware_features = COMMON_PPC64_FW, }, { /* Power5 */ .pvr_mask = 0xffff0000, @@ -226,7 +214,6 @@ struct cpu_spec cpu_specs[] = { .icache_bsize = 128, .dcache_bsize = 128, .cpu_setup = __setup_cpu_power4, - .firmware_features = COMMON_PPC64_FW, }, { /* BE DD1.x */ .pvr_mask = 0xffff0000, @@ -241,7 +228,6 @@ struct cpu_spec cpu_specs[] = { .icache_bsize = 128, .dcache_bsize = 128, .cpu_setup = __setup_cpu_be, - .firmware_features = COMMON_PPC64_FW, }, { /* default match */ .pvr_mask = 0x00000000, @@ -254,29 +240,5 @@ struct cpu_spec cpu_specs[] = { .icache_bsize = 128, .dcache_bsize = 128, .cpu_setup = __setup_cpu_power4, - .firmware_features = COMMON_PPC64_FW, } }; - -firmware_feature_t firmware_features_table[FIRMWARE_MAX_FEATURES] = { - {FW_FEATURE_PFT, "hcall-pft"}, - {FW_FEATURE_TCE, "hcall-tce"}, - {FW_FEATURE_SPRG0, "hcall-sprg0"}, - {FW_FEATURE_DABR, "hcall-dabr"}, - {FW_FEATURE_COPY, "hcall-copy"}, - {FW_FEATURE_ASR, "hcall-asr"}, - {FW_FEATURE_DEBUG, "hcall-debug"}, - {FW_FEATURE_PERF, "hcall-perf"}, - {FW_FEATURE_DUMP, "hcall-dump"}, - {FW_FEATURE_INTERRUPT, "hcall-interrupt"}, - {FW_FEATURE_MIGRATE, "hcall-migrate"}, - {FW_FEATURE_PERFMON, "hcall-perfmon"}, - {FW_FEATURE_CRQ, "hcall-crq"}, - {FW_FEATURE_VIO, "hcall-vio"}, - {FW_FEATURE_RDMA, "hcall-rdma"}, - {FW_FEATURE_LLAN, "hcall-lLAN"}, - {FW_FEATURE_BULK, "hcall-bulk"}, - {FW_FEATURE_XDABR, "hcall-xdabr"}, - {FW_FEATURE_MULTITCE, "hcall-multi-tce"}, - {FW_FEATURE_SPLPAR, "hcall-splpar"}, -}; diff --git a/arch/ppc64/kernel/firmware.c b/arch/ppc64/kernel/firmware.c new file mode 100644 index 00000000000..d8432c0fb27 --- /dev/null +++ b/arch/ppc64/kernel/firmware.c @@ -0,0 +1,47 @@ +/* + * arch/ppc64/kernel/firmware.c + * + * Extracted from cputable.c + * + * Copyright (C) 2001 Ben. Herrenschmidt (benh@kernel.crashing.org) + * + * Modifications for ppc64: + * Copyright (C) 2003 Dave Engebretsen <engebret@us.ibm.com> + * Copyright (C) 2005 Stephen Rothwell, IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> + +#include <asm/firmware.h> + +unsigned long ppc64_firmware_features; + +#ifdef CONFIG_PPC_PSERIES +firmware_feature_t firmware_features_table[FIRMWARE_MAX_FEATURES] = { + {FW_FEATURE_PFT, "hcall-pft"}, + {FW_FEATURE_TCE, "hcall-tce"}, + {FW_FEATURE_SPRG0, "hcall-sprg0"}, + {FW_FEATURE_DABR, "hcall-dabr"}, + {FW_FEATURE_COPY, "hcall-copy"}, + {FW_FEATURE_ASR, "hcall-asr"}, + {FW_FEATURE_DEBUG, "hcall-debug"}, + {FW_FEATURE_PERF, "hcall-perf"}, + {FW_FEATURE_DUMP, "hcall-dump"}, + {FW_FEATURE_INTERRUPT, "hcall-interrupt"}, + {FW_FEATURE_MIGRATE, "hcall-migrate"}, + {FW_FEATURE_PERFMON, "hcall-perfmon"}, + {FW_FEATURE_CRQ, "hcall-crq"}, + {FW_FEATURE_VIO, "hcall-vio"}, + {FW_FEATURE_RDMA, "hcall-rdma"}, + {FW_FEATURE_LLAN, "hcall-lLAN"}, + {FW_FEATURE_BULK, "hcall-bulk"}, + {FW_FEATURE_XDABR, "hcall-xdabr"}, + {FW_FEATURE_MULTITCE, "hcall-multi-tce"}, + {FW_FEATURE_SPLPAR, "hcall-splpar"}, +}; +#endif diff --git a/arch/ppc64/kernel/head.S b/arch/ppc64/kernel/head.S index accaa052d31..03695977562 100644 --- a/arch/ppc64/kernel/head.S +++ b/arch/ppc64/kernel/head.S @@ -23,14 +23,11 @@ * 2 of the License, or (at your option) any later version. */ -#define SECONDARY_PROCESSORS - #include <linux/config.h> #include <linux/threads.h> #include <asm/processor.h> #include <asm/page.h> #include <asm/mmu.h> -#include <asm/naca.h> #include <asm/systemcfg.h> #include <asm/ppc_asm.h> #include <asm/offsets.h> @@ -45,18 +42,13 @@ #endif /* - * hcall interface to pSeries LPAR - */ -#define H_SET_ASR 0x30 - -/* * We layout physical memory as follows: * 0x0000 - 0x00ff : Secondary processor spin code * 0x0100 - 0x2fff : pSeries Interrupt prologs - * 0x3000 - 0x3fff : Interrupt support - * 0x4000 - 0x4fff : NACA - * 0x6000 : iSeries and common interrupt prologs - * 0x9000 - 0x9fff : Initial segment table + * 0x3000 - 0x5fff : interrupt support, iSeries and common interrupt prologs + * 0x6000 - 0x6fff : Initial (CPU0) segment table + * 0x7000 - 0x7fff : FWNMI data area + * 0x8000 - : Early init and support code */ /* @@ -94,6 +86,7 @@ END_FTR_SECTION(0, 1) /* Catch branch to 0 in real mode */ trap + #ifdef CONFIG_PPC_ISERIES /* * At offset 0x20, there is a pointer to iSeries LPAR data. @@ -103,12 +96,12 @@ END_FTR_SECTION(0, 1) .llong hvReleaseData-KERNELBASE /* - * At offset 0x28 and 0x30 are offsets to the msChunks + * At offset 0x28 and 0x30 are offsets to the mschunks_map * array (used by the iSeries LPAR debugger to do translation * between physical addresses and absolute addresses) and * to the pidhash table (also used by the debugger) */ - .llong msChunks-KERNELBASE + .llong mschunks_map-KERNELBASE .llong 0 /* pidhash-KERNELBASE SFRXXX */ /* Offset 0x38 - Pointer to start of embedded System.map */ @@ -120,7 +113,7 @@ embedded_sysmap_start: embedded_sysmap_end: .llong 0 -#else /* CONFIG_PPC_ISERIES */ +#endif /* CONFIG_PPC_ISERIES */ /* Secondary processors spin on this value until it goes to 1. */ .globl __secondary_hold_spinloop @@ -155,7 +148,7 @@ _GLOBAL(__secondary_hold) std r24,__secondary_hold_acknowledge@l(0) sync - /* All secondary cpu's wait here until told to start. */ + /* All secondary cpus wait here until told to start. */ 100: ld r4,__secondary_hold_spinloop@l(0) cmpdi 0,r4,1 bne 100b @@ -170,7 +163,6 @@ _GLOBAL(__secondary_hold) BUG_OPCODE #endif #endif -#endif /* This value is used to mark exception frames on the stack. */ .section ".toc","aw" @@ -502,33 +494,37 @@ system_call_pSeries: STD_EXCEPTION_PSERIES(0x1300, instruction_breakpoint) STD_EXCEPTION_PSERIES(0x1700, altivec_assist) + . = 0x3000 + +/*** pSeries interrupt support ***/ + /* moved from 0xf00 */ - STD_EXCEPTION_PSERIES(0x3000, performance_monitor) + STD_EXCEPTION_PSERIES(., performance_monitor) - . = 0x3100 + .align 7 _GLOBAL(do_stab_bolted_pSeries) mtcrf 0x80,r12 mfspr r12,SPRG2 EXCEPTION_PROLOG_PSERIES(PACA_EXSLB, .do_stab_bolted) - - /* Space for the naca. Architected to be located at real address - * NACA_PHYS_ADDR. Various tools rely on this location being fixed. - * The first dword of the naca is required by iSeries LPAR to - * point to itVpdAreas. On pSeries native, this value is not used. - */ - . = NACA_PHYS_ADDR - .globl __end_interrupts -__end_interrupts: -#ifdef CONFIG_PPC_ISERIES - .globl naca -naca: - .llong itVpdAreas - .llong 0 /* xRamDisk */ - .llong 0 /* xRamDiskSize */ +/* + * Vectors for the FWNMI option. Share common code. + */ + .globl system_reset_fwnmi +system_reset_fwnmi: + HMT_MEDIUM + mtspr SPRG1,r13 /* save r13 */ + RUNLATCH_ON(r13) + EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common) - . = 0x6100 + .globl machine_check_fwnmi +machine_check_fwnmi: + HMT_MEDIUM + mtspr SPRG1,r13 /* save r13 */ + RUNLATCH_ON(r13) + EXCEPTION_PROLOG_PSERIES(PACA_EXMC, machine_check_common) +#ifdef CONFIG_PPC_ISERIES /*** ISeries-LPAR interrupt handlers ***/ STD_EXCEPTION_ISERIES(0x200, machine_check, PACA_EXMC) @@ -626,9 +622,7 @@ system_reset_iSeries: cmpwi 0,r23,0 beq iSeries_secondary_smp_loop /* Loop until told to go */ -#ifdef SECONDARY_PROCESSORS bne .__secondary_start /* Loop until told to go */ -#endif iSeries_secondary_smp_loop: /* Let the Hypervisor know we are alive */ /* 8002 is a call to HvCallCfg::getLps, a harmless Hypervisor function */ @@ -671,51 +665,8 @@ hardware_interrupt_iSeries_masked: ld r13,PACA_EXGEN+EX_R13(r13) rfid b . /* prevent speculative execution */ -#endif - -/* - * Data area reserved for FWNMI option. - */ - .= 0x7000 - .globl fwnmi_data_area -fwnmi_data_area: - -#ifdef CONFIG_PPC_ISERIES - . = LPARMAP_PHYS -#include "lparmap.s" #endif /* CONFIG_PPC_ISERIES */ -/* - * Vectors for the FWNMI option. Share common code. - */ - . = 0x8000 - .globl system_reset_fwnmi -system_reset_fwnmi: - HMT_MEDIUM - mtspr SPRG1,r13 /* save r13 */ - RUNLATCH_ON(r13) - EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common) - .globl machine_check_fwnmi -machine_check_fwnmi: - HMT_MEDIUM - mtspr SPRG1,r13 /* save r13 */ - RUNLATCH_ON(r13) - EXCEPTION_PROLOG_PSERIES(PACA_EXMC, machine_check_common) - - /* - * Space for the initial segment table - * For LPAR, the hypervisor must fill in at least one entry - * before we get control (with relocate on) - */ - . = STAB0_PHYS_ADDR - .globl __start_stab -__start_stab: - - . = (STAB0_PHYS_ADDR + PAGE_SIZE) - .globl __end_stab -__end_stab: - - /*** Common interrupt handlers ***/ STD_EXCEPTION_COMMON(0x100, system_reset, .system_reset_exception) @@ -752,8 +703,8 @@ machine_check_common: * R9 contains the saved CR, r13 points to the paca, * r10 contains the (bad) kernel stack pointer, * r11 and r12 contain the saved SRR0 and SRR1. - * We switch to using the paca guard page as an emergency stack, - * save the registers there, and call kernel_bad_stack(), which panics. + * We switch to using an emergency stack, save the registers there, + * and call kernel_bad_stack(), which panics. */ bad_stack: ld r1,PACAEMERGSP(r13) @@ -906,6 +857,62 @@ fp_unavailable_common: bl .kernel_fp_unavailable_exception BUG_OPCODE +/* + * load_up_fpu(unused, unused, tsk) + * Disable FP for the task which had the FPU previously, + * and save its floating-point registers in its thread_struct. + * Enables the FPU for use in the kernel on return. + * On SMP we know the fpu is free, since we give it up every + * switch (ie, no lazy save of the FP registers). + * On entry: r13 == 'current' && last_task_used_math != 'current' + */ +_STATIC(load_up_fpu) + mfmsr r5 /* grab the current MSR */ + ori r5,r5,MSR_FP + mtmsrd r5 /* enable use of fpu now */ + isync +/* + * For SMP, we don't do lazy FPU switching because it just gets too + * horrendously complex, especially when a task switches from one CPU + * to another. Instead we call giveup_fpu in switch_to. + * + */ +#ifndef CONFIG_SMP + ld r3,last_task_used_math@got(r2) + ld r4,0(r3) + cmpdi 0,r4,0 + beq 1f + /* Save FP state to last_task_used_math's THREAD struct */ + addi r4,r4,THREAD + SAVE_32FPRS(0, r4) + mffs fr0 + stfd fr0,THREAD_FPSCR(r4) + /* Disable FP for last_task_used_math */ + ld r5,PT_REGS(r4) + ld r4,_MSR-STACK_FRAME_OVERHEAD(r5) + li r6,MSR_FP|MSR_FE0|MSR_FE1 + andc r4,r4,r6 + std r4,_MSR-STACK_FRAME_OVERHEAD(r5) +1: +#endif /* CONFIG_SMP */ + /* enable use of FP after return */ + ld r4,PACACURRENT(r13) + addi r5,r4,THREAD /* Get THREAD */ + ld r4,THREAD_FPEXC_MODE(r5) + ori r12,r12,MSR_FP + or r12,r12,r4 + std r12,_MSR(r1) + lfd fr0,THREAD_FPSCR(r5) + mtfsf 0xff,fr0 + REST_32FPRS(0, r5) +#ifndef CONFIG_SMP + /* Update last_task_used_math to 'current' */ + subi r4,r5,THREAD /* Back to 'current' */ + std r4,0(r3) +#endif /* CONFIG_SMP */ + /* restore registers and return */ + b fast_exception_return + .align 7 .globl altivec_unavailable_common altivec_unavailable_common: @@ -921,6 +928,80 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) bl .altivec_unavailable_exception b .ret_from_except +#ifdef CONFIG_ALTIVEC +/* + * load_up_altivec(unused, unused, tsk) + * Disable VMX for the task which had it previously, + * and save its vector registers in its thread_struct. + * Enables the VMX for use in the kernel on return. + * On SMP we know the VMX is free, since we give it up every + * switch (ie, no lazy save of the vector registers). + * On entry: r13 == 'current' && last_task_used_altivec != 'current' + */ +_STATIC(load_up_altivec) + mfmsr r5 /* grab the current MSR */ + oris r5,r5,MSR_VEC@h + mtmsrd r5 /* enable use of VMX now */ + isync + +/* + * For SMP, we don't do lazy VMX switching because it just gets too + * horrendously complex, especially when a task switches from one CPU + * to another. Instead we call giveup_altvec in switch_to. + * VRSAVE isn't dealt with here, that is done in the normal context + * switch code. Note that we could rely on vrsave value to eventually + * avoid saving all of the VREGs here... + */ +#ifndef CONFIG_SMP + ld r3,last_task_used_altivec@got(r2) + ld r4,0(r3) + cmpdi 0,r4,0 + beq 1f + /* Save VMX state to last_task_used_altivec's THREAD struct */ + addi r4,r4,THREAD + SAVE_32VRS(0,r5,r4) + mfvscr vr0 + li r10,THREAD_VSCR + stvx vr0,r10,r4 + /* Disable VMX for last_task_used_altivec */ + ld r5,PT_REGS(r4) + ld r4,_MSR-STACK_FRAME_OVERHEAD(r5) + lis r6,MSR_VEC@h + andc r4,r4,r6 + std r4,_MSR-STACK_FRAME_OVERHEAD(r5) +1: +#endif /* CONFIG_SMP */ + /* Hack: if we get an altivec unavailable trap with VRSAVE + * set to all zeros, we assume this is a broken application + * that fails to set it properly, and thus we switch it to + * all 1's + */ + mfspr r4,SPRN_VRSAVE + cmpdi 0,r4,0 + bne+ 1f + li r4,-1 + mtspr SPRN_VRSAVE,r4 +1: + /* enable use of VMX after return */ + ld r4,PACACURRENT(r13) + addi r5,r4,THREAD /* Get THREAD */ + oris r12,r12,MSR_VEC@h + std r12,_MSR(r1) + li r4,1 + li r10,THREAD_VSCR + stw r4,THREAD_USED_VR(r5) + lvx vr0,r10,r5 + mtvscr vr0 + REST_32VRS(0,r4,r5) +#ifndef CONFIG_SMP + /* Update last_task_used_math to 'current' */ + subi r4,r5,THREAD /* Back to 'current' */ + std r4,0(r3) +#endif /* CONFIG_SMP */ + /* restore registers and return */ + b fast_exception_return +#endif /* CONFIG_ALTIVEC */ + /* * Hash table stuff */ @@ -1167,6 +1248,42 @@ unrecov_slb: bl .unrecoverable_exception b 1b +/* + * Space for CPU0's segment table. + * + * On iSeries, the hypervisor must fill in at least one entry before + * we get control (with relocate on). The address is give to the hv + * as a page number (see xLparMap in LparData.c), so this must be at a + * fixed address (the linker can't compute (u64)&initial_stab >> + * PAGE_SHIFT). + */ + . = STAB0_PHYS_ADDR /* 0x6000 */ + .globl initial_stab +initial_stab: + .space 4096 + +/* + * Data area reserved for FWNMI option. + * This address (0x7000) is fixed by the RPA. + */ + .= 0x7000 + .globl fwnmi_data_area +fwnmi_data_area: + + /* iSeries does not use the FWNMI stuff, so it is safe to put + * this here, even if we later allow kernels that will boot on + * both pSeries and iSeries */ +#ifdef CONFIG_PPC_ISERIES + . = LPARMAP_PHYS +#include "lparmap.s" +/* + * This ".text" is here for old compilers that generate a trailing + * .note section when compiling .c files to .s + */ + .text +#endif /* CONFIG_PPC_ISERIES */ + + . = 0x8000 /* * On pSeries, secondary processors spin in the following code. @@ -1200,7 +1317,7 @@ _GLOBAL(pSeries_secondary_smp_init) b .kexec_wait /* next kernel might do better */ 2: mtspr SPRG3,r13 /* Save vaddr of paca in SPRG3 */ - /* From now on, r24 is expected to be logica cpuid */ + /* From now on, r24 is expected to be logical cpuid */ mr r24,r5 3: HMT_LOW lbz r23,PACAPROCSTART(r13) /* Test if this processor should */ @@ -1213,10 +1330,8 @@ _GLOBAL(pSeries_secondary_smp_init) cmpwi 0,r23,0 #ifdef CONFIG_SMP -#ifdef SECONDARY_PROCESSORS bne .__secondary_start #endif -#endif b 3b /* Loop until told to go */ #ifdef CONFIG_PPC_ISERIES @@ -1430,228 +1545,6 @@ _GLOBAL(copy_and_flush) .align 8 copy_to_here: -/* - * load_up_fpu(unused, unused, tsk) - * Disable FP for the task which had the FPU previously, - * and save its floating-point registers in its thread_struct. - * Enables the FPU for use in the kernel on return. - * On SMP we know the fpu is free, since we give it up every - * switch (ie, no lazy save of the FP registers). - * On entry: r13 == 'current' && last_task_used_math != 'current' - */ -_STATIC(load_up_fpu) - mfmsr r5 /* grab the current MSR */ - ori r5,r5,MSR_FP - mtmsrd r5 /* enable use of fpu now */ - isync -/* - * For SMP, we don't do lazy FPU switching because it just gets too - * horrendously complex, especially when a task switches from one CPU - * to another. Instead we call giveup_fpu in switch_to. - * - */ -#ifndef CONFIG_SMP - ld r3,last_task_used_math@got(r2) - ld r4,0(r3) - cmpdi 0,r4,0 - beq 1f - /* Save FP state to last_task_used_math's THREAD struct */ - addi r4,r4,THREAD - SAVE_32FPRS(0, r4) - mffs fr0 - stfd fr0,THREAD_FPSCR(r4) - /* Disable FP for last_task_used_math */ - ld r5,PT_REGS(r4) - ld r4,_MSR-STACK_FRAME_OVERHEAD(r5) - li r6,MSR_FP|MSR_FE0|MSR_FE1 - andc r4,r4,r6 - std r4,_MSR-STACK_FRAME_OVERHEAD(r5) -1: -#endif /* CONFIG_SMP */ - /* enable use of FP after return */ - ld r4,PACACURRENT(r13) - addi r5,r4,THREAD /* Get THREAD */ - ld r4,THREAD_FPEXC_MODE(r5) - ori r12,r12,MSR_FP - or r12,r12,r4 - std r12,_MSR(r1) - lfd fr0,THREAD_FPSCR(r5) - mtfsf 0xff,fr0 - REST_32FPRS(0, r5) -#ifndef CONFIG_SMP - /* Update last_task_used_math to 'current' */ - subi r4,r5,THREAD /* Back to 'current' */ - std r4,0(r3) -#endif /* CONFIG_SMP */ - /* restore registers and return */ - b fast_exception_return - -/* - * disable_kernel_fp() - * Disable the FPU. - */ -_GLOBAL(disable_kernel_fp) - mfmsr r3 - rldicl r0,r3,(63-MSR_FP_LG),1 - rldicl r3,r0,(MSR_FP_LG+1),0 - mtmsrd r3 /* disable use of fpu now */ - isync - blr - -/* - * giveup_fpu(tsk) - * Disable FP for the task given as the argument, - * and save the floating-point registers in its thread_struct. - * Enables the FPU for use in the kernel on return. - */ -_GLOBAL(giveup_fpu) - mfmsr r5 - ori r5,r5,MSR_FP - mtmsrd r5 /* enable use of fpu now */ - isync - cmpdi 0,r3,0 - beqlr- /* if no previous owner, done */ - addi r3,r3,THREAD /* want THREAD of task */ - ld r5,PT_REGS(r3) - cmpdi 0,r5,0 - SAVE_32FPRS(0, r3) - mffs fr0 - stfd fr0,THREAD_FPSCR(r3) - beq 1f - ld r4,_MSR-STACK_FRAME_OVERHEAD(r5) - li r3,MSR_FP|MSR_FE0|MSR_FE1 - andc r4,r4,r3 /* disable FP for previous task */ - std r4,_MSR-STACK_FRAME_OVERHEAD(r5) -1: -#ifndef CONFIG_SMP - li r5,0 - ld r4,last_task_used_math@got(r2) - std r5,0(r4) -#endif /* CONFIG_SMP */ - blr - - -#ifdef CONFIG_ALTIVEC - -/* - * load_up_altivec(unused, unused, tsk) - * Disable VMX for the task which had it previously, - * and save its vector registers in its thread_struct. - * Enables the VMX for use in the kernel on return. - * On SMP we know the VMX is free, since we give it up every - * switch (ie, no lazy save of the vector registers). - * On entry: r13 == 'current' && last_task_used_altivec != 'current' - */ -_STATIC(load_up_altivec) - mfmsr r5 /* grab the current MSR */ - oris r5,r5,MSR_VEC@h - mtmsrd r5 /* enable use of VMX now */ - isync - -/* - * For SMP, we don't do lazy VMX switching because it just gets too - * horrendously complex, especially when a task switches from one CPU - * to another. Instead we call giveup_altvec in switch_to. - * VRSAVE isn't dealt with here, that is done in the normal context - * switch code. Note that we could rely on vrsave value to eventually - * avoid saving all of the VREGs here... - */ -#ifndef CONFIG_SMP - ld r3,last_task_used_altivec@got(r2) - ld r4,0(r3) - cmpdi 0,r4,0 - beq 1f - /* Save VMX state to last_task_used_altivec's THREAD struct */ - addi r4,r4,THREAD - SAVE_32VRS(0,r5,r4) - mfvscr vr0 - li r10,THREAD_VSCR - stvx vr0,r10,r4 - /* Disable VMX for last_task_used_altivec */ - ld r5,PT_REGS(r4) - ld r4,_MSR-STACK_FRAME_OVERHEAD(r5) - lis r6,MSR_VEC@h - andc r4,r4,r6 - std r4,_MSR-STACK_FRAME_OVERHEAD(r5) -1: -#endif /* CONFIG_SMP */ - /* Hack: if we get an altivec unavailable trap with VRSAVE - * set to all zeros, we assume this is a broken application - * that fails to set it properly, and thus we switch it to - * all 1's - */ - mfspr r4,SPRN_VRSAVE - cmpdi 0,r4,0 - bne+ 1f - li r4,-1 - mtspr SPRN_VRSAVE,r4 -1: - /* enable use of VMX after return */ - ld r4,PACACURRENT(r13) - addi r5,r4,THREAD /* Get THREAD */ - oris r12,r12,MSR_VEC@h - std r12,_MSR(r1) - li r4,1 - li r10,THREAD_VSCR - stw r4,THREAD_USED_VR(r5) - lvx vr0,r10,r5 - mtvscr vr0 - REST_32VRS(0,r4,r5) -#ifndef CONFIG_SMP - /* Update last_task_used_math to 'current' */ - subi r4,r5,THREAD /* Back to 'current' */ - std r4,0(r3) -#endif /* CONFIG_SMP */ - /* restore registers and return */ - b fast_exception_return - -/* - * disable_kernel_altivec() - * Disable the VMX. - */ -_GLOBAL(disable_kernel_altivec) - mfmsr r3 - rldicl r0,r3,(63-MSR_VEC_LG),1 - rldicl r3,r0,(MSR_VEC_LG+1),0 - mtmsrd r3 /* disable use of VMX now */ - isync - blr - -/* - * giveup_altivec(tsk) - * Disable VMX for the task given as the argument, - * and save the vector registers in its thread_struct. - * Enables the VMX for use in the kernel on return. - */ -_GLOBAL(giveup_altivec) - mfmsr r5 - oris r5,r5,MSR_VEC@h - mtmsrd r5 /* enable use of VMX now */ - isync - cmpdi 0,r3,0 - beqlr- /* if no previous owner, done */ - addi r3,r3,THREAD /* want THREAD of task */ - ld r5,PT_REGS(r3) - cmpdi 0,r5,0 - SAVE_32VRS(0,r4,r3) - mfvscr vr0 - li r4,THREAD_VSCR - stvx vr0,r4,r3 - beq 1f - ld r4,_MSR-STACK_FRAME_OVERHEAD(r5) - lis r3,MSR_VEC@h - andc r4,r4,r3 /* disable FP for previous task */ - std r4,_MSR-STACK_FRAME_OVERHEAD(r5) -1: -#ifndef CONFIG_SMP - li r5,0 - ld r4,last_task_used_altivec@got(r2) - std r5,0(r4) -#endif /* CONFIG_SMP */ - blr - -#endif /* CONFIG_ALTIVEC */ - #ifdef CONFIG_SMP #ifdef CONFIG_PPC_PMAC /* @@ -2002,9 +1895,6 @@ _STATIC(start_here_common) bl .start_kernel -_GLOBAL(__setup_cpu_power3) - blr - _GLOBAL(hmt_init) #ifdef CONFIG_HMT LOADADDR(r5, hmt_thread_data) @@ -2095,20 +1985,19 @@ _GLOBAL(smp_release_cpus) /* * We put a few things here that have to be page-aligned. - * This stuff goes at the beginning of the data segment, - * which is page-aligned. + * This stuff goes at the beginning of the bss, which is page-aligned. */ - .data + .section ".bss" + .align 12 - .globl sdata -sdata: + .globl empty_zero_page empty_zero_page: - .space 4096 + .space PAGE_SIZE .globl swapper_pg_dir swapper_pg_dir: - .space 4096 + .space PAGE_SIZE /* * This space gets a copy of optional info passed to us by the bootstrap diff --git a/arch/ppc64/kernel/iSeries_htab.c b/arch/ppc64/kernel/iSeries_htab.c index b0250ae4a72..2192055a90a 100644 --- a/arch/ppc64/kernel/iSeries_htab.c +++ b/arch/ppc64/kernel/iSeries_htab.c @@ -41,6 +41,7 @@ static long iSeries_hpte_insert(unsigned long hpte_group, unsigned long va, unsigned long prpn, unsigned long vflags, unsigned long rflags) { + unsigned long arpn; long slot; hpte_t lhpte; int secondary = 0; @@ -70,8 +71,10 @@ static long iSeries_hpte_insert(unsigned long hpte_group, unsigned long va, slot &= 0x7fffffffffffffff; } + arpn = phys_to_abs(prpn << PAGE_SHIFT) >> PAGE_SHIFT; + lhpte.v = (va >> 23) << HPTE_V_AVPN_SHIFT | vflags | HPTE_V_VALID; - lhpte.r = (physRpn_to_absRpn(prpn) << HPTE_R_RPN_SHIFT) | rflags; + lhpte.r = (arpn << HPTE_R_RPN_SHIFT) | rflags; /* Now fill in the actual HPTE */ HvCallHpt_addValidate(slot, secondary, &lhpte); diff --git a/arch/ppc64/kernel/iSeries_setup.c b/arch/ppc64/kernel/iSeries_setup.c index a649edbb23b..3ffefbbc662 100644 --- a/arch/ppc64/kernel/iSeries_setup.c +++ b/arch/ppc64/kernel/iSeries_setup.c @@ -39,6 +39,7 @@ #include <asm/cputable.h> #include <asm/sections.h> #include <asm/iommu.h> +#include <asm/firmware.h> #include <asm/time.h> #include "iSeries_setup.h" @@ -314,6 +315,8 @@ static void __init iSeries_init_early(void) DBG(" -> iSeries_init_early()\n"); + ppc64_firmware_features = FW_FEATURE_ISERIES; + ppcdbg_initialize(); #if defined(CONFIG_BLK_DEV_INITRD) @@ -412,6 +415,22 @@ static void __init iSeries_init_early(void) DBG(" <- iSeries_init_early()\n"); } +struct mschunks_map mschunks_map = { + /* XXX We don't use these, but Piranha might need them. */ + .chunk_size = MSCHUNKS_CHUNK_SIZE, + .chunk_shift = MSCHUNKS_CHUNK_SHIFT, + .chunk_mask = MSCHUNKS_OFFSET_MASK, +}; +EXPORT_SYMBOL(mschunks_map); + +void mschunks_alloc(unsigned long num_chunks) +{ + klimit = _ALIGN(klimit, sizeof(u32)); + mschunks_map.mapping = (u32 *)klimit; + klimit += num_chunks * sizeof(u32); + mschunks_map.num_chunks = num_chunks; +} + /* * The iSeries may have very large memories ( > 128 GB ) and a partition * may get memory in "chunks" that may be anywhere in the 2**52 real @@ -449,7 +468,7 @@ static void __init build_iSeries_Memory_Map(void) /* Chunk size on iSeries is 256K bytes */ totalChunks = (u32)HvLpConfig_getMsChunks(); - klimit = msChunks_alloc(klimit, totalChunks, 1UL << 18); + mschunks_alloc(totalChunks); /* * Get absolute address of our load area @@ -486,7 +505,7 @@ static void __init build_iSeries_Memory_Map(void) printk("Load area size %dK\n", loadAreaSize * 256); for (nextPhysChunk = 0; nextPhysChunk < loadAreaSize; ++nextPhysChunk) - msChunks.abs[nextPhysChunk] = + mschunks_map.mapping[nextPhysChunk] = loadAreaFirstChunk + nextPhysChunk; /* @@ -495,7 +514,7 @@ static void __init build_iSeries_Memory_Map(void) */ hptFirstChunk = (u32)addr_to_chunk(HvCallHpt_getHptAddress()); hptSizePages = (u32)HvCallHpt_getHptPages(); - hptSizeChunks = hptSizePages >> (msChunks.chunk_shift - PAGE_SHIFT); + hptSizeChunks = hptSizePages >> (MSCHUNKS_CHUNK_SHIFT - PAGE_SHIFT); hptLastChunk = hptFirstChunk + hptSizeChunks - 1; printk("HPT absolute addr = %016lx, size = %dK\n", @@ -552,7 +571,8 @@ static void __init build_iSeries_Memory_Map(void) (absChunk > hptLastChunk)) && ((absChunk < loadAreaFirstChunk) || (absChunk > loadAreaLastChunk))) { - msChunks.abs[nextPhysChunk] = absChunk; + mschunks_map.mapping[nextPhysChunk] = + absChunk; ++nextPhysChunk; } } @@ -944,6 +964,8 @@ void __init iSeries_early_setup(void) ppc_md.calibrate_decr = iSeries_calibrate_decr; ppc_md.progress = iSeries_progress; + /* XXX Implement enable_pmcs for iSeries */ + if (get_paca()->lppaca.shared_proc) { ppc_md.idle_loop = iseries_shared_idle; printk(KERN_INFO "Using shared processor idle loop\n"); diff --git a/arch/ppc64/kernel/iSeries_vio.c b/arch/ppc64/kernel/iSeries_vio.c new file mode 100644 index 00000000000..6b754b0c834 --- /dev/null +++ b/arch/ppc64/kernel/iSeries_vio.c @@ -0,0 +1,155 @@ +/* + * IBM PowerPC iSeries Virtual I/O Infrastructure Support. + * + * Copyright (c) 2005 Stephen Rothwell, IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/types.h> +#include <linux/device.h> +#include <linux/init.h> + +#include <asm/vio.h> +#include <asm/iommu.h> +#include <asm/abs_addr.h> +#include <asm/page.h> +#include <asm/iSeries/vio.h> +#include <asm/iSeries/HvTypes.h> +#include <asm/iSeries/HvLpConfig.h> +#include <asm/iSeries/HvCallXm.h> + +struct device *iSeries_vio_dev = &vio_bus_device.dev; +EXPORT_SYMBOL(iSeries_vio_dev); + +static struct iommu_table veth_iommu_table; +static struct iommu_table vio_iommu_table; + +static void __init iommu_vio_init(void) +{ + struct iommu_table *t; + struct iommu_table_cb cb; + unsigned long cbp; + unsigned long itc_entries; + + cb.itc_busno = 255; /* Bus 255 is the virtual bus */ + cb.itc_virtbus = 0xff; /* Ask for virtual bus */ + + cbp = virt_to_abs(&cb); + HvCallXm_getTceTableParms(cbp); + + itc_entries = cb.itc_size * PAGE_SIZE / sizeof(union tce_entry); + veth_iommu_table.it_size = itc_entries / 2; + veth_iommu_table.it_busno = cb.itc_busno; + veth_iommu_table.it_offset = cb.itc_offset; + veth_iommu_table.it_index = cb.itc_index; + veth_iommu_table.it_type = TCE_VB; + veth_iommu_table.it_blocksize = 1; + + t = iommu_init_table(&veth_iommu_table); + + if (!t) + printk("Virtual Bus VETH TCE table failed.\n"); + + vio_iommu_table.it_size = itc_entries - veth_iommu_table.it_size; + vio_iommu_table.it_busno = cb.itc_busno; + vio_iommu_table.it_offset = cb.itc_offset + + veth_iommu_table.it_size; + vio_iommu_table.it_index = cb.itc_index; + vio_iommu_table.it_type = TCE_VB; + vio_iommu_table.it_blocksize = 1; + + t = iommu_init_table(&vio_iommu_table); + + if (!t) + printk("Virtual Bus VIO TCE table failed.\n"); +} + +/** + * vio_register_device_iseries: - Register a new iSeries vio device. + * @voidev: The device to register. + */ +static struct vio_dev *__init vio_register_device_iseries(char *type, + uint32_t unit_num) +{ + struct vio_dev *viodev; + + /* allocate a vio_dev for this device */ + viodev = kmalloc(sizeof(struct vio_dev), GFP_KERNEL); + if (!viodev) + return NULL; + memset(viodev, 0, sizeof(struct vio_dev)); + + snprintf(viodev->dev.bus_id, BUS_ID_SIZE, "%s%d", type, unit_num); + + viodev->name = viodev->dev.bus_id; + viodev->type = type; + viodev->unit_address = unit_num; + viodev->iommu_table = &vio_iommu_table; + if (vio_register_device(viodev) == NULL) { + kfree(viodev); + return NULL; + } + return viodev; +} + +void __init probe_bus_iseries(void) +{ + HvLpIndexMap vlan_map; + struct vio_dev *viodev; + int i; + + /* there is only one of each of these */ + vio_register_device_iseries("viocons", 0); + vio_register_device_iseries("vscsi", 0); + + vlan_map = HvLpConfig_getVirtualLanIndexMap(); + for (i = 0; i < HVMAXARCHITECTEDVIRTUALLANS; i++) { + if ((vlan_map & (0x8000 >> i)) == 0) + continue; + viodev = vio_register_device_iseries("vlan", i); + /* veth is special and has it own iommu_table */ + viodev->iommu_table = &veth_iommu_table; + } + for (i = 0; i < HVMAXARCHITECTEDVIRTUALDISKS; i++) + vio_register_device_iseries("viodasd", i); + for (i = 0; i < HVMAXARCHITECTEDVIRTUALCDROMS; i++) + vio_register_device_iseries("viocd", i); + for (i = 0; i < HVMAXARCHITECTEDVIRTUALTAPES; i++) + vio_register_device_iseries("viotape", i); +} + +/** + * vio_match_device_iseries: - Tell if a iSeries VIO device matches a + * vio_device_id + */ +static int vio_match_device_iseries(const struct vio_device_id *id, + const struct vio_dev *dev) +{ + return strncmp(dev->type, id->type, strlen(id->type)) == 0; +} + +static struct vio_bus_ops vio_bus_ops_iseries = { + .match = vio_match_device_iseries, +}; + +/** + * vio_bus_init_iseries: - Initialize the iSeries virtual IO bus + */ +static int __init vio_bus_init_iseries(void) +{ + int err; + + err = vio_bus_init(&vio_bus_ops_iseries); + if (err == 0) { + iommu_vio_init(); + vio_bus_device.iommu_table = &vio_iommu_table; + iSeries_vio_dev = &vio_bus_device.dev; + probe_bus_iseries(); + } + return err; +} + +__initcall(vio_bus_init_iseries); diff --git a/arch/ppc64/kernel/lmb.c b/arch/ppc64/kernel/lmb.c index d6c6bd03d2a..5adaca2ddc9 100644 --- a/arch/ppc64/kernel/lmb.c +++ b/arch/ppc64/kernel/lmb.c @@ -28,33 +28,28 @@ void lmb_dump_all(void) { #ifdef DEBUG unsigned long i; - struct lmb *_lmb = &lmb; udbg_printf("lmb_dump_all:\n"); udbg_printf(" memory.cnt = 0x%lx\n", - _lmb->memory.cnt); + lmb.memory.cnt); udbg_printf(" memory.size = 0x%lx\n", - _lmb->memory.size); - for (i=0; i < _lmb->memory.cnt ;i++) { + lmb.memory.size); + for (i=0; i < lmb.memory.cnt ;i++) { udbg_printf(" memory.region[0x%x].base = 0x%lx\n", - i, _lmb->memory.region[i].base); - udbg_printf(" .physbase = 0x%lx\n", - _lmb->memory.region[i].physbase); + i, lmb.memory.region[i].base); udbg_printf(" .size = 0x%lx\n", - _lmb->memory.region[i].size); + lmb.memory.region[i].size); } udbg_printf("\n reserved.cnt = 0x%lx\n", - _lmb->reserved.cnt); + lmb.reserved.cnt); udbg_printf(" reserved.size = 0x%lx\n", - _lmb->reserved.size); - for (i=0; i < _lmb->reserved.cnt ;i++) { + lmb.reserved.size); + for (i=0; i < lmb.reserved.cnt ;i++) { udbg_printf(" reserved.region[0x%x].base = 0x%lx\n", - i, _lmb->reserved.region[i].base); - udbg_printf(" .physbase = 0x%lx\n", - _lmb->reserved.region[i].physbase); + i, lmb.reserved.region[i].base); udbg_printf(" .size = 0x%lx\n", - _lmb->reserved.region[i].size); + lmb.reserved.region[i].size); } #endif /* DEBUG */ } @@ -98,7 +93,6 @@ lmb_coalesce_regions(struct lmb_region *rgn, unsigned long r1, unsigned long r2) rgn->region[r1].size += rgn->region[r2].size; for (i=r2; i < rgn->cnt-1; i++) { rgn->region[i].base = rgn->region[i+1].base; - rgn->region[i].physbase = rgn->region[i+1].physbase; rgn->region[i].size = rgn->region[i+1].size; } rgn->cnt--; @@ -108,49 +102,29 @@ lmb_coalesce_regions(struct lmb_region *rgn, unsigned long r1, unsigned long r2) void __init lmb_init(void) { - struct lmb *_lmb = &lmb; - /* Create a dummy zero size LMB which will get coalesced away later. * This simplifies the lmb_add() code below... */ - _lmb->memory.region[0].base = 0; - _lmb->memory.region[0].size = 0; - _lmb->memory.cnt = 1; + lmb.memory.region[0].base = 0; + lmb.memory.region[0].size = 0; + lmb.memory.cnt = 1; /* Ditto. */ - _lmb->reserved.region[0].base = 0; - _lmb->reserved.region[0].size = 0; - _lmb->reserved.cnt = 1; + lmb.reserved.region[0].base = 0; + lmb.reserved.region[0].size = 0; + lmb.reserved.cnt = 1; } /* This routine called with relocation disabled. */ void __init lmb_analyze(void) { - unsigned long i; - unsigned long mem_size = 0; - unsigned long size_mask = 0; - struct lmb *_lmb = &lmb; -#ifdef CONFIG_MSCHUNKS - unsigned long physbase = 0; -#endif - - for (i=0; i < _lmb->memory.cnt; i++) { - unsigned long lmb_size; - - lmb_size = _lmb->memory.region[i].size; - -#ifdef CONFIG_MSCHUNKS - _lmb->memory.region[i].physbase = physbase; - physbase += lmb_size; -#else - _lmb->memory.region[i].physbase = _lmb->memory.region[i].base; -#endif - mem_size += lmb_size; - size_mask |= lmb_size; - } + int i; + + lmb.memory.size = 0; - _lmb->memory.size = mem_size; + for (i = 0; i < lmb.memory.cnt; i++) + lmb.memory.size += lmb.memory.region[i].size; } /* This routine called with relocation disabled. */ @@ -168,7 +142,6 @@ lmb_add_region(struct lmb_region *rgn, unsigned long base, unsigned long size) adjacent = lmb_addrs_adjacent(base,size,rgnbase,rgnsize); if ( adjacent > 0 ) { rgn->region[i].base -= size; - rgn->region[i].physbase -= size; rgn->region[i].size += size; coalesced++; break; @@ -195,11 +168,9 @@ lmb_add_region(struct lmb_region *rgn, unsigned long base, unsigned long size) for (i=rgn->cnt-1; i >= 0; i--) { if (base < rgn->region[i].base) { rgn->region[i+1].base = rgn->region[i].base; - rgn->region[i+1].physbase = rgn->region[i].physbase; rgn->region[i+1].size = rgn->region[i].size; } else { rgn->region[i+1].base = base; - rgn->region[i+1].physbase = lmb_abs_to_phys(base); rgn->region[i+1].size = size; break; } @@ -213,12 +184,11 @@ lmb_add_region(struct lmb_region *rgn, unsigned long base, unsigned long size) long __init lmb_add(unsigned long base, unsigned long size) { - struct lmb *_lmb = &lmb; - struct lmb_region *_rgn = &(_lmb->memory); + struct lmb_region *_rgn = &(lmb.memory); /* On pSeries LPAR systems, the first LMB is our RMO region. */ if ( base == 0 ) - _lmb->rmo_size = size; + lmb.rmo_size = size; return lmb_add_region(_rgn, base, size); @@ -227,8 +197,7 @@ lmb_add(unsigned long base, unsigned long size) long __init lmb_reserve(unsigned long base, unsigned long size) { - struct lmb *_lmb = &lmb; - struct lmb_region *_rgn = &(_lmb->reserved); + struct lmb_region *_rgn = &(lmb.reserved); return lmb_add_region(_rgn, base, size); } @@ -260,13 +229,10 @@ lmb_alloc_base(unsigned long size, unsigned long align, unsigned long max_addr) { long i, j; unsigned long base = 0; - struct lmb *_lmb = &lmb; - struct lmb_region *_mem = &(_lmb->memory); - struct lmb_region *_rsv = &(_lmb->reserved); - for (i=_mem->cnt-1; i >= 0; i--) { - unsigned long lmbbase = _mem->region[i].base; - unsigned long lmbsize = _mem->region[i].size; + for (i=lmb.memory.cnt-1; i >= 0; i--) { + unsigned long lmbbase = lmb.memory.region[i].base; + unsigned long lmbsize = lmb.memory.region[i].size; if ( max_addr == LMB_ALLOC_ANYWHERE ) base = _ALIGN_DOWN(lmbbase+lmbsize-size, align); @@ -276,8 +242,8 @@ lmb_alloc_base(unsigned long size, unsigned long align, unsigned long max_addr) continue; while ( (lmbbase <= base) && - ((j = lmb_overlaps_region(_rsv,base,size)) >= 0) ) { - base = _ALIGN_DOWN(_rsv->region[j].base-size, align); + ((j = lmb_overlaps_region(&lmb.reserved,base,size)) >= 0) ) { + base = _ALIGN_DOWN(lmb.reserved.region[j].base-size, align); } if ( (base != 0) && (lmbbase <= base) ) @@ -287,62 +253,24 @@ lmb_alloc_base(unsigned long size, unsigned long align, unsigned long max_addr) if ( i < 0 ) return 0; - lmb_add_region(_rsv, base, size); + lmb_add_region(&lmb.reserved, base, size); return base; } +/* You must call lmb_analyze() before this. */ unsigned long __init lmb_phys_mem_size(void) { - struct lmb *_lmb = &lmb; -#ifdef CONFIG_MSCHUNKS - return _lmb->memory.size; -#else - struct lmb_region *_mem = &(_lmb->memory); - unsigned long total = 0; - int i; - - /* add all physical memory to the bootmem map */ - for (i=0; i < _mem->cnt; i++) - total += _mem->region[i].size; - return total; -#endif /* CONFIG_MSCHUNKS */ + return lmb.memory.size; } unsigned long __init lmb_end_of_DRAM(void) { - struct lmb *_lmb = &lmb; - struct lmb_region *_mem = &(_lmb->memory); - int idx = _mem->cnt - 1; - -#ifdef CONFIG_MSCHUNKS - return (_mem->region[idx].physbase + _mem->region[idx].size); -#else - return (_mem->region[idx].base + _mem->region[idx].size); -#endif /* CONFIG_MSCHUNKS */ - - return 0; -} - -unsigned long __init -lmb_abs_to_phys(unsigned long aa) -{ - unsigned long i, pa = aa; - struct lmb *_lmb = &lmb; - struct lmb_region *_mem = &(_lmb->memory); - - for (i=0; i < _mem->cnt; i++) { - unsigned long lmbbase = _mem->region[i].base; - unsigned long lmbsize = _mem->region[i].size; - if ( lmb_addrs_overlap(aa,1,lmbbase,lmbsize) ) { - pa = _mem->region[i].physbase + (aa - lmbbase); - break; - } - } + int idx = lmb.memory.cnt - 1; - return pa; + return (lmb.memory.region[idx].base + lmb.memory.region[idx].size); } /* @@ -353,20 +281,19 @@ void __init lmb_enforce_memory_limit(void) { extern unsigned long memory_limit; unsigned long i, limit; - struct lmb_region *mem = &(lmb.memory); if (! memory_limit) return; limit = memory_limit; - for (i = 0; i < mem->cnt; i++) { - if (limit > mem->region[i].size) { - limit -= mem->region[i].size; + for (i = 0; i < lmb.memory.cnt; i++) { + if (limit > lmb.memory.region[i].size) { + limit -= lmb.memory.region[i].size; continue; } - mem->region[i].size = limit; - mem->cnt = i + 1; + lmb.memory.region[i].size = limit; + lmb.memory.cnt = i + 1; break; } } diff --git a/arch/ppc64/kernel/lparcfg.c b/arch/ppc64/kernel/lparcfg.c index 02e96627fa6..edad361a8db 100644 --- a/arch/ppc64/kernel/lparcfg.c +++ b/arch/ppc64/kernel/lparcfg.c @@ -29,7 +29,7 @@ #include <asm/iSeries/HvLpConfig.h> #include <asm/lppaca.h> #include <asm/hvcall.h> -#include <asm/cputable.h> +#include <asm/firmware.h> #include <asm/rtas.h> #include <asm/system.h> #include <asm/time.h> @@ -273,6 +273,7 @@ static void parse_system_parameter_string(struct seq_file *m) if (!workbuffer) { printk(KERN_ERR "%s %s kmalloc failure at line %d \n", __FILE__, __FUNCTION__, __LINE__); + kfree(local_buffer); return; } #ifdef LPARCFG_DEBUG @@ -377,7 +378,7 @@ static int lparcfg_data(struct seq_file *m, void *v) partition_active_processors = lparcfg_count_active_processors(); - if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR) { + if (firmware_has_feature(FW_FEATURE_SPLPAR)) { unsigned long h_entitled, h_unallocated; unsigned long h_aggregation, h_resource; unsigned long pool_idle_time, pool_procs; @@ -571,7 +572,7 @@ int __init lparcfg_init(void) mode_t mode = S_IRUSR; /* Allow writing if we have FW_FEATURE_SPLPAR */ - if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR) { + if (firmware_has_feature(FW_FEATURE_SPLPAR)) { lparcfg_fops.write = lparcfg_write; mode |= S_IWUSR; } diff --git a/arch/ppc64/kernel/misc.S b/arch/ppc64/kernel/misc.S index a05b50b738e..474df0a862b 100644 --- a/arch/ppc64/kernel/misc.S +++ b/arch/ppc64/kernel/misc.S @@ -680,6 +680,104 @@ _GLOBAL(kernel_thread) ld r30,-16(r1) blr +/* + * disable_kernel_fp() + * Disable the FPU. + */ +_GLOBAL(disable_kernel_fp) + mfmsr r3 + rldicl r0,r3,(63-MSR_FP_LG),1 + rldicl r3,r0,(MSR_FP_LG+1),0 + mtmsrd r3 /* disable use of fpu now */ + isync + blr + +/* + * giveup_fpu(tsk) + * Disable FP for the task given as the argument, + * and save the floating-point registers in its thread_struct. + * Enables the FPU for use in the kernel on return. + */ +_GLOBAL(giveup_fpu) + mfmsr r5 + ori r5,r5,MSR_FP + mtmsrd r5 /* enable use of fpu now */ + isync + cmpdi 0,r3,0 + beqlr- /* if no previous owner, done */ + addi r3,r3,THREAD /* want THREAD of task */ + ld r5,PT_REGS(r3) + cmpdi 0,r5,0 + SAVE_32FPRS(0, r3) + mffs fr0 + stfd fr0,THREAD_FPSCR(r3) + beq 1f + ld r4,_MSR-STACK_FRAME_OVERHEAD(r5) + li r3,MSR_FP|MSR_FE0|MSR_FE1 + andc r4,r4,r3 /* disable FP for previous task */ + std r4,_MSR-STACK_FRAME_OVERHEAD(r5) +1: +#ifndef CONFIG_SMP + li r5,0 + ld r4,last_task_used_math@got(r2) + std r5,0(r4) +#endif /* CONFIG_SMP */ + blr + +#ifdef CONFIG_ALTIVEC + +#if 0 /* this has no callers for now */ +/* + * disable_kernel_altivec() + * Disable the VMX. + */ +_GLOBAL(disable_kernel_altivec) + mfmsr r3 + rldicl r0,r3,(63-MSR_VEC_LG),1 + rldicl r3,r0,(MSR_VEC_LG+1),0 + mtmsrd r3 /* disable use of VMX now */ + isync + blr +#endif /* 0 */ + +/* + * giveup_altivec(tsk) + * Disable VMX for the task given as the argument, + * and save the vector registers in its thread_struct. + * Enables the VMX for use in the kernel on return. + */ +_GLOBAL(giveup_altivec) + mfmsr r5 + oris r5,r5,MSR_VEC@h + mtmsrd r5 /* enable use of VMX now */ + isync + cmpdi 0,r3,0 + beqlr- /* if no previous owner, done */ + addi r3,r3,THREAD /* want THREAD of task */ + ld r5,PT_REGS(r3) + cmpdi 0,r5,0 + SAVE_32VRS(0,r4,r3) + mfvscr vr0 + li r4,THREAD_VSCR + stvx vr0,r4,r3 + beq 1f + ld r4,_MSR-STACK_FRAME_OVERHEAD(r5) + lis r3,MSR_VEC@h + andc r4,r4,r3 /* disable FP for previous task */ + std r4,_MSR-STACK_FRAME_OVERHEAD(r5) +1: +#ifndef CONFIG_SMP + li r5,0 + ld r4,last_task_used_altivec@got(r2) + std r5,0(r4) +#endif /* CONFIG_SMP */ + blr + +#endif /* CONFIG_ALTIVEC */ + +_GLOBAL(__setup_cpu_power3) + blr + /* kexec_wait(phys_cpu) * * wait for the flag to change, indicating this kernel is going away but diff --git a/arch/ppc64/kernel/of_device.c b/arch/ppc64/kernel/of_device.c index b80e81984ba..da580812ddf 100644 --- a/arch/ppc64/kernel/of_device.c +++ b/arch/ppc64/kernel/of_device.c @@ -236,7 +236,6 @@ void of_device_unregister(struct of_device *ofdev) struct of_device* of_platform_device_create(struct device_node *np, const char *bus_id) { struct of_device *dev; - u32 *reg; dev = kmalloc(sizeof(*dev), GFP_KERNEL); if (!dev) @@ -250,7 +249,6 @@ struct of_device* of_platform_device_create(struct device_node *np, const char * dev->dev.bus = &of_platform_bus_type; dev->dev.release = of_release_dev; - reg = (u32 *)get_property(np, "reg", NULL); strlcpy(dev->dev.bus_id, bus_id, BUS_ID_SIZE); if (of_device_register(dev) != 0) { diff --git a/arch/ppc64/kernel/pSeries_iommu.c b/arch/ppc64/kernel/pSeries_iommu.c index 69130522a87..9d5e1e7fc38 100644 --- a/arch/ppc64/kernel/pSeries_iommu.c +++ b/arch/ppc64/kernel/pSeries_iommu.c @@ -45,6 +45,7 @@ #include <asm/plpar_wrappers.h> #include <asm/pSeries_reconfig.h> #include <asm/systemcfg.h> +#include <asm/firmware.h> #include "pci.h" #define DBG(fmt...) @@ -546,7 +547,7 @@ void iommu_init_early_pSeries(void) } if (systemcfg->platform & PLATFORM_LPAR) { - if (cur_cpu_spec->firmware_features & FW_FEATURE_MULTITCE) { + if (firmware_has_feature(FW_FEATURE_MULTITCE)) { ppc_md.tce_build = tce_buildmulti_pSeriesLP; ppc_md.tce_free = tce_freemulti_pSeriesLP; } else { diff --git a/arch/ppc64/kernel/pSeries_lpar.c b/arch/ppc64/kernel/pSeries_lpar.c index 74dd144dcce..0a3ddc9227c 100644 --- a/arch/ppc64/kernel/pSeries_lpar.c +++ b/arch/ppc64/kernel/pSeries_lpar.c @@ -52,7 +52,6 @@ EXPORT_SYMBOL(plpar_hcall_4out); EXPORT_SYMBOL(plpar_hcall_norets); EXPORT_SYMBOL(plpar_hcall_8arg_2ret); -extern void fw_feature_init(void); extern void pSeries_find_serial_port(void); @@ -279,7 +278,6 @@ long pSeries_lpar_hpte_insert(unsigned long hpte_group, unsigned long va, unsigned long prpn, unsigned long vflags, unsigned long rflags) { - unsigned long arpn = physRpn_to_absRpn(prpn); unsigned long lpar_rc; unsigned long flags; unsigned long slot; @@ -290,7 +288,7 @@ long pSeries_lpar_hpte_insert(unsigned long hpte_group, if (vflags & HPTE_V_LARGE) hpte_v &= ~(1UL << HPTE_V_AVPN_SHIFT); - hpte_r = (arpn << HPTE_R_RPN_SHIFT) | rflags; + hpte_r = (prpn << HPTE_R_RPN_SHIFT) | rflags; /* Now fill in the actual HPTE */ /* Set CEC cookie to 0 */ diff --git a/arch/ppc64/kernel/pSeries_setup.c b/arch/ppc64/kernel/pSeries_setup.c index 5bec956e44a..f0f0630cf07 100644 --- a/arch/ppc64/kernel/pSeries_setup.c +++ b/arch/ppc64/kernel/pSeries_setup.c @@ -60,7 +60,8 @@ #include <asm/nvram.h> #include <asm/plpar_wrappers.h> #include <asm/xics.h> -#include <asm/cputable.h> +#include <asm/firmware.h> +#include <asm/pmc.h> #include "i8259.h" #include "mpic.h" @@ -187,6 +188,21 @@ static void __init pSeries_setup_mpic(void) " MPIC "); } +static void pseries_lpar_enable_pmcs(void) +{ + unsigned long set, reset; + + power4_enable_pmcs(); + + set = 1UL << 63; + reset = 0; + plpar_hcall_norets(H_PERFMON, set, reset); + + /* instruct hypervisor to maintain PMCs */ + if (firmware_has_feature(FW_FEATURE_SPLPAR)) + get_paca()->lppaca.pmcregs_in_use = 1; +} + static void __init pSeries_setup_arch(void) { /* Fixup ppc_md depending on the type of interrupt controller */ @@ -231,11 +247,9 @@ static void __init pSeries_setup_arch(void) pSeries_nvram_init(); - if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR) - vpa_init(boot_cpuid); - /* Choose an idle loop */ - if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR) { + if (firmware_has_feature(FW_FEATURE_SPLPAR)) { + vpa_init(boot_cpuid); if (get_paca()->lppaca.shared_proc) { printk(KERN_INFO "Using shared processor idle loop\n"); ppc_md.idle_loop = pseries_shared_idle; @@ -247,6 +261,11 @@ static void __init pSeries_setup_arch(void) printk(KERN_INFO "Using default idle loop\n"); ppc_md.idle_loop = default_idle; } + + if (systemcfg->platform & PLATFORM_LPAR) + ppc_md.enable_pmcs = pseries_lpar_enable_pmcs; + else + ppc_md.enable_pmcs = power4_enable_pmcs; } static int __init pSeries_init_panel(void) @@ -260,11 +279,11 @@ static int __init pSeries_init_panel(void) arch_initcall(pSeries_init_panel); -/* Build up the firmware_features bitmask field +/* Build up the ppc64_firmware_features bitmask field * using contents of device-tree/ibm,hypertas-functions. * Ultimately this functionality may be moved into prom.c prom_init(). */ -void __init fw_feature_init(void) +static void __init fw_feature_init(void) { struct device_node * dn; char * hypertas; @@ -272,7 +291,7 @@ void __init fw_feature_init(void) DBG(" -> fw_feature_init()\n"); - cur_cpu_spec->firmware_features = 0; + ppc64_firmware_features = 0; dn = of_find_node_by_path("/rtas"); if (dn == NULL) { printk(KERN_ERR "WARNING ! Cannot find RTAS in device-tree !\n"); @@ -288,7 +307,7 @@ void __init fw_feature_init(void) if ((firmware_features_table[i].name) && (strcmp(firmware_features_table[i].name,hypertas))==0) { /* we have a match */ - cur_cpu_spec->firmware_features |= + ppc64_firmware_features |= (firmware_features_table[i].val); break; } @@ -302,7 +321,7 @@ void __init fw_feature_init(void) of_node_put(dn); no_rtas: printk(KERN_INFO "firmware_features = 0x%lx\n", - cur_cpu_spec->firmware_features); + ppc64_firmware_features); DBG(" <- fw_feature_init()\n"); } diff --git a/arch/ppc64/kernel/pSeries_smp.c b/arch/ppc64/kernel/pSeries_smp.c index 62c55a12356..79c7f322366 100644 --- a/arch/ppc64/kernel/pSeries_smp.c +++ b/arch/ppc64/kernel/pSeries_smp.c @@ -41,6 +41,7 @@ #include <asm/machdep.h> #include <asm/xics.h> #include <asm/cputable.h> +#include <asm/firmware.h> #include <asm/system.h> #include <asm/rtas.h> #include <asm/plpar_wrappers.h> @@ -326,7 +327,7 @@ static void __devinit smp_xics_setup_cpu(int cpu) if (cpu != boot_cpuid) xics_setup_cpu(); - if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR) + if (firmware_has_feature(FW_FEATURE_SPLPAR)) vpa_init(cpu); cpu_clear(cpu, of_spin_map); diff --git a/arch/ppc64/kernel/pSeries_vio.c b/arch/ppc64/kernel/pSeries_vio.c new file mode 100644 index 00000000000..e0ae06f58f8 --- /dev/null +++ b/arch/ppc64/kernel/pSeries_vio.c @@ -0,0 +1,273 @@ +/* + * IBM PowerPC pSeries Virtual I/O Infrastructure Support. + * + * Copyright (c) 2003-2005 IBM Corp. + * Dave Engebretsen engebret@us.ibm.com + * Santiago Leon santil@us.ibm.com + * Hollis Blanchard <hollisb@us.ibm.com> + * Stephen Rothwell + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/kobject.h> +#include <asm/iommu.h> +#include <asm/dma.h> +#include <asm/prom.h> +#include <asm/vio.h> +#include <asm/hvcall.h> + +extern struct subsystem devices_subsys; /* needed for vio_find_name() */ + +static void probe_bus_pseries(void) +{ + struct device_node *node_vroot, *of_node; + + node_vroot = find_devices("vdevice"); + if ((node_vroot == NULL) || (node_vroot->child == NULL)) + /* this machine doesn't do virtual IO, and that's ok */ + return; + + /* + * Create struct vio_devices for each virtual device in the device tree. + * Drivers will associate with them later. + */ + for (of_node = node_vroot->child; of_node != NULL; + of_node = of_node->sibling) { + printk(KERN_DEBUG "%s: processing %p\n", __FUNCTION__, of_node); + vio_register_device_node(of_node); + } +} + +/** + * vio_match_device_pseries: - Tell if a pSeries VIO device matches a + * vio_device_id + */ +static int vio_match_device_pseries(const struct vio_device_id *id, + const struct vio_dev *dev) +{ + return (strncmp(dev->type, id->type, strlen(id->type)) == 0) && + device_is_compatible(dev->dev.platform_data, id->compat); +} + +static void vio_release_device_pseries(struct device *dev) +{ + /* XXX free TCE table */ + of_node_put(dev->platform_data); +} + +static ssize_t viodev_show_devspec(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct device_node *of_node = dev->platform_data; + + return sprintf(buf, "%s\n", of_node->full_name); +} +DEVICE_ATTR(devspec, S_IRUSR | S_IRGRP | S_IROTH, viodev_show_devspec, NULL); + +static void vio_unregister_device_pseries(struct vio_dev *viodev) +{ + device_remove_file(&viodev->dev, &dev_attr_devspec); +} + +static struct vio_bus_ops vio_bus_ops_pseries = { + .match = vio_match_device_pseries, + .unregister_device = vio_unregister_device_pseries, + .release_device = vio_release_device_pseries, +}; + +/** + * vio_bus_init_pseries: - Initialize the pSeries virtual IO bus + */ +static int __init vio_bus_init_pseries(void) +{ + int err; + + err = vio_bus_init(&vio_bus_ops_pseries); + if (err == 0) + probe_bus_pseries(); + return err; +} + +__initcall(vio_bus_init_pseries); + +/** + * vio_build_iommu_table: - gets the dma information from OF and + * builds the TCE tree. + * @dev: the virtual device. + * + * Returns a pointer to the built tce tree, or NULL if it can't + * find property. +*/ +static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev) +{ + unsigned int *dma_window; + struct iommu_table *newTceTable; + unsigned long offset; + int dma_window_property_size; + + dma_window = (unsigned int *) get_property(dev->dev.platform_data, "ibm,my-dma-window", &dma_window_property_size); + if(!dma_window) { + return NULL; + } + + newTceTable = (struct iommu_table *) kmalloc(sizeof(struct iommu_table), GFP_KERNEL); + + /* There should be some code to extract the phys-encoded offset + using prom_n_addr_cells(). However, according to a comment + on earlier versions, it's always zero, so we don't bother */ + offset = dma_window[1] >> PAGE_SHIFT; + + /* TCE table size - measured in tce entries */ + newTceTable->it_size = dma_window[4] >> PAGE_SHIFT; + /* offset for VIO should always be 0 */ + newTceTable->it_offset = offset; + newTceTable->it_busno = 0; + newTceTable->it_index = (unsigned long)dma_window[0]; + newTceTable->it_type = TCE_VB; + + return iommu_init_table(newTceTable); +} + +/** + * vio_register_device_node: - Register a new vio device. + * @of_node: The OF node for this device. + * + * Creates and initializes a vio_dev structure from the data in + * of_node (dev.platform_data) and adds it to the list of virtual devices. + * Returns a pointer to the created vio_dev or NULL if node has + * NULL device_type or compatible fields. + */ +struct vio_dev * __devinit vio_register_device_node(struct device_node *of_node) +{ + struct vio_dev *viodev; + unsigned int *unit_address; + unsigned int *irq_p; + + /* we need the 'device_type' property, in order to match with drivers */ + if ((NULL == of_node->type)) { + printk(KERN_WARNING + "%s: node %s missing 'device_type'\n", __FUNCTION__, + of_node->name ? of_node->name : "<unknown>"); + return NULL; + } + + unit_address = (unsigned int *)get_property(of_node, "reg", NULL); + if (!unit_address) { + printk(KERN_WARNING "%s: node %s missing 'reg'\n", __FUNCTION__, + of_node->name ? of_node->name : "<unknown>"); + return NULL; + } + + /* allocate a vio_dev for this node */ + viodev = kmalloc(sizeof(struct vio_dev), GFP_KERNEL); + if (!viodev) { + return NULL; + } + memset(viodev, 0, sizeof(struct vio_dev)); + + viodev->dev.platform_data = of_node_get(of_node); + + viodev->irq = NO_IRQ; + irq_p = (unsigned int *)get_property(of_node, "interrupts", NULL); + if (irq_p) { + int virq = virt_irq_create_mapping(*irq_p); + if (virq == NO_IRQ) { + printk(KERN_ERR "Unable to allocate interrupt " + "number for %s\n", of_node->full_name); + } else + viodev->irq = irq_offset_up(virq); + } + + snprintf(viodev->dev.bus_id, BUS_ID_SIZE, "%x", *unit_address); + viodev->name = of_node->name; + viodev->type = of_node->type; + viodev->unit_address = *unit_address; + viodev->iommu_table = vio_build_iommu_table(viodev); + + /* register with generic device framework */ + if (vio_register_device(viodev) == NULL) { + /* XXX free TCE table */ + kfree(viodev); + return NULL; + } + device_create_file(&viodev->dev, &dev_attr_devspec); + + return viodev; +} +EXPORT_SYMBOL(vio_register_device_node); + +/** + * vio_get_attribute: - get attribute for virtual device + * @vdev: The vio device to get property. + * @which: The property/attribute to be extracted. + * @length: Pointer to length of returned data size (unused if NULL). + * + * Calls prom.c's get_property() to return the value of the + * attribute specified by the preprocessor constant @which +*/ +const void * vio_get_attribute(struct vio_dev *vdev, void* which, int* length) +{ + return get_property(vdev->dev.platform_data, (char*)which, length); +} +EXPORT_SYMBOL(vio_get_attribute); + +/* vio_find_name() - internal because only vio.c knows how we formatted the + * kobject name + * XXX once vio_bus_type.devices is actually used as a kset in + * drivers/base/bus.c, this function should be removed in favor of + * "device_find(kobj_name, &vio_bus_type)" + */ +static struct vio_dev *vio_find_name(const char *kobj_name) +{ + struct kobject *found; + + found = kset_find_obj(&devices_subsys.kset, kobj_name); + if (!found) + return NULL; + + return to_vio_dev(container_of(found, struct device, kobj)); +} + +/** + * vio_find_node - find an already-registered vio_dev + * @vnode: device_node of the virtual device we're looking for + */ +struct vio_dev *vio_find_node(struct device_node *vnode) +{ + uint32_t *unit_address; + char kobj_name[BUS_ID_SIZE]; + + /* construct the kobject name from the device node */ + unit_address = (uint32_t *)get_property(vnode, "reg", NULL); + if (!unit_address) + return NULL; + snprintf(kobj_name, BUS_ID_SIZE, "%x", *unit_address); + + return vio_find_name(kobj_name); +} +EXPORT_SYMBOL(vio_find_node); + +int vio_enable_interrupts(struct vio_dev *dev) +{ + int rc = h_vio_signal(dev->unit_address, VIO_IRQ_ENABLE); + if (rc != H_Success) + printk(KERN_ERR "vio: Error 0x%x enabling interrupts\n", rc); + return rc; +} +EXPORT_SYMBOL(vio_enable_interrupts); + +int vio_disable_interrupts(struct vio_dev *dev) +{ + int rc = h_vio_signal(dev->unit_address, VIO_IRQ_DISABLE); + if (rc != H_Success) + printk(KERN_ERR "vio: Error 0x%x disabling interrupts\n", rc); + return rc; +} +EXPORT_SYMBOL(vio_disable_interrupts); diff --git a/arch/ppc64/kernel/pacaData.c b/arch/ppc64/kernel/pacaData.c index 6316188737b..6182a2cd90a 100644 --- a/arch/ppc64/kernel/pacaData.c +++ b/arch/ppc64/kernel/pacaData.c @@ -78,7 +78,7 @@ extern unsigned long __toc_start; #define BOOTCPU_PACA_INIT(number) \ { \ - PACA_INIT_COMMON(number, 1, 0, STAB0_VIRT_ADDR) \ + PACA_INIT_COMMON(number, 1, 0, (u64)&initial_stab) \ PACA_INIT_ISERIES(number) \ } @@ -90,7 +90,7 @@ extern unsigned long __toc_start; #define BOOTCPU_PACA_INIT(number) \ { \ - PACA_INIT_COMMON(number, 1, STAB0_PHYS_ADDR, STAB0_VIRT_ADDR) \ + PACA_INIT_COMMON(number, 1, STAB0_PHYS_ADDR, (u64)&initial_stab) \ } #endif diff --git a/arch/ppc64/kernel/pmac_setup.c b/arch/ppc64/kernel/pmac_setup.c index e40877fa67c..8ff86a766cd 100644 --- a/arch/ppc64/kernel/pmac_setup.c +++ b/arch/ppc64/kernel/pmac_setup.c @@ -71,6 +71,7 @@ #include <asm/of_device.h> #include <asm/lmb.h> #include <asm/smu.h> +#include <asm/pmc.h> #include "pmac.h" #include "mpic.h" @@ -511,4 +512,5 @@ struct machdep_calls __initdata pmac_md = { .progress = pmac_progress, .check_legacy_ioport = pmac_check_legacy_ioport, .idle_loop = native_idle, + .enable_pmcs = power4_enable_pmcs, }; diff --git a/arch/ppc64/kernel/pmc.c b/arch/ppc64/kernel/pmc.c index 67be773f9c0..cdfec7438d0 100644 --- a/arch/ppc64/kernel/pmc.c +++ b/arch/ppc64/kernel/pmc.c @@ -65,3 +65,24 @@ void release_pmc_hardware(void) spin_unlock(&pmc_owner_lock); } EXPORT_SYMBOL_GPL(release_pmc_hardware); + +void power4_enable_pmcs(void) +{ + unsigned long hid0; + + hid0 = mfspr(HID0); + hid0 |= 1UL << (63 - 20); + + /* POWER4 requires the following sequence */ + asm volatile( + "sync\n" + "mtspr %1, %0\n" + "mfspr %0, %1\n" + "mfspr %0, %1\n" + "mfspr %0, %1\n" + "mfspr %0, %1\n" + "mfspr %0, %1\n" + "mfspr %0, %1\n" + "isync" : "=&r" (hid0) : "i" (HID0), "0" (hid0): + "memory"); +} diff --git a/arch/ppc64/kernel/process.c b/arch/ppc64/kernel/process.c index f7cae05e40f..7a7e027653a 100644 --- a/arch/ppc64/kernel/process.c +++ b/arch/ppc64/kernel/process.c @@ -50,6 +50,7 @@ #include <asm/machdep.h> #include <asm/iSeries/HvCallHpt.h> #include <asm/cputable.h> +#include <asm/firmware.h> #include <asm/sections.h> #include <asm/tlbflush.h> #include <asm/time.h> @@ -202,11 +203,10 @@ struct task_struct *__switch_to(struct task_struct *prev, new_thread = &new->thread; old_thread = ¤t->thread; -/* Collect purr utilization data per process and per processor wise */ -/* purr is nothing but processor time base */ - -#if defined(CONFIG_PPC_PSERIES) - if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR) { + /* Collect purr utilization data per process and per processor + * wise purr is nothing but processor time base + */ + if (firmware_has_feature(FW_FEATURE_SPLPAR)) { struct cpu_usage *cu = &__get_cpu_var(cpu_usage_array); long unsigned start_tb, current_tb; start_tb = old_thread->start_tb; @@ -214,8 +214,6 @@ struct task_struct *__switch_to(struct task_struct *prev, old_thread->accum_tb += (current_tb - start_tb); new_thread->start_tb = current_tb; } -#endif - local_irq_save(flags); last = _switch(old_thread, new_thread); diff --git a/arch/ppc64/kernel/prom.c b/arch/ppc64/kernel/prom.c index 5aca01ddd81..b2184882679 100644 --- a/arch/ppc64/kernel/prom.c +++ b/arch/ppc64/kernel/prom.c @@ -625,8 +625,8 @@ void __init finish_device_tree(void) static inline char *find_flat_dt_string(u32 offset) { - return ((char *)initial_boot_params) + initial_boot_params->off_dt_strings - + offset; + return ((char *)initial_boot_params) + + initial_boot_params->off_dt_strings + offset; } /** @@ -635,26 +635,33 @@ static inline char *find_flat_dt_string(u32 offset) * unflatten the tree */ static int __init scan_flat_dt(int (*it)(unsigned long node, - const char *full_path, void *data), + const char *uname, int depth, + void *data), void *data) { unsigned long p = ((unsigned long)initial_boot_params) + initial_boot_params->off_dt_struct; int rc = 0; + int depth = -1; do { u32 tag = *((u32 *)p); char *pathp; p += 4; - if (tag == OF_DT_END_NODE) + if (tag == OF_DT_END_NODE) { + depth --; + continue; + } + if (tag == OF_DT_NOP) continue; if (tag == OF_DT_END) break; if (tag == OF_DT_PROP) { u32 sz = *((u32 *)p); p += 8; - p = _ALIGN(p, sz >= 8 ? 8 : 4); + if (initial_boot_params->version < 0x10) + p = _ALIGN(p, sz >= 8 ? 8 : 4); p += sz; p = _ALIGN(p, 4); continue; @@ -664,9 +671,18 @@ static int __init scan_flat_dt(int (*it)(unsigned long node, " device tree !\n", tag); return -EINVAL; } + depth++; pathp = (char *)p; p = _ALIGN(p + strlen(pathp) + 1, 4); - rc = it(p, pathp, data); + if ((*pathp) == '/') { + char *lp, *np; + for (lp = NULL, np = pathp; *np; np++) + if ((*np) == '/') + lp = np+1; + if (lp != NULL) + pathp = lp; + } + rc = it(p, pathp, depth, data); if (rc != 0) break; } while(1); @@ -689,17 +705,21 @@ static void* __init get_flat_dt_prop(unsigned long node, const char *name, const char *nstr; p += 4; + if (tag == OF_DT_NOP) + continue; if (tag != OF_DT_PROP) return NULL; sz = *((u32 *)p); noff = *((u32 *)(p + 4)); p += 8; - p = _ALIGN(p, sz >= 8 ? 8 : 4); + if (initial_boot_params->version < 0x10) + p = _ALIGN(p, sz >= 8 ? 8 : 4); nstr = find_flat_dt_string(noff); if (nstr == NULL) { - printk(KERN_WARNING "Can't find property index name !\n"); + printk(KERN_WARNING "Can't find property index" + " name !\n"); return NULL; } if (strcmp(name, nstr) == 0) { @@ -713,7 +733,7 @@ static void* __init get_flat_dt_prop(unsigned long node, const char *name, } static void *__init unflatten_dt_alloc(unsigned long *mem, unsigned long size, - unsigned long align) + unsigned long align) { void *res; @@ -727,13 +747,16 @@ static void *__init unflatten_dt_alloc(unsigned long *mem, unsigned long size, static unsigned long __init unflatten_dt_node(unsigned long mem, unsigned long *p, struct device_node *dad, - struct device_node ***allnextpp) + struct device_node ***allnextpp, + unsigned long fpsize) { struct device_node *np; struct property *pp, **prev_pp = NULL; char *pathp; u32 tag; - unsigned int l; + unsigned int l, allocl; + int has_name = 0; + int new_format = 0; tag = *((u32 *)(*p)); if (tag != OF_DT_BEGIN_NODE) { @@ -742,21 +765,62 @@ static unsigned long __init unflatten_dt_node(unsigned long mem, } *p += 4; pathp = (char *)*p; - l = strlen(pathp) + 1; + l = allocl = strlen(pathp) + 1; *p = _ALIGN(*p + l, 4); - np = unflatten_dt_alloc(&mem, sizeof(struct device_node) + l, + /* version 0x10 has a more compact unit name here instead of the full + * path. we accumulate the full path size using "fpsize", we'll rebuild + * it later. We detect this because the first character of the name is + * not '/'. + */ + if ((*pathp) != '/') { + new_format = 1; + if (fpsize == 0) { + /* root node: special case. fpsize accounts for path + * plus terminating zero. root node only has '/', so + * fpsize should be 2, but we want to avoid the first + * level nodes to have two '/' so we use fpsize 1 here + */ + fpsize = 1; + allocl = 2; + } else { + /* account for '/' and path size minus terminal 0 + * already in 'l' + */ + fpsize += l; + allocl = fpsize; + } + } + + + np = unflatten_dt_alloc(&mem, sizeof(struct device_node) + allocl, __alignof__(struct device_node)); if (allnextpp) { memset(np, 0, sizeof(*np)); np->full_name = ((char*)np) + sizeof(struct device_node); - memcpy(np->full_name, pathp, l); + if (new_format) { + char *p = np->full_name; + /* rebuild full path for new format */ + if (dad && dad->parent) { + strcpy(p, dad->full_name); +#ifdef DEBUG + if ((strlen(p) + l + 1) != allocl) { + DBG("%s: p: %d, l: %d, a: %d\n", + pathp, strlen(p), l, allocl); + } +#endif + p += strlen(p); + } + *(p++) = '/'; + memcpy(p, pathp, l); + } else + memcpy(np->full_name, pathp, l); prev_pp = &np->properties; **allnextpp = np; *allnextpp = &np->allnext; if (dad != NULL) { np->parent = dad; - /* we temporarily use the `next' field as `last_child'. */ + /* we temporarily use the next field as `last_child'*/ if (dad->next == 0) dad->child = np; else @@ -770,18 +834,26 @@ static unsigned long __init unflatten_dt_node(unsigned long mem, char *pname; tag = *((u32 *)(*p)); + if (tag == OF_DT_NOP) { + *p += 4; + continue; + } if (tag != OF_DT_PROP) break; *p += 4; sz = *((u32 *)(*p)); noff = *((u32 *)((*p) + 4)); - *p = _ALIGN((*p) + 8, sz >= 8 ? 8 : 4); + *p += 8; + if (initial_boot_params->version < 0x10) + *p = _ALIGN(*p, sz >= 8 ? 8 : 4); pname = find_flat_dt_string(noff); if (pname == NULL) { printk("Can't find property name in list !\n"); break; } + if (strcmp(pname, "name") == 0) + has_name = 1; l = strlen(pname) + 1; pp = unflatten_dt_alloc(&mem, sizeof(struct property), __alignof__(struct property)); @@ -801,6 +873,36 @@ static unsigned long __init unflatten_dt_node(unsigned long mem, } *p = _ALIGN((*p) + sz, 4); } + /* with version 0x10 we may not have the name property, recreate + * it here from the unit name if absent + */ + if (!has_name) { + char *p = pathp, *ps = pathp, *pa = NULL; + int sz; + + while (*p) { + if ((*p) == '@') + pa = p; + if ((*p) == '/') + ps = p + 1; + p++; + } + if (pa < ps) + pa = p; + sz = (pa - ps) + 1; + pp = unflatten_dt_alloc(&mem, sizeof(struct property) + sz, + __alignof__(struct property)); + if (allnextpp) { + pp->name = "name"; + pp->length = sz; + pp->value = (unsigned char *)(pp + 1); + *prev_pp = pp; + prev_pp = &pp->next; + memcpy(pp->value, ps, sz - 1); + ((char *)pp->value)[sz - 1] = 0; + DBG("fixed up name for %s -> %s\n", pathp, pp->value); + } + } if (allnextpp) { *prev_pp = NULL; np->name = get_property(np, "name", NULL); @@ -812,11 +914,11 @@ static unsigned long __init unflatten_dt_node(unsigned long mem, np->type = "<NULL>"; } while (tag == OF_DT_BEGIN_NODE) { - mem = unflatten_dt_node(mem, p, np, allnextpp); + mem = unflatten_dt_node(mem, p, np, allnextpp, fpsize); tag = *((u32 *)(*p)); } if (tag != OF_DT_END_NODE) { - printk("Weird tag at start of node: %x\n", tag); + printk("Weird tag at end of node: %x\n", tag); return mem; } *p += 4; @@ -842,21 +944,32 @@ void __init unflatten_device_tree(void) /* First pass, scan for size */ start = ((unsigned long)initial_boot_params) + initial_boot_params->off_dt_struct; - size = unflatten_dt_node(0, &start, NULL, NULL); + size = unflatten_dt_node(0, &start, NULL, NULL, 0); + size = (size | 3) + 1; DBG(" size is %lx, allocating...\n", size); /* Allocate memory for the expanded device tree */ - mem = (unsigned long)abs_to_virt(lmb_alloc(size, - __alignof__(struct device_node))); + mem = lmb_alloc(size + 4, __alignof__(struct device_node)); + if (!mem) { + DBG("Couldn't allocate memory with lmb_alloc()!\n"); + panic("Couldn't allocate memory with lmb_alloc()!\n"); + } + mem = (unsigned long)abs_to_virt(mem); + + ((u32 *)mem)[size / 4] = 0xdeadbeef; + DBG(" unflattening...\n", mem); /* Second pass, do actual unflattening */ start = ((unsigned long)initial_boot_params) + initial_boot_params->off_dt_struct; - unflatten_dt_node(mem, &start, NULL, &allnextp); + unflatten_dt_node(mem, &start, NULL, &allnextp, 0); if (*((u32 *)start) != OF_DT_END) - printk(KERN_WARNING "Weird tag at end of tree: %x\n", *((u32 *)start)); + printk(KERN_WARNING "Weird tag at end of tree: %08x\n", *((u32 *)start)); + if (((u32 *)mem)[size / 4] != 0xdeadbeef) + printk(KERN_WARNING "End of tree marker overwritten: %08x\n", + ((u32 *)mem)[size / 4] ); *allnextp = NULL; /* Get pointer to OF "/chosen" node for use everywhere */ @@ -880,7 +993,7 @@ void __init unflatten_device_tree(void) static int __init early_init_dt_scan_cpus(unsigned long node, - const char *full_path, void *data) + const char *uname, int depth, void *data) { char *type = get_flat_dt_prop(node, "device_type", NULL); u32 *prop; @@ -947,13 +1060,15 @@ static int __init early_init_dt_scan_cpus(unsigned long node, } static int __init early_init_dt_scan_chosen(unsigned long node, - const char *full_path, void *data) + const char *uname, int depth, void *data) { u32 *prop; u64 *prop64; extern unsigned long memory_limit, tce_alloc_start, tce_alloc_end; - if (strcmp(full_path, "/chosen") != 0) + DBG("search \"chosen\", depth: %d, uname: %s\n", depth, uname); + + if (depth != 1 || strcmp(uname, "chosen") != 0) return 0; /* get platform type */ @@ -1003,18 +1118,20 @@ static int __init early_init_dt_scan_chosen(unsigned long node, } static int __init early_init_dt_scan_root(unsigned long node, - const char *full_path, void *data) + const char *uname, int depth, void *data) { u32 *prop; - if (strcmp(full_path, "/") != 0) + if (depth != 0) return 0; prop = (u32 *)get_flat_dt_prop(node, "#size-cells", NULL); dt_root_size_cells = (prop == NULL) ? 1 : *prop; - + DBG("dt_root_size_cells = %x\n", dt_root_size_cells); + prop = (u32 *)get_flat_dt_prop(node, "#address-cells", NULL); dt_root_addr_cells = (prop == NULL) ? 2 : *prop; + DBG("dt_root_addr_cells = %x\n", dt_root_addr_cells); /* break now */ return 1; @@ -1042,7 +1159,7 @@ static unsigned long __init dt_mem_next_cell(int s, cell_t **cellp) static int __init early_init_dt_scan_memory(unsigned long node, - const char *full_path, void *data) + const char *uname, int depth, void *data) { char *type = get_flat_dt_prop(node, "device_type", NULL); cell_t *reg, *endp; @@ -1058,7 +1175,9 @@ static int __init early_init_dt_scan_memory(unsigned long node, endp = reg + (l / sizeof(cell_t)); - DBG("memory scan node %s ...\n", full_path); + DBG("memory scan node %s ..., reg size %ld, data: %x %x %x %x, ...\n", + uname, l, reg[0], reg[1], reg[2], reg[3]); + while ((endp - reg) >= (dt_root_addr_cells + dt_root_size_cells)) { unsigned long base, size; @@ -1469,10 +1588,11 @@ struct device_node *of_find_node_by_path(const char *path) struct device_node *np = allnodes; read_lock(&devtree_lock); - for (; np != 0; np = np->allnext) + for (; np != 0; np = np->allnext) { if (np->full_name != 0 && strcasecmp(np->full_name, path) == 0 && of_node_get(np)) break; + } read_unlock(&devtree_lock); return np; } diff --git a/arch/ppc64/kernel/prom_init.c b/arch/ppc64/kernel/prom_init.c index dbbe6c79d8d..122283a1d39 100644 --- a/arch/ppc64/kernel/prom_init.c +++ b/arch/ppc64/kernel/prom_init.c @@ -892,7 +892,10 @@ static void __init prom_init_mem(void) if ( RELOC(of_platform) == PLATFORM_PSERIES_LPAR ) RELOC(alloc_top) = RELOC(rmo_top); else - RELOC(alloc_top) = RELOC(rmo_top) = min(0x40000000ul, RELOC(ram_top)); + /* Some RS64 machines have buggy firmware where claims up at 1GB + * fails. Cap at 768MB as a workaround. Still plenty of room. + */ + RELOC(alloc_top) = RELOC(rmo_top) = min(0x30000000ul, RELOC(ram_top)); prom_printf("memory layout at init:\n"); prom_printf(" memory_limit : %x (16 MB aligned)\n", RELOC(prom_memory_limit)); @@ -1534,7 +1537,8 @@ static unsigned long __init dt_find_string(char *str) */ #define MAX_PROPERTY_NAME 64 -static void __init scan_dt_build_strings(phandle node, unsigned long *mem_start, +static void __init scan_dt_build_strings(phandle node, + unsigned long *mem_start, unsigned long *mem_end) { unsigned long offset = reloc_offset(); @@ -1547,16 +1551,21 @@ static void __init scan_dt_build_strings(phandle node, unsigned long *mem_start, /* get and store all property names */ prev_name = RELOC(""); for (;;) { - int rc; - /* 64 is max len of name including nul. */ namep = make_room(mem_start, mem_end, MAX_PROPERTY_NAME, 1); - rc = call_prom("nextprop", 3, 1, node, prev_name, namep); - if (rc != 1) { + if (call_prom("nextprop", 3, 1, node, prev_name, namep) != 1) { /* No more nodes: unwind alloc */ *mem_start = (unsigned long)namep; break; } + + /* skip "name" */ + if (strcmp(namep, RELOC("name")) == 0) { + *mem_start = (unsigned long)namep; + prev_name = RELOC("name"); + continue; + } + /* get/create string entry */ soff = dt_find_string(namep); if (soff != 0) { *mem_start = (unsigned long)namep; @@ -1571,7 +1580,7 @@ static void __init scan_dt_build_strings(phandle node, unsigned long *mem_start, /* do all our children */ child = call_prom("child", 1, 1, node); - while (child != (phandle)0) { + while (child != 0) { scan_dt_build_strings(child, mem_start, mem_end); child = call_prom("peer", 1, 1, child); } @@ -1580,16 +1589,13 @@ static void __init scan_dt_build_strings(phandle node, unsigned long *mem_start, static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start, unsigned long *mem_end) { - int l, align; phandle child; - char *namep, *prev_name, *sstart, *p, *ep; + char *namep, *prev_name, *sstart, *p, *ep, *lp, *path; unsigned long soff; unsigned char *valp; unsigned long offset = reloc_offset(); - char pname[MAX_PROPERTY_NAME]; - char *path; - - path = RELOC(prom_scratch); + static char pname[MAX_PROPERTY_NAME]; + int l; dt_push_token(OF_DT_BEGIN_NODE, mem_start, mem_end); @@ -1599,23 +1605,33 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start, namep, *mem_end - *mem_start); if (l >= 0) { /* Didn't fit? Get more room. */ - if (l+1 > *mem_end - *mem_start) { + if ((l+1) > (*mem_end - *mem_start)) { namep = make_room(mem_start, mem_end, l+1, 1); call_prom("package-to-path", 3, 1, node, namep, l); } namep[l] = '\0'; + /* Fixup an Apple bug where they have bogus \0 chars in the * middle of the path in some properties */ for (p = namep, ep = namep + l; p < ep; p++) if (*p == '\0') { memmove(p, p+1, ep - p); - ep--; l--; + ep--; l--; p--; } - *mem_start = _ALIGN(((unsigned long) namep) + strlen(namep) + 1, 4); + + /* now try to extract the unit name in that mess */ + for (p = namep, lp = NULL; *p; p++) + if (*p == '/') + lp = p + 1; + if (lp != NULL) + memmove(namep, lp, strlen(lp) + 1); + *mem_start = _ALIGN(((unsigned long) namep) + + strlen(namep) + 1, 4); } /* get it again for debugging */ + path = RELOC(prom_scratch); memset(path, 0, PROM_SCRATCH_SIZE); call_prom("package-to-path", 3, 1, node, path, PROM_SCRATCH_SIZE-1); @@ -1623,23 +1639,27 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start, prev_name = RELOC(""); sstart = (char *)RELOC(dt_string_start); for (;;) { - int rc; - - rc = call_prom("nextprop", 3, 1, node, prev_name, pname); - if (rc != 1) + if (call_prom("nextprop", 3, 1, node, prev_name, + RELOC(pname)) != 1) break; + /* skip "name" */ + if (strcmp(RELOC(pname), RELOC("name")) == 0) { + prev_name = RELOC("name"); + continue; + } + /* find string offset */ - soff = dt_find_string(pname); + soff = dt_find_string(RELOC(pname)); if (soff == 0) { - prom_printf("WARNING: Can't find string index for <%s>, node %s\n", - pname, path); + prom_printf("WARNING: Can't find string index for" + " <%s>, node %s\n", RELOC(pname), path); break; } prev_name = sstart + soff; /* get length */ - l = call_prom("getproplen", 2, 1, node, pname); + l = call_prom("getproplen", 2, 1, node, RELOC(pname)); /* sanity checks */ if (l == PROM_ERROR) @@ -1648,7 +1668,7 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start, prom_printf("WARNING: ignoring large property "); /* It seems OF doesn't null-terminate the path :-( */ prom_printf("[%s] ", path); - prom_printf("%s length 0x%x\n", pname, l); + prom_printf("%s length 0x%x\n", RELOC(pname), l); continue; } @@ -1658,17 +1678,16 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start, dt_push_token(soff, mem_start, mem_end); /* push property content */ - align = (l >= 8) ? 8 : 4; - valp = make_room(mem_start, mem_end, l, align); - call_prom("getprop", 4, 1, node, pname, valp, l); + valp = make_room(mem_start, mem_end, l, 4); + call_prom("getprop", 4, 1, node, RELOC(pname), valp, l); *mem_start = _ALIGN(*mem_start, 4); } /* Add a "linux,phandle" property. */ soff = dt_find_string(RELOC("linux,phandle")); if (soff == 0) - prom_printf("WARNING: Can't find string index for <linux-phandle>" - " node %s\n", path); + prom_printf("WARNING: Can't find string index for" + " <linux-phandle> node %s\n", path); else { dt_push_token(OF_DT_PROP, mem_start, mem_end); dt_push_token(4, mem_start, mem_end); @@ -1679,7 +1698,7 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start, /* do all our children */ child = call_prom("child", 1, 1, node); - while (child != (phandle)0) { + while (child != 0) { scan_dt_build_struct(child, mem_start, mem_end); child = call_prom("peer", 1, 1, child); } @@ -1718,7 +1737,8 @@ static void __init flatten_device_tree(void) /* Build header and make room for mem rsv map */ mem_start = _ALIGN(mem_start, 4); - hdr = make_room(&mem_start, &mem_end, sizeof(struct boot_param_header), 4); + hdr = make_room(&mem_start, &mem_end, + sizeof(struct boot_param_header), 4); RELOC(dt_header_start) = (unsigned long)hdr; rsvmap = make_room(&mem_start, &mem_end, sizeof(mem_reserve_map), 8); @@ -1731,11 +1751,11 @@ static void __init flatten_device_tree(void) namep = make_room(&mem_start, &mem_end, 16, 1); strcpy(namep, RELOC("linux,phandle")); mem_start = (unsigned long)namep + strlen(namep) + 1; - RELOC(dt_string_end) = mem_start; /* Build string array */ prom_printf("Building dt strings...\n"); scan_dt_build_strings(root, &mem_start, &mem_end); + RELOC(dt_string_end) = mem_start; /* Build structure */ mem_start = PAGE_ALIGN(mem_start); @@ -1750,9 +1770,11 @@ static void __init flatten_device_tree(void) hdr->totalsize = RELOC(dt_struct_end) - RELOC(dt_header_start); hdr->off_dt_struct = RELOC(dt_struct_start) - RELOC(dt_header_start); hdr->off_dt_strings = RELOC(dt_string_start) - RELOC(dt_header_start); + hdr->dt_strings_size = RELOC(dt_string_end) - RELOC(dt_string_start); hdr->off_mem_rsvmap = ((unsigned long)rsvmap) - RELOC(dt_header_start); hdr->version = OF_DT_VERSION; - hdr->last_comp_version = 1; + /* Version 16 is not backward compatible */ + hdr->last_comp_version = 0x10; /* Reserve the whole thing and copy the reserve map in, we * also bump mem_reserve_cnt to cause further reservations to @@ -1808,6 +1830,9 @@ static void __init fixup_device_tree(void) /* does it need fixup ? */ if (prom_getproplen(i2c, "interrupts") > 0) return; + + prom_printf("fixing up bogus interrupts for u3 i2c...\n"); + /* interrupt on this revision of u3 is number 0 and level */ interrupts[0] = 0; interrupts[1] = 1; diff --git a/arch/ppc64/kernel/rtas_pci.c b/arch/ppc64/kernel/rtas_pci.c index 1048817befb..1dccadaddd1 100644 --- a/arch/ppc64/kernel/rtas_pci.c +++ b/arch/ppc64/kernel/rtas_pci.c @@ -58,6 +58,21 @@ static int config_access_valid(struct device_node *dn, int where) return 0; } +static int of_device_available(struct device_node * dn) +{ + char * status; + + status = get_property(dn, "status", NULL); + + if (!status) + return 1; + + if (!strcmp(status, "okay")) + return 1; + + return 0; +} + static int rtas_read_config(struct device_node *dn, int where, int size, u32 *val) { int returnval = -1; @@ -103,7 +118,7 @@ static int rtas_pci_read_config(struct pci_bus *bus, /* Search only direct children of the bus */ for (dn = busdn->child; dn; dn = dn->sibling) - if (dn->devfn == devfn) + if (dn->devfn == devfn && of_device_available(dn)) return rtas_read_config(dn, where, size, val); return PCIBIOS_DEVICE_NOT_FOUND; } @@ -146,7 +161,7 @@ static int rtas_pci_write_config(struct pci_bus *bus, /* Search only direct children of the bus */ for (dn = busdn->child; dn; dn = dn->sibling) - if (dn->devfn == devfn) + if (dn->devfn == devfn && of_device_available(dn)) return rtas_write_config(dn, where, size, val); return PCIBIOS_DEVICE_NOT_FOUND; } diff --git a/arch/ppc64/kernel/setup.c b/arch/ppc64/kernel/setup.c index e9c24d2dbd9..ee3b20de2e7 100644 --- a/arch/ppc64/kernel/setup.c +++ b/arch/ppc64/kernel/setup.c @@ -536,15 +536,19 @@ static void __init check_for_initrd(void) DBG(" -> check_for_initrd()\n"); - prop = (u64 *)get_property(of_chosen, "linux,initrd-start", NULL); - if (prop != NULL) { - initrd_start = (unsigned long)__va(*prop); - prop = (u64 *)get_property(of_chosen, "linux,initrd-end", NULL); + if (of_chosen) { + prop = (u64 *)get_property(of_chosen, + "linux,initrd-start", NULL); if (prop != NULL) { - initrd_end = (unsigned long)__va(*prop); - initrd_below_start_ok = 1; - } else - initrd_start = 0; + initrd_start = (unsigned long)__va(*prop); + prop = (u64 *)get_property(of_chosen, + "linux,initrd-end", NULL); + if (prop != NULL) { + initrd_end = (unsigned long)__va(*prop); + initrd_below_start_ok = 1; + } else + initrd_start = 0; + } } /* If we were passed an initrd, set the ROOT_DEV properly if the values @@ -627,7 +631,7 @@ void __init setup_system(void) * Initialize xmon */ #ifdef CONFIG_XMON_DEFAULT - xmon_init(); + xmon_init(1); #endif /* * Register early console @@ -1343,11 +1347,13 @@ static int __init early_xmon(char *p) /* ensure xmon is enabled */ if (p) { if (strncmp(p, "on", 2) == 0) - xmon_init(); + xmon_init(1); + if (strncmp(p, "off", 3) == 0) + xmon_init(0); if (strncmp(p, "early", 5) != 0) return 0; } - xmon_init(); + xmon_init(1); debugger(NULL); return 0; diff --git a/arch/ppc64/kernel/sysfs.c b/arch/ppc64/kernel/sysfs.c index 02b8ac4e016..f311ee7c007 100644 --- a/arch/ppc64/kernel/sysfs.c +++ b/arch/ppc64/kernel/sysfs.c @@ -13,6 +13,7 @@ #include <asm/current.h> #include <asm/processor.h> #include <asm/cputable.h> +#include <asm/firmware.h> #include <asm/hvcall.h> #include <asm/prom.h> #include <asm/systemcfg.h> @@ -100,6 +101,8 @@ static int __init setup_smt_snooze_delay(char *str) } __setup("smt-snooze-delay=", setup_smt_snooze_delay); +#endif /* CONFIG_PPC_MULTIPLATFORM */ + /* * Enabling PMCs will slow partition context switch times so we only do * it the first time we write to the PMCs. @@ -109,65 +112,15 @@ static DEFINE_PER_CPU(char, pmcs_enabled); void ppc64_enable_pmcs(void) { - unsigned long hid0; -#ifdef CONFIG_PPC_PSERIES - unsigned long set, reset; -#endif /* CONFIG_PPC_PSERIES */ - /* Only need to enable them once */ if (__get_cpu_var(pmcs_enabled)) return; __get_cpu_var(pmcs_enabled) = 1; - switch (systemcfg->platform) { - case PLATFORM_PSERIES: - case PLATFORM_POWERMAC: - hid0 = mfspr(HID0); - hid0 |= 1UL << (63 - 20); - - /* POWER4 requires the following sequence */ - asm volatile( - "sync\n" - "mtspr %1, %0\n" - "mfspr %0, %1\n" - "mfspr %0, %1\n" - "mfspr %0, %1\n" - "mfspr %0, %1\n" - "mfspr %0, %1\n" - "mfspr %0, %1\n" - "isync" : "=&r" (hid0) : "i" (HID0), "0" (hid0): - "memory"); - break; - -#ifdef CONFIG_PPC_PSERIES - case PLATFORM_PSERIES_LPAR: - set = 1UL << 63; - reset = 0; - plpar_hcall_norets(H_PERFMON, set, reset); - break; -#endif /* CONFIG_PPC_PSERIES */ - - default: - break; - } - -#ifdef CONFIG_PPC_PSERIES - /* instruct hypervisor to maintain PMCs */ - if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR) - get_paca()->lppaca.pmcregs_in_use = 1; -#endif /* CONFIG_PPC_PSERIES */ + if (ppc_md.enable_pmcs) + ppc_md.enable_pmcs(); } - -#else - -/* PMC stuff */ -void ppc64_enable_pmcs(void) -{ - /* XXX Implement for iseries */ -} -#endif /* CONFIG_PPC_MULTIPLATFORM */ - EXPORT_SYMBOL(ppc64_enable_pmcs); /* XXX convert to rusty's on_one_cpu */ diff --git a/arch/ppc64/kernel/time.c b/arch/ppc64/kernel/time.c index 909462e1ade..1696e1b05bb 100644 --- a/arch/ppc64/kernel/time.c +++ b/arch/ppc64/kernel/time.c @@ -67,6 +67,7 @@ #include <asm/prom.h> #include <asm/sections.h> #include <asm/systemcfg.h> +#include <asm/firmware.h> u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; @@ -370,13 +371,11 @@ int timer_interrupt(struct pt_regs * regs) process_hvlpevents(regs); #endif -/* collect purr register values often, for accurate calculations */ -#if defined(CONFIG_PPC_PSERIES) - if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR) { + /* collect purr register values often, for accurate calculations */ + if (firmware_has_feature(FW_FEATURE_SPLPAR)) { struct cpu_usage *cu = &__get_cpu_var(cpu_usage_array); cu->current_tb = mfspr(SPRN_PURR); } -#endif irq_exit(); diff --git a/arch/ppc64/kernel/vio.c b/arch/ppc64/kernel/vio.c index 0c0ba71ac0e..c90e1dd875c 100644 --- a/arch/ppc64/kernel/vio.c +++ b/arch/ppc64/kernel/vio.c @@ -1,10 +1,11 @@ /* * IBM PowerPC Virtual I/O Infrastructure Support. * - * Copyright (c) 2003 IBM Corp. + * Copyright (c) 2003-2005 IBM Corp. * Dave Engebretsen engebret@us.ibm.com * Santiago Leon santil@us.ibm.com * Hollis Blanchard <hollisb@us.ibm.com> + * Stephen Rothwell * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -14,61 +15,30 @@ #include <linux/init.h> #include <linux/console.h> -#include <linux/version.h> #include <linux/module.h> -#include <linux/kobject.h> #include <linux/mm.h> #include <linux/dma-mapping.h> -#include <asm/rtas.h> #include <asm/iommu.h> #include <asm/dma.h> -#include <asm/ppcdebug.h> #include <asm/vio.h> -#include <asm/hvcall.h> -#include <asm/iSeries/vio.h> -#include <asm/iSeries/HvTypes.h> -#include <asm/iSeries/HvCallXm.h> -#include <asm/iSeries/HvLpConfig.h> - -#define DBGENTER() pr_debug("%s entered\n", __FUNCTION__) - -extern struct subsystem devices_subsys; /* needed for vio_find_name() */ static const struct vio_device_id *vio_match_device( const struct vio_device_id *, const struct vio_dev *); -#ifdef CONFIG_PPC_PSERIES -static struct iommu_table *vio_build_iommu_table(struct vio_dev *); -static int vio_num_address_cells; -#endif -#ifdef CONFIG_PPC_ISERIES -static struct iommu_table veth_iommu_table; -static struct iommu_table vio_iommu_table; -#endif -static struct vio_dev vio_bus_device = { /* fake "parent" device */ +struct vio_dev vio_bus_device = { /* fake "parent" device */ .name = vio_bus_device.dev.bus_id, .type = "", -#ifdef CONFIG_PPC_ISERIES - .iommu_table = &vio_iommu_table, -#endif .dev.bus_id = "vio", .dev.bus = &vio_bus_type, }; -#ifdef CONFIG_PPC_ISERIES -static struct vio_dev *__init vio_register_device_iseries(char *type, - uint32_t unit_num); - -struct device *iSeries_vio_dev = &vio_bus_device.dev; -EXPORT_SYMBOL(iSeries_vio_dev); - -#define device_is_compatible(a, b) 1 +static struct vio_bus_ops vio_bus_ops; -#endif - -/* convert from struct device to struct vio_dev and pass to driver. +/* + * Convert from struct device to struct vio_dev and pass to driver. * dev->driver has already been set by generic code because vio_bus_match - * succeeded. */ + * succeeded. + */ static int vio_bus_probe(struct device *dev) { struct vio_dev *viodev = to_vio_dev(dev); @@ -76,15 +46,12 @@ static int vio_bus_probe(struct device *dev) const struct vio_device_id *id; int error = -ENODEV; - DBGENTER(); - if (!viodrv->probe) return error; id = vio_match_device(viodrv->id_table, viodev); - if (id) { + if (id) error = viodrv->probe(viodev, id); - } return error; } @@ -95,11 +62,8 @@ static int vio_bus_remove(struct device *dev) struct vio_dev *viodev = to_vio_dev(dev); struct vio_driver *viodrv = to_vio_driver(dev->driver); - DBGENTER(); - - if (viodrv->remove) { + if (viodrv->remove) return viodrv->remove(viodev); - } /* driver can't remove */ return 1; @@ -135,193 +99,72 @@ void vio_unregister_driver(struct vio_driver *viodrv) EXPORT_SYMBOL(vio_unregister_driver); /** - * vio_match_device: - Tell if a VIO device has a matching VIO device id structure. - * @ids: array of VIO device id structures to search in - * @dev: the VIO device structure to match against + * vio_match_device: - Tell if a VIO device has a matching + * VIO device id structure. + * @ids: array of VIO device id structures to search in + * @dev: the VIO device structure to match against * * Used by a driver to check whether a VIO device present in the * system is in its list of supported devices. Returns the matching * vio_device_id structure or NULL if there is no match. */ -static const struct vio_device_id * vio_match_device(const struct vio_device_id *ids, - const struct vio_dev *dev) +static const struct vio_device_id *vio_match_device( + const struct vio_device_id *ids, const struct vio_dev *dev) { - DBGENTER(); - - while (ids->type) { - if ((strncmp(dev->type, ids->type, strlen(ids->type)) == 0) && - device_is_compatible(dev->dev.platform_data, ids->compat)) + while (ids->type[0] != '\0') { + if (vio_bus_ops.match(ids, dev)) return ids; ids++; } return NULL; } -#ifdef CONFIG_PPC_ISERIES -void __init iommu_vio_init(void) -{ - struct iommu_table *t; - struct iommu_table_cb cb; - unsigned long cbp; - unsigned long itc_entries; - - cb.itc_busno = 255; /* Bus 255 is the virtual bus */ - cb.itc_virtbus = 0xff; /* Ask for virtual bus */ - - cbp = virt_to_abs(&cb); - HvCallXm_getTceTableParms(cbp); - - itc_entries = cb.itc_size * PAGE_SIZE / sizeof(union tce_entry); - veth_iommu_table.it_size = itc_entries / 2; - veth_iommu_table.it_busno = cb.itc_busno; - veth_iommu_table.it_offset = cb.itc_offset; - veth_iommu_table.it_index = cb.itc_index; - veth_iommu_table.it_type = TCE_VB; - veth_iommu_table.it_blocksize = 1; - - t = iommu_init_table(&veth_iommu_table); - - if (!t) - printk("Virtual Bus VETH TCE table failed.\n"); - - vio_iommu_table.it_size = itc_entries - veth_iommu_table.it_size; - vio_iommu_table.it_busno = cb.itc_busno; - vio_iommu_table.it_offset = cb.itc_offset + - veth_iommu_table.it_size; - vio_iommu_table.it_index = cb.itc_index; - vio_iommu_table.it_type = TCE_VB; - vio_iommu_table.it_blocksize = 1; - - t = iommu_init_table(&vio_iommu_table); - - if (!t) - printk("Virtual Bus VIO TCE table failed.\n"); -} -#endif - -#ifdef CONFIG_PPC_PSERIES -static void probe_bus_pseries(void) -{ - struct device_node *node_vroot, *of_node; - - node_vroot = find_devices("vdevice"); - if ((node_vroot == NULL) || (node_vroot->child == NULL)) - /* this machine doesn't do virtual IO, and that's ok */ - return; - - vio_num_address_cells = prom_n_addr_cells(node_vroot->child); - - /* - * Create struct vio_devices for each virtual device in the device tree. - * Drivers will associate with them later. - */ - for (of_node = node_vroot->child; of_node != NULL; - of_node = of_node->sibling) { - printk(KERN_DEBUG "%s: processing %p\n", __FUNCTION__, of_node); - vio_register_device_node(of_node); - } -} -#endif - -#ifdef CONFIG_PPC_ISERIES -static void probe_bus_iseries(void) -{ - HvLpIndexMap vlan_map = HvLpConfig_getVirtualLanIndexMap(); - struct vio_dev *viodev; - int i; - - /* there is only one of each of these */ - vio_register_device_iseries("viocons", 0); - vio_register_device_iseries("vscsi", 0); - - vlan_map = HvLpConfig_getVirtualLanIndexMap(); - for (i = 0; i < HVMAXARCHITECTEDVIRTUALLANS; i++) { - if ((vlan_map & (0x8000 >> i)) == 0) - continue; - viodev = vio_register_device_iseries("vlan", i); - /* veth is special and has it own iommu_table */ - viodev->iommu_table = &veth_iommu_table; - } - for (i = 0; i < HVMAXARCHITECTEDVIRTUALDISKS; i++) - vio_register_device_iseries("viodasd", i); - for (i = 0; i < HVMAXARCHITECTEDVIRTUALCDROMS; i++) - vio_register_device_iseries("viocd", i); - for (i = 0; i < HVMAXARCHITECTEDVIRTUALTAPES; i++) - vio_register_device_iseries("viotape", i); -} -#endif - /** * vio_bus_init: - Initialize the virtual IO bus */ -static int __init vio_bus_init(void) +int __init vio_bus_init(struct vio_bus_ops *ops) { int err; + vio_bus_ops = *ops; + err = bus_register(&vio_bus_type); if (err) { printk(KERN_ERR "failed to register VIO bus\n"); return err; } - /* the fake parent of all vio devices, just to give us a nice directory */ + /* + * The fake parent of all vio devices, just to give us + * a nice directory + */ err = device_register(&vio_bus_device.dev); if (err) { - printk(KERN_WARNING "%s: device_register returned %i\n", __FUNCTION__, - err); + printk(KERN_WARNING "%s: device_register returned %i\n", + __FUNCTION__, err); return err; } -#ifdef CONFIG_PPC_PSERIES - probe_bus_pseries(); -#endif -#ifdef CONFIG_PPC_ISERIES - probe_bus_iseries(); -#endif - return 0; } -__initcall(vio_bus_init); - /* vio_dev refcount hit 0 */ static void __devinit vio_dev_release(struct device *dev) { - DBGENTER(); - -#ifdef CONFIG_PPC_PSERIES - /* XXX free TCE table */ - of_node_put(dev->platform_data); -#endif + if (vio_bus_ops.release_device) + vio_bus_ops.release_device(dev); kfree(to_vio_dev(dev)); } -#ifdef CONFIG_PPC_PSERIES -static ssize_t viodev_show_devspec(struct device *dev, struct device_attribute *attr, char *buf) -{ - struct device_node *of_node = dev->platform_data; - - return sprintf(buf, "%s\n", of_node->full_name); -} -DEVICE_ATTR(devspec, S_IRUSR | S_IRGRP | S_IROTH, viodev_show_devspec, NULL); -#endif - -static ssize_t viodev_show_name(struct device *dev, struct device_attribute *attr, char *buf) +static ssize_t viodev_show_name(struct device *dev, + struct device_attribute *attr, char *buf) { return sprintf(buf, "%s\n", to_vio_dev(dev)->name); } DEVICE_ATTR(name, S_IRUSR | S_IRGRP | S_IROTH, viodev_show_name, NULL); -static struct vio_dev * __devinit vio_register_device_common( - struct vio_dev *viodev, char *name, char *type, - uint32_t unit_address, struct iommu_table *iommu_table) +struct vio_dev * __devinit vio_register_device(struct vio_dev *viodev) { - DBGENTER(); - - viodev->name = name; - viodev->type = type; - viodev->unit_address = unit_address; - viodev->iommu_table = iommu_table; /* init generic 'struct device' fields: */ viodev->dev.parent = &vio_bus_device.dev; viodev->dev.bus = &vio_bus_type; @@ -338,222 +181,15 @@ static struct vio_dev * __devinit vio_register_device_common( return viodev; } -#ifdef CONFIG_PPC_PSERIES -/** - * vio_register_device_node: - Register a new vio device. - * @of_node: The OF node for this device. - * - * Creates and initializes a vio_dev structure from the data in - * of_node (dev.platform_data) and adds it to the list of virtual devices. - * Returns a pointer to the created vio_dev or NULL if node has - * NULL device_type or compatible fields. - */ -struct vio_dev * __devinit vio_register_device_node(struct device_node *of_node) -{ - struct vio_dev *viodev; - unsigned int *unit_address; - unsigned int *irq_p; - - DBGENTER(); - - /* we need the 'device_type' property, in order to match with drivers */ - if ((NULL == of_node->type)) { - printk(KERN_WARNING - "%s: node %s missing 'device_type'\n", __FUNCTION__, - of_node->name ? of_node->name : "<unknown>"); - return NULL; - } - - unit_address = (unsigned int *)get_property(of_node, "reg", NULL); - if (!unit_address) { - printk(KERN_WARNING "%s: node %s missing 'reg'\n", __FUNCTION__, - of_node->name ? of_node->name : "<unknown>"); - return NULL; - } - - /* allocate a vio_dev for this node */ - viodev = kmalloc(sizeof(struct vio_dev), GFP_KERNEL); - if (!viodev) { - return NULL; - } - memset(viodev, 0, sizeof(struct vio_dev)); - - viodev->dev.platform_data = of_node_get(of_node); - - viodev->irq = NO_IRQ; - irq_p = (unsigned int *)get_property(of_node, "interrupts", NULL); - if (irq_p) { - int virq = virt_irq_create_mapping(*irq_p); - if (virq == NO_IRQ) { - printk(KERN_ERR "Unable to allocate interrupt " - "number for %s\n", of_node->full_name); - } else - viodev->irq = irq_offset_up(virq); - } - - snprintf(viodev->dev.bus_id, BUS_ID_SIZE, "%x", *unit_address); - - /* register with generic device framework */ - if (vio_register_device_common(viodev, of_node->name, of_node->type, - *unit_address, vio_build_iommu_table(viodev)) - == NULL) { - /* XXX free TCE table */ - kfree(viodev); - return NULL; - } - device_create_file(&viodev->dev, &dev_attr_devspec); - - return viodev; -} -EXPORT_SYMBOL(vio_register_device_node); -#endif - -#ifdef CONFIG_PPC_ISERIES -/** - * vio_register_device: - Register a new vio device. - * @voidev: The device to register. - */ -static struct vio_dev *__init vio_register_device_iseries(char *type, - uint32_t unit_num) -{ - struct vio_dev *viodev; - - DBGENTER(); - - /* allocate a vio_dev for this node */ - viodev = kmalloc(sizeof(struct vio_dev), GFP_KERNEL); - if (!viodev) - return NULL; - memset(viodev, 0, sizeof(struct vio_dev)); - - snprintf(viodev->dev.bus_id, BUS_ID_SIZE, "%s%d", type, unit_num); - - return vio_register_device_common(viodev, viodev->dev.bus_id, type, - unit_num, &vio_iommu_table); -} -#endif - void __devinit vio_unregister_device(struct vio_dev *viodev) { - DBGENTER(); -#ifdef CONFIG_PPC_PSERIES - device_remove_file(&viodev->dev, &dev_attr_devspec); -#endif + if (vio_bus_ops.unregister_device) + vio_bus_ops.unregister_device(viodev); device_remove_file(&viodev->dev, &dev_attr_name); device_unregister(&viodev->dev); } EXPORT_SYMBOL(vio_unregister_device); -#ifdef CONFIG_PPC_PSERIES -/** - * vio_get_attribute: - get attribute for virtual device - * @vdev: The vio device to get property. - * @which: The property/attribute to be extracted. - * @length: Pointer to length of returned data size (unused if NULL). - * - * Calls prom.c's get_property() to return the value of the - * attribute specified by the preprocessor constant @which -*/ -const void * vio_get_attribute(struct vio_dev *vdev, void* which, int* length) -{ - return get_property(vdev->dev.platform_data, (char*)which, length); -} -EXPORT_SYMBOL(vio_get_attribute); - -/* vio_find_name() - internal because only vio.c knows how we formatted the - * kobject name - * XXX once vio_bus_type.devices is actually used as a kset in - * drivers/base/bus.c, this function should be removed in favor of - * "device_find(kobj_name, &vio_bus_type)" - */ -static struct vio_dev *vio_find_name(const char *kobj_name) -{ - struct kobject *found; - - found = kset_find_obj(&devices_subsys.kset, kobj_name); - if (!found) - return NULL; - - return to_vio_dev(container_of(found, struct device, kobj)); -} - -/** - * vio_find_node - find an already-registered vio_dev - * @vnode: device_node of the virtual device we're looking for - */ -struct vio_dev *vio_find_node(struct device_node *vnode) -{ - uint32_t *unit_address; - char kobj_name[BUS_ID_SIZE]; - - /* construct the kobject name from the device node */ - unit_address = (uint32_t *)get_property(vnode, "reg", NULL); - if (!unit_address) - return NULL; - snprintf(kobj_name, BUS_ID_SIZE, "%x", *unit_address); - - return vio_find_name(kobj_name); -} -EXPORT_SYMBOL(vio_find_node); - -/** - * vio_build_iommu_table: - gets the dma information from OF and builds the TCE tree. - * @dev: the virtual device. - * - * Returns a pointer to the built tce tree, or NULL if it can't - * find property. -*/ -static struct iommu_table * vio_build_iommu_table(struct vio_dev *dev) -{ - unsigned int *dma_window; - struct iommu_table *newTceTable; - unsigned long offset; - int dma_window_property_size; - - dma_window = (unsigned int *) get_property(dev->dev.platform_data, "ibm,my-dma-window", &dma_window_property_size); - if(!dma_window) { - return NULL; - } - - newTceTable = (struct iommu_table *) kmalloc(sizeof(struct iommu_table), GFP_KERNEL); - - /* There should be some code to extract the phys-encoded offset - using prom_n_addr_cells(). However, according to a comment - on earlier versions, it's always zero, so we don't bother */ - offset = dma_window[1] >> PAGE_SHIFT; - - /* TCE table size - measured in tce entries */ - newTceTable->it_size = dma_window[4] >> PAGE_SHIFT; - /* offset for VIO should always be 0 */ - newTceTable->it_offset = offset; - newTceTable->it_busno = 0; - newTceTable->it_index = (unsigned long)dma_window[0]; - newTceTable->it_type = TCE_VB; - - return iommu_init_table(newTceTable); -} - -int vio_enable_interrupts(struct vio_dev *dev) -{ - int rc = h_vio_signal(dev->unit_address, VIO_IRQ_ENABLE); - if (rc != H_Success) { - printk(KERN_ERR "vio: Error 0x%x enabling interrupts\n", rc); - } - return rc; -} -EXPORT_SYMBOL(vio_enable_interrupts); - -int vio_disable_interrupts(struct vio_dev *dev) -{ - int rc = h_vio_signal(dev->unit_address, VIO_IRQ_DISABLE); - if (rc != H_Success) { - printk(KERN_ERR "vio: Error 0x%x disabling interrupts\n", rc); - } - return rc; -} -EXPORT_SYMBOL(vio_disable_interrupts); -#endif - static dma_addr_t vio_map_single(struct device *dev, void *vaddr, size_t size, enum dma_data_direction direction) { @@ -615,18 +251,8 @@ static int vio_bus_match(struct device *dev, struct device_driver *drv) const struct vio_dev *vio_dev = to_vio_dev(dev); struct vio_driver *vio_drv = to_vio_driver(drv); const struct vio_device_id *ids = vio_drv->id_table; - const struct vio_device_id *found_id; - - DBGENTER(); - if (!ids) - return 0; - - found_id = vio_match_device(ids, vio_dev); - if (found_id) - return 1; - - return 0; + return (ids != NULL) && (vio_match_device(ids, vio_dev) != NULL); } struct bus_type vio_bus_type = { diff --git a/arch/ppc64/mm/hash_low.S b/arch/ppc64/mm/hash_low.S index fbff24827ae..35eb49e1b89 100644 --- a/arch/ppc64/mm/hash_low.S +++ b/arch/ppc64/mm/hash_low.S @@ -129,12 +129,10 @@ _GLOBAL(__hash_page) * code rather than call a C function...) */ BEGIN_FTR_SECTION -BEGIN_FTR_SECTION mr r4,r30 mr r5,r7 bl .hash_page_do_lazy_icache -END_FTR_SECTION_IFSET(CPU_FTR_NOEXECUTE) -END_FTR_SECTION_IFCLR(CPU_FTR_COHERENT_ICACHE) +END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE) /* At this point, r3 contains new PP bits, save them in * place of "access" in the param area (sic) diff --git a/arch/ppc64/mm/hash_native.c b/arch/ppc64/mm/hash_native.c index a6abd3a979b..7626bb59954 100644 --- a/arch/ppc64/mm/hash_native.c +++ b/arch/ppc64/mm/hash_native.c @@ -51,7 +51,6 @@ long native_hpte_insert(unsigned long hpte_group, unsigned long va, unsigned long prpn, unsigned long vflags, unsigned long rflags) { - unsigned long arpn = physRpn_to_absRpn(prpn); hpte_t *hptep = htab_address + hpte_group; unsigned long hpte_v, hpte_r; int i; @@ -74,7 +73,7 @@ long native_hpte_insert(unsigned long hpte_group, unsigned long va, hpte_v = (va >> 23) << HPTE_V_AVPN_SHIFT | vflags | HPTE_V_VALID; if (vflags & HPTE_V_LARGE) va &= ~(1UL << HPTE_V_AVPN_SHIFT); - hpte_r = (arpn << HPTE_R_RPN_SHIFT) | rflags; + hpte_r = (prpn << HPTE_R_RPN_SHIFT) | rflags; hptep->r = hpte_r; /* Guarantee the second dword is visible before the valid bit */ diff --git a/arch/ppc64/mm/hash_utils.c b/arch/ppc64/mm/hash_utils.c index 623b5d130c3..09475c8edf7 100644 --- a/arch/ppc64/mm/hash_utils.c +++ b/arch/ppc64/mm/hash_utils.c @@ -210,7 +210,7 @@ void __init htab_initialize(void) /* create bolted the linear mapping in the hash table */ for (i=0; i < lmb.memory.cnt; i++) { - base = lmb.memory.region[i].physbase + KERNELBASE; + base = lmb.memory.region[i].base + KERNELBASE; size = lmb.memory.region[i].size; DBG("creating mapping for region: %lx : %lx\n", base, size); @@ -302,7 +302,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap) int local = 0; cpumask_t tmp; - if ((ea & ~REGION_MASK) > EADDR_MASK) + if ((ea & ~REGION_MASK) >= PGTABLE_RANGE) return 1; switch (REGION_ID(ea)) { diff --git a/arch/ppc64/mm/hugetlbpage.c b/arch/ppc64/mm/hugetlbpage.c index f9524602818..e7833c80eb6 100644 --- a/arch/ppc64/mm/hugetlbpage.c +++ b/arch/ppc64/mm/hugetlbpage.c @@ -27,124 +27,94 @@ #include <linux/sysctl.h> -#define HUGEPGDIR_SHIFT (HPAGE_SHIFT + PAGE_SHIFT - 3) -#define HUGEPGDIR_SIZE (1UL << HUGEPGDIR_SHIFT) -#define HUGEPGDIR_MASK (~(HUGEPGDIR_SIZE-1)) +#define NUM_LOW_AREAS (0x100000000UL >> SID_SHIFT) +#define NUM_HIGH_AREAS (PGTABLE_RANGE >> HTLB_AREA_SHIFT) -#define HUGEPTE_INDEX_SIZE 9 -#define HUGEPGD_INDEX_SIZE 10 - -#define PTRS_PER_HUGEPTE (1 << HUGEPTE_INDEX_SIZE) -#define PTRS_PER_HUGEPGD (1 << HUGEPGD_INDEX_SIZE) - -static inline int hugepgd_index(unsigned long addr) -{ - return (addr & ~REGION_MASK) >> HUGEPGDIR_SHIFT; -} - -static pud_t *hugepgd_offset(struct mm_struct *mm, unsigned long addr) +/* Modelled after find_linux_pte() */ +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) { - int index; + pgd_t *pg; + pud_t *pu; + pmd_t *pm; + pte_t *pt; - if (! mm->context.huge_pgdir) - return NULL; + BUG_ON(! in_hugepage_area(mm->context, addr)); + addr &= HPAGE_MASK; + + pg = pgd_offset(mm, addr); + if (!pgd_none(*pg)) { + pu = pud_offset(pg, addr); + if (!pud_none(*pu)) { + pm = pmd_offset(pu, addr); + pt = (pte_t *)pm; + BUG_ON(!pmd_none(*pm) + && !(pte_present(*pt) && pte_huge(*pt))); + return pt; + } + } - index = hugepgd_index(addr); - BUG_ON(index >= PTRS_PER_HUGEPGD); - return (pud_t *)(mm->context.huge_pgdir + index); + return NULL; } -static inline pte_t *hugepte_offset(pud_t *dir, unsigned long addr) +pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) { - int index; - - if (pud_none(*dir)) - return NULL; + pgd_t *pg; + pud_t *pu; + pmd_t *pm; + pte_t *pt; - index = (addr >> HPAGE_SHIFT) % PTRS_PER_HUGEPTE; - return (pte_t *)pud_page(*dir) + index; -} - -static pud_t *hugepgd_alloc(struct mm_struct *mm, unsigned long addr) -{ BUG_ON(! in_hugepage_area(mm->context, addr)); - if (! mm->context.huge_pgdir) { - pgd_t *new; - spin_unlock(&mm->page_table_lock); - /* Don't use pgd_alloc(), because we want __GFP_REPEAT */ - new = kmem_cache_alloc(zero_cache, GFP_KERNEL | __GFP_REPEAT); - BUG_ON(memcmp(new, empty_zero_page, PAGE_SIZE)); - spin_lock(&mm->page_table_lock); + addr &= HPAGE_MASK; - /* - * Because we dropped the lock, we should re-check the - * entry, as somebody else could have populated it.. - */ - if (mm->context.huge_pgdir) - pgd_free(new); - else - mm->context.huge_pgdir = new; - } - return hugepgd_offset(mm, addr); -} + pg = pgd_offset(mm, addr); + pu = pud_alloc(mm, pg, addr); -static pte_t *hugepte_alloc(struct mm_struct *mm, pud_t *dir, unsigned long addr) -{ - if (! pud_present(*dir)) { - pte_t *new; - - spin_unlock(&mm->page_table_lock); - new = kmem_cache_alloc(zero_cache, GFP_KERNEL | __GFP_REPEAT); - BUG_ON(memcmp(new, empty_zero_page, PAGE_SIZE)); - spin_lock(&mm->page_table_lock); - /* - * Because we dropped the lock, we should re-check the - * entry, as somebody else could have populated it.. - */ - if (pud_present(*dir)) { - if (new) - kmem_cache_free(zero_cache, new); - } else { - struct page *ptepage; - - if (! new) - return NULL; - ptepage = virt_to_page(new); - ptepage->mapping = (void *) mm; - ptepage->index = addr & HUGEPGDIR_MASK; - pud_populate(mm, dir, new); + if (pu) { + pm = pmd_alloc(mm, pu, addr); + if (pm) { + pt = (pte_t *)pm; + BUG_ON(!pmd_none(*pm) + && !(pte_present(*pt) && pte_huge(*pt))); + return pt; } } - return hugepte_offset(dir, addr); + return NULL; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) -{ - pud_t *pud; +#define HUGEPTE_BATCH_SIZE (HPAGE_SIZE / PMD_SIZE) - BUG_ON(! in_hugepage_area(mm->context, addr)); +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + int i; - pud = hugepgd_offset(mm, addr); - if (! pud) - return NULL; + if (pte_present(*ptep)) { + pte_clear(mm, addr, ptep); + flush_tlb_pending(); + } - return hugepte_offset(pud, addr); + for (i = 0; i < HUGEPTE_BATCH_SIZE; i++) { + *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS); + ptep++; + } } -pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) +pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) { - pud_t *pud; + unsigned long old = pte_update(ptep, ~0UL); + int i; - BUG_ON(! in_hugepage_area(mm->context, addr)); + if (old & _PAGE_HASHPTE) + hpte_update(mm, addr, old, 0); - pud = hugepgd_alloc(mm, addr); - if (! pud) - return NULL; + for (i = 1; i < HUGEPTE_BATCH_SIZE; i++) + ptep[i] = __pte(0); - return hugepte_alloc(mm, pud, addr); + return __pte(old); } /* @@ -162,15 +132,17 @@ int is_aligned_hugepage_range(unsigned long addr, unsigned long len) return 0; } -static void flush_segments(void *parm) +static void flush_low_segments(void *parm) { - u16 segs = (unsigned long) parm; + u16 areas = (unsigned long) parm; unsigned long i; asm volatile("isync" : : : "memory"); - for (i = 0; i < 16; i++) { - if (! (segs & (1U << i))) + BUILD_BUG_ON((sizeof(areas)*8) != NUM_LOW_AREAS); + + for (i = 0; i < NUM_LOW_AREAS; i++) { + if (! (areas & (1U << i))) continue; asm volatile("slbie %0" : : "r" (i << SID_SHIFT)); } @@ -178,13 +150,33 @@ static void flush_segments(void *parm) asm volatile("isync" : : : "memory"); } -static int prepare_low_seg_for_htlb(struct mm_struct *mm, unsigned long seg) +static void flush_high_segments(void *parm) { - unsigned long start = seg << SID_SHIFT; - unsigned long end = (seg+1) << SID_SHIFT; + u16 areas = (unsigned long) parm; + unsigned long i, j; + + asm volatile("isync" : : : "memory"); + + BUILD_BUG_ON((sizeof(areas)*8) != NUM_HIGH_AREAS); + + for (i = 0; i < NUM_HIGH_AREAS; i++) { + if (! (areas & (1U << i))) + continue; + for (j = 0; j < (1UL << (HTLB_AREA_SHIFT-SID_SHIFT)); j++) + asm volatile("slbie %0" + :: "r" ((i << HTLB_AREA_SHIFT) + (j << SID_SHIFT))); + } + + asm volatile("isync" : : : "memory"); +} + +static int prepare_low_area_for_htlb(struct mm_struct *mm, unsigned long area) +{ + unsigned long start = area << SID_SHIFT; + unsigned long end = (area+1) << SID_SHIFT; struct vm_area_struct *vma; - BUG_ON(seg >= 16); + BUG_ON(area >= NUM_LOW_AREAS); /* Check no VMAs are in the region */ vma = find_vma(mm, start); @@ -194,20 +186,39 @@ static int prepare_low_seg_for_htlb(struct mm_struct *mm, unsigned long seg) return 0; } -static int open_low_hpage_segs(struct mm_struct *mm, u16 newsegs) +static int prepare_high_area_for_htlb(struct mm_struct *mm, unsigned long area) +{ + unsigned long start = area << HTLB_AREA_SHIFT; + unsigned long end = (area+1) << HTLB_AREA_SHIFT; + struct vm_area_struct *vma; + + BUG_ON(area >= NUM_HIGH_AREAS); + + /* Check no VMAs are in the region */ + vma = find_vma(mm, start); + if (vma && (vma->vm_start < end)) + return -EBUSY; + + return 0; +} + +static int open_low_hpage_areas(struct mm_struct *mm, u16 newareas) { unsigned long i; - newsegs &= ~(mm->context.htlb_segs); - if (! newsegs) + BUILD_BUG_ON((sizeof(newareas)*8) != NUM_LOW_AREAS); + BUILD_BUG_ON((sizeof(mm->context.low_htlb_areas)*8) != NUM_LOW_AREAS); + + newareas &= ~(mm->context.low_htlb_areas); + if (! newareas) return 0; /* The segments we want are already open */ - for (i = 0; i < 16; i++) - if ((1 << i) & newsegs) - if (prepare_low_seg_for_htlb(mm, i) != 0) + for (i = 0; i < NUM_LOW_AREAS; i++) + if ((1 << i) & newareas) + if (prepare_low_area_for_htlb(mm, i) != 0) return -EBUSY; - mm->context.htlb_segs |= newsegs; + mm->context.low_htlb_areas |= newareas; /* update the paca copy of the context struct */ get_paca()->context = mm->context; @@ -215,29 +226,63 @@ static int open_low_hpage_segs(struct mm_struct *mm, u16 newsegs) /* the context change must make it to memory before the flush, * so that further SLB misses do the right thing. */ mb(); - on_each_cpu(flush_segments, (void *)(unsigned long)newsegs, 0, 1); + on_each_cpu(flush_low_segments, (void *)(unsigned long)newareas, 0, 1); + + return 0; +} + +static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas) +{ + unsigned long i; + + BUILD_BUG_ON((sizeof(newareas)*8) != NUM_HIGH_AREAS); + BUILD_BUG_ON((sizeof(mm->context.high_htlb_areas)*8) + != NUM_HIGH_AREAS); + + newareas &= ~(mm->context.high_htlb_areas); + if (! newareas) + return 0; /* The areas we want are already open */ + + for (i = 0; i < NUM_HIGH_AREAS; i++) + if ((1 << i) & newareas) + if (prepare_high_area_for_htlb(mm, i) != 0) + return -EBUSY; + + mm->context.high_htlb_areas |= newareas; + + /* update the paca copy of the context struct */ + get_paca()->context = mm->context; + + /* the context change must make it to memory before the flush, + * so that further SLB misses do the right thing. */ + mb(); + on_each_cpu(flush_high_segments, (void *)(unsigned long)newareas, 0, 1); return 0; } int prepare_hugepage_range(unsigned long addr, unsigned long len) { - if (within_hugepage_high_range(addr, len)) - return 0; - else if ((addr < 0x100000000UL) && ((addr+len) < 0x100000000UL)) { - int err; - /* Yes, we need both tests, in case addr+len overflows - * 64-bit arithmetic */ - err = open_low_hpage_segs(current->mm, + int err; + + if ( (addr+len) < addr ) + return -EINVAL; + + if ((addr + len) < 0x100000000UL) + err = open_low_hpage_areas(current->mm, LOW_ESID_MASK(addr, len)); - if (err) - printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)" - " failed (segs: 0x%04hx)\n", addr, len, - LOW_ESID_MASK(addr, len)); + else + err = open_high_hpage_areas(current->mm, + HTLB_AREA_MASK(addr, len)); + if (err) { + printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)" + " failed (lowmask: 0x%04hx, highmask: 0x%04hx)\n", + addr, len, + LOW_ESID_MASK(addr, len), HTLB_AREA_MASK(addr, len)); return err; } - return -EINVAL; + return 0; } struct page * @@ -309,8 +354,8 @@ full_search: vma = find_vma(mm, addr); continue; } - if (touches_hugepage_high_range(addr, len)) { - addr = TASK_HPAGE_END; + if (touches_hugepage_high_range(mm, addr, len)) { + addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT); vma = find_vma(mm, addr); continue; } @@ -389,8 +434,9 @@ hugepage_recheck: if (touches_hugepage_low_range(mm, addr, len)) { addr = (addr & ((~0) << SID_SHIFT)) - len; goto hugepage_recheck; - } else if (touches_hugepage_high_range(addr, len)) { - addr = TASK_HPAGE_BASE - len; + } else if (touches_hugepage_high_range(mm, addr, len)) { + addr = (addr & ((~0UL) << HTLB_AREA_SHIFT)) - len; + goto hugepage_recheck; } /* @@ -481,23 +527,28 @@ static unsigned long htlb_get_low_area(unsigned long len, u16 segmask) return -ENOMEM; } -static unsigned long htlb_get_high_area(unsigned long len) +static unsigned long htlb_get_high_area(unsigned long len, u16 areamask) { - unsigned long addr = TASK_HPAGE_BASE; + unsigned long addr = 0x100000000UL; struct vm_area_struct *vma; vma = find_vma(current->mm, addr); - for (vma = find_vma(current->mm, addr); - addr + len <= TASK_HPAGE_END; - vma = vma->vm_next) { + while (addr + len <= TASK_SIZE_USER64) { BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */ - BUG_ON(! within_hugepage_high_range(addr, len)); + + if (! __within_hugepage_high_range(addr, len, areamask)) { + addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT); + vma = find_vma(current->mm, addr); + continue; + } if (!vma || (addr + len) <= vma->vm_start) return addr; addr = ALIGN(vma->vm_end, HPAGE_SIZE); - /* Because we're in a hugepage region, this alignment - * should not skip us over any VMAs */ + /* Depending on segmask this might not be a confirmed + * hugepage region, so the ALIGN could have skipped + * some VMAs */ + vma = find_vma(current->mm, addr); } return -ENOMEM; @@ -507,6 +558,9 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { + int lastshift; + u16 areamask, curareas; + if (len & ~HPAGE_MASK) return -EINVAL; @@ -514,67 +568,49 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, return -EINVAL; if (test_thread_flag(TIF_32BIT)) { - int lastshift = 0; - u16 segmask, cursegs = current->mm->context.htlb_segs; + curareas = current->mm->context.low_htlb_areas; /* First see if we can do the mapping in the existing - * low hpage segments */ - addr = htlb_get_low_area(len, cursegs); + * low areas */ + addr = htlb_get_low_area(len, curareas); if (addr != -ENOMEM) return addr; - for (segmask = LOW_ESID_MASK(0x100000000UL-len, len); - ! lastshift; segmask >>=1) { - if (segmask & 1) + lastshift = 0; + for (areamask = LOW_ESID_MASK(0x100000000UL-len, len); + ! lastshift; areamask >>=1) { + if (areamask & 1) lastshift = 1; - addr = htlb_get_low_area(len, cursegs | segmask); + addr = htlb_get_low_area(len, curareas | areamask); if ((addr != -ENOMEM) - && open_low_hpage_segs(current->mm, segmask) == 0) + && open_low_hpage_areas(current->mm, areamask) == 0) return addr; } - printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open" - " enough segments\n"); - return -ENOMEM; } else { - return htlb_get_high_area(len); - } -} - -void hugetlb_mm_free_pgd(struct mm_struct *mm) -{ - int i; - pgd_t *pgdir; - - spin_lock(&mm->page_table_lock); - - pgdir = mm->context.huge_pgdir; - if (! pgdir) - goto out; - - mm->context.huge_pgdir = NULL; + curareas = current->mm->context.high_htlb_areas; - /* cleanup any hugepte pages leftover */ - for (i = 0; i < PTRS_PER_HUGEPGD; i++) { - pud_t *pud = (pud_t *)(pgdir + i); - - if (! pud_none(*pud)) { - pte_t *pte = (pte_t *)pud_page(*pud); - struct page *ptepage = virt_to_page(pte); + /* First see if we can do the mapping in the existing + * high areas */ + addr = htlb_get_high_area(len, curareas); + if (addr != -ENOMEM) + return addr; - ptepage->mapping = NULL; + lastshift = 0; + for (areamask = HTLB_AREA_MASK(TASK_SIZE_USER64-len, len); + ! lastshift; areamask >>=1) { + if (areamask & 1) + lastshift = 1; - BUG_ON(memcmp(pte, empty_zero_page, PAGE_SIZE)); - kmem_cache_free(zero_cache, pte); + addr = htlb_get_high_area(len, curareas | areamask); + if ((addr != -ENOMEM) + && open_high_hpage_areas(current->mm, areamask) == 0) + return addr; } - pud_clear(pud); } - - BUG_ON(memcmp(pgdir, empty_zero_page, PAGE_SIZE)); - kmem_cache_free(zero_cache, pgdir); - - out: - spin_unlock(&mm->page_table_lock); + printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open" + " enough areas\n"); + return -ENOMEM; } int hash_huge_page(struct mm_struct *mm, unsigned long access, diff --git a/arch/ppc64/mm/imalloc.c b/arch/ppc64/mm/imalloc.c index b6e75b891ac..c65b87b9275 100644 --- a/arch/ppc64/mm/imalloc.c +++ b/arch/ppc64/mm/imalloc.c @@ -31,7 +31,7 @@ static int get_free_im_addr(unsigned long size, unsigned long *im_addr) break; if ((unsigned long)tmp->addr >= ioremap_bot) addr = tmp->size + (unsigned long) tmp->addr; - if (addr > IMALLOC_END-size) + if (addr >= IMALLOC_END-size) return 1; } *im_addr = addr; diff --git a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c index e58a24d4287..c02dc9809ca 100644 --- a/arch/ppc64/mm/init.c +++ b/arch/ppc64/mm/init.c @@ -42,7 +42,6 @@ #include <asm/pgalloc.h> #include <asm/page.h> -#include <asm/abs_addr.h> #include <asm/prom.h> #include <asm/lmb.h> #include <asm/rtas.h> @@ -66,6 +65,14 @@ #include <asm/vdso.h> #include <asm/imalloc.h> +#if PGTABLE_RANGE > USER_VSID_RANGE +#warning Limited user VSID range means pagetable space is wasted +#endif + +#if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE) +#warning TASK_SIZE is smaller than it needs to be. +#endif + int mem_init_done; unsigned long ioremap_bot = IMALLOC_BASE; static unsigned long phbs_io_bot = PHBS_IO_BASE; @@ -159,7 +166,6 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags) ptep = pte_alloc_kernel(&init_mm, pmdp, ea); if (!ptep) return -ENOMEM; - pa = abs_to_phys(pa); set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, __pgprot(flags))); spin_unlock(&init_mm.page_table_lock); @@ -226,7 +232,7 @@ void __iomem * __ioremap(unsigned long addr, unsigned long size, * Before that, we map using addresses going * up from ioremap_bot. imalloc will use * the addresses from ioremap_bot through - * IMALLOC_END (0xE000001fffffffff) + * IMALLOC_END * */ pa = addr & PAGE_MASK; @@ -417,12 +423,6 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) int index; int err; -#ifdef CONFIG_HUGETLB_PAGE - /* We leave htlb_segs as it was, but for a fork, we need to - * clear the huge_pgdir. */ - mm->context.huge_pgdir = NULL; -#endif - again: if (!idr_pre_get(&mmu_context_idr, GFP_KERNEL)) return -ENOMEM; @@ -453,8 +453,6 @@ void destroy_context(struct mm_struct *mm) spin_unlock(&mmu_context_lock); mm->context.id = NO_CONTEXT; - - hugetlb_mm_free_pgd(mm); } /* @@ -484,9 +482,9 @@ void __init mm_init_ppc64(void) for (i = 1; i < lmb.memory.cnt; i++) { unsigned long base, prevbase, prevsize; - prevbase = lmb.memory.region[i-1].physbase; + prevbase = lmb.memory.region[i-1].base; prevsize = lmb.memory.region[i-1].size; - base = lmb.memory.region[i].physbase; + base = lmb.memory.region[i].base; if (base > (prevbase + prevsize)) { io_hole_start = prevbase + prevsize; io_hole_size = base - (prevbase + prevsize); @@ -513,11 +511,8 @@ int page_is_ram(unsigned long pfn) for (i=0; i < lmb.memory.cnt; i++) { unsigned long base; -#ifdef CONFIG_MSCHUNKS - base = lmb.memory.region[i].physbase; -#else base = lmb.memory.region[i].base; -#endif + if ((paddr >= base) && (paddr < (base + lmb.memory.region[i].size))) { return 1; @@ -547,7 +542,7 @@ void __init do_init_bootmem(void) */ bootmap_pages = bootmem_bootmap_pages(total_pages); - start = abs_to_phys(lmb_alloc(bootmap_pages<<PAGE_SHIFT, PAGE_SIZE)); + start = lmb_alloc(bootmap_pages<<PAGE_SHIFT, PAGE_SIZE); BUG_ON(!start); boot_mapsize = init_bootmem(start >> PAGE_SHIFT, total_pages); @@ -558,25 +553,25 @@ void __init do_init_bootmem(void) * present. */ for (i=0; i < lmb.memory.cnt; i++) { - unsigned long physbase, size; + unsigned long base, size; unsigned long start_pfn, end_pfn; - physbase = lmb.memory.region[i].physbase; + base = lmb.memory.region[i].base; size = lmb.memory.region[i].size; - start_pfn = physbase >> PAGE_SHIFT; + start_pfn = base >> PAGE_SHIFT; end_pfn = start_pfn + (size >> PAGE_SHIFT); memory_present(0, start_pfn, end_pfn); - free_bootmem(physbase, size); + free_bootmem(base, size); } /* reserve the sections we're already using */ for (i=0; i < lmb.reserved.cnt; i++) { - unsigned long physbase = lmb.reserved.region[i].physbase; + unsigned long base = lmb.reserved.region[i].base; unsigned long size = lmb.reserved.region[i].size; - reserve_bootmem(physbase, size); + reserve_bootmem(base, size); } } @@ -615,10 +610,10 @@ static int __init setup_kcore(void) int i; for (i=0; i < lmb.memory.cnt; i++) { - unsigned long physbase, size; + unsigned long base, size; struct kcore_list *kcore_mem; - physbase = lmb.memory.region[i].physbase; + base = lmb.memory.region[i].base; size = lmb.memory.region[i].size; /* GFP_ATOMIC to avoid might_sleep warnings during boot */ @@ -626,7 +621,7 @@ static int __init setup_kcore(void) if (!kcore_mem) panic("mem_init: kmalloc failed\n"); - kclist_add(kcore_mem, __va(physbase), size); + kclist_add(kcore_mem, __va(base), size); } kclist_add(&kcore_vmem, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START); @@ -686,9 +681,6 @@ void __init mem_init(void) mem_init_done = 1; -#ifdef CONFIG_PPC_ISERIES - iommu_vio_init(); -#endif /* Initialize the vDSO */ vdso_init(); } @@ -833,23 +825,43 @@ void __iomem * reserve_phb_iospace(unsigned long size) return virt_addr; } -kmem_cache_t *zero_cache; - -static void zero_ctor(void *pte, kmem_cache_t *cache, unsigned long flags) +static void zero_ctor(void *addr, kmem_cache_t *cache, unsigned long flags) { - memset(pte, 0, PAGE_SIZE); + memset(addr, 0, kmem_cache_size(cache)); } +static const int pgtable_cache_size[2] = { + PTE_TABLE_SIZE, PMD_TABLE_SIZE +}; +static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = { + "pgd_pte_cache", "pud_pmd_cache", +}; + +kmem_cache_t *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)]; + void pgtable_cache_init(void) { - zero_cache = kmem_cache_create("zero", - PAGE_SIZE, - 0, - SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, - zero_ctor, - NULL); - if (!zero_cache) - panic("pgtable_cache_init(): could not create zero_cache!\n"); + int i; + + BUILD_BUG_ON(PTE_TABLE_SIZE != pgtable_cache_size[PTE_CACHE_NUM]); + BUILD_BUG_ON(PMD_TABLE_SIZE != pgtable_cache_size[PMD_CACHE_NUM]); + BUILD_BUG_ON(PUD_TABLE_SIZE != pgtable_cache_size[PUD_CACHE_NUM]); + BUILD_BUG_ON(PGD_TABLE_SIZE != pgtable_cache_size[PGD_CACHE_NUM]); + + for (i = 0; i < ARRAY_SIZE(pgtable_cache_size); i++) { + int size = pgtable_cache_size[i]; + const char *name = pgtable_cache_name[i]; + + pgtable_cache[i] = kmem_cache_create(name, + size, size, + SLAB_HWCACHE_ALIGN + | SLAB_MUST_HWCACHE_ALIGN, + zero_ctor, + NULL); + if (! pgtable_cache[i]) + panic("pgtable_cache_init(): could not create %s!\n", + name); + } } pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr, diff --git a/arch/ppc64/mm/numa.c b/arch/ppc64/mm/numa.c index 0b191f2de01..c3116f0d788 100644 --- a/arch/ppc64/mm/numa.c +++ b/arch/ppc64/mm/numa.c @@ -671,7 +671,7 @@ new_range: * Mark reserved regions on this node */ for (i = 0; i < lmb.reserved.cnt; i++) { - unsigned long physbase = lmb.reserved.region[i].physbase; + unsigned long physbase = lmb.reserved.region[i].base; unsigned long size = lmb.reserved.region[i].size; if (pa_to_nid(physbase) != nid && diff --git a/arch/ppc64/mm/slb_low.S b/arch/ppc64/mm/slb_low.S index 8379d678f70..bab255889c5 100644 --- a/arch/ppc64/mm/slb_low.S +++ b/arch/ppc64/mm/slb_low.S @@ -89,28 +89,29 @@ END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE) b 9f 0: /* user address: proto-VSID = context<<15 | ESID */ - li r11,SLB_VSID_USER - - srdi. r9,r3,13 + srdi. r9,r3,USER_ESID_BITS bne- 8f /* invalid ea bits set */ #ifdef CONFIG_HUGETLB_PAGE BEGIN_FTR_SECTION - /* check against the hugepage ranges */ - cmpldi r3,(TASK_HPAGE_END>>SID_SHIFT) - bge 6f /* >= TASK_HPAGE_END */ - cmpldi r3,(TASK_HPAGE_BASE>>SID_SHIFT) - bge 5f /* TASK_HPAGE_BASE..TASK_HPAGE_END */ + lhz r9,PACAHIGHHTLBAREAS(r13) + srdi r11,r3,(HTLB_AREA_SHIFT-SID_SHIFT) + srd r9,r9,r11 + andi. r9,r9,1 + bne 5f + + li r11,SLB_VSID_USER + cmpldi r3,16 - bge 6f /* 4GB..TASK_HPAGE_BASE */ + bge 6f - lhz r9,PACAHTLBSEGS(r13) + lhz r9,PACALOWHTLBAREAS(r13) srd r9,r9,r3 andi. r9,r9,1 + beq 6f -5: /* this is a hugepage user address */ - li r11,(SLB_VSID_USER|SLB_VSID_L) +5: li r11,SLB_VSID_USER|SLB_VSID_L END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE) #endif /* CONFIG_HUGETLB_PAGE */ diff --git a/arch/ppc64/mm/tlb.c b/arch/ppc64/mm/tlb.c index 26f0172c452..d8a6593a13f 100644 --- a/arch/ppc64/mm/tlb.c +++ b/arch/ppc64/mm/tlb.c @@ -41,7 +41,58 @@ DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur); unsigned long pte_freelist_forced_free; -void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage) +struct pte_freelist_batch +{ + struct rcu_head rcu; + unsigned int index; + pgtable_free_t tables[0]; +}; + +DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur); +unsigned long pte_freelist_forced_free; + +#define PTE_FREELIST_SIZE \ + ((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \ + / sizeof(pgtable_free_t)) + +#ifdef CONFIG_SMP +static void pte_free_smp_sync(void *arg) +{ + /* Do nothing, just ensure we sync with all CPUs */ +} +#endif + +/* This is only called when we are critically out of memory + * (and fail to get a page in pte_free_tlb). + */ +static void pgtable_free_now(pgtable_free_t pgf) +{ + pte_freelist_forced_free++; + + smp_call_function(pte_free_smp_sync, NULL, 0, 1); + + pgtable_free(pgf); +} + +static void pte_free_rcu_callback(struct rcu_head *head) +{ + struct pte_freelist_batch *batch = + container_of(head, struct pte_freelist_batch, rcu); + unsigned int i; + + for (i = 0; i < batch->index; i++) + pgtable_free(batch->tables[i]); + + free_page((unsigned long)batch); +} + +static void pte_free_submit(struct pte_freelist_batch *batch) +{ + INIT_RCU_HEAD(&batch->rcu); + call_rcu(&batch->rcu, pte_free_rcu_callback); +} + +void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf) { /* This is safe as we are holding page_table_lock */ cpumask_t local_cpumask = cpumask_of_cpu(smp_processor_id()); @@ -49,19 +100,19 @@ void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage) if (atomic_read(&tlb->mm->mm_users) < 2 || cpus_equal(tlb->mm->cpu_vm_mask, local_cpumask)) { - pte_free(ptepage); + pgtable_free(pgf); return; } if (*batchp == NULL) { *batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC); if (*batchp == NULL) { - pte_free_now(ptepage); + pgtable_free_now(pgf); return; } (*batchp)->index = 0; } - (*batchp)->pages[(*batchp)->index++] = ptepage; + (*batchp)->tables[(*batchp)->index++] = pgf; if ((*batchp)->index == PTE_FREELIST_SIZE) { pte_free_submit(*batchp); *batchp = NULL; @@ -132,42 +183,6 @@ void __flush_tlb_pending(struct ppc64_tlb_batch *batch) put_cpu(); } -#ifdef CONFIG_SMP -static void pte_free_smp_sync(void *arg) -{ - /* Do nothing, just ensure we sync with all CPUs */ -} -#endif - -/* This is only called when we are critically out of memory - * (and fail to get a page in pte_free_tlb). - */ -void pte_free_now(struct page *ptepage) -{ - pte_freelist_forced_free++; - - smp_call_function(pte_free_smp_sync, NULL, 0, 1); - - pte_free(ptepage); -} - -static void pte_free_rcu_callback(struct rcu_head *head) -{ - struct pte_freelist_batch *batch = - container_of(head, struct pte_freelist_batch, rcu); - unsigned int i; - - for (i = 0; i < batch->index; i++) - pte_free(batch->pages[i]); - free_page((unsigned long)batch); -} - -void pte_free_submit(struct pte_freelist_batch *batch) -{ - INIT_RCU_HEAD(&batch->rcu); - call_rcu(&batch->rcu, pte_free_rcu_callback); -} - void pte_free_finish(void) { /* This is safe as we are holding page_table_lock */ diff --git a/arch/ppc64/oprofile/common.c b/arch/ppc64/oprofile/common.c index b28bfda23d9..4acd1a42493 100644 --- a/arch/ppc64/oprofile/common.c +++ b/arch/ppc64/oprofile/common.c @@ -153,6 +153,7 @@ int __init oprofile_arch_init(struct oprofile_operations *ops) case PV_970: case PV_970FX: + case PV_970MP: model = &op_model_power4; model->num_counters = 8; ops->cpu_type = "ppc64/970"; diff --git a/arch/ppc64/xmon/start.c b/arch/ppc64/xmon/start.c index a9265bcc79b..f86b584acd7 100644 --- a/arch/ppc64/xmon/start.c +++ b/arch/ppc64/xmon/start.c @@ -27,7 +27,7 @@ static void sysrq_handle_xmon(int key, struct pt_regs *pt_regs, struct tty_struct *tty) { /* ensure xmon is enabled */ - xmon_init(); + xmon_init(1); debugger(pt_regs); } diff --git a/arch/ppc64/xmon/xmon.c b/arch/ppc64/xmon/xmon.c index 05539439e6b..45908b10acd 100644 --- a/arch/ppc64/xmon/xmon.c +++ b/arch/ppc64/xmon/xmon.c @@ -2496,15 +2496,25 @@ static void dump_stab(void) } } -void xmon_init(void) -{ - __debugger = xmon; - __debugger_ipi = xmon_ipi; - __debugger_bpt = xmon_bpt; - __debugger_sstep = xmon_sstep; - __debugger_iabr_match = xmon_iabr_match; - __debugger_dabr_match = xmon_dabr_match; - __debugger_fault_handler = xmon_fault_handler; +void xmon_init(int enable) +{ + if (enable) { + __debugger = xmon; + __debugger_ipi = xmon_ipi; + __debugger_bpt = xmon_bpt; + __debugger_sstep = xmon_sstep; + __debugger_iabr_match = xmon_iabr_match; + __debugger_dabr_match = xmon_dabr_match; + __debugger_fault_handler = xmon_fault_handler; + } else { + __debugger = NULL; + __debugger_ipi = NULL; + __debugger_bpt = NULL; + __debugger_sstep = NULL; + __debugger_iabr_match = NULL; + __debugger_dabr_match = NULL; + __debugger_fault_handler = NULL; + } } void dump_segments(void) diff --git a/drivers/atm/ambassador.c b/drivers/atm/ambassador.c index 73c6b85299c..d74a7c5e75d 100644 --- a/drivers/atm/ambassador.c +++ b/drivers/atm/ambassador.c @@ -513,7 +513,7 @@ static void rx_complete (amb_dev * dev, rx_out * rx) { // VC layer stats atomic_inc(&atm_vcc->stats->rx); - do_gettimeofday(&skb->stamp); + __net_timestamp(skb); // end of our responsability atm_vcc->push (atm_vcc, skb); return; diff --git a/drivers/atm/atmtcp.c b/drivers/atm/atmtcp.c index f2f01cb82cb..57f1810fdcc 100644 --- a/drivers/atm/atmtcp.c +++ b/drivers/atm/atmtcp.c @@ -325,7 +325,7 @@ static int atmtcp_c_send(struct atm_vcc *vcc,struct sk_buff *skb) result = -ENOBUFS; goto done; } - do_gettimeofday(&new_skb->stamp); + __net_timestamp(new_skb); memcpy(skb_put(new_skb,skb->len),skb->data,skb->len); out_vcc->push(out_vcc,new_skb); atomic_inc(&vcc->stats->tx); diff --git a/drivers/atm/eni.c b/drivers/atm/eni.c index 10da3693476..c13c4d736ef 100644 --- a/drivers/atm/eni.c +++ b/drivers/atm/eni.c @@ -537,7 +537,7 @@ static int rx_aal0(struct atm_vcc *vcc) return 0; } skb_put(skb,length); - skb->stamp = eni_vcc->timestamp; + skb_set_timestamp(skb, &eni_vcc->timestamp); DPRINTK("got len %ld\n",length); if (do_rx_dma(vcc,skb,1,length >> 2,length >> 2)) return 1; eni_vcc->rxing++; diff --git a/drivers/atm/firestream.c b/drivers/atm/firestream.c index b078fa548eb..58219744f5d 100644 --- a/drivers/atm/firestream.c +++ b/drivers/atm/firestream.c @@ -815,7 +815,7 @@ static void process_incoming (struct fs_dev *dev, struct queue *q) skb_put (skb, qe->p1 & 0xffff); ATM_SKB(skb)->vcc = atm_vcc; atomic_inc(&atm_vcc->stats->rx); - do_gettimeofday(&skb->stamp); + __net_timestamp(skb); fs_dprintk (FS_DEBUG_ALLOC, "Free rec-skb: %p (pushed)\n", skb); atm_vcc->push (atm_vcc, skb); fs_dprintk (FS_DEBUG_ALLOC, "Free rec-d: %p\n", pe); diff --git a/drivers/atm/fore200e.c b/drivers/atm/fore200e.c index 5f702199543..2bf723a7b6e 100644 --- a/drivers/atm/fore200e.c +++ b/drivers/atm/fore200e.c @@ -1176,7 +1176,7 @@ fore200e_push_rpd(struct fore200e* fore200e, struct atm_vcc* vcc, struct rpd* rp return -ENOMEM; } - do_gettimeofday(&skb->stamp); + __net_timestamp(skb); #ifdef FORE200E_52BYTE_AAL0_SDU if (cell_header) { diff --git a/drivers/atm/he.c b/drivers/atm/he.c index 28250c9b32d..fde9334059a 100644 --- a/drivers/atm/he.c +++ b/drivers/atm/he.c @@ -1886,7 +1886,7 @@ he_service_rbrq(struct he_dev *he_dev, int group) if (rx_skb_reserve > 0) skb_reserve(skb, rx_skb_reserve); - do_gettimeofday(&skb->stamp); + __net_timestamp(skb); for (iov = he_vcc->iov_head; iov < he_vcc->iov_tail; ++iov) { diff --git a/drivers/atm/horizon.c b/drivers/atm/horizon.c index 924a2c8988b..0cded046800 100644 --- a/drivers/atm/horizon.c +++ b/drivers/atm/horizon.c @@ -1034,7 +1034,7 @@ static void rx_schedule (hrz_dev * dev, int irq) { struct atm_vcc * vcc = ATM_SKB(skb)->vcc; // VC layer stats atomic_inc(&vcc->stats->rx); - do_gettimeofday(&skb->stamp); + __net_timestamp(skb); // end of our responsability vcc->push (vcc, skb); } diff --git a/drivers/atm/idt77252.c b/drivers/atm/idt77252.c index 30b7e990ed0..b4a76cade64 100644 --- a/drivers/atm/idt77252.c +++ b/drivers/atm/idt77252.c @@ -1101,7 +1101,7 @@ dequeue_rx(struct idt77252_dev *card, struct rsq_entry *rsqe) cell, ATM_CELL_PAYLOAD); ATM_SKB(sb)->vcc = vcc; - do_gettimeofday(&sb->stamp); + __net_timestamp(sb); vcc->push(vcc, sb); atomic_inc(&vcc->stats->rx); @@ -1179,7 +1179,7 @@ dequeue_rx(struct idt77252_dev *card, struct rsq_entry *rsqe) skb_trim(skb, len); ATM_SKB(skb)->vcc = vcc; - do_gettimeofday(&skb->stamp); + __net_timestamp(skb); vcc->push(vcc, skb); atomic_inc(&vcc->stats->rx); @@ -1201,7 +1201,7 @@ dequeue_rx(struct idt77252_dev *card, struct rsq_entry *rsqe) skb_trim(skb, len); ATM_SKB(skb)->vcc = vcc; - do_gettimeofday(&skb->stamp); + __net_timestamp(skb); vcc->push(vcc, skb); atomic_inc(&vcc->stats->rx); @@ -1340,7 +1340,7 @@ idt77252_rx_raw(struct idt77252_dev *card) ATM_CELL_PAYLOAD); ATM_SKB(sb)->vcc = vcc; - do_gettimeofday(&sb->stamp); + __net_timestamp(sb); vcc->push(vcc, sb); atomic_inc(&vcc->stats->rx); diff --git a/drivers/atm/lanai.c b/drivers/atm/lanai.c index ffe3afa723b..51ec1478729 100644 --- a/drivers/atm/lanai.c +++ b/drivers/atm/lanai.c @@ -1427,7 +1427,7 @@ static void vcc_rx_aal5(struct lanai_vcc *lvcc, int endptr) skb_put(skb, size); vcc_rx_memcpy(skb->data, lvcc, size); ATM_SKB(skb)->vcc = lvcc->rx.atmvcc; - do_gettimeofday(&skb->stamp); + __net_timestamp(skb); lvcc->rx.atmvcc->push(lvcc->rx.atmvcc, skb); atomic_inc(&lvcc->rx.atmvcc->stats->rx); out: diff --git a/drivers/atm/nicstar.c b/drivers/atm/nicstar.c index b2a7b754fd1..c57e20dcb0f 100644 --- a/drivers/atm/nicstar.c +++ b/drivers/atm/nicstar.c @@ -214,8 +214,7 @@ static int __devinit ns_init_card(int i, struct pci_dev *pcidev); static void __devinit ns_init_card_error(ns_dev *card, int error); static scq_info *get_scq(int size, u32 scd); static void free_scq(scq_info *scq, struct atm_vcc *vcc); -static void push_rxbufs(ns_dev *card, u32 type, u32 handle1, u32 addr1, - u32 handle2, u32 addr2); +static void push_rxbufs(ns_dev *, struct sk_buff *); static irqreturn_t ns_irq_handler(int irq, void *dev_id, struct pt_regs *regs); static int ns_open(struct atm_vcc *vcc); static void ns_close(struct atm_vcc *vcc); @@ -766,6 +765,7 @@ static int __devinit ns_init_card(int i, struct pci_dev *pcidev) ns_init_card_error(card, error); return error; } + NS_SKB_CB(hb)->buf_type = BUF_NONE; skb_queue_tail(&card->hbpool.queue, hb); card->hbpool.count++; } @@ -786,9 +786,10 @@ static int __devinit ns_init_card(int i, struct pci_dev *pcidev) ns_init_card_error(card, error); return error; } + NS_SKB_CB(lb)->buf_type = BUF_LG; skb_queue_tail(&card->lbpool.queue, lb); skb_reserve(lb, NS_SMBUFSIZE); - push_rxbufs(card, BUF_LG, (u32) lb, (u32) virt_to_bus(lb->data), 0, 0); + push_rxbufs(card, lb); /* Due to the implementation of push_rxbufs() this is 1, not 0 */ if (j == 1) { @@ -822,9 +823,10 @@ static int __devinit ns_init_card(int i, struct pci_dev *pcidev) ns_init_card_error(card, error); return error; } + NS_SKB_CB(sb)->buf_type = BUF_SM; skb_queue_tail(&card->sbpool.queue, sb); skb_reserve(sb, NS_AAL0_HEADER); - push_rxbufs(card, BUF_SM, (u32) sb, (u32) virt_to_bus(sb->data), 0, 0); + push_rxbufs(card, sb); } /* Test for strange behaviour which leads to crashes */ if ((bcount = ns_stat_sfbqc_get(readl(card->membase + STAT))) < card->sbnr.min) @@ -852,6 +854,7 @@ static int __devinit ns_init_card(int i, struct pci_dev *pcidev) ns_init_card_error(card, error); return error; } + NS_SKB_CB(iovb)->buf_type = BUF_NONE; skb_queue_tail(&card->iovpool.queue, iovb); card->iovpool.count++; } @@ -1078,12 +1081,18 @@ static void free_scq(scq_info *scq, struct atm_vcc *vcc) /* The handles passed must be pointers to the sk_buff containing the small or large buffer(s) cast to u32. */ -static void push_rxbufs(ns_dev *card, u32 type, u32 handle1, u32 addr1, - u32 handle2, u32 addr2) +static void push_rxbufs(ns_dev *card, struct sk_buff *skb) { + struct ns_skb_cb *cb = NS_SKB_CB(skb); + u32 handle1, addr1; + u32 handle2, addr2; u32 stat; unsigned long flags; + /* *BARF* */ + handle2 = addr2 = 0; + handle1 = (u32)skb; + addr1 = (u32)virt_to_bus(skb->data); #ifdef GENERAL_DEBUG if (!addr1) @@ -1093,7 +1102,7 @@ static void push_rxbufs(ns_dev *card, u32 type, u32 handle1, u32 addr1, stat = readl(card->membase + STAT); card->sbfqc = ns_stat_sfbqc_get(stat); card->lbfqc = ns_stat_lfbqc_get(stat); - if (type == BUF_SM) + if (cb->buf_type == BUF_SM) { if (!addr2) { @@ -1111,7 +1120,7 @@ static void push_rxbufs(ns_dev *card, u32 type, u32 handle1, u32 addr1, } } } - else /* type == BUF_LG */ + else /* buf_type == BUF_LG */ { if (!addr2) { @@ -1132,26 +1141,26 @@ static void push_rxbufs(ns_dev *card, u32 type, u32 handle1, u32 addr1, if (addr2) { - if (type == BUF_SM) + if (cb->buf_type == BUF_SM) { if (card->sbfqc >= card->sbnr.max) { - skb_unlink((struct sk_buff *) handle1); + skb_unlink((struct sk_buff *) handle1, &card->sbpool.queue); dev_kfree_skb_any((struct sk_buff *) handle1); - skb_unlink((struct sk_buff *) handle2); + skb_unlink((struct sk_buff *) handle2, &card->sbpool.queue); dev_kfree_skb_any((struct sk_buff *) handle2); return; } else card->sbfqc += 2; } - else /* (type == BUF_LG) */ + else /* (buf_type == BUF_LG) */ { if (card->lbfqc >= card->lbnr.max) { - skb_unlink((struct sk_buff *) handle1); + skb_unlink((struct sk_buff *) handle1, &card->lbpool.queue); dev_kfree_skb_any((struct sk_buff *) handle1); - skb_unlink((struct sk_buff *) handle2); + skb_unlink((struct sk_buff *) handle2, &card->lbpool.queue); dev_kfree_skb_any((struct sk_buff *) handle2); return; } @@ -1166,12 +1175,12 @@ static void push_rxbufs(ns_dev *card, u32 type, u32 handle1, u32 addr1, writel(handle2, card->membase + DR2); writel(addr1, card->membase + DR1); writel(handle1, card->membase + DR0); - writel(NS_CMD_WRITE_FREEBUFQ | (u32) type, card->membase + CMD); + writel(NS_CMD_WRITE_FREEBUFQ | cb->buf_type, card->membase + CMD); spin_unlock_irqrestore(&card->res_lock, flags); XPRINTK("nicstar%d: Pushing %s buffers at 0x%x and 0x%x.\n", card->index, - (type == BUF_SM ? "small" : "large"), addr1, addr2); + (cb->buf_type == BUF_SM ? "small" : "large"), addr1, addr2); } if (!card->efbie && card->sbfqc >= card->sbnr.min && @@ -1322,9 +1331,10 @@ static irqreturn_t ns_irq_handler(int irq, void *dev_id, struct pt_regs *regs) card->efbie = 0; break; } + NS_SKB_CB(sb)->buf_type = BUF_SM; skb_queue_tail(&card->sbpool.queue, sb); skb_reserve(sb, NS_AAL0_HEADER); - push_rxbufs(card, BUF_SM, (u32) sb, (u32) virt_to_bus(sb->data), 0, 0); + push_rxbufs(card, sb); } card->sbfqc = i; process_rsq(card); @@ -1348,9 +1358,10 @@ static irqreturn_t ns_irq_handler(int irq, void *dev_id, struct pt_regs *regs) card->efbie = 0; break; } + NS_SKB_CB(lb)->buf_type = BUF_LG; skb_queue_tail(&card->lbpool.queue, lb); skb_reserve(lb, NS_SMBUFSIZE); - push_rxbufs(card, BUF_LG, (u32) lb, (u32) virt_to_bus(lb->data), 0, 0); + push_rxbufs(card, lb); } card->lbfqc = i; process_rsq(card); @@ -2202,7 +2213,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe) memcpy(sb->tail, cell, ATM_CELL_PAYLOAD); skb_put(sb, ATM_CELL_PAYLOAD); ATM_SKB(sb)->vcc = vcc; - do_gettimeofday(&sb->stamp); + __net_timestamp(sb); vcc->push(vcc, sb); atomic_inc(&vcc->stats->rx); cell += ATM_CELL_PAYLOAD; @@ -2227,6 +2238,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe) recycle_rx_buf(card, skb); return; } + NS_SKB_CB(iovb)->buf_type = BUF_NONE; } else if (--card->iovpool.count < card->iovnr.min) @@ -2234,6 +2246,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe) struct sk_buff *new_iovb; if ((new_iovb = alloc_skb(NS_IOVBUFSIZE, GFP_ATOMIC)) != NULL) { + NS_SKB_CB(iovb)->buf_type = BUF_NONE; skb_queue_tail(&card->iovpool.queue, new_iovb); card->iovpool.count++; } @@ -2264,7 +2277,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe) if (NS_SKB(iovb)->iovcnt == 1) { - if (skb->list != &card->sbpool.queue) + if (NS_SKB_CB(skb)->buf_type != BUF_SM) { printk("nicstar%d: Expected a small buffer, and this is not one.\n", card->index); @@ -2278,7 +2291,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe) } else /* NS_SKB(iovb)->iovcnt >= 2 */ { - if (skb->list != &card->lbpool.queue) + if (NS_SKB_CB(skb)->buf_type != BUF_LG) { printk("nicstar%d: Expected a large buffer, and this is not one.\n", card->index); @@ -2322,8 +2335,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe) /* skb points to a small buffer */ if (!atm_charge(vcc, skb->truesize)) { - push_rxbufs(card, BUF_SM, (u32) skb, (u32) virt_to_bus(skb->data), - 0, 0); + push_rxbufs(card, skb); atomic_inc(&vcc->stats->rx_drop); } else @@ -2334,7 +2346,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe) skb->destructor = ns_sb_destructor; #endif /* NS_USE_DESTRUCTORS */ ATM_SKB(skb)->vcc = vcc; - do_gettimeofday(&skb->stamp); + __net_timestamp(skb); vcc->push(vcc, skb); atomic_inc(&vcc->stats->rx); } @@ -2350,8 +2362,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe) { if (!atm_charge(vcc, sb->truesize)) { - push_rxbufs(card, BUF_SM, (u32) sb, (u32) virt_to_bus(sb->data), - 0, 0); + push_rxbufs(card, sb); atomic_inc(&vcc->stats->rx_drop); } else @@ -2362,21 +2373,19 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe) sb->destructor = ns_sb_destructor; #endif /* NS_USE_DESTRUCTORS */ ATM_SKB(sb)->vcc = vcc; - do_gettimeofday(&sb->stamp); + __net_timestamp(sb); vcc->push(vcc, sb); atomic_inc(&vcc->stats->rx); } - push_rxbufs(card, BUF_LG, (u32) skb, - (u32) virt_to_bus(skb->data), 0, 0); + push_rxbufs(card, skb); } else /* len > NS_SMBUFSIZE, the usual case */ { if (!atm_charge(vcc, skb->truesize)) { - push_rxbufs(card, BUF_LG, (u32) skb, - (u32) virt_to_bus(skb->data), 0, 0); + push_rxbufs(card, skb); atomic_inc(&vcc->stats->rx_drop); } else @@ -2389,13 +2398,12 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe) memcpy(skb->data, sb->data, NS_SMBUFSIZE); skb_put(skb, len - NS_SMBUFSIZE); ATM_SKB(skb)->vcc = vcc; - do_gettimeofday(&skb->stamp); + __net_timestamp(skb); vcc->push(vcc, skb); atomic_inc(&vcc->stats->rx); } - push_rxbufs(card, BUF_SM, (u32) sb, (u32) virt_to_bus(sb->data), - 0, 0); + push_rxbufs(card, sb); } @@ -2430,6 +2438,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe) card->hbpool.count++; } } + NS_SKB_CB(hb)->buf_type = BUF_NONE; } else if (--card->hbpool.count < card->hbnr.min) @@ -2437,6 +2446,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe) struct sk_buff *new_hb; if ((new_hb = dev_alloc_skb(NS_HBUFSIZE)) != NULL) { + NS_SKB_CB(new_hb)->buf_type = BUF_NONE; skb_queue_tail(&card->hbpool.queue, new_hb); card->hbpool.count++; } @@ -2444,6 +2454,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe) { if ((new_hb = dev_alloc_skb(NS_HBUFSIZE)) != NULL) { + NS_SKB_CB(new_hb)->buf_type = BUF_NONE; skb_queue_tail(&card->hbpool.queue, new_hb); card->hbpool.count++; } @@ -2473,8 +2484,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe) remaining = len - iov->iov_len; iov++; /* Free the small buffer */ - push_rxbufs(card, BUF_SM, (u32) sb, (u32) virt_to_bus(sb->data), - 0, 0); + push_rxbufs(card, sb); /* Copy all large buffers to the huge buffer and free them */ for (j = 1; j < NS_SKB(iovb)->iovcnt; j++) @@ -2485,8 +2495,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe) skb_put(hb, tocopy); iov++; remaining -= tocopy; - push_rxbufs(card, BUF_LG, (u32) lb, - (u32) virt_to_bus(lb->data), 0, 0); + push_rxbufs(card, lb); } #ifdef EXTRA_DEBUG if (remaining != 0 || hb->len != len) @@ -2496,7 +2505,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe) #ifdef NS_USE_DESTRUCTORS hb->destructor = ns_hb_destructor; #endif /* NS_USE_DESTRUCTORS */ - do_gettimeofday(&hb->stamp); + __net_timestamp(hb); vcc->push(vcc, hb); atomic_inc(&vcc->stats->rx); } @@ -2527,9 +2536,10 @@ static void ns_sb_destructor(struct sk_buff *sb) sb = __dev_alloc_skb(NS_SMSKBSIZE, GFP_KERNEL); if (sb == NULL) break; + NS_SKB_CB(sb)->buf_type = BUF_SM; skb_queue_tail(&card->sbpool.queue, sb); skb_reserve(sb, NS_AAL0_HEADER); - push_rxbufs(card, BUF_SM, (u32) sb, (u32) virt_to_bus(sb->data), 0, 0); + push_rxbufs(card, sb); } while (card->sbfqc < card->sbnr.min); } @@ -2550,9 +2560,10 @@ static void ns_lb_destructor(struct sk_buff *lb) lb = __dev_alloc_skb(NS_LGSKBSIZE, GFP_KERNEL); if (lb == NULL) break; + NS_SKB_CB(lb)->buf_type = BUF_LG; skb_queue_tail(&card->lbpool.queue, lb); skb_reserve(lb, NS_SMBUFSIZE); - push_rxbufs(card, BUF_LG, (u32) lb, (u32) virt_to_bus(lb->data), 0, 0); + push_rxbufs(card, lb); } while (card->lbfqc < card->lbnr.min); } @@ -2569,6 +2580,7 @@ static void ns_hb_destructor(struct sk_buff *hb) hb = __dev_alloc_skb(NS_HBUFSIZE, GFP_KERNEL); if (hb == NULL) break; + NS_SKB_CB(hb)->buf_type = BUF_NONE; skb_queue_tail(&card->hbpool.queue, hb); card->hbpool.count++; } @@ -2577,45 +2589,25 @@ static void ns_hb_destructor(struct sk_buff *hb) #endif /* NS_USE_DESTRUCTORS */ - static void recycle_rx_buf(ns_dev *card, struct sk_buff *skb) { - if (skb->list == &card->sbpool.queue) - push_rxbufs(card, BUF_SM, (u32) skb, (u32) virt_to_bus(skb->data), 0, 0); - else if (skb->list == &card->lbpool.queue) - push_rxbufs(card, BUF_LG, (u32) skb, (u32) virt_to_bus(skb->data), 0, 0); - else - { - printk("nicstar%d: What kind of rx buffer is this?\n", card->index); - dev_kfree_skb_any(skb); - } -} + struct ns_skb_cb *cb = NS_SKB_CB(skb); + if (unlikely(cb->buf_type == BUF_NONE)) { + printk("nicstar%d: What kind of rx buffer is this?\n", card->index); + dev_kfree_skb_any(skb); + } else + push_rxbufs(card, skb); +} static void recycle_iovec_rx_bufs(ns_dev *card, struct iovec *iov, int count) { - struct sk_buff *skb; - - for (; count > 0; count--) - { - skb = (struct sk_buff *) (iov++)->iov_base; - if (skb->list == &card->sbpool.queue) - push_rxbufs(card, BUF_SM, (u32) skb, (u32) virt_to_bus(skb->data), - 0, 0); - else if (skb->list == &card->lbpool.queue) - push_rxbufs(card, BUF_LG, (u32) skb, (u32) virt_to_bus(skb->data), - 0, 0); - else - { - printk("nicstar%d: What kind of rx buffer is this?\n", card->index); - dev_kfree_skb_any(skb); - } - } + while (count-- > 0) + recycle_rx_buf(card, (struct sk_buff *) (iov++)->iov_base); } - static void recycle_iov_buf(ns_dev *card, struct sk_buff *iovb) { if (card->iovpool.count < card->iovnr.max) @@ -2631,7 +2623,7 @@ static void recycle_iov_buf(ns_dev *card, struct sk_buff *iovb) static void dequeue_sm_buf(ns_dev *card, struct sk_buff *sb) { - skb_unlink(sb); + skb_unlink(sb, &card->sbpool.queue); #ifdef NS_USE_DESTRUCTORS if (card->sbfqc < card->sbnr.min) #else @@ -2640,10 +2632,10 @@ static void dequeue_sm_buf(ns_dev *card, struct sk_buff *sb) struct sk_buff *new_sb; if ((new_sb = dev_alloc_skb(NS_SMSKBSIZE)) != NULL) { + NS_SKB_CB(new_sb)->buf_type = BUF_SM; skb_queue_tail(&card->sbpool.queue, new_sb); skb_reserve(new_sb, NS_AAL0_HEADER); - push_rxbufs(card, BUF_SM, (u32) new_sb, - (u32) virt_to_bus(new_sb->data), 0, 0); + push_rxbufs(card, new_sb); } } if (card->sbfqc < card->sbnr.init) @@ -2652,10 +2644,10 @@ static void dequeue_sm_buf(ns_dev *card, struct sk_buff *sb) struct sk_buff *new_sb; if ((new_sb = dev_alloc_skb(NS_SMSKBSIZE)) != NULL) { + NS_SKB_CB(new_sb)->buf_type = BUF_SM; skb_queue_tail(&card->sbpool.queue, new_sb); skb_reserve(new_sb, NS_AAL0_HEADER); - push_rxbufs(card, BUF_SM, (u32) new_sb, - (u32) virt_to_bus(new_sb->data), 0, 0); + push_rxbufs(card, new_sb); } } } @@ -2664,7 +2656,7 @@ static void dequeue_sm_buf(ns_dev *card, struct sk_buff *sb) static void dequeue_lg_buf(ns_dev *card, struct sk_buff *lb) { - skb_unlink(lb); + skb_unlink(lb, &card->lbpool.queue); #ifdef NS_USE_DESTRUCTORS if (card->lbfqc < card->lbnr.min) #else @@ -2673,10 +2665,10 @@ static void dequeue_lg_buf(ns_dev *card, struct sk_buff *lb) struct sk_buff *new_lb; if ((new_lb = dev_alloc_skb(NS_LGSKBSIZE)) != NULL) { + NS_SKB_CB(new_lb)->buf_type = BUF_LG; skb_queue_tail(&card->lbpool.queue, new_lb); skb_reserve(new_lb, NS_SMBUFSIZE); - push_rxbufs(card, BUF_LG, (u32) new_lb, - (u32) virt_to_bus(new_lb->data), 0, 0); + push_rxbufs(card, new_lb); } } if (card->lbfqc < card->lbnr.init) @@ -2685,10 +2677,10 @@ static void dequeue_lg_buf(ns_dev *card, struct sk_buff *lb) struct sk_buff *new_lb; if ((new_lb = dev_alloc_skb(NS_LGSKBSIZE)) != NULL) { + NS_SKB_CB(new_lb)->buf_type = BUF_LG; skb_queue_tail(&card->lbpool.queue, new_lb); skb_reserve(new_lb, NS_SMBUFSIZE); - push_rxbufs(card, BUF_LG, (u32) new_lb, - (u32) virt_to_bus(new_lb->data), 0, 0); + push_rxbufs(card, new_lb); } } } @@ -2880,9 +2872,10 @@ static int ns_ioctl(struct atm_dev *dev, unsigned int cmd, void __user *arg) sb = __dev_alloc_skb(NS_SMSKBSIZE, GFP_KERNEL); if (sb == NULL) return -ENOMEM; + NS_SKB_CB(sb)->buf_type = BUF_SM; skb_queue_tail(&card->sbpool.queue, sb); skb_reserve(sb, NS_AAL0_HEADER); - push_rxbufs(card, BUF_SM, (u32) sb, (u32) virt_to_bus(sb->data), 0, 0); + push_rxbufs(card, sb); } break; @@ -2894,9 +2887,10 @@ static int ns_ioctl(struct atm_dev *dev, unsigned int cmd, void __user *arg) lb = __dev_alloc_skb(NS_LGSKBSIZE, GFP_KERNEL); if (lb == NULL) return -ENOMEM; + NS_SKB_CB(lb)->buf_type = BUF_LG; skb_queue_tail(&card->lbpool.queue, lb); skb_reserve(lb, NS_SMBUFSIZE); - push_rxbufs(card, BUF_LG, (u32) lb, (u32) virt_to_bus(lb->data), 0, 0); + push_rxbufs(card, lb); } break; @@ -2923,6 +2917,7 @@ static int ns_ioctl(struct atm_dev *dev, unsigned int cmd, void __user *arg) hb = __dev_alloc_skb(NS_HBUFSIZE, GFP_KERNEL); if (hb == NULL) return -ENOMEM; + NS_SKB_CB(hb)->buf_type = BUF_NONE; ns_grab_int_lock(card, flags); skb_queue_tail(&card->hbpool.queue, hb); card->hbpool.count++; @@ -2953,6 +2948,7 @@ static int ns_ioctl(struct atm_dev *dev, unsigned int cmd, void __user *arg) iovb = alloc_skb(NS_IOVBUFSIZE, GFP_KERNEL); if (iovb == NULL) return -ENOMEM; + NS_SKB_CB(iovb)->buf_type = BUF_NONE; ns_grab_int_lock(card, flags); skb_queue_tail(&card->iovpool.queue, iovb); card->iovpool.count++; @@ -2979,17 +2975,12 @@ static int ns_ioctl(struct atm_dev *dev, unsigned int cmd, void __user *arg) } - static void which_list(ns_dev *card, struct sk_buff *skb) { - printk("It's a %s buffer.\n", skb->list == &card->sbpool.queue ? - "small" : skb->list == &card->lbpool.queue ? "large" : - skb->list == &card->hbpool.queue ? "huge" : - skb->list == &card->iovpool.queue ? "iovec" : "unknown"); + printk("skb buf_type: 0x%08x\n", NS_SKB_CB(skb)->buf_type); } - static void ns_poll(unsigned long arg) { int i; diff --git a/drivers/atm/nicstar.h b/drivers/atm/nicstar.h index ea83c46c8ba..5997bcb45b5 100644 --- a/drivers/atm/nicstar.h +++ b/drivers/atm/nicstar.h @@ -103,8 +103,14 @@ #define NS_IOREMAP_SIZE 4096 -#define BUF_SM 0x00000000 /* These two are used for push_rxbufs() */ -#define BUF_LG 0x00000001 /* CMD, Write_FreeBufQ, LBUF bit */ +/* + * BUF_XX distinguish the Rx buffers depending on their (small/large) size. + * BUG_SM and BUG_LG are both used by the driver and the device. + * BUF_NONE is only used by the driver. + */ +#define BUF_SM 0x00000000 /* These two are used for push_rxbufs() */ +#define BUF_LG 0x00000001 /* CMD, Write_FreeBufQ, LBUF bit */ +#define BUF_NONE 0xffffffff /* Software only: */ #define NS_HBUFSIZE 65568 /* Size of max. AAL5 PDU */ #define NS_MAX_IOVECS (2 + (65568 - NS_SMBUFSIZE) / \ @@ -684,6 +690,12 @@ enum ns_regs /* Device driver structures ***************************************************/ +struct ns_skb_cb { + u32 buf_type; /* BUF_SM/BUF_LG/BUF_NONE */ +}; + +#define NS_SKB_CB(skb) ((struct ns_skb_cb *)((skb)->cb)) + typedef struct tsq_info { void *org; diff --git a/drivers/atm/zatm.c b/drivers/atm/zatm.c index a2b236a966e..c4b75ecf946 100644 --- a/drivers/atm/zatm.c +++ b/drivers/atm/zatm.c @@ -400,7 +400,7 @@ unsigned long *x; EVENT("error code 0x%x/0x%x\n",(here[3] & uPD98401_AAL5_ES) >> uPD98401_AAL5_ES_SHIFT,error); skb = ((struct rx_buffer_head *) bus_to_virt(here[2]))->skb; - do_gettimeofday(&skb->stamp); + __net_timestamp(skb); #if 0 printk("[-3..0] 0x%08lx 0x%08lx 0x%08lx 0x%08lx\n",((unsigned *) skb->data)[-3], ((unsigned *) skb->data)[-2],((unsigned *) skb->data)[-1], @@ -417,10 +417,12 @@ printk("dummy: 0x%08lx, 0x%08lx\n",dummy[0],dummy[1]); chan = (here[3] & uPD98401_AAL5_CHAN) >> uPD98401_AAL5_CHAN_SHIFT; if (chan < zatm_dev->chans && zatm_dev->rx_map[chan]) { + int pos = ZATM_VCC(vcc)->pool; + vcc = zatm_dev->rx_map[chan]; - if (skb == zatm_dev->last_free[ZATM_VCC(vcc)->pool]) - zatm_dev->last_free[ZATM_VCC(vcc)->pool] = NULL; - skb_unlink(skb); + if (skb == zatm_dev->last_free[pos]) + zatm_dev->last_free[pos] = NULL; + skb_unlink(skb, zatm_dev->pool + pos); } else { printk(KERN_ERR DEV_LABEL "(itf %d): RX indication " diff --git a/drivers/block/aoe/aoenet.c b/drivers/block/aoe/aoenet.c index 9e6f51c528b..4be976940f6 100644 --- a/drivers/block/aoe/aoenet.c +++ b/drivers/block/aoe/aoenet.c @@ -120,7 +120,7 @@ aoenet_xmit(struct sk_buff *sl) * (1) len doesn't include the header by default. I want this. */ static int -aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt) +aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt, struct net_device *orig_dev) { struct aoe_hdr *h; u32 n; diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c index 46e56a25d2c..e46ecd23b3a 100644 --- a/drivers/block/viodasd.c +++ b/drivers/block/viodasd.c @@ -776,7 +776,7 @@ static int viodasd_remove(struct vio_dev *vdev) */ static struct vio_device_id viodasd_device_table[] __devinitdata = { { "viodasd", "" }, - { 0, } + { "", "" } }; MODULE_DEVICE_TABLE(vio, viodasd_device_table); diff --git a/drivers/bluetooth/bfusb.c b/drivers/bluetooth/bfusb.c index c42d7e6ac1c..1e9db0156ea 100644 --- a/drivers/bluetooth/bfusb.c +++ b/drivers/bluetooth/bfusb.c @@ -158,7 +158,7 @@ static int bfusb_send_bulk(struct bfusb *bfusb, struct sk_buff *skb) if (err) { BT_ERR("%s bulk tx submit failed urb %p err %d", bfusb->hdev->name, urb, err); - skb_unlink(skb); + skb_unlink(skb, &bfusb->pending_q); usb_free_urb(urb); } else atomic_inc(&bfusb->pending_tx); @@ -212,7 +212,7 @@ static void bfusb_tx_complete(struct urb *urb, struct pt_regs *regs) read_lock(&bfusb->lock); - skb_unlink(skb); + skb_unlink(skb, &bfusb->pending_q); skb_queue_tail(&bfusb->completed_q, skb); bfusb_tx_wakeup(bfusb); @@ -253,7 +253,7 @@ static int bfusb_rx_submit(struct bfusb *bfusb, struct urb *urb) if (err) { BT_ERR("%s bulk rx submit failed urb %p err %d", bfusb->hdev->name, urb, err); - skb_unlink(skb); + skb_unlink(skb, &bfusb->pending_q); kfree_skb(skb); usb_free_urb(urb); } @@ -330,7 +330,7 @@ static inline int bfusb_recv_block(struct bfusb *bfusb, int hdr, unsigned char * } skb->dev = (void *) bfusb->hdev; - skb->pkt_type = pkt_type; + bt_cb(skb)->pkt_type = pkt_type; bfusb->reassembly = skb; } else { @@ -398,7 +398,7 @@ static void bfusb_rx_complete(struct urb *urb, struct pt_regs *regs) buf += len; } - skb_unlink(skb); + skb_unlink(skb, &bfusb->pending_q); kfree_skb(skb); bfusb_rx_submit(bfusb, urb); @@ -485,7 +485,7 @@ static int bfusb_send_frame(struct sk_buff *skb) unsigned char buf[3]; int sent = 0, size, count; - BT_DBG("hdev %p skb %p type %d len %d", hdev, skb, skb->pkt_type, skb->len); + BT_DBG("hdev %p skb %p type %d len %d", hdev, skb, bt_cb(skb)->pkt_type, skb->len); if (!hdev) { BT_ERR("Frame for unknown HCI device (hdev=NULL)"); @@ -497,7 +497,7 @@ static int bfusb_send_frame(struct sk_buff *skb) bfusb = (struct bfusb *) hdev->driver_data; - switch (skb->pkt_type) { + switch (bt_cb(skb)->pkt_type) { case HCI_COMMAND_PKT: hdev->stat.cmd_tx++; break; @@ -510,7 +510,7 @@ static int bfusb_send_frame(struct sk_buff *skb) }; /* Prepend skb with frame type */ - memcpy(skb_push(skb, 1), &(skb->pkt_type), 1); + memcpy(skb_push(skb, 1), &bt_cb(skb)->pkt_type, 1); count = skb->len; diff --git a/drivers/bluetooth/bluecard_cs.c b/drivers/bluetooth/bluecard_cs.c index bd2ec7e284c..26fe9c0e1d2 100644 --- a/drivers/bluetooth/bluecard_cs.c +++ b/drivers/bluetooth/bluecard_cs.c @@ -270,7 +270,7 @@ static void bluecard_write_wakeup(bluecard_info_t *info) if (!(skb = skb_dequeue(&(info->txq)))) break; - if (skb->pkt_type & 0x80) { + if (bt_cb(skb)->pkt_type & 0x80) { /* Disable RTS */ info->ctrl_reg |= REG_CONTROL_RTS; outb(info->ctrl_reg, iobase + REG_CONTROL); @@ -288,13 +288,13 @@ static void bluecard_write_wakeup(bluecard_info_t *info) /* Mark the buffer as dirty */ clear_bit(ready_bit, &(info->tx_state)); - if (skb->pkt_type & 0x80) { + if (bt_cb(skb)->pkt_type & 0x80) { DECLARE_WAIT_QUEUE_HEAD(wq); DEFINE_WAIT(wait); unsigned char baud_reg; - switch (skb->pkt_type) { + switch (bt_cb(skb)->pkt_type) { case PKT_BAUD_RATE_460800: baud_reg = REG_CONTROL_BAUD_RATE_460800; break; @@ -410,9 +410,9 @@ static void bluecard_receive(bluecard_info_t *info, unsigned int offset) if (info->rx_state == RECV_WAIT_PACKET_TYPE) { info->rx_skb->dev = (void *) info->hdev; - info->rx_skb->pkt_type = buf[i]; + bt_cb(info->rx_skb)->pkt_type = buf[i]; - switch (info->rx_skb->pkt_type) { + switch (bt_cb(info->rx_skb)->pkt_type) { case 0x00: /* init packet */ @@ -444,7 +444,7 @@ static void bluecard_receive(bluecard_info_t *info, unsigned int offset) default: /* unknown packet */ - BT_ERR("Unknown HCI packet with type 0x%02x received", info->rx_skb->pkt_type); + BT_ERR("Unknown HCI packet with type 0x%02x received", bt_cb(info->rx_skb)->pkt_type); info->hdev->stat.err_rx++; kfree_skb(info->rx_skb); @@ -586,21 +586,21 @@ static int bluecard_hci_set_baud_rate(struct hci_dev *hdev, int baud) switch (baud) { case 460800: cmd[4] = 0x00; - skb->pkt_type = PKT_BAUD_RATE_460800; + bt_cb(skb)->pkt_type = PKT_BAUD_RATE_460800; break; case 230400: cmd[4] = 0x01; - skb->pkt_type = PKT_BAUD_RATE_230400; + bt_cb(skb)->pkt_type = PKT_BAUD_RATE_230400; break; case 115200: cmd[4] = 0x02; - skb->pkt_type = PKT_BAUD_RATE_115200; + bt_cb(skb)->pkt_type = PKT_BAUD_RATE_115200; break; case 57600: /* Fall through... */ default: cmd[4] = 0x03; - skb->pkt_type = PKT_BAUD_RATE_57600; + bt_cb(skb)->pkt_type = PKT_BAUD_RATE_57600; break; } @@ -680,7 +680,7 @@ static int bluecard_hci_send_frame(struct sk_buff *skb) info = (bluecard_info_t *)(hdev->driver_data); - switch (skb->pkt_type) { + switch (bt_cb(skb)->pkt_type) { case HCI_COMMAND_PKT: hdev->stat.cmd_tx++; break; @@ -693,7 +693,7 @@ static int bluecard_hci_send_frame(struct sk_buff *skb) }; /* Prepend skb with frame type */ - memcpy(skb_push(skb, 1), &(skb->pkt_type), 1); + memcpy(skb_push(skb, 1), &bt_cb(skb)->pkt_type, 1); skb_queue_tail(&(info->txq), skb); bluecard_write_wakeup(info); diff --git a/drivers/bluetooth/bpa10x.c b/drivers/bluetooth/bpa10x.c index f696da6f417..a1bf8f066c8 100644 --- a/drivers/bluetooth/bpa10x.c +++ b/drivers/bluetooth/bpa10x.c @@ -105,7 +105,7 @@ static void bpa10x_recv_bulk(struct bpa10x_data *data, unsigned char *buf, int c if (skb) { memcpy(skb_put(skb, len), buf, len); skb->dev = (void *) data->hdev; - skb->pkt_type = HCI_ACLDATA_PKT; + bt_cb(skb)->pkt_type = HCI_ACLDATA_PKT; hci_recv_frame(skb); } break; @@ -117,7 +117,7 @@ static void bpa10x_recv_bulk(struct bpa10x_data *data, unsigned char *buf, int c if (skb) { memcpy(skb_put(skb, len), buf, len); skb->dev = (void *) data->hdev; - skb->pkt_type = HCI_SCODATA_PKT; + bt_cb(skb)->pkt_type = HCI_SCODATA_PKT; hci_recv_frame(skb); } break; @@ -129,7 +129,7 @@ static void bpa10x_recv_bulk(struct bpa10x_data *data, unsigned char *buf, int c if (skb) { memcpy(skb_put(skb, len), buf, len); skb->dev = (void *) data->hdev; - skb->pkt_type = HCI_VENDOR_PKT; + bt_cb(skb)->pkt_type = HCI_VENDOR_PKT; hci_recv_frame(skb); } break; @@ -190,7 +190,7 @@ static int bpa10x_recv_event(struct bpa10x_data *data, unsigned char *buf, int s } skb->dev = (void *) data->hdev; - skb->pkt_type = pkt_type; + bt_cb(skb)->pkt_type = pkt_type; memcpy(skb_put(skb, size), buf, size); @@ -307,7 +307,8 @@ unlock: read_unlock(&data->lock); } -static inline struct urb *bpa10x_alloc_urb(struct usb_device *udev, unsigned int pipe, size_t size, int flags, void *data) +static inline struct urb *bpa10x_alloc_urb(struct usb_device *udev, unsigned int pipe, + size_t size, unsigned int __nocast flags, void *data) { struct urb *urb; struct usb_ctrlrequest *cr; @@ -487,7 +488,7 @@ static int bpa10x_send_frame(struct sk_buff *skb) struct hci_dev *hdev = (struct hci_dev *) skb->dev; struct bpa10x_data *data; - BT_DBG("hdev %p skb %p type %d len %d", hdev, skb, skb->pkt_type, skb->len); + BT_DBG("hdev %p skb %p type %d len %d", hdev, skb, bt_cb(skb)->pkt_type, skb->len); if (!hdev) { BT_ERR("Frame for unknown HCI device"); @@ -500,9 +501,9 @@ static int bpa10x_send_frame(struct sk_buff *skb) data = hdev->driver_data; /* Prepend skb with frame type */ - memcpy(skb_push(skb, 1), &(skb->pkt_type), 1); + memcpy(skb_push(skb, 1), &bt_cb(skb)->pkt_type, 1); - switch (skb->pkt_type) { + switch (bt_cb(skb)->pkt_type) { case HCI_COMMAND_PKT: hdev->stat.cmd_tx++; skb_queue_tail(&data->cmd_queue, skb); diff --git a/drivers/bluetooth/bt3c_cs.c b/drivers/bluetooth/bt3c_cs.c index adf1750ea58..2e0338d80f3 100644 --- a/drivers/bluetooth/bt3c_cs.c +++ b/drivers/bluetooth/bt3c_cs.c @@ -259,11 +259,11 @@ static void bt3c_receive(bt3c_info_t *info) if (info->rx_state == RECV_WAIT_PACKET_TYPE) { info->rx_skb->dev = (void *) info->hdev; - info->rx_skb->pkt_type = inb(iobase + DATA_L); + bt_cb(info->rx_skb)->pkt_type = inb(iobase + DATA_L); inb(iobase + DATA_H); - //printk("bt3c: PACKET_TYPE=%02x\n", info->rx_skb->pkt_type); + //printk("bt3c: PACKET_TYPE=%02x\n", bt_cb(info->rx_skb)->pkt_type); - switch (info->rx_skb->pkt_type) { + switch (bt_cb(info->rx_skb)->pkt_type) { case HCI_EVENT_PKT: info->rx_state = RECV_WAIT_EVENT_HEADER; @@ -282,7 +282,7 @@ static void bt3c_receive(bt3c_info_t *info) default: /* Unknown packet */ - BT_ERR("Unknown HCI packet with type 0x%02x received", info->rx_skb->pkt_type); + BT_ERR("Unknown HCI packet with type 0x%02x received", bt_cb(info->rx_skb)->pkt_type); info->hdev->stat.err_rx++; clear_bit(HCI_RUNNING, &(info->hdev->flags)); @@ -439,7 +439,7 @@ static int bt3c_hci_send_frame(struct sk_buff *skb) info = (bt3c_info_t *) (hdev->driver_data); - switch (skb->pkt_type) { + switch (bt_cb(skb)->pkt_type) { case HCI_COMMAND_PKT: hdev->stat.cmd_tx++; break; @@ -452,7 +452,7 @@ static int bt3c_hci_send_frame(struct sk_buff *skb) }; /* Prepend skb with frame type */ - memcpy(skb_push(skb, 1), &(skb->pkt_type), 1); + memcpy(skb_push(skb, 1), &bt_cb(skb)->pkt_type, 1); skb_queue_tail(&(info->txq), skb); spin_lock_irqsave(&(info->lock), flags); diff --git a/drivers/bluetooth/btuart_cs.c b/drivers/bluetooth/btuart_cs.c index e4c59fdc0e1..89486ea7a02 100644 --- a/drivers/bluetooth/btuart_cs.c +++ b/drivers/bluetooth/btuart_cs.c @@ -211,9 +211,9 @@ static void btuart_receive(btuart_info_t *info) if (info->rx_state == RECV_WAIT_PACKET_TYPE) { info->rx_skb->dev = (void *) info->hdev; - info->rx_skb->pkt_type = inb(iobase + UART_RX); + bt_cb(info->rx_skb)->pkt_type = inb(iobase + UART_RX); - switch (info->rx_skb->pkt_type) { + switch (bt_cb(info->rx_skb)->pkt_type) { case HCI_EVENT_PKT: info->rx_state = RECV_WAIT_EVENT_HEADER; @@ -232,7 +232,7 @@ static void btuart_receive(btuart_info_t *info) default: /* Unknown packet */ - BT_ERR("Unknown HCI packet with type 0x%02x received", info->rx_skb->pkt_type); + BT_ERR("Unknown HCI packet with type 0x%02x received", bt_cb(info->rx_skb)->pkt_type); info->hdev->stat.err_rx++; clear_bit(HCI_RUNNING, &(info->hdev->flags)); @@ -447,7 +447,7 @@ static int btuart_hci_send_frame(struct sk_buff *skb) info = (btuart_info_t *)(hdev->driver_data); - switch (skb->pkt_type) { + switch (bt_cb(skb)->pkt_type) { case HCI_COMMAND_PKT: hdev->stat.cmd_tx++; break; @@ -460,7 +460,7 @@ static int btuart_hci_send_frame(struct sk_buff *skb) }; /* Prepend skb with frame type */ - memcpy(skb_push(skb, 1), &(skb->pkt_type), 1); + memcpy(skb_push(skb, 1), &bt_cb(skb)->pkt_type, 1); skb_queue_tail(&(info->txq), skb); btuart_write_wakeup(info); diff --git a/drivers/bluetooth/dtl1_cs.c b/drivers/bluetooth/dtl1_cs.c index e39868c3da4..84c1f883942 100644 --- a/drivers/bluetooth/dtl1_cs.c +++ b/drivers/bluetooth/dtl1_cs.c @@ -251,7 +251,7 @@ static void dtl1_receive(dtl1_info_t *info) info->rx_count = nsh->len + (nsh->len & 0x0001); break; case RECV_WAIT_DATA: - info->rx_skb->pkt_type = nsh->type; + bt_cb(info->rx_skb)->pkt_type = nsh->type; /* remove PAD byte if it exists */ if (nsh->len & 0x0001) { @@ -262,7 +262,7 @@ static void dtl1_receive(dtl1_info_t *info) /* remove NSH */ skb_pull(info->rx_skb, NSHL); - switch (info->rx_skb->pkt_type) { + switch (bt_cb(info->rx_skb)->pkt_type) { case 0x80: /* control data for the Nokia Card */ dtl1_control(info, info->rx_skb); @@ -272,12 +272,12 @@ static void dtl1_receive(dtl1_info_t *info) case 0x84: /* send frame to the HCI layer */ info->rx_skb->dev = (void *) info->hdev; - info->rx_skb->pkt_type &= 0x0f; + bt_cb(info->rx_skb)->pkt_type &= 0x0f; hci_recv_frame(info->rx_skb); break; default: /* unknown packet */ - BT_ERR("Unknown HCI packet with type 0x%02x received", info->rx_skb->pkt_type); + BT_ERR("Unknown HCI packet with type 0x%02x received", bt_cb(info->rx_skb)->pkt_type); kfree_skb(info->rx_skb); break; } @@ -410,7 +410,7 @@ static int dtl1_hci_send_frame(struct sk_buff *skb) info = (dtl1_info_t *)(hdev->driver_data); - switch (skb->pkt_type) { + switch (bt_cb(skb)->pkt_type) { case HCI_COMMAND_PKT: hdev->stat.cmd_tx++; nsh.type = 0x81; diff --git a/drivers/bluetooth/hci_bcsp.c b/drivers/bluetooth/hci_bcsp.c index 858fddb046d..0ee324e1265 100644 --- a/drivers/bluetooth/hci_bcsp.c +++ b/drivers/bluetooth/hci_bcsp.c @@ -149,7 +149,7 @@ static int bcsp_enqueue(struct hci_uart *hu, struct sk_buff *skb) return 0; } - switch (skb->pkt_type) { + switch (bt_cb(skb)->pkt_type) { case HCI_ACLDATA_PKT: case HCI_COMMAND_PKT: skb_queue_tail(&bcsp->rel, skb); @@ -227,7 +227,7 @@ static struct sk_buff *bcsp_prepare_pkt(struct bcsp_struct *bcsp, u8 *data, if (!nskb) return NULL; - nskb->pkt_type = pkt_type; + bt_cb(nskb)->pkt_type = pkt_type; bcsp_slip_msgdelim(nskb); @@ -286,7 +286,7 @@ static struct sk_buff *bcsp_dequeue(struct hci_uart *hu) since they have priority */ if ((skb = skb_dequeue(&bcsp->unrel)) != NULL) { - struct sk_buff *nskb = bcsp_prepare_pkt(bcsp, skb->data, skb->len, skb->pkt_type); + struct sk_buff *nskb = bcsp_prepare_pkt(bcsp, skb->data, skb->len, bt_cb(skb)->pkt_type); if (nskb) { kfree_skb(skb); return nskb; @@ -303,7 +303,7 @@ static struct sk_buff *bcsp_dequeue(struct hci_uart *hu) spin_lock_irqsave(&bcsp->unack.lock, flags); if (bcsp->unack.qlen < BCSP_TXWINSIZE && (skb = skb_dequeue(&bcsp->rel)) != NULL) { - struct sk_buff *nskb = bcsp_prepare_pkt(bcsp, skb->data, skb->len, skb->pkt_type); + struct sk_buff *nskb = bcsp_prepare_pkt(bcsp, skb->data, skb->len, bt_cb(skb)->pkt_type); if (nskb) { __skb_queue_tail(&bcsp->unack, skb); mod_timer(&bcsp->tbcsp, jiffies + HZ / 4); @@ -401,7 +401,7 @@ static void bcsp_handle_le_pkt(struct hci_uart *hu) if (!nskb) return; memcpy(skb_put(nskb, 4), conf_rsp_pkt, 4); - nskb->pkt_type = BCSP_LE_PKT; + bt_cb(nskb)->pkt_type = BCSP_LE_PKT; skb_queue_head(&bcsp->unrel, nskb); hci_uart_tx_wakeup(hu); @@ -483,14 +483,14 @@ static inline void bcsp_complete_rx_pkt(struct hci_uart *hu) bcsp_pkt_cull(bcsp); if ((bcsp->rx_skb->data[1] & 0x0f) == 6 && bcsp->rx_skb->data[0] & 0x80) { - bcsp->rx_skb->pkt_type = HCI_ACLDATA_PKT; + bt_cb(bcsp->rx_skb)->pkt_type = HCI_ACLDATA_PKT; pass_up = 1; } else if ((bcsp->rx_skb->data[1] & 0x0f) == 5 && bcsp->rx_skb->data[0] & 0x80) { - bcsp->rx_skb->pkt_type = HCI_EVENT_PKT; + bt_cb(bcsp->rx_skb)->pkt_type = HCI_EVENT_PKT; pass_up = 1; } else if ((bcsp->rx_skb->data[1] & 0x0f) == 7) { - bcsp->rx_skb->pkt_type = HCI_SCODATA_PKT; + bt_cb(bcsp->rx_skb)->pkt_type = HCI_SCODATA_PKT; pass_up = 1; } else if ((bcsp->rx_skb->data[1] & 0x0f) == 1 && !(bcsp->rx_skb->data[0] & 0x80)) { @@ -512,7 +512,7 @@ static inline void bcsp_complete_rx_pkt(struct hci_uart *hu) hdr.evt = 0xff; hdr.plen = bcsp->rx_skb->len; memcpy(skb_push(bcsp->rx_skb, HCI_EVENT_HDR_SIZE), &hdr, HCI_EVENT_HDR_SIZE); - bcsp->rx_skb->pkt_type = HCI_EVENT_PKT; + bt_cb(bcsp->rx_skb)->pkt_type = HCI_EVENT_PKT; hci_recv_frame(bcsp->rx_skb); } else { diff --git a/drivers/bluetooth/hci_h4.c b/drivers/bluetooth/hci_h4.c index 533323b60e6..cf8a22d58d9 100644 --- a/drivers/bluetooth/hci_h4.c +++ b/drivers/bluetooth/hci_h4.c @@ -112,7 +112,7 @@ static int h4_enqueue(struct hci_uart *hu, struct sk_buff *skb) BT_DBG("hu %p skb %p", hu, skb); /* Prepend skb with frame type */ - memcpy(skb_push(skb, 1), &skb->pkt_type, 1); + memcpy(skb_push(skb, 1), &bt_cb(skb)->pkt_type, 1); skb_queue_tail(&h4->txq, skb); return 0; } @@ -239,7 +239,7 @@ static int h4_recv(struct hci_uart *hu, void *data, int count) return 0; } h4->rx_skb->dev = (void *) hu->hdev; - h4->rx_skb->pkt_type = type; + bt_cb(h4->rx_skb)->pkt_type = type; } return count; } diff --git a/drivers/bluetooth/hci_ldisc.c b/drivers/bluetooth/hci_ldisc.c index 90be2eae52e..aed80cc2289 100644 --- a/drivers/bluetooth/hci_ldisc.c +++ b/drivers/bluetooth/hci_ldisc.c @@ -153,7 +153,7 @@ restart: break; } - hci_uart_tx_complete(hu, skb->pkt_type); + hci_uart_tx_complete(hu, bt_cb(skb)->pkt_type); kfree_skb(skb); } @@ -229,7 +229,7 @@ static int hci_uart_send_frame(struct sk_buff *skb) hu = (struct hci_uart *) hdev->driver_data; tty = hu->tty; - BT_DBG("%s: type %d len %d", hdev->name, skb->pkt_type, skb->len); + BT_DBG("%s: type %d len %d", hdev->name, bt_cb(skb)->pkt_type, skb->len); hu->proto->enqueue(hu, skb); diff --git a/drivers/bluetooth/hci_usb.c b/drivers/bluetooth/hci_usb.c index 657719b8254..67d96b5cbb9 100644 --- a/drivers/bluetooth/hci_usb.c +++ b/drivers/bluetooth/hci_usb.c @@ -127,7 +127,7 @@ static struct usb_device_id blacklist_ids[] = { { } /* Terminating entry */ }; -static struct _urb *_urb_alloc(int isoc, int gfp) +static struct _urb *_urb_alloc(int isoc, unsigned int __nocast gfp) { struct _urb *_urb = kmalloc(sizeof(struct _urb) + sizeof(struct usb_iso_packet_descriptor) * isoc, gfp); @@ -443,7 +443,7 @@ static int __tx_submit(struct hci_usb *husb, struct _urb *_urb) static inline int hci_usb_send_ctrl(struct hci_usb *husb, struct sk_buff *skb) { - struct _urb *_urb = __get_completed(husb, skb->pkt_type); + struct _urb *_urb = __get_completed(husb, bt_cb(skb)->pkt_type); struct usb_ctrlrequest *dr; struct urb *urb; @@ -451,7 +451,7 @@ static inline int hci_usb_send_ctrl(struct hci_usb *husb, struct sk_buff *skb) _urb = _urb_alloc(0, GFP_ATOMIC); if (!_urb) return -ENOMEM; - _urb->type = skb->pkt_type; + _urb->type = bt_cb(skb)->pkt_type; dr = kmalloc(sizeof(*dr), GFP_ATOMIC); if (!dr) { @@ -479,7 +479,7 @@ static inline int hci_usb_send_ctrl(struct hci_usb *husb, struct sk_buff *skb) static inline int hci_usb_send_bulk(struct hci_usb *husb, struct sk_buff *skb) { - struct _urb *_urb = __get_completed(husb, skb->pkt_type); + struct _urb *_urb = __get_completed(husb, bt_cb(skb)->pkt_type); struct urb *urb; int pipe; @@ -487,7 +487,7 @@ static inline int hci_usb_send_bulk(struct hci_usb *husb, struct sk_buff *skb) _urb = _urb_alloc(0, GFP_ATOMIC); if (!_urb) return -ENOMEM; - _urb->type = skb->pkt_type; + _urb->type = bt_cb(skb)->pkt_type; } urb = &_urb->urb; @@ -505,14 +505,14 @@ static inline int hci_usb_send_bulk(struct hci_usb *husb, struct sk_buff *skb) #ifdef CONFIG_BT_HCIUSB_SCO static inline int hci_usb_send_isoc(struct hci_usb *husb, struct sk_buff *skb) { - struct _urb *_urb = __get_completed(husb, skb->pkt_type); + struct _urb *_urb = __get_completed(husb, bt_cb(skb)->pkt_type); struct urb *urb; if (!_urb) { _urb = _urb_alloc(HCI_MAX_ISOC_FRAMES, GFP_ATOMIC); if (!_urb) return -ENOMEM; - _urb->type = skb->pkt_type; + _urb->type = bt_cb(skb)->pkt_type; } BT_DBG("%s skb %p len %d", husb->hdev->name, skb, skb->len); @@ -601,11 +601,11 @@ static int hci_usb_send_frame(struct sk_buff *skb) if (!test_bit(HCI_RUNNING, &hdev->flags)) return -EBUSY; - BT_DBG("%s type %d len %d", hdev->name, skb->pkt_type, skb->len); + BT_DBG("%s type %d len %d", hdev->name, bt_cb(skb)->pkt_type, skb->len); husb = (struct hci_usb *) hdev->driver_data; - switch (skb->pkt_type) { + switch (bt_cb(skb)->pkt_type) { case HCI_COMMAND_PKT: hdev->stat.cmd_tx++; break; @@ -627,7 +627,7 @@ static int hci_usb_send_frame(struct sk_buff *skb) read_lock(&husb->completion_lock); - skb_queue_tail(__transmit_q(husb, skb->pkt_type), skb); + skb_queue_tail(__transmit_q(husb, bt_cb(skb)->pkt_type), skb); hci_usb_tx_wakeup(husb); read_unlock(&husb->completion_lock); @@ -682,7 +682,7 @@ static inline int __recv_frame(struct hci_usb *husb, int type, void *data, int c return -ENOMEM; } skb->dev = (void *) husb->hdev; - skb->pkt_type = type; + bt_cb(skb)->pkt_type = type; __reassembly(husb, type) = skb; @@ -702,6 +702,7 @@ static inline int __recv_frame(struct hci_usb *husb, int type, void *data, int c if (!scb->expect) { /* Complete frame */ __reassembly(husb, type) = NULL; + bt_cb(skb)->pkt_type = type; hci_recv_frame(skb); } diff --git a/drivers/bluetooth/hci_vhci.c b/drivers/bluetooth/hci_vhci.c index f9b956fb2b8..52cbd45c308 100644 --- a/drivers/bluetooth/hci_vhci.c +++ b/drivers/bluetooth/hci_vhci.c @@ -1,229 +1,220 @@ -/* - BlueZ - Bluetooth protocol stack for Linux - Copyright (C) 2000-2001 Qualcomm Incorporated - - Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com> - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License version 2 as - published by the Free Software Foundation; - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. - IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY - CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES - WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - - ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, - COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS - SOFTWARE IS DISCLAIMED. -*/ - /* - * Bluetooth HCI virtual device driver. * - * $Id: hci_vhci.c,v 1.3 2002/04/17 17:37:20 maxk Exp $ + * Bluetooth virtual HCI driver + * + * Copyright (C) 2000-2001 Qualcomm Incorporated + * Copyright (C) 2002-2003 Maxim Krasnyansky <maxk@qualcomm.com> + * Copyright (C) 2004-2005 Marcel Holtmann <marcel@holtmann.org> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * */ -#define VERSION "1.1" #include <linux/config.h> #include <linux/module.h> -#include <linux/errno.h> #include <linux/kernel.h> -#include <linux/major.h> -#include <linux/sched.h> +#include <linux/init.h> #include <linux/slab.h> +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/sched.h> #include <linux/poll.h> -#include <linux/fcntl.h> -#include <linux/init.h> -#include <linux/random.h> #include <linux/skbuff.h> #include <linux/miscdevice.h> -#include <asm/system.h> -#include <asm/uaccess.h> - #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> -#include "hci_vhci.h" -/* HCI device part */ +#ifndef CONFIG_BT_HCIVHCI_DEBUG +#undef BT_DBG +#define BT_DBG(D...) +#endif + +#define VERSION "1.2" + +static int minor = MISC_DYNAMIC_MINOR; + +struct vhci_data { + struct hci_dev *hdev; + + unsigned long flags; + + wait_queue_head_t read_wait; + struct sk_buff_head readq; + + struct fasync_struct *fasync; +}; -static int hci_vhci_open(struct hci_dev *hdev) +#define VHCI_FASYNC 0x0010 + +static struct miscdevice vhci_miscdev; + +static int vhci_open_dev(struct hci_dev *hdev) { set_bit(HCI_RUNNING, &hdev->flags); - return 0; -} -static int hci_vhci_flush(struct hci_dev *hdev) -{ - struct hci_vhci_struct *hci_vhci = (struct hci_vhci_struct *) hdev->driver_data; - skb_queue_purge(&hci_vhci->readq); return 0; } -static int hci_vhci_close(struct hci_dev *hdev) +static int vhci_close_dev(struct hci_dev *hdev) { + struct vhci_data *vhci = hdev->driver_data; + if (!test_and_clear_bit(HCI_RUNNING, &hdev->flags)) return 0; - hci_vhci_flush(hdev); + skb_queue_purge(&vhci->readq); + return 0; } -static void hci_vhci_destruct(struct hci_dev *hdev) +static int vhci_flush(struct hci_dev *hdev) { - struct hci_vhci_struct *vhci; + struct vhci_data *vhci = hdev->driver_data; - if (!hdev) return; + skb_queue_purge(&vhci->readq); - vhci = (struct hci_vhci_struct *) hdev->driver_data; - kfree(vhci); + return 0; } -static int hci_vhci_send_frame(struct sk_buff *skb) +static int vhci_send_frame(struct sk_buff *skb) { struct hci_dev* hdev = (struct hci_dev *) skb->dev; - struct hci_vhci_struct *hci_vhci; + struct vhci_data *vhci; if (!hdev) { - BT_ERR("Frame for uknown device (hdev=NULL)"); + BT_ERR("Frame for unknown HCI device (hdev=NULL)"); return -ENODEV; } if (!test_bit(HCI_RUNNING, &hdev->flags)) return -EBUSY; - hci_vhci = (struct hci_vhci_struct *) hdev->driver_data; + vhci = hdev->driver_data; + + memcpy(skb_push(skb, 1), &bt_cb(skb)->pkt_type, 1); + skb_queue_tail(&vhci->readq, skb); - memcpy(skb_push(skb, 1), &skb->pkt_type, 1); - skb_queue_tail(&hci_vhci->readq, skb); + if (vhci->flags & VHCI_FASYNC) + kill_fasync(&vhci->fasync, SIGIO, POLL_IN); - if (hci_vhci->flags & VHCI_FASYNC) - kill_fasync(&hci_vhci->fasync, SIGIO, POLL_IN); - wake_up_interruptible(&hci_vhci->read_wait); + wake_up_interruptible(&vhci->read_wait); return 0; } -/* Character device part */ - -/* Poll */ -static unsigned int hci_vhci_chr_poll(struct file *file, poll_table * wait) -{ - struct hci_vhci_struct *hci_vhci = (struct hci_vhci_struct *) file->private_data; - - poll_wait(file, &hci_vhci->read_wait, wait); - - if (!skb_queue_empty(&hci_vhci->readq)) - return POLLIN | POLLRDNORM; - - return POLLOUT | POLLWRNORM; +static void vhci_destruct(struct hci_dev *hdev) +{ + kfree(hdev->driver_data); } -/* Get packet from user space buffer(already verified) */ -static inline ssize_t hci_vhci_get_user(struct hci_vhci_struct *hci_vhci, const char __user *buf, size_t count) +static inline ssize_t vhci_get_user(struct vhci_data *vhci, + const char __user *buf, size_t count) { struct sk_buff *skb; if (count > HCI_MAX_FRAME_SIZE) return -EINVAL; - if (!(skb = bt_skb_alloc(count, GFP_KERNEL))) + skb = bt_skb_alloc(count, GFP_KERNEL); + if (!skb) return -ENOMEM; - + if (copy_from_user(skb_put(skb, count), buf, count)) { kfree_skb(skb); return -EFAULT; } - skb->dev = (void *) hci_vhci->hdev; - skb->pkt_type = *((__u8 *) skb->data); + skb->dev = (void *) vhci->hdev; + bt_cb(skb)->pkt_type = *((__u8 *) skb->data); skb_pull(skb, 1); hci_recv_frame(skb); return count; -} - -/* Write */ -static ssize_t hci_vhci_chr_write(struct file * file, const char __user * buf, - size_t count, loff_t *pos) -{ - struct hci_vhci_struct *hci_vhci = (struct hci_vhci_struct *) file->private_data; - - if (!access_ok(VERIFY_READ, buf, count)) - return -EFAULT; - - return hci_vhci_get_user(hci_vhci, buf, count); } -/* Put packet to user space buffer(already verified) */ -static inline ssize_t hci_vhci_put_user(struct hci_vhci_struct *hci_vhci, - struct sk_buff *skb, char __user *buf, - int count) +static inline ssize_t vhci_put_user(struct vhci_data *vhci, + struct sk_buff *skb, char __user *buf, int count) { - int len = count, total = 0; char __user *ptr = buf; + int len, total = 0; + + len = min_t(unsigned int, skb->len, count); - len = min_t(unsigned int, skb->len, len); if (copy_to_user(ptr, skb->data, len)) return -EFAULT; + total += len; - hci_vhci->hdev->stat.byte_tx += len; - switch (skb->pkt_type) { - case HCI_COMMAND_PKT: - hci_vhci->hdev->stat.cmd_tx++; - break; + vhci->hdev->stat.byte_tx += len; - case HCI_ACLDATA_PKT: - hci_vhci->hdev->stat.acl_tx++; - break; + switch (bt_cb(skb)->pkt_type) { + case HCI_COMMAND_PKT: + vhci->hdev->stat.cmd_tx++; + break; + + case HCI_ACLDATA_PKT: + vhci->hdev->stat.acl_tx++; + break; - case HCI_SCODATA_PKT: - hci_vhci->hdev->stat.cmd_tx++; - break; + case HCI_SCODATA_PKT: + vhci->hdev->stat.cmd_tx++; + break; }; return total; } -/* Read */ -static ssize_t hci_vhci_chr_read(struct file * file, char __user * buf, size_t count, loff_t *pos) +static loff_t vhci_llseek(struct file * file, loff_t offset, int origin) +{ + return -ESPIPE; +} + +static ssize_t vhci_read(struct file * file, char __user * buf, size_t count, loff_t *pos) { - struct hci_vhci_struct *hci_vhci = (struct hci_vhci_struct *) file->private_data; DECLARE_WAITQUEUE(wait, current); + struct vhci_data *vhci = file->private_data; struct sk_buff *skb; ssize_t ret = 0; - add_wait_queue(&hci_vhci->read_wait, &wait); + add_wait_queue(&vhci->read_wait, &wait); while (count) { set_current_state(TASK_INTERRUPTIBLE); - /* Read frames from device queue */ - if (!(skb = skb_dequeue(&hci_vhci->readq))) { + skb = skb_dequeue(&vhci->readq); + if (!skb) { if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; break; } + if (signal_pending(current)) { ret = -ERESTARTSYS; break; } - /* Nothing to read, let's sleep */ schedule(); continue; } if (access_ok(VERIFY_WRITE, buf, count)) - ret = hci_vhci_put_user(hci_vhci, skb, buf, count); + ret = vhci_put_user(vhci, skb, buf, count); else ret = -EFAULT; @@ -231,84 +222,90 @@ static ssize_t hci_vhci_chr_read(struct file * file, char __user * buf, size_t c break; } set_current_state(TASK_RUNNING); - remove_wait_queue(&hci_vhci->read_wait, &wait); + remove_wait_queue(&vhci->read_wait, &wait); return ret; } -static loff_t hci_vhci_chr_lseek(struct file * file, loff_t offset, int origin) +static ssize_t vhci_write(struct file *file, + const char __user *buf, size_t count, loff_t *pos) { - return -ESPIPE; -} + struct vhci_data *vhci = file->private_data; -static int hci_vhci_chr_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) -{ - return -EINVAL; + if (!access_ok(VERIFY_READ, buf, count)) + return -EFAULT; + + return vhci_get_user(vhci, buf, count); } -static int hci_vhci_chr_fasync(int fd, struct file *file, int on) +static unsigned int vhci_poll(struct file *file, poll_table *wait) { - struct hci_vhci_struct *hci_vhci = (struct hci_vhci_struct *) file->private_data; - int ret; + struct vhci_data *vhci = file->private_data; - if ((ret = fasync_helper(fd, file, on, &hci_vhci->fasync)) < 0) - return ret; - - if (on) - hci_vhci->flags |= VHCI_FASYNC; - else - hci_vhci->flags &= ~VHCI_FASYNC; + poll_wait(file, &vhci->read_wait, wait); - return 0; + if (!skb_queue_empty(&vhci->readq)) + return POLLIN | POLLRDNORM; + + return POLLOUT | POLLWRNORM; } -static int hci_vhci_chr_open(struct inode *inode, struct file * file) +static int vhci_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) { - struct hci_vhci_struct *hci_vhci = NULL; + return -EINVAL; +} + +static int vhci_open(struct inode *inode, struct file *file) +{ + struct vhci_data *vhci; struct hci_dev *hdev; - if (!(hci_vhci = kmalloc(sizeof(struct hci_vhci_struct), GFP_KERNEL))) + vhci = kmalloc(sizeof(struct vhci_data), GFP_KERNEL); + if (!vhci) return -ENOMEM; - memset(hci_vhci, 0, sizeof(struct hci_vhci_struct)); + memset(vhci, 0, sizeof(struct vhci_data)); - skb_queue_head_init(&hci_vhci->readq); - init_waitqueue_head(&hci_vhci->read_wait); + skb_queue_head_init(&vhci->readq); + init_waitqueue_head(&vhci->read_wait); - /* Initialize and register HCI device */ hdev = hci_alloc_dev(); if (!hdev) { - kfree(hci_vhci); + kfree(vhci); return -ENOMEM; } - hci_vhci->hdev = hdev; + vhci->hdev = hdev; hdev->type = HCI_VHCI; - hdev->driver_data = hci_vhci; + hdev->driver_data = vhci; + SET_HCIDEV_DEV(hdev, vhci_miscdev.dev); - hdev->open = hci_vhci_open; - hdev->close = hci_vhci_close; - hdev->flush = hci_vhci_flush; - hdev->send = hci_vhci_send_frame; - hdev->destruct = hci_vhci_destruct; + hdev->open = vhci_open_dev; + hdev->close = vhci_close_dev; + hdev->flush = vhci_flush; + hdev->send = vhci_send_frame; + hdev->destruct = vhci_destruct; hdev->owner = THIS_MODULE; - + if (hci_register_dev(hdev) < 0) { - kfree(hci_vhci); + BT_ERR("Can't register HCI device"); + kfree(vhci); hci_free_dev(hdev); return -EBUSY; } - file->private_data = hci_vhci; - return nonseekable_open(inode, file); + file->private_data = vhci; + + return nonseekable_open(inode, file); } -static int hci_vhci_chr_close(struct inode *inode, struct file *file) +static int vhci_release(struct inode *inode, struct file *file) { - struct hci_vhci_struct *hci_vhci = (struct hci_vhci_struct *) file->private_data; - struct hci_dev *hdev = hci_vhci->hdev; + struct vhci_data *vhci = file->private_data; + struct hci_dev *hdev = vhci->hdev; if (hci_unregister_dev(hdev) < 0) { BT_ERR("Can't unregister HCI device %s", hdev->name); @@ -317,48 +314,71 @@ static int hci_vhci_chr_close(struct inode *inode, struct file *file) hci_free_dev(hdev); file->private_data = NULL; + return 0; } -static struct file_operations hci_vhci_fops = { - .owner = THIS_MODULE, - .llseek = hci_vhci_chr_lseek, - .read = hci_vhci_chr_read, - .write = hci_vhci_chr_write, - .poll = hci_vhci_chr_poll, - .ioctl = hci_vhci_chr_ioctl, - .open = hci_vhci_chr_open, - .release = hci_vhci_chr_close, - .fasync = hci_vhci_chr_fasync +static int vhci_fasync(int fd, struct file *file, int on) +{ + struct vhci_data *vhci = file->private_data; + int err; + + err = fasync_helper(fd, file, on, &vhci->fasync); + if (err < 0) + return err; + + if (on) + vhci->flags |= VHCI_FASYNC; + else + vhci->flags &= ~VHCI_FASYNC; + + return 0; +} + +static struct file_operations vhci_fops = { + .owner = THIS_MODULE, + .llseek = vhci_llseek, + .read = vhci_read, + .write = vhci_write, + .poll = vhci_poll, + .ioctl = vhci_ioctl, + .open = vhci_open, + .release = vhci_release, + .fasync = vhci_fasync, }; -static struct miscdevice hci_vhci_miscdev= -{ - VHCI_MINOR, - "hci_vhci", - &hci_vhci_fops +static struct miscdevice vhci_miscdev= { + .name = "vhci", + .fops = &vhci_fops, }; -static int __init hci_vhci_init(void) +static int __init vhci_init(void) { - BT_INFO("VHCI driver ver %s", VERSION); + BT_INFO("Virtual HCI driver ver %s", VERSION); - if (misc_register(&hci_vhci_miscdev)) { - BT_ERR("Can't register misc device %d\n", VHCI_MINOR); + vhci_miscdev.minor = minor; + + if (misc_register(&vhci_miscdev) < 0) { + BT_ERR("Can't register misc device with minor %d", minor); return -EIO; } return 0; } -static void hci_vhci_cleanup(void) +static void __exit vhci_exit(void) { - misc_deregister(&hci_vhci_miscdev); + if (misc_deregister(&vhci_miscdev) < 0) + BT_ERR("Can't unregister misc device with minor %d", minor); } -module_init(hci_vhci_init); -module_exit(hci_vhci_cleanup); +module_init(vhci_init); +module_exit(vhci_exit); + +module_param(minor, int, 0444); +MODULE_PARM_DESC(minor, "Miscellaneous minor device number"); -MODULE_AUTHOR("Maxim Krasnyansky <maxk@qualcomm.com>"); -MODULE_DESCRIPTION("Bluetooth VHCI driver ver " VERSION); -MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Maxim Krasnyansky <maxk@qualcomm.com>, Marcel Holtmann <marcel@holtmann.org>"); +MODULE_DESCRIPTION("Bluetooth virtual HCI driver ver " VERSION); +MODULE_VERSION(VERSION); +MODULE_LICENSE("GPL"); diff --git a/drivers/bluetooth/hci_vhci.h b/drivers/bluetooth/hci_vhci.h deleted file mode 100644 index 53b11f9ef76..00000000000 --- a/drivers/bluetooth/hci_vhci.h +++ /dev/null @@ -1,50 +0,0 @@ -/* - BlueZ - Bluetooth protocol stack for Linux - Copyright (C) 2000-2001 Qualcomm Incorporated - - Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com> - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License version 2 as - published by the Free Software Foundation; - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. - IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY - CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES - WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - - ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, - COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS - SOFTWARE IS DISCLAIMED. -*/ - -/* - * $Id: hci_vhci.h,v 1.1.1.1 2002/03/08 21:03:15 maxk Exp $ - */ - -#ifndef __HCI_VHCI_H -#define __HCI_VHCI_H - -#ifdef __KERNEL__ - -struct hci_vhci_struct { - struct hci_dev *hdev; - __u32 flags; - wait_queue_head_t read_wait; - struct sk_buff_head readq; - struct fasync_struct *fasync; -}; - -/* VHCI device flags */ -#define VHCI_FASYNC 0x0010 - -#endif /* __KERNEL__ */ - -#define VHCI_DEV "/dev/vhci" -#define VHCI_MINOR 250 - -#endif /* __HCI_VHCI_H */ diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c index 38dd9ffbe8b..0829db58462 100644 --- a/drivers/cdrom/viocd.c +++ b/drivers/cdrom/viocd.c @@ -734,7 +734,7 @@ static int viocd_remove(struct vio_dev *vdev) */ static struct vio_device_id viocd_device_table[] __devinitdata = { { "viocd", "" }, - { 0, } + { "", "" } }; MODULE_DEVICE_TABLE(vio, viocd_device_table); diff --git a/drivers/char/hvc_vio.c b/drivers/char/hvc_vio.c index 60bb9152b83..78d681dc35a 100644 --- a/drivers/char/hvc_vio.c +++ b/drivers/char/hvc_vio.c @@ -39,7 +39,7 @@ char hvc_driver_name[] = "hvc_console"; static struct vio_device_id hvc_driver_table[] __devinitdata = { {"serial", "hvterm1"}, - { NULL, } + { "", "" } }; MODULE_DEVICE_TABLE(vio, hvc_driver_table); diff --git a/drivers/char/hvcs.c b/drivers/char/hvcs.c index 3236d240490..f47f009f925 100644 --- a/drivers/char/hvcs.c +++ b/drivers/char/hvcs.c @@ -527,7 +527,7 @@ static int khvcsd(void *unused) static struct vio_device_id hvcs_driver_table[] __devinitdata= { {"serial-server", "hvterm2"}, - { NULL, } + { "", "" } }; MODULE_DEVICE_TABLE(vio, hvcs_driver_table); diff --git a/drivers/char/random.c b/drivers/char/random.c index 6b11d6b2129..7999da25fe4 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1589,6 +1589,40 @@ u32 secure_tcpv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr, __u16 dp EXPORT_SYMBOL(secure_tcpv6_port_ephemeral); #endif +#if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE) +/* Similar to secure_tcp_sequence_number but generate a 48 bit value + * bit's 32-47 increase every key exchange + * 0-31 hash(source, dest) + */ +u64 secure_dccp_sequence_number(__u32 saddr, __u32 daddr, + __u16 sport, __u16 dport) +{ + struct timeval tv; + u64 seq; + __u32 hash[4]; + struct keydata *keyptr = get_keyptr(); + + hash[0] = saddr; + hash[1] = daddr; + hash[2] = (sport << 16) + dport; + hash[3] = keyptr->secret[11]; + + seq = half_md4_transform(hash, keyptr->secret); + seq |= ((u64)keyptr->count) << (32 - HASH_BITS); + + do_gettimeofday(&tv); + seq += tv.tv_usec + tv.tv_sec * 1000000; + seq &= (1ull << 48) - 1; +#if 0 + printk("dccp init_seq(%lx, %lx, %d, %d) = %d\n", + saddr, daddr, sport, dport, seq); +#endif + return seq; +} + +EXPORT_SYMBOL(secure_dccp_sequence_number); +#endif + #endif /* CONFIG_INET */ diff --git a/drivers/char/viotape.c b/drivers/char/viotape.c index 4764b4f9555..0aff45fac2e 100644 --- a/drivers/char/viotape.c +++ b/drivers/char/viotape.c @@ -991,7 +991,7 @@ static int viotape_remove(struct vio_dev *vdev) */ static struct vio_device_id viotape_device_table[] __devinitdata = { { "viotape", "" }, - { 0, } + { "", "" } }; MODULE_DEVICE_TABLE(vio, viotape_device_table); diff --git a/drivers/ieee1394/ieee1394_core.c b/drivers/ieee1394/ieee1394_core.c index b248d89de8b..d633770fac8 100644 --- a/drivers/ieee1394/ieee1394_core.c +++ b/drivers/ieee1394/ieee1394_core.c @@ -681,7 +681,7 @@ static void handle_packet_response(struct hpsb_host *host, int tcode, return; } - __skb_unlink(skb, skb->list); + __skb_unlink(skb, &host->pending_packet_queue); if (packet->state == hpsb_queued) { packet->sendtime = jiffies; @@ -989,7 +989,7 @@ void abort_timedouts(unsigned long __opaque) packet = (struct hpsb_packet *)skb->data; if (time_before(packet->sendtime + expire, jiffies)) { - __skb_unlink(skb, skb->list); + __skb_unlink(skb, &host->pending_packet_queue); packet->state = hpsb_complete; packet->ack_code = ACKX_TIMEOUT; queue_packet_complete(packet); diff --git a/drivers/isdn/act2000/capi.c b/drivers/isdn/act2000/capi.c index afa46681f98..6ae6eb32211 100644 --- a/drivers/isdn/act2000/capi.c +++ b/drivers/isdn/act2000/capi.c @@ -606,7 +606,7 @@ handle_ack(act2000_card *card, act2000_chan *chan, __u8 blocknr) { if ((((m->msg.data_b3_req.fakencci >> 8) & 0xff) == chan->ncci) && (m->msg.data_b3_req.blocknr == blocknr)) { /* found corresponding DATA_B3_REQ */ - skb_unlink(tmp); + skb_unlink(tmp, &card->ackq); chan->queued -= m->msg.data_b3_req.datalen; if (m->msg.data_b3_req.flags) ret = m->msg.data_b3_req.datalen; diff --git a/drivers/isdn/i4l/isdn_net.c b/drivers/isdn/i4l/isdn_net.c index f30e8e63ae0..96c115e1338 100644 --- a/drivers/isdn/i4l/isdn_net.c +++ b/drivers/isdn/i4l/isdn_net.c @@ -1786,7 +1786,6 @@ isdn_net_receive(struct net_device *ndev, struct sk_buff *skb) lp->stats.rx_bytes += skb->len; } skb->dev = ndev; - skb->input_dev = ndev; skb->pkt_type = PACKET_HOST; skb->mac.raw = skb->data; #ifdef ISDN_DEBUG_NET_DUMP diff --git a/drivers/isdn/i4l/isdn_ppp.c b/drivers/isdn/i4l/isdn_ppp.c index 260a323a96d..d97a9be5469 100644 --- a/drivers/isdn/i4l/isdn_ppp.c +++ b/drivers/isdn/i4l/isdn_ppp.c @@ -1177,7 +1177,6 @@ isdn_ppp_push_higher(isdn_net_dev * net_dev, isdn_net_local * lp, struct sk_buff mlp->huptimer = 0; #endif /* CONFIG_IPPP_FILTER */ skb->dev = dev; - skb->input_dev = dev; skb->mac.raw = skb->data; netif_rx(skb); /* net_dev->local->stats.rx_packets++; done in isdn_net.c */ diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c index 8acc655ec1e..7babf6af4e2 100644 --- a/drivers/net/bnx2.c +++ b/drivers/net/bnx2.c @@ -14,8 +14,8 @@ #define DRV_MODULE_NAME "bnx2" #define PFX DRV_MODULE_NAME ": " -#define DRV_MODULE_VERSION "1.2.19" -#define DRV_MODULE_RELDATE "May 23, 2005" +#define DRV_MODULE_VERSION "1.2.20" +#define DRV_MODULE_RELDATE "August 22, 2005" #define RUN_AT(x) (jiffies + (x)) @@ -52,7 +52,6 @@ static struct { { "HP NC370i Multifunction Gigabit Server Adapter" }, { "Broadcom NetXtreme II BCM5706 1000Base-SX" }, { "HP NC370F Multifunction Gigabit Server Adapter" }, - { 0 }, }; static struct pci_device_id bnx2_pci_tbl[] = { @@ -108,6 +107,15 @@ static struct flash_spec flash_table[] = MODULE_DEVICE_TABLE(pci, bnx2_pci_tbl); +static inline u32 bnx2_tx_avail(struct bnx2 *bp) +{ + u32 diff = TX_RING_IDX(bp->tx_prod) - TX_RING_IDX(bp->tx_cons); + + if (diff > MAX_TX_DESC_CNT) + diff = (diff & MAX_TX_DESC_CNT) - 1; + return (bp->tx_ring_size - diff); +} + static u32 bnx2_reg_rd_ind(struct bnx2 *bp, u32 offset) { @@ -807,7 +815,19 @@ bnx2_setup_serdes_phy(struct bnx2 *bp) bnx2_write_phy(bp, MII_ADVERTISE, new_adv); bnx2_write_phy(bp, MII_BMCR, bmcr | BMCR_ANRESTART | BMCR_ANENABLE); - bp->serdes_an_pending = SERDES_AN_TIMEOUT / bp->timer_interval; + if (CHIP_NUM(bp) == CHIP_NUM_5706) { + /* Speed up link-up time when the link partner + * does not autonegotiate which is very common + * in blade servers. Some blade servers use + * IPMI for kerboard input and it's important + * to minimize link disruptions. Autoneg. involves + * exchanging base pages plus 3 next pages and + * normally completes in about 120 msec. + */ + bp->current_interval = SERDES_AN_TIMEOUT; + bp->serdes_an_pending = 1; + mod_timer(&bp->timer, jiffies + bp->current_interval); + } } return 0; @@ -1327,22 +1347,17 @@ bnx2_tx_int(struct bnx2 *bp) } } - atomic_add(tx_free_bd, &bp->tx_avail_bd); + bp->tx_cons = sw_cons; if (unlikely(netif_queue_stopped(bp->dev))) { - unsigned long flags; - - spin_lock_irqsave(&bp->tx_lock, flags); + spin_lock(&bp->tx_lock); if ((netif_queue_stopped(bp->dev)) && - (atomic_read(&bp->tx_avail_bd) > MAX_SKB_FRAGS)) { + (bnx2_tx_avail(bp) > MAX_SKB_FRAGS)) { netif_wake_queue(bp->dev); } - spin_unlock_irqrestore(&bp->tx_lock, flags); + spin_unlock(&bp->tx_lock); } - - bp->tx_cons = sw_cons; - } static inline void @@ -1523,15 +1538,12 @@ bnx2_msi(int irq, void *dev_instance, struct pt_regs *regs) BNX2_PCICFG_INT_ACK_CMD_MASK_INT); /* Return here if interrupt is disabled. */ - if (unlikely(atomic_read(&bp->intr_sem) != 0)) { - return IRQ_RETVAL(1); - } + if (unlikely(atomic_read(&bp->intr_sem) != 0)) + return IRQ_HANDLED; - if (netif_rx_schedule_prep(dev)) { - __netif_rx_schedule(dev); - } + netif_rx_schedule(dev); - return IRQ_RETVAL(1); + return IRQ_HANDLED; } static irqreturn_t @@ -1549,22 +1561,19 @@ bnx2_interrupt(int irq, void *dev_instance, struct pt_regs *regs) if ((bp->status_blk->status_idx == bp->last_status_idx) || (REG_RD(bp, BNX2_PCICFG_MISC_STATUS) & BNX2_PCICFG_MISC_STATUS_INTA_VALUE)) - return IRQ_RETVAL(0); + return IRQ_NONE; REG_WR(bp, BNX2_PCICFG_INT_ACK_CMD, BNX2_PCICFG_INT_ACK_CMD_USE_INT_HC_PARAM | BNX2_PCICFG_INT_ACK_CMD_MASK_INT); /* Return here if interrupt is shared and is disabled. */ - if (unlikely(atomic_read(&bp->intr_sem) != 0)) { - return IRQ_RETVAL(1); - } + if (unlikely(atomic_read(&bp->intr_sem) != 0)) + return IRQ_HANDLED; - if (netif_rx_schedule_prep(dev)) { - __netif_rx_schedule(dev); - } + netif_rx_schedule(dev); - return IRQ_RETVAL(1); + return IRQ_HANDLED; } static int @@ -1581,11 +1590,9 @@ bnx2_poll(struct net_device *dev, int *budget) (bp->status_blk->status_attn_bits_ack & STATUS_ATTN_BITS_LINK_STATE)) { - unsigned long flags; - - spin_lock_irqsave(&bp->phy_lock, flags); + spin_lock(&bp->phy_lock); bnx2_phy_int(bp); - spin_unlock_irqrestore(&bp->phy_lock, flags); + spin_unlock(&bp->phy_lock); } if (bp->status_blk->status_tx_quick_consumer_index0 != bp->tx_cons) { @@ -1628,9 +1635,8 @@ bnx2_set_rx_mode(struct net_device *dev) struct bnx2 *bp = dev->priv; u32 rx_mode, sort_mode; int i; - unsigned long flags; - spin_lock_irqsave(&bp->phy_lock, flags); + spin_lock_bh(&bp->phy_lock); rx_mode = bp->rx_mode & ~(BNX2_EMAC_RX_MODE_PROMISCUOUS | BNX2_EMAC_RX_MODE_KEEP_VLAN_TAG); @@ -1691,7 +1697,7 @@ bnx2_set_rx_mode(struct net_device *dev) REG_WR(bp, BNX2_RPM_SORT_USER0, sort_mode); REG_WR(bp, BNX2_RPM_SORT_USER0, sort_mode | BNX2_RPM_SORT_USER0_ENA); - spin_unlock_irqrestore(&bp->phy_lock, flags); + spin_unlock_bh(&bp->phy_lock); } static void @@ -2960,7 +2966,6 @@ bnx2_init_tx_ring(struct bnx2 *bp) bp->tx_prod = 0; bp->tx_cons = 0; bp->tx_prod_bseq = 0; - atomic_set(&bp->tx_avail_bd, bp->tx_ring_size); val = BNX2_L2CTX_TYPE_TYPE_L2; val |= BNX2_L2CTX_TYPE_SIZE_L2; @@ -3507,11 +3512,11 @@ bnx2_test_registers(struct bnx2 *bp) rw_mask = reg_tbl[i].rw_mask; ro_mask = reg_tbl[i].ro_mask; - save_val = readl((u8 *) bp->regview + offset); + save_val = readl(bp->regview + offset); - writel(0, (u8 *) bp->regview + offset); + writel(0, bp->regview + offset); - val = readl((u8 *) bp->regview + offset); + val = readl(bp->regview + offset); if ((val & rw_mask) != 0) { goto reg_test_err; } @@ -3520,9 +3525,9 @@ bnx2_test_registers(struct bnx2 *bp) goto reg_test_err; } - writel(0xffffffff, (u8 *) bp->regview + offset); + writel(0xffffffff, bp->regview + offset); - val = readl((u8 *) bp->regview + offset); + val = readl(bp->regview + offset); if ((val & rw_mask) != rw_mask) { goto reg_test_err; } @@ -3531,11 +3536,11 @@ bnx2_test_registers(struct bnx2 *bp) goto reg_test_err; } - writel(save_val, (u8 *) bp->regview + offset); + writel(save_val, bp->regview + offset); continue; reg_test_err: - writel(save_val, (u8 *) bp->regview + offset); + writel(save_val, bp->regview + offset); ret = -ENODEV; break; } @@ -3752,10 +3757,10 @@ bnx2_test_link(struct bnx2 *bp) { u32 bmsr; - spin_lock_irq(&bp->phy_lock); + spin_lock_bh(&bp->phy_lock); bnx2_read_phy(bp, MII_BMSR, &bmsr); bnx2_read_phy(bp, MII_BMSR, &bmsr); - spin_unlock_irq(&bp->phy_lock); + spin_unlock_bh(&bp->phy_lock); if (bmsr & BMSR_LSTATUS) { return 0; @@ -3801,6 +3806,9 @@ bnx2_timer(unsigned long data) struct bnx2 *bp = (struct bnx2 *) data; u32 msg; + if (!netif_running(bp->dev)) + return; + if (atomic_read(&bp->intr_sem) != 0) goto bnx2_restart_timer; @@ -3809,15 +3817,16 @@ bnx2_timer(unsigned long data) if ((bp->phy_flags & PHY_SERDES_FLAG) && (CHIP_NUM(bp) == CHIP_NUM_5706)) { - unsigned long flags; - spin_lock_irqsave(&bp->phy_lock, flags); + spin_lock(&bp->phy_lock); if (bp->serdes_an_pending) { bp->serdes_an_pending--; } else if ((bp->link_up == 0) && (bp->autoneg & AUTONEG_SPEED)) { u32 bmcr; + bp->current_interval = bp->timer_interval; + bnx2_read_phy(bp, MII_BMCR, &bmcr); if (bmcr & BMCR_ANENABLE) { @@ -3860,14 +3869,14 @@ bnx2_timer(unsigned long data) } } + else + bp->current_interval = bp->timer_interval; - spin_unlock_irqrestore(&bp->phy_lock, flags); + spin_unlock(&bp->phy_lock); } bnx2_restart_timer: - bp->timer.expires = RUN_AT(bp->timer_interval); - - add_timer(&bp->timer); + mod_timer(&bp->timer, jiffies + bp->current_interval); } /* Called with rtnl_lock */ @@ -3920,12 +3929,7 @@ bnx2_open(struct net_device *dev) return rc; } - init_timer(&bp->timer); - - bp->timer.expires = RUN_AT(bp->timer_interval); - bp->timer.data = (unsigned long) bp; - bp->timer.function = bnx2_timer; - add_timer(&bp->timer); + mod_timer(&bp->timer, jiffies + bp->current_interval); atomic_set(&bp->intr_sem, 0); @@ -3976,12 +3980,17 @@ bnx2_reset_task(void *data) { struct bnx2 *bp = data; + if (!netif_running(bp->dev)) + return; + + bp->in_reset_task = 1; bnx2_netif_stop(bp); bnx2_init_nic(bp); atomic_set(&bp->intr_sem, 1); bnx2_netif_start(bp); + bp->in_reset_task = 0; } static void @@ -4041,9 +4050,7 @@ bnx2_start_xmit(struct sk_buff *skb, struct net_device *dev) u16 prod, ring_prod; int i; - if (unlikely(atomic_read(&bp->tx_avail_bd) < - (skb_shinfo(skb)->nr_frags + 1))) { - + if (unlikely(bnx2_tx_avail(bp) < (skb_shinfo(skb)->nr_frags + 1))) { netif_stop_queue(dev); printk(KERN_ERR PFX "%s: BUG! Tx ring full when queue awake!\n", dev->name); @@ -4140,8 +4147,6 @@ bnx2_start_xmit(struct sk_buff *skb, struct net_device *dev) prod = NEXT_TX_BD(prod); bp->tx_prod_bseq += skb->len; - atomic_sub(last_frag + 1, &bp->tx_avail_bd); - REG_WR16(bp, MB_TX_CID_ADDR + BNX2_L2CTX_TX_HOST_BIDX, prod); REG_WR(bp, MB_TX_CID_ADDR + BNX2_L2CTX_TX_HOST_BSEQ, bp->tx_prod_bseq); @@ -4150,17 +4155,13 @@ bnx2_start_xmit(struct sk_buff *skb, struct net_device *dev) bp->tx_prod = prod; dev->trans_start = jiffies; - if (unlikely(atomic_read(&bp->tx_avail_bd) <= MAX_SKB_FRAGS)) { - unsigned long flags; - - spin_lock_irqsave(&bp->tx_lock, flags); - if (atomic_read(&bp->tx_avail_bd) <= MAX_SKB_FRAGS) { - netif_stop_queue(dev); - - if (atomic_read(&bp->tx_avail_bd) > MAX_SKB_FRAGS) - netif_wake_queue(dev); - } - spin_unlock_irqrestore(&bp->tx_lock, flags); + if (unlikely(bnx2_tx_avail(bp) <= MAX_SKB_FRAGS)) { + spin_lock(&bp->tx_lock); + netif_stop_queue(dev); + + if (bnx2_tx_avail(bp) > MAX_SKB_FRAGS) + netif_wake_queue(dev); + spin_unlock(&bp->tx_lock); } return NETDEV_TX_OK; @@ -4173,7 +4174,13 @@ bnx2_close(struct net_device *dev) struct bnx2 *bp = dev->priv; u32 reset_code; - flush_scheduled_work(); + /* Calling flush_scheduled_work() may deadlock because + * linkwatch_event() may be on the workqueue and it will try to get + * the rtnl_lock which we are holding. + */ + while (bp->in_reset_task) + msleep(1); + bnx2_netif_stop(bp); del_timer_sync(&bp->timer); if (bp->wol) @@ -4390,11 +4397,11 @@ bnx2_set_settings(struct net_device *dev, struct ethtool_cmd *cmd) bp->req_line_speed = req_line_speed; bp->req_duplex = req_duplex; - spin_lock_irq(&bp->phy_lock); + spin_lock_bh(&bp->phy_lock); bnx2_setup_phy(bp); - spin_unlock_irq(&bp->phy_lock); + spin_unlock_bh(&bp->phy_lock); return 0; } @@ -4464,19 +4471,20 @@ bnx2_nway_reset(struct net_device *dev) return -EINVAL; } - spin_lock_irq(&bp->phy_lock); + spin_lock_bh(&bp->phy_lock); /* Force a link down visible on the other side */ if (bp->phy_flags & PHY_SERDES_FLAG) { bnx2_write_phy(bp, MII_BMCR, BMCR_LOOPBACK); - spin_unlock_irq(&bp->phy_lock); + spin_unlock_bh(&bp->phy_lock); msleep(20); - spin_lock_irq(&bp->phy_lock); + spin_lock_bh(&bp->phy_lock); if (CHIP_NUM(bp) == CHIP_NUM_5706) { - bp->serdes_an_pending = SERDES_AN_TIMEOUT / - bp->timer_interval; + bp->current_interval = SERDES_AN_TIMEOUT; + bp->serdes_an_pending = 1; + mod_timer(&bp->timer, jiffies + bp->current_interval); } } @@ -4484,7 +4492,7 @@ bnx2_nway_reset(struct net_device *dev) bmcr &= ~BMCR_LOOPBACK; bnx2_write_phy(bp, MII_BMCR, bmcr | BMCR_ANRESTART | BMCR_ANENABLE); - spin_unlock_irq(&bp->phy_lock); + spin_unlock_bh(&bp->phy_lock); return 0; } @@ -4670,11 +4678,11 @@ bnx2_set_pauseparam(struct net_device *dev, struct ethtool_pauseparam *epause) bp->autoneg &= ~AUTONEG_FLOW_CTRL; } - spin_lock_irq(&bp->phy_lock); + spin_lock_bh(&bp->phy_lock); bnx2_setup_phy(bp); - spin_unlock_irq(&bp->phy_lock); + spin_unlock_bh(&bp->phy_lock); return 0; } @@ -4698,7 +4706,7 @@ bnx2_set_rx_csum(struct net_device *dev, u32 data) #define BNX2_NUM_STATS 45 -struct { +static struct { char string[ETH_GSTRING_LEN]; } bnx2_stats_str_arr[BNX2_NUM_STATS] = { { "rx_bytes" }, @@ -4750,7 +4758,7 @@ struct { #define STATS_OFFSET32(offset_name) (offsetof(struct statistics_block, offset_name) / 4) -unsigned long bnx2_stats_offset_arr[BNX2_NUM_STATS] = { +static unsigned long bnx2_stats_offset_arr[BNX2_NUM_STATS] = { STATS_OFFSET32(stat_IfHCInOctets_hi), STATS_OFFSET32(stat_IfHCInBadOctets_hi), STATS_OFFSET32(stat_IfHCOutOctets_hi), @@ -4801,7 +4809,7 @@ unsigned long bnx2_stats_offset_arr[BNX2_NUM_STATS] = { /* stat_IfHCInBadOctets and stat_Dot3StatsCarrierSenseErrors are * skipped because of errata. */ -u8 bnx2_5706_stats_len_arr[BNX2_NUM_STATS] = { +static u8 bnx2_5706_stats_len_arr[BNX2_NUM_STATS] = { 8,0,8,8,8,8,8,8,8,8, 4,0,4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4,4,4, @@ -4811,7 +4819,7 @@ u8 bnx2_5706_stats_len_arr[BNX2_NUM_STATS] = { #define BNX2_NUM_TESTS 6 -struct { +static struct { char string[ETH_GSTRING_LEN]; } bnx2_tests_str_arr[BNX2_NUM_TESTS] = { { "register_test (offline)" }, @@ -4910,7 +4918,7 @@ bnx2_get_ethtool_stats(struct net_device *dev, struct bnx2 *bp = dev->priv; int i; u32 *hw_stats = (u32 *) bp->stats_blk; - u8 *stats_len_arr = 0; + u8 *stats_len_arr = NULL; if (hw_stats == NULL) { memset(buf, 0, sizeof(u64) * BNX2_NUM_STATS); @@ -5012,7 +5020,7 @@ static struct ethtool_ops bnx2_ethtool_ops = { static int bnx2_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { - struct mii_ioctl_data *data = (struct mii_ioctl_data *)&ifr->ifr_data; + struct mii_ioctl_data *data = if_mii(ifr); struct bnx2 *bp = dev->priv; int err; @@ -5024,9 +5032,9 @@ bnx2_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) case SIOCGMIIREG: { u32 mii_regval; - spin_lock_irq(&bp->phy_lock); + spin_lock_bh(&bp->phy_lock); err = bnx2_read_phy(bp, data->reg_num & 0x1f, &mii_regval); - spin_unlock_irq(&bp->phy_lock); + spin_unlock_bh(&bp->phy_lock); data->val_out = mii_regval; @@ -5037,9 +5045,9 @@ bnx2_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (!capable(CAP_NET_ADMIN)) return -EPERM; - spin_lock_irq(&bp->phy_lock); + spin_lock_bh(&bp->phy_lock); err = bnx2_write_phy(bp, data->reg_num & 0x1f, data->val_in); - spin_unlock_irq(&bp->phy_lock); + spin_unlock_bh(&bp->phy_lock); return err; @@ -5057,6 +5065,9 @@ bnx2_change_mac_addr(struct net_device *dev, void *p) struct sockaddr *addr = p; struct bnx2 *bp = dev->priv; + if (!is_valid_ether_addr(addr->sa_data)) + return -EINVAL; + memcpy(dev->dev_addr, addr->sa_data, dev->addr_len); if (netif_running(dev)) bnx2_set_mac_addr(bp); @@ -5305,6 +5316,7 @@ bnx2_init_board(struct pci_dev *pdev, struct net_device *dev) bp->stats_ticks = 1000000 & 0xffff00; bp->timer_interval = HZ; + bp->current_interval = HZ; /* Disable WOL support if we are running on a SERDES chip. */ if (CHIP_BOND_ID(bp) & CHIP_BOND_ID_SERDES_BIT) { @@ -5328,6 +5340,15 @@ bnx2_init_board(struct pci_dev *pdev, struct net_device *dev) bp->req_line_speed = 0; if (bp->phy_flags & PHY_SERDES_FLAG) { bp->advertising = ETHTOOL_ALL_FIBRE_SPEED | ADVERTISED_Autoneg; + + reg = REG_RD_IND(bp, HOST_VIEW_SHMEM_BASE + + BNX2_PORT_HW_CFG_CONFIG); + reg &= BNX2_PORT_HW_CFG_CFG_DFLT_LINK_MASK; + if (reg == BNX2_PORT_HW_CFG_CFG_DFLT_LINK_1G) { + bp->autoneg = 0; + bp->req_line_speed = bp->line_speed = SPEED_1000; + bp->req_duplex = DUPLEX_FULL; + } } else { bp->advertising = ETHTOOL_ALL_COPPER_SPEED | ADVERTISED_Autoneg; @@ -5335,11 +5356,17 @@ bnx2_init_board(struct pci_dev *pdev, struct net_device *dev) bp->req_flow_ctrl = FLOW_CTRL_RX | FLOW_CTRL_TX; + init_timer(&bp->timer); + bp->timer.expires = RUN_AT(bp->timer_interval); + bp->timer.data = (unsigned long) bp; + bp->timer.function = bnx2_timer; + return 0; err_out_unmap: if (bp->regview) { iounmap(bp->regview); + bp->regview = NULL; } err_out_release: @@ -5454,6 +5481,8 @@ bnx2_remove_one(struct pci_dev *pdev) struct net_device *dev = pci_get_drvdata(pdev); struct bnx2 *bp = dev->priv; + flush_scheduled_work(); + unregister_netdev(dev); if (bp->regview) @@ -5505,12 +5534,12 @@ bnx2_resume(struct pci_dev *pdev) } static struct pci_driver bnx2_pci_driver = { - name: DRV_MODULE_NAME, - id_table: bnx2_pci_tbl, - probe: bnx2_init_one, - remove: __devexit_p(bnx2_remove_one), - suspend: bnx2_suspend, - resume: bnx2_resume, + .name = DRV_MODULE_NAME, + .id_table = bnx2_pci_tbl, + .probe = bnx2_init_one, + .remove = __devexit_p(bnx2_remove_one), + .suspend = bnx2_suspend, + .resume = bnx2_resume, }; static int __init bnx2_init(void) diff --git a/drivers/net/bnx2.h b/drivers/net/bnx2.h index 8214a2853d0..9ad3f5740cd 100644 --- a/drivers/net/bnx2.h +++ b/drivers/net/bnx2.h @@ -3841,12 +3841,12 @@ struct bnx2 { struct status_block *status_blk; u32 last_status_idx; - atomic_t tx_avail_bd; struct tx_bd *tx_desc_ring; struct sw_bd *tx_buf_ring; u32 tx_prod_bseq; u16 tx_prod; u16 tx_cons; + int tx_ring_size; #ifdef BCM_VLAN struct vlan_group *vlgrp; @@ -3872,8 +3872,10 @@ struct bnx2 { char *name; int timer_interval; + int current_interval; struct timer_list timer; struct work_struct reset_task; + int in_reset_task; /* Used to synchronize phy accesses. */ spinlock_t phy_lock; @@ -3927,7 +3929,6 @@ struct bnx2 { u16 fw_wr_seq; u16 fw_drv_pulse_wr_seq; - int tx_ring_size; dma_addr_t tx_desc_mapping; @@ -3985,7 +3986,7 @@ struct bnx2 { #define PHY_LOOPBACK 2 u8 serdes_an_pending; -#define SERDES_AN_TIMEOUT (2 * HZ) +#define SERDES_AN_TIMEOUT (HZ / 3) u8 mac_addr[8]; @@ -4171,6 +4172,9 @@ struct fw_info { #define BNX2_PORT_HW_CFG_MAC_LOWER 0x00000054 #define BNX2_PORT_HW_CFG_CONFIG 0x00000058 +#define BNX2_PORT_HW_CFG_CFG_DFLT_LINK_MASK 0x001f0000 +#define BNX2_PORT_HW_CFG_CFG_DFLT_LINK_AN 0x00000000 +#define BNX2_PORT_HW_CFG_CFG_DFLT_LINK_1G 0x00030000 #define BNX2_PORT_HW_CFG_IMD_MAC_A_UPPER 0x00000068 #define BNX2_PORT_HW_CFG_IMD_MAC_A_LOWER 0x0000006c diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index a2e8dda5afa..d2f34d5a808 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -2419,22 +2419,19 @@ out: return 0; } -int bond_3ad_lacpdu_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type* ptype) +int bond_3ad_lacpdu_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type* ptype, struct net_device *orig_dev) { struct bonding *bond = dev->priv; struct slave *slave = NULL; int ret = NET_RX_DROP; - if (!(dev->flags & IFF_MASTER)) { + if (!(dev->flags & IFF_MASTER)) goto out; - } read_lock(&bond->lock); - slave = bond_get_slave_by_dev((struct bonding *)dev->priv, - skb->real_dev); - if (slave == NULL) { + slave = bond_get_slave_by_dev((struct bonding *)dev->priv, orig_dev); + if (!slave) goto out_unlock; - } bond_3ad_rx_indication((struct lacpdu *) skb->data, slave, skb->len); diff --git a/drivers/net/bonding/bond_3ad.h b/drivers/net/bonding/bond_3ad.h index f4682389418..673a30af566 100644 --- a/drivers/net/bonding/bond_3ad.h +++ b/drivers/net/bonding/bond_3ad.h @@ -295,6 +295,6 @@ void bond_3ad_adapter_duplex_changed(struct slave *slave); void bond_3ad_handle_link_change(struct slave *slave, char link); int bond_3ad_get_active_agg_info(struct bonding *bond, struct ad_info *ad_info); int bond_3ad_xmit_xor(struct sk_buff *skb, struct net_device *dev); -int bond_3ad_lacpdu_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type* ptype); +int bond_3ad_lacpdu_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type* ptype, struct net_device *orig_dev); #endif //__BOND_3AD_H__ diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c index 19e829b567d..f8fce396119 100644 --- a/drivers/net/bonding/bond_alb.c +++ b/drivers/net/bonding/bond_alb.c @@ -354,15 +354,14 @@ static void rlb_update_entry_from_arp(struct bonding *bond, struct arp_pkt *arp) _unlock_rx_hashtbl(bond); } -static int rlb_arp_recv(struct sk_buff *skb, struct net_device *bond_dev, struct packet_type *ptype) +static int rlb_arp_recv(struct sk_buff *skb, struct net_device *bond_dev, struct packet_type *ptype, struct net_device *orig_dev) { struct bonding *bond = bond_dev->priv; struct arp_pkt *arp = (struct arp_pkt *)skb->data; int res = NET_RX_DROP; - if (!(bond_dev->flags & IFF_MASTER)) { + if (!(bond_dev->flags & IFF_MASTER)) goto out; - } if (!arp) { dprintk("Packet has no ARP data\n"); diff --git a/drivers/net/hamradio/bpqether.c b/drivers/net/hamradio/bpqether.c index ba9f0580e1f..2946e037a9b 100644 --- a/drivers/net/hamradio/bpqether.c +++ b/drivers/net/hamradio/bpqether.c @@ -98,7 +98,7 @@ static char bcast_addr[6]={0xFF,0xFF,0xFF,0xFF,0xFF,0xFF}; static char bpq_eth_addr[6]; -static int bpq_rcv(struct sk_buff *, struct net_device *, struct packet_type *); +static int bpq_rcv(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); static int bpq_device_event(struct notifier_block *, unsigned long, void *); static const char *bpq_print_ethaddr(const unsigned char *); @@ -165,7 +165,7 @@ static inline int dev_is_ethdev(struct net_device *dev) /* * Receive an AX.25 frame via an ethernet interface. */ -static int bpq_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *ptype) +static int bpq_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *ptype, struct net_device *orig_dev) { int len; char * ptr; diff --git a/drivers/net/ibmveth.c b/drivers/net/ibmveth.c index c39b0609742..32d5fabd4b1 100644 --- a/drivers/net/ibmveth.c +++ b/drivers/net/ibmveth.c @@ -1144,7 +1144,7 @@ static void ibmveth_proc_unregister_driver(void) static struct vio_device_id ibmveth_device_table[] __devinitdata= { { "network", "IBM,l-lan"}, - { 0,} + { "", "" } }; MODULE_DEVICE_TABLE(vio, ibmveth_device_table); diff --git a/drivers/net/iseries_veth.c b/drivers/net/iseries_veth.c index 55af32e9bf0..183ba97785b 100644 --- a/drivers/net/iseries_veth.c +++ b/drivers/net/iseries_veth.c @@ -1370,7 +1370,7 @@ static int veth_probe(struct vio_dev *vdev, const struct vio_device_id *id) */ static struct vio_device_id veth_device_table[] __devinitdata = { { "vlan", "" }, - { NULL, NULL } + { "", "" } }; MODULE_DEVICE_TABLE(vio, veth_device_table); diff --git a/drivers/net/ppp_generic.c b/drivers/net/ppp_generic.c index a32668e88e0..bb71638a7c4 100644 --- a/drivers/net/ppp_generic.c +++ b/drivers/net/ppp_generic.c @@ -1657,7 +1657,6 @@ ppp_receive_nonmp_frame(struct ppp *ppp, struct sk_buff *skb) skb->dev = ppp->dev; skb->protocol = htons(npindex_to_ethertype[npi]); skb->mac.raw = skb->data; - skb->input_dev = ppp->dev; netif_rx(skb); ppp->dev->last_rx = jiffies; } diff --git a/drivers/net/pppoe.c b/drivers/net/pppoe.c index ce1a9bf7b9a..82f236cc3b9 100644 --- a/drivers/net/pppoe.c +++ b/drivers/net/pppoe.c @@ -377,7 +377,8 @@ abort_kfree: ***********************************************************************/ static int pppoe_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) + struct packet_type *pt, + struct net_device *orig_dev) { struct pppoe_hdr *ph; @@ -426,7 +427,8 @@ out: ***********************************************************************/ static int pppoe_disc_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) + struct packet_type *pt, + struct net_device *orig_dev) { struct pppoe_hdr *ph; diff --git a/drivers/net/rrunner.c b/drivers/net/rrunner.c index 12a86f96d97..ec1a18d189a 100644 --- a/drivers/net/rrunner.c +++ b/drivers/net/rrunner.c @@ -1429,6 +1429,7 @@ static int rr_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct rr_private *rrpriv = netdev_priv(dev); struct rr_regs __iomem *regs = rrpriv->regs; + struct hippi_cb *hcb = (struct hippi_cb *) skb->cb; struct ring_ctrl *txctrl; unsigned long flags; u32 index, len = skb->len; @@ -1460,7 +1461,7 @@ static int rr_start_xmit(struct sk_buff *skb, struct net_device *dev) ifield = (u32 *)skb_push(skb, 8); ifield[0] = 0; - ifield[1] = skb->private.ifield; + ifield[1] = hcb->ifield; /* * We don't need the lock before we are actually going to start diff --git a/drivers/net/s2io.h b/drivers/net/s2io.h index 5d9270730ca..bc64d967f08 100644 --- a/drivers/net/s2io.h +++ b/drivers/net/s2io.h @@ -762,8 +762,8 @@ static inline u64 readq(void __iomem *addr) { u64 ret = 0; ret = readl(addr + 4); - (u64) ret <<= 32; - (u64) ret |= readl(addr); + ret <<= 32; + ret |= readl(addr); return ret; } diff --git a/drivers/net/shaper.c b/drivers/net/shaper.c index 3ad0b6751f6..221354eea21 100644 --- a/drivers/net/shaper.c +++ b/drivers/net/shaper.c @@ -156,52 +156,6 @@ static int shaper_start_xmit(struct sk_buff *skb, struct net_device *dev) SHAPERCB(skb)->shapelen= shaper_clocks(shaper,skb); -#ifdef SHAPER_COMPLEX /* and broken.. */ - - while(ptr && ptr!=(struct sk_buff *)&shaper->sendq) - { - if(ptr->pri<skb->pri - && jiffies - SHAPERCB(ptr)->shapeclock < SHAPER_MAXSLIP) - { - struct sk_buff *tmp=ptr->prev; - - /* - * It goes before us therefore we slip the length - * of the new frame. - */ - - SHAPERCB(ptr)->shapeclock+=SHAPERCB(skb)->shapelen; - SHAPERCB(ptr)->shapelatency+=SHAPERCB(skb)->shapelen; - - /* - * The packet may have slipped so far back it - * fell off. - */ - if(SHAPERCB(ptr)->shapelatency > SHAPER_LATENCY) - { - skb_unlink(ptr); - dev_kfree_skb(ptr); - } - ptr=tmp; - } - else - break; - } - if(ptr==NULL || ptr==(struct sk_buff *)&shaper->sendq) - skb_queue_head(&shaper->sendq,skb); - else - { - struct sk_buff *tmp; - /* - * Set the packet clock out time according to the - * frames ahead. Im sure a bit of thought could drop - * this loop. - */ - for(tmp=skb_peek(&shaper->sendq); tmp!=NULL && tmp!=ptr; tmp=tmp->next) - SHAPERCB(skb)->shapeclock+=tmp->shapelen; - skb_append(ptr,skb); - } -#else { struct sk_buff *tmp; /* @@ -220,7 +174,7 @@ static int shaper_start_xmit(struct sk_buff *skb, struct net_device *dev) } else skb_queue_tail(&shaper->sendq, skb); } -#endif + if(sh_debug) printk("Frame queued.\n"); if(skb_queue_len(&shaper->sendq)>SHAPER_QLEN) @@ -302,7 +256,7 @@ static void shaper_kick(struct shaper *shaper) * Pull the frame and get interrupts back on. */ - skb_unlink(skb); + skb_unlink(skb, &shaper->sendq); if (shaper->recovery < SHAPERCB(skb)->shapeclock + SHAPERCB(skb)->shapelen) shaper->recovery = SHAPERCB(skb)->shapeclock + SHAPERCB(skb)->shapelen; diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index 6d4ab1e333b..af8263a1580 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -340,41 +340,92 @@ static struct { static void tg3_write_indirect_reg32(struct tg3 *tp, u32 off, u32 val) { - if ((tp->tg3_flags & TG3_FLAG_PCIX_TARGET_HWBUG) != 0) { - spin_lock_bh(&tp->indirect_lock); - pci_write_config_dword(tp->pdev, TG3PCI_REG_BASE_ADDR, off); - pci_write_config_dword(tp->pdev, TG3PCI_REG_DATA, val); - spin_unlock_bh(&tp->indirect_lock); - } else { - writel(val, tp->regs + off); - if ((tp->tg3_flags & TG3_FLAG_5701_REG_WRITE_BUG) != 0) - readl(tp->regs + off); + unsigned long flags; + + spin_lock_irqsave(&tp->indirect_lock, flags); + pci_write_config_dword(tp->pdev, TG3PCI_REG_BASE_ADDR, off); + pci_write_config_dword(tp->pdev, TG3PCI_REG_DATA, val); + spin_unlock_irqrestore(&tp->indirect_lock, flags); +} + +static void tg3_write_flush_reg32(struct tg3 *tp, u32 off, u32 val) +{ + writel(val, tp->regs + off); + readl(tp->regs + off); +} + +static u32 tg3_read_indirect_reg32(struct tg3 *tp, u32 off) +{ + unsigned long flags; + u32 val; + + spin_lock_irqsave(&tp->indirect_lock, flags); + pci_write_config_dword(tp->pdev, TG3PCI_REG_BASE_ADDR, off); + pci_read_config_dword(tp->pdev, TG3PCI_REG_DATA, &val); + spin_unlock_irqrestore(&tp->indirect_lock, flags); + return val; +} + +static void tg3_write_indirect_mbox(struct tg3 *tp, u32 off, u32 val) +{ + unsigned long flags; + + if (off == (MAILBOX_RCVRET_CON_IDX_0 + TG3_64BIT_REG_LOW)) { + pci_write_config_dword(tp->pdev, TG3PCI_RCV_RET_RING_CON_IDX + + TG3_64BIT_REG_LOW, val); + return; + } + if (off == (MAILBOX_RCV_STD_PROD_IDX + TG3_64BIT_REG_LOW)) { + pci_write_config_dword(tp->pdev, TG3PCI_STD_RING_PROD_IDX + + TG3_64BIT_REG_LOW, val); + return; + } + + spin_lock_irqsave(&tp->indirect_lock, flags); + pci_write_config_dword(tp->pdev, TG3PCI_REG_BASE_ADDR, off + 0x5600); + pci_write_config_dword(tp->pdev, TG3PCI_REG_DATA, val); + spin_unlock_irqrestore(&tp->indirect_lock, flags); + + /* In indirect mode when disabling interrupts, we also need + * to clear the interrupt bit in the GRC local ctrl register. + */ + if ((off == (MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW)) && + (val == 0x1)) { + pci_write_config_dword(tp->pdev, TG3PCI_MISC_LOCAL_CTRL, + tp->grc_local_ctrl|GRC_LCLCTRL_CLEARINT); } } +static u32 tg3_read_indirect_mbox(struct tg3 *tp, u32 off) +{ + unsigned long flags; + u32 val; + + spin_lock_irqsave(&tp->indirect_lock, flags); + pci_write_config_dword(tp->pdev, TG3PCI_REG_BASE_ADDR, off + 0x5600); + pci_read_config_dword(tp->pdev, TG3PCI_REG_DATA, &val); + spin_unlock_irqrestore(&tp->indirect_lock, flags); + return val; +} + static void _tw32_flush(struct tg3 *tp, u32 off, u32 val) { - if ((tp->tg3_flags & TG3_FLAG_PCIX_TARGET_HWBUG) != 0) { - spin_lock_bh(&tp->indirect_lock); - pci_write_config_dword(tp->pdev, TG3PCI_REG_BASE_ADDR, off); - pci_write_config_dword(tp->pdev, TG3PCI_REG_DATA, val); - spin_unlock_bh(&tp->indirect_lock); - } else { - void __iomem *dest = tp->regs + off; - writel(val, dest); - readl(dest); /* always flush PCI write */ - } + tp->write32(tp, off, val); + if (!(tp->tg3_flags & TG3_FLAG_PCIX_TARGET_HWBUG) && + !(tp->tg3_flags & TG3_FLAG_5701_REG_WRITE_BUG) && + !(tp->tg3_flags2 & TG3_FLG2_ICH_WORKAROUND)) + tp->read32(tp, off); /* flush */ } -static inline void _tw32_rx_mbox(struct tg3 *tp, u32 off, u32 val) +static inline void tw32_mailbox_flush(struct tg3 *tp, u32 off, u32 val) { - void __iomem *mbox = tp->regs + off; - writel(val, mbox); - if (tp->tg3_flags & TG3_FLAG_MBOX_WRITE_REORDER) - readl(mbox); + tp->write32_mbox(tp, off, val); + if (!(tp->tg3_flags & TG3_FLAG_MBOX_WRITE_REORDER) && + !(tp->tg3_flags2 & TG3_FLG2_ICH_WORKAROUND)) + tp->read32_mbox(tp, off); } -static inline void _tw32_tx_mbox(struct tg3 *tp, u32 off, u32 val) +static void tg3_write32_tx_mbox(struct tg3 *tp, u32 off, u32 val) { void __iomem *mbox = tp->regs + off; writel(val, mbox); @@ -384,46 +435,57 @@ static inline void _tw32_tx_mbox(struct tg3 *tp, u32 off, u32 val) readl(mbox); } -#define tw32_mailbox(reg, val) writel(((val) & 0xffffffff), tp->regs + (reg)) -#define tw32_rx_mbox(reg, val) _tw32_rx_mbox(tp, reg, val) -#define tw32_tx_mbox(reg, val) _tw32_tx_mbox(tp, reg, val) +static void tg3_write32(struct tg3 *tp, u32 off, u32 val) +{ + writel(val, tp->regs + off); +} + +static u32 tg3_read32(struct tg3 *tp, u32 off) +{ + return (readl(tp->regs + off)); +} + +#define tw32_mailbox(reg, val) tp->write32_mbox(tp, reg, val) +#define tw32_mailbox_f(reg, val) tw32_mailbox_flush(tp, (reg), (val)) +#define tw32_rx_mbox(reg, val) tp->write32_rx_mbox(tp, reg, val) +#define tw32_tx_mbox(reg, val) tp->write32_tx_mbox(tp, reg, val) +#define tr32_mailbox(reg) tp->read32_mbox(tp, reg) -#define tw32(reg,val) tg3_write_indirect_reg32(tp,(reg),(val)) +#define tw32(reg,val) tp->write32(tp, reg, val) #define tw32_f(reg,val) _tw32_flush(tp,(reg),(val)) -#define tw16(reg,val) writew(((val) & 0xffff), tp->regs + (reg)) -#define tw8(reg,val) writeb(((val) & 0xff), tp->regs + (reg)) -#define tr32(reg) readl(tp->regs + (reg)) -#define tr16(reg) readw(tp->regs + (reg)) -#define tr8(reg) readb(tp->regs + (reg)) +#define tr32(reg) tp->read32(tp, reg) static void tg3_write_mem(struct tg3 *tp, u32 off, u32 val) { - spin_lock_bh(&tp->indirect_lock); + unsigned long flags; + + spin_lock_irqsave(&tp->indirect_lock, flags); pci_write_config_dword(tp->pdev, TG3PCI_MEM_WIN_BASE_ADDR, off); pci_write_config_dword(tp->pdev, TG3PCI_MEM_WIN_DATA, val); /* Always leave this as zero. */ pci_write_config_dword(tp->pdev, TG3PCI_MEM_WIN_BASE_ADDR, 0); - spin_unlock_bh(&tp->indirect_lock); + spin_unlock_irqrestore(&tp->indirect_lock, flags); } static void tg3_read_mem(struct tg3 *tp, u32 off, u32 *val) { - spin_lock_bh(&tp->indirect_lock); + unsigned long flags; + + spin_lock_irqsave(&tp->indirect_lock, flags); pci_write_config_dword(tp->pdev, TG3PCI_MEM_WIN_BASE_ADDR, off); pci_read_config_dword(tp->pdev, TG3PCI_MEM_WIN_DATA, val); /* Always leave this as zero. */ pci_write_config_dword(tp->pdev, TG3PCI_MEM_WIN_BASE_ADDR, 0); - spin_unlock_bh(&tp->indirect_lock); + spin_unlock_irqrestore(&tp->indirect_lock, flags); } static void tg3_disable_ints(struct tg3 *tp) { tw32(TG3PCI_MISC_HOST_CTRL, (tp->misc_host_ctrl | MISC_HOST_CTRL_MASK_PCI_INT)); - tw32_mailbox(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, 0x00000001); - tr32(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW); + tw32_mailbox_f(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, 0x00000001); } static inline void tg3_cond_int(struct tg3 *tp) @@ -439,9 +501,8 @@ static void tg3_enable_ints(struct tg3 *tp) tw32(TG3PCI_MISC_HOST_CTRL, (tp->misc_host_ctrl & ~MISC_HOST_CTRL_MASK_PCI_INT)); - tw32_mailbox(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, - (tp->last_tag << 24)); - tr32(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW); + tw32_mailbox_f(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, + (tp->last_tag << 24)); tg3_cond_int(tp); } @@ -472,8 +533,6 @@ static inline unsigned int tg3_has_work(struct tg3 *tp) */ static void tg3_restart_ints(struct tg3 *tp) { - tw32(TG3PCI_MISC_HOST_CTRL, - (tp->misc_host_ctrl & ~MISC_HOST_CTRL_MASK_PCI_INT)); tw32_mailbox(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, tp->last_tag << 24); mmiowb(); @@ -3278,9 +3337,8 @@ static irqreturn_t tg3_interrupt(int irq, void *dev_id, struct pt_regs *regs) /* No work, shared interrupt perhaps? re-enable * interrupts, and flush that PCI write */ - tw32_mailbox(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, + tw32_mailbox_f(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, 0x00000000); - tr32(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW); } } else { /* shared interrupt */ handled = 0; @@ -3323,9 +3381,8 @@ static irqreturn_t tg3_interrupt_tagged(int irq, void *dev_id, struct pt_regs *r /* no work, shared interrupt perhaps? re-enable * interrupts, and flush that PCI write */ - tw32_mailbox(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, - tp->last_tag << 24); - tr32(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW); + tw32_mailbox_f(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, + tp->last_tag << 24); } } else { /* shared interrupt */ handled = 0; @@ -4216,7 +4273,7 @@ static void tg3_stop_fw(struct tg3 *); static int tg3_chip_reset(struct tg3 *tp) { u32 val; - u32 flags_save; + void (*write_op)(struct tg3 *, u32, u32); int i; if (!(tp->tg3_flags2 & TG3_FLG2_SUN_570X)) @@ -4228,8 +4285,9 @@ static int tg3_chip_reset(struct tg3 *tp) * fun things. So, temporarily disable the 5701 * hardware workaround, while we do the reset. */ - flags_save = tp->tg3_flags; - tp->tg3_flags &= ~TG3_FLAG_5701_REG_WRITE_BUG; + write_op = tp->write32; + if (write_op == tg3_write_flush_reg32) + tp->write32 = tg3_write32; /* do the reset */ val = GRC_MISC_CFG_CORECLK_RESET; @@ -4248,8 +4306,8 @@ static int tg3_chip_reset(struct tg3 *tp) val |= GRC_MISC_CFG_KEEP_GPHY_POWER; tw32(GRC_MISC_CFG, val); - /* restore 5701 hardware bug workaround flag */ - tp->tg3_flags = flags_save; + /* restore 5701 hardware bug workaround write method */ + tp->write32 = write_op; /* Unfortunately, we have to delay before the PCI read back. * Some 575X chips even will not respond to a PCI cfg access @@ -4635,7 +4693,6 @@ static int tg3_load_firmware_cpu(struct tg3 *tp, u32 cpu_base, u32 cpu_scratch_b int cpu_scratch_size, struct fw_info *info) { int err, i; - u32 orig_tg3_flags = tp->tg3_flags; void (*write_op)(struct tg3 *, u32, u32); if (cpu_base == TX_CPU_BASE && @@ -4651,11 +4708,6 @@ static int tg3_load_firmware_cpu(struct tg3 *tp, u32 cpu_base, u32 cpu_scratch_b else write_op = tg3_write_indirect_reg32; - /* Force use of PCI config space for indirect register - * write calls. - */ - tp->tg3_flags |= TG3_FLAG_PCIX_TARGET_HWBUG; - /* It is possible that bootcode is still loading at this point. * Get the nvram lock first before halting the cpu. */ @@ -4691,7 +4743,6 @@ static int tg3_load_firmware_cpu(struct tg3 *tp, u32 cpu_base, u32 cpu_scratch_b err = 0; out: - tp->tg3_flags = orig_tg3_flags; return err; } @@ -5808,8 +5859,7 @@ static int tg3_reset_hw(struct tg3 *tp) tw32_f(GRC_LOCAL_CTRL, tp->grc_local_ctrl); udelay(100); - tw32_mailbox(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, 0); - tr32(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW); + tw32_mailbox_f(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, 0); tp->last_tag = 0; if (!(tp->tg3_flags2 & TG3_FLG2_5705_PLUS)) { @@ -6198,7 +6248,8 @@ static int tg3_test_interrupt(struct tg3 *tp) HOSTCC_MODE_NOW); for (i = 0; i < 5; i++) { - int_mbox = tr32(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW); + int_mbox = tr32_mailbox(MAILBOX_INTERRUPT_0 + + TG3_64BIT_REG_LOW); if (int_mbox != 0) break; msleep(10); @@ -6598,10 +6649,10 @@ static int tg3_open(struct net_device *dev) /* Mailboxes */ printk("DEBUG: SNDHOST_PROD[%08x%08x] SNDNIC_PROD[%08x%08x]\n", - tr32(MAILBOX_SNDHOST_PROD_IDX_0 + 0x0), - tr32(MAILBOX_SNDHOST_PROD_IDX_0 + 0x4), - tr32(MAILBOX_SNDNIC_PROD_IDX_0 + 0x0), - tr32(MAILBOX_SNDNIC_PROD_IDX_0 + 0x4)); + tr32_mailbox(MAILBOX_SNDHOST_PROD_IDX_0 + 0x0), + tr32_mailbox(MAILBOX_SNDHOST_PROD_IDX_0 + 0x4), + tr32_mailbox(MAILBOX_SNDNIC_PROD_IDX_0 + 0x0), + tr32_mailbox(MAILBOX_SNDNIC_PROD_IDX_0 + 0x4)); /* NIC side send descriptors. */ for (i = 0; i < 6; i++) { @@ -7901,7 +7952,7 @@ static int tg3_test_loopback(struct tg3 *tp) num_pkts++; tw32_tx_mbox(MAILBOX_SNDHOST_PROD_IDX_0 + TG3_64BIT_REG_LOW, send_idx); - tr32(MAILBOX_SNDHOST_PROD_IDX_0 + TG3_64BIT_REG_LOW); + tr32_mailbox(MAILBOX_SNDHOST_PROD_IDX_0 + TG3_64BIT_REG_LOW); udelay(10); @@ -9153,14 +9204,6 @@ static int __devinit tg3_is_sun_570X(struct tg3 *tp) static int __devinit tg3_get_invariants(struct tg3 *tp) { static struct pci_device_id write_reorder_chipsets[] = { - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_82801AA_8) }, - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_82801AB_8) }, - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_82801BA_11) }, - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_82801BA_6) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_FE_GATE_700C) }, { }, @@ -9177,7 +9220,7 @@ static int __devinit tg3_get_invariants(struct tg3 *tp) tp->tg3_flags2 |= TG3_FLG2_SUN_570X; #endif - /* If we have an AMD 762 or Intel ICH/ICH0/ICH2 chipset, write + /* If we have an AMD 762 chipset, write * reordering to the mailbox registers done by the host * controller can cause major troubles. We read back from * every mailbox register write to force the writes to be @@ -9215,6 +9258,69 @@ static int __devinit tg3_get_invariants(struct tg3 *tp) if (tp->pci_chip_rev_id == CHIPREV_ID_5752_A0_HW) tp->pci_chip_rev_id = CHIPREV_ID_5752_A0; + /* If we have 5702/03 A1 or A2 on certain ICH chipsets, + * we need to disable memory and use config. cycles + * only to access all registers. The 5702/03 chips + * can mistakenly decode the special cycles from the + * ICH chipsets as memory write cycles, causing corruption + * of register and memory space. Only certain ICH bridges + * will drive special cycles with non-zero data during the + * address phase which can fall within the 5703's address + * range. This is not an ICH bug as the PCI spec allows + * non-zero address during special cycles. However, only + * these ICH bridges are known to drive non-zero addresses + * during special cycles. + * + * Since special cycles do not cross PCI bridges, we only + * enable this workaround if the 5703 is on the secondary + * bus of these ICH bridges. + */ + if ((tp->pci_chip_rev_id == CHIPREV_ID_5703_A1) || + (tp->pci_chip_rev_id == CHIPREV_ID_5703_A2)) { + static struct tg3_dev_id { + u32 vendor; + u32 device; + u32 rev; + } ich_chipsets[] = { + { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801AA_8, + PCI_ANY_ID }, + { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801AB_8, + PCI_ANY_ID }, + { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801BA_11, + 0xa }, + { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801BA_6, + PCI_ANY_ID }, + { }, + }; + struct tg3_dev_id *pci_id = &ich_chipsets[0]; + struct pci_dev *bridge = NULL; + + while (pci_id->vendor != 0) { + bridge = pci_get_device(pci_id->vendor, pci_id->device, + bridge); + if (!bridge) { + pci_id++; + continue; + } + if (pci_id->rev != PCI_ANY_ID) { + u8 rev; + + pci_read_config_byte(bridge, PCI_REVISION_ID, + &rev); + if (rev > pci_id->rev) + continue; + } + if (bridge->subordinate && + (bridge->subordinate->number == + tp->pdev->bus->number)) { + + tp->tg3_flags2 |= TG3_FLG2_ICH_WORKAROUND; + pci_dev_put(bridge); + break; + } + } + } + /* Find msi capability. */ if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5780) tp->msi_cap = pci_find_capability(tp->pdev, PCI_CAP_ID_MSI); @@ -9302,6 +9408,12 @@ static int __devinit tg3_get_invariants(struct tg3 *tp) } } + /* 5700 BX chips need to have their TX producer index mailboxes + * written twice to workaround a bug. + */ + if (GET_CHIP_REV(tp->pci_chip_rev_id) == CHIPREV_5700_BX) + tp->tg3_flags |= TG3_FLAG_TXD_MBOX_HWBUG; + /* Back to back register writes can cause problems on this chip, * the workaround is to read back all reg writes except those to * mailbox regs. See tg3_write_indirect_reg32(). @@ -9325,6 +9437,43 @@ static int __devinit tg3_get_invariants(struct tg3 *tp) pci_write_config_dword(tp->pdev, TG3PCI_PCISTATE, pci_state_reg); } + /* Default fast path register access methods */ + tp->read32 = tg3_read32; + tp->write32 = tg3_write32; + tp->read32_mbox = tg3_read32; + tp->write32_mbox = tg3_write32; + tp->write32_tx_mbox = tg3_write32; + tp->write32_rx_mbox = tg3_write32; + + /* Various workaround register access methods */ + if (tp->tg3_flags & TG3_FLAG_PCIX_TARGET_HWBUG) + tp->write32 = tg3_write_indirect_reg32; + else if (tp->tg3_flags & TG3_FLAG_5701_REG_WRITE_BUG) + tp->write32 = tg3_write_flush_reg32; + + if ((tp->tg3_flags & TG3_FLAG_TXD_MBOX_HWBUG) || + (tp->tg3_flags & TG3_FLAG_MBOX_WRITE_REORDER)) { + tp->write32_tx_mbox = tg3_write32_tx_mbox; + if (tp->tg3_flags & TG3_FLAG_MBOX_WRITE_REORDER) + tp->write32_rx_mbox = tg3_write_flush_reg32; + } + + if (tp->tg3_flags2 & TG3_FLG2_ICH_WORKAROUND) { + tp->read32 = tg3_read_indirect_reg32; + tp->write32 = tg3_write_indirect_reg32; + tp->read32_mbox = tg3_read_indirect_mbox; + tp->write32_mbox = tg3_write_indirect_mbox; + tp->write32_tx_mbox = tg3_write_indirect_mbox; + tp->write32_rx_mbox = tg3_write_indirect_mbox; + + iounmap(tp->regs); + tp->regs = 0; + + pci_read_config_word(tp->pdev, PCI_COMMAND, &pci_cmd); + pci_cmd &= ~PCI_COMMAND_MEMORY; + pci_write_config_word(tp->pdev, PCI_COMMAND, pci_cmd); + } + /* Get eeprom hw config before calling tg3_set_power_state(). * In particular, the TG3_FLAG_EEPROM_WRITE_PROT flag must be * determined before calling tg3_set_power_state() so that @@ -9539,14 +9688,6 @@ static int __devinit tg3_get_invariants(struct tg3 *tp) else tp->tg3_flags &= ~TG3_FLAG_POLL_SERDES; - /* 5700 BX chips need to have their TX producer index mailboxes - * written twice to workaround a bug. - */ - if (GET_CHIP_REV(tp->pci_chip_rev_id) == CHIPREV_5700_BX) - tp->tg3_flags |= TG3_FLAG_TXD_MBOX_HWBUG; - else - tp->tg3_flags &= ~TG3_FLAG_TXD_MBOX_HWBUG; - /* It seems all chips can get confused if TX buffers * straddle the 4GB address boundary in some cases. */ @@ -10469,7 +10610,10 @@ static int __devinit tg3_init_one(struct pci_dev *pdev, return 0; err_out_iounmap: - iounmap(tp->regs); + if (tp->regs) { + iounmap(tp->regs); + tp->regs = 0; + } err_out_free_dev: free_netdev(dev); @@ -10491,7 +10635,10 @@ static void __devexit tg3_remove_one(struct pci_dev *pdev) struct tg3 *tp = netdev_priv(dev); unregister_netdev(dev); - iounmap(tp->regs); + if (tp->regs) { + iounmap(tp->regs); + tp->regs = 0; + } free_netdev(dev); pci_release_regions(pdev); pci_disable_device(pdev); diff --git a/drivers/net/tg3.h b/drivers/net/tg3.h index 5c4433c147f..c184b773e58 100644 --- a/drivers/net/tg3.h +++ b/drivers/net/tg3.h @@ -2049,6 +2049,11 @@ struct tg3 { spinlock_t lock; spinlock_t indirect_lock; + u32 (*read32) (struct tg3 *, u32); + void (*write32) (struct tg3 *, u32, u32); + u32 (*read32_mbox) (struct tg3 *, u32); + void (*write32_mbox) (struct tg3 *, u32, + u32); void __iomem *regs; struct net_device *dev; struct pci_dev *pdev; @@ -2060,6 +2065,8 @@ struct tg3 { u32 msg_enable; /* begin "tx thread" cacheline section */ + void (*write32_tx_mbox) (struct tg3 *, u32, + u32); u32 tx_prod; u32 tx_cons; u32 tx_pending; @@ -2071,6 +2078,8 @@ struct tg3 { dma_addr_t tx_desc_mapping; /* begin "rx thread" cacheline section */ + void (*write32_rx_mbox) (struct tg3 *, u32, + u32); u32 rx_rcb_ptr; u32 rx_std_ptr; u32 rx_jumbo_ptr; @@ -2165,6 +2174,7 @@ struct tg3 { #define TG3_FLG2_ANY_SERDES (TG3_FLG2_PHY_SERDES | \ TG3_FLG2_MII_SERDES) #define TG3_FLG2_PARALLEL_DETECT 0x01000000 +#define TG3_FLG2_ICH_WORKAROUND 0x02000000 u32 split_mode_max_reqs; #define SPLIT_MODE_5704_MAX_REQ 3 diff --git a/drivers/net/wan/hdlc_generic.c b/drivers/net/wan/hdlc_generic.c index a63f6a2cc4f..cdd4c09c2d9 100644 --- a/drivers/net/wan/hdlc_generic.c +++ b/drivers/net/wan/hdlc_generic.c @@ -61,7 +61,7 @@ static struct net_device_stats *hdlc_get_stats(struct net_device *dev) static int hdlc_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *p) + struct packet_type *p, struct net_device *orig_dev) { hdlc_device *hdlc = dev_to_hdlc(dev); if (hdlc->proto.netif_rx) diff --git a/drivers/net/wan/lapbether.c b/drivers/net/wan/lapbether.c index 7f2e3653c5e..6c302e9dbca 100644 --- a/drivers/net/wan/lapbether.c +++ b/drivers/net/wan/lapbether.c @@ -86,7 +86,7 @@ static __inline__ int dev_is_ethdev(struct net_device *dev) /* * Receive a LAPB frame via an ethernet interface. */ -static int lapbeth_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *ptype) +static int lapbeth_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *ptype, struct net_device *orig_dev) { int len, err; struct lapbethdev *lapbeth; diff --git a/drivers/net/wan/sdla_fr.c b/drivers/net/wan/sdla_fr.c index c5f5e62aab8..0497dbdb863 100644 --- a/drivers/net/wan/sdla_fr.c +++ b/drivers/net/wan/sdla_fr.c @@ -445,7 +445,7 @@ void s508_s514_unlock(sdla_t *card, unsigned long *smp_flags); void s508_s514_lock(sdla_t *card, unsigned long *smp_flags); unsigned short calc_checksum (char *, int); -static int setup_fr_header(struct sk_buff** skb, +static int setup_fr_header(struct sk_buff *skb, struct net_device* dev, char op_mode); @@ -1372,7 +1372,7 @@ static int if_send(struct sk_buff* skb, struct net_device* dev) /* Move the if_header() code to here. By inserting frame * relay header in if_header() we would break the * tcpdump and other packet sniffers */ - chan->fr_header_len = setup_fr_header(&skb,dev,chan->common.usedby); + chan->fr_header_len = setup_fr_header(skb,dev,chan->common.usedby); if (chan->fr_header_len < 0 ){ ++chan->ifstats.tx_dropped; ++card->wandev.stats.tx_dropped; @@ -1597,8 +1597,6 @@ static int setup_for_delayed_transmit(struct net_device* dev, return 1; } - skb_unlink(skb); - chan->transmit_length = len; chan->delay_skb = skb; @@ -4871,18 +4869,15 @@ static void unconfig_fr (sdla_t *card) } } -static int setup_fr_header(struct sk_buff **skb_orig, struct net_device* dev, +static int setup_fr_header(struct sk_buff *skb, struct net_device* dev, char op_mode) { - struct sk_buff *skb = *skb_orig; fr_channel_t *chan=dev->priv; - if (op_mode == WANPIPE){ - + if (op_mode == WANPIPE) { chan->fr_header[0]=Q922_UI; switch (htons(skb->protocol)){ - case ETH_P_IP: chan->fr_header[1]=NLPID_IP; break; @@ -4894,16 +4889,14 @@ static int setup_fr_header(struct sk_buff **skb_orig, struct net_device* dev, } /* If we are in bridging mode, we must apply - * an Ethernet header */ - if (op_mode == BRIDGE || op_mode == BRIDGE_NODE){ - - + * an Ethernet header + */ + if (op_mode == BRIDGE || op_mode == BRIDGE_NODE) { /* Encapsulate the packet as a bridged Ethernet frame. */ #ifdef DEBUG printk(KERN_INFO "%s: encapsulating skb for frame relay\n", dev->name); #endif - chan->fr_header[0] = 0x03; chan->fr_header[1] = 0x00; chan->fr_header[2] = 0x80; @@ -4916,7 +4909,6 @@ static int setup_fr_header(struct sk_buff **skb_orig, struct net_device* dev, /* Yuck. */ skb->protocol = ETH_P_802_3; return 8; - } return 0; diff --git a/drivers/net/wan/syncppp.c b/drivers/net/wan/syncppp.c index 84b65c60c79..f58c794a963 100644 --- a/drivers/net/wan/syncppp.c +++ b/drivers/net/wan/syncppp.c @@ -1447,7 +1447,7 @@ static void sppp_print_bytes (u_char *p, u16 len) * after interrupt servicing to process frames queued via netif_rx. */ -static int sppp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *p) +static int sppp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *p, struct net_device *orig_dev) { if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) return NET_RX_DROP; diff --git a/drivers/scsi/ibmvscsi/ibmvscsi.c b/drivers/scsi/ibmvscsi/ibmvscsi.c index fe09d145542..2cb3c8340ca 100644 --- a/drivers/scsi/ibmvscsi/ibmvscsi.c +++ b/drivers/scsi/ibmvscsi/ibmvscsi.c @@ -1442,7 +1442,7 @@ static int ibmvscsi_remove(struct vio_dev *vdev) */ static struct vio_device_id ibmvscsi_device_table[] __devinitdata = { {"vscsi", "IBM,v-scsi"}, - {0,} + { "", "" } }; MODULE_DEVICE_TABLE(vio, ibmvscsi_device_table); diff --git a/drivers/scsi/ibmvscsi/rpa_vscsi.c b/drivers/scsi/ibmvscsi/rpa_vscsi.c index 035f615817d..8bf5652f106 100644 --- a/drivers/scsi/ibmvscsi/rpa_vscsi.c +++ b/drivers/scsi/ibmvscsi/rpa_vscsi.c @@ -28,6 +28,7 @@ */ #include <asm/vio.h> +#include <asm/prom.h> #include <asm/iommu.h> #include <asm/hvcall.h> #include <linux/dma-mapping.h> diff --git a/drivers/scsi/sata_promise.c b/drivers/scsi/sata_promise.c index 4d8201422a1..7c4f6ecc1cc 100644 --- a/drivers/scsi/sata_promise.c +++ b/drivers/scsi/sata_promise.c @@ -84,7 +84,8 @@ static irqreturn_t pdc_interrupt (int irq, void *dev_instance, struct pt_regs *r static void pdc_eng_timeout(struct ata_port *ap); static int pdc_port_start(struct ata_port *ap); static void pdc_port_stop(struct ata_port *ap); -static void pdc_phy_reset(struct ata_port *ap); +static void pdc_pata_phy_reset(struct ata_port *ap); +static void pdc_sata_phy_reset(struct ata_port *ap); static void pdc_qc_prep(struct ata_queued_cmd *qc); static void pdc_tf_load_mmio(struct ata_port *ap, struct ata_taskfile *tf); static void pdc_exec_command_mmio(struct ata_port *ap, struct ata_taskfile *tf); @@ -111,19 +112,22 @@ static Scsi_Host_Template pdc_ata_sht = { .ordered_flush = 1, }; -static struct ata_port_operations pdc_ata_ops = { +static struct ata_port_operations pdc_sata_ops = { .port_disable = ata_port_disable, .tf_load = pdc_tf_load_mmio, .tf_read = ata_tf_read, .check_status = ata_check_status, .exec_command = pdc_exec_command_mmio, .dev_select = ata_std_dev_select, - .phy_reset = pdc_phy_reset, + + .phy_reset = pdc_sata_phy_reset, + .qc_prep = pdc_qc_prep, .qc_issue = pdc_qc_issue_prot, .eng_timeout = pdc_eng_timeout, .irq_handler = pdc_interrupt, .irq_clear = pdc_irq_clear, + .scr_read = pdc_sata_scr_read, .scr_write = pdc_sata_scr_write, .port_start = pdc_port_start, @@ -131,6 +135,27 @@ static struct ata_port_operations pdc_ata_ops = { .host_stop = ata_host_stop, }; +static struct ata_port_operations pdc_pata_ops = { + .port_disable = ata_port_disable, + .tf_load = pdc_tf_load_mmio, + .tf_read = ata_tf_read, + .check_status = ata_check_status, + .exec_command = pdc_exec_command_mmio, + .dev_select = ata_std_dev_select, + + .phy_reset = pdc_pata_phy_reset, + + .qc_prep = pdc_qc_prep, + .qc_issue = pdc_qc_issue_prot, + .eng_timeout = pdc_eng_timeout, + .irq_handler = pdc_interrupt, + .irq_clear = pdc_irq_clear, + + .port_start = pdc_port_start, + .port_stop = pdc_port_stop, + .host_stop = ata_host_stop, +}; + static struct ata_port_info pdc_port_info[] = { /* board_2037x */ { @@ -140,7 +165,7 @@ static struct ata_port_info pdc_port_info[] = { .pio_mask = 0x1f, /* pio0-4 */ .mwdma_mask = 0x07, /* mwdma0-2 */ .udma_mask = 0x7f, /* udma0-6 ; FIXME */ - .port_ops = &pdc_ata_ops, + .port_ops = &pdc_sata_ops, }, /* board_20319 */ @@ -151,7 +176,7 @@ static struct ata_port_info pdc_port_info[] = { .pio_mask = 0x1f, /* pio0-4 */ .mwdma_mask = 0x07, /* mwdma0-2 */ .udma_mask = 0x7f, /* udma0-6 ; FIXME */ - .port_ops = &pdc_ata_ops, + .port_ops = &pdc_sata_ops, }, /* board_20619 */ @@ -162,7 +187,7 @@ static struct ata_port_info pdc_port_info[] = { .pio_mask = 0x1f, /* pio0-4 */ .mwdma_mask = 0x07, /* mwdma0-2 */ .udma_mask = 0x7f, /* udma0-6 ; FIXME */ - .port_ops = &pdc_ata_ops, + .port_ops = &pdc_pata_ops, }, }; @@ -277,12 +302,23 @@ static void pdc_reset_port(struct ata_port *ap) readl(mmio); /* flush */ } -static void pdc_phy_reset(struct ata_port *ap) +static void pdc_sata_phy_reset(struct ata_port *ap) { pdc_reset_port(ap); sata_phy_reset(ap); } +static void pdc_pata_phy_reset(struct ata_port *ap) +{ + /* FIXME: add cable detect. Don't assume 40-pin cable */ + ap->cbl = ATA_CBL_PATA40; + ap->udma_mask &= ATA_UDMA_MASK_40C; + + pdc_reset_port(ap); + ata_port_probe(ap); + ata_bus_reset(ap); +} + static u32 pdc_sata_scr_read (struct ata_port *ap, unsigned int sc_reg) { if (sc_reg > SCR_CONTROL) diff --git a/drivers/usb/net/usbnet.c b/drivers/usb/net/usbnet.c index 4528a00c45b..a2f67245f6d 100644 --- a/drivers/usb/net/usbnet.c +++ b/drivers/usb/net/usbnet.c @@ -2903,19 +2903,18 @@ static struct net_device_stats *usbnet_get_stats (struct net_device *net) * completion callbacks. 2.5 should have fixed those bugs... */ -static void defer_bh (struct usbnet *dev, struct sk_buff *skb) +static void defer_bh(struct usbnet *dev, struct sk_buff *skb, struct sk_buff_head *list) { - struct sk_buff_head *list = skb->list; unsigned long flags; - spin_lock_irqsave (&list->lock, flags); - __skb_unlink (skb, list); - spin_unlock (&list->lock); - spin_lock (&dev->done.lock); - __skb_queue_tail (&dev->done, skb); + spin_lock_irqsave(&list->lock, flags); + __skb_unlink(skb, list); + spin_unlock(&list->lock); + spin_lock(&dev->done.lock); + __skb_queue_tail(&dev->done, skb); if (dev->done.qlen == 1) - tasklet_schedule (&dev->bh); - spin_unlock_irqrestore (&dev->done.lock, flags); + tasklet_schedule(&dev->bh); + spin_unlock_irqrestore(&dev->done.lock, flags); } /* some work can't be done in tasklets, so we use keventd @@ -3120,7 +3119,7 @@ block: break; } - defer_bh (dev, skb); + defer_bh(dev, skb, &dev->rxq); if (urb) { if (netif_running (dev->net) @@ -3490,7 +3489,7 @@ static void tx_complete (struct urb *urb, struct pt_regs *regs) urb->dev = NULL; entry->state = tx_done; - defer_bh (dev, skb); + defer_bh(dev, skb, &dev->txq); } /*-------------------------------------------------------------------------*/ diff --git a/drivers/w1/w1_int.c b/drivers/w1/w1_int.c index b5a5e04b6d3..498ad505fa5 100644 --- a/drivers/w1/w1_int.c +++ b/drivers/w1/w1_int.c @@ -86,9 +86,9 @@ static struct w1_master * w1_alloc_dev(u32 id, int slave_count, int slave_ttl, dev->driver = driver; - dev->groups = 23; + dev->groups = 1; dev->seq = 1; - dev->nls = netlink_kernel_create(NETLINK_W1, NULL); + dev->nls = netlink_kernel_create(NETLINK_W1, 1, NULL, THIS_MODULE); if (!dev->nls) { printk(KERN_ERR "Failed to create new netlink socket(%u) for w1 master %s.\n", NETLINK_NFLOG, dev->dev.bus_id); @@ -225,3 +225,5 @@ void w1_remove_master_device(struct w1_bus_master *bm) EXPORT_SYMBOL(w1_add_master_device); EXPORT_SYMBOL(w1_remove_master_device); + +MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_W1); diff --git a/drivers/w1/w1_netlink.c b/drivers/w1/w1_netlink.c index 2a82fb055c7..e7b774423dd 100644 --- a/drivers/w1/w1_netlink.c +++ b/drivers/w1/w1_netlink.c @@ -51,7 +51,7 @@ void w1_netlink_send(struct w1_master *dev, struct w1_netlink_msg *msg) memcpy(data, msg, sizeof(struct w1_netlink_msg)); - NETLINK_CB(skb).dst_groups = dev->groups; + NETLINK_CB(skb).dst_group = dev->groups; netlink_broadcast(dev->nls, skb, 0, dev->groups, GFP_ATOMIC); nlmsg_failure: diff --git a/fs/smbfs/sock.c b/fs/smbfs/sock.c index 93f3cd22a2e..6815b1b12b6 100644 --- a/fs/smbfs/sock.c +++ b/fs/smbfs/sock.c @@ -15,12 +15,12 @@ #include <linux/file.h> #include <linux/in.h> #include <linux/net.h> -#include <linux/tcp.h> #include <linux/mm.h> #include <linux/netdevice.h> #include <linux/smp_lock.h> #include <linux/workqueue.h> #include <net/scm.h> +#include <net/tcp_states.h> #include <net/ip.h> #include <linux/smb_fs.h> diff --git a/include/asm-alpha/socket.h b/include/asm-alpha/socket.h index d00259d3dc7..b5193229132 100644 --- a/include/asm-alpha/socket.h +++ b/include/asm-alpha/socket.h @@ -25,6 +25,8 @@ #define SO_ERROR 0x1007 #define SO_SNDBUF 0x1001 #define SO_RCVBUF 0x1002 +#define SO_SNDBUFFORCE 0x100a +#define SO_RCVBUFFORCE 0x100b #define SO_RCVLOWAT 0x1010 #define SO_SNDLOWAT 0x1011 #define SO_RCVTIMEO 0x1012 diff --git a/include/asm-arm/socket.h b/include/asm-arm/socket.h index 46d20585d95..3c51da6438c 100644 --- a/include/asm-arm/socket.h +++ b/include/asm-arm/socket.h @@ -14,6 +14,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff --git a/include/asm-arm26/socket.h b/include/asm-arm26/socket.h index 46d20585d95..3c51da6438c 100644 --- a/include/asm-arm26/socket.h +++ b/include/asm-arm26/socket.h @@ -14,6 +14,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff --git a/include/asm-cris/socket.h b/include/asm-cris/socket.h index f159b4f165f..8b1da3e58c5 100644 --- a/include/asm-cris/socket.h +++ b/include/asm-cris/socket.h @@ -16,6 +16,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff --git a/include/asm-frv/socket.h b/include/asm-frv/socket.h index c3be17c7de4..7177f8b9817 100644 --- a/include/asm-frv/socket.h +++ b/include/asm-frv/socket.h @@ -14,6 +14,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff --git a/include/asm-h8300/socket.h b/include/asm-h8300/socket.h index af33b8525dc..d98cf85bafc 100644 --- a/include/asm-h8300/socket.h +++ b/include/asm-h8300/socket.h @@ -14,6 +14,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff --git a/include/asm-i386/checksum.h b/include/asm-i386/checksum.h index f949e44c2a3..67d3630c4e8 100644 --- a/include/asm-i386/checksum.h +++ b/include/asm-i386/checksum.h @@ -83,7 +83,7 @@ static inline unsigned short ip_fast_csum(unsigned char * iph, "adcl $0, %0 ;\n" "notl %0 ;\n" "2: ;\n" - /* Since the input registers which are loaded with iph and ipl + /* Since the input registers which are loaded with iph and ihl are modified, we must also specify them as outputs, or gcc will assume they contain their original values. */ : "=r" (sum), "=r" (iph), "=r" (ihl) diff --git a/include/asm-i386/socket.h b/include/asm-i386/socket.h index 07f6b38ad14..802ae76195b 100644 --- a/include/asm-i386/socket.h +++ b/include/asm-i386/socket.h @@ -14,6 +14,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff --git a/include/asm-ia64/socket.h b/include/asm-ia64/socket.h index 21a9f10d6ba..a255006fb7b 100644 --- a/include/asm-ia64/socket.h +++ b/include/asm-ia64/socket.h @@ -23,6 +23,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff --git a/include/asm-m32r/checksum.h b/include/asm-m32r/checksum.h index 99f37dbf255..877ebf46e9f 100644 --- a/include/asm-m32r/checksum.h +++ b/include/asm-m32r/checksum.h @@ -105,7 +105,7 @@ static inline unsigned short ip_fast_csum(unsigned char * iph, " addx %0, %3 \n" " .fillinsn\n" "2: \n" - /* Since the input registers which are loaded with iph and ipl + /* Since the input registers which are loaded with iph and ihl are modified, we must also specify them as outputs, or gcc will assume they contain their original values. */ : "=&r" (sum), "=r" (iph), "=r" (ihl), "=&r" (tmpreg0), "=&r" (tmpreg1) diff --git a/include/asm-m32r/socket.h b/include/asm-m32r/socket.h index 159519d9904..8b6680f223c 100644 --- a/include/asm-m32r/socket.h +++ b/include/asm-m32r/socket.h @@ -14,6 +14,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff --git a/include/asm-m68k/socket.h b/include/asm-m68k/socket.h index 8d0b9fc2d07..f578ca4b776 100644 --- a/include/asm-m68k/socket.h +++ b/include/asm-m68k/socket.h @@ -14,6 +14,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff --git a/include/asm-mips/socket.h b/include/asm-mips/socket.h index 020b4db70ee..d478a86294e 100644 --- a/include/asm-mips/socket.h +++ b/include/asm-mips/socket.h @@ -37,6 +37,8 @@ To add: #define SO_REUSEPORT 0x0200 /* Allow local address and port reuse. */ #define SO_ERROR 0x1007 /* get error status and clear */ #define SO_SNDBUF 0x1001 /* Send buffer size. */ #define SO_RCVBUF 0x1002 /* Receive buffer. */ +#define SO_SNDBUFFORCE 0x100a +#define SO_RCVBUFFORCE 0x100b #define SO_SNDLOWAT 0x1003 /* send low-water mark */ #define SO_RCVLOWAT 0x1004 /* receive low-water mark */ #define SO_SNDTIMEO 0x1005 /* send timeout */ diff --git a/include/asm-parisc/socket.h b/include/asm-parisc/socket.h index 4a77996c186..1bf54dc53c1 100644 --- a/include/asm-parisc/socket.h +++ b/include/asm-parisc/socket.h @@ -16,6 +16,8 @@ /* To add :#define SO_REUSEPORT 0x0200 */ #define SO_SNDBUF 0x1001 #define SO_RCVBUF 0x1002 +#define SO_SNDBUFFORCE 0x100a +#define SO_RCVBUFFORCE 0x100b #define SO_SNDLOWAT 0x1003 #define SO_RCVLOWAT 0x1004 #define SO_SNDTIMEO 0x1005 diff --git a/include/asm-ppc64/8253pit.h b/include/asm-powerpc/8253pit.h index 285f78488cc..862708a749b 100644 --- a/include/asm-ppc64/8253pit.h +++ b/include/asm-powerpc/8253pit.h @@ -5,6 +5,6 @@ #ifndef _8253PIT_H #define _8253PIT_H -#define PIT_TICK_RATE 1193182UL +#define PIT_TICK_RATE 1193182UL #endif diff --git a/include/asm-ppc/agp.h b/include/asm-powerpc/agp.h index ca9e423307f..ca9e423307f 100644 --- a/include/asm-ppc/agp.h +++ b/include/asm-powerpc/agp.h diff --git a/include/asm-powerpc/cputime.h b/include/asm-powerpc/cputime.h new file mode 100644 index 00000000000..6d68ad7e0ea --- /dev/null +++ b/include/asm-powerpc/cputime.h @@ -0,0 +1 @@ +#include <asm-generic/cputime.h> diff --git a/include/asm-ppc/div64.h b/include/asm-powerpc/div64.h index 6cd978cefb2..6cd978cefb2 100644 --- a/include/asm-ppc/div64.h +++ b/include/asm-powerpc/div64.h diff --git a/include/asm-powerpc/emergency-restart.h b/include/asm-powerpc/emergency-restart.h new file mode 100644 index 00000000000..3711bd9d50b --- /dev/null +++ b/include/asm-powerpc/emergency-restart.h @@ -0,0 +1 @@ +#include <asm-generic/emergency-restart.h> diff --git a/include/asm-ppc/errno.h b/include/asm-powerpc/errno.h index 19f20bd41ae..19f20bd41ae 100644 --- a/include/asm-ppc/errno.h +++ b/include/asm-powerpc/errno.h diff --git a/include/asm-ppc/ioctl.h b/include/asm-powerpc/ioctl.h index 93c6acfdd0f..93c6acfdd0f 100644 --- a/include/asm-ppc/ioctl.h +++ b/include/asm-powerpc/ioctl.h diff --git a/include/asm-ppc/ioctls.h b/include/asm-powerpc/ioctls.h index f5b7f2b055e..f5b7f2b055e 100644 --- a/include/asm-ppc/ioctls.h +++ b/include/asm-powerpc/ioctls.h diff --git a/include/asm-ppc/ipc.h b/include/asm-powerpc/ipc.h index a46e3d9c2a3..a46e3d9c2a3 100644 --- a/include/asm-ppc/ipc.h +++ b/include/asm-powerpc/ipc.h diff --git a/include/asm-ppc/linkage.h b/include/asm-powerpc/linkage.h index 291c2d01c44..291c2d01c44 100644 --- a/include/asm-ppc/linkage.h +++ b/include/asm-powerpc/linkage.h diff --git a/include/asm-ppc64/local.h b/include/asm-powerpc/local.h index c11c530f74d..c11c530f74d 100644 --- a/include/asm-ppc64/local.h +++ b/include/asm-powerpc/local.h diff --git a/include/asm-ppc/namei.h b/include/asm-powerpc/namei.h index 29c9ec83213..29c9ec83213 100644 --- a/include/asm-ppc/namei.h +++ b/include/asm-powerpc/namei.h diff --git a/include/asm-powerpc/percpu.h b/include/asm-powerpc/percpu.h new file mode 100644 index 00000000000..06a959d6723 --- /dev/null +++ b/include/asm-powerpc/percpu.h @@ -0,0 +1 @@ +#include <asm-generic/percpu.h> diff --git a/include/asm-ppc/poll.h b/include/asm-powerpc/poll.h index be5024913c6..be5024913c6 100644 --- a/include/asm-ppc/poll.h +++ b/include/asm-powerpc/poll.h diff --git a/include/asm-powerpc/resource.h b/include/asm-powerpc/resource.h new file mode 100644 index 00000000000..04bc4db8921 --- /dev/null +++ b/include/asm-powerpc/resource.h @@ -0,0 +1 @@ +#include <asm-generic/resource.h> diff --git a/include/asm-ppc/shmparam.h b/include/asm-powerpc/shmparam.h index d6250602ae6..d6250602ae6 100644 --- a/include/asm-ppc/shmparam.h +++ b/include/asm-powerpc/shmparam.h diff --git a/include/asm-ppc/string.h b/include/asm-powerpc/string.h index 22557599739..22557599739 100644 --- a/include/asm-ppc/string.h +++ b/include/asm-powerpc/string.h diff --git a/include/asm-ppc/unaligned.h b/include/asm-powerpc/unaligned.h index 45520d9b85d..45520d9b85d 100644 --- a/include/asm-ppc/unaligned.h +++ b/include/asm-powerpc/unaligned.h diff --git a/include/asm-ppc/xor.h b/include/asm-powerpc/xor.h index c82eb12a5b1..c82eb12a5b1 100644 --- a/include/asm-ppc/xor.h +++ b/include/asm-powerpc/xor.h diff --git a/include/asm-ppc/8253pit.h b/include/asm-ppc/8253pit.h deleted file mode 100644 index 285f78488cc..00000000000 --- a/include/asm-ppc/8253pit.h +++ /dev/null @@ -1,10 +0,0 @@ -/* - * 8253/8254 Programmable Interval Timer - */ - -#ifndef _8253PIT_H -#define _8253PIT_H - -#define PIT_TICK_RATE 1193182UL - -#endif diff --git a/include/asm-ppc/cputime.h b/include/asm-ppc/cputime.h deleted file mode 100644 index 8e9faf5ce72..00000000000 --- a/include/asm-ppc/cputime.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __PPC_CPUTIME_H -#define __PPC_CPUTIME_H - -#include <asm-generic/cputime.h> - -#endif /* __PPC_CPUTIME_H */ diff --git a/include/asm-ppc/emergency-restart.h b/include/asm-ppc/emergency-restart.h deleted file mode 100644 index 108d8c48e42..00000000000 --- a/include/asm-ppc/emergency-restart.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _ASM_EMERGENCY_RESTART_H -#define _ASM_EMERGENCY_RESTART_H - -#include <asm-generic/emergency-restart.h> - -#endif /* _ASM_EMERGENCY_RESTART_H */ diff --git a/include/asm-ppc/hdreg.h b/include/asm-ppc/hdreg.h deleted file mode 100644 index 7f7fd1af0af..00000000000 --- a/include/asm-ppc/hdreg.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/hdreg.h> diff --git a/include/asm-ppc/local.h b/include/asm-ppc/local.h deleted file mode 100644 index b08e3eced10..00000000000 --- a/include/asm-ppc/local.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __PPC_LOCAL_H -#define __PPC_LOCAL_H - -#include <asm-generic/local.h> - -#endif /* __PPC_LOCAL_H */ diff --git a/include/asm-ppc/percpu.h b/include/asm-ppc/percpu.h deleted file mode 100644 index d66667cd587..00000000000 --- a/include/asm-ppc/percpu.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __ARCH_PPC_PERCPU__ -#define __ARCH_PPC_PERCPU__ - -#include <asm-generic/percpu.h> - -#endif /* __ARCH_PPC_PERCPU__ */ diff --git a/include/asm-ppc/resource.h b/include/asm-ppc/resource.h deleted file mode 100644 index 86a1ea23a6e..00000000000 --- a/include/asm-ppc/resource.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _PPC_RESOURCE_H -#define _PPC_RESOURCE_H - -#include <asm-generic/resource.h> - -#endif diff --git a/include/asm-ppc/socket.h b/include/asm-ppc/socket.h index 4134376b0f6..296e1a3469d 100644 --- a/include/asm-ppc/socket.h +++ b/include/asm-ppc/socket.h @@ -20,6 +20,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff --git a/include/asm-ppc64/abs_addr.h b/include/asm-ppc64/abs_addr.h index 6d4e8e78705..84c24d4cdb7 100644 --- a/include/asm-ppc64/abs_addr.h +++ b/include/asm-ppc64/abs_addr.h @@ -16,93 +16,51 @@ #include <asm/page.h> #include <asm/prom.h> #include <asm/lmb.h> +#include <asm/firmware.h> -typedef u32 msChunks_entry; -struct msChunks { +struct mschunks_map { unsigned long num_chunks; unsigned long chunk_size; unsigned long chunk_shift; unsigned long chunk_mask; - msChunks_entry *abs; + u32 *mapping; }; -extern struct msChunks msChunks; +extern struct mschunks_map mschunks_map; -extern unsigned long msChunks_alloc(unsigned long, unsigned long, unsigned long); -extern unsigned long reloc_offset(void); +/* Chunks are 256 KB */ +#define MSCHUNKS_CHUNK_SHIFT (18) +#define MSCHUNKS_CHUNK_SIZE (1UL << MSCHUNKS_CHUNK_SHIFT) +#define MSCHUNKS_OFFSET_MASK (MSCHUNKS_CHUNK_SIZE - 1) -#ifdef CONFIG_MSCHUNKS - -static inline unsigned long -chunk_to_addr(unsigned long chunk) +static inline unsigned long chunk_to_addr(unsigned long chunk) { - unsigned long offset = reloc_offset(); - struct msChunks *_msChunks = PTRRELOC(&msChunks); - - return chunk << _msChunks->chunk_shift; + return chunk << MSCHUNKS_CHUNK_SHIFT; } -static inline unsigned long -addr_to_chunk(unsigned long addr) +static inline unsigned long addr_to_chunk(unsigned long addr) { - unsigned long offset = reloc_offset(); - struct msChunks *_msChunks = PTRRELOC(&msChunks); - - return addr >> _msChunks->chunk_shift; + return addr >> MSCHUNKS_CHUNK_SHIFT; } -static inline unsigned long -chunk_offset(unsigned long addr) +static inline unsigned long phys_to_abs(unsigned long pa) { - unsigned long offset = reloc_offset(); - struct msChunks *_msChunks = PTRRELOC(&msChunks); + unsigned long chunk; - return addr & _msChunks->chunk_mask; -} + /* This is a no-op on non-iSeries */ + if (!firmware_has_feature(FW_FEATURE_ISERIES)) + return pa; -static inline unsigned long -abs_chunk(unsigned long pchunk) -{ - unsigned long offset = reloc_offset(); - struct msChunks *_msChunks = PTRRELOC(&msChunks); - if ( pchunk >= _msChunks->num_chunks ) { - return pchunk; - } - return PTRRELOC(_msChunks->abs)[pchunk]; -} + chunk = addr_to_chunk(pa); -/* A macro so it can take pointers or unsigned long. */ -#define phys_to_abs(pa) \ - ({ unsigned long _pa = (unsigned long)(pa); \ - chunk_to_addr(abs_chunk(addr_to_chunk(_pa))) + chunk_offset(_pa); \ - }) + if (chunk < mschunks_map.num_chunks) + chunk = mschunks_map.mapping[chunk]; -static inline unsigned long -physRpn_to_absRpn(unsigned long rpn) -{ - unsigned long pa = rpn << PAGE_SHIFT; - unsigned long aa = phys_to_abs(pa); - return (aa >> PAGE_SHIFT); + return chunk_to_addr(chunk) + (pa & MSCHUNKS_OFFSET_MASK); } -/* A macro so it can take pointers or unsigned long. */ -#define abs_to_phys(aa) lmb_abs_to_phys((unsigned long)(aa)) - -#else /* !CONFIG_MSCHUNKS */ - -#define chunk_to_addr(chunk) ((unsigned long)(chunk)) -#define addr_to_chunk(addr) (addr) -#define chunk_offset(addr) (0) -#define abs_chunk(pchunk) (pchunk) - -#define phys_to_abs(pa) (pa) -#define physRpn_to_absRpn(rpn) (rpn) -#define abs_to_phys(aa) (aa) - -#endif /* !CONFIG_MSCHUNKS */ - /* Convenience macros */ #define virt_to_abs(va) phys_to_abs(__pa(va)) -#define abs_to_virt(aa) __va(abs_to_phys(aa)) +#define abs_to_virt(aa) __va(aa) #endif /* _ABS_ADDR_H */ diff --git a/include/asm-ppc64/agp.h b/include/asm-ppc64/agp.h deleted file mode 100644 index ca9e423307f..00000000000 --- a/include/asm-ppc64/agp.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef AGP_H -#define AGP_H 1 - -#include <asm/io.h> - -/* nothing much needed here */ - -#define map_page_into_agp(page) -#define unmap_page_from_agp(page) -#define flush_agp_mappings() -#define flush_agp_cache() mb() - -/* Convert a physical address to an address suitable for the GART. */ -#define phys_to_gart(x) (x) -#define gart_to_phys(x) (x) - -/* GATT allocation. Returns/accepts GATT kernel virtual address. */ -#define alloc_gatt_pages(order) \ - ((char *)__get_free_pages(GFP_KERNEL, (order))) -#define free_gatt_pages(table, order) \ - free_pages((unsigned long)(table), (order)) - -#endif diff --git a/include/asm-ppc64/cputable.h b/include/asm-ppc64/cputable.h index d67fa9e2607..ae6cf383010 100644 --- a/include/asm-ppc64/cputable.h +++ b/include/asm-ppc64/cputable.h @@ -56,11 +56,6 @@ struct cpu_spec { * BHT, SPD, etc... from head.S before branching to identify_machine */ cpu_setup_t cpu_setup; - - /* This is used to identify firmware features which are available - * to the kernel. - */ - unsigned long firmware_features; }; extern struct cpu_spec cpu_specs[]; @@ -71,39 +66,6 @@ static inline unsigned long cpu_has_feature(unsigned long feature) return cur_cpu_spec->cpu_features & feature; } - -/* firmware feature bitmask values */ -#define FIRMWARE_MAX_FEATURES 63 - -#define FW_FEATURE_PFT (1UL<<0) -#define FW_FEATURE_TCE (1UL<<1) -#define FW_FEATURE_SPRG0 (1UL<<2) -#define FW_FEATURE_DABR (1UL<<3) -#define FW_FEATURE_COPY (1UL<<4) -#define FW_FEATURE_ASR (1UL<<5) -#define FW_FEATURE_DEBUG (1UL<<6) -#define FW_FEATURE_TERM (1UL<<7) -#define FW_FEATURE_PERF (1UL<<8) -#define FW_FEATURE_DUMP (1UL<<9) -#define FW_FEATURE_INTERRUPT (1UL<<10) -#define FW_FEATURE_MIGRATE (1UL<<11) -#define FW_FEATURE_PERFMON (1UL<<12) -#define FW_FEATURE_CRQ (1UL<<13) -#define FW_FEATURE_VIO (1UL<<14) -#define FW_FEATURE_RDMA (1UL<<15) -#define FW_FEATURE_LLAN (1UL<<16) -#define FW_FEATURE_BULK (1UL<<17) -#define FW_FEATURE_XDABR (1UL<<18) -#define FW_FEATURE_MULTITCE (1UL<<19) -#define FW_FEATURE_SPLPAR (1UL<<20) - -typedef struct { - unsigned long val; - char * name; -} firmware_feature_t; - -extern firmware_feature_t firmware_features_table[]; - #endif /* __ASSEMBLY__ */ /* CPU kernel features */ @@ -140,10 +102,8 @@ extern firmware_feature_t firmware_features_table[]; #define CPU_FTR_MMCRA_SIHV ASM_CONST(0x0000080000000000) #define CPU_FTR_CTRL ASM_CONST(0x0000100000000000) -/* Platform firmware features */ -#define FW_FTR_ ASM_CONST(0x0000000000000001) - #ifndef __ASSEMBLY__ + #define COMMON_USER_PPC64 (PPC_FEATURE_32 | PPC_FEATURE_64 | \ PPC_FEATURE_HAS_FPU | PPC_FEATURE_HAS_MMU) @@ -156,10 +116,9 @@ extern firmware_feature_t firmware_features_table[]; #define CPU_FTR_PPCAS_ARCH_V2 (CPU_FTR_PPCAS_ARCH_V2_BASE) #else #define CPU_FTR_PPCAS_ARCH_V2 (CPU_FTR_PPCAS_ARCH_V2_BASE | CPU_FTR_16M_PAGE) -#endif +#endif /* CONFIG_PPC_ISERIES */ -#define COMMON_PPC64_FW (0) -#endif +#endif /* __ASSEMBLY */ #ifdef __ASSEMBLY__ diff --git a/include/asm-ppc64/cputime.h b/include/asm-ppc64/cputime.h deleted file mode 100644 index 8e9faf5ce72..00000000000 --- a/include/asm-ppc64/cputime.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __PPC_CPUTIME_H -#define __PPC_CPUTIME_H - -#include <asm-generic/cputime.h> - -#endif /* __PPC_CPUTIME_H */ diff --git a/include/asm-ppc64/div64.h b/include/asm-ppc64/div64.h deleted file mode 100644 index 6cd978cefb2..00000000000 --- a/include/asm-ppc64/div64.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/div64.h> diff --git a/include/asm-ppc64/emergency-restart.h b/include/asm-ppc64/emergency-restart.h deleted file mode 100644 index 108d8c48e42..00000000000 --- a/include/asm-ppc64/emergency-restart.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _ASM_EMERGENCY_RESTART_H -#define _ASM_EMERGENCY_RESTART_H - -#include <asm-generic/emergency-restart.h> - -#endif /* _ASM_EMERGENCY_RESTART_H */ diff --git a/include/asm-ppc64/errno.h b/include/asm-ppc64/errno.h deleted file mode 100644 index 69bc3b0c6cb..00000000000 --- a/include/asm-ppc64/errno.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef _PPC64_ERRNO_H -#define _PPC64_ERRNO_H - -/* - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <asm-generic/errno.h> - -#undef EDEADLOCK -#define EDEADLOCK 58 /* File locking deadlock error */ - -#define _LAST_ERRNO 516 - -#endif diff --git a/include/asm-ppc64/firmware.h b/include/asm-ppc64/firmware.h new file mode 100644 index 00000000000..22bb85cf60a --- /dev/null +++ b/include/asm-ppc64/firmware.h @@ -0,0 +1,101 @@ +/* + * include/asm-ppc64/firmware.h + * + * Extracted from include/asm-ppc64/cputable.h + * + * Copyright (C) 2001 Ben. Herrenschmidt (benh@kernel.crashing.org) + * + * Modifications for ppc64: + * Copyright (C) 2003 Dave Engebretsen <engebret@us.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#ifndef __ASM_PPC_FIRMWARE_H +#define __ASM_PPC_FIRMWARE_H + +#ifdef __KERNEL__ + +#ifndef __ASSEMBLY__ + +/* firmware feature bitmask values */ +#define FIRMWARE_MAX_FEATURES 63 + +#define FW_FEATURE_PFT (1UL<<0) +#define FW_FEATURE_TCE (1UL<<1) +#define FW_FEATURE_SPRG0 (1UL<<2) +#define FW_FEATURE_DABR (1UL<<3) +#define FW_FEATURE_COPY (1UL<<4) +#define FW_FEATURE_ASR (1UL<<5) +#define FW_FEATURE_DEBUG (1UL<<6) +#define FW_FEATURE_TERM (1UL<<7) +#define FW_FEATURE_PERF (1UL<<8) +#define FW_FEATURE_DUMP (1UL<<9) +#define FW_FEATURE_INTERRUPT (1UL<<10) +#define FW_FEATURE_MIGRATE (1UL<<11) +#define FW_FEATURE_PERFMON (1UL<<12) +#define FW_FEATURE_CRQ (1UL<<13) +#define FW_FEATURE_VIO (1UL<<14) +#define FW_FEATURE_RDMA (1UL<<15) +#define FW_FEATURE_LLAN (1UL<<16) +#define FW_FEATURE_BULK (1UL<<17) +#define FW_FEATURE_XDABR (1UL<<18) +#define FW_FEATURE_MULTITCE (1UL<<19) +#define FW_FEATURE_SPLPAR (1UL<<20) +#define FW_FEATURE_ISERIES (1UL<<21) + +enum { + FW_FEATURE_PSERIES_POSSIBLE = FW_FEATURE_PFT | FW_FEATURE_TCE | + FW_FEATURE_SPRG0 | FW_FEATURE_DABR | FW_FEATURE_COPY | + FW_FEATURE_ASR | FW_FEATURE_DEBUG | FW_FEATURE_TERM | + FW_FEATURE_PERF | FW_FEATURE_DUMP | FW_FEATURE_INTERRUPT | + FW_FEATURE_MIGRATE | FW_FEATURE_PERFMON | FW_FEATURE_CRQ | + FW_FEATURE_VIO | FW_FEATURE_RDMA | FW_FEATURE_LLAN | + FW_FEATURE_BULK | FW_FEATURE_XDABR | FW_FEATURE_MULTITCE | + FW_FEATURE_SPLPAR, + FW_FEATURE_PSERIES_ALWAYS = 0, + FW_FEATURE_ISERIES_POSSIBLE = FW_FEATURE_ISERIES, + FW_FEATURE_ISERIES_ALWAYS = FW_FEATURE_ISERIES, + FW_FEATURE_POSSIBLE = +#ifdef CONFIG_PPC_PSERIES + FW_FEATURE_PSERIES_POSSIBLE | +#endif +#ifdef CONFIG_PPC_ISERIES + FW_FEATURE_ISERIES_POSSIBLE | +#endif + 0, + FW_FEATURE_ALWAYS = +#ifdef CONFIG_PPC_PSERIES + FW_FEATURE_PSERIES_ALWAYS & +#endif +#ifdef CONFIG_PPC_ISERIES + FW_FEATURE_ISERIES_ALWAYS & +#endif + FW_FEATURE_POSSIBLE, +}; + +/* This is used to identify firmware features which are available + * to the kernel. + */ +extern unsigned long ppc64_firmware_features; + +static inline unsigned long firmware_has_feature(unsigned long feature) +{ + return (FW_FEATURE_ALWAYS & feature) || + (FW_FEATURE_POSSIBLE & ppc64_firmware_features & feature); +} + +#ifdef CONFIG_PPC_PSERIES +typedef struct { + unsigned long val; + char * name; +} firmware_feature_t; + +extern firmware_feature_t firmware_features_table[]; +#endif + +#endif /* __ASSEMBLY__ */ +#endif /* __KERNEL__ */ +#endif /* __ASM_PPC_FIRMWARE_H */ diff --git a/include/asm-ppc64/hdreg.h b/include/asm-ppc64/hdreg.h deleted file mode 100644 index 7f7fd1af0af..00000000000 --- a/include/asm-ppc64/hdreg.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/hdreg.h> diff --git a/include/asm-ppc64/imalloc.h b/include/asm-ppc64/imalloc.h index e46ff68a6e4..42adf7033a8 100644 --- a/include/asm-ppc64/imalloc.h +++ b/include/asm-ppc64/imalloc.h @@ -6,7 +6,7 @@ */ #define PHBS_IO_BASE VMALLOC_END #define IMALLOC_BASE (PHBS_IO_BASE + 0x80000000ul) /* Reserve 2 gigs for PHBs */ -#define IMALLOC_END (VMALLOC_START + EADDR_MASK) +#define IMALLOC_END (VMALLOC_START + PGTABLE_RANGE) /* imalloc region types */ diff --git a/include/asm-ppc64/ioctl.h b/include/asm-ppc64/ioctl.h deleted file mode 100644 index 42b8c5da7fb..00000000000 --- a/include/asm-ppc64/ioctl.h +++ /dev/null @@ -1,74 +0,0 @@ -#ifndef _PPC64_IOCTL_H -#define _PPC64_IOCTL_H - - -/* - * This was copied from the alpha as it's a bit cleaner there. - * -- Cort - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#define _IOC_NRBITS 8 -#define _IOC_TYPEBITS 8 -#define _IOC_SIZEBITS 13 -#define _IOC_DIRBITS 3 - -#define _IOC_NRMASK ((1 << _IOC_NRBITS)-1) -#define _IOC_TYPEMASK ((1 << _IOC_TYPEBITS)-1) -#define _IOC_SIZEMASK ((1 << _IOC_SIZEBITS)-1) -#define _IOC_DIRMASK ((1 << _IOC_DIRBITS)-1) - -#define _IOC_NRSHIFT 0 -#define _IOC_TYPESHIFT (_IOC_NRSHIFT+_IOC_NRBITS) -#define _IOC_SIZESHIFT (_IOC_TYPESHIFT+_IOC_TYPEBITS) -#define _IOC_DIRSHIFT (_IOC_SIZESHIFT+_IOC_SIZEBITS) - -/* - * Direction bits _IOC_NONE could be 0, but OSF/1 gives it a bit. - * And this turns out useful to catch old ioctl numbers in header - * files for us. - */ -#define _IOC_NONE 1U -#define _IOC_READ 2U -#define _IOC_WRITE 4U - -#define _IOC(dir,type,nr,size) \ - (((dir) << _IOC_DIRSHIFT) | \ - ((type) << _IOC_TYPESHIFT) | \ - ((nr) << _IOC_NRSHIFT) | \ - ((size) << _IOC_SIZESHIFT)) - -/* provoke compile error for invalid uses of size argument */ -extern unsigned int __invalid_size_argument_for_IOC; -#define _IOC_TYPECHECK(t) \ - ((sizeof(t) == sizeof(t[1]) && \ - sizeof(t) < (1 << _IOC_SIZEBITS)) ? \ - sizeof(t) : __invalid_size_argument_for_IOC) - -/* used to create numbers */ -#define _IO(type,nr) _IOC(_IOC_NONE,(type),(nr),0) -#define _IOR(type,nr,size) _IOC(_IOC_READ,(type),(nr),(_IOC_TYPECHECK(size))) -#define _IOW(type,nr,size) _IOC(_IOC_WRITE,(type),(nr),(_IOC_TYPECHECK(size))) -#define _IOWR(type,nr,size) _IOC(_IOC_READ|_IOC_WRITE,(type),(nr),(_IOC_TYPECHECK(size))) -#define _IOR_BAD(type,nr,size) _IOC(_IOC_READ,(type),(nr),sizeof(size)) -#define _IOW_BAD(type,nr,size) _IOC(_IOC_WRITE,(type),(nr),sizeof(size)) -#define _IOWR_BAD(type,nr,size) _IOC(_IOC_READ|_IOC_WRITE,(type),(nr),sizeof(size)) - -/* used to decode them.. */ -#define _IOC_DIR(nr) (((nr) >> _IOC_DIRSHIFT) & _IOC_DIRMASK) -#define _IOC_TYPE(nr) (((nr) >> _IOC_TYPESHIFT) & _IOC_TYPEMASK) -#define _IOC_NR(nr) (((nr) >> _IOC_NRSHIFT) & _IOC_NRMASK) -#define _IOC_SIZE(nr) (((nr) >> _IOC_SIZESHIFT) & _IOC_SIZEMASK) - -/* various drivers, such as the pcmcia stuff, need these... */ -#define IOC_IN (_IOC_WRITE << _IOC_DIRSHIFT) -#define IOC_OUT (_IOC_READ << _IOC_DIRSHIFT) -#define IOC_INOUT ((_IOC_WRITE|_IOC_READ) << _IOC_DIRSHIFT) -#define IOCSIZE_MASK (_IOC_SIZEMASK << _IOC_SIZESHIFT) -#define IOCSIZE_SHIFT (_IOC_SIZESHIFT) - -#endif /* _PPC64_IOCTL_H */ diff --git a/include/asm-ppc64/ioctls.h b/include/asm-ppc64/ioctls.h deleted file mode 100644 index 48796bf3e4f..00000000000 --- a/include/asm-ppc64/ioctls.h +++ /dev/null @@ -1,114 +0,0 @@ -#ifndef _ASM_PPC64_IOCTLS_H -#define _ASM_PPC64_IOCTLS_H - -/* - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <asm/ioctl.h> - -#define FIOCLEX _IO('f', 1) -#define FIONCLEX _IO('f', 2) -#define FIOASYNC _IOW('f', 125, int) -#define FIONBIO _IOW('f', 126, int) -#define FIONREAD _IOR('f', 127, int) -#define TIOCINQ FIONREAD -#define FIOQSIZE _IOR('f', 128, loff_t) - -#define TIOCGETP _IOR('t', 8, struct sgttyb) -#define TIOCSETP _IOW('t', 9, struct sgttyb) -#define TIOCSETN _IOW('t', 10, struct sgttyb) /* TIOCSETP wo flush */ - -#define TIOCSETC _IOW('t', 17, struct tchars) -#define TIOCGETC _IOR('t', 18, struct tchars) -#define TCGETS _IOR('t', 19, struct termios) -#define TCSETS _IOW('t', 20, struct termios) -#define TCSETSW _IOW('t', 21, struct termios) -#define TCSETSF _IOW('t', 22, struct termios) - -#define TCGETA _IOR('t', 23, struct termio) -#define TCSETA _IOW('t', 24, struct termio) -#define TCSETAW _IOW('t', 25, struct termio) -#define TCSETAF _IOW('t', 28, struct termio) - -#define TCSBRK _IO('t', 29) -#define TCXONC _IO('t', 30) -#define TCFLSH _IO('t', 31) - -#define TIOCSWINSZ _IOW('t', 103, struct winsize) -#define TIOCGWINSZ _IOR('t', 104, struct winsize) -#define TIOCSTART _IO('t', 110) /* start output, like ^Q */ -#define TIOCSTOP _IO('t', 111) /* stop output, like ^S */ -#define TIOCOUTQ _IOR('t', 115, int) /* output queue size */ - -#define TIOCGLTC _IOR('t', 116, struct ltchars) -#define TIOCSLTC _IOW('t', 117, struct ltchars) -#define TIOCSPGRP _IOW('t', 118, int) -#define TIOCGPGRP _IOR('t', 119, int) - -#define TIOCEXCL 0x540C -#define TIOCNXCL 0x540D -#define TIOCSCTTY 0x540E - -#define TIOCSTI 0x5412 -#define TIOCMGET 0x5415 -#define TIOCMBIS 0x5416 -#define TIOCMBIC 0x5417 -#define TIOCMSET 0x5418 -# define TIOCM_LE 0x001 -# define TIOCM_DTR 0x002 -# define TIOCM_RTS 0x004 -# define TIOCM_ST 0x008 -# define TIOCM_SR 0x010 -# define TIOCM_CTS 0x020 -# define TIOCM_CAR 0x040 -# define TIOCM_RNG 0x080 -# define TIOCM_DSR 0x100 -# define TIOCM_CD TIOCM_CAR -# define TIOCM_RI TIOCM_RNG - -#define TIOCGSOFTCAR 0x5419 -#define TIOCSSOFTCAR 0x541A -#define TIOCLINUX 0x541C -#define TIOCCONS 0x541D -#define TIOCGSERIAL 0x541E -#define TIOCSSERIAL 0x541F -#define TIOCPKT 0x5420 -# define TIOCPKT_DATA 0 -# define TIOCPKT_FLUSHREAD 1 -# define TIOCPKT_FLUSHWRITE 2 -# define TIOCPKT_STOP 4 -# define TIOCPKT_START 8 -# define TIOCPKT_NOSTOP 16 -# define TIOCPKT_DOSTOP 32 - - -#define TIOCNOTTY 0x5422 -#define TIOCSETD 0x5423 -#define TIOCGETD 0x5424 -#define TCSBRKP 0x5425 /* Needed for POSIX tcsendbreak() */ -#define TIOCSBRK 0x5427 /* BSD compatibility */ -#define TIOCCBRK 0x5428 /* BSD compatibility */ -#define TIOCGSID 0x5429 /* Return the session ID of FD */ -#define TIOCGPTN _IOR('T',0x30, unsigned int) /* Get Pty Number (of pty-mux device) */ -#define TIOCSPTLCK _IOW('T',0x31, int) /* Lock/unlock Pty */ - -#define TIOCSERCONFIG 0x5453 -#define TIOCSERGWILD 0x5454 -#define TIOCSERSWILD 0x5455 -#define TIOCGLCKTRMIOS 0x5456 -#define TIOCSLCKTRMIOS 0x5457 -#define TIOCSERGSTRUCT 0x5458 /* For debugging only */ -#define TIOCSERGETLSR 0x5459 /* Get line status register */ - /* ioctl (fd, TIOCSERGETLSR, &result) where result may be as below */ -# define TIOCSER_TEMT 0x01 /* Transmitter physically empty */ -#define TIOCSERGETMULTI 0x545A /* Get multiport config */ -#define TIOCSERSETMULTI 0x545B /* Set multiport config */ - -#define TIOCMIWAIT 0x545C /* wait for a change on serial input line(s) */ -#define TIOCGICOUNT 0x545D /* read serial port inline interrupt counts */ - -#endif /* _ASM_PPC64_IOCTLS_H */ diff --git a/include/asm-ppc64/iommu.h b/include/asm-ppc64/iommu.h index 729de5cc21d..72dcf8116b0 100644 --- a/include/asm-ppc64/iommu.h +++ b/include/asm-ppc64/iommu.h @@ -104,9 +104,6 @@ extern void iommu_devnode_init_pSeries(struct device_node *dn); #ifdef CONFIG_PPC_ISERIES -/* Initializes tables for bio buses */ -extern void __init iommu_vio_init(void); - struct iSeries_Device_Node; /* Creates table for an individual device node */ extern void iommu_devnode_init_iSeries(struct iSeries_Device_Node *dn); diff --git a/include/asm-ppc64/ipc.h b/include/asm-ppc64/ipc.h deleted file mode 100644 index a46e3d9c2a3..00000000000 --- a/include/asm-ppc64/ipc.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/ipc.h> diff --git a/include/asm-ppc64/linkage.h b/include/asm-ppc64/linkage.h deleted file mode 100644 index 291c2d01c44..00000000000 --- a/include/asm-ppc64/linkage.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __ASM_LINKAGE_H -#define __ASM_LINKAGE_H - -/* Nothing to see here... */ - -#endif diff --git a/include/asm-ppc64/lmb.h b/include/asm-ppc64/lmb.h index a6cbca21ac1..cb368bf0f26 100644 --- a/include/asm-ppc64/lmb.h +++ b/include/asm-ppc64/lmb.h @@ -22,7 +22,6 @@ struct lmb_property { unsigned long base; - unsigned long physbase; unsigned long size; }; diff --git a/include/asm-ppc64/machdep.h b/include/asm-ppc64/machdep.h index f0ef0637594..ff2c9287d3b 100644 --- a/include/asm-ppc64/machdep.h +++ b/include/asm-ppc64/machdep.h @@ -140,6 +140,9 @@ struct machdep_calls { /* Idle loop for this platform, leave empty for default idle loop */ int (*idle_loop)(void); + + /* Function to enable pmcs for this platform, called once per cpu. */ + void (*enable_pmcs)(void); }; extern int default_idle(void); diff --git a/include/asm-ppc64/mmu.h b/include/asm-ppc64/mmu.h index 70348a85131..ad36bb28de2 100644 --- a/include/asm-ppc64/mmu.h +++ b/include/asm-ppc64/mmu.h @@ -28,9 +28,12 @@ #define STE_VSID_SHIFT 12 /* Location of cpu0's segment table */ -#define STAB0_PAGE 0x9 +#define STAB0_PAGE 0x6 #define STAB0_PHYS_ADDR (STAB0_PAGE<<PAGE_SHIFT) -#define STAB0_VIRT_ADDR (KERNELBASE+STAB0_PHYS_ADDR) + +#ifndef __ASSEMBLY__ +extern char initial_stab[]; +#endif /* ! __ASSEMBLY */ /* * SLB @@ -259,8 +262,10 @@ extern void stabs_alloc(void); #define VSID_BITS 36 #define VSID_MODULUS ((1UL<<VSID_BITS)-1) -#define CONTEXT_BITS 20 -#define USER_ESID_BITS 15 +#define CONTEXT_BITS 19 +#define USER_ESID_BITS 16 + +#define USER_VSID_RANGE (1UL << (USER_ESID_BITS + SID_SHIFT)) /* * This macro generates asm code to compute the VSID scramble @@ -302,8 +307,7 @@ typedef unsigned long mm_context_id_t; typedef struct { mm_context_id_t id; #ifdef CONFIG_HUGETLB_PAGE - pgd_t *huge_pgdir; - u16 htlb_segs; /* bitmask */ + u16 low_htlb_areas, high_htlb_areas; #endif } mm_context_t; diff --git a/include/asm-ppc64/naca.h b/include/asm-ppc64/naca.h index bfb7caa32ea..d2afe644759 100644 --- a/include/asm-ppc64/naca.h +++ b/include/asm-ppc64/naca.h @@ -12,8 +12,6 @@ #include <asm/types.h> -#ifndef __ASSEMBLY__ - struct naca_struct { /* Kernel only data - undefined for user space */ void *xItVpdAreas; /* VPD Data 0x00 */ @@ -23,9 +21,4 @@ struct naca_struct { extern struct naca_struct naca; -#endif /* __ASSEMBLY__ */ - -#define NACA_PAGE 0x4 -#define NACA_PHYS_ADDR (NACA_PAGE<<PAGE_SHIFT) - #endif /* _NACA_H */ diff --git a/include/asm-ppc64/namei.h b/include/asm-ppc64/namei.h deleted file mode 100644 index a1412a2d102..00000000000 --- a/include/asm-ppc64/namei.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * linux/include/asm-ppc/namei.h - * Adapted from linux/include/asm-alpha/namei.h - * - * Included from linux/fs/namei.c - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#ifndef __PPC64_NAMEI_H -#define __PPC64_NAMEI_H - -/* This dummy routine maybe changed to something useful - * for /usr/gnemul/ emulation stuff. - * Look at asm-sparc/namei.h for details. - */ - -#define __emul_prefix() NULL - -#endif /* __PPC64_NAMEI_H */ diff --git a/include/asm-ppc64/page.h b/include/asm-ppc64/page.h index a5893a305a0..a79a08df62b 100644 --- a/include/asm-ppc64/page.h +++ b/include/asm-ppc64/page.h @@ -37,39 +37,45 @@ #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) -/* For 64-bit processes the hugepage range is 1T-1.5T */ -#define TASK_HPAGE_BASE ASM_CONST(0x0000010000000000) -#define TASK_HPAGE_END ASM_CONST(0x0000018000000000) +#define HTLB_AREA_SHIFT 40 +#define HTLB_AREA_SIZE (1UL << HTLB_AREA_SHIFT) +#define GET_HTLB_AREA(x) ((x) >> HTLB_AREA_SHIFT) #define LOW_ESID_MASK(addr, len) (((1U << (GET_ESID(addr+len-1)+1)) \ - (1U << GET_ESID(addr))) & 0xffff) +#define HTLB_AREA_MASK(addr, len) (((1U << (GET_HTLB_AREA(addr+len-1)+1)) \ + - (1U << GET_HTLB_AREA(addr))) & 0xffff) #define ARCH_HAS_HUGEPAGE_ONLY_RANGE #define ARCH_HAS_PREPARE_HUGEPAGE_RANGE +#define ARCH_HAS_SETCLEAR_HUGE_PTE #define touches_hugepage_low_range(mm, addr, len) \ - (LOW_ESID_MASK((addr), (len)) & mm->context.htlb_segs) -#define touches_hugepage_high_range(addr, len) \ - (((addr) > (TASK_HPAGE_BASE-(len))) && ((addr) < TASK_HPAGE_END)) + (LOW_ESID_MASK((addr), (len)) & (mm)->context.low_htlb_areas) +#define touches_hugepage_high_range(mm, addr, len) \ + (HTLB_AREA_MASK((addr), (len)) & (mm)->context.high_htlb_areas) #define __within_hugepage_low_range(addr, len, segmask) \ ((LOW_ESID_MASK((addr), (len)) | (segmask)) == (segmask)) #define within_hugepage_low_range(addr, len) \ __within_hugepage_low_range((addr), (len), \ - current->mm->context.htlb_segs) -#define within_hugepage_high_range(addr, len) (((addr) >= TASK_HPAGE_BASE) \ - && ((addr)+(len) <= TASK_HPAGE_END) && ((addr)+(len) >= (addr))) + current->mm->context.low_htlb_areas) +#define __within_hugepage_high_range(addr, len, zonemask) \ + ((HTLB_AREA_MASK((addr), (len)) | (zonemask)) == (zonemask)) +#define within_hugepage_high_range(addr, len) \ + __within_hugepage_high_range((addr), (len), \ + current->mm->context.high_htlb_areas) #define is_hugepage_only_range(mm, addr, len) \ - (touches_hugepage_high_range((addr), (len)) || \ + (touches_hugepage_high_range((mm), (addr), (len)) || \ touches_hugepage_low_range((mm), (addr), (len))) #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA #define in_hugepage_area(context, addr) \ (cpu_has_feature(CPU_FTR_16M_PAGE) && \ - ( (((addr) >= TASK_HPAGE_BASE) && ((addr) < TASK_HPAGE_END)) || \ + ( ((1 << GET_HTLB_AREA(addr)) & (context).high_htlb_areas) || \ ( ((addr) < 0x100000000L) && \ - ((1 << GET_ESID(addr)) & (context).htlb_segs) ) ) ) + ((1 << GET_ESID(addr)) & (context).low_htlb_areas) ) ) ) #else /* !CONFIG_HUGETLB_PAGE */ @@ -125,36 +131,42 @@ extern void copy_user_page(void *to, void *from, unsigned long vaddr, struct pag * Entries in the pte table are 64b, while entries in the pgd & pmd are 32b. */ typedef struct { unsigned long pte; } pte_t; -typedef struct { unsigned int pmd; } pmd_t; -typedef struct { unsigned int pgd; } pgd_t; +typedef struct { unsigned long pmd; } pmd_t; +typedef struct { unsigned long pud; } pud_t; +typedef struct { unsigned long pgd; } pgd_t; typedef struct { unsigned long pgprot; } pgprot_t; #define pte_val(x) ((x).pte) #define pmd_val(x) ((x).pmd) +#define pud_val(x) ((x).pud) #define pgd_val(x) ((x).pgd) #define pgprot_val(x) ((x).pgprot) -#define __pte(x) ((pte_t) { (x) } ) -#define __pmd(x) ((pmd_t) { (x) } ) -#define __pgd(x) ((pgd_t) { (x) } ) -#define __pgprot(x) ((pgprot_t) { (x) } ) +#define __pte(x) ((pte_t) { (x) }) +#define __pmd(x) ((pmd_t) { (x) }) +#define __pud(x) ((pud_t) { (x) }) +#define __pgd(x) ((pgd_t) { (x) }) +#define __pgprot(x) ((pgprot_t) { (x) }) #else /* * .. while these make it easier on the compiler */ typedef unsigned long pte_t; -typedef unsigned int pmd_t; -typedef unsigned int pgd_t; +typedef unsigned long pmd_t; +typedef unsigned long pud_t; +typedef unsigned long pgd_t; typedef unsigned long pgprot_t; #define pte_val(x) (x) #define pmd_val(x) (x) +#define pud_val(x) (x) #define pgd_val(x) (x) #define pgprot_val(x) (x) #define __pte(x) (x) #define __pmd(x) (x) +#define __pud(x) (x) #define __pgd(x) (x) #define __pgprot(x) (x) @@ -208,9 +220,6 @@ extern u64 ppc64_pft_size; /* Log 2 of page table size */ #define USER_REGION_ID (0UL) #define REGION_ID(ea) (((unsigned long)(ea)) >> REGION_SHIFT) -#define __bpn_to_ba(x) ((((unsigned long)(x)) << PAGE_SHIFT) + KERNELBASE) -#define __ba_to_bpn(x) ((((unsigned long)(x)) & ~REGION_MASK) >> PAGE_SHIFT) - #define __va(x) ((void *)((unsigned long)(x) + KERNELBASE)) #ifdef CONFIG_DISCONTIGMEM diff --git a/include/asm-ppc64/param.h b/include/asm-ppc64/param.h index 1fad38dcf70..76c212d475b 100644 --- a/include/asm-ppc64/param.h +++ b/include/asm-ppc64/param.h @@ -1,6 +1,8 @@ #ifndef _ASM_PPC64_PARAM_H #define _ASM_PPC64_PARAM_H +#include <linux/config.h> + /* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -9,7 +11,7 @@ */ #ifdef __KERNEL__ -# define HZ 1000 /* Internal kernel timer frequency */ +# define HZ CONFIG_HZ /* Internal kernel timer frequency */ # define USER_HZ 100 /* .. some user interfaces are in "ticks" */ # define CLOCKS_PER_SEC (USER_HZ) /* like times() */ #endif diff --git a/include/asm-ppc64/percpu.h b/include/asm-ppc64/percpu.h deleted file mode 100644 index 60a659a4ce1..00000000000 --- a/include/asm-ppc64/percpu.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __ARCH_PPC64_PERCPU__ -#define __ARCH_PPC64_PERCPU__ - -#include <asm-generic/percpu.h> - -#endif /* __ARCH_PPC64_PERCPU__ */ diff --git a/include/asm-ppc64/pgalloc.h b/include/asm-ppc64/pgalloc.h index 4fc4b739b38..26bc49c1108 100644 --- a/include/asm-ppc64/pgalloc.h +++ b/include/asm-ppc64/pgalloc.h @@ -6,7 +6,12 @@ #include <linux/cpumask.h> #include <linux/percpu.h> -extern kmem_cache_t *zero_cache; +extern kmem_cache_t *pgtable_cache[]; + +#define PTE_CACHE_NUM 0 +#define PMD_CACHE_NUM 1 +#define PUD_CACHE_NUM 1 +#define PGD_CACHE_NUM 0 /* * This program is free software; you can redistribute it and/or @@ -15,30 +20,40 @@ extern kmem_cache_t *zero_cache; * 2 of the License, or (at your option) any later version. */ -static inline pgd_t * -pgd_alloc(struct mm_struct *mm) +static inline pgd_t *pgd_alloc(struct mm_struct *mm) { - return kmem_cache_alloc(zero_cache, GFP_KERNEL); + return kmem_cache_alloc(pgtable_cache[PGD_CACHE_NUM], GFP_KERNEL); } -static inline void -pgd_free(pgd_t *pgd) +static inline void pgd_free(pgd_t *pgd) { - kmem_cache_free(zero_cache, pgd); + kmem_cache_free(pgtable_cache[PGD_CACHE_NUM], pgd); +} + +#define pgd_populate(MM, PGD, PUD) pgd_set(PGD, PUD) + +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + return kmem_cache_alloc(pgtable_cache[PUD_CACHE_NUM], + GFP_KERNEL|__GFP_REPEAT); +} + +static inline void pud_free(pud_t *pud) +{ + kmem_cache_free(pgtable_cache[PUD_CACHE_NUM], pud); } #define pud_populate(MM, PUD, PMD) pud_set(PUD, PMD) -static inline pmd_t * -pmd_alloc_one(struct mm_struct *mm, unsigned long addr) +static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { - return kmem_cache_alloc(zero_cache, GFP_KERNEL|__GFP_REPEAT); + return kmem_cache_alloc(pgtable_cache[PMD_CACHE_NUM], + GFP_KERNEL|__GFP_REPEAT); } -static inline void -pmd_free(pmd_t *pmd) +static inline void pmd_free(pmd_t *pmd) { - kmem_cache_free(zero_cache, pmd); + kmem_cache_free(pgtable_cache[PMD_CACHE_NUM], pmd); } #define pmd_populate_kernel(mm, pmd, pte) pmd_set(pmd, pte) @@ -47,44 +62,58 @@ pmd_free(pmd_t *pmd) static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - return kmem_cache_alloc(zero_cache, GFP_KERNEL|__GFP_REPEAT); + return kmem_cache_alloc(pgtable_cache[PTE_CACHE_NUM], + GFP_KERNEL|__GFP_REPEAT); } static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) { - pte_t *pte = kmem_cache_alloc(zero_cache, GFP_KERNEL|__GFP_REPEAT); - if (pte) - return virt_to_page(pte); - return NULL; + return virt_to_page(pte_alloc_one_kernel(mm, address)); } static inline void pte_free_kernel(pte_t *pte) { - kmem_cache_free(zero_cache, pte); + kmem_cache_free(pgtable_cache[PTE_CACHE_NUM], pte); } static inline void pte_free(struct page *ptepage) { - kmem_cache_free(zero_cache, page_address(ptepage)); + pte_free_kernel(page_address(ptepage)); } -struct pte_freelist_batch +#define PGF_CACHENUM_MASK 0xf + +typedef struct pgtable_free { + unsigned long val; +} pgtable_free_t; + +static inline pgtable_free_t pgtable_free_cache(void *p, int cachenum, + unsigned long mask) { - struct rcu_head rcu; - unsigned int index; - struct page * pages[0]; -}; + BUG_ON(cachenum > PGF_CACHENUM_MASK); -#define PTE_FREELIST_SIZE ((PAGE_SIZE - sizeof(struct pte_freelist_batch)) / \ - sizeof(struct page *)) + return (pgtable_free_t){.val = ((unsigned long) p & ~mask) | cachenum}; +} -extern void pte_free_now(struct page *ptepage); -extern void pte_free_submit(struct pte_freelist_batch *batch); +static inline void pgtable_free(pgtable_free_t pgf) +{ + void *p = (void *)(pgf.val & ~PGF_CACHENUM_MASK); + int cachenum = pgf.val & PGF_CACHENUM_MASK; -DECLARE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur); + kmem_cache_free(pgtable_cache[cachenum], p); +} -void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage); -#define __pmd_free_tlb(tlb, pmd) __pte_free_tlb(tlb, virt_to_page(pmd)) +void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf); + +#define __pte_free_tlb(tlb, ptepage) \ + pgtable_free_tlb(tlb, pgtable_free_cache(page_address(ptepage), \ + PTE_CACHE_NUM, PTE_TABLE_SIZE-1)) +#define __pmd_free_tlb(tlb, pmd) \ + pgtable_free_tlb(tlb, pgtable_free_cache(pmd, \ + PMD_CACHE_NUM, PMD_TABLE_SIZE-1)) +#define __pud_free_tlb(tlb, pmd) \ + pgtable_free_tlb(tlb, pgtable_free_cache(pud, \ + PUD_CACHE_NUM, PUD_TABLE_SIZE-1)) #define check_pgt_cache() do { } while (0) diff --git a/include/asm-ppc64/pgtable.h b/include/asm-ppc64/pgtable.h index 46cf61c2ff6..c83679c9d2b 100644 --- a/include/asm-ppc64/pgtable.h +++ b/include/asm-ppc64/pgtable.h @@ -15,19 +15,24 @@ #include <asm/tlbflush.h> #endif /* __ASSEMBLY__ */ -#include <asm-generic/pgtable-nopud.h> - /* * Entries per page directory level. The PTE level must use a 64b record * for each page table entry. The PMD and PGD level use a 32b record for * each entry by assuming that each entry is page aligned. */ #define PTE_INDEX_SIZE 9 -#define PMD_INDEX_SIZE 10 -#define PGD_INDEX_SIZE 10 +#define PMD_INDEX_SIZE 7 +#define PUD_INDEX_SIZE 7 +#define PGD_INDEX_SIZE 9 + +#define PTE_TABLE_SIZE (sizeof(pte_t) << PTE_INDEX_SIZE) +#define PMD_TABLE_SIZE (sizeof(pmd_t) << PMD_INDEX_SIZE) +#define PUD_TABLE_SIZE (sizeof(pud_t) << PUD_INDEX_SIZE) +#define PGD_TABLE_SIZE (sizeof(pgd_t) << PGD_INDEX_SIZE) #define PTRS_PER_PTE (1 << PTE_INDEX_SIZE) #define PTRS_PER_PMD (1 << PMD_INDEX_SIZE) +#define PTRS_PER_PUD (1 << PMD_INDEX_SIZE) #define PTRS_PER_PGD (1 << PGD_INDEX_SIZE) /* PMD_SHIFT determines what a second-level page table entry can map */ @@ -35,8 +40,13 @@ #define PMD_SIZE (1UL << PMD_SHIFT) #define PMD_MASK (~(PMD_SIZE-1)) -/* PGDIR_SHIFT determines what a third-level page table entry can map */ -#define PGDIR_SHIFT (PMD_SHIFT + PMD_INDEX_SIZE) +/* PUD_SHIFT determines what a third-level page table entry can map */ +#define PUD_SHIFT (PMD_SHIFT + PMD_INDEX_SIZE) +#define PUD_SIZE (1UL << PUD_SHIFT) +#define PUD_MASK (~(PUD_SIZE-1)) + +/* PGDIR_SHIFT determines what a fourth-level page table entry can map */ +#define PGDIR_SHIFT (PUD_SHIFT + PUD_INDEX_SIZE) #define PGDIR_SIZE (1UL << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE-1)) @@ -45,15 +55,23 @@ /* * Size of EA range mapped by our pagetables. */ -#define EADDR_SIZE (PTE_INDEX_SIZE + PMD_INDEX_SIZE + \ - PGD_INDEX_SIZE + PAGE_SHIFT) -#define EADDR_MASK ((1UL << EADDR_SIZE) - 1) +#define PGTABLE_EADDR_SIZE (PTE_INDEX_SIZE + PMD_INDEX_SIZE + \ + PUD_INDEX_SIZE + PGD_INDEX_SIZE + PAGE_SHIFT) +#define PGTABLE_RANGE (1UL << PGTABLE_EADDR_SIZE) + +#if TASK_SIZE_USER64 > PGTABLE_RANGE +#error TASK_SIZE_USER64 exceeds pagetable range +#endif + +#if TASK_SIZE_USER64 > (1UL << (USER_ESID_BITS + SID_SHIFT)) +#error TASK_SIZE_USER64 exceeds user VSID range +#endif /* * Define the address range of the vmalloc VM area. */ #define VMALLOC_START (0xD000000000000000ul) -#define VMALLOC_SIZE (0x10000000000UL) +#define VMALLOC_SIZE (0x80000000000UL) #define VMALLOC_END (VMALLOC_START + VMALLOC_SIZE) /* @@ -154,8 +172,6 @@ extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)]; #ifndef __ASSEMBLY__ int hash_huge_page(struct mm_struct *mm, unsigned long access, unsigned long ea, unsigned long vsid, int local); - -void hugetlb_mm_free_pgd(struct mm_struct *mm); #endif /* __ASSEMBLY__ */ #define HAVE_ARCH_UNMAPPED_AREA @@ -163,7 +179,6 @@ void hugetlb_mm_free_pgd(struct mm_struct *mm); #else #define hash_huge_page(mm,a,ea,vsid,local) -1 -#define hugetlb_mm_free_pgd(mm) do {} while (0) #endif @@ -197,39 +212,45 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot) #define pte_pfn(x) ((unsigned long)((pte_val(x) >> PTE_SHIFT))) #define pte_page(x) pfn_to_page(pte_pfn(x)) -#define pmd_set(pmdp, ptep) \ - (pmd_val(*(pmdp)) = __ba_to_bpn(ptep)) +#define pmd_set(pmdp, ptep) ({BUG_ON((u64)ptep < KERNELBASE); pmd_val(*(pmdp)) = (unsigned long)(ptep);}) #define pmd_none(pmd) (!pmd_val(pmd)) #define pmd_bad(pmd) (pmd_val(pmd) == 0) #define pmd_present(pmd) (pmd_val(pmd) != 0) #define pmd_clear(pmdp) (pmd_val(*(pmdp)) = 0) -#define pmd_page_kernel(pmd) (__bpn_to_ba(pmd_val(pmd))) +#define pmd_page_kernel(pmd) (pmd_val(pmd)) #define pmd_page(pmd) virt_to_page(pmd_page_kernel(pmd)) -#define pud_set(pudp, pmdp) (pud_val(*(pudp)) = (__ba_to_bpn(pmdp))) +#define pud_set(pudp, pmdp) (pud_val(*(pudp)) = (unsigned long)(pmdp)) #define pud_none(pud) (!pud_val(pud)) -#define pud_bad(pud) ((pud_val(pud)) == 0UL) -#define pud_present(pud) (pud_val(pud) != 0UL) -#define pud_clear(pudp) (pud_val(*(pudp)) = 0UL) -#define pud_page(pud) (__bpn_to_ba(pud_val(pud))) +#define pud_bad(pud) ((pud_val(pud)) == 0) +#define pud_present(pud) (pud_val(pud) != 0) +#define pud_clear(pudp) (pud_val(*(pudp)) = 0) +#define pud_page(pud) (pud_val(pud)) + +#define pgd_set(pgdp, pudp) ({pgd_val(*(pgdp)) = (unsigned long)(pudp);}) +#define pgd_none(pgd) (!pgd_val(pgd)) +#define pgd_bad(pgd) (pgd_val(pgd) == 0) +#define pgd_present(pgd) (pgd_val(pgd) != 0) +#define pgd_clear(pgdp) (pgd_val(*(pgdp)) = 0) +#define pgd_page(pgd) (pgd_val(pgd)) /* * Find an entry in a page-table-directory. We combine the address region * (the high order N bits) and the pgd portion of the address. */ /* to avoid overflow in free_pgtables we don't use PTRS_PER_PGD here */ -#define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & 0x7ff) +#define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & 0x1ff) #define pgd_offset(mm, address) ((mm)->pgd + pgd_index(address)) -/* Find an entry in the second-level page table.. */ +#define pud_offset(pgdp, addr) \ + (((pud_t *) pgd_page(*(pgdp))) + (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))) + #define pmd_offset(pudp,addr) \ - ((pmd_t *) pud_page(*(pudp)) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))) + (((pmd_t *) pud_page(*(pudp))) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))) -/* Find an entry in the third-level page table.. */ #define pte_offset_kernel(dir,addr) \ - ((pte_t *) pmd_page_kernel(*(dir)) \ - + (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))) + (((pte_t *) pmd_page_kernel(*(dir))) + (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))) #define pte_offset_map(dir,addr) pte_offset_kernel((dir), (addr)) #define pte_offset_map_nested(dir,addr) pte_offset_kernel((dir), (addr)) @@ -458,23 +479,20 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr, #define pte_same(A,B) (((pte_val(A) ^ pte_val(B)) & ~_PAGE_HPTEFLAGS) == 0) #define pmd_ERROR(e) \ - printk("%s:%d: bad pmd %08x.\n", __FILE__, __LINE__, pmd_val(e)) + printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pmd_val(e)) +#define pud_ERROR(e) \ + printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pud_val(e)) #define pgd_ERROR(e) \ - printk("%s:%d: bad pgd %08x.\n", __FILE__, __LINE__, pgd_val(e)) + printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) extern pgd_t swapper_pg_dir[]; extern void paging_init(void); -/* - * Because the huge pgtables are only 2 level, they can take - * at most around 4M, much less than one hugepage which the - * process is presumably entitled to use. So we don't bother - * freeing up the pagetables on unmap, and wait until - * destroy_context() to clean up the lot. - */ +#ifdef CONFIG_HUGETLB_PAGE #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \ - do { } while (0) + free_pgd_range(tlb, addr, end, floor, ceiling) +#endif /* * This gets called at the end of handling a page fault, when diff --git a/include/asm-ppc64/pmc.h b/include/asm-ppc64/pmc.h index c924748c0be..d1d297dbccf 100644 --- a/include/asm-ppc64/pmc.h +++ b/include/asm-ppc64/pmc.h @@ -26,4 +26,6 @@ typedef void (*perf_irq_t)(struct pt_regs *); int reserve_pmc_hardware(perf_irq_t new_perf_irq); void release_pmc_hardware(void); +void power4_enable_pmcs(void); + #endif /* _PPC64_PMC_H */ diff --git a/include/asm-ppc64/poll.h b/include/asm-ppc64/poll.h deleted file mode 100644 index 370fa3ba0db..00000000000 --- a/include/asm-ppc64/poll.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef __PPC64_POLL_H -#define __PPC64_POLL_H - -/* - * Copyright (C) 2001 PPC64 Team, IBM Corp - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#define POLLIN 0x0001 -#define POLLPRI 0x0002 -#define POLLOUT 0x0004 -#define POLLERR 0x0008 -#define POLLHUP 0x0010 -#define POLLNVAL 0x0020 -#define POLLRDNORM 0x0040 -#define POLLRDBAND 0x0080 -#define POLLWRNORM 0x0100 -#define POLLWRBAND 0x0200 -#define POLLMSG 0x0400 -#define POLLREMOVE 0x1000 - -struct pollfd { - int fd; - short events; - short revents; -}; - -#endif /* __PPC64_POLL_H */ diff --git a/include/asm-ppc64/processor.h b/include/asm-ppc64/processor.h index 352306cfb57..7bd4796f123 100644 --- a/include/asm-ppc64/processor.h +++ b/include/asm-ppc64/processor.h @@ -268,6 +268,7 @@ #define PV_970FX 0x003C #define PV_630 0x0040 #define PV_630p 0x0041 +#define PV_970MP 0x0044 #define PV_BE 0x0070 /* Platforms supported by PPC64 */ @@ -382,8 +383,8 @@ extern long kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); extern struct task_struct *last_task_used_math; extern struct task_struct *last_task_used_altivec; -/* 64-bit user address space is 41-bits (2TBs user VM) */ -#define TASK_SIZE_USER64 (0x0000020000000000UL) +/* 64-bit user address space is 44-bits (16TB user VM) */ +#define TASK_SIZE_USER64 (0x0000100000000000UL) /* * 32-bit user address space is 4GB - 1 page diff --git a/include/asm-ppc64/prom.h b/include/asm-ppc64/prom.h index 04b1a84f7ca..dc5330b3950 100644 --- a/include/asm-ppc64/prom.h +++ b/include/asm-ppc64/prom.h @@ -22,13 +22,15 @@ #define RELOC(x) (*PTRRELOC(&(x))) /* Definitions used by the flattened device tree */ -#define OF_DT_HEADER 0xd00dfeed /* 4: version, 4: total size */ -#define OF_DT_BEGIN_NODE 0x1 /* Start node: full name */ +#define OF_DT_HEADER 0xd00dfeed /* marker */ +#define OF_DT_BEGIN_NODE 0x1 /* Start of node, full name */ #define OF_DT_END_NODE 0x2 /* End node */ -#define OF_DT_PROP 0x3 /* Property: name off, size, content */ +#define OF_DT_PROP 0x3 /* Property: name off, size, + * content */ +#define OF_DT_NOP 0x4 /* nop */ #define OF_DT_END 0x9 -#define OF_DT_VERSION 1 +#define OF_DT_VERSION 0x10 /* * This is what gets passed to the kernel by prom_init or kexec @@ -54,7 +56,9 @@ struct boot_param_header u32 version; /* format version */ u32 last_comp_version; /* last compatible version */ /* version 2 fields below */ - u32 boot_cpuid_phys; /* Which physical CPU id we're booting on */ + u32 boot_cpuid_phys; /* Physical CPU id we're booting on */ + /* version 3 fields below */ + u32 dt_strings_size; /* size of the DT strings block */ }; diff --git a/include/asm-ppc64/resource.h b/include/asm-ppc64/resource.h deleted file mode 100644 index add031b9dfd..00000000000 --- a/include/asm-ppc64/resource.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _PPC64_RESOURCE_H -#define _PPC64_RESOURCE_H - -#include <asm-generic/resource.h> - -#endif /* _PPC64_RESOURCE_H */ diff --git a/include/asm-ppc64/shmparam.h b/include/asm-ppc64/shmparam.h deleted file mode 100644 index b2825ceff05..00000000000 --- a/include/asm-ppc64/shmparam.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef _PPC64_SHMPARAM_H -#define _PPC64_SHMPARAM_H - -/* - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#define SHMLBA PAGE_SIZE /* attach addr a multiple of this */ - -#endif /* _PPC64_SHMPARAM_H */ diff --git a/include/asm-ppc64/socket.h b/include/asm-ppc64/socket.h index 59e00dfc8b8..9e1af8eb2d9 100644 --- a/include/asm-ppc64/socket.h +++ b/include/asm-ppc64/socket.h @@ -21,6 +21,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff --git a/include/asm-ppc64/string.h b/include/asm-ppc64/string.h deleted file mode 100644 index eeca68ef1e9..00000000000 --- a/include/asm-ppc64/string.h +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef _PPC64_STRING_H_ -#define _PPC64_STRING_H_ - -/* - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#define __HAVE_ARCH_STRCPY -#define __HAVE_ARCH_STRNCPY -#define __HAVE_ARCH_STRLEN -#define __HAVE_ARCH_STRCMP -#define __HAVE_ARCH_STRCAT -#define __HAVE_ARCH_MEMSET -#define __HAVE_ARCH_MEMCPY -#define __HAVE_ARCH_MEMMOVE -#define __HAVE_ARCH_MEMCMP -#define __HAVE_ARCH_MEMCHR - -extern int strcasecmp(const char *, const char *); -extern int strncasecmp(const char *, const char *, int); -extern char * strcpy(char *,const char *); -extern char * strncpy(char *,const char *, __kernel_size_t); -extern __kernel_size_t strlen(const char *); -extern int strcmp(const char *,const char *); -extern char * strcat(char *, const char *); -extern void * memset(void *,int,__kernel_size_t); -extern void * memcpy(void *,const void *,__kernel_size_t); -extern void * memmove(void *,const void *,__kernel_size_t); -extern int memcmp(const void *,const void *,__kernel_size_t); -extern void * memchr(const void *,int,__kernel_size_t); - -#endif /* _PPC64_STRING_H_ */ diff --git a/include/asm-ppc64/system.h b/include/asm-ppc64/system.h index 98d120ca8a9..b9e1835351e 100644 --- a/include/asm-ppc64/system.h +++ b/include/asm-ppc64/system.h @@ -88,7 +88,7 @@ DEBUGGER_BOILERPLATE(debugger_dabr_match) DEBUGGER_BOILERPLATE(debugger_fault_handler) #ifdef CONFIG_XMON -extern void xmon_init(void); +extern void xmon_init(int enable); #endif #else @@ -302,5 +302,7 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, int size) #define arch_align_stack(x) (x) +extern unsigned long reloc_offset(void); + #endif /* __KERNEL__ */ #endif diff --git a/include/asm-ppc64/unaligned.h b/include/asm-ppc64/unaligned.h deleted file mode 100644 index 636e93c4f37..00000000000 --- a/include/asm-ppc64/unaligned.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef __PPC64_UNALIGNED_H -#define __PPC64_UNALIGNED_H - -/* - * The PowerPC can do unaligned accesses itself in big endian mode. - * - * The strange macros are there to make sure these can't - * be misused in a way that makes them not work on other - * architectures where unaligned accesses aren't as simple. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#define get_unaligned(ptr) (*(ptr)) - -#define put_unaligned(val, ptr) ((void)( *(ptr) = (val) )) - -#endif /* __PPC64_UNALIGNED_H */ diff --git a/include/asm-ppc64/vio.h b/include/asm-ppc64/vio.h index 20cd98ee633..03f1b95f433 100644 --- a/include/asm-ppc64/vio.h +++ b/include/asm-ppc64/vio.h @@ -19,13 +19,15 @@ #include <linux/errno.h> #include <linux/device.h> #include <linux/dma-mapping.h> +#include <linux/mod_devicetable.h> + #include <asm/hvcall.h> -#include <asm/prom.h> #include <asm/scatterlist.h> -/* + +/* * Architecture-specific constants for drivers to * extract attributes of the device using vio_get_attribute() -*/ + */ #define VETH_MAC_ADDR "local-mac-address" #define VETH_MCAST_FILTER_SIZE "ibm,mac-address-filters" @@ -37,64 +39,65 @@ #define VIO_IRQ_DISABLE 0UL #define VIO_IRQ_ENABLE 1UL -struct vio_dev; -struct vio_driver; -struct vio_device_id; struct iommu_table; -int vio_register_driver(struct vio_driver *drv); -void vio_unregister_driver(struct vio_driver *drv); - -#ifdef CONFIG_PPC_PSERIES -struct vio_dev * __devinit vio_register_device_node( - struct device_node *node_vdev); -#endif -void __devinit vio_unregister_device(struct vio_dev *dev); -struct vio_dev *vio_find_node(struct device_node *vnode); - -const void * vio_get_attribute(struct vio_dev *vdev, void* which, int* length); -int vio_get_irq(struct vio_dev *dev); -int vio_enable_interrupts(struct vio_dev *dev); -int vio_disable_interrupts(struct vio_dev *dev); - -extern struct dma_mapping_ops vio_dma_ops; - -extern struct bus_type vio_bus_type; - -struct vio_device_id { +/* + * The vio_dev structure is used to describe virtual I/O devices. + */ +struct vio_dev { + struct iommu_table *iommu_table; /* vio_map_* uses this */ + char *name; char *type; - char *compat; + uint32_t unit_address; + unsigned int irq; + struct device dev; }; struct vio_driver { struct list_head node; char *name; - const struct vio_device_id *id_table; /* NULL if wants all devices */ - int (*probe) (struct vio_dev *dev, const struct vio_device_id *id); /* New device inserted */ - int (*remove) (struct vio_dev *dev); /* Device removed (NULL if not a hot-plug capable driver) */ + const struct vio_device_id *id_table; + int (*probe)(struct vio_dev *dev, const struct vio_device_id *id); + int (*remove)(struct vio_dev *dev); unsigned long driver_data; - struct device_driver driver; }; +struct vio_bus_ops { + int (*match)(const struct vio_device_id *id, const struct vio_dev *dev); + void (*unregister_device)(struct vio_dev *); + void (*release_device)(struct device *); +}; + +extern struct dma_mapping_ops vio_dma_ops; +extern struct bus_type vio_bus_type; +extern struct vio_dev vio_bus_device; + +extern int vio_register_driver(struct vio_driver *drv); +extern void vio_unregister_driver(struct vio_driver *drv); + +extern struct vio_dev * __devinit vio_register_device(struct vio_dev *viodev); +extern void __devinit vio_unregister_device(struct vio_dev *dev); + +extern int vio_bus_init(struct vio_bus_ops *); + +#ifdef CONFIG_PPC_PSERIES +struct device_node; + +extern struct vio_dev * __devinit vio_register_device_node( + struct device_node *node_vdev); +extern struct vio_dev *vio_find_node(struct device_node *vnode); +extern const void *vio_get_attribute(struct vio_dev *vdev, void *which, + int *length); +extern int vio_enable_interrupts(struct vio_dev *dev); +extern int vio_disable_interrupts(struct vio_dev *dev); +#endif + static inline struct vio_driver *to_vio_driver(struct device_driver *drv) { return container_of(drv, struct vio_driver, driver); } -/* - * The vio_dev structure is used to describe virtual I/O devices. - */ -struct vio_dev { - struct iommu_table *iommu_table; /* vio_map_* uses this */ - char *name; - char *type; - uint32_t unit_address; - unsigned int irq; - - struct device dev; -}; - static inline struct vio_dev *to_vio_dev(struct device *dev) { return container_of(dev, struct vio_dev, dev); diff --git a/include/asm-ppc64/xor.h b/include/asm-ppc64/xor.h deleted file mode 100644 index c82eb12a5b1..00000000000 --- a/include/asm-ppc64/xor.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/xor.h> diff --git a/include/asm-s390/socket.h b/include/asm-s390/socket.h index 0e96eeca4e6..15a5298c874 100644 --- a/include/asm-s390/socket.h +++ b/include/asm-s390/socket.h @@ -22,6 +22,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff --git a/include/asm-sh/socket.h b/include/asm-sh/socket.h index dde696c3b4c..553904ff933 100644 --- a/include/asm-sh/socket.h +++ b/include/asm-sh/socket.h @@ -14,6 +14,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_RCVBUFFORCE 32 +#define SO_SNDBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff --git a/include/asm-sparc/socket.h b/include/asm-sparc/socket.h index c1154e3ecfd..09575b608ad 100644 --- a/include/asm-sparc/socket.h +++ b/include/asm-sparc/socket.h @@ -29,6 +29,8 @@ #define SO_SNDBUF 0x1001 #define SO_RCVBUF 0x1002 +#define SO_SNDBUFFORCE 0x100a +#define SO_RCVBUFFORCE 0x100b #define SO_ERROR 0x1007 #define SO_TYPE 0x1008 diff --git a/include/asm-sparc64/socket.h b/include/asm-sparc64/socket.h index 865547a2390..59987dad335 100644 --- a/include/asm-sparc64/socket.h +++ b/include/asm-sparc64/socket.h @@ -29,6 +29,8 @@ #define SO_SNDBUF 0x1001 #define SO_RCVBUF 0x1002 +#define SO_SNDBUFFORCE 0x100a +#define SO_RCVBUFFORCE 0x100b #define SO_ERROR 0x1007 #define SO_TYPE 0x1008 diff --git a/include/asm-v850/socket.h b/include/asm-v850/socket.h index 213b852af53..0240d366a0a 100644 --- a/include/asm-v850/socket.h +++ b/include/asm-v850/socket.h @@ -14,6 +14,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff --git a/include/asm-x86_64/checksum.h b/include/asm-x86_64/checksum.h index d01356f0144..989469e8e0b 100644 --- a/include/asm-x86_64/checksum.h +++ b/include/asm-x86_64/checksum.h @@ -64,7 +64,7 @@ static inline unsigned short ip_fast_csum(unsigned char *iph, unsigned int ihl) " adcl $0, %0\n" " notl %0\n" "2:" - /* Since the input registers which are loaded with iph and ipl + /* Since the input registers which are loaded with iph and ihl are modified, we must also specify them as outputs, or gcc will assume they contain their original values. */ : "=r" (sum), "=r" (iph), "=r" (ihl) diff --git a/include/asm-x86_64/socket.h b/include/asm-x86_64/socket.h index d9a252ea821..f2cdbeae5d5 100644 --- a/include/asm-x86_64/socket.h +++ b/include/asm-x86_64/socket.h @@ -14,6 +14,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff --git a/include/asm-xtensa/socket.h b/include/asm-xtensa/socket.h index daccd05a14c..00f83f3a6d7 100644 --- a/include/asm-xtensa/socket.h +++ b/include/asm-xtensa/socket.h @@ -24,6 +24,8 @@ #define SO_BROADCAST 6 #define SO_SNDBUF 7 #define SO_RCVBUF 8 +#define SO_SNDBUFFORCE 32 +#define SO_RCVBUFFORCE 33 #define SO_KEEPALIVE 9 #define SO_OOBINLINE 10 #define SO_NO_CHECK 11 diff --git a/include/linux/dccp.h b/include/linux/dccp.h new file mode 100644 index 00000000000..007c290f74d --- /dev/null +++ b/include/linux/dccp.h @@ -0,0 +1,456 @@ +#ifndef _LINUX_DCCP_H +#define _LINUX_DCCP_H + +#include <linux/types.h> +#include <asm/byteorder.h> + +/* Structure describing an Internet (DCCP) socket address. */ +struct sockaddr_dccp { + __u16 sdccp_family; /* Address family */ + __u16 sdccp_port; /* Port number */ + __u32 sdccp_addr; /* Internet address */ + __u32 sdccp_service; /* Service */ + /* Pad to size of `struct sockaddr': 16 bytes . */ + __u32 sdccp_pad; +}; + +/** + * struct dccp_hdr - generic part of DCCP packet header + * + * @dccph_sport - Relevant port on the endpoint that sent this packet + * @dccph_dport - Relevant port on the other endpoint + * @dccph_doff - Data Offset from the start of the DCCP header, in 32-bit words + * @dccph_ccval - Used by the HC-Sender CCID + * @dccph_cscov - Parts of the packet that are covered by the Checksum field + * @dccph_checksum - Internet checksum, depends on dccph_cscov + * @dccph_x - 0 = 24 bit sequence number, 1 = 48 + * @dccph_type - packet type, see DCCP_PKT_ prefixed macros + * @dccph_seq - sequence number high or low order 24 bits, depends on dccph_x + */ +struct dccp_hdr { + __u16 dccph_sport, + dccph_dport; + __u8 dccph_doff; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 dccph_cscov:4, + dccph_ccval:4; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u8 dccph_ccval:4, + dccph_cscov:4; +#else +#error "Adjust your <asm/byteorder.h> defines" +#endif + __u16 dccph_checksum; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u32 dccph_x:1, + dccph_type:4, + dccph_reserved:3, + dccph_seq:24; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u32 dccph_reserved:3, + dccph_type:4, + dccph_x:1, + dccph_seq:24; +#else +#error "Adjust your <asm/byteorder.h> defines" +#endif +}; + +/** + * struct dccp_hdr_ext - the low bits of a 48 bit seq packet + * + * @dccph_seq_low - low 24 bits of a 48 bit seq packet + */ +struct dccp_hdr_ext { + __u32 dccph_seq_low; +}; + +/** + * struct dccp_hdr_request - Conection initiation request header + * + * @dccph_req_service - Service to which the client app wants to connect + * @dccph_req_options - list of options (must be a multiple of 32 bits + */ +struct dccp_hdr_request { + __u32 dccph_req_service; +}; +/** + * struct dccp_hdr_ack_bits - acknowledgment bits common to most packets + * + * @dccph_resp_ack_nr_high - 48 bit ack number high order bits, contains GSR + * @dccph_resp_ack_nr_low - 48 bit ack number low order bits, contains GSR + */ +struct dccp_hdr_ack_bits { + __u32 dccph_reserved1:8, + dccph_ack_nr_high:24; + __u32 dccph_ack_nr_low; +}; +/** + * struct dccp_hdr_response - Conection initiation response header + * + * @dccph_resp_ack_nr_high - 48 bit ack number high order bits, contains GSR + * @dccph_resp_ack_nr_low - 48 bit ack number low order bits, contains GSR + * @dccph_resp_service - Echoes the Service Code on a received DCCP-Request + * @dccph_resp_options - list of options (must be a multiple of 32 bits + */ +struct dccp_hdr_response { + struct dccp_hdr_ack_bits dccph_resp_ack; + __u32 dccph_resp_service; +}; + +/** + * struct dccp_hdr_reset - Unconditionally shut down a connection + * + * @dccph_reset_service - Echoes the Service Code on a received DCCP-Request + * @dccph_reset_options - list of options (must be a multiple of 32 bits + */ +struct dccp_hdr_reset { + struct dccp_hdr_ack_bits dccph_reset_ack; + __u8 dccph_reset_code, + dccph_reset_data[3]; +}; + +enum dccp_pkt_type { + DCCP_PKT_REQUEST = 0, + DCCP_PKT_RESPONSE, + DCCP_PKT_DATA, + DCCP_PKT_ACK, + DCCP_PKT_DATAACK, + DCCP_PKT_CLOSEREQ, + DCCP_PKT_CLOSE, + DCCP_PKT_RESET, + DCCP_PKT_SYNC, + DCCP_PKT_SYNCACK, + DCCP_PKT_INVALID, +}; + +#define DCCP_NR_PKT_TYPES DCCP_PKT_INVALID + +static inline unsigned int dccp_packet_hdr_len(const __u8 type) +{ + if (type == DCCP_PKT_DATA) + return 0; + if (type == DCCP_PKT_DATAACK || + type == DCCP_PKT_ACK || + type == DCCP_PKT_SYNC || + type == DCCP_PKT_SYNCACK || + type == DCCP_PKT_CLOSE || + type == DCCP_PKT_CLOSEREQ) + return sizeof(struct dccp_hdr_ack_bits); + if (type == DCCP_PKT_REQUEST) + return sizeof(struct dccp_hdr_request); + if (type == DCCP_PKT_RESPONSE) + return sizeof(struct dccp_hdr_response); + return sizeof(struct dccp_hdr_reset); +} +enum dccp_reset_codes { + DCCP_RESET_CODE_UNSPECIFIED = 0, + DCCP_RESET_CODE_CLOSED, + DCCP_RESET_CODE_ABORTED, + DCCP_RESET_CODE_NO_CONNECTION, + DCCP_RESET_CODE_PACKET_ERROR, + DCCP_RESET_CODE_OPTION_ERROR, + DCCP_RESET_CODE_MANDATORY_ERROR, + DCCP_RESET_CODE_CONNECTION_REFUSED, + DCCP_RESET_CODE_BAD_SERVICE_CODE, + DCCP_RESET_CODE_TOO_BUSY, + DCCP_RESET_CODE_BAD_INIT_COOKIE, + DCCP_RESET_CODE_AGGRESSION_PENALTY, +}; + +/* DCCP options */ +enum { + DCCPO_PADDING = 0, + DCCPO_MANDATORY = 1, + DCCPO_MIN_RESERVED = 3, + DCCPO_MAX_RESERVED = 31, + DCCPO_NDP_COUNT = 37, + DCCPO_ACK_VECTOR_0 = 38, + DCCPO_ACK_VECTOR_1 = 39, + DCCPO_TIMESTAMP = 41, + DCCPO_TIMESTAMP_ECHO = 42, + DCCPO_ELAPSED_TIME = 43, + DCCPO_MAX = 45, + DCCPO_MIN_CCID_SPECIFIC = 128, + DCCPO_MAX_CCID_SPECIFIC = 255, +}; + +/* DCCP features */ +enum { + DCCPF_RESERVED = 0, + DCCPF_SEQUENCE_WINDOW = 3, + DCCPF_SEND_ACK_VECTOR = 6, + DCCPF_SEND_NDP_COUNT = 7, + /* 10-127 reserved */ + DCCPF_MIN_CCID_SPECIFIC = 128, + DCCPF_MAX_CCID_SPECIFIC = 255, +}; + +/* DCCP socket options */ +#define DCCP_SOCKOPT_PACKET_SIZE 1 + +#ifdef __KERNEL__ + +#include <linux/in.h> +#include <linux/list.h> +#include <linux/uio.h> +#include <linux/workqueue.h> + +#include <net/inet_connection_sock.h> +#include <net/inet_timewait_sock.h> +#include <net/sock.h> +#include <net/tcp_states.h> +#include <net/tcp.h> + +enum dccp_state { + DCCP_OPEN = TCP_ESTABLISHED, + DCCP_REQUESTING = TCP_SYN_SENT, + DCCP_PARTOPEN = TCP_FIN_WAIT1, /* FIXME: + This mapping is horrible, but TCP has + no matching state for DCCP_PARTOPEN, + as TCP_SYN_RECV is already used by + DCCP_RESPOND, why don't stop using TCP + mapping of states? OK, now we don't use + sk_stream_sendmsg anymore, so doesn't + seem to exist any reason for us to + do the TCP mapping here */ + DCCP_LISTEN = TCP_LISTEN, + DCCP_RESPOND = TCP_SYN_RECV, + DCCP_CLOSING = TCP_CLOSING, + DCCP_TIME_WAIT = TCP_TIME_WAIT, + DCCP_CLOSED = TCP_CLOSE, + DCCP_MAX_STATES = TCP_MAX_STATES, +}; + +#define DCCP_STATE_MASK 0xf +#define DCCP_ACTION_FIN (1<<7) + +enum { + DCCPF_OPEN = TCPF_ESTABLISHED, + DCCPF_REQUESTING = TCPF_SYN_SENT, + DCCPF_PARTOPEN = TCPF_FIN_WAIT1, + DCCPF_LISTEN = TCPF_LISTEN, + DCCPF_RESPOND = TCPF_SYN_RECV, + DCCPF_CLOSING = TCPF_CLOSING, + DCCPF_TIME_WAIT = TCPF_TIME_WAIT, + DCCPF_CLOSED = TCPF_CLOSE, +}; + +static inline struct dccp_hdr *dccp_hdr(const struct sk_buff *skb) +{ + return (struct dccp_hdr *)skb->h.raw; +} + +static inline struct dccp_hdr_ext *dccp_hdrx(const struct sk_buff *skb) +{ + return (struct dccp_hdr_ext *)(skb->h.raw + sizeof(struct dccp_hdr)); +} + +static inline unsigned int __dccp_basic_hdr_len(const struct dccp_hdr *dh) +{ + return sizeof(*dh) + (dh->dccph_x ? sizeof(struct dccp_hdr_ext) : 0); +} + +static inline unsigned int dccp_basic_hdr_len(const struct sk_buff *skb) +{ + const struct dccp_hdr *dh = dccp_hdr(skb); + return __dccp_basic_hdr_len(dh); +} + +static inline __u64 dccp_hdr_seq(const struct sk_buff *skb) +{ + const struct dccp_hdr *dh = dccp_hdr(skb); +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 seq_nr = ntohl(dh->dccph_seq << 8); +#elif defined(__BIG_ENDIAN_BITFIELD) + __u64 seq_nr = ntohl(dh->dccph_seq); +#else +#error "Adjust your <asm/byteorder.h> defines" +#endif + + if (dh->dccph_x != 0) + seq_nr = (seq_nr << 32) + ntohl(dccp_hdrx(skb)->dccph_seq_low); + + return seq_nr; +} + +static inline struct dccp_hdr_request *dccp_hdr_request(struct sk_buff *skb) +{ + return (struct dccp_hdr_request *)(skb->h.raw + dccp_basic_hdr_len(skb)); +} + +static inline struct dccp_hdr_ack_bits *dccp_hdr_ack_bits(const struct sk_buff *skb) +{ + return (struct dccp_hdr_ack_bits *)(skb->h.raw + dccp_basic_hdr_len(skb)); +} + +static inline u64 dccp_hdr_ack_seq(const struct sk_buff *skb) +{ + const struct dccp_hdr_ack_bits *dhack = dccp_hdr_ack_bits(skb); +#if defined(__LITTLE_ENDIAN_BITFIELD) + return (((u64)ntohl(dhack->dccph_ack_nr_high << 8)) << 32) + ntohl(dhack->dccph_ack_nr_low); +#elif defined(__BIG_ENDIAN_BITFIELD) + return (((u64)ntohl(dhack->dccph_ack_nr_high)) << 32) + ntohl(dhack->dccph_ack_nr_low); +#else +#error "Adjust your <asm/byteorder.h> defines" +#endif +} + +static inline struct dccp_hdr_response *dccp_hdr_response(struct sk_buff *skb) +{ + return (struct dccp_hdr_response *)(skb->h.raw + dccp_basic_hdr_len(skb)); +} + +static inline struct dccp_hdr_reset *dccp_hdr_reset(struct sk_buff *skb) +{ + return (struct dccp_hdr_reset *)(skb->h.raw + dccp_basic_hdr_len(skb)); +} + +static inline unsigned int __dccp_hdr_len(const struct dccp_hdr *dh) +{ + return __dccp_basic_hdr_len(dh) + + dccp_packet_hdr_len(dh->dccph_type); +} + +static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) +{ + return __dccp_hdr_len(dccp_hdr(skb)); +} + + +/* initial values for each feature */ +#define DCCPF_INITIAL_SEQUENCE_WINDOW 100 +/* FIXME: for now we're using CCID 3 (TFRC) */ +#define DCCPF_INITIAL_CCID 3 +#define DCCPF_INITIAL_SEND_ACK_VECTOR 0 +/* FIXME: for now we're default to 1 but it should really be 0 */ +#define DCCPF_INITIAL_SEND_NDP_COUNT 1 + +#define DCCP_NDP_LIMIT 0xFFFFFF + +/** + * struct dccp_options - option values for a DCCP connection + * @dccpo_sequence_window - Sequence Window Feature (section 7.5.2) + * @dccpo_ccid - Congestion Control Id (CCID) (section 10) + * @dccpo_send_ack_vector - Send Ack Vector Feature (section 11.5) + * @dccpo_send_ndp_count - Send NDP Count Feature (7.7.2) + */ +struct dccp_options { + __u64 dccpo_sequence_window; + __u8 dccpo_ccid; + __u8 dccpo_send_ack_vector; + __u8 dccpo_send_ndp_count; +}; + +extern void __dccp_options_init(struct dccp_options *dccpo); +extern void dccp_options_init(struct dccp_options *dccpo); +extern int dccp_parse_options(struct sock *sk, struct sk_buff *skb); + +struct dccp_request_sock { + struct inet_request_sock dreq_inet_rsk; + __u64 dreq_iss; + __u64 dreq_isr; + __u32 dreq_service; +}; + +static inline struct dccp_request_sock *dccp_rsk(const struct request_sock *req) +{ + return (struct dccp_request_sock *)req; +} + +extern struct inet_timewait_death_row dccp_death_row; + +/* Read about the ECN nonce to see why it is 253 */ +#define DCCP_MAX_ACK_VECTOR_LEN 253 + +struct dccp_options_received { + u32 dccpor_ndp:24, + dccpor_ack_vector_len:8; + u32 dccpor_ack_vector_idx:10; + /* 22 bits hole, try to pack */ + u32 dccpor_timestamp; + u32 dccpor_timestamp_echo; + u32 dccpor_elapsed_time; +}; + +struct ccid; + +enum dccp_role { + DCCP_ROLE_UNDEFINED, + DCCP_ROLE_LISTEN, + DCCP_ROLE_CLIENT, + DCCP_ROLE_SERVER, +}; + +/** + * struct dccp_sock - DCCP socket state + * + * @dccps_swl - sequence number window low + * @dccps_swh - sequence number window high + * @dccps_awl - acknowledgement number window low + * @dccps_awh - acknowledgement number window high + * @dccps_iss - initial sequence number sent + * @dccps_isr - initial sequence number received + * @dccps_osr - first OPEN sequence number received + * @dccps_gss - greatest sequence number sent + * @dccps_gsr - greatest valid sequence number received + * @dccps_gar - greatest valid ack number received on a non-Sync; initialized to %dccps_iss + * @dccps_timestamp_time - time of latest TIMESTAMP option + * @dccps_timestamp_echo - latest timestamp received on a TIMESTAMP option + * @dccps_ext_header_len - network protocol overhead (IP/IPv6 options) + * @dccps_pmtu_cookie - Last pmtu seen by socket + * @dccps_packet_size - Set thru setsockopt + * @dccps_role - Role of this sock, one of %dccp_role + * @dccps_ndp_count - number of Non Data Packets since last data packet + * @dccps_hc_rx_ackpkts - receiver half connection acked packets + */ +struct dccp_sock { + /* inet_connection_sock has to be the first member of dccp_sock */ + struct inet_connection_sock dccps_inet_connection; + __u64 dccps_swl; + __u64 dccps_swh; + __u64 dccps_awl; + __u64 dccps_awh; + __u64 dccps_iss; + __u64 dccps_isr; + __u64 dccps_osr; + __u64 dccps_gss; + __u64 dccps_gsr; + __u64 dccps_gar; + unsigned long dccps_service; + struct timeval dccps_timestamp_time; + __u32 dccps_timestamp_echo; + __u32 dccps_packet_size; + unsigned long dccps_ndp_count; + __u16 dccps_ext_header_len; + __u32 dccps_pmtu_cookie; + __u32 dccps_mss_cache; + struct dccp_options dccps_options; + struct dccp_ackpkts *dccps_hc_rx_ackpkts; + void *dccps_hc_rx_ccid_private; + void *dccps_hc_tx_ccid_private; + struct ccid *dccps_hc_rx_ccid; + struct ccid *dccps_hc_tx_ccid; + struct dccp_options_received dccps_options_received; + enum dccp_role dccps_role:2; +}; + +static inline struct dccp_sock *dccp_sk(const struct sock *sk) +{ + return (struct dccp_sock *)sk; +} + +static inline const char *dccp_role(const struct sock *sk) +{ + switch (dccp_sk(sk)->dccps_role) { + case DCCP_ROLE_UNDEFINED: return "undefined"; + case DCCP_ROLE_LISTEN: return "listen"; + case DCCP_ROLE_SERVER: return "server"; + case DCCP_ROLE_CLIENT: return "client"; + } + return NULL; +} + +#endif /* __KERNEL__ */ + +#endif /* _LINUX_DCCP_H */ diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index d7021c391b2..ed1440ea4c9 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -250,6 +250,12 @@ struct ethtool_stats { u64 data[0]; }; +struct ethtool_perm_addr { + u32 cmd; /* ETHTOOL_GPERMADDR */ + u32 size; + u8 data[0]; +}; + struct net_device; /* Some generic methods drivers may use in their ethtool_ops */ @@ -261,6 +267,8 @@ u32 ethtool_op_get_sg(struct net_device *dev); int ethtool_op_set_sg(struct net_device *dev, u32 data); u32 ethtool_op_get_tso(struct net_device *dev); int ethtool_op_set_tso(struct net_device *dev, u32 data); +int ethtool_op_get_perm_addr(struct net_device *dev, + struct ethtool_perm_addr *addr, u8 *data); /** * ðtool_ops - Alter and report network device settings @@ -294,7 +302,8 @@ int ethtool_op_set_tso(struct net_device *dev, u32 data); * get_strings: Return a set of strings that describe the requested objects * phys_id: Identify the device * get_stats: Return statistics about the device - * + * get_perm_addr: Gets the permanent hardware address + * * Description: * * get_settings: @@ -352,6 +361,7 @@ struct ethtool_ops { int (*phys_id)(struct net_device *, u32); int (*get_stats_count)(struct net_device *); void (*get_ethtool_stats)(struct net_device *, struct ethtool_stats *, u64 *); + int (*get_perm_addr)(struct net_device *, struct ethtool_perm_addr *, u8 *); int (*begin)(struct net_device *); void (*complete)(struct net_device *); }; @@ -389,6 +399,7 @@ struct ethtool_ops { #define ETHTOOL_GSTATS 0x0000001d /* get NIC-specific statistics */ #define ETHTOOL_GTSO 0x0000001e /* Get TSO enable (ethtool_value) */ #define ETHTOOL_STSO 0x0000001f /* Set TSO enable (ethtool_value) */ +#define ETHTOOL_GPERMADDR 0x00000020 /* Get permanent hardware address */ /* compatibility with older code */ #define SPARC_ETH_GSET ETHTOOL_GSET diff --git a/include/linux/hippidevice.h b/include/linux/hippidevice.h index 9debe6bbe5f..bab303dafd6 100644 --- a/include/linux/hippidevice.h +++ b/include/linux/hippidevice.h @@ -26,8 +26,12 @@ #include <linux/if_hippi.h> #ifdef __KERNEL__ -extern unsigned short hippi_type_trans(struct sk_buff *skb, - struct net_device *dev); + +struct hippi_cb { + __u32 ifield; +}; + +extern __be16 hippi_type_trans(struct sk_buff *skb, struct net_device *dev); extern struct net_device *alloc_hippi_dev(int sizeof_priv); #endif diff --git a/include/linux/if_ether.h b/include/linux/if_ether.h index b5b58e9c054..fc2d4c8225a 100644 --- a/include/linux/if_ether.h +++ b/include/linux/if_ether.h @@ -110,6 +110,8 @@ static inline struct ethhdr *eth_hdr(const struct sk_buff *skb) { return (struct ethhdr *)skb->mac.raw; } + +extern struct ctl_table ether_table[]; #endif #endif /* _LINUX_IF_ETHER_H */ diff --git a/include/linux/if_fc.h b/include/linux/if_fc.h index 33330b458b9..376a34ea472 100644 --- a/include/linux/if_fc.h +++ b/include/linux/if_fc.h @@ -44,7 +44,7 @@ struct fcllc { __u8 ssap; /* source SAP */ __u8 llc; /* LLC control field */ __u8 protid[3]; /* protocol id */ - __u16 ethertype; /* ether type field */ + __be16 ethertype; /* ether type field */ }; #endif /* _LINUX_IF_FC_H */ diff --git a/include/linux/if_fddi.h b/include/linux/if_fddi.h index a912818e636..1288a161bc0 100644 --- a/include/linux/if_fddi.h +++ b/include/linux/if_fddi.h @@ -85,7 +85,7 @@ struct fddi_snap_hdr __u8 ssap; /* always 0xAA */ __u8 ctrl; /* always 0x03 */ __u8 oui[FDDI_K_OUI_LEN]; /* organizational universal id */ - __u16 ethertype; /* packet type ID field */ + __be16 ethertype; /* packet type ID field */ } __attribute__ ((packed)); /* Define FDDI LLC frame header */ diff --git a/include/linux/if_frad.h b/include/linux/if_frad.h index 3c94b173657..511999c7eed 100644 --- a/include/linux/if_frad.h +++ b/include/linux/if_frad.h @@ -191,10 +191,12 @@ struct frad_local int buffer; /* current buffer for S508 firmware */ }; -extern void dlci_ioctl_set(int (*hook)(unsigned int, void __user *)); - #endif /* __KERNEL__ */ #endif /* CONFIG_DLCI || CONFIG_DLCI_MODULE */ +#ifdef __KERNEL__ +extern void dlci_ioctl_set(int (*hook)(unsigned int, void __user *)); +#endif + #endif diff --git a/include/linux/if_hippi.h b/include/linux/if_hippi.h index c8ca72c46f7..94d31ca7d71 100644 --- a/include/linux/if_hippi.h +++ b/include/linux/if_hippi.h @@ -102,9 +102,9 @@ struct hippi_fp_hdr #error "Please fix <asm/byteorder.h>" #endif #else - __u32 fixed; + __be32 fixed; #endif - __u32 d2_size; + __be32 d2_size; } __attribute__ ((packed)); struct hippi_le_hdr @@ -144,7 +144,7 @@ struct hippi_snap_hdr __u8 ssap; /* always 0xAA */ __u8 ctrl; /* always 0x03 */ __u8 oui[HIPPI_OUI_LEN]; /* organizational universal id (zero)*/ - __u16 ethertype; /* packet type ID field */ + __be16 ethertype; /* packet type ID field */ } __attribute__ ((packed)); struct hippi_hdr diff --git a/include/linux/if_tr.h b/include/linux/if_tr.h index 3fba9e2f542..5502f597cf0 100644 --- a/include/linux/if_tr.h +++ b/include/linux/if_tr.h @@ -43,12 +43,16 @@ struct trh_hdr { }; #ifdef __KERNEL__ +#include <linux/config.h> #include <linux/skbuff.h> static inline struct trh_hdr *tr_hdr(const struct sk_buff *skb) { return (struct trh_hdr *)skb->mac.raw; } +#ifdef CONFIG_SYSCTL +extern struct ctl_table tr_table[]; +#endif #endif /* This is an Token-Ring LLC structure */ diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 62a9d89dfbe..17d0c0d40b0 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -155,7 +155,6 @@ static inline int __vlan_hwaccel_rx(struct sk_buff *skb, { struct net_device_stats *stats; - skb->real_dev = skb->dev; skb->dev = grp->vlan_devices[vlan_tag & VLAN_VID_MASK]; if (skb->dev == NULL) { dev_kfree_skb_any(skb); diff --git a/include/linux/igmp.h b/include/linux/igmp.h index 0c31ef0b5ba..28f4f3b3695 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -129,6 +129,9 @@ struct igmpv3_query { #include <linux/skbuff.h> #include <linux/in.h> +extern int sysctl_igmp_max_memberships; +extern int sysctl_igmp_max_msf; + struct ip_sf_socklist { unsigned int sl_max; diff --git a/include/linux/in.h b/include/linux/in.h index fb88c66d748..ba355384016 100644 --- a/include/linux/in.h +++ b/include/linux/in.h @@ -32,6 +32,7 @@ enum { IPPROTO_PUP = 12, /* PUP protocol */ IPPROTO_UDP = 17, /* User Datagram Protocol */ IPPROTO_IDP = 22, /* XNS IDP protocol */ + IPPROTO_DCCP = 33, /* Datagram Congestion Control Protocol */ IPPROTO_RSVP = 46, /* RSVP protocol */ IPPROTO_GRE = 47, /* Cisco GRE tunnels (rfc 1701,1702) */ diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h new file mode 100644 index 00000000000..a4606e5810e --- /dev/null +++ b/include/linux/inet_diag.h @@ -0,0 +1,138 @@ +#ifndef _INET_DIAG_H_ +#define _INET_DIAG_H_ 1 + +/* Just some random number */ +#define TCPDIAG_GETSOCK 18 +#define DCCPDIAG_GETSOCK 19 + +#define INET_DIAG_GETSOCK_MAX 24 + +/* Socket identity */ +struct inet_diag_sockid { + __u16 idiag_sport; + __u16 idiag_dport; + __u32 idiag_src[4]; + __u32 idiag_dst[4]; + __u32 idiag_if; + __u32 idiag_cookie[2]; +#define INET_DIAG_NOCOOKIE (~0U) +}; + +/* Request structure */ + +struct inet_diag_req { + __u8 idiag_family; /* Family of addresses. */ + __u8 idiag_src_len; + __u8 idiag_dst_len; + __u8 idiag_ext; /* Query extended information */ + + struct inet_diag_sockid id; + + __u32 idiag_states; /* States to dump */ + __u32 idiag_dbs; /* Tables to dump (NI) */ +}; + +enum { + INET_DIAG_REQ_NONE, + INET_DIAG_REQ_BYTECODE, +}; + +#define INET_DIAG_REQ_MAX INET_DIAG_REQ_BYTECODE + +/* Bytecode is sequence of 4 byte commands followed by variable arguments. + * All the commands identified by "code" are conditional jumps forward: + * to offset cc+"yes" or to offset cc+"no". "yes" is supposed to be + * length of the command and its arguments. + */ + +struct inet_diag_bc_op { + unsigned char code; + unsigned char yes; + unsigned short no; +}; + +enum { + INET_DIAG_BC_NOP, + INET_DIAG_BC_JMP, + INET_DIAG_BC_S_GE, + INET_DIAG_BC_S_LE, + INET_DIAG_BC_D_GE, + INET_DIAG_BC_D_LE, + INET_DIAG_BC_AUTO, + INET_DIAG_BC_S_COND, + INET_DIAG_BC_D_COND, +}; + +struct inet_diag_hostcond { + __u8 family; + __u8 prefix_len; + int port; + __u32 addr[0]; +}; + +/* Base info structure. It contains socket identity (addrs/ports/cookie) + * and, alas, the information shown by netstat. */ +struct inet_diag_msg { + __u8 idiag_family; + __u8 idiag_state; + __u8 idiag_timer; + __u8 idiag_retrans; + + struct inet_diag_sockid id; + + __u32 idiag_expires; + __u32 idiag_rqueue; + __u32 idiag_wqueue; + __u32 idiag_uid; + __u32 idiag_inode; +}; + +/* Extensions */ + +enum { + INET_DIAG_NONE, + INET_DIAG_MEMINFO, + INET_DIAG_INFO, + INET_DIAG_VEGASINFO, + INET_DIAG_CONG, +}; + +#define INET_DIAG_MAX INET_DIAG_CONG + + +/* INET_DIAG_MEM */ + +struct inet_diag_meminfo { + __u32 idiag_rmem; + __u32 idiag_wmem; + __u32 idiag_fmem; + __u32 idiag_tmem; +}; + +/* INET_DIAG_VEGASINFO */ + +struct tcpvegas_info { + __u32 tcpv_enabled; + __u32 tcpv_rttcnt; + __u32 tcpv_rtt; + __u32 tcpv_minrtt; +}; + +#ifdef __KERNEL__ +struct sock; +struct inet_hashinfo; + +struct inet_diag_handler { + struct inet_hashinfo *idiag_hashinfo; + void (*idiag_get_info)(struct sock *sk, + struct inet_diag_msg *r, + void *info); + __u16 idiag_info_size; + __u16 idiag_type; +}; + +extern int inet_diag_register(const struct inet_diag_handler *handler); +extern void inet_diag_unregister(const struct inet_diag_handler *handler); +#endif /* __KERNEL__ */ + +#endif /* _INET_DIAG_H_ */ diff --git a/include/linux/ip.h b/include/linux/ip.h index 31e7cedd9f8..33e8a19a1a0 100644 --- a/include/linux/ip.h +++ b/include/linux/ip.h @@ -196,6 +196,8 @@ static inline void inet_sk_copy_descendant(struct sock *sk_to, #endif #endif +extern int inet_sk_rebuild_header(struct sock *sk); + struct iphdr { #if defined(__LITTLE_ENDIAN_BITFIELD) __u8 ihl:4, diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 6fcd6a0ade2..3c7dbc6a0a7 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -193,6 +193,11 @@ struct inet6_skb_parm { #define IP6CB(skb) ((struct inet6_skb_parm*)((skb)->cb)) +static inline int inet6_iif(const struct sk_buff *skb) +{ + return IP6CB(skb)->iif; +} + struct tcp6_request_sock { struct tcp_request_sock req; struct in6_addr loc_addr; @@ -308,6 +313,36 @@ static inline void inet_sk_copy_descendant(struct sock *sk_to, #define __ipv6_only_sock(sk) (inet6_sk(sk)->ipv6only) #define ipv6_only_sock(sk) ((sk)->sk_family == PF_INET6 && __ipv6_only_sock(sk)) + +#include <linux/tcp.h> + +struct tcp6_timewait_sock { + struct tcp_timewait_sock tw_v6_sk; + struct in6_addr tw_v6_daddr; + struct in6_addr tw_v6_rcv_saddr; +}; + +static inline struct tcp6_timewait_sock *tcp6_twsk(const struct sock *sk) +{ + return (struct tcp6_timewait_sock *)sk; +} + +static inline struct in6_addr *__tcp_v6_rcv_saddr(const struct sock *sk) +{ + return likely(sk->sk_state != TCP_TIME_WAIT) ? + &inet6_sk(sk)->rcv_saddr : &tcp6_twsk(sk)->tw_v6_rcv_saddr; +} + +static inline struct in6_addr *tcp_v6_rcv_saddr(const struct sock *sk) +{ + return sk->sk_family == AF_INET6 ? __tcp_v6_rcv_saddr(sk) : NULL; +} + +static inline int inet_v6_ipv6only(const struct sock *sk) +{ + return likely(sk->sk_state != TCP_TIME_WAIT) ? + ipv6_only_sock(sk) : inet_twsk(sk)->tw_ipv6only; +} #else #define __ipv6_only_sock(sk) 0 #define ipv6_only_sock(sk) 0 @@ -322,8 +357,19 @@ static inline struct raw6_sock *raw6_sk(const struct sock *sk) return NULL; } -#endif +#define __tcp_v6_rcv_saddr(__sk) NULL +#define tcp_v6_rcv_saddr(__sk) NULL +#define tcp_twsk_ipv6only(__sk) 0 +#define inet_v6_ipv6only(__sk) 0 +#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ -#endif +#define INET6_MATCH(__sk, __saddr, __daddr, __ports, __dif) \ + (((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports)) && \ + ((__sk)->sk_family == AF_INET6) && \ + ipv6_addr_equal(&inet6_sk(__sk)->daddr, (__saddr)) && \ + ipv6_addr_equal(&inet6_sk(__sk)->rcv_saddr, (__daddr)) && \ + (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) -#endif +#endif /* __KERNEL__ */ + +#endif /* _IPV6_H */ diff --git a/include/linux/list.h b/include/linux/list.h index aab2db21b01..e6ec5968227 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -419,6 +419,20 @@ static inline void list_splice_init(struct list_head *list, pos = n, n = list_entry(n->member.next, typeof(*n), member)) /** + * list_for_each_entry_safe_continue - iterate over list of given type + * continuing after existing point safe against removal of list entry + * @pos: the type * to use as a loop counter. + * @n: another type * to use as temporary storage + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + */ +#define list_for_each_entry_safe_continue(pos, n, head, member) \ + for (pos = list_entry(pos->member.next, typeof(*pos), member), \ + n = list_entry(pos->member.next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = n, n = list_entry(n->member.next, typeof(*n), member)) + +/** * list_for_each_rcu - iterate over an rcu-protected list * @pos: the &struct list_head to use as a loop counter. * @head: the head for your list. @@ -620,6 +634,57 @@ static inline void hlist_add_after(struct hlist_node *n, next->next->pprev = &next->next; } +/** + * hlist_add_before_rcu - adds the specified element to the specified hlist + * before the specified node while permitting racing traversals. + * @n: the new element to add to the hash list. + * @next: the existing element to add the new element before. + * + * The caller must take whatever precautions are necessary + * (such as holding appropriate locks) to avoid racing + * with another list-mutation primitive, such as hlist_add_head_rcu() + * or hlist_del_rcu(), running on this same list. + * However, it is perfectly legal to run concurrently with + * the _rcu list-traversal primitives, such as + * hlist_for_each_rcu(), used to prevent memory-consistency + * problems on Alpha CPUs. + */ +static inline void hlist_add_before_rcu(struct hlist_node *n, + struct hlist_node *next) +{ + n->pprev = next->pprev; + n->next = next; + smp_wmb(); + next->pprev = &n->next; + *(n->pprev) = n; +} + +/** + * hlist_add_after_rcu - adds the specified element to the specified hlist + * after the specified node while permitting racing traversals. + * @prev: the existing element to add the new element after. + * @n: the new element to add to the hash list. + * + * The caller must take whatever precautions are necessary + * (such as holding appropriate locks) to avoid racing + * with another list-mutation primitive, such as hlist_add_head_rcu() + * or hlist_del_rcu(), running on this same list. + * However, it is perfectly legal to run concurrently with + * the _rcu list-traversal primitives, such as + * hlist_for_each_rcu(), used to prevent memory-consistency + * problems on Alpha CPUs. + */ +static inline void hlist_add_after_rcu(struct hlist_node *prev, + struct hlist_node *n) +{ + n->next = prev->next; + n->pprev = &prev->next; + smp_wmb(); + prev->next = n; + if (n->next) + n->next->pprev = &n->next; +} + #define hlist_entry(ptr, type, member) container_of(ptr,type,member) #define hlist_for_each(pos, head) \ diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 97bbccdbcca..47da39ba3f0 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -1,6 +1,6 @@ /* * Device tables which are exported to userspace via - * scripts/table2alias.c. You must keep that file in sync with this + * scripts/mod/file2alias.c. You must keep that file in sync with this * header. */ @@ -190,6 +190,11 @@ struct of_device_id #endif }; +/* VIO */ +struct vio_device_id { + char type[32]; + char compat[32]; +}; /* PCMCIA */ diff --git a/include/linux/net.h b/include/linux/net.h index 20cb226b226..4e981585a89 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -84,6 +84,7 @@ enum sock_type { SOCK_RAW = 3, SOCK_RDM = 4, SOCK_SEQPACKET = 5, + SOCK_DCCP = 6, SOCK_PACKET = 10, }; @@ -282,5 +283,15 @@ static struct proto_ops name##_ops = { \ #define MODULE_ALIAS_NETPROTO(proto) \ MODULE_ALIAS("net-pf-" __stringify(proto)) +#define MODULE_ALIAS_NET_PF_PROTO(pf, proto) \ + MODULE_ALIAS("net-pf-" __stringify(pf) "-proto-" __stringify(proto)) + +#ifdef CONFIG_SYSCTL +#include <linux/sysctl.h> +extern ctl_table net_table[]; +extern int net_msg_cost; +extern int net_msg_burst; +#endif + #endif /* __KERNEL__ */ #endif /* _LINUX_NET_H */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3a0ed7f9e80..7c717907896 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -244,6 +244,7 @@ struct netdev_boot_setup { }; #define NETDEV_BOOT_SETUP_MAX 8 +extern int __init netdev_boot_setup(char *str); /* * The DEVICE structure. @@ -336,6 +337,7 @@ struct net_device /* Interface address info. */ unsigned char broadcast[MAX_ADDR_LEN]; /* hw bcast add */ unsigned char dev_addr[MAX_ADDR_LEN]; /* hw address */ + unsigned char perm_addr[MAX_ADDR_LEN]; /* permanent hw address */ unsigned char addr_len; /* hardware address length */ unsigned short dev_id; /* for shared network cards */ @@ -497,10 +499,12 @@ static inline void *netdev_priv(struct net_device *dev) #define SET_NETDEV_DEV(net, pdev) ((net)->class_dev.dev = (pdev)) struct packet_type { - __be16 type; /* This is really htons(ether_type). */ - struct net_device *dev; /* NULL is wildcarded here */ - int (*func) (struct sk_buff *, struct net_device *, - struct packet_type *); + __be16 type; /* This is really htons(ether_type). */ + struct net_device *dev; /* NULL is wildcarded here */ + int (*func) (struct sk_buff *, + struct net_device *, + struct packet_type *, + struct net_device *); void *af_packet_priv; struct list_head list; }; @@ -671,6 +675,7 @@ extern void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev); extern void dev_init(void); extern int netdev_nit; +extern int netdev_budget; /* Called by rtnetlink.c:rtnl_unlock() */ extern void netdev_run_todo(void); @@ -697,19 +702,9 @@ static inline int netif_carrier_ok(const struct net_device *dev) extern void __netdev_watchdog_up(struct net_device *dev); -static inline void netif_carrier_on(struct net_device *dev) -{ - if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) - linkwatch_fire_event(dev); - if (netif_running(dev)) - __netdev_watchdog_up(dev); -} +extern void netif_carrier_on(struct net_device *dev); -static inline void netif_carrier_off(struct net_device *dev) -{ - if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) - linkwatch_fire_event(dev); -} +extern void netif_carrier_off(struct net_device *dev); /* Hot-plugging. */ static inline int netif_device_present(struct net_device *dev) @@ -916,6 +911,14 @@ extern int skb_checksum_help(struct sk_buff *skb, int inward); extern void net_enable_timestamp(void); extern void net_disable_timestamp(void); +#ifdef CONFIG_PROC_FS +extern void *dev_seq_start(struct seq_file *seq, loff_t *pos); +extern void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos); +extern void dev_seq_stop(struct seq_file *seq, void *v); +#endif + +extern void linkwatch_run_queue(void); + #endif /* __KERNEL__ */ #endif /* _LINUX_DEV_H */ diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 2e2045482cb..be365e70ee9 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -21,10 +21,23 @@ #define NF_STOP 5 #define NF_MAX_VERDICT NF_STOP +/* we overload the higher bits for encoding auxiliary data such as the queue + * number. Not nice, but better than additional function arguments. */ +#define NF_VERDICT_MASK 0x0000ffff +#define NF_VERDICT_BITS 16 + +#define NF_VERDICT_QMASK 0xffff0000 +#define NF_VERDICT_QBITS 16 + +#define NF_QUEUE_NR(x) (((x << NF_VERDICT_QBITS) & NF_VERDICT_QMASK) | NF_QUEUE) + +/* only for userspace compatibility */ +#ifndef __KERNEL__ /* Generic cache responses from hook functions. <= 0x2000 is used for protocol-flags. */ #define NFC_UNKNOWN 0x4000 #define NFC_ALTERED 0x8000 +#endif #ifdef __KERNEL__ #include <linux/config.h> @@ -101,15 +114,51 @@ void nf_unregister_sockopt(struct nf_sockopt_ops *reg); extern struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS]; -typedef void nf_logfn(unsigned int hooknum, +/* those NF_LOG_* defines and struct nf_loginfo are legacy definitios that will + * disappear once iptables is replaced with pkttables. Please DO NOT use them + * for any new code! */ +#define NF_LOG_TCPSEQ 0x01 /* Log TCP sequence numbers */ +#define NF_LOG_TCPOPT 0x02 /* Log TCP options */ +#define NF_LOG_IPOPT 0x04 /* Log IP options */ +#define NF_LOG_UID 0x08 /* Log UID owning local socket */ +#define NF_LOG_MASK 0x0f + +#define NF_LOG_TYPE_LOG 0x01 +#define NF_LOG_TYPE_ULOG 0x02 + +struct nf_loginfo { + u_int8_t type; + union { + struct { + u_int32_t copy_len; + u_int16_t group; + u_int16_t qthreshold; + } ulog; + struct { + u_int8_t level; + u_int8_t logflags; + } log; + } u; +}; + +typedef void nf_logfn(unsigned int pf, + unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, + const struct nf_loginfo *li, const char *prefix); +struct nf_logger { + struct module *me; + nf_logfn *logfn; + char *name; +}; + /* Function to register/unregister log function. */ -int nf_log_register(int pf, nf_logfn *logfn); -void nf_log_unregister(int pf, nf_logfn *logfn); +int nf_log_register(int pf, struct nf_logger *logger); +int nf_log_unregister_pf(int pf); +void nf_log_unregister_logger(struct nf_logger *logger); /* Calls the registered backend logging function */ void nf_log_packet(int pf, @@ -117,6 +166,7 @@ void nf_log_packet(int pf, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, + struct nf_loginfo *li, const char *fmt, ...); /* Activate hook; either okfn or kfree_skb called, unless a hook @@ -175,11 +225,16 @@ int nf_getsockopt(struct sock *sk, int pf, int optval, char __user *opt, int *len); /* Packet queuing */ -typedef int (*nf_queue_outfn_t)(struct sk_buff *skb, - struct nf_info *info, void *data); +struct nf_queue_handler { + int (*outfn)(struct sk_buff *skb, struct nf_info *info, + unsigned int queuenum, void *data); + void *data; + char *name; +}; extern int nf_register_queue_handler(int pf, - nf_queue_outfn_t outfn, void *data); + struct nf_queue_handler *qh); extern int nf_unregister_queue_handler(int pf); +extern void nf_unregister_queue_handlers(struct nf_queue_handler *qh); extern void nf_reinject(struct sk_buff *skb, struct nf_info *info, unsigned int verdict); @@ -190,6 +245,27 @@ extern void nf_ct_attach(struct sk_buff *, struct sk_buff *); /* FIXME: Before cache is ever used, this must be implemented for real. */ extern void nf_invalidate_cache(int pf); +/* Call this before modifying an existing packet: ensures it is + modifiable and linear to the point you care about (writable_len). + Returns true or false. */ +extern int skb_make_writable(struct sk_buff **pskb, unsigned int writable_len); + +struct nf_queue_rerouter { + void (*save)(const struct sk_buff *skb, struct nf_info *info); + int (*reroute)(struct sk_buff **skb, const struct nf_info *info); + int rer_size; +}; + +#define nf_info_reroute(x) ((void *)x + sizeof(struct nf_info)) + +extern int nf_register_queue_rerouter(int pf, struct nf_queue_rerouter *rer); +extern int nf_unregister_queue_rerouter(int pf); + +#ifdef CONFIG_PROC_FS +#include <linux/proc_fs.h> +extern struct proc_dir_entry *proc_net_netfilter; +#endif + #else /* !CONFIG_NETFILTER */ #define NF_HOOK(pf, hook, skb, indev, outdev, okfn) (okfn)(skb) static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {} diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h new file mode 100644 index 00000000000..1d5b10ae239 --- /dev/null +++ b/include/linux/netfilter/nfnetlink.h @@ -0,0 +1,169 @@ +#ifndef _NFNETLINK_H +#define _NFNETLINK_H +#include <linux/types.h> + +#ifndef __KERNEL__ +/* nfnetlink groups: Up to 32 maximum - backwards compatibility for userspace */ +#define NF_NETLINK_CONNTRACK_NEW 0x00000001 +#define NF_NETLINK_CONNTRACK_UPDATE 0x00000002 +#define NF_NETLINK_CONNTRACK_DESTROY 0x00000004 +#define NF_NETLINK_CONNTRACK_EXP_NEW 0x00000008 +#define NF_NETLINK_CONNTRACK_EXP_UPDATE 0x00000010 +#define NF_NETLINK_CONNTRACK_EXP_DESTROY 0x00000020 +#endif + +enum nfnetlink_groups { + NFNLGRP_NONE, +#define NFNLGRP_NONE NFNLGRP_NONE + NFNLGRP_CONNTRACK_NEW, +#define NFNLGRP_CONNTRACK_NEW NFNLGRP_CONNTRACK_NEW + NFNLGRP_CONNTRACK_UPDATE, +#define NFNLGRP_CONNTRACK_UPDATE NFNLGRP_CONNTRACK_UPDATE + NFNLGRP_CONNTRACK_DESTROY, +#define NFNLGRP_CONNTRACK_DESTROY NFNLGRP_CONNTRACK_DESTROY + NFNLGRP_CONNTRACK_EXP_NEW, +#define NFNLGRP_CONNTRACK_EXP_NEW NFNLGRP_CONNTRACK_EXP_NEW + NFNLGRP_CONNTRACK_EXP_UPDATE, +#define NFNLGRP_CONNTRACK_EXP_UPDATE NFNLGRP_CONNTRACK_EXP_UPDATE + NFNLGRP_CONNTRACK_EXP_DESTROY, +#define NFNLGRP_CONNTRACK_EXP_DESTROY NFNLGRP_CONNTRACK_EXP_DESTROY + __NFNLGRP_MAX, +}; +#define NFNLGRP_MAX (__NFNLGRP_MAX - 1) + +/* Generic structure for encapsulation optional netfilter information. + * It is reminiscent of sockaddr, but with sa_family replaced + * with attribute type. + * ! This should someday be put somewhere generic as now rtnetlink and + * ! nfnetlink use the same attributes methods. - J. Schulist. + */ + +struct nfattr +{ + u_int16_t nfa_len; + u_int16_t nfa_type; +} __attribute__ ((packed)); + +/* FIXME: Shamelessly copy and pasted from rtnetlink.h, it's time + * to put this in a generic file */ + +#define NFA_ALIGNTO 4 +#define NFA_ALIGN(len) (((len) + NFA_ALIGNTO - 1) & ~(NFA_ALIGNTO - 1)) +#define NFA_OK(nfa,len) ((len) > 0 && (nfa)->nfa_len >= sizeof(struct nfattr) \ + && (nfa)->nfa_len <= (len)) +#define NFA_NEXT(nfa,attrlen) ((attrlen) -= NFA_ALIGN((nfa)->nfa_len), \ + (struct nfattr *)(((char *)(nfa)) + NFA_ALIGN((nfa)->nfa_len))) +#define NFA_LENGTH(len) (NFA_ALIGN(sizeof(struct nfattr)) + (len)) +#define NFA_SPACE(len) NFA_ALIGN(NFA_LENGTH(len)) +#define NFA_DATA(nfa) ((void *)(((char *)(nfa)) + NFA_LENGTH(0))) +#define NFA_PAYLOAD(nfa) ((int)((nfa)->nfa_len) - NFA_LENGTH(0)) +#define NFA_NEST(skb, type) \ +({ struct nfattr *__start = (struct nfattr *) (skb)->tail; \ + NFA_PUT(skb, type, 0, NULL); \ + __start; }) +#define NFA_NEST_END(skb, start) \ +({ (start)->nfa_len = ((skb)->tail - (unsigned char *) (start)); \ + (skb)->len; }) +#define NFA_NEST_CANCEL(skb, start) \ +({ if (start) \ + skb_trim(skb, (unsigned char *) (start) - (skb)->data); \ + -1; }) + +/* General form of address family dependent message. + */ +struct nfgenmsg { + u_int8_t nfgen_family; /* AF_xxx */ + u_int8_t version; /* nfnetlink version */ + u_int16_t res_id; /* resource id */ +} __attribute__ ((packed)); + +#define NFNETLINK_V0 0 + +#define NFM_NFA(n) ((struct nfattr *)(((char *)(n)) \ + + NLMSG_ALIGN(sizeof(struct nfgenmsg)))) +#define NFM_PAYLOAD(n) NLMSG_PAYLOAD(n, sizeof(struct nfgenmsg)) + +/* netfilter netlink message types are split in two pieces: + * 8 bit subsystem, 8bit operation. + */ + +#define NFNL_SUBSYS_ID(x) ((x & 0xff00) >> 8) +#define NFNL_MSG_TYPE(x) (x & 0x00ff) + +/* No enum here, otherwise __stringify() trick of MODULE_ALIAS_NFNL_SUBSYS() + * won't work anymore */ +#define NFNL_SUBSYS_NONE 0 +#define NFNL_SUBSYS_CTNETLINK 1 +#define NFNL_SUBSYS_CTNETLINK_EXP 2 +#define NFNL_SUBSYS_QUEUE 3 +#define NFNL_SUBSYS_ULOG 4 +#define NFNL_SUBSYS_COUNT 5 + +#ifdef __KERNEL__ + +#include <linux/netlink.h> +#include <linux/capability.h> + +struct nfnl_callback +{ + int (*call)(struct sock *nl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *cda[], int *errp); + kernel_cap_t cap_required; /* capabilities required for this msg */ + u_int16_t attr_count; /* number of nfattr's */ +}; + +struct nfnetlink_subsystem +{ + const char *name; + __u8 subsys_id; /* nfnetlink subsystem ID */ + __u8 cb_count; /* number of callbacks */ + struct nfnl_callback *cb; /* callback for individual types */ +}; + +extern void __nfa_fill(struct sk_buff *skb, int attrtype, + int attrlen, const void *data); +#define NFA_PUT(skb, attrtype, attrlen, data) \ +({ if (skb_tailroom(skb) < (int)NFA_SPACE(attrlen)) goto nfattr_failure; \ + __nfa_fill(skb, attrtype, attrlen, data); }) + +extern struct semaphore nfnl_sem; + +#define nfnl_shlock() down(&nfnl_sem) +#define nfnl_shlock_nowait() down_trylock(&nfnl_sem) + +#define nfnl_shunlock() do { up(&nfnl_sem); \ + if(nfnl && nfnl->sk_receive_queue.qlen) \ + nfnl->sk_data_ready(nfnl, 0); \ + } while(0) + +extern void nfnl_lock(void); +extern void nfnl_unlock(void); + +extern int nfnetlink_subsys_register(struct nfnetlink_subsystem *n); +extern int nfnetlink_subsys_unregister(struct nfnetlink_subsystem *n); + +extern int nfattr_parse(struct nfattr *tb[], int maxattr, + struct nfattr *nfa, int len); + +#define nfattr_parse_nested(tb, max, nfa) \ + nfattr_parse((tb), (max), NFA_DATA((nfa)), NFA_PAYLOAD((nfa))) + +#define nfattr_bad_size(tb, max, cta_min) \ +({ int __i, __res = 0; \ + for (__i=0; __i<max; __i++) \ + if (tb[__i] && NFA_PAYLOAD(tb[__i]) < cta_min[__i]){ \ + __res = 1; \ + break; \ + } \ + __res; \ +}) + +extern int nfnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, + int echo); +extern int nfnetlink_unicast(struct sk_buff *skb, u_int32_t pid, int flags); + +#define MODULE_ALIAS_NFNL_SUBSYS(subsys) \ + MODULE_ALIAS("nfnetlink-subsys-" __stringify(subsys)) + +#endif /* __KERNEL__ */ +#endif /* _NFNETLINK_H */ diff --git a/include/linux/netfilter/nfnetlink_conntrack.h b/include/linux/netfilter/nfnetlink_conntrack.h new file mode 100644 index 00000000000..5c55751c78e --- /dev/null +++ b/include/linux/netfilter/nfnetlink_conntrack.h @@ -0,0 +1,124 @@ +#ifndef _IPCONNTRACK_NETLINK_H +#define _IPCONNTRACK_NETLINK_H +#include <linux/netfilter/nfnetlink.h> + +enum cntl_msg_types { + IPCTNL_MSG_CT_NEW, + IPCTNL_MSG_CT_GET, + IPCTNL_MSG_CT_DELETE, + IPCTNL_MSG_CT_GET_CTRZERO, + + IPCTNL_MSG_MAX +}; + +enum ctnl_exp_msg_types { + IPCTNL_MSG_EXP_NEW, + IPCTNL_MSG_EXP_GET, + IPCTNL_MSG_EXP_DELETE, + + IPCTNL_MSG_EXP_MAX +}; + + +enum ctattr_type { + CTA_UNSPEC, + CTA_TUPLE_ORIG, + CTA_TUPLE_REPLY, + CTA_STATUS, + CTA_PROTOINFO, + CTA_HELP, + CTA_NAT, + CTA_TIMEOUT, + CTA_MARK, + CTA_COUNTERS_ORIG, + CTA_COUNTERS_REPLY, + CTA_USE, + CTA_ID, + __CTA_MAX +}; +#define CTA_MAX (__CTA_MAX - 1) + +enum ctattr_tuple { + CTA_TUPLE_UNSPEC, + CTA_TUPLE_IP, + CTA_TUPLE_PROTO, + __CTA_TUPLE_MAX +}; +#define CTA_TUPLE_MAX (__CTA_TUPLE_MAX - 1) + +enum ctattr_ip { + CTA_IP_UNSPEC, + CTA_IP_V4_SRC, + CTA_IP_V4_DST, + CTA_IP_V6_SRC, + CTA_IP_V6_DST, + __CTA_IP_MAX +}; +#define CTA_IP_MAX (__CTA_IP_MAX - 1) + +enum ctattr_l4proto { + CTA_PROTO_UNSPEC, + CTA_PROTO_NUM, + CTA_PROTO_SRC_PORT, + CTA_PROTO_DST_PORT, + CTA_PROTO_ICMP_ID, + CTA_PROTO_ICMP_TYPE, + CTA_PROTO_ICMP_CODE, + __CTA_PROTO_MAX +}; +#define CTA_PROTO_MAX (__CTA_PROTO_MAX - 1) + +enum ctattr_protoinfo { + CTA_PROTOINFO_UNSPEC, + CTA_PROTOINFO_TCP_STATE, + __CTA_PROTOINFO_MAX +}; +#define CTA_PROTOINFO_MAX (__CTA_PROTOINFO_MAX - 1) + +enum ctattr_counters { + CTA_COUNTERS_UNSPEC, + CTA_COUNTERS_PACKETS, + CTA_COUNTERS_BYTES, + __CTA_COUNTERS_MAX +}; +#define CTA_COUNTERS_MAX (__CTA_COUNTERS_MAX - 1) + +enum ctattr_nat { + CTA_NAT_UNSPEC, + CTA_NAT_MINIP, + CTA_NAT_MAXIP, + CTA_NAT_PROTO, + __CTA_NAT_MAX +}; +#define CTA_NAT_MAX (__CTA_NAT_MAX - 1) + +enum ctattr_protonat { + CTA_PROTONAT_UNSPEC, + CTA_PROTONAT_PORT_MIN, + CTA_PROTONAT_PORT_MAX, + __CTA_PROTONAT_MAX +}; +#define CTA_PROTONAT_MAX (__CTA_PROTONAT_MAX - 1) + +enum ctattr_expect { + CTA_EXPECT_UNSPEC, + CTA_EXPECT_MASTER, + CTA_EXPECT_TUPLE, + CTA_EXPECT_MASK, + CTA_EXPECT_TIMEOUT, + CTA_EXPECT_ID, + CTA_EXPECT_HELP_NAME, + __CTA_EXPECT_MAX +}; +#define CTA_EXPECT_MAX (__CTA_EXPECT_MAX - 1) + +enum ctattr_help { + CTA_HELP_UNSPEC, + CTA_HELP_NAME, + __CTA_HELP_MAX +}; +#define CTA_HELP_MAX (__CTA_HELP_MAX - 1) + +#define CTA_HELP_MAXNAMESIZE 32 + +#endif /* _IPCONNTRACK_NETLINK_H */ diff --git a/include/linux/netfilter/nfnetlink_log.h b/include/linux/netfilter/nfnetlink_log.h new file mode 100644 index 00000000000..b04b0388059 --- /dev/null +++ b/include/linux/netfilter/nfnetlink_log.h @@ -0,0 +1,88 @@ +#ifndef _NFNETLINK_LOG_H +#define _NFNETLINK_LOG_H + +/* This file describes the netlink messages (i.e. 'protocol packets'), + * and not any kind of function definitions. It is shared between kernel and + * userspace. Don't put kernel specific stuff in here */ + +#include <linux/types.h> +#include <linux/netfilter/nfnetlink.h> + +enum nfulnl_msg_types { + NFULNL_MSG_PACKET, /* packet from kernel to userspace */ + NFULNL_MSG_CONFIG, /* connect to a particular queue */ + + NFULNL_MSG_MAX +}; + +struct nfulnl_msg_packet_hdr { + u_int16_t hw_protocol; /* hw protocol (network order) */ + u_int8_t hook; /* netfilter hook */ + u_int8_t _pad; +} __attribute__ ((packed)); + +struct nfulnl_msg_packet_hw { + u_int16_t hw_addrlen; + u_int16_t _pad; + u_int8_t hw_addr[8]; +} __attribute__ ((packed)); + +struct nfulnl_msg_packet_timestamp { + aligned_u64 sec; + aligned_u64 usec; +} __attribute__ ((packed)); + +#define NFULNL_PREFIXLEN 30 /* just like old log target */ + +enum nfulnl_attr_type { + NFULA_UNSPEC, + NFULA_PACKET_HDR, + NFULA_MARK, /* u_int32_t nfmark */ + NFULA_TIMESTAMP, /* nfulnl_msg_packet_timestamp */ + NFULA_IFINDEX_INDEV, /* u_int32_t ifindex */ + NFULA_IFINDEX_OUTDEV, /* u_int32_t ifindex */ + NFULA_IFINDEX_PHYSINDEV, /* u_int32_t ifindex */ + NFULA_IFINDEX_PHYSOUTDEV, /* u_int32_t ifindex */ + NFULA_HWADDR, /* nfulnl_msg_packet_hw */ + NFULA_PAYLOAD, /* opaque data payload */ + NFULA_PREFIX, /* string prefix */ + NFULA_UID, /* user id of socket */ + + __NFULA_MAX +}; +#define NFULA_MAX (__NFULA_MAX - 1) + +enum nfulnl_msg_config_cmds { + NFULNL_CFG_CMD_NONE, + NFULNL_CFG_CMD_BIND, + NFULNL_CFG_CMD_UNBIND, + NFULNL_CFG_CMD_PF_BIND, + NFULNL_CFG_CMD_PF_UNBIND, +}; + +struct nfulnl_msg_config_cmd { + u_int8_t command; /* nfulnl_msg_config_cmds */ +} __attribute__ ((packed)); + +struct nfulnl_msg_config_mode { + u_int32_t copy_range; + u_int8_t copy_mode; + u_int8_t _pad; +} __attribute__ ((packed)); + +enum nfulnl_attr_config { + NFULA_CFG_UNSPEC, + NFULA_CFG_CMD, /* nfulnl_msg_config_cmd */ + NFULA_CFG_MODE, /* nfulnl_msg_config_mode */ + NFULA_CFG_NLBUFSIZ, /* u_int32_t buffer size */ + NFULA_CFG_TIMEOUT, /* u_int32_t in 1/100 s */ + NFULA_CFG_QTHRESH, /* u_int32_t */ + __NFULA_CFG_MAX +}; +#define NFULA_CFG_MAX (__NFULA_CFG_MAX -1) + +#define NFULNL_COPY_NONE 0x00 +#define NFULNL_COPY_META 0x01 +#define NFULNL_COPY_PACKET 0x02 + +#endif /* _NFNETLINK_LOG_H */ diff --git a/include/linux/netfilter/nfnetlink_queue.h b/include/linux/netfilter/nfnetlink_queue.h new file mode 100644 index 00000000000..9e774373244 --- /dev/null +++ b/include/linux/netfilter/nfnetlink_queue.h @@ -0,0 +1,89 @@ +#ifndef _NFNETLINK_QUEUE_H +#define _NFNETLINK_QUEUE_H + +#include <linux/types.h> +#include <linux/netfilter/nfnetlink.h> + +enum nfqnl_msg_types { + NFQNL_MSG_PACKET, /* packet from kernel to userspace */ + NFQNL_MSG_VERDICT, /* verdict from userspace to kernel */ + NFQNL_MSG_CONFIG, /* connect to a particular queue */ + + NFQNL_MSG_MAX +}; + +struct nfqnl_msg_packet_hdr { + u_int32_t packet_id; /* unique ID of packet in queue */ + u_int16_t hw_protocol; /* hw protocol (network order) */ + u_int8_t hook; /* netfilter hook */ +} __attribute__ ((packed)); + +struct nfqnl_msg_packet_hw { + u_int16_t hw_addrlen; + u_int16_t _pad; + u_int8_t hw_addr[8]; +} __attribute__ ((packed)); + +struct nfqnl_msg_packet_timestamp { + aligned_u64 sec; + aligned_u64 usec; +} __attribute__ ((packed)); + +enum nfqnl_attr_type { + NFQA_UNSPEC, + NFQA_PACKET_HDR, + NFQA_VERDICT_HDR, /* nfqnl_msg_verdict_hrd */ + NFQA_MARK, /* u_int32_t nfmark */ + NFQA_TIMESTAMP, /* nfqnl_msg_packet_timestamp */ + NFQA_IFINDEX_INDEV, /* u_int32_t ifindex */ + NFQA_IFINDEX_OUTDEV, /* u_int32_t ifindex */ + NFQA_IFINDEX_PHYSINDEV, /* u_int32_t ifindex */ + NFQA_IFINDEX_PHYSOUTDEV, /* u_int32_t ifindex */ + NFQA_HWADDR, /* nfqnl_msg_packet_hw */ + NFQA_PAYLOAD, /* opaque data payload */ + + __NFQA_MAX +}; +#define NFQA_MAX (__NFQA_MAX - 1) + +struct nfqnl_msg_verdict_hdr { + u_int32_t verdict; + u_int32_t id; +} __attribute__ ((packed)); + + +enum nfqnl_msg_config_cmds { + NFQNL_CFG_CMD_NONE, + NFQNL_CFG_CMD_BIND, + NFQNL_CFG_CMD_UNBIND, + NFQNL_CFG_CMD_PF_BIND, + NFQNL_CFG_CMD_PF_UNBIND, +}; + +struct nfqnl_msg_config_cmd { + u_int8_t command; /* nfqnl_msg_config_cmds */ + u_int8_t _pad; + u_int16_t pf; /* AF_xxx for PF_[UN]BIND */ +} __attribute__ ((packed)); + +enum nfqnl_config_mode { + NFQNL_COPY_NONE, + NFQNL_COPY_META, + NFQNL_COPY_PACKET, +}; + +struct nfqnl_msg_config_params { + u_int32_t copy_range; + u_int8_t copy_mode; /* enum nfqnl_config_mode */ +} __attribute__ ((packed)); + + +enum nfqnl_attr_config { + NFQA_CFG_UNSPEC, + NFQA_CFG_CMD, /* nfqnl_msg_config_cmd */ + NFQA_CFG_PARAMS, /* nfqnl_msg_config_params */ + __NFQA_CFG_MAX +}; +#define NFQA_CFG_MAX (__NFQA_CFG_MAX-1) + +#endif /* _NFNETLINK_QUEUE_H */ diff --git a/include/linux/netfilter_decnet.h b/include/linux/netfilter_decnet.h index 3064eec9cb8..6f425369ee2 100644 --- a/include/linux/netfilter_decnet.h +++ b/include/linux/netfilter_decnet.h @@ -9,6 +9,8 @@ #include <linux/netfilter.h> +/* only for userspace compatibility */ +#ifndef __KERNEL__ /* IP Cache bits. */ /* Src IP address. */ #define NFC_DN_SRC 0x0001 @@ -18,6 +20,7 @@ #define NFC_DN_IF_IN 0x0004 /* Output device. */ #define NFC_DN_IF_OUT 0x0008 +#endif /* ! __KERNEL__ */ /* DECnet Hooks */ /* After promisc drops, checksum checks. */ @@ -53,7 +56,21 @@ struct nf_dn_rtmsg { #define NFDN_RTMSG(r) ((unsigned char *)(r) + NLMSG_ALIGN(sizeof(struct nf_dn_rtmsg))) +#ifndef __KERNEL__ +/* backwards compatibility for userspace */ #define DNRMG_L1_GROUP 0x01 #define DNRMG_L2_GROUP 0x02 +#endif + +enum { + DNRNG_NLGRP_NONE, +#define DNRNG_NLGRP_NONE DNRNG_NLGRP_NONE + DNRNG_NLGRP_L1, +#define DNRNG_NLGRP_L1 DNRNG_NLGRP_L1 + DNRNG_NLGRP_L2, +#define DNRNG_NLGRP_L2 DNRNG_NLGRP_L2 + __DNRNG_NLGRP_MAX +}; +#define DNRNG_NLGRP_MAX (__DNRNG_NLGRP_MAX - 1) #endif /*__LINUX_DECNET_NETFILTER_H*/ diff --git a/include/linux/netfilter_ipv4.h b/include/linux/netfilter_ipv4.h index 3ebc36afae1..fdc4a952734 100644 --- a/include/linux/netfilter_ipv4.h +++ b/include/linux/netfilter_ipv4.h @@ -8,6 +8,8 @@ #include <linux/config.h> #include <linux/netfilter.h> +/* only for userspace compatibility */ +#ifndef __KERNEL__ /* IP Cache bits. */ /* Src IP address. */ #define NFC_IP_SRC 0x0001 @@ -35,6 +37,7 @@ #define NFC_IP_DST_PT 0x0400 /* Something else about the proto */ #define NFC_IP_PROTO_UNKNOWN 0x2000 +#endif /* ! __KERNEL__ */ /* IP Hooks */ /* After promisc drops, checksum checks. */ @@ -77,11 +80,6 @@ enum nf_ip_hook_priorities { #ifdef __KERNEL__ extern int ip_route_me_harder(struct sk_buff **pskb); -/* Call this before modifying an existing IP packet: ensures it is - modifiable and linear to the point you care about (writable_len). - Returns true or false. */ -extern int skb_ip_make_writable(struct sk_buff **pskb, - unsigned int writable_len); #endif /*__KERNEL__*/ #endif /*__LINUX_IP_NETFILTER_H*/ diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h index 08fe5f7d14a..088742befe4 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack.h +++ b/include/linux/netfilter_ipv4/ip_conntrack.h @@ -65,6 +65,63 @@ enum ip_conntrack_status { /* Both together */ IPS_NAT_DONE_MASK = (IPS_DST_NAT_DONE | IPS_SRC_NAT_DONE), + + /* Connection is dying (removed from lists), can not be unset. */ + IPS_DYING_BIT = 9, + IPS_DYING = (1 << IPS_DYING_BIT), +}; + +/* Connection tracking event bits */ +enum ip_conntrack_events +{ + /* New conntrack */ + IPCT_NEW_BIT = 0, + IPCT_NEW = (1 << IPCT_NEW_BIT), + + /* Expected connection */ + IPCT_RELATED_BIT = 1, + IPCT_RELATED = (1 << IPCT_RELATED_BIT), + + /* Destroyed conntrack */ + IPCT_DESTROY_BIT = 2, + IPCT_DESTROY = (1 << IPCT_DESTROY_BIT), + + /* Timer has been refreshed */ + IPCT_REFRESH_BIT = 3, + IPCT_REFRESH = (1 << IPCT_REFRESH_BIT), + + /* Status has changed */ + IPCT_STATUS_BIT = 4, + IPCT_STATUS = (1 << IPCT_STATUS_BIT), + + /* Update of protocol info */ + IPCT_PROTOINFO_BIT = 5, + IPCT_PROTOINFO = (1 << IPCT_PROTOINFO_BIT), + + /* Volatile protocol info */ + IPCT_PROTOINFO_VOLATILE_BIT = 6, + IPCT_PROTOINFO_VOLATILE = (1 << IPCT_PROTOINFO_VOLATILE_BIT), + + /* New helper for conntrack */ + IPCT_HELPER_BIT = 7, + IPCT_HELPER = (1 << IPCT_HELPER_BIT), + + /* Update of helper info */ + IPCT_HELPINFO_BIT = 8, + IPCT_HELPINFO = (1 << IPCT_HELPINFO_BIT), + + /* Volatile helper info */ + IPCT_HELPINFO_VOLATILE_BIT = 9, + IPCT_HELPINFO_VOLATILE = (1 << IPCT_HELPINFO_VOLATILE_BIT), + + /* NAT info */ + IPCT_NATINFO_BIT = 10, + IPCT_NATINFO = (1 << IPCT_NATINFO_BIT), +}; + +enum ip_conntrack_expect_events { + IPEXP_NEW_BIT = 0, + IPEXP_NEW = (1 << IPEXP_NEW_BIT), }; #ifdef __KERNEL__ @@ -152,6 +209,9 @@ struct ip_conntrack /* Current number of expected connections */ unsigned int expecting; + /* Unique ID that identifies this conntrack*/ + unsigned int id; + /* Helper, if any. */ struct ip_conntrack_helper *helper; @@ -171,7 +231,7 @@ struct ip_conntrack #endif /* CONFIG_IP_NF_NAT_NEEDED */ #if defined(CONFIG_IP_NF_CONNTRACK_MARK) - unsigned long mark; + u_int32_t mark; #endif /* Traversed often, so hopefully in different cacheline to top */ @@ -200,6 +260,9 @@ struct ip_conntrack_expect /* Usage count. */ atomic_t use; + /* Unique ID */ + unsigned int id; + #ifdef CONFIG_IP_NF_NAT_NEEDED /* This is the original per-proto part, used to map the * expected connection the way the recipient expects. */ @@ -239,7 +302,12 @@ ip_conntrack_get(const struct sk_buff *skb, enum ip_conntrack_info *ctinfo) } /* decrement reference count on a conntrack */ -extern void ip_conntrack_put(struct ip_conntrack *ct); +static inline void +ip_conntrack_put(struct ip_conntrack *ct) +{ + IP_NF_ASSERT(ct); + nf_conntrack_put(&ct->ct_general); +} /* call to create an explicit dependency on ip_conntrack. */ extern void need_ip_conntrack(void); @@ -274,12 +342,50 @@ extern void ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *data), void *data); +extern struct ip_conntrack_helper * +__ip_conntrack_helper_find_byname(const char *); +extern struct ip_conntrack_helper * +ip_conntrack_helper_find_get(const struct ip_conntrack_tuple *tuple); +extern void ip_conntrack_helper_put(struct ip_conntrack_helper *helper); + +extern struct ip_conntrack_protocol * +__ip_conntrack_proto_find(u_int8_t protocol); +extern struct ip_conntrack_protocol * +ip_conntrack_proto_find_get(u_int8_t protocol); +extern void ip_conntrack_proto_put(struct ip_conntrack_protocol *proto); + +extern void ip_ct_remove_expectations(struct ip_conntrack *ct); + +extern struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *, + struct ip_conntrack_tuple *); + +extern void ip_conntrack_free(struct ip_conntrack *ct); + +extern void ip_conntrack_hash_insert(struct ip_conntrack *ct); + +extern struct ip_conntrack_expect * +__ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple); + +extern struct ip_conntrack_expect * +ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple); + +extern struct ip_conntrack_tuple_hash * +__ip_conntrack_find(const struct ip_conntrack_tuple *tuple, + const struct ip_conntrack *ignored_conntrack); + +extern void ip_conntrack_flush(void); + /* It's confirmed if it is, or has been in the hash table. */ static inline int is_confirmed(struct ip_conntrack *ct) { return test_bit(IPS_CONFIRMED_BIT, &ct->status); } +static inline int is_dying(struct ip_conntrack *ct) +{ + return test_bit(IPS_DYING_BIT, &ct->status); +} + extern unsigned int ip_conntrack_htable_size; struct ip_conntrack_stat @@ -303,6 +409,85 @@ struct ip_conntrack_stat #define CONNTRACK_STAT_INC(count) (__get_cpu_var(ip_conntrack_stat).count++) +#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS +#include <linux/notifier.h> +#include <linux/interrupt.h> + +struct ip_conntrack_ecache { + struct ip_conntrack *ct; + unsigned int events; +}; +DECLARE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache); + +#define CONNTRACK_ECACHE(x) (__get_cpu_var(ip_conntrack_ecache).x) + +extern struct notifier_block *ip_conntrack_chain; +extern struct notifier_block *ip_conntrack_expect_chain; + +static inline int ip_conntrack_register_notifier(struct notifier_block *nb) +{ + return notifier_chain_register(&ip_conntrack_chain, nb); +} + +static inline int ip_conntrack_unregister_notifier(struct notifier_block *nb) +{ + return notifier_chain_unregister(&ip_conntrack_chain, nb); +} + +static inline int +ip_conntrack_expect_register_notifier(struct notifier_block *nb) +{ + return notifier_chain_register(&ip_conntrack_expect_chain, nb); +} + +static inline int +ip_conntrack_expect_unregister_notifier(struct notifier_block *nb) +{ + return notifier_chain_unregister(&ip_conntrack_expect_chain, nb); +} + +extern void ip_ct_deliver_cached_events(const struct ip_conntrack *ct); +extern void __ip_ct_event_cache_init(struct ip_conntrack *ct); + +static inline void +ip_conntrack_event_cache(enum ip_conntrack_events event, + const struct sk_buff *skb) +{ + struct ip_conntrack *ct = (struct ip_conntrack *)skb->nfct; + struct ip_conntrack_ecache *ecache; + + local_bh_disable(); + ecache = &__get_cpu_var(ip_conntrack_ecache); + if (ct != ecache->ct) + __ip_ct_event_cache_init(ct); + ecache->events |= event; + local_bh_enable(); +} + +static inline void ip_conntrack_event(enum ip_conntrack_events event, + struct ip_conntrack *ct) +{ + if (is_confirmed(ct) && !is_dying(ct)) + notifier_call_chain(&ip_conntrack_chain, event, ct); +} + +static inline void +ip_conntrack_expect_event(enum ip_conntrack_expect_events event, + struct ip_conntrack_expect *exp) +{ + notifier_call_chain(&ip_conntrack_expect_chain, event, exp); +} +#else /* CONFIG_IP_NF_CONNTRACK_EVENTS */ +static inline void ip_conntrack_event_cache(enum ip_conntrack_events event, + const struct sk_buff *skb) {} +static inline void ip_conntrack_event(enum ip_conntrack_events event, + struct ip_conntrack *ct) {} +static inline void ip_ct_deliver_cached_events(const struct ip_conntrack *ct) {} +static inline void +ip_conntrack_expect_event(enum ip_conntrack_expect_events event, + struct ip_conntrack_expect *exp) {} +#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */ + #ifdef CONFIG_IP_NF_NAT_NEEDED static inline int ip_nat_initialized(struct ip_conntrack *conntrack, enum ip_nat_manip_type manip) diff --git a/include/linux/netfilter_ipv4/ip_conntrack_core.h b/include/linux/netfilter_ipv4/ip_conntrack_core.h index 694aec9b478..dc4d2a0575d 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack_core.h +++ b/include/linux/netfilter_ipv4/ip_conntrack_core.h @@ -2,6 +2,9 @@ #define _IP_CONNTRACK_CORE_H #include <linux/netfilter.h> +#define MAX_IP_CT_PROTO 256 +extern struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO]; + /* This header is used to share core functionality between the standalone connection tracking module, and the compatibility layer's use of connection tracking. */ @@ -38,12 +41,19 @@ extern int __ip_conntrack_confirm(struct sk_buff **pskb); /* Confirm a connection: returns NF_DROP if packet must be dropped. */ static inline int ip_conntrack_confirm(struct sk_buff **pskb) { - if ((*pskb)->nfct - && !is_confirmed((struct ip_conntrack *)(*pskb)->nfct)) - return __ip_conntrack_confirm(pskb); - return NF_ACCEPT; + struct ip_conntrack *ct = (struct ip_conntrack *)(*pskb)->nfct; + int ret = NF_ACCEPT; + + if (ct) { + if (!is_confirmed(ct)) + ret = __ip_conntrack_confirm(pskb); + ip_ct_deliver_cached_events(ct); + } + return ret; } +extern void __ip_ct_expect_unlink_destroy(struct ip_conntrack_expect *exp); + extern struct list_head *ip_conntrack_hash; extern struct list_head ip_conntrack_expect_list; extern rwlock_t ip_conntrack_lock; diff --git a/include/linux/netfilter_ipv4/ip_conntrack_helper.h b/include/linux/netfilter_ipv4/ip_conntrack_helper.h index 3692daa93de..8d69279ccfe 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack_helper.h +++ b/include/linux/netfilter_ipv4/ip_conntrack_helper.h @@ -24,6 +24,8 @@ struct ip_conntrack_helper int (*help)(struct sk_buff **pskb, struct ip_conntrack *ct, enum ip_conntrack_info conntrackinfo); + + int (*to_nfattr)(struct sk_buff *skb, const struct ip_conntrack *ct); }; extern int ip_conntrack_helper_register(struct ip_conntrack_helper *); diff --git a/include/linux/netfilter_ipv4/ip_conntrack_protocol.h b/include/linux/netfilter_ipv4/ip_conntrack_protocol.h index e20b57c5e1b..b6b99be8632 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack_protocol.h +++ b/include/linux/netfilter_ipv4/ip_conntrack_protocol.h @@ -2,6 +2,7 @@ #ifndef _IP_CONNTRACK_PROTOCOL_H #define _IP_CONNTRACK_PROTOCOL_H #include <linux/netfilter_ipv4/ip_conntrack.h> +#include <linux/netfilter/nfnetlink_conntrack.h> struct seq_file; @@ -47,22 +48,22 @@ struct ip_conntrack_protocol int (*error)(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, unsigned int hooknum); + /* convert protoinfo to nfnetink attributes */ + int (*to_nfattr)(struct sk_buff *skb, struct nfattr *nfa, + const struct ip_conntrack *ct); + + int (*tuple_to_nfattr)(struct sk_buff *skb, + const struct ip_conntrack_tuple *t); + int (*nfattr_to_tuple)(struct nfattr *tb[], + struct ip_conntrack_tuple *t); + /* Module (if any) which this is connected to. */ struct module *me; }; -#define MAX_IP_CT_PROTO 256 -extern struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO]; - /* Protocol registration. */ extern int ip_conntrack_protocol_register(struct ip_conntrack_protocol *proto); extern void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto); - -static inline struct ip_conntrack_protocol *ip_ct_find_proto(u_int8_t protocol) -{ - return ip_ct_protos[protocol]; -} - /* Existing built-in protocols */ extern struct ip_conntrack_protocol ip_conntrack_protocol_tcp; extern struct ip_conntrack_protocol ip_conntrack_protocol_udp; @@ -73,6 +74,11 @@ extern int ip_conntrack_protocol_tcp_init(void); /* Log invalid packets */ extern unsigned int ip_ct_log_invalid; +extern int ip_ct_port_tuple_to_nfattr(struct sk_buff *, + const struct ip_conntrack_tuple *); +extern int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[], + struct ip_conntrack_tuple *); + #ifdef CONFIG_SYSCTL #ifdef DEBUG_INVALID_PACKETS #define LOG_INVALID(proto) \ diff --git a/include/linux/netfilter_ipv4/ip_logging.h b/include/linux/netfilter_ipv4/ip_logging.h deleted file mode 100644 index 0c5c52cb658..00000000000 --- a/include/linux/netfilter_ipv4/ip_logging.h +++ /dev/null @@ -1,20 +0,0 @@ -/* IPv4 macros for the internal logging interface. */ -#ifndef __IP_LOGGING_H -#define __IP_LOGGING_H - -#ifdef __KERNEL__ -#include <linux/socket.h> -#include <linux/netfilter_logging.h> - -#define nf_log_ip_packet(pskb,hooknum,in,out,fmt,args...) \ - nf_log_packet(AF_INET,pskb,hooknum,in,out,fmt,##args) - -#define nf_log_ip(pfh,len,fmt,args...) \ - nf_log(AF_INET,pfh,len,fmt,##args) - -#define nf_ip_log_register(logging) nf_log_register(AF_INET,logging) -#define nf_ip_log_unregister(logging) nf_log_unregister(AF_INET,logging) - -#endif /*__KERNEL__*/ - -#endif /*__IP_LOGGING_H*/ diff --git a/include/linux/netfilter_ipv4/ip_nat_protocol.h b/include/linux/netfilter_ipv4/ip_nat_protocol.h index 129708c2238..ef63aa991a0 100644 --- a/include/linux/netfilter_ipv4/ip_nat_protocol.h +++ b/include/linux/netfilter_ipv4/ip_nat_protocol.h @@ -4,6 +4,9 @@ #include <linux/init.h> #include <linux/list.h> +#include <linux/netfilter_ipv4/ip_nat.h> +#include <linux/netfilter/nfnetlink_conntrack.h> + struct iphdr; struct ip_nat_range; @@ -15,6 +18,8 @@ struct ip_nat_protocol /* Protocol number. */ unsigned int protonum; + struct module *me; + /* Translate a packet to the target according to manip type. Return true if succeeded. */ int (*manip_pkt)(struct sk_buff **pskb, @@ -43,19 +48,20 @@ struct ip_nat_protocol unsigned int (*print_range)(char *buffer, const struct ip_nat_range *range); -}; -#define MAX_IP_NAT_PROTO 256 -extern struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO]; + int (*range_to_nfattr)(struct sk_buff *skb, + const struct ip_nat_range *range); + + int (*nfattr_to_range)(struct nfattr *tb[], + struct ip_nat_range *range); +}; /* Protocol registration. */ extern int ip_nat_protocol_register(struct ip_nat_protocol *proto); extern void ip_nat_protocol_unregister(struct ip_nat_protocol *proto); -static inline struct ip_nat_protocol *ip_nat_find_proto(u_int8_t protocol) -{ - return ip_nat_protos[protocol]; -} +extern struct ip_nat_protocol *ip_nat_proto_find_get(u_int8_t protocol); +extern void ip_nat_proto_put(struct ip_nat_protocol *proto); /* Built-in protocols. */ extern struct ip_nat_protocol ip_nat_protocol_tcp; @@ -67,4 +73,9 @@ extern int init_protocols(void) __init; extern void cleanup_protocols(void); extern struct ip_nat_protocol *find_nat_proto(u_int16_t protonum); +extern int ip_nat_port_range_to_nfattr(struct sk_buff *skb, + const struct ip_nat_range *range); +extern int ip_nat_port_nfattr_to_range(struct nfattr *tb[], + struct ip_nat_range *range); + #endif /*_IP_NAT_PROTO_H*/ diff --git a/include/linux/netfilter_ipv4/ip_tables.h b/include/linux/netfilter_ipv4/ip_tables.h index 12ce47808e7..d19d65cf453 100644 --- a/include/linux/netfilter_ipv4/ip_tables.h +++ b/include/linux/netfilter_ipv4/ip_tables.h @@ -109,7 +109,8 @@ struct ipt_counters /* Values for "flag" field in struct ipt_ip (general ip structure). */ #define IPT_F_FRAG 0x01 /* Set if rule is a fragment rule */ -#define IPT_F_MASK 0x01 /* All possible flag bits mask. */ +#define IPT_F_GOTO 0x02 /* Set if jump is a goto */ +#define IPT_F_MASK 0x03 /* All possible flag bits mask. */ /* Values for "inv" field in struct ipt_ip. */ #define IPT_INV_VIA_IN 0x01 /* Invert the sense of IN IFACE. */ diff --git a/include/linux/netfilter_ipv4/ipt_LOG.h b/include/linux/netfilter_ipv4/ipt_LOG.h index d25f782e57d..22d16177319 100644 --- a/include/linux/netfilter_ipv4/ipt_LOG.h +++ b/include/linux/netfilter_ipv4/ipt_LOG.h @@ -1,6 +1,7 @@ #ifndef _IPT_LOG_H #define _IPT_LOG_H +/* make sure not to change this without changing netfilter.h:NF_LOG_* (!) */ #define IPT_LOG_TCPSEQ 0x01 /* Log TCP sequence numbers */ #define IPT_LOG_TCPOPT 0x02 /* Log TCP options */ #define IPT_LOG_IPOPT 0x04 /* Log IP options */ diff --git a/include/linux/netfilter_ipv4/ipt_NFQUEUE.h b/include/linux/netfilter_ipv4/ipt_NFQUEUE.h new file mode 100644 index 00000000000..b5b2943b0c6 --- /dev/null +++ b/include/linux/netfilter_ipv4/ipt_NFQUEUE.h @@ -0,0 +1,16 @@ +/* iptables module for using NFQUEUE mechanism + * + * (C) 2005 Harald Welte <laforge@netfilter.org> + * + * This software is distributed under GNU GPL v2, 1991 + * +*/ +#ifndef _IPT_NFQ_TARGET_H +#define _IPT_NFQ_TARGET_H + +/* target info */ +struct ipt_NFQ_info { + u_int16_t queuenum; +}; + +#endif /* _IPT_DSCP_TARGET_H */ diff --git a/include/linux/netfilter_ipv4/ipt_TTL.h b/include/linux/netfilter_ipv4/ipt_TTL.h new file mode 100644 index 00000000000..ee6611edc11 --- /dev/null +++ b/include/linux/netfilter_ipv4/ipt_TTL.h @@ -0,0 +1,21 @@ +/* TTL modification module for IP tables + * (C) 2000 by Harald Welte <laforge@netfilter.org> */ + +#ifndef _IPT_TTL_H +#define _IPT_TTL_H + +enum { + IPT_TTL_SET = 0, + IPT_TTL_INC, + IPT_TTL_DEC +}; + +#define IPT_TTL_MAXMODE IPT_TTL_DEC + +struct ipt_TTL_info { + u_int8_t mode; + u_int8_t ttl; +}; + + +#endif diff --git a/include/linux/netfilter_ipv4/ipt_connbytes.h b/include/linux/netfilter_ipv4/ipt_connbytes.h new file mode 100644 index 00000000000..9e5532f8d8a --- /dev/null +++ b/include/linux/netfilter_ipv4/ipt_connbytes.h @@ -0,0 +1,25 @@ +#ifndef _IPT_CONNBYTES_H +#define _IPT_CONNBYTES_H + +enum ipt_connbytes_what { + IPT_CONNBYTES_PKTS, + IPT_CONNBYTES_BYTES, + IPT_CONNBYTES_AVGPKT, +}; + +enum ipt_connbytes_direction { + IPT_CONNBYTES_DIR_ORIGINAL, + IPT_CONNBYTES_DIR_REPLY, + IPT_CONNBYTES_DIR_BOTH, +}; + +struct ipt_connbytes_info +{ + struct { + aligned_u64 from; /* count to be matched */ + aligned_u64 to; /* count to be matched */ + } count; + u_int8_t what; /* ipt_connbytes_what */ + u_int8_t direction; /* ipt_connbytes_direction */ +}; +#endif diff --git a/include/linux/netfilter_ipv4/ipt_dccp.h b/include/linux/netfilter_ipv4/ipt_dccp.h new file mode 100644 index 00000000000..3cb3a522e62 --- /dev/null +++ b/include/linux/netfilter_ipv4/ipt_dccp.h @@ -0,0 +1,23 @@ +#ifndef _IPT_DCCP_H_ +#define _IPT_DCCP_H_ + +#define IPT_DCCP_SRC_PORTS 0x01 +#define IPT_DCCP_DEST_PORTS 0x02 +#define IPT_DCCP_TYPE 0x04 +#define IPT_DCCP_OPTION 0x08 + +#define IPT_DCCP_VALID_FLAGS 0x0f + +struct ipt_dccp_info { + u_int16_t dpts[2]; /* Min, Max */ + u_int16_t spts[2]; /* Min, Max */ + + u_int16_t flags; + u_int16_t invflags; + + u_int16_t typemask; + u_int8_t option; +}; + +#endif /* _IPT_DCCP_H_ */ + diff --git a/include/linux/netfilter_ipv4/ipt_string.h b/include/linux/netfilter_ipv4/ipt_string.h new file mode 100644 index 00000000000..a265f6e44ea --- /dev/null +++ b/include/linux/netfilter_ipv4/ipt_string.h @@ -0,0 +1,18 @@ +#ifndef _IPT_STRING_H +#define _IPT_STRING_H + +#define IPT_STRING_MAX_PATTERN_SIZE 128 +#define IPT_STRING_MAX_ALGO_NAME_SIZE 16 + +struct ipt_string_info +{ + u_int16_t from_offset; + u_int16_t to_offset; + char algo[IPT_STRING_MAX_ALGO_NAME_SIZE]; + char pattern[IPT_STRING_MAX_PATTERN_SIZE]; + u_int8_t patlen; + u_int8_t invert; + struct ts_config __attribute__((aligned(8))) *config; +}; + +#endif /*_IPT_STRING_H*/ diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h index bee7a5ec7c6..edcc2c6eb5c 100644 --- a/include/linux/netfilter_ipv6.h +++ b/include/linux/netfilter_ipv6.h @@ -10,6 +10,8 @@ #include <linux/netfilter.h> +/* only for userspace compatibility */ +#ifndef __KERNEL__ /* IP Cache bits. */ /* Src IP address. */ #define NFC_IP6_SRC 0x0001 @@ -38,6 +40,7 @@ #define NFC_IP6_DST_PT 0x0400 /* Something else about the proto */ #define NFC_IP6_PROTO_UNKNOWN 0x2000 +#endif /* ! __KERNEL__ */ /* IP6 Hooks */ @@ -68,4 +71,7 @@ enum nf_ip6_hook_priorities { NF_IP6_PRI_LAST = INT_MAX, }; +extern int ipv6_netfilter_init(void); +extern void ipv6_netfilter_fini(void); + #endif /*__LINUX_IP6_NETFILTER_H*/ diff --git a/include/linux/netfilter_ipv6/ip6_logging.h b/include/linux/netfilter_ipv6/ip6_logging.h deleted file mode 100644 index a0b2ee3043a..00000000000 --- a/include/linux/netfilter_ipv6/ip6_logging.h +++ /dev/null @@ -1,20 +0,0 @@ -/* IPv6 macros for the nternal logging interface. */ -#ifndef __IP6_LOGGING_H -#define __IP6_LOGGING_H - -#ifdef __KERNEL__ -#include <linux/socket.h> -#include <linux/netfilter_logging.h> - -#define nf_log_ip6_packet(pskb,hooknum,in,out,fmt,args...) \ - nf_log_packet(AF_INET6,pskb,hooknum,in,out,fmt,##args) - -#define nf_log_ip6(pfh,len,fmt,args...) \ - nf_log(AF_INET6,pfh,len,fmt,##args) - -#define nf_ip6_log_register(logging) nf_log_register(AF_INET6,logging) -#define nf_ip6_log_unregister(logging) nf_log_unregister(AF_INET6,logging) - -#endif /*__KERNEL__*/ - -#endif /*__IP6_LOGGING_H*/ diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h index f1ce3b00985..58c72a52dc6 100644 --- a/include/linux/netfilter_ipv6/ip6_tables.h +++ b/include/linux/netfilter_ipv6/ip6_tables.h @@ -111,7 +111,8 @@ struct ip6t_counters #define IP6T_F_PROTO 0x01 /* Set if rule cares about upper protocols */ #define IP6T_F_TOS 0x02 /* Match the TOS. */ -#define IP6T_F_MASK 0x03 /* All possible flag bits mask. */ +#define IP6T_F_GOTO 0x04 /* Set if jump is a goto */ +#define IP6T_F_MASK 0x07 /* All possible flag bits mask. */ /* Values for "inv" field in struct ip6t_ip6. */ #define IP6T_INV_VIA_IN 0x01 /* Invert the sense of IN IFACE. */ diff --git a/include/linux/netfilter_ipv6/ip6t_HL.h b/include/linux/netfilter_ipv6/ip6t_HL.h new file mode 100644 index 00000000000..afb7813d45a --- /dev/null +++ b/include/linux/netfilter_ipv6/ip6t_HL.h @@ -0,0 +1,22 @@ +/* Hop Limit modification module for ip6tables + * Maciej Soltysiak <solt@dns.toxicfilms.tv> + * Based on HW's TTL module */ + +#ifndef _IP6T_HL_H +#define _IP6T_HL_H + +enum { + IP6T_HL_SET = 0, + IP6T_HL_INC, + IP6T_HL_DEC +}; + +#define IP6T_HL_MAXMODE IP6T_HL_DEC + +struct ip6t_HL_info { + u_int8_t mode; + u_int8_t hop_limit; +}; + + +#endif diff --git a/include/linux/netfilter_ipv6/ip6t_LOG.h b/include/linux/netfilter_ipv6/ip6t_LOG.h index 42996a43bb3..9008ff5c40a 100644 --- a/include/linux/netfilter_ipv6/ip6t_LOG.h +++ b/include/linux/netfilter_ipv6/ip6t_LOG.h @@ -1,6 +1,7 @@ #ifndef _IP6T_LOG_H #define _IP6T_LOG_H +/* make sure not to change this without changing netfilter.h:NF_LOG_* (!) */ #define IP6T_LOG_TCPSEQ 0x01 /* Log TCP sequence numbers */ #define IP6T_LOG_TCPOPT 0x02 /* Log TCP options */ #define IP6T_LOG_IPOPT 0x04 /* Log IP options */ diff --git a/include/linux/netfilter_ipv6/ip6t_REJECT.h b/include/linux/netfilter_ipv6/ip6t_REJECT.h new file mode 100644 index 00000000000..6be6504162b --- /dev/null +++ b/include/linux/netfilter_ipv6/ip6t_REJECT.h @@ -0,0 +1,18 @@ +#ifndef _IP6T_REJECT_H +#define _IP6T_REJECT_H + +enum ip6t_reject_with { + IP6T_ICMP6_NO_ROUTE, + IP6T_ICMP6_ADM_PROHIBITED, + IP6T_ICMP6_NOT_NEIGHBOUR, + IP6T_ICMP6_ADDR_UNREACH, + IP6T_ICMP6_PORT_UNREACH, + IP6T_ICMP6_ECHOREPLY, + IP6T_TCP_RESET +}; + +struct ip6t_reject_info { + u_int32_t with; /* reject type */ +}; + +#endif /*_IP6T_REJECT_H*/ diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 6552b71bfa7..16751866893 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -8,7 +8,7 @@ #define NETLINK_W1 1 /* 1-wire subsystem */ #define NETLINK_USERSOCK 2 /* Reserved for user mode socket protocols */ #define NETLINK_FIREWALL 3 /* Firewalling hook */ -#define NETLINK_TCPDIAG 4 /* TCP socket monitoring */ +#define NETLINK_INET_DIAG 4 /* INET socket monitoring */ #define NETLINK_NFLOG 5 /* netfilter/iptables ULOG */ #define NETLINK_XFRM 6 /* ipsec */ #define NETLINK_SELINUX 7 /* SELinux event notifications */ @@ -90,6 +90,15 @@ struct nlmsgerr struct nlmsghdr msg; }; +#define NETLINK_ADD_MEMBERSHIP 1 +#define NETLINK_DROP_MEMBERSHIP 2 +#define NETLINK_PKTINFO 3 + +struct nl_pktinfo +{ + __u32 group; +}; + #define NET_MAJOR 36 /* Major 36 is reserved for networking */ enum { @@ -106,9 +115,8 @@ struct netlink_skb_parms { struct ucred creds; /* Skb credentials */ __u32 pid; - __u32 groups; __u32 dst_pid; - __u32 dst_groups; + __u32 dst_group; kernel_cap_t eff_cap; __u32 loginuid; /* Login (audit) uid */ }; @@ -117,11 +125,11 @@ struct netlink_skb_parms #define NETLINK_CREDS(skb) (&NETLINK_CB((skb)).creds) -extern struct sock *netlink_kernel_create(int unit, void (*input)(struct sock *sk, int len)); +extern struct sock *netlink_kernel_create(int unit, unsigned int groups, void (*input)(struct sock *sk, int len), struct module *module); extern void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err); extern int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 pid, int nonblock); extern int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 pid, - __u32 group, int allocation); + __u32 group, unsigned int __nocast allocation); extern void netlink_set_err(struct sock *ssk, __u32 pid, __u32 group, int code); extern int netlink_register_notifier(struct notifier_block *nb); extern int netlink_unregister_notifier(struct notifier_block *nb); diff --git a/include/linux/random.h b/include/linux/random.h index cc670344991..7b2adb3322d 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -59,6 +59,8 @@ extern __u32 secure_tcp_sequence_number(__u32 saddr, __u32 daddr, __u16 sport, __u16 dport); extern __u32 secure_tcpv6_sequence_number(__u32 *saddr, __u32 *daddr, __u16 sport, __u16 dport); +extern u64 secure_dccp_sequence_number(__u32 saddr, __u32 daddr, + __u16 sport, __u16 dport); #ifndef MODULE extern struct file_operations random_fops, urandom_fops; diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 657c05ab8f9..c231e9a08f0 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -826,9 +826,8 @@ enum #define TCA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcmsg)))) #define TCA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcmsg)) - -/* RTnetlink multicast groups */ - +#ifndef __KERNEL__ +/* RTnetlink multicast groups - backwards compatibility for userspace */ #define RTMGRP_LINK 1 #define RTMGRP_NOTIFY 2 #define RTMGRP_NEIGH 4 @@ -847,6 +846,43 @@ enum #define RTMGRP_DECnet_ROUTE 0x4000 #define RTMGRP_IPV6_PREFIX 0x20000 +#endif + +/* RTnetlink multicast groups */ +enum rtnetlink_groups { + RTNLGRP_NONE, +#define RTNLGRP_NONE RTNLGRP_NONE + RTNLGRP_LINK, +#define RTNLGRP_LINK RTNLGRP_LINK + RTNLGRP_NOTIFY, +#define RTNLGRP_NOTIFY RTNLGRP_NOTIFY + RTNLGRP_NEIGH, +#define RTNLGRP_NEIGH RTNLGRP_NEIGH + RTNLGRP_TC, +#define RTNLGRP_TC RTNLGRP_TC + RTNLGRP_IPV4_IFADDR, +#define RTNLGRP_IPV4_IFADDR RTNLGRP_IPV4_IFADDR + RTNLGRP_IPV4_MROUTE, +#define RTNLGRP_IPV4_MROUTE RTNLGRP_IPV4_MROUTE + RTNLGRP_IPV4_ROUTE, +#define RTNLGRP_IPV4_ROUTE RTNLGRP_IPV4_ROUTE + RTNLGRP_IPV6_IFADDR, +#define RTNLGRP_IPV6_IFADDR RTNLGRP_IPV6_IFADDR + RTNLGRP_IPV6_MROUTE, +#define RTNLGRP_IPV6_MROUTE RTNLGRP_IPV6_MROUTE + RTNLGRP_IPV6_ROUTE, +#define RTNLGRP_IPV6_ROUTE RTNLGRP_IPV6_ROUTE + RTNLGRP_IPV6_IFINFO, +#define RTNLGRP_IPV6_IFINFO RTNLGRP_IPV6_IFINFO + RTNLGRP_DECnet_IFADDR, +#define RTNLGRP_DECnet_IFADDR RTNLGRP_DECnet_IFADDR + RTNLGRP_DECnet_ROUTE, +#define RTNLGRP_DECnet_ROUTE RTNLGRP_DECnet_ROUTE + RTNLGRP_IPV6_PREFIX, +#define RTNLGRP_IPV6_PREFIX RTNLGRP_IPV6_PREFIX + __RTNLGRP_MAX +}; +#define RTNLGRP_MAX (__RTNLGRP_MAX - 1) /* TC action piece */ struct tcamsg diff --git a/include/linux/security.h b/include/linux/security.h index b42095a68b1..7aab6ab7c57 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -2727,7 +2727,8 @@ static inline int security_socket_getpeersec(struct socket *sock, char __user *o return security_ops->socket_getpeersec(sock, optval, optlen, len); } -static inline int security_sk_alloc(struct sock *sk, int family, int priority) +static inline int security_sk_alloc(struct sock *sk, int family, + unsigned int __nocast priority) { return security_ops->sk_alloc_security(sk, family, priority); } @@ -2844,7 +2845,8 @@ static inline int security_socket_getpeersec(struct socket *sock, char __user *o return -ENOPROTOOPT; } -static inline int security_sk_alloc(struct sock *sk, int family, int priority) +static inline int security_sk_alloc(struct sock *sk, int family, + unsigned int __nocast priority) { return 0; } diff --git a/include/linux/selinux_netlink.h b/include/linux/selinux_netlink.h index 957e6ebca4e..bbf489decd8 100644 --- a/include/linux/selinux_netlink.h +++ b/include/linux/selinux_netlink.h @@ -20,10 +20,21 @@ enum { SELNL_MSG_MAX }; -/* Multicast groups */ +#ifndef __KERNEL__ +/* Multicast groups - backwards compatiblility for userspace */ #define SELNL_GRP_NONE 0x00000000 #define SELNL_GRP_AVC 0x00000001 /* AVC notifications */ #define SELNL_GRP_ALL 0xffffffff +#endif + +enum selinux_nlgroups { + SELNLGRP_NONE, +#define SELNLGRP_NONE SELNLGRP_NONE + SELNLGRP_AVC, +#define SELNLGRP_AVC SELNLGRP_AVC + __SELNLGRP_MAX +}; +#define SELNLGRP_MAX (__SELNLGRP_MAX - 1) /* Message structures */ struct selnl_msg_setenforce { diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 948527e42a6..42edce6abe2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -155,16 +155,29 @@ struct skb_shared_info { #define SKB_DATAREF_SHIFT 16 #define SKB_DATAREF_MASK ((1 << SKB_DATAREF_SHIFT) - 1) +extern struct timeval skb_tv_base; + +struct skb_timeval { + u32 off_sec; + u32 off_usec; +}; + + +enum { + SKB_FCLONE_UNAVAILABLE, + SKB_FCLONE_ORIG, + SKB_FCLONE_CLONE, +}; + /** * struct sk_buff - socket buffer * @next: Next buffer in list * @prev: Previous buffer in list * @list: List we are on * @sk: Socket we are owned by - * @stamp: Time we arrived + * @tstamp: Time we arrived stored as offset to skb_tv_base * @dev: Device we arrived on/are leaving by * @input_dev: Device we arrived on - * @real_dev: The real device we are using * @h: Transport layer header * @nh: Network layer header * @mac: Link layer header @@ -190,14 +203,11 @@ struct skb_shared_info { * @end: End pointer * @destructor: Destruct function * @nfmark: Can be used for communication between hooks - * @nfcache: Cache info * @nfct: Associated connection, if any * @nfctinfo: Relationship of this skb to the connection * @nf_bridge: Saved data about a bridged frame - see br_netfilter.c - * @private: Data which is private to the HIPPI implementation * @tc_index: Traffic control index * @tc_verd: traffic control verdict - * @tc_classid: traffic control classid */ struct sk_buff { @@ -205,12 +215,10 @@ struct sk_buff { struct sk_buff *next; struct sk_buff *prev; - struct sk_buff_head *list; struct sock *sk; - struct timeval stamp; + struct skb_timeval tstamp; struct net_device *dev; struct net_device *input_dev; - struct net_device *real_dev; union { struct tcphdr *th; @@ -252,33 +260,28 @@ struct sk_buff { __u8 local_df:1, cloned:1, ip_summed:2, - nohdr:1; - /* 3 bits spare */ - __u8 pkt_type; + nohdr:1, + nfctinfo:3; + __u8 pkt_type:3, + fclone:2; __be16 protocol; void (*destructor)(struct sk_buff *skb); #ifdef CONFIG_NETFILTER - unsigned long nfmark; - __u32 nfcache; - __u32 nfctinfo; + __u32 nfmark; struct nf_conntrack *nfct; +#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) + __u8 ipvs_property:1; +#endif #ifdef CONFIG_BRIDGE_NETFILTER struct nf_bridge_info *nf_bridge; #endif #endif /* CONFIG_NETFILTER */ -#if defined(CONFIG_HIPPI) - union { - __u32 ifield; - } private; -#endif #ifdef CONFIG_NET_SCHED - __u32 tc_index; /* traffic control index */ + __u16 tc_index; /* traffic control index */ #ifdef CONFIG_NET_CLS_ACT - __u32 tc_verd; /* traffic control verdict */ - __u32 tc_classid; /* traffic control classid */ + __u16 tc_verd; /* traffic control verdict */ #endif - #endif @@ -300,8 +303,20 @@ struct sk_buff { #include <asm/system.h> extern void __kfree_skb(struct sk_buff *skb); -extern struct sk_buff *alloc_skb(unsigned int size, - unsigned int __nocast priority); +extern struct sk_buff *__alloc_skb(unsigned int size, + unsigned int __nocast priority, int fclone); +static inline struct sk_buff *alloc_skb(unsigned int size, + unsigned int __nocast priority) +{ + return __alloc_skb(size, priority, 0); +} + +static inline struct sk_buff *alloc_skb_fclone(unsigned int size, + unsigned int __nocast priority) +{ + return __alloc_skb(size, priority, 1); +} + extern struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp, unsigned int size, unsigned int __nocast priority); @@ -597,7 +612,6 @@ static inline void __skb_queue_head(struct sk_buff_head *list, { struct sk_buff *prev, *next; - newsk->list = list; list->qlen++; prev = (struct sk_buff *)list; next = prev->next; @@ -622,7 +636,6 @@ static inline void __skb_queue_tail(struct sk_buff_head *list, { struct sk_buff *prev, *next; - newsk->list = list; list->qlen++; next = (struct sk_buff *)list; prev = next->prev; @@ -655,7 +668,6 @@ static inline struct sk_buff *__skb_dequeue(struct sk_buff_head *list) next->prev = prev; prev->next = next; result->next = result->prev = NULL; - result->list = NULL; } return result; } @@ -664,7 +676,7 @@ static inline struct sk_buff *__skb_dequeue(struct sk_buff_head *list) /* * Insert a packet on a list. */ -extern void skb_insert(struct sk_buff *old, struct sk_buff *newsk); +extern void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list); static inline void __skb_insert(struct sk_buff *newsk, struct sk_buff *prev, struct sk_buff *next, struct sk_buff_head *list) @@ -672,24 +684,23 @@ static inline void __skb_insert(struct sk_buff *newsk, newsk->next = next; newsk->prev = prev; next->prev = prev->next = newsk; - newsk->list = list; list->qlen++; } /* * Place a packet after a given packet in a list. */ -extern void skb_append(struct sk_buff *old, struct sk_buff *newsk); -static inline void __skb_append(struct sk_buff *old, struct sk_buff *newsk) +extern void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list); +static inline void __skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) { - __skb_insert(newsk, old, old->next, old->list); + __skb_insert(newsk, old, old->next, list); } /* * remove sk_buff from list. _Must_ be called atomically, and with * the list known.. */ -extern void skb_unlink(struct sk_buff *skb); +extern void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list); static inline void __skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) { struct sk_buff *next, *prev; @@ -698,7 +709,6 @@ static inline void __skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) next = skb->next; prev = skb->prev; skb->next = skb->prev = NULL; - skb->list = NULL; next->prev = prev; prev->next = next; } @@ -1213,6 +1223,8 @@ extern void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to); extern void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len); +extern void skb_release_data(struct sk_buff *skb); + static inline void *skb_header_pointer(const struct sk_buff *skb, int offset, int len, void *buffer) { @@ -1230,6 +1242,42 @@ static inline void *skb_header_pointer(const struct sk_buff *skb, int offset, extern void skb_init(void); extern void skb_add_mtu(int mtu); +/** + * skb_get_timestamp - get timestamp from a skb + * @skb: skb to get stamp from + * @stamp: pointer to struct timeval to store stamp in + * + * Timestamps are stored in the skb as offsets to a base timestamp. + * This function converts the offset back to a struct timeval and stores + * it in stamp. + */ +static inline void skb_get_timestamp(struct sk_buff *skb, struct timeval *stamp) +{ + stamp->tv_sec = skb->tstamp.off_sec; + stamp->tv_usec = skb->tstamp.off_usec; + if (skb->tstamp.off_sec) { + stamp->tv_sec += skb_tv_base.tv_sec; + stamp->tv_usec += skb_tv_base.tv_usec; + } +} + +/** + * skb_set_timestamp - set timestamp of a skb + * @skb: skb to set stamp of + * @stamp: pointer to struct timeval to get stamp from + * + * Timestamps are stored in the skb as offsets to a base timestamp. + * This function converts a struct timeval to an offset and stores + * it in the skb. + */ +static inline void skb_set_timestamp(struct sk_buff *skb, struct timeval *stamp) +{ + skb->tstamp.off_sec = stamp->tv_sec - skb_tv_base.tv_sec; + skb->tstamp.off_usec = stamp->tv_usec - skb_tv_base.tv_usec; +} + +extern void __net_timestamp(struct sk_buff *skb); + #ifdef CONFIG_NETFILTER static inline void nf_conntrack_put(struct nf_conntrack *nfct) { diff --git a/include/linux/socket.h b/include/linux/socket.h index a5c7d96e4d2..1739c2d5b95 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -26,6 +26,13 @@ struct __kernel_sockaddr_storage { #include <linux/types.h> /* pid_t */ #include <linux/compiler.h> /* __user */ +extern int sysctl_somaxconn; +extern void sock_init(void); +#ifdef CONFIG_PROC_FS +struct seq_file; +extern void socket_seq_show(struct seq_file *seq); +#endif + typedef unsigned short sa_family_t; /* @@ -271,6 +278,8 @@ struct ucred { #define SOL_IRDA 266 #define SOL_NETBEUI 267 #define SOL_LLC 268 +#define SOL_DCCP 269 +#define SOL_NETLINK 270 /* IPX options */ #define IPX_TYPE 1 diff --git a/include/linux/tcp.h b/include/linux/tcp.h index e4fd82e4210..ac4ca44c75c 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -55,24 +55,6 @@ struct tcphdr { __u16 urg_ptr; }; - -enum { - TCP_ESTABLISHED = 1, - TCP_SYN_SENT, - TCP_SYN_RECV, - TCP_FIN_WAIT1, - TCP_FIN_WAIT2, - TCP_TIME_WAIT, - TCP_CLOSE, - TCP_CLOSE_WAIT, - TCP_LAST_ACK, - TCP_LISTEN, - TCP_CLOSING, /* now a valid state */ - - TCP_MAX_STATES /* Leave at the end! */ -}; - -#define TCP_STATE_MASK 0xF #define TCP_ACTION_FIN (1 << 7) enum { @@ -195,8 +177,9 @@ struct tcp_info #include <linux/config.h> #include <linux/skbuff.h> -#include <linux/ip.h> #include <net/sock.h> +#include <net/inet_connection_sock.h> +#include <net/inet_timewait_sock.h> /* This defines a selective acknowledgement block. */ struct tcp_sack_block { @@ -236,8 +219,8 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req) } struct tcp_sock { - /* inet_sock has to be the first member of tcp_sock */ - struct inet_sock inet; + /* inet_connection_sock has to be the first member of tcp_sock */ + struct inet_connection_sock inet_conn; int tcp_header_len; /* Bytes of tcp header to send */ /* @@ -258,19 +241,6 @@ struct tcp_sock { __u32 snd_sml; /* Last byte of the most recently transmitted small packet */ __u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ __u32 lsndtime; /* timestamp of last sent data packet (for restart window) */ - struct tcp_bind_bucket *bind_hash; - /* Delayed ACK control data */ - struct { - __u8 pending; /* ACK is pending */ - __u8 quick; /* Scheduled number of quick acks */ - __u8 pingpong; /* The session is interactive */ - __u8 blocked; /* Delayed ACK was blocked by socket lock*/ - __u32 ato; /* Predicted tick of soft clock */ - unsigned long timeout; /* Currently scheduled timeout */ - __u32 lrcvtime; /* timestamp of last received data packet*/ - __u16 last_seg_size; /* Size of last incoming segment */ - __u16 rcv_mss; /* MSS used for delayed ACK decisions */ - } ack; /* Data for direct copy to user */ struct { @@ -288,19 +258,15 @@ struct tcp_sock { __u32 mss_cache; /* Cached effective mss, not including SACKS */ __u16 xmit_size_goal; /* Goal for segmenting output packets */ __u16 ext_header_len; /* Network protocol overhead (IP/IPv6 options) */ - __u8 ca_state; /* State of fast-retransmit machine */ - __u8 retransmits; /* Number of unrecovered RTO timeouts. */ - __u16 advmss; /* Advertised MSS */ __u32 window_clamp; /* Maximal window to advertise */ __u32 rcv_ssthresh; /* Current window clamp */ __u32 frto_highmark; /* snd_nxt when RTO occurred */ __u8 reordering; /* Packet reordering metric. */ __u8 frto_counter; /* Number of new acks after RTO */ - - __u8 unused; - __u8 defer_accept; /* User waits for some data after accept() */ + __u8 nonagle; /* Disable Nagle algorithm? */ + __u8 keepalive_probes; /* num of allowed keep alive probes */ /* RTT measurement */ __u32 srtt; /* smoothed round trip time << 3 */ @@ -308,19 +274,13 @@ struct tcp_sock { __u32 mdev_max; /* maximal mdev for the last rtt period */ __u32 rttvar; /* smoothed mdev_max */ __u32 rtt_seq; /* sequence number to update rttvar */ - __u32 rto; /* retransmit timeout */ __u32 packets_out; /* Packets which are "in flight" */ __u32 left_out; /* Packets which leaved network */ __u32 retrans_out; /* Retransmitted packets out */ - __u8 backoff; /* backoff */ /* * Options received (usually on last packet, some only on SYN packets). */ - __u8 nonagle; /* Disable Nagle algorithm? */ - __u8 keepalive_probes; /* num of allowed keep alive probes */ - - __u8 probes_out; /* unanswered 0 window probes */ struct tcp_options_received rx_opt; /* @@ -333,11 +293,6 @@ struct tcp_sock { __u32 snd_cwnd_used; __u32 snd_cwnd_stamp; - /* Two commonly used timers in both sender and receiver paths. */ - unsigned long timeout; - struct timer_list retransmit_timer; /* Resend (no ack) */ - struct timer_list delack_timer; /* Ack delay */ - struct sk_buff_head out_of_order_queue; /* Out of order segments go here */ struct tcp_func *af_specific; /* Operations which are AF_INET{4,6} specific */ @@ -352,8 +307,7 @@ struct tcp_sock { struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */ struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/ - __u8 syn_retries; /* num of allowed syn retries */ - __u8 ecn_flags; /* ECN status bits. */ + __u16 advmss; /* Advertised MSS */ __u16 prior_ssthresh; /* ssthresh saved at recovery start */ __u32 lost_out; /* Lost packets */ __u32 sacked_out; /* SACK'd packets */ @@ -367,14 +321,12 @@ struct tcp_sock { int undo_retrans; /* number of undoable retransmissions. */ __u32 urg_seq; /* Seq of received urgent pointer */ __u16 urg_data; /* Saved octet of OOB data and control flags */ - __u8 pending; /* Scheduled timer event */ __u8 urg_mode; /* In urgent mode */ + __u8 ecn_flags; /* ECN status bits. */ __u32 snd_up; /* Urgent pointer */ __u32 total_retrans; /* Total retransmits for entire connection */ - struct request_sock_queue accept_queue; /* FIFO of established children */ - unsigned int keepalive_time; /* time before keep alive takes place */ unsigned int keepalive_intvl; /* time interval between keep alive probes */ int linger2; @@ -394,11 +346,6 @@ struct tcp_sock { __u32 seq; __u32 time; } rcvq_space; - - /* Pluggable TCP congestion control hook */ - struct tcp_congestion_ops *ca_ops; - u32 ca_priv[16]; -#define TCP_CA_PRIV_SIZE (16*sizeof(u32)) }; static inline struct tcp_sock *tcp_sk(const struct sock *sk) @@ -406,9 +353,18 @@ static inline struct tcp_sock *tcp_sk(const struct sock *sk) return (struct tcp_sock *)sk; } -static inline void *tcp_ca(const struct tcp_sock *tp) +struct tcp_timewait_sock { + struct inet_timewait_sock tw_sk; + __u32 tw_rcv_nxt; + __u32 tw_snd_nxt; + __u32 tw_rcv_wnd; + __u32 tw_ts_recent; + long tw_ts_recent_stamp; +}; + +static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk) { - return (void *) tp->ca_priv; + return (struct tcp_timewait_sock *)sk; } #endif diff --git a/include/linux/tcp_diag.h b/include/linux/tcp_diag.h deleted file mode 100644 index 7a599674394..00000000000 --- a/include/linux/tcp_diag.h +++ /dev/null @@ -1,127 +0,0 @@ -#ifndef _TCP_DIAG_H_ -#define _TCP_DIAG_H_ 1 - -/* Just some random number */ -#define TCPDIAG_GETSOCK 18 - -/* Socket identity */ -struct tcpdiag_sockid -{ - __u16 tcpdiag_sport; - __u16 tcpdiag_dport; - __u32 tcpdiag_src[4]; - __u32 tcpdiag_dst[4]; - __u32 tcpdiag_if; - __u32 tcpdiag_cookie[2]; -#define TCPDIAG_NOCOOKIE (~0U) -}; - -/* Request structure */ - -struct tcpdiagreq -{ - __u8 tcpdiag_family; /* Family of addresses. */ - __u8 tcpdiag_src_len; - __u8 tcpdiag_dst_len; - __u8 tcpdiag_ext; /* Query extended information */ - - struct tcpdiag_sockid id; - - __u32 tcpdiag_states; /* States to dump */ - __u32 tcpdiag_dbs; /* Tables to dump (NI) */ -}; - -enum -{ - TCPDIAG_REQ_NONE, - TCPDIAG_REQ_BYTECODE, -}; - -#define TCPDIAG_REQ_MAX TCPDIAG_REQ_BYTECODE - -/* Bytecode is sequence of 4 byte commands followed by variable arguments. - * All the commands identified by "code" are conditional jumps forward: - * to offset cc+"yes" or to offset cc+"no". "yes" is supposed to be - * length of the command and its arguments. - */ - -struct tcpdiag_bc_op -{ - unsigned char code; - unsigned char yes; - unsigned short no; -}; - -enum -{ - TCPDIAG_BC_NOP, - TCPDIAG_BC_JMP, - TCPDIAG_BC_S_GE, - TCPDIAG_BC_S_LE, - TCPDIAG_BC_D_GE, - TCPDIAG_BC_D_LE, - TCPDIAG_BC_AUTO, - TCPDIAG_BC_S_COND, - TCPDIAG_BC_D_COND, -}; - -struct tcpdiag_hostcond -{ - __u8 family; - __u8 prefix_len; - int port; - __u32 addr[0]; -}; - -/* Base info structure. It contains socket identity (addrs/ports/cookie) - * and, alas, the information shown by netstat. */ -struct tcpdiagmsg -{ - __u8 tcpdiag_family; - __u8 tcpdiag_state; - __u8 tcpdiag_timer; - __u8 tcpdiag_retrans; - - struct tcpdiag_sockid id; - - __u32 tcpdiag_expires; - __u32 tcpdiag_rqueue; - __u32 tcpdiag_wqueue; - __u32 tcpdiag_uid; - __u32 tcpdiag_inode; -}; - -/* Extensions */ - -enum -{ - TCPDIAG_NONE, - TCPDIAG_MEMINFO, - TCPDIAG_INFO, - TCPDIAG_VEGASINFO, - TCPDIAG_CONG, -}; - -#define TCPDIAG_MAX TCPDIAG_CONG - - -/* TCPDIAG_MEM */ - -struct tcpdiag_meminfo -{ - __u32 tcpdiag_rmem; - __u32 tcpdiag_wmem; - __u32 tcpdiag_fmem; - __u32 tcpdiag_tmem; -}; - -/* TCPDIAG_VEGASINFO */ - -struct tcpvegas_info { - __u32 tcpv_enabled; - __u32 tcpv_rttcnt; - __u32 tcpv_rtt; - __u32 tcpv_minrtt; -}; - -#endif /* _TCP_DIAG_H_ */ diff --git a/include/linux/types.h b/include/linux/types.h index dcb13f865df..2b678c22ca4 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -123,6 +123,9 @@ typedef __u64 u_int64_t; typedef __s64 int64_t; #endif +/* this is a special 64bit data type that is 8-byte aligned */ +#define aligned_u64 unsigned long long __attribute__((aligned(8))) + /* * The type used for indexing onto a disc or disc partition. * If required, asm/types.h can override it and define diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h index f0d423300d8..0fb077d6844 100644 --- a/include/linux/xfrm.h +++ b/include/linux/xfrm.h @@ -258,9 +258,27 @@ struct xfrm_usersa_flush { __u8 proto; }; +#ifndef __KERNEL__ +/* backwards compatibility for userspace */ #define XFRMGRP_ACQUIRE 1 #define XFRMGRP_EXPIRE 2 #define XFRMGRP_SA 4 #define XFRMGRP_POLICY 8 +#endif + +enum xfrm_nlgroups { + XFRMNLGRP_NONE, +#define XFRMNLGRP_NONE XFRMNLGRP_NONE + XFRMNLGRP_ACQUIRE, +#define XFRMNLGRP_ACQUIRE XFRMNLGRP_ACQUIRE + XFRMNLGRP_EXPIRE, +#define XFRMNLGRP_EXPIRE XFRMNLGRP_EXPIRE + XFRMNLGRP_SA, +#define XFRMNLGRP_SA XFRMNLGRP_SA + XFRMNLGRP_POLICY, +#define XFRMNLGRP_POLICY XFRMNLGRP_POLICY + __XFRMNLGRP_MAX +}; +#define XFRMNLGRP_MAX (__XFRMNLGRP_MAX - 1) #endif /* _LINUX_XFRM_H */ diff --git a/include/net/act_api.h b/include/net/act_api.h index ed00a995f57..b55eb7c7f03 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -63,7 +63,7 @@ struct tc_action_ops __u32 type; /* TBD to match kind */ __u32 capab; /* capabilities includes 4 bit version */ struct module *owner; - int (*act)(struct sk_buff **, struct tc_action *); + int (*act)(struct sk_buff **, struct tc_action *, struct tcf_result *); int (*get_stats)(struct sk_buff *, struct tc_action *); int (*dump)(struct sk_buff *, struct tc_action *,int , int); int (*cleanup)(struct tc_action *, int bind); diff --git a/include/net/addrconf.h b/include/net/addrconf.h index a0ed9367217..750e2508dd9 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -45,6 +45,7 @@ struct prefix_info { #ifdef __KERNEL__ +#include <linux/config.h> #include <linux/netdevice.h> #include <net/if_inet6.h> #include <net/ipv6.h> @@ -238,5 +239,10 @@ static inline int ipv6_addr_is_ll_all_routers(const struct in6_addr *addr) addr->s6_addr32[3] == htonl(0x00000002)); } +#ifdef CONFIG_PROC_FS +extern int if6_proc_init(void); +extern void if6_proc_exit(void); +#endif + #endif #endif diff --git a/include/net/af_unix.h b/include/net/af_unix.h index b60b3846b9d..b5d785ab4a0 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -1,5 +1,11 @@ #ifndef __LINUX_NET_AFUNIX_H #define __LINUX_NET_AFUNIX_H + +#include <linux/config.h> +#include <linux/socket.h> +#include <linux/un.h> +#include <net/sock.h> + extern void unix_inflight(struct file *fp); extern void unix_notinflight(struct file *fp); extern void unix_gc(void); @@ -74,5 +80,14 @@ struct unix_sock { wait_queue_head_t peer_wait; }; #define unix_sk(__sk) ((struct unix_sock *)__sk) + +#ifdef CONFIG_SYSCTL +extern int sysctl_unix_max_dgram_qlen; +extern void unix_sysctl_register(void); +extern void unix_sysctl_unregister(void); +#else +static inline void unix_sysctl_register(void) {} +static inline void unix_sysctl_unregister(void) {} +#endif #endif #endif diff --git a/include/net/arp.h b/include/net/arp.h index a1f09fad6a5..a13e30c35f4 100644 --- a/include/net/arp.h +++ b/include/net/arp.h @@ -11,7 +11,7 @@ extern struct neigh_table arp_tbl; extern void arp_init(void); extern int arp_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt); + struct packet_type *pt, struct net_device *orig_dev); extern int arp_find(unsigned char *haddr, struct sk_buff *skb); extern int arp_ioctl(unsigned int cmd, void __user *arg); extern void arp_send(int type, int ptype, u32 dest_ip, diff --git a/include/net/ax25.h b/include/net/ax25.h index 3696f988a9f..926eed54302 100644 --- a/include/net/ax25.h +++ b/include/net/ax25.h @@ -316,7 +316,7 @@ extern int ax25_protocol_is_registered(unsigned int); /* ax25_in.c */ extern int ax25_rx_iframe(ax25_cb *, struct sk_buff *); -extern int ax25_kiss_rcv(struct sk_buff *, struct net_device *, struct packet_type *); +extern int ax25_kiss_rcv(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); /* ax25_ip.c */ extern int ax25_encapsulate(struct sk_buff *, struct net_device *, unsigned short, void *, void *, unsigned int); diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 06b24f63702..6dfa4a61ffd 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -131,11 +131,12 @@ struct sock *bt_accept_dequeue(struct sock *parent, struct socket *newsock); /* Skb helpers */ struct bt_skb_cb { - int incoming; + __u8 pkt_type; + __u8 incoming; }; #define bt_cb(skb) ((struct bt_skb_cb *)(skb->cb)) -static inline struct sk_buff *bt_skb_alloc(unsigned int len, int how) +static inline struct sk_buff *bt_skb_alloc(unsigned int len, unsigned int __nocast how) { struct sk_buff *skb; diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 6f0706f4af6..371e7d3f2e6 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -453,6 +453,15 @@ struct inquiry_info_with_rssi { __u16 clock_offset; __s8 rssi; } __attribute__ ((packed)); +struct inquiry_info_with_rssi_and_pscan_mode { + bdaddr_t bdaddr; + __u8 pscan_rep_mode; + __u8 pscan_period_mode; + __u8 pscan_mode; + __u8 dev_class[3]; + __u16 clock_offset; + __s8 rssi; +} __attribute__ ((packed)); #define HCI_EV_CONN_COMPLETE 0x03 struct hci_ev_conn_complete { @@ -584,6 +593,12 @@ struct hci_ev_clock_offset { __u16 clock_offset; } __attribute__ ((packed)); +#define HCI_EV_PSCAN_REP_MODE 0x20 +struct hci_ev_pscan_rep_mode { + bdaddr_t bdaddr; + __u8 pscan_rep_mode; +} __attribute__ ((packed)); + /* Internal events generated by Bluetooth stack */ #define HCI_EV_STACK_INTERNAL 0xFD struct hci_ev_stack_internal { diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 6d63a47c731..7f933f30207 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -404,7 +404,7 @@ static inline int hci_recv_frame(struct sk_buff *skb) bt_cb(skb)->incoming = 1; /* Time stamp */ - do_gettimeofday(&skb->stamp); + __net_timestamp(skb); /* Queue frame for rx task */ skb_queue_tail(&hdev->rx_q, skb); diff --git a/include/net/bluetooth/rfcomm.h b/include/net/bluetooth/rfcomm.h index 13669bad00b..ffea9d54071 100644 --- a/include/net/bluetooth/rfcomm.h +++ b/include/net/bluetooth/rfcomm.h @@ -80,9 +80,9 @@ #define RFCOMM_RPN_STOP_15 1 #define RFCOMM_RPN_PARITY_NONE 0x0 -#define RFCOMM_RPN_PARITY_ODD 0x4 -#define RFCOMM_RPN_PARITY_EVEN 0x5 -#define RFCOMM_RPN_PARITY_MARK 0x6 +#define RFCOMM_RPN_PARITY_ODD 0x1 +#define RFCOMM_RPN_PARITY_EVEN 0x3 +#define RFCOMM_RPN_PARITY_MARK 0x5 #define RFCOMM_RPN_PARITY_SPACE 0x7 #define RFCOMM_RPN_FLOW_NONE 0x00 @@ -223,8 +223,14 @@ struct rfcomm_dlc { #define RFCOMM_CFC_DISABLED 0 #define RFCOMM_CFC_ENABLED RFCOMM_MAX_CREDITS +/* ---- RFCOMM SEND RPN ---- */ +int rfcomm_send_rpn(struct rfcomm_session *s, int cr, u8 dlci, + u8 bit_rate, u8 data_bits, u8 stop_bits, + u8 parity, u8 flow_ctrl_settings, + u8 xon_char, u8 xoff_char, u16 param_mask); + /* ---- RFCOMM DLCs (channels) ---- */ -struct rfcomm_dlc *rfcomm_dlc_alloc(int prio); +struct rfcomm_dlc *rfcomm_dlc_alloc(unsigned int __nocast prio); void rfcomm_dlc_free(struct rfcomm_dlc *d); int rfcomm_dlc_open(struct rfcomm_dlc *d, bdaddr_t *src, bdaddr_t *dst, u8 channel); int rfcomm_dlc_close(struct rfcomm_dlc *d, int reason); diff --git a/include/net/datalink.h b/include/net/datalink.h index 5797ba3d2eb..deb7ca75db4 100644 --- a/include/net/datalink.h +++ b/include/net/datalink.h @@ -9,7 +9,7 @@ struct datalink_proto { unsigned short header_length; int (*rcvfunc)(struct sk_buff *, struct net_device *, - struct packet_type *); + struct packet_type *, struct net_device *); int (*request)(struct datalink_proto *, struct sk_buff *, unsigned char *); struct list_head node; diff --git a/include/net/dn.h b/include/net/dn.h index 5551c46db39..c1dbbd22279 100644 --- a/include/net/dn.h +++ b/include/net/dn.h @@ -3,6 +3,7 @@ #include <linux/dn.h> #include <net/sock.h> +#include <net/tcp.h> #include <asm/byteorder.h> typedef unsigned short dn_address; diff --git a/include/net/icmp.h b/include/net/icmp.h index e5ef0d15fb4..6cdebeee5f9 100644 --- a/include/net/icmp.h +++ b/include/net/icmp.h @@ -57,4 +57,11 @@ static inline struct raw_sock *raw_sk(const struct sock *sk) return (struct raw_sock *)sk; } +extern int sysctl_icmp_echo_ignore_all; +extern int sysctl_icmp_echo_ignore_broadcasts; +extern int sysctl_icmp_ignore_bogus_error_responses; +extern int sysctl_icmp_errors_use_inbound_ifaddr; +extern int sysctl_icmp_ratelimit; +extern int sysctl_icmp_ratemask; + #endif /* _ICMP_H */ diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h new file mode 100644 index 00000000000..03df3b15796 --- /dev/null +++ b/include/net/inet6_hashtables.h @@ -0,0 +1,130 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Authors: Lotsa people, from code originally in tcp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _INET6_HASHTABLES_H +#define _INET6_HASHTABLES_H + +#include <linux/config.h> + +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) +#include <linux/in6.h> +#include <linux/ipv6.h> +#include <linux/types.h> + +#include <net/ipv6.h> + +struct inet_hashinfo; + +/* I have no idea if this is a good hash for v6 or not. -DaveM */ +static inline int inet6_ehashfn(const struct in6_addr *laddr, const u16 lport, + const struct in6_addr *faddr, const u16 fport, + const int ehash_size) +{ + int hashent = (lport ^ fport); + + hashent ^= (laddr->s6_addr32[3] ^ faddr->s6_addr32[3]); + hashent ^= hashent >> 16; + hashent ^= hashent >> 8; + return (hashent & (ehash_size - 1)); +} + +static inline int inet6_sk_ehashfn(const struct sock *sk, const int ehash_size) +{ + const struct inet_sock *inet = inet_sk(sk); + const struct ipv6_pinfo *np = inet6_sk(sk); + const struct in6_addr *laddr = &np->rcv_saddr; + const struct in6_addr *faddr = &np->daddr; + const __u16 lport = inet->num; + const __u16 fport = inet->dport; + return inet6_ehashfn(laddr, lport, faddr, fport, ehash_size); +} + +/* + * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so + * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM + * + * The sockhash lock must be held as a reader here. + */ +static inline struct sock * + __inet6_lookup_established(struct inet_hashinfo *hashinfo, + const struct in6_addr *saddr, + const u16 sport, + const struct in6_addr *daddr, + const u16 hnum, + const int dif) +{ + struct sock *sk; + const struct hlist_node *node; + const __u32 ports = INET_COMBINED_PORTS(sport, hnum); + /* Optimize here for direct hit, only listening connections can + * have wildcards anyways. + */ + const int hash = inet6_ehashfn(daddr, hnum, saddr, sport, + hashinfo->ehash_size); + struct inet_ehash_bucket *head = &hashinfo->ehash[hash]; + + read_lock(&head->lock); + sk_for_each(sk, node, &head->chain) { + /* For IPV6 do the cheaper port and family tests first. */ + if (INET6_MATCH(sk, saddr, daddr, ports, dif)) + goto hit; /* You sunk my battleship! */ + } + /* Must check for a TIME_WAIT'er before going to listener hash. */ + sk_for_each(sk, node, &(head + hashinfo->ehash_size)->chain) { + const struct inet_timewait_sock *tw = inet_twsk(sk); + + if(*((__u32 *)&(tw->tw_dport)) == ports && + sk->sk_family == PF_INET6) { + const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk); + + if (ipv6_addr_equal(&tcp6tw->tw_v6_daddr, saddr) && + ipv6_addr_equal(&tcp6tw->tw_v6_rcv_saddr, daddr) && + (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dif)) + goto hit; + } + } + read_unlock(&head->lock); + return NULL; + +hit: + sock_hold(sk); + read_unlock(&head->lock); + return sk; +} + +extern struct sock *inet6_lookup_listener(struct inet_hashinfo *hashinfo, + const struct in6_addr *daddr, + const unsigned short hnum, + const int dif); + +static inline struct sock *__inet6_lookup(struct inet_hashinfo *hashinfo, + const struct in6_addr *saddr, + const u16 sport, + const struct in6_addr *daddr, + const u16 hnum, + const int dif) +{ + struct sock *sk = __inet6_lookup_established(hashinfo, saddr, sport, + daddr, hnum, dif); + if (sk) + return sk; + + return inet6_lookup_listener(hashinfo, daddr, hnum, dif); +} + +extern struct sock *inet6_lookup(struct inet_hashinfo *hashinfo, + const struct in6_addr *saddr, const u16 sport, + const struct in6_addr *daddr, const u16 dport, + const int dif); +#endif /* defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) */ +#endif /* _INET6_HASHTABLES_H */ diff --git a/include/net/inet_common.h b/include/net/inet_common.h index fbc1f4d140d..f943306ce5f 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -8,6 +8,11 @@ extern struct proto_ops inet_dgram_ops; * INET4 prototypes used by INET6 */ +struct msghdr; +struct sock; +struct sockaddr; +struct socket; + extern void inet_remove_sock(struct sock *sk1); extern void inet_put_sock(unsigned short num, struct sock *sk); @@ -29,7 +34,6 @@ extern unsigned int inet_poll(struct file * file, struct socket *sock, struct p extern int inet_listen(struct socket *sock, int backlog); extern void inet_sock_destruct(struct sock *sk); -extern atomic_t inet_sock_nr; extern int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h new file mode 100644 index 00000000000..8a87a3a4f10 --- /dev/null +++ b/include/net/inet_connection_sock.h @@ -0,0 +1,276 @@ +/* + * NET Generic infrastructure for INET connection oriented protocols. + * + * Definitions for inet_connection_sock + * + * Authors: Many people, see the TCP sources + * + * From code originally in TCP + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#ifndef _INET_CONNECTION_SOCK_H +#define _INET_CONNECTION_SOCK_H + +#include <linux/ip.h> +#include <linux/string.h> +#include <linux/timer.h> +#include <net/request_sock.h> + +#define INET_CSK_DEBUG 1 + +/* Cancel timers, when they are not required. */ +#undef INET_CSK_CLEAR_TIMERS + +struct inet_bind_bucket; +struct inet_hashinfo; +struct tcp_congestion_ops; + +/** inet_connection_sock - INET connection oriented sock + * + * @icsk_accept_queue: FIFO of established children + * @icsk_bind_hash: Bind node + * @icsk_timeout: Timeout + * @icsk_retransmit_timer: Resend (no ack) + * @icsk_rto: Retransmit timeout + * @icsk_ca_ops Pluggable congestion control hook + * @icsk_ca_state: Congestion control state + * @icsk_retransmits: Number of unrecovered [RTO] timeouts + * @icsk_pending: Scheduled timer event + * @icsk_backoff: Backoff + * @icsk_syn_retries: Number of allowed SYN (or equivalent) retries + * @icsk_probes_out: unanswered 0 window probes + * @icsk_ack: Delayed ACK control data + */ +struct inet_connection_sock { + /* inet_sock has to be the first member! */ + struct inet_sock icsk_inet; + struct request_sock_queue icsk_accept_queue; + struct inet_bind_bucket *icsk_bind_hash; + unsigned long icsk_timeout; + struct timer_list icsk_retransmit_timer; + struct timer_list icsk_delack_timer; + __u32 icsk_rto; + struct tcp_congestion_ops *icsk_ca_ops; + __u8 icsk_ca_state; + __u8 icsk_retransmits; + __u8 icsk_pending; + __u8 icsk_backoff; + __u8 icsk_syn_retries; + __u8 icsk_probes_out; + /* 2 BYTES HOLE, TRY TO PACK! */ + struct { + __u8 pending; /* ACK is pending */ + __u8 quick; /* Scheduled number of quick acks */ + __u8 pingpong; /* The session is interactive */ + __u8 blocked; /* Delayed ACK was blocked by socket lock */ + __u32 ato; /* Predicted tick of soft clock */ + unsigned long timeout; /* Currently scheduled timeout */ + __u32 lrcvtime; /* timestamp of last received data packet */ + __u16 last_seg_size; /* Size of last incoming segment */ + __u16 rcv_mss; /* MSS used for delayed ACK decisions */ + } icsk_ack; + u32 icsk_ca_priv[16]; +#define ICSK_CA_PRIV_SIZE (16 * sizeof(u32)) +}; + +#define ICSK_TIME_RETRANS 1 /* Retransmit timer */ +#define ICSK_TIME_DACK 2 /* Delayed ack timer */ +#define ICSK_TIME_PROBE0 3 /* Zero window probe timer */ +#define ICSK_TIME_KEEPOPEN 4 /* Keepalive timer */ + +static inline struct inet_connection_sock *inet_csk(const struct sock *sk) +{ + return (struct inet_connection_sock *)sk; +} + +static inline void *inet_csk_ca(const struct sock *sk) +{ + return (void *)inet_csk(sk)->icsk_ca_priv; +} + +extern struct sock *inet_csk_clone(struct sock *sk, + const struct request_sock *req, + const unsigned int __nocast priority); + +enum inet_csk_ack_state_t { + ICSK_ACK_SCHED = 1, + ICSK_ACK_TIMER = 2, + ICSK_ACK_PUSHED = 4 +}; + +extern void inet_csk_init_xmit_timers(struct sock *sk, + void (*retransmit_handler)(unsigned long), + void (*delack_handler)(unsigned long), + void (*keepalive_handler)(unsigned long)); +extern void inet_csk_clear_xmit_timers(struct sock *sk); + +static inline void inet_csk_schedule_ack(struct sock *sk) +{ + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_SCHED; +} + +static inline int inet_csk_ack_scheduled(const struct sock *sk) +{ + return inet_csk(sk)->icsk_ack.pending & ICSK_ACK_SCHED; +} + +static inline void inet_csk_delack_init(struct sock *sk) +{ + memset(&inet_csk(sk)->icsk_ack, 0, sizeof(inet_csk(sk)->icsk_ack)); +} + +extern void inet_csk_delete_keepalive_timer(struct sock *sk); +extern void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long timeout); + +#ifdef INET_CSK_DEBUG +extern const char inet_csk_timer_bug_msg[]; +#endif + +static inline void inet_csk_clear_xmit_timer(struct sock *sk, const int what) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0) { + icsk->icsk_pending = 0; +#ifdef INET_CSK_CLEAR_TIMERS + sk_stop_timer(sk, &icsk->icsk_retransmit_timer); +#endif + } else if (what == ICSK_TIME_DACK) { + icsk->icsk_ack.blocked = icsk->icsk_ack.pending = 0; +#ifdef INET_CSK_CLEAR_TIMERS + sk_stop_timer(sk, &icsk->icsk_delack_timer); +#endif + } +#ifdef INET_CSK_DEBUG + else { + pr_debug(inet_csk_timer_bug_msg); + } +#endif +} + +/* + * Reset the retransmission timer + */ +static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what, + unsigned long when, + const unsigned long max_when) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + if (when > max_when) { +#ifdef INET_CSK_DEBUG + pr_debug("reset_xmit_timer: sk=%p %d when=0x%lx, caller=%p\n", + sk, what, when, current_text_addr()); +#endif + when = max_when; + } + + if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0) { + icsk->icsk_pending = what; + icsk->icsk_timeout = jiffies + when; + sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); + } else if (what == ICSK_TIME_DACK) { + icsk->icsk_ack.pending |= ICSK_ACK_TIMER; + icsk->icsk_ack.timeout = jiffies + when; + sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout); + } +#ifdef INET_CSK_DEBUG + else { + pr_debug(inet_csk_timer_bug_msg); + } +#endif +} + +extern struct sock *inet_csk_accept(struct sock *sk, int flags, int *err); + +extern struct request_sock *inet_csk_search_req(const struct sock *sk, + struct request_sock ***prevp, + const __u16 rport, + const __u32 raddr, + const __u32 laddr); +extern int inet_csk_get_port(struct inet_hashinfo *hashinfo, + struct sock *sk, unsigned short snum); + +extern struct dst_entry* inet_csk_route_req(struct sock *sk, + const struct request_sock *req); + +static inline void inet_csk_reqsk_queue_add(struct sock *sk, + struct request_sock *req, + struct sock *child) +{ + reqsk_queue_add(&inet_csk(sk)->icsk_accept_queue, req, sk, child); +} + +extern void inet_csk_reqsk_queue_hash_add(struct sock *sk, + struct request_sock *req, + const unsigned timeout); + +static inline void inet_csk_reqsk_queue_removed(struct sock *sk, + struct request_sock *req) +{ + if (reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req) == 0) + inet_csk_delete_keepalive_timer(sk); +} + +static inline void inet_csk_reqsk_queue_added(struct sock *sk, + const unsigned long timeout) +{ + if (reqsk_queue_added(&inet_csk(sk)->icsk_accept_queue) == 0) + inet_csk_reset_keepalive_timer(sk, timeout); +} + +static inline int inet_csk_reqsk_queue_len(const struct sock *sk) +{ + return reqsk_queue_len(&inet_csk(sk)->icsk_accept_queue); +} + +static inline int inet_csk_reqsk_queue_young(const struct sock *sk) +{ + return reqsk_queue_len_young(&inet_csk(sk)->icsk_accept_queue); +} + +static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk) +{ + return reqsk_queue_is_full(&inet_csk(sk)->icsk_accept_queue); +} + +static inline void inet_csk_reqsk_queue_unlink(struct sock *sk, + struct request_sock *req, + struct request_sock **prev) +{ + reqsk_queue_unlink(&inet_csk(sk)->icsk_accept_queue, req, prev); +} + +static inline void inet_csk_reqsk_queue_drop(struct sock *sk, + struct request_sock *req, + struct request_sock **prev) +{ + inet_csk_reqsk_queue_unlink(sk, req, prev); + inet_csk_reqsk_queue_removed(sk, req); + reqsk_free(req); +} + +extern void inet_csk_reqsk_queue_prune(struct sock *parent, + const unsigned long interval, + const unsigned long timeout, + const unsigned long max_rto); + +extern void inet_csk_destroy_sock(struct sock *sk); + +/* + * LISTEN is a special case for poll.. + */ +static inline unsigned int inet_csk_listen_poll(const struct sock *sk) +{ + return !reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue) ? + (POLLIN | POLLRDNORM) : 0; +} + +extern int inet_csk_listen_start(struct sock *sk, const int nr_table_entries); +extern void inet_csk_listen_stop(struct sock *sk); + +#endif /* _INET_CONNECTION_SOCK_H */ diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h new file mode 100644 index 00000000000..646b6ea7fe2 --- /dev/null +++ b/include/net/inet_hashtables.h @@ -0,0 +1,427 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Authors: Lotsa people, from code originally in tcp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _INET_HASHTABLES_H +#define _INET_HASHTABLES_H + +#include <linux/config.h> + +#include <linux/interrupt.h> +#include <linux/ipv6.h> +#include <linux/list.h> +#include <linux/slab.h> +#include <linux/socket.h> +#include <linux/spinlock.h> +#include <linux/types.h> +#include <linux/wait.h> + +#include <net/inet_connection_sock.h> +#include <net/route.h> +#include <net/sock.h> +#include <net/tcp_states.h> + +#include <asm/atomic.h> +#include <asm/byteorder.h> + +/* This is for all connections with a full identity, no wildcards. + * New scheme, half the table is for TIME_WAIT, the other half is + * for the rest. I'll experiment with dynamic table growth later. + */ +struct inet_ehash_bucket { + rwlock_t lock; + struct hlist_head chain; +} __attribute__((__aligned__(8))); + +/* There are a few simple rules, which allow for local port reuse by + * an application. In essence: + * + * 1) Sockets bound to different interfaces may share a local port. + * Failing that, goto test 2. + * 2) If all sockets have sk->sk_reuse set, and none of them are in + * TCP_LISTEN state, the port may be shared. + * Failing that, goto test 3. + * 3) If all sockets are bound to a specific inet_sk(sk)->rcv_saddr local + * address, and none of them are the same, the port may be + * shared. + * Failing this, the port cannot be shared. + * + * The interesting point, is test #2. This is what an FTP server does + * all day. To optimize this case we use a specific flag bit defined + * below. As we add sockets to a bind bucket list, we perform a + * check of: (newsk->sk_reuse && (newsk->sk_state != TCP_LISTEN)) + * As long as all sockets added to a bind bucket pass this test, + * the flag bit will be set. + * The resulting situation is that tcp_v[46]_verify_bind() can just check + * for this flag bit, if it is set and the socket trying to bind has + * sk->sk_reuse set, we don't even have to walk the owners list at all, + * we return that it is ok to bind this socket to the requested local port. + * + * Sounds like a lot of work, but it is worth it. In a more naive + * implementation (ie. current FreeBSD etc.) the entire list of ports + * must be walked for each data port opened by an ftp server. Needless + * to say, this does not scale at all. With a couple thousand FTP + * users logged onto your box, isn't it nice to know that new data + * ports are created in O(1) time? I thought so. ;-) -DaveM + */ +struct inet_bind_bucket { + unsigned short port; + signed short fastreuse; + struct hlist_node node; + struct hlist_head owners; +}; + +#define inet_bind_bucket_for_each(tb, node, head) \ + hlist_for_each_entry(tb, node, head, node) + +struct inet_bind_hashbucket { + spinlock_t lock; + struct hlist_head chain; +}; + +/* This is for listening sockets, thus all sockets which possess wildcards. */ +#define INET_LHTABLE_SIZE 32 /* Yes, really, this is all you need. */ + +struct inet_hashinfo { + /* This is for sockets with full identity only. Sockets here will + * always be without wildcards and will have the following invariant: + * + * TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE + * + * First half of the table is for sockets not in TIME_WAIT, second half + * is for TIME_WAIT sockets only. + */ + struct inet_ehash_bucket *ehash; + + /* Ok, let's try this, I give up, we do need a local binding + * TCP hash as well as the others for fast bind/connect. + */ + struct inet_bind_hashbucket *bhash; + + int bhash_size; + int ehash_size; + + /* All sockets in TCP_LISTEN state will be in here. This is the only + * table where wildcard'd TCP sockets can exist. Hash function here + * is just local port number. + */ + struct hlist_head listening_hash[INET_LHTABLE_SIZE]; + + /* All the above members are written once at bootup and + * never written again _or_ are predominantly read-access. + * + * Now align to a new cache line as all the following members + * are often dirty. + */ + rwlock_t lhash_lock ____cacheline_aligned; + atomic_t lhash_users; + wait_queue_head_t lhash_wait; + spinlock_t portalloc_lock; + kmem_cache_t *bind_bucket_cachep; + int port_rover; +}; + +static inline int inet_ehashfn(const __u32 laddr, const __u16 lport, + const __u32 faddr, const __u16 fport, + const int ehash_size) +{ + int h = (laddr ^ lport) ^ (faddr ^ fport); + h ^= h >> 16; + h ^= h >> 8; + return h & (ehash_size - 1); +} + +static inline int inet_sk_ehashfn(const struct sock *sk, const int ehash_size) +{ + const struct inet_sock *inet = inet_sk(sk); + const __u32 laddr = inet->rcv_saddr; + const __u16 lport = inet->num; + const __u32 faddr = inet->daddr; + const __u16 fport = inet->dport; + + return inet_ehashfn(laddr, lport, faddr, fport, ehash_size); +} + +extern struct inet_bind_bucket * + inet_bind_bucket_create(kmem_cache_t *cachep, + struct inet_bind_hashbucket *head, + const unsigned short snum); +extern void inet_bind_bucket_destroy(kmem_cache_t *cachep, + struct inet_bind_bucket *tb); + +static inline int inet_bhashfn(const __u16 lport, const int bhash_size) +{ + return lport & (bhash_size - 1); +} + +extern void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, + const unsigned short snum); + +/* These can have wildcards, don't try too hard. */ +static inline int inet_lhashfn(const unsigned short num) +{ + return num & (INET_LHTABLE_SIZE - 1); +} + +static inline int inet_sk_listen_hashfn(const struct sock *sk) +{ + return inet_lhashfn(inet_sk(sk)->num); +} + +/* Caller must disable local BH processing. */ +static inline void __inet_inherit_port(struct inet_hashinfo *table, + struct sock *sk, struct sock *child) +{ + const int bhash = inet_bhashfn(inet_sk(child)->num, table->bhash_size); + struct inet_bind_hashbucket *head = &table->bhash[bhash]; + struct inet_bind_bucket *tb; + + spin_lock(&head->lock); + tb = inet_csk(sk)->icsk_bind_hash; + sk_add_bind_node(child, &tb->owners); + inet_csk(child)->icsk_bind_hash = tb; + spin_unlock(&head->lock); +} + +static inline void inet_inherit_port(struct inet_hashinfo *table, + struct sock *sk, struct sock *child) +{ + local_bh_disable(); + __inet_inherit_port(table, sk, child); + local_bh_enable(); +} + +extern void inet_put_port(struct inet_hashinfo *table, struct sock *sk); + +extern void inet_listen_wlock(struct inet_hashinfo *hashinfo); + +/* + * - We may sleep inside this lock. + * - If sleeping is not required (or called from BH), + * use plain read_(un)lock(&inet_hashinfo.lhash_lock). + */ +static inline void inet_listen_lock(struct inet_hashinfo *hashinfo) +{ + /* read_lock synchronizes to candidates to writers */ + read_lock(&hashinfo->lhash_lock); + atomic_inc(&hashinfo->lhash_users); + read_unlock(&hashinfo->lhash_lock); +} + +static inline void inet_listen_unlock(struct inet_hashinfo *hashinfo) +{ + if (atomic_dec_and_test(&hashinfo->lhash_users)) + wake_up(&hashinfo->lhash_wait); +} + +static inline void __inet_hash(struct inet_hashinfo *hashinfo, + struct sock *sk, const int listen_possible) +{ + struct hlist_head *list; + rwlock_t *lock; + + BUG_TRAP(sk_unhashed(sk)); + if (listen_possible && sk->sk_state == TCP_LISTEN) { + list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; + lock = &hashinfo->lhash_lock; + inet_listen_wlock(hashinfo); + } else { + sk->sk_hashent = inet_sk_ehashfn(sk, hashinfo->ehash_size); + list = &hashinfo->ehash[sk->sk_hashent].chain; + lock = &hashinfo->ehash[sk->sk_hashent].lock; + write_lock(lock); + } + __sk_add_node(sk, list); + sock_prot_inc_use(sk->sk_prot); + write_unlock(lock); + if (listen_possible && sk->sk_state == TCP_LISTEN) + wake_up(&hashinfo->lhash_wait); +} + +static inline void inet_hash(struct inet_hashinfo *hashinfo, struct sock *sk) +{ + if (sk->sk_state != TCP_CLOSE) { + local_bh_disable(); + __inet_hash(hashinfo, sk, 1); + local_bh_enable(); + } +} + +static inline void inet_unhash(struct inet_hashinfo *hashinfo, struct sock *sk) +{ + rwlock_t *lock; + + if (sk_unhashed(sk)) + goto out; + + if (sk->sk_state == TCP_LISTEN) { + local_bh_disable(); + inet_listen_wlock(hashinfo); + lock = &hashinfo->lhash_lock; + } else { + struct inet_ehash_bucket *head = &hashinfo->ehash[sk->sk_hashent]; + lock = &head->lock; + write_lock_bh(&head->lock); + } + + if (__sk_del_node_init(sk)) + sock_prot_dec_use(sk->sk_prot); + write_unlock_bh(lock); +out: + if (sk->sk_state == TCP_LISTEN) + wake_up(&hashinfo->lhash_wait); +} + +static inline int inet_iif(const struct sk_buff *skb) +{ + return ((struct rtable *)skb->dst)->rt_iif; +} + +extern struct sock *__inet_lookup_listener(const struct hlist_head *head, + const u32 daddr, + const unsigned short hnum, + const int dif); + +/* Optimize the common listener case. */ +static inline struct sock * + inet_lookup_listener(struct inet_hashinfo *hashinfo, + const u32 daddr, + const unsigned short hnum, const int dif) +{ + struct sock *sk = NULL; + const struct hlist_head *head; + + read_lock(&hashinfo->lhash_lock); + head = &hashinfo->listening_hash[inet_lhashfn(hnum)]; + if (!hlist_empty(head)) { + const struct inet_sock *inet = inet_sk((sk = __sk_head(head))); + + if (inet->num == hnum && !sk->sk_node.next && + (!inet->rcv_saddr || inet->rcv_saddr == daddr) && + (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) && + !sk->sk_bound_dev_if) + goto sherry_cache; + sk = __inet_lookup_listener(head, daddr, hnum, dif); + } + if (sk) { +sherry_cache: + sock_hold(sk); + } + read_unlock(&hashinfo->lhash_lock); + return sk; +} + +/* Socket demux engine toys. */ +#ifdef __BIG_ENDIAN +#define INET_COMBINED_PORTS(__sport, __dport) \ + (((__u32)(__sport) << 16) | (__u32)(__dport)) +#else /* __LITTLE_ENDIAN */ +#define INET_COMBINED_PORTS(__sport, __dport) \ + (((__u32)(__dport) << 16) | (__u32)(__sport)) +#endif + +#if (BITS_PER_LONG == 64) +#ifdef __BIG_ENDIAN +#define INET_ADDR_COOKIE(__name, __saddr, __daddr) \ + const __u64 __name = (((__u64)(__saddr)) << 32) | ((__u64)(__daddr)); +#else /* __LITTLE_ENDIAN */ +#define INET_ADDR_COOKIE(__name, __saddr, __daddr) \ + const __u64 __name = (((__u64)(__daddr)) << 32) | ((__u64)(__saddr)); +#endif /* __BIG_ENDIAN */ +#define INET_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\ + (((*((__u64 *)&(inet_sk(__sk)->daddr))) == (__cookie)) && \ + ((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports)) && \ + (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) +#define INET_TW_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\ + (((*((__u64 *)&(inet_twsk(__sk)->tw_daddr))) == (__cookie)) && \ + ((*((__u32 *)&(inet_twsk(__sk)->tw_dport))) == (__ports)) && \ + (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) +#else /* 32-bit arch */ +#define INET_ADDR_COOKIE(__name, __saddr, __daddr) +#define INET_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif) \ + ((inet_sk(__sk)->daddr == (__saddr)) && \ + (inet_sk(__sk)->rcv_saddr == (__daddr)) && \ + ((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports)) && \ + (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) +#define INET_TW_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif) \ + ((inet_twsk(__sk)->tw_daddr == (__saddr)) && \ + (inet_twsk(__sk)->tw_rcv_saddr == (__daddr)) && \ + ((*((__u32 *)&(inet_twsk(__sk)->tw_dport))) == (__ports)) && \ + (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) +#endif /* 64-bit arch */ + +/* + * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need + * not check it for lookups anymore, thanks Alexey. -DaveM + * + * Local BH must be disabled here. + */ +static inline struct sock * + __inet_lookup_established(struct inet_hashinfo *hashinfo, + const u32 saddr, const u16 sport, + const u32 daddr, const u16 hnum, + const int dif) +{ + INET_ADDR_COOKIE(acookie, saddr, daddr) + const __u32 ports = INET_COMBINED_PORTS(sport, hnum); + struct sock *sk; + const struct hlist_node *node; + /* Optimize here for direct hit, only listening connections can + * have wildcards anyways. + */ + const int hash = inet_ehashfn(daddr, hnum, saddr, sport, hashinfo->ehash_size); + struct inet_ehash_bucket *head = &hashinfo->ehash[hash]; + + read_lock(&head->lock); + sk_for_each(sk, node, &head->chain) { + if (INET_MATCH(sk, acookie, saddr, daddr, ports, dif)) + goto hit; /* You sunk my battleship! */ + } + + /* Must check for a TIME_WAIT'er before going to listener hash. */ + sk_for_each(sk, node, &(head + hashinfo->ehash_size)->chain) { + if (INET_TW_MATCH(sk, acookie, saddr, daddr, ports, dif)) + goto hit; + } + sk = NULL; +out: + read_unlock(&head->lock); + return sk; +hit: + sock_hold(sk); + goto out; +} + +static inline struct sock *__inet_lookup(struct inet_hashinfo *hashinfo, + const u32 saddr, const u16 sport, + const u32 daddr, const u16 hnum, + const int dif) +{ + struct sock *sk = __inet_lookup_established(hashinfo, saddr, sport, daddr, + hnum, dif); + return sk ? : inet_lookup_listener(hashinfo, daddr, hnum, dif); +} + +static inline struct sock *inet_lookup(struct inet_hashinfo *hashinfo, + const u32 saddr, const u16 sport, + const u32 daddr, const u16 dport, + const int dif) +{ + struct sock *sk; + + local_bh_disable(); + sk = __inet_lookup(hashinfo, saddr, sport, daddr, ntohs(dport), dif); + local_bh_enable(); + + return sk; +} +#endif /* _INET_HASHTABLES_H */ diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h new file mode 100644 index 00000000000..3b070352e86 --- /dev/null +++ b/include/net/inet_timewait_sock.h @@ -0,0 +1,219 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Definitions for a generic INET TIMEWAIT sock + * + * From code originally in net/tcp.h + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#ifndef _INET_TIMEWAIT_SOCK_ +#define _INET_TIMEWAIT_SOCK_ + +#include <linux/config.h> + +#include <linux/ip.h> +#include <linux/list.h> +#include <linux/timer.h> +#include <linux/types.h> +#include <linux/workqueue.h> + +#include <net/sock.h> +#include <net/tcp_states.h> + +#include <asm/atomic.h> + +struct inet_hashinfo; + +#define INET_TWDR_RECYCLE_SLOTS_LOG 5 +#define INET_TWDR_RECYCLE_SLOTS (1 << INET_TWDR_RECYCLE_SLOTS_LOG) + +/* + * If time > 4sec, it is "slow" path, no recycling is required, + * so that we select tick to get range about 4 seconds. + */ +#if HZ <= 16 || HZ > 4096 +# error Unsupported: HZ <= 16 or HZ > 4096 +#elif HZ <= 32 +# define INET_TWDR_RECYCLE_TICK (5 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) +#elif HZ <= 64 +# define INET_TWDR_RECYCLE_TICK (6 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) +#elif HZ <= 128 +# define INET_TWDR_RECYCLE_TICK (7 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) +#elif HZ <= 256 +# define INET_TWDR_RECYCLE_TICK (8 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) +#elif HZ <= 512 +# define INET_TWDR_RECYCLE_TICK (9 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) +#elif HZ <= 1024 +# define INET_TWDR_RECYCLE_TICK (10 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) +#elif HZ <= 2048 +# define INET_TWDR_RECYCLE_TICK (11 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) +#else +# define INET_TWDR_RECYCLE_TICK (12 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) +#endif + +/* TIME_WAIT reaping mechanism. */ +#define INET_TWDR_TWKILL_SLOTS 8 /* Please keep this a power of 2. */ + +#define INET_TWDR_TWKILL_QUOTA 100 + +struct inet_timewait_death_row { + /* Short-time timewait calendar */ + int twcal_hand; + int twcal_jiffie; + struct timer_list twcal_timer; + struct hlist_head twcal_row[INET_TWDR_RECYCLE_SLOTS]; + + spinlock_t death_lock; + int tw_count; + int period; + u32 thread_slots; + struct work_struct twkill_work; + struct timer_list tw_timer; + int slot; + struct hlist_head cells[INET_TWDR_TWKILL_SLOTS]; + struct inet_hashinfo *hashinfo; + int sysctl_tw_recycle; + int sysctl_max_tw_buckets; +}; + +extern void inet_twdr_hangman(unsigned long data); +extern void inet_twdr_twkill_work(void *data); +extern void inet_twdr_twcal_tick(unsigned long data); + +#if (BITS_PER_LONG == 64) +#define INET_TIMEWAIT_ADDRCMP_ALIGN_BYTES 8 +#else +#define INET_TIMEWAIT_ADDRCMP_ALIGN_BYTES 4 +#endif + +struct inet_bind_bucket; + +/* + * This is a TIME_WAIT sock. It works around the memory consumption + * problems of sockets in such a state on heavily loaded servers, but + * without violating the protocol specification. + */ +struct inet_timewait_sock { + /* + * Now struct sock also uses sock_common, so please just + * don't add nothing before this first member (__tw_common) --acme + */ + struct sock_common __tw_common; +#define tw_family __tw_common.skc_family +#define tw_state __tw_common.skc_state +#define tw_reuse __tw_common.skc_reuse +#define tw_bound_dev_if __tw_common.skc_bound_dev_if +#define tw_node __tw_common.skc_node +#define tw_bind_node __tw_common.skc_bind_node +#define tw_refcnt __tw_common.skc_refcnt +#define tw_prot __tw_common.skc_prot + volatile unsigned char tw_substate; + /* 3 bits hole, try to pack */ + unsigned char tw_rcv_wscale; + /* Socket demultiplex comparisons on incoming packets. */ + /* these five are in inet_sock */ + __u16 tw_sport; + __u32 tw_daddr __attribute__((aligned(INET_TIMEWAIT_ADDRCMP_ALIGN_BYTES))); + __u32 tw_rcv_saddr; + __u16 tw_dport; + __u16 tw_num; + /* And these are ours. */ + __u8 tw_ipv6only:1; + /* 31 bits hole, try to pack */ + int tw_hashent; + int tw_timeout; + unsigned long tw_ttd; + struct inet_bind_bucket *tw_tb; + struct hlist_node tw_death_node; +}; + +static inline void inet_twsk_add_node(struct inet_timewait_sock *tw, + struct hlist_head *list) +{ + hlist_add_head(&tw->tw_node, list); +} + +static inline void inet_twsk_add_bind_node(struct inet_timewait_sock *tw, + struct hlist_head *list) +{ + hlist_add_head(&tw->tw_bind_node, list); +} + +static inline int inet_twsk_dead_hashed(const struct inet_timewait_sock *tw) +{ + return tw->tw_death_node.pprev != NULL; +} + +static inline void inet_twsk_dead_node_init(struct inet_timewait_sock *tw) +{ + tw->tw_death_node.pprev = NULL; +} + +static inline void __inet_twsk_del_dead_node(struct inet_timewait_sock *tw) +{ + __hlist_del(&tw->tw_death_node); + inet_twsk_dead_node_init(tw); +} + +static inline int inet_twsk_del_dead_node(struct inet_timewait_sock *tw) +{ + if (inet_twsk_dead_hashed(tw)) { + __inet_twsk_del_dead_node(tw); + return 1; + } + return 0; +} + +#define inet_twsk_for_each(tw, node, head) \ + hlist_for_each_entry(tw, node, head, tw_node) + +#define inet_twsk_for_each_inmate(tw, node, jail) \ + hlist_for_each_entry(tw, node, jail, tw_death_node) + +#define inet_twsk_for_each_inmate_safe(tw, node, safe, jail) \ + hlist_for_each_entry_safe(tw, node, safe, jail, tw_death_node) + +static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk) +{ + return (struct inet_timewait_sock *)sk; +} + +static inline u32 inet_rcv_saddr(const struct sock *sk) +{ + return likely(sk->sk_state != TCP_TIME_WAIT) ? + inet_sk(sk)->rcv_saddr : inet_twsk(sk)->tw_rcv_saddr; +} + +static inline void inet_twsk_put(struct inet_timewait_sock *tw) +{ + if (atomic_dec_and_test(&tw->tw_refcnt)) { +#ifdef SOCK_REFCNT_DEBUG + printk(KERN_DEBUG "%s timewait_sock %p released\n", + tw->tw_prot->name, tw); +#endif + kmem_cache_free(tw->tw_prot->twsk_slab, tw); + } +} + +extern struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, + const int state); + +extern void __inet_twsk_kill(struct inet_timewait_sock *tw, + struct inet_hashinfo *hashinfo); + +extern void __inet_twsk_hashdance(struct inet_timewait_sock *tw, + struct sock *sk, + struct inet_hashinfo *hashinfo); + +extern void inet_twsk_schedule(struct inet_timewait_sock *tw, + struct inet_timewait_death_row *twdr, + const int timeo, const int timewait_len); +extern void inet_twsk_deschedule(struct inet_timewait_sock *tw, + struct inet_timewait_death_row *twdr); +#endif /* _INET_TIMEWAIT_SOCK_ */ diff --git a/include/net/ip.h b/include/net/ip.h index 32360bbe143..e4563bbee6e 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -86,7 +86,7 @@ extern int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, u32 saddr, u32 daddr, struct ip_options *opt); extern int ip_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt); + struct packet_type *pt, struct net_device *orig_dev); extern int ip_local_deliver(struct sk_buff *skb); extern int ip_mr_input(struct sk_buff *skb); extern int ip_output(struct sk_buff *skb); @@ -140,8 +140,6 @@ struct ip_reply_arg { void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg, unsigned int len); -extern int ip_finish_output(struct sk_buff *skb); - struct ipv4_config { int log_martians; @@ -165,6 +163,24 @@ extern int sysctl_local_port_range[2]; extern int sysctl_ip_default_ttl; extern int sysctl_ip_nonlocal_bind; +/* From ip_fragment.c */ +extern int sysctl_ipfrag_high_thresh; +extern int sysctl_ipfrag_low_thresh; +extern int sysctl_ipfrag_time; +extern int sysctl_ipfrag_secret_interval; + +/* From inetpeer.c */ +extern int inet_peer_threshold; +extern int inet_peer_minttl; +extern int inet_peer_maxttl; +extern int inet_peer_gc_mintime; +extern int inet_peer_gc_maxtime; + +/* From ip_output.c */ +extern int sysctl_ip_dynaddr; + +extern void ipfrag_init(void); + #ifdef CONFIG_INET /* The function in 2.2 was invalid, producing wrong result for * check=0xFEFF. It was noticed by Arthur Skawina _year_ ago. --ANK(000625) */ @@ -319,7 +335,10 @@ extern void ip_options_build(struct sk_buff *skb, struct ip_options *opt, u32 da extern int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb); extern void ip_options_fragment(struct sk_buff *skb); extern int ip_options_compile(struct ip_options *opt, struct sk_buff *skb); -extern int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen, int user); +extern int ip_options_get(struct ip_options **optp, + unsigned char *data, int optlen); +extern int ip_options_get_from_user(struct ip_options **optp, + unsigned char __user *data, int optlen); extern void ip_options_undo(struct ip_options * opt); extern void ip_forward_options(struct sk_buff *skb); extern int ip_options_rcv_srr(struct sk_buff *skb); @@ -350,5 +369,10 @@ int ipv4_doint_and_flush_strategy(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen, void **context); +#ifdef CONFIG_PROC_FS +extern int ip_misc_proc_init(void); +#endif + +extern struct ctl_table ipv4_table[]; #endif /* _IP_H */ diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index f920706d526..1f2e428ca36 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -12,7 +12,6 @@ #include <net/flow.h> #include <net/ip6_fib.h> #include <net/sock.h> -#include <linux/tcp.h> #include <linux/ip.h> #include <linux/ipv6.h> diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index a4208a336ac..14de4ebd121 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -295,4 +295,9 @@ static inline void fib_res_put(struct fib_result *res) #endif } +#ifdef CONFIG_PROC_FS +extern int fib_proc_init(void); +extern void fib_proc_exit(void); +#endif + #endif /* _NET_FIB_H */ diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 52da5d26617..7a3c43711a1 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -255,7 +255,6 @@ struct ip_vs_daemon_user { #include <asm/atomic.h> /* for struct atomic_t */ #include <linux/netdevice.h> /* for struct neighbour */ #include <net/dst.h> /* for struct dst_entry */ -#include <net/tcp.h> #include <net/udp.h> #include <linux/compiler.h> diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 69324465e8b..3203eaff4bd 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -104,6 +104,7 @@ struct frag_hdr { #ifdef __KERNEL__ +#include <linux/config.h> #include <net/sock.h> /* sysctls */ @@ -145,7 +146,6 @@ DECLARE_SNMP_STAT(struct udp_mib, udp_stats_in6); #define UDP6_INC_STATS(field) SNMP_INC_STATS(udp_stats_in6, field) #define UDP6_INC_STATS_BH(field) SNMP_INC_STATS_BH(udp_stats_in6, field) #define UDP6_INC_STATS_USER(field) SNMP_INC_STATS_USER(udp_stats_in6, field) -extern atomic_t inet6_sock_nr; int snmp6_register_dev(struct inet6_dev *idev); int snmp6_unregister_dev(struct inet6_dev *idev); @@ -346,7 +346,8 @@ static inline int ipv6_addr_any(const struct in6_addr *a) extern int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt); + struct packet_type *pt, + struct net_device *orig_dev); /* * upper-layer output functions @@ -464,8 +465,38 @@ extern int sysctl_ip6frag_low_thresh; extern int sysctl_ip6frag_time; extern int sysctl_ip6frag_secret_interval; -#endif /* __KERNEL__ */ -#endif /* _NET_IPV6_H */ +extern struct proto_ops inet6_stream_ops; +extern struct proto_ops inet6_dgram_ops; + +extern int ip6_mc_source(int add, int omode, struct sock *sk, + struct group_source_req *pgsr); +extern int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf); +extern int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, + struct group_filter __user *optval, + int __user *optlen); + +#ifdef CONFIG_PROC_FS +extern int ac6_proc_init(void); +extern void ac6_proc_exit(void); +extern int raw6_proc_init(void); +extern void raw6_proc_exit(void); +extern int tcp6_proc_init(void); +extern void tcp6_proc_exit(void); +extern int udp6_proc_init(void); +extern void udp6_proc_exit(void); +extern int ipv6_misc_proc_init(void); +extern void ipv6_misc_proc_exit(void); + +extern struct rt6_statistics rt6_stats; +#endif +#ifdef CONFIG_SYSCTL +extern ctl_table ipv6_route_table[]; +extern ctl_table ipv6_icmp_table[]; +extern void ipv6_sysctl_register(void); +extern void ipv6_sysctl_unregister(void); +#endif +#endif /* __KERNEL__ */ +#endif /* _NET_IPV6_H */ diff --git a/include/net/llc.h b/include/net/llc.h index c9aed2a8b4e..71769a5aeef 100644 --- a/include/net/llc.h +++ b/include/net/llc.h @@ -46,7 +46,8 @@ struct llc_sap { unsigned char f_bit; int (*rcv_func)(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt); + struct packet_type *pt, + struct net_device *orig_dev); struct llc_addr laddr; struct list_head node; struct { @@ -64,7 +65,7 @@ extern rwlock_t llc_sap_list_lock; extern unsigned char llc_station_mac_sa[ETH_ALEN]; extern int llc_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt); + struct packet_type *pt, struct net_device *orig_dev); extern int llc_mac_hdr_init(struct sk_buff *skb, unsigned char *sa, unsigned char *da); @@ -78,7 +79,8 @@ extern void llc_set_station_handler(void (*handler)(struct sk_buff *skb)); extern struct llc_sap *llc_sap_open(unsigned char lsap, int (*rcv)(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt)); + struct packet_type *pt, + struct net_device *orig_dev)); extern void llc_sap_close(struct llc_sap *sap); extern struct llc_sap *llc_sap_find(unsigned char sap_value); diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 89809891e5a..34c07731933 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -363,7 +363,14 @@ __neigh_lookup_errno(struct neigh_table *tbl, const void *pkey, return neigh_create(tbl, pkey, dev); } -#define LOCALLY_ENQUEUED -2 +struct neighbour_cb { + unsigned long sched_next; + unsigned int flags; +}; + +#define LOCALLY_ENQUEUED 0x1 + +#define NEIGH_CB(skb) ((struct neighbour_cb *)(skb)->cb) #endif #endif diff --git a/include/net/p8022.h b/include/net/p8022.h index 3c99a86c358..42e9fac51b3 100644 --- a/include/net/p8022.h +++ b/include/net/p8022.h @@ -4,7 +4,10 @@ extern struct datalink_proto * register_8022_client(unsigned char type, int (*func)(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt)); + struct packet_type *pt, + struct net_device *orig_dev)); extern void unregister_8022_client(struct datalink_proto *proto); +extern struct datalink_proto *make_8023_client(void); +extern void destroy_8023_client(struct datalink_proto *dl); #endif diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 4abda6aec05..b902d24a325 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -352,10 +352,10 @@ tcf_change_indev(struct tcf_proto *tp, char *indev, struct rtattr *indev_tlv) static inline int tcf_match_indev(struct sk_buff *skb, char *indev) { - if (0 != indev[0]) { - if (NULL == skb->input_dev) + if (indev[0]) { + if (!skb->input_dev) return 0; - else if (0 != strcmp(indev, skb->input_dev->name)) + if (strcmp(indev, skb->input_dev->name)) return 0; } diff --git a/include/net/psnap.h b/include/net/psnap.h index 9c94e8f98b3..b2e01cc3fc8 100644 --- a/include/net/psnap.h +++ b/include/net/psnap.h @@ -1,7 +1,7 @@ #ifndef _NET_PSNAP_H #define _NET_PSNAP_H -extern struct datalink_proto *register_snap_client(unsigned char *desc, int (*rcvfunc)(struct sk_buff *, struct net_device *, struct packet_type *)); +extern struct datalink_proto *register_snap_client(unsigned char *desc, int (*rcvfunc)(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *orig_dev)); extern void unregister_snap_client(struct datalink_proto *proto); #endif diff --git a/include/net/raw.h b/include/net/raw.h index 1c411c45587..f47917469b1 100644 --- a/include/net/raw.h +++ b/include/net/raw.h @@ -17,10 +17,10 @@ #ifndef _RAW_H #define _RAW_H +#include <linux/config.h> extern struct proto raw_prot; - extern void raw_err(struct sock *, struct sk_buff *, u32 info); extern int raw_rcv(struct sock *, struct sk_buff *); @@ -37,6 +37,11 @@ extern struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num, unsigned long raddr, unsigned long laddr, int dif); -extern void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash); +extern int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash); + +#ifdef CONFIG_PROC_FS +extern int raw_proc_init(void); +extern void raw_proc_exit(void); +#endif #endif /* _RAW_H */ diff --git a/include/net/rawv6.h b/include/net/rawv6.h index 23fd9a6a221..14476a71725 100644 --- a/include/net/rawv6.h +++ b/include/net/rawv6.h @@ -7,10 +7,11 @@ extern struct hlist_head raw_v6_htable[RAWV6_HTABLE_SIZE]; extern rwlock_t raw_v6_lock; -extern void ipv6_raw_deliver(struct sk_buff *skb, int nexthdr); +extern int ipv6_raw_deliver(struct sk_buff *skb, int nexthdr); extern struct sock *__raw_v6_lookup(struct sock *sk, unsigned short num, - struct in6_addr *loc_addr, struct in6_addr *rmt_addr); + struct in6_addr *loc_addr, struct in6_addr *rmt_addr, + int dif); extern int rawv6_rcv(struct sock *sk, struct sk_buff *skb); diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 72fd6f5e86b..b52cc52ffe3 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -89,6 +89,7 @@ struct listen_sock { int qlen_young; int clock_hand; u32 hash_rnd; + u32 nr_table_entries; struct request_sock *syn_table[0]; }; @@ -96,6 +97,7 @@ struct listen_sock { * * @rskq_accept_head - FIFO head of established children * @rskq_accept_tail - FIFO tail of established children + * @rskq_defer_accept - User waits for some data after accept() * @syn_wait_lock - serializer * * %syn_wait_lock is necessary only to avoid proc interface having to grab the main @@ -111,6 +113,8 @@ struct request_sock_queue { struct request_sock *rskq_accept_head; struct request_sock *rskq_accept_tail; rwlock_t syn_wait_lock; + u8 rskq_defer_accept; + /* 3 bytes hole, try to pack */ struct listen_sock *listen_opt; }; @@ -129,11 +133,13 @@ static inline struct listen_sock *reqsk_queue_yank_listen_sk(struct request_sock return lopt; } -static inline void reqsk_queue_destroy(struct request_sock_queue *queue) +static inline void __reqsk_queue_destroy(struct request_sock_queue *queue) { kfree(reqsk_queue_yank_listen_sk(queue)); } +extern void reqsk_queue_destroy(struct request_sock_queue *queue); + static inline struct request_sock * reqsk_queue_yank_acceptq(struct request_sock_queue *queue) { @@ -221,17 +227,17 @@ static inline int reqsk_queue_added(struct request_sock_queue *queue) return prev_qlen; } -static inline int reqsk_queue_len(struct request_sock_queue *queue) +static inline int reqsk_queue_len(const struct request_sock_queue *queue) { return queue->listen_opt != NULL ? queue->listen_opt->qlen : 0; } -static inline int reqsk_queue_len_young(struct request_sock_queue *queue) +static inline int reqsk_queue_len_young(const struct request_sock_queue *queue) { return queue->listen_opt->qlen_young; } -static inline int reqsk_queue_is_full(struct request_sock_queue *queue) +static inline int reqsk_queue_is_full(const struct request_sock_queue *queue) { return queue->listen_opt->qlen >> queue->listen_opt->max_qlen_log; } diff --git a/include/net/route.h b/include/net/route.h index c3cd069a9ac..dbe79ca67d3 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -105,10 +105,6 @@ struct rt_cache_stat unsigned int out_hlist_search; }; -extern struct rt_cache_stat *rt_cache_stat; -#define RT_CACHE_STAT_INC(field) \ - (per_cpu_ptr(rt_cache_stat, raw_smp_processor_id())->field++) - extern struct ip_rt_acct *ip_rt_acct; struct in_device; @@ -199,4 +195,6 @@ static inline struct inet_peer *rt_get_peer(struct rtable *rt) return rt->peer; } +extern ctl_table ipv4_route_table[]; + #endif /* _ROUTE_H */ diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h index 5999e5684bb..c51541ee024 100644 --- a/include/net/sctp/constants.h +++ b/include/net/sctp/constants.h @@ -47,10 +47,10 @@ #ifndef __sctp_constants_h__ #define __sctp_constants_h__ -#include <linux/tcp.h> /* For TCP states used in sctp_sock_state_t */ #include <linux/sctp.h> #include <linux/ipv6.h> /* For ipv6hdr. */ #include <net/sctp/user.h> +#include <net/tcp_states.h> /* For TCP states used in sctp_sock_state_t */ /* Value used for stream negotiation. */ enum { SCTP_MAX_STREAM = 0xffff }; diff --git a/include/net/sock.h b/include/net/sock.h index e9b1dbab90d..312cb25cbd1 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -88,6 +88,7 @@ do { spin_lock_init(&((__sk)->sk_lock.slock)); \ } while(0) struct sock; +struct proto; /** * struct sock_common - minimal network layer representation of sockets @@ -98,10 +99,11 @@ struct sock; * @skc_node: main hash linkage for various protocol lookup tables * @skc_bind_node: bind hash linkage for various protocol lookup tables * @skc_refcnt: reference count + * @skc_prot: protocol handlers inside a network family * * This is the minimal network layer representation of sockets, the header - * for struct sock and struct tcp_tw_bucket. - */ + * for struct sock and struct inet_timewait_sock. + */ struct sock_common { unsigned short skc_family; volatile unsigned char skc_state; @@ -110,11 +112,12 @@ struct sock_common { struct hlist_node skc_node; struct hlist_node skc_bind_node; atomic_t skc_refcnt; + struct proto *skc_prot; }; /** * struct sock - network layer representation of sockets - * @__sk_common: shared layout with tcp_tw_bucket + * @__sk_common: shared layout with inet_timewait_sock * @sk_shutdown: mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN * @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings * @sk_lock: synchronizer @@ -136,11 +139,10 @@ struct sock_common { * @sk_no_check: %SO_NO_CHECK setting, wether or not checkup packets * @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO) * @sk_lingertime: %SO_LINGER l_linger setting - * @sk_hashent: hash entry in several tables (e.g. tcp_ehash) + * @sk_hashent: hash entry in several tables (e.g. inet_hashinfo.ehash) * @sk_backlog: always used with the per-socket spinlock held * @sk_callback_lock: used with the callbacks in the end of this struct * @sk_error_queue: rarely used - * @sk_prot: protocol handlers inside a network family * @sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt, IPV6_ADDRFORM for instance) * @sk_err: last error * @sk_err_soft: errors that don't cause failure but are the cause of a persistent failure not just 'timed out' @@ -173,7 +175,7 @@ struct sock_common { */ struct sock { /* - * Now struct tcp_tw_bucket also uses sock_common, so please just + * Now struct inet_timewait_sock also uses sock_common, so please just * don't add nothing before this first member (__sk_common) --acme */ struct sock_common __sk_common; @@ -184,6 +186,7 @@ struct sock { #define sk_node __sk_common.skc_node #define sk_bind_node __sk_common.skc_bind_node #define sk_refcnt __sk_common.skc_refcnt +#define sk_prot __sk_common.skc_prot unsigned char sk_shutdown : 2, sk_no_check : 2, sk_userlocks : 4; @@ -218,7 +221,6 @@ struct sock { struct sk_buff *tail; } sk_backlog; struct sk_buff_head sk_error_queue; - struct proto *sk_prot; struct proto *sk_prot_creator; rwlock_t sk_callback_lock; int sk_err, @@ -253,28 +255,28 @@ struct sock { /* * Hashed lists helper routines */ -static inline struct sock *__sk_head(struct hlist_head *head) +static inline struct sock *__sk_head(const struct hlist_head *head) { return hlist_entry(head->first, struct sock, sk_node); } -static inline struct sock *sk_head(struct hlist_head *head) +static inline struct sock *sk_head(const struct hlist_head *head) { return hlist_empty(head) ? NULL : __sk_head(head); } -static inline struct sock *sk_next(struct sock *sk) +static inline struct sock *sk_next(const struct sock *sk) { return sk->sk_node.next ? hlist_entry(sk->sk_node.next, struct sock, sk_node) : NULL; } -static inline int sk_unhashed(struct sock *sk) +static inline int sk_unhashed(const struct sock *sk) { return hlist_unhashed(&sk->sk_node); } -static inline int sk_hashed(struct sock *sk) +static inline int sk_hashed(const struct sock *sk) { return sk->sk_node.pprev != NULL; } @@ -554,6 +556,10 @@ struct proto { kmem_cache_t *slab; unsigned int obj_size; + kmem_cache_t *twsk_slab; + unsigned int twsk_obj_size; + atomic_t *orphan_count; + struct request_sock_ops *rsk_prot; struct module *owner; @@ -561,7 +567,9 @@ struct proto { char name[32]; struct list_head node; - +#ifdef SOCK_REFCNT_DEBUG + atomic_t socks; +#endif struct { int inuse; u8 __pad[SMP_CACHE_BYTES - sizeof(int)]; @@ -571,6 +579,31 @@ struct proto { extern int proto_register(struct proto *prot, int alloc_slab); extern void proto_unregister(struct proto *prot); +#ifdef SOCK_REFCNT_DEBUG +static inline void sk_refcnt_debug_inc(struct sock *sk) +{ + atomic_inc(&sk->sk_prot->socks); +} + +static inline void sk_refcnt_debug_dec(struct sock *sk) +{ + atomic_dec(&sk->sk_prot->socks); + printk(KERN_DEBUG "%s socket %p released, %d are still alive\n", + sk->sk_prot->name, sk, atomic_read(&sk->sk_prot->socks)); +} + +static inline void sk_refcnt_debug_release(const struct sock *sk) +{ + if (atomic_read(&sk->sk_refcnt) != 1) + printk(KERN_DEBUG "Destruction of the %s socket %p delayed, refcnt=%d\n", + sk->sk_prot->name, sk, atomic_read(&sk->sk_refcnt)); +} +#else /* SOCK_REFCNT_DEBUG */ +#define sk_refcnt_debug_inc(sk) do { } while (0) +#define sk_refcnt_debug_dec(sk) do { } while (0) +#define sk_refcnt_debug_release(sk) do { } while (0) +#endif /* SOCK_REFCNT_DEBUG */ + /* Called with local bh disabled */ static __inline__ void sock_prot_inc_use(struct proto *prot) { @@ -582,6 +615,15 @@ static __inline__ void sock_prot_dec_use(struct proto *prot) prot->stats[smp_processor_id()].inuse--; } +/* With per-bucket locks this operation is not-atomic, so that + * this version is not worse. + */ +static inline void __sk_prot_rehash(struct sock *sk) +{ + sk->sk_prot->unhash(sk); + sk->sk_prot->hash(sk); +} + /* About 10 seconds */ #define SOCK_DESTROY_TIME (10*HZ) @@ -693,6 +735,8 @@ extern struct sock *sk_alloc(int family, unsigned int __nocast priority, struct proto *prot, int zero_it); extern void sk_free(struct sock *sk); +extern struct sock *sk_clone(const struct sock *sk, + const unsigned int __nocast priority); extern struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, @@ -986,6 +1030,16 @@ sk_dst_check(struct sock *sk, u32 cookie) return dst; } +static inline void sk_setup_caps(struct sock *sk, struct dst_entry *dst) +{ + __sk_dst_set(sk, dst); + sk->sk_route_caps = dst->dev->features; + if (sk->sk_route_caps & NETIF_F_TSO) { + if (sock_flag(sk, SOCK_NO_LARGESEND) || dst->header_len) + sk->sk_route_caps &= ~NETIF_F_TSO; + } +} + static inline void sk_charge_skb(struct sock *sk, struct sk_buff *skb) { sk->sk_wmem_queued += skb->truesize; @@ -1146,7 +1200,7 @@ static inline struct sk_buff *sk_stream_alloc_pskb(struct sock *sk, int hdr_len; hdr_len = SKB_DATA_ALIGN(sk->sk_prot->max_header); - skb = alloc_skb(size + hdr_len, gfp); + skb = alloc_skb_fclone(size + hdr_len, gfp); if (skb) { skb->truesize += mem; if (sk->sk_forward_alloc >= (int)skb->truesize || @@ -1228,16 +1282,19 @@ static inline int sock_intr_errno(long timeo) static __inline__ void sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { - struct timeval *stamp = &skb->stamp; + struct timeval stamp; + + skb_get_timestamp(skb, &stamp); if (sock_flag(sk, SOCK_RCVTSTAMP)) { /* Race occurred between timestamp enabling and packet receiving. Fill in the current time for now. */ - if (stamp->tv_sec == 0) - do_gettimeofday(stamp); + if (stamp.tv_sec == 0) + do_gettimeofday(&stamp); + skb_set_timestamp(skb, &stamp); put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP, sizeof(struct timeval), - stamp); + &stamp); } else - sk->sk_stamp = *stamp; + sk->sk_stamp = stamp; } /** @@ -1262,11 +1319,11 @@ extern int sock_get_timestamp(struct sock *, struct timeval __user *); */ #if 0 -#define NETDEBUG(x) do { } while (0) -#define LIMIT_NETDEBUG(x) do {} while(0) +#define NETDEBUG(fmt, args...) do { } while (0) +#define LIMIT_NETDEBUG(fmt, args...) do { } while(0) #else -#define NETDEBUG(x) do { x; } while (0) -#define LIMIT_NETDEBUG(x) do { if (net_ratelimit()) { x; } } while(0) +#define NETDEBUG(fmt, args...) printk(fmt,##args) +#define LIMIT_NETDEBUG(fmt, args...) do { if (net_ratelimit()) printk(fmt,##args); } while(0) #endif /* @@ -1313,4 +1370,14 @@ static inline int siocdevprivate_ioctl(unsigned int fd, unsigned int cmd, unsign } #endif +extern void sk_init(void); + +#ifdef CONFIG_SYSCTL +extern struct ctl_table core_table[]; +extern int sysctl_optmem_max; +#endif + +extern __u32 sysctl_wmem_default; +extern __u32 sysctl_rmem_default; + #endif /* _SOCK_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 5010f0c5a56..d6bcf1317a6 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -21,360 +21,29 @@ #define TCP_DEBUG 1 #define FASTRETRANS_DEBUG 1 -/* Cancel timers, when they are not required. */ -#undef TCP_CLEAR_TIMERS - #include <linux/config.h> #include <linux/list.h> #include <linux/tcp.h> #include <linux/slab.h> #include <linux/cache.h> #include <linux/percpu.h> + +#include <net/inet_connection_sock.h> +#include <net/inet_timewait_sock.h> +#include <net/inet_hashtables.h> #include <net/checksum.h> #include <net/request_sock.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ip.h> -#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) -#include <linux/ipv6.h> -#endif -#include <linux/seq_file.h> - -/* This is for all connections with a full identity, no wildcards. - * New scheme, half the table is for TIME_WAIT, the other half is - * for the rest. I'll experiment with dynamic table growth later. - */ -struct tcp_ehash_bucket { - rwlock_t lock; - struct hlist_head chain; -} __attribute__((__aligned__(8))); - -/* This is for listening sockets, thus all sockets which possess wildcards. */ -#define TCP_LHTABLE_SIZE 32 /* Yes, really, this is all you need. */ - -/* There are a few simple rules, which allow for local port reuse by - * an application. In essence: - * - * 1) Sockets bound to different interfaces may share a local port. - * Failing that, goto test 2. - * 2) If all sockets have sk->sk_reuse set, and none of them are in - * TCP_LISTEN state, the port may be shared. - * Failing that, goto test 3. - * 3) If all sockets are bound to a specific inet_sk(sk)->rcv_saddr local - * address, and none of them are the same, the port may be - * shared. - * Failing this, the port cannot be shared. - * - * The interesting point, is test #2. This is what an FTP server does - * all day. To optimize this case we use a specific flag bit defined - * below. As we add sockets to a bind bucket list, we perform a - * check of: (newsk->sk_reuse && (newsk->sk_state != TCP_LISTEN)) - * As long as all sockets added to a bind bucket pass this test, - * the flag bit will be set. - * The resulting situation is that tcp_v[46]_verify_bind() can just check - * for this flag bit, if it is set and the socket trying to bind has - * sk->sk_reuse set, we don't even have to walk the owners list at all, - * we return that it is ok to bind this socket to the requested local port. - * - * Sounds like a lot of work, but it is worth it. In a more naive - * implementation (ie. current FreeBSD etc.) the entire list of ports - * must be walked for each data port opened by an ftp server. Needless - * to say, this does not scale at all. With a couple thousand FTP - * users logged onto your box, isn't it nice to know that new data - * ports are created in O(1) time? I thought so. ;-) -DaveM - */ -struct tcp_bind_bucket { - unsigned short port; - signed short fastreuse; - struct hlist_node node; - struct hlist_head owners; -}; - -#define tb_for_each(tb, node, head) hlist_for_each_entry(tb, node, head, node) - -struct tcp_bind_hashbucket { - spinlock_t lock; - struct hlist_head chain; -}; - -static inline struct tcp_bind_bucket *__tb_head(struct tcp_bind_hashbucket *head) -{ - return hlist_entry(head->chain.first, struct tcp_bind_bucket, node); -} - -static inline struct tcp_bind_bucket *tb_head(struct tcp_bind_hashbucket *head) -{ - return hlist_empty(&head->chain) ? NULL : __tb_head(head); -} - -extern struct tcp_hashinfo { - /* This is for sockets with full identity only. Sockets here will - * always be without wildcards and will have the following invariant: - * - * TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE - * - * First half of the table is for sockets not in TIME_WAIT, second half - * is for TIME_WAIT sockets only. - */ - struct tcp_ehash_bucket *__tcp_ehash; - - /* Ok, let's try this, I give up, we do need a local binding - * TCP hash as well as the others for fast bind/connect. - */ - struct tcp_bind_hashbucket *__tcp_bhash; +#include <net/tcp_states.h> - int __tcp_bhash_size; - int __tcp_ehash_size; - - /* All sockets in TCP_LISTEN state will be in here. This is the only - * table where wildcard'd TCP sockets can exist. Hash function here - * is just local port number. - */ - struct hlist_head __tcp_listening_hash[TCP_LHTABLE_SIZE]; - - /* All the above members are written once at bootup and - * never written again _or_ are predominantly read-access. - * - * Now align to a new cache line as all the following members - * are often dirty. - */ - rwlock_t __tcp_lhash_lock ____cacheline_aligned; - atomic_t __tcp_lhash_users; - wait_queue_head_t __tcp_lhash_wait; - spinlock_t __tcp_portalloc_lock; -} tcp_hashinfo; - -#define tcp_ehash (tcp_hashinfo.__tcp_ehash) -#define tcp_bhash (tcp_hashinfo.__tcp_bhash) -#define tcp_ehash_size (tcp_hashinfo.__tcp_ehash_size) -#define tcp_bhash_size (tcp_hashinfo.__tcp_bhash_size) -#define tcp_listening_hash (tcp_hashinfo.__tcp_listening_hash) -#define tcp_lhash_lock (tcp_hashinfo.__tcp_lhash_lock) -#define tcp_lhash_users (tcp_hashinfo.__tcp_lhash_users) -#define tcp_lhash_wait (tcp_hashinfo.__tcp_lhash_wait) -#define tcp_portalloc_lock (tcp_hashinfo.__tcp_portalloc_lock) - -extern kmem_cache_t *tcp_bucket_cachep; -extern struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head, - unsigned short snum); -extern void tcp_bucket_destroy(struct tcp_bind_bucket *tb); -extern void tcp_bucket_unlock(struct sock *sk); -extern int tcp_port_rover; - -/* These are AF independent. */ -static __inline__ int tcp_bhashfn(__u16 lport) -{ - return (lport & (tcp_bhash_size - 1)); -} - -extern void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb, - unsigned short snum); - -#if (BITS_PER_LONG == 64) -#define TCP_ADDRCMP_ALIGN_BYTES 8 -#else -#define TCP_ADDRCMP_ALIGN_BYTES 4 -#endif - -/* This is a TIME_WAIT bucket. It works around the memory consumption - * problems of sockets in such a state on heavily loaded servers, but - * without violating the protocol specification. - */ -struct tcp_tw_bucket { - /* - * Now struct sock also uses sock_common, so please just - * don't add nothing before this first member (__tw_common) --acme - */ - struct sock_common __tw_common; -#define tw_family __tw_common.skc_family -#define tw_state __tw_common.skc_state -#define tw_reuse __tw_common.skc_reuse -#define tw_bound_dev_if __tw_common.skc_bound_dev_if -#define tw_node __tw_common.skc_node -#define tw_bind_node __tw_common.skc_bind_node -#define tw_refcnt __tw_common.skc_refcnt - volatile unsigned char tw_substate; - unsigned char tw_rcv_wscale; - __u16 tw_sport; - /* Socket demultiplex comparisons on incoming packets. */ - /* these five are in inet_sock */ - __u32 tw_daddr - __attribute__((aligned(TCP_ADDRCMP_ALIGN_BYTES))); - __u32 tw_rcv_saddr; - __u16 tw_dport; - __u16 tw_num; - /* And these are ours. */ - int tw_hashent; - int tw_timeout; - __u32 tw_rcv_nxt; - __u32 tw_snd_nxt; - __u32 tw_rcv_wnd; - __u32 tw_ts_recent; - long tw_ts_recent_stamp; - unsigned long tw_ttd; - struct tcp_bind_bucket *tw_tb; - struct hlist_node tw_death_node; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - struct in6_addr tw_v6_daddr; - struct in6_addr tw_v6_rcv_saddr; - int tw_v6_ipv6only; -#endif -}; - -static __inline__ void tw_add_node(struct tcp_tw_bucket *tw, - struct hlist_head *list) -{ - hlist_add_head(&tw->tw_node, list); -} - -static __inline__ void tw_add_bind_node(struct tcp_tw_bucket *tw, - struct hlist_head *list) -{ - hlist_add_head(&tw->tw_bind_node, list); -} - -static inline int tw_dead_hashed(struct tcp_tw_bucket *tw) -{ - return tw->tw_death_node.pprev != NULL; -} - -static __inline__ void tw_dead_node_init(struct tcp_tw_bucket *tw) -{ - tw->tw_death_node.pprev = NULL; -} - -static __inline__ void __tw_del_dead_node(struct tcp_tw_bucket *tw) -{ - __hlist_del(&tw->tw_death_node); - tw_dead_node_init(tw); -} - -static __inline__ int tw_del_dead_node(struct tcp_tw_bucket *tw) -{ - if (tw_dead_hashed(tw)) { - __tw_del_dead_node(tw); - return 1; - } - return 0; -} - -#define tw_for_each(tw, node, head) \ - hlist_for_each_entry(tw, node, head, tw_node) - -#define tw_for_each_inmate(tw, node, jail) \ - hlist_for_each_entry(tw, node, jail, tw_death_node) - -#define tw_for_each_inmate_safe(tw, node, safe, jail) \ - hlist_for_each_entry_safe(tw, node, safe, jail, tw_death_node) - -#define tcptw_sk(__sk) ((struct tcp_tw_bucket *)(__sk)) - -static inline u32 tcp_v4_rcv_saddr(const struct sock *sk) -{ - return likely(sk->sk_state != TCP_TIME_WAIT) ? - inet_sk(sk)->rcv_saddr : tcptw_sk(sk)->tw_rcv_saddr; -} - -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -static inline struct in6_addr *__tcp_v6_rcv_saddr(const struct sock *sk) -{ - return likely(sk->sk_state != TCP_TIME_WAIT) ? - &inet6_sk(sk)->rcv_saddr : &tcptw_sk(sk)->tw_v6_rcv_saddr; -} - -static inline struct in6_addr *tcp_v6_rcv_saddr(const struct sock *sk) -{ - return sk->sk_family == AF_INET6 ? __tcp_v6_rcv_saddr(sk) : NULL; -} - -#define tcptw_sk_ipv6only(__sk) (tcptw_sk(__sk)->tw_v6_ipv6only) - -static inline int tcp_v6_ipv6only(const struct sock *sk) -{ - return likely(sk->sk_state != TCP_TIME_WAIT) ? - ipv6_only_sock(sk) : tcptw_sk_ipv6only(sk); -} -#else -# define __tcp_v6_rcv_saddr(__sk) NULL -# define tcp_v6_rcv_saddr(__sk) NULL -# define tcptw_sk_ipv6only(__sk) 0 -# define tcp_v6_ipv6only(__sk) 0 -#endif +#include <linux/seq_file.h> -extern kmem_cache_t *tcp_timewait_cachep; - -static inline void tcp_tw_put(struct tcp_tw_bucket *tw) -{ - if (atomic_dec_and_test(&tw->tw_refcnt)) { -#ifdef INET_REFCNT_DEBUG - printk(KERN_DEBUG "tw_bucket %p released\n", tw); -#endif - kmem_cache_free(tcp_timewait_cachep, tw); - } -} +extern struct inet_hashinfo tcp_hashinfo; extern atomic_t tcp_orphan_count; -extern int tcp_tw_count; extern void tcp_time_wait(struct sock *sk, int state, int timeo); -extern void tcp_tw_deschedule(struct tcp_tw_bucket *tw); - - -/* Socket demux engine toys. */ -#ifdef __BIG_ENDIAN -#define TCP_COMBINED_PORTS(__sport, __dport) \ - (((__u32)(__sport)<<16) | (__u32)(__dport)) -#else /* __LITTLE_ENDIAN */ -#define TCP_COMBINED_PORTS(__sport, __dport) \ - (((__u32)(__dport)<<16) | (__u32)(__sport)) -#endif - -#if (BITS_PER_LONG == 64) -#ifdef __BIG_ENDIAN -#define TCP_V4_ADDR_COOKIE(__name, __saddr, __daddr) \ - __u64 __name = (((__u64)(__saddr))<<32)|((__u64)(__daddr)); -#else /* __LITTLE_ENDIAN */ -#define TCP_V4_ADDR_COOKIE(__name, __saddr, __daddr) \ - __u64 __name = (((__u64)(__daddr))<<32)|((__u64)(__saddr)); -#endif /* __BIG_ENDIAN */ -#define TCP_IPV4_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\ - (((*((__u64 *)&(inet_sk(__sk)->daddr)))== (__cookie)) && \ - ((*((__u32 *)&(inet_sk(__sk)->dport)))== (__ports)) && \ - (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) -#define TCP_IPV4_TW_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\ - (((*((__u64 *)&(tcptw_sk(__sk)->tw_daddr))) == (__cookie)) && \ - ((*((__u32 *)&(tcptw_sk(__sk)->tw_dport))) == (__ports)) && \ - (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) -#else /* 32-bit arch */ -#define TCP_V4_ADDR_COOKIE(__name, __saddr, __daddr) -#define TCP_IPV4_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\ - ((inet_sk(__sk)->daddr == (__saddr)) && \ - (inet_sk(__sk)->rcv_saddr == (__daddr)) && \ - ((*((__u32 *)&(inet_sk(__sk)->dport)))== (__ports)) && \ - (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) -#define TCP_IPV4_TW_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\ - ((tcptw_sk(__sk)->tw_daddr == (__saddr)) && \ - (tcptw_sk(__sk)->tw_rcv_saddr == (__daddr)) && \ - ((*((__u32 *)&(tcptw_sk(__sk)->tw_dport))) == (__ports)) && \ - (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) -#endif /* 64-bit arch */ - -#define TCP_IPV6_MATCH(__sk, __saddr, __daddr, __ports, __dif) \ - (((*((__u32 *)&(inet_sk(__sk)->dport)))== (__ports)) && \ - ((__sk)->sk_family == AF_INET6) && \ - ipv6_addr_equal(&inet6_sk(__sk)->daddr, (__saddr)) && \ - ipv6_addr_equal(&inet6_sk(__sk)->rcv_saddr, (__daddr)) && \ - (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) - -/* These can have wildcards, don't try too hard. */ -static __inline__ int tcp_lhashfn(unsigned short num) -{ - return num & (TCP_LHTABLE_SIZE - 1); -} - -static __inline__ int tcp_sk_listen_hashfn(struct sock *sk) -{ - return tcp_lhashfn(inet_sk(sk)->num); -} #define MAX_TCP_HEADER (128 + MAX_HEADER) @@ -478,33 +147,6 @@ static __inline__ int tcp_sk_listen_hashfn(struct sock *sk) * timestamps. It must be less than * minimal timewait lifetime. */ - -#define TCP_TW_RECYCLE_SLOTS_LOG 5 -#define TCP_TW_RECYCLE_SLOTS (1<<TCP_TW_RECYCLE_SLOTS_LOG) - -/* If time > 4sec, it is "slow" path, no recycling is required, - so that we select tick to get range about 4 seconds. - */ - -#if HZ <= 16 || HZ > 4096 -# error Unsupported: HZ <= 16 or HZ > 4096 -#elif HZ <= 32 -# define TCP_TW_RECYCLE_TICK (5+2-TCP_TW_RECYCLE_SLOTS_LOG) -#elif HZ <= 64 -# define TCP_TW_RECYCLE_TICK (6+2-TCP_TW_RECYCLE_SLOTS_LOG) -#elif HZ <= 128 -# define TCP_TW_RECYCLE_TICK (7+2-TCP_TW_RECYCLE_SLOTS_LOG) -#elif HZ <= 256 -# define TCP_TW_RECYCLE_TICK (8+2-TCP_TW_RECYCLE_SLOTS_LOG) -#elif HZ <= 512 -# define TCP_TW_RECYCLE_TICK (9+2-TCP_TW_RECYCLE_SLOTS_LOG) -#elif HZ <= 1024 -# define TCP_TW_RECYCLE_TICK (10+2-TCP_TW_RECYCLE_SLOTS_LOG) -#elif HZ <= 2048 -# define TCP_TW_RECYCLE_TICK (11+2-TCP_TW_RECYCLE_SLOTS_LOG) -#else -# define TCP_TW_RECYCLE_TICK (12+2-TCP_TW_RECYCLE_SLOTS_LOG) -#endif /* * TCP option */ @@ -534,22 +176,18 @@ static __inline__ int tcp_sk_listen_hashfn(struct sock *sk) #define TCPOLEN_SACK_BASE_ALIGNED 4 #define TCPOLEN_SACK_PERBLOCK 8 -#define TCP_TIME_RETRANS 1 /* Retransmit timer */ -#define TCP_TIME_DACK 2 /* Delayed ack timer */ -#define TCP_TIME_PROBE0 3 /* Zero window probe timer */ -#define TCP_TIME_KEEPOPEN 4 /* Keepalive timer */ - /* Flags in tp->nonagle */ #define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */ #define TCP_NAGLE_CORK 2 /* Socket is corked */ #define TCP_NAGLE_PUSH 4 /* Cork is overriden for already queued data */ +extern struct inet_timewait_death_row tcp_death_row; + /* sysctl variables for tcp */ extern int sysctl_tcp_timestamps; extern int sysctl_tcp_window_scaling; extern int sysctl_tcp_sack; extern int sysctl_tcp_fin_timeout; -extern int sysctl_tcp_tw_recycle; extern int sysctl_tcp_keepalive_time; extern int sysctl_tcp_keepalive_probes; extern int sysctl_tcp_keepalive_intvl; @@ -564,7 +202,6 @@ extern int sysctl_tcp_stdurg; extern int sysctl_tcp_rfc1337; extern int sysctl_tcp_abort_on_overflow; extern int sysctl_tcp_max_orphans; -extern int sysctl_tcp_max_tw_buckets; extern int sysctl_tcp_fack; extern int sysctl_tcp_reordering; extern int sysctl_tcp_ecn; @@ -585,12 +222,6 @@ extern atomic_t tcp_memory_allocated; extern atomic_t tcp_sockets_allocated; extern int tcp_memory_pressure; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -#define TCP_INET_FAMILY(fam) ((fam) == AF_INET) -#else -#define TCP_INET_FAMILY(fam) 1 -#endif - /* * Pointers to address related TCP functions * (i.e. things that depend on the address family) @@ -671,9 +302,6 @@ DECLARE_SNMP_STAT(struct tcp_mib, tcp_statistics); #define TCP_ADD_STATS_BH(field, val) SNMP_ADD_STATS_BH(tcp_statistics, field, val) #define TCP_ADD_STATS_USER(field, val) SNMP_ADD_STATS_USER(tcp_statistics, field, val) -extern void tcp_put_port(struct sock *sk); -extern void tcp_inherit_port(struct sock *sk, struct sock *child); - extern void tcp_v4_err(struct sk_buff *skb, u32); extern void tcp_shutdown (struct sock *sk, int how); @@ -682,7 +310,7 @@ extern int tcp_v4_rcv(struct sk_buff *skb); extern int tcp_v4_remember_stamp(struct sock *sk); -extern int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw); +extern int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw); extern int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t size); @@ -704,42 +332,22 @@ extern int tcp_rcv_established(struct sock *sk, extern void tcp_rcv_space_adjust(struct sock *sk); -enum tcp_ack_state_t -{ - TCP_ACK_SCHED = 1, - TCP_ACK_TIMER = 2, - TCP_ACK_PUSHED= 4 -}; - -static inline void tcp_schedule_ack(struct tcp_sock *tp) +static inline void tcp_dec_quickack_mode(struct sock *sk, + const unsigned int pkts) { - tp->ack.pending |= TCP_ACK_SCHED; -} - -static inline int tcp_ack_scheduled(struct tcp_sock *tp) -{ - return tp->ack.pending&TCP_ACK_SCHED; -} - -static __inline__ void tcp_dec_quickack_mode(struct tcp_sock *tp, unsigned int pkts) -{ - if (tp->ack.quick) { - if (pkts >= tp->ack.quick) { - tp->ack.quick = 0; + struct inet_connection_sock *icsk = inet_csk(sk); + if (icsk->icsk_ack.quick) { + if (pkts >= icsk->icsk_ack.quick) { + icsk->icsk_ack.quick = 0; /* Leaving quickack mode we deflate ATO. */ - tp->ack.ato = TCP_ATO_MIN; + icsk->icsk_ack.ato = TCP_ATO_MIN; } else - tp->ack.quick -= pkts; + icsk->icsk_ack.quick -= pkts; } } -extern void tcp_enter_quickack_mode(struct tcp_sock *tp); - -static __inline__ void tcp_delack_init(struct tcp_sock *tp) -{ - memset(&tp->ack, 0, sizeof(tp->ack)); -} +extern void tcp_enter_quickack_mode(struct sock *sk); static inline void tcp_clear_options(struct tcp_options_received *rx_opt) { @@ -755,10 +363,9 @@ enum tcp_tw_status }; -extern enum tcp_tw_status tcp_timewait_state_process(struct tcp_tw_bucket *tw, +extern enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, - struct tcphdr *th, - unsigned len); + const struct tcphdr *th); extern struct sock * tcp_check_req(struct sock *sk,struct sk_buff *skb, struct request_sock *req, @@ -773,7 +380,6 @@ extern void tcp_update_metrics(struct sock *sk); extern void tcp_close(struct sock *sk, long timeout); -extern struct sock * tcp_accept(struct sock *sk, int flags, int *err); extern unsigned int tcp_poll(struct file * file, struct socket *sock, struct poll_table_struct *wait); extern int tcp_getsockopt(struct sock *sk, int level, @@ -789,8 +395,6 @@ extern int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, size_t len, int nonblock, int flags, int *addr_len); -extern int tcp_listen_start(struct sock *sk); - extern void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab); @@ -799,11 +403,6 @@ extern void tcp_parse_options(struct sk_buff *skb, * TCP v4 functions exported for the inet6 API */ -extern int tcp_v4_rebuild_header(struct sock *sk); - -extern int tcp_v4_build_header(struct sock *sk, - struct sk_buff *skb); - extern void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, struct sk_buff *skb); @@ -872,18 +471,15 @@ extern void tcp_cwnd_application_limited(struct sock *sk); /* tcp_timer.c */ extern void tcp_init_xmit_timers(struct sock *); -extern void tcp_clear_xmit_timers(struct sock *); +static inline void tcp_clear_xmit_timers(struct sock *sk) +{ + inet_csk_clear_xmit_timers(sk); +} -extern void tcp_delete_keepalive_timer(struct sock *); -extern void tcp_reset_keepalive_timer(struct sock *, unsigned long); extern unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu); extern unsigned int tcp_current_mss(struct sock *sk, int large); -#ifdef TCP_DEBUG -extern const char tcp_timer_bug_msg[]; -#endif - -/* tcp_diag.c */ +/* tcp.c */ extern void tcp_get_info(struct sock *, struct tcp_info *); /* Read 'sendfile()'-style from a TCP socket */ @@ -892,72 +488,6 @@ typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *, extern int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor); -static inline void tcp_clear_xmit_timer(struct sock *sk, int what) -{ - struct tcp_sock *tp = tcp_sk(sk); - - switch (what) { - case TCP_TIME_RETRANS: - case TCP_TIME_PROBE0: - tp->pending = 0; - -#ifdef TCP_CLEAR_TIMERS - sk_stop_timer(sk, &tp->retransmit_timer); -#endif - break; - case TCP_TIME_DACK: - tp->ack.blocked = 0; - tp->ack.pending = 0; - -#ifdef TCP_CLEAR_TIMERS - sk_stop_timer(sk, &tp->delack_timer); -#endif - break; - default: -#ifdef TCP_DEBUG - printk(tcp_timer_bug_msg); -#endif - return; - }; - -} - -/* - * Reset the retransmission timer - */ -static inline void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (when > TCP_RTO_MAX) { -#ifdef TCP_DEBUG - printk(KERN_DEBUG "reset_xmit_timer sk=%p %d when=0x%lx, caller=%p\n", sk, what, when, current_text_addr()); -#endif - when = TCP_RTO_MAX; - } - - switch (what) { - case TCP_TIME_RETRANS: - case TCP_TIME_PROBE0: - tp->pending = what; - tp->timeout = jiffies+when; - sk_reset_timer(sk, &tp->retransmit_timer, tp->timeout); - break; - - case TCP_TIME_DACK: - tp->ack.pending |= TCP_ACK_TIMER; - tp->ack.timeout = jiffies+when; - sk_reset_timer(sk, &tp->delack_timer, tp->ack.timeout); - break; - - default: -#ifdef TCP_DEBUG - printk(tcp_timer_bug_msg); -#endif - return; - }; -} - /* Initialize RCV_MSS value. * RCV_MSS is an our guess about MSS used by the peer. * We haven't any direct information about the MSS. @@ -975,7 +505,7 @@ static inline void tcp_initialize_rcv_mss(struct sock *sk) hint = min(hint, TCP_MIN_RCVMSS); hint = max(hint, TCP_MIN_MSS); - tp->ack.rcv_mss = hint; + inet_csk(sk)->icsk_ack.rcv_mss = hint; } static __inline__ void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) @@ -1110,7 +640,8 @@ static inline void tcp_packets_out_inc(struct sock *sk, tp->packets_out += tcp_skb_pcount(skb); if (!orig) - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, TCP_RTO_MAX); } static inline void tcp_packets_out_dec(struct tcp_sock *tp, @@ -1138,29 +669,29 @@ struct tcp_congestion_ops { struct list_head list; /* initialize private data (optional) */ - void (*init)(struct tcp_sock *tp); + void (*init)(struct sock *sk); /* cleanup private data (optional) */ - void (*release)(struct tcp_sock *tp); + void (*release)(struct sock *sk); /* return slow start threshold (required) */ - u32 (*ssthresh)(struct tcp_sock *tp); + u32 (*ssthresh)(struct sock *sk); /* lower bound for congestion window (optional) */ - u32 (*min_cwnd)(struct tcp_sock *tp); + u32 (*min_cwnd)(struct sock *sk); /* do new cwnd calculation (required) */ - void (*cong_avoid)(struct tcp_sock *tp, u32 ack, + void (*cong_avoid)(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, int good_ack); /* round trip time sample per acked packet (optional) */ - void (*rtt_sample)(struct tcp_sock *tp, u32 usrtt); + void (*rtt_sample)(struct sock *sk, u32 usrtt); /* call before changing ca_state (optional) */ - void (*set_state)(struct tcp_sock *tp, u8 new_state); + void (*set_state)(struct sock *sk, u8 new_state); /* call when cwnd event occurs (optional) */ - void (*cwnd_event)(struct tcp_sock *tp, enum tcp_ca_event ev); + void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev); /* new value of cwnd after loss (optional) */ - u32 (*undo_cwnd)(struct tcp_sock *tp); + u32 (*undo_cwnd)(struct sock *sk); /* hook for packet ack accounting (optional) */ - void (*pkts_acked)(struct tcp_sock *tp, u32 num_acked); - /* get info for tcp_diag (optional) */ - void (*get_info)(struct tcp_sock *tp, u32 ext, struct sk_buff *skb); + void (*pkts_acked)(struct sock *sk, u32 num_acked); + /* get info for inet_diag (optional) */ + void (*get_info)(struct sock *sk, u32 ext, struct sk_buff *skb); char name[TCP_CA_NAME_MAX]; struct module *owner; @@ -1169,30 +700,34 @@ struct tcp_congestion_ops { extern int tcp_register_congestion_control(struct tcp_congestion_ops *type); extern void tcp_unregister_congestion_control(struct tcp_congestion_ops *type); -extern void tcp_init_congestion_control(struct tcp_sock *tp); -extern void tcp_cleanup_congestion_control(struct tcp_sock *tp); +extern void tcp_init_congestion_control(struct sock *sk); +extern void tcp_cleanup_congestion_control(struct sock *sk); extern int tcp_set_default_congestion_control(const char *name); extern void tcp_get_default_congestion_control(char *name); -extern int tcp_set_congestion_control(struct tcp_sock *tp, const char *name); +extern int tcp_set_congestion_control(struct sock *sk, const char *name); extern struct tcp_congestion_ops tcp_init_congestion_ops; -extern u32 tcp_reno_ssthresh(struct tcp_sock *tp); -extern void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, +extern u32 tcp_reno_ssthresh(struct sock *sk); +extern void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, int flag); -extern u32 tcp_reno_min_cwnd(struct tcp_sock *tp); +extern u32 tcp_reno_min_cwnd(struct sock *sk); extern struct tcp_congestion_ops tcp_reno; -static inline void tcp_set_ca_state(struct tcp_sock *tp, u8 ca_state) +static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state) { - if (tp->ca_ops->set_state) - tp->ca_ops->set_state(tp, ca_state); - tp->ca_state = ca_state; + struct inet_connection_sock *icsk = inet_csk(sk); + + if (icsk->icsk_ca_ops->set_state) + icsk->icsk_ca_ops->set_state(sk, ca_state); + icsk->icsk_ca_state = ca_state; } -static inline void tcp_ca_event(struct tcp_sock *tp, enum tcp_ca_event event) +static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) { - if (tp->ca_ops->cwnd_event) - tp->ca_ops->cwnd_event(tp, event); + const struct inet_connection_sock *icsk = inet_csk(sk); + + if (icsk->icsk_ca_ops->cwnd_event) + icsk->icsk_ca_ops->cwnd_event(sk, event); } /* This determines how many packets are "in the network" to the best @@ -1218,9 +753,10 @@ static __inline__ unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) * The exception is rate halving phase, when cwnd is decreasing towards * ssthresh. */ -static inline __u32 tcp_current_ssthresh(struct tcp_sock *tp) +static inline __u32 tcp_current_ssthresh(const struct sock *sk) { - if ((1<<tp->ca_state)&(TCPF_CA_CWR|TCPF_CA_Recovery)) + const struct tcp_sock *tp = tcp_sk(sk); + if ((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_CWR | TCPF_CA_Recovery)) return tp->snd_ssthresh; else return max(tp->snd_ssthresh, @@ -1237,10 +773,13 @@ static inline void tcp_sync_left_out(struct tcp_sock *tp) } /* Set slow start threshold and cwnd not falling to slow start */ -static inline void __tcp_enter_cwr(struct tcp_sock *tp) +static inline void __tcp_enter_cwr(struct sock *sk) { + const struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + tp->undo_marker = 0; - tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); + tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1U); tp->snd_cwnd_cnt = 0; @@ -1249,12 +788,14 @@ static inline void __tcp_enter_cwr(struct tcp_sock *tp) TCP_ECN_queue_cwr(tp); } -static inline void tcp_enter_cwr(struct tcp_sock *tp) +static inline void tcp_enter_cwr(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); + tp->prior_ssthresh = 0; - if (tp->ca_state < TCP_CA_CWR) { - __tcp_enter_cwr(tp); - tcp_set_ca_state(tp, TCP_CA_CWR); + if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { + __tcp_enter_cwr(sk); + tcp_set_ca_state(sk, TCP_CA_CWR); } } @@ -1277,8 +818,10 @@ static __inline__ void tcp_minshall_update(struct tcp_sock *tp, int mss, static __inline__ void tcp_check_probe_timer(struct sock *sk, struct tcp_sock *tp) { - if (!tp->packets_out && !tp->pending) - tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, tp->rto); + const struct inet_connection_sock *icsk = inet_csk(sk); + if (!tp->packets_out && !icsk->icsk_pending) + inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, + icsk->icsk_rto, TCP_RTO_MAX); } static __inline__ void tcp_push_pending_frames(struct sock *sk, @@ -1297,9 +840,6 @@ static __inline__ void tcp_update_wl(struct tcp_sock *tp, u32 ack, u32 seq) tp->snd_wl1 = seq; } -extern void tcp_destroy_sock(struct sock *sk); - - /* * Calculate(/check) TCP checksum */ @@ -1359,8 +899,10 @@ static __inline__ int tcp_prequeue(struct sock *sk, struct sk_buff *skb) tp->ucopy.memory = 0; } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) { wake_up_interruptible(sk->sk_sleep); - if (!tcp_ack_scheduled(tp)) - tcp_reset_xmit_timer(sk, TCP_TIME_DACK, (3*TCP_RTO_MIN)/4); + if (!inet_csk_ack_scheduled(sk)) + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, + (3 * TCP_RTO_MIN) / 4, + TCP_RTO_MAX); } return 1; } @@ -1393,9 +935,9 @@ static __inline__ void tcp_set_state(struct sock *sk, int state) TCP_INC_STATS(TCP_MIB_ESTABRESETS); sk->sk_prot->unhash(sk); - if (tcp_sk(sk)->bind_hash && + if (inet_csk(sk)->icsk_bind_hash && !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) - tcp_put_port(sk); + inet_put_port(&tcp_hashinfo, sk); /* fall through */ default: if (oldstate==TCP_ESTABLISHED) @@ -1422,7 +964,7 @@ static __inline__ void tcp_done(struct sock *sk) if (!sock_flag(sk, SOCK_DEAD)) sk->sk_state_change(sk); else - tcp_destroy_sock(sk); + inet_csk_destroy_sock(sk); } static __inline__ void tcp_sack_reset(struct tcp_options_received *rx_opt) @@ -1524,54 +1066,6 @@ static inline int tcp_full_space(const struct sock *sk) return tcp_win_from_space(sk->sk_rcvbuf); } -static inline void tcp_acceptq_queue(struct sock *sk, struct request_sock *req, - struct sock *child) -{ - reqsk_queue_add(&tcp_sk(sk)->accept_queue, req, sk, child); -} - -static inline void -tcp_synq_removed(struct sock *sk, struct request_sock *req) -{ - if (reqsk_queue_removed(&tcp_sk(sk)->accept_queue, req) == 0) - tcp_delete_keepalive_timer(sk); -} - -static inline void tcp_synq_added(struct sock *sk) -{ - if (reqsk_queue_added(&tcp_sk(sk)->accept_queue) == 0) - tcp_reset_keepalive_timer(sk, TCP_TIMEOUT_INIT); -} - -static inline int tcp_synq_len(struct sock *sk) -{ - return reqsk_queue_len(&tcp_sk(sk)->accept_queue); -} - -static inline int tcp_synq_young(struct sock *sk) -{ - return reqsk_queue_len_young(&tcp_sk(sk)->accept_queue); -} - -static inline int tcp_synq_is_full(struct sock *sk) -{ - return reqsk_queue_is_full(&tcp_sk(sk)->accept_queue); -} - -static inline void tcp_synq_unlink(struct tcp_sock *tp, struct request_sock *req, - struct request_sock **prev) -{ - reqsk_queue_unlink(&tp->accept_queue, req, prev); -} - -static inline void tcp_synq_drop(struct sock *sk, struct request_sock *req, - struct request_sock **prev) -{ - tcp_synq_unlink(tcp_sk(sk), req, prev); - tcp_synq_removed(sk, req); - reqsk_free(req); -} - static __inline__ void tcp_openreq_init(struct request_sock *req, struct tcp_options_received *rx_opt, struct sk_buff *skb) @@ -1593,27 +1087,6 @@ static __inline__ void tcp_openreq_init(struct request_sock *req, extern void tcp_enter_memory_pressure(void); -extern void tcp_listen_wlock(void); - -/* - We may sleep inside this lock. - * - If sleeping is not required (or called from BH), - * use plain read_(un)lock(&tcp_lhash_lock). - */ - -static inline void tcp_listen_lock(void) -{ - /* read_lock synchronizes to candidates to writers */ - read_lock(&tcp_lhash_lock); - atomic_inc(&tcp_lhash_users); - read_unlock(&tcp_lhash_lock); -} - -static inline void tcp_listen_unlock(void) -{ - if (atomic_dec_and_test(&tcp_lhash_users)) - wake_up(&tcp_lhash_wait); -} - static inline int keepalive_intvl_when(const struct tcp_sock *tp) { return tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl; @@ -1624,12 +1097,13 @@ static inline int keepalive_time_when(const struct tcp_sock *tp) return tp->keepalive_time ? : sysctl_tcp_keepalive_time; } -static inline int tcp_fin_time(const struct tcp_sock *tp) +static inline int tcp_fin_time(const struct sock *sk) { - int fin_timeout = tp->linger2 ? : sysctl_tcp_fin_timeout; + int fin_timeout = tcp_sk(sk)->linger2 ? : sysctl_tcp_fin_timeout; + const int rto = inet_csk(sk)->icsk_rto; - if (fin_timeout < (tp->rto<<2) - (tp->rto>>1)) - fin_timeout = (tp->rto<<2) - (tp->rto>>1); + if (fin_timeout < (rto << 2) - (rto >> 1)) + fin_timeout = (rto << 2) - (rto >> 1); return fin_timeout; } @@ -1658,15 +1132,6 @@ static inline int tcp_paws_check(const struct tcp_options_received *rx_opt, int return 1; } -static inline void tcp_v4_setup_caps(struct sock *sk, struct dst_entry *dst) -{ - sk->sk_route_caps = dst->dev->features; - if (sk->sk_route_caps & NETIF_F_TSO) { - if (sock_flag(sk, SOCK_NO_LARGESEND) || dst->header_len) - sk->sk_route_caps &= ~NETIF_F_TSO; - } -} - #define TCP_CHECK_TIMER(sk) do { } while (0) static inline int tcp_use_frto(const struct sock *sk) @@ -1718,4 +1183,16 @@ struct tcp_iter_state { extern int tcp_proc_register(struct tcp_seq_afinfo *afinfo); extern void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo); +extern struct request_sock_ops tcp_request_sock_ops; + +extern int tcp_v4_destroy_sock(struct sock *sk); + +#ifdef CONFIG_PROC_FS +extern int tcp4_proc_init(void); +extern void tcp4_proc_exit(void); +#endif + +extern void tcp_v4_init(struct net_proto_family *ops); +extern void tcp_init(void); + #endif /* _TCP_H */ diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h index 64980ee8c92..c6b84397448 100644 --- a/include/net/tcp_ecn.h +++ b/include/net/tcp_ecn.h @@ -88,7 +88,7 @@ static inline void TCP_ECN_check_ce(struct tcp_sock *tp, struct sk_buff *skb) * it is surely retransmit. It is not in ECN RFC, * but Linux follows this rule. */ else if (INET_ECN_is_not_ect((TCP_SKB_CB(skb)->flags))) - tcp_enter_quickack_mode(tp); + tcp_enter_quickack_mode((struct sock *)tp); } } diff --git a/include/net/tcp_states.h b/include/net/tcp_states.h new file mode 100644 index 00000000000..b9d4176b2d1 --- /dev/null +++ b/include/net/tcp_states.h @@ -0,0 +1,34 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Definitions for the TCP protocol sk_state field. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#ifndef _LINUX_TCP_STATES_H +#define _LINUX_TCP_STATES_H + +enum { + TCP_ESTABLISHED = 1, + TCP_SYN_SENT, + TCP_SYN_RECV, + TCP_FIN_WAIT1, + TCP_FIN_WAIT2, + TCP_TIME_WAIT, + TCP_CLOSE, + TCP_CLOSE_WAIT, + TCP_LAST_ACK, + TCP_LISTEN, + TCP_CLOSING, /* Now a valid state */ + + TCP_MAX_STATES /* Leave at the end! */ +}; + +#define TCP_STATE_MASK 0xF + +#endif /* _LINUX_TCP_STATES_H */ diff --git a/include/net/udp.h b/include/net/udp.h index ac229b761db..107b9d791a1 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -94,6 +94,11 @@ struct udp_iter_state { struct seq_operations seq_ops; }; +#ifdef CONFIG_PROC_FS extern int udp_proc_register(struct udp_seq_afinfo *afinfo); extern void udp_proc_unregister(struct udp_seq_afinfo *afinfo); + +extern int udp4_proc_init(void); +extern void udp4_proc_exit(void); +#endif #endif /* _UDP_H */ diff --git a/include/net/x25.h b/include/net/x25.h index 8b39b98876e..fee62ff8c19 100644 --- a/include/net/x25.h +++ b/include/net/x25.h @@ -175,7 +175,7 @@ extern void x25_kill_by_neigh(struct x25_neigh *); /* x25_dev.c */ extern void x25_send_frame(struct sk_buff *, struct x25_neigh *); -extern int x25_lapb_receive_frame(struct sk_buff *, struct net_device *, struct packet_type *); +extern int x25_lapb_receive_frame(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); extern void x25_establish_link(struct x25_neigh *); extern void x25_terminate_link(struct x25_neigh *); diff --git a/include/net/x25device.h b/include/net/x25device.h index d45ae883bd1..1a318374fae 100644 --- a/include/net/x25device.h +++ b/include/net/x25device.h @@ -8,7 +8,6 @@ static inline __be16 x25_type_trans(struct sk_buff *skb, struct net_device *dev) { skb->mac.raw = skb->data; - skb->input_dev = skb->dev = dev; skb->pkt_type = PACKET_HOST; return htons(ETH_P_X25); diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 868ef88ef97..a9d0d8c5dfb 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -818,7 +818,6 @@ extern void xfrm6_init(void); extern void xfrm6_fini(void); extern void xfrm_state_init(void); extern void xfrm4_state_init(void); -extern void xfrm4_state_fini(void); extern void xfrm6_state_init(void); extern void xfrm6_state_fini(void); diff --git a/init/main.c b/init/main.c index c9c311cf177..ff410063e4e 100644 --- a/init/main.c +++ b/init/main.c @@ -47,6 +47,7 @@ #include <linux/rmap.h> #include <linux/mempolicy.h> #include <linux/key.h> +#include <net/sock.h> #include <asm/io.h> #include <asm/bugs.h> @@ -80,7 +81,6 @@ static int init(void *); extern void init_IRQ(void); -extern void sock_init(void); extern void fork_init(unsigned long); extern void mca_init(void); extern void sbus_init(void); diff --git a/kernel/audit.c b/kernel/audit.c index ef35166fdc2..7f0699790d4 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -514,7 +514,8 @@ static int __init audit_init(void) { printk(KERN_INFO "audit: initializing netlink socket (%s)\n", audit_default ? "enabled" : "disabled"); - audit_sock = netlink_kernel_create(NETLINK_AUDIT, audit_receive); + audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive, + THIS_MODULE); if (!audit_sock) audit_panic("cannot initialize netlink socket"); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3e0bbee549e..8e56e249554 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -31,6 +31,7 @@ #include <linux/smp_lock.h> #include <linux/init.h> #include <linux/kernel.h> +#include <linux/net.h> #include <linux/sysrq.h> #include <linux/highuid.h> #include <linux/writeback.h> @@ -136,9 +137,6 @@ static struct ctl_table_header root_table_header = static ctl_table kern_table[]; static ctl_table vm_table[]; -#ifdef CONFIG_NET -extern ctl_table net_table[]; -#endif static ctl_table proc_table[]; static ctl_table fs_table[]; static ctl_table debug_table[]; diff --git a/lib/Kconfig b/lib/Kconfig index eeb429a5215..e43197efeb9 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -72,6 +72,9 @@ config TEXTSEARCH config TEXTSEARCH_KMP tristate +config TEXTSEARCH_BM + tristate + config TEXTSEARCH_FSM tristate diff --git a/lib/Makefile b/lib/Makefile index f28d9031303..52f83380f70 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -38,6 +38,7 @@ obj-$(CONFIG_REED_SOLOMON) += reed_solomon/ obj-$(CONFIG_TEXTSEARCH) += textsearch.o obj-$(CONFIG_TEXTSEARCH_KMP) += ts_kmp.o +obj-$(CONFIG_TEXTSEARCH_BM) += ts_bm.o obj-$(CONFIG_TEXTSEARCH_FSM) += ts_fsm.o hostprogs-y := gen_crc32table diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index 8e49d21057e..04ca4429ddf 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -93,6 +93,7 @@ static int send_uevent(const char *signal, const char *obj, } } + NETLINK_CB(skb).dst_group = 1; return netlink_broadcast(uevent_sock, skb, 0, 1, gfp_mask); } @@ -153,7 +154,8 @@ EXPORT_SYMBOL_GPL(kobject_uevent_atomic); static int __init kobject_uevent_init(void) { - uevent_sock = netlink_kernel_create(NETLINK_KOBJECT_UEVENT, NULL); + uevent_sock = netlink_kernel_create(NETLINK_KOBJECT_UEVENT, 1, NULL, + THIS_MODULE); if (!uevent_sock) { printk(KERN_ERR diff --git a/lib/ts_bm.c b/lib/ts_bm.c new file mode 100644 index 00000000000..2cc79112ecc --- /dev/null +++ b/lib/ts_bm.c @@ -0,0 +1,185 @@ +/* + * lib/ts_bm.c Boyer-Moore text search implementation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Pablo Neira Ayuso <pablo@eurodev.net> + * + * ========================================================================== + * + * Implements Boyer-Moore string matching algorithm: + * + * [1] A Fast String Searching Algorithm, R.S. Boyer and Moore. + * Communications of the Association for Computing Machinery, + * 20(10), 1977, pp. 762-772. + * http://www.cs.utexas.edu/users/moore/publications/fstrpos.pdf + * + * [2] Handbook of Exact String Matching Algorithms, Thierry Lecroq, 2004 + * http://www-igm.univ-mlv.fr/~lecroq/string/string.pdf + * + * Note: Since Boyer-Moore (BM) performs searches for matchings from right + * to left, it's still possible that a matching could be spread over + * multiple blocks, in that case this algorithm won't find any coincidence. + * + * If you're willing to ensure that such thing won't ever happen, use the + * Knuth-Pratt-Morris (KMP) implementation instead. In conclusion, choose + * the proper string search algorithm depending on your setting. + * + * Say you're using the textsearch infrastructure for filtering, NIDS or + * any similar security focused purpose, then go KMP. Otherwise, if you + * really care about performance, say you're classifying packets to apply + * Quality of Service (QoS) policies, and you don't mind about possible + * matchings spread over multiple fragments, then go BM. + */ + +#include <linux/config.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/textsearch.h> + +/* Alphabet size, use ASCII */ +#define ASIZE 256 + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(args, format...) +#endif + +struct ts_bm +{ + u8 * pattern; + unsigned int patlen; + unsigned int bad_shift[ASIZE]; + unsigned int good_shift[0]; +}; + +static unsigned int bm_find(struct ts_config *conf, struct ts_state *state) +{ + struct ts_bm *bm = ts_config_priv(conf); + unsigned int i, text_len, consumed = state->offset; + const u8 *text; + int shift = bm->patlen, bs; + + for (;;) { + text_len = conf->get_next_block(consumed, &text, conf, state); + + if (unlikely(text_len == 0)) + break; + + while (shift < text_len) { + DEBUGP("Searching in position %d (%c)\n", + shift, text[shift]); + for (i = 0; i < bm->patlen; i++) + if (text[shift-i] != bm->pattern[bm->patlen-1-i]) + goto next; + + /* London calling... */ + DEBUGP("found!\n"); + return consumed += (shift-(bm->patlen-1)); + +next: bs = bm->bad_shift[text[shift-i]]; + + /* Now jumping to... */ + shift = max_t(int, shift-i+bs, shift+bm->good_shift[i]); + } + consumed += text_len; + } + + return UINT_MAX; +} + +static void compute_prefix_tbl(struct ts_bm *bm, const u8 *pattern, + unsigned int len) +{ + int i, j, ended, l[ASIZE]; + + for (i = 0; i < ASIZE; i++) + bm->bad_shift[i] = len; + for (i = 0; i < len - 1; i++) + bm->bad_shift[pattern[i]] = len - 1 - i; + + /* Compute the good shift array, used to match reocurrences + * of a subpattern */ + for (i = 1; i < bm->patlen; i++) { + for (j = 0; j < bm->patlen && bm->pattern[bm->patlen - 1 - j] + == bm->pattern[bm->patlen - 1 - i - j]; j++); + l[i] = j; + } + + bm->good_shift[0] = 1; + for (i = 1; i < bm->patlen; i++) + bm->good_shift[i] = bm->patlen; + for (i = bm->patlen - 1; i > 0; i--) + bm->good_shift[l[i]] = i; + ended = 0; + for (i = 0; i < bm->patlen; i++) { + if (l[i] == bm->patlen - 1 - i) + ended = i; + if (ended) + bm->good_shift[i] = ended; + } +} + +static struct ts_config *bm_init(const void *pattern, unsigned int len, + int gfp_mask) +{ + struct ts_config *conf; + struct ts_bm *bm; + unsigned int prefix_tbl_len = len * sizeof(unsigned int); + size_t priv_size = sizeof(*bm) + len + prefix_tbl_len; + + conf = alloc_ts_config(priv_size, gfp_mask); + if (IS_ERR(conf)) + return conf; + + bm = ts_config_priv(conf); + bm->patlen = len; + bm->pattern = (u8 *) bm->good_shift + prefix_tbl_len; + compute_prefix_tbl(bm, pattern, len); + memcpy(bm->pattern, pattern, len); + + return conf; +} + +static void *bm_get_pattern(struct ts_config *conf) +{ + struct ts_bm *bm = ts_config_priv(conf); + return bm->pattern; +} + +static unsigned int bm_get_pattern_len(struct ts_config *conf) +{ + struct ts_bm *bm = ts_config_priv(conf); + return bm->patlen; +} + +static struct ts_ops bm_ops = { + .name = "bm", + .find = bm_find, + .init = bm_init, + .get_pattern = bm_get_pattern, + .get_pattern_len = bm_get_pattern_len, + .owner = THIS_MODULE, + .list = LIST_HEAD_INIT(bm_ops.list) +}; + +static int __init init_bm(void) +{ + return textsearch_register(&bm_ops); +} + +static void __exit exit_bm(void) +{ + textsearch_unregister(&bm_ops); +} + +MODULE_LICENSE("GPL"); + +module_init(init_bm); +module_exit(exit_bm); diff --git a/mm/memory.c b/mm/memory.c index e046b7e4b53..a596c117224 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -498,6 +498,17 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, unsigned long addr = vma->vm_start; unsigned long end = vma->vm_end; + /* + * Don't copy ptes where a page fault will fill them correctly. + * Fork becomes much lighter when there are big shared or private + * readonly mappings. The tradeoff is that copy_page_range is more + * efficient than faulting. + */ + if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_RESERVED))) { + if (!vma->anon_vma) + return 0; + } + if (is_vm_hugetlb_page(vma)) return copy_hugetlb_page_range(dst_mm, src_mm, vma); diff --git a/net/802/fc.c b/net/802/fc.c index 640d34e026c..282c4ab1abe 100644 --- a/net/802/fc.c +++ b/net/802/fc.c @@ -87,7 +87,7 @@ static int fc_rebuild_header(struct sk_buff *skb) struct fch_hdr *fch=(struct fch_hdr *)skb->data; struct fcllc *fcllc=(struct fcllc *)(skb->data+sizeof(struct fch_hdr)); if(fcllc->ethertype != htons(ETH_P_IP)) { - printk("fc_rebuild_header: Don't know how to resolve type %04X addresses ?\n",(unsigned int)htons(fcllc->ethertype)); + printk("fc_rebuild_header: Don't know how to resolve type %04X addresses ?\n", ntohs(fcllc->ethertype)); return 0; } #ifdef CONFIG_INET diff --git a/net/802/fddi.c b/net/802/fddi.c index 5ce24c4bb84..ac242a4bc34 100644 --- a/net/802/fddi.c +++ b/net/802/fddi.c @@ -108,8 +108,8 @@ static int fddi_rebuild_header(struct sk_buff *skb) else #endif { - printk("%s: Don't know how to resolve type %02X addresses.\n", - skb->dev->name, htons(fddi->hdr.llc_snap.ethertype)); + printk("%s: Don't know how to resolve type %04X addresses.\n", + skb->dev->name, ntohs(fddi->hdr.llc_snap.ethertype)); return(0); } } diff --git a/net/802/hippi.c b/net/802/hippi.c index 051e8af56a7..6d7fed3dd99 100644 --- a/net/802/hippi.c +++ b/net/802/hippi.c @@ -51,6 +51,7 @@ static int hippi_header(struct sk_buff *skb, struct net_device *dev, unsigned len) { struct hippi_hdr *hip = (struct hippi_hdr *)skb_push(skb, HIPPI_HLEN); + struct hippi_cb *hcb = (struct hippi_cb *) skb->cb; if (!len){ len = skb->len - HIPPI_HLEN; @@ -84,9 +85,10 @@ static int hippi_header(struct sk_buff *skb, struct net_device *dev, if (daddr) { memcpy(hip->le.dest_switch_addr, daddr + 3, 3); - memcpy(&skb->private.ifield, daddr + 2, 4); + memcpy(&hcb->ifield, daddr + 2, 4); return HIPPI_HLEN; } + hcb->ifield = 0; return -((int)HIPPI_HLEN); } @@ -122,7 +124,7 @@ static int hippi_rebuild_header(struct sk_buff *skb) * Determine the packet's protocol ID. */ -unsigned short hippi_type_trans(struct sk_buff *skb, struct net_device *dev) +__be16 hippi_type_trans(struct sk_buff *skb, struct net_device *dev) { struct hippi_hdr *hip; diff --git a/net/802/p8022.c b/net/802/p8022.c index 5ae63416df6..b24817c63ca 100644 --- a/net/802/p8022.c +++ b/net/802/p8022.c @@ -35,7 +35,8 @@ static int p8022_request(struct datalink_proto *dl, struct sk_buff *skb, struct datalink_proto *register_8022_client(unsigned char type, int (*func)(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt)) + struct packet_type *pt, + struct net_device *orig_dev)) { struct datalink_proto *proto; diff --git a/net/802/p8023.c b/net/802/p8023.c index a0b61b40225..6368d3dce44 100644 --- a/net/802/p8023.c +++ b/net/802/p8023.c @@ -20,6 +20,7 @@ #include <linux/skbuff.h> #include <net/datalink.h> +#include <net/p8022.h> /* * Place an 802.3 header on a packet. The driver will do the mac diff --git a/net/802/psnap.c b/net/802/psnap.c index 1053821ddf9..ab80b1fab53 100644 --- a/net/802/psnap.c +++ b/net/802/psnap.c @@ -47,7 +47,7 @@ static struct datalink_proto *find_snap_client(unsigned char *desc) * A SNAP packet has arrived */ static int snap_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) + struct packet_type *pt, struct net_device *orig_dev) { int rc = 1; struct datalink_proto *proto; @@ -61,7 +61,7 @@ static int snap_rcv(struct sk_buff *skb, struct net_device *dev, /* Pass the frame on. */ skb->h.raw += 5; skb_pull(skb, 5); - rc = proto->rcvfunc(skb, dev, &snap_packet_type); + rc = proto->rcvfunc(skb, dev, &snap_packet_type, orig_dev); } else { skb->sk = NULL; kfree_skb(skb); @@ -118,7 +118,8 @@ module_exit(snap_exit); struct datalink_proto *register_snap_client(unsigned char *desc, int (*rcvfunc)(struct sk_buff *, struct net_device *, - struct packet_type *)) + struct packet_type *, + struct net_device *)) { struct datalink_proto *proto = NULL; diff --git a/net/802/sysctl_net_802.c b/net/802/sysctl_net_802.c index 36079630c49..700129556c1 100644 --- a/net/802/sysctl_net_802.c +++ b/net/802/sysctl_net_802.c @@ -10,9 +10,10 @@ * 2 of the License, or (at your option) any later version. */ +#include <linux/config.h> #include <linux/mm.h> +#include <linux/if_tr.h> #include <linux/sysctl.h> -#include <linux/config.h> #ifdef CONFIG_TR extern int sysctl_tr_rif_timeout; diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h index 508b1fa1454..9ae3a14dd01 100644 --- a/net/8021q/vlan.h +++ b/net/8021q/vlan.h @@ -51,7 +51,7 @@ struct net_device *__find_vlan_dev(struct net_device* real_dev, /* found in vlan_dev.c */ int vlan_dev_rebuild_header(struct sk_buff *skb); int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev, - struct packet_type* ptype); + struct packet_type *ptype, struct net_device *orig_dev); int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, void *daddr, void *saddr, unsigned len); diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 49c48741351..145f5cde96c 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -113,7 +113,7 @@ static inline struct sk_buff *vlan_check_reorder_header(struct sk_buff *skb) * */ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev, - struct packet_type* ptype) + struct packet_type* ptype, struct net_device *orig_dev) { unsigned char *rawp = NULL; struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data); diff --git a/net/Kconfig b/net/Kconfig index 40a31ba86d2..c07aafb59a0 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -147,6 +147,7 @@ source "net/bridge/netfilter/Kconfig" endif +source "net/dccp/Kconfig" source "net/sctp/Kconfig" source "net/atm/Kconfig" source "net/bridge/Kconfig" @@ -205,6 +206,8 @@ config NET_PKTGEN To compile this code as a module, choose M here: the module will be called pktgen. +source "net/netfilter/Kconfig" + endmenu endmenu diff --git a/net/Makefile b/net/Makefile index 8e2bdc025ab..7e6eff206c8 100644 --- a/net/Makefile +++ b/net/Makefile @@ -16,6 +16,7 @@ obj-$(CONFIG_NET) += $(tmp-y) obj-$(CONFIG_LLC) += llc/ obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/ obj-$(CONFIG_INET) += ipv4/ +obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_XFRM) += xfrm/ obj-$(CONFIG_UNIX) += unix/ ifneq ($(CONFIG_IPV6),) @@ -41,6 +42,7 @@ obj-$(CONFIG_ATM) += atm/ obj-$(CONFIG_DECNET) += decnet/ obj-$(CONFIG_ECONET) += econet/ obj-$(CONFIG_VLAN_8021Q) += 8021q/ +obj-$(CONFIG_IP_DCCP) += dccp/ obj-$(CONFIG_IP_SCTP) += sctp/ ifeq ($(CONFIG_NET),y) diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c index c34614ea5fc..7076097debc 100644 --- a/net/appletalk/aarp.c +++ b/net/appletalk/aarp.c @@ -698,7 +698,7 @@ static void __aarp_resolved(struct aarp_entry **list, struct aarp_entry *a, * frame. We currently only support Ethernet. */ static int aarp_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) + struct packet_type *pt, struct net_device *orig_dev) { struct elapaarp *ea = aarp_hdr(skb); int hash, ret = 0; diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 192b529f86a..1d31b3a3f1e 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -53,12 +53,12 @@ #include <linux/config.h> #include <linux/module.h> -#include <linux/tcp.h> #include <linux/if_arp.h> #include <linux/termios.h> /* For TIOCOUTQ/INQ */ #include <net/datalink.h> #include <net/psnap.h> #include <net/sock.h> +#include <net/tcp_states.h> #include <net/route.h> #include <linux/atalk.h> @@ -1390,7 +1390,7 @@ free_it: * [ie ARPHRD_ETHERTALK] */ static int atalk_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) + struct packet_type *pt, struct net_device *orig_dev) { struct ddpehdr *ddp; struct sock *sock; @@ -1482,7 +1482,7 @@ freeit: * header and append a long one. */ static int ltalk_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) + struct packet_type *pt, struct net_device *orig_dev) { /* Expand any short form frames */ if (skb->mac.raw[2] == 1) { @@ -1528,7 +1528,7 @@ static int ltalk_rcv(struct sk_buff *skb, struct net_device *dev, } skb->h.raw = skb->data; - return atalk_rcv(skb, dev, pt); + return atalk_rcv(skb, dev, pt, orig_dev); freeit: kfree_skb(skb); return 0; diff --git a/net/atm/ipcommon.c b/net/atm/ipcommon.c index 181a3002d8a..4b1faca5013 100644 --- a/net/atm/ipcommon.c +++ b/net/atm/ipcommon.c @@ -34,7 +34,6 @@ void skb_migrate(struct sk_buff_head *from,struct sk_buff_head *to) { - struct sk_buff *skb; unsigned long flags; struct sk_buff *skb_from = (struct sk_buff *) from; struct sk_buff *skb_to = (struct sk_buff *) to; @@ -47,8 +46,6 @@ void skb_migrate(struct sk_buff_head *from,struct sk_buff_head *to) prev->next = skb_to; to->prev->next = from->next; to->prev = from->prev; - for (skb = from->next; skb != skb_to; skb = skb->next) - skb->list = to; to->qlen += from->qlen; spin_unlock(&to->lock); from->prev = skb_from; diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index a5c94f11547..ea43dfb774e 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -45,7 +45,7 @@ #include <linux/sysctl.h> #include <linux/init.h> #include <linux/spinlock.h> -#include <net/tcp.h> +#include <net/tcp_states.h> #include <net/ip.h> #include <net/arp.h> diff --git a/net/ax25/ax25_ds_in.c b/net/ax25/ax25_ds_in.c index 8adc0022cf5..edcaa897027 100644 --- a/net/ax25/ax25_ds_in.c +++ b/net/ax25/ax25_ds_in.c @@ -22,8 +22,7 @@ #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/sock.h> -#include <net/ip.h> /* For ip_rcv */ -#include <net/tcp.h> +#include <net/tcp_states.h> #include <asm/uaccess.h> #include <asm/system.h> #include <linux/fcntl.h> diff --git a/net/ax25/ax25_ds_timer.c b/net/ax25/ax25_ds_timer.c index 3a8b67316fc..061083efc1d 100644 --- a/net/ax25/ax25_ds_timer.c +++ b/net/ax25/ax25_ds_timer.c @@ -18,7 +18,7 @@ #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> -#include <net/tcp.h> +#include <net/tcp_states.h> #include <net/ax25.h> #include <linux/inet.h> #include <linux/netdevice.h> diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c index 3dc808fde33..810c9c76c2e 100644 --- a/net/ax25/ax25_in.c +++ b/net/ax25/ax25_in.c @@ -9,7 +9,6 @@ * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de) * Copyright (C) Hans-Joachim Hetscher DD8NE (dd8ne@bnv-bamberg.de) */ -#include <linux/config.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> @@ -26,9 +25,7 @@ #include <linux/skbuff.h> #include <linux/netfilter.h> #include <net/sock.h> -#include <net/ip.h> /* For ip_rcv */ -#include <net/tcp.h> -#include <net/arp.h> /* For arp_rcv */ +#include <net/tcp_states.h> #include <asm/uaccess.h> #include <asm/system.h> #include <linux/fcntl.h> @@ -114,7 +111,6 @@ int ax25_rx_iframe(ax25_cb *ax25, struct sk_buff *skb) pid = *skb->data; -#ifdef CONFIG_INET if (pid == AX25_P_IP) { /* working around a TCP bug to keep additional listeners * happy. TCP re-uses the buffer and destroys the original @@ -132,10 +128,9 @@ int ax25_rx_iframe(ax25_cb *ax25, struct sk_buff *skb) skb->dev = ax25->ax25_dev->dev; skb->pkt_type = PACKET_HOST; skb->protocol = htons(ETH_P_IP); - ip_rcv(skb, skb->dev, NULL); /* Wrong ptype */ + netif_rx(skb); return 1; } -#endif if (pid == AX25_P_SEGMENT) { skb_pull(skb, 1); /* Remove PID */ return ax25_rx_fragment(ax25, skb); @@ -250,7 +245,6 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev, /* Now we are pointing at the pid byte */ switch (skb->data[1]) { -#ifdef CONFIG_INET case AX25_P_IP: skb_pull(skb,2); /* drop PID/CTRL */ skb->h.raw = skb->data; @@ -258,7 +252,7 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev, skb->dev = dev; skb->pkt_type = PACKET_HOST; skb->protocol = htons(ETH_P_IP); - ip_rcv(skb, dev, ptype); /* Note ptype here is the wrong one, fix me later */ + netif_rx(skb); break; case AX25_P_ARP: @@ -268,9 +262,8 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev, skb->dev = dev; skb->pkt_type = PACKET_HOST; skb->protocol = htons(ETH_P_ARP); - arp_rcv(skb, dev, ptype); /* Note ptype here is wrong... */ + netif_rx(skb); break; -#endif case AX25_P_TEXT: /* Now find a suitable dgram socket */ sk = ax25_get_socket(&dest, &src, SOCK_DGRAM); @@ -454,7 +447,7 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev, * Receive an AX.25 frame via a SLIP interface. */ int ax25_kiss_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *ptype) + struct packet_type *ptype, struct net_device *orig_dev) { skb->sk = NULL; /* Initially we don't know who it's for */ skb->destructor = NULL; /* Who initializes this, dammit?! */ diff --git a/net/ax25/ax25_std_in.c b/net/ax25/ax25_std_in.c index 7131873322c..f6ed283e9de 100644 --- a/net/ax25/ax25_std_in.c +++ b/net/ax25/ax25_std_in.c @@ -29,8 +29,7 @@ #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/sock.h> -#include <net/ip.h> /* For ip_rcv */ -#include <net/tcp.h> +#include <net/tcp_states.h> #include <asm/uaccess.h> #include <asm/system.h> #include <linux/fcntl.h> diff --git a/net/ax25/ax25_std_timer.c b/net/ax25/ax25_std_timer.c index 066897bc074..a29c480a4dc 100644 --- a/net/ax25/ax25_std_timer.c +++ b/net/ax25/ax25_std_timer.c @@ -24,7 +24,7 @@ #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/sock.h> -#include <net/tcp.h> +#include <net/tcp_states.h> #include <asm/uaccess.h> #include <asm/system.h> #include <linux/fcntl.h> diff --git a/net/ax25/ax25_subr.c b/net/ax25/ax25_subr.c index 99694b57f6f..c41dbe5fade 100644 --- a/net/ax25/ax25_subr.c +++ b/net/ax25/ax25_subr.c @@ -24,7 +24,7 @@ #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/sock.h> -#include <net/tcp.h> +#include <net/tcp_states.h> #include <asm/uaccess.h> #include <asm/system.h> #include <linux/fcntl.h> @@ -76,7 +76,7 @@ void ax25_requeue_frames(ax25_cb *ax25) if (skb_prev == NULL) skb_queue_head(&ax25->write_queue, skb); else - skb_append(skb_prev, skb); + skb_append(skb_prev, skb, &ax25->write_queue); skb_prev = skb; } } diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index ffa26c10bfe..55dc42eac92 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -191,7 +191,7 @@ static void hci_init_req(struct hci_dev *hdev, unsigned long opt) /* Special commands */ while ((skb = skb_dequeue(&hdev->driver_init))) { - skb->pkt_type = HCI_COMMAND_PKT; + bt_cb(skb)->pkt_type = HCI_COMMAND_PKT; skb->dev = (void *) hdev; skb_queue_tail(&hdev->cmd_q, skb); hci_sched_cmd(hdev); @@ -995,11 +995,11 @@ static int hci_send_frame(struct sk_buff *skb) return -ENODEV; } - BT_DBG("%s type %d len %d", hdev->name, skb->pkt_type, skb->len); + BT_DBG("%s type %d len %d", hdev->name, bt_cb(skb)->pkt_type, skb->len); if (atomic_read(&hdev->promisc)) { /* Time stamp */ - do_gettimeofday(&skb->stamp); + __net_timestamp(skb); hci_send_to_sock(hdev, skb); } @@ -1034,7 +1034,7 @@ int hci_send_cmd(struct hci_dev *hdev, __u16 ogf, __u16 ocf, __u32 plen, void *p BT_DBG("skb len %d", skb->len); - skb->pkt_type = HCI_COMMAND_PKT; + bt_cb(skb)->pkt_type = HCI_COMMAND_PKT; skb->dev = (void *) hdev; skb_queue_tail(&hdev->cmd_q, skb); hci_sched_cmd(hdev); @@ -1081,7 +1081,7 @@ int hci_send_acl(struct hci_conn *conn, struct sk_buff *skb, __u16 flags) BT_DBG("%s conn %p flags 0x%x", hdev->name, conn, flags); skb->dev = (void *) hdev; - skb->pkt_type = HCI_ACLDATA_PKT; + bt_cb(skb)->pkt_type = HCI_ACLDATA_PKT; hci_add_acl_hdr(skb, conn->handle, flags | ACL_START); if (!(list = skb_shinfo(skb)->frag_list)) { @@ -1103,7 +1103,7 @@ int hci_send_acl(struct hci_conn *conn, struct sk_buff *skb, __u16 flags) skb = list; list = list->next; skb->dev = (void *) hdev; - skb->pkt_type = HCI_ACLDATA_PKT; + bt_cb(skb)->pkt_type = HCI_ACLDATA_PKT; hci_add_acl_hdr(skb, conn->handle, flags | ACL_CONT); BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len); @@ -1139,7 +1139,7 @@ int hci_send_sco(struct hci_conn *conn, struct sk_buff *skb) memcpy(skb->h.raw, &hdr, HCI_SCO_HDR_SIZE); skb->dev = (void *) hdev; - skb->pkt_type = HCI_SCODATA_PKT; + bt_cb(skb)->pkt_type = HCI_SCODATA_PKT; skb_queue_tail(&conn->data_q, skb); hci_sched_tx(hdev); return 0; @@ -1369,7 +1369,7 @@ void hci_rx_task(unsigned long arg) if (test_bit(HCI_INIT, &hdev->flags)) { /* Don't process data packets in this states. */ - switch (skb->pkt_type) { + switch (bt_cb(skb)->pkt_type) { case HCI_ACLDATA_PKT: case HCI_SCODATA_PKT: kfree_skb(skb); @@ -1378,7 +1378,7 @@ void hci_rx_task(unsigned long arg) } /* Process frame */ - switch (skb->pkt_type) { + switch (bt_cb(skb)->pkt_type) { case HCI_EVENT_PKT: hci_event_packet(hdev, skb); break; diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 46367bd129c..d6da0939216 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -484,14 +484,18 @@ static inline void hci_inquiry_complete_evt(struct hci_dev *hdev, struct sk_buff /* Inquiry Result */ static inline void hci_inquiry_result_evt(struct hci_dev *hdev, struct sk_buff *skb) { + struct inquiry_data data; struct inquiry_info *info = (struct inquiry_info *) (skb->data + 1); int num_rsp = *((__u8 *) skb->data); BT_DBG("%s num_rsp %d", hdev->name, num_rsp); + if (!num_rsp) + return; + hci_dev_lock(hdev); + for (; num_rsp; num_rsp--) { - struct inquiry_data data; bacpy(&data.bdaddr, &info->bdaddr); data.pscan_rep_mode = info->pscan_rep_mode; data.pscan_period_mode = info->pscan_period_mode; @@ -502,30 +506,55 @@ static inline void hci_inquiry_result_evt(struct hci_dev *hdev, struct sk_buff * info++; hci_inquiry_cache_update(hdev, &data); } + hci_dev_unlock(hdev); } /* Inquiry Result With RSSI */ static inline void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, struct sk_buff *skb) { - struct inquiry_info_with_rssi *info = (struct inquiry_info_with_rssi *) (skb->data + 1); + struct inquiry_data data; int num_rsp = *((__u8 *) skb->data); BT_DBG("%s num_rsp %d", hdev->name, num_rsp); + if (!num_rsp) + return; + hci_dev_lock(hdev); - for (; num_rsp; num_rsp--) { - struct inquiry_data data; - bacpy(&data.bdaddr, &info->bdaddr); - data.pscan_rep_mode = info->pscan_rep_mode; - data.pscan_period_mode = info->pscan_period_mode; - data.pscan_mode = 0x00; - memcpy(data.dev_class, info->dev_class, 3); - data.clock_offset = info->clock_offset; - data.rssi = info->rssi; - info++; - hci_inquiry_cache_update(hdev, &data); + + if ((skb->len - 1) / num_rsp != sizeof(struct inquiry_info_with_rssi)) { + struct inquiry_info_with_rssi_and_pscan_mode *info = + (struct inquiry_info_with_rssi_and_pscan_mode *) (skb->data + 1); + + for (; num_rsp; num_rsp--) { + bacpy(&data.bdaddr, &info->bdaddr); + data.pscan_rep_mode = info->pscan_rep_mode; + data.pscan_period_mode = info->pscan_period_mode; + data.pscan_mode = info->pscan_mode; + memcpy(data.dev_class, info->dev_class, 3); + data.clock_offset = info->clock_offset; + data.rssi = info->rssi; + info++; + hci_inquiry_cache_update(hdev, &data); + } + } else { + struct inquiry_info_with_rssi *info = + (struct inquiry_info_with_rssi *) (skb->data + 1); + + for (; num_rsp; num_rsp--) { + bacpy(&data.bdaddr, &info->bdaddr); + data.pscan_rep_mode = info->pscan_rep_mode; + data.pscan_period_mode = info->pscan_period_mode; + data.pscan_mode = 0x00; + memcpy(data.dev_class, info->dev_class, 3); + data.clock_offset = info->clock_offset; + data.rssi = info->rssi; + info++; + hci_inquiry_cache_update(hdev, &data); + } } + hci_dev_unlock(hdev); } @@ -865,6 +894,24 @@ static inline void hci_clock_offset_evt(struct hci_dev *hdev, struct sk_buff *sk hci_dev_unlock(hdev); } +/* Page Scan Repetition Mode */ +static inline void hci_pscan_rep_mode_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_pscan_rep_mode *ev = (struct hci_ev_pscan_rep_mode *) skb->data; + struct inquiry_entry *ie; + + BT_DBG("%s", hdev->name); + + hci_dev_lock(hdev); + + if ((ie = hci_inquiry_cache_lookup(hdev, &ev->bdaddr))) { + ie->data.pscan_rep_mode = ev->pscan_rep_mode; + ie->timestamp = jiffies; + } + + hci_dev_unlock(hdev); +} + void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_event_hdr *hdr = (struct hci_event_hdr *) skb->data; @@ -937,6 +984,10 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb) hci_clock_offset_evt(hdev, skb); break; + case HCI_EV_PSCAN_REP_MODE: + hci_pscan_rep_mode_evt(hdev, skb); + break; + case HCI_EV_CMD_STATUS: cs = (struct hci_ev_cmd_status *) skb->data; skb_pull(skb, sizeof(cs)); @@ -1036,9 +1087,9 @@ void hci_si_event(struct hci_dev *hdev, int type, int dlen, void *data) memcpy(ev->data, data, dlen); bt_cb(skb)->incoming = 1; - do_gettimeofday(&skb->stamp); + __net_timestamp(skb); - skb->pkt_type = HCI_EVENT_PKT; + bt_cb(skb)->pkt_type = HCI_EVENT_PKT; skb->dev = (void *) hdev; hci_send_to_sock(hdev, skb); kfree_skb(skb); diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index ebdcce5e7ca..32ef7975a13 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -110,11 +110,11 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb) /* Apply filter */ flt = &hci_pi(sk)->filter; - if (!test_bit((skb->pkt_type == HCI_VENDOR_PKT) ? - 0 : (skb->pkt_type & HCI_FLT_TYPE_BITS), &flt->type_mask)) + if (!test_bit((bt_cb(skb)->pkt_type == HCI_VENDOR_PKT) ? + 0 : (bt_cb(skb)->pkt_type & HCI_FLT_TYPE_BITS), &flt->type_mask)) continue; - if (skb->pkt_type == HCI_EVENT_PKT) { + if (bt_cb(skb)->pkt_type == HCI_EVENT_PKT) { register int evt = (*(__u8 *)skb->data & HCI_FLT_EVENT_BITS); if (!hci_test_bit(evt, &flt->event_mask)) @@ -131,7 +131,7 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb) continue; /* Put type byte before the data */ - memcpy(skb_push(nskb, 1), &nskb->pkt_type, 1); + memcpy(skb_push(nskb, 1), &bt_cb(nskb)->pkt_type, 1); if (sock_queue_rcv_skb(sk, nskb)) kfree_skb(nskb); @@ -327,11 +327,17 @@ static inline void hci_sock_cmsg(struct sock *sk, struct msghdr *msg, struct sk_ { __u32 mask = hci_pi(sk)->cmsg_mask; - if (mask & HCI_CMSG_DIR) - put_cmsg(msg, SOL_HCI, HCI_CMSG_DIR, sizeof(int), &bt_cb(skb)->incoming); + if (mask & HCI_CMSG_DIR) { + int incoming = bt_cb(skb)->incoming; + put_cmsg(msg, SOL_HCI, HCI_CMSG_DIR, sizeof(incoming), &incoming); + } + + if (mask & HCI_CMSG_TSTAMP) { + struct timeval tv; - if (mask & HCI_CMSG_TSTAMP) - put_cmsg(msg, SOL_HCI, HCI_CMSG_TSTAMP, sizeof(skb->stamp), &skb->stamp); + skb_get_timestamp(skb, &tv); + put_cmsg(msg, SOL_HCI, HCI_CMSG_TSTAMP, sizeof(tv), &tv); + } } static int hci_sock_recvmsg(struct kiocb *iocb, struct socket *sock, @@ -405,11 +411,11 @@ static int hci_sock_sendmsg(struct kiocb *iocb, struct socket *sock, goto drop; } - skb->pkt_type = *((unsigned char *) skb->data); + bt_cb(skb)->pkt_type = *((unsigned char *) skb->data); skb_pull(skb, 1); skb->dev = (void *) hdev; - if (skb->pkt_type == HCI_COMMAND_PKT) { + if (bt_cb(skb)->pkt_type == HCI_COMMAND_PKT) { u16 opcode = __le16_to_cpu(get_unaligned((u16 *)skb->data)); u16 ogf = hci_opcode_ogf(opcode); u16 ocf = hci_opcode_ocf(opcode); diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c index 32fccfb5bfa..d3d6bc54721 100644 --- a/net/bluetooth/l2cap.c +++ b/net/bluetooth/l2cap.c @@ -372,7 +372,7 @@ static struct proto l2cap_proto = { .obj_size = sizeof(struct l2cap_pinfo) }; -static struct sock *l2cap_sock_alloc(struct socket *sock, int proto, int prio) +static struct sock *l2cap_sock_alloc(struct socket *sock, int proto, unsigned int __nocast prio) { struct sock *sk; diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index 27bf5047cd3..173f46e8cda 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -21,10 +21,6 @@ SOFTWARE IS DISCLAIMED. */ -/* - RPN support - Dirk Husemann <hud@zurich.ibm.com> -*/ - /* * Bluetooth RFCOMM core. * @@ -115,10 +111,10 @@ static void rfcomm_session_del(struct rfcomm_session *s); #define __get_mcc_len(b) ((b & 0xfe) >> 1) /* RPN macros */ -#define __rpn_line_settings(data, stop, parity) ((data & 0x3) | ((stop & 0x1) << 2) | ((parity & 0x3) << 3)) +#define __rpn_line_settings(data, stop, parity) ((data & 0x3) | ((stop & 0x1) << 2) | ((parity & 0x7) << 3)) #define __get_rpn_data_bits(line) ((line) & 0x3) #define __get_rpn_stop_bits(line) (((line) >> 2) & 0x1) -#define __get_rpn_parity(line) (((line) >> 3) & 0x3) +#define __get_rpn_parity(line) (((line) >> 3) & 0x7) static inline void rfcomm_schedule(uint event) { @@ -233,7 +229,7 @@ static void rfcomm_dlc_clear_state(struct rfcomm_dlc *d) d->rx_credits = RFCOMM_DEFAULT_CREDITS; } -struct rfcomm_dlc *rfcomm_dlc_alloc(int prio) +struct rfcomm_dlc *rfcomm_dlc_alloc(unsigned int __nocast prio) { struct rfcomm_dlc *d = kmalloc(sizeof(*d), prio); if (!d) @@ -780,10 +776,10 @@ static int rfcomm_send_pn(struct rfcomm_session *s, int cr, struct rfcomm_dlc *d return rfcomm_send_frame(s, buf, ptr - buf); } -static int rfcomm_send_rpn(struct rfcomm_session *s, int cr, u8 dlci, - u8 bit_rate, u8 data_bits, u8 stop_bits, - u8 parity, u8 flow_ctrl_settings, - u8 xon_char, u8 xoff_char, u16 param_mask) +int rfcomm_send_rpn(struct rfcomm_session *s, int cr, u8 dlci, + u8 bit_rate, u8 data_bits, u8 stop_bits, + u8 parity, u8 flow_ctrl_settings, + u8 xon_char, u8 xoff_char, u16 param_mask) { struct rfcomm_hdr *hdr; struct rfcomm_mcc *mcc; @@ -791,9 +787,9 @@ static int rfcomm_send_rpn(struct rfcomm_session *s, int cr, u8 dlci, u8 buf[16], *ptr = buf; BT_DBG("%p cr %d dlci %d bit_r 0x%x data_b 0x%x stop_b 0x%x parity 0x%x" - "flwc_s 0x%x xon_c 0x%x xoff_c 0x%x p_mask 0x%x", - s, cr, dlci, bit_rate, data_bits, stop_bits, parity, - flow_ctrl_settings, xon_char, xoff_char, param_mask); + " flwc_s 0x%x xon_c 0x%x xoff_c 0x%x p_mask 0x%x", + s, cr, dlci, bit_rate, data_bits, stop_bits, parity, + flow_ctrl_settings, xon_char, xoff_char, param_mask); hdr = (void *) ptr; ptr += sizeof(*hdr); hdr->addr = __addr(s->initiator, 0); @@ -1265,16 +1261,16 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_ u8 xon_char = 0; u8 xoff_char = 0; u16 rpn_mask = RFCOMM_RPN_PM_ALL; - - BT_DBG("dlci %d cr %d len 0x%x bitr 0x%x line 0x%x flow 0x%x xonc 0x%x xoffc 0x%x pm 0x%x", - dlci, cr, len, rpn->bit_rate, rpn->line_settings, rpn->flow_ctrl, - rpn->xon_char, rpn->xoff_char, rpn->param_mask); - - if (!cr) + + BT_DBG("dlci %d cr %d len 0x%x bitr 0x%x line 0x%x flow 0x%x xonc 0x%x xoffc 0x%x pm 0x%x", + dlci, cr, len, rpn->bit_rate, rpn->line_settings, rpn->flow_ctrl, + rpn->xon_char, rpn->xoff_char, rpn->param_mask); + + if (!cr) return 0; - + if (len == 1) { - /* request: return default setting */ + /* This is a request, return default settings */ bit_rate = RFCOMM_RPN_BR_115200; data_bits = RFCOMM_RPN_DATA_8; stop_bits = RFCOMM_RPN_STOP_1; @@ -1282,11 +1278,12 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_ flow_ctrl = RFCOMM_RPN_FLOW_NONE; xon_char = RFCOMM_RPN_XON_CHAR; xoff_char = RFCOMM_RPN_XOFF_CHAR; - goto rpn_out; } - /* check for sane values: ignore/accept bit_rate, 8 bits, 1 stop bit, no parity, - no flow control lines, normal XON/XOFF chars */ + + /* Check for sane values, ignore/accept bit_rate, 8 bits, 1 stop bit, + * no parity, no flow control lines, normal XON/XOFF chars */ + if (rpn->param_mask & RFCOMM_RPN_PM_BITRATE) { bit_rate = rpn->bit_rate; if (bit_rate != RFCOMM_RPN_BR_115200) { @@ -1295,6 +1292,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_ rpn_mask ^= RFCOMM_RPN_PM_BITRATE; } } + if (rpn->param_mask & RFCOMM_RPN_PM_DATA) { data_bits = __get_rpn_data_bits(rpn->line_settings); if (data_bits != RFCOMM_RPN_DATA_8) { @@ -1303,6 +1301,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_ rpn_mask ^= RFCOMM_RPN_PM_DATA; } } + if (rpn->param_mask & RFCOMM_RPN_PM_STOP) { stop_bits = __get_rpn_stop_bits(rpn->line_settings); if (stop_bits != RFCOMM_RPN_STOP_1) { @@ -1311,6 +1310,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_ rpn_mask ^= RFCOMM_RPN_PM_STOP; } } + if (rpn->param_mask & RFCOMM_RPN_PM_PARITY) { parity = __get_rpn_parity(rpn->line_settings); if (parity != RFCOMM_RPN_PARITY_NONE) { @@ -1319,6 +1319,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_ rpn_mask ^= RFCOMM_RPN_PM_PARITY; } } + if (rpn->param_mask & RFCOMM_RPN_PM_FLOW) { flow_ctrl = rpn->flow_ctrl; if (flow_ctrl != RFCOMM_RPN_FLOW_NONE) { @@ -1327,6 +1328,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_ rpn_mask ^= RFCOMM_RPN_PM_FLOW; } } + if (rpn->param_mask & RFCOMM_RPN_PM_XON) { xon_char = rpn->xon_char; if (xon_char != RFCOMM_RPN_XON_CHAR) { @@ -1335,6 +1337,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_ rpn_mask ^= RFCOMM_RPN_PM_XON; } } + if (rpn->param_mask & RFCOMM_RPN_PM_XOFF) { xoff_char = rpn->xoff_char; if (xoff_char != RFCOMM_RPN_XOFF_CHAR) { @@ -1345,9 +1348,8 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_ } rpn_out: - rfcomm_send_rpn(s, 0, dlci, - bit_rate, data_bits, stop_bits, parity, flow_ctrl, - xon_char, xoff_char, rpn_mask); + rfcomm_send_rpn(s, 0, dlci, bit_rate, data_bits, stop_bits, + parity, flow_ctrl, xon_char, xoff_char, rpn_mask); return 0; } @@ -1358,14 +1360,13 @@ static int rfcomm_recv_rls(struct rfcomm_session *s, int cr, struct sk_buff *skb u8 dlci = __get_dlci(rls->dlci); BT_DBG("dlci %d cr %d status 0x%x", dlci, cr, rls->status); - + if (!cr) return 0; - /* FIXME: We should probably do something with this - information here. But for now it's sufficient just - to reply -- Bluetooth 1.1 says it's mandatory to - recognise and respond to RLS */ + /* We should probably do something with this information here. But + * for now it's sufficient just to reply -- Bluetooth 1.1 says it's + * mandatory to recognise and respond to RLS */ rfcomm_send_rls(s, 0, dlci, rls->status); @@ -1381,7 +1382,7 @@ static int rfcomm_recv_msc(struct rfcomm_session *s, int cr, struct sk_buff *skb BT_DBG("dlci %d cr %d v24 0x%x", dlci, cr, msc->v24_sig); d = rfcomm_dlc_get(s, dlci); - if (!d) + if (!d) return 0; if (cr) { @@ -1389,7 +1390,7 @@ static int rfcomm_recv_msc(struct rfcomm_session *s, int cr, struct sk_buff *skb set_bit(RFCOMM_TX_THROTTLED, &d->flags); else clear_bit(RFCOMM_TX_THROTTLED, &d->flags); - + rfcomm_dlc_lock(d); if (d->modem_status) d->modem_status(d, msc->v24_sig); @@ -1398,7 +1399,7 @@ static int rfcomm_recv_msc(struct rfcomm_session *s, int cr, struct sk_buff *skb rfcomm_send_msc(s, 0, dlci, msc->v24_sig); d->mscex |= RFCOMM_MSCEX_RX; - } else + } else d->mscex |= RFCOMM_MSCEX_TX; return 0; diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 63a123c5c41..90e19eb6d3c 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -284,7 +284,7 @@ static struct proto rfcomm_proto = { .obj_size = sizeof(struct rfcomm_pinfo) }; -static struct sock *rfcomm_sock_alloc(struct socket *sock, int proto, int prio) +static struct sock *rfcomm_sock_alloc(struct socket *sock, int proto, unsigned int __nocast prio) { struct rfcomm_dlc *d; struct sock *sk; diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index 6304590fd36..1bca860a610 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -286,7 +286,7 @@ static inline void rfcomm_set_owner_w(struct sk_buff *skb, struct rfcomm_dev *de skb->destructor = rfcomm_wfree; } -static struct sk_buff *rfcomm_wmalloc(struct rfcomm_dev *dev, unsigned long size, int priority) +static struct sk_buff *rfcomm_wmalloc(struct rfcomm_dev *dev, unsigned long size, unsigned int __nocast priority) { if (atomic_read(&dev->wmem_alloc) < rfcomm_room(dev->dlc)) { struct sk_buff *skb = alloc_skb(size, priority); @@ -528,9 +528,14 @@ static void rfcomm_dev_modem_status(struct rfcomm_dlc *dlc, u8 v24_sig) struct rfcomm_dev *dev = dlc->owner; if (!dev) return; - + BT_DBG("dlc %p dev %p v24_sig 0x%02x", dlc, dev, v24_sig); + if ((dev->modem_status & TIOCM_CD) && !(v24_sig & RFCOMM_V24_DV)) { + if (dev->tty && !C_CLOCAL(dev->tty)) + tty_hangup(dev->tty); + } + dev->modem_status = ((v24_sig & RFCOMM_V24_RTC) ? (TIOCM_DSR | TIOCM_DTR) : 0) | ((v24_sig & RFCOMM_V24_RTR) ? (TIOCM_RTS | TIOCM_CTS) : 0) | @@ -740,20 +745,143 @@ static int rfcomm_tty_ioctl(struct tty_struct *tty, struct file *filp, unsigned return -ENOIOCTLCMD; } -#define RELEVANT_IFLAG(iflag) (iflag & (IGNBRK|BRKINT|IGNPAR|PARMRK|INPCK)) - static void rfcomm_tty_set_termios(struct tty_struct *tty, struct termios *old) { - BT_DBG("tty %p", tty); + struct termios *new = (struct termios *) tty->termios; + int old_baud_rate = tty_termios_baud_rate(old); + int new_baud_rate = tty_termios_baud_rate(new); - if ((tty->termios->c_cflag == old->c_cflag) && - (RELEVANT_IFLAG(tty->termios->c_iflag) == RELEVANT_IFLAG(old->c_iflag))) - return; + u8 baud, data_bits, stop_bits, parity, x_on, x_off; + u16 changes = 0; + + struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; + + BT_DBG("tty %p termios %p", tty, old); + + /* Handle turning off CRTSCTS */ + if ((old->c_cflag & CRTSCTS) && !(new->c_cflag & CRTSCTS)) + BT_DBG("Turning off CRTSCTS unsupported"); + + /* Parity on/off and when on, odd/even */ + if (((old->c_cflag & PARENB) != (new->c_cflag & PARENB)) || + ((old->c_cflag & PARODD) != (new->c_cflag & PARODD)) ) { + changes |= RFCOMM_RPN_PM_PARITY; + BT_DBG("Parity change detected."); + } + + /* Mark and space parity are not supported! */ + if (new->c_cflag & PARENB) { + if (new->c_cflag & PARODD) { + BT_DBG("Parity is ODD"); + parity = RFCOMM_RPN_PARITY_ODD; + } else { + BT_DBG("Parity is EVEN"); + parity = RFCOMM_RPN_PARITY_EVEN; + } + } else { + BT_DBG("Parity is OFF"); + parity = RFCOMM_RPN_PARITY_NONE; + } + + /* Setting the x_on / x_off characters */ + if (old->c_cc[VSTOP] != new->c_cc[VSTOP]) { + BT_DBG("XOFF custom"); + x_on = new->c_cc[VSTOP]; + changes |= RFCOMM_RPN_PM_XON; + } else { + BT_DBG("XOFF default"); + x_on = RFCOMM_RPN_XON_CHAR; + } + + if (old->c_cc[VSTART] != new->c_cc[VSTART]) { + BT_DBG("XON custom"); + x_off = new->c_cc[VSTART]; + changes |= RFCOMM_RPN_PM_XOFF; + } else { + BT_DBG("XON default"); + x_off = RFCOMM_RPN_XOFF_CHAR; + } + + /* Handle setting of stop bits */ + if ((old->c_cflag & CSTOPB) != (new->c_cflag & CSTOPB)) + changes |= RFCOMM_RPN_PM_STOP; + + /* POSIX does not support 1.5 stop bits and RFCOMM does not + * support 2 stop bits. So a request for 2 stop bits gets + * translated to 1.5 stop bits */ + if (new->c_cflag & CSTOPB) { + stop_bits = RFCOMM_RPN_STOP_15; + } else { + stop_bits = RFCOMM_RPN_STOP_1; + } + + /* Handle number of data bits [5-8] */ + if ((old->c_cflag & CSIZE) != (new->c_cflag & CSIZE)) + changes |= RFCOMM_RPN_PM_DATA; + + switch (new->c_cflag & CSIZE) { + case CS5: + data_bits = RFCOMM_RPN_DATA_5; + break; + case CS6: + data_bits = RFCOMM_RPN_DATA_6; + break; + case CS7: + data_bits = RFCOMM_RPN_DATA_7; + break; + case CS8: + data_bits = RFCOMM_RPN_DATA_8; + break; + default: + data_bits = RFCOMM_RPN_DATA_8; + break; + } + + /* Handle baudrate settings */ + if (old_baud_rate != new_baud_rate) + changes |= RFCOMM_RPN_PM_BITRATE; - /* handle turning off CRTSCTS */ - if ((old->c_cflag & CRTSCTS) && !(tty->termios->c_cflag & CRTSCTS)) { - BT_DBG("turning off CRTSCTS"); + switch (new_baud_rate) { + case 2400: + baud = RFCOMM_RPN_BR_2400; + break; + case 4800: + baud = RFCOMM_RPN_BR_4800; + break; + case 7200: + baud = RFCOMM_RPN_BR_7200; + break; + case 9600: + baud = RFCOMM_RPN_BR_9600; + break; + case 19200: + baud = RFCOMM_RPN_BR_19200; + break; + case 38400: + baud = RFCOMM_RPN_BR_38400; + break; + case 57600: + baud = RFCOMM_RPN_BR_57600; + break; + case 115200: + baud = RFCOMM_RPN_BR_115200; + break; + case 230400: + baud = RFCOMM_RPN_BR_230400; + break; + default: + /* 9600 is standard accordinag to the RFCOMM specification */ + baud = RFCOMM_RPN_BR_9600; + break; + } + + if (changes) + rfcomm_send_rpn(dev->dlc->session, 1, dev->dlc->dlci, baud, + data_bits, stop_bits, parity, + RFCOMM_RPN_FLOW_NONE, x_on, x_off, changes); + + return; } static void rfcomm_tty_throttle(struct tty_struct *tty) @@ -761,7 +889,7 @@ static void rfcomm_tty_throttle(struct tty_struct *tty) struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; BT_DBG("tty %p dev %p", tty, dev); - + rfcomm_dlc_throttle(dev->dlc); } @@ -770,7 +898,7 @@ static void rfcomm_tty_unthrottle(struct tty_struct *tty) struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; BT_DBG("tty %p dev %p", tty, dev); - + rfcomm_dlc_unthrottle(dev->dlc); } @@ -841,35 +969,35 @@ static int rfcomm_tty_tiocmget(struct tty_struct *tty, struct file *filp) static int rfcomm_tty_tiocmset(struct tty_struct *tty, struct file *filp, unsigned int set, unsigned int clear) { - struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; - struct rfcomm_dlc *dlc = dev->dlc; - u8 v24_sig; + struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; + struct rfcomm_dlc *dlc = dev->dlc; + u8 v24_sig; BT_DBG("tty %p dev %p set 0x%02x clear 0x%02x", tty, dev, set, clear); - rfcomm_dlc_get_modem_status(dlc, &v24_sig); - - if (set & TIOCM_DSR || set & TIOCM_DTR) - v24_sig |= RFCOMM_V24_RTC; - if (set & TIOCM_RTS || set & TIOCM_CTS) - v24_sig |= RFCOMM_V24_RTR; - if (set & TIOCM_RI) - v24_sig |= RFCOMM_V24_IC; - if (set & TIOCM_CD) - v24_sig |= RFCOMM_V24_DV; - - if (clear & TIOCM_DSR || clear & TIOCM_DTR) - v24_sig &= ~RFCOMM_V24_RTC; - if (clear & TIOCM_RTS || clear & TIOCM_CTS) - v24_sig &= ~RFCOMM_V24_RTR; - if (clear & TIOCM_RI) - v24_sig &= ~RFCOMM_V24_IC; - if (clear & TIOCM_CD) - v24_sig &= ~RFCOMM_V24_DV; - - rfcomm_dlc_set_modem_status(dlc, v24_sig); - - return 0; + rfcomm_dlc_get_modem_status(dlc, &v24_sig); + + if (set & TIOCM_DSR || set & TIOCM_DTR) + v24_sig |= RFCOMM_V24_RTC; + if (set & TIOCM_RTS || set & TIOCM_CTS) + v24_sig |= RFCOMM_V24_RTR; + if (set & TIOCM_RI) + v24_sig |= RFCOMM_V24_IC; + if (set & TIOCM_CD) + v24_sig |= RFCOMM_V24_DV; + + if (clear & TIOCM_DSR || clear & TIOCM_DTR) + v24_sig &= ~RFCOMM_V24_RTC; + if (clear & TIOCM_RTS || clear & TIOCM_CTS) + v24_sig &= ~RFCOMM_V24_RTR; + if (clear & TIOCM_RI) + v24_sig &= ~RFCOMM_V24_IC; + if (clear & TIOCM_CD) + v24_sig &= ~RFCOMM_V24_DV; + + rfcomm_dlc_set_modem_status(dlc, v24_sig); + + return 0; } /* ---- TTY structure ---- */ diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 746c11fc017..ce7ab7dfa0b 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -418,7 +418,7 @@ static struct proto sco_proto = { .obj_size = sizeof(struct sco_pinfo) }; -static struct sock *sco_sock_alloc(struct socket *sock, int proto, int prio) +static struct sock *sco_sock_alloc(struct socket *sock, int proto, unsigned int __nocast prio) { struct sock *sk; diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index e6c2200b7ca..24396b914d1 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -23,7 +23,7 @@ #include <asm/atomic.h> #include "br_private.h" -static kmem_cache_t *br_fdb_cache; +static kmem_cache_t *br_fdb_cache __read_mostly; static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr); diff --git a/net/bridge/netfilter/ebt_mark.c b/net/bridge/netfilter/ebt_mark.c index 02c632b4d32..c93d35ab95c 100644 --- a/net/bridge/netfilter/ebt_mark.c +++ b/net/bridge/netfilter/ebt_mark.c @@ -23,10 +23,9 @@ static int ebt_target_mark(struct sk_buff **pskb, unsigned int hooknr, { struct ebt_mark_t_info *info = (struct ebt_mark_t_info *)data; - if ((*pskb)->nfmark != info->mark) { + if ((*pskb)->nfmark != info->mark) (*pskb)->nfmark = info->mark; - (*pskb)->nfcache |= NFC_ALTERED; - } + return info->target; } diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c index 01af4fcef26..aae26ae2e61 100644 --- a/net/bridge/netfilter/ebt_ulog.c +++ b/net/bridge/netfilter/ebt_ulog.c @@ -78,8 +78,8 @@ static void ulog_send(unsigned int nlgroup) if (ub->qlen > 1) ub->lastnlh->nlmsg_type = NLMSG_DONE; - NETLINK_CB(ub->skb).dst_groups = 1 << nlgroup; - netlink_broadcast(ebtulognl, ub->skb, 0, 1 << nlgroup, GFP_ATOMIC); + NETLINK_CB(ub->skb).dst_group = nlgroup + 1; + netlink_broadcast(ebtulognl, ub->skb, 0, nlgroup + 1, GFP_ATOMIC); ub->qlen = 0; ub->skb = NULL; @@ -162,7 +162,7 @@ static void ebt_ulog(const struct sk_buff *skb, unsigned int hooknr, pm->version = EBT_ULOG_VERSION; do_gettimeofday(&pm->stamp); if (ub->qlen == 1) - ub->skb->stamp = pm->stamp; + skb_set_timestamp(ub->skb, &pm->stamp); pm->data_len = copy_len; pm->mark = skb->nfmark; pm->hook = hooknr; @@ -258,7 +258,8 @@ static int __init init(void) spin_lock_init(&ulog_buffers[i].lock); } - ebtulognl = netlink_kernel_create(NETLINK_NFLOG, NULL); + ebtulognl = netlink_kernel_create(NETLINK_NFLOG, EBT_ULOG_MAXNLGROUPS, + NULL, THIS_MODULE); if (!ebtulognl) ret = -ENOMEM; else if ((ret = ebt_register_watcher(&ulog))) diff --git a/net/core/Makefile b/net/core/Makefile index f5f5e58943e..630da0f0579 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -12,7 +12,6 @@ obj-y += dev.o ethtool.o dev_mcast.o dst.o \ obj-$(CONFIG_XFRM) += flow.o obj-$(CONFIG_SYSFS) += net-sysfs.o -obj-$(CONFIG_NETFILTER) += netfilter.o obj-$(CONFIG_NET_DIVERT) += dv.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_NET_RADIO) += wireless.o diff --git a/net/core/datagram.c b/net/core/datagram.c index fcee054b6f7..da9bf71421a 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -43,7 +43,6 @@ #include <linux/errno.h> #include <linux/sched.h> #include <linux/inet.h> -#include <linux/tcp.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/poll.h> @@ -51,9 +50,10 @@ #include <net/protocol.h> #include <linux/skbuff.h> -#include <net/sock.h> -#include <net/checksum.h> +#include <net/checksum.h> +#include <net/sock.h> +#include <net/tcp_states.h> /* * Is a socket 'connection oriented' ? diff --git a/net/core/dev.c b/net/core/dev.c index faf59b02c4b..c01511e3d0c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -267,10 +267,6 @@ void dev_add_pack(struct packet_type *pt) spin_unlock_bh(&ptype_lock); } -extern void linkwatch_run_queue(void); - - - /** * __dev_remove_pack - remove packet handler * @pt: packet type declaration @@ -1009,13 +1005,22 @@ void net_disable_timestamp(void) atomic_dec(&netstamp_needed); } -static inline void net_timestamp(struct timeval *stamp) +void __net_timestamp(struct sk_buff *skb) +{ + struct timeval tv; + + do_gettimeofday(&tv); + skb_set_timestamp(skb, &tv); +} +EXPORT_SYMBOL(__net_timestamp); + +static inline void net_timestamp(struct sk_buff *skb) { if (atomic_read(&netstamp_needed)) - do_gettimeofday(stamp); + __net_timestamp(skb); else { - stamp->tv_sec = 0; - stamp->tv_usec = 0; + skb->tstamp.off_sec = 0; + skb->tstamp.off_usec = 0; } } @@ -1027,7 +1032,8 @@ static inline void net_timestamp(struct timeval *stamp) void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) { struct packet_type *ptype; - net_timestamp(&skb->stamp); + + net_timestamp(skb); rcu_read_lock(); list_for_each_entry_rcu(ptype, &ptype_all, list) { @@ -1058,7 +1064,7 @@ void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) skb2->h.raw = skb2->nh.raw; skb2->pkt_type = PACKET_OUTGOING; - ptype->func(skb2, skb->dev, ptype); + ptype->func(skb2, skb->dev, ptype, skb->dev); } } rcu_read_unlock(); @@ -1123,8 +1129,6 @@ static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb) #define illegal_highdma(dev, skb) (0) #endif -extern void skb_release_data(struct sk_buff *); - /* Keep head the same: replace data */ int __skb_linearize(struct sk_buff *skb, unsigned int __nocast gfp_mask) { @@ -1379,8 +1383,8 @@ int netif_rx(struct sk_buff *skb) if (netpoll_rx(skb)) return NET_RX_DROP; - if (!skb->stamp.tv_sec) - net_timestamp(&skb->stamp); + if (!skb->tstamp.off_sec) + net_timestamp(skb); /* * The code is rearranged so that the path is the most @@ -1425,14 +1429,14 @@ int netif_rx_ni(struct sk_buff *skb) EXPORT_SYMBOL(netif_rx_ni); -static __inline__ void skb_bond(struct sk_buff *skb) +static inline struct net_device *skb_bond(struct sk_buff *skb) { struct net_device *dev = skb->dev; - if (dev->master) { - skb->real_dev = skb->dev; + if (dev->master) skb->dev = dev->master; - } + + return dev; } static void net_tx_action(struct softirq_action *h) @@ -1482,10 +1486,11 @@ static void net_tx_action(struct softirq_action *h) } static __inline__ int deliver_skb(struct sk_buff *skb, - struct packet_type *pt_prev) + struct packet_type *pt_prev, + struct net_device *orig_dev) { atomic_inc(&skb->users); - return pt_prev->func(skb, skb->dev, pt_prev); + return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE) @@ -1496,7 +1501,8 @@ struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br, void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent); static __inline__ int handle_bridge(struct sk_buff **pskb, - struct packet_type **pt_prev, int *ret) + struct packet_type **pt_prev, int *ret, + struct net_device *orig_dev) { struct net_bridge_port *port; @@ -1505,14 +1511,14 @@ static __inline__ int handle_bridge(struct sk_buff **pskb, return 0; if (*pt_prev) { - *ret = deliver_skb(*pskb, *pt_prev); + *ret = deliver_skb(*pskb, *pt_prev, orig_dev); *pt_prev = NULL; } return br_handle_frame_hook(port, pskb); } #else -#define handle_bridge(skb, pt_prev, ret) (0) +#define handle_bridge(skb, pt_prev, ret, orig_dev) (0) #endif #ifdef CONFIG_NET_CLS_ACT @@ -1534,17 +1540,14 @@ static int ing_filter(struct sk_buff *skb) __u32 ttl = (__u32) G_TC_RTTL(skb->tc_verd); if (MAX_RED_LOOP < ttl++) { printk("Redir loop detected Dropping packet (%s->%s)\n", - skb->input_dev?skb->input_dev->name:"??",skb->dev->name); + skb->input_dev->name, skb->dev->name); return TC_ACT_SHOT; } skb->tc_verd = SET_TC_RTTL(skb->tc_verd,ttl); skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_INGRESS); - if (NULL == skb->input_dev) { - skb->input_dev = skb->dev; - printk("ing_filter: fixed %s out %s\n",skb->input_dev->name,skb->dev->name); - } + spin_lock(&dev->ingress_lock); if ((q = dev->qdisc_ingress) != NULL) result = q->enqueue(skb, q); @@ -1559,6 +1562,7 @@ static int ing_filter(struct sk_buff *skb) int netif_receive_skb(struct sk_buff *skb) { struct packet_type *ptype, *pt_prev; + struct net_device *orig_dev; int ret = NET_RX_DROP; unsigned short type; @@ -1566,10 +1570,13 @@ int netif_receive_skb(struct sk_buff *skb) if (skb->dev->poll && netpoll_rx(skb)) return NET_RX_DROP; - if (!skb->stamp.tv_sec) - net_timestamp(&skb->stamp); + if (!skb->tstamp.off_sec) + net_timestamp(skb); + + if (!skb->input_dev) + skb->input_dev = skb->dev; - skb_bond(skb); + orig_dev = skb_bond(skb); __get_cpu_var(netdev_rx_stat).total++; @@ -1590,14 +1597,14 @@ int netif_receive_skb(struct sk_buff *skb) list_for_each_entry_rcu(ptype, &ptype_all, list) { if (!ptype->dev || ptype->dev == skb->dev) { if (pt_prev) - ret = deliver_skb(skb, pt_prev); + ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } } #ifdef CONFIG_NET_CLS_ACT if (pt_prev) { - ret = deliver_skb(skb, pt_prev); + ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; /* noone else should process this after*/ } else { skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); @@ -1616,7 +1623,7 @@ ncls: handle_diverter(skb); - if (handle_bridge(&skb, &pt_prev, &ret)) + if (handle_bridge(&skb, &pt_prev, &ret, orig_dev)) goto out; type = skb->protocol; @@ -1624,13 +1631,13 @@ ncls: if (ptype->type == type && (!ptype->dev || ptype->dev == skb->dev)) { if (pt_prev) - ret = deliver_skb(skb, pt_prev); + ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } } if (pt_prev) { - ret = pt_prev->func(skb, skb->dev, pt_prev); + ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } else { kfree_skb(skb); /* Jamal, now you will not able to escape explaining diff --git a/net/core/ethtool.c b/net/core/ethtool.c index a3eeb88e1c8..289c1b5a8e4 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -81,6 +81,18 @@ int ethtool_op_set_tso(struct net_device *dev, u32 data) return 0; } +int ethtool_op_get_perm_addr(struct net_device *dev, struct ethtool_perm_addr *addr, u8 *data) +{ + unsigned char len = dev->addr_len; + if ( addr->size < len ) + return -ETOOSMALL; + + addr->size = len; + memcpy(data, dev->perm_addr, len); + return 0; +} + + /* Handlers for each ethtool command */ static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) @@ -683,6 +695,39 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr) return ret; } +static int ethtool_get_perm_addr(struct net_device *dev, void *useraddr) +{ + struct ethtool_perm_addr epaddr; + u8 *data; + int ret; + + if (!dev->ethtool_ops->get_perm_addr) + return -EOPNOTSUPP; + + if (copy_from_user(&epaddr,useraddr,sizeof(epaddr))) + return -EFAULT; + + data = kmalloc(epaddr.size, GFP_USER); + if (!data) + return -ENOMEM; + + ret = dev->ethtool_ops->get_perm_addr(dev,&epaddr,data); + if (ret) + return ret; + + ret = -EFAULT; + if (copy_to_user(useraddr, &epaddr, sizeof(epaddr))) + goto out; + useraddr += sizeof(epaddr); + if (copy_to_user(useraddr, data, epaddr.size)) + goto out; + ret = 0; + + out: + kfree(data); + return ret; +} + /* The main entry point in this file. Called from net/core/dev.c */ int dev_ethtool(struct ifreq *ifr) @@ -806,6 +851,9 @@ int dev_ethtool(struct ifreq *ifr) case ETHTOOL_GSTATS: rc = ethtool_get_stats(dev, useraddr); break; + case ETHTOOL_GPERMADDR: + rc = ethtool_get_perm_addr(dev, useraddr); + break; default: rc = -EOPNOTSUPP; } @@ -826,6 +874,7 @@ int dev_ethtool(struct ifreq *ifr) EXPORT_SYMBOL(dev_ethtool); EXPORT_SYMBOL(ethtool_op_get_link); +EXPORT_SYMBOL_GPL(ethtool_op_get_perm_addr); EXPORT_SYMBOL(ethtool_op_get_sg); EXPORT_SYMBOL(ethtool_op_get_tso); EXPORT_SYMBOL(ethtool_op_get_tx_csum); diff --git a/net/core/flow.c b/net/core/flow.c index f289570b15a..7e95b39de9f 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -42,7 +42,7 @@ static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables) = { NULL }; #define flow_table(cpu) (per_cpu(flow_tables, cpu)) -static kmem_cache_t *flow_cachep; +static kmem_cache_t *flow_cachep __read_mostly; static int flow_lwm, flow_hwm; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 1beb782ac41..39fc55edf69 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1217,7 +1217,7 @@ static void neigh_proxy_process(unsigned long arg) while (skb != (struct sk_buff *)&tbl->proxy_queue) { struct sk_buff *back = skb; - long tdif = back->stamp.tv_usec - now; + long tdif = NEIGH_CB(back)->sched_next - now; skb = skb->next; if (tdif <= 0) { @@ -1248,8 +1248,9 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, kfree_skb(skb); return; } - skb->stamp.tv_sec = LOCALLY_ENQUEUED; - skb->stamp.tv_usec = sched_next; + + NEIGH_CB(skb)->sched_next = sched_next; + NEIGH_CB(skb)->flags |= LOCALLY_ENQUEUED; spin_lock(&tbl->proxy_queue.lock); if (del_timer(&tbl->proxy_timer)) { @@ -2342,8 +2343,8 @@ void neigh_app_ns(struct neighbour *n) } nlh = (struct nlmsghdr *)skb->data; nlh->nlmsg_flags = NLM_F_REQUEST; - NETLINK_CB(skb).dst_groups = RTMGRP_NEIGH; - netlink_broadcast(rtnl, skb, 0, RTMGRP_NEIGH, GFP_ATOMIC); + NETLINK_CB(skb).dst_group = RTNLGRP_NEIGH; + netlink_broadcast(rtnl, skb, 0, RTNLGRP_NEIGH, GFP_ATOMIC); } static void neigh_app_notify(struct neighbour *n) @@ -2360,8 +2361,8 @@ static void neigh_app_notify(struct neighbour *n) return; } nlh = (struct nlmsghdr *)skb->data; - NETLINK_CB(skb).dst_groups = RTMGRP_NEIGH; - netlink_broadcast(rtnl, skb, 0, RTMGRP_NEIGH, GFP_ATOMIC); + NETLINK_CB(skb).dst_group = RTNLGRP_NEIGH; + netlink_broadcast(rtnl, skb, 0, RTNLGRP_NEIGH, GFP_ATOMIC); } #endif /* CONFIG_ARPD */ diff --git a/net/core/netfilter.c b/net/core/netfilter.c deleted file mode 100644 index 076c156d5ed..00000000000 --- a/net/core/netfilter.c +++ /dev/null @@ -1,648 +0,0 @@ -/* netfilter.c: look after the filters for various protocols. - * Heavily influenced by the old firewall.c by David Bonn and Alan Cox. - * - * Thanks to Rob `CmdrTaco' Malda for not influencing this code in any - * way. - * - * Rusty Russell (C)2000 -- This code is GPL. - * - * February 2000: Modified by James Morris to have 1 queue per protocol. - * 15-Mar-2000: Added NF_REPEAT --RR. - * 08-May-2003: Internal logging interface added by Jozsef Kadlecsik. - */ -#include <linux/config.h> -#include <linux/kernel.h> -#include <linux/netfilter.h> -#include <net/protocol.h> -#include <linux/init.h> -#include <linux/skbuff.h> -#include <linux/wait.h> -#include <linux/module.h> -#include <linux/interrupt.h> -#include <linux/if.h> -#include <linux/netdevice.h> -#include <linux/inetdevice.h> -#include <linux/tcp.h> -#include <linux/udp.h> -#include <linux/icmp.h> -#include <net/sock.h> -#include <net/route.h> -#include <linux/ip.h> - -/* In this code, we can be waiting indefinitely for userspace to - * service a packet if a hook returns NF_QUEUE. We could keep a count - * of skbuffs queued for userspace, and not deregister a hook unless - * this is zero, but that sucks. Now, we simply check when the - * packets come back: if the hook is gone, the packet is discarded. */ -#ifdef CONFIG_NETFILTER_DEBUG -#define NFDEBUG(format, args...) printk(format , ## args) -#else -#define NFDEBUG(format, args...) -#endif - -/* Sockopts only registered and called from user context, so - net locking would be overkill. Also, [gs]etsockopt calls may - sleep. */ -static DECLARE_MUTEX(nf_sockopt_mutex); - -struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS]; -static LIST_HEAD(nf_sockopts); -static DEFINE_SPINLOCK(nf_hook_lock); - -/* - * A queue handler may be registered for each protocol. Each is protected by - * long term mutex. The handler must provide an an outfn() to accept packets - * for queueing and must reinject all packets it receives, no matter what. - */ -static struct nf_queue_handler_t { - nf_queue_outfn_t outfn; - void *data; -} queue_handler[NPROTO]; -static DEFINE_RWLOCK(queue_handler_lock); - -int nf_register_hook(struct nf_hook_ops *reg) -{ - struct list_head *i; - - spin_lock_bh(&nf_hook_lock); - list_for_each(i, &nf_hooks[reg->pf][reg->hooknum]) { - if (reg->priority < ((struct nf_hook_ops *)i)->priority) - break; - } - list_add_rcu(®->list, i->prev); - spin_unlock_bh(&nf_hook_lock); - - synchronize_net(); - return 0; -} - -void nf_unregister_hook(struct nf_hook_ops *reg) -{ - spin_lock_bh(&nf_hook_lock); - list_del_rcu(®->list); - spin_unlock_bh(&nf_hook_lock); - - synchronize_net(); -} - -/* Do exclusive ranges overlap? */ -static inline int overlap(int min1, int max1, int min2, int max2) -{ - return max1 > min2 && min1 < max2; -} - -/* Functions to register sockopt ranges (exclusive). */ -int nf_register_sockopt(struct nf_sockopt_ops *reg) -{ - struct list_head *i; - int ret = 0; - - if (down_interruptible(&nf_sockopt_mutex) != 0) - return -EINTR; - - list_for_each(i, &nf_sockopts) { - struct nf_sockopt_ops *ops = (struct nf_sockopt_ops *)i; - if (ops->pf == reg->pf - && (overlap(ops->set_optmin, ops->set_optmax, - reg->set_optmin, reg->set_optmax) - || overlap(ops->get_optmin, ops->get_optmax, - reg->get_optmin, reg->get_optmax))) { - NFDEBUG("nf_sock overlap: %u-%u/%u-%u v %u-%u/%u-%u\n", - ops->set_optmin, ops->set_optmax, - ops->get_optmin, ops->get_optmax, - reg->set_optmin, reg->set_optmax, - reg->get_optmin, reg->get_optmax); - ret = -EBUSY; - goto out; - } - } - - list_add(®->list, &nf_sockopts); -out: - up(&nf_sockopt_mutex); - return ret; -} - -void nf_unregister_sockopt(struct nf_sockopt_ops *reg) -{ - /* No point being interruptible: we're probably in cleanup_module() */ - restart: - down(&nf_sockopt_mutex); - if (reg->use != 0) { - /* To be woken by nf_sockopt call... */ - /* FIXME: Stuart Young's name appears gratuitously. */ - set_current_state(TASK_UNINTERRUPTIBLE); - reg->cleanup_task = current; - up(&nf_sockopt_mutex); - schedule(); - goto restart; - } - list_del(®->list); - up(&nf_sockopt_mutex); -} - -/* Call get/setsockopt() */ -static int nf_sockopt(struct sock *sk, int pf, int val, - char __user *opt, int *len, int get) -{ - struct list_head *i; - struct nf_sockopt_ops *ops; - int ret; - - if (down_interruptible(&nf_sockopt_mutex) != 0) - return -EINTR; - - list_for_each(i, &nf_sockopts) { - ops = (struct nf_sockopt_ops *)i; - if (ops->pf == pf) { - if (get) { - if (val >= ops->get_optmin - && val < ops->get_optmax) { - ops->use++; - up(&nf_sockopt_mutex); - ret = ops->get(sk, val, opt, len); - goto out; - } - } else { - if (val >= ops->set_optmin - && val < ops->set_optmax) { - ops->use++; - up(&nf_sockopt_mutex); - ret = ops->set(sk, val, opt, *len); - goto out; - } - } - } - } - up(&nf_sockopt_mutex); - return -ENOPROTOOPT; - - out: - down(&nf_sockopt_mutex); - ops->use--; - if (ops->cleanup_task) - wake_up_process(ops->cleanup_task); - up(&nf_sockopt_mutex); - return ret; -} - -int nf_setsockopt(struct sock *sk, int pf, int val, char __user *opt, - int len) -{ - return nf_sockopt(sk, pf, val, opt, &len, 0); -} - -int nf_getsockopt(struct sock *sk, int pf, int val, char __user *opt, int *len) -{ - return nf_sockopt(sk, pf, val, opt, len, 1); -} - -static unsigned int nf_iterate(struct list_head *head, - struct sk_buff **skb, - int hook, - const struct net_device *indev, - const struct net_device *outdev, - struct list_head **i, - int (*okfn)(struct sk_buff *), - int hook_thresh) -{ - unsigned int verdict; - - /* - * The caller must not block between calls to this - * function because of risk of continuing from deleted element. - */ - list_for_each_continue_rcu(*i, head) { - struct nf_hook_ops *elem = (struct nf_hook_ops *)*i; - - if (hook_thresh > elem->priority) - continue; - - /* Optimization: we don't need to hold module - reference here, since function can't sleep. --RR */ - verdict = elem->hook(hook, skb, indev, outdev, okfn); - if (verdict != NF_ACCEPT) { -#ifdef CONFIG_NETFILTER_DEBUG - if (unlikely(verdict > NF_MAX_VERDICT)) { - NFDEBUG("Evil return from %p(%u).\n", - elem->hook, hook); - continue; - } -#endif - if (verdict != NF_REPEAT) - return verdict; - *i = (*i)->prev; - } - } - return NF_ACCEPT; -} - -int nf_register_queue_handler(int pf, nf_queue_outfn_t outfn, void *data) -{ - int ret; - - write_lock_bh(&queue_handler_lock); - if (queue_handler[pf].outfn) - ret = -EBUSY; - else { - queue_handler[pf].outfn = outfn; - queue_handler[pf].data = data; - ret = 0; - } - write_unlock_bh(&queue_handler_lock); - - return ret; -} - -/* The caller must flush their queue before this */ -int nf_unregister_queue_handler(int pf) -{ - write_lock_bh(&queue_handler_lock); - queue_handler[pf].outfn = NULL; - queue_handler[pf].data = NULL; - write_unlock_bh(&queue_handler_lock); - - return 0; -} - -/* - * Any packet that leaves via this function must come back - * through nf_reinject(). - */ -static int nf_queue(struct sk_buff *skb, - struct list_head *elem, - int pf, unsigned int hook, - struct net_device *indev, - struct net_device *outdev, - int (*okfn)(struct sk_buff *)) -{ - int status; - struct nf_info *info; -#ifdef CONFIG_BRIDGE_NETFILTER - struct net_device *physindev = NULL; - struct net_device *physoutdev = NULL; -#endif - - /* QUEUE == DROP if noone is waiting, to be safe. */ - read_lock(&queue_handler_lock); - if (!queue_handler[pf].outfn) { - read_unlock(&queue_handler_lock); - kfree_skb(skb); - return 1; - } - - info = kmalloc(sizeof(*info), GFP_ATOMIC); - if (!info) { - if (net_ratelimit()) - printk(KERN_ERR "OOM queueing packet %p\n", - skb); - read_unlock(&queue_handler_lock); - kfree_skb(skb); - return 1; - } - - *info = (struct nf_info) { - (struct nf_hook_ops *)elem, pf, hook, indev, outdev, okfn }; - - /* If it's going away, ignore hook. */ - if (!try_module_get(info->elem->owner)) { - read_unlock(&queue_handler_lock); - kfree(info); - return 0; - } - - /* Bump dev refs so they don't vanish while packet is out */ - if (indev) dev_hold(indev); - if (outdev) dev_hold(outdev); - -#ifdef CONFIG_BRIDGE_NETFILTER - if (skb->nf_bridge) { - physindev = skb->nf_bridge->physindev; - if (physindev) dev_hold(physindev); - physoutdev = skb->nf_bridge->physoutdev; - if (physoutdev) dev_hold(physoutdev); - } -#endif - - status = queue_handler[pf].outfn(skb, info, queue_handler[pf].data); - read_unlock(&queue_handler_lock); - - if (status < 0) { - /* James M doesn't say fuck enough. */ - if (indev) dev_put(indev); - if (outdev) dev_put(outdev); -#ifdef CONFIG_BRIDGE_NETFILTER - if (physindev) dev_put(physindev); - if (physoutdev) dev_put(physoutdev); -#endif - module_put(info->elem->owner); - kfree(info); - kfree_skb(skb); - return 1; - } - return 1; -} - -/* Returns 1 if okfn() needs to be executed by the caller, - * -EPERM for NF_DROP, 0 otherwise. */ -int nf_hook_slow(int pf, unsigned int hook, struct sk_buff **pskb, - struct net_device *indev, - struct net_device *outdev, - int (*okfn)(struct sk_buff *), - int hook_thresh) -{ - struct list_head *elem; - unsigned int verdict; - int ret = 0; - - /* We may already have this, but read-locks nest anyway */ - rcu_read_lock(); - - elem = &nf_hooks[pf][hook]; -next_hook: - verdict = nf_iterate(&nf_hooks[pf][hook], pskb, hook, indev, - outdev, &elem, okfn, hook_thresh); - if (verdict == NF_ACCEPT || verdict == NF_STOP) { - ret = 1; - goto unlock; - } else if (verdict == NF_DROP) { - kfree_skb(*pskb); - ret = -EPERM; - } else if (verdict == NF_QUEUE) { - NFDEBUG("nf_hook: Verdict = QUEUE.\n"); - if (!nf_queue(*pskb, elem, pf, hook, indev, outdev, okfn)) - goto next_hook; - } -unlock: - rcu_read_unlock(); - return ret; -} - -void nf_reinject(struct sk_buff *skb, struct nf_info *info, - unsigned int verdict) -{ - struct list_head *elem = &info->elem->list; - struct list_head *i; - - rcu_read_lock(); - - /* Release those devices we held, or Alexey will kill me. */ - if (info->indev) dev_put(info->indev); - if (info->outdev) dev_put(info->outdev); -#ifdef CONFIG_BRIDGE_NETFILTER - if (skb->nf_bridge) { - if (skb->nf_bridge->physindev) - dev_put(skb->nf_bridge->physindev); - if (skb->nf_bridge->physoutdev) - dev_put(skb->nf_bridge->physoutdev); - } -#endif - - /* Drop reference to owner of hook which queued us. */ - module_put(info->elem->owner); - - list_for_each_rcu(i, &nf_hooks[info->pf][info->hook]) { - if (i == elem) - break; - } - - if (elem == &nf_hooks[info->pf][info->hook]) { - /* The module which sent it to userspace is gone. */ - NFDEBUG("%s: module disappeared, dropping packet.\n", - __FUNCTION__); - verdict = NF_DROP; - } - - /* Continue traversal iff userspace said ok... */ - if (verdict == NF_REPEAT) { - elem = elem->prev; - verdict = NF_ACCEPT; - } - - if (verdict == NF_ACCEPT) { - next_hook: - verdict = nf_iterate(&nf_hooks[info->pf][info->hook], - &skb, info->hook, - info->indev, info->outdev, &elem, - info->okfn, INT_MIN); - } - - switch (verdict) { - case NF_ACCEPT: - info->okfn(skb); - break; - - case NF_QUEUE: - if (!nf_queue(skb, elem, info->pf, info->hook, - info->indev, info->outdev, info->okfn)) - goto next_hook; - break; - } - rcu_read_unlock(); - - if (verdict == NF_DROP) - kfree_skb(skb); - - kfree(info); - return; -} - -#ifdef CONFIG_INET -/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ -int ip_route_me_harder(struct sk_buff **pskb) -{ - struct iphdr *iph = (*pskb)->nh.iph; - struct rtable *rt; - struct flowi fl = {}; - struct dst_entry *odst; - unsigned int hh_len; - - /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause - * packets with foreign saddr to appear on the NF_IP_LOCAL_OUT hook. - */ - if (inet_addr_type(iph->saddr) == RTN_LOCAL) { - fl.nl_u.ip4_u.daddr = iph->daddr; - fl.nl_u.ip4_u.saddr = iph->saddr; - fl.nl_u.ip4_u.tos = RT_TOS(iph->tos); - fl.oif = (*pskb)->sk ? (*pskb)->sk->sk_bound_dev_if : 0; -#ifdef CONFIG_IP_ROUTE_FWMARK - fl.nl_u.ip4_u.fwmark = (*pskb)->nfmark; -#endif - fl.proto = iph->protocol; - if (ip_route_output_key(&rt, &fl) != 0) - return -1; - - /* Drop old route. */ - dst_release((*pskb)->dst); - (*pskb)->dst = &rt->u.dst; - } else { - /* non-local src, find valid iif to satisfy - * rp-filter when calling ip_route_input. */ - fl.nl_u.ip4_u.daddr = iph->saddr; - if (ip_route_output_key(&rt, &fl) != 0) - return -1; - - odst = (*pskb)->dst; - if (ip_route_input(*pskb, iph->daddr, iph->saddr, - RT_TOS(iph->tos), rt->u.dst.dev) != 0) { - dst_release(&rt->u.dst); - return -1; - } - dst_release(&rt->u.dst); - dst_release(odst); - } - - if ((*pskb)->dst->error) - return -1; - - /* Change in oif may mean change in hh_len. */ - hh_len = (*pskb)->dst->dev->hard_header_len; - if (skb_headroom(*pskb) < hh_len) { - struct sk_buff *nskb; - - nskb = skb_realloc_headroom(*pskb, hh_len); - if (!nskb) - return -1; - if ((*pskb)->sk) - skb_set_owner_w(nskb, (*pskb)->sk); - kfree_skb(*pskb); - *pskb = nskb; - } - - return 0; -} -EXPORT_SYMBOL(ip_route_me_harder); - -int skb_ip_make_writable(struct sk_buff **pskb, unsigned int writable_len) -{ - struct sk_buff *nskb; - - if (writable_len > (*pskb)->len) - return 0; - - /* Not exclusive use of packet? Must copy. */ - if (skb_shared(*pskb) || skb_cloned(*pskb)) - goto copy_skb; - - return pskb_may_pull(*pskb, writable_len); - -copy_skb: - nskb = skb_copy(*pskb, GFP_ATOMIC); - if (!nskb) - return 0; - BUG_ON(skb_is_nonlinear(nskb)); - - /* Rest of kernel will get very unhappy if we pass it a - suddenly-orphaned skbuff */ - if ((*pskb)->sk) - skb_set_owner_w(nskb, (*pskb)->sk); - kfree_skb(*pskb); - *pskb = nskb; - return 1; -} -EXPORT_SYMBOL(skb_ip_make_writable); -#endif /*CONFIG_INET*/ - -/* Internal logging interface, which relies on the real - LOG target modules */ - -#define NF_LOG_PREFIXLEN 128 - -static nf_logfn *nf_logging[NPROTO]; /* = NULL */ -static int reported = 0; -static DEFINE_SPINLOCK(nf_log_lock); - -int nf_log_register(int pf, nf_logfn *logfn) -{ - int ret = -EBUSY; - - /* Any setup of logging members must be done before - * substituting pointer. */ - spin_lock(&nf_log_lock); - if (!nf_logging[pf]) { - rcu_assign_pointer(nf_logging[pf], logfn); - ret = 0; - } - spin_unlock(&nf_log_lock); - return ret; -} - -void nf_log_unregister(int pf, nf_logfn *logfn) -{ - spin_lock(&nf_log_lock); - if (nf_logging[pf] == logfn) - nf_logging[pf] = NULL; - spin_unlock(&nf_log_lock); - - /* Give time to concurrent readers. */ - synchronize_net(); -} - -void nf_log_packet(int pf, - unsigned int hooknum, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const char *fmt, ...) -{ - va_list args; - char prefix[NF_LOG_PREFIXLEN]; - nf_logfn *logfn; - - rcu_read_lock(); - logfn = rcu_dereference(nf_logging[pf]); - if (logfn) { - va_start(args, fmt); - vsnprintf(prefix, sizeof(prefix), fmt, args); - va_end(args); - /* We must read logging before nf_logfn[pf] */ - logfn(hooknum, skb, in, out, prefix); - } else if (!reported) { - printk(KERN_WARNING "nf_log_packet: can\'t log yet, " - "no backend logging module loaded in!\n"); - reported++; - } - rcu_read_unlock(); -} -EXPORT_SYMBOL(nf_log_register); -EXPORT_SYMBOL(nf_log_unregister); -EXPORT_SYMBOL(nf_log_packet); - -/* This does not belong here, but locally generated errors need it if connection - tracking in use: without this, connection may not be in hash table, and hence - manufactured ICMP or RST packets will not be associated with it. */ -void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *); - -void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) -{ - void (*attach)(struct sk_buff *, struct sk_buff *); - - if (skb->nfct && (attach = ip_ct_attach) != NULL) { - mb(); /* Just to be sure: must be read before executing this */ - attach(new, skb); - } -} - -void __init netfilter_init(void) -{ - int i, h; - - for (i = 0; i < NPROTO; i++) { - for (h = 0; h < NF_MAX_HOOKS; h++) - INIT_LIST_HEAD(&nf_hooks[i][h]); - } -} - -EXPORT_SYMBOL(ip_ct_attach); -EXPORT_SYMBOL(nf_ct_attach); -EXPORT_SYMBOL(nf_getsockopt); -EXPORT_SYMBOL(nf_hook_slow); -EXPORT_SYMBOL(nf_hooks); -EXPORT_SYMBOL(nf_register_hook); -EXPORT_SYMBOL(nf_register_queue_handler); -EXPORT_SYMBOL(nf_register_sockopt); -EXPORT_SYMBOL(nf_reinject); -EXPORT_SYMBOL(nf_setsockopt); -EXPORT_SYMBOL(nf_unregister_hook); -EXPORT_SYMBOL(nf_unregister_queue_handler); -EXPORT_SYMBOL(nf_unregister_sockopt); diff --git a/net/core/request_sock.c b/net/core/request_sock.c index bb55675f068..b8203de5ff0 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -32,7 +32,6 @@ * Further increasing requires to change hash table size. */ int sysctl_max_syn_backlog = 256; -EXPORT_SYMBOL(sysctl_max_syn_backlog); int reqsk_queue_alloc(struct request_sock_queue *queue, const int nr_table_entries) @@ -53,6 +52,8 @@ int reqsk_queue_alloc(struct request_sock_queue *queue, get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd)); rwlock_init(&queue->syn_wait_lock); queue->rskq_accept_head = queue->rskq_accept_head = NULL; + queue->rskq_defer_accept = 0; + lopt->nr_table_entries = nr_table_entries; write_lock_bh(&queue->syn_wait_lock); queue->listen_opt = lopt; @@ -62,3 +63,28 @@ int reqsk_queue_alloc(struct request_sock_queue *queue, } EXPORT_SYMBOL(reqsk_queue_alloc); + +void reqsk_queue_destroy(struct request_sock_queue *queue) +{ + /* make all the listen_opt local to us */ + struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue); + + if (lopt->qlen != 0) { + int i; + + for (i = 0; i < lopt->nr_table_entries; i++) { + struct request_sock *req; + + while ((req = lopt->syn_table[i]) != NULL) { + lopt->syn_table[i] = req->dl_next; + lopt->qlen--; + reqsk_free(req); + } + } + } + + BUG_TRAP(lopt->qlen == 0); + kfree(lopt); +} + +EXPORT_SYMBOL(reqsk_queue_destroy); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 4b1bb30e638..9bed7569ce3 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -148,7 +148,7 @@ int rtnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo) { int err = 0; - NETLINK_CB(skb).dst_groups = group; + NETLINK_CB(skb).dst_group = group; if (echo) atomic_inc(&skb->users); netlink_broadcast(rtnl, skb, pid, group, GFP_KERNEL); @@ -458,8 +458,8 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change) kfree_skb(skb); return; } - NETLINK_CB(skb).dst_groups = RTMGRP_LINK; - netlink_broadcast(rtnl, skb, 0, RTMGRP_LINK, GFP_KERNEL); + NETLINK_CB(skb).dst_group = RTNLGRP_LINK; + netlink_broadcast(rtnl, skb, 0, RTNLGRP_LINK, GFP_KERNEL); } static int rtnetlink_done(struct netlink_callback *cb) @@ -708,7 +708,8 @@ void __init rtnetlink_init(void) if (!rta_buf) panic("rtnetlink_init: cannot allocate rta_buf\n"); - rtnl = netlink_kernel_create(NETLINK_ROUTE, rtnetlink_rcv); + rtnl = netlink_kernel_create(NETLINK_ROUTE, RTNLGRP_MAX, rtnetlink_rcv, + THIS_MODULE); if (rtnl == NULL) panic("rtnetlink_init: cannot initialize rtnetlink\n"); netlink_set_nonroot(NETLINK_ROUTE, NL_NONROOT_RECV); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 7eab867ede5..f80a2878561 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -68,7 +68,10 @@ #include <asm/uaccess.h> #include <asm/system.h> -static kmem_cache_t *skbuff_head_cache; +static kmem_cache_t *skbuff_head_cache __read_mostly; +static kmem_cache_t *skbuff_fclone_cache __read_mostly; + +struct timeval __read_mostly skb_tv_base; /* * Keep out-of-line to prevent kernel bloat. @@ -118,7 +121,7 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here) */ /** - * alloc_skb - allocate a network buffer + * __alloc_skb - allocate a network buffer * @size: size to allocate * @gfp_mask: allocation mask * @@ -129,14 +132,20 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here) * Buffers may only be allocated from interrupts using a @gfp_mask of * %GFP_ATOMIC. */ -struct sk_buff *alloc_skb(unsigned int size, unsigned int __nocast gfp_mask) +struct sk_buff *__alloc_skb(unsigned int size, unsigned int __nocast gfp_mask, + int fclone) { struct sk_buff *skb; u8 *data; /* Get the HEAD */ - skb = kmem_cache_alloc(skbuff_head_cache, - gfp_mask & ~__GFP_DMA); + if (fclone) + skb = kmem_cache_alloc(skbuff_fclone_cache, + gfp_mask & ~__GFP_DMA); + else + skb = kmem_cache_alloc(skbuff_head_cache, + gfp_mask & ~__GFP_DMA); + if (!skb) goto out; @@ -153,7 +162,15 @@ struct sk_buff *alloc_skb(unsigned int size, unsigned int __nocast gfp_mask) skb->data = data; skb->tail = data; skb->end = data + size; + if (fclone) { + struct sk_buff *child = skb + 1; + atomic_t *fclone_ref = (atomic_t *) (child + 1); + skb->fclone = SKB_FCLONE_ORIG; + atomic_set(fclone_ref, 1); + + child->fclone = SKB_FCLONE_UNAVAILABLE; + } atomic_set(&(skb_shinfo(skb)->dataref), 1); skb_shinfo(skb)->nr_frags = 0; skb_shinfo(skb)->tso_size = 0; @@ -266,8 +283,34 @@ void skb_release_data(struct sk_buff *skb) */ void kfree_skbmem(struct sk_buff *skb) { + struct sk_buff *other; + atomic_t *fclone_ref; + skb_release_data(skb); - kmem_cache_free(skbuff_head_cache, skb); + switch (skb->fclone) { + case SKB_FCLONE_UNAVAILABLE: + kmem_cache_free(skbuff_head_cache, skb); + break; + + case SKB_FCLONE_ORIG: + fclone_ref = (atomic_t *) (skb + 2); + if (atomic_dec_and_test(fclone_ref)) + kmem_cache_free(skbuff_fclone_cache, skb); + break; + + case SKB_FCLONE_CLONE: + fclone_ref = (atomic_t *) (skb + 1); + other = skb - 1; + + /* The clone portion is available for + * fast-cloning again. + */ + skb->fclone = SKB_FCLONE_UNAVAILABLE; + + if (atomic_dec_and_test(fclone_ref)) + kmem_cache_free(skbuff_fclone_cache, other); + break; + }; } /** @@ -281,8 +324,6 @@ void kfree_skbmem(struct sk_buff *skb) void __kfree_skb(struct sk_buff *skb) { - BUG_ON(skb->list != NULL); - dst_release(skb->dst); #ifdef CONFIG_XFRM secpath_put(skb->sp); @@ -302,7 +343,6 @@ void __kfree_skb(struct sk_buff *skb) skb->tc_index = 0; #ifdef CONFIG_NET_CLS_ACT skb->tc_verd = 0; - skb->tc_classid = 0; #endif #endif @@ -325,19 +365,27 @@ void __kfree_skb(struct sk_buff *skb) struct sk_buff *skb_clone(struct sk_buff *skb, unsigned int __nocast gfp_mask) { - struct sk_buff *n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); - - if (!n) - return NULL; + struct sk_buff *n; + + n = skb + 1; + if (skb->fclone == SKB_FCLONE_ORIG && + n->fclone == SKB_FCLONE_UNAVAILABLE) { + atomic_t *fclone_ref = (atomic_t *) (n + 1); + n->fclone = SKB_FCLONE_CLONE; + atomic_inc(fclone_ref); + } else { + n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); + if (!n) + return NULL; + n->fclone = SKB_FCLONE_UNAVAILABLE; + } #define C(x) n->x = skb->x n->next = n->prev = NULL; - n->list = NULL; n->sk = NULL; - C(stamp); + C(tstamp); C(dev); - C(real_dev); C(h); C(nh); C(mac); @@ -361,7 +409,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, unsigned int __nocast gfp_mask) n->destructor = NULL; #ifdef CONFIG_NETFILTER C(nfmark); - C(nfcache); C(nfct); nf_conntrack_get(skb->nfct); C(nfctinfo); @@ -370,9 +417,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, unsigned int __nocast gfp_mask) nf_bridge_get(skb->nf_bridge); #endif #endif /*CONFIG_NETFILTER*/ -#if defined(CONFIG_HIPPI) - C(private); -#endif #ifdef CONFIG_NET_SCHED C(tc_index); #ifdef CONFIG_NET_CLS_ACT @@ -380,7 +424,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, unsigned int __nocast gfp_mask) n->tc_verd = CLR_TC_OK2MUNGE(n->tc_verd); n->tc_verd = CLR_TC_MUNGED(n->tc_verd); C(input_dev); - C(tc_classid); #endif #endif @@ -404,10 +447,8 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) */ unsigned long offset = new->data - old->data; - new->list = NULL; new->sk = NULL; new->dev = old->dev; - new->real_dev = old->real_dev; new->priority = old->priority; new->protocol = old->protocol; new->dst = dst_clone(old->dst); @@ -419,12 +460,12 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->mac.raw = old->mac.raw + offset; memcpy(new->cb, old->cb, sizeof(old->cb)); new->local_df = old->local_df; + new->fclone = SKB_FCLONE_UNAVAILABLE; new->pkt_type = old->pkt_type; - new->stamp = old->stamp; + new->tstamp = old->tstamp; new->destructor = NULL; #ifdef CONFIG_NETFILTER new->nfmark = old->nfmark; - new->nfcache = old->nfcache; new->nfct = old->nfct; nf_conntrack_get(old->nfct); new->nfctinfo = old->nfctinfo; @@ -1344,50 +1385,43 @@ void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) __skb_queue_tail(list, newsk); spin_unlock_irqrestore(&list->lock, flags); } + /** * skb_unlink - remove a buffer from a list * @skb: buffer to remove + * @list: list to use * - * Place a packet after a given packet in a list. The list locks are taken - * and this function is atomic with respect to other list locked calls + * Remove a packet from a list. The list locks are taken and this + * function is atomic with respect to other list locked calls * - * Works even without knowing the list it is sitting on, which can be - * handy at times. It also means that THE LIST MUST EXIST when you - * unlink. Thus a list must have its contents unlinked before it is - * destroyed. + * You must know what list the SKB is on. */ -void skb_unlink(struct sk_buff *skb) +void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) { - struct sk_buff_head *list = skb->list; - - if (list) { - unsigned long flags; + unsigned long flags; - spin_lock_irqsave(&list->lock, flags); - if (skb->list == list) - __skb_unlink(skb, skb->list); - spin_unlock_irqrestore(&list->lock, flags); - } + spin_lock_irqsave(&list->lock, flags); + __skb_unlink(skb, list); + spin_unlock_irqrestore(&list->lock, flags); } - /** * skb_append - append a buffer * @old: buffer to insert after * @newsk: buffer to insert + * @list: list to use * * Place a packet after a given packet in a list. The list locks are taken * and this function is atomic with respect to other list locked calls. * A buffer cannot be placed on two lists at the same time. */ - -void skb_append(struct sk_buff *old, struct sk_buff *newsk) +void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) { unsigned long flags; - spin_lock_irqsave(&old->list->lock, flags); - __skb_append(old, newsk); - spin_unlock_irqrestore(&old->list->lock, flags); + spin_lock_irqsave(&list->lock, flags); + __skb_append(old, newsk, list); + spin_unlock_irqrestore(&list->lock, flags); } @@ -1395,19 +1429,21 @@ void skb_append(struct sk_buff *old, struct sk_buff *newsk) * skb_insert - insert a buffer * @old: buffer to insert before * @newsk: buffer to insert + * @list: list to use + * + * Place a packet before a given packet in a list. The list locks are + * taken and this function is atomic with respect to other list locked + * calls. * - * Place a packet before a given packet in a list. The list locks are taken - * and this function is atomic with respect to other list locked calls * A buffer cannot be placed on two lists at the same time. */ - -void skb_insert(struct sk_buff *old, struct sk_buff *newsk) +void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) { unsigned long flags; - spin_lock_irqsave(&old->list->lock, flags); - __skb_insert(newsk, old->prev, old, old->list); - spin_unlock_irqrestore(&old->list->lock, flags); + spin_lock_irqsave(&list->lock, flags); + __skb_insert(newsk, old->prev, old, list); + spin_unlock_irqrestore(&list->lock, flags); } #if 0 @@ -1663,12 +1699,23 @@ void __init skb_init(void) NULL, NULL); if (!skbuff_head_cache) panic("cannot create skbuff cache"); + + skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", + (2*sizeof(struct sk_buff)) + + sizeof(atomic_t), + 0, + SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!skbuff_fclone_cache) + panic("cannot create skbuff cache"); + + do_gettimeofday(&skb_tv_base); } EXPORT_SYMBOL(___pskb_trim); EXPORT_SYMBOL(__kfree_skb); EXPORT_SYMBOL(__pskb_pull_tail); -EXPORT_SYMBOL(alloc_skb); +EXPORT_SYMBOL(__alloc_skb); EXPORT_SYMBOL(pskb_copy); EXPORT_SYMBOL(pskb_expand_head); EXPORT_SYMBOL(skb_checksum); @@ -1696,3 +1743,4 @@ EXPORT_SYMBOL(skb_prepare_seq_read); EXPORT_SYMBOL(skb_seq_read); EXPORT_SYMBOL(skb_abort_seq_read); EXPORT_SYMBOL(skb_find_text); +EXPORT_SYMBOL(skb_tv_base); diff --git a/net/core/sock.c b/net/core/sock.c index 12f6d9a2a52..ccd10fd6568 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -260,7 +260,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname, if (val > sysctl_wmem_max) val = sysctl_wmem_max; - +set_sndbuf: sk->sk_userlocks |= SOCK_SNDBUF_LOCK; if ((val * 2) < SOCK_MIN_SNDBUF) sk->sk_sndbuf = SOCK_MIN_SNDBUF; @@ -274,6 +274,13 @@ int sock_setsockopt(struct socket *sock, int level, int optname, sk->sk_write_space(sk); break; + case SO_SNDBUFFORCE: + if (!capable(CAP_NET_ADMIN)) { + ret = -EPERM; + break; + } + goto set_sndbuf; + case SO_RCVBUF: /* Don't error on this BSD doesn't and if you think about it this is right. Otherwise apps have to @@ -282,7 +289,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname, if (val > sysctl_rmem_max) val = sysctl_rmem_max; - +set_rcvbuf: sk->sk_userlocks |= SOCK_RCVBUF_LOCK; /* FIXME: is this lower bound the right one? */ if ((val * 2) < SOCK_MIN_RCVBUF) @@ -291,6 +298,13 @@ int sock_setsockopt(struct socket *sock, int level, int optname, sk->sk_rcvbuf = val * 2; break; + case SO_RCVBUFFORCE: + if (!capable(CAP_NET_ADMIN)) { + ret = -EPERM; + break; + } + goto set_rcvbuf; + case SO_KEEPALIVE: #ifdef CONFIG_INET if (sk->sk_protocol == IPPROTO_TCP) @@ -686,6 +700,80 @@ void sk_free(struct sock *sk) module_put(owner); } +struct sock *sk_clone(const struct sock *sk, const unsigned int __nocast priority) +{ + struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0); + + if (newsk != NULL) { + struct sk_filter *filter; + + memcpy(newsk, sk, sk->sk_prot->obj_size); + + /* SANITY */ + sk_node_init(&newsk->sk_node); + sock_lock_init(newsk); + bh_lock_sock(newsk); + + atomic_set(&newsk->sk_rmem_alloc, 0); + atomic_set(&newsk->sk_wmem_alloc, 0); + atomic_set(&newsk->sk_omem_alloc, 0); + skb_queue_head_init(&newsk->sk_receive_queue); + skb_queue_head_init(&newsk->sk_write_queue); + + rwlock_init(&newsk->sk_dst_lock); + rwlock_init(&newsk->sk_callback_lock); + + newsk->sk_dst_cache = NULL; + newsk->sk_wmem_queued = 0; + newsk->sk_forward_alloc = 0; + newsk->sk_send_head = NULL; + newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; + newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; + + sock_reset_flag(newsk, SOCK_DONE); + skb_queue_head_init(&newsk->sk_error_queue); + + filter = newsk->sk_filter; + if (filter != NULL) + sk_filter_charge(newsk, filter); + + if (unlikely(xfrm_sk_clone_policy(newsk))) { + /* It is still raw copy of parent, so invalidate + * destructor and make plain sk_free() */ + newsk->sk_destruct = NULL; + sk_free(newsk); + newsk = NULL; + goto out; + } + + newsk->sk_err = 0; + newsk->sk_priority = 0; + atomic_set(&newsk->sk_refcnt, 2); + + /* + * Increment the counter in the same struct proto as the master + * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that + * is the same as sk->sk_prot->socks, as this field was copied + * with memcpy). + * + * This _changes_ the previous behaviour, where + * tcp_create_openreq_child always was incrementing the + * equivalent to tcp_prot->socks (inet_sock_nr), so this have + * to be taken into account in all callers. -acme + */ + sk_refcnt_debug_inc(newsk); + newsk->sk_socket = NULL; + newsk->sk_sleep = NULL; + + if (newsk->sk_prot->sockets_allocated) + atomic_inc(newsk->sk_prot->sockets_allocated); + } +out: + return newsk; +} + +EXPORT_SYMBOL_GPL(sk_clone); + void __init sk_init(void) { if (num_physpages <= 4096) { @@ -1353,11 +1441,7 @@ void sk_common_release(struct sock *sk) xfrm_sk_free_policy(sk); -#ifdef INET_REFCNT_DEBUG - if (atomic_read(&sk->sk_refcnt) != 1) - printk(KERN_DEBUG "Destruction of the socket %p delayed, c=%d\n", - sk, atomic_read(&sk->sk_refcnt)); -#endif + sk_refcnt_debug_release(sk); sock_put(sk); } @@ -1368,7 +1452,8 @@ static LIST_HEAD(proto_list); int proto_register(struct proto *prot, int alloc_slab) { - char *request_sock_slab_name; + char *request_sock_slab_name = NULL; + char *timewait_sock_slab_name; int rc = -ENOBUFS; if (alloc_slab) { @@ -1399,6 +1484,23 @@ int proto_register(struct proto *prot, int alloc_slab) goto out_free_request_sock_slab_name; } } + + if (prot->twsk_obj_size) { + static const char mask[] = "tw_sock_%s"; + + timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL); + + if (timewait_sock_slab_name == NULL) + goto out_free_request_sock_slab; + + sprintf(timewait_sock_slab_name, mask, prot->name); + prot->twsk_slab = kmem_cache_create(timewait_sock_slab_name, + prot->twsk_obj_size, + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (prot->twsk_slab == NULL) + goto out_free_timewait_sock_slab_name; + } } write_lock(&proto_list_lock); @@ -1407,6 +1509,13 @@ int proto_register(struct proto *prot, int alloc_slab) rc = 0; out: return rc; +out_free_timewait_sock_slab_name: + kfree(timewait_sock_slab_name); +out_free_request_sock_slab: + if (prot->rsk_prot && prot->rsk_prot->slab) { + kmem_cache_destroy(prot->rsk_prot->slab); + prot->rsk_prot->slab = NULL; + } out_free_request_sock_slab_name: kfree(request_sock_slab_name); out_free_sock_slab: @@ -1434,6 +1543,14 @@ void proto_unregister(struct proto *prot) prot->rsk_prot->slab = NULL; } + if (prot->twsk_slab != NULL) { + const char *name = kmem_cache_name(prot->twsk_slab); + + kmem_cache_destroy(prot->twsk_slab); + kfree(name); + prot->twsk_slab = NULL; + } + list_del(&prot->node); write_unlock(&proto_list_lock); } diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 8f817ad9f54..2f278c8e474 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -9,23 +9,18 @@ #include <linux/sysctl.h> #include <linux/config.h> #include <linux/module.h> +#include <linux/socket.h> +#include <net/sock.h> #ifdef CONFIG_SYSCTL extern int netdev_max_backlog; -extern int netdev_budget; extern int weight_p; -extern int net_msg_cost; -extern int net_msg_burst; extern __u32 sysctl_wmem_max; extern __u32 sysctl_rmem_max; -extern __u32 sysctl_wmem_default; -extern __u32 sysctl_rmem_default; extern int sysctl_core_destroy_delay; -extern int sysctl_optmem_max; -extern int sysctl_somaxconn; #ifdef CONFIG_NET_DIVERT extern char sysctl_divert_version[]; diff --git a/net/core/utils.c b/net/core/utils.c index 88eb8b68e26..7b5970fc9e4 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -16,7 +16,9 @@ #include <linux/module.h> #include <linux/jiffies.h> #include <linux/kernel.h> +#include <linux/inet.h> #include <linux/mm.h> +#include <linux/net.h> #include <linux/string.h> #include <linux/types.h> #include <linux/random.h> diff --git a/net/core/wireless.c b/net/core/wireless.c index 3ff5639c0b7..5caae2399f3 100644 --- a/net/core/wireless.c +++ b/net/core/wireless.c @@ -571,10 +571,6 @@ static int wireless_seq_show(struct seq_file *seq, void *v) return 0; } -extern void *dev_seq_start(struct seq_file *seq, loff_t *pos); -extern void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos); -extern void dev_seq_stop(struct seq_file *seq, void *v); - static struct seq_operations wireless_seq_ops = { .start = dev_seq_start, .next = dev_seq_next, @@ -1144,8 +1140,8 @@ static inline void rtmsg_iwinfo(struct net_device * dev, kfree_skb(skb); return; } - NETLINK_CB(skb).dst_groups = RTMGRP_LINK; - netlink_broadcast(rtnl, skb, 0, RTMGRP_LINK, GFP_ATOMIC); + NETLINK_CB(skb).dst_group = RTNLGRP_LINK; + netlink_broadcast(rtnl, skb, 0, RTNLGRP_LINK, GFP_ATOMIC); } #endif /* WE_EVENT_NETLINK */ diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig new file mode 100644 index 00000000000..187ac182e24 --- /dev/null +++ b/net/dccp/Kconfig @@ -0,0 +1,50 @@ +menu "DCCP Configuration (EXPERIMENTAL)" + depends on INET && EXPERIMENTAL + +config IP_DCCP + tristate "The DCCP Protocol (EXPERIMENTAL)" + ---help--- + Datagram Congestion Control Protocol + + From draft-ietf-dccp-spec-11 <http://www.icir.org/kohler/dcp/draft-ietf-dccp-spec-11.txt>. + + The Datagram Congestion Control Protocol (DCCP) is a transport + protocol that implements bidirectional, unicast connections of + congestion-controlled, unreliable datagrams. It should be suitable + for use by applications such as streaming media, Internet telephony, + and on-line games + + To compile this protocol support as a module, choose M here: the + module will be called dccp. + + If in doubt, say N. + +config INET_DCCP_DIAG + depends on IP_DCCP && INET_DIAG + def_tristate y if (IP_DCCP = y && INET_DIAG = y) + def_tristate m + +source "net/dccp/ccids/Kconfig" + +menu "DCCP Kernel Hacking" + depends on IP_DCCP && DEBUG_KERNEL=y + +config IP_DCCP_DEBUG + bool "DCCP debug messages" + ---help--- + Only use this if you're hacking DCCP. + + Just say N. + +config IP_DCCP_UNLOAD_HACK + depends on IP_DCCP=m && IP_DCCP_CCID3=m + bool "DCCP control sock unload hack" + ---help--- + Enable this to be able to unload the dccp module when the it + has only one refcount held, the control sock one. Just execute + "rmmod dccp_ccid3 dccp" + + Just say N. +endmenu + +endmenu diff --git a/net/dccp/Makefile b/net/dccp/Makefile new file mode 100644 index 00000000000..fb97bb04245 --- /dev/null +++ b/net/dccp/Makefile @@ -0,0 +1,10 @@ +obj-$(CONFIG_IP_DCCP) += dccp.o + +dccp-y := ccid.o input.o ipv4.o minisocks.o options.o output.o proto.o \ + timer.o + +obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o + +dccp_diag-y := diag.o + +obj-y += ccids/ diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c new file mode 100644 index 00000000000..9d8fc0e289e --- /dev/null +++ b/net/dccp/ccid.c @@ -0,0 +1,139 @@ +/* + * net/dccp/ccid.c + * + * An implementation of the DCCP protocol + * Arnaldo Carvalho de Melo <acme@conectiva.com.br> + * + * CCID infrastructure + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include "ccid.h" + +static struct ccid *ccids[CCID_MAX]; +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) +static atomic_t ccids_lockct = ATOMIC_INIT(0); +static DEFINE_SPINLOCK(ccids_lock); + +/* + * The strategy is: modifications ccids vector are short, do not sleep and + * veeery rare, but read access should be free of any exclusive locks. + */ +static void ccids_write_lock(void) +{ + spin_lock(&ccids_lock); + while (atomic_read(&ccids_lockct) != 0) { + spin_unlock(&ccids_lock); + yield(); + spin_lock(&ccids_lock); + } +} + +static inline void ccids_write_unlock(void) +{ + spin_unlock(&ccids_lock); +} + +static inline void ccids_read_lock(void) +{ + atomic_inc(&ccids_lockct); + spin_unlock_wait(&ccids_lock); +} + +static inline void ccids_read_unlock(void) +{ + atomic_dec(&ccids_lockct); +} + +#else +#define ccids_write_lock() do { } while(0) +#define ccids_write_unlock() do { } while(0) +#define ccids_read_lock() do { } while(0) +#define ccids_read_unlock() do { } while(0) +#endif + +int ccid_register(struct ccid *ccid) +{ + int err; + + if (ccid->ccid_init == NULL) + return -1; + + ccids_write_lock(); + err = -EEXIST; + if (ccids[ccid->ccid_id] == NULL) { + ccids[ccid->ccid_id] = ccid; + err = 0; + } + ccids_write_unlock(); + if (err == 0) + pr_info("CCID: Registered CCID %d (%s)\n", + ccid->ccid_id, ccid->ccid_name); + return err; +} + +EXPORT_SYMBOL_GPL(ccid_register); + +int ccid_unregister(struct ccid *ccid) +{ + ccids_write_lock(); + ccids[ccid->ccid_id] = NULL; + ccids_write_unlock(); + pr_info("CCID: Unregistered CCID %d (%s)\n", + ccid->ccid_id, ccid->ccid_name); + return 0; +} + +EXPORT_SYMBOL_GPL(ccid_unregister); + +struct ccid *ccid_init(unsigned char id, struct sock *sk) +{ + struct ccid *ccid; + +#ifdef CONFIG_KMOD + if (ccids[id] == NULL) + request_module("net-dccp-ccid-%d", id); +#endif + ccids_read_lock(); + + ccid = ccids[id]; + if (ccid == NULL) + goto out; + + if (!try_module_get(ccid->ccid_owner)) + goto out_err; + + if (ccid->ccid_init(sk) != 0) + goto out_module_put; +out: + ccids_read_unlock(); + return ccid; +out_module_put: + module_put(ccid->ccid_owner); +out_err: + ccid = NULL; + goto out; +} + +EXPORT_SYMBOL_GPL(ccid_init); + +void ccid_exit(struct ccid *ccid, struct sock *sk) +{ + if (ccid == NULL) + return; + + ccids_read_lock(); + + if (ccids[ccid->ccid_id] != NULL) { + if (ccid->ccid_exit != NULL) + ccid->ccid_exit(sk); + module_put(ccid->ccid_owner); + } + + ccids_read_unlock(); +} + +EXPORT_SYMBOL_GPL(ccid_exit); diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h new file mode 100644 index 00000000000..962f1e9e2f7 --- /dev/null +++ b/net/dccp/ccid.h @@ -0,0 +1,180 @@ +#ifndef _CCID_H +#define _CCID_H +/* + * net/dccp/ccid.h + * + * An implementation of the DCCP protocol + * Arnaldo Carvalho de Melo <acme@conectiva.com.br> + * + * CCID infrastructure + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <net/sock.h> +#include <linux/dccp.h> +#include <linux/list.h> +#include <linux/module.h> + +#define CCID_MAX 255 + +struct ccid { + unsigned char ccid_id; + const char *ccid_name; + struct module *ccid_owner; + int (*ccid_init)(struct sock *sk); + void (*ccid_exit)(struct sock *sk); + int (*ccid_hc_rx_init)(struct sock *sk); + int (*ccid_hc_tx_init)(struct sock *sk); + void (*ccid_hc_rx_exit)(struct sock *sk); + void (*ccid_hc_tx_exit)(struct sock *sk); + void (*ccid_hc_rx_packet_recv)(struct sock *sk, + struct sk_buff *skb); + int (*ccid_hc_rx_parse_options)(struct sock *sk, + unsigned char option, + unsigned char len, u16 idx, + unsigned char* value); + void (*ccid_hc_rx_insert_options)(struct sock *sk, + struct sk_buff *skb); + void (*ccid_hc_tx_insert_options)(struct sock *sk, + struct sk_buff *skb); + void (*ccid_hc_tx_packet_recv)(struct sock *sk, + struct sk_buff *skb); + int (*ccid_hc_tx_parse_options)(struct sock *sk, + unsigned char option, + unsigned char len, u16 idx, + unsigned char* value); + int (*ccid_hc_tx_send_packet)(struct sock *sk, + struct sk_buff *skb, int len); + void (*ccid_hc_tx_packet_sent)(struct sock *sk, int more, + int len); + void (*ccid_hc_rx_get_info)(struct sock *sk, + struct tcp_info *info); + void (*ccid_hc_tx_get_info)(struct sock *sk, + struct tcp_info *info); +}; + +extern int ccid_register(struct ccid *ccid); +extern int ccid_unregister(struct ccid *ccid); + +extern struct ccid *ccid_init(unsigned char id, struct sock *sk); +extern void ccid_exit(struct ccid *ccid, struct sock *sk); + +static inline void __ccid_get(struct ccid *ccid) +{ + __module_get(ccid->ccid_owner); +} + +static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk, + struct sk_buff *skb, int len) +{ + int rc = 0; + if (ccid->ccid_hc_tx_send_packet != NULL) + rc = ccid->ccid_hc_tx_send_packet(sk, skb, len); + return rc; +} + +static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk, + int more, int len) +{ + if (ccid->ccid_hc_tx_packet_sent != NULL) + ccid->ccid_hc_tx_packet_sent(sk, more, len); +} + +static inline int ccid_hc_rx_init(struct ccid *ccid, struct sock *sk) +{ + int rc = 0; + if (ccid->ccid_hc_rx_init != NULL) + rc = ccid->ccid_hc_rx_init(sk); + return rc; +} + +static inline int ccid_hc_tx_init(struct ccid *ccid, struct sock *sk) +{ + int rc = 0; + if (ccid->ccid_hc_tx_init != NULL) + rc = ccid->ccid_hc_tx_init(sk); + return rc; +} + +static inline void ccid_hc_rx_exit(struct ccid *ccid, struct sock *sk) +{ + if (ccid->ccid_hc_rx_exit != NULL && + dccp_sk(sk)->dccps_hc_rx_ccid_private != NULL) + ccid->ccid_hc_rx_exit(sk); +} + +static inline void ccid_hc_tx_exit(struct ccid *ccid, struct sock *sk) +{ + if (ccid->ccid_hc_tx_exit != NULL && + dccp_sk(sk)->dccps_hc_tx_ccid_private != NULL) + ccid->ccid_hc_tx_exit(sk); +} + +static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk, + struct sk_buff *skb) +{ + if (ccid->ccid_hc_rx_packet_recv != NULL) + ccid->ccid_hc_rx_packet_recv(sk, skb); +} + +static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk, + struct sk_buff *skb) +{ + if (ccid->ccid_hc_tx_packet_recv != NULL) + ccid->ccid_hc_tx_packet_recv(sk, skb); +} + +static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk, + unsigned char option, + unsigned char len, u16 idx, + unsigned char* value) +{ + int rc = 0; + if (ccid->ccid_hc_tx_parse_options != NULL) + rc = ccid->ccid_hc_tx_parse_options(sk, option, len, idx, + value); + return rc; +} + +static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk, + unsigned char option, + unsigned char len, u16 idx, + unsigned char* value) +{ + int rc = 0; + if (ccid->ccid_hc_rx_parse_options != NULL) + rc = ccid->ccid_hc_rx_parse_options(sk, option, len, idx, value); + return rc; +} + +static inline void ccid_hc_tx_insert_options(struct ccid *ccid, struct sock *sk, + struct sk_buff *skb) +{ + if (ccid->ccid_hc_tx_insert_options != NULL) + ccid->ccid_hc_tx_insert_options(sk, skb); +} + +static inline void ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk, + struct sk_buff *skb) +{ + if (ccid->ccid_hc_rx_insert_options != NULL) + ccid->ccid_hc_rx_insert_options(sk, skb); +} + +static inline void ccid_hc_rx_get_info(struct ccid *ccid, struct sock *sk, + struct tcp_info *info) +{ + if (ccid->ccid_hc_rx_get_info != NULL) + ccid->ccid_hc_rx_get_info(sk, info); +} + +static inline void ccid_hc_tx_get_info(struct ccid *ccid, struct sock *sk, + struct tcp_info *info) +{ + if (ccid->ccid_hc_tx_get_info != NULL) + ccid->ccid_hc_tx_get_info(sk, info); +} +#endif /* _CCID_H */ diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig new file mode 100644 index 00000000000..7684d83946a --- /dev/null +++ b/net/dccp/ccids/Kconfig @@ -0,0 +1,29 @@ +menu "DCCP CCIDs Configuration (EXPERIMENTAL)" + depends on IP_DCCP && EXPERIMENTAL + +config IP_DCCP_CCID3 + tristate "CCID3 (TFRC) (EXPERIMENTAL)" + depends on IP_DCCP + ---help--- + CCID 3 denotes TCP-Friendly Rate Control (TFRC), an equation-based + rate-controlled congestion control mechanism. TFRC is designed to + be reasonably fair when competing for bandwidth with TCP-like flows, + where a flow is "reasonably fair" if its sending rate is generally + within a factor of two of the sending rate of a TCP flow under the + same conditions. However, TFRC has a much lower variation of + throughput over time compared with TCP, which makes CCID 3 more + suitable than CCID 2 for applications such streaming media where a + relatively smooth sending rate is of importance. + + CCID 3 is further described in [CCID 3 PROFILE]. The TFRC + congestion control algorithms were initially described in RFC 3448. + + This text was extracted from draft-ietf-dccp-spec-11.txt. + + If in doubt, say M. + +config IP_DCCP_TFRC_LIB + depends on IP_DCCP_CCID3 + def_tristate IP_DCCP_CCID3 + +endmenu diff --git a/net/dccp/ccids/Makefile b/net/dccp/ccids/Makefile new file mode 100644 index 00000000000..956f79f5074 --- /dev/null +++ b/net/dccp/ccids/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_IP_DCCP_CCID3) += dccp_ccid3.o + +dccp_ccid3-y := ccid3.o + +obj-y += lib/ diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c new file mode 100644 index 00000000000..7bf3b3a91e9 --- /dev/null +++ b/net/dccp/ccids/ccid3.c @@ -0,0 +1,1221 @@ +/* + * net/dccp/ccids/ccid3.c + * + * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. + * Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz> + * + * An implementation of the DCCP protocol + * + * This code has been developed by the University of Waikato WAND + * research group. For further information please see http://www.wand.net.nz/ + * + * This code also uses code from Lulea University, rereleased as GPL by its + * authors: + * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon + * + * Changes to meet Linux coding standards, to make it meet latest ccid3 draft + * and to make it work as a loadable module in the DCCP stack written by + * Arnaldo Carvalho de Melo <acme@conectiva.com.br>. + * + * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include <linux/config.h> +#include "../ccid.h" +#include "../dccp.h" +#include "lib/packet_history.h" +#include "lib/loss_interval.h" +#include "lib/tfrc.h" +#include "ccid3.h" + +/* + * Reason for maths with 10 here is to avoid 32 bit overflow when a is big. + */ +static inline u32 usecs_div(const u32 a, const u32 b) +{ + const u32 tmp = a * (USEC_PER_SEC / 10); + return b > 20 ? tmp / (b / 10) : tmp; +} + +static int ccid3_debug; + +#ifdef CCID3_DEBUG +#define ccid3_pr_debug(format, a...) \ + do { if (ccid3_debug) \ + printk(KERN_DEBUG "%s: " format, __FUNCTION__, ##a); \ + } while (0) +#else +#define ccid3_pr_debug(format, a...) +#endif + +static struct dccp_tx_hist *ccid3_tx_hist; +static struct dccp_rx_hist *ccid3_rx_hist; +static struct dccp_li_hist *ccid3_li_hist; + +static int ccid3_init(struct sock *sk) +{ + ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk); + return 0; +} + +static void ccid3_exit(struct sock *sk) +{ + ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk); +} + +/* TFRC sender states */ +enum ccid3_hc_tx_states { + TFRC_SSTATE_NO_SENT = 1, + TFRC_SSTATE_NO_FBACK, + TFRC_SSTATE_FBACK, + TFRC_SSTATE_TERM, +}; + +#ifdef CCID3_DEBUG +static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state) +{ + static char *ccid3_state_names[] = { + [TFRC_SSTATE_NO_SENT] = "NO_SENT", + [TFRC_SSTATE_NO_FBACK] = "NO_FBACK", + [TFRC_SSTATE_FBACK] = "FBACK", + [TFRC_SSTATE_TERM] = "TERM", + }; + + return ccid3_state_names[state]; +} +#endif + +static inline void ccid3_hc_tx_set_state(struct sock *sk, + enum ccid3_hc_tx_states state) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; + enum ccid3_hc_tx_states oldstate = hctx->ccid3hctx_state; + + ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", + dccp_role(sk), sk, ccid3_tx_state_name(oldstate), + ccid3_tx_state_name(state)); + WARN_ON(state == oldstate); + hctx->ccid3hctx_state = state; +} + +/* Calculate new t_ipi (inter packet interval) by t_ipi = s / X_inst */ +static inline void ccid3_calc_new_t_ipi(struct ccid3_hc_tx_sock *hctx) +{ + /* + * If no feedback spec says t_ipi is 1 second (set elsewhere and then + * doubles after every no feedback timer (separate function) + */ + if (hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK) + hctx->ccid3hctx_t_ipi = usecs_div(hctx->ccid3hctx_s, + hctx->ccid3hctx_x); +} + +/* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */ +static inline void ccid3_calc_new_delta(struct ccid3_hc_tx_sock *hctx) +{ + hctx->ccid3hctx_delta = min_t(u32, hctx->ccid3hctx_t_ipi / 2, + TFRC_OPSYS_HALF_TIME_GRAN); +} + +/* + * Update X by + * If (p > 0) + * x_calc = calcX(s, R, p); + * X = max(min(X_calc, 2 * X_recv), s / t_mbi); + * Else + * If (now - tld >= R) + * X = max(min(2 * X, 2 * X_recv), s / R); + * tld = now; + */ +static void ccid3_hc_tx_update_x(struct sock *sk) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; + + /* To avoid large error in calcX */ + if (hctx->ccid3hctx_p >= TFRC_SMALLEST_P) { + hctx->ccid3hctx_x_calc = tfrc_calc_x(hctx->ccid3hctx_s, + hctx->ccid3hctx_rtt, + hctx->ccid3hctx_p); + hctx->ccid3hctx_x = max_t(u32, min_t(u32, hctx->ccid3hctx_x_calc, + 2 * hctx->ccid3hctx_x_recv), + (hctx->ccid3hctx_s / + TFRC_MAX_BACK_OFF_TIME)); + } else { + struct timeval now; + + do_gettimeofday(&now); + if (timeval_delta(&now, &hctx->ccid3hctx_t_ld) >= + hctx->ccid3hctx_rtt) { + hctx->ccid3hctx_x = max_t(u32, min_t(u32, hctx->ccid3hctx_x_recv, + hctx->ccid3hctx_x) * 2, + usecs_div(hctx->ccid3hctx_s, + hctx->ccid3hctx_rtt)); + hctx->ccid3hctx_t_ld = now; + } + } +} + +static void ccid3_hc_tx_no_feedback_timer(unsigned long data) +{ + struct sock *sk = (struct sock *)data; + struct dccp_sock *dp = dccp_sk(sk); + unsigned long next_tmout = 0; + struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + /* Try again later. */ + /* XXX: set some sensible MIB */ + sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, + jiffies + HZ / 5); + goto out; + } + + ccid3_pr_debug("%s, sk=%p, state=%s\n", dccp_role(sk), sk, + ccid3_tx_state_name(hctx->ccid3hctx_state)); + + switch (hctx->ccid3hctx_state) { + case TFRC_SSTATE_TERM: + goto out; + case TFRC_SSTATE_NO_FBACK: + /* Halve send rate */ + hctx->ccid3hctx_x /= 2; + if (hctx->ccid3hctx_x < (hctx->ccid3hctx_s / + TFRC_MAX_BACK_OFF_TIME)) + hctx->ccid3hctx_x = (hctx->ccid3hctx_s / + TFRC_MAX_BACK_OFF_TIME); + + ccid3_pr_debug("%s, sk=%p, state=%s, updated tx rate to %d " + "bytes/s\n", + dccp_role(sk), sk, + ccid3_tx_state_name(hctx->ccid3hctx_state), + hctx->ccid3hctx_x); + next_tmout = max_t(u32, 2 * usecs_div(hctx->ccid3hctx_s, + hctx->ccid3hctx_x), + TFRC_INITIAL_TIMEOUT); + /* + * FIXME - not sure above calculation is correct. See section + * 5 of CCID3 11 should adjust tx_t_ipi and double that to + * achieve it really + */ + break; + case TFRC_SSTATE_FBACK: + /* + * Check if IDLE since last timeout and recv rate is less than + * 4 packets per RTT + */ + if (!hctx->ccid3hctx_idle || + (hctx->ccid3hctx_x_recv >= + 4 * usecs_div(hctx->ccid3hctx_s, hctx->ccid3hctx_rtt))) { + ccid3_pr_debug("%s, sk=%p, state=%s, not idle\n", + dccp_role(sk), sk, + ccid3_tx_state_name(hctx->ccid3hctx_state)); + /* Halve sending rate */ + + /* If (X_calc > 2 * X_recv) + * X_recv = max(X_recv / 2, s / (2 * t_mbi)); + * Else + * X_recv = X_calc / 4; + */ + BUG_ON(hctx->ccid3hctx_p >= TFRC_SMALLEST_P && + hctx->ccid3hctx_x_calc == 0); + + /* check also if p is zero -> x_calc is infinity? */ + if (hctx->ccid3hctx_p < TFRC_SMALLEST_P || + hctx->ccid3hctx_x_calc > 2 * hctx->ccid3hctx_x_recv) + hctx->ccid3hctx_x_recv = max_t(u32, hctx->ccid3hctx_x_recv / 2, + hctx->ccid3hctx_s / (2 * TFRC_MAX_BACK_OFF_TIME)); + else + hctx->ccid3hctx_x_recv = hctx->ccid3hctx_x_calc / 4; + + /* Update sending rate */ + ccid3_hc_tx_update_x(sk); + } + /* + * Schedule no feedback timer to expire in + * max(4 * R, 2 * s / X) + */ + next_tmout = max_t(u32, hctx->ccid3hctx_t_rto, + 2 * usecs_div(hctx->ccid3hctx_s, + hctx->ccid3hctx_x)); + break; + default: + printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n", + __FUNCTION__, dccp_role(sk), sk, hctx->ccid3hctx_state); + dump_stack(); + goto out; + } + + sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, + jiffies + max_t(u32, 1, usecs_to_jiffies(next_tmout))); + hctx->ccid3hctx_idle = 1; +out: + bh_unlock_sock(sk); + sock_put(sk); +} + +static int ccid3_hc_tx_send_packet(struct sock *sk, + struct sk_buff *skb, int len) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; + struct dccp_tx_hist_entry *new_packet; + struct timeval now; + long delay; + int rc = -ENOTCONN; + + /* Check if pure ACK or Terminating*/ + + /* + * XXX: We only call this function for DATA and DATAACK, on, these + * packets can have zero length, but why the comment about "pure ACK"? + */ + if (hctx == NULL || len == 0 || + hctx->ccid3hctx_state == TFRC_SSTATE_TERM) + goto out; + + /* See if last packet allocated was not sent */ + new_packet = dccp_tx_hist_head(&hctx->ccid3hctx_hist); + if (new_packet == NULL || new_packet->dccphtx_sent) { + new_packet = dccp_tx_hist_entry_new(ccid3_tx_hist, + SLAB_ATOMIC); + + rc = -ENOBUFS; + if (new_packet == NULL) { + ccid3_pr_debug("%s, sk=%p, not enough mem to add " + "to history, send refused\n", + dccp_role(sk), sk); + goto out; + } + + dccp_tx_hist_add_entry(&hctx->ccid3hctx_hist, new_packet); + } + + do_gettimeofday(&now); + + switch (hctx->ccid3hctx_state) { + case TFRC_SSTATE_NO_SENT: + ccid3_pr_debug("%s, sk=%p, first packet(%llu)\n", + dccp_role(sk), sk, dp->dccps_gss); + + hctx->ccid3hctx_no_feedback_timer.function = ccid3_hc_tx_no_feedback_timer; + hctx->ccid3hctx_no_feedback_timer.data = (unsigned long)sk; + sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, + jiffies + usecs_to_jiffies(TFRC_INITIAL_TIMEOUT)); + hctx->ccid3hctx_last_win_count = 0; + hctx->ccid3hctx_t_last_win_count = now; + ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); + hctx->ccid3hctx_t_ipi = TFRC_INITIAL_TIMEOUT; + + /* Set nominal send time for initial packet */ + hctx->ccid3hctx_t_nom = now; + timeval_add_usecs(&hctx->ccid3hctx_t_nom, + hctx->ccid3hctx_t_ipi); + ccid3_calc_new_delta(hctx); + rc = 0; + break; + case TFRC_SSTATE_NO_FBACK: + case TFRC_SSTATE_FBACK: + delay = (timeval_delta(&now, &hctx->ccid3hctx_t_nom) - + hctx->ccid3hctx_delta); + ccid3_pr_debug("send_packet delay=%ld\n", delay); + delay /= -1000; + /* divide by -1000 is to convert to ms and get sign right */ + rc = delay > 0 ? delay : 0; + break; + default: + printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n", + __FUNCTION__, dccp_role(sk), sk, hctx->ccid3hctx_state); + dump_stack(); + rc = -EINVAL; + break; + } + + /* Can we send? if so add options and add to packet history */ + if (rc == 0) + new_packet->dccphtx_ccval = + DCCP_SKB_CB(skb)->dccpd_ccval = + hctx->ccid3hctx_last_win_count; +out: + return rc; +} + +static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, int len) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; + struct timeval now; + + BUG_ON(hctx == NULL); + + if (hctx->ccid3hctx_state == TFRC_SSTATE_TERM) { + ccid3_pr_debug("%s, sk=%p, while state is TFRC_SSTATE_TERM!\n", + dccp_role(sk), sk); + return; + } + + do_gettimeofday(&now); + + /* check if we have sent a data packet */ + if (len > 0) { + unsigned long quarter_rtt; + struct dccp_tx_hist_entry *packet; + + packet = dccp_tx_hist_head(&hctx->ccid3hctx_hist); + if (packet == NULL) { + printk(KERN_CRIT "%s: packet doesn't exists in " + "history!\n", __FUNCTION__); + return; + } + if (packet->dccphtx_sent) { + printk(KERN_CRIT "%s: no unsent packet in history!\n", + __FUNCTION__); + return; + } + packet->dccphtx_tstamp = now; + packet->dccphtx_seqno = dp->dccps_gss; + /* + * Check if win_count have changed + * Algorithm in "8.1. Window Counter Valuer" in + * draft-ietf-dccp-ccid3-11.txt + */ + quarter_rtt = timeval_delta(&now, &hctx->ccid3hctx_t_last_win_count); + if (likely(hctx->ccid3hctx_rtt > 8)) + quarter_rtt /= hctx->ccid3hctx_rtt / 4; + + if (quarter_rtt > 0) { + hctx->ccid3hctx_t_last_win_count = now; + hctx->ccid3hctx_last_win_count = (hctx->ccid3hctx_last_win_count + + min_t(unsigned long, quarter_rtt, 5)) % 16; + ccid3_pr_debug("%s, sk=%p, window changed from " + "%u to %u!\n", + dccp_role(sk), sk, + packet->dccphtx_ccval, + hctx->ccid3hctx_last_win_count); + } + + hctx->ccid3hctx_idle = 0; + packet->dccphtx_rtt = hctx->ccid3hctx_rtt; + packet->dccphtx_sent = 1; + } else + ccid3_pr_debug("%s, sk=%p, seqno=%llu NOT inserted!\n", + dccp_role(sk), sk, dp->dccps_gss); + + switch (hctx->ccid3hctx_state) { + case TFRC_SSTATE_NO_SENT: + /* if first wasn't pure ack */ + if (len != 0) + printk(KERN_CRIT "%s: %s, First packet sent is noted " + "as a data packet\n", + __FUNCTION__, dccp_role(sk)); + return; + case TFRC_SSTATE_NO_FBACK: + case TFRC_SSTATE_FBACK: + if (len > 0) { + hctx->ccid3hctx_t_nom = now; + ccid3_calc_new_t_ipi(hctx); + ccid3_calc_new_delta(hctx); + timeval_add_usecs(&hctx->ccid3hctx_t_nom, + hctx->ccid3hctx_t_ipi); + } + break; + default: + printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n", + __FUNCTION__, dccp_role(sk), sk, hctx->ccid3hctx_state); + dump_stack(); + break; + } +} + +static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; + struct ccid3_options_received *opt_recv; + struct dccp_tx_hist_entry *packet; + unsigned long next_tmout; + u32 t_elapsed; + u32 pinv; + u32 x_recv; + u32 r_sample; + + if (hctx == NULL) + return; + + if (hctx->ccid3hctx_state == TFRC_SSTATE_TERM) { + ccid3_pr_debug("%s, sk=%p, received a packet when " + "terminating!\n", dccp_role(sk), sk); + return; + } + + /* we are only interested in ACKs */ + if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK || + DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK)) + return; + + opt_recv = &hctx->ccid3hctx_options_received; + + t_elapsed = dp->dccps_options_received.dccpor_elapsed_time; + x_recv = opt_recv->ccid3or_receive_rate; + pinv = opt_recv->ccid3or_loss_event_rate; + + switch (hctx->ccid3hctx_state) { + case TFRC_SSTATE_NO_SENT: + /* FIXME: what to do here? */ + return; + case TFRC_SSTATE_NO_FBACK: + case TFRC_SSTATE_FBACK: + /* Calculate new round trip sample by + * R_sample = (now - t_recvdata) - t_delay */ + /* get t_recvdata from history */ + packet = dccp_tx_hist_find_entry(&hctx->ccid3hctx_hist, + DCCP_SKB_CB(skb)->dccpd_ack_seq); + if (packet == NULL) { + ccid3_pr_debug("%s, sk=%p, seqno %llu(%s) does't " + "exist in history!\n", + dccp_role(sk), sk, + DCCP_SKB_CB(skb)->dccpd_ack_seq, + dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type)); + return; + } + + /* Update RTT */ + r_sample = timeval_now_delta(&packet->dccphtx_tstamp); + /* FIXME: */ + // r_sample -= usecs_to_jiffies(t_elapsed * 10); + + /* Update RTT estimate by + * If (No feedback recv) + * R = R_sample; + * Else + * R = q * R + (1 - q) * R_sample; + * + * q is a constant, RFC 3448 recomments 0.9 + */ + if (hctx->ccid3hctx_state == TFRC_SSTATE_NO_FBACK) { + ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK); + hctx->ccid3hctx_rtt = r_sample; + } else + hctx->ccid3hctx_rtt = (hctx->ccid3hctx_rtt * 9) / 10 + + r_sample / 10; + + ccid3_pr_debug("%s, sk=%p, New RTT estimate=%uus, " + "r_sample=%us\n", dccp_role(sk), sk, + hctx->ccid3hctx_rtt, r_sample); + + /* Update timeout interval */ + hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt, + USEC_PER_SEC); + + /* Update receive rate */ + hctx->ccid3hctx_x_recv = x_recv;/* X_recv in bytes per sec */ + + /* Update loss event rate */ + if (pinv == ~0 || pinv == 0) + hctx->ccid3hctx_p = 0; + else { + hctx->ccid3hctx_p = 1000000 / pinv; + + if (hctx->ccid3hctx_p < TFRC_SMALLEST_P) { + hctx->ccid3hctx_p = TFRC_SMALLEST_P; + ccid3_pr_debug("%s, sk=%p, Smallest p used!\n", + dccp_role(sk), sk); + } + } + + /* unschedule no feedback timer */ + sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer); + + /* Update sending rate */ + ccid3_hc_tx_update_x(sk); + + /* Update next send time */ + timeval_sub_usecs(&hctx->ccid3hctx_t_nom, + hctx->ccid3hctx_t_ipi); + ccid3_calc_new_t_ipi(hctx); + timeval_add_usecs(&hctx->ccid3hctx_t_nom, + hctx->ccid3hctx_t_ipi); + ccid3_calc_new_delta(hctx); + + /* remove all packets older than the one acked from history */ + dccp_tx_hist_purge_older(ccid3_tx_hist, + &hctx->ccid3hctx_hist, packet); + /* + * As we have calculated new ipi, delta, t_nom it is possible that + * we now can send a packet, so wake up dccp_wait_for_ccids. + */ + sk->sk_write_space(sk); + + /* + * Schedule no feedback timer to expire in + * max(4 * R, 2 * s / X) + */ + next_tmout = max(hctx->ccid3hctx_t_rto, + 2 * usecs_div(hctx->ccid3hctx_s, + hctx->ccid3hctx_x)); + + ccid3_pr_debug("%s, sk=%p, Scheduled no feedback timer to " + "expire in %lu jiffies (%luus)\n", + dccp_role(sk), sk, + usecs_to_jiffies(next_tmout), next_tmout); + + sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, + jiffies + max_t(u32, 1, usecs_to_jiffies(next_tmout))); + + /* set idle flag */ + hctx->ccid3hctx_idle = 1; + break; + default: + printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n", + __FUNCTION__, dccp_role(sk), sk, hctx->ccid3hctx_state); + dump_stack(); + break; + } +} + +static void ccid3_hc_tx_insert_options(struct sock *sk, struct sk_buff *skb) +{ + const struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; + + if (hctx == NULL || !(sk->sk_state == DCCP_OPEN || + sk->sk_state == DCCP_PARTOPEN)) + return; + + DCCP_SKB_CB(skb)->dccpd_ccval = hctx->ccid3hctx_last_win_count; +} + +static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option, + unsigned char len, u16 idx, + unsigned char *value) +{ + int rc = 0; + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; + struct ccid3_options_received *opt_recv; + + if (hctx == NULL) + return 0; + + opt_recv = &hctx->ccid3hctx_options_received; + + if (opt_recv->ccid3or_seqno != dp->dccps_gsr) { + opt_recv->ccid3or_seqno = dp->dccps_gsr; + opt_recv->ccid3or_loss_event_rate = ~0; + opt_recv->ccid3or_loss_intervals_idx = 0; + opt_recv->ccid3or_loss_intervals_len = 0; + opt_recv->ccid3or_receive_rate = 0; + } + + switch (option) { + case TFRC_OPT_LOSS_EVENT_RATE: + if (len != 4) { + ccid3_pr_debug("%s, sk=%p, invalid len for " + "TFRC_OPT_LOSS_EVENT_RATE\n", + dccp_role(sk), sk); + rc = -EINVAL; + } else { + opt_recv->ccid3or_loss_event_rate = ntohl(*(u32 *)value); + ccid3_pr_debug("%s, sk=%p, LOSS_EVENT_RATE=%u\n", + dccp_role(sk), sk, + opt_recv->ccid3or_loss_event_rate); + } + break; + case TFRC_OPT_LOSS_INTERVALS: + opt_recv->ccid3or_loss_intervals_idx = idx; + opt_recv->ccid3or_loss_intervals_len = len; + ccid3_pr_debug("%s, sk=%p, LOSS_INTERVALS=(%u, %u)\n", + dccp_role(sk), sk, + opt_recv->ccid3or_loss_intervals_idx, + opt_recv->ccid3or_loss_intervals_len); + break; + case TFRC_OPT_RECEIVE_RATE: + if (len != 4) { + ccid3_pr_debug("%s, sk=%p, invalid len for " + "TFRC_OPT_RECEIVE_RATE\n", + dccp_role(sk), sk); + rc = -EINVAL; + } else { + opt_recv->ccid3or_receive_rate = ntohl(*(u32 *)value); + ccid3_pr_debug("%s, sk=%p, RECEIVE_RATE=%u\n", + dccp_role(sk), sk, + opt_recv->ccid3or_receive_rate); + } + break; + } + + return rc; +} + +static int ccid3_hc_tx_init(struct sock *sk) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_tx_sock *hctx; + + ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk); + + hctx = dp->dccps_hc_tx_ccid_private = kmalloc(sizeof(*hctx), + gfp_any()); + if (hctx == NULL) + return -ENOMEM; + + memset(hctx, 0, sizeof(*hctx)); + + if (dp->dccps_packet_size >= TFRC_MIN_PACKET_SIZE && + dp->dccps_packet_size <= TFRC_MAX_PACKET_SIZE) + hctx->ccid3hctx_s = dp->dccps_packet_size; + else + hctx->ccid3hctx_s = TFRC_STD_PACKET_SIZE; + + /* Set transmission rate to 1 packet per second */ + hctx->ccid3hctx_x = hctx->ccid3hctx_s; + hctx->ccid3hctx_t_rto = USEC_PER_SEC; + hctx->ccid3hctx_state = TFRC_SSTATE_NO_SENT; + INIT_LIST_HEAD(&hctx->ccid3hctx_hist); + init_timer(&hctx->ccid3hctx_no_feedback_timer); + + return 0; +} + +static void ccid3_hc_tx_exit(struct sock *sk) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; + + ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk); + BUG_ON(hctx == NULL); + + ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM); + sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer); + + /* Empty packet history */ + dccp_tx_hist_purge(ccid3_tx_hist, &hctx->ccid3hctx_hist); + + kfree(dp->dccps_hc_tx_ccid_private); + dp->dccps_hc_tx_ccid_private = NULL; +} + +/* + * RX Half Connection methods + */ + +/* TFRC receiver states */ +enum ccid3_hc_rx_states { + TFRC_RSTATE_NO_DATA = 1, + TFRC_RSTATE_DATA, + TFRC_RSTATE_TERM = 127, +}; + +#ifdef CCID3_DEBUG +static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state) +{ + static char *ccid3_rx_state_names[] = { + [TFRC_RSTATE_NO_DATA] = "NO_DATA", + [TFRC_RSTATE_DATA] = "DATA", + [TFRC_RSTATE_TERM] = "TERM", + }; + + return ccid3_rx_state_names[state]; +} +#endif + +static inline void ccid3_hc_rx_set_state(struct sock *sk, + enum ccid3_hc_rx_states state) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; + enum ccid3_hc_rx_states oldstate = hcrx->ccid3hcrx_state; + + ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", + dccp_role(sk), sk, ccid3_rx_state_name(oldstate), + ccid3_rx_state_name(state)); + WARN_ON(state == oldstate); + hcrx->ccid3hcrx_state = state; +} + +static void ccid3_hc_rx_send_feedback(struct sock *sk) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; + struct dccp_rx_hist_entry *packet; + struct timeval now; + + ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk); + + do_gettimeofday(&now); + + switch (hcrx->ccid3hcrx_state) { + case TFRC_RSTATE_NO_DATA: + hcrx->ccid3hcrx_x_recv = 0; + break; + case TFRC_RSTATE_DATA: { + const u32 delta = timeval_delta(&now, + &hcrx->ccid3hcrx_tstamp_last_feedback); + + hcrx->ccid3hcrx_x_recv = (hcrx->ccid3hcrx_bytes_recv * + USEC_PER_SEC); + if (likely(delta > 1)) + hcrx->ccid3hcrx_x_recv /= delta; + } + break; + default: + printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n", + __FUNCTION__, dccp_role(sk), sk, hcrx->ccid3hcrx_state); + dump_stack(); + return; + } + + packet = dccp_rx_hist_find_data_packet(&hcrx->ccid3hcrx_hist); + if (packet == NULL) { + printk(KERN_CRIT "%s: %s, sk=%p, no data packet in history!\n", + __FUNCTION__, dccp_role(sk), sk); + dump_stack(); + return; + } + + hcrx->ccid3hcrx_tstamp_last_feedback = now; + hcrx->ccid3hcrx_last_counter = packet->dccphrx_ccval; + hcrx->ccid3hcrx_seqno_last_counter = packet->dccphrx_seqno; + hcrx->ccid3hcrx_bytes_recv = 0; + + /* Convert to multiples of 10us */ + hcrx->ccid3hcrx_elapsed_time = + timeval_delta(&now, &packet->dccphrx_tstamp) / 10; + if (hcrx->ccid3hcrx_p == 0) + hcrx->ccid3hcrx_pinv = ~0; + else + hcrx->ccid3hcrx_pinv = 1000000 / hcrx->ccid3hcrx_p; + dccp_send_ack(sk); +} + +static void ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) +{ + const struct dccp_sock *dp = dccp_sk(sk); + u32 x_recv, pinv; + struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; + + if (hcrx == NULL || !(sk->sk_state == DCCP_OPEN || + sk->sk_state == DCCP_PARTOPEN)) + return; + + DCCP_SKB_CB(skb)->dccpd_ccval = hcrx->ccid3hcrx_last_counter; + + if (dccp_packet_without_ack(skb)) + return; + + if (hcrx->ccid3hcrx_elapsed_time != 0) + dccp_insert_option_elapsed_time(sk, skb, + hcrx->ccid3hcrx_elapsed_time); + dccp_insert_option_timestamp(sk, skb); + x_recv = htonl(hcrx->ccid3hcrx_x_recv); + pinv = htonl(hcrx->ccid3hcrx_pinv); + dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE, + &pinv, sizeof(pinv)); + dccp_insert_option(sk, skb, TFRC_OPT_RECEIVE_RATE, + &x_recv, sizeof(x_recv)); +} + +/* calculate first loss interval + * + * returns estimated loss interval in usecs */ + +static u32 ccid3_hc_rx_calc_first_li(struct sock *sk) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; + struct dccp_rx_hist_entry *entry, *next, *tail = NULL; + u32 rtt, delta, x_recv, fval, p, tmp2; + struct timeval tstamp = { 0, }; + int interval = 0; + int win_count = 0; + int step = 0; + u64 tmp1; + + list_for_each_entry_safe(entry, next, &hcrx->ccid3hcrx_hist, + dccphrx_node) { + if (dccp_rx_hist_entry_data_packet(entry)) { + tail = entry; + + switch (step) { + case 0: + tstamp = entry->dccphrx_tstamp; + win_count = entry->dccphrx_ccval; + step = 1; + break; + case 1: + interval = win_count - entry->dccphrx_ccval; + if (interval < 0) + interval += TFRC_WIN_COUNT_LIMIT; + if (interval > 4) + goto found; + break; + } + } + } + + if (step == 0) { + printk(KERN_CRIT "%s: %s, sk=%p, packet history contains no " + "data packets!\n", + __FUNCTION__, dccp_role(sk), sk); + return ~0; + } + + if (interval == 0) { + ccid3_pr_debug("%s, sk=%p, Could not find a win_count " + "interval > 0. Defaulting to 1\n", + dccp_role(sk), sk); + interval = 1; + } +found: + rtt = timeval_delta(&tstamp, &tail->dccphrx_tstamp) * 4 / interval; + ccid3_pr_debug("%s, sk=%p, approximated RTT to %uus\n", + dccp_role(sk), sk, rtt); + if (rtt == 0) + rtt = 1; + + delta = timeval_now_delta(&hcrx->ccid3hcrx_tstamp_last_feedback); + x_recv = hcrx->ccid3hcrx_bytes_recv * USEC_PER_SEC; + if (likely(delta > 1)) + x_recv /= delta; + + tmp1 = (u64)x_recv * (u64)rtt; + do_div(tmp1,10000000); + tmp2 = (u32)tmp1; + fval = (hcrx->ccid3hcrx_s * 100000) / tmp2; + /* do not alter order above or you will get overflow on 32 bit */ + p = tfrc_calc_x_reverse_lookup(fval); + ccid3_pr_debug("%s, sk=%p, receive rate=%u bytes/s, implied " + "loss rate=%u\n", dccp_role(sk), sk, x_recv, p); + + if (p == 0) + return ~0; + else + return 1000000 / p; +} + +static void ccid3_hc_rx_update_li(struct sock *sk, u64 seq_loss, u8 win_loss) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; + + if (seq_loss != DCCP_MAX_SEQNO + 1 && + list_empty(&hcrx->ccid3hcrx_li_hist)) { + struct dccp_li_hist_entry *li_tail; + + li_tail = dccp_li_hist_interval_new(ccid3_li_hist, + &hcrx->ccid3hcrx_li_hist, + seq_loss, win_loss); + if (li_tail == NULL) + return; + li_tail->dccplih_interval = ccid3_hc_rx_calc_first_li(sk); + } + /* FIXME: find end of interval */ +} + +static void ccid3_hc_rx_detect_loss(struct sock *sk) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; + u8 win_loss; + const u64 seq_loss = dccp_rx_hist_detect_loss(&hcrx->ccid3hcrx_hist, + &hcrx->ccid3hcrx_li_hist, + &win_loss); + + ccid3_hc_rx_update_li(sk, seq_loss, win_loss); +} + +static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; + const struct dccp_options_received *opt_recv; + struct dccp_rx_hist_entry *packet; + struct timeval now; + u8 win_count; + u32 p_prev; + int ins; + + if (hcrx == NULL) + return; + + BUG_ON(!(hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA || + hcrx->ccid3hcrx_state == TFRC_RSTATE_DATA)); + + opt_recv = &dp->dccps_options_received; + + switch (DCCP_SKB_CB(skb)->dccpd_type) { + case DCCP_PKT_ACK: + if (hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA) + return; + case DCCP_PKT_DATAACK: + if (opt_recv->dccpor_timestamp_echo == 0) + break; + p_prev = hcrx->ccid3hcrx_rtt; + do_gettimeofday(&now); + hcrx->ccid3hcrx_rtt = timeval_usecs(&now) - + (opt_recv->dccpor_timestamp_echo - + opt_recv->dccpor_elapsed_time) * 10; + if (p_prev != hcrx->ccid3hcrx_rtt) + ccid3_pr_debug("%s, New RTT=%luus, elapsed time=%u\n", + dccp_role(sk), hcrx->ccid3hcrx_rtt, + opt_recv->dccpor_elapsed_time); + break; + case DCCP_PKT_DATA: + break; + default: + ccid3_pr_debug("%s, sk=%p, not DATA/DATAACK/ACK packet(%s)\n", + dccp_role(sk), sk, + dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type)); + return; + } + + packet = dccp_rx_hist_entry_new(ccid3_rx_hist, opt_recv->dccpor_ndp, + skb, SLAB_ATOMIC); + if (packet == NULL) { + ccid3_pr_debug("%s, sk=%p, Not enough mem to add rx packet " + "to history (consider it lost)!", + dccp_role(sk), sk); + return; + } + + win_count = packet->dccphrx_ccval; + + ins = dccp_rx_hist_add_packet(ccid3_rx_hist, &hcrx->ccid3hcrx_hist, + &hcrx->ccid3hcrx_li_hist, packet); + + if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK) + return; + + switch (hcrx->ccid3hcrx_state) { + case TFRC_RSTATE_NO_DATA: + ccid3_pr_debug("%s, sk=%p(%s), skb=%p, sending initial " + "feedback\n", + dccp_role(sk), sk, + dccp_state_name(sk->sk_state), skb); + ccid3_hc_rx_send_feedback(sk); + ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA); + return; + case TFRC_RSTATE_DATA: + hcrx->ccid3hcrx_bytes_recv += skb->len - + dccp_hdr(skb)->dccph_doff * 4; + if (ins != 0) + break; + + do_gettimeofday(&now); + if (timeval_delta(&now, &hcrx->ccid3hcrx_tstamp_last_ack) >= + hcrx->ccid3hcrx_rtt) { + hcrx->ccid3hcrx_tstamp_last_ack = now; + ccid3_hc_rx_send_feedback(sk); + } + return; + default: + printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n", + __FUNCTION__, dccp_role(sk), sk, hcrx->ccid3hcrx_state); + dump_stack(); + return; + } + + /* Dealing with packet loss */ + ccid3_pr_debug("%s, sk=%p(%s), data loss! Reacting...\n", + dccp_role(sk), sk, dccp_state_name(sk->sk_state)); + + ccid3_hc_rx_detect_loss(sk); + p_prev = hcrx->ccid3hcrx_p; + + /* Calculate loss event rate */ + if (!list_empty(&hcrx->ccid3hcrx_li_hist)) + /* Scaling up by 1000000 as fixed decimal */ + hcrx->ccid3hcrx_p = 1000000 / dccp_li_hist_calc_i_mean(&hcrx->ccid3hcrx_li_hist); + + if (hcrx->ccid3hcrx_p > p_prev) { + ccid3_hc_rx_send_feedback(sk); + return; + } +} + +static int ccid3_hc_rx_init(struct sock *sk) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_rx_sock *hcrx; + + ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk); + + hcrx = dp->dccps_hc_rx_ccid_private = kmalloc(sizeof(*hcrx), + gfp_any()); + if (hcrx == NULL) + return -ENOMEM; + + memset(hcrx, 0, sizeof(*hcrx)); + + if (dp->dccps_packet_size >= TFRC_MIN_PACKET_SIZE && + dp->dccps_packet_size <= TFRC_MAX_PACKET_SIZE) + hcrx->ccid3hcrx_s = dp->dccps_packet_size; + else + hcrx->ccid3hcrx_s = TFRC_STD_PACKET_SIZE; + + hcrx->ccid3hcrx_state = TFRC_RSTATE_NO_DATA; + INIT_LIST_HEAD(&hcrx->ccid3hcrx_hist); + INIT_LIST_HEAD(&hcrx->ccid3hcrx_li_hist); + /* + * XXX this seems to be paranoid, need to think more about this, for + * now start with something different than zero. -acme + */ + hcrx->ccid3hcrx_rtt = USEC_PER_SEC / 5; + return 0; +} + +static void ccid3_hc_rx_exit(struct sock *sk) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; + + ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk); + + if (hcrx == NULL) + return; + + ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM); + + /* Empty packet history */ + dccp_rx_hist_purge(ccid3_rx_hist, &hcrx->ccid3hcrx_hist); + + /* Empty loss interval history */ + dccp_li_hist_purge(ccid3_li_hist, &hcrx->ccid3hcrx_li_hist); + + kfree(dp->dccps_hc_rx_ccid_private); + dp->dccps_hc_rx_ccid_private = NULL; +} + +static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info) +{ + const struct dccp_sock *dp = dccp_sk(sk); + const struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private; + + if (hcrx == NULL) + return; + + info->tcpi_ca_state = hcrx->ccid3hcrx_state; + info->tcpi_options |= TCPI_OPT_TIMESTAMPS; + info->tcpi_rcv_rtt = hcrx->ccid3hcrx_rtt; +} + +static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info) +{ + const struct dccp_sock *dp = dccp_sk(sk); + const struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private; + + if (hctx == NULL) + return; + + info->tcpi_rto = hctx->ccid3hctx_t_rto; + info->tcpi_rtt = hctx->ccid3hctx_rtt; +} + +static struct ccid ccid3 = { + .ccid_id = 3, + .ccid_name = "ccid3", + .ccid_owner = THIS_MODULE, + .ccid_init = ccid3_init, + .ccid_exit = ccid3_exit, + .ccid_hc_tx_init = ccid3_hc_tx_init, + .ccid_hc_tx_exit = ccid3_hc_tx_exit, + .ccid_hc_tx_send_packet = ccid3_hc_tx_send_packet, + .ccid_hc_tx_packet_sent = ccid3_hc_tx_packet_sent, + .ccid_hc_tx_packet_recv = ccid3_hc_tx_packet_recv, + .ccid_hc_tx_insert_options = ccid3_hc_tx_insert_options, + .ccid_hc_tx_parse_options = ccid3_hc_tx_parse_options, + .ccid_hc_rx_init = ccid3_hc_rx_init, + .ccid_hc_rx_exit = ccid3_hc_rx_exit, + .ccid_hc_rx_insert_options = ccid3_hc_rx_insert_options, + .ccid_hc_rx_packet_recv = ccid3_hc_rx_packet_recv, + .ccid_hc_rx_get_info = ccid3_hc_rx_get_info, + .ccid_hc_tx_get_info = ccid3_hc_tx_get_info, +}; + +module_param(ccid3_debug, int, 0444); +MODULE_PARM_DESC(ccid3_debug, "Enable debug messages"); + +static __init int ccid3_module_init(void) +{ + int rc = -ENOBUFS; + + ccid3_rx_hist = dccp_rx_hist_new("ccid3"); + if (ccid3_rx_hist == NULL) + goto out; + + ccid3_tx_hist = dccp_tx_hist_new("ccid3"); + if (ccid3_tx_hist == NULL) + goto out_free_rx; + + ccid3_li_hist = dccp_li_hist_new("ccid3"); + if (ccid3_li_hist == NULL) + goto out_free_tx; + + rc = ccid_register(&ccid3); + if (rc != 0) + goto out_free_loss_interval_history; +out: + return rc; + +out_free_loss_interval_history: + dccp_li_hist_delete(ccid3_li_hist); + ccid3_li_hist = NULL; +out_free_tx: + dccp_tx_hist_delete(ccid3_tx_hist); + ccid3_tx_hist = NULL; +out_free_rx: + dccp_rx_hist_delete(ccid3_rx_hist); + ccid3_rx_hist = NULL; + goto out; +} +module_init(ccid3_module_init); + +static __exit void ccid3_module_exit(void) +{ +#ifdef CONFIG_IP_DCCP_UNLOAD_HACK + /* + * Hack to use while developing, so that we get rid of the control + * sock, that is what keeps a refcount on dccp.ko -acme + */ + extern void dccp_ctl_sock_exit(void); + + dccp_ctl_sock_exit(); +#endif + ccid_unregister(&ccid3); + + if (ccid3_tx_hist != NULL) { + dccp_tx_hist_delete(ccid3_tx_hist); + ccid3_tx_hist = NULL; + } + if (ccid3_rx_hist != NULL) { + dccp_rx_hist_delete(ccid3_rx_hist); + ccid3_rx_hist = NULL; + } + if (ccid3_li_hist != NULL) { + dccp_li_hist_delete(ccid3_li_hist); + ccid3_li_hist = NULL; + } +} +module_exit(ccid3_module_exit); + +MODULE_AUTHOR("Ian McDonald <iam4@cs.waikato.ac.nz>, " + "Arnaldo Carvalho de Melo <acme@ghostprotocols.net>"); +MODULE_DESCRIPTION("DCCP TFRC CCID3 CCID"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("net-dccp-ccid-3"); diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h new file mode 100644 index 00000000000..ee8cbace663 --- /dev/null +++ b/net/dccp/ccids/ccid3.h @@ -0,0 +1,137 @@ +/* + * net/dccp/ccids/ccid3.h + * + * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. + * + * An implementation of the DCCP protocol + * + * This code has been developed by the University of Waikato WAND + * research group. For further information please see http://www.wand.net.nz/ + * or e-mail Ian McDonald - iam4@cs.waikato.ac.nz + * + * This code also uses code from Lulea University, rereleased as GPL by its + * authors: + * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon + * + * Changes to meet Linux coding standards, to make it meet latest ccid3 draft + * and to make it work as a loadable module in the DCCP stack written by + * Arnaldo Carvalho de Melo <acme@conectiva.com.br>. + * + * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#ifndef _DCCP_CCID3_H_ +#define _DCCP_CCID3_H_ + +#include <linux/config.h> +#include <linux/list.h> +#include <linux/time.h> +#include <linux/types.h> + +#define TFRC_MIN_PACKET_SIZE 16 +#define TFRC_STD_PACKET_SIZE 256 +#define TFRC_MAX_PACKET_SIZE 65535 + +/* Two seconds as per CCID3 spec */ +#define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC) + +/* In usecs - half the scheduling granularity as per RFC3448 4.6 */ +#define TFRC_OPSYS_HALF_TIME_GRAN (USEC_PER_SEC / (2 * HZ)) + +/* In seconds */ +#define TFRC_MAX_BACK_OFF_TIME 64 + +#define TFRC_SMALLEST_P 40 + +enum ccid3_options { + TFRC_OPT_LOSS_EVENT_RATE = 192, + TFRC_OPT_LOSS_INTERVALS = 193, + TFRC_OPT_RECEIVE_RATE = 194, +}; + +struct ccid3_options_received { + u64 ccid3or_seqno:48, + ccid3or_loss_intervals_idx:16; + u16 ccid3or_loss_intervals_len; + u32 ccid3or_loss_event_rate; + u32 ccid3or_receive_rate; +}; + +/** struct ccid3_hc_tx_sock - CCID3 sender half connection sock + * + * @ccid3hctx_state - Sender state + * @ccid3hctx_x - Current sending rate + * @ccid3hctx_x_recv - Receive rate + * @ccid3hctx_x_calc - Calculated send (?) rate + * @ccid3hctx_s - Packet size + * @ccid3hctx_rtt - Estimate of current round trip time in usecs + * @@ccid3hctx_p - Current loss event rate (0-1) scaled by 1000000 + * @ccid3hctx_last_win_count - Last window counter sent + * @ccid3hctx_t_last_win_count - Timestamp of earliest packet + * with last_win_count value sent + * @ccid3hctx_no_feedback_timer - Handle to no feedback timer + * @ccid3hctx_idle - FIXME + * @ccid3hctx_t_ld - Time last doubled during slow start + * @ccid3hctx_t_nom - Nominal send time of next packet + * @ccid3hctx_t_ipi - Interpacket (send) interval + * @ccid3hctx_delta - Send timer delta + * @ccid3hctx_hist - Packet history + */ +struct ccid3_hc_tx_sock { + u32 ccid3hctx_x; + u32 ccid3hctx_x_recv; + u32 ccid3hctx_x_calc; + u16 ccid3hctx_s; + u32 ccid3hctx_rtt; + u32 ccid3hctx_p; + u8 ccid3hctx_state; + u8 ccid3hctx_last_win_count; + u8 ccid3hctx_idle; + struct timeval ccid3hctx_t_last_win_count; + struct timer_list ccid3hctx_no_feedback_timer; + struct timeval ccid3hctx_t_ld; + struct timeval ccid3hctx_t_nom; + u32 ccid3hctx_t_rto; + u32 ccid3hctx_t_ipi; + u32 ccid3hctx_delta; + struct list_head ccid3hctx_hist; + struct ccid3_options_received ccid3hctx_options_received; +}; + +struct ccid3_hc_rx_sock { + u64 ccid3hcrx_seqno_last_counter:48, + ccid3hcrx_state:8, + ccid3hcrx_last_counter:4; + unsigned long ccid3hcrx_rtt; + u32 ccid3hcrx_p; + u32 ccid3hcrx_bytes_recv; + struct timeval ccid3hcrx_tstamp_last_feedback; + struct timeval ccid3hcrx_tstamp_last_ack; + struct list_head ccid3hcrx_hist; + struct list_head ccid3hcrx_li_hist; + u16 ccid3hcrx_s; + u32 ccid3hcrx_pinv; + u32 ccid3hcrx_elapsed_time; + u32 ccid3hcrx_x_recv; +}; + +#define ccid3_hc_tx_field(s,field) (s->dccps_hc_tx_ccid_private == NULL ? 0 : \ + ((struct ccid3_hc_tx_sock *)s->dccps_hc_tx_ccid_private)->ccid3hctx_##field) + +#define ccid3_hc_rx_field(s,field) (s->dccps_hc_rx_ccid_private == NULL ? 0 : \ + ((struct ccid3_hc_rx_sock *)s->dccps_hc_rx_ccid_private)->ccid3hcrx_##field) + +#endif /* _DCCP_CCID3_H_ */ diff --git a/net/dccp/ccids/lib/Makefile b/net/dccp/ccids/lib/Makefile new file mode 100644 index 00000000000..5f940a6cbac --- /dev/null +++ b/net/dccp/ccids/lib/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_IP_DCCP_TFRC_LIB) += dccp_tfrc_lib.o + +dccp_tfrc_lib-y := loss_interval.o packet_history.o tfrc_equation.o diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c new file mode 100644 index 00000000000..4c01a54143a --- /dev/null +++ b/net/dccp/ccids/lib/loss_interval.c @@ -0,0 +1,144 @@ +/* + * net/dccp/ccids/lib/loss_interval.c + * + * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. + * Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz> + * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/config.h> +#include <linux/module.h> + +#include "loss_interval.h" + +struct dccp_li_hist *dccp_li_hist_new(const char *name) +{ + struct dccp_li_hist *hist = kmalloc(sizeof(*hist), GFP_ATOMIC); + static const char dccp_li_hist_mask[] = "li_hist_%s"; + char *slab_name; + + if (hist == NULL) + goto out; + + slab_name = kmalloc(strlen(name) + sizeof(dccp_li_hist_mask) - 1, + GFP_ATOMIC); + if (slab_name == NULL) + goto out_free_hist; + + sprintf(slab_name, dccp_li_hist_mask, name); + hist->dccplih_slab = kmem_cache_create(slab_name, + sizeof(struct dccp_li_hist_entry), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (hist->dccplih_slab == NULL) + goto out_free_slab_name; +out: + return hist; +out_free_slab_name: + kfree(slab_name); +out_free_hist: + kfree(hist); + hist = NULL; + goto out; +} + +EXPORT_SYMBOL_GPL(dccp_li_hist_new); + +void dccp_li_hist_delete(struct dccp_li_hist *hist) +{ + const char* name = kmem_cache_name(hist->dccplih_slab); + + kmem_cache_destroy(hist->dccplih_slab); + kfree(name); + kfree(hist); +} + +EXPORT_SYMBOL_GPL(dccp_li_hist_delete); + +void dccp_li_hist_purge(struct dccp_li_hist *hist, struct list_head *list) +{ + struct dccp_li_hist_entry *entry, *next; + + list_for_each_entry_safe(entry, next, list, dccplih_node) { + list_del_init(&entry->dccplih_node); + kmem_cache_free(hist->dccplih_slab, entry); + } +} + +EXPORT_SYMBOL_GPL(dccp_li_hist_purge); + +/* Weights used to calculate loss event rate */ +/* + * These are integers as per section 8 of RFC3448. We can then divide by 4 * + * when we use it. + */ +static const int dccp_li_hist_w[DCCP_LI_HIST_IVAL_F_LENGTH] = { + 4, 4, 4, 4, 3, 2, 1, 1, +}; + +u32 dccp_li_hist_calc_i_mean(struct list_head *list) +{ + struct dccp_li_hist_entry *li_entry, *li_next; + int i = 0; + u32 i_tot; + u32 i_tot0 = 0; + u32 i_tot1 = 0; + u32 w_tot = 0; + + list_for_each_entry_safe(li_entry, li_next, list, dccplih_node) { + if (i < DCCP_LI_HIST_IVAL_F_LENGTH) { + i_tot0 += li_entry->dccplih_interval * dccp_li_hist_w[i]; + w_tot += dccp_li_hist_w[i]; + } + + if (i != 0) + i_tot1 += li_entry->dccplih_interval * dccp_li_hist_w[i - 1]; + + if (++i > DCCP_LI_HIST_IVAL_F_LENGTH) + break; + } + + if (i != DCCP_LI_HIST_IVAL_F_LENGTH) + return 0; + + i_tot = max(i_tot0, i_tot1); + + /* FIXME: Why do we do this? -Ian McDonald */ + if (i_tot * 4 < w_tot) + i_tot = w_tot * 4; + + return i_tot * 4 / w_tot; +} + +EXPORT_SYMBOL_GPL(dccp_li_hist_calc_i_mean); + +struct dccp_li_hist_entry *dccp_li_hist_interval_new(struct dccp_li_hist *hist, + struct list_head *list, + const u64 seq_loss, + const u8 win_loss) +{ + struct dccp_li_hist_entry *tail = NULL, *entry; + int i; + + for (i = 0; i <= DCCP_LI_HIST_IVAL_F_LENGTH; ++i) { + entry = dccp_li_hist_entry_new(hist, SLAB_ATOMIC); + if (entry == NULL) { + dccp_li_hist_purge(hist, list); + return NULL; + } + if (tail == NULL) + tail = entry; + list_add(&entry->dccplih_node, list); + } + + entry->dccplih_seqno = seq_loss; + entry->dccplih_win_count = win_loss; + return tail; +} + +EXPORT_SYMBOL_GPL(dccp_li_hist_interval_new); diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h new file mode 100644 index 00000000000..13ad47ba142 --- /dev/null +++ b/net/dccp/ccids/lib/loss_interval.h @@ -0,0 +1,61 @@ +#ifndef _DCCP_LI_HIST_ +#define _DCCP_LI_HIST_ +/* + * net/dccp/ccids/lib/loss_interval.h + * + * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. + * Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz> + * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include <linux/config.h> +#include <linux/list.h> +#include <linux/slab.h> +#include <linux/time.h> + +#define DCCP_LI_HIST_IVAL_F_LENGTH 8 + +struct dccp_li_hist { + kmem_cache_t *dccplih_slab; +}; + +extern struct dccp_li_hist *dccp_li_hist_new(const char *name); +extern void dccp_li_hist_delete(struct dccp_li_hist *hist); + +struct dccp_li_hist_entry { + struct list_head dccplih_node; + u64 dccplih_seqno:48, + dccplih_win_count:4; + u32 dccplih_interval; +}; + +static inline struct dccp_li_hist_entry * + dccp_li_hist_entry_new(struct dccp_li_hist *hist, + const unsigned int __nocast prio) +{ + return kmem_cache_alloc(hist->dccplih_slab, prio); +} + +static inline void dccp_li_hist_entry_delete(struct dccp_li_hist *hist, + struct dccp_li_hist_entry *entry) +{ + if (entry != NULL) + kmem_cache_free(hist->dccplih_slab, entry); +} + +extern void dccp_li_hist_purge(struct dccp_li_hist *hist, + struct list_head *list); + +extern u32 dccp_li_hist_calc_i_mean(struct list_head *list); + +extern struct dccp_li_hist_entry * + dccp_li_hist_interval_new(struct dccp_li_hist *hist, + struct list_head *list, + const u64 seq_loss, + const u8 win_loss); +#endif /* _DCCP_LI_HIST_ */ diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c new file mode 100644 index 00000000000..d3f9d205383 --- /dev/null +++ b/net/dccp/ccids/lib/packet_history.c @@ -0,0 +1,398 @@ +/* + * net/dccp/packet_history.h + * + * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. + * + * An implementation of the DCCP protocol + * + * This code has been developed by the University of Waikato WAND + * research group. For further information please see http://www.wand.net.nz/ + * or e-mail Ian McDonald - iam4@cs.waikato.ac.nz + * + * This code also uses code from Lulea University, rereleased as GPL by its + * authors: + * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon + * + * Changes to meet Linux coding standards, to make it meet latest ccid3 draft + * and to make it work as a loadable module in the DCCP stack written by + * Arnaldo Carvalho de Melo <acme@conectiva.com.br>. + * + * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/string.h> + +#include "packet_history.h" + +struct dccp_rx_hist *dccp_rx_hist_new(const char *name) +{ + struct dccp_rx_hist *hist = kmalloc(sizeof(*hist), GFP_ATOMIC); + static const char dccp_rx_hist_mask[] = "rx_hist_%s"; + char *slab_name; + + if (hist == NULL) + goto out; + + slab_name = kmalloc(strlen(name) + sizeof(dccp_rx_hist_mask) - 1, + GFP_ATOMIC); + if (slab_name == NULL) + goto out_free_hist; + + sprintf(slab_name, dccp_rx_hist_mask, name); + hist->dccprxh_slab = kmem_cache_create(slab_name, + sizeof(struct dccp_rx_hist_entry), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (hist->dccprxh_slab == NULL) + goto out_free_slab_name; +out: + return hist; +out_free_slab_name: + kfree(slab_name); +out_free_hist: + kfree(hist); + hist = NULL; + goto out; +} + +EXPORT_SYMBOL_GPL(dccp_rx_hist_new); + +void dccp_rx_hist_delete(struct dccp_rx_hist *hist) +{ + const char* name = kmem_cache_name(hist->dccprxh_slab); + + kmem_cache_destroy(hist->dccprxh_slab); + kfree(name); + kfree(hist); +} + +EXPORT_SYMBOL_GPL(dccp_rx_hist_delete); + +void dccp_rx_hist_purge(struct dccp_rx_hist *hist, struct list_head *list) +{ + struct dccp_rx_hist_entry *entry, *next; + + list_for_each_entry_safe(entry, next, list, dccphrx_node) { + list_del_init(&entry->dccphrx_node); + kmem_cache_free(hist->dccprxh_slab, entry); + } +} + +EXPORT_SYMBOL_GPL(dccp_rx_hist_purge); + +struct dccp_rx_hist_entry * + dccp_rx_hist_find_data_packet(const struct list_head *list) +{ + struct dccp_rx_hist_entry *entry, *packet = NULL; + + list_for_each_entry(entry, list, dccphrx_node) + if (entry->dccphrx_type == DCCP_PKT_DATA || + entry->dccphrx_type == DCCP_PKT_DATAACK) { + packet = entry; + break; + } + + return packet; +} + +EXPORT_SYMBOL_GPL(dccp_rx_hist_find_data_packet); + +int dccp_rx_hist_add_packet(struct dccp_rx_hist *hist, + struct list_head *rx_list, + struct list_head *li_list, + struct dccp_rx_hist_entry *packet) +{ + struct dccp_rx_hist_entry *entry, *next, *iter; + u8 num_later = 0; + + iter = dccp_rx_hist_head(rx_list); + if (iter == NULL) + dccp_rx_hist_add_entry(rx_list, packet); + else { + const u64 seqno = packet->dccphrx_seqno; + + if (after48(seqno, iter->dccphrx_seqno)) + dccp_rx_hist_add_entry(rx_list, packet); + else { + if (dccp_rx_hist_entry_data_packet(iter)) + num_later = 1; + + list_for_each_entry_continue(iter, rx_list, + dccphrx_node) { + if (after48(seqno, iter->dccphrx_seqno)) { + dccp_rx_hist_add_entry(&iter->dccphrx_node, + packet); + goto trim_history; + } + + if (dccp_rx_hist_entry_data_packet(iter)) + num_later++; + + if (num_later == TFRC_RECV_NUM_LATE_LOSS) { + dccp_rx_hist_entry_delete(hist, packet); + return 1; + } + } + + if (num_later < TFRC_RECV_NUM_LATE_LOSS) + dccp_rx_hist_add_entry(rx_list, packet); + /* + * FIXME: else what? should we destroy the packet + * like above? + */ + } + } + +trim_history: + /* + * Trim history (remove all packets after the NUM_LATE_LOSS + 1 + * data packets) + */ + num_later = TFRC_RECV_NUM_LATE_LOSS + 1; + + if (!list_empty(li_list)) { + list_for_each_entry_safe(entry, next, rx_list, dccphrx_node) { + if (num_later == 0) { + list_del_init(&entry->dccphrx_node); + dccp_rx_hist_entry_delete(hist, entry); + } else if (dccp_rx_hist_entry_data_packet(entry)) + --num_later; + } + } else { + int step = 0; + u8 win_count = 0; /* Not needed, but lets shut up gcc */ + int tmp; + /* + * We have no loss interval history so we need at least one + * rtt:s of data packets to approximate rtt. + */ + list_for_each_entry_safe(entry, next, rx_list, dccphrx_node) { + if (num_later == 0) { + switch (step) { + case 0: + step = 1; + /* OK, find next data packet */ + num_later = 1; + break; + case 1: + step = 2; + /* OK, find next data packet */ + num_later = 1; + win_count = entry->dccphrx_ccval; + break; + case 2: + tmp = win_count - entry->dccphrx_ccval; + if (tmp < 0) + tmp += TFRC_WIN_COUNT_LIMIT; + if (tmp > TFRC_WIN_COUNT_PER_RTT + 1) { + /* + * We have found a packet older + * than one rtt remove the rest + */ + step = 3; + } else /* OK, find next data packet */ + num_later = 1; + break; + case 3: + list_del_init(&entry->dccphrx_node); + dccp_rx_hist_entry_delete(hist, entry); + break; + } + } else if (dccp_rx_hist_entry_data_packet(entry)) + --num_later; + } + } + + return 0; +} + +EXPORT_SYMBOL_GPL(dccp_rx_hist_add_packet); + +u64 dccp_rx_hist_detect_loss(struct list_head *rx_list, + struct list_head *li_list, u8 *win_loss) +{ + struct dccp_rx_hist_entry *entry, *next, *packet; + struct dccp_rx_hist_entry *a_loss = NULL; + struct dccp_rx_hist_entry *b_loss = NULL; + u64 seq_loss = DCCP_MAX_SEQNO + 1; + u8 num_later = TFRC_RECV_NUM_LATE_LOSS; + + list_for_each_entry_safe(entry, next, rx_list, dccphrx_node) { + if (num_later == 0) { + b_loss = entry; + break; + } else if (dccp_rx_hist_entry_data_packet(entry)) + --num_later; + } + + if (b_loss == NULL) + goto out; + + num_later = 1; + list_for_each_entry_safe_continue(entry, next, rx_list, dccphrx_node) { + if (num_later == 0) { + a_loss = entry; + break; + } else if (dccp_rx_hist_entry_data_packet(entry)) + --num_later; + } + + if (a_loss == NULL) { + if (list_empty(li_list)) { + /* no loss event have occured yet */ + LIMIT_NETDEBUG("%s: TODO: find a lost data packet by " + "comparing to initial seqno\n", + __FUNCTION__); + goto out; + } else { + LIMIT_NETDEBUG("%s: Less than 4 data pkts in history!", + __FUNCTION__); + goto out; + } + } + + /* Locate a lost data packet */ + entry = packet = b_loss; + list_for_each_entry_safe_continue(entry, next, rx_list, dccphrx_node) { + u64 delta = dccp_delta_seqno(entry->dccphrx_seqno, + packet->dccphrx_seqno); + + if (delta != 0) { + if (dccp_rx_hist_entry_data_packet(packet)) + --delta; + /* + * FIXME: check this, probably this % usage is because + * in earlier drafts the ndp count was just 8 bits + * long, but now it cam be up to 24 bits long. + */ +#if 0 + if (delta % DCCP_NDP_LIMIT != + (packet->dccphrx_ndp - + entry->dccphrx_ndp) % DCCP_NDP_LIMIT) +#endif + if (delta != packet->dccphrx_ndp - entry->dccphrx_ndp) { + seq_loss = entry->dccphrx_seqno; + dccp_inc_seqno(&seq_loss); + } + } + packet = entry; + if (packet == a_loss) + break; + } +out: + if (seq_loss != DCCP_MAX_SEQNO + 1) + *win_loss = a_loss->dccphrx_ccval; + else + *win_loss = 0; /* Paranoia */ + + return seq_loss; +} + +EXPORT_SYMBOL_GPL(dccp_rx_hist_detect_loss); + +struct dccp_tx_hist *dccp_tx_hist_new(const char *name) +{ + struct dccp_tx_hist *hist = kmalloc(sizeof(*hist), GFP_ATOMIC); + static const char dccp_tx_hist_mask[] = "tx_hist_%s"; + char *slab_name; + + if (hist == NULL) + goto out; + + slab_name = kmalloc(strlen(name) + sizeof(dccp_tx_hist_mask) - 1, + GFP_ATOMIC); + if (slab_name == NULL) + goto out_free_hist; + + sprintf(slab_name, dccp_tx_hist_mask, name); + hist->dccptxh_slab = kmem_cache_create(slab_name, + sizeof(struct dccp_tx_hist_entry), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (hist->dccptxh_slab == NULL) + goto out_free_slab_name; +out: + return hist; +out_free_slab_name: + kfree(slab_name); +out_free_hist: + kfree(hist); + hist = NULL; + goto out; +} + +EXPORT_SYMBOL_GPL(dccp_tx_hist_new); + +void dccp_tx_hist_delete(struct dccp_tx_hist *hist) +{ + const char* name = kmem_cache_name(hist->dccptxh_slab); + + kmem_cache_destroy(hist->dccptxh_slab); + kfree(name); + kfree(hist); +} + +EXPORT_SYMBOL_GPL(dccp_tx_hist_delete); + +struct dccp_tx_hist_entry * + dccp_tx_hist_find_entry(const struct list_head *list, const u64 seq) +{ + struct dccp_tx_hist_entry *packet = NULL, *entry; + + list_for_each_entry(entry, list, dccphtx_node) + if (entry->dccphtx_seqno == seq) { + packet = entry; + break; + } + + return packet; +} + +EXPORT_SYMBOL_GPL(dccp_tx_hist_find_entry); + +void dccp_tx_hist_purge_older(struct dccp_tx_hist *hist, + struct list_head *list, + struct dccp_tx_hist_entry *packet) +{ + struct dccp_tx_hist_entry *next; + + list_for_each_entry_safe_continue(packet, next, list, dccphtx_node) { + list_del_init(&packet->dccphtx_node); + dccp_tx_hist_entry_delete(hist, packet); + } +} + +EXPORT_SYMBOL_GPL(dccp_tx_hist_purge_older); + +void dccp_tx_hist_purge(struct dccp_tx_hist *hist, struct list_head *list) +{ + struct dccp_tx_hist_entry *entry, *next; + + list_for_each_entry_safe(entry, next, list, dccphtx_node) { + list_del_init(&entry->dccphtx_node); + dccp_tx_hist_entry_delete(hist, entry); + } +} + +EXPORT_SYMBOL_GPL(dccp_tx_hist_purge); + +MODULE_AUTHOR("Ian McDonald <iam4@cs.waikato.ac.nz>, " + "Arnaldo Carvalho de Melo <acme@ghostprotocols.net>"); +MODULE_DESCRIPTION("DCCP TFRC library"); +MODULE_LICENSE("GPL"); diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h new file mode 100644 index 00000000000..fb90a91aa93 --- /dev/null +++ b/net/dccp/ccids/lib/packet_history.h @@ -0,0 +1,199 @@ +/* + * net/dccp/packet_history.h + * + * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. + * + * An implementation of the DCCP protocol + * + * This code has been developed by the University of Waikato WAND + * research group. For further information please see http://www.wand.net.nz/ + * or e-mail Ian McDonald - iam4@cs.waikato.ac.nz + * + * This code also uses code from Lulea University, rereleased as GPL by its + * authors: + * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon + * + * Changes to meet Linux coding standards, to make it meet latest ccid3 draft + * and to make it work as a loadable module in the DCCP stack written by + * Arnaldo Carvalho de Melo <acme@conectiva.com.br>. + * + * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef _DCCP_PKT_HIST_ +#define _DCCP_PKT_HIST_ + +#include <linux/config.h> +#include <linux/list.h> +#include <linux/slab.h> +#include <linux/time.h> + +#include "../../dccp.h" + +/* Number of later packets received before one is considered lost */ +#define TFRC_RECV_NUM_LATE_LOSS 3 + +#define TFRC_WIN_COUNT_PER_RTT 4 +#define TFRC_WIN_COUNT_LIMIT 16 + +struct dccp_tx_hist_entry { + struct list_head dccphtx_node; + u64 dccphtx_seqno:48, + dccphtx_ccval:4, + dccphtx_sent:1; + u32 dccphtx_rtt; + struct timeval dccphtx_tstamp; +}; + +struct dccp_rx_hist_entry { + struct list_head dccphrx_node; + u64 dccphrx_seqno:48, + dccphrx_ccval:4, + dccphrx_type:4; + u32 dccphrx_ndp; /* In fact it is from 8 to 24 bits */ + struct timeval dccphrx_tstamp; +}; + +struct dccp_tx_hist { + kmem_cache_t *dccptxh_slab; +}; + +extern struct dccp_tx_hist *dccp_tx_hist_new(const char *name); +extern void dccp_tx_hist_delete(struct dccp_tx_hist *hist); + +struct dccp_rx_hist { + kmem_cache_t *dccprxh_slab; +}; + +extern struct dccp_rx_hist *dccp_rx_hist_new(const char *name); +extern void dccp_rx_hist_delete(struct dccp_rx_hist *hist); +extern struct dccp_rx_hist_entry * + dccp_rx_hist_find_data_packet(const struct list_head *list); + +static inline struct dccp_tx_hist_entry * + dccp_tx_hist_entry_new(struct dccp_tx_hist *hist, + const unsigned int __nocast prio) +{ + struct dccp_tx_hist_entry *entry = kmem_cache_alloc(hist->dccptxh_slab, + prio); + + if (entry != NULL) + entry->dccphtx_sent = 0; + + return entry; +} + +static inline void dccp_tx_hist_entry_delete(struct dccp_tx_hist *hist, + struct dccp_tx_hist_entry *entry) +{ + if (entry != NULL) + kmem_cache_free(hist->dccptxh_slab, entry); +} + +extern struct dccp_tx_hist_entry * + dccp_tx_hist_find_entry(const struct list_head *list, + const u64 seq); + +static inline void dccp_tx_hist_add_entry(struct list_head *list, + struct dccp_tx_hist_entry *entry) +{ + list_add(&entry->dccphtx_node, list); +} + +extern void dccp_tx_hist_purge_older(struct dccp_tx_hist *hist, + struct list_head *list, + struct dccp_tx_hist_entry *next); + +extern void dccp_tx_hist_purge(struct dccp_tx_hist *hist, + struct list_head *list); + +static inline struct dccp_tx_hist_entry * + dccp_tx_hist_head(struct list_head *list) +{ + struct dccp_tx_hist_entry *head = NULL; + + if (!list_empty(list)) + head = list_entry(list->next, struct dccp_tx_hist_entry, + dccphtx_node); + return head; +} + +static inline struct dccp_rx_hist_entry * + dccp_rx_hist_entry_new(struct dccp_rx_hist *hist, + const u32 ndp, + const struct sk_buff *skb, + const unsigned int __nocast prio) +{ + struct dccp_rx_hist_entry *entry = kmem_cache_alloc(hist->dccprxh_slab, + prio); + + if (entry != NULL) { + const struct dccp_hdr *dh = dccp_hdr(skb); + + entry->dccphrx_seqno = DCCP_SKB_CB(skb)->dccpd_seq; + entry->dccphrx_ccval = dh->dccph_ccval; + entry->dccphrx_type = dh->dccph_type; + entry->dccphrx_ndp = ndp; + do_gettimeofday(&(entry->dccphrx_tstamp)); + } + + return entry; +} + +static inline void dccp_rx_hist_entry_delete(struct dccp_rx_hist *hist, + struct dccp_rx_hist_entry *entry) +{ + if (entry != NULL) + kmem_cache_free(hist->dccprxh_slab, entry); +} + +extern void dccp_rx_hist_purge(struct dccp_rx_hist *hist, + struct list_head *list); + +static inline void dccp_rx_hist_add_entry(struct list_head *list, + struct dccp_rx_hist_entry *entry) +{ + list_add(&entry->dccphrx_node, list); +} + +static inline struct dccp_rx_hist_entry * + dccp_rx_hist_head(struct list_head *list) +{ + struct dccp_rx_hist_entry *head = NULL; + + if (!list_empty(list)) + head = list_entry(list->next, struct dccp_rx_hist_entry, + dccphrx_node); + return head; +} + +static inline int + dccp_rx_hist_entry_data_packet(const struct dccp_rx_hist_entry *entry) +{ + return entry->dccphrx_type == DCCP_PKT_DATA || + entry->dccphrx_type == DCCP_PKT_DATAACK; +} + +extern int dccp_rx_hist_add_packet(struct dccp_rx_hist *hist, + struct list_head *rx_list, + struct list_head *li_list, + struct dccp_rx_hist_entry *packet); + +extern u64 dccp_rx_hist_detect_loss(struct list_head *rx_list, + struct list_head *li_list, u8 *win_loss); + +#endif /* _DCCP_PKT_HIST_ */ diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h new file mode 100644 index 00000000000..130c4c40cfe --- /dev/null +++ b/net/dccp/ccids/lib/tfrc.h @@ -0,0 +1,22 @@ +#ifndef _TFRC_H_ +#define _TFRC_H_ +/* + * net/dccp/ccids/lib/tfrc.h + * + * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. + * Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz> + * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> + * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/types.h> + +extern u32 tfrc_calc_x(u16 s, u32 R, u32 p); +extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue); + +#endif /* _TFRC_H_ */ diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c new file mode 100644 index 00000000000..d2b5933b451 --- /dev/null +++ b/net/dccp/ccids/lib/tfrc_equation.c @@ -0,0 +1,644 @@ +/* + * net/dccp/ccids/lib/tfrc_equation.c + * + * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. + * Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz> + * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> + * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/config.h> +#include <linux/module.h> + +#include <asm/bug.h> +#include <asm/div64.h> + +#include "tfrc.h" + +#define TFRC_CALC_X_ARRSIZE 500 + +#define TFRC_CALC_X_SPLIT 50000 +/* equivalent to 0.05 */ + +static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = { + { 37172, 8172 }, + { 53499, 11567 }, + { 66664, 14180 }, + { 78298, 16388 }, + { 89021, 18339 }, + { 99147, 20108 }, + { 108858, 21738 }, + { 118273, 23260 }, + { 127474, 24693 }, + { 136520, 26052 }, + { 145456, 27348 }, + { 154316, 28589 }, + { 163130, 29783 }, + { 171919, 30935 }, + { 180704, 32049 }, + { 189502, 33130 }, + { 198328, 34180 }, + { 207194, 35202 }, + { 216114, 36198 }, + { 225097, 37172 }, + { 234153, 38123 }, + { 243294, 39055 }, + { 252527, 39968 }, + { 261861, 40864 }, + { 271305, 41743 }, + { 280866, 42607 }, + { 290553, 43457 }, + { 300372, 44293 }, + { 310333, 45117 }, + { 320441, 45929 }, + { 330705, 46729 }, + { 341131, 47518 }, + { 351728, 48297 }, + { 362501, 49066 }, + { 373460, 49826 }, + { 384609, 50577 }, + { 395958, 51320 }, + { 407513, 52054 }, + { 419281, 52780 }, + { 431270, 53499 }, + { 443487, 54211 }, + { 455940, 54916 }, + { 468635, 55614 }, + { 481581, 56306 }, + { 494785, 56991 }, + { 508254, 57671 }, + { 521996, 58345 }, + { 536019, 59014 }, + { 550331, 59677 }, + { 564939, 60335 }, + { 579851, 60988 }, + { 595075, 61636 }, + { 610619, 62279 }, + { 626491, 62918 }, + { 642700, 63553 }, + { 659253, 64183 }, + { 676158, 64809 }, + { 693424, 65431 }, + { 711060, 66050 }, + { 729073, 66664 }, + { 747472, 67275 }, + { 766266, 67882 }, + { 785464, 68486 }, + { 805073, 69087 }, + { 825103, 69684 }, + { 845562, 70278 }, + { 866460, 70868 }, + { 887805, 71456 }, + { 909606, 72041 }, + { 931873, 72623 }, + { 954614, 73202 }, + { 977839, 73778 }, + { 1001557, 74352 }, + { 1025777, 74923 }, + { 1050508, 75492 }, + { 1075761, 76058 }, + { 1101544, 76621 }, + { 1127867, 77183 }, + { 1154739, 77741 }, + { 1182172, 78298 }, + { 1210173, 78852 }, + { 1238753, 79405 }, + { 1267922, 79955 }, + { 1297689, 80503 }, + { 1328066, 81049 }, + { 1359060, 81593 }, + { 1390684, 82135 }, + { 1422947, 82675 }, + { 1455859, 83213 }, + { 1489430, 83750 }, + { 1523671, 84284 }, + { 1558593, 84817 }, + { 1594205, 85348 }, + { 1630518, 85878 }, + { 1667543, 86406 }, + { 1705290, 86932 }, + { 1743770, 87457 }, + { 1782994, 87980 }, + { 1822973, 88501 }, + { 1863717, 89021 }, + { 1905237, 89540 }, + { 1947545, 90057 }, + { 1990650, 90573 }, + { 2034566, 91087 }, + { 2079301, 91600 }, + { 2124869, 92111 }, + { 2171279, 92622 }, + { 2218543, 93131 }, + { 2266673, 93639 }, + { 2315680, 94145 }, + { 2365575, 94650 }, + { 2416371, 95154 }, + { 2468077, 95657 }, + { 2520707, 96159 }, + { 2574271, 96660 }, + { 2628782, 97159 }, + { 2684250, 97658 }, + { 2740689, 98155 }, + { 2798110, 98651 }, + { 2856524, 99147 }, + { 2915944, 99641 }, + { 2976382, 100134 }, + { 3037850, 100626 }, + { 3100360, 101117 }, + { 3163924, 101608 }, + { 3228554, 102097 }, + { 3294263, 102586 }, + { 3361063, 103073 }, + { 3428966, 103560 }, + { 3497984, 104045 }, + { 3568131, 104530 }, + { 3639419, 105014 }, + { 3711860, 105498 }, + { 3785467, 105980 }, + { 3860253, 106462 }, + { 3936229, 106942 }, + { 4013410, 107422 }, + { 4091808, 107902 }, + { 4171435, 108380 }, + { 4252306, 108858 }, + { 4334431, 109335 }, + { 4417825, 109811 }, + { 4502501, 110287 }, + { 4588472, 110762 }, + { 4675750, 111236 }, + { 4764349, 111709 }, + { 4854283, 112182 }, + { 4945564, 112654 }, + { 5038206, 113126 }, + { 5132223, 113597 }, + { 5227627, 114067 }, + { 5324432, 114537 }, + { 5422652, 115006 }, + { 5522299, 115474 }, + { 5623389, 115942 }, + { 5725934, 116409 }, + { 5829948, 116876 }, + { 5935446, 117342 }, + { 6042439, 117808 }, + { 6150943, 118273 }, + { 6260972, 118738 }, + { 6372538, 119202 }, + { 6485657, 119665 }, + { 6600342, 120128 }, + { 6716607, 120591 }, + { 6834467, 121053 }, + { 6953935, 121514 }, + { 7075025, 121976 }, + { 7197752, 122436 }, + { 7322131, 122896 }, + { 7448175, 123356 }, + { 7575898, 123815 }, + { 7705316, 124274 }, + { 7836442, 124733 }, + { 7969291, 125191 }, + { 8103877, 125648 }, + { 8240216, 126105 }, + { 8378321, 126562 }, + { 8518208, 127018 }, + { 8659890, 127474 }, + { 8803384, 127930 }, + { 8948702, 128385 }, + { 9095861, 128840 }, + { 9244875, 129294 }, + { 9395760, 129748 }, + { 9548529, 130202 }, + { 9703198, 130655 }, + { 9859782, 131108 }, + { 10018296, 131561 }, + { 10178755, 132014 }, + { 10341174, 132466 }, + { 10505569, 132917 }, + { 10671954, 133369 }, + { 10840345, 133820 }, + { 11010757, 134271 }, + { 11183206, 134721 }, + { 11357706, 135171 }, + { 11534274, 135621 }, + { 11712924, 136071 }, + { 11893673, 136520 }, + { 12076536, 136969 }, + { 12261527, 137418 }, + { 12448664, 137867 }, + { 12637961, 138315 }, + { 12829435, 138763 }, + { 13023101, 139211 }, + { 13218974, 139658 }, + { 13417071, 140106 }, + { 13617407, 140553 }, + { 13819999, 140999 }, + { 14024862, 141446 }, + { 14232012, 141892 }, + { 14441465, 142339 }, + { 14653238, 142785 }, + { 14867346, 143230 }, + { 15083805, 143676 }, + { 15302632, 144121 }, + { 15523842, 144566 }, + { 15747453, 145011 }, + { 15973479, 145456 }, + { 16201939, 145900 }, + { 16432847, 146345 }, + { 16666221, 146789 }, + { 16902076, 147233 }, + { 17140429, 147677 }, + { 17381297, 148121 }, + { 17624696, 148564 }, + { 17870643, 149007 }, + { 18119154, 149451 }, + { 18370247, 149894 }, + { 18623936, 150336 }, + { 18880241, 150779 }, + { 19139176, 151222 }, + { 19400759, 151664 }, + { 19665007, 152107 }, + { 19931936, 152549 }, + { 20201564, 152991 }, + { 20473907, 153433 }, + { 20748982, 153875 }, + { 21026807, 154316 }, + { 21307399, 154758 }, + { 21590773, 155199 }, + { 21876949, 155641 }, + { 22165941, 156082 }, + { 22457769, 156523 }, + { 22752449, 156964 }, + { 23049999, 157405 }, + { 23350435, 157846 }, + { 23653774, 158287 }, + { 23960036, 158727 }, + { 24269236, 159168 }, + { 24581392, 159608 }, + { 24896521, 160049 }, + { 25214642, 160489 }, + { 25535772, 160929 }, + { 25859927, 161370 }, + { 26187127, 161810 }, + { 26517388, 162250 }, + { 26850728, 162690 }, + { 27187165, 163130 }, + { 27526716, 163569 }, + { 27869400, 164009 }, + { 28215234, 164449 }, + { 28564236, 164889 }, + { 28916423, 165328 }, + { 29271815, 165768 }, + { 29630428, 166208 }, + { 29992281, 166647 }, + { 30357392, 167087 }, + { 30725779, 167526 }, + { 31097459, 167965 }, + { 31472452, 168405 }, + { 31850774, 168844 }, + { 32232445, 169283 }, + { 32617482, 169723 }, + { 33005904, 170162 }, + { 33397730, 170601 }, + { 33792976, 171041 }, + { 34191663, 171480 }, + { 34593807, 171919 }, + { 34999428, 172358 }, + { 35408544, 172797 }, + { 35821174, 173237 }, + { 36237335, 173676 }, + { 36657047, 174115 }, + { 37080329, 174554 }, + { 37507197, 174993 }, + { 37937673, 175433 }, + { 38371773, 175872 }, + { 38809517, 176311 }, + { 39250924, 176750 }, + { 39696012, 177190 }, + { 40144800, 177629 }, + { 40597308, 178068 }, + { 41053553, 178507 }, + { 41513554, 178947 }, + { 41977332, 179386 }, + { 42444904, 179825 }, + { 42916290, 180265 }, + { 43391509, 180704 }, + { 43870579, 181144 }, + { 44353520, 181583 }, + { 44840352, 182023 }, + { 45331092, 182462 }, + { 45825761, 182902 }, + { 46324378, 183342 }, + { 46826961, 183781 }, + { 47333531, 184221 }, + { 47844106, 184661 }, + { 48358706, 185101 }, + { 48877350, 185541 }, + { 49400058, 185981 }, + { 49926849, 186421 }, + { 50457743, 186861 }, + { 50992759, 187301 }, + { 51531916, 187741 }, + { 52075235, 188181 }, + { 52622735, 188622 }, + { 53174435, 189062 }, + { 53730355, 189502 }, + { 54290515, 189943 }, + { 54854935, 190383 }, + { 55423634, 190824 }, + { 55996633, 191265 }, + { 56573950, 191706 }, + { 57155606, 192146 }, + { 57741621, 192587 }, + { 58332014, 193028 }, + { 58926806, 193470 }, + { 59526017, 193911 }, + { 60129666, 194352 }, + { 60737774, 194793 }, + { 61350361, 195235 }, + { 61967446, 195677 }, + { 62589050, 196118 }, + { 63215194, 196560 }, + { 63845897, 197002 }, + { 64481179, 197444 }, + { 65121061, 197886 }, + { 65765563, 198328 }, + { 66414705, 198770 }, + { 67068508, 199213 }, + { 67726992, 199655 }, + { 68390177, 200098 }, + { 69058085, 200540 }, + { 69730735, 200983 }, + { 70408147, 201426 }, + { 71090343, 201869 }, + { 71777343, 202312 }, + { 72469168, 202755 }, + { 73165837, 203199 }, + { 73867373, 203642 }, + { 74573795, 204086 }, + { 75285124, 204529 }, + { 76001380, 204973 }, + { 76722586, 205417 }, + { 77448761, 205861 }, + { 78179926, 206306 }, + { 78916102, 206750 }, + { 79657310, 207194 }, + { 80403571, 207639 }, + { 81154906, 208084 }, + { 81911335, 208529 }, + { 82672880, 208974 }, + { 83439562, 209419 }, + { 84211402, 209864 }, + { 84988421, 210309 }, + { 85770640, 210755 }, + { 86558080, 211201 }, + { 87350762, 211647 }, + { 88148708, 212093 }, + { 88951938, 212539 }, + { 89760475, 212985 }, + { 90574339, 213432 }, + { 91393551, 213878 }, + { 92218133, 214325 }, + { 93048107, 214772 }, + { 93883493, 215219 }, + { 94724314, 215666 }, + { 95570590, 216114 }, + { 96422343, 216561 }, + { 97279594, 217009 }, + { 98142366, 217457 }, + { 99010679, 217905 }, + { 99884556, 218353 }, + { 100764018, 218801 }, + { 101649086, 219250 }, + { 102539782, 219698 }, + { 103436128, 220147 }, + { 104338146, 220596 }, + { 105245857, 221046 }, + { 106159284, 221495 }, + { 107078448, 221945 }, + { 108003370, 222394 }, + { 108934074, 222844 }, + { 109870580, 223294 }, + { 110812910, 223745 }, + { 111761087, 224195 }, + { 112715133, 224646 }, + { 113675069, 225097 }, + { 114640918, 225548 }, + { 115612702, 225999 }, + { 116590442, 226450 }, + { 117574162, 226902 }, + { 118563882, 227353 }, + { 119559626, 227805 }, + { 120561415, 228258 }, + { 121569272, 228710 }, + { 122583219, 229162 }, + { 123603278, 229615 }, + { 124629471, 230068 }, + { 125661822, 230521 }, + { 126700352, 230974 }, + { 127745083, 231428 }, + { 128796039, 231882 }, + { 129853241, 232336 }, + { 130916713, 232790 }, + { 131986475, 233244 }, + { 133062553, 233699 }, + { 134144966, 234153 }, + { 135233739, 234608 }, + { 136328894, 235064 }, + { 137430453, 235519 }, + { 138538440, 235975 }, + { 139652876, 236430 }, + { 140773786, 236886 }, + { 141901190, 237343 }, + { 143035113, 237799 }, + { 144175576, 238256 }, + { 145322604, 238713 }, + { 146476218, 239170 }, + { 147636442, 239627 }, + { 148803298, 240085 }, + { 149976809, 240542 }, + { 151156999, 241000 }, + { 152343890, 241459 }, + { 153537506, 241917 }, + { 154737869, 242376 }, + { 155945002, 242835 }, + { 157158929, 243294 }, + { 158379673, 243753 }, + { 159607257, 244213 }, + { 160841704, 244673 }, + { 162083037, 245133 }, + { 163331279, 245593 }, + { 164586455, 246054 }, + { 165848586, 246514 }, + { 167117696, 246975 }, + { 168393810, 247437 }, + { 169676949, 247898 }, + { 170967138, 248360 }, + { 172264399, 248822 }, + { 173568757, 249284 }, + { 174880235, 249747 }, + { 176198856, 250209 }, + { 177524643, 250672 }, + { 178857621, 251136 }, + { 180197813, 251599 }, + { 181545242, 252063 }, + { 182899933, 252527 }, + { 184261908, 252991 }, + { 185631191, 253456 }, + { 187007807, 253920 }, + { 188391778, 254385 }, + { 189783129, 254851 }, + { 191181884, 255316 }, + { 192588065, 255782 }, + { 194001698, 256248 }, + { 195422805, 256714 }, + { 196851411, 257181 }, + { 198287540, 257648 }, + { 199731215, 258115 }, + { 201182461, 258582 }, + { 202641302, 259050 }, + { 204107760, 259518 }, + { 205581862, 259986 }, + { 207063630, 260454 }, + { 208553088, 260923 }, + { 210050262, 261392 }, + { 211555174, 261861 }, + { 213067849, 262331 }, + { 214588312, 262800 }, + { 216116586, 263270 }, + { 217652696, 263741 }, + { 219196666, 264211 }, + { 220748520, 264682 }, + { 222308282, 265153 }, + { 223875978, 265625 }, + { 225451630, 266097 }, + { 227035265, 266569 }, + { 228626905, 267041 }, + { 230226576, 267514 }, + { 231834302, 267986 }, + { 233450107, 268460 }, + { 235074016, 268933 }, + { 236706054, 269407 }, + { 238346244, 269881 }, + { 239994613, 270355 }, + { 241651183, 270830 }, + { 243315981, 271305 } +}; + +/* Calculate the send rate as per section 3.1 of RFC3448 + +Returns send rate in bytes per second + +Integer maths and lookups are used as not allowed floating point in kernel + +The function for Xcalc as per section 3.1 of RFC3448 is: + +X = s + ------------------------------------------------------------- + R*sqrt(2*b*p/3) + (t_RTO * (3*sqrt(3*b*p/8) * p * (1+32*p^2))) + +where +X is the trasmit rate in bytes/second +s is the packet size in bytes +R is the round trip time in seconds +p is the loss event rate, between 0 and 1.0, of the number of loss events + as a fraction of the number of packets transmitted +t_RTO is the TCP retransmission timeout value in seconds +b is the number of packets acknowledged by a single TCP acknowledgement + +we can assume that b = 1 and t_RTO is 4 * R. With this the equation becomes: + +X = s + ----------------------------------------------------------------------- + R * sqrt(2 * p / 3) + (12 * R * (sqrt(3 * p / 8) * p * (1 + 32 * p^2))) + + +which we can break down into: + +X = s + -------- + R * f(p) + +where f(p) = sqrt(2 * p / 3) + (12 * sqrt(3 * p / 8) * p * (1 + 32 * p * p)) + +Function parameters: +s - bytes +R - RTT in usecs +p - loss rate (decimal fraction multiplied by 1,000,000) + +Returns Xcalc in bytes per second + +DON'T alter this code unless you run test cases against it as the code +has been manipulated to stop underflow/overlow. + +*/ +u32 tfrc_calc_x(u16 s, u32 R, u32 p) +{ + int index; + u32 f; + u64 tmp1, tmp2; + + if (p < TFRC_CALC_X_SPLIT) + index = (p / (TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE)) - 1; + else + index = (p / (1000000 / TFRC_CALC_X_ARRSIZE)) - 1; + + if (index < 0) + /* p should be 0 unless there is a bug in my code */ + index = 0; + + if (R == 0) + R = 1; /* RTT can't be zero or else divide by zero */ + + BUG_ON(index >= TFRC_CALC_X_ARRSIZE); + + if (p >= TFRC_CALC_X_SPLIT) + f = tfrc_calc_x_lookup[index][0]; + else + f = tfrc_calc_x_lookup[index][1]; + + tmp1 = ((u64)s * 100000000); + tmp2 = ((u64)R * (u64)f); + do_div(tmp2, 10000); + do_div(tmp1, tmp2); + /* Don't alter above math unless you test due to overflow on 32 bit */ + + return (u32)tmp1; +} + +EXPORT_SYMBOL_GPL(tfrc_calc_x); + +/* + * args: fvalue - function value to match + * returns: p closest to that value + * + * both fvalue and p are multiplied by 1,000,000 to use ints + */ +u32 tfrc_calc_x_reverse_lookup(u32 fvalue) +{ + int ctr = 0; + int small; + + if (fvalue < tfrc_calc_x_lookup[0][1]) + return 0; + + if (fvalue <= tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][1]) + small = 1; + else if (fvalue > tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][0]) + return 1000000; + else + small = 0; + + while (fvalue > tfrc_calc_x_lookup[ctr][small]) + ctr++; + + if (small) + return TFRC_CALC_X_SPLIT * ctr / TFRC_CALC_X_ARRSIZE; + else + return 1000000 * ctr / TFRC_CALC_X_ARRSIZE; +} + +EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup); diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h new file mode 100644 index 00000000000..33456c0d593 --- /dev/null +++ b/net/dccp/dccp.h @@ -0,0 +1,493 @@ +#ifndef _DCCP_H +#define _DCCP_H +/* + * net/dccp/dccp.h + * + * An implementation of the DCCP protocol + * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> + * Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/config.h> +#include <linux/dccp.h> +#include <net/snmp.h> +#include <net/sock.h> +#include <net/tcp.h> + +#ifdef CONFIG_IP_DCCP_DEBUG +extern int dccp_debug; + +#define dccp_pr_debug(format, a...) \ + do { if (dccp_debug) \ + printk(KERN_DEBUG "%s: " format, __FUNCTION__ , ##a); \ + } while (0) +#define dccp_pr_debug_cat(format, a...) do { if (dccp_debug) \ + printk(format, ##a); } while (0) +#else +#define dccp_pr_debug(format, a...) +#define dccp_pr_debug_cat(format, a...) +#endif + +extern struct inet_hashinfo dccp_hashinfo; + +extern atomic_t dccp_orphan_count; +extern int dccp_tw_count; +extern void dccp_tw_deschedule(struct inet_timewait_sock *tw); + +extern void dccp_time_wait(struct sock *sk, int state, int timeo); + +/* FIXME: Right size this */ +#define DCCP_MAX_OPT_LEN 128 + +#define DCCP_MAX_PACKET_HDR 32 + +#define MAX_DCCP_HEADER (DCCP_MAX_PACKET_HDR + DCCP_MAX_OPT_LEN + MAX_HEADER) + +#define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT + * state, about 60 seconds */ + +/* draft-ietf-dccp-spec-11.txt initial RTO value */ +#define DCCP_TIMEOUT_INIT ((unsigned)(3 * HZ)) + +/* Maximal interval between probes for local resources. */ +#define DCCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ / 2U)) + +#define DCCP_RTO_MAX ((unsigned)(120 * HZ)) /* FIXME: using TCP value */ + +extern struct proto dccp_v4_prot; + +/* is seq1 < seq2 ? */ +static inline int before48(const u64 seq1, const u64 seq2) +{ + return (s64)((seq1 << 16) - (seq2 << 16)) < 0; +} + +/* is seq1 > seq2 ? */ +static inline int after48(const u64 seq1, const u64 seq2) +{ + return (s64)((seq2 << 16) - (seq1 << 16)) < 0; +} + +/* is seq2 <= seq1 <= seq3 ? */ +static inline int between48(const u64 seq1, const u64 seq2, const u64 seq3) +{ + return (seq3 << 16) - (seq2 << 16) >= (seq1 << 16) - (seq2 << 16); +} + +static inline u64 max48(const u64 seq1, const u64 seq2) +{ + return after48(seq1, seq2) ? seq1 : seq2; +} + +enum { + DCCP_MIB_NUM = 0, + DCCP_MIB_ACTIVEOPENS, /* ActiveOpens */ + DCCP_MIB_ESTABRESETS, /* EstabResets */ + DCCP_MIB_CURRESTAB, /* CurrEstab */ + DCCP_MIB_OUTSEGS, /* OutSegs */ + DCCP_MIB_OUTRSTS, + DCCP_MIB_ABORTONTIMEOUT, + DCCP_MIB_TIMEOUTS, + DCCP_MIB_ABORTFAILED, + DCCP_MIB_PASSIVEOPENS, + DCCP_MIB_ATTEMPTFAILS, + DCCP_MIB_OUTDATAGRAMS, + DCCP_MIB_INERRS, + DCCP_MIB_OPTMANDATORYERROR, + DCCP_MIB_INVALIDOPT, + __DCCP_MIB_MAX +}; + +#define DCCP_MIB_MAX __DCCP_MIB_MAX +struct dccp_mib { + unsigned long mibs[DCCP_MIB_MAX]; +} __SNMP_MIB_ALIGN__; + +DECLARE_SNMP_STAT(struct dccp_mib, dccp_statistics); +#define DCCP_INC_STATS(field) SNMP_INC_STATS(dccp_statistics, field) +#define DCCP_INC_STATS_BH(field) SNMP_INC_STATS_BH(dccp_statistics, field) +#define DCCP_INC_STATS_USER(field) SNMP_INC_STATS_USER(dccp_statistics, field) +#define DCCP_DEC_STATS(field) SNMP_DEC_STATS(dccp_statistics, field) +#define DCCP_ADD_STATS_BH(field, val) \ + SNMP_ADD_STATS_BH(dccp_statistics, field, val) +#define DCCP_ADD_STATS_USER(field, val) \ + SNMP_ADD_STATS_USER(dccp_statistics, field, val) + +extern int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb); +extern int dccp_retransmit_skb(struct sock *sk, struct sk_buff *skb); + +extern int dccp_send_response(struct sock *sk); +extern void dccp_send_ack(struct sock *sk); +extern void dccp_send_delayed_ack(struct sock *sk); +extern void dccp_send_sync(struct sock *sk, const u64 seq, + const enum dccp_pkt_type pkt_type); + +extern int dccp_write_xmit(struct sock *sk, struct sk_buff *skb, long *timeo); +extern void dccp_write_space(struct sock *sk); + +extern void dccp_init_xmit_timers(struct sock *sk); +static inline void dccp_clear_xmit_timers(struct sock *sk) +{ + inet_csk_clear_xmit_timers(sk); +} + +extern unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu); + +extern const char *dccp_packet_name(const int type); +extern const char *dccp_state_name(const int state); + +static inline void dccp_set_state(struct sock *sk, const int state) +{ + const int oldstate = sk->sk_state; + + dccp_pr_debug("%s(%p) %-10.10s -> %s\n", + dccp_role(sk), sk, + dccp_state_name(oldstate), dccp_state_name(state)); + WARN_ON(state == oldstate); + + switch (state) { + case DCCP_OPEN: + if (oldstate != DCCP_OPEN) + DCCP_INC_STATS(DCCP_MIB_CURRESTAB); + break; + + case DCCP_CLOSED: + if (oldstate == DCCP_CLOSING || oldstate == DCCP_OPEN) + DCCP_INC_STATS(DCCP_MIB_ESTABRESETS); + + sk->sk_prot->unhash(sk); + if (inet_csk(sk)->icsk_bind_hash != NULL && + !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) + inet_put_port(&dccp_hashinfo, sk); + /* fall through */ + default: + if (oldstate == DCCP_OPEN) + DCCP_DEC_STATS(DCCP_MIB_CURRESTAB); + } + + /* Change state AFTER socket is unhashed to avoid closed + * socket sitting in hash tables. + */ + sk->sk_state = state; +} + +static inline void dccp_done(struct sock *sk) +{ + dccp_set_state(sk, DCCP_CLOSED); + dccp_clear_xmit_timers(sk); + + sk->sk_shutdown = SHUTDOWN_MASK; + + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_state_change(sk); + else + inet_csk_destroy_sock(sk); +} + +static inline void dccp_openreq_init(struct request_sock *req, + struct dccp_sock *dp, + struct sk_buff *skb) +{ + /* + * FIXME: fill in the other req fields from the DCCP options + * received + */ + inet_rsk(req)->rmt_port = dccp_hdr(skb)->dccph_sport; + inet_rsk(req)->acked = 0; + req->rcv_wnd = 0; +} + +extern int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb); + +extern struct sock *dccp_create_openreq_child(struct sock *sk, + const struct request_sock *req, + const struct sk_buff *skb); + +extern int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb); + +extern void dccp_v4_err(struct sk_buff *skb, u32); + +extern int dccp_v4_rcv(struct sk_buff *skb); + +extern struct sock *dccp_v4_request_recv_sock(struct sock *sk, + struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst); +extern struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct request_sock **prev); + +extern int dccp_child_process(struct sock *parent, struct sock *child, + struct sk_buff *skb); +extern int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, + struct dccp_hdr *dh, unsigned len); +extern int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, + const struct dccp_hdr *dh, const unsigned len); + +extern void dccp_close(struct sock *sk, long timeout); +extern struct sk_buff *dccp_make_response(struct sock *sk, + struct dst_entry *dst, + struct request_sock *req); +extern struct sk_buff *dccp_make_reset(struct sock *sk, + struct dst_entry *dst, + enum dccp_reset_codes code); + +extern int dccp_connect(struct sock *sk); +extern int dccp_disconnect(struct sock *sk, int flags); +extern int dccp_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen); +extern int dccp_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, int optlen); +extern int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg); +extern int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t size); +extern int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t len, int nonblock, + int flags, int *addr_len); +extern void dccp_shutdown(struct sock *sk, int how); + +extern int dccp_v4_checksum(const struct sk_buff *skb, + const u32 saddr, const u32 daddr); + +extern int dccp_v4_send_reset(struct sock *sk, + enum dccp_reset_codes code); +extern void dccp_send_close(struct sock *sk, const int active); + +struct dccp_skb_cb { + __u8 dccpd_type; + __u8 dccpd_reset_code; + __u8 dccpd_service; + __u8 dccpd_ccval; + __u64 dccpd_seq; + __u64 dccpd_ack_seq; + int dccpd_opt_len; +}; + +#define DCCP_SKB_CB(__skb) ((struct dccp_skb_cb *)&((__skb)->cb[0])) + +static inline int dccp_non_data_packet(const struct sk_buff *skb) +{ + const __u8 type = DCCP_SKB_CB(skb)->dccpd_type; + + return type == DCCP_PKT_ACK || + type == DCCP_PKT_CLOSE || + type == DCCP_PKT_CLOSEREQ || + type == DCCP_PKT_RESET || + type == DCCP_PKT_SYNC || + type == DCCP_PKT_SYNCACK; +} + +static inline int dccp_packet_without_ack(const struct sk_buff *skb) +{ + const __u8 type = DCCP_SKB_CB(skb)->dccpd_type; + + return type == DCCP_PKT_DATA || type == DCCP_PKT_REQUEST; +} + +#define DCCP_MAX_SEQNO ((((u64)1) << 48) - 1) +#define DCCP_PKT_WITHOUT_ACK_SEQ (DCCP_MAX_SEQNO << 2) + +static inline void dccp_set_seqno(u64 *seqno, u64 value) +{ + if (value > DCCP_MAX_SEQNO) + value -= DCCP_MAX_SEQNO + 1; + *seqno = value; +} + +static inline u64 dccp_delta_seqno(u64 seqno1, u64 seqno2) +{ + return ((seqno2 << 16) - (seqno1 << 16)) >> 16; +} + +static inline void dccp_inc_seqno(u64 *seqno) +{ + if (++*seqno > DCCP_MAX_SEQNO) + *seqno = 0; +} + +static inline void dccp_hdr_set_seq(struct dccp_hdr *dh, const u64 gss) +{ + struct dccp_hdr_ext *dhx = (struct dccp_hdr_ext *)((void *)dh + + sizeof(*dh)); + +#if defined(__LITTLE_ENDIAN_BITFIELD) + dh->dccph_seq = htonl((gss >> 32)) >> 8; +#elif defined(__BIG_ENDIAN_BITFIELD) + dh->dccph_seq = htonl((gss >> 32)); +#else +#error "Adjust your <asm/byteorder.h> defines" +#endif + dhx->dccph_seq_low = htonl(gss & 0xffffffff); +} + +static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack, + const u64 gsr) +{ +#if defined(__LITTLE_ENDIAN_BITFIELD) + dhack->dccph_ack_nr_high = htonl((gsr >> 32)) >> 8; +#elif defined(__BIG_ENDIAN_BITFIELD) + dhack->dccph_ack_nr_high = htonl((gsr >> 32)); +#else +#error "Adjust your <asm/byteorder.h> defines" +#endif + dhack->dccph_ack_nr_low = htonl(gsr & 0xffffffff); +} + +static inline void dccp_update_gsr(struct sock *sk, u64 seq) +{ + struct dccp_sock *dp = dccp_sk(sk); + + dp->dccps_gsr = seq; + dccp_set_seqno(&dp->dccps_swl, + (dp->dccps_gsr + 1 - + (dp->dccps_options.dccpo_sequence_window / 4))); + dccp_set_seqno(&dp->dccps_swh, + (dp->dccps_gsr + + (3 * dp->dccps_options.dccpo_sequence_window) / 4)); +} + +static inline void dccp_update_gss(struct sock *sk, u64 seq) +{ + struct dccp_sock *dp = dccp_sk(sk); + + dp->dccps_awh = dp->dccps_gss = seq; + dccp_set_seqno(&dp->dccps_awl, + (dp->dccps_gss - + dp->dccps_options.dccpo_sequence_window + 1)); +} + +extern void dccp_insert_options(struct sock *sk, struct sk_buff *skb); +extern void dccp_insert_option_elapsed_time(struct sock *sk, + struct sk_buff *skb, + u32 elapsed_time); +extern void dccp_insert_option_timestamp(struct sock *sk, + struct sk_buff *skb); +extern void dccp_insert_option(struct sock *sk, struct sk_buff *skb, + unsigned char option, + const void *value, unsigned char len); + +extern struct socket *dccp_ctl_socket; + +#define DCCP_ACKPKTS_STATE_RECEIVED 0 +#define DCCP_ACKPKTS_STATE_ECN_MARKED (1 << 6) +#define DCCP_ACKPKTS_STATE_NOT_RECEIVED (3 << 6) + +#define DCCP_ACKPKTS_STATE_MASK 0xC0 /* 11000000 */ +#define DCCP_ACKPKTS_LEN_MASK 0x3F /* 00111111 */ + +/** struct dccp_ackpkts - acknowledgeable packets + * + * This data structure is the one defined in the DCCP draft + * Appendix A. + * + * @dccpap_buf_head - circular buffer head + * @dccpap_buf_tail - circular buffer tail + * @dccpap_buf_ackno - ack # of the most recent packet acknowledgeable in the + * buffer (i.e. %dccpap_buf_head) + * @dccpap_buf_nonce - the one-bit sum of the ECN Nonces on all packets acked + * by the buffer with State 0 + * + * Additionally, the HC-Receiver must keep some information about the + * Ack Vectors it has recently sent. For each packet sent carrying an + * Ack Vector, it remembers four variables: + * + * @dccpap_ack_seqno - the Sequence Number used for the packet + * (HC-Receiver seqno) + * @dccpap_ack_ptr - the value of buf_head at the time of acknowledgement. + * @dccpap_ack_ackno - the Acknowledgement Number used for the packet + * (HC-Sender seqno) + * @dccpap_ack_nonce - the one-bit sum of the ECN Nonces for all State 0. + * + * @dccpap_buf_len - circular buffer length + * @dccpap_time - the time in usecs + * @dccpap_buf - circular buffer of acknowledgeable packets + */ +struct dccp_ackpkts { + unsigned int dccpap_buf_head; + unsigned int dccpap_buf_tail; + u64 dccpap_buf_ackno; + u64 dccpap_ack_seqno; + u64 dccpap_ack_ackno; + unsigned int dccpap_ack_ptr; + unsigned int dccpap_buf_vector_len; + unsigned int dccpap_ack_vector_len; + unsigned int dccpap_buf_len; + struct timeval dccpap_time; + u8 dccpap_buf_nonce; + u8 dccpap_ack_nonce; + u8 dccpap_buf[0]; +}; + +extern struct dccp_ackpkts * + dccp_ackpkts_alloc(unsigned int len, + const unsigned int __nocast priority); +extern void dccp_ackpkts_free(struct dccp_ackpkts *ap); +extern int dccp_ackpkts_add(struct dccp_ackpkts *ap, u64 ackno, u8 state); +extern void dccp_ackpkts_check_rcv_ackno(struct dccp_ackpkts *ap, + struct sock *sk, u64 ackno); + +static inline suseconds_t timeval_usecs(const struct timeval *tv) +{ + return tv->tv_sec * USEC_PER_SEC + tv->tv_usec; +} + +static inline suseconds_t timeval_delta(const struct timeval *large, + const struct timeval *small) +{ + time_t secs = large->tv_sec - small->tv_sec; + suseconds_t usecs = large->tv_usec - small->tv_usec; + + if (usecs < 0) { + secs--; + usecs += USEC_PER_SEC; + } + return secs * USEC_PER_SEC + usecs; +} + +static inline void timeval_add_usecs(struct timeval *tv, + const suseconds_t usecs) +{ + tv->tv_usec += usecs; + while (tv->tv_usec >= USEC_PER_SEC) { + tv->tv_sec++; + tv->tv_usec -= USEC_PER_SEC; + } +} + +static inline void timeval_sub_usecs(struct timeval *tv, + const suseconds_t usecs) +{ + tv->tv_usec -= usecs; + while (tv->tv_usec < 0) { + tv->tv_sec--; + tv->tv_usec += USEC_PER_SEC; + } +} + +/* + * Returns the difference in usecs between timeval + * passed in and current time + */ +static inline suseconds_t timeval_now_delta(const struct timeval *tv) +{ + struct timeval now; + do_gettimeofday(&now); + return timeval_delta(&now, tv); +} + +#ifdef CONFIG_IP_DCCP_DEBUG +extern void dccp_ackvector_print(const u64 ackno, + const unsigned char *vector, int len); +extern void dccp_ackpkts_print(const struct dccp_ackpkts *ap); +#else +static inline void dccp_ackvector_print(const u64 ackno, + const unsigned char *vector, + int len) { } +static inline void dccp_ackpkts_print(const struct dccp_ackpkts *ap) { } +#endif + +#endif /* _DCCP_H */ diff --git a/net/dccp/diag.c b/net/dccp/diag.c new file mode 100644 index 00000000000..f675d8e642d --- /dev/null +++ b/net/dccp/diag.c @@ -0,0 +1,71 @@ +/* + * net/dccp/diag.c + * + * An implementation of the DCCP protocol + * Arnaldo Carvalho de Melo <acme@mandriva.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/config.h> + +#include <linux/module.h> +#include <linux/inet_diag.h> + +#include "ccid.h" +#include "dccp.h" + +static void dccp_get_info(struct sock *sk, struct tcp_info *info) +{ + struct dccp_sock *dp = dccp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); + + memset(info, 0, sizeof(*info)); + + info->tcpi_state = sk->sk_state; + info->tcpi_retransmits = icsk->icsk_retransmits; + info->tcpi_probes = icsk->icsk_probes_out; + info->tcpi_backoff = icsk->icsk_backoff; + info->tcpi_pmtu = dp->dccps_pmtu_cookie; + + if (dp->dccps_options.dccpo_send_ack_vector) + info->tcpi_options |= TCPI_OPT_SACK; + + ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info); + ccid_hc_tx_get_info(dp->dccps_hc_tx_ccid, sk, info); +} + +static void dccp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, + void *_info) +{ + r->idiag_rqueue = r->idiag_wqueue = 0; + + if (_info != NULL) + dccp_get_info(sk, _info); +} + +static struct inet_diag_handler dccp_diag_handler = { + .idiag_hashinfo = &dccp_hashinfo, + .idiag_get_info = dccp_diag_get_info, + .idiag_type = DCCPDIAG_GETSOCK, + .idiag_info_size = sizeof(struct tcp_info), +}; + +static int __init dccp_diag_init(void) +{ + return inet_diag_register(&dccp_diag_handler); +} + +static void __exit dccp_diag_fini(void) +{ + inet_diag_unregister(&dccp_diag_handler); +} + +module_init(dccp_diag_init); +module_exit(dccp_diag_fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>"); +MODULE_DESCRIPTION("DCCP inet_diag handler"); diff --git a/net/dccp/input.c b/net/dccp/input.c new file mode 100644 index 00000000000..ef29cef1daf --- /dev/null +++ b/net/dccp/input.c @@ -0,0 +1,600 @@ +/* + * net/dccp/input.c + * + * An implementation of the DCCP protocol + * Arnaldo Carvalho de Melo <acme@conectiva.com.br> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <linux/dccp.h> +#include <linux/skbuff.h> + +#include <net/sock.h> + +#include "ccid.h" +#include "dccp.h" + +static void dccp_fin(struct sock *sk, struct sk_buff *skb) +{ + sk->sk_shutdown |= RCV_SHUTDOWN; + sock_set_flag(sk, SOCK_DONE); + __skb_pull(skb, dccp_hdr(skb)->dccph_doff * 4); + __skb_queue_tail(&sk->sk_receive_queue, skb); + skb_set_owner_r(skb, sk); + sk->sk_data_ready(sk, 0); +} + +static void dccp_rcv_close(struct sock *sk, struct sk_buff *skb) +{ + dccp_v4_send_reset(sk, DCCP_RESET_CODE_CLOSED); + dccp_fin(sk, skb); + dccp_set_state(sk, DCCP_CLOSED); + sk_wake_async(sk, 1, POLL_HUP); +} + +static void dccp_rcv_closereq(struct sock *sk, struct sk_buff *skb) +{ + /* + * Step 7: Check for unexpected packet types + * If (S.is_server and P.type == CloseReq) + * Send Sync packet acknowledging P.seqno + * Drop packet and return + */ + if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT) { + dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNC); + return; + } + + dccp_set_state(sk, DCCP_CLOSING); + dccp_send_close(sk, 0); +} + +static inline void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb) +{ + struct dccp_sock *dp = dccp_sk(sk); + + if (dp->dccps_options.dccpo_send_ack_vector) + dccp_ackpkts_check_rcv_ackno(dp->dccps_hc_rx_ackpkts, sk, + DCCP_SKB_CB(skb)->dccpd_ack_seq); +} + +static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb) +{ + const struct dccp_hdr *dh = dccp_hdr(skb); + struct dccp_sock *dp = dccp_sk(sk); + u64 lswl, lawl; + + /* + * Step 5: Prepare sequence numbers for Sync + * If P.type == Sync or P.type == SyncAck, + * If S.AWL <= P.ackno <= S.AWH and P.seqno >= S.SWL, + * / * P is valid, so update sequence number variables + * accordingly. After this update, P will pass the tests + * in Step 6. A SyncAck is generated if necessary in + * Step 15 * / + * Update S.GSR, S.SWL, S.SWH + * Otherwise, + * Drop packet and return + */ + if (dh->dccph_type == DCCP_PKT_SYNC || + dh->dccph_type == DCCP_PKT_SYNCACK) { + if (between48(DCCP_SKB_CB(skb)->dccpd_ack_seq, + dp->dccps_awl, dp->dccps_awh) && + !before48(DCCP_SKB_CB(skb)->dccpd_seq, dp->dccps_swl)) + dccp_update_gsr(sk, DCCP_SKB_CB(skb)->dccpd_seq); + else + return -1; + } + + /* + * Step 6: Check sequence numbers + * Let LSWL = S.SWL and LAWL = S.AWL + * If P.type == CloseReq or P.type == Close or P.type == Reset, + * LSWL := S.GSR + 1, LAWL := S.GAR + * If LSWL <= P.seqno <= S.SWH + * and (P.ackno does not exist or LAWL <= P.ackno <= S.AWH), + * Update S.GSR, S.SWL, S.SWH + * If P.type != Sync, + * Update S.GAR + * Otherwise, + * Send Sync packet acknowledging P.seqno + * Drop packet and return + */ + lswl = dp->dccps_swl; + lawl = dp->dccps_awl; + + if (dh->dccph_type == DCCP_PKT_CLOSEREQ || + dh->dccph_type == DCCP_PKT_CLOSE || + dh->dccph_type == DCCP_PKT_RESET) { + lswl = dp->dccps_gsr; + dccp_inc_seqno(&lswl); + lawl = dp->dccps_gar; + } + + if (between48(DCCP_SKB_CB(skb)->dccpd_seq, lswl, dp->dccps_swh) && + (DCCP_SKB_CB(skb)->dccpd_ack_seq == DCCP_PKT_WITHOUT_ACK_SEQ || + between48(DCCP_SKB_CB(skb)->dccpd_ack_seq, + lawl, dp->dccps_awh))) { + dccp_update_gsr(sk, DCCP_SKB_CB(skb)->dccpd_seq); + + if (dh->dccph_type != DCCP_PKT_SYNC && + (DCCP_SKB_CB(skb)->dccpd_ack_seq != + DCCP_PKT_WITHOUT_ACK_SEQ)) + dp->dccps_gar = DCCP_SKB_CB(skb)->dccpd_ack_seq; + } else { + LIMIT_NETDEBUG(KERN_WARNING "DCCP: Step 6 failed for %s packet, " + "(LSWL(%llu) <= P.seqno(%llu) <= S.SWH(%llu)) and " + "(P.ackno %s or LAWL(%llu) <= P.ackno(%llu) <= S.AWH(%llu), " + "sending SYNC...\n", + dccp_packet_name(dh->dccph_type), + (unsigned long long) lswl, + (unsigned long long) + DCCP_SKB_CB(skb)->dccpd_seq, + (unsigned long long) dp->dccps_swh, + (DCCP_SKB_CB(skb)->dccpd_ack_seq == + DCCP_PKT_WITHOUT_ACK_SEQ) ? "doesn't exist" : "exists", + (unsigned long long) lawl, + (unsigned long long) + DCCP_SKB_CB(skb)->dccpd_ack_seq, + (unsigned long long) dp->dccps_awh); + dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNC); + return -1; + } + + return 0; +} + +int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, + const struct dccp_hdr *dh, const unsigned len) +{ + struct dccp_sock *dp = dccp_sk(sk); + + if (dccp_check_seqno(sk, skb)) + goto discard; + + if (dccp_parse_options(sk, skb)) + goto discard; + + if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) + dccp_event_ack_recv(sk, skb); + + /* + * FIXME: check ECN to see if we should use + * DCCP_ACKPKTS_STATE_ECN_MARKED + */ + if (dp->dccps_options.dccpo_send_ack_vector) { + struct dccp_ackpkts *ap = dp->dccps_hc_rx_ackpkts; + + if (dccp_ackpkts_add(dp->dccps_hc_rx_ackpkts, + DCCP_SKB_CB(skb)->dccpd_seq, + DCCP_ACKPKTS_STATE_RECEIVED)) { + LIMIT_NETDEBUG(KERN_WARNING "DCCP: acknowledgeable " + "packets buffer full!\n"); + ap->dccpap_ack_seqno = DCCP_MAX_SEQNO + 1; + inet_csk_schedule_ack(sk); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, + TCP_DELACK_MIN, + DCCP_RTO_MAX); + goto discard; + } + + /* + * FIXME: this activation is probably wrong, have to study more + * TCP delack machinery and how it fits into DCCP draft, but + * for now it kinda "works" 8) + */ + if (!inet_csk_ack_scheduled(sk)) { + inet_csk_schedule_ack(sk); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, 5 * HZ, + DCCP_RTO_MAX); + } + } + + ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb); + ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb); + + switch (dccp_hdr(skb)->dccph_type) { + case DCCP_PKT_DATAACK: + case DCCP_PKT_DATA: + /* + * FIXME: check if sk_receive_queue is full, schedule DATA_DROPPED + * option if it is. + */ + __skb_pull(skb, dh->dccph_doff * 4); + __skb_queue_tail(&sk->sk_receive_queue, skb); + skb_set_owner_r(skb, sk); + sk->sk_data_ready(sk, 0); + return 0; + case DCCP_PKT_ACK: + goto discard; + case DCCP_PKT_RESET: + /* + * Step 9: Process Reset + * If P.type == Reset, + * Tear down connection + * S.state := TIMEWAIT + * Set TIMEWAIT timer + * Drop packet and return + */ + dccp_fin(sk, skb); + dccp_time_wait(sk, DCCP_TIME_WAIT, 0); + return 0; + case DCCP_PKT_CLOSEREQ: + dccp_rcv_closereq(sk, skb); + goto discard; + case DCCP_PKT_CLOSE: + dccp_rcv_close(sk, skb); + return 0; + case DCCP_PKT_REQUEST: + /* Step 7 + * or (S.is_server and P.type == Response) + * or (S.is_client and P.type == Request) + * or (S.state >= OPEN and P.type == Request + * and P.seqno >= S.OSR) + * or (S.state >= OPEN and P.type == Response + * and P.seqno >= S.OSR) + * or (S.state == RESPOND and P.type == Data), + * Send Sync packet acknowledging P.seqno + * Drop packet and return + */ + if (dp->dccps_role != DCCP_ROLE_LISTEN) + goto send_sync; + goto check_seq; + case DCCP_PKT_RESPONSE: + if (dp->dccps_role != DCCP_ROLE_CLIENT) + goto send_sync; +check_seq: + if (!before48(DCCP_SKB_CB(skb)->dccpd_seq, dp->dccps_osr)) { +send_sync: + dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, + DCCP_PKT_SYNC); + } + break; + case DCCP_PKT_SYNC: + dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, + DCCP_PKT_SYNCACK); + /* + * From the draft: + * + * As with DCCP-Ack packets, DCCP-Sync and DCCP-SyncAck packets + * MAY have non-zero-length application data areas, whose + * contents * receivers MUST ignore. + */ + goto discard; + } + + DCCP_INC_STATS_BH(DCCP_MIB_INERRS); +discard: + __kfree_skb(skb); + return 0; +} + +static int dccp_rcv_request_sent_state_process(struct sock *sk, + struct sk_buff *skb, + const struct dccp_hdr *dh, + const unsigned len) +{ + /* + * Step 4: Prepare sequence numbers in REQUEST + * If S.state == REQUEST, + * If (P.type == Response or P.type == Reset) + * and S.AWL <= P.ackno <= S.AWH, + * / * Set sequence number variables corresponding to the + * other endpoint, so P will pass the tests in Step 6 * / + * Set S.GSR, S.ISR, S.SWL, S.SWH + * / * Response processing continues in Step 10; Reset + * processing continues in Step 9 * / + */ + if (dh->dccph_type == DCCP_PKT_RESPONSE) { + const struct inet_connection_sock *icsk = inet_csk(sk); + struct dccp_sock *dp = dccp_sk(sk); + + /* Stop the REQUEST timer */ + inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); + BUG_TRAP(sk->sk_send_head != NULL); + __kfree_skb(sk->sk_send_head); + sk->sk_send_head = NULL; + + if (!between48(DCCP_SKB_CB(skb)->dccpd_ack_seq, + dp->dccps_awl, dp->dccps_awh)) { + dccp_pr_debug("invalid ackno: S.AWL=%llu, " + "P.ackno=%llu, S.AWH=%llu \n", + (unsigned long long)dp->dccps_awl, + (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq, + (unsigned long long)dp->dccps_awh); + goto out_invalid_packet; + } + + dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq; + dccp_update_gsr(sk, dp->dccps_isr); + /* + * SWL and AWL are initially adjusted so that they are not less than + * the initial Sequence Numbers received and sent, respectively: + * SWL := max(GSR + 1 - floor(W/4), ISR), + * AWL := max(GSS - W' + 1, ISS). + * These adjustments MUST be applied only at the beginning of the + * connection. + * + * AWL was adjusted in dccp_v4_connect -acme + */ + dccp_set_seqno(&dp->dccps_swl, + max48(dp->dccps_swl, dp->dccps_isr)); + + if (ccid_hc_rx_init(dp->dccps_hc_rx_ccid, sk) != 0 || + ccid_hc_tx_init(dp->dccps_hc_tx_ccid, sk) != 0) { + ccid_hc_rx_exit(dp->dccps_hc_rx_ccid, sk); + ccid_hc_tx_exit(dp->dccps_hc_tx_ccid, sk); + /* FIXME: send appropriate RESET code */ + goto out_invalid_packet; + } + + dccp_sync_mss(sk, dp->dccps_pmtu_cookie); + + /* + * Step 10: Process REQUEST state (second part) + * If S.state == REQUEST, + * / * If we get here, P is a valid Response from the + * server (see Step 4), and we should move to + * PARTOPEN state. PARTOPEN means send an Ack, + * don't send Data packets, retransmit Acks + * periodically, and always include any Init Cookie + * from the Response * / + * S.state := PARTOPEN + * Set PARTOPEN timer + * Continue with S.state == PARTOPEN + * / * Step 12 will send the Ack completing the + * three-way handshake * / + */ + dccp_set_state(sk, DCCP_PARTOPEN); + + /* Make sure socket is routed, for correct metrics. */ + inet_sk_rebuild_header(sk); + + if (!sock_flag(sk, SOCK_DEAD)) { + sk->sk_state_change(sk); + sk_wake_async(sk, 0, POLL_OUT); + } + + if (sk->sk_write_pending || icsk->icsk_ack.pingpong || + icsk->icsk_accept_queue.rskq_defer_accept) { + /* Save one ACK. Data will be ready after + * several ticks, if write_pending is set. + * + * It may be deleted, but with this feature tcpdumps + * look so _wonderfully_ clever, that I was not able + * to stand against the temptation 8) --ANK + */ + /* + * OK, in DCCP we can as well do a similar trick, its + * even in the draft, but there is no need for us to + * schedule an ack here, as dccp_sendmsg does this for + * us, also stated in the draft. -acme + */ + __kfree_skb(skb); + return 0; + } + dccp_send_ack(sk); + return -1; + } + +out_invalid_packet: + return 1; /* dccp_v4_do_rcv will send a reset, but... + FIXME: the reset code should be + DCCP_RESET_CODE_PACKET_ERROR */ +} + +static int dccp_rcv_respond_partopen_state_process(struct sock *sk, + struct sk_buff *skb, + const struct dccp_hdr *dh, + const unsigned len) +{ + int queued = 0; + + switch (dh->dccph_type) { + case DCCP_PKT_RESET: + inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); + break; + case DCCP_PKT_DATAACK: + case DCCP_PKT_ACK: + /* + * FIXME: we should be reseting the PARTOPEN (DELACK) timer + * here but only if we haven't used the DELACK timer for + * something else, like sending a delayed ack for a TIMESTAMP + * echo, etc, for now were not clearing it, sending an extra + * ACK when there is nothing else to do in DELACK is not a big + * deal after all. + */ + + /* Stop the PARTOPEN timer */ + if (sk->sk_state == DCCP_PARTOPEN) + inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); + + dccp_sk(sk)->dccps_osr = DCCP_SKB_CB(skb)->dccpd_seq; + dccp_set_state(sk, DCCP_OPEN); + + if (dh->dccph_type == DCCP_PKT_DATAACK) { + dccp_rcv_established(sk, skb, dh, len); + queued = 1; /* packet was queued + (by dccp_rcv_established) */ + } + break; + } + + return queued; +} + +int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, + struct dccp_hdr *dh, unsigned len) +{ + struct dccp_sock *dp = dccp_sk(sk); + const int old_state = sk->sk_state; + int queued = 0; + + /* + * Step 3: Process LISTEN state + * (Continuing from dccp_v4_do_rcv and dccp_v6_do_rcv) + * + * If S.state == LISTEN, + * If P.type == Request or P contains a valid Init Cookie + * option, + * * Must scan the packet's options to check for an Init + * Cookie. Only the Init Cookie is processed here, + * however; other options are processed in Step 8. This + * scan need only be performed if the endpoint uses Init + * Cookies * + * * Generate a new socket and switch to that socket * + * Set S := new socket for this port pair + * S.state = RESPOND + * Choose S.ISS (initial seqno) or set from Init Cookie + * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie + * Continue with S.state == RESPOND + * * A Response packet will be generated in Step 11 * + * Otherwise, + * Generate Reset(No Connection) unless P.type == Reset + * Drop packet and return + * + * NOTE: the check for the packet types is done in + * dccp_rcv_state_process + */ + if (sk->sk_state == DCCP_LISTEN) { + if (dh->dccph_type == DCCP_PKT_REQUEST) { + if (dccp_v4_conn_request(sk, skb) < 0) + return 1; + + /* FIXME: do congestion control initialization */ + goto discard; + } + if (dh->dccph_type == DCCP_PKT_RESET) + goto discard; + + /* Caller (dccp_v4_do_rcv) will send Reset(No Connection)*/ + return 1; + } + + if (sk->sk_state != DCCP_REQUESTING) { + if (dccp_check_seqno(sk, skb)) + goto discard; + + /* + * Step 8: Process options and mark acknowledgeable + */ + if (dccp_parse_options(sk, skb)) + goto discard; + + if (DCCP_SKB_CB(skb)->dccpd_ack_seq != + DCCP_PKT_WITHOUT_ACK_SEQ) + dccp_event_ack_recv(sk, skb); + + ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb); + ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb); + + /* + * FIXME: check ECN to see if we should use + * DCCP_ACKPKTS_STATE_ECN_MARKED + */ + if (dp->dccps_options.dccpo_send_ack_vector) { + if (dccp_ackpkts_add(dp->dccps_hc_rx_ackpkts, + DCCP_SKB_CB(skb)->dccpd_seq, + DCCP_ACKPKTS_STATE_RECEIVED)) + goto discard; + /* + * FIXME: this activation is probably wrong, have to + * study more TCP delack machinery and how it fits into + * DCCP draft, but for now it kinda "works" 8) + */ + if ((dp->dccps_hc_rx_ackpkts->dccpap_ack_seqno == + DCCP_MAX_SEQNO + 1) && + !inet_csk_ack_scheduled(sk)) { + inet_csk_schedule_ack(sk); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, + TCP_DELACK_MIN, + DCCP_RTO_MAX); + } + } + } + + /* + * Step 9: Process Reset + * If P.type == Reset, + * Tear down connection + * S.state := TIMEWAIT + * Set TIMEWAIT timer + * Drop packet and return + */ + if (dh->dccph_type == DCCP_PKT_RESET) { + /* + * Queue the equivalent of TCP fin so that dccp_recvmsg + * exits the loop + */ + dccp_fin(sk, skb); + dccp_time_wait(sk, DCCP_TIME_WAIT, 0); + return 0; + /* + * Step 7: Check for unexpected packet types + * If (S.is_server and P.type == CloseReq) + * or (S.is_server and P.type == Response) + * or (S.is_client and P.type == Request) + * or (S.state == RESPOND and P.type == Data), + * Send Sync packet acknowledging P.seqno + * Drop packet and return + */ + } else if ((dp->dccps_role != DCCP_ROLE_CLIENT && + (dh->dccph_type == DCCP_PKT_RESPONSE || + dh->dccph_type == DCCP_PKT_CLOSEREQ)) || + (dp->dccps_role == DCCP_ROLE_CLIENT && + dh->dccph_type == DCCP_PKT_REQUEST) || + (sk->sk_state == DCCP_RESPOND && + dh->dccph_type == DCCP_PKT_DATA)) { + dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, + DCCP_PKT_SYNC); + goto discard; + } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) { + dccp_rcv_closereq(sk, skb); + goto discard; + } else if (dh->dccph_type == DCCP_PKT_CLOSE) { + dccp_rcv_close(sk, skb); + return 0; + } + + switch (sk->sk_state) { + case DCCP_CLOSED: + return 1; + + case DCCP_REQUESTING: + /* FIXME: do congestion control initialization */ + + queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len); + if (queued >= 0) + return queued; + + __kfree_skb(skb); + return 0; + + case DCCP_RESPOND: + case DCCP_PARTOPEN: + queued = dccp_rcv_respond_partopen_state_process(sk, skb, + dh, len); + break; + } + + if (dh->dccph_type == DCCP_PKT_ACK || + dh->dccph_type == DCCP_PKT_DATAACK) { + switch (old_state) { + case DCCP_PARTOPEN: + sk->sk_state_change(sk); + sk_wake_async(sk, 0, POLL_OUT); + break; + } + } + + if (!queued) { +discard: + __kfree_skb(skb); + } + return 0; +} diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c new file mode 100644 index 00000000000..3fc75dbee4b --- /dev/null +++ b/net/dccp/ipv4.c @@ -0,0 +1,1356 @@ +/* + * net/dccp/ipv4.c + * + * An implementation of the DCCP protocol + * Arnaldo Carvalho de Melo <acme@conectiva.com.br> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <linux/dccp.h> +#include <linux/icmp.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/random.h> + +#include <net/icmp.h> +#include <net/inet_hashtables.h> +#include <net/sock.h> +#include <net/tcp_states.h> +#include <net/xfrm.h> + +#include "ccid.h" +#include "dccp.h" + +struct inet_hashinfo __cacheline_aligned dccp_hashinfo = { + .lhash_lock = RW_LOCK_UNLOCKED, + .lhash_users = ATOMIC_INIT(0), + .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(dccp_hashinfo.lhash_wait), + .portalloc_lock = SPIN_LOCK_UNLOCKED, + .port_rover = 1024 - 1, +}; + +EXPORT_SYMBOL_GPL(dccp_hashinfo); + +static int dccp_v4_get_port(struct sock *sk, const unsigned short snum) +{ + return inet_csk_get_port(&dccp_hashinfo, sk, snum); +} + +static void dccp_v4_hash(struct sock *sk) +{ + inet_hash(&dccp_hashinfo, sk); +} + +static void dccp_v4_unhash(struct sock *sk) +{ + inet_unhash(&dccp_hashinfo, sk); +} + +/* called with local bh disabled */ +static int __dccp_v4_check_established(struct sock *sk, const __u16 lport, + struct inet_timewait_sock **twp) +{ + struct inet_sock *inet = inet_sk(sk); + const u32 daddr = inet->rcv_saddr; + const u32 saddr = inet->daddr; + const int dif = sk->sk_bound_dev_if; + INET_ADDR_COOKIE(acookie, saddr, daddr) + const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport); + const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, + dccp_hashinfo.ehash_size); + struct inet_ehash_bucket *head = &dccp_hashinfo.ehash[hash]; + const struct sock *sk2; + const struct hlist_node *node; + struct inet_timewait_sock *tw; + + write_lock(&head->lock); + + /* Check TIME-WAIT sockets first. */ + sk_for_each(sk2, node, &(head + dccp_hashinfo.ehash_size)->chain) { + tw = inet_twsk(sk2); + + if (INET_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) + goto not_unique; + } + tw = NULL; + + /* And established part... */ + sk_for_each(sk2, node, &head->chain) { + if (INET_MATCH(sk2, acookie, saddr, daddr, ports, dif)) + goto not_unique; + } + + /* Must record num and sport now. Otherwise we will see + * in hash table socket with a funny identity. */ + inet->num = lport; + inet->sport = htons(lport); + sk->sk_hashent = hash; + BUG_TRAP(sk_unhashed(sk)); + __sk_add_node(sk, &head->chain); + sock_prot_inc_use(sk->sk_prot); + write_unlock(&head->lock); + + if (twp != NULL) { + *twp = tw; + NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + } else if (tw != NULL) { + /* Silly. Should hash-dance instead... */ + inet_twsk_deschedule(tw, &dccp_death_row); + NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + + inet_twsk_put(tw); + } + + return 0; + +not_unique: + write_unlock(&head->lock); + return -EADDRNOTAVAIL; +} + +/* + * Bind a port for a connect operation and hash it. + */ +static int dccp_v4_hash_connect(struct sock *sk) +{ + const unsigned short snum = inet_sk(sk)->num; + struct inet_bind_hashbucket *head; + struct inet_bind_bucket *tb; + int ret; + + if (snum == 0) { + int rover; + int low = sysctl_local_port_range[0]; + int high = sysctl_local_port_range[1]; + int remaining = (high - low) + 1; + struct hlist_node *node; + struct inet_timewait_sock *tw = NULL; + + local_bh_disable(); + + /* TODO. Actually it is not so bad idea to remove + * dccp_hashinfo.portalloc_lock before next submission to + * Linus. + * As soon as we touch this place at all it is time to think. + * + * Now it protects single _advisory_ variable + * dccp_hashinfo.port_rover, hence it is mostly useless. + * Code will work nicely if we just delete it, but + * I am afraid in contented case it will work not better or + * even worse: another cpu just will hit the same bucket + * and spin there. + * So some cpu salt could remove both contention and + * memory pingpong. Any ideas how to do this in a nice way? + */ + spin_lock(&dccp_hashinfo.portalloc_lock); + rover = dccp_hashinfo.port_rover; + + do { + rover++; + if ((rover < low) || (rover > high)) + rover = low; + head = &dccp_hashinfo.bhash[inet_bhashfn(rover, + dccp_hashinfo.bhash_size)]; + spin_lock(&head->lock); + + /* Does not bother with rcv_saddr checks, + * because the established check is already + * unique enough. + */ + inet_bind_bucket_for_each(tb, node, &head->chain) { + if (tb->port == rover) { + BUG_TRAP(!hlist_empty(&tb->owners)); + if (tb->fastreuse >= 0) + goto next_port; + if (!__dccp_v4_check_established(sk, + rover, + &tw)) + goto ok; + goto next_port; + } + } + + tb = inet_bind_bucket_create(dccp_hashinfo.bind_bucket_cachep, + head, rover); + if (tb == NULL) { + spin_unlock(&head->lock); + break; + } + tb->fastreuse = -1; + goto ok; + + next_port: + spin_unlock(&head->lock); + } while (--remaining > 0); + dccp_hashinfo.port_rover = rover; + spin_unlock(&dccp_hashinfo.portalloc_lock); + + local_bh_enable(); + + return -EADDRNOTAVAIL; + +ok: + /* All locks still held and bhs disabled */ + dccp_hashinfo.port_rover = rover; + spin_unlock(&dccp_hashinfo.portalloc_lock); + + inet_bind_hash(sk, tb, rover); + if (sk_unhashed(sk)) { + inet_sk(sk)->sport = htons(rover); + __inet_hash(&dccp_hashinfo, sk, 0); + } + spin_unlock(&head->lock); + + if (tw != NULL) { + inet_twsk_deschedule(tw, &dccp_death_row); + inet_twsk_put(tw); + } + + ret = 0; + goto out; + } + + head = &dccp_hashinfo.bhash[inet_bhashfn(snum, + dccp_hashinfo.bhash_size)]; + tb = inet_csk(sk)->icsk_bind_hash; + spin_lock_bh(&head->lock); + if (sk_head(&tb->owners) == sk && sk->sk_bind_node.next == NULL) { + __inet_hash(&dccp_hashinfo, sk, 0); + spin_unlock_bh(&head->lock); + return 0; + } else { + spin_unlock(&head->lock); + /* No definite answer... Walk to established hash table */ + ret = __dccp_v4_check_established(sk, snum, NULL); +out: + local_bh_enable(); + return ret; + } +} + +static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + struct inet_sock *inet = inet_sk(sk); + struct dccp_sock *dp = dccp_sk(sk); + const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; + struct rtable *rt; + u32 daddr, nexthop; + int tmp; + int err; + + dp->dccps_role = DCCP_ROLE_CLIENT; + + if (addr_len < sizeof(struct sockaddr_in)) + return -EINVAL; + + if (usin->sin_family != AF_INET) + return -EAFNOSUPPORT; + + nexthop = daddr = usin->sin_addr.s_addr; + if (inet->opt != NULL && inet->opt->srr) { + if (daddr == 0) + return -EINVAL; + nexthop = inet->opt->faddr; + } + + tmp = ip_route_connect(&rt, nexthop, inet->saddr, + RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, + IPPROTO_DCCP, + inet->sport, usin->sin_port, sk); + if (tmp < 0) + return tmp; + + if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { + ip_rt_put(rt); + return -ENETUNREACH; + } + + if (inet->opt == NULL || !inet->opt->srr) + daddr = rt->rt_dst; + + if (inet->saddr == 0) + inet->saddr = rt->rt_src; + inet->rcv_saddr = inet->saddr; + + inet->dport = usin->sin_port; + inet->daddr = daddr; + + dp->dccps_ext_header_len = 0; + if (inet->opt != NULL) + dp->dccps_ext_header_len = inet->opt->optlen; + /* + * Socket identity is still unknown (sport may be zero). + * However we set state to DCCP_REQUESTING and not releasing socket + * lock select source port, enter ourselves into the hash tables and + * complete initialization after this. + */ + dccp_set_state(sk, DCCP_REQUESTING); + err = dccp_v4_hash_connect(sk); + if (err != 0) + goto failure; + + err = ip_route_newports(&rt, inet->sport, inet->dport, sk); + if (err != 0) + goto failure; + + /* OK, now commit destination to socket. */ + sk_setup_caps(sk, &rt->u.dst); + + dp->dccps_gar = + dp->dccps_iss = secure_dccp_sequence_number(inet->saddr, + inet->daddr, + inet->sport, + usin->sin_port); + dccp_update_gss(sk, dp->dccps_iss); + + /* + * SWL and AWL are initially adjusted so that they are not less than + * the initial Sequence Numbers received and sent, respectively: + * SWL := max(GSR + 1 - floor(W/4), ISR), + * AWL := max(GSS - W' + 1, ISS). + * These adjustments MUST be applied only at the beginning of the + * connection. + */ + dccp_set_seqno(&dp->dccps_awl, max48(dp->dccps_awl, dp->dccps_iss)); + + inet->id = dp->dccps_iss ^ jiffies; + + err = dccp_connect(sk); + rt = NULL; + if (err != 0) + goto failure; +out: + return err; +failure: + /* + * This unhashes the socket and releases the local port, if necessary. + */ + dccp_set_state(sk, DCCP_CLOSED); + ip_rt_put(rt); + sk->sk_route_caps = 0; + inet->dport = 0; + goto out; +} + +/* + * This routine does path mtu discovery as defined in RFC1191. + */ +static inline void dccp_do_pmtu_discovery(struct sock *sk, + const struct iphdr *iph, + u32 mtu) +{ + struct dst_entry *dst; + const struct inet_sock *inet = inet_sk(sk); + const struct dccp_sock *dp = dccp_sk(sk); + + /* We are not interested in DCCP_LISTEN and request_socks (RESPONSEs + * send out by Linux are always < 576bytes so they should go through + * unfragmented). + */ + if (sk->sk_state == DCCP_LISTEN) + return; + + /* We don't check in the destentry if pmtu discovery is forbidden + * on this route. We just assume that no packet_to_big packets + * are send back when pmtu discovery is not active. + * There is a small race when the user changes this flag in the + * route, but I think that's acceptable. + */ + if ((dst = __sk_dst_check(sk, 0)) == NULL) + return; + + dst->ops->update_pmtu(dst, mtu); + + /* Something is about to be wrong... Remember soft error + * for the case, if this connection will not able to recover. + */ + if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) + sk->sk_err_soft = EMSGSIZE; + + mtu = dst_mtu(dst); + + if (inet->pmtudisc != IP_PMTUDISC_DONT && + dp->dccps_pmtu_cookie > mtu) { + dccp_sync_mss(sk, mtu); + + /* + * From: draft-ietf-dccp-spec-11.txt + * + * DCCP-Sync packets are the best choice for upward + * probing, since DCCP-Sync probes do not risk application + * data loss. + */ + dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC); + } /* else let the usual retransmit timer handle it */ +} + +static void dccp_v4_ctl_send_ack(struct sk_buff *rxskb) +{ + int err; + struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh; + const int dccp_hdr_ack_len = sizeof(struct dccp_hdr) + + sizeof(struct dccp_hdr_ext) + + sizeof(struct dccp_hdr_ack_bits); + struct sk_buff *skb; + + if (((struct rtable *)rxskb->dst)->rt_type != RTN_LOCAL) + return; + + skb = alloc_skb(MAX_DCCP_HEADER + 15, GFP_ATOMIC); + if (skb == NULL) + return; + + /* Reserve space for headers. */ + skb_reserve(skb, MAX_DCCP_HEADER); + + skb->dst = dst_clone(rxskb->dst); + + skb->h.raw = skb_push(skb, dccp_hdr_ack_len); + dh = dccp_hdr(skb); + memset(dh, 0, dccp_hdr_ack_len); + + /* Build DCCP header and checksum it. */ + dh->dccph_type = DCCP_PKT_ACK; + dh->dccph_sport = rxdh->dccph_dport; + dh->dccph_dport = rxdh->dccph_sport; + dh->dccph_doff = dccp_hdr_ack_len / 4; + dh->dccph_x = 1; + + dccp_hdr_set_seq(dh, DCCP_SKB_CB(rxskb)->dccpd_ack_seq); + dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), + DCCP_SKB_CB(rxskb)->dccpd_seq); + + bh_lock_sock(dccp_ctl_socket->sk); + err = ip_build_and_send_pkt(skb, dccp_ctl_socket->sk, + rxskb->nh.iph->daddr, + rxskb->nh.iph->saddr, NULL); + bh_unlock_sock(dccp_ctl_socket->sk); + + if (err == NET_XMIT_CN || err == 0) { + DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS); + DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS); + } +} + +static void dccp_v4_reqsk_send_ack(struct sk_buff *skb, + struct request_sock *req) +{ + dccp_v4_ctl_send_ack(skb); +} + +static int dccp_v4_send_response(struct sock *sk, struct request_sock *req, + struct dst_entry *dst) +{ + int err = -1; + struct sk_buff *skb; + + /* First, grab a route. */ + + if (dst == NULL && (dst = inet_csk_route_req(sk, req)) == NULL) + goto out; + + skb = dccp_make_response(sk, dst, req); + if (skb != NULL) { + const struct inet_request_sock *ireq = inet_rsk(req); + + err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr, + ireq->rmt_addr, + ireq->opt); + if (err == NET_XMIT_CN) + err = 0; + } + +out: + dst_release(dst); + return err; +} + +/* + * This routine is called by the ICMP module when it gets some sort of error + * condition. If err < 0 then the socket should be closed and the error + * returned to the user. If err > 0 it's just the icmp type << 8 | icmp code. + * After adjustment header points to the first 8 bytes of the tcp header. We + * need to find the appropriate port. + * + * The locking strategy used here is very "optimistic". When someone else + * accesses the socket the ICMP is just dropped and for some paths there is no + * check at all. A more general error queue to queue errors for later handling + * is probably better. + */ +void dccp_v4_err(struct sk_buff *skb, u32 info) +{ + const struct iphdr *iph = (struct iphdr *)skb->data; + const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data + + (iph->ihl << 2)); + struct dccp_sock *dp; + struct inet_sock *inet; + const int type = skb->h.icmph->type; + const int code = skb->h.icmph->code; + struct sock *sk; + __u64 seq; + int err; + + if (skb->len < (iph->ihl << 2) + 8) { + ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + return; + } + + sk = inet_lookup(&dccp_hashinfo, iph->daddr, dh->dccph_dport, + iph->saddr, dh->dccph_sport, inet_iif(skb)); + if (sk == NULL) { + ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + return; + } + + if (sk->sk_state == DCCP_TIME_WAIT) { + inet_twsk_put((struct inet_timewait_sock *)sk); + return; + } + + bh_lock_sock(sk); + /* If too many ICMPs get dropped on busy + * servers this needs to be solved differently. + */ + if (sock_owned_by_user(sk)) + NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS); + + if (sk->sk_state == DCCP_CLOSED) + goto out; + + dp = dccp_sk(sk); + seq = dccp_hdr_seq(skb); + if (sk->sk_state != DCCP_LISTEN && + !between48(seq, dp->dccps_swl, dp->dccps_swh)) { + NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS); + goto out; + } + + switch (type) { + case ICMP_SOURCE_QUENCH: + /* Just silently ignore these. */ + goto out; + case ICMP_PARAMETERPROB: + err = EPROTO; + break; + case ICMP_DEST_UNREACH: + if (code > NR_ICMP_UNREACH) + goto out; + + if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ + if (!sock_owned_by_user(sk)) + dccp_do_pmtu_discovery(sk, iph, info); + goto out; + } + + err = icmp_err_convert[code].errno; + break; + case ICMP_TIME_EXCEEDED: + err = EHOSTUNREACH; + break; + default: + goto out; + } + + switch (sk->sk_state) { + struct request_sock *req , **prev; + case DCCP_LISTEN: + if (sock_owned_by_user(sk)) + goto out; + req = inet_csk_search_req(sk, &prev, dh->dccph_dport, + iph->daddr, iph->saddr); + if (!req) + goto out; + + /* + * ICMPs are not backlogged, hence we cannot get an established + * socket here. + */ + BUG_TRAP(!req->sk); + + if (seq != dccp_rsk(req)->dreq_iss) { + NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS); + goto out; + } + /* + * Still in RESPOND, just remove it silently. + * There is no good way to pass the error to the newly + * created socket, and POSIX does not want network + * errors returned from accept(). + */ + inet_csk_reqsk_queue_drop(sk, req, prev); + goto out; + + case DCCP_REQUESTING: + case DCCP_RESPOND: + if (!sock_owned_by_user(sk)) { + DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS); + sk->sk_err = err; + + sk->sk_error_report(sk); + + dccp_done(sk); + } else + sk->sk_err_soft = err; + goto out; + } + + /* If we've already connected we will keep trying + * until we time out, or the user gives up. + * + * rfc1122 4.2.3.9 allows to consider as hard errors + * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, + * but it is obsoleted by pmtu discovery). + * + * Note, that in modern internet, where routing is unreliable + * and in each dark corner broken firewalls sit, sending random + * errors ordered by their masters even this two messages finally lose + * their original sense (even Linux sends invalid PORT_UNREACHs) + * + * Now we are in compliance with RFCs. + * --ANK (980905) + */ + + inet = inet_sk(sk); + if (!sock_owned_by_user(sk) && inet->recverr) { + sk->sk_err = err; + sk->sk_error_report(sk); + } else /* Only an error on timeout */ + sk->sk_err_soft = err; +out: + bh_unlock_sock(sk); + sock_put(sk); +} + +int dccp_v4_send_reset(struct sock *sk, enum dccp_reset_codes code) +{ + struct sk_buff *skb; + /* + * FIXME: what if rebuild_header fails? + * Should we be doing a rebuild_header here? + */ + int err = inet_sk_rebuild_header(sk); + + if (err != 0) + return err; + + skb = dccp_make_reset(sk, sk->sk_dst_cache, code); + if (skb != NULL) { + const struct dccp_sock *dp = dccp_sk(sk); + const struct inet_sock *inet = inet_sk(sk); + + err = ip_build_and_send_pkt(skb, sk, + inet->saddr, inet->daddr, NULL); + if (err == NET_XMIT_CN) + err = 0; + + ccid_hc_rx_exit(dp->dccps_hc_rx_ccid, sk); + ccid_hc_tx_exit(dp->dccps_hc_tx_ccid, sk); + } + + return err; +} + +static inline u64 dccp_v4_init_sequence(const struct sock *sk, + const struct sk_buff *skb) +{ + return secure_dccp_sequence_number(skb->nh.iph->daddr, + skb->nh.iph->saddr, + dccp_hdr(skb)->dccph_dport, + dccp_hdr(skb)->dccph_sport); +} + +int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) +{ + struct inet_request_sock *ireq; + struct dccp_sock dp; + struct request_sock *req; + struct dccp_request_sock *dreq; + const __u32 saddr = skb->nh.iph->saddr; + const __u32 daddr = skb->nh.iph->daddr; + struct dst_entry *dst = NULL; + + /* Never answer to DCCP_PKT_REQUESTs send to broadcast or multicast */ + if (((struct rtable *)skb->dst)->rt_flags & + (RTCF_BROADCAST | RTCF_MULTICAST)) + goto drop; + + /* + * TW buckets are converted to open requests without + * limitations, they conserve resources and peer is + * evidently real one. + */ + if (inet_csk_reqsk_queue_is_full(sk)) + goto drop; + + /* + * Accept backlog is full. If we have already queued enough + * of warm entries in syn queue, drop request. It is better than + * clogging syn queue with openreqs with exponentially increasing + * timeout. + */ + if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) + goto drop; + + req = reqsk_alloc(sk->sk_prot->rsk_prot); + if (req == NULL) + goto drop; + + /* FIXME: process options */ + + dccp_openreq_init(req, &dp, skb); + + ireq = inet_rsk(req); + ireq->loc_addr = daddr; + ireq->rmt_addr = saddr; + /* FIXME: Merge Aristeu's option parsing code when ready */ + req->rcv_wnd = 100; /* Fake, option parsing will get the + right value */ + ireq->opt = NULL; + + /* + * Step 3: Process LISTEN state + * + * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie + * + * In fact we defer setting S.GSR, S.SWL, S.SWH to + * dccp_create_openreq_child. + */ + dreq = dccp_rsk(req); + dreq->dreq_isr = DCCP_SKB_CB(skb)->dccpd_seq; + dreq->dreq_iss = dccp_v4_init_sequence(sk, skb); + dreq->dreq_service = dccp_hdr_request(skb)->dccph_req_service; + + if (dccp_v4_send_response(sk, req, dst)) + goto drop_and_free; + + inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); + return 0; + +drop_and_free: + /* + * FIXME: should be reqsk_free after implementing req->rsk_ops + */ + __reqsk_free(req); +drop: + DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS); + return -1; +} + +/* + * The three way handshake has completed - we got a valid ACK or DATAACK - + * now create the new socket. + * + * This is the equivalent of TCP's tcp_v4_syn_recv_sock + */ +struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst) +{ + struct inet_request_sock *ireq; + struct inet_sock *newinet; + struct dccp_sock *newdp; + struct sock *newsk; + + if (sk_acceptq_is_full(sk)) + goto exit_overflow; + + if (dst == NULL && (dst = inet_csk_route_req(sk, req)) == NULL) + goto exit; + + newsk = dccp_create_openreq_child(sk, req, skb); + if (newsk == NULL) + goto exit; + + sk_setup_caps(newsk, dst); + + newdp = dccp_sk(newsk); + newinet = inet_sk(newsk); + ireq = inet_rsk(req); + newinet->daddr = ireq->rmt_addr; + newinet->rcv_saddr = ireq->loc_addr; + newinet->saddr = ireq->loc_addr; + newinet->opt = ireq->opt; + ireq->opt = NULL; + newinet->mc_index = inet_iif(skb); + newinet->mc_ttl = skb->nh.iph->ttl; + newinet->id = jiffies; + + dccp_sync_mss(newsk, dst_mtu(dst)); + + __inet_hash(&dccp_hashinfo, newsk, 0); + __inet_inherit_port(&dccp_hashinfo, sk, newsk); + + return newsk; + +exit_overflow: + NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS); +exit: + NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS); + dst_release(dst); + return NULL; +} + +static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) +{ + const struct dccp_hdr *dh = dccp_hdr(skb); + const struct iphdr *iph = skb->nh.iph; + struct sock *nsk; + struct request_sock **prev; + /* Find possible connection requests. */ + struct request_sock *req = inet_csk_search_req(sk, &prev, + dh->dccph_sport, + iph->saddr, iph->daddr); + if (req != NULL) + return dccp_check_req(sk, skb, req, prev); + + nsk = __inet_lookup_established(&dccp_hashinfo, + iph->saddr, dh->dccph_sport, + iph->daddr, ntohs(dh->dccph_dport), + inet_iif(skb)); + if (nsk != NULL) { + if (nsk->sk_state != DCCP_TIME_WAIT) { + bh_lock_sock(nsk); + return nsk; + } + inet_twsk_put((struct inet_timewait_sock *)nsk); + return NULL; + } + + return sk; +} + +int dccp_v4_checksum(const struct sk_buff *skb, const u32 saddr, + const u32 daddr) +{ + const struct dccp_hdr* dh = dccp_hdr(skb); + int checksum_len; + u32 tmp; + + if (dh->dccph_cscov == 0) + checksum_len = skb->len; + else { + checksum_len = (dh->dccph_cscov + dh->dccph_x) * sizeof(u32); + checksum_len = checksum_len < skb->len ? checksum_len : + skb->len; + } + + tmp = csum_partial((unsigned char *)dh, checksum_len, 0); + return csum_tcpudp_magic(saddr, daddr, checksum_len, + IPPROTO_DCCP, tmp); +} + +static int dccp_v4_verify_checksum(struct sk_buff *skb, + const u32 saddr, const u32 daddr) +{ + struct dccp_hdr *dh = dccp_hdr(skb); + int checksum_len; + u32 tmp; + + if (dh->dccph_cscov == 0) + checksum_len = skb->len; + else { + checksum_len = (dh->dccph_cscov + dh->dccph_x) * sizeof(u32); + checksum_len = checksum_len < skb->len ? checksum_len : + skb->len; + } + tmp = csum_partial((unsigned char *)dh, checksum_len, 0); + return csum_tcpudp_magic(saddr, daddr, checksum_len, + IPPROTO_DCCP, tmp) == 0 ? 0 : -1; +} + +static struct dst_entry* dccp_v4_route_skb(struct sock *sk, + struct sk_buff *skb) +{ + struct rtable *rt; + struct flowi fl = { .oif = ((struct rtable *)skb->dst)->rt_iif, + .nl_u = { .ip4_u = + { .daddr = skb->nh.iph->saddr, + .saddr = skb->nh.iph->daddr, + .tos = RT_CONN_FLAGS(sk) } }, + .proto = sk->sk_protocol, + .uli_u = { .ports = + { .sport = dccp_hdr(skb)->dccph_dport, + .dport = dccp_hdr(skb)->dccph_sport } + } + }; + + if (ip_route_output_flow(&rt, &fl, sk, 0)) { + IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); + return NULL; + } + + return &rt->u.dst; +} + +static void dccp_v4_ctl_send_reset(struct sk_buff *rxskb) +{ + int err; + struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh; + const int dccp_hdr_reset_len = sizeof(struct dccp_hdr) + + sizeof(struct dccp_hdr_ext) + + sizeof(struct dccp_hdr_reset); + struct sk_buff *skb; + struct dst_entry *dst; + u64 seqno; + + /* Never send a reset in response to a reset. */ + if (rxdh->dccph_type == DCCP_PKT_RESET) + return; + + if (((struct rtable *)rxskb->dst)->rt_type != RTN_LOCAL) + return; + + dst = dccp_v4_route_skb(dccp_ctl_socket->sk, rxskb); + if (dst == NULL) + return; + + skb = alloc_skb(MAX_DCCP_HEADER + 15, GFP_ATOMIC); + if (skb == NULL) + goto out; + + /* Reserve space for headers. */ + skb_reserve(skb, MAX_DCCP_HEADER); + skb->dst = dst_clone(dst); + + skb->h.raw = skb_push(skb, dccp_hdr_reset_len); + dh = dccp_hdr(skb); + memset(dh, 0, dccp_hdr_reset_len); + + /* Build DCCP header and checksum it. */ + dh->dccph_type = DCCP_PKT_RESET; + dh->dccph_sport = rxdh->dccph_dport; + dh->dccph_dport = rxdh->dccph_sport; + dh->dccph_doff = dccp_hdr_reset_len / 4; + dh->dccph_x = 1; + dccp_hdr_reset(skb)->dccph_reset_code = + DCCP_SKB_CB(rxskb)->dccpd_reset_code; + + /* See "8.3.1. Abnormal Termination" in draft-ietf-dccp-spec-11 */ + seqno = 0; + if (DCCP_SKB_CB(rxskb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) + dccp_set_seqno(&seqno, DCCP_SKB_CB(rxskb)->dccpd_ack_seq + 1); + + dccp_hdr_set_seq(dh, seqno); + dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), + DCCP_SKB_CB(rxskb)->dccpd_seq); + + dh->dccph_checksum = dccp_v4_checksum(skb, rxskb->nh.iph->saddr, + rxskb->nh.iph->daddr); + + bh_lock_sock(dccp_ctl_socket->sk); + err = ip_build_and_send_pkt(skb, dccp_ctl_socket->sk, + rxskb->nh.iph->daddr, + rxskb->nh.iph->saddr, NULL); + bh_unlock_sock(dccp_ctl_socket->sk); + + if (err == NET_XMIT_CN || err == 0) { + DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS); + DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS); + } +out: + dst_release(dst); +} + +int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) +{ + struct dccp_hdr *dh = dccp_hdr(skb); + + if (sk->sk_state == DCCP_OPEN) { /* Fast path */ + if (dccp_rcv_established(sk, skb, dh, skb->len)) + goto reset; + return 0; + } + + /* + * Step 3: Process LISTEN state + * If S.state == LISTEN, + * If P.type == Request or P contains a valid Init Cookie + * option, + * * Must scan the packet's options to check for an Init + * Cookie. Only the Init Cookie is processed here, + * however; other options are processed in Step 8. This + * scan need only be performed if the endpoint uses Init + * Cookies * + * * Generate a new socket and switch to that socket * + * Set S := new socket for this port pair + * S.state = RESPOND + * Choose S.ISS (initial seqno) or set from Init Cookie + * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie + * Continue with S.state == RESPOND + * * A Response packet will be generated in Step 11 * + * Otherwise, + * Generate Reset(No Connection) unless P.type == Reset + * Drop packet and return + * + * NOTE: the check for the packet types is done in + * dccp_rcv_state_process + */ + if (sk->sk_state == DCCP_LISTEN) { + struct sock *nsk = dccp_v4_hnd_req(sk, skb); + + if (nsk == NULL) + goto discard; + + if (nsk != sk) { + if (dccp_child_process(sk, nsk, skb)) + goto reset; + return 0; + } + } + + if (dccp_rcv_state_process(sk, skb, dh, skb->len)) + goto reset; + return 0; + +reset: + DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; + dccp_v4_ctl_send_reset(skb); +discard: + kfree_skb(skb); + return 0; +} + +static inline int dccp_invalid_packet(struct sk_buff *skb) +{ + const struct dccp_hdr *dh; + + if (skb->pkt_type != PACKET_HOST) + return 1; + + if (!pskb_may_pull(skb, sizeof(struct dccp_hdr))) { + LIMIT_NETDEBUG(KERN_WARNING "DCCP: pskb_may_pull failed\n"); + return 1; + } + + dh = dccp_hdr(skb); + + /* If the packet type is not understood, drop packet and return */ + if (dh->dccph_type >= DCCP_PKT_INVALID) { + LIMIT_NETDEBUG(KERN_WARNING "DCCP: invalid packet type\n"); + return 1; + } + + /* + * If P.Data Offset is too small for packet type, or too large for + * packet, drop packet and return + */ + if (dh->dccph_doff < dccp_hdr_len(skb) / sizeof(u32)) { + LIMIT_NETDEBUG(KERN_WARNING "DCCP: P.Data Offset(%u) " + "too small 1\n", + dh->dccph_doff); + return 1; + } + + if (!pskb_may_pull(skb, dh->dccph_doff * sizeof(u32))) { + LIMIT_NETDEBUG(KERN_WARNING "DCCP: P.Data Offset(%u) " + "too small 2\n", + dh->dccph_doff); + return 1; + } + + dh = dccp_hdr(skb); + + /* + * If P.type is not Data, Ack, or DataAck and P.X == 0 (the packet + * has short sequence numbers), drop packet and return + */ + if (dh->dccph_x == 0 && + dh->dccph_type != DCCP_PKT_DATA && + dh->dccph_type != DCCP_PKT_ACK && + dh->dccph_type != DCCP_PKT_DATAACK) { + LIMIT_NETDEBUG(KERN_WARNING "DCCP: P.type (%s) not Data, Ack " + "nor DataAck and P.X == 0\n", + dccp_packet_name(dh->dccph_type)); + return 1; + } + + /* If the header checksum is incorrect, drop packet and return */ + if (dccp_v4_verify_checksum(skb, skb->nh.iph->saddr, + skb->nh.iph->daddr) < 0) { + LIMIT_NETDEBUG(KERN_WARNING "DCCP: header checksum is " + "incorrect\n"); + return 1; + } + + return 0; +} + +/* this is called when real data arrives */ +int dccp_v4_rcv(struct sk_buff *skb) +{ + const struct dccp_hdr *dh; + struct sock *sk; + int rc; + + /* Step 1: Check header basics: */ + + if (dccp_invalid_packet(skb)) + goto discard_it; + + dh = dccp_hdr(skb); +#if 0 + /* + * Use something like this to simulate some DATA/DATAACK loss to test + * dccp_ackpkts_add, you'll get something like this on a session that + * sends 10 DATA/DATAACK packets: + * + * ackpkts_print: 281473596467422 |0,0|3,0|0,0|3,0|0,0|3,0|0,0|3,0|0,1| + * + * 0, 0 means: DCCP_ACKPKTS_STATE_RECEIVED, RLE == just this packet + * 0, 1 means: DCCP_ACKPKTS_STATE_RECEIVED, RLE == two adjacent packets + * with the same state + * 3, 0 means: DCCP_ACKPKTS_STATE_NOT_RECEIVED, RLE == just this packet + * + * So... + * + * 281473596467422 was received + * 281473596467421 was not received + * 281473596467420 was received + * 281473596467419 was not received + * 281473596467418 was received + * 281473596467417 was not received + * 281473596467416 was received + * 281473596467415 was not received + * 281473596467414 was received + * 281473596467413 was received (this one was the 3way handshake + * RESPONSE) + * + */ + if (dh->dccph_type == DCCP_PKT_DATA || + dh->dccph_type == DCCP_PKT_DATAACK) { + static int discard = 0; + + if (discard) { + discard = 0; + goto discard_it; + } + discard = 1; + } +#endif + DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(skb); + DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type; + + dccp_pr_debug("%8.8s " + "src=%u.%u.%u.%u@%-5d " + "dst=%u.%u.%u.%u@%-5d seq=%llu", + dccp_packet_name(dh->dccph_type), + NIPQUAD(skb->nh.iph->saddr), ntohs(dh->dccph_sport), + NIPQUAD(skb->nh.iph->daddr), ntohs(dh->dccph_dport), + (unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq); + + if (dccp_packet_without_ack(skb)) { + DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ; + dccp_pr_debug_cat("\n"); + } else { + DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb); + dccp_pr_debug_cat(", ack=%llu\n", + (unsigned long long) + DCCP_SKB_CB(skb)->dccpd_ack_seq); + } + + /* Step 2: + * Look up flow ID in table and get corresponding socket */ + sk = __inet_lookup(&dccp_hashinfo, + skb->nh.iph->saddr, dh->dccph_sport, + skb->nh.iph->daddr, ntohs(dh->dccph_dport), + inet_iif(skb)); + + /* + * Step 2: + * If no socket ... + * Generate Reset(No Connection) unless P.type == Reset + * Drop packet and return + */ + if (sk == NULL) { + dccp_pr_debug("failed to look up flow ID in table and " + "get corresponding socket\n"); + goto no_dccp_socket; + } + + /* + * Step 2: + * ... or S.state == TIMEWAIT, + * Generate Reset(No Connection) unless P.type == Reset + * Drop packet and return + */ + + if (sk->sk_state == DCCP_TIME_WAIT) { + dccp_pr_debug("sk->sk_state == DCCP_TIME_WAIT: " + "do_time_wait\n"); + goto do_time_wait; + } + + if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { + dccp_pr_debug("xfrm4_policy_check failed\n"); + goto discard_and_relse; + } + + if (sk_filter(sk, skb, 0)) { + dccp_pr_debug("sk_filter failed\n"); + goto discard_and_relse; + } + + skb->dev = NULL; + + bh_lock_sock(sk); + rc = 0; + if (!sock_owned_by_user(sk)) + rc = dccp_v4_do_rcv(sk, skb); + else + sk_add_backlog(sk, skb); + bh_unlock_sock(sk); + + sock_put(sk); + return rc; + +no_dccp_socket: + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto discard_it; + /* + * Step 2: + * Generate Reset(No Connection) unless P.type == Reset + * Drop packet and return + */ + if (dh->dccph_type != DCCP_PKT_RESET) { + DCCP_SKB_CB(skb)->dccpd_reset_code = + DCCP_RESET_CODE_NO_CONNECTION; + dccp_v4_ctl_send_reset(skb); + } + +discard_it: + /* Discard frame. */ + kfree_skb(skb); + return 0; + +discard_and_relse: + sock_put(sk); + goto discard_it; + +do_time_wait: + inet_twsk_put((struct inet_timewait_sock *)sk); + goto no_dccp_socket; +} + +static int dccp_v4_init_sock(struct sock *sk) +{ + struct dccp_sock *dp = dccp_sk(sk); + static int dccp_ctl_socket_init = 1; + + dccp_options_init(&dp->dccps_options); + + if (dp->dccps_options.dccpo_send_ack_vector) { + dp->dccps_hc_rx_ackpkts = + dccp_ackpkts_alloc(DCCP_MAX_ACK_VECTOR_LEN, + GFP_KERNEL); + + if (dp->dccps_hc_rx_ackpkts == NULL) + return -ENOMEM; + } + + /* + * FIXME: We're hardcoding the CCID, and doing this at this point makes + * the listening (master) sock get CCID control blocks, which is not + * necessary, but for now, to not mess with the test userspace apps, + * lets leave it here, later the real solution is to do this in a + * setsockopt(CCIDs-I-want/accept). -acme + */ + if (likely(!dccp_ctl_socket_init)) { + dp->dccps_hc_rx_ccid = ccid_init(dp->dccps_options.dccpo_ccid, + sk); + dp->dccps_hc_tx_ccid = ccid_init(dp->dccps_options.dccpo_ccid, + sk); + if (dp->dccps_hc_rx_ccid == NULL || + dp->dccps_hc_tx_ccid == NULL) { + ccid_exit(dp->dccps_hc_rx_ccid, sk); + ccid_exit(dp->dccps_hc_tx_ccid, sk); + dccp_ackpkts_free(dp->dccps_hc_rx_ackpkts); + dp->dccps_hc_rx_ackpkts = NULL; + dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; + return -ENOMEM; + } + } else + dccp_ctl_socket_init = 0; + + dccp_init_xmit_timers(sk); + inet_csk(sk)->icsk_rto = DCCP_TIMEOUT_INIT; + sk->sk_state = DCCP_CLOSED; + sk->sk_write_space = dccp_write_space; + dp->dccps_mss_cache = 536; + dp->dccps_role = DCCP_ROLE_UNDEFINED; + + return 0; +} + +static int dccp_v4_destroy_sock(struct sock *sk) +{ + struct dccp_sock *dp = dccp_sk(sk); + + /* + * DCCP doesn't use sk_qrite_queue, just sk_send_head + * for retransmissions + */ + if (sk->sk_send_head != NULL) { + kfree_skb(sk->sk_send_head); + sk->sk_send_head = NULL; + } + + /* Clean up a referenced DCCP bind bucket. */ + if (inet_csk(sk)->icsk_bind_hash != NULL) + inet_put_port(&dccp_hashinfo, sk); + + ccid_hc_rx_exit(dp->dccps_hc_rx_ccid, sk); + ccid_hc_tx_exit(dp->dccps_hc_tx_ccid, sk); + dccp_ackpkts_free(dp->dccps_hc_rx_ackpkts); + dp->dccps_hc_rx_ackpkts = NULL; + ccid_exit(dp->dccps_hc_rx_ccid, sk); + ccid_exit(dp->dccps_hc_tx_ccid, sk); + dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; + + return 0; +} + +static void dccp_v4_reqsk_destructor(struct request_sock *req) +{ + kfree(inet_rsk(req)->opt); +} + +static struct request_sock_ops dccp_request_sock_ops = { + .family = PF_INET, + .obj_size = sizeof(struct dccp_request_sock), + .rtx_syn_ack = dccp_v4_send_response, + .send_ack = dccp_v4_reqsk_send_ack, + .destructor = dccp_v4_reqsk_destructor, + .send_reset = dccp_v4_ctl_send_reset, +}; + +struct proto dccp_v4_prot = { + .name = "DCCP", + .owner = THIS_MODULE, + .close = dccp_close, + .connect = dccp_v4_connect, + .disconnect = dccp_disconnect, + .ioctl = dccp_ioctl, + .init = dccp_v4_init_sock, + .setsockopt = dccp_setsockopt, + .getsockopt = dccp_getsockopt, + .sendmsg = dccp_sendmsg, + .recvmsg = dccp_recvmsg, + .backlog_rcv = dccp_v4_do_rcv, + .hash = dccp_v4_hash, + .unhash = dccp_v4_unhash, + .accept = inet_csk_accept, + .get_port = dccp_v4_get_port, + .shutdown = dccp_shutdown, + .destroy = dccp_v4_destroy_sock, + .orphan_count = &dccp_orphan_count, + .max_header = MAX_DCCP_HEADER, + .obj_size = sizeof(struct dccp_sock), + .rsk_prot = &dccp_request_sock_ops, + .twsk_obj_size = sizeof(struct inet_timewait_sock), +}; diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c new file mode 100644 index 00000000000..ce5dff4ac22 --- /dev/null +++ b/net/dccp/minisocks.c @@ -0,0 +1,264 @@ +/* + * net/dccp/minisocks.c + * + * An implementation of the DCCP protocol + * Arnaldo Carvalho de Melo <acme@conectiva.com.br> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <linux/dccp.h> +#include <linux/skbuff.h> +#include <linux/timer.h> + +#include <net/sock.h> +#include <net/xfrm.h> +#include <net/inet_timewait_sock.h> + +#include "ccid.h" +#include "dccp.h" + +struct inet_timewait_death_row dccp_death_row = { + .sysctl_max_tw_buckets = NR_FILE * 2, + .period = DCCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS, + .death_lock = SPIN_LOCK_UNLOCKED, + .hashinfo = &dccp_hashinfo, + .tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0, + (unsigned long)&dccp_death_row), + .twkill_work = __WORK_INITIALIZER(dccp_death_row.twkill_work, + inet_twdr_twkill_work, + &dccp_death_row), +/* Short-time timewait calendar */ + + .twcal_hand = -1, + .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0, + (unsigned long)&dccp_death_row), +}; + +void dccp_time_wait(struct sock *sk, int state, int timeo) +{ + struct inet_timewait_sock *tw = NULL; + + if (dccp_death_row.tw_count < dccp_death_row.sysctl_max_tw_buckets) + tw = inet_twsk_alloc(sk, state); + + if (tw != NULL) { + const struct inet_connection_sock *icsk = inet_csk(sk); + const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); + + /* Linkage updates. */ + __inet_twsk_hashdance(tw, sk, &dccp_hashinfo); + + /* Get the TIME_WAIT timeout firing. */ + if (timeo < rto) + timeo = rto; + + tw->tw_timeout = DCCP_TIMEWAIT_LEN; + if (state == DCCP_TIME_WAIT) + timeo = DCCP_TIMEWAIT_LEN; + + inet_twsk_schedule(tw, &dccp_death_row, timeo, + DCCP_TIMEWAIT_LEN); + inet_twsk_put(tw); + } else { + /* Sorry, if we're out of memory, just CLOSE this + * socket up. We've got bigger problems than + * non-graceful socket closings. + */ + LIMIT_NETDEBUG(KERN_INFO "DCCP: time wait bucket " + "table overflow\n"); + } + + dccp_done(sk); +} + +struct sock *dccp_create_openreq_child(struct sock *sk, + const struct request_sock *req, + const struct sk_buff *skb) +{ + /* + * Step 3: Process LISTEN state + * + * // Generate a new socket and switch to that socket + * Set S := new socket for this port pair + */ + struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC); + + if (newsk != NULL) { + const struct dccp_request_sock *dreq = dccp_rsk(req); + struct inet_connection_sock *newicsk = inet_csk(sk); + struct dccp_sock *newdp = dccp_sk(newsk); + + newdp->dccps_hc_rx_ackpkts = NULL; + newdp->dccps_role = DCCP_ROLE_SERVER; + newicsk->icsk_rto = DCCP_TIMEOUT_INIT; + + if (newdp->dccps_options.dccpo_send_ack_vector) { + newdp->dccps_hc_rx_ackpkts = + dccp_ackpkts_alloc(DCCP_MAX_ACK_VECTOR_LEN, + GFP_ATOMIC); + /* + * XXX: We're using the same CCIDs set on the parent, + * i.e. sk_clone copied the master sock and left the + * CCID pointers for this child, that is why we do the + * __ccid_get calls. + */ + if (unlikely(newdp->dccps_hc_rx_ackpkts == NULL)) + goto out_free; + } + + if (unlikely(ccid_hc_rx_init(newdp->dccps_hc_rx_ccid, + newsk) != 0 || + ccid_hc_tx_init(newdp->dccps_hc_tx_ccid, + newsk) != 0)) { + dccp_ackpkts_free(newdp->dccps_hc_rx_ackpkts); + ccid_hc_rx_exit(newdp->dccps_hc_rx_ccid, newsk); + ccid_hc_tx_exit(newdp->dccps_hc_tx_ccid, newsk); +out_free: + /* It is still raw copy of parent, so invalidate + * destructor and make plain sk_free() */ + newsk->sk_destruct = NULL; + sk_free(newsk); + return NULL; + } + + __ccid_get(newdp->dccps_hc_rx_ccid); + __ccid_get(newdp->dccps_hc_tx_ccid); + + /* + * Step 3: Process LISTEN state + * + * Choose S.ISS (initial seqno) or set from Init Cookie + * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init + * Cookie + */ + + /* See dccp_v4_conn_request */ + newdp->dccps_options.dccpo_sequence_window = req->rcv_wnd; + + newdp->dccps_gar = newdp->dccps_isr = dreq->dreq_isr; + dccp_update_gsr(newsk, dreq->dreq_isr); + + newdp->dccps_iss = dreq->dreq_iss; + dccp_update_gss(newsk, dreq->dreq_iss); + + /* + * SWL and AWL are initially adjusted so that they are not less than + * the initial Sequence Numbers received and sent, respectively: + * SWL := max(GSR + 1 - floor(W/4), ISR), + * AWL := max(GSS - W' + 1, ISS). + * These adjustments MUST be applied only at the beginning of the + * connection. + */ + dccp_set_seqno(&newdp->dccps_swl, + max48(newdp->dccps_swl, newdp->dccps_isr)); + dccp_set_seqno(&newdp->dccps_awl, + max48(newdp->dccps_awl, newdp->dccps_iss)); + + dccp_init_xmit_timers(newsk); + + DCCP_INC_STATS_BH(DCCP_MIB_PASSIVEOPENS); + } + return newsk; +} + +/* + * Process an incoming packet for RESPOND sockets represented + * as an request_sock. + */ +struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct request_sock **prev) +{ + struct sock *child = NULL; + + /* Check for retransmitted REQUEST */ + if (dccp_hdr(skb)->dccph_type == DCCP_PKT_REQUEST) { + if (after48(DCCP_SKB_CB(skb)->dccpd_seq, + dccp_rsk(req)->dreq_isr)) { + struct dccp_request_sock *dreq = dccp_rsk(req); + + dccp_pr_debug("Retransmitted REQUEST\n"); + /* Send another RESPONSE packet */ + dccp_set_seqno(&dreq->dreq_iss, dreq->dreq_iss + 1); + dccp_set_seqno(&dreq->dreq_isr, + DCCP_SKB_CB(skb)->dccpd_seq); + req->rsk_ops->rtx_syn_ack(sk, req, NULL); + } + /* Network Duplicate, discard packet */ + return NULL; + } + + DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR; + + if (dccp_hdr(skb)->dccph_type != DCCP_PKT_ACK && + dccp_hdr(skb)->dccph_type != DCCP_PKT_DATAACK) + goto drop; + + /* Invalid ACK */ + if (DCCP_SKB_CB(skb)->dccpd_ack_seq != dccp_rsk(req)->dreq_iss) { + dccp_pr_debug("Invalid ACK number: ack_seq=%llu, " + "dreq_iss=%llu\n", + (unsigned long long) + DCCP_SKB_CB(skb)->dccpd_ack_seq, + (unsigned long long) + dccp_rsk(req)->dreq_iss); + goto drop; + } + + child = dccp_v4_request_recv_sock(sk, skb, req, NULL); + if (child == NULL) + goto listen_overflow; + + /* FIXME: deal with options */ + + inet_csk_reqsk_queue_unlink(sk, req, prev); + inet_csk_reqsk_queue_removed(sk, req); + inet_csk_reqsk_queue_add(sk, req, child); +out: + return child; +listen_overflow: + dccp_pr_debug("listen_overflow!\n"); + DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY; +drop: + if (dccp_hdr(skb)->dccph_type != DCCP_PKT_RESET) + req->rsk_ops->send_reset(skb); + + inet_csk_reqsk_queue_drop(sk, req, prev); + goto out; +} + +/* + * Queue segment on the new socket if the new socket is active, + * otherwise we just shortcircuit this and continue with + * the new socket. + */ +int dccp_child_process(struct sock *parent, struct sock *child, + struct sk_buff *skb) +{ + int ret = 0; + const int state = child->sk_state; + + if (!sock_owned_by_user(child)) { + ret = dccp_rcv_state_process(child, skb, dccp_hdr(skb), + skb->len); + + /* Wakeup parent, send SIGIO */ + if (state == DCCP_RESPOND && child->sk_state != state) + parent->sk_data_ready(parent, 0); + } else { + /* Alas, it is possible again, because we do lookup + * in main socket hash table and lock on listening + * socket does not protect us more. + */ + sk_add_backlog(child, skb); + } + + bh_unlock_sock(child); + sock_put(child); + return ret; +} diff --git a/net/dccp/options.c b/net/dccp/options.c new file mode 100644 index 00000000000..382c5894acb --- /dev/null +++ b/net/dccp/options.c @@ -0,0 +1,855 @@ +/* + * net/dccp/options.c + * + * An implementation of the DCCP protocol + * Copyright (c) 2005 Aristeu Sergio Rozanski Filho <aris@cathedrallabs.org> + * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net> + * Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/config.h> +#include <linux/dccp.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> + +#include "ccid.h" +#include "dccp.h" + +static void dccp_ackpkts_check_rcv_ackvector(struct dccp_ackpkts *ap, + struct sock *sk, + const u64 ackno, + const unsigned char len, + const unsigned char *vector); + +/* stores the default values for new connection. may be changed with sysctl */ +static const struct dccp_options dccpo_default_values = { + .dccpo_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW, + .dccpo_ccid = DCCPF_INITIAL_CCID, + .dccpo_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR, + .dccpo_send_ndp_count = DCCPF_INITIAL_SEND_NDP_COUNT, +}; + +void dccp_options_init(struct dccp_options *dccpo) +{ + memcpy(dccpo, &dccpo_default_values, sizeof(*dccpo)); +} + +static u32 dccp_decode_value_var(const unsigned char *bf, const u8 len) +{ + u32 value = 0; + + if (len > 3) + value += *bf++ << 24; + if (len > 2) + value += *bf++ << 16; + if (len > 1) + value += *bf++ << 8; + if (len > 0) + value += *bf; + + return value; +} + +int dccp_parse_options(struct sock *sk, struct sk_buff *skb) +{ + struct dccp_sock *dp = dccp_sk(sk); +#ifdef CONFIG_IP_DCCP_DEBUG + const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ? + "CLIENT rx opt: " : "server rx opt: "; +#endif + const struct dccp_hdr *dh = dccp_hdr(skb); + const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type; + unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb); + unsigned char *opt_ptr = options; + const unsigned char *opt_end = (unsigned char *)dh + + (dh->dccph_doff * 4); + struct dccp_options_received *opt_recv = &dp->dccps_options_received; + unsigned char opt, len; + unsigned char *value; + + memset(opt_recv, 0, sizeof(*opt_recv)); + + while (opt_ptr != opt_end) { + opt = *opt_ptr++; + len = 0; + value = NULL; + + /* Check if this isn't a single byte option */ + if (opt > DCCPO_MAX_RESERVED) { + if (opt_ptr == opt_end) + goto out_invalid_option; + + len = *opt_ptr++; + if (len < 3) + goto out_invalid_option; + /* + * Remove the type and len fields, leaving + * just the value size + */ + len -= 2; + value = opt_ptr; + opt_ptr += len; + + if (opt_ptr > opt_end) + goto out_invalid_option; + } + + switch (opt) { + case DCCPO_PADDING: + break; + case DCCPO_NDP_COUNT: + if (len > 3) + goto out_invalid_option; + + opt_recv->dccpor_ndp = dccp_decode_value_var(value, len); + dccp_pr_debug("%sNDP count=%d\n", debug_prefix, + opt_recv->dccpor_ndp); + break; + case DCCPO_ACK_VECTOR_0: + if (len > DCCP_MAX_ACK_VECTOR_LEN) + goto out_invalid_option; + + if (pkt_type == DCCP_PKT_DATA) + continue; + + opt_recv->dccpor_ack_vector_len = len; + opt_recv->dccpor_ack_vector_idx = value - options; + + dccp_pr_debug("%sACK vector 0, len=%d, ack_ackno=%llu\n", + debug_prefix, len, + (unsigned long long) + DCCP_SKB_CB(skb)->dccpd_ack_seq); + dccp_ackvector_print(DCCP_SKB_CB(skb)->dccpd_ack_seq, + value, len); + dccp_ackpkts_check_rcv_ackvector(dp->dccps_hc_rx_ackpkts, + sk, + DCCP_SKB_CB(skb)->dccpd_ack_seq, + len, value); + break; + case DCCPO_TIMESTAMP: + if (len != 4) + goto out_invalid_option; + + opt_recv->dccpor_timestamp = ntohl(*(u32 *)value); + + dp->dccps_timestamp_echo = opt_recv->dccpor_timestamp; + do_gettimeofday(&dp->dccps_timestamp_time); + + dccp_pr_debug("%sTIMESTAMP=%u, ackno=%llu\n", + debug_prefix, opt_recv->dccpor_timestamp, + (unsigned long long) + DCCP_SKB_CB(skb)->dccpd_ack_seq); + break; + case DCCPO_TIMESTAMP_ECHO: + if (len != 4 && len != 6 && len != 8) + goto out_invalid_option; + + opt_recv->dccpor_timestamp_echo = ntohl(*(u32 *)value); + + dccp_pr_debug("%sTIMESTAMP_ECHO=%u, len=%d, ackno=%llu, ", + debug_prefix, + opt_recv->dccpor_timestamp_echo, + len + 2, + (unsigned long long) + DCCP_SKB_CB(skb)->dccpd_ack_seq); + + if (len > 4) { + if (len == 6) + opt_recv->dccpor_elapsed_time = + ntohs(*(u16 *)(value + 4)); + else + opt_recv->dccpor_elapsed_time = + ntohl(*(u32 *)(value + 4)); + + dccp_pr_debug("%sTIMESTAMP_ECHO ELAPSED_TIME=%d\n", + debug_prefix, + opt_recv->dccpor_elapsed_time); + } + break; + case DCCPO_ELAPSED_TIME: + if (len != 2 && len != 4) + goto out_invalid_option; + + if (pkt_type == DCCP_PKT_DATA) + continue; + + if (len == 2) + opt_recv->dccpor_elapsed_time = + ntohs(*(u16 *)value); + else + opt_recv->dccpor_elapsed_time = + ntohl(*(u32 *)value); + + dccp_pr_debug("%sELAPSED_TIME=%d\n", debug_prefix, + opt_recv->dccpor_elapsed_time); + break; + /* + * From draft-ietf-dccp-spec-11.txt: + * + * Option numbers 128 through 191 are for + * options sent from the HC-Sender to the + * HC-Receiver; option numbers 192 through 255 + * are for options sent from the HC-Receiver to + * the HC-Sender. + */ + case 128 ... 191: { + const u16 idx = value - options; + + if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk, + opt, len, idx, + value) != 0) + goto out_invalid_option; + } + break; + case 192 ... 255: { + const u16 idx = value - options; + + if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk, + opt, len, idx, + value) != 0) + goto out_invalid_option; + } + break; + default: + pr_info("DCCP(%p): option %d(len=%d) not " + "implemented, ignoring\n", + sk, opt, len); + break; + } + } + + return 0; + +out_invalid_option: + DCCP_INC_STATS_BH(DCCP_MIB_INVALIDOPT); + DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_OPTION_ERROR; + pr_info("DCCP(%p): invalid option %d, len=%d\n", sk, opt, len); + return -1; +} + +static void dccp_encode_value_var(const u32 value, unsigned char *to, + const unsigned int len) +{ + if (len > 3) + *to++ = (value & 0xFF000000) >> 24; + if (len > 2) + *to++ = (value & 0xFF0000) >> 16; + if (len > 1) + *to++ = (value & 0xFF00) >> 8; + if (len > 0) + *to++ = (value & 0xFF); +} + +static inline int dccp_ndp_len(const int ndp) +{ + return likely(ndp <= 0xFF) ? 1 : ndp <= 0xFFFF ? 2 : 3; +} + +void dccp_insert_option(struct sock *sk, struct sk_buff *skb, + const unsigned char option, + const void *value, const unsigned char len) +{ + unsigned char *to; + + if (DCCP_SKB_CB(skb)->dccpd_opt_len + len + 2 > DCCP_MAX_OPT_LEN) { + LIMIT_NETDEBUG(KERN_INFO "DCCP: packet too small to insert " + "%d option!\n", option); + return; + } + + DCCP_SKB_CB(skb)->dccpd_opt_len += len + 2; + + to = skb_push(skb, len + 2); + *to++ = option; + *to++ = len + 2; + + memcpy(to, value, len); +} + +EXPORT_SYMBOL_GPL(dccp_insert_option); + +static void dccp_insert_option_ndp(struct sock *sk, struct sk_buff *skb) +{ + struct dccp_sock *dp = dccp_sk(sk); + int ndp = dp->dccps_ndp_count; + + if (dccp_non_data_packet(skb)) + ++dp->dccps_ndp_count; + else + dp->dccps_ndp_count = 0; + + if (ndp > 0) { + unsigned char *ptr; + const int ndp_len = dccp_ndp_len(ndp); + const int len = ndp_len + 2; + + if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) + return; + + DCCP_SKB_CB(skb)->dccpd_opt_len += len; + + ptr = skb_push(skb, len); + *ptr++ = DCCPO_NDP_COUNT; + *ptr++ = len; + dccp_encode_value_var(ndp, ptr, ndp_len); + } +} + +static inline int dccp_elapsed_time_len(const u32 elapsed_time) +{ + return elapsed_time == 0 ? 0 : elapsed_time <= 0xFFFF ? 2 : 4; +} + +void dccp_insert_option_elapsed_time(struct sock *sk, + struct sk_buff *skb, + u32 elapsed_time) +{ +#ifdef CONFIG_IP_DCCP_DEBUG + struct dccp_sock *dp = dccp_sk(sk); + const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ? + "CLIENT TX opt: " : "server TX opt: "; +#endif + const int elapsed_time_len = dccp_elapsed_time_len(elapsed_time); + const int len = 2 + elapsed_time_len; + unsigned char *to; + + if (elapsed_time_len == 0) + return; + + if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) { + LIMIT_NETDEBUG(KERN_INFO "DCCP: packet too small to " + "insert elapsed time!\n"); + return; + } + + DCCP_SKB_CB(skb)->dccpd_opt_len += len; + + to = skb_push(skb, len); + *to++ = DCCPO_ELAPSED_TIME; + *to++ = len; + + if (elapsed_time_len == 2) { + const u16 var16 = htons((u16)elapsed_time); + memcpy(to, &var16, 2); + } else { + const u32 var32 = htonl(elapsed_time); + memcpy(to, &var32, 4); + } + + dccp_pr_debug("%sELAPSED_TIME=%u, len=%d, seqno=%llu\n", + debug_prefix, elapsed_time, + len, + (unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq); +} + +EXPORT_SYMBOL_GPL(dccp_insert_option_elapsed_time); + +static void dccp_insert_option_ack_vector(struct sock *sk, struct sk_buff *skb) +{ + struct dccp_sock *dp = dccp_sk(sk); +#ifdef CONFIG_IP_DCCP_DEBUG + const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ? + "CLIENT TX opt: " : "server TX opt: "; +#endif + struct dccp_ackpkts *ap = dp->dccps_hc_rx_ackpkts; + int len = ap->dccpap_buf_vector_len + 2; + const u32 elapsed_time = timeval_now_delta(&ap->dccpap_time) / 10; + unsigned char *to, *from; + + if (elapsed_time != 0) + dccp_insert_option_elapsed_time(sk, skb, elapsed_time); + + if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) { + LIMIT_NETDEBUG(KERN_INFO "DCCP: packet too small to " + "insert ACK Vector!\n"); + return; + } + + /* + * XXX: now we have just one ack vector sent record, so + * we have to wait for it to be cleared. + * + * Of course this is not acceptable, but this is just for + * basic testing now. + */ + if (ap->dccpap_ack_seqno != DCCP_MAX_SEQNO + 1) + return; + + DCCP_SKB_CB(skb)->dccpd_opt_len += len; + + to = skb_push(skb, len); + *to++ = DCCPO_ACK_VECTOR_0; + *to++ = len; + + len = ap->dccpap_buf_vector_len; + from = ap->dccpap_buf + ap->dccpap_buf_head; + + /* Check if buf_head wraps */ + if (ap->dccpap_buf_head + len > ap->dccpap_buf_len) { + const unsigned int tailsize = (ap->dccpap_buf_len - + ap->dccpap_buf_head); + + memcpy(to, from, tailsize); + to += tailsize; + len -= tailsize; + from = ap->dccpap_buf; + } + + memcpy(to, from, len); + /* + * From draft-ietf-dccp-spec-11.txt: + * + * For each acknowledgement it sends, the HC-Receiver will add an + * acknowledgement record. ack_seqno will equal the HC-Receiver + * sequence number it used for the ack packet; ack_ptr will equal + * buf_head; ack_ackno will equal buf_ackno; and ack_nonce will + * equal buf_nonce. + * + * This implemention uses just one ack record for now. + */ + ap->dccpap_ack_seqno = DCCP_SKB_CB(skb)->dccpd_seq; + ap->dccpap_ack_ptr = ap->dccpap_buf_head; + ap->dccpap_ack_ackno = ap->dccpap_buf_ackno; + ap->dccpap_ack_nonce = ap->dccpap_buf_nonce; + ap->dccpap_ack_vector_len = ap->dccpap_buf_vector_len; + + dccp_pr_debug("%sACK Vector 0, len=%d, ack_seqno=%llu, " + "ack_ackno=%llu\n", + debug_prefix, ap->dccpap_ack_vector_len, + (unsigned long long) ap->dccpap_ack_seqno, + (unsigned long long) ap->dccpap_ack_ackno); +} + +void dccp_insert_option_timestamp(struct sock *sk, struct sk_buff *skb) +{ + struct timeval tv; + u32 now; + + do_gettimeofday(&tv); + now = (tv.tv_sec * USEC_PER_SEC + tv.tv_usec) / 10; + /* yes this will overflow but that is the point as we want a + * 10 usec 32 bit timer which mean it wraps every 11.9 hours */ + + now = htonl(now); + dccp_insert_option(sk, skb, DCCPO_TIMESTAMP, &now, sizeof(now)); +} + +EXPORT_SYMBOL_GPL(dccp_insert_option_timestamp); + +static void dccp_insert_option_timestamp_echo(struct sock *sk, + struct sk_buff *skb) +{ + struct dccp_sock *dp = dccp_sk(sk); +#ifdef CONFIG_IP_DCCP_DEBUG + const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ? + "CLIENT TX opt: " : "server TX opt: "; +#endif + u32 tstamp_echo; + const u32 elapsed_time = + timeval_now_delta(&dp->dccps_timestamp_time) / 10; + const int elapsed_time_len = dccp_elapsed_time_len(elapsed_time); + const int len = 6 + elapsed_time_len; + unsigned char *to; + + if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) { + LIMIT_NETDEBUG(KERN_INFO "DCCP: packet too small to insert " + "timestamp echo!\n"); + return; + } + + DCCP_SKB_CB(skb)->dccpd_opt_len += len; + + to = skb_push(skb, len); + *to++ = DCCPO_TIMESTAMP_ECHO; + *to++ = len; + + tstamp_echo = htonl(dp->dccps_timestamp_echo); + memcpy(to, &tstamp_echo, 4); + to += 4; + + if (elapsed_time_len == 2) { + const u16 var16 = htons((u16)elapsed_time); + memcpy(to, &var16, 2); + } else if (elapsed_time_len == 4) { + const u32 var32 = htonl(elapsed_time); + memcpy(to, &var32, 4); + } + + dccp_pr_debug("%sTIMESTAMP_ECHO=%u, len=%d, seqno=%llu\n", + debug_prefix, dp->dccps_timestamp_echo, + len, + (unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq); + + dp->dccps_timestamp_echo = 0; + dp->dccps_timestamp_time.tv_sec = 0; + dp->dccps_timestamp_time.tv_usec = 0; +} + +void dccp_insert_options(struct sock *sk, struct sk_buff *skb) +{ + struct dccp_sock *dp = dccp_sk(sk); + + DCCP_SKB_CB(skb)->dccpd_opt_len = 0; + + if (dp->dccps_options.dccpo_send_ndp_count) + dccp_insert_option_ndp(sk, skb); + + if (!dccp_packet_without_ack(skb)) { + if (dp->dccps_options.dccpo_send_ack_vector && + (dp->dccps_hc_rx_ackpkts->dccpap_buf_ackno != + DCCP_MAX_SEQNO + 1)) + dccp_insert_option_ack_vector(sk, skb); + + if (dp->dccps_timestamp_echo != 0) + dccp_insert_option_timestamp_echo(sk, skb); + } + + ccid_hc_rx_insert_options(dp->dccps_hc_rx_ccid, sk, skb); + ccid_hc_tx_insert_options(dp->dccps_hc_tx_ccid, sk, skb); + + /* XXX: insert other options when appropriate */ + + if (DCCP_SKB_CB(skb)->dccpd_opt_len != 0) { + /* The length of all options has to be a multiple of 4 */ + int padding = DCCP_SKB_CB(skb)->dccpd_opt_len % 4; + + if (padding != 0) { + padding = 4 - padding; + memset(skb_push(skb, padding), 0, padding); + DCCP_SKB_CB(skb)->dccpd_opt_len += padding; + } + } +} + +struct dccp_ackpkts *dccp_ackpkts_alloc(const unsigned int len, + const unsigned int __nocast priority) +{ + struct dccp_ackpkts *ap = kmalloc(sizeof(*ap) + len, priority); + + if (ap != NULL) { +#ifdef CONFIG_IP_DCCP_DEBUG + memset(ap->dccpap_buf, 0xFF, len); +#endif + ap->dccpap_buf_len = len; + ap->dccpap_buf_head = + ap->dccpap_buf_tail = + ap->dccpap_buf_len - 1; + ap->dccpap_buf_ackno = + ap->dccpap_ack_ackno = + ap->dccpap_ack_seqno = DCCP_MAX_SEQNO + 1; + ap->dccpap_buf_nonce = ap->dccpap_buf_nonce = 0; + ap->dccpap_ack_ptr = 0; + ap->dccpap_time.tv_sec = 0; + ap->dccpap_time.tv_usec = 0; + ap->dccpap_buf_vector_len = ap->dccpap_ack_vector_len = 0; + } + + return ap; +} + +void dccp_ackpkts_free(struct dccp_ackpkts *ap) +{ + if (ap != NULL) { +#ifdef CONFIG_IP_DCCP_DEBUG + memset(ap, 0xFF, sizeof(*ap) + ap->dccpap_buf_len); +#endif + kfree(ap); + } +} + +static inline u8 dccp_ackpkts_state(const struct dccp_ackpkts *ap, + const unsigned int index) +{ + return ap->dccpap_buf[index] & DCCP_ACKPKTS_STATE_MASK; +} + +static inline u8 dccp_ackpkts_len(const struct dccp_ackpkts *ap, + const unsigned int index) +{ + return ap->dccpap_buf[index] & DCCP_ACKPKTS_LEN_MASK; +} + +/* + * If several packets are missing, the HC-Receiver may prefer to enter multiple + * bytes with run length 0, rather than a single byte with a larger run length; + * this simplifies table updates if one of the missing packets arrives. + */ +static inline int dccp_ackpkts_set_buf_head_state(struct dccp_ackpkts *ap, + const unsigned int packets, + const unsigned char state) +{ + unsigned int gap; + signed long new_head; + + if (ap->dccpap_buf_vector_len + packets > ap->dccpap_buf_len) + return -ENOBUFS; + + gap = packets - 1; + new_head = ap->dccpap_buf_head - packets; + + if (new_head < 0) { + if (gap > 0) { + memset(ap->dccpap_buf, DCCP_ACKPKTS_STATE_NOT_RECEIVED, + gap + new_head + 1); + gap = -new_head; + } + new_head += ap->dccpap_buf_len; + } + + ap->dccpap_buf_head = new_head; + + if (gap > 0) + memset(ap->dccpap_buf + ap->dccpap_buf_head + 1, + DCCP_ACKPKTS_STATE_NOT_RECEIVED, gap); + + ap->dccpap_buf[ap->dccpap_buf_head] = state; + ap->dccpap_buf_vector_len += packets; + return 0; +} + +/* + * Implements the draft-ietf-dccp-spec-11.txt Appendix A + */ +int dccp_ackpkts_add(struct dccp_ackpkts *ap, u64 ackno, u8 state) +{ + /* + * Check at the right places if the buffer is full, if it is, tell the + * caller to start dropping packets till the HC-Sender acks our ACK + * vectors, when we will free up space in dccpap_buf. + * + * We may well decide to do buffer compression, etc, but for now lets + * just drop. + * + * From Appendix A: + * + * Of course, the circular buffer may overflow, either when the + * HC-Sender is sending data at a very high rate, when the + * HC-Receiver's acknowledgements are not reaching the HC-Sender, + * or when the HC-Sender is forgetting to acknowledge those acks + * (so the HC-Receiver is unable to clean up old state). In this + * case, the HC-Receiver should either compress the buffer (by + * increasing run lengths when possible), transfer its state to + * a larger buffer, or, as a last resort, drop all received + * packets, without processing them whatsoever, until its buffer + * shrinks again. + */ + + /* See if this is the first ackno being inserted */ + if (ap->dccpap_buf_vector_len == 0) { + ap->dccpap_buf[ap->dccpap_buf_head] = state; + ap->dccpap_buf_vector_len = 1; + } else if (after48(ackno, ap->dccpap_buf_ackno)) { + const u64 delta = dccp_delta_seqno(ap->dccpap_buf_ackno, + ackno); + + /* + * Look if the state of this packet is the same as the + * previous ackno and if so if we can bump the head len. + */ + if (delta == 1 && + dccp_ackpkts_state(ap, ap->dccpap_buf_head) == state && + (dccp_ackpkts_len(ap, ap->dccpap_buf_head) < + DCCP_ACKPKTS_LEN_MASK)) + ap->dccpap_buf[ap->dccpap_buf_head]++; + else if (dccp_ackpkts_set_buf_head_state(ap, delta, state)) + return -ENOBUFS; + } else { + /* + * A.1.2. Old Packets + * + * When a packet with Sequence Number S arrives, and + * S <= buf_ackno, the HC-Receiver will scan the table + * for the byte corresponding to S. (Indexing structures + * could reduce the complexity of this scan.) + */ + u64 delta = dccp_delta_seqno(ackno, ap->dccpap_buf_ackno); + unsigned int index = ap->dccpap_buf_head; + + while (1) { + const u8 len = dccp_ackpkts_len(ap, index); + const u8 state = dccp_ackpkts_state(ap, index); + /* + * valid packets not yet in dccpap_buf have a reserved + * entry, with a len equal to 0. + */ + if (state == DCCP_ACKPKTS_STATE_NOT_RECEIVED && + len == 0 && delta == 0) { /* Found our + reserved seat! */ + dccp_pr_debug("Found %llu reserved seat!\n", + (unsigned long long) ackno); + ap->dccpap_buf[index] = state; + goto out; + } + /* len == 0 means one packet */ + if (delta < len + 1) + goto out_duplicate; + + delta -= len + 1; + if (++index == ap->dccpap_buf_len) + index = 0; + } + } + + ap->dccpap_buf_ackno = ackno; + do_gettimeofday(&ap->dccpap_time); +out: + dccp_pr_debug(""); + dccp_ackpkts_print(ap); + return 0; + +out_duplicate: + /* Duplicate packet */ + dccp_pr_debug("Received a dup or already considered lost " + "packet: %llu\n", (unsigned long long) ackno); + return -EILSEQ; +} + +#ifdef CONFIG_IP_DCCP_DEBUG +void dccp_ackvector_print(const u64 ackno, const unsigned char *vector, + int len) +{ + if (!dccp_debug) + return; + + printk("ACK vector len=%d, ackno=%llu |", len, + (unsigned long long) ackno); + + while (len--) { + const u8 state = (*vector & DCCP_ACKPKTS_STATE_MASK) >> 6; + const u8 rl = (*vector & DCCP_ACKPKTS_LEN_MASK); + + printk("%d,%d|", state, rl); + ++vector; + } + + printk("\n"); +} + +void dccp_ackpkts_print(const struct dccp_ackpkts *ap) +{ + dccp_ackvector_print(ap->dccpap_buf_ackno, + ap->dccpap_buf + ap->dccpap_buf_head, + ap->dccpap_buf_vector_len); +} +#endif + +static void dccp_ackpkts_trow_away_ack_record(struct dccp_ackpkts *ap) +{ + /* + * As we're keeping track of the ack vector size + * (dccpap_buf_vector_len) and the sent ack vector size + * (dccpap_ack_vector_len) we don't need dccpap_buf_tail at all, but + * keep this code here as in the future we'll implement a vector of + * ack records, as suggested in draft-ietf-dccp-spec-11.txt + * Appendix A. -acme + */ +#if 0 + ap->dccpap_buf_tail = ap->dccpap_ack_ptr + 1; + if (ap->dccpap_buf_tail >= ap->dccpap_buf_len) + ap->dccpap_buf_tail -= ap->dccpap_buf_len; +#endif + ap->dccpap_buf_vector_len -= ap->dccpap_ack_vector_len; +} + +void dccp_ackpkts_check_rcv_ackno(struct dccp_ackpkts *ap, struct sock *sk, + u64 ackno) +{ + /* Check if we actually sent an ACK vector */ + if (ap->dccpap_ack_seqno == DCCP_MAX_SEQNO + 1) + return; + + if (ackno == ap->dccpap_ack_seqno) { +#ifdef CONFIG_IP_DCCP_DEBUG + struct dccp_sock *dp = dccp_sk(sk); + const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ? + "CLIENT rx ack: " : "server rx ack: "; +#endif + dccp_pr_debug("%sACK packet 0, len=%d, ack_seqno=%llu, " + "ack_ackno=%llu, ACKED!\n", + debug_prefix, 1, + (unsigned long long) ap->dccpap_ack_seqno, + (unsigned long long) ap->dccpap_ack_ackno); + dccp_ackpkts_trow_away_ack_record(ap); + ap->dccpap_ack_seqno = DCCP_MAX_SEQNO + 1; + } +} + +static void dccp_ackpkts_check_rcv_ackvector(struct dccp_ackpkts *ap, + struct sock *sk, u64 ackno, + const unsigned char len, + const unsigned char *vector) +{ + unsigned char i; + + /* Check if we actually sent an ACK vector */ + if (ap->dccpap_ack_seqno == DCCP_MAX_SEQNO + 1) + return; + /* + * We're in the receiver half connection, so if the received an ACK + * vector ackno (e.g. 50) before dccpap_ack_seqno (e.g. 52), we're + * not interested. + * + * Extra explanation with example: + * + * if we received an ACK vector with ackno 50, it can only be acking + * 50, 49, 48, etc, not 52 (the seqno for the ACK vector we sent). + */ + /* dccp_pr_debug("is %llu < %llu? ", ackno, ap->dccpap_ack_seqno); */ + if (before48(ackno, ap->dccpap_ack_seqno)) { + /* dccp_pr_debug_cat("yes\n"); */ + return; + } + /* dccp_pr_debug_cat("no\n"); */ + + i = len; + while (i--) { + const u8 rl = (*vector & DCCP_ACKPKTS_LEN_MASK); + u64 ackno_end_rl; + + dccp_set_seqno(&ackno_end_rl, ackno - rl); + + /* + * dccp_pr_debug("is %llu <= %llu <= %llu? ", ackno_end_rl, + * ap->dccpap_ack_seqno, ackno); + */ + if (between48(ap->dccpap_ack_seqno, ackno_end_rl, ackno)) { + const u8 state = (*vector & + DCCP_ACKPKTS_STATE_MASK) >> 6; + /* dccp_pr_debug_cat("yes\n"); */ + + if (state != DCCP_ACKPKTS_STATE_NOT_RECEIVED) { +#ifdef CONFIG_IP_DCCP_DEBUG + struct dccp_sock *dp = dccp_sk(sk); + const char *debug_prefix = + dp->dccps_role == DCCP_ROLE_CLIENT ? + "CLIENT rx ack: " : "server rx ack: "; +#endif + dccp_pr_debug("%sACK vector 0, len=%d, " + "ack_seqno=%llu, ack_ackno=%llu, " + "ACKED!\n", + debug_prefix, len, + (unsigned long long) + ap->dccpap_ack_seqno, + (unsigned long long) + ap->dccpap_ack_ackno); + dccp_ackpkts_trow_away_ack_record(ap); + } + /* + * If dccpap_ack_seqno was not received, no problem + * we'll send another ACK vector. + */ + ap->dccpap_ack_seqno = DCCP_MAX_SEQNO + 1; + break; + } + /* dccp_pr_debug_cat("no\n"); */ + + dccp_set_seqno(&ackno, ackno_end_rl - 1); + ++vector; + } +} diff --git a/net/dccp/output.c b/net/dccp/output.c new file mode 100644 index 00000000000..28de157a432 --- /dev/null +++ b/net/dccp/output.c @@ -0,0 +1,528 @@ +/* + * net/dccp/output.c + * + * An implementation of the DCCP protocol + * Arnaldo Carvalho de Melo <acme@conectiva.com.br> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <linux/dccp.h> +#include <linux/skbuff.h> + +#include <net/sock.h> + +#include "ccid.h" +#include "dccp.h" + +static inline void dccp_event_ack_sent(struct sock *sk) +{ + inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); +} + +/* + * All SKB's seen here are completely headerless. It is our + * job to build the DCCP header, and pass the packet down to + * IP so it can do the same plus pass the packet off to the + * device. + */ +int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) +{ + if (likely(skb != NULL)) { + const struct inet_sock *inet = inet_sk(sk); + struct dccp_sock *dp = dccp_sk(sk); + struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); + struct dccp_hdr *dh; + /* XXX For now we're using only 48 bits sequence numbers */ + const int dccp_header_size = sizeof(*dh) + + sizeof(struct dccp_hdr_ext) + + dccp_packet_hdr_len(dcb->dccpd_type); + int err, set_ack = 1; + u64 ackno = dp->dccps_gsr; + + dccp_inc_seqno(&dp->dccps_gss); + + switch (dcb->dccpd_type) { + case DCCP_PKT_DATA: + set_ack = 0; + break; + case DCCP_PKT_SYNC: + case DCCP_PKT_SYNCACK: + ackno = dcb->dccpd_seq; + break; + } + + dcb->dccpd_seq = dp->dccps_gss; + dccp_insert_options(sk, skb); + + skb->h.raw = skb_push(skb, dccp_header_size); + dh = dccp_hdr(skb); + /* + * Data packets are not cloned as they are never retransmitted + */ + if (skb_cloned(skb)) + skb_set_owner_w(skb, sk); + + /* Build DCCP header and checksum it. */ + memset(dh, 0, dccp_header_size); + dh->dccph_type = dcb->dccpd_type; + dh->dccph_sport = inet->sport; + dh->dccph_dport = inet->dport; + dh->dccph_doff = (dccp_header_size + dcb->dccpd_opt_len) / 4; + dh->dccph_ccval = dcb->dccpd_ccval; + /* XXX For now we're using only 48 bits sequence numbers */ + dh->dccph_x = 1; + + dp->dccps_awh = dp->dccps_gss; + dccp_hdr_set_seq(dh, dp->dccps_gss); + if (set_ack) + dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), ackno); + + switch (dcb->dccpd_type) { + case DCCP_PKT_REQUEST: + dccp_hdr_request(skb)->dccph_req_service = + dcb->dccpd_service; + break; + case DCCP_PKT_RESET: + dccp_hdr_reset(skb)->dccph_reset_code = + dcb->dccpd_reset_code; + break; + } + + dh->dccph_checksum = dccp_v4_checksum(skb, inet->saddr, + inet->daddr); + + if (set_ack) + dccp_event_ack_sent(sk); + + DCCP_INC_STATS(DCCP_MIB_OUTSEGS); + + err = ip_queue_xmit(skb, 0); + if (err <= 0) + return err; + + /* NET_XMIT_CN is special. It does not guarantee, + * that this packet is lost. It tells that device + * is about to start to drop packets or already + * drops some packets of the same priority and + * invokes us to send less aggressively. + */ + return err == NET_XMIT_CN ? 0 : err; + } + return -ENOBUFS; +} + +unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) +{ + struct dccp_sock *dp = dccp_sk(sk); + int mss_now; + + /* + * FIXME: we really should be using the af_specific thing to support + * IPv6. + * mss_now = pmtu - tp->af_specific->net_header_len - + * sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext); + */ + mss_now = pmtu - sizeof(struct iphdr) - sizeof(struct dccp_hdr) - + sizeof(struct dccp_hdr_ext); + + /* Now subtract optional transport overhead */ + mss_now -= dp->dccps_ext_header_len; + + /* + * FIXME: this should come from the CCID infrastructure, where, say, + * TFRC will say it wants TIMESTAMPS, ELAPSED time, etc, for now lets + * put a rough estimate for NDP + TIMESTAMP + TIMESTAMP_ECHO + ELAPSED + * TIME + TFRC_OPT_LOSS_EVENT_RATE + TFRC_OPT_RECEIVE_RATE + padding to + * make it a multiple of 4 + */ + + mss_now -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4; + + /* And store cached results */ + dp->dccps_pmtu_cookie = pmtu; + dp->dccps_mss_cache = mss_now; + + return mss_now; +} + +void dccp_write_space(struct sock *sk) +{ + read_lock(&sk->sk_callback_lock); + + if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + wake_up_interruptible(sk->sk_sleep); + /* Should agree with poll, otherwise some programs break */ + if (sock_writeable(sk)) + sk_wake_async(sk, 2, POLL_OUT); + + read_unlock(&sk->sk_callback_lock); +} + +/** + * dccp_wait_for_ccid - Wait for ccid to tell us we can send a packet + * @sk: socket to wait for + * @timeo: for how long + */ +static int dccp_wait_for_ccid(struct sock *sk, struct sk_buff *skb, + long *timeo) +{ + struct dccp_sock *dp = dccp_sk(sk); + DEFINE_WAIT(wait); + long delay; + int rc; + + while (1) { + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + + if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) + goto do_error; + if (!*timeo) + goto do_nonblock; + if (signal_pending(current)) + goto do_interrupted; + + rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb, + skb->len); + if (rc <= 0) + break; + delay = msecs_to_jiffies(rc); + if (delay > *timeo || delay < 0) + goto do_nonblock; + + sk->sk_write_pending++; + release_sock(sk); + *timeo -= schedule_timeout(delay); + lock_sock(sk); + sk->sk_write_pending--; + } +out: + finish_wait(sk->sk_sleep, &wait); + return rc; + +do_error: + rc = -EPIPE; + goto out; +do_nonblock: + rc = -EAGAIN; + goto out; +do_interrupted: + rc = sock_intr_errno(*timeo); + goto out; +} + +int dccp_write_xmit(struct sock *sk, struct sk_buff *skb, long *timeo) +{ + const struct dccp_sock *dp = dccp_sk(sk); + int err = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb, + skb->len); + + if (err > 0) + err = dccp_wait_for_ccid(sk, skb, timeo); + + if (err == 0) { + const struct dccp_ackpkts *ap = dp->dccps_hc_rx_ackpkts; + struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); + const int len = skb->len; + + if (sk->sk_state == DCCP_PARTOPEN) { + /* See 8.1.5. Handshake Completion */ + inet_csk_schedule_ack(sk); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, + inet_csk(sk)->icsk_rto, + DCCP_RTO_MAX); + dcb->dccpd_type = DCCP_PKT_DATAACK; + /* + * FIXME: we really should have a + * dccps_ack_pending or use icsk. + */ + } else if (inet_csk_ack_scheduled(sk) || + dp->dccps_timestamp_echo != 0 || + (dp->dccps_options.dccpo_send_ack_vector && + ap->dccpap_buf_ackno != DCCP_MAX_SEQNO + 1 && + ap->dccpap_ack_seqno == DCCP_MAX_SEQNO + 1)) + dcb->dccpd_type = DCCP_PKT_DATAACK; + else + dcb->dccpd_type = DCCP_PKT_DATA; + + err = dccp_transmit_skb(sk, skb); + ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, 0, len); + } + + return err; +} + +int dccp_retransmit_skb(struct sock *sk, struct sk_buff *skb) +{ + if (inet_sk_rebuild_header(sk) != 0) + return -EHOSTUNREACH; /* Routing failure or similar. */ + + return dccp_transmit_skb(sk, (skb_cloned(skb) ? + pskb_copy(skb, GFP_ATOMIC): + skb_clone(skb, GFP_ATOMIC))); +} + +struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst, + struct request_sock *req) +{ + struct dccp_hdr *dh; + const int dccp_header_size = sizeof(struct dccp_hdr) + + sizeof(struct dccp_hdr_ext) + + sizeof(struct dccp_hdr_response); + struct sk_buff *skb = sock_wmalloc(sk, MAX_HEADER + DCCP_MAX_OPT_LEN + + dccp_header_size, 1, + GFP_ATOMIC); + if (skb == NULL) + return NULL; + + /* Reserve space for headers. */ + skb_reserve(skb, MAX_HEADER + DCCP_MAX_OPT_LEN + dccp_header_size); + + skb->dst = dst_clone(dst); + skb->csum = 0; + + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE; + DCCP_SKB_CB(skb)->dccpd_seq = dccp_rsk(req)->dreq_iss; + dccp_insert_options(sk, skb); + + skb->h.raw = skb_push(skb, dccp_header_size); + + dh = dccp_hdr(skb); + memset(dh, 0, dccp_header_size); + + dh->dccph_sport = inet_sk(sk)->sport; + dh->dccph_dport = inet_rsk(req)->rmt_port; + dh->dccph_doff = (dccp_header_size + + DCCP_SKB_CB(skb)->dccpd_opt_len) / 4; + dh->dccph_type = DCCP_PKT_RESPONSE; + dh->dccph_x = 1; + dccp_hdr_set_seq(dh, dccp_rsk(req)->dreq_iss); + dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dccp_rsk(req)->dreq_isr); + + dh->dccph_checksum = dccp_v4_checksum(skb, inet_rsk(req)->loc_addr, + inet_rsk(req)->rmt_addr); + + DCCP_INC_STATS(DCCP_MIB_OUTSEGS); + return skb; +} + +struct sk_buff *dccp_make_reset(struct sock *sk, struct dst_entry *dst, + const enum dccp_reset_codes code) + +{ + struct dccp_hdr *dh; + struct dccp_sock *dp = dccp_sk(sk); + const int dccp_header_size = sizeof(struct dccp_hdr) + + sizeof(struct dccp_hdr_ext) + + sizeof(struct dccp_hdr_reset); + struct sk_buff *skb = sock_wmalloc(sk, MAX_HEADER + DCCP_MAX_OPT_LEN + + dccp_header_size, 1, + GFP_ATOMIC); + if (skb == NULL) + return NULL; + + /* Reserve space for headers. */ + skb_reserve(skb, MAX_HEADER + DCCP_MAX_OPT_LEN + dccp_header_size); + + skb->dst = dst_clone(dst); + skb->csum = 0; + + dccp_inc_seqno(&dp->dccps_gss); + + DCCP_SKB_CB(skb)->dccpd_reset_code = code; + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESET; + DCCP_SKB_CB(skb)->dccpd_seq = dp->dccps_gss; + dccp_insert_options(sk, skb); + + skb->h.raw = skb_push(skb, dccp_header_size); + + dh = dccp_hdr(skb); + memset(dh, 0, dccp_header_size); + + dh->dccph_sport = inet_sk(sk)->sport; + dh->dccph_dport = inet_sk(sk)->dport; + dh->dccph_doff = (dccp_header_size + + DCCP_SKB_CB(skb)->dccpd_opt_len) / 4; + dh->dccph_type = DCCP_PKT_RESET; + dh->dccph_x = 1; + dccp_hdr_set_seq(dh, dp->dccps_gss); + dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dp->dccps_gsr); + + dccp_hdr_reset(skb)->dccph_reset_code = code; + + dh->dccph_checksum = dccp_v4_checksum(skb, inet_sk(sk)->saddr, + inet_sk(sk)->daddr); + + DCCP_INC_STATS(DCCP_MIB_OUTSEGS); + return skb; +} + +/* + * Do all connect socket setups that can be done AF independent. + */ +static inline void dccp_connect_init(struct sock *sk) +{ + struct dst_entry *dst = __sk_dst_get(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + + sk->sk_err = 0; + sock_reset_flag(sk, SOCK_DONE); + + dccp_sync_mss(sk, dst_mtu(dst)); + + /* + * FIXME: set dp->{dccps_swh,dccps_swl}, with + * something like dccp_inc_seq + */ + + icsk->icsk_retransmits = 0; +} + +int dccp_connect(struct sock *sk) +{ + struct sk_buff *skb; + struct inet_connection_sock *icsk = inet_csk(sk); + + dccp_connect_init(sk); + + skb = alloc_skb(MAX_DCCP_HEADER + 15, sk->sk_allocation); + if (unlikely(skb == NULL)) + return -ENOBUFS; + + /* Reserve space for headers. */ + skb_reserve(skb, MAX_DCCP_HEADER); + + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_REQUEST; + /* FIXME: set service to something meaningful, coming + * from userspace*/ + DCCP_SKB_CB(skb)->dccpd_service = 0; + skb->csum = 0; + skb_set_owner_w(skb, sk); + + BUG_TRAP(sk->sk_send_head == NULL); + sk->sk_send_head = skb; + dccp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL)); + DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS); + + /* Timer for repeating the REQUEST until an answer. */ + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + icsk->icsk_rto, DCCP_RTO_MAX); + return 0; +} + +void dccp_send_ack(struct sock *sk) +{ + /* If we have been reset, we may not send again. */ + if (sk->sk_state != DCCP_CLOSED) { + struct sk_buff *skb = alloc_skb(MAX_DCCP_HEADER, GFP_ATOMIC); + + if (skb == NULL) { + inet_csk_schedule_ack(sk); + inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, + TCP_DELACK_MAX, + DCCP_RTO_MAX); + return; + } + + /* Reserve space for headers */ + skb_reserve(skb, MAX_DCCP_HEADER); + skb->csum = 0; + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_ACK; + skb_set_owner_w(skb, sk); + dccp_transmit_skb(sk, skb); + } +} + +EXPORT_SYMBOL_GPL(dccp_send_ack); + +void dccp_send_delayed_ack(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + /* + * FIXME: tune this timer. elapsed time fixes the skew, so no problem + * with using 2s, and active senders also piggyback the ACK into a + * DATAACK packet, so this is really for quiescent senders. + */ + unsigned long timeout = jiffies + 2 * HZ; + + /* Use new timeout only if there wasn't a older one earlier. */ + if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) { + /* If delack timer was blocked or is about to expire, + * send ACK now. + * + * FIXME: check the "about to expire" part + */ + if (icsk->icsk_ack.blocked) { + dccp_send_ack(sk); + return; + } + + if (!time_before(timeout, icsk->icsk_ack.timeout)) + timeout = icsk->icsk_ack.timeout; + } + icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER; + icsk->icsk_ack.timeout = timeout; + sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout); +} + +void dccp_send_sync(struct sock *sk, const u64 seq, + const enum dccp_pkt_type pkt_type) +{ + /* + * We are not putting this on the write queue, so + * dccp_transmit_skb() will set the ownership to this + * sock. + */ + struct sk_buff *skb = alloc_skb(MAX_DCCP_HEADER, GFP_ATOMIC); + + if (skb == NULL) + /* FIXME: how to make sure the sync is sent? */ + return; + + /* Reserve space for headers and prepare control bits. */ + skb_reserve(skb, MAX_DCCP_HEADER); + skb->csum = 0; + DCCP_SKB_CB(skb)->dccpd_type = pkt_type; + DCCP_SKB_CB(skb)->dccpd_seq = seq; + + skb_set_owner_w(skb, sk); + dccp_transmit_skb(sk, skb); +} + +/* + * Send a DCCP_PKT_CLOSE/CLOSEREQ. The caller locks the socket for us. This + * cannot be allowed to fail queueing a DCCP_PKT_CLOSE/CLOSEREQ frame under + * any circumstances. + */ +void dccp_send_close(struct sock *sk, const int active) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct sk_buff *skb; + const unsigned int prio = active ? GFP_KERNEL : GFP_ATOMIC; + + skb = alloc_skb(sk->sk_prot->max_header, prio); + if (skb == NULL) + return; + + /* Reserve space for headers and prepare control bits. */ + skb_reserve(skb, sk->sk_prot->max_header); + skb->csum = 0; + DCCP_SKB_CB(skb)->dccpd_type = dp->dccps_role == DCCP_ROLE_CLIENT ? + DCCP_PKT_CLOSE : DCCP_PKT_CLOSEREQ; + + skb_set_owner_w(skb, sk); + if (active) { + BUG_TRAP(sk->sk_send_head == NULL); + sk->sk_send_head = skb; + dccp_transmit_skb(sk, skb_clone(skb, prio)); + } else + dccp_transmit_skb(sk, skb); + + ccid_hc_rx_exit(dp->dccps_hc_rx_ccid, sk); + ccid_hc_tx_exit(dp->dccps_hc_tx_ccid, sk); +} diff --git a/net/dccp/proto.c b/net/dccp/proto.c new file mode 100644 index 00000000000..18a0e69c9dc --- /dev/null +++ b/net/dccp/proto.c @@ -0,0 +1,826 @@ +/* + * net/dccp/proto.c + * + * An implementation of the DCCP protocol + * Arnaldo Carvalho de Melo <acme@conectiva.com.br> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/config.h> +#include <linux/dccp.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/in.h> +#include <linux/if_arp.h> +#include <linux/init.h> +#include <linux/random.h> +#include <net/checksum.h> + +#include <net/inet_common.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <net/sock.h> +#include <net/xfrm.h> + +#include <asm/semaphore.h> +#include <linux/spinlock.h> +#include <linux/timer.h> +#include <linux/delay.h> +#include <linux/poll.h> +#include <linux/dccp.h> + +#include "ccid.h" +#include "dccp.h" + +DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly; + +atomic_t dccp_orphan_count = ATOMIC_INIT(0); + +static struct net_protocol dccp_protocol = { + .handler = dccp_v4_rcv, + .err_handler = dccp_v4_err, +}; + +const char *dccp_packet_name(const int type) +{ + static const char *dccp_packet_names[] = { + [DCCP_PKT_REQUEST] = "REQUEST", + [DCCP_PKT_RESPONSE] = "RESPONSE", + [DCCP_PKT_DATA] = "DATA", + [DCCP_PKT_ACK] = "ACK", + [DCCP_PKT_DATAACK] = "DATAACK", + [DCCP_PKT_CLOSEREQ] = "CLOSEREQ", + [DCCP_PKT_CLOSE] = "CLOSE", + [DCCP_PKT_RESET] = "RESET", + [DCCP_PKT_SYNC] = "SYNC", + [DCCP_PKT_SYNCACK] = "SYNCACK", + }; + + if (type >= DCCP_NR_PKT_TYPES) + return "INVALID"; + else + return dccp_packet_names[type]; +} + +EXPORT_SYMBOL_GPL(dccp_packet_name); + +const char *dccp_state_name(const int state) +{ + static char *dccp_state_names[] = { + [DCCP_OPEN] = "OPEN", + [DCCP_REQUESTING] = "REQUESTING", + [DCCP_PARTOPEN] = "PARTOPEN", + [DCCP_LISTEN] = "LISTEN", + [DCCP_RESPOND] = "RESPOND", + [DCCP_CLOSING] = "CLOSING", + [DCCP_TIME_WAIT] = "TIME_WAIT", + [DCCP_CLOSED] = "CLOSED", + }; + + if (state >= DCCP_MAX_STATES) + return "INVALID STATE!"; + else + return dccp_state_names[state]; +} + +EXPORT_SYMBOL_GPL(dccp_state_name); + +static inline int dccp_listen_start(struct sock *sk) +{ + dccp_sk(sk)->dccps_role = DCCP_ROLE_LISTEN; + return inet_csk_listen_start(sk, TCP_SYNQ_HSIZE); +} + +int dccp_disconnect(struct sock *sk, int flags) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_sock *inet = inet_sk(sk); + int err = 0; + const int old_state = sk->sk_state; + + if (old_state != DCCP_CLOSED) + dccp_set_state(sk, DCCP_CLOSED); + + /* ABORT function of RFC793 */ + if (old_state == DCCP_LISTEN) { + inet_csk_listen_stop(sk); + /* FIXME: do the active reset thing */ + } else if (old_state == DCCP_REQUESTING) + sk->sk_err = ECONNRESET; + + dccp_clear_xmit_timers(sk); + __skb_queue_purge(&sk->sk_receive_queue); + if (sk->sk_send_head != NULL) { + __kfree_skb(sk->sk_send_head); + sk->sk_send_head = NULL; + } + + inet->dport = 0; + + if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) + inet_reset_saddr(sk); + + sk->sk_shutdown = 0; + sock_reset_flag(sk, SOCK_DONE); + + icsk->icsk_backoff = 0; + inet_csk_delack_init(sk); + __sk_dst_reset(sk); + + BUG_TRAP(!inet->num || icsk->icsk_bind_hash); + + sk->sk_error_report(sk); + return err; +} + +/* + * Wait for a DCCP event. + * + * Note that we don't need to lock the socket, as the upper poll layers + * take care of normal races (between the test and the event) and we don't + * go look at any of the socket buffers directly. + */ +static unsigned int dccp_poll(struct file *file, struct socket *sock, + poll_table *wait) +{ + unsigned int mask; + struct sock *sk = sock->sk; + + poll_wait(file, sk->sk_sleep, wait); + if (sk->sk_state == DCCP_LISTEN) + return inet_csk_listen_poll(sk); + + /* Socket is not locked. We are protected from async events + by poll logic and correct handling of state changes + made by another threads is impossible in any case. + */ + + mask = 0; + if (sk->sk_err) + mask = POLLERR; + + if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == DCCP_CLOSED) + mask |= POLLHUP; + if (sk->sk_shutdown & RCV_SHUTDOWN) + mask |= POLLIN | POLLRDNORM; + + /* Connected? */ + if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) { + if (atomic_read(&sk->sk_rmem_alloc) > 0) + mask |= POLLIN | POLLRDNORM; + + if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { + if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { + mask |= POLLOUT | POLLWRNORM; + } else { /* send SIGIO later */ + set_bit(SOCK_ASYNC_NOSPACE, + &sk->sk_socket->flags); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + + /* Race breaker. If space is freed after + * wspace test but before the flags are set, + * IO signal will be lost. + */ + if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) + mask |= POLLOUT | POLLWRNORM; + } + } + } + return mask; +} + +int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg) +{ + dccp_pr_debug("entry\n"); + return -ENOIOCTLCMD; +} + +int dccp_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, int optlen) +{ + struct dccp_sock *dp; + int err; + int val; + + if (level != SOL_DCCP) + return ip_setsockopt(sk, level, optname, optval, optlen); + + if (optlen < sizeof(int)) + return -EINVAL; + + if (get_user(val, (int __user *)optval)) + return -EFAULT; + + lock_sock(sk); + + dp = dccp_sk(sk); + err = 0; + + switch (optname) { + case DCCP_SOCKOPT_PACKET_SIZE: + dp->dccps_packet_size = val; + break; + default: + err = -ENOPROTOOPT; + break; + } + + release_sock(sk); + return err; +} + +int dccp_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct dccp_sock *dp; + int val, len; + + if (level != SOL_DCCP) + return ip_getsockopt(sk, level, optname, optval, optlen); + + if (get_user(len, optlen)) + return -EFAULT; + + len = min_t(unsigned int, len, sizeof(int)); + if (len < 0) + return -EINVAL; + + dp = dccp_sk(sk); + + switch (optname) { + case DCCP_SOCKOPT_PACKET_SIZE: + val = dp->dccps_packet_size; + break; + default: + return -ENOPROTOOPT; + } + + if (put_user(len, optlen) || copy_to_user(optval, &val, len)) + return -EFAULT; + + return 0; +} + +int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len) +{ + const struct dccp_sock *dp = dccp_sk(sk); + const int flags = msg->msg_flags; + const int noblock = flags & MSG_DONTWAIT; + struct sk_buff *skb; + int rc, size; + long timeo; + + if (len > dp->dccps_mss_cache) + return -EMSGSIZE; + + lock_sock(sk); + timeo = sock_sndtimeo(sk, noblock); + + /* + * We have to use sk_stream_wait_connect here to set sk_write_pending, + * so that the trick in dccp_rcv_request_sent_state_process. + */ + /* Wait for a connection to finish. */ + if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN | DCCPF_CLOSING)) + if ((rc = sk_stream_wait_connect(sk, &timeo)) != 0) + goto out_release; + + size = sk->sk_prot->max_header + len; + release_sock(sk); + skb = sock_alloc_send_skb(sk, size, noblock, &rc); + lock_sock(sk); + if (skb == NULL) + goto out_release; + + skb_reserve(skb, sk->sk_prot->max_header); + rc = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len); + if (rc != 0) + goto out_discard; + + rc = dccp_write_xmit(sk, skb, &timeo); + /* + * XXX we don't use sk_write_queue, so just discard the packet. + * Current plan however is to _use_ sk_write_queue with + * an algorith similar to tcp_sendmsg, where the main difference + * is that in DCCP we have to respect packet boundaries, so + * no coalescing of skbs. + * + * This bug was _quickly_ found & fixed by just looking at an OSTRA + * generated callgraph 8) -acme + */ + if (rc != 0) + goto out_discard; +out_release: + release_sock(sk); + return rc ? : len; +out_discard: + kfree_skb(skb); + goto out_release; +} + +int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len, int nonblock, int flags, int *addr_len) +{ + const struct dccp_hdr *dh; + long timeo; + + lock_sock(sk); + + if (sk->sk_state == DCCP_LISTEN) { + len = -ENOTCONN; + goto out; + } + + timeo = sock_rcvtimeo(sk, nonblock); + + do { + struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); + + if (skb == NULL) + goto verify_sock_status; + + dh = dccp_hdr(skb); + + if (dh->dccph_type == DCCP_PKT_DATA || + dh->dccph_type == DCCP_PKT_DATAACK) + goto found_ok_skb; + + if (dh->dccph_type == DCCP_PKT_RESET || + dh->dccph_type == DCCP_PKT_CLOSE) { + dccp_pr_debug("found fin ok!\n"); + len = 0; + goto found_fin_ok; + } + dccp_pr_debug("packet_type=%s\n", + dccp_packet_name(dh->dccph_type)); + sk_eat_skb(sk, skb); +verify_sock_status: + if (sock_flag(sk, SOCK_DONE)) { + len = 0; + break; + } + + if (sk->sk_err) { + len = sock_error(sk); + break; + } + + if (sk->sk_shutdown & RCV_SHUTDOWN) { + len = 0; + break; + } + + if (sk->sk_state == DCCP_CLOSED) { + if (!sock_flag(sk, SOCK_DONE)) { + /* This occurs when user tries to read + * from never connected socket. + */ + len = -ENOTCONN; + break; + } + len = 0; + break; + } + + if (!timeo) { + len = -EAGAIN; + break; + } + + if (signal_pending(current)) { + len = sock_intr_errno(timeo); + break; + } + + sk_wait_data(sk, &timeo); + continue; + found_ok_skb: + if (len > skb->len) + len = skb->len; + else if (len < skb->len) + msg->msg_flags |= MSG_TRUNC; + + if (skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len)) { + /* Exception. Bailout! */ + len = -EFAULT; + break; + } + found_fin_ok: + if (!(flags & MSG_PEEK)) + sk_eat_skb(sk, skb); + break; + } while (1); +out: + release_sock(sk); + return len; +} + +static int inet_dccp_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + unsigned char old_state; + int err; + + lock_sock(sk); + + err = -EINVAL; + if (sock->state != SS_UNCONNECTED || sock->type != SOCK_DCCP) + goto out; + + old_state = sk->sk_state; + if (!((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN))) + goto out; + + /* Really, if the socket is already in listen state + * we can only allow the backlog to be adjusted. + */ + if (old_state != DCCP_LISTEN) { + /* + * FIXME: here it probably should be sk->sk_prot->listen_start + * see tcp_listen_start + */ + err = dccp_listen_start(sk); + if (err) + goto out; + } + sk->sk_max_ack_backlog = backlog; + err = 0; + +out: + release_sock(sk); + return err; +} + +static const unsigned char dccp_new_state[] = { + /* current state: new state: action: */ + [0] = DCCP_CLOSED, + [DCCP_OPEN] = DCCP_CLOSING | DCCP_ACTION_FIN, + [DCCP_REQUESTING] = DCCP_CLOSED, + [DCCP_PARTOPEN] = DCCP_CLOSING | DCCP_ACTION_FIN, + [DCCP_LISTEN] = DCCP_CLOSED, + [DCCP_RESPOND] = DCCP_CLOSED, + [DCCP_CLOSING] = DCCP_CLOSED, + [DCCP_TIME_WAIT] = DCCP_CLOSED, + [DCCP_CLOSED] = DCCP_CLOSED, +}; + +static int dccp_close_state(struct sock *sk) +{ + const int next = dccp_new_state[sk->sk_state]; + const int ns = next & DCCP_STATE_MASK; + + if (ns != sk->sk_state) + dccp_set_state(sk, ns); + + return next & DCCP_ACTION_FIN; +} + +void dccp_close(struct sock *sk, long timeout) +{ + struct sk_buff *skb; + + lock_sock(sk); + + sk->sk_shutdown = SHUTDOWN_MASK; + + if (sk->sk_state == DCCP_LISTEN) { + dccp_set_state(sk, DCCP_CLOSED); + + /* Special case. */ + inet_csk_listen_stop(sk); + + goto adjudge_to_death; + } + + /* + * We need to flush the recv. buffs. We do this only on the + * descriptor close, not protocol-sourced closes, because the + *reader process may not have drained the data yet! + */ + /* FIXME: check for unread data */ + while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { + __kfree_skb(skb); + } + + if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { + /* Check zero linger _after_ checking for unread data. */ + sk->sk_prot->disconnect(sk, 0); + } else if (dccp_close_state(sk)) { + dccp_send_close(sk, 1); + } + + sk_stream_wait_close(sk, timeout); + +adjudge_to_death: + /* + * It is the last release_sock in its life. It will remove backlog. + */ + release_sock(sk); + /* + * Now socket is owned by kernel and we acquire BH lock + * to finish close. No need to check for user refs. + */ + local_bh_disable(); + bh_lock_sock(sk); + BUG_TRAP(!sock_owned_by_user(sk)); + + sock_hold(sk); + sock_orphan(sk); + + /* + * The last release_sock may have processed the CLOSE or RESET + * packet moving sock to CLOSED state, if not we have to fire + * the CLOSE/CLOSEREQ retransmission timer, see "8.3. Termination" + * in draft-ietf-dccp-spec-11. -acme + */ + if (sk->sk_state == DCCP_CLOSING) { + /* FIXME: should start at 2 * RTT */ + /* Timer for repeating the CLOSE/CLOSEREQ until an answer. */ + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, + DCCP_RTO_MAX); +#if 0 + /* Yeah, we should use sk->sk_prot->orphan_count, etc */ + dccp_set_state(sk, DCCP_CLOSED); +#endif + } + + atomic_inc(sk->sk_prot->orphan_count); + if (sk->sk_state == DCCP_CLOSED) + inet_csk_destroy_sock(sk); + + /* Otherwise, socket is reprieved until protocol close. */ + + bh_unlock_sock(sk); + local_bh_enable(); + sock_put(sk); +} + +void dccp_shutdown(struct sock *sk, int how) +{ + dccp_pr_debug("entry\n"); +} + +static struct proto_ops inet_dccp_ops = { + .family = PF_INET, + .owner = THIS_MODULE, + .release = inet_release, + .bind = inet_bind, + .connect = inet_stream_connect, + .socketpair = sock_no_socketpair, + .accept = inet_accept, + .getname = inet_getname, + /* FIXME: work on tcp_poll to rename it to inet_csk_poll */ + .poll = dccp_poll, + .ioctl = inet_ioctl, + /* FIXME: work on inet_listen to rename it to sock_common_listen */ + .listen = inet_dccp_listen, + .shutdown = inet_shutdown, + .setsockopt = sock_common_setsockopt, + .getsockopt = sock_common_getsockopt, + .sendmsg = inet_sendmsg, + .recvmsg = sock_common_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +extern struct net_proto_family inet_family_ops; + +static struct inet_protosw dccp_v4_protosw = { + .type = SOCK_DCCP, + .protocol = IPPROTO_DCCP, + .prot = &dccp_v4_prot, + .ops = &inet_dccp_ops, + .capability = -1, + .no_check = 0, + .flags = 0, +}; + +/* + * This is the global socket data structure used for responding to + * the Out-of-the-blue (OOTB) packets. A control sock will be created + * for this socket at the initialization time. + */ +struct socket *dccp_ctl_socket; + +static char dccp_ctl_socket_err_msg[] __initdata = + KERN_ERR "DCCP: Failed to create the control socket.\n"; + +static int __init dccp_ctl_sock_init(void) +{ + int rc = sock_create_kern(PF_INET, SOCK_DCCP, IPPROTO_DCCP, + &dccp_ctl_socket); + if (rc < 0) + printk(dccp_ctl_socket_err_msg); + else { + dccp_ctl_socket->sk->sk_allocation = GFP_ATOMIC; + inet_sk(dccp_ctl_socket->sk)->uc_ttl = -1; + + /* Unhash it so that IP input processing does not even + * see it, we do not wish this socket to see incoming + * packets. + */ + dccp_ctl_socket->sk->sk_prot->unhash(dccp_ctl_socket->sk); + } + + return rc; +} + +#ifdef CONFIG_IP_DCCP_UNLOAD_HACK +void dccp_ctl_sock_exit(void) +{ + if (dccp_ctl_socket != NULL) { + sock_release(dccp_ctl_socket); + dccp_ctl_socket = NULL; + } +} + +EXPORT_SYMBOL_GPL(dccp_ctl_sock_exit); +#endif + +static int __init init_dccp_v4_mibs(void) +{ + int rc = -ENOMEM; + + dccp_statistics[0] = alloc_percpu(struct dccp_mib); + if (dccp_statistics[0] == NULL) + goto out; + + dccp_statistics[1] = alloc_percpu(struct dccp_mib); + if (dccp_statistics[1] == NULL) + goto out_free_one; + + rc = 0; +out: + return rc; +out_free_one: + free_percpu(dccp_statistics[0]); + dccp_statistics[0] = NULL; + goto out; + +} + +static int thash_entries; +module_param(thash_entries, int, 0444); +MODULE_PARM_DESC(thash_entries, "Number of ehash buckets"); + +#ifdef CONFIG_IP_DCCP_DEBUG +int dccp_debug; +module_param(dccp_debug, int, 0444); +MODULE_PARM_DESC(dccp_debug, "Enable debug messages"); +#endif + +static int __init dccp_init(void) +{ + unsigned long goal; + int ehash_order, bhash_order, i; + int rc = proto_register(&dccp_v4_prot, 1); + + if (rc) + goto out; + + dccp_hashinfo.bind_bucket_cachep = + kmem_cache_create("dccp_bind_bucket", + sizeof(struct inet_bind_bucket), 0, + SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!dccp_hashinfo.bind_bucket_cachep) + goto out_proto_unregister; + + /* + * Size and allocate the main established and bind bucket + * hash tables. + * + * The methodology is similar to that of the buffer cache. + */ + if (num_physpages >= (128 * 1024)) + goal = num_physpages >> (21 - PAGE_SHIFT); + else + goal = num_physpages >> (23 - PAGE_SHIFT); + + if (thash_entries) + goal = (thash_entries * + sizeof(struct inet_ehash_bucket)) >> PAGE_SHIFT; + for (ehash_order = 0; (1UL << ehash_order) < goal; ehash_order++) + ; + do { + dccp_hashinfo.ehash_size = (1UL << ehash_order) * PAGE_SIZE / + sizeof(struct inet_ehash_bucket); + dccp_hashinfo.ehash_size >>= 1; + while (dccp_hashinfo.ehash_size & + (dccp_hashinfo.ehash_size - 1)) + dccp_hashinfo.ehash_size--; + dccp_hashinfo.ehash = (struct inet_ehash_bucket *) + __get_free_pages(GFP_ATOMIC, ehash_order); + } while (!dccp_hashinfo.ehash && --ehash_order > 0); + + if (!dccp_hashinfo.ehash) { + printk(KERN_CRIT "Failed to allocate DCCP " + "established hash table\n"); + goto out_free_bind_bucket_cachep; + } + + for (i = 0; i < (dccp_hashinfo.ehash_size << 1); i++) { + rwlock_init(&dccp_hashinfo.ehash[i].lock); + INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].chain); + } + + bhash_order = ehash_order; + + do { + dccp_hashinfo.bhash_size = (1UL << bhash_order) * PAGE_SIZE / + sizeof(struct inet_bind_hashbucket); + if ((dccp_hashinfo.bhash_size > (64 * 1024)) && + bhash_order > 0) + continue; + dccp_hashinfo.bhash = (struct inet_bind_hashbucket *) + __get_free_pages(GFP_ATOMIC, bhash_order); + } while (!dccp_hashinfo.bhash && --bhash_order >= 0); + + if (!dccp_hashinfo.bhash) { + printk(KERN_CRIT "Failed to allocate DCCP bind hash table\n"); + goto out_free_dccp_ehash; + } + + for (i = 0; i < dccp_hashinfo.bhash_size; i++) { + spin_lock_init(&dccp_hashinfo.bhash[i].lock); + INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain); + } + + if (init_dccp_v4_mibs()) + goto out_free_dccp_bhash; + + rc = -EAGAIN; + if (inet_add_protocol(&dccp_protocol, IPPROTO_DCCP)) + goto out_free_dccp_v4_mibs; + + inet_register_protosw(&dccp_v4_protosw); + + rc = dccp_ctl_sock_init(); + if (rc) + goto out_unregister_protosw; +out: + return rc; +out_unregister_protosw: + inet_unregister_protosw(&dccp_v4_protosw); + inet_del_protocol(&dccp_protocol, IPPROTO_DCCP); +out_free_dccp_v4_mibs: + free_percpu(dccp_statistics[0]); + free_percpu(dccp_statistics[1]); + dccp_statistics[0] = dccp_statistics[1] = NULL; +out_free_dccp_bhash: + free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order); + dccp_hashinfo.bhash = NULL; +out_free_dccp_ehash: + free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order); + dccp_hashinfo.ehash = NULL; +out_free_bind_bucket_cachep: + kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); + dccp_hashinfo.bind_bucket_cachep = NULL; +out_proto_unregister: + proto_unregister(&dccp_v4_prot); + goto out; +} + +static const char dccp_del_proto_err_msg[] __exitdata = + KERN_ERR "can't remove dccp net_protocol\n"; + +static void __exit dccp_fini(void) +{ + inet_unregister_protosw(&dccp_v4_protosw); + + if (inet_del_protocol(&dccp_protocol, IPPROTO_DCCP) < 0) + printk(dccp_del_proto_err_msg); + + free_percpu(dccp_statistics[0]); + free_percpu(dccp_statistics[1]); + free_pages((unsigned long)dccp_hashinfo.bhash, + get_order(dccp_hashinfo.bhash_size * + sizeof(struct inet_bind_hashbucket))); + free_pages((unsigned long)dccp_hashinfo.ehash, + get_order(dccp_hashinfo.ehash_size * + sizeof(struct inet_ehash_bucket))); + kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); + proto_unregister(&dccp_v4_prot); +} + +module_init(dccp_init); +module_exit(dccp_fini); + +/* + * __stringify doesn't likes enums, so use SOCK_DCCP (6) and IPPROTO_DCCP (33) + * values directly, Also cover the case where the protocol is not specified, + * i.e. net-pf-PF_INET-proto-0-type-SOCK_DCCP + */ +MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-33-type-6"); +MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-0-type-6"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@conectiva.com.br>"); +MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol"); diff --git a/net/dccp/timer.c b/net/dccp/timer.c new file mode 100644 index 00000000000..aa34b576e22 --- /dev/null +++ b/net/dccp/timer.c @@ -0,0 +1,255 @@ +/* + * net/dccp/timer.c + * + * An implementation of the DCCP protocol + * Arnaldo Carvalho de Melo <acme@conectiva.com.br> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <linux/dccp.h> +#include <linux/skbuff.h> + +#include "dccp.h" + +static void dccp_write_timer(unsigned long data); +static void dccp_keepalive_timer(unsigned long data); +static void dccp_delack_timer(unsigned long data); + +void dccp_init_xmit_timers(struct sock *sk) +{ + inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer, + &dccp_keepalive_timer); +} + +static void dccp_write_err(struct sock *sk) +{ + sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT; + sk->sk_error_report(sk); + + dccp_v4_send_reset(sk, DCCP_RESET_CODE_ABORTED); + dccp_done(sk); + DCCP_INC_STATS_BH(DCCP_MIB_ABORTONTIMEOUT); +} + +/* A write timeout has occurred. Process the after effects. */ +static int dccp_write_timeout(struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + int retry_until; + + if (sk->sk_state == DCCP_REQUESTING || sk->sk_state == DCCP_PARTOPEN) { + if (icsk->icsk_retransmits != 0) + dst_negative_advice(&sk->sk_dst_cache); + retry_until = icsk->icsk_syn_retries ? : + /* FIXME! */ 3 /* FIXME! sysctl_tcp_syn_retries */; + } else { + if (icsk->icsk_retransmits >= + /* FIXME! sysctl_tcp_retries1 */ 5 /* FIXME! */) { + /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu + black hole detection. :-( + + It is place to make it. It is not made. I do not want + to make it. It is disguisting. It does not work in any + case. Let me to cite the same draft, which requires for + us to implement this: + + "The one security concern raised by this memo is that ICMP black holes + are often caused by over-zealous security administrators who block + all ICMP messages. It is vitally important that those who design and + deploy security systems understand the impact of strict filtering on + upper-layer protocols. The safest web site in the world is worthless + if most TCP implementations cannot transfer data from it. It would + be far nicer to have all of the black holes fixed rather than fixing + all of the TCP implementations." + + Golden words :-). + */ + + dst_negative_advice(&sk->sk_dst_cache); + } + + retry_until = /* FIXME! */ 15 /* FIXME! sysctl_tcp_retries2 */; + /* + * FIXME: see tcp_write_timout and tcp_out_of_resources + */ + } + + if (icsk->icsk_retransmits >= retry_until) { + /* Has it gone just too far? */ + dccp_write_err(sk); + return 1; + } + return 0; +} + +/* This is the same as tcp_delack_timer, sans prequeue & mem_reclaim stuff */ +static void dccp_delack_timer(unsigned long data) +{ + struct sock *sk = (struct sock *)data; + struct inet_connection_sock *icsk = inet_csk(sk); + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + /* Try again later. */ + icsk->icsk_ack.blocked = 1; + NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED); + sk_reset_timer(sk, &icsk->icsk_delack_timer, + jiffies + TCP_DELACK_MIN); + goto out; + } + + if (sk->sk_state == DCCP_CLOSED || + !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) + goto out; + if (time_after(icsk->icsk_ack.timeout, jiffies)) { + sk_reset_timer(sk, &icsk->icsk_delack_timer, + icsk->icsk_ack.timeout); + goto out; + } + + icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER; + + if (inet_csk_ack_scheduled(sk)) { + if (!icsk->icsk_ack.pingpong) { + /* Delayed ACK missed: inflate ATO. */ + icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, + icsk->icsk_rto); + } else { + /* Delayed ACK missed: leave pingpong mode and + * deflate ATO. + */ + icsk->icsk_ack.pingpong = 0; + icsk->icsk_ack.ato = TCP_ATO_MIN; + } + dccp_send_ack(sk); + NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS); + } +out: + bh_unlock_sock(sk); + sock_put(sk); +} + +/* + * The DCCP retransmit timer. + */ +static void dccp_retransmit_timer(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + /* + * sk->sk_send_head has to have one skb with + * DCCP_SKB_CB(skb)->dccpd_type set to one of the retransmittable DCCP + * packet types (REQUEST, RESPONSE, the ACK in the 3way handshake + * (PARTOPEN timer), etc). + */ + BUG_TRAP(sk->sk_send_head != NULL); + + /* + * More than than 4MSL (8 minutes) has passed, a RESET(aborted) was + * sent, no need to retransmit, this sock is dead. + */ + if (dccp_write_timeout(sk)) + goto out; + + /* + * We want to know the number of packets retransmitted, not the + * total number of retransmissions of clones of original packets. + */ + if (icsk->icsk_retransmits == 0) + DCCP_INC_STATS_BH(DCCP_MIB_TIMEOUTS); + + if (dccp_retransmit_skb(sk, sk->sk_send_head) < 0) { + /* + * Retransmission failed because of local congestion, + * do not backoff. + */ + if (icsk->icsk_retransmits == 0) + icsk->icsk_retransmits = 1; + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + min(icsk->icsk_rto, + TCP_RESOURCE_PROBE_INTERVAL), + DCCP_RTO_MAX); + goto out; + } + + icsk->icsk_backoff++; + icsk->icsk_retransmits++; + + icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, + DCCP_RTO_MAX); + if (icsk->icsk_retransmits > 3 /* FIXME: sysctl_dccp_retries1 */) + __sk_dst_reset(sk); +out:; +} + +static void dccp_write_timer(unsigned long data) +{ + struct sock *sk = (struct sock *)data; + struct inet_connection_sock *icsk = inet_csk(sk); + int event = 0; + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + /* Try again later */ + sk_reset_timer(sk, &icsk->icsk_retransmit_timer, + jiffies + (HZ / 20)); + goto out; + } + + if (sk->sk_state == DCCP_CLOSED || !icsk->icsk_pending) + goto out; + + if (time_after(icsk->icsk_timeout, jiffies)) { + sk_reset_timer(sk, &icsk->icsk_retransmit_timer, + icsk->icsk_timeout); + goto out; + } + + event = icsk->icsk_pending; + icsk->icsk_pending = 0; + + switch (event) { + case ICSK_TIME_RETRANS: + dccp_retransmit_timer(sk); + break; + } +out: + bh_unlock_sock(sk); + sock_put(sk); +} + +/* + * Timer for listening sockets + */ +static void dccp_response_timer(struct sock *sk) +{ + inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL, DCCP_TIMEOUT_INIT, + DCCP_RTO_MAX); +} + +static void dccp_keepalive_timer(unsigned long data) +{ + struct sock *sk = (struct sock *)data; + + /* Only process if socket is not in use. */ + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + /* Try again later. */ + inet_csk_reset_keepalive_timer(sk, HZ / 20); + goto out; + } + + if (sk->sk_state == DCCP_LISTEN) { + dccp_response_timer(sk); + goto out; + } +out: + bh_unlock_sock(sk); + sock_put(sk); +} diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index acdd18e6adb..621680f127a 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -118,7 +118,7 @@ Version 0.0.6 2.1.110 07-aug-98 Eduardo Marcelo Serrat #include <linux/netfilter.h> #include <linux/seq_file.h> #include <net/sock.h> -#include <net/tcp.h> +#include <net/tcp_states.h> #include <net/flow.h> #include <asm/system.h> #include <asm/ioctls.h> @@ -1763,7 +1763,7 @@ static int dn_recvmsg(struct kiocb *iocb, struct socket *sock, nskb = skb->next; if (skb->len == 0) { - skb_unlink(skb); + skb_unlink(skb, queue); kfree_skb(skb); /* * N.B. Don't refer to skb or cb after this point @@ -2064,7 +2064,7 @@ static struct notifier_block dn_dev_notifier = { .notifier_call = dn_device_event, }; -extern int dn_route_rcv(struct sk_buff *, struct net_device *, struct packet_type *); +extern int dn_route_rcv(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); static struct packet_type dn_dix_packet_type = { .type = __constant_htons(ETH_P_DNA_RT), diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index 00233ecbc9c..5610bb16dbf 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -752,16 +752,16 @@ static void rtmsg_ifa(int event, struct dn_ifaddr *ifa) skb = alloc_skb(size, GFP_KERNEL); if (!skb) { - netlink_set_err(rtnl, 0, RTMGRP_DECnet_IFADDR, ENOBUFS); + netlink_set_err(rtnl, 0, RTNLGRP_DECnet_IFADDR, ENOBUFS); return; } if (dn_dev_fill_ifaddr(skb, ifa, 0, 0, event, 0) < 0) { kfree_skb(skb); - netlink_set_err(rtnl, 0, RTMGRP_DECnet_IFADDR, EINVAL); + netlink_set_err(rtnl, 0, RTNLGRP_DECnet_IFADDR, EINVAL); return; } - NETLINK_CB(skb).dst_groups = RTMGRP_DECnet_IFADDR; - netlink_broadcast(rtnl, skb, 0, RTMGRP_DECnet_IFADDR, GFP_KERNEL); + NETLINK_CB(skb).dst_group = RTNLGRP_DECnet_IFADDR; + netlink_broadcast(rtnl, skb, 0, RTNLGRP_DECnet_IFADDR, GFP_KERNEL); } static int dn_dev_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c index 202dbde9850..369f25b60f3 100644 --- a/net/decnet/dn_nsp_in.c +++ b/net/decnet/dn_nsp_in.c @@ -60,7 +60,7 @@ #include <linux/inet.h> #include <linux/route.h> #include <net/sock.h> -#include <net/tcp.h> +#include <net/tcp_states.h> #include <asm/system.h> #include <linux/fcntl.h> #include <linux/mm.h> diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c index 8cce1fdbda9..e0bebf4bbca 100644 --- a/net/decnet/dn_nsp_out.c +++ b/net/decnet/dn_nsp_out.c @@ -479,7 +479,7 @@ int dn_nsp_check_xmit_queue(struct sock *sk, struct sk_buff *skb, struct sk_buff xmit_count = cb2->xmit_count; segnum = cb2->segnum; /* Remove and drop ack'ed packet */ - skb_unlink(ack); + skb_unlink(ack, q); kfree_skb(ack); ack = NULL; diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 2399fa8a3f8..2c915f305be 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -572,7 +572,7 @@ static int dn_route_ptp_hello(struct sk_buff *skb) return NET_RX_SUCCESS; } -int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct dn_skb_cb *cb; unsigned char flags = 0; diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c index 28ba5777a25..eeba56f9932 100644 --- a/net/decnet/dn_table.c +++ b/net/decnet/dn_table.c @@ -79,7 +79,7 @@ for( ; ((f) = *(fp)) != NULL && dn_key_eq((f)->fn_key, (key)); (fp) = &(f)->fn_n static DEFINE_RWLOCK(dn_fib_tables_lock); struct dn_fib_table *dn_fib_tables[RT_TABLE_MAX + 1]; -static kmem_cache_t *dn_hash_kmem; +static kmem_cache_t *dn_hash_kmem __read_mostly; static int dn_fib_hash_zombies; static inline dn_fib_idx_t dn_hash(dn_fib_key_t key, struct dn_zone *dz) @@ -349,10 +349,10 @@ static void dn_rtmsg_fib(int event, struct dn_fib_node *f, int z, int tb_id, kfree_skb(skb); return; } - NETLINK_CB(skb).dst_groups = RTMGRP_DECnet_ROUTE; + NETLINK_CB(skb).dst_group = RTNLGRP_DECnet_ROUTE; if (nlh->nlmsg_flags & NLM_F_ECHO) atomic_inc(&skb->users); - netlink_broadcast(rtnl, skb, pid, RTMGRP_DECnet_ROUTE, GFP_KERNEL); + netlink_broadcast(rtnl, skb, pid, RTNLGRP_DECnet_ROUTE, GFP_KERNEL); if (nlh->nlmsg_flags & NLM_F_ECHO) netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT); } diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c index 284a9998e53..1ab94c6e22e 100644 --- a/net/decnet/netfilter/dn_rtmsg.c +++ b/net/decnet/netfilter/dn_rtmsg.c @@ -19,6 +19,7 @@ #include <linux/netfilter.h> #include <linux/spinlock.h> #include <linux/netlink.h> +#include <linux/netfilter_decnet.h> #include <net/sock.h> #include <net/flow.h> @@ -71,10 +72,10 @@ static void dnrmg_send_peer(struct sk_buff *skb) switch(flags & DN_RT_CNTL_MSK) { case DN_RT_PKT_L1RT: - group = DNRMG_L1_GROUP; + group = DNRNG_NLGRP_L1; break; case DN_RT_PKT_L2RT: - group = DNRMG_L2_GROUP; + group = DNRNG_NLGRP_L2; break; default: return; @@ -83,7 +84,7 @@ static void dnrmg_send_peer(struct sk_buff *skb) skb2 = dnrmg_build_message(skb, &status); if (skb2 == NULL) return; - NETLINK_CB(skb2).dst_groups = group; + NETLINK_CB(skb2).dst_group = group; netlink_broadcast(dnrmg, skb2, 0, group, GFP_ATOMIC); } @@ -138,7 +139,8 @@ static int __init init(void) { int rv = 0; - dnrmg = netlink_kernel_create(NETLINK_DNRTMSG, dnrmg_receive_user_sk); + dnrmg = netlink_kernel_create(NETLINK_DNRTMSG, DNRNG_NLGRP_MAX, + dnrmg_receive_user_sk, THIS_MODULE); if (dnrmg == NULL) { printk(KERN_ERR "dn_rtmsg: Cannot create netlink socket"); return -ENOMEM; @@ -162,6 +164,7 @@ static void __exit fini(void) MODULE_DESCRIPTION("DECnet Routing Message Grabulator"); MODULE_AUTHOR("Steven Whitehouse <steve@chygwyn.com>"); MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_DNRTMSG); module_init(init); module_exit(fini); diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c index de691e119e1..4a62093eb34 100644 --- a/net/econet/af_econet.c +++ b/net/econet/af_econet.c @@ -159,7 +159,7 @@ static int econet_recvmsg(struct kiocb *iocb, struct socket *sock, err = memcpy_toiovec(msg->msg_iov, skb->data, copied); if (err) goto out_free; - sk->sk_stamp = skb->stamp; + skb_get_timestamp(skb, &sk->sk_stamp); if (msg->msg_name) memcpy(msg->msg_name, skb->cb, msg->msg_namelen); @@ -869,7 +869,7 @@ static void aun_tx_ack(unsigned long seq, int result) foundit: tx_result(skb->sk, eb->cookie, result); - skb_unlink(skb); + skb_unlink(skb, &aun_queue); spin_unlock_irqrestore(&aun_queue_lock, flags); kfree_skb(skb); } @@ -947,7 +947,7 @@ static void ab_cleanup(unsigned long h) { tx_result(skb->sk, eb->cookie, ECTYPE_TRANSMIT_NOT_PRESENT); - skb_unlink(skb); + skb_unlink(skb, &aun_queue); kfree_skb(skb); } skb = newskb; @@ -1009,7 +1009,7 @@ release: * Receive an Econet frame from a device. */ -static int econet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +static int econet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct ec_framehdr *hdr; struct sock *sk; diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index f6dbfb99b14..87a052a9a84 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -62,8 +62,6 @@ #include <asm/system.h> #include <asm/checksum.h> -extern int __init netdev_boot_setup(char *str); - __setup("ether=", netdev_boot_setup); /* @@ -163,7 +161,6 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) skb->mac.raw=skb->data; skb_pull(skb,ETH_HLEN); eth = eth_hdr(skb); - skb->input_dev = dev; if(*eth->h_dest&1) { diff --git a/net/ethernet/sysctl_net_ether.c b/net/ethernet/sysctl_net_ether.c index b81a6d53234..66b39fc342d 100644 --- a/net/ethernet/sysctl_net_ether.c +++ b/net/ethernet/sysctl_net_ether.c @@ -7,6 +7,7 @@ #include <linux/mm.h> #include <linux/sysctl.h> +#include <linux/if_ether.h> ctl_table ether_table[] = { {0} diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 0b3d9f1d806..e55136ae09f 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -413,20 +413,19 @@ config INET_TUNNEL If unsure, say Y. -config IP_TCPDIAG - tristate "IP: TCP socket monitoring interface" +config INET_DIAG + tristate "INET: socket monitoring interface" default y ---help--- - Support for TCP socket monitoring interface used by native Linux - tools such as ss. ss is included in iproute2, currently downloadable - at <http://developer.osdl.org/dev/iproute2>. If you want IPv6 support - and have selected IPv6 as a module, you need to build this as a - module too. + Support for INET (TCP, DCCP, etc) socket monitoring interface used by + native Linux tools such as ss. ss is included in iproute2, currently + downloadable at <http://developer.osdl.org/dev/iproute2>. If unsure, say Y. -config IP_TCPDIAG_IPV6 - def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6) +config INET_TCP_DIAG + depends on INET_DIAG + def_tristate INET_DIAG config TCP_CONG_ADVANCED bool "TCP: advanced congestion control" diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 55dc6cca1e7..f0435d00db6 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -4,11 +4,12 @@ obj-y := route.o inetpeer.o protocol.o \ ip_input.o ip_fragment.o ip_forward.o ip_options.o \ - ip_output.o ip_sockglue.o \ + ip_output.o ip_sockglue.o inet_hashtables.o \ + inet_timewait_sock.o inet_connection_sock.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ tcp_minisocks.o tcp_cong.o \ datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ - sysctl_net_ipv4.o fib_frontend.o fib_semantics.o + sysctl_net_ipv4.o fib_frontend.o fib_semantics.o netfilter.o obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o @@ -29,8 +30,9 @@ obj-$(CONFIG_IP_ROUTE_MULTIPATH_WRANDOM) += multipath_wrandom.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_DRR) += multipath_drr.o obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_IP_VS) += ipvs/ -obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o +obj-$(CONFIG_INET_DIAG) += inet_diag.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o +obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 163ae4068b5..bf147f8db39 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -99,6 +99,7 @@ #include <net/arp.h> #include <net/route.h> #include <net/ip_fib.h> +#include <net/inet_connection_sock.h> #include <net/tcp.h> #include <net/udp.h> #include <linux/skbuff.h> @@ -112,11 +113,7 @@ #include <linux/mroute.h> #endif -DEFINE_SNMP_STAT(struct linux_mib, net_statistics); - -#ifdef INET_REFCNT_DEBUG -atomic_t inet_sock_nr; -#endif +DEFINE_SNMP_STAT(struct linux_mib, net_statistics) __read_mostly; extern void ip_mc_drop_socket(struct sock *sk); @@ -153,11 +150,7 @@ void inet_sock_destruct(struct sock *sk) if (inet->opt) kfree(inet->opt); dst_release(sk->sk_dst_cache); -#ifdef INET_REFCNT_DEBUG - atomic_dec(&inet_sock_nr); - printk(KERN_DEBUG "INET socket %p released, %d are still alive\n", - sk, atomic_read(&inet_sock_nr)); -#endif + sk_refcnt_debug_dec(sk); } /* @@ -210,7 +203,7 @@ int inet_listen(struct socket *sock, int backlog) * we can only allow the backlog to be adjusted. */ if (old_state != TCP_LISTEN) { - err = tcp_listen_start(sk); + err = inet_csk_listen_start(sk, TCP_SYNQ_HSIZE); if (err) goto out; } @@ -235,12 +228,14 @@ static int inet_create(struct socket *sock, int protocol) struct proto *answer_prot; unsigned char answer_flags; char answer_no_check; - int err; + int try_loading_module = 0; + int err = -ESOCKTNOSUPPORT; sock->state = SS_UNCONNECTED; /* Look for the requested type/protocol pair. */ answer = NULL; +lookup_protocol: rcu_read_lock(); list_for_each_rcu(p, &inetsw[sock->type]) { answer = list_entry(p, struct inet_protosw, list); @@ -261,9 +256,28 @@ static int inet_create(struct socket *sock, int protocol) answer = NULL; } - err = -ESOCKTNOSUPPORT; - if (!answer) - goto out_rcu_unlock; + if (unlikely(answer == NULL)) { + if (try_loading_module < 2) { + rcu_read_unlock(); + /* + * Be more specific, e.g. net-pf-2-proto-132-type-1 + * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM) + */ + if (++try_loading_module == 1) + request_module("net-pf-%d-proto-%d-type-%d", + PF_INET, protocol, sock->type); + /* + * Fall back to generic, e.g. net-pf-2-proto-132 + * (net-pf-PF_INET-proto-IPPROTO_SCTP) + */ + else + request_module("net-pf-%d-proto-%d", + PF_INET, protocol); + goto lookup_protocol; + } else + goto out_rcu_unlock; + } + err = -EPERM; if (answer->capability > 0 && !capable(answer->capability)) goto out_rcu_unlock; @@ -317,9 +331,7 @@ static int inet_create(struct socket *sock, int protocol) inet->mc_index = 0; inet->mc_list = NULL; -#ifdef INET_REFCNT_DEBUG - atomic_inc(&inet_sock_nr); -#endif + sk_refcnt_debug_inc(sk); if (inet->num) { /* It assumes that any protocol which allows @@ -847,10 +859,6 @@ static struct net_proto_family inet_family_ops = { .owner = THIS_MODULE, }; - -extern void tcp_init(void); -extern void tcp_v4_init(struct net_proto_family *); - /* Upon startup we insert all the elements in inetsw_array[] into * the linked list inetsw. */ @@ -961,6 +969,119 @@ void inet_unregister_protosw(struct inet_protosw *p) } } +/* + * Shall we try to damage output packets if routing dev changes? + */ + +int sysctl_ip_dynaddr; + +static int inet_sk_reselect_saddr(struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + int err; + struct rtable *rt; + __u32 old_saddr = inet->saddr; + __u32 new_saddr; + __u32 daddr = inet->daddr; + + if (inet->opt && inet->opt->srr) + daddr = inet->opt->faddr; + + /* Query new route. */ + err = ip_route_connect(&rt, daddr, 0, + RT_CONN_FLAGS(sk), + sk->sk_bound_dev_if, + sk->sk_protocol, + inet->sport, inet->dport, sk); + if (err) + return err; + + sk_setup_caps(sk, &rt->u.dst); + + new_saddr = rt->rt_src; + + if (new_saddr == old_saddr) + return 0; + + if (sysctl_ip_dynaddr > 1) { + printk(KERN_INFO "%s(): shifting inet->" + "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n", + __FUNCTION__, + NIPQUAD(old_saddr), + NIPQUAD(new_saddr)); + } + + inet->saddr = inet->rcv_saddr = new_saddr; + + /* + * XXX The only one ugly spot where we need to + * XXX really change the sockets identity after + * XXX it has entered the hashes. -DaveM + * + * Besides that, it does not check for connection + * uniqueness. Wait for troubles. + */ + __sk_prot_rehash(sk); + return 0; +} + +int inet_sk_rebuild_header(struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0); + u32 daddr; + int err; + + /* Route is OK, nothing to do. */ + if (rt) + return 0; + + /* Reroute. */ + daddr = inet->daddr; + if (inet->opt && inet->opt->srr) + daddr = inet->opt->faddr; +{ + struct flowi fl = { + .oif = sk->sk_bound_dev_if, + .nl_u = { + .ip4_u = { + .daddr = daddr, + .saddr = inet->saddr, + .tos = RT_CONN_FLAGS(sk), + }, + }, + .proto = sk->sk_protocol, + .uli_u = { + .ports = { + .sport = inet->sport, + .dport = inet->dport, + }, + }, + }; + + err = ip_route_output_flow(&rt, &fl, sk, 0); +} + if (!err) + sk_setup_caps(sk, &rt->u.dst); + else { + /* Routing failed... */ + sk->sk_route_caps = 0; + /* + * Other protocols have to map its equivalent state to TCP_SYN_SENT. + * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme + */ + if (!sysctl_ip_dynaddr || + sk->sk_state != TCP_SYN_SENT || + (sk->sk_userlocks & SOCK_BINDADDR_LOCK) || + (err = inet_sk_reselect_saddr(sk)) != 0) + sk->sk_err_soft = -err; + } + + return err; +} + +EXPORT_SYMBOL(inet_sk_rebuild_header); + #ifdef CONFIG_IP_MULTICAST static struct net_protocol igmp_protocol = { .handler = igmp_rcv, @@ -1007,7 +1128,6 @@ static int __init init_ipv4_mibs(void) } static int ipv4_proc_init(void); -extern void ipfrag_init(void); /* * IP protocol layer initialiser @@ -1128,19 +1248,10 @@ module_init(inet_init); /* ------------------------------------------------------------------------ */ #ifdef CONFIG_PROC_FS -extern int fib_proc_init(void); -extern void fib_proc_exit(void); #ifdef CONFIG_IP_FIB_TRIE extern int fib_stat_proc_init(void); extern void fib_stat_proc_exit(void); #endif -extern int ip_misc_proc_init(void); -extern int raw_proc_init(void); -extern void raw_proc_exit(void); -extern int tcp4_proc_init(void); -extern void tcp4_proc_exit(void); -extern int udp4_proc_init(void); -extern void udp4_proc_exit(void); static int __init ipv4_proc_init(void) { @@ -1205,7 +1316,3 @@ EXPORT_SYMBOL(inet_stream_ops); EXPORT_SYMBOL(inet_unregister_protosw); EXPORT_SYMBOL(net_statistics); EXPORT_SYMBOL(sysctl_ip_nonlocal_bind); - -#ifdef INET_REFCNT_DEBUG -EXPORT_SYMBOL(inet_sock_nr); -#endif diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index a642fd61285..8bf312bdea1 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -700,7 +700,7 @@ void arp_send(int type, int ptype, u32 dest_ip, static void parp_redo(struct sk_buff *skb) { nf_reset(skb); - arp_rcv(skb, skb->dev, NULL); + arp_rcv(skb, skb->dev, NULL, skb->dev); } /* @@ -865,7 +865,7 @@ static int arp_process(struct sk_buff *skb) if (n) neigh_release(n); - if (skb->stamp.tv_sec == LOCALLY_ENQUEUED || + if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED || skb->pkt_type == PACKET_HOST || in_dev->arp_parms->proxy_delay == 0) { arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha); @@ -927,7 +927,7 @@ out: * Receive an arp request from the device layer. */ -int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct arphdr *arp; @@ -948,6 +948,8 @@ int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) goto out_of_mem; + memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); + return NF_HOOK(NF_ARP, NF_ARP_IN, skb, dev, NULL, arp_process); freeskb: diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index b1db561f254..c1b42b5257f 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -16,9 +16,10 @@ #include <linux/module.h> #include <linux/ip.h> #include <linux/in.h> +#include <net/ip.h> #include <net/sock.h> -#include <net/tcp.h> #include <net/route.h> +#include <net/tcp_states.h> int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index d8a10e3dd77..ba2895ae815 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1111,13 +1111,12 @@ static void rtmsg_ifa(int event, struct in_ifaddr* ifa) struct sk_buff *skb = alloc_skb(size, GFP_KERNEL); if (!skb) - netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, ENOBUFS); + netlink_set_err(rtnl, 0, RTNLGRP_IPV4_IFADDR, ENOBUFS); else if (inet_fill_ifaddr(skb, ifa, current->pid, 0, event, 0) < 0) { kfree_skb(skb); - netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, EINVAL); + netlink_set_err(rtnl, 0, RTNLGRP_IPV4_IFADDR, EINVAL); } else { - NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_IFADDR; - netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV4_IFADDR, GFP_KERNEL); + netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV4_IFADDR, GFP_KERNEL); } } diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index ba57446d5d1..b31ffc5053d 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -331,8 +331,8 @@ static void esp4_err(struct sk_buff *skb, u32 info) x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET); if (!x) return; - NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n", - ntohl(esph->spi), ntohl(iph->daddr))); + NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n", + ntohl(esph->spi), ntohl(iph->daddr)); xfrm_state_put(x); } @@ -395,10 +395,10 @@ static int esp_init_state(struct xfrm_state *x) if (aalg_desc->uinfo.auth.icv_fullbits/8 != crypto_tfm_alg_digestsize(esp->auth.tfm)) { - NETDEBUG(printk(KERN_INFO "ESP: %s digestsize %u != %hu\n", - x->aalg->alg_name, - crypto_tfm_alg_digestsize(esp->auth.tfm), - aalg_desc->uinfo.auth.icv_fullbits/8)); + NETDEBUG(KERN_INFO "ESP: %s digestsize %u != %hu\n", + x->aalg->alg_name, + crypto_tfm_alg_digestsize(esp->auth.tfm), + aalg_desc->uinfo.auth.icv_fullbits/8); goto error; } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index cd8e45ab958..4e1379f7126 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -558,16 +558,15 @@ static void nl_fib_input(struct sock *sk, int len) nl_fib_lookup(frn, tb); pid = nlh->nlmsg_pid; /*pid of sending process */ - NETLINK_CB(skb).groups = 0; /* not in mcast group */ NETLINK_CB(skb).pid = 0; /* from kernel */ NETLINK_CB(skb).dst_pid = pid; - NETLINK_CB(skb).dst_groups = 0; /* unicast */ + NETLINK_CB(skb).dst_group = 0; /* unicast */ netlink_unicast(sk, skb, pid, MSG_DONTWAIT); } static void nl_fib_lookup_init(void) { - netlink_kernel_create(NETLINK_FIB_LOOKUP, nl_fib_input); + netlink_kernel_create(NETLINK_FIB_LOOKUP, 0, nl_fib_input, THIS_MODULE); } static void fib_disable_ip(struct net_device *dev, int force) @@ -662,5 +661,4 @@ void __init ip_fib_init(void) } EXPORT_SYMBOL(inet_addr_type); -EXPORT_SYMBOL(ip_dev_find); EXPORT_SYMBOL(ip_rt_ioctl); diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index b10d6bb5ef3..2a8c9afc369 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -45,8 +45,8 @@ #include "fib_lookup.h" -static kmem_cache_t *fn_hash_kmem; -static kmem_cache_t *fn_alias_kmem; +static kmem_cache_t *fn_hash_kmem __read_mostly; +static kmem_cache_t *fn_alias_kmem __read_mostly; struct fib_node { struct hlist_node fn_hash; diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index b729d97cfa9..ef6609ea0eb 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -7,6 +7,7 @@ struct fib_alias { struct list_head fa_list; + struct rcu_head rcu; struct fib_info *fa_info; u8 fa_tos; u8 fa_type; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index e278cb9d007..d41219e8037 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -290,10 +290,10 @@ void rtmsg_fib(int event, u32 key, struct fib_alias *fa, kfree_skb(skb); return; } - NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_ROUTE; + NETLINK_CB(skb).dst_group = RTNLGRP_IPV4_ROUTE; if (n->nlmsg_flags&NLM_F_ECHO) atomic_inc(&skb->users); - netlink_broadcast(rtnl, skb, pid, RTMGRP_IPV4_ROUTE, GFP_KERNEL); + netlink_broadcast(rtnl, skb, pid, RTNLGRP_IPV4_ROUTE, GFP_KERNEL); if (n->nlmsg_flags&NLM_F_ECHO) netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT); } @@ -854,6 +854,7 @@ failure: return NULL; } +/* Note! fib_semantic_match intentionally uses RCU list functions. */ int fib_semantic_match(struct list_head *head, const struct flowi *flp, struct fib_result *res, __u32 zone, __u32 mask, int prefixlen) @@ -861,7 +862,7 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp, struct fib_alias *fa; int nh_sel = 0; - list_for_each_entry(fa, head, fa_list) { + list_for_each_entry_rcu(fa, head, fa_list) { int err; if (fa->fa_tos && diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 45efd5f4741..b2dea4e5da7 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -43,7 +43,7 @@ * 2 of the License, or (at your option) any later version. */ -#define VERSION "0.325" +#define VERSION "0.402" #include <linux/config.h> #include <asm/uaccess.h> @@ -62,6 +62,7 @@ #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/proc_fs.h> +#include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/netlink.h> #include <linux/init.h> @@ -77,56 +78,55 @@ #undef CONFIG_IP_FIB_TRIE_STATS #define MAX_CHILDS 16384 -#define EXTRACT(p, n, str) ((str)<<(p)>>(32-(n))) #define KEYLENGTH (8*sizeof(t_key)) #define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l)) #define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset)) -static DEFINE_RWLOCK(fib_lock); - typedef unsigned int t_key; #define T_TNODE 0 #define T_LEAF 1 #define NODE_TYPE_MASK 0x1UL -#define NODE_PARENT(_node) \ - ((struct tnode *)((_node)->_parent & ~NODE_TYPE_MASK)) -#define NODE_SET_PARENT(_node, _ptr) \ - ((_node)->_parent = (((unsigned long)(_ptr)) | \ - ((_node)->_parent & NODE_TYPE_MASK))) -#define NODE_INIT_PARENT(_node, _type) \ - ((_node)->_parent = (_type)) -#define NODE_TYPE(_node) \ - ((_node)->_parent & NODE_TYPE_MASK) - -#define IS_TNODE(n) (!(n->_parent & T_LEAF)) -#define IS_LEAF(n) (n->_parent & T_LEAF) +#define NODE_PARENT(node) \ + ((struct tnode *)rcu_dereference(((node)->parent & ~NODE_TYPE_MASK))) + +#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK) + +#define NODE_SET_PARENT(node, ptr) \ + rcu_assign_pointer((node)->parent, \ + ((unsigned long)(ptr)) | NODE_TYPE(node)) + +#define IS_TNODE(n) (!(n->parent & T_LEAF)) +#define IS_LEAF(n) (n->parent & T_LEAF) struct node { - t_key key; - unsigned long _parent; + t_key key; + unsigned long parent; }; struct leaf { - t_key key; - unsigned long _parent; + t_key key; + unsigned long parent; struct hlist_head list; + struct rcu_head rcu; }; struct leaf_info { struct hlist_node hlist; + struct rcu_head rcu; int plen; struct list_head falh; }; struct tnode { - t_key key; - unsigned long _parent; - unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */ - unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */ - unsigned short full_children; /* KEYLENGTH bits needed */ - unsigned short empty_children; /* KEYLENGTH bits needed */ - struct node *child[0]; + t_key key; + unsigned long parent; + unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */ + unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */ + unsigned short full_children; /* KEYLENGTH bits needed */ + unsigned short empty_children; /* KEYLENGTH bits needed */ + struct rcu_head rcu; + struct node *child[0]; }; #ifdef CONFIG_IP_FIB_TRIE_STATS @@ -150,77 +150,45 @@ struct trie_stat { }; struct trie { - struct node *trie; + struct node *trie; #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats stats; #endif - int size; + int size; unsigned int revision; }; -static int trie_debug = 0; - -static int tnode_full(struct tnode *tn, struct node *n); static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n); static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull); -static int tnode_child_length(struct tnode *tn); static struct node *resize(struct trie *t, struct tnode *tn); -static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err); -static struct tnode *halve(struct trie *t, struct tnode *tn, int *err); +static struct tnode *inflate(struct trie *t, struct tnode *tn); +static struct tnode *halve(struct trie *t, struct tnode *tn); static void tnode_free(struct tnode *tn); static void trie_dump_seq(struct seq_file *seq, struct trie *t); -extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio); -extern int fib_detect_death(struct fib_info *fi, int order, - struct fib_info **last_resort, int *last_idx, int *dflt); - -extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, int z, int tb_id, - struct nlmsghdr *n, struct netlink_skb_parms *req); -static kmem_cache_t *fn_alias_kmem; +static kmem_cache_t *fn_alias_kmem __read_mostly; static struct trie *trie_local = NULL, *trie_main = NULL; -static void trie_bug(char *err) -{ - printk("Trie Bug: %s\n", err); - BUG(); -} + +/* rcu_read_lock needs to be hold by caller from readside */ static inline struct node *tnode_get_child(struct tnode *tn, int i) { - if (i >= 1<<tn->bits) - trie_bug("tnode_get_child"); + BUG_ON(i >= 1 << tn->bits); - return tn->child[i]; + return rcu_dereference(tn->child[i]); } -static inline int tnode_child_length(struct tnode *tn) +static inline int tnode_child_length(const struct tnode *tn) { - return 1<<tn->bits; + return 1 << tn->bits; } -/* - _________________________________________________________________ - | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C | - ---------------------------------------------------------------- - 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 - - _________________________________________________________________ - | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u | - ----------------------------------------------------------------- - 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 - - tp->pos = 7 - tp->bits = 3 - n->pos = 15 - n->bits=4 - KEYLENGTH=32 -*/ - static inline t_key tkey_extract_bits(t_key a, int offset, int bits) { - if (offset < KEYLENGTH) + if (offset < KEYLENGTH) return ((t_key)(a << offset)) >> (KEYLENGTH - bits); - else + else return 0; } @@ -233,8 +201,8 @@ static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b) { if (bits == 0 || offset >= KEYLENGTH) return 1; - bits = bits > KEYLENGTH ? KEYLENGTH : bits; - return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0; + bits = bits > KEYLENGTH ? KEYLENGTH : bits; + return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0; } static inline int tkey_mismatch(t_key a, int offset, t_key b) @@ -249,14 +217,6 @@ static inline int tkey_mismatch(t_key a, int offset, t_key b) return i; } -/* Candiate for fib_semantics */ - -static void fn_free_alias(struct fib_alias *fa) -{ - fib_release_info(fa->fa_info); - kmem_cache_free(fn_alias_kmem, fa); -} - /* To understand this stuff, an understanding of keys and all their bits is necessary. Every node in the trie has a key associated with it, but not @@ -295,7 +255,7 @@ static void fn_free_alias(struct fib_alias *fa) tp->pos = 7 tp->bits = 3 n->pos = 15 - n->bits=4 + n->bits = 4 First, let's just ignore the bits that come before the parent tp, that is the bits from 0 to (tp->pos-1). They are *known* but at this point we do @@ -320,60 +280,65 @@ static void fn_free_alias(struct fib_alias *fa) */ -static void check_tnode(struct tnode *tn) +static inline void check_tnode(const struct tnode *tn) { - if (tn && tn->pos+tn->bits > 32) { - printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn, tn->pos, tn->bits); - } + WARN_ON(tn && tn->pos+tn->bits > 32); } static int halve_threshold = 25; static int inflate_threshold = 50; -static struct leaf *leaf_new(void) + +static void __alias_free_mem(struct rcu_head *head) { - struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL); - if (l) { - NODE_INIT_PARENT(l, T_LEAF); - INIT_HLIST_HEAD(&l->list); - } - return l; + struct fib_alias *fa = container_of(head, struct fib_alias, rcu); + kmem_cache_free(fn_alias_kmem, fa); } -static struct leaf_info *leaf_info_new(int plen) +static inline void alias_free_mem_rcu(struct fib_alias *fa) { - struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL); - if (li) { - li->plen = plen; - INIT_LIST_HEAD(&li->falh); - } - return li; + call_rcu(&fa->rcu, __alias_free_mem); +} + +static void __leaf_free_rcu(struct rcu_head *head) +{ + kfree(container_of(head, struct leaf, rcu)); +} + +static inline void free_leaf(struct leaf *leaf) +{ + call_rcu(&leaf->rcu, __leaf_free_rcu); } -static inline void free_leaf(struct leaf *l) +static void __leaf_info_free_rcu(struct rcu_head *head) { - kfree(l); + kfree(container_of(head, struct leaf_info, rcu)); } -static inline void free_leaf_info(struct leaf_info *li) +static inline void free_leaf_info(struct leaf_info *leaf) { - kfree(li); + call_rcu(&leaf->rcu, __leaf_info_free_rcu); } static struct tnode *tnode_alloc(unsigned int size) { - if (size <= PAGE_SIZE) { - return kmalloc(size, GFP_KERNEL); - } else { - return (struct tnode *) - __get_free_pages(GFP_KERNEL, get_order(size)); - } + struct page *pages; + + if (size <= PAGE_SIZE) + return kcalloc(size, 1, GFP_KERNEL); + + pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size)); + if (!pages) + return NULL; + + return page_address(pages); } -static void __tnode_free(struct tnode *tn) +static void __tnode_free_rcu(struct rcu_head *head) { + struct tnode *tn = container_of(head, struct tnode, rcu); unsigned int size = sizeof(struct tnode) + - (1<<tn->bits) * sizeof(struct node *); + (1 << tn->bits) * sizeof(struct node *); if (size <= PAGE_SIZE) kfree(tn); @@ -381,15 +346,40 @@ static void __tnode_free(struct tnode *tn) free_pages((unsigned long)tn, get_order(size)); } +static inline void tnode_free(struct tnode *tn) +{ + call_rcu(&tn->rcu, __tnode_free_rcu); +} + +static struct leaf *leaf_new(void) +{ + struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL); + if (l) { + l->parent = T_LEAF; + INIT_HLIST_HEAD(&l->list); + } + return l; +} + +static struct leaf_info *leaf_info_new(int plen) +{ + struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL); + if (li) { + li->plen = plen; + INIT_LIST_HEAD(&li->falh); + } + return li; +} + static struct tnode* tnode_new(t_key key, int pos, int bits) { int nchildren = 1<<bits; int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *); struct tnode *tn = tnode_alloc(sz); - if (tn) { + if (tn) { memset(tn, 0, sz); - NODE_INIT_PARENT(tn, T_TNODE); + tn->parent = T_TNODE; tn->pos = pos; tn->bits = bits; tn->key = key; @@ -397,38 +387,17 @@ static struct tnode* tnode_new(t_key key, int pos, int bits) tn->empty_children = 1<<bits; } - if (trie_debug > 0) - printk("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode), - (unsigned int) (sizeof(struct node) * 1<<bits)); + pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode), + (unsigned int) (sizeof(struct node) * 1<<bits)); return tn; } -static void tnode_free(struct tnode *tn) -{ - if (!tn) { - trie_bug("tnode_free\n"); - } - if (IS_LEAF(tn)) { - free_leaf((struct leaf *)tn); - if (trie_debug > 0 ) - printk("FL %p \n", tn); - } - else if (IS_TNODE(tn)) { - __tnode_free(tn); - if (trie_debug > 0 ) - printk("FT %p \n", tn); - } - else { - trie_bug("tnode_free\n"); - } -} - /* * Check whether a tnode 'n' is "full", i.e. it is an internal node * and no bits are skipped. See discussion in dyntree paper p. 6 */ -static inline int tnode_full(struct tnode *tn, struct node *n) +static inline int tnode_full(const struct tnode *tn, const struct node *n) { if (n == NULL || IS_LEAF(n)) return 0; @@ -448,15 +417,11 @@ static inline void put_child(struct trie *t, struct tnode *tn, int i, struct nod static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull) { - struct node *chi; + struct node *chi = tn->child[i]; int isfull; - if (i >= 1<<tn->bits) { - printk("bits=%d, i=%d\n", tn->bits, i); - trie_bug("tnode_put_child_reorg bits"); - } - write_lock_bh(&fib_lock); - chi = tn->child[i]; + BUG_ON(i >= 1<<tn->bits); + /* update emptyChildren */ if (n == NULL && chi != NULL) @@ -465,33 +430,32 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int w tn->empty_children--; /* update fullChildren */ - if (wasfull == -1) + if (wasfull == -1) wasfull = tnode_full(tn, chi); isfull = tnode_full(tn, n); if (wasfull && !isfull) tn->full_children--; - else if (!wasfull && isfull) tn->full_children++; + if (n) NODE_SET_PARENT(n, tn); - tn->child[i] = n; - write_unlock_bh(&fib_lock); + rcu_assign_pointer(tn->child[i], n); } static struct node *resize(struct trie *t, struct tnode *tn) { int i; int err = 0; + struct tnode *old_tn; if (!tn) return NULL; - if (trie_debug) - printk("In tnode_resize %p inflate_threshold=%d threshold=%d\n", - tn, inflate_threshold, halve_threshold); + pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n", + tn, inflate_threshold, halve_threshold); /* No children */ if (tn->empty_children == tnode_child_length(tn)) { @@ -501,20 +465,16 @@ static struct node *resize(struct trie *t, struct tnode *tn) /* One child */ if (tn->empty_children == tnode_child_length(tn) - 1) for (i = 0; i < tnode_child_length(tn); i++) { + struct node *n; - write_lock_bh(&fib_lock); - if (tn->child[i] != NULL) { - - /* compress one level */ - struct node *n = tn->child[i]; - if (n) - NODE_INIT_PARENT(n, NODE_TYPE(n)); + n = tn->child[i]; + if (!n) + continue; - write_unlock_bh(&fib_lock); - tnode_free(tn); - return n; - } - write_unlock_bh(&fib_lock); + /* compress one level */ + NODE_SET_PARENT(n, NULL); + tnode_free(tn); + return n; } /* * Double as long as the resulting node has a number of @@ -566,16 +526,16 @@ static struct node *resize(struct trie *t, struct tnode *tn) * * expand not_to_be_doubled and to_be_doubled, and shorten: * 100 * (tnode_child_length(tn) - tn->empty_children + - * tn->full_children ) >= inflate_threshold * new_child_length + * tn->full_children) >= inflate_threshold * new_child_length * * expand new_child_length: * 100 * (tnode_child_length(tn) - tn->empty_children + - * tn->full_children ) >= + * tn->full_children) >= * inflate_threshold * tnode_child_length(tn) * 2 * * shorten again: * 50 * (tn->full_children + tnode_child_length(tn) - - * tn->empty_children ) >= inflate_threshold * + * tn->empty_children) >= inflate_threshold * * tnode_child_length(tn) * */ @@ -587,9 +547,10 @@ static struct node *resize(struct trie *t, struct tnode *tn) 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >= inflate_threshold * tnode_child_length(tn))) { - tn = inflate(t, tn, &err); - - if (err) { + old_tn = tn; + tn = inflate(t, tn); + if (IS_ERR(tn)) { + tn = old_tn; #ifdef CONFIG_IP_FIB_TRIE_STATS t->stats.resize_node_skipped++; #endif @@ -609,9 +570,10 @@ static struct node *resize(struct trie *t, struct tnode *tn) 100 * (tnode_child_length(tn) - tn->empty_children) < halve_threshold * tnode_child_length(tn)) { - tn = halve(t, tn, &err); - - if (err) { + old_tn = tn; + tn = halve(t, tn); + if (IS_ERR(tn)) { + tn = old_tn; #ifdef CONFIG_IP_FIB_TRIE_STATS t->stats.resize_node_skipped++; #endif @@ -621,44 +583,37 @@ static struct node *resize(struct trie *t, struct tnode *tn) /* Only one child remains */ - if (tn->empty_children == tnode_child_length(tn) - 1) for (i = 0; i < tnode_child_length(tn); i++) { - - write_lock_bh(&fib_lock); - if (tn->child[i] != NULL) { - /* compress one level */ - struct node *n = tn->child[i]; - - if (n) - NODE_INIT_PARENT(n, NODE_TYPE(n)); - - write_unlock_bh(&fib_lock); - tnode_free(tn); - return n; - } - write_unlock_bh(&fib_lock); + struct node *n; + + n = tn->child[i]; + if (!n) + continue; + + /* compress one level */ + + NODE_SET_PARENT(n, NULL); + tnode_free(tn); + return n; } return (struct node *) tn; } -static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err) +static struct tnode *inflate(struct trie *t, struct tnode *tn) { struct tnode *inode; struct tnode *oldtnode = tn; int olen = tnode_child_length(tn); int i; - if (trie_debug) - printk("In inflate\n"); + pr_debug("In inflate\n"); tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1); - if (!tn) { - *err = -ENOMEM; - return oldtnode; - } + if (!tn) + return ERR_PTR(-ENOMEM); /* * Preallocate and store tnodes before the actual work so we @@ -666,8 +621,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err) * fails. In case of failure we return the oldnode and inflate * of tnode is ignored. */ - - for(i = 0; i < olen; i++) { + + for (i = 0; i < olen; i++) { struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i); if (inode && @@ -675,46 +630,30 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err) inode->pos == oldtnode->pos + oldtnode->bits && inode->bits > 1) { struct tnode *left, *right; - t_key m = TKEY_GET_MASK(inode->pos, 1); left = tnode_new(inode->key&(~m), inode->pos + 1, inode->bits - 1); + if (!left) + goto nomem; - if (!left) { - *err = -ENOMEM; - break; - } - right = tnode_new(inode->key|m, inode->pos + 1, inode->bits - 1); - if (!right) { - *err = -ENOMEM; - break; - } + if (!right) { + tnode_free(left); + goto nomem; + } put_child(t, tn, 2*i, (struct node *) left); put_child(t, tn, 2*i+1, (struct node *) right); } } - if (*err) { - int size = tnode_child_length(tn); - int j; - - for(j = 0; j < size; j++) - if (tn->child[j]) - tnode_free((struct tnode *)tn->child[j]); - - tnode_free(tn); - - *err = -ENOMEM; - return oldtnode; - } - - for(i = 0; i < olen; i++) { + for (i = 0; i < olen; i++) { struct node *node = tnode_get_child(oldtnode, i); + struct tnode *left, *right; + int size, j; /* An empty child */ if (node == NULL) @@ -740,76 +679,82 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err) put_child(t, tn, 2*i+1, inode->child[1]); tnode_free(inode); + continue; } - /* An internal node with more than two children */ - else { - struct tnode *left, *right; - int size, j; - - /* We will replace this node 'inode' with two new - * ones, 'left' and 'right', each with half of the - * original children. The two new nodes will have - * a position one bit further down the key and this - * means that the "significant" part of their keys - * (see the discussion near the top of this file) - * will differ by one bit, which will be "0" in - * left's key and "1" in right's key. Since we are - * moving the key position by one step, the bit that - * we are moving away from - the bit at position - * (inode->pos) - is the one that will differ between - * left and right. So... we synthesize that bit in the - * two new keys. - * The mask 'm' below will be a single "one" bit at - * the position (inode->pos) - */ - - /* Use the old key, but set the new significant - * bit to zero. - */ + /* An internal node with more than two children */ + + /* We will replace this node 'inode' with two new + * ones, 'left' and 'right', each with half of the + * original children. The two new nodes will have + * a position one bit further down the key and this + * means that the "significant" part of their keys + * (see the discussion near the top of this file) + * will differ by one bit, which will be "0" in + * left's key and "1" in right's key. Since we are + * moving the key position by one step, the bit that + * we are moving away from - the bit at position + * (inode->pos) - is the one that will differ between + * left and right. So... we synthesize that bit in the + * two new keys. + * The mask 'm' below will be a single "one" bit at + * the position (inode->pos) + */ - left = (struct tnode *) tnode_get_child(tn, 2*i); - put_child(t, tn, 2*i, NULL); + /* Use the old key, but set the new significant + * bit to zero. + */ - if (!left) - BUG(); + left = (struct tnode *) tnode_get_child(tn, 2*i); + put_child(t, tn, 2*i, NULL); - right = (struct tnode *) tnode_get_child(tn, 2*i+1); - put_child(t, tn, 2*i+1, NULL); + BUG_ON(!left); - if (!right) - BUG(); + right = (struct tnode *) tnode_get_child(tn, 2*i+1); + put_child(t, tn, 2*i+1, NULL); - size = tnode_child_length(left); - for(j = 0; j < size; j++) { - put_child(t, left, j, inode->child[j]); - put_child(t, right, j, inode->child[j + size]); - } - put_child(t, tn, 2*i, resize(t, left)); - put_child(t, tn, 2*i+1, resize(t, right)); + BUG_ON(!right); - tnode_free(inode); + size = tnode_child_length(left); + for (j = 0; j < size; j++) { + put_child(t, left, j, inode->child[j]); + put_child(t, right, j, inode->child[j + size]); } + put_child(t, tn, 2*i, resize(t, left)); + put_child(t, tn, 2*i+1, resize(t, right)); + + tnode_free(inode); } tnode_free(oldtnode); return tn; +nomem: + { + int size = tnode_child_length(tn); + int j; + + for (j = 0; j < size; j++) + if (tn->child[j]) + tnode_free((struct tnode *)tn->child[j]); + + tnode_free(tn); + + return ERR_PTR(-ENOMEM); + } } -static struct tnode *halve(struct trie *t, struct tnode *tn, int *err) +static struct tnode *halve(struct trie *t, struct tnode *tn) { struct tnode *oldtnode = tn; struct node *left, *right; int i; int olen = tnode_child_length(tn); - if (trie_debug) printk("In halve\n"); + pr_debug("In halve\n"); tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1); - if (!tn) { - *err = -ENOMEM; - return oldtnode; - } + if (!tn) + return ERR_PTR(-ENOMEM); /* * Preallocate and store tnodes before the actual work so we @@ -818,38 +763,27 @@ static struct tnode *halve(struct trie *t, struct tnode *tn, int *err) * of tnode is ignored. */ - for(i = 0; i < olen; i += 2) { + for (i = 0; i < olen; i += 2) { left = tnode_get_child(oldtnode, i); right = tnode_get_child(oldtnode, i+1); /* Two nonempty children */ - if (left && right) { - struct tnode *newBinNode = - tnode_new(left->key, tn->pos + tn->bits, 1); + if (left && right) { + struct tnode *newn; - if (!newBinNode) { - *err = -ENOMEM; - break; - } - put_child(t, tn, i/2, (struct node *)newBinNode); - } - } + newn = tnode_new(left->key, tn->pos + tn->bits, 1); - if (*err) { - int size = tnode_child_length(tn); - int j; + if (!newn) + goto nomem; - for(j = 0; j < size; j++) - if (tn->child[j]) - tnode_free((struct tnode *)tn->child[j]); + put_child(t, tn, i/2, (struct node *)newn); + } - tnode_free(tn); - - *err = -ENOMEM; - return oldtnode; } - for(i = 0; i < olen; i += 2) { + for (i = 0; i < olen; i += 2) { + struct tnode *newBinNode; + left = tnode_get_child(oldtnode, i); right = tnode_get_child(oldtnode, i+1); @@ -858,88 +792,99 @@ static struct tnode *halve(struct trie *t, struct tnode *tn, int *err) if (right == NULL) /* Both are empty */ continue; put_child(t, tn, i/2, right); - } else if (right == NULL) + continue; + } + + if (right == NULL) { put_child(t, tn, i/2, left); + continue; + } /* Two nonempty children */ - else { - struct tnode *newBinNode = - (struct tnode *) tnode_get_child(tn, i/2); - put_child(t, tn, i/2, NULL); - - if (!newBinNode) - BUG(); - - put_child(t, newBinNode, 0, left); - put_child(t, newBinNode, 1, right); - put_child(t, tn, i/2, resize(t, newBinNode)); - } + newBinNode = (struct tnode *) tnode_get_child(tn, i/2); + put_child(t, tn, i/2, NULL); + put_child(t, newBinNode, 0, left); + put_child(t, newBinNode, 1, right); + put_child(t, tn, i/2, resize(t, newBinNode)); } tnode_free(oldtnode); return tn; +nomem: + { + int size = tnode_child_length(tn); + int j; + + for (j = 0; j < size; j++) + if (tn->child[j]) + tnode_free((struct tnode *)tn->child[j]); + + tnode_free(tn); + + return ERR_PTR(-ENOMEM); + } } -static void *trie_init(struct trie *t) +static void trie_init(struct trie *t) { - if (t) { - t->size = 0; - t->trie = NULL; - t->revision = 0; + if (!t) + return; + + t->size = 0; + rcu_assign_pointer(t->trie, NULL); + t->revision = 0; #ifdef CONFIG_IP_FIB_TRIE_STATS - memset(&t->stats, 0, sizeof(struct trie_use_stats)); + memset(&t->stats, 0, sizeof(struct trie_use_stats)); #endif - } - return t; } +/* readside most use rcu_read_lock currently dump routines + via get_fa_head and dump */ + static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen) { struct hlist_node *node; struct leaf_info *li; - hlist_for_each_entry(li, node, head, hlist) { + hlist_for_each_entry_rcu(li, node, head, hlist) if (li->plen == plen) return li; - } + return NULL; } static inline struct list_head * get_fa_head(struct leaf *l, int plen) { - struct list_head *fa_head = NULL; struct leaf_info *li = find_leaf_info(&l->list, plen); - if (li) - fa_head = &li->falh; + if (!li) + return NULL; - return fa_head; + return &li->falh; } static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new) { - struct leaf_info *li = NULL, *last = NULL; - struct hlist_node *node, *tmp; - - write_lock_bh(&fib_lock); - - if (hlist_empty(head)) - hlist_add_head(&new->hlist, head); - else { - hlist_for_each_entry_safe(li, node, tmp, head, hlist) { - - if (new->plen > li->plen) - break; - - last = li; - } - if (last) - hlist_add_after(&last->hlist, &new->hlist); - else - hlist_add_before(&new->hlist, &li->hlist); - } - write_unlock_bh(&fib_lock); + struct leaf_info *li = NULL, *last = NULL; + struct hlist_node *node; + + if (hlist_empty(head)) { + hlist_add_head_rcu(&new->hlist, head); + } else { + hlist_for_each_entry(li, node, head, hlist) { + if (new->plen > li->plen) + break; + + last = li; + } + if (last) + hlist_add_after_rcu(&last->hlist, &new->hlist); + else + hlist_add_before_rcu(&new->hlist, &li->hlist); + } } +/* rcu_read_lock needs to be hold by caller from readside */ + static struct leaf * fib_find_node(struct trie *t, u32 key) { @@ -948,61 +893,43 @@ fib_find_node(struct trie *t, u32 key) struct node *n; pos = 0; - n = t->trie; + n = rcu_dereference(t->trie); while (n != NULL && NODE_TYPE(n) == T_TNODE) { tn = (struct tnode *) n; - + check_tnode(tn); - + if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { - pos=tn->pos + tn->bits; + pos = tn->pos + tn->bits; n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); - } - else + } else break; } /* Case we have found a leaf. Compare prefixes */ - if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) { - struct leaf *l = (struct leaf *) n; - return l; - } + if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) + return (struct leaf *)n; + return NULL; } static struct node *trie_rebalance(struct trie *t, struct tnode *tn) { - int i = 0; int wasfull; t_key cindex, key; struct tnode *tp = NULL; - if (!tn) - BUG(); - key = tn->key; - i = 0; while (tn != NULL && NODE_PARENT(tn) != NULL) { - if (i > 10) { - printk("Rebalance tn=%p \n", tn); - if (tn) printk("tn->parent=%p \n", NODE_PARENT(tn)); - - printk("Rebalance tp=%p \n", tp); - if (tp) printk("tp->parent=%p \n", NODE_PARENT(tp)); - } - - if (i > 12) BUG(); - i++; - tp = NODE_PARENT(tn); cindex = tkey_extract_bits(key, tp->pos, tp->bits); wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); tn = (struct tnode *) resize (t, (struct tnode *)tn); tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull); - + if (!NODE_PARENT(tn)) break; @@ -1015,6 +942,8 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn) return (struct node*) tn; } +/* only used from updater-side */ + static struct list_head * fib_insert_node(struct trie *t, int *err, u32 key, int plen) { @@ -1050,20 +979,16 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) while (n != NULL && NODE_TYPE(n) == T_TNODE) { tn = (struct tnode *) n; - + check_tnode(tn); - + if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { tp = tn; - pos=tn->pos + tn->bits; + pos = tn->pos + tn->bits; n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); - if (n && NODE_PARENT(n) != tn) { - printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n)); - BUG(); - } - } - else + BUG_ON(n && NODE_PARENT(n) != tn); + } else break; } @@ -1073,17 +998,15 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) * tp is n's (parent) ----> NULL or TNODE */ - if (tp && IS_LEAF(tp)) - BUG(); - + BUG_ON(tp && IS_LEAF(tp)); /* Case 1: n is a leaf. Compare prefixes */ if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) { - struct leaf *l = ( struct leaf *) n; - + struct leaf *l = (struct leaf *) n; + li = leaf_info_new(plen); - + if (!li) { *err = -ENOMEM; goto err; @@ -1113,35 +1036,29 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) fa_head = &li->falh; insert_leaf_info(&l->list, li); - /* Case 2: n is NULL, and will just insert a new leaf */ if (t->trie && n == NULL) { + /* Case 2: n is NULL, and will just insert a new leaf */ NODE_SET_PARENT(l, tp); - - if (!tp) - BUG(); - else { - cindex = tkey_extract_bits(key, tp->pos, tp->bits); - put_child(t, (struct tnode *)tp, cindex, (struct node *)l); - } - } - /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ - else { + cindex = tkey_extract_bits(key, tp->pos, tp->bits); + put_child(t, (struct tnode *)tp, cindex, (struct node *)l); + } else { + /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ /* * Add a new tnode here * first tnode need some special handling */ if (tp) - pos=tp->pos+tp->bits; + pos = tp->pos+tp->bits; else - pos=0; + pos = 0; + if (n) { newpos = tkey_mismatch(key, pos, n->key); tn = tnode_new(n->key, newpos, 1); - } - else { + } else { newpos = 0; tn = tnode_new(key, newpos, 1); /* First tnode */ } @@ -1151,32 +1068,33 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) tnode_free((struct tnode *) l); *err = -ENOMEM; goto err; - } - + } + NODE_SET_PARENT(tn, tp); - missbit=tkey_extract_bits(key, newpos, 1); + missbit = tkey_extract_bits(key, newpos, 1); put_child(t, tn, missbit, (struct node *)l); put_child(t, tn, 1-missbit, n); if (tp) { cindex = tkey_extract_bits(key, tp->pos, tp->bits); put_child(t, (struct tnode *)tp, cindex, (struct node *)tn); - } - else { - t->trie = (struct node*) tn; /* First tnode */ + } else { + rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */ tp = tn; } } - if (tp && tp->pos+tp->bits > 32) { + + if (tp && tp->pos + tp->bits > 32) printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n", tp, tp->pos, tp->bits, key, plen); - } + /* Rebalance the trie */ - t->trie = trie_rebalance(t, tp); + + rcu_assign_pointer(t->trie, trie_rebalance(t, tp)); done: t->revision++; -err:; +err: return fa_head; } @@ -1204,17 +1122,18 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, key = ntohl(key); - if (trie_debug) - printk("Insert table=%d %08x/%d\n", tb->tb_id, key, plen); + pr_debug("Insert table=%d %08x/%d\n", tb->tb_id, key, plen); - mask = ntohl( inet_make_mask(plen) ); + mask = ntohl(inet_make_mask(plen)); if (key & ~mask) return -EINVAL; key = key & mask; - if ((fi = fib_create_info(r, rta, nlhdr, &err)) == NULL) + fi = fib_create_info(r, rta, nlhdr, &err); + + if (!fi) goto err; l = fib_find_node(t, key); @@ -1236,8 +1155,7 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, * and we need to allocate a new one of those as well. */ - if (fa && - fa->fa_info->fib_priority == fi->fib_priority) { + if (fa && fa->fa_info->fib_priority == fi->fib_priority) { struct fib_alias *fa_orig; err = -EEXIST; @@ -1248,22 +1166,27 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, struct fib_info *fi_drop; u8 state; - write_lock_bh(&fib_lock); + err = -ENOBUFS; + new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL); + if (new_fa == NULL) + goto out; fi_drop = fa->fa_info; - fa->fa_info = fi; - fa->fa_type = type; - fa->fa_scope = r->rtm_scope; + new_fa->fa_tos = fa->fa_tos; + new_fa->fa_info = fi; + new_fa->fa_type = type; + new_fa->fa_scope = r->rtm_scope; state = fa->fa_state; - fa->fa_state &= ~FA_S_ACCESSED; + new_fa->fa_state &= ~FA_S_ACCESSED; - write_unlock_bh(&fib_lock); + list_replace_rcu(&fa->fa_list, &new_fa->fa_list); + alias_free_mem_rcu(fa); fib_release_info(fi_drop); if (state & FA_S_ACCESSED) - rt_cache_flush(-1); + rt_cache_flush(-1); - goto succeeded; + goto succeeded; } /* Error if we find a perfect match which * uses the same scope, type, and nexthop @@ -1285,7 +1208,7 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, fa = fa_orig; } err = -ENOENT; - if (!(nlhdr->nlmsg_flags&NLM_F_CREATE)) + if (!(nlhdr->nlmsg_flags & NLM_F_CREATE)) goto out; err = -ENOBUFS; @@ -1298,9 +1221,6 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, new_fa->fa_type = type; new_fa->fa_scope = r->rtm_scope; new_fa->fa_state = 0; -#if 0 - new_fa->dst = NULL; -#endif /* * Insert new entry to the list. */ @@ -1312,12 +1232,8 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, goto out_free_new_fa; } - write_lock_bh(&fib_lock); - - list_add_tail(&new_fa->fa_list, - (fa ? &fa->fa_list : fa_head)); - - write_unlock_bh(&fib_lock); + list_add_tail_rcu(&new_fa->fa_list, + (fa ? &fa->fa_list : fa_head)); rt_cache_flush(-1); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req); @@ -1328,11 +1244,14 @@ out_free_new_fa: kmem_cache_free(fn_alias_kmem, new_fa); out: fib_release_info(fi); -err:; +err: return err; } -static inline int check_leaf(struct trie *t, struct leaf *l, t_key key, int *plen, const struct flowi *flp, + +/* should be clalled with rcu_read_lock */ +static inline int check_leaf(struct trie *t, struct leaf *l, + t_key key, int *plen, const struct flowi *flp, struct fib_result *res) { int err, i; @@ -1341,8 +1260,7 @@ static inline int check_leaf(struct trie *t, struct leaf *l, t_key key, int *pl struct hlist_head *hhead = &l->list; struct hlist_node *node; - hlist_for_each_entry(li, node, hhead, hlist) { - + hlist_for_each_entry_rcu(li, node, hhead, hlist) { i = li->plen; mask = ntohl(inet_make_mask(i)); if (l->key != (key & mask)) @@ -1370,13 +1288,17 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result struct node *n; struct tnode *pn; int pos, bits; - t_key key=ntohl(flp->fl4_dst); + t_key key = ntohl(flp->fl4_dst); int chopped_off; t_key cindex = 0; int current_prefix_length = KEYLENGTH; - n = t->trie; + struct tnode *cn; + t_key node_prefix, key_prefix, pref_mismatch; + int mp; + + rcu_read_lock(); - read_lock(&fib_lock); + n = rcu_dereference(t->trie); if (!n) goto failed; @@ -1393,8 +1315,7 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result pn = (struct tnode *) n; chopped_off = 0; - while (pn) { - + while (pn) { pos = pn->pos; bits = pn->bits; @@ -1410,130 +1331,129 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result goto backtrace; } - if (IS_TNODE(n)) { + if (IS_LEAF(n)) { + if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0) + goto found; + else + goto backtrace; + } + #define HL_OPTIMIZE #ifdef HL_OPTIMIZE - struct tnode *cn = (struct tnode *)n; - t_key node_prefix, key_prefix, pref_mismatch; - int mp; + cn = (struct tnode *)n; - /* - * It's a tnode, and we can do some extra checks here if we - * like, to avoid descending into a dead-end branch. - * This tnode is in the parent's child array at index - * key[p_pos..p_pos+p_bits] but potentially with some bits - * chopped off, so in reality the index may be just a - * subprefix, padded with zero at the end. - * We can also take a look at any skipped bits in this - * tnode - everything up to p_pos is supposed to be ok, - * and the non-chopped bits of the index (se previous - * paragraph) are also guaranteed ok, but the rest is - * considered unknown. - * - * The skipped bits are key[pos+bits..cn->pos]. - */ - - /* If current_prefix_length < pos+bits, we are already doing - * actual prefix matching, which means everything from - * pos+(bits-chopped_off) onward must be zero along some - * branch of this subtree - otherwise there is *no* valid - * prefix present. Here we can only check the skipped - * bits. Remember, since we have already indexed into the - * parent's child array, we know that the bits we chopped of - * *are* zero. - */ + /* + * It's a tnode, and we can do some extra checks here if we + * like, to avoid descending into a dead-end branch. + * This tnode is in the parent's child array at index + * key[p_pos..p_pos+p_bits] but potentially with some bits + * chopped off, so in reality the index may be just a + * subprefix, padded with zero at the end. + * We can also take a look at any skipped bits in this + * tnode - everything up to p_pos is supposed to be ok, + * and the non-chopped bits of the index (se previous + * paragraph) are also guaranteed ok, but the rest is + * considered unknown. + * + * The skipped bits are key[pos+bits..cn->pos]. + */ - /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */ - - if (current_prefix_length < pos+bits) { - if (tkey_extract_bits(cn->key, current_prefix_length, - cn->pos - current_prefix_length) != 0 || - !(cn->child[0])) - goto backtrace; - } + /* If current_prefix_length < pos+bits, we are already doing + * actual prefix matching, which means everything from + * pos+(bits-chopped_off) onward must be zero along some + * branch of this subtree - otherwise there is *no* valid + * prefix present. Here we can only check the skipped + * bits. Remember, since we have already indexed into the + * parent's child array, we know that the bits we chopped of + * *are* zero. + */ - /* - * If chopped_off=0, the index is fully validated and we - * only need to look at the skipped bits for this, the new, - * tnode. What we actually want to do is to find out if - * these skipped bits match our key perfectly, or if we will - * have to count on finding a matching prefix further down, - * because if we do, we would like to have some way of - * verifying the existence of such a prefix at this point. - */ + /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */ - /* The only thing we can do at this point is to verify that - * any such matching prefix can indeed be a prefix to our - * key, and if the bits in the node we are inspecting that - * do not match our key are not ZERO, this cannot be true. - * Thus, find out where there is a mismatch (before cn->pos) - * and verify that all the mismatching bits are zero in the - * new tnode's key. - */ + if (current_prefix_length < pos+bits) { + if (tkey_extract_bits(cn->key, current_prefix_length, + cn->pos - current_prefix_length) != 0 || + !(cn->child[0])) + goto backtrace; + } - /* Note: We aren't very concerned about the piece of the key - * that precede pn->pos+pn->bits, since these have already been - * checked. The bits after cn->pos aren't checked since these are - * by definition "unknown" at this point. Thus, what we want to - * see is if we are about to enter the "prefix matching" state, - * and in that case verify that the skipped bits that will prevail - * throughout this subtree are zero, as they have to be if we are - * to find a matching prefix. - */ + /* + * If chopped_off=0, the index is fully validated and we + * only need to look at the skipped bits for this, the new, + * tnode. What we actually want to do is to find out if + * these skipped bits match our key perfectly, or if we will + * have to count on finding a matching prefix further down, + * because if we do, we would like to have some way of + * verifying the existence of such a prefix at this point. + */ - node_prefix = MASK_PFX(cn->key, cn->pos); - key_prefix = MASK_PFX(key, cn->pos); - pref_mismatch = key_prefix^node_prefix; - mp = 0; + /* The only thing we can do at this point is to verify that + * any such matching prefix can indeed be a prefix to our + * key, and if the bits in the node we are inspecting that + * do not match our key are not ZERO, this cannot be true. + * Thus, find out where there is a mismatch (before cn->pos) + * and verify that all the mismatching bits are zero in the + * new tnode's key. + */ - /* In short: If skipped bits in this node do not match the search - * key, enter the "prefix matching" state.directly. - */ - if (pref_mismatch) { - while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) { - mp++; - pref_mismatch = pref_mismatch <<1; - } - key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp); - - if (key_prefix != 0) - goto backtrace; - - if (current_prefix_length >= cn->pos) - current_prefix_length=mp; - } -#endif - pn = (struct tnode *)n; /* Descend */ - chopped_off = 0; - continue; + /* Note: We aren't very concerned about the piece of the key + * that precede pn->pos+pn->bits, since these have already been + * checked. The bits after cn->pos aren't checked since these are + * by definition "unknown" at this point. Thus, what we want to + * see is if we are about to enter the "prefix matching" state, + * and in that case verify that the skipped bits that will prevail + * throughout this subtree are zero, as they have to be if we are + * to find a matching prefix. + */ + + node_prefix = MASK_PFX(cn->key, cn->pos); + key_prefix = MASK_PFX(key, cn->pos); + pref_mismatch = key_prefix^node_prefix; + mp = 0; + + /* In short: If skipped bits in this node do not match the search + * key, enter the "prefix matching" state.directly. + */ + if (pref_mismatch) { + while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) { + mp++; + pref_mismatch = pref_mismatch <<1; + } + key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp); + + if (key_prefix != 0) + goto backtrace; + + if (current_prefix_length >= cn->pos) + current_prefix_length = mp; } - if (IS_LEAF(n)) { - if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0) - goto found; - } +#endif + pn = (struct tnode *)n; /* Descend */ + chopped_off = 0; + continue; + backtrace: chopped_off++; /* As zero don't change the child key (cindex) */ - while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1)))) { + while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1)))) chopped_off++; - } /* Decrease current_... with bits chopped off */ if (current_prefix_length > pn->pos + pn->bits - chopped_off) current_prefix_length = pn->pos + pn->bits - chopped_off; - + /* * Either we do the actual chop off according or if we have * chopped off all bits in this tnode walk up to our parent. */ - if (chopped_off <= pn->bits) + if (chopped_off <= pn->bits) { cindex &= ~(1 << (chopped_off-1)); - else { + } else { if (NODE_PARENT(pn) == NULL) goto failed; - + /* Get Child's index */ cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits); pn = NODE_PARENT(pn); @@ -1548,10 +1468,11 @@ backtrace: failed: ret = 1; found: - read_unlock(&fib_lock); + rcu_read_unlock(); return ret; } +/* only called from updater side */ static int trie_leaf_remove(struct trie *t, t_key key) { t_key cindex; @@ -1559,24 +1480,20 @@ static int trie_leaf_remove(struct trie *t, t_key key) struct node *n = t->trie; struct leaf *l; - if (trie_debug) - printk("entering trie_leaf_remove(%p)\n", n); + pr_debug("entering trie_leaf_remove(%p)\n", n); /* Note that in the case skipped bits, those bits are *not* checked! * When we finish this, we will have NULL or a T_LEAF, and the * T_LEAF may or may not match our key. */ - while (n != NULL && IS_TNODE(n)) { + while (n != NULL && IS_TNODE(n)) { struct tnode *tn = (struct tnode *) n; check_tnode(tn); n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits)); - if (n && NODE_PARENT(n) != tn) { - printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n)); - BUG(); - } - } + BUG_ON(n && NODE_PARENT(n) != tn); + } l = (struct leaf *) n; if (!n || !tkey_equals(l->key, key)) @@ -1590,23 +1507,24 @@ static int trie_leaf_remove(struct trie *t, t_key key) t->revision++; t->size--; + preempt_disable(); tp = NODE_PARENT(n); tnode_free((struct tnode *) n); if (tp) { cindex = tkey_extract_bits(key, tp->pos, tp->bits); put_child(t, (struct tnode *)tp, cindex, NULL); - t->trie = trie_rebalance(t, tp); - } - else - t->trie = NULL; + rcu_assign_pointer(t->trie, trie_rebalance(t, tp)); + } else + rcu_assign_pointer(t->trie, NULL); + preempt_enable(); return 1; } static int fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, - struct nlmsghdr *nlhdr, struct netlink_skb_parms *req) + struct nlmsghdr *nlhdr, struct netlink_skb_parms *req) { struct trie *t = (struct trie *) tb->tb_data; u32 key, mask; @@ -1615,6 +1533,8 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, struct fib_alias *fa, *fa_to_delete; struct list_head *fa_head; struct leaf *l; + struct leaf_info *li; + if (plen > 32) return -EINVAL; @@ -1624,7 +1544,7 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, memcpy(&key, rta->rta_dst, 4); key = ntohl(key); - mask = ntohl( inet_make_mask(plen) ); + mask = ntohl(inet_make_mask(plen)); if (key & ~mask) return -EINVAL; @@ -1641,11 +1561,11 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, if (!fa) return -ESRCH; - if (trie_debug) - printk("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t); + pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t); fa_to_delete = NULL; fa_head = fa->fa_list.prev; + list_for_each_entry(fa, fa_head, fa_list) { struct fib_info *fi = fa->fa_info; @@ -1664,39 +1584,31 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, } } - if (fa_to_delete) { - int kill_li = 0; - struct leaf_info *li; - - fa = fa_to_delete; - rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req); + if (!fa_to_delete) + return -ESRCH; - l = fib_find_node(t, key); - li = find_leaf_info(&l->list, plen); + fa = fa_to_delete; + rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req); - write_lock_bh(&fib_lock); + l = fib_find_node(t, key); + li = find_leaf_info(&l->list, plen); - list_del(&fa->fa_list); + list_del_rcu(&fa->fa_list); - if (list_empty(fa_head)) { - hlist_del(&li->hlist); - kill_li = 1; - } - write_unlock_bh(&fib_lock); - - if (kill_li) - free_leaf_info(li); + if (list_empty(fa_head)) { + hlist_del_rcu(&li->hlist); + free_leaf_info(li); + } - if (hlist_empty(&l->list)) - trie_leaf_remove(t, key); + if (hlist_empty(&l->list)) + trie_leaf_remove(t, key); - if (fa->fa_state & FA_S_ACCESSED) - rt_cache_flush(-1); + if (fa->fa_state & FA_S_ACCESSED) + rt_cache_flush(-1); - fn_free_alias(fa); - return 0; - } - return -ESRCH; + fib_release_info(fa->fa_info); + alias_free_mem_rcu(fa); + return 0; } static int trie_flush_list(struct trie *t, struct list_head *head) @@ -1706,14 +1618,11 @@ static int trie_flush_list(struct trie *t, struct list_head *head) list_for_each_entry_safe(fa, fa_node, head, fa_list) { struct fib_info *fi = fa->fa_info; - - if (fi && (fi->fib_flags&RTNH_F_DEAD)) { - - write_lock_bh(&fib_lock); - list_del(&fa->fa_list); - write_unlock_bh(&fib_lock); - fn_free_alias(fa); + if (fi && (fi->fib_flags & RTNH_F_DEAD)) { + list_del_rcu(&fa->fa_list); + fib_release_info(fa->fa_info); + alias_free_mem_rcu(fa); found++; } } @@ -1728,37 +1637,34 @@ static int trie_flush_leaf(struct trie *t, struct leaf *l) struct leaf_info *li = NULL; hlist_for_each_entry_safe(li, node, tmp, lih, hlist) { - found += trie_flush_list(t, &li->falh); if (list_empty(&li->falh)) { - - write_lock_bh(&fib_lock); - hlist_del(&li->hlist); - write_unlock_bh(&fib_lock); - + hlist_del_rcu(&li->hlist); free_leaf_info(li); } } return found; } +/* rcu_read_lock needs to be hold by caller from readside */ + static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf) { struct node *c = (struct node *) thisleaf; struct tnode *p; int idx; + struct node *trie = rcu_dereference(t->trie); if (c == NULL) { - if (t->trie == NULL) + if (trie == NULL) return NULL; - if (IS_LEAF(t->trie)) /* trie w. just a leaf */ - return (struct leaf *) t->trie; + if (IS_LEAF(trie)) /* trie w. just a leaf */ + return (struct leaf *) trie; - p = (struct tnode*) t->trie; /* Start */ - } - else + p = (struct tnode*) trie; /* Start */ + } else p = (struct tnode *) NODE_PARENT(c); while (p) { @@ -1771,29 +1677,31 @@ static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf) pos = 0; last = 1 << p->bits; - for(idx = pos; idx < last ; idx++) { - if (p->child[idx]) { - - /* Decend if tnode */ - - while (IS_TNODE(p->child[idx])) { - p = (struct tnode*) p->child[idx]; - idx = 0; - - /* Rightmost non-NULL branch */ - if (p && IS_TNODE(p)) - while (p->child[idx] == NULL && idx < (1 << p->bits)) idx++; - - /* Done with this tnode? */ - if (idx >= (1 << p->bits) || p->child[idx] == NULL ) - goto up; - } - return (struct leaf*) p->child[idx]; + for (idx = pos; idx < last ; idx++) { + c = rcu_dereference(p->child[idx]); + + if (!c) + continue; + + /* Decend if tnode */ + while (IS_TNODE(c)) { + p = (struct tnode *) c; + idx = 0; + + /* Rightmost non-NULL branch */ + if (p && IS_TNODE(p)) + while (!(c = rcu_dereference(p->child[idx])) + && idx < (1<<p->bits)) idx++; + + /* Done with this tnode? */ + if (idx >= (1 << p->bits) || !c) + goto up; } + return (struct leaf *) c; } up: /* No more children go up one step */ - c = (struct node*) p; + c = (struct node *) p; p = (struct tnode *) NODE_PARENT(p); } return NULL; /* Ready. Root of trie */ @@ -1807,23 +1715,24 @@ static int fn_trie_flush(struct fib_table *tb) t->revision++; - for (h=0; (l = nextleaf(t, l)) != NULL; h++) { + rcu_read_lock(); + for (h = 0; (l = nextleaf(t, l)) != NULL; h++) { found += trie_flush_leaf(t, l); if (ll && hlist_empty(&ll->list)) trie_leaf_remove(t, ll->key); ll = l; } + rcu_read_unlock(); if (ll && hlist_empty(&ll->list)) trie_leaf_remove(t, ll->key); - if (trie_debug) - printk("trie_flush found=%d\n", found); + pr_debug("trie_flush found=%d\n", found); return found; } -static int trie_last_dflt=-1; +static int trie_last_dflt = -1; static void fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) @@ -1840,7 +1749,7 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib last_resort = NULL; order = -1; - read_lock(&fib_lock); + rcu_read_lock(); l = fib_find_node(t, 0); if (!l) @@ -1853,20 +1762,20 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib if (list_empty(fa_head)) goto out; - list_for_each_entry(fa, fa_head, fa_list) { + list_for_each_entry_rcu(fa, fa_head, fa_list) { struct fib_info *next_fi = fa->fa_info; - + if (fa->fa_scope != res->scope || fa->fa_type != RTN_UNICAST) continue; - + if (next_fi->fib_priority > res->fi->fib_priority) break; if (!next_fi->fib_nh[0].nh_gw || next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) continue; fa->fa_state |= FA_S_ACCESSED; - + if (fi == NULL) { if (next_fi != res->fi) break; @@ -1904,7 +1813,7 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib } trie_last_dflt = last_idx; out:; - read_unlock(&fib_lock); + rcu_read_unlock(); } static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb, @@ -1913,12 +1822,14 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fi int i, s_i; struct fib_alias *fa; - u32 xkey=htonl(key); + u32 xkey = htonl(key); - s_i=cb->args[3]; + s_i = cb->args[3]; i = 0; - list_for_each_entry(fa, fah, fa_list) { + /* rcu_read_lock is hold by caller */ + + list_for_each_entry_rcu(fa, fah, fa_list) { if (i < s_i) { i++; continue; @@ -1946,10 +1857,10 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fi fa->fa_info, 0) < 0) { cb->args[3] = i; return -1; - } + } i++; } - cb->args[3]=i; + cb->args[3] = i; return skb->len; } @@ -1959,10 +1870,10 @@ static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, str int h, s_h; struct list_head *fa_head; struct leaf *l = NULL; - s_h=cb->args[2]; - for (h=0; (l = nextleaf(t, l)) != NULL; h++) { + s_h = cb->args[2]; + for (h = 0; (l = nextleaf(t, l)) != NULL; h++) { if (h < s_h) continue; if (h > s_h) @@ -1970,7 +1881,7 @@ static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, str sizeof(cb->args) - 3*sizeof(cb->args[0])); fa_head = get_fa_head(l, plen); - + if (!fa_head) continue; @@ -1978,11 +1889,11 @@ static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, str continue; if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) { - cb->args[2]=h; + cb->args[2] = h; return -1; } } - cb->args[2]=h; + cb->args[2] = h; return skb->len; } @@ -1993,25 +1904,24 @@ static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlin s_m = cb->args[1]; - read_lock(&fib_lock); - for (m=0; m<=32; m++) { - + rcu_read_lock(); + for (m = 0; m <= 32; m++) { if (m < s_m) continue; if (m > s_m) memset(&cb->args[2], 0, - sizeof(cb->args) - 2*sizeof(cb->args[0])); + sizeof(cb->args) - 2*sizeof(cb->args[0])); if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) { cb->args[1] = m; goto out; } } - read_unlock(&fib_lock); + rcu_read_unlock(); cb->args[1] = m; return skb->len; - out: - read_unlock(&fib_lock); +out: + rcu_read_unlock(); return -1; } @@ -2051,9 +1961,9 @@ struct fib_table * __init fib_hash_init(int id) trie_init(t); if (id == RT_TABLE_LOCAL) - trie_local = t; + trie_local = t; else if (id == RT_TABLE_MAIN) - trie_main = t; + trie_main = t; if (id == RT_TABLE_LOCAL) printk("IPv4 FIB: Using LC-trie version %s\n", VERSION); @@ -2065,7 +1975,8 @@ struct fib_table * __init fib_hash_init(int id) static void putspace_seq(struct seq_file *seq, int n) { - while (n--) seq_printf(seq, " "); + while (n--) + seq_printf(seq, " "); } static void printbin_seq(struct seq_file *seq, unsigned int v, int bits) @@ -2086,29 +1997,22 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n, seq_printf(seq, "%d/", cindex); printbin_seq(seq, cindex, bits); seq_printf(seq, ": "); - } - else + } else seq_printf(seq, "<root>: "); seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n); - if (IS_LEAF(n)) - seq_printf(seq, "key=%d.%d.%d.%d\n", - n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256); - else { - int plen = ((struct tnode *)n)->pos; - t_key prf=MASK_PFX(n->key, plen); - seq_printf(seq, "key=%d.%d.%d.%d/%d\n", - prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen); - } if (IS_LEAF(n)) { - struct leaf *l=(struct leaf *)n; + struct leaf *l = (struct leaf *)n; struct fib_alias *fa; int i; - for (i=32; i>=0; i--) - if (find_leaf_info(&l->list, i)) { - + + seq_printf(seq, "key=%d.%d.%d.%d\n", + n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256); + + for (i = 32; i >= 0; i--) + if (find_leaf_info(&l->list, i)) { struct list_head *fa_head = get_fa_head(l, i); - + if (!fa_head) continue; @@ -2118,17 +2022,16 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n, putspace_seq(seq, indent+2); seq_printf(seq, "{/%d...dumping}\n", i); - - list_for_each_entry(fa, fa_head, fa_list) { + list_for_each_entry_rcu(fa, fa_head, fa_list) { putspace_seq(seq, indent+2); - if (fa->fa_info->fib_nh == NULL) { - seq_printf(seq, "Error _fib_nh=NULL\n"); - continue; - } if (fa->fa_info == NULL) { seq_printf(seq, "Error fa_info=NULL\n"); continue; } + if (fa->fa_info->fib_nh == NULL) { + seq_printf(seq, "Error _fib_nh=NULL\n"); + continue; + } seq_printf(seq, "{type=%d scope=%d TOS=%d}\n", fa->fa_type, @@ -2136,11 +2039,16 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n, fa->fa_tos); } } - } - else if (IS_TNODE(n)) { + } else { struct tnode *tn = (struct tnode *)n; + int plen = ((struct tnode *)n)->pos; + t_key prf = MASK_PFX(n->key, plen); + + seq_printf(seq, "key=%d.%d.%d.%d/%d\n", + prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen); + putspace_seq(seq, indent); seq_printf(seq, "| "); - seq_printf(seq, "{key prefix=%08x/", tn->key&TKEY_GET_MASK(0, tn->pos)); + seq_printf(seq, "{key prefix=%08x/", tn->key & TKEY_GET_MASK(0, tn->pos)); printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos); seq_printf(seq, "}\n"); putspace_seq(seq, indent); seq_printf(seq, "| "); @@ -2154,194 +2062,196 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n, static void trie_dump_seq(struct seq_file *seq, struct trie *t) { - struct node *n = t->trie; - int cindex=0; - int indent=1; - int pend=0; + struct node *n; + int cindex = 0; + int indent = 1; + int pend = 0; int depth = 0; + struct tnode *tn; - read_lock(&fib_lock); - + rcu_read_lock(); + n = rcu_dereference(t->trie); seq_printf(seq, "------ trie_dump of t=%p ------\n", t); - if (n) { - printnode_seq(seq, indent, n, pend, cindex, 0); - if (IS_TNODE(n)) { - struct tnode *tn = (struct tnode *)n; - pend = tn->pos+tn->bits; - putspace_seq(seq, indent); seq_printf(seq, "\\--\n"); - indent += 3; - depth++; - - while (tn && cindex < (1 << tn->bits)) { - if (tn->child[cindex]) { - - /* Got a child */ - - printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits); - if (IS_LEAF(tn->child[cindex])) { - cindex++; - - } - else { - /* - * New tnode. Decend one level - */ - - depth++; - n = tn->child[cindex]; - tn = (struct tnode *)n; - pend = tn->pos+tn->bits; - putspace_seq(seq, indent); seq_printf(seq, "\\--\n"); - indent+=3; - cindex=0; - } - } - else - cindex++; + if (!n) { + seq_printf(seq, "------ trie is empty\n"); + + rcu_read_unlock(); + return; + } + + printnode_seq(seq, indent, n, pend, cindex, 0); + + if (!IS_TNODE(n)) { + rcu_read_unlock(); + return; + } + + tn = (struct tnode *)n; + pend = tn->pos+tn->bits; + putspace_seq(seq, indent); seq_printf(seq, "\\--\n"); + indent += 3; + depth++; + + while (tn && cindex < (1 << tn->bits)) { + struct node *child = rcu_dereference(tn->child[cindex]); + if (!child) + cindex++; + else { + /* Got a child */ + printnode_seq(seq, indent, child, pend, + cindex, tn->bits); + + if (IS_LEAF(child)) + cindex++; + + else { /* - * Test if we are done + * New tnode. Decend one level */ - - while (cindex >= (1 << tn->bits)) { - /* - * Move upwards and test for root - * pop off all traversed nodes - */ - - if (NODE_PARENT(tn) == NULL) { - tn = NULL; - n = NULL; - break; - } - else { - cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits); - tn = NODE_PARENT(tn); - cindex++; - n = (struct node *)tn; - pend = tn->pos+tn->bits; - indent-=3; - depth--; - } - } + depth++; + n = child; + tn = (struct tnode *)n; + pend = tn->pos+tn->bits; + putspace_seq(seq, indent); + seq_printf(seq, "\\--\n"); + indent += 3; + cindex = 0; } } - else n = NULL; - } - else seq_printf(seq, "------ trie is empty\n"); - read_unlock(&fib_lock); + /* + * Test if we are done + */ + + while (cindex >= (1 << tn->bits)) { + /* + * Move upwards and test for root + * pop off all traversed nodes + */ + + if (NODE_PARENT(tn) == NULL) { + tn = NULL; + break; + } + + cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits); + cindex++; + tn = NODE_PARENT(tn); + pend = tn->pos + tn->bits; + indent -= 3; + depth--; + } + } + rcu_read_unlock(); } static struct trie_stat *trie_stat_new(void) { - struct trie_stat *s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL); + struct trie_stat *s; int i; - if (s) { - s->totdepth = 0; - s->maxdepth = 0; - s->tnodes = 0; - s->leaves = 0; - s->nullpointers = 0; - - for(i=0; i< MAX_CHILDS; i++) - s->nodesizes[i] = 0; - } + s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL); + if (!s) + return NULL; + + s->totdepth = 0; + s->maxdepth = 0; + s->tnodes = 0; + s->leaves = 0; + s->nullpointers = 0; + + for (i = 0; i < MAX_CHILDS; i++) + s->nodesizes[i] = 0; + return s; } static struct trie_stat *trie_collect_stats(struct trie *t) { - struct node *n = t->trie; + struct node *n; struct trie_stat *s = trie_stat_new(); int cindex = 0; - int indent = 1; int pend = 0; int depth = 0; - read_lock(&fib_lock); + if (!s) + return NULL; - if (s) { - if (n) { - if (IS_TNODE(n)) { - struct tnode *tn = (struct tnode *)n; - pend = tn->pos+tn->bits; - indent += 3; - s->nodesizes[tn->bits]++; - depth++; + rcu_read_lock(); + n = rcu_dereference(t->trie); - while (tn && cindex < (1 << tn->bits)) { - if (tn->child[cindex]) { - /* Got a child */ - - if (IS_LEAF(tn->child[cindex])) { - cindex++; - - /* stats */ - if (depth > s->maxdepth) - s->maxdepth = depth; - s->totdepth += depth; - s->leaves++; - } - - else { - /* - * New tnode. Decend one level - */ - - s->tnodes++; - s->nodesizes[tn->bits]++; - depth++; - - n = tn->child[cindex]; - tn = (struct tnode *)n; - pend = tn->pos+tn->bits; - - indent += 3; - cindex = 0; - } - } - else { - cindex++; - s->nullpointers++; - } + if (!n) + return s; + + if (IS_TNODE(n)) { + struct tnode *tn = (struct tnode *)n; + pend = tn->pos+tn->bits; + s->nodesizes[tn->bits]++; + depth++; + + while (tn && cindex < (1 << tn->bits)) { + struct node *ch = rcu_dereference(tn->child[cindex]); + if (ch) { + /* Got a child */ + + if (IS_LEAF(tn->child[cindex])) { + cindex++; + + /* stats */ + if (depth > s->maxdepth) + s->maxdepth = depth; + s->totdepth += depth; + s->leaves++; + } else { /* - * Test if we are done + * New tnode. Decend one level */ - - while (cindex >= (1 << tn->bits)) { - - /* - * Move upwards and test for root - * pop off all traversed nodes - */ - - - if (NODE_PARENT(tn) == NULL) { - tn = NULL; - n = NULL; - break; - } - else { - cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits); - tn = NODE_PARENT(tn); - cindex++; - n = (struct node *)tn; - pend = tn->pos+tn->bits; - indent -= 3; - depth--; - } - } + + s->tnodes++; + s->nodesizes[tn->bits]++; + depth++; + + n = ch; + tn = (struct tnode *)n; + pend = tn->pos+tn->bits; + + cindex = 0; } + } else { + cindex++; + s->nullpointers++; } - else n = NULL; + + /* + * Test if we are done + */ + + while (cindex >= (1 << tn->bits)) { + /* + * Move upwards and test for root + * pop off all traversed nodes + */ + + if (NODE_PARENT(tn) == NULL) { + tn = NULL; + n = NULL; + break; + } + + cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits); + tn = NODE_PARENT(tn); + cindex++; + n = (struct node *)tn; + pend = tn->pos+tn->bits; + depth--; + } } } - read_unlock(&fib_lock); + rcu_read_unlock(); return s; } @@ -2359,17 +2269,22 @@ static struct fib_alias *fib_triestat_get_next(struct seq_file *seq) static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos) { - void *v = NULL; + if (!ip_fib_main_table) + return NULL; - if (ip_fib_main_table) - v = *pos ? fib_triestat_get_next(seq) : SEQ_START_TOKEN; - return v; + if (*pos) + return fib_triestat_get_next(seq); + else + return SEQ_START_TOKEN; } static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; - return v == SEQ_START_TOKEN ? fib_triestat_get_first(seq) : fib_triestat_get_next(seq); + if (v == SEQ_START_TOKEN) + return fib_triestat_get_first(seq); + else + return fib_triestat_get_next(seq); } static void fib_triestat_seq_stop(struct seq_file *seq, void *v) @@ -2388,22 +2303,22 @@ static void collect_and_show(struct trie *t, struct seq_file *seq) { int bytes = 0; /* How many bytes are used, a ref is 4 bytes */ int i, max, pointers; - struct trie_stat *stat; + struct trie_stat *stat; int avdepth; stat = trie_collect_stats(t); - bytes=0; + bytes = 0; seq_printf(seq, "trie=%p\n", t); if (stat) { if (stat->leaves) - avdepth=stat->totdepth*100 / stat->leaves; + avdepth = stat->totdepth*100 / stat->leaves; else - avdepth=0; - seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100 ); + avdepth = 0; + seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100); seq_printf(seq, "Max depth: %4d\n", stat->maxdepth); - + seq_printf(seq, "Leaves: %d\n", stat->leaves); bytes += sizeof(struct leaf) * stat->leaves; seq_printf(seq, "Internal nodes: %d\n", stat->tnodes); @@ -2455,11 +2370,9 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v) if (trie_main) collect_and_show(trie_main, seq); - } - else { - snprintf(bf, sizeof(bf), - "*\t%08X\t%08X", 200, 400); - + } else { + snprintf(bf, sizeof(bf), "*\t%08X\t%08X", 200, 400); + seq_printf(seq, "%-127s\n", bf); } return 0; @@ -2520,22 +2433,27 @@ static struct fib_alias *fib_trie_get_next(struct seq_file *seq) static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos) { - void *v = NULL; + if (!ip_fib_main_table) + return NULL; - if (ip_fib_main_table) - v = *pos ? fib_trie_get_next(seq) : SEQ_START_TOKEN; - return v; + if (*pos) + return fib_trie_get_next(seq); + else + return SEQ_START_TOKEN; } static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; - return v == SEQ_START_TOKEN ? fib_trie_get_first(seq) : fib_trie_get_next(seq); + if (v == SEQ_START_TOKEN) + return fib_trie_get_first(seq); + else + return fib_trie_get_next(seq); + } static void fib_trie_seq_stop(struct seq_file *seq, void *v) { - } /* @@ -2555,9 +2473,7 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v) if (trie_main) trie_dump_seq(seq, trie_main); - } - - else { + } else { snprintf(bf, sizeof(bf), "*\t%08X\t%08X", 200, 400); seq_printf(seq, "%-127s\n", bf); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index badfc584997..24eb56ae1b5 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -114,7 +114,7 @@ struct icmp_bxm { /* * Statistics */ -DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics); +DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics) __read_mostly; /* An array of errno for error messages from dest unreach. */ /* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */ @@ -627,11 +627,10 @@ static void icmp_unreach(struct sk_buff *skb) break; case ICMP_FRAG_NEEDED: if (ipv4_config.no_pmtu_disc) { - LIMIT_NETDEBUG( - printk(KERN_INFO "ICMP: %u.%u.%u.%u: " + LIMIT_NETDEBUG(KERN_INFO "ICMP: %u.%u.%u.%u: " "fragmentation needed " "and DF set.\n", - NIPQUAD(iph->daddr))); + NIPQUAD(iph->daddr)); } else { info = ip_rt_frag_needed(iph, ntohs(icmph->un.frag.mtu)); @@ -640,10 +639,9 @@ static void icmp_unreach(struct sk_buff *skb) } break; case ICMP_SR_FAILED: - LIMIT_NETDEBUG( - printk(KERN_INFO "ICMP: %u.%u.%u.%u: Source " + LIMIT_NETDEBUG(KERN_INFO "ICMP: %u.%u.%u.%u: Source " "Route Failed.\n", - NIPQUAD(iph->daddr))); + NIPQUAD(iph->daddr)); break; default: break; @@ -936,7 +934,7 @@ int icmp_rcv(struct sk_buff *skb) case CHECKSUM_HW: if (!(u16)csum_fold(skb->csum)) break; - LIMIT_NETDEBUG(printk(KERN_DEBUG "icmp v4 hw csum failure\n")); + LIMIT_NETDEBUG(KERN_DEBUG "icmp v4 hw csum failure\n"); case CHECKSUM_NONE: if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) goto error; diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 5088f90835a..44607f4767b 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -904,7 +904,7 @@ int igmp_rcv(struct sk_buff *skb) case IGMP_MTRACE_RESP: break; default: - NETDEBUG(printk(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type)); + NETDEBUG(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type); } in_dev_put(in_dev); kfree_skb(skb); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c new file mode 100644 index 00000000000..fe3c6d3d0c9 --- /dev/null +++ b/net/ipv4/inet_connection_sock.c @@ -0,0 +1,641 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Support for INET connection oriented protocols. + * + * Authors: See the TCP sources + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or(at your option) any later version. + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/jhash.h> + +#include <net/inet_connection_sock.h> +#include <net/inet_hashtables.h> +#include <net/inet_timewait_sock.h> +#include <net/ip.h> +#include <net/route.h> +#include <net/tcp_states.h> +#include <net/xfrm.h> + +#ifdef INET_CSK_DEBUG +const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; +EXPORT_SYMBOL(inet_csk_timer_bug_msg); +#endif + +/* + * This array holds the first and last local port number. + * For high-usage systems, use sysctl to change this to + * 32768-61000 + */ +int sysctl_local_port_range[2] = { 1024, 4999 }; + +static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb) +{ + const u32 sk_rcv_saddr = inet_rcv_saddr(sk); + struct sock *sk2; + struct hlist_node *node; + int reuse = sk->sk_reuse; + + sk_for_each_bound(sk2, node, &tb->owners) { + if (sk != sk2 && + !inet_v6_ipv6only(sk2) && + (!sk->sk_bound_dev_if || + !sk2->sk_bound_dev_if || + sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { + if (!reuse || !sk2->sk_reuse || + sk2->sk_state == TCP_LISTEN) { + const u32 sk2_rcv_saddr = inet_rcv_saddr(sk2); + if (!sk2_rcv_saddr || !sk_rcv_saddr || + sk2_rcv_saddr == sk_rcv_saddr) + break; + } + } + } + return node != NULL; +} + +/* Obtain a reference to a local port for the given sock, + * if snum is zero it means select any available local port. + */ +int inet_csk_get_port(struct inet_hashinfo *hashinfo, + struct sock *sk, unsigned short snum) +{ + struct inet_bind_hashbucket *head; + struct hlist_node *node; + struct inet_bind_bucket *tb; + int ret; + + local_bh_disable(); + if (!snum) { + int low = sysctl_local_port_range[0]; + int high = sysctl_local_port_range[1]; + int remaining = (high - low) + 1; + int rover; + + spin_lock(&hashinfo->portalloc_lock); + if (hashinfo->port_rover < low) + rover = low; + else + rover = hashinfo->port_rover; + do { + rover++; + if (rover > high) + rover = low; + head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)]; + spin_lock(&head->lock); + inet_bind_bucket_for_each(tb, node, &head->chain) + if (tb->port == rover) + goto next; + break; + next: + spin_unlock(&head->lock); + } while (--remaining > 0); + hashinfo->port_rover = rover; + spin_unlock(&hashinfo->portalloc_lock); + + /* Exhausted local port range during search? It is not + * possible for us to be holding one of the bind hash + * locks if this test triggers, because if 'remaining' + * drops to zero, we broke out of the do/while loop at + * the top level, not from the 'break;' statement. + */ + ret = 1; + if (remaining <= 0) + goto fail; + + /* OK, here is the one we will use. HEAD is + * non-NULL and we hold it's mutex. + */ + snum = rover; + } else { + head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)]; + spin_lock(&head->lock); + inet_bind_bucket_for_each(tb, node, &head->chain) + if (tb->port == snum) + goto tb_found; + } + tb = NULL; + goto tb_not_found; +tb_found: + if (!hlist_empty(&tb->owners)) { + if (sk->sk_reuse > 1) + goto success; + if (tb->fastreuse > 0 && + sk->sk_reuse && sk->sk_state != TCP_LISTEN) { + goto success; + } else { + ret = 1; + if (inet_csk_bind_conflict(sk, tb)) + goto fail_unlock; + } + } +tb_not_found: + ret = 1; + if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum)) == NULL) + goto fail_unlock; + if (hlist_empty(&tb->owners)) { + if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) + tb->fastreuse = 1; + else + tb->fastreuse = 0; + } else if (tb->fastreuse && + (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) + tb->fastreuse = 0; +success: + if (!inet_csk(sk)->icsk_bind_hash) + inet_bind_hash(sk, tb, snum); + BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb); + ret = 0; + +fail_unlock: + spin_unlock(&head->lock); +fail: + local_bh_enable(); + return ret; +} + +EXPORT_SYMBOL_GPL(inet_csk_get_port); + +/* + * Wait for an incoming connection, avoid race conditions. This must be called + * with the socket locked. + */ +static int inet_csk_wait_for_connect(struct sock *sk, long timeo) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + DEFINE_WAIT(wait); + int err; + + /* + * True wake-one mechanism for incoming connections: only + * one process gets woken up, not the 'whole herd'. + * Since we do not 'race & poll' for established sockets + * anymore, the common case will execute the loop only once. + * + * Subtle issue: "add_wait_queue_exclusive()" will be added + * after any current non-exclusive waiters, and we know that + * it will always _stay_ after any new non-exclusive waiters + * because all non-exclusive waiters are added at the + * beginning of the wait-queue. As such, it's ok to "drop" + * our exclusiveness temporarily when we get woken up without + * having to remove and re-insert us on the wait queue. + */ + for (;;) { + prepare_to_wait_exclusive(sk->sk_sleep, &wait, + TASK_INTERRUPTIBLE); + release_sock(sk); + if (reqsk_queue_empty(&icsk->icsk_accept_queue)) + timeo = schedule_timeout(timeo); + lock_sock(sk); + err = 0; + if (!reqsk_queue_empty(&icsk->icsk_accept_queue)) + break; + err = -EINVAL; + if (sk->sk_state != TCP_LISTEN) + break; + err = sock_intr_errno(timeo); + if (signal_pending(current)) + break; + err = -EAGAIN; + if (!timeo) + break; + } + finish_wait(sk->sk_sleep, &wait); + return err; +} + +/* + * This will accept the next outstanding connection. + */ +struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct sock *newsk; + int error; + + lock_sock(sk); + + /* We need to make sure that this socket is listening, + * and that it has something pending. + */ + error = -EINVAL; + if (sk->sk_state != TCP_LISTEN) + goto out_err; + + /* Find already established connection */ + if (reqsk_queue_empty(&icsk->icsk_accept_queue)) { + long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + + /* If this is a non blocking socket don't sleep */ + error = -EAGAIN; + if (!timeo) + goto out_err; + + error = inet_csk_wait_for_connect(sk, timeo); + if (error) + goto out_err; + } + + newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk); + BUG_TRAP(newsk->sk_state != TCP_SYN_RECV); +out: + release_sock(sk); + return newsk; +out_err: + newsk = NULL; + *err = error; + goto out; +} + +EXPORT_SYMBOL(inet_csk_accept); + +/* + * Using different timers for retransmit, delayed acks and probes + * We may wish use just one timer maintaining a list of expire jiffies + * to optimize. + */ +void inet_csk_init_xmit_timers(struct sock *sk, + void (*retransmit_handler)(unsigned long), + void (*delack_handler)(unsigned long), + void (*keepalive_handler)(unsigned long)) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + init_timer(&icsk->icsk_retransmit_timer); + init_timer(&icsk->icsk_delack_timer); + init_timer(&sk->sk_timer); + + icsk->icsk_retransmit_timer.function = retransmit_handler; + icsk->icsk_delack_timer.function = delack_handler; + sk->sk_timer.function = keepalive_handler; + + icsk->icsk_retransmit_timer.data = + icsk->icsk_delack_timer.data = + sk->sk_timer.data = (unsigned long)sk; + + icsk->icsk_pending = icsk->icsk_ack.pending = 0; +} + +EXPORT_SYMBOL(inet_csk_init_xmit_timers); + +void inet_csk_clear_xmit_timers(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + icsk->icsk_pending = icsk->icsk_ack.pending = icsk->icsk_ack.blocked = 0; + + sk_stop_timer(sk, &icsk->icsk_retransmit_timer); + sk_stop_timer(sk, &icsk->icsk_delack_timer); + sk_stop_timer(sk, &sk->sk_timer); +} + +EXPORT_SYMBOL(inet_csk_clear_xmit_timers); + +void inet_csk_delete_keepalive_timer(struct sock *sk) +{ + sk_stop_timer(sk, &sk->sk_timer); +} + +EXPORT_SYMBOL(inet_csk_delete_keepalive_timer); + +void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len) +{ + sk_reset_timer(sk, &sk->sk_timer, jiffies + len); +} + +EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); + +struct dst_entry* inet_csk_route_req(struct sock *sk, + const struct request_sock *req) +{ + struct rtable *rt; + const struct inet_request_sock *ireq = inet_rsk(req); + struct ip_options *opt = inet_rsk(req)->opt; + struct flowi fl = { .oif = sk->sk_bound_dev_if, + .nl_u = { .ip4_u = + { .daddr = ((opt && opt->srr) ? + opt->faddr : + ireq->rmt_addr), + .saddr = ireq->loc_addr, + .tos = RT_CONN_FLAGS(sk) } }, + .proto = sk->sk_protocol, + .uli_u = { .ports = + { .sport = inet_sk(sk)->sport, + .dport = ireq->rmt_port } } }; + + if (ip_route_output_flow(&rt, &fl, sk, 0)) { + IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); + return NULL; + } + if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) { + ip_rt_put(rt); + IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); + return NULL; + } + return &rt->u.dst; +} + +EXPORT_SYMBOL_GPL(inet_csk_route_req); + +static inline u32 inet_synq_hash(const u32 raddr, const u16 rport, + const u32 rnd, const u16 synq_hsize) +{ + return jhash_2words(raddr, (u32)rport, rnd) & (synq_hsize - 1); +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#define AF_INET_FAMILY(fam) ((fam) == AF_INET) +#else +#define AF_INET_FAMILY(fam) 1 +#endif + +struct request_sock *inet_csk_search_req(const struct sock *sk, + struct request_sock ***prevp, + const __u16 rport, const __u32 raddr, + const __u32 laddr) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; + struct request_sock *req, **prev; + + for (prev = &lopt->syn_table[inet_synq_hash(raddr, rport, lopt->hash_rnd, + lopt->nr_table_entries)]; + (req = *prev) != NULL; + prev = &req->dl_next) { + const struct inet_request_sock *ireq = inet_rsk(req); + + if (ireq->rmt_port == rport && + ireq->rmt_addr == raddr && + ireq->loc_addr == laddr && + AF_INET_FAMILY(req->rsk_ops->family)) { + BUG_TRAP(!req->sk); + *prevp = prev; + break; + } + } + + return req; +} + +EXPORT_SYMBOL_GPL(inet_csk_search_req); + +void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, + const unsigned timeout) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; + const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, + lopt->hash_rnd, lopt->nr_table_entries); + + reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); + inet_csk_reqsk_queue_added(sk, timeout); +} + +/* Only thing we need from tcp.h */ +extern int sysctl_tcp_synack_retries; + +EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); + +void inet_csk_reqsk_queue_prune(struct sock *parent, + const unsigned long interval, + const unsigned long timeout, + const unsigned long max_rto) +{ + struct inet_connection_sock *icsk = inet_csk(parent); + struct request_sock_queue *queue = &icsk->icsk_accept_queue; + struct listen_sock *lopt = queue->listen_opt; + int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; + int thresh = max_retries; + unsigned long now = jiffies; + struct request_sock **reqp, *req; + int i, budget; + + if (lopt == NULL || lopt->qlen == 0) + return; + + /* Normally all the openreqs are young and become mature + * (i.e. converted to established socket) for first timeout. + * If synack was not acknowledged for 3 seconds, it means + * one of the following things: synack was lost, ack was lost, + * rtt is high or nobody planned to ack (i.e. synflood). + * When server is a bit loaded, queue is populated with old + * open requests, reducing effective size of queue. + * When server is well loaded, queue size reduces to zero + * after several minutes of work. It is not synflood, + * it is normal operation. The solution is pruning + * too old entries overriding normal timeout, when + * situation becomes dangerous. + * + * Essentially, we reserve half of room for young + * embrions; and abort old ones without pity, if old + * ones are about to clog our table. + */ + if (lopt->qlen>>(lopt->max_qlen_log-1)) { + int young = (lopt->qlen_young<<1); + + while (thresh > 2) { + if (lopt->qlen < young) + break; + thresh--; + young <<= 1; + } + } + + if (queue->rskq_defer_accept) + max_retries = queue->rskq_defer_accept; + + budget = 2 * (lopt->nr_table_entries / (timeout / interval)); + i = lopt->clock_hand; + + do { + reqp=&lopt->syn_table[i]; + while ((req = *reqp) != NULL) { + if (time_after_eq(now, req->expires)) { + if ((req->retrans < thresh || + (inet_rsk(req)->acked && req->retrans < max_retries)) + && !req->rsk_ops->rtx_syn_ack(parent, req, NULL)) { + unsigned long timeo; + + if (req->retrans++ == 0) + lopt->qlen_young--; + timeo = min((timeout << req->retrans), max_rto); + req->expires = now + timeo; + reqp = &req->dl_next; + continue; + } + + /* Drop this request */ + inet_csk_reqsk_queue_unlink(parent, req, reqp); + reqsk_queue_removed(queue, req); + reqsk_free(req); + continue; + } + reqp = &req->dl_next; + } + + i = (i + 1) & (lopt->nr_table_entries - 1); + + } while (--budget > 0); + + lopt->clock_hand = i; + + if (lopt->qlen) + inet_csk_reset_keepalive_timer(parent, interval); +} + +EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune); + +struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req, + const unsigned int __nocast priority) +{ + struct sock *newsk = sk_clone(sk, priority); + + if (newsk != NULL) { + struct inet_connection_sock *newicsk = inet_csk(newsk); + + newsk->sk_state = TCP_SYN_RECV; + newicsk->icsk_bind_hash = NULL; + + inet_sk(newsk)->dport = inet_rsk(req)->rmt_port; + newsk->sk_write_space = sk_stream_write_space; + + newicsk->icsk_retransmits = 0; + newicsk->icsk_backoff = 0; + newicsk->icsk_probes_out = 0; + + /* Deinitialize accept_queue to trap illegal accesses. */ + memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue)); + } + return newsk; +} + +EXPORT_SYMBOL_GPL(inet_csk_clone); + +/* + * At this point, there should be no process reference to this + * socket, and thus no user references at all. Therefore we + * can assume the socket waitqueue is inactive and nobody will + * try to jump onto it. + */ +void inet_csk_destroy_sock(struct sock *sk) +{ + BUG_TRAP(sk->sk_state == TCP_CLOSE); + BUG_TRAP(sock_flag(sk, SOCK_DEAD)); + + /* It cannot be in hash table! */ + BUG_TRAP(sk_unhashed(sk)); + + /* If it has not 0 inet_sk(sk)->num, it must be bound */ + BUG_TRAP(!inet_sk(sk)->num || inet_csk(sk)->icsk_bind_hash); + + sk->sk_prot->destroy(sk); + + sk_stream_kill_queues(sk); + + xfrm_sk_free_policy(sk); + + sk_refcnt_debug_release(sk); + + atomic_dec(sk->sk_prot->orphan_count); + sock_put(sk); +} + +EXPORT_SYMBOL(inet_csk_destroy_sock); + +int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) +{ + struct inet_sock *inet = inet_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries); + + if (rc != 0) + return rc; + + sk->sk_max_ack_backlog = 0; + sk->sk_ack_backlog = 0; + inet_csk_delack_init(sk); + + /* There is race window here: we announce ourselves listening, + * but this transition is still not validated by get_port(). + * It is OK, because this socket enters to hash table only + * after validation is complete. + */ + sk->sk_state = TCP_LISTEN; + if (!sk->sk_prot->get_port(sk, inet->num)) { + inet->sport = htons(inet->num); + + sk_dst_reset(sk); + sk->sk_prot->hash(sk); + + return 0; + } + + sk->sk_state = TCP_CLOSE; + __reqsk_queue_destroy(&icsk->icsk_accept_queue); + return -EADDRINUSE; +} + +EXPORT_SYMBOL_GPL(inet_csk_listen_start); + +/* + * This routine closes sockets which have been at least partially + * opened, but not yet accepted. + */ +void inet_csk_listen_stop(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct request_sock *acc_req; + struct request_sock *req; + + inet_csk_delete_keepalive_timer(sk); + + /* make all the listen_opt local to us */ + acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue); + + /* Following specs, it would be better either to send FIN + * (and enter FIN-WAIT-1, it is normal close) + * or to send active reset (abort). + * Certainly, it is pretty dangerous while synflood, but it is + * bad justification for our negligence 8) + * To be honest, we are not able to make either + * of the variants now. --ANK + */ + reqsk_queue_destroy(&icsk->icsk_accept_queue); + + while ((req = acc_req) != NULL) { + struct sock *child = req->sk; + + acc_req = req->dl_next; + + local_bh_disable(); + bh_lock_sock(child); + BUG_TRAP(!sock_owned_by_user(child)); + sock_hold(child); + + sk->sk_prot->disconnect(child, O_NONBLOCK); + + sock_orphan(child); + + atomic_inc(sk->sk_prot->orphan_count); + + inet_csk_destroy_sock(child); + + bh_unlock_sock(child); + local_bh_enable(); + sock_put(child); + + sk_acceptq_removed(sk); + __reqsk_free(req); + } + BUG_TRAP(!sk->sk_ack_backlog); +} + +EXPORT_SYMBOL_GPL(inet_csk_listen_stop); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c new file mode 100644 index 00000000000..71f3c7350c6 --- /dev/null +++ b/net/ipv4/inet_diag.c @@ -0,0 +1,868 @@ +/* + * inet_diag.c Module for monitoring INET transport protocols sockets. + * + * Version: $Id: inet_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $ + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/fcntl.h> +#include <linux/random.h> +#include <linux/cache.h> +#include <linux/init.h> +#include <linux/time.h> + +#include <net/icmp.h> +#include <net/tcp.h> +#include <net/ipv6.h> +#include <net/inet_common.h> +#include <net/inet_connection_sock.h> +#include <net/inet_hashtables.h> +#include <net/inet_timewait_sock.h> +#include <net/inet6_hashtables.h> + +#include <linux/inet.h> +#include <linux/stddef.h> + +#include <linux/inet_diag.h> + +static const struct inet_diag_handler **inet_diag_table; + +struct inet_diag_entry { + u32 *saddr; + u32 *daddr; + u16 sport; + u16 dport; + u16 family; + u16 userlocks; +}; + +static struct sock *idiagnl; + +#define INET_DIAG_PUT(skb, attrtype, attrlen) \ + RTA_DATA(__RTA_PUT(skb, attrtype, attrlen)) + +static int inet_diag_fill(struct sk_buff *skb, struct sock *sk, + int ext, u32 pid, u32 seq, u16 nlmsg_flags, + const struct nlmsghdr *unlh) +{ + const struct inet_sock *inet = inet_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_diag_msg *r; + struct nlmsghdr *nlh; + void *info = NULL; + struct inet_diag_meminfo *minfo = NULL; + unsigned char *b = skb->tail; + const struct inet_diag_handler *handler; + + handler = inet_diag_table[unlh->nlmsg_type]; + BUG_ON(handler == NULL); + + nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r)); + nlh->nlmsg_flags = nlmsg_flags; + + r = NLMSG_DATA(nlh); + if (sk->sk_state != TCP_TIME_WAIT) { + if (ext & (1 << (INET_DIAG_MEMINFO - 1))) + minfo = INET_DIAG_PUT(skb, INET_DIAG_MEMINFO, + sizeof(*minfo)); + if (ext & (1 << (INET_DIAG_INFO - 1))) + info = INET_DIAG_PUT(skb, INET_DIAG_INFO, + handler->idiag_info_size); + + if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) { + size_t len = strlen(icsk->icsk_ca_ops->name); + strcpy(INET_DIAG_PUT(skb, INET_DIAG_CONG, len + 1), + icsk->icsk_ca_ops->name); + } + } + r->idiag_family = sk->sk_family; + r->idiag_state = sk->sk_state; + r->idiag_timer = 0; + r->idiag_retrans = 0; + + r->id.idiag_if = sk->sk_bound_dev_if; + r->id.idiag_cookie[0] = (u32)(unsigned long)sk; + r->id.idiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1); + + if (r->idiag_state == TCP_TIME_WAIT) { + const struct inet_timewait_sock *tw = inet_twsk(sk); + long tmo = tw->tw_ttd - jiffies; + if (tmo < 0) + tmo = 0; + + r->id.idiag_sport = tw->tw_sport; + r->id.idiag_dport = tw->tw_dport; + r->id.idiag_src[0] = tw->tw_rcv_saddr; + r->id.idiag_dst[0] = tw->tw_daddr; + r->idiag_state = tw->tw_substate; + r->idiag_timer = 3; + r->idiag_expires = (tmo * 1000 + HZ - 1) / HZ; + r->idiag_rqueue = 0; + r->idiag_wqueue = 0; + r->idiag_uid = 0; + r->idiag_inode = 0; +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + if (r->idiag_family == AF_INET6) { + const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk); + + ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, + &tcp6tw->tw_v6_rcv_saddr); + ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, + &tcp6tw->tw_v6_daddr); + } +#endif + nlh->nlmsg_len = skb->tail - b; + return skb->len; + } + + r->id.idiag_sport = inet->sport; + r->id.idiag_dport = inet->dport; + r->id.idiag_src[0] = inet->rcv_saddr; + r->id.idiag_dst[0] = inet->daddr; + +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + if (r->idiag_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, + &np->rcv_saddr); + ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, + &np->daddr); + } +#endif + +#define EXPIRES_IN_MS(tmo) ((tmo - jiffies) * 1000 + HZ - 1) / HZ + + if (icsk->icsk_pending == ICSK_TIME_RETRANS) { + r->idiag_timer = 1; + r->idiag_retrans = icsk->icsk_retransmits; + r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout); + } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { + r->idiag_timer = 4; + r->idiag_retrans = icsk->icsk_probes_out; + r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout); + } else if (timer_pending(&sk->sk_timer)) { + r->idiag_timer = 2; + r->idiag_retrans = icsk->icsk_probes_out; + r->idiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires); + } else { + r->idiag_timer = 0; + r->idiag_expires = 0; + } +#undef EXPIRES_IN_MS + + r->idiag_uid = sock_i_uid(sk); + r->idiag_inode = sock_i_ino(sk); + + if (minfo) { + minfo->idiag_rmem = atomic_read(&sk->sk_rmem_alloc); + minfo->idiag_wmem = sk->sk_wmem_queued; + minfo->idiag_fmem = sk->sk_forward_alloc; + minfo->idiag_tmem = atomic_read(&sk->sk_wmem_alloc); + } + + handler->idiag_get_info(sk, r, info); + + if (sk->sk_state < TCP_TIME_WAIT && + icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info) + icsk->icsk_ca_ops->get_info(sk, ext, skb); + + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +rtattr_failure: +nlmsg_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int inet_diag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh) +{ + int err; + struct sock *sk; + struct inet_diag_req *req = NLMSG_DATA(nlh); + struct sk_buff *rep; + struct inet_hashinfo *hashinfo; + const struct inet_diag_handler *handler; + + handler = inet_diag_table[nlh->nlmsg_type]; + BUG_ON(handler == NULL); + hashinfo = handler->idiag_hashinfo; + + if (req->idiag_family == AF_INET) { + sk = inet_lookup(hashinfo, req->id.idiag_dst[0], + req->id.idiag_dport, req->id.idiag_src[0], + req->id.idiag_sport, req->id.idiag_if); + } +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + else if (req->idiag_family == AF_INET6) { + sk = inet6_lookup(hashinfo, + (struct in6_addr *)req->id.idiag_dst, + req->id.idiag_dport, + (struct in6_addr *)req->id.idiag_src, + req->id.idiag_sport, + req->id.idiag_if); + } +#endif + else { + return -EINVAL; + } + + if (sk == NULL) + return -ENOENT; + + err = -ESTALE; + if ((req->id.idiag_cookie[0] != INET_DIAG_NOCOOKIE || + req->id.idiag_cookie[1] != INET_DIAG_NOCOOKIE) && + ((u32)(unsigned long)sk != req->id.idiag_cookie[0] || + (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.idiag_cookie[1])) + goto out; + + err = -ENOMEM; + rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) + + sizeof(struct inet_diag_meminfo) + + handler->idiag_info_size + 64)), + GFP_KERNEL); + if (!rep) + goto out; + + if (inet_diag_fill(rep, sk, req->idiag_ext, + NETLINK_CB(in_skb).pid, + nlh->nlmsg_seq, 0, nlh) <= 0) + BUG(); + + err = netlink_unicast(idiagnl, rep, NETLINK_CB(in_skb).pid, + MSG_DONTWAIT); + if (err > 0) + err = 0; + +out: + if (sk) { + if (sk->sk_state == TCP_TIME_WAIT) + inet_twsk_put((struct inet_timewait_sock *)sk); + else + sock_put(sk); + } + return err; +} + +static int bitstring_match(const u32 *a1, const u32 *a2, int bits) +{ + int words = bits >> 5; + + bits &= 0x1f; + + if (words) { + if (memcmp(a1, a2, words << 2)) + return 0; + } + if (bits) { + __u32 w1, w2; + __u32 mask; + + w1 = a1[words]; + w2 = a2[words]; + + mask = htonl((0xffffffff) << (32 - bits)); + + if ((w1 ^ w2) & mask) + return 0; + } + + return 1; +} + + +static int inet_diag_bc_run(const void *bc, int len, + const struct inet_diag_entry *entry) +{ + while (len > 0) { + int yes = 1; + const struct inet_diag_bc_op *op = bc; + + switch (op->code) { + case INET_DIAG_BC_NOP: + break; + case INET_DIAG_BC_JMP: + yes = 0; + break; + case INET_DIAG_BC_S_GE: + yes = entry->sport >= op[1].no; + break; + case INET_DIAG_BC_S_LE: + yes = entry->dport <= op[1].no; + break; + case INET_DIAG_BC_D_GE: + yes = entry->dport >= op[1].no; + break; + case INET_DIAG_BC_D_LE: + yes = entry->dport <= op[1].no; + break; + case INET_DIAG_BC_AUTO: + yes = !(entry->userlocks & SOCK_BINDPORT_LOCK); + break; + case INET_DIAG_BC_S_COND: + case INET_DIAG_BC_D_COND: { + struct inet_diag_hostcond *cond; + u32 *addr; + + cond = (struct inet_diag_hostcond *)(op + 1); + if (cond->port != -1 && + cond->port != (op->code == INET_DIAG_BC_S_COND ? + entry->sport : entry->dport)) { + yes = 0; + break; + } + + if (cond->prefix_len == 0) + break; + + if (op->code == INET_DIAG_BC_S_COND) + addr = entry->saddr; + else + addr = entry->daddr; + + if (bitstring_match(addr, cond->addr, cond->prefix_len)) + break; + if (entry->family == AF_INET6 && + cond->family == AF_INET) { + if (addr[0] == 0 && addr[1] == 0 && + addr[2] == htonl(0xffff) && + bitstring_match(addr + 3, cond->addr, + cond->prefix_len)) + break; + } + yes = 0; + break; + } + } + + if (yes) { + len -= op->yes; + bc += op->yes; + } else { + len -= op->no; + bc += op->no; + } + } + return (len == 0); +} + +static int valid_cc(const void *bc, int len, int cc) +{ + while (len >= 0) { + const struct inet_diag_bc_op *op = bc; + + if (cc > len) + return 0; + if (cc == len) + return 1; + if (op->yes < 4) + return 0; + len -= op->yes; + bc += op->yes; + } + return 0; +} + +static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) +{ + const unsigned char *bc = bytecode; + int len = bytecode_len; + + while (len > 0) { + struct inet_diag_bc_op *op = (struct inet_diag_bc_op *)bc; + +//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len); + switch (op->code) { + case INET_DIAG_BC_AUTO: + case INET_DIAG_BC_S_COND: + case INET_DIAG_BC_D_COND: + case INET_DIAG_BC_S_GE: + case INET_DIAG_BC_S_LE: + case INET_DIAG_BC_D_GE: + case INET_DIAG_BC_D_LE: + if (op->yes < 4 || op->yes > len + 4) + return -EINVAL; + case INET_DIAG_BC_JMP: + if (op->no < 4 || op->no > len + 4) + return -EINVAL; + if (op->no < len && + !valid_cc(bytecode, bytecode_len, len - op->no)) + return -EINVAL; + break; + case INET_DIAG_BC_NOP: + if (op->yes < 4 || op->yes > len + 4) + return -EINVAL; + break; + default: + return -EINVAL; + } + bc += op->yes; + len -= op->yes; + } + return len == 0 ? 0 : -EINVAL; +} + +static int inet_diag_dump_sock(struct sk_buff *skb, struct sock *sk, + struct netlink_callback *cb) +{ + struct inet_diag_req *r = NLMSG_DATA(cb->nlh); + + if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { + struct inet_diag_entry entry; + struct rtattr *bc = (struct rtattr *)(r + 1); + struct inet_sock *inet = inet_sk(sk); + + entry.family = sk->sk_family; +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + if (entry.family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + entry.saddr = np->rcv_saddr.s6_addr32; + entry.daddr = np->daddr.s6_addr32; + } else +#endif + { + entry.saddr = &inet->rcv_saddr; + entry.daddr = &inet->daddr; + } + entry.sport = inet->num; + entry.dport = ntohs(inet->dport); + entry.userlocks = sk->sk_userlocks; + + if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry)) + return 0; + } + + return inet_diag_fill(skb, sk, r->idiag_ext, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); +} + +static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, + struct request_sock *req, + u32 pid, u32 seq, + const struct nlmsghdr *unlh) +{ + const struct inet_request_sock *ireq = inet_rsk(req); + struct inet_sock *inet = inet_sk(sk); + unsigned char *b = skb->tail; + struct inet_diag_msg *r; + struct nlmsghdr *nlh; + long tmo; + + nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r)); + nlh->nlmsg_flags = NLM_F_MULTI; + r = NLMSG_DATA(nlh); + + r->idiag_family = sk->sk_family; + r->idiag_state = TCP_SYN_RECV; + r->idiag_timer = 1; + r->idiag_retrans = req->retrans; + + r->id.idiag_if = sk->sk_bound_dev_if; + r->id.idiag_cookie[0] = (u32)(unsigned long)req; + r->id.idiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1); + + tmo = req->expires - jiffies; + if (tmo < 0) + tmo = 0; + + r->id.idiag_sport = inet->sport; + r->id.idiag_dport = ireq->rmt_port; + r->id.idiag_src[0] = ireq->loc_addr; + r->id.idiag_dst[0] = ireq->rmt_addr; + r->idiag_expires = jiffies_to_msecs(tmo); + r->idiag_rqueue = 0; + r->idiag_wqueue = 0; + r->idiag_uid = sock_i_uid(sk); + r->idiag_inode = 0; +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + if (r->idiag_family == AF_INET6) { + ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, + &tcp6_rsk(req)->loc_addr); + ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, + &tcp6_rsk(req)->rmt_addr); + } +#endif + nlh->nlmsg_len = skb->tail - b; + + return skb->len; + +nlmsg_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, + struct netlink_callback *cb) +{ + struct inet_diag_entry entry; + struct inet_diag_req *r = NLMSG_DATA(cb->nlh); + struct inet_connection_sock *icsk = inet_csk(sk); + struct listen_sock *lopt; + struct rtattr *bc = NULL; + struct inet_sock *inet = inet_sk(sk); + int j, s_j; + int reqnum, s_reqnum; + int err = 0; + + s_j = cb->args[3]; + s_reqnum = cb->args[4]; + + if (s_j > 0) + s_j--; + + entry.family = sk->sk_family; + + read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + + lopt = icsk->icsk_accept_queue.listen_opt; + if (!lopt || !lopt->qlen) + goto out; + + if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { + bc = (struct rtattr *)(r + 1); + entry.sport = inet->num; + entry.userlocks = sk->sk_userlocks; + } + + for (j = s_j; j < lopt->nr_table_entries; j++) { + struct request_sock *req, *head = lopt->syn_table[j]; + + reqnum = 0; + for (req = head; req; reqnum++, req = req->dl_next) { + struct inet_request_sock *ireq = inet_rsk(req); + + if (reqnum < s_reqnum) + continue; + if (r->id.idiag_dport != ireq->rmt_port && + r->id.idiag_dport) + continue; + + if (bc) { + entry.saddr = +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + (entry.family == AF_INET6) ? + tcp6_rsk(req)->loc_addr.s6_addr32 : +#endif + &ireq->loc_addr; + entry.daddr = +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + (entry.family == AF_INET6) ? + tcp6_rsk(req)->rmt_addr.s6_addr32 : +#endif + &ireq->rmt_addr; + entry.dport = ntohs(ireq->rmt_port); + + if (!inet_diag_bc_run(RTA_DATA(bc), + RTA_PAYLOAD(bc), &entry)) + continue; + } + + err = inet_diag_fill_req(skb, sk, req, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, cb->nlh); + if (err < 0) { + cb->args[3] = j + 1; + cb->args[4] = reqnum; + goto out; + } + } + + s_reqnum = 0; + } + +out: + read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + + return err; +} + +static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + int i, num; + int s_i, s_num; + struct inet_diag_req *r = NLMSG_DATA(cb->nlh); + const struct inet_diag_handler *handler; + struct inet_hashinfo *hashinfo; + + handler = inet_diag_table[cb->nlh->nlmsg_type]; + BUG_ON(handler == NULL); + hashinfo = handler->idiag_hashinfo; + + s_i = cb->args[1]; + s_num = num = cb->args[2]; + + if (cb->args[0] == 0) { + if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV))) + goto skip_listen_ht; + + inet_listen_lock(hashinfo); + for (i = s_i; i < INET_LHTABLE_SIZE; i++) { + struct sock *sk; + struct hlist_node *node; + + num = 0; + sk_for_each(sk, node, &hashinfo->listening_hash[i]) { + struct inet_sock *inet = inet_sk(sk); + + if (num < s_num) { + num++; + continue; + } + + if (r->id.idiag_sport != inet->sport && + r->id.idiag_sport) + goto next_listen; + + if (!(r->idiag_states & TCPF_LISTEN) || + r->id.idiag_dport || + cb->args[3] > 0) + goto syn_recv; + + if (inet_diag_dump_sock(skb, sk, cb) < 0) { + inet_listen_unlock(hashinfo); + goto done; + } + +syn_recv: + if (!(r->idiag_states & TCPF_SYN_RECV)) + goto next_listen; + + if (inet_diag_dump_reqs(skb, sk, cb) < 0) { + inet_listen_unlock(hashinfo); + goto done; + } + +next_listen: + cb->args[3] = 0; + cb->args[4] = 0; + ++num; + } + + s_num = 0; + cb->args[3] = 0; + cb->args[4] = 0; + } + inet_listen_unlock(hashinfo); +skip_listen_ht: + cb->args[0] = 1; + s_i = num = s_num = 0; + } + + if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV))) + return skb->len; + + for (i = s_i; i < hashinfo->ehash_size; i++) { + struct inet_ehash_bucket *head = &hashinfo->ehash[i]; + struct sock *sk; + struct hlist_node *node; + + if (i > s_i) + s_num = 0; + + read_lock_bh(&head->lock); + + num = 0; + sk_for_each(sk, node, &head->chain) { + struct inet_sock *inet = inet_sk(sk); + + if (num < s_num) + goto next_normal; + if (!(r->idiag_states & (1 << sk->sk_state))) + goto next_normal; + if (r->id.idiag_sport != inet->sport && + r->id.idiag_sport) + goto next_normal; + if (r->id.idiag_dport != inet->dport && r->id.idiag_dport) + goto next_normal; + if (inet_diag_dump_sock(skb, sk, cb) < 0) { + read_unlock_bh(&head->lock); + goto done; + } +next_normal: + ++num; + } + + if (r->idiag_states & TCPF_TIME_WAIT) { + sk_for_each(sk, node, + &hashinfo->ehash[i + hashinfo->ehash_size].chain) { + struct inet_sock *inet = inet_sk(sk); + + if (num < s_num) + goto next_dying; + if (r->id.idiag_sport != inet->sport && + r->id.idiag_sport) + goto next_dying; + if (r->id.idiag_dport != inet->dport && + r->id.idiag_dport) + goto next_dying; + if (inet_diag_dump_sock(skb, sk, cb) < 0) { + read_unlock_bh(&head->lock); + goto done; + } +next_dying: + ++num; + } + } + read_unlock_bh(&head->lock); + } + +done: + cb->args[1] = i; + cb->args[2] = num; + return skb->len; +} + +static int inet_diag_dump_done(struct netlink_callback *cb) +{ + return 0; +} + + +static __inline__ int +inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + if (!(nlh->nlmsg_flags&NLM_F_REQUEST)) + return 0; + + if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX) + goto err_inval; + + if (inet_diag_table[nlh->nlmsg_type] == NULL) + return -ENOENT; + + if (NLMSG_LENGTH(sizeof(struct inet_diag_req)) > skb->len) + goto err_inval; + + if (nlh->nlmsg_flags&NLM_F_DUMP) { + if (nlh->nlmsg_len > + (4 + NLMSG_SPACE(sizeof(struct inet_diag_req)))) { + struct rtattr *rta = (void *)(NLMSG_DATA(nlh) + + sizeof(struct inet_diag_req)); + if (rta->rta_type != INET_DIAG_REQ_BYTECODE || + rta->rta_len < 8 || + rta->rta_len > + (nlh->nlmsg_len - + NLMSG_SPACE(sizeof(struct inet_diag_req)))) + goto err_inval; + if (inet_diag_bc_audit(RTA_DATA(rta), RTA_PAYLOAD(rta))) + goto err_inval; + } + return netlink_dump_start(idiagnl, skb, nlh, + inet_diag_dump, + inet_diag_dump_done); + } else { + return inet_diag_get_exact(skb, nlh); + } + +err_inval: + return -EINVAL; +} + + +static inline void inet_diag_rcv_skb(struct sk_buff *skb) +{ + int err; + struct nlmsghdr * nlh; + + if (skb->len >= NLMSG_SPACE(0)) { + nlh = (struct nlmsghdr *)skb->data; + if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) + return; + err = inet_diag_rcv_msg(skb, nlh); + if (err || nlh->nlmsg_flags & NLM_F_ACK) + netlink_ack(skb, nlh, err); + } +} + +static void inet_diag_rcv(struct sock *sk, int len) +{ + struct sk_buff *skb; + unsigned int qlen = skb_queue_len(&sk->sk_receive_queue); + + while (qlen-- && (skb = skb_dequeue(&sk->sk_receive_queue))) { + inet_diag_rcv_skb(skb); + kfree_skb(skb); + } +} + +static DEFINE_SPINLOCK(inet_diag_register_lock); + +int inet_diag_register(const struct inet_diag_handler *h) +{ + const __u16 type = h->idiag_type; + int err = -EINVAL; + + if (type >= INET_DIAG_GETSOCK_MAX) + goto out; + + spin_lock(&inet_diag_register_lock); + err = -EEXIST; + if (inet_diag_table[type] == NULL) { + inet_diag_table[type] = h; + err = 0; + } + spin_unlock(&inet_diag_register_lock); +out: + return err; +} +EXPORT_SYMBOL_GPL(inet_diag_register); + +void inet_diag_unregister(const struct inet_diag_handler *h) +{ + const __u16 type = h->idiag_type; + + if (type >= INET_DIAG_GETSOCK_MAX) + return; + + spin_lock(&inet_diag_register_lock); + inet_diag_table[type] = NULL; + spin_unlock(&inet_diag_register_lock); + + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(inet_diag_unregister); + +static int __init inet_diag_init(void) +{ + const int inet_diag_table_size = (INET_DIAG_GETSOCK_MAX * + sizeof(struct inet_diag_handler *)); + int err = -ENOMEM; + + inet_diag_table = kmalloc(inet_diag_table_size, GFP_KERNEL); + if (!inet_diag_table) + goto out; + + memset(inet_diag_table, 0, inet_diag_table_size); + idiagnl = netlink_kernel_create(NETLINK_INET_DIAG, 0, inet_diag_rcv, + THIS_MODULE); + if (idiagnl == NULL) + goto out_free_table; + err = 0; +out: + return err; +out_free_table: + kfree(inet_diag_table); + goto out; +} + +static void __exit inet_diag_exit(void) +{ + sock_release(idiagnl->sk_socket); + kfree(inet_diag_table); +} + +module_init(inet_diag_init); +module_exit(inet_diag_exit); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c new file mode 100644 index 00000000000..e8d29fe736d --- /dev/null +++ b/net/ipv4/inet_hashtables.c @@ -0,0 +1,165 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Generic INET transport hashtables + * + * Authors: Lotsa people, from code originally in tcp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/wait.h> + +#include <net/inet_connection_sock.h> +#include <net/inet_hashtables.h> + +/* + * Allocate and initialize a new local port bind bucket. + * The bindhash mutex for snum's hash chain must be held here. + */ +struct inet_bind_bucket *inet_bind_bucket_create(kmem_cache_t *cachep, + struct inet_bind_hashbucket *head, + const unsigned short snum) +{ + struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, SLAB_ATOMIC); + + if (tb != NULL) { + tb->port = snum; + tb->fastreuse = 0; + INIT_HLIST_HEAD(&tb->owners); + hlist_add_head(&tb->node, &head->chain); + } + return tb; +} + +EXPORT_SYMBOL(inet_bind_bucket_create); + +/* + * Caller must hold hashbucket lock for this tb with local BH disabled + */ +void inet_bind_bucket_destroy(kmem_cache_t *cachep, struct inet_bind_bucket *tb) +{ + if (hlist_empty(&tb->owners)) { + __hlist_del(&tb->node); + kmem_cache_free(cachep, tb); + } +} + +void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, + const unsigned short snum) +{ + inet_sk(sk)->num = snum; + sk_add_bind_node(sk, &tb->owners); + inet_csk(sk)->icsk_bind_hash = tb; +} + +EXPORT_SYMBOL(inet_bind_hash); + +/* + * Get rid of any references to a local port held by the given sock. + */ +static void __inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk) +{ + const int bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size); + struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; + struct inet_bind_bucket *tb; + + spin_lock(&head->lock); + tb = inet_csk(sk)->icsk_bind_hash; + __sk_del_bind_node(sk); + inet_csk(sk)->icsk_bind_hash = NULL; + inet_sk(sk)->num = 0; + inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); + spin_unlock(&head->lock); +} + +void inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk) +{ + local_bh_disable(); + __inet_put_port(hashinfo, sk); + local_bh_enable(); +} + +EXPORT_SYMBOL(inet_put_port); + +/* + * This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP. + * Look, when several writers sleep and reader wakes them up, all but one + * immediately hit write lock and grab all the cpus. Exclusive sleep solves + * this, _but_ remember, it adds useless work on UP machines (wake up each + * exclusive lock release). It should be ifdefed really. + */ +void inet_listen_wlock(struct inet_hashinfo *hashinfo) +{ + write_lock(&hashinfo->lhash_lock); + + if (atomic_read(&hashinfo->lhash_users)) { + DEFINE_WAIT(wait); + + for (;;) { + prepare_to_wait_exclusive(&hashinfo->lhash_wait, + &wait, TASK_UNINTERRUPTIBLE); + if (!atomic_read(&hashinfo->lhash_users)) + break; + write_unlock_bh(&hashinfo->lhash_lock); + schedule(); + write_lock_bh(&hashinfo->lhash_lock); + } + + finish_wait(&hashinfo->lhash_wait, &wait); + } +} + +EXPORT_SYMBOL(inet_listen_wlock); + +/* + * Don't inline this cruft. Here are some nice properties to exploit here. The + * BSD API does not allow a listening sock to specify the remote port nor the + * remote address for the connection. So always assume those are both + * wildcarded during the search since they can never be otherwise. + */ +struct sock *__inet_lookup_listener(const struct hlist_head *head, const u32 daddr, + const unsigned short hnum, const int dif) +{ + struct sock *result = NULL, *sk; + const struct hlist_node *node; + int hiscore = -1; + + sk_for_each(sk, node, head) { + const struct inet_sock *inet = inet_sk(sk); + + if (inet->num == hnum && !ipv6_only_sock(sk)) { + const __u32 rcv_saddr = inet->rcv_saddr; + int score = sk->sk_family == PF_INET ? 1 : 0; + + if (rcv_saddr) { + if (rcv_saddr != daddr) + continue; + score += 2; + } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + continue; + score += 2; + } + if (score == 5) + return sk; + if (score > hiscore) { + hiscore = score; + result = sk; + } + } + } + return result; +} + +EXPORT_SYMBOL_GPL(__inet_lookup_listener); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c new file mode 100644 index 00000000000..4d1502a4985 --- /dev/null +++ b/net/ipv4/inet_timewait_sock.c @@ -0,0 +1,384 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Generic TIME_WAIT sockets functions + * + * From code orinally in TCP + */ + +#include <linux/config.h> + +#include <net/inet_hashtables.h> +#include <net/inet_timewait_sock.h> +#include <net/ip.h> + +/* Must be called with locally disabled BHs. */ +void __inet_twsk_kill(struct inet_timewait_sock *tw, struct inet_hashinfo *hashinfo) +{ + struct inet_bind_hashbucket *bhead; + struct inet_bind_bucket *tb; + /* Unlink from established hashes. */ + struct inet_ehash_bucket *ehead = &hashinfo->ehash[tw->tw_hashent]; + + write_lock(&ehead->lock); + if (hlist_unhashed(&tw->tw_node)) { + write_unlock(&ehead->lock); + return; + } + __hlist_del(&tw->tw_node); + sk_node_init(&tw->tw_node); + write_unlock(&ehead->lock); + + /* Disassociate with bind bucket. */ + bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)]; + spin_lock(&bhead->lock); + tb = tw->tw_tb; + __hlist_del(&tw->tw_bind_node); + tw->tw_tb = NULL; + inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); + spin_unlock(&bhead->lock); +#ifdef SOCK_REFCNT_DEBUG + if (atomic_read(&tw->tw_refcnt) != 1) { + printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n", + tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt)); + } +#endif + inet_twsk_put(tw); +} + +EXPORT_SYMBOL_GPL(__inet_twsk_kill); + +/* + * Enter the time wait state. This is called with locally disabled BH. + * Essentially we whip up a timewait bucket, copy the relevant info into it + * from the SK, and mess with hash chains and list linkage. + */ +void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, + struct inet_hashinfo *hashinfo) +{ + const struct inet_sock *inet = inet_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_ehash_bucket *ehead = &hashinfo->ehash[sk->sk_hashent]; + struct inet_bind_hashbucket *bhead; + /* Step 1: Put TW into bind hash. Original socket stays there too. + Note, that any socket with inet->num != 0 MUST be bound in + binding cache, even if it is closed. + */ + bhead = &hashinfo->bhash[inet_bhashfn(inet->num, hashinfo->bhash_size)]; + spin_lock(&bhead->lock); + tw->tw_tb = icsk->icsk_bind_hash; + BUG_TRAP(icsk->icsk_bind_hash); + inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); + spin_unlock(&bhead->lock); + + write_lock(&ehead->lock); + + /* Step 2: Remove SK from established hash. */ + if (__sk_del_node_init(sk)) + sock_prot_dec_use(sk->sk_prot); + + /* Step 3: Hash TW into TIMEWAIT half of established hash table. */ + inet_twsk_add_node(tw, &(ehead + hashinfo->ehash_size)->chain); + atomic_inc(&tw->tw_refcnt); + + write_unlock(&ehead->lock); +} + +EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); + +struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state) +{ + struct inet_timewait_sock *tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_slab, + SLAB_ATOMIC); + if (tw != NULL) { + const struct inet_sock *inet = inet_sk(sk); + + /* Give us an identity. */ + tw->tw_daddr = inet->daddr; + tw->tw_rcv_saddr = inet->rcv_saddr; + tw->tw_bound_dev_if = sk->sk_bound_dev_if; + tw->tw_num = inet->num; + tw->tw_state = TCP_TIME_WAIT; + tw->tw_substate = state; + tw->tw_sport = inet->sport; + tw->tw_dport = inet->dport; + tw->tw_family = sk->sk_family; + tw->tw_reuse = sk->sk_reuse; + tw->tw_hashent = sk->sk_hashent; + tw->tw_ipv6only = 0; + tw->tw_prot = sk->sk_prot_creator; + atomic_set(&tw->tw_refcnt, 1); + inet_twsk_dead_node_init(tw); + } + + return tw; +} + +EXPORT_SYMBOL_GPL(inet_twsk_alloc); + +/* Returns non-zero if quota exceeded. */ +static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr, + const int slot) +{ + struct inet_timewait_sock *tw; + struct hlist_node *node; + unsigned int killed; + int ret; + + /* NOTE: compare this to previous version where lock + * was released after detaching chain. It was racy, + * because tw buckets are scheduled in not serialized context + * in 2.3 (with netfilter), and with softnet it is common, because + * soft irqs are not sequenced. + */ + killed = 0; + ret = 0; +rescan: + inet_twsk_for_each_inmate(tw, node, &twdr->cells[slot]) { + __inet_twsk_del_dead_node(tw); + spin_unlock(&twdr->death_lock); + __inet_twsk_kill(tw, twdr->hashinfo); + inet_twsk_put(tw); + killed++; + spin_lock(&twdr->death_lock); + if (killed > INET_TWDR_TWKILL_QUOTA) { + ret = 1; + break; + } + + /* While we dropped twdr->death_lock, another cpu may have + * killed off the next TW bucket in the list, therefore + * do a fresh re-read of the hlist head node with the + * lock reacquired. We still use the hlist traversal + * macro in order to get the prefetches. + */ + goto rescan; + } + + twdr->tw_count -= killed; + NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed); + + return ret; +} + +void inet_twdr_hangman(unsigned long data) +{ + struct inet_timewait_death_row *twdr; + int unsigned need_timer; + + twdr = (struct inet_timewait_death_row *)data; + spin_lock(&twdr->death_lock); + + if (twdr->tw_count == 0) + goto out; + + need_timer = 0; + if (inet_twdr_do_twkill_work(twdr, twdr->slot)) { + twdr->thread_slots |= (1 << twdr->slot); + mb(); + schedule_work(&twdr->twkill_work); + need_timer = 1; + } else { + /* We purged the entire slot, anything left? */ + if (twdr->tw_count) + need_timer = 1; + } + twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1)); + if (need_timer) + mod_timer(&twdr->tw_timer, jiffies + twdr->period); +out: + spin_unlock(&twdr->death_lock); +} + +EXPORT_SYMBOL_GPL(inet_twdr_hangman); + +extern void twkill_slots_invalid(void); + +void inet_twdr_twkill_work(void *data) +{ + struct inet_timewait_death_row *twdr = data; + int i; + + if ((INET_TWDR_TWKILL_SLOTS - 1) > (sizeof(twdr->thread_slots) * 8)) + twkill_slots_invalid(); + + while (twdr->thread_slots) { + spin_lock_bh(&twdr->death_lock); + for (i = 0; i < INET_TWDR_TWKILL_SLOTS; i++) { + if (!(twdr->thread_slots & (1 << i))) + continue; + + while (inet_twdr_do_twkill_work(twdr, i) != 0) { + if (need_resched()) { + spin_unlock_bh(&twdr->death_lock); + schedule(); + spin_lock_bh(&twdr->death_lock); + } + } + + twdr->thread_slots &= ~(1 << i); + } + spin_unlock_bh(&twdr->death_lock); + } +} + +EXPORT_SYMBOL_GPL(inet_twdr_twkill_work); + +/* These are always called from BH context. See callers in + * tcp_input.c to verify this. + */ + +/* This is for handling early-kills of TIME_WAIT sockets. */ +void inet_twsk_deschedule(struct inet_timewait_sock *tw, + struct inet_timewait_death_row *twdr) +{ + spin_lock(&twdr->death_lock); + if (inet_twsk_del_dead_node(tw)) { + inet_twsk_put(tw); + if (--twdr->tw_count == 0) + del_timer(&twdr->tw_timer); + } + spin_unlock(&twdr->death_lock); + __inet_twsk_kill(tw, twdr->hashinfo); +} + +EXPORT_SYMBOL(inet_twsk_deschedule); + +void inet_twsk_schedule(struct inet_timewait_sock *tw, + struct inet_timewait_death_row *twdr, + const int timeo, const int timewait_len) +{ + struct hlist_head *list; + int slot; + + /* timeout := RTO * 3.5 + * + * 3.5 = 1+2+0.5 to wait for two retransmits. + * + * RATIONALE: if FIN arrived and we entered TIME-WAIT state, + * our ACK acking that FIN can be lost. If N subsequent retransmitted + * FINs (or previous seqments) are lost (probability of such event + * is p^(N+1), where p is probability to lose single packet and + * time to detect the loss is about RTO*(2^N - 1) with exponential + * backoff). Normal timewait length is calculated so, that we + * waited at least for one retransmitted FIN (maximal RTO is 120sec). + * [ BTW Linux. following BSD, violates this requirement waiting + * only for 60sec, we should wait at least for 240 secs. + * Well, 240 consumes too much of resources 8) + * ] + * This interval is not reduced to catch old duplicate and + * responces to our wandering segments living for two MSLs. + * However, if we use PAWS to detect + * old duplicates, we can reduce the interval to bounds required + * by RTO, rather than MSL. So, if peer understands PAWS, we + * kill tw bucket after 3.5*RTO (it is important that this number + * is greater than TS tick!) and detect old duplicates with help + * of PAWS. + */ + slot = (timeo + (1 << INET_TWDR_RECYCLE_TICK) - 1) >> INET_TWDR_RECYCLE_TICK; + + spin_lock(&twdr->death_lock); + + /* Unlink it, if it was scheduled */ + if (inet_twsk_del_dead_node(tw)) + twdr->tw_count--; + else + atomic_inc(&tw->tw_refcnt); + + if (slot >= INET_TWDR_RECYCLE_SLOTS) { + /* Schedule to slow timer */ + if (timeo >= timewait_len) { + slot = INET_TWDR_TWKILL_SLOTS - 1; + } else { + slot = (timeo + twdr->period - 1) / twdr->period; + if (slot >= INET_TWDR_TWKILL_SLOTS) + slot = INET_TWDR_TWKILL_SLOTS - 1; + } + tw->tw_ttd = jiffies + timeo; + slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1); + list = &twdr->cells[slot]; + } else { + tw->tw_ttd = jiffies + (slot << INET_TWDR_RECYCLE_TICK); + + if (twdr->twcal_hand < 0) { + twdr->twcal_hand = 0; + twdr->twcal_jiffie = jiffies; + twdr->twcal_timer.expires = twdr->twcal_jiffie + + (slot << INET_TWDR_RECYCLE_TICK); + add_timer(&twdr->twcal_timer); + } else { + if (time_after(twdr->twcal_timer.expires, + jiffies + (slot << INET_TWDR_RECYCLE_TICK))) + mod_timer(&twdr->twcal_timer, + jiffies + (slot << INET_TWDR_RECYCLE_TICK)); + slot = (twdr->twcal_hand + slot) & (INET_TWDR_RECYCLE_SLOTS - 1); + } + list = &twdr->twcal_row[slot]; + } + + hlist_add_head(&tw->tw_death_node, list); + + if (twdr->tw_count++ == 0) + mod_timer(&twdr->tw_timer, jiffies + twdr->period); + spin_unlock(&twdr->death_lock); +} + +EXPORT_SYMBOL_GPL(inet_twsk_schedule); + +void inet_twdr_twcal_tick(unsigned long data) +{ + struct inet_timewait_death_row *twdr; + int n, slot; + unsigned long j; + unsigned long now = jiffies; + int killed = 0; + int adv = 0; + + twdr = (struct inet_timewait_death_row *)data; + + spin_lock(&twdr->death_lock); + if (twdr->twcal_hand < 0) + goto out; + + slot = twdr->twcal_hand; + j = twdr->twcal_jiffie; + + for (n = 0; n < INET_TWDR_RECYCLE_SLOTS; n++) { + if (time_before_eq(j, now)) { + struct hlist_node *node, *safe; + struct inet_timewait_sock *tw; + + inet_twsk_for_each_inmate_safe(tw, node, safe, + &twdr->twcal_row[slot]) { + __inet_twsk_del_dead_node(tw); + __inet_twsk_kill(tw, twdr->hashinfo); + inet_twsk_put(tw); + killed++; + } + } else { + if (!adv) { + adv = 1; + twdr->twcal_jiffie = j; + twdr->twcal_hand = slot; + } + + if (!hlist_empty(&twdr->twcal_row[slot])) { + mod_timer(&twdr->twcal_timer, j); + goto out; + } + } + j += 1 << INET_TWDR_RECYCLE_TICK; + slot = (slot + 1) & (INET_TWDR_RECYCLE_SLOTS - 1); + } + twdr->twcal_hand = -1; + +out: + if ((twdr->tw_count -= killed) == 0) + del_timer(&twdr->tw_timer); + NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed); + spin_unlock(&twdr->death_lock); +} + +EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick); diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index ab18a853d7c..f84ba9c9655 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -20,6 +20,7 @@ #include <linux/kernel.h> #include <linux/mm.h> #include <linux/net.h> +#include <net/ip.h> #include <net/inetpeer.h> /* @@ -72,7 +73,7 @@ /* Exported for inet_getid inline function. */ DEFINE_SPINLOCK(inet_peer_idlock); -static kmem_cache_t *peer_cachep; +static kmem_cache_t *peer_cachep __read_mostly; #define node_height(x) x->avl_height static struct inet_peer peer_fake_node = { @@ -459,5 +460,3 @@ static void peer_check_expire(unsigned long dummy) peer_total / inet_peer_threshold * HZ; add_timer(&peer_periodic_timer); } - -EXPORT_SYMBOL(inet_peer_idlock); diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 77094aac6c2..0923add122b 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -76,16 +76,12 @@ int ip_forward(struct sk_buff *skb) * that reaches zero, we must reply an ICMP control message telling * that the packet's lifetime expired. */ - - iph = skb->nh.iph; - - if (iph->ttl <= 1) + if (skb->nh.iph->ttl <= 1) goto too_many_hops; if (!xfrm4_route_forward(skb)) goto drop; - iph = skb->nh.iph; rt = (struct rtable*)skb->dst; if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index eb377ae1530..9e6e683cc34 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -377,7 +377,7 @@ static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user) return ip_frag_intern(hash, qp); out_nomem: - LIMIT_NETDEBUG(printk(KERN_ERR "ip_frag_create: no memory left !\n")); + LIMIT_NETDEBUG(KERN_ERR "ip_frag_create: no memory left !\n"); return NULL; } @@ -533,7 +533,7 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb) if (skb->dev) qp->iif = skb->dev->ifindex; skb->dev = NULL; - qp->stamp = skb->stamp; + skb_get_timestamp(skb, &qp->stamp); qp->meat += skb->len; atomic_add(skb->truesize, &ip_frag_mem); if (offset == 0) @@ -615,7 +615,7 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev) head->next = NULL; head->dev = dev; - head->stamp = qp->stamp; + skb_set_timestamp(head, &qp->stamp); iph = head->nh.iph; iph->frag_off = 0; @@ -625,8 +625,8 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev) return head; out_nomem: - LIMIT_NETDEBUG(printk(KERN_ERR "IP: queue_glue: no memory for gluing " - "queue %p\n", qp)); + LIMIT_NETDEBUG(KERN_ERR "IP: queue_glue: no memory for gluing " + "queue %p\n", qp); goto out_fail; out_oversize: if (net_ratelimit()) diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index c703528e0bc..473d0f2b2e0 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -150,7 +150,7 @@ * SNMP management statistics */ -DEFINE_SNMP_STAT(struct ipstats_mib, ip_statistics); +DEFINE_SNMP_STAT(struct ipstats_mib, ip_statistics) __read_mostly; /* * Process Router Attention IP option @@ -225,8 +225,8 @@ static inline int ip_local_deliver_finish(struct sk_buff *skb) /* If there maybe a raw socket we must check - if not we * don't care less */ - if (raw_sk) - raw_v4_input(skb, skb->nh.iph, hash); + if (raw_sk && !raw_v4_input(skb, skb->nh.iph, hash)) + raw_sk = NULL; if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) { int ret; @@ -279,18 +279,70 @@ int ip_local_deliver(struct sk_buff *skb) ip_local_deliver_finish); } -static inline int ip_rcv_finish(struct sk_buff *skb) +static inline int ip_rcv_options(struct sk_buff *skb) { + struct ip_options *opt; + struct iphdr *iph; struct net_device *dev = skb->dev; + + /* It looks as overkill, because not all + IP options require packet mangling. + But it is the easiest for now, especially taking + into account that combination of IP options + and running sniffer is extremely rare condition. + --ANK (980813) + */ + if (skb_cow(skb, skb_headroom(skb))) { + IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); + goto drop; + } + + iph = skb->nh.iph; + + if (ip_options_compile(NULL, skb)) { + IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + goto drop; + } + + opt = &(IPCB(skb)->opt); + if (unlikely(opt->srr)) { + struct in_device *in_dev = in_dev_get(dev); + if (in_dev) { + if (!IN_DEV_SOURCE_ROUTE(in_dev)) { + if (IN_DEV_LOG_MARTIANS(in_dev) && + net_ratelimit()) + printk(KERN_INFO "source route option " + "%u.%u.%u.%u -> %u.%u.%u.%u\n", + NIPQUAD(iph->saddr), + NIPQUAD(iph->daddr)); + in_dev_put(in_dev); + goto drop; + } + + in_dev_put(in_dev); + } + + if (ip_options_rcv_srr(skb)) + goto drop; + } + + return 0; +drop: + return -1; +} + +static inline int ip_rcv_finish(struct sk_buff *skb) +{ struct iphdr *iph = skb->nh.iph; - int err; /* * Initialise the virtual path cache for the packet. It describes * how the packet travels inside Linux networking. */ - if (skb->dst == NULL) { - if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) { + if (likely(skb->dst == NULL)) { + int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, + skb->dev); + if (unlikely(err)) { if (err == -EHOSTUNREACH) IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS); goto drop; @@ -298,7 +350,7 @@ static inline int ip_rcv_finish(struct sk_buff *skb) } #ifdef CONFIG_NET_CLS_ROUTE - if (skb->dst->tclassid) { + if (unlikely(skb->dst->tclassid)) { struct ip_rt_acct *st = ip_rt_acct + 256*smp_processor_id(); u32 idx = skb->dst->tclassid; st[idx&0xFF].o_packets++; @@ -308,48 +360,11 @@ static inline int ip_rcv_finish(struct sk_buff *skb) } #endif - if (iph->ihl > 5) { - struct ip_options *opt; - - /* It looks as overkill, because not all - IP options require packet mangling. - But it is the easiest for now, especially taking - into account that combination of IP options - and running sniffer is extremely rare condition. - --ANK (980813) - */ - - if (skb_cow(skb, skb_headroom(skb))) { - IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); - goto drop; - } - iph = skb->nh.iph; - - if (ip_options_compile(NULL, skb)) - goto inhdr_error; - - opt = &(IPCB(skb)->opt); - if (opt->srr) { - struct in_device *in_dev = in_dev_get(dev); - if (in_dev) { - if (!IN_DEV_SOURCE_ROUTE(in_dev)) { - if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) - printk(KERN_INFO "source route option %u.%u.%u.%u -> %u.%u.%u.%u\n", - NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); - in_dev_put(in_dev); - goto drop; - } - in_dev_put(in_dev); - } - if (ip_options_rcv_srr(skb)) - goto drop; - } - } + if (iph->ihl > 5 && ip_rcv_options(skb)) + goto drop; return dst_input(skb); -inhdr_error: - IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); drop: kfree_skb(skb); return NET_RX_DROP; @@ -358,9 +373,10 @@ drop: /* * Main IP Receive routine. */ -int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct iphdr *iph; + u32 len; /* When the interface is in promisc. mode, drop all the crap * that it receives, do not try to analyse it. @@ -392,29 +408,27 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) */ if (iph->ihl < 5 || iph->version != 4) - goto inhdr_error; + goto inhdr_error; if (!pskb_may_pull(skb, iph->ihl*4)) goto inhdr_error; iph = skb->nh.iph; - if (ip_fast_csum((u8 *)iph, iph->ihl) != 0) - goto inhdr_error; + if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) + goto inhdr_error; - { - __u32 len = ntohs(iph->tot_len); - if (skb->len < len || len < (iph->ihl<<2)) - goto inhdr_error; + len = ntohs(iph->tot_len); + if (skb->len < len || len < (iph->ihl*4)) + goto inhdr_error; - /* Our transport medium may have padded the buffer out. Now we know it - * is IP we can trim to the true length of the frame. - * Note this now means skb->len holds ntohs(iph->tot_len). - */ - if (pskb_trim_rcsum(skb, len)) { - IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); - goto drop; - } + /* Our transport medium may have padded the buffer out. Now we know it + * is IP we can trim to the true length of the frame. + * Note this now means skb->len holds ntohs(iph->tot_len). + */ + if (pskb_trim_rcsum(skb, len)) { + IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); + goto drop; } return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL, @@ -428,5 +442,4 @@ out: return NET_RX_DROP; } -EXPORT_SYMBOL(ip_rcv); EXPORT_SYMBOL(ip_statistics); diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 6d89f3f3e70..bce4e875193 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -489,23 +489,18 @@ void ip_options_undo(struct ip_options * opt) } } -int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen, int user) +static struct ip_options *ip_options_get_alloc(const int optlen) { - struct ip_options *opt; + struct ip_options *opt = kmalloc(sizeof(*opt) + ((optlen + 3) & ~3), + GFP_KERNEL); + if (opt) + memset(opt, 0, sizeof(*opt)); + return opt; +} - opt = kmalloc(sizeof(struct ip_options)+((optlen+3)&~3), GFP_KERNEL); - if (!opt) - return -ENOMEM; - memset(opt, 0, sizeof(struct ip_options)); - if (optlen) { - if (user) { - if (copy_from_user(opt->__data, data, optlen)) { - kfree(opt); - return -EFAULT; - } - } else - memcpy(opt->__data, data, optlen); - } +static int ip_options_get_finish(struct ip_options **optp, + struct ip_options *opt, int optlen) +{ while (optlen & 3) opt->__data[optlen++] = IPOPT_END; opt->optlen = optlen; @@ -521,6 +516,30 @@ int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen, in return 0; } +int ip_options_get_from_user(struct ip_options **optp, unsigned char __user *data, int optlen) +{ + struct ip_options *opt = ip_options_get_alloc(optlen); + + if (!opt) + return -ENOMEM; + if (optlen && copy_from_user(opt->__data, data, optlen)) { + kfree(opt); + return -EFAULT; + } + return ip_options_get_finish(optp, opt, optlen); +} + +int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen) +{ + struct ip_options *opt = ip_options_get_alloc(optlen); + + if (!opt) + return -ENOMEM; + if (optlen) + memcpy(opt->__data, data, optlen); + return ip_options_get_finish(optp, opt, optlen); +} + void ip_forward_options(struct sk_buff *skb) { struct ip_options * opt = &(IPCB(skb)->opt); @@ -620,6 +639,3 @@ int ip_options_rcv_srr(struct sk_buff *skb) } return 0; } - -EXPORT_SYMBOL(ip_options_compile); -EXPORT_SYMBOL(ip_options_undo); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 80d13103b2b..3f1a263e124 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -69,13 +69,10 @@ #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> -#include <net/tcp.h> -#include <net/udp.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/arp.h> #include <net/icmp.h> -#include <net/raw.h> #include <net/checksum.h> #include <net/inetpeer.h> #include <net/checksum.h> @@ -84,12 +81,8 @@ #include <linux/netfilter_bridge.h> #include <linux/mroute.h> #include <linux/netlink.h> +#include <linux/tcp.h> -/* - * Shall we try to damage output packets if routing dev changes? - */ - -int sysctl_ip_dynaddr; int sysctl_ip_default_ttl = IPDEFTTL; /* Generate a checksum for an outgoing IP datagram. */ @@ -165,6 +158,8 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, dst_output); } +EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); + static inline int ip_finish_output2(struct sk_buff *skb) { struct dst_entry *dst = skb->dst; @@ -205,7 +200,7 @@ static inline int ip_finish_output2(struct sk_buff *skb) return -EINVAL; } -int ip_finish_output(struct sk_buff *skb) +static inline int ip_finish_output(struct sk_buff *skb) { struct net_device *dev = skb->dst->dev; @@ -329,8 +324,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok) if (ip_route_output_flow(&rt, &fl, sk, 0)) goto no_route; } - __sk_dst_set(sk, &rt->u.dst); - tcp_v4_setup_caps(sk, &rt->u.dst); + sk_setup_caps(sk, &rt->u.dst); } skb->dst = dst_clone(&rt->u.dst); @@ -392,7 +386,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) #endif #ifdef CONFIG_NETFILTER to->nfmark = from->nfmark; - to->nfcache = from->nfcache; /* Connection association is same as pre-frag packet */ nf_conntrack_put(to->nfct); to->nfct = from->nfct; @@ -580,7 +573,7 @@ slow_path: */ if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) { - NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n")); + NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n"); err = -ENOMEM; goto fail; } @@ -1329,12 +1322,7 @@ void __init ip_init(void) #endif } -EXPORT_SYMBOL(ip_finish_output); EXPORT_SYMBOL(ip_fragment); EXPORT_SYMBOL(ip_generic_getfrag); EXPORT_SYMBOL(ip_queue_xmit); EXPORT_SYMBOL(ip_send_check); - -#ifdef CONFIG_SYSCTL -EXPORT_SYMBOL(sysctl_ip_default_ttl); -#endif diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index ff4bd067b39..2f0b47da5b3 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -153,7 +153,7 @@ int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc) switch (cmsg->cmsg_type) { case IP_RETOPTS: err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)); - err = ip_options_get(&ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40, 0); + err = ip_options_get(&ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40); if (err) return err; break; @@ -425,7 +425,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, struct ip_options * opt = NULL; if (optlen > 40 || optlen < 0) goto e_inval; - err = ip_options_get(&opt, optval, optlen, 1); + err = ip_options_get_from_user(&opt, optval, optlen); if (err) break; if (sk->sk_type == SOCK_STREAM) { @@ -614,7 +614,6 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, } case IP_MSFILTER: { - extern int sysctl_optmem_max; extern int sysctl_igmp_max_msf; struct ip_msfilter *msf; @@ -769,7 +768,6 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, } case MCAST_MSFILTER: { - extern int sysctl_optmem_max; extern int sysctl_igmp_max_msf; struct sockaddr_in *psin; struct ip_msfilter *msf = NULL; @@ -1090,7 +1088,5 @@ int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval, EXPORT_SYMBOL(ip_cmsg_recv); -#ifdef CONFIG_IP_SCTP_MODULE EXPORT_SYMBOL(ip_getsockopt); EXPORT_SYMBOL(ip_setsockopt); -#endif diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 7ded6e60f43..dcb7ee6c485 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -214,8 +214,8 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info) spi, IPPROTO_COMP, AF_INET); if (!x) return; - NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%u.%u.%u.%u\n", - spi, NIPQUAD(iph->daddr))); + NETDEBUG(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%u.%u.%u.%u\n", + spi, NIPQUAD(iph->daddr)); xfrm_state_put(x); } diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index d2bf8e1930a..63e106605f2 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -393,7 +393,7 @@ static int __init ic_defaults(void) #ifdef IPCONFIG_RARP -static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt); +static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); static struct packet_type rarp_packet_type __initdata = { .type = __constant_htons(ETH_P_RARP), @@ -414,7 +414,7 @@ static inline void ic_rarp_cleanup(void) * Process received RARP packet. */ static int __init -ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct arphdr *rarp; unsigned char *rarp_ptr; @@ -555,7 +555,7 @@ struct bootp_pkt { /* BOOTP packet format */ #define DHCPRELEASE 7 #define DHCPINFORM 8 -static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt); +static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); static struct packet_type bootp_packet_type __initdata = { .type = __constant_htons(ETH_P_IP), @@ -823,7 +823,7 @@ static void __init ic_do_bootp_ext(u8 *ext) /* * Receive BOOTP reply. */ -static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct bootp_pkt *b; struct iphdr *h; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index dc806b57842..9dbf5909f3a 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -103,7 +103,7 @@ static DEFINE_SPINLOCK(mfc_unres_lock); In this case data path is free of exclusive locks at all. */ -static kmem_cache_t *mrt_cachep; +static kmem_cache_t *mrt_cachep __read_mostly; static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local); static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert); diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c index d9212addd19..6e092dadb38 100644 --- a/net/ipv4/ipvs/ip_vs_app.c +++ b/net/ipv4/ipvs/ip_vs_app.c @@ -26,6 +26,7 @@ #include <linux/in.h> #include <linux/ip.h> #include <net/protocol.h> +#include <net/tcp.h> #include <asm/system.h> #include <linux/stat.h> #include <linux/proc_fs.h> diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index d0145a8b155..e11952ea17a 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -40,7 +40,7 @@ static struct list_head *ip_vs_conn_tab; /* SLAB cache for IPVS connections */ -static kmem_cache_t *ip_vs_conn_cachep; +static kmem_cache_t *ip_vs_conn_cachep __read_mostly; /* counter for current IPVS connections */ static atomic_t ip_vs_conn_count = ATOMIC_INIT(0); diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 5fb257dd07c..3ac7eeca04a 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -22,6 +22,7 @@ * * Changes: * Paul `Rusty' Russell properly handle non-linear skbs + * Harald Welte don't use nfcache * */ @@ -529,7 +530,7 @@ static unsigned int ip_vs_post_routing(unsigned int hooknum, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - if (!((*pskb)->nfcache & NFC_IPVS_PROPERTY)) + if (!((*pskb)->ipvs_property)) return NF_ACCEPT; /* The packet was sent from IPVS, exit this chain */ @@ -701,7 +702,7 @@ static int ip_vs_out_icmp(struct sk_buff **pskb, int *related) /* do the statistics and put it back */ ip_vs_out_stats(cp, skb); - skb->nfcache |= NFC_IPVS_PROPERTY; + skb->ipvs_property = 1; verdict = NF_ACCEPT; out: @@ -739,7 +740,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **pskb, EnterFunction(11); - if (skb->nfcache & NFC_IPVS_PROPERTY) + if (skb->ipvs_property) return NF_ACCEPT; iph = skb->nh.iph; @@ -821,7 +822,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **pskb, ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); ip_vs_conn_put(cp); - skb->nfcache |= NFC_IPVS_PROPERTY; + skb->ipvs_property = 1; LeaveFunction(11); return NF_ACCEPT; diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 7d99ede2ef7..2d66848e7aa 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -1598,7 +1598,7 @@ static ctl_table vs_table[] = { { .ctl_name = 0 } }; -static ctl_table ipv4_table[] = { +static ctl_table ipvs_ipv4_table[] = { { .ctl_name = NET_IPV4, .procname = "ipv4", @@ -1613,7 +1613,7 @@ static ctl_table vs_root_table[] = { .ctl_name = CTL_NET, .procname = "net", .mode = 0555, - .child = ipv4_table, + .child = ipvs_ipv4_table, }, { .ctl_name = 0 } }; diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c index c035838b780..561cda326fa 100644 --- a/net/ipv4/ipvs/ip_vs_lblc.c +++ b/net/ipv4/ipvs/ip_vs_lblc.c @@ -131,7 +131,7 @@ static ctl_table vs_table[] = { { .ctl_name = 0 } }; -static ctl_table ipv4_table[] = { +static ctl_table ipvs_ipv4_table[] = { { .ctl_name = NET_IPV4, .procname = "ipv4", @@ -146,7 +146,7 @@ static ctl_table lblc_root_table[] = { .ctl_name = CTL_NET, .procname = "net", .mode = 0555, - .child = ipv4_table + .child = ipvs_ipv4_table }, { .ctl_name = 0 } }; diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c index 22b5dd55d27..ce456dbf09a 100644 --- a/net/ipv4/ipvs/ip_vs_lblcr.c +++ b/net/ipv4/ipvs/ip_vs_lblcr.c @@ -320,7 +320,7 @@ static ctl_table vs_table[] = { { .ctl_name = 0 } }; -static ctl_table ipv4_table[] = { +static ctl_table ipvs_ipv4_table[] = { { .ctl_name = NET_IPV4, .procname = "ipv4", @@ -335,7 +335,7 @@ static ctl_table lblcr_root_table[] = { .ctl_name = CTL_NET, .procname = "net", .mode = 0555, - .child = ipv4_table + .child = ipvs_ipv4_table }, { .ctl_name = 0 } }; diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c index e65de675da7..c19408973c0 100644 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c @@ -604,14 +604,14 @@ void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp) } -static void tcp_init(struct ip_vs_protocol *pp) +static void ip_vs_tcp_init(struct ip_vs_protocol *pp) { IP_VS_INIT_HASH_TABLE(tcp_apps); pp->timeout_table = tcp_timeouts; } -static void tcp_exit(struct ip_vs_protocol *pp) +static void ip_vs_tcp_exit(struct ip_vs_protocol *pp) { } @@ -621,8 +621,8 @@ struct ip_vs_protocol ip_vs_protocol_tcp = { .protocol = IPPROTO_TCP, .dont_defrag = 0, .appcnt = ATOMIC_INIT(0), - .init = tcp_init, - .exit = tcp_exit, + .init = ip_vs_tcp_init, + .exit = ip_vs_tcp_exit, .register_app = tcp_register_app, .unregister_app = tcp_unregister_app, .conn_schedule = tcp_conn_schedule, diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c index a8512a3fd08..3b87482049c 100644 --- a/net/ipv4/ipvs/ip_vs_xmit.c +++ b/net/ipv4/ipvs/ip_vs_xmit.c @@ -127,7 +127,7 @@ ip_vs_dst_reset(struct ip_vs_dest *dest) #define IP_VS_XMIT(skb, rt) \ do { \ - (skb)->nfcache |= NFC_IPVS_PROPERTY; \ + (skb)->ipvs_property = 1; \ (skb)->ip_summed = CHECKSUM_NONE; \ NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \ (rt)->u.dst.dev, dst_output); \ diff --git a/net/ipv4/multipath_drr.c b/net/ipv4/multipath_drr.c index c9cf8726051..db67373f9b3 100644 --- a/net/ipv4/multipath_drr.c +++ b/net/ipv4/multipath_drr.c @@ -107,7 +107,7 @@ static int drr_dev_event(struct notifier_block *this, return NOTIFY_DONE; } -struct notifier_block drr_dev_notifier = { +static struct notifier_block drr_dev_notifier = { .notifier_call = drr_dev_event, }; diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c new file mode 100644 index 00000000000..ae0779d82c5 --- /dev/null +++ b/net/ipv4/netfilter.c @@ -0,0 +1,139 @@ +/* IPv4 specific functions of netfilter core */ + +#include <linux/config.h> +#ifdef CONFIG_NETFILTER + +#include <linux/kernel.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> + +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/icmp.h> +#include <net/route.h> +#include <linux/ip.h> + +/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ +int ip_route_me_harder(struct sk_buff **pskb) +{ + struct iphdr *iph = (*pskb)->nh.iph; + struct rtable *rt; + struct flowi fl = {}; + struct dst_entry *odst; + unsigned int hh_len; + + /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause + * packets with foreign saddr to appear on the NF_IP_LOCAL_OUT hook. + */ + if (inet_addr_type(iph->saddr) == RTN_LOCAL) { + fl.nl_u.ip4_u.daddr = iph->daddr; + fl.nl_u.ip4_u.saddr = iph->saddr; + fl.nl_u.ip4_u.tos = RT_TOS(iph->tos); + fl.oif = (*pskb)->sk ? (*pskb)->sk->sk_bound_dev_if : 0; +#ifdef CONFIG_IP_ROUTE_FWMARK + fl.nl_u.ip4_u.fwmark = (*pskb)->nfmark; +#endif + fl.proto = iph->protocol; + if (ip_route_output_key(&rt, &fl) != 0) + return -1; + + /* Drop old route. */ + dst_release((*pskb)->dst); + (*pskb)->dst = &rt->u.dst; + } else { + /* non-local src, find valid iif to satisfy + * rp-filter when calling ip_route_input. */ + fl.nl_u.ip4_u.daddr = iph->saddr; + if (ip_route_output_key(&rt, &fl) != 0) + return -1; + + odst = (*pskb)->dst; + if (ip_route_input(*pskb, iph->daddr, iph->saddr, + RT_TOS(iph->tos), rt->u.dst.dev) != 0) { + dst_release(&rt->u.dst); + return -1; + } + dst_release(&rt->u.dst); + dst_release(odst); + } + + if ((*pskb)->dst->error) + return -1; + + /* Change in oif may mean change in hh_len. */ + hh_len = (*pskb)->dst->dev->hard_header_len; + if (skb_headroom(*pskb) < hh_len) { + struct sk_buff *nskb; + + nskb = skb_realloc_headroom(*pskb, hh_len); + if (!nskb) + return -1; + if ((*pskb)->sk) + skb_set_owner_w(nskb, (*pskb)->sk); + kfree_skb(*pskb); + *pskb = nskb; + } + + return 0; +} +EXPORT_SYMBOL(ip_route_me_harder); + +/* + * Extra routing may needed on local out, as the QUEUE target never + * returns control to the table. + */ + +struct ip_rt_info { + u_int32_t daddr; + u_int32_t saddr; + u_int8_t tos; +}; + +static void queue_save(const struct sk_buff *skb, struct nf_info *info) +{ + struct ip_rt_info *rt_info = nf_info_reroute(info); + + if (info->hook == NF_IP_LOCAL_OUT) { + const struct iphdr *iph = skb->nh.iph; + + rt_info->tos = iph->tos; + rt_info->daddr = iph->daddr; + rt_info->saddr = iph->saddr; + } +} + +static int queue_reroute(struct sk_buff **pskb, const struct nf_info *info) +{ + const struct ip_rt_info *rt_info = nf_info_reroute(info); + + if (info->hook == NF_IP_LOCAL_OUT) { + struct iphdr *iph = (*pskb)->nh.iph; + + if (!(iph->tos == rt_info->tos + && iph->daddr == rt_info->daddr + && iph->saddr == rt_info->saddr)) + return ip_route_me_harder(pskb); + } + return 0; +} + +static struct nf_queue_rerouter ip_reroute = { + .rer_size = sizeof(struct ip_rt_info), + .save = queue_save, + .reroute = queue_reroute, +}; + +static int init(void) +{ + return nf_register_queue_rerouter(PF_INET, &ip_reroute); +} + +static void fini(void) +{ + nf_unregister_queue_rerouter(PF_INET); +} + +module_init(init); +module_exit(fini); + +#endif /* CONFIG_NETFILTER */ diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 46d4cb1c06f..e046f552181 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -40,6 +40,16 @@ config IP_NF_CONNTRACK_MARK of packets, but this mark value is kept in the conntrack session instead of the individual packets. +config IP_NF_CONNTRACK_EVENTS + bool "Connection tracking events" + depends on IP_NF_CONNTRACK + help + If this option is enabled, the connection tracking code will + provide a notifier chain that can be used by other kernel code + to get notified about changes in the connection tracking state. + + IF unsure, say `N'. + config IP_NF_CT_PROTO_SCTP tristate 'SCTP protocol connection tracking support (EXPERIMENTAL)' depends on IP_NF_CONNTRACK && EXPERIMENTAL @@ -100,11 +110,15 @@ config IP_NF_AMANDA To compile it as a module, choose M here. If unsure, say Y. config IP_NF_QUEUE - tristate "Userspace queueing via NETLINK" + tristate "IP Userspace queueing via NETLINK (OBSOLETE)" help Netfilter has the ability to queue packets to user space: the netlink device can be used to access them using this driver. + This option enables the old IPv4-only "ip_queue" implementation + which has been obsoleted by the new "nfnetlink_queue" code (see + CONFIG_NETFILTER_NETLINK_QUEUE). + To compile it as a module, choose M here. If unsure, say N. config IP_NF_IPTABLES @@ -340,6 +354,17 @@ config IP_NF_MATCH_SCTP If you want to compile it as a module, say M here and read <file:Documentation/modules.txt>. If unsure, say `N'. +config IP_NF_MATCH_DCCP + tristate 'DCCP protocol match support' + depends on IP_NF_IPTABLES + help + With this option enabled, you will be able to use the iptables + `dccp' match in order to match on DCCP source/destination ports + and DCCP flags. + + If you want to compile it as a module, say M here and read + <file:Documentation/modules.txt>. If unsure, say `N'. + config IP_NF_MATCH_COMMENT tristate 'comment match support' depends on IP_NF_IPTABLES @@ -361,6 +386,16 @@ config IP_NF_MATCH_CONNMARK <file:Documentation/modules.txt>. The module will be called ipt_connmark.o. If unsure, say `N'. +config IP_NF_MATCH_CONNBYTES + tristate 'Connection byte/packet counter match support' + depends on IP_NF_CT_ACCT && IP_NF_IPTABLES + help + This option adds a `connbytes' match, which allows you to match the + number of bytes and/or packets for each direction within a connection. + + If you want to compile it as a module, say M here and read + <file:Documentation/modules.txt>. If unsure, say `N'. + config IP_NF_MATCH_HASHLIMIT tristate 'hashlimit match support' depends on IP_NF_IPTABLES @@ -375,6 +410,19 @@ config IP_NF_MATCH_HASHLIMIT destination IP' or `500pps from any given source IP' with a single IPtables rule. +config IP_NF_MATCH_STRING + tristate 'string match support' + depends on IP_NF_IPTABLES + select TEXTSEARCH + select TEXTSEARCH_KMP + select TEXTSEARCH_BM + select TEXTSEARCH_FSM + help + This option adds a `string' match, which allows you to look for + pattern matchings in packets. + + To compile it as a module, choose M here. If unsure, say N. + # `filter', generic and specific targets config IP_NF_FILTER tristate "Packet filtering" @@ -616,6 +664,20 @@ config IP_NF_TARGET_CLASSIFY To compile it as a module, choose M here. If unsure, say N. +config IP_NF_TARGET_TTL + tristate 'TTL target support' + depends on IP_NF_MANGLE + help + This option adds a `TTL' target, which enables the user to modify + the TTL value of the IP header. + + While it is safe to decrement/lower the TTL, this target also enables + functionality to increment and set the TTL value of the IP header to + arbitrary values. This is EXTREMELY DANGEROUS since you can easily + create immortal packets that loop forever on the network. + + To compile it as a module, choose M here. If unsure, say N. + config IP_NF_TARGET_CONNMARK tristate 'CONNMARK target support' depends on IP_NF_CONNTRACK_MARK && IP_NF_MANGLE @@ -692,5 +754,11 @@ config IP_NF_ARP_MANGLE Allows altering the ARP packet payload: source and destination hardware and network addresses. +config IP_NF_CONNTRACK_NETLINK + tristate 'Connection tracking netlink interface' + depends on IP_NF_CONNTRACK && NETFILTER_NETLINK + help + This option enables support for a netlink-based userspace interface + endmenu diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 45796d5924d..a7bd38f5052 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -9,6 +9,10 @@ iptable_nat-objs := ip_nat_standalone.o ip_nat_rule.o ip_nat_core.o ip_nat_helpe # connection tracking obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o +# conntrack netlink interface +obj-$(CONFIG_IP_NF_CONNTRACK_NETLINK) += ip_conntrack_netlink.o + + # SCTP protocol connection tracking obj-$(CONFIG_IP_NF_CT_PROTO_SCTP) += ip_conntrack_proto_sctp.o @@ -38,6 +42,7 @@ obj-$(CONFIG_IP_NF_MATCH_HELPER) += ipt_helper.o obj-$(CONFIG_IP_NF_MATCH_LIMIT) += ipt_limit.o obj-$(CONFIG_IP_NF_MATCH_HASHLIMIT) += ipt_hashlimit.o obj-$(CONFIG_IP_NF_MATCH_SCTP) += ipt_sctp.o +obj-$(CONFIG_IP_NF_MATCH_DCCP) += ipt_dccp.o obj-$(CONFIG_IP_NF_MATCH_MARK) += ipt_mark.o obj-$(CONFIG_IP_NF_MATCH_MAC) += ipt_mac.o obj-$(CONFIG_IP_NF_MATCH_IPRANGE) += ipt_iprange.o @@ -54,11 +59,13 @@ obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o obj-$(CONFIG_IP_NF_MATCH_STATE) += ipt_state.o obj-$(CONFIG_IP_NF_MATCH_CONNMARK) += ipt_connmark.o obj-$(CONFIG_IP_NF_MATCH_CONNTRACK) += ipt_conntrack.o +obj-$(CONFIG_IP_NF_MATCH_CONNBYTES) += ipt_connbytes.o obj-$(CONFIG_IP_NF_MATCH_TCPMSS) += ipt_tcpmss.o obj-$(CONFIG_IP_NF_MATCH_REALM) += ipt_realm.o obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o obj-$(CONFIG_IP_NF_MATCH_PHYSDEV) += ipt_physdev.o obj-$(CONFIG_IP_NF_MATCH_COMMENT) += ipt_comment.o +obj-$(CONFIG_IP_NF_MATCH_STRING) += ipt_string.o # targets obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o @@ -78,6 +85,7 @@ obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o obj-$(CONFIG_IP_NF_TARGET_TCPMSS) += ipt_TCPMSS.o obj-$(CONFIG_IP_NF_TARGET_NOTRACK) += ipt_NOTRACK.o obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o +obj-$(CONFIG_IP_NF_TARGET_TTL) += ipt_TTL.o # generic ARP tables obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o @@ -87,3 +95,4 @@ obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o +obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += ipt_NFQUEUE.o diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c index 01e1b58322a..be4c9eb3243 100644 --- a/net/ipv4/netfilter/ip_conntrack_amanda.c +++ b/net/ipv4/netfilter/ip_conntrack_amanda.c @@ -40,7 +40,7 @@ MODULE_PARM_DESC(master_timeout, "timeout for the master connection"); static char *conns[] = { "DATA ", "MESG ", "INDEX " }; /* This is slow, but it's simple. --RR */ -static char amanda_buffer[65536]; +static char *amanda_buffer; static DEFINE_SPINLOCK(amanda_buffer_lock); unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb, @@ -153,11 +153,25 @@ static struct ip_conntrack_helper amanda_helper = { static void __exit fini(void) { ip_conntrack_helper_unregister(&amanda_helper); + kfree(amanda_buffer); } static int __init init(void) { - return ip_conntrack_helper_register(&amanda_helper); + int ret; + + amanda_buffer = kmalloc(65536, GFP_KERNEL); + if (!amanda_buffer) + return -ENOMEM; + + ret = ip_conntrack_helper_register(&amanda_helper); + if (ret < 0) { + kfree(amanda_buffer); + return ret; + } + return 0; + + } module_init(init); diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index a7f0c821a9b..a0648600190 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -37,6 +37,7 @@ #include <linux/err.h> #include <linux/percpu.h> #include <linux/moduleparam.h> +#include <linux/notifier.h> /* ip_conntrack_lock protects the main hash table, protocol/helper/expected registrations, conntrack timers*/ @@ -49,7 +50,7 @@ #include <linux/netfilter_ipv4/ip_conntrack_core.h> #include <linux/netfilter_ipv4/listhelp.h> -#define IP_CONNTRACK_VERSION "2.1" +#define IP_CONNTRACK_VERSION "2.3" #if 0 #define DEBUGP printk @@ -69,22 +70,81 @@ static LIST_HEAD(helpers); unsigned int ip_conntrack_htable_size = 0; int ip_conntrack_max; struct list_head *ip_conntrack_hash; -static kmem_cache_t *ip_conntrack_cachep; -static kmem_cache_t *ip_conntrack_expect_cachep; +static kmem_cache_t *ip_conntrack_cachep __read_mostly; +static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly; struct ip_conntrack ip_conntrack_untracked; unsigned int ip_ct_log_invalid; static LIST_HEAD(unconfirmed); static int ip_conntrack_vmalloc; -DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat); +static unsigned int ip_conntrack_next_id = 1; +static unsigned int ip_conntrack_expect_next_id = 1; +#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS +struct notifier_block *ip_conntrack_chain; +struct notifier_block *ip_conntrack_expect_chain; + +DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache); -void -ip_conntrack_put(struct ip_conntrack *ct) +/* deliver cached events and clear cache entry - must be called with locally + * disabled softirqs */ +static inline void +__ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache) { - IP_NF_ASSERT(ct); - nf_conntrack_put(&ct->ct_general); + DEBUGP("ecache: delivering events for %p\n", ecache->ct); + if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events) + notifier_call_chain(&ip_conntrack_chain, ecache->events, + ecache->ct); + ecache->events = 0; + ip_conntrack_put(ecache->ct); + ecache->ct = NULL; } +/* Deliver all cached events for a particular conntrack. This is called + * by code prior to async packet handling or freeing the skb */ +void ip_ct_deliver_cached_events(const struct ip_conntrack *ct) +{ + struct ip_conntrack_ecache *ecache; + + local_bh_disable(); + ecache = &__get_cpu_var(ip_conntrack_ecache); + if (ecache->ct == ct) + __ip_ct_deliver_cached_events(ecache); + local_bh_enable(); +} + +void __ip_ct_event_cache_init(struct ip_conntrack *ct) +{ + struct ip_conntrack_ecache *ecache; + + /* take care of delivering potentially old events */ + ecache = &__get_cpu_var(ip_conntrack_ecache); + BUG_ON(ecache->ct == ct); + if (ecache->ct) + __ip_ct_deliver_cached_events(ecache); + /* initialize for this conntrack/packet */ + ecache->ct = ct; + nf_conntrack_get(&ct->ct_general); +} + +/* flush the event cache - touches other CPU's data and must not be called while + * packets are still passing through the code */ +static void ip_ct_event_cache_flush(void) +{ + struct ip_conntrack_ecache *ecache; + int cpu; + + for_each_cpu(cpu) { + ecache = &per_cpu(ip_conntrack_ecache, cpu); + if (ecache->ct) + ip_conntrack_put(ecache->ct); + } +} +#else +static inline void ip_ct_event_cache_flush(void) {} +#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */ + +DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat); + static int ip_conntrack_hash_rnd_initted; static unsigned int ip_conntrack_hash_rnd; @@ -144,6 +204,13 @@ static void unlink_expect(struct ip_conntrack_expect *exp) list_del(&exp->list); CONNTRACK_STAT_INC(expect_delete); exp->master->expecting--; + ip_conntrack_expect_put(exp); +} + +void __ip_ct_expect_unlink_destroy(struct ip_conntrack_expect *exp) +{ + unlink_expect(exp); + ip_conntrack_expect_put(exp); } static void expectation_timed_out(unsigned long ul_expect) @@ -156,6 +223,33 @@ static void expectation_timed_out(unsigned long ul_expect) ip_conntrack_expect_put(exp); } +struct ip_conntrack_expect * +__ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple) +{ + struct ip_conntrack_expect *i; + + list_for_each_entry(i, &ip_conntrack_expect_list, list) { + if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) { + atomic_inc(&i->use); + return i; + } + } + return NULL; +} + +/* Just find a expectation corresponding to a tuple. */ +struct ip_conntrack_expect * +ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple) +{ + struct ip_conntrack_expect *i; + + read_lock_bh(&ip_conntrack_lock); + i = __ip_conntrack_expect_find(tuple); + read_unlock_bh(&ip_conntrack_lock); + + return i; +} + /* If an expectation for this connection is found, it gets delete from * global list then returned. */ static struct ip_conntrack_expect * @@ -180,7 +274,7 @@ find_expectation(const struct ip_conntrack_tuple *tuple) } /* delete all expectations for this conntrack */ -static void remove_expectations(struct ip_conntrack *ct) +void ip_ct_remove_expectations(struct ip_conntrack *ct) { struct ip_conntrack_expect *i, *tmp; @@ -210,7 +304,7 @@ clean_from_lists(struct ip_conntrack *ct) LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]); /* Destroy all pending expectations */ - remove_expectations(ct); + ip_ct_remove_expectations(ct); } static void @@ -223,10 +317,13 @@ destroy_conntrack(struct nf_conntrack *nfct) IP_NF_ASSERT(atomic_read(&nfct->use) == 0); IP_NF_ASSERT(!timer_pending(&ct->timeout)); + ip_conntrack_event(IPCT_DESTROY, ct); + set_bit(IPS_DYING_BIT, &ct->status); + /* To make sure we don't get any weird locking issues here: * destroy_conntrack() MUST NOT be called with a write lock * to ip_conntrack_lock!!! -HW */ - proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum); + proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum); if (proto && proto->destroy) proto->destroy(ct); @@ -238,7 +335,7 @@ destroy_conntrack(struct nf_conntrack *nfct) * except TFTP can create an expectation on the first packet, * before connection is in the list, so we need to clean here, * too. */ - remove_expectations(ct); + ip_ct_remove_expectations(ct); /* We overload first tuple to link into unconfirmed list. */ if (!is_confirmed(ct)) { @@ -253,8 +350,7 @@ destroy_conntrack(struct nf_conntrack *nfct) ip_conntrack_put(ct->master); DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct); - kmem_cache_free(ip_conntrack_cachep, ct); - atomic_dec(&ip_conntrack_count); + ip_conntrack_free(ct); } static void death_by_timeout(unsigned long ul_conntrack) @@ -280,7 +376,7 @@ conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i, && ip_ct_tuple_equal(tuple, &i->tuple); } -static struct ip_conntrack_tuple_hash * +struct ip_conntrack_tuple_hash * __ip_conntrack_find(const struct ip_conntrack_tuple *tuple, const struct ip_conntrack *ignored_conntrack) { @@ -315,6 +411,29 @@ ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple, return h; } +static void __ip_conntrack_hash_insert(struct ip_conntrack *ct, + unsigned int hash, + unsigned int repl_hash) +{ + ct->id = ++ip_conntrack_next_id; + list_prepend(&ip_conntrack_hash[hash], + &ct->tuplehash[IP_CT_DIR_ORIGINAL].list); + list_prepend(&ip_conntrack_hash[repl_hash], + &ct->tuplehash[IP_CT_DIR_REPLY].list); +} + +void ip_conntrack_hash_insert(struct ip_conntrack *ct) +{ + unsigned int hash, repl_hash; + + hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); + + write_lock_bh(&ip_conntrack_lock); + __ip_conntrack_hash_insert(ct, hash, repl_hash); + write_unlock_bh(&ip_conntrack_lock); +} + /* Confirm a connection given skb; places it in hash table */ int __ip_conntrack_confirm(struct sk_buff **pskb) @@ -361,10 +480,7 @@ __ip_conntrack_confirm(struct sk_buff **pskb) /* Remove from unconfirmed list */ list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); - list_prepend(&ip_conntrack_hash[hash], - &ct->tuplehash[IP_CT_DIR_ORIGINAL]); - list_prepend(&ip_conntrack_hash[repl_hash], - &ct->tuplehash[IP_CT_DIR_REPLY]); + __ip_conntrack_hash_insert(ct, hash, repl_hash); /* Timer relative to confirmation time, not original setting time, otherwise we'd get timer wrap in weird delay cases. */ @@ -374,6 +490,16 @@ __ip_conntrack_confirm(struct sk_buff **pskb) set_bit(IPS_CONFIRMED_BIT, &ct->status); CONNTRACK_STAT_INC(insert); write_unlock_bh(&ip_conntrack_lock); + if (ct->helper) + ip_conntrack_event_cache(IPCT_HELPER, *pskb); +#ifdef CONFIG_IP_NF_NAT_NEEDED + if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) || + test_bit(IPS_DST_NAT_DONE_BIT, &ct->status)) + ip_conntrack_event_cache(IPCT_NATINFO, *pskb); +#endif + ip_conntrack_event_cache(master_ct(ct) ? + IPCT_RELATED : IPCT_NEW, *pskb); + return NF_ACCEPT; } @@ -438,34 +564,84 @@ static inline int helper_cmp(const struct ip_conntrack_helper *i, return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask); } -static struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple) +static struct ip_conntrack_helper * +__ip_conntrack_helper_find( const struct ip_conntrack_tuple *tuple) { return LIST_FIND(&helpers, helper_cmp, struct ip_conntrack_helper *, tuple); } -/* Allocate a new conntrack: we return -ENOMEM if classification - failed due to stress. Otherwise it really is unclassifiable. */ -static struct ip_conntrack_tuple_hash * -init_conntrack(const struct ip_conntrack_tuple *tuple, - struct ip_conntrack_protocol *protocol, - struct sk_buff *skb) +struct ip_conntrack_helper * +ip_conntrack_helper_find_get( const struct ip_conntrack_tuple *tuple) +{ + struct ip_conntrack_helper *helper; + + /* need ip_conntrack_lock to assure that helper exists until + * try_module_get() is called */ + read_lock_bh(&ip_conntrack_lock); + + helper = __ip_conntrack_helper_find(tuple); + if (helper) { + /* need to increase module usage count to assure helper will + * not go away while the caller is e.g. busy putting a + * conntrack in the hash that uses the helper */ + if (!try_module_get(helper->me)) + helper = NULL; + } + + read_unlock_bh(&ip_conntrack_lock); + + return helper; +} + +void ip_conntrack_helper_put(struct ip_conntrack_helper *helper) +{ + module_put(helper->me); +} + +struct ip_conntrack_protocol * +__ip_conntrack_proto_find(u_int8_t protocol) +{ + return ip_ct_protos[protocol]; +} + +/* this is guaranteed to always return a valid protocol helper, since + * it falls back to generic_protocol */ +struct ip_conntrack_protocol * +ip_conntrack_proto_find_get(u_int8_t protocol) +{ + struct ip_conntrack_protocol *p; + + preempt_disable(); + p = __ip_conntrack_proto_find(protocol); + if (p) { + if (!try_module_get(p->me)) + p = &ip_conntrack_generic_protocol; + } + preempt_enable(); + + return p; +} + +void ip_conntrack_proto_put(struct ip_conntrack_protocol *p) +{ + module_put(p->me); +} + +struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig, + struct ip_conntrack_tuple *repl) { struct ip_conntrack *conntrack; - struct ip_conntrack_tuple repl_tuple; - size_t hash; - struct ip_conntrack_expect *exp; if (!ip_conntrack_hash_rnd_initted) { get_random_bytes(&ip_conntrack_hash_rnd, 4); ip_conntrack_hash_rnd_initted = 1; } - hash = hash_conntrack(tuple); - if (ip_conntrack_max && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) { + unsigned int hash = hash_conntrack(orig); /* Try dropping from this hash chain. */ if (!early_drop(&ip_conntrack_hash[hash])) { if (net_ratelimit()) @@ -476,11 +652,6 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, } } - if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) { - DEBUGP("Can't invert tuple.\n"); - return NULL; - } - conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC); if (!conntrack) { DEBUGP("Can't allocate conntrack.\n"); @@ -490,17 +661,50 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, memset(conntrack, 0, sizeof(*conntrack)); atomic_set(&conntrack->ct_general.use, 1); conntrack->ct_general.destroy = destroy_conntrack; - conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple; - conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple; - if (!protocol->new(conntrack, skb)) { - kmem_cache_free(ip_conntrack_cachep, conntrack); - return NULL; - } + conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; + conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl; /* Don't set timer yet: wait for confirmation */ init_timer(&conntrack->timeout); conntrack->timeout.data = (unsigned long)conntrack; conntrack->timeout.function = death_by_timeout; + atomic_inc(&ip_conntrack_count); + + return conntrack; +} + +void +ip_conntrack_free(struct ip_conntrack *conntrack) +{ + atomic_dec(&ip_conntrack_count); + kmem_cache_free(ip_conntrack_cachep, conntrack); +} + +/* Allocate a new conntrack: we return -ENOMEM if classification + * failed due to stress. Otherwise it really is unclassifiable */ +static struct ip_conntrack_tuple_hash * +init_conntrack(struct ip_conntrack_tuple *tuple, + struct ip_conntrack_protocol *protocol, + struct sk_buff *skb) +{ + struct ip_conntrack *conntrack; + struct ip_conntrack_tuple repl_tuple; + struct ip_conntrack_expect *exp; + + if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) { + DEBUGP("Can't invert tuple.\n"); + return NULL; + } + + conntrack = ip_conntrack_alloc(tuple, &repl_tuple); + if (conntrack == NULL || IS_ERR(conntrack)) + return (struct ip_conntrack_tuple_hash *)conntrack; + + if (!protocol->new(conntrack, skb)) { + ip_conntrack_free(conntrack); + return NULL; + } + write_lock_bh(&ip_conntrack_lock); exp = find_expectation(tuple); @@ -521,7 +725,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, nf_conntrack_get(&conntrack->master->ct_general); CONNTRACK_STAT_INC(expect_new); } else { - conntrack->helper = ip_ct_find_helper(&repl_tuple); + conntrack->helper = __ip_conntrack_helper_find(&repl_tuple); CONNTRACK_STAT_INC(new); } @@ -529,7 +733,6 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, /* Overload tuple linked list to put us in unconfirmed list. */ list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed); - atomic_inc(&ip_conntrack_count); write_unlock_bh(&ip_conntrack_lock); if (exp) { @@ -607,7 +810,7 @@ unsigned int ip_conntrack_in(unsigned int hooknum, struct ip_conntrack *ct; enum ip_conntrack_info ctinfo; struct ip_conntrack_protocol *proto; - int set_reply; + int set_reply = 0; int ret; /* Previously seen (loopback or untracked)? Ignore. */ @@ -625,9 +828,6 @@ unsigned int ip_conntrack_in(unsigned int hooknum, return NF_DROP; } - /* FIXME: Do this right please. --RR */ - (*pskb)->nfcache |= NFC_UNKNOWN; - /* Doesn't cover locally-generated broadcast, so not worth it. */ #if 0 /* Ignore broadcast: no `connection'. */ @@ -643,7 +843,7 @@ unsigned int ip_conntrack_in(unsigned int hooknum, } #endif - proto = ip_ct_find_proto((*pskb)->nh.iph->protocol); + proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol); /* It may be an special packet, error, unclean... * inverse of the return code tells to the netfilter @@ -679,8 +879,8 @@ unsigned int ip_conntrack_in(unsigned int hooknum, return -ret; } - if (set_reply) - set_bit(IPS_SEEN_REPLY_BIT, &ct->status); + if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status)) + ip_conntrack_event_cache(IPCT_STATUS, *pskb); return ret; } @@ -689,7 +889,7 @@ int invert_tuplepr(struct ip_conntrack_tuple *inverse, const struct ip_conntrack_tuple *orig) { return ip_ct_invert_tuple(inverse, orig, - ip_ct_find_proto(orig->dst.protonum)); + __ip_conntrack_proto_find(orig->dst.protonum)); } /* Would two expected things clash? */ @@ -769,6 +969,8 @@ static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp) exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ; add_timer(&exp->timeout); + exp->id = ++ip_conntrack_expect_next_id; + atomic_inc(&exp->use); CONNTRACK_STAT_INC(expect_create); } @@ -827,6 +1029,7 @@ int ip_conntrack_expect_related(struct ip_conntrack_expect *expect) evict_oldest_expect(expect->master); ip_conntrack_expect_insert(expect); + ip_conntrack_expect_event(IPEXP_NEW, expect); ret = 0; out: write_unlock_bh(&ip_conntrack_lock); @@ -847,7 +1050,7 @@ void ip_conntrack_alter_reply(struct ip_conntrack *conntrack, conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; if (!conntrack->master && conntrack->expecting == 0) - conntrack->helper = ip_ct_find_helper(newreply); + conntrack->helper = __ip_conntrack_helper_find(newreply); write_unlock_bh(&ip_conntrack_lock); } @@ -861,11 +1064,26 @@ int ip_conntrack_helper_register(struct ip_conntrack_helper *me) return 0; } +struct ip_conntrack_helper * +__ip_conntrack_helper_find_byname(const char *name) +{ + struct ip_conntrack_helper *h; + + list_for_each_entry(h, &helpers, list) { + if (!strcmp(h->name, name)) + return h; + } + + return NULL; +} + static inline int unhelp(struct ip_conntrack_tuple_hash *i, const struct ip_conntrack_helper *me) { - if (tuplehash_to_ctrack(i)->helper == me) + if (tuplehash_to_ctrack(i)->helper == me) { + ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i)); tuplehash_to_ctrack(i)->helper = NULL; + } return 0; } @@ -927,12 +1145,46 @@ void ip_ct_refresh_acct(struct ip_conntrack *ct, if (del_timer(&ct->timeout)) { ct->timeout.expires = jiffies + extra_jiffies; add_timer(&ct->timeout); + ip_conntrack_event_cache(IPCT_REFRESH, skb); } ct_add_counters(ct, ctinfo, skb); write_unlock_bh(&ip_conntrack_lock); } } +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) +/* Generic function for tcp/udp/sctp/dccp and alike. This needs to be + * in ip_conntrack_core, since we don't want the protocols to autoload + * or depend on ctnetlink */ +int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb, + const struct ip_conntrack_tuple *tuple) +{ + NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t), + &tuple->src.u.tcp.port); + NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t), + &tuple->dst.u.tcp.port); + return 0; + +nfattr_failure: + return -1; +} + +int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[], + struct ip_conntrack_tuple *t) +{ + if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1]) + return -EINVAL; + + t->src.u.tcp.port = + *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]); + t->dst.u.tcp.port = + *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]); + + return 0; +} +#endif + /* Returns new sk_buff, or NULL */ struct sk_buff * ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user) @@ -943,10 +1195,8 @@ ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user) skb = ip_defrag(skb, user); local_bh_enable(); - if (skb) { + if (skb) ip_send_check(skb->nh.iph); - skb->nfcache |= NFC_ALTERED; - } return skb; } @@ -1096,16 +1346,14 @@ static void free_conntrack_hash(void) * ip_conntrack_htable_size)); } -/* Mishearing the voices in his head, our hero wonders how he's - supposed to kill the mall. */ -void ip_conntrack_cleanup(void) +void ip_conntrack_flush() { - ip_ct_attach = NULL; /* This makes sure all current packets have passed through netfilter framework. Roll on, two-stage module delete... */ synchronize_net(); - + + ip_ct_event_cache_flush(); i_see_dead_people: ip_ct_iterate_cleanup(kill_all, NULL); if (atomic_read(&ip_conntrack_count) != 0) { @@ -1115,7 +1363,14 @@ void ip_conntrack_cleanup(void) /* wait until all references to ip_conntrack_untracked are dropped */ while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1) schedule(); +} +/* Mishearing the voices in his head, our hero wonders how he's + supposed to kill the mall. */ +void ip_conntrack_cleanup(void) +{ + ip_ct_attach = NULL; + ip_conntrack_flush(); kmem_cache_destroy(ip_conntrack_cachep); kmem_cache_destroy(ip_conntrack_expect_cachep); free_conntrack_hash(); diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c index 7a3b773be3f..3a2627db172 100644 --- a/net/ipv4/netfilter/ip_conntrack_ftp.c +++ b/net/ipv4/netfilter/ip_conntrack_ftp.c @@ -25,8 +25,7 @@ MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>"); MODULE_DESCRIPTION("ftp connection tracking helper"); /* This is slow, but it's simple. --RR */ -static char ftp_buffer[65536]; - +static char *ftp_buffer; static DEFINE_SPINLOCK(ip_ftp_lock); #define MAX_PORTS 8 @@ -262,7 +261,8 @@ static int find_nl_seq(u32 seq, const struct ip_ct_ftp_master *info, int dir) } /* We don't update if it's older than what we have. */ -static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir) +static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir, + struct sk_buff *skb) { unsigned int i, oldest = NUM_SEQ_TO_REMEMBER; @@ -276,10 +276,13 @@ static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir) oldest = i; } - if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) + if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) { info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq; - else if (oldest != NUM_SEQ_TO_REMEMBER) + ip_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb); + } else if (oldest != NUM_SEQ_TO_REMEMBER) { info->seq_aft_nl[dir][oldest] = nl_seq; + ip_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb); + } } static int help(struct sk_buff **pskb, @@ -439,7 +442,7 @@ out_update_nl: /* Now if this ends in \n, update ftp info. Seq may have been * adjusted by NAT code. */ if (ends_in_nl) - update_nl_seq(seq, ct_ftp_info,dir); + update_nl_seq(seq, ct_ftp_info,dir, *pskb); out: spin_unlock_bh(&ip_ftp_lock); return ret; @@ -457,6 +460,8 @@ static void fini(void) ports[i]); ip_conntrack_helper_unregister(&ftp[i]); } + + kfree(ftp_buffer); } static int __init init(void) @@ -464,6 +469,10 @@ static int __init init(void) int i, ret; char *tmpname; + ftp_buffer = kmalloc(65536, GFP_KERNEL); + if (!ftp_buffer) + return -ENOMEM; + if (ports_c == 0) ports[ports_c++] = FTP_PORT; diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c index 4a28f297d50..25438eec21a 100644 --- a/net/ipv4/netfilter/ip_conntrack_irc.c +++ b/net/ipv4/netfilter/ip_conntrack_irc.c @@ -39,7 +39,7 @@ static int ports_c; static int max_dcc_channels = 8; static unsigned int dcc_timeout = 300; /* This is slow, but it's simple. --RR */ -static char irc_buffer[65536]; +static char *irc_buffer; static DEFINE_SPINLOCK(irc_buffer_lock); unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb, @@ -257,6 +257,10 @@ static int __init init(void) printk("ip_conntrack_irc: dcc_timeout must be a positive integer\n"); return -EBUSY; } + + irc_buffer = kmalloc(65536, GFP_KERNEL); + if (!irc_buffer) + return -ENOMEM; /* If no port given, default to standard irc port */ if (ports_c == 0) @@ -304,6 +308,7 @@ static void fini(void) ports[i]); ip_conntrack_helper_unregister(&irc_helpers[i]); } + kfree(irc_buffer); } module_init(init); diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c new file mode 100644 index 00000000000..a4e9278db4e --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -0,0 +1,1579 @@ +/* Connection tracking via netlink socket. Allows for user space + * protocol helpers and general trouble making from userspace. + * + * (C) 2001 by Jay Schulist <jschlst@samba.org> + * (C) 2002-2005 by Harald Welte <laforge@gnumonks.org> + * (C) 2003 by Patrick Mchardy <kaber@trash.net> + * (C) 2005 by Pablo Neira Ayuso <pablo@eurodev.net> + * + * I've reworked this stuff to use attributes instead of conntrack + * structures. 5.44 am. I need more tea. --pablo 05/07/11. + * + * Initial connection tracking via netlink development funded and + * generally made possible by Network Robots, Inc. (www.networkrobots.com) + * + * Further development of this code funded by Astaro AG (http://www.astaro.com) + * + * This software may be used and distributed according to the terms + * of the GNU General Public License, incorporated herein by reference. + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/timer.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/netlink.h> +#include <linux/spinlock.h> +#include <linux/notifier.h> +#include <linux/rtnetlink.h> + +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ip_conntrack.h> +#include <linux/netfilter_ipv4/ip_conntrack_core.h> +#include <linux/netfilter_ipv4/ip_conntrack_helper.h> +#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> +#include <linux/netfilter_ipv4/ip_nat_protocol.h> + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_conntrack.h> + +MODULE_LICENSE("GPL"); + +static char __initdata version[] = "0.90"; + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + + +static inline int +ctnetlink_dump_tuples_proto(struct sk_buff *skb, + const struct ip_conntrack_tuple *tuple) +{ + struct ip_conntrack_protocol *proto; + + NFA_PUT(skb, CTA_PROTO_NUM, sizeof(u_int8_t), &tuple->dst.protonum); + + proto = ip_conntrack_proto_find_get(tuple->dst.protonum); + if (proto && proto->tuple_to_nfattr) + return proto->tuple_to_nfattr(skb, tuple); + + return 0; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_dump_tuples(struct sk_buff *skb, + const struct ip_conntrack_tuple *tuple) +{ + struct nfattr *nest_parms; + + nest_parms = NFA_NEST(skb, CTA_TUPLE_IP); + NFA_PUT(skb, CTA_IP_V4_SRC, sizeof(u_int32_t), &tuple->src.ip); + NFA_PUT(skb, CTA_IP_V4_DST, sizeof(u_int32_t), &tuple->dst.ip); + NFA_NEST_END(skb, nest_parms); + + nest_parms = NFA_NEST(skb, CTA_TUPLE_PROTO); + ctnetlink_dump_tuples_proto(skb, tuple); + NFA_NEST_END(skb, nest_parms); + + return 0; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_dump_status(struct sk_buff *skb, const struct ip_conntrack *ct) +{ + u_int32_t status = htonl((u_int32_t) ct->status); + NFA_PUT(skb, CTA_STATUS, sizeof(status), &status); + return 0; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_dump_timeout(struct sk_buff *skb, const struct ip_conntrack *ct) +{ + long timeout_l = ct->timeout.expires - jiffies; + u_int32_t timeout; + + if (timeout_l < 0) + timeout = 0; + else + timeout = htonl(timeout_l / HZ); + + NFA_PUT(skb, CTA_TIMEOUT, sizeof(timeout), &timeout); + return 0; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_dump_protoinfo(struct sk_buff *skb, const struct ip_conntrack *ct) +{ + struct ip_conntrack_protocol *proto = ip_conntrack_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); + + struct nfattr *nest_proto; + int ret; + + if (!proto || !proto->to_nfattr) + return 0; + + nest_proto = NFA_NEST(skb, CTA_PROTOINFO); + + ret = proto->to_nfattr(skb, nest_proto, ct); + + ip_conntrack_proto_put(proto); + + NFA_NEST_END(skb, nest_proto); + + return ret; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct ip_conntrack *ct) +{ + struct nfattr *nest_helper; + + if (!ct->helper) + return 0; + + nest_helper = NFA_NEST(skb, CTA_HELP); + NFA_PUT(skb, CTA_HELP_NAME, CTA_HELP_MAXNAMESIZE, &ct->helper->name); + + if (ct->helper->to_nfattr) + ct->helper->to_nfattr(skb, ct); + + NFA_NEST_END(skb, nest_helper); + + return 0; + +nfattr_failure: + return -1; +} + +#ifdef CONFIG_IP_NF_CT_ACCT +static inline int +ctnetlink_dump_counters(struct sk_buff *skb, const struct ip_conntrack *ct, + enum ip_conntrack_dir dir) +{ + enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG; + struct nfattr *nest_count = NFA_NEST(skb, type); + u_int64_t tmp; + + tmp = cpu_to_be64(ct->counters[dir].packets); + NFA_PUT(skb, CTA_COUNTERS_PACKETS, sizeof(u_int64_t), &tmp); + + tmp = cpu_to_be64(ct->counters[dir].bytes); + NFA_PUT(skb, CTA_COUNTERS_BYTES, sizeof(u_int64_t), &tmp); + + NFA_NEST_END(skb, nest_count); + + return 0; + +nfattr_failure: + return -1; +} +#else +#define ctnetlink_dump_counters(a, b, c) (0) +#endif + +#ifdef CONFIG_IP_NF_CONNTRACK_MARK +static inline int +ctnetlink_dump_mark(struct sk_buff *skb, const struct ip_conntrack *ct) +{ + u_int32_t mark = htonl(ct->mark); + + NFA_PUT(skb, CTA_MARK, sizeof(u_int32_t), &mark); + return 0; + +nfattr_failure: + return -1; +} +#else +#define ctnetlink_dump_mark(a, b) (0) +#endif + +static inline int +ctnetlink_dump_id(struct sk_buff *skb, const struct ip_conntrack *ct) +{ + u_int32_t id = htonl(ct->id); + NFA_PUT(skb, CTA_ID, sizeof(u_int32_t), &id); + return 0; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_dump_use(struct sk_buff *skb, const struct ip_conntrack *ct) +{ + unsigned int use = htonl(atomic_read(&ct->ct_general.use)); + + NFA_PUT(skb, CTA_USE, sizeof(u_int32_t), &use); + return 0; + +nfattr_failure: + return -1; +} + +#define tuple(ct, dir) (&(ct)->tuplehash[dir].tuple) + +static int +ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq, + int event, int nowait, + const struct ip_conntrack *ct) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + struct nfattr *nest_parms; + unsigned char *b; + + b = skb->tail; + + event |= NFNL_SUBSYS_CTNETLINK << 8; + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg)); + nfmsg = NLMSG_DATA(nlh); + + nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0; + nfmsg->nfgen_family = AF_INET; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG); + if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) + goto nfattr_failure; + NFA_NEST_END(skb, nest_parms); + + nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY); + if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0) + goto nfattr_failure; + NFA_NEST_END(skb, nest_parms); + + if (ctnetlink_dump_status(skb, ct) < 0 || + ctnetlink_dump_timeout(skb, ct) < 0 || + ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 || + ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 || + ctnetlink_dump_protoinfo(skb, ct) < 0 || + ctnetlink_dump_helpinfo(skb, ct) < 0 || + ctnetlink_dump_mark(skb, ct) < 0 || + ctnetlink_dump_id(skb, ct) < 0 || + ctnetlink_dump_use(skb, ct) < 0) + goto nfattr_failure; + + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +nfattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS +static int ctnetlink_conntrack_event(struct notifier_block *this, + unsigned long events, void *ptr) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + struct nfattr *nest_parms; + struct ip_conntrack *ct = (struct ip_conntrack *)ptr; + struct sk_buff *skb; + unsigned int type; + unsigned char *b; + unsigned int flags = 0, group; + + /* ignore our fake conntrack entry */ + if (ct == &ip_conntrack_untracked) + return NOTIFY_DONE; + + if (events & IPCT_DESTROY) { + type = IPCTNL_MSG_CT_DELETE; + group = NFNLGRP_CONNTRACK_DESTROY; + goto alloc_skb; + } + if (events & (IPCT_NEW | IPCT_RELATED)) { + type = IPCTNL_MSG_CT_NEW; + flags = NLM_F_CREATE|NLM_F_EXCL; + /* dump everything */ + events = ~0UL; + group = NFNLGRP_CONNTRACK_NEW; + goto alloc_skb; + } + if (events & (IPCT_STATUS | + IPCT_PROTOINFO | + IPCT_HELPER | + IPCT_HELPINFO | + IPCT_NATINFO)) { + type = IPCTNL_MSG_CT_NEW; + group = NFNLGRP_CONNTRACK_UPDATE; + goto alloc_skb; + } + + return NOTIFY_DONE; + +alloc_skb: + /* FIXME: Check if there are any listeners before, don't hurt performance */ + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); + if (!skb) + return NOTIFY_DONE; + + b = skb->tail; + + type |= NFNL_SUBSYS_CTNETLINK << 8; + nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg)); + nfmsg = NLMSG_DATA(nlh); + + nlh->nlmsg_flags = flags; + nfmsg->nfgen_family = AF_INET; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG); + if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) + goto nfattr_failure; + NFA_NEST_END(skb, nest_parms); + + nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY); + if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0) + goto nfattr_failure; + NFA_NEST_END(skb, nest_parms); + + /* NAT stuff is now a status flag */ + if ((events & IPCT_STATUS || events & IPCT_NATINFO) + && ctnetlink_dump_status(skb, ct) < 0) + goto nfattr_failure; + if (events & IPCT_REFRESH + && ctnetlink_dump_timeout(skb, ct) < 0) + goto nfattr_failure; + if (events & IPCT_PROTOINFO + && ctnetlink_dump_protoinfo(skb, ct) < 0) + goto nfattr_failure; + if (events & IPCT_HELPINFO + && ctnetlink_dump_helpinfo(skb, ct) < 0) + goto nfattr_failure; + + if (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 || + ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0) + goto nfattr_failure; + + nlh->nlmsg_len = skb->tail - b; + nfnetlink_send(skb, 0, group, 0); + return NOTIFY_DONE; + +nlmsg_failure: +nfattr_failure: + kfree_skb(skb); + return NOTIFY_DONE; +} +#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */ + +static int ctnetlink_done(struct netlink_callback *cb) +{ + DEBUGP("entered %s\n", __FUNCTION__); + return 0; +} + +static int +ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct ip_conntrack *ct = NULL; + struct ip_conntrack_tuple_hash *h; + struct list_head *i; + u_int32_t *id = (u_int32_t *) &cb->args[1]; + + DEBUGP("entered %s, last bucket=%lu id=%u\n", __FUNCTION__, + cb->args[0], *id); + + read_lock_bh(&ip_conntrack_lock); + for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) { + list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) { + h = (struct ip_conntrack_tuple_hash *) i; + if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) + continue; + ct = tuplehash_to_ctrack(h); + if (ct->id <= *id) + continue; + if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + IPCTNL_MSG_CT_NEW, + 1, ct) < 0) + goto out; + *id = ct->id; + } + } +out: + read_unlock_bh(&ip_conntrack_lock); + + DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id); + + return skb->len; +} + +#ifdef CONFIG_IP_NF_CT_ACCT +static int +ctnetlink_dump_table_w(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct ip_conntrack *ct = NULL; + struct ip_conntrack_tuple_hash *h; + struct list_head *i; + u_int32_t *id = (u_int32_t *) &cb->args[1]; + + DEBUGP("entered %s, last bucket=%u id=%u\n", __FUNCTION__, + cb->args[0], *id); + + write_lock_bh(&ip_conntrack_lock); + for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) { + list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) { + h = (struct ip_conntrack_tuple_hash *) i; + if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) + continue; + ct = tuplehash_to_ctrack(h); + if (ct->id <= *id) + continue; + if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + IPCTNL_MSG_CT_NEW, + 1, ct) < 0) + goto out; + *id = ct->id; + + memset(&ct->counters, 0, sizeof(ct->counters)); + } + } +out: + write_unlock_bh(&ip_conntrack_lock); + + DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id); + + return skb->len; +} +#endif + +static const int cta_min_ip[CTA_IP_MAX] = { + [CTA_IP_V4_SRC-1] = sizeof(u_int32_t), + [CTA_IP_V4_DST-1] = sizeof(u_int32_t), +}; + +static inline int +ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple) +{ + struct nfattr *tb[CTA_IP_MAX]; + + DEBUGP("entered %s\n", __FUNCTION__); + + + if (nfattr_parse_nested(tb, CTA_IP_MAX, attr) < 0) + goto nfattr_failure; + + if (nfattr_bad_size(tb, CTA_IP_MAX, cta_min_ip)) + return -EINVAL; + + if (!tb[CTA_IP_V4_SRC-1]) + return -EINVAL; + tuple->src.ip = *(u_int32_t *)NFA_DATA(tb[CTA_IP_V4_SRC-1]); + + if (!tb[CTA_IP_V4_DST-1]) + return -EINVAL; + tuple->dst.ip = *(u_int32_t *)NFA_DATA(tb[CTA_IP_V4_DST-1]); + + DEBUGP("leaving\n"); + + return 0; + +nfattr_failure: + return -1; +} + +static const int cta_min_proto[CTA_PROTO_MAX] = { + [CTA_PROTO_NUM-1] = sizeof(u_int16_t), + [CTA_PROTO_SRC_PORT-1] = sizeof(u_int16_t), + [CTA_PROTO_DST_PORT-1] = sizeof(u_int16_t), + [CTA_PROTO_ICMP_TYPE-1] = sizeof(u_int8_t), + [CTA_PROTO_ICMP_CODE-1] = sizeof(u_int8_t), + [CTA_PROTO_ICMP_ID-1] = sizeof(u_int16_t), +}; + +static inline int +ctnetlink_parse_tuple_proto(struct nfattr *attr, + struct ip_conntrack_tuple *tuple) +{ + struct nfattr *tb[CTA_PROTO_MAX]; + struct ip_conntrack_protocol *proto; + int ret = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (nfattr_parse_nested(tb, CTA_PROTO_MAX, attr) < 0) + goto nfattr_failure; + + if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto)) + return -EINVAL; + + if (!tb[CTA_PROTO_NUM-1]) + return -EINVAL; + tuple->dst.protonum = *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_NUM-1]); + + proto = ip_conntrack_proto_find_get(tuple->dst.protonum); + + if (likely(proto && proto->nfattr_to_tuple)) { + ret = proto->nfattr_to_tuple(tb, tuple); + ip_conntrack_proto_put(proto); + } + + return ret; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_parse_tuple(struct nfattr *cda[], struct ip_conntrack_tuple *tuple, + enum ctattr_tuple type) +{ + struct nfattr *tb[CTA_TUPLE_MAX]; + int err; + + DEBUGP("entered %s\n", __FUNCTION__); + + memset(tuple, 0, sizeof(*tuple)); + + if (nfattr_parse_nested(tb, CTA_TUPLE_MAX, cda[type-1]) < 0) + goto nfattr_failure; + + if (!tb[CTA_TUPLE_IP-1]) + return -EINVAL; + + err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP-1], tuple); + if (err < 0) + return err; + + if (!tb[CTA_TUPLE_PROTO-1]) + return -EINVAL; + + err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO-1], tuple); + if (err < 0) + return err; + + /* orig and expect tuples get DIR_ORIGINAL */ + if (type == CTA_TUPLE_REPLY) + tuple->dst.dir = IP_CT_DIR_REPLY; + else + tuple->dst.dir = IP_CT_DIR_ORIGINAL; + + DUMP_TUPLE(tuple); + + DEBUGP("leaving\n"); + + return 0; + +nfattr_failure: + return -1; +} + +#ifdef CONFIG_IP_NF_NAT_NEEDED +static const int cta_min_protonat[CTA_PROTONAT_MAX] = { + [CTA_PROTONAT_PORT_MIN-1] = sizeof(u_int16_t), + [CTA_PROTONAT_PORT_MAX-1] = sizeof(u_int16_t), +}; + +static int ctnetlink_parse_nat_proto(struct nfattr *attr, + const struct ip_conntrack *ct, + struct ip_nat_range *range) +{ + struct nfattr *tb[CTA_PROTONAT_MAX]; + struct ip_nat_protocol *npt; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (nfattr_parse_nested(tb, CTA_PROTONAT_MAX, attr) < 0) + goto nfattr_failure; + + if (nfattr_bad_size(tb, CTA_PROTONAT_MAX, cta_min_protonat)) + goto nfattr_failure; + + npt = ip_nat_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); + if (!npt) + return 0; + + if (!npt->nfattr_to_range) { + ip_nat_proto_put(npt); + return 0; + } + + /* nfattr_to_range returns 1 if it parsed, 0 if not, neg. on error */ + if (npt->nfattr_to_range(tb, range) > 0) + range->flags |= IP_NAT_RANGE_PROTO_SPECIFIED; + + ip_nat_proto_put(npt); + + DEBUGP("leaving\n"); + return 0; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_parse_nat(struct nfattr *cda[], + const struct ip_conntrack *ct, struct ip_nat_range *range) +{ + struct nfattr *tb[CTA_NAT_MAX]; + int err; + + DEBUGP("entered %s\n", __FUNCTION__); + + memset(range, 0, sizeof(*range)); + + if (nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]) < 0) + goto nfattr_failure; + + if (tb[CTA_NAT_MINIP-1]) + range->min_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MINIP-1]); + + if (!tb[CTA_NAT_MAXIP-1]) + range->max_ip = range->min_ip; + else + range->max_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MAXIP-1]); + + if (range->min_ip) + range->flags |= IP_NAT_RANGE_MAP_IPS; + + if (!tb[CTA_NAT_PROTO-1]) + return 0; + + err = ctnetlink_parse_nat_proto(tb[CTA_NAT_PROTO-1], ct, range); + if (err < 0) + return err; + + DEBUGP("leaving\n"); + return 0; + +nfattr_failure: + return -1; +} +#endif + +static inline int +ctnetlink_parse_help(struct nfattr *attr, char **helper_name) +{ + struct nfattr *tb[CTA_HELP_MAX]; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (nfattr_parse_nested(tb, CTA_HELP_MAX, attr) < 0) + goto nfattr_failure; + + if (!tb[CTA_HELP_NAME-1]) + return -EINVAL; + + *helper_name = NFA_DATA(tb[CTA_HELP_NAME-1]); + + return 0; + +nfattr_failure: + return -1; +} + +static int +ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) +{ + struct ip_conntrack_tuple_hash *h; + struct ip_conntrack_tuple tuple; + struct ip_conntrack *ct; + int err = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (cda[CTA_TUPLE_ORIG-1]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG); + else if (cda[CTA_TUPLE_REPLY-1]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY); + else { + /* Flush the whole table */ + ip_conntrack_flush(); + return 0; + } + + if (err < 0) + return err; + + h = ip_conntrack_find_get(&tuple, NULL); + if (!h) { + DEBUGP("tuple not found in conntrack hash\n"); + return -ENOENT; + } + + ct = tuplehash_to_ctrack(h); + + if (cda[CTA_ID-1]) { + u_int32_t id = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_ID-1])); + if (ct->id != id) { + ip_conntrack_put(ct); + return -ENOENT; + } + } + if (del_timer(&ct->timeout)) { + ip_conntrack_put(ct); + ct->timeout.function((unsigned long)ct); + return 0; + } + ip_conntrack_put(ct); + DEBUGP("leaving\n"); + + return 0; +} + +static int +ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) +{ + struct ip_conntrack_tuple_hash *h; + struct ip_conntrack_tuple tuple; + struct ip_conntrack *ct; + struct sk_buff *skb2 = NULL; + int err = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct nfgenmsg *msg = NLMSG_DATA(nlh); + u32 rlen; + + if (msg->nfgen_family != AF_INET) + return -EAFNOSUPPORT; + + if (NFNL_MSG_TYPE(nlh->nlmsg_type) == + IPCTNL_MSG_CT_GET_CTRZERO) { +#ifdef CONFIG_IP_NF_CT_ACCT + if ((*errp = netlink_dump_start(ctnl, skb, nlh, + ctnetlink_dump_table_w, + ctnetlink_done)) != 0) + return -EINVAL; +#else + return -ENOTSUPP; +#endif + } else { + if ((*errp = netlink_dump_start(ctnl, skb, nlh, + ctnetlink_dump_table, + ctnetlink_done)) != 0) + return -EINVAL; + } + + rlen = NLMSG_ALIGN(nlh->nlmsg_len); + if (rlen > skb->len) + rlen = skb->len; + skb_pull(skb, rlen); + return 0; + } + + if (cda[CTA_TUPLE_ORIG-1]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG); + else if (cda[CTA_TUPLE_REPLY-1]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY); + else + return -EINVAL; + + if (err < 0) + return err; + + h = ip_conntrack_find_get(&tuple, NULL); + if (!h) { + DEBUGP("tuple not found in conntrack hash"); + return -ENOENT; + } + DEBUGP("tuple found\n"); + ct = tuplehash_to_ctrack(h); + + err = -ENOMEM; + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); + if (!skb2) { + ip_conntrack_put(ct); + return -ENOMEM; + } + NETLINK_CB(skb2).dst_pid = NETLINK_CB(skb).pid; + + err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq, + IPCTNL_MSG_CT_NEW, 1, ct); + ip_conntrack_put(ct); + if (err <= 0) + goto out; + + err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); + if (err < 0) + goto out; + + DEBUGP("leaving\n"); + return 0; + +out: + if (skb2) + kfree_skb(skb2); + return -1; +} + +static inline int +ctnetlink_change_status(struct ip_conntrack *ct, struct nfattr *cda[]) +{ + unsigned long d, status = *(u_int32_t *)NFA_DATA(cda[CTA_STATUS-1]); + d = ct->status ^ status; + + if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING)) + /* unchangeable */ + return -EINVAL; + + if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY)) + /* SEEN_REPLY bit can only be set */ + return -EINVAL; + + + if (d & IPS_ASSURED && !(status & IPS_ASSURED)) + /* ASSURED bit can only be set */ + return -EINVAL; + + if (cda[CTA_NAT-1]) { +#ifndef CONFIG_IP_NF_NAT_NEEDED + return -EINVAL; +#else + unsigned int hooknum; + struct ip_nat_range range; + + if (ctnetlink_parse_nat(cda, ct, &range) < 0) + return -EINVAL; + + DEBUGP("NAT: %u.%u.%u.%u-%u.%u.%u.%u:%u-%u\n", + NIPQUAD(range.min_ip), NIPQUAD(range.max_ip), + htons(range.min.all), htons(range.max.all)); + + /* This is tricky but it works. ip_nat_setup_info needs the + * hook number as parameter, so let's do the correct + * conversion and run away */ + if (status & IPS_SRC_NAT_DONE) + hooknum = NF_IP_POST_ROUTING; /* IP_NAT_MANIP_SRC */ + else if (status & IPS_DST_NAT_DONE) + hooknum = NF_IP_PRE_ROUTING; /* IP_NAT_MANIP_DST */ + else + return -EINVAL; /* Missing NAT flags */ + + DEBUGP("NAT status: %lu\n", + status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK)); + + if (ip_nat_initialized(ct, hooknum)) + return -EEXIST; + ip_nat_setup_info(ct, &range, hooknum); + + DEBUGP("NAT status after setup_info: %lu\n", + ct->status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK)); +#endif + } + + /* Be careful here, modifying NAT bits can screw up things, + * so don't let users modify them directly if they don't pass + * ip_nat_range. */ + ct->status |= status & ~(IPS_NAT_DONE_MASK | IPS_NAT_MASK); + return 0; +} + + +static inline int +ctnetlink_change_helper(struct ip_conntrack *ct, struct nfattr *cda[]) +{ + struct ip_conntrack_helper *helper; + char *helpname; + int err; + + DEBUGP("entered %s\n", __FUNCTION__); + + /* don't change helper of sibling connections */ + if (ct->master) + return -EINVAL; + + err = ctnetlink_parse_help(cda[CTA_HELP-1], &helpname); + if (err < 0) + return err; + + helper = __ip_conntrack_helper_find_byname(helpname); + if (!helper) { + if (!strcmp(helpname, "")) + helper = NULL; + else + return -EINVAL; + } + + if (ct->helper) { + if (!helper) { + /* we had a helper before ... */ + ip_ct_remove_expectations(ct); + ct->helper = NULL; + } else { + /* need to zero data of old helper */ + memset(&ct->help, 0, sizeof(ct->help)); + } + } + + ct->helper = helper; + + return 0; +} + +static inline int +ctnetlink_change_timeout(struct ip_conntrack *ct, struct nfattr *cda[]) +{ + u_int32_t timeout = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_TIMEOUT-1])); + + if (!del_timer(&ct->timeout)) + return -ETIME; + + ct->timeout.expires = jiffies + timeout * HZ; + add_timer(&ct->timeout); + + return 0; +} + +static int +ctnetlink_change_conntrack(struct ip_conntrack *ct, struct nfattr *cda[]) +{ + int err; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (cda[CTA_HELP-1]) { + err = ctnetlink_change_helper(ct, cda); + if (err < 0) + return err; + } + + if (cda[CTA_TIMEOUT-1]) { + err = ctnetlink_change_timeout(ct, cda); + if (err < 0) + return err; + } + + if (cda[CTA_STATUS-1]) { + err = ctnetlink_change_status(ct, cda); + if (err < 0) + return err; + } + + DEBUGP("all done\n"); + return 0; +} + +static int +ctnetlink_create_conntrack(struct nfattr *cda[], + struct ip_conntrack_tuple *otuple, + struct ip_conntrack_tuple *rtuple) +{ + struct ip_conntrack *ct; + int err = -EINVAL; + + DEBUGP("entered %s\n", __FUNCTION__); + + ct = ip_conntrack_alloc(otuple, rtuple); + if (ct == NULL || IS_ERR(ct)) + return -ENOMEM; + + if (!cda[CTA_TIMEOUT-1]) + goto err; + ct->timeout.expires = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_TIMEOUT-1])); + + ct->timeout.expires = jiffies + ct->timeout.expires * HZ; + ct->status |= IPS_CONFIRMED; + + err = ctnetlink_change_status(ct, cda); + if (err < 0) + goto err; + + ct->helper = ip_conntrack_helper_find_get(rtuple); + + add_timer(&ct->timeout); + ip_conntrack_hash_insert(ct); + + if (ct->helper) + ip_conntrack_helper_put(ct->helper); + + DEBUGP("conntrack with id %u inserted\n", ct->id); + return 0; + +err: + ip_conntrack_free(ct); + return err; +} + +static int +ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) +{ + struct ip_conntrack_tuple otuple, rtuple; + struct ip_conntrack_tuple_hash *h = NULL; + int err = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (cda[CTA_TUPLE_ORIG-1]) { + err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG); + if (err < 0) + return err; + } + + if (cda[CTA_TUPLE_REPLY-1]) { + err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY); + if (err < 0) + return err; + } + + write_lock_bh(&ip_conntrack_lock); + if (cda[CTA_TUPLE_ORIG-1]) + h = __ip_conntrack_find(&otuple, NULL); + else if (cda[CTA_TUPLE_REPLY-1]) + h = __ip_conntrack_find(&rtuple, NULL); + + if (h == NULL) { + write_unlock_bh(&ip_conntrack_lock); + DEBUGP("no such conntrack, create new\n"); + err = -ENOENT; + if (nlh->nlmsg_flags & NLM_F_CREATE) + err = ctnetlink_create_conntrack(cda, &otuple, &rtuple); + return err; + } + /* implicit 'else' */ + + /* we only allow nat config for new conntracks */ + if (cda[CTA_NAT-1]) { + err = -EINVAL; + goto out_unlock; + } + + /* We manipulate the conntrack inside the global conntrack table lock, + * so there's no need to increase the refcount */ + DEBUGP("conntrack found\n"); + err = -EEXIST; + if (!(nlh->nlmsg_flags & NLM_F_EXCL)) + err = ctnetlink_change_conntrack(tuplehash_to_ctrack(h), cda); + +out_unlock: + write_unlock_bh(&ip_conntrack_lock); + return err; +} + +/*********************************************************************** + * EXPECT + ***********************************************************************/ + +static inline int +ctnetlink_exp_dump_tuple(struct sk_buff *skb, + const struct ip_conntrack_tuple *tuple, + enum ctattr_expect type) +{ + struct nfattr *nest_parms = NFA_NEST(skb, type); + + if (ctnetlink_dump_tuples(skb, tuple) < 0) + goto nfattr_failure; + + NFA_NEST_END(skb, nest_parms); + + return 0; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_exp_dump_expect(struct sk_buff *skb, + const struct ip_conntrack_expect *exp) +{ + struct ip_conntrack *master = exp->master; + u_int32_t timeout = htonl((exp->timeout.expires - jiffies) / HZ); + u_int32_t id = htonl(exp->id); + + if (ctnetlink_exp_dump_tuple(skb, &exp->tuple, CTA_EXPECT_TUPLE) < 0) + goto nfattr_failure; + if (ctnetlink_exp_dump_tuple(skb, &exp->mask, CTA_EXPECT_MASK) < 0) + goto nfattr_failure; + if (ctnetlink_exp_dump_tuple(skb, + &master->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + CTA_EXPECT_MASTER) < 0) + goto nfattr_failure; + + NFA_PUT(skb, CTA_EXPECT_TIMEOUT, sizeof(timeout), &timeout); + NFA_PUT(skb, CTA_EXPECT_ID, sizeof(u_int32_t), &id); + + return 0; + +nfattr_failure: + return -1; +} + +static int +ctnetlink_exp_fill_info(struct sk_buff *skb, u32 pid, u32 seq, + int event, + int nowait, + const struct ip_conntrack_expect *exp) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + unsigned char *b; + + b = skb->tail; + + event |= NFNL_SUBSYS_CTNETLINK_EXP << 8; + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg)); + nfmsg = NLMSG_DATA(nlh); + + nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0; + nfmsg->nfgen_family = AF_INET; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (ctnetlink_exp_dump_expect(skb, exp) < 0) + goto nfattr_failure; + + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +nfattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS +static int ctnetlink_expect_event(struct notifier_block *this, + unsigned long events, void *ptr) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + struct ip_conntrack_expect *exp = (struct ip_conntrack_expect *)ptr; + struct sk_buff *skb; + unsigned int type; + unsigned char *b; + int flags = 0; + u16 proto; + + if (events & IPEXP_NEW) { + type = IPCTNL_MSG_EXP_NEW; + flags = NLM_F_CREATE|NLM_F_EXCL; + } else + return NOTIFY_DONE; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); + if (!skb) + return NOTIFY_DONE; + + b = skb->tail; + + type |= NFNL_SUBSYS_CTNETLINK << 8; + nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg)); + nfmsg = NLMSG_DATA(nlh); + + nlh->nlmsg_flags = flags; + nfmsg->nfgen_family = AF_INET; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (ctnetlink_exp_dump_expect(skb, exp) < 0) + goto nfattr_failure; + + nlh->nlmsg_len = skb->tail - b; + proto = exp->tuple.dst.protonum; + nfnetlink_send(skb, 0, NFNLGRP_CONNTRACK_EXP_NEW, 0); + return NOTIFY_DONE; + +nlmsg_failure: +nfattr_failure: + kfree_skb(skb); + return NOTIFY_DONE; +} +#endif + +static int +ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct ip_conntrack_expect *exp = NULL; + struct list_head *i; + u_int32_t *id = (u_int32_t *) &cb->args[0]; + + DEBUGP("entered %s, last id=%llu\n", __FUNCTION__, *id); + + read_lock_bh(&ip_conntrack_lock); + list_for_each_prev(i, &ip_conntrack_expect_list) { + exp = (struct ip_conntrack_expect *) i; + if (exp->id <= *id) + continue; + if (ctnetlink_exp_fill_info(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + IPCTNL_MSG_EXP_NEW, + 1, exp) < 0) + goto out; + *id = exp->id; + } +out: + read_unlock_bh(&ip_conntrack_lock); + + DEBUGP("leaving, last id=%llu\n", *id); + + return skb->len; +} + +static int +ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) +{ + struct ip_conntrack_tuple tuple; + struct ip_conntrack_expect *exp; + struct sk_buff *skb2; + int err = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct nfgenmsg *msg = NLMSG_DATA(nlh); + u32 rlen; + + if (msg->nfgen_family != AF_INET) + return -EAFNOSUPPORT; + + if ((*errp = netlink_dump_start(ctnl, skb, nlh, + ctnetlink_exp_dump_table, + ctnetlink_done)) != 0) + return -EINVAL; + rlen = NLMSG_ALIGN(nlh->nlmsg_len); + if (rlen > skb->len) + rlen = skb->len; + skb_pull(skb, rlen); + return 0; + } + + if (cda[CTA_EXPECT_MASTER-1]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER); + else + return -EINVAL; + + if (err < 0) + return err; + + exp = ip_conntrack_expect_find_get(&tuple); + if (!exp) + return -ENOENT; + + err = -ENOMEM; + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb2) + goto out; + NETLINK_CB(skb2).dst_pid = NETLINK_CB(skb).pid; + + err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).pid, + nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, + 1, exp); + if (err <= 0) + goto out; + + ip_conntrack_expect_put(exp); + + err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); + if (err < 0) + goto free; + + return err; + +out: + ip_conntrack_expect_put(exp); +free: + if (skb2) + kfree_skb(skb2); + return err; +} + +static int +ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) +{ + struct ip_conntrack_expect *exp, *tmp; + struct ip_conntrack_tuple tuple; + struct ip_conntrack_helper *h; + int err; + + if (cda[CTA_EXPECT_TUPLE-1]) { + /* delete a single expect by tuple */ + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE); + if (err < 0) + return err; + + /* bump usage count to 2 */ + exp = ip_conntrack_expect_find_get(&tuple); + if (!exp) + return -ENOENT; + + if (cda[CTA_EXPECT_ID-1]) { + u_int32_t id = + *(u_int32_t *)NFA_DATA(cda[CTA_EXPECT_ID-1]); + if (exp->id != ntohl(id)) { + ip_conntrack_expect_put(exp); + return -ENOENT; + } + } + + /* after list removal, usage count == 1 */ + ip_conntrack_unexpect_related(exp); + /* have to put what we 'get' above. + * after this line usage count == 0 */ + ip_conntrack_expect_put(exp); + } else if (cda[CTA_EXPECT_HELP_NAME-1]) { + char *name = NFA_DATA(cda[CTA_EXPECT_HELP_NAME-1]); + + /* delete all expectations for this helper */ + write_lock_bh(&ip_conntrack_lock); + h = __ip_conntrack_helper_find_byname(name); + if (!h) { + write_unlock_bh(&ip_conntrack_lock); + return -EINVAL; + } + list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, + list) { + if (exp->master->helper == h + && del_timer(&exp->timeout)) + __ip_ct_expect_unlink_destroy(exp); + } + write_unlock(&ip_conntrack_lock); + } else { + /* This basically means we have to flush everything*/ + write_lock_bh(&ip_conntrack_lock); + list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, + list) { + if (del_timer(&exp->timeout)) + __ip_ct_expect_unlink_destroy(exp); + } + write_unlock_bh(&ip_conntrack_lock); + } + + return 0; +} +static int +ctnetlink_change_expect(struct ip_conntrack_expect *x, struct nfattr *cda[]) +{ + return -EOPNOTSUPP; +} + +static int +ctnetlink_create_expect(struct nfattr *cda[]) +{ + struct ip_conntrack_tuple tuple, mask, master_tuple; + struct ip_conntrack_tuple_hash *h = NULL; + struct ip_conntrack_expect *exp; + struct ip_conntrack *ct; + int err = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + /* caller guarantees that those three CTA_EXPECT_* exist */ + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE); + if (err < 0) + return err; + err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK); + if (err < 0) + return err; + err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER); + if (err < 0) + return err; + + /* Look for master conntrack of this expectation */ + h = ip_conntrack_find_get(&master_tuple, NULL); + if (!h) + return -ENOENT; + ct = tuplehash_to_ctrack(h); + + if (!ct->helper) { + /* such conntrack hasn't got any helper, abort */ + err = -EINVAL; + goto out; + } + + exp = ip_conntrack_expect_alloc(ct); + if (!exp) { + err = -ENOMEM; + goto out; + } + + exp->expectfn = NULL; + exp->master = ct; + memcpy(&exp->tuple, &tuple, sizeof(struct ip_conntrack_tuple)); + memcpy(&exp->mask, &mask, sizeof(struct ip_conntrack_tuple)); + + err = ip_conntrack_expect_related(exp); + ip_conntrack_expect_put(exp); + +out: + ip_conntrack_put(tuplehash_to_ctrack(h)); + return err; +} + +static int +ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) +{ + struct ip_conntrack_tuple tuple; + struct ip_conntrack_expect *exp; + int err = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (!cda[CTA_EXPECT_TUPLE-1] + || !cda[CTA_EXPECT_MASK-1] + || !cda[CTA_EXPECT_MASTER-1]) + return -EINVAL; + + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE); + if (err < 0) + return err; + + write_lock_bh(&ip_conntrack_lock); + exp = __ip_conntrack_expect_find(&tuple); + + if (!exp) { + write_unlock_bh(&ip_conntrack_lock); + err = -ENOENT; + if (nlh->nlmsg_flags & NLM_F_CREATE) + err = ctnetlink_create_expect(cda); + return err; + } + + err = -EEXIST; + if (!(nlh->nlmsg_flags & NLM_F_EXCL)) + err = ctnetlink_change_expect(exp, cda); + write_unlock_bh(&ip_conntrack_lock); + + DEBUGP("leaving\n"); + + return err; +} + +#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS +static struct notifier_block ctnl_notifier = { + .notifier_call = ctnetlink_conntrack_event, +}; + +static struct notifier_block ctnl_notifier_exp = { + .notifier_call = ctnetlink_expect_event, +}; +#endif + +static struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = { + [IPCTNL_MSG_CT_NEW] = { .call = ctnetlink_new_conntrack, + .attr_count = CTA_MAX, + .cap_required = CAP_NET_ADMIN }, + [IPCTNL_MSG_CT_GET] = { .call = ctnetlink_get_conntrack, + .attr_count = CTA_MAX, + .cap_required = CAP_NET_ADMIN }, + [IPCTNL_MSG_CT_DELETE] = { .call = ctnetlink_del_conntrack, + .attr_count = CTA_MAX, + .cap_required = CAP_NET_ADMIN }, + [IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack, + .attr_count = CTA_MAX, + .cap_required = CAP_NET_ADMIN }, +}; + +static struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = { + [IPCTNL_MSG_EXP_GET] = { .call = ctnetlink_get_expect, + .attr_count = CTA_EXPECT_MAX, + .cap_required = CAP_NET_ADMIN }, + [IPCTNL_MSG_EXP_NEW] = { .call = ctnetlink_new_expect, + .attr_count = CTA_EXPECT_MAX, + .cap_required = CAP_NET_ADMIN }, + [IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect, + .attr_count = CTA_EXPECT_MAX, + .cap_required = CAP_NET_ADMIN }, +}; + +static struct nfnetlink_subsystem ctnl_subsys = { + .name = "conntrack", + .subsys_id = NFNL_SUBSYS_CTNETLINK, + .cb_count = IPCTNL_MSG_MAX, + .cb = ctnl_cb, +}; + +static struct nfnetlink_subsystem ctnl_exp_subsys = { + .name = "conntrack_expect", + .subsys_id = NFNL_SUBSYS_CTNETLINK_EXP, + .cb_count = IPCTNL_MSG_EXP_MAX, + .cb = ctnl_exp_cb, +}; + +static int __init ctnetlink_init(void) +{ + int ret; + + printk("ctnetlink v%s: registering with nfnetlink.\n", version); + ret = nfnetlink_subsys_register(&ctnl_subsys); + if (ret < 0) { + printk("ctnetlink_init: cannot register with nfnetlink.\n"); + goto err_out; + } + + ret = nfnetlink_subsys_register(&ctnl_exp_subsys); + if (ret < 0) { + printk("ctnetlink_init: cannot register exp with nfnetlink.\n"); + goto err_unreg_subsys; + } + +#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS + ret = ip_conntrack_register_notifier(&ctnl_notifier); + if (ret < 0) { + printk("ctnetlink_init: cannot register notifier.\n"); + goto err_unreg_exp_subsys; + } + + ret = ip_conntrack_expect_register_notifier(&ctnl_notifier_exp); + if (ret < 0) { + printk("ctnetlink_init: cannot expect register notifier.\n"); + goto err_unreg_notifier; + } +#endif + + return 0; + +#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS +err_unreg_notifier: + ip_conntrack_unregister_notifier(&ctnl_notifier); +err_unreg_exp_subsys: + nfnetlink_subsys_unregister(&ctnl_exp_subsys); +#endif +err_unreg_subsys: + nfnetlink_subsys_unregister(&ctnl_subsys); +err_out: + return ret; +} + +static void __exit ctnetlink_exit(void) +{ + printk("ctnetlink: unregistering from nfnetlink.\n"); + +#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS + ip_conntrack_unregister_notifier(&ctnl_notifier_exp); + ip_conntrack_unregister_notifier(&ctnl_notifier); +#endif + + nfnetlink_subsys_unregister(&ctnl_exp_subsys); + nfnetlink_subsys_unregister(&ctnl_subsys); + return; +} + +module_init(ctnetlink_init); +module_exit(ctnetlink_exit); diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c index 602c74db325..838d1d69b36 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c @@ -102,22 +102,24 @@ static int icmp_packet(struct ip_conntrack *ct, ct->timeout.function((unsigned long)ct); } else { atomic_inc(&ct->proto.icmp.count); + ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout); } return NF_ACCEPT; } +static u_int8_t valid_new[] = { + [ICMP_ECHO] = 1, + [ICMP_TIMESTAMP] = 1, + [ICMP_INFO_REQUEST] = 1, + [ICMP_ADDRESS] = 1 +}; + /* Called when a new connection for this protocol found. */ static int icmp_new(struct ip_conntrack *conntrack, const struct sk_buff *skb) { - static u_int8_t valid_new[] - = { [ICMP_ECHO] = 1, - [ICMP_TIMESTAMP] = 1, - [ICMP_INFO_REQUEST] = 1, - [ICMP_ADDRESS] = 1 }; - if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) { /* Can't create a new ICMP `conn' with this. */ @@ -158,11 +160,12 @@ icmp_error_message(struct sk_buff *skb, return NF_ACCEPT; } - innerproto = ip_ct_find_proto(inside->ip.protocol); + innerproto = ip_conntrack_proto_find_get(inside->ip.protocol); dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp) + inside->ip.ihl*4; /* Are they talking about one of our connections? */ if (!ip_ct_get_tuple(&inside->ip, skb, dataoff, &origtuple, innerproto)) { DEBUGP("icmp_error: ! get_tuple p=%u", inside->ip.protocol); + ip_conntrack_proto_put(innerproto); return NF_ACCEPT; } @@ -170,8 +173,10 @@ icmp_error_message(struct sk_buff *skb, been preserved inside the ICMP. */ if (!ip_ct_invert_tuple(&innertuple, &origtuple, innerproto)) { DEBUGP("icmp_error_track: Can't invert tuple\n"); + ip_conntrack_proto_put(innerproto); return NF_ACCEPT; } + ip_conntrack_proto_put(innerproto); *ctinfo = IP_CT_RELATED; @@ -212,7 +217,7 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, icmph = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_ih), &_ih); if (icmph == NULL) { if (LOG_INVALID(IPPROTO_ICMP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_icmp: short packet "); return -NF_ACCEPT; } @@ -226,13 +231,13 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, if (!(u16)csum_fold(skb->csum)) break; if (LOG_INVALID(IPPROTO_ICMP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_icmp: bad HW ICMP checksum "); return -NF_ACCEPT; case CHECKSUM_NONE: if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) { if (LOG_INVALID(IPPROTO_ICMP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_icmp: bad ICMP checksum "); return -NF_ACCEPT; } @@ -249,7 +254,7 @@ checksum_skipped: */ if (icmph->type > NR_ICMP_TYPES) { if (LOG_INVALID(IPPROTO_ICMP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_icmp: invalid ICMP type "); return -NF_ACCEPT; } @@ -265,6 +270,47 @@ checksum_skipped: return icmp_error_message(skb, ctinfo, hooknum); } +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) +static int icmp_tuple_to_nfattr(struct sk_buff *skb, + const struct ip_conntrack_tuple *t) +{ + NFA_PUT(skb, CTA_PROTO_ICMP_ID, sizeof(u_int16_t), + &t->src.u.icmp.id); + NFA_PUT(skb, CTA_PROTO_ICMP_TYPE, sizeof(u_int8_t), + &t->dst.u.icmp.type); + NFA_PUT(skb, CTA_PROTO_ICMP_CODE, sizeof(u_int8_t), + &t->dst.u.icmp.code); + + if (t->dst.u.icmp.type >= sizeof(valid_new) + || !valid_new[t->dst.u.icmp.type]) + return -EINVAL; + + return 0; + +nfattr_failure: + return -1; +} + +static int icmp_nfattr_to_tuple(struct nfattr *tb[], + struct ip_conntrack_tuple *tuple) +{ + if (!tb[CTA_PROTO_ICMP_TYPE-1] + || !tb[CTA_PROTO_ICMP_CODE-1] + || !tb[CTA_PROTO_ICMP_ID-1]) + return -1; + + tuple->dst.u.icmp.type = + *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_TYPE-1]); + tuple->dst.u.icmp.code = + *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_CODE-1]); + tuple->src.u.icmp.id = + *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_ID-1]); + + return 0; +} +#endif + struct ip_conntrack_protocol ip_conntrack_protocol_icmp = { .proto = IPPROTO_ICMP, @@ -276,4 +322,9 @@ struct ip_conntrack_protocol ip_conntrack_protocol_icmp = .packet = icmp_packet, .new = icmp_new, .error = icmp_error, +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) + .tuple_to_nfattr = icmp_tuple_to_nfattr, + .nfattr_to_tuple = icmp_nfattr_to_tuple, +#endif }; diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c index 31d75390bf1..a875f35e576 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c @@ -404,6 +404,8 @@ static int sctp_packet(struct ip_conntrack *conntrack, } conntrack->proto.sctp.state = newconntrack; + if (oldsctpstate != newconntrack) + ip_conntrack_event_cache(IPCT_PROTOINFO, skb); write_unlock_bh(&sctp_lock); } @@ -503,7 +505,12 @@ static struct ip_conntrack_protocol ip_conntrack_protocol_sctp = { .packet = sctp_packet, .new = sctp_new, .destroy = NULL, - .me = THIS_MODULE + .me = THIS_MODULE, +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) + .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr, + .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple, +#endif }; #ifdef CONFIG_SYSCTL diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index 809dfed766d..f23ef1f88c4 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -336,6 +336,23 @@ static int tcp_print_conntrack(struct seq_file *s, return seq_printf(s, "%s ", tcp_conntrack_names[state]); } +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) +static int tcp_to_nfattr(struct sk_buff *skb, struct nfattr *nfa, + const struct ip_conntrack *ct) +{ + read_lock_bh(&tcp_lock); + NFA_PUT(skb, CTA_PROTOINFO_TCP_STATE, sizeof(u_int8_t), + &ct->proto.tcp.state); + read_unlock_bh(&tcp_lock); + + return 0; + +nfattr_failure: + return -1; +} +#endif + static unsigned int get_conntrack_index(const struct tcphdr *tcph) { if (tcph->rst) return TCP_RST_SET; @@ -699,7 +716,7 @@ static int tcp_in_window(struct ip_ct_tcp *state, res = 1; } else { if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_tcp: %s ", before(seq, sender->td_maxend + 1) ? after(end, sender->td_end - receiver->td_maxwin - 1) ? @@ -798,7 +815,7 @@ static int tcp_error(struct sk_buff *skb, sizeof(_tcph), &_tcph); if (th == NULL) { if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_tcp: short packet "); return -NF_ACCEPT; } @@ -806,7 +823,7 @@ static int tcp_error(struct sk_buff *skb, /* Not whole TCP header or malformed packet */ if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_tcp: truncated/malformed packet "); return -NF_ACCEPT; } @@ -823,7 +840,7 @@ static int tcp_error(struct sk_buff *skb, skb->ip_summed == CHECKSUM_HW ? skb->csum : skb_checksum(skb, iph->ihl*4, tcplen, 0))) { if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_tcp: bad TCP checksum "); return -NF_ACCEPT; } @@ -832,7 +849,7 @@ static int tcp_error(struct sk_buff *skb, tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR)); if (!tcp_valid_flags[tcpflags]) { if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_tcp: invalid TCP flag combination "); return -NF_ACCEPT; } @@ -880,8 +897,9 @@ static int tcp_packet(struct ip_conntrack *conntrack, */ write_unlock_bh(&tcp_lock); if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, - "ip_ct_tcp: killing out of sync session "); + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + NULL, "ip_ct_tcp: " + "killing out of sync session "); if (del_timer(&conntrack->timeout)) conntrack->timeout.function((unsigned long) conntrack); @@ -895,7 +913,7 @@ static int tcp_packet(struct ip_conntrack *conntrack, write_unlock_bh(&tcp_lock); if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_tcp: invalid packet ignored "); return NF_ACCEPT; case TCP_CONNTRACK_MAX: @@ -905,7 +923,7 @@ static int tcp_packet(struct ip_conntrack *conntrack, old_state); write_unlock_bh(&tcp_lock); if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_tcp: invalid state "); return -NF_ACCEPT; case TCP_CONNTRACK_SYN_SENT: @@ -926,7 +944,7 @@ static int tcp_packet(struct ip_conntrack *conntrack, write_unlock_bh(&tcp_lock); if (LOG_INVALID(IPPROTO_TCP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, - "ip_ct_tcp: invalid SYN"); + NULL, "ip_ct_tcp: invalid SYN"); return -NF_ACCEPT; } case TCP_CONNTRACK_CLOSE: @@ -973,6 +991,10 @@ static int tcp_packet(struct ip_conntrack *conntrack, ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state]; write_unlock_bh(&tcp_lock); + ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); + if (new_state != old_state) + ip_conntrack_event_cache(IPCT_PROTOINFO, skb); + if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { /* If only reply is a RST, we can consider ourselves not to have an established connection: this is a fairly common @@ -1096,4 +1118,10 @@ struct ip_conntrack_protocol ip_conntrack_protocol_tcp = .packet = tcp_packet, .new = tcp_new, .error = tcp_error, +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) + .to_nfattr = tcp_to_nfattr, + .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr, + .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple, +#endif }; diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c index 8c1eaba098d..f2dcac7c766 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c @@ -73,7 +73,8 @@ static int udp_packet(struct ip_conntrack *conntrack, ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout_stream); /* Also, more likely to be important, and not a probe */ - set_bit(IPS_ASSURED_BIT, &conntrack->status); + if (!test_and_set_bit(IPS_ASSURED_BIT, &conntrack->status)) + ip_conntrack_event_cache(IPCT_STATUS, skb); } else ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout); @@ -97,7 +98,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, hdr = skb_header_pointer(skb, iph->ihl*4, sizeof(_hdr), &_hdr); if (hdr == NULL) { if (LOG_INVALID(IPPROTO_UDP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_udp: short packet "); return -NF_ACCEPT; } @@ -105,7 +106,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, /* Truncated/malformed packets */ if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) { if (LOG_INVALID(IPPROTO_UDP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_udp: truncated/malformed packet "); return -NF_ACCEPT; } @@ -125,7 +126,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, skb->ip_summed == CHECKSUM_HW ? skb->csum : skb_checksum(skb, iph->ihl*4, udplen, 0))) { if (LOG_INVALID(IPPROTO_UDP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_udp: bad UDP checksum "); return -NF_ACCEPT; } @@ -144,4 +145,9 @@ struct ip_conntrack_protocol ip_conntrack_protocol_udp = .packet = udp_packet, .new = udp_new, .error = udp_error, +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) + .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr, + .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple, +#endif }; diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index 61798c46e91..ee5895afd0c 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -5,7 +5,7 @@ */ /* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2002-2005 Netfilter Core Team <coreteam@netfilter.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -147,8 +147,7 @@ static int ct_seq_show(struct seq_file *s, void *v) if (DIRECTION(hash)) return 0; - proto = ip_ct_find_proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL] - .tuple.dst.protonum); + proto = __ip_conntrack_proto_find(conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); IP_NF_ASSERT(proto); if (seq_printf(s, "%-8s %u %ld ", @@ -185,7 +184,7 @@ static int ct_seq_show(struct seq_file *s, void *v) return -ENOSPC; #if defined(CONFIG_IP_NF_CONNTRACK_MARK) - if (seq_printf(s, "mark=%lu ", conntrack->mark)) + if (seq_printf(s, "mark=%u ", conntrack->mark)) return -ENOSPC; #endif @@ -283,7 +282,7 @@ static int exp_seq_show(struct seq_file *s, void *v) seq_printf(s, "proto=%u ", expect->tuple.dst.protonum); print_tuple(s, &expect->tuple, - ip_ct_find_proto(expect->tuple.dst.protonum)); + __ip_conntrack_proto_find(expect->tuple.dst.protonum)); return seq_putc(s, '\n'); } @@ -889,6 +888,7 @@ static int init_or_cleanup(int init) return ret; cleanup: + synchronize_net(); #ifdef CONFIG_SYSCTL unregister_sysctl_table(ip_ct_sysctl_header); cleanup_localinops: @@ -971,6 +971,14 @@ void need_ip_conntrack(void) { } +#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS +EXPORT_SYMBOL_GPL(ip_conntrack_chain); +EXPORT_SYMBOL_GPL(ip_conntrack_expect_chain); +EXPORT_SYMBOL_GPL(ip_conntrack_register_notifier); +EXPORT_SYMBOL_GPL(ip_conntrack_unregister_notifier); +EXPORT_SYMBOL_GPL(__ip_ct_event_cache_init); +EXPORT_PER_CPU_SYMBOL_GPL(ip_conntrack_ecache); +#endif EXPORT_SYMBOL(ip_conntrack_protocol_register); EXPORT_SYMBOL(ip_conntrack_protocol_unregister); EXPORT_SYMBOL(ip_ct_get_tuple); @@ -982,12 +990,16 @@ EXPORT_SYMBOL(ip_conntrack_helper_register); EXPORT_SYMBOL(ip_conntrack_helper_unregister); EXPORT_SYMBOL(ip_ct_iterate_cleanup); EXPORT_SYMBOL(ip_ct_refresh_acct); -EXPORT_SYMBOL(ip_ct_protos); -EXPORT_SYMBOL(ip_ct_find_proto); + EXPORT_SYMBOL(ip_conntrack_expect_alloc); EXPORT_SYMBOL(ip_conntrack_expect_put); +EXPORT_SYMBOL_GPL(ip_conntrack_expect_find_get); EXPORT_SYMBOL(ip_conntrack_expect_related); EXPORT_SYMBOL(ip_conntrack_unexpect_related); +EXPORT_SYMBOL_GPL(ip_conntrack_expect_list); +EXPORT_SYMBOL_GPL(__ip_conntrack_expect_find); +EXPORT_SYMBOL_GPL(__ip_ct_expect_unlink_destroy); + EXPORT_SYMBOL(ip_conntrack_tuple_taken); EXPORT_SYMBOL(ip_ct_gather_frags); EXPORT_SYMBOL(ip_conntrack_htable_size); @@ -995,7 +1007,28 @@ EXPORT_SYMBOL(ip_conntrack_lock); EXPORT_SYMBOL(ip_conntrack_hash); EXPORT_SYMBOL(ip_conntrack_untracked); EXPORT_SYMBOL_GPL(ip_conntrack_find_get); -EXPORT_SYMBOL_GPL(ip_conntrack_put); #ifdef CONFIG_IP_NF_NAT_NEEDED EXPORT_SYMBOL(ip_conntrack_tcp_update); #endif + +EXPORT_SYMBOL_GPL(ip_conntrack_flush); +EXPORT_SYMBOL_GPL(__ip_conntrack_find); + +EXPORT_SYMBOL_GPL(ip_conntrack_alloc); +EXPORT_SYMBOL_GPL(ip_conntrack_free); +EXPORT_SYMBOL_GPL(ip_conntrack_hash_insert); + +EXPORT_SYMBOL_GPL(ip_ct_remove_expectations); + +EXPORT_SYMBOL_GPL(ip_conntrack_helper_find_get); +EXPORT_SYMBOL_GPL(ip_conntrack_helper_put); +EXPORT_SYMBOL_GPL(__ip_conntrack_helper_find_byname); + +EXPORT_SYMBOL_GPL(ip_conntrack_proto_find_get); +EXPORT_SYMBOL_GPL(ip_conntrack_proto_put); +EXPORT_SYMBOL_GPL(__ip_conntrack_proto_find); +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) +EXPORT_SYMBOL_GPL(ip_ct_port_tuple_to_nfattr); +EXPORT_SYMBOL_GPL(ip_ct_port_nfattr_to_tuple); +#endif diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c index 739b6dde1c8..1adedb743f6 100644 --- a/net/ipv4/netfilter/ip_nat_core.c +++ b/net/ipv4/netfilter/ip_nat_core.c @@ -47,8 +47,39 @@ DEFINE_RWLOCK(ip_nat_lock); static unsigned int ip_nat_htable_size; static struct list_head *bysource; + +#define MAX_IP_NAT_PROTO 256 struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO]; +static inline struct ip_nat_protocol * +__ip_nat_proto_find(u_int8_t protonum) +{ + return ip_nat_protos[protonum]; +} + +struct ip_nat_protocol * +ip_nat_proto_find_get(u_int8_t protonum) +{ + struct ip_nat_protocol *p; + + /* we need to disable preemption to make sure 'p' doesn't get + * removed until we've grabbed the reference */ + preempt_disable(); + p = __ip_nat_proto_find(protonum); + if (p) { + if (!try_module_get(p->me)) + p = &ip_nat_unknown_protocol; + } + preempt_enable(); + + return p; +} + +void +ip_nat_proto_put(struct ip_nat_protocol *p) +{ + module_put(p->me); +} /* We keep an extra hash for each conntrack, for fast searching. */ static inline unsigned int @@ -103,7 +134,8 @@ static int in_range(const struct ip_conntrack_tuple *tuple, const struct ip_nat_range *range) { - struct ip_nat_protocol *proto = ip_nat_find_proto(tuple->dst.protonum); + struct ip_nat_protocol *proto = + __ip_nat_proto_find(tuple->dst.protonum); /* If we are supposed to map IPs, then we must be in the range specified, otherwise let this drag us onto a new src IP. */ @@ -216,8 +248,7 @@ get_unique_tuple(struct ip_conntrack_tuple *tuple, struct ip_conntrack *conntrack, enum ip_nat_manip_type maniptype) { - struct ip_nat_protocol *proto - = ip_nat_find_proto(orig_tuple->dst.protonum); + struct ip_nat_protocol *proto; /* 1) If this srcip/proto/src-proto-part is currently mapped, and that same mapping gives a unique tuple within the given @@ -242,14 +273,20 @@ get_unique_tuple(struct ip_conntrack_tuple *tuple, /* 3) The per-protocol part of the manip is made to map into the range to make a unique tuple. */ + proto = ip_nat_proto_find_get(orig_tuple->dst.protonum); + /* Only bother mapping if it's not already in range and unique */ if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) || proto->in_range(tuple, maniptype, &range->min, &range->max)) - && !ip_nat_used_tuple(tuple, conntrack)) + && !ip_nat_used_tuple(tuple, conntrack)) { + ip_nat_proto_put(proto); return; + } /* Last change: get protocol to try to obtain unique tuple. */ proto->unique_tuple(tuple, range, maniptype, conntrack); + + ip_nat_proto_put(proto); } unsigned int @@ -320,17 +357,20 @@ manip_pkt(u_int16_t proto, enum ip_nat_manip_type maniptype) { struct iphdr *iph; + struct ip_nat_protocol *p; - (*pskb)->nfcache |= NFC_ALTERED; - if (!skb_ip_make_writable(pskb, iphdroff + sizeof(*iph))) + if (!skb_make_writable(pskb, iphdroff + sizeof(*iph))) return 0; iph = (void *)(*pskb)->data + iphdroff; /* Manipulate protcol part. */ - if (!ip_nat_find_proto(proto)->manip_pkt(pskb, iphdroff, - target, maniptype)) + p = ip_nat_proto_find_get(proto); + if (!p->manip_pkt(pskb, iphdroff, target, maniptype)) { + ip_nat_proto_put(p); return 0; + } + ip_nat_proto_put(p); iph = (void *)(*pskb)->data + iphdroff; @@ -391,7 +431,7 @@ int icmp_reply_translation(struct sk_buff **pskb, struct ip_conntrack_tuple inner, target; int hdrlen = (*pskb)->nh.iph->ihl * 4; - if (!skb_ip_make_writable(pskb, hdrlen + sizeof(*inside))) + if (!skb_make_writable(pskb, hdrlen + sizeof(*inside))) return 0; inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; @@ -426,7 +466,8 @@ int icmp_reply_translation(struct sk_buff **pskb, if (!ip_ct_get_tuple(&inside->ip, *pskb, (*pskb)->nh.iph->ihl*4 + sizeof(struct icmphdr) + inside->ip.ihl*4, - &inner, ip_ct_find_proto(inside->ip.protocol))) + &inner, + __ip_conntrack_proto_find(inside->ip.protocol))) return 0; /* Change inner back to look like incoming packet. We do the @@ -496,6 +537,49 @@ void ip_nat_protocol_unregister(struct ip_nat_protocol *proto) synchronize_net(); } +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) +int +ip_nat_port_range_to_nfattr(struct sk_buff *skb, + const struct ip_nat_range *range) +{ + NFA_PUT(skb, CTA_PROTONAT_PORT_MIN, sizeof(u_int16_t), + &range->min.tcp.port); + NFA_PUT(skb, CTA_PROTONAT_PORT_MAX, sizeof(u_int16_t), + &range->max.tcp.port); + + return 0; + +nfattr_failure: + return -1; +} + +int +ip_nat_port_nfattr_to_range(struct nfattr *tb[], struct ip_nat_range *range) +{ + int ret = 0; + + /* we have to return whether we actually parsed something or not */ + + if (tb[CTA_PROTONAT_PORT_MIN-1]) { + ret = 1; + range->min.tcp.port = + *(u_int16_t *)NFA_DATA(tb[CTA_PROTONAT_PORT_MIN-1]); + } + + if (!tb[CTA_PROTONAT_PORT_MAX-1]) { + if (ret) + range->max.tcp.port = range->min.tcp.port; + } else { + ret = 1; + range->max.tcp.port = + *(u_int16_t *)NFA_DATA(tb[CTA_PROTONAT_PORT_MAX-1]); + } + + return ret; +} +#endif + int __init ip_nat_init(void) { size_t i; diff --git a/net/ipv4/netfilter/ip_nat_helper.c b/net/ipv4/netfilter/ip_nat_helper.c index 158f34f32c0..d2dd5d31355 100644 --- a/net/ipv4/netfilter/ip_nat_helper.c +++ b/net/ipv4/netfilter/ip_nat_helper.c @@ -168,7 +168,7 @@ ip_nat_mangle_tcp_packet(struct sk_buff **pskb, struct tcphdr *tcph; int datalen; - if (!skb_ip_make_writable(pskb, (*pskb)->len)) + if (!skb_make_writable(pskb, (*pskb)->len)) return 0; if (rep_len > match_len @@ -228,7 +228,7 @@ ip_nat_mangle_udp_packet(struct sk_buff **pskb, match_offset + match_len) return 0; - if (!skb_ip_make_writable(pskb, (*pskb)->len)) + if (!skb_make_writable(pskb, (*pskb)->len)) return 0; if (rep_len > match_len @@ -315,7 +315,7 @@ ip_nat_sack_adjust(struct sk_buff **pskb, optoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct tcphdr); optend = (*pskb)->nh.iph->ihl*4 + tcph->doff*4; - if (!skb_ip_make_writable(pskb, optend)) + if (!skb_make_writable(pskb, optend)) return 0; dir = CTINFO2DIR(ctinfo); @@ -363,7 +363,7 @@ ip_nat_seq_adjust(struct sk_buff **pskb, this_way = &ct->nat.info.seq[dir]; other_way = &ct->nat.info.seq[!dir]; - if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) + if (!skb_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) return 0; tcph = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; diff --git a/net/ipv4/netfilter/ip_nat_proto_icmp.c b/net/ipv4/netfilter/ip_nat_proto_icmp.c index 6596c9ee165..93871904399 100644 --- a/net/ipv4/netfilter/ip_nat_proto_icmp.c +++ b/net/ipv4/netfilter/ip_nat_proto_icmp.c @@ -62,7 +62,7 @@ icmp_manip_pkt(struct sk_buff **pskb, struct icmphdr *hdr; unsigned int hdroff = iphdroff + iph->ihl*4; - if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr))) + if (!skb_make_writable(pskb, hdroff + sizeof(*hdr))) return 0; hdr = (struct icmphdr *)((*pskb)->data + hdroff); @@ -106,11 +106,18 @@ icmp_print_range(char *buffer, const struct ip_nat_range *range) else return 0; } -struct ip_nat_protocol ip_nat_protocol_icmp -= { "ICMP", IPPROTO_ICMP, - icmp_manip_pkt, - icmp_in_range, - icmp_unique_tuple, - icmp_print, - icmp_print_range +struct ip_nat_protocol ip_nat_protocol_icmp = { + .name = "ICMP", + .protonum = IPPROTO_ICMP, + .me = THIS_MODULE, + .manip_pkt = icmp_manip_pkt, + .in_range = icmp_in_range, + .unique_tuple = icmp_unique_tuple, + .print = icmp_print, + .print_range = icmp_print_range, +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) + .range_to_nfattr = ip_nat_port_range_to_nfattr, + .nfattr_to_range = ip_nat_port_nfattr_to_range, +#endif }; diff --git a/net/ipv4/netfilter/ip_nat_proto_tcp.c b/net/ipv4/netfilter/ip_nat_proto_tcp.c index a98e36d2b3c..1d381bf6857 100644 --- a/net/ipv4/netfilter/ip_nat_proto_tcp.c +++ b/net/ipv4/netfilter/ip_nat_proto_tcp.c @@ -12,6 +12,7 @@ #include <linux/ip.h> #include <linux/tcp.h> #include <linux/if.h> +#include <linux/netfilter/nfnetlink_conntrack.h> #include <linux/netfilter_ipv4/ip_nat.h> #include <linux/netfilter_ipv4/ip_nat_rule.h> #include <linux/netfilter_ipv4/ip_nat_protocol.h> @@ -102,7 +103,7 @@ tcp_manip_pkt(struct sk_buff **pskb, if ((*pskb)->len >= hdroff + sizeof(struct tcphdr)) hdrsize = sizeof(struct tcphdr); - if (!skb_ip_make_writable(pskb, hdroff + hdrsize)) + if (!skb_make_writable(pskb, hdroff + hdrsize)) return 0; iph = (struct iphdr *)((*pskb)->data + iphdroff); @@ -169,11 +170,18 @@ tcp_print_range(char *buffer, const struct ip_nat_range *range) else return 0; } -struct ip_nat_protocol ip_nat_protocol_tcp -= { "TCP", IPPROTO_TCP, - tcp_manip_pkt, - tcp_in_range, - tcp_unique_tuple, - tcp_print, - tcp_print_range +struct ip_nat_protocol ip_nat_protocol_tcp = { + .name = "TCP", + .protonum = IPPROTO_TCP, + .me = THIS_MODULE, + .manip_pkt = tcp_manip_pkt, + .in_range = tcp_in_range, + .unique_tuple = tcp_unique_tuple, + .print = tcp_print, + .print_range = tcp_print_range, +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) + .range_to_nfattr = ip_nat_port_range_to_nfattr, + .nfattr_to_range = ip_nat_port_nfattr_to_range, +#endif }; diff --git a/net/ipv4/netfilter/ip_nat_proto_udp.c b/net/ipv4/netfilter/ip_nat_proto_udp.c index 9f66e562566..c4906e1aa24 100644 --- a/net/ipv4/netfilter/ip_nat_proto_udp.c +++ b/net/ipv4/netfilter/ip_nat_proto_udp.c @@ -94,7 +94,7 @@ udp_manip_pkt(struct sk_buff **pskb, u32 oldip, newip; u16 *portptr, newport; - if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr))) + if (!skb_make_writable(pskb, hdroff + sizeof(*hdr))) return 0; iph = (struct iphdr *)((*pskb)->data + iphdroff); @@ -156,11 +156,18 @@ udp_print_range(char *buffer, const struct ip_nat_range *range) else return 0; } -struct ip_nat_protocol ip_nat_protocol_udp -= { "UDP", IPPROTO_UDP, - udp_manip_pkt, - udp_in_range, - udp_unique_tuple, - udp_print, - udp_print_range +struct ip_nat_protocol ip_nat_protocol_udp = { + .name = "UDP", + .protonum = IPPROTO_UDP, + .me = THIS_MODULE, + .manip_pkt = udp_manip_pkt, + .in_range = udp_in_range, + .unique_tuple = udp_unique_tuple, + .print = udp_print, + .print_range = udp_print_range, +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) + .range_to_nfattr = ip_nat_port_range_to_nfattr, + .nfattr_to_range = ip_nat_port_nfattr_to_range, +#endif }; diff --git a/net/ipv4/netfilter/ip_nat_proto_unknown.c b/net/ipv4/netfilter/ip_nat_proto_unknown.c index f5525bd58d1..99bbef56f84 100644 --- a/net/ipv4/netfilter/ip_nat_proto_unknown.c +++ b/net/ipv4/netfilter/ip_nat_proto_unknown.c @@ -61,10 +61,11 @@ unknown_print_range(char *buffer, const struct ip_nat_range *range) } struct ip_nat_protocol ip_nat_unknown_protocol = { - "unknown", 0, - unknown_manip_pkt, - unknown_in_range, - unknown_unique_tuple, - unknown_print, - unknown_print_range + .name = "unknown", + .me = THIS_MODULE, + .manip_pkt = unknown_manip_pkt, + .in_range = unknown_in_range, + .unique_tuple = unknown_unique_tuple, + .print = unknown_print, + .print_range = unknown_print_range }; diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c index 2a48b6e635a..93b2c5111bb 100644 --- a/net/ipv4/netfilter/ip_nat_snmp_basic.c +++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c @@ -1275,7 +1275,7 @@ static int help(struct sk_buff **pskb, return NF_DROP; } - if (!skb_ip_make_writable(pskb, (*pskb)->len)) + if (!skb_make_writable(pskb, (*pskb)->len)) return NF_DROP; spin_lock_bh(&snmp_lock); diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c index 91d5ea1dbbc..89db052add8 100644 --- a/net/ipv4/netfilter/ip_nat_standalone.c +++ b/net/ipv4/netfilter/ip_nat_standalone.c @@ -73,8 +73,6 @@ ip_nat_fn(unsigned int hooknum, IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET))); - (*pskb)->nfcache |= NFC_UNKNOWN; - /* If we had a hardware checksum before, it's now invalid */ if ((*pskb)->ip_summed == CHECKSUM_HW) if (skb_checksum_help(*pskb, (out == NULL))) @@ -396,6 +394,8 @@ module_exit(fini); EXPORT_SYMBOL(ip_nat_setup_info); EXPORT_SYMBOL(ip_nat_protocol_register); EXPORT_SYMBOL(ip_nat_protocol_unregister); +EXPORT_SYMBOL_GPL(ip_nat_proto_find_get); +EXPORT_SYMBOL_GPL(ip_nat_proto_put); EXPORT_SYMBOL(ip_nat_cheat_check); EXPORT_SYMBOL(ip_nat_mangle_tcp_packet); EXPORT_SYMBOL(ip_nat_mangle_udp_packet); diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index c6baa817438..d54f14d926f 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -43,17 +43,10 @@ #define NET_IPQ_QMAX 2088 #define NET_IPQ_QMAX_NAME "ip_queue_maxlen" -struct ipq_rt_info { - __u8 tos; - __u32 daddr; - __u32 saddr; -}; - struct ipq_queue_entry { struct list_head list; struct nf_info *info; struct sk_buff *skb; - struct ipq_rt_info rt_info; }; typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long); @@ -247,8 +240,8 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp) pmsg->packet_id = (unsigned long )entry; pmsg->data_len = data_len; - pmsg->timestamp_sec = entry->skb->stamp.tv_sec; - pmsg->timestamp_usec = entry->skb->stamp.tv_usec; + pmsg->timestamp_sec = skb_tv_base.tv_sec + entry->skb->tstamp.off_sec; + pmsg->timestamp_usec = skb_tv_base.tv_usec + entry->skb->tstamp.off_usec; pmsg->mark = entry->skb->nfmark; pmsg->hook = entry->info->hook; pmsg->hw_protocol = entry->skb->protocol; @@ -287,7 +280,8 @@ nlmsg_failure: } static int -ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data) +ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, + unsigned int queuenum, void *data) { int status = -EINVAL; struct sk_buff *nskb; @@ -305,14 +299,6 @@ ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data) entry->info = info; entry->skb = skb; - if (entry->info->hook == NF_IP_LOCAL_OUT) { - struct iphdr *iph = skb->nh.iph; - - entry->rt_info.tos = iph->tos; - entry->rt_info.daddr = iph->daddr; - entry->rt_info.saddr = iph->saddr; - } - nskb = ipq_build_packet_message(entry, &status); if (nskb == NULL) goto err_out_free; @@ -388,24 +374,11 @@ ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct ipq_queue_entry *e) } skb_put(e->skb, diff); } - if (!skb_ip_make_writable(&e->skb, v->data_len)) + if (!skb_make_writable(&e->skb, v->data_len)) return -ENOMEM; memcpy(e->skb->data, v->payload, v->data_len); e->skb->ip_summed = CHECKSUM_NONE; - e->skb->nfcache |= NFC_ALTERED; - - /* - * Extra routing may needed on local out, as the QUEUE target never - * returns control to the table. - */ - if (e->info->hook == NF_IP_LOCAL_OUT) { - struct iphdr *iph = e->skb->nh.iph; - - if (!(iph->tos == e->rt_info.tos - && iph->daddr == e->rt_info.daddr - && iph->saddr == e->rt_info.saddr)) - return ip_route_me_harder(&e->skb); - } + return 0; } @@ -683,6 +656,11 @@ ipq_get_info(char *buffer, char **start, off_t offset, int length) } #endif /* CONFIG_PROC_FS */ +static struct nf_queue_handler nfqh = { + .name = "ip_queue", + .outfn = &ipq_enqueue_packet, +}; + static int init_or_cleanup(int init) { @@ -693,7 +671,8 @@ init_or_cleanup(int init) goto cleanup; netlink_register_notifier(&ipq_nl_notifier); - ipqnl = netlink_kernel_create(NETLINK_FIREWALL, ipq_rcv_sk); + ipqnl = netlink_kernel_create(NETLINK_FIREWALL, 0, ipq_rcv_sk, + THIS_MODULE); if (ipqnl == NULL) { printk(KERN_ERR "ip_queue: failed to create netlink socket\n"); goto cleanup_netlink_notifier; @@ -710,7 +689,7 @@ init_or_cleanup(int init) register_netdevice_notifier(&ipq_dev_notifier); ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0); - status = nf_register_queue_handler(PF_INET, ipq_enqueue_packet, NULL); + status = nf_register_queue_handler(PF_INET, &nfqh); if (status < 0) { printk(KERN_ERR "ip_queue: failed to register queue handler\n"); goto cleanup_sysctl; @@ -718,7 +697,7 @@ init_or_cleanup(int init) return status; cleanup: - nf_unregister_queue_handler(PF_INET); + nf_unregister_queue_handlers(&nfqh); synchronize_net(); ipq_flush(NF_DROP); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index c88dfcd38c5..eef99a1b5de 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -312,7 +312,6 @@ ipt_do_table(struct sk_buff **pskb, do { IP_NF_ASSERT(e); IP_NF_ASSERT(back); - (*pskb)->nfcache |= e->nfcache; if (ip_packet_match(ip, indev, outdev, &e->ip, offset)) { struct ipt_entry_target *t; @@ -341,8 +340,8 @@ ipt_do_table(struct sk_buff **pskb, back->comefrom); continue; } - if (table_base + v - != (void *)e + e->next_offset) { + if (table_base + v != (void *)e + e->next_offset + && !(e->ip.flags & IPT_F_GOTO)) { /* Save old back ptr in next entry */ struct ipt_entry *next = (void *)e + e->next_offset; diff --git a/net/ipv4/netfilter/ipt_CLASSIFY.c b/net/ipv4/netfilter/ipt_CLASSIFY.c index 9842e6e2318..dab78d8bd49 100644 --- a/net/ipv4/netfilter/ipt_CLASSIFY.c +++ b/net/ipv4/netfilter/ipt_CLASSIFY.c @@ -32,10 +32,8 @@ target(struct sk_buff **pskb, { const struct ipt_classify_target_info *clinfo = targinfo; - if((*pskb)->priority != clinfo->priority) { + if((*pskb)->priority != clinfo->priority) (*pskb)->priority = clinfo->priority; - (*pskb)->nfcache |= NFC_ALTERED; - } return IPT_CONTINUE; } diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 6706d3a1bc4..2d05cafec22 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -367,7 +367,7 @@ target(struct sk_buff **pskb, #ifdef DEBUG_CLUSTERP DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); #endif - DEBUGP("hash=%u ct_hash=%lu ", hash, ct->mark); + DEBUGP("hash=%u ct_hash=%u ", hash, ct->mark); if (!clusterip_responsible(cipinfo->config, hash)) { DEBUGP("not responsible\n"); return NF_DROP; diff --git a/net/ipv4/netfilter/ipt_CONNMARK.c b/net/ipv4/netfilter/ipt_CONNMARK.c index 30ddd3e18eb..13463802133 100644 --- a/net/ipv4/netfilter/ipt_CONNMARK.c +++ b/net/ipv4/netfilter/ipt_CONNMARK.c @@ -40,9 +40,9 @@ target(struct sk_buff **pskb, void *userinfo) { const struct ipt_connmark_target_info *markinfo = targinfo; - unsigned long diff; - unsigned long nfmark; - unsigned long newmark; + u_int32_t diff; + u_int32_t nfmark; + u_int32_t newmark; enum ip_conntrack_info ctinfo; struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo); @@ -61,10 +61,8 @@ target(struct sk_buff **pskb, case IPT_CONNMARK_RESTORE: nfmark = (*pskb)->nfmark; diff = (ct->mark ^ nfmark) & markinfo->mask; - if (diff != 0) { + if (diff != 0) (*pskb)->nfmark = nfmark ^ diff; - (*pskb)->nfcache |= NFC_ALTERED; - } break; } } @@ -94,6 +92,11 @@ checkentry(const char *tablename, } } + if (matchinfo->mark > 0xffffffff || matchinfo->mask > 0xffffffff) { + printk(KERN_WARNING "CONNMARK: Only supports 32bit mark\n"); + return 0; + } + return 1; } diff --git a/net/ipv4/netfilter/ipt_DSCP.c b/net/ipv4/netfilter/ipt_DSCP.c index 3ea4509099f..6e319570a28 100644 --- a/net/ipv4/netfilter/ipt_DSCP.c +++ b/net/ipv4/netfilter/ipt_DSCP.c @@ -39,7 +39,7 @@ target(struct sk_buff **pskb, if (((*pskb)->nh.iph->tos & IPT_DSCP_MASK) != sh_dscp) { u_int16_t diffs[2]; - if (!skb_ip_make_writable(pskb, sizeof(struct iphdr))) + if (!skb_make_writable(pskb, sizeof(struct iphdr))) return NF_DROP; diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF; @@ -51,7 +51,6 @@ target(struct sk_buff **pskb, sizeof(diffs), (*pskb)->nh.iph->check ^ 0xFFFF)); - (*pskb)->nfcache |= NFC_ALTERED; } return IPT_CONTINUE; } diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c index 94a0ce1c1c9..a1319693f64 100644 --- a/net/ipv4/netfilter/ipt_ECN.c +++ b/net/ipv4/netfilter/ipt_ECN.c @@ -31,7 +31,7 @@ set_ect_ip(struct sk_buff **pskb, const struct ipt_ECN_info *einfo) != (einfo->ip_ect & IPT_ECN_IP_MASK)) { u_int16_t diffs[2]; - if (!skb_ip_make_writable(pskb, sizeof(struct iphdr))) + if (!skb_make_writable(pskb, sizeof(struct iphdr))) return 0; diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF; @@ -43,7 +43,6 @@ set_ect_ip(struct sk_buff **pskb, const struct ipt_ECN_info *einfo) sizeof(diffs), (*pskb)->nh.iph->check ^0xFFFF)); - (*pskb)->nfcache |= NFC_ALTERED; } return 1; } @@ -67,7 +66,7 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward) tcph->cwr == einfo->proto.tcp.cwr))) return 1; - if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) + if (!skb_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) return 0; tcph = (void *)(*pskb)->nh.iph + (*pskb)->nh.iph->ihl*4; @@ -87,7 +86,6 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward) tcph->check = csum_fold(csum_partial((char *)diffs, sizeof(diffs), tcph->check^0xFFFF)); - (*pskb)->nfcache |= NFC_ALTERED; return 1; } diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c index ef08733d26d..92ed050fac6 100644 --- a/net/ipv4/netfilter/ipt_LOG.c +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -27,10 +27,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("iptables syslog logging module"); -static unsigned int nflog = 1; -module_param(nflog, int, 0400); -MODULE_PARM_DESC(nflog, "register as internal netfilter logging module"); - #if 0 #define DEBUGP printk #else @@ -41,11 +37,17 @@ MODULE_PARM_DESC(nflog, "register as internal netfilter logging module"); static DEFINE_SPINLOCK(log_lock); /* One level of recursion won't kill us */ -static void dump_packet(const struct ipt_log_info *info, +static void dump_packet(const struct nf_loginfo *info, const struct sk_buff *skb, unsigned int iphoff) { struct iphdr _iph, *ih; + unsigned int logflags; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + else + logflags = NF_LOG_MASK; ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); if (ih == NULL) { @@ -76,7 +78,7 @@ static void dump_packet(const struct ipt_log_info *info, if (ntohs(ih->frag_off) & IP_OFFSET) printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); - if ((info->logflags & IPT_LOG_IPOPT) + if ((logflags & IPT_LOG_IPOPT) && ih->ihl * 4 > sizeof(struct iphdr)) { unsigned char _opt[4 * 15 - sizeof(struct iphdr)], *op; unsigned int i, optsize; @@ -119,7 +121,7 @@ static void dump_packet(const struct ipt_log_info *info, printk("SPT=%u DPT=%u ", ntohs(th->source), ntohs(th->dest)); /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ - if (info->logflags & IPT_LOG_TCPSEQ) + if (logflags & IPT_LOG_TCPSEQ) printk("SEQ=%u ACK=%u ", ntohl(th->seq), ntohl(th->ack_seq)); /* Max length: 13 "WINDOW=65535 " */ @@ -146,7 +148,7 @@ static void dump_packet(const struct ipt_log_info *info, /* Max length: 11 "URGP=65535 " */ printk("URGP=%u ", ntohs(th->urg_ptr)); - if ((info->logflags & IPT_LOG_TCPOPT) + if ((logflags & IPT_LOG_TCPOPT) && th->doff * 4 > sizeof(struct tcphdr)) { unsigned char _opt[4 * 15 - sizeof(struct tcphdr)]; unsigned char *op; @@ -328,7 +330,7 @@ static void dump_packet(const struct ipt_log_info *info, } /* Max length: 15 "UID=4294967295 " */ - if ((info->logflags & IPT_LOG_UID) && !iphoff && skb->sk) { + if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) { read_lock_bh(&skb->sk->sk_callback_lock); if (skb->sk->sk_socket && skb->sk->sk_socket->file) printk("UID=%u ", skb->sk->sk_socket->file->f_uid); @@ -349,19 +351,31 @@ static void dump_packet(const struct ipt_log_info *info, /* maxlen = 230+ 91 + 230 + 252 = 803 */ } +struct nf_loginfo default_loginfo = { + .type = NF_LOG_TYPE_LOG, + .u = { + .log = { + .level = 0, + .logflags = NF_LOG_MASK, + }, + }, +}; + static void -ipt_log_packet(unsigned int hooknum, +ipt_log_packet(unsigned int pf, + unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, - const struct ipt_log_info *loginfo, - const char *level_string, + const struct nf_loginfo *loginfo, const char *prefix) { + if (!loginfo) + loginfo = &default_loginfo; + spin_lock_bh(&log_lock); - printk(level_string); - printk("%sIN=%s OUT=%s ", - prefix == NULL ? loginfo->prefix : prefix, + printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, + prefix, in ? in->name : "", out ? out->name : ""); #ifdef CONFIG_BRIDGE_NETFILTER @@ -405,28 +419,15 @@ ipt_log_target(struct sk_buff **pskb, void *userinfo) { const struct ipt_log_info *loginfo = targinfo; - char level_string[4] = "< >"; + struct nf_loginfo li; - level_string[1] = '0' + (loginfo->level % 8); - ipt_log_packet(hooknum, *pskb, in, out, loginfo, level_string, NULL); + li.type = NF_LOG_TYPE_LOG; + li.u.log.level = loginfo->level; + li.u.log.logflags = loginfo->logflags; - return IPT_CONTINUE; -} + nf_log_packet(PF_INET, hooknum, *pskb, in, out, &li, loginfo->prefix); -static void -ipt_logfn(unsigned int hooknum, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const char *prefix) -{ - struct ipt_log_info loginfo = { - .level = 0, - .logflags = IPT_LOG_MASK, - .prefix = "" - }; - - ipt_log_packet(hooknum, skb, in, out, &loginfo, KERN_WARNING, prefix); + return IPT_CONTINUE; } static int ipt_log_checkentry(const char *tablename, @@ -464,20 +465,29 @@ static struct ipt_target ipt_log_reg = { .me = THIS_MODULE, }; +static struct nf_logger ipt_log_logger ={ + .name = "ipt_LOG", + .logfn = &ipt_log_packet, + .me = THIS_MODULE, +}; + static int __init init(void) { if (ipt_register_target(&ipt_log_reg)) return -EINVAL; - if (nflog) - nf_log_register(PF_INET, &ipt_logfn); + if (nf_log_register(PF_INET, &ipt_log_logger) < 0) { + printk(KERN_WARNING "ipt_LOG: not logging via system console " + "since somebody else already registered for PF_INET\n"); + /* we cannot make module load fail here, since otherwise + * iptables userspace would abort */ + } return 0; } static void __exit fini(void) { - if (nflog) - nf_log_unregister(PF_INET, &ipt_logfn); + nf_log_unregister_logger(&ipt_log_logger); ipt_unregister_target(&ipt_log_reg); } diff --git a/net/ipv4/netfilter/ipt_MARK.c b/net/ipv4/netfilter/ipt_MARK.c index 33c6f9b63b8..52b4f2c296b 100644 --- a/net/ipv4/netfilter/ipt_MARK.c +++ b/net/ipv4/netfilter/ipt_MARK.c @@ -29,10 +29,9 @@ target_v0(struct sk_buff **pskb, { const struct ipt_mark_target_info *markinfo = targinfo; - if((*pskb)->nfmark != markinfo->mark) { + if((*pskb)->nfmark != markinfo->mark) (*pskb)->nfmark = markinfo->mark; - (*pskb)->nfcache |= NFC_ALTERED; - } + return IPT_CONTINUE; } @@ -61,10 +60,9 @@ target_v1(struct sk_buff **pskb, break; } - if((*pskb)->nfmark != mark) { + if((*pskb)->nfmark != mark) (*pskb)->nfmark = mark; - (*pskb)->nfcache |= NFC_ALTERED; - } + return IPT_CONTINUE; } @@ -76,6 +74,8 @@ checkentry_v0(const char *tablename, unsigned int targinfosize, unsigned int hook_mask) { + struct ipt_mark_target_info *markinfo = targinfo; + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_mark_target_info))) { printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n", targinfosize, @@ -88,6 +88,11 @@ checkentry_v0(const char *tablename, return 0; } + if (markinfo->mark > 0xffffffff) { + printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n"); + return 0; + } + return 1; } @@ -120,6 +125,11 @@ checkentry_v1(const char *tablename, return 0; } + if (markinfo->mark > 0xffffffff) { + printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n"); + return 0; + } + return 1; } diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 91e74502c3d..2f3e181c8e9 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -86,11 +86,6 @@ masquerade_target(struct sk_buff **pskb, IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING); - /* FIXME: For the moment, don't do local packets, breaks - testsuite for 2.3.49 --RR */ - if ((*pskb)->sk) - return NF_ACCEPT; - ct = ip_conntrack_get(*pskb, &ctinfo); IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c index 06254b29d03..e6e7b609536 100644 --- a/net/ipv4/netfilter/ipt_NETMAP.c +++ b/net/ipv4/netfilter/ipt_NETMAP.c @@ -46,7 +46,8 @@ check(const char *tablename, DEBUGP(MODULENAME":check: size %u.\n", targinfosize); return 0; } - if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_POST_ROUTING))) { + if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_POST_ROUTING) | + (1 << NF_IP_LOCAL_OUT))) { DEBUGP(MODULENAME":check: bad hooks %x.\n", hook_mask); return 0; } @@ -76,12 +77,13 @@ target(struct sk_buff **pskb, struct ip_nat_range newrange; IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING - || hooknum == NF_IP_POST_ROUTING); + || hooknum == NF_IP_POST_ROUTING + || hooknum == NF_IP_LOCAL_OUT); ct = ip_conntrack_get(*pskb, &ctinfo); netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip); - if (hooknum == NF_IP_PRE_ROUTING) + if (hooknum == NF_IP_PRE_ROUTING || hooknum == NF_IP_LOCAL_OUT) new_ip = (*pskb)->nh.iph->daddr & ~netmask; else new_ip = (*pskb)->nh.iph->saddr & ~netmask; diff --git a/net/ipv4/netfilter/ipt_NFQUEUE.c b/net/ipv4/netfilter/ipt_NFQUEUE.c new file mode 100644 index 00000000000..3cedc9be880 --- /dev/null +++ b/net/ipv4/netfilter/ipt_NFQUEUE.c @@ -0,0 +1,70 @@ +/* iptables module for using new netfilter netlink queue + * + * (C) 2005 by Harald Welte <laforge@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include <linux/module.h> +#include <linux/skbuff.h> + +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_NFQUEUE.h> + +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_DESCRIPTION("iptables NFQUEUE target"); +MODULE_LICENSE("GPL"); + +static unsigned int +target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ipt_NFQ_info *tinfo = targinfo; + + return NF_QUEUE_NR(tinfo->queuenum); +} + +static int +checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_NFQ_info))) { + printk(KERN_WARNING "NFQUEUE: targinfosize %u != %Zu\n", + targinfosize, + IPT_ALIGN(sizeof(struct ipt_NFQ_info))); + return 0; + } + + return 1; +} + +static struct ipt_target ipt_NFQ_reg = { + .name = "NFQUEUE", + .target = target, + .checkentry = checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_target(&ipt_NFQ_reg); +} + +static void __exit fini(void) +{ + ipt_unregister_target(&ipt_NFQ_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 91569644602..f115a84a4ac 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -156,7 +156,6 @@ static void send_reset(struct sk_buff *oldskb, int hook) /* This packet will not be the same as the other: clear nf fields */ nf_reset(nskb); - nskb->nfcache = 0; nskb->nfmark = 0; #ifdef CONFIG_BRIDGE_NETFILTER nf_bridge_put(nskb->nf_bridge); diff --git a/net/ipv4/netfilter/ipt_TCPMSS.c b/net/ipv4/netfilter/ipt_TCPMSS.c index 7b84a254440..8db70d6908c 100644 --- a/net/ipv4/netfilter/ipt_TCPMSS.c +++ b/net/ipv4/netfilter/ipt_TCPMSS.c @@ -58,7 +58,7 @@ ipt_tcpmss_target(struct sk_buff **pskb, unsigned int i; u_int8_t *opt; - if (!skb_ip_make_writable(pskb, (*pskb)->len)) + if (!skb_make_writable(pskb, (*pskb)->len)) return NF_DROP; if ((*pskb)->ip_summed == CHECKSUM_HW && @@ -190,7 +190,6 @@ ipt_tcpmss_target(struct sk_buff **pskb, newmss); retmodified: - (*pskb)->nfcache |= NFC_UNKNOWN | NFC_ALTERED; return IPT_CONTINUE; } diff --git a/net/ipv4/netfilter/ipt_TOS.c b/net/ipv4/netfilter/ipt_TOS.c index 85c70d240f8..deadb36d442 100644 --- a/net/ipv4/netfilter/ipt_TOS.c +++ b/net/ipv4/netfilter/ipt_TOS.c @@ -33,7 +33,7 @@ target(struct sk_buff **pskb, if (((*pskb)->nh.iph->tos & IPTOS_TOS_MASK) != tosinfo->tos) { u_int16_t diffs[2]; - if (!skb_ip_make_writable(pskb, sizeof(struct iphdr))) + if (!skb_make_writable(pskb, sizeof(struct iphdr))) return NF_DROP; diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF; @@ -46,7 +46,6 @@ target(struct sk_buff **pskb, sizeof(diffs), (*pskb)->nh.iph->check ^0xFFFF)); - (*pskb)->nfcache |= NFC_ALTERED; } return IPT_CONTINUE; } diff --git a/net/ipv4/netfilter/ipt_TTL.c b/net/ipv4/netfilter/ipt_TTL.c new file mode 100644 index 00000000000..b9ae6a9382f --- /dev/null +++ b/net/ipv4/netfilter/ipt_TTL.c @@ -0,0 +1,119 @@ +/* TTL modification target for IP tables + * (C) 2000,2005 by Harald Welte <laforge@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <net/checksum.h> + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_TTL.h> + +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_DESCRIPTION("IP tables TTL modification module"); +MODULE_LICENSE("GPL"); + +static unsigned int +ipt_ttl_target(struct sk_buff **pskb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const void *targinfo, void *userinfo) +{ + struct iphdr *iph; + const struct ipt_TTL_info *info = targinfo; + u_int16_t diffs[2]; + int new_ttl; + + if (!skb_make_writable(pskb, (*pskb)->len)) + return NF_DROP; + + iph = (*pskb)->nh.iph; + + switch (info->mode) { + case IPT_TTL_SET: + new_ttl = info->ttl; + break; + case IPT_TTL_INC: + new_ttl = iph->ttl + info->ttl; + if (new_ttl > 255) + new_ttl = 255; + break; + case IPT_TTL_DEC: + new_ttl = iph->ttl - info->ttl; + if (new_ttl < 0) + new_ttl = 0; + break; + default: + new_ttl = iph->ttl; + break; + } + + if (new_ttl != iph->ttl) { + diffs[0] = htons(((unsigned)iph->ttl) << 8) ^ 0xFFFF; + iph->ttl = new_ttl; + diffs[1] = htons(((unsigned)iph->ttl) << 8); + iph->check = csum_fold(csum_partial((char *)diffs, + sizeof(diffs), + iph->check^0xFFFF)); + } + + return IPT_CONTINUE; +} + +static int ipt_ttl_checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + struct ipt_TTL_info *info = targinfo; + + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_TTL_info))) { + printk(KERN_WARNING "ipt_TTL: targinfosize %u != %Zu\n", + targinfosize, + IPT_ALIGN(sizeof(struct ipt_TTL_info))); + return 0; + } + + if (strcmp(tablename, "mangle")) { + printk(KERN_WARNING "ipt_TTL: can only be called from " + "\"mangle\" table, not \"%s\"\n", tablename); + return 0; + } + + if (info->mode > IPT_TTL_MAXMODE) { + printk(KERN_WARNING "ipt_TTL: invalid or unknown Mode %u\n", + info->mode); + return 0; + } + + if ((info->mode != IPT_TTL_SET) && (info->ttl == 0)) + return 0; + + return 1; +} + +static struct ipt_target ipt_TTL = { + .name = "TTL", + .target = ipt_ttl_target, + .checkentry = ipt_ttl_checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_target(&ipt_TTL); +} + +static void __exit fini(void) +{ + ipt_unregister_target(&ipt_TTL); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index 52a0076302a..e2c14f3cb2f 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -62,6 +62,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>"); MODULE_DESCRIPTION("iptables userspace logging module"); +MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NFLOG); #define ULOG_NL_EVENT 111 /* Harald's favorite number */ #define ULOG_MAXNLGROUPS 32 /* numer of nlgroups */ @@ -115,10 +116,10 @@ static void ulog_send(unsigned int nlgroupnum) if (ub->qlen > 1) ub->lastnlh->nlmsg_type = NLMSG_DONE; - NETLINK_CB(ub->skb).dst_groups = (1 << nlgroupnum); - DEBUGP("ipt_ULOG: throwing %d packets to netlink mask %u\n", - ub->qlen, nlgroupnum); - netlink_broadcast(nflognl, ub->skb, 0, (1 << nlgroupnum), GFP_ATOMIC); + NETLINK_CB(ub->skb).dst_group = nlgroupnum + 1; + DEBUGP("ipt_ULOG: throwing %d packets to netlink group %u\n", + ub->qlen, nlgroupnum + 1); + netlink_broadcast(nflognl, ub->skb, 0, nlgroupnum + 1, GFP_ATOMIC); ub->qlen = 0; ub->skb = NULL; @@ -219,13 +220,13 @@ static void ipt_ulog_packet(unsigned int hooknum, pm = NLMSG_DATA(nlh); /* We might not have a timestamp, get one */ - if (skb->stamp.tv_sec == 0) - do_gettimeofday((struct timeval *)&skb->stamp); + if (skb->tstamp.off_sec == 0) + __net_timestamp((struct sk_buff *)skb); /* copy hook, prefix, timestamp, payload, etc. */ pm->data_len = copy_len; - pm->timestamp_sec = skb->stamp.tv_sec; - pm->timestamp_usec = skb->stamp.tv_usec; + pm->timestamp_sec = skb_tv_base.tv_sec + skb->tstamp.off_sec; + pm->timestamp_usec = skb_tv_base.tv_usec + skb->tstamp.off_usec; pm->mark = skb->nfmark; pm->hook = hooknum; if (prefix != NULL) @@ -303,18 +304,27 @@ static unsigned int ipt_ulog_target(struct sk_buff **pskb, return IPT_CONTINUE; } -static void ipt_logfn(unsigned int hooknum, +static void ipt_logfn(unsigned int pf, + unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, + const struct nf_loginfo *li, const char *prefix) { - struct ipt_ulog_info loginfo = { - .nl_group = ULOG_DEFAULT_NLGROUP, - .copy_range = 0, - .qthreshold = ULOG_DEFAULT_QTHRESHOLD, - .prefix = "" - }; + struct ipt_ulog_info loginfo; + + if (!li || li->type != NF_LOG_TYPE_ULOG) { + loginfo.nl_group = ULOG_DEFAULT_NLGROUP; + loginfo.copy_range = 0; + loginfo.qthreshold = ULOG_DEFAULT_QTHRESHOLD; + loginfo.prefix[0] = '\0'; + } else { + loginfo.nl_group = li->u.ulog.group; + loginfo.copy_range = li->u.ulog.copy_len; + loginfo.qthreshold = li->u.ulog.qthreshold; + strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix)); + } ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix); } @@ -354,6 +364,12 @@ static struct ipt_target ipt_ulog_reg = { .me = THIS_MODULE, }; +static struct nf_logger ipt_ulog_logger = { + .name = "ipt_ULOG", + .logfn = &ipt_logfn, + .me = THIS_MODULE, +}; + static int __init init(void) { int i; @@ -372,7 +388,8 @@ static int __init init(void) ulog_buffers[i].timer.data = i; } - nflognl = netlink_kernel_create(NETLINK_NFLOG, NULL); + nflognl = netlink_kernel_create(NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL, + THIS_MODULE); if (!nflognl) return -ENOMEM; @@ -381,7 +398,7 @@ static int __init init(void) return -EINVAL; } if (nflog) - nf_log_register(PF_INET, &ipt_logfn); + nf_log_register(PF_INET, &ipt_ulog_logger); return 0; } @@ -394,7 +411,7 @@ static void __exit fini(void) DEBUGP("ipt_ULOG: cleanup_module\n"); if (nflog) - nf_log_unregister(PF_INET, &ipt_logfn); + nf_log_unregister_logger(&ipt_ulog_logger); ipt_unregister_target(&ipt_ulog_reg); sock_release(nflognl->sk_socket); diff --git a/net/ipv4/netfilter/ipt_connbytes.c b/net/ipv4/netfilter/ipt_connbytes.c new file mode 100644 index 00000000000..df4a42c6da2 --- /dev/null +++ b/net/ipv4/netfilter/ipt_connbytes.c @@ -0,0 +1,162 @@ +/* Kernel module to match connection tracking byte counter. + * GPL (C) 2002 Martin Devera (devik@cdi.cz). + * + * 2004-07-20 Harald Welte <laforge@netfilter.org> + * - reimplemented to use per-connection accounting counters + * - add functionality to match number of packets + * - add functionality to match average packet size + * - add support to match directions seperately + * + */ +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netfilter_ipv4/ip_conntrack.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_connbytes.h> + +#include <asm/div64.h> +#include <asm/bitops.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_DESCRIPTION("iptables match for matching number of pkts/bytes per connection"); + +/* 64bit divisor, dividend and result. dynamic precision */ +static u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor) +{ + u_int32_t d = divisor; + + if (divisor > 0xffffffffULL) { + unsigned int shift = fls(divisor >> 32); + + d = divisor >> shift; + dividend >>= shift; + } + + do_div(dividend, d); + return dividend; +} + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_connbytes_info *sinfo = matchinfo; + enum ip_conntrack_info ctinfo; + struct ip_conntrack *ct; + u_int64_t what = 0; /* initialize to make gcc happy */ + + if (!(ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo))) + return 0; /* no match */ + + switch (sinfo->what) { + case IPT_CONNBYTES_PKTS: + switch (sinfo->direction) { + case IPT_CONNBYTES_DIR_ORIGINAL: + what = ct->counters[IP_CT_DIR_ORIGINAL].packets; + break; + case IPT_CONNBYTES_DIR_REPLY: + what = ct->counters[IP_CT_DIR_REPLY].packets; + break; + case IPT_CONNBYTES_DIR_BOTH: + what = ct->counters[IP_CT_DIR_ORIGINAL].packets; + what += ct->counters[IP_CT_DIR_REPLY].packets; + break; + } + break; + case IPT_CONNBYTES_BYTES: + switch (sinfo->direction) { + case IPT_CONNBYTES_DIR_ORIGINAL: + what = ct->counters[IP_CT_DIR_ORIGINAL].bytes; + break; + case IPT_CONNBYTES_DIR_REPLY: + what = ct->counters[IP_CT_DIR_REPLY].bytes; + break; + case IPT_CONNBYTES_DIR_BOTH: + what = ct->counters[IP_CT_DIR_ORIGINAL].bytes; + what += ct->counters[IP_CT_DIR_REPLY].bytes; + break; + } + break; + case IPT_CONNBYTES_AVGPKT: + switch (sinfo->direction) { + case IPT_CONNBYTES_DIR_ORIGINAL: + what = div64_64(ct->counters[IP_CT_DIR_ORIGINAL].bytes, + ct->counters[IP_CT_DIR_ORIGINAL].packets); + break; + case IPT_CONNBYTES_DIR_REPLY: + what = div64_64(ct->counters[IP_CT_DIR_REPLY].bytes, + ct->counters[IP_CT_DIR_REPLY].packets); + break; + case IPT_CONNBYTES_DIR_BOTH: + { + u_int64_t bytes; + u_int64_t pkts; + bytes = ct->counters[IP_CT_DIR_ORIGINAL].bytes + + ct->counters[IP_CT_DIR_REPLY].bytes; + pkts = ct->counters[IP_CT_DIR_ORIGINAL].packets+ + ct->counters[IP_CT_DIR_REPLY].packets; + + /* FIXME_THEORETICAL: what to do if sum + * overflows ? */ + + what = div64_64(bytes, pkts); + } + break; + } + break; + } + + if (sinfo->count.to) + return (what <= sinfo->count.to && what >= sinfo->count.from); + else + return (what >= sinfo->count.from); +} + +static int check(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + const struct ipt_connbytes_info *sinfo = matchinfo; + + if (matchsize != IPT_ALIGN(sizeof(struct ipt_connbytes_info))) + return 0; + + if (sinfo->what != IPT_CONNBYTES_PKTS && + sinfo->what != IPT_CONNBYTES_BYTES && + sinfo->what != IPT_CONNBYTES_AVGPKT) + return 0; + + if (sinfo->direction != IPT_CONNBYTES_DIR_ORIGINAL && + sinfo->direction != IPT_CONNBYTES_DIR_REPLY && + sinfo->direction != IPT_CONNBYTES_DIR_BOTH) + return 0; + + return 1; +} + +static struct ipt_match state_match = { + .name = "connbytes", + .match = &match, + .checkentry = &check, + .me = THIS_MODULE +}; + +static int __init init(void) +{ + return ipt_register_match(&state_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&state_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_connmark.c b/net/ipv4/netfilter/ipt_connmark.c index 2706f96cea5..bf8de47ce00 100644 --- a/net/ipv4/netfilter/ipt_connmark.c +++ b/net/ipv4/netfilter/ipt_connmark.c @@ -54,9 +54,16 @@ checkentry(const char *tablename, unsigned int matchsize, unsigned int hook_mask) { + struct ipt_connmark_info *cm = + (struct ipt_connmark_info *)matchinfo; if (matchsize != IPT_ALIGN(sizeof(struct ipt_connmark_info))) return 0; + if (cm->mark > 0xffffffff || cm->mask > 0xffffffff) { + printk(KERN_WARNING "connmark: only support 32bit mark\n"); + return 0; + } + return 1; } diff --git a/net/ipv4/netfilter/ipt_dccp.c b/net/ipv4/netfilter/ipt_dccp.c new file mode 100644 index 00000000000..ad3278bba6c --- /dev/null +++ b/net/ipv4/netfilter/ipt_dccp.c @@ -0,0 +1,176 @@ +/* + * iptables module for DCCP protocol header matching + * + * (C) 2005 by Harald Welte <laforge@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/spinlock.h> +#include <net/ip.h> +#include <linux/dccp.h> + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_dccp.h> + +#define DCCHECK(cond, option, flag, invflag) (!((flag) & (option)) \ + || (!!((invflag) & (option)) ^ (cond))) + +static unsigned char *dccp_optbuf; +static DEFINE_SPINLOCK(dccp_buflock); + +static inline int +dccp_find_option(u_int8_t option, + const struct sk_buff *skb, + const struct dccp_hdr *dh, + int *hotdrop) +{ + /* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */ + unsigned char *op; + unsigned int optoff = __dccp_hdr_len(dh); + unsigned int optlen = dh->dccph_doff*4 - __dccp_hdr_len(dh); + unsigned int i; + + if (dh->dccph_doff * 4 < __dccp_hdr_len(dh)) { + *hotdrop = 1; + return 0; + } + + if (!optlen) + return 0; + + spin_lock_bh(&dccp_buflock); + op = skb_header_pointer(skb, + skb->nh.iph->ihl*4 + optoff, + optlen, dccp_optbuf); + if (op == NULL) { + /* If we don't have the whole header, drop packet. */ + spin_unlock_bh(&dccp_buflock); + *hotdrop = 1; + return 0; + } + + for (i = 0; i < optlen; ) { + if (op[i] == option) { + spin_unlock_bh(&dccp_buflock); + return 1; + } + + if (op[i] < 2) + i++; + else + i += op[i+1]?:1; + } + + spin_unlock_bh(&dccp_buflock); + return 0; +} + + +static inline int +match_types(const struct dccp_hdr *dh, u_int16_t typemask) +{ + return (typemask & (1 << dh->dccph_type)); +} + +static inline int +match_option(u_int8_t option, const struct sk_buff *skb, + const struct dccp_hdr *dh, int *hotdrop) +{ + return dccp_find_option(option, skb, dh, hotdrop); +} + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_dccp_info *info = + (const struct ipt_dccp_info *)matchinfo; + struct dccp_hdr _dh, *dh; + + if (offset) + return 0; + + dh = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_dh), &_dh); + if (dh == NULL) { + *hotdrop = 1; + return 0; + } + + return DCCHECK(((ntohs(dh->dccph_sport) >= info->spts[0]) + && (ntohs(dh->dccph_sport) <= info->spts[1])), + IPT_DCCP_SRC_PORTS, info->flags, info->invflags) + && DCCHECK(((ntohs(dh->dccph_dport) >= info->dpts[0]) + && (ntohs(dh->dccph_dport) <= info->dpts[1])), + IPT_DCCP_DEST_PORTS, info->flags, info->invflags) + && DCCHECK(match_types(dh, info->typemask), + IPT_DCCP_TYPE, info->flags, info->invflags) + && DCCHECK(match_option(info->option, skb, dh, hotdrop), + IPT_DCCP_OPTION, info->flags, info->invflags); +} + +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + const struct ipt_dccp_info *info; + + info = (const struct ipt_dccp_info *)matchinfo; + + return ip->proto == IPPROTO_DCCP + && !(ip->invflags & IPT_INV_PROTO) + && matchsize == IPT_ALIGN(sizeof(struct ipt_dccp_info)) + && !(info->flags & ~IPT_DCCP_VALID_FLAGS) + && !(info->invflags & ~IPT_DCCP_VALID_FLAGS) + && !(info->invflags & ~info->flags); +} + +static struct ipt_match dccp_match = +{ + .name = "dccp", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + int ret; + + /* doff is 8 bits, so the maximum option size is (4*256). Don't put + * this in BSS since DaveM is worried about locked TLB's for kernel + * BSS. */ + dccp_optbuf = kmalloc(256 * 4, GFP_KERNEL); + if (!dccp_optbuf) + return -ENOMEM; + ret = ipt_register_match(&dccp_match); + if (ret) + kfree(dccp_optbuf); + + return ret; +} + +static void __exit fini(void) +{ + ipt_unregister_match(&dccp_match); + kfree(dccp_optbuf); +} + +module_init(init); +module_exit(fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_DESCRIPTION("Match for DCCP protocol packets"); + diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c index 564b49bfebc..2dd1cccbdab 100644 --- a/net/ipv4/netfilter/ipt_hashlimit.c +++ b/net/ipv4/netfilter/ipt_hashlimit.c @@ -94,7 +94,7 @@ struct ipt_hashlimit_htable { static DEFINE_SPINLOCK(hashlimit_lock); /* protects htables list */ static DECLARE_MUTEX(hlimit_mutex); /* additional checkentry protection */ static HLIST_HEAD(hashlimit_htables); -static kmem_cache_t *hashlimit_cachep; +static kmem_cache_t *hashlimit_cachep __read_mostly; static inline int dst_cmp(const struct dsthash_ent *ent, struct dsthash_dst *b) { diff --git a/net/ipv4/netfilter/ipt_mark.c b/net/ipv4/netfilter/ipt_mark.c index 8955728127b..00bef6cdd3f 100644 --- a/net/ipv4/netfilter/ipt_mark.c +++ b/net/ipv4/netfilter/ipt_mark.c @@ -37,9 +37,16 @@ checkentry(const char *tablename, unsigned int matchsize, unsigned int hook_mask) { + struct ipt_mark_info *minfo = (struct ipt_mark_info *) matchinfo; + if (matchsize != IPT_ALIGN(sizeof(struct ipt_mark_info))) return 0; + if (minfo->mark > 0xffffffff || minfo->mask > 0xffffffff) { + printk(KERN_WARNING "mark: only supports 32bit mark\n"); + return 0; + } + return 1; } diff --git a/net/ipv4/netfilter/ipt_owner.c b/net/ipv4/netfilter/ipt_owner.c index 3b9065e0638..c1889f88262 100644 --- a/net/ipv4/netfilter/ipt_owner.c +++ b/net/ipv4/netfilter/ipt_owner.c @@ -21,106 +21,6 @@ MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); MODULE_DESCRIPTION("iptables owner match"); static int -match_comm(const struct sk_buff *skb, const char *comm) -{ - struct task_struct *g, *p; - struct files_struct *files; - int i; - - read_lock(&tasklist_lock); - do_each_thread(g, p) { - if(strncmp(p->comm, comm, sizeof(p->comm))) - continue; - - task_lock(p); - files = p->files; - if(files) { - spin_lock(&files->file_lock); - for (i=0; i < files->max_fds; i++) { - if (fcheck_files(files, i) == - skb->sk->sk_socket->file) { - spin_unlock(&files->file_lock); - task_unlock(p); - read_unlock(&tasklist_lock); - return 1; - } - } - spin_unlock(&files->file_lock); - } - task_unlock(p); - } while_each_thread(g, p); - read_unlock(&tasklist_lock); - return 0; -} - -static int -match_pid(const struct sk_buff *skb, pid_t pid) -{ - struct task_struct *p; - struct files_struct *files; - int i; - - read_lock(&tasklist_lock); - p = find_task_by_pid(pid); - if (!p) - goto out; - task_lock(p); - files = p->files; - if(files) { - spin_lock(&files->file_lock); - for (i=0; i < files->max_fds; i++) { - if (fcheck_files(files, i) == - skb->sk->sk_socket->file) { - spin_unlock(&files->file_lock); - task_unlock(p); - read_unlock(&tasklist_lock); - return 1; - } - } - spin_unlock(&files->file_lock); - } - task_unlock(p); -out: - read_unlock(&tasklist_lock); - return 0; -} - -static int -match_sid(const struct sk_buff *skb, pid_t sid) -{ - struct task_struct *g, *p; - struct file *file = skb->sk->sk_socket->file; - int i, found=0; - - read_lock(&tasklist_lock); - do_each_thread(g, p) { - struct files_struct *files; - if (p->signal->session != sid) - continue; - - task_lock(p); - files = p->files; - if (files) { - spin_lock(&files->file_lock); - for (i=0; i < files->max_fds; i++) { - if (fcheck_files(files, i) == file) { - found = 1; - break; - } - } - spin_unlock(&files->file_lock); - } - task_unlock(p); - if (found) - goto out; - } while_each_thread(g, p); -out: - read_unlock(&tasklist_lock); - - return found; -} - -static int match(const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, @@ -145,24 +45,6 @@ match(const struct sk_buff *skb, return 0; } - if(info->match & IPT_OWNER_PID) { - if (!match_pid(skb, info->pid) ^ - !!(info->invert & IPT_OWNER_PID)) - return 0; - } - - if(info->match & IPT_OWNER_SID) { - if (!match_sid(skb, info->sid) ^ - !!(info->invert & IPT_OWNER_SID)) - return 0; - } - - if(info->match & IPT_OWNER_COMM) { - if (!match_comm(skb, info->comm) ^ - !!(info->invert & IPT_OWNER_COMM)) - return 0; - } - return 1; } @@ -173,6 +55,8 @@ checkentry(const char *tablename, unsigned int matchsize, unsigned int hook_mask) { + const struct ipt_owner_info *info = matchinfo; + if (hook_mask & ~((1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_POST_ROUTING))) { printk("ipt_owner: only valid for LOCAL_OUT or POST_ROUTING.\n"); @@ -184,15 +68,13 @@ checkentry(const char *tablename, IPT_ALIGN(sizeof(struct ipt_owner_info))); return 0; } -#ifdef CONFIG_SMP - /* files->file_lock can not be used in a BH */ - if (((struct ipt_owner_info *)matchinfo)->match - & (IPT_OWNER_PID|IPT_OWNER_SID|IPT_OWNER_COMM)) { - printk("ipt_owner: pid, sid and command matching is broken " - "on SMP.\n"); + + if (info->match & (IPT_OWNER_PID|IPT_OWNER_SID|IPT_OWNER_COMM)) { + printk("ipt_owner: pid, sid and command matching " + "not supported anymore\n"); return 0; } -#endif + return 1; } diff --git a/net/ipv4/netfilter/ipt_string.c b/net/ipv4/netfilter/ipt_string.c new file mode 100644 index 00000000000..b5def204d79 --- /dev/null +++ b/net/ipv4/netfilter/ipt_string.c @@ -0,0 +1,91 @@ +/* String matching match for iptables + * + * (C) 2005 Pablo Neira Ayuso <pablo@eurodev.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_string.h> +#include <linux/textsearch.h> + +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@eurodev.net>"); +MODULE_DESCRIPTION("IP tables string match module"); +MODULE_LICENSE("GPL"); + +static int match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + struct ts_state state; + struct ipt_string_info *conf = (struct ipt_string_info *) matchinfo; + + memset(&state, 0, sizeof(struct ts_state)); + + return (skb_find_text((struct sk_buff *)skb, conf->from_offset, + conf->to_offset, conf->config, &state) + != UINT_MAX) && !conf->invert; +} + +#define STRING_TEXT_PRIV(m) ((struct ipt_string_info *) m) + +static int checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + struct ipt_string_info *conf = matchinfo; + struct ts_config *ts_conf; + + if (matchsize != IPT_ALIGN(sizeof(struct ipt_string_info))) + return 0; + + /* Damn, can't handle this case properly with iptables... */ + if (conf->from_offset > conf->to_offset) + return 0; + + ts_conf = textsearch_prepare(conf->algo, conf->pattern, conf->patlen, + GFP_KERNEL, TS_AUTOLOAD); + if (IS_ERR(ts_conf)) + return 0; + + conf->config = ts_conf; + + return 1; +} + +static void destroy(void *matchinfo, unsigned int matchsize) +{ + textsearch_destroy(STRING_TEXT_PRIV(matchinfo)->config); +} + +static struct ipt_match string_match = { + .name = "string", + .match = match, + .checkentry = checkentry, + .destroy = destroy, + .me = THIS_MODULE +}; + +static int __init init(void) +{ + return ipt_register_match(&string_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&string_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 912bbcc7f41..f7943ba1f43 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -59,13 +59,10 @@ static int fold_prot_inuse(struct proto *proto) */ static int sockstat_seq_show(struct seq_file *seq, void *v) { - /* From net/socket.c */ - extern void socket_seq_show(struct seq_file *seq); - socket_seq_show(seq); seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n", fold_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count), - tcp_tw_count, atomic_read(&tcp_sockets_allocated), + tcp_death_row.tw_count, atomic_read(&tcp_sockets_allocated), atomic_read(&tcp_memory_allocated)); seq_printf(seq, "UDP: inuse %d\n", fold_prot_inuse(&udp_prot)); seq_printf(seq, "RAW: inuse %d\n", fold_prot_inuse(&raw_prot)); diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index 0db405a869f..291831e792a 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -40,7 +40,6 @@ #include <linux/timer.h> #include <net/ip.h> #include <net/protocol.h> -#include <net/tcp.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/icmp.h> diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index d1835b1bc8c..304bb0a1d4f 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -59,7 +59,6 @@ #include <linux/netdevice.h> #include <linux/in_route.h> #include <linux/route.h> -#include <linux/tcp.h> #include <linux/skbuff.h> #include <net/dst.h> #include <net/sock.h> @@ -71,6 +70,7 @@ #include <net/udp.h> #include <net/raw.h> #include <net/snmp.h> +#include <net/tcp_states.h> #include <net/inet_common.h> #include <net/checksum.h> #include <net/xfrm.h> @@ -150,10 +150,11 @@ static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb) * RFC 1122: SHOULD pass TOS value up to the transport layer. * -> It does. And not only TOS, but all IP header. */ -void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) +int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) { struct sock *sk; struct hlist_head *head; + int delivered = 0; read_lock(&raw_v4_lock); head = &raw_v4_htable[hash]; @@ -164,6 +165,7 @@ void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) skb->dev->ifindex); while (sk) { + delivered = 1; if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) { struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); @@ -177,6 +179,7 @@ void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) } out: read_unlock(&raw_v4_lock); + return delivered; } void raw_err (struct sock *sk, struct sk_buff *skb, u32 info) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index d675ff80b04..8c0b14e3bee 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -240,7 +240,9 @@ static unsigned rt_hash_mask; static int rt_hash_log; static unsigned int rt_hash_rnd; -struct rt_cache_stat *rt_cache_stat; +static struct rt_cache_stat *rt_cache_stat; +#define RT_CACHE_STAT_INC(field) \ + (per_cpu_ptr(rt_cache_stat, raw_smp_processor_id())->field++) static int rt_intern_hash(unsigned hash, struct rtable *rth, struct rtable **res); @@ -2600,6 +2602,8 @@ int __ip_route_output_key(struct rtable **rp, const struct flowi *flp) return ip_route_output_slow(rp, flp); } +EXPORT_SYMBOL_GPL(__ip_route_output_key); + int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags) { int err; @@ -2618,6 +2622,8 @@ int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, return 0; } +EXPORT_SYMBOL_GPL(ip_route_output_flow); + int ip_route_output_key(struct rtable **rp, struct flowi *flp) { return ip_route_output_flow(rp, flp, NULL, 0); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 72d01444218..a34e60ea48a 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -169,8 +169,6 @@ static inline int cookie_check(struct sk_buff *skb, __u32 cookie) return mssind < NUM_MSS ? msstab[mssind] + 1 : 0; } -extern struct request_sock_ops tcp_request_sock_ops; - static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst) @@ -180,7 +178,7 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, child = tp->af_specific->syn_recv_sock(sk, skb, req, dst); if (child) - tcp_acceptq_queue(sk, req, child); + inet_csk_reqsk_queue_add(sk, req, child); else reqsk_free(req); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index e3289453241..65268562351 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -11,7 +11,9 @@ #include <linux/module.h> #include <linux/sysctl.h> #include <linux/config.h> +#include <linux/igmp.h> #include <net/snmp.h> +#include <net/icmp.h> #include <net/ip.h> #include <net/route.h> #include <net/tcp.h> @@ -19,36 +21,6 @@ /* From af_inet.c */ extern int sysctl_ip_nonlocal_bind; -/* From icmp.c */ -extern int sysctl_icmp_echo_ignore_all; -extern int sysctl_icmp_echo_ignore_broadcasts; -extern int sysctl_icmp_ignore_bogus_error_responses; -extern int sysctl_icmp_errors_use_inbound_ifaddr; - -/* From ip_fragment.c */ -extern int sysctl_ipfrag_low_thresh; -extern int sysctl_ipfrag_high_thresh; -extern int sysctl_ipfrag_time; -extern int sysctl_ipfrag_secret_interval; - -/* From ip_output.c */ -extern int sysctl_ip_dynaddr; - -/* From icmp.c */ -extern int sysctl_icmp_ratelimit; -extern int sysctl_icmp_ratemask; - -/* From igmp.c */ -extern int sysctl_igmp_max_memberships; -extern int sysctl_igmp_max_msf; - -/* From inetpeer.c */ -extern int inet_peer_threshold; -extern int inet_peer_minttl; -extern int inet_peer_maxttl; -extern int inet_peer_gc_mintime; -extern int inet_peer_gc_maxtime; - #ifdef CONFIG_SYSCTL static int tcp_retr1_max = 255; static int ip_local_port_range_min[] = { 1, 1 }; @@ -57,8 +29,6 @@ static int ip_local_port_range_max[] = { 65535, 65535 }; struct ipv4_config ipv4_config; -extern ctl_table ipv4_route_table[]; - #ifdef CONFIG_SYSCTL static @@ -136,10 +106,11 @@ static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * return ret; } -int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, - void **context) +static int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, + int nlen, void __user *oldval, + size_t __user *oldlenp, + void __user *newval, size_t newlen, + void **context) { char val[TCP_CA_NAME_MAX]; ctl_table tbl = { @@ -259,7 +230,7 @@ ctl_table ipv4_table[] = { { .ctl_name = NET_TCP_MAX_TW_BUCKETS, .procname = "tcp_max_tw_buckets", - .data = &sysctl_tcp_max_tw_buckets, + .data = &tcp_death_row.sysctl_max_tw_buckets, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec @@ -363,7 +334,7 @@ ctl_table ipv4_table[] = { { .ctl_name = NET_TCP_TW_RECYCLE, .procname = "tcp_tw_recycle", - .data = &sysctl_tcp_tw_recycle, + .data = &tcp_death_row.sysctl_tw_recycle, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 69b1fcf7007..02fdda68718 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -269,13 +269,12 @@ int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; -DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics); - -kmem_cache_t *tcp_bucket_cachep; -kmem_cache_t *tcp_timewait_cachep; +DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics) __read_mostly; atomic_t tcp_orphan_count = ATOMIC_INIT(0); +EXPORT_SYMBOL_GPL(tcp_orphan_count); + int sysctl_tcp_mem[3]; int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 }; int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 }; @@ -311,15 +310,6 @@ void tcp_enter_memory_pressure(void) EXPORT_SYMBOL(tcp_enter_memory_pressure); /* - * LISTEN is a special case for poll.. - */ -static __inline__ unsigned int tcp_listen_poll(struct sock *sk, - poll_table *wait) -{ - return !reqsk_queue_empty(&tcp_sk(sk)->accept_queue) ? (POLLIN | POLLRDNORM) : 0; -} - -/* * Wait for a TCP event. * * Note that we don't need to lock the socket, as the upper poll layers @@ -334,7 +324,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) poll_wait(file, sk->sk_sleep, wait); if (sk->sk_state == TCP_LISTEN) - return tcp_listen_poll(sk, wait); + return inet_csk_listen_poll(sk); /* Socket is not locked. We are protected from async events by poll logic and correct handling of state changes @@ -457,109 +447,6 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) return put_user(answ, (int __user *)arg); } - -int tcp_listen_start(struct sock *sk) -{ - struct inet_sock *inet = inet_sk(sk); - struct tcp_sock *tp = tcp_sk(sk); - int rc = reqsk_queue_alloc(&tp->accept_queue, TCP_SYNQ_HSIZE); - - if (rc != 0) - return rc; - - sk->sk_max_ack_backlog = 0; - sk->sk_ack_backlog = 0; - tcp_delack_init(tp); - - /* There is race window here: we announce ourselves listening, - * but this transition is still not validated by get_port(). - * It is OK, because this socket enters to hash table only - * after validation is complete. - */ - sk->sk_state = TCP_LISTEN; - if (!sk->sk_prot->get_port(sk, inet->num)) { - inet->sport = htons(inet->num); - - sk_dst_reset(sk); - sk->sk_prot->hash(sk); - - return 0; - } - - sk->sk_state = TCP_CLOSE; - reqsk_queue_destroy(&tp->accept_queue); - return -EADDRINUSE; -} - -/* - * This routine closes sockets which have been at least partially - * opened, but not yet accepted. - */ - -static void tcp_listen_stop (struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - struct listen_sock *lopt; - struct request_sock *acc_req; - struct request_sock *req; - int i; - - tcp_delete_keepalive_timer(sk); - - /* make all the listen_opt local to us */ - lopt = reqsk_queue_yank_listen_sk(&tp->accept_queue); - acc_req = reqsk_queue_yank_acceptq(&tp->accept_queue); - - if (lopt->qlen) { - for (i = 0; i < TCP_SYNQ_HSIZE; i++) { - while ((req = lopt->syn_table[i]) != NULL) { - lopt->syn_table[i] = req->dl_next; - lopt->qlen--; - reqsk_free(req); - - /* Following specs, it would be better either to send FIN - * (and enter FIN-WAIT-1, it is normal close) - * or to send active reset (abort). - * Certainly, it is pretty dangerous while synflood, but it is - * bad justification for our negligence 8) - * To be honest, we are not able to make either - * of the variants now. --ANK - */ - } - } - } - BUG_TRAP(!lopt->qlen); - - kfree(lopt); - - while ((req = acc_req) != NULL) { - struct sock *child = req->sk; - - acc_req = req->dl_next; - - local_bh_disable(); - bh_lock_sock(child); - BUG_TRAP(!sock_owned_by_user(child)); - sock_hold(child); - - tcp_disconnect(child, O_NONBLOCK); - - sock_orphan(child); - - atomic_inc(&tcp_orphan_count); - - tcp_destroy_sock(child); - - bh_unlock_sock(child); - local_bh_enable(); - sock_put(child); - - sk_acceptq_removed(sk); - __reqsk_free(req); - } - BUG_TRAP(!sk->sk_ack_backlog); -} - static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) { TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; @@ -975,7 +862,7 @@ do_fault: if (!skb->len) { if (sk->sk_send_head == skb) sk->sk_send_head = NULL; - __skb_unlink(skb, skb->list); + __skb_unlink(skb, &sk->sk_write_queue); sk_stream_free_skb(sk, skb); } @@ -1057,20 +944,21 @@ static void cleanup_rbuf(struct sock *sk, int copied) BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)); #endif - if (tcp_ack_scheduled(tp)) { + if (inet_csk_ack_scheduled(sk)) { + const struct inet_connection_sock *icsk = inet_csk(sk); /* Delayed ACKs frequently hit locked sockets during bulk * receive. */ - if (tp->ack.blocked || + if (icsk->icsk_ack.blocked || /* Once-per-two-segments ACK was not sent by tcp_input.c */ - tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss || + tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss || /* * If this read emptied read buffer, we send ACK, if * connection is not bidirectional, user drained * receive buffer and there was a small segment * in queue. */ - (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) && - !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc))) + (copied > 0 && (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && + !icsk->icsk_ack.pingpong && !atomic_read(&sk->sk_rmem_alloc))) time_to_ack = 1; } @@ -1572,40 +1460,6 @@ void tcp_shutdown(struct sock *sk, int how) } } -/* - * At this point, there should be no process reference to this - * socket, and thus no user references at all. Therefore we - * can assume the socket waitqueue is inactive and nobody will - * try to jump onto it. - */ -void tcp_destroy_sock(struct sock *sk) -{ - BUG_TRAP(sk->sk_state == TCP_CLOSE); - BUG_TRAP(sock_flag(sk, SOCK_DEAD)); - - /* It cannot be in hash table! */ - BUG_TRAP(sk_unhashed(sk)); - - /* If it has not 0 inet_sk(sk)->num, it must be bound */ - BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash); - - sk->sk_prot->destroy(sk); - - sk_stream_kill_queues(sk); - - xfrm_sk_free_policy(sk); - -#ifdef INET_REFCNT_DEBUG - if (atomic_read(&sk->sk_refcnt) != 1) { - printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n", - sk, atomic_read(&sk->sk_refcnt)); - } -#endif - - atomic_dec(&tcp_orphan_count); - sock_put(sk); -} - void tcp_close(struct sock *sk, long timeout) { struct sk_buff *skb; @@ -1618,7 +1472,7 @@ void tcp_close(struct sock *sk, long timeout) tcp_set_state(sk, TCP_CLOSE); /* Special case. */ - tcp_listen_stop(sk); + inet_csk_listen_stop(sk); goto adjudge_to_death; } @@ -1721,12 +1575,12 @@ adjudge_to_death: tcp_send_active_reset(sk, GFP_ATOMIC); NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER); } else { - int tmo = tcp_fin_time(tp); + const int tmo = tcp_fin_time(sk); if (tmo > TCP_TIMEWAIT_LEN) { - tcp_reset_keepalive_timer(sk, tcp_fin_time(tp)); + inet_csk_reset_keepalive_timer(sk, tcp_fin_time(sk)); } else { - atomic_inc(&tcp_orphan_count); + atomic_inc(sk->sk_prot->orphan_count); tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); goto out; } @@ -1734,7 +1588,7 @@ adjudge_to_death: } if (sk->sk_state != TCP_CLOSE) { sk_stream_mem_reclaim(sk); - if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans || + if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans || (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) { if (net_ratelimit()) @@ -1745,10 +1599,10 @@ adjudge_to_death: NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY); } } - atomic_inc(&tcp_orphan_count); + atomic_inc(sk->sk_prot->orphan_count); if (sk->sk_state == TCP_CLOSE) - tcp_destroy_sock(sk); + inet_csk_destroy_sock(sk); /* Otherwise, socket is reprieved until protocol close. */ out: @@ -1769,6 +1623,7 @@ static inline int tcp_need_reset(int state) int tcp_disconnect(struct sock *sk, int flags) { struct inet_sock *inet = inet_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int err = 0; int old_state = sk->sk_state; @@ -1778,7 +1633,7 @@ int tcp_disconnect(struct sock *sk, int flags) /* ABORT function of RFC793 */ if (old_state == TCP_LISTEN) { - tcp_listen_stop(sk); + inet_csk_listen_stop(sk); } else if (tcp_need_reset(old_state) || (tp->snd_nxt != tp->write_seq && (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { @@ -1805,125 +1660,34 @@ int tcp_disconnect(struct sock *sk, int flags) tp->srtt = 0; if ((tp->write_seq += tp->max_window + 2) == 0) tp->write_seq = 1; - tp->backoff = 0; + icsk->icsk_backoff = 0; tp->snd_cwnd = 2; - tp->probes_out = 0; + icsk->icsk_probes_out = 0; tp->packets_out = 0; tp->snd_ssthresh = 0x7fffffff; tp->snd_cwnd_cnt = 0; - tcp_set_ca_state(tp, TCP_CA_Open); + tcp_set_ca_state(sk, TCP_CA_Open); tcp_clear_retrans(tp); - tcp_delack_init(tp); + inet_csk_delack_init(sk); sk->sk_send_head = NULL; tp->rx_opt.saw_tstamp = 0; tcp_sack_reset(&tp->rx_opt); __sk_dst_reset(sk); - BUG_TRAP(!inet->num || tp->bind_hash); + BUG_TRAP(!inet->num || icsk->icsk_bind_hash); sk->sk_error_report(sk); return err; } /* - * Wait for an incoming connection, avoid race - * conditions. This must be called with the socket locked. - */ -static int wait_for_connect(struct sock *sk, long timeo) -{ - struct tcp_sock *tp = tcp_sk(sk); - DEFINE_WAIT(wait); - int err; - - /* - * True wake-one mechanism for incoming connections: only - * one process gets woken up, not the 'whole herd'. - * Since we do not 'race & poll' for established sockets - * anymore, the common case will execute the loop only once. - * - * Subtle issue: "add_wait_queue_exclusive()" will be added - * after any current non-exclusive waiters, and we know that - * it will always _stay_ after any new non-exclusive waiters - * because all non-exclusive waiters are added at the - * beginning of the wait-queue. As such, it's ok to "drop" - * our exclusiveness temporarily when we get woken up without - * having to remove and re-insert us on the wait queue. - */ - for (;;) { - prepare_to_wait_exclusive(sk->sk_sleep, &wait, - TASK_INTERRUPTIBLE); - release_sock(sk); - if (reqsk_queue_empty(&tp->accept_queue)) - timeo = schedule_timeout(timeo); - lock_sock(sk); - err = 0; - if (!reqsk_queue_empty(&tp->accept_queue)) - break; - err = -EINVAL; - if (sk->sk_state != TCP_LISTEN) - break; - err = sock_intr_errno(timeo); - if (signal_pending(current)) - break; - err = -EAGAIN; - if (!timeo) - break; - } - finish_wait(sk->sk_sleep, &wait); - return err; -} - -/* - * This will accept the next outstanding connection. - */ - -struct sock *tcp_accept(struct sock *sk, int flags, int *err) -{ - struct tcp_sock *tp = tcp_sk(sk); - struct sock *newsk; - int error; - - lock_sock(sk); - - /* We need to make sure that this socket is listening, - * and that it has something pending. - */ - error = -EINVAL; - if (sk->sk_state != TCP_LISTEN) - goto out_err; - - /* Find already established connection */ - if (reqsk_queue_empty(&tp->accept_queue)) { - long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); - - /* If this is a non blocking socket don't sleep */ - error = -EAGAIN; - if (!timeo) - goto out_err; - - error = wait_for_connect(sk, timeo); - if (error) - goto out_err; - } - - newsk = reqsk_queue_get_child(&tp->accept_queue, sk); - BUG_TRAP(newsk->sk_state != TCP_SYN_RECV); -out: - release_sock(sk); - return newsk; -out_err: - newsk = NULL; - *err = error; - goto out; -} - -/* * Socket option code for TCP. */ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, int optlen) { struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); int val; int err = 0; @@ -1945,7 +1709,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, name[val] = 0; lock_sock(sk); - err = tcp_set_congestion_control(tp, name); + err = tcp_set_congestion_control(sk, name); release_sock(sk); return err; } @@ -2022,7 +1786,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, elapsed = tp->keepalive_time - elapsed; else elapsed = 0; - tcp_reset_keepalive_timer(sk, elapsed); + inet_csk_reset_keepalive_timer(sk, elapsed); } } break; @@ -2042,7 +1806,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, if (val < 1 || val > MAX_TCP_SYNCNT) err = -EINVAL; else - tp->syn_retries = val; + icsk->icsk_syn_retries = val; break; case TCP_LINGER2: @@ -2055,15 +1819,15 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, break; case TCP_DEFER_ACCEPT: - tp->defer_accept = 0; + icsk->icsk_accept_queue.rskq_defer_accept = 0; if (val > 0) { /* Translate value in seconds to number of * retransmits */ - while (tp->defer_accept < 32 && + while (icsk->icsk_accept_queue.rskq_defer_accept < 32 && val > ((TCP_TIMEOUT_INIT / HZ) << - tp->defer_accept)) - tp->defer_accept++; - tp->defer_accept++; + icsk->icsk_accept_queue.rskq_defer_accept)) + icsk->icsk_accept_queue.rskq_defer_accept++; + icsk->icsk_accept_queue.rskq_defer_accept++; } break; @@ -2081,16 +1845,16 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, case TCP_QUICKACK: if (!val) { - tp->ack.pingpong = 1; + icsk->icsk_ack.pingpong = 1; } else { - tp->ack.pingpong = 0; + icsk->icsk_ack.pingpong = 0; if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && - tcp_ack_scheduled(tp)) { - tp->ack.pending |= TCP_ACK_PUSHED; + inet_csk_ack_scheduled(sk)) { + icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; cleanup_rbuf(sk, 1); if (!(val & 1)) - tp->ack.pingpong = 1; + icsk->icsk_ack.pingpong = 1; } } break; @@ -2107,15 +1871,16 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, void tcp_get_info(struct sock *sk, struct tcp_info *info) { struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); u32 now = tcp_time_stamp; memset(info, 0, sizeof(*info)); info->tcpi_state = sk->sk_state; - info->tcpi_ca_state = tp->ca_state; - info->tcpi_retransmits = tp->retransmits; - info->tcpi_probes = tp->probes_out; - info->tcpi_backoff = tp->backoff; + info->tcpi_ca_state = icsk->icsk_ca_state; + info->tcpi_retransmits = icsk->icsk_retransmits; + info->tcpi_probes = icsk->icsk_probes_out; + info->tcpi_backoff = icsk->icsk_backoff; if (tp->rx_opt.tstamp_ok) info->tcpi_options |= TCPI_OPT_TIMESTAMPS; @@ -2130,10 +1895,10 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) if (tp->ecn_flags&TCP_ECN_OK) info->tcpi_options |= TCPI_OPT_ECN; - info->tcpi_rto = jiffies_to_usecs(tp->rto); - info->tcpi_ato = jiffies_to_usecs(tp->ack.ato); + info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto); + info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato); info->tcpi_snd_mss = tp->mss_cache; - info->tcpi_rcv_mss = tp->ack.rcv_mss; + info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss; info->tcpi_unacked = tp->packets_out; info->tcpi_sacked = tp->sacked_out; @@ -2142,7 +1907,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_fackets = tp->fackets_out; info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); - info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime); + info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime); info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); info->tcpi_pmtu = tp->pmtu_cookie; @@ -2165,6 +1930,7 @@ EXPORT_SYMBOL_GPL(tcp_get_info); int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int val, len; @@ -2202,7 +1968,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes; break; case TCP_SYNCNT: - val = tp->syn_retries ? : sysctl_tcp_syn_retries; + val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; break; case TCP_LINGER2: val = tp->linger2; @@ -2210,8 +1976,8 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, val = (val ? : sysctl_tcp_fin_timeout) / HZ; break; case TCP_DEFER_ACCEPT: - val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) << - (tp->defer_accept - 1)); + val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 : + ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1)); break; case TCP_WINDOW_CLAMP: val = tp->window_clamp; @@ -2232,7 +1998,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, return 0; } case TCP_QUICKACK: - val = !tp->ack.pingpong; + val = !icsk->icsk_ack.pingpong; break; case TCP_CONGESTION: @@ -2241,7 +2007,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, len = min_t(unsigned int, len, TCP_CA_NAME_MAX); if (put_user(len, optlen)) return -EFAULT; - if (copy_to_user(optval, tp->ca_ops->name, len)) + if (copy_to_user(optval, icsk->icsk_ca_ops->name, len)) return -EFAULT; return 0; default: @@ -2278,79 +2044,72 @@ void __init tcp_init(void) __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb), sizeof(skb->cb)); - tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket", - sizeof(struct tcp_bind_bucket), - 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); - if (!tcp_bucket_cachep) + tcp_hashinfo.bind_bucket_cachep = + kmem_cache_create("tcp_bind_bucket", + sizeof(struct inet_bind_bucket), 0, + SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!tcp_hashinfo.bind_bucket_cachep) panic("tcp_init: Cannot alloc tcp_bind_bucket cache."); - tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket", - sizeof(struct tcp_tw_bucket), - 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); - if (!tcp_timewait_cachep) - panic("tcp_init: Cannot alloc tcp_tw_bucket cache."); - /* Size and allocate the main established and bind bucket * hash tables. * * The methodology is similar to that of the buffer cache. */ - tcp_ehash = (struct tcp_ehash_bucket *) + tcp_hashinfo.ehash = alloc_large_system_hash("TCP established", - sizeof(struct tcp_ehash_bucket), + sizeof(struct inet_ehash_bucket), thash_entries, (num_physpages >= 128 * 1024) ? (25 - PAGE_SHIFT) : (27 - PAGE_SHIFT), HASH_HIGHMEM, - &tcp_ehash_size, + &tcp_hashinfo.ehash_size, NULL, 0); - tcp_ehash_size = (1 << tcp_ehash_size) >> 1; - for (i = 0; i < (tcp_ehash_size << 1); i++) { - rwlock_init(&tcp_ehash[i].lock); - INIT_HLIST_HEAD(&tcp_ehash[i].chain); + tcp_hashinfo.ehash_size = (1 << tcp_hashinfo.ehash_size) >> 1; + for (i = 0; i < (tcp_hashinfo.ehash_size << 1); i++) { + rwlock_init(&tcp_hashinfo.ehash[i].lock); + INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain); } - tcp_bhash = (struct tcp_bind_hashbucket *) + tcp_hashinfo.bhash = alloc_large_system_hash("TCP bind", - sizeof(struct tcp_bind_hashbucket), - tcp_ehash_size, + sizeof(struct inet_bind_hashbucket), + tcp_hashinfo.ehash_size, (num_physpages >= 128 * 1024) ? (25 - PAGE_SHIFT) : (27 - PAGE_SHIFT), HASH_HIGHMEM, - &tcp_bhash_size, + &tcp_hashinfo.bhash_size, NULL, 64 * 1024); - tcp_bhash_size = 1 << tcp_bhash_size; - for (i = 0; i < tcp_bhash_size; i++) { - spin_lock_init(&tcp_bhash[i].lock); - INIT_HLIST_HEAD(&tcp_bhash[i].chain); + tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size; + for (i = 0; i < tcp_hashinfo.bhash_size; i++) { + spin_lock_init(&tcp_hashinfo.bhash[i].lock); + INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); } /* Try to be a bit smarter and adjust defaults depending * on available memory. */ for (order = 0; ((1 << order) << PAGE_SHIFT) < - (tcp_bhash_size * sizeof(struct tcp_bind_hashbucket)); + (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket)); order++) ; if (order >= 4) { sysctl_local_port_range[0] = 32768; sysctl_local_port_range[1] = 61000; - sysctl_tcp_max_tw_buckets = 180000; + tcp_death_row.sysctl_max_tw_buckets = 180000; sysctl_tcp_max_orphans = 4096 << (order - 4); sysctl_max_syn_backlog = 1024; } else if (order < 3) { sysctl_local_port_range[0] = 1024 * (3 - order); - sysctl_tcp_max_tw_buckets >>= (3 - order); + tcp_death_row.sysctl_max_tw_buckets >>= (3 - order); sysctl_tcp_max_orphans >>= (3 - order); sysctl_max_syn_backlog = 128; } - tcp_port_rover = sysctl_local_port_range[0] - 1; + tcp_hashinfo.port_rover = sysctl_local_port_range[0] - 1; sysctl_tcp_mem[0] = 768 << order; sysctl_tcp_mem[1] = 1024 << order; @@ -2365,14 +2124,12 @@ void __init tcp_init(void) printk(KERN_INFO "TCP: Hash tables configured " "(established %d bind %d)\n", - tcp_ehash_size << 1, tcp_bhash_size); + tcp_hashinfo.ehash_size << 1, tcp_hashinfo.bhash_size); tcp_register_congestion_control(&tcp_reno); } -EXPORT_SYMBOL(tcp_accept); EXPORT_SYMBOL(tcp_close); -EXPORT_SYMBOL(tcp_destroy_sock); EXPORT_SYMBOL(tcp_disconnect); EXPORT_SYMBOL(tcp_getsockopt); EXPORT_SYMBOL(tcp_ioctl); @@ -2384,4 +2141,3 @@ EXPORT_SYMBOL(tcp_sendpage); EXPORT_SYMBOL(tcp_setsockopt); EXPORT_SYMBOL(tcp_shutdown); EXPORT_SYMBOL(tcp_statistics); -EXPORT_SYMBOL(tcp_timewait_cachep); diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index ec38d45d664..b940346de4e 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -86,11 +86,11 @@ static inline void bictcp_reset(struct bictcp *ca) ca->delayed_ack = 2 << ACK_RATIO_SHIFT; } -static void bictcp_init(struct tcp_sock *tp) +static void bictcp_init(struct sock *sk) { - bictcp_reset(tcp_ca(tp)); + bictcp_reset(inet_csk_ca(sk)); if (initial_ssthresh) - tp->snd_ssthresh = initial_ssthresh; + tcp_sk(sk)->snd_ssthresh = initial_ssthresh; } /* @@ -156,9 +156,10 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) /* Detect low utilization in congestion avoidance */ -static inline void bictcp_low_utilization(struct tcp_sock *tp, int flag) +static inline void bictcp_low_utilization(struct sock *sk, int flag) { - struct bictcp *ca = tcp_ca(tp); + const struct tcp_sock *tp = tcp_sk(sk); + struct bictcp *ca = inet_csk_ca(sk); u32 dist, delay; /* No time stamp */ @@ -208,12 +209,13 @@ static inline void bictcp_low_utilization(struct tcp_sock *tp, int flag) } -static void bictcp_cong_avoid(struct tcp_sock *tp, u32 ack, +static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 seq_rtt, u32 in_flight, int data_acked) { - struct bictcp *ca = tcp_ca(tp); + struct tcp_sock *tp = tcp_sk(sk); + struct bictcp *ca = inet_csk_ca(sk); - bictcp_low_utilization(tp, data_acked); + bictcp_low_utilization(sk, data_acked); if (in_flight < tp->snd_cwnd) return; @@ -242,9 +244,10 @@ static void bictcp_cong_avoid(struct tcp_sock *tp, u32 ack, * behave like Reno until low_window is reached, * then increase congestion window slowly */ -static u32 bictcp_recalc_ssthresh(struct tcp_sock *tp) +static u32 bictcp_recalc_ssthresh(struct sock *sk) { - struct bictcp *ca = tcp_ca(tp); + const struct tcp_sock *tp = tcp_sk(sk); + struct bictcp *ca = inet_csk_ca(sk); ca->epoch_start = 0; /* end of epoch */ @@ -269,31 +272,34 @@ static u32 bictcp_recalc_ssthresh(struct tcp_sock *tp) return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); } -static u32 bictcp_undo_cwnd(struct tcp_sock *tp) +static u32 bictcp_undo_cwnd(struct sock *sk) { - struct bictcp *ca = tcp_ca(tp); - + const struct tcp_sock *tp = tcp_sk(sk); + const struct bictcp *ca = inet_csk_ca(sk); return max(tp->snd_cwnd, ca->last_max_cwnd); } -static u32 bictcp_min_cwnd(struct tcp_sock *tp) +static u32 bictcp_min_cwnd(struct sock *sk) { + const struct tcp_sock *tp = tcp_sk(sk); return tp->snd_ssthresh; } -static void bictcp_state(struct tcp_sock *tp, u8 new_state) +static void bictcp_state(struct sock *sk, u8 new_state) { if (new_state == TCP_CA_Loss) - bictcp_reset(tcp_ca(tp)); + bictcp_reset(inet_csk_ca(sk)); } /* Track delayed acknowledgement ratio using sliding window * ratio = (15*ratio + sample) / 16 */ -static void bictcp_acked(struct tcp_sock *tp, u32 cnt) +static void bictcp_acked(struct sock *sk, u32 cnt) { - if (cnt > 0 && tp->ca_state == TCP_CA_Open) { - struct bictcp *ca = tcp_ca(tp); + const struct inet_connection_sock *icsk = inet_csk(sk); + + if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) { + struct bictcp *ca = inet_csk_ca(sk); cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; ca->delayed_ack += cnt; } @@ -314,7 +320,7 @@ static struct tcp_congestion_ops bictcp = { static int __init bictcp_register(void) { - BUG_ON(sizeof(struct bictcp) > TCP_CA_PRIV_SIZE); + BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE); return tcp_register_congestion_control(&bictcp); } diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 4970d10a778..bbf2d6624e8 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -73,33 +73,36 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); /* Assign choice of congestion control. */ -void tcp_init_congestion_control(struct tcp_sock *tp) +void tcp_init_congestion_control(struct sock *sk) { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_congestion_ops *ca; - if (tp->ca_ops != &tcp_init_congestion_ops) + if (icsk->icsk_ca_ops != &tcp_init_congestion_ops) return; rcu_read_lock(); list_for_each_entry_rcu(ca, &tcp_cong_list, list) { if (try_module_get(ca->owner)) { - tp->ca_ops = ca; + icsk->icsk_ca_ops = ca; break; } } rcu_read_unlock(); - if (tp->ca_ops->init) - tp->ca_ops->init(tp); + if (icsk->icsk_ca_ops->init) + icsk->icsk_ca_ops->init(sk); } /* Manage refcounts on socket close. */ -void tcp_cleanup_congestion_control(struct tcp_sock *tp) +void tcp_cleanup_congestion_control(struct sock *sk) { - if (tp->ca_ops->release) - tp->ca_ops->release(tp); - module_put(tp->ca_ops->owner); + struct inet_connection_sock *icsk = inet_csk(sk); + + if (icsk->icsk_ca_ops->release) + icsk->icsk_ca_ops->release(sk); + module_put(icsk->icsk_ca_ops->owner); } /* Used by sysctl to change default congestion control */ @@ -143,14 +146,15 @@ void tcp_get_default_congestion_control(char *name) } /* Change congestion control for socket */ -int tcp_set_congestion_control(struct tcp_sock *tp, const char *name) +int tcp_set_congestion_control(struct sock *sk, const char *name) { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_congestion_ops *ca; int err = 0; rcu_read_lock(); ca = tcp_ca_find(name); - if (ca == tp->ca_ops) + if (ca == icsk->icsk_ca_ops) goto out; if (!ca) @@ -160,10 +164,10 @@ int tcp_set_congestion_control(struct tcp_sock *tp, const char *name) err = -EBUSY; else { - tcp_cleanup_congestion_control(tp); - tp->ca_ops = ca; - if (tp->ca_ops->init) - tp->ca_ops->init(tp); + tcp_cleanup_congestion_control(sk); + icsk->icsk_ca_ops = ca; + if (icsk->icsk_ca_ops->init) + icsk->icsk_ca_ops->init(sk); } out: rcu_read_unlock(); @@ -177,9 +181,11 @@ int tcp_set_congestion_control(struct tcp_sock *tp, const char *name) /* This is Jacobson's slow start and congestion avoidance. * SIGCOMM '88, p. 328. */ -void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight, +void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, int flag) { + struct tcp_sock *tp = tcp_sk(sk); + if (in_flight < tp->snd_cwnd) return; @@ -202,15 +208,17 @@ void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight, EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); /* Slow start threshold is half the congestion window (min 2) */ -u32 tcp_reno_ssthresh(struct tcp_sock *tp) +u32 tcp_reno_ssthresh(struct sock *sk) { + const struct tcp_sock *tp = tcp_sk(sk); return max(tp->snd_cwnd >> 1U, 2U); } EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); /* Lower bound on congestion window. */ -u32 tcp_reno_min_cwnd(struct tcp_sock *tp) +u32 tcp_reno_min_cwnd(struct sock *sk) { + const struct tcp_sock *tp = tcp_sk(sk); return tp->snd_ssthresh/2; } EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd); diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index f66945cb158..c148c108188 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -1,5 +1,5 @@ /* - * tcp_diag.c Module for monitoring TCP sockets. + * tcp_diag.c Module for monitoring TCP transport protocols sockets. * * Version: $Id: tcp_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $ * @@ -12,779 +12,43 @@ */ #include <linux/config.h> -#include <linux/module.h> -#include <linux/types.h> -#include <linux/fcntl.h> -#include <linux/random.h> -#include <linux/cache.h> -#include <linux/init.h> -#include <linux/time.h> - -#include <net/icmp.h> -#include <net/tcp.h> -#include <net/ipv6.h> -#include <net/inet_common.h> - -#include <linux/inet.h> -#include <linux/stddef.h> - -#include <linux/tcp_diag.h> -struct tcpdiag_entry -{ - u32 *saddr; - u32 *daddr; - u16 sport; - u16 dport; - u16 family; - u16 userlocks; -}; +#include <linux/module.h> +#include <linux/inet_diag.h> -static struct sock *tcpnl; +#include <linux/tcp.h> -#define TCPDIAG_PUT(skb, attrtype, attrlen) \ - RTA_DATA(__RTA_PUT(skb, attrtype, attrlen)) +#include <net/tcp.h> -static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, - int ext, u32 pid, u32 seq, u16 nlmsg_flags) +static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, + void *_info) { - struct inet_sock *inet = inet_sk(sk); - struct tcp_sock *tp = tcp_sk(sk); - struct tcpdiagmsg *r; - struct nlmsghdr *nlh; - struct tcp_info *info = NULL; - struct tcpdiag_meminfo *minfo = NULL; - unsigned char *b = skb->tail; - - nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r)); - nlh->nlmsg_flags = nlmsg_flags; - r = NLMSG_DATA(nlh); - if (sk->sk_state != TCP_TIME_WAIT) { - if (ext & (1<<(TCPDIAG_MEMINFO-1))) - minfo = TCPDIAG_PUT(skb, TCPDIAG_MEMINFO, sizeof(*minfo)); - if (ext & (1<<(TCPDIAG_INFO-1))) - info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info)); - - if (ext & (1<<(TCPDIAG_CONG-1))) { - size_t len = strlen(tp->ca_ops->name); - strcpy(TCPDIAG_PUT(skb, TCPDIAG_CONG, len+1), - tp->ca_ops->name); - } - } - r->tcpdiag_family = sk->sk_family; - r->tcpdiag_state = sk->sk_state; - r->tcpdiag_timer = 0; - r->tcpdiag_retrans = 0; - - r->id.tcpdiag_if = sk->sk_bound_dev_if; - r->id.tcpdiag_cookie[0] = (u32)(unsigned long)sk; - r->id.tcpdiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1); - - if (r->tcpdiag_state == TCP_TIME_WAIT) { - struct tcp_tw_bucket *tw = (struct tcp_tw_bucket*)sk; - long tmo = tw->tw_ttd - jiffies; - if (tmo < 0) - tmo = 0; - - r->id.tcpdiag_sport = tw->tw_sport; - r->id.tcpdiag_dport = tw->tw_dport; - r->id.tcpdiag_src[0] = tw->tw_rcv_saddr; - r->id.tcpdiag_dst[0] = tw->tw_daddr; - r->tcpdiag_state = tw->tw_substate; - r->tcpdiag_timer = 3; - r->tcpdiag_expires = (tmo*1000+HZ-1)/HZ; - r->tcpdiag_rqueue = 0; - r->tcpdiag_wqueue = 0; - r->tcpdiag_uid = 0; - r->tcpdiag_inode = 0; -#ifdef CONFIG_IP_TCPDIAG_IPV6 - if (r->tcpdiag_family == AF_INET6) { - ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src, - &tw->tw_v6_rcv_saddr); - ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst, - &tw->tw_v6_daddr); - } -#endif - nlh->nlmsg_len = skb->tail - b; - return skb->len; - } - - r->id.tcpdiag_sport = inet->sport; - r->id.tcpdiag_dport = inet->dport; - r->id.tcpdiag_src[0] = inet->rcv_saddr; - r->id.tcpdiag_dst[0] = inet->daddr; - -#ifdef CONFIG_IP_TCPDIAG_IPV6 - if (r->tcpdiag_family == AF_INET6) { - struct ipv6_pinfo *np = inet6_sk(sk); - - ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src, - &np->rcv_saddr); - ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst, - &np->daddr); - } -#endif - -#define EXPIRES_IN_MS(tmo) ((tmo-jiffies)*1000+HZ-1)/HZ - - if (tp->pending == TCP_TIME_RETRANS) { - r->tcpdiag_timer = 1; - r->tcpdiag_retrans = tp->retransmits; - r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout); - } else if (tp->pending == TCP_TIME_PROBE0) { - r->tcpdiag_timer = 4; - r->tcpdiag_retrans = tp->probes_out; - r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout); - } else if (timer_pending(&sk->sk_timer)) { - r->tcpdiag_timer = 2; - r->tcpdiag_retrans = tp->probes_out; - r->tcpdiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires); - } else { - r->tcpdiag_timer = 0; - r->tcpdiag_expires = 0; - } -#undef EXPIRES_IN_MS + const struct tcp_sock *tp = tcp_sk(sk); + struct tcp_info *info = _info; - r->tcpdiag_rqueue = tp->rcv_nxt - tp->copied_seq; - r->tcpdiag_wqueue = tp->write_seq - tp->snd_una; - r->tcpdiag_uid = sock_i_uid(sk); - r->tcpdiag_inode = sock_i_ino(sk); - - if (minfo) { - minfo->tcpdiag_rmem = atomic_read(&sk->sk_rmem_alloc); - minfo->tcpdiag_wmem = sk->sk_wmem_queued; - minfo->tcpdiag_fmem = sk->sk_forward_alloc; - minfo->tcpdiag_tmem = atomic_read(&sk->sk_wmem_alloc); - } - - if (info) + r->idiag_rqueue = tp->rcv_nxt - tp->copied_seq; + r->idiag_wqueue = tp->write_seq - tp->snd_una; + if (info != NULL) tcp_get_info(sk, info); - - if (sk->sk_state < TCP_TIME_WAIT && tp->ca_ops->get_info) - tp->ca_ops->get_info(tp, ext, skb); - - nlh->nlmsg_len = skb->tail - b; - return skb->len; - -rtattr_failure: -nlmsg_failure: - skb_trim(skb, b - skb->data); - return -1; -} - -extern struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, - int dif); -#ifdef CONFIG_IP_TCPDIAG_IPV6 -extern struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport, - struct in6_addr *daddr, u16 dport, - int dif); -#else -static inline struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport, - struct in6_addr *daddr, u16 dport, - int dif) -{ - return NULL; -} -#endif - -static int tcpdiag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh) -{ - int err; - struct sock *sk; - struct tcpdiagreq *req = NLMSG_DATA(nlh); - struct sk_buff *rep; - - if (req->tcpdiag_family == AF_INET) { - sk = tcp_v4_lookup(req->id.tcpdiag_dst[0], req->id.tcpdiag_dport, - req->id.tcpdiag_src[0], req->id.tcpdiag_sport, - req->id.tcpdiag_if); - } -#ifdef CONFIG_IP_TCPDIAG_IPV6 - else if (req->tcpdiag_family == AF_INET6) { - sk = tcp_v6_lookup((struct in6_addr*)req->id.tcpdiag_dst, req->id.tcpdiag_dport, - (struct in6_addr*)req->id.tcpdiag_src, req->id.tcpdiag_sport, - req->id.tcpdiag_if); - } -#endif - else { - return -EINVAL; - } - - if (sk == NULL) - return -ENOENT; - - err = -ESTALE; - if ((req->id.tcpdiag_cookie[0] != TCPDIAG_NOCOOKIE || - req->id.tcpdiag_cookie[1] != TCPDIAG_NOCOOKIE) && - ((u32)(unsigned long)sk != req->id.tcpdiag_cookie[0] || - (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.tcpdiag_cookie[1])) - goto out; - - err = -ENOMEM; - rep = alloc_skb(NLMSG_SPACE(sizeof(struct tcpdiagmsg)+ - sizeof(struct tcpdiag_meminfo)+ - sizeof(struct tcp_info)+64), GFP_KERNEL); - if (!rep) - goto out; - - if (tcpdiag_fill(rep, sk, req->tcpdiag_ext, - NETLINK_CB(in_skb).pid, - nlh->nlmsg_seq, 0) <= 0) - BUG(); - - err = netlink_unicast(tcpnl, rep, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); - if (err > 0) - err = 0; - -out: - if (sk) { - if (sk->sk_state == TCP_TIME_WAIT) - tcp_tw_put((struct tcp_tw_bucket*)sk); - else - sock_put(sk); - } - return err; -} - -static int bitstring_match(const u32 *a1, const u32 *a2, int bits) -{ - int words = bits >> 5; - - bits &= 0x1f; - - if (words) { - if (memcmp(a1, a2, words << 2)) - return 0; - } - if (bits) { - __u32 w1, w2; - __u32 mask; - - w1 = a1[words]; - w2 = a2[words]; - - mask = htonl((0xffffffff) << (32 - bits)); - - if ((w1 ^ w2) & mask) - return 0; - } - - return 1; -} - - -static int tcpdiag_bc_run(const void *bc, int len, - const struct tcpdiag_entry *entry) -{ - while (len > 0) { - int yes = 1; - const struct tcpdiag_bc_op *op = bc; - - switch (op->code) { - case TCPDIAG_BC_NOP: - break; - case TCPDIAG_BC_JMP: - yes = 0; - break; - case TCPDIAG_BC_S_GE: - yes = entry->sport >= op[1].no; - break; - case TCPDIAG_BC_S_LE: - yes = entry->dport <= op[1].no; - break; - case TCPDIAG_BC_D_GE: - yes = entry->dport >= op[1].no; - break; - case TCPDIAG_BC_D_LE: - yes = entry->dport <= op[1].no; - break; - case TCPDIAG_BC_AUTO: - yes = !(entry->userlocks & SOCK_BINDPORT_LOCK); - break; - case TCPDIAG_BC_S_COND: - case TCPDIAG_BC_D_COND: - { - struct tcpdiag_hostcond *cond = (struct tcpdiag_hostcond*)(op+1); - u32 *addr; - - if (cond->port != -1 && - cond->port != (op->code == TCPDIAG_BC_S_COND ? - entry->sport : entry->dport)) { - yes = 0; - break; - } - - if (cond->prefix_len == 0) - break; - - if (op->code == TCPDIAG_BC_S_COND) - addr = entry->saddr; - else - addr = entry->daddr; - - if (bitstring_match(addr, cond->addr, cond->prefix_len)) - break; - if (entry->family == AF_INET6 && - cond->family == AF_INET) { - if (addr[0] == 0 && addr[1] == 0 && - addr[2] == htonl(0xffff) && - bitstring_match(addr+3, cond->addr, cond->prefix_len)) - break; - } - yes = 0; - break; - } - } - - if (yes) { - len -= op->yes; - bc += op->yes; - } else { - len -= op->no; - bc += op->no; - } - } - return (len == 0); -} - -static int valid_cc(const void *bc, int len, int cc) -{ - while (len >= 0) { - const struct tcpdiag_bc_op *op = bc; - - if (cc > len) - return 0; - if (cc == len) - return 1; - if (op->yes < 4) - return 0; - len -= op->yes; - bc += op->yes; - } - return 0; -} - -static int tcpdiag_bc_audit(const void *bytecode, int bytecode_len) -{ - const unsigned char *bc = bytecode; - int len = bytecode_len; - - while (len > 0) { - struct tcpdiag_bc_op *op = (struct tcpdiag_bc_op*)bc; - -//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len); - switch (op->code) { - case TCPDIAG_BC_AUTO: - case TCPDIAG_BC_S_COND: - case TCPDIAG_BC_D_COND: - case TCPDIAG_BC_S_GE: - case TCPDIAG_BC_S_LE: - case TCPDIAG_BC_D_GE: - case TCPDIAG_BC_D_LE: - if (op->yes < 4 || op->yes > len+4) - return -EINVAL; - case TCPDIAG_BC_JMP: - if (op->no < 4 || op->no > len+4) - return -EINVAL; - if (op->no < len && - !valid_cc(bytecode, bytecode_len, len-op->no)) - return -EINVAL; - break; - case TCPDIAG_BC_NOP: - if (op->yes < 4 || op->yes > len+4) - return -EINVAL; - break; - default: - return -EINVAL; - } - bc += op->yes; - len -= op->yes; - } - return len == 0 ? 0 : -EINVAL; -} - -static int tcpdiag_dump_sock(struct sk_buff *skb, struct sock *sk, - struct netlink_callback *cb) -{ - struct tcpdiagreq *r = NLMSG_DATA(cb->nlh); - - if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { - struct tcpdiag_entry entry; - struct rtattr *bc = (struct rtattr *)(r + 1); - struct inet_sock *inet = inet_sk(sk); - - entry.family = sk->sk_family; -#ifdef CONFIG_IP_TCPDIAG_IPV6 - if (entry.family == AF_INET6) { - struct ipv6_pinfo *np = inet6_sk(sk); - - entry.saddr = np->rcv_saddr.s6_addr32; - entry.daddr = np->daddr.s6_addr32; - } else -#endif - { - entry.saddr = &inet->rcv_saddr; - entry.daddr = &inet->daddr; - } - entry.sport = inet->num; - entry.dport = ntohs(inet->dport); - entry.userlocks = sk->sk_userlocks; - - if (!tcpdiag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry)) - return 0; - } - - return tcpdiag_fill(skb, sk, r->tcpdiag_ext, NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, NLM_F_MULTI); } -static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk, - struct request_sock *req, - u32 pid, u32 seq) -{ - const struct inet_request_sock *ireq = inet_rsk(req); - struct inet_sock *inet = inet_sk(sk); - unsigned char *b = skb->tail; - struct tcpdiagmsg *r; - struct nlmsghdr *nlh; - long tmo; - - nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r)); - nlh->nlmsg_flags = NLM_F_MULTI; - r = NLMSG_DATA(nlh); - - r->tcpdiag_family = sk->sk_family; - r->tcpdiag_state = TCP_SYN_RECV; - r->tcpdiag_timer = 1; - r->tcpdiag_retrans = req->retrans; - - r->id.tcpdiag_if = sk->sk_bound_dev_if; - r->id.tcpdiag_cookie[0] = (u32)(unsigned long)req; - r->id.tcpdiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1); - - tmo = req->expires - jiffies; - if (tmo < 0) - tmo = 0; - - r->id.tcpdiag_sport = inet->sport; - r->id.tcpdiag_dport = ireq->rmt_port; - r->id.tcpdiag_src[0] = ireq->loc_addr; - r->id.tcpdiag_dst[0] = ireq->rmt_addr; - r->tcpdiag_expires = jiffies_to_msecs(tmo), - r->tcpdiag_rqueue = 0; - r->tcpdiag_wqueue = 0; - r->tcpdiag_uid = sock_i_uid(sk); - r->tcpdiag_inode = 0; -#ifdef CONFIG_IP_TCPDIAG_IPV6 - if (r->tcpdiag_family == AF_INET6) { - ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src, - &tcp6_rsk(req)->loc_addr); - ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst, - &tcp6_rsk(req)->rmt_addr); - } -#endif - nlh->nlmsg_len = skb->tail - b; - - return skb->len; - -nlmsg_failure: - skb_trim(skb, b - skb->data); - return -1; -} - -static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk, - struct netlink_callback *cb) -{ - struct tcpdiag_entry entry; - struct tcpdiagreq *r = NLMSG_DATA(cb->nlh); - struct tcp_sock *tp = tcp_sk(sk); - struct listen_sock *lopt; - struct rtattr *bc = NULL; - struct inet_sock *inet = inet_sk(sk); - int j, s_j; - int reqnum, s_reqnum; - int err = 0; - - s_j = cb->args[3]; - s_reqnum = cb->args[4]; - - if (s_j > 0) - s_j--; - - entry.family = sk->sk_family; - - read_lock_bh(&tp->accept_queue.syn_wait_lock); - - lopt = tp->accept_queue.listen_opt; - if (!lopt || !lopt->qlen) - goto out; - - if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { - bc = (struct rtattr *)(r + 1); - entry.sport = inet->num; - entry.userlocks = sk->sk_userlocks; - } - - for (j = s_j; j < TCP_SYNQ_HSIZE; j++) { - struct request_sock *req, *head = lopt->syn_table[j]; - - reqnum = 0; - for (req = head; req; reqnum++, req = req->dl_next) { - struct inet_request_sock *ireq = inet_rsk(req); - - if (reqnum < s_reqnum) - continue; - if (r->id.tcpdiag_dport != ireq->rmt_port && - r->id.tcpdiag_dport) - continue; - - if (bc) { - entry.saddr = -#ifdef CONFIG_IP_TCPDIAG_IPV6 - (entry.family == AF_INET6) ? - tcp6_rsk(req)->loc_addr.s6_addr32 : -#endif - &ireq->loc_addr; - entry.daddr = -#ifdef CONFIG_IP_TCPDIAG_IPV6 - (entry.family == AF_INET6) ? - tcp6_rsk(req)->rmt_addr.s6_addr32 : -#endif - &ireq->rmt_addr; - entry.dport = ntohs(ireq->rmt_port); - - if (!tcpdiag_bc_run(RTA_DATA(bc), - RTA_PAYLOAD(bc), &entry)) - continue; - } - - err = tcpdiag_fill_req(skb, sk, req, - NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq); - if (err < 0) { - cb->args[3] = j + 1; - cb->args[4] = reqnum; - goto out; - } - } - - s_reqnum = 0; - } - -out: - read_unlock_bh(&tp->accept_queue.syn_wait_lock); - - return err; -} - -static int tcpdiag_dump(struct sk_buff *skb, struct netlink_callback *cb) -{ - int i, num; - int s_i, s_num; - struct tcpdiagreq *r = NLMSG_DATA(cb->nlh); - - s_i = cb->args[1]; - s_num = num = cb->args[2]; - - if (cb->args[0] == 0) { - if (!(r->tcpdiag_states&(TCPF_LISTEN|TCPF_SYN_RECV))) - goto skip_listen_ht; - tcp_listen_lock(); - for (i = s_i; i < TCP_LHTABLE_SIZE; i++) { - struct sock *sk; - struct hlist_node *node; - - num = 0; - sk_for_each(sk, node, &tcp_listening_hash[i]) { - struct inet_sock *inet = inet_sk(sk); - - if (num < s_num) { - num++; - continue; - } - - if (r->id.tcpdiag_sport != inet->sport && - r->id.tcpdiag_sport) - goto next_listen; - - if (!(r->tcpdiag_states&TCPF_LISTEN) || - r->id.tcpdiag_dport || - cb->args[3] > 0) - goto syn_recv; - - if (tcpdiag_dump_sock(skb, sk, cb) < 0) { - tcp_listen_unlock(); - goto done; - } - -syn_recv: - if (!(r->tcpdiag_states&TCPF_SYN_RECV)) - goto next_listen; - - if (tcpdiag_dump_reqs(skb, sk, cb) < 0) { - tcp_listen_unlock(); - goto done; - } - -next_listen: - cb->args[3] = 0; - cb->args[4] = 0; - ++num; - } - - s_num = 0; - cb->args[3] = 0; - cb->args[4] = 0; - } - tcp_listen_unlock(); -skip_listen_ht: - cb->args[0] = 1; - s_i = num = s_num = 0; - } - - if (!(r->tcpdiag_states&~(TCPF_LISTEN|TCPF_SYN_RECV))) - return skb->len; - - for (i = s_i; i < tcp_ehash_size; i++) { - struct tcp_ehash_bucket *head = &tcp_ehash[i]; - struct sock *sk; - struct hlist_node *node; - - if (i > s_i) - s_num = 0; - - read_lock_bh(&head->lock); - - num = 0; - sk_for_each(sk, node, &head->chain) { - struct inet_sock *inet = inet_sk(sk); - - if (num < s_num) - goto next_normal; - if (!(r->tcpdiag_states & (1 << sk->sk_state))) - goto next_normal; - if (r->id.tcpdiag_sport != inet->sport && - r->id.tcpdiag_sport) - goto next_normal; - if (r->id.tcpdiag_dport != inet->dport && r->id.tcpdiag_dport) - goto next_normal; - if (tcpdiag_dump_sock(skb, sk, cb) < 0) { - read_unlock_bh(&head->lock); - goto done; - } -next_normal: - ++num; - } - - if (r->tcpdiag_states&TCPF_TIME_WAIT) { - sk_for_each(sk, node, - &tcp_ehash[i + tcp_ehash_size].chain) { - struct inet_sock *inet = inet_sk(sk); - - if (num < s_num) - goto next_dying; - if (r->id.tcpdiag_sport != inet->sport && - r->id.tcpdiag_sport) - goto next_dying; - if (r->id.tcpdiag_dport != inet->dport && - r->id.tcpdiag_dport) - goto next_dying; - if (tcpdiag_dump_sock(skb, sk, cb) < 0) { - read_unlock_bh(&head->lock); - goto done; - } -next_dying: - ++num; - } - } - read_unlock_bh(&head->lock); - } - -done: - cb->args[1] = i; - cb->args[2] = num; - return skb->len; -} - -static int tcpdiag_dump_done(struct netlink_callback *cb) -{ - return 0; -} - - -static __inline__ int -tcpdiag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) -{ - if (!(nlh->nlmsg_flags&NLM_F_REQUEST)) - return 0; - - if (nlh->nlmsg_type != TCPDIAG_GETSOCK) - goto err_inval; - - if (NLMSG_LENGTH(sizeof(struct tcpdiagreq)) > skb->len) - goto err_inval; - - if (nlh->nlmsg_flags&NLM_F_DUMP) { - if (nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(struct tcpdiagreq))) { - struct rtattr *rta = (struct rtattr*)(NLMSG_DATA(nlh) + sizeof(struct tcpdiagreq)); - if (rta->rta_type != TCPDIAG_REQ_BYTECODE || - rta->rta_len < 8 || - rta->rta_len > nlh->nlmsg_len - NLMSG_SPACE(sizeof(struct tcpdiagreq))) - goto err_inval; - if (tcpdiag_bc_audit(RTA_DATA(rta), RTA_PAYLOAD(rta))) - goto err_inval; - } - return netlink_dump_start(tcpnl, skb, nlh, - tcpdiag_dump, - tcpdiag_dump_done); - } else { - return tcpdiag_get_exact(skb, nlh); - } - -err_inval: - return -EINVAL; -} - - -static inline void tcpdiag_rcv_skb(struct sk_buff *skb) -{ - int err; - struct nlmsghdr * nlh; - - if (skb->len >= NLMSG_SPACE(0)) { - nlh = (struct nlmsghdr *)skb->data; - if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) - return; - err = tcpdiag_rcv_msg(skb, nlh); - if (err || nlh->nlmsg_flags & NLM_F_ACK) - netlink_ack(skb, nlh, err); - } -} - -static void tcpdiag_rcv(struct sock *sk, int len) -{ - struct sk_buff *skb; - unsigned int qlen = skb_queue_len(&sk->sk_receive_queue); - - while (qlen-- && (skb = skb_dequeue(&sk->sk_receive_queue))) { - tcpdiag_rcv_skb(skb); - kfree_skb(skb); - } -} +static struct inet_diag_handler tcp_diag_handler = { + .idiag_hashinfo = &tcp_hashinfo, + .idiag_get_info = tcp_diag_get_info, + .idiag_type = TCPDIAG_GETSOCK, + .idiag_info_size = sizeof(struct tcp_info), +}; -static int __init tcpdiag_init(void) +static int __init tcp_diag_init(void) { - tcpnl = netlink_kernel_create(NETLINK_TCPDIAG, tcpdiag_rcv); - if (tcpnl == NULL) - return -ENOMEM; - return 0; + return inet_diag_register(&tcp_diag_handler); } -static void __exit tcpdiag_exit(void) +static void __exit tcp_diag_exit(void) { - sock_release(tcpnl->sk_socket); + inet_diag_unregister(&tcp_diag_handler); } -module_init(tcpdiag_init); -module_exit(tcpdiag_exit); +module_init(tcp_diag_init); +module_exit(tcp_diag_exit); MODULE_LICENSE("GPL"); diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 36c51f8136b..6acc04bde08 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -98,9 +98,10 @@ struct hstcp { u32 ai; }; -static void hstcp_init(struct tcp_sock *tp) +static void hstcp_init(struct sock *sk) { - struct hstcp *ca = tcp_ca(tp); + struct tcp_sock *tp = tcp_sk(sk); + struct hstcp *ca = inet_csk_ca(sk); ca->ai = 0; @@ -109,10 +110,11 @@ static void hstcp_init(struct tcp_sock *tp) tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); } -static void hstcp_cong_avoid(struct tcp_sock *tp, u32 adk, u32 rtt, +static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt, u32 in_flight, int good) { - struct hstcp *ca = tcp_ca(tp); + struct tcp_sock *tp = tcp_sk(sk); + struct hstcp *ca = inet_csk_ca(sk); if (in_flight < tp->snd_cwnd) return; @@ -143,9 +145,10 @@ static void hstcp_cong_avoid(struct tcp_sock *tp, u32 adk, u32 rtt, } } -static u32 hstcp_ssthresh(struct tcp_sock *tp) +static u32 hstcp_ssthresh(struct sock *sk) { - struct hstcp *ca = tcp_ca(tp); + const struct tcp_sock *tp = tcp_sk(sk); + const struct hstcp *ca = inet_csk_ca(sk); /* Do multiplicative decrease */ return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U); @@ -164,7 +167,7 @@ static struct tcp_congestion_ops tcp_highspeed = { static int __init hstcp_register(void) { - BUG_ON(sizeof(struct hstcp) > TCP_CA_PRIV_SIZE); + BUG_ON(sizeof(struct hstcp) > ICSK_CA_PRIV_SIZE); return tcp_register_congestion_control(&tcp_highspeed); } diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 40168275acf..e47b37984e9 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -55,18 +55,21 @@ static inline void htcp_reset(struct htcp *ca) ca->snd_cwnd_cnt2 = 0; } -static u32 htcp_cwnd_undo(struct tcp_sock *tp) +static u32 htcp_cwnd_undo(struct sock *sk) { - struct htcp *ca = tcp_ca(tp); + const struct tcp_sock *tp = tcp_sk(sk); + struct htcp *ca = inet_csk_ca(sk); ca->ccount = ca->undo_ccount; ca->maxRTT = ca->undo_maxRTT; ca->old_maxB = ca->undo_old_maxB; return max(tp->snd_cwnd, (tp->snd_ssthresh<<7)/ca->beta); } -static inline void measure_rtt(struct tcp_sock *tp) +static inline void measure_rtt(struct sock *sk) { - struct htcp *ca = tcp_ca(tp); + const struct inet_connection_sock *icsk = inet_csk(sk); + const struct tcp_sock *tp = tcp_sk(sk); + struct htcp *ca = inet_csk_ca(sk); u32 srtt = tp->srtt>>3; /* keep track of minimum RTT seen so far, minRTT is zero at first */ @@ -74,7 +77,7 @@ static inline void measure_rtt(struct tcp_sock *tp) ca->minRTT = srtt; /* max RTT */ - if (tp->ca_state == TCP_CA_Open && tp->snd_ssthresh < 0xFFFF && ca->ccount > 3) { + if (icsk->icsk_ca_state == TCP_CA_Open && tp->snd_ssthresh < 0xFFFF && ca->ccount > 3) { if (ca->maxRTT < ca->minRTT) ca->maxRTT = ca->minRTT; if (ca->maxRTT < srtt && srtt <= ca->maxRTT+HZ/50) @@ -82,13 +85,16 @@ static inline void measure_rtt(struct tcp_sock *tp) } } -static void measure_achieved_throughput(struct tcp_sock *tp, u32 pkts_acked) +static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked) { - struct htcp *ca = tcp_ca(tp); + const struct inet_connection_sock *icsk = inet_csk(sk); + const struct tcp_sock *tp = tcp_sk(sk); + struct htcp *ca = inet_csk_ca(sk); u32 now = tcp_time_stamp; /* achieved throughput calculations */ - if (tp->ca_state != TCP_CA_Open && tp->ca_state != TCP_CA_Disorder) { + if (icsk->icsk_ca_state != TCP_CA_Open && + icsk->icsk_ca_state != TCP_CA_Disorder) { ca->packetcount = 0; ca->lasttime = now; return; @@ -173,9 +179,9 @@ static inline void htcp_alpha_update(struct htcp *ca) * that point do we really have a real sense of maxRTT (the queues en route * were getting just too full now). */ -static void htcp_param_update(struct tcp_sock *tp) +static void htcp_param_update(struct sock *sk) { - struct htcp *ca = tcp_ca(tp); + struct htcp *ca = inet_csk_ca(sk); u32 minRTT = ca->minRTT; u32 maxRTT = ca->maxRTT; @@ -187,17 +193,19 @@ static void htcp_param_update(struct tcp_sock *tp) ca->maxRTT = minRTT + ((maxRTT-minRTT)*95)/100; } -static u32 htcp_recalc_ssthresh(struct tcp_sock *tp) +static u32 htcp_recalc_ssthresh(struct sock *sk) { - struct htcp *ca = tcp_ca(tp); - htcp_param_update(tp); + const struct tcp_sock *tp = tcp_sk(sk); + const struct htcp *ca = inet_csk_ca(sk); + htcp_param_update(sk); return max((tp->snd_cwnd * ca->beta) >> 7, 2U); } -static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, +static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, int data_acked) { - struct htcp *ca = tcp_ca(tp); + struct tcp_sock *tp = tcp_sk(sk); + struct htcp *ca = inet_csk_ca(sk); if (in_flight < tp->snd_cwnd) return; @@ -207,7 +215,7 @@ static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, if (tp->snd_cwnd < tp->snd_cwnd_clamp) tp->snd_cwnd++; } else { - measure_rtt(tp); + measure_rtt(sk); /* keep track of number of round-trip times since last backoff event */ if (ca->snd_cwnd_cnt2++ > tp->snd_cwnd) { @@ -229,28 +237,29 @@ static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, } /* Lower bound on congestion window. */ -static u32 htcp_min_cwnd(struct tcp_sock *tp) +static u32 htcp_min_cwnd(struct sock *sk) { + const struct tcp_sock *tp = tcp_sk(sk); return tp->snd_ssthresh; } -static void htcp_init(struct tcp_sock *tp) +static void htcp_init(struct sock *sk) { - struct htcp *ca = tcp_ca(tp); + struct htcp *ca = inet_csk_ca(sk); memset(ca, 0, sizeof(struct htcp)); ca->alpha = ALPHA_BASE; ca->beta = BETA_MIN; } -static void htcp_state(struct tcp_sock *tp, u8 new_state) +static void htcp_state(struct sock *sk, u8 new_state) { switch (new_state) { case TCP_CA_CWR: case TCP_CA_Recovery: case TCP_CA_Loss: - htcp_reset(tcp_ca(tp)); + htcp_reset(inet_csk_ca(sk)); break; } } @@ -269,7 +278,7 @@ static struct tcp_congestion_ops htcp = { static int __init htcp_register(void) { - BUG_ON(sizeof(struct htcp) > TCP_CA_PRIV_SIZE); + BUG_ON(sizeof(struct htcp) > ICSK_CA_PRIV_SIZE); BUILD_BUG_ON(BETA_MIN >= BETA_MAX); if (!use_bandwidth_switch) htcp.pkts_acked = NULL; diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index 13a66342c30..77add63623d 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c @@ -33,19 +33,20 @@ MODULE_PARM_DESC(rtt0, "reference rout trip time (ms)"); /* This is called to refresh values for hybla parameters */ -static inline void hybla_recalc_param (struct tcp_sock *tp) +static inline void hybla_recalc_param (struct sock *sk) { - struct hybla *ca = tcp_ca(tp); + struct hybla *ca = inet_csk_ca(sk); - ca->rho_3ls = max_t(u32, tp->srtt / msecs_to_jiffies(rtt0), 8); + ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt / msecs_to_jiffies(rtt0), 8); ca->rho = ca->rho_3ls >> 3; ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1; ca->rho2 = ca->rho2_7ls >>7; } -static void hybla_init(struct tcp_sock *tp) +static void hybla_init(struct sock *sk) { - struct hybla *ca = tcp_ca(tp); + struct tcp_sock *tp = tcp_sk(sk); + struct hybla *ca = inet_csk_ca(sk); ca->rho = 0; ca->rho2 = 0; @@ -57,17 +58,16 @@ static void hybla_init(struct tcp_sock *tp) tp->snd_cwnd_clamp = 65535; /* 1st Rho measurement based on initial srtt */ - hybla_recalc_param(tp); + hybla_recalc_param(sk); /* set minimum rtt as this is the 1st ever seen */ ca->minrtt = tp->srtt; tp->snd_cwnd = ca->rho; } -static void hybla_state(struct tcp_sock *tp, u8 ca_state) +static void hybla_state(struct sock *sk, u8 ca_state) { - struct hybla *ca = tcp_ca(tp); - + struct hybla *ca = inet_csk_ca(sk); ca->hybla_en = (ca_state == TCP_CA_Open); } @@ -86,27 +86,28 @@ static inline u32 hybla_fraction(u32 odds) * o Give cwnd a new value based on the model proposed * o remember increments <1 */ -static void hybla_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, +static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, int flag) { - struct hybla *ca = tcp_ca(tp); + struct tcp_sock *tp = tcp_sk(sk); + struct hybla *ca = inet_csk_ca(sk); u32 increment, odd, rho_fractions; int is_slowstart = 0; /* Recalculate rho only if this srtt is the lowest */ if (tp->srtt < ca->minrtt){ - hybla_recalc_param(tp); + hybla_recalc_param(sk); ca->minrtt = tp->srtt; } if (!ca->hybla_en) - return tcp_reno_cong_avoid(tp, ack, rtt, in_flight, flag); + return tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag); if (in_flight < tp->snd_cwnd) return; if (ca->rho == 0) - hybla_recalc_param(tp); + hybla_recalc_param(sk); rho_fractions = ca->rho_3ls - (ca->rho << 3); @@ -170,7 +171,7 @@ static struct tcp_congestion_ops tcp_hybla = { static int __init hybla_register(void) { - BUG_ON(sizeof(struct hybla) > TCP_CA_PRIV_SIZE); + BUG_ON(sizeof(struct hybla) > ICSK_CA_PRIV_SIZE); return tcp_register_congestion_control(&tcp_hybla); } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 53a8a5399f1..1afb080bdf0 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -114,20 +114,21 @@ int sysctl_tcp_moderate_rcvbuf = 1; /* Adapt the MSS value used to make delayed ack decision to the * real world. */ -static inline void tcp_measure_rcv_mss(struct tcp_sock *tp, - struct sk_buff *skb) +static inline void tcp_measure_rcv_mss(struct sock *sk, + const struct sk_buff *skb) { - unsigned int len, lss; + struct inet_connection_sock *icsk = inet_csk(sk); + const unsigned int lss = icsk->icsk_ack.last_seg_size; + unsigned int len; - lss = tp->ack.last_seg_size; - tp->ack.last_seg_size = 0; + icsk->icsk_ack.last_seg_size = 0; /* skb->len may jitter because of SACKs, even if peer * sends good full-sized frames. */ len = skb->len; - if (len >= tp->ack.rcv_mss) { - tp->ack.rcv_mss = len; + if (len >= icsk->icsk_ack.rcv_mss) { + icsk->icsk_ack.rcv_mss = len; } else { /* Otherwise, we make more careful check taking into account, * that SACKs block is variable. @@ -147,41 +148,44 @@ static inline void tcp_measure_rcv_mss(struct tcp_sock *tp, * tcp header plus fixed timestamp option length. * Resulting "len" is MSS free of SACK jitter. */ - len -= tp->tcp_header_len; - tp->ack.last_seg_size = len; + len -= tcp_sk(sk)->tcp_header_len; + icsk->icsk_ack.last_seg_size = len; if (len == lss) { - tp->ack.rcv_mss = len; + icsk->icsk_ack.rcv_mss = len; return; } } - tp->ack.pending |= TCP_ACK_PUSHED; + icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; } } -static void tcp_incr_quickack(struct tcp_sock *tp) +static void tcp_incr_quickack(struct sock *sk) { - unsigned quickacks = tp->rcv_wnd/(2*tp->ack.rcv_mss); + struct inet_connection_sock *icsk = inet_csk(sk); + unsigned quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss); if (quickacks==0) quickacks=2; - if (quickacks > tp->ack.quick) - tp->ack.quick = min(quickacks, TCP_MAX_QUICKACKS); + if (quickacks > icsk->icsk_ack.quick) + icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS); } -void tcp_enter_quickack_mode(struct tcp_sock *tp) +void tcp_enter_quickack_mode(struct sock *sk) { - tcp_incr_quickack(tp); - tp->ack.pingpong = 0; - tp->ack.ato = TCP_ATO_MIN; + struct inet_connection_sock *icsk = inet_csk(sk); + tcp_incr_quickack(sk); + icsk->icsk_ack.pingpong = 0; + icsk->icsk_ack.ato = TCP_ATO_MIN; } /* Send ACKs quickly, if "quick" count is not exhausted * and the session is not interactive. */ -static __inline__ int tcp_in_quickack_mode(struct tcp_sock *tp) +static inline int tcp_in_quickack_mode(const struct sock *sk) { - return (tp->ack.quick && !tp->ack.pingpong); + const struct inet_connection_sock *icsk = inet_csk(sk); + return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong; } /* Buffer size and advertised window tuning. @@ -224,8 +228,8 @@ static void tcp_fixup_sndbuf(struct sock *sk) */ /* Slow part of check#2. */ -static int __tcp_grow_window(struct sock *sk, struct tcp_sock *tp, - struct sk_buff *skb) +static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp, + const struct sk_buff *skb) { /* Optimize this! */ int truesize = tcp_win_from_space(skb->truesize)/2; @@ -233,7 +237,7 @@ static int __tcp_grow_window(struct sock *sk, struct tcp_sock *tp, while (tp->rcv_ssthresh <= window) { if (truesize <= skb->len) - return 2*tp->ack.rcv_mss; + return 2 * inet_csk(sk)->icsk_ack.rcv_mss; truesize >>= 1; window >>= 1; @@ -260,7 +264,7 @@ static inline void tcp_grow_window(struct sock *sk, struct tcp_sock *tp, if (incr) { tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp); - tp->ack.quick |= 1; + inet_csk(sk)->icsk_ack.quick |= 1; } } } @@ -321,11 +325,12 @@ static void tcp_init_buffer_space(struct sock *sk) /* 5. Recalculate window clamp after socket hit its memory bounds. */ static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) { + struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *skb; unsigned int app_win = tp->rcv_nxt - tp->copied_seq; int ofo_win = 0; - tp->ack.quick = 0; + icsk->icsk_ack.quick = 0; skb_queue_walk(&tp->out_of_order_queue, skb) { ofo_win += skb->len; @@ -346,8 +351,8 @@ static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) app_win += ofo_win; if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf) app_win >>= 1; - if (app_win > tp->ack.rcv_mss) - app_win -= tp->ack.rcv_mss; + if (app_win > icsk->icsk_ack.rcv_mss) + app_win -= icsk->icsk_ack.rcv_mss; app_win = max(app_win, 2U*tp->advmss); if (!ofo_win) @@ -415,11 +420,12 @@ new_measure: tp->rcv_rtt_est.time = tcp_time_stamp; } -static inline void tcp_rcv_rtt_measure_ts(struct tcp_sock *tp, struct sk_buff *skb) +static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, const struct sk_buff *skb) { + struct tcp_sock *tp = tcp_sk(sk); if (tp->rx_opt.rcv_tsecr && (TCP_SKB_CB(skb)->end_seq - - TCP_SKB_CB(skb)->seq >= tp->ack.rcv_mss)) + TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0); } @@ -492,41 +498,42 @@ new_measure: */ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) { + struct inet_connection_sock *icsk = inet_csk(sk); u32 now; - tcp_schedule_ack(tp); + inet_csk_schedule_ack(sk); - tcp_measure_rcv_mss(tp, skb); + tcp_measure_rcv_mss(sk, skb); tcp_rcv_rtt_measure(tp); now = tcp_time_stamp; - if (!tp->ack.ato) { + if (!icsk->icsk_ack.ato) { /* The _first_ data packet received, initialize * delayed ACK engine. */ - tcp_incr_quickack(tp); - tp->ack.ato = TCP_ATO_MIN; + tcp_incr_quickack(sk); + icsk->icsk_ack.ato = TCP_ATO_MIN; } else { - int m = now - tp->ack.lrcvtime; + int m = now - icsk->icsk_ack.lrcvtime; if (m <= TCP_ATO_MIN/2) { /* The fastest case is the first. */ - tp->ack.ato = (tp->ack.ato>>1) + TCP_ATO_MIN/2; - } else if (m < tp->ack.ato) { - tp->ack.ato = (tp->ack.ato>>1) + m; - if (tp->ack.ato > tp->rto) - tp->ack.ato = tp->rto; - } else if (m > tp->rto) { + icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2; + } else if (m < icsk->icsk_ack.ato) { + icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m; + if (icsk->icsk_ack.ato > icsk->icsk_rto) + icsk->icsk_ack.ato = icsk->icsk_rto; + } else if (m > icsk->icsk_rto) { /* Too long gap. Apparently sender falled to * restart window, so that we send ACKs quickly. */ - tcp_incr_quickack(tp); + tcp_incr_quickack(sk); sk_stream_mem_reclaim(sk); } } - tp->ack.lrcvtime = now; + icsk->icsk_ack.lrcvtime = now; TCP_ECN_check_ce(tp, skb); @@ -543,8 +550,10 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_ * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ -static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt) +static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt) { + struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); long m = mrtt; /* RTT */ /* The following amusing code comes from Jacobson's @@ -604,15 +613,16 @@ static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt) tp->rtt_seq = tp->snd_nxt; } - if (tp->ca_ops->rtt_sample) - tp->ca_ops->rtt_sample(tp, *usrtt); + if (icsk->icsk_ca_ops->rtt_sample) + icsk->icsk_ca_ops->rtt_sample(sk, *usrtt); } /* Calculate rto without backoff. This is the second half of Van Jacobson's * routine referred to above. */ -static inline void tcp_set_rto(struct tcp_sock *tp) +static inline void tcp_set_rto(struct sock *sk) { + const struct tcp_sock *tp = tcp_sk(sk); /* Old crap is replaced with new one. 8) * * More seriously: @@ -623,7 +633,7 @@ static inline void tcp_set_rto(struct tcp_sock *tp) * is invisible. Actually, Linux-2.4 also generates erratic * ACKs in some curcumstances. */ - tp->rto = (tp->srtt >> 3) + tp->rttvar; + inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar; /* 2. Fixups made earlier cannot be right. * If we do not estimate RTO correctly without them, @@ -635,10 +645,10 @@ static inline void tcp_set_rto(struct tcp_sock *tp) /* NOTE: clamping at TCP_RTO_MIN is not required, current algo * guarantees that rto is higher. */ -static inline void tcp_bound_rto(struct tcp_sock *tp) +static inline void tcp_bound_rto(struct sock *sk) { - if (tp->rto > TCP_RTO_MAX) - tp->rto = TCP_RTO_MAX; + if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX) + inet_csk(sk)->icsk_rto = TCP_RTO_MAX; } /* Save metrics learned by this TCP session. @@ -656,9 +666,10 @@ void tcp_update_metrics(struct sock *sk) dst_confirm(dst); if (dst && (dst->flags&DST_HOST)) { + const struct inet_connection_sock *icsk = inet_csk(sk); int m; - if (tp->backoff || !tp->srtt) { + if (icsk->icsk_backoff || !tp->srtt) { /* This session failed to estimate rtt. Why? * Probably, no packets returned in time. * Reset our results. @@ -707,7 +718,7 @@ void tcp_update_metrics(struct sock *sk) tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) dst->metrics[RTAX_CWND-1] = tp->snd_cwnd; } else if (tp->snd_cwnd > tp->snd_ssthresh && - tp->ca_state == TCP_CA_Open) { + icsk->icsk_ca_state == TCP_CA_Open) { /* Cong. avoidance phase, cwnd is reliable. */ if (!dst_metric_locked(dst, RTAX_SSTHRESH)) dst->metrics[RTAX_SSTHRESH-1] = @@ -801,9 +812,9 @@ static void tcp_init_metrics(struct sock *sk) tp->mdev = dst_metric(dst, RTAX_RTTVAR); tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN); } - tcp_set_rto(tp); - tcp_bound_rto(tp); - if (tp->rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) + tcp_set_rto(sk); + tcp_bound_rto(sk); + if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) goto reset; tp->snd_cwnd = tcp_init_cwnd(tp, dst); tp->snd_cwnd_stamp = tcp_time_stamp; @@ -817,12 +828,14 @@ reset: if (!tp->rx_opt.saw_tstamp && tp->srtt) { tp->srtt = 0; tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT; - tp->rto = TCP_TIMEOUT_INIT; + inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; } } -static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts) +static void tcp_update_reordering(struct sock *sk, const int metric, + const int ts) { + struct tcp_sock *tp = tcp_sk(sk); if (metric > tp->reordering) { tp->reordering = min(TCP_MAX_REORDERING, metric); @@ -837,7 +850,7 @@ static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts) NET_INC_STATS_BH(LINUX_MIB_TCPSACKREORDER); #if FASTRETRANS_DEBUG > 1 printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n", - tp->rx_opt.sack_ok, tp->ca_state, + tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, tp->reordering, tp->fackets_out, tp->sacked_out, @@ -899,6 +912,7 @@ static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts) static int tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una) { + const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); unsigned char *ptr = ack_skb->h.raw + TCP_SKB_CB(ack_skb)->sacked; struct tcp_sack_block *sp = (struct tcp_sack_block *)(ptr+2); @@ -1064,7 +1078,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ * we have to account for reordering! Ugly, * but should help. */ - if (lost_retrans && tp->ca_state == TCP_CA_Recovery) { + if (lost_retrans && icsk->icsk_ca_state == TCP_CA_Recovery) { struct sk_buff *skb; sk_stream_for_retrans_queue(skb, sk) { @@ -1093,8 +1107,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ tp->left_out = tp->sacked_out + tp->lost_out; - if ((reord < tp->fackets_out) && tp->ca_state != TCP_CA_Loss) - tcp_update_reordering(tp, ((tp->fackets_out + 1) - reord), 0); + if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss) + tcp_update_reordering(sk, ((tp->fackets_out + 1) - reord), 0); #if FASTRETRANS_DEBUG > 0 BUG_TRAP((int)tp->sacked_out >= 0); @@ -1111,17 +1125,18 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ */ void tcp_enter_frto(struct sock *sk) { + const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; tp->frto_counter = 1; - if (tp->ca_state <= TCP_CA_Disorder || + if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || - (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { - tp->prior_ssthresh = tcp_current_ssthresh(tp); - tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); - tcp_ca_event(tp, CA_EVENT_FRTO); + (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { + tp->prior_ssthresh = tcp_current_ssthresh(sk); + tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); + tcp_ca_event(sk, CA_EVENT_FRTO); } /* Have to clear retransmission markers here to keep the bookkeeping @@ -1138,7 +1153,7 @@ void tcp_enter_frto(struct sock *sk) } tcp_sync_left_out(tp); - tcp_set_ca_state(tp, TCP_CA_Open); + tcp_set_ca_state(sk, TCP_CA_Open); tp->frto_highmark = tp->snd_nxt; } @@ -1184,7 +1199,7 @@ static void tcp_enter_frto_loss(struct sock *sk) tp->reordering = min_t(unsigned int, tp->reordering, sysctl_tcp_reordering); - tcp_set_ca_state(tp, TCP_CA_Loss); + tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->frto_highmark; TCP_ECN_queue_cwr(tp); } @@ -1208,16 +1223,17 @@ void tcp_clear_retrans(struct tcp_sock *tp) */ void tcp_enter_loss(struct sock *sk, int how) { + const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; int cnt = 0; /* Reduce ssthresh if it has not yet been made inside this window. */ - if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || - (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { - tp->prior_ssthresh = tcp_current_ssthresh(tp); - tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); - tcp_ca_event(tp, CA_EVENT_LOSS); + if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || + (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { + tp->prior_ssthresh = tcp_current_ssthresh(sk); + tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); + tcp_ca_event(sk, CA_EVENT_LOSS); } tp->snd_cwnd = 1; tp->snd_cwnd_cnt = 0; @@ -1248,12 +1264,12 @@ void tcp_enter_loss(struct sock *sk, int how) tp->reordering = min_t(unsigned int, tp->reordering, sysctl_tcp_reordering); - tcp_set_ca_state(tp, TCP_CA_Loss); + tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; TCP_ECN_queue_cwr(tp); } -static int tcp_check_sack_reneging(struct sock *sk, struct tcp_sock *tp) +static int tcp_check_sack_reneging(struct sock *sk) { struct sk_buff *skb; @@ -1265,12 +1281,14 @@ static int tcp_check_sack_reneging(struct sock *sk, struct tcp_sock *tp) */ if ((skb = skb_peek(&sk->sk_write_queue)) != NULL && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { + struct inet_connection_sock *icsk = inet_csk(sk); NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING); tcp_enter_loss(sk, 1); - tp->retransmits++; + icsk->icsk_retransmits++; tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)); - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + icsk->icsk_rto, TCP_RTO_MAX); return 1; } return 0; @@ -1281,15 +1299,15 @@ static inline int tcp_fackets_out(struct tcp_sock *tp) return IsReno(tp) ? tp->sacked_out+1 : tp->fackets_out; } -static inline int tcp_skb_timedout(struct tcp_sock *tp, struct sk_buff *skb) +static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) { - return (tcp_time_stamp - TCP_SKB_CB(skb)->when > tp->rto); + return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto); } static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp) { return tp->packets_out && - tcp_skb_timedout(tp, skb_peek(&sk->sk_write_queue)); + tcp_skb_timedout(sk, skb_peek(&sk->sk_write_queue)); } /* Linux NewReno/SACK/FACK/ECN state machine. @@ -1423,8 +1441,9 @@ static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp) * in assumption of absent reordering, interpret this as reordering. * The only another reason could be bug in receiver TCP. */ -static void tcp_check_reno_reordering(struct tcp_sock *tp, int addend) +static void tcp_check_reno_reordering(struct sock *sk, const int addend) { + struct tcp_sock *tp = tcp_sk(sk); u32 holes; holes = max(tp->lost_out, 1U); @@ -1432,16 +1451,17 @@ static void tcp_check_reno_reordering(struct tcp_sock *tp, int addend) if ((tp->sacked_out + holes) > tp->packets_out) { tp->sacked_out = tp->packets_out - holes; - tcp_update_reordering(tp, tp->packets_out+addend, 0); + tcp_update_reordering(sk, tp->packets_out + addend, 0); } } /* Emulate SACKs for SACKless connection: account for a new dupack. */ -static void tcp_add_reno_sack(struct tcp_sock *tp) +static void tcp_add_reno_sack(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); tp->sacked_out++; - tcp_check_reno_reordering(tp, 0); + tcp_check_reno_reordering(sk, 0); tcp_sync_left_out(tp); } @@ -1456,7 +1476,7 @@ static void tcp_remove_reno_sacks(struct sock *sk, struct tcp_sock *tp, int acke else tp->sacked_out -= acked-1; } - tcp_check_reno_reordering(tp, acked); + tcp_check_reno_reordering(sk, acked); tcp_sync_left_out(tp); } @@ -1509,7 +1529,7 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp) struct sk_buff *skb; sk_stream_for_retrans_queue(skb, sk) { - if (tcp_skb_timedout(tp, skb) && + if (tcp_skb_timedout(sk, skb) && !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); @@ -1530,14 +1550,16 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp) } /* Decrease cwnd each second ack. */ -static void tcp_cwnd_down(struct tcp_sock *tp) +static void tcp_cwnd_down(struct sock *sk) { + const struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); int decr = tp->snd_cwnd_cnt + 1; tp->snd_cwnd_cnt = decr&1; decr >>= 1; - if (decr && tp->snd_cwnd > tp->ca_ops->min_cwnd(tp)) + if (decr && tp->snd_cwnd > icsk->icsk_ca_ops->min_cwnd(sk)) tp->snd_cwnd -= decr; tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); @@ -1571,11 +1593,15 @@ static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg) #define DBGUNDO(x...) do { } while (0) #endif -static void tcp_undo_cwr(struct tcp_sock *tp, int undo) +static void tcp_undo_cwr(struct sock *sk, const int undo) { + struct tcp_sock *tp = tcp_sk(sk); + if (tp->prior_ssthresh) { - if (tp->ca_ops->undo_cwnd) - tp->snd_cwnd = tp->ca_ops->undo_cwnd(tp); + const struct inet_connection_sock *icsk = inet_csk(sk); + + if (icsk->icsk_ca_ops->undo_cwnd) + tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk); else tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); @@ -1603,9 +1629,9 @@ static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp) /* Happy end! We did not retransmit anything * or our original transmission succeeded. */ - DBGUNDO(sk, tp, tp->ca_state == TCP_CA_Loss ? "loss" : "retrans"); - tcp_undo_cwr(tp, 1); - if (tp->ca_state == TCP_CA_Loss) + DBGUNDO(sk, tp, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans"); + tcp_undo_cwr(sk, 1); + if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); else NET_INC_STATS_BH(LINUX_MIB_TCPFULLUNDO); @@ -1618,7 +1644,7 @@ static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp) tcp_moderate_cwnd(tp); return 1; } - tcp_set_ca_state(tp, TCP_CA_Open); + tcp_set_ca_state(sk, TCP_CA_Open); return 0; } @@ -1627,7 +1653,7 @@ static void tcp_try_undo_dsack(struct sock *sk, struct tcp_sock *tp) { if (tp->undo_marker && !tp->undo_retrans) { DBGUNDO(sk, tp, "D-SACK"); - tcp_undo_cwr(tp, 1); + tcp_undo_cwr(sk, 1); tp->undo_marker = 0; NET_INC_STATS_BH(LINUX_MIB_TCPDSACKUNDO); } @@ -1648,10 +1674,10 @@ static int tcp_try_undo_partial(struct sock *sk, struct tcp_sock *tp, if (tp->retrans_out == 0) tp->retrans_stamp = 0; - tcp_update_reordering(tp, tcp_fackets_out(tp)+acked, 1); + tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); DBGUNDO(sk, tp, "Hoe"); - tcp_undo_cwr(tp, 0); + tcp_undo_cwr(sk, 0); NET_INC_STATS_BH(LINUX_MIB_TCPPARTIALUNDO); /* So... Do not make Hoe's retransmit yet. @@ -1674,22 +1700,23 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp) DBGUNDO(sk, tp, "partial loss"); tp->lost_out = 0; tp->left_out = tp->sacked_out; - tcp_undo_cwr(tp, 1); + tcp_undo_cwr(sk, 1); NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); - tp->retransmits = 0; + inet_csk(sk)->icsk_retransmits = 0; tp->undo_marker = 0; if (!IsReno(tp)) - tcp_set_ca_state(tp, TCP_CA_Open); + tcp_set_ca_state(sk, TCP_CA_Open); return 1; } return 0; } -static inline void tcp_complete_cwr(struct tcp_sock *tp) +static inline void tcp_complete_cwr(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); tp->snd_cwnd_stamp = tcp_time_stamp; - tcp_ca_event(tp, CA_EVENT_COMPLETE_CWR); + tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); } static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) @@ -1700,21 +1727,21 @@ static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) tp->retrans_stamp = 0; if (flag&FLAG_ECE) - tcp_enter_cwr(tp); + tcp_enter_cwr(sk); - if (tp->ca_state != TCP_CA_CWR) { + if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { int state = TCP_CA_Open; if (tp->left_out || tp->retrans_out || tp->undo_marker) state = TCP_CA_Disorder; - if (tp->ca_state != state) { - tcp_set_ca_state(tp, state); + if (inet_csk(sk)->icsk_ca_state != state) { + tcp_set_ca_state(sk, state); tp->high_seq = tp->snd_nxt; } tcp_moderate_cwnd(tp); } else { - tcp_cwnd_down(tp); + tcp_cwnd_down(sk); } } @@ -1733,6 +1760,7 @@ static void tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, int prior_packets, int flag) { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int is_dupack = (tp->snd_una == prior_snd_una && !(flag&FLAG_NOT_DUP)); @@ -1750,13 +1778,13 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, tp->prior_ssthresh = 0; /* B. In all the states check for reneging SACKs. */ - if (tp->sacked_out && tcp_check_sack_reneging(sk, tp)) + if (tp->sacked_out && tcp_check_sack_reneging(sk)) return; /* C. Process data loss notification, provided it is valid. */ if ((flag&FLAG_DATA_LOST) && before(tp->snd_una, tp->high_seq) && - tp->ca_state != TCP_CA_Open && + icsk->icsk_ca_state != TCP_CA_Open && tp->fackets_out > tp->reordering) { tcp_mark_head_lost(sk, tp, tp->fackets_out-tp->reordering, tp->high_seq); NET_INC_STATS_BH(LINUX_MIB_TCPLOSS); @@ -1767,14 +1795,14 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, /* E. Check state exit conditions. State can be terminated * when high_seq is ACKed. */ - if (tp->ca_state == TCP_CA_Open) { + if (icsk->icsk_ca_state == TCP_CA_Open) { if (!sysctl_tcp_frto) BUG_TRAP(tp->retrans_out == 0); tp->retrans_stamp = 0; } else if (!before(tp->snd_una, tp->high_seq)) { - switch (tp->ca_state) { + switch (icsk->icsk_ca_state) { case TCP_CA_Loss: - tp->retransmits = 0; + icsk->icsk_retransmits = 0; if (tcp_try_undo_recovery(sk, tp)) return; break; @@ -1783,8 +1811,8 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, /* CWR is to be held something *above* high_seq * is ACKed for CWR bit to reach receiver. */ if (tp->snd_una != tp->high_seq) { - tcp_complete_cwr(tp); - tcp_set_ca_state(tp, TCP_CA_Open); + tcp_complete_cwr(sk); + tcp_set_ca_state(sk, TCP_CA_Open); } break; @@ -1795,7 +1823,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, * catching for all duplicate ACKs. */ IsReno(tp) || tp->snd_una != tp->high_seq) { tp->undo_marker = 0; - tcp_set_ca_state(tp, TCP_CA_Open); + tcp_set_ca_state(sk, TCP_CA_Open); } break; @@ -1804,17 +1832,17 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, tcp_reset_reno_sack(tp); if (tcp_try_undo_recovery(sk, tp)) return; - tcp_complete_cwr(tp); + tcp_complete_cwr(sk); break; } } /* F. Process state. */ - switch (tp->ca_state) { + switch (icsk->icsk_ca_state) { case TCP_CA_Recovery: if (prior_snd_una == tp->snd_una) { if (IsReno(tp) && is_dupack) - tcp_add_reno_sack(tp); + tcp_add_reno_sack(sk); } else { int acked = prior_packets - tp->packets_out; if (IsReno(tp)) @@ -1824,13 +1852,13 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, break; case TCP_CA_Loss: if (flag&FLAG_DATA_ACKED) - tp->retransmits = 0; + icsk->icsk_retransmits = 0; if (!tcp_try_undo_loss(sk, tp)) { tcp_moderate_cwnd(tp); tcp_xmit_retransmit_queue(sk); return; } - if (tp->ca_state != TCP_CA_Open) + if (icsk->icsk_ca_state != TCP_CA_Open) return; /* Loss is undone; fall through to processing in Open state. */ default: @@ -1838,10 +1866,10 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, if (tp->snd_una != prior_snd_una) tcp_reset_reno_sack(tp); if (is_dupack) - tcp_add_reno_sack(tp); + tcp_add_reno_sack(sk); } - if (tp->ca_state == TCP_CA_Disorder) + if (icsk->icsk_ca_state == TCP_CA_Disorder) tcp_try_undo_dsack(sk, tp); if (!tcp_time_to_recover(sk, tp)) { @@ -1861,30 +1889,28 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, tp->undo_marker = tp->snd_una; tp->undo_retrans = tp->retrans_out; - if (tp->ca_state < TCP_CA_CWR) { + if (icsk->icsk_ca_state < TCP_CA_CWR) { if (!(flag&FLAG_ECE)) - tp->prior_ssthresh = tcp_current_ssthresh(tp); - tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); + tp->prior_ssthresh = tcp_current_ssthresh(sk); + tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); TCP_ECN_queue_cwr(tp); } tp->snd_cwnd_cnt = 0; - tcp_set_ca_state(tp, TCP_CA_Recovery); + tcp_set_ca_state(sk, TCP_CA_Recovery); } if (is_dupack || tcp_head_timedout(sk, tp)) tcp_update_scoreboard(sk, tp); - tcp_cwnd_down(tp); + tcp_cwnd_down(sk); tcp_xmit_retransmit_queue(sk); } /* Read draft-ietf-tcplw-high-performance before mucking * with this code. (Superceeds RFC1323) */ -static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag) +static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag) { - __u32 seq_rtt; - /* RTTM Rule: A TSecr value received in a segment is used to * update the averaged RTT measurement only if the segment * acknowledges some new data, i.e., only if it advances the @@ -1900,14 +1926,15 @@ static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag) * answer arrives rto becomes 120 seconds! If at least one of segments * in window is lost... Voila. --ANK (010210) */ - seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; - tcp_rtt_estimator(tp, seq_rtt, usrtt); - tcp_set_rto(tp); - tp->backoff = 0; - tcp_bound_rto(tp); + struct tcp_sock *tp = tcp_sk(sk); + const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; + tcp_rtt_estimator(sk, seq_rtt, usrtt); + tcp_set_rto(sk); + inet_csk(sk)->icsk_backoff = 0; + tcp_bound_rto(sk); } -static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int flag) +static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag) { /* We don't have a timestamp. Can only use * packets that are not retransmitted to determine @@ -1921,27 +1948,29 @@ static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int if (flag & FLAG_RETRANS_DATA_ACKED) return; - tcp_rtt_estimator(tp, seq_rtt, usrtt); - tcp_set_rto(tp); - tp->backoff = 0; - tcp_bound_rto(tp); + tcp_rtt_estimator(sk, seq_rtt, usrtt); + tcp_set_rto(sk); + inet_csk(sk)->icsk_backoff = 0; + tcp_bound_rto(sk); } -static inline void tcp_ack_update_rtt(struct tcp_sock *tp, - int flag, s32 seq_rtt, u32 *usrtt) +static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, + const s32 seq_rtt, u32 *usrtt) { + const struct tcp_sock *tp = tcp_sk(sk); /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) - tcp_ack_saw_tstamp(tp, usrtt, flag); + tcp_ack_saw_tstamp(sk, usrtt, flag); else if (seq_rtt >= 0) - tcp_ack_no_tstamp(tp, seq_rtt, usrtt, flag); + tcp_ack_no_tstamp(sk, seq_rtt, usrtt, flag); } -static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, +static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, int good) { - tp->ca_ops->cong_avoid(tp, ack, rtt, in_flight, good); - tp->snd_cwnd_stamp = tcp_time_stamp; + const struct inet_connection_sock *icsk = inet_csk(sk); + icsk->icsk_ca_ops->cong_avoid(sk, ack, rtt, in_flight, good); + tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp; } /* Restart timer after forward progress on connection. @@ -1951,9 +1980,9 @@ static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp) { if (!tp->packets_out) { - tcp_clear_xmit_timer(sk, TCP_TIME_RETRANS); + inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); } else { - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); } } @@ -2068,9 +2097,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt seq_rtt = -1; } else if (seq_rtt < 0) seq_rtt = now - scb->when; - if (seq_usrtt) - *seq_usrtt = (usnow.tv_sec - skb->stamp.tv_sec) * 1000000 - + (usnow.tv_usec - skb->stamp.tv_usec); + if (seq_usrtt) { + struct timeval tv; + + skb_get_timestamp(skb, &tv); + *seq_usrtt = (usnow.tv_sec - tv.tv_sec) * 1000000 + + (usnow.tv_usec - tv.tv_usec); + } if (sacked & TCPCB_SACKED_ACKED) tp->sacked_out -= tcp_skb_pcount(skb); @@ -2085,16 +2118,17 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt seq_rtt = now - scb->when; tcp_dec_pcount_approx(&tp->fackets_out, skb); tcp_packets_out_dec(tp, skb); - __skb_unlink(skb, skb->list); + __skb_unlink(skb, &sk->sk_write_queue); sk_stream_free_skb(sk, skb); } if (acked&FLAG_ACKED) { - tcp_ack_update_rtt(tp, acked, seq_rtt, seq_usrtt); + const struct inet_connection_sock *icsk = inet_csk(sk); + tcp_ack_update_rtt(sk, acked, seq_rtt, seq_usrtt); tcp_ack_packets_out(sk, tp); - if (tp->ca_ops->pkts_acked) - tp->ca_ops->pkts_acked(tp, pkts_acked); + if (icsk->icsk_ca_ops->pkts_acked) + icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked); } #if FASTRETRANS_DEBUG > 0 @@ -2102,19 +2136,20 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt BUG_TRAP((int)tp->lost_out >= 0); BUG_TRAP((int)tp->retrans_out >= 0); if (!tp->packets_out && tp->rx_opt.sack_ok) { + const struct inet_connection_sock *icsk = inet_csk(sk); if (tp->lost_out) { printk(KERN_DEBUG "Leak l=%u %d\n", - tp->lost_out, tp->ca_state); + tp->lost_out, icsk->icsk_ca_state); tp->lost_out = 0; } if (tp->sacked_out) { printk(KERN_DEBUG "Leak s=%u %d\n", - tp->sacked_out, tp->ca_state); + tp->sacked_out, icsk->icsk_ca_state); tp->sacked_out = 0; } if (tp->retrans_out) { printk(KERN_DEBUG "Leak r=%u %d\n", - tp->retrans_out, tp->ca_state); + tp->retrans_out, icsk->icsk_ca_state); tp->retrans_out = 0; } } @@ -2125,40 +2160,43 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt static void tcp_ack_probe(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); /* Was it a usable window open? */ if (!after(TCP_SKB_CB(sk->sk_send_head)->end_seq, tp->snd_una + tp->snd_wnd)) { - tp->backoff = 0; - tcp_clear_xmit_timer(sk, TCP_TIME_PROBE0); + icsk->icsk_backoff = 0; + inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); /* Socket must be waked up by subsequent tcp_data_snd_check(). * This function is not for random using! */ } else { - tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, - min(tp->rto << tp->backoff, TCP_RTO_MAX)); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, + min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX), + TCP_RTO_MAX); } } -static inline int tcp_ack_is_dubious(struct tcp_sock *tp, int flag) +static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag) { return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || - tp->ca_state != TCP_CA_Open); + inet_csk(sk)->icsk_ca_state != TCP_CA_Open); } -static inline int tcp_may_raise_cwnd(struct tcp_sock *tp, int flag) +static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag) { + const struct tcp_sock *tp = tcp_sk(sk); return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) && - !((1<<tp->ca_state)&(TCPF_CA_Recovery|TCPF_CA_CWR)); + !((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Recovery | TCPF_CA_CWR)); } /* Check that window update is acceptable. * The function assumes that snd_una<=ack<=snd_next. */ -static inline int tcp_may_update_window(struct tcp_sock *tp, u32 ack, - u32 ack_seq, u32 nwin) +static inline int tcp_may_update_window(const struct tcp_sock *tp, const u32 ack, + const u32 ack_seq, const u32 nwin) { return (after(ack, tp->snd_una) || after(ack_seq, tp->snd_wl1) || @@ -2241,6 +2279,7 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una) /* This routine deals with incoming acks, but not outgoing ones. */ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); u32 prior_snd_una = tp->snd_una; u32 ack_seq = TCP_SKB_CB(skb)->seq; @@ -2268,7 +2307,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) tp->snd_una = ack; flag |= FLAG_WIN_UPDATE; - tcp_ca_event(tp, CA_EVENT_FAST_ACK); + tcp_ca_event(sk, CA_EVENT_FAST_ACK); NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS); } else { @@ -2285,7 +2324,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) flag |= FLAG_ECE; - tcp_ca_event(tp, CA_EVENT_SLOW_ACK); + tcp_ca_event(sk, CA_EVENT_SLOW_ACK); } /* We passed data and got it acked, remove any soft error @@ -2301,19 +2340,19 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) /* See if we can take anything off of the retransmit queue. */ flag |= tcp_clean_rtx_queue(sk, &seq_rtt, - tp->ca_ops->rtt_sample ? &seq_usrtt : NULL); + icsk->icsk_ca_ops->rtt_sample ? &seq_usrtt : NULL); if (tp->frto_counter) tcp_process_frto(sk, prior_snd_una); - if (tcp_ack_is_dubious(tp, flag)) { + if (tcp_ack_is_dubious(sk, flag)) { /* Advanve CWND, if state allows this. */ - if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(tp, flag)) - tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 0); + if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag)) + tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0); tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); } else { if ((flag & FLAG_DATA_ACKED)) - tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 1); + tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 1); } if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) @@ -2322,7 +2361,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) return 1; no_queue: - tp->probes_out = 0; + icsk->icsk_probes_out = 0; /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than @@ -2500,8 +2539,9 @@ static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) * up to bandwidth of 18Gigabit/sec. 8) ] */ -static int tcp_disordered_ack(struct tcp_sock *tp, struct sk_buff *skb) +static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb) { + struct tcp_sock *tp = tcp_sk(sk); struct tcphdr *th = skb->h.th; u32 seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; @@ -2516,14 +2556,15 @@ static int tcp_disordered_ack(struct tcp_sock *tp, struct sk_buff *skb) !tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) && /* 4. ... and sits in replay window. */ - (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (tp->rto*1024)/HZ); + (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ); } -static inline int tcp_paws_discard(struct tcp_sock *tp, struct sk_buff *skb) +static inline int tcp_paws_discard(const struct sock *sk, const struct sk_buff *skb) { + const struct tcp_sock *tp = tcp_sk(sk); return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW && xtime.tv_sec < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS && - !tcp_disordered_ack(tp, skb)); + !tcp_disordered_ack(sk, skb)); } /* Check segment sequence number for validity. @@ -2586,7 +2627,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) { struct tcp_sock *tp = tcp_sk(sk); - tcp_schedule_ack(tp); + inet_csk_schedule_ack(sk); sk->sk_shutdown |= RCV_SHUTDOWN; sock_set_flag(sk, SOCK_DONE); @@ -2596,7 +2637,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) case TCP_ESTABLISHED: /* Move to CLOSE_WAIT */ tcp_set_state(sk, TCP_CLOSE_WAIT); - tp->ack.pingpong = 1; + inet_csk(sk)->icsk_ack.pingpong = 1; break; case TCP_CLOSE_WAIT: @@ -2694,7 +2735,7 @@ static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb) if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST); - tcp_enter_quickack_mode(tp); + tcp_enter_quickack_mode(sk); if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) { u32 end_seq = TCP_SKB_CB(skb)->end_seq; @@ -2853,7 +2894,7 @@ static void tcp_ofo_queue(struct sock *sk) if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { SOCK_DEBUG(sk, "ofo packet was already received \n"); - __skb_unlink(skb, skb->list); + __skb_unlink(skb, &tp->out_of_order_queue); __kfree_skb(skb); continue; } @@ -2861,7 +2902,7 @@ static void tcp_ofo_queue(struct sock *sk) tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); - __skb_unlink(skb, skb->list); + __skb_unlink(skb, &tp->out_of_order_queue); __skb_queue_tail(&sk->sk_receive_queue, skb); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; if(skb->h.th->fin) @@ -2942,7 +2983,7 @@ queue_and_out: * gap in queue is filled. */ if (skb_queue_empty(&tp->out_of_order_queue)) - tp->ack.pingpong = 0; + inet_csk(sk)->icsk_ack.pingpong = 0; } if (tp->rx_opt.num_sacks) @@ -2963,8 +3004,8 @@ queue_and_out: tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); out_of_window: - tcp_enter_quickack_mode(tp); - tcp_schedule_ack(tp); + tcp_enter_quickack_mode(sk); + inet_csk_schedule_ack(sk); drop: __kfree_skb(skb); return; @@ -2974,7 +3015,7 @@ drop: if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp))) goto out_of_window; - tcp_enter_quickack_mode(tp); + tcp_enter_quickack_mode(sk); if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { /* Partial packet, seq < rcv_next < end_seq */ @@ -3003,7 +3044,7 @@ drop: /* Disable header prediction. */ tp->pred_flags = 0; - tcp_schedule_ack(tp); + inet_csk_schedule_ack(sk); SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); @@ -3027,7 +3068,7 @@ drop: u32 end_seq = TCP_SKB_CB(skb)->end_seq; if (seq == TCP_SKB_CB(skb1)->end_seq) { - __skb_append(skb1, skb); + __skb_append(skb1, skb, &tp->out_of_order_queue); if (!tp->rx_opt.num_sacks || tp->selective_acks[0].end_seq != seq) @@ -3071,7 +3112,7 @@ drop: tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, end_seq); break; } - __skb_unlink(skb1, skb1->list); + __skb_unlink(skb1, &tp->out_of_order_queue); tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); __kfree_skb(skb1); } @@ -3088,8 +3129,9 @@ add_sack: * simplifies code) */ static void -tcp_collapse(struct sock *sk, struct sk_buff *head, - struct sk_buff *tail, u32 start, u32 end) +tcp_collapse(struct sock *sk, struct sk_buff_head *list, + struct sk_buff *head, struct sk_buff *tail, + u32 start, u32 end) { struct sk_buff *skb; @@ -3099,7 +3141,7 @@ tcp_collapse(struct sock *sk, struct sk_buff *head, /* No new bits? It is possible on ofo queue. */ if (!before(start, TCP_SKB_CB(skb)->end_seq)) { struct sk_buff *next = skb->next; - __skb_unlink(skb, skb->list); + __skb_unlink(skb, list); __kfree_skb(skb); NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED); skb = next; @@ -3145,7 +3187,7 @@ tcp_collapse(struct sock *sk, struct sk_buff *head, nskb->mac.raw = nskb->head + (skb->mac.raw-skb->head); memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; - __skb_insert(nskb, skb->prev, skb, skb->list); + __skb_insert(nskb, skb->prev, skb, list); sk_stream_set_owner_r(nskb, sk); /* Copy data, releasing collapsed skbs. */ @@ -3164,7 +3206,7 @@ tcp_collapse(struct sock *sk, struct sk_buff *head, } if (!before(start, TCP_SKB_CB(skb)->end_seq)) { struct sk_buff *next = skb->next; - __skb_unlink(skb, skb->list); + __skb_unlink(skb, list); __kfree_skb(skb); NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED); skb = next; @@ -3200,7 +3242,8 @@ static void tcp_collapse_ofo_queue(struct sock *sk) if (skb == (struct sk_buff *)&tp->out_of_order_queue || after(TCP_SKB_CB(skb)->seq, end) || before(TCP_SKB_CB(skb)->end_seq, start)) { - tcp_collapse(sk, head, skb, start, end); + tcp_collapse(sk, &tp->out_of_order_queue, + head, skb, start, end); head = skb; if (skb == (struct sk_buff *)&tp->out_of_order_queue) break; @@ -3237,7 +3280,8 @@ static int tcp_prune_queue(struct sock *sk) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); tcp_collapse_ofo_queue(sk); - tcp_collapse(sk, sk->sk_receive_queue.next, + tcp_collapse(sk, &sk->sk_receive_queue, + sk->sk_receive_queue.next, (struct sk_buff*)&sk->sk_receive_queue, tp->copied_seq, tp->rcv_nxt); sk_stream_mem_reclaim(sk); @@ -3286,12 +3330,12 @@ void tcp_cwnd_application_limited(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - if (tp->ca_state == TCP_CA_Open && + if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open && sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { /* Limited by application or receiver window. */ u32 win_used = max(tp->snd_cwnd_used, 2U); if (win_used < tp->snd_cwnd) { - tp->snd_ssthresh = tcp_current_ssthresh(tp); + tp->snd_ssthresh = tcp_current_ssthresh(sk); tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1; } tp->snd_cwnd_used = 0; @@ -3370,13 +3414,13 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) struct tcp_sock *tp = tcp_sk(sk); /* More than one full frame received... */ - if (((tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss + if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss /* ... and right edge of window advances far enough. * (tcp_recvmsg() will send ACK otherwise). Or... */ && __tcp_select_window(sk) >= tp->rcv_wnd) || /* We ACK each frame or... */ - tcp_in_quickack_mode(tp) || + tcp_in_quickack_mode(sk) || /* We have out of order data. */ (ofo_possible && skb_peek(&tp->out_of_order_queue))) { @@ -3390,8 +3434,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) static __inline__ void tcp_ack_snd_check(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); - if (!tcp_ack_scheduled(tp)) { + if (!inet_csk_ack_scheduled(sk)) { /* We sent a data segment already. */ return; } @@ -3462,7 +3505,7 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th) struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); tp->copied_seq++; if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) { - __skb_unlink(skb, skb->list); + __skb_unlink(skb, &sk->sk_receive_queue); __kfree_skb(skb); } } @@ -3645,7 +3688,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tp->rcv_nxt == tp->rcv_wup) tcp_store_ts_recent(tp); - tcp_rcv_rtt_measure_ts(tp, skb); + tcp_rcv_rtt_measure_ts(sk, skb); /* We know that such packets are checksummed * on entry. @@ -3678,7 +3721,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tp->rcv_nxt == tp->rcv_wup) tcp_store_ts_recent(tp); - tcp_rcv_rtt_measure_ts(tp, skb); + tcp_rcv_rtt_measure_ts(sk, skb); __skb_pull(skb, tcp_header_len); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; @@ -3699,7 +3742,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tp->rcv_nxt == tp->rcv_wup) tcp_store_ts_recent(tp); - tcp_rcv_rtt_measure_ts(tp, skb); + tcp_rcv_rtt_measure_ts(sk, skb); if ((int)skb->truesize > sk->sk_forward_alloc) goto step5; @@ -3719,7 +3762,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, /* Well, only one small jumplet in fast path... */ tcp_ack(sk, skb, FLAG_DATA); tcp_data_snd_check(sk, tp); - if (!tcp_ack_scheduled(tp)) + if (!inet_csk_ack_scheduled(sk)) goto no_ack; } @@ -3741,7 +3784,7 @@ slow_path: * RFC1323: H1. Apply PAWS check first. */ if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && - tcp_paws_discard(tp, skb)) { + tcp_paws_discard(sk, skb)) { if (!th->rst) { NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); tcp_send_dupack(sk, skb); @@ -3788,7 +3831,7 @@ step5: if(th->ack) tcp_ack(sk, skb, FLAG_SLOWPATH); - tcp_rcv_rtt_measure_ts(tp, skb); + tcp_rcv_rtt_measure_ts(sk, skb); /* Process urgent data. */ tcp_urg(sk, skb, th); @@ -3817,6 +3860,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_parse_options(skb, &tp->rx_opt, 0); if (th->ack) { + struct inet_connection_sock *icsk; /* rfc793: * "If the state is SYN-SENT then * first check the ACK bit @@ -3920,7 +3964,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_init_metrics(sk); - tcp_init_congestion_control(tp); + tcp_init_congestion_control(sk); /* Prevent spurious tcp_cwnd_restart() on first data * packet. @@ -3930,7 +3974,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_init_buffer_space(sk); if (sock_flag(sk, SOCK_KEEPOPEN)) - tcp_reset_keepalive_timer(sk, keepalive_time_when(tp)); + inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); if (!tp->rx_opt.snd_wscale) __tcp_fast_path_on(tp, tp->snd_wnd); @@ -3942,7 +3986,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, sk_wake_async(sk, 0, POLL_OUT); } - if (sk->sk_write_pending || tp->defer_accept || tp->ack.pingpong) { + icsk = inet_csk(sk); + + if (sk->sk_write_pending || + icsk->icsk_accept_queue.rskq_defer_accept || + icsk->icsk_ack.pingpong) { /* Save one ACK. Data will be ready after * several ticks, if write_pending is set. * @@ -3950,12 +3998,13 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * look so _wonderfully_ clever, that I was not able * to stand against the temptation 8) --ANK */ - tcp_schedule_ack(tp); - tp->ack.lrcvtime = tcp_time_stamp; - tp->ack.ato = TCP_ATO_MIN; - tcp_incr_quickack(tp); - tcp_enter_quickack_mode(tp); - tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX); + inet_csk_schedule_ack(sk); + icsk->icsk_ack.lrcvtime = tcp_time_stamp; + icsk->icsk_ack.ato = TCP_ATO_MIN; + tcp_incr_quickack(sk); + tcp_enter_quickack_mode(sk); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, + TCP_DELACK_MAX, TCP_RTO_MAX); discard: __kfree_skb(skb); @@ -4111,7 +4160,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, } if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && - tcp_paws_discard(tp, skb)) { + tcp_paws_discard(sk, skb)) { if (!th->rst) { NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); tcp_send_dupack(sk, skb); @@ -4180,7 +4229,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, */ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && !tp->srtt) - tcp_ack_saw_tstamp(tp, 0, 0); + tcp_ack_saw_tstamp(sk, NULL, 0); if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; @@ -4192,7 +4241,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tcp_init_metrics(sk); - tcp_init_congestion_control(tp); + tcp_init_congestion_control(sk); /* Prevent spurious tcp_cwnd_restart() on * first data packet. @@ -4227,9 +4276,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, return 1; } - tmo = tcp_fin_time(tp); + tmo = tcp_fin_time(sk); if (tmo > TCP_TIMEWAIT_LEN) { - tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); + inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); } else if (th->fin || sock_owned_by_user(sk)) { /* Bad case. We could lose such FIN otherwise. * It is not a big problem, but it looks confusing @@ -4237,7 +4286,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * if it spins in bh_lock_sock(), but it is really * marginal case. */ - tcp_reset_keepalive_timer(sk, tmo); + inet_csk_reset_keepalive_timer(sk, tmo); } else { tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); goto discard; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 67c670886c1..13dfb391cdf 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -64,7 +64,9 @@ #include <linux/times.h> #include <net/icmp.h> +#include <net/inet_hashtables.h> #include <net/tcp.h> +#include <net/transp_v6.h> #include <net/ipv6.h> #include <net/inet_common.h> #include <net/xfrm.h> @@ -75,7 +77,6 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> -extern int sysctl_ip_dynaddr; int sysctl_tcp_tw_reuse; int sysctl_tcp_low_latency; @@ -88,463 +89,29 @@ static struct socket *tcp_socket; void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, struct sk_buff *skb); -struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = { - .__tcp_lhash_lock = RW_LOCK_UNLOCKED, - .__tcp_lhash_users = ATOMIC_INIT(0), - .__tcp_lhash_wait - = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait), - .__tcp_portalloc_lock = SPIN_LOCK_UNLOCKED +struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { + .lhash_lock = RW_LOCK_UNLOCKED, + .lhash_users = ATOMIC_INIT(0), + .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait), + .portalloc_lock = SPIN_LOCK_UNLOCKED, + .port_rover = 1024 - 1, }; -/* - * This array holds the first and last local port number. - * For high-usage systems, use sysctl to change this to - * 32768-61000 - */ -int sysctl_local_port_range[2] = { 1024, 4999 }; -int tcp_port_rover = 1024 - 1; - -static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport, - __u32 faddr, __u16 fport) -{ - int h = (laddr ^ lport) ^ (faddr ^ fport); - h ^= h >> 16; - h ^= h >> 8; - return h & (tcp_ehash_size - 1); -} - -static __inline__ int tcp_sk_hashfn(struct sock *sk) -{ - struct inet_sock *inet = inet_sk(sk); - __u32 laddr = inet->rcv_saddr; - __u16 lport = inet->num; - __u32 faddr = inet->daddr; - __u16 fport = inet->dport; - - return tcp_hashfn(laddr, lport, faddr, fport); -} - -/* Allocate and initialize a new TCP local port bind bucket. - * The bindhash mutex for snum's hash chain must be held here. - */ -struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head, - unsigned short snum) -{ - struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep, - SLAB_ATOMIC); - if (tb) { - tb->port = snum; - tb->fastreuse = 0; - INIT_HLIST_HEAD(&tb->owners); - hlist_add_head(&tb->node, &head->chain); - } - return tb; -} - -/* Caller must hold hashbucket lock for this tb with local BH disabled */ -void tcp_bucket_destroy(struct tcp_bind_bucket *tb) -{ - if (hlist_empty(&tb->owners)) { - __hlist_del(&tb->node); - kmem_cache_free(tcp_bucket_cachep, tb); - } -} - -/* Caller must disable local BH processing. */ -static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child) -{ - struct tcp_bind_hashbucket *head = - &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)]; - struct tcp_bind_bucket *tb; - - spin_lock(&head->lock); - tb = tcp_sk(sk)->bind_hash; - sk_add_bind_node(child, &tb->owners); - tcp_sk(child)->bind_hash = tb; - spin_unlock(&head->lock); -} - -inline void tcp_inherit_port(struct sock *sk, struct sock *child) -{ - local_bh_disable(); - __tcp_inherit_port(sk, child); - local_bh_enable(); -} - -void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb, - unsigned short snum) -{ - inet_sk(sk)->num = snum; - sk_add_bind_node(sk, &tb->owners); - tcp_sk(sk)->bind_hash = tb; -} - -static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb) -{ - const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk); - struct sock *sk2; - struct hlist_node *node; - int reuse = sk->sk_reuse; - - sk_for_each_bound(sk2, node, &tb->owners) { - if (sk != sk2 && - !tcp_v6_ipv6only(sk2) && - (!sk->sk_bound_dev_if || - !sk2->sk_bound_dev_if || - sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { - if (!reuse || !sk2->sk_reuse || - sk2->sk_state == TCP_LISTEN) { - const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2); - if (!sk2_rcv_saddr || !sk_rcv_saddr || - sk2_rcv_saddr == sk_rcv_saddr) - break; - } - } - } - return node != NULL; -} - -/* Obtain a reference to a local port for the given sock, - * if snum is zero it means select any available local port. - */ static int tcp_v4_get_port(struct sock *sk, unsigned short snum) { - struct tcp_bind_hashbucket *head; - struct hlist_node *node; - struct tcp_bind_bucket *tb; - int ret; - - local_bh_disable(); - if (!snum) { - int low = sysctl_local_port_range[0]; - int high = sysctl_local_port_range[1]; - int remaining = (high - low) + 1; - int rover; - - spin_lock(&tcp_portalloc_lock); - if (tcp_port_rover < low) - rover = low; - else - rover = tcp_port_rover; - do { - rover++; - if (rover > high) - rover = low; - head = &tcp_bhash[tcp_bhashfn(rover)]; - spin_lock(&head->lock); - tb_for_each(tb, node, &head->chain) - if (tb->port == rover) - goto next; - break; - next: - spin_unlock(&head->lock); - } while (--remaining > 0); - tcp_port_rover = rover; - spin_unlock(&tcp_portalloc_lock); - - /* Exhausted local port range during search? It is not - * possible for us to be holding one of the bind hash - * locks if this test triggers, because if 'remaining' - * drops to zero, we broke out of the do/while loop at - * the top level, not from the 'break;' statement. - */ - ret = 1; - if (unlikely(remaining <= 0)) - goto fail; - - /* OK, here is the one we will use. HEAD is - * non-NULL and we hold it's mutex. - */ - snum = rover; - } else { - head = &tcp_bhash[tcp_bhashfn(snum)]; - spin_lock(&head->lock); - tb_for_each(tb, node, &head->chain) - if (tb->port == snum) - goto tb_found; - } - tb = NULL; - goto tb_not_found; -tb_found: - if (!hlist_empty(&tb->owners)) { - if (sk->sk_reuse > 1) - goto success; - if (tb->fastreuse > 0 && - sk->sk_reuse && sk->sk_state != TCP_LISTEN) { - goto success; - } else { - ret = 1; - if (tcp_bind_conflict(sk, tb)) - goto fail_unlock; - } - } -tb_not_found: - ret = 1; - if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL) - goto fail_unlock; - if (hlist_empty(&tb->owners)) { - if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) - tb->fastreuse = 1; - else - tb->fastreuse = 0; - } else if (tb->fastreuse && - (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) - tb->fastreuse = 0; -success: - if (!tcp_sk(sk)->bind_hash) - tcp_bind_hash(sk, tb, snum); - BUG_TRAP(tcp_sk(sk)->bind_hash == tb); - ret = 0; - -fail_unlock: - spin_unlock(&head->lock); -fail: - local_bh_enable(); - return ret; -} - -/* Get rid of any references to a local port held by the - * given sock. - */ -static void __tcp_put_port(struct sock *sk) -{ - struct inet_sock *inet = inet_sk(sk); - struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)]; - struct tcp_bind_bucket *tb; - - spin_lock(&head->lock); - tb = tcp_sk(sk)->bind_hash; - __sk_del_bind_node(sk); - tcp_sk(sk)->bind_hash = NULL; - inet->num = 0; - tcp_bucket_destroy(tb); - spin_unlock(&head->lock); -} - -void tcp_put_port(struct sock *sk) -{ - local_bh_disable(); - __tcp_put_port(sk); - local_bh_enable(); -} - -/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP. - * Look, when several writers sleep and reader wakes them up, all but one - * immediately hit write lock and grab all the cpus. Exclusive sleep solves - * this, _but_ remember, it adds useless work on UP machines (wake up each - * exclusive lock release). It should be ifdefed really. - */ - -void tcp_listen_wlock(void) -{ - write_lock(&tcp_lhash_lock); - - if (atomic_read(&tcp_lhash_users)) { - DEFINE_WAIT(wait); - - for (;;) { - prepare_to_wait_exclusive(&tcp_lhash_wait, - &wait, TASK_UNINTERRUPTIBLE); - if (!atomic_read(&tcp_lhash_users)) - break; - write_unlock_bh(&tcp_lhash_lock); - schedule(); - write_lock_bh(&tcp_lhash_lock); - } - - finish_wait(&tcp_lhash_wait, &wait); - } -} - -static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible) -{ - struct hlist_head *list; - rwlock_t *lock; - - BUG_TRAP(sk_unhashed(sk)); - if (listen_possible && sk->sk_state == TCP_LISTEN) { - list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)]; - lock = &tcp_lhash_lock; - tcp_listen_wlock(); - } else { - list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain; - lock = &tcp_ehash[sk->sk_hashent].lock; - write_lock(lock); - } - __sk_add_node(sk, list); - sock_prot_inc_use(sk->sk_prot); - write_unlock(lock); - if (listen_possible && sk->sk_state == TCP_LISTEN) - wake_up(&tcp_lhash_wait); + return inet_csk_get_port(&tcp_hashinfo, sk, snum); } static void tcp_v4_hash(struct sock *sk) { - if (sk->sk_state != TCP_CLOSE) { - local_bh_disable(); - __tcp_v4_hash(sk, 1); - local_bh_enable(); - } + inet_hash(&tcp_hashinfo, sk); } void tcp_unhash(struct sock *sk) { - rwlock_t *lock; - - if (sk_unhashed(sk)) - goto ende; - - if (sk->sk_state == TCP_LISTEN) { - local_bh_disable(); - tcp_listen_wlock(); - lock = &tcp_lhash_lock; - } else { - struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent]; - lock = &head->lock; - write_lock_bh(&head->lock); - } - - if (__sk_del_node_init(sk)) - sock_prot_dec_use(sk->sk_prot); - write_unlock_bh(lock); - - ende: - if (sk->sk_state == TCP_LISTEN) - wake_up(&tcp_lhash_wait); -} - -/* Don't inline this cruft. Here are some nice properties to - * exploit here. The BSD API does not allow a listening TCP - * to specify the remote port nor the remote address for the - * connection. So always assume those are both wildcarded - * during the search since they can never be otherwise. - */ -static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr, - unsigned short hnum, int dif) -{ - struct sock *result = NULL, *sk; - struct hlist_node *node; - int score, hiscore; - - hiscore=-1; - sk_for_each(sk, node, head) { - struct inet_sock *inet = inet_sk(sk); - - if (inet->num == hnum && !ipv6_only_sock(sk)) { - __u32 rcv_saddr = inet->rcv_saddr; - - score = (sk->sk_family == PF_INET ? 1 : 0); - if (rcv_saddr) { - if (rcv_saddr != daddr) - continue; - score+=2; - } - if (sk->sk_bound_dev_if) { - if (sk->sk_bound_dev_if != dif) - continue; - score+=2; - } - if (score == 5) - return sk; - if (score > hiscore) { - hiscore = score; - result = sk; - } - } - } - return result; -} - -/* Optimize the common listener case. */ -static inline struct sock *tcp_v4_lookup_listener(u32 daddr, - unsigned short hnum, int dif) -{ - struct sock *sk = NULL; - struct hlist_head *head; - - read_lock(&tcp_lhash_lock); - head = &tcp_listening_hash[tcp_lhashfn(hnum)]; - if (!hlist_empty(head)) { - struct inet_sock *inet = inet_sk((sk = __sk_head(head))); - - if (inet->num == hnum && !sk->sk_node.next && - (!inet->rcv_saddr || inet->rcv_saddr == daddr) && - (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) && - !sk->sk_bound_dev_if) - goto sherry_cache; - sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif); - } - if (sk) { -sherry_cache: - sock_hold(sk); - } - read_unlock(&tcp_lhash_lock); - return sk; + inet_unhash(&tcp_hashinfo, sk); } -/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so - * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM - * - * Local BH must be disabled here. - */ - -static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport, - u32 daddr, u16 hnum, - int dif) -{ - struct tcp_ehash_bucket *head; - TCP_V4_ADDR_COOKIE(acookie, saddr, daddr) - __u32 ports = TCP_COMBINED_PORTS(sport, hnum); - struct sock *sk; - struct hlist_node *node; - /* Optimize here for direct hit, only listening connections can - * have wildcards anyways. - */ - int hash = tcp_hashfn(daddr, hnum, saddr, sport); - head = &tcp_ehash[hash]; - read_lock(&head->lock); - sk_for_each(sk, node, &head->chain) { - if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif)) - goto hit; /* You sunk my battleship! */ - } - - /* Must check for a TIME_WAIT'er before going to listener hash. */ - sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) { - if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif)) - goto hit; - } - sk = NULL; -out: - read_unlock(&head->lock); - return sk; -hit: - sock_hold(sk); - goto out; -} - -static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport, - u32 daddr, u16 hnum, int dif) -{ - struct sock *sk = __tcp_v4_lookup_established(saddr, sport, - daddr, hnum, dif); - - return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif); -} - -inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, - u16 dport, int dif) -{ - struct sock *sk; - - local_bh_disable(); - sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif); - local_bh_enable(); - - return sk; -} - -EXPORT_SYMBOL_GPL(tcp_v4_lookup); - static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb) { return secure_tcp_sequence_number(skb->nh.iph->daddr, @@ -555,27 +122,28 @@ static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb) /* called with local bh disabled */ static int __tcp_v4_check_established(struct sock *sk, __u16 lport, - struct tcp_tw_bucket **twp) + struct inet_timewait_sock **twp) { struct inet_sock *inet = inet_sk(sk); u32 daddr = inet->rcv_saddr; u32 saddr = inet->daddr; int dif = sk->sk_bound_dev_if; - TCP_V4_ADDR_COOKIE(acookie, saddr, daddr) - __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport); - int hash = tcp_hashfn(daddr, lport, saddr, inet->dport); - struct tcp_ehash_bucket *head = &tcp_ehash[hash]; + INET_ADDR_COOKIE(acookie, saddr, daddr) + const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport); + const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_hashinfo.ehash_size); + struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash]; struct sock *sk2; - struct hlist_node *node; - struct tcp_tw_bucket *tw; + const struct hlist_node *node; + struct inet_timewait_sock *tw; write_lock(&head->lock); /* Check TIME-WAIT sockets first. */ - sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) { - tw = (struct tcp_tw_bucket *)sk2; + sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) { + tw = inet_twsk(sk2); - if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) { + if (INET_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) { + const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2); struct tcp_sock *tp = tcp_sk(sk); /* With PAWS, it is safe from the viewpoint @@ -592,15 +160,15 @@ static int __tcp_v4_check_established(struct sock *sk, __u16 lport, fall back to VJ's scheme and use initial timestamp retrieved from peer table. */ - if (tw->tw_ts_recent_stamp && + if (tcptw->tw_ts_recent_stamp && (!twp || (sysctl_tcp_tw_reuse && xtime.tv_sec - - tw->tw_ts_recent_stamp > 1))) { - if ((tp->write_seq = - tw->tw_snd_nxt + 65535 + 2) == 0) + tcptw->tw_ts_recent_stamp > 1))) { + tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; + if (tp->write_seq == 0) tp->write_seq = 1; - tp->rx_opt.ts_recent = tw->tw_ts_recent; - tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp; + tp->rx_opt.ts_recent = tcptw->tw_ts_recent; + tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; sock_hold(sk2); goto unique; } else @@ -611,7 +179,7 @@ static int __tcp_v4_check_established(struct sock *sk, __u16 lport, /* And established part... */ sk_for_each(sk2, node, &head->chain) { - if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif)) + if (INET_MATCH(sk2, acookie, saddr, daddr, ports, dif)) goto not_unique; } @@ -631,10 +199,10 @@ unique: NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); } else if (tw) { /* Silly. Should hash-dance instead... */ - tcp_tw_deschedule(tw); + inet_twsk_deschedule(tw, &tcp_death_row); NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - tcp_tw_put(tw); + inet_twsk_put(tw); } return 0; @@ -657,9 +225,9 @@ static inline u32 connect_port_offset(const struct sock *sk) */ static inline int tcp_v4_hash_connect(struct sock *sk) { - unsigned short snum = inet_sk(sk)->num; - struct tcp_bind_hashbucket *head; - struct tcp_bind_bucket *tb; + const unsigned short snum = inet_sk(sk)->num; + struct inet_bind_hashbucket *head; + struct inet_bind_bucket *tb; int ret; if (!snum) { @@ -671,19 +239,19 @@ static inline int tcp_v4_hash_connect(struct sock *sk) static u32 hint; u32 offset = hint + connect_port_offset(sk); struct hlist_node *node; - struct tcp_tw_bucket *tw = NULL; + struct inet_timewait_sock *tw = NULL; local_bh_disable(); for (i = 1; i <= range; i++) { port = low + (i + offset) % range; - head = &tcp_bhash[tcp_bhashfn(port)]; + head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)]; spin_lock(&head->lock); /* Does not bother with rcv_saddr checks, * because the established check is already * unique enough. */ - tb_for_each(tb, node, &head->chain) { + inet_bind_bucket_for_each(tb, node, &head->chain) { if (tb->port == port) { BUG_TRAP(!hlist_empty(&tb->owners)); if (tb->fastreuse >= 0) @@ -696,7 +264,7 @@ static inline int tcp_v4_hash_connect(struct sock *sk) } } - tb = tcp_bucket_create(head, port); + tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port); if (!tb) { spin_unlock(&head->lock); break; @@ -715,27 +283,27 @@ ok: hint += i; /* Head lock still held and bh's disabled */ - tcp_bind_hash(sk, tb, port); + inet_bind_hash(sk, tb, port); if (sk_unhashed(sk)) { inet_sk(sk)->sport = htons(port); - __tcp_v4_hash(sk, 0); + __inet_hash(&tcp_hashinfo, sk, 0); } spin_unlock(&head->lock); if (tw) { - tcp_tw_deschedule(tw); - tcp_tw_put(tw); + inet_twsk_deschedule(tw, &tcp_death_row);; + inet_twsk_put(tw); } ret = 0; goto out; } - head = &tcp_bhash[tcp_bhashfn(snum)]; - tb = tcp_sk(sk)->bind_hash; + head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)]; + tb = inet_csk(sk)->icsk_bind_hash; spin_lock_bh(&head->lock); if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - __tcp_v4_hash(sk, 0); + __inet_hash(&tcp_hashinfo, sk, 0); spin_unlock_bh(&head->lock); return 0; } else { @@ -798,7 +366,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) tp->write_seq = 0; } - if (sysctl_tcp_tw_recycle && + if (tcp_death_row.sysctl_tw_recycle && !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) { struct inet_peer *peer = rt_get_peer(rt); @@ -837,8 +405,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) goto failure; /* OK, now commit destination to socket. */ - __sk_dst_set(sk, &rt->u.dst); - tcp_v4_setup_caps(sk, &rt->u.dst); + sk_setup_caps(sk, &rt->u.dst); if (!tp->write_seq) tp->write_seq = secure_tcp_sequence_number(inet->saddr, @@ -864,53 +431,6 @@ failure: return err; } -static __inline__ int tcp_v4_iif(struct sk_buff *skb) -{ - return ((struct rtable *)skb->dst)->rt_iif; -} - -static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd) -{ - return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1)); -} - -static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp, - struct request_sock ***prevp, - __u16 rport, - __u32 raddr, __u32 laddr) -{ - struct listen_sock *lopt = tp->accept_queue.listen_opt; - struct request_sock *req, **prev; - - for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)]; - (req = *prev) != NULL; - prev = &req->dl_next) { - const struct inet_request_sock *ireq = inet_rsk(req); - - if (ireq->rmt_port == rport && - ireq->rmt_addr == raddr && - ireq->loc_addr == laddr && - TCP_INET_FAMILY(req->rsk_ops->family)) { - BUG_TRAP(!req->sk); - *prevp = prev; - break; - } - } - - return req; -} - -static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req) -{ - struct tcp_sock *tp = tcp_sk(sk); - struct listen_sock *lopt = tp->accept_queue.listen_opt; - u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd); - - reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT); - tcp_synq_added(sk); -} - - /* * This routine does path mtu discovery as defined in RFC1191. */ @@ -993,14 +513,14 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) return; } - sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, - th->source, tcp_v4_iif(skb)); + sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr, + th->source, inet_iif(skb)); if (!sk) { ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); return; } if (sk->sk_state == TCP_TIME_WAIT) { - tcp_tw_put((struct tcp_tw_bucket *)sk); + inet_twsk_put((struct inet_timewait_sock *)sk); return; } @@ -1054,8 +574,8 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) if (sock_owned_by_user(sk)) goto out; - req = tcp_v4_search_req(tp, &prev, th->dest, - iph->daddr, iph->saddr); + req = inet_csk_search_req(sk, &prev, th->dest, + iph->daddr, iph->saddr); if (!req) goto out; @@ -1075,7 +595,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) * created socket, and POSIX does not want network * errors returned from accept(). */ - tcp_synq_drop(sk, req, prev); + inet_csk_reqsk_queue_drop(sk, req, prev); goto out; case TCP_SYN_SENT: @@ -1245,12 +765,13 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) { - struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk; + struct inet_timewait_sock *tw = inet_twsk(sk); + const struct tcp_timewait_sock *tcptw = tcp_twsk(sk); - tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt, - tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent); + tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, + tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent); - tcp_tw_put(tw); + inet_twsk_put(tw); } static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req) @@ -1259,36 +780,6 @@ static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req) req->ts_recent); } -static struct dst_entry* tcp_v4_route_req(struct sock *sk, - struct request_sock *req) -{ - struct rtable *rt; - const struct inet_request_sock *ireq = inet_rsk(req); - struct ip_options *opt = inet_rsk(req)->opt; - struct flowi fl = { .oif = sk->sk_bound_dev_if, - .nl_u = { .ip4_u = - { .daddr = ((opt && opt->srr) ? - opt->faddr : - ireq->rmt_addr), - .saddr = ireq->loc_addr, - .tos = RT_CONN_FLAGS(sk) } }, - .proto = IPPROTO_TCP, - .uli_u = { .ports = - { .sport = inet_sk(sk)->sport, - .dport = ireq->rmt_port } } }; - - if (ip_route_output_flow(&rt, &fl, sk, 0)) { - IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); - return NULL; - } - if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) { - ip_rt_put(rt); - IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); - return NULL; - } - return &rt->u.dst; -} - /* * Send a SYN-ACK after having received an ACK. * This still operates on a request_sock only, not on a big @@ -1302,7 +793,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req, struct sk_buff * skb; /* First, grab a route. */ - if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL) + if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL) goto out; skb = tcp_make_synack(sk, dst, req); @@ -1404,7 +895,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) * limitations, they conserve resources and peer is * evidently real one. */ - if (tcp_synq_is_full(sk) && !isn) { + if (inet_csk_reqsk_queue_is_full(sk) && !isn) { #ifdef CONFIG_SYN_COOKIES if (sysctl_tcp_syncookies) { want_cookie = 1; @@ -1418,7 +909,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) * clogging syn queue with openreqs with exponentially increasing * timeout. */ - if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1) + if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) goto drop; req = reqsk_alloc(&tcp_request_sock_ops); @@ -1474,8 +965,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) * are made in the function processing timewait state. */ if (tmp_opt.saw_tstamp && - sysctl_tcp_tw_recycle && - (dst = tcp_v4_route_req(sk, req)) != NULL && + tcp_death_row.sysctl_tw_recycle && + (dst = inet_csk_route_req(sk, req)) != NULL && (peer = rt_get_peer((struct rtable *)dst)) != NULL && peer->v4daddr == saddr) { if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL && @@ -1488,7 +979,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) } /* Kill the following clause, if you dislike this way. */ else if (!sysctl_tcp_syncookies && - (sysctl_max_syn_backlog - tcp_synq_len(sk) < + (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < (sysctl_max_syn_backlog >> 2)) && (!peer || !peer->tcp_ts_stamp) && (!dst || !dst_metric(dst, RTAX_RTT))) { @@ -1499,11 +990,10 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) * to destinations, already remembered * to the moment of synflood. */ - LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open " - "request from %u.%u." - "%u.%u/%u\n", - NIPQUAD(saddr), - ntohs(skb->h.th->source))); + LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open " + "request from %u.%u.%u.%u/%u\n", + NIPQUAD(saddr), + ntohs(skb->h.th->source)); dst_release(dst); goto drop_and_free; } @@ -1518,7 +1008,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (want_cookie) { reqsk_free(req); } else { - tcp_v4_synq_add(sk, req); + inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); } return 0; @@ -1546,15 +1036,14 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, if (sk_acceptq_is_full(sk)) goto exit_overflow; - if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL) + if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL) goto exit; newsk = tcp_create_openreq_child(sk, req, skb); if (!newsk) goto exit; - newsk->sk_dst_cache = dst; - tcp_v4_setup_caps(newsk, dst); + sk_setup_caps(newsk, dst); newtp = tcp_sk(newsk); newinet = inet_sk(newsk); @@ -1564,7 +1053,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newinet->saddr = ireq->loc_addr; newinet->opt = ireq->opt; ireq->opt = NULL; - newinet->mc_index = tcp_v4_iif(skb); + newinet->mc_index = inet_iif(skb); newinet->mc_ttl = skb->nh.iph->ttl; newtp->ext_header_len = 0; if (newinet->opt) @@ -1575,8 +1064,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newtp->advmss = dst_metric(dst, RTAX_ADVMSS); tcp_initialize_rcv_mss(newsk); - __tcp_v4_hash(newsk, 0); - __tcp_inherit_port(sk, newsk); + __inet_hash(&tcp_hashinfo, newsk, 0); + __inet_inherit_port(&tcp_hashinfo, sk, newsk); return newsk; @@ -1592,27 +1081,24 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) { struct tcphdr *th = skb->h.th; struct iphdr *iph = skb->nh.iph; - struct tcp_sock *tp = tcp_sk(sk); struct sock *nsk; struct request_sock **prev; /* Find possible connection requests. */ - struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source, - iph->saddr, iph->daddr); + struct request_sock *req = inet_csk_search_req(sk, &prev, th->source, + iph->saddr, iph->daddr); if (req) return tcp_check_req(sk, skb, req, prev); - nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr, - th->source, - skb->nh.iph->daddr, - ntohs(th->dest), - tcp_v4_iif(skb)); + nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr, + th->source, skb->nh.iph->daddr, + ntohs(th->dest), inet_iif(skb)); if (nsk) { if (nsk->sk_state != TCP_TIME_WAIT) { bh_lock_sock(nsk); return nsk; } - tcp_tw_put((struct tcp_tw_bucket *)nsk); + inet_twsk_put((struct inet_timewait_sock *)nsk); return NULL; } @@ -1631,7 +1117,7 @@ static int tcp_v4_checksum_init(struct sk_buff *skb) skb->nh.iph->daddr, skb->csum)) return 0; - LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n")); + LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v4 csum failed\n"); skb->ip_summed = CHECKSUM_NONE; } if (skb->len <= 76) { @@ -1747,9 +1233,9 @@ int tcp_v4_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->flags = skb->nh.iph->tos; TCP_SKB_CB(skb)->sacked = 0; - sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source, - skb->nh.iph->daddr, ntohs(th->dest), - tcp_v4_iif(skb)); + sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source, + skb->nh.iph->daddr, ntohs(th->dest), + inet_iif(skb)); if (!sk) goto no_tcp_socket; @@ -1801,24 +1287,26 @@ discard_and_relse: do_time_wait: if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { - tcp_tw_put((struct tcp_tw_bucket *) sk); + inet_twsk_put((struct inet_timewait_sock *) sk); goto discard_it; } if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) { TCP_INC_STATS_BH(TCP_MIB_INERRS); - tcp_tw_put((struct tcp_tw_bucket *) sk); + inet_twsk_put((struct inet_timewait_sock *) sk); goto discard_it; } - switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk, - skb, th, skb->len)) { + switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk, + skb, th)) { case TCP_TW_SYN: { - struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr, - ntohs(th->dest), - tcp_v4_iif(skb)); + struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo, + skb->nh.iph->daddr, + ntohs(th->dest), + inet_iif(skb)); if (sk2) { - tcp_tw_deschedule((struct tcp_tw_bucket *)sk); - tcp_tw_put((struct tcp_tw_bucket *)sk); + inet_twsk_deschedule((struct inet_timewait_sock *)sk, + &tcp_death_row); + inet_twsk_put((struct inet_timewait_sock *)sk); sk = sk2; goto process; } @@ -1834,112 +1322,6 @@ do_time_wait: goto discard_it; } -/* With per-bucket locks this operation is not-atomic, so that - * this version is not worse. - */ -static void __tcp_v4_rehash(struct sock *sk) -{ - sk->sk_prot->unhash(sk); - sk->sk_prot->hash(sk); -} - -static int tcp_v4_reselect_saddr(struct sock *sk) -{ - struct inet_sock *inet = inet_sk(sk); - int err; - struct rtable *rt; - __u32 old_saddr = inet->saddr; - __u32 new_saddr; - __u32 daddr = inet->daddr; - - if (inet->opt && inet->opt->srr) - daddr = inet->opt->faddr; - - /* Query new route. */ - err = ip_route_connect(&rt, daddr, 0, - RT_CONN_FLAGS(sk), - sk->sk_bound_dev_if, - IPPROTO_TCP, - inet->sport, inet->dport, sk); - if (err) - return err; - - __sk_dst_set(sk, &rt->u.dst); - tcp_v4_setup_caps(sk, &rt->u.dst); - - new_saddr = rt->rt_src; - - if (new_saddr == old_saddr) - return 0; - - if (sysctl_ip_dynaddr > 1) { - printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->" - "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n", - NIPQUAD(old_saddr), - NIPQUAD(new_saddr)); - } - - inet->saddr = new_saddr; - inet->rcv_saddr = new_saddr; - - /* XXX The only one ugly spot where we need to - * XXX really change the sockets identity after - * XXX it has entered the hashes. -DaveM - * - * Besides that, it does not check for connection - * uniqueness. Wait for troubles. - */ - __tcp_v4_rehash(sk); - return 0; -} - -int tcp_v4_rebuild_header(struct sock *sk) -{ - struct inet_sock *inet = inet_sk(sk); - struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0); - u32 daddr; - int err; - - /* Route is OK, nothing to do. */ - if (rt) - return 0; - - /* Reroute. */ - daddr = inet->daddr; - if (inet->opt && inet->opt->srr) - daddr = inet->opt->faddr; - - { - struct flowi fl = { .oif = sk->sk_bound_dev_if, - .nl_u = { .ip4_u = - { .daddr = daddr, - .saddr = inet->saddr, - .tos = RT_CONN_FLAGS(sk) } }, - .proto = IPPROTO_TCP, - .uli_u = { .ports = - { .sport = inet->sport, - .dport = inet->dport } } }; - - err = ip_route_output_flow(&rt, &fl, sk, 0); - } - if (!err) { - __sk_dst_set(sk, &rt->u.dst); - tcp_v4_setup_caps(sk, &rt->u.dst); - return 0; - } - - /* Routing failed... */ - sk->sk_route_caps = 0; - - if (!sysctl_ip_dynaddr || - sk->sk_state != TCP_SYN_SENT || - (sk->sk_userlocks & SOCK_BINDADDR_LOCK) || - (err = tcp_v4_reselect_saddr(sk)) != 0) - sk->sk_err_soft = -err; - - return err; -} - static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) { struct sockaddr_in *sin = (struct sockaddr_in *) uaddr; @@ -1988,18 +1370,18 @@ int tcp_v4_remember_stamp(struct sock *sk) return 0; } -int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw) +int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw) { - struct inet_peer *peer = NULL; - - peer = inet_getpeer(tw->tw_daddr, 1); + struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1); if (peer) { - if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 || + const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); + + if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 || (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec && - peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) { - peer->tcp_ts_stamp = tw->tw_ts_recent_stamp; - peer->tcp_ts = tw->tw_ts_recent; + peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) { + peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp; + peer->tcp_ts = tcptw->tw_ts_recent; } inet_putpeer(peer); return 1; @@ -2011,7 +1393,7 @@ int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw) struct tcp_func ipv4_specific = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, - .rebuild_header = tcp_v4_rebuild_header, + .rebuild_header = inet_sk_rebuild_header, .conn_request = tcp_v4_conn_request, .syn_recv_sock = tcp_v4_syn_recv_sock, .remember_stamp = tcp_v4_remember_stamp, @@ -2027,13 +1409,14 @@ struct tcp_func ipv4_specific = { */ static int tcp_v4_init_sock(struct sock *sk) { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); skb_queue_head_init(&tp->out_of_order_queue); tcp_init_xmit_timers(sk); tcp_prequeue_init(tp); - tp->rto = TCP_TIMEOUT_INIT; + icsk->icsk_rto = TCP_TIMEOUT_INIT; tp->mdev = TCP_TIMEOUT_INIT; /* So many TCP implementations out there (incorrectly) count the @@ -2051,7 +1434,7 @@ static int tcp_v4_init_sock(struct sock *sk) tp->mss_cache = 536; tp->reordering = sysctl_tcp_reordering; - tp->ca_ops = &tcp_init_congestion_ops; + icsk->icsk_ca_ops = &tcp_init_congestion_ops; sk->sk_state = TCP_CLOSE; @@ -2074,7 +1457,7 @@ int tcp_v4_destroy_sock(struct sock *sk) tcp_clear_xmit_timers(sk); - tcp_cleanup_congestion_control(tp); + tcp_cleanup_congestion_control(sk); /* Cleanup up the write buffer. */ sk_stream_writequeue_purge(sk); @@ -2086,8 +1469,8 @@ int tcp_v4_destroy_sock(struct sock *sk) __skb_queue_purge(&tp->ucopy.prequeue); /* Clean up a referenced TCP bind bucket. */ - if (tp->bind_hash) - tcp_put_port(sk); + if (inet_csk(sk)->icsk_bind_hash) + inet_put_port(&tcp_hashinfo, sk); /* * If sendmsg cached page exists, toss it. @@ -2107,13 +1490,13 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock); #ifdef CONFIG_PROC_FS /* Proc filesystem TCP sock list dumping. */ -static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head) +static inline struct inet_timewait_sock *tw_head(struct hlist_head *head) { return hlist_empty(head) ? NULL : - list_entry(head->first, struct tcp_tw_bucket, tw_node); + list_entry(head->first, struct inet_timewait_sock, tw_node); } -static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw) +static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw) { return tw->tw_node.next ? hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL; @@ -2121,14 +1504,14 @@ static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw) static void *listening_get_next(struct seq_file *seq, void *cur) { - struct tcp_sock *tp; + struct inet_connection_sock *icsk; struct hlist_node *node; struct sock *sk = cur; struct tcp_iter_state* st = seq->private; if (!sk) { st->bucket = 0; - sk = sk_head(&tcp_listening_hash[0]); + sk = sk_head(&tcp_hashinfo.listening_hash[0]); goto get_sk; } @@ -2137,7 +1520,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur) if (st->state == TCP_SEQ_STATE_OPENREQ) { struct request_sock *req = cur; - tp = tcp_sk(st->syn_wait_sk); + icsk = inet_csk(st->syn_wait_sk); req = req->dl_next; while (1) { while (req) { @@ -2150,17 +1533,17 @@ static void *listening_get_next(struct seq_file *seq, void *cur) if (++st->sbucket >= TCP_SYNQ_HSIZE) break; get_req: - req = tp->accept_queue.listen_opt->syn_table[st->sbucket]; + req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket]; } sk = sk_next(st->syn_wait_sk); st->state = TCP_SEQ_STATE_LISTENING; - read_unlock_bh(&tp->accept_queue.syn_wait_lock); + read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); } else { - tp = tcp_sk(sk); - read_lock_bh(&tp->accept_queue.syn_wait_lock); - if (reqsk_queue_len(&tp->accept_queue)) + icsk = inet_csk(sk); + read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + if (reqsk_queue_len(&icsk->icsk_accept_queue)) goto start_req; - read_unlock_bh(&tp->accept_queue.syn_wait_lock); + read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); sk = sk_next(sk); } get_sk: @@ -2169,9 +1552,9 @@ get_sk: cur = sk; goto out; } - tp = tcp_sk(sk); - read_lock_bh(&tp->accept_queue.syn_wait_lock); - if (reqsk_queue_len(&tp->accept_queue)) { + icsk = inet_csk(sk); + read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + if (reqsk_queue_len(&icsk->icsk_accept_queue)) { start_req: st->uid = sock_i_uid(sk); st->syn_wait_sk = sk; @@ -2179,10 +1562,10 @@ start_req: st->sbucket = 0; goto get_req; } - read_unlock_bh(&tp->accept_queue.syn_wait_lock); + read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); } - if (++st->bucket < TCP_LHTABLE_SIZE) { - sk = sk_head(&tcp_listening_hash[st->bucket]); + if (++st->bucket < INET_LHTABLE_SIZE) { + sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]); goto get_sk; } cur = NULL; @@ -2206,16 +1589,16 @@ static void *established_get_first(struct seq_file *seq) struct tcp_iter_state* st = seq->private; void *rc = NULL; - for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) { + for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) { struct sock *sk; struct hlist_node *node; - struct tcp_tw_bucket *tw; + struct inet_timewait_sock *tw; /* We can reschedule _before_ having picked the target: */ cond_resched_softirq(); - read_lock(&tcp_ehash[st->bucket].lock); - sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) { + read_lock(&tcp_hashinfo.ehash[st->bucket].lock); + sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { if (sk->sk_family != st->family) { continue; } @@ -2223,15 +1606,15 @@ static void *established_get_first(struct seq_file *seq) goto out; } st->state = TCP_SEQ_STATE_TIME_WAIT; - tw_for_each(tw, node, - &tcp_ehash[st->bucket + tcp_ehash_size].chain) { + inet_twsk_for_each(tw, node, + &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) { if (tw->tw_family != st->family) { continue; } rc = tw; goto out; } - read_unlock(&tcp_ehash[st->bucket].lock); + read_unlock(&tcp_hashinfo.ehash[st->bucket].lock); st->state = TCP_SEQ_STATE_ESTABLISHED; } out: @@ -2241,7 +1624,7 @@ out: static void *established_get_next(struct seq_file *seq, void *cur) { struct sock *sk = cur; - struct tcp_tw_bucket *tw; + struct inet_timewait_sock *tw; struct hlist_node *node; struct tcp_iter_state* st = seq->private; @@ -2258,15 +1641,15 @@ get_tw: cur = tw; goto out; } - read_unlock(&tcp_ehash[st->bucket].lock); + read_unlock(&tcp_hashinfo.ehash[st->bucket].lock); st->state = TCP_SEQ_STATE_ESTABLISHED; /* We can reschedule between buckets: */ cond_resched_softirq(); - if (++st->bucket < tcp_ehash_size) { - read_lock(&tcp_ehash[st->bucket].lock); - sk = sk_head(&tcp_ehash[st->bucket].chain); + if (++st->bucket < tcp_hashinfo.ehash_size) { + read_lock(&tcp_hashinfo.ehash[st->bucket].lock); + sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain); } else { cur = NULL; goto out; @@ -2280,7 +1663,7 @@ get_tw: } st->state = TCP_SEQ_STATE_TIME_WAIT; - tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain); + tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain); goto get_tw; found: cur = sk; @@ -2304,12 +1687,12 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos) void *rc; struct tcp_iter_state* st = seq->private; - tcp_listen_lock(); + inet_listen_lock(&tcp_hashinfo); st->state = TCP_SEQ_STATE_LISTENING; rc = listening_get_idx(seq, &pos); if (!rc) { - tcp_listen_unlock(); + inet_listen_unlock(&tcp_hashinfo); local_bh_disable(); st->state = TCP_SEQ_STATE_ESTABLISHED; rc = established_get_idx(seq, pos); @@ -2342,7 +1725,7 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) case TCP_SEQ_STATE_LISTENING: rc = listening_get_next(seq, v); if (!rc) { - tcp_listen_unlock(); + inet_listen_unlock(&tcp_hashinfo); local_bh_disable(); st->state = TCP_SEQ_STATE_ESTABLISHED; rc = established_get_first(seq); @@ -2365,17 +1748,17 @@ static void tcp_seq_stop(struct seq_file *seq, void *v) switch (st->state) { case TCP_SEQ_STATE_OPENREQ: if (v) { - struct tcp_sock *tp = tcp_sk(st->syn_wait_sk); - read_unlock_bh(&tp->accept_queue.syn_wait_lock); + struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk); + read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); } case TCP_SEQ_STATE_LISTENING: if (v != SEQ_START_TOKEN) - tcp_listen_unlock(); + inet_listen_unlock(&tcp_hashinfo); break; case TCP_SEQ_STATE_TIME_WAIT: case TCP_SEQ_STATE_ESTABLISHED: if (v) - read_unlock(&tcp_ehash[st->bucket].lock); + read_unlock(&tcp_hashinfo.ehash[st->bucket].lock); local_bh_enable(); break; } @@ -2472,18 +1855,19 @@ static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i) int timer_active; unsigned long timer_expires; struct tcp_sock *tp = tcp_sk(sp); + const struct inet_connection_sock *icsk = inet_csk(sp); struct inet_sock *inet = inet_sk(sp); unsigned int dest = inet->daddr; unsigned int src = inet->rcv_saddr; __u16 destp = ntohs(inet->dport); __u16 srcp = ntohs(inet->sport); - if (tp->pending == TCP_TIME_RETRANS) { + if (icsk->icsk_pending == ICSK_TIME_RETRANS) { timer_active = 1; - timer_expires = tp->timeout; - } else if (tp->pending == TCP_TIME_PROBE0) { + timer_expires = icsk->icsk_timeout; + } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { timer_active = 4; - timer_expires = tp->timeout; + timer_expires = icsk->icsk_timeout; } else if (timer_pending(&sp->sk_timer)) { timer_active = 2; timer_expires = sp->sk_timer.expires; @@ -2498,17 +1882,19 @@ static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i) tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq, timer_active, jiffies_to_clock_t(timer_expires - jiffies), - tp->retransmits, + icsk->icsk_retransmits, sock_i_uid(sp), - tp->probes_out, + icsk->icsk_probes_out, sock_i_ino(sp), atomic_read(&sp->sk_refcnt), sp, - tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong, + icsk->icsk_rto, + icsk->icsk_ack.ato, + (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, tp->snd_cwnd, tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh); } -static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i) +static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i) { unsigned int dest, src; __u16 destp, srcp; @@ -2588,7 +1974,7 @@ struct proto tcp_prot = { .close = tcp_close, .connect = tcp_v4_connect, .disconnect = tcp_disconnect, - .accept = tcp_accept, + .accept = inet_csk_accept, .ioctl = tcp_ioctl, .init = tcp_v4_init_sock, .destroy = tcp_v4_destroy_sock, @@ -2603,6 +1989,7 @@ struct proto tcp_prot = { .get_port = tcp_v4_get_port, .enter_memory_pressure = tcp_enter_memory_pressure, .sockets_allocated = &tcp_sockets_allocated, + .orphan_count = &tcp_orphan_count, .memory_allocated = &tcp_memory_allocated, .memory_pressure = &tcp_memory_pressure, .sysctl_mem = sysctl_tcp_mem, @@ -2610,6 +1997,7 @@ struct proto tcp_prot = { .sysctl_rmem = sysctl_tcp_rmem, .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp_sock), + .twsk_obj_size = sizeof(struct tcp_timewait_sock), .rsk_prot = &tcp_request_sock_ops, }; @@ -2631,19 +2019,13 @@ void __init tcp_v4_init(struct net_proto_family *ops) } EXPORT_SYMBOL(ipv4_specific); -EXPORT_SYMBOL(tcp_bind_hash); -EXPORT_SYMBOL(tcp_bucket_create); +EXPORT_SYMBOL(inet_bind_bucket_create); EXPORT_SYMBOL(tcp_hashinfo); -EXPORT_SYMBOL(tcp_inherit_port); -EXPORT_SYMBOL(tcp_listen_wlock); -EXPORT_SYMBOL(tcp_port_rover); EXPORT_SYMBOL(tcp_prot); -EXPORT_SYMBOL(tcp_put_port); EXPORT_SYMBOL(tcp_unhash); EXPORT_SYMBOL(tcp_v4_conn_request); EXPORT_SYMBOL(tcp_v4_connect); EXPORT_SYMBOL(tcp_v4_do_rcv); -EXPORT_SYMBOL(tcp_v4_rebuild_header); EXPORT_SYMBOL(tcp_v4_remember_stamp); EXPORT_SYMBOL(tcp_v4_send_check); EXPORT_SYMBOL(tcp_v4_syn_recv_sock); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index f42a284164b..a88db28b0af 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -35,13 +35,27 @@ #define SYNC_INIT 1 #endif -int sysctl_tcp_tw_recycle; -int sysctl_tcp_max_tw_buckets = NR_FILE*2; - int sysctl_tcp_syncookies = SYNC_INIT; int sysctl_tcp_abort_on_overflow; -static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo); +struct inet_timewait_death_row tcp_death_row = { + .sysctl_max_tw_buckets = NR_FILE * 2, + .period = TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS, + .death_lock = SPIN_LOCK_UNLOCKED, + .hashinfo = &tcp_hashinfo, + .tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0, + (unsigned long)&tcp_death_row), + .twkill_work = __WORK_INITIALIZER(tcp_death_row.twkill_work, + inet_twdr_twkill_work, + &tcp_death_row), +/* Short-time timewait calendar */ + + .twcal_hand = -1, + .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0, + (unsigned long)&tcp_death_row), +}; + +EXPORT_SYMBOL_GPL(tcp_death_row); static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) { @@ -52,47 +66,6 @@ static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) return (seq == e_win && seq == end_seq); } -/* New-style handling of TIME_WAIT sockets. */ - -int tcp_tw_count; - - -/* Must be called with locally disabled BHs. */ -static void tcp_timewait_kill(struct tcp_tw_bucket *tw) -{ - struct tcp_ehash_bucket *ehead; - struct tcp_bind_hashbucket *bhead; - struct tcp_bind_bucket *tb; - - /* Unlink from established hashes. */ - ehead = &tcp_ehash[tw->tw_hashent]; - write_lock(&ehead->lock); - if (hlist_unhashed(&tw->tw_node)) { - write_unlock(&ehead->lock); - return; - } - __hlist_del(&tw->tw_node); - sk_node_init(&tw->tw_node); - write_unlock(&ehead->lock); - - /* Disassociate with bind bucket. */ - bhead = &tcp_bhash[tcp_bhashfn(tw->tw_num)]; - spin_lock(&bhead->lock); - tb = tw->tw_tb; - __hlist_del(&tw->tw_bind_node); - tw->tw_tb = NULL; - tcp_bucket_destroy(tb); - spin_unlock(&bhead->lock); - -#ifdef INET_REFCNT_DEBUG - if (atomic_read(&tw->tw_refcnt) != 1) { - printk(KERN_DEBUG "tw_bucket %p refcnt=%d\n", tw, - atomic_read(&tw->tw_refcnt)); - } -#endif - tcp_tw_put(tw); -} - /* * * Main purpose of TIME-WAIT state is to close connection gracefully, * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN @@ -122,19 +95,20 @@ static void tcp_timewait_kill(struct tcp_tw_bucket *tw) * to avoid misread sequence numbers, states etc. --ANK */ enum tcp_tw_status -tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, - struct tcphdr *th, unsigned len) +tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, + const struct tcphdr *th) { + struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); struct tcp_options_received tmp_opt; int paws_reject = 0; tmp_opt.saw_tstamp = 0; - if (th->doff > (sizeof(struct tcphdr) >> 2) && tw->tw_ts_recent_stamp) { + if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { tcp_parse_options(skb, &tmp_opt, 0); if (tmp_opt.saw_tstamp) { - tmp_opt.ts_recent = tw->tw_ts_recent; - tmp_opt.ts_recent_stamp = tw->tw_ts_recent_stamp; + tmp_opt.ts_recent = tcptw->tw_ts_recent; + tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; paws_reject = tcp_paws_check(&tmp_opt, th->rst); } } @@ -145,20 +119,20 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, /* Out of window, send ACK */ if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, - tw->tw_rcv_nxt, - tw->tw_rcv_nxt + tw->tw_rcv_wnd)) + tcptw->tw_rcv_nxt, + tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd)) return TCP_TW_ACK; if (th->rst) goto kill; - if (th->syn && !before(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt)) + if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt)) goto kill_with_rst; /* Dup ACK? */ - if (!after(TCP_SKB_CB(skb)->end_seq, tw->tw_rcv_nxt) || + if (!after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) || TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) { - tcp_tw_put(tw); + inet_twsk_put(tw); return TCP_TW_SUCCESS; } @@ -166,19 +140,19 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, * reset. */ if (!th->fin || - TCP_SKB_CB(skb)->end_seq != tw->tw_rcv_nxt + 1) { + TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) { kill_with_rst: - tcp_tw_deschedule(tw); - tcp_tw_put(tw); + inet_twsk_deschedule(tw, &tcp_death_row); + inet_twsk_put(tw); return TCP_TW_RST; } /* FIN arrived, enter true time-wait state. */ - tw->tw_substate = TCP_TIME_WAIT; - tw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq; + tw->tw_substate = TCP_TIME_WAIT; + tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq; if (tmp_opt.saw_tstamp) { - tw->tw_ts_recent_stamp = xtime.tv_sec; - tw->tw_ts_recent = tmp_opt.rcv_tsval; + tcptw->tw_ts_recent_stamp = xtime.tv_sec; + tcptw->tw_ts_recent = tmp_opt.rcv_tsval; } /* I am shamed, but failed to make it more elegant. @@ -187,11 +161,13 @@ kill_with_rst: * do not undertsnad recycling in any case, it not * a big problem in practice. --ANK */ if (tw->tw_family == AF_INET && - sysctl_tcp_tw_recycle && tw->tw_ts_recent_stamp && + tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && tcp_v4_tw_remember_stamp(tw)) - tcp_tw_schedule(tw, tw->tw_timeout); + inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout, + TCP_TIMEWAIT_LEN); else - tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, + TCP_TIMEWAIT_LEN); return TCP_TW_ACK; } @@ -213,7 +189,7 @@ kill_with_rst: */ if (!paws_reject && - (TCP_SKB_CB(skb)->seq == tw->tw_rcv_nxt && + (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt && (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) { /* In window segment, it may be only reset or bare ack. */ @@ -224,19 +200,20 @@ kill_with_rst: */ if (sysctl_tcp_rfc1337 == 0) { kill: - tcp_tw_deschedule(tw); - tcp_tw_put(tw); + inet_twsk_deschedule(tw, &tcp_death_row); + inet_twsk_put(tw); return TCP_TW_SUCCESS; } } - tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, + TCP_TIMEWAIT_LEN); if (tmp_opt.saw_tstamp) { - tw->tw_ts_recent = tmp_opt.rcv_tsval; - tw->tw_ts_recent_stamp = xtime.tv_sec; + tcptw->tw_ts_recent = tmp_opt.rcv_tsval; + tcptw->tw_ts_recent_stamp = xtime.tv_sec; } - tcp_tw_put(tw); + inet_twsk_put(tw); return TCP_TW_SUCCESS; } @@ -258,9 +235,10 @@ kill: */ if (th->syn && !th->rst && !th->ack && !paws_reject && - (after(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt) || - (tmp_opt.saw_tstamp && (s32)(tw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) { - u32 isn = tw->tw_snd_nxt + 65535 + 2; + (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) || + (tmp_opt.saw_tstamp && + (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) { + u32 isn = tcptw->tw_snd_nxt + 65535 + 2; if (isn == 0) isn++; TCP_SKB_CB(skb)->when = isn; @@ -278,107 +256,57 @@ kill: * Do not reschedule in the last case. */ if (paws_reject || th->ack) - tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, + TCP_TIMEWAIT_LEN); /* Send ACK. Note, we do not put the bucket, * it will be released by caller. */ return TCP_TW_ACK; } - tcp_tw_put(tw); + inet_twsk_put(tw); return TCP_TW_SUCCESS; } -/* Enter the time wait state. This is called with locally disabled BH. - * Essentially we whip up a timewait bucket, copy the - * relevant info into it from the SK, and mess with hash chains - * and list linkage. - */ -static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) -{ - struct tcp_ehash_bucket *ehead = &tcp_ehash[sk->sk_hashent]; - struct tcp_bind_hashbucket *bhead; - - /* Step 1: Put TW into bind hash. Original socket stays there too. - Note, that any socket with inet_sk(sk)->num != 0 MUST be bound in - binding cache, even if it is closed. - */ - bhead = &tcp_bhash[tcp_bhashfn(inet_sk(sk)->num)]; - spin_lock(&bhead->lock); - tw->tw_tb = tcp_sk(sk)->bind_hash; - BUG_TRAP(tcp_sk(sk)->bind_hash); - tw_add_bind_node(tw, &tw->tw_tb->owners); - spin_unlock(&bhead->lock); - - write_lock(&ehead->lock); - - /* Step 2: Remove SK from established hash. */ - if (__sk_del_node_init(sk)) - sock_prot_dec_use(sk->sk_prot); - - /* Step 3: Hash TW into TIMEWAIT half of established hash table. */ - tw_add_node(tw, &(ehead + tcp_ehash_size)->chain); - atomic_inc(&tw->tw_refcnt); - - write_unlock(&ehead->lock); -} - /* * Move a socket to time-wait or dead fin-wait-2 state. */ void tcp_time_wait(struct sock *sk, int state, int timeo) { - struct tcp_tw_bucket *tw = NULL; - struct tcp_sock *tp = tcp_sk(sk); + struct inet_timewait_sock *tw = NULL; + const struct tcp_sock *tp = tcp_sk(sk); int recycle_ok = 0; - if (sysctl_tcp_tw_recycle && tp->rx_opt.ts_recent_stamp) + if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) recycle_ok = tp->af_specific->remember_stamp(sk); - if (tcp_tw_count < sysctl_tcp_max_tw_buckets) - tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC); - - if(tw != NULL) { - struct inet_sock *inet = inet_sk(sk); - int rto = (tp->rto<<2) - (tp->rto>>1); - - /* Give us an identity. */ - tw->tw_daddr = inet->daddr; - tw->tw_rcv_saddr = inet->rcv_saddr; - tw->tw_bound_dev_if = sk->sk_bound_dev_if; - tw->tw_num = inet->num; - tw->tw_state = TCP_TIME_WAIT; - tw->tw_substate = state; - tw->tw_sport = inet->sport; - tw->tw_dport = inet->dport; - tw->tw_family = sk->sk_family; - tw->tw_reuse = sk->sk_reuse; - tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; - atomic_set(&tw->tw_refcnt, 1); + if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) + tw = inet_twsk_alloc(sk, state); - tw->tw_hashent = sk->sk_hashent; - tw->tw_rcv_nxt = tp->rcv_nxt; - tw->tw_snd_nxt = tp->snd_nxt; - tw->tw_rcv_wnd = tcp_receive_window(tp); - tw->tw_ts_recent = tp->rx_opt.ts_recent; - tw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; - tw_dead_node_init(tw); + if (tw != NULL) { + struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); + const struct inet_connection_sock *icsk = inet_csk(sk); + const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); + + tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; + tcptw->tw_rcv_nxt = tp->rcv_nxt; + tcptw->tw_snd_nxt = tp->snd_nxt; + tcptw->tw_rcv_wnd = tcp_receive_window(tp); + tcptw->tw_ts_recent = tp->rx_opt.ts_recent; + tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) if (tw->tw_family == PF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); + struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw); - ipv6_addr_copy(&tw->tw_v6_daddr, &np->daddr); - ipv6_addr_copy(&tw->tw_v6_rcv_saddr, &np->rcv_saddr); - tw->tw_v6_ipv6only = np->ipv6only; - } else { - memset(&tw->tw_v6_daddr, 0, sizeof(tw->tw_v6_daddr)); - memset(&tw->tw_v6_rcv_saddr, 0, sizeof(tw->tw_v6_rcv_saddr)); - tw->tw_v6_ipv6only = 0; + ipv6_addr_copy(&tcp6tw->tw_v6_daddr, &np->daddr); + ipv6_addr_copy(&tcp6tw->tw_v6_rcv_saddr, &np->rcv_saddr); + tw->tw_ipv6only = np->ipv6only; } #endif /* Linkage updates. */ - __tcp_tw_hashdance(sk, tw); + __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); /* Get the TIME_WAIT timeout firing. */ if (timeo < rto) @@ -392,8 +320,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) timeo = TCP_TIMEWAIT_LEN; } - tcp_tw_schedule(tw, timeo); - tcp_tw_put(tw); + inet_twsk_schedule(tw, &tcp_death_row, timeo, + TCP_TIMEWAIT_LEN); + inet_twsk_put(tw); } else { /* Sorry, if we're out of memory, just CLOSE this * socket up. We've got bigger problems than @@ -407,277 +336,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tcp_done(sk); } -/* Kill off TIME_WAIT sockets once their lifetime has expired. */ -static int tcp_tw_death_row_slot; - -static void tcp_twkill(unsigned long); - -/* TIME_WAIT reaping mechanism. */ -#define TCP_TWKILL_SLOTS 8 /* Please keep this a power of 2. */ -#define TCP_TWKILL_PERIOD (TCP_TIMEWAIT_LEN/TCP_TWKILL_SLOTS) - -#define TCP_TWKILL_QUOTA 100 - -static struct hlist_head tcp_tw_death_row[TCP_TWKILL_SLOTS]; -static DEFINE_SPINLOCK(tw_death_lock); -static struct timer_list tcp_tw_timer = TIMER_INITIALIZER(tcp_twkill, 0, 0); -static void twkill_work(void *); -static DECLARE_WORK(tcp_twkill_work, twkill_work, NULL); -static u32 twkill_thread_slots; - -/* Returns non-zero if quota exceeded. */ -static int tcp_do_twkill_work(int slot, unsigned int quota) -{ - struct tcp_tw_bucket *tw; - struct hlist_node *node; - unsigned int killed; - int ret; - - /* NOTE: compare this to previous version where lock - * was released after detaching chain. It was racy, - * because tw buckets are scheduled in not serialized context - * in 2.3 (with netfilter), and with softnet it is common, because - * soft irqs are not sequenced. - */ - killed = 0; - ret = 0; -rescan: - tw_for_each_inmate(tw, node, &tcp_tw_death_row[slot]) { - __tw_del_dead_node(tw); - spin_unlock(&tw_death_lock); - tcp_timewait_kill(tw); - tcp_tw_put(tw); - killed++; - spin_lock(&tw_death_lock); - if (killed > quota) { - ret = 1; - break; - } - - /* While we dropped tw_death_lock, another cpu may have - * killed off the next TW bucket in the list, therefore - * do a fresh re-read of the hlist head node with the - * lock reacquired. We still use the hlist traversal - * macro in order to get the prefetches. - */ - goto rescan; - } - - tcp_tw_count -= killed; - NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed); - - return ret; -} - -static void tcp_twkill(unsigned long dummy) -{ - int need_timer, ret; - - spin_lock(&tw_death_lock); - - if (tcp_tw_count == 0) - goto out; - - need_timer = 0; - ret = tcp_do_twkill_work(tcp_tw_death_row_slot, TCP_TWKILL_QUOTA); - if (ret) { - twkill_thread_slots |= (1 << tcp_tw_death_row_slot); - mb(); - schedule_work(&tcp_twkill_work); - need_timer = 1; - } else { - /* We purged the entire slot, anything left? */ - if (tcp_tw_count) - need_timer = 1; - } - tcp_tw_death_row_slot = - ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1)); - if (need_timer) - mod_timer(&tcp_tw_timer, jiffies + TCP_TWKILL_PERIOD); -out: - spin_unlock(&tw_death_lock); -} - -extern void twkill_slots_invalid(void); - -static void twkill_work(void *dummy) -{ - int i; - - if ((TCP_TWKILL_SLOTS - 1) > (sizeof(twkill_thread_slots) * 8)) - twkill_slots_invalid(); - - while (twkill_thread_slots) { - spin_lock_bh(&tw_death_lock); - for (i = 0; i < TCP_TWKILL_SLOTS; i++) { - if (!(twkill_thread_slots & (1 << i))) - continue; - - while (tcp_do_twkill_work(i, TCP_TWKILL_QUOTA) != 0) { - if (need_resched()) { - spin_unlock_bh(&tw_death_lock); - schedule(); - spin_lock_bh(&tw_death_lock); - } - } - - twkill_thread_slots &= ~(1 << i); - } - spin_unlock_bh(&tw_death_lock); - } -} - -/* These are always called from BH context. See callers in - * tcp_input.c to verify this. - */ - -/* This is for handling early-kills of TIME_WAIT sockets. */ -void tcp_tw_deschedule(struct tcp_tw_bucket *tw) -{ - spin_lock(&tw_death_lock); - if (tw_del_dead_node(tw)) { - tcp_tw_put(tw); - if (--tcp_tw_count == 0) - del_timer(&tcp_tw_timer); - } - spin_unlock(&tw_death_lock); - tcp_timewait_kill(tw); -} - -/* Short-time timewait calendar */ - -static int tcp_twcal_hand = -1; -static int tcp_twcal_jiffie; -static void tcp_twcal_tick(unsigned long); -static struct timer_list tcp_twcal_timer = - TIMER_INITIALIZER(tcp_twcal_tick, 0, 0); -static struct hlist_head tcp_twcal_row[TCP_TW_RECYCLE_SLOTS]; - -static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo) -{ - struct hlist_head *list; - int slot; - - /* timeout := RTO * 3.5 - * - * 3.5 = 1+2+0.5 to wait for two retransmits. - * - * RATIONALE: if FIN arrived and we entered TIME-WAIT state, - * our ACK acking that FIN can be lost. If N subsequent retransmitted - * FINs (or previous seqments) are lost (probability of such event - * is p^(N+1), where p is probability to lose single packet and - * time to detect the loss is about RTO*(2^N - 1) with exponential - * backoff). Normal timewait length is calculated so, that we - * waited at least for one retransmitted FIN (maximal RTO is 120sec). - * [ BTW Linux. following BSD, violates this requirement waiting - * only for 60sec, we should wait at least for 240 secs. - * Well, 240 consumes too much of resources 8) - * ] - * This interval is not reduced to catch old duplicate and - * responces to our wandering segments living for two MSLs. - * However, if we use PAWS to detect - * old duplicates, we can reduce the interval to bounds required - * by RTO, rather than MSL. So, if peer understands PAWS, we - * kill tw bucket after 3.5*RTO (it is important that this number - * is greater than TS tick!) and detect old duplicates with help - * of PAWS. - */ - slot = (timeo + (1<<TCP_TW_RECYCLE_TICK) - 1) >> TCP_TW_RECYCLE_TICK; - - spin_lock(&tw_death_lock); - - /* Unlink it, if it was scheduled */ - if (tw_del_dead_node(tw)) - tcp_tw_count--; - else - atomic_inc(&tw->tw_refcnt); - - if (slot >= TCP_TW_RECYCLE_SLOTS) { - /* Schedule to slow timer */ - if (timeo >= TCP_TIMEWAIT_LEN) { - slot = TCP_TWKILL_SLOTS-1; - } else { - slot = (timeo + TCP_TWKILL_PERIOD-1) / TCP_TWKILL_PERIOD; - if (slot >= TCP_TWKILL_SLOTS) - slot = TCP_TWKILL_SLOTS-1; - } - tw->tw_ttd = jiffies + timeo; - slot = (tcp_tw_death_row_slot + slot) & (TCP_TWKILL_SLOTS - 1); - list = &tcp_tw_death_row[slot]; - } else { - tw->tw_ttd = jiffies + (slot << TCP_TW_RECYCLE_TICK); - - if (tcp_twcal_hand < 0) { - tcp_twcal_hand = 0; - tcp_twcal_jiffie = jiffies; - tcp_twcal_timer.expires = tcp_twcal_jiffie + (slot<<TCP_TW_RECYCLE_TICK); - add_timer(&tcp_twcal_timer); - } else { - if (time_after(tcp_twcal_timer.expires, jiffies + (slot<<TCP_TW_RECYCLE_TICK))) - mod_timer(&tcp_twcal_timer, jiffies + (slot<<TCP_TW_RECYCLE_TICK)); - slot = (tcp_twcal_hand + slot)&(TCP_TW_RECYCLE_SLOTS-1); - } - list = &tcp_twcal_row[slot]; - } - - hlist_add_head(&tw->tw_death_node, list); - - if (tcp_tw_count++ == 0) - mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD); - spin_unlock(&tw_death_lock); -} - -void tcp_twcal_tick(unsigned long dummy) -{ - int n, slot; - unsigned long j; - unsigned long now = jiffies; - int killed = 0; - int adv = 0; - - spin_lock(&tw_death_lock); - if (tcp_twcal_hand < 0) - goto out; - - slot = tcp_twcal_hand; - j = tcp_twcal_jiffie; - - for (n=0; n<TCP_TW_RECYCLE_SLOTS; n++) { - if (time_before_eq(j, now)) { - struct hlist_node *node, *safe; - struct tcp_tw_bucket *tw; - - tw_for_each_inmate_safe(tw, node, safe, - &tcp_twcal_row[slot]) { - __tw_del_dead_node(tw); - tcp_timewait_kill(tw); - tcp_tw_put(tw); - killed++; - } - } else { - if (!adv) { - adv = 1; - tcp_twcal_jiffie = j; - tcp_twcal_hand = slot; - } - - if (!hlist_empty(&tcp_twcal_row[slot])) { - mod_timer(&tcp_twcal_timer, j); - goto out; - } - } - j += (1<<TCP_TW_RECYCLE_TICK); - slot = (slot+1)&(TCP_TW_RECYCLE_SLOTS-1); - } - tcp_twcal_hand = -1; - -out: - if ((tcp_tw_count -= killed) == 0) - del_timer(&tcp_tw_timer); - NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed); - spin_unlock(&tw_death_lock); -} - /* This is not only more efficient than what we used to do, it eliminates * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM * @@ -686,75 +344,27 @@ out: */ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb) { - /* allocate the newsk from the same slab of the master sock, - * if not, at sk_free time we'll try to free it from the wrong - * slabcache (i.e. is it TCPv4 or v6?), this is handled thru sk->sk_prot -acme */ - struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, sk->sk_prot, 0); + struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC); - if(newsk != NULL) { - struct inet_request_sock *ireq = inet_rsk(req); + if (newsk != NULL) { + const struct inet_request_sock *ireq = inet_rsk(req); struct tcp_request_sock *treq = tcp_rsk(req); + struct inet_connection_sock *newicsk = inet_csk(sk); struct tcp_sock *newtp; - struct sk_filter *filter; - - memcpy(newsk, sk, sizeof(struct tcp_sock)); - newsk->sk_state = TCP_SYN_RECV; - - /* SANITY */ - sk_node_init(&newsk->sk_node); - tcp_sk(newsk)->bind_hash = NULL; - - /* Clone the TCP header template */ - inet_sk(newsk)->dport = ireq->rmt_port; - - sock_lock_init(newsk); - bh_lock_sock(newsk); - - rwlock_init(&newsk->sk_dst_lock); - atomic_set(&newsk->sk_rmem_alloc, 0); - skb_queue_head_init(&newsk->sk_receive_queue); - atomic_set(&newsk->sk_wmem_alloc, 0); - skb_queue_head_init(&newsk->sk_write_queue); - atomic_set(&newsk->sk_omem_alloc, 0); - newsk->sk_wmem_queued = 0; - newsk->sk_forward_alloc = 0; - - sock_reset_flag(newsk, SOCK_DONE); - newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; - newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; - newsk->sk_send_head = NULL; - rwlock_init(&newsk->sk_callback_lock); - skb_queue_head_init(&newsk->sk_error_queue); - newsk->sk_write_space = sk_stream_write_space; - - if ((filter = newsk->sk_filter) != NULL) - sk_filter_charge(newsk, filter); - - if (unlikely(xfrm_sk_clone_policy(newsk))) { - /* It is still raw copy of parent, so invalidate - * destructor and make plain sk_free() */ - newsk->sk_destruct = NULL; - sk_free(newsk); - return NULL; - } /* Now setup tcp_sock */ newtp = tcp_sk(newsk); newtp->pred_flags = 0; newtp->rcv_nxt = treq->rcv_isn + 1; - newtp->snd_nxt = treq->snt_isn + 1; - newtp->snd_una = treq->snt_isn + 1; - newtp->snd_sml = treq->snt_isn + 1; + newtp->snd_nxt = newtp->snd_una = newtp->snd_sml = treq->snt_isn + 1; tcp_prequeue_init(newtp); tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn); - newtp->retransmits = 0; - newtp->backoff = 0; newtp->srtt = 0; newtp->mdev = TCP_TIMEOUT_INIT; - newtp->rto = TCP_TIMEOUT_INIT; + newicsk->icsk_rto = TCP_TIMEOUT_INIT; newtp->packets_out = 0; newtp->left_out = 0; @@ -774,9 +384,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->frto_counter = 0; newtp->frto_highmark = 0; - newtp->ca_ops = &tcp_reno; + newicsk->icsk_ca_ops = &tcp_reno; - tcp_set_ca_state(newtp, TCP_CA_Open); + tcp_set_ca_state(newsk, TCP_CA_Open); tcp_init_xmit_timers(newsk); skb_queue_head_init(&newtp->out_of_order_queue); newtp->rcv_wup = treq->rcv_isn + 1; @@ -789,26 +399,12 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->rx_opt.dsack = 0; newtp->rx_opt.eff_sacks = 0; - newtp->probes_out = 0; newtp->rx_opt.num_sacks = 0; newtp->urg_data = 0; - /* Deinitialize accept_queue to trap illegal accesses. */ - memset(&newtp->accept_queue, 0, sizeof(newtp->accept_queue)); - - /* Back to base struct sock members. */ - newsk->sk_err = 0; - newsk->sk_priority = 0; - atomic_set(&newsk->sk_refcnt, 2); -#ifdef INET_REFCNT_DEBUG - atomic_inc(&inet_sock_nr); -#endif - atomic_inc(&tcp_sockets_allocated); if (sock_flag(newsk, SOCK_KEEPOPEN)) - tcp_reset_keepalive_timer(newsk, - keepalive_time_when(newtp)); - newsk->sk_socket = NULL; - newsk->sk_sleep = NULL; + inet_csk_reset_keepalive_timer(newsk, + keepalive_time_when(newtp)); newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; if((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) { @@ -838,7 +434,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->tcp_header_len = sizeof(struct tcphdr); } if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len) - newtp->ack.last_seg_size = skb->len-newtp->tcp_header_len; + newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; newtp->rx_opt.mss_clamp = req->mss; TCP_ECN_openreq_child(newtp, req); if (newtp->ecn_flags&TCP_ECN_OK) @@ -934,9 +530,10 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, does sequence test, SYN is truncated, and thus we consider it a bare ACK. - If tp->defer_accept, we silently drop this bare ACK. Otherwise, - we create an established connection. Both ends (listening sockets) - accept the new incoming connection and try to talk to each other. 8-) + If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this + bare ACK. Otherwise, we create an established connection. Both + ends (listening sockets) accept the new incoming connection and try + to talk to each other. 8-) Note: This case is both harmless, and rare. Possibility is about the same as us discovering intelligent life on another plant tomorrow. @@ -1003,7 +600,8 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, return NULL; /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */ - if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { + if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && + TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { inet_rsk(req)->acked = 1; return NULL; } @@ -1018,10 +616,10 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, if (child == NULL) goto listen_overflow; - tcp_synq_unlink(tp, req, prev); - tcp_synq_removed(sk, req); + inet_csk_reqsk_queue_unlink(sk, req, prev); + inet_csk_reqsk_queue_removed(sk, req); - tcp_acceptq_queue(sk, req, child); + inet_csk_reqsk_queue_add(sk, req, child); return child; listen_overflow: @@ -1035,7 +633,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, if (!(flg & TCP_FLAG_RST)) req->rsk_ops->send_reset(skb); - tcp_synq_drop(sk, req, prev); + inet_csk_reqsk_queue_drop(sk, req, prev); return NULL; } @@ -1074,4 +672,3 @@ EXPORT_SYMBOL(tcp_check_req); EXPORT_SYMBOL(tcp_child_process); EXPORT_SYMBOL(tcp_create_openreq_child); EXPORT_SYMBOL(tcp_timewait_state_process); -EXPORT_SYMBOL(tcp_tw_deschedule); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index dd30dd137b7..75b68116682 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -105,18 +105,19 @@ static __u16 tcp_advertise_mss(struct sock *sk) /* RFC2861. Reset CWND after idle period longer RTO to "restart window". * This is the first part of cwnd validation mechanism. */ -static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst) +static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst) { + struct tcp_sock *tp = tcp_sk(sk); s32 delta = tcp_time_stamp - tp->lsndtime; u32 restart_cwnd = tcp_init_cwnd(tp, dst); u32 cwnd = tp->snd_cwnd; - tcp_ca_event(tp, CA_EVENT_CWND_RESTART); + tcp_ca_event(sk, CA_EVENT_CWND_RESTART); - tp->snd_ssthresh = tcp_current_ssthresh(tp); + tp->snd_ssthresh = tcp_current_ssthresh(sk); restart_cwnd = min(restart_cwnd, cwnd); - while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd) + while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd) cwnd >>= 1; tp->snd_cwnd = max(cwnd, restart_cwnd); tp->snd_cwnd_stamp = tcp_time_stamp; @@ -126,26 +127,25 @@ static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst) static inline void tcp_event_data_sent(struct tcp_sock *tp, struct sk_buff *skb, struct sock *sk) { - u32 now = tcp_time_stamp; + struct inet_connection_sock *icsk = inet_csk(sk); + const u32 now = tcp_time_stamp; - if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto) - tcp_cwnd_restart(tp, __sk_dst_get(sk)); + if (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto) + tcp_cwnd_restart(sk, __sk_dst_get(sk)); tp->lsndtime = now; /* If it is a reply for ato after last received * packet, enter pingpong mode. */ - if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato) - tp->ack.pingpong = 1; + if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato) + icsk->icsk_ack.pingpong = 1; } static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) { - struct tcp_sock *tp = tcp_sk(sk); - - tcp_dec_quickack_mode(tp, pkts); - tcp_clear_xmit_timer(sk, TCP_TIME_DACK); + tcp_dec_quickack_mode(sk, pkts); + inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); } /* Determine a window scaling and initial window to offer. @@ -265,6 +265,7 @@ static __inline__ u16 tcp_select_window(struct sock *sk) static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) { if (skb != NULL) { + const struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet = inet_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); @@ -280,8 +281,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) #define SYSCTL_FLAG_SACK 0x4 /* If congestion control is doing timestamping */ - if (tp->ca_ops->rtt_sample) - do_gettimeofday(&skb->stamp); + if (icsk->icsk_ca_ops->rtt_sample) + __net_timestamp(skb); sysctl_flags = 0; if (tcb->flags & TCPCB_FLAG_SYN) { @@ -308,7 +309,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) } if (tcp_packets_in_flight(tp) == 0) - tcp_ca_event(tp, CA_EVENT_TX_START); + tcp_ca_event(sk, CA_EVENT_TX_START); th = (struct tcphdr *) skb_push(skb, tcp_header_size); skb->h.th = th; @@ -366,7 +367,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) if (err <= 0) return err; - tcp_enter_cwr(tp); + tcp_enter_cwr(sk); /* NET_XMIT_CN is special. It does not guarantee, * that this packet is lost. It tells that device @@ -482,7 +483,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned * skbs, which it never sent before. --ANK */ TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; - buff->stamp = skb->stamp; + buff->tstamp = skb->tstamp; if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { tp->lost_out -= tcp_skb_pcount(skb); @@ -505,7 +506,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned /* Link BUFF into the send queue. */ skb_header_release(buff); - __skb_append(skb, buff); + __skb_append(skb, buff, &sk->sk_write_queue); return 0; } @@ -696,7 +697,7 @@ static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) if (tp->packets_out > tp->snd_cwnd_used) tp->snd_cwnd_used = tp->packets_out; - if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto) + if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto) tcp_cwnd_application_limited(sk); } } @@ -893,7 +894,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, /* Link BUFF into the send queue. */ skb_header_release(buff); - __skb_append(skb, buff); + __skb_append(skb, buff, &sk->sk_write_queue); return 0; } @@ -905,12 +906,13 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, */ static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) { + const struct inet_connection_sock *icsk = inet_csk(sk); u32 send_win, cong_win, limit, in_flight; if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) return 0; - if (tp->ca_state != TCP_CA_Open) + if (icsk->icsk_ca_state != TCP_CA_Open) return 0; in_flight = tcp_packets_in_flight(tp); @@ -1147,6 +1149,7 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now) */ u32 __tcp_select_window(struct sock *sk) { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); /* MSS for the peer's data. Previous verions used mss_clamp * here. I don't know if the value based on our guesses @@ -1154,7 +1157,7 @@ u32 __tcp_select_window(struct sock *sk) * but may be worse for the performance because of rcv_mss * fluctuations. --SAW 1998/11/1 */ - int mss = tp->ack.rcv_mss; + int mss = icsk->icsk_ack.rcv_mss; int free_space = tcp_space(sk); int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk)); int window; @@ -1163,7 +1166,7 @@ u32 __tcp_select_window(struct sock *sk) mss = full_space; if (free_space < full_space/2) { - tp->ack.quick = 0; + icsk->icsk_ack.quick = 0; if (tcp_memory_pressure) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss); @@ -1238,7 +1241,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m tcp_skb_pcount(next_skb) != 1); /* Ok. We will be able to collapse the packet. */ - __skb_unlink(next_skb, next_skb->list); + __skb_unlink(next_skb, &sk->sk_write_queue); memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size); @@ -1286,6 +1289,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m */ void tcp_simple_retransmit(struct sock *sk) { + const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; unsigned int mss = tcp_current_mss(sk, 0); @@ -1316,12 +1320,12 @@ void tcp_simple_retransmit(struct sock *sk) * in network, but units changed and effective * cwnd/ssthresh really reduced now. */ - if (tp->ca_state != TCP_CA_Loss) { + if (icsk->icsk_ca_state != TCP_CA_Loss) { tp->high_seq = tp->snd_nxt; - tp->snd_ssthresh = tcp_current_ssthresh(tp); + tp->snd_ssthresh = tcp_current_ssthresh(sk); tp->prior_ssthresh = 0; tp->undo_marker = 0; - tcp_set_ca_state(tp, TCP_CA_Loss); + tcp_set_ca_state(sk, TCP_CA_Loss); } tcp_xmit_retransmit_queue(sk); } @@ -1461,6 +1465,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) */ void tcp_xmit_retransmit_queue(struct sock *sk) { + const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; int packet_cnt = tp->lost_out; @@ -1484,14 +1489,16 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { if (tcp_retransmit_skb(sk, skb)) return; - if (tp->ca_state != TCP_CA_Loss) + if (icsk->icsk_ca_state != TCP_CA_Loss) NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS); else NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS); if (skb == skb_peek(&sk->sk_write_queue)) - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, + TCP_RTO_MAX); } packet_cnt -= tcp_skb_pcount(skb); @@ -1504,7 +1511,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) /* OK, demanded retransmission is finished. */ /* Forward retransmissions are possible only during Recovery. */ - if (tp->ca_state != TCP_CA_Recovery) + if (icsk->icsk_ca_state != TCP_CA_Recovery) return; /* No forward retransmissions in Reno are possible. */ @@ -1544,7 +1551,9 @@ void tcp_xmit_retransmit_queue(struct sock *sk) break; if (skb == skb_peek(&sk->sk_write_queue)) - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, + TCP_RTO_MAX); NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS); } @@ -1573,7 +1582,7 @@ void tcp_send_fin(struct sock *sk) } else { /* Socket is locked, keep trying until memory is available. */ for (;;) { - skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL); + skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_KERNEL); if (skb) break; yield(); @@ -1780,8 +1789,8 @@ static inline void tcp_connect_init(struct sock *sk) tp->rcv_wup = 0; tp->copied_seq = 0; - tp->rto = TCP_TIMEOUT_INIT; - tp->retransmits = 0; + inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; + inet_csk(sk)->icsk_retransmits = 0; tcp_clear_retrans(tp); } @@ -1795,7 +1804,7 @@ int tcp_connect(struct sock *sk) tcp_connect_init(sk); - buff = alloc_skb(MAX_TCP_HEADER + 15, sk->sk_allocation); + buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation); if (unlikely(buff == NULL)) return -ENOBUFS; @@ -1824,7 +1833,8 @@ int tcp_connect(struct sock *sk) TCP_INC_STATS(TCP_MIB_ACTIVEOPENS); /* Timer for repeating the SYN until an answer. */ - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, TCP_RTO_MAX); return 0; } @@ -1834,20 +1844,21 @@ int tcp_connect(struct sock *sk) */ void tcp_send_delayed_ack(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); - int ato = tp->ack.ato; + struct inet_connection_sock *icsk = inet_csk(sk); + int ato = icsk->icsk_ack.ato; unsigned long timeout; if (ato > TCP_DELACK_MIN) { + const struct tcp_sock *tp = tcp_sk(sk); int max_ato = HZ/2; - if (tp->ack.pingpong || (tp->ack.pending&TCP_ACK_PUSHED)) + if (icsk->icsk_ack.pingpong || (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)) max_ato = TCP_DELACK_MAX; /* Slow path, intersegment interval is "high". */ /* If some rtt estimate is known, use it to bound delayed ack. - * Do not use tp->rto here, use results of rtt measurements + * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements * directly. */ if (tp->srtt) { @@ -1864,21 +1875,22 @@ void tcp_send_delayed_ack(struct sock *sk) timeout = jiffies + ato; /* Use new timeout only if there wasn't a older one earlier. */ - if (tp->ack.pending&TCP_ACK_TIMER) { + if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) { /* If delack timer was blocked or is about to expire, * send ACK now. */ - if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) { + if (icsk->icsk_ack.blocked || + time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) { tcp_send_ack(sk); return; } - if (!time_before(timeout, tp->ack.timeout)) - timeout = tp->ack.timeout; + if (!time_before(timeout, icsk->icsk_ack.timeout)) + timeout = icsk->icsk_ack.timeout; } - tp->ack.pending |= TCP_ACK_SCHED|TCP_ACK_TIMER; - tp->ack.timeout = timeout; - sk_reset_timer(sk, &tp->delack_timer, timeout); + icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER; + icsk->icsk_ack.timeout = timeout; + sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout); } /* This routine sends an ack and also updates the window. */ @@ -1895,9 +1907,10 @@ void tcp_send_ack(struct sock *sk) */ buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); if (buff == NULL) { - tcp_schedule_ack(tp); - tp->ack.ato = TCP_ATO_MIN; - tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX); + inet_csk_schedule_ack(sk); + inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, + TCP_DELACK_MAX, TCP_RTO_MAX); return; } @@ -2011,6 +2024,7 @@ int tcp_write_wakeup(struct sock *sk) */ void tcp_send_probe0(struct sock *sk) { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int err; @@ -2018,28 +2032,31 @@ void tcp_send_probe0(struct sock *sk) if (tp->packets_out || !sk->sk_send_head) { /* Cancel probe timer, if it is not required. */ - tp->probes_out = 0; - tp->backoff = 0; + icsk->icsk_probes_out = 0; + icsk->icsk_backoff = 0; return; } if (err <= 0) { - if (tp->backoff < sysctl_tcp_retries2) - tp->backoff++; - tp->probes_out++; - tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, - min(tp->rto << tp->backoff, TCP_RTO_MAX)); + if (icsk->icsk_backoff < sysctl_tcp_retries2) + icsk->icsk_backoff++; + icsk->icsk_probes_out++; + inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, + min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX), + TCP_RTO_MAX); } else { /* If packet was not sent due to local congestion, - * do not backoff and do not remember probes_out. + * do not backoff and do not remember icsk_probes_out. * Let local senders to fight for local resources. * * Use accumulated backoff yet. */ - if (!tp->probes_out) - tp->probes_out=1; - tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, - min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL)); + if (!icsk->icsk_probes_out) + icsk->icsk_probes_out = 1; + inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, + min(icsk->icsk_rto << icsk->icsk_backoff, + TCP_RESOURCE_PROBE_INTERVAL), + TCP_RTO_MAX); } } diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 70e108e15c7..327770bf552 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -16,9 +16,10 @@ #define TCP_SCALABLE_AI_CNT 50U #define TCP_SCALABLE_MD_SCALE 3 -static void tcp_scalable_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, +static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, int flag) { + struct tcp_sock *tp = tcp_sk(sk); if (in_flight < tp->snd_cwnd) return; @@ -35,8 +36,9 @@ static void tcp_scalable_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, tp->snd_cwnd_stamp = tcp_time_stamp; } -static u32 tcp_scalable_ssthresh(struct tcp_sock *tp) +static u32 tcp_scalable_ssthresh(struct sock *sk) { + const struct tcp_sock *tp = tcp_sk(sk); return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U); } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 0084227438c..415ee47ac1c 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -36,49 +36,13 @@ static void tcp_write_timer(unsigned long); static void tcp_delack_timer(unsigned long); static void tcp_keepalive_timer (unsigned long data); -#ifdef TCP_DEBUG -const char tcp_timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n"; -EXPORT_SYMBOL(tcp_timer_bug_msg); -#endif - -/* - * Using different timers for retransmit, delayed acks and probes - * We may wish use just one timer maintaining a list of expire jiffies - * to optimize. - */ - void tcp_init_xmit_timers(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); - - init_timer(&tp->retransmit_timer); - tp->retransmit_timer.function=&tcp_write_timer; - tp->retransmit_timer.data = (unsigned long) sk; - tp->pending = 0; - - init_timer(&tp->delack_timer); - tp->delack_timer.function=&tcp_delack_timer; - tp->delack_timer.data = (unsigned long) sk; - tp->ack.pending = 0; - - init_timer(&sk->sk_timer); - sk->sk_timer.function = &tcp_keepalive_timer; - sk->sk_timer.data = (unsigned long)sk; + inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, + &tcp_keepalive_timer); } -void tcp_clear_xmit_timers(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - tp->pending = 0; - sk_stop_timer(sk, &tp->retransmit_timer); - - tp->ack.pending = 0; - tp->ack.blocked = 0; - sk_stop_timer(sk, &tp->delack_timer); - - sk_stop_timer(sk, &sk->sk_timer); -} +EXPORT_SYMBOL(tcp_init_xmit_timers); static void tcp_write_err(struct sock *sk) { @@ -155,15 +119,15 @@ static int tcp_orphan_retries(struct sock *sk, int alive) /* A write timeout has occurred. Process the after effects. */ static int tcp_write_timeout(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); int retry_until; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { - if (tp->retransmits) + if (icsk->icsk_retransmits) dst_negative_advice(&sk->sk_dst_cache); - retry_until = tp->syn_retries ? : sysctl_tcp_syn_retries; + retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; } else { - if (tp->retransmits >= sysctl_tcp_retries1) { + if (icsk->icsk_retransmits >= sysctl_tcp_retries1) { /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black hole detection. :-( @@ -189,16 +153,16 @@ static int tcp_write_timeout(struct sock *sk) retry_until = sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { - int alive = (tp->rto < TCP_RTO_MAX); + const int alive = (icsk->icsk_rto < TCP_RTO_MAX); retry_until = tcp_orphan_retries(sk, alive); - if (tcp_out_of_resources(sk, alive || tp->retransmits < retry_until)) + if (tcp_out_of_resources(sk, alive || icsk->icsk_retransmits < retry_until)) return 1; } } - if (tp->retransmits >= retry_until) { + if (icsk->icsk_retransmits >= retry_until) { /* Has it gone just too far? */ tcp_write_err(sk); return 1; @@ -210,26 +174,27 @@ static void tcp_delack_timer(unsigned long data) { struct sock *sk = (struct sock*)data; struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); bh_lock_sock(sk); if (sock_owned_by_user(sk)) { /* Try again later. */ - tp->ack.blocked = 1; + icsk->icsk_ack.blocked = 1; NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED); - sk_reset_timer(sk, &tp->delack_timer, jiffies + TCP_DELACK_MIN); + sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN); goto out_unlock; } sk_stream_mem_reclaim(sk); - if (sk->sk_state == TCP_CLOSE || !(tp->ack.pending & TCP_ACK_TIMER)) + if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) goto out; - if (time_after(tp->ack.timeout, jiffies)) { - sk_reset_timer(sk, &tp->delack_timer, tp->ack.timeout); + if (time_after(icsk->icsk_ack.timeout, jiffies)) { + sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout); goto out; } - tp->ack.pending &= ~TCP_ACK_TIMER; + icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER; if (!skb_queue_empty(&tp->ucopy.prequeue)) { struct sk_buff *skb; @@ -242,16 +207,16 @@ static void tcp_delack_timer(unsigned long data) tp->ucopy.memory = 0; } - if (tcp_ack_scheduled(tp)) { - if (!tp->ack.pingpong) { + if (inet_csk_ack_scheduled(sk)) { + if (!icsk->icsk_ack.pingpong) { /* Delayed ACK missed: inflate ATO. */ - tp->ack.ato = min(tp->ack.ato << 1, tp->rto); + icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto); } else { /* Delayed ACK missed: leave pingpong mode and * deflate ATO. */ - tp->ack.pingpong = 0; - tp->ack.ato = TCP_ATO_MIN; + icsk->icsk_ack.pingpong = 0; + icsk->icsk_ack.ato = TCP_ATO_MIN; } tcp_send_ack(sk); NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS); @@ -268,11 +233,12 @@ out_unlock: static void tcp_probe_timer(struct sock *sk) { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int max_probes; if (tp->packets_out || !sk->sk_send_head) { - tp->probes_out = 0; + icsk->icsk_probes_out = 0; return; } @@ -283,7 +249,7 @@ static void tcp_probe_timer(struct sock *sk) * FIXME: We ought not to do it, Solaris 2.5 actually has fixing * this behaviour in Solaris down as a bug fix. [AC] * - * Let me to explain. probes_out is zeroed by incoming ACKs + * Let me to explain. icsk_probes_out is zeroed by incoming ACKs * even if they advertise zero window. Hence, connection is killed only * if we received no ACKs for normal connection timeout. It is not killed * only because window stays zero for some time, window may be zero @@ -294,15 +260,15 @@ static void tcp_probe_timer(struct sock *sk) max_probes = sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { - int alive = ((tp->rto<<tp->backoff) < TCP_RTO_MAX); + const int alive = ((icsk->icsk_rto << icsk->icsk_backoff) < TCP_RTO_MAX); max_probes = tcp_orphan_retries(sk, alive); - if (tcp_out_of_resources(sk, alive || tp->probes_out <= max_probes)) + if (tcp_out_of_resources(sk, alive || icsk->icsk_probes_out <= max_probes)) return; } - if (tp->probes_out > max_probes) { + if (icsk->icsk_probes_out > max_probes) { tcp_write_err(sk); } else { /* Only send another probe if we didn't close things up. */ @@ -317,6 +283,7 @@ static void tcp_probe_timer(struct sock *sk) static void tcp_retransmit_timer(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); if (!tp->packets_out) goto out; @@ -351,20 +318,21 @@ static void tcp_retransmit_timer(struct sock *sk) if (tcp_write_timeout(sk)) goto out; - if (tp->retransmits == 0) { - if (tp->ca_state == TCP_CA_Disorder || tp->ca_state == TCP_CA_Recovery) { + if (icsk->icsk_retransmits == 0) { + if (icsk->icsk_ca_state == TCP_CA_Disorder || + icsk->icsk_ca_state == TCP_CA_Recovery) { if (tp->rx_opt.sack_ok) { - if (tp->ca_state == TCP_CA_Recovery) + if (icsk->icsk_ca_state == TCP_CA_Recovery) NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL); else NET_INC_STATS_BH(LINUX_MIB_TCPSACKFAILURES); } else { - if (tp->ca_state == TCP_CA_Recovery) + if (icsk->icsk_ca_state == TCP_CA_Recovery) NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERYFAIL); else NET_INC_STATS_BH(LINUX_MIB_TCPRENOFAILURES); } - } else if (tp->ca_state == TCP_CA_Loss) { + } else if (icsk->icsk_ca_state == TCP_CA_Loss) { NET_INC_STATS_BH(LINUX_MIB_TCPLOSSFAILURES); } else { NET_INC_STATS_BH(LINUX_MIB_TCPTIMEOUTS); @@ -381,10 +349,11 @@ static void tcp_retransmit_timer(struct sock *sk) /* Retransmission failed because of local congestion, * do not backoff. */ - if (!tp->retransmits) - tp->retransmits=1; - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, - min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL)); + if (!icsk->icsk_retransmits) + icsk->icsk_retransmits = 1; + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL), + TCP_RTO_MAX); goto out; } @@ -403,13 +372,13 @@ static void tcp_retransmit_timer(struct sock *sk) * implemented ftp to mars will work nicely. We will have to fix * the 120 second clamps though! */ - tp->backoff++; - tp->retransmits++; + icsk->icsk_backoff++; + icsk->icsk_retransmits++; out_reset_timer: - tp->rto = min(tp->rto << 1, TCP_RTO_MAX); - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); - if (tp->retransmits > sysctl_tcp_retries1) + icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); + if (icsk->icsk_retransmits > sysctl_tcp_retries1) __sk_dst_reset(sk); out:; @@ -418,32 +387,32 @@ out:; static void tcp_write_timer(unsigned long data) { struct sock *sk = (struct sock*)data; - struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); int event; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { /* Try again later */ - sk_reset_timer(sk, &tp->retransmit_timer, jiffies + (HZ / 20)); + sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20)); goto out_unlock; } - if (sk->sk_state == TCP_CLOSE || !tp->pending) + if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending) goto out; - if (time_after(tp->timeout, jiffies)) { - sk_reset_timer(sk, &tp->retransmit_timer, tp->timeout); + if (time_after(icsk->icsk_timeout, jiffies)) { + sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); goto out; } - event = tp->pending; - tp->pending = 0; + event = icsk->icsk_pending; + icsk->icsk_pending = 0; switch (event) { - case TCP_TIME_RETRANS: + case ICSK_TIME_RETRANS: tcp_retransmit_timer(sk); break; - case TCP_TIME_PROBE0: + case ICSK_TIME_PROBE0: tcp_probe_timer(sk); break; } @@ -462,96 +431,8 @@ out_unlock: static void tcp_synack_timer(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); - struct listen_sock *lopt = tp->accept_queue.listen_opt; - int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries; - int thresh = max_retries; - unsigned long now = jiffies; - struct request_sock **reqp, *req; - int i, budget; - - if (lopt == NULL || lopt->qlen == 0) - return; - - /* Normally all the openreqs are young and become mature - * (i.e. converted to established socket) for first timeout. - * If synack was not acknowledged for 3 seconds, it means - * one of the following things: synack was lost, ack was lost, - * rtt is high or nobody planned to ack (i.e. synflood). - * When server is a bit loaded, queue is populated with old - * open requests, reducing effective size of queue. - * When server is well loaded, queue size reduces to zero - * after several minutes of work. It is not synflood, - * it is normal operation. The solution is pruning - * too old entries overriding normal timeout, when - * situation becomes dangerous. - * - * Essentially, we reserve half of room for young - * embrions; and abort old ones without pity, if old - * ones are about to clog our table. - */ - if (lopt->qlen>>(lopt->max_qlen_log-1)) { - int young = (lopt->qlen_young<<1); - - while (thresh > 2) { - if (lopt->qlen < young) - break; - thresh--; - young <<= 1; - } - } - - if (tp->defer_accept) - max_retries = tp->defer_accept; - - budget = 2*(TCP_SYNQ_HSIZE/(TCP_TIMEOUT_INIT/TCP_SYNQ_INTERVAL)); - i = lopt->clock_hand; - - do { - reqp=&lopt->syn_table[i]; - while ((req = *reqp) != NULL) { - if (time_after_eq(now, req->expires)) { - if ((req->retrans < thresh || - (inet_rsk(req)->acked && req->retrans < max_retries)) - && !req->rsk_ops->rtx_syn_ack(sk, req, NULL)) { - unsigned long timeo; - - if (req->retrans++ == 0) - lopt->qlen_young--; - timeo = min((TCP_TIMEOUT_INIT << req->retrans), - TCP_RTO_MAX); - req->expires = now + timeo; - reqp = &req->dl_next; - continue; - } - - /* Drop this request */ - tcp_synq_unlink(tp, req, reqp); - reqsk_queue_removed(&tp->accept_queue, req); - reqsk_free(req); - continue; - } - reqp = &req->dl_next; - } - - i = (i+1)&(TCP_SYNQ_HSIZE-1); - - } while (--budget > 0); - - lopt->clock_hand = i; - - if (lopt->qlen) - tcp_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL); -} - -void tcp_delete_keepalive_timer (struct sock *sk) -{ - sk_stop_timer(sk, &sk->sk_timer); -} - -void tcp_reset_keepalive_timer (struct sock *sk, unsigned long len) -{ - sk_reset_timer(sk, &sk->sk_timer, jiffies + len); + inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL, + TCP_TIMEOUT_INIT, TCP_RTO_MAX); } void tcp_set_keepalive(struct sock *sk, int val) @@ -560,15 +441,16 @@ void tcp_set_keepalive(struct sock *sk, int val) return; if (val && !sock_flag(sk, SOCK_KEEPOPEN)) - tcp_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk))); + inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk))); else if (!val) - tcp_delete_keepalive_timer(sk); + inet_csk_delete_keepalive_timer(sk); } static void tcp_keepalive_timer (unsigned long data) { struct sock *sk = (struct sock *) data; + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); __u32 elapsed; @@ -576,7 +458,7 @@ static void tcp_keepalive_timer (unsigned long data) bh_lock_sock(sk); if (sock_owned_by_user(sk)) { /* Try again later. */ - tcp_reset_keepalive_timer (sk, HZ/20); + inet_csk_reset_keepalive_timer (sk, HZ/20); goto out; } @@ -587,7 +469,7 @@ static void tcp_keepalive_timer (unsigned long data) if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) { if (tp->linger2 >= 0) { - int tmo = tcp_fin_time(tp) - TCP_TIMEWAIT_LEN; + const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN; if (tmo > 0) { tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); @@ -610,14 +492,14 @@ static void tcp_keepalive_timer (unsigned long data) elapsed = tcp_time_stamp - tp->rcv_tstamp; if (elapsed >= keepalive_time_when(tp)) { - if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) || - (tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) { + if ((!tp->keepalive_probes && icsk->icsk_probes_out >= sysctl_tcp_keepalive_probes) || + (tp->keepalive_probes && icsk->icsk_probes_out >= tp->keepalive_probes)) { tcp_send_active_reset(sk, GFP_ATOMIC); tcp_write_err(sk); goto out; } if (tcp_write_wakeup(sk) <= 0) { - tp->probes_out++; + icsk->icsk_probes_out++; elapsed = keepalive_intvl_when(tp); } else { /* If keepalive was lost due to local congestion, @@ -634,7 +516,7 @@ static void tcp_keepalive_timer (unsigned long data) sk_stream_mem_reclaim(sk); resched: - tcp_reset_keepalive_timer (sk, elapsed); + inet_csk_reset_keepalive_timer (sk, elapsed); goto out; death: @@ -644,8 +526,3 @@ out: bh_unlock_sock(sk); sock_put(sk); } - -EXPORT_SYMBOL(tcp_clear_xmit_timers); -EXPORT_SYMBOL(tcp_delete_keepalive_timer); -EXPORT_SYMBOL(tcp_init_xmit_timers); -EXPORT_SYMBOL(tcp_reset_keepalive_timer); diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 9bd443db519..93c5f92070f 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -35,7 +35,7 @@ #include <linux/mm.h> #include <linux/module.h> #include <linux/skbuff.h> -#include <linux/tcp_diag.h> +#include <linux/inet_diag.h> #include <net/tcp.h> @@ -82,9 +82,10 @@ struct vegas { * Instead we must wait until the completion of an RTT during * which we actually receive ACKs. */ -static inline void vegas_enable(struct tcp_sock *tp) +static inline void vegas_enable(struct sock *sk) { - struct vegas *vegas = tcp_ca(tp); + const struct tcp_sock *tp = tcp_sk(sk); + struct vegas *vegas = inet_csk_ca(sk); /* Begin taking Vegas samples next time we send something. */ vegas->doing_vegas_now = 1; @@ -97,19 +98,19 @@ static inline void vegas_enable(struct tcp_sock *tp) } /* Stop taking Vegas samples for now. */ -static inline void vegas_disable(struct tcp_sock *tp) +static inline void vegas_disable(struct sock *sk) { - struct vegas *vegas = tcp_ca(tp); + struct vegas *vegas = inet_csk_ca(sk); vegas->doing_vegas_now = 0; } -static void tcp_vegas_init(struct tcp_sock *tp) +static void tcp_vegas_init(struct sock *sk) { - struct vegas *vegas = tcp_ca(tp); + struct vegas *vegas = inet_csk_ca(sk); vegas->baseRTT = 0x7fffffff; - vegas_enable(tp); + vegas_enable(sk); } /* Do RTT sampling needed for Vegas. @@ -120,9 +121,9 @@ static void tcp_vegas_init(struct tcp_sock *tp) * o min-filter RTT samples from a much longer window (forever for now) * to find the propagation delay (baseRTT) */ -static void tcp_vegas_rtt_calc(struct tcp_sock *tp, u32 usrtt) +static void tcp_vegas_rtt_calc(struct sock *sk, u32 usrtt) { - struct vegas *vegas = tcp_ca(tp); + struct vegas *vegas = inet_csk_ca(sk); u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */ /* Filter to find propagation delay: */ @@ -136,13 +137,13 @@ static void tcp_vegas_rtt_calc(struct tcp_sock *tp, u32 usrtt) vegas->cntRTT++; } -static void tcp_vegas_state(struct tcp_sock *tp, u8 ca_state) +static void tcp_vegas_state(struct sock *sk, u8 ca_state) { if (ca_state == TCP_CA_Open) - vegas_enable(tp); + vegas_enable(sk); else - vegas_disable(tp); + vegas_disable(sk); } /* @@ -154,20 +155,21 @@ static void tcp_vegas_state(struct tcp_sock *tp, u8 ca_state) * packets, _then_ we can make Vegas calculations * again. */ -static void tcp_vegas_cwnd_event(struct tcp_sock *tp, enum tcp_ca_event event) +static void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event) { if (event == CA_EVENT_CWND_RESTART || event == CA_EVENT_TX_START) - tcp_vegas_init(tp); + tcp_vegas_init(sk); } -static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack, +static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 seq_rtt, u32 in_flight, int flag) { - struct vegas *vegas = tcp_ca(tp); + struct tcp_sock *tp = tcp_sk(sk); + struct vegas *vegas = inet_csk_ca(sk); if (!vegas->doing_vegas_now) - return tcp_reno_cong_avoid(tp, ack, seq_rtt, in_flight, flag); + return tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag); /* The key players are v_beg_snd_una and v_beg_snd_nxt. * @@ -219,7 +221,7 @@ static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack, * but that's not too awful, since we're taking the min, * rather than averaging. */ - tcp_vegas_rtt_calc(tp, seq_rtt*1000); + tcp_vegas_rtt_calc(sk, seq_rtt * 1000); /* We do the Vegas calculations only if we got enough RTT * samples that we can be reasonably sure that we got @@ -359,14 +361,14 @@ static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack, } /* Extract info for Tcp socket info provided via netlink. */ -static void tcp_vegas_get_info(struct tcp_sock *tp, u32 ext, +static void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) { - const struct vegas *ca = tcp_ca(tp); - if (ext & (1<<(TCPDIAG_VEGASINFO-1))) { + const struct vegas *ca = inet_csk_ca(sk); + if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { struct tcpvegas_info *info; - info = RTA_DATA(__RTA_PUT(skb, TCPDIAG_VEGASINFO, + info = RTA_DATA(__RTA_PUT(skb, INET_DIAG_VEGASINFO, sizeof(*info))); info->tcpv_enabled = ca->doing_vegas_now; @@ -393,7 +395,7 @@ static struct tcp_congestion_ops tcp_vegas = { static int __init tcp_vegas_register(void) { - BUG_ON(sizeof(struct vegas) > TCP_CA_PRIV_SIZE); + BUG_ON(sizeof(struct vegas) > ICSK_CA_PRIV_SIZE); tcp_register_congestion_control(&tcp_vegas); return 0; } diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index ef827242c94..0c340c3756c 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -8,7 +8,7 @@ #include <linux/mm.h> #include <linux/module.h> #include <linux/skbuff.h> -#include <linux/tcp_diag.h> +#include <linux/inet_diag.h> #include <net/tcp.h> /* TCP Westwood structure */ @@ -40,9 +40,9 @@ struct westwood { * way as soon as possible. It will reasonably happen within the first * RTT period of the connection lifetime. */ -static void tcp_westwood_init(struct tcp_sock *tp) +static void tcp_westwood_init(struct sock *sk) { - struct westwood *w = tcp_ca(tp); + struct westwood *w = inet_csk_ca(sk); w->bk = 0; w->bw_ns_est = 0; @@ -51,7 +51,7 @@ static void tcp_westwood_init(struct tcp_sock *tp) w->cumul_ack = 0; w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT; w->rtt_win_sx = tcp_time_stamp; - w->snd_una = tp->snd_una; + w->snd_una = tcp_sk(sk)->snd_una; } /* @@ -74,11 +74,11 @@ static inline void westwood_filter(struct westwood *w, u32 delta) * Called after processing group of packets. * but all westwood needs is the last sample of srtt. */ -static void tcp_westwood_pkts_acked(struct tcp_sock *tp, u32 cnt) +static void tcp_westwood_pkts_acked(struct sock *sk, u32 cnt) { - struct westwood *w = tcp_ca(tp); + struct westwood *w = inet_csk_ca(sk); if (cnt > 0) - w->rtt = tp->srtt >> 3; + w->rtt = tcp_sk(sk)->srtt >> 3; } /* @@ -86,9 +86,9 @@ static void tcp_westwood_pkts_acked(struct tcp_sock *tp, u32 cnt) * It updates RTT evaluation window if it is the right moment to do * it. If so it calls filter for evaluating bandwidth. */ -static void westwood_update_window(struct tcp_sock *tp) +static void westwood_update_window(struct sock *sk) { - struct westwood *w = tcp_ca(tp); + struct westwood *w = inet_csk_ca(sk); s32 delta = tcp_time_stamp - w->rtt_win_sx; /* @@ -114,11 +114,12 @@ static void westwood_update_window(struct tcp_sock *tp) * header prediction is successful. In such case in fact update is * straight forward and doesn't need any particular care. */ -static inline void westwood_fast_bw(struct tcp_sock *tp) +static inline void westwood_fast_bw(struct sock *sk) { - struct westwood *w = tcp_ca(tp); + const struct tcp_sock *tp = tcp_sk(sk); + struct westwood *w = inet_csk_ca(sk); - westwood_update_window(tp); + westwood_update_window(sk); w->bk += tp->snd_una - w->snd_una; w->snd_una = tp->snd_una; @@ -130,9 +131,10 @@ static inline void westwood_fast_bw(struct tcp_sock *tp) * This function evaluates cumul_ack for evaluating bk in case of * delayed or partial acks. */ -static inline u32 westwood_acked_count(struct tcp_sock *tp) +static inline u32 westwood_acked_count(struct sock *sk) { - struct westwood *w = tcp_ca(tp); + const struct tcp_sock *tp = tcp_sk(sk); + struct westwood *w = inet_csk_ca(sk); w->cumul_ack = tp->snd_una - w->snd_una; @@ -160,9 +162,10 @@ static inline u32 westwood_acked_count(struct tcp_sock *tp) return w->cumul_ack; } -static inline u32 westwood_bw_rttmin(const struct tcp_sock *tp) +static inline u32 westwood_bw_rttmin(const struct sock *sk) { - struct westwood *w = tcp_ca(tp); + const struct tcp_sock *tp = tcp_sk(sk); + const struct westwood *w = inet_csk_ca(sk); return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); } @@ -172,31 +175,32 @@ static inline u32 westwood_bw_rttmin(const struct tcp_sock *tp) * in packets we use mss_cache). Rttmin is guaranteed to be >= 2 * so avoids ever returning 0. */ -static u32 tcp_westwood_cwnd_min(struct tcp_sock *tp) +static u32 tcp_westwood_cwnd_min(struct sock *sk) { - return westwood_bw_rttmin(tp); + return westwood_bw_rttmin(sk); } -static void tcp_westwood_event(struct tcp_sock *tp, enum tcp_ca_event event) +static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) { - struct westwood *w = tcp_ca(tp); + struct tcp_sock *tp = tcp_sk(sk); + struct westwood *w = inet_csk_ca(sk); switch(event) { case CA_EVENT_FAST_ACK: - westwood_fast_bw(tp); + westwood_fast_bw(sk); break; case CA_EVENT_COMPLETE_CWR: - tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(tp); + tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(sk); break; case CA_EVENT_FRTO: - tp->snd_ssthresh = westwood_bw_rttmin(tp); + tp->snd_ssthresh = westwood_bw_rttmin(sk); break; case CA_EVENT_SLOW_ACK: - westwood_update_window(tp); - w->bk += westwood_acked_count(tp); + westwood_update_window(sk); + w->bk += westwood_acked_count(sk); w->rtt_min = min(w->rtt, w->rtt_min); break; @@ -208,15 +212,15 @@ static void tcp_westwood_event(struct tcp_sock *tp, enum tcp_ca_event event) /* Extract info for Tcp socket info provided via netlink. */ -static void tcp_westwood_info(struct tcp_sock *tp, u32 ext, +static void tcp_westwood_info(struct sock *sk, u32 ext, struct sk_buff *skb) { - const struct westwood *ca = tcp_ca(tp); - if (ext & (1<<(TCPDIAG_VEGASINFO-1))) { + const struct westwood *ca = inet_csk_ca(sk); + if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { struct rtattr *rta; struct tcpvegas_info *info; - rta = __RTA_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*info)); + rta = __RTA_PUT(skb, INET_DIAG_VEGASINFO, sizeof(*info)); info = RTA_DATA(rta); info->tcpv_enabled = 1; info->tcpv_rttcnt = 0; @@ -242,7 +246,7 @@ static struct tcp_congestion_ops tcp_westwood = { static int __init tcp_westwood_register(void) { - BUG_ON(sizeof(struct westwood) > TCP_CA_PRIV_SIZE); + BUG_ON(sizeof(struct westwood) > ICSK_CA_PRIV_SIZE); return tcp_register_congestion_control(&tcp_westwood); } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index dc4d07357e3..e5beca7de86 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -95,7 +95,8 @@ #include <linux/ipv6.h> #include <linux/netdevice.h> #include <net/snmp.h> -#include <net/tcp.h> +#include <net/ip.h> +#include <net/tcp_states.h> #include <net/protocol.h> #include <linux/skbuff.h> #include <linux/proc_fs.h> @@ -112,7 +113,7 @@ * Snmp MIB for the UDP layer */ -DEFINE_SNMP_STAT(struct udp_mib, udp_statistics); +DEFINE_SNMP_STAT(struct udp_mib, udp_statistics) __read_mostly; struct hlist_head udp_hash[UDP_HTABLE_SIZE]; DEFINE_RWLOCK(udp_hash_lock); @@ -628,7 +629,7 @@ back_from_confirm: /* ... which is an evident application bug. --ANK */ release_sock(sk); - LIMIT_NETDEBUG(printk(KERN_DEBUG "udp cork app bug 2\n")); + LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 2\n"); err = -EINVAL; goto out; } @@ -693,7 +694,7 @@ static int udp_sendpage(struct sock *sk, struct page *page, int offset, if (unlikely(!up->pending)) { release_sock(sk); - LIMIT_NETDEBUG(printk(KERN_DEBUG "udp cork app bug 3\n")); + LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 3\n"); return -EINVAL; } @@ -1102,7 +1103,7 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, skb->ip_summed = CHECKSUM_UNNECESSARY; if (!udp_check(uh, ulen, saddr, daddr, skb->csum)) return 0; - LIMIT_NETDEBUG(printk(KERN_DEBUG "udp v4 hw csum failure.\n")); + LIMIT_NETDEBUG(KERN_DEBUG "udp v4 hw csum failure.\n"); skb->ip_summed = CHECKSUM_NONE; } if (skb->ip_summed != CHECKSUM_UNNECESSARY) @@ -1181,13 +1182,13 @@ int udp_rcv(struct sk_buff *skb) return(0); short_packet: - LIMIT_NETDEBUG(printk(KERN_DEBUG "UDP: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n", - NIPQUAD(saddr), - ntohs(uh->source), - ulen, - len, - NIPQUAD(daddr), - ntohs(uh->dest))); + LIMIT_NETDEBUG(KERN_DEBUG "UDP: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n", + NIPQUAD(saddr), + ntohs(uh->source), + ulen, + len, + NIPQUAD(daddr), + ntohs(uh->dest)); no_header: UDP_INC_STATS_BH(UDP_MIB_INERRORS); kfree_skb(skb); @@ -1198,12 +1199,12 @@ csum_error: * RFC1122: OK. Discards the bad packet silently (as far as * the network is concerned, anyway) as per 4.1.3.4 (MUST). */ - LIMIT_NETDEBUG(printk(KERN_DEBUG "UDP: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n", - NIPQUAD(saddr), - ntohs(uh->source), - NIPQUAD(daddr), - ntohs(uh->dest), - ulen)); + LIMIT_NETDEBUG(KERN_DEBUG "UDP: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n", + NIPQUAD(saddr), + ntohs(uh->source), + NIPQUAD(daddr), + ntohs(uh->dest), + ulen); drop: UDP_INC_STATS_BH(UDP_MIB_INERRORS); kfree_skb(skb); diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index 050611d7a96..d23e07fc81f 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -128,8 +128,10 @@ void __init xfrm4_state_init(void) xfrm_state_register_afinfo(&xfrm4_state_afinfo); } +#if 0 void __exit xfrm4_state_fini(void) { xfrm_state_unregister_afinfo(&xfrm4_state_afinfo); } +#endif /* 0 */ diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index b39e0494059..6460eec834b 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -8,7 +8,7 @@ ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o sit.o \ route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o raw.o \ protocol.o icmp.o mcast.o reassembly.o tcp_ipv6.o \ exthdrs.o sysctl_net_ipv6.o datagram.o proc.o \ - ip6_flowlabel.o ipv6_syms.o + ip6_flowlabel.o ipv6_syms.o netfilter.o ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \ xfrm6_output.o @@ -23,3 +23,5 @@ obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o obj-y += exthdrs_core.o + +obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 77004b9456c..937ad32db77 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1041,9 +1041,9 @@ int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) const struct in6_addr *sk_rcv_saddr6 = &inet6_sk(sk)->rcv_saddr; const struct in6_addr *sk2_rcv_saddr6 = tcp_v6_rcv_saddr(sk2); u32 sk_rcv_saddr = inet_sk(sk)->rcv_saddr; - u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2); + u32 sk2_rcv_saddr = inet_rcv_saddr(sk2); int sk_ipv6only = ipv6_only_sock(sk); - int sk2_ipv6only = tcp_v6_ipv6only(sk2); + int sk2_ipv6only = inet_v6_ipv6only(sk2); int addr_type = ipv6_addr_type(sk_rcv_saddr6); int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; @@ -1126,7 +1126,7 @@ void addrconf_leave_solict(struct inet6_dev *idev, struct in6_addr *addr) __ipv6_dev_mc_dec(idev, &maddr); } -void addrconf_join_anycast(struct inet6_ifaddr *ifp) +static void addrconf_join_anycast(struct inet6_ifaddr *ifp) { struct in6_addr addr; ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len); @@ -1135,7 +1135,7 @@ void addrconf_join_anycast(struct inet6_ifaddr *ifp) ipv6_dev_ac_inc(ifp->idev->dev, &addr); } -void addrconf_leave_anycast(struct inet6_ifaddr *ifp) +static void addrconf_leave_anycast(struct inet6_ifaddr *ifp) { struct in6_addr addr; ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len); @@ -2858,16 +2858,16 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) skb = alloc_skb(size, GFP_ATOMIC); if (!skb) { - netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFADDR, ENOBUFS); + netlink_set_err(rtnl, 0, RTNLGRP_IPV6_IFADDR, ENOBUFS); return; } if (inet6_fill_ifaddr(skb, ifa, current->pid, 0, event, 0) < 0) { kfree_skb(skb); - netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFADDR, EINVAL); + netlink_set_err(rtnl, 0, RTNLGRP_IPV6_IFADDR, EINVAL); return; } - NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_IFADDR; - netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_IFADDR, GFP_ATOMIC); + NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_IFADDR; + netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_IFADDR, GFP_ATOMIC); } static void inline ipv6_store_devconf(struct ipv6_devconf *cnf, @@ -2994,16 +2994,16 @@ void inet6_ifinfo_notify(int event, struct inet6_dev *idev) skb = alloc_skb(size, GFP_ATOMIC); if (!skb) { - netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFINFO, ENOBUFS); + netlink_set_err(rtnl, 0, RTNLGRP_IPV6_IFINFO, ENOBUFS); return; } if (inet6_fill_ifinfo(skb, idev, current->pid, 0, event, 0) < 0) { kfree_skb(skb); - netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFINFO, EINVAL); + netlink_set_err(rtnl, 0, RTNLGRP_IPV6_IFINFO, EINVAL); return; } - NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_IFINFO; - netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_IFINFO, GFP_ATOMIC); + NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_IFINFO; + netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_IFINFO, GFP_ATOMIC); } static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev, @@ -3054,16 +3054,16 @@ static void inet6_prefix_notify(int event, struct inet6_dev *idev, skb = alloc_skb(size, GFP_ATOMIC); if (!skb) { - netlink_set_err(rtnl, 0, RTMGRP_IPV6_PREFIX, ENOBUFS); + netlink_set_err(rtnl, 0, RTNLGRP_IPV6_PREFIX, ENOBUFS); return; } if (inet6_fill_prefix(skb, idev, pinfo, current->pid, 0, event, 0) < 0) { kfree_skb(skb); - netlink_set_err(rtnl, 0, RTMGRP_IPV6_PREFIX, EINVAL); + netlink_set_err(rtnl, 0, RTNLGRP_IPV6_PREFIX, EINVAL); return; } - NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_PREFIX; - netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_PREFIX, GFP_ATOMIC); + NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_PREFIX; + netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_PREFIX, GFP_ATOMIC); } static struct rtnetlink_link inet6_rtnetlink_table[RTM_NR_MSGTYPES] = { diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 28d9bcab097..4f8795af2ed 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -44,6 +44,7 @@ #include <linux/netdevice.h> #include <linux/icmpv6.h> #include <linux/smp_lock.h> +#include <linux/netfilter_ipv6.h> #include <net/ip.h> #include <net/ipv6.h> @@ -66,45 +67,14 @@ MODULE_AUTHOR("Cast of dozens"); MODULE_DESCRIPTION("IPv6 protocol stack for Linux"); MODULE_LICENSE("GPL"); -/* IPv6 procfs goodies... */ - -#ifdef CONFIG_PROC_FS -extern int raw6_proc_init(void); -extern void raw6_proc_exit(void); -extern int tcp6_proc_init(void); -extern void tcp6_proc_exit(void); -extern int udp6_proc_init(void); -extern void udp6_proc_exit(void); -extern int ipv6_misc_proc_init(void); -extern void ipv6_misc_proc_exit(void); -extern int ac6_proc_init(void); -extern void ac6_proc_exit(void); -extern int if6_proc_init(void); -extern void if6_proc_exit(void); -#endif - int sysctl_ipv6_bindv6only; -#ifdef INET_REFCNT_DEBUG -atomic_t inet6_sock_nr; -EXPORT_SYMBOL(inet6_sock_nr); -#endif - /* The inetsw table contains everything that inet_create needs to * build a new socket. */ static struct list_head inetsw6[SOCK_MAX]; static DEFINE_SPINLOCK(inetsw6_lock); -static void inet6_sock_destruct(struct sock *sk) -{ - inet_sock_destruct(sk); - -#ifdef INET_REFCNT_DEBUG - atomic_dec(&inet6_sock_nr); -#endif -} - static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk) { const int offset = sk->sk_prot->obj_size - sizeof(struct ipv6_pinfo); @@ -185,7 +155,7 @@ static int inet6_create(struct socket *sock, int protocol) inet->hdrincl = 1; } - sk->sk_destruct = inet6_sock_destruct; + sk->sk_destruct = inet_sock_destruct; sk->sk_family = PF_INET6; sk->sk_protocol = protocol; @@ -212,12 +182,17 @@ static int inet6_create(struct socket *sock, int protocol) inet->pmtudisc = IP_PMTUDISC_DONT; else inet->pmtudisc = IP_PMTUDISC_WANT; + /* + * Increment only the relevant sk_prot->socks debug field, this changes + * the previous behaviour of incrementing both the equivalent to + * answer->prot->socks (inet6_sock_nr) and inet_sock_nr. + * + * This allows better debug granularity as we'll know exactly how many + * UDPv6, TCPv6, etc socks were allocated, not the sum of all IPv6 + * transport protocol socks. -acme + */ + sk_refcnt_debug_inc(sk); - -#ifdef INET_REFCNT_DEBUG - atomic_inc(&inet6_sock_nr); - atomic_inc(&inet_sock_nr); -#endif if (inet->num) { /* It assumes that any protocol which allows * the user to assign a number at socket @@ -513,11 +488,6 @@ static struct net_proto_family inet6_family_ops = { .owner = THIS_MODULE, }; -#ifdef CONFIG_SYSCTL -extern void ipv6_sysctl_register(void); -extern void ipv6_sysctl_unregister(void); -#endif - /* Same as inet6_dgram_ops, sans udp_poll. */ static struct proto_ops inet6_sockraw_ops = { .family = PF_INET6, @@ -684,8 +654,6 @@ static void cleanup_ipv6_mibs(void) snmp6_mib_free((void **)udp_stats_in6); } -extern int ipv6_misc_proc_init(void); - static int __init inet6_init(void) { struct sk_buff *dummy_skb; @@ -757,6 +725,9 @@ static int __init inet6_init(void) err = igmp6_init(&inet6_family_ops); if (err) goto igmp_fail; + err = ipv6_netfilter_init(); + if (err) + goto netfilter_fail; /* Create /proc/foo6 entries. */ #ifdef CONFIG_PROC_FS err = -ENOMEM; @@ -813,6 +784,8 @@ proc_tcp6_fail: raw6_proc_exit(); proc_raw6_fail: #endif + ipv6_netfilter_fini(); +netfilter_fail: igmp6_cleanup(); igmp_fail: ndisc_cleanup(); @@ -852,6 +825,7 @@ static void __exit inet6_exit(void) ip6_route_cleanup(); ipv6_packet_cleanup(); igmp6_cleanup(); + ipv6_netfilter_fini(); ndisc_cleanup(); icmpv6_cleanup(); #ifdef CONFIG_SYSCTL diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 986fdfdccbc..0ebfad907a0 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -131,10 +131,10 @@ static int ipv6_clear_mutable_options(struct ipv6hdr *iph, int len) case NEXTHDR_HOP: case NEXTHDR_DEST: if (!zero_out_mutable_opts(exthdr.opth)) { - LIMIT_NETDEBUG(printk( + LIMIT_NETDEBUG( KERN_WARNING "overrun %sopts\n", nexthdr == NEXTHDR_HOP ? - "hop" : "dest")); + "hop" : "dest"); return -EINVAL; } break; @@ -293,8 +293,7 @@ static int ah6_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struc skb_push(skb, skb->data - skb->nh.raw); ahp->icv(ahp, skb, ah->auth_data); if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) { - LIMIT_NETDEBUG( - printk(KERN_WARNING "ipsec ah authentication error\n")); + LIMIT_NETDEBUG(KERN_WARNING "ipsec ah authentication error\n"); x->stats.integrity_failed++; goto free_out; } @@ -332,9 +331,9 @@ static void ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (!x) return; - NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/" - "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", - ntohl(ah->spi), NIP6(iph->daddr))); + NETDEBUG(KERN_DEBUG "pmtu discovery on SA AH/%08x/" + "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + ntohl(ah->spi), NIP6(iph->daddr)); xfrm_state_put(x); } diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 5229365cd8b..01468fab3d3 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -29,6 +29,7 @@ #include <net/addrconf.h> #include <net/transp_v6.h> #include <net/ip6_route.h> +#include <net/tcp_states.h> #include <linux/errqueue.h> #include <asm/uaccess.h> @@ -588,8 +589,8 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl, break; default: - LIMIT_NETDEBUG( - printk(KERN_DEBUG "invalid cmsg type: %d\n", cmsg->cmsg_type)); + LIMIT_NETDEBUG(KERN_DEBUG "invalid cmsg type: %d\n", + cmsg->cmsg_type); err = -EINVAL; break; }; diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 324db62515a..e8bff9d3d96 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -212,8 +212,7 @@ static int esp6_input(struct xfrm_state *x, struct xfrm_decap_state *decap, stru padlen = nexthdr[0]; if (padlen+2 >= elen) { - LIMIT_NETDEBUG( - printk(KERN_WARNING "ipsec esp packet is garbage padlen=%d, elen=%d\n", padlen+2, elen)); + LIMIT_NETDEBUG(KERN_WARNING "ipsec esp packet is garbage padlen=%d, elen=%d\n", padlen+2, elen); ret = -EINVAL; goto out; } diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index e0839eafc3a..5be6da2584e 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -424,8 +424,8 @@ static int ipv6_hop_ra(struct sk_buff *skb, int optoff) IP6CB(skb)->ra = optoff; return 1; } - LIMIT_NETDEBUG( - printk(KERN_DEBUG "ipv6_hop_ra: wrong RA length %d\n", skb->nh.raw[optoff+1])); + LIMIT_NETDEBUG(KERN_DEBUG "ipv6_hop_ra: wrong RA length %d\n", + skb->nh.raw[optoff+1]); kfree_skb(skb); return 0; } @@ -437,8 +437,8 @@ static int ipv6_hop_jumbo(struct sk_buff *skb, int optoff) u32 pkt_len; if (skb->nh.raw[optoff+1] != 4 || (optoff&3) != 2) { - LIMIT_NETDEBUG( - printk(KERN_DEBUG "ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n", skb->nh.raw[optoff+1])); + LIMIT_NETDEBUG(KERN_DEBUG "ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n", + skb->nh.raw[optoff+1]); IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); goto drop; } diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index ff3ec9822e3..5176fc655ea 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -67,7 +67,7 @@ #include <asm/uaccess.h> #include <asm/system.h> -DEFINE_SNMP_STAT(struct icmpv6_mib, icmpv6_statistics); +DEFINE_SNMP_STAT(struct icmpv6_mib, icmpv6_statistics) __read_mostly; /* * The ICMP socket(s). This is the most convenient way to flow control @@ -332,8 +332,7 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, * for now we don't know that. */ if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) { - LIMIT_NETDEBUG( - printk(KERN_DEBUG "icmpv6_send: addr_any/mcast source\n")); + LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: addr_any/mcast source\n"); return; } @@ -341,8 +340,7 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, * Never answer to a ICMP packet. */ if (is_ineligible(skb)) { - LIMIT_NETDEBUG( - printk(KERN_DEBUG "icmpv6_send: no reply to icmp error\n")); + LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: no reply to icmp error\n"); return; } @@ -393,8 +391,7 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, len = skb->len - msg.offset; len = min_t(unsigned int, len, IPV6_MIN_MTU - sizeof(struct ipv6hdr) -sizeof(struct icmp6hdr)); if (len < 0) { - LIMIT_NETDEBUG( - printk(KERN_DEBUG "icmp: len problem\n")); + LIMIT_NETDEBUG(KERN_DEBUG "icmp: len problem\n"); goto out_dst_release; } @@ -551,7 +548,8 @@ static void icmpv6_notify(struct sk_buff *skb, int type, int code, u32 info) read_lock(&raw_v6_lock); if ((sk = sk_head(&raw_v6_htable[hash])) != NULL) { - while((sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr))) { + while((sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr, + skb->dev->ifindex))) { rawv6_err(sk, skb, NULL, type, code, inner_offset, info); sk = sk_next(sk); } @@ -583,17 +581,15 @@ static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) skb->ip_summed = CHECKSUM_UNNECESSARY; if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, skb->csum)) { - LIMIT_NETDEBUG( - printk(KERN_DEBUG "ICMPv6 hw checksum failed\n")); + LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 hw checksum failed\n"); skb->ip_summed = CHECKSUM_NONE; } } if (skb->ip_summed == CHECKSUM_NONE) { if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, skb_checksum(skb, 0, skb->len, 0))) { - LIMIT_NETDEBUG( - printk(KERN_DEBUG "ICMPv6 checksum failed [%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x > %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x]\n", - NIP6(*saddr), NIP6(*daddr))); + LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 checksum failed [%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x > %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x]\n", + NIP6(*saddr), NIP6(*daddr)); goto discard_it; } } @@ -669,8 +665,7 @@ static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) break; default: - LIMIT_NETDEBUG( - printk(KERN_DEBUG "icmpv6: msg of unknown type\n")); + LIMIT_NETDEBUG(KERN_DEBUG "icmpv6: msg of unknown type\n"); /* informational */ if (type & ICMPV6_INFOMSG_MASK) diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c new file mode 100644 index 00000000000..01d5f46d4e4 --- /dev/null +++ b/net/ipv6/inet6_hashtables.c @@ -0,0 +1,81 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Generic INET6 transport hashtables + * + * Authors: Lotsa people, from code originally in tcp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> + +#include <linux/module.h> + +#include <net/inet_connection_sock.h> +#include <net/inet_hashtables.h> +#include <net/inet6_hashtables.h> + +struct sock *inet6_lookup_listener(struct inet_hashinfo *hashinfo, + const struct in6_addr *daddr, + const unsigned short hnum, const int dif) +{ + struct sock *sk; + const struct hlist_node *node; + struct sock *result = NULL; + int score, hiscore = 0; + + read_lock(&hashinfo->lhash_lock); + sk_for_each(sk, node, &hashinfo->listening_hash[inet_lhashfn(hnum)]) { + if (inet_sk(sk)->num == hnum && sk->sk_family == PF_INET6) { + const struct ipv6_pinfo *np = inet6_sk(sk); + + score = 1; + if (!ipv6_addr_any(&np->rcv_saddr)) { + if (!ipv6_addr_equal(&np->rcv_saddr, daddr)) + continue; + score++; + } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + continue; + score++; + } + if (score == 3) { + result = sk; + break; + } + if (score > hiscore) { + hiscore = score; + result = sk; + } + } + } + if (result) + sock_hold(result); + read_unlock(&hashinfo->lhash_lock); + return result; +} + +EXPORT_SYMBOL_GPL(inet6_lookup_listener); + +struct sock *inet6_lookup(struct inet_hashinfo *hashinfo, + const struct in6_addr *saddr, const u16 sport, + const struct in6_addr *daddr, const u16 dport, + const int dif) +{ + struct sock *sk; + + local_bh_disable(); + sk = __inet6_lookup(hashinfo, saddr, sport, daddr, ntohs(dport), dif); + local_bh_enable(); + + return sk; +} + +EXPORT_SYMBOL_GPL(inet6_lookup); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 1b354aa9793..16af874c9e8 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -49,7 +49,7 @@ struct rt6_statistics rt6_stats; -static kmem_cache_t * fib6_node_kmem; +static kmem_cache_t * fib6_node_kmem __read_mostly; enum fib_walk_state_t { diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 10fbb50daea..6e348042693 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -56,7 +56,7 @@ static inline int ip6_rcv_finish( struct sk_buff *skb) return dst_input(skb); } -int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct ipv6hdr *hdr; u32 pkt_len; @@ -166,8 +166,8 @@ resubmit: nexthdr = skb->nh.raw[nhoff]; raw_sk = sk_head(&raw_v6_htable[nexthdr & (MAX_INET_PROTOS - 1)]); - if (raw_sk) - ipv6_raw_deliver(skb, nexthdr); + if (raw_sk && !ipv6_raw_deliver(skb, nexthdr)) + raw_sk = NULL; hash = nexthdr & (MAX_INET_PROTOS - 1); if ((ipprot = rcu_dereference(inet6_protos[hash])) != NULL) { diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index ae652ca14bc..01ef94f7c7f 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -153,51 +153,6 @@ int ip6_output(struct sk_buff *skb) return ip6_output2(skb); } -#ifdef CONFIG_NETFILTER -int ip6_route_me_harder(struct sk_buff *skb) -{ - struct ipv6hdr *iph = skb->nh.ipv6h; - struct dst_entry *dst; - struct flowi fl = { - .oif = skb->sk ? skb->sk->sk_bound_dev_if : 0, - .nl_u = - { .ip6_u = - { .daddr = iph->daddr, - .saddr = iph->saddr, } }, - .proto = iph->nexthdr, - }; - - dst = ip6_route_output(skb->sk, &fl); - - if (dst->error) { - IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES); - LIMIT_NETDEBUG( - printk(KERN_DEBUG "ip6_route_me_harder: No more route.\n")); - dst_release(dst); - return -EINVAL; - } - - /* Drop old route. */ - dst_release(skb->dst); - - skb->dst = dst; - return 0; -} -#endif - -static inline int ip6_maybe_reroute(struct sk_buff *skb) -{ -#ifdef CONFIG_NETFILTER - if (skb->nfcache & NFC_ALTERED){ - if (ip6_route_me_harder(skb) != 0){ - kfree_skb(skb); - return -EINVAL; - } - } -#endif /* CONFIG_NETFILTER */ - return dst_output(skb); -} - /* * xmit an sk_buff (used by TCP) */ @@ -266,7 +221,8 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, mtu = dst_mtu(dst); if ((skb->len <= mtu) || ipfragok) { IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); - return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, ip6_maybe_reroute); + return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, + dst_output); } if (net_ratelimit()) @@ -321,7 +277,9 @@ static int ip6_call_ra_chain(struct sk_buff *skb, int sel) read_lock(&ip6_ra_lock); for (ra = ip6_ra_chain; ra; ra = ra->next) { struct sock *sk = ra->sk; - if (sk && ra->sel == sel) { + if (sk && ra->sel == sel && + (!sk->sk_bound_dev_if || + sk->sk_bound_dev_if == skb->dev->ifindex)) { if (last) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) @@ -667,7 +625,7 @@ slow_path: */ if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) { - NETDEBUG(printk(KERN_INFO "IPv6: frag: no memory for new fragment!\n")); + NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n"); IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS); err = -ENOMEM; goto fail; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 3bc144a79fa..76466af8331 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -55,7 +55,7 @@ #include <asm/uaccess.h> -DEFINE_SNMP_STAT(struct ipstats_mib, ipv6_statistics); +DEFINE_SNMP_STAT(struct ipstats_mib, ipv6_statistics) __read_mostly; static struct packet_type ipv6_packet_type = { .type = __constant_htons(ETH_P_IPV6), @@ -109,13 +109,6 @@ int ip6_ra_control(struct sock *sk, int sel, void (*destructor)(struct sock *)) return 0; } -extern int ip6_mc_source(int add, int omode, struct sock *sk, - struct group_source_req *pgsr); -extern int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf); -extern int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, - struct group_filter __user *optval, int __user *optlen); - - int ipv6_setsockopt(struct sock *sk, int level, int optname, char __user *optval, int optlen) { @@ -163,6 +156,13 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, fl6_free_socklist(sk); ipv6_sock_mc_close(sk); + /* + * Sock is moving from IPv6 to IPv4 (sk_prot), so + * remove it from the refcnt debug socks count in the + * original family... + */ + sk_refcnt_debug_dec(sk); + if (sk->sk_protocol == IPPROTO_TCP) { struct tcp_sock *tp = tcp_sk(sk); @@ -192,9 +192,11 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, kfree_skb(pktopt); sk->sk_destruct = inet_sock_destruct; -#ifdef INET_REFCNT_DEBUG - atomic_dec(&inet6_sock_nr); -#endif + /* + * ... and add it to the refcnt debug socks count + * in the new family. -acme + */ + sk_refcnt_debug_inc(sk); module_put(THIS_MODULE); retv = 0; break; @@ -437,7 +439,6 @@ done: } case MCAST_MSFILTER: { - extern int sysctl_optmem_max; extern int sysctl_mld_max_msf; struct group_filter *gsf; diff --git a/net/ipv6/ipv6_syms.c b/net/ipv6/ipv6_syms.c index 5ade5a5d199..37a4a99c9fe 100644 --- a/net/ipv6/ipv6_syms.c +++ b/net/ipv6/ipv6_syms.c @@ -15,9 +15,6 @@ EXPORT_SYMBOL(ndisc_mc_map); EXPORT_SYMBOL(register_inet6addr_notifier); EXPORT_SYMBOL(unregister_inet6addr_notifier); EXPORT_SYMBOL(ip6_route_output); -#ifdef CONFIG_NETFILTER -EXPORT_SYMBOL(ip6_route_me_harder); -#endif EXPORT_SYMBOL(addrconf_lock); EXPORT_SYMBOL(ipv6_setsockopt); EXPORT_SYMBOL(ipv6_getsockopt); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 7ae72d4c9bd..a7eae30f455 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -812,7 +812,7 @@ static void ndisc_recv_ns(struct sk_buff *skb) if (ipv6_chk_acast_addr(dev, &msg->target) || (idev->cnf.forwarding && pneigh_lookup(&nd_tbl, &msg->target, dev, 0))) { - if (skb->stamp.tv_sec != LOCALLY_ENQUEUED && + if (!(NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED) && skb->pkt_type != PACKET_HOST && inc != 0 && idev->nd_parms->proxy_delay != 0) { @@ -1487,6 +1487,8 @@ int ndisc_rcv(struct sk_buff *skb) return 0; } + memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); + switch (msg->icmph.icmp6_type) { case NDISC_NEIGHBOUR_SOLICITATION: ndisc_recv_ns(skb); diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c new file mode 100644 index 00000000000..f8626ebf90f --- /dev/null +++ b/net/ipv6/netfilter.c @@ -0,0 +1,104 @@ +#include <linux/config.h> +#include <linux/init.h> + +#ifdef CONFIG_NETFILTER + +#include <linux/kernel.h> +#include <linux/ipv6.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv6.h> +#include <net/dst.h> +#include <net/ipv6.h> +#include <net/ip6_route.h> + +int ip6_route_me_harder(struct sk_buff *skb) +{ + struct ipv6hdr *iph = skb->nh.ipv6h; + struct dst_entry *dst; + struct flowi fl = { + .oif = skb->sk ? skb->sk->sk_bound_dev_if : 0, + .nl_u = + { .ip6_u = + { .daddr = iph->daddr, + .saddr = iph->saddr, } }, + .proto = iph->nexthdr, + }; + + dst = ip6_route_output(skb->sk, &fl); + + if (dst->error) { + IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES); + LIMIT_NETDEBUG(KERN_DEBUG "ip6_route_me_harder: No more route.\n"); + dst_release(dst); + return -EINVAL; + } + + /* Drop old route. */ + dst_release(skb->dst); + + skb->dst = dst; + return 0; +} +EXPORT_SYMBOL(ip6_route_me_harder); + +/* + * Extra routing may needed on local out, as the QUEUE target never + * returns control to the table. + */ + +struct ip6_rt_info { + struct in6_addr daddr; + struct in6_addr saddr; +}; + +static void save(const struct sk_buff *skb, struct nf_info *info) +{ + struct ip6_rt_info *rt_info = nf_info_reroute(info); + + if (info->hook == NF_IP6_LOCAL_OUT) { + struct ipv6hdr *iph = skb->nh.ipv6h; + + rt_info->daddr = iph->daddr; + rt_info->saddr = iph->saddr; + } +} + +static int reroute(struct sk_buff **pskb, const struct nf_info *info) +{ + struct ip6_rt_info *rt_info = nf_info_reroute(info); + + if (info->hook == NF_IP6_LOCAL_OUT) { + struct ipv6hdr *iph = (*pskb)->nh.ipv6h; + if (!ipv6_addr_equal(&iph->daddr, &rt_info->daddr) || + !ipv6_addr_equal(&iph->saddr, &rt_info->saddr)) + return ip6_route_me_harder(*pskb); + } + return 0; +} + +static struct nf_queue_rerouter ip6_reroute = { + .rer_size = sizeof(struct ip6_rt_info), + .save = &save, + .reroute = &reroute, +}; + +int __init ipv6_netfilter_init(void) +{ + return nf_register_queue_rerouter(PF_INET6, &ip6_reroute); +} + +void ipv6_netfilter_fini(void) +{ + nf_unregister_queue_rerouter(PF_INET6); +} + +#else /* CONFIG_NETFILTER */ +int __init ipv6_netfilter_init(void) +{ + return 0; +} + +void ipv6_netfilter_fini(void) +{ +} +#endif /* CONFIG_NETFILTER */ diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 77ec704c9ee..216fbe1ac65 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -10,13 +10,16 @@ menu "IPv6: Netfilter Configuration (EXPERIMENTAL)" # dep_tristate ' FTP protocol support' CONFIG_IP6_NF_FTP $CONFIG_IP6_NF_CONNTRACK #fi config IP6_NF_QUEUE - tristate "Userspace queueing via NETLINK" + tristate "IP6 Userspace queueing via NETLINK (OBSOLETE)" ---help--- This option adds a queue handler to the kernel for IPv6 - packets which lets us to receive the filtered packets - with QUEUE target using libiptc as we can do with - the IPv4 now. + packets which enables users to receive the filtered packets + with QUEUE target using libipq. + + THis option enables the old IPv6-only "ip6_queue" implementation + which has been obsoleted by the new "nfnetlink_queue" code (see + CONFIG_NETFILTER_NETLINK_QUEUE). (C) Fernando Anton 2001 IPv64 Project - Work based in IPv64 draft by Arturo Azcorra. @@ -196,6 +199,16 @@ config IP6_NF_TARGET_LOG To compile it as a module, choose M here. If unsure, say N. +config IP6_NF_TARGET_REJECT + tristate "REJECT target support" + depends on IP6_NF_FILTER + help + The REJECT target allows a filtering rule to specify that an ICMPv6 + error should be issued in response to an incoming packet, rather + than silently being dropped. + + To compile it as a module, choose M here. If unsure, say N. + # if [ "$CONFIG_IP6_NF_FILTER" != "n" ]; then # dep_tristate ' REJECT target support' CONFIG_IP6_NF_TARGET_REJECT $CONFIG_IP6_NF_FILTER # if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then @@ -226,6 +239,22 @@ config IP6_NF_TARGET_MARK To compile it as a module, choose M here. If unsure, say N. +config IP6_NF_TARGET_HL + tristate 'HL (hoplimit) target support' + depends on IP6_NF_MANGLE + help + This option adds a `HL' target, which enables the user to decrement + the hoplimit value of the IPv6 header or set it to a given (lower) + value. + + While it is safe to decrement the hoplimit value, this option also + enables functionality to increment and set the hoplimit value of the + IPv6 header to arbitrary values. This is EXTREMELY DANGEROUS since + you can easily create immortal packets that loop forever on the + network. + + To compile it as a module, choose M here. If unsure, say N. + #dep_tristate ' LOG target support' CONFIG_IP6_NF_TARGET_LOG $CONFIG_IP6_NF_IPTABLES config IP6_NF_RAW tristate 'raw table support (required for TRACE)' diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index 2e51714953b..bd9a16a5cbb 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -20,7 +20,10 @@ obj-$(CONFIG_IP6_NF_MATCH_PHYSDEV) += ip6t_physdev.o obj-$(CONFIG_IP6_NF_FILTER) += ip6table_filter.o obj-$(CONFIG_IP6_NF_MANGLE) += ip6table_mangle.o obj-$(CONFIG_IP6_NF_TARGET_MARK) += ip6t_MARK.o +obj-$(CONFIG_IP6_NF_TARGET_HL) += ip6t_HL.o obj-$(CONFIG_IP6_NF_QUEUE) += ip6_queue.o obj-$(CONFIG_IP6_NF_TARGET_LOG) += ip6t_LOG.o obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o obj-$(CONFIG_IP6_NF_MATCH_HL) += ip6t_hl.o +obj-$(CONFIG_IP6_NF_TARGET_REJECT) += ip6t_REJECT.o +obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += ip6t_NFQUEUE.o diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c index a16df5b27c8..aa11cf366ef 100644 --- a/net/ipv6/netfilter/ip6_queue.c +++ b/net/ipv6/netfilter/ip6_queue.c @@ -47,16 +47,10 @@ #define NET_IPQ_QMAX 2088 #define NET_IPQ_QMAX_NAME "ip6_queue_maxlen" -struct ipq_rt_info { - struct in6_addr daddr; - struct in6_addr saddr; -}; - struct ipq_queue_entry { struct list_head list; struct nf_info *info; struct sk_buff *skb; - struct ipq_rt_info rt_info; }; typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long); @@ -244,8 +238,8 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp) pmsg->packet_id = (unsigned long )entry; pmsg->data_len = data_len; - pmsg->timestamp_sec = entry->skb->stamp.tv_sec; - pmsg->timestamp_usec = entry->skb->stamp.tv_usec; + pmsg->timestamp_sec = skb_tv_base.tv_sec + entry->skb->tstamp.off_sec; + pmsg->timestamp_usec = skb_tv_base.tv_usec + entry->skb->tstamp.off_usec; pmsg->mark = entry->skb->nfmark; pmsg->hook = entry->info->hook; pmsg->hw_protocol = entry->skb->protocol; @@ -284,7 +278,8 @@ nlmsg_failure: } static int -ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data) +ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, + unsigned int queuenum, void *data) { int status = -EINVAL; struct sk_buff *nskb; @@ -302,13 +297,6 @@ ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data) entry->info = info; entry->skb = skb; - if (entry->info->hook == NF_IP_LOCAL_OUT) { - struct ipv6hdr *iph = skb->nh.ipv6h; - - entry->rt_info.daddr = iph->daddr; - entry->rt_info.saddr = iph->saddr; - } - nskb = ipq_build_packet_message(entry, &status); if (nskb == NULL) goto err_out_free; @@ -384,23 +372,11 @@ ipq_mangle_ipv6(ipq_verdict_msg_t *v, struct ipq_queue_entry *e) } skb_put(e->skb, diff); } - if (!skb_ip_make_writable(&e->skb, v->data_len)) + if (!skb_make_writable(&e->skb, v->data_len)) return -ENOMEM; memcpy(e->skb->data, v->payload, v->data_len); e->skb->ip_summed = CHECKSUM_NONE; - e->skb->nfcache |= NFC_ALTERED; - - /* - * Extra routing may needed on local out, as the QUEUE target never - * returns control to the table. - * Not a nice way to cmp, but works - */ - if (e->info->hook == NF_IP_LOCAL_OUT) { - struct ipv6hdr *iph = e->skb->nh.ipv6h; - if (!ipv6_addr_equal(&iph->daddr, &e->rt_info.daddr) || - !ipv6_addr_equal(&iph->saddr, &e->rt_info.saddr)) - return ip6_route_me_harder(e->skb); - } + return 0; } @@ -676,6 +652,11 @@ ipq_get_info(char *buffer, char **start, off_t offset, int length) return len; } +static struct nf_queue_handler nfqh = { + .name = "ip6_queue", + .outfn = &ipq_enqueue_packet, +}; + static int init_or_cleanup(int init) { @@ -686,7 +667,8 @@ init_or_cleanup(int init) goto cleanup; netlink_register_notifier(&ipq_nl_notifier); - ipqnl = netlink_kernel_create(NETLINK_IP6_FW, ipq_rcv_sk); + ipqnl = netlink_kernel_create(NETLINK_IP6_FW, 0, ipq_rcv_sk, + THIS_MODULE); if (ipqnl == NULL) { printk(KERN_ERR "ip6_queue: failed to create netlink socket\n"); goto cleanup_netlink_notifier; @@ -703,7 +685,7 @@ init_or_cleanup(int init) register_netdevice_notifier(&ipq_dev_notifier); ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0); - status = nf_register_queue_handler(PF_INET6, ipq_enqueue_packet, NULL); + status = nf_register_queue_handler(PF_INET6, &nfqh); if (status < 0) { printk(KERN_ERR "ip6_queue: failed to register queue handler\n"); goto cleanup_sysctl; @@ -711,7 +693,7 @@ init_or_cleanup(int init) return status; cleanup: - nf_unregister_queue_handler(PF_INET6); + nf_unregister_queue_handlers(&nfqh); synchronize_net(); ipq_flush(NF_DROP); diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 73034511c8d..1cb8adb2787 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -401,7 +401,6 @@ ip6t_do_table(struct sk_buff **pskb, do { IP_NF_ASSERT(e); IP_NF_ASSERT(back); - (*pskb)->nfcache |= e->nfcache; if (ip6_packet_match(*pskb, indev, outdev, &e->ipv6, &protoff, &offset)) { struct ip6t_entry_target *t; @@ -434,8 +433,8 @@ ip6t_do_table(struct sk_buff **pskb, back->comefrom); continue; } - if (table_base + v - != (void *)e + e->next_offset) { + if (table_base + v != (void *)e + e->next_offset + && !(e->ipv6.flags & IP6T_F_GOTO)) { /* Save old back ptr in next entry */ struct ip6t_entry *next = (void *)e + e->next_offset; diff --git a/net/ipv6/netfilter/ip6t_HL.c b/net/ipv6/netfilter/ip6t_HL.c new file mode 100644 index 00000000000..8f5549b7272 --- /dev/null +++ b/net/ipv6/netfilter/ip6t_HL.c @@ -0,0 +1,118 @@ +/* + * Hop Limit modification target for ip6tables + * Maciej Soltysiak <solt@dns.toxicfilms.tv> + * Based on HW's TTL module + * + * This software is distributed under the terms of GNU GPL + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> + +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/netfilter_ipv6/ip6t_HL.h> + +MODULE_AUTHOR("Maciej Soltysiak <solt@dns.toxicfilms.tv>"); +MODULE_DESCRIPTION("IP tables Hop Limit modification module"); +MODULE_LICENSE("GPL"); + +static unsigned int ip6t_hl_target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, void *userinfo) +{ + struct ipv6hdr *ip6h; + const struct ip6t_HL_info *info = targinfo; + u_int16_t diffs[2]; + int new_hl; + + if (!skb_make_writable(pskb, (*pskb)->len)) + return NF_DROP; + + ip6h = (*pskb)->nh.ipv6h; + + switch (info->mode) { + case IP6T_HL_SET: + new_hl = info->hop_limit; + break; + case IP6T_HL_INC: + new_hl = ip6h->hop_limit + info->hop_limit; + if (new_hl > 255) + new_hl = 255; + break; + case IP6T_HL_DEC: + new_hl = ip6h->hop_limit - info->hop_limit; + if (new_hl < 0) + new_hl = 0; + break; + default: + new_hl = ip6h->hop_limit; + break; + } + + if (new_hl != ip6h->hop_limit) { + diffs[0] = htons(((unsigned)ip6h->hop_limit) << 8) ^ 0xFFFF; + ip6h->hop_limit = new_hl; + diffs[1] = htons(((unsigned)ip6h->hop_limit) << 8); + } + + return IP6T_CONTINUE; +} + +static int ip6t_hl_checkentry(const char *tablename, + const struct ip6t_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + struct ip6t_HL_info *info = targinfo; + + if (targinfosize != IP6T_ALIGN(sizeof(struct ip6t_HL_info))) { + printk(KERN_WARNING "ip6t_HL: targinfosize %u != %Zu\n", + targinfosize, + IP6T_ALIGN(sizeof(struct ip6t_HL_info))); + return 0; + } + + if (strcmp(tablename, "mangle")) { + printk(KERN_WARNING "ip6t_HL: can only be called from " + "\"mangle\" table, not \"%s\"\n", tablename); + return 0; + } + + if (info->mode > IP6T_HL_MAXMODE) { + printk(KERN_WARNING "ip6t_HL: invalid or unknown Mode %u\n", + info->mode); + return 0; + } + + if ((info->mode != IP6T_HL_SET) && (info->hop_limit == 0)) { + printk(KERN_WARNING "ip6t_HL: increment/decrement doesn't " + "make sense with value 0\n"); + return 0; + } + + return 1; +} + +static struct ip6t_target ip6t_HL = { + .name = "HL", + .target = ip6t_hl_target, + .checkentry = ip6t_hl_checkentry, + .me = THIS_MODULE +}; + +static int __init init(void) +{ + return ip6t_register_target(&ip6t_HL); +} + +static void __exit fini(void) +{ + ip6t_unregister_target(&ip6t_HL); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c index a692e26a4fa..0cd1d1bd903 100644 --- a/net/ipv6/netfilter/ip6t_LOG.c +++ b/net/ipv6/netfilter/ip6t_LOG.c @@ -26,10 +26,6 @@ MODULE_AUTHOR("Jan Rekorajski <baggins@pld.org.pl>"); MODULE_DESCRIPTION("IP6 tables LOG target module"); MODULE_LICENSE("GPL"); -static unsigned int nflog = 1; -module_param(nflog, int, 0400); -MODULE_PARM_DESC(nflog, "register as internal netfilter logging module"); - struct in_device; #include <net/route.h> #include <linux/netfilter_ipv6/ip6t_LOG.h> @@ -44,7 +40,7 @@ struct in_device; static DEFINE_SPINLOCK(log_lock); /* One level of recursion won't kill us */ -static void dump_packet(const struct ip6t_log_info *info, +static void dump_packet(const struct nf_loginfo *info, const struct sk_buff *skb, unsigned int ip6hoff, int recurse) { @@ -53,6 +49,12 @@ static void dump_packet(const struct ip6t_log_info *info, struct ipv6hdr _ip6h, *ih; unsigned int ptr; unsigned int hdrlen = 0; + unsigned int logflags; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + else + logflags = NF_LOG_MASK; ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h); if (ih == NULL) { @@ -84,7 +86,7 @@ static void dump_packet(const struct ip6t_log_info *info, } /* Max length: 48 "OPT (...) " */ - if (info->logflags & IP6T_LOG_IPOPT) + if (logflags & IP6T_LOG_IPOPT) printk("OPT ( "); switch (currenthdr) { @@ -119,7 +121,7 @@ static void dump_packet(const struct ip6t_log_info *info, case IPPROTO_ROUTING: case IPPROTO_HOPOPTS: if (fragment) { - if (info->logflags & IP6T_LOG_IPOPT) + if (logflags & IP6T_LOG_IPOPT) printk(")"); return; } @@ -127,7 +129,7 @@ static void dump_packet(const struct ip6t_log_info *info, break; /* Max Length */ case IPPROTO_AH: - if (info->logflags & IP6T_LOG_IPOPT) { + if (logflags & IP6T_LOG_IPOPT) { struct ip_auth_hdr _ahdr, *ah; /* Max length: 3 "AH " */ @@ -158,7 +160,7 @@ static void dump_packet(const struct ip6t_log_info *info, hdrlen = (hp->hdrlen+2)<<2; break; case IPPROTO_ESP: - if (info->logflags & IP6T_LOG_IPOPT) { + if (logflags & IP6T_LOG_IPOPT) { struct ip_esp_hdr _esph, *eh; /* Max length: 4 "ESP " */ @@ -190,7 +192,7 @@ static void dump_packet(const struct ip6t_log_info *info, printk("Unknown Ext Hdr %u", currenthdr); return; } - if (info->logflags & IP6T_LOG_IPOPT) + if (logflags & IP6T_LOG_IPOPT) printk(") "); currenthdr = hp->nexthdr; @@ -218,7 +220,7 @@ static void dump_packet(const struct ip6t_log_info *info, printk("SPT=%u DPT=%u ", ntohs(th->source), ntohs(th->dest)); /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ - if (info->logflags & IP6T_LOG_TCPSEQ) + if (logflags & IP6T_LOG_TCPSEQ) printk("SEQ=%u ACK=%u ", ntohl(th->seq), ntohl(th->ack_seq)); /* Max length: 13 "WINDOW=65535 " */ @@ -245,7 +247,7 @@ static void dump_packet(const struct ip6t_log_info *info, /* Max length: 11 "URGP=65535 " */ printk("URGP=%u ", ntohs(th->urg_ptr)); - if ((info->logflags & IP6T_LOG_TCPOPT) + if ((logflags & IP6T_LOG_TCPOPT) && th->doff * 4 > sizeof(struct tcphdr)) { u_int8_t _opt[60 - sizeof(struct tcphdr)], *op; unsigned int i; @@ -349,7 +351,7 @@ static void dump_packet(const struct ip6t_log_info *info, } /* Max length: 15 "UID=4294967295 " */ - if ((info->logflags & IP6T_LOG_UID) && recurse && skb->sk) { + if ((logflags & IP6T_LOG_UID) && recurse && skb->sk) { read_lock_bh(&skb->sk->sk_callback_lock); if (skb->sk->sk_socket && skb->sk->sk_socket->file) printk("UID=%u ", skb->sk->sk_socket->file->f_uid); @@ -357,19 +359,31 @@ static void dump_packet(const struct ip6t_log_info *info, } } +static struct nf_loginfo default_loginfo = { + .type = NF_LOG_TYPE_LOG, + .u = { + .log = { + .level = 0, + .logflags = NF_LOG_MASK, + }, + }, +}; + static void -ip6t_log_packet(unsigned int hooknum, +ip6t_log_packet(unsigned int pf, + unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, - const struct ip6t_log_info *loginfo, - const char *level_string, + const struct nf_loginfo *loginfo, const char *prefix) { + if (!loginfo) + loginfo = &default_loginfo; + spin_lock_bh(&log_lock); - printk(level_string); - printk("%sIN=%s OUT=%s ", - prefix == NULL ? loginfo->prefix : prefix, + printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, + prefix, in ? in->name : "", out ? out->name : ""); if (in && !out) { @@ -416,29 +430,17 @@ ip6t_log_target(struct sk_buff **pskb, void *userinfo) { const struct ip6t_log_info *loginfo = targinfo; - char level_string[4] = "< >"; + struct nf_loginfo li; + + li.type = NF_LOG_TYPE_LOG; + li.u.log.level = loginfo->level; + li.u.log.logflags = loginfo->logflags; - level_string[1] = '0' + (loginfo->level % 8); - ip6t_log_packet(hooknum, *pskb, in, out, loginfo, level_string, NULL); + nf_log_packet(PF_INET6, hooknum, *pskb, in, out, &li, loginfo->prefix); return IP6T_CONTINUE; } -static void -ip6t_logfn(unsigned int hooknum, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const char *prefix) -{ - struct ip6t_log_info loginfo = { - .level = 0, - .logflags = IP6T_LOG_MASK, - .prefix = "" - }; - - ip6t_log_packet(hooknum, skb, in, out, &loginfo, KERN_WARNING, prefix); -} static int ip6t_log_checkentry(const char *tablename, const struct ip6t_entry *e, @@ -475,20 +477,29 @@ static struct ip6t_target ip6t_log_reg = { .me = THIS_MODULE, }; +static struct nf_logger ip6t_logger = { + .name = "ip6t_LOG", + .logfn = &ip6t_log_packet, + .me = THIS_MODULE, +}; + static int __init init(void) { if (ip6t_register_target(&ip6t_log_reg)) return -EINVAL; - if (nflog) - nf_log_register(PF_INET6, &ip6t_logfn); + if (nf_log_register(PF_INET6, &ip6t_logger) < 0) { + printk(KERN_WARNING "ip6t_LOG: not logging via system console " + "since somebody else already registered for PF_INET6\n"); + /* we cannot make module load fail here, since otherwise + * ip6tables userspace would abort */ + } return 0; } static void __exit fini(void) { - if (nflog) - nf_log_unregister(PF_INET6, &ip6t_logfn); + nf_log_unregister_logger(&ip6t_logger); ip6t_unregister_target(&ip6t_log_reg); } diff --git a/net/ipv6/netfilter/ip6t_MARK.c b/net/ipv6/netfilter/ip6t_MARK.c index d09ceb05013..81924fcc585 100644 --- a/net/ipv6/netfilter/ip6t_MARK.c +++ b/net/ipv6/netfilter/ip6t_MARK.c @@ -28,10 +28,9 @@ target(struct sk_buff **pskb, { const struct ip6t_mark_target_info *markinfo = targinfo; - if((*pskb)->nfmark != markinfo->mark) { + if((*pskb)->nfmark != markinfo->mark) (*pskb)->nfmark = markinfo->mark; - (*pskb)->nfcache |= NFC_ALTERED; - } + return IP6T_CONTINUE; } diff --git a/net/ipv6/netfilter/ip6t_NFQUEUE.c b/net/ipv6/netfilter/ip6t_NFQUEUE.c new file mode 100644 index 00000000000..c6e3730e740 --- /dev/null +++ b/net/ipv6/netfilter/ip6t_NFQUEUE.c @@ -0,0 +1,70 @@ +/* ip6tables module for using new netfilter netlink queue + * + * (C) 2005 by Harald Welte <laforge@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include <linux/module.h> +#include <linux/skbuff.h> + +#include <linux/netfilter.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/netfilter_ipv4/ipt_NFQUEUE.h> + +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_DESCRIPTION("ip6tables NFQUEUE target"); +MODULE_LICENSE("GPL"); + +static unsigned int +target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ipt_NFQ_info *tinfo = targinfo; + + return NF_QUEUE_NR(tinfo->queuenum); +} + +static int +checkentry(const char *tablename, + const struct ip6t_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + if (targinfosize != IP6T_ALIGN(sizeof(struct ipt_NFQ_info))) { + printk(KERN_WARNING "NFQUEUE: targinfosize %u != %Zu\n", + targinfosize, + IP6T_ALIGN(sizeof(struct ipt_NFQ_info))); + return 0; + } + + return 1; +} + +static struct ip6t_target ipt_NFQ_reg = { + .name = "NFQUEUE", + .target = target, + .checkentry = checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ip6t_register_target(&ipt_NFQ_reg); +} + +static void __exit fini(void) +{ + ip6t_unregister_target(&ipt_NFQ_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c new file mode 100644 index 00000000000..14316c3ebde --- /dev/null +++ b/net/ipv6/netfilter/ip6t_REJECT.c @@ -0,0 +1,284 @@ +/* + * IP6 tables REJECT target module + * Linux INET6 implementation + * + * Copyright (C)2003 USAGI/WIDE Project + * + * Authors: + * Yasuyuki Kozakai <yasuyuki.kozakai@toshiba.co.jp> + * + * Based on net/ipv4/netfilter/ipt_REJECT.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/icmpv6.h> +#include <linux/netdevice.h> +#include <net/ipv6.h> +#include <net/tcp.h> +#include <net/icmp.h> +#include <net/ip6_checksum.h> +#include <net/ip6_fib.h> +#include <net/ip6_route.h> +#include <net/flow.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/netfilter_ipv6/ip6t_REJECT.h> + +MODULE_AUTHOR("Yasuyuki KOZAKAI <yasuyuki.kozakai@toshiba.co.jp>"); +MODULE_DESCRIPTION("IP6 tables REJECT target module"); +MODULE_LICENSE("GPL"); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +/* Send RST reply */ +static void send_reset(struct sk_buff *oldskb) +{ + struct sk_buff *nskb; + struct tcphdr otcph, *tcph; + unsigned int otcplen, hh_len; + int tcphoff, needs_ack; + struct ipv6hdr *oip6h = oldskb->nh.ipv6h, *ip6h; + struct dst_entry *dst = NULL; + u8 proto; + struct flowi fl; + + if ((!(ipv6_addr_type(&oip6h->saddr) & IPV6_ADDR_UNICAST)) || + (!(ipv6_addr_type(&oip6h->daddr) & IPV6_ADDR_UNICAST))) { + DEBUGP("ip6t_REJECT: addr is not unicast.\n"); + return; + } + + proto = oip6h->nexthdr; + tcphoff = ipv6_skip_exthdr(oldskb, ((u8*)(oip6h+1) - oldskb->data), &proto); + + if ((tcphoff < 0) || (tcphoff > oldskb->len)) { + DEBUGP("ip6t_REJECT: Can't get TCP header.\n"); + return; + } + + otcplen = oldskb->len - tcphoff; + + /* IP header checks: fragment, too short. */ + if ((proto != IPPROTO_TCP) || (otcplen < sizeof(struct tcphdr))) { + DEBUGP("ip6t_REJECT: proto(%d) != IPPROTO_TCP, or too short. otcplen = %d\n", + proto, otcplen); + return; + } + + if (skb_copy_bits(oldskb, tcphoff, &otcph, sizeof(struct tcphdr))) + BUG(); + + /* No RST for RST. */ + if (otcph.rst) { + DEBUGP("ip6t_REJECT: RST is set\n"); + return; + } + + /* Check checksum. */ + if (csum_ipv6_magic(&oip6h->saddr, &oip6h->daddr, otcplen, IPPROTO_TCP, + skb_checksum(oldskb, tcphoff, otcplen, 0))) { + DEBUGP("ip6t_REJECT: TCP checksum is invalid\n"); + return; + } + + memset(&fl, 0, sizeof(fl)); + fl.proto = IPPROTO_TCP; + ipv6_addr_copy(&fl.fl6_src, &oip6h->daddr); + ipv6_addr_copy(&fl.fl6_dst, &oip6h->saddr); + fl.fl_ip_sport = otcph.dest; + fl.fl_ip_dport = otcph.source; + dst = ip6_route_output(NULL, &fl); + if (dst == NULL) + return; + if (dst->error || + xfrm_lookup(&dst, &fl, NULL, 0)) { + dst_release(dst); + return; + } + + hh_len = (dst->dev->hard_header_len + 15)&~15; + nskb = alloc_skb(hh_len + 15 + dst->header_len + sizeof(struct ipv6hdr) + + sizeof(struct tcphdr) + dst->trailer_len, + GFP_ATOMIC); + + if (!nskb) { + if (net_ratelimit()) + printk("ip6t_REJECT: Can't alloc skb\n"); + dst_release(dst); + return; + } + + nskb->dst = dst; + + skb_reserve(nskb, hh_len + dst->header_len); + + ip6h = nskb->nh.ipv6h = (struct ipv6hdr *) + skb_put(nskb, sizeof(struct ipv6hdr)); + ip6h->version = 6; + ip6h->hop_limit = dst_metric(dst, RTAX_HOPLIMIT); + ip6h->nexthdr = IPPROTO_TCP; + ip6h->payload_len = htons(sizeof(struct tcphdr)); + ipv6_addr_copy(&ip6h->saddr, &oip6h->daddr); + ipv6_addr_copy(&ip6h->daddr, &oip6h->saddr); + + tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr)); + /* Truncate to length (no data) */ + tcph->doff = sizeof(struct tcphdr)/4; + tcph->source = otcph.dest; + tcph->dest = otcph.source; + + if (otcph.ack) { + needs_ack = 0; + tcph->seq = otcph.ack_seq; + tcph->ack_seq = 0; + } else { + needs_ack = 1; + tcph->ack_seq = htonl(ntohl(otcph.seq) + otcph.syn + otcph.fin + + otcplen - (otcph.doff<<2)); + tcph->seq = 0; + } + + /* Reset flags */ + ((u_int8_t *)tcph)[13] = 0; + tcph->rst = 1; + tcph->ack = needs_ack; + tcph->window = 0; + tcph->urg_ptr = 0; + tcph->check = 0; + + /* Adjust TCP checksum */ + tcph->check = csum_ipv6_magic(&nskb->nh.ipv6h->saddr, + &nskb->nh.ipv6h->daddr, + sizeof(struct tcphdr), IPPROTO_TCP, + csum_partial((char *)tcph, + sizeof(struct tcphdr), 0)); + + NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, nskb, NULL, nskb->dst->dev, + dst_output); +} + +static inline void +send_unreach(struct sk_buff *skb_in, unsigned char code, unsigned int hooknum) +{ + if (hooknum == NF_IP6_LOCAL_OUT && skb_in->dev == NULL) + skb_in->dev = &loopback_dev; + + icmpv6_send(skb_in, ICMPV6_DEST_UNREACH, code, 0, NULL); +} + +static unsigned int reject6_target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ip6t_reject_info *reject = targinfo; + + DEBUGP(KERN_DEBUG "%s: medium point\n", __FUNCTION__); + /* WARNING: This code causes reentry within ip6tables. + This means that the ip6tables jump stack is now crap. We + must return an absolute verdict. --RR */ + switch (reject->with) { + case IP6T_ICMP6_NO_ROUTE: + send_unreach(*pskb, ICMPV6_NOROUTE, hooknum); + break; + case IP6T_ICMP6_ADM_PROHIBITED: + send_unreach(*pskb, ICMPV6_ADM_PROHIBITED, hooknum); + break; + case IP6T_ICMP6_NOT_NEIGHBOUR: + send_unreach(*pskb, ICMPV6_NOT_NEIGHBOUR, hooknum); + break; + case IP6T_ICMP6_ADDR_UNREACH: + send_unreach(*pskb, ICMPV6_ADDR_UNREACH, hooknum); + break; + case IP6T_ICMP6_PORT_UNREACH: + send_unreach(*pskb, ICMPV6_PORT_UNREACH, hooknum); + break; + case IP6T_ICMP6_ECHOREPLY: + /* Do nothing */ + break; + case IP6T_TCP_RESET: + send_reset(*pskb); + break; + default: + if (net_ratelimit()) + printk(KERN_WARNING "ip6t_REJECT: case %u not handled yet\n", reject->with); + break; + } + + return NF_DROP; +} + +static int check(const char *tablename, + const struct ip6t_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + const struct ip6t_reject_info *rejinfo = targinfo; + + if (targinfosize != IP6T_ALIGN(sizeof(struct ip6t_reject_info))) { + DEBUGP("ip6t_REJECT: targinfosize %u != 0\n", targinfosize); + return 0; + } + + /* Only allow these for packet filtering. */ + if (strcmp(tablename, "filter") != 0) { + DEBUGP("ip6t_REJECT: bad table `%s'.\n", tablename); + return 0; + } + + if ((hook_mask & ~((1 << NF_IP6_LOCAL_IN) + | (1 << NF_IP6_FORWARD) + | (1 << NF_IP6_LOCAL_OUT))) != 0) { + DEBUGP("ip6t_REJECT: bad hook mask %X\n", hook_mask); + return 0; + } + + if (rejinfo->with == IP6T_ICMP6_ECHOREPLY) { + printk("ip6t_REJECT: ECHOREPLY is not supported.\n"); + return 0; + } else if (rejinfo->with == IP6T_TCP_RESET) { + /* Must specify that it's a TCP packet */ + if (e->ipv6.proto != IPPROTO_TCP + || (e->ipv6.invflags & IP6T_INV_PROTO)) { + DEBUGP("ip6t_REJECT: TCP_RESET illegal for non-tcp\n"); + return 0; + } + } + + return 1; +} + +static struct ip6t_target ip6t_reject_reg = { + .name = "REJECT", + .target = reject6_target, + .checkentry = check, + .me = THIS_MODULE +}; + +static int __init init(void) +{ + if (ip6t_register_target(&ip6t_reject_reg)) + return -EINVAL; + return 0; +} + +static void __exit fini(void) +{ + ip6t_unregister_target(&ip6t_reject_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv6/netfilter/ip6t_owner.c b/net/ipv6/netfilter/ip6t_owner.c index ab0e32d3de4..9b91decbfdd 100644 --- a/net/ipv6/netfilter/ip6t_owner.c +++ b/net/ipv6/netfilter/ip6t_owner.c @@ -20,71 +20,6 @@ MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); MODULE_DESCRIPTION("IP6 tables owner matching module"); MODULE_LICENSE("GPL"); -static int -match_pid(const struct sk_buff *skb, pid_t pid) -{ - struct task_struct *p; - struct files_struct *files; - int i; - - read_lock(&tasklist_lock); - p = find_task_by_pid(pid); - if (!p) - goto out; - task_lock(p); - files = p->files; - if(files) { - spin_lock(&files->file_lock); - for (i=0; i < files->max_fds; i++) { - if (fcheck_files(files, i) == skb->sk->sk_socket->file) { - spin_unlock(&files->file_lock); - task_unlock(p); - read_unlock(&tasklist_lock); - return 1; - } - } - spin_unlock(&files->file_lock); - } - task_unlock(p); -out: - read_unlock(&tasklist_lock); - return 0; -} - -static int -match_sid(const struct sk_buff *skb, pid_t sid) -{ - struct task_struct *g, *p; - struct file *file = skb->sk->sk_socket->file; - int i, found=0; - - read_lock(&tasklist_lock); - do_each_thread(g, p) { - struct files_struct *files; - if (p->signal->session != sid) - continue; - - task_lock(p); - files = p->files; - if (files) { - spin_lock(&files->file_lock); - for (i=0; i < files->max_fds; i++) { - if (fcheck_files(files, i) == file) { - found = 1; - break; - } - } - spin_unlock(&files->file_lock); - } - task_unlock(p); - if (found) - goto out; - } while_each_thread(g, p); -out: - read_unlock(&tasklist_lock); - - return found; -} static int match(const struct sk_buff *skb, @@ -112,18 +47,6 @@ match(const struct sk_buff *skb, return 0; } - if(info->match & IP6T_OWNER_PID) { - if (!match_pid(skb, info->pid) ^ - !!(info->invert & IP6T_OWNER_PID)) - return 0; - } - - if(info->match & IP6T_OWNER_SID) { - if (!match_sid(skb, info->sid) ^ - !!(info->invert & IP6T_OWNER_SID)) - return 0; - } - return 1; } @@ -134,6 +57,8 @@ checkentry(const char *tablename, unsigned int matchsize, unsigned int hook_mask) { + const struct ip6t_owner_info *info = matchinfo; + if (hook_mask & ~((1 << NF_IP6_LOCAL_OUT) | (1 << NF_IP6_POST_ROUTING))) { printk("ip6t_owner: only valid for LOCAL_OUT or POST_ROUTING.\n"); @@ -142,14 +67,13 @@ checkentry(const char *tablename, if (matchsize != IP6T_ALIGN(sizeof(struct ip6t_owner_info))) return 0; -#ifdef CONFIG_SMP - /* files->file_lock can not be used in a BH */ - if (((struct ip6t_owner_info *)matchinfo)->match - & (IP6T_OWNER_PID|IP6T_OWNER_SID)) { - printk("ip6t_owner: pid and sid matching is broken on SMP.\n"); + + if (info->match & (IP6T_OWNER_PID|IP6T_OWNER_SID)) { + printk("ipt_owner: pid and sid matching " + "not supported anymore\n"); return 0; } -#endif + return 1; } diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 1d4d75b34d3..7a5863298f3 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -49,6 +49,7 @@ #include <net/transp_v6.h> #include <net/udp.h> #include <net/inet_common.h> +#include <net/tcp_states.h> #include <net/rawv6.h> #include <net/xfrm.h> @@ -81,7 +82,8 @@ static void raw_v6_unhash(struct sock *sk) /* Grumble... icmp and ip_input want to get at this... */ struct sock *__raw_v6_lookup(struct sock *sk, unsigned short num, - struct in6_addr *loc_addr, struct in6_addr *rmt_addr) + struct in6_addr *loc_addr, struct in6_addr *rmt_addr, + int dif) { struct hlist_node *node; int is_multicast = ipv6_addr_is_multicast(loc_addr); @@ -94,6 +96,9 @@ struct sock *__raw_v6_lookup(struct sock *sk, unsigned short num, !ipv6_addr_equal(&np->daddr, rmt_addr)) continue; + if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) + continue; + if (!ipv6_addr_any(&np->rcv_saddr)) { if (ipv6_addr_equal(&np->rcv_saddr, loc_addr)) goto found; @@ -137,11 +142,12 @@ static __inline__ int icmpv6_filter(struct sock *sk, struct sk_buff *skb) * * Caller owns SKB so we must make clones. */ -void ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) +int ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) { struct in6_addr *saddr; struct in6_addr *daddr; struct sock *sk; + int delivered = 0; __u8 hash; saddr = &skb->nh.ipv6h->saddr; @@ -160,9 +166,10 @@ void ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) if (sk == NULL) goto out; - sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr); + sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr, skb->dev->ifindex); while (sk) { + delivered = 1; if (nexthdr != IPPROTO_ICMPV6 || !icmpv6_filter(sk, skb)) { struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); @@ -170,10 +177,12 @@ void ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) if (clone) rawv6_rcv(sk, clone); } - sk = __raw_v6_lookup(sk_next(sk), nexthdr, daddr, saddr); + sk = __raw_v6_lookup(sk_next(sk), nexthdr, daddr, saddr, + skb->dev->ifindex); } out: read_unlock(&raw_v6_lock); + return delivered; } /* This cleans up af_inet6 a bit. -DaveM */ @@ -334,8 +343,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) if (csum_ipv6_magic(&skb->nh.ipv6h->saddr, &skb->nh.ipv6h->daddr, skb->len, inet->num, skb->csum)) { - LIMIT_NETDEBUG( - printk(KERN_DEBUG "raw v6 hw csum failure.\n")); + LIMIT_NETDEBUG(KERN_DEBUG "raw v6 hw csum failure.\n"); skb->ip_summed = CHECKSUM_NONE; } } diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 59e7c631787..9d9e04344c7 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -562,7 +562,7 @@ static void ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, if (skb->dev) fq->iif = skb->dev->ifindex; skb->dev = NULL; - fq->stamp = skb->stamp; + skb_get_timestamp(skb, &fq->stamp); fq->meat += skb->len; atomic_add(skb->truesize, &ip6_frag_mem); @@ -664,7 +664,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff **skb_in, head->next = NULL; head->dev = dev; - head->stamp = fq->stamp; + skb_set_timestamp(head, &fq->stamp); head->nh.ipv6h->payload_len = htons(payload_len); *skb_in = head; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 878789b3122..5d5bbb49ec7 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1372,7 +1372,7 @@ int ipv6_route_ioctl(unsigned int cmd, void __user *arg) * Drop the packet on the floor */ -int ip6_pkt_discard(struct sk_buff *skb) +static int ip6_pkt_discard(struct sk_buff *skb) { IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES); icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev); @@ -1380,7 +1380,7 @@ int ip6_pkt_discard(struct sk_buff *skb) return 0; } -int ip6_pkt_discard_out(struct sk_buff *skb) +static int ip6_pkt_discard_out(struct sk_buff *skb) { skb->dev = skb->dst->dev; return ip6_pkt_discard(skb); @@ -1850,16 +1850,16 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh, skb = alloc_skb(size, gfp_any()); if (!skb) { - netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, ENOBUFS); + netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, ENOBUFS); return; } if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0) < 0) { kfree_skb(skb); - netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, EINVAL); + netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, EINVAL); return; } - NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_ROUTE; - netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_ROUTE, gfp_any()); + NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_ROUTE; + netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_ROUTE, gfp_any()); } /* @@ -1960,8 +1960,6 @@ static int rt6_proc_info(char *buffer, char **start, off_t offset, int length) return arg.len; } -extern struct rt6_statistics rt6_stats; - static int rt6_stats_seq_show(struct seq_file *seq, void *v) { seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index e553e5b80d6..c3123c9e1a8 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -770,7 +770,7 @@ static int ipip6_tunnel_init(struct net_device *dev) return 0; } -int __init ipip6_fb_tunnel_init(struct net_device *dev) +static int __init ipip6_fb_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = dev->priv; struct iphdr *iph = &tunnel->parms.iph; diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 3a18e0e6ffe..8eff9fa1e98 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -14,9 +14,6 @@ #include <net/ipv6.h> #include <net/addrconf.h> -extern ctl_table ipv6_route_table[]; -extern ctl_table ipv6_icmp_table[]; - #ifdef CONFIG_SYSCTL static ctl_table ipv6_table[] = { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index ef29cfd936d..794734f1d23 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -47,6 +47,7 @@ #include <net/tcp.h> #include <net/ndisc.h> +#include <net/inet6_hashtables.h> #include <net/ipv6.h> #include <net/transp_v6.h> #include <net/addrconf.h> @@ -75,34 +76,11 @@ static int tcp_v6_xmit(struct sk_buff *skb, int ipfragok); static struct tcp_func ipv6_mapped; static struct tcp_func ipv6_specific; -/* I have no idea if this is a good hash for v6 or not. -DaveM */ -static __inline__ int tcp_v6_hashfn(struct in6_addr *laddr, u16 lport, - struct in6_addr *faddr, u16 fport) +static inline int tcp_v6_bind_conflict(const struct sock *sk, + const struct inet_bind_bucket *tb) { - int hashent = (lport ^ fport); - - hashent ^= (laddr->s6_addr32[3] ^ faddr->s6_addr32[3]); - hashent ^= hashent>>16; - hashent ^= hashent>>8; - return (hashent & (tcp_ehash_size - 1)); -} - -static __inline__ int tcp_v6_sk_hashfn(struct sock *sk) -{ - struct inet_sock *inet = inet_sk(sk); - struct ipv6_pinfo *np = inet6_sk(sk); - struct in6_addr *laddr = &np->rcv_saddr; - struct in6_addr *faddr = &np->daddr; - __u16 lport = inet->num; - __u16 fport = inet->dport; - return tcp_v6_hashfn(laddr, lport, faddr, fport); -} - -static inline int tcp_v6_bind_conflict(struct sock *sk, - struct tcp_bind_bucket *tb) -{ - struct sock *sk2; - struct hlist_node *node; + const struct sock *sk2; + const struct hlist_node *node; /* We must walk the whole port owner list in this case. -DaveM */ sk_for_each_bound(sk2, node, &tb->owners) { @@ -126,8 +104,8 @@ static inline int tcp_v6_bind_conflict(struct sock *sk, */ static int tcp_v6_get_port(struct sock *sk, unsigned short snum) { - struct tcp_bind_hashbucket *head; - struct tcp_bind_bucket *tb; + struct inet_bind_hashbucket *head; + struct inet_bind_bucket *tb; struct hlist_node *node; int ret; @@ -138,25 +116,25 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum) int remaining = (high - low) + 1; int rover; - spin_lock(&tcp_portalloc_lock); - if (tcp_port_rover < low) + spin_lock(&tcp_hashinfo.portalloc_lock); + if (tcp_hashinfo.port_rover < low) rover = low; else - rover = tcp_port_rover; + rover = tcp_hashinfo.port_rover; do { rover++; if (rover > high) rover = low; - head = &tcp_bhash[tcp_bhashfn(rover)]; + head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)]; spin_lock(&head->lock); - tb_for_each(tb, node, &head->chain) + inet_bind_bucket_for_each(tb, node, &head->chain) if (tb->port == rover) goto next; break; next: spin_unlock(&head->lock); } while (--remaining > 0); - tcp_port_rover = rover; - spin_unlock(&tcp_portalloc_lock); + tcp_hashinfo.port_rover = rover; + spin_unlock(&tcp_hashinfo.portalloc_lock); /* Exhausted local port range during search? It is not * possible for us to be holding one of the bind hash @@ -171,9 +149,9 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum) /* OK, here is the one we will use. */ snum = rover; } else { - head = &tcp_bhash[tcp_bhashfn(snum)]; + head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)]; spin_lock(&head->lock); - tb_for_each(tb, node, &head->chain) + inet_bind_bucket_for_each(tb, node, &head->chain) if (tb->port == snum) goto tb_found; } @@ -192,8 +170,11 @@ tb_found: } tb_not_found: ret = 1; - if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL) - goto fail_unlock; + if (tb == NULL) { + tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, snum); + if (tb == NULL) + goto fail_unlock; + } if (hlist_empty(&tb->owners)) { if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) tb->fastreuse = 1; @@ -204,9 +185,9 @@ tb_not_found: tb->fastreuse = 0; success: - if (!tcp_sk(sk)->bind_hash) - tcp_bind_hash(sk, tb, snum); - BUG_TRAP(tcp_sk(sk)->bind_hash == tb); + if (!inet_csk(sk)->icsk_bind_hash) + inet_bind_hash(sk, tb, snum); + BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb); ret = 0; fail_unlock: @@ -224,13 +205,13 @@ static __inline__ void __tcp_v6_hash(struct sock *sk) BUG_TRAP(sk_unhashed(sk)); if (sk->sk_state == TCP_LISTEN) { - list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)]; - lock = &tcp_lhash_lock; - tcp_listen_wlock(); + list = &tcp_hashinfo.listening_hash[inet_sk_listen_hashfn(sk)]; + lock = &tcp_hashinfo.lhash_lock; + inet_listen_wlock(&tcp_hashinfo); } else { - sk->sk_hashent = tcp_v6_sk_hashfn(sk); - list = &tcp_ehash[sk->sk_hashent].chain; - lock = &tcp_ehash[sk->sk_hashent].lock; + sk->sk_hashent = inet6_sk_ehashfn(sk, tcp_hashinfo.ehash_size); + list = &tcp_hashinfo.ehash[sk->sk_hashent].chain; + lock = &tcp_hashinfo.ehash[sk->sk_hashent].lock; write_lock(lock); } @@ -255,131 +236,11 @@ static void tcp_v6_hash(struct sock *sk) } } -static struct sock *tcp_v6_lookup_listener(struct in6_addr *daddr, unsigned short hnum, int dif) -{ - struct sock *sk; - struct hlist_node *node; - struct sock *result = NULL; - int score, hiscore; - - hiscore=0; - read_lock(&tcp_lhash_lock); - sk_for_each(sk, node, &tcp_listening_hash[tcp_lhashfn(hnum)]) { - if (inet_sk(sk)->num == hnum && sk->sk_family == PF_INET6) { - struct ipv6_pinfo *np = inet6_sk(sk); - - score = 1; - if (!ipv6_addr_any(&np->rcv_saddr)) { - if (!ipv6_addr_equal(&np->rcv_saddr, daddr)) - continue; - score++; - } - if (sk->sk_bound_dev_if) { - if (sk->sk_bound_dev_if != dif) - continue; - score++; - } - if (score == 3) { - result = sk; - break; - } - if (score > hiscore) { - hiscore = score; - result = sk; - } - } - } - if (result) - sock_hold(result); - read_unlock(&tcp_lhash_lock); - return result; -} - -/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so - * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM - * - * The sockhash lock must be held as a reader here. - */ - -static inline struct sock *__tcp_v6_lookup_established(struct in6_addr *saddr, u16 sport, - struct in6_addr *daddr, u16 hnum, - int dif) -{ - struct tcp_ehash_bucket *head; - struct sock *sk; - struct hlist_node *node; - __u32 ports = TCP_COMBINED_PORTS(sport, hnum); - int hash; - - /* Optimize here for direct hit, only listening connections can - * have wildcards anyways. - */ - hash = tcp_v6_hashfn(daddr, hnum, saddr, sport); - head = &tcp_ehash[hash]; - read_lock(&head->lock); - sk_for_each(sk, node, &head->chain) { - /* For IPV6 do the cheaper port and family tests first. */ - if(TCP_IPV6_MATCH(sk, saddr, daddr, ports, dif)) - goto hit; /* You sunk my battleship! */ - } - /* Must check for a TIME_WAIT'er before going to listener hash. */ - sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) { - /* FIXME: acme: check this... */ - struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk; - - if(*((__u32 *)&(tw->tw_dport)) == ports && - sk->sk_family == PF_INET6) { - if(ipv6_addr_equal(&tw->tw_v6_daddr, saddr) && - ipv6_addr_equal(&tw->tw_v6_rcv_saddr, daddr) && - (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dif)) - goto hit; - } - } - read_unlock(&head->lock); - return NULL; - -hit: - sock_hold(sk); - read_unlock(&head->lock); - return sk; -} - - -static inline struct sock *__tcp_v6_lookup(struct in6_addr *saddr, u16 sport, - struct in6_addr *daddr, u16 hnum, - int dif) -{ - struct sock *sk; - - sk = __tcp_v6_lookup_established(saddr, sport, daddr, hnum, dif); - - if (sk) - return sk; - - return tcp_v6_lookup_listener(daddr, hnum, dif); -} - -inline struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport, - struct in6_addr *daddr, u16 dport, - int dif) -{ - struct sock *sk; - - local_bh_disable(); - sk = __tcp_v6_lookup(saddr, sport, daddr, ntohs(dport), dif); - local_bh_enable(); - - return sk; -} - -EXPORT_SYMBOL_GPL(tcp_v6_lookup); - - /* * Open request hash tables. */ -static u32 tcp_v6_synq_hash(struct in6_addr *raddr, u16 rport, u32 rnd) +static u32 tcp_v6_synq_hash(const struct in6_addr *raddr, const u16 rport, const u32 rnd) { u32 a, b, c; @@ -399,14 +260,15 @@ static u32 tcp_v6_synq_hash(struct in6_addr *raddr, u16 rport, u32 rnd) return c & (TCP_SYNQ_HSIZE - 1); } -static struct request_sock *tcp_v6_search_req(struct tcp_sock *tp, +static struct request_sock *tcp_v6_search_req(const struct sock *sk, struct request_sock ***prevp, __u16 rport, struct in6_addr *raddr, struct in6_addr *laddr, int iif) { - struct listen_sock *lopt = tp->accept_queue.listen_opt; + const struct inet_connection_sock *icsk = inet_csk(sk); + struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; struct request_sock *req, **prev; for (prev = &lopt->syn_table[tcp_v6_synq_hash(raddr, rport, lopt->hash_rnd)]; @@ -451,44 +313,48 @@ static __u32 tcp_v6_init_sequence(struct sock *sk, struct sk_buff *skb) } } -static int __tcp_v6_check_established(struct sock *sk, __u16 lport, - struct tcp_tw_bucket **twp) +static int __tcp_v6_check_established(struct sock *sk, const __u16 lport, + struct inet_timewait_sock **twp) { struct inet_sock *inet = inet_sk(sk); - struct ipv6_pinfo *np = inet6_sk(sk); - struct in6_addr *daddr = &np->rcv_saddr; - struct in6_addr *saddr = &np->daddr; - int dif = sk->sk_bound_dev_if; - u32 ports = TCP_COMBINED_PORTS(inet->dport, lport); - int hash = tcp_v6_hashfn(daddr, inet->num, saddr, inet->dport); - struct tcp_ehash_bucket *head = &tcp_ehash[hash]; + const struct ipv6_pinfo *np = inet6_sk(sk); + const struct in6_addr *daddr = &np->rcv_saddr; + const struct in6_addr *saddr = &np->daddr; + const int dif = sk->sk_bound_dev_if; + const u32 ports = INET_COMBINED_PORTS(inet->dport, lport); + const int hash = inet6_ehashfn(daddr, inet->num, saddr, inet->dport, + tcp_hashinfo.ehash_size); + struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash]; struct sock *sk2; - struct hlist_node *node; - struct tcp_tw_bucket *tw; + const struct hlist_node *node; + struct inet_timewait_sock *tw; write_lock(&head->lock); /* Check TIME-WAIT sockets first. */ - sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) { - tw = (struct tcp_tw_bucket*)sk2; + sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) { + const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk2); + + tw = inet_twsk(sk2); if(*((__u32 *)&(tw->tw_dport)) == ports && sk2->sk_family == PF_INET6 && - ipv6_addr_equal(&tw->tw_v6_daddr, saddr) && - ipv6_addr_equal(&tw->tw_v6_rcv_saddr, daddr) && + ipv6_addr_equal(&tcp6tw->tw_v6_daddr, saddr) && + ipv6_addr_equal(&tcp6tw->tw_v6_rcv_saddr, daddr) && sk2->sk_bound_dev_if == sk->sk_bound_dev_if) { + const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2); struct tcp_sock *tp = tcp_sk(sk); - if (tw->tw_ts_recent_stamp && - (!twp || (sysctl_tcp_tw_reuse && - xtime.tv_sec - - tw->tw_ts_recent_stamp > 1))) { + if (tcptw->tw_ts_recent_stamp && + (!twp || + (sysctl_tcp_tw_reuse && + xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) { /* See comment in tcp_ipv4.c */ - tp->write_seq = tw->tw_snd_nxt + 65535 + 2; + tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; if (!tp->write_seq) tp->write_seq = 1; - tp->rx_opt.ts_recent = tw->tw_ts_recent; - tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp; + tp->rx_opt.ts_recent = tcptw->tw_ts_recent; + tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; sock_hold(sk2); goto unique; } else @@ -499,7 +365,7 @@ static int __tcp_v6_check_established(struct sock *sk, __u16 lport, /* And established part... */ sk_for_each(sk2, node, &head->chain) { - if(TCP_IPV6_MATCH(sk2, saddr, daddr, ports, dif)) + if (INET6_MATCH(sk2, saddr, daddr, ports, dif)) goto not_unique; } @@ -515,10 +381,10 @@ unique: NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); } else if (tw) { /* Silly. Should hash-dance instead... */ - tcp_tw_deschedule(tw); + inet_twsk_deschedule(tw, &tcp_death_row); NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - tcp_tw_put(tw); + inet_twsk_put(tw); } return 0; @@ -540,8 +406,8 @@ static inline u32 tcpv6_port_offset(const struct sock *sk) static int tcp_v6_hash_connect(struct sock *sk) { unsigned short snum = inet_sk(sk)->num; - struct tcp_bind_hashbucket *head; - struct tcp_bind_bucket *tb; + struct inet_bind_hashbucket *head; + struct inet_bind_bucket *tb; int ret; if (!snum) { @@ -553,19 +419,19 @@ static int tcp_v6_hash_connect(struct sock *sk) static u32 hint; u32 offset = hint + tcpv6_port_offset(sk); struct hlist_node *node; - struct tcp_tw_bucket *tw = NULL; + struct inet_timewait_sock *tw = NULL; local_bh_disable(); for (i = 1; i <= range; i++) { port = low + (i + offset) % range; - head = &tcp_bhash[tcp_bhashfn(port)]; + head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)]; spin_lock(&head->lock); /* Does not bother with rcv_saddr checks, * because the established check is already * unique enough. */ - tb_for_each(tb, node, &head->chain) { + inet_bind_bucket_for_each(tb, node, &head->chain) { if (tb->port == port) { BUG_TRAP(!hlist_empty(&tb->owners)); if (tb->fastreuse >= 0) @@ -578,7 +444,7 @@ static int tcp_v6_hash_connect(struct sock *sk) } } - tb = tcp_bucket_create(head, port); + tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port); if (!tb) { spin_unlock(&head->lock); break; @@ -597,7 +463,7 @@ ok: hint += i; /* Head lock still held and bh's disabled */ - tcp_bind_hash(sk, tb, port); + inet_bind_hash(sk, tb, port); if (sk_unhashed(sk)) { inet_sk(sk)->sport = htons(port); __tcp_v6_hash(sk); @@ -605,16 +471,16 @@ ok: spin_unlock(&head->lock); if (tw) { - tcp_tw_deschedule(tw); - tcp_tw_put(tw); + inet_twsk_deschedule(tw, &tcp_death_row); + inet_twsk_put(tw); } ret = 0; goto out; } - head = &tcp_bhash[tcp_bhashfn(snum)]; - tb = tcp_sk(sk)->bind_hash; + head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)]; + tb = inet_csk(sk)->icsk_bind_hash; spin_lock_bh(&head->lock); if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { @@ -631,11 +497,6 @@ out: } } -static __inline__ int tcp_v6_iif(struct sk_buff *skb) -{ - return IP6CB(skb)->iif; -} - static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { @@ -827,14 +688,15 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, int type, int code, int offset, __u32 info) { struct ipv6hdr *hdr = (struct ipv6hdr*)skb->data; - struct tcphdr *th = (struct tcphdr *)(skb->data+offset); + const struct tcphdr *th = (struct tcphdr *)(skb->data+offset); struct ipv6_pinfo *np; struct sock *sk; int err; struct tcp_sock *tp; __u32 seq; - sk = tcp_v6_lookup(&hdr->daddr, th->dest, &hdr->saddr, th->source, skb->dev->ifindex); + sk = inet6_lookup(&tcp_hashinfo, &hdr->daddr, th->dest, &hdr->saddr, + th->source, skb->dev->ifindex); if (sk == NULL) { ICMP6_INC_STATS_BH(__in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); @@ -842,7 +704,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } if (sk->sk_state == TCP_TIME_WAIT) { - tcp_tw_put((struct tcp_tw_bucket*)sk); + inet_twsk_put((struct inet_timewait_sock *)sk); return; } @@ -920,8 +782,8 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (sock_owned_by_user(sk)) goto out; - req = tcp_v6_search_req(tp, &prev, th->dest, &hdr->daddr, - &hdr->saddr, tcp_v6_iif(skb)); + req = tcp_v6_search_req(sk, &prev, th->dest, &hdr->daddr, + &hdr->saddr, inet6_iif(skb)); if (!req) goto out; @@ -935,7 +797,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, goto out; } - tcp_synq_drop(sk, req, prev); + inet_csk_reqsk_queue_drop(sk, req, prev); goto out; case TCP_SYN_SENT: @@ -1132,7 +994,7 @@ static void tcp_v6_send_reset(struct sk_buff *skb) buff->csum); fl.proto = IPPROTO_TCP; - fl.oif = tcp_v6_iif(skb); + fl.oif = inet6_iif(skb); fl.fl_ip_dport = t1->dest; fl.fl_ip_sport = t1->source; @@ -1201,7 +1063,7 @@ static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 buff->csum); fl.proto = IPPROTO_TCP; - fl.oif = tcp_v6_iif(skb); + fl.oif = inet6_iif(skb); fl.fl_ip_dport = t1->dest; fl.fl_ip_sport = t1->source; @@ -1220,12 +1082,14 @@ static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) { - struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk; + struct inet_timewait_sock *tw = inet_twsk(sk); + const struct tcp_timewait_sock *tcptw = tcp_twsk(sk); - tcp_v6_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt, - tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent); + tcp_v6_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, + tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, + tcptw->tw_ts_recent); - tcp_tw_put(tw); + inet_twsk_put(tw); } static void tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req) @@ -1237,28 +1101,25 @@ static void tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req) static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) { struct request_sock *req, **prev; - struct tcphdr *th = skb->h.th; - struct tcp_sock *tp = tcp_sk(sk); + const struct tcphdr *th = skb->h.th; struct sock *nsk; /* Find possible connection requests. */ - req = tcp_v6_search_req(tp, &prev, th->source, &skb->nh.ipv6h->saddr, - &skb->nh.ipv6h->daddr, tcp_v6_iif(skb)); + req = tcp_v6_search_req(sk, &prev, th->source, &skb->nh.ipv6h->saddr, + &skb->nh.ipv6h->daddr, inet6_iif(skb)); if (req) return tcp_check_req(sk, skb, req, prev); - nsk = __tcp_v6_lookup_established(&skb->nh.ipv6h->saddr, - th->source, - &skb->nh.ipv6h->daddr, - ntohs(th->dest), - tcp_v6_iif(skb)); + nsk = __inet6_lookup_established(&tcp_hashinfo, &skb->nh.ipv6h->saddr, + th->source, &skb->nh.ipv6h->daddr, + ntohs(th->dest), inet6_iif(skb)); if (nsk) { if (nsk->sk_state != TCP_TIME_WAIT) { bh_lock_sock(nsk); return nsk; } - tcp_tw_put((struct tcp_tw_bucket*)nsk); + inet_twsk_put((struct inet_timewait_sock *)nsk); return NULL; } @@ -1271,12 +1132,12 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) static void tcp_v6_synq_add(struct sock *sk, struct request_sock *req) { - struct tcp_sock *tp = tcp_sk(sk); - struct listen_sock *lopt = tp->accept_queue.listen_opt; - u32 h = tcp_v6_synq_hash(&tcp6_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd); + struct inet_connection_sock *icsk = inet_csk(sk); + struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; + const u32 h = tcp_v6_synq_hash(&tcp6_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd); - reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT); - tcp_synq_added(sk); + reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, TCP_TIMEOUT_INIT); + inet_csk_reqsk_queue_added(sk, TCP_TIMEOUT_INIT); } @@ -1301,13 +1162,13 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) /* * There are no SYN attacks on IPv6, yet... */ - if (tcp_synq_is_full(sk) && !isn) { + if (inet_csk_reqsk_queue_is_full(sk) && !isn) { if (net_ratelimit()) printk(KERN_INFO "TCPv6: dropping request, synflood is possible\n"); goto drop; } - if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1) + if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) goto drop; req = reqsk_alloc(&tcp6_request_sock_ops); @@ -1339,7 +1200,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) /* So that link locals have meaning */ if (!sk->sk_bound_dev_if && ipv6_addr_type(&treq->rmt_addr) & IPV6_ADDR_LINKLOCAL) - treq->iif = tcp_v6_iif(skb); + treq->iif = inet6_iif(skb); if (isn == 0) isn = tcp_v6_init_sequence(sk,skb); @@ -1404,15 +1265,14 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newsk->sk_backlog_rcv = tcp_v4_do_rcv; newnp->pktoptions = NULL; newnp->opt = NULL; - newnp->mcast_oif = tcp_v6_iif(skb); + newnp->mcast_oif = inet6_iif(skb); newnp->mcast_hops = skb->nh.ipv6h->hop_limit; - /* Charge newly allocated IPv6 socket. Though it is mapped, - * it is IPv6 yet. + /* + * No need to charge this sock to the relevant IPv6 refcnt debug socks count + * here, tcp_create_openreq_child now does this for us, see the comment in + * that function for the gory details. -acme */ -#ifdef INET_REFCNT_DEBUG - atomic_inc(&inet6_sock_nr); -#endif /* It is tricky place. Until this moment IPv4 tcp worked with IPv6 af_tcp.af_specific. @@ -1467,10 +1327,11 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, if (newsk == NULL) goto out; - /* Charge newly allocated IPv6 socket */ -#ifdef INET_REFCNT_DEBUG - atomic_inc(&inet6_sock_nr); -#endif + /* + * No need to charge this sock to the relevant IPv6 refcnt debug socks + * count here, tcp_create_openreq_child now does this for us, see the + * comment in that function for the gory details. -acme + */ ip6_dst_store(newsk, dst, NULL); newsk->sk_route_caps = dst->dev->features & @@ -1509,7 +1370,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, skb_set_owner_r(newnp->pktoptions, newsk); } newnp->opt = NULL; - newnp->mcast_oif = tcp_v6_iif(skb); + newnp->mcast_oif = inet6_iif(skb); newnp->mcast_hops = skb->nh.ipv6h->hop_limit; /* Clone native IPv6 options from listening socket (if any) @@ -1536,7 +1397,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6; __tcp_v6_hash(newsk); - tcp_inherit_port(sk, newsk); + inet_inherit_port(&tcp_hashinfo, sk, newsk); return newsk; @@ -1557,7 +1418,7 @@ static int tcp_v6_checksum_init(struct sk_buff *skb) if (!tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, &skb->nh.ipv6h->daddr,skb->csum)) return 0; - LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v6 csum failed\n")); + LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v6 csum failed\n"); } if (skb->len <= 76) { if (tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, @@ -1684,7 +1545,7 @@ ipv6_pktoptions: if (TCP_SKB_CB(opt_skb)->end_seq == tp->rcv_nxt && !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { if (np->rxopt.bits.rxinfo) - np->mcast_oif = tcp_v6_iif(opt_skb); + np->mcast_oif = inet6_iif(opt_skb); if (np->rxopt.bits.rxhlim) np->mcast_hops = opt_skb->nh.ipv6h->hop_limit; if (ipv6_opt_accepted(sk, opt_skb)) { @@ -1739,8 +1600,9 @@ static int tcp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) TCP_SKB_CB(skb)->flags = ipv6_get_dsfield(skb->nh.ipv6h); TCP_SKB_CB(skb)->sacked = 0; - sk = __tcp_v6_lookup(&skb->nh.ipv6h->saddr, th->source, - &skb->nh.ipv6h->daddr, ntohs(th->dest), tcp_v6_iif(skb)); + sk = __inet6_lookup(&tcp_hashinfo, &skb->nh.ipv6h->saddr, th->source, + &skb->nh.ipv6h->daddr, ntohs(th->dest), + inet6_iif(skb)); if (!sk) goto no_tcp_socket; @@ -1795,26 +1657,29 @@ discard_and_relse: do_time_wait: if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { - tcp_tw_put((struct tcp_tw_bucket *) sk); + inet_twsk_put((struct inet_timewait_sock *)sk); goto discard_it; } if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) { TCP_INC_STATS_BH(TCP_MIB_INERRS); - tcp_tw_put((struct tcp_tw_bucket *) sk); + inet_twsk_put((struct inet_timewait_sock *)sk); goto discard_it; } - switch(tcp_timewait_state_process((struct tcp_tw_bucket *)sk, - skb, th, skb->len)) { + switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk, + skb, th)) { case TCP_TW_SYN: { struct sock *sk2; - sk2 = tcp_v6_lookup_listener(&skb->nh.ipv6h->daddr, ntohs(th->dest), tcp_v6_iif(skb)); + sk2 = inet6_lookup_listener(&tcp_hashinfo, + &skb->nh.ipv6h->daddr, + ntohs(th->dest), inet6_iif(skb)); if (sk2 != NULL) { - tcp_tw_deschedule((struct tcp_tw_bucket *)sk); - tcp_tw_put((struct tcp_tw_bucket *)sk); + struct inet_timewait_sock *tw = inet_twsk(sk); + inet_twsk_deschedule(tw, &tcp_death_row); + inet_twsk_put(tw); sk = sk2; goto process; } @@ -1983,7 +1848,7 @@ static struct tcp_func ipv6_specific = { static struct tcp_func ipv6_mapped = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, - .rebuild_header = tcp_v4_rebuild_header, + .rebuild_header = inet_sk_rebuild_header, .conn_request = tcp_v6_conn_request, .syn_recv_sock = tcp_v6_syn_recv_sock, .remember_stamp = tcp_v4_remember_stamp, @@ -2002,13 +1867,14 @@ static struct tcp_func ipv6_mapped = { */ static int tcp_v6_init_sock(struct sock *sk) { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); skb_queue_head_init(&tp->out_of_order_queue); tcp_init_xmit_timers(sk); tcp_prequeue_init(tp); - tp->rto = TCP_TIMEOUT_INIT; + icsk->icsk_rto = TCP_TIMEOUT_INIT; tp->mdev = TCP_TIMEOUT_INIT; /* So many TCP implementations out there (incorrectly) count the @@ -2030,7 +1896,7 @@ static int tcp_v6_init_sock(struct sock *sk) sk->sk_state = TCP_CLOSE; tp->af_specific = &ipv6_specific; - tp->ca_ops = &tcp_init_congestion_ops; + icsk->icsk_ca_ops = &tcp_init_congestion_ops; sk->sk_write_space = sk_stream_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); @@ -2044,8 +1910,6 @@ static int tcp_v6_init_sock(struct sock *sk) static int tcp_v6_destroy_sock(struct sock *sk) { - extern int tcp_v4_destroy_sock(struct sock *sk); - tcp_v4_destroy_sock(sk); return inet6_destroy_sock(sk); } @@ -2091,18 +1955,20 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) unsigned long timer_expires; struct inet_sock *inet = inet_sk(sp); struct tcp_sock *tp = tcp_sk(sp); + const struct inet_connection_sock *icsk = inet_csk(sp); struct ipv6_pinfo *np = inet6_sk(sp); dest = &np->daddr; src = &np->rcv_saddr; destp = ntohs(inet->dport); srcp = ntohs(inet->sport); - if (tp->pending == TCP_TIME_RETRANS) { + + if (icsk->icsk_pending == ICSK_TIME_RETRANS) { timer_active = 1; - timer_expires = tp->timeout; - } else if (tp->pending == TCP_TIME_PROBE0) { + timer_expires = icsk->icsk_timeout; + } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { timer_active = 4; - timer_expires = tp->timeout; + timer_expires = icsk->icsk_timeout; } else if (timer_pending(&sp->sk_timer)) { timer_active = 2; timer_expires = sp->sk_timer.expires; @@ -2123,28 +1989,31 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) tp->write_seq-tp->snd_una, tp->rcv_nxt-tp->copied_seq, timer_active, jiffies_to_clock_t(timer_expires - jiffies), - tp->retransmits, + icsk->icsk_retransmits, sock_i_uid(sp), - tp->probes_out, + icsk->icsk_probes_out, sock_i_ino(sp), atomic_read(&sp->sk_refcnt), sp, - tp->rto, tp->ack.ato, (tp->ack.quick<<1)|tp->ack.pingpong, + icsk->icsk_rto, + icsk->icsk_ack.ato, + (icsk->icsk_ack.quick << 1 ) | icsk->icsk_ack.pingpong, tp->snd_cwnd, tp->snd_ssthresh>=0xFFFF?-1:tp->snd_ssthresh ); } static void get_timewait6_sock(struct seq_file *seq, - struct tcp_tw_bucket *tw, int i) + struct inet_timewait_sock *tw, int i) { struct in6_addr *dest, *src; __u16 destp, srcp; + struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw); int ttd = tw->tw_ttd - jiffies; if (ttd < 0) ttd = 0; - dest = &tw->tw_v6_daddr; - src = &tw->tw_v6_rcv_saddr; + dest = &tcp6tw->tw_v6_daddr; + src = &tcp6tw->tw_v6_rcv_saddr; destp = ntohs(tw->tw_dport); srcp = ntohs(tw->tw_sport); @@ -2219,7 +2088,7 @@ struct proto tcpv6_prot = { .close = tcp_close, .connect = tcp_v6_connect, .disconnect = tcp_disconnect, - .accept = tcp_accept, + .accept = inet_csk_accept, .ioctl = tcp_ioctl, .init = tcp_v6_init_sock, .destroy = tcp_v6_destroy_sock, @@ -2236,11 +2105,13 @@ struct proto tcpv6_prot = { .sockets_allocated = &tcp_sockets_allocated, .memory_allocated = &tcp_memory_allocated, .memory_pressure = &tcp_memory_pressure, + .orphan_count = &tcp_orphan_count, .sysctl_mem = sysctl_tcp_mem, .sysctl_wmem = sysctl_tcp_wmem, .sysctl_rmem = sysctl_tcp_rmem, .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp6_sock), + .twsk_obj_size = sizeof(struct tcp6_timewait_sock), .rsk_prot = &tcp6_request_sock_ops, }; @@ -2250,8 +2121,6 @@ static struct inet6_protocol tcpv6_protocol = { .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, }; -extern struct proto_ops inet6_stream_ops; - static struct inet_protosw tcpv6_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_TCP, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index eff050ac704..390d750449c 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -51,6 +51,7 @@ #include <net/udp.h> #include <net/raw.h> #include <net/inet_common.h> +#include <net/tcp_states.h> #include <net/ip6_checksum.h> #include <net/xfrm.h> @@ -58,7 +59,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> -DEFINE_SNMP_STAT(struct udp_mib, udp_stats_in6); +DEFINE_SNMP_STAT(struct udp_mib, udp_stats_in6) __read_mostly; /* Grrr, addr_type already calculated by caller, but I don't want * to add some silly "cookie" argument to this method just for that. @@ -477,8 +478,7 @@ static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) /* RFC 2460 section 8.1 says that we SHOULD log this error. Well, it is reasonable. */ - LIMIT_NETDEBUG( - printk(KERN_INFO "IPv6: udp checksum is 0\n")); + LIMIT_NETDEBUG(KERN_INFO "IPv6: udp checksum is 0\n"); goto discard; } @@ -493,7 +493,7 @@ static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) if (skb->ip_summed==CHECKSUM_HW) { skb->ip_summed = CHECKSUM_UNNECESSARY; if (csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum)) { - LIMIT_NETDEBUG(printk(KERN_DEBUG "udp v6 hw csum failure.\n")); + LIMIT_NETDEBUG(KERN_DEBUG "udp v6 hw csum failure.\n"); skb->ip_summed = CHECKSUM_NONE; } } @@ -825,7 +825,7 @@ back_from_confirm: /* ... which is an evident application bug. --ANK */ release_sock(sk); - LIMIT_NETDEBUG(printk(KERN_DEBUG "udp cork app bug 2\n")); + LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 2\n"); err = -EINVAL; goto out; } @@ -1054,8 +1054,6 @@ struct proto udpv6_prot = { .obj_size = sizeof(struct udp6_sock), }; -extern struct proto_ops inet6_dgram_ops; - static struct inet_protosw udpv6_protosw = { .type = SOCK_DGRAM, .protocol = IPPROTO_UDP, diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index 60c26c87277..fbef7826a74 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -79,7 +79,7 @@ static u32 xfrm6_tunnel_spi; #define XFRM6_TUNNEL_SPI_MIN 1 #define XFRM6_TUNNEL_SPI_MAX 0xffffffff -static kmem_cache_t *xfrm6_tunnel_spi_kmem; +static kmem_cache_t *xfrm6_tunnel_spi_kmem __read_mostly; #define XFRM6_TUNNEL_SPI_BYADDR_HSIZE 256 #define XFRM6_TUNNEL_SPI_BYSPI_HSIZE 256 diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index 5a27e5df588..34b3bb86840 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -44,7 +44,6 @@ #include <linux/socket.h> #include <linux/sockios.h> #include <linux/string.h> -#include <linux/tcp.h> #include <linux/types.h> #include <linux/termios.h> @@ -52,6 +51,7 @@ #include <net/p8022.h> #include <net/psnap.h> #include <net/sock.h> +#include <net/tcp_states.h> #include <asm/uaccess.h> @@ -1627,7 +1627,7 @@ out: return rc; } -static int ipx_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +static int ipx_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { /* NULL here for pt means the packet was looped back */ struct ipx_interface *intrfc; @@ -1796,8 +1796,8 @@ static int ipx_recvmsg(struct kiocb *iocb, struct socket *sock, copied); if (rc) goto out_free; - if (skb->stamp.tv_sec) - sk->sk_stamp = skb->stamp; + if (skb->tstamp.off_sec) + skb_get_timestamp(skb, &sk->sk_stamp); msg->msg_namelen = sizeof(*sipx); @@ -1940,9 +1940,7 @@ static struct notifier_block ipx_dev_notifier = { }; extern struct datalink_proto *make_EII_client(void); -extern struct datalink_proto *make_8023_client(void); extern void destroy_EII_client(struct datalink_proto *); -extern void destroy_8023_client(struct datalink_proto *); static unsigned char ipx_8022_type = 0xE0; static unsigned char ipx_snap_id[5] = { 0x0, 0x0, 0x0, 0x81, 0x37 }; diff --git a/net/ipx/ipx_proc.c b/net/ipx/ipx_proc.c index b6761913445..1f73d9ea434 100644 --- a/net/ipx/ipx_proc.c +++ b/net/ipx/ipx_proc.c @@ -10,7 +10,7 @@ #include <linux/proc_fs.h> #include <linux/spinlock.h> #include <linux/seq_file.h> -#include <linux/tcp.h> +#include <net/tcp_states.h> #include <net/ipx.h> static __inline__ struct ipx_interface *ipx_get_interface_idx(loff_t pos) diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 92c6e8d4e73..6f92f9c6299 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -56,7 +56,7 @@ #include <asm/uaccess.h> #include <net/sock.h> -#include <net/tcp.h> +#include <net/tcp_states.h> #include <net/irda/af_irda.h> diff --git a/net/irda/irlap_frame.c b/net/irda/irlap_frame.c index 6dafbb43b52..3e9a06abbdd 100644 --- a/net/irda/irlap_frame.c +++ b/net/irda/irlap_frame.c @@ -988,9 +988,6 @@ void irlap_resend_rejected_frames(struct irlap_cb *self, int command) IRDA_DEBUG(0, "%s(), unable to copy\n", __FUNCTION__); return; } - /* Unlink tx_skb from list */ - tx_skb->next = tx_skb->prev = NULL; - tx_skb->list = NULL; /* Clear old Nr field + poll bit */ tx_skb->data[1] &= 0x0f; @@ -1063,9 +1060,6 @@ void irlap_resend_rejected_frame(struct irlap_cb *self, int command) IRDA_DEBUG(0, "%s(), unable to copy\n", __FUNCTION__); return; } - /* Unlink tx_skb from list */ - tx_skb->next = tx_skb->prev = NULL; - tx_skb->list = NULL; /* Clear old Nr field + poll bit */ tx_skb->data[1] &= 0x0f; @@ -1309,7 +1303,7 @@ static void irlap_recv_test_frame(struct irlap_cb *self, struct sk_buff *skb, * Jean II */ int irlap_driver_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *ptype) + struct packet_type *ptype, struct net_device *orig_dev) { struct irlap_info info; struct irlap_cb *self; diff --git a/net/irda/irlmp.c b/net/irda/irlmp.c index 7a4a4d7fbe6..c19e9ce05a3 100644 --- a/net/irda/irlmp.c +++ b/net/irda/irlmp.c @@ -53,7 +53,6 @@ struct irlmp_cb *irlmp = NULL; /* These can be altered by the sysctl interface */ int sysctl_discovery = 0; int sysctl_discovery_timeout = 3; /* 3 seconds by default */ -EXPORT_SYMBOL(sysctl_discovery_timeout); int sysctl_discovery_slots = 6; /* 6 slots by default */ int sysctl_lap_keepalive_time = LM_IDLE_TIMEOUT * 1000 / HZ; char sysctl_devname[65]; @@ -67,7 +66,6 @@ const char *irlmp_reasons[] = { "LM_INIT_DISCONNECT", "ERROR, NOT USED", }; -EXPORT_SYMBOL(irlmp_reasons); /* * Function irlmp_init (void) @@ -675,7 +673,6 @@ struct lsap_cb *irlmp_dup(struct lsap_cb *orig, void *instance) return new; } -EXPORT_SYMBOL(irlmp_dup); /* * Function irlmp_disconnect_request (handle, userdata) diff --git a/net/irda/irmod.c b/net/irda/irmod.c index 6ffaed4544e..634901dd156 100644 --- a/net/irda/irmod.c +++ b/net/irda/irmod.c @@ -54,7 +54,7 @@ extern int irsock_init(void); extern void irsock_cleanup(void); /* irlap_frame.c */ extern int irlap_driver_rcv(struct sk_buff *, struct net_device *, - struct packet_type *); + struct packet_type *, struct net_device *); /* * Module parameters diff --git a/net/irda/irnet/irnet.h b/net/irda/irnet/irnet.h index 9004f7349a7..b391cb3893d 100644 --- a/net/irda/irnet/irnet.h +++ b/net/irda/irnet/irnet.h @@ -517,9 +517,6 @@ extern int irda_irnet_init(void); /* Initialise IrDA part of IrNET */ extern void irda_irnet_cleanup(void); /* Teardown IrDA part of IrNET */ -/* ---------------------------- MODULE ---------------------------- */ -extern int - irnet_init(void); /* Initialise IrNET module */ /**************************** VARIABLES ****************************/ diff --git a/net/irda/irnet/irnet_ppp.c b/net/irda/irnet/irnet_ppp.c index f8f984bb992..e53bf9e0053 100644 --- a/net/irda/irnet/irnet_ppp.c +++ b/net/irda/irnet/irnet_ppp.c @@ -1107,7 +1107,7 @@ ppp_irnet_cleanup(void) /* * Module main entry point */ -int __init +static int __init irnet_init(void) { int err; diff --git a/net/irda/irqueue.c b/net/irda/irqueue.c index b0dd3ea3599..1ba8c710663 100644 --- a/net/irda/irqueue.c +++ b/net/irda/irqueue.c @@ -822,7 +822,6 @@ void* hashbin_find_next( hashbin_t* hashbin, long hashv, const char* name, return entry; } -EXPORT_SYMBOL(hashbin_find_next); /* * Function hashbin_get_first (hashbin) diff --git a/net/lapb/lapb_subr.c b/net/lapb/lapb_subr.c index 5de05a0bc0f..8b5eefd70f0 100644 --- a/net/lapb/lapb_subr.c +++ b/net/lapb/lapb_subr.c @@ -78,7 +78,7 @@ void lapb_requeue_frames(struct lapb_cb *lapb) if (!skb_prev) skb_queue_head(&lapb->write_queue, skb); else - skb_append(skb_prev, skb); + skb_append(skb_prev, skb, &lapb->write_queue); skb_prev = skb; } } diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 20b4cfebd74..66f55e514b5 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -23,13 +23,13 @@ #include <linux/config.h> #include <linux/kernel.h> #include <linux/module.h> -#include <linux/tcp.h> #include <linux/rtnetlink.h> #include <linux/init.h> #include <net/llc.h> #include <net/llc_sap.h> #include <net/llc_pdu.h> #include <net/llc_conn.h> +#include <net/tcp_states.h> /* remember: uninitialized global data is zeroed because its in .bss */ static u16 llc_ui_sap_last_autoport = LLC_SAP_DYN_START; @@ -714,7 +714,7 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock, if (uaddr) memcpy(uaddr, llc_ui_skb_cb(skb), sizeof(*uaddr)); msg->msg_namelen = sizeof(*uaddr); - if (!skb->list) { + if (!skb->next) { dgram_free: kfree_skb(skb); } diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index eba812a9c69..4c644bc70ea 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -16,7 +16,7 @@ #include <net/llc_sap.h> #include <net/llc_conn.h> #include <net/sock.h> -#include <linux/tcp.h> +#include <net/tcp_states.h> #include <net/llc_c_ev.h> #include <net/llc_c_ac.h> #include <net/llc_c_st.h> @@ -71,7 +71,11 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb) if (!ev->ind_prim && !ev->cfm_prim) { /* indicate or confirm not required */ - if (!skb->list) + /* XXX this is not very pretty, perhaps we should store + * XXX indicate/confirm-needed state in the llc_conn_state_ev + * XXX control block of the SKB instead? -DaveM + */ + if (!skb->next) goto out_kfree_skb; goto out_skb_put; } diff --git a/net/llc/llc_core.c b/net/llc/llc_core.c index 5ff02c080a0..9727455bf0e 100644 --- a/net/llc/llc_core.c +++ b/net/llc/llc_core.c @@ -103,7 +103,8 @@ out: struct llc_sap *llc_sap_open(unsigned char lsap, int (*func)(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt)) + struct packet_type *pt, + struct net_device *orig_dev)) { struct llc_sap *sap = llc_sap_find(lsap); diff --git a/net/llc/llc_if.c b/net/llc/llc_if.c index 0f9fc48aeaf..0f84f66018e 100644 --- a/net/llc/llc_if.c +++ b/net/llc/llc_if.c @@ -15,7 +15,6 @@ #include <linux/module.h> #include <linux/kernel.h> #include <linux/netdevice.h> -#include <linux/tcp.h> #include <asm/errno.h> #include <net/llc_if.h> #include <net/llc_sap.h> @@ -25,6 +24,7 @@ #include <net/llc_c_ev.h> #include <net/llc_c_ac.h> #include <net/llc_c_st.h> +#include <net/tcp_states.h> u8 llc_mac_null_var[IFHWADDRLEN]; diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c index 4da6976efc9..13b46240b7a 100644 --- a/net/llc/llc_input.c +++ b/net/llc/llc_input.c @@ -132,7 +132,7 @@ static inline int llc_fixup_skb(struct sk_buff *skb) * data now), it queues this frame in the connection's backlog. */ int llc_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) + struct packet_type *pt, struct net_device *orig_dev) { struct llc_sap *sap; struct llc_pdu_sn *pdu; @@ -165,7 +165,7 @@ int llc_rcv(struct sk_buff *skb, struct net_device *dev, * LLC functionality */ if (sap->rcv_func) { - sap->rcv_func(skb, dev, pt); + sap->rcv_func(skb, dev, pt, orig_dev); goto out; } dest = llc_pdu_type(skb); diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c index 965c94eb4bb..34228ef1498 100644 --- a/net/llc/llc_sap.c +++ b/net/llc/llc_sap.c @@ -21,7 +21,7 @@ #include <net/llc_s_ev.h> #include <net/llc_s_st.h> #include <net/sock.h> -#include <linux/tcp.h> +#include <net/tcp_states.h> #include <linux/llc.h> /** diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig new file mode 100644 index 00000000000..8296b38bf27 --- /dev/null +++ b/net/netfilter/Kconfig @@ -0,0 +1,24 @@ +config NETFILTER_NETLINK + tristate "Netfilter netlink interface" + help + If this option is enabled, the kernel will include support + for the new netfilter netlink interface. + +config NETFILTER_NETLINK_QUEUE + tristate "Netfilter NFQUEUE over NFNETLINK interface" + depends on NETFILTER_NETLINK + help + If this option isenabled, the kernel will include support + for queueing packets via NFNETLINK. + +config NETFILTER_NETLINK_LOG + tristate "Netfilter LOG over NFNETLINK interface" + depends on NETFILTER_NETLINK + help + If this option is enabled, the kernel will include support + for logging packets via NFNETLINK. + + This obsoletes the existing ipt_ULOG and ebg_ulog mechanisms, + and is also scheduled to replace the old syslog-based ipt_LOG + and ip6t_LOG modules. + diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile new file mode 100644 index 00000000000..b3b44f8b415 --- /dev/null +++ b/net/netfilter/Makefile @@ -0,0 +1,7 @@ +netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o + +obj-$(CONFIG_NETFILTER) = netfilter.o + +obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o +obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o +obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o diff --git a/net/netfilter/core.c b/net/netfilter/core.c new file mode 100644 index 00000000000..1ceb1a6c254 --- /dev/null +++ b/net/netfilter/core.c @@ -0,0 +1,216 @@ +/* netfilter.c: look after the filters for various protocols. + * Heavily influenced by the old firewall.c by David Bonn and Alan Cox. + * + * Thanks to Rob `CmdrTaco' Malda for not influencing this code in any + * way. + * + * Rusty Russell (C)2000 -- This code is GPL. + * + * February 2000: Modified by James Morris to have 1 queue per protocol. + * 15-Mar-2000: Added NF_REPEAT --RR. + * 08-May-2003: Internal logging interface added by Jozsef Kadlecsik. + */ +#include <linux/config.h> +#include <linux/kernel.h> +#include <linux/netfilter.h> +#include <net/protocol.h> +#include <linux/init.h> +#include <linux/skbuff.h> +#include <linux/wait.h> +#include <linux/module.h> +#include <linux/interrupt.h> +#include <linux/if.h> +#include <linux/netdevice.h> +#include <linux/inetdevice.h> +#include <linux/proc_fs.h> +#include <net/sock.h> + +#include "nf_internals.h" + +/* In this code, we can be waiting indefinitely for userspace to + * service a packet if a hook returns NF_QUEUE. We could keep a count + * of skbuffs queued for userspace, and not deregister a hook unless + * this is zero, but that sucks. Now, we simply check when the + * packets come back: if the hook is gone, the packet is discarded. */ +struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS]; +EXPORT_SYMBOL(nf_hooks); +static DEFINE_SPINLOCK(nf_hook_lock); + +int nf_register_hook(struct nf_hook_ops *reg) +{ + struct list_head *i; + + spin_lock_bh(&nf_hook_lock); + list_for_each(i, &nf_hooks[reg->pf][reg->hooknum]) { + if (reg->priority < ((struct nf_hook_ops *)i)->priority) + break; + } + list_add_rcu(®->list, i->prev); + spin_unlock_bh(&nf_hook_lock); + + synchronize_net(); + return 0; +} +EXPORT_SYMBOL(nf_register_hook); + +void nf_unregister_hook(struct nf_hook_ops *reg) +{ + spin_lock_bh(&nf_hook_lock); + list_del_rcu(®->list); + spin_unlock_bh(&nf_hook_lock); + + synchronize_net(); +} +EXPORT_SYMBOL(nf_unregister_hook); + +unsigned int nf_iterate(struct list_head *head, + struct sk_buff **skb, + int hook, + const struct net_device *indev, + const struct net_device *outdev, + struct list_head **i, + int (*okfn)(struct sk_buff *), + int hook_thresh) +{ + unsigned int verdict; + + /* + * The caller must not block between calls to this + * function because of risk of continuing from deleted element. + */ + list_for_each_continue_rcu(*i, head) { + struct nf_hook_ops *elem = (struct nf_hook_ops *)*i; + + if (hook_thresh > elem->priority) + continue; + + /* Optimization: we don't need to hold module + reference here, since function can't sleep. --RR */ + verdict = elem->hook(hook, skb, indev, outdev, okfn); + if (verdict != NF_ACCEPT) { +#ifdef CONFIG_NETFILTER_DEBUG + if (unlikely((verdict & NF_VERDICT_MASK) + > NF_MAX_VERDICT)) { + NFDEBUG("Evil return from %p(%u).\n", + elem->hook, hook); + continue; + } +#endif + if (verdict != NF_REPEAT) + return verdict; + *i = (*i)->prev; + } + } + return NF_ACCEPT; +} + + +/* Returns 1 if okfn() needs to be executed by the caller, + * -EPERM for NF_DROP, 0 otherwise. */ +int nf_hook_slow(int pf, unsigned int hook, struct sk_buff **pskb, + struct net_device *indev, + struct net_device *outdev, + int (*okfn)(struct sk_buff *), + int hook_thresh) +{ + struct list_head *elem; + unsigned int verdict; + int ret = 0; + + /* We may already have this, but read-locks nest anyway */ + rcu_read_lock(); + + elem = &nf_hooks[pf][hook]; +next_hook: + verdict = nf_iterate(&nf_hooks[pf][hook], pskb, hook, indev, + outdev, &elem, okfn, hook_thresh); + if (verdict == NF_ACCEPT || verdict == NF_STOP) { + ret = 1; + goto unlock; + } else if (verdict == NF_DROP) { + kfree_skb(*pskb); + ret = -EPERM; + } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) { + NFDEBUG("nf_hook: Verdict = QUEUE.\n"); + if (!nf_queue(pskb, elem, pf, hook, indev, outdev, okfn, + verdict >> NF_VERDICT_BITS)) + goto next_hook; + } +unlock: + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL(nf_hook_slow); + + +int skb_make_writable(struct sk_buff **pskb, unsigned int writable_len) +{ + struct sk_buff *nskb; + + if (writable_len > (*pskb)->len) + return 0; + + /* Not exclusive use of packet? Must copy. */ + if (skb_shared(*pskb) || skb_cloned(*pskb)) + goto copy_skb; + + return pskb_may_pull(*pskb, writable_len); + +copy_skb: + nskb = skb_copy(*pskb, GFP_ATOMIC); + if (!nskb) + return 0; + BUG_ON(skb_is_nonlinear(nskb)); + + /* Rest of kernel will get very unhappy if we pass it a + suddenly-orphaned skbuff */ + if ((*pskb)->sk) + skb_set_owner_w(nskb, (*pskb)->sk); + kfree_skb(*pskb); + *pskb = nskb; + return 1; +} +EXPORT_SYMBOL(skb_make_writable); + + +/* This does not belong here, but locally generated errors need it if connection + tracking in use: without this, connection may not be in hash table, and hence + manufactured ICMP or RST packets will not be associated with it. */ +void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *); +EXPORT_SYMBOL(ip_ct_attach); + +void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) +{ + void (*attach)(struct sk_buff *, struct sk_buff *); + + if (skb->nfct && (attach = ip_ct_attach) != NULL) { + mb(); /* Just to be sure: must be read before executing this */ + attach(new, skb); + } +} +EXPORT_SYMBOL(nf_ct_attach); + +#ifdef CONFIG_PROC_FS +struct proc_dir_entry *proc_net_netfilter; +EXPORT_SYMBOL(proc_net_netfilter); +#endif + +void __init netfilter_init(void) +{ + int i, h; + for (i = 0; i < NPROTO; i++) { + for (h = 0; h < NF_MAX_HOOKS; h++) + INIT_LIST_HEAD(&nf_hooks[i][h]); + } + +#ifdef CONFIG_PROC_FS + proc_net_netfilter = proc_mkdir("netfilter", proc_net); + if (!proc_net_netfilter) + panic("cannot create netfilter proc entry"); +#endif + + if (netfilter_queue_init() < 0) + panic("cannot initialize nf_queue"); + if (netfilter_log_init() < 0) + panic("cannot initialize nf_log"); +} diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h new file mode 100644 index 00000000000..6bdee291061 --- /dev/null +++ b/net/netfilter/nf_internals.h @@ -0,0 +1,39 @@ +#ifndef _NF_INTERNALS_H +#define _NF_INTERNALS_H + +#include <linux/config.h> +#include <linux/list.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> + +#ifdef CONFIG_NETFILTER_DEBUG +#define NFDEBUG(format, args...) printk(format , ## args) +#else +#define NFDEBUG(format, args...) +#endif + + +/* core.c */ +extern unsigned int nf_iterate(struct list_head *head, + struct sk_buff **skb, + int hook, + const struct net_device *indev, + const struct net_device *outdev, + struct list_head **i, + int (*okfn)(struct sk_buff *), + int hook_thresh); + +/* nf_queue.c */ +extern int nf_queue(struct sk_buff **skb, + struct list_head *elem, + int pf, unsigned int hook, + struct net_device *indev, + struct net_device *outdev, + int (*okfn)(struct sk_buff *), + unsigned int queuenum); +extern int __init netfilter_queue_init(void); + +/* nf_log.c */ +extern int __init netfilter_log_init(void); + +#endif diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c new file mode 100644 index 00000000000..3e76bd0824a --- /dev/null +++ b/net/netfilter/nf_log.c @@ -0,0 +1,178 @@ +#include <linux/config.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/proc_fs.h> +#include <linux/skbuff.h> +#include <linux/netfilter.h> +#include <linux/seq_file.h> +#include <net/protocol.h> + +#include "nf_internals.h" + +/* Internal logging interface, which relies on the real + LOG target modules */ + +#define NF_LOG_PREFIXLEN 128 + +static struct nf_logger *nf_logging[NPROTO]; /* = NULL */ +static DEFINE_SPINLOCK(nf_log_lock); + +/* return EBUSY if somebody else is registered, EEXIST if the same logger + * is registred, 0 on success. */ +int nf_log_register(int pf, struct nf_logger *logger) +{ + int ret = -EBUSY; + + if (pf >= NPROTO) + return -EINVAL; + + /* Any setup of logging members must be done before + * substituting pointer. */ + spin_lock(&nf_log_lock); + if (!nf_logging[pf]) { + rcu_assign_pointer(nf_logging[pf], logger); + ret = 0; + } else if (nf_logging[pf] == logger) + ret = -EEXIST; + + spin_unlock(&nf_log_lock); + return ret; +} +EXPORT_SYMBOL(nf_log_register); + +int nf_log_unregister_pf(int pf) +{ + if (pf >= NPROTO) + return -EINVAL; + + spin_lock(&nf_log_lock); + nf_logging[pf] = NULL; + spin_unlock(&nf_log_lock); + + /* Give time to concurrent readers. */ + synchronize_net(); + + return 0; +} +EXPORT_SYMBOL(nf_log_unregister_pf); + +void nf_log_unregister_logger(struct nf_logger *logger) +{ + int i; + + spin_lock(&nf_log_lock); + for (i = 0; i < NPROTO; i++) { + if (nf_logging[i] == logger) + nf_logging[i] = NULL; + } + spin_unlock(&nf_log_lock); + + synchronize_net(); +} +EXPORT_SYMBOL(nf_log_unregister_logger); + +void nf_log_packet(int pf, + unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + struct nf_loginfo *loginfo, + const char *fmt, ...) +{ + va_list args; + char prefix[NF_LOG_PREFIXLEN]; + struct nf_logger *logger; + + rcu_read_lock(); + logger = rcu_dereference(nf_logging[pf]); + if (logger) { + va_start(args, fmt); + vsnprintf(prefix, sizeof(prefix), fmt, args); + va_end(args); + /* We must read logging before nf_logfn[pf] */ + logger->logfn(pf, hooknum, skb, in, out, loginfo, prefix); + } else if (net_ratelimit()) { + printk(KERN_WARNING "nf_log_packet: can\'t log since " + "no backend logging module loaded in! Please either " + "load one, or disable logging explicitly\n"); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL(nf_log_packet); + +#ifdef CONFIG_PROC_FS +static void *seq_start(struct seq_file *seq, loff_t *pos) +{ + rcu_read_lock(); + + if (*pos >= NPROTO) + return NULL; + + return pos; +} + +static void *seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + (*pos)++; + + if (*pos >= NPROTO) + return NULL; + + return pos; +} + +static void seq_stop(struct seq_file *s, void *v) +{ + rcu_read_unlock(); +} + +static int seq_show(struct seq_file *s, void *v) +{ + loff_t *pos = v; + const struct nf_logger *logger; + + logger = rcu_dereference(nf_logging[*pos]); + + if (!logger) + return seq_printf(s, "%2lld NONE\n", *pos); + + return seq_printf(s, "%2lld %s\n", *pos, logger->name); +} + +static struct seq_operations nflog_seq_ops = { + .start = seq_start, + .next = seq_next, + .stop = seq_stop, + .show = seq_show, +}; + +static int nflog_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &nflog_seq_ops); +} + +static struct file_operations nflog_file_ops = { + .owner = THIS_MODULE, + .open = nflog_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#endif /* PROC_FS */ + + +int __init netfilter_log_init(void) +{ +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *pde; + + pde = create_proc_entry("nf_log", S_IRUGO, proc_net_netfilter); + if (!pde) + return -1; + + pde->proc_fops = &nflog_file_ops; +#endif + return 0; +} diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c new file mode 100644 index 00000000000..d10d552d9c4 --- /dev/null +++ b/net/netfilter/nf_queue.c @@ -0,0 +1,343 @@ +#include <linux/config.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/proc_fs.h> +#include <linux/skbuff.h> +#include <linux/netfilter.h> +#include <linux/seq_file.h> +#include <net/protocol.h> + +#include "nf_internals.h" + +/* + * A queue handler may be registered for each protocol. Each is protected by + * long term mutex. The handler must provide an an outfn() to accept packets + * for queueing and must reinject all packets it receives, no matter what. + */ +static struct nf_queue_handler *queue_handler[NPROTO]; +static struct nf_queue_rerouter *queue_rerouter; + +static DEFINE_RWLOCK(queue_handler_lock); + +/* return EBUSY when somebody else is registered, return EEXIST if the + * same handler is registered, return 0 in case of success. */ +int nf_register_queue_handler(int pf, struct nf_queue_handler *qh) +{ + int ret; + + if (pf >= NPROTO) + return -EINVAL; + + write_lock_bh(&queue_handler_lock); + if (queue_handler[pf] == qh) + ret = -EEXIST; + else if (queue_handler[pf]) + ret = -EBUSY; + else { + queue_handler[pf] = qh; + ret = 0; + } + write_unlock_bh(&queue_handler_lock); + + return ret; +} +EXPORT_SYMBOL(nf_register_queue_handler); + +/* The caller must flush their queue before this */ +int nf_unregister_queue_handler(int pf) +{ + if (pf >= NPROTO) + return -EINVAL; + + write_lock_bh(&queue_handler_lock); + queue_handler[pf] = NULL; + write_unlock_bh(&queue_handler_lock); + + return 0; +} +EXPORT_SYMBOL(nf_unregister_queue_handler); + +int nf_register_queue_rerouter(int pf, struct nf_queue_rerouter *rer) +{ + if (pf >= NPROTO) + return -EINVAL; + + write_lock_bh(&queue_handler_lock); + memcpy(&queue_rerouter[pf], rer, sizeof(queue_rerouter[pf])); + write_unlock_bh(&queue_handler_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(nf_register_queue_rerouter); + +int nf_unregister_queue_rerouter(int pf) +{ + if (pf >= NPROTO) + return -EINVAL; + + write_lock_bh(&queue_handler_lock); + memset(&queue_rerouter[pf], 0, sizeof(queue_rerouter[pf])); + write_unlock_bh(&queue_handler_lock); + return 0; +} +EXPORT_SYMBOL_GPL(nf_unregister_queue_rerouter); + +void nf_unregister_queue_handlers(struct nf_queue_handler *qh) +{ + int pf; + + write_lock_bh(&queue_handler_lock); + for (pf = 0; pf < NPROTO; pf++) { + if (queue_handler[pf] == qh) + queue_handler[pf] = NULL; + } + write_unlock_bh(&queue_handler_lock); +} +EXPORT_SYMBOL_GPL(nf_unregister_queue_handlers); + +/* + * Any packet that leaves via this function must come back + * through nf_reinject(). + */ +int nf_queue(struct sk_buff **skb, + struct list_head *elem, + int pf, unsigned int hook, + struct net_device *indev, + struct net_device *outdev, + int (*okfn)(struct sk_buff *), + unsigned int queuenum) +{ + int status; + struct nf_info *info; +#ifdef CONFIG_BRIDGE_NETFILTER + struct net_device *physindev = NULL; + struct net_device *physoutdev = NULL; +#endif + + /* QUEUE == DROP if noone is waiting, to be safe. */ + read_lock(&queue_handler_lock); + if (!queue_handler[pf]->outfn) { + read_unlock(&queue_handler_lock); + kfree_skb(*skb); + return 1; + } + + info = kmalloc(sizeof(*info)+queue_rerouter[pf].rer_size, GFP_ATOMIC); + if (!info) { + if (net_ratelimit()) + printk(KERN_ERR "OOM queueing packet %p\n", + *skb); + read_unlock(&queue_handler_lock); + kfree_skb(*skb); + return 1; + } + + *info = (struct nf_info) { + (struct nf_hook_ops *)elem, pf, hook, indev, outdev, okfn }; + + /* If it's going away, ignore hook. */ + if (!try_module_get(info->elem->owner)) { + read_unlock(&queue_handler_lock); + kfree(info); + return 0; + } + + /* Bump dev refs so they don't vanish while packet is out */ + if (indev) dev_hold(indev); + if (outdev) dev_hold(outdev); + +#ifdef CONFIG_BRIDGE_NETFILTER + if ((*skb)->nf_bridge) { + physindev = (*skb)->nf_bridge->physindev; + if (physindev) dev_hold(physindev); + physoutdev = (*skb)->nf_bridge->physoutdev; + if (physoutdev) dev_hold(physoutdev); + } +#endif + if (queue_rerouter[pf].save) + queue_rerouter[pf].save(*skb, info); + + status = queue_handler[pf]->outfn(*skb, info, queuenum, + queue_handler[pf]->data); + + if (status >= 0 && queue_rerouter[pf].reroute) + status = queue_rerouter[pf].reroute(skb, info); + + read_unlock(&queue_handler_lock); + + if (status < 0) { + /* James M doesn't say fuck enough. */ + if (indev) dev_put(indev); + if (outdev) dev_put(outdev); +#ifdef CONFIG_BRIDGE_NETFILTER + if (physindev) dev_put(physindev); + if (physoutdev) dev_put(physoutdev); +#endif + module_put(info->elem->owner); + kfree(info); + kfree_skb(*skb); + + return 1; + } + + return 1; +} + +void nf_reinject(struct sk_buff *skb, struct nf_info *info, + unsigned int verdict) +{ + struct list_head *elem = &info->elem->list; + struct list_head *i; + + rcu_read_lock(); + + /* Release those devices we held, or Alexey will kill me. */ + if (info->indev) dev_put(info->indev); + if (info->outdev) dev_put(info->outdev); +#ifdef CONFIG_BRIDGE_NETFILTER + if (skb->nf_bridge) { + if (skb->nf_bridge->physindev) + dev_put(skb->nf_bridge->physindev); + if (skb->nf_bridge->physoutdev) + dev_put(skb->nf_bridge->physoutdev); + } +#endif + + /* Drop reference to owner of hook which queued us. */ + module_put(info->elem->owner); + + list_for_each_rcu(i, &nf_hooks[info->pf][info->hook]) { + if (i == elem) + break; + } + + if (elem == &nf_hooks[info->pf][info->hook]) { + /* The module which sent it to userspace is gone. */ + NFDEBUG("%s: module disappeared, dropping packet.\n", + __FUNCTION__); + verdict = NF_DROP; + } + + /* Continue traversal iff userspace said ok... */ + if (verdict == NF_REPEAT) { + elem = elem->prev; + verdict = NF_ACCEPT; + } + + if (verdict == NF_ACCEPT) { + next_hook: + verdict = nf_iterate(&nf_hooks[info->pf][info->hook], + &skb, info->hook, + info->indev, info->outdev, &elem, + info->okfn, INT_MIN); + } + + switch (verdict & NF_VERDICT_MASK) { + case NF_ACCEPT: + info->okfn(skb); + break; + + case NF_QUEUE: + if (!nf_queue(&skb, elem, info->pf, info->hook, + info->indev, info->outdev, info->okfn, + verdict >> NF_VERDICT_BITS)) + goto next_hook; + break; + } + rcu_read_unlock(); + + if (verdict == NF_DROP) + kfree_skb(skb); + + kfree(info); + return; +} +EXPORT_SYMBOL(nf_reinject); + +#ifdef CONFIG_PROC_FS +static void *seq_start(struct seq_file *seq, loff_t *pos) +{ + if (*pos >= NPROTO) + return NULL; + + return pos; +} + +static void *seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + (*pos)++; + + if (*pos >= NPROTO) + return NULL; + + return pos; +} + +static void seq_stop(struct seq_file *s, void *v) +{ + +} + +static int seq_show(struct seq_file *s, void *v) +{ + int ret; + loff_t *pos = v; + struct nf_queue_handler *qh; + + read_lock_bh(&queue_handler_lock); + qh = queue_handler[*pos]; + if (!qh) + ret = seq_printf(s, "%2lld NONE\n", *pos); + else + ret = seq_printf(s, "%2lld %s\n", *pos, qh->name); + read_unlock_bh(&queue_handler_lock); + + return ret; +} + +static struct seq_operations nfqueue_seq_ops = { + .start = seq_start, + .next = seq_next, + .stop = seq_stop, + .show = seq_show, +}; + +static int nfqueue_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &nfqueue_seq_ops); +} + +static struct file_operations nfqueue_file_ops = { + .owner = THIS_MODULE, + .open = nfqueue_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif /* PROC_FS */ + + +int __init netfilter_queue_init(void) +{ +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *pde; +#endif + queue_rerouter = kmalloc(NPROTO * sizeof(struct nf_queue_rerouter), + GFP_KERNEL); + if (!queue_rerouter) + return -ENOMEM; + +#ifdef CONFIG_PROC_FS + pde = create_proc_entry("nf_queue", S_IRUGO, proc_net_netfilter); + if (!pde) { + kfree(queue_rerouter); + return -1; + } + pde->proc_fops = &nfqueue_file_ops; +#endif + memset(queue_rerouter, 0, NPROTO * sizeof(struct nf_queue_rerouter)); + + return 0; +} + diff --git a/net/netfilter/nf_sockopt.c b/net/netfilter/nf_sockopt.c new file mode 100644 index 00000000000..61a833a9caa --- /dev/null +++ b/net/netfilter/nf_sockopt.c @@ -0,0 +1,132 @@ +#include <linux/config.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netfilter.h> +#include <net/sock.h> + +#include "nf_internals.h" + +/* Sockopts only registered and called from user context, so + net locking would be overkill. Also, [gs]etsockopt calls may + sleep. */ +static DECLARE_MUTEX(nf_sockopt_mutex); +static LIST_HEAD(nf_sockopts); + +/* Do exclusive ranges overlap? */ +static inline int overlap(int min1, int max1, int min2, int max2) +{ + return max1 > min2 && min1 < max2; +} + +/* Functions to register sockopt ranges (exclusive). */ +int nf_register_sockopt(struct nf_sockopt_ops *reg) +{ + struct list_head *i; + int ret = 0; + + if (down_interruptible(&nf_sockopt_mutex) != 0) + return -EINTR; + + list_for_each(i, &nf_sockopts) { + struct nf_sockopt_ops *ops = (struct nf_sockopt_ops *)i; + if (ops->pf == reg->pf + && (overlap(ops->set_optmin, ops->set_optmax, + reg->set_optmin, reg->set_optmax) + || overlap(ops->get_optmin, ops->get_optmax, + reg->get_optmin, reg->get_optmax))) { + NFDEBUG("nf_sock overlap: %u-%u/%u-%u v %u-%u/%u-%u\n", + ops->set_optmin, ops->set_optmax, + ops->get_optmin, ops->get_optmax, + reg->set_optmin, reg->set_optmax, + reg->get_optmin, reg->get_optmax); + ret = -EBUSY; + goto out; + } + } + + list_add(®->list, &nf_sockopts); +out: + up(&nf_sockopt_mutex); + return ret; +} +EXPORT_SYMBOL(nf_register_sockopt); + +void nf_unregister_sockopt(struct nf_sockopt_ops *reg) +{ + /* No point being interruptible: we're probably in cleanup_module() */ + restart: + down(&nf_sockopt_mutex); + if (reg->use != 0) { + /* To be woken by nf_sockopt call... */ + /* FIXME: Stuart Young's name appears gratuitously. */ + set_current_state(TASK_UNINTERRUPTIBLE); + reg->cleanup_task = current; + up(&nf_sockopt_mutex); + schedule(); + goto restart; + } + list_del(®->list); + up(&nf_sockopt_mutex); +} +EXPORT_SYMBOL(nf_unregister_sockopt); + +/* Call get/setsockopt() */ +static int nf_sockopt(struct sock *sk, int pf, int val, + char __user *opt, int *len, int get) +{ + struct list_head *i; + struct nf_sockopt_ops *ops; + int ret; + + if (down_interruptible(&nf_sockopt_mutex) != 0) + return -EINTR; + + list_for_each(i, &nf_sockopts) { + ops = (struct nf_sockopt_ops *)i; + if (ops->pf == pf) { + if (get) { + if (val >= ops->get_optmin + && val < ops->get_optmax) { + ops->use++; + up(&nf_sockopt_mutex); + ret = ops->get(sk, val, opt, len); + goto out; + } + } else { + if (val >= ops->set_optmin + && val < ops->set_optmax) { + ops->use++; + up(&nf_sockopt_mutex); + ret = ops->set(sk, val, opt, *len); + goto out; + } + } + } + } + up(&nf_sockopt_mutex); + return -ENOPROTOOPT; + + out: + down(&nf_sockopt_mutex); + ops->use--; + if (ops->cleanup_task) + wake_up_process(ops->cleanup_task); + up(&nf_sockopt_mutex); + return ret; +} + +int nf_setsockopt(struct sock *sk, int pf, int val, char __user *opt, + int len) +{ + return nf_sockopt(sk, pf, val, opt, &len, 0); +} +EXPORT_SYMBOL(nf_setsockopt); + +int nf_getsockopt(struct sock *sk, int pf, int val, char __user *opt, int *len) +{ + return nf_sockopt(sk, pf, val, opt, len, 1); +} +EXPORT_SYMBOL(nf_getsockopt); + diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c new file mode 100644 index 00000000000..e089f17bb80 --- /dev/null +++ b/net/netfilter/nfnetlink.c @@ -0,0 +1,376 @@ +/* Netfilter messages via netlink socket. Allows for user space + * protocol helpers and general trouble making from userspace. + * + * (C) 2001 by Jay Schulist <jschlst@samba.org>, + * (C) 2002-2005 by Harald Welte <laforge@gnumonks.org> + * (C) 2005 by Pablo Neira Ayuso <pablo@eurodev.net> + * + * Initial netfilter messages via netlink development funded and + * generally made possible by Network Robots, Inc. (www.networkrobots.com) + * + * Further development of this code funded by Astaro AG (http://www.astaro.com) + * + * This software may be used and distributed according to the terms + * of the GNU General Public License, incorporated herein by reference. + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/kernel.h> +#include <linux/major.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/string.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <linux/fcntl.h> +#include <linux/skbuff.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <net/sock.h> +#include <linux/init.h> +#include <linux/spinlock.h> + +#include <linux/netfilter.h> +#include <linux/netlink.h> +#include <linux/netfilter/nfnetlink.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER); + +static char __initdata nfversion[] = "0.30"; + +#if 0 +#define DEBUGP(format, args...) \ + printk(KERN_DEBUG "%s(%d):%s(): " format, __FILE__, \ + __LINE__, __FUNCTION__, ## args) +#else +#define DEBUGP(format, args...) +#endif + +static struct sock *nfnl = NULL; +static struct nfnetlink_subsystem *subsys_table[NFNL_SUBSYS_COUNT]; +DECLARE_MUTEX(nfnl_sem); + +void nfnl_lock(void) +{ + nfnl_shlock(); +} + +void nfnl_unlock(void) +{ + nfnl_shunlock(); +} + +int nfnetlink_subsys_register(struct nfnetlink_subsystem *n) +{ + DEBUGP("registering subsystem ID %u\n", n->subsys_id); + + nfnl_lock(); + if (subsys_table[n->subsys_id]) { + nfnl_unlock(); + return -EBUSY; + } + subsys_table[n->subsys_id] = n; + nfnl_unlock(); + + return 0; +} + +int nfnetlink_subsys_unregister(struct nfnetlink_subsystem *n) +{ + DEBUGP("unregistering subsystem ID %u\n", n->subsys_id); + + nfnl_lock(); + subsys_table[n->subsys_id] = NULL; + nfnl_unlock(); + + return 0; +} + +static inline struct nfnetlink_subsystem *nfnetlink_get_subsys(u_int16_t type) +{ + u_int8_t subsys_id = NFNL_SUBSYS_ID(type); + + if (subsys_id >= NFNL_SUBSYS_COUNT + || subsys_table[subsys_id] == NULL) + return NULL; + + return subsys_table[subsys_id]; +} + +static inline struct nfnl_callback * +nfnetlink_find_client(u_int16_t type, struct nfnetlink_subsystem *ss) +{ + u_int8_t cb_id = NFNL_MSG_TYPE(type); + + if (cb_id >= ss->cb_count) { + DEBUGP("msgtype %u >= %u, returning\n", type, ss->cb_count); + return NULL; + } + + return &ss->cb[cb_id]; +} + +void __nfa_fill(struct sk_buff *skb, int attrtype, int attrlen, + const void *data) +{ + struct nfattr *nfa; + int size = NFA_LENGTH(attrlen); + + nfa = (struct nfattr *)skb_put(skb, NFA_ALIGN(size)); + nfa->nfa_type = attrtype; + nfa->nfa_len = size; + memcpy(NFA_DATA(nfa), data, attrlen); + memset(NFA_DATA(nfa) + attrlen, 0, NFA_ALIGN(size) - size); +} + +int nfattr_parse(struct nfattr *tb[], int maxattr, struct nfattr *nfa, int len) +{ + memset(tb, 0, sizeof(struct nfattr *) * maxattr); + + while (NFA_OK(nfa, len)) { + unsigned flavor = nfa->nfa_type; + if (flavor && flavor <= maxattr) + tb[flavor-1] = nfa; + nfa = NFA_NEXT(nfa, len); + } + + return 0; +} + +/** + * nfnetlink_check_attributes - check and parse nfnetlink attributes + * + * subsys: nfnl subsystem for which this message is to be parsed + * nlmsghdr: netlink message to be checked/parsed + * cda: array of pointers, needs to be at least subsys->attr_count big + * + */ +static int +nfnetlink_check_attributes(struct nfnetlink_subsystem *subsys, + struct nlmsghdr *nlh, struct nfattr *cda[]) +{ + int min_len; + u_int16_t attr_count; + u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); + + if (unlikely(cb_id >= subsys->cb_count)) { + DEBUGP("msgtype %u >= %u, returning\n", + cb_id, subsys->cb_count); + return -EINVAL; + } + + min_len = NLMSG_ALIGN(sizeof(struct nfgenmsg)); + if (unlikely(nlh->nlmsg_len < min_len)) + return -EINVAL; + + attr_count = subsys->cb[cb_id].attr_count; + memset(cda, 0, sizeof(struct nfattr *) * attr_count); + + /* check attribute lengths. */ + if (likely(nlh->nlmsg_len > min_len)) { + struct nfattr *attr = NFM_NFA(NLMSG_DATA(nlh)); + int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len); + + while (NFA_OK(attr, attrlen)) { + unsigned flavor = attr->nfa_type; + if (flavor) { + if (flavor > attr_count) + return -EINVAL; + cda[flavor - 1] = attr; + } + attr = NFA_NEXT(attr, attrlen); + } + } + + /* implicit: if nlmsg_len == min_len, we return 0, and an empty + * (zeroed) cda[] array. The message is valid, but empty. */ + + return 0; +} + +int nfnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo) +{ + int allocation = in_interrupt() ? GFP_ATOMIC : GFP_KERNEL; + int err = 0; + + NETLINK_CB(skb).dst_group = group; + if (echo) + atomic_inc(&skb->users); + netlink_broadcast(nfnl, skb, pid, group, allocation); + if (echo) + err = netlink_unicast(nfnl, skb, pid, MSG_DONTWAIT); + + return err; +} + +int nfnetlink_unicast(struct sk_buff *skb, u_int32_t pid, int flags) +{ + return netlink_unicast(nfnl, skb, pid, flags); +} + +/* Process one complete nfnetlink message. */ +static inline int nfnetlink_rcv_msg(struct sk_buff *skb, + struct nlmsghdr *nlh, int *errp) +{ + struct nfnl_callback *nc; + struct nfnetlink_subsystem *ss; + int type, err = 0; + + DEBUGP("entered; subsys=%u, msgtype=%u\n", + NFNL_SUBSYS_ID(nlh->nlmsg_type), + NFNL_MSG_TYPE(nlh->nlmsg_type)); + + /* Only requests are handled by kernel now. */ + if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) { + DEBUGP("received non-request message\n"); + return 0; + } + + /* All the messages must at least contain nfgenmsg */ + if (nlh->nlmsg_len < + NLMSG_LENGTH(NLMSG_ALIGN(sizeof(struct nfgenmsg)))) { + DEBUGP("received message was too short\n"); + return 0; + } + + type = nlh->nlmsg_type; + ss = nfnetlink_get_subsys(type); + if (!ss) { +#ifdef CONFIG_KMOD + /* don't call nfnl_shunlock, since it would reenter + * with further packet processing */ + up(&nfnl_sem); + request_module("nfnetlink-subsys-%d", NFNL_SUBSYS_ID(type)); + nfnl_shlock(); + ss = nfnetlink_get_subsys(type); + if (!ss) +#endif + goto err_inval; + } + + nc = nfnetlink_find_client(type, ss); + if (!nc) { + DEBUGP("unable to find client for type %d\n", type); + goto err_inval; + } + + if (nc->cap_required && + !cap_raised(NETLINK_CB(skb).eff_cap, nc->cap_required)) { + DEBUGP("permission denied for type %d\n", type); + *errp = -EPERM; + return -1; + } + + { + u_int16_t attr_count = + ss->cb[NFNL_MSG_TYPE(nlh->nlmsg_type)].attr_count; + struct nfattr *cda[attr_count]; + + memset(cda, 0, sizeof(struct nfattr *) * attr_count); + + err = nfnetlink_check_attributes(ss, nlh, cda); + if (err < 0) + goto err_inval; + + DEBUGP("calling handler\n"); + err = nc->call(nfnl, skb, nlh, cda, errp); + *errp = err; + return err; + } + +err_inval: + DEBUGP("returning -EINVAL\n"); + *errp = -EINVAL; + return -1; +} + +/* Process one packet of messages. */ +static inline int nfnetlink_rcv_skb(struct sk_buff *skb) +{ + int err; + struct nlmsghdr *nlh; + + while (skb->len >= NLMSG_SPACE(0)) { + u32 rlen; + + nlh = (struct nlmsghdr *)skb->data; + if (nlh->nlmsg_len < sizeof(struct nlmsghdr) + || skb->len < nlh->nlmsg_len) + return 0; + rlen = NLMSG_ALIGN(nlh->nlmsg_len); + if (rlen > skb->len) + rlen = skb->len; + if (nfnetlink_rcv_msg(skb, nlh, &err)) { + if (!err) + return -1; + netlink_ack(skb, nlh, err); + } else + if (nlh->nlmsg_flags & NLM_F_ACK) + netlink_ack(skb, nlh, 0); + skb_pull(skb, rlen); + } + + return 0; +} + +static void nfnetlink_rcv(struct sock *sk, int len) +{ + do { + struct sk_buff *skb; + + if (nfnl_shlock_nowait()) + return; + + while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { + if (nfnetlink_rcv_skb(skb)) { + if (skb->len) + skb_queue_head(&sk->sk_receive_queue, + skb); + else + kfree_skb(skb); + break; + } + kfree_skb(skb); + } + + /* don't call nfnl_shunlock, since it would reenter + * with further packet processing */ + up(&nfnl_sem); + } while(nfnl && nfnl->sk_receive_queue.qlen); +} + +void __exit nfnetlink_exit(void) +{ + printk("Removing netfilter NETLINK layer.\n"); + sock_release(nfnl->sk_socket); + return; +} + +int __init nfnetlink_init(void) +{ + printk("Netfilter messages via NETLINK v%s.\n", nfversion); + + nfnl = netlink_kernel_create(NETLINK_NETFILTER, NFNLGRP_MAX, + nfnetlink_rcv, THIS_MODULE); + if (!nfnl) { + printk(KERN_ERR "cannot initialize nfnetlink!\n"); + return -1; + } + + return 0; +} + +module_init(nfnetlink_init); +module_exit(nfnetlink_exit); + +EXPORT_SYMBOL_GPL(nfnetlink_subsys_register); +EXPORT_SYMBOL_GPL(nfnetlink_subsys_unregister); +EXPORT_SYMBOL_GPL(nfnetlink_send); +EXPORT_SYMBOL_GPL(nfnetlink_unicast); +EXPORT_SYMBOL_GPL(nfattr_parse); +EXPORT_SYMBOL_GPL(__nfa_fill); diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c new file mode 100644 index 00000000000..ff5601ceedc --- /dev/null +++ b/net/netfilter/nfnetlink_log.c @@ -0,0 +1,1055 @@ +/* + * This is a module which is used for logging packets to userspace via + * nfetlink. + * + * (C) 2005 by Harald Welte <laforge@netfilter.org> + * + * Based on the old ipv4-only ipt_ULOG.c: + * (C) 2000-2004 by Harald Welte <laforge@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/init.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/netdevice.h> +#include <linux/netfilter.h> +#include <linux/netlink.h> +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_log.h> +#include <linux/spinlock.h> +#include <linux/sysctl.h> +#include <linux/proc_fs.h> +#include <linux/security.h> +#include <linux/list.h> +#include <linux/jhash.h> +#include <linux/random.h> +#include <net/sock.h> + +#include <asm/atomic.h> + +#ifdef CONFIG_BRIDGE_NETFILTER +#include "../bridge/br_private.h" +#endif + +#define NFULNL_NLBUFSIZ_DEFAULT 4096 +#define NFULNL_TIMEOUT_DEFAULT 100 /* every second */ +#define NFULNL_QTHRESH_DEFAULT 100 /* 100 packets */ + +#define PRINTR(x, args...) do { if (net_ratelimit()) \ + printk(x, ## args); } while (0); + +#if 0 +#define UDEBUG(x, args ...) printk(KERN_DEBUG "%s(%d):%s(): " x, \ + __FILE__, __LINE__, __FUNCTION__, \ + ## args) +#else +#define UDEBUG(x, ...) +#endif + +struct nfulnl_instance { + struct hlist_node hlist; /* global list of instances */ + spinlock_t lock; + atomic_t use; /* use count */ + + unsigned int qlen; /* number of nlmsgs in skb */ + struct sk_buff *skb; /* pre-allocatd skb */ + struct nlmsghdr *lastnlh; /* netlink header of last msg in skb */ + struct timer_list timer; + int peer_pid; /* PID of the peer process */ + + /* configurable parameters */ + unsigned int flushtimeout; /* timeout until queue flush */ + unsigned int nlbufsiz; /* netlink buffer allocation size */ + unsigned int qthreshold; /* threshold of the queue */ + u_int32_t copy_range; + u_int16_t group_num; /* number of this queue */ + u_int8_t copy_mode; +}; + +static DEFINE_RWLOCK(instances_lock); + +#define INSTANCE_BUCKETS 16 +static struct hlist_head instance_table[INSTANCE_BUCKETS]; +static unsigned int hash_init; + +static inline u_int8_t instance_hashfn(u_int16_t group_num) +{ + return ((group_num & 0xff) % INSTANCE_BUCKETS); +} + +static struct nfulnl_instance * +__instance_lookup(u_int16_t group_num) +{ + struct hlist_head *head; + struct hlist_node *pos; + struct nfulnl_instance *inst; + + UDEBUG("entering (group_num=%u)\n", group_num); + + head = &instance_table[instance_hashfn(group_num)]; + hlist_for_each_entry(inst, pos, head, hlist) { + if (inst->group_num == group_num) + return inst; + } + return NULL; +} + +static inline void +instance_get(struct nfulnl_instance *inst) +{ + atomic_inc(&inst->use); +} + +static struct nfulnl_instance * +instance_lookup_get(u_int16_t group_num) +{ + struct nfulnl_instance *inst; + + read_lock_bh(&instances_lock); + inst = __instance_lookup(group_num); + if (inst) + instance_get(inst); + read_unlock_bh(&instances_lock); + + return inst; +} + +static void +instance_put(struct nfulnl_instance *inst) +{ + if (inst && atomic_dec_and_test(&inst->use)) { + UDEBUG("kfree(inst=%p)\n", inst); + kfree(inst); + } +} + +static void nfulnl_timer(unsigned long data); + +static struct nfulnl_instance * +instance_create(u_int16_t group_num, int pid) +{ + struct nfulnl_instance *inst; + + UDEBUG("entering (group_num=%u, pid=%d)\n", group_num, + pid); + + write_lock_bh(&instances_lock); + if (__instance_lookup(group_num)) { + inst = NULL; + UDEBUG("aborting, instance already exists\n"); + goto out_unlock; + } + + inst = kmalloc(sizeof(*inst), GFP_ATOMIC); + if (!inst) + goto out_unlock; + + memset(inst, 0, sizeof(*inst)); + INIT_HLIST_NODE(&inst->hlist); + inst->lock = SPIN_LOCK_UNLOCKED; + /* needs to be two, since we _put() after creation */ + atomic_set(&inst->use, 2); + + init_timer(&inst->timer); + inst->timer.function = nfulnl_timer; + inst->timer.data = (unsigned long)inst; + /* don't start timer yet. (re)start it with every packet */ + + inst->peer_pid = pid; + inst->group_num = group_num; + + inst->qthreshold = NFULNL_QTHRESH_DEFAULT; + inst->flushtimeout = NFULNL_TIMEOUT_DEFAULT; + inst->nlbufsiz = NFULNL_NLBUFSIZ_DEFAULT; + inst->copy_mode = NFULNL_COPY_PACKET; + inst->copy_range = 0xffff; + + if (!try_module_get(THIS_MODULE)) + goto out_free; + + hlist_add_head(&inst->hlist, + &instance_table[instance_hashfn(group_num)]); + + UDEBUG("newly added node: %p, next=%p\n", &inst->hlist, + inst->hlist.next); + + write_unlock_bh(&instances_lock); + + return inst; + +out_free: + instance_put(inst); +out_unlock: + write_unlock_bh(&instances_lock); + return NULL; +} + +static int __nfulnl_send(struct nfulnl_instance *inst); + +static void +_instance_destroy2(struct nfulnl_instance *inst, int lock) +{ + /* first pull it out of the global list */ + if (lock) + write_lock_bh(&instances_lock); + + UDEBUG("removing instance %p (queuenum=%u) from hash\n", + inst, inst->group_num); + + hlist_del(&inst->hlist); + + if (lock) + write_unlock_bh(&instances_lock); + + /* then flush all pending packets from skb */ + + spin_lock_bh(&inst->lock); + if (inst->skb) { + if (inst->qlen) + __nfulnl_send(inst); + if (inst->skb) { + kfree_skb(inst->skb); + inst->skb = NULL; + } + } + spin_unlock_bh(&inst->lock); + + /* and finally put the refcount */ + instance_put(inst); + + module_put(THIS_MODULE); +} + +static inline void +__instance_destroy(struct nfulnl_instance *inst) +{ + _instance_destroy2(inst, 0); +} + +static inline void +instance_destroy(struct nfulnl_instance *inst) +{ + _instance_destroy2(inst, 1); +} + +static int +nfulnl_set_mode(struct nfulnl_instance *inst, u_int8_t mode, + unsigned int range) +{ + int status = 0; + + spin_lock_bh(&inst->lock); + + switch (mode) { + case NFULNL_COPY_NONE: + case NFULNL_COPY_META: + inst->copy_mode = mode; + inst->copy_range = 0; + break; + + case NFULNL_COPY_PACKET: + inst->copy_mode = mode; + /* we're using struct nfattr which has 16bit nfa_len */ + if (range > 0xffff) + inst->copy_range = 0xffff; + else + inst->copy_range = range; + break; + + default: + status = -EINVAL; + break; + } + + spin_unlock_bh(&inst->lock); + + return status; +} + +static int +nfulnl_set_nlbufsiz(struct nfulnl_instance *inst, u_int32_t nlbufsiz) +{ + int status; + + spin_lock_bh(&inst->lock); + if (nlbufsiz < NFULNL_NLBUFSIZ_DEFAULT) + status = -ERANGE; + else if (nlbufsiz > 131072) + status = -ERANGE; + else { + inst->nlbufsiz = nlbufsiz; + status = 0; + } + spin_unlock_bh(&inst->lock); + + return status; +} + +static int +nfulnl_set_timeout(struct nfulnl_instance *inst, u_int32_t timeout) +{ + spin_lock_bh(&inst->lock); + inst->flushtimeout = timeout; + spin_unlock_bh(&inst->lock); + + return 0; +} + +static int +nfulnl_set_qthresh(struct nfulnl_instance *inst, u_int32_t qthresh) +{ + spin_lock_bh(&inst->lock); + inst->qthreshold = qthresh; + spin_unlock_bh(&inst->lock); + + return 0; +} + +static struct sk_buff *nfulnl_alloc_skb(unsigned int inst_size, + unsigned int pkt_size) +{ + struct sk_buff *skb; + + UDEBUG("entered (%u, %u)\n", inst_size, pkt_size); + + /* alloc skb which should be big enough for a whole multipart + * message. WARNING: has to be <= 128k due to slab restrictions */ + + skb = alloc_skb(inst_size, GFP_ATOMIC); + if (!skb) { + PRINTR("nfnetlink_log: can't alloc whole buffer (%u bytes)\n", + inst_size); + + /* try to allocate only as much as we need for current + * packet */ + + skb = alloc_skb(pkt_size, GFP_ATOMIC); + if (!skb) + PRINTR("nfnetlink_log: can't even alloc %u bytes\n", + pkt_size); + } + + return skb; +} + +static int +__nfulnl_send(struct nfulnl_instance *inst) +{ + int status; + + if (timer_pending(&inst->timer)) + del_timer(&inst->timer); + + if (inst->qlen > 1) + inst->lastnlh->nlmsg_type = NLMSG_DONE; + + status = nfnetlink_unicast(inst->skb, inst->peer_pid, MSG_DONTWAIT); + if (status < 0) { + UDEBUG("netlink_unicast() failed\n"); + /* FIXME: statistics */ + } + + inst->qlen = 0; + inst->skb = NULL; + inst->lastnlh = NULL; + + return status; +} + +static void nfulnl_timer(unsigned long data) +{ + struct nfulnl_instance *inst = (struct nfulnl_instance *)data; + + UDEBUG("timer function called, flushing buffer\n"); + + spin_lock_bh(&inst->lock); + __nfulnl_send(inst); + instance_put(inst); + spin_unlock_bh(&inst->lock); +} + +static inline int +__build_packet_message(struct nfulnl_instance *inst, + const struct sk_buff *skb, + unsigned int data_len, + unsigned int pf, + unsigned int hooknum, + const struct net_device *indev, + const struct net_device *outdev, + const struct nf_loginfo *li, + const char *prefix) +{ + unsigned char *old_tail; + struct nfulnl_msg_packet_hdr pmsg; + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + u_int32_t tmp_uint; + + UDEBUG("entered\n"); + + old_tail = inst->skb->tail; + nlh = NLMSG_PUT(inst->skb, 0, 0, + NFNL_SUBSYS_ULOG << 8 | NFULNL_MSG_PACKET, + sizeof(struct nfgenmsg)); + nfmsg = NLMSG_DATA(nlh); + nfmsg->nfgen_family = pf; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = htons(inst->group_num); + + pmsg.hw_protocol = htons(skb->protocol); + pmsg.hook = hooknum; + + NFA_PUT(inst->skb, NFULA_PACKET_HDR, sizeof(pmsg), &pmsg); + + if (prefix) { + int slen = strlen(prefix); + if (slen > NFULNL_PREFIXLEN) + slen = NFULNL_PREFIXLEN; + NFA_PUT(inst->skb, NFULA_PREFIX, slen, prefix); + } + + if (indev) { + tmp_uint = htonl(indev->ifindex); +#ifndef CONFIG_BRIDGE_NETFILTER + NFA_PUT(inst->skb, NFULA_IFINDEX_INDEV, sizeof(tmp_uint), + &tmp_uint); +#else + if (pf == PF_BRIDGE) { + /* Case 1: outdev is physical input device, we need to + * look for bridge group (when called from + * netfilter_bridge) */ + NFA_PUT(inst->skb, NFULA_IFINDEX_PHYSINDEV, + sizeof(tmp_uint), &tmp_uint); + /* this is the bridge group "brX" */ + tmp_uint = htonl(indev->br_port->br->dev->ifindex); + NFA_PUT(inst->skb, NFULA_IFINDEX_INDEV, + sizeof(tmp_uint), &tmp_uint); + } else { + /* Case 2: indev is bridge group, we need to look for + * physical device (when called from ipv4) */ + NFA_PUT(inst->skb, NFULA_IFINDEX_INDEV, + sizeof(tmp_uint), &tmp_uint); + if (skb->nf_bridge && skb->nf_bridge->physindev) { + tmp_uint = + htonl(skb->nf_bridge->physindev->ifindex); + NFA_PUT(inst->skb, NFULA_IFINDEX_PHYSINDEV, + sizeof(tmp_uint), &tmp_uint); + } + } +#endif + } + + if (outdev) { + tmp_uint = htonl(outdev->ifindex); +#ifndef CONFIG_BRIDGE_NETFILTER + NFA_PUT(inst->skb, NFULA_IFINDEX_OUTDEV, sizeof(tmp_uint), + &tmp_uint); +#else + if (pf == PF_BRIDGE) { + /* Case 1: outdev is physical output device, we need to + * look for bridge group (when called from + * netfilter_bridge) */ + NFA_PUT(inst->skb, NFULA_IFINDEX_PHYSOUTDEV, + sizeof(tmp_uint), &tmp_uint); + /* this is the bridge group "brX" */ + tmp_uint = htonl(outdev->br_port->br->dev->ifindex); + NFA_PUT(inst->skb, NFULA_IFINDEX_OUTDEV, + sizeof(tmp_uint), &tmp_uint); + } else { + /* Case 2: indev is a bridge group, we need to look + * for physical device (when called from ipv4) */ + NFA_PUT(inst->skb, NFULA_IFINDEX_OUTDEV, + sizeof(tmp_uint), &tmp_uint); + if (skb->nf_bridge) { + tmp_uint = + htonl(skb->nf_bridge->physoutdev->ifindex); + NFA_PUT(inst->skb, NFULA_IFINDEX_PHYSOUTDEV, + sizeof(tmp_uint), &tmp_uint); + } + } +#endif + } + + if (skb->nfmark) { + tmp_uint = htonl(skb->nfmark); + NFA_PUT(inst->skb, NFULA_MARK, sizeof(tmp_uint), &tmp_uint); + } + + if (indev && skb->dev && skb->dev->hard_header_parse) { + struct nfulnl_msg_packet_hw phw; + + phw.hw_addrlen = + skb->dev->hard_header_parse((struct sk_buff *)skb, + phw.hw_addr); + phw.hw_addrlen = htons(phw.hw_addrlen); + NFA_PUT(inst->skb, NFULA_HWADDR, sizeof(phw), &phw); + } + + if (skb->tstamp.off_sec) { + struct nfulnl_msg_packet_timestamp ts; + + ts.sec = cpu_to_be64(skb_tv_base.tv_sec + skb->tstamp.off_sec); + ts.usec = cpu_to_be64(skb_tv_base.tv_usec + skb->tstamp.off_usec); + + NFA_PUT(inst->skb, NFULA_TIMESTAMP, sizeof(ts), &ts); + } + + /* UID */ + if (skb->sk) { + read_lock_bh(&skb->sk->sk_callback_lock); + if (skb->sk->sk_socket && skb->sk->sk_socket->file) { + u_int32_t uid = htonl(skb->sk->sk_socket->file->f_uid); + /* need to unlock here since NFA_PUT may goto */ + read_unlock_bh(&skb->sk->sk_callback_lock); + NFA_PUT(inst->skb, NFULA_UID, sizeof(uid), &uid); + } else + read_unlock_bh(&skb->sk->sk_callback_lock); + } + + if (data_len) { + struct nfattr *nfa; + int size = NFA_LENGTH(data_len); + + if (skb_tailroom(inst->skb) < (int)NFA_SPACE(data_len)) { + printk(KERN_WARNING "nfnetlink_log: no tailroom!\n"); + goto nlmsg_failure; + } + + nfa = (struct nfattr *)skb_put(inst->skb, NFA_ALIGN(size)); + nfa->nfa_type = NFULA_PAYLOAD; + nfa->nfa_len = size; + + if (skb_copy_bits(skb, 0, NFA_DATA(nfa), data_len)) + BUG(); + } + + nlh->nlmsg_len = inst->skb->tail - old_tail; + return 0; + +nlmsg_failure: + UDEBUG("nlmsg_failure\n"); +nfattr_failure: + PRINTR(KERN_ERR "nfnetlink_log: error creating log nlmsg\n"); + return -1; +} + +#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) + +static struct nf_loginfo default_loginfo = { + .type = NF_LOG_TYPE_ULOG, + .u = { + .ulog = { + .copy_len = 0xffff, + .group = 0, + .qthreshold = 1, + }, + }, +}; + +/* log handler for internal netfilter logging api */ +static void +nfulnl_log_packet(unsigned int pf, + unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *li_user, + const char *prefix) +{ + unsigned int size, data_len; + struct nfulnl_instance *inst; + const struct nf_loginfo *li; + unsigned int qthreshold; + unsigned int nlbufsiz; + + if (li_user && li_user->type == NF_LOG_TYPE_ULOG) + li = li_user; + else + li = &default_loginfo; + + inst = instance_lookup_get(li->u.ulog.group); + if (!inst) + inst = instance_lookup_get(0); + if (!inst) { + PRINTR("nfnetlink_log: trying to log packet, " + "but no instance for group %u\n", li->u.ulog.group); + return; + } + + /* all macros expand to constant values at compile time */ + /* FIXME: do we want to make the size calculation conditional based on + * what is actually present? way more branches and checks, but more + * memory efficient... */ + size = NLMSG_SPACE(sizeof(struct nfgenmsg)) + + NFA_SPACE(sizeof(struct nfulnl_msg_packet_hdr)) + + NFA_SPACE(sizeof(u_int32_t)) /* ifindex */ + + NFA_SPACE(sizeof(u_int32_t)) /* ifindex */ +#ifdef CONFIG_BRIDGE_NETFILTER + + NFA_SPACE(sizeof(u_int32_t)) /* ifindex */ + + NFA_SPACE(sizeof(u_int32_t)) /* ifindex */ +#endif + + NFA_SPACE(sizeof(u_int32_t)) /* mark */ + + NFA_SPACE(sizeof(u_int32_t)) /* uid */ + + NFA_SPACE(NFULNL_PREFIXLEN) /* prefix */ + + NFA_SPACE(sizeof(struct nfulnl_msg_packet_hw)) + + NFA_SPACE(sizeof(struct nfulnl_msg_packet_timestamp)); + + UDEBUG("initial size=%u\n", size); + + spin_lock_bh(&inst->lock); + + qthreshold = inst->qthreshold; + /* per-rule qthreshold overrides per-instance */ + if (qthreshold > li->u.ulog.qthreshold) + qthreshold = li->u.ulog.qthreshold; + + switch (inst->copy_mode) { + case NFULNL_COPY_META: + case NFULNL_COPY_NONE: + data_len = 0; + break; + + case NFULNL_COPY_PACKET: + if (inst->copy_range == 0 + || inst->copy_range > skb->len) + data_len = skb->len; + else + data_len = inst->copy_range; + + size += NFA_SPACE(data_len); + UDEBUG("copy_packet, therefore size now %u\n", size); + break; + + default: + spin_unlock_bh(&inst->lock); + instance_put(inst); + return; + } + + if (size > inst->nlbufsiz) + nlbufsiz = size; + else + nlbufsiz = inst->nlbufsiz; + + if (!inst->skb) { + if (!(inst->skb = nfulnl_alloc_skb(nlbufsiz, size))) { + UDEBUG("error in nfulnl_alloc_skb(%u, %u)\n", + inst->nlbufsiz, size); + goto alloc_failure; + } + } else if (inst->qlen >= qthreshold || + size > skb_tailroom(inst->skb)) { + /* either the queue len is too high or we don't have + * enough room in the skb left. flush to userspace. */ + UDEBUG("flushing old skb\n"); + + __nfulnl_send(inst); + + if (!(inst->skb = nfulnl_alloc_skb(nlbufsiz, size))) { + UDEBUG("error in nfulnl_alloc_skb(%u, %u)\n", + inst->nlbufsiz, size); + goto alloc_failure; + } + } + + UDEBUG("qlen %d, qthreshold %d\n", inst->qlen, qthreshold); + inst->qlen++; + + __build_packet_message(inst, skb, data_len, pf, + hooknum, in, out, li, prefix); + + /* timer_pending always called within inst->lock, so there + * is no chance of a race here */ + if (!timer_pending(&inst->timer)) { + instance_get(inst); + inst->timer.expires = jiffies + (inst->flushtimeout*HZ/100); + add_timer(&inst->timer); + } + spin_unlock_bh(&inst->lock); + + return; + +alloc_failure: + spin_unlock_bh(&inst->lock); + instance_put(inst); + UDEBUG("error allocating skb\n"); + /* FIXME: statistics */ +} + +static int +nfulnl_rcv_nl_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct netlink_notify *n = ptr; + + if (event == NETLINK_URELEASE && + n->protocol == NETLINK_NETFILTER && n->pid) { + int i; + + /* destroy all instances for this pid */ + write_lock_bh(&instances_lock); + for (i = 0; i < INSTANCE_BUCKETS; i++) { + struct hlist_node *tmp, *t2; + struct nfulnl_instance *inst; + struct hlist_head *head = &instance_table[i]; + + hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) { + UDEBUG("node = %p\n", inst); + if (n->pid == inst->peer_pid) + __instance_destroy(inst); + } + } + write_unlock_bh(&instances_lock); + } + return NOTIFY_DONE; +} + +static struct notifier_block nfulnl_rtnl_notifier = { + .notifier_call = nfulnl_rcv_nl_event, +}; + +static int +nfulnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp) +{ + return -ENOTSUPP; +} + +static struct nf_logger nfulnl_logger = { + .name = "nfnetlink_log", + .logfn = &nfulnl_log_packet, + .me = THIS_MODULE, +}; + +static const int nfula_min[NFULA_MAX] = { + [NFULA_PACKET_HDR-1] = sizeof(struct nfulnl_msg_packet_hdr), + [NFULA_MARK-1] = sizeof(u_int32_t), + [NFULA_TIMESTAMP-1] = sizeof(struct nfulnl_msg_packet_timestamp), + [NFULA_IFINDEX_INDEV-1] = sizeof(u_int32_t), + [NFULA_IFINDEX_OUTDEV-1]= sizeof(u_int32_t), + [NFULA_HWADDR-1] = sizeof(struct nfulnl_msg_packet_hw), + [NFULA_PAYLOAD-1] = 0, + [NFULA_PREFIX-1] = 0, + [NFULA_UID-1] = sizeof(u_int32_t), +}; + +static const int nfula_cfg_min[NFULA_CFG_MAX] = { + [NFULA_CFG_CMD-1] = sizeof(struct nfulnl_msg_config_cmd), + [NFULA_CFG_MODE-1] = sizeof(struct nfulnl_msg_config_mode), + [NFULA_CFG_TIMEOUT-1] = sizeof(u_int32_t), + [NFULA_CFG_QTHRESH-1] = sizeof(u_int32_t), + [NFULA_CFG_NLBUFSIZ-1] = sizeof(u_int32_t), +}; + +static int +nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *nfula[], int *errp) +{ + struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); + u_int16_t group_num = ntohs(nfmsg->res_id); + struct nfulnl_instance *inst; + int ret = 0; + + UDEBUG("entering for msg %u\n", NFNL_MSG_TYPE(nlh->nlmsg_type)); + + if (nfattr_bad_size(nfula, NFULA_CFG_MAX, nfula_cfg_min)) { + UDEBUG("bad attribute size\n"); + return -EINVAL; + } + + inst = instance_lookup_get(group_num); + if (nfula[NFULA_CFG_CMD-1]) { + u_int8_t pf = nfmsg->nfgen_family; + struct nfulnl_msg_config_cmd *cmd; + cmd = NFA_DATA(nfula[NFULA_CFG_CMD-1]); + UDEBUG("found CFG_CMD for\n"); + + switch (cmd->command) { + case NFULNL_CFG_CMD_BIND: + if (inst) { + ret = -EBUSY; + goto out_put; + } + + inst = instance_create(group_num, + NETLINK_CB(skb).pid); + if (!inst) { + ret = -EINVAL; + goto out_put; + } + break; + case NFULNL_CFG_CMD_UNBIND: + if (!inst) { + ret = -ENODEV; + goto out_put; + } + + if (inst->peer_pid != NETLINK_CB(skb).pid) { + ret = -EPERM; + goto out_put; + } + + instance_destroy(inst); + break; + case NFULNL_CFG_CMD_PF_BIND: + UDEBUG("registering log handler for pf=%u\n", pf); + ret = nf_log_register(pf, &nfulnl_logger); + break; + case NFULNL_CFG_CMD_PF_UNBIND: + UDEBUG("unregistering log handler for pf=%u\n", pf); + /* This is a bug and a feature. We cannot unregister + * other handlers, like nfnetlink_inst can */ + nf_log_unregister_pf(pf); + break; + default: + ret = -EINVAL; + break; + } + } else { + if (!inst) { + UDEBUG("no config command, and no instance for " + "group=%u pid=%u =>ENOENT\n", + group_num, NETLINK_CB(skb).pid); + ret = -ENOENT; + goto out_put; + } + + if (inst->peer_pid != NETLINK_CB(skb).pid) { + UDEBUG("no config command, and wrong pid\n"); + ret = -EPERM; + goto out_put; + } + } + + if (nfula[NFULA_CFG_MODE-1]) { + struct nfulnl_msg_config_mode *params; + params = NFA_DATA(nfula[NFULA_CFG_MODE-1]); + + nfulnl_set_mode(inst, params->copy_mode, + ntohs(params->copy_range)); + } + + if (nfula[NFULA_CFG_TIMEOUT-1]) { + u_int32_t timeout = + *(u_int32_t *)NFA_DATA(nfula[NFULA_CFG_TIMEOUT-1]); + + nfulnl_set_timeout(inst, ntohl(timeout)); + } + + if (nfula[NFULA_CFG_NLBUFSIZ-1]) { + u_int32_t nlbufsiz = + *(u_int32_t *)NFA_DATA(nfula[NFULA_CFG_NLBUFSIZ-1]); + + nfulnl_set_nlbufsiz(inst, ntohl(nlbufsiz)); + } + + if (nfula[NFULA_CFG_QTHRESH-1]) { + u_int32_t qthresh = + *(u_int16_t *)NFA_DATA(nfula[NFULA_CFG_QTHRESH-1]); + + nfulnl_set_qthresh(inst, ntohl(qthresh)); + } + +out_put: + instance_put(inst); + return ret; +} + +static struct nfnl_callback nfulnl_cb[NFULNL_MSG_MAX] = { + [NFULNL_MSG_PACKET] = { .call = nfulnl_recv_unsupp, + .attr_count = NFULA_MAX, + .cap_required = CAP_NET_ADMIN, }, + [NFULNL_MSG_CONFIG] = { .call = nfulnl_recv_config, + .attr_count = NFULA_CFG_MAX, + .cap_required = CAP_NET_ADMIN }, +}; + +static struct nfnetlink_subsystem nfulnl_subsys = { + .name = "log", + .subsys_id = NFNL_SUBSYS_ULOG, + .cb_count = NFULNL_MSG_MAX, + .cb = nfulnl_cb, +}; + +#ifdef CONFIG_PROC_FS +struct iter_state { + unsigned int bucket; +}; + +static struct hlist_node *get_first(struct seq_file *seq) +{ + struct iter_state *st = seq->private; + + if (!st) + return NULL; + + for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) { + if (!hlist_empty(&instance_table[st->bucket])) + return instance_table[st->bucket].first; + } + return NULL; +} + +static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h) +{ + struct iter_state *st = seq->private; + + h = h->next; + while (!h) { + if (++st->bucket >= INSTANCE_BUCKETS) + return NULL; + + h = instance_table[st->bucket].first; + } + return h; +} + +static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos) +{ + struct hlist_node *head; + head = get_first(seq); + + if (head) + while (pos && (head = get_next(seq, head))) + pos--; + return pos ? NULL : head; +} + +static void *seq_start(struct seq_file *seq, loff_t *pos) +{ + read_lock_bh(&instances_lock); + return get_idx(seq, *pos); +} + +static void *seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + (*pos)++; + return get_next(s, v); +} + +static void seq_stop(struct seq_file *s, void *v) +{ + read_unlock_bh(&instances_lock); +} + +static int seq_show(struct seq_file *s, void *v) +{ + const struct nfulnl_instance *inst = v; + + return seq_printf(s, "%5d %6d %5d %1d %5d %6d %2d\n", + inst->group_num, + inst->peer_pid, inst->qlen, + inst->copy_mode, inst->copy_range, + inst->flushtimeout, atomic_read(&inst->use)); +} + +static struct seq_operations nful_seq_ops = { + .start = seq_start, + .next = seq_next, + .stop = seq_stop, + .show = seq_show, +}; + +static int nful_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + struct iter_state *is; + int ret; + + is = kmalloc(sizeof(*is), GFP_KERNEL); + if (!is) + return -ENOMEM; + memset(is, 0, sizeof(*is)); + ret = seq_open(file, &nful_seq_ops); + if (ret < 0) + goto out_free; + seq = file->private_data; + seq->private = is; + return ret; +out_free: + kfree(is); + return ret; +} + +static struct file_operations nful_file_ops = { + .owner = THIS_MODULE, + .open = nful_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +#endif /* PROC_FS */ + +static int +init_or_cleanup(int init) +{ + int i, status = -ENOMEM; +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *proc_nful; +#endif + + if (!init) + goto cleanup; + + for (i = 0; i < INSTANCE_BUCKETS; i++) + INIT_HLIST_HEAD(&instance_table[i]); + + /* it's not really all that important to have a random value, so + * we can do this from the init function, even if there hasn't + * been that much entropy yet */ + get_random_bytes(&hash_init, sizeof(hash_init)); + + netlink_register_notifier(&nfulnl_rtnl_notifier); + status = nfnetlink_subsys_register(&nfulnl_subsys); + if (status < 0) { + printk(KERN_ERR "log: failed to create netlink socket\n"); + goto cleanup_netlink_notifier; + } + +#ifdef CONFIG_PROC_FS + proc_nful = create_proc_entry("nfnetlink_log", 0440, + proc_net_netfilter); + if (!proc_nful) + goto cleanup_subsys; + proc_nful->proc_fops = &nful_file_ops; +#endif + + return status; + +cleanup: + nf_log_unregister_logger(&nfulnl_logger); +#ifdef CONFIG_PROC_FS + remove_proc_entry("nfnetlink_log", proc_net_netfilter); +cleanup_subsys: +#endif + nfnetlink_subsys_unregister(&nfulnl_subsys); +cleanup_netlink_notifier: + netlink_unregister_notifier(&nfulnl_rtnl_notifier); + return status; +} + +static int __init init(void) +{ + + return init_or_cleanup(1); +} + +static void __exit fini(void) +{ + init_or_cleanup(0); +} + +MODULE_DESCRIPTION("netfilter userspace logging"); +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ULOG); + +module_init(init); +module_exit(fini); diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c new file mode 100644 index 00000000000..e3a5285329a --- /dev/null +++ b/net/netfilter/nfnetlink_queue.c @@ -0,0 +1,1132 @@ +/* + * This is a module which is used for queueing packets and communicating with + * userspace via nfetlink. + * + * (C) 2005 by Harald Welte <laforge@netfilter.org> + * + * Based on the old ipv4-only ip_queue.c: + * (C) 2000-2002 James Morris <jmorris@intercode.com.au> + * (C) 2003-2005 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/init.h> +#include <linux/spinlock.h> +#include <linux/notifier.h> +#include <linux/netdevice.h> +#include <linux/netfilter.h> +#include <linux/proc_fs.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_queue.h> +#include <linux/list.h> +#include <net/sock.h> + +#include <asm/atomic.h> + +#ifdef CONFIG_BRIDGE_NETFILTER +#include "../bridge/br_private.h" +#endif + +#define NFQNL_QMAX_DEFAULT 1024 + +#if 0 +#define QDEBUG(x, args ...) printk(KERN_DEBUG "%s(%d):%s(): " x, \ + __FILE__, __LINE__, __FUNCTION__, \ + ## args) +#else +#define QDEBUG(x, ...) +#endif + +struct nfqnl_queue_entry { + struct list_head list; + struct nf_info *info; + struct sk_buff *skb; + unsigned int id; +}; + +struct nfqnl_instance { + struct hlist_node hlist; /* global list of queues */ + atomic_t use; + + int peer_pid; + unsigned int queue_maxlen; + unsigned int copy_range; + unsigned int queue_total; + unsigned int queue_dropped; + unsigned int queue_user_dropped; + + atomic_t id_sequence; /* 'sequence' of pkt ids */ + + u_int16_t queue_num; /* number of this queue */ + u_int8_t copy_mode; + + spinlock_t lock; + + struct list_head queue_list; /* packets in queue */ +}; + +typedef int (*nfqnl_cmpfn)(struct nfqnl_queue_entry *, unsigned long); + +static DEFINE_RWLOCK(instances_lock); + +u_int64_t htonll(u_int64_t in) +{ + u_int64_t out; + int i; + + for (i = 0; i < sizeof(u_int64_t); i++) + ((u_int8_t *)&out)[sizeof(u_int64_t)-1] = ((u_int8_t *)&in)[i]; + + return out; +} + +#define INSTANCE_BUCKETS 16 +static struct hlist_head instance_table[INSTANCE_BUCKETS]; + +static inline u_int8_t instance_hashfn(u_int16_t queue_num) +{ + return ((queue_num >> 8) | queue_num) % INSTANCE_BUCKETS; +} + +static struct nfqnl_instance * +__instance_lookup(u_int16_t queue_num) +{ + struct hlist_head *head; + struct hlist_node *pos; + struct nfqnl_instance *inst; + + head = &instance_table[instance_hashfn(queue_num)]; + hlist_for_each_entry(inst, pos, head, hlist) { + if (inst->queue_num == queue_num) + return inst; + } + return NULL; +} + +static struct nfqnl_instance * +instance_lookup_get(u_int16_t queue_num) +{ + struct nfqnl_instance *inst; + + read_lock_bh(&instances_lock); + inst = __instance_lookup(queue_num); + if (inst) + atomic_inc(&inst->use); + read_unlock_bh(&instances_lock); + + return inst; +} + +static void +instance_put(struct nfqnl_instance *inst) +{ + if (inst && atomic_dec_and_test(&inst->use)) { + QDEBUG("kfree(inst=%p)\n", inst); + kfree(inst); + } +} + +static struct nfqnl_instance * +instance_create(u_int16_t queue_num, int pid) +{ + struct nfqnl_instance *inst; + + QDEBUG("entering for queue_num=%u, pid=%d\n", queue_num, pid); + + write_lock_bh(&instances_lock); + if (__instance_lookup(queue_num)) { + inst = NULL; + QDEBUG("aborting, instance already exists\n"); + goto out_unlock; + } + + inst = kmalloc(sizeof(*inst), GFP_ATOMIC); + if (!inst) + goto out_unlock; + + memset(inst, 0, sizeof(*inst)); + inst->queue_num = queue_num; + inst->peer_pid = pid; + inst->queue_maxlen = NFQNL_QMAX_DEFAULT; + inst->copy_range = 0xfffff; + inst->copy_mode = NFQNL_COPY_NONE; + atomic_set(&inst->id_sequence, 0); + /* needs to be two, since we _put() after creation */ + atomic_set(&inst->use, 2); + inst->lock = SPIN_LOCK_UNLOCKED; + INIT_LIST_HEAD(&inst->queue_list); + + if (!try_module_get(THIS_MODULE)) + goto out_free; + + hlist_add_head(&inst->hlist, + &instance_table[instance_hashfn(queue_num)]); + + write_unlock_bh(&instances_lock); + + QDEBUG("successfully created new instance\n"); + + return inst; + +out_free: + kfree(inst); +out_unlock: + write_unlock_bh(&instances_lock); + return NULL; +} + +static void nfqnl_flush(struct nfqnl_instance *queue, int verdict); + +static void +_instance_destroy2(struct nfqnl_instance *inst, int lock) +{ + /* first pull it out of the global list */ + if (lock) + write_lock_bh(&instances_lock); + + QDEBUG("removing instance %p (queuenum=%u) from hash\n", + inst, inst->queue_num); + hlist_del(&inst->hlist); + + if (lock) + write_unlock_bh(&instances_lock); + + /* then flush all pending skbs from the queue */ + nfqnl_flush(inst, NF_DROP); + + /* and finally put the refcount */ + instance_put(inst); + + module_put(THIS_MODULE); +} + +static inline void +__instance_destroy(struct nfqnl_instance *inst) +{ + _instance_destroy2(inst, 0); +} + +static inline void +instance_destroy(struct nfqnl_instance *inst) +{ + _instance_destroy2(inst, 1); +} + + + +static void +issue_verdict(struct nfqnl_queue_entry *entry, int verdict) +{ + QDEBUG("entering for entry %p, verdict %u\n", entry, verdict); + + /* TCP input path (and probably other bits) assume to be called + * from softirq context, not from syscall, like issue_verdict is + * called. TCP input path deadlocks with locks taken from timer + * softirq, e.g. We therefore emulate this by local_bh_disable() */ + + local_bh_disable(); + nf_reinject(entry->skb, entry->info, verdict); + local_bh_enable(); + + kfree(entry); +} + +static inline void +__enqueue_entry(struct nfqnl_instance *queue, + struct nfqnl_queue_entry *entry) +{ + list_add(&entry->list, &queue->queue_list); + queue->queue_total++; +} + +/* + * Find and return a queued entry matched by cmpfn, or return the last + * entry if cmpfn is NULL. + */ +static inline struct nfqnl_queue_entry * +__find_entry(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, + unsigned long data) +{ + struct list_head *p; + + list_for_each_prev(p, &queue->queue_list) { + struct nfqnl_queue_entry *entry = (struct nfqnl_queue_entry *)p; + + if (!cmpfn || cmpfn(entry, data)) + return entry; + } + return NULL; +} + +static inline void +__dequeue_entry(struct nfqnl_instance *q, struct nfqnl_queue_entry *entry) +{ + list_del(&entry->list); + q->queue_total--; +} + +static inline struct nfqnl_queue_entry * +__find_dequeue_entry(struct nfqnl_instance *queue, + nfqnl_cmpfn cmpfn, unsigned long data) +{ + struct nfqnl_queue_entry *entry; + + entry = __find_entry(queue, cmpfn, data); + if (entry == NULL) + return NULL; + + __dequeue_entry(queue, entry); + return entry; +} + + +static inline void +__nfqnl_flush(struct nfqnl_instance *queue, int verdict) +{ + struct nfqnl_queue_entry *entry; + + while ((entry = __find_dequeue_entry(queue, NULL, 0))) + issue_verdict(entry, verdict); +} + +static inline int +__nfqnl_set_mode(struct nfqnl_instance *queue, + unsigned char mode, unsigned int range) +{ + int status = 0; + + switch (mode) { + case NFQNL_COPY_NONE: + case NFQNL_COPY_META: + queue->copy_mode = mode; + queue->copy_range = 0; + break; + + case NFQNL_COPY_PACKET: + queue->copy_mode = mode; + /* we're using struct nfattr which has 16bit nfa_len */ + if (range > 0xffff) + queue->copy_range = 0xffff; + else + queue->copy_range = range; + break; + + default: + status = -EINVAL; + + } + return status; +} + +static struct nfqnl_queue_entry * +find_dequeue_entry(struct nfqnl_instance *queue, + nfqnl_cmpfn cmpfn, unsigned long data) +{ + struct nfqnl_queue_entry *entry; + + spin_lock_bh(&queue->lock); + entry = __find_dequeue_entry(queue, cmpfn, data); + spin_unlock_bh(&queue->lock); + + return entry; +} + +static void +nfqnl_flush(struct nfqnl_instance *queue, int verdict) +{ + spin_lock_bh(&queue->lock); + __nfqnl_flush(queue, verdict); + spin_unlock_bh(&queue->lock); +} + +static struct sk_buff * +nfqnl_build_packet_message(struct nfqnl_instance *queue, + struct nfqnl_queue_entry *entry, int *errp) +{ + unsigned char *old_tail; + size_t size; + size_t data_len = 0; + struct sk_buff *skb; + struct nfqnl_msg_packet_hdr pmsg; + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + unsigned int tmp_uint; + + QDEBUG("entered\n"); + + /* all macros expand to constant values at compile time */ + size = NLMSG_SPACE(sizeof(struct nfqnl_msg_packet_hdr)) + + NLMSG_SPACE(sizeof(u_int32_t)) /* ifindex */ + + NLMSG_SPACE(sizeof(u_int32_t)) /* ifindex */ +#ifdef CONFIG_BRIDGE_NETFILTER + + NLMSG_SPACE(sizeof(u_int32_t)) /* ifindex */ + + NLMSG_SPACE(sizeof(u_int32_t)) /* ifindex */ +#endif + + NLMSG_SPACE(sizeof(u_int32_t)) /* mark */ + + NLMSG_SPACE(sizeof(struct nfqnl_msg_packet_hw)) + + NLMSG_SPACE(sizeof(struct nfqnl_msg_packet_timestamp)); + + spin_lock_bh(&queue->lock); + + switch (queue->copy_mode) { + case NFQNL_COPY_META: + case NFQNL_COPY_NONE: + data_len = 0; + break; + + case NFQNL_COPY_PACKET: + if (queue->copy_range == 0 + || queue->copy_range > entry->skb->len) + data_len = entry->skb->len; + else + data_len = queue->copy_range; + + size += NLMSG_SPACE(data_len); + break; + + default: + *errp = -EINVAL; + spin_unlock_bh(&queue->lock); + return NULL; + } + + spin_unlock_bh(&queue->lock); + + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) + goto nlmsg_failure; + + old_tail= skb->tail; + nlh = NLMSG_PUT(skb, 0, 0, + NFNL_SUBSYS_QUEUE << 8 | NFQNL_MSG_PACKET, + sizeof(struct nfgenmsg)); + nfmsg = NLMSG_DATA(nlh); + nfmsg->nfgen_family = entry->info->pf; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = htons(queue->queue_num); + + pmsg.packet_id = htonl(entry->id); + pmsg.hw_protocol = htons(entry->skb->protocol); + pmsg.hook = entry->info->hook; + + NFA_PUT(skb, NFQA_PACKET_HDR, sizeof(pmsg), &pmsg); + + if (entry->info->indev) { + tmp_uint = htonl(entry->info->indev->ifindex); +#ifndef CONFIG_BRIDGE_NETFILTER + NFA_PUT(skb, NFQA_IFINDEX_INDEV, sizeof(tmp_uint), &tmp_uint); +#else + if (entry->info->pf == PF_BRIDGE) { + /* Case 1: indev is physical input device, we need to + * look for bridge group (when called from + * netfilter_bridge) */ + NFA_PUT(skb, NFQA_IFINDEX_PHYSINDEV, sizeof(tmp_uint), + &tmp_uint); + /* this is the bridge group "brX" */ + tmp_uint = htonl(entry->info->indev->br_port->br->dev->ifindex); + NFA_PUT(skb, NFQA_IFINDEX_INDEV, sizeof(tmp_uint), + &tmp_uint); + } else { + /* Case 2: indev is bridge group, we need to look for + * physical device (when called from ipv4) */ + NFA_PUT(skb, NFQA_IFINDEX_INDEV, sizeof(tmp_uint), + &tmp_uint); + if (entry->skb->nf_bridge + && entry->skb->nf_bridge->physindev) { + tmp_uint = htonl(entry->skb->nf_bridge->physindev->ifindex); + NFA_PUT(skb, NFQA_IFINDEX_PHYSINDEV, + sizeof(tmp_uint), &tmp_uint); + } + } +#endif + } + + if (entry->info->outdev) { + tmp_uint = htonl(entry->info->outdev->ifindex); +#ifndef CONFIG_BRIDGE_NETFILTER + NFA_PUT(skb, NFQA_IFINDEX_OUTDEV, sizeof(tmp_uint), &tmp_uint); +#else + if (entry->info->pf == PF_BRIDGE) { + /* Case 1: outdev is physical output device, we need to + * look for bridge group (when called from + * netfilter_bridge) */ + NFA_PUT(skb, NFQA_IFINDEX_PHYSOUTDEV, sizeof(tmp_uint), + &tmp_uint); + /* this is the bridge group "brX" */ + tmp_uint = htonl(entry->info->outdev->br_port->br->dev->ifindex); + NFA_PUT(skb, NFQA_IFINDEX_OUTDEV, sizeof(tmp_uint), + &tmp_uint); + } else { + /* Case 2: outdev is bridge group, we need to look for + * physical output device (when called from ipv4) */ + NFA_PUT(skb, NFQA_IFINDEX_OUTDEV, sizeof(tmp_uint), + &tmp_uint); + if (entry->skb->nf_bridge + && entry->skb->nf_bridge->physoutdev) { + tmp_uint = htonl(entry->skb->nf_bridge->physoutdev->ifindex); + NFA_PUT(skb, NFQA_IFINDEX_PHYSOUTDEV, + sizeof(tmp_uint), &tmp_uint); + } + } +#endif + } + + if (entry->skb->nfmark) { + tmp_uint = htonl(entry->skb->nfmark); + NFA_PUT(skb, NFQA_MARK, sizeof(u_int32_t), &tmp_uint); + } + + if (entry->info->indev && entry->skb->dev + && entry->skb->dev->hard_header_parse) { + struct nfqnl_msg_packet_hw phw; + + phw.hw_addrlen = + entry->skb->dev->hard_header_parse(entry->skb, + phw.hw_addr); + phw.hw_addrlen = htons(phw.hw_addrlen); + NFA_PUT(skb, NFQA_HWADDR, sizeof(phw), &phw); + } + + if (entry->skb->tstamp.off_sec) { + struct nfqnl_msg_packet_timestamp ts; + + ts.sec = htonll(skb_tv_base.tv_sec + entry->skb->tstamp.off_sec); + ts.usec = htonll(skb_tv_base.tv_usec + entry->skb->tstamp.off_usec); + + NFA_PUT(skb, NFQA_TIMESTAMP, sizeof(ts), &ts); + } + + if (data_len) { + struct nfattr *nfa; + int size = NFA_LENGTH(data_len); + + if (skb_tailroom(skb) < (int)NFA_SPACE(data_len)) { + printk(KERN_WARNING "nf_queue: no tailroom!\n"); + goto nlmsg_failure; + } + + nfa = (struct nfattr *)skb_put(skb, NFA_ALIGN(size)); + nfa->nfa_type = NFQA_PAYLOAD; + nfa->nfa_len = size; + + if (skb_copy_bits(entry->skb, 0, NFA_DATA(nfa), data_len)) + BUG(); + } + + nlh->nlmsg_len = skb->tail - old_tail; + return skb; + +nlmsg_failure: +nfattr_failure: + if (skb) + kfree_skb(skb); + *errp = -EINVAL; + if (net_ratelimit()) + printk(KERN_ERR "nf_queue: error creating packet message\n"); + return NULL; +} + +static int +nfqnl_enqueue_packet(struct sk_buff *skb, struct nf_info *info, + unsigned int queuenum, void *data) +{ + int status = -EINVAL; + struct sk_buff *nskb; + struct nfqnl_instance *queue; + struct nfqnl_queue_entry *entry; + + QDEBUG("entered\n"); + + queue = instance_lookup_get(queuenum); + if (!queue) { + QDEBUG("no queue instance matching\n"); + return -EINVAL; + } + + if (queue->copy_mode == NFQNL_COPY_NONE) { + QDEBUG("mode COPY_NONE, aborting\n"); + status = -EAGAIN; + goto err_out_put; + } + + entry = kmalloc(sizeof(*entry), GFP_ATOMIC); + if (entry == NULL) { + if (net_ratelimit()) + printk(KERN_ERR + "nf_queue: OOM in nfqnl_enqueue_packet()\n"); + status = -ENOMEM; + goto err_out_put; + } + + entry->info = info; + entry->skb = skb; + entry->id = atomic_inc_return(&queue->id_sequence); + + nskb = nfqnl_build_packet_message(queue, entry, &status); + if (nskb == NULL) + goto err_out_free; + + spin_lock_bh(&queue->lock); + + if (!queue->peer_pid) + goto err_out_free_nskb; + + if (queue->queue_total >= queue->queue_maxlen) { + queue->queue_dropped++; + status = -ENOSPC; + if (net_ratelimit()) + printk(KERN_WARNING "ip_queue: full at %d entries, " + "dropping packets(s). Dropped: %d\n", + queue->queue_total, queue->queue_dropped); + goto err_out_free_nskb; + } + + /* nfnetlink_unicast will either free the nskb or add it to a socket */ + status = nfnetlink_unicast(nskb, queue->peer_pid, MSG_DONTWAIT); + if (status < 0) { + queue->queue_user_dropped++; + goto err_out_unlock; + } + + __enqueue_entry(queue, entry); + + spin_unlock_bh(&queue->lock); + instance_put(queue); + return status; + +err_out_free_nskb: + kfree_skb(nskb); + +err_out_unlock: + spin_unlock_bh(&queue->lock); + +err_out_free: + kfree(entry); +err_out_put: + instance_put(queue); + return status; +} + +static int +nfqnl_mangle(void *data, int data_len, struct nfqnl_queue_entry *e) +{ + int diff; + + diff = data_len - e->skb->len; + if (diff < 0) + skb_trim(e->skb, data_len); + else if (diff > 0) { + if (data_len > 0xFFFF) + return -EINVAL; + if (diff > skb_tailroom(e->skb)) { + struct sk_buff *newskb; + + newskb = skb_copy_expand(e->skb, + skb_headroom(e->skb), + diff, + GFP_ATOMIC); + if (newskb == NULL) { + printk(KERN_WARNING "ip_queue: OOM " + "in mangle, dropping packet\n"); + return -ENOMEM; + } + if (e->skb->sk) + skb_set_owner_w(newskb, e->skb->sk); + kfree_skb(e->skb); + e->skb = newskb; + } + skb_put(e->skb, diff); + } + if (!skb_make_writable(&e->skb, data_len)) + return -ENOMEM; + memcpy(e->skb->data, data, data_len); + + return 0; +} + +static inline int +id_cmp(struct nfqnl_queue_entry *e, unsigned long id) +{ + return (id == e->id); +} + +static int +nfqnl_set_mode(struct nfqnl_instance *queue, + unsigned char mode, unsigned int range) +{ + int status; + + spin_lock_bh(&queue->lock); + status = __nfqnl_set_mode(queue, mode, range); + spin_unlock_bh(&queue->lock); + + return status; +} + +static int +dev_cmp(struct nfqnl_queue_entry *entry, unsigned long ifindex) +{ + if (entry->info->indev) + if (entry->info->indev->ifindex == ifindex) + return 1; + + if (entry->info->outdev) + if (entry->info->outdev->ifindex == ifindex) + return 1; + + return 0; +} + +/* drop all packets with either indev or outdev == ifindex from all queue + * instances */ +static void +nfqnl_dev_drop(int ifindex) +{ + int i; + + QDEBUG("entering for ifindex %u\n", ifindex); + + /* this only looks like we have to hold the readlock for a way too long + * time, issue_verdict(), nf_reinject(), ... - but we always only + * issue NF_DROP, which is processed directly in nf_reinject() */ + read_lock_bh(&instances_lock); + + for (i = 0; i < INSTANCE_BUCKETS; i++) { + struct hlist_node *tmp; + struct nfqnl_instance *inst; + struct hlist_head *head = &instance_table[i]; + + hlist_for_each_entry(inst, tmp, head, hlist) { + struct nfqnl_queue_entry *entry; + while ((entry = find_dequeue_entry(inst, dev_cmp, + ifindex)) != NULL) + issue_verdict(entry, NF_DROP); + } + } + + read_unlock_bh(&instances_lock); +} + +#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) + +static int +nfqnl_rcv_dev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + + /* Drop any packets associated with the downed device */ + if (event == NETDEV_DOWN) + nfqnl_dev_drop(dev->ifindex); + return NOTIFY_DONE; +} + +static struct notifier_block nfqnl_dev_notifier = { + .notifier_call = nfqnl_rcv_dev_event, +}; + +static int +nfqnl_rcv_nl_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct netlink_notify *n = ptr; + + if (event == NETLINK_URELEASE && + n->protocol == NETLINK_NETFILTER && n->pid) { + int i; + + /* destroy all instances for this pid */ + write_lock_bh(&instances_lock); + for (i = 0; i < INSTANCE_BUCKETS; i++) { + struct hlist_node *tmp, *t2; + struct nfqnl_instance *inst; + struct hlist_head *head = &instance_table[i]; + + hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) { + if (n->pid == inst->peer_pid) + __instance_destroy(inst); + } + } + write_unlock_bh(&instances_lock); + } + return NOTIFY_DONE; +} + +static struct notifier_block nfqnl_rtnl_notifier = { + .notifier_call = nfqnl_rcv_nl_event, +}; + +static const int nfqa_verdict_min[NFQA_MAX] = { + [NFQA_VERDICT_HDR-1] = sizeof(struct nfqnl_msg_verdict_hdr), + [NFQA_MARK-1] = sizeof(u_int32_t), + [NFQA_PAYLOAD-1] = 0, +}; + +static int +nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp) +{ + struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); + u_int16_t queue_num = ntohs(nfmsg->res_id); + + struct nfqnl_msg_verdict_hdr *vhdr; + struct nfqnl_instance *queue; + unsigned int verdict; + struct nfqnl_queue_entry *entry; + int err; + + if (nfattr_bad_size(nfqa, NFQA_MAX, nfqa_verdict_min)) { + QDEBUG("bad attribute size\n"); + return -EINVAL; + } + + queue = instance_lookup_get(queue_num); + if (!queue) + return -ENODEV; + + if (queue->peer_pid != NETLINK_CB(skb).pid) { + err = -EPERM; + goto err_out_put; + } + + if (!nfqa[NFQA_VERDICT_HDR-1]) { + err = -EINVAL; + goto err_out_put; + } + + vhdr = NFA_DATA(nfqa[NFQA_VERDICT_HDR-1]); + verdict = ntohl(vhdr->verdict); + + if ((verdict & NF_VERDICT_MASK) > NF_MAX_VERDICT) { + err = -EINVAL; + goto err_out_put; + } + + entry = find_dequeue_entry(queue, id_cmp, ntohl(vhdr->id)); + if (entry == NULL) { + err = -ENOENT; + goto err_out_put; + } + + if (nfqa[NFQA_PAYLOAD-1]) { + if (nfqnl_mangle(NFA_DATA(nfqa[NFQA_PAYLOAD-1]), + NFA_PAYLOAD(nfqa[NFQA_PAYLOAD-1]), entry) < 0) + verdict = NF_DROP; + } + + if (nfqa[NFQA_MARK-1]) + skb->nfmark = ntohl(*(u_int32_t *)NFA_DATA(nfqa[NFQA_MARK-1])); + + issue_verdict(entry, verdict); + instance_put(queue); + return 0; + +err_out_put: + instance_put(queue); + return err; +} + +static int +nfqnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp) +{ + return -ENOTSUPP; +} + +static const int nfqa_cfg_min[NFQA_CFG_MAX] = { + [NFQA_CFG_CMD-1] = sizeof(struct nfqnl_msg_config_cmd), + [NFQA_CFG_PARAMS-1] = sizeof(struct nfqnl_msg_config_params), +}; + +static struct nf_queue_handler nfqh = { + .name = "nf_queue", + .outfn = &nfqnl_enqueue_packet, +}; + +static int +nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp) +{ + struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); + u_int16_t queue_num = ntohs(nfmsg->res_id); + struct nfqnl_instance *queue; + int ret = 0; + + QDEBUG("entering for msg %u\n", NFNL_MSG_TYPE(nlh->nlmsg_type)); + + if (nfattr_bad_size(nfqa, NFQA_CFG_MAX, nfqa_cfg_min)) { + QDEBUG("bad attribute size\n"); + return -EINVAL; + } + + queue = instance_lookup_get(queue_num); + if (nfqa[NFQA_CFG_CMD-1]) { + struct nfqnl_msg_config_cmd *cmd; + cmd = NFA_DATA(nfqa[NFQA_CFG_CMD-1]); + QDEBUG("found CFG_CMD\n"); + + switch (cmd->command) { + case NFQNL_CFG_CMD_BIND: + if (queue) + return -EBUSY; + + queue = instance_create(queue_num, NETLINK_CB(skb).pid); + if (!queue) + return -EINVAL; + break; + case NFQNL_CFG_CMD_UNBIND: + if (!queue) + return -ENODEV; + + if (queue->peer_pid != NETLINK_CB(skb).pid) { + ret = -EPERM; + goto out_put; + } + + instance_destroy(queue); + break; + case NFQNL_CFG_CMD_PF_BIND: + QDEBUG("registering queue handler for pf=%u\n", + ntohs(cmd->pf)); + ret = nf_register_queue_handler(ntohs(cmd->pf), &nfqh); + break; + case NFQNL_CFG_CMD_PF_UNBIND: + QDEBUG("unregistering queue handler for pf=%u\n", + ntohs(cmd->pf)); + /* This is a bug and a feature. We can unregister + * other handlers(!) */ + ret = nf_unregister_queue_handler(ntohs(cmd->pf)); + break; + default: + ret = -EINVAL; + break; + } + } else { + if (!queue) { + QDEBUG("no config command, and no instance ENOENT\n"); + ret = -ENOENT; + goto out_put; + } + + if (queue->peer_pid != NETLINK_CB(skb).pid) { + QDEBUG("no config command, and wrong pid\n"); + ret = -EPERM; + goto out_put; + } + } + + if (nfqa[NFQA_CFG_PARAMS-1]) { + struct nfqnl_msg_config_params *params; + params = NFA_DATA(nfqa[NFQA_CFG_PARAMS-1]); + + nfqnl_set_mode(queue, params->copy_mode, + ntohl(params->copy_range)); + } + +out_put: + instance_put(queue); + return ret; +} + +static struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = { + [NFQNL_MSG_PACKET] = { .call = nfqnl_recv_unsupp, + .attr_count = NFQA_MAX, + .cap_required = CAP_NET_ADMIN }, + [NFQNL_MSG_VERDICT] = { .call = nfqnl_recv_verdict, + .attr_count = NFQA_MAX, + .cap_required = CAP_NET_ADMIN }, + [NFQNL_MSG_CONFIG] = { .call = nfqnl_recv_config, + .attr_count = NFQA_CFG_MAX, + .cap_required = CAP_NET_ADMIN }, +}; + +static struct nfnetlink_subsystem nfqnl_subsys = { + .name = "nf_queue", + .subsys_id = NFNL_SUBSYS_QUEUE, + .cb_count = NFQNL_MSG_MAX, + .cb = nfqnl_cb, +}; + +#ifdef CONFIG_PROC_FS +struct iter_state { + unsigned int bucket; +}; + +static struct hlist_node *get_first(struct seq_file *seq) +{ + struct iter_state *st = seq->private; + + if (!st) + return NULL; + + for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) { + if (!hlist_empty(&instance_table[st->bucket])) + return instance_table[st->bucket].first; + } + return NULL; +} + +static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h) +{ + struct iter_state *st = seq->private; + + h = h->next; + while (!h) { + if (++st->bucket >= INSTANCE_BUCKETS) + return NULL; + + h = instance_table[st->bucket].first; + } + return h; +} + +static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos) +{ + struct hlist_node *head; + head = get_first(seq); + + if (head) + while (pos && (head = get_next(seq, head))) + pos--; + return pos ? NULL : head; +} + +static void *seq_start(struct seq_file *seq, loff_t *pos) +{ + read_lock_bh(&instances_lock); + return get_idx(seq, *pos); +} + +static void *seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + (*pos)++; + return get_next(s, v); +} + +static void seq_stop(struct seq_file *s, void *v) +{ + read_unlock_bh(&instances_lock); +} + +static int seq_show(struct seq_file *s, void *v) +{ + const struct nfqnl_instance *inst = v; + + return seq_printf(s, "%5d %6d %5d %1d %5d %5d %5d %8d %2d\n", + inst->queue_num, + inst->peer_pid, inst->queue_total, + inst->copy_mode, inst->copy_range, + inst->queue_dropped, inst->queue_user_dropped, + atomic_read(&inst->id_sequence), + atomic_read(&inst->use)); +} + +static struct seq_operations nfqnl_seq_ops = { + .start = seq_start, + .next = seq_next, + .stop = seq_stop, + .show = seq_show, +}; + +static int nfqnl_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + struct iter_state *is; + int ret; + + is = kmalloc(sizeof(*is), GFP_KERNEL); + if (!is) + return -ENOMEM; + memset(is, 0, sizeof(*is)); + ret = seq_open(file, &nfqnl_seq_ops); + if (ret < 0) + goto out_free; + seq = file->private_data; + seq->private = is; + return ret; +out_free: + kfree(is); + return ret; +} + +static struct file_operations nfqnl_file_ops = { + .owner = THIS_MODULE, + .open = nfqnl_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +#endif /* PROC_FS */ + +static int +init_or_cleanup(int init) +{ + int i, status = -ENOMEM; +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *proc_nfqueue; +#endif + + if (!init) + goto cleanup; + + for (i = 0; i < INSTANCE_BUCKETS; i++) + INIT_HLIST_HEAD(&instance_table[i]); + + netlink_register_notifier(&nfqnl_rtnl_notifier); + status = nfnetlink_subsys_register(&nfqnl_subsys); + if (status < 0) { + printk(KERN_ERR "nf_queue: failed to create netlink socket\n"); + goto cleanup_netlink_notifier; + } + +#ifdef CONFIG_PROC_FS + proc_nfqueue = create_proc_entry("nfnetlink_queue", 0440, + proc_net_netfilter); + if (!proc_nfqueue) + goto cleanup_subsys; + proc_nfqueue->proc_fops = &nfqnl_file_ops; +#endif + + register_netdevice_notifier(&nfqnl_dev_notifier); + + return status; + +cleanup: + nf_unregister_queue_handlers(&nfqh); + unregister_netdevice_notifier(&nfqnl_dev_notifier); +#ifdef CONFIG_PROC_FS + remove_proc_entry("nfnetlink_queue", proc_net_netfilter); +cleanup_subsys: +#endif + nfnetlink_subsys_unregister(&nfqnl_subsys); +cleanup_netlink_notifier: + netlink_unregister_notifier(&nfqnl_rtnl_notifier); + return status; +} + +static int __init init(void) +{ + + return init_or_cleanup(1); +} + +static void __exit fini(void) +{ + init_or_cleanup(0); +} + +MODULE_DESCRIPTION("netfilter packet queue handler"); +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_QUEUE); + +module_init(init); +module_exit(fini); diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index ff774a06c89..62435ffc618 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -13,7 +13,12 @@ * added netlink_proto_exit * Tue Jan 22 18:32:44 BRST 2002 Arnaldo C. de Melo <acme@conectiva.com.br> * use nlk_sk, as sk->protinfo is on a diet 8) - * + * Fri Jul 22 19:51:12 MEST 2005 Harald Welte <laforge@gnumonks.org> + * - inc module use count of module that owns + * the kernel socket in case userspace opens + * socket of same protocol + * - remove all module support, since netlink is + * mandatory if CONFIG_NET=y these days */ #include <linux/config.h> @@ -55,21 +60,29 @@ #include <net/scm.h> #define Nprintk(a...) +#define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) struct netlink_sock { /* struct sock has to be the first member of netlink_sock */ struct sock sk; u32 pid; - unsigned int groups; u32 dst_pid; - unsigned int dst_groups; + u32 dst_group; + u32 flags; + u32 subscriptions; + u32 ngroups; + unsigned long *groups; unsigned long state; wait_queue_head_t wait; struct netlink_callback *cb; spinlock_t cb_lock; void (*data_ready)(struct sock *sk, int bytes); + struct module *module; }; +#define NETLINK_KERNEL_SOCKET 0x1 +#define NETLINK_RECV_PKTINFO 0x2 + static inline struct netlink_sock *nlk_sk(struct sock *sk) { return (struct netlink_sock *)sk; @@ -92,6 +105,9 @@ struct netlink_table { struct nl_pid_hash hash; struct hlist_head mc_list; unsigned int nl_nonroot; + unsigned int groups; + struct module *module; + int registered; }; static struct netlink_table *nl_table; @@ -106,6 +122,11 @@ static atomic_t nl_table_users = ATOMIC_INIT(0); static struct notifier_block *netlink_chain; +static u32 netlink_group_mask(u32 group) +{ + return group ? 1 << (group - 1) : 0; +} + static struct hlist_head *nl_pid_hashfn(struct nl_pid_hash *hash, u32 pid) { return &hash->table[jhash_1word(pid, hash->rnd) & hash->mask]; @@ -122,6 +143,7 @@ static void netlink_sock_destruct(struct sock *sk) BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc)); BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc)); BUG_TRAP(!nlk_sk(sk)->cb); + BUG_TRAP(!nlk_sk(sk)->groups); } /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on SMP. @@ -317,7 +339,7 @@ static void netlink_remove(struct sock *sk) netlink_table_grab(); if (sk_del_node_init(sk)) nl_table[sk->sk_protocol].hash.entries--; - if (nlk_sk(sk)->groups) + if (nlk_sk(sk)->subscriptions) __sk_del_bind_node(sk); netlink_table_ungrab(); } @@ -328,19 +350,11 @@ static struct proto netlink_proto = { .obj_size = sizeof(struct netlink_sock), }; -static int netlink_create(struct socket *sock, int protocol) +static int __netlink_create(struct socket *sock, int protocol) { struct sock *sk; struct netlink_sock *nlk; - sock->state = SS_UNCONNECTED; - - if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) - return -ESOCKTNOSUPPORT; - - if (protocol<0 || protocol >= MAX_LINKS) - return -EPROTONOSUPPORT; - sock->ops = &netlink_ops; sk = sk_alloc(PF_NETLINK, GFP_KERNEL, &netlink_proto, 1); @@ -350,15 +364,67 @@ static int netlink_create(struct socket *sock, int protocol) sock_init_data(sock, sk); nlk = nlk_sk(sk); - spin_lock_init(&nlk->cb_lock); init_waitqueue_head(&nlk->wait); - sk->sk_destruct = netlink_sock_destruct; + sk->sk_destruct = netlink_sock_destruct; sk->sk_protocol = protocol; return 0; } +static int netlink_create(struct socket *sock, int protocol) +{ + struct module *module = NULL; + struct netlink_sock *nlk; + unsigned int groups; + int err = 0; + + sock->state = SS_UNCONNECTED; + + if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) + return -ESOCKTNOSUPPORT; + + if (protocol<0 || protocol >= MAX_LINKS) + return -EPROTONOSUPPORT; + + netlink_lock_table(); +#ifdef CONFIG_KMOD + if (!nl_table[protocol].registered) { + netlink_unlock_table(); + request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol); + netlink_lock_table(); + } +#endif + if (nl_table[protocol].registered && + try_module_get(nl_table[protocol].module)) + module = nl_table[protocol].module; + else + err = -EPROTONOSUPPORT; + groups = nl_table[protocol].groups; + netlink_unlock_table(); + + if (err || (err = __netlink_create(sock, protocol) < 0)) + goto out_module; + + nlk = nlk_sk(sock->sk); + + nlk->groups = kmalloc(NLGRPSZ(groups), GFP_KERNEL); + if (nlk->groups == NULL) { + err = -ENOMEM; + goto out_module; + } + memset(nlk->groups, 0, NLGRPSZ(groups)); + nlk->ngroups = groups; + + nlk->module = module; +out: + return err; + +out_module: + module_put(module); + goto out; +} + static int netlink_release(struct socket *sock) { struct sock *sk = sock->sk; @@ -387,14 +453,27 @@ static int netlink_release(struct socket *sock) skb_queue_purge(&sk->sk_write_queue); - if (nlk->pid && !nlk->groups) { + if (nlk->pid && !nlk->subscriptions) { struct netlink_notify n = { .protocol = sk->sk_protocol, .pid = nlk->pid, }; notifier_call_chain(&netlink_chain, NETLINK_URELEASE, &n); } - + + if (nlk->module) + module_put(nlk->module); + + if (nlk->flags & NETLINK_KERNEL_SOCKET) { + netlink_table_grab(); + nl_table[sk->sk_protocol].module = NULL; + nl_table[sk->sk_protocol].registered = 0; + netlink_table_ungrab(); + } + + kfree(nlk->groups); + nlk->groups = NULL; + sock_put(sk); return 0; } @@ -443,6 +522,18 @@ static inline int netlink_capable(struct socket *sock, unsigned int flag) capable(CAP_NET_ADMIN); } +static void +netlink_update_subscriptions(struct sock *sk, unsigned int subscriptions) +{ + struct netlink_sock *nlk = nlk_sk(sk); + + if (nlk->subscriptions && !subscriptions) + __sk_del_bind_node(sk); + else if (!nlk->subscriptions && subscriptions) + sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list); + nlk->subscriptions = subscriptions; +} + static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sock *sk = sock->sk; @@ -468,15 +559,14 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len return err; } - if (!nladdr->nl_groups && !nlk->groups) + if (!nladdr->nl_groups && !(u32)nlk->groups[0]) return 0; netlink_table_grab(); - if (nlk->groups && !nladdr->nl_groups) - __sk_del_bind_node(sk); - else if (!nlk->groups && nladdr->nl_groups) - sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list); - nlk->groups = nladdr->nl_groups; + netlink_update_subscriptions(sk, nlk->subscriptions + + hweight32(nladdr->nl_groups) - + hweight32(nlk->groups[0])); + nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | nladdr->nl_groups; netlink_table_ungrab(); return 0; @@ -493,7 +583,7 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr, if (addr->sa_family == AF_UNSPEC) { sk->sk_state = NETLINK_UNCONNECTED; nlk->dst_pid = 0; - nlk->dst_groups = 0; + nlk->dst_group = 0; return 0; } if (addr->sa_family != AF_NETLINK) @@ -509,7 +599,7 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr, if (err == 0) { sk->sk_state = NETLINK_CONNECTED; nlk->dst_pid = nladdr->nl_pid; - nlk->dst_groups = nladdr->nl_groups; + nlk->dst_group = ffs(nladdr->nl_groups); } return err; @@ -527,10 +617,10 @@ static int netlink_getname(struct socket *sock, struct sockaddr *addr, int *addr if (peer) { nladdr->nl_pid = nlk->dst_pid; - nladdr->nl_groups = nlk->dst_groups; + nladdr->nl_groups = netlink_group_mask(nlk->dst_group); } else { nladdr->nl_pid = nlk->pid; - nladdr->nl_groups = nlk->groups; + nladdr->nl_groups = nlk->groups[0]; } return 0; } @@ -731,7 +821,8 @@ static inline int do_one_broadcast(struct sock *sk, if (p->exclude_sk == sk) goto out; - if (nlk->pid == p->pid || !(nlk->groups & p->group)) + if (nlk->pid == p->pid || p->group - 1 >= nlk->ngroups || + !test_bit(p->group - 1, nlk->groups)) goto out; if (p->failure) { @@ -770,7 +861,7 @@ out: } int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid, - u32 group, int allocation) + u32 group, unsigned int __nocast allocation) { struct netlink_broadcast_data info; struct hlist_node *node; @@ -827,7 +918,8 @@ static inline int do_one_set_err(struct sock *sk, if (sk == p->exclude_sk) goto out; - if (nlk->pid == p->pid || !(nlk->groups & p->group)) + if (nlk->pid == p->pid || p->group - 1 >= nlk->ngroups || + !test_bit(p->group - 1, nlk->groups)) goto out; sk->sk_err = p->code; @@ -855,6 +947,94 @@ void netlink_set_err(struct sock *ssk, u32 pid, u32 group, int code) read_unlock(&nl_table_lock); } +static int netlink_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, int optlen) +{ + struct sock *sk = sock->sk; + struct netlink_sock *nlk = nlk_sk(sk); + int val = 0, err; + + if (level != SOL_NETLINK) + return -ENOPROTOOPT; + + if (optlen >= sizeof(int) && + get_user(val, (int __user *)optval)) + return -EFAULT; + + switch (optname) { + case NETLINK_PKTINFO: + if (val) + nlk->flags |= NETLINK_RECV_PKTINFO; + else + nlk->flags &= ~NETLINK_RECV_PKTINFO; + err = 0; + break; + case NETLINK_ADD_MEMBERSHIP: + case NETLINK_DROP_MEMBERSHIP: { + unsigned int subscriptions; + int old, new = optname == NETLINK_ADD_MEMBERSHIP ? 1 : 0; + + if (!netlink_capable(sock, NL_NONROOT_RECV)) + return -EPERM; + if (!val || val - 1 >= nlk->ngroups) + return -EINVAL; + netlink_table_grab(); + old = test_bit(val - 1, nlk->groups); + subscriptions = nlk->subscriptions - old + new; + if (new) + __set_bit(val - 1, nlk->groups); + else + __clear_bit(val - 1, nlk->groups); + netlink_update_subscriptions(sk, subscriptions); + netlink_table_ungrab(); + err = 0; + break; + } + default: + err = -ENOPROTOOPT; + } + return err; +} + +static int netlink_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + struct netlink_sock *nlk = nlk_sk(sk); + int len, val, err; + + if (level != SOL_NETLINK) + return -ENOPROTOOPT; + + if (get_user(len, optlen)) + return -EFAULT; + if (len < 0) + return -EINVAL; + + switch (optname) { + case NETLINK_PKTINFO: + if (len < sizeof(int)) + return -EINVAL; + len = sizeof(int); + val = nlk->flags & NETLINK_RECV_PKTINFO ? 1 : 0; + put_user(len, optlen); + put_user(val, optval); + err = 0; + break; + default: + err = -ENOPROTOOPT; + } + return err; +} + +static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) +{ + struct nl_pktinfo info; + + info.group = NETLINK_CB(skb).dst_group; + put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info); +} + static inline void netlink_rcv_wake(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); @@ -873,7 +1053,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, struct netlink_sock *nlk = nlk_sk(sk); struct sockaddr_nl *addr=msg->msg_name; u32 dst_pid; - u32 dst_groups; + u32 dst_group; struct sk_buff *skb; int err; struct scm_cookie scm; @@ -891,12 +1071,12 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, if (addr->nl_family != AF_NETLINK) return -EINVAL; dst_pid = addr->nl_pid; - dst_groups = addr->nl_groups; - if (dst_groups && !netlink_capable(sock, NL_NONROOT_SEND)) + dst_group = ffs(addr->nl_groups); + if (dst_group && !netlink_capable(sock, NL_NONROOT_SEND)) return -EPERM; } else { dst_pid = nlk->dst_pid; - dst_groups = nlk->dst_groups; + dst_group = nlk->dst_group; } if (!nlk->pid) { @@ -914,9 +1094,8 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, goto out; NETLINK_CB(skb).pid = nlk->pid; - NETLINK_CB(skb).groups = nlk->groups; NETLINK_CB(skb).dst_pid = dst_pid; - NETLINK_CB(skb).dst_groups = dst_groups; + NETLINK_CB(skb).dst_group = dst_group; NETLINK_CB(skb).loginuid = audit_get_loginuid(current->audit_context); memcpy(NETLINK_CREDS(skb), &siocb->scm->creds, sizeof(struct ucred)); @@ -938,9 +1117,9 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, goto out; } - if (dst_groups) { + if (dst_group) { atomic_inc(&skb->users); - netlink_broadcast(sk, skb, dst_pid, dst_groups, GFP_KERNEL); + netlink_broadcast(sk, skb, dst_pid, dst_group, GFP_KERNEL); } err = netlink_unicast(sk, skb, dst_pid, msg->msg_flags&MSG_DONTWAIT); @@ -986,7 +1165,7 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, addr->nl_family = AF_NETLINK; addr->nl_pad = 0; addr->nl_pid = NETLINK_CB(skb).pid; - addr->nl_groups = NETLINK_CB(skb).dst_groups; + addr->nl_groups = netlink_group_mask(NETLINK_CB(skb).dst_group); msg->msg_namelen = sizeof(*addr); } @@ -1001,6 +1180,8 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, netlink_dump(sk); scm_recv(sock, msg, siocb->scm, flags); + if (nlk->flags & NETLINK_RECV_PKTINFO) + netlink_cmsg_recv_pktinfo(msg, skb); out: netlink_rcv_wake(sk); @@ -1023,10 +1204,13 @@ static void netlink_data_ready(struct sock *sk, int len) */ struct sock * -netlink_kernel_create(int unit, void (*input)(struct sock *sk, int len)) +netlink_kernel_create(int unit, unsigned int groups, + void (*input)(struct sock *sk, int len), + struct module *module) { struct socket *sock; struct sock *sk; + struct netlink_sock *nlk; if (!nl_table) return NULL; @@ -1037,20 +1221,31 @@ netlink_kernel_create(int unit, void (*input)(struct sock *sk, int len)) if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock)) return NULL; - if (netlink_create(sock, unit) < 0) { - sock_release(sock); - return NULL; - } + if (__netlink_create(sock, unit) < 0) + goto out_sock_release; + sk = sock->sk; sk->sk_data_ready = netlink_data_ready; if (input) nlk_sk(sk)->data_ready = input; - if (netlink_insert(sk, 0)) { - sock_release(sock); - return NULL; - } + if (netlink_insert(sk, 0)) + goto out_sock_release; + + nlk = nlk_sk(sk); + nlk->flags |= NETLINK_KERNEL_SOCKET; + + netlink_table_grab(); + nl_table[unit].groups = groups < 32 ? 32 : groups; + nl_table[unit].module = module; + nl_table[unit].registered = 1; + netlink_table_ungrab(); + return sk; + +out_sock_release: + sock_release(sock); + return NULL; } void netlink_set_nonroot(int protocol, unsigned int flags) @@ -1288,7 +1483,8 @@ static int netlink_seq_show(struct seq_file *seq, void *v) s, s->sk_protocol, nlk->pid, - nlk->groups, + nlk->flags & NETLINK_KERNEL_SOCKET ? + 0 : (unsigned int)nlk->groups[0], atomic_read(&s->sk_rmem_alloc), atomic_read(&s->sk_wmem_alloc), nlk->cb, @@ -1362,8 +1558,8 @@ static struct proto_ops netlink_ops = { .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, - .setsockopt = sock_no_setsockopt, - .getsockopt = sock_no_getsockopt, + .setsockopt = netlink_setsockopt, + .getsockopt = netlink_getsockopt, .sendmsg = netlink_sendmsg, .recvmsg = netlink_recvmsg, .mmap = sock_no_mmap, @@ -1438,21 +1634,7 @@ out: return err; } -static void __exit netlink_proto_exit(void) -{ - sock_unregister(PF_NETLINK); - proc_net_remove("netlink"); - kfree(nl_table); - nl_table = NULL; - proto_unregister(&netlink_proto); -} - core_initcall(netlink_proto_init); -module_exit(netlink_proto_exit); - -MODULE_LICENSE("GPL"); - -MODULE_ALIAS_NETPROTO(PF_NETLINK); EXPORT_SYMBOL(netlink_ack); EXPORT_SYMBOL(netlink_broadcast); diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 162a85fed15..4b53de98211 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -39,7 +39,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <net/ip.h> -#include <net/tcp.h> +#include <net/tcp_states.h> #include <net/arp.h> #include <linux/init.h> @@ -858,17 +858,16 @@ int nr_rx_frame(struct sk_buff *skb, struct net_device *dev) frametype = skb->data[19] & 0x0F; flags = skb->data[19] & 0xF0; -#ifdef CONFIG_INET /* * Check for an incoming IP over NET/ROM frame. */ - if (frametype == NR_PROTOEXT && circuit_index == NR_PROTO_IP && circuit_id == NR_PROTO_IP) { + if (frametype == NR_PROTOEXT && + circuit_index == NR_PROTO_IP && circuit_id == NR_PROTO_IP) { skb_pull(skb, NR_NETWORK_LEN + NR_TRANSPORT_LEN); skb->h.raw = skb->data; return nr_rx_ip(skb, dev); } -#endif /* * Find an existing socket connection, based on circuit ID, if it's diff --git a/net/netrom/nr_dev.c b/net/netrom/nr_dev.c index 220bf7494f7..263da4c2649 100644 --- a/net/netrom/nr_dev.c +++ b/net/netrom/nr_dev.c @@ -38,8 +38,6 @@ #include <net/ax25.h> #include <net/netrom.h> -#ifdef CONFIG_INET - /* * Only allow IP over NET/ROM frames through if the netrom device is up. */ @@ -64,11 +62,12 @@ int nr_rx_ip(struct sk_buff *skb, struct net_device *dev) skb->nh.raw = skb->data; skb->pkt_type = PACKET_HOST; - ip_rcv(skb, skb->dev, NULL); + netif_rx(skb); return 1; } +#ifdef CONFIG_INET static int nr_rebuild_header(struct sk_buff *skb) { diff --git a/net/netrom/nr_in.c b/net/netrom/nr_in.c index 9c44b379412..64b81a79690 100644 --- a/net/netrom/nr_in.c +++ b/net/netrom/nr_in.c @@ -22,8 +22,7 @@ #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/sock.h> -#include <net/tcp.h> -#include <net/ip.h> /* For ip_rcv */ +#include <net/tcp_states.h> #include <asm/uaccess.h> #include <asm/system.h> #include <linux/fcntl.h> diff --git a/net/netrom/nr_subr.c b/net/netrom/nr_subr.c index 0627347b14b..587bed2674b 100644 --- a/net/netrom/nr_subr.c +++ b/net/netrom/nr_subr.c @@ -21,7 +21,7 @@ #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/sock.h> -#include <net/tcp.h> +#include <net/tcp_states.h> #include <asm/uaccess.h> #include <asm/system.h> #include <linux/fcntl.h> @@ -77,7 +77,7 @@ void nr_requeue_frames(struct sock *sk) if (skb_prev == NULL) skb_queue_head(&sk->sk_write_queue, skb); else - skb_append(skb_prev, skb); + skb_append(skb_prev, skb, &sk->sk_write_queue); skb_prev = skb; } } diff --git a/net/netrom/nr_timer.c b/net/netrom/nr_timer.c index faabda8088b..75b72d389ba 100644 --- a/net/netrom/nr_timer.c +++ b/net/netrom/nr_timer.c @@ -22,7 +22,7 @@ #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/sock.h> -#include <net/tcp.h> +#include <net/tcp_states.h> #include <asm/uaccess.h> #include <asm/system.h> #include <linux/fcntl.h> diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index c9d5980aa4d..ba997095f08 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -241,7 +241,7 @@ static struct proto_ops packet_ops; #ifdef CONFIG_SOCK_PACKET static struct proto_ops packet_ops_spkt; -static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct sock *sk; struct sockaddr_pkt *spkt; @@ -441,7 +441,7 @@ static inline unsigned run_filter(struct sk_buff *skb, struct sock *sk, unsigned we will not harm anyone. */ -static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct sock *sk; struct sockaddr_ll *sll; @@ -546,7 +546,7 @@ drop: } #ifdef CONFIG_PACKET_MMAP -static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct sock *sk; struct packet_sock *po; @@ -635,12 +635,12 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct pack h->tp_snaplen = snaplen; h->tp_mac = macoff; h->tp_net = netoff; - if (skb->stamp.tv_sec == 0) { - do_gettimeofday(&skb->stamp); + if (skb->tstamp.off_sec == 0) { + __net_timestamp(skb); sock_enable_timestamp(sk); } - h->tp_sec = skb->stamp.tv_sec; - h->tp_usec = skb->stamp.tv_usec; + h->tp_sec = skb_tv_base.tv_sec + skb->tstamp.off_sec; + h->tp_usec = skb_tv_base.tv_usec + skb->tstamp.off_usec; sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h))); sll->sll_halen = 0; diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 5480caf8ccc..c6e59f84c3a 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -41,7 +41,7 @@ #include <net/rose.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> -#include <net/tcp.h> +#include <net/tcp_states.h> #include <net/ip.h> #include <net/arp.h> diff --git a/net/rose/rose_in.c b/net/rose/rose_in.c index ef475a1bb1b..8348d33f1ef 100644 --- a/net/rose/rose_in.c +++ b/net/rose/rose_in.c @@ -26,8 +26,7 @@ #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/sock.h> -#include <net/ip.h> /* For ip_rcv */ -#include <net/tcp.h> +#include <net/tcp_states.h> #include <asm/system.h> #include <linux/fcntl.h> #include <linux/mm.h> diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index 25da6f699fd..4510cd7613e 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -24,7 +24,7 @@ #include <linux/if_arp.h> #include <linux/skbuff.h> #include <net/sock.h> -#include <net/tcp.h> +#include <net/tcp_states.h> #include <asm/system.h> #include <asm/uaccess.h> #include <linux/fcntl.h> diff --git a/net/rose/rose_subr.c b/net/rose/rose_subr.c index 7db7e1cedc3..a29a3a960fd 100644 --- a/net/rose/rose_subr.c +++ b/net/rose/rose_subr.c @@ -21,7 +21,7 @@ #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/sock.h> -#include <net/tcp.h> +#include <net/tcp_states.h> #include <asm/system.h> #include <linux/fcntl.h> #include <linux/mm.h> @@ -74,7 +74,7 @@ void rose_requeue_frames(struct sock *sk) if (skb_prev == NULL) skb_queue_head(&sk->sk_write_queue, skb); else - skb_append(skb_prev, skb); + skb_append(skb_prev, skb, &sk->sk_write_queue); skb_prev = skb; } } diff --git a/net/rose/rose_timer.c b/net/rose/rose_timer.c index 84dd4403f79..50ae0371dab 100644 --- a/net/rose/rose_timer.c +++ b/net/rose/rose_timer.c @@ -22,7 +22,7 @@ #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/sock.h> -#include <net/tcp.h> +#include <net/tcp_states.h> #include <asm/system.h> #include <linux/fcntl.h> #include <linux/mm.h> diff --git a/net/rxrpc/transport.c b/net/rxrpc/transport.c index 9bce7794130..122c086ee2d 100644 --- a/net/rxrpc/transport.c +++ b/net/rxrpc/transport.c @@ -330,7 +330,7 @@ static int rxrpc_incoming_msg(struct rxrpc_transport *trans, msg->trans = trans; msg->state = RXRPC_MSG_RECEIVED; - msg->stamp = pkt->stamp; + skb_get_timestamp(pkt, &msg->stamp); if (msg->stamp.tv_sec == 0) { do_gettimeofday(&msg->stamp); if (pkt->sk) diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 59d3e71f8b8..45d3bc0812c 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -491,6 +491,7 @@ config NET_EMATCH_TEXT depends on NET_EMATCH select TEXTSEARCH select TEXTSEARCH_KMP + select TEXTSEARCH_BM select TEXTSEARCH_FSM ---help--- Say Y here if you want to be ablt to classify packets based on diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 249c61936ea..8aebe8f6d27 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -165,7 +165,7 @@ int tcf_action_exec(struct sk_buff *skb, struct tc_action *act, while ((a = act) != NULL) { repeat: if (a->ops && a->ops->act) { - ret = a->ops->act(&skb, a); + ret = a->ops->act(&skb, a, res); if (TC_MUNGED & skb->tc_verd) { /* copied already, allow trampling */ skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); @@ -179,11 +179,6 @@ repeat: act = a->next; } exec_done: - if (skb->tc_classid > 0) { - res->classid = skb->tc_classid; - res->class = 0; - skb->tc_classid = 0; - } return ret; } @@ -598,7 +593,7 @@ static int tca_action_flush(struct rtattr *rta, struct nlmsghdr *n, u32 pid) nlh->nlmsg_flags |= NLM_F_ROOT; module_put(a->ops->owner); kfree(a); - err = rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); + err = rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); if (err > 0) return 0; @@ -661,7 +656,7 @@ tca_action_gd(struct rtattr *rta, struct nlmsghdr *n, u32 pid, int event) /* now do the delete */ tcf_action_destroy(head, 0); - ret = rtnetlink_send(skb, pid, RTMGRP_TC, + ret = rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); if (ret > 0) return 0; @@ -703,9 +698,9 @@ static int tcf_add_notify(struct tc_action *a, u32 pid, u32 seq, int event, x->rta_len = skb->tail - (u8*)x; nlh->nlmsg_len = skb->tail - b; - NETLINK_CB(skb).dst_groups = RTMGRP_TC; + NETLINK_CB(skb).dst_group = RTNLGRP_TC; - err = rtnetlink_send(skb, pid, RTMGRP_TC, flags&NLM_F_ECHO); + err = rtnetlink_send(skb, pid, RTNLGRP_TC, flags&NLM_F_ECHO); if (err > 0) err = 0; return err; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 3b5714ef4d1..b4d89fbb378 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -367,7 +367,7 @@ static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n, return -EINVAL; } - return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); + return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); } struct tcf_dump_args diff --git a/net/sched/gact.c b/net/sched/gact.c index a811c89fef7..d1c6d542912 100644 --- a/net/sched/gact.c +++ b/net/sched/gact.c @@ -135,7 +135,7 @@ tcf_gact_cleanup(struct tc_action *a, int bind) } static int -tcf_gact(struct sk_buff **pskb, struct tc_action *a) +tcf_gact(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res) { struct tcf_gact *p = PRIV(a, gact); struct sk_buff *skb = *pskb; diff --git a/net/sched/ipt.c b/net/sched/ipt.c index b114d994d52..f50136eed21 100644 --- a/net/sched/ipt.c +++ b/net/sched/ipt.c @@ -201,7 +201,7 @@ tcf_ipt_cleanup(struct tc_action *a, int bind) } static int -tcf_ipt(struct sk_buff **pskb, struct tc_action *a) +tcf_ipt(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res) { int ret = 0, result = 0; struct tcf_ipt *p = PRIV(a, ipt); diff --git a/net/sched/mirred.c b/net/sched/mirred.c index f309ce33680..20d06916dc0 100644 --- a/net/sched/mirred.c +++ b/net/sched/mirred.c @@ -158,7 +158,7 @@ tcf_mirred_cleanup(struct tc_action *a, int bind) } static int -tcf_mirred(struct sk_buff **pskb, struct tc_action *a) +tcf_mirred(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res) { struct tcf_mirred *p = PRIV(a, mirred); struct net_device *dev; diff --git a/net/sched/pedit.c b/net/sched/pedit.c index 678be6a645f..767d24f4610 100644 --- a/net/sched/pedit.c +++ b/net/sched/pedit.c @@ -130,7 +130,7 @@ tcf_pedit_cleanup(struct tc_action *a, int bind) } static int -tcf_pedit(struct sk_buff **pskb, struct tc_action *a) +tcf_pedit(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res) { struct tcf_pedit *p = PRIV(a, pedit); struct sk_buff *skb = *pskb; diff --git a/net/sched/police.c b/net/sched/police.c index c03545faf52..eb39fb2f39b 100644 --- a/net/sched/police.c +++ b/net/sched/police.c @@ -284,7 +284,8 @@ static int tcf_act_police_cleanup(struct tc_action *a, int bind) return 0; } -static int tcf_act_police(struct sk_buff **pskb, struct tc_action *a) +static int tcf_act_police(struct sk_buff **pskb, struct tc_action *a, + struct tcf_result *res) { psched_time_t now; struct sk_buff *skb = *pskb; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index b9a069af4a0..737681cb9a9 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -816,7 +816,7 @@ static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, } if (skb->len) - return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); + return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); err_out: kfree_skb(skb); @@ -1040,7 +1040,7 @@ static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n, return -EINVAL; } - return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); + return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); } struct qdisc_dump_args diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 0d066c96534..99ceb91f015 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -238,6 +238,20 @@ static void dev_watchdog_down(struct net_device *dev) spin_unlock_bh(&dev->xmit_lock); } +void netif_carrier_on(struct net_device *dev) +{ + if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) + linkwatch_fire_event(dev); + if (netif_running(dev)) + __netdev_watchdog_up(dev); +} + +void netif_carrier_off(struct net_device *dev) +{ + if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) + linkwatch_fire_event(dev); +} + /* "NOOP" scheduler: the best scheduler, recommended for all interfaces under all circumstances. It is difficult to invent anything faster or cheaper. @@ -600,6 +614,8 @@ void dev_shutdown(struct net_device *dev) } EXPORT_SYMBOL(__netdev_watchdog_up); +EXPORT_SYMBOL(netif_carrier_on); +EXPORT_SYMBOL(netif_carrier_off); EXPORT_SYMBOL(noop_qdisc); EXPORT_SYMBOL(noop_qdisc_ops); EXPORT_SYMBOL(qdisc_create_dflt); diff --git a/net/sched/simple.c b/net/sched/simple.c index 3ab4c675ab5..8a6ae4f491e 100644 --- a/net/sched/simple.c +++ b/net/sched/simple.c @@ -44,7 +44,7 @@ static DEFINE_RWLOCK(simp_lock); #include <net/pkt_act.h> #include <net/act_generic.h> -static int tcf_simp(struct sk_buff **pskb, struct tc_action *a) +static int tcf_simp(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res) { struct sk_buff *skb = *pskb; struct tcf_defact *p = PRIV(a, defact); diff --git a/net/sctp/input.c b/net/sctp/input.c index 742be9171b7..28f32243397 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -236,8 +236,8 @@ int sctp_rcv(struct sk_buff *skb) } /* SCTP seems to always need a timestamp right now (FIXME) */ - if (skb->stamp.tv_sec == 0) { - do_gettimeofday(&skb->stamp); + if (skb->tstamp.off_sec == 0) { + __net_timestamp(skb); sock_enable_timestamp(sk); } diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index e9b2fd480d6..fa3be2b8fb5 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -66,8 +66,8 @@ #include <linux/seq_file.h> #include <net/protocol.h> -#include <net/tcp.h> #include <net/ndisc.h> +#include <net/ip.h> #include <net/ipv6.h> #include <net/transp_v6.h> #include <net/addrconf.h> @@ -641,10 +641,7 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk, else newinet->pmtudisc = IP_PMTUDISC_WANT; -#ifdef INET_REFCNT_DEBUG - atomic_inc(&inet6_sock_nr); - atomic_inc(&inet_sock_nr); -#endif + sk_refcnt_debug_inc(newsk); if (newsk->sk_prot->init(newsk)) { sk_common_release(newsk); diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index ce9245e71fc..e7025be7769 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -62,7 +62,7 @@ /* Global data structures. */ struct sctp_globals sctp_globals; struct proc_dir_entry *proc_net_sctp; -DEFINE_SNMP_STAT(struct sctp_mib, sctp_statistics); +DEFINE_SNMP_STAT(struct sctp_mib, sctp_statistics) __read_mostly; struct idr sctp_assocs_id; DEFINE_SPINLOCK(sctp_assocs_id_lock); @@ -78,8 +78,8 @@ static struct sctp_pf *sctp_pf_inet_specific; static struct sctp_af *sctp_af_v4_specific; static struct sctp_af *sctp_af_v6_specific; -kmem_cache_t *sctp_chunk_cachep; -kmem_cache_t *sctp_bucket_cachep; +kmem_cache_t *sctp_chunk_cachep __read_mostly; +kmem_cache_t *sctp_bucket_cachep __read_mostly; extern int sctp_snmp_proc_init(void); extern int sctp_snmp_proc_exit(void); @@ -593,9 +593,7 @@ static struct sock *sctp_v4_create_accept_sk(struct sock *sk, newinet->mc_index = 0; newinet->mc_list = NULL; -#ifdef INET_REFCNT_DEBUG - atomic_inc(&inet_sock_nr); -#endif + sk_refcnt_debug_inc(newsk); if (newsk->sk_prot->init(newsk)) { sk_common_release(newsk); @@ -1244,6 +1242,10 @@ SCTP_STATIC __exit void sctp_exit(void) module_init(sctp_init); module_exit(sctp_exit); +/* + * __stringify doesn't likes enums, so use IPPROTO_SCTP value (132) directly. + */ +MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-132"); MODULE_AUTHOR("Linux Kernel SCTP developers <lksctp-developers@lists.sourceforge.net>"); MODULE_DESCRIPTION("Support for the SCTP protocol (RFC2960)"); MODULE_LICENSE("GPL"); diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 00d32b7c826..3868a8d70cc 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -1362,6 +1362,7 @@ struct sctp_association *sctp_unpack_cookie( char *key; sctp_scope_t scope; struct sk_buff *skb = chunk->skb; + struct timeval tv; headersize = sizeof(sctp_chunkhdr_t) + SCTP_SECRET_SIZE; bodysize = ntohs(chunk->chunk_hdr->length) - headersize; @@ -1434,7 +1435,8 @@ no_hmac: * an association, there is no need to check cookie's expiration * for init collision case of lost COOKIE ACK. */ - if (!asoc && tv_lt(bear_cookie->expiration, skb->stamp)) { + skb_get_timestamp(skb, &tv); + if (!asoc && tv_lt(bear_cookie->expiration, tv)) { __u16 len; /* * Section 3.3.10.3 Stale Cookie Error (3) @@ -1447,10 +1449,9 @@ no_hmac: len = ntohs(chunk->chunk_hdr->length); *errp = sctp_make_op_error_space(asoc, chunk, len); if (*errp) { - suseconds_t usecs = (skb->stamp.tv_sec - + suseconds_t usecs = (tv.tv_sec - bear_cookie->expiration.tv_sec) * 1000000L + - skb->stamp.tv_usec - - bear_cookie->expiration.tv_usec; + tv.tv_usec - bear_cookie->expiration.tv_usec; usecs = htonl(usecs); sctp_init_cause(*errp, SCTP_ERROR_STALE_COOKIE, diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 091a66f06a3..4454afe4727 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4892,7 +4892,7 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, sctp_skb_for_each(skb, &oldsk->sk_receive_queue, tmp) { event = sctp_skb2event(skb); if (event->asoc == assoc) { - __skb_unlink(skb, skb->list); + __skb_unlink(skb, &oldsk->sk_receive_queue); __skb_queue_tail(&newsk->sk_receive_queue, skb); } } @@ -4921,7 +4921,7 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, sctp_skb_for_each(skb, &oldsp->pd_lobby, tmp) { event = sctp_skb2event(skb); if (event->asoc == assoc) { - __skb_unlink(skb, skb->list); + __skb_unlink(skb, &oldsp->pd_lobby); __skb_queue_tail(queue, skb); } } diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index 8bbc279d6c9..ec2c857eae7 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -50,9 +50,9 @@ /* Forward declarations for internal helpers. */ static struct sctp_ulpevent * sctp_ulpq_reasm(struct sctp_ulpq *ulpq, - struct sctp_ulpevent *); + struct sctp_ulpevent *); static struct sctp_ulpevent * sctp_ulpq_order(struct sctp_ulpq *, - struct sctp_ulpevent *); + struct sctp_ulpevent *); /* 1st Level Abstractions */ @@ -125,7 +125,9 @@ int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, event = sctp_ulpq_order(ulpq, event); } - /* Send event to the ULP. */ + /* Send event to the ULP. 'event' is the sctp_ulpevent for + * very first SKB on the 'temp' list. + */ if (event) sctp_ulpq_tail_event(ulpq, event); @@ -158,14 +160,18 @@ static int sctp_ulpq_clear_pd(struct sctp_ulpq *ulpq) return sctp_clear_pd(ulpq->asoc->base.sk); } - - +/* If the SKB of 'event' is on a list, it is the first such member + * of that list. + */ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sock *sk = ulpq->asoc->base.sk; - struct sk_buff_head *queue; + struct sk_buff_head *queue, *skb_list; + struct sk_buff *skb = sctp_event2skb(event); int clear_pd = 0; + skb_list = (struct sk_buff_head *) skb->prev; + /* If the socket is just going to throw this away, do not * even try to deliver it. */ @@ -197,10 +203,10 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) /* If we are harvesting multiple skbs they will be * collected on a list. */ - if (sctp_event2skb(event)->list) - sctp_skb_list_tail(sctp_event2skb(event)->list, queue); + if (skb_list) + sctp_skb_list_tail(skb_list, queue); else - __skb_queue_tail(queue, sctp_event2skb(event)); + __skb_queue_tail(queue, skb); /* Did we just complete partial delivery and need to get * rolling again? Move pending data to the receive @@ -214,10 +220,11 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) return 1; out_free: - if (sctp_event2skb(event)->list) - sctp_queue_purge_ulpevents(sctp_event2skb(event)->list); + if (skb_list) + sctp_queue_purge_ulpevents(skb_list); else sctp_ulpevent_free(event); + return 0; } @@ -269,7 +276,7 @@ static inline void sctp_ulpq_store_reasm(struct sctp_ulpq *ulpq, * payload was fragmented on the way and ip had to reassemble them. * We add the rest of skb's to the first skb's fraglist. */ -static struct sctp_ulpevent *sctp_make_reassembled_event(struct sk_buff *f_frag, struct sk_buff *l_frag) +static struct sctp_ulpevent *sctp_make_reassembled_event(struct sk_buff_head *queue, struct sk_buff *f_frag, struct sk_buff *l_frag) { struct sk_buff *pos; struct sctp_ulpevent *event; @@ -294,7 +301,7 @@ static struct sctp_ulpevent *sctp_make_reassembled_event(struct sk_buff *f_frag, skb_shinfo(f_frag)->frag_list = pos; /* Remove the first fragment from the reassembly queue. */ - __skb_unlink(f_frag, f_frag->list); + __skb_unlink(f_frag, queue); while (pos) { pnext = pos->next; @@ -304,7 +311,7 @@ static struct sctp_ulpevent *sctp_make_reassembled_event(struct sk_buff *f_frag, f_frag->data_len += pos->len; /* Remove the fragment from the reassembly queue. */ - __skb_unlink(pos, pos->list); + __skb_unlink(pos, queue); /* Break if we have reached the last fragment. */ if (pos == l_frag) @@ -375,7 +382,7 @@ static inline struct sctp_ulpevent *sctp_ulpq_retrieve_reassembled(struct sctp_u done: return retval; found: - retval = sctp_make_reassembled_event(first_frag, pos); + retval = sctp_make_reassembled_event(&ulpq->reasm, first_frag, pos); if (retval) retval->msg_flags |= MSG_EOR; goto done; @@ -435,7 +442,7 @@ static inline struct sctp_ulpevent *sctp_ulpq_retrieve_partial(struct sctp_ulpq * further. */ done: - retval = sctp_make_reassembled_event(first_frag, last_frag); + retval = sctp_make_reassembled_event(&ulpq->reasm, first_frag, last_frag); if (retval && is_last) retval->msg_flags |= MSG_EOR; @@ -527,7 +534,7 @@ static inline struct sctp_ulpevent *sctp_ulpq_retrieve_first(struct sctp_ulpq *u * further. */ done: - retval = sctp_make_reassembled_event(first_frag, last_frag); + retval = sctp_make_reassembled_event(&ulpq->reasm, first_frag, last_frag); return retval; } @@ -537,6 +544,7 @@ done: static inline void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { + struct sk_buff_head *event_list; struct sk_buff *pos, *tmp; struct sctp_ulpevent *cevent; struct sctp_stream *in; @@ -547,6 +555,8 @@ static inline void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq, ssn = event->ssn; in = &ulpq->asoc->ssnmap->in; + event_list = (struct sk_buff_head *) sctp_event2skb(event)->prev; + /* We are holding the chunks by stream, by SSN. */ sctp_skb_for_each(pos, &ulpq->lobby, tmp) { cevent = (struct sctp_ulpevent *) pos->cb; @@ -567,10 +577,10 @@ static inline void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq, /* Found it, so mark in the ssnmap. */ sctp_ssn_next(in, sid); - __skb_unlink(pos, pos->list); + __skb_unlink(pos, &ulpq->lobby); /* Attach all gathered skbs to the event. */ - __skb_queue_tail(sctp_event2skb(event)->list, pos); + __skb_queue_tail(event_list, pos); } } @@ -626,7 +636,7 @@ static inline void sctp_ulpq_store_ordered(struct sctp_ulpq *ulpq, } static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq, - struct sctp_ulpevent *event) + struct sctp_ulpevent *event) { __u16 sid, ssn; struct sctp_stream *in; @@ -667,7 +677,7 @@ static inline void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq) { struct sk_buff *pos, *tmp; struct sctp_ulpevent *cevent; - struct sctp_ulpevent *event = NULL; + struct sctp_ulpevent *event; struct sctp_stream *in; struct sk_buff_head temp; __u16 csid, cssn; @@ -675,6 +685,8 @@ static inline void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq) in = &ulpq->asoc->ssnmap->in; /* We are holding the chunks by stream, by SSN. */ + skb_queue_head_init(&temp); + event = NULL; sctp_skb_for_each(pos, &ulpq->lobby, tmp) { cevent = (struct sctp_ulpevent *) pos->cb; csid = cevent->stream; @@ -686,19 +698,20 @@ static inline void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq) /* Found it, so mark in the ssnmap. */ sctp_ssn_next(in, csid); - __skb_unlink(pos, pos->list); + __skb_unlink(pos, &ulpq->lobby); if (!event) { /* Create a temporary list to collect chunks on. */ event = sctp_skb2event(pos); - skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); } else { /* Attach all gathered skbs to the event. */ - __skb_queue_tail(sctp_event2skb(event)->list, pos); + __skb_queue_tail(&temp, pos); } } - /* Send event to the ULP. */ + /* Send event to the ULP. 'event' is the sctp_ulpevent for + * very first SKB on the 'temp' list. + */ if (event) sctp_ulpq_tail_event(ulpq, event); } diff --git a/net/socket.c b/net/socket.c index 6f2a1788197..94fe638b4d7 100644 --- a/net/socket.c +++ b/net/socket.c @@ -70,6 +70,8 @@ #include <linux/seq_file.h> #include <linux/wanrouter.h> #include <linux/if_bridge.h> +#include <linux/if_frad.h> +#include <linux/if_vlan.h> #include <linux/init.h> #include <linux/poll.h> #include <linux/cache.h> @@ -272,7 +274,7 @@ int move_addr_to_user(void *kaddr, int klen, void __user *uaddr, int __user *ule #define SOCKFS_MAGIC 0x534F434B -static kmem_cache_t * sock_inode_cachep; +static kmem_cache_t * sock_inode_cachep __read_mostly; static struct inode *sock_alloc_inode(struct super_block *sb) { @@ -331,7 +333,7 @@ static struct super_block *sockfs_get_sb(struct file_system_type *fs_type, return get_sb_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC); } -static struct vfsmount *sock_mnt; +static struct vfsmount *sock_mnt __read_mostly; static struct file_system_type sock_fs_type = { .name = "sockfs", @@ -404,6 +406,7 @@ int sock_map_fd(struct socket *sock) file->f_mode = FMODE_READ | FMODE_WRITE; file->f_flags = O_RDWR; file->f_pos = 0; + file->private_data = sock; fd_install(fd, file); } @@ -436,6 +439,9 @@ struct socket *sockfd_lookup(int fd, int *err) return NULL; } + if (file->f_op == &socket_file_ops) + return file->private_data; /* set in sock_map_fd */ + inode = file->f_dentry->d_inode; if (!S_ISSOCK(inode->i_mode)) { *err = -ENOTSOCK; @@ -720,8 +726,8 @@ static ssize_t sock_aio_write(struct kiocb *iocb, const char __user *ubuf, return __sock_sendmsg(iocb, sock, &x->async_msg, size); } -ssize_t sock_sendpage(struct file *file, struct page *page, - int offset, size_t size, loff_t *ppos, int more) +static ssize_t sock_sendpage(struct file *file, struct page *page, + int offset, size_t size, loff_t *ppos, int more) { struct socket *sock; int flags; @@ -944,7 +950,7 @@ static int sock_mmap(struct file * file, struct vm_area_struct * vma) return sock->ops->mmap(file, sock, vma); } -int sock_close(struct inode *inode, struct file *filp) +static int sock_close(struct inode *inode, struct file *filp) { /* * It was possible the inode is NULL we were @@ -2023,9 +2029,6 @@ int sock_unregister(int family) return 0; } - -extern void sk_init(void); - void __init sock_init(void) { /* diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 554f224c044..fe1a73ce6cf 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -28,13 +28,13 @@ #include <linux/workqueue.h> #include <linux/sunrpc/rpc_pipe_fs.h> -static struct vfsmount *rpc_mount; +static struct vfsmount *rpc_mount __read_mostly; static int rpc_mount_count; static struct file_system_type rpc_pipe_fs_type; -static kmem_cache_t *rpc_inode_cachep; +static kmem_cache_t *rpc_inode_cachep __read_mostly; #define RPC_UPCALL_TIMEOUT (30*HZ) diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 2d9eb7fbd52..f3104035e35 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -34,10 +34,10 @@ static int rpc_task_id; #define RPC_BUFFER_MAXSIZE (2048) #define RPC_BUFFER_POOLSIZE (8) #define RPC_TASK_POOLSIZE (8) -static kmem_cache_t *rpc_task_slabp; -static kmem_cache_t *rpc_buffer_slabp; -static mempool_t *rpc_task_mempool; -static mempool_t *rpc_buffer_mempool; +static kmem_cache_t *rpc_task_slabp __read_mostly; +static kmem_cache_t *rpc_buffer_slabp __read_mostly; +static mempool_t *rpc_task_mempool __read_mostly; +static mempool_t *rpc_buffer_mempool __read_mostly; static void __rpc_default_timer(struct rpc_task *task); static void rpciod_killall(void); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index d0c3120d023..05fe2e73553 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -34,7 +34,7 @@ #include <net/sock.h> #include <net/checksum.h> #include <net/ip.h> -#include <net/tcp.h> +#include <net/tcp_states.h> #include <asm/uaccess.h> #include <asm/ioctls.h> @@ -584,13 +584,16 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) /* possibly an icmp error */ dprintk("svc: recvfrom returned error %d\n", -err); } - if (skb->stamp.tv_sec == 0) { - skb->stamp.tv_sec = xtime.tv_sec; - skb->stamp.tv_usec = xtime.tv_nsec / NSEC_PER_USEC; + if (skb->tstamp.off_sec == 0) { + struct timeval tv; + + tv.tv_sec = xtime.tv_sec; + tv.tv_usec = xtime.tv_nsec * 1000; + skb_set_timestamp(skb, &tv); /* Don't enable netstamp, sunrpc doesn't need that much accuracy */ } - svsk->sk_sk->sk_stamp = skb->stamp; + skb_get_timestamp(skb, &svsk->sk_sk->sk_stamp); set_bit(SK_DATA, &svsk->sk_flags); /* there may be more data... */ /* diff --git a/net/sysctl_net.c b/net/sysctl_net.c index 3f6e31069c5..c5241fcbb96 100644 --- a/net/sysctl_net.c +++ b/net/sysctl_net.c @@ -17,17 +17,15 @@ #include <linux/sysctl.h> #ifdef CONFIG_INET -extern struct ctl_table ipv4_table[]; +#include <net/ip.h> #endif -extern struct ctl_table core_table[]; - #ifdef CONFIG_NET -extern struct ctl_table ether_table[]; +#include <linux/if_ether.h> #endif #ifdef CONFIG_TR -extern struct ctl_table tr_table[]; +#include <linux/if_tr.h> #endif struct ctl_table net_table[] = { diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index d403e34088a..41feca3bef8 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -105,7 +105,7 @@ #include <linux/skbuff.h> #include <linux/netdevice.h> #include <net/sock.h> -#include <linux/tcp.h> +#include <net/tcp_states.h> #include <net/af_unix.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> @@ -2026,14 +2026,6 @@ static struct net_proto_family unix_family_ops = { .owner = THIS_MODULE, }; -#ifdef CONFIG_SYSCTL -extern void unix_sysctl_register(void); -extern void unix_sysctl_unregister(void); -#else -static inline void unix_sysctl_register(void) {} -static inline void unix_sysctl_unregister(void) {} -#endif - static int __init af_unix_init(void) { int rc = -1; diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 4bd95c8f593..6ffc64e1712 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -76,11 +76,11 @@ #include <linux/netdevice.h> #include <linux/file.h> #include <linux/proc_fs.h> -#include <linux/tcp.h> #include <net/sock.h> #include <net/af_unix.h> #include <net/scm.h> +#include <net/tcp_states.h> /* Internal data structures and random procedures: */ @@ -286,16 +286,16 @@ void unix_gc(void) skb = skb_peek(&s->sk_receive_queue); while (skb && skb != (struct sk_buff *)&s->sk_receive_queue) { - nextsk=skb->next; + nextsk = skb->next; /* * Do we have file descriptors ? */ - if(UNIXCB(skb).fp) - { - __skb_unlink(skb, skb->list); - __skb_queue_tail(&hitlist,skb); + if (UNIXCB(skb).fp) { + __skb_unlink(skb, + &s->sk_receive_queue); + __skb_queue_tail(&hitlist, skb); } - skb=nextsk; + skb = nextsk; } spin_unlock(&s->sk_receive_queue.lock); } diff --git a/net/unix/sysctl_net_unix.c b/net/unix/sysctl_net_unix.c index c974dac4580..690ffa5d5bf 100644 --- a/net/unix/sysctl_net_unix.c +++ b/net/unix/sysctl_net_unix.c @@ -12,7 +12,7 @@ #include <linux/mm.h> #include <linux/sysctl.h> -extern int sysctl_unix_max_dgram_qlen; +#include <net/af_unix.h> static ctl_table unix_table[] = { { diff --git a/net/wanrouter/af_wanpipe.c b/net/wanrouter/af_wanpipe.c index d93b19faaab..596cb96e5f4 100644 --- a/net/wanrouter/af_wanpipe.c +++ b/net/wanrouter/af_wanpipe.c @@ -57,7 +57,7 @@ #include <linux/wanpipe.h> #include <linux/if_wanpipe.h> #include <linux/pkt_sched.h> -#include <linux/tcp.h> +#include <linux/tcp_states.h> #include <linux/if_wanpipe_common.h> #include <linux/sdla_x25.h> diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 04bec047fa9..020d73cc841 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -47,7 +47,7 @@ #include <linux/if_arp.h> #include <linux/skbuff.h> #include <net/sock.h> -#include <net/tcp.h> +#include <net/tcp_states.h> #include <asm/uaccess.h> #include <linux/fcntl.h> #include <linux/termios.h> /* For TIOCINQ/OUTQ */ diff --git a/net/x25/x25_dev.c b/net/x25/x25_dev.c index 36fc3bf6d88..adfe7b8df35 100644 --- a/net/x25/x25_dev.c +++ b/net/x25/x25_dev.c @@ -81,7 +81,7 @@ static int x25_receive_data(struct sk_buff *skb, struct x25_neigh *nb) } int x25_lapb_receive_frame(struct sk_buff *skb, struct net_device *dev, - struct packet_type *ptype) + struct packet_type *ptype, struct net_device *orig_dev) { struct sk_buff *nskb; struct x25_neigh *nb; diff --git a/net/x25/x25_in.c b/net/x25/x25_in.c index b0197c70a9f..26146874b83 100644 --- a/net/x25/x25_in.c +++ b/net/x25/x25_in.c @@ -28,7 +28,7 @@ #include <linux/string.h> #include <linux/skbuff.h> #include <net/sock.h> -#include <net/tcp.h> +#include <net/tcp_states.h> #include <net/x25.h> static int x25_queue_rx_frame(struct sock *sk, struct sk_buff *skb, int more) diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c index 7fd872ad0c2..8be9b8fbc24 100644 --- a/net/x25/x25_subr.c +++ b/net/x25/x25_subr.c @@ -27,7 +27,7 @@ #include <linux/string.h> #include <linux/skbuff.h> #include <net/sock.h> -#include <net/tcp.h> +#include <net/tcp_states.h> #include <net/x25.h> /* @@ -80,7 +80,7 @@ void x25_requeue_frames(struct sock *sk) if (!skb_prev) skb_queue_head(&sk->sk_write_queue, skb); else - skb_append(skb_prev, skb); + skb_append(skb_prev, skb, &sk->sk_write_queue); skb_prev = skb; } } diff --git a/net/x25/x25_timer.c b/net/x25/x25_timer.c index d6a21a3ad80..0a92e1da392 100644 --- a/net/x25/x25_timer.c +++ b/net/x25/x25_timer.c @@ -23,7 +23,7 @@ #include <linux/jiffies.h> #include <linux/timer.h> #include <net/sock.h> -#include <net/tcp.h> +#include <net/tcp_states.h> #include <net/x25.h> static void x25_heartbeat_expiry(unsigned long); diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index c58a6f05a0b..2407a707232 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -12,7 +12,7 @@ #include <net/ip.h> #include <net/xfrm.h> -static kmem_cache_t *secpath_cachep; +static kmem_cache_t *secpath_cachep __read_mostly; void __secpath_destroy(struct sec_path *sp) { diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index d65ed8684fc..83c8135e176 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -37,7 +37,7 @@ EXPORT_SYMBOL(xfrm_policy_list); static DEFINE_RWLOCK(xfrm_policy_afinfo_lock); static struct xfrm_policy_afinfo *xfrm_policy_afinfo[NPROTO]; -static kmem_cache_t *xfrm_dst_cache; +static kmem_cache_t *xfrm_dst_cache __read_mostly; static struct work_struct xfrm_policy_gc_work; static struct list_head xfrm_policy_gc_list = diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 8da3e25b2c4..c35336a0f71 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1125,9 +1125,8 @@ static int xfrm_exp_state_notify(struct xfrm_state *x, struct km_event *c) if (build_expire(skb, x, c->data.hard) < 0) BUG(); - NETLINK_CB(skb).dst_groups = XFRMGRP_EXPIRE; - - return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_EXPIRE, GFP_ATOMIC); + NETLINK_CB(skb).dst_group = XFRMNLGRP_EXPIRE; + return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_EXPIRE, GFP_ATOMIC); } static int xfrm_notify_sa_flush(struct km_event *c) @@ -1152,7 +1151,8 @@ static int xfrm_notify_sa_flush(struct km_event *c) nlh->nlmsg_len = skb->tail - b; - return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_SA, GFP_ATOMIC); + NETLINK_CB(skb).dst_group = XFRMNLGRP_SA; + return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_SA, GFP_ATOMIC); nlmsg_failure: kfree_skb(skb); @@ -1226,7 +1226,8 @@ static int xfrm_notify_sa(struct xfrm_state *x, struct km_event *c) nlh->nlmsg_len = skb->tail - b; - return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_SA, GFP_ATOMIC); + NETLINK_CB(skb).dst_group = XFRMNLGRP_SA; + return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_SA, GFP_ATOMIC); nlmsg_failure: rtattr_failure: @@ -1304,9 +1305,8 @@ static int xfrm_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *xt, if (build_acquire(skb, x, xt, xp, dir) < 0) BUG(); - NETLINK_CB(skb).dst_groups = XFRMGRP_ACQUIRE; - - return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_ACQUIRE, GFP_ATOMIC); + NETLINK_CB(skb).dst_group = XFRMNLGRP_ACQUIRE; + return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_ACQUIRE, GFP_ATOMIC); } /* User gives us xfrm_user_policy_info followed by an array of 0 @@ -1405,9 +1405,8 @@ static int xfrm_exp_policy_notify(struct xfrm_policy *xp, int dir, struct km_eve if (build_polexpire(skb, xp, dir, c->data.hard) < 0) BUG(); - NETLINK_CB(skb).dst_groups = XFRMGRP_EXPIRE; - - return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_EXPIRE, GFP_ATOMIC); + NETLINK_CB(skb).dst_group = XFRMNLGRP_EXPIRE; + return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_EXPIRE, GFP_ATOMIC); } static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, struct km_event *c) @@ -1455,7 +1454,8 @@ static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, struct km_event * nlh->nlmsg_len = skb->tail - b; - return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_POLICY, GFP_ATOMIC); + NETLINK_CB(skb).dst_group = XFRMNLGRP_POLICY; + return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_POLICY, GFP_ATOMIC); nlmsg_failure: rtattr_failure: @@ -1480,7 +1480,8 @@ static int xfrm_notify_policy_flush(struct km_event *c) nlh->nlmsg_len = skb->tail - b; - return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_POLICY, GFP_ATOMIC); + NETLINK_CB(skb).dst_group = XFRMNLGRP_POLICY; + return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_POLICY, GFP_ATOMIC); nlmsg_failure: kfree_skb(skb); @@ -1519,7 +1520,8 @@ static int __init xfrm_user_init(void) { printk(KERN_INFO "Initializing IPsec netlink socket\n"); - xfrm_nl = netlink_kernel_create(NETLINK_XFRM, xfrm_netlink_rcv); + xfrm_nl = netlink_kernel_create(NETLINK_XFRM, XFRMNLGRP_MAX, + xfrm_netlink_rcv, THIS_MODULE); if (xfrm_nl == NULL) return -ENOMEM; @@ -1537,3 +1539,4 @@ static void __exit xfrm_user_exit(void) module_init(xfrm_user_init); module_exit(xfrm_user_exit); MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_XFRM); diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c index 5180405c1a8..d8ee38aede2 100644 --- a/scripts/mod/file2alias.c +++ b/scripts/mod/file2alias.c @@ -341,6 +341,22 @@ static int do_of_entry (const char *filename, struct of_device_id *of, char *ali return 1; } +static int do_vio_entry(const char *filename, struct vio_device_id *vio, + char *alias) +{ + char *tmp; + + sprintf(alias, "vio:T%sS%s", vio->type[0] ? vio->type : "*", + vio->compat[0] ? vio->compat : "*"); + + /* Replace all whitespace with underscores */ + for (tmp = alias; tmp && *tmp; tmp++) + if (isspace (*tmp)) + *tmp = '_'; + + return 1; +} + /* Ignore any prefix, eg. v850 prepends _ */ static inline int sym_is(const char *symbol, const char *name) { @@ -422,6 +438,9 @@ void handle_moddevtable(struct module *mod, struct elf_info *info, else if (sym_is(symname, "__mod_of_device_table")) do_table(symval, sym->st_size, sizeof(struct of_device_id), do_of_entry, mod); + else if (sym_is(symname, "__mod_vio_device_table")) + do_table(symval, sym->st_size, sizeof(struct vio_device_id), + do_vio_entry, mod); } diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 2253f388234..8641f8894b4 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -659,7 +659,7 @@ static inline u16 socket_type_to_security_class(int family, int type, int protoc return SECCLASS_NETLINK_ROUTE_SOCKET; case NETLINK_FIREWALL: return SECCLASS_NETLINK_FIREWALL_SOCKET; - case NETLINK_TCPDIAG: + case NETLINK_INET_DIAG: return SECCLASS_NETLINK_TCPDIAG_SOCKET; case NETLINK_NFLOG: return SECCLASS_NETLINK_NFLOG_SOCKET; diff --git a/security/selinux/netlink.c b/security/selinux/netlink.c index 18d08acafa7..e203883406d 100644 --- a/security/selinux/netlink.c +++ b/security/selinux/netlink.c @@ -80,7 +80,8 @@ static void selnl_notify(int msgtype, void *data) nlh = NLMSG_PUT(skb, 0, 0, msgtype, len); selnl_add_payload(nlh, len, msgtype, data); nlh->nlmsg_len = skb->tail - tmp; - netlink_broadcast(selnl, skb, 0, SELNL_GRP_AVC, GFP_USER); + NETLINK_CB(skb).dst_group = SELNLGRP_AVC; + netlink_broadcast(selnl, skb, 0, SELNLGRP_AVC, GFP_USER); out: return; @@ -103,7 +104,8 @@ void selnl_notify_policyload(u32 seqno) static int __init selnl_init(void) { - selnl = netlink_kernel_create(NETLINK_SELINUX, NULL); + selnl = netlink_kernel_create(NETLINK_SELINUX, SELNLGRP_MAX, NULL, + THIS_MODULE); if (selnl == NULL) panic("SELinux: Cannot create netlink socket."); netlink_set_nonroot(NETLINK_SELINUX, NL_NONROOT_RECV); diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c index 92b057becb4..69b9329b205 100644 --- a/security/selinux/nlmsgtab.c +++ b/security/selinux/nlmsgtab.c @@ -16,7 +16,7 @@ #include <linux/rtnetlink.h> #include <linux/if.h> #include <linux/netfilter_ipv4/ip_queue.h> -#include <linux/tcp_diag.h> +#include <linux/inet_diag.h> #include <linux/xfrm.h> #include <linux/audit.h> @@ -76,6 +76,7 @@ static struct nlmsg_perm nlmsg_firewall_perms[] = static struct nlmsg_perm nlmsg_tcpdiag_perms[] = { { TCPDIAG_GETSOCK, NETLINK_TCPDIAG_SOCKET__NLMSG_READ }, + { DCCPDIAG_GETSOCK, NETLINK_TCPDIAG_SOCKET__NLMSG_READ }, }; static struct nlmsg_perm nlmsg_xfrm_perms[] = |