611 files changed, 28552 insertions, 10723 deletions
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 8b1430b4665..0665cb12bd6 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -135,3 +135,15 @@ Why:	With the 16-bit PCMCIA subsystem now behaving (almost) like a
 	pcmciautils package available at
 	http://kernel.org/pub/linux/utils/kernel/pcmcia/
 Who:	Dominik Brodowski <linux@brodo.de>
+
+---------------------------
+
+What:	ip_queue and ip6_queue (old ipv4-only and ipv6-only netfilter queue)
+When:	December 2005
+Why:	This interface has been obsoleted by the new layer3-independent
+	"nfnetlink_queue".  The Kernel interface is compatible, so the old
+	ip[6]tables "QUEUE" targets still work and will transparently handle
+	all packets into nfnetlink queue number 0.  Userspace users will have
+	to link against API-compatible library on top of libnfnetlink_queue 
+	instead of the current 'libipq'.
+Who:	Harald Welte <laforge@netfilter.org>
diff --git a/arch/ppc/Makefile b/arch/ppc/Makefile
index f9b0d778dd8..d1b6e6dcb50 100644
--- a/arch/ppc/Makefile
+++ b/arch/ppc/Makefile
@@ -21,11 +21,13 @@ CC		:= $(CC) -m32
 endif
 
 LDFLAGS_vmlinux	:= -Ttext $(KERNELLOAD) -Bstatic
-CPPFLAGS	+= -Iarch/$(ARCH)
+CPPFLAGS	+= -Iarch/$(ARCH) -Iinclude3
 AFLAGS		+= -Iarch/$(ARCH)
 CFLAGS		+= -Iarch/$(ARCH) -msoft-float -pipe \
 		-ffixed-r2 -mmultiple
 CPP		= $(CC) -E $(CFLAGS)
+# Temporary hack until we have migrated to asm-powerpc
+LINUXINCLUDE    += -Iinclude3
 
 CHECKFLAGS	+= -D__powerpc__
 
@@ -101,6 +103,7 @@ endef
 
 archclean:
 	$(Q)$(MAKE) $(clean)=arch/ppc/boot
+	$(Q)rm -rf include3
 
 prepare: include/asm-$(ARCH)/offsets.h checkbin
 
@@ -110,6 +113,12 @@ arch/$(ARCH)/kernel/asm-offsets.s: include/asm include/linux/version.h \
 include/asm-$(ARCH)/offsets.h: arch/$(ARCH)/kernel/asm-offsets.s
 	$(call filechk,gen-asm-offsets)
 
+# Temporary hack until we have migrated to asm-powerpc
+include/asm: include3/asm
+include3/asm:
+	$(Q)if [ ! -d include3 ]; then mkdir -p include3; fi
+	$(Q)ln -fsn $(srctree)/include/asm-powerpc include3/asm
+
 # Use the file '.tmp_gas_check' for binutils tests, as gas won't output
 # to stdout and these checks are run even on install targets.
 TOUT	:= .tmp_gas_check
diff --git a/arch/ppc/boot/utils/addRamDisk.c b/arch/ppc/boot/utils/addRamDisk.c
deleted file mode 100644
index 93400dfcce7..00000000000
--- a/arch/ppc/boot/utils/addRamDisk.c
+++ /dev/null
@@ -1,203 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <netinet/in.h>
-#include <unistd.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <string.h>
-
-#define ElfHeaderSize  (64 * 1024)
-#define ElfPages  (ElfHeaderSize / 4096)
-#define KERNELBASE (0xc0000000)
-
-void get4k(FILE *file, char *buf )
-{
-    unsigned j;
-    unsigned num = fread(buf, 1, 4096, file);
-    for (  j=num; j<4096; ++j )
-	buf[j] = 0;
-}
-
-void put4k(FILE *file, char *buf )
-{
-    fwrite(buf, 1, 4096, file);
-}
-
-void death(const char *msg, FILE *fdesc, const char *fname)
-{
-    printf(msg);
-    fclose(fdesc);
-    unlink(fname);
-    exit(1);
-}
-
-int main(int argc, char **argv)
-{
-    char inbuf[4096];
-    FILE *ramDisk = NULL;
-    FILE *inputVmlinux = NULL;
-    FILE *outputVmlinux = NULL;
-    unsigned i = 0;
-    u_int32_t ramFileLen = 0;
-    u_int32_t ramLen = 0;
-    u_int32_t roundR = 0;
-    u_int32_t kernelLen = 0;
-    u_int32_t actualKernelLen = 0;
-    u_int32_t round = 0;
-    u_int32_t roundedKernelLen = 0;
-    u_int32_t ramStartOffs = 0;
-    u_int32_t ramPages = 0;
-    u_int32_t roundedKernelPages = 0;
-    u_int32_t hvReleaseData = 0;
-    u_int32_t eyeCatcher = 0xc8a5d9c4;
-    u_int32_t naca = 0;
-    u_int32_t xRamDisk = 0;
-    u_int32_t xRamDiskSize = 0;
-    if ( argc < 2 ) {
-	printf("Name of RAM disk file missing.\n");
-	exit(1);
-    }
-
-    if ( argc < 3 ) {
-	printf("Name of vmlinux file missing.\n");
-	exit(1);
-    }
-
-    if ( argc < 4 ) {
-	printf("Name of vmlinux output file missing.\n");
-	exit(1);
-    }
-
-    ramDisk = fopen(argv[1], "r");
-    if ( ! ramDisk ) {
-	printf("RAM disk file \"%s\" failed to open.\n", argv[1]);
-	exit(1);
-    }
-    inputVmlinux = fopen(argv[2], "r");
-    if ( ! inputVmlinux ) {
-	printf("vmlinux file \"%s\" failed to open.\n", argv[2]);
-	exit(1);
-    }
-    outputVmlinux = fopen(argv[3], "w+");
-    if ( ! outputVmlinux ) {
-	printf("output vmlinux file \"%s\" failed to open.\n", argv[3]);
-	exit(1);
-    }
-    fseek(ramDisk, 0, SEEK_END);
-    ramFileLen = ftell(ramDisk);
-    fseek(ramDisk, 0, SEEK_SET);
-    printf("%s file size = %d\n", argv[1], ramFileLen);
-
-    ramLen = ramFileLen;
-
-    roundR = 4096 - (ramLen % 4096);
-    if ( roundR ) {
-	printf("Rounding RAM disk file up to a multiple of 4096, adding %d\n", roundR);
-	ramLen += roundR;
-    }
-
-    printf("Rounded RAM disk size is %d\n", ramLen);
-    fseek(inputVmlinux, 0, SEEK_END);
-    kernelLen = ftell(inputVmlinux);
-    fseek(inputVmlinux, 0, SEEK_SET);
-    printf("kernel file size = %d\n", kernelLen);
-    if ( kernelLen == 0 ) {
-	printf("You must have a linux kernel specified as argv[2]\n");
-	exit(1);
-    }
-
-    actualKernelLen = kernelLen - ElfHeaderSize;
-
-    printf("actual kernel length (minus ELF header) = %d\n", actualKernelLen);
-
-    round = actualKernelLen % 4096;
-    roundedKernelLen = actualKernelLen;
-    if ( round )
-	roundedKernelLen += (4096 - round);
-
-    printf("actual kernel length rounded up to a 4k multiple = %d\n", roundedKernelLen);
-
-    ramStartOffs = roundedKernelLen;
-    ramPages = ramLen / 4096;
-
-    printf("RAM disk pages to copy = %d\n", ramPages);
-
-    // Copy 64K ELF header
-      for (i=0; i<(ElfPages); ++i) {
-	  get4k( inputVmlinux, inbuf );
-	  put4k( outputVmlinux, inbuf );
-      }
-
-    roundedKernelPages = roundedKernelLen / 4096;
-
-    fseek(inputVmlinux, ElfHeaderSize, SEEK_SET);
-
-    for ( i=0; i<roundedKernelPages; ++i ) {
-	get4k( inputVmlinux, inbuf );
-	put4k( outputVmlinux, inbuf );
-    }
-
-    for ( i=0; i<ramPages; ++i ) {
-	get4k( ramDisk, inbuf );
-	put4k( outputVmlinux, inbuf );
-    }
-
-    /* Close the input files */
-    fclose(ramDisk);
-    fclose(inputVmlinux);
-    /* And flush the written output file */
-    fflush(outputVmlinux);
-
-    /* fseek to the hvReleaseData pointer */
-    fseek(outputVmlinux, ElfHeaderSize + 0x24, SEEK_SET);
-    if (fread(&hvReleaseData, 4, 1, outputVmlinux) != 1) {
-        death("Could not read hvReleaseData pointer\n", outputVmlinux, argv[3]);
-    }
-    hvReleaseData = ntohl(hvReleaseData); /* Convert to native int */
-    printf("hvReleaseData is at %08x\n", hvReleaseData);
-
-    /* fseek to the hvReleaseData */
-    fseek(outputVmlinux, ElfHeaderSize + hvReleaseData, SEEK_SET);
-    if (fread(inbuf, 0x40, 1, outputVmlinux) != 1) {
-        death("Could not read hvReleaseData\n", outputVmlinux, argv[3]);
-    }
-    /* Check hvReleaseData sanity */
-    if (memcmp(inbuf, &eyeCatcher, 4) != 0) {
-        death("hvReleaseData is invalid\n", outputVmlinux, argv[3]);
-    }
-    /* Get the naca pointer */
-    naca = ntohl(*((u_int32_t *) &inbuf[0x0c])) - KERNELBASE;
-    printf("naca is at %08x\n", naca);
-
-    /* fseek to the naca */
-    fseek(outputVmlinux, ElfHeaderSize + naca, SEEK_SET);
-    if (fread(inbuf, 0x18, 1, outputVmlinux) != 1) {
-        death("Could not read naca\n", outputVmlinux, argv[3]);
-    }
-    xRamDisk = ntohl(*((u_int32_t *) &inbuf[0x0c]));
-    xRamDiskSize = ntohl(*((u_int32_t *) &inbuf[0x14]));
-    /* Make sure a RAM disk isn't already present */
-    if ((xRamDisk != 0) || (xRamDiskSize != 0)) {
-        death("RAM disk is already attached to this kernel\n", outputVmlinux, argv[3]);
-    }
-    /* Fill in the values */
-    *((u_int32_t *) &inbuf[0x0c]) = htonl(ramStartOffs);
-    *((u_int32_t *) &inbuf[0x14]) = htonl(ramPages);
-
-    /* Write out the new naca */
-    fflush(outputVmlinux);
-    fseek(outputVmlinux, ElfHeaderSize + naca, SEEK_SET);
-    if (fwrite(inbuf, 0x18, 1, outputVmlinux) != 1) {
-        death("Could not write naca\n", outputVmlinux, argv[3]);
-    }
-    printf("RAM Disk of 0x%x pages size is attached to the kernel at offset 0x%08x\n",
-            ramPages, ramStartOffs);
-
-    /* Done */
-    fclose(outputVmlinux);
-    /* Set permission to executable */
-    chmod(argv[3], S_IRUSR|S_IWUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH);
-
-    return 0;
-}
-
diff --git a/arch/ppc64/Kconfig b/arch/ppc64/Kconfig
index 2ce87836c67..13b262f1021 100644
--- a/arch/ppc64/Kconfig
+++ b/arch/ppc64/Kconfig
@@ -302,12 +302,6 @@ config GENERIC_HARDIRQS
 	bool
 	default y
 
-config MSCHUNKS
-	bool
-	depends on PPC_ISERIES
-	default y
-
-
 config PPC_RTAS
 	bool
 	depends on PPC_PSERIES || PPC_BPA
@@ -350,13 +344,46 @@ config SECCOMP
 
 	  If unsure, say Y. Only embedded should say N here.
 
+source "fs/Kconfig.binfmt"
+
+config HOTPLUG_CPU
+	bool "Support for hot-pluggable CPUs"
+	depends on SMP && EXPERIMENTAL && (PPC_PSERIES || PPC_PMAC)
+	select HOTPLUG
+	---help---
+	  Say Y here to be able to turn CPUs off and on.
+
+	  Say N if you are unsure.
+
+config PROC_DEVICETREE
+	bool "Support for Open Firmware device tree in /proc"
+	depends on !PPC_ISERIES
+	help
+	  This option adds a device-tree directory under /proc which contains
+	  an image of the device tree that the kernel copies from Open
+	  Firmware. If unsure, say Y here.
+
+config CMDLINE_BOOL
+	bool "Default bootloader kernel arguments"
+	depends on !PPC_ISERIES
+
+config CMDLINE
+	string "Initial kernel command string"
+	depends on CMDLINE_BOOL
+	default "console=ttyS0,9600 console=tty0 root=/dev/sda2"
+	help
+	  On some platforms, there is currently no way for the boot loader to
+	  pass arguments to the kernel. For these platforms, you can supply
+	  some command-line options at build time by entering them here.  In
+	  most cases you will need to specify the root device here.
+
 endmenu
 
 config ISA_DMA_API
 	bool
 	default y
 
-menu "General setup"
+menu "Bus Options"
 
 config ISA
 	bool
@@ -389,45 +416,12 @@ config PCI_DOMAINS
 	bool
 	default PCI
 
-source "fs/Kconfig.binfmt"
-
 source "drivers/pci/Kconfig"
 
-config HOTPLUG_CPU
-	bool "Support for hot-pluggable CPUs"
-	depends on SMP && EXPERIMENTAL && (PPC_PSERIES || PPC_PMAC)
-	select HOTPLUG
-	---help---
-	  Say Y here to be able to turn CPUs off and on.
-
-	  Say N if you are unsure.
-
 source "drivers/pcmcia/Kconfig"
 
 source "drivers/pci/hotplug/Kconfig"
 
-config PROC_DEVICETREE
-	bool "Support for Open Firmware device tree in /proc"
-	depends on !PPC_ISERIES
-	help
-	  This option adds a device-tree directory under /proc which contains
-	  an image of the device tree that the kernel copies from Open
-	  Firmware. If unsure, say Y here.
-
-config CMDLINE_BOOL
-	bool "Default bootloader kernel arguments"
-	depends on !PPC_ISERIES
-
-config CMDLINE
-	string "Initial kernel command string"
-	depends on CMDLINE_BOOL
-	default "console=ttyS0,9600 console=tty0 root=/dev/sda2"
-	help
-	  On some platforms, there is currently no way for the boot loader to
-	  pass arguments to the kernel. For these platforms, you can supply
-	  some command-line options at build time by entering them here.  In
-	  most cases you will need to specify the root device here.
-
 endmenu
 
 source "net/Kconfig"
diff --git a/arch/ppc64/Makefile b/arch/ppc64/Makefile
index 731b8475833..6350cce82ef 100644
--- a/arch/ppc64/Makefile
+++ b/arch/ppc64/Makefile
@@ -55,6 +55,8 @@ LDFLAGS		:= -m elf64ppc
 LDFLAGS_vmlinux	:= -Bstatic -e $(KERNELLOAD) -Ttext $(KERNELLOAD)
 CFLAGS		+= -msoft-float -pipe -mminimal-toc -mtraceback=none \
 		   -mcall-aixdesc
+# Temporary hack until we have migrated to asm-powerpc
+CPPFLAGS	+= -Iinclude3
 
 GCC_VERSION     := $(call cc-version)
 GCC_BROKEN_VEC	:= $(shell if [ $(GCC_VERSION) -lt 0400 ] ; then echo "y"; fi ;)
@@ -112,6 +114,7 @@ all: $(KBUILD_IMAGE)
 
 archclean:
 	$(Q)$(MAKE) $(clean)=$(boot)
+	$(Q)rm -rf include3
 
 prepare: include/asm-ppc64/offsets.h
 
@@ -121,6 +124,12 @@ arch/ppc64/kernel/asm-offsets.s: include/asm include/linux/version.h \
 include/asm-ppc64/offsets.h: arch/ppc64/kernel/asm-offsets.s
 	$(call filechk,gen-asm-offsets)
 
+# Temporary hack until we have migrated to asm-powerpc
+include/asm: include3/asm
+include3/asm:
+	$(Q)if [ ! -d include3 ]; then mkdir -p include3; fi;
+	$(Q)ln -fsn $(srctree)/include/asm-powerpc include3/asm
+
 define archhelp
   echo  '* zImage       - Compressed kernel image (arch/$(ARCH)/boot/zImage)'
   echo  '  zImage.initrd- Compressed kernel image with initrd attached,'
diff --git a/arch/ppc64/boot/Makefile b/arch/ppc64/boot/Makefile
index 683b2d43c15..2c5f5e73d00 100644
--- a/arch/ppc64/boot/Makefile
+++ b/arch/ppc64/boot/Makefile
@@ -22,8 +22,8 @@
 
 
 HOSTCC		:= gcc
-BOOTCFLAGS	:= $(HOSTCFLAGS) $(LINUXINCLUDE) -fno-builtin 
-BOOTAFLAGS	:= -D__ASSEMBLY__ $(BOOTCFLAGS) -traditional
+BOOTCFLAGS	:= $(HOSTCFLAGS) -fno-builtin -nostdinc -isystem $(shell $(CROSS32CC) -print-file-name=include)
+BOOTAFLAGS	:= -D__ASSEMBLY__ $(BOOTCFLAGS) -traditional -nostdinc
 BOOTLFLAGS	:= -Ttext 0x00400000 -e _start -T $(srctree)/$(src)/zImage.lds
 OBJCOPYFLAGS    := contents,alloc,load,readonly,data
 
diff --git a/arch/ppc64/boot/addnote.c b/arch/ppc64/boot/addnote.c
index 719663a694b..8041a9845ab 100644
--- a/arch/ppc64/boot/addnote.c
+++ b/arch/ppc64/boot/addnote.c
@@ -157,7 +157,7 @@ main(int ac, char **av)
 	PUT_32BE(ns, strlen(arch) + 1);
 	PUT_32BE(ns + 4, N_DESCR * 4);
 	PUT_32BE(ns + 8, 0x1275);
-	strcpy(&buf[ns + 12], arch);
+	strcpy((char *) &buf[ns + 12], arch);
 	ns += 12 + strlen(arch) + 1;
 	for (i = 0; i < N_DESCR; ++i, ns += 4)
 		PUT_32BE(ns, descr[i]);
@@ -172,7 +172,7 @@ main(int ac, char **av)
 	PUT_32BE(ns, strlen(rpaname) + 1);
 	PUT_32BE(ns + 4, sizeof(rpanote));
 	PUT_32BE(ns + 8, 0x12759999);
-	strcpy(&buf[ns + 12], rpaname);
+	strcpy((char *) &buf[ns + 12], rpaname);
 	ns += 12 + ROUNDUP(strlen(rpaname) + 1);
 	for (i = 0; i < N_RPA_DESCR; ++i, ns += 4)
 		PUT_32BE(ns, rpanote[i]);
diff --git a/arch/ppc64/boot/crt0.S b/arch/ppc64/boot/crt0.S
index 04d3e74cd72..3861e7f9cf1 100644
--- a/arch/ppc64/boot/crt0.S
+++ b/arch/ppc64/boot/crt0.S
@@ -9,7 +9,7 @@
  * NOTE: this code runs in 32 bit mode and is packaged as ELF32.
  */
 
-#include <asm/ppc_asm.h>
+#include "ppc_asm.h"
 
 	.text
 	.globl	_start
diff --git a/arch/ppc64/boot/div64.S b/arch/ppc64/boot/div64.S
index 38f7e466d7d..722f360a32a 100644
--- a/arch/ppc64/boot/div64.S
+++ b/arch/ppc64/boot/div64.S
@@ -13,7 +13,7 @@
  * as published by the Free Software Foundation; either version
  * 2 of the License, or (at your option) any later version.
  */
-#include <asm/ppc_asm.h>
+#include "ppc_asm.h"
 
 	.globl __div64_32
 __div64_32:
diff --git a/arch/ppc64/boot/elf.h b/arch/ppc64/boot/elf.h
new file mode 100644
index 00000000000..d4828fcf1cb
--- /dev/null
+++ b/arch/ppc64/boot/elf.h
@@ -0,0 +1,149 @@
+#ifndef _PPC_BOOT_ELF_H_
+#define _PPC_BOOT_ELF_H_
+
+/* 32-bit ELF base types. */
+typedef unsigned int Elf32_Addr;
+typedef unsigned short Elf32_Half;
+typedef unsigned int Elf32_Off;
+typedef signed int Elf32_Sword;
+typedef unsigned int Elf32_Word;
+
+/* 64-bit ELF base types. */
+typedef unsigned long long Elf64_Addr;
+typedef unsigned short Elf64_Half;
+typedef signed short Elf64_SHalf;
+typedef unsigned long long Elf64_Off;
+typedef signed int Elf64_Sword;
+typedef unsigned int Elf64_Word;
+typedef unsigned long long Elf64_Xword;
+typedef signed long long Elf64_Sxword;
+
+/* These constants are for the segment types stored in the image headers */
+#define PT_NULL    0
+#define PT_LOAD    1
+#define PT_DYNAMIC 2
+#define PT_INTERP  3
+#define PT_NOTE    4
+#define PT_SHLIB   5
+#define PT_PHDR    6
+#define PT_TLS     7		/* Thread local storage segment */
+#define PT_LOOS    0x60000000	/* OS-specific */
+#define PT_HIOS    0x6fffffff	/* OS-specific */
+#define PT_LOPROC  0x70000000
+#define PT_HIPROC  0x7fffffff
+#define PT_GNU_EH_FRAME		0x6474e550
+
+#define PT_GNU_STACK	(PT_LOOS + 0x474e551)
+
+/* These constants define the different elf file types */
+#define ET_NONE   0
+#define ET_REL    1
+#define ET_EXEC   2
+#define ET_DYN    3
+#define ET_CORE   4
+#define ET_LOPROC 0xff00
+#define ET_HIPROC 0xffff
+
+/* These constants define the various ELF target machines */
+#define EM_NONE  0
+#define EM_PPC	       20	/* PowerPC */
+#define EM_PPC64       21	/* PowerPC64 */
+
+#define EI_NIDENT	16
+
+typedef struct elf32_hdr {
+	unsigned char e_ident[EI_NIDENT];
+	Elf32_Half e_type;
+	Elf32_Half e_machine;
+	Elf32_Word e_version;
+	Elf32_Addr e_entry;	/* Entry point */
+	Elf32_Off e_phoff;
+	Elf32_Off e_shoff;
+	Elf32_Word e_flags;
+	Elf32_Half e_ehsize;
+	Elf32_Half e_phentsize;
+	Elf32_Half e_phnum;
+	Elf32_Half e_shentsize;
+	Elf32_Half e_shnum;
+	Elf32_Half e_shstrndx;
+} Elf32_Ehdr;
+
+typedef struct elf64_hdr {
+	unsigned char e_ident[16];	/* ELF "magic number" */
+	Elf64_Half e_type;
+	Elf64_Half e_machine;
+	Elf64_Word e_version;
+	Elf64_Addr e_entry;	/* Entry point virtual address */
+	Elf64_Off e_phoff;	/* Program header table file offset */
+	Elf64_Off e_shoff;	/* Section header table file offset */
+	Elf64_Word e_flags;
+	Elf64_Half e_ehsize;
+	Elf64_Half e_phentsize;
+	Elf64_Half e_phnum;
+	Elf64_Half e_shentsize;
+	Elf64_Half e_shnum;
+	Elf64_Half e_shstrndx;
+} Elf64_Ehdr;
+
+/* These constants define the permissions on sections in the program
+   header, p_flags. */
+#define PF_R		0x4
+#define PF_W		0x2
+#define PF_X		0x1
+
+typedef struct elf32_phdr {
+	Elf32_Word p_type;
+	Elf32_Off p_offset;
+	Elf32_Addr p_vaddr;
+	Elf32_Addr p_paddr;
+	Elf32_Word p_filesz;
+	Elf32_Word p_memsz;
+	Elf32_Word p_flags;
+	Elf32_Word p_align;
+} Elf32_Phdr;
+
+typedef struct elf64_phdr {
+	Elf64_Word p_type;
+	Elf64_Word p_flags;
+	Elf64_Off p_offset;	/* Segment file offset */
+	Elf64_Addr p_vaddr;	/* Segment virtual address */
+	Elf64_Addr p_paddr;	/* Segment physical address */
+	Elf64_Xword p_filesz;	/* Segment size in file */
+	Elf64_Xword p_memsz;	/* Segment size in memory */
+	Elf64_Xword p_align;	/* Segment alignment, file & memory */
+} Elf64_Phdr;
+
+#define	EI_MAG0		0	/* e_ident[] indexes */
+#define	EI_MAG1		1
+#define	EI_MAG2		2
+#define	EI_MAG3		3
+#define	EI_CLASS	4
+#define	EI_DATA		5
+#define	EI_VERSION	6
+#define	EI_OSABI	7
+#define	EI_PAD		8
+
+#define	ELFMAG0		0x7f	/* EI_MAG */
+#define	ELFMAG1		'E'
+#define	ELFMAG2		'L'
+#define	ELFMAG3		'F'
+#define	ELFMAG		"\177ELF"
+#define	SELFMAG		4
+
+#define	ELFCLASSNONE	0	/* EI_CLASS */
+#define	ELFCLASS32	1
+#define	ELFCLASS64	2
+#define	ELFCLASSNUM	3
+
+#define ELFDATANONE	0	/* e_ident[EI_DATA] */
+#define ELFDATA2LSB	1
+#define ELFDATA2MSB	2
+
+#define EV_NONE		0	/* e_version, EI_VERSION */
+#define EV_CURRENT	1
+#define EV_NUM		2
+
+#define ELFOSABI_NONE	0
+#define ELFOSABI_LINUX	3
+
+#endif				/* _PPC_BOOT_ELF_H_ */
diff --git a/arch/ppc64/boot/main.c b/arch/ppc64/boot/main.c
index 199d9804f61..99e68cfbe68 100644
--- a/arch/ppc64/boot/main.c
+++ b/arch/ppc64/boot/main.c
@@ -8,36 +8,28 @@
  * as published by the Free Software Foundation; either version
  * 2 of the License, or (at your option) any later version.
  */
-#include "ppc32-types.h"
+#include <stdarg.h>
+#include <stddef.h>
+#include "elf.h"
+#include "page.h"
+#include "string.h"
+#include "stdio.h"
+#include "prom.h"
 #include "zlib.h"
-#include <linux/elf.h>
-#include <linux/string.h>
-#include <asm/processor.h>
-#include <asm/page.h>
-
-extern void *finddevice(const char *);
-extern int getprop(void *, const char *, void *, int);
-extern void printf(const char *fmt, ...);
-extern int sprintf(char *buf, const char *fmt, ...);
-void gunzip(void *, int, unsigned char *, int *);
-void *claim(unsigned int, unsigned int, unsigned int);
-void flush_cache(void *, unsigned long);
-void pause(void);
-extern void exit(void);
-
-unsigned long strlen(const char *s);
-void *memmove(void *dest, const void *src, unsigned long n);
-void *memcpy(void *dest, const void *src, unsigned long n);
+
+static void gunzip(void *, int, unsigned char *, int *);
+extern void flush_cache(void *, unsigned long);
+
 
 /* Value picked to match that used by yaboot */
 #define PROG_START	0x01400000
 #define RAM_END		(256<<20) // Fixme: use OF */
 
-char *avail_ram;
-char *begin_avail, *end_avail;
-char *avail_high;
-unsigned int heap_use;
-unsigned int heap_max;
+static char *avail_ram;
+static char *begin_avail, *end_avail;
+static char *avail_high;
+static unsigned int heap_use;
+static unsigned int heap_max;
 
 extern char _start[];
 extern char _vmlinux_start[];
@@ -52,9 +44,9 @@ struct addr_range {
 	unsigned long size;
 	unsigned long memsize;
 };
-struct addr_range vmlinux = {0, 0, 0};
-struct addr_range vmlinuz = {0, 0, 0};
-struct addr_range initrd  = {0, 0, 0};
+static struct addr_range vmlinux = {0, 0, 0};
+static struct addr_range vmlinuz = {0, 0, 0};
+static struct addr_range initrd  = {0, 0, 0};
 
 static char scratch[128<<10];	/* 128kB of scratch space for gunzip */
 
@@ -64,13 +56,6 @@ typedef void (*kernel_entry_t)( unsigned long,
 				void *);
 
 
-int (*prom)(void *);
-
-void *chosen_handle;
-void *stdin;
-void *stdout;
-void *stderr;
-
 #undef DEBUG
 
 static unsigned long claim_base = PROG_START;
@@ -277,7 +262,7 @@ void zfree(void *x, void *addr, unsigned nb)
 
 #define DEFLATED	8
 
-void gunzip(void *dst, int dstlen, unsigned char *src, int *lenp)
+static void gunzip(void *dst, int dstlen, unsigned char *src, int *lenp)
 {
 	z_stream s;
 	int r, i, flags;
diff --git a/arch/ppc64/boot/page.h b/arch/ppc64/boot/page.h
new file mode 100644
index 00000000000..14eca30fef6
--- /dev/null
+++ b/arch/ppc64/boot/page.h
@@ -0,0 +1,34 @@
+#ifndef _PPC_BOOT_PAGE_H
+#define _PPC_BOOT_PAGE_H
+/*
+ * Copyright (C) 2001 PPC64 Team, IBM Corp
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifdef __ASSEMBLY__
+#define ASM_CONST(x) x
+#else
+#define __ASM_CONST(x) x##UL
+#define ASM_CONST(x) __ASM_CONST(x)
+#endif
+
+/* PAGE_SHIFT determines the page size */
+#define PAGE_SHIFT	12
+#define PAGE_SIZE	(ASM_CONST(1) << PAGE_SHIFT)
+#define PAGE_MASK	(~(PAGE_SIZE-1))
+
+/* align addr on a size boundary - adjust address up/down if needed */
+#define _ALIGN_UP(addr,size)	(((addr)+((size)-1))&(~((size)-1)))
+#define _ALIGN_DOWN(addr,size)	((addr)&(~((size)-1)))
+
+/* align addr on a size boundary - adjust address up if needed */
+#define _ALIGN(addr,size)     _ALIGN_UP(addr,size)
+
+/* to align the pointer to the (next) page boundary */
+#define PAGE_ALIGN(addr)	_ALIGN(addr, PAGE_SIZE)
+
+#endif				/* _PPC_BOOT_PAGE_H */
diff --git a/arch/ppc64/boot/ppc32-types.h b/arch/ppc64/boot/ppc32-types.h
deleted file mode 100644
index f7b8884f8f7..00000000000
--- a/arch/ppc64/boot/ppc32-types.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#ifndef _PPC64_TYPES_H
-#define _PPC64_TYPES_H
-
-typedef __signed__ char __s8;
-typedef unsigned char __u8;
-
-typedef __signed__ short __s16;
-typedef unsigned short __u16;
-
-typedef __signed__ int __s32;
-typedef unsigned int __u32;
-
-typedef __signed__ long long __s64;
-typedef unsigned long long __u64;
-
-typedef signed char s8;
-typedef unsigned char u8;
-
-typedef signed short s16;
-typedef unsigned short u16;
-
-typedef signed int s32;
-typedef unsigned int u32;
-
-typedef signed long long s64;
-typedef unsigned long long u64;
-
-typedef struct {
-	__u32 u[4];
-} __attribute((aligned(16))) __vector128;
-
-#define BITS_PER_LONG 32
-
-typedef __vector128 vector128;
-
-#endif /* _PPC64_TYPES_H */
diff --git a/arch/ppc64/boot/ppc_asm.h b/arch/ppc64/boot/ppc_asm.h
new file mode 100644
index 00000000000..1c2c2817f9b
--- /dev/null
+++ b/arch/ppc64/boot/ppc_asm.h
@@ -0,0 +1,62 @@
+#ifndef _PPC64_PPC_ASM_H
+#define _PPC64_PPC_ASM_H
+/*
+ *
+ * Definitions used by various bits of low-level assembly code on PowerPC.
+ *
+ * Copyright (C) 1995-1999 Gary Thomas, Paul Mackerras, Cort Dougan.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+/* Condition Register Bit Fields */
+
+#define	cr0	0
+#define	cr1	1
+#define	cr2	2
+#define	cr3	3
+#define	cr4	4
+#define	cr5	5
+#define	cr6	6
+#define	cr7	7
+
+
+/* General Purpose Registers (GPRs) */
+
+#define	r0	0
+#define	r1	1
+#define	r2	2
+#define	r3	3
+#define	r4	4
+#define	r5	5
+#define	r6	6
+#define	r7	7
+#define	r8	8
+#define	r9	9
+#define	r10	10
+#define	r11	11
+#define	r12	12
+#define	r13	13
+#define	r14	14
+#define	r15	15
+#define	r16	16
+#define	r17	17
+#define	r18	18
+#define	r19	19
+#define	r20	20
+#define	r21	21
+#define	r22	22
+#define	r23	23
+#define	r24	24
+#define	r25	25
+#define	r26	26
+#define	r27	27
+#define	r28	28
+#define	r29	29
+#define	r30	30
+#define	r31	31
+
+#endif /* _PPC64_PPC_ASM_H */
diff --git a/arch/ppc64/boot/prom.c b/arch/ppc64/boot/prom.c
index 5e48b80ff5a..4bea2f4dcb0 100644
--- a/arch/ppc64/boot/prom.c
+++ b/arch/ppc64/boot/prom.c
@@ -7,43 +7,19 @@
  * 2 of the License, or (at your option) any later version.
  */
 #include <stdarg.h>
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/ctype.h>
-
-extern __u32 __div64_32(unsigned long long *dividend, __u32 divisor);
-
-/* The unnecessary pointer compare is there
- * to check for type safety (n must be 64bit)
- */
-# define do_div(n,base) ({				\
-	__u32 __base = (base);			\
-	__u32 __rem;					\
-	(void)(((typeof((n)) *)0) == ((unsigned long long *)0));	\
-	if (((n) >> 32) == 0) {			\
-		__rem = (__u32)(n) % __base;		\
-		(n) = (__u32)(n) / __base;		\
-	} else 						\
-		__rem = __div64_32(&(n), __base);	\
-	__rem;						\
- })
+#include <stddef.h>
+#include "string.h"
+#include "stdio.h"
+#include "prom.h"
 
 int (*prom)(void *);
 
 void *chosen_handle;
+
 void *stdin;
 void *stdout;
 void *stderr;
 
-void exit(void);
-void *finddevice(const char *name);
-int getprop(void *phandle, const char *name, void *buf, int buflen);
-void chrpboot(int a1, int a2, void *prom);	/* in main.c */
-
-int printf(char *fmt, ...);
-
-/* there is no convenient header to get this from...  -- paulus */
-extern unsigned long strlen(const char *);
 
 int
 write(void *handle, void *ptr, int nb)
@@ -210,107 +186,6 @@ fputs(char *str, void *f)
 	return write(f, str, n) == n? 0: -1;
 }
 
-int
-readchar(void)
-{
-	char ch;
-
-	for (;;) {
-		switch (read(stdin, &ch, 1)) {
-		case 1:
-			return ch;
-		case -1:
-			printf("read(stdin) returned -1\r\n");
-			return -1;
-		}
-	}
-}
-
-static char line[256];
-static char *lineptr;
-static int lineleft;
-
-int
-getchar(void)
-{
-	int c;
-
-	if (lineleft == 0) {
-		lineptr = line;
-		for (;;) {
-			c = readchar();
-			if (c == -1 || c == 4)
-				break;
-			if (c == '\r' || c == '\n') {
-				*lineptr++ = '\n';
-				putchar('\n');
-				break;
-			}
-			switch (c) {
-			case 0177:
-			case '\b':
-				if (lineptr > line) {
-					putchar('\b');
-					putchar(' ');
-					putchar('\b');
-					--lineptr;
-				}
-				break;
-			case 'U' & 0x1F:
-				while (lineptr > line) {
-					putchar('\b');
-					putchar(' ');
-					putchar('\b');
-					--lineptr;
-				}
-				break;
-			default:
-				if (lineptr >= &line[sizeof(line) - 1])
-					putchar('\a');
-				else {
-					putchar(c);
-					*lineptr++ = c;
-				}
-			}
-		}
-		lineleft = lineptr - line;
-		lineptr = line;
-	}
-	if (lineleft == 0)
-		return -1;
-	--lineleft;
-	return *lineptr++;
-}
-
-
-
-/* String functions lifted from lib/vsprintf.c and lib/ctype.c */
-unsigned char _ctype[] = {
-_C,_C,_C,_C,_C,_C,_C,_C,			/* 0-7 */
-_C,_C|_S,_C|_S,_C|_S,_C|_S,_C|_S,_C,_C,		/* 8-15 */
-_C,_C,_C,_C,_C,_C,_C,_C,			/* 16-23 */
-_C,_C,_C,_C,_C,_C,_C,_C,			/* 24-31 */
-_S|_SP,_P,_P,_P,_P,_P,_P,_P,			/* 32-39 */
-_P,_P,_P,_P,_P,_P,_P,_P,			/* 40-47 */
-_D,_D,_D,_D,_D,_D,_D,_D,			/* 48-55 */
-_D,_D,_P,_P,_P,_P,_P,_P,			/* 56-63 */
-_P,_U|_X,_U|_X,_U|_X,_U|_X,_U|_X,_U|_X,_U,	/* 64-71 */
-_U,_U,_U,_U,_U,_U,_U,_U,			/* 72-79 */
-_U,_U,_U,_U,_U,_U,_U,_U,			/* 80-87 */
-_U,_U,_U,_P,_P,_P,_P,_P,			/* 88-95 */
-_P,_L|_X,_L|_X,_L|_X,_L|_X,_L|_X,_L|_X,_L,	/* 96-103 */
-_L,_L,_L,_L,_L,_L,_L,_L,			/* 104-111 */
-_L,_L,_L,_L,_L,_L,_L,_L,			/* 112-119 */
-_L,_L,_L,_P,_P,_P,_P,_C,			/* 120-127 */
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,		/* 128-143 */
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,		/* 144-159 */
-_S|_SP,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,   /* 160-175 */
-_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,       /* 176-191 */
-_U,_U,_U,_U,_U,_U,_U,_U,_U,_U,_U,_U,_U,_U,_U,_U,       /* 192-207 */
-_U,_U,_U,_U,_U,_U,_U,_P,_U,_U,_U,_U,_U,_U,_U,_L,       /* 208-223 */
-_L,_L,_L,_L,_L,_L,_L,_L,_L,_L,_L,_L,_L,_L,_L,_L,       /* 224-239 */
-_L,_L,_L,_L,_L,_L,_L,_P,_L,_L,_L,_L,_L,_L,_L,_L};      /* 240-255 */
-
 size_t strnlen(const char * s, size_t count)
 {
 	const char *sc;
@@ -320,44 +195,30 @@ size_t strnlen(const char * s, size_t count)
 	return sc - s;
 }
 
-unsigned long simple_strtoul(const char *cp,char **endp,unsigned int base)
-{
-	unsigned long result = 0,value;
+extern unsigned int __div64_32(unsigned long long *dividend,
+			       unsigned int divisor);
 
-	if (!base) {
-		base = 10;
-		if (*cp == '0') {
-			base = 8;
-			cp++;
-			if ((*cp == 'x') && isxdigit(cp[1])) {
-				cp++;
-				base = 16;
-			}
-		}
-	}
-	while (isxdigit(*cp) &&
-	       (value = isdigit(*cp) ? *cp-'0' : toupper(*cp)-'A'+10) < base) {
-		result = result*base + value;
-		cp++;
-	}
-	if (endp)
-		*endp = (char *)cp;
-	return result;
-}
-
-long simple_strtol(const char *cp,char **endp,unsigned int base)
-{
-	if(*cp=='-')
-		return -simple_strtoul(cp+1,endp,base);
-	return simple_strtoul(cp,endp,base);
-}
+/* The unnecessary pointer compare is there
+ * to check for type safety (n must be 64bit)
+ */
+# define do_div(n,base) ({						\
+	unsigned int __base = (base);					\
+	unsigned int __rem;						\
+	(void)(((typeof((n)) *)0) == ((unsigned long long *)0));	\
+	if (((n) >> 32) == 0) {						\
+		__rem = (unsigned int)(n) % __base;			\
+		(n) = (unsigned int)(n) / __base;			\
+	} else								\
+		__rem = __div64_32(&(n), __base);			\
+	__rem;								\
+ })
 
 static int skip_atoi(const char **s)
 {
-	int i=0;
+	int i, c;
 
-	while (isdigit(**s))
-		i = i*10 + *((*s)++) - '0';
+	for (i = 0; '0' <= (c = **s) && c <= '9'; ++*s)
+		i = i*10 + c - '0';
 	return i;
 }
 
@@ -436,9 +297,6 @@ static char * number(char * str, unsigned long long num, int base, int size, int
 	return str;
 }
 
-/* Forward decl. needed for IP address printing stuff... */
-int sprintf(char * buf, const char *fmt, ...);
-
 int vsprintf(char *buf, const char *fmt, va_list args)
 {
 	int len;
@@ -477,7 +335,7 @@ int vsprintf(char *buf, const char *fmt, va_list args)
 		
 		/* get field width */
 		field_width = -1;
-		if (isdigit(*fmt))
+		if ('0' <= *fmt && *fmt <= '9')
 			field_width = skip_atoi(&fmt);
 		else if (*fmt == '*') {
 			++fmt;
@@ -493,7 +351,7 @@ int vsprintf(char *buf, const char *fmt, va_list args)
 		precision = -1;
 		if (*fmt == '.') {
 			++fmt;	
-			if (isdigit(*fmt))
+			if ('0' <= *fmt && *fmt <= '9')
 				precision = skip_atoi(&fmt);
 			else if (*fmt == '*') {
 				++fmt;
@@ -628,7 +486,7 @@ int sprintf(char * buf, const char *fmt, ...)
 static char sprint_buf[1024];
 
 int
-printf(char *fmt, ...)
+printf(const char *fmt, ...)
 {
 	va_list args;
 	int n;
diff --git a/arch/ppc64/boot/prom.h b/arch/ppc64/boot/prom.h
new file mode 100644
index 00000000000..96ab5aec740
--- /dev/null
+++ b/arch/ppc64/boot/prom.h
@@ -0,0 +1,18 @@
+#ifndef _PPC_BOOT_PROM_H_
+#define _PPC_BOOT_PROM_H_
+
+extern int (*prom) (void *);
+extern void *chosen_handle;
+
+extern void *stdin;
+extern void *stdout;
+extern void *stderr;
+
+extern int write(void *handle, void *ptr, int nb);
+extern int read(void *handle, void *ptr, int nb);
+extern void exit(void);
+extern void pause(void);
+extern void *finddevice(const char *);
+extern void *claim(unsigned long virt, unsigned long size, unsigned long align);
+extern int getprop(void *phandle, const char *name, void *buf, int buflen);
+#endif				/* _PPC_BOOT_PROM_H_ */
diff --git a/arch/ppc64/boot/stdio.h b/arch/ppc64/boot/stdio.h
new file mode 100644
index 00000000000..24bd3a8dee9
--- /dev/null
+++ b/arch/ppc64/boot/stdio.h
@@ -0,0 +1,16 @@
+#ifndef _PPC_BOOT_STDIO_H_
+#define _PPC_BOOT_STDIO_H_
+
+extern int printf(const char *fmt, ...);
+
+extern int sprintf(char *buf, const char *fmt, ...);
+
+extern int vsprintf(char *buf, const char *fmt, va_list args);
+
+extern int putc(int c, void *f);
+extern int putchar(int c);
+extern int getchar(void);
+
+extern int fputs(char *str, void *f);
+
+#endif				/* _PPC_BOOT_STDIO_H_ */
diff --git a/arch/ppc64/boot/string.S b/arch/ppc64/boot/string.S
index ba5f2d21c9e..7ade87ae771 100644
--- a/arch/ppc64/boot/string.S
+++ b/arch/ppc64/boot/string.S
@@ -9,7 +9,7 @@
  * NOTE: this code runs in 32 bit mode and is packaged as ELF32.
  */
 
-#include <asm/ppc_asm.h>
+#include "ppc_asm.h"
 
 	.text
 	.globl	strcpy
diff --git a/arch/ppc64/boot/string.h b/arch/ppc64/boot/string.h
new file mode 100644
index 00000000000..9289258bcbd
--- /dev/null
+++ b/arch/ppc64/boot/string.h
@@ -0,0 +1,16 @@
+#ifndef _PPC_BOOT_STRING_H_
+#define _PPC_BOOT_STRING_H_
+
+extern char *strcpy(char *dest, const char *src);
+extern char *strncpy(char *dest, const char *src, size_t n);
+extern char *strcat(char *dest, const char *src);
+extern int strcmp(const char *s1, const char *s2);
+extern size_t strlen(const char *s);
+extern size_t strnlen(const char *s, size_t count);
+
+extern void *memset(void *s, int c, size_t n);
+extern void *memmove(void *dest, const void *src, unsigned long n);
+extern void *memcpy(void *dest, const void *src, unsigned long n);
+extern int memcmp(const void *s1, const void *s2, size_t n);
+
+#endif	/* _PPC_BOOT_STRING_H_ */
diff --git a/arch/ppc64/boot/zlib.c b/arch/ppc64/boot/zlib.c
index 78837e884b8..0d910cd2079 100644
--- a/arch/ppc64/boot/zlib.c
+++ b/arch/ppc64/boot/zlib.c
@@ -107,7 +107,7 @@ extern void *memcpy(void *, const void *, unsigned long);
 
 /* Diagnostic functions */
 #ifdef DEBUG_ZLIB
-#  include <stdio.h>
+#  include "stdio.h"
 #  ifndef verbose
 #    define verbose 0
 #  endif
diff --git a/arch/ppc64/configs/g5_defconfig b/arch/ppc64/configs/g5_defconfig
index ab567741e80..fc83d933028 100644
--- a/arch/ppc64/configs/g5_defconfig
+++ b/arch/ppc64/configs/g5_defconfig
@@ -103,10 +103,10 @@ CONFIG_PREEMPT_NONE=y
 # CONFIG_PREEMPT_VOLUNTARY is not set
 # CONFIG_PREEMPT is not set
 # CONFIG_PREEMPT_BKL is not set
-CONFIG_HZ_100=y
-# CONFIG_HZ_250 is not set
+# CONFIG_HZ_100 is not set
+CONFIG_HZ_250=y
 # CONFIG_HZ_1000 is not set
-CONFIG_HZ=100
+CONFIG_HZ=250
 CONFIG_GENERIC_HARDIRQS=y
 CONFIG_SECCOMP=y
 CONFIG_ISA_DMA_API=y
diff --git a/arch/ppc64/configs/iSeries_defconfig b/arch/ppc64/configs/iSeries_defconfig
index 394ba18b58c..013d4e0e400 100644
--- a/arch/ppc64/configs/iSeries_defconfig
+++ b/arch/ppc64/configs/iSeries_defconfig
@@ -94,12 +94,11 @@ CONFIG_PREEMPT_NONE=y
 # CONFIG_PREEMPT_VOLUNTARY is not set
 # CONFIG_PREEMPT is not set
 # CONFIG_PREEMPT_BKL is not set
-CONFIG_HZ_100=y
-# CONFIG_HZ_250 is not set
+# CONFIG_HZ_100 is not set
+CONFIG_HZ_250=y
 # CONFIG_HZ_1000 is not set
-CONFIG_HZ=100
+CONFIG_HZ=250
 CONFIG_GENERIC_HARDIRQS=y
-CONFIG_MSCHUNKS=y
 CONFIG_LPARCFG=y
 CONFIG_SECCOMP=y
 CONFIG_ISA_DMA_API=y
diff --git a/arch/ppc64/configs/maple_defconfig b/arch/ppc64/configs/maple_defconfig
index 2033fe663db..dd42892cd87 100644
--- a/arch/ppc64/configs/maple_defconfig
+++ b/arch/ppc64/configs/maple_defconfig
@@ -103,10 +103,10 @@ CONFIG_PREEMPT_NONE=y
 # CONFIG_PREEMPT_VOLUNTARY is not set
 # CONFIG_PREEMPT is not set
 # CONFIG_PREEMPT_BKL is not set
-CONFIG_HZ_100=y
-# CONFIG_HZ_250 is not set
+# CONFIG_HZ_100 is not set
+CONFIG_HZ_250=y
 # CONFIG_HZ_1000 is not set
-CONFIG_HZ=100
+CONFIG_HZ=250
 CONFIG_GENERIC_HARDIRQS=y
 CONFIG_SECCOMP=y
 CONFIG_ISA_DMA_API=y
diff --git a/arch/ppc64/configs/pSeries_defconfig b/arch/ppc64/configs/pSeries_defconfig
index 297fd522948..29f7b80b0ef 100644
--- a/arch/ppc64/configs/pSeries_defconfig
+++ b/arch/ppc64/configs/pSeries_defconfig
@@ -112,10 +112,10 @@ CONFIG_PREEMPT_NONE=y
 # CONFIG_PREEMPT_VOLUNTARY is not set
 # CONFIG_PREEMPT is not set
 # CONFIG_PREEMPT_BKL is not set
-CONFIG_HZ_100=y
-# CONFIG_HZ_250 is not set
+# CONFIG_HZ_100 is not set
+CONFIG_HZ_250=y
 # CONFIG_HZ_1000 is not set
-CONFIG_HZ=100
+CONFIG_HZ=250
 CONFIG_EEH=y
 CONFIG_GENERIC_HARDIRQS=y
 CONFIG_PPC_RTAS=y
diff --git a/arch/ppc64/defconfig b/arch/ppc64/defconfig
index c361e7727b7..7cb4750bb7a 100644
--- a/arch/ppc64/defconfig
+++ b/arch/ppc64/defconfig
@@ -114,10 +114,10 @@ CONFIG_PREEMPT_NONE=y
 # CONFIG_PREEMPT_VOLUNTARY is not set
 # CONFIG_PREEMPT is not set
 # CONFIG_PREEMPT_BKL is not set
-CONFIG_HZ_100=y
-# CONFIG_HZ_250 is not set
+# CONFIG_HZ_100 is not set
+CONFIG_HZ_250=y
 # CONFIG_HZ_1000 is not set
-CONFIG_HZ=100
+CONFIG_HZ=250
 CONFIG_EEH=y
 CONFIG_GENERIC_HARDIRQS=y
 CONFIG_PPC_RTAS=y
diff --git a/arch/ppc64/kernel/LparData.c b/arch/ppc64/kernel/LparData.c
index 1c11031c838..0a9c23ca2f0 100644
--- a/arch/ppc64/kernel/LparData.c
+++ b/arch/ppc64/kernel/LparData.c
@@ -51,6 +51,17 @@ struct HvReleaseData hvReleaseData = {
 		0xf4, 0x4b, 0xf6, 0xf4 },
 };
 
+/*
+ * The NACA.  The first dword of the naca is required by the iSeries
+ * hypervisor to point to itVpdAreas.  The hypervisor finds the NACA
+ * through the pointer in hvReleaseData.
+ */
+struct naca_struct naca = {
+	.xItVpdAreas = &itVpdAreas,
+	.xRamDisk = 0,
+	.xRamDiskSize = 0,
+};
+
 extern void system_reset_iSeries(void);
 extern void machine_check_iSeries(void);
 extern void data_access_iSeries(void);
@@ -214,29 +225,3 @@ struct ItVpdAreas itVpdAreas = {
 		0,0
 	}
 };
-
-struct msChunks msChunks;
-EXPORT_SYMBOL(msChunks);
-
-/* Depending on whether this is called from iSeries or pSeries setup
- * code, the location of the msChunks struct may or may not have
- * to be reloc'd, so we force the caller to do that for us by passing
- * in a pointer to the structure.
- */
-unsigned long
-msChunks_alloc(unsigned long mem, unsigned long num_chunks, unsigned long chunk_size)
-{
-	unsigned long offset = reloc_offset();
-	struct msChunks *_msChunks = PTRRELOC(&msChunks);
-
-	_msChunks->num_chunks  = num_chunks;
-	_msChunks->chunk_size  = chunk_size;
-	_msChunks->chunk_shift = __ilog2(chunk_size);
-	_msChunks->chunk_mask  = (1UL<<_msChunks->chunk_shift)-1;
-
-	mem = _ALIGN(mem, sizeof(msChunks_entry));
-	_msChunks->abs = (msChunks_entry *)(mem + offset);
-	mem += num_chunks * sizeof(msChunks_entry);
-
-	return mem;
-}
diff --git a/arch/ppc64/kernel/Makefile b/arch/ppc64/kernel/Makefile
index 2ecccb6b4f8..f4b3bfcc109 100644
--- a/arch/ppc64/kernel/Makefile
+++ b/arch/ppc64/kernel/Makefile
@@ -11,7 +11,7 @@ obj-y               :=	setup.o entry.o traps.o irq.o idle.o dma.o \
 			udbg.o binfmt_elf32.o sys_ppc32.o ioctl32.o \
 			ptrace32.o signal32.o rtc.o init_task.o \
 			lmb.o cputable.o cpu_setup_power4.o idle_power4.o \
-			iommu.o sysfs.o vdso.o pmc.o
+			iommu.o sysfs.o vdso.o pmc.o firmware.o
 obj-y += vdso32/ vdso64/
 
 obj-$(CONFIG_PPC_OF) +=	of_device.o
@@ -50,7 +50,10 @@ obj-$(CONFIG_LPARCFG)		+= lparcfg.o
 obj-$(CONFIG_HVC_CONSOLE)	+= hvconsole.o
 obj-$(CONFIG_BOOTX_TEXT)	+= btext.o
 obj-$(CONFIG_HVCS)		+= hvcserver.o
-obj-$(CONFIG_IBMVIO)		+= vio.o
+
+vio-obj-$(CONFIG_PPC_PSERIES)	+= pSeries_vio.o
+vio-obj-$(CONFIG_PPC_ISERIES)	+= iSeries_vio.o
+obj-$(CONFIG_IBMVIO)		+= vio.o $(vio-obj-y)
 obj-$(CONFIG_XICS)		+= xics.o
 obj-$(CONFIG_MPIC)		+= mpic.o
 
diff --git a/arch/ppc64/kernel/asm-offsets.c b/arch/ppc64/kernel/asm-offsets.c
index abb9e5b5da0..17e35d0fed0 100644
--- a/arch/ppc64/kernel/asm-offsets.c
+++ b/arch/ppc64/kernel/asm-offsets.c
@@ -94,7 +94,8 @@ int main(void)
 	DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr));
 	DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id));
 #ifdef CONFIG_HUGETLB_PAGE
-	DEFINE(PACAHTLBSEGS, offsetof(struct paca_struct, context.htlb_segs));
+	DEFINE(PACALOWHTLBAREAS, offsetof(struct paca_struct, context.low_htlb_areas));
+	DEFINE(PACAHIGHHTLBAREAS, offsetof(struct paca_struct, context.high_htlb_areas));
 #endif /* CONFIG_HUGETLB_PAGE */
 	DEFINE(PACADEFAULTDECR, offsetof(struct paca_struct, default_decr));
         DEFINE(PACA_EXGEN, offsetof(struct paca_struct, exgen));
diff --git a/arch/ppc64/kernel/cputable.c b/arch/ppc64/kernel/cputable.c
index 77cec42f952..4847f2ac8c9 100644
--- a/arch/ppc64/kernel/cputable.c
+++ b/arch/ppc64/kernel/cputable.c
@@ -5,7 +5,7 @@
  *
  *  Modifications for ppc64:
  *      Copyright (C) 2003 Dave Engebretsen <engebret@us.ibm.com>
- * 
+ *
  *  This program is free software; you can redistribute it and/or
  *  modify it under the terms of the GNU General Public License
  *  as published by the Free Software Foundation; either version
@@ -60,7 +60,6 @@ struct cpu_spec	cpu_specs[] = {
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
 		.cpu_setup		= __setup_cpu_power3,
-		.firmware_features	= COMMON_PPC64_FW,
 	},
 	{	/* Power3+ */
 		.pvr_mask		= 0xffff0000,
@@ -73,7 +72,6 @@ struct cpu_spec	cpu_specs[] = {
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
 		.cpu_setup		= __setup_cpu_power3,
-		.firmware_features	= COMMON_PPC64_FW,
 	},
 	{	/* Northstar */
 		.pvr_mask		= 0xffff0000,
@@ -86,7 +84,6 @@ struct cpu_spec	cpu_specs[] = {
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
 		.cpu_setup		= __setup_cpu_power3,
-		.firmware_features	= COMMON_PPC64_FW,
 	},
 	{	/* Pulsar */
 		.pvr_mask		= 0xffff0000,
@@ -99,7 +96,6 @@ struct cpu_spec	cpu_specs[] = {
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
 		.cpu_setup		= __setup_cpu_power3,
-		.firmware_features	= COMMON_PPC64_FW,
 	},
 	{	/* I-star */
 		.pvr_mask		= 0xffff0000,
@@ -112,7 +108,6 @@ struct cpu_spec	cpu_specs[] = {
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
 		.cpu_setup		= __setup_cpu_power3,
-		.firmware_features	= COMMON_PPC64_FW,
 	},
 	{	/* S-star */
 		.pvr_mask		= 0xffff0000,
@@ -125,7 +120,6 @@ struct cpu_spec	cpu_specs[] = {
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
 		.cpu_setup		= __setup_cpu_power3,
-		.firmware_features	= COMMON_PPC64_FW,
 	},
 	{	/* Power4 */
 		.pvr_mask		= 0xffff0000,
@@ -138,7 +132,6 @@ struct cpu_spec	cpu_specs[] = {
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
 		.cpu_setup		= __setup_cpu_power4,
-		.firmware_features	= COMMON_PPC64_FW,
 	},
 	{	/* Power4+ */
 		.pvr_mask		= 0xffff0000,
@@ -151,7 +144,6 @@ struct cpu_spec	cpu_specs[] = {
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
 		.cpu_setup		= __setup_cpu_power4,
-		.firmware_features	= COMMON_PPC64_FW,
 	},
 	{	/* PPC970 */
 		.pvr_mask		= 0xffff0000,
@@ -166,7 +158,6 @@ struct cpu_spec	cpu_specs[] = {
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
 		.cpu_setup		= __setup_cpu_ppc970,
-		.firmware_features	= COMMON_PPC64_FW,
 	},
 	{	/* PPC970FX */
 		.pvr_mask		= 0xffff0000,
@@ -181,7 +172,6 @@ struct cpu_spec	cpu_specs[] = {
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
 		.cpu_setup		= __setup_cpu_ppc970,
-		.firmware_features	= COMMON_PPC64_FW,
 	},
 	{	/* PPC970MP */
 		.pvr_mask		= 0xffff0000,
@@ -196,7 +186,6 @@ struct cpu_spec	cpu_specs[] = {
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
 		.cpu_setup		= __setup_cpu_ppc970,
-		.firmware_features	= COMMON_PPC64_FW,
 	},
 	{	/* Power5 */
 		.pvr_mask		= 0xffff0000,
@@ -211,7 +200,6 @@ struct cpu_spec	cpu_specs[] = {
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
 		.cpu_setup		= __setup_cpu_power4,
-		.firmware_features	= COMMON_PPC64_FW,
 	},
 	{	/* Power5 */
 		.pvr_mask		= 0xffff0000,
@@ -226,7 +214,6 @@ struct cpu_spec	cpu_specs[] = {
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
 		.cpu_setup		= __setup_cpu_power4,
-		.firmware_features	= COMMON_PPC64_FW,
 	},
 	{	/* BE DD1.x */
 		.pvr_mask		= 0xffff0000,
@@ -241,7 +228,6 @@ struct cpu_spec	cpu_specs[] = {
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
 		.cpu_setup		= __setup_cpu_be,
-		.firmware_features	= COMMON_PPC64_FW,
 	},
 	{	/* default match */
 		.pvr_mask		= 0x00000000,
@@ -254,29 +240,5 @@ struct cpu_spec	cpu_specs[] = {
 		.icache_bsize		= 128,
 		.dcache_bsize		= 128,
 		.cpu_setup		= __setup_cpu_power4,
-		.firmware_features	= COMMON_PPC64_FW,
 	}
 };
-
-firmware_feature_t firmware_features_table[FIRMWARE_MAX_FEATURES] = {
-	{FW_FEATURE_PFT,		"hcall-pft"},
-	{FW_FEATURE_TCE,		"hcall-tce"},
-	{FW_FEATURE_SPRG0,		"hcall-sprg0"},
-	{FW_FEATURE_DABR,		"hcall-dabr"},
-	{FW_FEATURE_COPY,		"hcall-copy"},
-	{FW_FEATURE_ASR,		"hcall-asr"},
-	{FW_FEATURE_DEBUG,		"hcall-debug"},
-	{FW_FEATURE_PERF,		"hcall-perf"},
-	{FW_FEATURE_DUMP,		"hcall-dump"},
-	{FW_FEATURE_INTERRUPT,		"hcall-interrupt"},
-	{FW_FEATURE_MIGRATE,		"hcall-migrate"},
-	{FW_FEATURE_PERFMON,		"hcall-perfmon"},
-	{FW_FEATURE_CRQ,		"hcall-crq"},
-	{FW_FEATURE_VIO,		"hcall-vio"},
-	{FW_FEATURE_RDMA,		"hcall-rdma"},
-	{FW_FEATURE_LLAN,		"hcall-lLAN"},
-	{FW_FEATURE_BULK,		"hcall-bulk"},
-	{FW_FEATURE_XDABR,		"hcall-xdabr"},
-	{FW_FEATURE_MULTITCE,		"hcall-multi-tce"},
-	{FW_FEATURE_SPLPAR,		"hcall-splpar"},
-};
diff --git a/arch/ppc64/kernel/firmware.c b/arch/ppc64/kernel/firmware.c
new file mode 100644
index 00000000000..d8432c0fb27
--- /dev/null
+++ b/arch/ppc64/kernel/firmware.c
@@ -0,0 +1,47 @@
+/*
+ *  arch/ppc64/kernel/firmware.c
+ *
+ *  Extracted from cputable.c
+ *
+ *  Copyright (C) 2001 Ben. Herrenschmidt (benh@kernel.crashing.org)
+ *
+ *  Modifications for ppc64:
+ *      Copyright (C) 2003 Dave Engebretsen <engebret@us.ibm.com>
+ *  Copyright (C) 2005 Stephen Rothwell, IBM Corporation
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+
+#include <asm/firmware.h>
+
+unsigned long ppc64_firmware_features;
+
+#ifdef CONFIG_PPC_PSERIES
+firmware_feature_t firmware_features_table[FIRMWARE_MAX_FEATURES] = {
+	{FW_FEATURE_PFT,		"hcall-pft"},
+	{FW_FEATURE_TCE,		"hcall-tce"},
+	{FW_FEATURE_SPRG0,		"hcall-sprg0"},
+	{FW_FEATURE_DABR,		"hcall-dabr"},
+	{FW_FEATURE_COPY,		"hcall-copy"},
+	{FW_FEATURE_ASR,		"hcall-asr"},
+	{FW_FEATURE_DEBUG,		"hcall-debug"},
+	{FW_FEATURE_PERF,		"hcall-perf"},
+	{FW_FEATURE_DUMP,		"hcall-dump"},
+	{FW_FEATURE_INTERRUPT,		"hcall-interrupt"},
+	{FW_FEATURE_MIGRATE,		"hcall-migrate"},
+	{FW_FEATURE_PERFMON,		"hcall-perfmon"},
+	{FW_FEATURE_CRQ,		"hcall-crq"},
+	{FW_FEATURE_VIO,		"hcall-vio"},
+	{FW_FEATURE_RDMA,		"hcall-rdma"},
+	{FW_FEATURE_LLAN,		"hcall-lLAN"},
+	{FW_FEATURE_BULK,		"hcall-bulk"},
+	{FW_FEATURE_XDABR,		"hcall-xdabr"},
+	{FW_FEATURE_MULTITCE,		"hcall-multi-tce"},
+	{FW_FEATURE_SPLPAR,		"hcall-splpar"},
+};
+#endif
diff --git a/arch/ppc64/kernel/head.S b/arch/ppc64/kernel/head.S
index accaa052d31..03695977562 100644
--- a/arch/ppc64/kernel/head.S
+++ b/arch/ppc64/kernel/head.S
@@ -23,14 +23,11 @@
  *  2 of the License, or (at your option) any later version.
  */
 
-#define SECONDARY_PROCESSORS
-
 #include <linux/config.h>
 #include <linux/threads.h>
 #include <asm/processor.h>
 #include <asm/page.h>
 #include <asm/mmu.h>
-#include <asm/naca.h>
 #include <asm/systemcfg.h>
 #include <asm/ppc_asm.h>
 #include <asm/offsets.h>
@@ -45,18 +42,13 @@
 #endif
 
 /*
- * hcall interface to pSeries LPAR
- */
-#define H_SET_ASR	0x30
-
-/*
  * We layout physical memory as follows:
  * 0x0000 - 0x00ff : Secondary processor spin code
  * 0x0100 - 0x2fff : pSeries Interrupt prologs
- * 0x3000 - 0x3fff : Interrupt support
- * 0x4000 - 0x4fff : NACA
- * 0x6000	   : iSeries and common interrupt prologs
- * 0x9000 - 0x9fff : Initial segment table
+ * 0x3000 - 0x5fff : interrupt support, iSeries and common interrupt prologs
+ * 0x6000 - 0x6fff : Initial (CPU0) segment table
+ * 0x7000 - 0x7fff : FWNMI data area
+ * 0x8000 -        : Early init and support code
  */
 
 /*
@@ -94,6 +86,7 @@ END_FTR_SECTION(0, 1)
 
 	/* Catch branch to 0 in real mode */
 	trap
+
 #ifdef CONFIG_PPC_ISERIES
 	/*
 	 * At offset 0x20, there is a pointer to iSeries LPAR data.
@@ -103,12 +96,12 @@ END_FTR_SECTION(0, 1)
 	.llong hvReleaseData-KERNELBASE
 
 	/*
-	 * At offset 0x28 and 0x30 are offsets to the msChunks
+	 * At offset 0x28 and 0x30 are offsets to the mschunks_map
 	 * array (used by the iSeries LPAR debugger to do translation
 	 * between physical addresses and absolute addresses) and
 	 * to the pidhash table (also used by the debugger)
 	 */
-	.llong msChunks-KERNELBASE
+	.llong mschunks_map-KERNELBASE
 	.llong 0	/* pidhash-KERNELBASE SFRXXX */
 
 	/* Offset 0x38 - Pointer to start of embedded System.map */
@@ -120,7 +113,7 @@ embedded_sysmap_start:
 embedded_sysmap_end:
 	.llong	0
 
-#else /* CONFIG_PPC_ISERIES */
+#endif /* CONFIG_PPC_ISERIES */
 
 	/* Secondary processors spin on this value until it goes to 1. */
 	.globl  __secondary_hold_spinloop
@@ -155,7 +148,7 @@ _GLOBAL(__secondary_hold)
 	std	r24,__secondary_hold_acknowledge@l(0)
 	sync
 
-	/* All secondary cpu's wait here until told to start. */
+	/* All secondary cpus wait here until told to start. */
 100:	ld	r4,__secondary_hold_spinloop@l(0)
 	cmpdi	0,r4,1
 	bne	100b
@@ -170,7 +163,6 @@ _GLOBAL(__secondary_hold)
 	BUG_OPCODE
 #endif
 #endif
-#endif
 
 /* This value is used to mark exception frames on the stack. */
 	.section ".toc","aw"
@@ -502,33 +494,37 @@ system_call_pSeries:
 	STD_EXCEPTION_PSERIES(0x1300, instruction_breakpoint)
 	STD_EXCEPTION_PSERIES(0x1700, altivec_assist)
 
+	. = 0x3000
+
+/*** pSeries interrupt support ***/
+
 	/* moved from 0xf00 */
-	STD_EXCEPTION_PSERIES(0x3000, performance_monitor)
+	STD_EXCEPTION_PSERIES(., performance_monitor)
 
-	. = 0x3100
+	.align	7
 _GLOBAL(do_stab_bolted_pSeries)
 	mtcrf	0x80,r12
 	mfspr	r12,SPRG2
 	EXCEPTION_PROLOG_PSERIES(PACA_EXSLB, .do_stab_bolted)
 
-	
-	/* Space for the naca.  Architected to be located at real address
-	 * NACA_PHYS_ADDR.  Various tools rely on this location being fixed.
-	 * The first dword of the naca is required by iSeries LPAR to
-	 * point to itVpdAreas.  On pSeries native, this value is not used.
-	 */
-	. = NACA_PHYS_ADDR
-	.globl __end_interrupts
-__end_interrupts:
-#ifdef CONFIG_PPC_ISERIES
-	.globl naca
-naca:
-	.llong	itVpdAreas
-	.llong	0		/* xRamDisk */
-	.llong	0		/* xRamDiskSize */
+/*
+ * Vectors for the FWNMI option.  Share common code.
+ */
+      .globl system_reset_fwnmi
+system_reset_fwnmi:
+      HMT_MEDIUM
+      mtspr   SPRG1,r13               /* save r13 */
+      RUNLATCH_ON(r13)
+      EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common)
 
-	. = 0x6100
+      .globl machine_check_fwnmi
+machine_check_fwnmi:
+      HMT_MEDIUM
+      mtspr   SPRG1,r13               /* save r13 */
+      RUNLATCH_ON(r13)
+      EXCEPTION_PROLOG_PSERIES(PACA_EXMC, machine_check_common)
 
+#ifdef CONFIG_PPC_ISERIES
 /***  ISeries-LPAR interrupt handlers ***/
 
 	STD_EXCEPTION_ISERIES(0x200, machine_check, PACA_EXMC)
@@ -626,9 +622,7 @@ system_reset_iSeries:
 
 	cmpwi	0,r23,0
 	beq	iSeries_secondary_smp_loop	/* Loop until told to go */
-#ifdef SECONDARY_PROCESSORS
 	bne	.__secondary_start		/* Loop until told to go */
-#endif
 iSeries_secondary_smp_loop:
 	/* Let the Hypervisor know we are alive */
 	/* 8002 is a call to HvCallCfg::getLps, a harmless Hypervisor function */
@@ -671,51 +665,8 @@ hardware_interrupt_iSeries_masked:
 	ld	r13,PACA_EXGEN+EX_R13(r13)
 	rfid
 	b	.	/* prevent speculative execution */
-#endif
-
-/*
- * Data area reserved for FWNMI option.
- */
-	.= 0x7000
-	.globl fwnmi_data_area
-fwnmi_data_area:
-
-#ifdef CONFIG_PPC_ISERIES
-	. = LPARMAP_PHYS
-#include "lparmap.s"
 #endif /* CONFIG_PPC_ISERIES */
 
-/*
- * Vectors for the FWNMI option.  Share common code.
- */
-	. = 0x8000
-	.globl system_reset_fwnmi
-system_reset_fwnmi:
-	HMT_MEDIUM
-	mtspr	SPRG1,r13		/* save r13 */
-	RUNLATCH_ON(r13)
-	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common)
-	.globl machine_check_fwnmi
-machine_check_fwnmi:
-	HMT_MEDIUM
-	mtspr	SPRG1,r13		/* save r13 */
-	RUNLATCH_ON(r13)
-	EXCEPTION_PROLOG_PSERIES(PACA_EXMC, machine_check_common)
-
-	/*
-	 * Space for the initial segment table
-	 * For LPAR, the hypervisor must fill in at least one entry
-	 * before we get control (with relocate on)
-	 */
-	. = STAB0_PHYS_ADDR
-	.globl __start_stab
-__start_stab:
-
-	. = (STAB0_PHYS_ADDR + PAGE_SIZE)
-	.globl __end_stab
-__end_stab:
-
-
 /*** Common interrupt handlers ***/
 
 	STD_EXCEPTION_COMMON(0x100, system_reset, .system_reset_exception)
@@ -752,8 +703,8 @@ machine_check_common:
  * R9 contains the saved CR, r13 points to the paca,
  * r10 contains the (bad) kernel stack pointer,
  * r11 and r12 contain the saved SRR0 and SRR1.
- * We switch to using the paca guard page as an emergency stack,
- * save the registers there, and call kernel_bad_stack(), which panics.
+ * We switch to using an emergency stack, save the registers there,
+ * and call kernel_bad_stack(), which panics.
  */
 bad_stack:
 	ld	r1,PACAEMERGSP(r13)
@@ -906,6 +857,62 @@ fp_unavailable_common:
 	bl	.kernel_fp_unavailable_exception
 	BUG_OPCODE
 
+/*
+ * load_up_fpu(unused, unused, tsk)
+ * Disable FP for the task which had the FPU previously,
+ * and save its floating-point registers in its thread_struct.
+ * Enables the FPU for use in the kernel on return.
+ * On SMP we know the fpu is free, since we give it up every
+ * switch (ie, no lazy save of the FP registers).
+ * On entry: r13 == 'current' && last_task_used_math != 'current'
+ */
+_STATIC(load_up_fpu)
+	mfmsr	r5			/* grab the current MSR */
+	ori	r5,r5,MSR_FP
+	mtmsrd	r5			/* enable use of fpu now */
+	isync
+/*
+ * For SMP, we don't do lazy FPU switching because it just gets too
+ * horrendously complex, especially when a task switches from one CPU
+ * to another.  Instead we call giveup_fpu in switch_to.
+ *
+ */
+#ifndef CONFIG_SMP
+	ld	r3,last_task_used_math@got(r2)
+	ld	r4,0(r3)
+	cmpdi	0,r4,0
+	beq	1f
+	/* Save FP state to last_task_used_math's THREAD struct */
+	addi	r4,r4,THREAD
+	SAVE_32FPRS(0, r4)
+	mffs	fr0
+	stfd	fr0,THREAD_FPSCR(r4)
+	/* Disable FP for last_task_used_math */
+	ld	r5,PT_REGS(r4)
+	ld	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
+	li	r6,MSR_FP|MSR_FE0|MSR_FE1
+	andc	r4,r4,r6
+	std	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
+1:
+#endif /* CONFIG_SMP */
+	/* enable use of FP after return */
+	ld	r4,PACACURRENT(r13)
+	addi	r5,r4,THREAD		/* Get THREAD */
+	ld	r4,THREAD_FPEXC_MODE(r5)
+	ori	r12,r12,MSR_FP
+	or	r12,r12,r4
+	std	r12,_MSR(r1)
+	lfd	fr0,THREAD_FPSCR(r5)
+	mtfsf	0xff,fr0
+	REST_32FPRS(0, r5)
+#ifndef CONFIG_SMP
+	/* Update last_task_used_math to 'current' */
+	subi	r4,r5,THREAD		/* Back to 'current' */
+	std	r4,0(r3)
+#endif /* CONFIG_SMP */
+	/* restore registers and return */
+	b	fast_exception_return
+
 	.align	7
 	.globl altivec_unavailable_common
 altivec_unavailable_common:
@@ -921,6 +928,80 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
 	bl	.altivec_unavailable_exception
 	b	.ret_from_except
 
+#ifdef CONFIG_ALTIVEC
+/*
+ * load_up_altivec(unused, unused, tsk)
+ * Disable VMX for the task which had it previously,
+ * and save its vector registers in its thread_struct.
+ * Enables the VMX for use in the kernel on return.
+ * On SMP we know the VMX is free, since we give it up every
+ * switch (ie, no lazy save of the vector registers).
+ * On entry: r13 == 'current' && last_task_used_altivec != 'current'
+ */
+_STATIC(load_up_altivec)
+	mfmsr	r5			/* grab the current MSR */
+	oris	r5,r5,MSR_VEC@h
+	mtmsrd	r5			/* enable use of VMX now */
+	isync
+
+/*
+ * For SMP, we don't do lazy VMX switching because it just gets too
+ * horrendously complex, especially when a task switches from one CPU
+ * to another.  Instead we call giveup_altvec in switch_to.
+ * VRSAVE isn't dealt with here, that is done in the normal context
+ * switch code. Note that we could rely on vrsave value to eventually
+ * avoid saving all of the VREGs here...
+ */
+#ifndef CONFIG_SMP
+	ld	r3,last_task_used_altivec@got(r2)
+	ld	r4,0(r3)
+	cmpdi	0,r4,0
+	beq	1f
+	/* Save VMX state to last_task_used_altivec's THREAD struct */
+	addi	r4,r4,THREAD
+	SAVE_32VRS(0,r5,r4)
+	mfvscr	vr0
+	li	r10,THREAD_VSCR
+	stvx	vr0,r10,r4
+	/* Disable VMX for last_task_used_altivec */
+	ld	r5,PT_REGS(r4)
+	ld	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
+	lis	r6,MSR_VEC@h
+	andc	r4,r4,r6
+	std	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
+1:
+#endif /* CONFIG_SMP */
+	/* Hack: if we get an altivec unavailable trap with VRSAVE
+	 * set to all zeros, we assume this is a broken application
+	 * that fails to set it properly, and thus we switch it to
+	 * all 1's
+	 */
+	mfspr	r4,SPRN_VRSAVE
+	cmpdi	0,r4,0
+	bne+	1f
+	li	r4,-1
+	mtspr	SPRN_VRSAVE,r4
+1:
+	/* enable use of VMX after return */
+	ld	r4,PACACURRENT(r13)
+	addi	r5,r4,THREAD		/* Get THREAD */
+	oris	r12,r12,MSR_VEC@h
+	std	r12,_MSR(r1)
+	li	r4,1
+	li	r10,THREAD_VSCR
+	stw	r4,THREAD_USED_VR(r5)
+	lvx	vr0,r10,r5
+	mtvscr	vr0
+	REST_32VRS(0,r4,r5)
+#ifndef CONFIG_SMP
+	/* Update last_task_used_math to 'current' */
+	subi	r4,r5,THREAD		/* Back to 'current' */
+	std	r4,0(r3)
+#endif /* CONFIG_SMP */
+	/* restore registers and return */
+	b	fast_exception_return
+#endif /* CONFIG_ALTIVEC */
+
 /*
  * Hash table stuff
  */
@@ -1167,6 +1248,42 @@ unrecov_slb:
 	bl	.unrecoverable_exception
 	b	1b
 
+/*
+ * Space for CPU0's segment table.
+ *
+ * On iSeries, the hypervisor must fill in at least one entry before
+ * we get control (with relocate on).  The address is give to the hv
+ * as a page number (see xLparMap in LparData.c), so this must be at a
+ * fixed address (the linker can't compute (u64)&initial_stab >>
+ * PAGE_SHIFT).
+ */
+	. = STAB0_PHYS_ADDR	/* 0x6000 */
+	.globl initial_stab
+initial_stab:
+	.space	4096
+
+/*
+ * Data area reserved for FWNMI option.
+ * This address (0x7000) is fixed by the RPA.
+ */
+	.= 0x7000
+	.globl fwnmi_data_area
+fwnmi_data_area:
+
+	/* iSeries does not use the FWNMI stuff, so it is safe to put
+	 * this here, even if we later allow kernels that will boot on
+	 * both pSeries and iSeries */
+#ifdef CONFIG_PPC_ISERIES
+        . = LPARMAP_PHYS
+#include "lparmap.s"
+/*
+ * This ".text" is here for old compilers that generate a trailing
+ * .note section when compiling .c files to .s
+ */
+	.text
+#endif /* CONFIG_PPC_ISERIES */
+
+        . = 0x8000
 
 /*
  * On pSeries, secondary processors spin in the following code.
@@ -1200,7 +1317,7 @@ _GLOBAL(pSeries_secondary_smp_init)
 	b	.kexec_wait		/* next kernel might do better	 */
 
 2:	mtspr	SPRG3,r13		/* Save vaddr of paca in SPRG3	 */
-	/* From now on, r24 is expected to be logica cpuid */
+	/* From now on, r24 is expected to be logical cpuid */
 	mr	r24,r5
 3:	HMT_LOW
 	lbz	r23,PACAPROCSTART(r13)	/* Test if this processor should */
@@ -1213,10 +1330,8 @@ _GLOBAL(pSeries_secondary_smp_init)
 
 	cmpwi	0,r23,0
 #ifdef CONFIG_SMP
-#ifdef SECONDARY_PROCESSORS
 	bne	.__secondary_start
 #endif
-#endif
 	b 	3b			/* Loop until told to go	 */
 
 #ifdef CONFIG_PPC_ISERIES
@@ -1430,228 +1545,6 @@ _GLOBAL(copy_and_flush)
 .align 8
 copy_to_here:
 
-/*
- * load_up_fpu(unused, unused, tsk)
- * Disable FP for the task which had the FPU previously,
- * and save its floating-point registers in its thread_struct.
- * Enables the FPU for use in the kernel on return.
- * On SMP we know the fpu is free, since we give it up every
- * switch (ie, no lazy save of the FP registers).
- * On entry: r13 == 'current' && last_task_used_math != 'current'
- */
-_STATIC(load_up_fpu)
-	mfmsr	r5			/* grab the current MSR */
-	ori	r5,r5,MSR_FP
-	mtmsrd	r5			/* enable use of fpu now */
-	isync
-/*
- * For SMP, we don't do lazy FPU switching because it just gets too
- * horrendously complex, especially when a task switches from one CPU
- * to another.  Instead we call giveup_fpu in switch_to.
- *
- */
-#ifndef CONFIG_SMP
-	ld	r3,last_task_used_math@got(r2)
-	ld	r4,0(r3)
-	cmpdi	0,r4,0
-	beq	1f
-	/* Save FP state to last_task_used_math's THREAD struct */
-	addi	r4,r4,THREAD
-	SAVE_32FPRS(0, r4)
-	mffs	fr0
-	stfd	fr0,THREAD_FPSCR(r4)
-	/* Disable FP for last_task_used_math */
-	ld	r5,PT_REGS(r4)
-	ld	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
-	li	r6,MSR_FP|MSR_FE0|MSR_FE1
-	andc	r4,r4,r6
-	std	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
-1:
-#endif /* CONFIG_SMP */
-	/* enable use of FP after return */
-	ld	r4,PACACURRENT(r13)
-	addi	r5,r4,THREAD		/* Get THREAD */
-	ld	r4,THREAD_FPEXC_MODE(r5)
-	ori	r12,r12,MSR_FP
-	or	r12,r12,r4
-	std	r12,_MSR(r1)
-	lfd	fr0,THREAD_FPSCR(r5)
-	mtfsf	0xff,fr0
-	REST_32FPRS(0, r5)
-#ifndef CONFIG_SMP
-	/* Update last_task_used_math to 'current' */
-	subi	r4,r5,THREAD		/* Back to 'current' */
-	std	r4,0(r3)
-#endif /* CONFIG_SMP */
-	/* restore registers and return */
-	b	fast_exception_return
-
-/*
- * disable_kernel_fp()
- * Disable the FPU.
- */
-_GLOBAL(disable_kernel_fp)
-	mfmsr	r3
-	rldicl	r0,r3,(63-MSR_FP_LG),1
-	rldicl	r3,r0,(MSR_FP_LG+1),0
-	mtmsrd	r3			/* disable use of fpu now */
-	isync
-	blr
-
-/*
- * giveup_fpu(tsk)
- * Disable FP for the task given as the argument,
- * and save the floating-point registers in its thread_struct.
- * Enables the FPU for use in the kernel on return.
- */
-_GLOBAL(giveup_fpu)
-	mfmsr	r5
-	ori	r5,r5,MSR_FP
-	mtmsrd	r5			/* enable use of fpu now */
-	isync
-	cmpdi	0,r3,0
-	beqlr-				/* if no previous owner, done */
-	addi	r3,r3,THREAD		/* want THREAD of task */
-	ld	r5,PT_REGS(r3)
-	cmpdi	0,r5,0
-	SAVE_32FPRS(0, r3)
-	mffs	fr0
-	stfd	fr0,THREAD_FPSCR(r3)
-	beq	1f
-	ld	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
-	li	r3,MSR_FP|MSR_FE0|MSR_FE1
-	andc	r4,r4,r3		/* disable FP for previous task */
-	std	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
-1:
-#ifndef CONFIG_SMP
-	li	r5,0
-	ld	r4,last_task_used_math@got(r2)
-	std	r5,0(r4)
-#endif /* CONFIG_SMP */
-	blr
-
-
-#ifdef CONFIG_ALTIVEC
-		
-/*
- * load_up_altivec(unused, unused, tsk)
- * Disable VMX for the task which had it previously,
- * and save its vector registers in its thread_struct.
- * Enables the VMX for use in the kernel on return.
- * On SMP we know the VMX is free, since we give it up every
- * switch (ie, no lazy save of the vector registers).
- * On entry: r13 == 'current' && last_task_used_altivec != 'current'
- */
-_STATIC(load_up_altivec)
-	mfmsr	r5			/* grab the current MSR */
-	oris	r5,r5,MSR_VEC@h
-	mtmsrd	r5			/* enable use of VMX now */
-	isync
-	
-/*
- * For SMP, we don't do lazy VMX switching because it just gets too
- * horrendously complex, especially when a task switches from one CPU
- * to another.  Instead we call giveup_altvec in switch_to.
- * VRSAVE isn't dealt with here, that is done in the normal context
- * switch code. Note that we could rely on vrsave value to eventually
- * avoid saving all of the VREGs here...
- */
-#ifndef CONFIG_SMP
-	ld	r3,last_task_used_altivec@got(r2)
-	ld	r4,0(r3)
-	cmpdi	0,r4,0
-	beq	1f
-	/* Save VMX state to last_task_used_altivec's THREAD struct */
-	addi	r4,r4,THREAD
-	SAVE_32VRS(0,r5,r4)
-	mfvscr	vr0
-	li	r10,THREAD_VSCR
-	stvx	vr0,r10,r4
-	/* Disable VMX for last_task_used_altivec */
-	ld	r5,PT_REGS(r4)
-	ld	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
-	lis	r6,MSR_VEC@h
-	andc	r4,r4,r6
-	std	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
-1:
-#endif /* CONFIG_SMP */
-	/* Hack: if we get an altivec unavailable trap with VRSAVE
-	 * set to all zeros, we assume this is a broken application
-	 * that fails to set it properly, and thus we switch it to
-	 * all 1's
-	 */
-	mfspr	r4,SPRN_VRSAVE
-	cmpdi	0,r4,0
-	bne+	1f
-	li	r4,-1
-	mtspr	SPRN_VRSAVE,r4
-1:
-	/* enable use of VMX after return */
-	ld	r4,PACACURRENT(r13)
-	addi	r5,r4,THREAD		/* Get THREAD */
-	oris	r12,r12,MSR_VEC@h
-	std	r12,_MSR(r1)
-	li	r4,1
-	li	r10,THREAD_VSCR
-	stw	r4,THREAD_USED_VR(r5)
-	lvx	vr0,r10,r5
-	mtvscr	vr0
-	REST_32VRS(0,r4,r5)
-#ifndef CONFIG_SMP
-	/* Update last_task_used_math to 'current' */
-	subi	r4,r5,THREAD		/* Back to 'current' */
-	std	r4,0(r3)
-#endif /* CONFIG_SMP */
-	/* restore registers and return */
-	b	fast_exception_return
-
-/*
- * disable_kernel_altivec()
- * Disable the VMX.
- */
-_GLOBAL(disable_kernel_altivec)
-	mfmsr	r3
-	rldicl	r0,r3,(63-MSR_VEC_LG),1
-	rldicl	r3,r0,(MSR_VEC_LG+1),0
-	mtmsrd	r3			/* disable use of VMX now */
-	isync
-	blr
-
-/*
- * giveup_altivec(tsk)
- * Disable VMX for the task given as the argument,
- * and save the vector registers in its thread_struct.
- * Enables the VMX for use in the kernel on return.
- */
-_GLOBAL(giveup_altivec)
-	mfmsr	r5
-	oris	r5,r5,MSR_VEC@h
-	mtmsrd	r5			/* enable use of VMX now */
-	isync
-	cmpdi	0,r3,0
-	beqlr-				/* if no previous owner, done */
-	addi	r3,r3,THREAD		/* want THREAD of task */
-	ld	r5,PT_REGS(r3)
-	cmpdi	0,r5,0
-	SAVE_32VRS(0,r4,r3)
-	mfvscr	vr0
-	li	r4,THREAD_VSCR
-	stvx	vr0,r4,r3
-	beq	1f
-	ld	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
-	lis	r3,MSR_VEC@h
-	andc	r4,r4,r3		/* disable FP for previous task */
-	std	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
-1:
-#ifndef CONFIG_SMP
-	li	r5,0
-	ld	r4,last_task_used_altivec@got(r2)
-	std	r5,0(r4)
-#endif /* CONFIG_SMP */
-	blr
-
-#endif /* CONFIG_ALTIVEC */
-
 #ifdef CONFIG_SMP
 #ifdef CONFIG_PPC_PMAC
 /*
@@ -2002,9 +1895,6 @@ _STATIC(start_here_common)
 
 	bl .start_kernel
 
-_GLOBAL(__setup_cpu_power3)
-	blr
-
 _GLOBAL(hmt_init)
 #ifdef CONFIG_HMT
 	LOADADDR(r5, hmt_thread_data)
@@ -2095,20 +1985,19 @@ _GLOBAL(smp_release_cpus)
 
 /*
  * We put a few things here that have to be page-aligned.
- * This stuff goes at the beginning of the data segment,
- * which is page-aligned.
+ * This stuff goes at the beginning of the bss, which is page-aligned.
  */
-	.data
+	.section ".bss"
+
 	.align	12
-	.globl	sdata
-sdata:
+
 	.globl	empty_zero_page
 empty_zero_page:
-	.space	4096
+	.space	PAGE_SIZE
 
 	.globl	swapper_pg_dir
 swapper_pg_dir:
-	.space	4096
+	.space	PAGE_SIZE
 
 /*
  * This space gets a copy of optional info passed to us by the bootstrap
diff --git a/arch/ppc64/kernel/iSeries_htab.c b/arch/ppc64/kernel/iSeries_htab.c
index b0250ae4a72..2192055a90a 100644
--- a/arch/ppc64/kernel/iSeries_htab.c
+++ b/arch/ppc64/kernel/iSeries_htab.c
@@ -41,6 +41,7 @@ static long iSeries_hpte_insert(unsigned long hpte_group, unsigned long va,
 				unsigned long prpn, unsigned long vflags,
 				unsigned long rflags)
 {
+	unsigned long arpn;
 	long slot;
 	hpte_t lhpte;
 	int secondary = 0;
@@ -70,8 +71,10 @@ static long iSeries_hpte_insert(unsigned long hpte_group, unsigned long va,
 		slot &= 0x7fffffffffffffff;
 	}
 
+	arpn = phys_to_abs(prpn << PAGE_SHIFT) >> PAGE_SHIFT;
+
 	lhpte.v = (va >> 23) << HPTE_V_AVPN_SHIFT | vflags | HPTE_V_VALID;
-	lhpte.r = (physRpn_to_absRpn(prpn) << HPTE_R_RPN_SHIFT) | rflags;
+	lhpte.r = (arpn << HPTE_R_RPN_SHIFT) | rflags;
 
 	/* Now fill in the actual HPTE */
 	HvCallHpt_addValidate(slot, secondary, &lhpte);
diff --git a/arch/ppc64/kernel/iSeries_setup.c b/arch/ppc64/kernel/iSeries_setup.c
index a649edbb23b..3ffefbbc662 100644
--- a/arch/ppc64/kernel/iSeries_setup.c
+++ b/arch/ppc64/kernel/iSeries_setup.c
@@ -39,6 +39,7 @@
 #include <asm/cputable.h>
 #include <asm/sections.h>
 #include <asm/iommu.h>
+#include <asm/firmware.h>
 
 #include <asm/time.h>
 #include "iSeries_setup.h"
@@ -314,6 +315,8 @@ static void __init iSeries_init_early(void)
 
 	DBG(" -> iSeries_init_early()\n");
 
+	ppc64_firmware_features = FW_FEATURE_ISERIES;
+
 	ppcdbg_initialize();
 
 #if defined(CONFIG_BLK_DEV_INITRD)
@@ -412,6 +415,22 @@ static void __init iSeries_init_early(void)
 	DBG(" <- iSeries_init_early()\n");
 }
 
+struct mschunks_map mschunks_map = {
+	/* XXX We don't use these, but Piranha might need them. */
+	.chunk_size  = MSCHUNKS_CHUNK_SIZE,
+	.chunk_shift = MSCHUNKS_CHUNK_SHIFT,
+	.chunk_mask  = MSCHUNKS_OFFSET_MASK,
+};
+EXPORT_SYMBOL(mschunks_map);
+
+void mschunks_alloc(unsigned long num_chunks)
+{
+	klimit = _ALIGN(klimit, sizeof(u32));
+	mschunks_map.mapping = (u32 *)klimit;
+	klimit += num_chunks * sizeof(u32);
+	mschunks_map.num_chunks = num_chunks;
+}
+
 /*
  * The iSeries may have very large memories ( > 128 GB ) and a partition
  * may get memory in "chunks" that may be anywhere in the 2**52 real
@@ -449,7 +468,7 @@ static void __init build_iSeries_Memory_Map(void)
 
 	/* Chunk size on iSeries is 256K bytes */
 	totalChunks = (u32)HvLpConfig_getMsChunks();
-	klimit = msChunks_alloc(klimit, totalChunks, 1UL << 18);
+	mschunks_alloc(totalChunks);
 
 	/*
 	 * Get absolute address of our load area
@@ -486,7 +505,7 @@ static void __init build_iSeries_Memory_Map(void)
 	printk("Load area size %dK\n", loadAreaSize * 256);
 
 	for (nextPhysChunk = 0; nextPhysChunk < loadAreaSize; ++nextPhysChunk)
-		msChunks.abs[nextPhysChunk] =
+		mschunks_map.mapping[nextPhysChunk] =
 			loadAreaFirstChunk + nextPhysChunk;
 
 	/*
@@ -495,7 +514,7 @@ static void __init build_iSeries_Memory_Map(void)
 	 */
 	hptFirstChunk = (u32)addr_to_chunk(HvCallHpt_getHptAddress());
 	hptSizePages = (u32)HvCallHpt_getHptPages();
-	hptSizeChunks = hptSizePages >> (msChunks.chunk_shift - PAGE_SHIFT);
+	hptSizeChunks = hptSizePages >> (MSCHUNKS_CHUNK_SHIFT - PAGE_SHIFT);
 	hptLastChunk = hptFirstChunk + hptSizeChunks - 1;
 
 	printk("HPT absolute addr = %016lx, size = %dK\n",
@@ -552,7 +571,8 @@ static void __init build_iSeries_Memory_Map(void)
 				     (absChunk > hptLastChunk)) &&
 				    ((absChunk < loadAreaFirstChunk) ||
 				     (absChunk > loadAreaLastChunk))) {
-					msChunks.abs[nextPhysChunk] = absChunk;
+					mschunks_map.mapping[nextPhysChunk] =
+						absChunk;
 					++nextPhysChunk;
 				}
 			}
@@ -944,6 +964,8 @@ void __init iSeries_early_setup(void)
 	ppc_md.calibrate_decr = iSeries_calibrate_decr;
 	ppc_md.progress = iSeries_progress;
 
+	/* XXX Implement enable_pmcs for iSeries */
+
 	if (get_paca()->lppaca.shared_proc) {
 		ppc_md.idle_loop = iseries_shared_idle;
 		printk(KERN_INFO "Using shared processor idle loop\n");
diff --git a/arch/ppc64/kernel/iSeries_vio.c b/arch/ppc64/kernel/iSeries_vio.c
new file mode 100644
index 00000000000..6b754b0c834
--- /dev/null
+++ b/arch/ppc64/kernel/iSeries_vio.c
@@ -0,0 +1,155 @@
+/*
+ * IBM PowerPC iSeries Virtual I/O Infrastructure Support.
+ *
+ *    Copyright (c) 2005 Stephen Rothwell, IBM Corp.
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/init.h>
+
+#include <asm/vio.h>
+#include <asm/iommu.h>
+#include <asm/abs_addr.h>
+#include <asm/page.h>
+#include <asm/iSeries/vio.h>
+#include <asm/iSeries/HvTypes.h>
+#include <asm/iSeries/HvLpConfig.h>
+#include <asm/iSeries/HvCallXm.h>
+
+struct device *iSeries_vio_dev = &vio_bus_device.dev;
+EXPORT_SYMBOL(iSeries_vio_dev);
+
+static struct iommu_table veth_iommu_table;
+static struct iommu_table vio_iommu_table;
+
+static void __init iommu_vio_init(void)
+{
+	struct iommu_table *t;
+	struct iommu_table_cb cb;
+	unsigned long cbp;
+	unsigned long itc_entries;
+
+	cb.itc_busno = 255;    /* Bus 255 is the virtual bus */
+	cb.itc_virtbus = 0xff; /* Ask for virtual bus */
+
+	cbp = virt_to_abs(&cb);
+	HvCallXm_getTceTableParms(cbp);
+
+	itc_entries = cb.itc_size * PAGE_SIZE / sizeof(union tce_entry);
+	veth_iommu_table.it_size        = itc_entries / 2;
+	veth_iommu_table.it_busno       = cb.itc_busno;
+	veth_iommu_table.it_offset      = cb.itc_offset;
+	veth_iommu_table.it_index       = cb.itc_index;
+	veth_iommu_table.it_type        = TCE_VB;
+	veth_iommu_table.it_blocksize	= 1;
+
+	t = iommu_init_table(&veth_iommu_table);
+
+	if (!t)
+		printk("Virtual Bus VETH TCE table failed.\n");
+
+	vio_iommu_table.it_size         = itc_entries - veth_iommu_table.it_size;
+	vio_iommu_table.it_busno        = cb.itc_busno;
+	vio_iommu_table.it_offset       = cb.itc_offset +
+					  veth_iommu_table.it_size;
+	vio_iommu_table.it_index        = cb.itc_index;
+	vio_iommu_table.it_type         = TCE_VB;
+	vio_iommu_table.it_blocksize	= 1;
+
+	t = iommu_init_table(&vio_iommu_table);
+
+	if (!t)
+		printk("Virtual Bus VIO TCE table failed.\n");
+}
+
+/**
+ * vio_register_device_iseries: - Register a new iSeries vio device.
+ * @voidev:	The device to register.
+ */
+static struct vio_dev *__init vio_register_device_iseries(char *type,
+		uint32_t unit_num)
+{
+	struct vio_dev *viodev;
+
+	/* allocate a vio_dev for this device */
+	viodev = kmalloc(sizeof(struct vio_dev), GFP_KERNEL);
+	if (!viodev)
+		return NULL;
+	memset(viodev, 0, sizeof(struct vio_dev));
+
+	snprintf(viodev->dev.bus_id, BUS_ID_SIZE, "%s%d", type, unit_num);
+
+	viodev->name = viodev->dev.bus_id;
+	viodev->type = type;
+	viodev->unit_address = unit_num;
+	viodev->iommu_table = &vio_iommu_table;
+	if (vio_register_device(viodev) == NULL) {
+		kfree(viodev);
+		return NULL;
+	}
+	return viodev;
+}
+
+void __init probe_bus_iseries(void)
+{
+	HvLpIndexMap vlan_map;
+	struct vio_dev *viodev;
+	int i;
+
+	/* there is only one of each of these */
+	vio_register_device_iseries("viocons", 0);
+	vio_register_device_iseries("vscsi", 0);
+
+	vlan_map = HvLpConfig_getVirtualLanIndexMap();
+	for (i = 0; i < HVMAXARCHITECTEDVIRTUALLANS; i++) {
+		if ((vlan_map & (0x8000 >> i)) == 0)
+			continue;
+		viodev = vio_register_device_iseries("vlan", i);
+		/* veth is special and has it own iommu_table */
+		viodev->iommu_table = &veth_iommu_table;
+	}
+	for (i = 0; i < HVMAXARCHITECTEDVIRTUALDISKS; i++)
+		vio_register_device_iseries("viodasd", i);
+	for (i = 0; i < HVMAXARCHITECTEDVIRTUALCDROMS; i++)
+		vio_register_device_iseries("viocd", i);
+	for (i = 0; i < HVMAXARCHITECTEDVIRTUALTAPES; i++)
+		vio_register_device_iseries("viotape", i);
+}
+
+/**
+ * vio_match_device_iseries: - Tell if a iSeries VIO device matches a
+ *	vio_device_id
+ */
+static int vio_match_device_iseries(const struct vio_device_id *id,
+		const struct vio_dev *dev)
+{
+	return strncmp(dev->type, id->type, strlen(id->type)) == 0;
+}
+
+static struct vio_bus_ops vio_bus_ops_iseries = {
+	.match = vio_match_device_iseries,
+};
+
+/**
+ * vio_bus_init_iseries: - Initialize the iSeries virtual IO bus
+ */
+static int __init vio_bus_init_iseries(void)
+{
+	int err;
+
+	err = vio_bus_init(&vio_bus_ops_iseries);
+	if (err == 0) {
+		iommu_vio_init();
+		vio_bus_device.iommu_table = &vio_iommu_table;
+		iSeries_vio_dev = &vio_bus_device.dev;
+		probe_bus_iseries();
+	}
+	return err;
+}
+
+__initcall(vio_bus_init_iseries);
diff --git a/arch/ppc64/kernel/lmb.c b/arch/ppc64/kernel/lmb.c
index d6c6bd03d2a..5adaca2ddc9 100644
--- a/arch/ppc64/kernel/lmb.c
+++ b/arch/ppc64/kernel/lmb.c
@@ -28,33 +28,28 @@ void lmb_dump_all(void)
 {
 #ifdef DEBUG
 	unsigned long i;
-	struct lmb *_lmb  = &lmb;
 
 	udbg_printf("lmb_dump_all:\n");
 	udbg_printf("    memory.cnt		  = 0x%lx\n",
-		    _lmb->memory.cnt);
+		    lmb.memory.cnt);
 	udbg_printf("    memory.size		  = 0x%lx\n",
-		    _lmb->memory.size);
-	for (i=0; i < _lmb->memory.cnt ;i++) {
+		    lmb.memory.size);
+	for (i=0; i < lmb.memory.cnt ;i++) {
 		udbg_printf("    memory.region[0x%x].base       = 0x%lx\n",
-			    i, _lmb->memory.region[i].base);
-		udbg_printf("		      .physbase = 0x%lx\n",
-			    _lmb->memory.region[i].physbase);
+			    i, lmb.memory.region[i].base);
 		udbg_printf("		      .size     = 0x%lx\n",
-			    _lmb->memory.region[i].size);
+			    lmb.memory.region[i].size);
 	}
 
 	udbg_printf("\n    reserved.cnt	  = 0x%lx\n",
-		    _lmb->reserved.cnt);
+		    lmb.reserved.cnt);
 	udbg_printf("    reserved.size	  = 0x%lx\n",
-		    _lmb->reserved.size);
-	for (i=0; i < _lmb->reserved.cnt ;i++) {
+		    lmb.reserved.size);
+	for (i=0; i < lmb.reserved.cnt ;i++) {
 		udbg_printf("    reserved.region[0x%x].base       = 0x%lx\n",
-			    i, _lmb->reserved.region[i].base);
-		udbg_printf("		      .physbase = 0x%lx\n",
-			    _lmb->reserved.region[i].physbase);
+			    i, lmb.reserved.region[i].base);
 		udbg_printf("		      .size     = 0x%lx\n",
-			    _lmb->reserved.region[i].size);
+			    lmb.reserved.region[i].size);
 	}
 #endif /* DEBUG */
 }
@@ -98,7 +93,6 @@ lmb_coalesce_regions(struct lmb_region *rgn, unsigned long r1, unsigned long r2)
 	rgn->region[r1].size += rgn->region[r2].size;
 	for (i=r2; i < rgn->cnt-1; i++) {
 		rgn->region[i].base = rgn->region[i+1].base;
-		rgn->region[i].physbase = rgn->region[i+1].physbase;
 		rgn->region[i].size = rgn->region[i+1].size;
 	}
 	rgn->cnt--;
@@ -108,49 +102,29 @@ lmb_coalesce_regions(struct lmb_region *rgn, unsigned long r1, unsigned long r2)
 void __init
 lmb_init(void)
 {
-	struct lmb *_lmb = &lmb;
-
 	/* Create a dummy zero size LMB which will get coalesced away later.
 	 * This simplifies the lmb_add() code below...
 	 */
-	_lmb->memory.region[0].base = 0;
-	_lmb->memory.region[0].size = 0;
-	_lmb->memory.cnt = 1;
+	lmb.memory.region[0].base = 0;
+	lmb.memory.region[0].size = 0;
+	lmb.memory.cnt = 1;
 
 	/* Ditto. */
-	_lmb->reserved.region[0].base = 0;
-	_lmb->reserved.region[0].size = 0;
-	_lmb->reserved.cnt = 1;
+	lmb.reserved.region[0].base = 0;
+	lmb.reserved.region[0].size = 0;
+	lmb.reserved.cnt = 1;
 }
 
 /* This routine called with relocation disabled. */
 void __init
 lmb_analyze(void)
 {
-	unsigned long i;
-	unsigned long mem_size = 0;
-	unsigned long size_mask = 0;
-	struct lmb *_lmb = &lmb;
-#ifdef CONFIG_MSCHUNKS
-	unsigned long physbase = 0;
-#endif
-
-	for (i=0; i < _lmb->memory.cnt; i++) {
-		unsigned long lmb_size;
-
-		lmb_size = _lmb->memory.region[i].size;
-
-#ifdef CONFIG_MSCHUNKS
-		_lmb->memory.region[i].physbase = physbase;
-		physbase += lmb_size;
-#else
-		_lmb->memory.region[i].physbase = _lmb->memory.region[i].base;
-#endif
-		mem_size += lmb_size;
-		size_mask |= lmb_size;
-	}
+	int i;
+
+	lmb.memory.size = 0;
 
-	_lmb->memory.size = mem_size;
+	for (i = 0; i < lmb.memory.cnt; i++)
+		lmb.memory.size += lmb.memory.region[i].size;
 }
 
 /* This routine called with relocation disabled. */
@@ -168,7 +142,6 @@ lmb_add_region(struct lmb_region *rgn, unsigned long base, unsigned long size)
 		adjacent = lmb_addrs_adjacent(base,size,rgnbase,rgnsize);
 		if ( adjacent > 0 ) {
 			rgn->region[i].base -= size;
-			rgn->region[i].physbase -= size;
 			rgn->region[i].size += size;
 			coalesced++;
 			break;
@@ -195,11 +168,9 @@ lmb_add_region(struct lmb_region *rgn, unsigned long base, unsigned long size)
 	for (i=rgn->cnt-1; i >= 0; i--) {
 		if (base < rgn->region[i].base) {
 			rgn->region[i+1].base = rgn->region[i].base;
-			rgn->region[i+1].physbase = rgn->region[i].physbase;
 			rgn->region[i+1].size = rgn->region[i].size;
 		}  else {
 			rgn->region[i+1].base = base;
-			rgn->region[i+1].physbase = lmb_abs_to_phys(base);
 			rgn->region[i+1].size = size;
 			break;
 		}
@@ -213,12 +184,11 @@ lmb_add_region(struct lmb_region *rgn, unsigned long base, unsigned long size)
 long __init
 lmb_add(unsigned long base, unsigned long size)
 {
-	struct lmb *_lmb = &lmb;
-	struct lmb_region *_rgn = &(_lmb->memory);
+	struct lmb_region *_rgn = &(lmb.memory);
 
 	/* On pSeries LPAR systems, the first LMB is our RMO region. */
 	if ( base == 0 )
-		_lmb->rmo_size = size;
+		lmb.rmo_size = size;
 
 	return lmb_add_region(_rgn, base, size);
 
@@ -227,8 +197,7 @@ lmb_add(unsigned long base, unsigned long size)
 long __init
 lmb_reserve(unsigned long base, unsigned long size)
 {
-	struct lmb *_lmb = &lmb;
-	struct lmb_region *_rgn = &(_lmb->reserved);
+	struct lmb_region *_rgn = &(lmb.reserved);
 
 	return lmb_add_region(_rgn, base, size);
 }
@@ -260,13 +229,10 @@ lmb_alloc_base(unsigned long size, unsigned long align, unsigned long max_addr)
 {
 	long i, j;
 	unsigned long base = 0;
-	struct lmb *_lmb = &lmb;
-	struct lmb_region *_mem = &(_lmb->memory);
-	struct lmb_region *_rsv = &(_lmb->reserved);
 
-	for (i=_mem->cnt-1; i >= 0; i--) {
-		unsigned long lmbbase = _mem->region[i].base;
-		unsigned long lmbsize = _mem->region[i].size;
+	for (i=lmb.memory.cnt-1; i >= 0; i--) {
+		unsigned long lmbbase = lmb.memory.region[i].base;
+		unsigned long lmbsize = lmb.memory.region[i].size;
 
 		if ( max_addr == LMB_ALLOC_ANYWHERE )
 			base = _ALIGN_DOWN(lmbbase+lmbsize-size, align);
@@ -276,8 +242,8 @@ lmb_alloc_base(unsigned long size, unsigned long align, unsigned long max_addr)
 			continue;
 
 		while ( (lmbbase <= base) &&
-			((j = lmb_overlaps_region(_rsv,base,size)) >= 0) ) {
-			base = _ALIGN_DOWN(_rsv->region[j].base-size, align);
+			((j = lmb_overlaps_region(&lmb.reserved,base,size)) >= 0) ) {
+			base = _ALIGN_DOWN(lmb.reserved.region[j].base-size, align);
 		}
 
 		if ( (base != 0) && (lmbbase <= base) )
@@ -287,62 +253,24 @@ lmb_alloc_base(unsigned long size, unsigned long align, unsigned long max_addr)
 	if ( i < 0 )
 		return 0;
 
-	lmb_add_region(_rsv, base, size);
+	lmb_add_region(&lmb.reserved, base, size);
 
 	return base;
 }
 
+/* You must call lmb_analyze() before this. */
 unsigned long __init
 lmb_phys_mem_size(void)
 {
-	struct lmb *_lmb = &lmb;
-#ifdef CONFIG_MSCHUNKS
-	return _lmb->memory.size;
-#else
-	struct lmb_region *_mem = &(_lmb->memory);
-	unsigned long total = 0;
-	int i;
-
-	/* add all physical memory to the bootmem map */
-	for (i=0; i < _mem->cnt; i++)
-		total += _mem->region[i].size;
-	return total;
-#endif /* CONFIG_MSCHUNKS */
+	return lmb.memory.size;
 }
 
 unsigned long __init
 lmb_end_of_DRAM(void)
 {
-	struct lmb *_lmb = &lmb;
-	struct lmb_region *_mem = &(_lmb->memory);
-	int idx = _mem->cnt - 1;
-
-#ifdef CONFIG_MSCHUNKS
-	return (_mem->region[idx].physbase + _mem->region[idx].size);
-#else
-	return (_mem->region[idx].base + _mem->region[idx].size);
-#endif /* CONFIG_MSCHUNKS */
-
-	return 0;
-}
-
-unsigned long __init
-lmb_abs_to_phys(unsigned long aa)
-{
-	unsigned long i, pa = aa;
-	struct lmb *_lmb = &lmb;
-	struct lmb_region *_mem = &(_lmb->memory);
-
-	for (i=0; i < _mem->cnt; i++) {
-		unsigned long lmbbase = _mem->region[i].base;
-		unsigned long lmbsize = _mem->region[i].size;
-		if ( lmb_addrs_overlap(aa,1,lmbbase,lmbsize) ) {
-			pa = _mem->region[i].physbase + (aa - lmbbase);
-			break;
-		}
-	}
+	int idx = lmb.memory.cnt - 1;
 
-	return pa;
+	return (lmb.memory.region[idx].base + lmb.memory.region[idx].size);
 }
 
 /*
@@ -353,20 +281,19 @@ void __init lmb_enforce_memory_limit(void)
 {
 	extern unsigned long memory_limit;
 	unsigned long i, limit;
-	struct lmb_region *mem = &(lmb.memory);
 
 	if (! memory_limit)
 		return;
 
 	limit = memory_limit;
-	for (i = 0; i < mem->cnt; i++) {
-		if (limit > mem->region[i].size) {
-			limit -= mem->region[i].size;
+	for (i = 0; i < lmb.memory.cnt; i++) {
+		if (limit > lmb.memory.region[i].size) {
+			limit -= lmb.memory.region[i].size;
 			continue;
 		}
 
-		mem->region[i].size = limit;
-		mem->cnt = i + 1;
+		lmb.memory.region[i].size = limit;
+		lmb.memory.cnt = i + 1;
 		break;
 	}
 }
diff --git a/arch/ppc64/kernel/lparcfg.c b/arch/ppc64/kernel/lparcfg.c
index 02e96627fa6..edad361a8db 100644
--- a/arch/ppc64/kernel/lparcfg.c
+++ b/arch/ppc64/kernel/lparcfg.c
@@ -29,7 +29,7 @@
 #include <asm/iSeries/HvLpConfig.h>
 #include <asm/lppaca.h>
 #include <asm/hvcall.h>
-#include <asm/cputable.h>
+#include <asm/firmware.h>
 #include <asm/rtas.h>
 #include <asm/system.h>
 #include <asm/time.h>
@@ -273,6 +273,7 @@ static void parse_system_parameter_string(struct seq_file *m)
 		if (!workbuffer) {
 			printk(KERN_ERR "%s %s kmalloc failure at line %d \n",
 			       __FILE__, __FUNCTION__, __LINE__);
+			kfree(local_buffer);			
 			return;
 		}
 #ifdef LPARCFG_DEBUG
@@ -377,7 +378,7 @@ static int lparcfg_data(struct seq_file *m, void *v)
 
 	partition_active_processors = lparcfg_count_active_processors();
 
-	if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR) {
+	if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
 		unsigned long h_entitled, h_unallocated;
 		unsigned long h_aggregation, h_resource;
 		unsigned long pool_idle_time, pool_procs;
@@ -571,7 +572,7 @@ int __init lparcfg_init(void)
 	mode_t mode = S_IRUSR;
 
 	/* Allow writing if we have FW_FEATURE_SPLPAR */
-	if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR) {
+	if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
 		lparcfg_fops.write = lparcfg_write;
 		mode |= S_IWUSR;
 	}
diff --git a/arch/ppc64/kernel/misc.S b/arch/ppc64/kernel/misc.S
index a05b50b738e..474df0a862b 100644
--- a/arch/ppc64/kernel/misc.S
+++ b/arch/ppc64/kernel/misc.S
@@ -680,6 +680,104 @@ _GLOBAL(kernel_thread)
 	ld	r30,-16(r1)
 	blr
 
+/*
+ * disable_kernel_fp()
+ * Disable the FPU.
+ */
+_GLOBAL(disable_kernel_fp)
+	mfmsr	r3
+	rldicl	r0,r3,(63-MSR_FP_LG),1
+	rldicl	r3,r0,(MSR_FP_LG+1),0
+	mtmsrd	r3			/* disable use of fpu now */
+	isync
+	blr
+
+/*
+ * giveup_fpu(tsk)
+ * Disable FP for the task given as the argument,
+ * and save the floating-point registers in its thread_struct.
+ * Enables the FPU for use in the kernel on return.
+ */
+_GLOBAL(giveup_fpu)
+	mfmsr	r5
+	ori	r5,r5,MSR_FP
+	mtmsrd	r5			/* enable use of fpu now */
+	isync
+	cmpdi	0,r3,0
+	beqlr-				/* if no previous owner, done */
+	addi	r3,r3,THREAD		/* want THREAD of task */
+	ld	r5,PT_REGS(r3)
+	cmpdi	0,r5,0
+	SAVE_32FPRS(0, r3)
+	mffs	fr0
+	stfd	fr0,THREAD_FPSCR(r3)
+	beq	1f
+	ld	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
+	li	r3,MSR_FP|MSR_FE0|MSR_FE1
+	andc	r4,r4,r3		/* disable FP for previous task */
+	std	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
+1:
+#ifndef CONFIG_SMP
+	li	r5,0
+	ld	r4,last_task_used_math@got(r2)
+	std	r5,0(r4)
+#endif /* CONFIG_SMP */
+	blr
+
+#ifdef CONFIG_ALTIVEC
+
+#if 0 /* this has no callers for now */
+/*
+ * disable_kernel_altivec()
+ * Disable the VMX.
+ */
+_GLOBAL(disable_kernel_altivec)
+	mfmsr	r3
+	rldicl	r0,r3,(63-MSR_VEC_LG),1
+	rldicl	r3,r0,(MSR_VEC_LG+1),0
+	mtmsrd	r3			/* disable use of VMX now */
+	isync
+	blr
+#endif /* 0 */
+
+/*
+ * giveup_altivec(tsk)
+ * Disable VMX for the task given as the argument,
+ * and save the vector registers in its thread_struct.
+ * Enables the VMX for use in the kernel on return.
+ */
+_GLOBAL(giveup_altivec)
+	mfmsr	r5
+	oris	r5,r5,MSR_VEC@h
+	mtmsrd	r5			/* enable use of VMX now */
+	isync
+	cmpdi	0,r3,0
+	beqlr-				/* if no previous owner, done */
+	addi	r3,r3,THREAD		/* want THREAD of task */
+	ld	r5,PT_REGS(r3)
+	cmpdi	0,r5,0
+	SAVE_32VRS(0,r4,r3)
+	mfvscr	vr0
+	li	r4,THREAD_VSCR
+	stvx	vr0,r4,r3
+	beq	1f
+	ld	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
+	lis	r3,MSR_VEC@h
+	andc	r4,r4,r3		/* disable FP for previous task */
+	std	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
+1:
+#ifndef CONFIG_SMP
+	li	r5,0
+	ld	r4,last_task_used_altivec@got(r2)
+	std	r5,0(r4)
+#endif /* CONFIG_SMP */
+	blr
+
+#endif /* CONFIG_ALTIVEC */
+
+_GLOBAL(__setup_cpu_power3)
+	blr
+
 /* kexec_wait(phys_cpu)
  *
  * wait for the flag to change, indicating this kernel is going away but
diff --git a/arch/ppc64/kernel/of_device.c b/arch/ppc64/kernel/of_device.c
index b80e81984ba..da580812ddf 100644
--- a/arch/ppc64/kernel/of_device.c
+++ b/arch/ppc64/kernel/of_device.c
@@ -236,7 +236,6 @@ void of_device_unregister(struct of_device *ofdev)
 struct of_device* of_platform_device_create(struct device_node *np, const char *bus_id)
 {
 	struct of_device *dev;
-	u32 *reg;
 
 	dev = kmalloc(sizeof(*dev), GFP_KERNEL);
 	if (!dev)
@@ -250,7 +249,6 @@ struct of_device* of_platform_device_create(struct device_node *np, const char *
 	dev->dev.bus = &of_platform_bus_type;
 	dev->dev.release = of_release_dev;
 
-	reg = (u32 *)get_property(np, "reg", NULL);
 	strlcpy(dev->dev.bus_id, bus_id, BUS_ID_SIZE);
 
 	if (of_device_register(dev) != 0) {
diff --git a/arch/ppc64/kernel/pSeries_iommu.c b/arch/ppc64/kernel/pSeries_iommu.c
index 69130522a87..9d5e1e7fc38 100644
--- a/arch/ppc64/kernel/pSeries_iommu.c
+++ b/arch/ppc64/kernel/pSeries_iommu.c
@@ -45,6 +45,7 @@
 #include <asm/plpar_wrappers.h>
 #include <asm/pSeries_reconfig.h>
 #include <asm/systemcfg.h>
+#include <asm/firmware.h>
 #include "pci.h"
 
 #define DBG(fmt...)
@@ -546,7 +547,7 @@ void iommu_init_early_pSeries(void)
 	}
 
 	if (systemcfg->platform & PLATFORM_LPAR) {
-		if (cur_cpu_spec->firmware_features & FW_FEATURE_MULTITCE) {
+		if (firmware_has_feature(FW_FEATURE_MULTITCE)) {
 			ppc_md.tce_build = tce_buildmulti_pSeriesLP;
 			ppc_md.tce_free	 = tce_freemulti_pSeriesLP;
 		} else {
diff --git a/arch/ppc64/kernel/pSeries_lpar.c b/arch/ppc64/kernel/pSeries_lpar.c
index 74dd144dcce..0a3ddc9227c 100644
--- a/arch/ppc64/kernel/pSeries_lpar.c
+++ b/arch/ppc64/kernel/pSeries_lpar.c
@@ -52,7 +52,6 @@ EXPORT_SYMBOL(plpar_hcall_4out);
 EXPORT_SYMBOL(plpar_hcall_norets);
 EXPORT_SYMBOL(plpar_hcall_8arg_2ret);
 
-extern void fw_feature_init(void);
 extern void pSeries_find_serial_port(void);
 
 
@@ -279,7 +278,6 @@ long pSeries_lpar_hpte_insert(unsigned long hpte_group,
 			      unsigned long va, unsigned long prpn,
 			      unsigned long vflags, unsigned long rflags)
 {
-	unsigned long arpn = physRpn_to_absRpn(prpn);
 	unsigned long lpar_rc;
 	unsigned long flags;
 	unsigned long slot;
@@ -290,7 +288,7 @@ long pSeries_lpar_hpte_insert(unsigned long hpte_group,
 	if (vflags & HPTE_V_LARGE)
 		hpte_v &= ~(1UL << HPTE_V_AVPN_SHIFT);
 
-	hpte_r = (arpn << HPTE_R_RPN_SHIFT) | rflags;
+	hpte_r = (prpn << HPTE_R_RPN_SHIFT) | rflags;
 
 	/* Now fill in the actual HPTE */
 	/* Set CEC cookie to 0         */
diff --git a/arch/ppc64/kernel/pSeries_setup.c b/arch/ppc64/kernel/pSeries_setup.c
index 5bec956e44a..f0f0630cf07 100644
--- a/arch/ppc64/kernel/pSeries_setup.c
+++ b/arch/ppc64/kernel/pSeries_setup.c
@@ -60,7 +60,8 @@
 #include <asm/nvram.h>
 #include <asm/plpar_wrappers.h>
 #include <asm/xics.h>
-#include <asm/cputable.h>
+#include <asm/firmware.h>
+#include <asm/pmc.h>
 
 #include "i8259.h"
 #include "mpic.h"
@@ -187,6 +188,21 @@ static void __init pSeries_setup_mpic(void)
 				  " MPIC     ");
 }
 
+static void pseries_lpar_enable_pmcs(void)
+{
+	unsigned long set, reset;
+
+	power4_enable_pmcs();
+
+	set = 1UL << 63;
+	reset = 0;
+	plpar_hcall_norets(H_PERFMON, set, reset);
+
+	/* instruct hypervisor to maintain PMCs */
+	if (firmware_has_feature(FW_FEATURE_SPLPAR))
+		get_paca()->lppaca.pmcregs_in_use = 1;
+}
+
 static void __init pSeries_setup_arch(void)
 {
 	/* Fixup ppc_md depending on the type of interrupt controller */
@@ -231,11 +247,9 @@ static void __init pSeries_setup_arch(void)
 
 	pSeries_nvram_init();
 
-	if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR)
-		vpa_init(boot_cpuid);
-
 	/* Choose an idle loop */
-	if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR) {
+	if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
+		vpa_init(boot_cpuid);
 		if (get_paca()->lppaca.shared_proc) {
 			printk(KERN_INFO "Using shared processor idle loop\n");
 			ppc_md.idle_loop = pseries_shared_idle;
@@ -247,6 +261,11 @@ static void __init pSeries_setup_arch(void)
 		printk(KERN_INFO "Using default idle loop\n");
 		ppc_md.idle_loop = default_idle;
 	}
+
+	if (systemcfg->platform & PLATFORM_LPAR)
+		ppc_md.enable_pmcs = pseries_lpar_enable_pmcs;
+	else
+		ppc_md.enable_pmcs = power4_enable_pmcs;
 }
 
 static int __init pSeries_init_panel(void)
@@ -260,11 +279,11 @@ static int __init pSeries_init_panel(void)
 arch_initcall(pSeries_init_panel);
 
 
-/* Build up the firmware_features bitmask field
+/* Build up the ppc64_firmware_features bitmask field
  * using contents of device-tree/ibm,hypertas-functions.
  * Ultimately this functionality may be moved into prom.c prom_init().
  */
-void __init fw_feature_init(void)
+static void __init fw_feature_init(void)
 {
 	struct device_node * dn;
 	char * hypertas;
@@ -272,7 +291,7 @@ void __init fw_feature_init(void)
 
 	DBG(" -> fw_feature_init()\n");
 
-	cur_cpu_spec->firmware_features = 0;
+	ppc64_firmware_features = 0;
 	dn = of_find_node_by_path("/rtas");
 	if (dn == NULL) {
 		printk(KERN_ERR "WARNING ! Cannot find RTAS in device-tree !\n");
@@ -288,7 +307,7 @@ void __init fw_feature_init(void)
 				if ((firmware_features_table[i].name) &&
 				    (strcmp(firmware_features_table[i].name,hypertas))==0) {
 					/* we have a match */
-					cur_cpu_spec->firmware_features |= 
+					ppc64_firmware_features |= 
 						(firmware_features_table[i].val);
 					break;
 				} 
@@ -302,7 +321,7 @@ void __init fw_feature_init(void)
 	of_node_put(dn);
  no_rtas:
 	printk(KERN_INFO "firmware_features = 0x%lx\n", 
-	       cur_cpu_spec->firmware_features);
+	       ppc64_firmware_features);
 
 	DBG(" <- fw_feature_init()\n");
 }
diff --git a/arch/ppc64/kernel/pSeries_smp.c b/arch/ppc64/kernel/pSeries_smp.c
index 62c55a12356..79c7f322366 100644
--- a/arch/ppc64/kernel/pSeries_smp.c
+++ b/arch/ppc64/kernel/pSeries_smp.c
@@ -41,6 +41,7 @@
 #include <asm/machdep.h>
 #include <asm/xics.h>
 #include <asm/cputable.h>
+#include <asm/firmware.h>
 #include <asm/system.h>
 #include <asm/rtas.h>
 #include <asm/plpar_wrappers.h>
@@ -326,7 +327,7 @@ static void __devinit smp_xics_setup_cpu(int cpu)
 	if (cpu != boot_cpuid)
 		xics_setup_cpu();
 
-	if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR)
+	if (firmware_has_feature(FW_FEATURE_SPLPAR))
 		vpa_init(cpu);
 
 	cpu_clear(cpu, of_spin_map);
diff --git a/arch/ppc64/kernel/pSeries_vio.c b/arch/ppc64/kernel/pSeries_vio.c
new file mode 100644
index 00000000000..e0ae06f58f8
--- /dev/null
+++ b/arch/ppc64/kernel/pSeries_vio.c
@@ -0,0 +1,273 @@
+/*
+ * IBM PowerPC pSeries Virtual I/O Infrastructure Support.
+ *
+ *    Copyright (c) 2003-2005 IBM Corp.
+ *     Dave Engebretsen engebret@us.ibm.com
+ *     Santiago Leon santil@us.ibm.com
+ *     Hollis Blanchard <hollisb@us.ibm.com>
+ *     Stephen Rothwell
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/kobject.h>
+#include <asm/iommu.h>
+#include <asm/dma.h>
+#include <asm/prom.h>
+#include <asm/vio.h>
+#include <asm/hvcall.h>
+
+extern struct subsystem devices_subsys; /* needed for vio_find_name() */
+
+static void probe_bus_pseries(void)
+{
+	struct device_node *node_vroot, *of_node;
+
+	node_vroot = find_devices("vdevice");
+	if ((node_vroot == NULL) || (node_vroot->child == NULL))
+		/* this machine doesn't do virtual IO, and that's ok */
+		return;
+
+	/*
+	 * Create struct vio_devices for each virtual device in the device tree.
+	 * Drivers will associate with them later.
+	 */
+	for (of_node = node_vroot->child; of_node != NULL;
+			of_node = of_node->sibling) {
+		printk(KERN_DEBUG "%s: processing %p\n", __FUNCTION__, of_node);
+		vio_register_device_node(of_node);
+	}
+}
+
+/**
+ * vio_match_device_pseries: - Tell if a pSeries VIO device matches a
+ *	vio_device_id
+ */
+static int vio_match_device_pseries(const struct vio_device_id *id,
+		const struct vio_dev *dev)
+{
+	return (strncmp(dev->type, id->type, strlen(id->type)) == 0) &&
+			device_is_compatible(dev->dev.platform_data, id->compat);
+}
+
+static void vio_release_device_pseries(struct device *dev)
+{
+	/* XXX free TCE table */
+	of_node_put(dev->platform_data);
+}
+
+static ssize_t viodev_show_devspec(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct device_node *of_node = dev->platform_data;
+
+	return sprintf(buf, "%s\n", of_node->full_name);
+}
+DEVICE_ATTR(devspec, S_IRUSR | S_IRGRP | S_IROTH, viodev_show_devspec, NULL);
+
+static void vio_unregister_device_pseries(struct vio_dev *viodev)
+{
+	device_remove_file(&viodev->dev, &dev_attr_devspec);
+}
+
+static struct vio_bus_ops vio_bus_ops_pseries = {
+	.match = vio_match_device_pseries,
+	.unregister_device = vio_unregister_device_pseries,
+	.release_device = vio_release_device_pseries,
+};
+
+/**
+ * vio_bus_init_pseries: - Initialize the pSeries virtual IO bus
+ */
+static int __init vio_bus_init_pseries(void)
+{
+	int err;
+
+	err = vio_bus_init(&vio_bus_ops_pseries);
+	if (err == 0)
+		probe_bus_pseries();
+	return err;
+}
+
+__initcall(vio_bus_init_pseries);
+
+/**
+ * vio_build_iommu_table: - gets the dma information from OF and
+ *	builds the TCE tree.
+ * @dev: the virtual device.
+ *
+ * Returns a pointer to the built tce tree, or NULL if it can't
+ * find property.
+*/
+static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev)
+{
+	unsigned int *dma_window;
+	struct iommu_table *newTceTable;
+	unsigned long offset;
+	int dma_window_property_size;
+
+	dma_window = (unsigned int *) get_property(dev->dev.platform_data, "ibm,my-dma-window", &dma_window_property_size);
+	if(!dma_window) {
+		return NULL;
+	}
+
+	newTceTable = (struct iommu_table *) kmalloc(sizeof(struct iommu_table), GFP_KERNEL);
+
+	/*  There should be some code to extract the phys-encoded offset
+		using prom_n_addr_cells(). However, according to a comment
+		on earlier versions, it's always zero, so we don't bother */
+	offset = dma_window[1] >>  PAGE_SHIFT;
+
+	/* TCE table size - measured in tce entries */
+	newTceTable->it_size		= dma_window[4] >> PAGE_SHIFT;
+	/* offset for VIO should always be 0 */
+	newTceTable->it_offset		= offset;
+	newTceTable->it_busno		= 0;
+	newTceTable->it_index		= (unsigned long)dma_window[0];
+	newTceTable->it_type		= TCE_VB;
+
+	return iommu_init_table(newTceTable);
+}
+
+/**
+ * vio_register_device_node: - Register a new vio device.
+ * @of_node:	The OF node for this device.
+ *
+ * Creates and initializes a vio_dev structure from the data in
+ * of_node (dev.platform_data) and adds it to the list of virtual devices.
+ * Returns a pointer to the created vio_dev or NULL if node has
+ * NULL device_type or compatible fields.
+ */
+struct vio_dev * __devinit vio_register_device_node(struct device_node *of_node)
+{
+	struct vio_dev *viodev;
+	unsigned int *unit_address;
+	unsigned int *irq_p;
+
+	/* we need the 'device_type' property, in order to match with drivers */
+	if ((NULL == of_node->type)) {
+		printk(KERN_WARNING
+			"%s: node %s missing 'device_type'\n", __FUNCTION__,
+			of_node->name ? of_node->name : "<unknown>");
+		return NULL;
+	}
+
+	unit_address = (unsigned int *)get_property(of_node, "reg", NULL);
+	if (!unit_address) {
+		printk(KERN_WARNING "%s: node %s missing 'reg'\n", __FUNCTION__,
+			of_node->name ? of_node->name : "<unknown>");
+		return NULL;
+	}
+
+	/* allocate a vio_dev for this node */
+	viodev = kmalloc(sizeof(struct vio_dev), GFP_KERNEL);
+	if (!viodev) {
+		return NULL;
+	}
+	memset(viodev, 0, sizeof(struct vio_dev));
+
+	viodev->dev.platform_data = of_node_get(of_node);
+
+	viodev->irq = NO_IRQ;
+	irq_p = (unsigned int *)get_property(of_node, "interrupts", NULL);
+	if (irq_p) {
+		int virq = virt_irq_create_mapping(*irq_p);
+		if (virq == NO_IRQ) {
+			printk(KERN_ERR "Unable to allocate interrupt "
+			       "number for %s\n", of_node->full_name);
+		} else
+			viodev->irq = irq_offset_up(virq);
+	}
+
+	snprintf(viodev->dev.bus_id, BUS_ID_SIZE, "%x", *unit_address);
+	viodev->name = of_node->name;
+	viodev->type = of_node->type;
+	viodev->unit_address = *unit_address;
+	viodev->iommu_table = vio_build_iommu_table(viodev);
+
+	/* register with generic device framework */
+	if (vio_register_device(viodev) == NULL) {
+		/* XXX free TCE table */
+		kfree(viodev);
+		return NULL;
+	}
+	device_create_file(&viodev->dev, &dev_attr_devspec);
+
+	return viodev;
+}
+EXPORT_SYMBOL(vio_register_device_node);
+
+/**
+ * vio_get_attribute: - get attribute for virtual device
+ * @vdev:	The vio device to get property.
+ * @which:	The property/attribute to be extracted.
+ * @length:	Pointer to length of returned data size (unused if NULL).
+ *
+ * Calls prom.c's get_property() to return the value of the
+ * attribute specified by the preprocessor constant @which
+*/
+const void * vio_get_attribute(struct vio_dev *vdev, void* which, int* length)
+{
+	return get_property(vdev->dev.platform_data, (char*)which, length);
+}
+EXPORT_SYMBOL(vio_get_attribute);
+
+/* vio_find_name() - internal because only vio.c knows how we formatted the
+ * kobject name
+ * XXX once vio_bus_type.devices is actually used as a kset in
+ * drivers/base/bus.c, this function should be removed in favor of
+ * "device_find(kobj_name, &vio_bus_type)"
+ */
+static struct vio_dev *vio_find_name(const char *kobj_name)
+{
+	struct kobject *found;
+
+	found = kset_find_obj(&devices_subsys.kset, kobj_name);
+	if (!found)
+		return NULL;
+
+	return to_vio_dev(container_of(found, struct device, kobj));
+}
+
+/**
+ * vio_find_node - find an already-registered vio_dev
+ * @vnode: device_node of the virtual device we're looking for
+ */
+struct vio_dev *vio_find_node(struct device_node *vnode)
+{
+	uint32_t *unit_address;
+	char kobj_name[BUS_ID_SIZE];
+
+	/* construct the kobject name from the device node */
+	unit_address = (uint32_t *)get_property(vnode, "reg", NULL);
+	if (!unit_address)
+		return NULL;
+	snprintf(kobj_name, BUS_ID_SIZE, "%x", *unit_address);
+
+	return vio_find_name(kobj_name);
+}
+EXPORT_SYMBOL(vio_find_node);
+
+int vio_enable_interrupts(struct vio_dev *dev)
+{
+	int rc = h_vio_signal(dev->unit_address, VIO_IRQ_ENABLE);
+	if (rc != H_Success)
+		printk(KERN_ERR "vio: Error 0x%x enabling interrupts\n", rc);
+	return rc;
+}
+EXPORT_SYMBOL(vio_enable_interrupts);
+
+int vio_disable_interrupts(struct vio_dev *dev)
+{
+	int rc = h_vio_signal(dev->unit_address, VIO_IRQ_DISABLE);
+	if (rc != H_Success)
+		printk(KERN_ERR "vio: Error 0x%x disabling interrupts\n", rc);
+	return rc;
+}
+EXPORT_SYMBOL(vio_disable_interrupts);
diff --git a/arch/ppc64/kernel/pacaData.c b/arch/ppc64/kernel/pacaData.c
index 6316188737b..6182a2cd90a 100644
--- a/arch/ppc64/kernel/pacaData.c
+++ b/arch/ppc64/kernel/pacaData.c
@@ -78,7 +78,7 @@ extern unsigned long __toc_start;
 
 #define BOOTCPU_PACA_INIT(number)					    \
 {									    \
-	PACA_INIT_COMMON(number, 1, 0, STAB0_VIRT_ADDR)			    \
+	PACA_INIT_COMMON(number, 1, 0, (u64)&initial_stab)		    \
 	PACA_INIT_ISERIES(number)					    \
 }
 
@@ -90,7 +90,7 @@ extern unsigned long __toc_start;
 
 #define BOOTCPU_PACA_INIT(number)					    \
 {									    \
-	PACA_INIT_COMMON(number, 1, STAB0_PHYS_ADDR, STAB0_VIRT_ADDR)	    \
+	PACA_INIT_COMMON(number, 1, STAB0_PHYS_ADDR, (u64)&initial_stab)    \
 }
 #endif
 
diff --git a/arch/ppc64/kernel/pmac_setup.c b/arch/ppc64/kernel/pmac_setup.c
index e40877fa67c..8ff86a766cd 100644
--- a/arch/ppc64/kernel/pmac_setup.c
+++ b/arch/ppc64/kernel/pmac_setup.c
@@ -71,6 +71,7 @@
 #include <asm/of_device.h>
 #include <asm/lmb.h>
 #include <asm/smu.h>
+#include <asm/pmc.h>
 
 #include "pmac.h"
 #include "mpic.h"
@@ -511,4 +512,5 @@ struct machdep_calls __initdata pmac_md = {
 	.progress		= pmac_progress,
 	.check_legacy_ioport	= pmac_check_legacy_ioport,
 	.idle_loop		= native_idle,
+	.enable_pmcs		= power4_enable_pmcs,
 };
diff --git a/arch/ppc64/kernel/pmc.c b/arch/ppc64/kernel/pmc.c
index 67be773f9c0..cdfec7438d0 100644
--- a/arch/ppc64/kernel/pmc.c
+++ b/arch/ppc64/kernel/pmc.c
@@ -65,3 +65,24 @@ void release_pmc_hardware(void)
 	spin_unlock(&pmc_owner_lock);
 }
 EXPORT_SYMBOL_GPL(release_pmc_hardware);
+
+void power4_enable_pmcs(void)
+{
+	unsigned long hid0;
+
+	hid0 = mfspr(HID0);
+	hid0 |= 1UL << (63 - 20);
+
+	/* POWER4 requires the following sequence */
+	asm volatile(
+		"sync\n"
+		"mtspr     %1, %0\n"
+		"mfspr     %0, %1\n"
+		"mfspr     %0, %1\n"
+		"mfspr     %0, %1\n"
+		"mfspr     %0, %1\n"
+		"mfspr     %0, %1\n"
+		"mfspr     %0, %1\n"
+		"isync" : "=&r" (hid0) : "i" (HID0), "0" (hid0):
+		"memory");
+}
diff --git a/arch/ppc64/kernel/process.c b/arch/ppc64/kernel/process.c
index f7cae05e40f..7a7e027653a 100644
--- a/arch/ppc64/kernel/process.c
+++ b/arch/ppc64/kernel/process.c
@@ -50,6 +50,7 @@
 #include <asm/machdep.h>
 #include <asm/iSeries/HvCallHpt.h>
 #include <asm/cputable.h>
+#include <asm/firmware.h>
 #include <asm/sections.h>
 #include <asm/tlbflush.h>
 #include <asm/time.h>
@@ -202,11 +203,10 @@ struct task_struct *__switch_to(struct task_struct *prev,
 	new_thread = &new->thread;
 	old_thread = &current->thread;
 
-/* Collect purr utilization data per process and per processor wise */
-/* purr is nothing but processor time base                          */
-
-#if defined(CONFIG_PPC_PSERIES)
-	if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR) {
+	/* Collect purr utilization data per process and per processor
+	 * wise purr is nothing but processor time base
+	 */
+	if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
 		struct cpu_usage *cu = &__get_cpu_var(cpu_usage_array);
 		long unsigned start_tb, current_tb;
 		start_tb = old_thread->start_tb;
@@ -214,8 +214,6 @@ struct task_struct *__switch_to(struct task_struct *prev,
 		old_thread->accum_tb += (current_tb - start_tb);
 		new_thread->start_tb = current_tb;
 	}
-#endif
-
 
 	local_irq_save(flags);
 	last = _switch(old_thread, new_thread);
diff --git a/arch/ppc64/kernel/prom.c b/arch/ppc64/kernel/prom.c
index 5aca01ddd81..b2184882679 100644
--- a/arch/ppc64/kernel/prom.c
+++ b/arch/ppc64/kernel/prom.c
@@ -625,8 +625,8 @@ void __init finish_device_tree(void)
 
 static inline char *find_flat_dt_string(u32 offset)
 {
-	return ((char *)initial_boot_params) + initial_boot_params->off_dt_strings
-		+ offset;
+	return ((char *)initial_boot_params) +
+		initial_boot_params->off_dt_strings + offset;
 }
 
 /**
@@ -635,26 +635,33 @@ static inline char *find_flat_dt_string(u32 offset)
  * unflatten the tree
  */
 static int __init scan_flat_dt(int (*it)(unsigned long node,
-					 const char *full_path, void *data),
+					 const char *uname, int depth,
+					 void *data),
 			       void *data)
 {
 	unsigned long p = ((unsigned long)initial_boot_params) +
 		initial_boot_params->off_dt_struct;
 	int rc = 0;
+	int depth = -1;
 
 	do {
 		u32 tag = *((u32 *)p);
 		char *pathp;
 		
 		p += 4;
-		if (tag == OF_DT_END_NODE)
+		if (tag == OF_DT_END_NODE) {
+			depth --;
+			continue;
+		}
+		if (tag == OF_DT_NOP)
 			continue;
 		if (tag == OF_DT_END)
 			break;
 		if (tag == OF_DT_PROP) {
 			u32 sz = *((u32 *)p);
 			p += 8;
-			p = _ALIGN(p, sz >= 8 ? 8 : 4);
+			if (initial_boot_params->version < 0x10)
+				p = _ALIGN(p, sz >= 8 ? 8 : 4);
 			p += sz;
 			p = _ALIGN(p, 4);
 			continue;
@@ -664,9 +671,18 @@ static int __init scan_flat_dt(int (*it)(unsigned long node,
 			       " device tree !\n", tag);
 			return -EINVAL;
 		}
+		depth++;
 		pathp = (char *)p;
 		p = _ALIGN(p + strlen(pathp) + 1, 4);
-		rc = it(p, pathp, data);
+		if ((*pathp) == '/') {
+			char *lp, *np;
+			for (lp = NULL, np = pathp; *np; np++)
+				if ((*np) == '/')
+					lp = np+1;
+			if (lp != NULL)
+				pathp = lp;
+		}
+		rc = it(p, pathp, depth, data);
 		if (rc != 0)
 			break;		
 	} while(1);
@@ -689,17 +705,21 @@ static void* __init get_flat_dt_prop(unsigned long node, const char *name,
 		const char *nstr;
 
 		p += 4;
+		if (tag == OF_DT_NOP)
+			continue;
 		if (tag != OF_DT_PROP)
 			return NULL;
 
 		sz = *((u32 *)p);
 		noff = *((u32 *)(p + 4));
 		p += 8;
-		p = _ALIGN(p, sz >= 8 ? 8 : 4);
+		if (initial_boot_params->version < 0x10)
+			p = _ALIGN(p, sz >= 8 ? 8 : 4);
 
 		nstr = find_flat_dt_string(noff);
 		if (nstr == NULL) {
-			printk(KERN_WARNING "Can't find property index name !\n");
+			printk(KERN_WARNING "Can't find property index"
+			       " name !\n");
 			return NULL;
 		}
 		if (strcmp(name, nstr) == 0) {
@@ -713,7 +733,7 @@ static void* __init get_flat_dt_prop(unsigned long node, const char *name,
 }
 
 static void *__init unflatten_dt_alloc(unsigned long *mem, unsigned long size,
-					       unsigned long align)
+				       unsigned long align)
 {
 	void *res;
 
@@ -727,13 +747,16 @@ static void *__init unflatten_dt_alloc(unsigned long *mem, unsigned long size,
 static unsigned long __init unflatten_dt_node(unsigned long mem,
 					      unsigned long *p,
 					      struct device_node *dad,
-					      struct device_node ***allnextpp)
+					      struct device_node ***allnextpp,
+					      unsigned long fpsize)
 {
 	struct device_node *np;
 	struct property *pp, **prev_pp = NULL;
 	char *pathp;
 	u32 tag;
-	unsigned int l;
+	unsigned int l, allocl;
+	int has_name = 0;
+	int new_format = 0;
 
 	tag = *((u32 *)(*p));
 	if (tag != OF_DT_BEGIN_NODE) {
@@ -742,21 +765,62 @@ static unsigned long __init unflatten_dt_node(unsigned long mem,
 	}
 	*p += 4;
 	pathp = (char *)*p;
-	l = strlen(pathp) + 1;
+	l = allocl = strlen(pathp) + 1;
 	*p = _ALIGN(*p + l, 4);
 
-	np = unflatten_dt_alloc(&mem, sizeof(struct device_node) + l,
+	/* version 0x10 has a more compact unit name here instead of the full
+	 * path. we accumulate the full path size using "fpsize", we'll rebuild
+	 * it later. We detect this because the first character of the name is
+	 * not '/'.
+	 */
+	if ((*pathp) != '/') {
+		new_format = 1;
+		if (fpsize == 0) {
+			/* root node: special case. fpsize accounts for path
+			 * plus terminating zero. root node only has '/', so
+			 * fpsize should be 2, but we want to avoid the first
+			 * level nodes to have two '/' so we use fpsize 1 here
+			 */
+			fpsize = 1;
+			allocl = 2;
+		} else {
+			/* account for '/' and path size minus terminal 0
+			 * already in 'l'
+			 */
+			fpsize += l;
+			allocl = fpsize;
+		}
+	}
+
+
+	np = unflatten_dt_alloc(&mem, sizeof(struct device_node) + allocl,
 				__alignof__(struct device_node));
 	if (allnextpp) {
 		memset(np, 0, sizeof(*np));
 		np->full_name = ((char*)np) + sizeof(struct device_node);
-		memcpy(np->full_name, pathp, l);
+		if (new_format) {
+			char *p = np->full_name;
+			/* rebuild full path for new format */
+			if (dad && dad->parent) {
+				strcpy(p, dad->full_name);
+#ifdef DEBUG
+				if ((strlen(p) + l + 1) != allocl) {
+					DBG("%s: p: %d, l: %d, a: %d\n",
+					    pathp, strlen(p), l, allocl);
+				}
+#endif
+				p += strlen(p);
+			}
+			*(p++) = '/';
+			memcpy(p, pathp, l);
+		} else
+			memcpy(np->full_name, pathp, l);
 		prev_pp = &np->properties;
 		**allnextpp = np;
 		*allnextpp = &np->allnext;
 		if (dad != NULL) {
 			np->parent = dad;
-			/* we temporarily use the `next' field as `last_child'. */
+			/* we temporarily use the next field as `last_child'*/
 			if (dad->next == 0)
 				dad->child = np;
 			else
@@ -770,18 +834,26 @@ static unsigned long __init unflatten_dt_node(unsigned long mem,
 		char *pname;
 
 		tag = *((u32 *)(*p));
+		if (tag == OF_DT_NOP) {
+			*p += 4;
+			continue;
+		}
 		if (tag != OF_DT_PROP)
 			break;
 		*p += 4;
 		sz = *((u32 *)(*p));
 		noff = *((u32 *)((*p) + 4));
-		*p = _ALIGN((*p) + 8, sz >= 8 ? 8 : 4);
+		*p += 8;
+		if (initial_boot_params->version < 0x10)
+			*p = _ALIGN(*p, sz >= 8 ? 8 : 4);
 
 		pname = find_flat_dt_string(noff);
 		if (pname == NULL) {
 			printk("Can't find property name in list !\n");
 			break;
 		}
+		if (strcmp(pname, "name") == 0)
+			has_name = 1;
 		l = strlen(pname) + 1;
 		pp = unflatten_dt_alloc(&mem, sizeof(struct property),
 					__alignof__(struct property));
@@ -801,6 +873,36 @@ static unsigned long __init unflatten_dt_node(unsigned long mem,
 		}
 		*p = _ALIGN((*p) + sz, 4);
 	}
+	/* with version 0x10 we may not have the name property, recreate
+	 * it here from the unit name if absent
+	 */
+	if (!has_name) {
+		char *p = pathp, *ps = pathp, *pa = NULL;
+		int sz;
+
+		while (*p) {
+			if ((*p) == '@')
+				pa = p;
+			if ((*p) == '/')
+				ps = p + 1;
+			p++;
+		}
+		if (pa < ps)
+			pa = p;
+		sz = (pa - ps) + 1;
+		pp = unflatten_dt_alloc(&mem, sizeof(struct property) + sz,
+					__alignof__(struct property));
+		if (allnextpp) {
+			pp->name = "name";
+			pp->length = sz;
+			pp->value = (unsigned char *)(pp + 1);
+			*prev_pp = pp;
+			prev_pp = &pp->next;
+			memcpy(pp->value, ps, sz - 1);
+			((char *)pp->value)[sz - 1] = 0;
+			DBG("fixed up name for %s -> %s\n", pathp, pp->value);
+		}
+	}
 	if (allnextpp) {
 		*prev_pp = NULL;
 		np->name = get_property(np, "name", NULL);
@@ -812,11 +914,11 @@ static unsigned long __init unflatten_dt_node(unsigned long mem,
 			np->type = "<NULL>";
 	}
 	while (tag == OF_DT_BEGIN_NODE) {
-		mem = unflatten_dt_node(mem, p, np, allnextpp);
+		mem = unflatten_dt_node(mem, p, np, allnextpp, fpsize);
 		tag = *((u32 *)(*p));
 	}
 	if (tag != OF_DT_END_NODE) {
-		printk("Weird tag at start of node: %x\n", tag);
+		printk("Weird tag at end of node: %x\n", tag);
 		return mem;
 	}
 	*p += 4;
@@ -842,21 +944,32 @@ void __init unflatten_device_tree(void)
 	/* First pass, scan for size */
 	start = ((unsigned long)initial_boot_params) +
 		initial_boot_params->off_dt_struct;
-	size = unflatten_dt_node(0, &start, NULL, NULL);
+	size = unflatten_dt_node(0, &start, NULL, NULL, 0);
+	size = (size | 3) + 1;
 
 	DBG("  size is %lx, allocating...\n", size);
 
 	/* Allocate memory for the expanded device tree */
-	mem = (unsigned long)abs_to_virt(lmb_alloc(size,
-						   __alignof__(struct device_node)));
+	mem = lmb_alloc(size + 4, __alignof__(struct device_node));
+	if (!mem) {
+		DBG("Couldn't allocate memory with lmb_alloc()!\n");
+		panic("Couldn't allocate memory with lmb_alloc()!\n");
+	}
+	mem = (unsigned long)abs_to_virt(mem);
+
+	((u32 *)mem)[size / 4] = 0xdeadbeef;
+
 	DBG("  unflattening...\n", mem);
 
 	/* Second pass, do actual unflattening */
 	start = ((unsigned long)initial_boot_params) +
 		initial_boot_params->off_dt_struct;
-	unflatten_dt_node(mem, &start, NULL, &allnextp);
+	unflatten_dt_node(mem, &start, NULL, &allnextp, 0);
 	if (*((u32 *)start) != OF_DT_END)
-		printk(KERN_WARNING "Weird tag at end of tree: %x\n", *((u32 *)start));
+		printk(KERN_WARNING "Weird tag at end of tree: %08x\n", *((u32 *)start));
+	if (((u32 *)mem)[size / 4] != 0xdeadbeef)
+		printk(KERN_WARNING "End of tree marker overwritten: %08x\n",
+		       ((u32 *)mem)[size / 4] );
 	*allnextp = NULL;
 
 	/* Get pointer to OF "/chosen" node for use everywhere */
@@ -880,7 +993,7 @@ void __init unflatten_device_tree(void)
 
 
 static int __init early_init_dt_scan_cpus(unsigned long node,
-					  const char *full_path, void *data)
+					  const char *uname, int depth, void *data)
 {
 	char *type = get_flat_dt_prop(node, "device_type", NULL);
 	u32 *prop;
@@ -947,13 +1060,15 @@ static int __init early_init_dt_scan_cpus(unsigned long node,
 }
 
 static int __init early_init_dt_scan_chosen(unsigned long node,
-					    const char *full_path, void *data)
+					    const char *uname, int depth, void *data)
 {
 	u32 *prop;
 	u64 *prop64;
 	extern unsigned long memory_limit, tce_alloc_start, tce_alloc_end;
 
-	if (strcmp(full_path, "/chosen") != 0)
+	DBG("search \"chosen\", depth: %d, uname: %s\n", depth, uname);
+
+	if (depth != 1 || strcmp(uname, "chosen") != 0)
 		return 0;
 
 	/* get platform type */
@@ -1003,18 +1118,20 @@ static int __init early_init_dt_scan_chosen(unsigned long node,
 }
 
 static int __init early_init_dt_scan_root(unsigned long node,
-					  const char *full_path, void *data)
+					  const char *uname, int depth, void *data)
 {
 	u32 *prop;
 
-	if (strcmp(full_path, "/") != 0)
+	if (depth != 0)
 		return 0;
 
 	prop = (u32 *)get_flat_dt_prop(node, "#size-cells", NULL);
 	dt_root_size_cells = (prop == NULL) ? 1 : *prop;
-		
+	DBG("dt_root_size_cells = %x\n", dt_root_size_cells);
+
 	prop = (u32 *)get_flat_dt_prop(node, "#address-cells", NULL);
 	dt_root_addr_cells = (prop == NULL) ? 2 : *prop;
+	DBG("dt_root_addr_cells = %x\n", dt_root_addr_cells);
 	
 	/* break now */
 	return 1;
@@ -1042,7 +1159,7 @@ static unsigned long __init dt_mem_next_cell(int s, cell_t **cellp)
 
 
 static int __init early_init_dt_scan_memory(unsigned long node,
-					    const char *full_path, void *data)
+					    const char *uname, int depth, void *data)
 {
 	char *type = get_flat_dt_prop(node, "device_type", NULL);
 	cell_t *reg, *endp;
@@ -1058,7 +1175,9 @@ static int __init early_init_dt_scan_memory(unsigned long node,
 
 	endp = reg + (l / sizeof(cell_t));
 
-	DBG("memory scan node %s ...\n", full_path);
+	DBG("memory scan node %s ..., reg size %ld, data: %x %x %x %x, ...\n",
+	    uname, l, reg[0], reg[1], reg[2], reg[3]);
+
 	while ((endp - reg) >= (dt_root_addr_cells + dt_root_size_cells)) {
 		unsigned long base, size;
 
@@ -1469,10 +1588,11 @@ struct device_node *of_find_node_by_path(const char *path)
 	struct device_node *np = allnodes;
 
 	read_lock(&devtree_lock);
-	for (; np != 0; np = np->allnext)
+	for (; np != 0; np = np->allnext) {
 		if (np->full_name != 0 && strcasecmp(np->full_name, path) == 0
 		    && of_node_get(np))
 			break;
+	}
 	read_unlock(&devtree_lock);
 	return np;
 }
diff --git a/arch/ppc64/kernel/prom_init.c b/arch/ppc64/kernel/prom_init.c
index dbbe6c79d8d..122283a1d39 100644
--- a/arch/ppc64/kernel/prom_init.c
+++ b/arch/ppc64/kernel/prom_init.c
@@ -892,7 +892,10 @@ static void __init prom_init_mem(void)
 	if ( RELOC(of_platform) == PLATFORM_PSERIES_LPAR )
 		RELOC(alloc_top) = RELOC(rmo_top);
 	else
-		RELOC(alloc_top) = RELOC(rmo_top) = min(0x40000000ul, RELOC(ram_top));
+		/* Some RS64 machines have buggy firmware where claims up at 1GB
+		 * fails. Cap at 768MB as a workaround. Still plenty of room.
+		 */
+		RELOC(alloc_top) = RELOC(rmo_top) = min(0x30000000ul, RELOC(ram_top));
 
 	prom_printf("memory layout at init:\n");
 	prom_printf("  memory_limit : %x (16 MB aligned)\n", RELOC(prom_memory_limit));
@@ -1534,7 +1537,8 @@ static unsigned long __init dt_find_string(char *str)
  */
 #define MAX_PROPERTY_NAME 64
 
-static void __init scan_dt_build_strings(phandle node, unsigned long *mem_start,
+static void __init scan_dt_build_strings(phandle node,
+					 unsigned long *mem_start,
 					 unsigned long *mem_end)
 {
 	unsigned long offset = reloc_offset();
@@ -1547,16 +1551,21 @@ static void __init scan_dt_build_strings(phandle node, unsigned long *mem_start,
 	/* get and store all property names */
 	prev_name = RELOC("");
 	for (;;) {
-		int rc;
-
 		/* 64 is max len of name including nul. */
 		namep = make_room(mem_start, mem_end, MAX_PROPERTY_NAME, 1);
-		rc = call_prom("nextprop", 3, 1, node, prev_name, namep);
-		if (rc != 1) {
+		if (call_prom("nextprop", 3, 1, node, prev_name, namep) != 1) {
 			/* No more nodes: unwind alloc */
 			*mem_start = (unsigned long)namep;
 			break;
 		}
+
+ 		/* skip "name" */
+ 		if (strcmp(namep, RELOC("name")) == 0) {
+ 			*mem_start = (unsigned long)namep;
+ 			prev_name = RELOC("name");
+ 			continue;
+ 		}
+		/* get/create string entry */
 		soff = dt_find_string(namep);
 		if (soff != 0) {
 			*mem_start = (unsigned long)namep;
@@ -1571,7 +1580,7 @@ static void __init scan_dt_build_strings(phandle node, unsigned long *mem_start,
 
 	/* do all our children */
 	child = call_prom("child", 1, 1, node);
-	while (child != (phandle)0) {
+	while (child != 0) {
 		scan_dt_build_strings(child, mem_start, mem_end);
 		child = call_prom("peer", 1, 1, child);
 	}
@@ -1580,16 +1589,13 @@ static void __init scan_dt_build_strings(phandle node, unsigned long *mem_start,
 static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start,
 					unsigned long *mem_end)
 {
-	int l, align;
 	phandle child;
-	char *namep, *prev_name, *sstart, *p, *ep;
+	char *namep, *prev_name, *sstart, *p, *ep, *lp, *path;
 	unsigned long soff;
 	unsigned char *valp;
 	unsigned long offset = reloc_offset();
-	char pname[MAX_PROPERTY_NAME];
-	char *path;
-
-	path = RELOC(prom_scratch);
+	static char pname[MAX_PROPERTY_NAME];
+	int l;
 
 	dt_push_token(OF_DT_BEGIN_NODE, mem_start, mem_end);
 
@@ -1599,23 +1605,33 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start,
 		      namep, *mem_end - *mem_start);
 	if (l >= 0) {
 		/* Didn't fit?  Get more room. */
-		if (l+1 > *mem_end - *mem_start) {
+		if ((l+1) > (*mem_end - *mem_start)) {
 			namep = make_room(mem_start, mem_end, l+1, 1);
 			call_prom("package-to-path", 3, 1, node, namep, l);
 		}
 		namep[l] = '\0';
+
 		/* Fixup an Apple bug where they have bogus \0 chars in the
 		 * middle of the path in some properties
 		 */
 		for (p = namep, ep = namep + l; p < ep; p++)
 			if (*p == '\0') {
 				memmove(p, p+1, ep - p);
-				ep--; l--;
+				ep--; l--; p--;
 			}
-		*mem_start = _ALIGN(((unsigned long) namep) + strlen(namep) + 1, 4);
+
+		/* now try to extract the unit name in that mess */
+		for (p = namep, lp = NULL; *p; p++)
+			if (*p == '/')
+				lp = p + 1;
+		if (lp != NULL)
+			memmove(namep, lp, strlen(lp) + 1);
+		*mem_start = _ALIGN(((unsigned long) namep) +
+				    strlen(namep) + 1, 4);
 	}
 
 	/* get it again for debugging */
+	path = RELOC(prom_scratch);
 	memset(path, 0, PROM_SCRATCH_SIZE);
 	call_prom("package-to-path", 3, 1, node, path, PROM_SCRATCH_SIZE-1);
 
@@ -1623,23 +1639,27 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start,
 	prev_name = RELOC("");
 	sstart = (char *)RELOC(dt_string_start);
 	for (;;) {
-		int rc;
-
-		rc = call_prom("nextprop", 3, 1, node, prev_name, pname);
-		if (rc != 1)
+		if (call_prom("nextprop", 3, 1, node, prev_name,
+			      RELOC(pname)) != 1)
 			break;
 
+ 		/* skip "name" */
+ 		if (strcmp(RELOC(pname), RELOC("name")) == 0) {
+ 			prev_name = RELOC("name");
+ 			continue;
+ 		}
+
 		/* find string offset */
-		soff = dt_find_string(pname);
+		soff = dt_find_string(RELOC(pname));
 		if (soff == 0) {
-			prom_printf("WARNING: Can't find string index for <%s>, node %s\n",
-				    pname, path);
+			prom_printf("WARNING: Can't find string index for"
+				    " <%s>, node %s\n", RELOC(pname), path);
 			break;
 		}
 		prev_name = sstart + soff;
 
 		/* get length */
-		l = call_prom("getproplen", 2, 1, node, pname);
+		l = call_prom("getproplen", 2, 1, node, RELOC(pname));
 
 		/* sanity checks */
 		if (l == PROM_ERROR)
@@ -1648,7 +1668,7 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start,
 			prom_printf("WARNING: ignoring large property ");
 			/* It seems OF doesn't null-terminate the path :-( */
 			prom_printf("[%s] ", path);
-			prom_printf("%s length 0x%x\n", pname, l);
+			prom_printf("%s length 0x%x\n", RELOC(pname), l);
 			continue;
 		}
 
@@ -1658,17 +1678,16 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start,
 		dt_push_token(soff, mem_start, mem_end);
 
 		/* push property content */
-		align = (l >= 8) ? 8 : 4;
-		valp = make_room(mem_start, mem_end, l, align);
-		call_prom("getprop", 4, 1, node, pname, valp, l);
+		valp = make_room(mem_start, mem_end, l, 4);
+		call_prom("getprop", 4, 1, node, RELOC(pname), valp, l);
 		*mem_start = _ALIGN(*mem_start, 4);
 	}
 
 	/* Add a "linux,phandle" property. */
 	soff = dt_find_string(RELOC("linux,phandle"));
 	if (soff == 0)
-		prom_printf("WARNING: Can't find string index for <linux-phandle>"
-			    " node %s\n", path);
+		prom_printf("WARNING: Can't find string index for"
+			    " <linux-phandle> node %s\n", path);
 	else {
 		dt_push_token(OF_DT_PROP, mem_start, mem_end);
 		dt_push_token(4, mem_start, mem_end);
@@ -1679,7 +1698,7 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start,
 
 	/* do all our children */
 	child = call_prom("child", 1, 1, node);
-	while (child != (phandle)0) {
+	while (child != 0) {
 		scan_dt_build_struct(child, mem_start, mem_end);
 		child = call_prom("peer", 1, 1, child);
 	}
@@ -1718,7 +1737,8 @@ static void __init flatten_device_tree(void)
 
 	/* Build header and make room for mem rsv map */ 
 	mem_start = _ALIGN(mem_start, 4);
-	hdr = make_room(&mem_start, &mem_end, sizeof(struct boot_param_header), 4);
+	hdr = make_room(&mem_start, &mem_end,
+			sizeof(struct boot_param_header), 4);
 	RELOC(dt_header_start) = (unsigned long)hdr;
 	rsvmap = make_room(&mem_start, &mem_end, sizeof(mem_reserve_map), 8);
 
@@ -1731,11 +1751,11 @@ static void __init flatten_device_tree(void)
 	namep = make_room(&mem_start, &mem_end, 16, 1);
 	strcpy(namep, RELOC("linux,phandle"));
 	mem_start = (unsigned long)namep + strlen(namep) + 1;
-	RELOC(dt_string_end) = mem_start;
 
 	/* Build string array */
 	prom_printf("Building dt strings...\n"); 
 	scan_dt_build_strings(root, &mem_start, &mem_end);
+	RELOC(dt_string_end) = mem_start;
 
 	/* Build structure */
 	mem_start = PAGE_ALIGN(mem_start);
@@ -1750,9 +1770,11 @@ static void __init flatten_device_tree(void)
 	hdr->totalsize = RELOC(dt_struct_end) - RELOC(dt_header_start);
 	hdr->off_dt_struct = RELOC(dt_struct_start) - RELOC(dt_header_start);
 	hdr->off_dt_strings = RELOC(dt_string_start) - RELOC(dt_header_start);
+	hdr->dt_strings_size = RELOC(dt_string_end) - RELOC(dt_string_start);
 	hdr->off_mem_rsvmap = ((unsigned long)rsvmap) - RELOC(dt_header_start);
 	hdr->version = OF_DT_VERSION;
-	hdr->last_comp_version = 1;
+	/* Version 16 is not backward compatible */
+	hdr->last_comp_version = 0x10;
 
 	/* Reserve the whole thing and copy the reserve map in, we
 	 * also bump mem_reserve_cnt to cause further reservations to
@@ -1808,6 +1830,9 @@ static void __init fixup_device_tree(void)
 	/* does it need fixup ? */
 	if (prom_getproplen(i2c, "interrupts") > 0)
 		return;
+
+	prom_printf("fixing up bogus interrupts for u3 i2c...\n");
+
 	/* interrupt on this revision of u3 is number 0 and level */
 	interrupts[0] = 0;
 	interrupts[1] = 1;
diff --git a/arch/ppc64/kernel/rtas_pci.c b/arch/ppc64/kernel/rtas_pci.c
index 1048817befb..1dccadaddd1 100644
--- a/arch/ppc64/kernel/rtas_pci.c
+++ b/arch/ppc64/kernel/rtas_pci.c
@@ -58,6 +58,21 @@ static int config_access_valid(struct device_node *dn, int where)
 	return 0;
 }
 
+static int of_device_available(struct device_node * dn)
+{
+        char * status;
+
+        status = get_property(dn, "status", NULL);
+
+        if (!status)
+                return 1;
+
+        if (!strcmp(status, "okay"))
+                return 1;
+
+        return 0;
+}
+
 static int rtas_read_config(struct device_node *dn, int where, int size, u32 *val)
 {
 	int returnval = -1;
@@ -103,7 +118,7 @@ static int rtas_pci_read_config(struct pci_bus *bus,
 
 	/* Search only direct children of the bus */
 	for (dn = busdn->child; dn; dn = dn->sibling)
-		if (dn->devfn == devfn)
+		if (dn->devfn == devfn && of_device_available(dn))
 			return rtas_read_config(dn, where, size, val);
 	return PCIBIOS_DEVICE_NOT_FOUND;
 }
@@ -146,7 +161,7 @@ static int rtas_pci_write_config(struct pci_bus *bus,
 
 	/* Search only direct children of the bus */
 	for (dn = busdn->child; dn; dn = dn->sibling)
-		if (dn->devfn == devfn)
+		if (dn->devfn == devfn && of_device_available(dn))
 			return rtas_write_config(dn, where, size, val);
 	return PCIBIOS_DEVICE_NOT_FOUND;
 }
diff --git a/arch/ppc64/kernel/setup.c b/arch/ppc64/kernel/setup.c
index e9c24d2dbd9..ee3b20de2e7 100644
--- a/arch/ppc64/kernel/setup.c
+++ b/arch/ppc64/kernel/setup.c
@@ -536,15 +536,19 @@ static void __init check_for_initrd(void)
 
 	DBG(" -> check_for_initrd()\n");
 
-	prop = (u64 *)get_property(of_chosen, "linux,initrd-start", NULL);
-	if (prop != NULL) {
-		initrd_start = (unsigned long)__va(*prop);
-		prop = (u64 *)get_property(of_chosen, "linux,initrd-end", NULL);
+	if (of_chosen) {
+		prop = (u64 *)get_property(of_chosen,
+				"linux,initrd-start", NULL);
 		if (prop != NULL) {
-			initrd_end = (unsigned long)__va(*prop);
-			initrd_below_start_ok = 1;
-		} else
-			initrd_start = 0;
+			initrd_start = (unsigned long)__va(*prop);
+			prop = (u64 *)get_property(of_chosen,
+					"linux,initrd-end", NULL);
+			if (prop != NULL) {
+				initrd_end = (unsigned long)__va(*prop);
+				initrd_below_start_ok = 1;
+			} else
+				initrd_start = 0;
+		}
 	}
 
 	/* If we were passed an initrd, set the ROOT_DEV properly if the values
@@ -627,7 +631,7 @@ void __init setup_system(void)
 	 * Initialize xmon
 	 */
 #ifdef CONFIG_XMON_DEFAULT
-	xmon_init();
+	xmon_init(1);
 #endif
 	/*
 	 * Register early console
@@ -1343,11 +1347,13 @@ static int __init early_xmon(char *p)
 	/* ensure xmon is enabled */
 	if (p) {
 		if (strncmp(p, "on", 2) == 0)
-			xmon_init();
+			xmon_init(1);
+		if (strncmp(p, "off", 3) == 0)
+			xmon_init(0);
 		if (strncmp(p, "early", 5) != 0)
 			return 0;
 	}
-	xmon_init();
+	xmon_init(1);
 	debugger(NULL);
 
 	return 0;
diff --git a/arch/ppc64/kernel/sysfs.c b/arch/ppc64/kernel/sysfs.c
index 02b8ac4e016..f311ee7c007 100644
--- a/arch/ppc64/kernel/sysfs.c
+++ b/arch/ppc64/kernel/sysfs.c
@@ -13,6 +13,7 @@
 #include <asm/current.h>
 #include <asm/processor.h>
 #include <asm/cputable.h>
+#include <asm/firmware.h>
 #include <asm/hvcall.h>
 #include <asm/prom.h>
 #include <asm/systemcfg.h>
@@ -100,6 +101,8 @@ static int __init setup_smt_snooze_delay(char *str)
 }
 __setup("smt-snooze-delay=", setup_smt_snooze_delay);
 
+#endif /* CONFIG_PPC_MULTIPLATFORM */
+
 /*
  * Enabling PMCs will slow partition context switch times so we only do
  * it the first time we write to the PMCs.
@@ -109,65 +112,15 @@ static DEFINE_PER_CPU(char, pmcs_enabled);
 
 void ppc64_enable_pmcs(void)
 {
-	unsigned long hid0;
-#ifdef CONFIG_PPC_PSERIES
-	unsigned long set, reset;
-#endif /* CONFIG_PPC_PSERIES */
-
 	/* Only need to enable them once */
 	if (__get_cpu_var(pmcs_enabled))
 		return;
 
 	__get_cpu_var(pmcs_enabled) = 1;
 
-	switch (systemcfg->platform) {
-	case PLATFORM_PSERIES:
-	case PLATFORM_POWERMAC:
-		hid0 = mfspr(HID0);
-		hid0 |= 1UL << (63 - 20);
-
-		/* POWER4 requires the following sequence */
-		asm volatile(
-			     "sync\n"
-			     "mtspr	%1, %0\n"
-			     "mfspr	%0, %1\n"
-			     "mfspr	%0, %1\n"
-			     "mfspr	%0, %1\n"
-			     "mfspr	%0, %1\n"
-			     "mfspr	%0, %1\n"
-			     "mfspr	%0, %1\n"
-			     "isync" : "=&r" (hid0) : "i" (HID0), "0" (hid0):
-			     "memory");
-		break;
-
-#ifdef CONFIG_PPC_PSERIES
-	case PLATFORM_PSERIES_LPAR:
-		set = 1UL << 63;
-		reset = 0;
-		plpar_hcall_norets(H_PERFMON, set, reset);
-		break;
-#endif /* CONFIG_PPC_PSERIES */
-
-	default:
-		break;
-	}
-
-#ifdef CONFIG_PPC_PSERIES
-	/* instruct hypervisor to maintain PMCs */
-	if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR)
-		get_paca()->lppaca.pmcregs_in_use = 1;
-#endif /* CONFIG_PPC_PSERIES */
+	if (ppc_md.enable_pmcs)
+		ppc_md.enable_pmcs();
 }
-
-#else
-
-/* PMC stuff */
-void ppc64_enable_pmcs(void)
-{
-	/* XXX Implement for iseries */
-}
-#endif /* CONFIG_PPC_MULTIPLATFORM */
-
 EXPORT_SYMBOL(ppc64_enable_pmcs);
 
 /* XXX convert to rusty's on_one_cpu */
diff --git a/arch/ppc64/kernel/time.c b/arch/ppc64/kernel/time.c
index 909462e1ade..1696e1b05bb 100644
--- a/arch/ppc64/kernel/time.c
+++ b/arch/ppc64/kernel/time.c
@@ -67,6 +67,7 @@
 #include <asm/prom.h>
 #include <asm/sections.h>
 #include <asm/systemcfg.h>
+#include <asm/firmware.h>
 
 u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
 
@@ -370,13 +371,11 @@ int timer_interrupt(struct pt_regs * regs)
 		process_hvlpevents(regs);
 #endif
 
-/* collect purr register values often, for accurate calculations */
-#if defined(CONFIG_PPC_PSERIES)
-	if (cur_cpu_spec->firmware_features & FW_FEATURE_SPLPAR) {
+	/* collect purr register values often, for accurate calculations */
+	if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
 		struct cpu_usage *cu = &__get_cpu_var(cpu_usage_array);
 		cu->current_tb = mfspr(SPRN_PURR);
 	}
-#endif
 
 	irq_exit();
 
diff --git a/arch/ppc64/kernel/vio.c b/arch/ppc64/kernel/vio.c
index 0c0ba71ac0e..c90e1dd875c 100644
--- a/arch/ppc64/kernel/vio.c
+++ b/arch/ppc64/kernel/vio.c
@@ -1,10 +1,11 @@
 /*
  * IBM PowerPC Virtual I/O Infrastructure Support.
  *
- *    Copyright (c) 2003 IBM Corp.
+ *    Copyright (c) 2003-2005 IBM Corp.
  *     Dave Engebretsen engebret@us.ibm.com
  *     Santiago Leon santil@us.ibm.com
  *     Hollis Blanchard <hollisb@us.ibm.com>
+ *     Stephen Rothwell
  *
  *      This program is free software; you can redistribute it and/or
  *      modify it under the terms of the GNU General Public License
@@ -14,61 +15,30 @@
 
 #include <linux/init.h>
 #include <linux/console.h>
-#include <linux/version.h>
 #include <linux/module.h>
-#include <linux/kobject.h>
 #include <linux/mm.h>
 #include <linux/dma-mapping.h>
-#include <asm/rtas.h>
 #include <asm/iommu.h>
 #include <asm/dma.h>
-#include <asm/ppcdebug.h>
 #include <asm/vio.h>
-#include <asm/hvcall.h>
-#include <asm/iSeries/vio.h>
-#include <asm/iSeries/HvTypes.h>
-#include <asm/iSeries/HvCallXm.h>
-#include <asm/iSeries/HvLpConfig.h>
-
-#define DBGENTER() pr_debug("%s entered\n", __FUNCTION__)
-
-extern struct subsystem devices_subsys; /* needed for vio_find_name() */
 
 static const struct vio_device_id *vio_match_device(
 		const struct vio_device_id *, const struct vio_dev *);
 
-#ifdef CONFIG_PPC_PSERIES
-static struct iommu_table *vio_build_iommu_table(struct vio_dev *);
-static int vio_num_address_cells;
-#endif
-#ifdef CONFIG_PPC_ISERIES
-static struct iommu_table veth_iommu_table;
-static struct iommu_table vio_iommu_table;
-#endif
-static struct vio_dev vio_bus_device  = { /* fake "parent" device */
+struct vio_dev vio_bus_device  = { /* fake "parent" device */
 	.name = vio_bus_device.dev.bus_id,
 	.type = "",
-#ifdef CONFIG_PPC_ISERIES
-	.iommu_table = &vio_iommu_table,
-#endif
 	.dev.bus_id = "vio",
 	.dev.bus = &vio_bus_type,
 };
 
-#ifdef CONFIG_PPC_ISERIES
-static struct vio_dev *__init vio_register_device_iseries(char *type,
-		uint32_t unit_num);
-
-struct device *iSeries_vio_dev = &vio_bus_device.dev;
-EXPORT_SYMBOL(iSeries_vio_dev);
-
-#define device_is_compatible(a, b)	1
+static struct vio_bus_ops vio_bus_ops;
 
-#endif
-
-/* convert from struct device to struct vio_dev and pass to driver.
+/*
+ * Convert from struct device to struct vio_dev and pass to driver.
  * dev->driver has already been set by generic code because vio_bus_match
- * succeeded. */
+ * succeeded.
+ */
 static int vio_bus_probe(struct device *dev)
 {
 	struct vio_dev *viodev = to_vio_dev(dev);
@@ -76,15 +46,12 @@ static int vio_bus_probe(struct device *dev)
 	const struct vio_device_id *id;
 	int error = -ENODEV;
 
-	DBGENTER();
-
 	if (!viodrv->probe)
 		return error;
 
 	id = vio_match_device(viodrv->id_table, viodev);
-	if (id) {
+	if (id)
 		error = viodrv->probe(viodev, id);
-	}
 
 	return error;
 }
@@ -95,11 +62,8 @@ static int vio_bus_remove(struct device *dev)
 	struct vio_dev *viodev = to_vio_dev(dev);
 	struct vio_driver *viodrv = to_vio_driver(dev->driver);
 
-	DBGENTER();
-
-	if (viodrv->remove) {
+	if (viodrv->remove)
 		return viodrv->remove(viodev);
-	}
 
 	/* driver can't remove */
 	return 1;
@@ -135,193 +99,72 @@ void vio_unregister_driver(struct vio_driver *viodrv)
 EXPORT_SYMBOL(vio_unregister_driver);
 
 /**
- * vio_match_device: - Tell if a VIO device has a matching VIO device id structure.
- * @ids: 	array of VIO device id structures to search in
- * @dev: 	the VIO device structure to match against
+ * vio_match_device: - Tell if a VIO device has a matching
+ *			VIO device id structure.
+ * @ids:	array of VIO device id structures to search in
+ * @dev:	the VIO device structure to match against
  *
  * Used by a driver to check whether a VIO device present in the
  * system is in its list of supported devices. Returns the matching
  * vio_device_id structure or NULL if there is no match.
  */
-static const struct vio_device_id * vio_match_device(const struct vio_device_id *ids,
-	const struct vio_dev *dev)
+static const struct vio_device_id *vio_match_device(
+		const struct vio_device_id *ids, const struct vio_dev *dev)
 {
-	DBGENTER();
-
-	while (ids->type) {
-		if ((strncmp(dev->type, ids->type, strlen(ids->type)) == 0) &&
-			device_is_compatible(dev->dev.platform_data, ids->compat))
+	while (ids->type[0] != '\0') {
+		if (vio_bus_ops.match(ids, dev))
 			return ids;
 		ids++;
 	}
 	return NULL;
 }
 
-#ifdef CONFIG_PPC_ISERIES
-void __init iommu_vio_init(void)
-{
-	struct iommu_table *t;
-	struct iommu_table_cb cb;
-	unsigned long cbp;
-	unsigned long itc_entries;
-
-	cb.itc_busno = 255;    /* Bus 255 is the virtual bus */
-	cb.itc_virtbus = 0xff; /* Ask for virtual bus */
-
-	cbp = virt_to_abs(&cb);
-	HvCallXm_getTceTableParms(cbp);
-
-	itc_entries = cb.itc_size * PAGE_SIZE / sizeof(union tce_entry);
-	veth_iommu_table.it_size        = itc_entries / 2;
-	veth_iommu_table.it_busno       = cb.itc_busno;
-	veth_iommu_table.it_offset      = cb.itc_offset;
-	veth_iommu_table.it_index       = cb.itc_index;
-	veth_iommu_table.it_type        = TCE_VB;
-	veth_iommu_table.it_blocksize	= 1;
-
-	t = iommu_init_table(&veth_iommu_table);
-
-	if (!t)
-		printk("Virtual Bus VETH TCE table failed.\n");
-
-	vio_iommu_table.it_size         = itc_entries - veth_iommu_table.it_size;
-	vio_iommu_table.it_busno        = cb.itc_busno;
-	vio_iommu_table.it_offset       = cb.itc_offset +
-					  veth_iommu_table.it_size;
-	vio_iommu_table.it_index        = cb.itc_index;
-	vio_iommu_table.it_type         = TCE_VB;
-	vio_iommu_table.it_blocksize	= 1;
-
-	t = iommu_init_table(&vio_iommu_table);
-
-	if (!t)
-		printk("Virtual Bus VIO TCE table failed.\n");
-}
-#endif
-
-#ifdef CONFIG_PPC_PSERIES
-static void probe_bus_pseries(void)
-{
-	struct device_node *node_vroot, *of_node;
-
-	node_vroot = find_devices("vdevice");
-	if ((node_vroot == NULL) || (node_vroot->child == NULL))
-		/* this machine doesn't do virtual IO, and that's ok */
-		return;
-
-	vio_num_address_cells = prom_n_addr_cells(node_vroot->child);
-
-	/*
-	 * Create struct vio_devices for each virtual device in the device tree.
-	 * Drivers will associate with them later.
-	 */
-	for (of_node = node_vroot->child; of_node != NULL;
-			of_node = of_node->sibling) {
-		printk(KERN_DEBUG "%s: processing %p\n", __FUNCTION__, of_node);
-		vio_register_device_node(of_node);
-	}
-}
-#endif
-
-#ifdef CONFIG_PPC_ISERIES
-static void probe_bus_iseries(void)
-{
-	HvLpIndexMap vlan_map = HvLpConfig_getVirtualLanIndexMap();
-	struct vio_dev *viodev;
-	int i;
-
-	/* there is only one of each of these */
-	vio_register_device_iseries("viocons", 0);
-	vio_register_device_iseries("vscsi", 0);
-
-	vlan_map = HvLpConfig_getVirtualLanIndexMap();
-	for (i = 0; i < HVMAXARCHITECTEDVIRTUALLANS; i++) {
-		if ((vlan_map & (0x8000 >> i)) == 0)
-			continue;
-		viodev = vio_register_device_iseries("vlan", i);
-		/* veth is special and has it own iommu_table */
-		viodev->iommu_table = &veth_iommu_table;
-	}
-	for (i = 0; i < HVMAXARCHITECTEDVIRTUALDISKS; i++)
-		vio_register_device_iseries("viodasd", i);
-	for (i = 0; i < HVMAXARCHITECTEDVIRTUALCDROMS; i++)
-		vio_register_device_iseries("viocd", i);
-	for (i = 0; i < HVMAXARCHITECTEDVIRTUALTAPES; i++)
-		vio_register_device_iseries("viotape", i);
-}
-#endif
-
 /**
  * vio_bus_init: - Initialize the virtual IO bus
  */
-static int __init vio_bus_init(void)
+int __init vio_bus_init(struct vio_bus_ops *ops)
 {
 	int err;
 
+	vio_bus_ops = *ops;
+
 	err = bus_register(&vio_bus_type);
 	if (err) {
 		printk(KERN_ERR "failed to register VIO bus\n");
 		return err;
 	}
 
-	/* the fake parent of all vio devices, just to give us a nice directory */
+	/*
+	 * The fake parent of all vio devices, just to give us
+	 * a nice directory
+	 */
 	err = device_register(&vio_bus_device.dev);
 	if (err) {
-		printk(KERN_WARNING "%s: device_register returned %i\n", __FUNCTION__,
-			err);
+		printk(KERN_WARNING "%s: device_register returned %i\n",
+				__FUNCTION__, err);
 		return err;
 	}
 
-#ifdef CONFIG_PPC_PSERIES
-	probe_bus_pseries();
-#endif
-#ifdef CONFIG_PPC_ISERIES
-	probe_bus_iseries();
-#endif
-
 	return 0;
 }
 
-__initcall(vio_bus_init);
-
 /* vio_dev refcount hit 0 */
 static void __devinit vio_dev_release(struct device *dev)
 {
-	DBGENTER();
-
-#ifdef CONFIG_PPC_PSERIES
-	/* XXX free TCE table */
-	of_node_put(dev->platform_data);
-#endif
+	if (vio_bus_ops.release_device)
+		vio_bus_ops.release_device(dev);
 	kfree(to_vio_dev(dev));
 }
 
-#ifdef CONFIG_PPC_PSERIES
-static ssize_t viodev_show_devspec(struct device *dev, struct device_attribute *attr, char *buf)
-{
-	struct device_node *of_node = dev->platform_data;
-
-	return sprintf(buf, "%s\n", of_node->full_name);
-}
-DEVICE_ATTR(devspec, S_IRUSR | S_IRGRP | S_IROTH, viodev_show_devspec, NULL);
-#endif
-
-static ssize_t viodev_show_name(struct device *dev, struct device_attribute *attr, char *buf)
+static ssize_t viodev_show_name(struct device *dev,
+		struct device_attribute *attr, char *buf)
 {
 	return sprintf(buf, "%s\n", to_vio_dev(dev)->name);
 }
 DEVICE_ATTR(name, S_IRUSR | S_IRGRP | S_IROTH, viodev_show_name, NULL);
 
-static struct vio_dev * __devinit vio_register_device_common(
-		struct vio_dev *viodev, char *name, char *type,
-		uint32_t unit_address, struct iommu_table *iommu_table)
+struct vio_dev * __devinit vio_register_device(struct vio_dev *viodev)
 {
-	DBGENTER();
-
-	viodev->name = name;
-	viodev->type = type;
-	viodev->unit_address = unit_address;
-	viodev->iommu_table = iommu_table;
 	/* init generic 'struct device' fields: */
 	viodev->dev.parent = &vio_bus_device.dev;
 	viodev->dev.bus = &vio_bus_type;
@@ -338,222 +181,15 @@ static struct vio_dev * __devinit vio_register_device_common(
 	return viodev;
 }
 
-#ifdef CONFIG_PPC_PSERIES
-/**
- * vio_register_device_node: - Register a new vio device.
- * @of_node:	The OF node for this device.
- *
- * Creates and initializes a vio_dev structure from the data in
- * of_node (dev.platform_data) and adds it to the list of virtual devices.
- * Returns a pointer to the created vio_dev or NULL if node has
- * NULL device_type or compatible fields.
- */
-struct vio_dev * __devinit vio_register_device_node(struct device_node *of_node)
-{
-	struct vio_dev *viodev;
-	unsigned int *unit_address;
-	unsigned int *irq_p;
-
-	DBGENTER();
-
-	/* we need the 'device_type' property, in order to match with drivers */
-	if ((NULL == of_node->type)) {
-		printk(KERN_WARNING
-			"%s: node %s missing 'device_type'\n", __FUNCTION__,
-			of_node->name ? of_node->name : "<unknown>");
-		return NULL;
-	}
-
-	unit_address = (unsigned int *)get_property(of_node, "reg", NULL);
-	if (!unit_address) {
-		printk(KERN_WARNING "%s: node %s missing 'reg'\n", __FUNCTION__,
-			of_node->name ? of_node->name : "<unknown>");
-		return NULL;
-	}
-
-	/* allocate a vio_dev for this node */
-	viodev = kmalloc(sizeof(struct vio_dev), GFP_KERNEL);
-	if (!viodev) {
-		return NULL;
-	}
-	memset(viodev, 0, sizeof(struct vio_dev));
-
-	viodev->dev.platform_data = of_node_get(of_node);
-
-	viodev->irq = NO_IRQ;
-	irq_p = (unsigned int *)get_property(of_node, "interrupts", NULL);
-	if (irq_p) {
-		int virq = virt_irq_create_mapping(*irq_p);
-		if (virq == NO_IRQ) {
-			printk(KERN_ERR "Unable to allocate interrupt "
-			       "number for %s\n", of_node->full_name);
-		} else
-			viodev->irq = irq_offset_up(virq);
-	}
-
-	snprintf(viodev->dev.bus_id, BUS_ID_SIZE, "%x", *unit_address);
-
-	/* register with generic device framework */
-	if (vio_register_device_common(viodev, of_node->name, of_node->type,
-				*unit_address, vio_build_iommu_table(viodev))
-			== NULL) {
-		/* XXX free TCE table */
-		kfree(viodev);
-		return NULL;
-	}
-	device_create_file(&viodev->dev, &dev_attr_devspec);
-
-	return viodev;
-}
-EXPORT_SYMBOL(vio_register_device_node);
-#endif
-
-#ifdef CONFIG_PPC_ISERIES
-/**
- * vio_register_device: - Register a new vio device.
- * @voidev:	The device to register.
- */
-static struct vio_dev *__init vio_register_device_iseries(char *type,
-		uint32_t unit_num)
-{
-	struct vio_dev *viodev;
-
-	DBGENTER();
-
-	/* allocate a vio_dev for this node */
-	viodev = kmalloc(sizeof(struct vio_dev), GFP_KERNEL);
-	if (!viodev)
-		return NULL;
-	memset(viodev, 0, sizeof(struct vio_dev));
-
-	snprintf(viodev->dev.bus_id, BUS_ID_SIZE, "%s%d", type, unit_num);
-
-	return vio_register_device_common(viodev, viodev->dev.bus_id, type,
-			unit_num, &vio_iommu_table);
-}
-#endif
-
 void __devinit vio_unregister_device(struct vio_dev *viodev)
 {
-	DBGENTER();
-#ifdef CONFIG_PPC_PSERIES
-	device_remove_file(&viodev->dev, &dev_attr_devspec);
-#endif
+	if (vio_bus_ops.unregister_device)
+		vio_bus_ops.unregister_device(viodev);
 	device_remove_file(&viodev->dev, &dev_attr_name);
 	device_unregister(&viodev->dev);
 }
 EXPORT_SYMBOL(vio_unregister_device);
 
-#ifdef CONFIG_PPC_PSERIES
-/**
- * vio_get_attribute: - get attribute for virtual device
- * @vdev:	The vio device to get property.
- * @which:	The property/attribute to be extracted.
- * @length:	Pointer to length of returned data size (unused if NULL).
- *
- * Calls prom.c's get_property() to return the value of the
- * attribute specified by the preprocessor constant @which
-*/
-const void * vio_get_attribute(struct vio_dev *vdev, void* which, int* length)
-{
-	return get_property(vdev->dev.platform_data, (char*)which, length);
-}
-EXPORT_SYMBOL(vio_get_attribute);
-
-/* vio_find_name() - internal because only vio.c knows how we formatted the
- * kobject name
- * XXX once vio_bus_type.devices is actually used as a kset in
- * drivers/base/bus.c, this function should be removed in favor of
- * "device_find(kobj_name, &vio_bus_type)"
- */
-static struct vio_dev *vio_find_name(const char *kobj_name)
-{
-	struct kobject *found;
-
-	found = kset_find_obj(&devices_subsys.kset, kobj_name);
-	if (!found)
-		return NULL;
-
-	return to_vio_dev(container_of(found, struct device, kobj));
-}
-
-/**
- * vio_find_node - find an already-registered vio_dev
- * @vnode: device_node of the virtual device we're looking for
- */
-struct vio_dev *vio_find_node(struct device_node *vnode)
-{
-	uint32_t *unit_address;
-	char kobj_name[BUS_ID_SIZE];
-
-	/* construct the kobject name from the device node */
-	unit_address = (uint32_t *)get_property(vnode, "reg", NULL);
-	if (!unit_address)
-		return NULL;
-	snprintf(kobj_name, BUS_ID_SIZE, "%x", *unit_address);
-
-	return vio_find_name(kobj_name);
-}
-EXPORT_SYMBOL(vio_find_node);
-
-/**
- * vio_build_iommu_table: - gets the dma information from OF and builds the TCE tree.
- * @dev: the virtual device.
- *
- * Returns a pointer to the built tce tree, or NULL if it can't
- * find property.
-*/
-static struct iommu_table * vio_build_iommu_table(struct vio_dev *dev)
-{
-	unsigned int *dma_window;
-	struct iommu_table *newTceTable;
-	unsigned long offset;
-	int dma_window_property_size;
-
-	dma_window = (unsigned int *) get_property(dev->dev.platform_data, "ibm,my-dma-window", &dma_window_property_size);
-	if(!dma_window) {
-		return NULL;
-	}
-
-	newTceTable = (struct iommu_table *) kmalloc(sizeof(struct iommu_table), GFP_KERNEL);
-
-	/*  There should be some code to extract the phys-encoded offset
-		using prom_n_addr_cells(). However, according to a comment
-		on earlier versions, it's always zero, so we don't bother */
-	offset = dma_window[1] >>  PAGE_SHIFT;
-
-	/* TCE table size - measured in tce entries */
-	newTceTable->it_size		= dma_window[4] >> PAGE_SHIFT;
-	/* offset for VIO should always be 0 */
-	newTceTable->it_offset		= offset;
-	newTceTable->it_busno		= 0;
-	newTceTable->it_index		= (unsigned long)dma_window[0];
-	newTceTable->it_type		= TCE_VB;
-
-	return iommu_init_table(newTceTable);
-}
-
-int vio_enable_interrupts(struct vio_dev *dev)
-{
-	int rc = h_vio_signal(dev->unit_address, VIO_IRQ_ENABLE);
-	if (rc != H_Success) {
-		printk(KERN_ERR "vio: Error 0x%x enabling interrupts\n", rc);
-	}
-	return rc;
-}
-EXPORT_SYMBOL(vio_enable_interrupts);
-
-int vio_disable_interrupts(struct vio_dev *dev)
-{
-	int rc = h_vio_signal(dev->unit_address, VIO_IRQ_DISABLE);
-	if (rc != H_Success) {
-		printk(KERN_ERR "vio: Error 0x%x disabling interrupts\n", rc);
-	}
-	return rc;
-}
-EXPORT_SYMBOL(vio_disable_interrupts);
-#endif
-
 static dma_addr_t vio_map_single(struct device *dev, void *vaddr,
 			  size_t size, enum dma_data_direction direction)
 {
@@ -615,18 +251,8 @@ static int vio_bus_match(struct device *dev, struct device_driver *drv)
 	const struct vio_dev *vio_dev = to_vio_dev(dev);
 	struct vio_driver *vio_drv = to_vio_driver(drv);
 	const struct vio_device_id *ids = vio_drv->id_table;
-	const struct vio_device_id *found_id;
-
-	DBGENTER();
 
-	if (!ids)
-		return 0;
-
-	found_id = vio_match_device(ids, vio_dev);
-	if (found_id)
-		return 1;
-
-	return 0;
+	return (ids != NULL) && (vio_match_device(ids, vio_dev) != NULL);
 }
 
 struct bus_type vio_bus_type = {
diff --git a/arch/ppc64/mm/hash_low.S b/arch/ppc64/mm/hash_low.S
index fbff24827ae..35eb49e1b89 100644
--- a/arch/ppc64/mm/hash_low.S
+++ b/arch/ppc64/mm/hash_low.S
@@ -129,12 +129,10 @@ _GLOBAL(__hash_page)
 	 * code rather than call a C function...) 
 	 */
 BEGIN_FTR_SECTION
-BEGIN_FTR_SECTION
 	mr	r4,r30
 	mr	r5,r7
 	bl	.hash_page_do_lazy_icache
-END_FTR_SECTION_IFSET(CPU_FTR_NOEXECUTE)
-END_FTR_SECTION_IFCLR(CPU_FTR_COHERENT_ICACHE)
+END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE)
 
 	/* At this point, r3 contains new PP bits, save them in
 	 * place of "access" in the param area (sic)
diff --git a/arch/ppc64/mm/hash_native.c b/arch/ppc64/mm/hash_native.c
index a6abd3a979b..7626bb59954 100644
--- a/arch/ppc64/mm/hash_native.c
+++ b/arch/ppc64/mm/hash_native.c
@@ -51,7 +51,6 @@ long native_hpte_insert(unsigned long hpte_group, unsigned long va,
 			unsigned long prpn, unsigned long vflags,
 			unsigned long rflags)
 {
-	unsigned long arpn = physRpn_to_absRpn(prpn);
 	hpte_t *hptep = htab_address + hpte_group;
 	unsigned long hpte_v, hpte_r;
 	int i;
@@ -74,7 +73,7 @@ long native_hpte_insert(unsigned long hpte_group, unsigned long va,
 	hpte_v = (va >> 23) << HPTE_V_AVPN_SHIFT | vflags | HPTE_V_VALID;
 	if (vflags & HPTE_V_LARGE)
 		va &= ~(1UL << HPTE_V_AVPN_SHIFT);
-	hpte_r = (arpn << HPTE_R_RPN_SHIFT) | rflags;
+	hpte_r = (prpn << HPTE_R_RPN_SHIFT) | rflags;
 
 	hptep->r = hpte_r;
 	/* Guarantee the second dword is visible before the valid bit */
diff --git a/arch/ppc64/mm/hash_utils.c b/arch/ppc64/mm/hash_utils.c
index 623b5d130c3..09475c8edf7 100644
--- a/arch/ppc64/mm/hash_utils.c
+++ b/arch/ppc64/mm/hash_utils.c
@@ -210,7 +210,7 @@ void __init htab_initialize(void)
 
 	/* create bolted the linear mapping in the hash table */
 	for (i=0; i < lmb.memory.cnt; i++) {
-		base = lmb.memory.region[i].physbase + KERNELBASE;
+		base = lmb.memory.region[i].base + KERNELBASE;
 		size = lmb.memory.region[i].size;
 
 		DBG("creating mapping for region: %lx : %lx\n", base, size);
@@ -302,7 +302,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
 	int local = 0;
 	cpumask_t tmp;
 
-	if ((ea & ~REGION_MASK) > EADDR_MASK)
+	if ((ea & ~REGION_MASK) >= PGTABLE_RANGE)
 		return 1;
 
  	switch (REGION_ID(ea)) {
diff --git a/arch/ppc64/mm/hugetlbpage.c b/arch/ppc64/mm/hugetlbpage.c
index f9524602818..e7833c80eb6 100644
--- a/arch/ppc64/mm/hugetlbpage.c
+++ b/arch/ppc64/mm/hugetlbpage.c
@@ -27,124 +27,94 @@
 
 #include <linux/sysctl.h>
 
-#define	HUGEPGDIR_SHIFT		(HPAGE_SHIFT + PAGE_SHIFT - 3)
-#define HUGEPGDIR_SIZE		(1UL << HUGEPGDIR_SHIFT)
-#define HUGEPGDIR_MASK		(~(HUGEPGDIR_SIZE-1))
+#define NUM_LOW_AREAS	(0x100000000UL >> SID_SHIFT)
+#define NUM_HIGH_AREAS	(PGTABLE_RANGE >> HTLB_AREA_SHIFT)
 
-#define HUGEPTE_INDEX_SIZE	9
-#define HUGEPGD_INDEX_SIZE	10
-
-#define PTRS_PER_HUGEPTE	(1 << HUGEPTE_INDEX_SIZE)
-#define PTRS_PER_HUGEPGD	(1 << HUGEPGD_INDEX_SIZE)
-
-static inline int hugepgd_index(unsigned long addr)
-{
-	return (addr & ~REGION_MASK) >> HUGEPGDIR_SHIFT;
-}
-
-static pud_t *hugepgd_offset(struct mm_struct *mm, unsigned long addr)
+/* Modelled after find_linux_pte() */
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
-	int index;
+	pgd_t *pg;
+	pud_t *pu;
+	pmd_t *pm;
+	pte_t *pt;
 
-	if (! mm->context.huge_pgdir)
-		return NULL;
+	BUG_ON(! in_hugepage_area(mm->context, addr));
 
+	addr &= HPAGE_MASK;
+
+	pg = pgd_offset(mm, addr);
+	if (!pgd_none(*pg)) {
+		pu = pud_offset(pg, addr);
+		if (!pud_none(*pu)) {
+			pm = pmd_offset(pu, addr);
+			pt = (pte_t *)pm;
+			BUG_ON(!pmd_none(*pm)
+			       && !(pte_present(*pt) && pte_huge(*pt)));
+			return pt;
+		}
+	}
 
-	index = hugepgd_index(addr);
-	BUG_ON(index >= PTRS_PER_HUGEPGD);
-	return (pud_t *)(mm->context.huge_pgdir + index);
+	return NULL;
 }
 
-static inline pte_t *hugepte_offset(pud_t *dir, unsigned long addr)
+pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 {
-	int index;
-
-	if (pud_none(*dir))
-		return NULL;
+	pgd_t *pg;
+	pud_t *pu;
+	pmd_t *pm;
+	pte_t *pt;
 
-	index = (addr >> HPAGE_SHIFT) % PTRS_PER_HUGEPTE;
-	return (pte_t *)pud_page(*dir) + index;
-}
-
-static pud_t *hugepgd_alloc(struct mm_struct *mm, unsigned long addr)
-{
 	BUG_ON(! in_hugepage_area(mm->context, addr));
 
-	if (! mm->context.huge_pgdir) {
-		pgd_t *new;
-		spin_unlock(&mm->page_table_lock);
-		/* Don't use pgd_alloc(), because we want __GFP_REPEAT */
-		new = kmem_cache_alloc(zero_cache, GFP_KERNEL | __GFP_REPEAT);
-		BUG_ON(memcmp(new, empty_zero_page, PAGE_SIZE));
-		spin_lock(&mm->page_table_lock);
+	addr &= HPAGE_MASK;
 
-		/*
-		 * Because we dropped the lock, we should re-check the
-		 * entry, as somebody else could have populated it..
-		 */
-		if (mm->context.huge_pgdir)
-			pgd_free(new);
-		else
-			mm->context.huge_pgdir = new;
-	}
-	return hugepgd_offset(mm, addr);
-}
+	pg = pgd_offset(mm, addr);
+	pu = pud_alloc(mm, pg, addr);
 
-static pte_t *hugepte_alloc(struct mm_struct *mm, pud_t *dir, unsigned long addr)
-{
-	if (! pud_present(*dir)) {
-		pte_t *new;
-
-		spin_unlock(&mm->page_table_lock);
-		new = kmem_cache_alloc(zero_cache, GFP_KERNEL | __GFP_REPEAT);
-		BUG_ON(memcmp(new, empty_zero_page, PAGE_SIZE));
-		spin_lock(&mm->page_table_lock);
-		/*
-		 * Because we dropped the lock, we should re-check the
-		 * entry, as somebody else could have populated it..
-		 */
-		if (pud_present(*dir)) {
-			if (new)
-				kmem_cache_free(zero_cache, new);
-		} else {
-			struct page *ptepage;
-
-			if (! new)
-				return NULL;
-			ptepage = virt_to_page(new);
-			ptepage->mapping = (void *) mm;
-			ptepage->index = addr & HUGEPGDIR_MASK;
-			pud_populate(mm, dir, new);
+	if (pu) {
+		pm = pmd_alloc(mm, pu, addr);
+		if (pm) {
+			pt = (pte_t *)pm;
+			BUG_ON(!pmd_none(*pm)
+			       && !(pte_present(*pt) && pte_huge(*pt)));
+			return pt;
 		}
 	}
 
-	return hugepte_offset(dir, addr);
+	return NULL;
 }
 
-pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
-{
-	pud_t *pud;
+#define HUGEPTE_BATCH_SIZE	(HPAGE_SIZE / PMD_SIZE)
 
-	BUG_ON(! in_hugepage_area(mm->context, addr));
+void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
+		     pte_t *ptep, pte_t pte)
+{
+	int i;
 
-	pud = hugepgd_offset(mm, addr);
-	if (! pud)
-		return NULL;
+	if (pte_present(*ptep)) {
+		pte_clear(mm, addr, ptep);
+		flush_tlb_pending();
+	}
 
-	return hugepte_offset(pud, addr);
+	for (i = 0; i < HUGEPTE_BATCH_SIZE; i++) {
+		*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
+		ptep++;
+	}
 }
 
-pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
+pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
+			      pte_t *ptep)
 {
-	pud_t *pud;
+	unsigned long old = pte_update(ptep, ~0UL);
+	int i;
 
-	BUG_ON(! in_hugepage_area(mm->context, addr));
+	if (old & _PAGE_HASHPTE)
+		hpte_update(mm, addr, old, 0);
 
-	pud = hugepgd_alloc(mm, addr);
-	if (! pud)
-		return NULL;
+	for (i = 1; i < HUGEPTE_BATCH_SIZE; i++)
+		ptep[i] = __pte(0);
 
-	return hugepte_alloc(mm, pud, addr);
+	return __pte(old);
 }
 
 /*
@@ -162,15 +132,17 @@ int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
 	return 0;
 }
 
-static void flush_segments(void *parm)
+static void flush_low_segments(void *parm)
 {
-	u16 segs = (unsigned long) parm;
+	u16 areas = (unsigned long) parm;
 	unsigned long i;
 
 	asm volatile("isync" : : : "memory");
 
-	for (i = 0; i < 16; i++) {
-		if (! (segs & (1U << i)))
+	BUILD_BUG_ON((sizeof(areas)*8) != NUM_LOW_AREAS);
+
+	for (i = 0; i < NUM_LOW_AREAS; i++) {
+		if (! (areas & (1U << i)))
 			continue;
 		asm volatile("slbie %0" : : "r" (i << SID_SHIFT));
 	}
@@ -178,13 +150,33 @@ static void flush_segments(void *parm)
 	asm volatile("isync" : : : "memory");
 }
 
-static int prepare_low_seg_for_htlb(struct mm_struct *mm, unsigned long seg)
+static void flush_high_segments(void *parm)
 {
-	unsigned long start = seg << SID_SHIFT;
-	unsigned long end = (seg+1) << SID_SHIFT;
+	u16 areas = (unsigned long) parm;
+	unsigned long i, j;
+
+	asm volatile("isync" : : : "memory");
+
+	BUILD_BUG_ON((sizeof(areas)*8) != NUM_HIGH_AREAS);
+
+	for (i = 0; i < NUM_HIGH_AREAS; i++) {
+		if (! (areas & (1U << i)))
+			continue;
+		for (j = 0; j < (1UL << (HTLB_AREA_SHIFT-SID_SHIFT)); j++)
+			asm volatile("slbie %0"
+				     :: "r" ((i << HTLB_AREA_SHIFT) + (j << SID_SHIFT)));
+	}
+
+	asm volatile("isync" : : : "memory");
+}
+
+static int prepare_low_area_for_htlb(struct mm_struct *mm, unsigned long area)
+{
+	unsigned long start = area << SID_SHIFT;
+	unsigned long end = (area+1) << SID_SHIFT;
 	struct vm_area_struct *vma;
 
-	BUG_ON(seg >= 16);
+	BUG_ON(area >= NUM_LOW_AREAS);
 
 	/* Check no VMAs are in the region */
 	vma = find_vma(mm, start);
@@ -194,20 +186,39 @@ static int prepare_low_seg_for_htlb(struct mm_struct *mm, unsigned long seg)
 	return 0;
 }
 
-static int open_low_hpage_segs(struct mm_struct *mm, u16 newsegs)
+static int prepare_high_area_for_htlb(struct mm_struct *mm, unsigned long area)
+{
+	unsigned long start = area << HTLB_AREA_SHIFT;
+	unsigned long end = (area+1) << HTLB_AREA_SHIFT;
+	struct vm_area_struct *vma;
+
+	BUG_ON(area >= NUM_HIGH_AREAS);
+
+	/* Check no VMAs are in the region */
+	vma = find_vma(mm, start);
+	if (vma && (vma->vm_start < end))
+		return -EBUSY;
+
+	return 0;
+}
+
+static int open_low_hpage_areas(struct mm_struct *mm, u16 newareas)
 {
 	unsigned long i;
 
-	newsegs &= ~(mm->context.htlb_segs);
-	if (! newsegs)
+	BUILD_BUG_ON((sizeof(newareas)*8) != NUM_LOW_AREAS);
+	BUILD_BUG_ON((sizeof(mm->context.low_htlb_areas)*8) != NUM_LOW_AREAS);
+
+	newareas &= ~(mm->context.low_htlb_areas);
+	if (! newareas)
 		return 0; /* The segments we want are already open */
 
-	for (i = 0; i < 16; i++)
-		if ((1 << i) & newsegs)
-			if (prepare_low_seg_for_htlb(mm, i) != 0)
+	for (i = 0; i < NUM_LOW_AREAS; i++)
+		if ((1 << i) & newareas)
+			if (prepare_low_area_for_htlb(mm, i) != 0)
 				return -EBUSY;
 
-	mm->context.htlb_segs |= newsegs;
+	mm->context.low_htlb_areas |= newareas;
 
 	/* update the paca copy of the context struct */
 	get_paca()->context = mm->context;
@@ -215,29 +226,63 @@ static int open_low_hpage_segs(struct mm_struct *mm, u16 newsegs)
 	/* the context change must make it to memory before the flush,
 	 * so that further SLB misses do the right thing. */
 	mb();
-	on_each_cpu(flush_segments, (void *)(unsigned long)newsegs, 0, 1);
+	on_each_cpu(flush_low_segments, (void *)(unsigned long)newareas, 0, 1);
+
+	return 0;
+}
+
+static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas)
+{
+	unsigned long i;
+
+	BUILD_BUG_ON((sizeof(newareas)*8) != NUM_HIGH_AREAS);
+	BUILD_BUG_ON((sizeof(mm->context.high_htlb_areas)*8)
+		     != NUM_HIGH_AREAS);
+
+	newareas &= ~(mm->context.high_htlb_areas);
+	if (! newareas)
+		return 0; /* The areas we want are already open */
+
+	for (i = 0; i < NUM_HIGH_AREAS; i++)
+		if ((1 << i) & newareas)
+			if (prepare_high_area_for_htlb(mm, i) != 0)
+				return -EBUSY;
+
+	mm->context.high_htlb_areas |= newareas;
+
+	/* update the paca copy of the context struct */
+	get_paca()->context = mm->context;
+
+	/* the context change must make it to memory before the flush,
+	 * so that further SLB misses do the right thing. */
+	mb();
+	on_each_cpu(flush_high_segments, (void *)(unsigned long)newareas, 0, 1);
 
 	return 0;
 }
 
 int prepare_hugepage_range(unsigned long addr, unsigned long len)
 {
-	if (within_hugepage_high_range(addr, len))
-		return 0;
-	else if ((addr < 0x100000000UL) && ((addr+len) < 0x100000000UL)) {
-		int err;
-		/* Yes, we need both tests, in case addr+len overflows
-		 * 64-bit arithmetic */
-		err = open_low_hpage_segs(current->mm,
+	int err;
+
+	if ( (addr+len) < addr )
+		return -EINVAL;
+
+	if ((addr + len) < 0x100000000UL)
+		err = open_low_hpage_areas(current->mm,
 					  LOW_ESID_MASK(addr, len));
-		if (err)
-			printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)"
-			       " failed (segs: 0x%04hx)\n", addr, len,
-			       LOW_ESID_MASK(addr, len));
+	else
+		err = open_high_hpage_areas(current->mm,
+					    HTLB_AREA_MASK(addr, len));
+	if (err) {
+		printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)"
+		       " failed (lowmask: 0x%04hx, highmask: 0x%04hx)\n",
+		       addr, len,
+		       LOW_ESID_MASK(addr, len), HTLB_AREA_MASK(addr, len));
 		return err;
 	}
 
-	return -EINVAL;
+	return 0;
 }
 
 struct page *
@@ -309,8 +354,8 @@ full_search:
 			vma = find_vma(mm, addr);
 			continue;
 		}
-		if (touches_hugepage_high_range(addr, len)) {
-			addr = TASK_HPAGE_END;
+		if (touches_hugepage_high_range(mm, addr, len)) {
+			addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
 			vma = find_vma(mm, addr);
 			continue;
 		}
@@ -389,8 +434,9 @@ hugepage_recheck:
 		if (touches_hugepage_low_range(mm, addr, len)) {
 			addr = (addr & ((~0) << SID_SHIFT)) - len;
 			goto hugepage_recheck;
-		} else if (touches_hugepage_high_range(addr, len)) {
-			addr = TASK_HPAGE_BASE - len;
+		} else if (touches_hugepage_high_range(mm, addr, len)) {
+			addr = (addr & ((~0UL) << HTLB_AREA_SHIFT)) - len;
+			goto hugepage_recheck;
 		}
 
 		/*
@@ -481,23 +527,28 @@ static unsigned long htlb_get_low_area(unsigned long len, u16 segmask)
 	return -ENOMEM;
 }
 
-static unsigned long htlb_get_high_area(unsigned long len)
+static unsigned long htlb_get_high_area(unsigned long len, u16 areamask)
 {
-	unsigned long addr = TASK_HPAGE_BASE;
+	unsigned long addr = 0x100000000UL;
 	struct vm_area_struct *vma;
 
 	vma = find_vma(current->mm, addr);
-	for (vma = find_vma(current->mm, addr);
-	     addr + len <= TASK_HPAGE_END;
-	     vma = vma->vm_next) {
+	while (addr + len <= TASK_SIZE_USER64) {
 		BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */
-		BUG_ON(! within_hugepage_high_range(addr, len));
+
+		if (! __within_hugepage_high_range(addr, len, areamask)) {
+			addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
+			vma = find_vma(current->mm, addr);
+			continue;
+		}
 
 		if (!vma || (addr + len) <= vma->vm_start)
 			return addr;
 		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
-		/* Because we're in a hugepage region, this alignment
-		 * should not skip us over any VMAs */
+		/* Depending on segmask this might not be a confirmed
+		 * hugepage region, so the ALIGN could have skipped
+		 * some VMAs */
+		vma = find_vma(current->mm, addr);
 	}
 
 	return -ENOMEM;
@@ -507,6 +558,9 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 					unsigned long len, unsigned long pgoff,
 					unsigned long flags)
 {
+	int lastshift;
+	u16 areamask, curareas;
+
 	if (len & ~HPAGE_MASK)
 		return -EINVAL;
 
@@ -514,67 +568,49 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 		return -EINVAL;
 
 	if (test_thread_flag(TIF_32BIT)) {
-		int lastshift = 0;
-		u16 segmask, cursegs = current->mm->context.htlb_segs;
+		curareas = current->mm->context.low_htlb_areas;
 
 		/* First see if we can do the mapping in the existing
-		 * low hpage segments */
-		addr = htlb_get_low_area(len, cursegs);
+		 * low areas */
+		addr = htlb_get_low_area(len, curareas);
 		if (addr != -ENOMEM)
 			return addr;
 
-		for (segmask = LOW_ESID_MASK(0x100000000UL-len, len);
-		     ! lastshift; segmask >>=1) {
-			if (segmask & 1)
+		lastshift = 0;
+		for (areamask = LOW_ESID_MASK(0x100000000UL-len, len);
+		     ! lastshift; areamask >>=1) {
+			if (areamask & 1)
 				lastshift = 1;
 
-			addr = htlb_get_low_area(len, cursegs | segmask);
+			addr = htlb_get_low_area(len, curareas | areamask);
 			if ((addr != -ENOMEM)
-			    && open_low_hpage_segs(current->mm, segmask) == 0)
+			    && open_low_hpage_areas(current->mm, areamask) == 0)
 				return addr;
 		}
-		printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open"
-		       " enough segments\n");
-		return -ENOMEM;
 	} else {
-		return htlb_get_high_area(len);
-	}
-}
-
-void hugetlb_mm_free_pgd(struct mm_struct *mm)
-{
-	int i;
-	pgd_t *pgdir;
-
-	spin_lock(&mm->page_table_lock);
-
-	pgdir = mm->context.huge_pgdir;
-	if (! pgdir)
-		goto out;
-
-	mm->context.huge_pgdir = NULL;
+		curareas = current->mm->context.high_htlb_areas;
 
-	/* cleanup any hugepte pages leftover */
-	for (i = 0; i < PTRS_PER_HUGEPGD; i++) {
-		pud_t *pud = (pud_t *)(pgdir + i);
-
-		if (! pud_none(*pud)) {
-			pte_t *pte = (pte_t *)pud_page(*pud);
-			struct page *ptepage = virt_to_page(pte);
+		/* First see if we can do the mapping in the existing
+		 * high areas */
+		addr = htlb_get_high_area(len, curareas);
+		if (addr != -ENOMEM)
+			return addr;
 
-			ptepage->mapping = NULL;
+		lastshift = 0;
+		for (areamask = HTLB_AREA_MASK(TASK_SIZE_USER64-len, len);
+		     ! lastshift; areamask >>=1) {
+			if (areamask & 1)
+				lastshift = 1;
 
-			BUG_ON(memcmp(pte, empty_zero_page, PAGE_SIZE));
-			kmem_cache_free(zero_cache, pte);
+			addr = htlb_get_high_area(len, curareas | areamask);
+			if ((addr != -ENOMEM)
+			    && open_high_hpage_areas(current->mm, areamask) == 0)
+				return addr;
 		}
-		pud_clear(pud);
 	}
-
-	BUG_ON(memcmp(pgdir, empty_zero_page, PAGE_SIZE));
-	kmem_cache_free(zero_cache, pgdir);
-
- out:
-	spin_unlock(&mm->page_table_lock);
+	printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open"
+	       " enough areas\n");
+	return -ENOMEM;
 }
 
 int hash_huge_page(struct mm_struct *mm, unsigned long access,
diff --git a/arch/ppc64/mm/imalloc.c b/arch/ppc64/mm/imalloc.c
index b6e75b891ac..c65b87b9275 100644
--- a/arch/ppc64/mm/imalloc.c
+++ b/arch/ppc64/mm/imalloc.c
@@ -31,7 +31,7 @@ static int get_free_im_addr(unsigned long size, unsigned long *im_addr)
 			break;
 		if ((unsigned long)tmp->addr >= ioremap_bot)
 			addr = tmp->size + (unsigned long) tmp->addr;
-		if (addr > IMALLOC_END-size) 
+		if (addr >= IMALLOC_END-size)
 			return 1;
 	}
 	*im_addr = addr;
diff --git a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c
index e58a24d4287..c02dc9809ca 100644
--- a/arch/ppc64/mm/init.c
+++ b/arch/ppc64/mm/init.c
@@ -42,7 +42,6 @@
 
 #include <asm/pgalloc.h>
 #include <asm/page.h>
-#include <asm/abs_addr.h>
 #include <asm/prom.h>
 #include <asm/lmb.h>
 #include <asm/rtas.h>
@@ -66,6 +65,14 @@
 #include <asm/vdso.h>
 #include <asm/imalloc.h>
 
+#if PGTABLE_RANGE > USER_VSID_RANGE
+#warning Limited user VSID range means pagetable space is wasted
+#endif
+
+#if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE)
+#warning TASK_SIZE is smaller than it needs to be.
+#endif
+
 int mem_init_done;
 unsigned long ioremap_bot = IMALLOC_BASE;
 static unsigned long phbs_io_bot = PHBS_IO_BASE;
@@ -159,7 +166,6 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags)
 		ptep = pte_alloc_kernel(&init_mm, pmdp, ea);
 		if (!ptep)
 			return -ENOMEM;
-		pa = abs_to_phys(pa);
 		set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
 							  __pgprot(flags)));
 		spin_unlock(&init_mm.page_table_lock);
@@ -226,7 +232,7 @@ void __iomem * __ioremap(unsigned long addr, unsigned long size,
 	 * Before that, we map using addresses going
 	 * up from ioremap_bot.  imalloc will use
 	 * the addresses from ioremap_bot through
-	 * IMALLOC_END (0xE000001fffffffff)
+	 * IMALLOC_END
 	 * 
 	 */
 	pa = addr & PAGE_MASK;
@@ -417,12 +423,6 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 	int index;
 	int err;
 
-#ifdef CONFIG_HUGETLB_PAGE
-	/* We leave htlb_segs as it was, but for a fork, we need to
-	 * clear the huge_pgdir. */
-	mm->context.huge_pgdir = NULL;
-#endif
-
 again:
 	if (!idr_pre_get(&mmu_context_idr, GFP_KERNEL))
 		return -ENOMEM;
@@ -453,8 +453,6 @@ void destroy_context(struct mm_struct *mm)
 	spin_unlock(&mmu_context_lock);
 
 	mm->context.id = NO_CONTEXT;
-
-	hugetlb_mm_free_pgd(mm);
 }
 
 /*
@@ -484,9 +482,9 @@ void __init mm_init_ppc64(void)
 	for (i = 1; i < lmb.memory.cnt; i++) {
 		unsigned long base, prevbase, prevsize;
 
-		prevbase = lmb.memory.region[i-1].physbase;
+		prevbase = lmb.memory.region[i-1].base;
 		prevsize = lmb.memory.region[i-1].size;
-		base = lmb.memory.region[i].physbase;
+		base = lmb.memory.region[i].base;
 		if (base > (prevbase + prevsize)) {
 			io_hole_start = prevbase + prevsize;
 			io_hole_size = base  - (prevbase + prevsize);
@@ -513,11 +511,8 @@ int page_is_ram(unsigned long pfn)
 	for (i=0; i < lmb.memory.cnt; i++) {
 		unsigned long base;
 
-#ifdef CONFIG_MSCHUNKS
-		base = lmb.memory.region[i].physbase;
-#else
 		base = lmb.memory.region[i].base;
-#endif
+
 		if ((paddr >= base) &&
 			(paddr < (base + lmb.memory.region[i].size))) {
 			return 1;
@@ -547,7 +542,7 @@ void __init do_init_bootmem(void)
 	 */
 	bootmap_pages = bootmem_bootmap_pages(total_pages);
 
-	start = abs_to_phys(lmb_alloc(bootmap_pages<<PAGE_SHIFT, PAGE_SIZE));
+	start = lmb_alloc(bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
 	BUG_ON(!start);
 
 	boot_mapsize = init_bootmem(start >> PAGE_SHIFT, total_pages);
@@ -558,25 +553,25 @@ void __init do_init_bootmem(void)
 	 * present.
 	 */
 	for (i=0; i < lmb.memory.cnt; i++) {
-		unsigned long physbase, size;
+		unsigned long base, size;
 		unsigned long start_pfn, end_pfn;
 
-		physbase = lmb.memory.region[i].physbase;
+		base = lmb.memory.region[i].base;
 		size = lmb.memory.region[i].size;
 
-		start_pfn = physbase >> PAGE_SHIFT;
+		start_pfn = base >> PAGE_SHIFT;
 		end_pfn = start_pfn + (size >> PAGE_SHIFT);
 		memory_present(0, start_pfn, end_pfn);
 
-		free_bootmem(physbase, size);
+		free_bootmem(base, size);
 	}
 
 	/* reserve the sections we're already using */
 	for (i=0; i < lmb.reserved.cnt; i++) {
-		unsigned long physbase = lmb.reserved.region[i].physbase;
+		unsigned long base = lmb.reserved.region[i].base;
 		unsigned long size = lmb.reserved.region[i].size;
 
-		reserve_bootmem(physbase, size);
+		reserve_bootmem(base, size);
 	}
 }
 
@@ -615,10 +610,10 @@ static int __init setup_kcore(void)
 	int i;
 
 	for (i=0; i < lmb.memory.cnt; i++) {
-		unsigned long physbase, size;
+		unsigned long base, size;
 		struct kcore_list *kcore_mem;
 
-		physbase = lmb.memory.region[i].physbase;
+		base = lmb.memory.region[i].base;
 		size = lmb.memory.region[i].size;
 
 		/* GFP_ATOMIC to avoid might_sleep warnings during boot */
@@ -626,7 +621,7 @@ static int __init setup_kcore(void)
 		if (!kcore_mem)
 			panic("mem_init: kmalloc failed\n");
 
-		kclist_add(kcore_mem, __va(physbase), size);
+		kclist_add(kcore_mem, __va(base), size);
 	}
 
 	kclist_add(&kcore_vmem, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START);
@@ -686,9 +681,6 @@ void __init mem_init(void)
 
 	mem_init_done = 1;
 
-#ifdef CONFIG_PPC_ISERIES
-	iommu_vio_init();
-#endif
 	/* Initialize the vDSO */
 	vdso_init();
 }
@@ -833,23 +825,43 @@ void __iomem * reserve_phb_iospace(unsigned long size)
 	return virt_addr;
 }
 
-kmem_cache_t *zero_cache;
-
-static void zero_ctor(void *pte, kmem_cache_t *cache, unsigned long flags)
+static void zero_ctor(void *addr, kmem_cache_t *cache, unsigned long flags)
 {
-	memset(pte, 0, PAGE_SIZE);
+	memset(addr, 0, kmem_cache_size(cache));
 }
 
+static const int pgtable_cache_size[2] = {
+	PTE_TABLE_SIZE, PMD_TABLE_SIZE
+};
+static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = {
+	"pgd_pte_cache", "pud_pmd_cache",
+};
+
+kmem_cache_t *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)];
+
 void pgtable_cache_init(void)
 {
-	zero_cache = kmem_cache_create("zero",
-				PAGE_SIZE,
-				0,
-				SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN,
-				zero_ctor,
-				NULL);
-	if (!zero_cache)
-		panic("pgtable_cache_init(): could not create zero_cache!\n");
+	int i;
+
+	BUILD_BUG_ON(PTE_TABLE_SIZE != pgtable_cache_size[PTE_CACHE_NUM]);
+	BUILD_BUG_ON(PMD_TABLE_SIZE != pgtable_cache_size[PMD_CACHE_NUM]);
+	BUILD_BUG_ON(PUD_TABLE_SIZE != pgtable_cache_size[PUD_CACHE_NUM]);
+	BUILD_BUG_ON(PGD_TABLE_SIZE != pgtable_cache_size[PGD_CACHE_NUM]);
+
+	for (i = 0; i < ARRAY_SIZE(pgtable_cache_size); i++) {
+		int size = pgtable_cache_size[i];
+		const char *name = pgtable_cache_name[i];
+
+		pgtable_cache[i] = kmem_cache_create(name,
+						     size, size,
+						     SLAB_HWCACHE_ALIGN
+						     | SLAB_MUST_HWCACHE_ALIGN,
+						     zero_ctor,
+						     NULL);
+		if (! pgtable_cache[i])
+			panic("pgtable_cache_init(): could not create %s!\n",
+			      name);
+	}
 }
 
 pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr,
diff --git a/arch/ppc64/mm/numa.c b/arch/ppc64/mm/numa.c
index 0b191f2de01..c3116f0d788 100644
--- a/arch/ppc64/mm/numa.c
+++ b/arch/ppc64/mm/numa.c
@@ -671,7 +671,7 @@ new_range:
 		 * Mark reserved regions on this node
 		 */
 		for (i = 0; i < lmb.reserved.cnt; i++) {
-			unsigned long physbase = lmb.reserved.region[i].physbase;
+			unsigned long physbase = lmb.reserved.region[i].base;
 			unsigned long size = lmb.reserved.region[i].size;
 
 			if (pa_to_nid(physbase) != nid &&
diff --git a/arch/ppc64/mm/slb_low.S b/arch/ppc64/mm/slb_low.S
index 8379d678f70..bab255889c5 100644
--- a/arch/ppc64/mm/slb_low.S
+++ b/arch/ppc64/mm/slb_low.S
@@ -89,28 +89,29 @@ END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
 	b	9f
 
 0:	/* user address: proto-VSID = context<<15 | ESID */
-	li	r11,SLB_VSID_USER
-
-	srdi.	r9,r3,13
+	srdi.	r9,r3,USER_ESID_BITS
 	bne-	8f			/* invalid ea bits set */
 
 #ifdef CONFIG_HUGETLB_PAGE
 BEGIN_FTR_SECTION
-	/* check against the hugepage ranges */
-	cmpldi	r3,(TASK_HPAGE_END>>SID_SHIFT)
-	bge	6f			/* >= TASK_HPAGE_END */
-	cmpldi	r3,(TASK_HPAGE_BASE>>SID_SHIFT)
-	bge	5f			/* TASK_HPAGE_BASE..TASK_HPAGE_END */
+	lhz	r9,PACAHIGHHTLBAREAS(r13)
+	srdi	r11,r3,(HTLB_AREA_SHIFT-SID_SHIFT)
+	srd	r9,r9,r11
+	andi.	r9,r9,1
+	bne	5f
+
+	li	r11,SLB_VSID_USER
+
 	cmpldi	r3,16
-	bge	6f			/* 4GB..TASK_HPAGE_BASE */
+	bge	6f
 
-	lhz	r9,PACAHTLBSEGS(r13)
+	lhz	r9,PACALOWHTLBAREAS(r13)
 	srd	r9,r9,r3
 	andi.	r9,r9,1
+
 	beq	6f
 
-5:	/* this is a hugepage user address */
-	li	r11,(SLB_VSID_USER|SLB_VSID_L)
+5:	li	r11,SLB_VSID_USER|SLB_VSID_L
 END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
 #endif /* CONFIG_HUGETLB_PAGE */
 
diff --git a/arch/ppc64/mm/tlb.c b/arch/ppc64/mm/tlb.c
index 26f0172c452..d8a6593a13f 100644
--- a/arch/ppc64/mm/tlb.c
+++ b/arch/ppc64/mm/tlb.c
@@ -41,7 +41,58 @@ DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
 unsigned long pte_freelist_forced_free;
 
-void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage)
+struct pte_freelist_batch
+{
+	struct rcu_head	rcu;
+	unsigned int	index;
+	pgtable_free_t	tables[0];
+};
+
+DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
+unsigned long pte_freelist_forced_free;
+
+#define PTE_FREELIST_SIZE \
+	((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \
+	  / sizeof(pgtable_free_t))
+
+#ifdef CONFIG_SMP
+static void pte_free_smp_sync(void *arg)
+{
+	/* Do nothing, just ensure we sync with all CPUs */
+}
+#endif
+
+/* This is only called when we are critically out of memory
+ * (and fail to get a page in pte_free_tlb).
+ */
+static void pgtable_free_now(pgtable_free_t pgf)
+{
+	pte_freelist_forced_free++;
+
+	smp_call_function(pte_free_smp_sync, NULL, 0, 1);
+
+	pgtable_free(pgf);
+}
+
+static void pte_free_rcu_callback(struct rcu_head *head)
+{
+	struct pte_freelist_batch *batch =
+		container_of(head, struct pte_freelist_batch, rcu);
+	unsigned int i;
+
+	for (i = 0; i < batch->index; i++)
+		pgtable_free(batch->tables[i]);
+
+	free_page((unsigned long)batch);
+}
+
+static void pte_free_submit(struct pte_freelist_batch *batch)
+{
+	INIT_RCU_HEAD(&batch->rcu);
+	call_rcu(&batch->rcu, pte_free_rcu_callback);
+}
+
+void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf)
 {
 	/* This is safe as we are holding page_table_lock */
         cpumask_t local_cpumask = cpumask_of_cpu(smp_processor_id());
@@ -49,19 +100,19 @@ void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage)
 
 	if (atomic_read(&tlb->mm->mm_users) < 2 ||
 	    cpus_equal(tlb->mm->cpu_vm_mask, local_cpumask)) {
-		pte_free(ptepage);
+		pgtable_free(pgf);
 		return;
 	}
 
 	if (*batchp == NULL) {
 		*batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC);
 		if (*batchp == NULL) {
-			pte_free_now(ptepage);
+			pgtable_free_now(pgf);
 			return;
 		}
 		(*batchp)->index = 0;
 	}
-	(*batchp)->pages[(*batchp)->index++] = ptepage;
+	(*batchp)->tables[(*batchp)->index++] = pgf;
 	if ((*batchp)->index == PTE_FREELIST_SIZE) {
 		pte_free_submit(*batchp);
 		*batchp = NULL;
@@ -132,42 +183,6 @@ void __flush_tlb_pending(struct ppc64_tlb_batch *batch)
 	put_cpu();
 }
 
-#ifdef CONFIG_SMP
-static void pte_free_smp_sync(void *arg)
-{
-	/* Do nothing, just ensure we sync with all CPUs */
-}
-#endif
-
-/* This is only called when we are critically out of memory
- * (and fail to get a page in pte_free_tlb).
- */
-void pte_free_now(struct page *ptepage)
-{
-	pte_freelist_forced_free++;
-
-	smp_call_function(pte_free_smp_sync, NULL, 0, 1);
-
-	pte_free(ptepage);
-}
-
-static void pte_free_rcu_callback(struct rcu_head *head)
-{
-	struct pte_freelist_batch *batch =
-		container_of(head, struct pte_freelist_batch, rcu);
-	unsigned int i;
-
-	for (i = 0; i < batch->index; i++)
-		pte_free(batch->pages[i]);
-	free_page((unsigned long)batch);
-}
-
-void pte_free_submit(struct pte_freelist_batch *batch)
-{
-	INIT_RCU_HEAD(&batch->rcu);
-	call_rcu(&batch->rcu, pte_free_rcu_callback);
-}
-
 void pte_free_finish(void)
 {
 	/* This is safe as we are holding page_table_lock */
diff --git a/arch/ppc64/oprofile/common.c b/arch/ppc64/oprofile/common.c
index b28bfda23d9..4acd1a42493 100644
--- a/arch/ppc64/oprofile/common.c
+++ b/arch/ppc64/oprofile/common.c
@@ -153,6 +153,7 @@ int __init oprofile_arch_init(struct oprofile_operations *ops)
 
 		case PV_970:
 		case PV_970FX:
+		case PV_970MP:
 			model = &op_model_power4;
 			model->num_counters = 8;
 			ops->cpu_type = "ppc64/970";
diff --git a/arch/ppc64/xmon/start.c b/arch/ppc64/xmon/start.c
index a9265bcc79b..f86b584acd7 100644
--- a/arch/ppc64/xmon/start.c
+++ b/arch/ppc64/xmon/start.c
@@ -27,7 +27,7 @@ static void sysrq_handle_xmon(int key, struct pt_regs *pt_regs,
 			      struct tty_struct *tty) 
 {
 	/* ensure xmon is enabled */
-	xmon_init();
+	xmon_init(1);
 	debugger(pt_regs);
 }
 
diff --git a/arch/ppc64/xmon/xmon.c b/arch/ppc64/xmon/xmon.c
index 05539439e6b..45908b10acd 100644
--- a/arch/ppc64/xmon/xmon.c
+++ b/arch/ppc64/xmon/xmon.c
@@ -2496,15 +2496,25 @@ static void dump_stab(void)
 	}
 }
 
-void xmon_init(void)
-{
-	__debugger = xmon;
-	__debugger_ipi = xmon_ipi;
-	__debugger_bpt = xmon_bpt;
-	__debugger_sstep = xmon_sstep;
-	__debugger_iabr_match = xmon_iabr_match;
-	__debugger_dabr_match = xmon_dabr_match;
-	__debugger_fault_handler = xmon_fault_handler;
+void xmon_init(int enable)
+{
+	if (enable) {
+		__debugger = xmon;
+		__debugger_ipi = xmon_ipi;
+		__debugger_bpt = xmon_bpt;
+		__debugger_sstep = xmon_sstep;
+		__debugger_iabr_match = xmon_iabr_match;
+		__debugger_dabr_match = xmon_dabr_match;
+		__debugger_fault_handler = xmon_fault_handler;
+	} else {
+		__debugger = NULL;
+		__debugger_ipi = NULL;
+		__debugger_bpt = NULL;
+		__debugger_sstep = NULL;
+		__debugger_iabr_match = NULL;
+		__debugger_dabr_match = NULL;
+		__debugger_fault_handler = NULL;
+	}
 }
 
 void dump_segments(void)
diff --git a/drivers/atm/ambassador.c b/drivers/atm/ambassador.c
index 73c6b85299c..d74a7c5e75d 100644
--- a/drivers/atm/ambassador.c
+++ b/drivers/atm/ambassador.c
@@ -513,7 +513,7 @@ static void rx_complete (amb_dev * dev, rx_out * rx) {
 	  
 	  // VC layer stats
 	  atomic_inc(&atm_vcc->stats->rx);
-	  do_gettimeofday(&skb->stamp);
+	  __net_timestamp(skb);
 	  // end of our responsability
 	  atm_vcc->push (atm_vcc, skb);
 	  return;
diff --git a/drivers/atm/atmtcp.c b/drivers/atm/atmtcp.c
index f2f01cb82cb..57f1810fdcc 100644
--- a/drivers/atm/atmtcp.c
+++ b/drivers/atm/atmtcp.c
@@ -325,7 +325,7 @@ static int atmtcp_c_send(struct atm_vcc *vcc,struct sk_buff *skb)
 		result = -ENOBUFS;
 		goto done;
 	}
-	do_gettimeofday(&new_skb->stamp);
+	__net_timestamp(new_skb);
 	memcpy(skb_put(new_skb,skb->len),skb->data,skb->len);
 	out_vcc->push(out_vcc,new_skb);
 	atomic_inc(&vcc->stats->tx);
diff --git a/drivers/atm/eni.c b/drivers/atm/eni.c
index 10da3693476..c13c4d736ef 100644
--- a/drivers/atm/eni.c
+++ b/drivers/atm/eni.c
@@ -537,7 +537,7 @@ static int rx_aal0(struct atm_vcc *vcc)
 		return 0;
 	}
 	skb_put(skb,length);
-	skb->stamp = eni_vcc->timestamp;
+	skb_set_timestamp(skb, &eni_vcc->timestamp);
 	DPRINTK("got len %ld\n",length);
 	if (do_rx_dma(vcc,skb,1,length >> 2,length >> 2)) return 1;
 	eni_vcc->rxing++;
diff --git a/drivers/atm/firestream.c b/drivers/atm/firestream.c
index b078fa548eb..58219744f5d 100644
--- a/drivers/atm/firestream.c
+++ b/drivers/atm/firestream.c
@@ -815,7 +815,7 @@ static void process_incoming (struct fs_dev *dev, struct queue *q)
 				skb_put (skb, qe->p1 & 0xffff); 
 				ATM_SKB(skb)->vcc = atm_vcc;
 				atomic_inc(&atm_vcc->stats->rx);
-				do_gettimeofday(&skb->stamp);
+				__net_timestamp(skb);
 				fs_dprintk (FS_DEBUG_ALLOC, "Free rec-skb: %p (pushed)\n", skb);
 				atm_vcc->push (atm_vcc, skb);
 				fs_dprintk (FS_DEBUG_ALLOC, "Free rec-d: %p\n", pe);
diff --git a/drivers/atm/fore200e.c b/drivers/atm/fore200e.c
index 5f702199543..2bf723a7b6e 100644
--- a/drivers/atm/fore200e.c
+++ b/drivers/atm/fore200e.c
@@ -1176,7 +1176,7 @@ fore200e_push_rpd(struct fore200e* fore200e, struct atm_vcc* vcc, struct rpd* rp
 	return -ENOMEM;
     } 
 
-    do_gettimeofday(&skb->stamp);
+    __net_timestamp(skb);
     
 #ifdef FORE200E_52BYTE_AAL0_SDU
     if (cell_header) {
diff --git a/drivers/atm/he.c b/drivers/atm/he.c
index 28250c9b32d..fde9334059a 100644
--- a/drivers/atm/he.c
+++ b/drivers/atm/he.c
@@ -1886,7 +1886,7 @@ he_service_rbrq(struct he_dev *he_dev, int group)
 		if (rx_skb_reserve > 0)
 			skb_reserve(skb, rx_skb_reserve);
 
-		do_gettimeofday(&skb->stamp);
+		__net_timestamp(skb);
 
 		for (iov = he_vcc->iov_head;
 				iov < he_vcc->iov_tail; ++iov) {
diff --git a/drivers/atm/horizon.c b/drivers/atm/horizon.c
index 924a2c8988b..0cded046800 100644
--- a/drivers/atm/horizon.c
+++ b/drivers/atm/horizon.c
@@ -1034,7 +1034,7 @@ static void rx_schedule (hrz_dev * dev, int irq) {
 	  struct atm_vcc * vcc = ATM_SKB(skb)->vcc;
 	  // VC layer stats
 	  atomic_inc(&vcc->stats->rx);
-	  do_gettimeofday(&skb->stamp);
+	  __net_timestamp(skb);
 	  // end of our responsability
 	  vcc->push (vcc, skb);
 	}
diff --git a/drivers/atm/idt77252.c b/drivers/atm/idt77252.c
index 30b7e990ed0..b4a76cade64 100644
--- a/drivers/atm/idt77252.c
+++ b/drivers/atm/idt77252.c
@@ -1101,7 +1101,7 @@ dequeue_rx(struct idt77252_dev *card, struct rsq_entry *rsqe)
 			       cell, ATM_CELL_PAYLOAD);
 
 			ATM_SKB(sb)->vcc = vcc;
-			do_gettimeofday(&sb->stamp);
+			__net_timestamp(sb);
 			vcc->push(vcc, sb);
 			atomic_inc(&vcc->stats->rx);
 
@@ -1179,7 +1179,7 @@ dequeue_rx(struct idt77252_dev *card, struct rsq_entry *rsqe)
 
 			skb_trim(skb, len);
 			ATM_SKB(skb)->vcc = vcc;
-			do_gettimeofday(&skb->stamp);
+			__net_timestamp(skb);
 
 			vcc->push(vcc, skb);
 			atomic_inc(&vcc->stats->rx);
@@ -1201,7 +1201,7 @@ dequeue_rx(struct idt77252_dev *card, struct rsq_entry *rsqe)
 
 		skb_trim(skb, len);
 		ATM_SKB(skb)->vcc = vcc;
-		do_gettimeofday(&skb->stamp);
+		__net_timestamp(skb);
 
 		vcc->push(vcc, skb);
 		atomic_inc(&vcc->stats->rx);
@@ -1340,7 +1340,7 @@ idt77252_rx_raw(struct idt77252_dev *card)
 		       ATM_CELL_PAYLOAD);
 
 		ATM_SKB(sb)->vcc = vcc;
-		do_gettimeofday(&sb->stamp);
+		__net_timestamp(sb);
 		vcc->push(vcc, sb);
 		atomic_inc(&vcc->stats->rx);
 
diff --git a/drivers/atm/lanai.c b/drivers/atm/lanai.c
index ffe3afa723b..51ec1478729 100644
--- a/drivers/atm/lanai.c
+++ b/drivers/atm/lanai.c
@@ -1427,7 +1427,7 @@ static void vcc_rx_aal5(struct lanai_vcc *lvcc, int endptr)
 	skb_put(skb, size);
 	vcc_rx_memcpy(skb->data, lvcc, size);
 	ATM_SKB(skb)->vcc = lvcc->rx.atmvcc;
-	do_gettimeofday(&skb->stamp);
+	__net_timestamp(skb);
 	lvcc->rx.atmvcc->push(lvcc->rx.atmvcc, skb);
 	atomic_inc(&lvcc->rx.atmvcc->stats->rx);
     out:
diff --git a/drivers/atm/nicstar.c b/drivers/atm/nicstar.c
index b2a7b754fd1..c57e20dcb0f 100644
--- a/drivers/atm/nicstar.c
+++ b/drivers/atm/nicstar.c
@@ -214,8 +214,7 @@ static int __devinit ns_init_card(int i, struct pci_dev *pcidev);
 static void __devinit ns_init_card_error(ns_dev *card, int error);
 static scq_info *get_scq(int size, u32 scd);
 static void free_scq(scq_info *scq, struct atm_vcc *vcc);
-static void push_rxbufs(ns_dev *card, u32 type, u32 handle1, u32 addr1,
-                       u32 handle2, u32 addr2);
+static void push_rxbufs(ns_dev *, struct sk_buff *);
 static irqreturn_t ns_irq_handler(int irq, void *dev_id, struct pt_regs *regs);
 static int ns_open(struct atm_vcc *vcc);
 static void ns_close(struct atm_vcc *vcc);
@@ -766,6 +765,7 @@ static int __devinit ns_init_card(int i, struct pci_dev *pcidev)
          ns_init_card_error(card, error);
 	 return error;
       }
+      NS_SKB_CB(hb)->buf_type = BUF_NONE;
       skb_queue_tail(&card->hbpool.queue, hb);
       card->hbpool.count++;
    }
@@ -786,9 +786,10 @@ static int __devinit ns_init_card(int i, struct pci_dev *pcidev)
          ns_init_card_error(card, error);
 	 return error;
       }
+      NS_SKB_CB(lb)->buf_type = BUF_LG;
       skb_queue_tail(&card->lbpool.queue, lb);
       skb_reserve(lb, NS_SMBUFSIZE);
-      push_rxbufs(card, BUF_LG, (u32) lb, (u32) virt_to_bus(lb->data), 0, 0);
+      push_rxbufs(card, lb);
       /* Due to the implementation of push_rxbufs() this is 1, not 0 */
       if (j == 1)
       {
@@ -822,9 +823,10 @@ static int __devinit ns_init_card(int i, struct pci_dev *pcidev)
          ns_init_card_error(card, error);
 	 return error;
       }
+      NS_SKB_CB(sb)->buf_type = BUF_SM;
       skb_queue_tail(&card->sbpool.queue, sb);
       skb_reserve(sb, NS_AAL0_HEADER);
-      push_rxbufs(card, BUF_SM, (u32) sb, (u32) virt_to_bus(sb->data), 0, 0);
+      push_rxbufs(card, sb);
    }
    /* Test for strange behaviour which leads to crashes */
    if ((bcount = ns_stat_sfbqc_get(readl(card->membase + STAT))) < card->sbnr.min)
@@ -852,6 +854,7 @@ static int __devinit ns_init_card(int i, struct pci_dev *pcidev)
          ns_init_card_error(card, error);
 	 return error;
       }
+      NS_SKB_CB(iovb)->buf_type = BUF_NONE;
       skb_queue_tail(&card->iovpool.queue, iovb);
       card->iovpool.count++;
    }
@@ -1078,12 +1081,18 @@ static void free_scq(scq_info *scq, struct atm_vcc *vcc)
 
 /* The handles passed must be pointers to the sk_buff containing the small
    or large buffer(s) cast to u32. */
-static void push_rxbufs(ns_dev *card, u32 type, u32 handle1, u32 addr1,
-                       u32 handle2, u32 addr2)
+static void push_rxbufs(ns_dev *card, struct sk_buff *skb)
 {
+   struct ns_skb_cb *cb = NS_SKB_CB(skb);
+   u32 handle1, addr1;
+   u32 handle2, addr2;
    u32 stat;
    unsigned long flags;
    
+   /* *BARF* */
+   handle2 = addr2 = 0;
+   handle1 = (u32)skb;
+   addr1 = (u32)virt_to_bus(skb->data);
 
 #ifdef GENERAL_DEBUG
    if (!addr1)
@@ -1093,7 +1102,7 @@ static void push_rxbufs(ns_dev *card, u32 type, u32 handle1, u32 addr1,
    stat = readl(card->membase + STAT);
    card->sbfqc = ns_stat_sfbqc_get(stat);
    card->lbfqc = ns_stat_lfbqc_get(stat);
-   if (type == BUF_SM)
+   if (cb->buf_type == BUF_SM)
    {
       if (!addr2)
       {
@@ -1111,7 +1120,7 @@ static void push_rxbufs(ns_dev *card, u32 type, u32 handle1, u32 addr1,
 	 }
       }      
    }
-   else /* type == BUF_LG */
+   else /* buf_type == BUF_LG */
    {
       if (!addr2)
       {
@@ -1132,26 +1141,26 @@ static void push_rxbufs(ns_dev *card, u32 type, u32 handle1, u32 addr1,
 
    if (addr2)
    {
-      if (type == BUF_SM)
+      if (cb->buf_type == BUF_SM)
       {
          if (card->sbfqc >= card->sbnr.max)
          {
-            skb_unlink((struct sk_buff *) handle1);
+            skb_unlink((struct sk_buff *) handle1, &card->sbpool.queue);
             dev_kfree_skb_any((struct sk_buff *) handle1);
-            skb_unlink((struct sk_buff *) handle2);
+            skb_unlink((struct sk_buff *) handle2, &card->sbpool.queue);
             dev_kfree_skb_any((struct sk_buff *) handle2);
             return;
          }
 	 else
             card->sbfqc += 2;
       }
-      else /* (type == BUF_LG) */
+      else /* (buf_type == BUF_LG) */
       {
          if (card->lbfqc >= card->lbnr.max)
          {
-            skb_unlink((struct sk_buff *) handle1);
+            skb_unlink((struct sk_buff *) handle1, &card->lbpool.queue);
             dev_kfree_skb_any((struct sk_buff *) handle1);
-            skb_unlink((struct sk_buff *) handle2);
+            skb_unlink((struct sk_buff *) handle2, &card->lbpool.queue);
             dev_kfree_skb_any((struct sk_buff *) handle2);
             return;
          }
@@ -1166,12 +1175,12 @@ static void push_rxbufs(ns_dev *card, u32 type, u32 handle1, u32 addr1,
       writel(handle2, card->membase + DR2);
       writel(addr1, card->membase + DR1);
       writel(handle1, card->membase + DR0);
-      writel(NS_CMD_WRITE_FREEBUFQ | (u32) type, card->membase + CMD);
+      writel(NS_CMD_WRITE_FREEBUFQ | cb->buf_type, card->membase + CMD);
  
       spin_unlock_irqrestore(&card->res_lock, flags);
 
       XPRINTK("nicstar%d: Pushing %s buffers at 0x%x and 0x%x.\n", card->index,
-              (type == BUF_SM ? "small" : "large"), addr1, addr2);
+              (cb->buf_type == BUF_SM ? "small" : "large"), addr1, addr2);
    }
 
    if (!card->efbie && card->sbfqc >= card->sbnr.min &&
@@ -1322,9 +1331,10 @@ static irqreturn_t ns_irq_handler(int irq, void *dev_id, struct pt_regs *regs)
             card->efbie = 0;
             break;
          }
+         NS_SKB_CB(sb)->buf_type = BUF_SM;
          skb_queue_tail(&card->sbpool.queue, sb);
          skb_reserve(sb, NS_AAL0_HEADER);
-         push_rxbufs(card, BUF_SM, (u32) sb, (u32) virt_to_bus(sb->data), 0, 0);
+         push_rxbufs(card, sb);
       }
       card->sbfqc = i;
       process_rsq(card);
@@ -1348,9 +1358,10 @@ static irqreturn_t ns_irq_handler(int irq, void *dev_id, struct pt_regs *regs)
             card->efbie = 0;
             break;
          }
+         NS_SKB_CB(lb)->buf_type = BUF_LG;
          skb_queue_tail(&card->lbpool.queue, lb);
          skb_reserve(lb, NS_SMBUFSIZE);
-         push_rxbufs(card, BUF_LG, (u32) lb, (u32) virt_to_bus(lb->data), 0, 0);
+         push_rxbufs(card, lb);
       }
       card->lbfqc = i;
       process_rsq(card);
@@ -2202,7 +2213,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe)
          memcpy(sb->tail, cell, ATM_CELL_PAYLOAD);
          skb_put(sb, ATM_CELL_PAYLOAD);
          ATM_SKB(sb)->vcc = vcc;
-         do_gettimeofday(&sb->stamp);
+	 __net_timestamp(sb);
          vcc->push(vcc, sb);
          atomic_inc(&vcc->stats->rx);
          cell += ATM_CELL_PAYLOAD;
@@ -2227,6 +2238,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe)
             recycle_rx_buf(card, skb);
             return;
 	 }
+         NS_SKB_CB(iovb)->buf_type = BUF_NONE;
       }
       else
          if (--card->iovpool.count < card->iovnr.min)
@@ -2234,6 +2246,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe)
 	    struct sk_buff *new_iovb;
 	    if ((new_iovb = alloc_skb(NS_IOVBUFSIZE, GFP_ATOMIC)) != NULL)
 	    {
+               NS_SKB_CB(iovb)->buf_type = BUF_NONE;
                skb_queue_tail(&card->iovpool.queue, new_iovb);
                card->iovpool.count++;
 	    }
@@ -2264,7 +2277,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe)
 
    if (NS_SKB(iovb)->iovcnt == 1)
    {
-      if (skb->list != &card->sbpool.queue)
+      if (NS_SKB_CB(skb)->buf_type != BUF_SM)
       {
          printk("nicstar%d: Expected a small buffer, and this is not one.\n",
 	        card->index);
@@ -2278,7 +2291,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe)
    }
    else /* NS_SKB(iovb)->iovcnt >= 2 */
    {
-      if (skb->list != &card->lbpool.queue)
+      if (NS_SKB_CB(skb)->buf_type != BUF_LG)
       {
          printk("nicstar%d: Expected a large buffer, and this is not one.\n",
 	        card->index);
@@ -2322,8 +2335,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe)
          /* skb points to a small buffer */
          if (!atm_charge(vcc, skb->truesize))
          {
-            push_rxbufs(card, BUF_SM, (u32) skb, (u32) virt_to_bus(skb->data),
-                        0, 0);
+            push_rxbufs(card, skb);
             atomic_inc(&vcc->stats->rx_drop);
          }
          else
@@ -2334,7 +2346,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe)
             skb->destructor = ns_sb_destructor;
 #endif /* NS_USE_DESTRUCTORS */
             ATM_SKB(skb)->vcc = vcc;
-            do_gettimeofday(&skb->stamp);
+	    __net_timestamp(skb);
             vcc->push(vcc, skb);
             atomic_inc(&vcc->stats->rx);
          }
@@ -2350,8 +2362,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe)
 	 {
             if (!atm_charge(vcc, sb->truesize))
             {
-               push_rxbufs(card, BUF_SM, (u32) sb, (u32) virt_to_bus(sb->data),
-                           0, 0);
+               push_rxbufs(card, sb);
                atomic_inc(&vcc->stats->rx_drop);
             }
             else
@@ -2362,21 +2373,19 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe)
                sb->destructor = ns_sb_destructor;
 #endif /* NS_USE_DESTRUCTORS */
                ATM_SKB(sb)->vcc = vcc;
-               do_gettimeofday(&sb->stamp);
+	       __net_timestamp(sb);
                vcc->push(vcc, sb);
                atomic_inc(&vcc->stats->rx);
             }
 
-            push_rxbufs(card, BUF_LG, (u32) skb,
-	                   (u32) virt_to_bus(skb->data), 0, 0);
+            push_rxbufs(card, skb);
 
 	 }
 	 else			/* len > NS_SMBUFSIZE, the usual case */
 	 {
             if (!atm_charge(vcc, skb->truesize))
             {
-               push_rxbufs(card, BUF_LG, (u32) skb,
-                           (u32) virt_to_bus(skb->data), 0, 0);
+               push_rxbufs(card, skb);
                atomic_inc(&vcc->stats->rx_drop);
             }
             else
@@ -2389,13 +2398,12 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe)
                memcpy(skb->data, sb->data, NS_SMBUFSIZE);
                skb_put(skb, len - NS_SMBUFSIZE);
                ATM_SKB(skb)->vcc = vcc;
-               do_gettimeofday(&skb->stamp);
+	       __net_timestamp(skb);
                vcc->push(vcc, skb);
                atomic_inc(&vcc->stats->rx);
             }
 
-            push_rxbufs(card, BUF_SM, (u32) sb, (u32) virt_to_bus(sb->data),
-                        0, 0);
+            push_rxbufs(card, sb);
 
          }
 	 
@@ -2430,6 +2438,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe)
                   card->hbpool.count++;
                }
             }
+            NS_SKB_CB(hb)->buf_type = BUF_NONE;
 	 }
 	 else
          if (--card->hbpool.count < card->hbnr.min)
@@ -2437,6 +2446,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe)
             struct sk_buff *new_hb;
             if ((new_hb = dev_alloc_skb(NS_HBUFSIZE)) != NULL)
             {
+               NS_SKB_CB(new_hb)->buf_type = BUF_NONE;
                skb_queue_tail(&card->hbpool.queue, new_hb);
                card->hbpool.count++;
             }
@@ -2444,6 +2454,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe)
 	    {
                if ((new_hb = dev_alloc_skb(NS_HBUFSIZE)) != NULL)
                {
+                  NS_SKB_CB(new_hb)->buf_type = BUF_NONE;
                   skb_queue_tail(&card->hbpool.queue, new_hb);
                   card->hbpool.count++;
                }
@@ -2473,8 +2484,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe)
             remaining = len - iov->iov_len;
             iov++;
             /* Free the small buffer */
-            push_rxbufs(card, BUF_SM, (u32) sb, (u32) virt_to_bus(sb->data),
-                        0, 0);
+            push_rxbufs(card, sb);
 
             /* Copy all large buffers to the huge buffer and free them */
             for (j = 1; j < NS_SKB(iovb)->iovcnt; j++)
@@ -2485,8 +2495,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe)
                skb_put(hb, tocopy);
                iov++;
                remaining -= tocopy;
-               push_rxbufs(card, BUF_LG, (u32) lb,
-                           (u32) virt_to_bus(lb->data), 0, 0);
+               push_rxbufs(card, lb);
             }
 #ifdef EXTRA_DEBUG
             if (remaining != 0 || hb->len != len)
@@ -2496,7 +2505,7 @@ static void dequeue_rx(ns_dev *card, ns_rsqe *rsqe)
 #ifdef NS_USE_DESTRUCTORS
             hb->destructor = ns_hb_destructor;
 #endif /* NS_USE_DESTRUCTORS */
-            do_gettimeofday(&hb->stamp);
+	    __net_timestamp(hb);
             vcc->push(vcc, hb);
             atomic_inc(&vcc->stats->rx);
          }
@@ -2527,9 +2536,10 @@ static void ns_sb_destructor(struct sk_buff *sb)
       sb = __dev_alloc_skb(NS_SMSKBSIZE, GFP_KERNEL);
       if (sb == NULL)
          break;
+      NS_SKB_CB(sb)->buf_type = BUF_SM;
       skb_queue_tail(&card->sbpool.queue, sb);
       skb_reserve(sb, NS_AAL0_HEADER);
-      push_rxbufs(card, BUF_SM, (u32) sb, (u32) virt_to_bus(sb->data), 0, 0);
+      push_rxbufs(card, sb);
    } while (card->sbfqc < card->sbnr.min);
 }
 
@@ -2550,9 +2560,10 @@ static void ns_lb_destructor(struct sk_buff *lb)
       lb = __dev_alloc_skb(NS_LGSKBSIZE, GFP_KERNEL);
       if (lb == NULL)
          break;
+      NS_SKB_CB(lb)->buf_type = BUF_LG;
       skb_queue_tail(&card->lbpool.queue, lb);
       skb_reserve(lb, NS_SMBUFSIZE);
-      push_rxbufs(card, BUF_LG, (u32) lb, (u32) virt_to_bus(lb->data), 0, 0);
+      push_rxbufs(card, lb);
    } while (card->lbfqc < card->lbnr.min);
 }
 
@@ -2569,6 +2580,7 @@ static void ns_hb_destructor(struct sk_buff *hb)
       hb = __dev_alloc_skb(NS_HBUFSIZE, GFP_KERNEL);
       if (hb == NULL)
          break;
+      NS_SKB_CB(hb)->buf_type = BUF_NONE;
       skb_queue_tail(&card->hbpool.queue, hb);
       card->hbpool.count++;
    }
@@ -2577,45 +2589,25 @@ static void ns_hb_destructor(struct sk_buff *hb)
 #endif /* NS_USE_DESTRUCTORS */
 
 
-
 static void recycle_rx_buf(ns_dev *card, struct sk_buff *skb)
 {
-   if (skb->list == &card->sbpool.queue)
-      push_rxbufs(card, BUF_SM, (u32) skb, (u32) virt_to_bus(skb->data), 0, 0);
-   else if (skb->list == &card->lbpool.queue)
-      push_rxbufs(card, BUF_LG, (u32) skb, (u32) virt_to_bus(skb->data), 0, 0);
-   else
-   {
-      printk("nicstar%d: What kind of rx buffer is this?\n", card->index);
-      dev_kfree_skb_any(skb);
-   }
-}
+	struct ns_skb_cb *cb = NS_SKB_CB(skb);
 
+	if (unlikely(cb->buf_type == BUF_NONE)) {
+		printk("nicstar%d: What kind of rx buffer is this?\n", card->index);
+		dev_kfree_skb_any(skb);
+	} else
+		push_rxbufs(card, skb);
+}
 
 
 static void recycle_iovec_rx_bufs(ns_dev *card, struct iovec *iov, int count)
 {
-   struct sk_buff *skb;
-
-   for (; count > 0; count--)
-   {
-      skb = (struct sk_buff *) (iov++)->iov_base;
-      if (skb->list == &card->sbpool.queue)
-         push_rxbufs(card, BUF_SM, (u32) skb, (u32) virt_to_bus(skb->data),
-	             0, 0);
-      else if (skb->list == &card->lbpool.queue)
-         push_rxbufs(card, BUF_LG, (u32) skb, (u32) virt_to_bus(skb->data),
-	             0, 0);
-      else
-      {
-         printk("nicstar%d: What kind of rx buffer is this?\n", card->index);
-         dev_kfree_skb_any(skb);
-      }
-   }
+	while (count-- > 0)
+		recycle_rx_buf(card, (struct sk_buff *) (iov++)->iov_base);
 }
 
 
-
 static void recycle_iov_buf(ns_dev *card, struct sk_buff *iovb)
 {
    if (card->iovpool.count < card->iovnr.max)
@@ -2631,7 +2623,7 @@ static void recycle_iov_buf(ns_dev *card, struct sk_buff *iovb)
 
 static void dequeue_sm_buf(ns_dev *card, struct sk_buff *sb)
 {
-   skb_unlink(sb);
+   skb_unlink(sb, &card->sbpool.queue);
 #ifdef NS_USE_DESTRUCTORS
    if (card->sbfqc < card->sbnr.min)
 #else
@@ -2640,10 +2632,10 @@ static void dequeue_sm_buf(ns_dev *card, struct sk_buff *sb)
       struct sk_buff *new_sb;
       if ((new_sb = dev_alloc_skb(NS_SMSKBSIZE)) != NULL)
       {
+         NS_SKB_CB(new_sb)->buf_type = BUF_SM;
          skb_queue_tail(&card->sbpool.queue, new_sb);
          skb_reserve(new_sb, NS_AAL0_HEADER);
-         push_rxbufs(card, BUF_SM, (u32) new_sb,
-                     (u32) virt_to_bus(new_sb->data), 0, 0);
+         push_rxbufs(card, new_sb);
       }
    }
    if (card->sbfqc < card->sbnr.init)
@@ -2652,10 +2644,10 @@ static void dequeue_sm_buf(ns_dev *card, struct sk_buff *sb)
       struct sk_buff *new_sb;
       if ((new_sb = dev_alloc_skb(NS_SMSKBSIZE)) != NULL)
       {
+         NS_SKB_CB(new_sb)->buf_type = BUF_SM;
          skb_queue_tail(&card->sbpool.queue, new_sb);
          skb_reserve(new_sb, NS_AAL0_HEADER);
-         push_rxbufs(card, BUF_SM, (u32) new_sb,
-                     (u32) virt_to_bus(new_sb->data), 0, 0);
+         push_rxbufs(card, new_sb);
       }
    }
 }
@@ -2664,7 +2656,7 @@ static void dequeue_sm_buf(ns_dev *card, struct sk_buff *sb)
 
 static void dequeue_lg_buf(ns_dev *card, struct sk_buff *lb)
 {
-   skb_unlink(lb);
+   skb_unlink(lb, &card->lbpool.queue);
 #ifdef NS_USE_DESTRUCTORS
    if (card->lbfqc < card->lbnr.min)
 #else
@@ -2673,10 +2665,10 @@ static void dequeue_lg_buf(ns_dev *card, struct sk_buff *lb)
       struct sk_buff *new_lb;
       if ((new_lb = dev_alloc_skb(NS_LGSKBSIZE)) != NULL)
       {
+         NS_SKB_CB(new_lb)->buf_type = BUF_LG;
          skb_queue_tail(&card->lbpool.queue, new_lb);
          skb_reserve(new_lb, NS_SMBUFSIZE);
-         push_rxbufs(card, BUF_LG, (u32) new_lb,
-                     (u32) virt_to_bus(new_lb->data), 0, 0);
+         push_rxbufs(card, new_lb);
       }
    }
    if (card->lbfqc < card->lbnr.init)
@@ -2685,10 +2677,10 @@ static void dequeue_lg_buf(ns_dev *card, struct sk_buff *lb)
       struct sk_buff *new_lb;
       if ((new_lb = dev_alloc_skb(NS_LGSKBSIZE)) != NULL)
       {
+         NS_SKB_CB(new_lb)->buf_type = BUF_LG;
          skb_queue_tail(&card->lbpool.queue, new_lb);
          skb_reserve(new_lb, NS_SMBUFSIZE);
-         push_rxbufs(card, BUF_LG, (u32) new_lb,
-                     (u32) virt_to_bus(new_lb->data), 0, 0);
+         push_rxbufs(card, new_lb);
       }
    }
 }
@@ -2880,9 +2872,10 @@ static int ns_ioctl(struct atm_dev *dev, unsigned int cmd, void __user *arg)
                   sb = __dev_alloc_skb(NS_SMSKBSIZE, GFP_KERNEL);
                   if (sb == NULL)
                      return -ENOMEM;
+                  NS_SKB_CB(sb)->buf_type = BUF_SM;
                   skb_queue_tail(&card->sbpool.queue, sb);
                   skb_reserve(sb, NS_AAL0_HEADER);
-                  push_rxbufs(card, BUF_SM, (u32) sb, (u32) virt_to_bus(sb->data), 0, 0);
+                  push_rxbufs(card, sb);
 	       }
 	       break;
 
@@ -2894,9 +2887,10 @@ static int ns_ioctl(struct atm_dev *dev, unsigned int cmd, void __user *arg)
                   lb = __dev_alloc_skb(NS_LGSKBSIZE, GFP_KERNEL);
                   if (lb == NULL)
                      return -ENOMEM;
+                  NS_SKB_CB(lb)->buf_type = BUF_LG;
                   skb_queue_tail(&card->lbpool.queue, lb);
                   skb_reserve(lb, NS_SMBUFSIZE);
-                  push_rxbufs(card, BUF_LG, (u32) lb, (u32) virt_to_bus(lb->data), 0, 0);
+                  push_rxbufs(card, lb);
 	       }
 	       break;
 
@@ -2923,6 +2917,7 @@ static int ns_ioctl(struct atm_dev *dev, unsigned int cmd, void __user *arg)
                   hb = __dev_alloc_skb(NS_HBUFSIZE, GFP_KERNEL);
                   if (hb == NULL)
                      return -ENOMEM;
+                  NS_SKB_CB(hb)->buf_type = BUF_NONE;
                   ns_grab_int_lock(card, flags);
                   skb_queue_tail(&card->hbpool.queue, hb);
                   card->hbpool.count++;
@@ -2953,6 +2948,7 @@ static int ns_ioctl(struct atm_dev *dev, unsigned int cmd, void __user *arg)
                   iovb = alloc_skb(NS_IOVBUFSIZE, GFP_KERNEL);
                   if (iovb == NULL)
                      return -ENOMEM;
+                  NS_SKB_CB(iovb)->buf_type = BUF_NONE;
                   ns_grab_int_lock(card, flags);
                   skb_queue_tail(&card->iovpool.queue, iovb);
                   card->iovpool.count++;
@@ -2979,17 +2975,12 @@ static int ns_ioctl(struct atm_dev *dev, unsigned int cmd, void __user *arg)
 }
 
 
-
 static void which_list(ns_dev *card, struct sk_buff *skb)
 {
-   printk("It's a %s buffer.\n", skb->list == &card->sbpool.queue ?
-          "small" : skb->list == &card->lbpool.queue ? "large" :
-	  skb->list == &card->hbpool.queue ? "huge" :
-	  skb->list == &card->iovpool.queue ? "iovec" : "unknown");
+	printk("skb buf_type: 0x%08x\n", NS_SKB_CB(skb)->buf_type);
 }
 
 
-
 static void ns_poll(unsigned long arg)
 {
    int i;
diff --git a/drivers/atm/nicstar.h b/drivers/atm/nicstar.h
index ea83c46c8ba..5997bcb45b5 100644
--- a/drivers/atm/nicstar.h
+++ b/drivers/atm/nicstar.h
@@ -103,8 +103,14 @@
 
 #define NS_IOREMAP_SIZE 4096
 
-#define BUF_SM 0x00000000	/* These two are used for push_rxbufs() */
-#define BUF_LG 0x00000001       /* CMD, Write_FreeBufQ, LBUF bit */
+/*
+ * BUF_XX distinguish the Rx buffers depending on their (small/large) size.
+ * BUG_SM and BUG_LG are both used by the driver and the device.
+ * BUF_NONE is only used by the driver.
+ */
+#define BUF_SM		0x00000000	/* These two are used for push_rxbufs() */
+#define BUF_LG		0x00000001	/* CMD, Write_FreeBufQ, LBUF bit */
+#define BUF_NONE 	0xffffffff	/* Software only: */
 
 #define NS_HBUFSIZE 65568	/* Size of max. AAL5 PDU */
 #define NS_MAX_IOVECS (2 + (65568 - NS_SMBUFSIZE) / \
@@ -684,6 +690,12 @@ enum ns_regs
 /* Device driver structures ***************************************************/
 
 
+struct ns_skb_cb {
+	u32 buf_type;			/* BUF_SM/BUF_LG/BUF_NONE */
+};
+
+#define NS_SKB_CB(skb)	((struct ns_skb_cb *)((skb)->cb))
+
 typedef struct tsq_info
 {
    void *org;
diff --git a/drivers/atm/zatm.c b/drivers/atm/zatm.c
index a2b236a966e..c4b75ecf946 100644
--- a/drivers/atm/zatm.c
+++ b/drivers/atm/zatm.c
@@ -400,7 +400,7 @@ unsigned long *x;
 EVENT("error code 0x%x/0x%x\n",(here[3] & uPD98401_AAL5_ES) >>
   uPD98401_AAL5_ES_SHIFT,error);
 		skb = ((struct rx_buffer_head *) bus_to_virt(here[2]))->skb;
-		do_gettimeofday(&skb->stamp);
+		__net_timestamp(skb);
 #if 0
 printk("[-3..0] 0x%08lx 0x%08lx 0x%08lx 0x%08lx\n",((unsigned *) skb->data)[-3],
   ((unsigned *) skb->data)[-2],((unsigned *) skb->data)[-1],
@@ -417,10 +417,12 @@ printk("dummy: 0x%08lx, 0x%08lx\n",dummy[0],dummy[1]);
 		chan = (here[3] & uPD98401_AAL5_CHAN) >>
 		    uPD98401_AAL5_CHAN_SHIFT;
 		if (chan < zatm_dev->chans && zatm_dev->rx_map[chan]) {
+			int pos = ZATM_VCC(vcc)->pool;
+
 			vcc = zatm_dev->rx_map[chan];
-			if (skb == zatm_dev->last_free[ZATM_VCC(vcc)->pool])
-				zatm_dev->last_free[ZATM_VCC(vcc)->pool] = NULL;
-			skb_unlink(skb);
+			if (skb == zatm_dev->last_free[pos])
+				zatm_dev->last_free[pos] = NULL;
+			skb_unlink(skb, zatm_dev->pool + pos);
 		}
 		else {
 			printk(KERN_ERR DEV_LABEL "(itf %d): RX indication "
diff --git a/drivers/block/aoe/aoenet.c b/drivers/block/aoe/aoenet.c
index 9e6f51c528b..4be976940f6 100644
--- a/drivers/block/aoe/aoenet.c
+++ b/drivers/block/aoe/aoenet.c
@@ -120,7 +120,7 @@ aoenet_xmit(struct sk_buff *sl)
  * (1) len doesn't include the header by default.  I want this. 
  */
 static int
-aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt)
+aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt, struct net_device *orig_dev)
 {
 	struct aoe_hdr *h;
 	u32 n;
diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c
index 46e56a25d2c..e46ecd23b3a 100644
--- a/drivers/block/viodasd.c
+++ b/drivers/block/viodasd.c
@@ -776,7 +776,7 @@ static int viodasd_remove(struct vio_dev *vdev)
  */
 static struct vio_device_id viodasd_device_table[] __devinitdata = {
 	{ "viodasd", "" },
-	{ 0, }
+	{ "", "" }
 };
 
 MODULE_DEVICE_TABLE(vio, viodasd_device_table);
diff --git a/drivers/bluetooth/bfusb.c b/drivers/bluetooth/bfusb.c
index c42d7e6ac1c..1e9db0156ea 100644
--- a/drivers/bluetooth/bfusb.c
+++ b/drivers/bluetooth/bfusb.c
@@ -158,7 +158,7 @@ static int bfusb_send_bulk(struct bfusb *bfusb, struct sk_buff *skb)
 	if (err) {
 		BT_ERR("%s bulk tx submit failed urb %p err %d", 
 					bfusb->hdev->name, urb, err);
-		skb_unlink(skb);
+		skb_unlink(skb, &bfusb->pending_q);
 		usb_free_urb(urb);
 	} else
 		atomic_inc(&bfusb->pending_tx);
@@ -212,7 +212,7 @@ static void bfusb_tx_complete(struct urb *urb, struct pt_regs *regs)
 
 	read_lock(&bfusb->lock);
 
-	skb_unlink(skb);
+	skb_unlink(skb, &bfusb->pending_q);
 	skb_queue_tail(&bfusb->completed_q, skb);
 
 	bfusb_tx_wakeup(bfusb);
@@ -253,7 +253,7 @@ static int bfusb_rx_submit(struct bfusb *bfusb, struct urb *urb)
 	if (err) {
 		BT_ERR("%s bulk rx submit failed urb %p err %d",
 					bfusb->hdev->name, urb, err);
-		skb_unlink(skb);
+		skb_unlink(skb, &bfusb->pending_q);
 		kfree_skb(skb);
 		usb_free_urb(urb);
 	}
@@ -330,7 +330,7 @@ static inline int bfusb_recv_block(struct bfusb *bfusb, int hdr, unsigned char *
 		}
 
 		skb->dev = (void *) bfusb->hdev;
-		skb->pkt_type = pkt_type;
+		bt_cb(skb)->pkt_type = pkt_type;
 
 		bfusb->reassembly = skb;
 	} else {
@@ -398,7 +398,7 @@ static void bfusb_rx_complete(struct urb *urb, struct pt_regs *regs)
 		buf   += len;
 	}
 
-	skb_unlink(skb);
+	skb_unlink(skb, &bfusb->pending_q);
 	kfree_skb(skb);
 
 	bfusb_rx_submit(bfusb, urb);
@@ -485,7 +485,7 @@ static int bfusb_send_frame(struct sk_buff *skb)
 	unsigned char buf[3];
 	int sent = 0, size, count;
 
-	BT_DBG("hdev %p skb %p type %d len %d", hdev, skb, skb->pkt_type, skb->len);
+	BT_DBG("hdev %p skb %p type %d len %d", hdev, skb, bt_cb(skb)->pkt_type, skb->len);
 
 	if (!hdev) {
 		BT_ERR("Frame for unknown HCI device (hdev=NULL)");
@@ -497,7 +497,7 @@ static int bfusb_send_frame(struct sk_buff *skb)
 
 	bfusb = (struct bfusb *) hdev->driver_data;
 
-	switch (skb->pkt_type) {
+	switch (bt_cb(skb)->pkt_type) {
 	case HCI_COMMAND_PKT:
 		hdev->stat.cmd_tx++;
 		break;
@@ -510,7 +510,7 @@ static int bfusb_send_frame(struct sk_buff *skb)
 	};
 
 	/* Prepend skb with frame type */
-	memcpy(skb_push(skb, 1), &(skb->pkt_type), 1);
+	memcpy(skb_push(skb, 1), &bt_cb(skb)->pkt_type, 1);
 
 	count = skb->len;
 
diff --git a/drivers/bluetooth/bluecard_cs.c b/drivers/bluetooth/bluecard_cs.c
index bd2ec7e284c..26fe9c0e1d2 100644
--- a/drivers/bluetooth/bluecard_cs.c
+++ b/drivers/bluetooth/bluecard_cs.c
@@ -270,7 +270,7 @@ static void bluecard_write_wakeup(bluecard_info_t *info)
 		if (!(skb = skb_dequeue(&(info->txq))))
 			break;
 
-		if (skb->pkt_type & 0x80) {
+		if (bt_cb(skb)->pkt_type & 0x80) {
 			/* Disable RTS */
 			info->ctrl_reg |= REG_CONTROL_RTS;
 			outb(info->ctrl_reg, iobase + REG_CONTROL);
@@ -288,13 +288,13 @@ static void bluecard_write_wakeup(bluecard_info_t *info)
 		/* Mark the buffer as dirty */
 		clear_bit(ready_bit, &(info->tx_state));
 
-		if (skb->pkt_type & 0x80) {
+		if (bt_cb(skb)->pkt_type & 0x80) {
 			DECLARE_WAIT_QUEUE_HEAD(wq);
 			DEFINE_WAIT(wait);
 
 			unsigned char baud_reg;
 
-			switch (skb->pkt_type) {
+			switch (bt_cb(skb)->pkt_type) {
 			case PKT_BAUD_RATE_460800:
 				baud_reg = REG_CONTROL_BAUD_RATE_460800;
 				break;
@@ -410,9 +410,9 @@ static void bluecard_receive(bluecard_info_t *info, unsigned int offset)
 		if (info->rx_state == RECV_WAIT_PACKET_TYPE) {
 
 			info->rx_skb->dev = (void *) info->hdev;
-			info->rx_skb->pkt_type = buf[i];
+			bt_cb(info->rx_skb)->pkt_type = buf[i];
 
-			switch (info->rx_skb->pkt_type) {
+			switch (bt_cb(info->rx_skb)->pkt_type) {
 
 			case 0x00:
 				/* init packet */
@@ -444,7 +444,7 @@ static void bluecard_receive(bluecard_info_t *info, unsigned int offset)
 
 			default:
 				/* unknown packet */
-				BT_ERR("Unknown HCI packet with type 0x%02x received", info->rx_skb->pkt_type);
+				BT_ERR("Unknown HCI packet with type 0x%02x received", bt_cb(info->rx_skb)->pkt_type);
 				info->hdev->stat.err_rx++;
 
 				kfree_skb(info->rx_skb);
@@ -586,21 +586,21 @@ static int bluecard_hci_set_baud_rate(struct hci_dev *hdev, int baud)
 	switch (baud) {
 	case 460800:
 		cmd[4] = 0x00;
-		skb->pkt_type = PKT_BAUD_RATE_460800;
+		bt_cb(skb)->pkt_type = PKT_BAUD_RATE_460800;
 		break;
 	case 230400:
 		cmd[4] = 0x01;
-		skb->pkt_type = PKT_BAUD_RATE_230400;
+		bt_cb(skb)->pkt_type = PKT_BAUD_RATE_230400;
 		break;
 	case 115200:
 		cmd[4] = 0x02;
-		skb->pkt_type = PKT_BAUD_RATE_115200;
+		bt_cb(skb)->pkt_type = PKT_BAUD_RATE_115200;
 		break;
 	case 57600:
 		/* Fall through... */
 	default:
 		cmd[4] = 0x03;
-		skb->pkt_type = PKT_BAUD_RATE_57600;
+		bt_cb(skb)->pkt_type = PKT_BAUD_RATE_57600;
 		break;
 	}
 
@@ -680,7 +680,7 @@ static int bluecard_hci_send_frame(struct sk_buff *skb)
 
 	info = (bluecard_info_t *)(hdev->driver_data);
 
-	switch (skb->pkt_type) {
+	switch (bt_cb(skb)->pkt_type) {
 	case HCI_COMMAND_PKT:
 		hdev->stat.cmd_tx++;
 		break;
@@ -693,7 +693,7 @@ static int bluecard_hci_send_frame(struct sk_buff *skb)
 	};
 
 	/* Prepend skb with frame type */
-	memcpy(skb_push(skb, 1), &(skb->pkt_type), 1);
+	memcpy(skb_push(skb, 1), &bt_cb(skb)->pkt_type, 1);
 	skb_queue_tail(&(info->txq), skb);
 
 	bluecard_write_wakeup(info);
diff --git a/drivers/bluetooth/bpa10x.c b/drivers/bluetooth/bpa10x.c
index f696da6f417..a1bf8f066c8 100644
--- a/drivers/bluetooth/bpa10x.c
+++ b/drivers/bluetooth/bpa10x.c
@@ -105,7 +105,7 @@ static void bpa10x_recv_bulk(struct bpa10x_data *data, unsigned char *buf, int c
 			if (skb) {
 				memcpy(skb_put(skb, len), buf, len);
 				skb->dev = (void *) data->hdev;
-				skb->pkt_type = HCI_ACLDATA_PKT;
+				bt_cb(skb)->pkt_type = HCI_ACLDATA_PKT;
 				hci_recv_frame(skb);
 			}
 			break;
@@ -117,7 +117,7 @@ static void bpa10x_recv_bulk(struct bpa10x_data *data, unsigned char *buf, int c
 			if (skb) {
 				memcpy(skb_put(skb, len), buf, len);
 				skb->dev = (void *) data->hdev;
-				skb->pkt_type = HCI_SCODATA_PKT;
+				bt_cb(skb)->pkt_type = HCI_SCODATA_PKT;
 				hci_recv_frame(skb);
 			}
 			break;
@@ -129,7 +129,7 @@ static void bpa10x_recv_bulk(struct bpa10x_data *data, unsigned char *buf, int c
 			if (skb) {
 				memcpy(skb_put(skb, len), buf, len);
 				skb->dev = (void *) data->hdev;
-				skb->pkt_type = HCI_VENDOR_PKT;
+				bt_cb(skb)->pkt_type = HCI_VENDOR_PKT;
 				hci_recv_frame(skb);
 			}
 			break;
@@ -190,7 +190,7 @@ static int bpa10x_recv_event(struct bpa10x_data *data, unsigned char *buf, int s
 		}
 
 		skb->dev = (void *) data->hdev;
-		skb->pkt_type = pkt_type;
+		bt_cb(skb)->pkt_type = pkt_type;
 
 		memcpy(skb_put(skb, size), buf, size);
 
@@ -307,7 +307,8 @@ unlock:
 	read_unlock(&data->lock);
 }
 
-static inline struct urb *bpa10x_alloc_urb(struct usb_device *udev, unsigned int pipe, size_t size, int flags, void *data)
+static inline struct urb *bpa10x_alloc_urb(struct usb_device *udev, unsigned int pipe,
+					size_t size, unsigned int __nocast flags, void *data)
 {
 	struct urb *urb;
 	struct usb_ctrlrequest *cr;
@@ -487,7 +488,7 @@ static int bpa10x_send_frame(struct sk_buff *skb)
 	struct hci_dev *hdev = (struct hci_dev *) skb->dev;
 	struct bpa10x_data *data;
 
-	BT_DBG("hdev %p skb %p type %d len %d", hdev, skb, skb->pkt_type, skb->len);
+	BT_DBG("hdev %p skb %p type %d len %d", hdev, skb, bt_cb(skb)->pkt_type, skb->len);
 
 	if (!hdev) {
 		BT_ERR("Frame for unknown HCI device");
@@ -500,9 +501,9 @@ static int bpa10x_send_frame(struct sk_buff *skb)
 	data = hdev->driver_data;
 
 	/* Prepend skb with frame type */
-	memcpy(skb_push(skb, 1), &(skb->pkt_type), 1);
+	memcpy(skb_push(skb, 1), &bt_cb(skb)->pkt_type, 1);
 
-	switch (skb->pkt_type) {
+	switch (bt_cb(skb)->pkt_type) {
 	case HCI_COMMAND_PKT:
 		hdev->stat.cmd_tx++;
 		skb_queue_tail(&data->cmd_queue, skb);
diff --git a/drivers/bluetooth/bt3c_cs.c b/drivers/bluetooth/bt3c_cs.c
index adf1750ea58..2e0338d80f3 100644
--- a/drivers/bluetooth/bt3c_cs.c
+++ b/drivers/bluetooth/bt3c_cs.c
@@ -259,11 +259,11 @@ static void bt3c_receive(bt3c_info_t *info)
 		if (info->rx_state == RECV_WAIT_PACKET_TYPE) {
 
 			info->rx_skb->dev = (void *) info->hdev;
-			info->rx_skb->pkt_type = inb(iobase + DATA_L);
+			bt_cb(info->rx_skb)->pkt_type = inb(iobase + DATA_L);
 			inb(iobase + DATA_H);
-			//printk("bt3c: PACKET_TYPE=%02x\n", info->rx_skb->pkt_type);
+			//printk("bt3c: PACKET_TYPE=%02x\n", bt_cb(info->rx_skb)->pkt_type);
 
-			switch (info->rx_skb->pkt_type) {
+			switch (bt_cb(info->rx_skb)->pkt_type) {
 
 			case HCI_EVENT_PKT:
 				info->rx_state = RECV_WAIT_EVENT_HEADER;
@@ -282,7 +282,7 @@ static void bt3c_receive(bt3c_info_t *info)
 
 			default:
 				/* Unknown packet */
-				BT_ERR("Unknown HCI packet with type 0x%02x received", info->rx_skb->pkt_type);
+				BT_ERR("Unknown HCI packet with type 0x%02x received", bt_cb(info->rx_skb)->pkt_type);
 				info->hdev->stat.err_rx++;
 				clear_bit(HCI_RUNNING, &(info->hdev->flags));
 
@@ -439,7 +439,7 @@ static int bt3c_hci_send_frame(struct sk_buff *skb)
 
 	info = (bt3c_info_t *) (hdev->driver_data);
 
-	switch (skb->pkt_type) {
+	switch (bt_cb(skb)->pkt_type) {
 	case HCI_COMMAND_PKT:
 		hdev->stat.cmd_tx++;
 		break;
@@ -452,7 +452,7 @@ static int bt3c_hci_send_frame(struct sk_buff *skb)
 	};
 
 	/* Prepend skb with frame type */
-	memcpy(skb_push(skb, 1), &(skb->pkt_type), 1);
+	memcpy(skb_push(skb, 1), &bt_cb(skb)->pkt_type, 1);
 	skb_queue_tail(&(info->txq), skb);
 
 	spin_lock_irqsave(&(info->lock), flags);
diff --git a/drivers/bluetooth/btuart_cs.c b/drivers/bluetooth/btuart_cs.c
index e4c59fdc0e1..89486ea7a02 100644
--- a/drivers/bluetooth/btuart_cs.c
+++ b/drivers/bluetooth/btuart_cs.c
@@ -211,9 +211,9 @@ static void btuart_receive(btuart_info_t *info)
 		if (info->rx_state == RECV_WAIT_PACKET_TYPE) {
 
 			info->rx_skb->dev = (void *) info->hdev;
-			info->rx_skb->pkt_type = inb(iobase + UART_RX);
+			bt_cb(info->rx_skb)->pkt_type = inb(iobase + UART_RX);
 
-			switch (info->rx_skb->pkt_type) {
+			switch (bt_cb(info->rx_skb)->pkt_type) {
 
 			case HCI_EVENT_PKT:
 				info->rx_state = RECV_WAIT_EVENT_HEADER;
@@ -232,7 +232,7 @@ static void btuart_receive(btuart_info_t *info)
 
 			default:
 				/* Unknown packet */
-				BT_ERR("Unknown HCI packet with type 0x%02x received", info->rx_skb->pkt_type);
+				BT_ERR("Unknown HCI packet with type 0x%02x received", bt_cb(info->rx_skb)->pkt_type);
 				info->hdev->stat.err_rx++;
 				clear_bit(HCI_RUNNING, &(info->hdev->flags));
 
@@ -447,7 +447,7 @@ static int btuart_hci_send_frame(struct sk_buff *skb)
 
 	info = (btuart_info_t *)(hdev->driver_data);
 
-	switch (skb->pkt_type) {
+	switch (bt_cb(skb)->pkt_type) {
 	case HCI_COMMAND_PKT:
 		hdev->stat.cmd_tx++;
 		break;
@@ -460,7 +460,7 @@ static int btuart_hci_send_frame(struct sk_buff *skb)
 	};
 
 	/* Prepend skb with frame type */
-	memcpy(skb_push(skb, 1), &(skb->pkt_type), 1);
+	memcpy(skb_push(skb, 1), &bt_cb(skb)->pkt_type, 1);
 	skb_queue_tail(&(info->txq), skb);
 
 	btuart_write_wakeup(info);
diff --git a/drivers/bluetooth/dtl1_cs.c b/drivers/bluetooth/dtl1_cs.c
index e39868c3da4..84c1f883942 100644
--- a/drivers/bluetooth/dtl1_cs.c
+++ b/drivers/bluetooth/dtl1_cs.c
@@ -251,7 +251,7 @@ static void dtl1_receive(dtl1_info_t *info)
 				info->rx_count = nsh->len + (nsh->len & 0x0001);
 				break;
 			case RECV_WAIT_DATA:
-				info->rx_skb->pkt_type = nsh->type;
+				bt_cb(info->rx_skb)->pkt_type = nsh->type;
 
 				/* remove PAD byte if it exists */
 				if (nsh->len & 0x0001) {
@@ -262,7 +262,7 @@ static void dtl1_receive(dtl1_info_t *info)
 				/* remove NSH */
 				skb_pull(info->rx_skb, NSHL);
 
-				switch (info->rx_skb->pkt_type) {
+				switch (bt_cb(info->rx_skb)->pkt_type) {
 				case 0x80:
 					/* control data for the Nokia Card */
 					dtl1_control(info, info->rx_skb);
@@ -272,12 +272,12 @@ static void dtl1_receive(dtl1_info_t *info)
 				case 0x84:
 					/* send frame to the HCI layer */
 					info->rx_skb->dev = (void *) info->hdev;
-					info->rx_skb->pkt_type &= 0x0f;
+					bt_cb(info->rx_skb)->pkt_type &= 0x0f;
 					hci_recv_frame(info->rx_skb);
 					break;
 				default:
 					/* unknown packet */
-					BT_ERR("Unknown HCI packet with type 0x%02x received", info->rx_skb->pkt_type);
+					BT_ERR("Unknown HCI packet with type 0x%02x received", bt_cb(info->rx_skb)->pkt_type);
 					kfree_skb(info->rx_skb);
 					break;
 				}
@@ -410,7 +410,7 @@ static int dtl1_hci_send_frame(struct sk_buff *skb)
 
 	info = (dtl1_info_t *)(hdev->driver_data);
 
-	switch (skb->pkt_type) {
+	switch (bt_cb(skb)->pkt_type) {
 	case HCI_COMMAND_PKT:
 		hdev->stat.cmd_tx++;
 		nsh.type = 0x81;
diff --git a/drivers/bluetooth/hci_bcsp.c b/drivers/bluetooth/hci_bcsp.c
index 858fddb046d..0ee324e1265 100644
--- a/drivers/bluetooth/hci_bcsp.c
+++ b/drivers/bluetooth/hci_bcsp.c
@@ -149,7 +149,7 @@ static int bcsp_enqueue(struct hci_uart *hu, struct sk_buff *skb)
 		return 0;
 	}
 
-	switch (skb->pkt_type) {
+	switch (bt_cb(skb)->pkt_type) {
 	case HCI_ACLDATA_PKT:
 	case HCI_COMMAND_PKT:
 		skb_queue_tail(&bcsp->rel, skb);
@@ -227,7 +227,7 @@ static struct sk_buff *bcsp_prepare_pkt(struct bcsp_struct *bcsp, u8 *data,
 	if (!nskb)
 		return NULL;
 
-	nskb->pkt_type = pkt_type;
+	bt_cb(nskb)->pkt_type = pkt_type;
 
 	bcsp_slip_msgdelim(nskb);
 
@@ -286,7 +286,7 @@ static struct sk_buff *bcsp_dequeue(struct hci_uart *hu)
 	   since they have priority */
 
 	if ((skb = skb_dequeue(&bcsp->unrel)) != NULL) {
-		struct sk_buff *nskb = bcsp_prepare_pkt(bcsp, skb->data, skb->len, skb->pkt_type);
+		struct sk_buff *nskb = bcsp_prepare_pkt(bcsp, skb->data, skb->len, bt_cb(skb)->pkt_type);
 		if (nskb) {
 			kfree_skb(skb);
 			return nskb;
@@ -303,7 +303,7 @@ static struct sk_buff *bcsp_dequeue(struct hci_uart *hu)
 	spin_lock_irqsave(&bcsp->unack.lock, flags);
 
 	if (bcsp->unack.qlen < BCSP_TXWINSIZE && (skb = skb_dequeue(&bcsp->rel)) != NULL) {
-		struct sk_buff *nskb = bcsp_prepare_pkt(bcsp, skb->data, skb->len, skb->pkt_type);
+		struct sk_buff *nskb = bcsp_prepare_pkt(bcsp, skb->data, skb->len, bt_cb(skb)->pkt_type);
 		if (nskb) {
 			__skb_queue_tail(&bcsp->unack, skb);
 			mod_timer(&bcsp->tbcsp, jiffies + HZ / 4);
@@ -401,7 +401,7 @@ static void bcsp_handle_le_pkt(struct hci_uart *hu)
 		if (!nskb)
 			return;
 		memcpy(skb_put(nskb, 4), conf_rsp_pkt, 4);
-		nskb->pkt_type = BCSP_LE_PKT;
+		bt_cb(nskb)->pkt_type = BCSP_LE_PKT;
 
 		skb_queue_head(&bcsp->unrel, nskb);
 		hci_uart_tx_wakeup(hu);
@@ -483,14 +483,14 @@ static inline void bcsp_complete_rx_pkt(struct hci_uart *hu)
 	bcsp_pkt_cull(bcsp);
 	if ((bcsp->rx_skb->data[1] & 0x0f) == 6 &&
 			bcsp->rx_skb->data[0] & 0x80) {
-		bcsp->rx_skb->pkt_type = HCI_ACLDATA_PKT;
+		bt_cb(bcsp->rx_skb)->pkt_type = HCI_ACLDATA_PKT;
 		pass_up = 1;
 	} else if ((bcsp->rx_skb->data[1] & 0x0f) == 5 &&
 			bcsp->rx_skb->data[0] & 0x80) {
-		bcsp->rx_skb->pkt_type = HCI_EVENT_PKT;
+		bt_cb(bcsp->rx_skb)->pkt_type = HCI_EVENT_PKT;
 		pass_up = 1;
 	} else if ((bcsp->rx_skb->data[1] & 0x0f) == 7) {
-		bcsp->rx_skb->pkt_type = HCI_SCODATA_PKT;
+		bt_cb(bcsp->rx_skb)->pkt_type = HCI_SCODATA_PKT;
 		pass_up = 1;
 	} else if ((bcsp->rx_skb->data[1] & 0x0f) == 1 &&
 			!(bcsp->rx_skb->data[0] & 0x80)) {
@@ -512,7 +512,7 @@ static inline void bcsp_complete_rx_pkt(struct hci_uart *hu)
 				hdr.evt = 0xff;
 				hdr.plen = bcsp->rx_skb->len;
 				memcpy(skb_push(bcsp->rx_skb, HCI_EVENT_HDR_SIZE), &hdr, HCI_EVENT_HDR_SIZE);
-				bcsp->rx_skb->pkt_type = HCI_EVENT_PKT;
+				bt_cb(bcsp->rx_skb)->pkt_type = HCI_EVENT_PKT;
 
 				hci_recv_frame(bcsp->rx_skb);
 			} else {
diff --git a/drivers/bluetooth/hci_h4.c b/drivers/bluetooth/hci_h4.c
index 533323b60e6..cf8a22d58d9 100644
--- a/drivers/bluetooth/hci_h4.c
+++ b/drivers/bluetooth/hci_h4.c
@@ -112,7 +112,7 @@ static int h4_enqueue(struct hci_uart *hu, struct sk_buff *skb)
 	BT_DBG("hu %p skb %p", hu, skb);
 
 	/* Prepend skb with frame type */
-	memcpy(skb_push(skb, 1), &skb->pkt_type, 1);
+	memcpy(skb_push(skb, 1), &bt_cb(skb)->pkt_type, 1);
 	skb_queue_tail(&h4->txq, skb);
 	return 0;
 }
@@ -239,7 +239,7 @@ static int h4_recv(struct hci_uart *hu, void *data, int count)
 			return 0;
 		}
 		h4->rx_skb->dev = (void *) hu->hdev;
-		h4->rx_skb->pkt_type = type;
+		bt_cb(h4->rx_skb)->pkt_type = type;
 	}
 	return count;
 }
diff --git a/drivers/bluetooth/hci_ldisc.c b/drivers/bluetooth/hci_ldisc.c
index 90be2eae52e..aed80cc2289 100644
--- a/drivers/bluetooth/hci_ldisc.c
+++ b/drivers/bluetooth/hci_ldisc.c
@@ -153,7 +153,7 @@ restart:
 			break;
 		}
 	
-		hci_uart_tx_complete(hu, skb->pkt_type);
+		hci_uart_tx_complete(hu, bt_cb(skb)->pkt_type);
 		kfree_skb(skb);
 	} 
 	
@@ -229,7 +229,7 @@ static int hci_uart_send_frame(struct sk_buff *skb)
 	hu = (struct hci_uart *) hdev->driver_data;
 	tty = hu->tty;
 
-	BT_DBG("%s: type %d len %d", hdev->name, skb->pkt_type, skb->len);
+	BT_DBG("%s: type %d len %d", hdev->name, bt_cb(skb)->pkt_type, skb->len);
 
 	hu->proto->enqueue(hu, skb);
 
diff --git a/drivers/bluetooth/hci_usb.c b/drivers/bluetooth/hci_usb.c
index 657719b8254..67d96b5cbb9 100644
--- a/drivers/bluetooth/hci_usb.c
+++ b/drivers/bluetooth/hci_usb.c
@@ -127,7 +127,7 @@ static struct usb_device_id blacklist_ids[] = {
 	{ }	/* Terminating entry */
 };
 
-static struct _urb *_urb_alloc(int isoc, int gfp)
+static struct _urb *_urb_alloc(int isoc, unsigned int __nocast gfp)
 {
 	struct _urb *_urb = kmalloc(sizeof(struct _urb) +
 				sizeof(struct usb_iso_packet_descriptor) * isoc, gfp);
@@ -443,7 +443,7 @@ static int __tx_submit(struct hci_usb *husb, struct _urb *_urb)
 
 static inline int hci_usb_send_ctrl(struct hci_usb *husb, struct sk_buff *skb)
 {
-	struct _urb *_urb = __get_completed(husb, skb->pkt_type);
+	struct _urb *_urb = __get_completed(husb, bt_cb(skb)->pkt_type);
 	struct usb_ctrlrequest *dr;
 	struct urb *urb;
 
@@ -451,7 +451,7 @@ static inline int hci_usb_send_ctrl(struct hci_usb *husb, struct sk_buff *skb)
 		_urb = _urb_alloc(0, GFP_ATOMIC);
 		if (!_urb)
 			return -ENOMEM;
-		_urb->type = skb->pkt_type;
+		_urb->type = bt_cb(skb)->pkt_type;
 
 		dr = kmalloc(sizeof(*dr), GFP_ATOMIC);
 		if (!dr) {
@@ -479,7 +479,7 @@ static inline int hci_usb_send_ctrl(struct hci_usb *husb, struct sk_buff *skb)
 
 static inline int hci_usb_send_bulk(struct hci_usb *husb, struct sk_buff *skb)
 {
-	struct _urb *_urb = __get_completed(husb, skb->pkt_type);
+	struct _urb *_urb = __get_completed(husb, bt_cb(skb)->pkt_type);
 	struct urb *urb;
 	int pipe;
 
@@ -487,7 +487,7 @@ static inline int hci_usb_send_bulk(struct hci_usb *husb, struct sk_buff *skb)
 		_urb = _urb_alloc(0, GFP_ATOMIC);
 		if (!_urb)
 			return -ENOMEM;
-		_urb->type = skb->pkt_type;
+		_urb->type = bt_cb(skb)->pkt_type;
 	}
 
 	urb  = &_urb->urb;
@@ -505,14 +505,14 @@ static inline int hci_usb_send_bulk(struct hci_usb *husb, struct sk_buff *skb)
 #ifdef CONFIG_BT_HCIUSB_SCO
 static inline int hci_usb_send_isoc(struct hci_usb *husb, struct sk_buff *skb)
 {
-	struct _urb *_urb = __get_completed(husb, skb->pkt_type);
+	struct _urb *_urb = __get_completed(husb, bt_cb(skb)->pkt_type);
 	struct urb *urb;
 
 	if (!_urb) {
 		_urb = _urb_alloc(HCI_MAX_ISOC_FRAMES, GFP_ATOMIC);
 		if (!_urb)
 			return -ENOMEM;
-		_urb->type = skb->pkt_type;
+		_urb->type = bt_cb(skb)->pkt_type;
 	}
 
 	BT_DBG("%s skb %p len %d", husb->hdev->name, skb, skb->len);
@@ -601,11 +601,11 @@ static int hci_usb_send_frame(struct sk_buff *skb)
 	if (!test_bit(HCI_RUNNING, &hdev->flags))
 		return -EBUSY;
 
-	BT_DBG("%s type %d len %d", hdev->name, skb->pkt_type, skb->len);
+	BT_DBG("%s type %d len %d", hdev->name, bt_cb(skb)->pkt_type, skb->len);
 
 	husb = (struct hci_usb *) hdev->driver_data;
 
-	switch (skb->pkt_type) {
+	switch (bt_cb(skb)->pkt_type) {
 	case HCI_COMMAND_PKT:
 		hdev->stat.cmd_tx++;
 		break;
@@ -627,7 +627,7 @@ static int hci_usb_send_frame(struct sk_buff *skb)
 
 	read_lock(&husb->completion_lock);
 
-	skb_queue_tail(__transmit_q(husb, skb->pkt_type), skb);
+	skb_queue_tail(__transmit_q(husb, bt_cb(skb)->pkt_type), skb);
 	hci_usb_tx_wakeup(husb);
 
 	read_unlock(&husb->completion_lock);
@@ -682,7 +682,7 @@ static inline int __recv_frame(struct hci_usb *husb, int type, void *data, int c
 				return -ENOMEM;
 			}
 			skb->dev = (void *) husb->hdev;
-			skb->pkt_type = type;
+			bt_cb(skb)->pkt_type = type;
 	
 			__reassembly(husb, type) = skb;
 
@@ -702,6 +702,7 @@ static inline int __recv_frame(struct hci_usb *husb, int type, void *data, int c
 		if (!scb->expect) {
 			/* Complete frame */
 			__reassembly(husb, type) = NULL;
+			bt_cb(skb)->pkt_type = type;
 			hci_recv_frame(skb);
 		}
 
diff --git a/drivers/bluetooth/hci_vhci.c b/drivers/bluetooth/hci_vhci.c
index f9b956fb2b8..52cbd45c308 100644
--- a/drivers/bluetooth/hci_vhci.c
+++ b/drivers/bluetooth/hci_vhci.c
@@ -1,229 +1,220 @@
-/* 
-   BlueZ - Bluetooth protocol stack for Linux
-   Copyright (C) 2000-2001 Qualcomm Incorporated
-
-   Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
-
-   This program is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License version 2 as
-   published by the Free Software Foundation;
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-   OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
-   IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
-   CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES 
-   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 
-   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 
-   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
-
-   ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, 
-   COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS 
-   SOFTWARE IS DISCLAIMED.
-*/
-
 /*
- * Bluetooth HCI virtual device driver.
  *
- * $Id: hci_vhci.c,v 1.3 2002/04/17 17:37:20 maxk Exp $ 
+ *  Bluetooth virtual HCI driver
+ *
+ *  Copyright (C) 2000-2001 Qualcomm Incorporated
+ *  Copyright (C) 2002-2003 Maxim Krasnyansky <maxk@qualcomm.com>
+ *  Copyright (C) 2004-2005 Marcel Holtmann <marcel@holtmann.org>
+ *
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
  */
-#define VERSION "1.1"
 
 #include <linux/config.h>
 #include <linux/module.h>
 
-#include <linux/errno.h>
 #include <linux/kernel.h>
-#include <linux/major.h>
-#include <linux/sched.h>
+#include <linux/init.h>
 #include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
 #include <linux/poll.h>
-#include <linux/fcntl.h>
-#include <linux/init.h>
-#include <linux/random.h>
 
 #include <linux/skbuff.h>
 #include <linux/miscdevice.h>
 
-#include <asm/system.h>
-#include <asm/uaccess.h>
-
 #include <net/bluetooth/bluetooth.h>
 #include <net/bluetooth/hci_core.h>
-#include "hci_vhci.h"
 
-/* HCI device part */
+#ifndef CONFIG_BT_HCIVHCI_DEBUG
+#undef  BT_DBG
+#define BT_DBG(D...)
+#endif
+
+#define VERSION "1.2"
+
+static int minor = MISC_DYNAMIC_MINOR;
+
+struct vhci_data {
+	struct hci_dev *hdev;
+
+	unsigned long flags;
+
+	wait_queue_head_t read_wait;
+	struct sk_buff_head readq;
+
+	struct fasync_struct *fasync;
+};
 
-static int hci_vhci_open(struct hci_dev *hdev)
+#define VHCI_FASYNC	0x0010
+
+static struct miscdevice vhci_miscdev;
+
+static int vhci_open_dev(struct hci_dev *hdev)
 {
 	set_bit(HCI_RUNNING, &hdev->flags);
-	return 0;
-}
 
-static int hci_vhci_flush(struct hci_dev *hdev)
-{
-	struct hci_vhci_struct *hci_vhci = (struct hci_vhci_struct *) hdev->driver_data;
-	skb_queue_purge(&hci_vhci->readq);
 	return 0;
 }
 
-static int hci_vhci_close(struct hci_dev *hdev)
+static int vhci_close_dev(struct hci_dev *hdev)
 {
+	struct vhci_data *vhci = hdev->driver_data;
+
 	if (!test_and_clear_bit(HCI_RUNNING, &hdev->flags))
 		return 0;
 
-	hci_vhci_flush(hdev);
+	skb_queue_purge(&vhci->readq);
+
 	return 0;
 }
 
-static void hci_vhci_destruct(struct hci_dev *hdev)
+static int vhci_flush(struct hci_dev *hdev)
 {
-	struct hci_vhci_struct *vhci;
+	struct vhci_data *vhci = hdev->driver_data;
 
-	if (!hdev) return;
+	skb_queue_purge(&vhci->readq);
 
-	vhci = (struct hci_vhci_struct *) hdev->driver_data;
-	kfree(vhci);
+	return 0;
 }
 
-static int hci_vhci_send_frame(struct sk_buff *skb)
+static int vhci_send_frame(struct sk_buff *skb)
 {
 	struct hci_dev* hdev = (struct hci_dev *) skb->dev;
-	struct hci_vhci_struct *hci_vhci;
+	struct vhci_data *vhci;
 
 	if (!hdev) {
-		BT_ERR("Frame for uknown device (hdev=NULL)");
+		BT_ERR("Frame for unknown HCI device (hdev=NULL)");
 		return -ENODEV;
 	}
 
 	if (!test_bit(HCI_RUNNING, &hdev->flags))
 		return -EBUSY;
 
-	hci_vhci = (struct hci_vhci_struct *) hdev->driver_data;
+	vhci = hdev->driver_data;
+
+	memcpy(skb_push(skb, 1), &bt_cb(skb)->pkt_type, 1);
+	skb_queue_tail(&vhci->readq, skb);
 
-	memcpy(skb_push(skb, 1), &skb->pkt_type, 1);
-	skb_queue_tail(&hci_vhci->readq, skb);
+	if (vhci->flags & VHCI_FASYNC)
+		kill_fasync(&vhci->fasync, SIGIO, POLL_IN);
 
-	if (hci_vhci->flags & VHCI_FASYNC)
-		kill_fasync(&hci_vhci->fasync, SIGIO, POLL_IN);
-	wake_up_interruptible(&hci_vhci->read_wait);
+	wake_up_interruptible(&vhci->read_wait);
 
 	return 0;
 }
 
-/* Character device part */
-
-/* Poll */
-static unsigned int hci_vhci_chr_poll(struct file *file, poll_table * wait)
-{  
-	struct hci_vhci_struct *hci_vhci = (struct hci_vhci_struct *) file->private_data;
-
-	poll_wait(file, &hci_vhci->read_wait, wait);
- 
-	if (!skb_queue_empty(&hci_vhci->readq))
-		return POLLIN | POLLRDNORM;
-
-	return POLLOUT | POLLWRNORM;
+static void vhci_destruct(struct hci_dev *hdev)
+{
+	kfree(hdev->driver_data);
 }
 
-/* Get packet from user space buffer(already verified) */
-static inline ssize_t hci_vhci_get_user(struct hci_vhci_struct *hci_vhci, const char __user *buf, size_t count)
+static inline ssize_t vhci_get_user(struct vhci_data *vhci,
+					const char __user *buf, size_t count)
 {
 	struct sk_buff *skb;
 
 	if (count > HCI_MAX_FRAME_SIZE)
 		return -EINVAL;
 
-	if (!(skb = bt_skb_alloc(count, GFP_KERNEL)))
+	skb = bt_skb_alloc(count, GFP_KERNEL);
+	if (!skb)
 		return -ENOMEM;
-	
+
 	if (copy_from_user(skb_put(skb, count), buf, count)) {
 		kfree_skb(skb);
 		return -EFAULT;
 	}
 
-	skb->dev = (void *) hci_vhci->hdev;
-	skb->pkt_type = *((__u8 *) skb->data);
+	skb->dev = (void *) vhci->hdev;
+	bt_cb(skb)->pkt_type = *((__u8 *) skb->data);
 	skb_pull(skb, 1);
 
 	hci_recv_frame(skb);
 
 	return count;
-} 
-
-/* Write */
-static ssize_t hci_vhci_chr_write(struct file * file, const char __user * buf, 
-			     size_t count, loff_t *pos)
-{
-	struct hci_vhci_struct *hci_vhci = (struct hci_vhci_struct *) file->private_data;
-
-	if (!access_ok(VERIFY_READ, buf, count))
-		return -EFAULT;
-
-	return hci_vhci_get_user(hci_vhci, buf, count);
 }
 
-/* Put packet to user space buffer(already verified) */
-static inline ssize_t hci_vhci_put_user(struct hci_vhci_struct *hci_vhci,
-				       struct sk_buff *skb, char __user *buf,
-				       int count)
+static inline ssize_t vhci_put_user(struct vhci_data *vhci,
+			struct sk_buff *skb, char __user *buf, int count)
 {
-	int len = count, total = 0;
 	char __user *ptr = buf;
+	int len, total = 0;
+
+	len = min_t(unsigned int, skb->len, count);
 
-	len = min_t(unsigned int, skb->len, len);
 	if (copy_to_user(ptr, skb->data, len))
 		return -EFAULT;
+
 	total += len;
 
-	hci_vhci->hdev->stat.byte_tx += len;
-	switch (skb->pkt_type) {
-		case HCI_COMMAND_PKT:
-			hci_vhci->hdev->stat.cmd_tx++;
-			break;
+	vhci->hdev->stat.byte_tx += len;
 
-		case HCI_ACLDATA_PKT:
-			hci_vhci->hdev->stat.acl_tx++;
-			break;
+	switch (bt_cb(skb)->pkt_type) {
+	case HCI_COMMAND_PKT:
+		vhci->hdev->stat.cmd_tx++;
+		break;
+
+	case HCI_ACLDATA_PKT:
+		vhci->hdev->stat.acl_tx++;
+		break;
 
-		case HCI_SCODATA_PKT:
-			hci_vhci->hdev->stat.cmd_tx++;
-			break;
+	case HCI_SCODATA_PKT:
+		vhci->hdev->stat.cmd_tx++;
+		break;
 	};
 
 	return total;
 }
 
-/* Read */
-static ssize_t hci_vhci_chr_read(struct file * file, char __user * buf, size_t count, loff_t *pos)
+static loff_t vhci_llseek(struct file * file, loff_t offset, int origin)
+{
+	return -ESPIPE;
+}
+
+static ssize_t vhci_read(struct file * file, char __user * buf, size_t count, loff_t *pos)
 {
-	struct hci_vhci_struct *hci_vhci = (struct hci_vhci_struct *) file->private_data;
 	DECLARE_WAITQUEUE(wait, current);
+	struct vhci_data *vhci = file->private_data;
 	struct sk_buff *skb;
 	ssize_t ret = 0;
 
-	add_wait_queue(&hci_vhci->read_wait, &wait);
+	add_wait_queue(&vhci->read_wait, &wait);
 	while (count) {
 		set_current_state(TASK_INTERRUPTIBLE);
 
-		/* Read frames from device queue */
-		if (!(skb = skb_dequeue(&hci_vhci->readq))) {
+		skb = skb_dequeue(&vhci->readq);
+		if (!skb) {
 			if (file->f_flags & O_NONBLOCK) {
 				ret = -EAGAIN;
 				break;
 			}
+
 			if (signal_pending(current)) {
 				ret = -ERESTARTSYS;
 				break;
 			}
 
-			/* Nothing to read, let's sleep */
 			schedule();
 			continue;
 		}
 
 		if (access_ok(VERIFY_WRITE, buf, count))
-			ret = hci_vhci_put_user(hci_vhci, skb, buf, count);
+			ret = vhci_put_user(vhci, skb, buf, count);
 		else
 			ret = -EFAULT;
 
@@ -231,84 +222,90 @@ static ssize_t hci_vhci_chr_read(struct file * file, char __user * buf, size_t c
 		break;
 	}
 	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&hci_vhci->read_wait, &wait);
+	remove_wait_queue(&vhci->read_wait, &wait);
 
 	return ret;
 }
 
-static loff_t hci_vhci_chr_lseek(struct file * file, loff_t offset, int origin)
+static ssize_t vhci_write(struct file *file,
+			const char __user *buf, size_t count, loff_t *pos)
 {
-	return -ESPIPE;
-}
+	struct vhci_data *vhci = file->private_data;
 
-static int hci_vhci_chr_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg)
-{
-	return -EINVAL;
+	if (!access_ok(VERIFY_READ, buf, count))
+		return -EFAULT;
+
+	return vhci_get_user(vhci, buf, count);
 }
 
-static int hci_vhci_chr_fasync(int fd, struct file *file, int on)
+static unsigned int vhci_poll(struct file *file, poll_table *wait)
 {
-	struct hci_vhci_struct *hci_vhci = (struct hci_vhci_struct *) file->private_data;
-	int ret;
+	struct vhci_data *vhci = file->private_data;
 
-	if ((ret = fasync_helper(fd, file, on, &hci_vhci->fasync)) < 0)
-		return ret; 
- 
-	if (on)
-		hci_vhci->flags |= VHCI_FASYNC;
-	else 
-		hci_vhci->flags &= ~VHCI_FASYNC;
+	poll_wait(file, &vhci->read_wait, wait);
 
-	return 0;
+	if (!skb_queue_empty(&vhci->readq))
+		return POLLIN | POLLRDNORM;
+
+	return POLLOUT | POLLWRNORM;
 }
 
-static int hci_vhci_chr_open(struct inode *inode, struct file * file)
+static int vhci_ioctl(struct inode *inode, struct file *file,
+					unsigned int cmd, unsigned long arg)
 {
-	struct hci_vhci_struct *hci_vhci = NULL; 
+	return -EINVAL;
+}
+
+static int vhci_open(struct inode *inode, struct file *file)
+{
+	struct vhci_data *vhci;
 	struct hci_dev *hdev;
 
-	if (!(hci_vhci = kmalloc(sizeof(struct hci_vhci_struct), GFP_KERNEL)))
+	vhci = kmalloc(sizeof(struct vhci_data), GFP_KERNEL);
+	if (!vhci)
 		return -ENOMEM;
 
-	memset(hci_vhci, 0, sizeof(struct hci_vhci_struct));
+	memset(vhci, 0, sizeof(struct vhci_data));
 
-	skb_queue_head_init(&hci_vhci->readq);
-	init_waitqueue_head(&hci_vhci->read_wait);
+	skb_queue_head_init(&vhci->readq);
+	init_waitqueue_head(&vhci->read_wait);
 
-	/* Initialize and register HCI device */
 	hdev = hci_alloc_dev();
 	if (!hdev) {
-		kfree(hci_vhci);
+		kfree(vhci);
 		return -ENOMEM;
 	}
 
-	hci_vhci->hdev = hdev;
+	vhci->hdev = hdev;
 
 	hdev->type = HCI_VHCI;
-	hdev->driver_data = hci_vhci;
+	hdev->driver_data = vhci;
+	SET_HCIDEV_DEV(hdev, vhci_miscdev.dev);
 
-	hdev->open  = hci_vhci_open;
-	hdev->close = hci_vhci_close;
-	hdev->flush = hci_vhci_flush;
-	hdev->send  = hci_vhci_send_frame;
-	hdev->destruct = hci_vhci_destruct;
+	hdev->open     = vhci_open_dev;
+	hdev->close    = vhci_close_dev;
+	hdev->flush    = vhci_flush;
+	hdev->send     = vhci_send_frame;
+	hdev->destruct = vhci_destruct;
 
 	hdev->owner = THIS_MODULE;
-	
+
 	if (hci_register_dev(hdev) < 0) {
-		kfree(hci_vhci);
+		BT_ERR("Can't register HCI device");
+		kfree(vhci);
 		hci_free_dev(hdev);
 		return -EBUSY;
 	}
 
-	file->private_data = hci_vhci;
-	return nonseekable_open(inode, file);   
+	file->private_data = vhci;
+
+	return nonseekable_open(inode, file);
 }
 
-static int hci_vhci_chr_close(struct inode *inode, struct file *file)
+static int vhci_release(struct inode *inode, struct file *file)
 {
-	struct hci_vhci_struct *hci_vhci = (struct hci_vhci_struct *) file->private_data;
-	struct hci_dev *hdev = hci_vhci->hdev;
+	struct vhci_data *vhci = file->private_data;
+	struct hci_dev *hdev = vhci->hdev;
 
 	if (hci_unregister_dev(hdev) < 0) {
 		BT_ERR("Can't unregister HCI device %s", hdev->name);
@@ -317,48 +314,71 @@ static int hci_vhci_chr_close(struct inode *inode, struct file *file)
 	hci_free_dev(hdev);
 
 	file->private_data = NULL;
+
 	return 0;
 }
 
-static struct file_operations hci_vhci_fops = {
-	.owner	= THIS_MODULE,	
-	.llseek	= hci_vhci_chr_lseek,
-	.read	= hci_vhci_chr_read,
-	.write	= hci_vhci_chr_write,
-	.poll	= hci_vhci_chr_poll,
-	.ioctl	= hci_vhci_chr_ioctl,
-	.open	= hci_vhci_chr_open,
-	.release	= hci_vhci_chr_close,
-	.fasync	= hci_vhci_chr_fasync		
+static int vhci_fasync(int fd, struct file *file, int on)
+{
+	struct vhci_data *vhci = file->private_data;
+	int err;
+
+	err = fasync_helper(fd, file, on, &vhci->fasync);
+	if (err < 0)
+		return err;
+
+	if (on)
+		vhci->flags |= VHCI_FASYNC;
+	else
+		vhci->flags &= ~VHCI_FASYNC;
+
+	return 0;
+}
+
+static struct file_operations vhci_fops = {
+	.owner		= THIS_MODULE,
+	.llseek		= vhci_llseek,
+	.read		= vhci_read,
+	.write		= vhci_write,
+	.poll		= vhci_poll,
+	.ioctl		= vhci_ioctl,
+	.open		= vhci_open,
+	.release	= vhci_release,
+	.fasync		= vhci_fasync,
 };
 
-static struct miscdevice hci_vhci_miscdev=
-{
-        VHCI_MINOR,
-        "hci_vhci",
-        &hci_vhci_fops
+static struct miscdevice vhci_miscdev= {
+	.name		= "vhci",
+	.fops		= &vhci_fops,
 };
 
-static int __init hci_vhci_init(void)
+static int __init vhci_init(void)
 {
-	BT_INFO("VHCI driver ver %s", VERSION);
+	BT_INFO("Virtual HCI driver ver %s", VERSION);
 
-	if (misc_register(&hci_vhci_miscdev)) {
-		BT_ERR("Can't register misc device %d\n", VHCI_MINOR);
+	vhci_miscdev.minor = minor;
+
+	if (misc_register(&vhci_miscdev) < 0) {
+		BT_ERR("Can't register misc device with minor %d", minor);
 		return -EIO;
 	}
 
 	return 0;
 }
 
-static void hci_vhci_cleanup(void)
+static void __exit vhci_exit(void)
 {
-	misc_deregister(&hci_vhci_miscdev);
+	if (misc_deregister(&vhci_miscdev) < 0)
+		BT_ERR("Can't unregister misc device with minor %d", minor);
 }
 
-module_init(hci_vhci_init);
-module_exit(hci_vhci_cleanup);
+module_init(vhci_init);
+module_exit(vhci_exit);
+
+module_param(minor, int, 0444);
+MODULE_PARM_DESC(minor, "Miscellaneous minor device number");
 
-MODULE_AUTHOR("Maxim Krasnyansky <maxk@qualcomm.com>");
-MODULE_DESCRIPTION("Bluetooth VHCI driver ver " VERSION);
-MODULE_LICENSE("GPL"); 
+MODULE_AUTHOR("Maxim Krasnyansky <maxk@qualcomm.com>, Marcel Holtmann <marcel@holtmann.org>");
+MODULE_DESCRIPTION("Bluetooth virtual HCI driver ver " VERSION);
+MODULE_VERSION(VERSION);
+MODULE_LICENSE("GPL");
diff --git a/drivers/bluetooth/hci_vhci.h b/drivers/bluetooth/hci_vhci.h
deleted file mode 100644
index 53b11f9ef76..00000000000
--- a/drivers/bluetooth/hci_vhci.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* 
-   BlueZ - Bluetooth protocol stack for Linux
-   Copyright (C) 2000-2001 Qualcomm Incorporated
-
-   Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
-
-   This program is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License version 2 as
-   published by the Free Software Foundation;
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-   OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
-   IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
-   CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES 
-   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 
-   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 
-   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
-
-   ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, 
-   COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS 
-   SOFTWARE IS DISCLAIMED.
-*/
-
-/*
- * $Id: hci_vhci.h,v 1.1.1.1 2002/03/08 21:03:15 maxk Exp $
- */
-
-#ifndef __HCI_VHCI_H
-#define __HCI_VHCI_H
-
-#ifdef __KERNEL__
-
-struct hci_vhci_struct {
-	struct hci_dev       *hdev;
-	__u32                flags;
-	wait_queue_head_t    read_wait;
-	struct sk_buff_head  readq;
-	struct fasync_struct *fasync;
-};
-
-/* VHCI device flags */
-#define VHCI_FASYNC		0x0010
-
-#endif /* __KERNEL__ */
-
-#define VHCI_DEV	"/dev/vhci"
-#define VHCI_MINOR	250
-
-#endif /* __HCI_VHCI_H */
diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c
index 38dd9ffbe8b..0829db58462 100644
--- a/drivers/cdrom/viocd.c
+++ b/drivers/cdrom/viocd.c
@@ -734,7 +734,7 @@ static int viocd_remove(struct vio_dev *vdev)
  */
 static struct vio_device_id viocd_device_table[] __devinitdata = {
 	{ "viocd", "" },
-	{ 0, }
+	{ "", "" }
 };
 
 MODULE_DEVICE_TABLE(vio, viocd_device_table);
diff --git a/drivers/char/hvc_vio.c b/drivers/char/hvc_vio.c
index 60bb9152b83..78d681dc35a 100644
--- a/drivers/char/hvc_vio.c
+++ b/drivers/char/hvc_vio.c
@@ -39,7 +39,7 @@ char hvc_driver_name[] = "hvc_console";
 
 static struct vio_device_id hvc_driver_table[] __devinitdata = {
 	{"serial", "hvterm1"},
-	{ NULL, }
+	{ "", "" }
 };
 MODULE_DEVICE_TABLE(vio, hvc_driver_table);
 
diff --git a/drivers/char/hvcs.c b/drivers/char/hvcs.c
index 3236d240490..f47f009f925 100644
--- a/drivers/char/hvcs.c
+++ b/drivers/char/hvcs.c
@@ -527,7 +527,7 @@ static int khvcsd(void *unused)
 
 static struct vio_device_id hvcs_driver_table[] __devinitdata= {
 	{"serial-server", "hvterm2"},
-	{ NULL, }
+	{ "", "" }
 };
 MODULE_DEVICE_TABLE(vio, hvcs_driver_table);
 
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 6b11d6b2129..7999da25fe4 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1589,6 +1589,40 @@ u32 secure_tcpv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr, __u16 dp
 EXPORT_SYMBOL(secure_tcpv6_port_ephemeral);
 #endif
 
+#if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE)
+/* Similar to secure_tcp_sequence_number but generate a 48 bit value
+ * bit's 32-47 increase every key exchange
+ *       0-31  hash(source, dest)
+ */
+u64 secure_dccp_sequence_number(__u32 saddr, __u32 daddr,
+				__u16 sport, __u16 dport)
+{
+	struct timeval tv;
+	u64 seq;
+	__u32 hash[4];
+	struct keydata *keyptr = get_keyptr();
+
+	hash[0] = saddr;
+	hash[1] = daddr;
+	hash[2] = (sport << 16) + dport;
+	hash[3] = keyptr->secret[11];
+
+	seq = half_md4_transform(hash, keyptr->secret);
+	seq |= ((u64)keyptr->count) << (32 - HASH_BITS);
+
+	do_gettimeofday(&tv);
+	seq += tv.tv_usec + tv.tv_sec * 1000000;
+	seq &= (1ull << 48) - 1;
+#if 0
+	printk("dccp init_seq(%lx, %lx, %d, %d) = %d\n",
+	       saddr, daddr, sport, dport, seq);
+#endif
+	return seq;
+}
+
+EXPORT_SYMBOL(secure_dccp_sequence_number);
+#endif
+
 #endif /* CONFIG_INET */
 
 
diff --git a/drivers/char/viotape.c b/drivers/char/viotape.c
index 4764b4f9555..0aff45fac2e 100644
--- a/drivers/char/viotape.c
+++ b/drivers/char/viotape.c
@@ -991,7 +991,7 @@ static int viotape_remove(struct vio_dev *vdev)
  */
 static struct vio_device_id viotape_device_table[] __devinitdata = {
 	{ "viotape", "" },
-	{ 0, }
+	{ "", "" }
 };
 
 MODULE_DEVICE_TABLE(vio, viotape_device_table);
diff --git a/drivers/ieee1394/ieee1394_core.c b/drivers/ieee1394/ieee1394_core.c
index b248d89de8b..d633770fac8 100644
--- a/drivers/ieee1394/ieee1394_core.c
+++ b/drivers/ieee1394/ieee1394_core.c
@@ -681,7 +681,7 @@ static void handle_packet_response(struct hpsb_host *host, int tcode,
                 return;
         }
 
-	__skb_unlink(skb, skb->list);
+	__skb_unlink(skb, &host->pending_packet_queue);
 
 	if (packet->state == hpsb_queued) {
 		packet->sendtime = jiffies;
@@ -989,7 +989,7 @@ void abort_timedouts(unsigned long __opaque)
 		packet = (struct hpsb_packet *)skb->data;
 
 		if (time_before(packet->sendtime + expire, jiffies)) {
-			__skb_unlink(skb, skb->list);
+			__skb_unlink(skb, &host->pending_packet_queue);
 			packet->state = hpsb_complete;
 			packet->ack_code = ACKX_TIMEOUT;
 			queue_packet_complete(packet);
diff --git a/drivers/isdn/act2000/capi.c b/drivers/isdn/act2000/capi.c
index afa46681f98..6ae6eb32211 100644
--- a/drivers/isdn/act2000/capi.c
+++ b/drivers/isdn/act2000/capi.c
@@ -606,7 +606,7 @@ handle_ack(act2000_card *card, act2000_chan *chan, __u8 blocknr) {
                 if ((((m->msg.data_b3_req.fakencci >> 8) & 0xff) == chan->ncci) &&
 		    (m->msg.data_b3_req.blocknr == blocknr)) {
 			/* found corresponding DATA_B3_REQ */
-                        skb_unlink(tmp);
+                        skb_unlink(tmp, &card->ackq);
 			chan->queued -= m->msg.data_b3_req.datalen;
 			if (m->msg.data_b3_req.flags)
 				ret = m->msg.data_b3_req.datalen;
diff --git a/drivers/isdn/i4l/isdn_net.c b/drivers/isdn/i4l/isdn_net.c
index f30e8e63ae0..96c115e1338 100644
--- a/drivers/isdn/i4l/isdn_net.c
+++ b/drivers/isdn/i4l/isdn_net.c
@@ -1786,7 +1786,6 @@ isdn_net_receive(struct net_device *ndev, struct sk_buff *skb)
 		lp->stats.rx_bytes += skb->len;
 	}
 	skb->dev = ndev;
-	skb->input_dev = ndev;
 	skb->pkt_type = PACKET_HOST;
 	skb->mac.raw = skb->data;
 #ifdef ISDN_DEBUG_NET_DUMP
diff --git a/drivers/isdn/i4l/isdn_ppp.c b/drivers/isdn/i4l/isdn_ppp.c
index 260a323a96d..d97a9be5469 100644
--- a/drivers/isdn/i4l/isdn_ppp.c
+++ b/drivers/isdn/i4l/isdn_ppp.c
@@ -1177,7 +1177,6 @@ isdn_ppp_push_higher(isdn_net_dev * net_dev, isdn_net_local * lp, struct sk_buff
 		mlp->huptimer = 0;
 #endif /* CONFIG_IPPP_FILTER */
 	skb->dev = dev;
-	skb->input_dev = dev;
 	skb->mac.raw = skb->data;
 	netif_rx(skb);
 	/* net_dev->local->stats.rx_packets++; done in isdn_net.c */
diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c
index 8acc655ec1e..7babf6af4e2 100644
--- a/drivers/net/bnx2.c
+++ b/drivers/net/bnx2.c
@@ -14,8 +14,8 @@
 
 #define DRV_MODULE_NAME		"bnx2"
 #define PFX DRV_MODULE_NAME	": "
-#define DRV_MODULE_VERSION	"1.2.19"
-#define DRV_MODULE_RELDATE	"May 23, 2005"
+#define DRV_MODULE_VERSION	"1.2.20"
+#define DRV_MODULE_RELDATE	"August 22, 2005"
 
 #define RUN_AT(x) (jiffies + (x))
 
@@ -52,7 +52,6 @@ static struct {
 	{ "HP NC370i Multifunction Gigabit Server Adapter" },
 	{ "Broadcom NetXtreme II BCM5706 1000Base-SX" },
 	{ "HP NC370F Multifunction Gigabit Server Adapter" },
-	{ 0 },
 	};
 
 static struct pci_device_id bnx2_pci_tbl[] = {
@@ -108,6 +107,15 @@ static struct flash_spec flash_table[] =
 
 MODULE_DEVICE_TABLE(pci, bnx2_pci_tbl);
 
+static inline u32 bnx2_tx_avail(struct bnx2 *bp)
+{
+	u32 diff = TX_RING_IDX(bp->tx_prod) - TX_RING_IDX(bp->tx_cons);
+
+	if (diff > MAX_TX_DESC_CNT)
+		diff = (diff & MAX_TX_DESC_CNT) - 1;
+	return (bp->tx_ring_size - diff);
+}
+
 static u32
 bnx2_reg_rd_ind(struct bnx2 *bp, u32 offset)
 {
@@ -807,7 +815,19 @@ bnx2_setup_serdes_phy(struct bnx2 *bp)
 		bnx2_write_phy(bp, MII_ADVERTISE, new_adv);
 		bnx2_write_phy(bp, MII_BMCR, bmcr | BMCR_ANRESTART |
 			BMCR_ANENABLE);
-		bp->serdes_an_pending = SERDES_AN_TIMEOUT / bp->timer_interval;
+		if (CHIP_NUM(bp) == CHIP_NUM_5706) {
+			/* Speed up link-up time when the link partner
+			 * does not autonegotiate which is very common
+			 * in blade servers. Some blade servers use
+			 * IPMI for kerboard input and it's important
+			 * to minimize link disruptions. Autoneg. involves
+			 * exchanging base pages plus 3 next pages and
+			 * normally completes in about 120 msec.
+			 */
+			bp->current_interval = SERDES_AN_TIMEOUT;
+			bp->serdes_an_pending = 1;
+			mod_timer(&bp->timer, jiffies + bp->current_interval);
+		}
 	}
 
 	return 0;
@@ -1327,22 +1347,17 @@ bnx2_tx_int(struct bnx2 *bp)
 		}
 	}
 
-	atomic_add(tx_free_bd, &bp->tx_avail_bd);
+	bp->tx_cons = sw_cons;
 
 	if (unlikely(netif_queue_stopped(bp->dev))) {
-		unsigned long flags;
-
-		spin_lock_irqsave(&bp->tx_lock, flags);
+		spin_lock(&bp->tx_lock);
 		if ((netif_queue_stopped(bp->dev)) &&
-			(atomic_read(&bp->tx_avail_bd) > MAX_SKB_FRAGS)) {
+		    (bnx2_tx_avail(bp) > MAX_SKB_FRAGS)) {
 
 			netif_wake_queue(bp->dev);
 		}
-		spin_unlock_irqrestore(&bp->tx_lock, flags);
+		spin_unlock(&bp->tx_lock);
 	}
-
-	bp->tx_cons = sw_cons;
-
 }
 
 static inline void
@@ -1523,15 +1538,12 @@ bnx2_msi(int irq, void *dev_instance, struct pt_regs *regs)
 		BNX2_PCICFG_INT_ACK_CMD_MASK_INT);
 
 	/* Return here if interrupt is disabled. */
-	if (unlikely(atomic_read(&bp->intr_sem) != 0)) {
-		return IRQ_RETVAL(1);
-	}
+	if (unlikely(atomic_read(&bp->intr_sem) != 0))
+		return IRQ_HANDLED;
 
-	if (netif_rx_schedule_prep(dev)) {
-		__netif_rx_schedule(dev);
-	}
+	netif_rx_schedule(dev);
 
-	return IRQ_RETVAL(1);
+	return IRQ_HANDLED;
 }
 
 static irqreturn_t
@@ -1549,22 +1561,19 @@ bnx2_interrupt(int irq, void *dev_instance, struct pt_regs *regs)
 	if ((bp->status_blk->status_idx == bp->last_status_idx) ||
 	    (REG_RD(bp, BNX2_PCICFG_MISC_STATUS) &
 	     BNX2_PCICFG_MISC_STATUS_INTA_VALUE))
-		return IRQ_RETVAL(0);
+		return IRQ_NONE;
 
 	REG_WR(bp, BNX2_PCICFG_INT_ACK_CMD,
 		BNX2_PCICFG_INT_ACK_CMD_USE_INT_HC_PARAM |
 		BNX2_PCICFG_INT_ACK_CMD_MASK_INT);
 
 	/* Return here if interrupt is shared and is disabled. */
-	if (unlikely(atomic_read(&bp->intr_sem) != 0)) {
-		return IRQ_RETVAL(1);
-	}
+	if (unlikely(atomic_read(&bp->intr_sem) != 0))
+		return IRQ_HANDLED;
 
-	if (netif_rx_schedule_prep(dev)) {
-		__netif_rx_schedule(dev);
-	}
+	netif_rx_schedule(dev);
 
-	return IRQ_RETVAL(1);
+	return IRQ_HANDLED;
 }
 
 static int
@@ -1581,11 +1590,9 @@ bnx2_poll(struct net_device *dev, int *budget)
 		(bp->status_blk->status_attn_bits_ack &
 		STATUS_ATTN_BITS_LINK_STATE)) {
 
-		unsigned long flags;
-
-		spin_lock_irqsave(&bp->phy_lock, flags);
+		spin_lock(&bp->phy_lock);
 		bnx2_phy_int(bp);
-		spin_unlock_irqrestore(&bp->phy_lock, flags);
+		spin_unlock(&bp->phy_lock);
 	}
 
 	if (bp->status_blk->status_tx_quick_consumer_index0 != bp->tx_cons) {
@@ -1628,9 +1635,8 @@ bnx2_set_rx_mode(struct net_device *dev)
 	struct bnx2 *bp = dev->priv;
 	u32 rx_mode, sort_mode;
 	int i;
-	unsigned long flags;
 
-	spin_lock_irqsave(&bp->phy_lock, flags);
+	spin_lock_bh(&bp->phy_lock);
 
 	rx_mode = bp->rx_mode & ~(BNX2_EMAC_RX_MODE_PROMISCUOUS |
 				  BNX2_EMAC_RX_MODE_KEEP_VLAN_TAG);
@@ -1691,7 +1697,7 @@ bnx2_set_rx_mode(struct net_device *dev)
 	REG_WR(bp, BNX2_RPM_SORT_USER0, sort_mode);
 	REG_WR(bp, BNX2_RPM_SORT_USER0, sort_mode | BNX2_RPM_SORT_USER0_ENA);
 
-	spin_unlock_irqrestore(&bp->phy_lock, flags);
+	spin_unlock_bh(&bp->phy_lock);
 }
 
 static void
@@ -2960,7 +2966,6 @@ bnx2_init_tx_ring(struct bnx2 *bp)
 	bp->tx_prod = 0;
 	bp->tx_cons = 0;
 	bp->tx_prod_bseq = 0;
-	atomic_set(&bp->tx_avail_bd, bp->tx_ring_size);
 	
 	val = BNX2_L2CTX_TYPE_TYPE_L2;
 	val |= BNX2_L2CTX_TYPE_SIZE_L2;
@@ -3507,11 +3512,11 @@ bnx2_test_registers(struct bnx2 *bp)
 		rw_mask = reg_tbl[i].rw_mask;
 		ro_mask = reg_tbl[i].ro_mask;
 
-		save_val = readl((u8 *) bp->regview + offset);
+		save_val = readl(bp->regview + offset);
 
-		writel(0, (u8 *) bp->regview + offset);
+		writel(0, bp->regview + offset);
 
-		val = readl((u8 *) bp->regview + offset);
+		val = readl(bp->regview + offset);
 		if ((val & rw_mask) != 0) {
 			goto reg_test_err;
 		}
@@ -3520,9 +3525,9 @@ bnx2_test_registers(struct bnx2 *bp)
 			goto reg_test_err;
 		}
 
-		writel(0xffffffff, (u8 *) bp->regview + offset);
+		writel(0xffffffff, bp->regview + offset);
 
-		val = readl((u8 *) bp->regview + offset);
+		val = readl(bp->regview + offset);
 		if ((val & rw_mask) != rw_mask) {
 			goto reg_test_err;
 		}
@@ -3531,11 +3536,11 @@ bnx2_test_registers(struct bnx2 *bp)
 			goto reg_test_err;
 		}
 
-		writel(save_val, (u8 *) bp->regview + offset);
+		writel(save_val, bp->regview + offset);
 		continue;
 
 reg_test_err:
-		writel(save_val, (u8 *) bp->regview + offset);
+		writel(save_val, bp->regview + offset);
 		ret = -ENODEV;
 		break;
 	}
@@ -3752,10 +3757,10 @@ bnx2_test_link(struct bnx2 *bp)
 {
 	u32 bmsr;
 
-	spin_lock_irq(&bp->phy_lock);
+	spin_lock_bh(&bp->phy_lock);
 	bnx2_read_phy(bp, MII_BMSR, &bmsr);
 	bnx2_read_phy(bp, MII_BMSR, &bmsr);
-	spin_unlock_irq(&bp->phy_lock);
+	spin_unlock_bh(&bp->phy_lock);
 		
 	if (bmsr & BMSR_LSTATUS) {
 		return 0;
@@ -3801,6 +3806,9 @@ bnx2_timer(unsigned long data)
 	struct bnx2 *bp = (struct bnx2 *) data;
 	u32 msg;
 
+	if (!netif_running(bp->dev))
+		return;
+
 	if (atomic_read(&bp->intr_sem) != 0)
 		goto bnx2_restart_timer;
 
@@ -3809,15 +3817,16 @@ bnx2_timer(unsigned long data)
 
 	if ((bp->phy_flags & PHY_SERDES_FLAG) &&
 	    (CHIP_NUM(bp) == CHIP_NUM_5706)) {
-		unsigned long flags;
 
-		spin_lock_irqsave(&bp->phy_lock, flags);
+		spin_lock(&bp->phy_lock);
 		if (bp->serdes_an_pending) {
 			bp->serdes_an_pending--;
 		}
 		else if ((bp->link_up == 0) && (bp->autoneg & AUTONEG_SPEED)) {
 			u32 bmcr;
 
+			bp->current_interval = bp->timer_interval;
+
 			bnx2_read_phy(bp, MII_BMCR, &bmcr);
 
 			if (bmcr & BMCR_ANENABLE) {
@@ -3860,14 +3869,14 @@ bnx2_timer(unsigned long data)
 
 			}
 		}
+		else
+			bp->current_interval = bp->timer_interval;
 
-		spin_unlock_irqrestore(&bp->phy_lock, flags);
+		spin_unlock(&bp->phy_lock);
 	}
 
 bnx2_restart_timer:
-	bp->timer.expires = RUN_AT(bp->timer_interval);
-
-	add_timer(&bp->timer);
+	mod_timer(&bp->timer, jiffies + bp->current_interval);
 }
 
 /* Called with rtnl_lock */
@@ -3920,12 +3929,7 @@ bnx2_open(struct net_device *dev)
 		return rc;
 	}
 	
-	init_timer(&bp->timer);
-
-	bp->timer.expires = RUN_AT(bp->timer_interval);
-	bp->timer.data = (unsigned long) bp;
-	bp->timer.function = bnx2_timer;
-	add_timer(&bp->timer);
+	mod_timer(&bp->timer, jiffies + bp->current_interval);
 
 	atomic_set(&bp->intr_sem, 0);
 
@@ -3976,12 +3980,17 @@ bnx2_reset_task(void *data)
 {
 	struct bnx2 *bp = data;
 
+	if (!netif_running(bp->dev))
+		return;
+
+	bp->in_reset_task = 1;
 	bnx2_netif_stop(bp);
 
 	bnx2_init_nic(bp);
 
 	atomic_set(&bp->intr_sem, 1);
 	bnx2_netif_start(bp);
+	bp->in_reset_task = 0;
 }
 
 static void
@@ -4041,9 +4050,7 @@ bnx2_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	u16 prod, ring_prod;
 	int i;
 
-	if (unlikely(atomic_read(&bp->tx_avail_bd) <
-		(skb_shinfo(skb)->nr_frags + 1))) {
-
+	if (unlikely(bnx2_tx_avail(bp) < (skb_shinfo(skb)->nr_frags + 1))) {
 		netif_stop_queue(dev);
 		printk(KERN_ERR PFX "%s: BUG! Tx ring full when queue awake!\n",
 			dev->name);
@@ -4140,8 +4147,6 @@ bnx2_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	prod = NEXT_TX_BD(prod);
 	bp->tx_prod_bseq += skb->len;
 
-	atomic_sub(last_frag + 1, &bp->tx_avail_bd);
-
 	REG_WR16(bp, MB_TX_CID_ADDR + BNX2_L2CTX_TX_HOST_BIDX, prod);
 	REG_WR(bp, MB_TX_CID_ADDR + BNX2_L2CTX_TX_HOST_BSEQ, bp->tx_prod_bseq);
 
@@ -4150,17 +4155,13 @@ bnx2_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	bp->tx_prod = prod;
 	dev->trans_start = jiffies;
 
-	if (unlikely(atomic_read(&bp->tx_avail_bd) <= MAX_SKB_FRAGS)) {
-		unsigned long flags;
-
-		spin_lock_irqsave(&bp->tx_lock, flags);
-		if (atomic_read(&bp->tx_avail_bd) <= MAX_SKB_FRAGS) {
-			netif_stop_queue(dev);
-
-			if (atomic_read(&bp->tx_avail_bd) > MAX_SKB_FRAGS)
-				netif_wake_queue(dev);
-		}
-		spin_unlock_irqrestore(&bp->tx_lock, flags);
+	if (unlikely(bnx2_tx_avail(bp) <= MAX_SKB_FRAGS)) {
+		spin_lock(&bp->tx_lock);
+		netif_stop_queue(dev);
+		
+		if (bnx2_tx_avail(bp) > MAX_SKB_FRAGS)
+			netif_wake_queue(dev);
+		spin_unlock(&bp->tx_lock);
 	}
 
 	return NETDEV_TX_OK;
@@ -4173,7 +4174,13 @@ bnx2_close(struct net_device *dev)
 	struct bnx2 *bp = dev->priv;
 	u32 reset_code;
 
-	flush_scheduled_work();
+	/* Calling flush_scheduled_work() may deadlock because
+	 * linkwatch_event() may be on the workqueue and it will try to get
+	 * the rtnl_lock which we are holding.
+	 */
+	while (bp->in_reset_task)
+		msleep(1);
+
 	bnx2_netif_stop(bp);
 	del_timer_sync(&bp->timer);
 	if (bp->wol)
@@ -4390,11 +4397,11 @@ bnx2_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
 	bp->req_line_speed = req_line_speed;
 	bp->req_duplex = req_duplex;
 
-	spin_lock_irq(&bp->phy_lock);
+	spin_lock_bh(&bp->phy_lock);
 
 	bnx2_setup_phy(bp);
 
-	spin_unlock_irq(&bp->phy_lock);
+	spin_unlock_bh(&bp->phy_lock);
 
 	return 0;
 }
@@ -4464,19 +4471,20 @@ bnx2_nway_reset(struct net_device *dev)
 		return -EINVAL;
 	}
 
-	spin_lock_irq(&bp->phy_lock);
+	spin_lock_bh(&bp->phy_lock);
 
 	/* Force a link down visible on the other side */
 	if (bp->phy_flags & PHY_SERDES_FLAG) {
 		bnx2_write_phy(bp, MII_BMCR, BMCR_LOOPBACK);
-		spin_unlock_irq(&bp->phy_lock);
+		spin_unlock_bh(&bp->phy_lock);
 
 		msleep(20);
 
-		spin_lock_irq(&bp->phy_lock);
+		spin_lock_bh(&bp->phy_lock);
 		if (CHIP_NUM(bp) == CHIP_NUM_5706) {
-			bp->serdes_an_pending = SERDES_AN_TIMEOUT /
-				bp->timer_interval;
+			bp->current_interval = SERDES_AN_TIMEOUT;
+			bp->serdes_an_pending = 1;
+			mod_timer(&bp->timer, jiffies + bp->current_interval);
 		}
 	}
 
@@ -4484,7 +4492,7 @@ bnx2_nway_reset(struct net_device *dev)
 	bmcr &= ~BMCR_LOOPBACK;
 	bnx2_write_phy(bp, MII_BMCR, bmcr | BMCR_ANRESTART | BMCR_ANENABLE);
 
-	spin_unlock_irq(&bp->phy_lock);
+	spin_unlock_bh(&bp->phy_lock);
 
 	return 0;
 }
@@ -4670,11 +4678,11 @@ bnx2_set_pauseparam(struct net_device *dev, struct ethtool_pauseparam *epause)
 		bp->autoneg &= ~AUTONEG_FLOW_CTRL;
 	}
 
-	spin_lock_irq(&bp->phy_lock);
+	spin_lock_bh(&bp->phy_lock);
 
 	bnx2_setup_phy(bp);
 
-	spin_unlock_irq(&bp->phy_lock);
+	spin_unlock_bh(&bp->phy_lock);
 
 	return 0;
 }
@@ -4698,7 +4706,7 @@ bnx2_set_rx_csum(struct net_device *dev, u32 data)
 
 #define BNX2_NUM_STATS 45
 
-struct {
+static struct {
 	char string[ETH_GSTRING_LEN];
 } bnx2_stats_str_arr[BNX2_NUM_STATS] = {
 	{ "rx_bytes" },
@@ -4750,7 +4758,7 @@ struct {
 
 #define STATS_OFFSET32(offset_name) (offsetof(struct statistics_block, offset_name) / 4)
 
-unsigned long bnx2_stats_offset_arr[BNX2_NUM_STATS] = {
+static unsigned long bnx2_stats_offset_arr[BNX2_NUM_STATS] = {
     STATS_OFFSET32(stat_IfHCInOctets_hi),
     STATS_OFFSET32(stat_IfHCInBadOctets_hi),
     STATS_OFFSET32(stat_IfHCOutOctets_hi),
@@ -4801,7 +4809,7 @@ unsigned long bnx2_stats_offset_arr[BNX2_NUM_STATS] = {
 /* stat_IfHCInBadOctets and stat_Dot3StatsCarrierSenseErrors are
  * skipped because of errata.
  */               
-u8 bnx2_5706_stats_len_arr[BNX2_NUM_STATS] = {
+static u8 bnx2_5706_stats_len_arr[BNX2_NUM_STATS] = {
 	8,0,8,8,8,8,8,8,8,8,
 	4,0,4,4,4,4,4,4,4,4,
 	4,4,4,4,4,4,4,4,4,4,
@@ -4811,7 +4819,7 @@ u8 bnx2_5706_stats_len_arr[BNX2_NUM_STATS] = {
 
 #define BNX2_NUM_TESTS 6
 
-struct {
+static struct {
 	char string[ETH_GSTRING_LEN];
 } bnx2_tests_str_arr[BNX2_NUM_TESTS] = {
 	{ "register_test (offline)" },
@@ -4910,7 +4918,7 @@ bnx2_get_ethtool_stats(struct net_device *dev,
 	struct bnx2 *bp = dev->priv;
 	int i;
 	u32 *hw_stats = (u32 *) bp->stats_blk;
-	u8 *stats_len_arr = 0;
+	u8 *stats_len_arr = NULL;
 
 	if (hw_stats == NULL) {
 		memset(buf, 0, sizeof(u64) * BNX2_NUM_STATS);
@@ -5012,7 +5020,7 @@ static struct ethtool_ops bnx2_ethtool_ops = {
 static int
 bnx2_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 {
-	struct mii_ioctl_data *data = (struct mii_ioctl_data *)&ifr->ifr_data;
+	struct mii_ioctl_data *data = if_mii(ifr);
 	struct bnx2 *bp = dev->priv;
 	int err;
 
@@ -5024,9 +5032,9 @@ bnx2_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 	case SIOCGMIIREG: {
 		u32 mii_regval;
 
-		spin_lock_irq(&bp->phy_lock);
+		spin_lock_bh(&bp->phy_lock);
 		err = bnx2_read_phy(bp, data->reg_num & 0x1f, &mii_regval);
-		spin_unlock_irq(&bp->phy_lock);
+		spin_unlock_bh(&bp->phy_lock);
 
 		data->val_out = mii_regval;
 
@@ -5037,9 +5045,9 @@ bnx2_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 		if (!capable(CAP_NET_ADMIN))
 			return -EPERM;
 
-		spin_lock_irq(&bp->phy_lock);
+		spin_lock_bh(&bp->phy_lock);
 		err = bnx2_write_phy(bp, data->reg_num & 0x1f, data->val_in);
-		spin_unlock_irq(&bp->phy_lock);
+		spin_unlock_bh(&bp->phy_lock);
 
 		return err;
 
@@ -5057,6 +5065,9 @@ bnx2_change_mac_addr(struct net_device *dev, void *p)
 	struct sockaddr *addr = p;
 	struct bnx2 *bp = dev->priv;
 
+	if (!is_valid_ether_addr(addr->sa_data))
+		return -EINVAL;
+
 	memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
 	if (netif_running(dev))
 		bnx2_set_mac_addr(bp);
@@ -5305,6 +5316,7 @@ bnx2_init_board(struct pci_dev *pdev, struct net_device *dev)
 	bp->stats_ticks = 1000000 & 0xffff00;
 
 	bp->timer_interval =  HZ;
+	bp->current_interval =  HZ;
 
 	/* Disable WOL support if we are running on a SERDES chip. */
 	if (CHIP_BOND_ID(bp) & CHIP_BOND_ID_SERDES_BIT) {
@@ -5328,6 +5340,15 @@ bnx2_init_board(struct pci_dev *pdev, struct net_device *dev)
 	bp->req_line_speed = 0;
 	if (bp->phy_flags & PHY_SERDES_FLAG) {
 		bp->advertising = ETHTOOL_ALL_FIBRE_SPEED | ADVERTISED_Autoneg;
+
+		reg = REG_RD_IND(bp, HOST_VIEW_SHMEM_BASE +
+				 BNX2_PORT_HW_CFG_CONFIG);
+		reg &= BNX2_PORT_HW_CFG_CFG_DFLT_LINK_MASK;
+		if (reg == BNX2_PORT_HW_CFG_CFG_DFLT_LINK_1G) {
+			bp->autoneg = 0;
+			bp->req_line_speed = bp->line_speed = SPEED_1000;
+			bp->req_duplex = DUPLEX_FULL;
+		}
 	}
 	else {
 		bp->advertising = ETHTOOL_ALL_COPPER_SPEED | ADVERTISED_Autoneg;
@@ -5335,11 +5356,17 @@ bnx2_init_board(struct pci_dev *pdev, struct net_device *dev)
 
 	bp->req_flow_ctrl = FLOW_CTRL_RX | FLOW_CTRL_TX;
 
+	init_timer(&bp->timer);
+	bp->timer.expires = RUN_AT(bp->timer_interval);
+	bp->timer.data = (unsigned long) bp;
+	bp->timer.function = bnx2_timer;
+
 	return 0;
 
 err_out_unmap:
 	if (bp->regview) {
 		iounmap(bp->regview);
+		bp->regview = NULL;
 	}
 
 err_out_release:
@@ -5454,6 +5481,8 @@ bnx2_remove_one(struct pci_dev *pdev)
 	struct net_device *dev = pci_get_drvdata(pdev);
 	struct bnx2 *bp = dev->priv;
 
+	flush_scheduled_work();
+
 	unregister_netdev(dev);
 
 	if (bp->regview)
@@ -5505,12 +5534,12 @@ bnx2_resume(struct pci_dev *pdev)
 }
 
 static struct pci_driver bnx2_pci_driver = {
-	name:		DRV_MODULE_NAME,
-	id_table:	bnx2_pci_tbl,
-	probe:		bnx2_init_one,
-	remove:		__devexit_p(bnx2_remove_one),
-	suspend:	bnx2_suspend,
-	resume:		bnx2_resume,
+	.name		= DRV_MODULE_NAME,
+	.id_table	= bnx2_pci_tbl,
+	.probe		= bnx2_init_one,
+	.remove		= __devexit_p(bnx2_remove_one),
+	.suspend	= bnx2_suspend,
+	.resume		= bnx2_resume,
 };
 
 static int __init bnx2_init(void)
diff --git a/drivers/net/bnx2.h b/drivers/net/bnx2.h
index 8214a2853d0..9ad3f5740cd 100644
--- a/drivers/net/bnx2.h
+++ b/drivers/net/bnx2.h
@@ -3841,12 +3841,12 @@ struct bnx2 {
 	struct status_block	*status_blk;
 	u32 			last_status_idx;
 
-	atomic_t		tx_avail_bd;
 	struct tx_bd		*tx_desc_ring;
 	struct sw_bd		*tx_buf_ring;
 	u32			tx_prod_bseq;
 	u16			tx_prod;
 	u16			tx_cons;
+	int			tx_ring_size;
 
 #ifdef BCM_VLAN 
 	struct			vlan_group *vlgrp;
@@ -3872,8 +3872,10 @@ struct bnx2 {
 	char			*name;
 
 	int			timer_interval;
+	int			current_interval;
 	struct			timer_list timer;
 	struct work_struct	reset_task;
+	int			in_reset_task;
 
 	/* Used to synchronize phy accesses. */
 	spinlock_t		phy_lock;
@@ -3927,7 +3929,6 @@ struct bnx2 {
 	u16			fw_wr_seq;
 	u16			fw_drv_pulse_wr_seq;
 
-	int			tx_ring_size;
 	dma_addr_t		tx_desc_mapping;
 
 
@@ -3985,7 +3986,7 @@ struct bnx2 {
 #define PHY_LOOPBACK		2
 
 	u8			serdes_an_pending;
-#define SERDES_AN_TIMEOUT	(2 * HZ)
+#define SERDES_AN_TIMEOUT	(HZ / 3)
 
 	u8			mac_addr[8];
 
@@ -4171,6 +4172,9 @@ struct fw_info {
 
 #define BNX2_PORT_HW_CFG_MAC_LOWER		0x00000054
 #define BNX2_PORT_HW_CFG_CONFIG			0x00000058
+#define BNX2_PORT_HW_CFG_CFG_DFLT_LINK_MASK	 0x001f0000
+#define BNX2_PORT_HW_CFG_CFG_DFLT_LINK_AN	 0x00000000
+#define BNX2_PORT_HW_CFG_CFG_DFLT_LINK_1G	 0x00030000
 
 #define BNX2_PORT_HW_CFG_IMD_MAC_A_UPPER	0x00000068
 #define BNX2_PORT_HW_CFG_IMD_MAC_A_LOWER	0x0000006c
diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
index a2e8dda5afa..d2f34d5a808 100644
--- a/drivers/net/bonding/bond_3ad.c
+++ b/drivers/net/bonding/bond_3ad.c
@@ -2419,22 +2419,19 @@ out:
 	return 0;
 }
 
-int bond_3ad_lacpdu_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type* ptype)
+int bond_3ad_lacpdu_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type* ptype, struct net_device *orig_dev)
 {
 	struct bonding *bond = dev->priv;
 	struct slave *slave = NULL;
 	int ret = NET_RX_DROP;
 
-	if (!(dev->flags & IFF_MASTER)) {
+	if (!(dev->flags & IFF_MASTER))
 		goto out;
-	}
 
 	read_lock(&bond->lock);
-	slave = bond_get_slave_by_dev((struct bonding *)dev->priv,
-				      skb->real_dev);
-	if (slave == NULL) {
+	slave = bond_get_slave_by_dev((struct bonding *)dev->priv, orig_dev);
+	if (!slave)
 		goto out_unlock;
-	}
 
 	bond_3ad_rx_indication((struct lacpdu *) skb->data, slave, skb->len);
 
diff --git a/drivers/net/bonding/bond_3ad.h b/drivers/net/bonding/bond_3ad.h
index f4682389418..673a30af566 100644
--- a/drivers/net/bonding/bond_3ad.h
+++ b/drivers/net/bonding/bond_3ad.h
@@ -295,6 +295,6 @@ void bond_3ad_adapter_duplex_changed(struct slave *slave);
 void bond_3ad_handle_link_change(struct slave *slave, char link);
 int  bond_3ad_get_active_agg_info(struct bonding *bond, struct ad_info *ad_info);
 int bond_3ad_xmit_xor(struct sk_buff *skb, struct net_device *dev);
-int bond_3ad_lacpdu_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type* ptype);
+int bond_3ad_lacpdu_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type* ptype, struct net_device *orig_dev);
 #endif //__BOND_3AD_H__
 
diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index 19e829b567d..f8fce396119 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -354,15 +354,14 @@ static void rlb_update_entry_from_arp(struct bonding *bond, struct arp_pkt *arp)
 	_unlock_rx_hashtbl(bond);
 }
 
-static int rlb_arp_recv(struct sk_buff *skb, struct net_device *bond_dev, struct packet_type *ptype)
+static int rlb_arp_recv(struct sk_buff *skb, struct net_device *bond_dev, struct packet_type *ptype, struct net_device *orig_dev)
 {
 	struct bonding *bond = bond_dev->priv;
 	struct arp_pkt *arp = (struct arp_pkt *)skb->data;
 	int res = NET_RX_DROP;
 
-	if (!(bond_dev->flags & IFF_MASTER)) {
+	if (!(bond_dev->flags & IFF_MASTER))
 		goto out;
-	}
 
 	if (!arp) {
 		dprintk("Packet has no ARP data\n");
diff --git a/drivers/net/hamradio/bpqether.c b/drivers/net/hamradio/bpqether.c
index ba9f0580e1f..2946e037a9b 100644
--- a/drivers/net/hamradio/bpqether.c
+++ b/drivers/net/hamradio/bpqether.c
@@ -98,7 +98,7 @@ static char bcast_addr[6]={0xFF,0xFF,0xFF,0xFF,0xFF,0xFF};
 
 static char bpq_eth_addr[6];
 
-static int bpq_rcv(struct sk_buff *, struct net_device *, struct packet_type *);
+static int bpq_rcv(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *);
 static int bpq_device_event(struct notifier_block *, unsigned long, void *);
 static const char *bpq_print_ethaddr(const unsigned char *);
 
@@ -165,7 +165,7 @@ static inline int dev_is_ethdev(struct net_device *dev)
 /*
  *	Receive an AX.25 frame via an ethernet interface.
  */
-static int bpq_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *ptype)
+static int bpq_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *ptype, struct net_device *orig_dev)
 {
 	int len;
 	char * ptr;
diff --git a/drivers/net/ibmveth.c b/drivers/net/ibmveth.c
index c39b0609742..32d5fabd4b1 100644
--- a/drivers/net/ibmveth.c
+++ b/drivers/net/ibmveth.c
@@ -1144,7 +1144,7 @@ static void ibmveth_proc_unregister_driver(void)
 
 static struct vio_device_id ibmveth_device_table[] __devinitdata= {
 	{ "network", "IBM,l-lan"},
-	{ 0,}
+	{ "", "" }
 };
 
 MODULE_DEVICE_TABLE(vio, ibmveth_device_table);
diff --git a/drivers/net/iseries_veth.c b/drivers/net/iseries_veth.c
index 55af32e9bf0..183ba97785b 100644
--- a/drivers/net/iseries_veth.c
+++ b/drivers/net/iseries_veth.c
@@ -1370,7 +1370,7 @@ static int veth_probe(struct vio_dev *vdev, const struct vio_device_id *id)
  */
 static struct vio_device_id veth_device_table[] __devinitdata = {
 	{ "vlan", "" },
-	{ NULL, NULL }
+	{ "", "" }
 };
 MODULE_DEVICE_TABLE(vio, veth_device_table);
 
diff --git a/drivers/net/ppp_generic.c b/drivers/net/ppp_generic.c
index a32668e88e0..bb71638a7c4 100644
--- a/drivers/net/ppp_generic.c
+++ b/drivers/net/ppp_generic.c
@@ -1657,7 +1657,6 @@ ppp_receive_nonmp_frame(struct ppp *ppp, struct sk_buff *skb)
 			skb->dev = ppp->dev;
 			skb->protocol = htons(npindex_to_ethertype[npi]);
 			skb->mac.raw = skb->data;
-			skb->input_dev = ppp->dev;
 			netif_rx(skb);
 			ppp->dev->last_rx = jiffies;
 		}
diff --git a/drivers/net/pppoe.c b/drivers/net/pppoe.c
index ce1a9bf7b9a..82f236cc3b9 100644
--- a/drivers/net/pppoe.c
+++ b/drivers/net/pppoe.c
@@ -377,7 +377,8 @@ abort_kfree:
  ***********************************************************************/
 static int pppoe_rcv(struct sk_buff *skb,
 		     struct net_device *dev,
-		     struct packet_type *pt)
+		     struct packet_type *pt,
+		     struct net_device *orig_dev)
 
 {
 	struct pppoe_hdr *ph;
@@ -426,7 +427,8 @@ out:
  ***********************************************************************/
 static int pppoe_disc_rcv(struct sk_buff *skb,
 			  struct net_device *dev,
-			  struct packet_type *pt)
+			  struct packet_type *pt,
+			  struct net_device *orig_dev)
 
 {
 	struct pppoe_hdr *ph;
diff --git a/drivers/net/rrunner.c b/drivers/net/rrunner.c
index 12a86f96d97..ec1a18d189a 100644
--- a/drivers/net/rrunner.c
+++ b/drivers/net/rrunner.c
@@ -1429,6 +1429,7 @@ static int rr_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct rr_private *rrpriv = netdev_priv(dev);
 	struct rr_regs __iomem *regs = rrpriv->regs;
+	struct hippi_cb *hcb = (struct hippi_cb *) skb->cb;
 	struct ring_ctrl *txctrl;
 	unsigned long flags;
 	u32 index, len = skb->len;
@@ -1460,7 +1461,7 @@ static int rr_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	ifield = (u32 *)skb_push(skb, 8);
 
 	ifield[0] = 0;
-	ifield[1] = skb->private.ifield;
+	ifield[1] = hcb->ifield;
 
 	/*
 	 * We don't need the lock before we are actually going to start
diff --git a/drivers/net/shaper.c b/drivers/net/shaper.c
index 3ad0b6751f6..221354eea21 100644
--- a/drivers/net/shaper.c
+++ b/drivers/net/shaper.c
@@ -156,52 +156,6 @@ static int shaper_start_xmit(struct sk_buff *skb, struct net_device *dev)
  	 
  	SHAPERCB(skb)->shapelen= shaper_clocks(shaper,skb);
  	
-#ifdef SHAPER_COMPLEX /* and broken.. */
-
- 	while(ptr && ptr!=(struct sk_buff *)&shaper->sendq)
- 	{
- 		if(ptr->pri<skb->pri 
- 			&& jiffies - SHAPERCB(ptr)->shapeclock < SHAPER_MAXSLIP)
- 		{
- 			struct sk_buff *tmp=ptr->prev;
-
- 			/*
- 			 *	It goes before us therefore we slip the length
- 			 *	of the new frame.
- 			 */
-
- 			SHAPERCB(ptr)->shapeclock+=SHAPERCB(skb)->shapelen;
- 			SHAPERCB(ptr)->shapelatency+=SHAPERCB(skb)->shapelen;
-
- 			/*
- 			 *	The packet may have slipped so far back it
- 			 *	fell off.
- 			 */
- 			if(SHAPERCB(ptr)->shapelatency > SHAPER_LATENCY)
- 			{
- 				skb_unlink(ptr);
- 				dev_kfree_skb(ptr);
- 			}
- 			ptr=tmp;
- 		}
- 		else
- 			break;
- 	}
- 	if(ptr==NULL || ptr==(struct sk_buff *)&shaper->sendq)
- 		skb_queue_head(&shaper->sendq,skb);
- 	else
- 	{
- 		struct sk_buff *tmp;
- 		/*
- 		 *	Set the packet clock out time according to the
- 		 *	frames ahead. Im sure a bit of thought could drop
- 		 *	this loop.
- 		 */
- 		for(tmp=skb_peek(&shaper->sendq); tmp!=NULL && tmp!=ptr; tmp=tmp->next)
- 			SHAPERCB(skb)->shapeclock+=tmp->shapelen;
- 		skb_append(ptr,skb);
- 	}
-#else
 	{
 		struct sk_buff *tmp;
 		/*
@@ -220,7 +174,7 @@ static int shaper_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		} else
 			skb_queue_tail(&shaper->sendq, skb);
 	}
-#endif 	
+
 	if(sh_debug)
  		printk("Frame queued.\n");
  	if(skb_queue_len(&shaper->sendq)>SHAPER_QLEN)
@@ -302,7 +256,7 @@ static void shaper_kick(struct shaper *shaper)
 			 *	Pull the frame and get interrupts back on.
 			 */
 			 
-			skb_unlink(skb);
+			skb_unlink(skb, &shaper->sendq);
 			if (shaper->recovery < 
 			    SHAPERCB(skb)->shapeclock + SHAPERCB(skb)->shapelen)
 				shaper->recovery = SHAPERCB(skb)->shapeclock + SHAPERCB(skb)->shapelen;
diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index 6d4ab1e333b..af8263a1580 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -340,41 +340,92 @@ static struct {
 
 static void tg3_write_indirect_reg32(struct tg3 *tp, u32 off, u32 val)
 {
-	if ((tp->tg3_flags & TG3_FLAG_PCIX_TARGET_HWBUG) != 0) {
-		spin_lock_bh(&tp->indirect_lock);
-		pci_write_config_dword(tp->pdev, TG3PCI_REG_BASE_ADDR, off);
-		pci_write_config_dword(tp->pdev, TG3PCI_REG_DATA, val);
-		spin_unlock_bh(&tp->indirect_lock);
-	} else {
-		writel(val, tp->regs + off);
-		if ((tp->tg3_flags & TG3_FLAG_5701_REG_WRITE_BUG) != 0)
-			readl(tp->regs + off);
+	unsigned long flags;
+
+	spin_lock_irqsave(&tp->indirect_lock, flags);
+	pci_write_config_dword(tp->pdev, TG3PCI_REG_BASE_ADDR, off);
+	pci_write_config_dword(tp->pdev, TG3PCI_REG_DATA, val);
+	spin_unlock_irqrestore(&tp->indirect_lock, flags);
+}
+
+static void tg3_write_flush_reg32(struct tg3 *tp, u32 off, u32 val)
+{
+	writel(val, tp->regs + off);
+	readl(tp->regs + off);
+}
+
+static u32 tg3_read_indirect_reg32(struct tg3 *tp, u32 off)
+{
+	unsigned long flags;
+	u32 val;
+
+	spin_lock_irqsave(&tp->indirect_lock, flags);
+	pci_write_config_dword(tp->pdev, TG3PCI_REG_BASE_ADDR, off);
+	pci_read_config_dword(tp->pdev, TG3PCI_REG_DATA, &val);
+	spin_unlock_irqrestore(&tp->indirect_lock, flags);
+	return val;
+}
+
+static void tg3_write_indirect_mbox(struct tg3 *tp, u32 off, u32 val)
+{
+	unsigned long flags;
+
+	if (off == (MAILBOX_RCVRET_CON_IDX_0 + TG3_64BIT_REG_LOW)) {
+		pci_write_config_dword(tp->pdev, TG3PCI_RCV_RET_RING_CON_IDX +
+				       TG3_64BIT_REG_LOW, val);
+		return;
+	}
+	if (off == (MAILBOX_RCV_STD_PROD_IDX + TG3_64BIT_REG_LOW)) {
+		pci_write_config_dword(tp->pdev, TG3PCI_STD_RING_PROD_IDX +
+				       TG3_64BIT_REG_LOW, val);
+		return;
+	}
+
+	spin_lock_irqsave(&tp->indirect_lock, flags);
+	pci_write_config_dword(tp->pdev, TG3PCI_REG_BASE_ADDR, off + 0x5600);
+	pci_write_config_dword(tp->pdev, TG3PCI_REG_DATA, val);
+	spin_unlock_irqrestore(&tp->indirect_lock, flags);
+
+	/* In indirect mode when disabling interrupts, we also need
+	 * to clear the interrupt bit in the GRC local ctrl register.
+	 */
+	if ((off == (MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW)) &&
+	    (val == 0x1)) {
+		pci_write_config_dword(tp->pdev, TG3PCI_MISC_LOCAL_CTRL,
+				       tp->grc_local_ctrl|GRC_LCLCTRL_CLEARINT);
 	}
 }
 
+static u32 tg3_read_indirect_mbox(struct tg3 *tp, u32 off)
+{
+	unsigned long flags;
+	u32 val;
+
+	spin_lock_irqsave(&tp->indirect_lock, flags);
+	pci_write_config_dword(tp->pdev, TG3PCI_REG_BASE_ADDR, off + 0x5600);
+	pci_read_config_dword(tp->pdev, TG3PCI_REG_DATA, &val);
+	spin_unlock_irqrestore(&tp->indirect_lock, flags);
+	return val;
+}
+
 static void _tw32_flush(struct tg3 *tp, u32 off, u32 val)
 {
-	if ((tp->tg3_flags & TG3_FLAG_PCIX_TARGET_HWBUG) != 0) {
-		spin_lock_bh(&tp->indirect_lock);
-		pci_write_config_dword(tp->pdev, TG3PCI_REG_BASE_ADDR, off);
-		pci_write_config_dword(tp->pdev, TG3PCI_REG_DATA, val);
-		spin_unlock_bh(&tp->indirect_lock);
-	} else {
-		void __iomem *dest = tp->regs + off;
-		writel(val, dest);
-		readl(dest);    /* always flush PCI write */
-	}
+	tp->write32(tp, off, val);
+	if (!(tp->tg3_flags & TG3_FLAG_PCIX_TARGET_HWBUG) &&
+	    !(tp->tg3_flags & TG3_FLAG_5701_REG_WRITE_BUG) &&
+	    !(tp->tg3_flags2 & TG3_FLG2_ICH_WORKAROUND))
+		tp->read32(tp, off);	/* flush */
 }
 
-static inline void _tw32_rx_mbox(struct tg3 *tp, u32 off, u32 val)
+static inline void tw32_mailbox_flush(struct tg3 *tp, u32 off, u32 val)
 {
-	void __iomem *mbox = tp->regs + off;
-	writel(val, mbox);
-	if (tp->tg3_flags & TG3_FLAG_MBOX_WRITE_REORDER)
-		readl(mbox);
+	tp->write32_mbox(tp, off, val);
+	if (!(tp->tg3_flags & TG3_FLAG_MBOX_WRITE_REORDER) &&
+	    !(tp->tg3_flags2 & TG3_FLG2_ICH_WORKAROUND))
+		tp->read32_mbox(tp, off);
 }
 
-static inline void _tw32_tx_mbox(struct tg3 *tp, u32 off, u32 val)
+static void tg3_write32_tx_mbox(struct tg3 *tp, u32 off, u32 val)
 {
 	void __iomem *mbox = tp->regs + off;
 	writel(val, mbox);
@@ -384,46 +435,57 @@ static inline void _tw32_tx_mbox(struct tg3 *tp, u32 off, u32 val)
 		readl(mbox);
 }
 
-#define tw32_mailbox(reg, val)  writel(((val) & 0xffffffff), tp->regs + (reg))
-#define tw32_rx_mbox(reg, val)  _tw32_rx_mbox(tp, reg, val)
-#define tw32_tx_mbox(reg, val)  _tw32_tx_mbox(tp, reg, val)
+static void tg3_write32(struct tg3 *tp, u32 off, u32 val)
+{
+	writel(val, tp->regs + off);
+}
+
+static u32 tg3_read32(struct tg3 *tp, u32 off)
+{
+	return (readl(tp->regs + off)); 
+}
+
+#define tw32_mailbox(reg, val)	tp->write32_mbox(tp, reg, val)
+#define tw32_mailbox_f(reg, val)	tw32_mailbox_flush(tp, (reg), (val))
+#define tw32_rx_mbox(reg, val)	tp->write32_rx_mbox(tp, reg, val)
+#define tw32_tx_mbox(reg, val)	tp->write32_tx_mbox(tp, reg, val)
+#define tr32_mailbox(reg)	tp->read32_mbox(tp, reg)
 
-#define tw32(reg,val)		tg3_write_indirect_reg32(tp,(reg),(val))
+#define tw32(reg,val)		tp->write32(tp, reg, val)
 #define tw32_f(reg,val)		_tw32_flush(tp,(reg),(val))
-#define tw16(reg,val)		writew(((val) & 0xffff), tp->regs + (reg))
-#define tw8(reg,val)		writeb(((val) & 0xff), tp->regs + (reg))
-#define tr32(reg)		readl(tp->regs + (reg))
-#define tr16(reg)		readw(tp->regs + (reg))
-#define tr8(reg)		readb(tp->regs + (reg))
+#define tr32(reg)		tp->read32(tp, reg)
 
 static void tg3_write_mem(struct tg3 *tp, u32 off, u32 val)
 {
-	spin_lock_bh(&tp->indirect_lock);
+	unsigned long flags;
+
+	spin_lock_irqsave(&tp->indirect_lock, flags);
 	pci_write_config_dword(tp->pdev, TG3PCI_MEM_WIN_BASE_ADDR, off);
 	pci_write_config_dword(tp->pdev, TG3PCI_MEM_WIN_DATA, val);
 
 	/* Always leave this as zero. */
 	pci_write_config_dword(tp->pdev, TG3PCI_MEM_WIN_BASE_ADDR, 0);
-	spin_unlock_bh(&tp->indirect_lock);
+	spin_unlock_irqrestore(&tp->indirect_lock, flags);
 }
 
 static void tg3_read_mem(struct tg3 *tp, u32 off, u32 *val)
 {
-	spin_lock_bh(&tp->indirect_lock);
+	unsigned long flags;
+
+	spin_lock_irqsave(&tp->indirect_lock, flags);
 	pci_write_config_dword(tp->pdev, TG3PCI_MEM_WIN_BASE_ADDR, off);
 	pci_read_config_dword(tp->pdev, TG3PCI_MEM_WIN_DATA, val);
 
 	/* Always leave this as zero. */
 	pci_write_config_dword(tp->pdev, TG3PCI_MEM_WIN_BASE_ADDR, 0);
-	spin_unlock_bh(&tp->indirect_lock);
+	spin_unlock_irqrestore(&tp->indirect_lock, flags);
 }
 
 static void tg3_disable_ints(struct tg3 *tp)
 {
 	tw32(TG3PCI_MISC_HOST_CTRL,
 	     (tp->misc_host_ctrl | MISC_HOST_CTRL_MASK_PCI_INT));
-	tw32_mailbox(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, 0x00000001);
-	tr32(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW);
+	tw32_mailbox_f(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, 0x00000001);
 }
 
 static inline void tg3_cond_int(struct tg3 *tp)
@@ -439,9 +501,8 @@ static void tg3_enable_ints(struct tg3 *tp)
 
 	tw32(TG3PCI_MISC_HOST_CTRL,
 	     (tp->misc_host_ctrl & ~MISC_HOST_CTRL_MASK_PCI_INT));
-	tw32_mailbox(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW,
-		     (tp->last_tag << 24));
-	tr32(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW);
+	tw32_mailbox_f(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW,
+		       (tp->last_tag << 24));
 	tg3_cond_int(tp);
 }
 
@@ -472,8 +533,6 @@ static inline unsigned int tg3_has_work(struct tg3 *tp)
  */
 static void tg3_restart_ints(struct tg3 *tp)
 {
-	tw32(TG3PCI_MISC_HOST_CTRL,
-		(tp->misc_host_ctrl & ~MISC_HOST_CTRL_MASK_PCI_INT));
 	tw32_mailbox(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW,
 		     tp->last_tag << 24);
 	mmiowb();
@@ -3278,9 +3337,8 @@ static irqreturn_t tg3_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 			/* No work, shared interrupt perhaps?  re-enable
 			 * interrupts, and flush that PCI write
 			 */
-			tw32_mailbox(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW,
+			tw32_mailbox_f(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW,
 			     	0x00000000);
-			tr32(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW);
 		}
 	} else {	/* shared interrupt */
 		handled = 0;
@@ -3323,9 +3381,8 @@ static irqreturn_t tg3_interrupt_tagged(int irq, void *dev_id, struct pt_regs *r
 			/* no work, shared interrupt perhaps?  re-enable
 			 * interrupts, and flush that PCI write
 			 */
-			tw32_mailbox(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW,
-				     tp->last_tag << 24);
-			tr32(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW);
+			tw32_mailbox_f(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW,
+				       tp->last_tag << 24);
 		}
 	} else {	/* shared interrupt */
 		handled = 0;
@@ -4216,7 +4273,7 @@ static void tg3_stop_fw(struct tg3 *);
 static int tg3_chip_reset(struct tg3 *tp)
 {
 	u32 val;
-	u32 flags_save;
+	void (*write_op)(struct tg3 *, u32, u32);
 	int i;
 
 	if (!(tp->tg3_flags2 & TG3_FLG2_SUN_570X))
@@ -4228,8 +4285,9 @@ static int tg3_chip_reset(struct tg3 *tp)
 	 * fun things.  So, temporarily disable the 5701
 	 * hardware workaround, while we do the reset.
 	 */
-	flags_save = tp->tg3_flags;
-	tp->tg3_flags &= ~TG3_FLAG_5701_REG_WRITE_BUG;
+	write_op = tp->write32;
+	if (write_op == tg3_write_flush_reg32)
+		tp->write32 = tg3_write32;
 
 	/* do the reset */
 	val = GRC_MISC_CFG_CORECLK_RESET;
@@ -4248,8 +4306,8 @@ static int tg3_chip_reset(struct tg3 *tp)
 		val |= GRC_MISC_CFG_KEEP_GPHY_POWER;
 	tw32(GRC_MISC_CFG, val);
 
-	/* restore 5701 hardware bug workaround flag */
-	tp->tg3_flags = flags_save;
+	/* restore 5701 hardware bug workaround write method */
+	tp->write32 = write_op;
 
 	/* Unfortunately, we have to delay before the PCI read back.
 	 * Some 575X chips even will not respond to a PCI cfg access
@@ -4635,7 +4693,6 @@ static int tg3_load_firmware_cpu(struct tg3 *tp, u32 cpu_base, u32 cpu_scratch_b
 				 int cpu_scratch_size, struct fw_info *info)
 {
 	int err, i;
-	u32 orig_tg3_flags = tp->tg3_flags;
 	void (*write_op)(struct tg3 *, u32, u32);
 
 	if (cpu_base == TX_CPU_BASE &&
@@ -4651,11 +4708,6 @@ static int tg3_load_firmware_cpu(struct tg3 *tp, u32 cpu_base, u32 cpu_scratch_b
 	else
 		write_op = tg3_write_indirect_reg32;
 
-	/* Force use of PCI config space for indirect register
-	 * write calls.
-	 */
-	tp->tg3_flags |= TG3_FLAG_PCIX_TARGET_HWBUG;
-
 	/* It is possible that bootcode is still loading at this point.
 	 * Get the nvram lock first before halting the cpu.
 	 */
@@ -4691,7 +4743,6 @@ static int tg3_load_firmware_cpu(struct tg3 *tp, u32 cpu_base, u32 cpu_scratch_b
 	err = 0;
 
 out:
-	tp->tg3_flags = orig_tg3_flags;
 	return err;
 }
 
@@ -5808,8 +5859,7 @@ static int tg3_reset_hw(struct tg3 *tp)
 	tw32_f(GRC_LOCAL_CTRL, tp->grc_local_ctrl);
 	udelay(100);
 
-	tw32_mailbox(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, 0);
-	tr32(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW);
+	tw32_mailbox_f(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, 0);
 	tp->last_tag = 0;
 
 	if (!(tp->tg3_flags2 & TG3_FLG2_5705_PLUS)) {
@@ -6198,7 +6248,8 @@ static int tg3_test_interrupt(struct tg3 *tp)
 	       HOSTCC_MODE_NOW);
 
 	for (i = 0; i < 5; i++) {
-		int_mbox = tr32(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW);
+		int_mbox = tr32_mailbox(MAILBOX_INTERRUPT_0 +
+					TG3_64BIT_REG_LOW);
 		if (int_mbox != 0)
 			break;
 		msleep(10);
@@ -6598,10 +6649,10 @@ static int tg3_open(struct net_device *dev)
 
 	/* Mailboxes */
 	printk("DEBUG: SNDHOST_PROD[%08x%08x] SNDNIC_PROD[%08x%08x]\n",
-	       tr32(MAILBOX_SNDHOST_PROD_IDX_0 + 0x0),
-	       tr32(MAILBOX_SNDHOST_PROD_IDX_0 + 0x4),
-	       tr32(MAILBOX_SNDNIC_PROD_IDX_0 + 0x0),
-	       tr32(MAILBOX_SNDNIC_PROD_IDX_0 + 0x4));
+	       tr32_mailbox(MAILBOX_SNDHOST_PROD_IDX_0 + 0x0),
+	       tr32_mailbox(MAILBOX_SNDHOST_PROD_IDX_0 + 0x4),
+	       tr32_mailbox(MAILBOX_SNDNIC_PROD_IDX_0 + 0x0),
+	       tr32_mailbox(MAILBOX_SNDNIC_PROD_IDX_0 + 0x4));
 
 	/* NIC side send descriptors. */
 	for (i = 0; i < 6; i++) {
@@ -7901,7 +7952,7 @@ static int tg3_test_loopback(struct tg3 *tp)
 	num_pkts++;
 
 	tw32_tx_mbox(MAILBOX_SNDHOST_PROD_IDX_0 + TG3_64BIT_REG_LOW, send_idx);
-	tr32(MAILBOX_SNDHOST_PROD_IDX_0 + TG3_64BIT_REG_LOW);
+	tr32_mailbox(MAILBOX_SNDHOST_PROD_IDX_0 + TG3_64BIT_REG_LOW);
 
 	udelay(10);
 
@@ -9153,14 +9204,6 @@ static int __devinit tg3_is_sun_570X(struct tg3 *tp)
 static int __devinit tg3_get_invariants(struct tg3 *tp)
 {
 	static struct pci_device_id write_reorder_chipsets[] = {
-		{ PCI_DEVICE(PCI_VENDOR_ID_INTEL,
-		             PCI_DEVICE_ID_INTEL_82801AA_8) },
-		{ PCI_DEVICE(PCI_VENDOR_ID_INTEL,
-		             PCI_DEVICE_ID_INTEL_82801AB_8) },
-		{ PCI_DEVICE(PCI_VENDOR_ID_INTEL,
-		             PCI_DEVICE_ID_INTEL_82801BA_11) },
-		{ PCI_DEVICE(PCI_VENDOR_ID_INTEL,
-		             PCI_DEVICE_ID_INTEL_82801BA_6) },
 		{ PCI_DEVICE(PCI_VENDOR_ID_AMD,
 		             PCI_DEVICE_ID_AMD_FE_GATE_700C) },
 		{ },
@@ -9177,7 +9220,7 @@ static int __devinit tg3_get_invariants(struct tg3 *tp)
 		tp->tg3_flags2 |= TG3_FLG2_SUN_570X;
 #endif
 
-	/* If we have an AMD 762 or Intel ICH/ICH0/ICH2 chipset, write
+	/* If we have an AMD 762 chipset, write
 	 * reordering to the mailbox registers done by the host
 	 * controller can cause major troubles.  We read back from
 	 * every mailbox register write to force the writes to be
@@ -9215,6 +9258,69 @@ static int __devinit tg3_get_invariants(struct tg3 *tp)
 	if (tp->pci_chip_rev_id == CHIPREV_ID_5752_A0_HW)
 		tp->pci_chip_rev_id = CHIPREV_ID_5752_A0;
 
+	/* If we have 5702/03 A1 or A2 on certain ICH chipsets,
+	 * we need to disable memory and use config. cycles
+	 * only to access all registers. The 5702/03 chips
+	 * can mistakenly decode the special cycles from the
+	 * ICH chipsets as memory write cycles, causing corruption
+	 * of register and memory space. Only certain ICH bridges
+	 * will drive special cycles with non-zero data during the
+	 * address phase which can fall within the 5703's address
+	 * range. This is not an ICH bug as the PCI spec allows
+	 * non-zero address during special cycles. However, only
+	 * these ICH bridges are known to drive non-zero addresses
+	 * during special cycles.
+	 *
+	 * Since special cycles do not cross PCI bridges, we only
+	 * enable this workaround if the 5703 is on the secondary
+	 * bus of these ICH bridges.
+	 */
+	if ((tp->pci_chip_rev_id == CHIPREV_ID_5703_A1) ||
+	    (tp->pci_chip_rev_id == CHIPREV_ID_5703_A2)) {
+		static struct tg3_dev_id {
+			u32	vendor;
+			u32	device;
+			u32	rev;
+		} ich_chipsets[] = {
+			{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801AA_8,
+			  PCI_ANY_ID },
+			{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801AB_8,
+			  PCI_ANY_ID },
+			{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801BA_11,
+			  0xa },
+			{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801BA_6,
+			  PCI_ANY_ID },
+			{ },
+		};
+		struct tg3_dev_id *pci_id = &ich_chipsets[0];
+		struct pci_dev *bridge = NULL;
+
+		while (pci_id->vendor != 0) {
+			bridge = pci_get_device(pci_id->vendor, pci_id->device,
+						bridge);
+			if (!bridge) {
+				pci_id++;
+				continue;
+			}
+			if (pci_id->rev != PCI_ANY_ID) {
+				u8 rev;
+
+				pci_read_config_byte(bridge, PCI_REVISION_ID,
+						     &rev);
+				if (rev > pci_id->rev)
+					continue;
+			}
+			if (bridge->subordinate &&
+			    (bridge->subordinate->number ==
+			     tp->pdev->bus->number)) {
+
+				tp->tg3_flags2 |= TG3_FLG2_ICH_WORKAROUND;
+				pci_dev_put(bridge);
+				break;
+			}
+		}
+	}
+
 	/* Find msi capability. */
 	if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5780)
 		tp->msi_cap = pci_find_capability(tp->pdev, PCI_CAP_ID_MSI);
@@ -9302,6 +9408,12 @@ static int __devinit tg3_get_invariants(struct tg3 *tp)
 		}
 	}
 
+	/* 5700 BX chips need to have their TX producer index mailboxes
+	 * written twice to workaround a bug.
+	 */
+	if (GET_CHIP_REV(tp->pci_chip_rev_id) == CHIPREV_5700_BX)
+		tp->tg3_flags |= TG3_FLAG_TXD_MBOX_HWBUG;
+
 	/* Back to back register writes can cause problems on this chip,
 	 * the workaround is to read back all reg writes except those to
 	 * mailbox regs.  See tg3_write_indirect_reg32().
@@ -9325,6 +9437,43 @@ static int __devinit tg3_get_invariants(struct tg3 *tp)
 		pci_write_config_dword(tp->pdev, TG3PCI_PCISTATE, pci_state_reg);
 	}
 
+	/* Default fast path register access methods */
+	tp->read32 = tg3_read32;
+	tp->write32 = tg3_write32;
+	tp->read32_mbox = tg3_read32;
+	tp->write32_mbox = tg3_write32;
+	tp->write32_tx_mbox = tg3_write32;
+	tp->write32_rx_mbox = tg3_write32;
+
+	/* Various workaround register access methods */
+	if (tp->tg3_flags & TG3_FLAG_PCIX_TARGET_HWBUG)
+		tp->write32 = tg3_write_indirect_reg32;
+	else if (tp->tg3_flags & TG3_FLAG_5701_REG_WRITE_BUG)
+		tp->write32 = tg3_write_flush_reg32;
+
+	if ((tp->tg3_flags & TG3_FLAG_TXD_MBOX_HWBUG) ||
+	    (tp->tg3_flags & TG3_FLAG_MBOX_WRITE_REORDER)) {
+		tp->write32_tx_mbox = tg3_write32_tx_mbox;
+		if (tp->tg3_flags & TG3_FLAG_MBOX_WRITE_REORDER)
+			tp->write32_rx_mbox = tg3_write_flush_reg32;
+	}
+
+	if (tp->tg3_flags2 & TG3_FLG2_ICH_WORKAROUND) {
+		tp->read32 = tg3_read_indirect_reg32;
+		tp->write32 = tg3_write_indirect_reg32;
+		tp->read32_mbox = tg3_read_indirect_mbox;
+		tp->write32_mbox = tg3_write_indirect_mbox;
+		tp->write32_tx_mbox = tg3_write_indirect_mbox;
+		tp->write32_rx_mbox = tg3_write_indirect_mbox;
+
+		iounmap(tp->regs);
+		tp->regs = 0;
+
+		pci_read_config_word(tp->pdev, PCI_COMMAND, &pci_cmd);
+		pci_cmd &= ~PCI_COMMAND_MEMORY;
+		pci_write_config_word(tp->pdev, PCI_COMMAND, pci_cmd);
+	}
+
 	/* Get eeprom hw config before calling tg3_set_power_state().
 	 * In particular, the TG3_FLAG_EEPROM_WRITE_PROT flag must be
 	 * determined before calling tg3_set_power_state() so that
@@ -9539,14 +9688,6 @@ static int __devinit tg3_get_invariants(struct tg3 *tp)
 	else
 		tp->tg3_flags &= ~TG3_FLAG_POLL_SERDES;
 
-	/* 5700 BX chips need to have their TX producer index mailboxes
-	 * written twice to workaround a bug.
-	 */
-	if (GET_CHIP_REV(tp->pci_chip_rev_id) == CHIPREV_5700_BX)
-		tp->tg3_flags |= TG3_FLAG_TXD_MBOX_HWBUG;
-	else
-		tp->tg3_flags &= ~TG3_FLAG_TXD_MBOX_HWBUG;
-
 	/* It seems all chips can get confused if TX buffers
 	 * straddle the 4GB address boundary in some cases.
 	 */
@@ -10469,7 +10610,10 @@ static int __devinit tg3_init_one(struct pci_dev *pdev,
 	return 0;
 
 err_out_iounmap:
-	iounmap(tp->regs);
+	if (tp->regs) {
+		iounmap(tp->regs);
+		tp->regs = 0;
+	}
 
 err_out_free_dev:
 	free_netdev(dev);
@@ -10491,7 +10635,10 @@ static void __devexit tg3_remove_one(struct pci_dev *pdev)
 		struct tg3 *tp = netdev_priv(dev);
 
 		unregister_netdev(dev);
-		iounmap(tp->regs);
+		if (tp->regs) {
+			iounmap(tp->regs);
+			tp->regs = 0;
+		}
 		free_netdev(dev);
 		pci_release_regions(pdev);
 		pci_disable_device(pdev);
diff --git a/drivers/net/tg3.h b/drivers/net/tg3.h
index 5c4433c147f..c184b773e58 100644
--- a/drivers/net/tg3.h
+++ b/drivers/net/tg3.h
@@ -2049,6 +2049,11 @@ struct tg3 {
 	spinlock_t			lock;
 	spinlock_t			indirect_lock;
 
+	u32				(*read32) (struct tg3 *, u32);
+	void				(*write32) (struct tg3 *, u32, u32);
+	u32				(*read32_mbox) (struct tg3 *, u32);
+	void				(*write32_mbox) (struct tg3 *, u32,
+							 u32);
 	void __iomem			*regs;
 	struct net_device		*dev;
 	struct pci_dev			*pdev;
@@ -2060,6 +2065,8 @@ struct tg3 {
 	u32				msg_enable;
 
 	/* begin "tx thread" cacheline section */
+	void				(*write32_tx_mbox) (struct tg3 *, u32,
+							    u32);
 	u32				tx_prod;
 	u32				tx_cons;
 	u32				tx_pending;
@@ -2071,6 +2078,8 @@ struct tg3 {
 	dma_addr_t			tx_desc_mapping;
 
 	/* begin "rx thread" cacheline section */
+	void				(*write32_rx_mbox) (struct tg3 *, u32,
+							    u32);
 	u32				rx_rcb_ptr;
 	u32				rx_std_ptr;
 	u32				rx_jumbo_ptr;
@@ -2165,6 +2174,7 @@ struct tg3 {
 #define TG3_FLG2_ANY_SERDES		(TG3_FLG2_PHY_SERDES |	\
 					TG3_FLG2_MII_SERDES)
 #define TG3_FLG2_PARALLEL_DETECT	0x01000000
+#define TG3_FLG2_ICH_WORKAROUND		0x02000000
 
 	u32				split_mode_max_reqs;
 #define SPLIT_MODE_5704_MAX_REQ		3
diff --git a/drivers/net/wan/hdlc_generic.c b/drivers/net/wan/hdlc_generic.c
index a63f6a2cc4f..cdd4c09c2d9 100644
--- a/drivers/net/wan/hdlc_generic.c
+++ b/drivers/net/wan/hdlc_generic.c
@@ -61,7 +61,7 @@ static struct net_device_stats *hdlc_get_stats(struct net_device *dev)
 
 
 static int hdlc_rcv(struct sk_buff *skb, struct net_device *dev,
-		    struct packet_type *p)
+		    struct packet_type *p, struct net_device *orig_dev)
 {
 	hdlc_device *hdlc = dev_to_hdlc(dev);
 	if (hdlc->proto.netif_rx)
diff --git a/drivers/net/wan/lapbether.c b/drivers/net/wan/lapbether.c
index 7f2e3653c5e..6c302e9dbca 100644
--- a/drivers/net/wan/lapbether.c
+++ b/drivers/net/wan/lapbether.c
@@ -86,7 +86,7 @@ static __inline__ int dev_is_ethdev(struct net_device *dev)
 /*
  *	Receive a LAPB frame via an ethernet interface.
  */
-static int lapbeth_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *ptype)
+static int lapbeth_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *ptype, struct net_device *orig_dev)
 {
 	int len, err;
 	struct lapbethdev *lapbeth;
diff --git a/drivers/net/wan/sdla_fr.c b/drivers/net/wan/sdla_fr.c
index c5f5e62aab8..0497dbdb863 100644
--- a/drivers/net/wan/sdla_fr.c
+++ b/drivers/net/wan/sdla_fr.c
@@ -445,7 +445,7 @@ void 	s508_s514_unlock(sdla_t *card, unsigned long *smp_flags);
 void 	s508_s514_lock(sdla_t *card, unsigned long *smp_flags);
 
 unsigned short calc_checksum (char *, int);
-static int setup_fr_header(struct sk_buff** skb,
+static int setup_fr_header(struct sk_buff *skb,
 			   struct net_device* dev, char op_mode);
 
 
@@ -1372,7 +1372,7 @@ static int if_send(struct sk_buff* skb, struct net_device* dev)
 	/* Move the if_header() code to here. By inserting frame
 	 * relay header in if_header() we would break the
 	 * tcpdump and other packet sniffers */
-	chan->fr_header_len = setup_fr_header(&skb,dev,chan->common.usedby);
+	chan->fr_header_len = setup_fr_header(skb,dev,chan->common.usedby);
 	if (chan->fr_header_len < 0 ){
 		++chan->ifstats.tx_dropped;
 		++card->wandev.stats.tx_dropped;
@@ -1597,8 +1597,6 @@ static int setup_for_delayed_transmit(struct net_device* dev,
 		return 1;
 	}
 
-	skb_unlink(skb);
-	
         chan->transmit_length = len;
 	chan->delay_skb = skb;
         
@@ -4871,18 +4869,15 @@ static void unconfig_fr (sdla_t *card)
 	}
 }
 
-static int setup_fr_header(struct sk_buff **skb_orig, struct net_device* dev,
+static int setup_fr_header(struct sk_buff *skb, struct net_device* dev,
 			   char op_mode)
 {
-	struct sk_buff *skb = *skb_orig;
 	fr_channel_t *chan=dev->priv;
 
-	if (op_mode == WANPIPE){
-
+	if (op_mode == WANPIPE) {
 		chan->fr_header[0]=Q922_UI;
 		
 		switch (htons(skb->protocol)){
-			
 		case ETH_P_IP:
 			chan->fr_header[1]=NLPID_IP;
 			break;
@@ -4894,16 +4889,14 @@ static int setup_fr_header(struct sk_buff **skb_orig, struct net_device* dev,
 	}
 
 	/* If we are in bridging mode, we must apply
-	 * an Ethernet header */
-	if (op_mode == BRIDGE || op_mode == BRIDGE_NODE){
-
-
+	 * an Ethernet header
+	 */
+	if (op_mode == BRIDGE || op_mode == BRIDGE_NODE) {
 		/* Encapsulate the packet as a bridged Ethernet frame. */
 #ifdef DEBUG
 		printk(KERN_INFO "%s: encapsulating skb for frame relay\n", 
 			dev->name);
 #endif
-		
 		chan->fr_header[0] = 0x03;
 		chan->fr_header[1] = 0x00;
 		chan->fr_header[2] = 0x80;
@@ -4916,7 +4909,6 @@ static int setup_fr_header(struct sk_buff **skb_orig, struct net_device* dev,
 		/* Yuck. */
 		skb->protocol = ETH_P_802_3;
 		return 8;
-
 	}
 		
 	return 0;
diff --git a/drivers/net/wan/syncppp.c b/drivers/net/wan/syncppp.c
index 84b65c60c79..f58c794a963 100644
--- a/drivers/net/wan/syncppp.c
+++ b/drivers/net/wan/syncppp.c
@@ -1447,7 +1447,7 @@ static void sppp_print_bytes (u_char *p, u16 len)
  *	after interrupt servicing to process frames queued via netif_rx.
  */
 
-static int sppp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *p)
+static int sppp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *p, struct net_device *orig_dev)
 {
 	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
 		return NET_RX_DROP;
diff --git a/drivers/scsi/ibmvscsi/ibmvscsi.c b/drivers/scsi/ibmvscsi/ibmvscsi.c
index fe09d145542..2cb3c8340ca 100644
--- a/drivers/scsi/ibmvscsi/ibmvscsi.c
+++ b/drivers/scsi/ibmvscsi/ibmvscsi.c
@@ -1442,7 +1442,7 @@ static int ibmvscsi_remove(struct vio_dev *vdev)
  */
 static struct vio_device_id ibmvscsi_device_table[] __devinitdata = {
 	{"vscsi", "IBM,v-scsi"},
-	{0,}
+	{ "", "" }
 };
 
 MODULE_DEVICE_TABLE(vio, ibmvscsi_device_table);
diff --git a/drivers/scsi/ibmvscsi/rpa_vscsi.c b/drivers/scsi/ibmvscsi/rpa_vscsi.c
index 035f615817d..8bf5652f106 100644
--- a/drivers/scsi/ibmvscsi/rpa_vscsi.c
+++ b/drivers/scsi/ibmvscsi/rpa_vscsi.c
@@ -28,6 +28,7 @@
  */
 
 #include <asm/vio.h>
+#include <asm/prom.h>
 #include <asm/iommu.h>
 #include <asm/hvcall.h>
 #include <linux/dma-mapping.h>
diff --git a/drivers/usb/net/usbnet.c b/drivers/usb/net/usbnet.c
index 4528a00c45b..a2f67245f6d 100644
--- a/drivers/usb/net/usbnet.c
+++ b/drivers/usb/net/usbnet.c
@@ -2903,19 +2903,18 @@ static struct net_device_stats *usbnet_get_stats (struct net_device *net)
  * completion callbacks.  2.5 should have fixed those bugs...
  */
 
-static void defer_bh (struct usbnet *dev, struct sk_buff *skb)
+static void defer_bh(struct usbnet *dev, struct sk_buff *skb, struct sk_buff_head *list)
 {
-	struct sk_buff_head	*list = skb->list;
 	unsigned long		flags;
 
-	spin_lock_irqsave (&list->lock, flags);
-	__skb_unlink (skb, list);
-	spin_unlock (&list->lock);
-	spin_lock (&dev->done.lock);
-	__skb_queue_tail (&dev->done, skb);
+	spin_lock_irqsave(&list->lock, flags);
+	__skb_unlink(skb, list);
+	spin_unlock(&list->lock);
+	spin_lock(&dev->done.lock);
+	__skb_queue_tail(&dev->done, skb);
 	if (dev->done.qlen == 1)
-		tasklet_schedule (&dev->bh);
-	spin_unlock_irqrestore (&dev->done.lock, flags);
+		tasklet_schedule(&dev->bh);
+	spin_unlock_irqrestore(&dev->done.lock, flags);
 }
 
 /* some work can't be done in tasklets, so we use keventd
@@ -3120,7 +3119,7 @@ block:
 		break;
 	}
 
-	defer_bh (dev, skb);
+	defer_bh(dev, skb, &dev->rxq);
 
 	if (urb) {
 		if (netif_running (dev->net)
@@ -3490,7 +3489,7 @@ static void tx_complete (struct urb *urb, struct pt_regs *regs)
 
 	urb->dev = NULL;
 	entry->state = tx_done;
-	defer_bh (dev, skb);
+	defer_bh(dev, skb, &dev->txq);
 }
 
 /*-------------------------------------------------------------------------*/
diff --git a/drivers/w1/w1_int.c b/drivers/w1/w1_int.c
index b5a5e04b6d3..498ad505fa5 100644
--- a/drivers/w1/w1_int.c
+++ b/drivers/w1/w1_int.c
@@ -86,9 +86,9 @@ static struct w1_master * w1_alloc_dev(u32 id, int slave_count, int slave_ttl,
 
 	dev->driver = driver;
 
-	dev->groups = 23;
+	dev->groups = 1;
 	dev->seq = 1;
-	dev->nls = netlink_kernel_create(NETLINK_W1, NULL);
+	dev->nls = netlink_kernel_create(NETLINK_W1, 1, NULL, THIS_MODULE);
 	if (!dev->nls) {
 		printk(KERN_ERR "Failed to create new netlink socket(%u) for w1 master %s.\n",
 			NETLINK_NFLOG, dev->dev.bus_id);
@@ -225,3 +225,5 @@ void w1_remove_master_device(struct w1_bus_master *bm)
 
 EXPORT_SYMBOL(w1_add_master_device);
 EXPORT_SYMBOL(w1_remove_master_device);
+
+MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_W1);
diff --git a/drivers/w1/w1_netlink.c b/drivers/w1/w1_netlink.c
index 2a82fb055c7..e7b774423dd 100644
--- a/drivers/w1/w1_netlink.c
+++ b/drivers/w1/w1_netlink.c
@@ -51,7 +51,7 @@ void w1_netlink_send(struct w1_master *dev, struct w1_netlink_msg *msg)
 
 	memcpy(data, msg, sizeof(struct w1_netlink_msg));
 
-	NETLINK_CB(skb).dst_groups = dev->groups;
+	NETLINK_CB(skb).dst_group = dev->groups;
 	netlink_broadcast(dev->nls, skb, 0, dev->groups, GFP_ATOMIC);
 
 nlmsg_failure:
diff --git a/fs/smbfs/sock.c b/fs/smbfs/sock.c
index 93f3cd22a2e..6815b1b12b6 100644
--- a/fs/smbfs/sock.c
+++ b/fs/smbfs/sock.c
@@ -15,12 +15,12 @@
 #include <linux/file.h>
 #include <linux/in.h>
 #include <linux/net.h>
-#include <linux/tcp.h>
 #include <linux/mm.h>
 #include <linux/netdevice.h>
 #include <linux/smp_lock.h>
 #include <linux/workqueue.h>
 #include <net/scm.h>
+#include <net/tcp_states.h>
 #include <net/ip.h>
 
 #include <linux/smb_fs.h>
diff --git a/include/asm-alpha/socket.h b/include/asm-alpha/socket.h
index d00259d3dc7..b5193229132 100644
--- a/include/asm-alpha/socket.h
+++ b/include/asm-alpha/socket.h
@@ -25,6 +25,8 @@
 #define SO_ERROR	0x1007
 #define SO_SNDBUF	0x1001
 #define SO_RCVBUF	0x1002
+#define SO_SNDBUFFORCE	0x100a
+#define SO_RCVBUFFORCE	0x100b
 #define	SO_RCVLOWAT	0x1010
 #define	SO_SNDLOWAT	0x1011
 #define	SO_RCVTIMEO	0x1012
diff --git a/include/asm-arm/socket.h b/include/asm-arm/socket.h
index 46d20585d95..3c51da6438c 100644
--- a/include/asm-arm/socket.h
+++ b/include/asm-arm/socket.h
@@ -14,6 +14,8 @@
 #define SO_BROADCAST	6
 #define SO_SNDBUF	7
 #define SO_RCVBUF	8
+#define SO_SNDBUFFORCE	32
+#define SO_RCVBUFFORCE	33
 #define SO_KEEPALIVE	9
 #define SO_OOBINLINE	10
 #define SO_NO_CHECK	11
diff --git a/include/asm-arm26/socket.h b/include/asm-arm26/socket.h
index 46d20585d95..3c51da6438c 100644
--- a/include/asm-arm26/socket.h
+++ b/include/asm-arm26/socket.h
@@ -14,6 +14,8 @@
 #define SO_BROADCAST	6
 #define SO_SNDBUF	7
 #define SO_RCVBUF	8
+#define SO_SNDBUFFORCE	32
+#define SO_RCVBUFFORCE	33
 #define SO_KEEPALIVE	9
 #define SO_OOBINLINE	10
 #define SO_NO_CHECK	11
diff --git a/include/asm-cris/socket.h b/include/asm-cris/socket.h
index f159b4f165f..8b1da3e58c5 100644
--- a/include/asm-cris/socket.h
+++ b/include/asm-cris/socket.h
@@ -16,6 +16,8 @@
 #define SO_BROADCAST	6
 #define SO_SNDBUF	7
 #define SO_RCVBUF	8
+#define SO_SNDBUFFORCE	32
+#define SO_RCVBUFFORCE	33
 #define SO_KEEPALIVE	9
 #define SO_OOBINLINE	10
 #define SO_NO_CHECK	11
diff --git a/include/asm-frv/socket.h b/include/asm-frv/socket.h
index c3be17c7de4..7177f8b9817 100644
--- a/include/asm-frv/socket.h
+++ b/include/asm-frv/socket.h
@@ -14,6 +14,8 @@
 #define SO_BROADCAST	6
 #define SO_SNDBUF	7
 #define SO_RCVBUF	8
+#define SO_SNDBUFFORCE	32
+#define SO_RCVBUFFORCE	33
 #define SO_KEEPALIVE	9
 #define SO_OOBINLINE	10
 #define SO_NO_CHECK	11
diff --git a/include/asm-h8300/socket.h b/include/asm-h8300/socket.h
index af33b8525dc..d98cf85bafc 100644
--- a/include/asm-h8300/socket.h
+++ b/include/asm-h8300/socket.h
@@ -14,6 +14,8 @@
 #define SO_BROADCAST	6
 #define SO_SNDBUF	7
 #define SO_RCVBUF	8
+#define SO_SNDBUFFORCE	32
+#define SO_RCVBUFFORCE	33
 #define SO_KEEPALIVE	9
 #define SO_OOBINLINE	10
 #define SO_NO_CHECK	11
diff --git a/include/asm-i386/checksum.h b/include/asm-i386/checksum.h
index f949e44c2a3..67d3630c4e8 100644
--- a/include/asm-i386/checksum.h
+++ b/include/asm-i386/checksum.h
@@ -83,7 +83,7 @@ static inline unsigned short ip_fast_csum(unsigned char * iph,
 	    "adcl $0, %0	;\n"
 	    "notl %0		;\n"
 "2:				;\n"
-	/* Since the input registers which are loaded with iph and ipl
+	/* Since the input registers which are loaded with iph and ihl
 	   are modified, we must also specify them as outputs, or gcc
 	   will assume they contain their original values. */
 	: "=r" (sum), "=r" (iph), "=r" (ihl)
diff --git a/include/asm-i386/socket.h b/include/asm-i386/socket.h
index 07f6b38ad14..802ae76195b 100644
--- a/include/asm-i386/socket.h
+++ b/include/asm-i386/socket.h
@@ -14,6 +14,8 @@
 #define SO_BROADCAST	6
 #define SO_SNDBUF	7
 #define SO_RCVBUF	8
+#define SO_SNDBUFFORCE	32
+#define SO_RCVBUFFORCE	33
 #define SO_KEEPALIVE	9
 #define SO_OOBINLINE	10
 #define SO_NO_CHECK	11
diff --git a/include/asm-ia64/socket.h b/include/asm-ia64/socket.h
index 21a9f10d6ba..a255006fb7b 100644
--- a/include/asm-ia64/socket.h
+++ b/include/asm-ia64/socket.h
@@ -23,6 +23,8 @@
 #define SO_BROADCAST	6
 #define SO_SNDBUF	7
 #define SO_RCVBUF	8
+#define SO_SNDBUFFORCE	32
+#define SO_RCVBUFFORCE	33
 #define SO_KEEPALIVE	9
 #define SO_OOBINLINE	10
 #define SO_NO_CHECK	11
diff --git a/include/asm-m32r/checksum.h b/include/asm-m32r/checksum.h
index 99f37dbf255..877ebf46e9f 100644
--- a/include/asm-m32r/checksum.h
+++ b/include/asm-m32r/checksum.h
@@ -105,7 +105,7 @@ static inline unsigned short ip_fast_csum(unsigned char * iph,
 		"	addx	%0, %3 \n"
 		"	.fillinsn\n"
 		"2: \n"
-	/* Since the input registers which are loaded with iph and ipl
+	/* Since the input registers which are loaded with iph and ihl
 	   are modified, we must also specify them as outputs, or gcc
 	   will assume they contain their original values. */
 	: "=&r" (sum), "=r" (iph), "=r" (ihl), "=&r" (tmpreg0), "=&r" (tmpreg1)
diff --git a/include/asm-m32r/socket.h b/include/asm-m32r/socket.h
index 159519d9904..8b6680f223c 100644
--- a/include/asm-m32r/socket.h
+++ b/include/asm-m32r/socket.h
@@ -14,6 +14,8 @@
 #define SO_BROADCAST	6
 #define SO_SNDBUF	7
 #define SO_RCVBUF	8
+#define SO_SNDBUFFORCE	32
+#define SO_RCVBUFFORCE	33
 #define SO_KEEPALIVE	9
 #define SO_OOBINLINE	10
 #define SO_NO_CHECK	11
diff --git a/include/asm-m68k/socket.h b/include/asm-m68k/socket.h
index 8d0b9fc2d07..f578ca4b776 100644
--- a/include/asm-m68k/socket.h
+++ b/include/asm-m68k/socket.h
@@ -14,6 +14,8 @@
 #define SO_BROADCAST	6
 #define SO_SNDBUF	7
 #define SO_RCVBUF	8
+#define SO_SNDBUFFORCE	32
+#define SO_RCVBUFFORCE	33
 #define SO_KEEPALIVE	9
 #define SO_OOBINLINE	10
 #define SO_NO_CHECK	11
diff --git a/include/asm-mips/socket.h b/include/asm-mips/socket.h
index 020b4db70ee..d478a86294e 100644
--- a/include/asm-mips/socket.h
+++ b/include/asm-mips/socket.h
@@ -37,6 +37,8 @@ To add: #define SO_REUSEPORT 0x0200	/* Allow local address and port reuse.  */
 #define SO_ERROR	0x1007	/* get error status and clear */
 #define SO_SNDBUF	0x1001	/* Send buffer size. */
 #define SO_RCVBUF	0x1002	/* Receive buffer. */
+#define SO_SNDBUFFORCE	0x100a
+#define SO_RCVBUFFORCE	0x100b
 #define SO_SNDLOWAT	0x1003	/* send low-water mark */
 #define SO_RCVLOWAT	0x1004	/* receive low-water mark */
 #define SO_SNDTIMEO	0x1005	/* send timeout */
diff --git a/include/asm-parisc/socket.h b/include/asm-parisc/socket.h
index 4a77996c186..1bf54dc53c1 100644
--- a/include/asm-parisc/socket.h
+++ b/include/asm-parisc/socket.h
@@ -16,6 +16,8 @@
 /* To add :#define SO_REUSEPORT 0x0200 */
 #define SO_SNDBUF	0x1001
 #define SO_RCVBUF	0x1002
+#define SO_SNDBUFFORCE	0x100a
+#define SO_RCVBUFFORCE	0x100b
 #define SO_SNDLOWAT	0x1003
 #define SO_RCVLOWAT	0x1004
 #define SO_SNDTIMEO	0x1005
diff --git a/include/asm-ppc64/8253pit.h b/include/asm-powerpc/8253pit.h
index 285f78488cc..862708a749b 100644
--- a/include/asm-ppc64/8253pit.h
+++ b/include/asm-powerpc/8253pit.h
@@ -5,6 +5,6 @@
 #ifndef _8253PIT_H
 #define _8253PIT_H
 
-#define PIT_TICK_RATE 	1193182UL
+#define PIT_TICK_RATE	1193182UL
 
 #endif
diff --git a/include/asm-ppc/agp.h b/include/asm-powerpc/agp.h
index ca9e423307f..ca9e423307f 100644
--- a/include/asm-ppc/agp.h
+++ b/include/asm-powerpc/agp.h
diff --git a/include/asm-powerpc/cputime.h b/include/asm-powerpc/cputime.h
new file mode 100644
index 00000000000..6d68ad7e0ea
--- /dev/null
+++ b/include/asm-powerpc/cputime.h
@@ -0,0 +1 @@
+#include <asm-generic/cputime.h>
diff --git a/include/asm-ppc/div64.h b/include/asm-powerpc/div64.h
index 6cd978cefb2..6cd978cefb2 100644
--- a/include/asm-ppc/div64.h
+++ b/include/asm-powerpc/div64.h
diff --git a/include/asm-powerpc/emergency-restart.h b/include/asm-powerpc/emergency-restart.h
new file mode 100644
index 00000000000..3711bd9d50b
--- /dev/null
+++ b/include/asm-powerpc/emergency-restart.h
@@ -0,0 +1 @@
+#include <asm-generic/emergency-restart.h>
diff --git a/include/asm-ppc/errno.h b/include/asm-powerpc/errno.h
index 19f20bd41ae..19f20bd41ae 100644
--- a/include/asm-ppc/errno.h
+++ b/include/asm-powerpc/errno.h
diff --git a/include/asm-ppc/ioctl.h b/include/asm-powerpc/ioctl.h
index 93c6acfdd0f..93c6acfdd0f 100644
--- a/include/asm-ppc/ioctl.h
+++ b/include/asm-powerpc/ioctl.h
diff --git a/include/asm-ppc/ioctls.h b/include/asm-powerpc/ioctls.h
index f5b7f2b055e..f5b7f2b055e 100644
--- a/include/asm-ppc/ioctls.h
+++ b/include/asm-powerpc/ioctls.h
diff --git a/include/asm-ppc/ipc.h b/include/asm-powerpc/ipc.h
index a46e3d9c2a3..a46e3d9c2a3 100644
--- a/include/asm-ppc/ipc.h
+++ b/include/asm-powerpc/ipc.h
diff --git a/include/asm-ppc/linkage.h b/include/asm-powerpc/linkage.h
index 291c2d01c44..291c2d01c44 100644
--- a/include/asm-ppc/linkage.h
+++ b/include/asm-powerpc/linkage.h
diff --git a/include/asm-ppc64/local.h b/include/asm-powerpc/local.h
index c11c530f74d..c11c530f74d 100644
--- a/include/asm-ppc64/local.h
+++ b/include/asm-powerpc/local.h
diff --git a/include/asm-ppc/namei.h b/include/asm-powerpc/namei.h
index 29c9ec83213..29c9ec83213 100644
--- a/include/asm-ppc/namei.h
+++ b/include/asm-powerpc/namei.h
diff --git a/include/asm-powerpc/percpu.h b/include/asm-powerpc/percpu.h
new file mode 100644
index 00000000000..06a959d6723
--- /dev/null
+++ b/include/asm-powerpc/percpu.h
@@ -0,0 +1 @@
+#include <asm-generic/percpu.h>
diff --git a/include/asm-ppc/poll.h b/include/asm-powerpc/poll.h
index be5024913c6..be5024913c6 100644
--- a/include/asm-ppc/poll.h
+++ b/include/asm-powerpc/poll.h
diff --git a/include/asm-powerpc/resource.h b/include/asm-powerpc/resource.h
new file mode 100644
index 00000000000..04bc4db8921
--- /dev/null
+++ b/include/asm-powerpc/resource.h
@@ -0,0 +1 @@
+#include <asm-generic/resource.h>
diff --git a/include/asm-ppc/shmparam.h b/include/asm-powerpc/shmparam.h
index d6250602ae6..d6250602ae6 100644
--- a/include/asm-ppc/shmparam.h
+++ b/include/asm-powerpc/shmparam.h
diff --git a/include/asm-ppc/string.h b/include/asm-powerpc/string.h
index 22557599739..22557599739 100644
--- a/include/asm-ppc/string.h
+++ b/include/asm-powerpc/string.h
diff --git a/include/asm-ppc/unaligned.h b/include/asm-powerpc/unaligned.h
index 45520d9b85d..45520d9b85d 100644
--- a/include/asm-ppc/unaligned.h
+++ b/include/asm-powerpc/unaligned.h
diff --git a/include/asm-ppc/xor.h b/include/asm-powerpc/xor.h
index c82eb12a5b1..c82eb12a5b1 100644
--- a/include/asm-ppc/xor.h
+++ b/include/asm-powerpc/xor.h
diff --git a/include/asm-ppc/8253pit.h b/include/asm-ppc/8253pit.h
deleted file mode 100644
index 285f78488cc..00000000000
--- a/include/asm-ppc/8253pit.h
+++ /dev/null
@@ -1,10 +0,0 @@
-/*
- * 8253/8254 Programmable Interval Timer
- */
-
-#ifndef _8253PIT_H
-#define _8253PIT_H
-
-#define PIT_TICK_RATE 	1193182UL
-
-#endif
diff --git a/include/asm-ppc/cputime.h b/include/asm-ppc/cputime.h
deleted file mode 100644
index 8e9faf5ce72..00000000000
--- a/include/asm-ppc/cputime.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef __PPC_CPUTIME_H
-#define __PPC_CPUTIME_H
-
-#include <asm-generic/cputime.h>
-
-#endif /* __PPC_CPUTIME_H */
diff --git a/include/asm-ppc/emergency-restart.h b/include/asm-ppc/emergency-restart.h
deleted file mode 100644
index 108d8c48e42..00000000000
--- a/include/asm-ppc/emergency-restart.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _ASM_EMERGENCY_RESTART_H
-#define _ASM_EMERGENCY_RESTART_H
-
-#include <asm-generic/emergency-restart.h>
-
-#endif /* _ASM_EMERGENCY_RESTART_H */
diff --git a/include/asm-ppc/hdreg.h b/include/asm-ppc/hdreg.h
deleted file mode 100644
index 7f7fd1af0af..00000000000
--- a/include/asm-ppc/hdreg.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/hdreg.h>
diff --git a/include/asm-ppc/local.h b/include/asm-ppc/local.h
deleted file mode 100644
index b08e3eced10..00000000000
--- a/include/asm-ppc/local.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef __PPC_LOCAL_H
-#define __PPC_LOCAL_H
-
-#include <asm-generic/local.h>
-
-#endif /* __PPC_LOCAL_H */
diff --git a/include/asm-ppc/percpu.h b/include/asm-ppc/percpu.h
deleted file mode 100644
index d66667cd587..00000000000
--- a/include/asm-ppc/percpu.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef __ARCH_PPC_PERCPU__
-#define __ARCH_PPC_PERCPU__
-
-#include <asm-generic/percpu.h>
-
-#endif /* __ARCH_PPC_PERCPU__ */
diff --git a/include/asm-ppc/resource.h b/include/asm-ppc/resource.h
deleted file mode 100644
index 86a1ea23a6e..00000000000
--- a/include/asm-ppc/resource.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _PPC_RESOURCE_H
-#define _PPC_RESOURCE_H
-
-#include <asm-generic/resource.h>
-
-#endif
diff --git a/include/asm-ppc/socket.h b/include/asm-ppc/socket.h
index 4134376b0f6..296e1a3469d 100644
--- a/include/asm-ppc/socket.h
+++ b/include/asm-ppc/socket.h
@@ -20,6 +20,8 @@
 #define SO_BROADCAST	6
 #define SO_SNDBUF	7
 #define SO_RCVBUF	8
+#define SO_SNDBUFFORCE	32
+#define SO_RCVBUFFORCE	33
 #define SO_KEEPALIVE	9
 #define SO_OOBINLINE	10
 #define SO_NO_CHECK	11
diff --git a/include/asm-ppc64/abs_addr.h b/include/asm-ppc64/abs_addr.h
index 6d4e8e78705..84c24d4cdb7 100644
--- a/include/asm-ppc64/abs_addr.h
+++ b/include/asm-ppc64/abs_addr.h
@@ -16,93 +16,51 @@
 #include <asm/page.h>
 #include <asm/prom.h>
 #include <asm/lmb.h>
+#include <asm/firmware.h>
 
-typedef u32 msChunks_entry;
-struct msChunks {
+struct mschunks_map {
         unsigned long num_chunks;
         unsigned long chunk_size;
         unsigned long chunk_shift;
         unsigned long chunk_mask;
-        msChunks_entry *abs;
+        u32 *mapping;
 };
 
-extern struct msChunks msChunks;
+extern struct mschunks_map mschunks_map;
 
-extern unsigned long msChunks_alloc(unsigned long, unsigned long, unsigned long);
-extern unsigned long reloc_offset(void);
+/* Chunks are 256 KB */
+#define MSCHUNKS_CHUNK_SHIFT	(18)
+#define MSCHUNKS_CHUNK_SIZE	(1UL << MSCHUNKS_CHUNK_SHIFT)
+#define MSCHUNKS_OFFSET_MASK	(MSCHUNKS_CHUNK_SIZE - 1)
 
-#ifdef CONFIG_MSCHUNKS
-
-static inline unsigned long
-chunk_to_addr(unsigned long chunk)
+static inline unsigned long chunk_to_addr(unsigned long chunk)
 {
-	unsigned long offset = reloc_offset();
-	struct msChunks *_msChunks = PTRRELOC(&msChunks);
-
-	return chunk << _msChunks->chunk_shift;
+	return chunk << MSCHUNKS_CHUNK_SHIFT;
 }
 
-static inline unsigned long
-addr_to_chunk(unsigned long addr)
+static inline unsigned long addr_to_chunk(unsigned long addr)
 {
-	unsigned long offset = reloc_offset();
-	struct msChunks *_msChunks = PTRRELOC(&msChunks);
-
-	return addr >> _msChunks->chunk_shift;
+	return addr >> MSCHUNKS_CHUNK_SHIFT;
 }
 
-static inline unsigned long
-chunk_offset(unsigned long addr)
+static inline unsigned long phys_to_abs(unsigned long pa)
 {
-	unsigned long offset = reloc_offset();
-	struct msChunks *_msChunks = PTRRELOC(&msChunks);
+	unsigned long chunk;
 
-	return addr & _msChunks->chunk_mask;
-}
+	/* This is a no-op on non-iSeries */
+	if (!firmware_has_feature(FW_FEATURE_ISERIES))
+		return pa;
 
-static inline unsigned long
-abs_chunk(unsigned long pchunk)
-{
-	unsigned long offset = reloc_offset();
-	struct msChunks *_msChunks = PTRRELOC(&msChunks);
-	if ( pchunk >= _msChunks->num_chunks ) {
-		return pchunk;
-	}
-	return PTRRELOC(_msChunks->abs)[pchunk];
-}
+	chunk = addr_to_chunk(pa);
 
-/* A macro so it can take pointers or unsigned long. */
-#define phys_to_abs(pa)						     \
-	({ unsigned long _pa = (unsigned long)(pa);			     \
-	   chunk_to_addr(abs_chunk(addr_to_chunk(_pa))) + chunk_offset(_pa); \
-	})
+	if (chunk < mschunks_map.num_chunks)
+		chunk = mschunks_map.mapping[chunk];
 
-static inline unsigned long
-physRpn_to_absRpn(unsigned long rpn)
-{
-	unsigned long pa = rpn << PAGE_SHIFT;
-	unsigned long aa = phys_to_abs(pa);
-	return (aa >> PAGE_SHIFT);
+	return chunk_to_addr(chunk) + (pa & MSCHUNKS_OFFSET_MASK);
 }
 
-/* A macro so it can take pointers or unsigned long. */
-#define abs_to_phys(aa) lmb_abs_to_phys((unsigned long)(aa))
-
-#else  /* !CONFIG_MSCHUNKS */
-
-#define chunk_to_addr(chunk) ((unsigned long)(chunk))
-#define addr_to_chunk(addr) (addr)
-#define chunk_offset(addr) (0)
-#define abs_chunk(pchunk) (pchunk)
-
-#define phys_to_abs(pa) (pa)
-#define physRpn_to_absRpn(rpn) (rpn)
-#define abs_to_phys(aa) (aa)
-
-#endif /* !CONFIG_MSCHUNKS */
-
 /* Convenience macros */
 #define virt_to_abs(va) phys_to_abs(__pa(va))
-#define abs_to_virt(aa) __va(abs_to_phys(aa))
+#define abs_to_virt(aa) __va(aa)
 
 #endif /* _ABS_ADDR_H */
diff --git a/include/asm-ppc64/agp.h b/include/asm-ppc64/agp.h
deleted file mode 100644
index ca9e423307f..00000000000
--- a/include/asm-ppc64/agp.h
+++ /dev/null
@@ -1,23 +0,0 @@
-#ifndef AGP_H
-#define AGP_H 1
-
-#include <asm/io.h>
-
-/* nothing much needed here */
-
-#define map_page_into_agp(page)
-#define unmap_page_from_agp(page)
-#define flush_agp_mappings()
-#define flush_agp_cache() mb()
-
-/* Convert a physical address to an address suitable for the GART. */
-#define phys_to_gart(x) (x)
-#define gart_to_phys(x) (x)
-
-/* GATT allocation. Returns/accepts GATT kernel virtual address. */
-#define alloc_gatt_pages(order)		\
-	((char *)__get_free_pages(GFP_KERNEL, (order)))
-#define free_gatt_pages(table, order)	\
-	free_pages((unsigned long)(table), (order))
-
-#endif
diff --git a/include/asm-ppc64/cputable.h b/include/asm-ppc64/cputable.h
index d67fa9e2607..ae6cf383010 100644
--- a/include/asm-ppc64/cputable.h
+++ b/include/asm-ppc64/cputable.h
@@ -56,11 +56,6 @@ struct cpu_spec {
 	 * BHT, SPD, etc... from head.S before branching to identify_machine
 	 */
 	cpu_setup_t	cpu_setup;
-
-	/* This is used to identify firmware features which are available
-	 * to the kernel.
-	 */
-	unsigned long   firmware_features;
 };
 
 extern struct cpu_spec		cpu_specs[];
@@ -71,39 +66,6 @@ static inline unsigned long cpu_has_feature(unsigned long feature)
 	return cur_cpu_spec->cpu_features & feature;
 }
 
-
-/* firmware feature bitmask values */
-#define FIRMWARE_MAX_FEATURES 63
-
-#define FW_FEATURE_PFT		(1UL<<0)
-#define FW_FEATURE_TCE		(1UL<<1)	
-#define FW_FEATURE_SPRG0	(1UL<<2)	
-#define FW_FEATURE_DABR		(1UL<<3)	
-#define FW_FEATURE_COPY		(1UL<<4)	
-#define FW_FEATURE_ASR		(1UL<<5)	
-#define FW_FEATURE_DEBUG	(1UL<<6)	
-#define FW_FEATURE_TERM		(1UL<<7)	
-#define FW_FEATURE_PERF		(1UL<<8)	
-#define FW_FEATURE_DUMP		(1UL<<9)	
-#define FW_FEATURE_INTERRUPT	(1UL<<10)	
-#define FW_FEATURE_MIGRATE	(1UL<<11)	
-#define FW_FEATURE_PERFMON	(1UL<<12)	
-#define FW_FEATURE_CRQ   	(1UL<<13)	
-#define FW_FEATURE_VIO   	(1UL<<14)	
-#define FW_FEATURE_RDMA   	(1UL<<15)	
-#define FW_FEATURE_LLAN   	(1UL<<16)	
-#define FW_FEATURE_BULK   	(1UL<<17)	
-#define FW_FEATURE_XDABR   	(1UL<<18)	
-#define FW_FEATURE_MULTITCE   	(1UL<<19)	
-#define FW_FEATURE_SPLPAR   	(1UL<<20)	
-
-typedef struct {
-    unsigned long val;
-    char * name;
-} firmware_feature_t;
-
-extern firmware_feature_t firmware_features_table[];
-
 #endif /* __ASSEMBLY__ */
 
 /* CPU kernel features */
@@ -140,10 +102,8 @@ extern firmware_feature_t firmware_features_table[];
 #define CPU_FTR_MMCRA_SIHV		ASM_CONST(0x0000080000000000)
 #define CPU_FTR_CTRL			ASM_CONST(0x0000100000000000)
 
-/* Platform firmware features */
-#define FW_FTR_				ASM_CONST(0x0000000000000001)
-
 #ifndef __ASSEMBLY__
+
 #define COMMON_USER_PPC64	(PPC_FEATURE_32 | PPC_FEATURE_64 | \
 			         PPC_FEATURE_HAS_FPU | PPC_FEATURE_HAS_MMU)
 
@@ -156,10 +116,9 @@ extern firmware_feature_t firmware_features_table[];
 #define CPU_FTR_PPCAS_ARCH_V2	(CPU_FTR_PPCAS_ARCH_V2_BASE)
 #else
 #define CPU_FTR_PPCAS_ARCH_V2	(CPU_FTR_PPCAS_ARCH_V2_BASE | CPU_FTR_16M_PAGE)
-#endif
+#endif /* CONFIG_PPC_ISERIES */
 
-#define COMMON_PPC64_FW	(0)
-#endif
+#endif /* __ASSEMBLY */
 
 #ifdef __ASSEMBLY__
 
diff --git a/include/asm-ppc64/cputime.h b/include/asm-ppc64/cputime.h
deleted file mode 100644
index 8e9faf5ce72..00000000000
--- a/include/asm-ppc64/cputime.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef __PPC_CPUTIME_H
-#define __PPC_CPUTIME_H
-
-#include <asm-generic/cputime.h>
-
-#endif /* __PPC_CPUTIME_H */
diff --git a/include/asm-ppc64/div64.h b/include/asm-ppc64/div64.h
deleted file mode 100644
index 6cd978cefb2..00000000000
--- a/include/asm-ppc64/div64.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/div64.h>
diff --git a/include/asm-ppc64/emergency-restart.h b/include/asm-ppc64/emergency-restart.h
deleted file mode 100644
index 108d8c48e42..00000000000
--- a/include/asm-ppc64/emergency-restart.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _ASM_EMERGENCY_RESTART_H
-#define _ASM_EMERGENCY_RESTART_H
-
-#include <asm-generic/emergency-restart.h>
-
-#endif /* _ASM_EMERGENCY_RESTART_H */
diff --git a/include/asm-ppc64/errno.h b/include/asm-ppc64/errno.h
deleted file mode 100644
index 69bc3b0c6cb..00000000000
--- a/include/asm-ppc64/errno.h
+++ /dev/null
@@ -1,18 +0,0 @@
-#ifndef _PPC64_ERRNO_H
-#define _PPC64_ERRNO_H
-
-/* 
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <asm-generic/errno.h>
-
-#undef	EDEADLOCK
-#define	EDEADLOCK	58	/* File locking deadlock error */
-
-#define _LAST_ERRNO	516
-
-#endif
diff --git a/include/asm-ppc64/firmware.h b/include/asm-ppc64/firmware.h
new file mode 100644
index 00000000000..22bb85cf60a
--- /dev/null
+++ b/include/asm-ppc64/firmware.h
@@ -0,0 +1,101 @@
+/*
+ *  include/asm-ppc64/firmware.h
+ *
+ *  Extracted from include/asm-ppc64/cputable.h
+ *
+ *  Copyright (C) 2001 Ben. Herrenschmidt (benh@kernel.crashing.org)
+ *
+ *  Modifications for ppc64:
+ *      Copyright (C) 2003 Dave Engebretsen <engebret@us.ibm.com>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+#ifndef __ASM_PPC_FIRMWARE_H
+#define __ASM_PPC_FIRMWARE_H
+
+#ifdef __KERNEL__
+
+#ifndef __ASSEMBLY__
+
+/* firmware feature bitmask values */
+#define FIRMWARE_MAX_FEATURES 63
+
+#define FW_FEATURE_PFT		(1UL<<0)
+#define FW_FEATURE_TCE		(1UL<<1)
+#define FW_FEATURE_SPRG0	(1UL<<2)
+#define FW_FEATURE_DABR		(1UL<<3)
+#define FW_FEATURE_COPY		(1UL<<4)
+#define FW_FEATURE_ASR		(1UL<<5)
+#define FW_FEATURE_DEBUG	(1UL<<6)
+#define FW_FEATURE_TERM		(1UL<<7)
+#define FW_FEATURE_PERF		(1UL<<8)
+#define FW_FEATURE_DUMP		(1UL<<9)
+#define FW_FEATURE_INTERRUPT	(1UL<<10)
+#define FW_FEATURE_MIGRATE	(1UL<<11)
+#define FW_FEATURE_PERFMON	(1UL<<12)
+#define FW_FEATURE_CRQ		(1UL<<13)
+#define FW_FEATURE_VIO		(1UL<<14)
+#define FW_FEATURE_RDMA		(1UL<<15)
+#define FW_FEATURE_LLAN		(1UL<<16)
+#define FW_FEATURE_BULK		(1UL<<17)
+#define FW_FEATURE_XDABR	(1UL<<18)
+#define FW_FEATURE_MULTITCE	(1UL<<19)
+#define FW_FEATURE_SPLPAR	(1UL<<20)
+#define FW_FEATURE_ISERIES	(1UL<<21)
+
+enum {
+	FW_FEATURE_PSERIES_POSSIBLE = FW_FEATURE_PFT | FW_FEATURE_TCE |
+		FW_FEATURE_SPRG0 | FW_FEATURE_DABR | FW_FEATURE_COPY |
+		FW_FEATURE_ASR | FW_FEATURE_DEBUG | FW_FEATURE_TERM |
+		FW_FEATURE_PERF | FW_FEATURE_DUMP | FW_FEATURE_INTERRUPT |
+		FW_FEATURE_MIGRATE | FW_FEATURE_PERFMON | FW_FEATURE_CRQ |
+		FW_FEATURE_VIO | FW_FEATURE_RDMA | FW_FEATURE_LLAN |
+		FW_FEATURE_BULK | FW_FEATURE_XDABR | FW_FEATURE_MULTITCE |
+		FW_FEATURE_SPLPAR,
+	FW_FEATURE_PSERIES_ALWAYS = 0,
+	FW_FEATURE_ISERIES_POSSIBLE = FW_FEATURE_ISERIES,
+	FW_FEATURE_ISERIES_ALWAYS = FW_FEATURE_ISERIES,
+	FW_FEATURE_POSSIBLE =
+#ifdef CONFIG_PPC_PSERIES
+		FW_FEATURE_PSERIES_POSSIBLE |
+#endif
+#ifdef CONFIG_PPC_ISERIES
+		FW_FEATURE_ISERIES_POSSIBLE |
+#endif
+		0,
+	FW_FEATURE_ALWAYS =
+#ifdef CONFIG_PPC_PSERIES
+		FW_FEATURE_PSERIES_ALWAYS &
+#endif
+#ifdef CONFIG_PPC_ISERIES
+		FW_FEATURE_ISERIES_ALWAYS &
+#endif
+		FW_FEATURE_POSSIBLE,
+};
+
+/* This is used to identify firmware features which are available
+ * to the kernel.
+ */
+extern unsigned long	ppc64_firmware_features;
+
+static inline unsigned long firmware_has_feature(unsigned long feature)
+{
+	return (FW_FEATURE_ALWAYS & feature) ||
+		(FW_FEATURE_POSSIBLE & ppc64_firmware_features & feature);
+}
+
+#ifdef CONFIG_PPC_PSERIES
+typedef struct {
+    unsigned long val;
+    char * name;
+} firmware_feature_t;
+
+extern firmware_feature_t firmware_features_table[];
+#endif
+
+#endif /* __ASSEMBLY__ */
+#endif /* __KERNEL__ */
+#endif /* __ASM_PPC_FIRMWARE_H */
diff --git a/include/asm-ppc64/hdreg.h b/include/asm-ppc64/hdreg.h
deleted file mode 100644
index 7f7fd1af0af..00000000000
--- a/include/asm-ppc64/hdreg.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/hdreg.h>
diff --git a/include/asm-ppc64/imalloc.h b/include/asm-ppc64/imalloc.h
index e46ff68a6e4..42adf7033a8 100644
--- a/include/asm-ppc64/imalloc.h
+++ b/include/asm-ppc64/imalloc.h
@@ -6,7 +6,7 @@
  */
 #define PHBS_IO_BASE  	  VMALLOC_END
 #define IMALLOC_BASE      (PHBS_IO_BASE + 0x80000000ul)	/* Reserve 2 gigs for PHBs */
-#define IMALLOC_END       (VMALLOC_START + EADDR_MASK)
+#define IMALLOC_END       (VMALLOC_START + PGTABLE_RANGE)
 
 
 /* imalloc region types */
diff --git a/include/asm-ppc64/ioctl.h b/include/asm-ppc64/ioctl.h
deleted file mode 100644
index 42b8c5da7fb..00000000000
--- a/include/asm-ppc64/ioctl.h
+++ /dev/null
@@ -1,74 +0,0 @@
-#ifndef _PPC64_IOCTL_H
-#define _PPC64_IOCTL_H
-
-
-/*
- * This was copied from the alpha as it's a bit cleaner there.
- *                         -- Cort
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#define _IOC_NRBITS	8
-#define _IOC_TYPEBITS	8
-#define _IOC_SIZEBITS	13
-#define _IOC_DIRBITS	3
-
-#define _IOC_NRMASK	((1 << _IOC_NRBITS)-1)
-#define _IOC_TYPEMASK	((1 << _IOC_TYPEBITS)-1)
-#define _IOC_SIZEMASK	((1 << _IOC_SIZEBITS)-1)
-#define _IOC_DIRMASK	((1 << _IOC_DIRBITS)-1)
-
-#define _IOC_NRSHIFT	0
-#define _IOC_TYPESHIFT	(_IOC_NRSHIFT+_IOC_NRBITS)
-#define _IOC_SIZESHIFT	(_IOC_TYPESHIFT+_IOC_TYPEBITS)
-#define _IOC_DIRSHIFT	(_IOC_SIZESHIFT+_IOC_SIZEBITS)
-
-/*
- * Direction bits _IOC_NONE could be 0, but OSF/1 gives it a bit.
- * And this turns out useful to catch old ioctl numbers in header
- * files for us.
- */
-#define _IOC_NONE	1U
-#define _IOC_READ	2U
-#define _IOC_WRITE	4U
-
-#define _IOC(dir,type,nr,size) \
-	(((dir)  << _IOC_DIRSHIFT) | \
-	 ((type) << _IOC_TYPESHIFT) | \
-	 ((nr)   << _IOC_NRSHIFT) | \
-	 ((size) << _IOC_SIZESHIFT))
-
-/* provoke compile error for invalid uses of size argument */
-extern unsigned int __invalid_size_argument_for_IOC;
-#define _IOC_TYPECHECK(t) \
-       ((sizeof(t) == sizeof(t[1]) && \
-         sizeof(t) < (1 << _IOC_SIZEBITS)) ? \
-         sizeof(t) : __invalid_size_argument_for_IOC)
-
-/* used to create numbers */
-#define _IO(type,nr)		_IOC(_IOC_NONE,(type),(nr),0)
-#define _IOR(type,nr,size)	_IOC(_IOC_READ,(type),(nr),(_IOC_TYPECHECK(size)))
-#define _IOW(type,nr,size)	_IOC(_IOC_WRITE,(type),(nr),(_IOC_TYPECHECK(size)))
-#define _IOWR(type,nr,size)	_IOC(_IOC_READ|_IOC_WRITE,(type),(nr),(_IOC_TYPECHECK(size)))
-#define _IOR_BAD(type,nr,size)	_IOC(_IOC_READ,(type),(nr),sizeof(size))
-#define _IOW_BAD(type,nr,size)	_IOC(_IOC_WRITE,(type),(nr),sizeof(size))
-#define _IOWR_BAD(type,nr,size)	_IOC(_IOC_READ|_IOC_WRITE,(type),(nr),sizeof(size))
-
-/* used to decode them.. */
-#define _IOC_DIR(nr)		(((nr) >> _IOC_DIRSHIFT) & _IOC_DIRMASK)
-#define _IOC_TYPE(nr)		(((nr) >> _IOC_TYPESHIFT) & _IOC_TYPEMASK)
-#define _IOC_NR(nr)		(((nr) >> _IOC_NRSHIFT) & _IOC_NRMASK)
-#define _IOC_SIZE(nr)		(((nr) >> _IOC_SIZESHIFT) & _IOC_SIZEMASK)
-
-/* various drivers, such as the pcmcia stuff, need these... */
-#define IOC_IN		(_IOC_WRITE << _IOC_DIRSHIFT)
-#define IOC_OUT		(_IOC_READ << _IOC_DIRSHIFT)
-#define IOC_INOUT	((_IOC_WRITE|_IOC_READ) << _IOC_DIRSHIFT)
-#define IOCSIZE_MASK	(_IOC_SIZEMASK << _IOC_SIZESHIFT)
-#define IOCSIZE_SHIFT	(_IOC_SIZESHIFT)
-
-#endif /* _PPC64_IOCTL_H */
diff --git a/include/asm-ppc64/ioctls.h b/include/asm-ppc64/ioctls.h
deleted file mode 100644
index 48796bf3e4f..00000000000
--- a/include/asm-ppc64/ioctls.h
+++ /dev/null
@@ -1,114 +0,0 @@
-#ifndef _ASM_PPC64_IOCTLS_H
-#define _ASM_PPC64_IOCTLS_H
-
-/*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <asm/ioctl.h>
-
-#define FIOCLEX		_IO('f', 1)
-#define FIONCLEX	_IO('f', 2)
-#define FIOASYNC	_IOW('f', 125, int)
-#define FIONBIO		_IOW('f', 126, int)
-#define FIONREAD	_IOR('f', 127, int)
-#define TIOCINQ		FIONREAD
-#define FIOQSIZE        _IOR('f', 128, loff_t)
-
-#define TIOCGETP	_IOR('t', 8, struct sgttyb)
-#define TIOCSETP	_IOW('t', 9, struct sgttyb)
-#define TIOCSETN	_IOW('t', 10, struct sgttyb)	/* TIOCSETP wo flush */
-
-#define TIOCSETC	_IOW('t', 17, struct tchars)
-#define TIOCGETC	_IOR('t', 18, struct tchars)
-#define TCGETS		_IOR('t', 19, struct termios)
-#define TCSETS		_IOW('t', 20, struct termios)
-#define TCSETSW		_IOW('t', 21, struct termios)
-#define TCSETSF		_IOW('t', 22, struct termios)
-
-#define TCGETA		_IOR('t', 23, struct termio)
-#define TCSETA		_IOW('t', 24, struct termio)
-#define TCSETAW		_IOW('t', 25, struct termio)
-#define TCSETAF		_IOW('t', 28, struct termio)
-
-#define TCSBRK		_IO('t', 29)
-#define TCXONC		_IO('t', 30)
-#define TCFLSH		_IO('t', 31)
-
-#define TIOCSWINSZ	_IOW('t', 103, struct winsize)
-#define TIOCGWINSZ	_IOR('t', 104, struct winsize)
-#define	TIOCSTART	_IO('t', 110)		/* start output, like ^Q */
-#define	TIOCSTOP	_IO('t', 111)		/* stop output, like ^S */
-#define TIOCOUTQ        _IOR('t', 115, int)     /* output queue size */
-
-#define TIOCGLTC	_IOR('t', 116, struct ltchars)
-#define TIOCSLTC	_IOW('t', 117, struct ltchars)
-#define TIOCSPGRP	_IOW('t', 118, int)
-#define TIOCGPGRP	_IOR('t', 119, int)
-
-#define TIOCEXCL	0x540C
-#define TIOCNXCL	0x540D
-#define TIOCSCTTY	0x540E
-
-#define TIOCSTI		0x5412
-#define TIOCMGET	0x5415
-#define TIOCMBIS	0x5416
-#define TIOCMBIC	0x5417
-#define TIOCMSET	0x5418
-# define TIOCM_LE	0x001
-# define TIOCM_DTR	0x002
-# define TIOCM_RTS	0x004
-# define TIOCM_ST	0x008
-# define TIOCM_SR	0x010
-# define TIOCM_CTS	0x020
-# define TIOCM_CAR	0x040
-# define TIOCM_RNG	0x080
-# define TIOCM_DSR	0x100
-# define TIOCM_CD	TIOCM_CAR
-# define TIOCM_RI	TIOCM_RNG
-
-#define TIOCGSOFTCAR	0x5419
-#define TIOCSSOFTCAR	0x541A
-#define TIOCLINUX	0x541C
-#define TIOCCONS	0x541D
-#define TIOCGSERIAL	0x541E
-#define TIOCSSERIAL	0x541F
-#define TIOCPKT		0x5420
-# define TIOCPKT_DATA		 0
-# define TIOCPKT_FLUSHREAD	 1
-# define TIOCPKT_FLUSHWRITE	 2
-# define TIOCPKT_STOP		 4
-# define TIOCPKT_START		 8
-# define TIOCPKT_NOSTOP		16
-# define TIOCPKT_DOSTOP		32
-
-
-#define TIOCNOTTY	0x5422
-#define TIOCSETD	0x5423
-#define TIOCGETD	0x5424
-#define TCSBRKP		0x5425	/* Needed for POSIX tcsendbreak() */
-#define TIOCSBRK	0x5427  /* BSD compatibility */
-#define TIOCCBRK	0x5428  /* BSD compatibility */
-#define TIOCGSID	0x5429  /* Return the session ID of FD */
-#define TIOCGPTN	_IOR('T',0x30, unsigned int) /* Get Pty Number (of pty-mux device) */
-#define TIOCSPTLCK	_IOW('T',0x31, int)  /* Lock/unlock Pty */
-
-#define TIOCSERCONFIG	0x5453
-#define TIOCSERGWILD	0x5454
-#define TIOCSERSWILD	0x5455
-#define TIOCGLCKTRMIOS	0x5456
-#define TIOCSLCKTRMIOS	0x5457
-#define TIOCSERGSTRUCT	0x5458 /* For debugging only */
-#define TIOCSERGETLSR   0x5459 /* Get line status register */
-  /* ioctl (fd, TIOCSERGETLSR, &result) where result may be as below */
-# define TIOCSER_TEMT    0x01	/* Transmitter physically empty */
-#define TIOCSERGETMULTI 0x545A /* Get multiport config  */
-#define TIOCSERSETMULTI 0x545B /* Set multiport config */
-
-#define TIOCMIWAIT	0x545C	/* wait for a change on serial input line(s) */
-#define TIOCGICOUNT	0x545D	/* read serial port inline interrupt counts */
-
-#endif /* _ASM_PPC64_IOCTLS_H */
diff --git a/include/asm-ppc64/iommu.h b/include/asm-ppc64/iommu.h
index 729de5cc21d..72dcf8116b0 100644
--- a/include/asm-ppc64/iommu.h
+++ b/include/asm-ppc64/iommu.h
@@ -104,9 +104,6 @@ extern void iommu_devnode_init_pSeries(struct device_node *dn);
 
 #ifdef CONFIG_PPC_ISERIES
 
-/* Initializes tables for bio buses */
-extern void __init iommu_vio_init(void);
-
 struct iSeries_Device_Node;
 /* Creates table for an individual device node */
 extern void iommu_devnode_init_iSeries(struct iSeries_Device_Node *dn);
diff --git a/include/asm-ppc64/ipc.h b/include/asm-ppc64/ipc.h
deleted file mode 100644
index a46e3d9c2a3..00000000000
--- a/include/asm-ppc64/ipc.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/ipc.h>
diff --git a/include/asm-ppc64/linkage.h b/include/asm-ppc64/linkage.h
deleted file mode 100644
index 291c2d01c44..00000000000
--- a/include/asm-ppc64/linkage.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef __ASM_LINKAGE_H
-#define __ASM_LINKAGE_H
-
-/* Nothing to see here... */
-
-#endif
diff --git a/include/asm-ppc64/lmb.h b/include/asm-ppc64/lmb.h
index a6cbca21ac1..cb368bf0f26 100644
--- a/include/asm-ppc64/lmb.h
+++ b/include/asm-ppc64/lmb.h
@@ -22,7 +22,6 @@
 
 struct lmb_property {
 	unsigned long base;
-	unsigned long physbase;
 	unsigned long size;
 };
 
diff --git a/include/asm-ppc64/machdep.h b/include/asm-ppc64/machdep.h
index f0ef0637594..ff2c9287d3b 100644
--- a/include/asm-ppc64/machdep.h
+++ b/include/asm-ppc64/machdep.h
@@ -140,6 +140,9 @@ struct machdep_calls {
 
 	/* Idle loop for this platform, leave empty for default idle loop */
 	int		(*idle_loop)(void);
+
+	/* Function to enable pmcs for this platform, called once per cpu. */
+	void		(*enable_pmcs)(void);
 };
 
 extern int default_idle(void);
diff --git a/include/asm-ppc64/mmu.h b/include/asm-ppc64/mmu.h
index 70348a85131..ad36bb28de2 100644
--- a/include/asm-ppc64/mmu.h
+++ b/include/asm-ppc64/mmu.h
@@ -28,9 +28,12 @@
 #define STE_VSID_SHIFT	12
 
 /* Location of cpu0's segment table */
-#define STAB0_PAGE	0x9
+#define STAB0_PAGE	0x6
 #define STAB0_PHYS_ADDR	(STAB0_PAGE<<PAGE_SHIFT)
-#define STAB0_VIRT_ADDR	(KERNELBASE+STAB0_PHYS_ADDR)
+
+#ifndef __ASSEMBLY__
+extern char initial_stab[];
+#endif /* ! __ASSEMBLY */
 
 /*
  * SLB
@@ -259,8 +262,10 @@ extern void stabs_alloc(void);
 #define VSID_BITS	36
 #define VSID_MODULUS	((1UL<<VSID_BITS)-1)
 
-#define CONTEXT_BITS	20
-#define USER_ESID_BITS	15
+#define CONTEXT_BITS	19
+#define USER_ESID_BITS	16
+
+#define USER_VSID_RANGE	(1UL << (USER_ESID_BITS + SID_SHIFT))
 
 /*
  * This macro generates asm code to compute the VSID scramble
@@ -302,8 +307,7 @@ typedef unsigned long mm_context_id_t;
 typedef struct {
 	mm_context_id_t id;
 #ifdef CONFIG_HUGETLB_PAGE
-	pgd_t *huge_pgdir;
-	u16 htlb_segs; /* bitmask */
+	u16 low_htlb_areas, high_htlb_areas;
 #endif
 } mm_context_t;
 
diff --git a/include/asm-ppc64/naca.h b/include/asm-ppc64/naca.h
index bfb7caa32ea..d2afe644759 100644
--- a/include/asm-ppc64/naca.h
+++ b/include/asm-ppc64/naca.h
@@ -12,8 +12,6 @@
 
 #include <asm/types.h>
 
-#ifndef __ASSEMBLY__
-
 struct naca_struct {
 	/* Kernel only data - undefined for user space */
 	void *xItVpdAreas;              /* VPD Data                  0x00 */
@@ -23,9 +21,4 @@ struct naca_struct {
 
 extern struct naca_struct naca;
 
-#endif /* __ASSEMBLY__ */
-
-#define NACA_PAGE      0x4
-#define NACA_PHYS_ADDR (NACA_PAGE<<PAGE_SHIFT)
-
 #endif /* _NACA_H */
diff --git a/include/asm-ppc64/namei.h b/include/asm-ppc64/namei.h
deleted file mode 100644
index a1412a2d102..00000000000
--- a/include/asm-ppc64/namei.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/* 
- * linux/include/asm-ppc/namei.h
- * Adapted from linux/include/asm-alpha/namei.h
- *
- * Included from linux/fs/namei.c
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#ifndef __PPC64_NAMEI_H
-#define __PPC64_NAMEI_H
-
-/* This dummy routine maybe changed to something useful
- * for /usr/gnemul/ emulation stuff.
- * Look at asm-sparc/namei.h for details.
- */
-
-#define __emul_prefix() NULL
-
-#endif /* __PPC64_NAMEI_H */
diff --git a/include/asm-ppc64/page.h b/include/asm-ppc64/page.h
index a5893a305a0..a79a08df62b 100644
--- a/include/asm-ppc64/page.h
+++ b/include/asm-ppc64/page.h
@@ -37,39 +37,45 @@
 
 #define HUGETLB_PAGE_ORDER	(HPAGE_SHIFT - PAGE_SHIFT)
 
-/* For 64-bit processes the hugepage range is 1T-1.5T */
-#define TASK_HPAGE_BASE ASM_CONST(0x0000010000000000)
-#define TASK_HPAGE_END 	ASM_CONST(0x0000018000000000)
+#define HTLB_AREA_SHIFT		40
+#define HTLB_AREA_SIZE		(1UL << HTLB_AREA_SHIFT)
+#define GET_HTLB_AREA(x)	((x) >> HTLB_AREA_SHIFT)
 
 #define LOW_ESID_MASK(addr, len)	(((1U << (GET_ESID(addr+len-1)+1)) \
 	   	                	- (1U << GET_ESID(addr))) & 0xffff)
+#define HTLB_AREA_MASK(addr, len)	(((1U << (GET_HTLB_AREA(addr+len-1)+1)) \
+	   	                	- (1U << GET_HTLB_AREA(addr))) & 0xffff)
 
 #define ARCH_HAS_HUGEPAGE_ONLY_RANGE
 #define ARCH_HAS_PREPARE_HUGEPAGE_RANGE
+#define ARCH_HAS_SETCLEAR_HUGE_PTE
 
 #define touches_hugepage_low_range(mm, addr, len) \
-	(LOW_ESID_MASK((addr), (len)) & mm->context.htlb_segs)
-#define touches_hugepage_high_range(addr, len) \
-	(((addr) > (TASK_HPAGE_BASE-(len))) && ((addr) < TASK_HPAGE_END))
+	(LOW_ESID_MASK((addr), (len)) & (mm)->context.low_htlb_areas)
+#define touches_hugepage_high_range(mm, addr, len) \
+	(HTLB_AREA_MASK((addr), (len)) & (mm)->context.high_htlb_areas)
 
 #define __within_hugepage_low_range(addr, len, segmask) \
 	((LOW_ESID_MASK((addr), (len)) | (segmask)) == (segmask))
 #define within_hugepage_low_range(addr, len) \
 	__within_hugepage_low_range((addr), (len), \
-				    current->mm->context.htlb_segs)
-#define within_hugepage_high_range(addr, len) (((addr) >= TASK_HPAGE_BASE) \
-	  && ((addr)+(len) <= TASK_HPAGE_END) && ((addr)+(len) >= (addr)))
+				    current->mm->context.low_htlb_areas)
+#define __within_hugepage_high_range(addr, len, zonemask) \
+	((HTLB_AREA_MASK((addr), (len)) | (zonemask)) == (zonemask))
+#define within_hugepage_high_range(addr, len) \
+	__within_hugepage_high_range((addr), (len), \
+				    current->mm->context.high_htlb_areas)
 
 #define is_hugepage_only_range(mm, addr, len) \
-	(touches_hugepage_high_range((addr), (len)) || \
+	(touches_hugepage_high_range((mm), (addr), (len)) || \
 	  touches_hugepage_low_range((mm), (addr), (len)))
 #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
 
 #define in_hugepage_area(context, addr) \
 	(cpu_has_feature(CPU_FTR_16M_PAGE) && \
-	 ( (((addr) >= TASK_HPAGE_BASE) && ((addr) < TASK_HPAGE_END)) || \
+	 ( ((1 << GET_HTLB_AREA(addr)) & (context).high_htlb_areas) || \
 	   ( ((addr) < 0x100000000L) && \
-	     ((1 << GET_ESID(addr)) & (context).htlb_segs) ) ) )
+	     ((1 << GET_ESID(addr)) & (context).low_htlb_areas) ) ) )
 
 #else /* !CONFIG_HUGETLB_PAGE */
 
@@ -125,36 +131,42 @@ extern void copy_user_page(void *to, void *from, unsigned long vaddr, struct pag
  * Entries in the pte table are 64b, while entries in the pgd & pmd are 32b.
  */
 typedef struct { unsigned long pte; } pte_t;
-typedef struct { unsigned int  pmd; } pmd_t;
-typedef struct { unsigned int  pgd; } pgd_t;
+typedef struct { unsigned long pmd; } pmd_t;
+typedef struct { unsigned long pud; } pud_t;
+typedef struct { unsigned long pgd; } pgd_t;
 typedef struct { unsigned long pgprot; } pgprot_t;
 
 #define pte_val(x)	((x).pte)
 #define pmd_val(x)	((x).pmd)
+#define pud_val(x)	((x).pud)
 #define pgd_val(x)	((x).pgd)
 #define pgprot_val(x)	((x).pgprot)
 
-#define __pte(x)	((pte_t) { (x) } )
-#define __pmd(x)	((pmd_t) { (x) } )
-#define __pgd(x)	((pgd_t) { (x) } )
-#define __pgprot(x)	((pgprot_t) { (x) } )
+#define __pte(x)	((pte_t) { (x) })
+#define __pmd(x)	((pmd_t) { (x) })
+#define __pud(x)	((pud_t) { (x) })
+#define __pgd(x)	((pgd_t) { (x) })
+#define __pgprot(x)	((pgprot_t) { (x) })
 
 #else
 /*
  * .. while these make it easier on the compiler
  */
 typedef unsigned long pte_t;
-typedef unsigned int  pmd_t;
-typedef unsigned int  pgd_t;
+typedef unsigned long pmd_t;
+typedef unsigned long pud_t;
+typedef unsigned long pgd_t;
 typedef unsigned long pgprot_t;
 
 #define pte_val(x)	(x)
 #define pmd_val(x)	(x)
+#define pud_val(x)	(x)
 #define pgd_val(x)	(x)
 #define pgprot_val(x)	(x)
 
 #define __pte(x)	(x)
 #define __pmd(x)	(x)
+#define __pud(x)	(x)
 #define __pgd(x)	(x)
 #define __pgprot(x)	(x)
 
@@ -208,9 +220,6 @@ extern u64 ppc64_pft_size;		/* Log 2 of page table size */
 #define USER_REGION_ID     (0UL)
 #define REGION_ID(ea)	   (((unsigned long)(ea)) >> REGION_SHIFT)
 
-#define __bpn_to_ba(x) ((((unsigned long)(x)) << PAGE_SHIFT) + KERNELBASE)
-#define __ba_to_bpn(x) ((((unsigned long)(x)) & ~REGION_MASK) >> PAGE_SHIFT)
-
 #define __va(x) ((void *)((unsigned long)(x) + KERNELBASE))
 
 #ifdef CONFIG_DISCONTIGMEM
diff --git a/include/asm-ppc64/param.h b/include/asm-ppc64/param.h
index 1fad38dcf70..76c212d475b 100644
--- a/include/asm-ppc64/param.h
+++ b/include/asm-ppc64/param.h
@@ -1,6 +1,8 @@
 #ifndef _ASM_PPC64_PARAM_H
 #define _ASM_PPC64_PARAM_H
 
+#include <linux/config.h>
+
 /*
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
@@ -9,7 +11,7 @@
  */
 
 #ifdef __KERNEL__
-# define HZ		1000		/* Internal kernel timer frequency */
+# define HZ		CONFIG_HZ	/* Internal kernel timer frequency */
 # define USER_HZ	100		/* .. some user interfaces are in "ticks" */
 # define CLOCKS_PER_SEC	(USER_HZ)	/* like times() */
 #endif
diff --git a/include/asm-ppc64/percpu.h b/include/asm-ppc64/percpu.h
deleted file mode 100644
index 60a659a4ce1..00000000000
--- a/include/asm-ppc64/percpu.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef __ARCH_PPC64_PERCPU__
-#define __ARCH_PPC64_PERCPU__
-
-#include <asm-generic/percpu.h>
-
-#endif /* __ARCH_PPC64_PERCPU__ */
diff --git a/include/asm-ppc64/pgalloc.h b/include/asm-ppc64/pgalloc.h
index 4fc4b739b38..26bc49c1108 100644
--- a/include/asm-ppc64/pgalloc.h
+++ b/include/asm-ppc64/pgalloc.h
@@ -6,7 +6,12 @@
 #include <linux/cpumask.h>
 #include <linux/percpu.h>
 
-extern kmem_cache_t *zero_cache;
+extern kmem_cache_t *pgtable_cache[];
+
+#define PTE_CACHE_NUM	0
+#define PMD_CACHE_NUM	1
+#define PUD_CACHE_NUM	1
+#define PGD_CACHE_NUM	0
 
 /*
  * This program is free software; you can redistribute it and/or
@@ -15,30 +20,40 @@ extern kmem_cache_t *zero_cache;
  * 2 of the License, or (at your option) any later version.
  */
 
-static inline pgd_t *
-pgd_alloc(struct mm_struct *mm)
+static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-	return kmem_cache_alloc(zero_cache, GFP_KERNEL);
+	return kmem_cache_alloc(pgtable_cache[PGD_CACHE_NUM], GFP_KERNEL);
 }
 
-static inline void
-pgd_free(pgd_t *pgd)
+static inline void pgd_free(pgd_t *pgd)
 {
-	kmem_cache_free(zero_cache, pgd);
+	kmem_cache_free(pgtable_cache[PGD_CACHE_NUM], pgd);
+}
+
+#define pgd_populate(MM, PGD, PUD)	pgd_set(PGD, PUD)
+
+static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+	return kmem_cache_alloc(pgtable_cache[PUD_CACHE_NUM],
+				GFP_KERNEL|__GFP_REPEAT);
+}
+
+static inline void pud_free(pud_t *pud)
+{
+	kmem_cache_free(pgtable_cache[PUD_CACHE_NUM], pud);
 }
 
 #define pud_populate(MM, PUD, PMD)	pud_set(PUD, PMD)
 
-static inline pmd_t *
-pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
+static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
-	return kmem_cache_alloc(zero_cache, GFP_KERNEL|__GFP_REPEAT);
+	return kmem_cache_alloc(pgtable_cache[PMD_CACHE_NUM],
+				GFP_KERNEL|__GFP_REPEAT);
 }
 
-static inline void
-pmd_free(pmd_t *pmd)
+static inline void pmd_free(pmd_t *pmd)
 {
-	kmem_cache_free(zero_cache, pmd);
+	kmem_cache_free(pgtable_cache[PMD_CACHE_NUM], pmd);
 }
 
 #define pmd_populate_kernel(mm, pmd, pte) pmd_set(pmd, pte)
@@ -47,44 +62,58 @@ pmd_free(pmd_t *pmd)
 
 static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
 {
-	return kmem_cache_alloc(zero_cache, GFP_KERNEL|__GFP_REPEAT);
+	return kmem_cache_alloc(pgtable_cache[PTE_CACHE_NUM],
+				GFP_KERNEL|__GFP_REPEAT);
 }
 
 static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
 {
-	pte_t *pte = kmem_cache_alloc(zero_cache, GFP_KERNEL|__GFP_REPEAT);
-	if (pte)
-		return virt_to_page(pte);
-	return NULL;
+	return virt_to_page(pte_alloc_one_kernel(mm, address));
 }
 		
 static inline void pte_free_kernel(pte_t *pte)
 {
-	kmem_cache_free(zero_cache, pte);
+	kmem_cache_free(pgtable_cache[PTE_CACHE_NUM], pte);
 }
 
 static inline void pte_free(struct page *ptepage)
 {
-	kmem_cache_free(zero_cache, page_address(ptepage));
+	pte_free_kernel(page_address(ptepage));
 }
 
-struct pte_freelist_batch
+#define PGF_CACHENUM_MASK	0xf
+
+typedef struct pgtable_free {
+	unsigned long val;
+} pgtable_free_t;
+
+static inline pgtable_free_t pgtable_free_cache(void *p, int cachenum,
+						unsigned long mask)
 {
-	struct rcu_head	rcu;
-	unsigned int	index;
-	struct page *	pages[0];
-};
+	BUG_ON(cachenum > PGF_CACHENUM_MASK);
 
-#define PTE_FREELIST_SIZE	((PAGE_SIZE - sizeof(struct pte_freelist_batch)) / \
-				  sizeof(struct page *))
+	return (pgtable_free_t){.val = ((unsigned long) p & ~mask) | cachenum};
+}
 
-extern void pte_free_now(struct page *ptepage);
-extern void pte_free_submit(struct pte_freelist_batch *batch);
+static inline void pgtable_free(pgtable_free_t pgf)
+{
+	void *p = (void *)(pgf.val & ~PGF_CACHENUM_MASK);
+	int cachenum = pgf.val & PGF_CACHENUM_MASK;
 
-DECLARE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
+	kmem_cache_free(pgtable_cache[cachenum], p);
+}
 
-void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage);
-#define __pmd_free_tlb(tlb, pmd)	__pte_free_tlb(tlb, virt_to_page(pmd))
+void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf);
+
+#define __pte_free_tlb(tlb, ptepage)	\
+	pgtable_free_tlb(tlb, pgtable_free_cache(page_address(ptepage), \
+		PTE_CACHE_NUM, PTE_TABLE_SIZE-1))
+#define __pmd_free_tlb(tlb, pmd) 	\
+	pgtable_free_tlb(tlb, pgtable_free_cache(pmd, \
+		PMD_CACHE_NUM, PMD_TABLE_SIZE-1))
+#define __pud_free_tlb(tlb, pmd)	\
+	pgtable_free_tlb(tlb, pgtable_free_cache(pud, \
+		PUD_CACHE_NUM, PUD_TABLE_SIZE-1))
 
 #define check_pgt_cache()	do { } while (0)
 
diff --git a/include/asm-ppc64/pgtable.h b/include/asm-ppc64/pgtable.h
index 46cf61c2ff6..c83679c9d2b 100644
--- a/include/asm-ppc64/pgtable.h
+++ b/include/asm-ppc64/pgtable.h
@@ -15,19 +15,24 @@
 #include <asm/tlbflush.h>
 #endif /* __ASSEMBLY__ */
 
-#include <asm-generic/pgtable-nopud.h>
-
 /*
  * Entries per page directory level.  The PTE level must use a 64b record
  * for each page table entry.  The PMD and PGD level use a 32b record for 
  * each entry by assuming that each entry is page aligned.
  */
 #define PTE_INDEX_SIZE  9
-#define PMD_INDEX_SIZE  10
-#define PGD_INDEX_SIZE  10
+#define PMD_INDEX_SIZE  7
+#define PUD_INDEX_SIZE  7
+#define PGD_INDEX_SIZE  9
+
+#define PTE_TABLE_SIZE	(sizeof(pte_t) << PTE_INDEX_SIZE)
+#define PMD_TABLE_SIZE	(sizeof(pmd_t) << PMD_INDEX_SIZE)
+#define PUD_TABLE_SIZE	(sizeof(pud_t) << PUD_INDEX_SIZE)
+#define PGD_TABLE_SIZE	(sizeof(pgd_t) << PGD_INDEX_SIZE)
 
 #define PTRS_PER_PTE	(1 << PTE_INDEX_SIZE)
 #define PTRS_PER_PMD	(1 << PMD_INDEX_SIZE)
+#define PTRS_PER_PUD	(1 << PMD_INDEX_SIZE)
 #define PTRS_PER_PGD	(1 << PGD_INDEX_SIZE)
 
 /* PMD_SHIFT determines what a second-level page table entry can map */
@@ -35,8 +40,13 @@
 #define PMD_SIZE	(1UL << PMD_SHIFT)
 #define PMD_MASK	(~(PMD_SIZE-1))
 
-/* PGDIR_SHIFT determines what a third-level page table entry can map */
-#define PGDIR_SHIFT	(PMD_SHIFT + PMD_INDEX_SIZE)
+/* PUD_SHIFT determines what a third-level page table entry can map */
+#define PUD_SHIFT	(PMD_SHIFT + PMD_INDEX_SIZE)
+#define PUD_SIZE	(1UL << PUD_SHIFT)
+#define PUD_MASK	(~(PUD_SIZE-1))
+
+/* PGDIR_SHIFT determines what a fourth-level page table entry can map */
+#define PGDIR_SHIFT	(PUD_SHIFT + PUD_INDEX_SIZE)
 #define PGDIR_SIZE	(1UL << PGDIR_SHIFT)
 #define PGDIR_MASK	(~(PGDIR_SIZE-1))
 
@@ -45,15 +55,23 @@
 /*
  * Size of EA range mapped by our pagetables.
  */
-#define EADDR_SIZE (PTE_INDEX_SIZE + PMD_INDEX_SIZE + \
-                    PGD_INDEX_SIZE + PAGE_SHIFT)
-#define EADDR_MASK ((1UL << EADDR_SIZE) - 1)
+#define PGTABLE_EADDR_SIZE (PTE_INDEX_SIZE + PMD_INDEX_SIZE + \
+                	    PUD_INDEX_SIZE + PGD_INDEX_SIZE + PAGE_SHIFT)
+#define PGTABLE_RANGE (1UL << PGTABLE_EADDR_SIZE)
+
+#if TASK_SIZE_USER64 > PGTABLE_RANGE
+#error TASK_SIZE_USER64 exceeds pagetable range
+#endif
+
+#if TASK_SIZE_USER64 > (1UL << (USER_ESID_BITS + SID_SHIFT))
+#error TASK_SIZE_USER64 exceeds user VSID range
+#endif
 
 /*
  * Define the address range of the vmalloc VM area.
  */
 #define VMALLOC_START (0xD000000000000000ul)
-#define VMALLOC_SIZE  (0x10000000000UL)
+#define VMALLOC_SIZE  (0x80000000000UL)
 #define VMALLOC_END   (VMALLOC_START + VMALLOC_SIZE)
 
 /*
@@ -154,8 +172,6 @@ extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
 #ifndef __ASSEMBLY__
 int hash_huge_page(struct mm_struct *mm, unsigned long access,
 		   unsigned long ea, unsigned long vsid, int local);
-
-void hugetlb_mm_free_pgd(struct mm_struct *mm);
 #endif /* __ASSEMBLY__ */
 
 #define HAVE_ARCH_UNMAPPED_AREA
@@ -163,7 +179,6 @@ void hugetlb_mm_free_pgd(struct mm_struct *mm);
 #else
 
 #define hash_huge_page(mm,a,ea,vsid,local)	-1
-#define hugetlb_mm_free_pgd(mm)			do {} while (0)
 
 #endif
 
@@ -197,39 +212,45 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot)
 #define pte_pfn(x)		((unsigned long)((pte_val(x) >> PTE_SHIFT)))
 #define pte_page(x)		pfn_to_page(pte_pfn(x))
 
-#define pmd_set(pmdp, ptep) 	\
-	(pmd_val(*(pmdp)) = __ba_to_bpn(ptep))
+#define pmd_set(pmdp, ptep) 	({BUG_ON((u64)ptep < KERNELBASE); pmd_val(*(pmdp)) = (unsigned long)(ptep);})
 #define pmd_none(pmd)		(!pmd_val(pmd))
 #define	pmd_bad(pmd)		(pmd_val(pmd) == 0)
 #define	pmd_present(pmd)	(pmd_val(pmd) != 0)
 #define	pmd_clear(pmdp)		(pmd_val(*(pmdp)) = 0)
-#define pmd_page_kernel(pmd)	(__bpn_to_ba(pmd_val(pmd)))
+#define pmd_page_kernel(pmd)	(pmd_val(pmd))
 #define pmd_page(pmd)		virt_to_page(pmd_page_kernel(pmd))
 
-#define pud_set(pudp, pmdp)	(pud_val(*(pudp)) = (__ba_to_bpn(pmdp)))
+#define pud_set(pudp, pmdp)	(pud_val(*(pudp)) = (unsigned long)(pmdp))
 #define pud_none(pud)		(!pud_val(pud))
-#define pud_bad(pud)		((pud_val(pud)) == 0UL)
-#define pud_present(pud)	(pud_val(pud) != 0UL)
-#define pud_clear(pudp)		(pud_val(*(pudp)) = 0UL)
-#define pud_page(pud)		(__bpn_to_ba(pud_val(pud)))
+#define pud_bad(pud)		((pud_val(pud)) == 0)
+#define pud_present(pud)	(pud_val(pud) != 0)
+#define pud_clear(pudp)		(pud_val(*(pudp)) = 0)
+#define pud_page(pud)		(pud_val(pud))
+
+#define pgd_set(pgdp, pudp)	({pgd_val(*(pgdp)) = (unsigned long)(pudp);})
+#define pgd_none(pgd)		(!pgd_val(pgd))
+#define pgd_bad(pgd)		(pgd_val(pgd) == 0)
+#define pgd_present(pgd)	(pgd_val(pgd) != 0)
+#define pgd_clear(pgdp)		(pgd_val(*(pgdp)) = 0)
+#define pgd_page(pgd)		(pgd_val(pgd))
 
 /* 
  * Find an entry in a page-table-directory.  We combine the address region 
  * (the high order N bits) and the pgd portion of the address.
  */
 /* to avoid overflow in free_pgtables we don't use PTRS_PER_PGD here */
-#define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & 0x7ff)
+#define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & 0x1ff)
 
 #define pgd_offset(mm, address)	 ((mm)->pgd + pgd_index(address))
 
-/* Find an entry in the second-level page table.. */
+#define pud_offset(pgdp, addr)	\
+  (((pud_t *) pgd_page(*(pgdp))) + (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)))
+
 #define pmd_offset(pudp,addr) \
-  ((pmd_t *) pud_page(*(pudp)) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)))
+  (((pmd_t *) pud_page(*(pudp))) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)))
 
-/* Find an entry in the third-level page table.. */
 #define pte_offset_kernel(dir,addr) \
-  ((pte_t *) pmd_page_kernel(*(dir)) \
- + (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)))
+  (((pte_t *) pmd_page_kernel(*(dir))) + (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)))
 
 #define pte_offset_map(dir,addr)	pte_offset_kernel((dir), (addr))
 #define pte_offset_map_nested(dir,addr)	pte_offset_kernel((dir), (addr))
@@ -458,23 +479,20 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr,
 #define pte_same(A,B)	(((pte_val(A) ^ pte_val(B)) & ~_PAGE_HPTEFLAGS) == 0)
 
 #define pmd_ERROR(e) \
-	printk("%s:%d: bad pmd %08x.\n", __FILE__, __LINE__, pmd_val(e))
+	printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pmd_val(e))
+#define pud_ERROR(e) \
+	printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pud_val(e))
 #define pgd_ERROR(e) \
-	printk("%s:%d: bad pgd %08x.\n", __FILE__, __LINE__, pgd_val(e))
+	printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
 
 extern pgd_t swapper_pg_dir[];
 
 extern void paging_init(void);
 
-/*
- * Because the huge pgtables are only 2 level, they can take
- * at most around 4M, much less than one hugepage which the
- * process is presumably entitled to use.  So we don't bother
- * freeing up the pagetables on unmap, and wait until
- * destroy_context() to clean up the lot.
- */
+#ifdef CONFIG_HUGETLB_PAGE
 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \
-						do { } while (0)
+	free_pgd_range(tlb, addr, end, floor, ceiling)
+#endif
 
 /*
  * This gets called at the end of handling a page fault, when
diff --git a/include/asm-ppc64/pmc.h b/include/asm-ppc64/pmc.h
index c924748c0be..d1d297dbccf 100644
--- a/include/asm-ppc64/pmc.h
+++ b/include/asm-ppc64/pmc.h
@@ -26,4 +26,6 @@ typedef void (*perf_irq_t)(struct pt_regs *);
 int reserve_pmc_hardware(perf_irq_t new_perf_irq);
 void release_pmc_hardware(void);
 
+void power4_enable_pmcs(void);
+
 #endif /* _PPC64_PMC_H */
diff --git a/include/asm-ppc64/poll.h b/include/asm-ppc64/poll.h
deleted file mode 100644
index 370fa3ba0db..00000000000
--- a/include/asm-ppc64/poll.h
+++ /dev/null
@@ -1,32 +0,0 @@
-#ifndef __PPC64_POLL_H
-#define __PPC64_POLL_H
-
-/*
- * Copyright (C) 2001 PPC64 Team, IBM Corp
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#define POLLIN		0x0001
-#define POLLPRI		0x0002
-#define POLLOUT		0x0004
-#define POLLERR		0x0008
-#define POLLHUP		0x0010
-#define POLLNVAL	0x0020
-#define POLLRDNORM	0x0040
-#define POLLRDBAND	0x0080
-#define POLLWRNORM	0x0100
-#define POLLWRBAND	0x0200
-#define POLLMSG		0x0400
-#define POLLREMOVE	0x1000
-
-struct pollfd {
-	int fd;
-	short events;
-	short revents;
-};
-
-#endif /* __PPC64_POLL_H */
diff --git a/include/asm-ppc64/processor.h b/include/asm-ppc64/processor.h
index 352306cfb57..7bd4796f123 100644
--- a/include/asm-ppc64/processor.h
+++ b/include/asm-ppc64/processor.h
@@ -268,6 +268,7 @@
 #define PV_970FX	0x003C
 #define	PV_630        	0x0040
 #define	PV_630p	        0x0041
+#define	PV_970MP	0x0044
 #define	PV_BE		0x0070
 
 /* Platforms supported by PPC64 */
@@ -382,8 +383,8 @@ extern long kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
 extern struct task_struct *last_task_used_math;
 extern struct task_struct *last_task_used_altivec;
 
-/* 64-bit user address space is 41-bits (2TBs user VM) */
-#define TASK_SIZE_USER64 (0x0000020000000000UL)
+/* 64-bit user address space is 44-bits (16TB user VM) */
+#define TASK_SIZE_USER64 (0x0000100000000000UL)
 
 /* 
  * 32-bit user address space is 4GB - 1 page 
diff --git a/include/asm-ppc64/prom.h b/include/asm-ppc64/prom.h
index 04b1a84f7ca..dc5330b3950 100644
--- a/include/asm-ppc64/prom.h
+++ b/include/asm-ppc64/prom.h
@@ -22,13 +22,15 @@
 #define RELOC(x)        (*PTRRELOC(&(x)))
 
 /* Definitions used by the flattened device tree */
-#define OF_DT_HEADER		0xd00dfeed	/* 4: version, 4: total size */
-#define OF_DT_BEGIN_NODE	0x1		/* Start node: full name */
+#define OF_DT_HEADER		0xd00dfeed	/* marker */
+#define OF_DT_BEGIN_NODE	0x1		/* Start of node, full name */
 #define OF_DT_END_NODE		0x2		/* End node */
-#define OF_DT_PROP		0x3		/* Property: name off, size, content */
+#define OF_DT_PROP		0x3		/* Property: name off, size,
+						 * content */
+#define OF_DT_NOP		0x4		/* nop */
 #define OF_DT_END		0x9
 
-#define OF_DT_VERSION		1
+#define OF_DT_VERSION		0x10
 
 /*
  * This is what gets passed to the kernel by prom_init or kexec
@@ -54,7 +56,9 @@ struct boot_param_header
 	u32	version;		/* format version */
 	u32	last_comp_version;	/* last compatible version */
 	/* version 2 fields below */
-	u32	boot_cpuid_phys;	/* Which physical CPU id we're booting on */
+	u32	boot_cpuid_phys;	/* Physical CPU id we're booting on */
+	/* version 3 fields below */
+	u32	dt_strings_size;	/* size of the DT strings block */
 };
 
 
diff --git a/include/asm-ppc64/resource.h b/include/asm-ppc64/resource.h
deleted file mode 100644
index add031b9dfd..00000000000
--- a/include/asm-ppc64/resource.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _PPC64_RESOURCE_H
-#define _PPC64_RESOURCE_H
-
-#include <asm-generic/resource.h>
-
-#endif /* _PPC64_RESOURCE_H */
diff --git a/include/asm-ppc64/shmparam.h b/include/asm-ppc64/shmparam.h
deleted file mode 100644
index b2825ceff05..00000000000
--- a/include/asm-ppc64/shmparam.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef _PPC64_SHMPARAM_H
-#define _PPC64_SHMPARAM_H
-
-/*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#define	SHMLBA PAGE_SIZE		 /* attach addr a multiple of this */
-
-#endif /* _PPC64_SHMPARAM_H */
diff --git a/include/asm-ppc64/socket.h b/include/asm-ppc64/socket.h
index 59e00dfc8b8..9e1af8eb2d9 100644
--- a/include/asm-ppc64/socket.h
+++ b/include/asm-ppc64/socket.h
@@ -21,6 +21,8 @@
 #define SO_BROADCAST	6
 #define SO_SNDBUF	7
 #define SO_RCVBUF	8
+#define SO_SNDBUFFORCE	32
+#define SO_RCVBUFFORCE	33
 #define SO_KEEPALIVE	9
 #define SO_OOBINLINE	10
 #define SO_NO_CHECK	11
diff --git a/include/asm-ppc64/string.h b/include/asm-ppc64/string.h
deleted file mode 100644
index eeca68ef1e9..00000000000
--- a/include/asm-ppc64/string.h
+++ /dev/null
@@ -1,35 +0,0 @@
-#ifndef _PPC64_STRING_H_
-#define _PPC64_STRING_H_
-
-/*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#define __HAVE_ARCH_STRCPY
-#define __HAVE_ARCH_STRNCPY
-#define __HAVE_ARCH_STRLEN
-#define __HAVE_ARCH_STRCMP
-#define __HAVE_ARCH_STRCAT
-#define __HAVE_ARCH_MEMSET
-#define __HAVE_ARCH_MEMCPY
-#define __HAVE_ARCH_MEMMOVE
-#define __HAVE_ARCH_MEMCMP
-#define __HAVE_ARCH_MEMCHR
-
-extern int strcasecmp(const char *, const char *);
-extern int strncasecmp(const char *, const char *, int);
-extern char * strcpy(char *,const char *);
-extern char * strncpy(char *,const char *, __kernel_size_t);
-extern __kernel_size_t strlen(const char *);
-extern int strcmp(const char *,const char *);
-extern char * strcat(char *, const char *);
-extern void * memset(void *,int,__kernel_size_t);
-extern void * memcpy(void *,const void *,__kernel_size_t);
-extern void * memmove(void *,const void *,__kernel_size_t);
-extern int memcmp(const void *,const void *,__kernel_size_t);
-extern void * memchr(const void *,int,__kernel_size_t);
-
-#endif /* _PPC64_STRING_H_ */
diff --git a/include/asm-ppc64/system.h b/include/asm-ppc64/system.h
index 98d120ca8a9..b9e1835351e 100644
--- a/include/asm-ppc64/system.h
+++ b/include/asm-ppc64/system.h
@@ -88,7 +88,7 @@ DEBUGGER_BOILERPLATE(debugger_dabr_match)
 DEBUGGER_BOILERPLATE(debugger_fault_handler)
 
 #ifdef CONFIG_XMON
-extern void xmon_init(void);
+extern void xmon_init(int enable);
 #endif
 
 #else
@@ -302,5 +302,7 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, int size)
 
 #define arch_align_stack(x) (x)
 
+extern unsigned long reloc_offset(void);
+
 #endif /* __KERNEL__ */
 #endif
diff --git a/include/asm-ppc64/unaligned.h b/include/asm-ppc64/unaligned.h
deleted file mode 100644
index 636e93c4f37..00000000000
--- a/include/asm-ppc64/unaligned.h
+++ /dev/null
@@ -1,21 +0,0 @@
-#ifndef __PPC64_UNALIGNED_H
-#define __PPC64_UNALIGNED_H
-
-/*
- * The PowerPC can do unaligned accesses itself in big endian mode. 
- *
- * The strange macros are there to make sure these can't
- * be misused in a way that makes them not work on other
- * architectures where unaligned accesses aren't as simple.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#define get_unaligned(ptr) (*(ptr))
-
-#define put_unaligned(val, ptr) ((void)( *(ptr) = (val) ))
-
-#endif /* __PPC64_UNALIGNED_H */
diff --git a/include/asm-ppc64/vio.h b/include/asm-ppc64/vio.h
index 20cd98ee633..03f1b95f433 100644
--- a/include/asm-ppc64/vio.h
+++ b/include/asm-ppc64/vio.h
@@ -19,13 +19,15 @@
 #include <linux/errno.h>
 #include <linux/device.h>
 #include <linux/dma-mapping.h>
+#include <linux/mod_devicetable.h>
+
 #include <asm/hvcall.h>
-#include <asm/prom.h>
 #include <asm/scatterlist.h>
-/* 
+
+/*
  * Architecture-specific constants for drivers to
  * extract attributes of the device using vio_get_attribute()
-*/
+ */
 #define VETH_MAC_ADDR "local-mac-address"
 #define VETH_MCAST_FILTER_SIZE "ibm,mac-address-filters"
 
@@ -37,64 +39,65 @@
 #define VIO_IRQ_DISABLE		0UL
 #define VIO_IRQ_ENABLE		1UL
 
-struct vio_dev;
-struct vio_driver;
-struct vio_device_id;
 struct iommu_table;
 
-int vio_register_driver(struct vio_driver *drv);
-void vio_unregister_driver(struct vio_driver *drv);
-
-#ifdef CONFIG_PPC_PSERIES
-struct vio_dev * __devinit vio_register_device_node(
-		struct device_node *node_vdev);
-#endif
-void __devinit vio_unregister_device(struct vio_dev *dev);
-struct vio_dev *vio_find_node(struct device_node *vnode);
-
-const void * vio_get_attribute(struct vio_dev *vdev, void* which, int* length);
-int vio_get_irq(struct vio_dev *dev);
-int vio_enable_interrupts(struct vio_dev *dev);
-int vio_disable_interrupts(struct vio_dev *dev);
-
-extern struct dma_mapping_ops vio_dma_ops;
-
-extern struct bus_type vio_bus_type;
-
-struct vio_device_id {
+/*
+ * The vio_dev structure is used to describe virtual I/O devices.
+ */
+struct vio_dev {
+	struct iommu_table *iommu_table;     /* vio_map_* uses this */
+	char *name;
 	char *type;
-	char *compat;
+	uint32_t unit_address;
+	unsigned int irq;
+	struct device dev;
 };
 
 struct vio_driver {
 	struct list_head node;
 	char *name;
-	const struct vio_device_id *id_table;	/* NULL if wants all devices */
-	int  (*probe)  (struct vio_dev *dev, const struct vio_device_id *id);	/* New device inserted */
-	int (*remove) (struct vio_dev *dev);	/* Device removed (NULL if not a hot-plug capable driver) */
+	const struct vio_device_id *id_table;
+	int (*probe)(struct vio_dev *dev, const struct vio_device_id *id);
+	int (*remove)(struct vio_dev *dev);
 	unsigned long driver_data;
-
 	struct device_driver driver;
 };
 
+struct vio_bus_ops {
+	int (*match)(const struct vio_device_id *id, const struct vio_dev *dev);
+	void (*unregister_device)(struct vio_dev *);
+	void (*release_device)(struct device *);
+};
+
+extern struct dma_mapping_ops vio_dma_ops;
+extern struct bus_type vio_bus_type;
+extern struct vio_dev vio_bus_device;
+
+extern int vio_register_driver(struct vio_driver *drv);
+extern void vio_unregister_driver(struct vio_driver *drv);
+
+extern struct vio_dev * __devinit vio_register_device(struct vio_dev *viodev);
+extern void __devinit vio_unregister_device(struct vio_dev *dev);
+
+extern int vio_bus_init(struct vio_bus_ops *);
+
+#ifdef CONFIG_PPC_PSERIES
+struct device_node;
+
+extern struct vio_dev * __devinit vio_register_device_node(
+		struct device_node *node_vdev);
+extern struct vio_dev *vio_find_node(struct device_node *vnode);
+extern const void *vio_get_attribute(struct vio_dev *vdev, void *which,
+		int *length);
+extern int vio_enable_interrupts(struct vio_dev *dev);
+extern int vio_disable_interrupts(struct vio_dev *dev);
+#endif
+
 static inline struct vio_driver *to_vio_driver(struct device_driver *drv)
 {
 	return container_of(drv, struct vio_driver, driver);
 }
 
-/*
- * The vio_dev structure is used to describe virtual I/O devices.
- */
-struct vio_dev {
-	struct iommu_table *iommu_table;     /* vio_map_* uses this */
-	char *name;
-	char *type;
-	uint32_t unit_address;	
-	unsigned int irq;
-
-	struct device dev;
-};
-
 static inline struct vio_dev *to_vio_dev(struct device *dev)
 {
 	return container_of(dev, struct vio_dev, dev);
diff --git a/include/asm-ppc64/xor.h b/include/asm-ppc64/xor.h
deleted file mode 100644
index c82eb12a5b1..00000000000
--- a/include/asm-ppc64/xor.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/xor.h>
diff --git a/include/asm-s390/socket.h b/include/asm-s390/socket.h
index 0e96eeca4e6..15a5298c874 100644
--- a/include/asm-s390/socket.h
+++ b/include/asm-s390/socket.h
@@ -22,6 +22,8 @@
 #define SO_BROADCAST	6
 #define SO_SNDBUF	7
 #define SO_RCVBUF	8
+#define SO_SNDBUFFORCE	32
+#define SO_RCVBUFFORCE	33
 #define SO_KEEPALIVE	9
 #define SO_OOBINLINE	10
 #define SO_NO_CHECK	11
diff --git a/include/asm-sh/socket.h b/include/asm-sh/socket.h
index dde696c3b4c..553904ff933 100644
--- a/include/asm-sh/socket.h
+++ b/include/asm-sh/socket.h
@@ -14,6 +14,8 @@
 #define SO_BROADCAST	6
 #define SO_SNDBUF	7
 #define SO_RCVBUF	8
+#define SO_RCVBUFFORCE	32
+#define SO_SNDBUFFORCE	33
 #define SO_KEEPALIVE	9
 #define SO_OOBINLINE	10
 #define SO_NO_CHECK	11
diff --git a/include/asm-sparc/socket.h b/include/asm-sparc/socket.h
index c1154e3ecfd..09575b608ad 100644
--- a/include/asm-sparc/socket.h
+++ b/include/asm-sparc/socket.h
@@ -29,6 +29,8 @@
 
 #define SO_SNDBUF	0x1001
 #define SO_RCVBUF	0x1002
+#define SO_SNDBUFFORCE	0x100a
+#define SO_RCVBUFFORCE	0x100b
 #define SO_ERROR	0x1007
 #define SO_TYPE		0x1008
 
diff --git a/include/asm-sparc64/socket.h b/include/asm-sparc64/socket.h
index 865547a2390..59987dad335 100644
--- a/include/asm-sparc64/socket.h
+++ b/include/asm-sparc64/socket.h
@@ -29,6 +29,8 @@
 
 #define SO_SNDBUF	0x1001
 #define SO_RCVBUF	0x1002
+#define SO_SNDBUFFORCE	0x100a
+#define SO_RCVBUFFORCE	0x100b
 #define SO_ERROR	0x1007
 #define SO_TYPE		0x1008
 
diff --git a/include/asm-v850/socket.h b/include/asm-v850/socket.h
index 213b852af53..0240d366a0a 100644
--- a/include/asm-v850/socket.h
+++ b/include/asm-v850/socket.h
@@ -14,6 +14,8 @@
 #define SO_BROADCAST	6
 #define SO_SNDBUF	7
 #define SO_RCVBUF	8
+#define SO_SNDBUFFORCE	32
+#define SO_RCVBUFFORCE	33
 #define SO_KEEPALIVE	9
 #define SO_OOBINLINE	10
 #define SO_NO_CHECK	11
diff --git a/include/asm-x86_64/checksum.h b/include/asm-x86_64/checksum.h
index d01356f0144..989469e8e0b 100644
--- a/include/asm-x86_64/checksum.h
+++ b/include/asm-x86_64/checksum.h
@@ -64,7 +64,7 @@ static inline unsigned short ip_fast_csum(unsigned char *iph, unsigned int ihl)
 		"  adcl $0, %0\n"
 		"  notl %0\n"
 		"2:"
-	/* Since the input registers which are loaded with iph and ipl
+	/* Since the input registers which are loaded with iph and ihl
 	   are modified, we must also specify them as outputs, or gcc
 	   will assume they contain their original values. */
 	: "=r" (sum), "=r" (iph), "=r" (ihl)
diff --git a/include/asm-x86_64/socket.h b/include/asm-x86_64/socket.h
index d9a252ea821..f2cdbeae5d5 100644
--- a/include/asm-x86_64/socket.h
+++ b/include/asm-x86_64/socket.h
@@ -14,6 +14,8 @@
 #define SO_BROADCAST	6
 #define SO_SNDBUF	7
 #define SO_RCVBUF	8
+#define SO_SNDBUFFORCE	32
+#define SO_RCVBUFFORCE	33
 #define SO_KEEPALIVE	9
 #define SO_OOBINLINE	10
 #define SO_NO_CHECK	11
diff --git a/include/asm-xtensa/socket.h b/include/asm-xtensa/socket.h
index daccd05a14c..00f83f3a6d7 100644
--- a/include/asm-xtensa/socket.h
+++ b/include/asm-xtensa/socket.h
@@ -24,6 +24,8 @@
 #define SO_BROADCAST	6
 #define SO_SNDBUF	7
 #define SO_RCVBUF	8
+#define SO_SNDBUFFORCE	32
+#define SO_RCVBUFFORCE	33
 #define SO_KEEPALIVE	9
 #define SO_OOBINLINE	10
 #define SO_NO_CHECK	11
diff --git a/include/linux/dccp.h b/include/linux/dccp.h
new file mode 100644
index 00000000000..007c290f74d
--- /dev/null
+++ b/include/linux/dccp.h
@@ -0,0 +1,456 @@
+#ifndef _LINUX_DCCP_H
+#define _LINUX_DCCP_H
+
+#include <linux/types.h>
+#include <asm/byteorder.h>
+
+/* Structure describing an Internet (DCCP) socket address. */
+struct sockaddr_dccp {
+	__u16	sdccp_family;	/* Address family   */
+	__u16	sdccp_port;	/* Port number	    */
+	__u32	sdccp_addr;	/* Internet address */
+	__u32	sdccp_service;	/* Service	    */
+	/* Pad to size of `struct sockaddr': 16 bytes . */
+	__u32	sdccp_pad;
+};
+
+/**
+ * struct dccp_hdr - generic part of DCCP packet header
+ *
+ * @dccph_sport - Relevant port on the endpoint that sent this packet
+ * @dccph_dport - Relevant port on the other endpoint
+ * @dccph_doff - Data Offset from the start of the DCCP header, in 32-bit words
+ * @dccph_ccval - Used by the HC-Sender CCID
+ * @dccph_cscov - Parts of the packet that are covered by the Checksum field
+ * @dccph_checksum - Internet checksum, depends on dccph_cscov
+ * @dccph_x - 0 = 24 bit sequence number, 1 = 48
+ * @dccph_type - packet type, see DCCP_PKT_ prefixed macros
+ * @dccph_seq - sequence number high or low order 24 bits, depends on dccph_x
+ */
+struct dccp_hdr {
+	__u16	dccph_sport,
+		dccph_dport;
+	__u8	dccph_doff;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u8	dccph_cscov:4,
+		dccph_ccval:4;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+	__u8	dccph_ccval:4,
+		dccph_cscov:4;
+#else
+#error  "Adjust your <asm/byteorder.h> defines"
+#endif
+	__u16	dccph_checksum;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u32	dccph_x:1,
+		dccph_type:4,
+		dccph_reserved:3,
+		dccph_seq:24;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+	__u32	dccph_reserved:3,
+		dccph_type:4,
+		dccph_x:1,
+		dccph_seq:24;
+#else
+#error  "Adjust your <asm/byteorder.h> defines"
+#endif
+};
+
+/**
+ * struct dccp_hdr_ext - the low bits of a 48 bit seq packet
+ *
+ * @dccph_seq_low - low 24 bits of a 48 bit seq packet
+ */
+struct dccp_hdr_ext {
+	__u32	dccph_seq_low;
+};
+
+/**
+ * struct dccp_hdr_request - Conection initiation request header
+ *
+ * @dccph_req_service - Service to which the client app wants to connect
+ * @dccph_req_options - list of options (must be a multiple of 32 bits
+ */
+struct dccp_hdr_request {
+	__u32	dccph_req_service;
+};
+/**
+ * struct dccp_hdr_ack_bits - acknowledgment bits common to most packets
+ *
+ * @dccph_resp_ack_nr_high - 48 bit ack number high order bits, contains GSR
+ * @dccph_resp_ack_nr_low - 48 bit ack number low order bits, contains GSR
+ */
+struct dccp_hdr_ack_bits {
+	__u32	dccph_reserved1:8,
+		dccph_ack_nr_high:24;
+	__u32	dccph_ack_nr_low;
+};
+/**
+ * struct dccp_hdr_response - Conection initiation response header
+ *
+ * @dccph_resp_ack_nr_high - 48 bit ack number high order bits, contains GSR
+ * @dccph_resp_ack_nr_low - 48 bit ack number low order bits, contains GSR
+ * @dccph_resp_service - Echoes the Service Code on a received DCCP-Request
+ * @dccph_resp_options - list of options (must be a multiple of 32 bits
+ */
+struct dccp_hdr_response {
+	struct dccp_hdr_ack_bits	dccph_resp_ack;
+	__u32				dccph_resp_service;
+};
+
+/**
+ * struct dccp_hdr_reset - Unconditionally shut down a connection
+ *
+ * @dccph_reset_service - Echoes the Service Code on a received DCCP-Request
+ * @dccph_reset_options - list of options (must be a multiple of 32 bits
+ */
+struct dccp_hdr_reset {
+	struct dccp_hdr_ack_bits	dccph_reset_ack;
+	__u8				dccph_reset_code,
+					dccph_reset_data[3];
+};
+
+enum dccp_pkt_type {
+	DCCP_PKT_REQUEST = 0,
+	DCCP_PKT_RESPONSE,
+	DCCP_PKT_DATA,
+	DCCP_PKT_ACK,
+	DCCP_PKT_DATAACK,
+	DCCP_PKT_CLOSEREQ,
+	DCCP_PKT_CLOSE,
+	DCCP_PKT_RESET,
+	DCCP_PKT_SYNC,
+	DCCP_PKT_SYNCACK,
+	DCCP_PKT_INVALID,
+};
+
+#define DCCP_NR_PKT_TYPES DCCP_PKT_INVALID
+
+static inline unsigned int dccp_packet_hdr_len(const __u8 type)
+{
+	if (type == DCCP_PKT_DATA)
+		return 0;
+	if (type == DCCP_PKT_DATAACK	||
+	    type == DCCP_PKT_ACK	||
+	    type == DCCP_PKT_SYNC	||
+	    type == DCCP_PKT_SYNCACK	||
+	    type == DCCP_PKT_CLOSE	||
+	    type == DCCP_PKT_CLOSEREQ)
+		return sizeof(struct dccp_hdr_ack_bits);
+	if (type == DCCP_PKT_REQUEST)
+		return sizeof(struct dccp_hdr_request);
+	if (type == DCCP_PKT_RESPONSE)
+		return sizeof(struct dccp_hdr_response);
+	return sizeof(struct dccp_hdr_reset);
+}
+enum dccp_reset_codes {
+	DCCP_RESET_CODE_UNSPECIFIED = 0,
+	DCCP_RESET_CODE_CLOSED,
+	DCCP_RESET_CODE_ABORTED,
+	DCCP_RESET_CODE_NO_CONNECTION,
+	DCCP_RESET_CODE_PACKET_ERROR,
+	DCCP_RESET_CODE_OPTION_ERROR,
+	DCCP_RESET_CODE_MANDATORY_ERROR,
+	DCCP_RESET_CODE_CONNECTION_REFUSED,
+	DCCP_RESET_CODE_BAD_SERVICE_CODE,
+	DCCP_RESET_CODE_TOO_BUSY,
+	DCCP_RESET_CODE_BAD_INIT_COOKIE,
+	DCCP_RESET_CODE_AGGRESSION_PENALTY,
+};
+
+/* DCCP options */
+enum {
+	DCCPO_PADDING = 0,
+	DCCPO_MANDATORY = 1,
+	DCCPO_MIN_RESERVED = 3,
+	DCCPO_MAX_RESERVED = 31,
+	DCCPO_NDP_COUNT = 37,
+	DCCPO_ACK_VECTOR_0 = 38,
+	DCCPO_ACK_VECTOR_1 = 39,
+	DCCPO_TIMESTAMP = 41,
+	DCCPO_TIMESTAMP_ECHO = 42,
+	DCCPO_ELAPSED_TIME = 43,
+	DCCPO_MAX = 45,
+	DCCPO_MIN_CCID_SPECIFIC = 128,
+	DCCPO_MAX_CCID_SPECIFIC = 255,
+};
+
+/* DCCP features */
+enum {
+	DCCPF_RESERVED = 0,
+	DCCPF_SEQUENCE_WINDOW = 3,
+	DCCPF_SEND_ACK_VECTOR = 6,
+	DCCPF_SEND_NDP_COUNT = 7,
+	/* 10-127 reserved */
+	DCCPF_MIN_CCID_SPECIFIC = 128,
+	DCCPF_MAX_CCID_SPECIFIC = 255,
+};
+
+/* DCCP socket options */
+#define DCCP_SOCKOPT_PACKET_SIZE	1
+
+#ifdef __KERNEL__
+
+#include <linux/in.h>
+#include <linux/list.h>
+#include <linux/uio.h>
+#include <linux/workqueue.h>
+
+#include <net/inet_connection_sock.h>
+#include <net/inet_timewait_sock.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <net/tcp.h>
+
+enum dccp_state {
+	DCCP_OPEN	= TCP_ESTABLISHED,
+	DCCP_REQUESTING	= TCP_SYN_SENT,
+	DCCP_PARTOPEN	= TCP_FIN_WAIT1, /* FIXME:
+					    This mapping is horrible, but TCP has
+					    no matching state for DCCP_PARTOPEN,
+					    as TCP_SYN_RECV is already used by
+					    DCCP_RESPOND, why don't stop using TCP
+					    mapping of states? OK, now we don't use
+					    sk_stream_sendmsg anymore, so doesn't
+					    seem to exist any reason for us to
+					    do the TCP mapping here */
+	DCCP_LISTEN	= TCP_LISTEN,
+	DCCP_RESPOND	= TCP_SYN_RECV,
+	DCCP_CLOSING	= TCP_CLOSING,
+	DCCP_TIME_WAIT	= TCP_TIME_WAIT,
+	DCCP_CLOSED	= TCP_CLOSE,
+	DCCP_MAX_STATES = TCP_MAX_STATES,
+};
+
+#define DCCP_STATE_MASK 0xf
+#define DCCP_ACTION_FIN (1<<7)
+
+enum {
+	DCCPF_OPEN	 = TCPF_ESTABLISHED,
+	DCCPF_REQUESTING = TCPF_SYN_SENT,
+	DCCPF_PARTOPEN	 = TCPF_FIN_WAIT1,
+	DCCPF_LISTEN	 = TCPF_LISTEN,
+	DCCPF_RESPOND	 = TCPF_SYN_RECV,
+	DCCPF_CLOSING	 = TCPF_CLOSING,
+	DCCPF_TIME_WAIT	 = TCPF_TIME_WAIT,
+	DCCPF_CLOSED	 = TCPF_CLOSE,
+};
+
+static inline struct dccp_hdr *dccp_hdr(const struct sk_buff *skb)
+{
+	return (struct dccp_hdr *)skb->h.raw;
+}
+
+static inline struct dccp_hdr_ext *dccp_hdrx(const struct sk_buff *skb)
+{
+	return (struct dccp_hdr_ext *)(skb->h.raw + sizeof(struct dccp_hdr));
+}
+
+static inline unsigned int __dccp_basic_hdr_len(const struct dccp_hdr *dh)
+{
+	return sizeof(*dh) + (dh->dccph_x ? sizeof(struct dccp_hdr_ext) : 0);
+}
+
+static inline unsigned int dccp_basic_hdr_len(const struct sk_buff *skb)
+{
+	const struct dccp_hdr *dh = dccp_hdr(skb);
+	return __dccp_basic_hdr_len(dh);
+}
+
+static inline __u64 dccp_hdr_seq(const struct sk_buff *skb)
+{
+	const struct dccp_hdr *dh = dccp_hdr(skb);
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u64 seq_nr = ntohl(dh->dccph_seq << 8);
+#elif defined(__BIG_ENDIAN_BITFIELD)
+	__u64 seq_nr = ntohl(dh->dccph_seq);
+#else
+#error  "Adjust your <asm/byteorder.h> defines"
+#endif
+
+	if (dh->dccph_x != 0)
+		seq_nr = (seq_nr << 32) + ntohl(dccp_hdrx(skb)->dccph_seq_low);
+
+	return seq_nr;
+}
+
+static inline struct dccp_hdr_request *dccp_hdr_request(struct sk_buff *skb)
+{
+	return (struct dccp_hdr_request *)(skb->h.raw + dccp_basic_hdr_len(skb));
+}
+
+static inline struct dccp_hdr_ack_bits *dccp_hdr_ack_bits(const struct sk_buff *skb)
+{
+	return (struct dccp_hdr_ack_bits *)(skb->h.raw + dccp_basic_hdr_len(skb));
+}
+
+static inline u64 dccp_hdr_ack_seq(const struct sk_buff *skb)
+{
+	const struct dccp_hdr_ack_bits *dhack = dccp_hdr_ack_bits(skb);
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	return (((u64)ntohl(dhack->dccph_ack_nr_high << 8)) << 32) + ntohl(dhack->dccph_ack_nr_low);
+#elif defined(__BIG_ENDIAN_BITFIELD)
+	return (((u64)ntohl(dhack->dccph_ack_nr_high)) << 32) + ntohl(dhack->dccph_ack_nr_low);
+#else
+#error  "Adjust your <asm/byteorder.h> defines"
+#endif
+}
+
+static inline struct dccp_hdr_response *dccp_hdr_response(struct sk_buff *skb)
+{
+	return (struct dccp_hdr_response *)(skb->h.raw + dccp_basic_hdr_len(skb));
+}
+
+static inline struct dccp_hdr_reset *dccp_hdr_reset(struct sk_buff *skb)
+{
+	return (struct dccp_hdr_reset *)(skb->h.raw + dccp_basic_hdr_len(skb));
+}
+
+static inline unsigned int __dccp_hdr_len(const struct dccp_hdr *dh)
+{
+	return __dccp_basic_hdr_len(dh) +
+	       dccp_packet_hdr_len(dh->dccph_type);
+}
+
+static inline unsigned int dccp_hdr_len(const struct sk_buff *skb)
+{
+	return __dccp_hdr_len(dccp_hdr(skb));
+}
+
+
+/* initial values for each feature */
+#define DCCPF_INITIAL_SEQUENCE_WINDOW		100
+/* FIXME: for now we're using CCID 3 (TFRC) */
+#define DCCPF_INITIAL_CCID			3
+#define DCCPF_INITIAL_SEND_ACK_VECTOR		0
+/* FIXME: for now we're default to 1 but it should really be 0 */
+#define DCCPF_INITIAL_SEND_NDP_COUNT		1
+
+#define DCCP_NDP_LIMIT 0xFFFFFF
+
+/**
+  * struct dccp_options - option values for a DCCP connection
+  *	@dccpo_sequence_window - Sequence Window Feature (section 7.5.2)
+  *	@dccpo_ccid - Congestion Control Id (CCID) (section 10)
+  *	@dccpo_send_ack_vector - Send Ack Vector Feature (section 11.5)
+  *	@dccpo_send_ndp_count - Send NDP Count Feature (7.7.2)
+  */
+struct dccp_options {
+	__u64	dccpo_sequence_window;
+	__u8	dccpo_ccid;
+	__u8	dccpo_send_ack_vector;
+	__u8	dccpo_send_ndp_count;
+};
+
+extern void __dccp_options_init(struct dccp_options *dccpo);
+extern void dccp_options_init(struct dccp_options *dccpo);
+extern int dccp_parse_options(struct sock *sk, struct sk_buff *skb);
+
+struct dccp_request_sock {
+	struct inet_request_sock dreq_inet_rsk;
+	__u64			 dreq_iss;
+	__u64			 dreq_isr;
+	__u32			 dreq_service;
+};
+
+static inline struct dccp_request_sock *dccp_rsk(const struct request_sock *req)
+{
+	return (struct dccp_request_sock *)req;
+}
+
+extern struct inet_timewait_death_row dccp_death_row;
+
+/* Read about the ECN nonce to see why it is 253 */
+#define DCCP_MAX_ACK_VECTOR_LEN 253
+
+struct dccp_options_received {
+	u32	dccpor_ndp:24,
+		dccpor_ack_vector_len:8;
+	u32	dccpor_ack_vector_idx:10;
+	/* 22 bits hole, try to pack */
+	u32	dccpor_timestamp;
+	u32	dccpor_timestamp_echo;
+	u32	dccpor_elapsed_time;
+};
+
+struct ccid;
+
+enum dccp_role {
+	DCCP_ROLE_UNDEFINED,
+	DCCP_ROLE_LISTEN,
+	DCCP_ROLE_CLIENT,
+	DCCP_ROLE_SERVER,
+};
+
+/**
+ * struct dccp_sock - DCCP socket state
+ *
+ * @dccps_swl - sequence number window low
+ * @dccps_swh - sequence number window high
+ * @dccps_awl - acknowledgement number window low
+ * @dccps_awh - acknowledgement number window high
+ * @dccps_iss - initial sequence number sent
+ * @dccps_isr - initial sequence number received
+ * @dccps_osr - first OPEN sequence number received
+ * @dccps_gss - greatest sequence number sent
+ * @dccps_gsr - greatest valid sequence number received
+ * @dccps_gar - greatest valid ack number received on a non-Sync; initialized to %dccps_iss
+ * @dccps_timestamp_time - time of latest TIMESTAMP option
+ * @dccps_timestamp_echo - latest timestamp received on a TIMESTAMP option
+ * @dccps_ext_header_len - network protocol overhead (IP/IPv6 options)
+ * @dccps_pmtu_cookie - Last pmtu seen by socket
+ * @dccps_packet_size - Set thru setsockopt
+ * @dccps_role - Role of this sock, one of %dccp_role
+ * @dccps_ndp_count - number of Non Data Packets since last data packet
+ * @dccps_hc_rx_ackpkts - receiver half connection acked packets
+ */
+struct dccp_sock {
+	/* inet_connection_sock has to be the first member of dccp_sock */
+	struct inet_connection_sock	dccps_inet_connection;
+	__u64				dccps_swl;
+	__u64				dccps_swh;
+	__u64				dccps_awl;
+	__u64				dccps_awh;
+	__u64				dccps_iss;
+	__u64				dccps_isr;
+	__u64				dccps_osr;
+	__u64				dccps_gss;
+	__u64				dccps_gsr;
+	__u64				dccps_gar;
+	unsigned long			dccps_service;
+	struct timeval			dccps_timestamp_time;
+	__u32				dccps_timestamp_echo;
+	__u32				dccps_packet_size;
+	unsigned long			dccps_ndp_count;
+	__u16				dccps_ext_header_len;
+	__u32				dccps_pmtu_cookie;
+	__u32				dccps_mss_cache;
+	struct dccp_options		dccps_options;
+	struct dccp_ackpkts		*dccps_hc_rx_ackpkts;
+	void				*dccps_hc_rx_ccid_private;
+	void				*dccps_hc_tx_ccid_private;
+	struct ccid			*dccps_hc_rx_ccid;
+	struct ccid			*dccps_hc_tx_ccid;
+	struct dccp_options_received	dccps_options_received;
+	enum dccp_role			dccps_role:2;
+};
+ 
+static inline struct dccp_sock *dccp_sk(const struct sock *sk)
+{
+	return (struct dccp_sock *)sk;
+}
+
+static inline const char *dccp_role(const struct sock *sk)
+{
+	switch (dccp_sk(sk)->dccps_role) {
+	case DCCP_ROLE_UNDEFINED: return "undefined";
+	case DCCP_ROLE_LISTEN:	  return "listen";
+	case DCCP_ROLE_SERVER:	  return "server";
+	case DCCP_ROLE_CLIENT:	  return "client";
+	}
+	return NULL;
+}
+
+#endif /* __KERNEL__ */
+
+#endif /* _LINUX_DCCP_H */
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index d7021c391b2..ed1440ea4c9 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -250,6 +250,12 @@ struct ethtool_stats {
 	u64	data[0];
 };
 
+struct ethtool_perm_addr {
+	u32	cmd;		/* ETHTOOL_GPERMADDR */
+	u32	size;
+	u8	data[0];
+};
+
 struct net_device;
 
 /* Some generic methods drivers may use in their ethtool_ops */
@@ -261,6 +267,8 @@ u32 ethtool_op_get_sg(struct net_device *dev);
 int ethtool_op_set_sg(struct net_device *dev, u32 data);
 u32 ethtool_op_get_tso(struct net_device *dev);
 int ethtool_op_set_tso(struct net_device *dev, u32 data);
+int ethtool_op_get_perm_addr(struct net_device *dev, 
+			     struct ethtool_perm_addr *addr, u8 *data);
 
 /**
  * &ethtool_ops - Alter and report network device settings
@@ -294,7 +302,8 @@ int ethtool_op_set_tso(struct net_device *dev, u32 data);
  * get_strings: Return a set of strings that describe the requested objects 
  * phys_id: Identify the device
  * get_stats: Return statistics about the device
- *
+ * get_perm_addr: Gets the permanent hardware address
+ * 
  * Description:
  *
  * get_settings:
@@ -352,6 +361,7 @@ struct ethtool_ops {
 	int	(*phys_id)(struct net_device *, u32);
 	int	(*get_stats_count)(struct net_device *);
 	void	(*get_ethtool_stats)(struct net_device *, struct ethtool_stats *, u64 *);
+	int	(*get_perm_addr)(struct net_device *, struct ethtool_perm_addr *, u8 *);
 	int	(*begin)(struct net_device *);
 	void	(*complete)(struct net_device *);
 };
@@ -389,6 +399,7 @@ struct ethtool_ops {
 #define ETHTOOL_GSTATS		0x0000001d /* get NIC-specific statistics */
 #define ETHTOOL_GTSO		0x0000001e /* Get TSO enable (ethtool_value) */
 #define ETHTOOL_STSO		0x0000001f /* Set TSO enable (ethtool_value) */
+#define ETHTOOL_GPERMADDR	0x00000020 /* Get permanent hardware address */
 
 /* compatibility with older code */
 #define SPARC_ETH_GSET		ETHTOOL_GSET
diff --git a/include/linux/hippidevice.h b/include/linux/hippidevice.h
index 9debe6bbe5f..bab303dafd6 100644
--- a/include/linux/hippidevice.h
+++ b/include/linux/hippidevice.h
@@ -26,8 +26,12 @@
 #include <linux/if_hippi.h>
 
 #ifdef __KERNEL__
-extern unsigned short hippi_type_trans(struct sk_buff *skb,
-				       struct net_device *dev);
+
+struct hippi_cb {
+	__u32	ifield;
+};
+
+extern __be16 hippi_type_trans(struct sk_buff *skb, struct net_device *dev);
 
 extern struct net_device *alloc_hippi_dev(int sizeof_priv);
 #endif
diff --git a/include/linux/if_ether.h b/include/linux/if_ether.h
index b5b58e9c054..fc2d4c8225a 100644
--- a/include/linux/if_ether.h
+++ b/include/linux/if_ether.h
@@ -110,6 +110,8 @@ static inline struct ethhdr *eth_hdr(const struct sk_buff *skb)
 {
 	return (struct ethhdr *)skb->mac.raw;
 }
+
+extern struct ctl_table ether_table[];
 #endif
 
 #endif	/* _LINUX_IF_ETHER_H */
diff --git a/include/linux/if_fc.h b/include/linux/if_fc.h
index 33330b458b9..376a34ea472 100644
--- a/include/linux/if_fc.h
+++ b/include/linux/if_fc.h
@@ -44,7 +44,7 @@ struct fcllc {
 	__u8  ssap;			/* source SAP */
 	__u8  llc;			/* LLC control field */
 	__u8  protid[3];		/* protocol id */
-	__u16 ethertype;		/* ether type field */
+	__be16 ethertype;		/* ether type field */
 };
 
 #endif	/* _LINUX_IF_FC_H */
diff --git a/include/linux/if_fddi.h b/include/linux/if_fddi.h
index a912818e636..1288a161bc0 100644
--- a/include/linux/if_fddi.h
+++ b/include/linux/if_fddi.h
@@ -85,7 +85,7 @@ struct fddi_snap_hdr
 	__u8	ssap;					/* always 0xAA */
 	__u8	ctrl;					/* always 0x03 */
 	__u8	oui[FDDI_K_OUI_LEN];	/* organizational universal id */
-	__u16	ethertype;				/* packet type ID field */
+	__be16	ethertype;				/* packet type ID field */
 	} __attribute__ ((packed));
 
 /* Define FDDI LLC frame header */
diff --git a/include/linux/if_frad.h b/include/linux/if_frad.h
index 3c94b173657..511999c7eed 100644
--- a/include/linux/if_frad.h
+++ b/include/linux/if_frad.h
@@ -191,10 +191,12 @@ struct frad_local
    int               buffer;		/* current buffer for S508 firmware */
 };
 
-extern void dlci_ioctl_set(int (*hook)(unsigned int, void __user *));
-
 #endif /* __KERNEL__ */
 
 #endif /* CONFIG_DLCI || CONFIG_DLCI_MODULE */
 
+#ifdef __KERNEL__
+extern void dlci_ioctl_set(int (*hook)(unsigned int, void __user *));
+#endif
+
 #endif
diff --git a/include/linux/if_hippi.h b/include/linux/if_hippi.h
index c8ca72c46f7..94d31ca7d71 100644
--- a/include/linux/if_hippi.h
+++ b/include/linux/if_hippi.h
@@ -102,9 +102,9 @@ struct hippi_fp_hdr
 #error	"Please fix <asm/byteorder.h>"
 #endif
 #else
-	__u32		fixed;
+	__be32		fixed;
 #endif
-	__u32		d2_size;
+	__be32		d2_size;
 } __attribute__ ((packed));
 
 struct hippi_le_hdr
@@ -144,7 +144,7 @@ struct hippi_snap_hdr
 	__u8	ssap;			/* always 0xAA */
 	__u8	ctrl;			/* always 0x03 */
 	__u8	oui[HIPPI_OUI_LEN];	/* organizational universal id (zero)*/
-	__u16	ethertype;		/* packet type ID field */
+	__be16	ethertype;		/* packet type ID field */
 } __attribute__ ((packed));
 
 struct hippi_hdr
diff --git a/include/linux/if_tr.h b/include/linux/if_tr.h
index 3fba9e2f542..5502f597cf0 100644
--- a/include/linux/if_tr.h
+++ b/include/linux/if_tr.h
@@ -43,12 +43,16 @@ struct trh_hdr {
 };
 
 #ifdef __KERNEL__
+#include <linux/config.h>
 #include <linux/skbuff.h>
 
 static inline struct trh_hdr *tr_hdr(const struct sk_buff *skb)
 {
 	return (struct trh_hdr *)skb->mac.raw;
 }
+#ifdef CONFIG_SYSCTL
+extern struct ctl_table tr_table[];
+#endif
 #endif
 
 /* This is an Token-Ring LLC structure */
diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 62a9d89dfbe..17d0c0d40b0 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -155,7 +155,6 @@ static inline int __vlan_hwaccel_rx(struct sk_buff *skb,
 {
 	struct net_device_stats *stats;
 
-	skb->real_dev = skb->dev;
 	skb->dev = grp->vlan_devices[vlan_tag & VLAN_VID_MASK];
 	if (skb->dev == NULL) {
 		dev_kfree_skb_any(skb);
diff --git a/include/linux/igmp.h b/include/linux/igmp.h
index 0c31ef0b5ba..28f4f3b3695 100644
--- a/include/linux/igmp.h
+++ b/include/linux/igmp.h
@@ -129,6 +129,9 @@ struct igmpv3_query {
 #include <linux/skbuff.h>
 #include <linux/in.h>
 
+extern int sysctl_igmp_max_memberships;
+extern int sysctl_igmp_max_msf;
+
 struct ip_sf_socklist
 {
 	unsigned int		sl_max;
diff --git a/include/linux/in.h b/include/linux/in.h
index fb88c66d748..ba355384016 100644
--- a/include/linux/in.h
+++ b/include/linux/in.h
@@ -32,6 +32,7 @@ enum {
   IPPROTO_PUP = 12,		/* PUP protocol				*/
   IPPROTO_UDP = 17,		/* User Datagram Protocol		*/
   IPPROTO_IDP = 22,		/* XNS IDP protocol			*/
+  IPPROTO_DCCP = 33,		/* Datagram Congestion Control Protocol */
   IPPROTO_RSVP = 46,		/* RSVP protocol			*/
   IPPROTO_GRE = 47,		/* Cisco GRE tunnels (rfc 1701,1702)	*/
 
diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h
new file mode 100644
index 00000000000..a4606e5810e
--- /dev/null
+++ b/include/linux/inet_diag.h
@@ -0,0 +1,138 @@
+#ifndef _INET_DIAG_H_
+#define _INET_DIAG_H_ 1
+
+/* Just some random number */
+#define TCPDIAG_GETSOCK 18
+#define DCCPDIAG_GETSOCK 19
+
+#define INET_DIAG_GETSOCK_MAX 24
+
+/* Socket identity */
+struct inet_diag_sockid {
+	__u16	idiag_sport;
+	__u16	idiag_dport;
+	__u32	idiag_src[4];
+	__u32	idiag_dst[4];
+	__u32	idiag_if;
+	__u32	idiag_cookie[2];
+#define INET_DIAG_NOCOOKIE (~0U)
+};
+
+/* Request structure */
+
+struct inet_diag_req {
+	__u8	idiag_family;		/* Family of addresses. */
+	__u8	idiag_src_len;
+	__u8	idiag_dst_len;
+	__u8	idiag_ext;		/* Query extended information */
+
+	struct inet_diag_sockid id;
+
+	__u32	idiag_states;		/* States to dump */
+	__u32	idiag_dbs;		/* Tables to dump (NI) */
+};
+
+enum {
+	INET_DIAG_REQ_NONE,
+	INET_DIAG_REQ_BYTECODE,
+};
+
+#define INET_DIAG_REQ_MAX INET_DIAG_REQ_BYTECODE
+
+/* Bytecode is sequence of 4 byte commands followed by variable arguments.
+ * All the commands identified by "code" are conditional jumps forward:
+ * to offset cc+"yes" or to offset cc+"no". "yes" is supposed to be
+ * length of the command and its arguments.
+ */
+ 
+struct inet_diag_bc_op {
+	unsigned char	code;
+	unsigned char	yes;
+	unsigned short	no;
+};
+
+enum {
+	INET_DIAG_BC_NOP,
+	INET_DIAG_BC_JMP,
+	INET_DIAG_BC_S_GE,
+	INET_DIAG_BC_S_LE,
+	INET_DIAG_BC_D_GE,
+	INET_DIAG_BC_D_LE,
+	INET_DIAG_BC_AUTO,
+	INET_DIAG_BC_S_COND,
+	INET_DIAG_BC_D_COND,
+};
+
+struct inet_diag_hostcond {
+	__u8	family;
+	__u8	prefix_len;
+	int	port;
+	__u32	addr[0];
+};
+
+/* Base info structure. It contains socket identity (addrs/ports/cookie)
+ * and, alas, the information shown by netstat. */
+struct inet_diag_msg {
+	__u8	idiag_family;
+	__u8	idiag_state;
+	__u8	idiag_timer;
+	__u8	idiag_retrans;
+
+	struct inet_diag_sockid id;
+
+	__u32	idiag_expires;
+	__u32	idiag_rqueue;
+	__u32	idiag_wqueue;
+	__u32	idiag_uid;
+	__u32	idiag_inode;
+};
+
+/* Extensions */
+
+enum {
+	INET_DIAG_NONE,
+	INET_DIAG_MEMINFO,
+	INET_DIAG_INFO,
+	INET_DIAG_VEGASINFO,
+	INET_DIAG_CONG,
+};
+
+#define INET_DIAG_MAX INET_DIAG_CONG
+
+
+/* INET_DIAG_MEM */
+
+struct inet_diag_meminfo {
+	__u32	idiag_rmem;
+	__u32	idiag_wmem;
+	__u32	idiag_fmem;
+	__u32	idiag_tmem;
+};
+
+/* INET_DIAG_VEGASINFO */
+
+struct tcpvegas_info {
+	__u32	tcpv_enabled;
+	__u32	tcpv_rttcnt;
+	__u32	tcpv_rtt;
+	__u32	tcpv_minrtt;
+};
+
+#ifdef __KERNEL__
+struct sock;
+struct inet_hashinfo;
+
+struct inet_diag_handler {
+	struct inet_hashinfo    *idiag_hashinfo;
+	void			(*idiag_get_info)(struct sock *sk,
+						  struct inet_diag_msg *r,
+						  void *info);
+	__u16                   idiag_info_size;
+	__u16                   idiag_type;
+};
+
+extern int  inet_diag_register(const struct inet_diag_handler *handler);
+extern void inet_diag_unregister(const struct inet_diag_handler *handler);
+#endif /* __KERNEL__ */
+
+#endif /* _INET_DIAG_H_ */
diff --git a/include/linux/ip.h b/include/linux/ip.h
index 31e7cedd9f8..33e8a19a1a0 100644
--- a/include/linux/ip.h
+++ b/include/linux/ip.h
@@ -196,6 +196,8 @@ static inline void inet_sk_copy_descendant(struct sock *sk_to,
 #endif
 #endif
 
+extern int inet_sk_rebuild_header(struct sock *sk);
+
 struct iphdr {
 #if defined(__LITTLE_ENDIAN_BITFIELD)
 	__u8	ihl:4,
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 6fcd6a0ade2..3c7dbc6a0a7 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -193,6 +193,11 @@ struct inet6_skb_parm {
 
 #define IP6CB(skb)	((struct inet6_skb_parm*)((skb)->cb))
 
+static inline int inet6_iif(const struct sk_buff *skb)
+{
+	return IP6CB(skb)->iif;
+}
+
 struct tcp6_request_sock {
 	struct tcp_request_sock	req;
 	struct in6_addr		loc_addr;
@@ -308,6 +313,36 @@ static inline void inet_sk_copy_descendant(struct sock *sk_to,
 
 #define __ipv6_only_sock(sk)	(inet6_sk(sk)->ipv6only)
 #define ipv6_only_sock(sk)	((sk)->sk_family == PF_INET6 && __ipv6_only_sock(sk))
+
+#include <linux/tcp.h>
+
+struct tcp6_timewait_sock {
+	struct tcp_timewait_sock tw_v6_sk;
+	struct in6_addr		 tw_v6_daddr;
+	struct in6_addr		 tw_v6_rcv_saddr;
+};
+
+static inline struct tcp6_timewait_sock *tcp6_twsk(const struct sock *sk)
+{
+	return (struct tcp6_timewait_sock *)sk;
+}
+
+static inline struct in6_addr *__tcp_v6_rcv_saddr(const struct sock *sk)
+{
+	return likely(sk->sk_state != TCP_TIME_WAIT) ?
+		&inet6_sk(sk)->rcv_saddr : &tcp6_twsk(sk)->tw_v6_rcv_saddr;
+}
+
+static inline struct in6_addr *tcp_v6_rcv_saddr(const struct sock *sk)
+{
+	return sk->sk_family == AF_INET6 ? __tcp_v6_rcv_saddr(sk) : NULL;
+}
+
+static inline int inet_v6_ipv6only(const struct sock *sk)
+{
+	return likely(sk->sk_state != TCP_TIME_WAIT) ?
+		ipv6_only_sock(sk) : inet_twsk(sk)->tw_ipv6only;
+}
 #else
 #define __ipv6_only_sock(sk)	0
 #define ipv6_only_sock(sk)	0
@@ -322,8 +357,19 @@ static inline struct raw6_sock *raw6_sk(const struct sock *sk)
 	return NULL;
 }
 
-#endif
+#define __tcp_v6_rcv_saddr(__sk)	NULL
+#define tcp_v6_rcv_saddr(__sk)		NULL
+#define tcp_twsk_ipv6only(__sk)		0
+#define inet_v6_ipv6only(__sk)		0
+#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
 
-#endif
+#define INET6_MATCH(__sk, __saddr, __daddr, __ports, __dif)	   \
+	(((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports))  	&& \
+	 ((__sk)->sk_family		== AF_INET6)		&& \
+	 ipv6_addr_equal(&inet6_sk(__sk)->daddr, (__saddr))	&& \
+	 ipv6_addr_equal(&inet6_sk(__sk)->rcv_saddr, (__daddr))	&& \
+	 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
 
-#endif
+#endif /* __KERNEL__ */
+
+#endif /* _IPV6_H */
diff --git a/include/linux/list.h b/include/linux/list.h
index aab2db21b01..e6ec5968227 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -419,6 +419,20 @@ static inline void list_splice_init(struct list_head *list,
 	     pos = n, n = list_entry(n->member.next, typeof(*n), member))
 
 /**
+ * list_for_each_entry_safe_continue -	iterate over list of given type
+ *			continuing after existing point safe against removal of list entry
+ * @pos:	the type * to use as a loop counter.
+ * @n:		another type * to use as temporary storage
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ */
+#define list_for_each_entry_safe_continue(pos, n, head, member) 		\
+	for (pos = list_entry(pos->member.next, typeof(*pos), member), 		\
+		n = list_entry(pos->member.next, typeof(*pos), member);		\
+	     &pos->member != (head);						\
+	     pos = n, n = list_entry(n->member.next, typeof(*n), member))
+
+/**
  * list_for_each_rcu	-	iterate over an rcu-protected list
  * @pos:	the &struct list_head to use as a loop counter.
  * @head:	the head for your list.
@@ -620,6 +634,57 @@ static inline void hlist_add_after(struct hlist_node *n,
 		next->next->pprev  = &next->next;
 }
 
+/**
+ * hlist_add_before_rcu - adds the specified element to the specified hlist
+ * before the specified node while permitting racing traversals.
+ * @n: the new element to add to the hash list.
+ * @next: the existing element to add the new element before.
+ *
+ * The caller must take whatever precautions are necessary
+ * (such as holding appropriate locks) to avoid racing
+ * with another list-mutation primitive, such as hlist_add_head_rcu()
+ * or hlist_del_rcu(), running on this same list.
+ * However, it is perfectly legal to run concurrently with
+ * the _rcu list-traversal primitives, such as
+ * hlist_for_each_rcu(), used to prevent memory-consistency
+ * problems on Alpha CPUs.
+ */
+static inline void hlist_add_before_rcu(struct hlist_node *n,
+					struct hlist_node *next)
+{
+	n->pprev = next->pprev;
+	n->next = next;
+	smp_wmb();
+	next->pprev = &n->next;
+	*(n->pprev) = n;
+}
+
+/**
+ * hlist_add_after_rcu - adds the specified element to the specified hlist
+ * after the specified node while permitting racing traversals.
+ * @prev: the existing element to add the new element after.
+ * @n: the new element to add to the hash list.
+ *
+ * The caller must take whatever precautions are necessary
+ * (such as holding appropriate locks) to avoid racing
+ * with another list-mutation primitive, such as hlist_add_head_rcu()
+ * or hlist_del_rcu(), running on this same list.
+ * However, it is perfectly legal to run concurrently with
+ * the _rcu list-traversal primitives, such as
+ * hlist_for_each_rcu(), used to prevent memory-consistency
+ * problems on Alpha CPUs.
+ */
+static inline void hlist_add_after_rcu(struct hlist_node *prev,
+				       struct hlist_node *n)
+{
+	n->next = prev->next;
+	n->pprev = &prev->next;
+	smp_wmb();
+	prev->next = n;
+	if (n->next)
+		n->next->pprev = &n->next;
+}
+
 #define hlist_entry(ptr, type, member) container_of(ptr,type,member)
 
 #define hlist_for_each(pos, head) \
diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index 97bbccdbcca..47da39ba3f0 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -1,6 +1,6 @@
 /*
  * Device tables which are exported to userspace via
- * scripts/table2alias.c.  You must keep that file in sync with this
+ * scripts/mod/file2alias.c.  You must keep that file in sync with this
  * header.
  */
 
@@ -190,6 +190,11 @@ struct of_device_id
 #endif
 };
 
+/* VIO */
+struct vio_device_id {
+	char type[32];
+	char compat[32];
+};
 
 /* PCMCIA */
 
diff --git a/include/linux/net.h b/include/linux/net.h
index 20cb226b226..4e981585a89 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -84,6 +84,7 @@ enum sock_type {
 	SOCK_RAW	= 3,
 	SOCK_RDM	= 4,
 	SOCK_SEQPACKET	= 5,
+	SOCK_DCCP	= 6,
 	SOCK_PACKET	= 10,
 };
 
@@ -282,5 +283,15 @@ static struct proto_ops name##_ops = {			\
 #define MODULE_ALIAS_NETPROTO(proto) \
 	MODULE_ALIAS("net-pf-" __stringify(proto))
 
+#define MODULE_ALIAS_NET_PF_PROTO(pf, proto) \
+	MODULE_ALIAS("net-pf-" __stringify(pf) "-proto-" __stringify(proto))
+
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+extern ctl_table net_table[];
+extern int net_msg_cost;
+extern int net_msg_burst;
+#endif
+
 #endif /* __KERNEL__ */
 #endif	/* _LINUX_NET_H */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3a0ed7f9e80..7c717907896 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -244,6 +244,7 @@ struct netdev_boot_setup {
 };
 #define NETDEV_BOOT_SETUP_MAX 8
 
+extern int __init netdev_boot_setup(char *str);
 
 /*
  *	The DEVICE structure.
@@ -336,6 +337,7 @@ struct net_device
 	/* Interface address info. */
 	unsigned char		broadcast[MAX_ADDR_LEN];	/* hw bcast add	*/
 	unsigned char		dev_addr[MAX_ADDR_LEN];	/* hw address	*/
+	unsigned char		perm_addr[MAX_ADDR_LEN]; /* permanent hw address */
 	unsigned char		addr_len;	/* hardware address length	*/
 	unsigned short          dev_id;		/* for shared network cards */
 
@@ -497,10 +499,12 @@ static inline void *netdev_priv(struct net_device *dev)
 #define SET_NETDEV_DEV(net, pdev)	((net)->class_dev.dev = (pdev))
 
 struct packet_type {
-	__be16			type;	/* This is really htons(ether_type).	*/
-	struct net_device		*dev;	/* NULL is wildcarded here		*/
-	int			(*func) (struct sk_buff *, struct net_device *,
-					 struct packet_type *);
+	__be16			type;	/* This is really htons(ether_type). */
+	struct net_device	*dev;	/* NULL is wildcarded here	     */
+	int			(*func) (struct sk_buff *,
+					 struct net_device *,
+					 struct packet_type *,
+					 struct net_device *);
 	void			*af_packet_priv;
 	struct list_head	list;
 };
@@ -671,6 +675,7 @@ extern void		dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev);
 extern void		dev_init(void);
 
 extern int		netdev_nit;
+extern int		netdev_budget;
 
 /* Called by rtnetlink.c:rtnl_unlock() */
 extern void netdev_run_todo(void);
@@ -697,19 +702,9 @@ static inline int netif_carrier_ok(const struct net_device *dev)
 
 extern void __netdev_watchdog_up(struct net_device *dev);
 
-static inline void netif_carrier_on(struct net_device *dev)
-{
-	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
-		linkwatch_fire_event(dev);
-	if (netif_running(dev))
-		__netdev_watchdog_up(dev);
-}
+extern void netif_carrier_on(struct net_device *dev);
 
-static inline void netif_carrier_off(struct net_device *dev)
-{
-	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
-		linkwatch_fire_event(dev);
-}
+extern void netif_carrier_off(struct net_device *dev);
 
 /* Hot-plugging. */
 static inline int netif_device_present(struct net_device *dev)
@@ -916,6 +911,14 @@ extern int skb_checksum_help(struct sk_buff *skb, int inward);
 extern void		net_enable_timestamp(void);
 extern void		net_disable_timestamp(void);
 
+#ifdef CONFIG_PROC_FS
+extern void *dev_seq_start(struct seq_file *seq, loff_t *pos);
+extern void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos);
+extern void dev_seq_stop(struct seq_file *seq, void *v);
+#endif
+
+extern void linkwatch_run_queue(void);
+
 #endif /* __KERNEL__ */
 
 #endif	/* _LINUX_DEV_H */
diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 2e2045482cb..be365e70ee9 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -21,10 +21,23 @@
 #define NF_STOP 5
 #define NF_MAX_VERDICT NF_STOP
 
+/* we overload the higher bits for encoding auxiliary data such as the queue
+ * number. Not nice, but better than additional function arguments. */
+#define NF_VERDICT_MASK 0x0000ffff
+#define NF_VERDICT_BITS 16
+
+#define NF_VERDICT_QMASK 0xffff0000
+#define NF_VERDICT_QBITS 16
+
+#define NF_QUEUE_NR(x) (((x << NF_VERDICT_QBITS) & NF_VERDICT_QMASK) | NF_QUEUE)
+
+/* only for userspace compatibility */
+#ifndef __KERNEL__
 /* Generic cache responses from hook functions.
    <= 0x2000 is used for protocol-flags. */
 #define NFC_UNKNOWN 0x4000
 #define NFC_ALTERED 0x8000
+#endif
 
 #ifdef __KERNEL__
 #include <linux/config.h>
@@ -101,15 +114,51 @@ void nf_unregister_sockopt(struct nf_sockopt_ops *reg);
 
 extern struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS];
 
-typedef void nf_logfn(unsigned int hooknum,
+/* those NF_LOG_* defines and struct nf_loginfo are legacy definitios that will
+ * disappear once iptables is replaced with pkttables.  Please DO NOT use them
+ * for any new code! */
+#define NF_LOG_TCPSEQ		0x01	/* Log TCP sequence numbers */
+#define NF_LOG_TCPOPT		0x02	/* Log TCP options */
+#define NF_LOG_IPOPT		0x04	/* Log IP options */
+#define NF_LOG_UID		0x08	/* Log UID owning local socket */
+#define NF_LOG_MASK		0x0f
+
+#define NF_LOG_TYPE_LOG		0x01
+#define NF_LOG_TYPE_ULOG	0x02
+
+struct nf_loginfo {
+	u_int8_t type;
+	union {
+		struct {
+			u_int32_t copy_len;
+			u_int16_t group;
+			u_int16_t qthreshold;
+		} ulog;
+		struct {
+			u_int8_t level;
+			u_int8_t logflags;
+		} log;
+	} u;
+};
+
+typedef void nf_logfn(unsigned int pf,
+		      unsigned int hooknum,
 		      const struct sk_buff *skb,
 		      const struct net_device *in,
 		      const struct net_device *out,
+		      const struct nf_loginfo *li,
 		      const char *prefix);
 
+struct nf_logger {
+	struct module	*me;
+	nf_logfn 	*logfn;
+	char		*name;
+};
+
 /* Function to register/unregister log function. */
-int nf_log_register(int pf, nf_logfn *logfn);
-void nf_log_unregister(int pf, nf_logfn *logfn);
+int nf_log_register(int pf, struct nf_logger *logger);
+int nf_log_unregister_pf(int pf);
+void nf_log_unregister_logger(struct nf_logger *logger);
 
 /* Calls the registered backend logging function */
 void nf_log_packet(int pf,
@@ -117,6 +166,7 @@ void nf_log_packet(int pf,
 		   const struct sk_buff *skb,
 		   const struct net_device *in,
 		   const struct net_device *out,
+		   struct nf_loginfo *li,
 		   const char *fmt, ...);
                    
 /* Activate hook; either okfn or kfree_skb called, unless a hook
@@ -175,11 +225,16 @@ int nf_getsockopt(struct sock *sk, int pf, int optval, char __user *opt,
 		  int *len);
 
 /* Packet queuing */
-typedef int (*nf_queue_outfn_t)(struct sk_buff *skb, 
-                                struct nf_info *info, void *data);
+struct nf_queue_handler {
+	int (*outfn)(struct sk_buff *skb, struct nf_info *info,
+		     unsigned int queuenum, void *data);
+	void *data;
+	char *name;
+};
 extern int nf_register_queue_handler(int pf, 
-                                     nf_queue_outfn_t outfn, void *data);
+                                     struct nf_queue_handler *qh);
 extern int nf_unregister_queue_handler(int pf);
+extern void nf_unregister_queue_handlers(struct nf_queue_handler *qh);
 extern void nf_reinject(struct sk_buff *skb,
 			struct nf_info *info,
 			unsigned int verdict);
@@ -190,6 +245,27 @@ extern void nf_ct_attach(struct sk_buff *, struct sk_buff *);
 /* FIXME: Before cache is ever used, this must be implemented for real. */
 extern void nf_invalidate_cache(int pf);
 
+/* Call this before modifying an existing packet: ensures it is
+   modifiable and linear to the point you care about (writable_len).
+   Returns true or false. */
+extern int skb_make_writable(struct sk_buff **pskb, unsigned int writable_len);
+
+struct nf_queue_rerouter {
+	void (*save)(const struct sk_buff *skb, struct nf_info *info);
+	int (*reroute)(struct sk_buff **skb, const struct nf_info *info);
+	int rer_size;
+};
+
+#define nf_info_reroute(x) ((void *)x + sizeof(struct nf_info))
+
+extern int nf_register_queue_rerouter(int pf, struct nf_queue_rerouter *rer);
+extern int nf_unregister_queue_rerouter(int pf);
+
+#ifdef CONFIG_PROC_FS
+#include <linux/proc_fs.h>
+extern struct proc_dir_entry *proc_net_netfilter;
+#endif
+
 #else /* !CONFIG_NETFILTER */
 #define NF_HOOK(pf, hook, skb, indev, outdev, okfn) (okfn)(skb)
 static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {}
diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h
new file mode 100644
index 00000000000..1d5b10ae239
--- /dev/null
+++ b/include/linux/netfilter/nfnetlink.h
@@ -0,0 +1,169 @@
+#ifndef _NFNETLINK_H
+#define _NFNETLINK_H
+#include <linux/types.h>
+
+#ifndef __KERNEL__
+/* nfnetlink groups: Up to 32 maximum - backwards compatibility for userspace */
+#define NF_NETLINK_CONNTRACK_NEW 		0x00000001
+#define NF_NETLINK_CONNTRACK_UPDATE		0x00000002
+#define NF_NETLINK_CONNTRACK_DESTROY		0x00000004
+#define NF_NETLINK_CONNTRACK_EXP_NEW		0x00000008
+#define NF_NETLINK_CONNTRACK_EXP_UPDATE		0x00000010
+#define NF_NETLINK_CONNTRACK_EXP_DESTROY	0x00000020
+#endif
+
+enum nfnetlink_groups {
+	NFNLGRP_NONE,
+#define NFNLGRP_NONE			NFNLGRP_NONE
+	NFNLGRP_CONNTRACK_NEW,
+#define NFNLGRP_CONNTRACK_NEW		NFNLGRP_CONNTRACK_NEW
+	NFNLGRP_CONNTRACK_UPDATE,
+#define NFNLGRP_CONNTRACK_UPDATE	NFNLGRP_CONNTRACK_UPDATE
+	NFNLGRP_CONNTRACK_DESTROY,
+#define NFNLGRP_CONNTRACK_DESTROY	NFNLGRP_CONNTRACK_DESTROY
+	NFNLGRP_CONNTRACK_EXP_NEW,
+#define	NFNLGRP_CONNTRACK_EXP_NEW	NFNLGRP_CONNTRACK_EXP_NEW
+	NFNLGRP_CONNTRACK_EXP_UPDATE,
+#define NFNLGRP_CONNTRACK_EXP_UPDATE	NFNLGRP_CONNTRACK_EXP_UPDATE
+	NFNLGRP_CONNTRACK_EXP_DESTROY,
+#define NFNLGRP_CONNTRACK_EXP_DESTROY	NFNLGRP_CONNTRACK_EXP_DESTROY
+	__NFNLGRP_MAX,
+};
+#define NFNLGRP_MAX	(__NFNLGRP_MAX - 1)
+
+/* Generic structure for encapsulation optional netfilter information.
+ * It is reminiscent of sockaddr, but with sa_family replaced
+ * with attribute type. 
+ * ! This should someday be put somewhere generic as now rtnetlink and
+ * ! nfnetlink use the same attributes methods. - J. Schulist.
+ */
+
+struct nfattr
+{
+	u_int16_t nfa_len;
+	u_int16_t nfa_type;
+} __attribute__ ((packed));
+
+/* FIXME: Shamelessly copy and pasted from rtnetlink.h, it's time
+ * 	  to put this in a generic file */
+
+#define NFA_ALIGNTO     4
+#define NFA_ALIGN(len)	(((len) + NFA_ALIGNTO - 1) & ~(NFA_ALIGNTO - 1))
+#define NFA_OK(nfa,len)	((len) > 0 && (nfa)->nfa_len >= sizeof(struct nfattr) \
+	&& (nfa)->nfa_len <= (len))
+#define NFA_NEXT(nfa,attrlen)	((attrlen) -= NFA_ALIGN((nfa)->nfa_len), \
+	(struct nfattr *)(((char *)(nfa)) + NFA_ALIGN((nfa)->nfa_len)))
+#define NFA_LENGTH(len)	(NFA_ALIGN(sizeof(struct nfattr)) + (len))
+#define NFA_SPACE(len)	NFA_ALIGN(NFA_LENGTH(len))
+#define NFA_DATA(nfa)   ((void *)(((char *)(nfa)) + NFA_LENGTH(0)))
+#define NFA_PAYLOAD(nfa) ((int)((nfa)->nfa_len) - NFA_LENGTH(0))
+#define NFA_NEST(skb, type) \
+({	struct nfattr *__start = (struct nfattr *) (skb)->tail; \
+	NFA_PUT(skb, type, 0, NULL); \
+	__start;  })
+#define NFA_NEST_END(skb, start) \
+({      (start)->nfa_len = ((skb)->tail - (unsigned char *) (start)); \
+        (skb)->len; })
+#define NFA_NEST_CANCEL(skb, start) \
+({      if (start) \
+                skb_trim(skb, (unsigned char *) (start) - (skb)->data); \
+        -1; })
+
+/* General form of address family dependent message.
+ */
+struct nfgenmsg {
+	u_int8_t  nfgen_family;		/* AF_xxx */
+	u_int8_t  version;		/* nfnetlink version */
+	u_int16_t res_id;		/* resource id */
+} __attribute__ ((packed));
+
+#define NFNETLINK_V0	0
+
+#define NFM_NFA(n)      ((struct nfattr *)(((char *)(n)) \
+        + NLMSG_ALIGN(sizeof(struct nfgenmsg))))
+#define NFM_PAYLOAD(n)  NLMSG_PAYLOAD(n, sizeof(struct nfgenmsg))
+
+/* netfilter netlink message types are split in two pieces:
+ * 8 bit subsystem, 8bit operation.
+ */
+
+#define NFNL_SUBSYS_ID(x)	((x & 0xff00) >> 8)
+#define NFNL_MSG_TYPE(x)	(x & 0x00ff)
+
+/* No enum here, otherwise __stringify() trick of MODULE_ALIAS_NFNL_SUBSYS()
+ * won't work anymore */
+#define NFNL_SUBSYS_NONE 		0
+#define NFNL_SUBSYS_CTNETLINK		1
+#define NFNL_SUBSYS_CTNETLINK_EXP	2
+#define NFNL_SUBSYS_QUEUE		3
+#define NFNL_SUBSYS_ULOG		4
+#define NFNL_SUBSYS_COUNT		5
+
+#ifdef __KERNEL__
+
+#include <linux/netlink.h>
+#include <linux/capability.h>
+
+struct nfnl_callback
+{
+	int (*call)(struct sock *nl, struct sk_buff *skb, 
+		struct nlmsghdr *nlh, struct nfattr *cda[], int *errp);
+	kernel_cap_t cap_required; /* capabilities required for this msg */
+	u_int16_t attr_count;	/* number of nfattr's */
+};
+
+struct nfnetlink_subsystem
+{
+	const char *name;
+	__u8 subsys_id;		/* nfnetlink subsystem ID */
+	__u8 cb_count;		/* number of callbacks */
+	struct nfnl_callback *cb; /* callback for individual types */
+};
+
+extern void __nfa_fill(struct sk_buff *skb, int attrtype,
+        int attrlen, const void *data);
+#define NFA_PUT(skb, attrtype, attrlen, data) \
+({ if (skb_tailroom(skb) < (int)NFA_SPACE(attrlen)) goto nfattr_failure; \
+   __nfa_fill(skb, attrtype, attrlen, data); })
+
+extern struct semaphore nfnl_sem;
+
+#define nfnl_shlock()		down(&nfnl_sem)
+#define nfnl_shlock_nowait()	down_trylock(&nfnl_sem)
+
+#define nfnl_shunlock()		do { up(&nfnl_sem); \
+				     if(nfnl && nfnl->sk_receive_queue.qlen) \
+					    nfnl->sk_data_ready(nfnl, 0); \
+                        	} while(0)
+
+extern void nfnl_lock(void);
+extern void nfnl_unlock(void);
+
+extern int nfnetlink_subsys_register(struct nfnetlink_subsystem *n);
+extern int nfnetlink_subsys_unregister(struct nfnetlink_subsystem *n);
+
+extern int nfattr_parse(struct nfattr *tb[], int maxattr, 
+			struct nfattr *nfa, int len);
+
+#define nfattr_parse_nested(tb, max, nfa) \
+	nfattr_parse((tb), (max), NFA_DATA((nfa)), NFA_PAYLOAD((nfa)))
+
+#define nfattr_bad_size(tb, max, cta_min)				\
+({	int __i, __res = 0;						\
+ 	for (__i=0; __i<max; __i++) 					\
+ 		if (tb[__i] && NFA_PAYLOAD(tb[__i]) < cta_min[__i]){	\
+ 			__res = 1;					\
+ 			break;						\
+ 		}							\
+ 	__res;								\
+})
+
+extern int nfnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, 
+			  int echo);
+extern int nfnetlink_unicast(struct sk_buff *skb, u_int32_t pid, int flags);
+
+#define MODULE_ALIAS_NFNL_SUBSYS(subsys) \
+	MODULE_ALIAS("nfnetlink-subsys-" __stringify(subsys))
+
+#endif	/* __KERNEL__ */
+#endif	/* _NFNETLINK_H */
diff --git a/include/linux/netfilter/nfnetlink_conntrack.h b/include/linux/netfilter/nfnetlink_conntrack.h
new file mode 100644
index 00000000000..5c55751c78e
--- /dev/null
+++ b/include/linux/netfilter/nfnetlink_conntrack.h
@@ -0,0 +1,124 @@
+#ifndef _IPCONNTRACK_NETLINK_H
+#define _IPCONNTRACK_NETLINK_H
+#include <linux/netfilter/nfnetlink.h>
+
+enum cntl_msg_types {
+	IPCTNL_MSG_CT_NEW,
+	IPCTNL_MSG_CT_GET,
+	IPCTNL_MSG_CT_DELETE,
+	IPCTNL_MSG_CT_GET_CTRZERO,
+
+	IPCTNL_MSG_MAX
+};
+
+enum ctnl_exp_msg_types {
+	IPCTNL_MSG_EXP_NEW,
+	IPCTNL_MSG_EXP_GET,
+	IPCTNL_MSG_EXP_DELETE,
+
+	IPCTNL_MSG_EXP_MAX
+};
+
+
+enum ctattr_type {
+	CTA_UNSPEC,
+	CTA_TUPLE_ORIG,
+	CTA_TUPLE_REPLY,
+	CTA_STATUS,
+	CTA_PROTOINFO,
+	CTA_HELP,
+	CTA_NAT,
+	CTA_TIMEOUT,
+	CTA_MARK,
+	CTA_COUNTERS_ORIG,
+	CTA_COUNTERS_REPLY,
+	CTA_USE,
+	CTA_ID,
+	__CTA_MAX
+};
+#define CTA_MAX (__CTA_MAX - 1)
+
+enum ctattr_tuple {
+	CTA_TUPLE_UNSPEC,
+	CTA_TUPLE_IP,
+	CTA_TUPLE_PROTO,
+	__CTA_TUPLE_MAX
+};
+#define CTA_TUPLE_MAX (__CTA_TUPLE_MAX - 1)
+
+enum ctattr_ip {
+	CTA_IP_UNSPEC,
+	CTA_IP_V4_SRC,
+	CTA_IP_V4_DST,
+	CTA_IP_V6_SRC,
+	CTA_IP_V6_DST,
+	__CTA_IP_MAX
+};
+#define CTA_IP_MAX (__CTA_IP_MAX - 1)
+
+enum ctattr_l4proto {
+	CTA_PROTO_UNSPEC,
+	CTA_PROTO_NUM,
+	CTA_PROTO_SRC_PORT,
+	CTA_PROTO_DST_PORT,
+	CTA_PROTO_ICMP_ID,
+	CTA_PROTO_ICMP_TYPE,
+	CTA_PROTO_ICMP_CODE,
+	__CTA_PROTO_MAX
+};
+#define CTA_PROTO_MAX (__CTA_PROTO_MAX - 1)
+
+enum ctattr_protoinfo {
+	CTA_PROTOINFO_UNSPEC,
+	CTA_PROTOINFO_TCP_STATE,
+	__CTA_PROTOINFO_MAX
+};
+#define CTA_PROTOINFO_MAX (__CTA_PROTOINFO_MAX - 1)
+
+enum ctattr_counters {
+	CTA_COUNTERS_UNSPEC,
+	CTA_COUNTERS_PACKETS,
+	CTA_COUNTERS_BYTES,
+	__CTA_COUNTERS_MAX
+};
+#define CTA_COUNTERS_MAX (__CTA_COUNTERS_MAX - 1)
+
+enum ctattr_nat {
+	CTA_NAT_UNSPEC,
+	CTA_NAT_MINIP,
+	CTA_NAT_MAXIP,
+	CTA_NAT_PROTO,
+	__CTA_NAT_MAX
+};
+#define CTA_NAT_MAX (__CTA_NAT_MAX - 1)
+
+enum ctattr_protonat {
+	CTA_PROTONAT_UNSPEC,
+	CTA_PROTONAT_PORT_MIN,
+	CTA_PROTONAT_PORT_MAX,
+	__CTA_PROTONAT_MAX
+};
+#define CTA_PROTONAT_MAX (__CTA_PROTONAT_MAX - 1)
+
+enum ctattr_expect {
+	CTA_EXPECT_UNSPEC,
+	CTA_EXPECT_MASTER,
+	CTA_EXPECT_TUPLE,
+	CTA_EXPECT_MASK,
+	CTA_EXPECT_TIMEOUT,
+	CTA_EXPECT_ID,
+	CTA_EXPECT_HELP_NAME,
+	__CTA_EXPECT_MAX
+};
+#define CTA_EXPECT_MAX (__CTA_EXPECT_MAX - 1)
+
+enum ctattr_help {
+	CTA_HELP_UNSPEC,
+	CTA_HELP_NAME,
+	__CTA_HELP_MAX
+};
+#define CTA_HELP_MAX (__CTA_HELP_MAX - 1)
+
+#define CTA_HELP_MAXNAMESIZE	32
+
+#endif /* _IPCONNTRACK_NETLINK_H */
diff --git a/include/linux/netfilter/nfnetlink_log.h b/include/linux/netfilter/nfnetlink_log.h
new file mode 100644
index 00000000000..b04b0388059
--- /dev/null
+++ b/include/linux/netfilter/nfnetlink_log.h
@@ -0,0 +1,88 @@
+#ifndef _NFNETLINK_LOG_H
+#define _NFNETLINK_LOG_H
+
+/* This file describes the netlink messages (i.e. 'protocol packets'),
+ * and not any kind of function definitions.  It is shared between kernel and
+ * userspace.  Don't put kernel specific stuff in here */
+
+#include <linux/types.h>
+#include <linux/netfilter/nfnetlink.h>
+
+enum nfulnl_msg_types {
+	NFULNL_MSG_PACKET,		/* packet from kernel to userspace */
+	NFULNL_MSG_CONFIG,		/* connect to a particular queue */
+
+	NFULNL_MSG_MAX
+};
+
+struct nfulnl_msg_packet_hdr {
+	u_int16_t	hw_protocol;	/* hw protocol (network order) */
+	u_int8_t	hook;		/* netfilter hook */
+	u_int8_t	_pad;
+} __attribute__ ((packed));
+
+struct nfulnl_msg_packet_hw {
+	u_int16_t	hw_addrlen;
+	u_int16_t	_pad;
+	u_int8_t	hw_addr[8];
+} __attribute__ ((packed));
+
+struct nfulnl_msg_packet_timestamp {
+	aligned_u64	sec;
+	aligned_u64	usec;
+} __attribute__ ((packed));
+
+#define NFULNL_PREFIXLEN	30	/* just like old log target */
+
+enum nfulnl_attr_type {
+	NFULA_UNSPEC,
+	NFULA_PACKET_HDR,
+	NFULA_MARK,			/* u_int32_t nfmark */
+	NFULA_TIMESTAMP,		/* nfulnl_msg_packet_timestamp */
+	NFULA_IFINDEX_INDEV,		/* u_int32_t ifindex */
+	NFULA_IFINDEX_OUTDEV,		/* u_int32_t ifindex */
+	NFULA_IFINDEX_PHYSINDEV,	/* u_int32_t ifindex */
+	NFULA_IFINDEX_PHYSOUTDEV,	/* u_int32_t ifindex */
+	NFULA_HWADDR,			/* nfulnl_msg_packet_hw */
+	NFULA_PAYLOAD,			/* opaque data payload */
+	NFULA_PREFIX,			/* string prefix */
+	NFULA_UID,			/* user id of socket */
+
+	__NFULA_MAX
+};
+#define NFULA_MAX (__NFULA_MAX - 1)
+
+enum nfulnl_msg_config_cmds {
+	NFULNL_CFG_CMD_NONE,
+	NFULNL_CFG_CMD_BIND,
+	NFULNL_CFG_CMD_UNBIND,
+	NFULNL_CFG_CMD_PF_BIND,
+	NFULNL_CFG_CMD_PF_UNBIND,
+};
+
+struct nfulnl_msg_config_cmd {
+	u_int8_t	command;	/* nfulnl_msg_config_cmds */
+} __attribute__ ((packed));
+
+struct nfulnl_msg_config_mode {
+	u_int32_t	copy_range;
+	u_int8_t	copy_mode;
+	u_int8_t	_pad;
+} __attribute__ ((packed));
+
+enum nfulnl_attr_config {
+	NFULA_CFG_UNSPEC,
+	NFULA_CFG_CMD,			/* nfulnl_msg_config_cmd */
+	NFULA_CFG_MODE,			/* nfulnl_msg_config_mode */
+	NFULA_CFG_NLBUFSIZ,		/* u_int32_t buffer size */
+	NFULA_CFG_TIMEOUT,		/* u_int32_t in 1/100 s */
+	NFULA_CFG_QTHRESH,		/* u_int32_t */
+	__NFULA_CFG_MAX
+};
+#define NFULA_CFG_MAX (__NFULA_CFG_MAX -1)
+
+#define NFULNL_COPY_NONE	0x00
+#define NFULNL_COPY_META	0x01
+#define NFULNL_COPY_PACKET	0x02
+
+#endif /* _NFNETLINK_LOG_H */
diff --git a/include/linux/netfilter/nfnetlink_queue.h b/include/linux/netfilter/nfnetlink_queue.h
new file mode 100644
index 00000000000..9e774373244
--- /dev/null
+++ b/include/linux/netfilter/nfnetlink_queue.h
@@ -0,0 +1,89 @@
+#ifndef _NFNETLINK_QUEUE_H
+#define _NFNETLINK_QUEUE_H
+
+#include <linux/types.h>
+#include <linux/netfilter/nfnetlink.h>
+
+enum nfqnl_msg_types {
+	NFQNL_MSG_PACKET,		/* packet from kernel to userspace */
+	NFQNL_MSG_VERDICT,		/* verdict from userspace to kernel */
+	NFQNL_MSG_CONFIG,		/* connect to a particular queue */
+
+	NFQNL_MSG_MAX
+};
+
+struct nfqnl_msg_packet_hdr {
+	u_int32_t	packet_id;	/* unique ID of packet in queue */
+	u_int16_t	hw_protocol;	/* hw protocol (network order) */
+	u_int8_t	hook;		/* netfilter hook */
+} __attribute__ ((packed));
+
+struct nfqnl_msg_packet_hw {
+	u_int16_t	hw_addrlen;
+	u_int16_t	_pad;
+	u_int8_t	hw_addr[8];
+} __attribute__ ((packed));
+
+struct nfqnl_msg_packet_timestamp {
+	aligned_u64	sec;
+	aligned_u64	usec;
+} __attribute__ ((packed));
+
+enum nfqnl_attr_type {
+	NFQA_UNSPEC,
+	NFQA_PACKET_HDR,
+	NFQA_VERDICT_HDR,		/* nfqnl_msg_verdict_hrd */
+	NFQA_MARK,			/* u_int32_t nfmark */
+	NFQA_TIMESTAMP,			/* nfqnl_msg_packet_timestamp */
+	NFQA_IFINDEX_INDEV,		/* u_int32_t ifindex */
+	NFQA_IFINDEX_OUTDEV,		/* u_int32_t ifindex */
+	NFQA_IFINDEX_PHYSINDEV,		/* u_int32_t ifindex */
+	NFQA_IFINDEX_PHYSOUTDEV,	/* u_int32_t ifindex */
+	NFQA_HWADDR,			/* nfqnl_msg_packet_hw */
+	NFQA_PAYLOAD,			/* opaque data payload */
+
+	__NFQA_MAX
+};
+#define NFQA_MAX (__NFQA_MAX - 1)
+
+struct nfqnl_msg_verdict_hdr {
+	u_int32_t verdict;
+	u_int32_t id;
+} __attribute__ ((packed));
+
+
+enum nfqnl_msg_config_cmds {
+	NFQNL_CFG_CMD_NONE,
+	NFQNL_CFG_CMD_BIND,
+	NFQNL_CFG_CMD_UNBIND,
+	NFQNL_CFG_CMD_PF_BIND,
+	NFQNL_CFG_CMD_PF_UNBIND,
+};
+
+struct nfqnl_msg_config_cmd {
+	u_int8_t	command;	/* nfqnl_msg_config_cmds */
+	u_int8_t	_pad;
+	u_int16_t	pf;		/* AF_xxx for PF_[UN]BIND */
+} __attribute__ ((packed));
+
+enum nfqnl_config_mode {
+	NFQNL_COPY_NONE,
+	NFQNL_COPY_META,
+	NFQNL_COPY_PACKET,
+};
+
+struct nfqnl_msg_config_params {
+	u_int32_t	copy_range;
+	u_int8_t	copy_mode;	/* enum nfqnl_config_mode */
+} __attribute__ ((packed));
+
+
+enum nfqnl_attr_config {
+	NFQA_CFG_UNSPEC,
+	NFQA_CFG_CMD,			/* nfqnl_msg_config_cmd */
+	NFQA_CFG_PARAMS,		/* nfqnl_msg_config_params */
+	__NFQA_CFG_MAX
+};
+#define NFQA_CFG_MAX (__NFQA_CFG_MAX-1)
+
+#endif /* _NFNETLINK_QUEUE_H */
diff --git a/include/linux/netfilter_decnet.h b/include/linux/netfilter_decnet.h
index 3064eec9cb8..6f425369ee2 100644
--- a/include/linux/netfilter_decnet.h
+++ b/include/linux/netfilter_decnet.h
@@ -9,6 +9,8 @@
 
 #include <linux/netfilter.h>
 
+/* only for userspace compatibility */
+#ifndef __KERNEL__
 /* IP Cache bits. */
 /* Src IP address. */
 #define NFC_DN_SRC		0x0001
@@ -18,6 +20,7 @@
 #define NFC_DN_IF_IN		0x0004
 /* Output device. */
 #define NFC_DN_IF_OUT		0x0008
+#endif /* ! __KERNEL__ */
 
 /* DECnet Hooks */
 /* After promisc drops, checksum checks. */
@@ -53,7 +56,21 @@ struct nf_dn_rtmsg {
 
 #define NFDN_RTMSG(r) ((unsigned char *)(r) + NLMSG_ALIGN(sizeof(struct nf_dn_rtmsg)))
 
+#ifndef __KERNEL__
+/* backwards compatibility for userspace */
 #define DNRMG_L1_GROUP 0x01
 #define DNRMG_L2_GROUP 0x02
+#endif
+
+enum {
+	DNRNG_NLGRP_NONE,
+#define DNRNG_NLGRP_NONE	DNRNG_NLGRP_NONE
+	DNRNG_NLGRP_L1,
+#define DNRNG_NLGRP_L1		DNRNG_NLGRP_L1
+	DNRNG_NLGRP_L2,
+#define DNRNG_NLGRP_L2		DNRNG_NLGRP_L2
+	__DNRNG_NLGRP_MAX
+};
+#define DNRNG_NLGRP_MAX	(__DNRNG_NLGRP_MAX - 1)
 
 #endif /*__LINUX_DECNET_NETFILTER_H*/
diff --git a/include/linux/netfilter_ipv4.h b/include/linux/netfilter_ipv4.h
index 3ebc36afae1..fdc4a952734 100644
--- a/include/linux/netfilter_ipv4.h
+++ b/include/linux/netfilter_ipv4.h
@@ -8,6 +8,8 @@
 #include <linux/config.h>
 #include <linux/netfilter.h>
 
+/* only for userspace compatibility */
+#ifndef __KERNEL__
 /* IP Cache bits. */
 /* Src IP address. */
 #define NFC_IP_SRC		0x0001
@@ -35,6 +37,7 @@
 #define NFC_IP_DST_PT		0x0400
 /* Something else about the proto */
 #define NFC_IP_PROTO_UNKNOWN	0x2000
+#endif /* ! __KERNEL__ */
 
 /* IP Hooks */
 /* After promisc drops, checksum checks. */
@@ -77,11 +80,6 @@ enum nf_ip_hook_priorities {
 #ifdef __KERNEL__
 extern int ip_route_me_harder(struct sk_buff **pskb);
 
-/* Call this before modifying an existing IP packet: ensures it is
-   modifiable and linear to the point you care about (writable_len).
-   Returns true or false. */
-extern int skb_ip_make_writable(struct sk_buff **pskb,
-				unsigned int writable_len);
 #endif /*__KERNEL__*/
 
 #endif /*__LINUX_IP_NETFILTER_H*/
diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h
index 08fe5f7d14a..088742befe4 100644
--- a/include/linux/netfilter_ipv4/ip_conntrack.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack.h
@@ -65,6 +65,63 @@ enum ip_conntrack_status {
 
 	/* Both together */
 	IPS_NAT_DONE_MASK = (IPS_DST_NAT_DONE | IPS_SRC_NAT_DONE),
+
+	/* Connection is dying (removed from lists), can not be unset. */
+	IPS_DYING_BIT = 9,
+	IPS_DYING = (1 << IPS_DYING_BIT),
+};
+
+/* Connection tracking event bits */
+enum ip_conntrack_events
+{
+	/* New conntrack */
+	IPCT_NEW_BIT = 0,
+	IPCT_NEW = (1 << IPCT_NEW_BIT),
+
+	/* Expected connection */
+	IPCT_RELATED_BIT = 1,
+	IPCT_RELATED = (1 << IPCT_RELATED_BIT),
+
+	/* Destroyed conntrack */
+	IPCT_DESTROY_BIT = 2,
+	IPCT_DESTROY = (1 << IPCT_DESTROY_BIT),
+
+	/* Timer has been refreshed */
+	IPCT_REFRESH_BIT = 3,
+	IPCT_REFRESH = (1 << IPCT_REFRESH_BIT),
+
+	/* Status has changed */
+	IPCT_STATUS_BIT = 4,
+	IPCT_STATUS = (1 << IPCT_STATUS_BIT),
+
+	/* Update of protocol info */
+	IPCT_PROTOINFO_BIT = 5,
+	IPCT_PROTOINFO = (1 << IPCT_PROTOINFO_BIT),
+
+	/* Volatile protocol info */
+	IPCT_PROTOINFO_VOLATILE_BIT = 6,
+	IPCT_PROTOINFO_VOLATILE = (1 << IPCT_PROTOINFO_VOLATILE_BIT),
+
+	/* New helper for conntrack */
+	IPCT_HELPER_BIT = 7,
+	IPCT_HELPER = (1 << IPCT_HELPER_BIT),
+
+	/* Update of helper info */
+	IPCT_HELPINFO_BIT = 8,
+	IPCT_HELPINFO = (1 << IPCT_HELPINFO_BIT),
+
+	/* Volatile helper info */
+	IPCT_HELPINFO_VOLATILE_BIT = 9,
+	IPCT_HELPINFO_VOLATILE = (1 << IPCT_HELPINFO_VOLATILE_BIT),
+
+	/* NAT info */
+	IPCT_NATINFO_BIT = 10,
+	IPCT_NATINFO = (1 << IPCT_NATINFO_BIT),
+};
+
+enum ip_conntrack_expect_events {
+	IPEXP_NEW_BIT = 0,
+	IPEXP_NEW = (1 << IPEXP_NEW_BIT),
 };
 
 #ifdef __KERNEL__
@@ -152,6 +209,9 @@ struct ip_conntrack
 	/* Current number of expected connections */
 	unsigned int expecting;
 
+	/* Unique ID that identifies this conntrack*/
+	unsigned int id;
+
 	/* Helper, if any. */
 	struct ip_conntrack_helper *helper;
 
@@ -171,7 +231,7 @@ struct ip_conntrack
 #endif /* CONFIG_IP_NF_NAT_NEEDED */
 
 #if defined(CONFIG_IP_NF_CONNTRACK_MARK)
-	unsigned long mark;
+	u_int32_t mark;
 #endif
 
 	/* Traversed often, so hopefully in different cacheline to top */
@@ -200,6 +260,9 @@ struct ip_conntrack_expect
 	/* Usage count. */
 	atomic_t use;
 
+	/* Unique ID */
+	unsigned int id;
+
 #ifdef CONFIG_IP_NF_NAT_NEEDED
 	/* This is the original per-proto part, used to map the
 	 * expected connection the way the recipient expects. */
@@ -239,7 +302,12 @@ ip_conntrack_get(const struct sk_buff *skb, enum ip_conntrack_info *ctinfo)
 }
 
 /* decrement reference count on a conntrack */
-extern void ip_conntrack_put(struct ip_conntrack *ct);
+static inline void
+ip_conntrack_put(struct ip_conntrack *ct)
+{
+	IP_NF_ASSERT(ct);
+	nf_conntrack_put(&ct->ct_general);
+}
 
 /* call to create an explicit dependency on ip_conntrack. */
 extern void need_ip_conntrack(void);
@@ -274,12 +342,50 @@ extern void
 ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *data),
 		      void *data);
 
+extern struct ip_conntrack_helper *
+__ip_conntrack_helper_find_byname(const char *);
+extern struct ip_conntrack_helper *
+ip_conntrack_helper_find_get(const struct ip_conntrack_tuple *tuple);
+extern void ip_conntrack_helper_put(struct ip_conntrack_helper *helper);
+
+extern struct ip_conntrack_protocol *
+__ip_conntrack_proto_find(u_int8_t protocol);
+extern struct ip_conntrack_protocol *
+ip_conntrack_proto_find_get(u_int8_t protocol);
+extern void ip_conntrack_proto_put(struct ip_conntrack_protocol *proto);
+
+extern void ip_ct_remove_expectations(struct ip_conntrack *ct);
+
+extern struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *,
+					       struct ip_conntrack_tuple *);
+
+extern void ip_conntrack_free(struct ip_conntrack *ct);
+
+extern void ip_conntrack_hash_insert(struct ip_conntrack *ct);
+
+extern struct ip_conntrack_expect *
+__ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple);
+
+extern struct ip_conntrack_expect *
+ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple);
+
+extern struct ip_conntrack_tuple_hash *
+__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
+                    const struct ip_conntrack *ignored_conntrack);
+
+extern void ip_conntrack_flush(void);
+
 /* It's confirmed if it is, or has been in the hash table. */
 static inline int is_confirmed(struct ip_conntrack *ct)
 {
 	return test_bit(IPS_CONFIRMED_BIT, &ct->status);
 }
 
+static inline int is_dying(struct ip_conntrack *ct)
+{
+	return test_bit(IPS_DYING_BIT, &ct->status);
+}
+
 extern unsigned int ip_conntrack_htable_size;
  
 struct ip_conntrack_stat
@@ -303,6 +409,85 @@ struct ip_conntrack_stat
 
 #define CONNTRACK_STAT_INC(count) (__get_cpu_var(ip_conntrack_stat).count++)
 
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+#include <linux/notifier.h>
+#include <linux/interrupt.h>
+
+struct ip_conntrack_ecache {
+	struct ip_conntrack *ct;
+	unsigned int events;
+};
+DECLARE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
+
+#define CONNTRACK_ECACHE(x)	(__get_cpu_var(ip_conntrack_ecache).x)
+ 
+extern struct notifier_block *ip_conntrack_chain;
+extern struct notifier_block *ip_conntrack_expect_chain;
+
+static inline int ip_conntrack_register_notifier(struct notifier_block *nb)
+{
+	return notifier_chain_register(&ip_conntrack_chain, nb);
+}
+
+static inline int ip_conntrack_unregister_notifier(struct notifier_block *nb)
+{
+	return notifier_chain_unregister(&ip_conntrack_chain, nb);
+}
+
+static inline int 
+ip_conntrack_expect_register_notifier(struct notifier_block *nb)
+{
+	return notifier_chain_register(&ip_conntrack_expect_chain, nb);
+}
+
+static inline int
+ip_conntrack_expect_unregister_notifier(struct notifier_block *nb)
+{
+	return notifier_chain_unregister(&ip_conntrack_expect_chain, nb);
+}
+
+extern void ip_ct_deliver_cached_events(const struct ip_conntrack *ct);
+extern void __ip_ct_event_cache_init(struct ip_conntrack *ct);
+
+static inline void 
+ip_conntrack_event_cache(enum ip_conntrack_events event,
+			 const struct sk_buff *skb)
+{
+	struct ip_conntrack *ct = (struct ip_conntrack *)skb->nfct;
+	struct ip_conntrack_ecache *ecache;
+	
+	local_bh_disable();
+	ecache = &__get_cpu_var(ip_conntrack_ecache);
+	if (ct != ecache->ct)
+		__ip_ct_event_cache_init(ct);
+	ecache->events |= event;
+	local_bh_enable();
+}
+
+static inline void ip_conntrack_event(enum ip_conntrack_events event,
+				      struct ip_conntrack *ct)
+{
+	if (is_confirmed(ct) && !is_dying(ct))
+		notifier_call_chain(&ip_conntrack_chain, event, ct);
+}
+
+static inline void 
+ip_conntrack_expect_event(enum ip_conntrack_expect_events event,
+			  struct ip_conntrack_expect *exp)
+{
+	notifier_call_chain(&ip_conntrack_expect_chain, event, exp);
+}
+#else /* CONFIG_IP_NF_CONNTRACK_EVENTS */
+static inline void ip_conntrack_event_cache(enum ip_conntrack_events event, 
+					    const struct sk_buff *skb) {}
+static inline void ip_conntrack_event(enum ip_conntrack_events event, 
+				      struct ip_conntrack *ct) {}
+static inline void ip_ct_deliver_cached_events(const struct ip_conntrack *ct) {}
+static inline void 
+ip_conntrack_expect_event(enum ip_conntrack_expect_events event, 
+			  struct ip_conntrack_expect *exp) {}
+#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
+
 #ifdef CONFIG_IP_NF_NAT_NEEDED
 static inline int ip_nat_initialized(struct ip_conntrack *conntrack,
 				     enum ip_nat_manip_type manip)
diff --git a/include/linux/netfilter_ipv4/ip_conntrack_core.h b/include/linux/netfilter_ipv4/ip_conntrack_core.h
index 694aec9b478..dc4d2a0575d 100644
--- a/include/linux/netfilter_ipv4/ip_conntrack_core.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack_core.h
@@ -2,6 +2,9 @@
 #define _IP_CONNTRACK_CORE_H
 #include <linux/netfilter.h>
 
+#define MAX_IP_CT_PROTO 256
+extern struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
+
 /* This header is used to share core functionality between the
    standalone connection tracking module, and the compatibility layer's use
    of connection tracking. */
@@ -38,12 +41,19 @@ extern int __ip_conntrack_confirm(struct sk_buff **pskb);
 /* Confirm a connection: returns NF_DROP if packet must be dropped. */
 static inline int ip_conntrack_confirm(struct sk_buff **pskb)
 {
-	if ((*pskb)->nfct
-	    && !is_confirmed((struct ip_conntrack *)(*pskb)->nfct))
-		return __ip_conntrack_confirm(pskb);
-	return NF_ACCEPT;
+	struct ip_conntrack *ct = (struct ip_conntrack *)(*pskb)->nfct;
+	int ret = NF_ACCEPT;
+
+	if (ct) {
+		if (!is_confirmed(ct))
+			ret = __ip_conntrack_confirm(pskb);
+		ip_ct_deliver_cached_events(ct);
+	}
+	return ret;
 }
 
+extern void __ip_ct_expect_unlink_destroy(struct ip_conntrack_expect *exp);
+
 extern struct list_head *ip_conntrack_hash;
 extern struct list_head ip_conntrack_expect_list;
 extern rwlock_t ip_conntrack_lock;
diff --git a/include/linux/netfilter_ipv4/ip_conntrack_helper.h b/include/linux/netfilter_ipv4/ip_conntrack_helper.h
index 3692daa93de..8d69279ccfe 100644
--- a/include/linux/netfilter_ipv4/ip_conntrack_helper.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack_helper.h
@@ -24,6 +24,8 @@ struct ip_conntrack_helper
 	int (*help)(struct sk_buff **pskb,
 		    struct ip_conntrack *ct,
 		    enum ip_conntrack_info conntrackinfo);
+
+	int (*to_nfattr)(struct sk_buff *skb, const struct ip_conntrack *ct);
 };
 
 extern int ip_conntrack_helper_register(struct ip_conntrack_helper *);
diff --git a/include/linux/netfilter_ipv4/ip_conntrack_protocol.h b/include/linux/netfilter_ipv4/ip_conntrack_protocol.h
index e20b57c5e1b..b6b99be8632 100644
--- a/include/linux/netfilter_ipv4/ip_conntrack_protocol.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack_protocol.h
@@ -2,6 +2,7 @@
 #ifndef _IP_CONNTRACK_PROTOCOL_H
 #define _IP_CONNTRACK_PROTOCOL_H
 #include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
 
 struct seq_file;
 
@@ -47,22 +48,22 @@ struct ip_conntrack_protocol
 	int (*error)(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
 		     unsigned int hooknum);
 
+	/* convert protoinfo to nfnetink attributes */
+	int (*to_nfattr)(struct sk_buff *skb, struct nfattr *nfa,
+			 const struct ip_conntrack *ct);
+
+	int (*tuple_to_nfattr)(struct sk_buff *skb,
+			       const struct ip_conntrack_tuple *t);
+	int (*nfattr_to_tuple)(struct nfattr *tb[],
+			       struct ip_conntrack_tuple *t);
+
 	/* Module (if any) which this is connected to. */
 	struct module *me;
 };
 
-#define MAX_IP_CT_PROTO 256
-extern struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
-
 /* Protocol registration. */
 extern int ip_conntrack_protocol_register(struct ip_conntrack_protocol *proto);
 extern void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto);
-
-static inline struct ip_conntrack_protocol *ip_ct_find_proto(u_int8_t protocol)
-{
-	return ip_ct_protos[protocol];
-}
-
 /* Existing built-in protocols */
 extern struct ip_conntrack_protocol ip_conntrack_protocol_tcp;
 extern struct ip_conntrack_protocol ip_conntrack_protocol_udp;
@@ -73,6 +74,11 @@ extern int ip_conntrack_protocol_tcp_init(void);
 /* Log invalid packets */
 extern unsigned int ip_ct_log_invalid;
 
+extern int ip_ct_port_tuple_to_nfattr(struct sk_buff *,
+				      const struct ip_conntrack_tuple *);
+extern int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
+				      struct ip_conntrack_tuple *);
+
 #ifdef CONFIG_SYSCTL
 #ifdef DEBUG_INVALID_PACKETS
 #define LOG_INVALID(proto) \
diff --git a/include/linux/netfilter_ipv4/ip_logging.h b/include/linux/netfilter_ipv4/ip_logging.h
deleted file mode 100644
index 0c5c52cb658..00000000000
--- a/include/linux/netfilter_ipv4/ip_logging.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* IPv4 macros for the internal logging interface. */
-#ifndef __IP_LOGGING_H
-#define __IP_LOGGING_H
-
-#ifdef __KERNEL__
-#include <linux/socket.h>
-#include <linux/netfilter_logging.h>
-
-#define nf_log_ip_packet(pskb,hooknum,in,out,fmt,args...) \
-	nf_log_packet(AF_INET,pskb,hooknum,in,out,fmt,##args)
-
-#define nf_log_ip(pfh,len,fmt,args...) \
-	nf_log(AF_INET,pfh,len,fmt,##args)
-
-#define nf_ip_log_register(logging) nf_log_register(AF_INET,logging)
-#define nf_ip_log_unregister(logging) nf_log_unregister(AF_INET,logging)
-	
-#endif /*__KERNEL__*/
-
-#endif /*__IP_LOGGING_H*/
diff --git a/include/linux/netfilter_ipv4/ip_nat_protocol.h b/include/linux/netfilter_ipv4/ip_nat_protocol.h
index 129708c2238..ef63aa991a0 100644
--- a/include/linux/netfilter_ipv4/ip_nat_protocol.h
+++ b/include/linux/netfilter_ipv4/ip_nat_protocol.h
@@ -4,6 +4,9 @@
 #include <linux/init.h>
 #include <linux/list.h>
 
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+
 struct iphdr;
 struct ip_nat_range;
 
@@ -15,6 +18,8 @@ struct ip_nat_protocol
 	/* Protocol number. */
 	unsigned int protonum;
 
+	struct module *me;
+
 	/* Translate a packet to the target according to manip type.
 	   Return true if succeeded. */
 	int (*manip_pkt)(struct sk_buff **pskb,
@@ -43,19 +48,20 @@ struct ip_nat_protocol
 
 	unsigned int (*print_range)(char *buffer,
 				    const struct ip_nat_range *range);
-};
 
-#define MAX_IP_NAT_PROTO 256
-extern struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO];
+	int (*range_to_nfattr)(struct sk_buff *skb,
+			       const struct ip_nat_range *range);
+
+	int (*nfattr_to_range)(struct nfattr *tb[],
+			       struct ip_nat_range *range);
+};
 
 /* Protocol registration. */
 extern int ip_nat_protocol_register(struct ip_nat_protocol *proto);
 extern void ip_nat_protocol_unregister(struct ip_nat_protocol *proto);
 
-static inline struct ip_nat_protocol *ip_nat_find_proto(u_int8_t protocol)
-{
-	return ip_nat_protos[protocol];
-}
+extern struct ip_nat_protocol *ip_nat_proto_find_get(u_int8_t protocol);
+extern void ip_nat_proto_put(struct ip_nat_protocol *proto);
 
 /* Built-in protocols. */
 extern struct ip_nat_protocol ip_nat_protocol_tcp;
@@ -67,4 +73,9 @@ extern int init_protocols(void) __init;
 extern void cleanup_protocols(void);
 extern struct ip_nat_protocol *find_nat_proto(u_int16_t protonum);
 
+extern int ip_nat_port_range_to_nfattr(struct sk_buff *skb,
+				       const struct ip_nat_range *range);
+extern int ip_nat_port_nfattr_to_range(struct nfattr *tb[],
+				       struct ip_nat_range *range);
+
 #endif /*_IP_NAT_PROTO_H*/
diff --git a/include/linux/netfilter_ipv4/ip_tables.h b/include/linux/netfilter_ipv4/ip_tables.h
index 12ce47808e7..d19d65cf453 100644
--- a/include/linux/netfilter_ipv4/ip_tables.h
+++ b/include/linux/netfilter_ipv4/ip_tables.h
@@ -109,7 +109,8 @@ struct ipt_counters
 
 /* Values for "flag" field in struct ipt_ip (general ip structure). */
 #define IPT_F_FRAG		0x01	/* Set if rule is a fragment rule */
-#define IPT_F_MASK		0x01	/* All possible flag bits mask. */
+#define IPT_F_GOTO		0x02	/* Set if jump is a goto */
+#define IPT_F_MASK		0x03	/* All possible flag bits mask. */
 
 /* Values for "inv" field in struct ipt_ip. */
 #define IPT_INV_VIA_IN		0x01	/* Invert the sense of IN IFACE. */
diff --git a/include/linux/netfilter_ipv4/ipt_LOG.h b/include/linux/netfilter_ipv4/ipt_LOG.h
index d25f782e57d..22d16177319 100644
--- a/include/linux/netfilter_ipv4/ipt_LOG.h
+++ b/include/linux/netfilter_ipv4/ipt_LOG.h
@@ -1,6 +1,7 @@
 #ifndef _IPT_LOG_H
 #define _IPT_LOG_H
 
+/* make sure not to change this without changing netfilter.h:NF_LOG_* (!) */
 #define IPT_LOG_TCPSEQ		0x01	/* Log TCP sequence numbers */
 #define IPT_LOG_TCPOPT		0x02	/* Log TCP options */
 #define IPT_LOG_IPOPT		0x04	/* Log IP options */
diff --git a/include/linux/netfilter_ipv4/ipt_NFQUEUE.h b/include/linux/netfilter_ipv4/ipt_NFQUEUE.h
new file mode 100644
index 00000000000..b5b2943b0c6
--- /dev/null
+++ b/include/linux/netfilter_ipv4/ipt_NFQUEUE.h
@@ -0,0 +1,16 @@
+/* iptables module for using NFQUEUE mechanism
+ *
+ * (C) 2005 Harald Welte <laforge@netfilter.org>
+ *
+ * This software is distributed under GNU GPL v2, 1991
+ * 
+*/
+#ifndef _IPT_NFQ_TARGET_H
+#define _IPT_NFQ_TARGET_H
+
+/* target info */
+struct ipt_NFQ_info {
+	u_int16_t queuenum;
+};
+
+#endif /* _IPT_DSCP_TARGET_H */
diff --git a/include/linux/netfilter_ipv4/ipt_TTL.h b/include/linux/netfilter_ipv4/ipt_TTL.h
new file mode 100644
index 00000000000..ee6611edc11
--- /dev/null
+++ b/include/linux/netfilter_ipv4/ipt_TTL.h
@@ -0,0 +1,21 @@
+/* TTL modification module for IP tables
+ * (C) 2000 by Harald Welte <laforge@netfilter.org> */
+
+#ifndef _IPT_TTL_H
+#define _IPT_TTL_H
+
+enum {
+	IPT_TTL_SET = 0,
+	IPT_TTL_INC,
+	IPT_TTL_DEC
+};
+
+#define IPT_TTL_MAXMODE	IPT_TTL_DEC
+
+struct ipt_TTL_info {
+	u_int8_t	mode;
+	u_int8_t	ttl;
+};
+
+
+#endif
diff --git a/include/linux/netfilter_ipv4/ipt_connbytes.h b/include/linux/netfilter_ipv4/ipt_connbytes.h
new file mode 100644
index 00000000000..9e5532f8d8a
--- /dev/null
+++ b/include/linux/netfilter_ipv4/ipt_connbytes.h
@@ -0,0 +1,25 @@
+#ifndef _IPT_CONNBYTES_H
+#define _IPT_CONNBYTES_H
+
+enum ipt_connbytes_what {
+	IPT_CONNBYTES_PKTS,
+	IPT_CONNBYTES_BYTES,
+	IPT_CONNBYTES_AVGPKT,
+};
+
+enum ipt_connbytes_direction {
+	IPT_CONNBYTES_DIR_ORIGINAL,
+	IPT_CONNBYTES_DIR_REPLY,
+	IPT_CONNBYTES_DIR_BOTH,
+};
+
+struct ipt_connbytes_info
+{
+	struct {
+		aligned_u64 from;	/* count to be matched */
+		aligned_u64 to;		/* count to be matched */
+	} count;
+	u_int8_t what;		/* ipt_connbytes_what */
+	u_int8_t direction;	/* ipt_connbytes_direction */
+};
+#endif
diff --git a/include/linux/netfilter_ipv4/ipt_dccp.h b/include/linux/netfilter_ipv4/ipt_dccp.h
new file mode 100644
index 00000000000..3cb3a522e62
--- /dev/null
+++ b/include/linux/netfilter_ipv4/ipt_dccp.h
@@ -0,0 +1,23 @@
+#ifndef _IPT_DCCP_H_
+#define _IPT_DCCP_H_
+
+#define IPT_DCCP_SRC_PORTS	        0x01
+#define IPT_DCCP_DEST_PORTS	        0x02
+#define IPT_DCCP_TYPE			0x04
+#define IPT_DCCP_OPTION			0x08
+
+#define IPT_DCCP_VALID_FLAGS		0x0f
+
+struct ipt_dccp_info {
+	u_int16_t dpts[2];  /* Min, Max */
+	u_int16_t spts[2];  /* Min, Max */
+
+	u_int16_t flags;
+	u_int16_t invflags;
+
+	u_int16_t typemask;
+	u_int8_t option;
+};
+
+#endif /* _IPT_DCCP_H_ */
+
diff --git a/include/linux/netfilter_ipv4/ipt_string.h b/include/linux/netfilter_ipv4/ipt_string.h
new file mode 100644
index 00000000000..a265f6e44ea
--- /dev/null
+++ b/include/linux/netfilter_ipv4/ipt_string.h
@@ -0,0 +1,18 @@
+#ifndef _IPT_STRING_H
+#define _IPT_STRING_H
+
+#define IPT_STRING_MAX_PATTERN_SIZE 128
+#define IPT_STRING_MAX_ALGO_NAME_SIZE 16
+
+struct ipt_string_info
+{
+	u_int16_t from_offset;
+	u_int16_t to_offset;
+	char	  algo[IPT_STRING_MAX_ALGO_NAME_SIZE];
+	char 	  pattern[IPT_STRING_MAX_PATTERN_SIZE];
+	u_int8_t  patlen;
+	u_int8_t  invert;
+	struct ts_config __attribute__((aligned(8))) *config;
+};
+
+#endif /*_IPT_STRING_H*/
diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h
index bee7a5ec7c6..edcc2c6eb5c 100644
--- a/include/linux/netfilter_ipv6.h
+++ b/include/linux/netfilter_ipv6.h
@@ -10,6 +10,8 @@
 
 #include <linux/netfilter.h>
 
+/* only for userspace compatibility */
+#ifndef __KERNEL__
 /* IP Cache bits. */
 /* Src IP address. */
 #define NFC_IP6_SRC              0x0001
@@ -38,6 +40,7 @@
 #define NFC_IP6_DST_PT           0x0400
 /* Something else about the proto */
 #define NFC_IP6_PROTO_UNKNOWN    0x2000
+#endif /* ! __KERNEL__ */
 
 
 /* IP6 Hooks */
@@ -68,4 +71,7 @@ enum nf_ip6_hook_priorities {
 	NF_IP6_PRI_LAST = INT_MAX,
 };
 
+extern int ipv6_netfilter_init(void);
+extern void ipv6_netfilter_fini(void);
+
 #endif /*__LINUX_IP6_NETFILTER_H*/
diff --git a/include/linux/netfilter_ipv6/ip6_logging.h b/include/linux/netfilter_ipv6/ip6_logging.h
deleted file mode 100644
index a0b2ee3043a..00000000000
--- a/include/linux/netfilter_ipv6/ip6_logging.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* IPv6 macros for the nternal logging interface. */
-#ifndef __IP6_LOGGING_H
-#define __IP6_LOGGING_H
-
-#ifdef __KERNEL__
-#include <linux/socket.h>
-#include <linux/netfilter_logging.h>
-
-#define nf_log_ip6_packet(pskb,hooknum,in,out,fmt,args...) \
-	nf_log_packet(AF_INET6,pskb,hooknum,in,out,fmt,##args)
-
-#define nf_log_ip6(pfh,len,fmt,args...) \
-	nf_log(AF_INET6,pfh,len,fmt,##args)
-
-#define nf_ip6_log_register(logging) nf_log_register(AF_INET6,logging)
-#define nf_ip6_log_unregister(logging) nf_log_unregister(AF_INET6,logging)
-	
-#endif /*__KERNEL__*/
-
-#endif /*__IP6_LOGGING_H*/
diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h
index f1ce3b00985..58c72a52dc6 100644
--- a/include/linux/netfilter_ipv6/ip6_tables.h
+++ b/include/linux/netfilter_ipv6/ip6_tables.h
@@ -111,7 +111,8 @@ struct ip6t_counters
 #define IP6T_F_PROTO		0x01	/* Set if rule cares about upper 
 					   protocols */
 #define IP6T_F_TOS		0x02	/* Match the TOS. */
-#define IP6T_F_MASK		0x03	/* All possible flag bits mask. */
+#define IP6T_F_GOTO		0x04	/* Set if jump is a goto */
+#define IP6T_F_MASK		0x07	/* All possible flag bits mask. */
 
 /* Values for "inv" field in struct ip6t_ip6. */
 #define IP6T_INV_VIA_IN		0x01	/* Invert the sense of IN IFACE. */
diff --git a/include/linux/netfilter_ipv6/ip6t_HL.h b/include/linux/netfilter_ipv6/ip6t_HL.h
new file mode 100644
index 00000000000..afb7813d45a
--- /dev/null
+++ b/include/linux/netfilter_ipv6/ip6t_HL.h
@@ -0,0 +1,22 @@
+/* Hop Limit modification module for ip6tables
+ * Maciej Soltysiak <solt@dns.toxicfilms.tv>
+ * Based on HW's TTL module */
+
+#ifndef _IP6T_HL_H
+#define _IP6T_HL_H
+
+enum {
+	IP6T_HL_SET = 0,
+	IP6T_HL_INC,
+	IP6T_HL_DEC
+};
+
+#define IP6T_HL_MAXMODE	IP6T_HL_DEC
+
+struct ip6t_HL_info {
+	u_int8_t	mode;
+	u_int8_t	hop_limit;
+};
+
+
+#endif
diff --git a/include/linux/netfilter_ipv6/ip6t_LOG.h b/include/linux/netfilter_ipv6/ip6t_LOG.h
index 42996a43bb3..9008ff5c40a 100644
--- a/include/linux/netfilter_ipv6/ip6t_LOG.h
+++ b/include/linux/netfilter_ipv6/ip6t_LOG.h
@@ -1,6 +1,7 @@
 #ifndef _IP6T_LOG_H
 #define _IP6T_LOG_H
 
+/* make sure not to change this without changing netfilter.h:NF_LOG_* (!) */
 #define IP6T_LOG_TCPSEQ		0x01	/* Log TCP sequence numbers */
 #define IP6T_LOG_TCPOPT		0x02	/* Log TCP options */
 #define IP6T_LOG_IPOPT		0x04	/* Log IP options */
diff --git a/include/linux/netfilter_ipv6/ip6t_REJECT.h b/include/linux/netfilter_ipv6/ip6t_REJECT.h
new file mode 100644
index 00000000000..6be6504162b
--- /dev/null
+++ b/include/linux/netfilter_ipv6/ip6t_REJECT.h
@@ -0,0 +1,18 @@
+#ifndef _IP6T_REJECT_H
+#define _IP6T_REJECT_H
+
+enum ip6t_reject_with {
+	IP6T_ICMP6_NO_ROUTE,
+	IP6T_ICMP6_ADM_PROHIBITED,
+	IP6T_ICMP6_NOT_NEIGHBOUR,
+	IP6T_ICMP6_ADDR_UNREACH,
+	IP6T_ICMP6_PORT_UNREACH,
+	IP6T_ICMP6_ECHOREPLY,
+	IP6T_TCP_RESET
+};
+
+struct ip6t_reject_info {
+	u_int32_t	with;	/* reject type */
+};
+
+#endif /*_IP6T_REJECT_H*/
diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 6552b71bfa7..16751866893 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -8,7 +8,7 @@
 #define NETLINK_W1		1	/* 1-wire subsystem				*/
 #define NETLINK_USERSOCK	2	/* Reserved for user mode socket protocols 	*/
 #define NETLINK_FIREWALL	3	/* Firewalling hook				*/
-#define NETLINK_TCPDIAG		4	/* TCP socket monitoring			*/
+#define NETLINK_INET_DIAG	4	/* INET socket monitoring			*/
 #define NETLINK_NFLOG		5	/* netfilter/iptables ULOG */
 #define NETLINK_XFRM		6	/* ipsec */
 #define NETLINK_SELINUX		7	/* SELinux event notifications */
@@ -90,6 +90,15 @@ struct nlmsgerr
 	struct nlmsghdr msg;
 };
 
+#define NETLINK_ADD_MEMBERSHIP	1
+#define NETLINK_DROP_MEMBERSHIP	2
+#define NETLINK_PKTINFO		3
+
+struct nl_pktinfo
+{
+	__u32	group;
+};
+
 #define NET_MAJOR 36		/* Major 36 is reserved for networking 						*/
 
 enum {
@@ -106,9 +115,8 @@ struct netlink_skb_parms
 {
 	struct ucred		creds;		/* Skb credentials	*/
 	__u32			pid;
-	__u32			groups;
 	__u32			dst_pid;
-	__u32			dst_groups;
+	__u32			dst_group;
 	kernel_cap_t		eff_cap;
 	__u32			loginuid;	/* Login (audit) uid */
 };
@@ -117,11 +125,11 @@ struct netlink_skb_parms
 #define NETLINK_CREDS(skb)	(&NETLINK_CB((skb)).creds)
 
 
-extern struct sock *netlink_kernel_create(int unit, void (*input)(struct sock *sk, int len));
+extern struct sock *netlink_kernel_create(int unit, unsigned int groups, void (*input)(struct sock *sk, int len), struct module *module);
 extern void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err);
 extern int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 pid, int nonblock);
 extern int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 pid,
-			     __u32 group, int allocation);
+			     __u32 group, unsigned int __nocast allocation);
 extern void netlink_set_err(struct sock *ssk, __u32 pid, __u32 group, int code);
 extern int netlink_register_notifier(struct notifier_block *nb);
 extern int netlink_unregister_notifier(struct notifier_block *nb);
diff --git a/include/linux/random.h b/include/linux/random.h
index cc670344991..7b2adb3322d 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -59,6 +59,8 @@ extern __u32 secure_tcp_sequence_number(__u32 saddr, __u32 daddr,
 					__u16 sport, __u16 dport);
 extern __u32 secure_tcpv6_sequence_number(__u32 *saddr, __u32 *daddr,
 					  __u16 sport, __u16 dport);
+extern u64 secure_dccp_sequence_number(__u32 saddr, __u32 daddr,
+				       __u16 sport, __u16 dport);
 
 #ifndef MODULE
 extern struct file_operations random_fops, urandom_fops;
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 657c05ab8f9..c231e9a08f0 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -826,9 +826,8 @@ enum
 #define TCA_RTA(r)  ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcmsg))))
 #define TCA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcmsg))
 
-
-/* RTnetlink multicast groups */
-
+#ifndef __KERNEL__
+/* RTnetlink multicast groups - backwards compatibility for userspace */
 #define RTMGRP_LINK		1
 #define RTMGRP_NOTIFY		2
 #define RTMGRP_NEIGH		4
@@ -847,6 +846,43 @@ enum
 #define RTMGRP_DECnet_ROUTE     0x4000
 
 #define RTMGRP_IPV6_PREFIX	0x20000
+#endif
+
+/* RTnetlink multicast groups */
+enum rtnetlink_groups {
+	RTNLGRP_NONE,
+#define RTNLGRP_NONE		RTNLGRP_NONE
+	RTNLGRP_LINK,
+#define RTNLGRP_LINK		RTNLGRP_LINK
+	RTNLGRP_NOTIFY,
+#define RTNLGRP_NOTIFY		RTNLGRP_NOTIFY
+	RTNLGRP_NEIGH,
+#define RTNLGRP_NEIGH		RTNLGRP_NEIGH
+	RTNLGRP_TC,
+#define RTNLGRP_TC		RTNLGRP_TC
+	RTNLGRP_IPV4_IFADDR,
+#define RTNLGRP_IPV4_IFADDR	RTNLGRP_IPV4_IFADDR
+	RTNLGRP_IPV4_MROUTE,
+#define	RTNLGRP_IPV4_MROUTE	RTNLGRP_IPV4_MROUTE
+	RTNLGRP_IPV4_ROUTE,
+#define RTNLGRP_IPV4_ROUTE	RTNLGRP_IPV4_ROUTE
+	RTNLGRP_IPV6_IFADDR,
+#define RTNLGRP_IPV6_IFADDR	RTNLGRP_IPV6_IFADDR
+	RTNLGRP_IPV6_MROUTE,
+#define RTNLGRP_IPV6_MROUTE	RTNLGRP_IPV6_MROUTE
+	RTNLGRP_IPV6_ROUTE,
+#define RTNLGRP_IPV6_ROUTE	RTNLGRP_IPV6_ROUTE
+	RTNLGRP_IPV6_IFINFO,
+#define RTNLGRP_IPV6_IFINFO	RTNLGRP_IPV6_IFINFO
+	RTNLGRP_DECnet_IFADDR,
+#define RTNLGRP_DECnet_IFADDR	RTNLGRP_DECnet_IFADDR
+	RTNLGRP_DECnet_ROUTE,
+#define RTNLGRP_DECnet_ROUTE	RTNLGRP_DECnet_ROUTE
+	RTNLGRP_IPV6_PREFIX,
+#define RTNLGRP_IPV6_PREFIX	RTNLGRP_IPV6_PREFIX
+	__RTNLGRP_MAX
+};
+#define RTNLGRP_MAX	(__RTNLGRP_MAX - 1)
 
 /* TC action piece */
 struct tcamsg
diff --git a/include/linux/security.h b/include/linux/security.h
index b42095a68b1..7aab6ab7c57 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -2727,7 +2727,8 @@ static inline int security_socket_getpeersec(struct socket *sock, char __user *o
 	return security_ops->socket_getpeersec(sock, optval, optlen, len);
 }
 
-static inline int security_sk_alloc(struct sock *sk, int family, int priority)
+static inline int security_sk_alloc(struct sock *sk, int family,
+				    unsigned int __nocast priority)
 {
 	return security_ops->sk_alloc_security(sk, family, priority);
 }
@@ -2844,7 +2845,8 @@ static inline int security_socket_getpeersec(struct socket *sock, char __user *o
 	return -ENOPROTOOPT;
 }
 
-static inline int security_sk_alloc(struct sock *sk, int family, int priority)
+static inline int security_sk_alloc(struct sock *sk, int family,
+				    unsigned int __nocast priority)
 {
 	return 0;
 }
diff --git a/include/linux/selinux_netlink.h b/include/linux/selinux_netlink.h
index 957e6ebca4e..bbf489decd8 100644
--- a/include/linux/selinux_netlink.h
+++ b/include/linux/selinux_netlink.h
@@ -20,10 +20,21 @@ enum {
 	SELNL_MSG_MAX
 };
 
-/* Multicast groups */
+#ifndef __KERNEL__
+/* Multicast groups - backwards compatiblility for userspace */
 #define SELNL_GRP_NONE		0x00000000
 #define SELNL_GRP_AVC		0x00000001	/* AVC notifications */
 #define SELNL_GRP_ALL		0xffffffff
+#endif
+
+enum selinux_nlgroups {
+	SELNLGRP_NONE,
+#define SELNLGRP_NONE	SELNLGRP_NONE
+	SELNLGRP_AVC,
+#define SELNLGRP_AVC	SELNLGRP_AVC
+	__SELNLGRP_MAX
+};
+#define SELNLGRP_MAX	(__SELNLGRP_MAX - 1)
 
 /* Message structures */
 struct selnl_msg_setenforce {
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 948527e42a6..42edce6abe2 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -155,16 +155,29 @@ struct skb_shared_info {
 #define SKB_DATAREF_SHIFT 16
 #define SKB_DATAREF_MASK ((1 << SKB_DATAREF_SHIFT) - 1)
 
+extern struct timeval skb_tv_base;
+
+struct skb_timeval {
+	u32	off_sec;
+	u32	off_usec;
+};
+
+
+enum {
+	SKB_FCLONE_UNAVAILABLE,
+	SKB_FCLONE_ORIG,
+	SKB_FCLONE_CLONE,
+};
+
 /** 
  *	struct sk_buff - socket buffer
  *	@next: Next buffer in list
  *	@prev: Previous buffer in list
  *	@list: List we are on
  *	@sk: Socket we are owned by
- *	@stamp: Time we arrived
+ *	@tstamp: Time we arrived stored as offset to skb_tv_base
  *	@dev: Device we arrived on/are leaving by
  *	@input_dev: Device we arrived on
- *      @real_dev: The real device we are using
  *	@h: Transport layer header
  *	@nh: Network layer header
  *	@mac: Link layer header
@@ -190,14 +203,11 @@ struct skb_shared_info {
  *	@end: End pointer
  *	@destructor: Destruct function
  *	@nfmark: Can be used for communication between hooks
- *	@nfcache: Cache info
  *	@nfct: Associated connection, if any
  *	@nfctinfo: Relationship of this skb to the connection
  *	@nf_bridge: Saved data about a bridged frame - see br_netfilter.c
- *      @private: Data which is private to the HIPPI implementation
  *	@tc_index: Traffic control index
  *	@tc_verd: traffic control verdict
- *	@tc_classid: traffic control classid
  */
 
 struct sk_buff {
@@ -205,12 +215,10 @@ struct sk_buff {
 	struct sk_buff		*next;
 	struct sk_buff		*prev;
 
-	struct sk_buff_head	*list;
 	struct sock		*sk;
-	struct timeval		stamp;
+	struct skb_timeval	tstamp;
 	struct net_device	*dev;
 	struct net_device	*input_dev;
-	struct net_device	*real_dev;
 
 	union {
 		struct tcphdr	*th;
@@ -252,33 +260,28 @@ struct sk_buff {
 	__u8			local_df:1,
 				cloned:1,
 				ip_summed:2,
-				nohdr:1;
-				/* 3 bits spare */
-	__u8			pkt_type;
+				nohdr:1,
+				nfctinfo:3;
+	__u8			pkt_type:3,
+				fclone:2;
 	__be16			protocol;
 
 	void			(*destructor)(struct sk_buff *skb);
 #ifdef CONFIG_NETFILTER
-	unsigned long		nfmark;
-	__u32			nfcache;
-	__u32			nfctinfo;
+	__u32			nfmark;
 	struct nf_conntrack	*nfct;
+#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
+	__u8			ipvs_property:1;
+#endif
 #ifdef CONFIG_BRIDGE_NETFILTER
 	struct nf_bridge_info	*nf_bridge;
 #endif
 #endif /* CONFIG_NETFILTER */
-#if defined(CONFIG_HIPPI)
-	union {
-		__u32		ifield;
-	} private;
-#endif
 #ifdef CONFIG_NET_SCHED
-       __u32			tc_index;        /* traffic control index */
+	__u16			tc_index;	/* traffic control index */
 #ifdef CONFIG_NET_CLS_ACT
-	__u32           tc_verd;               /* traffic control verdict */
-	__u32           tc_classid;            /* traffic control classid */
+	__u16			tc_verd;	/* traffic control verdict */
 #endif
-
 #endif
 
 
@@ -300,8 +303,20 @@ struct sk_buff {
 #include <asm/system.h>
 
 extern void	       __kfree_skb(struct sk_buff *skb);
-extern struct sk_buff *alloc_skb(unsigned int size,
-				 unsigned int __nocast priority);
+extern struct sk_buff *__alloc_skb(unsigned int size,
+				   unsigned int __nocast priority, int fclone);
+static inline struct sk_buff *alloc_skb(unsigned int size,
+					unsigned int __nocast priority)
+{
+	return __alloc_skb(size, priority, 0);
+}
+
+static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
+					       unsigned int __nocast priority)
+{
+	return __alloc_skb(size, priority, 1);
+}
+
 extern struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
 					    unsigned int size,
 					    unsigned int __nocast priority);
@@ -597,7 +612,6 @@ static inline void __skb_queue_head(struct sk_buff_head *list,
 {
 	struct sk_buff *prev, *next;
 
-	newsk->list = list;
 	list->qlen++;
 	prev = (struct sk_buff *)list;
 	next = prev->next;
@@ -622,7 +636,6 @@ static inline void __skb_queue_tail(struct sk_buff_head *list,
 {
 	struct sk_buff *prev, *next;
 
-	newsk->list = list;
 	list->qlen++;
 	next = (struct sk_buff *)list;
 	prev = next->prev;
@@ -655,7 +668,6 @@ static inline struct sk_buff *__skb_dequeue(struct sk_buff_head *list)
 		next->prev   = prev;
 		prev->next   = next;
 		result->next = result->prev = NULL;
-		result->list = NULL;
 	}
 	return result;
 }
@@ -664,7 +676,7 @@ static inline struct sk_buff *__skb_dequeue(struct sk_buff_head *list)
 /*
  *	Insert a packet on a list.
  */
-extern void        skb_insert(struct sk_buff *old, struct sk_buff *newsk);
+extern void        skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list);
 static inline void __skb_insert(struct sk_buff *newsk,
 				struct sk_buff *prev, struct sk_buff *next,
 				struct sk_buff_head *list)
@@ -672,24 +684,23 @@ static inline void __skb_insert(struct sk_buff *newsk,
 	newsk->next = next;
 	newsk->prev = prev;
 	next->prev  = prev->next = newsk;
-	newsk->list = list;
 	list->qlen++;
 }
 
 /*
  *	Place a packet after a given packet in a list.
  */
-extern void	   skb_append(struct sk_buff *old, struct sk_buff *newsk);
-static inline void __skb_append(struct sk_buff *old, struct sk_buff *newsk)
+extern void	   skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list);
+static inline void __skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
 {
-	__skb_insert(newsk, old, old->next, old->list);
+	__skb_insert(newsk, old, old->next, list);
 }
 
 /*
  * remove sk_buff from list. _Must_ be called atomically, and with
  * the list known..
  */
-extern void	   skb_unlink(struct sk_buff *skb);
+extern void	   skb_unlink(struct sk_buff *skb, struct sk_buff_head *list);
 static inline void __skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
 {
 	struct sk_buff *next, *prev;
@@ -698,7 +709,6 @@ static inline void __skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
 	next	   = skb->next;
 	prev	   = skb->prev;
 	skb->next  = skb->prev = NULL;
-	skb->list  = NULL;
 	next->prev = prev;
 	prev->next = next;
 }
@@ -1213,6 +1223,8 @@ extern void	       skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to);
 extern void	       skb_split(struct sk_buff *skb,
 				 struct sk_buff *skb1, const u32 len);
 
+extern void	       skb_release_data(struct sk_buff *skb);
+
 static inline void *skb_header_pointer(const struct sk_buff *skb, int offset,
 				       int len, void *buffer)
 {
@@ -1230,6 +1242,42 @@ static inline void *skb_header_pointer(const struct sk_buff *skb, int offset,
 extern void skb_init(void);
 extern void skb_add_mtu(int mtu);
 
+/**
+ *	skb_get_timestamp - get timestamp from a skb
+ *	@skb: skb to get stamp from
+ *	@stamp: pointer to struct timeval to store stamp in
+ *
+ *	Timestamps are stored in the skb as offsets to a base timestamp.
+ *	This function converts the offset back to a struct timeval and stores
+ *	it in stamp.
+ */
+static inline void skb_get_timestamp(struct sk_buff *skb, struct timeval *stamp)
+{
+	stamp->tv_sec  = skb->tstamp.off_sec;
+	stamp->tv_usec = skb->tstamp.off_usec;
+	if (skb->tstamp.off_sec) {
+		stamp->tv_sec  += skb_tv_base.tv_sec;
+		stamp->tv_usec += skb_tv_base.tv_usec;
+	}
+}
+
+/**
+ * 	skb_set_timestamp - set timestamp of a skb
+ *	@skb: skb to set stamp of
+ *	@stamp: pointer to struct timeval to get stamp from
+ *
+ *	Timestamps are stored in the skb as offsets to a base timestamp.
+ *	This function converts a struct timeval to an offset and stores
+ *	it in the skb.
+ */
+static inline void skb_set_timestamp(struct sk_buff *skb, struct timeval *stamp)
+{
+	skb->tstamp.off_sec  = stamp->tv_sec - skb_tv_base.tv_sec;
+	skb->tstamp.off_usec = stamp->tv_usec - skb_tv_base.tv_usec;
+}
+
+extern void __net_timestamp(struct sk_buff *skb);
+
 #ifdef CONFIG_NETFILTER
 static inline void nf_conntrack_put(struct nf_conntrack *nfct)
 {
diff --git a/include/linux/socket.h b/include/linux/socket.h
index a5c7d96e4d2..1739c2d5b95 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -26,6 +26,13 @@ struct __kernel_sockaddr_storage {
 #include <linux/types.h>		/* pid_t			*/
 #include <linux/compiler.h>		/* __user			*/
 
+extern int sysctl_somaxconn;
+extern void sock_init(void);
+#ifdef CONFIG_PROC_FS
+struct seq_file;
+extern void socket_seq_show(struct seq_file *seq);
+#endif
+
 typedef unsigned short	sa_family_t;
 
 /*
@@ -271,6 +278,8 @@ struct ucred {
 #define SOL_IRDA        266
 #define SOL_NETBEUI	267
 #define SOL_LLC		268
+#define SOL_DCCP	269
+#define SOL_NETLINK	270
 
 /* IPX options */
 #define IPX_TYPE	1
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index e4fd82e4210..ac4ca44c75c 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -55,24 +55,6 @@ struct tcphdr {
 	__u16	urg_ptr;
 };
 
-
-enum {
-  TCP_ESTABLISHED = 1,
-  TCP_SYN_SENT,
-  TCP_SYN_RECV,
-  TCP_FIN_WAIT1,
-  TCP_FIN_WAIT2,
-  TCP_TIME_WAIT,
-  TCP_CLOSE,
-  TCP_CLOSE_WAIT,
-  TCP_LAST_ACK,
-  TCP_LISTEN,
-  TCP_CLOSING,	 /* now a valid state */
-
-  TCP_MAX_STATES /* Leave at the end! */
-};
-
-#define TCP_STATE_MASK	0xF
 #define TCP_ACTION_FIN	(1 << 7)
 
 enum {
@@ -195,8 +177,9 @@ struct tcp_info
 
 #include <linux/config.h>
 #include <linux/skbuff.h>
-#include <linux/ip.h>
 #include <net/sock.h>
+#include <net/inet_connection_sock.h>
+#include <net/inet_timewait_sock.h>
 
 /* This defines a selective acknowledgement block. */
 struct tcp_sack_block {
@@ -236,8 +219,8 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
 }
 
 struct tcp_sock {
-	/* inet_sock has to be the first member of tcp_sock */
-	struct inet_sock	inet;
+	/* inet_connection_sock has to be the first member of tcp_sock */
+	struct inet_connection_sock	inet_conn;
 	int	tcp_header_len;	/* Bytes of tcp header to send		*/
 
 /*
@@ -258,19 +241,6 @@ struct tcp_sock {
  	__u32	snd_sml;	/* Last byte of the most recently transmitted small packet */
 	__u32	rcv_tstamp;	/* timestamp of last received ACK (for keepalives) */
 	__u32	lsndtime;	/* timestamp of last sent data packet (for restart window) */
-	struct tcp_bind_bucket *bind_hash;
-	/* Delayed ACK control data */
-	struct {
-		__u8	pending;	/* ACK is pending */
-		__u8	quick;		/* Scheduled number of quick acks	*/
-		__u8	pingpong;	/* The session is interactive		*/
-		__u8	blocked;	/* Delayed ACK was blocked by socket lock*/
-		__u32	ato;		/* Predicted tick of soft clock		*/
-		unsigned long timeout;	/* Currently scheduled timeout		*/
-		__u32	lrcvtime;	/* timestamp of last received data packet*/
-		__u16	last_seg_size;	/* Size of last incoming segment	*/
-		__u16	rcv_mss;	/* MSS used for delayed ACK decisions	*/ 
-	} ack;
 
 	/* Data for direct copy to user */
 	struct {
@@ -288,19 +258,15 @@ struct tcp_sock {
 	__u32	mss_cache;	/* Cached effective mss, not including SACKS */
 	__u16	xmit_size_goal;	/* Goal for segmenting output packets	*/
 	__u16	ext_header_len;	/* Network protocol overhead (IP/IPv6 options) */
-	__u8	ca_state;	/* State of fast-retransmit machine 	*/
-	__u8	retransmits;	/* Number of unrecovered RTO timeouts.	*/
 
-	__u16	advmss;		/* Advertised MSS			*/
 	__u32	window_clamp;	/* Maximal window to advertise		*/
 	__u32	rcv_ssthresh;	/* Current window clamp			*/
 
 	__u32	frto_highmark;	/* snd_nxt when RTO occurred */
 	__u8	reordering;	/* Packet reordering metric.		*/
 	__u8	frto_counter;	/* Number of new acks after RTO */
-
-	__u8	unused;
-	__u8	defer_accept;	/* User waits for some data after accept() */
+	__u8	nonagle;	/* Disable Nagle algorithm?             */
+	__u8	keepalive_probes; /* num of allowed keep alive probes	*/
 
 /* RTT measurement */
 	__u32	srtt;		/* smoothed round trip time << 3	*/
@@ -308,19 +274,13 @@ struct tcp_sock {
 	__u32	mdev_max;	/* maximal mdev for the last rtt period	*/
 	__u32	rttvar;		/* smoothed mdev_max			*/
 	__u32	rtt_seq;	/* sequence number to update rttvar	*/
-	__u32	rto;		/* retransmit timeout			*/
 
 	__u32	packets_out;	/* Packets which are "in flight"	*/
 	__u32	left_out;	/* Packets which leaved network	*/
 	__u32	retrans_out;	/* Retransmitted packets out		*/
-	__u8	backoff;	/* backoff				*/
 /*
  *      Options received (usually on last packet, some only on SYN packets).
  */
-	__u8	nonagle;	/* Disable Nagle algorithm?             */
-	__u8	keepalive_probes; /* num of allowed keep alive probes	*/
-
-	__u8	probes_out;	/* unanswered 0 window probes		*/
 	struct tcp_options_received rx_opt;
 
 /*
@@ -333,11 +293,6 @@ struct tcp_sock {
 	__u32	snd_cwnd_used;
 	__u32	snd_cwnd_stamp;
 
-	/* Two commonly used timers in both sender and receiver paths. */
-	unsigned long		timeout;
- 	struct timer_list	retransmit_timer;	/* Resend (no ack)	*/
- 	struct timer_list	delack_timer;		/* Ack delay 		*/
-
 	struct sk_buff_head	out_of_order_queue; /* Out of order segments go here */
 
 	struct tcp_func		*af_specific;	/* Operations which are AF_INET{4,6} specific	*/
@@ -352,8 +307,7 @@ struct tcp_sock {
 	struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */
 	struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/
 
-	__u8	syn_retries;	/* num of allowed syn retries */
-	__u8	ecn_flags;	/* ECN status bits.			*/
+	__u16	advmss;		/* Advertised MSS			*/
 	__u16	prior_ssthresh; /* ssthresh saved at recovery start	*/
 	__u32	lost_out;	/* Lost packets			*/
 	__u32	sacked_out;	/* SACK'd packets			*/
@@ -367,14 +321,12 @@ struct tcp_sock {
 	int	undo_retrans;	/* number of undoable retransmissions. */
 	__u32	urg_seq;	/* Seq of received urgent pointer */
 	__u16	urg_data;	/* Saved octet of OOB data and control flags */
-	__u8	pending;	/* Scheduled timer event	*/
 	__u8	urg_mode;	/* In urgent mode		*/
+	__u8	ecn_flags;	/* ECN status bits.			*/
 	__u32	snd_up;		/* Urgent pointer		*/
 
 	__u32	total_retrans;	/* Total retransmits for entire connection */
 
-	struct request_sock_queue accept_queue; /* FIFO of established children */
-
 	unsigned int		keepalive_time;	  /* time before keep alive takes place */
 	unsigned int		keepalive_intvl;  /* time interval between keep alive probes */
 	int			linger2;
@@ -394,11 +346,6 @@ struct tcp_sock {
 		__u32	seq;
 		__u32	time;
 	} rcvq_space;
-
-	/* Pluggable TCP congestion control hook */
-	struct tcp_congestion_ops *ca_ops;
-	u32	ca_priv[16];
-#define TCP_CA_PRIV_SIZE	(16*sizeof(u32))
 };
 
 static inline struct tcp_sock *tcp_sk(const struct sock *sk)
@@ -406,9 +353,18 @@ static inline struct tcp_sock *tcp_sk(const struct sock *sk)
 	return (struct tcp_sock *)sk;
 }
 
-static inline void *tcp_ca(const struct tcp_sock *tp)
+struct tcp_timewait_sock {
+	struct inet_timewait_sock tw_sk;
+	__u32			  tw_rcv_nxt;
+	__u32			  tw_snd_nxt;
+	__u32			  tw_rcv_wnd;
+	__u32			  tw_ts_recent;
+	long			  tw_ts_recent_stamp;
+};
+
+static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk)
 {
-	return (void *) tp->ca_priv;
+	return (struct tcp_timewait_sock *)sk;
 }
 
 #endif
diff --git a/include/linux/tcp_diag.h b/include/linux/tcp_diag.h
deleted file mode 100644
index 7a599674394..00000000000
--- a/include/linux/tcp_diag.h
+++ /dev/null
@@ -1,127 +0,0 @@
-#ifndef _TCP_DIAG_H_
-#define _TCP_DIAG_H_ 1
-
-/* Just some random number */
-#define TCPDIAG_GETSOCK 18
-
-/* Socket identity */
-struct tcpdiag_sockid
-{
-	__u16	tcpdiag_sport;
-	__u16	tcpdiag_dport;
-	__u32	tcpdiag_src[4];
-	__u32	tcpdiag_dst[4];
-	__u32	tcpdiag_if;
-	__u32	tcpdiag_cookie[2];
-#define TCPDIAG_NOCOOKIE (~0U)
-};
-
-/* Request structure */
-
-struct tcpdiagreq
-{
-	__u8	tcpdiag_family;		/* Family of addresses. */
-	__u8	tcpdiag_src_len;
-	__u8	tcpdiag_dst_len;
-	__u8	tcpdiag_ext;		/* Query extended information */
-
-	struct tcpdiag_sockid id;
-
-	__u32	tcpdiag_states;		/* States to dump */
-	__u32	tcpdiag_dbs;		/* Tables to dump (NI) */
-};
-
-enum
-{
-	TCPDIAG_REQ_NONE,
-	TCPDIAG_REQ_BYTECODE,
-};
-
-#define TCPDIAG_REQ_MAX TCPDIAG_REQ_BYTECODE
-
-/* Bytecode is sequence of 4 byte commands followed by variable arguments.
- * All the commands identified by "code" are conditional jumps forward:
- * to offset cc+"yes" or to offset cc+"no". "yes" is supposed to be
- * length of the command and its arguments.
- */
- 
-struct tcpdiag_bc_op
-{
-	unsigned char	code;
-	unsigned char	yes;
-	unsigned short	no;
-};
-
-enum
-{
-	TCPDIAG_BC_NOP,
-	TCPDIAG_BC_JMP,
-	TCPDIAG_BC_S_GE,
-	TCPDIAG_BC_S_LE,
-	TCPDIAG_BC_D_GE,
-	TCPDIAG_BC_D_LE,
-	TCPDIAG_BC_AUTO,
-	TCPDIAG_BC_S_COND,
-	TCPDIAG_BC_D_COND,
-};
-
-struct tcpdiag_hostcond
-{
-	__u8	family;
-	__u8	prefix_len;
-	int	port;
-	__u32	addr[0];
-};
-
-/* Base info structure. It contains socket identity (addrs/ports/cookie)
- * and, alas, the information shown by netstat. */
-struct tcpdiagmsg
-{
-	__u8	tcpdiag_family;
-	__u8	tcpdiag_state;
-	__u8	tcpdiag_timer;
-	__u8	tcpdiag_retrans;
-
-	struct tcpdiag_sockid id;
-
-	__u32	tcpdiag_expires;
-	__u32	tcpdiag_rqueue;
-	__u32	tcpdiag_wqueue;
-	__u32	tcpdiag_uid;
-	__u32	tcpdiag_inode;
-};
-
-/* Extensions */
-
-enum
-{
-	TCPDIAG_NONE,
-	TCPDIAG_MEMINFO,
-	TCPDIAG_INFO,
-	TCPDIAG_VEGASINFO,
-	TCPDIAG_CONG,
-};
-
-#define TCPDIAG_MAX TCPDIAG_CONG
-
-
-/* TCPDIAG_MEM */
-
-struct tcpdiag_meminfo
-{
-	__u32	tcpdiag_rmem;
-	__u32	tcpdiag_wmem;
-	__u32	tcpdiag_fmem;
-	__u32	tcpdiag_tmem;
-};
-
-/* TCPDIAG_VEGASINFO */
-
-struct tcpvegas_info {
-	__u32	tcpv_enabled;
-	__u32	tcpv_rttcnt;
-	__u32	tcpv_rtt;
-	__u32	tcpv_minrtt;
-};
-
-#endif /* _TCP_DIAG_H_ */
diff --git a/include/linux/types.h b/include/linux/types.h
index dcb13f865df..2b678c22ca4 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -123,6 +123,9 @@ typedef		__u64		u_int64_t;
 typedef		__s64		int64_t;
 #endif
 
+/* this is a special 64bit data type that is 8-byte aligned */
+#define aligned_u64 unsigned long long __attribute__((aligned(8)))
+
 /*
  * The type used for indexing onto a disc or disc partition.
  * If required, asm/types.h can override it and define
diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h
index f0d423300d8..0fb077d6844 100644
--- a/include/linux/xfrm.h
+++ b/include/linux/xfrm.h
@@ -258,9 +258,27 @@ struct xfrm_usersa_flush {
 	__u8				proto;
 };
 
+#ifndef __KERNEL__
+/* backwards compatibility for userspace */
 #define XFRMGRP_ACQUIRE		1
 #define XFRMGRP_EXPIRE		2
 #define XFRMGRP_SA		4
 #define XFRMGRP_POLICY		8
+#endif
+
+enum xfrm_nlgroups {
+	XFRMNLGRP_NONE,
+#define XFRMNLGRP_NONE		XFRMNLGRP_NONE
+	XFRMNLGRP_ACQUIRE,
+#define XFRMNLGRP_ACQUIRE	XFRMNLGRP_ACQUIRE
+	XFRMNLGRP_EXPIRE,
+#define XFRMNLGRP_EXPIRE	XFRMNLGRP_EXPIRE
+	XFRMNLGRP_SA,
+#define XFRMNLGRP_SA		XFRMNLGRP_SA
+	XFRMNLGRP_POLICY,
+#define XFRMNLGRP_POLICY	XFRMNLGRP_POLICY
+	__XFRMNLGRP_MAX
+};
+#define XFRMNLGRP_MAX	(__XFRMNLGRP_MAX - 1)
 
 #endif /* _LINUX_XFRM_H */
diff --git a/include/net/act_api.h b/include/net/act_api.h
index ed00a995f57..b55eb7c7f03 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -63,7 +63,7 @@ struct tc_action_ops
 	__u32   type; /* TBD to match kind */
 	__u32 	capab;  /* capabilities includes 4 bit version */
 	struct module		*owner;
-	int     (*act)(struct sk_buff **, struct tc_action *);
+	int     (*act)(struct sk_buff **, struct tc_action *, struct tcf_result *);
 	int     (*get_stats)(struct sk_buff *, struct tc_action *);
 	int     (*dump)(struct sk_buff *, struct tc_action *,int , int);
 	int     (*cleanup)(struct tc_action *, int bind);
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index a0ed9367217..750e2508dd9 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -45,6 +45,7 @@ struct prefix_info {
 
 #ifdef __KERNEL__
 
+#include <linux/config.h>
 #include <linux/netdevice.h>
 #include <net/if_inet6.h>
 #include <net/ipv6.h>
@@ -238,5 +239,10 @@ static inline int ipv6_addr_is_ll_all_routers(const struct in6_addr *addr)
 		addr->s6_addr32[3] == htonl(0x00000002));
 }
 
+#ifdef CONFIG_PROC_FS
+extern int if6_proc_init(void);
+extern void if6_proc_exit(void);
+#endif
+
 #endif
 #endif
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index b60b3846b9d..b5d785ab4a0 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -1,5 +1,11 @@
 #ifndef __LINUX_NET_AFUNIX_H
 #define __LINUX_NET_AFUNIX_H
+
+#include <linux/config.h>
+#include <linux/socket.h>
+#include <linux/un.h>
+#include <net/sock.h>
+
 extern void unix_inflight(struct file *fp);
 extern void unix_notinflight(struct file *fp);
 extern void unix_gc(void);
@@ -74,5 +80,14 @@ struct unix_sock {
         wait_queue_head_t       peer_wait;
 };
 #define unix_sk(__sk) ((struct unix_sock *)__sk)
+
+#ifdef CONFIG_SYSCTL
+extern int sysctl_unix_max_dgram_qlen;
+extern void unix_sysctl_register(void);
+extern void unix_sysctl_unregister(void);
+#else
+static inline void unix_sysctl_register(void) {}
+static inline void unix_sysctl_unregister(void) {}
+#endif
 #endif
 #endif
diff --git a/include/net/arp.h b/include/net/arp.h
index a1f09fad6a5..a13e30c35f4 100644
--- a/include/net/arp.h
+++ b/include/net/arp.h
@@ -11,7 +11,7 @@ extern struct neigh_table arp_tbl;
 
 extern void	arp_init(void);
 extern int	arp_rcv(struct sk_buff *skb, struct net_device *dev,
-			struct packet_type *pt);
+			struct packet_type *pt, struct net_device *orig_dev);
 extern int	arp_find(unsigned char *haddr, struct sk_buff *skb);
 extern int	arp_ioctl(unsigned int cmd, void __user *arg);
 extern void     arp_send(int type, int ptype, u32 dest_ip, 
diff --git a/include/net/ax25.h b/include/net/ax25.h
index 3696f988a9f..926eed54302 100644
--- a/include/net/ax25.h
+++ b/include/net/ax25.h
@@ -316,7 +316,7 @@ extern int  ax25_protocol_is_registered(unsigned int);
 
 /* ax25_in.c */
 extern int  ax25_rx_iframe(ax25_cb *, struct sk_buff *);
-extern int  ax25_kiss_rcv(struct sk_buff *, struct net_device *, struct packet_type *);
+extern int  ax25_kiss_rcv(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *);
 
 /* ax25_ip.c */
 extern int  ax25_encapsulate(struct sk_buff *, struct net_device *, unsigned short, void *, void *, unsigned int);
diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h
index 06b24f63702..6dfa4a61ffd 100644
--- a/include/net/bluetooth/bluetooth.h
+++ b/include/net/bluetooth/bluetooth.h
@@ -131,11 +131,12 @@ struct sock *bt_accept_dequeue(struct sock *parent, struct socket *newsock);
 
 /* Skb helpers */
 struct bt_skb_cb {
-	int incoming;
+	__u8 pkt_type;
+	__u8 incoming;
 };
 #define bt_cb(skb) ((struct bt_skb_cb *)(skb->cb)) 
 
-static inline struct sk_buff *bt_skb_alloc(unsigned int len, int how)
+static inline struct sk_buff *bt_skb_alloc(unsigned int len, unsigned int __nocast how)
 {
 	struct sk_buff *skb;
 
diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index 6f0706f4af6..371e7d3f2e6 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -453,6 +453,15 @@ struct inquiry_info_with_rssi {
 	__u16    clock_offset;
 	__s8     rssi;
 } __attribute__ ((packed));
+struct inquiry_info_with_rssi_and_pscan_mode {
+	bdaddr_t bdaddr;
+	__u8     pscan_rep_mode;
+	__u8     pscan_period_mode;
+	__u8     pscan_mode;
+	__u8     dev_class[3];
+	__u16    clock_offset;
+	__s8     rssi;
+} __attribute__ ((packed));
 
 #define HCI_EV_CONN_COMPLETE 	0x03
 struct hci_ev_conn_complete {
@@ -584,6 +593,12 @@ struct hci_ev_clock_offset {
 	__u16    clock_offset;
 } __attribute__ ((packed));
 
+#define HCI_EV_PSCAN_REP_MODE	0x20
+struct hci_ev_pscan_rep_mode {
+	bdaddr_t bdaddr;
+	__u8     pscan_rep_mode;
+} __attribute__ ((packed));
+
 /* Internal events generated by Bluetooth stack */
 #define HCI_EV_STACK_INTERNAL	0xFD
 struct hci_ev_stack_internal {
diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 6d63a47c731..7f933f30207 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -404,7 +404,7 @@ static inline int hci_recv_frame(struct sk_buff *skb)
 	bt_cb(skb)->incoming = 1;
 
 	/* Time stamp */
-	do_gettimeofday(&skb->stamp);
+	__net_timestamp(skb);
 
 	/* Queue frame for rx task */
 	skb_queue_tail(&hdev->rx_q, skb);
diff --git a/include/net/bluetooth/rfcomm.h b/include/net/bluetooth/rfcomm.h
index 13669bad00b..ffea9d54071 100644
--- a/include/net/bluetooth/rfcomm.h
+++ b/include/net/bluetooth/rfcomm.h
@@ -80,9 +80,9 @@
 #define RFCOMM_RPN_STOP_15	1
 
 #define RFCOMM_RPN_PARITY_NONE	0x0
-#define RFCOMM_RPN_PARITY_ODD	0x4
-#define RFCOMM_RPN_PARITY_EVEN	0x5
-#define RFCOMM_RPN_PARITY_MARK	0x6
+#define RFCOMM_RPN_PARITY_ODD	0x1
+#define RFCOMM_RPN_PARITY_EVEN	0x3
+#define RFCOMM_RPN_PARITY_MARK	0x5
 #define RFCOMM_RPN_PARITY_SPACE	0x7
 
 #define RFCOMM_RPN_FLOW_NONE	0x00
@@ -223,8 +223,14 @@ struct rfcomm_dlc {
 #define RFCOMM_CFC_DISABLED 0
 #define RFCOMM_CFC_ENABLED  RFCOMM_MAX_CREDITS
 
+/* ---- RFCOMM SEND RPN ---- */
+int rfcomm_send_rpn(struct rfcomm_session *s, int cr, u8 dlci,
+			u8 bit_rate, u8 data_bits, u8 stop_bits,
+			u8 parity, u8 flow_ctrl_settings, 
+			u8 xon_char, u8 xoff_char, u16 param_mask);
+
 /* ---- RFCOMM DLCs (channels) ---- */
-struct rfcomm_dlc *rfcomm_dlc_alloc(int prio);
+struct rfcomm_dlc *rfcomm_dlc_alloc(unsigned int __nocast prio);
 void rfcomm_dlc_free(struct rfcomm_dlc *d);
 int  rfcomm_dlc_open(struct rfcomm_dlc *d, bdaddr_t *src, bdaddr_t *dst, u8 channel);
 int  rfcomm_dlc_close(struct rfcomm_dlc *d, int reason);
diff --git a/include/net/datalink.h b/include/net/datalink.h
index 5797ba3d2eb..deb7ca75db4 100644
--- a/include/net/datalink.h
+++ b/include/net/datalink.h
@@ -9,7 +9,7 @@ struct datalink_proto {
         unsigned short  header_length;
 
         int     (*rcvfunc)(struct sk_buff *, struct net_device *,
-                                struct packet_type *);
+                                struct packet_type *, struct net_device *);
 	int     (*request)(struct datalink_proto *, struct sk_buff *,
                                         unsigned char *);
 	struct list_head node;
diff --git a/include/net/dn.h b/include/net/dn.h
index 5551c46db39..c1dbbd22279 100644
--- a/include/net/dn.h
+++ b/include/net/dn.h
@@ -3,6 +3,7 @@
 
 #include <linux/dn.h>
 #include <net/sock.h>
+#include <net/tcp.h>
 #include <asm/byteorder.h>
 
 typedef unsigned short dn_address;
diff --git a/include/net/icmp.h b/include/net/icmp.h
index e5ef0d15fb4..6cdebeee5f9 100644
--- a/include/net/icmp.h
+++ b/include/net/icmp.h
@@ -57,4 +57,11 @@ static inline struct raw_sock *raw_sk(const struct sock *sk)
 	return (struct raw_sock *)sk;
 }
 
+extern int sysctl_icmp_echo_ignore_all;
+extern int sysctl_icmp_echo_ignore_broadcasts;
+extern int sysctl_icmp_ignore_bogus_error_responses;
+extern int sysctl_icmp_errors_use_inbound_ifaddr;
+extern int sysctl_icmp_ratelimit;
+extern int sysctl_icmp_ratemask;
+
 #endif	/* _ICMP_H */
diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
new file mode 100644
index 00000000000..03df3b15796
--- /dev/null
+++ b/include/net/inet6_hashtables.h
@@ -0,0 +1,130 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ * Authors:	Lotsa people, from code originally in tcp
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _INET6_HASHTABLES_H
+#define _INET6_HASHTABLES_H
+
+#include <linux/config.h>
+
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+#include <linux/in6.h>
+#include <linux/ipv6.h>
+#include <linux/types.h>
+
+#include <net/ipv6.h>
+
+struct inet_hashinfo;
+
+/* I have no idea if this is a good hash for v6 or not. -DaveM */
+static inline int inet6_ehashfn(const struct in6_addr *laddr, const u16 lport,
+				const struct in6_addr *faddr, const u16 fport,
+				const int ehash_size)
+{
+	int hashent = (lport ^ fport);
+
+	hashent ^= (laddr->s6_addr32[3] ^ faddr->s6_addr32[3]);
+	hashent ^= hashent >> 16;
+	hashent ^= hashent >> 8;
+	return (hashent & (ehash_size - 1));
+}
+
+static inline int inet6_sk_ehashfn(const struct sock *sk, const int ehash_size)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+	const struct ipv6_pinfo *np = inet6_sk(sk);
+	const struct in6_addr *laddr = &np->rcv_saddr;
+	const struct in6_addr *faddr = &np->daddr;
+	const __u16 lport = inet->num;
+	const __u16 fport = inet->dport;
+	return inet6_ehashfn(laddr, lport, faddr, fport, ehash_size);
+}
+
+/*
+ * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
+ * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
+ *
+ * The sockhash lock must be held as a reader here.
+ */
+static inline struct sock *
+		__inet6_lookup_established(struct inet_hashinfo *hashinfo,
+					   const struct in6_addr *saddr,
+					   const u16 sport,
+					   const struct in6_addr *daddr,
+					   const u16 hnum,
+					   const int dif)
+{
+	struct sock *sk;
+	const struct hlist_node *node;
+	const __u32 ports = INET_COMBINED_PORTS(sport, hnum);
+	/* Optimize here for direct hit, only listening connections can
+	 * have wildcards anyways.
+	 */
+	const int hash = inet6_ehashfn(daddr, hnum, saddr, sport,
+				       hashinfo->ehash_size);
+	struct inet_ehash_bucket *head = &hashinfo->ehash[hash];
+
+	read_lock(&head->lock);
+	sk_for_each(sk, node, &head->chain) {
+		/* For IPV6 do the cheaper port and family tests first. */
+		if (INET6_MATCH(sk, saddr, daddr, ports, dif))
+			goto hit; /* You sunk my battleship! */
+	}
+	/* Must check for a TIME_WAIT'er before going to listener hash. */
+	sk_for_each(sk, node, &(head + hashinfo->ehash_size)->chain) {
+		const struct inet_timewait_sock *tw = inet_twsk(sk);
+
+		if(*((__u32 *)&(tw->tw_dport))	== ports	&&
+		   sk->sk_family		== PF_INET6) {
+			const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk);
+
+			if (ipv6_addr_equal(&tcp6tw->tw_v6_daddr, saddr)	&&
+			    ipv6_addr_equal(&tcp6tw->tw_v6_rcv_saddr, daddr)	&&
+			    (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dif))
+				goto hit;
+		}
+	}
+	read_unlock(&head->lock);
+	return NULL;
+
+hit:
+	sock_hold(sk);
+	read_unlock(&head->lock);
+	return sk;
+}
+
+extern struct sock *inet6_lookup_listener(struct inet_hashinfo *hashinfo,
+					  const struct in6_addr *daddr,
+					  const unsigned short hnum,
+					  const int dif);
+
+static inline struct sock *__inet6_lookup(struct inet_hashinfo *hashinfo,
+					  const struct in6_addr *saddr,
+					  const u16 sport,
+					  const struct in6_addr *daddr,
+					  const u16 hnum,
+					  const int dif)
+{
+	struct sock *sk = __inet6_lookup_established(hashinfo, saddr, sport,
+						     daddr, hnum, dif);
+	if (sk)
+		return sk;
+
+	return inet6_lookup_listener(hashinfo, daddr, hnum, dif);
+}
+
+extern struct sock *inet6_lookup(struct inet_hashinfo *hashinfo,
+				 const struct in6_addr *saddr, const u16 sport,
+				 const struct in6_addr *daddr, const u16 dport,
+				 const int dif);
+#endif /* defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) */
+#endif /* _INET6_HASHTABLES_H */
diff --git a/include/net/inet_common.h b/include/net/inet_common.h
index fbc1f4d140d..f943306ce5f 100644
--- a/include/net/inet_common.h
+++ b/include/net/inet_common.h
@@ -8,6 +8,11 @@ extern struct proto_ops		inet_dgram_ops;
  *	INET4 prototypes used by INET6
  */
 
+struct msghdr;
+struct sock;
+struct sockaddr;
+struct socket;
+
 extern void			inet_remove_sock(struct sock *sk1);
 extern void			inet_put_sock(unsigned short num, 
 					      struct sock *sk);
@@ -29,7 +34,6 @@ extern unsigned int		inet_poll(struct file * file, struct socket *sock, struct p
 extern int			inet_listen(struct socket *sock, int backlog);
 
 extern void			inet_sock_destruct(struct sock *sk);
-extern atomic_t			inet_sock_nr;
 
 extern int			inet_bind(struct socket *sock, 
 					  struct sockaddr *uaddr, int addr_len);
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
new file mode 100644
index 00000000000..8a87a3a4f10
--- /dev/null
+++ b/include/net/inet_connection_sock.h
@@ -0,0 +1,276 @@
+/*
+ * NET		Generic infrastructure for INET connection oriented protocols.
+ *
+ *		Definitions for inet_connection_sock 
+ *
+ * Authors:	Many people, see the TCP sources
+ *
+ * 		From code originally in TCP
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+#ifndef _INET_CONNECTION_SOCK_H
+#define _INET_CONNECTION_SOCK_H
+
+#include <linux/ip.h>
+#include <linux/string.h>
+#include <linux/timer.h>
+#include <net/request_sock.h>
+
+#define INET_CSK_DEBUG 1
+
+/* Cancel timers, when they are not required. */
+#undef INET_CSK_CLEAR_TIMERS
+
+struct inet_bind_bucket;
+struct inet_hashinfo;
+struct tcp_congestion_ops;
+
+/** inet_connection_sock - INET connection oriented sock
+ *
+ * @icsk_accept_queue:	   FIFO of established children 
+ * @icsk_bind_hash:	   Bind node
+ * @icsk_timeout:	   Timeout
+ * @icsk_retransmit_timer: Resend (no ack)
+ * @icsk_rto:		   Retransmit timeout
+ * @icsk_ca_ops		   Pluggable congestion control hook
+ * @icsk_ca_state:	   Congestion control state
+ * @icsk_retransmits:	   Number of unrecovered [RTO] timeouts
+ * @icsk_pending:	   Scheduled timer event
+ * @icsk_backoff:	   Backoff
+ * @icsk_syn_retries:      Number of allowed SYN (or equivalent) retries
+ * @icsk_probes_out:	   unanswered 0 window probes
+ * @icsk_ack:		   Delayed ACK control data
+ */
+struct inet_connection_sock {
+	/* inet_sock has to be the first member! */
+	struct inet_sock	  icsk_inet;
+	struct request_sock_queue icsk_accept_queue;
+	struct inet_bind_bucket	  *icsk_bind_hash;
+	unsigned long		  icsk_timeout;
+ 	struct timer_list	  icsk_retransmit_timer;
+ 	struct timer_list	  icsk_delack_timer;
+	__u32			  icsk_rto;
+	struct tcp_congestion_ops *icsk_ca_ops;
+	__u8			  icsk_ca_state;
+	__u8			  icsk_retransmits;
+	__u8			  icsk_pending;
+	__u8			  icsk_backoff;
+	__u8			  icsk_syn_retries;
+	__u8			  icsk_probes_out;
+	/* 2 BYTES HOLE, TRY TO PACK! */
+	struct {
+		__u8		  pending;	 /* ACK is pending			   */
+		__u8		  quick;	 /* Scheduled number of quick acks	   */
+		__u8		  pingpong;	 /* The session is interactive		   */
+		__u8		  blocked;	 /* Delayed ACK was blocked by socket lock */
+		__u32		  ato;		 /* Predicted tick of soft clock	   */
+		unsigned long	  timeout;	 /* Currently scheduled timeout		   */
+		__u32		  lrcvtime;	 /* timestamp of last received data packet */
+		__u16		  last_seg_size; /* Size of last incoming segment	   */
+		__u16		  rcv_mss;	 /* MSS used for delayed ACK decisions	   */ 
+	} icsk_ack;
+	u32			  icsk_ca_priv[16];
+#define ICSK_CA_PRIV_SIZE	(16 * sizeof(u32))
+};
+
+#define ICSK_TIME_RETRANS	1	/* Retransmit timer */
+#define ICSK_TIME_DACK		2	/* Delayed ack timer */
+#define ICSK_TIME_PROBE0	3	/* Zero window probe timer */
+#define ICSK_TIME_KEEPOPEN	4	/* Keepalive timer */
+
+static inline struct inet_connection_sock *inet_csk(const struct sock *sk)
+{
+	return (struct inet_connection_sock *)sk;
+}
+
+static inline void *inet_csk_ca(const struct sock *sk)
+{
+	return (void *)inet_csk(sk)->icsk_ca_priv;
+}
+
+extern struct sock *inet_csk_clone(struct sock *sk,
+				   const struct request_sock *req,
+				   const unsigned int __nocast priority);
+
+enum inet_csk_ack_state_t {
+	ICSK_ACK_SCHED	= 1,
+	ICSK_ACK_TIMER  = 2,
+	ICSK_ACK_PUSHED = 4
+};
+
+extern void inet_csk_init_xmit_timers(struct sock *sk,
+				      void (*retransmit_handler)(unsigned long),
+				      void (*delack_handler)(unsigned long),
+				      void (*keepalive_handler)(unsigned long));
+extern void inet_csk_clear_xmit_timers(struct sock *sk);
+
+static inline void inet_csk_schedule_ack(struct sock *sk)
+{
+	inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_SCHED;
+}
+
+static inline int inet_csk_ack_scheduled(const struct sock *sk)
+{
+	return inet_csk(sk)->icsk_ack.pending & ICSK_ACK_SCHED;
+}
+
+static inline void inet_csk_delack_init(struct sock *sk)
+{
+	memset(&inet_csk(sk)->icsk_ack, 0, sizeof(inet_csk(sk)->icsk_ack));
+}
+
+extern void inet_csk_delete_keepalive_timer(struct sock *sk);
+extern void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long timeout);
+
+#ifdef INET_CSK_DEBUG
+extern const char inet_csk_timer_bug_msg[];
+#endif
+
+static inline void inet_csk_clear_xmit_timer(struct sock *sk, const int what)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	
+	if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0) {
+		icsk->icsk_pending = 0;
+#ifdef INET_CSK_CLEAR_TIMERS
+		sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
+#endif
+	} else if (what == ICSK_TIME_DACK) {
+		icsk->icsk_ack.blocked = icsk->icsk_ack.pending = 0;
+#ifdef INET_CSK_CLEAR_TIMERS
+		sk_stop_timer(sk, &icsk->icsk_delack_timer);
+#endif
+	}
+#ifdef INET_CSK_DEBUG
+	else {
+		pr_debug(inet_csk_timer_bug_msg);
+	}
+#endif
+}
+
+/*
+ *	Reset the retransmission timer
+ */
+static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what,
+					     unsigned long when,
+					     const unsigned long max_when)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	if (when > max_when) {
+#ifdef INET_CSK_DEBUG
+		pr_debug("reset_xmit_timer: sk=%p %d when=0x%lx, caller=%p\n",
+			 sk, what, when, current_text_addr());
+#endif
+		when = max_when;
+	}
+
+	if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0) {
+		icsk->icsk_pending = what;
+		icsk->icsk_timeout = jiffies + when;
+		sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
+	} else if (what == ICSK_TIME_DACK) {
+		icsk->icsk_ack.pending |= ICSK_ACK_TIMER;
+		icsk->icsk_ack.timeout = jiffies + when;
+		sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout);
+	}
+#ifdef INET_CSK_DEBUG
+	else {
+		pr_debug(inet_csk_timer_bug_msg);
+	}
+#endif
+}
+
+extern struct sock *inet_csk_accept(struct sock *sk, int flags, int *err);
+
+extern struct request_sock *inet_csk_search_req(const struct sock *sk,
+						struct request_sock ***prevp,
+						const __u16 rport,
+						const __u32 raddr,
+						const __u32 laddr);
+extern int inet_csk_get_port(struct inet_hashinfo *hashinfo,
+			     struct sock *sk, unsigned short snum);
+
+extern struct dst_entry* inet_csk_route_req(struct sock *sk,
+					    const struct request_sock *req);
+
+static inline void inet_csk_reqsk_queue_add(struct sock *sk,
+					    struct request_sock *req,
+					    struct sock *child)
+{
+	reqsk_queue_add(&inet_csk(sk)->icsk_accept_queue, req, sk, child);
+}
+
+extern void inet_csk_reqsk_queue_hash_add(struct sock *sk,
+					  struct request_sock *req,
+					  const unsigned timeout);
+
+static inline void inet_csk_reqsk_queue_removed(struct sock *sk,
+						struct request_sock *req)
+{
+	if (reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req) == 0)
+		inet_csk_delete_keepalive_timer(sk);
+}
+
+static inline void inet_csk_reqsk_queue_added(struct sock *sk,
+					      const unsigned long timeout)
+{
+	if (reqsk_queue_added(&inet_csk(sk)->icsk_accept_queue) == 0)
+		inet_csk_reset_keepalive_timer(sk, timeout);
+}
+
+static inline int inet_csk_reqsk_queue_len(const struct sock *sk)
+{
+	return reqsk_queue_len(&inet_csk(sk)->icsk_accept_queue);
+}
+
+static inline int inet_csk_reqsk_queue_young(const struct sock *sk)
+{
+	return reqsk_queue_len_young(&inet_csk(sk)->icsk_accept_queue);
+}
+
+static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk)
+{
+	return reqsk_queue_is_full(&inet_csk(sk)->icsk_accept_queue);
+}
+
+static inline void inet_csk_reqsk_queue_unlink(struct sock *sk,
+					       struct request_sock *req,
+					       struct request_sock **prev)
+{
+	reqsk_queue_unlink(&inet_csk(sk)->icsk_accept_queue, req, prev);
+}
+
+static inline void inet_csk_reqsk_queue_drop(struct sock *sk,
+					     struct request_sock *req,
+					     struct request_sock **prev)
+{
+	inet_csk_reqsk_queue_unlink(sk, req, prev);
+	inet_csk_reqsk_queue_removed(sk, req);
+	reqsk_free(req);
+}
+
+extern void inet_csk_reqsk_queue_prune(struct sock *parent,
+				       const unsigned long interval,
+				       const unsigned long timeout,
+				       const unsigned long max_rto);
+
+extern void inet_csk_destroy_sock(struct sock *sk);
+
+/*
+ * LISTEN is a special case for poll..
+ */
+static inline unsigned int inet_csk_listen_poll(const struct sock *sk)
+{
+	return !reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue) ?
+			(POLLIN | POLLRDNORM) : 0;
+}
+
+extern int  inet_csk_listen_start(struct sock *sk, const int nr_table_entries);
+extern void inet_csk_listen_stop(struct sock *sk);
+
+#endif /* _INET_CONNECTION_SOCK_H */
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
new file mode 100644
index 00000000000..646b6ea7fe2
--- /dev/null
+++ b/include/net/inet_hashtables.h
@@ -0,0 +1,427 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ * Authors:	Lotsa people, from code originally in tcp
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _INET_HASHTABLES_H
+#define _INET_HASHTABLES_H
+
+#include <linux/config.h>
+
+#include <linux/interrupt.h>
+#include <linux/ipv6.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/socket.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+
+#include <net/inet_connection_sock.h>
+#include <net/route.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+
+#include <asm/atomic.h>
+#include <asm/byteorder.h>
+
+/* This is for all connections with a full identity, no wildcards.
+ * New scheme, half the table is for TIME_WAIT, the other half is
+ * for the rest.  I'll experiment with dynamic table growth later.
+ */
+struct inet_ehash_bucket {
+	rwlock_t	  lock;
+	struct hlist_head chain;
+} __attribute__((__aligned__(8)));
+
+/* There are a few simple rules, which allow for local port reuse by
+ * an application.  In essence:
+ *
+ *	1) Sockets bound to different interfaces may share a local port.
+ *	   Failing that, goto test 2.
+ *	2) If all sockets have sk->sk_reuse set, and none of them are in
+ *	   TCP_LISTEN state, the port may be shared.
+ *	   Failing that, goto test 3.
+ *	3) If all sockets are bound to a specific inet_sk(sk)->rcv_saddr local
+ *	   address, and none of them are the same, the port may be
+ *	   shared.
+ *	   Failing this, the port cannot be shared.
+ *
+ * The interesting point, is test #2.  This is what an FTP server does
+ * all day.  To optimize this case we use a specific flag bit defined
+ * below.  As we add sockets to a bind bucket list, we perform a
+ * check of: (newsk->sk_reuse && (newsk->sk_state != TCP_LISTEN))
+ * As long as all sockets added to a bind bucket pass this test,
+ * the flag bit will be set.
+ * The resulting situation is that tcp_v[46]_verify_bind() can just check
+ * for this flag bit, if it is set and the socket trying to bind has
+ * sk->sk_reuse set, we don't even have to walk the owners list at all,
+ * we return that it is ok to bind this socket to the requested local port.
+ *
+ * Sounds like a lot of work, but it is worth it.  In a more naive
+ * implementation (ie. current FreeBSD etc.) the entire list of ports
+ * must be walked for each data port opened by an ftp server.  Needless
+ * to say, this does not scale at all.  With a couple thousand FTP
+ * users logged onto your box, isn't it nice to know that new data
+ * ports are created in O(1) time?  I thought so. ;-)	-DaveM
+ */
+struct inet_bind_bucket {
+	unsigned short		port;
+	signed short		fastreuse;
+	struct hlist_node	node;
+	struct hlist_head	owners;
+};
+
+#define inet_bind_bucket_for_each(tb, node, head) \
+	hlist_for_each_entry(tb, node, head, node)
+
+struct inet_bind_hashbucket {
+	spinlock_t		lock;
+	struct hlist_head	chain;
+};
+
+/* This is for listening sockets, thus all sockets which possess wildcards. */
+#define INET_LHTABLE_SIZE	32	/* Yes, really, this is all you need. */
+
+struct inet_hashinfo {
+	/* This is for sockets with full identity only.  Sockets here will
+	 * always be without wildcards and will have the following invariant:
+	 *
+	 *          TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE
+	 *
+	 * First half of the table is for sockets not in TIME_WAIT, second half
+	 * is for TIME_WAIT sockets only.
+	 */
+	struct inet_ehash_bucket	*ehash;
+
+	/* Ok, let's try this, I give up, we do need a local binding
+	 * TCP hash as well as the others for fast bind/connect.
+	 */
+	struct inet_bind_hashbucket	*bhash;
+
+	int				bhash_size;
+	int				ehash_size;
+
+	/* All sockets in TCP_LISTEN state will be in here.  This is the only
+	 * table where wildcard'd TCP sockets can exist.  Hash function here
+	 * is just local port number.
+	 */
+	struct hlist_head		listening_hash[INET_LHTABLE_SIZE];
+
+	/* All the above members are written once at bootup and
+	 * never written again _or_ are predominantly read-access.
+	 *
+	 * Now align to a new cache line as all the following members
+	 * are often dirty.
+	 */
+	rwlock_t			lhash_lock ____cacheline_aligned;
+	atomic_t			lhash_users;
+	wait_queue_head_t		lhash_wait;
+	spinlock_t			portalloc_lock;
+	kmem_cache_t			*bind_bucket_cachep;
+	int				port_rover;
+};
+
+static inline int inet_ehashfn(const __u32 laddr, const __u16 lport,
+			       const __u32 faddr, const __u16 fport,
+			       const int ehash_size)
+{
+	int h = (laddr ^ lport) ^ (faddr ^ fport);
+	h ^= h >> 16;
+	h ^= h >> 8;
+	return h & (ehash_size - 1);
+}
+
+static inline int inet_sk_ehashfn(const struct sock *sk, const int ehash_size)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+	const __u32 laddr = inet->rcv_saddr;
+	const __u16 lport = inet->num;
+	const __u32 faddr = inet->daddr;
+	const __u16 fport = inet->dport;
+
+	return inet_ehashfn(laddr, lport, faddr, fport, ehash_size);
+}
+
+extern struct inet_bind_bucket *
+		    inet_bind_bucket_create(kmem_cache_t *cachep,
+					    struct inet_bind_hashbucket *head,
+					    const unsigned short snum);
+extern void inet_bind_bucket_destroy(kmem_cache_t *cachep,
+				     struct inet_bind_bucket *tb);
+
+static inline int inet_bhashfn(const __u16 lport, const int bhash_size)
+{
+	return lport & (bhash_size - 1);
+}
+
+extern void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
+			   const unsigned short snum);
+
+/* These can have wildcards, don't try too hard. */
+static inline int inet_lhashfn(const unsigned short num)
+{
+	return num & (INET_LHTABLE_SIZE - 1);
+}
+
+static inline int inet_sk_listen_hashfn(const struct sock *sk)
+{
+	return inet_lhashfn(inet_sk(sk)->num);
+}
+
+/* Caller must disable local BH processing. */
+static inline void __inet_inherit_port(struct inet_hashinfo *table,
+				       struct sock *sk, struct sock *child)
+{
+	const int bhash = inet_bhashfn(inet_sk(child)->num, table->bhash_size);
+	struct inet_bind_hashbucket *head = &table->bhash[bhash];
+	struct inet_bind_bucket *tb;
+
+	spin_lock(&head->lock);
+	tb = inet_csk(sk)->icsk_bind_hash;
+	sk_add_bind_node(child, &tb->owners);
+	inet_csk(child)->icsk_bind_hash = tb;
+	spin_unlock(&head->lock);
+}
+
+static inline void inet_inherit_port(struct inet_hashinfo *table,
+				     struct sock *sk, struct sock *child)
+{
+	local_bh_disable();
+	__inet_inherit_port(table, sk, child);
+	local_bh_enable();
+}
+
+extern void inet_put_port(struct inet_hashinfo *table, struct sock *sk);
+
+extern void inet_listen_wlock(struct inet_hashinfo *hashinfo);
+
+/*
+ * - We may sleep inside this lock.
+ * - If sleeping is not required (or called from BH),
+ *   use plain read_(un)lock(&inet_hashinfo.lhash_lock).
+ */
+static inline void inet_listen_lock(struct inet_hashinfo *hashinfo)
+{
+	/* read_lock synchronizes to candidates to writers */
+	read_lock(&hashinfo->lhash_lock);
+	atomic_inc(&hashinfo->lhash_users);
+	read_unlock(&hashinfo->lhash_lock);
+}
+
+static inline void inet_listen_unlock(struct inet_hashinfo *hashinfo)
+{
+	if (atomic_dec_and_test(&hashinfo->lhash_users))
+		wake_up(&hashinfo->lhash_wait);
+}
+
+static inline void __inet_hash(struct inet_hashinfo *hashinfo,
+			       struct sock *sk, const int listen_possible)
+{
+	struct hlist_head *list;
+	rwlock_t *lock;
+
+	BUG_TRAP(sk_unhashed(sk));
+	if (listen_possible && sk->sk_state == TCP_LISTEN) {
+		list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
+		lock = &hashinfo->lhash_lock;
+		inet_listen_wlock(hashinfo);
+	} else {
+		sk->sk_hashent = inet_sk_ehashfn(sk, hashinfo->ehash_size);
+		list = &hashinfo->ehash[sk->sk_hashent].chain;
+		lock = &hashinfo->ehash[sk->sk_hashent].lock;
+		write_lock(lock);
+	}
+	__sk_add_node(sk, list);
+	sock_prot_inc_use(sk->sk_prot);
+	write_unlock(lock);
+	if (listen_possible && sk->sk_state == TCP_LISTEN)
+		wake_up(&hashinfo->lhash_wait);
+}
+
+static inline void inet_hash(struct inet_hashinfo *hashinfo, struct sock *sk)
+{
+	if (sk->sk_state != TCP_CLOSE) {
+		local_bh_disable();
+		__inet_hash(hashinfo, sk, 1);
+		local_bh_enable();
+	}
+}
+
+static inline void inet_unhash(struct inet_hashinfo *hashinfo, struct sock *sk)
+{
+	rwlock_t *lock;
+
+	if (sk_unhashed(sk))
+		goto out;
+
+	if (sk->sk_state == TCP_LISTEN) {
+		local_bh_disable();
+		inet_listen_wlock(hashinfo);
+		lock = &hashinfo->lhash_lock;
+	} else {
+		struct inet_ehash_bucket *head = &hashinfo->ehash[sk->sk_hashent];
+		lock = &head->lock;
+		write_lock_bh(&head->lock);
+	}
+
+	if (__sk_del_node_init(sk))
+		sock_prot_dec_use(sk->sk_prot);
+	write_unlock_bh(lock);
+out:
+	if (sk->sk_state == TCP_LISTEN)
+		wake_up(&hashinfo->lhash_wait);
+}
+
+static inline int inet_iif(const struct sk_buff *skb)
+{
+	return ((struct rtable *)skb->dst)->rt_iif;
+}
+
+extern struct sock *__inet_lookup_listener(const struct hlist_head *head,
+					   const u32 daddr,
+					   const unsigned short hnum,
+					   const int dif);
+
+/* Optimize the common listener case. */
+static inline struct sock *
+		inet_lookup_listener(struct inet_hashinfo *hashinfo,
+				     const u32 daddr,
+				     const unsigned short hnum, const int dif)
+{
+	struct sock *sk = NULL;
+	const struct hlist_head *head;
+
+	read_lock(&hashinfo->lhash_lock);
+	head = &hashinfo->listening_hash[inet_lhashfn(hnum)];
+	if (!hlist_empty(head)) {
+		const struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
+
+		if (inet->num == hnum && !sk->sk_node.next &&
+		    (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
+		    (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
+		    !sk->sk_bound_dev_if)
+			goto sherry_cache;
+		sk = __inet_lookup_listener(head, daddr, hnum, dif);
+	}
+	if (sk) {
+sherry_cache:
+		sock_hold(sk);
+	}
+	read_unlock(&hashinfo->lhash_lock);
+	return sk;
+}
+
+/* Socket demux engine toys. */
+#ifdef __BIG_ENDIAN
+#define INET_COMBINED_PORTS(__sport, __dport) \
+	(((__u32)(__sport) << 16) | (__u32)(__dport))
+#else /* __LITTLE_ENDIAN */
+#define INET_COMBINED_PORTS(__sport, __dport) \
+	(((__u32)(__dport) << 16) | (__u32)(__sport))
+#endif
+
+#if (BITS_PER_LONG == 64)
+#ifdef __BIG_ENDIAN
+#define INET_ADDR_COOKIE(__name, __saddr, __daddr) \
+	const __u64 __name = (((__u64)(__saddr)) << 32) | ((__u64)(__daddr));
+#else /* __LITTLE_ENDIAN */
+#define INET_ADDR_COOKIE(__name, __saddr, __daddr) \
+	const __u64 __name = (((__u64)(__daddr)) << 32) | ((__u64)(__saddr));
+#endif /* __BIG_ENDIAN */
+#define INET_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\
+	(((*((__u64 *)&(inet_sk(__sk)->daddr))) == (__cookie))	&&	\
+	 ((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports))	&&	\
+	 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
+#define INET_TW_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\
+	(((*((__u64 *)&(inet_twsk(__sk)->tw_daddr))) == (__cookie)) &&	\
+	 ((*((__u32 *)&(inet_twsk(__sk)->tw_dport))) == (__ports)) &&	\
+	 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
+#else /* 32-bit arch */
+#define INET_ADDR_COOKIE(__name, __saddr, __daddr)
+#define INET_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)	\
+	((inet_sk(__sk)->daddr		== (__saddr))		&&	\
+	 (inet_sk(__sk)->rcv_saddr	== (__daddr))		&&	\
+	 ((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports))	&&	\
+	 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
+#define INET_TW_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)	\
+	((inet_twsk(__sk)->tw_daddr	== (__saddr))		&&	\
+	 (inet_twsk(__sk)->tw_rcv_saddr	== (__daddr))		&&	\
+	 ((*((__u32 *)&(inet_twsk(__sk)->tw_dport))) == (__ports)) &&	\
+	 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
+#endif /* 64-bit arch */
+
+/*
+ * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need
+ * not check it for lookups anymore, thanks Alexey. -DaveM
+ *
+ * Local BH must be disabled here.
+ */
+static inline struct sock *
+	__inet_lookup_established(struct inet_hashinfo *hashinfo,
+				  const u32 saddr, const u16 sport,
+				  const u32 daddr, const u16 hnum,
+				  const int dif)
+{
+	INET_ADDR_COOKIE(acookie, saddr, daddr)
+	const __u32 ports = INET_COMBINED_PORTS(sport, hnum);
+	struct sock *sk;
+	const struct hlist_node *node;
+	/* Optimize here for direct hit, only listening connections can
+	 * have wildcards anyways.
+	 */
+	const int hash = inet_ehashfn(daddr, hnum, saddr, sport, hashinfo->ehash_size);
+	struct inet_ehash_bucket *head = &hashinfo->ehash[hash];
+
+	read_lock(&head->lock);
+	sk_for_each(sk, node, &head->chain) {
+		if (INET_MATCH(sk, acookie, saddr, daddr, ports, dif))
+			goto hit; /* You sunk my battleship! */
+	}
+
+	/* Must check for a TIME_WAIT'er before going to listener hash. */
+	sk_for_each(sk, node, &(head + hashinfo->ehash_size)->chain) {
+		if (INET_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
+			goto hit;
+	}
+	sk = NULL;
+out:
+	read_unlock(&head->lock);
+	return sk;
+hit:
+	sock_hold(sk);
+	goto out;
+}
+
+static inline struct sock *__inet_lookup(struct inet_hashinfo *hashinfo,
+					 const u32 saddr, const u16 sport,
+					 const u32 daddr, const u16 hnum,
+					 const int dif)
+{
+	struct sock *sk = __inet_lookup_established(hashinfo, saddr, sport, daddr,
+						    hnum, dif);
+	return sk ? : inet_lookup_listener(hashinfo, daddr, hnum, dif);
+}
+
+static inline struct sock *inet_lookup(struct inet_hashinfo *hashinfo,
+				       const u32 saddr, const u16 sport,
+				       const u32 daddr, const u16 dport,
+				       const int dif)
+{
+	struct sock *sk;
+
+	local_bh_disable();
+	sk = __inet_lookup(hashinfo, saddr, sport, daddr, ntohs(dport), dif);
+	local_bh_enable();
+
+	return sk;
+}
+#endif /* _INET_HASHTABLES_H */
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
new file mode 100644
index 00000000000..3b070352e86
--- /dev/null
+++ b/include/net/inet_timewait_sock.h
@@ -0,0 +1,219 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Definitions for a generic INET TIMEWAIT sock
+ *
+ *		From code originally in net/tcp.h
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+#ifndef _INET_TIMEWAIT_SOCK_
+#define _INET_TIMEWAIT_SOCK_
+
+#include <linux/config.h>
+
+#include <linux/ip.h>
+#include <linux/list.h>
+#include <linux/timer.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+
+#include <net/sock.h>
+#include <net/tcp_states.h>
+
+#include <asm/atomic.h>
+
+struct inet_hashinfo;
+
+#define INET_TWDR_RECYCLE_SLOTS_LOG	5
+#define INET_TWDR_RECYCLE_SLOTS		(1 << INET_TWDR_RECYCLE_SLOTS_LOG)
+
+/*
+ * If time > 4sec, it is "slow" path, no recycling is required,
+ * so that we select tick to get range about 4 seconds.
+ */
+#if HZ <= 16 || HZ > 4096
+# error Unsupported: HZ <= 16 or HZ > 4096
+#elif HZ <= 32
+# define INET_TWDR_RECYCLE_TICK (5 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
+#elif HZ <= 64
+# define INET_TWDR_RECYCLE_TICK (6 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
+#elif HZ <= 128
+# define INET_TWDR_RECYCLE_TICK (7 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
+#elif HZ <= 256
+# define INET_TWDR_RECYCLE_TICK (8 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
+#elif HZ <= 512
+# define INET_TWDR_RECYCLE_TICK (9 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
+#elif HZ <= 1024
+# define INET_TWDR_RECYCLE_TICK (10 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
+#elif HZ <= 2048
+# define INET_TWDR_RECYCLE_TICK (11 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
+#else
+# define INET_TWDR_RECYCLE_TICK (12 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
+#endif
+
+/* TIME_WAIT reaping mechanism. */
+#define INET_TWDR_TWKILL_SLOTS	8 /* Please keep this a power of 2. */
+
+#define INET_TWDR_TWKILL_QUOTA 100
+
+struct inet_timewait_death_row {
+	/* Short-time timewait calendar */
+	int			twcal_hand;
+	int			twcal_jiffie;
+	struct timer_list	twcal_timer;
+	struct hlist_head	twcal_row[INET_TWDR_RECYCLE_SLOTS];
+
+	spinlock_t		death_lock;
+	int			tw_count;
+	int			period;
+	u32			thread_slots;
+	struct work_struct	twkill_work;
+	struct timer_list	tw_timer;
+	int			slot;
+	struct hlist_head	cells[INET_TWDR_TWKILL_SLOTS];
+	struct inet_hashinfo 	*hashinfo;
+	int			sysctl_tw_recycle;
+	int			sysctl_max_tw_buckets;
+};
+
+extern void inet_twdr_hangman(unsigned long data);
+extern void inet_twdr_twkill_work(void *data);
+extern void inet_twdr_twcal_tick(unsigned long data);
+
+#if (BITS_PER_LONG == 64)
+#define INET_TIMEWAIT_ADDRCMP_ALIGN_BYTES 8
+#else
+#define INET_TIMEWAIT_ADDRCMP_ALIGN_BYTES 4
+#endif
+
+struct inet_bind_bucket;
+
+/*
+ * This is a TIME_WAIT sock. It works around the memory consumption
+ * problems of sockets in such a state on heavily loaded servers, but
+ * without violating the protocol specification.
+ */
+struct inet_timewait_sock {
+	/*
+	 * Now struct sock also uses sock_common, so please just
+	 * don't add nothing before this first member (__tw_common) --acme
+	 */
+	struct sock_common	__tw_common;
+#define tw_family		__tw_common.skc_family
+#define tw_state		__tw_common.skc_state
+#define tw_reuse		__tw_common.skc_reuse
+#define tw_bound_dev_if		__tw_common.skc_bound_dev_if
+#define tw_node			__tw_common.skc_node
+#define tw_bind_node		__tw_common.skc_bind_node
+#define tw_refcnt		__tw_common.skc_refcnt
+#define tw_prot			__tw_common.skc_prot
+	volatile unsigned char	tw_substate;
+	/* 3 bits hole, try to pack */
+	unsigned char		tw_rcv_wscale;
+	/* Socket demultiplex comparisons on incoming packets. */
+	/* these five are in inet_sock */
+	__u16			tw_sport;
+	__u32			tw_daddr __attribute__((aligned(INET_TIMEWAIT_ADDRCMP_ALIGN_BYTES)));
+	__u32			tw_rcv_saddr;
+	__u16			tw_dport;
+	__u16			tw_num;
+	/* And these are ours. */
+	__u8			tw_ipv6only:1;
+	/* 31 bits hole, try to pack */
+	int			tw_hashent;
+	int			tw_timeout;
+	unsigned long		tw_ttd;
+	struct inet_bind_bucket	*tw_tb;
+	struct hlist_node	tw_death_node;
+};
+
+static inline void inet_twsk_add_node(struct inet_timewait_sock *tw,
+				      struct hlist_head *list)
+{
+	hlist_add_head(&tw->tw_node, list);
+}
+
+static inline void inet_twsk_add_bind_node(struct inet_timewait_sock *tw,
+					   struct hlist_head *list)
+{
+	hlist_add_head(&tw->tw_bind_node, list);
+}
+
+static inline int inet_twsk_dead_hashed(const struct inet_timewait_sock *tw)
+{
+	return tw->tw_death_node.pprev != NULL;
+}
+
+static inline void inet_twsk_dead_node_init(struct inet_timewait_sock *tw)
+{
+	tw->tw_death_node.pprev = NULL;
+}
+
+static inline void __inet_twsk_del_dead_node(struct inet_timewait_sock *tw)
+{
+	__hlist_del(&tw->tw_death_node);
+	inet_twsk_dead_node_init(tw);
+}
+
+static inline int inet_twsk_del_dead_node(struct inet_timewait_sock *tw)
+{
+	if (inet_twsk_dead_hashed(tw)) {
+		__inet_twsk_del_dead_node(tw);
+		return 1;
+	}
+	return 0;
+}
+
+#define inet_twsk_for_each(tw, node, head) \
+	hlist_for_each_entry(tw, node, head, tw_node)
+
+#define inet_twsk_for_each_inmate(tw, node, jail) \
+	hlist_for_each_entry(tw, node, jail, tw_death_node)
+
+#define inet_twsk_for_each_inmate_safe(tw, node, safe, jail) \
+	hlist_for_each_entry_safe(tw, node, safe, jail, tw_death_node)
+
+static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk)
+{
+	return (struct inet_timewait_sock *)sk;
+}
+
+static inline u32 inet_rcv_saddr(const struct sock *sk)
+{
+	return likely(sk->sk_state != TCP_TIME_WAIT) ?
+		inet_sk(sk)->rcv_saddr : inet_twsk(sk)->tw_rcv_saddr;
+}
+
+static inline void inet_twsk_put(struct inet_timewait_sock *tw)
+{
+	if (atomic_dec_and_test(&tw->tw_refcnt)) {
+#ifdef SOCK_REFCNT_DEBUG
+		printk(KERN_DEBUG "%s timewait_sock %p released\n",
+		       tw->tw_prot->name, tw);
+#endif
+		kmem_cache_free(tw->tw_prot->twsk_slab, tw);
+	}
+}
+
+extern struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
+						  const int state);
+
+extern void __inet_twsk_kill(struct inet_timewait_sock *tw,
+			     struct inet_hashinfo *hashinfo);
+
+extern void __inet_twsk_hashdance(struct inet_timewait_sock *tw,
+				  struct sock *sk,
+				  struct inet_hashinfo *hashinfo);
+
+extern void inet_twsk_schedule(struct inet_timewait_sock *tw,
+			       struct inet_timewait_death_row *twdr,
+			       const int timeo, const int timewait_len);
+extern void inet_twsk_deschedule(struct inet_timewait_sock *tw,
+				 struct inet_timewait_death_row *twdr);
+#endif	/* _INET_TIMEWAIT_SOCK_ */
diff --git a/include/net/ip.h b/include/net/ip.h
index 32360bbe143..e4563bbee6e 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -86,7 +86,7 @@ extern int		ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
 					      u32 saddr, u32 daddr,
 					      struct ip_options *opt);
 extern int		ip_rcv(struct sk_buff *skb, struct net_device *dev,
-			       struct packet_type *pt);
+			       struct packet_type *pt, struct net_device *orig_dev);
 extern int		ip_local_deliver(struct sk_buff *skb);
 extern int		ip_mr_input(struct sk_buff *skb);
 extern int		ip_output(struct sk_buff *skb);
@@ -140,8 +140,6 @@ struct ip_reply_arg {
 void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
 		   unsigned int len); 
 
-extern int ip_finish_output(struct sk_buff *skb);
-
 struct ipv4_config
 {
 	int	log_martians;
@@ -165,6 +163,24 @@ extern int sysctl_local_port_range[2];
 extern int sysctl_ip_default_ttl;
 extern int sysctl_ip_nonlocal_bind;
 
+/* From ip_fragment.c */
+extern int sysctl_ipfrag_high_thresh; 
+extern int sysctl_ipfrag_low_thresh;
+extern int sysctl_ipfrag_time;
+extern int sysctl_ipfrag_secret_interval;
+
+/* From inetpeer.c */
+extern int inet_peer_threshold;
+extern int inet_peer_minttl;
+extern int inet_peer_maxttl;
+extern int inet_peer_gc_mintime;
+extern int inet_peer_gc_maxtime;
+
+/* From ip_output.c */
+extern int sysctl_ip_dynaddr;
+
+extern void ipfrag_init(void);
+
 #ifdef CONFIG_INET
 /* The function in 2.2 was invalid, producing wrong result for
  * check=0xFEFF. It was noticed by Arthur Skawina _year_ ago. --ANK(000625) */
@@ -319,7 +335,10 @@ extern void ip_options_build(struct sk_buff *skb, struct ip_options *opt, u32 da
 extern int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb);
 extern void ip_options_fragment(struct sk_buff *skb);
 extern int ip_options_compile(struct ip_options *opt, struct sk_buff *skb);
-extern int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen, int user);
+extern int ip_options_get(struct ip_options **optp,
+			  unsigned char *data, int optlen);
+extern int ip_options_get_from_user(struct ip_options **optp,
+				    unsigned char __user *data, int optlen);
 extern void ip_options_undo(struct ip_options * opt);
 extern void ip_forward_options(struct sk_buff *skb);
 extern int ip_options_rcv_srr(struct sk_buff *skb);
@@ -350,5 +369,10 @@ int ipv4_doint_and_flush_strategy(ctl_table *table, int __user *name, int nlen,
 				  void __user *oldval, size_t __user *oldlenp,
 				  void __user *newval, size_t newlen, 
 				  void **context);
+#ifdef CONFIG_PROC_FS
+extern int ip_misc_proc_init(void);
+#endif
+
+extern struct ctl_table ipv4_table[];
 
 #endif	/* _IP_H */
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index f920706d526..1f2e428ca36 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -12,7 +12,6 @@
 #include <net/flow.h>
 #include <net/ip6_fib.h>
 #include <net/sock.h>
-#include <linux/tcp.h>
 #include <linux/ip.h>
 #include <linux/ipv6.h>
 
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index a4208a336ac..14de4ebd121 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -295,4 +295,9 @@ static inline void fib_res_put(struct fib_result *res)
 #endif
 }
 
+#ifdef CONFIG_PROC_FS
+extern int  fib_proc_init(void);
+extern void fib_proc_exit(void);
+#endif
+
 #endif  /* _NET_FIB_H */
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 52da5d26617..7a3c43711a1 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -255,7 +255,6 @@ struct ip_vs_daemon_user {
 #include <asm/atomic.h>                 /* for struct atomic_t */
 #include <linux/netdevice.h>		/* for struct neighbour */
 #include <net/dst.h>			/* for struct dst_entry */
-#include <net/tcp.h>
 #include <net/udp.h>
 #include <linux/compiler.h>
 
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 69324465e8b..3203eaff4bd 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -104,6 +104,7 @@ struct frag_hdr {
 
 #ifdef __KERNEL__
 
+#include <linux/config.h>
 #include <net/sock.h>
 
 /* sysctls */
@@ -145,7 +146,6 @@ DECLARE_SNMP_STAT(struct udp_mib, udp_stats_in6);
 #define UDP6_INC_STATS(field)		SNMP_INC_STATS(udp_stats_in6, field)
 #define UDP6_INC_STATS_BH(field)	SNMP_INC_STATS_BH(udp_stats_in6, field)
 #define UDP6_INC_STATS_USER(field) 	SNMP_INC_STATS_USER(udp_stats_in6, field)
-extern atomic_t			inet6_sock_nr;
 
 int snmp6_register_dev(struct inet6_dev *idev);
 int snmp6_unregister_dev(struct inet6_dev *idev);
@@ -346,7 +346,8 @@ static inline int ipv6_addr_any(const struct in6_addr *a)
 
 extern int			ipv6_rcv(struct sk_buff *skb, 
 					 struct net_device *dev, 
-					 struct packet_type *pt);
+					 struct packet_type *pt,
+					 struct net_device *orig_dev);
 
 /*
  *	upper-layer output functions
@@ -464,8 +465,38 @@ extern int sysctl_ip6frag_low_thresh;
 extern int sysctl_ip6frag_time;
 extern int sysctl_ip6frag_secret_interval;
 
-#endif /* __KERNEL__ */
-#endif /* _NET_IPV6_H */
+extern struct proto_ops inet6_stream_ops;
+extern struct proto_ops inet6_dgram_ops;
+
+extern int ip6_mc_source(int add, int omode, struct sock *sk,
+			 struct group_source_req *pgsr);
+extern int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf);
+extern int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf,
+			 struct group_filter __user *optval,
+			 int __user *optlen);
+
+#ifdef CONFIG_PROC_FS
+extern int  ac6_proc_init(void);
+extern void ac6_proc_exit(void);
+extern int  raw6_proc_init(void);
+extern void raw6_proc_exit(void);
+extern int  tcp6_proc_init(void);
+extern void tcp6_proc_exit(void);
+extern int  udp6_proc_init(void);
+extern void udp6_proc_exit(void);
+extern int  ipv6_misc_proc_init(void);
+extern void ipv6_misc_proc_exit(void);
+
+extern struct rt6_statistics rt6_stats;
+#endif
 
+#ifdef CONFIG_SYSCTL
+extern ctl_table ipv6_route_table[];
+extern ctl_table ipv6_icmp_table[];
 
+extern void ipv6_sysctl_register(void);
+extern void ipv6_sysctl_unregister(void);
+#endif
 
+#endif /* __KERNEL__ */
+#endif /* _NET_IPV6_H */
diff --git a/include/net/llc.h b/include/net/llc.h
index c9aed2a8b4e..71769a5aeef 100644
--- a/include/net/llc.h
+++ b/include/net/llc.h
@@ -46,7 +46,8 @@ struct llc_sap {
 	unsigned char	 f_bit;
 	int		 (*rcv_func)(struct sk_buff *skb,
 				     struct net_device *dev,
-				     struct packet_type *pt);
+				     struct packet_type *pt,
+				     struct net_device *orig_dev);
 	struct llc_addr	 laddr;
 	struct list_head node;
 	struct {
@@ -64,7 +65,7 @@ extern rwlock_t llc_sap_list_lock;
 extern unsigned char llc_station_mac_sa[ETH_ALEN];
 
 extern int llc_rcv(struct sk_buff *skb, struct net_device *dev,
-		   struct packet_type *pt);
+		   struct packet_type *pt, struct net_device *orig_dev);
 
 extern int llc_mac_hdr_init(struct sk_buff *skb,
 			    unsigned char *sa, unsigned char *da);
@@ -78,7 +79,8 @@ extern void llc_set_station_handler(void (*handler)(struct sk_buff *skb));
 extern struct llc_sap *llc_sap_open(unsigned char lsap,
 				    int (*rcv)(struct sk_buff *skb,
 					       struct net_device *dev,
-					       struct packet_type *pt));
+					       struct packet_type *pt,
+					       struct net_device *orig_dev));
 extern void llc_sap_close(struct llc_sap *sap);
 
 extern struct llc_sap *llc_sap_find(unsigned char sap_value);
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 89809891e5a..34c07731933 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -363,7 +363,14 @@ __neigh_lookup_errno(struct neigh_table *tbl, const void *pkey,
 	return neigh_create(tbl, pkey, dev);
 }
 
-#define LOCALLY_ENQUEUED -2
+struct neighbour_cb {
+	unsigned long sched_next;
+	unsigned int flags;
+};
+
+#define LOCALLY_ENQUEUED 0x1
+
+#define NEIGH_CB(skb)	((struct neighbour_cb *)(skb)->cb)
 
 #endif
 #endif
diff --git a/include/net/p8022.h b/include/net/p8022.h
index 3c99a86c358..42e9fac51b3 100644
--- a/include/net/p8022.h
+++ b/include/net/p8022.h
@@ -4,7 +4,10 @@ extern struct datalink_proto *
 	register_8022_client(unsigned char type,
 			     int (*func)(struct sk_buff *skb,
 					 struct net_device *dev,
-					 struct packet_type *pt));
+					 struct packet_type *pt,
+					 struct net_device *orig_dev));
 extern void unregister_8022_client(struct datalink_proto *proto);
 
+extern struct datalink_proto *make_8023_client(void);
+extern void destroy_8023_client(struct datalink_proto *dl);
 #endif
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 4abda6aec05..b902d24a325 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -352,10 +352,10 @@ tcf_change_indev(struct tcf_proto *tp, char *indev, struct rtattr *indev_tlv)
 static inline int
 tcf_match_indev(struct sk_buff *skb, char *indev)
 {
-	if (0 != indev[0]) {
-		if  (NULL == skb->input_dev)
+	if (indev[0]) {
+		if  (!skb->input_dev)
 			return 0;
-		else if (0 != strcmp(indev, skb->input_dev->name))
+		if (strcmp(indev, skb->input_dev->name))
 			return 0;
 	}
 
diff --git a/include/net/psnap.h b/include/net/psnap.h
index 9c94e8f98b3..b2e01cc3fc8 100644
--- a/include/net/psnap.h
+++ b/include/net/psnap.h
@@ -1,7 +1,7 @@
 #ifndef _NET_PSNAP_H
 #define _NET_PSNAP_H
 
-extern struct datalink_proto *register_snap_client(unsigned char *desc, int (*rcvfunc)(struct sk_buff *, struct net_device *, struct packet_type *));
+extern struct datalink_proto *register_snap_client(unsigned char *desc, int (*rcvfunc)(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *orig_dev));
 extern void unregister_snap_client(struct datalink_proto *proto);
 
 #endif
diff --git a/include/net/raw.h b/include/net/raw.h
index 1c411c45587..f47917469b1 100644
--- a/include/net/raw.h
+++ b/include/net/raw.h
@@ -17,10 +17,10 @@
 #ifndef _RAW_H
 #define _RAW_H
 
+#include <linux/config.h>
 
 extern struct proto raw_prot;
 
-
 extern void 	raw_err(struct sock *, struct sk_buff *, u32 info);
 extern int 	raw_rcv(struct sock *, struct sk_buff *);
 
@@ -37,6 +37,11 @@ extern struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num,
 				    unsigned long raddr, unsigned long laddr,
 				    int dif);
 
-extern void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash);
+extern int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash);
+
+#ifdef CONFIG_PROC_FS
+extern int  raw_proc_init(void);
+extern void raw_proc_exit(void);
+#endif
 
 #endif	/* _RAW_H */
diff --git a/include/net/rawv6.h b/include/net/rawv6.h
index 23fd9a6a221..14476a71725 100644
--- a/include/net/rawv6.h
+++ b/include/net/rawv6.h
@@ -7,10 +7,11 @@
 extern struct hlist_head raw_v6_htable[RAWV6_HTABLE_SIZE];
 extern rwlock_t raw_v6_lock;
 
-extern void ipv6_raw_deliver(struct sk_buff *skb, int nexthdr);
+extern int ipv6_raw_deliver(struct sk_buff *skb, int nexthdr);
 
 extern struct sock *__raw_v6_lookup(struct sock *sk, unsigned short num,
-				    struct in6_addr *loc_addr, struct in6_addr *rmt_addr);
+				    struct in6_addr *loc_addr, struct in6_addr *rmt_addr,
+				    int dif);
 
 extern int			rawv6_rcv(struct sock *sk,
 					  struct sk_buff *skb);
diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 72fd6f5e86b..b52cc52ffe3 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -89,6 +89,7 @@ struct listen_sock {
 	int			qlen_young;
 	int			clock_hand;
 	u32			hash_rnd;
+	u32			nr_table_entries;
 	struct request_sock	*syn_table[0];
 };
 
@@ -96,6 +97,7 @@ struct listen_sock {
  *
  * @rskq_accept_head - FIFO head of established children
  * @rskq_accept_tail - FIFO tail of established children
+ * @rskq_defer_accept - User waits for some data after accept()
  * @syn_wait_lock - serializer
  *
  * %syn_wait_lock is necessary only to avoid proc interface having to grab the main
@@ -111,6 +113,8 @@ struct request_sock_queue {
 	struct request_sock	*rskq_accept_head;
 	struct request_sock	*rskq_accept_tail;
 	rwlock_t		syn_wait_lock;
+	u8			rskq_defer_accept;
+	/* 3 bytes hole, try to pack */
 	struct listen_sock	*listen_opt;
 };
 
@@ -129,11 +133,13 @@ static inline struct listen_sock *reqsk_queue_yank_listen_sk(struct request_sock
 	return lopt;
 }
 
-static inline void reqsk_queue_destroy(struct request_sock_queue *queue)
+static inline void __reqsk_queue_destroy(struct request_sock_queue *queue)
 {
 	kfree(reqsk_queue_yank_listen_sk(queue));
 }
 
+extern void reqsk_queue_destroy(struct request_sock_queue *queue);
+
 static inline struct request_sock *
 	reqsk_queue_yank_acceptq(struct request_sock_queue *queue)
 {
@@ -221,17 +227,17 @@ static inline int reqsk_queue_added(struct request_sock_queue *queue)
 	return prev_qlen;
 }
 
-static inline int reqsk_queue_len(struct request_sock_queue *queue)
+static inline int reqsk_queue_len(const struct request_sock_queue *queue)
 {
 	return queue->listen_opt != NULL ? queue->listen_opt->qlen : 0;
 }
 
-static inline int reqsk_queue_len_young(struct request_sock_queue *queue)
+static inline int reqsk_queue_len_young(const struct request_sock_queue *queue)
 {
 	return queue->listen_opt->qlen_young;
 }
 
-static inline int reqsk_queue_is_full(struct request_sock_queue *queue)
+static inline int reqsk_queue_is_full(const struct request_sock_queue *queue)
 {
 	return queue->listen_opt->qlen >> queue->listen_opt->max_qlen_log;
 }
diff --git a/include/net/route.h b/include/net/route.h
index c3cd069a9ac..dbe79ca67d3 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -105,10 +105,6 @@ struct rt_cache_stat
         unsigned int out_hlist_search;
 };
 
-extern struct rt_cache_stat *rt_cache_stat;
-#define RT_CACHE_STAT_INC(field)					  \
-		(per_cpu_ptr(rt_cache_stat, raw_smp_processor_id())->field++)
-
 extern struct ip_rt_acct *ip_rt_acct;
 
 struct in_device;
@@ -199,4 +195,6 @@ static inline struct inet_peer *rt_get_peer(struct rtable *rt)
 	return rt->peer;
 }
 
+extern ctl_table ipv4_route_table[];
+
 #endif	/* _ROUTE_H */
diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index 5999e5684bb..c51541ee024 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -47,10 +47,10 @@
 #ifndef __sctp_constants_h__
 #define __sctp_constants_h__
 
-#include <linux/tcp.h>  /* For TCP states used in sctp_sock_state_t */
 #include <linux/sctp.h>
 #include <linux/ipv6.h> /* For ipv6hdr. */
 #include <net/sctp/user.h>
+#include <net/tcp_states.h>  /* For TCP states used in sctp_sock_state_t */
 
 /* Value used for stream negotiation. */
 enum { SCTP_MAX_STREAM = 0xffff };
diff --git a/include/net/sock.h b/include/net/sock.h
index e9b1dbab90d..312cb25cbd1 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -88,6 +88,7 @@ do {	spin_lock_init(&((__sk)->sk_lock.slock)); \
 } while(0)
 
 struct sock;
+struct proto;
 
 /**
  *	struct sock_common - minimal network layer representation of sockets
@@ -98,10 +99,11 @@ struct sock;
  *	@skc_node: main hash linkage for various protocol lookup tables
  *	@skc_bind_node: bind hash linkage for various protocol lookup tables
  *	@skc_refcnt: reference count
+ *	@skc_prot: protocol handlers inside a network family
  *
  *	This is the minimal network layer representation of sockets, the header
- *	for struct sock and struct tcp_tw_bucket.
-  */
+ *	for struct sock and struct inet_timewait_sock.
+ */
 struct sock_common {
 	unsigned short		skc_family;
 	volatile unsigned char	skc_state;
@@ -110,11 +112,12 @@ struct sock_common {
 	struct hlist_node	skc_node;
 	struct hlist_node	skc_bind_node;
 	atomic_t		skc_refcnt;
+	struct proto		*skc_prot;
 };
 
 /**
   *	struct sock - network layer representation of sockets
-  *	@__sk_common: shared layout with tcp_tw_bucket
+  *	@__sk_common: shared layout with inet_timewait_sock
   *	@sk_shutdown: mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN
   *	@sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings
   *	@sk_lock:	synchronizer
@@ -136,11 +139,10 @@ struct sock_common {
   *	@sk_no_check: %SO_NO_CHECK setting, wether or not checkup packets
   *	@sk_route_caps: route capabilities (e.g. %NETIF_F_TSO)
   *	@sk_lingertime: %SO_LINGER l_linger setting
-  *	@sk_hashent: hash entry in several tables (e.g. tcp_ehash)
+  *	@sk_hashent: hash entry in several tables (e.g. inet_hashinfo.ehash)
   *	@sk_backlog: always used with the per-socket spinlock held
   *	@sk_callback_lock: used with the callbacks in the end of this struct
   *	@sk_error_queue: rarely used
-  *	@sk_prot: protocol handlers inside a network family
   *	@sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt, IPV6_ADDRFORM for instance)
   *	@sk_err: last error
   *	@sk_err_soft: errors that don't cause failure but are the cause of a persistent failure not just 'timed out'
@@ -173,7 +175,7 @@ struct sock_common {
  */
 struct sock {
 	/*
-	 * Now struct tcp_tw_bucket also uses sock_common, so please just
+	 * Now struct inet_timewait_sock also uses sock_common, so please just
 	 * don't add nothing before this first member (__sk_common) --acme
 	 */
 	struct sock_common	__sk_common;
@@ -184,6 +186,7 @@ struct sock {
 #define sk_node			__sk_common.skc_node
 #define sk_bind_node		__sk_common.skc_bind_node
 #define sk_refcnt		__sk_common.skc_refcnt
+#define sk_prot			__sk_common.skc_prot
 	unsigned char		sk_shutdown : 2,
 				sk_no_check : 2,
 				sk_userlocks : 4;
@@ -218,7 +221,6 @@ struct sock {
 		struct sk_buff *tail;
 	} sk_backlog;
 	struct sk_buff_head	sk_error_queue;
-	struct proto		*sk_prot;
 	struct proto		*sk_prot_creator;
 	rwlock_t		sk_callback_lock;
 	int			sk_err,
@@ -253,28 +255,28 @@ struct sock {
 /*
  * Hashed lists helper routines
  */
-static inline struct sock *__sk_head(struct hlist_head *head)
+static inline struct sock *__sk_head(const struct hlist_head *head)
 {
 	return hlist_entry(head->first, struct sock, sk_node);
 }
 
-static inline struct sock *sk_head(struct hlist_head *head)
+static inline struct sock *sk_head(const struct hlist_head *head)
 {
 	return hlist_empty(head) ? NULL : __sk_head(head);
 }
 
-static inline struct sock *sk_next(struct sock *sk)
+static inline struct sock *sk_next(const struct sock *sk)
 {
 	return sk->sk_node.next ?
 		hlist_entry(sk->sk_node.next, struct sock, sk_node) : NULL;
 }
 
-static inline int sk_unhashed(struct sock *sk)
+static inline int sk_unhashed(const struct sock *sk)
 {
 	return hlist_unhashed(&sk->sk_node);
 }
 
-static inline int sk_hashed(struct sock *sk)
+static inline int sk_hashed(const struct sock *sk)
 {
 	return sk->sk_node.pprev != NULL;
 }
@@ -554,6 +556,10 @@ struct proto {
 	kmem_cache_t		*slab;
 	unsigned int		obj_size;
 
+	kmem_cache_t		*twsk_slab;
+	unsigned int		twsk_obj_size;
+	atomic_t		*orphan_count;
+
 	struct request_sock_ops	*rsk_prot;
 
 	struct module		*owner;
@@ -561,7 +567,9 @@ struct proto {
 	char			name[32];
 
 	struct list_head	node;
-
+#ifdef SOCK_REFCNT_DEBUG
+	atomic_t		socks;
+#endif
 	struct {
 		int inuse;
 		u8  __pad[SMP_CACHE_BYTES - sizeof(int)];
@@ -571,6 +579,31 @@ struct proto {
 extern int proto_register(struct proto *prot, int alloc_slab);
 extern void proto_unregister(struct proto *prot);
 
+#ifdef SOCK_REFCNT_DEBUG
+static inline void sk_refcnt_debug_inc(struct sock *sk)
+{
+	atomic_inc(&sk->sk_prot->socks);
+}
+
+static inline void sk_refcnt_debug_dec(struct sock *sk)
+{
+	atomic_dec(&sk->sk_prot->socks);
+	printk(KERN_DEBUG "%s socket %p released, %d are still alive\n",
+	       sk->sk_prot->name, sk, atomic_read(&sk->sk_prot->socks));
+}
+
+static inline void sk_refcnt_debug_release(const struct sock *sk)
+{
+	if (atomic_read(&sk->sk_refcnt) != 1)
+		printk(KERN_DEBUG "Destruction of the %s socket %p delayed, refcnt=%d\n",
+		       sk->sk_prot->name, sk, atomic_read(&sk->sk_refcnt));
+}
+#else /* SOCK_REFCNT_DEBUG */
+#define sk_refcnt_debug_inc(sk) do { } while (0)
+#define sk_refcnt_debug_dec(sk) do { } while (0)
+#define sk_refcnt_debug_release(sk) do { } while (0)
+#endif /* SOCK_REFCNT_DEBUG */
+
 /* Called with local bh disabled */
 static __inline__ void sock_prot_inc_use(struct proto *prot)
 {
@@ -582,6 +615,15 @@ static __inline__ void sock_prot_dec_use(struct proto *prot)
 	prot->stats[smp_processor_id()].inuse--;
 }
 
+/* With per-bucket locks this operation is not-atomic, so that
+ * this version is not worse.
+ */
+static inline void __sk_prot_rehash(struct sock *sk)
+{
+	sk->sk_prot->unhash(sk);
+	sk->sk_prot->hash(sk);
+}
+
 /* About 10 seconds */
 #define SOCK_DESTROY_TIME (10*HZ)
 
@@ -693,6 +735,8 @@ extern struct sock		*sk_alloc(int family,
 					  unsigned int __nocast priority,
 					  struct proto *prot, int zero_it);
 extern void			sk_free(struct sock *sk);
+extern struct sock		*sk_clone(const struct sock *sk,
+					  const unsigned int __nocast priority);
 
 extern struct sk_buff		*sock_wmalloc(struct sock *sk,
 					      unsigned long size, int force,
@@ -986,6 +1030,16 @@ sk_dst_check(struct sock *sk, u32 cookie)
 	return dst;
 }
 
+static inline void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
+{
+	__sk_dst_set(sk, dst);
+	sk->sk_route_caps = dst->dev->features;
+	if (sk->sk_route_caps & NETIF_F_TSO) {
+		if (sock_flag(sk, SOCK_NO_LARGESEND) || dst->header_len)
+			sk->sk_route_caps &= ~NETIF_F_TSO;
+	}
+}
+
 static inline void sk_charge_skb(struct sock *sk, struct sk_buff *skb)
 {
 	sk->sk_wmem_queued   += skb->truesize;
@@ -1146,7 +1200,7 @@ static inline struct sk_buff *sk_stream_alloc_pskb(struct sock *sk,
 	int hdr_len;
 
 	hdr_len = SKB_DATA_ALIGN(sk->sk_prot->max_header);
-	skb = alloc_skb(size + hdr_len, gfp);
+	skb = alloc_skb_fclone(size + hdr_len, gfp);
 	if (skb) {
 		skb->truesize += mem;
 		if (sk->sk_forward_alloc >= (int)skb->truesize ||
@@ -1228,16 +1282,19 @@ static inline int sock_intr_errno(long timeo)
 static __inline__ void
 sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
 {
-	struct timeval *stamp = &skb->stamp;
+	struct timeval stamp;
+
+	skb_get_timestamp(skb, &stamp);
 	if (sock_flag(sk, SOCK_RCVTSTAMP)) {
 		/* Race occurred between timestamp enabling and packet
 		   receiving.  Fill in the current time for now. */
-		if (stamp->tv_sec == 0)
-			do_gettimeofday(stamp);
+		if (stamp.tv_sec == 0)
+			do_gettimeofday(&stamp);
+		skb_set_timestamp(skb, &stamp);
 		put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP, sizeof(struct timeval),
-			 stamp);
+			 &stamp);
 	} else
-		sk->sk_stamp = *stamp;
+		sk->sk_stamp = stamp;
 }
 
 /**
@@ -1262,11 +1319,11 @@ extern int sock_get_timestamp(struct sock *, struct timeval __user *);
  */
 
 #if 0
-#define NETDEBUG(x)	do { } while (0)
-#define LIMIT_NETDEBUG(x) do {} while(0)
+#define NETDEBUG(fmt, args...)	do { } while (0)
+#define LIMIT_NETDEBUG(fmt, args...) do { } while(0)
 #else
-#define NETDEBUG(x)	do { x; } while (0)
-#define LIMIT_NETDEBUG(x) do { if (net_ratelimit()) { x; } } while(0)
+#define NETDEBUG(fmt, args...)	printk(fmt,##args)
+#define LIMIT_NETDEBUG(fmt, args...) do { if (net_ratelimit()) printk(fmt,##args); } while(0)
 #endif
 
 /*
@@ -1313,4 +1370,14 @@ static inline int siocdevprivate_ioctl(unsigned int fd, unsigned int cmd, unsign
 }
 #endif
 
+extern void sk_init(void);
+
+#ifdef CONFIG_SYSCTL
+extern struct ctl_table core_table[];
+extern int sysctl_optmem_max;
+#endif
+
+extern __u32 sysctl_wmem_default;
+extern __u32 sysctl_rmem_default;
+
 #endif	/* _SOCK_H */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 5010f0c5a56..d6bcf1317a6 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -21,360 +21,29 @@
 #define TCP_DEBUG 1
 #define FASTRETRANS_DEBUG 1
 
-/* Cancel timers, when they are not required. */
-#undef TCP_CLEAR_TIMERS
-
 #include <linux/config.h>
 #include <linux/list.h>
 #include <linux/tcp.h>
 #include <linux/slab.h>
 #include <linux/cache.h>
 #include <linux/percpu.h>
+
+#include <net/inet_connection_sock.h>
+#include <net/inet_timewait_sock.h>
+#include <net/inet_hashtables.h>
 #include <net/checksum.h>
 #include <net/request_sock.h>
 #include <net/sock.h>
 #include <net/snmp.h>
 #include <net/ip.h>
-#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
-#include <linux/ipv6.h>
-#endif
-#include <linux/seq_file.h>
-
-/* This is for all connections with a full identity, no wildcards.
- * New scheme, half the table is for TIME_WAIT, the other half is
- * for the rest.  I'll experiment with dynamic table growth later.
- */
-struct tcp_ehash_bucket {
-	rwlock_t	  lock;
-	struct hlist_head chain;
-} __attribute__((__aligned__(8)));
-
-/* This is for listening sockets, thus all sockets which possess wildcards. */
-#define TCP_LHTABLE_SIZE	32	/* Yes, really, this is all you need. */
-
-/* There are a few simple rules, which allow for local port reuse by
- * an application.  In essence:
- *
- *	1) Sockets bound to different interfaces may share a local port.
- *	   Failing that, goto test 2.
- *	2) If all sockets have sk->sk_reuse set, and none of them are in
- *	   TCP_LISTEN state, the port may be shared.
- *	   Failing that, goto test 3.
- *	3) If all sockets are bound to a specific inet_sk(sk)->rcv_saddr local
- *	   address, and none of them are the same, the port may be
- *	   shared.
- *	   Failing this, the port cannot be shared.
- *
- * The interesting point, is test #2.  This is what an FTP server does
- * all day.  To optimize this case we use a specific flag bit defined
- * below.  As we add sockets to a bind bucket list, we perform a
- * check of: (newsk->sk_reuse && (newsk->sk_state != TCP_LISTEN))
- * As long as all sockets added to a bind bucket pass this test,
- * the flag bit will be set.
- * The resulting situation is that tcp_v[46]_verify_bind() can just check
- * for this flag bit, if it is set and the socket trying to bind has
- * sk->sk_reuse set, we don't even have to walk the owners list at all,
- * we return that it is ok to bind this socket to the requested local port.
- *
- * Sounds like a lot of work, but it is worth it.  In a more naive
- * implementation (ie. current FreeBSD etc.) the entire list of ports
- * must be walked for each data port opened by an ftp server.  Needless
- * to say, this does not scale at all.  With a couple thousand FTP
- * users logged onto your box, isn't it nice to know that new data
- * ports are created in O(1) time?  I thought so. ;-)	-DaveM
- */
-struct tcp_bind_bucket {
-	unsigned short		port;
-	signed short		fastreuse;
-	struct hlist_node	node;
-	struct hlist_head	owners;
-};
-
-#define tb_for_each(tb, node, head) hlist_for_each_entry(tb, node, head, node)
-
-struct tcp_bind_hashbucket {
-	spinlock_t		lock;
-	struct hlist_head	chain;
-};
-
-static inline struct tcp_bind_bucket *__tb_head(struct tcp_bind_hashbucket *head)
-{
-	return hlist_entry(head->chain.first, struct tcp_bind_bucket, node);
-}
-
-static inline struct tcp_bind_bucket *tb_head(struct tcp_bind_hashbucket *head)
-{
-	return hlist_empty(&head->chain) ? NULL : __tb_head(head);
-}
-
-extern struct tcp_hashinfo {
-	/* This is for sockets with full identity only.  Sockets here will
-	 * always be without wildcards and will have the following invariant:
-	 *
-	 *          TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE
-	 *
-	 * First half of the table is for sockets not in TIME_WAIT, second half
-	 * is for TIME_WAIT sockets only.
-	 */
-	struct tcp_ehash_bucket *__tcp_ehash;
-
-	/* Ok, let's try this, I give up, we do need a local binding
-	 * TCP hash as well as the others for fast bind/connect.
-	 */
-	struct tcp_bind_hashbucket *__tcp_bhash;
+#include <net/tcp_states.h>
 
-	int __tcp_bhash_size;
-	int __tcp_ehash_size;
-
-	/* All sockets in TCP_LISTEN state will be in here.  This is the only
-	 * table where wildcard'd TCP sockets can exist.  Hash function here
-	 * is just local port number.
-	 */
-	struct hlist_head __tcp_listening_hash[TCP_LHTABLE_SIZE];
-
-	/* All the above members are written once at bootup and
-	 * never written again _or_ are predominantly read-access.
-	 *
-	 * Now align to a new cache line as all the following members
-	 * are often dirty.
-	 */
-	rwlock_t __tcp_lhash_lock ____cacheline_aligned;
-	atomic_t __tcp_lhash_users;
-	wait_queue_head_t __tcp_lhash_wait;
-	spinlock_t __tcp_portalloc_lock;
-} tcp_hashinfo;
-
-#define tcp_ehash	(tcp_hashinfo.__tcp_ehash)
-#define tcp_bhash	(tcp_hashinfo.__tcp_bhash)
-#define tcp_ehash_size	(tcp_hashinfo.__tcp_ehash_size)
-#define tcp_bhash_size	(tcp_hashinfo.__tcp_bhash_size)
-#define tcp_listening_hash (tcp_hashinfo.__tcp_listening_hash)
-#define tcp_lhash_lock	(tcp_hashinfo.__tcp_lhash_lock)
-#define tcp_lhash_users	(tcp_hashinfo.__tcp_lhash_users)
-#define tcp_lhash_wait	(tcp_hashinfo.__tcp_lhash_wait)
-#define tcp_portalloc_lock (tcp_hashinfo.__tcp_portalloc_lock)
-
-extern kmem_cache_t *tcp_bucket_cachep;
-extern struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
-						 unsigned short snum);
-extern void tcp_bucket_destroy(struct tcp_bind_bucket *tb);
-extern void tcp_bucket_unlock(struct sock *sk);
-extern int tcp_port_rover;
-
-/* These are AF independent. */
-static __inline__ int tcp_bhashfn(__u16 lport)
-{
-	return (lport & (tcp_bhash_size - 1));
-}
-
-extern void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
-			  unsigned short snum);
-
-#if (BITS_PER_LONG == 64)
-#define TCP_ADDRCMP_ALIGN_BYTES 8
-#else
-#define TCP_ADDRCMP_ALIGN_BYTES 4
-#endif
-
-/* This is a TIME_WAIT bucket.  It works around the memory consumption
- * problems of sockets in such a state on heavily loaded servers, but
- * without violating the protocol specification.
- */
-struct tcp_tw_bucket {
-	/*
-	 * Now struct sock also uses sock_common, so please just
-	 * don't add nothing before this first member (__tw_common) --acme
-	 */
-	struct sock_common	__tw_common;
-#define tw_family		__tw_common.skc_family
-#define tw_state		__tw_common.skc_state
-#define tw_reuse		__tw_common.skc_reuse
-#define tw_bound_dev_if		__tw_common.skc_bound_dev_if
-#define tw_node			__tw_common.skc_node
-#define tw_bind_node		__tw_common.skc_bind_node
-#define tw_refcnt		__tw_common.skc_refcnt
-	volatile unsigned char	tw_substate;
-	unsigned char		tw_rcv_wscale;
-	__u16			tw_sport;
-	/* Socket demultiplex comparisons on incoming packets. */
-	/* these five are in inet_sock */
-	__u32			tw_daddr
-		__attribute__((aligned(TCP_ADDRCMP_ALIGN_BYTES)));
-	__u32			tw_rcv_saddr;
-	__u16			tw_dport;
-	__u16			tw_num;
-	/* And these are ours. */
-	int			tw_hashent;
-	int			tw_timeout;
-	__u32			tw_rcv_nxt;
-	__u32			tw_snd_nxt;
-	__u32			tw_rcv_wnd;
-	__u32			tw_ts_recent;
-	long			tw_ts_recent_stamp;
-	unsigned long		tw_ttd;
-	struct tcp_bind_bucket	*tw_tb;
-	struct hlist_node	tw_death_node;
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-	struct in6_addr		tw_v6_daddr;
-	struct in6_addr		tw_v6_rcv_saddr;
-	int			tw_v6_ipv6only;
-#endif
-};
-
-static __inline__ void tw_add_node(struct tcp_tw_bucket *tw,
-				   struct hlist_head *list)
-{
-	hlist_add_head(&tw->tw_node, list);
-}
-
-static __inline__ void tw_add_bind_node(struct tcp_tw_bucket *tw,
-					struct hlist_head *list)
-{
-	hlist_add_head(&tw->tw_bind_node, list);
-}
-
-static inline int tw_dead_hashed(struct tcp_tw_bucket *tw)
-{
-	return tw->tw_death_node.pprev != NULL;
-}
-
-static __inline__ void tw_dead_node_init(struct tcp_tw_bucket *tw)
-{
-	tw->tw_death_node.pprev = NULL;
-}
-
-static __inline__ void __tw_del_dead_node(struct tcp_tw_bucket *tw)
-{
-	__hlist_del(&tw->tw_death_node);
-	tw_dead_node_init(tw);
-}
-
-static __inline__ int tw_del_dead_node(struct tcp_tw_bucket *tw)
-{
-	if (tw_dead_hashed(tw)) {
-		__tw_del_dead_node(tw);
-		return 1;
-	}
-	return 0;
-}
-
-#define tw_for_each(tw, node, head) \
-	hlist_for_each_entry(tw, node, head, tw_node)
-
-#define tw_for_each_inmate(tw, node, jail) \
-	hlist_for_each_entry(tw, node, jail, tw_death_node)
-
-#define tw_for_each_inmate_safe(tw, node, safe, jail) \
-	hlist_for_each_entry_safe(tw, node, safe, jail, tw_death_node)
-
-#define tcptw_sk(__sk)	((struct tcp_tw_bucket *)(__sk))
-
-static inline u32 tcp_v4_rcv_saddr(const struct sock *sk)
-{
-	return likely(sk->sk_state != TCP_TIME_WAIT) ?
-		inet_sk(sk)->rcv_saddr : tcptw_sk(sk)->tw_rcv_saddr;
-}
-
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-static inline struct in6_addr *__tcp_v6_rcv_saddr(const struct sock *sk)
-{
-	return likely(sk->sk_state != TCP_TIME_WAIT) ?
-		&inet6_sk(sk)->rcv_saddr : &tcptw_sk(sk)->tw_v6_rcv_saddr;
-}
-
-static inline struct in6_addr *tcp_v6_rcv_saddr(const struct sock *sk)
-{
-	return sk->sk_family == AF_INET6 ? __tcp_v6_rcv_saddr(sk) : NULL;
-}
-
-#define tcptw_sk_ipv6only(__sk)	(tcptw_sk(__sk)->tw_v6_ipv6only)
-
-static inline int tcp_v6_ipv6only(const struct sock *sk)
-{
-	return likely(sk->sk_state != TCP_TIME_WAIT) ?
-		ipv6_only_sock(sk) : tcptw_sk_ipv6only(sk);
-}
-#else
-# define __tcp_v6_rcv_saddr(__sk)	NULL
-# define tcp_v6_rcv_saddr(__sk)		NULL
-# define tcptw_sk_ipv6only(__sk)	0
-# define tcp_v6_ipv6only(__sk)		0
-#endif
+#include <linux/seq_file.h>
 
-extern kmem_cache_t *tcp_timewait_cachep;
-
-static inline void tcp_tw_put(struct tcp_tw_bucket *tw)
-{
-	if (atomic_dec_and_test(&tw->tw_refcnt)) {
-#ifdef INET_REFCNT_DEBUG
-		printk(KERN_DEBUG "tw_bucket %p released\n", tw);
-#endif
-		kmem_cache_free(tcp_timewait_cachep, tw);
-	}
-}
+extern struct inet_hashinfo tcp_hashinfo;
 
 extern atomic_t tcp_orphan_count;
-extern int tcp_tw_count;
 extern void tcp_time_wait(struct sock *sk, int state, int timeo);
-extern void tcp_tw_deschedule(struct tcp_tw_bucket *tw);
-
-
-/* Socket demux engine toys. */
-#ifdef __BIG_ENDIAN
-#define TCP_COMBINED_PORTS(__sport, __dport) \
-	(((__u32)(__sport)<<16) | (__u32)(__dport))
-#else /* __LITTLE_ENDIAN */
-#define TCP_COMBINED_PORTS(__sport, __dport) \
-	(((__u32)(__dport)<<16) | (__u32)(__sport))
-#endif
-
-#if (BITS_PER_LONG == 64)
-#ifdef __BIG_ENDIAN
-#define TCP_V4_ADDR_COOKIE(__name, __saddr, __daddr) \
-	__u64 __name = (((__u64)(__saddr))<<32)|((__u64)(__daddr));
-#else /* __LITTLE_ENDIAN */
-#define TCP_V4_ADDR_COOKIE(__name, __saddr, __daddr) \
-	__u64 __name = (((__u64)(__daddr))<<32)|((__u64)(__saddr));
-#endif /* __BIG_ENDIAN */
-#define TCP_IPV4_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\
-	(((*((__u64 *)&(inet_sk(__sk)->daddr)))== (__cookie))	&&	\
-	 ((*((__u32 *)&(inet_sk(__sk)->dport)))== (__ports))	&&	\
-	 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
-#define TCP_IPV4_TW_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\
-	(((*((__u64 *)&(tcptw_sk(__sk)->tw_daddr))) == (__cookie)) &&	\
-	 ((*((__u32 *)&(tcptw_sk(__sk)->tw_dport))) == (__ports)) &&	\
-	 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
-#else /* 32-bit arch */
-#define TCP_V4_ADDR_COOKIE(__name, __saddr, __daddr)
-#define TCP_IPV4_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\
-	((inet_sk(__sk)->daddr			== (__saddr))	&&	\
-	 (inet_sk(__sk)->rcv_saddr		== (__daddr))	&&	\
-	 ((*((__u32 *)&(inet_sk(__sk)->dport)))== (__ports))	&&	\
-	 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
-#define TCP_IPV4_TW_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\
-	((tcptw_sk(__sk)->tw_daddr		== (__saddr))	&&	\
-	 (tcptw_sk(__sk)->tw_rcv_saddr		== (__daddr))	&&	\
-	 ((*((__u32 *)&(tcptw_sk(__sk)->tw_dport))) == (__ports)) &&	\
-	 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
-#endif /* 64-bit arch */
-
-#define TCP_IPV6_MATCH(__sk, __saddr, __daddr, __ports, __dif)	   \
-	(((*((__u32 *)&(inet_sk(__sk)->dport)))== (__ports))   	&& \
-	 ((__sk)->sk_family		== AF_INET6)		&& \
-	 ipv6_addr_equal(&inet6_sk(__sk)->daddr, (__saddr))	&& \
-	 ipv6_addr_equal(&inet6_sk(__sk)->rcv_saddr, (__daddr))	&& \
-	 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
-
-/* These can have wildcards, don't try too hard. */
-static __inline__ int tcp_lhashfn(unsigned short num)
-{
-	return num & (TCP_LHTABLE_SIZE - 1);
-}
-
-static __inline__ int tcp_sk_listen_hashfn(struct sock *sk)
-{
-	return tcp_lhashfn(inet_sk(sk)->num);
-}
 
 #define MAX_TCP_HEADER	(128 + MAX_HEADER)
 
@@ -478,33 +147,6 @@ static __inline__ int tcp_sk_listen_hashfn(struct sock *sk)
 					 * timestamps. It must be less than
 					 * minimal timewait lifetime.
 					 */
-
-#define TCP_TW_RECYCLE_SLOTS_LOG	5
-#define TCP_TW_RECYCLE_SLOTS		(1<<TCP_TW_RECYCLE_SLOTS_LOG)
-
-/* If time > 4sec, it is "slow" path, no recycling is required,
-   so that we select tick to get range about 4 seconds.
- */
-
-#if HZ <= 16 || HZ > 4096
-# error Unsupported: HZ <= 16 or HZ > 4096
-#elif HZ <= 32
-# define TCP_TW_RECYCLE_TICK (5+2-TCP_TW_RECYCLE_SLOTS_LOG)
-#elif HZ <= 64
-# define TCP_TW_RECYCLE_TICK (6+2-TCP_TW_RECYCLE_SLOTS_LOG)
-#elif HZ <= 128
-# define TCP_TW_RECYCLE_TICK (7+2-TCP_TW_RECYCLE_SLOTS_LOG)
-#elif HZ <= 256
-# define TCP_TW_RECYCLE_TICK (8+2-TCP_TW_RECYCLE_SLOTS_LOG)
-#elif HZ <= 512
-# define TCP_TW_RECYCLE_TICK (9+2-TCP_TW_RECYCLE_SLOTS_LOG)
-#elif HZ <= 1024
-# define TCP_TW_RECYCLE_TICK (10+2-TCP_TW_RECYCLE_SLOTS_LOG)
-#elif HZ <= 2048
-# define TCP_TW_RECYCLE_TICK (11+2-TCP_TW_RECYCLE_SLOTS_LOG)
-#else
-# define TCP_TW_RECYCLE_TICK (12+2-TCP_TW_RECYCLE_SLOTS_LOG)
-#endif
 /*
  *	TCP option
  */
@@ -534,22 +176,18 @@ static __inline__ int tcp_sk_listen_hashfn(struct sock *sk)
 #define TCPOLEN_SACK_BASE_ALIGNED	4
 #define TCPOLEN_SACK_PERBLOCK		8
 
-#define TCP_TIME_RETRANS	1	/* Retransmit timer */
-#define TCP_TIME_DACK		2	/* Delayed ack timer */
-#define TCP_TIME_PROBE0		3	/* Zero window probe timer */
-#define TCP_TIME_KEEPOPEN	4	/* Keepalive timer */
-
 /* Flags in tp->nonagle */
 #define TCP_NAGLE_OFF		1	/* Nagle's algo is disabled */
 #define TCP_NAGLE_CORK		2	/* Socket is corked	    */
 #define TCP_NAGLE_PUSH		4	/* Cork is overriden for already queued data */
 
+extern struct inet_timewait_death_row tcp_death_row;
+
 /* sysctl variables for tcp */
 extern int sysctl_tcp_timestamps;
 extern int sysctl_tcp_window_scaling;
 extern int sysctl_tcp_sack;
 extern int sysctl_tcp_fin_timeout;
-extern int sysctl_tcp_tw_recycle;
 extern int sysctl_tcp_keepalive_time;
 extern int sysctl_tcp_keepalive_probes;
 extern int sysctl_tcp_keepalive_intvl;
@@ -564,7 +202,6 @@ extern int sysctl_tcp_stdurg;
 extern int sysctl_tcp_rfc1337;
 extern int sysctl_tcp_abort_on_overflow;
 extern int sysctl_tcp_max_orphans;
-extern int sysctl_tcp_max_tw_buckets;
 extern int sysctl_tcp_fack;
 extern int sysctl_tcp_reordering;
 extern int sysctl_tcp_ecn;
@@ -585,12 +222,6 @@ extern atomic_t tcp_memory_allocated;
 extern atomic_t tcp_sockets_allocated;
 extern int tcp_memory_pressure;
 
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-#define TCP_INET_FAMILY(fam) ((fam) == AF_INET)
-#else
-#define TCP_INET_FAMILY(fam) 1
-#endif
-
 /*
  *	Pointers to address related TCP functions
  *	(i.e. things that depend on the address family)
@@ -671,9 +302,6 @@ DECLARE_SNMP_STAT(struct tcp_mib, tcp_statistics);
 #define TCP_ADD_STATS_BH(field, val)	SNMP_ADD_STATS_BH(tcp_statistics, field, val)
 #define TCP_ADD_STATS_USER(field, val)	SNMP_ADD_STATS_USER(tcp_statistics, field, val)
 
-extern void			tcp_put_port(struct sock *sk);
-extern void			tcp_inherit_port(struct sock *sk, struct sock *child);
-
 extern void			tcp_v4_err(struct sk_buff *skb, u32);
 
 extern void			tcp_shutdown (struct sock *sk, int how);
@@ -682,7 +310,7 @@ extern int			tcp_v4_rcv(struct sk_buff *skb);
 
 extern int			tcp_v4_remember_stamp(struct sock *sk);
 
-extern int		    	tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw);
+extern int		    	tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw);
 
 extern int			tcp_sendmsg(struct kiocb *iocb, struct sock *sk,
 					    struct msghdr *msg, size_t size);
@@ -704,42 +332,22 @@ extern int			tcp_rcv_established(struct sock *sk,
 
 extern void			tcp_rcv_space_adjust(struct sock *sk);
 
-enum tcp_ack_state_t
-{
-	TCP_ACK_SCHED = 1,
-	TCP_ACK_TIMER = 2,
-	TCP_ACK_PUSHED= 4
-};
-
-static inline void tcp_schedule_ack(struct tcp_sock *tp)
+static inline void tcp_dec_quickack_mode(struct sock *sk,
+					 const unsigned int pkts)
 {
-	tp->ack.pending |= TCP_ACK_SCHED;
-}
-
-static inline int tcp_ack_scheduled(struct tcp_sock *tp)
-{
-	return tp->ack.pending&TCP_ACK_SCHED;
-}
-
-static __inline__ void tcp_dec_quickack_mode(struct tcp_sock *tp, unsigned int pkts)
-{
-	if (tp->ack.quick) {
-		if (pkts >= tp->ack.quick) {
-			tp->ack.quick = 0;
+	struct inet_connection_sock *icsk = inet_csk(sk);
 
+	if (icsk->icsk_ack.quick) {
+		if (pkts >= icsk->icsk_ack.quick) {
+			icsk->icsk_ack.quick = 0;
 			/* Leaving quickack mode we deflate ATO. */
-			tp->ack.ato = TCP_ATO_MIN;
+			icsk->icsk_ack.ato   = TCP_ATO_MIN;
 		} else
-			tp->ack.quick -= pkts;
+			icsk->icsk_ack.quick -= pkts;
 	}
 }
 
-extern void tcp_enter_quickack_mode(struct tcp_sock *tp);
-
-static __inline__ void tcp_delack_init(struct tcp_sock *tp)
-{
-	memset(&tp->ack, 0, sizeof(tp->ack));
-}
+extern void tcp_enter_quickack_mode(struct sock *sk);
 
 static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
 {
@@ -755,10 +363,9 @@ enum tcp_tw_status
 };
 
 
-extern enum tcp_tw_status	tcp_timewait_state_process(struct tcp_tw_bucket *tw,
+extern enum tcp_tw_status	tcp_timewait_state_process(struct inet_timewait_sock *tw,
 							   struct sk_buff *skb,
-							   struct tcphdr *th,
-							   unsigned len);
+							   const struct tcphdr *th);
 
 extern struct sock *		tcp_check_req(struct sock *sk,struct sk_buff *skb,
 					      struct request_sock *req,
@@ -773,7 +380,6 @@ extern void			tcp_update_metrics(struct sock *sk);
 
 extern void			tcp_close(struct sock *sk, 
 					  long timeout);
-extern struct sock *		tcp_accept(struct sock *sk, int flags, int *err);
 extern unsigned int		tcp_poll(struct file * file, struct socket *sock, struct poll_table_struct *wait);
 
 extern int			tcp_getsockopt(struct sock *sk, int level, 
@@ -789,8 +395,6 @@ extern int			tcp_recvmsg(struct kiocb *iocb, struct sock *sk,
 					    size_t len, int nonblock, 
 					    int flags, int *addr_len);
 
-extern int			tcp_listen_start(struct sock *sk);
-
 extern void			tcp_parse_options(struct sk_buff *skb,
 						  struct tcp_options_received *opt_rx,
 						  int estab);
@@ -799,11 +403,6 @@ extern void			tcp_parse_options(struct sk_buff *skb,
  *	TCP v4 functions exported for the inet6 API
  */
 
-extern int		       	tcp_v4_rebuild_header(struct sock *sk);
-
-extern int		       	tcp_v4_build_header(struct sock *sk, 
-						    struct sk_buff *skb);
-
 extern void		       	tcp_v4_send_check(struct sock *sk, 
 						  struct tcphdr *th, int len, 
 						  struct sk_buff *skb);
@@ -872,18 +471,15 @@ extern void tcp_cwnd_application_limited(struct sock *sk);
 
 /* tcp_timer.c */
 extern void tcp_init_xmit_timers(struct sock *);
-extern void tcp_clear_xmit_timers(struct sock *);
+static inline void tcp_clear_xmit_timers(struct sock *sk)
+{
+	inet_csk_clear_xmit_timers(sk);
+}
 
-extern void tcp_delete_keepalive_timer(struct sock *);
-extern void tcp_reset_keepalive_timer(struct sock *, unsigned long);
 extern unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu);
 extern unsigned int tcp_current_mss(struct sock *sk, int large);
 
-#ifdef TCP_DEBUG
-extern const char tcp_timer_bug_msg[];
-#endif
-
-/* tcp_diag.c */
+/* tcp.c */
 extern void tcp_get_info(struct sock *, struct tcp_info *);
 
 /* Read 'sendfile()'-style from a TCP socket */
@@ -892,72 +488,6 @@ typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *,
 extern int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 			 sk_read_actor_t recv_actor);
 
-static inline void tcp_clear_xmit_timer(struct sock *sk, int what)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	
-	switch (what) {
-	case TCP_TIME_RETRANS:
-	case TCP_TIME_PROBE0:
-		tp->pending = 0;
-
-#ifdef TCP_CLEAR_TIMERS
-		sk_stop_timer(sk, &tp->retransmit_timer);
-#endif
-		break;
-	case TCP_TIME_DACK:
-		tp->ack.blocked = 0;
-		tp->ack.pending = 0;
-
-#ifdef TCP_CLEAR_TIMERS
-		sk_stop_timer(sk, &tp->delack_timer);
-#endif
-		break;
-	default:
-#ifdef TCP_DEBUG
-		printk(tcp_timer_bug_msg);
-#endif
-		return;
-	};
-
-}
-
-/*
- *	Reset the retransmission timer
- */
-static inline void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	if (when > TCP_RTO_MAX) {
-#ifdef TCP_DEBUG
-		printk(KERN_DEBUG "reset_xmit_timer sk=%p %d when=0x%lx, caller=%p\n", sk, what, when, current_text_addr());
-#endif
-		when = TCP_RTO_MAX;
-	}
-
-	switch (what) {
-	case TCP_TIME_RETRANS:
-	case TCP_TIME_PROBE0:
-		tp->pending = what;
-		tp->timeout = jiffies+when;
-		sk_reset_timer(sk, &tp->retransmit_timer, tp->timeout);
-		break;
-
-	case TCP_TIME_DACK:
-		tp->ack.pending |= TCP_ACK_TIMER;
-		tp->ack.timeout = jiffies+when;
-		sk_reset_timer(sk, &tp->delack_timer, tp->ack.timeout);
-		break;
-
-	default:
-#ifdef TCP_DEBUG
-		printk(tcp_timer_bug_msg);
-#endif
-		return;
-	};
-}
-
 /* Initialize RCV_MSS value.
  * RCV_MSS is an our guess about MSS used by the peer.
  * We haven't any direct information about the MSS.
@@ -975,7 +505,7 @@ static inline void tcp_initialize_rcv_mss(struct sock *sk)
 	hint = min(hint, TCP_MIN_RCVMSS);
 	hint = max(hint, TCP_MIN_MSS);
 
-	tp->ack.rcv_mss = hint;
+	inet_csk(sk)->icsk_ack.rcv_mss = hint;
 }
 
 static __inline__ void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
@@ -1110,7 +640,8 @@ static inline void tcp_packets_out_inc(struct sock *sk,
 
 	tp->packets_out += tcp_skb_pcount(skb);
 	if (!orig)
-		tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+					  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
 }
 
 static inline void tcp_packets_out_dec(struct tcp_sock *tp, 
@@ -1138,29 +669,29 @@ struct tcp_congestion_ops {
 	struct list_head	list;
 
 	/* initialize private data (optional) */
-	void (*init)(struct tcp_sock *tp);
+	void (*init)(struct sock *sk);
 	/* cleanup private data  (optional) */
-	void (*release)(struct tcp_sock *tp);
+	void (*release)(struct sock *sk);
 
 	/* return slow start threshold (required) */
-	u32 (*ssthresh)(struct tcp_sock *tp);
+	u32 (*ssthresh)(struct sock *sk);
 	/* lower bound for congestion window (optional) */
-	u32 (*min_cwnd)(struct tcp_sock *tp);
+	u32 (*min_cwnd)(struct sock *sk);
 	/* do new cwnd calculation (required) */
-	void (*cong_avoid)(struct tcp_sock *tp, u32 ack,
+	void (*cong_avoid)(struct sock *sk, u32 ack,
 			   u32 rtt, u32 in_flight, int good_ack);
 	/* round trip time sample per acked packet (optional) */
-	void (*rtt_sample)(struct tcp_sock *tp, u32 usrtt);
+	void (*rtt_sample)(struct sock *sk, u32 usrtt);
 	/* call before changing ca_state (optional) */
-	void (*set_state)(struct tcp_sock *tp, u8 new_state);
+	void (*set_state)(struct sock *sk, u8 new_state);
 	/* call when cwnd event occurs (optional) */
-	void (*cwnd_event)(struct tcp_sock *tp, enum tcp_ca_event ev);
+	void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev);
 	/* new value of cwnd after loss (optional) */
-	u32  (*undo_cwnd)(struct tcp_sock *tp);
+	u32  (*undo_cwnd)(struct sock *sk);
 	/* hook for packet ack accounting (optional) */
-	void (*pkts_acked)(struct tcp_sock *tp, u32 num_acked);
-	/* get info for tcp_diag (optional) */
-	void (*get_info)(struct tcp_sock *tp, u32 ext, struct sk_buff *skb);
+	void (*pkts_acked)(struct sock *sk, u32 num_acked);
+	/* get info for inet_diag (optional) */
+	void (*get_info)(struct sock *sk, u32 ext, struct sk_buff *skb);
 
 	char 		name[TCP_CA_NAME_MAX];
 	struct module 	*owner;
@@ -1169,30 +700,34 @@ struct tcp_congestion_ops {
 extern int tcp_register_congestion_control(struct tcp_congestion_ops *type);
 extern void tcp_unregister_congestion_control(struct tcp_congestion_ops *type);
 
-extern void tcp_init_congestion_control(struct tcp_sock *tp);
-extern void tcp_cleanup_congestion_control(struct tcp_sock *tp);
+extern void tcp_init_congestion_control(struct sock *sk);
+extern void tcp_cleanup_congestion_control(struct sock *sk);
 extern int tcp_set_default_congestion_control(const char *name);
 extern void tcp_get_default_congestion_control(char *name);
-extern int tcp_set_congestion_control(struct tcp_sock *tp, const char *name);
+extern int tcp_set_congestion_control(struct sock *sk, const char *name);
 
 extern struct tcp_congestion_ops tcp_init_congestion_ops;
-extern u32 tcp_reno_ssthresh(struct tcp_sock *tp);
-extern void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack,
+extern u32 tcp_reno_ssthresh(struct sock *sk);
+extern void tcp_reno_cong_avoid(struct sock *sk, u32 ack,
 				u32 rtt, u32 in_flight, int flag);
-extern u32 tcp_reno_min_cwnd(struct tcp_sock *tp);
+extern u32 tcp_reno_min_cwnd(struct sock *sk);
 extern struct tcp_congestion_ops tcp_reno;
 
-static inline void tcp_set_ca_state(struct tcp_sock *tp, u8 ca_state)
+static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state)
 {
-	if (tp->ca_ops->set_state)
-		tp->ca_ops->set_state(tp, ca_state);
-	tp->ca_state = ca_state;
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	if (icsk->icsk_ca_ops->set_state)
+		icsk->icsk_ca_ops->set_state(sk, ca_state);
+	icsk->icsk_ca_state = ca_state;
 }
 
-static inline void tcp_ca_event(struct tcp_sock *tp, enum tcp_ca_event event)
+static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event)
 {
-	if (tp->ca_ops->cwnd_event)
-		tp->ca_ops->cwnd_event(tp, event);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+
+	if (icsk->icsk_ca_ops->cwnd_event)
+		icsk->icsk_ca_ops->cwnd_event(sk, event);
 }
 
 /* This determines how many packets are "in the network" to the best
@@ -1218,9 +753,10 @@ static __inline__ unsigned int tcp_packets_in_flight(const struct tcp_sock *tp)
  * The exception is rate halving phase, when cwnd is decreasing towards
  * ssthresh.
  */
-static inline __u32 tcp_current_ssthresh(struct tcp_sock *tp)
+static inline __u32 tcp_current_ssthresh(const struct sock *sk)
 {
-	if ((1<<tp->ca_state)&(TCPF_CA_CWR|TCPF_CA_Recovery))
+	const struct tcp_sock *tp = tcp_sk(sk);
+	if ((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_CWR | TCPF_CA_Recovery))
 		return tp->snd_ssthresh;
 	else
 		return max(tp->snd_ssthresh,
@@ -1237,10 +773,13 @@ static inline void tcp_sync_left_out(struct tcp_sock *tp)
 }
 
 /* Set slow start threshold and cwnd not falling to slow start */
-static inline void __tcp_enter_cwr(struct tcp_sock *tp)
+static inline void __tcp_enter_cwr(struct sock *sk)
 {
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
 	tp->undo_marker = 0;
-	tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
+	tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
 	tp->snd_cwnd = min(tp->snd_cwnd,
 			   tcp_packets_in_flight(tp) + 1U);
 	tp->snd_cwnd_cnt = 0;
@@ -1249,12 +788,14 @@ static inline void __tcp_enter_cwr(struct tcp_sock *tp)
 	TCP_ECN_queue_cwr(tp);
 }
 
-static inline void tcp_enter_cwr(struct tcp_sock *tp)
+static inline void tcp_enter_cwr(struct sock *sk)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
+
 	tp->prior_ssthresh = 0;
-	if (tp->ca_state < TCP_CA_CWR) {
-		__tcp_enter_cwr(tp);
-		tcp_set_ca_state(tp, TCP_CA_CWR);
+	if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
+		__tcp_enter_cwr(sk);
+		tcp_set_ca_state(sk, TCP_CA_CWR);
 	}
 }
 
@@ -1277,8 +818,10 @@ static __inline__ void tcp_minshall_update(struct tcp_sock *tp, int mss,
 
 static __inline__ void tcp_check_probe_timer(struct sock *sk, struct tcp_sock *tp)
 {
-	if (!tp->packets_out && !tp->pending)
-		tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, tp->rto);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	if (!tp->packets_out && !icsk->icsk_pending)
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
+					  icsk->icsk_rto, TCP_RTO_MAX);
 }
 
 static __inline__ void tcp_push_pending_frames(struct sock *sk,
@@ -1297,9 +840,6 @@ static __inline__ void tcp_update_wl(struct tcp_sock *tp, u32 ack, u32 seq)
 	tp->snd_wl1 = seq;
 }
 
-extern void tcp_destroy_sock(struct sock *sk);
-
-
 /*
  * Calculate(/check) TCP checksum
  */
@@ -1359,8 +899,10 @@ static __inline__ int tcp_prequeue(struct sock *sk, struct sk_buff *skb)
 			tp->ucopy.memory = 0;
 		} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
 			wake_up_interruptible(sk->sk_sleep);
-			if (!tcp_ack_scheduled(tp))
-				tcp_reset_xmit_timer(sk, TCP_TIME_DACK, (3*TCP_RTO_MIN)/4);
+			if (!inet_csk_ack_scheduled(sk))
+				inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+						          (3 * TCP_RTO_MIN) / 4,
+							  TCP_RTO_MAX);
 		}
 		return 1;
 	}
@@ -1393,9 +935,9 @@ static __inline__ void tcp_set_state(struct sock *sk, int state)
 			TCP_INC_STATS(TCP_MIB_ESTABRESETS);
 
 		sk->sk_prot->unhash(sk);
-		if (tcp_sk(sk)->bind_hash &&
+		if (inet_csk(sk)->icsk_bind_hash &&
 		    !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
-			tcp_put_port(sk);
+			inet_put_port(&tcp_hashinfo, sk);
 		/* fall through */
 	default:
 		if (oldstate==TCP_ESTABLISHED)
@@ -1422,7 +964,7 @@ static __inline__ void tcp_done(struct sock *sk)
 	if (!sock_flag(sk, SOCK_DEAD))
 		sk->sk_state_change(sk);
 	else
-		tcp_destroy_sock(sk);
+		inet_csk_destroy_sock(sk);
 }
 
 static __inline__ void tcp_sack_reset(struct tcp_options_received *rx_opt)
@@ -1524,54 +1066,6 @@ static inline int tcp_full_space(const struct sock *sk)
 	return tcp_win_from_space(sk->sk_rcvbuf); 
 }
 
-static inline void tcp_acceptq_queue(struct sock *sk, struct request_sock *req,
-					 struct sock *child)
-{
-	reqsk_queue_add(&tcp_sk(sk)->accept_queue, req, sk, child);
-}
-
-static inline void
-tcp_synq_removed(struct sock *sk, struct request_sock *req)
-{
-	if (reqsk_queue_removed(&tcp_sk(sk)->accept_queue, req) == 0)
-		tcp_delete_keepalive_timer(sk);
-}
-
-static inline void tcp_synq_added(struct sock *sk)
-{
-	if (reqsk_queue_added(&tcp_sk(sk)->accept_queue) == 0)
-		tcp_reset_keepalive_timer(sk, TCP_TIMEOUT_INIT);
-}
-
-static inline int tcp_synq_len(struct sock *sk)
-{
-	return reqsk_queue_len(&tcp_sk(sk)->accept_queue);
-}
-
-static inline int tcp_synq_young(struct sock *sk)
-{
-	return reqsk_queue_len_young(&tcp_sk(sk)->accept_queue);
-}
-
-static inline int tcp_synq_is_full(struct sock *sk)
-{
-	return reqsk_queue_is_full(&tcp_sk(sk)->accept_queue);
-}
-
-static inline void tcp_synq_unlink(struct tcp_sock *tp, struct request_sock *req,
-				   struct request_sock **prev)
-{
-	reqsk_queue_unlink(&tp->accept_queue, req, prev);
-}
-
-static inline void tcp_synq_drop(struct sock *sk, struct request_sock *req,
-				     struct request_sock **prev)
-{
-	tcp_synq_unlink(tcp_sk(sk), req, prev);
-	tcp_synq_removed(sk, req);
-	reqsk_free(req);
-}
-
 static __inline__ void tcp_openreq_init(struct request_sock *req,
 					struct tcp_options_received *rx_opt,
 					struct sk_buff *skb)
@@ -1593,27 +1087,6 @@ static __inline__ void tcp_openreq_init(struct request_sock *req,
 
 extern void tcp_enter_memory_pressure(void);
 
-extern void tcp_listen_wlock(void);
-
-/* - We may sleep inside this lock.
- * - If sleeping is not required (or called from BH),
- *   use plain read_(un)lock(&tcp_lhash_lock).
- */
-
-static inline void tcp_listen_lock(void)
-{
-	/* read_lock synchronizes to candidates to writers */
-	read_lock(&tcp_lhash_lock);
-	atomic_inc(&tcp_lhash_users);
-	read_unlock(&tcp_lhash_lock);
-}
-
-static inline void tcp_listen_unlock(void)
-{
-	if (atomic_dec_and_test(&tcp_lhash_users))
-		wake_up(&tcp_lhash_wait);
-}
-
 static inline int keepalive_intvl_when(const struct tcp_sock *tp)
 {
 	return tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl;
@@ -1624,12 +1097,13 @@ static inline int keepalive_time_when(const struct tcp_sock *tp)
 	return tp->keepalive_time ? : sysctl_tcp_keepalive_time;
 }
 
-static inline int tcp_fin_time(const struct tcp_sock *tp)
+static inline int tcp_fin_time(const struct sock *sk)
 {
-	int fin_timeout = tp->linger2 ? : sysctl_tcp_fin_timeout;
+	int fin_timeout = tcp_sk(sk)->linger2 ? : sysctl_tcp_fin_timeout;
+	const int rto = inet_csk(sk)->icsk_rto;
 
-	if (fin_timeout < (tp->rto<<2) - (tp->rto>>1))
-		fin_timeout = (tp->rto<<2) - (tp->rto>>1);
+	if (fin_timeout < (rto << 2) - (rto >> 1))
+		fin_timeout = (rto << 2) - (rto >> 1);
 
 	return fin_timeout;
 }
@@ -1658,15 +1132,6 @@ static inline int tcp_paws_check(const struct tcp_options_received *rx_opt, int
 	return 1;
 }
 
-static inline void tcp_v4_setup_caps(struct sock *sk, struct dst_entry *dst)
-{
-	sk->sk_route_caps = dst->dev->features;
-	if (sk->sk_route_caps & NETIF_F_TSO) {
-		if (sock_flag(sk, SOCK_NO_LARGESEND) || dst->header_len)
-			sk->sk_route_caps &= ~NETIF_F_TSO;
-	}
-}
-
 #define TCP_CHECK_TIMER(sk) do { } while (0)
 
 static inline int tcp_use_frto(const struct sock *sk)
@@ -1718,4 +1183,16 @@ struct tcp_iter_state {
 extern int tcp_proc_register(struct tcp_seq_afinfo *afinfo);
 extern void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo);
 
+extern struct request_sock_ops tcp_request_sock_ops;
+
+extern int tcp_v4_destroy_sock(struct sock *sk);
+
+#ifdef CONFIG_PROC_FS
+extern int  tcp4_proc_init(void);
+extern void tcp4_proc_exit(void);
+#endif
+
+extern void tcp_v4_init(struct net_proto_family *ops);
+extern void tcp_init(void);
+
 #endif	/* _TCP_H */
diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h
index 64980ee8c92..c6b84397448 100644
--- a/include/net/tcp_ecn.h
+++ b/include/net/tcp_ecn.h
@@ -88,7 +88,7 @@ static inline void TCP_ECN_check_ce(struct tcp_sock *tp, struct sk_buff *skb)
 		 * it is surely retransmit. It is not in ECN RFC,
 		 * but Linux follows this rule. */
 		else if (INET_ECN_is_not_ect((TCP_SKB_CB(skb)->flags)))
-			tcp_enter_quickack_mode(tp);
+			tcp_enter_quickack_mode((struct sock *)tp);
 	}
 }
 
diff --git a/include/net/tcp_states.h b/include/net/tcp_states.h
new file mode 100644
index 00000000000..b9d4176b2d1
--- /dev/null
+++ b/include/net/tcp_states.h
@@ -0,0 +1,34 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Definitions for the TCP protocol sk_state field.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+#ifndef _LINUX_TCP_STATES_H
+#define _LINUX_TCP_STATES_H
+
+enum {
+	TCP_ESTABLISHED = 1,
+	TCP_SYN_SENT,
+	TCP_SYN_RECV,
+	TCP_FIN_WAIT1,
+	TCP_FIN_WAIT2,
+	TCP_TIME_WAIT,
+	TCP_CLOSE,
+	TCP_CLOSE_WAIT,
+	TCP_LAST_ACK,
+	TCP_LISTEN,
+	TCP_CLOSING,	/* Now a valid state */
+
+	TCP_MAX_STATES	/* Leave at the end! */
+};
+
+#define TCP_STATE_MASK	0xF
+
+#endif	/* _LINUX_TCP_STATES_H */
diff --git a/include/net/udp.h b/include/net/udp.h
index ac229b761db..107b9d791a1 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -94,6 +94,11 @@ struct udp_iter_state {
 	struct seq_operations	seq_ops;
 };
 
+#ifdef CONFIG_PROC_FS
 extern int udp_proc_register(struct udp_seq_afinfo *afinfo);
 extern void udp_proc_unregister(struct udp_seq_afinfo *afinfo);
+
+extern int  udp4_proc_init(void);
+extern void udp4_proc_exit(void);
+#endif
 #endif	/* _UDP_H */
diff --git a/include/net/x25.h b/include/net/x25.h
index 8b39b98876e..fee62ff8c19 100644
--- a/include/net/x25.h
+++ b/include/net/x25.h
@@ -175,7 +175,7 @@ extern void x25_kill_by_neigh(struct x25_neigh *);
 
 /* x25_dev.c */
 extern void x25_send_frame(struct sk_buff *, struct x25_neigh *);
-extern int  x25_lapb_receive_frame(struct sk_buff *, struct net_device *, struct packet_type *);
+extern int  x25_lapb_receive_frame(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *);
 extern void x25_establish_link(struct x25_neigh *);
 extern void x25_terminate_link(struct x25_neigh *);
 
diff --git a/include/net/x25device.h b/include/net/x25device.h
index d45ae883bd1..1a318374fae 100644
--- a/include/net/x25device.h
+++ b/include/net/x25device.h
@@ -8,7 +8,6 @@
 static inline __be16 x25_type_trans(struct sk_buff *skb, struct net_device *dev)
 {
 	skb->mac.raw = skb->data;
-	skb->input_dev = skb->dev = dev;
 	skb->pkt_type = PACKET_HOST;
 	
 	return htons(ETH_P_X25);
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 868ef88ef97..a9d0d8c5dfb 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -818,7 +818,6 @@ extern void xfrm6_init(void);
 extern void xfrm6_fini(void);
 extern void xfrm_state_init(void);
 extern void xfrm4_state_init(void);
-extern void xfrm4_state_fini(void);
 extern void xfrm6_state_init(void);
 extern void xfrm6_state_fini(void);
 
diff --git a/init/main.c b/init/main.c
index c9c311cf177..ff410063e4e 100644
--- a/init/main.c
+++ b/init/main.c
@@ -47,6 +47,7 @@
 #include <linux/rmap.h>
 #include <linux/mempolicy.h>
 #include <linux/key.h>
+#include <net/sock.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -80,7 +81,6 @@
 static int init(void *);
 
 extern void init_IRQ(void);
-extern void sock_init(void);
 extern void fork_init(unsigned long);
 extern void mca_init(void);
 extern void sbus_init(void);
diff --git a/kernel/audit.c b/kernel/audit.c
index ef35166fdc2..7f0699790d4 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -514,7 +514,8 @@ static int __init audit_init(void)
 {
 	printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
 	       audit_default ? "enabled" : "disabled");
-	audit_sock = netlink_kernel_create(NETLINK_AUDIT, audit_receive);
+	audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive,
+					   THIS_MODULE);
 	if (!audit_sock)
 		audit_panic("cannot initialize netlink socket");
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3e0bbee549e..8e56e249554 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -31,6 +31,7 @@
 #include <linux/smp_lock.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
+#include <linux/net.h>
 #include <linux/sysrq.h>
 #include <linux/highuid.h>
 #include <linux/writeback.h>
@@ -136,9 +137,6 @@ static struct ctl_table_header root_table_header =
 
 static ctl_table kern_table[];
 static ctl_table vm_table[];
-#ifdef CONFIG_NET
-extern ctl_table net_table[];
-#endif
 static ctl_table proc_table[];
 static ctl_table fs_table[];
 static ctl_table debug_table[];
diff --git a/lib/Kconfig b/lib/Kconfig
index eeb429a5215..e43197efeb9 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -72,6 +72,9 @@ config TEXTSEARCH
 config TEXTSEARCH_KMP
 	tristate
 
+config TEXTSEARCH_BM
+	tristate
+
 config TEXTSEARCH_FSM
 	tristate
 
diff --git a/lib/Makefile b/lib/Makefile
index f28d9031303..52f83380f70 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -38,6 +38,7 @@ obj-$(CONFIG_REED_SOLOMON) += reed_solomon/
 
 obj-$(CONFIG_TEXTSEARCH) += textsearch.o
 obj-$(CONFIG_TEXTSEARCH_KMP) += ts_kmp.o
+obj-$(CONFIG_TEXTSEARCH_BM) += ts_bm.o
 obj-$(CONFIG_TEXTSEARCH_FSM) += ts_fsm.o
 
 hostprogs-y	:= gen_crc32table
diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index 8e49d21057e..04ca4429ddf 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -93,6 +93,7 @@ static int send_uevent(const char *signal, const char *obj,
 		}
 	}
 
+	NETLINK_CB(skb).dst_group = 1;
 	return netlink_broadcast(uevent_sock, skb, 0, 1, gfp_mask);
 }
 
@@ -153,7 +154,8 @@ EXPORT_SYMBOL_GPL(kobject_uevent_atomic);
 
 static int __init kobject_uevent_init(void)
 {
-	uevent_sock = netlink_kernel_create(NETLINK_KOBJECT_UEVENT, NULL);
+	uevent_sock = netlink_kernel_create(NETLINK_KOBJECT_UEVENT, 1, NULL,
+					    THIS_MODULE);
 
 	if (!uevent_sock) {
 		printk(KERN_ERR
diff --git a/lib/ts_bm.c b/lib/ts_bm.c
new file mode 100644
index 00000000000..2cc79112ecc
--- /dev/null
+++ b/lib/ts_bm.c
@@ -0,0 +1,185 @@
+/*
+ * lib/ts_bm.c		Boyer-Moore text search implementation
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Pablo Neira Ayuso <pablo@eurodev.net>
+ *
+ * ==========================================================================
+ * 
+ *   Implements Boyer-Moore string matching algorithm:
+ *
+ *   [1] A Fast String Searching Algorithm, R.S. Boyer and Moore.
+ *       Communications of the Association for Computing Machinery, 
+ *       20(10), 1977, pp. 762-772.
+ *       http://www.cs.utexas.edu/users/moore/publications/fstrpos.pdf
+ *
+ *   [2] Handbook of Exact String Matching Algorithms, Thierry Lecroq, 2004
+ *       http://www-igm.univ-mlv.fr/~lecroq/string/string.pdf
+ *
+ *   Note: Since Boyer-Moore (BM) performs searches for matchings from right 
+ *   to left, it's still possible that a matching could be spread over 
+ *   multiple blocks, in that case this algorithm won't find any coincidence.
+ *   
+ *   If you're willing to ensure that such thing won't ever happen, use the
+ *   Knuth-Pratt-Morris (KMP) implementation instead. In conclusion, choose 
+ *   the proper string search algorithm depending on your setting. 
+ *
+ *   Say you're using the textsearch infrastructure for filtering, NIDS or 
+ *   any similar security focused purpose, then go KMP. Otherwise, if you 
+ *   really care about performance, say you're classifying packets to apply
+ *   Quality of Service (QoS) policies, and you don't mind about possible
+ *   matchings spread over multiple fragments, then go BM.
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/textsearch.h>
+
+/* Alphabet size, use ASCII */
+#define ASIZE 256
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(args, format...)
+#endif
+
+struct ts_bm
+{
+	u8 *		pattern;
+	unsigned int	patlen;
+	unsigned int 	bad_shift[ASIZE];
+	unsigned int	good_shift[0];
+};
+
+static unsigned int bm_find(struct ts_config *conf, struct ts_state *state)
+{
+	struct ts_bm *bm = ts_config_priv(conf);
+	unsigned int i, text_len, consumed = state->offset;
+	const u8 *text;
+	int shift = bm->patlen, bs;
+
+	for (;;) {
+		text_len = conf->get_next_block(consumed, &text, conf, state);
+
+		if (unlikely(text_len == 0))
+			break;
+
+		while (shift < text_len) {
+			DEBUGP("Searching in position %d (%c)\n", 
+				shift, text[shift]);
+			for (i = 0; i < bm->patlen; i++) 
+			     if (text[shift-i] != bm->pattern[bm->patlen-1-i])
+				     goto next;
+
+			/* London calling... */
+			DEBUGP("found!\n");
+			return consumed += (shift-(bm->patlen-1));
+
+next:			bs = bm->bad_shift[text[shift-i]];
+
+			/* Now jumping to... */
+			shift = max_t(int, shift-i+bs, shift+bm->good_shift[i]);
+		}
+		consumed += text_len;
+	}
+
+	return UINT_MAX;
+}
+
+static void compute_prefix_tbl(struct ts_bm *bm, const u8 *pattern,
+			       unsigned int len)
+{
+	int i, j, ended, l[ASIZE];
+
+	for (i = 0; i < ASIZE; i++)
+		bm->bad_shift[i] = len;
+	for (i = 0; i < len - 1; i++)
+		bm->bad_shift[pattern[i]] = len - 1 - i;
+
+	/* Compute the good shift array, used to match reocurrences 
+	 * of a subpattern */
+	for (i = 1; i < bm->patlen; i++) {
+		for (j = 0; j < bm->patlen && bm->pattern[bm->patlen - 1 - j]
+				== bm->pattern[bm->patlen - 1 - i - j]; j++);
+		l[i] = j;
+	}  
+
+	bm->good_shift[0] = 1;
+	for (i = 1; i < bm->patlen; i++)
+		bm->good_shift[i] = bm->patlen;
+	for (i = bm->patlen - 1; i > 0; i--)
+		bm->good_shift[l[i]] = i;
+	ended = 0;
+	for (i = 0; i < bm->patlen; i++) {
+		if (l[i] == bm->patlen - 1 - i)
+			ended = i;
+		if (ended)
+			bm->good_shift[i] = ended;
+	}
+}
+
+static struct ts_config *bm_init(const void *pattern, unsigned int len,
+				 int gfp_mask)
+{
+	struct ts_config *conf;
+	struct ts_bm *bm;
+	unsigned int prefix_tbl_len = len * sizeof(unsigned int);
+	size_t priv_size = sizeof(*bm) + len + prefix_tbl_len;
+
+	conf = alloc_ts_config(priv_size, gfp_mask);
+	if (IS_ERR(conf))
+		return conf;
+
+	bm = ts_config_priv(conf);
+	bm->patlen = len;
+	bm->pattern = (u8 *) bm->good_shift + prefix_tbl_len;
+	compute_prefix_tbl(bm, pattern, len);
+	memcpy(bm->pattern, pattern, len);
+
+	return conf;
+}
+
+static void *bm_get_pattern(struct ts_config *conf)
+{
+	struct ts_bm *bm = ts_config_priv(conf);
+	return bm->pattern;
+}
+
+static unsigned int bm_get_pattern_len(struct ts_config *conf)
+{
+	struct ts_bm *bm = ts_config_priv(conf);
+	return bm->patlen;
+}
+
+static struct ts_ops bm_ops = {
+	.name		  = "bm",
+	.find		  = bm_find,
+	.init		  = bm_init,
+	.get_pattern	  = bm_get_pattern,
+	.get_pattern_len  = bm_get_pattern_len,
+	.owner		  = THIS_MODULE,
+	.list		  = LIST_HEAD_INIT(bm_ops.list)
+};
+
+static int __init init_bm(void)
+{
+	return textsearch_register(&bm_ops);
+}
+
+static void __exit exit_bm(void)
+{
+	textsearch_unregister(&bm_ops);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_bm);
+module_exit(exit_bm);
diff --git a/mm/memory.c b/mm/memory.c
index e046b7e4b53..a596c117224 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -498,6 +498,17 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	unsigned long addr = vma->vm_start;
 	unsigned long end = vma->vm_end;
 
+	/*
+	 * Don't copy ptes where a page fault will fill them correctly.
+	 * Fork becomes much lighter when there are big shared or private
+	 * readonly mappings. The tradeoff is that copy_page_range is more
+	 * efficient than faulting.
+	 */
+	if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_RESERVED))) {
+		if (!vma->anon_vma)
+			return 0;
+	}
+
 	if (is_vm_hugetlb_page(vma))
 		return copy_hugetlb_page_range(dst_mm, src_mm, vma);
 
diff --git a/net/802/fc.c b/net/802/fc.c
index 640d34e026c..282c4ab1abe 100644
--- a/net/802/fc.c
+++ b/net/802/fc.c
@@ -87,7 +87,7 @@ static int fc_rebuild_header(struct sk_buff *skb)
 	struct fch_hdr *fch=(struct fch_hdr *)skb->data;
 	struct fcllc *fcllc=(struct fcllc *)(skb->data+sizeof(struct fch_hdr));
 	if(fcllc->ethertype != htons(ETH_P_IP)) {
-		printk("fc_rebuild_header: Don't know how to resolve type %04X addresses ?\n",(unsigned int)htons(fcllc->ethertype));
+		printk("fc_rebuild_header: Don't know how to resolve type %04X addresses ?\n", ntohs(fcllc->ethertype));
 		return 0;
 	}
 #ifdef CONFIG_INET
diff --git a/net/802/fddi.c b/net/802/fddi.c
index 5ce24c4bb84..ac242a4bc34 100644
--- a/net/802/fddi.c
+++ b/net/802/fddi.c
@@ -108,8 +108,8 @@ static int fddi_rebuild_header(struct sk_buff	*skb)
 	else
 #endif	
 	{
-		printk("%s: Don't know how to resolve type %02X addresses.\n",
-		       skb->dev->name, htons(fddi->hdr.llc_snap.ethertype));
+		printk("%s: Don't know how to resolve type %04X addresses.\n",
+		       skb->dev->name, ntohs(fddi->hdr.llc_snap.ethertype));
 		return(0);
 	}
 }
diff --git a/net/802/hippi.c b/net/802/hippi.c
index 051e8af56a7..6d7fed3dd99 100644
--- a/net/802/hippi.c
+++ b/net/802/hippi.c
@@ -51,6 +51,7 @@ static int hippi_header(struct sk_buff *skb, struct net_device *dev,
 			unsigned len)
 {
 	struct hippi_hdr *hip = (struct hippi_hdr *)skb_push(skb, HIPPI_HLEN);
+	struct hippi_cb *hcb = (struct hippi_cb *) skb->cb;
 
 	if (!len){
 		len = skb->len - HIPPI_HLEN;
@@ -84,9 +85,10 @@ static int hippi_header(struct sk_buff *skb, struct net_device *dev,
 	if (daddr)
 	{
 		memcpy(hip->le.dest_switch_addr, daddr + 3, 3);
-		memcpy(&skb->private.ifield, daddr + 2, 4);
+		memcpy(&hcb->ifield, daddr + 2, 4);
 		return HIPPI_HLEN;
 	}
+	hcb->ifield = 0;
 	return -((int)HIPPI_HLEN);
 }
 
@@ -122,7 +124,7 @@ static int hippi_rebuild_header(struct sk_buff *skb)
  *	Determine the packet's protocol ID.
  */
  
-unsigned short hippi_type_trans(struct sk_buff *skb, struct net_device *dev)
+__be16 hippi_type_trans(struct sk_buff *skb, struct net_device *dev)
 {
 	struct hippi_hdr *hip;
 	
diff --git a/net/802/p8022.c b/net/802/p8022.c
index 5ae63416df6..b24817c63ca 100644
--- a/net/802/p8022.c
+++ b/net/802/p8022.c
@@ -35,7 +35,8 @@ static int p8022_request(struct datalink_proto *dl, struct sk_buff *skb,
 struct datalink_proto *register_8022_client(unsigned char type,
 					    int (*func)(struct sk_buff *skb,
 							struct net_device *dev,
-							struct packet_type *pt))
+							struct packet_type *pt,
+							struct net_device *orig_dev))
 {
 	struct datalink_proto *proto;
 
diff --git a/net/802/p8023.c b/net/802/p8023.c
index a0b61b40225..6368d3dce44 100644
--- a/net/802/p8023.c
+++ b/net/802/p8023.c
@@ -20,6 +20,7 @@
 #include <linux/skbuff.h>
 
 #include <net/datalink.h>
+#include <net/p8022.h>
 
 /*
  *	Place an 802.3 header on a packet. The driver will do the mac
diff --git a/net/802/psnap.c b/net/802/psnap.c
index 1053821ddf9..ab80b1fab53 100644
--- a/net/802/psnap.c
+++ b/net/802/psnap.c
@@ -47,7 +47,7 @@ static struct datalink_proto *find_snap_client(unsigned char *desc)
  *	A SNAP packet has arrived
  */
 static int snap_rcv(struct sk_buff *skb, struct net_device *dev,
-		    struct packet_type *pt)
+		    struct packet_type *pt, struct net_device *orig_dev)
 {
 	int rc = 1;
 	struct datalink_proto *proto;
@@ -61,7 +61,7 @@ static int snap_rcv(struct sk_buff *skb, struct net_device *dev,
 		/* Pass the frame on. */
 		skb->h.raw  += 5;
 		skb_pull(skb, 5);
-		rc = proto->rcvfunc(skb, dev, &snap_packet_type);
+		rc = proto->rcvfunc(skb, dev, &snap_packet_type, orig_dev);
 	} else {
 		skb->sk = NULL;
 		kfree_skb(skb);
@@ -118,7 +118,8 @@ module_exit(snap_exit);
 struct datalink_proto *register_snap_client(unsigned char *desc,
 					    int (*rcvfunc)(struct sk_buff *,
 						    	   struct net_device *,
-							   struct packet_type *))
+							   struct packet_type *,
+							   struct net_device *))
 {
 	struct datalink_proto *proto = NULL;
 
diff --git a/net/802/sysctl_net_802.c b/net/802/sysctl_net_802.c
index 36079630c49..700129556c1 100644
--- a/net/802/sysctl_net_802.c
+++ b/net/802/sysctl_net_802.c
@@ -10,9 +10,10 @@
  *		2 of the License, or (at your option) any later version.
  */
 
+#include <linux/config.h>
 #include <linux/mm.h>
+#include <linux/if_tr.h>
 #include <linux/sysctl.h>
-#include <linux/config.h>
 
 #ifdef CONFIG_TR
 extern int sysctl_tr_rif_timeout;
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index 508b1fa1454..9ae3a14dd01 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -51,7 +51,7 @@ struct net_device *__find_vlan_dev(struct net_device* real_dev,
 /* found in vlan_dev.c */
 int vlan_dev_rebuild_header(struct sk_buff *skb);
 int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev,
-                  struct packet_type* ptype);
+                  struct packet_type *ptype, struct net_device *orig_dev);
 int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev,
                          unsigned short type, void *daddr, void *saddr,
                          unsigned len);
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 49c48741351..145f5cde96c 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -113,7 +113,7 @@ static inline struct sk_buff *vlan_check_reorder_header(struct sk_buff *skb)
  *
  */
 int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev,
-                  struct packet_type* ptype)
+                  struct packet_type* ptype, struct net_device *orig_dev)
 {
 	unsigned char *rawp = NULL;
 	struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data);
diff --git a/net/Kconfig b/net/Kconfig
index 40a31ba86d2..c07aafb59a0 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -147,6 +147,7 @@ source "net/bridge/netfilter/Kconfig"
 
 endif
 
+source "net/dccp/Kconfig"
 source "net/sctp/Kconfig"
 source "net/atm/Kconfig"
 source "net/bridge/Kconfig"
@@ -205,6 +206,8 @@ config NET_PKTGEN
 	  To compile this code as a module, choose M here: the
 	  module will be called pktgen.
 
+source "net/netfilter/Kconfig"
+
 endmenu
 
 endmenu
diff --git a/net/Makefile b/net/Makefile
index 8e2bdc025ab..7e6eff206c8 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -16,6 +16,7 @@ obj-$(CONFIG_NET)		+= $(tmp-y)
 obj-$(CONFIG_LLC)		+= llc/
 obj-$(CONFIG_NET)		+= ethernet/ 802/ sched/ netlink/
 obj-$(CONFIG_INET)		+= ipv4/
+obj-$(CONFIG_NETFILTER)		+= netfilter/
 obj-$(CONFIG_XFRM)		+= xfrm/
 obj-$(CONFIG_UNIX)		+= unix/
 ifneq ($(CONFIG_IPV6),)
@@ -41,6 +42,7 @@ obj-$(CONFIG_ATM)		+= atm/
 obj-$(CONFIG_DECNET)		+= decnet/
 obj-$(CONFIG_ECONET)		+= econet/
 obj-$(CONFIG_VLAN_8021Q)	+= 8021q/
+obj-$(CONFIG_IP_DCCP)		+= dccp/
 obj-$(CONFIG_IP_SCTP)		+= sctp/
 
 ifeq ($(CONFIG_NET),y)
diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c
index c34614ea5fc..7076097debc 100644
--- a/net/appletalk/aarp.c
+++ b/net/appletalk/aarp.c
@@ -698,7 +698,7 @@ static void __aarp_resolved(struct aarp_entry **list, struct aarp_entry *a,
  *	frame. We currently only support Ethernet.
  */
 static int aarp_rcv(struct sk_buff *skb, struct net_device *dev,
-		    struct packet_type *pt)
+		    struct packet_type *pt, struct net_device *orig_dev)
 {
 	struct elapaarp *ea = aarp_hdr(skb);
 	int hash, ret = 0;
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 192b529f86a..1d31b3a3f1e 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -53,12 +53,12 @@
 
 #include <linux/config.h>
 #include <linux/module.h>
-#include <linux/tcp.h>
 #include <linux/if_arp.h>
 #include <linux/termios.h>	/* For TIOCOUTQ/INQ */
 #include <net/datalink.h>
 #include <net/psnap.h>
 #include <net/sock.h>
+#include <net/tcp_states.h>
 #include <net/route.h>
 #include <linux/atalk.h>
 
@@ -1390,7 +1390,7 @@ free_it:
  *	[ie ARPHRD_ETHERTALK]
  */
 static int atalk_rcv(struct sk_buff *skb, struct net_device *dev,
-		     struct packet_type *pt)
+		     struct packet_type *pt, struct net_device *orig_dev)
 {
 	struct ddpehdr *ddp;
 	struct sock *sock;
@@ -1482,7 +1482,7 @@ freeit:
  * header and append a long one.
  */
 static int ltalk_rcv(struct sk_buff *skb, struct net_device *dev,
-			struct packet_type *pt)
+		     struct packet_type *pt, struct net_device *orig_dev)
 {
 	/* Expand any short form frames */
 	if (skb->mac.raw[2] == 1) {
@@ -1528,7 +1528,7 @@ static int ltalk_rcv(struct sk_buff *skb, struct net_device *dev,
 	}
 	skb->h.raw = skb->data;
 
-	return atalk_rcv(skb, dev, pt);
+	return atalk_rcv(skb, dev, pt, orig_dev);
 freeit:
 	kfree_skb(skb);
 	return 0;
diff --git a/net/atm/ipcommon.c b/net/atm/ipcommon.c
index 181a3002d8a..4b1faca5013 100644
--- a/net/atm/ipcommon.c
+++ b/net/atm/ipcommon.c
@@ -34,7 +34,6 @@
 
 void skb_migrate(struct sk_buff_head *from,struct sk_buff_head *to)
 {
-	struct sk_buff *skb;
 	unsigned long flags;
 	struct sk_buff *skb_from = (struct sk_buff *) from;
 	struct sk_buff *skb_to = (struct sk_buff *) to;
@@ -47,8 +46,6 @@ void skb_migrate(struct sk_buff_head *from,struct sk_buff_head *to)
 	prev->next = skb_to;
 	to->prev->next = from->next;
 	to->prev = from->prev;
-	for (skb = from->next; skb != skb_to; skb = skb->next)
-		skb->list = to;
 	to->qlen += from->qlen;
 	spin_unlock(&to->lock);
 	from->prev = skb_from;
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index a5c94f11547..ea43dfb774e 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -45,7 +45,7 @@
 #include <linux/sysctl.h>
 #include <linux/init.h>
 #include <linux/spinlock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
 #include <net/ip.h>
 #include <net/arp.h>
 
diff --git a/net/ax25/ax25_ds_in.c b/net/ax25/ax25_ds_in.c
index 8adc0022cf5..edcaa897027 100644
--- a/net/ax25/ax25_ds_in.c
+++ b/net/ax25/ax25_ds_in.c
@@ -22,8 +22,7 @@
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>
 #include <net/sock.h>
-#include <net/ip.h>			/* For ip_rcv */
-#include <net/tcp.h>
+#include <net/tcp_states.h>
 #include <asm/uaccess.h>
 #include <asm/system.h>
 #include <linux/fcntl.h>
diff --git a/net/ax25/ax25_ds_timer.c b/net/ax25/ax25_ds_timer.c
index 3a8b67316fc..061083efc1d 100644
--- a/net/ax25/ax25_ds_timer.c
+++ b/net/ax25/ax25_ds_timer.c
@@ -18,7 +18,7 @@
 #include <linux/string.h>
 #include <linux/sockios.h>
 #include <linux/net.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
 #include <net/ax25.h>
 #include <linux/inet.h>
 #include <linux/netdevice.h>
diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c
index 3dc808fde33..810c9c76c2e 100644
--- a/net/ax25/ax25_in.c
+++ b/net/ax25/ax25_in.c
@@ -9,7 +9,6 @@
  * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de)
  * Copyright (C) Hans-Joachim Hetscher DD8NE (dd8ne@bnv-bamberg.de)
  */
-#include <linux/config.h>
 #include <linux/errno.h>
 #include <linux/types.h>
 #include <linux/socket.h>
@@ -26,9 +25,7 @@
 #include <linux/skbuff.h>
 #include <linux/netfilter.h>
 #include <net/sock.h>
-#include <net/ip.h>			/* For ip_rcv */
-#include <net/tcp.h>
-#include <net/arp.h>			/* For arp_rcv */
+#include <net/tcp_states.h>
 #include <asm/uaccess.h>
 #include <asm/system.h>
 #include <linux/fcntl.h>
@@ -114,7 +111,6 @@ int ax25_rx_iframe(ax25_cb *ax25, struct sk_buff *skb)
 
 	pid = *skb->data;
 
-#ifdef CONFIG_INET
 	if (pid == AX25_P_IP) {
 		/* working around a TCP bug to keep additional listeners
 		 * happy. TCP re-uses the buffer and destroys the original
@@ -132,10 +128,9 @@ int ax25_rx_iframe(ax25_cb *ax25, struct sk_buff *skb)
 		skb->dev      = ax25->ax25_dev->dev;
 		skb->pkt_type = PACKET_HOST;
 		skb->protocol = htons(ETH_P_IP);
-		ip_rcv(skb, skb->dev, NULL);	/* Wrong ptype */
+		netif_rx(skb);
 		return 1;
 	}
-#endif
 	if (pid == AX25_P_SEGMENT) {
 		skb_pull(skb, 1);	/* Remove PID */
 		return ax25_rx_fragment(ax25, skb);
@@ -250,7 +245,6 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev,
 
 		/* Now we are pointing at the pid byte */
 		switch (skb->data[1]) {
-#ifdef CONFIG_INET
 		case AX25_P_IP:
 			skb_pull(skb,2);		/* drop PID/CTRL */
 			skb->h.raw    = skb->data;
@@ -258,7 +252,7 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev,
 			skb->dev      = dev;
 			skb->pkt_type = PACKET_HOST;
 			skb->protocol = htons(ETH_P_IP);
-			ip_rcv(skb, dev, ptype);	/* Note ptype here is the wrong one, fix me later */
+			netif_rx(skb);
 			break;
 
 		case AX25_P_ARP:
@@ -268,9 +262,8 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev,
 			skb->dev      = dev;
 			skb->pkt_type = PACKET_HOST;
 			skb->protocol = htons(ETH_P_ARP);
-			arp_rcv(skb, dev, ptype);	/* Note ptype here is wrong... */
+			netif_rx(skb);
 			break;
-#endif
 		case AX25_P_TEXT:
 			/* Now find a suitable dgram socket */
 			sk = ax25_get_socket(&dest, &src, SOCK_DGRAM);
@@ -454,7 +447,7 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev,
  *	Receive an AX.25 frame via a SLIP interface.
  */
 int ax25_kiss_rcv(struct sk_buff *skb, struct net_device *dev,
-		  struct packet_type *ptype)
+		  struct packet_type *ptype, struct net_device *orig_dev)
 {
 	skb->sk = NULL;		/* Initially we don't know who it's for */
 	skb->destructor = NULL;	/* Who initializes this, dammit?! */
diff --git a/net/ax25/ax25_std_in.c b/net/ax25/ax25_std_in.c
index 7131873322c..f6ed283e9de 100644
--- a/net/ax25/ax25_std_in.c
+++ b/net/ax25/ax25_std_in.c
@@ -29,8 +29,7 @@
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>
 #include <net/sock.h>
-#include <net/ip.h>			/* For ip_rcv */
-#include <net/tcp.h>
+#include <net/tcp_states.h>
 #include <asm/uaccess.h>
 #include <asm/system.h>
 #include <linux/fcntl.h>
diff --git a/net/ax25/ax25_std_timer.c b/net/ax25/ax25_std_timer.c
index 066897bc074..a29c480a4dc 100644
--- a/net/ax25/ax25_std_timer.c
+++ b/net/ax25/ax25_std_timer.c
@@ -24,7 +24,7 @@
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>
 #include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
 #include <asm/uaccess.h>
 #include <asm/system.h>
 #include <linux/fcntl.h>
diff --git a/net/ax25/ax25_subr.c b/net/ax25/ax25_subr.c
index 99694b57f6f..c41dbe5fade 100644
--- a/net/ax25/ax25_subr.c
+++ b/net/ax25/ax25_subr.c
@@ -24,7 +24,7 @@
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>
 #include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
 #include <asm/uaccess.h>
 #include <asm/system.h>
 #include <linux/fcntl.h>
@@ -76,7 +76,7 @@ void ax25_requeue_frames(ax25_cb *ax25)
 		if (skb_prev == NULL)
 			skb_queue_head(&ax25->write_queue, skb);
 		else
-			skb_append(skb_prev, skb);
+			skb_append(skb_prev, skb, &ax25->write_queue);
 		skb_prev = skb;
 	}
 }
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index ffa26c10bfe..55dc42eac92 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -191,7 +191,7 @@ static void hci_init_req(struct hci_dev *hdev, unsigned long opt)
 
 	/* Special commands */
 	while ((skb = skb_dequeue(&hdev->driver_init))) {
-		skb->pkt_type = HCI_COMMAND_PKT;
+		bt_cb(skb)->pkt_type = HCI_COMMAND_PKT;
 		skb->dev = (void *) hdev;
 		skb_queue_tail(&hdev->cmd_q, skb);
 		hci_sched_cmd(hdev);
@@ -995,11 +995,11 @@ static int hci_send_frame(struct sk_buff *skb)
 		return -ENODEV;
 	}
 
-	BT_DBG("%s type %d len %d", hdev->name, skb->pkt_type, skb->len);
+	BT_DBG("%s type %d len %d", hdev->name, bt_cb(skb)->pkt_type, skb->len);
 
 	if (atomic_read(&hdev->promisc)) {
 		/* Time stamp */
-		do_gettimeofday(&skb->stamp);
+		__net_timestamp(skb);
 
 		hci_send_to_sock(hdev, skb);
 	}
@@ -1034,7 +1034,7 @@ int hci_send_cmd(struct hci_dev *hdev, __u16 ogf, __u16 ocf, __u32 plen, void *p
 
 	BT_DBG("skb len %d", skb->len);
 
-	skb->pkt_type = HCI_COMMAND_PKT;
+	bt_cb(skb)->pkt_type = HCI_COMMAND_PKT;
 	skb->dev = (void *) hdev;
 	skb_queue_tail(&hdev->cmd_q, skb);
 	hci_sched_cmd(hdev);
@@ -1081,7 +1081,7 @@ int hci_send_acl(struct hci_conn *conn, struct sk_buff *skb, __u16 flags)
 	BT_DBG("%s conn %p flags 0x%x", hdev->name, conn, flags);
 
 	skb->dev = (void *) hdev;
-	skb->pkt_type = HCI_ACLDATA_PKT;
+	bt_cb(skb)->pkt_type = HCI_ACLDATA_PKT;
 	hci_add_acl_hdr(skb, conn->handle, flags | ACL_START);
 
 	if (!(list = skb_shinfo(skb)->frag_list)) {
@@ -1103,7 +1103,7 @@ int hci_send_acl(struct hci_conn *conn, struct sk_buff *skb, __u16 flags)
 			skb = list; list = list->next;
 			
 			skb->dev = (void *) hdev;
-			skb->pkt_type = HCI_ACLDATA_PKT;
+			bt_cb(skb)->pkt_type = HCI_ACLDATA_PKT;
 			hci_add_acl_hdr(skb, conn->handle, flags | ACL_CONT);
 
 			BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len);
@@ -1139,7 +1139,7 @@ int hci_send_sco(struct hci_conn *conn, struct sk_buff *skb)
 	memcpy(skb->h.raw, &hdr, HCI_SCO_HDR_SIZE);
 
 	skb->dev = (void *) hdev;
-	skb->pkt_type = HCI_SCODATA_PKT;
+	bt_cb(skb)->pkt_type = HCI_SCODATA_PKT;
 	skb_queue_tail(&conn->data_q, skb);
 	hci_sched_tx(hdev);
 	return 0;
@@ -1369,7 +1369,7 @@ void hci_rx_task(unsigned long arg)
 
 		if (test_bit(HCI_INIT, &hdev->flags)) {
 			/* Don't process data packets in this states. */
-			switch (skb->pkt_type) {
+			switch (bt_cb(skb)->pkt_type) {
 			case HCI_ACLDATA_PKT:
 			case HCI_SCODATA_PKT:
 				kfree_skb(skb);
@@ -1378,7 +1378,7 @@ void hci_rx_task(unsigned long arg)
 		}
 
 		/* Process frame */
-		switch (skb->pkt_type) {
+		switch (bt_cb(skb)->pkt_type) {
 		case HCI_EVENT_PKT:
 			hci_event_packet(hdev, skb);
 			break;
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 46367bd129c..d6da0939216 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -484,14 +484,18 @@ static inline void hci_inquiry_complete_evt(struct hci_dev *hdev, struct sk_buff
 /* Inquiry Result */
 static inline void hci_inquiry_result_evt(struct hci_dev *hdev, struct sk_buff *skb)
 {
+	struct inquiry_data data;
 	struct inquiry_info *info = (struct inquiry_info *) (skb->data + 1);
 	int num_rsp = *((__u8 *) skb->data);
 
 	BT_DBG("%s num_rsp %d", hdev->name, num_rsp);
 
+	if (!num_rsp)
+		return;
+
 	hci_dev_lock(hdev);
+
 	for (; num_rsp; num_rsp--) {
-		struct inquiry_data data;
 		bacpy(&data.bdaddr, &info->bdaddr);
 		data.pscan_rep_mode	= info->pscan_rep_mode;
 		data.pscan_period_mode	= info->pscan_period_mode;
@@ -502,30 +506,55 @@ static inline void hci_inquiry_result_evt(struct hci_dev *hdev, struct sk_buff *
 		info++;
 		hci_inquiry_cache_update(hdev, &data);
 	}
+
 	hci_dev_unlock(hdev);
 }
 
 /* Inquiry Result With RSSI */
 static inline void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, struct sk_buff *skb)
 {
-	struct inquiry_info_with_rssi *info = (struct inquiry_info_with_rssi *) (skb->data + 1);
+	struct inquiry_data data;
 	int num_rsp = *((__u8 *) skb->data);
 
 	BT_DBG("%s num_rsp %d", hdev->name, num_rsp);
 
+	if (!num_rsp)
+		return;
+
 	hci_dev_lock(hdev);
-	for (; num_rsp; num_rsp--) {
-		struct inquiry_data data;
-		bacpy(&data.bdaddr, &info->bdaddr);
-		data.pscan_rep_mode	= info->pscan_rep_mode;
-		data.pscan_period_mode	= info->pscan_period_mode;
-		data.pscan_mode		= 0x00;
-		memcpy(data.dev_class, info->dev_class, 3);
-		data.clock_offset	= info->clock_offset;
-		data.rssi		= info->rssi;
-		info++;
-		hci_inquiry_cache_update(hdev, &data);
+
+	if ((skb->len - 1) / num_rsp != sizeof(struct inquiry_info_with_rssi)) {
+		struct inquiry_info_with_rssi_and_pscan_mode *info =
+			(struct inquiry_info_with_rssi_and_pscan_mode *) (skb->data + 1);
+
+		for (; num_rsp; num_rsp--) {
+			bacpy(&data.bdaddr, &info->bdaddr);
+			data.pscan_rep_mode	= info->pscan_rep_mode;
+			data.pscan_period_mode	= info->pscan_period_mode;
+			data.pscan_mode		= info->pscan_mode;
+			memcpy(data.dev_class, info->dev_class, 3);
+			data.clock_offset	= info->clock_offset;
+			data.rssi		= info->rssi;
+			info++;
+			hci_inquiry_cache_update(hdev, &data);
+		}
+	} else {
+		struct inquiry_info_with_rssi *info =
+			(struct inquiry_info_with_rssi *) (skb->data + 1);
+
+		for (; num_rsp; num_rsp--) {
+			bacpy(&data.bdaddr, &info->bdaddr);
+			data.pscan_rep_mode	= info->pscan_rep_mode;
+			data.pscan_period_mode	= info->pscan_period_mode;
+			data.pscan_mode		= 0x00;
+			memcpy(data.dev_class, info->dev_class, 3);
+			data.clock_offset	= info->clock_offset;
+			data.rssi		= info->rssi;
+			info++;
+			hci_inquiry_cache_update(hdev, &data);
+		}
 	}
+
 	hci_dev_unlock(hdev);
 }
 
@@ -865,6 +894,24 @@ static inline void hci_clock_offset_evt(struct hci_dev *hdev, struct sk_buff *sk
 	hci_dev_unlock(hdev);
 }
 
+/* Page Scan Repetition Mode */
+static inline void hci_pscan_rep_mode_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_ev_pscan_rep_mode *ev = (struct hci_ev_pscan_rep_mode *) skb->data;
+	struct inquiry_entry *ie;
+
+	BT_DBG("%s", hdev->name);
+
+	hci_dev_lock(hdev);
+
+	if ((ie = hci_inquiry_cache_lookup(hdev, &ev->bdaddr))) {
+		ie->data.pscan_rep_mode = ev->pscan_rep_mode;
+		ie->timestamp = jiffies;
+	}
+
+	hci_dev_unlock(hdev);
+}
+
 void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb)
 {
 	struct hci_event_hdr *hdr = (struct hci_event_hdr *) skb->data;
@@ -937,6 +984,10 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb)
 		hci_clock_offset_evt(hdev, skb);
 		break;
 
+	case HCI_EV_PSCAN_REP_MODE:
+		hci_pscan_rep_mode_evt(hdev, skb);
+		break;
+
 	case HCI_EV_CMD_STATUS:
 		cs = (struct hci_ev_cmd_status *) skb->data;
 		skb_pull(skb, sizeof(cs));
@@ -1036,9 +1087,9 @@ void hci_si_event(struct hci_dev *hdev, int type, int dlen, void *data)
 	memcpy(ev->data, data, dlen);
 
 	bt_cb(skb)->incoming = 1;
-	do_gettimeofday(&skb->stamp);
+	__net_timestamp(skb);
 
-	skb->pkt_type = HCI_EVENT_PKT;
+	bt_cb(skb)->pkt_type = HCI_EVENT_PKT;
 	skb->dev = (void *) hdev;
 	hci_send_to_sock(hdev, skb);
 	kfree_skb(skb);
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index ebdcce5e7ca..32ef7975a13 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -110,11 +110,11 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb)
 		/* Apply filter */
 		flt = &hci_pi(sk)->filter;
 
-		if (!test_bit((skb->pkt_type == HCI_VENDOR_PKT) ?
-				0 : (skb->pkt_type & HCI_FLT_TYPE_BITS), &flt->type_mask))
+		if (!test_bit((bt_cb(skb)->pkt_type == HCI_VENDOR_PKT) ?
+				0 : (bt_cb(skb)->pkt_type & HCI_FLT_TYPE_BITS), &flt->type_mask))
 			continue;
 
-		if (skb->pkt_type == HCI_EVENT_PKT) {
+		if (bt_cb(skb)->pkt_type == HCI_EVENT_PKT) {
 			register int evt = (*(__u8 *)skb->data & HCI_FLT_EVENT_BITS);
 
 			if (!hci_test_bit(evt, &flt->event_mask))
@@ -131,7 +131,7 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb)
 			continue;
 
 		/* Put type byte before the data */
-		memcpy(skb_push(nskb, 1), &nskb->pkt_type, 1);
+		memcpy(skb_push(nskb, 1), &bt_cb(nskb)->pkt_type, 1);
 
 		if (sock_queue_rcv_skb(sk, nskb))
 			kfree_skb(nskb);
@@ -327,11 +327,17 @@ static inline void hci_sock_cmsg(struct sock *sk, struct msghdr *msg, struct sk_
 {
 	__u32 mask = hci_pi(sk)->cmsg_mask;
 
-	if (mask & HCI_CMSG_DIR)
-		put_cmsg(msg, SOL_HCI, HCI_CMSG_DIR, sizeof(int), &bt_cb(skb)->incoming);
+	if (mask & HCI_CMSG_DIR) {
+		int incoming = bt_cb(skb)->incoming;
+		put_cmsg(msg, SOL_HCI, HCI_CMSG_DIR, sizeof(incoming), &incoming);
+	}
+
+	if (mask & HCI_CMSG_TSTAMP) {
+		struct timeval tv;
 
-	if (mask & HCI_CMSG_TSTAMP)
-		put_cmsg(msg, SOL_HCI, HCI_CMSG_TSTAMP, sizeof(skb->stamp), &skb->stamp);
+		skb_get_timestamp(skb, &tv);
+		put_cmsg(msg, SOL_HCI, HCI_CMSG_TSTAMP, sizeof(tv), &tv);
+	}
 }
  
 static int hci_sock_recvmsg(struct kiocb *iocb, struct socket *sock, 
@@ -405,11 +411,11 @@ static int hci_sock_sendmsg(struct kiocb *iocb, struct socket *sock,
 		goto drop;
 	}
 
-	skb->pkt_type = *((unsigned char *) skb->data);
+	bt_cb(skb)->pkt_type = *((unsigned char *) skb->data);
 	skb_pull(skb, 1);
 	skb->dev = (void *) hdev;
 
-	if (skb->pkt_type == HCI_COMMAND_PKT) {
+	if (bt_cb(skb)->pkt_type == HCI_COMMAND_PKT) {
 		u16 opcode = __le16_to_cpu(get_unaligned((u16 *)skb->data));
 		u16 ogf = hci_opcode_ogf(opcode);
 		u16 ocf = hci_opcode_ocf(opcode);
diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c
index 32fccfb5bfa..d3d6bc54721 100644
--- a/net/bluetooth/l2cap.c
+++ b/net/bluetooth/l2cap.c
@@ -372,7 +372,7 @@ static struct proto l2cap_proto = {
 	.obj_size	= sizeof(struct l2cap_pinfo)
 };
 
-static struct sock *l2cap_sock_alloc(struct socket *sock, int proto, int prio)
+static struct sock *l2cap_sock_alloc(struct socket *sock, int proto, unsigned int __nocast prio)
 {
 	struct sock *sk;
 
diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c
index 27bf5047cd3..173f46e8cda 100644
--- a/net/bluetooth/rfcomm/core.c
+++ b/net/bluetooth/rfcomm/core.c
@@ -21,10 +21,6 @@
    SOFTWARE IS DISCLAIMED.
 */
 
-/* 
-   RPN support    -    Dirk Husemann <hud@zurich.ibm.com>
-*/
-
 /*
  * Bluetooth RFCOMM core.
  *
@@ -115,10 +111,10 @@ static void rfcomm_session_del(struct rfcomm_session *s);
 #define __get_mcc_len(b)  ((b & 0xfe) >> 1)
 
 /* RPN macros */
-#define __rpn_line_settings(data, stop, parity)  ((data & 0x3) | ((stop & 0x1) << 2) | ((parity & 0x3) << 3))
+#define __rpn_line_settings(data, stop, parity)  ((data & 0x3) | ((stop & 0x1) << 2) | ((parity & 0x7) << 3))
 #define __get_rpn_data_bits(line) ((line) & 0x3)
 #define __get_rpn_stop_bits(line) (((line) >> 2) & 0x1)
-#define __get_rpn_parity(line)    (((line) >> 3) & 0x3)
+#define __get_rpn_parity(line)    (((line) >> 3) & 0x7)
 
 static inline void rfcomm_schedule(uint event)
 {
@@ -233,7 +229,7 @@ static void rfcomm_dlc_clear_state(struct rfcomm_dlc *d)
 	d->rx_credits = RFCOMM_DEFAULT_CREDITS;
 }
 
-struct rfcomm_dlc *rfcomm_dlc_alloc(int prio)
+struct rfcomm_dlc *rfcomm_dlc_alloc(unsigned int __nocast prio)
 {
 	struct rfcomm_dlc *d = kmalloc(sizeof(*d), prio);
 	if (!d)
@@ -780,10 +776,10 @@ static int rfcomm_send_pn(struct rfcomm_session *s, int cr, struct rfcomm_dlc *d
 	return rfcomm_send_frame(s, buf, ptr - buf);
 }
 
-static int rfcomm_send_rpn(struct rfcomm_session *s, int cr, u8 dlci,
-			   u8 bit_rate, u8 data_bits, u8 stop_bits,
-			   u8 parity, u8 flow_ctrl_settings, 
-			   u8 xon_char, u8 xoff_char, u16 param_mask)
+int rfcomm_send_rpn(struct rfcomm_session *s, int cr, u8 dlci,
+			u8 bit_rate, u8 data_bits, u8 stop_bits,
+			u8 parity, u8 flow_ctrl_settings, 
+			u8 xon_char, u8 xoff_char, u16 param_mask)
 {
 	struct rfcomm_hdr *hdr;
 	struct rfcomm_mcc *mcc;
@@ -791,9 +787,9 @@ static int rfcomm_send_rpn(struct rfcomm_session *s, int cr, u8 dlci,
 	u8 buf[16], *ptr = buf;
 
 	BT_DBG("%p cr %d dlci %d bit_r 0x%x data_b 0x%x stop_b 0x%x parity 0x%x"
-	       "flwc_s 0x%x xon_c 0x%x xoff_c 0x%x p_mask 0x%x", 
-			s, cr, dlci, bit_rate, data_bits, stop_bits, parity, 
-			flow_ctrl_settings, xon_char, xoff_char, param_mask);
+			" flwc_s 0x%x xon_c 0x%x xoff_c 0x%x p_mask 0x%x", 
+		s, cr, dlci, bit_rate, data_bits, stop_bits, parity, 
+		flow_ctrl_settings, xon_char, xoff_char, param_mask);
 
 	hdr = (void *) ptr; ptr += sizeof(*hdr);
 	hdr->addr = __addr(s->initiator, 0);
@@ -1265,16 +1261,16 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
 	u8 xon_char  = 0;
 	u8 xoff_char = 0;
 	u16 rpn_mask = RFCOMM_RPN_PM_ALL;
-	
-	BT_DBG("dlci %d cr %d len 0x%x bitr 0x%x line 0x%x flow 0x%x xonc 0x%x xoffc 0x%x pm 0x%x", 
-	       dlci, cr, len, rpn->bit_rate, rpn->line_settings, rpn->flow_ctrl,
-	       rpn->xon_char, rpn->xoff_char, rpn->param_mask);
-	
-	if (!cr) 
+
+	BT_DBG("dlci %d cr %d len 0x%x bitr 0x%x line 0x%x flow 0x%x xonc 0x%x xoffc 0x%x pm 0x%x",
+		dlci, cr, len, rpn->bit_rate, rpn->line_settings, rpn->flow_ctrl,
+		rpn->xon_char, rpn->xoff_char, rpn->param_mask);
+
+	if (!cr)
 		return 0;
-	
+
 	if (len == 1) {
-		/* request: return default setting */
+		/* This is a request, return default settings */
 		bit_rate  = RFCOMM_RPN_BR_115200;
 		data_bits = RFCOMM_RPN_DATA_8;
 		stop_bits = RFCOMM_RPN_STOP_1;
@@ -1282,11 +1278,12 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
 		flow_ctrl = RFCOMM_RPN_FLOW_NONE;
 		xon_char  = RFCOMM_RPN_XON_CHAR;
 		xoff_char = RFCOMM_RPN_XOFF_CHAR;
-
 		goto rpn_out;
 	}
-	/* check for sane values: ignore/accept bit_rate, 8 bits, 1 stop bit, no parity,
-	                          no flow control lines, normal XON/XOFF chars */
+
+	/* Check for sane values, ignore/accept bit_rate, 8 bits, 1 stop bit,
+	 * no parity, no flow control lines, normal XON/XOFF chars */
+
 	if (rpn->param_mask & RFCOMM_RPN_PM_BITRATE) {
 		bit_rate = rpn->bit_rate;
 		if (bit_rate != RFCOMM_RPN_BR_115200) {
@@ -1295,6 +1292,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
 			rpn_mask ^= RFCOMM_RPN_PM_BITRATE;
 		}
 	}
+
 	if (rpn->param_mask & RFCOMM_RPN_PM_DATA) {
 		data_bits = __get_rpn_data_bits(rpn->line_settings);
 		if (data_bits != RFCOMM_RPN_DATA_8) {
@@ -1303,6 +1301,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
 			rpn_mask ^= RFCOMM_RPN_PM_DATA;
 		}
 	}
+
 	if (rpn->param_mask & RFCOMM_RPN_PM_STOP) {
 		stop_bits = __get_rpn_stop_bits(rpn->line_settings);
 		if (stop_bits != RFCOMM_RPN_STOP_1) {
@@ -1311,6 +1310,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
 			rpn_mask ^= RFCOMM_RPN_PM_STOP;
 		}
 	}
+
 	if (rpn->param_mask & RFCOMM_RPN_PM_PARITY) {
 		parity = __get_rpn_parity(rpn->line_settings);
 		if (parity != RFCOMM_RPN_PARITY_NONE) {
@@ -1319,6 +1319,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
 			rpn_mask ^= RFCOMM_RPN_PM_PARITY;
 		}
 	}
+
 	if (rpn->param_mask & RFCOMM_RPN_PM_FLOW) {
 		flow_ctrl = rpn->flow_ctrl;
 		if (flow_ctrl != RFCOMM_RPN_FLOW_NONE) {
@@ -1327,6 +1328,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
 			rpn_mask ^= RFCOMM_RPN_PM_FLOW;
 		}
 	}
+
 	if (rpn->param_mask & RFCOMM_RPN_PM_XON) {
 		xon_char = rpn->xon_char;
 		if (xon_char != RFCOMM_RPN_XON_CHAR) {
@@ -1335,6 +1337,7 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
 			rpn_mask ^= RFCOMM_RPN_PM_XON;
 		}
 	}
+
 	if (rpn->param_mask & RFCOMM_RPN_PM_XOFF) {
 		xoff_char = rpn->xoff_char;
 		if (xoff_char != RFCOMM_RPN_XOFF_CHAR) {
@@ -1345,9 +1348,8 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
 	}
 
 rpn_out:
-	rfcomm_send_rpn(s, 0, dlci, 
-			bit_rate, data_bits, stop_bits, parity, flow_ctrl,
-			xon_char, xoff_char, rpn_mask);
+	rfcomm_send_rpn(s, 0, dlci, bit_rate, data_bits, stop_bits,
+			parity, flow_ctrl, xon_char, xoff_char, rpn_mask);
 
 	return 0;
 }
@@ -1358,14 +1360,13 @@ static int rfcomm_recv_rls(struct rfcomm_session *s, int cr, struct sk_buff *skb
 	u8 dlci = __get_dlci(rls->dlci);
 
 	BT_DBG("dlci %d cr %d status 0x%x", dlci, cr, rls->status);
-	
+
 	if (!cr)
 		return 0;
 
-	/* FIXME: We should probably do something with this
-	   information here. But for now it's sufficient just
-	   to reply -- Bluetooth 1.1 says it's mandatory to 
-	   recognise and respond to RLS */
+	/* We should probably do something with this information here. But
+	 * for now it's sufficient just to reply -- Bluetooth 1.1 says it's
+	 * mandatory to recognise and respond to RLS */
 
 	rfcomm_send_rls(s, 0, dlci, rls->status);
 
@@ -1381,7 +1382,7 @@ static int rfcomm_recv_msc(struct rfcomm_session *s, int cr, struct sk_buff *skb
 	BT_DBG("dlci %d cr %d v24 0x%x", dlci, cr, msc->v24_sig);
 
 	d = rfcomm_dlc_get(s, dlci);
-	if (!d) 
+	if (!d)
 		return 0;
 
 	if (cr) {
@@ -1389,7 +1390,7 @@ static int rfcomm_recv_msc(struct rfcomm_session *s, int cr, struct sk_buff *skb
 			set_bit(RFCOMM_TX_THROTTLED, &d->flags);
 		else
 			clear_bit(RFCOMM_TX_THROTTLED, &d->flags);
-		
+
 		rfcomm_dlc_lock(d);
 		if (d->modem_status)
 			d->modem_status(d, msc->v24_sig);
@@ -1398,7 +1399,7 @@ static int rfcomm_recv_msc(struct rfcomm_session *s, int cr, struct sk_buff *skb
 		rfcomm_send_msc(s, 0, dlci, msc->v24_sig);
 
 		d->mscex |= RFCOMM_MSCEX_RX;
-	} else 
+	} else
 		d->mscex |= RFCOMM_MSCEX_TX;
 
 	return 0;
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index 63a123c5c41..90e19eb6d3c 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -284,7 +284,7 @@ static struct proto rfcomm_proto = {
 	.obj_size	= sizeof(struct rfcomm_pinfo)
 };
 
-static struct sock *rfcomm_sock_alloc(struct socket *sock, int proto, int prio)
+static struct sock *rfcomm_sock_alloc(struct socket *sock, int proto, unsigned int __nocast prio)
 {
 	struct rfcomm_dlc *d;
 	struct sock *sk;
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index 6304590fd36..1bca860a610 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -286,7 +286,7 @@ static inline void rfcomm_set_owner_w(struct sk_buff *skb, struct rfcomm_dev *de
 	skb->destructor = rfcomm_wfree;
 }
 
-static struct sk_buff *rfcomm_wmalloc(struct rfcomm_dev *dev, unsigned long size, int priority)
+static struct sk_buff *rfcomm_wmalloc(struct rfcomm_dev *dev, unsigned long size, unsigned int __nocast priority)
 {
 	if (atomic_read(&dev->wmem_alloc) < rfcomm_room(dev->dlc)) {
 		struct sk_buff *skb = alloc_skb(size, priority);
@@ -528,9 +528,14 @@ static void rfcomm_dev_modem_status(struct rfcomm_dlc *dlc, u8 v24_sig)
 	struct rfcomm_dev *dev = dlc->owner;
 	if (!dev)
 		return;
-	
+
 	BT_DBG("dlc %p dev %p v24_sig 0x%02x", dlc, dev, v24_sig);
 
+	if ((dev->modem_status & TIOCM_CD) && !(v24_sig & RFCOMM_V24_DV)) {
+		if (dev->tty && !C_CLOCAL(dev->tty))
+			tty_hangup(dev->tty);
+	}
+
 	dev->modem_status = 
 		((v24_sig & RFCOMM_V24_RTC) ? (TIOCM_DSR | TIOCM_DTR) : 0) |
 		((v24_sig & RFCOMM_V24_RTR) ? (TIOCM_RTS | TIOCM_CTS) : 0) |
@@ -740,20 +745,143 @@ static int rfcomm_tty_ioctl(struct tty_struct *tty, struct file *filp, unsigned
 	return -ENOIOCTLCMD;
 }
 
-#define RELEVANT_IFLAG(iflag) (iflag & (IGNBRK|BRKINT|IGNPAR|PARMRK|INPCK))
-
 static void rfcomm_tty_set_termios(struct tty_struct *tty, struct termios *old)
 {
-	BT_DBG("tty %p", tty);
+	struct termios *new = (struct termios *) tty->termios;
+	int old_baud_rate = tty_termios_baud_rate(old);
+	int new_baud_rate = tty_termios_baud_rate(new);
 
-	if ((tty->termios->c_cflag == old->c_cflag) &&
-		(RELEVANT_IFLAG(tty->termios->c_iflag) == RELEVANT_IFLAG(old->c_iflag)))
-		return;
+	u8 baud, data_bits, stop_bits, parity, x_on, x_off;
+	u16 changes = 0;
+
+	struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+
+	BT_DBG("tty %p termios %p", tty, old);
+
+	/* Handle turning off CRTSCTS */
+	if ((old->c_cflag & CRTSCTS) && !(new->c_cflag & CRTSCTS)) 
+		BT_DBG("Turning off CRTSCTS unsupported");
+
+	/* Parity on/off and when on, odd/even */
+	if (((old->c_cflag & PARENB) != (new->c_cflag & PARENB)) ||
+			((old->c_cflag & PARODD) != (new->c_cflag & PARODD)) ) {
+		changes |= RFCOMM_RPN_PM_PARITY;
+		BT_DBG("Parity change detected.");
+	}
+
+	/* Mark and space parity are not supported! */
+	if (new->c_cflag & PARENB) {
+		if (new->c_cflag & PARODD) {
+			BT_DBG("Parity is ODD");
+			parity = RFCOMM_RPN_PARITY_ODD;
+		} else {
+			BT_DBG("Parity is EVEN");
+			parity = RFCOMM_RPN_PARITY_EVEN;
+		}
+	} else {
+		BT_DBG("Parity is OFF");
+		parity = RFCOMM_RPN_PARITY_NONE;
+	}
+
+	/* Setting the x_on / x_off characters */
+	if (old->c_cc[VSTOP] != new->c_cc[VSTOP]) {
+		BT_DBG("XOFF custom");
+		x_on = new->c_cc[VSTOP];
+		changes |= RFCOMM_RPN_PM_XON;
+	} else {
+		BT_DBG("XOFF default");
+		x_on = RFCOMM_RPN_XON_CHAR;
+	}
+
+	if (old->c_cc[VSTART] != new->c_cc[VSTART]) {
+		BT_DBG("XON custom");
+		x_off = new->c_cc[VSTART];
+		changes |= RFCOMM_RPN_PM_XOFF;
+	} else {
+		BT_DBG("XON default");
+		x_off = RFCOMM_RPN_XOFF_CHAR;
+	}
+
+	/* Handle setting of stop bits */
+	if ((old->c_cflag & CSTOPB) != (new->c_cflag & CSTOPB))
+		changes |= RFCOMM_RPN_PM_STOP;
+
+	/* POSIX does not support 1.5 stop bits and RFCOMM does not
+	 * support 2 stop bits. So a request for 2 stop bits gets
+	 * translated to 1.5 stop bits */
+	if (new->c_cflag & CSTOPB) {
+		stop_bits = RFCOMM_RPN_STOP_15;
+	} else {
+		stop_bits = RFCOMM_RPN_STOP_1;
+	}
+
+	/* Handle number of data bits [5-8] */
+	if ((old->c_cflag & CSIZE) != (new->c_cflag & CSIZE)) 
+		changes |= RFCOMM_RPN_PM_DATA;
+
+	switch (new->c_cflag & CSIZE) {
+	case CS5:
+		data_bits = RFCOMM_RPN_DATA_5;
+		break;
+	case CS6:
+		data_bits = RFCOMM_RPN_DATA_6;
+		break;
+	case CS7:
+		data_bits = RFCOMM_RPN_DATA_7;
+		break;
+	case CS8:
+		data_bits = RFCOMM_RPN_DATA_8;
+		break;
+	default:
+		data_bits = RFCOMM_RPN_DATA_8;
+		break;
+	}
+
+	/* Handle baudrate settings */
+	if (old_baud_rate != new_baud_rate)
+		changes |= RFCOMM_RPN_PM_BITRATE;
 
-	/* handle turning off CRTSCTS */
-	if ((old->c_cflag & CRTSCTS) && !(tty->termios->c_cflag & CRTSCTS)) {
-		BT_DBG("turning off CRTSCTS");
+	switch (new_baud_rate) {
+	case 2400:
+		baud = RFCOMM_RPN_BR_2400;
+		break;
+	case 4800:
+		baud = RFCOMM_RPN_BR_4800;
+		break;
+	case 7200:
+		baud = RFCOMM_RPN_BR_7200;
+		break;
+	case 9600:
+		baud = RFCOMM_RPN_BR_9600;
+		break;
+	case 19200: 
+		baud = RFCOMM_RPN_BR_19200;
+		break;
+	case 38400:
+		baud = RFCOMM_RPN_BR_38400;
+		break;
+	case 57600:
+		baud = RFCOMM_RPN_BR_57600;
+		break;
+	case 115200:
+		baud = RFCOMM_RPN_BR_115200;
+		break;
+	case 230400:
+		baud = RFCOMM_RPN_BR_230400;
+		break;
+	default:
+		/* 9600 is standard accordinag to the RFCOMM specification */
+		baud = RFCOMM_RPN_BR_9600;
+		break;
+	
 	}
+
+	if (changes)
+		rfcomm_send_rpn(dev->dlc->session, 1, dev->dlc->dlci, baud,
+				data_bits, stop_bits, parity,
+				RFCOMM_RPN_FLOW_NONE, x_on, x_off, changes);
+
+	return;
 }
 
 static void rfcomm_tty_throttle(struct tty_struct *tty)
@@ -761,7 +889,7 @@ static void rfcomm_tty_throttle(struct tty_struct *tty)
 	struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
 
 	BT_DBG("tty %p dev %p", tty, dev);
-	
+
 	rfcomm_dlc_throttle(dev->dlc);
 }
 
@@ -770,7 +898,7 @@ static void rfcomm_tty_unthrottle(struct tty_struct *tty)
 	struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
 
 	BT_DBG("tty %p dev %p", tty, dev);
-	
+
 	rfcomm_dlc_unthrottle(dev->dlc);
 }
 
@@ -841,35 +969,35 @@ static int rfcomm_tty_tiocmget(struct tty_struct *tty, struct file *filp)
 
 static int rfcomm_tty_tiocmset(struct tty_struct *tty, struct file *filp, unsigned int set, unsigned int clear)
 {
- 	struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
- 	struct rfcomm_dlc *dlc = dev->dlc;
- 	u8 v24_sig;
+	struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+	struct rfcomm_dlc *dlc = dev->dlc;
+	u8 v24_sig;
 
 	BT_DBG("tty %p dev %p set 0x%02x clear 0x%02x", tty, dev, set, clear);
 
- 	rfcomm_dlc_get_modem_status(dlc, &v24_sig);
-
- 	if (set & TIOCM_DSR || set & TIOCM_DTR)
- 		v24_sig |= RFCOMM_V24_RTC;
- 	if (set & TIOCM_RTS || set & TIOCM_CTS)
- 		v24_sig |= RFCOMM_V24_RTR;
- 	if (set & TIOCM_RI)
- 		v24_sig |= RFCOMM_V24_IC;
- 	if (set & TIOCM_CD)
- 		v24_sig |= RFCOMM_V24_DV;
-
- 	if (clear & TIOCM_DSR || clear & TIOCM_DTR)
- 		v24_sig &= ~RFCOMM_V24_RTC;
- 	if (clear & TIOCM_RTS || clear & TIOCM_CTS)
- 		v24_sig &= ~RFCOMM_V24_RTR;
- 	if (clear & TIOCM_RI)
- 		v24_sig &= ~RFCOMM_V24_IC;
- 	if (clear & TIOCM_CD)
- 		v24_sig &= ~RFCOMM_V24_DV;
-
- 	rfcomm_dlc_set_modem_status(dlc, v24_sig);
-
- 	return 0;
+	rfcomm_dlc_get_modem_status(dlc, &v24_sig);
+
+	if (set & TIOCM_DSR || set & TIOCM_DTR)
+		v24_sig |= RFCOMM_V24_RTC;
+	if (set & TIOCM_RTS || set & TIOCM_CTS)
+		v24_sig |= RFCOMM_V24_RTR;
+	if (set & TIOCM_RI)
+		v24_sig |= RFCOMM_V24_IC;
+	if (set & TIOCM_CD)
+		v24_sig |= RFCOMM_V24_DV;
+
+	if (clear & TIOCM_DSR || clear & TIOCM_DTR)
+		v24_sig &= ~RFCOMM_V24_RTC;
+	if (clear & TIOCM_RTS || clear & TIOCM_CTS)
+		v24_sig &= ~RFCOMM_V24_RTR;
+	if (clear & TIOCM_RI)
+		v24_sig &= ~RFCOMM_V24_IC;
+	if (clear & TIOCM_CD)
+		v24_sig &= ~RFCOMM_V24_DV;
+
+	rfcomm_dlc_set_modem_status(dlc, v24_sig);
+
+	return 0;
 }
 
 /* ---- TTY structure ---- */
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 746c11fc017..ce7ab7dfa0b 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -418,7 +418,7 @@ static struct proto sco_proto = {
 	.obj_size	= sizeof(struct sco_pinfo)
 };
 
-static struct sock *sco_sock_alloc(struct socket *sock, int proto, int prio)
+static struct sock *sco_sock_alloc(struct socket *sock, int proto, unsigned int __nocast prio)
 {
 	struct sock *sk;
 
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index e6c2200b7ca..24396b914d1 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -23,7 +23,7 @@
 #include <asm/atomic.h>
 #include "br_private.h"
 
-static kmem_cache_t *br_fdb_cache;
+static kmem_cache_t *br_fdb_cache __read_mostly;
 static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
 		      const unsigned char *addr);
 
diff --git a/net/bridge/netfilter/ebt_mark.c b/net/bridge/netfilter/ebt_mark.c
index 02c632b4d32..c93d35ab95c 100644
--- a/net/bridge/netfilter/ebt_mark.c
+++ b/net/bridge/netfilter/ebt_mark.c
@@ -23,10 +23,9 @@ static int ebt_target_mark(struct sk_buff **pskb, unsigned int hooknr,
 {
 	struct ebt_mark_t_info *info = (struct ebt_mark_t_info *)data;
 
-	if ((*pskb)->nfmark != info->mark) {
+	if ((*pskb)->nfmark != info->mark)
 		(*pskb)->nfmark = info->mark;
-		(*pskb)->nfcache |= NFC_ALTERED;
-	}
+
 	return info->target;
 }
 
diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c
index 01af4fcef26..aae26ae2e61 100644
--- a/net/bridge/netfilter/ebt_ulog.c
+++ b/net/bridge/netfilter/ebt_ulog.c
@@ -78,8 +78,8 @@ static void ulog_send(unsigned int nlgroup)
 	if (ub->qlen > 1)
 		ub->lastnlh->nlmsg_type = NLMSG_DONE;
 
-	NETLINK_CB(ub->skb).dst_groups = 1 << nlgroup;
-	netlink_broadcast(ebtulognl, ub->skb, 0, 1 << nlgroup, GFP_ATOMIC);
+	NETLINK_CB(ub->skb).dst_group = nlgroup + 1;
+	netlink_broadcast(ebtulognl, ub->skb, 0, nlgroup + 1, GFP_ATOMIC);
 
 	ub->qlen = 0;
 	ub->skb = NULL;
@@ -162,7 +162,7 @@ static void ebt_ulog(const struct sk_buff *skb, unsigned int hooknr,
 	pm->version = EBT_ULOG_VERSION;
 	do_gettimeofday(&pm->stamp);
 	if (ub->qlen == 1)
-		ub->skb->stamp = pm->stamp;
+		skb_set_timestamp(ub->skb, &pm->stamp);
 	pm->data_len = copy_len;
 	pm->mark = skb->nfmark;
 	pm->hook = hooknr;
@@ -258,7 +258,8 @@ static int __init init(void)
 		spin_lock_init(&ulog_buffers[i].lock);
 	}
 
-	ebtulognl = netlink_kernel_create(NETLINK_NFLOG, NULL);
+	ebtulognl = netlink_kernel_create(NETLINK_NFLOG, EBT_ULOG_MAXNLGROUPS,
+	                                  NULL, THIS_MODULE);
 	if (!ebtulognl)
 		ret = -ENOMEM;
 	else if ((ret = ebt_register_watcher(&ulog)))
diff --git a/net/core/Makefile b/net/core/Makefile
index f5f5e58943e..630da0f0579 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -12,7 +12,6 @@ obj-y		     += dev.o ethtool.o dev_mcast.o dst.o \
 
 obj-$(CONFIG_XFRM) += flow.o
 obj-$(CONFIG_SYSFS) += net-sysfs.o
-obj-$(CONFIG_NETFILTER) += netfilter.o
 obj-$(CONFIG_NET_DIVERT) += dv.o
 obj-$(CONFIG_NET_PKTGEN) += pktgen.o
 obj-$(CONFIG_NET_RADIO) += wireless.o
diff --git a/net/core/datagram.c b/net/core/datagram.c
index fcee054b6f7..da9bf71421a 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -43,7 +43,6 @@
 #include <linux/errno.h>
 #include <linux/sched.h>
 #include <linux/inet.h>
-#include <linux/tcp.h>
 #include <linux/netdevice.h>
 #include <linux/rtnetlink.h>
 #include <linux/poll.h>
@@ -51,9 +50,10 @@
 
 #include <net/protocol.h>
 #include <linux/skbuff.h>
-#include <net/sock.h>
-#include <net/checksum.h>
 
+#include <net/checksum.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
 
 /*
  *	Is a socket 'connection oriented' ?
diff --git a/net/core/dev.c b/net/core/dev.c
index faf59b02c4b..c01511e3d0c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -267,10 +267,6 @@ void dev_add_pack(struct packet_type *pt)
 	spin_unlock_bh(&ptype_lock);
 }
 
-extern void linkwatch_run_queue(void);
-
-
-
 /**
  *	__dev_remove_pack	 - remove packet handler
  *	@pt: packet type declaration
@@ -1009,13 +1005,22 @@ void net_disable_timestamp(void)
 	atomic_dec(&netstamp_needed);
 }
 
-static inline void net_timestamp(struct timeval *stamp)
+void __net_timestamp(struct sk_buff *skb)
+{
+	struct timeval tv;
+
+	do_gettimeofday(&tv);
+	skb_set_timestamp(skb, &tv);
+}
+EXPORT_SYMBOL(__net_timestamp);
+
+static inline void net_timestamp(struct sk_buff *skb)
 {
 	if (atomic_read(&netstamp_needed))
-		do_gettimeofday(stamp);
+		__net_timestamp(skb);
 	else {
-		stamp->tv_sec = 0;
-		stamp->tv_usec = 0;
+		skb->tstamp.off_sec = 0;
+		skb->tstamp.off_usec = 0;
 	}
 }
 
@@ -1027,7 +1032,8 @@ static inline void net_timestamp(struct timeval *stamp)
 void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct packet_type *ptype;
-	net_timestamp(&skb->stamp);
+
+	net_timestamp(skb);
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
@@ -1058,7 +1064,7 @@ void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 
 			skb2->h.raw = skb2->nh.raw;
 			skb2->pkt_type = PACKET_OUTGOING;
-			ptype->func(skb2, skb->dev, ptype);
+			ptype->func(skb2, skb->dev, ptype, skb->dev);
 		}
 	}
 	rcu_read_unlock();
@@ -1123,8 +1129,6 @@ static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
 #define illegal_highdma(dev, skb)	(0)
 #endif
 
-extern void skb_release_data(struct sk_buff *);
-
 /* Keep head the same: replace data */
 int __skb_linearize(struct sk_buff *skb, unsigned int __nocast gfp_mask)
 {
@@ -1379,8 +1383,8 @@ int netif_rx(struct sk_buff *skb)
 	if (netpoll_rx(skb))
 		return NET_RX_DROP;
 
-	if (!skb->stamp.tv_sec)
-		net_timestamp(&skb->stamp);
+	if (!skb->tstamp.off_sec)
+		net_timestamp(skb);
 
 	/*
 	 * The code is rearranged so that the path is the most
@@ -1425,14 +1429,14 @@ int netif_rx_ni(struct sk_buff *skb)
 
 EXPORT_SYMBOL(netif_rx_ni);
 
-static __inline__ void skb_bond(struct sk_buff *skb)
+static inline struct net_device *skb_bond(struct sk_buff *skb)
 {
 	struct net_device *dev = skb->dev;
 
-	if (dev->master) {
-		skb->real_dev = skb->dev;
+	if (dev->master)
 		skb->dev = dev->master;
-	}
+
+	return dev;
 }
 
 static void net_tx_action(struct softirq_action *h)
@@ -1482,10 +1486,11 @@ static void net_tx_action(struct softirq_action *h)
 }
 
 static __inline__ int deliver_skb(struct sk_buff *skb,
-				  struct packet_type *pt_prev)
+				  struct packet_type *pt_prev,
+				  struct net_device *orig_dev)
 {
 	atomic_inc(&skb->users);
-	return pt_prev->func(skb, skb->dev, pt_prev);
+	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 }
 
 #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
@@ -1496,7 +1501,8 @@ struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
 void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent);
 
 static __inline__ int handle_bridge(struct sk_buff **pskb,
-				    struct packet_type **pt_prev, int *ret)
+				    struct packet_type **pt_prev, int *ret,
+				    struct net_device *orig_dev)
 {
 	struct net_bridge_port *port;
 
@@ -1505,14 +1511,14 @@ static __inline__ int handle_bridge(struct sk_buff **pskb,
 		return 0;
 
 	if (*pt_prev) {
-		*ret = deliver_skb(*pskb, *pt_prev);
+		*ret = deliver_skb(*pskb, *pt_prev, orig_dev);
 		*pt_prev = NULL;
 	} 
 	
 	return br_handle_frame_hook(port, pskb);
 }
 #else
-#define handle_bridge(skb, pt_prev, ret)	(0)
+#define handle_bridge(skb, pt_prev, ret, orig_dev)	(0)
 #endif
 
 #ifdef CONFIG_NET_CLS_ACT
@@ -1534,17 +1540,14 @@ static int ing_filter(struct sk_buff *skb)
 		__u32 ttl = (__u32) G_TC_RTTL(skb->tc_verd);
 		if (MAX_RED_LOOP < ttl++) {
 			printk("Redir loop detected Dropping packet (%s->%s)\n",
-				skb->input_dev?skb->input_dev->name:"??",skb->dev->name);
+				skb->input_dev->name, skb->dev->name);
 			return TC_ACT_SHOT;
 		}
 
 		skb->tc_verd = SET_TC_RTTL(skb->tc_verd,ttl);
 
 		skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_INGRESS);
-		if (NULL == skb->input_dev) {
-			skb->input_dev = skb->dev;
-			printk("ing_filter:  fixed  %s out %s\n",skb->input_dev->name,skb->dev->name);
-		}
+
 		spin_lock(&dev->ingress_lock);
 		if ((q = dev->qdisc_ingress) != NULL)
 			result = q->enqueue(skb, q);
@@ -1559,6 +1562,7 @@ static int ing_filter(struct sk_buff *skb)
 int netif_receive_skb(struct sk_buff *skb)
 {
 	struct packet_type *ptype, *pt_prev;
+	struct net_device *orig_dev;
 	int ret = NET_RX_DROP;
 	unsigned short type;
 
@@ -1566,10 +1570,13 @@ int netif_receive_skb(struct sk_buff *skb)
 	if (skb->dev->poll && netpoll_rx(skb))
 		return NET_RX_DROP;
 
-	if (!skb->stamp.tv_sec)
-		net_timestamp(&skb->stamp);
+	if (!skb->tstamp.off_sec)
+		net_timestamp(skb);
+
+	if (!skb->input_dev)
+		skb->input_dev = skb->dev;
 
-	skb_bond(skb);
+	orig_dev = skb_bond(skb);
 
 	__get_cpu_var(netdev_rx_stat).total++;
 
@@ -1590,14 +1597,14 @@ int netif_receive_skb(struct sk_buff *skb)
 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
 		if (!ptype->dev || ptype->dev == skb->dev) {
 			if (pt_prev) 
-				ret = deliver_skb(skb, pt_prev);
+				ret = deliver_skb(skb, pt_prev, orig_dev);
 			pt_prev = ptype;
 		}
 	}
 
 #ifdef CONFIG_NET_CLS_ACT
 	if (pt_prev) {
-		ret = deliver_skb(skb, pt_prev);
+		ret = deliver_skb(skb, pt_prev, orig_dev);
 		pt_prev = NULL; /* noone else should process this after*/
 	} else {
 		skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
@@ -1616,7 +1623,7 @@ ncls:
 
 	handle_diverter(skb);
 
-	if (handle_bridge(&skb, &pt_prev, &ret))
+	if (handle_bridge(&skb, &pt_prev, &ret, orig_dev))
 		goto out;
 
 	type = skb->protocol;
@@ -1624,13 +1631,13 @@ ncls:
 		if (ptype->type == type &&
 		    (!ptype->dev || ptype->dev == skb->dev)) {
 			if (pt_prev) 
-				ret = deliver_skb(skb, pt_prev);
+				ret = deliver_skb(skb, pt_prev, orig_dev);
 			pt_prev = ptype;
 		}
 	}
 
 	if (pt_prev) {
-		ret = pt_prev->func(skb, skb->dev, pt_prev);
+		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 	} else {
 		kfree_skb(skb);
 		/* Jamal, now you will not able to escape explaining
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index a3eeb88e1c8..289c1b5a8e4 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -81,6 +81,18 @@ int ethtool_op_set_tso(struct net_device *dev, u32 data)
 	return 0;
 }
 
+int ethtool_op_get_perm_addr(struct net_device *dev, struct ethtool_perm_addr *addr, u8 *data)
+{
+	unsigned char len = dev->addr_len;
+	if ( addr->size < len )
+		return -ETOOSMALL;
+	
+	addr->size = len;
+	memcpy(data, dev->perm_addr, len);
+	return 0;
+}
+ 
+
 /* Handlers for each ethtool command */
 
 static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
@@ -683,6 +695,39 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
 	return ret;
 }
 
+static int ethtool_get_perm_addr(struct net_device *dev, void *useraddr)
+{
+	struct ethtool_perm_addr epaddr;
+	u8 *data;
+	int ret;
+
+	if (!dev->ethtool_ops->get_perm_addr)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&epaddr,useraddr,sizeof(epaddr)))
+		return -EFAULT;
+
+	data = kmalloc(epaddr.size, GFP_USER);
+	if (!data)
+		return -ENOMEM;
+
+	ret = dev->ethtool_ops->get_perm_addr(dev,&epaddr,data);
+	if (ret)
+		return ret;
+
+	ret = -EFAULT;
+	if (copy_to_user(useraddr, &epaddr, sizeof(epaddr)))
+		goto out;
+	useraddr += sizeof(epaddr);
+	if (copy_to_user(useraddr, data, epaddr.size))
+		goto out;
+	ret = 0;
+
+ out:
+	kfree(data);
+	return ret;
+}
+
 /* The main entry point in this file.  Called from net/core/dev.c */
 
 int dev_ethtool(struct ifreq *ifr)
@@ -806,6 +851,9 @@ int dev_ethtool(struct ifreq *ifr)
 	case ETHTOOL_GSTATS:
 		rc = ethtool_get_stats(dev, useraddr);
 		break;
+	case ETHTOOL_GPERMADDR:
+		rc = ethtool_get_perm_addr(dev, useraddr);
+		break;
 	default:
 		rc =  -EOPNOTSUPP;
 	}
@@ -826,6 +874,7 @@ int dev_ethtool(struct ifreq *ifr)
 
 EXPORT_SYMBOL(dev_ethtool);
 EXPORT_SYMBOL(ethtool_op_get_link);
+EXPORT_SYMBOL_GPL(ethtool_op_get_perm_addr);
 EXPORT_SYMBOL(ethtool_op_get_sg);
 EXPORT_SYMBOL(ethtool_op_get_tso);
 EXPORT_SYMBOL(ethtool_op_get_tx_csum);
diff --git a/net/core/flow.c b/net/core/flow.c
index f289570b15a..7e95b39de9f 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -42,7 +42,7 @@ static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables) = { NULL };
 
 #define flow_table(cpu) (per_cpu(flow_tables, cpu))
 
-static kmem_cache_t *flow_cachep;
+static kmem_cache_t *flow_cachep __read_mostly;
 
 static int flow_lwm, flow_hwm;
 
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 1beb782ac41..39fc55edf69 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1217,7 +1217,7 @@ static void neigh_proxy_process(unsigned long arg)
 
 	while (skb != (struct sk_buff *)&tbl->proxy_queue) {
 		struct sk_buff *back = skb;
-		long tdif = back->stamp.tv_usec - now;
+		long tdif = NEIGH_CB(back)->sched_next - now;
 
 		skb = skb->next;
 		if (tdif <= 0) {
@@ -1248,8 +1248,9 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,
 		kfree_skb(skb);
 		return;
 	}
-	skb->stamp.tv_sec  = LOCALLY_ENQUEUED;
-	skb->stamp.tv_usec = sched_next;
+
+	NEIGH_CB(skb)->sched_next = sched_next;
+	NEIGH_CB(skb)->flags |= LOCALLY_ENQUEUED;
 
 	spin_lock(&tbl->proxy_queue.lock);
 	if (del_timer(&tbl->proxy_timer)) {
@@ -2342,8 +2343,8 @@ void neigh_app_ns(struct neighbour *n)
 	}
 	nlh			   = (struct nlmsghdr *)skb->data;
 	nlh->nlmsg_flags	   = NLM_F_REQUEST;
-	NETLINK_CB(skb).dst_groups = RTMGRP_NEIGH;
-	netlink_broadcast(rtnl, skb, 0, RTMGRP_NEIGH, GFP_ATOMIC);
+	NETLINK_CB(skb).dst_group  = RTNLGRP_NEIGH;
+	netlink_broadcast(rtnl, skb, 0, RTNLGRP_NEIGH, GFP_ATOMIC);
 }
 
 static void neigh_app_notify(struct neighbour *n)
@@ -2360,8 +2361,8 @@ static void neigh_app_notify(struct neighbour *n)
 		return;
 	}
 	nlh			   = (struct nlmsghdr *)skb->data;
-	NETLINK_CB(skb).dst_groups = RTMGRP_NEIGH;
-	netlink_broadcast(rtnl, skb, 0, RTMGRP_NEIGH, GFP_ATOMIC);
+	NETLINK_CB(skb).dst_group  = RTNLGRP_NEIGH;
+	netlink_broadcast(rtnl, skb, 0, RTNLGRP_NEIGH, GFP_ATOMIC);
 }
 
 #endif /* CONFIG_ARPD */
diff --git a/net/core/netfilter.c b/net/core/netfilter.c
deleted file mode 100644
index 076c156d5ed..00000000000
--- a/net/core/netfilter.c
+++ /dev/null
@@ -1,648 +0,0 @@
-/* netfilter.c: look after the filters for various protocols. 
- * Heavily influenced by the old firewall.c by David Bonn and Alan Cox.
- *
- * Thanks to Rob `CmdrTaco' Malda for not influencing this code in any
- * way.
- *
- * Rusty Russell (C)2000 -- This code is GPL.
- *
- * February 2000: Modified by James Morris to have 1 queue per protocol.
- * 15-Mar-2000:   Added NF_REPEAT --RR.
- * 08-May-2003:	  Internal logging interface added by Jozsef Kadlecsik.
- */
-#include <linux/config.h>
-#include <linux/kernel.h>
-#include <linux/netfilter.h>
-#include <net/protocol.h>
-#include <linux/init.h>
-#include <linux/skbuff.h>
-#include <linux/wait.h>
-#include <linux/module.h>
-#include <linux/interrupt.h>
-#include <linux/if.h>
-#include <linux/netdevice.h>
-#include <linux/inetdevice.h>
-#include <linux/tcp.h>
-#include <linux/udp.h>
-#include <linux/icmp.h>
-#include <net/sock.h>
-#include <net/route.h>
-#include <linux/ip.h>
-
-/* In this code, we can be waiting indefinitely for userspace to
- * service a packet if a hook returns NF_QUEUE.  We could keep a count
- * of skbuffs queued for userspace, and not deregister a hook unless
- * this is zero, but that sucks.  Now, we simply check when the
- * packets come back: if the hook is gone, the packet is discarded. */
-#ifdef CONFIG_NETFILTER_DEBUG
-#define NFDEBUG(format, args...)  printk(format , ## args)
-#else
-#define NFDEBUG(format, args...)
-#endif
-
-/* Sockopts only registered and called from user context, so
-   net locking would be overkill.  Also, [gs]etsockopt calls may
-   sleep. */
-static DECLARE_MUTEX(nf_sockopt_mutex);
-
-struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS];
-static LIST_HEAD(nf_sockopts);
-static DEFINE_SPINLOCK(nf_hook_lock);
-
-/* 
- * A queue handler may be registered for each protocol.  Each is protected by
- * long term mutex.  The handler must provide an an outfn() to accept packets
- * for queueing and must reinject all packets it receives, no matter what.
- */
-static struct nf_queue_handler_t {
-	nf_queue_outfn_t outfn;
-	void *data;
-} queue_handler[NPROTO];
-static DEFINE_RWLOCK(queue_handler_lock);
-
-int nf_register_hook(struct nf_hook_ops *reg)
-{
-	struct list_head *i;
-
-	spin_lock_bh(&nf_hook_lock);
-	list_for_each(i, &nf_hooks[reg->pf][reg->hooknum]) {
-		if (reg->priority < ((struct nf_hook_ops *)i)->priority)
-			break;
-	}
-	list_add_rcu(&reg->list, i->prev);
-	spin_unlock_bh(&nf_hook_lock);
-
-	synchronize_net();
-	return 0;
-}
-
-void nf_unregister_hook(struct nf_hook_ops *reg)
-{
-	spin_lock_bh(&nf_hook_lock);
-	list_del_rcu(&reg->list);
-	spin_unlock_bh(&nf_hook_lock);
-
-	synchronize_net();
-}
-
-/* Do exclusive ranges overlap? */
-static inline int overlap(int min1, int max1, int min2, int max2)
-{
-	return max1 > min2 && min1 < max2;
-}
-
-/* Functions to register sockopt ranges (exclusive). */
-int nf_register_sockopt(struct nf_sockopt_ops *reg)
-{
-	struct list_head *i;
-	int ret = 0;
-
-	if (down_interruptible(&nf_sockopt_mutex) != 0)
-		return -EINTR;
-
-	list_for_each(i, &nf_sockopts) {
-		struct nf_sockopt_ops *ops = (struct nf_sockopt_ops *)i;
-		if (ops->pf == reg->pf
-		    && (overlap(ops->set_optmin, ops->set_optmax, 
-				reg->set_optmin, reg->set_optmax)
-			|| overlap(ops->get_optmin, ops->get_optmax, 
-				   reg->get_optmin, reg->get_optmax))) {
-			NFDEBUG("nf_sock overlap: %u-%u/%u-%u v %u-%u/%u-%u\n",
-				ops->set_optmin, ops->set_optmax, 
-				ops->get_optmin, ops->get_optmax, 
-				reg->set_optmin, reg->set_optmax,
-				reg->get_optmin, reg->get_optmax);
-			ret = -EBUSY;
-			goto out;
-		}
-	}
-
-	list_add(&reg->list, &nf_sockopts);
-out:
-	up(&nf_sockopt_mutex);
-	return ret;
-}
-
-void nf_unregister_sockopt(struct nf_sockopt_ops *reg)
-{
-	/* No point being interruptible: we're probably in cleanup_module() */
- restart:
-	down(&nf_sockopt_mutex);
-	if (reg->use != 0) {
-		/* To be woken by nf_sockopt call... */
-		/* FIXME: Stuart Young's name appears gratuitously. */
-		set_current_state(TASK_UNINTERRUPTIBLE);
-		reg->cleanup_task = current;
-		up(&nf_sockopt_mutex);
-		schedule();
-		goto restart;
-	}
-	list_del(&reg->list);
-	up(&nf_sockopt_mutex);
-}
-
-/* Call get/setsockopt() */
-static int nf_sockopt(struct sock *sk, int pf, int val, 
-		      char __user *opt, int *len, int get)
-{
-	struct list_head *i;
-	struct nf_sockopt_ops *ops;
-	int ret;
-
-	if (down_interruptible(&nf_sockopt_mutex) != 0)
-		return -EINTR;
-
-	list_for_each(i, &nf_sockopts) {
-		ops = (struct nf_sockopt_ops *)i;
-		if (ops->pf == pf) {
-			if (get) {
-				if (val >= ops->get_optmin
-				    && val < ops->get_optmax) {
-					ops->use++;
-					up(&nf_sockopt_mutex);
-					ret = ops->get(sk, val, opt, len);
-					goto out;
-				}
-			} else {
-				if (val >= ops->set_optmin
-				    && val < ops->set_optmax) {
-					ops->use++;
-					up(&nf_sockopt_mutex);
-					ret = ops->set(sk, val, opt, *len);
-					goto out;
-				}
-			}
-		}
-	}
-	up(&nf_sockopt_mutex);
-	return -ENOPROTOOPT;
-	
- out:
-	down(&nf_sockopt_mutex);
-	ops->use--;
-	if (ops->cleanup_task)
-		wake_up_process(ops->cleanup_task);
-	up(&nf_sockopt_mutex);
-	return ret;
-}
-
-int nf_setsockopt(struct sock *sk, int pf, int val, char __user *opt,
-		  int len)
-{
-	return nf_sockopt(sk, pf, val, opt, &len, 0);
-}
-
-int nf_getsockopt(struct sock *sk, int pf, int val, char __user *opt, int *len)
-{
-	return nf_sockopt(sk, pf, val, opt, len, 1);
-}
-
-static unsigned int nf_iterate(struct list_head *head,
-			       struct sk_buff **skb,
-			       int hook,
-			       const struct net_device *indev,
-			       const struct net_device *outdev,
-			       struct list_head **i,
-			       int (*okfn)(struct sk_buff *),
-			       int hook_thresh)
-{
-	unsigned int verdict;
-
-	/*
-	 * The caller must not block between calls to this
-	 * function because of risk of continuing from deleted element.
-	 */
-	list_for_each_continue_rcu(*i, head) {
-		struct nf_hook_ops *elem = (struct nf_hook_ops *)*i;
-
-		if (hook_thresh > elem->priority)
-			continue;
-
-		/* Optimization: we don't need to hold module
-                   reference here, since function can't sleep. --RR */
-		verdict = elem->hook(hook, skb, indev, outdev, okfn);
-		if (verdict != NF_ACCEPT) {
-#ifdef CONFIG_NETFILTER_DEBUG
-			if (unlikely(verdict > NF_MAX_VERDICT)) {
-				NFDEBUG("Evil return from %p(%u).\n",
-				        elem->hook, hook);
-				continue;
-			}
-#endif
-			if (verdict != NF_REPEAT)
-				return verdict;
-			*i = (*i)->prev;
-		}
-	}
-	return NF_ACCEPT;
-}
-
-int nf_register_queue_handler(int pf, nf_queue_outfn_t outfn, void *data)
-{      
-	int ret;
-
-	write_lock_bh(&queue_handler_lock);
-	if (queue_handler[pf].outfn)
-		ret = -EBUSY;
-	else {
-		queue_handler[pf].outfn = outfn;
-		queue_handler[pf].data = data;
-		ret = 0;
-	}
-	write_unlock_bh(&queue_handler_lock);
-
-	return ret;
-}
-
-/* The caller must flush their queue before this */
-int nf_unregister_queue_handler(int pf)
-{
-	write_lock_bh(&queue_handler_lock);
-	queue_handler[pf].outfn = NULL;
-	queue_handler[pf].data = NULL;
-	write_unlock_bh(&queue_handler_lock);
-	
-	return 0;
-}
-
-/* 
- * Any packet that leaves via this function must come back 
- * through nf_reinject().
- */
-static int nf_queue(struct sk_buff *skb, 
-		    struct list_head *elem, 
-		    int pf, unsigned int hook,
-		    struct net_device *indev,
-		    struct net_device *outdev,
-		    int (*okfn)(struct sk_buff *))
-{
-	int status;
-	struct nf_info *info;
-#ifdef CONFIG_BRIDGE_NETFILTER
-	struct net_device *physindev = NULL;
-	struct net_device *physoutdev = NULL;
-#endif
-
-	/* QUEUE == DROP if noone is waiting, to be safe. */
-	read_lock(&queue_handler_lock);
-	if (!queue_handler[pf].outfn) {
-		read_unlock(&queue_handler_lock);
-		kfree_skb(skb);
-		return 1;
-	}
-
-	info = kmalloc(sizeof(*info), GFP_ATOMIC);
-	if (!info) {
-		if (net_ratelimit())
-			printk(KERN_ERR "OOM queueing packet %p\n",
-			       skb);
-		read_unlock(&queue_handler_lock);
-		kfree_skb(skb);
-		return 1;
-	}
-
-	*info = (struct nf_info) { 
-		(struct nf_hook_ops *)elem, pf, hook, indev, outdev, okfn };
-
-	/* If it's going away, ignore hook. */
-	if (!try_module_get(info->elem->owner)) {
-		read_unlock(&queue_handler_lock);
-		kfree(info);
-		return 0;
-	}
-
-	/* Bump dev refs so they don't vanish while packet is out */
-	if (indev) dev_hold(indev);
-	if (outdev) dev_hold(outdev);
-
-#ifdef CONFIG_BRIDGE_NETFILTER
-	if (skb->nf_bridge) {
-		physindev = skb->nf_bridge->physindev;
-		if (physindev) dev_hold(physindev);
-		physoutdev = skb->nf_bridge->physoutdev;
-		if (physoutdev) dev_hold(physoutdev);
-	}
-#endif
-
-	status = queue_handler[pf].outfn(skb, info, queue_handler[pf].data);
-	read_unlock(&queue_handler_lock);
-
-	if (status < 0) {
-		/* James M doesn't say fuck enough. */
-		if (indev) dev_put(indev);
-		if (outdev) dev_put(outdev);
-#ifdef CONFIG_BRIDGE_NETFILTER
-		if (physindev) dev_put(physindev);
-		if (physoutdev) dev_put(physoutdev);
-#endif
-		module_put(info->elem->owner);
-		kfree(info);
-		kfree_skb(skb);
-		return 1;
-	}
-	return 1;
-}
-
-/* Returns 1 if okfn() needs to be executed by the caller,
- * -EPERM for NF_DROP, 0 otherwise. */
-int nf_hook_slow(int pf, unsigned int hook, struct sk_buff **pskb,
-		 struct net_device *indev,
-		 struct net_device *outdev,
-		 int (*okfn)(struct sk_buff *),
-		 int hook_thresh)
-{
-	struct list_head *elem;
-	unsigned int verdict;
-	int ret = 0;
-
-	/* We may already have this, but read-locks nest anyway */
-	rcu_read_lock();
-
-	elem = &nf_hooks[pf][hook];
-next_hook:
-	verdict = nf_iterate(&nf_hooks[pf][hook], pskb, hook, indev,
-			     outdev, &elem, okfn, hook_thresh);
-	if (verdict == NF_ACCEPT || verdict == NF_STOP) {
-		ret = 1;
-		goto unlock;
-	} else if (verdict == NF_DROP) {
-		kfree_skb(*pskb);
-		ret = -EPERM;
-	} else if (verdict == NF_QUEUE) {
-		NFDEBUG("nf_hook: Verdict = QUEUE.\n");
-		if (!nf_queue(*pskb, elem, pf, hook, indev, outdev, okfn))
-			goto next_hook;
-	}
-unlock:
-	rcu_read_unlock();
-	return ret;
-}
-
-void nf_reinject(struct sk_buff *skb, struct nf_info *info,
-		 unsigned int verdict)
-{
-	struct list_head *elem = &info->elem->list;
-	struct list_head *i;
-
-	rcu_read_lock();
-
-	/* Release those devices we held, or Alexey will kill me. */
-	if (info->indev) dev_put(info->indev);
-	if (info->outdev) dev_put(info->outdev);
-#ifdef CONFIG_BRIDGE_NETFILTER
-	if (skb->nf_bridge) {
-		if (skb->nf_bridge->physindev)
-			dev_put(skb->nf_bridge->physindev);
-		if (skb->nf_bridge->physoutdev)
-			dev_put(skb->nf_bridge->physoutdev);
-	}
-#endif
-
-	/* Drop reference to owner of hook which queued us. */
-	module_put(info->elem->owner);
-
-	list_for_each_rcu(i, &nf_hooks[info->pf][info->hook]) {
-		if (i == elem) 
-  			break;
-  	}
-  
-	if (elem == &nf_hooks[info->pf][info->hook]) {
-		/* The module which sent it to userspace is gone. */
-		NFDEBUG("%s: module disappeared, dropping packet.\n",
-			__FUNCTION__);
-		verdict = NF_DROP;
-	}
-
-	/* Continue traversal iff userspace said ok... */
-	if (verdict == NF_REPEAT) {
-		elem = elem->prev;
-		verdict = NF_ACCEPT;
-	}
-
-	if (verdict == NF_ACCEPT) {
-	next_hook:
-		verdict = nf_iterate(&nf_hooks[info->pf][info->hook],
-				     &skb, info->hook, 
-				     info->indev, info->outdev, &elem,
-				     info->okfn, INT_MIN);
-	}
-
-	switch (verdict) {
-	case NF_ACCEPT:
-		info->okfn(skb);
-		break;
-
-	case NF_QUEUE:
-		if (!nf_queue(skb, elem, info->pf, info->hook, 
-			      info->indev, info->outdev, info->okfn))
-			goto next_hook;
-		break;
-	}
-	rcu_read_unlock();
-
-	if (verdict == NF_DROP)
-		kfree_skb(skb);
-
-	kfree(info);
-	return;
-}
-
-#ifdef CONFIG_INET
-/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
-int ip_route_me_harder(struct sk_buff **pskb)
-{
-	struct iphdr *iph = (*pskb)->nh.iph;
-	struct rtable *rt;
-	struct flowi fl = {};
-	struct dst_entry *odst;
-	unsigned int hh_len;
-
-	/* some non-standard hacks like ipt_REJECT.c:send_reset() can cause
-	 * packets with foreign saddr to appear on the NF_IP_LOCAL_OUT hook.
-	 */
-	if (inet_addr_type(iph->saddr) == RTN_LOCAL) {
-		fl.nl_u.ip4_u.daddr = iph->daddr;
-		fl.nl_u.ip4_u.saddr = iph->saddr;
-		fl.nl_u.ip4_u.tos = RT_TOS(iph->tos);
-		fl.oif = (*pskb)->sk ? (*pskb)->sk->sk_bound_dev_if : 0;
-#ifdef CONFIG_IP_ROUTE_FWMARK
-		fl.nl_u.ip4_u.fwmark = (*pskb)->nfmark;
-#endif
-		fl.proto = iph->protocol;
-		if (ip_route_output_key(&rt, &fl) != 0)
-			return -1;
-
-		/* Drop old route. */
-		dst_release((*pskb)->dst);
-		(*pskb)->dst = &rt->u.dst;
-	} else {
-		/* non-local src, find valid iif to satisfy
-		 * rp-filter when calling ip_route_input. */
-		fl.nl_u.ip4_u.daddr = iph->saddr;
-		if (ip_route_output_key(&rt, &fl) != 0)
-			return -1;
-
-		odst = (*pskb)->dst;
-		if (ip_route_input(*pskb, iph->daddr, iph->saddr,
-				   RT_TOS(iph->tos), rt->u.dst.dev) != 0) {
-			dst_release(&rt->u.dst);
-			return -1;
-		}
-		dst_release(&rt->u.dst);
-		dst_release(odst);
-	}
-	
-	if ((*pskb)->dst->error)
-		return -1;
-
-	/* Change in oif may mean change in hh_len. */
-	hh_len = (*pskb)->dst->dev->hard_header_len;
-	if (skb_headroom(*pskb) < hh_len) {
-		struct sk_buff *nskb;
-
-		nskb = skb_realloc_headroom(*pskb, hh_len);
-		if (!nskb) 
-			return -1;
-		if ((*pskb)->sk)
-			skb_set_owner_w(nskb, (*pskb)->sk);
-		kfree_skb(*pskb);
-		*pskb = nskb;
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL(ip_route_me_harder);
-
-int skb_ip_make_writable(struct sk_buff **pskb, unsigned int writable_len)
-{
-	struct sk_buff *nskb;
-
-	if (writable_len > (*pskb)->len)
-		return 0;
-
-	/* Not exclusive use of packet?  Must copy. */
-	if (skb_shared(*pskb) || skb_cloned(*pskb))
-		goto copy_skb;
-
-	return pskb_may_pull(*pskb, writable_len);
-
-copy_skb:
-	nskb = skb_copy(*pskb, GFP_ATOMIC);
-	if (!nskb)
-		return 0;
-	BUG_ON(skb_is_nonlinear(nskb));
-
-	/* Rest of kernel will get very unhappy if we pass it a
-	   suddenly-orphaned skbuff */
-	if ((*pskb)->sk)
-		skb_set_owner_w(nskb, (*pskb)->sk);
-	kfree_skb(*pskb);
-	*pskb = nskb;
-	return 1;
-}
-EXPORT_SYMBOL(skb_ip_make_writable);
-#endif /*CONFIG_INET*/
-
-/* Internal logging interface, which relies on the real 
-   LOG target modules */
-
-#define NF_LOG_PREFIXLEN		128
-
-static nf_logfn *nf_logging[NPROTO]; /* = NULL */
-static int reported = 0;
-static DEFINE_SPINLOCK(nf_log_lock);
-
-int nf_log_register(int pf, nf_logfn *logfn)
-{
-	int ret = -EBUSY;
-
-	/* Any setup of logging members must be done before
-	 * substituting pointer. */
-	spin_lock(&nf_log_lock);
-	if (!nf_logging[pf]) {
-		rcu_assign_pointer(nf_logging[pf], logfn);
-		ret = 0;
-	}
-	spin_unlock(&nf_log_lock);
-	return ret;
-}		
-
-void nf_log_unregister(int pf, nf_logfn *logfn)
-{
-	spin_lock(&nf_log_lock);
-	if (nf_logging[pf] == logfn)
-		nf_logging[pf] = NULL;
-	spin_unlock(&nf_log_lock);
-
-	/* Give time to concurrent readers. */
-	synchronize_net();
-}		
-
-void nf_log_packet(int pf,
-		   unsigned int hooknum,
-		   const struct sk_buff *skb,
-		   const struct net_device *in,
-		   const struct net_device *out,
-		   const char *fmt, ...)
-{
-	va_list args;
-	char prefix[NF_LOG_PREFIXLEN];
-	nf_logfn *logfn;
-	
-	rcu_read_lock();
-	logfn = rcu_dereference(nf_logging[pf]);
-	if (logfn) {
-		va_start(args, fmt);
-		vsnprintf(prefix, sizeof(prefix), fmt, args);
-		va_end(args);
-		/* We must read logging before nf_logfn[pf] */
-		logfn(hooknum, skb, in, out, prefix);
-	} else if (!reported) {
-		printk(KERN_WARNING "nf_log_packet: can\'t log yet, "
-		       "no backend logging module loaded in!\n");
-		reported++;
-	}
-	rcu_read_unlock();
-}
-EXPORT_SYMBOL(nf_log_register);
-EXPORT_SYMBOL(nf_log_unregister);
-EXPORT_SYMBOL(nf_log_packet);
-
-/* This does not belong here, but locally generated errors need it if connection
-   tracking in use: without this, connection may not be in hash table, and hence
-   manufactured ICMP or RST packets will not be associated with it. */
-void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *);
-
-void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb)
-{
-	void (*attach)(struct sk_buff *, struct sk_buff *);
-
-	if (skb->nfct && (attach = ip_ct_attach) != NULL) {
-		mb(); /* Just to be sure: must be read before executing this */
-		attach(new, skb);
-	}
-}
-
-void __init netfilter_init(void)
-{
-	int i, h;
-
-	for (i = 0; i < NPROTO; i++) {
-		for (h = 0; h < NF_MAX_HOOKS; h++)
-			INIT_LIST_HEAD(&nf_hooks[i][h]);
-	}
-}
-
-EXPORT_SYMBOL(ip_ct_attach);
-EXPORT_SYMBOL(nf_ct_attach);
-EXPORT_SYMBOL(nf_getsockopt);
-EXPORT_SYMBOL(nf_hook_slow);
-EXPORT_SYMBOL(nf_hooks);
-EXPORT_SYMBOL(nf_register_hook);
-EXPORT_SYMBOL(nf_register_queue_handler);
-EXPORT_SYMBOL(nf_register_sockopt);
-EXPORT_SYMBOL(nf_reinject);
-EXPORT_SYMBOL(nf_setsockopt);
-EXPORT_SYMBOL(nf_unregister_hook);
-EXPORT_SYMBOL(nf_unregister_queue_handler);
-EXPORT_SYMBOL(nf_unregister_sockopt);
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index bb55675f068..b8203de5ff0 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -32,7 +32,6 @@
  * Further increasing requires to change hash table size.
  */
 int sysctl_max_syn_backlog = 256;
-EXPORT_SYMBOL(sysctl_max_syn_backlog);
 
 int reqsk_queue_alloc(struct request_sock_queue *queue,
 		      const int nr_table_entries)
@@ -53,6 +52,8 @@ int reqsk_queue_alloc(struct request_sock_queue *queue,
 	get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
 	rwlock_init(&queue->syn_wait_lock);
 	queue->rskq_accept_head = queue->rskq_accept_head = NULL;
+	queue->rskq_defer_accept = 0;
+	lopt->nr_table_entries = nr_table_entries;
 
 	write_lock_bh(&queue->syn_wait_lock);
 	queue->listen_opt = lopt;
@@ -62,3 +63,28 @@ int reqsk_queue_alloc(struct request_sock_queue *queue,
 }
 
 EXPORT_SYMBOL(reqsk_queue_alloc);
+
+void reqsk_queue_destroy(struct request_sock_queue *queue)
+{
+	/* make all the listen_opt local to us */
+	struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue);
+
+	if (lopt->qlen != 0) {
+		int i;
+
+		for (i = 0; i < lopt->nr_table_entries; i++) {
+			struct request_sock *req;
+
+			while ((req = lopt->syn_table[i]) != NULL) {
+				lopt->syn_table[i] = req->dl_next;
+				lopt->qlen--;
+				reqsk_free(req);
+			}
+		}
+	}
+
+	BUG_TRAP(lopt->qlen == 0);
+	kfree(lopt);
+}
+
+EXPORT_SYMBOL(reqsk_queue_destroy);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 4b1bb30e638..9bed7569ce3 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -148,7 +148,7 @@ int rtnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo)
 {
 	int err = 0;
 
-	NETLINK_CB(skb).dst_groups = group;
+	NETLINK_CB(skb).dst_group = group;
 	if (echo)
 		atomic_inc(&skb->users);
 	netlink_broadcast(rtnl, skb, pid, group, GFP_KERNEL);
@@ -458,8 +458,8 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change)
 		kfree_skb(skb);
 		return;
 	}
-	NETLINK_CB(skb).dst_groups = RTMGRP_LINK;
-	netlink_broadcast(rtnl, skb, 0, RTMGRP_LINK, GFP_KERNEL);
+	NETLINK_CB(skb).dst_group = RTNLGRP_LINK;
+	netlink_broadcast(rtnl, skb, 0, RTNLGRP_LINK, GFP_KERNEL);
 }
 
 static int rtnetlink_done(struct netlink_callback *cb)
@@ -708,7 +708,8 @@ void __init rtnetlink_init(void)
 	if (!rta_buf)
 		panic("rtnetlink_init: cannot allocate rta_buf\n");
 
-	rtnl = netlink_kernel_create(NETLINK_ROUTE, rtnetlink_rcv);
+	rtnl = netlink_kernel_create(NETLINK_ROUTE, RTNLGRP_MAX, rtnetlink_rcv,
+	                             THIS_MODULE);
 	if (rtnl == NULL)
 		panic("rtnetlink_init: cannot initialize rtnetlink\n");
 	netlink_set_nonroot(NETLINK_ROUTE, NL_NONROOT_RECV);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 7eab867ede5..f80a2878561 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -68,7 +68,10 @@
 #include <asm/uaccess.h>
 #include <asm/system.h>
 
-static kmem_cache_t *skbuff_head_cache;
+static kmem_cache_t *skbuff_head_cache __read_mostly;
+static kmem_cache_t *skbuff_fclone_cache __read_mostly;
+
+struct timeval __read_mostly skb_tv_base;
 
 /*
  *	Keep out-of-line to prevent kernel bloat.
@@ -118,7 +121,7 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here)
  */
 
 /**
- *	alloc_skb	-	allocate a network buffer
+ *	__alloc_skb	-	allocate a network buffer
  *	@size: size to allocate
  *	@gfp_mask: allocation mask
  *
@@ -129,14 +132,20 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here)
  *	Buffers may only be allocated from interrupts using a @gfp_mask of
  *	%GFP_ATOMIC.
  */
-struct sk_buff *alloc_skb(unsigned int size, unsigned int __nocast gfp_mask)
+struct sk_buff *__alloc_skb(unsigned int size, unsigned int __nocast gfp_mask,
+			    int fclone)
 {
 	struct sk_buff *skb;
 	u8 *data;
 
 	/* Get the HEAD */
-	skb = kmem_cache_alloc(skbuff_head_cache,
-			       gfp_mask & ~__GFP_DMA);
+	if (fclone)
+		skb = kmem_cache_alloc(skbuff_fclone_cache,
+				       gfp_mask & ~__GFP_DMA);
+	else
+		skb = kmem_cache_alloc(skbuff_head_cache,
+				       gfp_mask & ~__GFP_DMA);
+
 	if (!skb)
 		goto out;
 
@@ -153,7 +162,15 @@ struct sk_buff *alloc_skb(unsigned int size, unsigned int __nocast gfp_mask)
 	skb->data = data;
 	skb->tail = data;
 	skb->end  = data + size;
+	if (fclone) {
+		struct sk_buff *child = skb + 1;
+		atomic_t *fclone_ref = (atomic_t *) (child + 1);
 
+		skb->fclone = SKB_FCLONE_ORIG;
+		atomic_set(fclone_ref, 1);
+
+		child->fclone = SKB_FCLONE_UNAVAILABLE;
+	}
 	atomic_set(&(skb_shinfo(skb)->dataref), 1);
 	skb_shinfo(skb)->nr_frags  = 0;
 	skb_shinfo(skb)->tso_size = 0;
@@ -266,8 +283,34 @@ void skb_release_data(struct sk_buff *skb)
  */
 void kfree_skbmem(struct sk_buff *skb)
 {
+	struct sk_buff *other;
+	atomic_t *fclone_ref;
+
 	skb_release_data(skb);
-	kmem_cache_free(skbuff_head_cache, skb);
+	switch (skb->fclone) {
+	case SKB_FCLONE_UNAVAILABLE:
+		kmem_cache_free(skbuff_head_cache, skb);
+		break;
+
+	case SKB_FCLONE_ORIG:
+		fclone_ref = (atomic_t *) (skb + 2);
+		if (atomic_dec_and_test(fclone_ref))
+			kmem_cache_free(skbuff_fclone_cache, skb);
+		break;
+
+	case SKB_FCLONE_CLONE:
+		fclone_ref = (atomic_t *) (skb + 1);
+		other = skb - 1;
+
+		/* The clone portion is available for
+		 * fast-cloning again.
+		 */
+		skb->fclone = SKB_FCLONE_UNAVAILABLE;
+
+		if (atomic_dec_and_test(fclone_ref))
+			kmem_cache_free(skbuff_fclone_cache, other);
+		break;
+	};
 }
 
 /**
@@ -281,8 +324,6 @@ void kfree_skbmem(struct sk_buff *skb)
 
 void __kfree_skb(struct sk_buff *skb)
 {
-	BUG_ON(skb->list != NULL);
-
 	dst_release(skb->dst);
 #ifdef CONFIG_XFRM
 	secpath_put(skb->sp);
@@ -302,7 +343,6 @@ void __kfree_skb(struct sk_buff *skb)
 	skb->tc_index = 0;
 #ifdef CONFIG_NET_CLS_ACT
 	skb->tc_verd = 0;
-	skb->tc_classid = 0;
 #endif
 #endif
 
@@ -325,19 +365,27 @@ void __kfree_skb(struct sk_buff *skb)
 
 struct sk_buff *skb_clone(struct sk_buff *skb, unsigned int __nocast gfp_mask)
 {
-	struct sk_buff *n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
-
-	if (!n) 
-		return NULL;
+	struct sk_buff *n;
+
+	n = skb + 1;
+	if (skb->fclone == SKB_FCLONE_ORIG &&
+	    n->fclone == SKB_FCLONE_UNAVAILABLE) {
+		atomic_t *fclone_ref = (atomic_t *) (n + 1);
+		n->fclone = SKB_FCLONE_CLONE;
+		atomic_inc(fclone_ref);
+	} else {
+		n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
+		if (!n)
+			return NULL;
+		n->fclone = SKB_FCLONE_UNAVAILABLE;
+	}
 
 #define C(x) n->x = skb->x
 
 	n->next = n->prev = NULL;
-	n->list = NULL;
 	n->sk = NULL;
-	C(stamp);
+	C(tstamp);
 	C(dev);
-	C(real_dev);
 	C(h);
 	C(nh);
 	C(mac);
@@ -361,7 +409,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, unsigned int __nocast gfp_mask)
 	n->destructor = NULL;
 #ifdef CONFIG_NETFILTER
 	C(nfmark);
-	C(nfcache);
 	C(nfct);
 	nf_conntrack_get(skb->nfct);
 	C(nfctinfo);
@@ -370,9 +417,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, unsigned int __nocast gfp_mask)
 	nf_bridge_get(skb->nf_bridge);
 #endif
 #endif /*CONFIG_NETFILTER*/
-#if defined(CONFIG_HIPPI)
-	C(private);
-#endif
 #ifdef CONFIG_NET_SCHED
 	C(tc_index);
 #ifdef CONFIG_NET_CLS_ACT
@@ -380,7 +424,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, unsigned int __nocast gfp_mask)
 	n->tc_verd = CLR_TC_OK2MUNGE(n->tc_verd);
 	n->tc_verd = CLR_TC_MUNGED(n->tc_verd);
 	C(input_dev);
-	C(tc_classid);
 #endif
 
 #endif
@@ -404,10 +447,8 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 	 */
 	unsigned long offset = new->data - old->data;
 
-	new->list	= NULL;
 	new->sk		= NULL;
 	new->dev	= old->dev;
-	new->real_dev	= old->real_dev;
 	new->priority	= old->priority;
 	new->protocol	= old->protocol;
 	new->dst	= dst_clone(old->dst);
@@ -419,12 +460,12 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 	new->mac.raw	= old->mac.raw + offset;
 	memcpy(new->cb, old->cb, sizeof(old->cb));
 	new->local_df	= old->local_df;
+	new->fclone	= SKB_FCLONE_UNAVAILABLE;
 	new->pkt_type	= old->pkt_type;
-	new->stamp	= old->stamp;
+	new->tstamp	= old->tstamp;
 	new->destructor = NULL;
 #ifdef CONFIG_NETFILTER
 	new->nfmark	= old->nfmark;
-	new->nfcache	= old->nfcache;
 	new->nfct	= old->nfct;
 	nf_conntrack_get(old->nfct);
 	new->nfctinfo	= old->nfctinfo;
@@ -1344,50 +1385,43 @@ void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk)
 	__skb_queue_tail(list, newsk);
 	spin_unlock_irqrestore(&list->lock, flags);
 }
+
 /**
  *	skb_unlink	-	remove a buffer from a list
  *	@skb: buffer to remove
+ *	@list: list to use
  *
- *	Place a packet after a given packet in a list. The list locks are taken
- *	and this function is atomic with respect to other list locked calls
+ *	Remove a packet from a list. The list locks are taken and this
+ *	function is atomic with respect to other list locked calls
  *
- *	Works even without knowing the list it is sitting on, which can be
- *	handy at times. It also means that THE LIST MUST EXIST when you
- *	unlink. Thus a list must have its contents unlinked before it is
- *	destroyed.
+ *	You must know what list the SKB is on.
  */
-void skb_unlink(struct sk_buff *skb)
+void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
 {
-	struct sk_buff_head *list = skb->list;
-
-	if (list) {
-		unsigned long flags;
+	unsigned long flags;
 
-		spin_lock_irqsave(&list->lock, flags);
-		if (skb->list == list)
-			__skb_unlink(skb, skb->list);
-		spin_unlock_irqrestore(&list->lock, flags);
-	}
+	spin_lock_irqsave(&list->lock, flags);
+	__skb_unlink(skb, list);
+	spin_unlock_irqrestore(&list->lock, flags);
 }
 
-
 /**
  *	skb_append	-	append a buffer
  *	@old: buffer to insert after
  *	@newsk: buffer to insert
+ *	@list: list to use
  *
  *	Place a packet after a given packet in a list. The list locks are taken
  *	and this function is atomic with respect to other list locked calls.
  *	A buffer cannot be placed on two lists at the same time.
  */
-
-void skb_append(struct sk_buff *old, struct sk_buff *newsk)
+void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&old->list->lock, flags);
-	__skb_append(old, newsk);
-	spin_unlock_irqrestore(&old->list->lock, flags);
+	spin_lock_irqsave(&list->lock, flags);
+	__skb_append(old, newsk, list);
+	spin_unlock_irqrestore(&list->lock, flags);
 }
 
 
@@ -1395,19 +1429,21 @@ void skb_append(struct sk_buff *old, struct sk_buff *newsk)
  *	skb_insert	-	insert a buffer
  *	@old: buffer to insert before
  *	@newsk: buffer to insert
+ *	@list: list to use
+ *
+ *	Place a packet before a given packet in a list. The list locks are
+ * 	taken and this function is atomic with respect to other list locked
+ *	calls.
  *
- *	Place a packet before a given packet in a list. The list locks are taken
- *	and this function is atomic with respect to other list locked calls
  *	A buffer cannot be placed on two lists at the same time.
  */
-
-void skb_insert(struct sk_buff *old, struct sk_buff *newsk)
+void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&old->list->lock, flags);
-	__skb_insert(newsk, old->prev, old, old->list);
-	spin_unlock_irqrestore(&old->list->lock, flags);
+	spin_lock_irqsave(&list->lock, flags);
+	__skb_insert(newsk, old->prev, old, list);
+	spin_unlock_irqrestore(&list->lock, flags);
 }
 
 #if 0
@@ -1663,12 +1699,23 @@ void __init skb_init(void)
 					      NULL, NULL);
 	if (!skbuff_head_cache)
 		panic("cannot create skbuff cache");
+
+	skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
+						(2*sizeof(struct sk_buff)) +
+						sizeof(atomic_t),
+						0,
+						SLAB_HWCACHE_ALIGN,
+						NULL, NULL);
+	if (!skbuff_fclone_cache)
+		panic("cannot create skbuff cache");
+
+	do_gettimeofday(&skb_tv_base);
 }
 
 EXPORT_SYMBOL(___pskb_trim);
 EXPORT_SYMBOL(__kfree_skb);
 EXPORT_SYMBOL(__pskb_pull_tail);
-EXPORT_SYMBOL(alloc_skb);
+EXPORT_SYMBOL(__alloc_skb);
 EXPORT_SYMBOL(pskb_copy);
 EXPORT_SYMBOL(pskb_expand_head);
 EXPORT_SYMBOL(skb_checksum);
@@ -1696,3 +1743,4 @@ EXPORT_SYMBOL(skb_prepare_seq_read);
 EXPORT_SYMBOL(skb_seq_read);
 EXPORT_SYMBOL(skb_abort_seq_read);
 EXPORT_SYMBOL(skb_find_text);
+EXPORT_SYMBOL(skb_tv_base);
diff --git a/net/core/sock.c b/net/core/sock.c
index 12f6d9a2a52..ccd10fd6568 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -260,7 +260,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 			   
 			if (val > sysctl_wmem_max)
 				val = sysctl_wmem_max;
-
+set_sndbuf:
 			sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 			if ((val * 2) < SOCK_MIN_SNDBUF)
 				sk->sk_sndbuf = SOCK_MIN_SNDBUF;
@@ -274,6 +274,13 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 			sk->sk_write_space(sk);
 			break;
 
+		case SO_SNDBUFFORCE:
+			if (!capable(CAP_NET_ADMIN)) {
+				ret = -EPERM;
+				break;
+			}
+			goto set_sndbuf;
+
 		case SO_RCVBUF:
 			/* Don't error on this BSD doesn't and if you think
 			   about it this is right. Otherwise apps have to
@@ -282,7 +289,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 			  
 			if (val > sysctl_rmem_max)
 				val = sysctl_rmem_max;
-
+set_rcvbuf:
 			sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 			/* FIXME: is this lower bound the right one? */
 			if ((val * 2) < SOCK_MIN_RCVBUF)
@@ -291,6 +298,13 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 				sk->sk_rcvbuf = val * 2;
 			break;
 
+		case SO_RCVBUFFORCE:
+			if (!capable(CAP_NET_ADMIN)) {
+				ret = -EPERM;
+				break;
+			}
+			goto set_rcvbuf;
+
 		case SO_KEEPALIVE:
 #ifdef CONFIG_INET
 			if (sk->sk_protocol == IPPROTO_TCP)
@@ -686,6 +700,80 @@ void sk_free(struct sock *sk)
 	module_put(owner);
 }
 
+struct sock *sk_clone(const struct sock *sk, const unsigned int __nocast priority)
+{
+	struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0);
+
+	if (newsk != NULL) {
+		struct sk_filter *filter;
+
+		memcpy(newsk, sk, sk->sk_prot->obj_size);
+
+		/* SANITY */
+		sk_node_init(&newsk->sk_node);
+		sock_lock_init(newsk);
+		bh_lock_sock(newsk);
+
+		atomic_set(&newsk->sk_rmem_alloc, 0);
+		atomic_set(&newsk->sk_wmem_alloc, 0);
+		atomic_set(&newsk->sk_omem_alloc, 0);
+		skb_queue_head_init(&newsk->sk_receive_queue);
+		skb_queue_head_init(&newsk->sk_write_queue);
+
+		rwlock_init(&newsk->sk_dst_lock);
+		rwlock_init(&newsk->sk_callback_lock);
+
+		newsk->sk_dst_cache	= NULL;
+		newsk->sk_wmem_queued	= 0;
+		newsk->sk_forward_alloc = 0;
+		newsk->sk_send_head	= NULL;
+		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
+		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
+
+		sock_reset_flag(newsk, SOCK_DONE);
+		skb_queue_head_init(&newsk->sk_error_queue);
+
+		filter = newsk->sk_filter;
+		if (filter != NULL)
+			sk_filter_charge(newsk, filter);
+
+		if (unlikely(xfrm_sk_clone_policy(newsk))) {
+			/* It is still raw copy of parent, so invalidate
+			 * destructor and make plain sk_free() */
+			newsk->sk_destruct = NULL;
+			sk_free(newsk);
+			newsk = NULL;
+			goto out;
+		}
+
+		newsk->sk_err	   = 0;
+		newsk->sk_priority = 0;
+		atomic_set(&newsk->sk_refcnt, 2);
+
+		/*
+		 * Increment the counter in the same struct proto as the master
+		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
+		 * is the same as sk->sk_prot->socks, as this field was copied
+		 * with memcpy).
+		 *
+		 * This _changes_ the previous behaviour, where
+		 * tcp_create_openreq_child always was incrementing the
+		 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
+		 * to be taken into account in all callers. -acme
+		 */
+		sk_refcnt_debug_inc(newsk);
+		newsk->sk_socket = NULL;
+		newsk->sk_sleep	 = NULL;
+
+		if (newsk->sk_prot->sockets_allocated)
+			atomic_inc(newsk->sk_prot->sockets_allocated);
+	}
+out:
+	return newsk;
+}
+
+EXPORT_SYMBOL_GPL(sk_clone);
+
 void __init sk_init(void)
 {
 	if (num_physpages <= 4096) {
@@ -1353,11 +1441,7 @@ void sk_common_release(struct sock *sk)
 
 	xfrm_sk_free_policy(sk);
 
-#ifdef INET_REFCNT_DEBUG
-	if (atomic_read(&sk->sk_refcnt) != 1)
-		printk(KERN_DEBUG "Destruction of the socket %p delayed, c=%d\n",
-		       sk, atomic_read(&sk->sk_refcnt));
-#endif
+	sk_refcnt_debug_release(sk);
 	sock_put(sk);
 }
 
@@ -1368,7 +1452,8 @@ static LIST_HEAD(proto_list);
 
 int proto_register(struct proto *prot, int alloc_slab)
 {
-	char *request_sock_slab_name;
+	char *request_sock_slab_name = NULL;
+	char *timewait_sock_slab_name;
 	int rc = -ENOBUFS;
 
 	if (alloc_slab) {
@@ -1399,6 +1484,23 @@ int proto_register(struct proto *prot, int alloc_slab)
 				goto out_free_request_sock_slab_name;
 			}
 		}
+
+		if (prot->twsk_obj_size) {
+			static const char mask[] = "tw_sock_%s";
+
+			timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
+
+			if (timewait_sock_slab_name == NULL)
+				goto out_free_request_sock_slab;
+
+			sprintf(timewait_sock_slab_name, mask, prot->name);
+			prot->twsk_slab = kmem_cache_create(timewait_sock_slab_name,
+							    prot->twsk_obj_size,
+							    0, SLAB_HWCACHE_ALIGN,
+							    NULL, NULL);
+			if (prot->twsk_slab == NULL)
+				goto out_free_timewait_sock_slab_name;
+		}
 	}
 
 	write_lock(&proto_list_lock);
@@ -1407,6 +1509,13 @@ int proto_register(struct proto *prot, int alloc_slab)
 	rc = 0;
 out:
 	return rc;
+out_free_timewait_sock_slab_name:
+	kfree(timewait_sock_slab_name);
+out_free_request_sock_slab:
+	if (prot->rsk_prot && prot->rsk_prot->slab) {
+		kmem_cache_destroy(prot->rsk_prot->slab);
+		prot->rsk_prot->slab = NULL;
+	}
 out_free_request_sock_slab_name:
 	kfree(request_sock_slab_name);
 out_free_sock_slab:
@@ -1434,6 +1543,14 @@ void proto_unregister(struct proto *prot)
 		prot->rsk_prot->slab = NULL;
 	}
 
+	if (prot->twsk_slab != NULL) {
+		const char *name = kmem_cache_name(prot->twsk_slab);
+
+		kmem_cache_destroy(prot->twsk_slab);
+		kfree(name);
+		prot->twsk_slab = NULL;
+	}
+
 	list_del(&prot->node);
 	write_unlock(&proto_list_lock);
 }
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 8f817ad9f54..2f278c8e474 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -9,23 +9,18 @@
 #include <linux/sysctl.h>
 #include <linux/config.h>
 #include <linux/module.h>
+#include <linux/socket.h>
+#include <net/sock.h>
 
 #ifdef CONFIG_SYSCTL
 
 extern int netdev_max_backlog;
-extern int netdev_budget;
 extern int weight_p;
-extern int net_msg_cost;
-extern int net_msg_burst;
 
 extern __u32 sysctl_wmem_max;
 extern __u32 sysctl_rmem_max;
-extern __u32 sysctl_wmem_default;
-extern __u32 sysctl_rmem_default;
 
 extern int sysctl_core_destroy_delay;
-extern int sysctl_optmem_max;
-extern int sysctl_somaxconn;
 
 #ifdef CONFIG_NET_DIVERT
 extern char sysctl_divert_version[];
diff --git a/net/core/utils.c b/net/core/utils.c
index 88eb8b68e26..7b5970fc9e4 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -16,7 +16,9 @@
 #include <linux/module.h>
 #include <linux/jiffies.h>
 #include <linux/kernel.h>
+#include <linux/inet.h>
 #include <linux/mm.h>
+#include <linux/net.h>
 #include <linux/string.h>
 #include <linux/types.h>
 #include <linux/random.h>
diff --git a/net/core/wireless.c b/net/core/wireless.c
index 3ff5639c0b7..5caae2399f3 100644
--- a/net/core/wireless.c
+++ b/net/core/wireless.c
@@ -571,10 +571,6 @@ static int wireless_seq_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-extern void *dev_seq_start(struct seq_file *seq, loff_t *pos);
-extern void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos);
-extern void dev_seq_stop(struct seq_file *seq, void *v);
-
 static struct seq_operations wireless_seq_ops = {
 	.start = dev_seq_start,
 	.next  = dev_seq_next,
@@ -1144,8 +1140,8 @@ static inline void rtmsg_iwinfo(struct net_device *	dev,
 		kfree_skb(skb);
 		return;
 	}
-	NETLINK_CB(skb).dst_groups = RTMGRP_LINK;
-	netlink_broadcast(rtnl, skb, 0, RTMGRP_LINK, GFP_ATOMIC);
+	NETLINK_CB(skb).dst_group = RTNLGRP_LINK;
+	netlink_broadcast(rtnl, skb, 0, RTNLGRP_LINK, GFP_ATOMIC);
 }
 #endif	/* WE_EVENT_NETLINK */
 
diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig
new file mode 100644
index 00000000000..187ac182e24
--- /dev/null
+++ b/net/dccp/Kconfig
@@ -0,0 +1,50 @@
+menu "DCCP Configuration (EXPERIMENTAL)"
+	depends on INET && EXPERIMENTAL
+
+config IP_DCCP
+	tristate "The DCCP Protocol (EXPERIMENTAL)"
+	---help---
+	  Datagram Congestion Control Protocol
+
+	  From draft-ietf-dccp-spec-11 <http://www.icir.org/kohler/dcp/draft-ietf-dccp-spec-11.txt>.
+
+	  The Datagram Congestion Control Protocol (DCCP) is a transport
+	  protocol that implements bidirectional, unicast connections of
+	  congestion-controlled, unreliable datagrams. It should be suitable
+	  for use by applications such as streaming media, Internet telephony,
+	  and on-line games
+
+	  To compile this protocol support as a module, choose M here: the
+	  module will be called dccp.
+
+	  If in doubt, say N.
+
+config INET_DCCP_DIAG
+	depends on IP_DCCP && INET_DIAG
+	def_tristate y if (IP_DCCP = y && INET_DIAG = y)
+	def_tristate m
+
+source "net/dccp/ccids/Kconfig"
+
+menu "DCCP Kernel Hacking"
+	depends on IP_DCCP && DEBUG_KERNEL=y
+
+config IP_DCCP_DEBUG
+	bool "DCCP debug messages"
+	---help---
+	  Only use this if you're hacking DCCP.
+
+	  Just say N.
+
+config IP_DCCP_UNLOAD_HACK
+	depends on IP_DCCP=m && IP_DCCP_CCID3=m
+	bool "DCCP control sock unload hack"
+	---help---
+	  Enable this to be able to unload the dccp module when the it
+	  has only one refcount held, the control sock one. Just execute
+	  "rmmod dccp_ccid3 dccp"
+
+	  Just say N.
+endmenu
+
+endmenu
diff --git a/net/dccp/Makefile b/net/dccp/Makefile
new file mode 100644
index 00000000000..fb97bb04245
--- /dev/null
+++ b/net/dccp/Makefile
@@ -0,0 +1,10 @@
+obj-$(CONFIG_IP_DCCP) += dccp.o
+
+dccp-y := ccid.o input.o ipv4.o minisocks.o options.o output.o proto.o \
+	  timer.o
+
+obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o
+
+dccp_diag-y := diag.o
+
+obj-y += ccids/
diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c
new file mode 100644
index 00000000000..9d8fc0e289e
--- /dev/null
+++ b/net/dccp/ccid.c
@@ -0,0 +1,139 @@
+/*
+ *  net/dccp/ccid.c
+ *
+ *  An implementation of the DCCP protocol
+ *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ *  CCID infrastructure
+ *
+ *	This program is free software; you can redistribute it and/or modify it
+ *	under the terms of the GNU General Public License version 2 as
+ *	published by the Free Software Foundation.
+ */
+
+#include "ccid.h"
+
+static struct ccid *ccids[CCID_MAX];
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
+static atomic_t ccids_lockct = ATOMIC_INIT(0);
+static DEFINE_SPINLOCK(ccids_lock);
+
+/*
+ * The strategy is: modifications ccids vector are short, do not sleep and
+ * veeery rare, but read access should be free of any exclusive locks.
+ */
+static void ccids_write_lock(void)
+{
+	spin_lock(&ccids_lock);
+	while (atomic_read(&ccids_lockct) != 0) {
+		spin_unlock(&ccids_lock);
+		yield();
+		spin_lock(&ccids_lock);
+	}
+}
+
+static inline void ccids_write_unlock(void)
+{
+	spin_unlock(&ccids_lock);
+}
+
+static inline void ccids_read_lock(void)
+{
+	atomic_inc(&ccids_lockct);
+	spin_unlock_wait(&ccids_lock);
+}
+
+static inline void ccids_read_unlock(void)
+{
+	atomic_dec(&ccids_lockct);
+}
+
+#else
+#define ccids_write_lock() do { } while(0)
+#define ccids_write_unlock() do { } while(0)
+#define ccids_read_lock() do { } while(0)
+#define ccids_read_unlock() do { } while(0)
+#endif
+
+int ccid_register(struct ccid *ccid)
+{
+	int err;
+
+	if (ccid->ccid_init == NULL)
+		return -1;
+
+	ccids_write_lock();
+	err = -EEXIST;
+	if (ccids[ccid->ccid_id] == NULL) {
+		ccids[ccid->ccid_id] = ccid;
+		err = 0;
+	}
+	ccids_write_unlock();
+	if (err == 0)
+		pr_info("CCID: Registered CCID %d (%s)\n",
+			ccid->ccid_id, ccid->ccid_name);
+	return err;
+}
+
+EXPORT_SYMBOL_GPL(ccid_register);
+
+int ccid_unregister(struct ccid *ccid)
+{
+	ccids_write_lock();
+	ccids[ccid->ccid_id] = NULL;
+	ccids_write_unlock();
+	pr_info("CCID: Unregistered CCID %d (%s)\n",
+		ccid->ccid_id, ccid->ccid_name);
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(ccid_unregister);
+
+struct ccid *ccid_init(unsigned char id, struct sock *sk)
+{
+	struct ccid *ccid;
+
+#ifdef CONFIG_KMOD
+	if (ccids[id] == NULL)
+		request_module("net-dccp-ccid-%d", id);
+#endif
+	ccids_read_lock();
+
+	ccid = ccids[id];
+	if (ccid == NULL)
+		goto out;
+
+	if (!try_module_get(ccid->ccid_owner))
+		goto out_err;
+
+	if (ccid->ccid_init(sk) != 0)
+		goto out_module_put;
+out:
+	ccids_read_unlock();
+	return ccid;
+out_module_put:
+	module_put(ccid->ccid_owner);
+out_err:
+	ccid = NULL;
+	goto out;
+}
+
+EXPORT_SYMBOL_GPL(ccid_init);
+
+void ccid_exit(struct ccid *ccid, struct sock *sk)
+{
+	if (ccid == NULL)
+		return;
+
+	ccids_read_lock();
+
+	if (ccids[ccid->ccid_id] != NULL) {
+		if (ccid->ccid_exit != NULL)
+			ccid->ccid_exit(sk);
+		module_put(ccid->ccid_owner);
+	}
+
+	ccids_read_unlock();
+}
+
+EXPORT_SYMBOL_GPL(ccid_exit);
diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h
new file mode 100644
index 00000000000..962f1e9e2f7
--- /dev/null
+++ b/net/dccp/ccid.h
@@ -0,0 +1,180 @@
+#ifndef _CCID_H
+#define _CCID_H
+/*
+ *  net/dccp/ccid.h
+ *
+ *  An implementation of the DCCP protocol
+ *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ *  CCID infrastructure
+ *
+ *	This program is free software; you can redistribute it and/or modify it
+ *	under the terms of the GNU General Public License version 2 as
+ *	published by the Free Software Foundation.
+ */
+
+#include <net/sock.h>
+#include <linux/dccp.h>
+#include <linux/list.h>
+#include <linux/module.h>
+
+#define CCID_MAX 255
+
+struct ccid {
+	unsigned char	ccid_id;
+	const char	*ccid_name;
+	struct module	*ccid_owner;
+	int		(*ccid_init)(struct sock *sk);
+	void		(*ccid_exit)(struct sock *sk);
+	int		(*ccid_hc_rx_init)(struct sock *sk);
+	int		(*ccid_hc_tx_init)(struct sock *sk);
+	void		(*ccid_hc_rx_exit)(struct sock *sk);
+	void		(*ccid_hc_tx_exit)(struct sock *sk);
+	void		(*ccid_hc_rx_packet_recv)(struct sock *sk,
+						  struct sk_buff *skb);
+	int		(*ccid_hc_rx_parse_options)(struct sock *sk,
+						    unsigned char option,
+						    unsigned char len, u16 idx,
+						    unsigned char* value);
+	void		(*ccid_hc_rx_insert_options)(struct sock *sk,
+						     struct sk_buff *skb);
+	void		(*ccid_hc_tx_insert_options)(struct sock *sk,
+						     struct sk_buff *skb);
+	void		(*ccid_hc_tx_packet_recv)(struct sock *sk,
+						  struct sk_buff *skb);
+	int		(*ccid_hc_tx_parse_options)(struct sock *sk,
+						    unsigned char option,
+						    unsigned char len, u16 idx,
+						    unsigned char* value);
+	int		(*ccid_hc_tx_send_packet)(struct sock *sk,
+						  struct sk_buff *skb, int len);
+	void		(*ccid_hc_tx_packet_sent)(struct sock *sk, int more,
+						  int len);
+	void		(*ccid_hc_rx_get_info)(struct sock *sk,
+					       struct tcp_info *info);
+	void		(*ccid_hc_tx_get_info)(struct sock *sk,
+					       struct tcp_info *info);
+};
+
+extern int	   ccid_register(struct ccid *ccid);
+extern int	   ccid_unregister(struct ccid *ccid);
+
+extern struct ccid *ccid_init(unsigned char id, struct sock *sk);
+extern void	   ccid_exit(struct ccid *ccid, struct sock *sk);
+
+static inline void __ccid_get(struct ccid *ccid)
+{
+	__module_get(ccid->ccid_owner);
+}
+
+static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk,
+					 struct sk_buff *skb, int len)
+{
+	int rc = 0;
+	if (ccid->ccid_hc_tx_send_packet != NULL)
+		rc = ccid->ccid_hc_tx_send_packet(sk, skb, len);
+	return rc;
+}
+
+static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk,
+					  int more, int len)
+{
+	if (ccid->ccid_hc_tx_packet_sent != NULL)
+		ccid->ccid_hc_tx_packet_sent(sk, more, len);
+}
+
+static inline int ccid_hc_rx_init(struct ccid *ccid, struct sock *sk)
+{
+	int rc = 0;
+	if (ccid->ccid_hc_rx_init != NULL)
+		rc = ccid->ccid_hc_rx_init(sk);
+	return rc;
+}
+
+static inline int ccid_hc_tx_init(struct ccid *ccid, struct sock *sk)
+{
+	int rc = 0;
+	if (ccid->ccid_hc_tx_init != NULL)
+		rc = ccid->ccid_hc_tx_init(sk);
+	return rc;
+}
+
+static inline void ccid_hc_rx_exit(struct ccid *ccid, struct sock *sk)
+{
+	if (ccid->ccid_hc_rx_exit != NULL &&
+	    dccp_sk(sk)->dccps_hc_rx_ccid_private != NULL)
+		ccid->ccid_hc_rx_exit(sk);
+}
+
+static inline void ccid_hc_tx_exit(struct ccid *ccid, struct sock *sk)
+{
+	if (ccid->ccid_hc_tx_exit != NULL &&
+	    dccp_sk(sk)->dccps_hc_tx_ccid_private != NULL)
+		ccid->ccid_hc_tx_exit(sk);
+}
+
+static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk,
+					  struct sk_buff *skb)
+{
+	if (ccid->ccid_hc_rx_packet_recv != NULL)
+		ccid->ccid_hc_rx_packet_recv(sk, skb);
+}
+
+static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk,
+					  struct sk_buff *skb)
+{
+	if (ccid->ccid_hc_tx_packet_recv != NULL)
+		ccid->ccid_hc_tx_packet_recv(sk, skb);
+}
+
+static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk,
+					   unsigned char option,
+					   unsigned char len, u16 idx,
+					   unsigned char* value)
+{
+	int rc = 0;
+	if (ccid->ccid_hc_tx_parse_options != NULL)
+		rc = ccid->ccid_hc_tx_parse_options(sk, option, len, idx,
+						    value);
+	return rc;
+}
+
+static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk,
+					   unsigned char option,
+					   unsigned char len, u16 idx,
+					   unsigned char* value)
+{
+	int rc = 0;
+	if (ccid->ccid_hc_rx_parse_options != NULL)
+		rc = ccid->ccid_hc_rx_parse_options(sk, option, len, idx, value);
+	return rc;
+}
+
+static inline void ccid_hc_tx_insert_options(struct ccid *ccid, struct sock *sk,
+					     struct sk_buff *skb)
+{
+	if (ccid->ccid_hc_tx_insert_options != NULL)
+		ccid->ccid_hc_tx_insert_options(sk, skb);
+}
+
+static inline void ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk,
+					     struct sk_buff *skb)
+{
+	if (ccid->ccid_hc_rx_insert_options != NULL)
+		ccid->ccid_hc_rx_insert_options(sk, skb);
+}
+
+static inline void ccid_hc_rx_get_info(struct ccid *ccid, struct sock *sk,
+				       struct tcp_info *info)
+{
+	if (ccid->ccid_hc_rx_get_info != NULL)
+		ccid->ccid_hc_rx_get_info(sk, info);
+}
+
+static inline void ccid_hc_tx_get_info(struct ccid *ccid, struct sock *sk,
+				       struct tcp_info *info)
+{
+	if (ccid->ccid_hc_tx_get_info != NULL)
+		ccid->ccid_hc_tx_get_info(sk, info);
+}
+#endif /* _CCID_H */
diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig
new file mode 100644
index 00000000000..7684d83946a
--- /dev/null
+++ b/net/dccp/ccids/Kconfig
@@ -0,0 +1,29 @@
+menu "DCCP CCIDs Configuration (EXPERIMENTAL)"
+	depends on IP_DCCP && EXPERIMENTAL
+
+config IP_DCCP_CCID3
+	tristate "CCID3 (TFRC) (EXPERIMENTAL)"
+	depends on IP_DCCP
+	---help---
+	  CCID 3 denotes TCP-Friendly Rate Control (TFRC), an equation-based
+	  rate-controlled congestion control mechanism.  TFRC is designed to
+	  be reasonably fair when competing for bandwidth with TCP-like flows,
+	  where a flow is "reasonably fair" if its sending rate is generally
+	  within a factor of two of the sending rate of a TCP flow under the
+	  same conditions.  However, TFRC has a much lower variation of
+	  throughput over time compared with TCP, which makes CCID 3 more
+	  suitable than CCID 2 for applications such streaming media where a
+	  relatively smooth sending rate is of importance.
+
+	  CCID 3 is further described in [CCID 3 PROFILE]. The TFRC
+	  congestion control algorithms were initially described in RFC 3448.
+
+	  This text was extracted from draft-ietf-dccp-spec-11.txt.
+	  
+	  If in doubt, say M.
+
+config IP_DCCP_TFRC_LIB
+	depends on IP_DCCP_CCID3
+	def_tristate IP_DCCP_CCID3
+
+endmenu
diff --git a/net/dccp/ccids/Makefile b/net/dccp/ccids/Makefile
new file mode 100644
index 00000000000..956f79f5074
--- /dev/null
+++ b/net/dccp/ccids/Makefile
@@ -0,0 +1,5 @@
+obj-$(CONFIG_IP_DCCP_CCID3) += dccp_ccid3.o
+
+dccp_ccid3-y := ccid3.o
+
+obj-y += lib/
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
new file mode 100644
index 00000000000..7bf3b3a91e9
--- /dev/null
+++ b/net/dccp/ccids/ccid3.c
@@ -0,0 +1,1221 @@
+/*
+ *  net/dccp/ccids/ccid3.c
+ *
+ *  Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
+ *  Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz>
+ *
+ *  An implementation of the DCCP protocol
+ *
+ *  This code has been developed by the University of Waikato WAND
+ *  research group. For further information please see http://www.wand.net.nz/
+ *
+ *  This code also uses code from Lulea University, rereleased as GPL by its
+ *  authors:
+ *  Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
+ *
+ *  Changes to meet Linux coding standards, to make it meet latest ccid3 draft
+ *  and to make it work as a loadable module in the DCCP stack written by
+ *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
+ *
+ *  Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/config.h>
+#include "../ccid.h"
+#include "../dccp.h"
+#include "lib/packet_history.h"
+#include "lib/loss_interval.h"
+#include "lib/tfrc.h"
+#include "ccid3.h"
+
+/*
+ * Reason for maths with 10 here is to avoid 32 bit overflow when a is big.
+ */
+static inline u32 usecs_div(const u32 a, const u32 b)
+{
+	const u32 tmp = a * (USEC_PER_SEC / 10);
+	return b > 20 ? tmp / (b / 10) : tmp;
+}
+
+static int ccid3_debug;
+
+#ifdef CCID3_DEBUG
+#define ccid3_pr_debug(format, a...) \
+	do { if (ccid3_debug) \
+		printk(KERN_DEBUG "%s: " format, __FUNCTION__, ##a); \
+	} while (0)
+#else
+#define ccid3_pr_debug(format, a...)
+#endif
+
+static struct dccp_tx_hist *ccid3_tx_hist;
+static struct dccp_rx_hist *ccid3_rx_hist;
+static struct dccp_li_hist *ccid3_li_hist;
+
+static int ccid3_init(struct sock *sk)
+{
+	ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk);
+	return 0;
+}
+
+static void ccid3_exit(struct sock *sk)
+{
+	ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk);
+}
+
+/* TFRC sender states */
+enum ccid3_hc_tx_states {
+       	TFRC_SSTATE_NO_SENT = 1,
+	TFRC_SSTATE_NO_FBACK,
+	TFRC_SSTATE_FBACK,
+	TFRC_SSTATE_TERM,
+};
+
+#ifdef CCID3_DEBUG
+static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state)
+{
+	static char *ccid3_state_names[] = {
+	[TFRC_SSTATE_NO_SENT]  = "NO_SENT",
+	[TFRC_SSTATE_NO_FBACK] = "NO_FBACK",
+	[TFRC_SSTATE_FBACK]    = "FBACK",
+	[TFRC_SSTATE_TERM]     = "TERM",
+	};
+
+	return ccid3_state_names[state];
+}
+#endif
+
+static inline void ccid3_hc_tx_set_state(struct sock *sk,
+					 enum ccid3_hc_tx_states state)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
+	enum ccid3_hc_tx_states oldstate = hctx->ccid3hctx_state;
+
+	ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
+		       dccp_role(sk), sk, ccid3_tx_state_name(oldstate),
+		       ccid3_tx_state_name(state));
+	WARN_ON(state == oldstate);
+	hctx->ccid3hctx_state = state;
+}
+
+/* Calculate new t_ipi (inter packet interval) by t_ipi = s / X_inst */
+static inline void ccid3_calc_new_t_ipi(struct ccid3_hc_tx_sock *hctx)
+{
+	/*
+	 * If no feedback spec says t_ipi is 1 second (set elsewhere and then
+	 * doubles after every no feedback timer (separate function)
+	 */
+	if (hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK)
+		hctx->ccid3hctx_t_ipi = usecs_div(hctx->ccid3hctx_s,
+						  hctx->ccid3hctx_x);
+}
+
+/* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */
+static inline void ccid3_calc_new_delta(struct ccid3_hc_tx_sock *hctx)
+{
+	hctx->ccid3hctx_delta = min_t(u32, hctx->ccid3hctx_t_ipi / 2,
+					   TFRC_OPSYS_HALF_TIME_GRAN);
+}
+
+/*
+ * Update X by
+ *    If (p > 0)
+ *       x_calc = calcX(s, R, p);
+ *       X = max(min(X_calc, 2 * X_recv), s / t_mbi);
+ *    Else
+ *       If (now - tld >= R)
+ *          X = max(min(2 * X, 2 * X_recv), s / R);
+ *          tld = now;
+ */ 
+static void ccid3_hc_tx_update_x(struct sock *sk)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
+
+	/* To avoid large error in calcX */
+	if (hctx->ccid3hctx_p >= TFRC_SMALLEST_P) {
+		hctx->ccid3hctx_x_calc = tfrc_calc_x(hctx->ccid3hctx_s,
+						     hctx->ccid3hctx_rtt,
+						     hctx->ccid3hctx_p);
+		hctx->ccid3hctx_x = max_t(u32, min_t(u32, hctx->ccid3hctx_x_calc,
+							  2 * hctx->ccid3hctx_x_recv),
+					       (hctx->ccid3hctx_s /
+					        TFRC_MAX_BACK_OFF_TIME));
+	} else {
+		struct timeval now;
+
+		do_gettimeofday(&now);
+	       	if (timeval_delta(&now, &hctx->ccid3hctx_t_ld) >=
+		    hctx->ccid3hctx_rtt) {
+			hctx->ccid3hctx_x = max_t(u32, min_t(u32, hctx->ccid3hctx_x_recv,
+								  hctx->ccid3hctx_x) * 2,
+						       usecs_div(hctx->ccid3hctx_s,
+							       	 hctx->ccid3hctx_rtt));
+			hctx->ccid3hctx_t_ld = now;
+		}
+	}
+}
+
+static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
+{
+	struct sock *sk = (struct sock *)data;
+	struct dccp_sock *dp = dccp_sk(sk);
+	unsigned long next_tmout = 0;
+	struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
+
+	bh_lock_sock(sk);
+	if (sock_owned_by_user(sk)) {
+		/* Try again later. */
+		/* XXX: set some sensible MIB */
+		sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
+			       jiffies + HZ / 5);
+		goto out;
+	}
+
+	ccid3_pr_debug("%s, sk=%p, state=%s\n", dccp_role(sk), sk,
+		       ccid3_tx_state_name(hctx->ccid3hctx_state));
+	
+	switch (hctx->ccid3hctx_state) {
+	case TFRC_SSTATE_TERM:
+		goto out;
+	case TFRC_SSTATE_NO_FBACK:
+		/* Halve send rate */
+		hctx->ccid3hctx_x /= 2;
+		if (hctx->ccid3hctx_x < (hctx->ccid3hctx_s /
+					 TFRC_MAX_BACK_OFF_TIME))
+			hctx->ccid3hctx_x = (hctx->ccid3hctx_s /
+					     TFRC_MAX_BACK_OFF_TIME);
+
+		ccid3_pr_debug("%s, sk=%p, state=%s, updated tx rate to %d "
+			       "bytes/s\n",
+			       dccp_role(sk), sk,
+			       ccid3_tx_state_name(hctx->ccid3hctx_state),
+			       hctx->ccid3hctx_x);
+		next_tmout = max_t(u32, 2 * usecs_div(hctx->ccid3hctx_s,
+						      hctx->ccid3hctx_x),
+					TFRC_INITIAL_TIMEOUT);
+		/*
+		 * FIXME - not sure above calculation is correct. See section
+		 * 5 of CCID3 11 should adjust tx_t_ipi and double that to
+		 * achieve it really
+		 */
+		break;
+	case TFRC_SSTATE_FBACK:
+		/*
+		 * Check if IDLE since last timeout and recv rate is less than
+		 * 4 packets per RTT
+		 */
+		if (!hctx->ccid3hctx_idle ||
+		    (hctx->ccid3hctx_x_recv >=
+		     4 * usecs_div(hctx->ccid3hctx_s, hctx->ccid3hctx_rtt))) {
+			ccid3_pr_debug("%s, sk=%p, state=%s, not idle\n",
+				       dccp_role(sk), sk,
+				       ccid3_tx_state_name(hctx->ccid3hctx_state));
+			/* Halve sending rate */
+
+			/*  If (X_calc > 2 * X_recv)
+			 *    X_recv = max(X_recv / 2, s / (2 * t_mbi));
+			 *  Else
+			 *    X_recv = X_calc / 4;
+			 */
+			BUG_ON(hctx->ccid3hctx_p >= TFRC_SMALLEST_P &&
+			       hctx->ccid3hctx_x_calc == 0);
+
+			/* check also if p is zero -> x_calc is infinity? */
+			if (hctx->ccid3hctx_p < TFRC_SMALLEST_P ||
+			    hctx->ccid3hctx_x_calc > 2 * hctx->ccid3hctx_x_recv)
+				hctx->ccid3hctx_x_recv = max_t(u32, hctx->ccid3hctx_x_recv / 2,
+								    hctx->ccid3hctx_s / (2 * TFRC_MAX_BACK_OFF_TIME));
+			else
+				hctx->ccid3hctx_x_recv = hctx->ccid3hctx_x_calc / 4;
+
+			/* Update sending rate */
+			ccid3_hc_tx_update_x(sk);
+		}
+		/*
+		 * Schedule no feedback timer to expire in
+		 * max(4 * R, 2 * s / X)
+		 */
+		next_tmout = max_t(u32, hctx->ccid3hctx_t_rto, 
+					2 * usecs_div(hctx->ccid3hctx_s,
+						      hctx->ccid3hctx_x));
+		break;
+	default:
+		printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n",
+		       __FUNCTION__, dccp_role(sk), sk, hctx->ccid3hctx_state);
+		dump_stack();
+		goto out;
+	}
+
+	sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, 
+		      jiffies + max_t(u32, 1, usecs_to_jiffies(next_tmout)));
+	hctx->ccid3hctx_idle = 1;
+out:
+	bh_unlock_sock(sk);
+	sock_put(sk);
+}
+
+static int ccid3_hc_tx_send_packet(struct sock *sk,
+				   struct sk_buff *skb, int len)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
+	struct dccp_tx_hist_entry *new_packet;
+	struct timeval now;
+	long delay;
+	int rc = -ENOTCONN;
+
+	/* Check if pure ACK or Terminating*/
+
+	/*
+	 * XXX: We only call this function for DATA and DATAACK, on, these
+	 * packets can have zero length, but why the comment about "pure ACK"?
+	 */
+	if (hctx == NULL || len == 0 ||
+	    hctx->ccid3hctx_state == TFRC_SSTATE_TERM)
+		goto out;
+
+	/* See if last packet allocated was not sent */
+	new_packet = dccp_tx_hist_head(&hctx->ccid3hctx_hist);
+	if (new_packet == NULL || new_packet->dccphtx_sent) {
+		new_packet = dccp_tx_hist_entry_new(ccid3_tx_hist,
+						    SLAB_ATOMIC);
+
+		rc = -ENOBUFS;
+		if (new_packet == NULL) {
+			ccid3_pr_debug("%s, sk=%p, not enough mem to add "
+				       "to history, send refused\n",
+				       dccp_role(sk), sk);
+			goto out;
+		}
+
+		dccp_tx_hist_add_entry(&hctx->ccid3hctx_hist, new_packet);
+	}
+
+	do_gettimeofday(&now);
+
+	switch (hctx->ccid3hctx_state) {
+	case TFRC_SSTATE_NO_SENT:
+		ccid3_pr_debug("%s, sk=%p, first packet(%llu)\n",
+			       dccp_role(sk), sk, dp->dccps_gss);
+
+		hctx->ccid3hctx_no_feedback_timer.function = ccid3_hc_tx_no_feedback_timer;
+		hctx->ccid3hctx_no_feedback_timer.data     = (unsigned long)sk;
+		sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
+			       jiffies + usecs_to_jiffies(TFRC_INITIAL_TIMEOUT));
+		hctx->ccid3hctx_last_win_count	 = 0;
+		hctx->ccid3hctx_t_last_win_count = now;
+		ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
+		hctx->ccid3hctx_t_ipi = TFRC_INITIAL_TIMEOUT;
+
+		/* Set nominal send time for initial packet */
+		hctx->ccid3hctx_t_nom = now;
+		timeval_add_usecs(&hctx->ccid3hctx_t_nom,
+				  hctx->ccid3hctx_t_ipi);
+		ccid3_calc_new_delta(hctx);
+		rc = 0;
+		break;
+	case TFRC_SSTATE_NO_FBACK:
+	case TFRC_SSTATE_FBACK:
+		delay = (timeval_delta(&now, &hctx->ccid3hctx_t_nom) -
+		         hctx->ccid3hctx_delta);
+		ccid3_pr_debug("send_packet delay=%ld\n", delay);
+		delay /= -1000;
+		/* divide by -1000 is to convert to ms and get sign right */
+		rc = delay > 0 ? delay : 0;
+		break;
+	default:
+		printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n",
+		       __FUNCTION__, dccp_role(sk), sk, hctx->ccid3hctx_state);
+		dump_stack();
+		rc = -EINVAL;
+		break;
+	}
+
+	/* Can we send? if so add options and add to packet history */
+	if (rc == 0)
+		new_packet->dccphtx_ccval =
+			DCCP_SKB_CB(skb)->dccpd_ccval =
+				hctx->ccid3hctx_last_win_count;
+out:
+	return rc;
+}
+
+static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, int len)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
+	struct timeval now;
+
+	BUG_ON(hctx == NULL);
+
+	if (hctx->ccid3hctx_state == TFRC_SSTATE_TERM) {
+		ccid3_pr_debug("%s, sk=%p, while state is TFRC_SSTATE_TERM!\n",
+			       dccp_role(sk), sk);
+		return;
+	}
+
+	do_gettimeofday(&now);
+
+	/* check if we have sent a data packet */
+	if (len > 0) {
+		unsigned long quarter_rtt;
+		struct dccp_tx_hist_entry *packet;
+
+		packet = dccp_tx_hist_head(&hctx->ccid3hctx_hist);
+		if (packet == NULL) {
+			printk(KERN_CRIT "%s: packet doesn't exists in "
+					 "history!\n", __FUNCTION__);
+			return;
+		}
+		if (packet->dccphtx_sent) {
+			printk(KERN_CRIT "%s: no unsent packet in history!\n",
+			       __FUNCTION__);
+			return;
+		}
+		packet->dccphtx_tstamp = now;
+		packet->dccphtx_seqno  = dp->dccps_gss;
+		/*
+		 * Check if win_count have changed
+		 * Algorithm in "8.1. Window Counter Valuer" in
+		 * draft-ietf-dccp-ccid3-11.txt
+		 */
+		quarter_rtt = timeval_delta(&now, &hctx->ccid3hctx_t_last_win_count);
+		if (likely(hctx->ccid3hctx_rtt > 8))
+			quarter_rtt /= hctx->ccid3hctx_rtt / 4;
+
+		if (quarter_rtt > 0) {
+			hctx->ccid3hctx_t_last_win_count = now;
+			hctx->ccid3hctx_last_win_count	 = (hctx->ccid3hctx_last_win_count +
+							    min_t(unsigned long, quarter_rtt, 5)) % 16;
+			ccid3_pr_debug("%s, sk=%p, window changed from "
+				       "%u to %u!\n",
+				       dccp_role(sk), sk,
+				       packet->dccphtx_ccval,
+				       hctx->ccid3hctx_last_win_count);
+		}
+
+		hctx->ccid3hctx_idle = 0;
+		packet->dccphtx_rtt  = hctx->ccid3hctx_rtt;
+		packet->dccphtx_sent = 1;
+	} else
+		ccid3_pr_debug("%s, sk=%p, seqno=%llu NOT inserted!\n",
+			       dccp_role(sk), sk, dp->dccps_gss);
+
+	switch (hctx->ccid3hctx_state) {
+	case TFRC_SSTATE_NO_SENT:
+		/* if first wasn't pure ack */
+		if (len != 0)
+			printk(KERN_CRIT "%s: %s, First packet sent is noted "
+					 "as a data packet\n",
+			       __FUNCTION__, dccp_role(sk));
+		return;
+	case TFRC_SSTATE_NO_FBACK:
+	case TFRC_SSTATE_FBACK:
+		if (len > 0) {
+			hctx->ccid3hctx_t_nom = now;
+			ccid3_calc_new_t_ipi(hctx);
+			ccid3_calc_new_delta(hctx);
+			timeval_add_usecs(&hctx->ccid3hctx_t_nom,
+					  hctx->ccid3hctx_t_ipi);
+		}
+		break;
+	default:
+		printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n",
+		       __FUNCTION__, dccp_role(sk), sk, hctx->ccid3hctx_state);
+		dump_stack();
+		break;
+	}
+}
+
+static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
+	struct ccid3_options_received *opt_recv;
+	struct dccp_tx_hist_entry *packet;
+	unsigned long next_tmout; 
+	u32 t_elapsed;
+	u32 pinv;
+	u32 x_recv;
+	u32 r_sample;
+
+	if (hctx == NULL)
+		return;
+
+	if (hctx->ccid3hctx_state == TFRC_SSTATE_TERM) {
+		ccid3_pr_debug("%s, sk=%p, received a packet when "
+			       "terminating!\n", dccp_role(sk), sk);
+		return;
+	}
+
+	/* we are only interested in ACKs */
+	if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK ||
+	      DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK))
+		return;
+
+	opt_recv = &hctx->ccid3hctx_options_received;
+
+	t_elapsed = dp->dccps_options_received.dccpor_elapsed_time;
+	x_recv = opt_recv->ccid3or_receive_rate;
+	pinv = opt_recv->ccid3or_loss_event_rate;
+
+	switch (hctx->ccid3hctx_state) {
+	case TFRC_SSTATE_NO_SENT:
+		/* FIXME: what to do here? */
+		return;
+	case TFRC_SSTATE_NO_FBACK:
+	case TFRC_SSTATE_FBACK:
+		/* Calculate new round trip sample by
+		 * R_sample = (now - t_recvdata) - t_delay */
+		/* get t_recvdata from history */
+		packet = dccp_tx_hist_find_entry(&hctx->ccid3hctx_hist,
+						 DCCP_SKB_CB(skb)->dccpd_ack_seq);
+		if (packet == NULL) {
+			ccid3_pr_debug("%s, sk=%p, seqno %llu(%s) does't "
+				       "exist in history!\n",
+				       dccp_role(sk), sk,
+				       DCCP_SKB_CB(skb)->dccpd_ack_seq,
+				       dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type));
+			return;
+		}
+
+		/* Update RTT */
+		r_sample = timeval_now_delta(&packet->dccphtx_tstamp);
+		/* FIXME: */
+		// r_sample -= usecs_to_jiffies(t_elapsed * 10);
+
+		/* Update RTT estimate by 
+		 * If (No feedback recv)
+		 *    R = R_sample;
+		 * Else
+		 *    R = q * R + (1 - q) * R_sample;
+		 *
+		 * q is a constant, RFC 3448 recomments 0.9
+		 */
+		if (hctx->ccid3hctx_state == TFRC_SSTATE_NO_FBACK) {
+			ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK);
+			hctx->ccid3hctx_rtt = r_sample;
+		} else
+			hctx->ccid3hctx_rtt = (hctx->ccid3hctx_rtt * 9) / 10 +
+					      r_sample / 10;
+
+		ccid3_pr_debug("%s, sk=%p, New RTT estimate=%uus, "
+			       "r_sample=%us\n", dccp_role(sk), sk,
+			       hctx->ccid3hctx_rtt, r_sample);
+
+		/* Update timeout interval */
+		hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt,
+					      USEC_PER_SEC);
+
+		/* Update receive rate */
+		hctx->ccid3hctx_x_recv = x_recv;/* X_recv in bytes per sec */
+
+		/* Update loss event rate */
+		if (pinv == ~0 || pinv == 0)
+			hctx->ccid3hctx_p = 0;
+		else {
+			hctx->ccid3hctx_p = 1000000 / pinv;
+
+			if (hctx->ccid3hctx_p < TFRC_SMALLEST_P) {
+				hctx->ccid3hctx_p = TFRC_SMALLEST_P;
+				ccid3_pr_debug("%s, sk=%p, Smallest p used!\n",
+					       dccp_role(sk), sk);
+			}
+		}
+
+		/* unschedule no feedback timer */
+		sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer);
+
+		/* Update sending rate */
+		ccid3_hc_tx_update_x(sk);
+
+		/* Update next send time */
+		timeval_sub_usecs(&hctx->ccid3hctx_t_nom,
+				  hctx->ccid3hctx_t_ipi);
+		ccid3_calc_new_t_ipi(hctx);
+		timeval_add_usecs(&hctx->ccid3hctx_t_nom,
+				  hctx->ccid3hctx_t_ipi);
+		ccid3_calc_new_delta(hctx);
+
+		/* remove all packets older than the one acked from history */
+		dccp_tx_hist_purge_older(ccid3_tx_hist,
+					 &hctx->ccid3hctx_hist, packet);
+		/*
+		 * As we have calculated new ipi, delta, t_nom it is possible that
+		 * we now can send a packet, so wake up dccp_wait_for_ccids.
+		 */
+		sk->sk_write_space(sk);
+
+		/*
+		 * Schedule no feedback timer to expire in
+		 * max(4 * R, 2 * s / X)
+		 */
+		next_tmout = max(hctx->ccid3hctx_t_rto,
+				 2 * usecs_div(hctx->ccid3hctx_s,
+					       hctx->ccid3hctx_x));
+			
+		ccid3_pr_debug("%s, sk=%p, Scheduled no feedback timer to "
+			       "expire in %lu jiffies (%luus)\n",
+			       dccp_role(sk), sk,
+			       usecs_to_jiffies(next_tmout), next_tmout); 
+
+		sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, 
+			       jiffies + max_t(u32, 1, usecs_to_jiffies(next_tmout)));
+
+		/* set idle flag */
+		hctx->ccid3hctx_idle = 1;   
+		break;
+	default:
+		printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n",
+		       __FUNCTION__, dccp_role(sk), sk, hctx->ccid3hctx_state);
+		dump_stack();
+		break;
+	}
+}
+
+static void ccid3_hc_tx_insert_options(struct sock *sk, struct sk_buff *skb)
+{
+	const struct dccp_sock *dp = dccp_sk(sk);
+	struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
+
+	if (hctx == NULL || !(sk->sk_state == DCCP_OPEN ||
+			      sk->sk_state == DCCP_PARTOPEN))
+		return;
+
+	 DCCP_SKB_CB(skb)->dccpd_ccval = hctx->ccid3hctx_last_win_count;
+}
+
+static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option,
+				     unsigned char len, u16 idx,
+				     unsigned char *value)
+{
+	int rc = 0;
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
+	struct ccid3_options_received *opt_recv;
+
+	if (hctx == NULL)
+		return 0;
+
+	opt_recv = &hctx->ccid3hctx_options_received;
+
+	if (opt_recv->ccid3or_seqno != dp->dccps_gsr) {
+		opt_recv->ccid3or_seqno		     = dp->dccps_gsr;
+		opt_recv->ccid3or_loss_event_rate    = ~0;
+		opt_recv->ccid3or_loss_intervals_idx = 0;
+		opt_recv->ccid3or_loss_intervals_len = 0;
+		opt_recv->ccid3or_receive_rate	     = 0;
+	}
+
+	switch (option) {
+	case TFRC_OPT_LOSS_EVENT_RATE:
+		if (len != 4) {
+			ccid3_pr_debug("%s, sk=%p, invalid len for "
+				       "TFRC_OPT_LOSS_EVENT_RATE\n",
+				       dccp_role(sk), sk);
+			rc = -EINVAL;
+		} else {
+			opt_recv->ccid3or_loss_event_rate = ntohl(*(u32 *)value);
+			ccid3_pr_debug("%s, sk=%p, LOSS_EVENT_RATE=%u\n",
+				       dccp_role(sk), sk,
+				       opt_recv->ccid3or_loss_event_rate);
+		}
+		break;
+	case TFRC_OPT_LOSS_INTERVALS:
+		opt_recv->ccid3or_loss_intervals_idx = idx;
+		opt_recv->ccid3or_loss_intervals_len = len;
+		ccid3_pr_debug("%s, sk=%p, LOSS_INTERVALS=(%u, %u)\n",
+			       dccp_role(sk), sk,
+			       opt_recv->ccid3or_loss_intervals_idx,
+			       opt_recv->ccid3or_loss_intervals_len);
+		break;
+	case TFRC_OPT_RECEIVE_RATE:
+		if (len != 4) {
+			ccid3_pr_debug("%s, sk=%p, invalid len for "
+				       "TFRC_OPT_RECEIVE_RATE\n",
+				       dccp_role(sk), sk);
+			rc = -EINVAL;
+		} else {
+			opt_recv->ccid3or_receive_rate = ntohl(*(u32 *)value);
+			ccid3_pr_debug("%s, sk=%p, RECEIVE_RATE=%u\n",
+				       dccp_role(sk), sk,
+				       opt_recv->ccid3or_receive_rate);
+		}
+		break;
+	}
+
+	return rc;
+}
+
+static int ccid3_hc_tx_init(struct sock *sk)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct ccid3_hc_tx_sock *hctx;
+
+	ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk);
+
+	hctx = dp->dccps_hc_tx_ccid_private = kmalloc(sizeof(*hctx),
+						      gfp_any());
+	if (hctx == NULL)
+		return -ENOMEM;
+
+	memset(hctx, 0, sizeof(*hctx));
+
+	if (dp->dccps_packet_size >= TFRC_MIN_PACKET_SIZE &&
+	    dp->dccps_packet_size <= TFRC_MAX_PACKET_SIZE)
+		hctx->ccid3hctx_s = dp->dccps_packet_size;
+	else
+		hctx->ccid3hctx_s = TFRC_STD_PACKET_SIZE;
+
+	/* Set transmission rate to 1 packet per second */
+	hctx->ccid3hctx_x     = hctx->ccid3hctx_s;
+	hctx->ccid3hctx_t_rto = USEC_PER_SEC;
+	hctx->ccid3hctx_state = TFRC_SSTATE_NO_SENT;
+	INIT_LIST_HEAD(&hctx->ccid3hctx_hist);
+	init_timer(&hctx->ccid3hctx_no_feedback_timer);
+
+	return 0;
+}
+
+static void ccid3_hc_tx_exit(struct sock *sk)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
+
+	ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk);
+	BUG_ON(hctx == NULL);
+
+	ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM);
+	sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer);
+
+	/* Empty packet history */
+	dccp_tx_hist_purge(ccid3_tx_hist, &hctx->ccid3hctx_hist);
+
+	kfree(dp->dccps_hc_tx_ccid_private);
+	dp->dccps_hc_tx_ccid_private = NULL;
+}
+
+/*
+ * RX Half Connection methods
+ */
+
+/* TFRC receiver states */
+enum ccid3_hc_rx_states {
+       	TFRC_RSTATE_NO_DATA = 1,
+	TFRC_RSTATE_DATA,
+	TFRC_RSTATE_TERM    = 127,
+};
+
+#ifdef CCID3_DEBUG
+static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state)
+{
+	static char *ccid3_rx_state_names[] = {
+	[TFRC_RSTATE_NO_DATA] = "NO_DATA",
+	[TFRC_RSTATE_DATA]    = "DATA",
+	[TFRC_RSTATE_TERM]    = "TERM",
+	};
+
+	return ccid3_rx_state_names[state];
+}
+#endif
+
+static inline void ccid3_hc_rx_set_state(struct sock *sk,
+					 enum ccid3_hc_rx_states state)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private;
+	enum ccid3_hc_rx_states oldstate = hcrx->ccid3hcrx_state;
+
+	ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
+		       dccp_role(sk), sk, ccid3_rx_state_name(oldstate),
+		       ccid3_rx_state_name(state));
+	WARN_ON(state == oldstate);
+	hcrx->ccid3hcrx_state = state;
+}
+
+static void ccid3_hc_rx_send_feedback(struct sock *sk)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private;
+	struct dccp_rx_hist_entry *packet;
+	struct timeval now;
+
+	ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk);
+
+	do_gettimeofday(&now);
+
+	switch (hcrx->ccid3hcrx_state) {
+	case TFRC_RSTATE_NO_DATA:
+		hcrx->ccid3hcrx_x_recv = 0;
+		break;
+	case TFRC_RSTATE_DATA: {
+		const u32 delta = timeval_delta(&now,
+					&hcrx->ccid3hcrx_tstamp_last_feedback);
+
+		hcrx->ccid3hcrx_x_recv = (hcrx->ccid3hcrx_bytes_recv *
+					  USEC_PER_SEC);
+		if (likely(delta > 1))
+			hcrx->ccid3hcrx_x_recv /= delta;
+	}
+		break;
+	default:
+		printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n",
+		       __FUNCTION__, dccp_role(sk), sk, hcrx->ccid3hcrx_state);
+		dump_stack();
+		return;
+	}
+
+	packet = dccp_rx_hist_find_data_packet(&hcrx->ccid3hcrx_hist);
+	if (packet == NULL) {
+		printk(KERN_CRIT "%s: %s, sk=%p, no data packet in history!\n",
+		       __FUNCTION__, dccp_role(sk), sk);
+		dump_stack();
+		return;
+	}
+
+	hcrx->ccid3hcrx_tstamp_last_feedback = now;
+	hcrx->ccid3hcrx_last_counter	     = packet->dccphrx_ccval;
+	hcrx->ccid3hcrx_seqno_last_counter   = packet->dccphrx_seqno;
+	hcrx->ccid3hcrx_bytes_recv	     = 0;
+
+	/* Convert to multiples of 10us */
+	hcrx->ccid3hcrx_elapsed_time =
+			timeval_delta(&now, &packet->dccphrx_tstamp) / 10;
+	if (hcrx->ccid3hcrx_p == 0)
+		hcrx->ccid3hcrx_pinv = ~0;
+	else
+		hcrx->ccid3hcrx_pinv = 1000000 / hcrx->ccid3hcrx_p;
+	dccp_send_ack(sk);
+}
+
+static void ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
+{
+	const struct dccp_sock *dp = dccp_sk(sk);
+	u32 x_recv, pinv;
+	struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private;
+
+	if (hcrx == NULL || !(sk->sk_state == DCCP_OPEN ||
+			      sk->sk_state == DCCP_PARTOPEN))
+		return;
+
+	DCCP_SKB_CB(skb)->dccpd_ccval = hcrx->ccid3hcrx_last_counter;
+
+	if (dccp_packet_without_ack(skb))
+		return;
+		
+	if (hcrx->ccid3hcrx_elapsed_time != 0)
+		dccp_insert_option_elapsed_time(sk, skb,
+						hcrx->ccid3hcrx_elapsed_time);
+	dccp_insert_option_timestamp(sk, skb);
+	x_recv = htonl(hcrx->ccid3hcrx_x_recv);
+	pinv   = htonl(hcrx->ccid3hcrx_pinv);
+	dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE,
+			   &pinv, sizeof(pinv));
+	dccp_insert_option(sk, skb, TFRC_OPT_RECEIVE_RATE,
+			   &x_recv, sizeof(x_recv));
+}
+
+/* calculate first loss interval
+ *
+ * returns estimated loss interval in usecs */
+
+static u32 ccid3_hc_rx_calc_first_li(struct sock *sk)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private;
+	struct dccp_rx_hist_entry *entry, *next, *tail = NULL;
+	u32 rtt, delta, x_recv, fval, p, tmp2;
+	struct timeval tstamp = { 0, };
+	int interval = 0;
+	int win_count = 0;
+	int step = 0;
+	u64 tmp1;
+
+	list_for_each_entry_safe(entry, next, &hcrx->ccid3hcrx_hist,
+				 dccphrx_node) {
+		if (dccp_rx_hist_entry_data_packet(entry)) {
+			tail = entry;
+
+			switch (step) {
+			case 0:
+				tstamp	  = entry->dccphrx_tstamp;
+				win_count = entry->dccphrx_ccval;
+				step = 1;
+				break;
+			case 1:
+				interval = win_count - entry->dccphrx_ccval;
+				if (interval < 0)
+					interval += TFRC_WIN_COUNT_LIMIT;
+				if (interval > 4)
+					goto found;
+				break;
+			}
+		}
+	}
+
+	if (step == 0) {
+		printk(KERN_CRIT "%s: %s, sk=%p, packet history contains no "
+				 "data packets!\n",
+		       __FUNCTION__, dccp_role(sk), sk);
+		return ~0;
+	}
+
+	if (interval == 0) {
+		ccid3_pr_debug("%s, sk=%p, Could not find a win_count "
+			       "interval > 0. Defaulting to 1\n",
+			       dccp_role(sk), sk);
+		interval = 1;
+	}
+found:
+	rtt = timeval_delta(&tstamp, &tail->dccphrx_tstamp) * 4 / interval;
+	ccid3_pr_debug("%s, sk=%p, approximated RTT to %uus\n",
+		       dccp_role(sk), sk, rtt);
+	if (rtt == 0)
+		rtt = 1;
+
+	delta = timeval_now_delta(&hcrx->ccid3hcrx_tstamp_last_feedback);
+	x_recv = hcrx->ccid3hcrx_bytes_recv * USEC_PER_SEC;
+	if (likely(delta > 1))
+		x_recv /= delta;
+
+	tmp1 = (u64)x_recv * (u64)rtt;
+	do_div(tmp1,10000000);
+	tmp2 = (u32)tmp1;
+	fval = (hcrx->ccid3hcrx_s * 100000) / tmp2;
+	/* do not alter order above or you will get overflow on 32 bit */
+	p = tfrc_calc_x_reverse_lookup(fval);
+	ccid3_pr_debug("%s, sk=%p, receive rate=%u bytes/s, implied "
+		       "loss rate=%u\n", dccp_role(sk), sk, x_recv, p);
+
+	if (p == 0)
+		return ~0;
+	else
+		return 1000000 / p; 
+}
+
+static void ccid3_hc_rx_update_li(struct sock *sk, u64 seq_loss, u8 win_loss)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private;
+
+	if (seq_loss != DCCP_MAX_SEQNO + 1 &&
+	    list_empty(&hcrx->ccid3hcrx_li_hist)) {
+		struct dccp_li_hist_entry *li_tail;
+
+		li_tail = dccp_li_hist_interval_new(ccid3_li_hist,
+						    &hcrx->ccid3hcrx_li_hist,
+						    seq_loss, win_loss);
+		if (li_tail == NULL)
+			return;
+		li_tail->dccplih_interval = ccid3_hc_rx_calc_first_li(sk);
+	}
+	/* FIXME: find end of interval */
+}
+
+static void ccid3_hc_rx_detect_loss(struct sock *sk)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private;
+	u8 win_loss;
+	const u64 seq_loss = dccp_rx_hist_detect_loss(&hcrx->ccid3hcrx_hist,
+						      &hcrx->ccid3hcrx_li_hist,
+						      &win_loss);
+
+	ccid3_hc_rx_update_li(sk, seq_loss, win_loss);
+}
+
+static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private;
+	const struct dccp_options_received *opt_recv;
+	struct dccp_rx_hist_entry *packet;
+	struct timeval now;
+	u8 win_count;
+	u32 p_prev;
+	int ins;
+
+	if (hcrx == NULL)
+		return;
+
+	BUG_ON(!(hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA ||
+		 hcrx->ccid3hcrx_state == TFRC_RSTATE_DATA));
+
+	opt_recv = &dp->dccps_options_received;
+
+	switch (DCCP_SKB_CB(skb)->dccpd_type) {
+	case DCCP_PKT_ACK:
+		if (hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA)
+			return;
+	case DCCP_PKT_DATAACK:
+		if (opt_recv->dccpor_timestamp_echo == 0)
+			break;
+		p_prev = hcrx->ccid3hcrx_rtt;
+		do_gettimeofday(&now);
+		hcrx->ccid3hcrx_rtt = timeval_usecs(&now) -
+				     (opt_recv->dccpor_timestamp_echo -
+				      opt_recv->dccpor_elapsed_time) * 10;
+		if (p_prev != hcrx->ccid3hcrx_rtt)
+			ccid3_pr_debug("%s, New RTT=%luus, elapsed time=%u\n",
+				       dccp_role(sk), hcrx->ccid3hcrx_rtt,
+				       opt_recv->dccpor_elapsed_time);
+		break;
+	case DCCP_PKT_DATA:
+		break;
+	default:
+		ccid3_pr_debug("%s, sk=%p, not DATA/DATAACK/ACK packet(%s)\n",
+			       dccp_role(sk), sk,
+			       dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type));
+		return;
+	}
+
+	packet = dccp_rx_hist_entry_new(ccid3_rx_hist, opt_recv->dccpor_ndp,
+					skb, SLAB_ATOMIC);
+	if (packet == NULL) {
+		ccid3_pr_debug("%s, sk=%p, Not enough mem to add rx packet "
+			       "to history (consider it lost)!",
+			       dccp_role(sk), sk);
+		return;
+	}
+
+	win_count = packet->dccphrx_ccval;
+
+	ins = dccp_rx_hist_add_packet(ccid3_rx_hist, &hcrx->ccid3hcrx_hist,
+				      &hcrx->ccid3hcrx_li_hist, packet);
+
+	if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK)
+		return;
+
+	switch (hcrx->ccid3hcrx_state) {
+	case TFRC_RSTATE_NO_DATA:
+		ccid3_pr_debug("%s, sk=%p(%s), skb=%p, sending initial "
+			       "feedback\n",
+			       dccp_role(sk), sk,
+			       dccp_state_name(sk->sk_state), skb);
+		ccid3_hc_rx_send_feedback(sk);
+		ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA);
+		return;
+	case TFRC_RSTATE_DATA:
+		hcrx->ccid3hcrx_bytes_recv += skb->len -
+					      dccp_hdr(skb)->dccph_doff * 4;
+		if (ins != 0)
+			break;
+
+		do_gettimeofday(&now);
+		if (timeval_delta(&now, &hcrx->ccid3hcrx_tstamp_last_ack) >=
+		    hcrx->ccid3hcrx_rtt) {
+			hcrx->ccid3hcrx_tstamp_last_ack = now;
+			ccid3_hc_rx_send_feedback(sk);
+		}
+		return;
+	default:
+		printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n",
+		       __FUNCTION__, dccp_role(sk), sk, hcrx->ccid3hcrx_state);
+		dump_stack();
+		return;
+	}
+
+	/* Dealing with packet loss */
+	ccid3_pr_debug("%s, sk=%p(%s), data loss! Reacting...\n",
+		       dccp_role(sk), sk, dccp_state_name(sk->sk_state));
+
+	ccid3_hc_rx_detect_loss(sk);
+	p_prev = hcrx->ccid3hcrx_p;
+	
+	/* Calculate loss event rate */
+	if (!list_empty(&hcrx->ccid3hcrx_li_hist))
+		/* Scaling up by 1000000 as fixed decimal */
+		hcrx->ccid3hcrx_p = 1000000 / dccp_li_hist_calc_i_mean(&hcrx->ccid3hcrx_li_hist);
+
+	if (hcrx->ccid3hcrx_p > p_prev) {
+		ccid3_hc_rx_send_feedback(sk);
+		return;
+	}
+}
+
+static int ccid3_hc_rx_init(struct sock *sk)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct ccid3_hc_rx_sock *hcrx;
+
+	ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk);
+
+	hcrx = dp->dccps_hc_rx_ccid_private = kmalloc(sizeof(*hcrx),
+						      gfp_any());
+	if (hcrx == NULL)
+		return -ENOMEM;
+
+	memset(hcrx, 0, sizeof(*hcrx));
+
+	if (dp->dccps_packet_size >= TFRC_MIN_PACKET_SIZE &&
+	    dp->dccps_packet_size <= TFRC_MAX_PACKET_SIZE)
+		hcrx->ccid3hcrx_s = dp->dccps_packet_size;
+	else
+		hcrx->ccid3hcrx_s = TFRC_STD_PACKET_SIZE;
+
+	hcrx->ccid3hcrx_state = TFRC_RSTATE_NO_DATA;
+	INIT_LIST_HEAD(&hcrx->ccid3hcrx_hist);
+	INIT_LIST_HEAD(&hcrx->ccid3hcrx_li_hist);
+	/*
+	 * XXX this seems to be paranoid, need to think more about this, for
+	 * now start with something different than zero. -acme
+	 */
+	hcrx->ccid3hcrx_rtt = USEC_PER_SEC / 5;
+	return 0;
+}
+
+static void ccid3_hc_rx_exit(struct sock *sk)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private;
+
+	ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk);
+
+	if (hcrx == NULL)
+		return;
+
+	ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM);
+
+	/* Empty packet history */
+	dccp_rx_hist_purge(ccid3_rx_hist, &hcrx->ccid3hcrx_hist);
+
+	/* Empty loss interval history */
+	dccp_li_hist_purge(ccid3_li_hist, &hcrx->ccid3hcrx_li_hist);
+
+	kfree(dp->dccps_hc_rx_ccid_private);
+	dp->dccps_hc_rx_ccid_private = NULL;
+}
+
+static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info)
+{
+	const struct dccp_sock *dp = dccp_sk(sk);
+	const struct ccid3_hc_rx_sock *hcrx = dp->dccps_hc_rx_ccid_private;
+
+	if (hcrx == NULL)
+		return;
+
+	info->tcpi_ca_state	= hcrx->ccid3hcrx_state;
+	info->tcpi_options	|= TCPI_OPT_TIMESTAMPS;
+	info->tcpi_rcv_rtt	= hcrx->ccid3hcrx_rtt;
+}
+
+static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info)
+{
+	const struct dccp_sock *dp = dccp_sk(sk);
+	const struct ccid3_hc_tx_sock *hctx = dp->dccps_hc_tx_ccid_private;
+
+	if (hctx == NULL)
+		return;
+
+	info->tcpi_rto = hctx->ccid3hctx_t_rto;
+	info->tcpi_rtt = hctx->ccid3hctx_rtt;
+}
+
+static struct ccid ccid3 = {
+	.ccid_id		   = 3,
+	.ccid_name		   = "ccid3",
+	.ccid_owner		   = THIS_MODULE,
+	.ccid_init		   = ccid3_init,
+	.ccid_exit		   = ccid3_exit,
+	.ccid_hc_tx_init	   = ccid3_hc_tx_init,
+	.ccid_hc_tx_exit	   = ccid3_hc_tx_exit,
+	.ccid_hc_tx_send_packet	   = ccid3_hc_tx_send_packet,
+	.ccid_hc_tx_packet_sent	   = ccid3_hc_tx_packet_sent,
+	.ccid_hc_tx_packet_recv	   = ccid3_hc_tx_packet_recv,
+	.ccid_hc_tx_insert_options = ccid3_hc_tx_insert_options,
+	.ccid_hc_tx_parse_options  = ccid3_hc_tx_parse_options,
+	.ccid_hc_rx_init	   = ccid3_hc_rx_init,
+	.ccid_hc_rx_exit	   = ccid3_hc_rx_exit,
+	.ccid_hc_rx_insert_options = ccid3_hc_rx_insert_options,
+	.ccid_hc_rx_packet_recv	   = ccid3_hc_rx_packet_recv,
+	.ccid_hc_rx_get_info	   = ccid3_hc_rx_get_info,
+	.ccid_hc_tx_get_info	   = ccid3_hc_tx_get_info,
+};
+ 
+module_param(ccid3_debug, int, 0444);
+MODULE_PARM_DESC(ccid3_debug, "Enable debug messages");
+
+static __init int ccid3_module_init(void)
+{
+	int rc = -ENOBUFS;
+
+	ccid3_rx_hist = dccp_rx_hist_new("ccid3");
+	if (ccid3_rx_hist == NULL)
+		goto out;
+
+	ccid3_tx_hist = dccp_tx_hist_new("ccid3");
+	if (ccid3_tx_hist == NULL)
+		goto out_free_rx;
+
+	ccid3_li_hist = dccp_li_hist_new("ccid3");
+	if (ccid3_li_hist == NULL)
+		goto out_free_tx;
+
+	rc = ccid_register(&ccid3);
+	if (rc != 0) 
+		goto out_free_loss_interval_history;
+out:
+	return rc;
+
+out_free_loss_interval_history:
+	dccp_li_hist_delete(ccid3_li_hist);
+	ccid3_li_hist = NULL;
+out_free_tx:
+	dccp_tx_hist_delete(ccid3_tx_hist);
+	ccid3_tx_hist = NULL;
+out_free_rx:
+	dccp_rx_hist_delete(ccid3_rx_hist);
+	ccid3_rx_hist = NULL;
+	goto out;
+}
+module_init(ccid3_module_init);
+
+static __exit void ccid3_module_exit(void)
+{
+#ifdef CONFIG_IP_DCCP_UNLOAD_HACK
+	/*
+	 * Hack to use while developing, so that we get rid of the control
+	 * sock, that is what keeps a refcount on dccp.ko -acme
+	 */
+	extern void dccp_ctl_sock_exit(void);
+
+	dccp_ctl_sock_exit();
+#endif
+	ccid_unregister(&ccid3);
+
+	if (ccid3_tx_hist != NULL) {
+		dccp_tx_hist_delete(ccid3_tx_hist);
+		ccid3_tx_hist = NULL;
+	}
+	if (ccid3_rx_hist != NULL) {
+		dccp_rx_hist_delete(ccid3_rx_hist);
+		ccid3_rx_hist = NULL;
+	}
+	if (ccid3_li_hist != NULL) {
+		dccp_li_hist_delete(ccid3_li_hist);
+		ccid3_li_hist = NULL;
+	}
+}
+module_exit(ccid3_module_exit);
+
+MODULE_AUTHOR("Ian McDonald <iam4@cs.waikato.ac.nz>, "
+	      "Arnaldo Carvalho de Melo <acme@ghostprotocols.net>");
+MODULE_DESCRIPTION("DCCP TFRC CCID3 CCID");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("net-dccp-ccid-3");
diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h
new file mode 100644
index 00000000000..ee8cbace663
--- /dev/null
+++ b/net/dccp/ccids/ccid3.h
@@ -0,0 +1,137 @@
+/*
+ *  net/dccp/ccids/ccid3.h
+ *
+ *  Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
+ *
+ *  An implementation of the DCCP protocol
+ *
+ *  This code has been developed by the University of Waikato WAND
+ *  research group. For further information please see http://www.wand.net.nz/
+ *  or e-mail Ian McDonald - iam4@cs.waikato.ac.nz
+ *
+ *  This code also uses code from Lulea University, rereleased as GPL by its
+ *  authors:
+ *  Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
+ *
+ *  Changes to meet Linux coding standards, to make it meet latest ccid3 draft
+ *  and to make it work as a loadable module in the DCCP stack written by
+ *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
+ *
+ *  Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#ifndef _DCCP_CCID3_H_
+#define _DCCP_CCID3_H_
+
+#include <linux/config.h>
+#include <linux/list.h>
+#include <linux/time.h>
+#include <linux/types.h>
+
+#define TFRC_MIN_PACKET_SIZE	   16
+#define TFRC_STD_PACKET_SIZE	  256
+#define TFRC_MAX_PACKET_SIZE	65535
+
+/* Two seconds as per CCID3 spec */
+#define TFRC_INITIAL_TIMEOUT	   (2 * USEC_PER_SEC)
+
+/* In usecs - half the scheduling granularity as per RFC3448 4.6 */
+#define TFRC_OPSYS_HALF_TIME_GRAN  (USEC_PER_SEC / (2 * HZ))
+
+/* In seconds */
+#define TFRC_MAX_BACK_OFF_TIME	   64
+
+#define TFRC_SMALLEST_P		   40
+
+enum ccid3_options {
+	TFRC_OPT_LOSS_EVENT_RATE = 192,
+	TFRC_OPT_LOSS_INTERVALS	 = 193,
+	TFRC_OPT_RECEIVE_RATE	 = 194,
+};
+
+struct ccid3_options_received {
+	u64 ccid3or_seqno:48,
+	    ccid3or_loss_intervals_idx:16;
+	u16 ccid3or_loss_intervals_len;
+	u32 ccid3or_loss_event_rate;
+	u32 ccid3or_receive_rate;
+};
+
+/** struct ccid3_hc_tx_sock - CCID3 sender half connection sock
+ *
+  * @ccid3hctx_state - Sender state
+  * @ccid3hctx_x - Current sending rate
+  * @ccid3hctx_x_recv - Receive rate
+  * @ccid3hctx_x_calc - Calculated send (?) rate
+  * @ccid3hctx_s - Packet size
+  * @ccid3hctx_rtt - Estimate of current round trip time in usecs
+  * @@ccid3hctx_p - Current loss event rate (0-1) scaled by 1000000
+  * @ccid3hctx_last_win_count - Last window counter sent
+  * @ccid3hctx_t_last_win_count - Timestamp of earliest packet
+  * 				  with last_win_count value sent
+  * @ccid3hctx_no_feedback_timer - Handle to no feedback timer
+  * @ccid3hctx_idle - FIXME
+  * @ccid3hctx_t_ld - Time last doubled during slow start
+  * @ccid3hctx_t_nom - Nominal send time of next packet
+  * @ccid3hctx_t_ipi - Interpacket (send) interval
+  * @ccid3hctx_delta - Send timer delta
+  * @ccid3hctx_hist - Packet history
+  */
+struct ccid3_hc_tx_sock {
+	u32				ccid3hctx_x;
+	u32				ccid3hctx_x_recv;
+	u32				ccid3hctx_x_calc;
+	u16				ccid3hctx_s;
+	u32				ccid3hctx_rtt;
+	u32				ccid3hctx_p;
+  	u8				ccid3hctx_state;
+	u8				ccid3hctx_last_win_count;
+	u8				ccid3hctx_idle;
+	struct timeval			ccid3hctx_t_last_win_count;
+	struct timer_list		ccid3hctx_no_feedback_timer;
+	struct timeval			ccid3hctx_t_ld;
+	struct timeval			ccid3hctx_t_nom;
+	u32				ccid3hctx_t_rto;
+	u32				ccid3hctx_t_ipi;
+	u32				ccid3hctx_delta;
+	struct list_head		ccid3hctx_hist;
+	struct ccid3_options_received	ccid3hctx_options_received;
+};
+
+struct ccid3_hc_rx_sock {
+  	u64			ccid3hcrx_seqno_last_counter:48,
+				ccid3hcrx_state:8,
+				ccid3hcrx_last_counter:4;
+	unsigned long		ccid3hcrx_rtt;
+  	u32			ccid3hcrx_p;
+  	u32			ccid3hcrx_bytes_recv;
+  	struct timeval		ccid3hcrx_tstamp_last_feedback;
+  	struct timeval		ccid3hcrx_tstamp_last_ack;
+	struct list_head	ccid3hcrx_hist;
+	struct list_head	ccid3hcrx_li_hist;
+  	u16			ccid3hcrx_s;
+  	u32			ccid3hcrx_pinv;
+  	u32			ccid3hcrx_elapsed_time;
+  	u32			ccid3hcrx_x_recv;
+};
+
+#define ccid3_hc_tx_field(s,field) (s->dccps_hc_tx_ccid_private == NULL ? 0 : \
+    ((struct ccid3_hc_tx_sock *)s->dccps_hc_tx_ccid_private)->ccid3hctx_##field)
+
+#define ccid3_hc_rx_field(s,field) (s->dccps_hc_rx_ccid_private == NULL ? 0 : \
+    ((struct ccid3_hc_rx_sock *)s->dccps_hc_rx_ccid_private)->ccid3hcrx_##field)
+
+#endif /* _DCCP_CCID3_H_ */
diff --git a/net/dccp/ccids/lib/Makefile b/net/dccp/ccids/lib/Makefile
new file mode 100644
index 00000000000..5f940a6cbac
--- /dev/null
+++ b/net/dccp/ccids/lib/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_IP_DCCP_TFRC_LIB) += dccp_tfrc_lib.o
+
+dccp_tfrc_lib-y := loss_interval.o packet_history.o tfrc_equation.o
diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c
new file mode 100644
index 00000000000..4c01a54143a
--- /dev/null
+++ b/net/dccp/ccids/lib/loss_interval.c
@@ -0,0 +1,144 @@
+/*
+ *  net/dccp/ccids/lib/loss_interval.c
+ *
+ *  Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
+ *  Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz>
+ *  Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+
+#include "loss_interval.h"
+
+struct dccp_li_hist *dccp_li_hist_new(const char *name)
+{
+	struct dccp_li_hist *hist = kmalloc(sizeof(*hist), GFP_ATOMIC);
+	static const char dccp_li_hist_mask[] = "li_hist_%s";
+	char *slab_name;
+
+	if (hist == NULL)
+		goto out;
+
+	slab_name = kmalloc(strlen(name) + sizeof(dccp_li_hist_mask) - 1,
+			    GFP_ATOMIC);
+	if (slab_name == NULL)
+		goto out_free_hist;
+
+	sprintf(slab_name, dccp_li_hist_mask, name);
+	hist->dccplih_slab = kmem_cache_create(slab_name,
+					     sizeof(struct dccp_li_hist_entry),
+					       0, SLAB_HWCACHE_ALIGN,
+					       NULL, NULL);
+	if (hist->dccplih_slab == NULL)
+		goto out_free_slab_name;
+out:
+	return hist;
+out_free_slab_name:
+	kfree(slab_name);
+out_free_hist:
+	kfree(hist);
+	hist = NULL;
+	goto out;
+}
+
+EXPORT_SYMBOL_GPL(dccp_li_hist_new);
+
+void dccp_li_hist_delete(struct dccp_li_hist *hist)
+{
+	const char* name = kmem_cache_name(hist->dccplih_slab);
+
+	kmem_cache_destroy(hist->dccplih_slab);
+	kfree(name);
+	kfree(hist);
+}
+
+EXPORT_SYMBOL_GPL(dccp_li_hist_delete);
+
+void dccp_li_hist_purge(struct dccp_li_hist *hist, struct list_head *list)
+{
+	struct dccp_li_hist_entry *entry, *next;
+
+	list_for_each_entry_safe(entry, next, list, dccplih_node) {
+		list_del_init(&entry->dccplih_node);
+		kmem_cache_free(hist->dccplih_slab, entry);
+	}
+}
+
+EXPORT_SYMBOL_GPL(dccp_li_hist_purge);
+
+/* Weights used to calculate loss event rate */
+/*
+ * These are integers as per section 8 of RFC3448. We can then divide by 4 *
+ * when we use it.
+ */
+static const int dccp_li_hist_w[DCCP_LI_HIST_IVAL_F_LENGTH] = {
+	4, 4, 4, 4, 3, 2, 1, 1,
+};
+
+u32 dccp_li_hist_calc_i_mean(struct list_head *list)
+{
+	struct dccp_li_hist_entry *li_entry, *li_next;
+	int i = 0;
+	u32 i_tot;
+	u32 i_tot0 = 0;
+	u32 i_tot1 = 0;
+	u32 w_tot  = 0;
+
+	list_for_each_entry_safe(li_entry, li_next, list, dccplih_node) {
+		if (i < DCCP_LI_HIST_IVAL_F_LENGTH) {
+			i_tot0 += li_entry->dccplih_interval * dccp_li_hist_w[i];
+			w_tot  += dccp_li_hist_w[i];
+		}
+
+		if (i != 0)
+			i_tot1 += li_entry->dccplih_interval * dccp_li_hist_w[i - 1];
+
+		if (++i > DCCP_LI_HIST_IVAL_F_LENGTH)
+			break;
+	}
+
+	if (i != DCCP_LI_HIST_IVAL_F_LENGTH)
+		return 0;
+
+	i_tot = max(i_tot0, i_tot1);
+
+	/* FIXME: Why do we do this? -Ian McDonald */
+	if (i_tot * 4 < w_tot)
+		i_tot = w_tot * 4;
+
+	return i_tot * 4 / w_tot;
+}
+
+EXPORT_SYMBOL_GPL(dccp_li_hist_calc_i_mean);
+
+struct dccp_li_hist_entry *dccp_li_hist_interval_new(struct dccp_li_hist *hist,
+						     struct list_head *list,
+						     const u64 seq_loss,
+						     const u8 win_loss)
+{
+	struct dccp_li_hist_entry *tail = NULL, *entry;
+	int i;
+
+	for (i = 0; i <= DCCP_LI_HIST_IVAL_F_LENGTH; ++i) {
+		entry = dccp_li_hist_entry_new(hist, SLAB_ATOMIC);
+		if (entry == NULL) {
+			dccp_li_hist_purge(hist, list);
+			return NULL;
+		}
+		if (tail == NULL)
+			tail = entry;
+		list_add(&entry->dccplih_node, list);
+	}
+
+	entry->dccplih_seqno     = seq_loss;
+	entry->dccplih_win_count = win_loss;
+	return tail;
+}
+
+EXPORT_SYMBOL_GPL(dccp_li_hist_interval_new);
diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h
new file mode 100644
index 00000000000..13ad47ba142
--- /dev/null
+++ b/net/dccp/ccids/lib/loss_interval.h
@@ -0,0 +1,61 @@
+#ifndef _DCCP_LI_HIST_
+#define _DCCP_LI_HIST_
+/*
+ *  net/dccp/ccids/lib/loss_interval.h
+ *
+ *  Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
+ *  Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz>
+ *  Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ *  This program is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License as published by the Free
+ *  Software Foundation; either version 2 of the License, or (at your option)
+ *  any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+
+#define DCCP_LI_HIST_IVAL_F_LENGTH  8
+
+struct dccp_li_hist {
+	kmem_cache_t *dccplih_slab;
+};
+
+extern struct dccp_li_hist *dccp_li_hist_new(const char *name);
+extern void dccp_li_hist_delete(struct dccp_li_hist *hist);
+
+struct dccp_li_hist_entry {
+	struct list_head dccplih_node;
+	u64		 dccplih_seqno:48,
+			 dccplih_win_count:4;
+	u32		 dccplih_interval;
+};
+
+static inline struct dccp_li_hist_entry *
+		dccp_li_hist_entry_new(struct dccp_li_hist *hist,
+				       const unsigned int __nocast prio)
+{
+	return kmem_cache_alloc(hist->dccplih_slab, prio);
+}
+
+static inline void dccp_li_hist_entry_delete(struct dccp_li_hist *hist,
+					     struct dccp_li_hist_entry *entry)
+{
+	if (entry != NULL)
+		kmem_cache_free(hist->dccplih_slab, entry);
+}
+
+extern void dccp_li_hist_purge(struct dccp_li_hist *hist,
+			       struct list_head *list);
+
+extern u32 dccp_li_hist_calc_i_mean(struct list_head *list);
+
+extern struct dccp_li_hist_entry *
+			dccp_li_hist_interval_new(struct dccp_li_hist *hist,
+						  struct list_head *list,
+						  const u64 seq_loss,
+						  const u8 win_loss);
+#endif /* _DCCP_LI_HIST_ */
diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c
new file mode 100644
index 00000000000..d3f9d205383
--- /dev/null
+++ b/net/dccp/ccids/lib/packet_history.c
@@ -0,0 +1,398 @@
+/*
+ *  net/dccp/packet_history.h
+ *
+ *  Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
+ *
+ *  An implementation of the DCCP protocol
+ *
+ *  This code has been developed by the University of Waikato WAND
+ *  research group. For further information please see http://www.wand.net.nz/
+ *  or e-mail Ian McDonald - iam4@cs.waikato.ac.nz
+ *
+ *  This code also uses code from Lulea University, rereleased as GPL by its
+ *  authors:
+ *  Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
+ *
+ *  Changes to meet Linux coding standards, to make it meet latest ccid3 draft
+ *  and to make it work as a loadable module in the DCCP stack written by
+ *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
+ *
+ *  Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#include "packet_history.h"
+
+struct dccp_rx_hist *dccp_rx_hist_new(const char *name)
+{
+	struct dccp_rx_hist *hist = kmalloc(sizeof(*hist), GFP_ATOMIC);
+	static const char dccp_rx_hist_mask[] = "rx_hist_%s";
+	char *slab_name;
+
+	if (hist == NULL)
+		goto out;
+
+	slab_name = kmalloc(strlen(name) + sizeof(dccp_rx_hist_mask) - 1,
+			    GFP_ATOMIC);
+	if (slab_name == NULL)
+		goto out_free_hist;
+
+	sprintf(slab_name, dccp_rx_hist_mask, name);
+	hist->dccprxh_slab = kmem_cache_create(slab_name,
+					     sizeof(struct dccp_rx_hist_entry),
+					       0, SLAB_HWCACHE_ALIGN,
+					       NULL, NULL);
+	if (hist->dccprxh_slab == NULL)
+		goto out_free_slab_name;
+out:
+	return hist;
+out_free_slab_name:
+	kfree(slab_name);
+out_free_hist:
+	kfree(hist);
+	hist = NULL;
+	goto out;
+}
+
+EXPORT_SYMBOL_GPL(dccp_rx_hist_new);
+
+void dccp_rx_hist_delete(struct dccp_rx_hist *hist)
+{
+	const char* name = kmem_cache_name(hist->dccprxh_slab);
+
+	kmem_cache_destroy(hist->dccprxh_slab);
+	kfree(name);
+	kfree(hist);
+}
+
+EXPORT_SYMBOL_GPL(dccp_rx_hist_delete);
+
+void dccp_rx_hist_purge(struct dccp_rx_hist *hist, struct list_head *list)
+{
+	struct dccp_rx_hist_entry *entry, *next;
+
+	list_for_each_entry_safe(entry, next, list, dccphrx_node) {
+		list_del_init(&entry->dccphrx_node);
+		kmem_cache_free(hist->dccprxh_slab, entry);
+	}
+}
+
+EXPORT_SYMBOL_GPL(dccp_rx_hist_purge);
+
+struct dccp_rx_hist_entry *
+		dccp_rx_hist_find_data_packet(const struct list_head *list)
+{
+	struct dccp_rx_hist_entry *entry, *packet = NULL;
+
+	list_for_each_entry(entry, list, dccphrx_node)
+		if (entry->dccphrx_type == DCCP_PKT_DATA ||
+		    entry->dccphrx_type == DCCP_PKT_DATAACK) {
+			packet = entry;
+			break;
+		}
+
+	return packet;
+}
+
+EXPORT_SYMBOL_GPL(dccp_rx_hist_find_data_packet);
+
+int dccp_rx_hist_add_packet(struct dccp_rx_hist *hist,
+			    struct list_head *rx_list,
+			    struct list_head *li_list,
+			    struct dccp_rx_hist_entry *packet)
+{
+	struct dccp_rx_hist_entry *entry, *next, *iter;
+	u8 num_later = 0;
+
+	iter = dccp_rx_hist_head(rx_list);
+	if (iter == NULL)
+		dccp_rx_hist_add_entry(rx_list, packet);
+	else {
+		const u64 seqno = packet->dccphrx_seqno;
+
+		if (after48(seqno, iter->dccphrx_seqno))
+			dccp_rx_hist_add_entry(rx_list, packet);
+		else {
+			if (dccp_rx_hist_entry_data_packet(iter))
+				num_later = 1;
+
+			list_for_each_entry_continue(iter, rx_list,
+						     dccphrx_node) {
+				if (after48(seqno, iter->dccphrx_seqno)) {
+					dccp_rx_hist_add_entry(&iter->dccphrx_node,
+							       packet);
+					goto trim_history;
+				}
+
+				if (dccp_rx_hist_entry_data_packet(iter))
+					num_later++;
+
+				if (num_later == TFRC_RECV_NUM_LATE_LOSS) {
+					dccp_rx_hist_entry_delete(hist, packet);
+					return 1;
+				}
+			}
+
+			if (num_later < TFRC_RECV_NUM_LATE_LOSS)
+				dccp_rx_hist_add_entry(rx_list, packet);
+			/*
+			 * FIXME: else what? should we destroy the packet
+			 * like above?
+			 */
+		}
+	}
+
+trim_history:
+	/*
+	 * Trim history (remove all packets after the NUM_LATE_LOSS + 1
+	 * data packets)
+	 */
+	num_later = TFRC_RECV_NUM_LATE_LOSS + 1;
+
+	if (!list_empty(li_list)) {
+		list_for_each_entry_safe(entry, next, rx_list, dccphrx_node) {
+			if (num_later == 0) {
+				list_del_init(&entry->dccphrx_node);
+				dccp_rx_hist_entry_delete(hist, entry);
+			} else if (dccp_rx_hist_entry_data_packet(entry))
+				--num_later;
+		}
+	} else {
+		int step = 0;
+		u8 win_count = 0; /* Not needed, but lets shut up gcc */
+		int tmp;
+		/*
+		 * We have no loss interval history so we need at least one
+		 * rtt:s of data packets to approximate rtt.
+		 */
+		list_for_each_entry_safe(entry, next, rx_list, dccphrx_node) {
+			if (num_later == 0) {
+				switch (step) {
+				case 0:
+					step = 1;
+					/* OK, find next data packet */
+					num_later = 1;
+					break;
+				case 1:
+					step = 2;
+					/* OK, find next data packet */
+					num_later = 1;
+					win_count = entry->dccphrx_ccval;
+					break;
+				case 2:
+					tmp = win_count - entry->dccphrx_ccval;
+					if (tmp < 0)
+						tmp += TFRC_WIN_COUNT_LIMIT;
+					if (tmp > TFRC_WIN_COUNT_PER_RTT + 1) {
+						/*
+						 * We have found a packet older
+						 * than one rtt remove the rest
+						 */
+						step = 3;
+					} else /* OK, find next data packet */
+						num_later = 1;
+					break;
+				case 3:
+					list_del_init(&entry->dccphrx_node);
+					dccp_rx_hist_entry_delete(hist, entry);
+					break;
+				}
+			} else if (dccp_rx_hist_entry_data_packet(entry))
+				--num_later;
+		}
+	}
+
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(dccp_rx_hist_add_packet);
+
+u64 dccp_rx_hist_detect_loss(struct list_head *rx_list,
+			     struct list_head *li_list, u8 *win_loss)
+{
+	struct dccp_rx_hist_entry *entry, *next, *packet;
+	struct dccp_rx_hist_entry *a_loss = NULL;
+	struct dccp_rx_hist_entry *b_loss = NULL;
+	u64 seq_loss = DCCP_MAX_SEQNO + 1;
+	u8 num_later = TFRC_RECV_NUM_LATE_LOSS;
+
+	list_for_each_entry_safe(entry, next, rx_list, dccphrx_node) {
+		if (num_later == 0) {
+			b_loss = entry;
+			break;
+		} else if (dccp_rx_hist_entry_data_packet(entry))
+			--num_later;
+	}
+
+	if (b_loss == NULL)
+		goto out;
+
+	num_later = 1;
+	list_for_each_entry_safe_continue(entry, next, rx_list, dccphrx_node) {
+		if (num_later == 0) {
+			a_loss = entry;
+			break;
+		} else if (dccp_rx_hist_entry_data_packet(entry))
+			--num_later;
+	}
+
+	if (a_loss == NULL) {
+		if (list_empty(li_list)) {
+			/* no loss event have occured yet */
+			LIMIT_NETDEBUG("%s: TODO: find a lost data packet by "
+				       "comparing to initial seqno\n",
+				       __FUNCTION__);
+			goto out;
+		} else {
+			LIMIT_NETDEBUG("%s: Less than 4 data pkts in history!",
+				       __FUNCTION__);
+			goto out;
+		}
+	}
+
+	/* Locate a lost data packet */
+	entry = packet = b_loss;
+	list_for_each_entry_safe_continue(entry, next, rx_list, dccphrx_node) {
+		u64 delta = dccp_delta_seqno(entry->dccphrx_seqno,
+					     packet->dccphrx_seqno);
+
+		if (delta != 0) {
+			if (dccp_rx_hist_entry_data_packet(packet))
+				--delta;
+			/*
+			 * FIXME: check this, probably this % usage is because
+			 * in earlier drafts the ndp count was just 8 bits
+			 * long, but now it cam be up to 24 bits long.
+			 */
+#if 0
+			if (delta % DCCP_NDP_LIMIT !=
+			    (packet->dccphrx_ndp -
+			     entry->dccphrx_ndp) % DCCP_NDP_LIMIT)
+#endif
+			if (delta != packet->dccphrx_ndp - entry->dccphrx_ndp) {
+				seq_loss = entry->dccphrx_seqno;
+				dccp_inc_seqno(&seq_loss);
+			}
+		}
+		packet = entry;
+		if (packet == a_loss)
+			break;
+	}
+out:
+	if (seq_loss != DCCP_MAX_SEQNO + 1)
+		*win_loss = a_loss->dccphrx_ccval;
+	else
+		*win_loss = 0; /* Paranoia */
+
+	return seq_loss;
+}
+
+EXPORT_SYMBOL_GPL(dccp_rx_hist_detect_loss);
+
+struct dccp_tx_hist *dccp_tx_hist_new(const char *name)
+{
+	struct dccp_tx_hist *hist = kmalloc(sizeof(*hist), GFP_ATOMIC);
+	static const char dccp_tx_hist_mask[] = "tx_hist_%s";
+	char *slab_name;
+
+	if (hist == NULL)
+		goto out;
+
+	slab_name = kmalloc(strlen(name) + sizeof(dccp_tx_hist_mask) - 1,
+			    GFP_ATOMIC);
+	if (slab_name == NULL)
+		goto out_free_hist;
+
+	sprintf(slab_name, dccp_tx_hist_mask, name);
+	hist->dccptxh_slab = kmem_cache_create(slab_name,
+					     sizeof(struct dccp_tx_hist_entry),
+					       0, SLAB_HWCACHE_ALIGN,
+					       NULL, NULL);
+	if (hist->dccptxh_slab == NULL)
+		goto out_free_slab_name;
+out:
+	return hist;
+out_free_slab_name:
+	kfree(slab_name);
+out_free_hist:
+	kfree(hist);
+	hist = NULL;
+	goto out;
+}
+
+EXPORT_SYMBOL_GPL(dccp_tx_hist_new);
+
+void dccp_tx_hist_delete(struct dccp_tx_hist *hist)
+{
+	const char* name = kmem_cache_name(hist->dccptxh_slab);
+
+	kmem_cache_destroy(hist->dccptxh_slab);
+	kfree(name);
+	kfree(hist);
+}
+
+EXPORT_SYMBOL_GPL(dccp_tx_hist_delete);
+
+struct dccp_tx_hist_entry *
+	dccp_tx_hist_find_entry(const struct list_head *list, const u64 seq)
+{
+	struct dccp_tx_hist_entry *packet = NULL, *entry;
+
+	list_for_each_entry(entry, list, dccphtx_node)
+		if (entry->dccphtx_seqno == seq) {
+			packet = entry;
+			break;
+		}
+
+	return packet;
+}
+
+EXPORT_SYMBOL_GPL(dccp_tx_hist_find_entry);
+
+void dccp_tx_hist_purge_older(struct dccp_tx_hist *hist,
+			      struct list_head *list,
+			      struct dccp_tx_hist_entry *packet)
+{
+	struct dccp_tx_hist_entry *next;
+
+	list_for_each_entry_safe_continue(packet, next, list, dccphtx_node) {
+		list_del_init(&packet->dccphtx_node);
+		dccp_tx_hist_entry_delete(hist, packet);
+	}
+}
+
+EXPORT_SYMBOL_GPL(dccp_tx_hist_purge_older);
+
+void dccp_tx_hist_purge(struct dccp_tx_hist *hist, struct list_head *list)
+{
+	struct dccp_tx_hist_entry *entry, *next;
+
+	list_for_each_entry_safe(entry, next, list, dccphtx_node) {
+		list_del_init(&entry->dccphtx_node);
+		dccp_tx_hist_entry_delete(hist, entry);
+	}
+}
+
+EXPORT_SYMBOL_GPL(dccp_tx_hist_purge);
+
+MODULE_AUTHOR("Ian McDonald <iam4@cs.waikato.ac.nz>, "
+	      "Arnaldo Carvalho de Melo <acme@ghostprotocols.net>");
+MODULE_DESCRIPTION("DCCP TFRC library");
+MODULE_LICENSE("GPL");
diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h
new file mode 100644
index 00000000000..fb90a91aa93
--- /dev/null
+++ b/net/dccp/ccids/lib/packet_history.h
@@ -0,0 +1,199 @@
+/*
+ *  net/dccp/packet_history.h
+ *
+ *  Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
+ *
+ *  An implementation of the DCCP protocol
+ *
+ *  This code has been developed by the University of Waikato WAND
+ *  research group. For further information please see http://www.wand.net.nz/
+ *  or e-mail Ian McDonald - iam4@cs.waikato.ac.nz
+ *
+ *  This code also uses code from Lulea University, rereleased as GPL by its
+ *  authors:
+ *  Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
+ *
+ *  Changes to meet Linux coding standards, to make it meet latest ccid3 draft
+ *  and to make it work as a loadable module in the DCCP stack written by
+ *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
+ *
+ *  Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef _DCCP_PKT_HIST_
+#define _DCCP_PKT_HIST_
+
+#include <linux/config.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+
+#include "../../dccp.h"
+
+/* Number of later packets received before one is considered lost */
+#define TFRC_RECV_NUM_LATE_LOSS	 3
+
+#define TFRC_WIN_COUNT_PER_RTT	 4
+#define TFRC_WIN_COUNT_LIMIT	16
+
+struct dccp_tx_hist_entry {
+	struct list_head dccphtx_node;
+	u64		 dccphtx_seqno:48,
+			 dccphtx_ccval:4,
+			 dccphtx_sent:1;
+	u32		 dccphtx_rtt;
+	struct timeval	 dccphtx_tstamp;
+};
+
+struct dccp_rx_hist_entry {
+	struct list_head dccphrx_node;
+	u64		 dccphrx_seqno:48,
+			 dccphrx_ccval:4,
+			 dccphrx_type:4;
+	u32		 dccphrx_ndp; /* In fact it is from 8 to 24 bits */
+	struct timeval	 dccphrx_tstamp;
+};
+
+struct dccp_tx_hist {
+	kmem_cache_t *dccptxh_slab;
+};
+
+extern struct dccp_tx_hist *dccp_tx_hist_new(const char *name);
+extern void dccp_tx_hist_delete(struct dccp_tx_hist *hist);
+
+struct dccp_rx_hist {
+	kmem_cache_t *dccprxh_slab;
+};
+
+extern struct dccp_rx_hist *dccp_rx_hist_new(const char *name);
+extern void dccp_rx_hist_delete(struct dccp_rx_hist *hist);
+extern struct dccp_rx_hist_entry *
+		dccp_rx_hist_find_data_packet(const struct list_head *list);
+
+static inline struct dccp_tx_hist_entry *
+		dccp_tx_hist_entry_new(struct dccp_tx_hist *hist,
+				       const unsigned int __nocast prio)
+{
+	struct dccp_tx_hist_entry *entry = kmem_cache_alloc(hist->dccptxh_slab,
+							    prio);
+
+	if (entry != NULL)
+		entry->dccphtx_sent = 0;
+
+	return entry;
+}
+
+static inline void dccp_tx_hist_entry_delete(struct dccp_tx_hist *hist,
+					     struct dccp_tx_hist_entry *entry)
+{
+	if (entry != NULL)
+		kmem_cache_free(hist->dccptxh_slab, entry);
+}
+
+extern struct dccp_tx_hist_entry *
+			dccp_tx_hist_find_entry(const struct list_head *list,
+						const u64 seq);
+
+static inline void dccp_tx_hist_add_entry(struct list_head *list,
+					  struct dccp_tx_hist_entry *entry)
+{
+	list_add(&entry->dccphtx_node, list);
+}
+
+extern void dccp_tx_hist_purge_older(struct dccp_tx_hist *hist,
+				     struct list_head *list,
+				     struct dccp_tx_hist_entry *next);
+
+extern void dccp_tx_hist_purge(struct dccp_tx_hist *hist,
+			       struct list_head *list);
+
+static inline struct dccp_tx_hist_entry *
+		dccp_tx_hist_head(struct list_head *list)
+{
+	struct dccp_tx_hist_entry *head = NULL;
+
+	if (!list_empty(list))
+		head = list_entry(list->next, struct dccp_tx_hist_entry,
+				  dccphtx_node);
+	return head;
+}
+
+static inline struct dccp_rx_hist_entry *
+		     dccp_rx_hist_entry_new(struct dccp_rx_hist *hist,
+				     	    const u32 ndp, 
+					    const struct sk_buff *skb,
+					    const unsigned int __nocast prio)
+{
+	struct dccp_rx_hist_entry *entry = kmem_cache_alloc(hist->dccprxh_slab,
+							    prio);
+
+	if (entry != NULL) {
+		const struct dccp_hdr *dh = dccp_hdr(skb);
+
+		entry->dccphrx_seqno = DCCP_SKB_CB(skb)->dccpd_seq;
+		entry->dccphrx_ccval = dh->dccph_ccval;
+		entry->dccphrx_type  = dh->dccph_type;
+		entry->dccphrx_ndp   = ndp;
+		do_gettimeofday(&(entry->dccphrx_tstamp));
+	}
+
+	return entry;
+}
+
+static inline void dccp_rx_hist_entry_delete(struct dccp_rx_hist *hist,
+					     struct dccp_rx_hist_entry *entry)
+{
+	if (entry != NULL)
+		kmem_cache_free(hist->dccprxh_slab, entry);
+}
+
+extern void dccp_rx_hist_purge(struct dccp_rx_hist *hist,
+			       struct list_head *list);
+
+static inline void dccp_rx_hist_add_entry(struct list_head *list,
+					  struct dccp_rx_hist_entry *entry)
+{
+	list_add(&entry->dccphrx_node, list);
+}
+
+static inline struct dccp_rx_hist_entry *
+		dccp_rx_hist_head(struct list_head *list)
+{
+	struct dccp_rx_hist_entry *head = NULL;
+
+	if (!list_empty(list))
+		head = list_entry(list->next, struct dccp_rx_hist_entry,
+				  dccphrx_node);
+	return head;
+}
+
+static inline int
+	dccp_rx_hist_entry_data_packet(const struct dccp_rx_hist_entry *entry)
+{
+	return entry->dccphrx_type == DCCP_PKT_DATA ||
+	       entry->dccphrx_type == DCCP_PKT_DATAACK;
+}
+
+extern int dccp_rx_hist_add_packet(struct dccp_rx_hist *hist,
+				   struct list_head *rx_list,
+				   struct list_head *li_list,
+				   struct dccp_rx_hist_entry *packet);
+
+extern u64 dccp_rx_hist_detect_loss(struct list_head *rx_list,
+				    struct list_head *li_list, u8 *win_loss);
+
+#endif /* _DCCP_PKT_HIST_ */
diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h
new file mode 100644
index 00000000000..130c4c40cfe
--- /dev/null
+++ b/net/dccp/ccids/lib/tfrc.h
@@ -0,0 +1,22 @@
+#ifndef _TFRC_H_
+#define _TFRC_H_
+/*
+ *  net/dccp/ccids/lib/tfrc.h
+ *
+ *  Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
+ *  Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz>
+ *  Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *  Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ */
+
+#include <linux/types.h>
+
+extern u32 tfrc_calc_x(u16 s, u32 R, u32 p);
+extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue);
+
+#endif /* _TFRC_H_ */
diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c
new file mode 100644
index 00000000000..d2b5933b451
--- /dev/null
+++ b/net/dccp/ccids/lib/tfrc_equation.c
@@ -0,0 +1,644 @@
+/*
+ *  net/dccp/ccids/lib/tfrc_equation.c
+ *
+ *  Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
+ *  Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz>
+ *  Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *  Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+
+#include <asm/bug.h>
+#include <asm/div64.h>
+
+#include "tfrc.h"
+
+#define TFRC_CALC_X_ARRSIZE 500
+
+#define TFRC_CALC_X_SPLIT 50000
+/* equivalent to 0.05 */
+
+static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = {
+	{     37172,   8172 },
+	{     53499,  11567 },
+	{     66664,  14180 },
+	{     78298,  16388 },
+	{     89021,  18339 },
+	{     99147,  20108 },
+	{    108858,  21738 },
+	{    118273,  23260 },
+	{    127474,  24693 },
+	{    136520,  26052 },
+	{    145456,  27348 },
+	{    154316,  28589 },
+	{    163130,  29783 },
+	{    171919,  30935 },
+	{    180704,  32049 },
+	{    189502,  33130 },
+	{    198328,  34180 },
+	{    207194,  35202 },
+	{    216114,  36198 },
+	{    225097,  37172 },
+	{    234153,  38123 },
+	{    243294,  39055 },
+	{    252527,  39968 },
+	{    261861,  40864 },
+	{    271305,  41743 },
+	{    280866,  42607 },
+	{    290553,  43457 },
+	{    300372,  44293 },
+	{    310333,  45117 },
+	{    320441,  45929 },
+	{    330705,  46729 },
+	{    341131,  47518 },
+	{    351728,  48297 },
+	{    362501,  49066 },
+	{    373460,  49826 },
+	{    384609,  50577 },
+	{    395958,  51320 },
+	{    407513,  52054 },
+	{    419281,  52780 },
+	{    431270,  53499 },
+	{    443487,  54211 },
+	{    455940,  54916 },
+	{    468635,  55614 },
+	{    481581,  56306 },
+	{    494785,  56991 },
+	{    508254,  57671 },
+	{    521996,  58345 },
+	{    536019,  59014 },
+	{    550331,  59677 },
+	{    564939,  60335 },
+	{    579851,  60988 },
+	{    595075,  61636 },
+	{    610619,  62279 },
+	{    626491,  62918 },
+	{    642700,  63553 },
+	{    659253,  64183 },
+	{    676158,  64809 },
+	{    693424,  65431 },
+	{    711060,  66050 },
+	{    729073,  66664 },
+	{    747472,  67275 },
+	{    766266,  67882 },
+	{    785464,  68486 },
+	{    805073,  69087 },
+	{    825103,  69684 },
+	{    845562,  70278 },
+	{    866460,  70868 },
+	{    887805,  71456 },
+	{    909606,  72041 },
+	{    931873,  72623 },
+	{    954614,  73202 },
+	{    977839,  73778 },
+	{   1001557,  74352 },
+	{   1025777,  74923 },
+	{   1050508,  75492 },
+	{   1075761,  76058 },
+	{   1101544,  76621 },
+	{   1127867,  77183 },
+	{   1154739,  77741 },
+	{   1182172,  78298 },
+	{   1210173,  78852 },
+	{   1238753,  79405 },
+	{   1267922,  79955 },
+	{   1297689,  80503 },
+	{   1328066,  81049 },
+	{   1359060,  81593 },
+	{   1390684,  82135 },
+	{   1422947,  82675 },
+	{   1455859,  83213 },
+	{   1489430,  83750 },
+	{   1523671,  84284 },
+	{   1558593,  84817 },
+	{   1594205,  85348 },
+	{   1630518,  85878 },
+	{   1667543,  86406 },
+	{   1705290,  86932 },
+	{   1743770,  87457 },
+	{   1782994,  87980 },
+	{   1822973,  88501 },
+	{   1863717,  89021 },
+	{   1905237,  89540 },
+	{   1947545,  90057 },
+	{   1990650,  90573 },
+	{   2034566,  91087 },
+	{   2079301,  91600 },
+	{   2124869,  92111 },
+	{   2171279,  92622 },
+	{   2218543,  93131 },
+	{   2266673,  93639 },
+	{   2315680,  94145 },
+	{   2365575,  94650 },
+	{   2416371,  95154 },
+	{   2468077,  95657 },
+	{   2520707,  96159 },
+	{   2574271,  96660 },
+	{   2628782,  97159 },
+	{   2684250,  97658 },
+	{   2740689,  98155 },
+	{   2798110,  98651 },
+	{   2856524,  99147 },
+	{   2915944,  99641 },
+	{   2976382, 100134 },
+	{   3037850, 100626 },
+	{   3100360, 101117 },
+	{   3163924, 101608 },
+	{   3228554, 102097 },
+	{   3294263, 102586 },
+	{   3361063, 103073 },
+	{   3428966, 103560 },
+	{   3497984, 104045 },
+	{   3568131, 104530 },
+	{   3639419, 105014 },
+	{   3711860, 105498 },
+	{   3785467, 105980 },
+	{   3860253, 106462 },
+	{   3936229, 106942 },
+	{   4013410, 107422 },
+	{   4091808, 107902 },
+	{   4171435, 108380 },
+	{   4252306, 108858 },
+	{   4334431, 109335 },
+	{   4417825, 109811 },
+	{   4502501, 110287 },
+	{   4588472, 110762 },
+	{   4675750, 111236 },
+	{   4764349, 111709 },
+	{   4854283, 112182 },
+	{   4945564, 112654 },
+	{   5038206, 113126 },
+	{   5132223, 113597 },
+	{   5227627, 114067 },
+	{   5324432, 114537 },
+	{   5422652, 115006 },
+	{   5522299, 115474 },
+	{   5623389, 115942 },
+	{   5725934, 116409 },
+	{   5829948, 116876 },
+	{   5935446, 117342 },
+	{   6042439, 117808 },
+	{   6150943, 118273 },
+	{   6260972, 118738 },
+	{   6372538, 119202 },
+	{   6485657, 119665 },
+	{   6600342, 120128 },
+	{   6716607, 120591 },
+	{   6834467, 121053 },
+	{   6953935, 121514 },
+	{   7075025, 121976 },
+	{   7197752, 122436 },
+	{   7322131, 122896 },
+	{   7448175, 123356 },
+	{   7575898, 123815 },
+	{   7705316, 124274 },
+	{   7836442, 124733 },
+	{   7969291, 125191 },
+	{   8103877, 125648 },
+	{   8240216, 126105 },
+	{   8378321, 126562 },
+	{   8518208, 127018 },
+	{   8659890, 127474 },
+	{   8803384, 127930 },
+	{   8948702, 128385 },
+	{   9095861, 128840 },
+	{   9244875, 129294 },
+	{   9395760, 129748 },
+	{   9548529, 130202 },
+	{   9703198, 130655 },
+	{   9859782, 131108 },
+	{  10018296, 131561 },
+	{  10178755, 132014 },
+	{  10341174, 132466 },
+	{  10505569, 132917 },
+	{  10671954, 133369 },
+	{  10840345, 133820 },
+	{  11010757, 134271 },
+	{  11183206, 134721 },
+	{  11357706, 135171 },
+	{  11534274, 135621 },
+	{  11712924, 136071 },
+	{  11893673, 136520 },
+	{  12076536, 136969 },
+	{  12261527, 137418 },
+	{  12448664, 137867 },
+	{  12637961, 138315 },
+	{  12829435, 138763 },
+	{  13023101, 139211 },
+	{  13218974, 139658 },
+	{  13417071, 140106 },
+	{  13617407, 140553 },
+	{  13819999, 140999 },
+	{  14024862, 141446 },
+	{  14232012, 141892 },
+	{  14441465, 142339 },
+	{  14653238, 142785 },
+	{  14867346, 143230 },
+	{  15083805, 143676 },
+	{  15302632, 144121 },
+	{  15523842, 144566 },
+	{  15747453, 145011 },
+	{  15973479, 145456 },
+	{  16201939, 145900 },
+	{  16432847, 146345 },
+	{  16666221, 146789 },
+	{  16902076, 147233 },
+	{  17140429, 147677 },
+	{  17381297, 148121 },
+	{  17624696, 148564 },
+	{  17870643, 149007 },
+	{  18119154, 149451 },
+	{  18370247, 149894 },
+	{  18623936, 150336 },
+	{  18880241, 150779 },
+	{  19139176, 151222 },
+	{  19400759, 151664 },
+	{  19665007, 152107 },
+	{  19931936, 152549 },
+	{  20201564, 152991 },
+	{  20473907, 153433 },
+	{  20748982, 153875 },
+	{  21026807, 154316 },
+	{  21307399, 154758 },
+	{  21590773, 155199 },
+	{  21876949, 155641 },
+	{  22165941, 156082 },
+	{  22457769, 156523 },
+	{  22752449, 156964 },
+	{  23049999, 157405 },
+	{  23350435, 157846 },
+	{  23653774, 158287 },
+	{  23960036, 158727 },
+	{  24269236, 159168 },
+	{  24581392, 159608 },
+	{  24896521, 160049 },
+	{  25214642, 160489 },
+	{  25535772, 160929 },
+	{  25859927, 161370 },
+	{  26187127, 161810 },
+	{  26517388, 162250 },
+	{  26850728, 162690 },
+	{  27187165, 163130 },
+	{  27526716, 163569 },
+	{  27869400, 164009 },
+	{  28215234, 164449 },
+	{  28564236, 164889 },
+	{  28916423, 165328 },
+	{  29271815, 165768 },
+	{  29630428, 166208 },
+	{  29992281, 166647 },
+	{  30357392, 167087 },
+	{  30725779, 167526 },
+	{  31097459, 167965 },
+	{  31472452, 168405 },
+	{  31850774, 168844 },
+	{  32232445, 169283 },
+	{  32617482, 169723 },
+	{  33005904, 170162 },
+	{  33397730, 170601 },
+	{  33792976, 171041 },
+	{  34191663, 171480 },
+	{  34593807, 171919 },
+	{  34999428, 172358 },
+	{  35408544, 172797 },
+	{  35821174, 173237 },
+	{  36237335, 173676 },
+	{  36657047, 174115 },
+	{  37080329, 174554 },
+	{  37507197, 174993 },
+	{  37937673, 175433 },
+	{  38371773, 175872 },
+	{  38809517, 176311 },
+	{  39250924, 176750 },
+	{  39696012, 177190 },
+	{  40144800, 177629 },
+	{  40597308, 178068 },
+	{  41053553, 178507 },
+	{  41513554, 178947 },
+	{  41977332, 179386 },
+	{  42444904, 179825 },
+	{  42916290, 180265 },
+	{  43391509, 180704 },
+	{  43870579, 181144 },
+	{  44353520, 181583 },
+	{  44840352, 182023 },
+	{  45331092, 182462 },
+	{  45825761, 182902 },
+	{  46324378, 183342 },
+	{  46826961, 183781 },
+	{  47333531, 184221 },
+	{  47844106, 184661 },
+	{  48358706, 185101 },
+	{  48877350, 185541 },
+	{  49400058, 185981 },
+	{  49926849, 186421 },
+	{  50457743, 186861 },
+	{  50992759, 187301 },
+	{  51531916, 187741 },
+	{  52075235, 188181 },
+	{  52622735, 188622 },
+	{  53174435, 189062 },
+	{  53730355, 189502 },
+	{  54290515, 189943 },
+	{  54854935, 190383 },
+	{  55423634, 190824 },
+	{  55996633, 191265 },
+	{  56573950, 191706 },
+	{  57155606, 192146 },
+	{  57741621, 192587 },
+	{  58332014, 193028 },
+	{  58926806, 193470 },
+	{  59526017, 193911 },
+	{  60129666, 194352 },
+	{  60737774, 194793 },
+	{  61350361, 195235 },
+	{  61967446, 195677 },
+	{  62589050, 196118 },
+	{  63215194, 196560 },
+	{  63845897, 197002 },
+	{  64481179, 197444 },
+	{  65121061, 197886 },
+	{  65765563, 198328 },
+	{  66414705, 198770 },
+	{  67068508, 199213 },
+	{  67726992, 199655 },
+	{  68390177, 200098 },
+	{  69058085, 200540 },
+	{  69730735, 200983 },
+	{  70408147, 201426 },
+	{  71090343, 201869 },
+	{  71777343, 202312 },
+	{  72469168, 202755 },
+	{  73165837, 203199 },
+	{  73867373, 203642 },
+	{  74573795, 204086 },
+	{  75285124, 204529 },
+	{  76001380, 204973 },
+	{  76722586, 205417 },
+	{  77448761, 205861 },
+	{  78179926, 206306 },
+	{  78916102, 206750 },
+	{  79657310, 207194 },
+	{  80403571, 207639 },
+	{  81154906, 208084 },
+	{  81911335, 208529 },
+	{  82672880, 208974 },
+	{  83439562, 209419 },
+	{  84211402, 209864 },
+	{  84988421, 210309 },
+	{  85770640, 210755 },
+	{  86558080, 211201 },
+	{  87350762, 211647 },
+	{  88148708, 212093 },
+	{  88951938, 212539 },
+	{  89760475, 212985 },
+	{  90574339, 213432 },
+	{  91393551, 213878 },
+	{  92218133, 214325 },
+	{  93048107, 214772 },
+	{  93883493, 215219 },
+	{  94724314, 215666 },
+	{  95570590, 216114 },
+	{  96422343, 216561 },
+	{  97279594, 217009 },
+	{  98142366, 217457 },
+	{  99010679, 217905 },
+	{  99884556, 218353 },
+	{ 100764018, 218801 },
+	{ 101649086, 219250 },
+	{ 102539782, 219698 },
+	{ 103436128, 220147 },
+	{ 104338146, 220596 },
+	{ 105245857, 221046 },
+	{ 106159284, 221495 },
+	{ 107078448, 221945 },
+	{ 108003370, 222394 },
+	{ 108934074, 222844 },
+	{ 109870580, 223294 },
+	{ 110812910, 223745 },
+	{ 111761087, 224195 },
+	{ 112715133, 224646 },
+	{ 113675069, 225097 },
+	{ 114640918, 225548 },
+	{ 115612702, 225999 },
+	{ 116590442, 226450 },
+	{ 117574162, 226902 },
+	{ 118563882, 227353 },
+	{ 119559626, 227805 },
+	{ 120561415, 228258 },
+	{ 121569272, 228710 },
+	{ 122583219, 229162 },
+	{ 123603278, 229615 },
+	{ 124629471, 230068 },
+	{ 125661822, 230521 },
+	{ 126700352, 230974 },
+	{ 127745083, 231428 },
+	{ 128796039, 231882 },
+	{ 129853241, 232336 },
+	{ 130916713, 232790 },
+	{ 131986475, 233244 },
+	{ 133062553, 233699 },
+	{ 134144966, 234153 },
+	{ 135233739, 234608 },
+	{ 136328894, 235064 },
+	{ 137430453, 235519 },
+	{ 138538440, 235975 },
+	{ 139652876, 236430 },
+	{ 140773786, 236886 },
+	{ 141901190, 237343 },
+	{ 143035113, 237799 },
+	{ 144175576, 238256 },
+	{ 145322604, 238713 },
+	{ 146476218, 239170 },
+	{ 147636442, 239627 },
+	{ 148803298, 240085 },
+	{ 149976809, 240542 },
+	{ 151156999, 241000 },
+	{ 152343890, 241459 },
+	{ 153537506, 241917 },
+	{ 154737869, 242376 },
+	{ 155945002, 242835 },
+	{ 157158929, 243294 },
+	{ 158379673, 243753 },
+	{ 159607257, 244213 },
+	{ 160841704, 244673 },
+	{ 162083037, 245133 },
+	{ 163331279, 245593 },
+	{ 164586455, 246054 },
+	{ 165848586, 246514 },
+	{ 167117696, 246975 },
+	{ 168393810, 247437 },
+	{ 169676949, 247898 },
+	{ 170967138, 248360 },
+	{ 172264399, 248822 },
+	{ 173568757, 249284 },
+	{ 174880235, 249747 },
+	{ 176198856, 250209 },
+	{ 177524643, 250672 },
+	{ 178857621, 251136 },
+	{ 180197813, 251599 },
+	{ 181545242, 252063 },
+	{ 182899933, 252527 },
+	{ 184261908, 252991 },
+	{ 185631191, 253456 },
+	{ 187007807, 253920 },
+	{ 188391778, 254385 },
+	{ 189783129, 254851 },
+	{ 191181884, 255316 },
+	{ 192588065, 255782 },
+	{ 194001698, 256248 },
+	{ 195422805, 256714 },
+	{ 196851411, 257181 },
+	{ 198287540, 257648 },
+	{ 199731215, 258115 },
+	{ 201182461, 258582 },
+	{ 202641302, 259050 },
+	{ 204107760, 259518 },
+	{ 205581862, 259986 },
+	{ 207063630, 260454 },
+	{ 208553088, 260923 },
+	{ 210050262, 261392 },
+	{ 211555174, 261861 },
+	{ 213067849, 262331 },
+	{ 214588312, 262800 },
+	{ 216116586, 263270 },
+	{ 217652696, 263741 },
+	{ 219196666, 264211 },
+	{ 220748520, 264682 },
+	{ 222308282, 265153 },
+	{ 223875978, 265625 },
+	{ 225451630, 266097 },
+	{ 227035265, 266569 },
+	{ 228626905, 267041 },
+	{ 230226576, 267514 },
+	{ 231834302, 267986 },
+	{ 233450107, 268460 },
+	{ 235074016, 268933 },
+	{ 236706054, 269407 },
+	{ 238346244, 269881 },
+	{ 239994613, 270355 },
+	{ 241651183, 270830 },
+	{ 243315981, 271305 }
+};
+
+/* Calculate the send rate as per section 3.1 of RFC3448
+ 
+Returns send rate in bytes per second
+
+Integer maths and lookups are used as not allowed floating point in kernel
+
+The function for Xcalc as per section 3.1 of RFC3448 is:
+
+X =                            s
+     -------------------------------------------------------------
+     R*sqrt(2*b*p/3) + (t_RTO * (3*sqrt(3*b*p/8) * p * (1+32*p^2)))
+
+where 
+X is the trasmit rate in bytes/second
+s is the packet size in bytes
+R is the round trip time in seconds
+p is the loss event rate, between 0 and 1.0, of the number of loss events 
+  as a fraction of the number of packets transmitted
+t_RTO is the TCP retransmission timeout value in seconds
+b is the number of packets acknowledged by a single TCP acknowledgement
+
+we can assume that b = 1 and t_RTO is 4 * R. With this the equation becomes:
+
+X =                            s
+     -----------------------------------------------------------------------
+     R * sqrt(2 * p / 3) + (12 * R * (sqrt(3 * p / 8) * p * (1 + 32 * p^2)))
+
+
+which we can break down into:
+
+X =     s
+     --------
+     R * f(p)
+
+where f(p) = sqrt(2 * p / 3) + (12 * sqrt(3 * p / 8) * p * (1 + 32 * p * p))
+
+Function parameters:
+s - bytes
+R - RTT in usecs
+p - loss rate (decimal fraction multiplied by 1,000,000)
+
+Returns Xcalc in bytes per second
+
+DON'T alter this code unless you run test cases against it as the code
+has been manipulated to stop underflow/overlow.
+
+*/
+u32 tfrc_calc_x(u16 s, u32 R, u32 p)
+{
+	int index;
+	u32 f;
+	u64 tmp1, tmp2;
+
+	if (p < TFRC_CALC_X_SPLIT)
+		index = (p / (TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE)) - 1;
+	else
+		index = (p / (1000000 / TFRC_CALC_X_ARRSIZE)) - 1;
+
+	if (index < 0)
+		/* p should be 0 unless there is a bug in my code */
+		index = 0;
+
+	if (R == 0)
+		R = 1; /* RTT can't be zero or else divide by zero */
+
+	BUG_ON(index >= TFRC_CALC_X_ARRSIZE);
+
+	if (p >= TFRC_CALC_X_SPLIT)
+		f = tfrc_calc_x_lookup[index][0];
+	else
+		f = tfrc_calc_x_lookup[index][1];
+
+	tmp1 = ((u64)s * 100000000);
+	tmp2 = ((u64)R * (u64)f);
+	do_div(tmp2, 10000);
+	do_div(tmp1, tmp2); 
+	/* Don't alter above math unless you test due to overflow on 32 bit */
+
+	return (u32)tmp1; 
+}
+
+EXPORT_SYMBOL_GPL(tfrc_calc_x);
+
+/*
+ * args: fvalue - function value to match
+ * returns: p closest to that value
+ *
+ * both fvalue and p are multiplied by 1,000,000 to use ints
+ */
+u32 tfrc_calc_x_reverse_lookup(u32 fvalue)
+{
+	int ctr = 0;
+	int small;
+
+	if (fvalue < tfrc_calc_x_lookup[0][1])
+		return 0;
+
+	if (fvalue <= tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][1])
+		small = 1;
+	else if (fvalue > tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][0])
+		return 1000000;
+	else
+		small = 0;
+
+	while (fvalue > tfrc_calc_x_lookup[ctr][small])
+		ctr++;
+
+	if (small)
+		return TFRC_CALC_X_SPLIT * ctr / TFRC_CALC_X_ARRSIZE;
+	else
+		return 1000000 * ctr / TFRC_CALC_X_ARRSIZE;
+}
+
+EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup);
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
new file mode 100644
index 00000000000..33456c0d593
--- /dev/null
+++ b/net/dccp/dccp.h
@@ -0,0 +1,493 @@
+#ifndef _DCCP_H
+#define _DCCP_H
+/*
+ *  net/dccp/dccp.h
+ *
+ *  An implementation of the DCCP protocol
+ *  Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *  Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz>
+ *
+ *	This program is free software; you can redistribute it and/or modify it
+ *	under the terms of the GNU General Public License version 2 as
+ *	published by the Free Software Foundation.
+ */
+
+#include <linux/config.h>
+#include <linux/dccp.h>
+#include <net/snmp.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+
+#ifdef CONFIG_IP_DCCP_DEBUG
+extern int dccp_debug;
+
+#define dccp_pr_debug(format, a...) \
+	do { if (dccp_debug) \
+		printk(KERN_DEBUG "%s: " format, __FUNCTION__ , ##a); \
+	} while (0)
+#define dccp_pr_debug_cat(format, a...) do { if (dccp_debug) \
+					     printk(format, ##a); } while (0)
+#else
+#define dccp_pr_debug(format, a...)
+#define dccp_pr_debug_cat(format, a...)
+#endif
+
+extern struct inet_hashinfo dccp_hashinfo;
+
+extern atomic_t dccp_orphan_count;
+extern int dccp_tw_count;
+extern void dccp_tw_deschedule(struct inet_timewait_sock *tw);
+
+extern void dccp_time_wait(struct sock *sk, int state, int timeo);
+
+/* FIXME: Right size this */
+#define DCCP_MAX_OPT_LEN 128
+
+#define DCCP_MAX_PACKET_HDR 32
+
+#define MAX_DCCP_HEADER  (DCCP_MAX_PACKET_HDR + DCCP_MAX_OPT_LEN + MAX_HEADER)
+
+#define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT
+				     * state, about 60 seconds */
+
+/* draft-ietf-dccp-spec-11.txt initial RTO value */
+#define DCCP_TIMEOUT_INIT ((unsigned)(3 * HZ))
+
+/* Maximal interval between probes for local resources.  */
+#define DCCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ / 2U))
+
+#define DCCP_RTO_MAX ((unsigned)(120 * HZ)) /* FIXME: using TCP value */
+
+extern struct proto dccp_v4_prot;
+
+/* is seq1 < seq2 ? */
+static inline int before48(const u64 seq1, const u64 seq2)
+{
+	return (s64)((seq1 << 16) - (seq2 << 16)) < 0;
+}
+
+/* is seq1 > seq2 ? */
+static inline int after48(const u64 seq1, const u64 seq2)
+{
+	return (s64)((seq2 << 16) - (seq1 << 16)) < 0;
+}
+
+/* is seq2 <= seq1 <= seq3 ? */
+static inline int between48(const u64 seq1, const u64 seq2, const u64 seq3)
+{
+	return (seq3 << 16) - (seq2 << 16) >= (seq1 << 16) - (seq2 << 16);
+}
+
+static inline u64 max48(const u64 seq1, const u64 seq2)
+{
+	return after48(seq1, seq2) ? seq1 : seq2;
+}
+
+enum {
+	DCCP_MIB_NUM = 0,
+	DCCP_MIB_ACTIVEOPENS,			/* ActiveOpens */
+	DCCP_MIB_ESTABRESETS,			/* EstabResets */
+	DCCP_MIB_CURRESTAB,			/* CurrEstab */
+	DCCP_MIB_OUTSEGS,			/* OutSegs */ 
+	DCCP_MIB_OUTRSTS,
+	DCCP_MIB_ABORTONTIMEOUT,
+	DCCP_MIB_TIMEOUTS,
+	DCCP_MIB_ABORTFAILED,
+	DCCP_MIB_PASSIVEOPENS,
+	DCCP_MIB_ATTEMPTFAILS,
+	DCCP_MIB_OUTDATAGRAMS,
+	DCCP_MIB_INERRS,
+	DCCP_MIB_OPTMANDATORYERROR,
+	DCCP_MIB_INVALIDOPT,
+	__DCCP_MIB_MAX
+};
+
+#define DCCP_MIB_MAX	__DCCP_MIB_MAX
+struct dccp_mib {
+	unsigned long	mibs[DCCP_MIB_MAX];
+} __SNMP_MIB_ALIGN__;
+
+DECLARE_SNMP_STAT(struct dccp_mib, dccp_statistics);
+#define DCCP_INC_STATS(field)	    SNMP_INC_STATS(dccp_statistics, field)
+#define DCCP_INC_STATS_BH(field)    SNMP_INC_STATS_BH(dccp_statistics, field)
+#define DCCP_INC_STATS_USER(field)  SNMP_INC_STATS_USER(dccp_statistics, field)
+#define DCCP_DEC_STATS(field)	    SNMP_DEC_STATS(dccp_statistics, field)
+#define DCCP_ADD_STATS_BH(field, val) \
+			SNMP_ADD_STATS_BH(dccp_statistics, field, val)
+#define DCCP_ADD_STATS_USER(field, val)	\
+			SNMP_ADD_STATS_USER(dccp_statistics, field, val)
+
+extern int  dccp_transmit_skb(struct sock *sk, struct sk_buff *skb);
+extern int  dccp_retransmit_skb(struct sock *sk, struct sk_buff *skb);
+
+extern int dccp_send_response(struct sock *sk);
+extern void dccp_send_ack(struct sock *sk);
+extern void dccp_send_delayed_ack(struct sock *sk);
+extern void dccp_send_sync(struct sock *sk, const u64 seq,
+			   const enum dccp_pkt_type pkt_type);
+
+extern int dccp_write_xmit(struct sock *sk, struct sk_buff *skb, long *timeo);
+extern void dccp_write_space(struct sock *sk);
+
+extern void dccp_init_xmit_timers(struct sock *sk);
+static inline void dccp_clear_xmit_timers(struct sock *sk)
+{
+	inet_csk_clear_xmit_timers(sk);
+}
+
+extern unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu);
+
+extern const char *dccp_packet_name(const int type);
+extern const char *dccp_state_name(const int state);
+
+static inline void dccp_set_state(struct sock *sk, const int state)
+{
+	const int oldstate = sk->sk_state;
+
+	dccp_pr_debug("%s(%p) %-10.10s -> %s\n",
+		      dccp_role(sk), sk,
+		      dccp_state_name(oldstate), dccp_state_name(state));
+	WARN_ON(state == oldstate);
+
+	switch (state) {
+	case DCCP_OPEN:
+		if (oldstate != DCCP_OPEN)
+			DCCP_INC_STATS(DCCP_MIB_CURRESTAB);
+		break;
+
+	case DCCP_CLOSED:
+		if (oldstate == DCCP_CLOSING || oldstate == DCCP_OPEN)
+			DCCP_INC_STATS(DCCP_MIB_ESTABRESETS);
+
+		sk->sk_prot->unhash(sk);
+		if (inet_csk(sk)->icsk_bind_hash != NULL &&
+		    !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
+			inet_put_port(&dccp_hashinfo, sk);
+		/* fall through */
+	default:
+		if (oldstate == DCCP_OPEN)
+			DCCP_DEC_STATS(DCCP_MIB_CURRESTAB);
+	}
+
+	/* Change state AFTER socket is unhashed to avoid closed
+	 * socket sitting in hash tables.
+	 */
+	sk->sk_state = state;
+}
+
+static inline void dccp_done(struct sock *sk)
+{
+	dccp_set_state(sk, DCCP_CLOSED);
+	dccp_clear_xmit_timers(sk);
+
+	sk->sk_shutdown = SHUTDOWN_MASK;
+
+	if (!sock_flag(sk, SOCK_DEAD))
+		sk->sk_state_change(sk);
+	else
+		inet_csk_destroy_sock(sk);
+}
+
+static inline void dccp_openreq_init(struct request_sock *req,
+				     struct dccp_sock *dp,
+				     struct sk_buff *skb)
+{
+	/*
+	 * FIXME: fill in the other req fields from the DCCP options
+	 * received
+	 */
+	inet_rsk(req)->rmt_port = dccp_hdr(skb)->dccph_sport;
+	inet_rsk(req)->acked	= 0;
+	req->rcv_wnd = 0;
+}
+
+extern int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
+
+extern struct sock *dccp_create_openreq_child(struct sock *sk,
+					      const struct request_sock *req,
+					      const struct sk_buff *skb);
+
+extern int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb);
+
+extern void dccp_v4_err(struct sk_buff *skb, u32);
+
+extern int dccp_v4_rcv(struct sk_buff *skb);
+
+extern struct sock *dccp_v4_request_recv_sock(struct sock *sk,
+					      struct sk_buff *skb,
+					      struct request_sock *req,
+					      struct dst_entry *dst);
+extern struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
+				   struct request_sock *req,
+				   struct request_sock **prev);
+
+extern int dccp_child_process(struct sock *parent, struct sock *child,
+			      struct sk_buff *skb);
+extern int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+				  struct dccp_hdr *dh, unsigned len);
+extern int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
+				const struct dccp_hdr *dh, const unsigned len);
+
+extern void		dccp_close(struct sock *sk, long timeout);
+extern struct sk_buff	*dccp_make_response(struct sock *sk,
+					    struct dst_entry *dst,
+					    struct request_sock *req);
+extern struct sk_buff	*dccp_make_reset(struct sock *sk,
+					 struct dst_entry *dst,
+					 enum dccp_reset_codes code);
+
+extern int	   dccp_connect(struct sock *sk);
+extern int	   dccp_disconnect(struct sock *sk, int flags);
+extern int	   dccp_getsockopt(struct sock *sk, int level, int optname,
+				   char __user *optval, int __user *optlen);
+extern int	   dccp_setsockopt(struct sock *sk, int level, int optname,
+				   char __user *optval, int optlen);
+extern int	   dccp_ioctl(struct sock *sk, int cmd, unsigned long arg);
+extern int	   dccp_sendmsg(struct kiocb *iocb, struct sock *sk,
+				struct msghdr *msg, size_t size);
+extern int	   dccp_recvmsg(struct kiocb *iocb, struct sock *sk,
+				struct msghdr *msg, size_t len, int nonblock,
+				int flags, int *addr_len);
+extern void	   dccp_shutdown(struct sock *sk, int how);
+
+extern int	   dccp_v4_checksum(const struct sk_buff *skb,
+				    const u32 saddr, const u32 daddr);
+
+extern int	   dccp_v4_send_reset(struct sock *sk,
+				      enum dccp_reset_codes code);
+extern void	   dccp_send_close(struct sock *sk, const int active);
+
+struct dccp_skb_cb {
+	__u8 dccpd_type;
+	__u8 dccpd_reset_code;
+	__u8 dccpd_service;
+	__u8 dccpd_ccval;
+	__u64 dccpd_seq;
+	__u64 dccpd_ack_seq;
+	int  dccpd_opt_len;
+};
+
+#define DCCP_SKB_CB(__skb) ((struct dccp_skb_cb *)&((__skb)->cb[0]))
+
+static inline int dccp_non_data_packet(const struct sk_buff *skb)
+{
+	const __u8 type = DCCP_SKB_CB(skb)->dccpd_type;
+
+	return type == DCCP_PKT_ACK	 ||
+	       type == DCCP_PKT_CLOSE	 ||
+	       type == DCCP_PKT_CLOSEREQ ||
+	       type == DCCP_PKT_RESET	 ||
+	       type == DCCP_PKT_SYNC	 ||
+	       type == DCCP_PKT_SYNCACK;
+}
+
+static inline int dccp_packet_without_ack(const struct sk_buff *skb)
+{
+	const __u8 type = DCCP_SKB_CB(skb)->dccpd_type;
+
+	return type == DCCP_PKT_DATA || type == DCCP_PKT_REQUEST;
+}
+
+#define DCCP_MAX_SEQNO ((((u64)1) << 48) - 1)
+#define DCCP_PKT_WITHOUT_ACK_SEQ (DCCP_MAX_SEQNO << 2)
+
+static inline void dccp_set_seqno(u64 *seqno, u64 value)
+{
+	if (value > DCCP_MAX_SEQNO)
+		value -= DCCP_MAX_SEQNO + 1;
+	*seqno = value;
+}
+
+static inline u64 dccp_delta_seqno(u64 seqno1, u64 seqno2)
+{
+	return ((seqno2 << 16) - (seqno1 << 16)) >> 16;
+}
+
+static inline void dccp_inc_seqno(u64 *seqno)
+{
+	if (++*seqno > DCCP_MAX_SEQNO)
+		*seqno = 0;
+}
+
+static inline void dccp_hdr_set_seq(struct dccp_hdr *dh, const u64 gss)
+{
+	struct dccp_hdr_ext *dhx = (struct dccp_hdr_ext *)((void *)dh +
+							   sizeof(*dh));
+
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	dh->dccph_seq	   = htonl((gss >> 32)) >> 8;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+	dh->dccph_seq	   = htonl((gss >> 32));
+#else
+#error  "Adjust your <asm/byteorder.h> defines"
+#endif
+	dhx->dccph_seq_low = htonl(gss & 0xffffffff);
+}
+
+static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack,
+				    const u64 gsr)
+{
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	dhack->dccph_ack_nr_high = htonl((gsr >> 32)) >> 8;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+	dhack->dccph_ack_nr_high = htonl((gsr >> 32));
+#else
+#error  "Adjust your <asm/byteorder.h> defines"
+#endif
+	dhack->dccph_ack_nr_low  = htonl(gsr & 0xffffffff);
+}
+
+static inline void dccp_update_gsr(struct sock *sk, u64 seq)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+
+	dp->dccps_gsr = seq;
+	dccp_set_seqno(&dp->dccps_swl,
+		       (dp->dccps_gsr + 1 -
+		        (dp->dccps_options.dccpo_sequence_window / 4)));
+	dccp_set_seqno(&dp->dccps_swh,
+		       (dp->dccps_gsr +
+			(3 * dp->dccps_options.dccpo_sequence_window) / 4));
+}
+
+static inline void dccp_update_gss(struct sock *sk, u64 seq)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+
+	dp->dccps_awh = dp->dccps_gss = seq;
+	dccp_set_seqno(&dp->dccps_awl,
+		       (dp->dccps_gss -
+			dp->dccps_options.dccpo_sequence_window + 1));
+}
+
+extern void dccp_insert_options(struct sock *sk, struct sk_buff *skb);
+extern void dccp_insert_option_elapsed_time(struct sock *sk,
+					    struct sk_buff *skb,
+					    u32 elapsed_time);
+extern void dccp_insert_option_timestamp(struct sock *sk,
+					 struct sk_buff *skb);
+extern void dccp_insert_option(struct sock *sk, struct sk_buff *skb,
+			       unsigned char option,
+			       const void *value, unsigned char len);
+
+extern struct socket *dccp_ctl_socket;
+
+#define DCCP_ACKPKTS_STATE_RECEIVED	0
+#define DCCP_ACKPKTS_STATE_ECN_MARKED	(1 << 6)
+#define DCCP_ACKPKTS_STATE_NOT_RECEIVED	(3 << 6)
+
+#define DCCP_ACKPKTS_STATE_MASK		0xC0 /* 11000000 */
+#define DCCP_ACKPKTS_LEN_MASK		0x3F /* 00111111 */
+
+/** struct dccp_ackpkts - acknowledgeable packets
+ *
+ * This data structure is the one defined in the DCCP draft
+ * Appendix A.
+ *
+ * @dccpap_buf_head - circular buffer head
+ * @dccpap_buf_tail - circular buffer tail
+ * @dccpap_buf_ackno - ack # of the most recent packet acknowledgeable in the
+ * 		       buffer (i.e. %dccpap_buf_head)
+ * @dccpap_buf_nonce - the one-bit sum of the ECN Nonces on all packets acked
+ * 		       by the buffer with State 0
+ *
+ * Additionally, the HC-Receiver must keep some information about the
+ * Ack Vectors it has recently sent. For each packet sent carrying an
+ * Ack Vector, it remembers four variables:
+ *
+ * @dccpap_ack_seqno - the Sequence Number used for the packet
+ * 		       (HC-Receiver seqno)
+ * @dccpap_ack_ptr - the value of buf_head at the time of acknowledgement.
+ * @dccpap_ack_ackno - the Acknowledgement Number used for the packet
+ * 		       (HC-Sender seqno)
+ * @dccpap_ack_nonce - the one-bit sum of the ECN Nonces for all State 0.
+ *
+ * @dccpap_buf_len - circular buffer length
+ * @dccpap_time		- the time in usecs
+ * @dccpap_buf - circular buffer of acknowledgeable packets
+ */
+struct dccp_ackpkts {
+	unsigned int		dccpap_buf_head;
+	unsigned int		dccpap_buf_tail;
+	u64			dccpap_buf_ackno;
+	u64			dccpap_ack_seqno;
+	u64			dccpap_ack_ackno;
+	unsigned int		dccpap_ack_ptr;
+	unsigned int		dccpap_buf_vector_len;
+	unsigned int		dccpap_ack_vector_len;
+	unsigned int		dccpap_buf_len;
+	struct timeval		dccpap_time;
+	u8			dccpap_buf_nonce;
+	u8			dccpap_ack_nonce;
+	u8			dccpap_buf[0];
+};
+
+extern struct dccp_ackpkts *
+		dccp_ackpkts_alloc(unsigned int len,
+				  const unsigned int __nocast priority);
+extern void dccp_ackpkts_free(struct dccp_ackpkts *ap);
+extern int dccp_ackpkts_add(struct dccp_ackpkts *ap, u64 ackno, u8 state);
+extern void dccp_ackpkts_check_rcv_ackno(struct dccp_ackpkts *ap,
+					 struct sock *sk, u64 ackno);
+
+static inline suseconds_t timeval_usecs(const struct timeval *tv)
+{
+	return tv->tv_sec * USEC_PER_SEC + tv->tv_usec;
+}
+
+static inline suseconds_t timeval_delta(const struct timeval *large,
+					const struct timeval *small)
+{
+	time_t	    secs  = large->tv_sec  - small->tv_sec;
+	suseconds_t usecs = large->tv_usec - small->tv_usec;
+
+	if (usecs < 0) {
+		secs--;
+		usecs += USEC_PER_SEC;
+	}
+	return secs * USEC_PER_SEC + usecs;
+}
+
+static inline void timeval_add_usecs(struct timeval *tv,
+				     const suseconds_t usecs)
+{
+	tv->tv_usec += usecs;
+	while (tv->tv_usec >= USEC_PER_SEC) {
+		tv->tv_sec++;
+		tv->tv_usec -= USEC_PER_SEC;
+	}
+}
+
+static inline void timeval_sub_usecs(struct timeval *tv,
+				     const suseconds_t usecs)
+{
+	tv->tv_usec -= usecs;
+	while (tv->tv_usec < 0) {
+		tv->tv_sec--;
+		tv->tv_usec += USEC_PER_SEC;
+	}
+}
+
+/*
+ * Returns the difference in usecs between timeval
+ * passed in and current time
+ */
+static inline suseconds_t timeval_now_delta(const struct timeval *tv)
+{
+	struct timeval now;
+	do_gettimeofday(&now);
+	return timeval_delta(&now, tv);
+}
+
+#ifdef CONFIG_IP_DCCP_DEBUG
+extern void dccp_ackvector_print(const u64 ackno,
+				 const unsigned char *vector, int len);
+extern void dccp_ackpkts_print(const struct dccp_ackpkts *ap);
+#else
+static inline void dccp_ackvector_print(const u64 ackno,
+					const unsigned char *vector,
+					int len) { }
+static inline void dccp_ackpkts_print(const struct dccp_ackpkts *ap) { }
+#endif
+
+#endif /* _DCCP_H */
diff --git a/net/dccp/diag.c b/net/dccp/diag.c
new file mode 100644
index 00000000000..f675d8e642d
--- /dev/null
+++ b/net/dccp/diag.c
@@ -0,0 +1,71 @@
+/*
+ *  net/dccp/diag.c
+ *
+ *  An implementation of the DCCP protocol
+ *  Arnaldo Carvalho de Melo <acme@mandriva.com>
+ *
+ *	This program is free software; you can redistribute it and/or modify it
+ *	under the terms of the GNU General Public License version 2 as
+ *	published by the Free Software Foundation.
+ */
+
+#include <linux/config.h>
+
+#include <linux/module.h>
+#include <linux/inet_diag.h>
+
+#include "ccid.h"
+#include "dccp.h"
+
+static void dccp_get_info(struct sock *sk, struct tcp_info *info)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+
+	memset(info, 0, sizeof(*info));
+
+	info->tcpi_state	= sk->sk_state;
+	info->tcpi_retransmits	= icsk->icsk_retransmits;
+	info->tcpi_probes	= icsk->icsk_probes_out;
+	info->tcpi_backoff	= icsk->icsk_backoff;
+	info->tcpi_pmtu		= dp->dccps_pmtu_cookie;
+
+	if (dp->dccps_options.dccpo_send_ack_vector)
+		info->tcpi_options |= TCPI_OPT_SACK;
+
+	ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info);
+	ccid_hc_tx_get_info(dp->dccps_hc_tx_ccid, sk, info);
+}
+
+static void dccp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
+			       void *_info)
+{
+	r->idiag_rqueue = r->idiag_wqueue = 0;
+
+	if (_info != NULL)
+		dccp_get_info(sk, _info);
+}
+
+static struct inet_diag_handler dccp_diag_handler = {
+	.idiag_hashinfo	 = &dccp_hashinfo,
+	.idiag_get_info	 = dccp_diag_get_info,
+	.idiag_type	 = DCCPDIAG_GETSOCK,
+	.idiag_info_size = sizeof(struct tcp_info),
+};
+
+static int __init dccp_diag_init(void)
+{
+	return inet_diag_register(&dccp_diag_handler);
+}
+
+static void __exit dccp_diag_fini(void)
+{
+	inet_diag_unregister(&dccp_diag_handler);
+}
+
+module_init(dccp_diag_init);
+module_exit(dccp_diag_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>");
+MODULE_DESCRIPTION("DCCP inet_diag handler");
diff --git a/net/dccp/input.c b/net/dccp/input.c
new file mode 100644
index 00000000000..ef29cef1daf
--- /dev/null
+++ b/net/dccp/input.c
@@ -0,0 +1,600 @@
+/*
+ *  net/dccp/input.c
+ * 
+ *  An implementation of the DCCP protocol
+ *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/dccp.h>
+#include <linux/skbuff.h>
+
+#include <net/sock.h>
+
+#include "ccid.h"
+#include "dccp.h"
+
+static void dccp_fin(struct sock *sk, struct sk_buff *skb)
+{
+	sk->sk_shutdown |= RCV_SHUTDOWN;
+	sock_set_flag(sk, SOCK_DONE);
+	__skb_pull(skb, dccp_hdr(skb)->dccph_doff * 4);
+	__skb_queue_tail(&sk->sk_receive_queue, skb);
+	skb_set_owner_r(skb, sk);
+	sk->sk_data_ready(sk, 0);
+}
+
+static void dccp_rcv_close(struct sock *sk, struct sk_buff *skb)
+{
+	dccp_v4_send_reset(sk, DCCP_RESET_CODE_CLOSED);
+	dccp_fin(sk, skb);
+	dccp_set_state(sk, DCCP_CLOSED);
+	sk_wake_async(sk, 1, POLL_HUP);
+}
+
+static void dccp_rcv_closereq(struct sock *sk, struct sk_buff *skb)
+{
+	/*
+	 *   Step 7: Check for unexpected packet types
+	 *      If (S.is_server and P.type == CloseReq)
+	 *	  Send Sync packet acknowledging P.seqno
+	 *	  Drop packet and return
+	 */
+	if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT) {
+		dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNC);
+		return;
+	}
+
+	dccp_set_state(sk, DCCP_CLOSING);
+	dccp_send_close(sk, 0);
+}
+
+static inline void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+
+	if (dp->dccps_options.dccpo_send_ack_vector)
+		dccp_ackpkts_check_rcv_ackno(dp->dccps_hc_rx_ackpkts, sk,
+					     DCCP_SKB_CB(skb)->dccpd_ack_seq);
+}
+
+static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb)
+{
+	const struct dccp_hdr *dh = dccp_hdr(skb);
+	struct dccp_sock *dp = dccp_sk(sk);
+	u64 lswl, lawl;
+
+	/*
+	 *   Step 5: Prepare sequence numbers for Sync
+	 *     If P.type == Sync or P.type == SyncAck,
+	 *	  If S.AWL <= P.ackno <= S.AWH and P.seqno >= S.SWL,
+	 *	     / * P is valid, so update sequence number variables
+	 *		 accordingly.  After this update, P will pass the tests
+	 *		 in Step 6.  A SyncAck is generated if necessary in
+	 *		 Step 15 * /
+	 *	     Update S.GSR, S.SWL, S.SWH
+	 *	  Otherwise,
+	 *	     Drop packet and return
+	 */
+	if (dh->dccph_type == DCCP_PKT_SYNC || 
+	    dh->dccph_type == DCCP_PKT_SYNCACK) {
+		if (between48(DCCP_SKB_CB(skb)->dccpd_ack_seq,
+			      dp->dccps_awl, dp->dccps_awh) &&
+		    !before48(DCCP_SKB_CB(skb)->dccpd_seq, dp->dccps_swl))
+			dccp_update_gsr(sk, DCCP_SKB_CB(skb)->dccpd_seq);
+		else
+			return -1;
+	}
+	
+	/*
+	 *   Step 6: Check sequence numbers
+	 *      Let LSWL = S.SWL and LAWL = S.AWL
+	 *      If P.type == CloseReq or P.type == Close or P.type == Reset,
+	 *	  LSWL := S.GSR + 1, LAWL := S.GAR
+	 *      If LSWL <= P.seqno <= S.SWH
+	 *	     and (P.ackno does not exist or LAWL <= P.ackno <= S.AWH),
+	 *	  Update S.GSR, S.SWL, S.SWH
+	 *	  If P.type != Sync,
+	 *	     Update S.GAR
+	 *      Otherwise,
+	 *	  Send Sync packet acknowledging P.seqno
+	 *	  Drop packet and return
+	 */
+	lswl = dp->dccps_swl;
+	lawl = dp->dccps_awl;
+
+	if (dh->dccph_type == DCCP_PKT_CLOSEREQ ||
+	    dh->dccph_type == DCCP_PKT_CLOSE ||
+	    dh->dccph_type == DCCP_PKT_RESET) {
+		lswl = dp->dccps_gsr;
+		dccp_inc_seqno(&lswl);
+		lawl = dp->dccps_gar;
+	}
+
+	if (between48(DCCP_SKB_CB(skb)->dccpd_seq, lswl, dp->dccps_swh) &&
+	    (DCCP_SKB_CB(skb)->dccpd_ack_seq == DCCP_PKT_WITHOUT_ACK_SEQ ||
+	     between48(DCCP_SKB_CB(skb)->dccpd_ack_seq,
+		       lawl, dp->dccps_awh))) {
+		dccp_update_gsr(sk, DCCP_SKB_CB(skb)->dccpd_seq);
+
+		if (dh->dccph_type != DCCP_PKT_SYNC &&
+		    (DCCP_SKB_CB(skb)->dccpd_ack_seq !=
+		     DCCP_PKT_WITHOUT_ACK_SEQ))
+			dp->dccps_gar = DCCP_SKB_CB(skb)->dccpd_ack_seq;
+	} else {
+		LIMIT_NETDEBUG(KERN_WARNING "DCCP: Step 6 failed for %s packet, "
+					    "(LSWL(%llu) <= P.seqno(%llu) <= S.SWH(%llu)) and "
+					    "(P.ackno %s or LAWL(%llu) <= P.ackno(%llu) <= S.AWH(%llu), "
+					    "sending SYNC...\n",
+			       dccp_packet_name(dh->dccph_type),
+			       (unsigned long long) lswl,
+			       (unsigned long long)
+			       DCCP_SKB_CB(skb)->dccpd_seq,
+			       (unsigned long long) dp->dccps_swh,
+			       (DCCP_SKB_CB(skb)->dccpd_ack_seq ==
+			        DCCP_PKT_WITHOUT_ACK_SEQ) ? "doesn't exist" : "exists",
+			       (unsigned long long) lawl,
+			       (unsigned long long)
+			       DCCP_SKB_CB(skb)->dccpd_ack_seq,
+			       (unsigned long long) dp->dccps_awh);
+		dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNC);
+		return -1;
+	}
+
+	return 0;
+}
+
+int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
+			 const struct dccp_hdr *dh, const unsigned len)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+
+	if (dccp_check_seqno(sk, skb))
+		goto discard;
+
+	if (dccp_parse_options(sk, skb))
+		goto discard;
+
+	if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
+		dccp_event_ack_recv(sk, skb);
+
+	/*
+	 * FIXME: check ECN to see if we should use
+	 * DCCP_ACKPKTS_STATE_ECN_MARKED
+	 */
+	if (dp->dccps_options.dccpo_send_ack_vector) {
+		struct dccp_ackpkts *ap = dp->dccps_hc_rx_ackpkts;
+
+		if (dccp_ackpkts_add(dp->dccps_hc_rx_ackpkts,
+				     DCCP_SKB_CB(skb)->dccpd_seq,
+				     DCCP_ACKPKTS_STATE_RECEIVED)) {
+			LIMIT_NETDEBUG(KERN_WARNING "DCCP: acknowledgeable "
+						    "packets buffer full!\n");
+			ap->dccpap_ack_seqno = DCCP_MAX_SEQNO + 1;
+			inet_csk_schedule_ack(sk);
+			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+						  TCP_DELACK_MIN,
+						  DCCP_RTO_MAX);
+			goto discard;
+		}
+
+		/*
+		 * FIXME: this activation is probably wrong, have to study more
+		 * TCP delack machinery and how it fits into DCCP draft, but
+		 * for now it kinda "works" 8)
+		 */
+		if (!inet_csk_ack_scheduled(sk)) {
+			inet_csk_schedule_ack(sk);
+			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, 5 * HZ,
+						  DCCP_RTO_MAX);
+		}
+	}
+
+	ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb);
+	ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);
+
+	switch (dccp_hdr(skb)->dccph_type) {
+	case DCCP_PKT_DATAACK:
+	case DCCP_PKT_DATA:
+		/*
+		 * FIXME: check if sk_receive_queue is full, schedule DATA_DROPPED
+		 * option if it is.
+		 */
+		__skb_pull(skb, dh->dccph_doff * 4);
+		__skb_queue_tail(&sk->sk_receive_queue, skb);
+		skb_set_owner_r(skb, sk);
+		sk->sk_data_ready(sk, 0);
+		return 0;
+	case DCCP_PKT_ACK:
+		goto discard;
+	case DCCP_PKT_RESET:
+		/*
+		 *  Step 9: Process Reset
+		 *	If P.type == Reset,
+		 *		Tear down connection
+		 *		S.state := TIMEWAIT
+		 *		Set TIMEWAIT timer
+		 *		Drop packet and return
+		*/
+		dccp_fin(sk, skb);
+		dccp_time_wait(sk, DCCP_TIME_WAIT, 0);
+		return 0;
+	case DCCP_PKT_CLOSEREQ:
+		dccp_rcv_closereq(sk, skb);
+		goto discard;
+	case DCCP_PKT_CLOSE:
+		dccp_rcv_close(sk, skb);
+		return 0;
+	case DCCP_PKT_REQUEST:
+		/* Step 7 
+            	 *   or (S.is_server and P.type == Response)
+		 *   or (S.is_client and P.type == Request)
+		 *   or (S.state >= OPEN and P.type == Request
+		 *	and P.seqno >= S.OSR)
+		 *    or (S.state >= OPEN and P.type == Response
+		 *	and P.seqno >= S.OSR)
+		 *    or (S.state == RESPOND and P.type == Data),
+		 *  Send Sync packet acknowledging P.seqno
+		 *  Drop packet and return
+		 */
+		if (dp->dccps_role != DCCP_ROLE_LISTEN)
+			goto send_sync;
+		goto check_seq;
+	case DCCP_PKT_RESPONSE:
+		if (dp->dccps_role != DCCP_ROLE_CLIENT)
+			goto send_sync;
+check_seq:
+		if (!before48(DCCP_SKB_CB(skb)->dccpd_seq, dp->dccps_osr)) {
+send_sync:
+			dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq,
+				       DCCP_PKT_SYNC);
+		}
+		break;
+	case DCCP_PKT_SYNC:
+		dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq,
+			       DCCP_PKT_SYNCACK);
+		/*
+		 * From the draft:
+		 *
+		 * As with DCCP-Ack packets, DCCP-Sync and DCCP-SyncAck packets
+		 * MAY have non-zero-length application data areas, whose
+		 * contents * receivers MUST ignore.
+		 */
+		goto discard;
+	}
+
+	DCCP_INC_STATS_BH(DCCP_MIB_INERRS);
+discard:
+	__kfree_skb(skb);
+	return 0;
+}
+
+static int dccp_rcv_request_sent_state_process(struct sock *sk,
+					       struct sk_buff *skb,
+					       const struct dccp_hdr *dh,
+					       const unsigned len)
+{
+	/* 
+	 *  Step 4: Prepare sequence numbers in REQUEST
+	 *     If S.state == REQUEST,
+	 *	  If (P.type == Response or P.type == Reset)
+	 *		and S.AWL <= P.ackno <= S.AWH,
+	 *	     / * Set sequence number variables corresponding to the
+	 *		other endpoint, so P will pass the tests in Step 6 * /
+	 *	     Set S.GSR, S.ISR, S.SWL, S.SWH
+	 *	     / * Response processing continues in Step 10; Reset
+	 *		processing continues in Step 9 * /
+	*/
+	if (dh->dccph_type == DCCP_PKT_RESPONSE) {
+		const struct inet_connection_sock *icsk = inet_csk(sk);
+		struct dccp_sock *dp = dccp_sk(sk);
+
+		/* Stop the REQUEST timer */
+		inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
+		BUG_TRAP(sk->sk_send_head != NULL);
+		__kfree_skb(sk->sk_send_head);
+		sk->sk_send_head = NULL;
+
+		if (!between48(DCCP_SKB_CB(skb)->dccpd_ack_seq,
+			       dp->dccps_awl, dp->dccps_awh)) {
+			dccp_pr_debug("invalid ackno: S.AWL=%llu, "
+				      "P.ackno=%llu, S.AWH=%llu \n",
+				      (unsigned long long)dp->dccps_awl,
+			   (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq,
+				      (unsigned long long)dp->dccps_awh);
+			goto out_invalid_packet;
+		}
+
+		dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq;
+		dccp_update_gsr(sk, dp->dccps_isr);
+		/*
+		 * SWL and AWL are initially adjusted so that they are not less than
+		 * the initial Sequence Numbers received and sent, respectively:
+		 *	SWL := max(GSR + 1 - floor(W/4), ISR),
+		 *	AWL := max(GSS - W' + 1, ISS).
+		 * These adjustments MUST be applied only at the beginning of the
+		 * connection.
+		 *
+		 * AWL was adjusted in dccp_v4_connect -acme
+		 */
+		dccp_set_seqno(&dp->dccps_swl,
+			       max48(dp->dccps_swl, dp->dccps_isr));
+
+		if (ccid_hc_rx_init(dp->dccps_hc_rx_ccid, sk) != 0 ||
+		    ccid_hc_tx_init(dp->dccps_hc_tx_ccid, sk) != 0) {
+			ccid_hc_rx_exit(dp->dccps_hc_rx_ccid, sk);
+			ccid_hc_tx_exit(dp->dccps_hc_tx_ccid, sk);
+			/* FIXME: send appropriate RESET code */
+			goto out_invalid_packet;
+		}
+
+		dccp_sync_mss(sk, dp->dccps_pmtu_cookie);
+
+		/*
+		 *    Step 10: Process REQUEST state (second part)
+		 *       If S.state == REQUEST,
+		 *	  / * If we get here, P is a valid Response from the
+		 *	      server (see Step 4), and we should move to
+		 *	      PARTOPEN state. PARTOPEN means send an Ack,
+		 *	      don't send Data packets, retransmit Acks
+		 *	      periodically, and always include any Init Cookie
+		 *	      from the Response * /
+		 *	  S.state := PARTOPEN
+		 *	  Set PARTOPEN timer
+		 * 	  Continue with S.state == PARTOPEN
+		 *	  / * Step 12 will send the Ack completing the
+		 *	      three-way handshake * /
+		 */
+		dccp_set_state(sk, DCCP_PARTOPEN);
+
+		/* Make sure socket is routed, for correct metrics. */
+		inet_sk_rebuild_header(sk);
+
+		if (!sock_flag(sk, SOCK_DEAD)) {
+			sk->sk_state_change(sk);
+			sk_wake_async(sk, 0, POLL_OUT);
+		}
+
+		if (sk->sk_write_pending || icsk->icsk_ack.pingpong ||
+		    icsk->icsk_accept_queue.rskq_defer_accept) {
+			/* Save one ACK. Data will be ready after
+			 * several ticks, if write_pending is set.
+			 *
+			 * It may be deleted, but with this feature tcpdumps
+			 * look so _wonderfully_ clever, that I was not able
+			 * to stand against the temptation 8)     --ANK
+			 */
+			/*
+			 * OK, in DCCP we can as well do a similar trick, its
+			 * even in the draft, but there is no need for us to
+			 * schedule an ack here, as dccp_sendmsg does this for
+			 * us, also stated in the draft. -acme
+			 */
+			__kfree_skb(skb);
+			return 0;
+		} 
+		dccp_send_ack(sk);
+		return -1;
+	}
+
+out_invalid_packet:
+	return 1; /* dccp_v4_do_rcv will send a reset, but...
+		     FIXME: the reset code should be
+			    DCCP_RESET_CODE_PACKET_ERROR */
+}
+
+static int dccp_rcv_respond_partopen_state_process(struct sock *sk,
+						   struct sk_buff *skb,
+						   const struct dccp_hdr *dh,
+						   const unsigned len)
+{
+	int queued = 0;
+
+	switch (dh->dccph_type) {
+	case DCCP_PKT_RESET:
+		inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
+		break;
+	case DCCP_PKT_DATAACK:
+	case DCCP_PKT_ACK:
+		/*
+		 * FIXME: we should be reseting the PARTOPEN (DELACK) timer
+		 * here but only if we haven't used the DELACK timer for
+		 * something else, like sending a delayed ack for a TIMESTAMP
+		 * echo, etc, for now were not clearing it, sending an extra
+		 * ACK when there is nothing else to do in DELACK is not a big
+		 * deal after all.
+		 */
+
+		/* Stop the PARTOPEN timer */
+		if (sk->sk_state == DCCP_PARTOPEN)
+			inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
+
+		dccp_sk(sk)->dccps_osr = DCCP_SKB_CB(skb)->dccpd_seq;
+		dccp_set_state(sk, DCCP_OPEN);
+
+		if (dh->dccph_type == DCCP_PKT_DATAACK) {
+			dccp_rcv_established(sk, skb, dh, len);
+			queued = 1; /* packet was queued
+				       (by dccp_rcv_established) */
+		}
+		break;
+	}
+
+	return queued;
+}
+
+int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+			   struct dccp_hdr *dh, unsigned len)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	const int old_state = sk->sk_state;
+	int queued = 0;
+
+	/*
+	 *  Step 3: Process LISTEN state
+	 *  	(Continuing from dccp_v4_do_rcv and dccp_v6_do_rcv)
+	 *
+	 *     If S.state == LISTEN,
+	 *	  If P.type == Request or P contains a valid Init Cookie
+	 *	  	option,
+	 *	     * Must scan the packet's options to check for an Init
+	 *		Cookie.  Only the Init Cookie is processed here,
+	 *		however; other options are processed in Step 8.  This
+	 *		scan need only be performed if the endpoint uses Init
+	 *		Cookies *
+	 *	     * Generate a new socket and switch to that socket *
+	 *	     Set S := new socket for this port pair
+	 *	     S.state = RESPOND
+	 *	     Choose S.ISS (initial seqno) or set from Init Cookie
+	 *	     Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
+	 *	     Continue with S.state == RESPOND
+	 *	     * A Response packet will be generated in Step 11 *
+	 *	  Otherwise,
+	 *	     Generate Reset(No Connection) unless P.type == Reset
+	 *	     Drop packet and return
+	 *
+	 * NOTE: the check for the packet types is done in
+	 *	 dccp_rcv_state_process
+	 */
+	if (sk->sk_state == DCCP_LISTEN) {
+		if (dh->dccph_type == DCCP_PKT_REQUEST) {
+			if (dccp_v4_conn_request(sk, skb) < 0)
+				return 1;
+
+			/* FIXME: do congestion control initialization */
+			goto discard;
+		}
+		if (dh->dccph_type == DCCP_PKT_RESET)
+			goto discard;
+
+		/* Caller (dccp_v4_do_rcv) will send Reset(No Connection)*/
+		return 1;
+	}
+
+	if (sk->sk_state != DCCP_REQUESTING) {
+		if (dccp_check_seqno(sk, skb))
+			goto discard;
+
+		/*
+		 * Step 8: Process options and mark acknowledgeable
+		 */
+		if (dccp_parse_options(sk, skb))
+			goto discard;
+
+		if (DCCP_SKB_CB(skb)->dccpd_ack_seq !=
+		    DCCP_PKT_WITHOUT_ACK_SEQ)
+			dccp_event_ack_recv(sk, skb);
+
+		ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb);
+		ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);
+
+		/*
+		 * FIXME: check ECN to see if we should use
+		 * DCCP_ACKPKTS_STATE_ECN_MARKED
+		 */
+		if (dp->dccps_options.dccpo_send_ack_vector) {
+			if (dccp_ackpkts_add(dp->dccps_hc_rx_ackpkts,
+					     DCCP_SKB_CB(skb)->dccpd_seq,
+					     DCCP_ACKPKTS_STATE_RECEIVED))
+				goto discard;
+			/*
+			 * FIXME: this activation is probably wrong, have to
+			 * study more TCP delack machinery and how it fits into
+			 * DCCP draft, but for now it kinda "works" 8)
+			 */
+			if ((dp->dccps_hc_rx_ackpkts->dccpap_ack_seqno ==
+			     DCCP_MAX_SEQNO + 1) &&
+			    !inet_csk_ack_scheduled(sk)) {
+				inet_csk_schedule_ack(sk);
+				inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+							  TCP_DELACK_MIN,
+							  DCCP_RTO_MAX);
+			}
+		}
+	}
+
+	/*
+	 *  Step 9: Process Reset
+	 *	If P.type == Reset,
+	 *		Tear down connection
+	 *		S.state := TIMEWAIT
+	 *		Set TIMEWAIT timer
+	 *		Drop packet and return
+	*/
+	if (dh->dccph_type == DCCP_PKT_RESET) {
+		/*
+		 * Queue the equivalent of TCP fin so that dccp_recvmsg
+		 * exits the loop
+		 */
+		dccp_fin(sk, skb);
+		dccp_time_wait(sk, DCCP_TIME_WAIT, 0);
+		return 0;
+		/*
+		 *   Step 7: Check for unexpected packet types
+		 *      If (S.is_server and P.type == CloseReq)
+		 *	    or (S.is_server and P.type == Response)
+		 *	    or (S.is_client and P.type == Request)
+		 *	    or (S.state == RESPOND and P.type == Data),
+		 *	  Send Sync packet acknowledging P.seqno
+		 *	  Drop packet and return
+		 */
+	} else if ((dp->dccps_role != DCCP_ROLE_CLIENT &&
+		    (dh->dccph_type == DCCP_PKT_RESPONSE ||
+		     dh->dccph_type == DCCP_PKT_CLOSEREQ)) ||
+		    (dp->dccps_role == DCCP_ROLE_CLIENT &&
+		     dh->dccph_type == DCCP_PKT_REQUEST) ||
+		    (sk->sk_state == DCCP_RESPOND &&
+		     dh->dccph_type == DCCP_PKT_DATA)) {
+		dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq,
+			       DCCP_PKT_SYNC);
+		goto discard;
+	} else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) {
+		dccp_rcv_closereq(sk, skb);
+		goto discard;
+	} else if (dh->dccph_type == DCCP_PKT_CLOSE) {
+		dccp_rcv_close(sk, skb);
+		return 0;
+	}
+
+	switch (sk->sk_state) {
+	case DCCP_CLOSED:
+		return 1;
+
+	case DCCP_REQUESTING:
+		/* FIXME: do congestion control initialization */
+
+		queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len);
+		if (queued >= 0)
+			return queued;
+
+		__kfree_skb(skb);
+		return 0;
+
+	case DCCP_RESPOND:
+	case DCCP_PARTOPEN:
+		queued = dccp_rcv_respond_partopen_state_process(sk, skb,
+								 dh, len);
+		break;
+	}
+
+	if (dh->dccph_type == DCCP_PKT_ACK ||
+	    dh->dccph_type == DCCP_PKT_DATAACK) {
+		switch (old_state) {
+		case DCCP_PARTOPEN:
+			sk->sk_state_change(sk);
+			sk_wake_async(sk, 0, POLL_OUT);
+			break;
+		}
+	}
+
+	if (!queued) { 
+discard:
+		__kfree_skb(skb);
+	}
+	return 0;
+}
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
new file mode 100644
index 00000000000..3fc75dbee4b
--- /dev/null
+++ b/net/dccp/ipv4.c
@@ -0,0 +1,1356 @@
+/*
+ *  net/dccp/ipv4.c
+ *
+ *  An implementation of the DCCP protocol
+ *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/dccp.h>
+#include <linux/icmp.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/random.h>
+
+#include <net/icmp.h>
+#include <net/inet_hashtables.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <net/xfrm.h>
+
+#include "ccid.h"
+#include "dccp.h"
+
+struct inet_hashinfo __cacheline_aligned dccp_hashinfo = {
+	.lhash_lock	= RW_LOCK_UNLOCKED,
+	.lhash_users	= ATOMIC_INIT(0),
+	.lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(dccp_hashinfo.lhash_wait),
+	.portalloc_lock	= SPIN_LOCK_UNLOCKED,
+	.port_rover	= 1024 - 1,
+};
+
+EXPORT_SYMBOL_GPL(dccp_hashinfo);
+
+static int dccp_v4_get_port(struct sock *sk, const unsigned short snum)
+{
+	return inet_csk_get_port(&dccp_hashinfo, sk, snum);
+}
+
+static void dccp_v4_hash(struct sock *sk)
+{
+	inet_hash(&dccp_hashinfo, sk);
+}
+
+static void dccp_v4_unhash(struct sock *sk)
+{
+	inet_unhash(&dccp_hashinfo, sk);
+}
+
+/* called with local bh disabled */
+static int __dccp_v4_check_established(struct sock *sk, const __u16 lport,
+				      struct inet_timewait_sock **twp)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	const u32 daddr = inet->rcv_saddr;
+	const u32 saddr = inet->daddr;
+	const int dif = sk->sk_bound_dev_if;
+	INET_ADDR_COOKIE(acookie, saddr, daddr)
+	const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
+	const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport,
+				      dccp_hashinfo.ehash_size);
+	struct inet_ehash_bucket *head = &dccp_hashinfo.ehash[hash];
+	const struct sock *sk2;
+	const struct hlist_node *node;
+	struct inet_timewait_sock *tw;
+
+	write_lock(&head->lock);
+
+	/* Check TIME-WAIT sockets first. */
+	sk_for_each(sk2, node, &(head + dccp_hashinfo.ehash_size)->chain) {
+		tw = inet_twsk(sk2);
+
+		if (INET_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif))
+			goto not_unique;
+	}
+	tw = NULL;
+
+	/* And established part... */
+	sk_for_each(sk2, node, &head->chain) {
+		if (INET_MATCH(sk2, acookie, saddr, daddr, ports, dif))
+			goto not_unique;
+	}
+
+	/* Must record num and sport now. Otherwise we will see
+	 * in hash table socket with a funny identity. */
+	inet->num = lport;
+	inet->sport = htons(lport);
+	sk->sk_hashent = hash;
+	BUG_TRAP(sk_unhashed(sk));
+	__sk_add_node(sk, &head->chain);
+	sock_prot_inc_use(sk->sk_prot);
+	write_unlock(&head->lock);
+
+	if (twp != NULL) {
+		*twp = tw;
+		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
+	} else if (tw != NULL) {
+		/* Silly. Should hash-dance instead... */
+		inet_twsk_deschedule(tw, &dccp_death_row);
+		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
+
+		inet_twsk_put(tw);
+	}
+
+	return 0;
+
+not_unique:
+	write_unlock(&head->lock);
+	return -EADDRNOTAVAIL;
+}
+
+/*
+ * Bind a port for a connect operation and hash it.
+ */
+static int dccp_v4_hash_connect(struct sock *sk)
+{
+	const unsigned short snum = inet_sk(sk)->num;
+ 	struct inet_bind_hashbucket *head;
+ 	struct inet_bind_bucket *tb;
+	int ret;
+
+ 	if (snum == 0) {
+ 		int rover;
+ 		int low = sysctl_local_port_range[0];
+ 		int high = sysctl_local_port_range[1];
+ 		int remaining = (high - low) + 1;
+		struct hlist_node *node;
+ 		struct inet_timewait_sock *tw = NULL;
+
+ 		local_bh_disable();
+
+ 		/* TODO. Actually it is not so bad idea to remove
+ 		 * dccp_hashinfo.portalloc_lock before next submission to
+		 * Linus.
+ 		 * As soon as we touch this place at all it is time to think.
+ 		 *
+ 		 * Now it protects single _advisory_ variable
+		 * dccp_hashinfo.port_rover, hence it is mostly useless.
+ 		 * Code will work nicely if we just delete it, but
+ 		 * I am afraid in contented case it will work not better or
+ 		 * even worse: another cpu just will hit the same bucket
+ 		 * and spin there.
+ 		 * So some cpu salt could remove both contention and
+ 		 * memory pingpong. Any ideas how to do this in a nice way?
+ 		 */
+ 		spin_lock(&dccp_hashinfo.portalloc_lock);
+ 		rover = dccp_hashinfo.port_rover;
+
+ 		do {
+ 			rover++;
+ 			if ((rover < low) || (rover > high))
+ 				rover = low;
+ 			head = &dccp_hashinfo.bhash[inet_bhashfn(rover,
+						    dccp_hashinfo.bhash_size)];
+ 			spin_lock(&head->lock);
+
+ 			/* Does not bother with rcv_saddr checks,
+ 			 * because the established check is already
+ 			 * unique enough.
+ 			 */
+			inet_bind_bucket_for_each(tb, node, &head->chain) {
+ 				if (tb->port == rover) {
+ 					BUG_TRAP(!hlist_empty(&tb->owners));
+ 					if (tb->fastreuse >= 0)
+ 						goto next_port;
+ 					if (!__dccp_v4_check_established(sk,
+									 rover,
+									 &tw))
+ 						goto ok;
+ 					goto next_port;
+ 				}
+ 			}
+
+ 			tb = inet_bind_bucket_create(dccp_hashinfo.bind_bucket_cachep,
+						     head, rover);
+ 			if (tb == NULL) {
+ 				spin_unlock(&head->lock);
+ 				break;
+ 			}
+ 			tb->fastreuse = -1;
+ 			goto ok;
+
+ 		next_port:
+ 			spin_unlock(&head->lock);
+ 		} while (--remaining > 0);
+ 		dccp_hashinfo.port_rover = rover;
+ 		spin_unlock(&dccp_hashinfo.portalloc_lock);
+
+ 		local_bh_enable();
+
+ 		return -EADDRNOTAVAIL;
+
+ok:
+ 		/* All locks still held and bhs disabled */
+ 		dccp_hashinfo.port_rover = rover;
+ 		spin_unlock(&dccp_hashinfo.portalloc_lock);
+
+ 		inet_bind_hash(sk, tb, rover);
+		if (sk_unhashed(sk)) {
+ 			inet_sk(sk)->sport = htons(rover);
+ 			__inet_hash(&dccp_hashinfo, sk, 0);
+ 		}
+ 		spin_unlock(&head->lock);
+
+ 		if (tw != NULL) {
+ 			inet_twsk_deschedule(tw, &dccp_death_row);
+ 			inet_twsk_put(tw);
+ 		}
+
+		ret = 0;
+		goto out;
+ 	}
+
+ 	head = &dccp_hashinfo.bhash[inet_bhashfn(snum,
+						 dccp_hashinfo.bhash_size)];
+ 	tb   = inet_csk(sk)->icsk_bind_hash;
+	spin_lock_bh(&head->lock);
+	if (sk_head(&tb->owners) == sk && sk->sk_bind_node.next == NULL) {
+		__inet_hash(&dccp_hashinfo, sk, 0);
+		spin_unlock_bh(&head->lock);
+		return 0;
+	} else {
+		spin_unlock(&head->lock);
+		/* No definite answer... Walk to established hash table */
+		ret = __dccp_v4_check_established(sk, snum, NULL);
+out:
+		local_bh_enable();
+		return ret;
+	}
+}
+
+static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr,
+			   int addr_len)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct dccp_sock *dp = dccp_sk(sk);
+	const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
+	struct rtable *rt;
+	u32 daddr, nexthop;
+	int tmp;
+	int err;
+
+	dp->dccps_role = DCCP_ROLE_CLIENT;
+
+	if (addr_len < sizeof(struct sockaddr_in))
+		return -EINVAL;
+
+	if (usin->sin_family != AF_INET)
+		return -EAFNOSUPPORT;
+
+	nexthop = daddr = usin->sin_addr.s_addr;
+	if (inet->opt != NULL && inet->opt->srr) {
+		if (daddr == 0)
+			return -EINVAL;
+		nexthop = inet->opt->faddr;
+	}
+
+	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
+			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
+			       IPPROTO_DCCP,
+			       inet->sport, usin->sin_port, sk);
+	if (tmp < 0)
+		return tmp;
+
+	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
+		ip_rt_put(rt);
+		return -ENETUNREACH;
+	}
+
+	if (inet->opt == NULL || !inet->opt->srr)
+		daddr = rt->rt_dst;
+
+	if (inet->saddr == 0)
+		inet->saddr = rt->rt_src;
+	inet->rcv_saddr = inet->saddr;
+
+	inet->dport = usin->sin_port;
+	inet->daddr = daddr;
+
+	dp->dccps_ext_header_len = 0;
+	if (inet->opt != NULL)
+		dp->dccps_ext_header_len = inet->opt->optlen;
+	/*
+	 * Socket identity is still unknown (sport may be zero).
+	 * However we set state to DCCP_REQUESTING and not releasing socket
+	 * lock select source port, enter ourselves into the hash tables and
+	 * complete initialization after this.
+	 */
+	dccp_set_state(sk, DCCP_REQUESTING);
+	err = dccp_v4_hash_connect(sk);
+	if (err != 0)
+		goto failure;
+
+	err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
+	if (err != 0)
+		goto failure;
+
+	/* OK, now commit destination to socket.  */
+	sk_setup_caps(sk, &rt->u.dst);
+
+	dp->dccps_gar =
+		dp->dccps_iss = secure_dccp_sequence_number(inet->saddr,
+							    inet->daddr,
+							    inet->sport,
+							    usin->sin_port);
+	dccp_update_gss(sk, dp->dccps_iss);
+
+	/*
+	 * SWL and AWL are initially adjusted so that they are not less than
+	 * the initial Sequence Numbers received and sent, respectively:
+	 *	SWL := max(GSR + 1 - floor(W/4), ISR),
+	 *	AWL := max(GSS - W' + 1, ISS).
+	 * These adjustments MUST be applied only at the beginning of the
+	 * connection.
+	 */
+	dccp_set_seqno(&dp->dccps_awl, max48(dp->dccps_awl, dp->dccps_iss));
+
+	inet->id = dp->dccps_iss ^ jiffies;
+
+	err = dccp_connect(sk);
+	rt = NULL;
+	if (err != 0)
+		goto failure;
+out:
+	return err;
+failure:
+	/*
+	 * This unhashes the socket and releases the local port, if necessary.
+	 */
+	dccp_set_state(sk, DCCP_CLOSED);
+	ip_rt_put(rt);
+	sk->sk_route_caps = 0;
+	inet->dport = 0;
+	goto out;
+}
+
+/*
+ * This routine does path mtu discovery as defined in RFC1191.
+ */
+static inline void dccp_do_pmtu_discovery(struct sock *sk,
+					  const struct iphdr *iph,
+					  u32 mtu)
+{
+	struct dst_entry *dst;
+	const struct inet_sock *inet = inet_sk(sk);
+	const struct dccp_sock *dp = dccp_sk(sk);
+
+	/* We are not interested in DCCP_LISTEN and request_socks (RESPONSEs
+	 * send out by Linux are always < 576bytes so they should go through
+	 * unfragmented).
+	 */
+	if (sk->sk_state == DCCP_LISTEN)
+		return;
+
+	/* We don't check in the destentry if pmtu discovery is forbidden
+	 * on this route. We just assume that no packet_to_big packets
+	 * are send back when pmtu discovery is not active.
+     	 * There is a small race when the user changes this flag in the
+	 * route, but I think that's acceptable.
+	 */
+	if ((dst = __sk_dst_check(sk, 0)) == NULL)
+		return;
+
+	dst->ops->update_pmtu(dst, mtu);
+
+	/* Something is about to be wrong... Remember soft error
+	 * for the case, if this connection will not able to recover.
+	 */
+	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
+		sk->sk_err_soft = EMSGSIZE;
+
+	mtu = dst_mtu(dst);
+
+	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
+	    dp->dccps_pmtu_cookie > mtu) {
+		dccp_sync_mss(sk, mtu);
+
+		/*
+		 * From: draft-ietf-dccp-spec-11.txt
+		 *
+		 *	DCCP-Sync packets are the best choice for upward
+		 *	probing, since DCCP-Sync probes do not risk application
+		 *	data loss.
+		 */
+		dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC);
+	} /* else let the usual retransmit timer handle it */
+}
+
+static void dccp_v4_ctl_send_ack(struct sk_buff *rxskb)
+{
+	int err;
+	struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh;
+	const int dccp_hdr_ack_len = sizeof(struct dccp_hdr) +
+				     sizeof(struct dccp_hdr_ext) +
+				     sizeof(struct dccp_hdr_ack_bits);
+	struct sk_buff *skb;
+
+	if (((struct rtable *)rxskb->dst)->rt_type != RTN_LOCAL)
+		return;
+
+	skb = alloc_skb(MAX_DCCP_HEADER + 15, GFP_ATOMIC);
+	if (skb == NULL)
+		return;
+
+	/* Reserve space for headers. */
+	skb_reserve(skb, MAX_DCCP_HEADER);
+
+	skb->dst = dst_clone(rxskb->dst);
+
+	skb->h.raw = skb_push(skb, dccp_hdr_ack_len);
+	dh = dccp_hdr(skb);
+	memset(dh, 0, dccp_hdr_ack_len);
+
+	/* Build DCCP header and checksum it. */
+	dh->dccph_type	   = DCCP_PKT_ACK;
+	dh->dccph_sport	   = rxdh->dccph_dport;
+	dh->dccph_dport	   = rxdh->dccph_sport;
+	dh->dccph_doff	   = dccp_hdr_ack_len / 4;
+	dh->dccph_x	   = 1;
+
+	dccp_hdr_set_seq(dh, DCCP_SKB_CB(rxskb)->dccpd_ack_seq);
+	dccp_hdr_set_ack(dccp_hdr_ack_bits(skb),
+			 DCCP_SKB_CB(rxskb)->dccpd_seq);
+
+	bh_lock_sock(dccp_ctl_socket->sk);
+	err = ip_build_and_send_pkt(skb, dccp_ctl_socket->sk,
+				    rxskb->nh.iph->daddr,
+				    rxskb->nh.iph->saddr, NULL);
+	bh_unlock_sock(dccp_ctl_socket->sk);
+
+	if (err == NET_XMIT_CN || err == 0) {
+		DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS);
+		DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS);
+	}
+}
+
+static void dccp_v4_reqsk_send_ack(struct sk_buff *skb,
+				   struct request_sock *req)
+{
+	dccp_v4_ctl_send_ack(skb);
+}
+
+static int dccp_v4_send_response(struct sock *sk, struct request_sock *req,
+				 struct dst_entry *dst)
+{
+	int err = -1;
+	struct sk_buff *skb;
+
+	/* First, grab a route. */
+	
+	if (dst == NULL && (dst = inet_csk_route_req(sk, req)) == NULL)
+		goto out;
+
+	skb = dccp_make_response(sk, dst, req);
+	if (skb != NULL) {
+		const struct inet_request_sock *ireq = inet_rsk(req);
+
+		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
+					    ireq->rmt_addr,
+					    ireq->opt);
+		if (err == NET_XMIT_CN)
+			err = 0;
+	}
+
+out:
+	dst_release(dst);
+	return err;
+}
+
+/*
+ * This routine is called by the ICMP module when it gets some sort of error
+ * condition. If err < 0 then the socket should be closed and the error
+ * returned to the user. If err > 0 it's just the icmp type << 8 | icmp code.
+ * After adjustment header points to the first 8 bytes of the tcp header. We
+ * need to find the appropriate port.
+ *
+ * The locking strategy used here is very "optimistic". When someone else
+ * accesses the socket the ICMP is just dropped and for some paths there is no
+ * check at all. A more general error queue to queue errors for later handling
+ * is probably better.
+ */
+void dccp_v4_err(struct sk_buff *skb, u32 info)
+{
+	const struct iphdr *iph = (struct iphdr *)skb->data;
+	const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data +
+							(iph->ihl << 2));
+	struct dccp_sock *dp;
+	struct inet_sock *inet;
+	const int type = skb->h.icmph->type;
+	const int code = skb->h.icmph->code;
+	struct sock *sk;
+	__u64 seq;
+	int err;
+
+	if (skb->len < (iph->ihl << 2) + 8) {
+		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
+		return;
+	}
+
+	sk = inet_lookup(&dccp_hashinfo, iph->daddr, dh->dccph_dport,
+			 iph->saddr, dh->dccph_sport, inet_iif(skb));
+	if (sk == NULL) {
+		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
+		return;
+	}
+
+	if (sk->sk_state == DCCP_TIME_WAIT) {
+		inet_twsk_put((struct inet_timewait_sock *)sk);
+		return;
+	}
+
+	bh_lock_sock(sk);
+	/* If too many ICMPs get dropped on busy
+	 * servers this needs to be solved differently.
+	 */
+	if (sock_owned_by_user(sk))
+		NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
+
+	if (sk->sk_state == DCCP_CLOSED)
+		goto out;
+
+	dp = dccp_sk(sk);
+	seq = dccp_hdr_seq(skb);
+	if (sk->sk_state != DCCP_LISTEN &&
+	    !between48(seq, dp->dccps_swl, dp->dccps_swh)) {
+		NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
+		goto out;
+	}
+
+	switch (type) {
+	case ICMP_SOURCE_QUENCH:
+		/* Just silently ignore these. */
+		goto out;
+	case ICMP_PARAMETERPROB:
+		err = EPROTO;
+		break;
+	case ICMP_DEST_UNREACH:
+		if (code > NR_ICMP_UNREACH)
+			goto out;
+
+		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
+			if (!sock_owned_by_user(sk))
+				dccp_do_pmtu_discovery(sk, iph, info);
+			goto out;
+		}
+
+		err = icmp_err_convert[code].errno;
+		break;
+	case ICMP_TIME_EXCEEDED:
+		err = EHOSTUNREACH;
+		break;
+	default:
+		goto out;
+	}
+
+	switch (sk->sk_state) {
+		struct request_sock *req , **prev;
+	case DCCP_LISTEN:
+		if (sock_owned_by_user(sk))
+			goto out;
+		req = inet_csk_search_req(sk, &prev, dh->dccph_dport,
+					  iph->daddr, iph->saddr);
+		if (!req)
+			goto out;
+
+		/*
+		 * ICMPs are not backlogged, hence we cannot get an established
+		 * socket here.
+		 */
+		BUG_TRAP(!req->sk);
+
+		if (seq != dccp_rsk(req)->dreq_iss) {
+			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
+			goto out;
+		}
+		/*
+		 * Still in RESPOND, just remove it silently.
+		 * There is no good way to pass the error to the newly
+		 * created socket, and POSIX does not want network
+		 * errors returned from accept().
+		 */
+		inet_csk_reqsk_queue_drop(sk, req, prev);
+		goto out;
+
+	case DCCP_REQUESTING:
+	case DCCP_RESPOND:
+		if (!sock_owned_by_user(sk)) {
+			DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
+			sk->sk_err = err;
+
+			sk->sk_error_report(sk);
+
+			dccp_done(sk);
+		} else
+			sk->sk_err_soft = err;
+		goto out;
+	}
+
+	/* If we've already connected we will keep trying
+	 * until we time out, or the user gives up.
+	 *
+	 * rfc1122 4.2.3.9 allows to consider as hard errors
+	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
+	 * but it is obsoleted by pmtu discovery).
+	 *
+	 * Note, that in modern internet, where routing is unreliable
+	 * and in each dark corner broken firewalls sit, sending random
+	 * errors ordered by their masters even this two messages finally lose
+	 * their original sense (even Linux sends invalid PORT_UNREACHs)
+	 *
+	 * Now we are in compliance with RFCs.
+	 *							--ANK (980905)
+	 */
+
+	inet = inet_sk(sk);
+	if (!sock_owned_by_user(sk) && inet->recverr) {
+		sk->sk_err = err;
+		sk->sk_error_report(sk);
+	} else /* Only an error on timeout */
+		sk->sk_err_soft = err;
+out:
+	bh_unlock_sock(sk);
+	sock_put(sk);
+}
+
+int dccp_v4_send_reset(struct sock *sk, enum dccp_reset_codes code)
+{
+	struct sk_buff *skb;
+	/*
+	 * FIXME: what if rebuild_header fails?
+	 * Should we be doing a rebuild_header here?
+	 */
+	int err = inet_sk_rebuild_header(sk);
+
+	if (err != 0)
+		return err;
+
+	skb = dccp_make_reset(sk, sk->sk_dst_cache, code);
+	if (skb != NULL) {
+		const struct dccp_sock *dp = dccp_sk(sk);
+		const struct inet_sock *inet = inet_sk(sk);
+
+		err = ip_build_and_send_pkt(skb, sk,
+					    inet->saddr, inet->daddr, NULL);
+		if (err == NET_XMIT_CN)
+			err = 0;
+
+		ccid_hc_rx_exit(dp->dccps_hc_rx_ccid, sk);
+		ccid_hc_tx_exit(dp->dccps_hc_tx_ccid, sk);
+	}
+
+	return err;
+}
+
+static inline u64 dccp_v4_init_sequence(const struct sock *sk,
+					const struct sk_buff *skb)
+{
+	return secure_dccp_sequence_number(skb->nh.iph->daddr,
+					   skb->nh.iph->saddr,
+					   dccp_hdr(skb)->dccph_dport,
+					   dccp_hdr(skb)->dccph_sport);
+}
+
+int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
+{
+	struct inet_request_sock *ireq;
+	struct dccp_sock dp;
+	struct request_sock *req;
+	struct dccp_request_sock *dreq;
+	const __u32 saddr = skb->nh.iph->saddr;
+	const __u32 daddr = skb->nh.iph->daddr;
+	struct dst_entry *dst = NULL;
+
+	/* Never answer to DCCP_PKT_REQUESTs send to broadcast or multicast */
+	if (((struct rtable *)skb->dst)->rt_flags &
+	    (RTCF_BROADCAST | RTCF_MULTICAST))
+		goto drop;
+
+	/*
+	 * TW buckets are converted to open requests without
+	 * limitations, they conserve resources and peer is
+	 * evidently real one.
+	 */
+	if (inet_csk_reqsk_queue_is_full(sk))
+		goto drop;
+
+	/*
+	 * Accept backlog is full. If we have already queued enough
+	 * of warm entries in syn queue, drop request. It is better than
+	 * clogging syn queue with openreqs with exponentially increasing
+	 * timeout.
+	 */
+	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
+		goto drop;
+
+	req = reqsk_alloc(sk->sk_prot->rsk_prot);
+	if (req == NULL)
+		goto drop;
+
+	/* FIXME: process options */
+
+	dccp_openreq_init(req, &dp, skb);
+
+	ireq = inet_rsk(req);
+	ireq->loc_addr = daddr;
+	ireq->rmt_addr = saddr;
+	/* FIXME: Merge Aristeu's option parsing code when ready */
+	req->rcv_wnd	= 100; /* Fake, option parsing will get the
+				  right value */
+	ireq->opt	= NULL;
+
+	/* 
+	 * Step 3: Process LISTEN state
+	 *
+	 * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
+	 *
+	 * In fact we defer setting S.GSR, S.SWL, S.SWH to
+	 * dccp_create_openreq_child.
+	 */
+	dreq = dccp_rsk(req);
+	dreq->dreq_isr = DCCP_SKB_CB(skb)->dccpd_seq;
+	dreq->dreq_iss = dccp_v4_init_sequence(sk, skb);
+	dreq->dreq_service = dccp_hdr_request(skb)->dccph_req_service;
+
+	if (dccp_v4_send_response(sk, req, dst))
+		goto drop_and_free;
+
+	inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT);
+	return 0;
+
+drop_and_free:
+	/*
+	 * FIXME: should be reqsk_free after implementing req->rsk_ops
+	 */
+	__reqsk_free(req);
+drop:
+	DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
+	return -1;
+}
+
+/*
+ * The three way handshake has completed - we got a valid ACK or DATAACK -
+ * now create the new socket.
+ *
+ * This is the equivalent of TCP's tcp_v4_syn_recv_sock
+ */
+struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb,
+				       struct request_sock *req,
+				       struct dst_entry *dst)
+{
+	struct inet_request_sock *ireq;
+	struct inet_sock *newinet;
+	struct dccp_sock *newdp;
+	struct sock *newsk;
+
+	if (sk_acceptq_is_full(sk))
+		goto exit_overflow;
+
+	if (dst == NULL && (dst = inet_csk_route_req(sk, req)) == NULL)
+		goto exit;
+
+	newsk = dccp_create_openreq_child(sk, req, skb);
+	if (newsk == NULL)
+		goto exit;
+
+	sk_setup_caps(newsk, dst);
+
+	newdp		   = dccp_sk(newsk);
+	newinet		   = inet_sk(newsk);
+	ireq		   = inet_rsk(req);
+	newinet->daddr	   = ireq->rmt_addr;
+	newinet->rcv_saddr = ireq->loc_addr;
+	newinet->saddr	   = ireq->loc_addr;
+	newinet->opt	   = ireq->opt;
+	ireq->opt	   = NULL;
+	newinet->mc_index  = inet_iif(skb);
+	newinet->mc_ttl	   = skb->nh.iph->ttl;
+	newinet->id	   = jiffies;
+
+	dccp_sync_mss(newsk, dst_mtu(dst));
+
+	__inet_hash(&dccp_hashinfo, newsk, 0);
+	__inet_inherit_port(&dccp_hashinfo, sk, newsk);
+
+	return newsk;
+
+exit_overflow:
+	NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
+exit:
+	NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
+	dst_release(dst);
+	return NULL;
+}
+
+static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
+{
+	const struct dccp_hdr *dh = dccp_hdr(skb);
+	const struct iphdr *iph = skb->nh.iph;
+	struct sock *nsk;
+	struct request_sock **prev;
+	/* Find possible connection requests. */
+	struct request_sock *req = inet_csk_search_req(sk, &prev,
+						       dh->dccph_sport,
+						       iph->saddr, iph->daddr);
+	if (req != NULL)
+		return dccp_check_req(sk, skb, req, prev);
+
+	nsk = __inet_lookup_established(&dccp_hashinfo,
+					iph->saddr, dh->dccph_sport,
+					iph->daddr, ntohs(dh->dccph_dport),
+					inet_iif(skb));
+	if (nsk != NULL) {
+		if (nsk->sk_state != DCCP_TIME_WAIT) {
+			bh_lock_sock(nsk);
+			return nsk;
+		}
+		inet_twsk_put((struct inet_timewait_sock *)nsk);
+		return NULL;
+	}
+
+	return sk;
+}
+
+int dccp_v4_checksum(const struct sk_buff *skb, const u32 saddr,
+		     const u32 daddr)
+{
+	const struct dccp_hdr* dh = dccp_hdr(skb);
+	int checksum_len;
+	u32 tmp;
+
+	if (dh->dccph_cscov == 0)
+		checksum_len = skb->len;
+	else {
+		checksum_len = (dh->dccph_cscov + dh->dccph_x) * sizeof(u32);
+		checksum_len = checksum_len < skb->len ? checksum_len :
+							 skb->len;
+	}
+
+	tmp = csum_partial((unsigned char *)dh, checksum_len, 0);
+	return csum_tcpudp_magic(saddr, daddr, checksum_len,
+				 IPPROTO_DCCP, tmp);
+}
+
+static int dccp_v4_verify_checksum(struct sk_buff *skb,
+				   const u32 saddr, const u32 daddr)
+{
+	struct dccp_hdr *dh = dccp_hdr(skb);
+	int checksum_len;
+	u32 tmp;
+
+	if (dh->dccph_cscov == 0)
+		checksum_len = skb->len;
+	else {
+		checksum_len = (dh->dccph_cscov + dh->dccph_x) * sizeof(u32);
+		checksum_len = checksum_len < skb->len ? checksum_len :
+							 skb->len;
+	}
+	tmp = csum_partial((unsigned char *)dh, checksum_len, 0);
+	return csum_tcpudp_magic(saddr, daddr, checksum_len,
+				 IPPROTO_DCCP, tmp) == 0 ? 0 : -1;
+}
+
+static struct dst_entry* dccp_v4_route_skb(struct sock *sk,
+					   struct sk_buff *skb)
+{
+	struct rtable *rt;
+	struct flowi fl = { .oif = ((struct rtable *)skb->dst)->rt_iif,
+			    .nl_u = { .ip4_u =
+				      { .daddr = skb->nh.iph->saddr,
+					.saddr = skb->nh.iph->daddr,
+					.tos = RT_CONN_FLAGS(sk) } },
+			    .proto = sk->sk_protocol,
+			    .uli_u = { .ports =
+				       { .sport = dccp_hdr(skb)->dccph_dport,
+					 .dport = dccp_hdr(skb)->dccph_sport }
+			   	     }
+			  };
+
+	if (ip_route_output_flow(&rt, &fl, sk, 0)) {
+		IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
+		return NULL;
+	}
+
+	return &rt->u.dst;
+}
+
+static void dccp_v4_ctl_send_reset(struct sk_buff *rxskb)
+{
+	int err;
+	struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh;
+	const int dccp_hdr_reset_len = sizeof(struct dccp_hdr) +
+				       sizeof(struct dccp_hdr_ext) +
+				       sizeof(struct dccp_hdr_reset);
+	struct sk_buff *skb;
+	struct dst_entry *dst;
+	u64 seqno;
+
+	/* Never send a reset in response to a reset. */
+	if (rxdh->dccph_type == DCCP_PKT_RESET)
+		return;
+
+	if (((struct rtable *)rxskb->dst)->rt_type != RTN_LOCAL)
+		return;
+
+	dst = dccp_v4_route_skb(dccp_ctl_socket->sk, rxskb);
+	if (dst == NULL)
+		return;
+
+	skb = alloc_skb(MAX_DCCP_HEADER + 15, GFP_ATOMIC);
+	if (skb == NULL)
+		goto out;
+
+	/* Reserve space for headers. */
+	skb_reserve(skb, MAX_DCCP_HEADER);
+	skb->dst = dst_clone(dst);
+
+	skb->h.raw = skb_push(skb, dccp_hdr_reset_len);
+	dh = dccp_hdr(skb);
+	memset(dh, 0, dccp_hdr_reset_len);
+
+	/* Build DCCP header and checksum it. */
+	dh->dccph_type	   = DCCP_PKT_RESET;
+	dh->dccph_sport	   = rxdh->dccph_dport;
+	dh->dccph_dport	   = rxdh->dccph_sport;
+	dh->dccph_doff	   = dccp_hdr_reset_len / 4;
+	dh->dccph_x	   = 1;
+	dccp_hdr_reset(skb)->dccph_reset_code =
+				DCCP_SKB_CB(rxskb)->dccpd_reset_code;
+
+	/* See "8.3.1. Abnormal Termination" in draft-ietf-dccp-spec-11 */
+	seqno = 0;
+	if (DCCP_SKB_CB(rxskb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
+		dccp_set_seqno(&seqno, DCCP_SKB_CB(rxskb)->dccpd_ack_seq + 1);
+
+	dccp_hdr_set_seq(dh, seqno);
+	dccp_hdr_set_ack(dccp_hdr_ack_bits(skb),
+			 DCCP_SKB_CB(rxskb)->dccpd_seq);
+
+	dh->dccph_checksum = dccp_v4_checksum(skb, rxskb->nh.iph->saddr,
+					      rxskb->nh.iph->daddr);
+
+	bh_lock_sock(dccp_ctl_socket->sk);
+	err = ip_build_and_send_pkt(skb, dccp_ctl_socket->sk,
+				    rxskb->nh.iph->daddr,
+				    rxskb->nh.iph->saddr, NULL);
+	bh_unlock_sock(dccp_ctl_socket->sk);
+
+	if (err == NET_XMIT_CN || err == 0) {
+		DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS);
+		DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS);
+	}
+out:
+	 dst_release(dst);
+}
+
+int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	struct dccp_hdr *dh = dccp_hdr(skb);
+
+	if (sk->sk_state == DCCP_OPEN) { /* Fast path */
+		if (dccp_rcv_established(sk, skb, dh, skb->len))
+			goto reset;
+		return 0;
+	}
+
+	/*
+	 *  Step 3: Process LISTEN state
+	 *     If S.state == LISTEN,
+	 *	  If P.type == Request or P contains a valid Init Cookie
+	 *	  	option,
+	 *	     * Must scan the packet's options to check for an Init
+	 *		Cookie.  Only the Init Cookie is processed here,
+	 *		however; other options are processed in Step 8.  This
+	 *		scan need only be performed if the endpoint uses Init
+	 *		Cookies *
+	 *	     * Generate a new socket and switch to that socket *
+	 *	     Set S := new socket for this port pair
+	 *	     S.state = RESPOND
+	 *	     Choose S.ISS (initial seqno) or set from Init Cookie
+	 *	     Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
+	 *	     Continue with S.state == RESPOND
+	 *	     * A Response packet will be generated in Step 11 *
+	 *	  Otherwise,
+	 *	     Generate Reset(No Connection) unless P.type == Reset
+	 *	     Drop packet and return
+	 *
+	 * NOTE: the check for the packet types is done in
+	 *	 dccp_rcv_state_process
+	 */
+	if (sk->sk_state == DCCP_LISTEN) {
+		struct sock *nsk = dccp_v4_hnd_req(sk, skb);
+
+		if (nsk == NULL)
+			goto discard;
+
+		if (nsk != sk) {
+			if (dccp_child_process(sk, nsk, skb))
+				goto reset;
+			return 0;
+		}
+	}
+
+	if (dccp_rcv_state_process(sk, skb, dh, skb->len))
+		goto reset;
+	return 0;
+
+reset:
+	DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
+	dccp_v4_ctl_send_reset(skb);
+discard:
+	kfree_skb(skb);
+	return 0;
+}
+
+static inline int dccp_invalid_packet(struct sk_buff *skb)
+{
+	const struct dccp_hdr *dh;
+
+	if (skb->pkt_type != PACKET_HOST)
+		return 1;
+
+	if (!pskb_may_pull(skb, sizeof(struct dccp_hdr))) {
+		LIMIT_NETDEBUG(KERN_WARNING "DCCP: pskb_may_pull failed\n");
+		return 1;
+	}
+
+	dh = dccp_hdr(skb);
+
+	/* If the packet type is not understood, drop packet and return */
+	if (dh->dccph_type >= DCCP_PKT_INVALID) {
+		LIMIT_NETDEBUG(KERN_WARNING "DCCP: invalid packet type\n");
+		return 1;
+	}
+
+	/*
+	 * If P.Data Offset is too small for packet type, or too large for
+	 * packet, drop packet and return
+	 */
+	if (dh->dccph_doff < dccp_hdr_len(skb) / sizeof(u32)) {
+		LIMIT_NETDEBUG(KERN_WARNING "DCCP: P.Data Offset(%u) "
+					    "too small 1\n",
+			       dh->dccph_doff);
+		return 1;
+	}
+
+	if (!pskb_may_pull(skb, dh->dccph_doff * sizeof(u32))) {
+		LIMIT_NETDEBUG(KERN_WARNING "DCCP: P.Data Offset(%u) "
+					    "too small 2\n",
+			       dh->dccph_doff);
+		return 1;
+	}
+
+	dh = dccp_hdr(skb);
+
+	/*
+	 * If P.type is not Data, Ack, or DataAck and P.X == 0 (the packet
+	 * has short sequence numbers), drop packet and return
+	 */
+	if (dh->dccph_x == 0 &&
+	    dh->dccph_type != DCCP_PKT_DATA &&
+	    dh->dccph_type != DCCP_PKT_ACK &&
+	    dh->dccph_type != DCCP_PKT_DATAACK) {
+		LIMIT_NETDEBUG(KERN_WARNING "DCCP: P.type (%s) not Data, Ack "
+					    "nor DataAck and P.X == 0\n",
+			       dccp_packet_name(dh->dccph_type));
+		return 1;
+	}
+
+	/* If the header checksum is incorrect, drop packet and return */
+	if (dccp_v4_verify_checksum(skb, skb->nh.iph->saddr,
+				    skb->nh.iph->daddr) < 0) {
+		LIMIT_NETDEBUG(KERN_WARNING "DCCP: header checksum is "
+					    "incorrect\n");
+		return 1;
+	}
+
+	return 0;
+}
+
+/* this is called when real data arrives */
+int dccp_v4_rcv(struct sk_buff *skb)
+{
+	const struct dccp_hdr *dh;
+	struct sock *sk;
+	int rc;
+
+	/* Step 1: Check header basics: */
+
+	if (dccp_invalid_packet(skb))
+		goto discard_it;
+
+	dh = dccp_hdr(skb);
+#if 0
+	/*
+	 * Use something like this to simulate some DATA/DATAACK loss to test
+	 * dccp_ackpkts_add, you'll get something like this on a session that
+	 * sends 10 DATA/DATAACK packets:
+	 *
+	 * ackpkts_print: 281473596467422 |0,0|3,0|0,0|3,0|0,0|3,0|0,0|3,0|0,1|
+	 *
+	 * 0, 0 means: DCCP_ACKPKTS_STATE_RECEIVED, RLE == just this packet
+	 * 0, 1 means: DCCP_ACKPKTS_STATE_RECEIVED, RLE == two adjacent packets
+	 * 						   with the same state
+	 * 3, 0 means: DCCP_ACKPKTS_STATE_NOT_RECEIVED, RLE == just this packet
+	 *
+	 * So...
+	 *
+	 * 281473596467422 was received
+	 * 281473596467421 was not received
+	 * 281473596467420 was received
+	 * 281473596467419 was not received
+	 * 281473596467418 was received
+	 * 281473596467417 was not received
+	 * 281473596467416 was received
+	 * 281473596467415 was not received
+	 * 281473596467414 was received
+	 * 281473596467413 was received (this one was the 3way handshake
+	 * 				 RESPONSE)
+	 *
+	 */
+	if (dh->dccph_type == DCCP_PKT_DATA ||
+	    dh->dccph_type == DCCP_PKT_DATAACK) {
+		static int discard = 0;
+
+		if (discard) {
+			discard = 0;
+			goto discard_it;
+		}
+		discard = 1;
+	}
+#endif
+	DCCP_SKB_CB(skb)->dccpd_seq  = dccp_hdr_seq(skb);
+	DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type;
+
+	dccp_pr_debug("%8.8s "
+		      "src=%u.%u.%u.%u@%-5d "
+		      "dst=%u.%u.%u.%u@%-5d seq=%llu",
+		      dccp_packet_name(dh->dccph_type),
+		      NIPQUAD(skb->nh.iph->saddr), ntohs(dh->dccph_sport),
+		      NIPQUAD(skb->nh.iph->daddr), ntohs(dh->dccph_dport),
+		      (unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq);
+
+	if (dccp_packet_without_ack(skb)) {
+		DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ;
+		dccp_pr_debug_cat("\n");
+	} else {
+		DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb);
+		dccp_pr_debug_cat(", ack=%llu\n",
+				  (unsigned long long)
+				  DCCP_SKB_CB(skb)->dccpd_ack_seq);
+	}
+
+	/* Step 2:
+	 * 	Look up flow ID in table and get corresponding socket */
+	sk = __inet_lookup(&dccp_hashinfo,
+			   skb->nh.iph->saddr, dh->dccph_sport,
+			   skb->nh.iph->daddr, ntohs(dh->dccph_dport),
+			   inet_iif(skb));
+
+	/* 
+	 * Step 2:
+	 * 	If no socket ...
+	 *		Generate Reset(No Connection) unless P.type == Reset
+	 *		Drop packet and return
+	 */
+	if (sk == NULL) {
+		dccp_pr_debug("failed to look up flow ID in table and "
+			      "get corresponding socket\n");
+		goto no_dccp_socket;
+	}
+
+	/* 
+	 * Step 2:
+	 * 	... or S.state == TIMEWAIT,
+	 *		Generate Reset(No Connection) unless P.type == Reset
+	 *		Drop packet and return
+	 */
+	       
+	if (sk->sk_state == DCCP_TIME_WAIT) {
+		dccp_pr_debug("sk->sk_state == DCCP_TIME_WAIT: "
+			      "do_time_wait\n");
+                goto do_time_wait;
+	}
+
+	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
+		dccp_pr_debug("xfrm4_policy_check failed\n");
+		goto discard_and_relse;
+	}
+
+        if (sk_filter(sk, skb, 0)) {
+		dccp_pr_debug("sk_filter failed\n");
+                goto discard_and_relse;
+	}
+
+	skb->dev = NULL;
+
+	bh_lock_sock(sk);
+	rc = 0;
+	if (!sock_owned_by_user(sk))
+		rc = dccp_v4_do_rcv(sk, skb);
+	else
+		sk_add_backlog(sk, skb);
+	bh_unlock_sock(sk);
+
+	sock_put(sk);
+	return rc;
+
+no_dccp_socket:
+	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+		goto discard_it;
+	/*
+	 * Step 2:
+	 *		Generate Reset(No Connection) unless P.type == Reset
+	 *		Drop packet and return
+	 */
+	if (dh->dccph_type != DCCP_PKT_RESET) {
+		DCCP_SKB_CB(skb)->dccpd_reset_code =
+					DCCP_RESET_CODE_NO_CONNECTION;
+		dccp_v4_ctl_send_reset(skb);
+	}
+
+discard_it:
+	/* Discard frame. */
+	kfree_skb(skb);
+	return 0;
+
+discard_and_relse:
+	sock_put(sk);
+	goto discard_it;
+
+do_time_wait:
+	inet_twsk_put((struct inet_timewait_sock *)sk);
+	goto no_dccp_socket;
+}
+
+static int dccp_v4_init_sock(struct sock *sk)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	static int dccp_ctl_socket_init = 1;
+
+	dccp_options_init(&dp->dccps_options);
+
+	if (dp->dccps_options.dccpo_send_ack_vector) {
+		dp->dccps_hc_rx_ackpkts =
+			dccp_ackpkts_alloc(DCCP_MAX_ACK_VECTOR_LEN,
+					   GFP_KERNEL);
+
+		if (dp->dccps_hc_rx_ackpkts == NULL)
+			return -ENOMEM;
+	}
+
+	/*
+	 * FIXME: We're hardcoding the CCID, and doing this at this point makes
+	 * the listening (master) sock get CCID control blocks, which is not
+	 * necessary, but for now, to not mess with the test userspace apps,
+	 * lets leave it here, later the real solution is to do this in a
+	 * setsockopt(CCIDs-I-want/accept). -acme
+	 */
+	if (likely(!dccp_ctl_socket_init)) {
+		dp->dccps_hc_rx_ccid = ccid_init(dp->dccps_options.dccpo_ccid,
+						 sk);
+		dp->dccps_hc_tx_ccid = ccid_init(dp->dccps_options.dccpo_ccid,
+						 sk);
+	    	if (dp->dccps_hc_rx_ccid == NULL ||
+		    dp->dccps_hc_tx_ccid == NULL) {
+			ccid_exit(dp->dccps_hc_rx_ccid, sk);
+			ccid_exit(dp->dccps_hc_tx_ccid, sk);
+			dccp_ackpkts_free(dp->dccps_hc_rx_ackpkts);
+			dp->dccps_hc_rx_ackpkts = NULL;
+			dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
+			return -ENOMEM;
+		}
+	} else
+		dccp_ctl_socket_init = 0;
+
+	dccp_init_xmit_timers(sk);
+	inet_csk(sk)->icsk_rto = DCCP_TIMEOUT_INIT;
+	sk->sk_state = DCCP_CLOSED;
+	sk->sk_write_space = dccp_write_space;
+	dp->dccps_mss_cache = 536;
+	dp->dccps_role = DCCP_ROLE_UNDEFINED;
+
+	return 0;
+}
+
+static int dccp_v4_destroy_sock(struct sock *sk)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+
+	/*
+	 * DCCP doesn't use sk_qrite_queue, just sk_send_head
+	 * for retransmissions
+	 */
+	if (sk->sk_send_head != NULL) {
+		kfree_skb(sk->sk_send_head);
+		sk->sk_send_head = NULL;
+	}
+
+	/* Clean up a referenced DCCP bind bucket. */
+	if (inet_csk(sk)->icsk_bind_hash != NULL)
+		inet_put_port(&dccp_hashinfo, sk);
+
+	ccid_hc_rx_exit(dp->dccps_hc_rx_ccid, sk);
+	ccid_hc_tx_exit(dp->dccps_hc_tx_ccid, sk);
+	dccp_ackpkts_free(dp->dccps_hc_rx_ackpkts);
+	dp->dccps_hc_rx_ackpkts = NULL;
+	ccid_exit(dp->dccps_hc_rx_ccid, sk);
+	ccid_exit(dp->dccps_hc_tx_ccid, sk);
+	dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
+
+	return 0;
+}
+
+static void dccp_v4_reqsk_destructor(struct request_sock *req)
+{
+	kfree(inet_rsk(req)->opt);
+}
+
+static struct request_sock_ops dccp_request_sock_ops = {
+	.family		= PF_INET,
+	.obj_size	= sizeof(struct dccp_request_sock),
+	.rtx_syn_ack	= dccp_v4_send_response,
+	.send_ack	= dccp_v4_reqsk_send_ack,
+	.destructor	= dccp_v4_reqsk_destructor,
+	.send_reset	= dccp_v4_ctl_send_reset,
+};
+
+struct proto dccp_v4_prot = {
+	.name			= "DCCP",
+	.owner			= THIS_MODULE,
+	.close			= dccp_close,
+	.connect		= dccp_v4_connect,
+	.disconnect		= dccp_disconnect,
+	.ioctl			= dccp_ioctl,
+	.init			= dccp_v4_init_sock,
+	.setsockopt		= dccp_setsockopt,
+	.getsockopt		= dccp_getsockopt,
+	.sendmsg		= dccp_sendmsg,
+	.recvmsg		= dccp_recvmsg,
+	.backlog_rcv		= dccp_v4_do_rcv,
+	.hash			= dccp_v4_hash,
+	.unhash			= dccp_v4_unhash,
+	.accept			= inet_csk_accept,
+	.get_port		= dccp_v4_get_port,
+	.shutdown		= dccp_shutdown,
+	.destroy		= dccp_v4_destroy_sock,
+	.orphan_count		= &dccp_orphan_count,
+	.max_header		= MAX_DCCP_HEADER,
+	.obj_size		= sizeof(struct dccp_sock),
+	.rsk_prot		= &dccp_request_sock_ops,
+	.twsk_obj_size		= sizeof(struct inet_timewait_sock),
+};
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
new file mode 100644
index 00000000000..ce5dff4ac22
--- /dev/null
+++ b/net/dccp/minisocks.c
@@ -0,0 +1,264 @@
+/*
+ *  net/dccp/minisocks.c
+ *
+ *  An implementation of the DCCP protocol
+ *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/dccp.h>
+#include <linux/skbuff.h>
+#include <linux/timer.h>
+
+#include <net/sock.h>
+#include <net/xfrm.h>
+#include <net/inet_timewait_sock.h>
+
+#include "ccid.h"
+#include "dccp.h"
+
+struct inet_timewait_death_row dccp_death_row = {
+	.sysctl_max_tw_buckets = NR_FILE * 2,
+	.period		= DCCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
+	.death_lock	= SPIN_LOCK_UNLOCKED,
+	.hashinfo	= &dccp_hashinfo,
+	.tw_timer	= TIMER_INITIALIZER(inet_twdr_hangman, 0,
+					    (unsigned long)&dccp_death_row),
+	.twkill_work	= __WORK_INITIALIZER(dccp_death_row.twkill_work,
+					     inet_twdr_twkill_work,
+					     &dccp_death_row),
+/* Short-time timewait calendar */
+
+	.twcal_hand	= -1,
+	.twcal_timer	= TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
+					    (unsigned long)&dccp_death_row),
+};
+
+void dccp_time_wait(struct sock *sk, int state, int timeo)
+{
+	struct inet_timewait_sock *tw = NULL;
+
+	if (dccp_death_row.tw_count < dccp_death_row.sysctl_max_tw_buckets)
+		tw = inet_twsk_alloc(sk, state);
+
+	if (tw != NULL) {
+		const struct inet_connection_sock *icsk = inet_csk(sk);
+		const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
+
+		/* Linkage updates. */
+		__inet_twsk_hashdance(tw, sk, &dccp_hashinfo);
+
+		/* Get the TIME_WAIT timeout firing. */
+		if (timeo < rto)
+			timeo = rto;
+
+		tw->tw_timeout = DCCP_TIMEWAIT_LEN;
+		if (state == DCCP_TIME_WAIT)
+			timeo = DCCP_TIMEWAIT_LEN;
+
+		inet_twsk_schedule(tw, &dccp_death_row, timeo,
+				   DCCP_TIMEWAIT_LEN);
+		inet_twsk_put(tw);
+	} else {
+		/* Sorry, if we're out of memory, just CLOSE this
+		 * socket up.  We've got bigger problems than
+		 * non-graceful socket closings.
+		 */
+		LIMIT_NETDEBUG(KERN_INFO "DCCP: time wait bucket "
+					 "table overflow\n");
+	}
+
+	dccp_done(sk);
+}
+
+struct sock *dccp_create_openreq_child(struct sock *sk,
+				       const struct request_sock *req,
+				       const struct sk_buff *skb)
+{
+	/*
+	 * Step 3: Process LISTEN state
+	 *
+	 * // Generate a new socket and switch to that socket
+	 * Set S := new socket for this port pair
+	 */
+	struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC);
+
+	if (newsk != NULL) {
+		const struct dccp_request_sock *dreq = dccp_rsk(req);
+		struct inet_connection_sock *newicsk = inet_csk(sk);
+		struct dccp_sock *newdp = dccp_sk(newsk);
+
+		newdp->dccps_hc_rx_ackpkts = NULL;
+		newdp->dccps_role = DCCP_ROLE_SERVER;
+		newicsk->icsk_rto = DCCP_TIMEOUT_INIT;
+
+		if (newdp->dccps_options.dccpo_send_ack_vector) {
+			newdp->dccps_hc_rx_ackpkts =
+				dccp_ackpkts_alloc(DCCP_MAX_ACK_VECTOR_LEN,
+						   GFP_ATOMIC);
+			/*
+			 * XXX: We're using the same CCIDs set on the parent,
+			 * i.e. sk_clone copied the master sock and left the
+			 * CCID pointers for this child, that is why we do the
+			 * __ccid_get calls.
+			 */
+			if (unlikely(newdp->dccps_hc_rx_ackpkts == NULL))
+				goto out_free;
+		}
+
+		if (unlikely(ccid_hc_rx_init(newdp->dccps_hc_rx_ccid,
+					     newsk) != 0 ||
+			     ccid_hc_tx_init(newdp->dccps_hc_tx_ccid,
+				     	     newsk) != 0)) {
+			dccp_ackpkts_free(newdp->dccps_hc_rx_ackpkts);
+			ccid_hc_rx_exit(newdp->dccps_hc_rx_ccid, newsk);
+			ccid_hc_tx_exit(newdp->dccps_hc_tx_ccid, newsk);
+out_free:
+			/* It is still raw copy of parent, so invalidate
+			 * destructor and make plain sk_free() */
+			newsk->sk_destruct = NULL;
+			sk_free(newsk);
+			return NULL;
+		}
+
+		__ccid_get(newdp->dccps_hc_rx_ccid);
+		__ccid_get(newdp->dccps_hc_tx_ccid);
+
+		/*
+		 * Step 3: Process LISTEN state
+		 *
+		 *	Choose S.ISS (initial seqno) or set from Init Cookie
+		 *	Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init
+		 *	Cookie
+		 */
+
+		/* See dccp_v4_conn_request */
+		newdp->dccps_options.dccpo_sequence_window = req->rcv_wnd;
+
+		newdp->dccps_gar = newdp->dccps_isr = dreq->dreq_isr;
+		dccp_update_gsr(newsk, dreq->dreq_isr);
+
+		newdp->dccps_iss = dreq->dreq_iss;
+		dccp_update_gss(newsk, dreq->dreq_iss);
+
+		/*
+		 * SWL and AWL are initially adjusted so that they are not less than
+		 * the initial Sequence Numbers received and sent, respectively:
+		 *	SWL := max(GSR + 1 - floor(W/4), ISR),
+		 *	AWL := max(GSS - W' + 1, ISS).
+		 * These adjustments MUST be applied only at the beginning of the
+		 * connection.
+		 */
+		dccp_set_seqno(&newdp->dccps_swl,
+			       max48(newdp->dccps_swl, newdp->dccps_isr));
+		dccp_set_seqno(&newdp->dccps_awl,
+			       max48(newdp->dccps_awl, newdp->dccps_iss));
+
+		dccp_init_xmit_timers(newsk);
+
+		DCCP_INC_STATS_BH(DCCP_MIB_PASSIVEOPENS);
+	}
+	return newsk;
+}
+
+/* 
+ * Process an incoming packet for RESPOND sockets represented
+ * as an request_sock.
+ */
+struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
+			    struct request_sock *req,
+			    struct request_sock **prev)
+{
+	struct sock *child = NULL;
+
+	/* Check for retransmitted REQUEST */
+	if (dccp_hdr(skb)->dccph_type == DCCP_PKT_REQUEST) {
+		if (after48(DCCP_SKB_CB(skb)->dccpd_seq,
+			    dccp_rsk(req)->dreq_isr)) {
+			struct dccp_request_sock *dreq = dccp_rsk(req);
+
+			dccp_pr_debug("Retransmitted REQUEST\n");
+			/* Send another RESPONSE packet */
+			dccp_set_seqno(&dreq->dreq_iss, dreq->dreq_iss + 1);
+			dccp_set_seqno(&dreq->dreq_isr,
+				       DCCP_SKB_CB(skb)->dccpd_seq);
+			req->rsk_ops->rtx_syn_ack(sk, req, NULL);
+		}
+		/* Network Duplicate, discard packet */
+		return NULL;
+	}
+
+	DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR;
+
+	if (dccp_hdr(skb)->dccph_type != DCCP_PKT_ACK &&
+	    dccp_hdr(skb)->dccph_type != DCCP_PKT_DATAACK)
+		goto drop;
+
+	/* Invalid ACK */
+	if (DCCP_SKB_CB(skb)->dccpd_ack_seq != dccp_rsk(req)->dreq_iss) {
+		dccp_pr_debug("Invalid ACK number: ack_seq=%llu, "
+			      "dreq_iss=%llu\n",
+			      (unsigned long long)
+			      DCCP_SKB_CB(skb)->dccpd_ack_seq,
+			      (unsigned long long)
+			      dccp_rsk(req)->dreq_iss);
+		goto drop;
+	}
+
+	child = dccp_v4_request_recv_sock(sk, skb, req, NULL);
+	if (child == NULL)
+		goto listen_overflow;
+
+	/* FIXME: deal with options */
+
+	inet_csk_reqsk_queue_unlink(sk, req, prev);
+	inet_csk_reqsk_queue_removed(sk, req);
+	inet_csk_reqsk_queue_add(sk, req, child);
+out:
+	return child;
+listen_overflow:
+	dccp_pr_debug("listen_overflow!\n");
+	DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY;
+drop:
+	if (dccp_hdr(skb)->dccph_type != DCCP_PKT_RESET)
+		req->rsk_ops->send_reset(skb);
+
+	inet_csk_reqsk_queue_drop(sk, req, prev);
+	goto out;
+}
+
+/*
+ *  Queue segment on the new socket if the new socket is active,
+ *  otherwise we just shortcircuit this and continue with
+ *  the new socket.
+ */
+int dccp_child_process(struct sock *parent, struct sock *child,
+		       struct sk_buff *skb)
+{
+	int ret = 0;
+	const int state = child->sk_state;
+
+	if (!sock_owned_by_user(child)) {
+		ret = dccp_rcv_state_process(child, skb, dccp_hdr(skb),
+					     skb->len);
+
+		/* Wakeup parent, send SIGIO */
+		if (state == DCCP_RESPOND && child->sk_state != state)
+			parent->sk_data_ready(parent, 0);
+	} else {
+		/* Alas, it is possible again, because we do lookup
+		 * in main socket hash table and lock on listening
+		 * socket does not protect us more.
+		 */
+		sk_add_backlog(child, skb);
+	}
+
+	bh_unlock_sock(child);
+	sock_put(child);
+	return ret;
+}
diff --git a/net/dccp/options.c b/net/dccp/options.c
new file mode 100644
index 00000000000..382c5894acb
--- /dev/null
+++ b/net/dccp/options.c
@@ -0,0 +1,855 @@
+/*
+ *  net/dccp/options.c
+ *
+ *  An implementation of the DCCP protocol
+ *  Copyright (c) 2005 Aristeu Sergio Rozanski Filho <aris@cathedrallabs.org>
+ *  Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
+ *  Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz>
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+#include <linux/config.h>
+#include <linux/dccp.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+
+#include "ccid.h"
+#include "dccp.h"
+
+static void dccp_ackpkts_check_rcv_ackvector(struct dccp_ackpkts *ap,
+					     struct sock *sk,
+					     const u64 ackno,
+					     const unsigned char len,
+					     const unsigned char *vector);
+
+/* stores the default values for new connection. may be changed with sysctl */
+static const struct dccp_options dccpo_default_values = {
+	.dccpo_sequence_window	  = DCCPF_INITIAL_SEQUENCE_WINDOW,
+	.dccpo_ccid		  = DCCPF_INITIAL_CCID,
+	.dccpo_send_ack_vector	  = DCCPF_INITIAL_SEND_ACK_VECTOR,
+	.dccpo_send_ndp_count	  = DCCPF_INITIAL_SEND_NDP_COUNT,
+};
+
+void dccp_options_init(struct dccp_options *dccpo)
+{
+	memcpy(dccpo, &dccpo_default_values, sizeof(*dccpo));
+}
+
+static u32 dccp_decode_value_var(const unsigned char *bf, const u8 len)
+{
+	u32 value = 0;
+
+	if (len > 3)
+		value += *bf++ << 24;
+	if (len > 2)
+		value += *bf++ << 16;
+	if (len > 1)
+		value += *bf++ << 8;
+	if (len > 0)
+		value += *bf;
+
+	return value;
+}
+
+int dccp_parse_options(struct sock *sk, struct sk_buff *skb)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+#ifdef CONFIG_IP_DCCP_DEBUG
+	const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ?
+					"CLIENT rx opt: " : "server rx opt: ";
+#endif
+	const struct dccp_hdr *dh = dccp_hdr(skb);
+	const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type;
+	unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb);
+	unsigned char *opt_ptr = options;
+	const unsigned char *opt_end = (unsigned char *)dh +
+					(dh->dccph_doff * 4);
+	struct dccp_options_received *opt_recv = &dp->dccps_options_received;
+	unsigned char opt, len;
+	unsigned char *value;
+
+	memset(opt_recv, 0, sizeof(*opt_recv));
+
+	while (opt_ptr != opt_end) {
+		opt   = *opt_ptr++;
+		len   = 0;
+		value = NULL;
+
+		/* Check if this isn't a single byte option */
+		if (opt > DCCPO_MAX_RESERVED) {
+			if (opt_ptr == opt_end)
+				goto out_invalid_option;
+
+			len = *opt_ptr++;
+			if (len < 3)
+				goto out_invalid_option;
+			/*
+			 * Remove the type and len fields, leaving
+			 * just the value size
+			 */
+			len	-= 2;
+			value	= opt_ptr;
+			opt_ptr += len;
+
+			if (opt_ptr > opt_end)
+				goto out_invalid_option;
+		}
+
+		switch (opt) {
+		case DCCPO_PADDING:
+			break;
+		case DCCPO_NDP_COUNT:
+			if (len > 3)
+				goto out_invalid_option;
+
+			opt_recv->dccpor_ndp = dccp_decode_value_var(value, len);
+			dccp_pr_debug("%sNDP count=%d\n", debug_prefix,
+				      opt_recv->dccpor_ndp);
+			break;
+		case DCCPO_ACK_VECTOR_0:
+			if (len > DCCP_MAX_ACK_VECTOR_LEN)
+				goto out_invalid_option;
+
+			if (pkt_type == DCCP_PKT_DATA)
+				continue;
+
+			opt_recv->dccpor_ack_vector_len = len;
+			opt_recv->dccpor_ack_vector_idx = value - options;
+
+			dccp_pr_debug("%sACK vector 0, len=%d, ack_ackno=%llu\n",
+				      debug_prefix, len,
+				      (unsigned long long)
+				      DCCP_SKB_CB(skb)->dccpd_ack_seq);
+			dccp_ackvector_print(DCCP_SKB_CB(skb)->dccpd_ack_seq,
+					     value, len);
+			dccp_ackpkts_check_rcv_ackvector(dp->dccps_hc_rx_ackpkts,
+							 sk,
+						 DCCP_SKB_CB(skb)->dccpd_ack_seq,
+							 len, value);
+			break;
+		case DCCPO_TIMESTAMP:
+			if (len != 4)
+				goto out_invalid_option;
+
+			opt_recv->dccpor_timestamp = ntohl(*(u32 *)value);
+
+			dp->dccps_timestamp_echo = opt_recv->dccpor_timestamp;
+			do_gettimeofday(&dp->dccps_timestamp_time);
+
+			dccp_pr_debug("%sTIMESTAMP=%u, ackno=%llu\n",
+				      debug_prefix, opt_recv->dccpor_timestamp,
+				      (unsigned long long)
+				      DCCP_SKB_CB(skb)->dccpd_ack_seq);
+			break;
+		case DCCPO_TIMESTAMP_ECHO:
+			if (len != 4 && len != 6 && len != 8)
+				goto out_invalid_option;
+
+			opt_recv->dccpor_timestamp_echo = ntohl(*(u32 *)value);
+
+			dccp_pr_debug("%sTIMESTAMP_ECHO=%u, len=%d, ackno=%llu, ",
+				      debug_prefix,
+				      opt_recv->dccpor_timestamp_echo,
+				      len + 2,
+				      (unsigned long long)
+				      DCCP_SKB_CB(skb)->dccpd_ack_seq);
+
+			if (len > 4) {
+				if (len == 6)
+					opt_recv->dccpor_elapsed_time =
+						 ntohs(*(u16 *)(value + 4));
+				else
+					opt_recv->dccpor_elapsed_time =
+						 ntohl(*(u32 *)(value + 4));
+
+				dccp_pr_debug("%sTIMESTAMP_ECHO ELAPSED_TIME=%d\n",
+				      debug_prefix,
+				      opt_recv->dccpor_elapsed_time);
+			}
+			break;
+		case DCCPO_ELAPSED_TIME:
+			if (len != 2 && len != 4)
+				goto out_invalid_option;
+
+			if (pkt_type == DCCP_PKT_DATA)
+				continue;
+
+			if (len == 2)
+				opt_recv->dccpor_elapsed_time =
+							ntohs(*(u16 *)value);
+			else
+				opt_recv->dccpor_elapsed_time =
+							ntohl(*(u32 *)value);
+
+			dccp_pr_debug("%sELAPSED_TIME=%d\n", debug_prefix,
+				      opt_recv->dccpor_elapsed_time);
+			break;
+			/*
+			 * From draft-ietf-dccp-spec-11.txt:
+			 *
+			 *	Option numbers 128 through 191 are for
+			 *	options sent from the HC-Sender to the
+			 *	HC-Receiver; option numbers 192 through 255
+			 *	are for options sent from the HC-Receiver to
+			 *	the HC-Sender.
+			 */
+		case 128 ... 191: {
+			const u16 idx = value - options;
+
+			if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk,
+						     opt, len, idx,
+						     value) != 0)
+				goto out_invalid_option;
+		}
+			break;
+		case 192 ... 255: {
+			const u16 idx = value - options;
+
+			if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk,
+						     opt, len, idx,
+						     value) != 0)
+				goto out_invalid_option;
+		}
+			break;
+		default:
+			pr_info("DCCP(%p): option %d(len=%d) not "
+				"implemented, ignoring\n",
+				sk, opt, len);
+			break;
+	        }
+	}
+
+	return 0;
+
+out_invalid_option:
+	DCCP_INC_STATS_BH(DCCP_MIB_INVALIDOPT);
+	DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_OPTION_ERROR;
+	pr_info("DCCP(%p): invalid option %d, len=%d\n", sk, opt, len);
+	return -1;
+}
+
+static void dccp_encode_value_var(const u32 value, unsigned char *to,
+				  const unsigned int len)
+{
+	if (len > 3)
+		*to++ = (value & 0xFF000000) >> 24;
+	if (len > 2)
+		*to++ = (value & 0xFF0000) >> 16;
+	if (len > 1)
+		*to++ = (value & 0xFF00) >> 8;
+	if (len > 0)
+		*to++ = (value & 0xFF);
+}
+
+static inline int dccp_ndp_len(const int ndp)
+{
+	return likely(ndp <= 0xFF) ? 1 : ndp <= 0xFFFF ? 2 : 3;
+}
+
+void dccp_insert_option(struct sock *sk, struct sk_buff *skb,
+			const unsigned char option,
+			const void *value, const unsigned char len)
+{
+	unsigned char *to;
+
+	if (DCCP_SKB_CB(skb)->dccpd_opt_len + len + 2 > DCCP_MAX_OPT_LEN) {
+		LIMIT_NETDEBUG(KERN_INFO "DCCP: packet too small to insert "
+			       "%d option!\n", option);
+		return;
+	}
+
+	DCCP_SKB_CB(skb)->dccpd_opt_len += len + 2;
+
+	to    = skb_push(skb, len + 2);
+	*to++ = option;
+	*to++ = len + 2;
+
+	memcpy(to, value, len);
+}
+
+EXPORT_SYMBOL_GPL(dccp_insert_option);
+
+static void dccp_insert_option_ndp(struct sock *sk, struct sk_buff *skb)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	int ndp = dp->dccps_ndp_count;
+
+	if (dccp_non_data_packet(skb))
+		++dp->dccps_ndp_count;
+	else
+		dp->dccps_ndp_count = 0;
+
+	if (ndp > 0) {
+		unsigned char *ptr;
+		const int ndp_len = dccp_ndp_len(ndp);
+		const int len = ndp_len + 2;
+
+		if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN)
+			return;
+
+		DCCP_SKB_CB(skb)->dccpd_opt_len += len;
+
+		ptr = skb_push(skb, len);
+		*ptr++ = DCCPO_NDP_COUNT;
+		*ptr++ = len;
+		dccp_encode_value_var(ndp, ptr, ndp_len);
+	}
+}
+
+static inline int dccp_elapsed_time_len(const u32 elapsed_time)
+{
+	return elapsed_time == 0 ? 0 : elapsed_time <= 0xFFFF ? 2 : 4;
+}
+
+void dccp_insert_option_elapsed_time(struct sock *sk,
+				     struct sk_buff *skb,
+				     u32 elapsed_time)
+{
+#ifdef CONFIG_IP_DCCP_DEBUG
+	struct dccp_sock *dp = dccp_sk(sk);
+	const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ?
+					"CLIENT TX opt: " : "server TX opt: ";
+#endif
+	const int elapsed_time_len = dccp_elapsed_time_len(elapsed_time);
+	const int len = 2 + elapsed_time_len;
+	unsigned char *to;
+
+	if (elapsed_time_len == 0)
+		return;
+
+	if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) {
+		LIMIT_NETDEBUG(KERN_INFO "DCCP: packet too small to "
+					 "insert elapsed time!\n");
+		return;
+	}
+
+	DCCP_SKB_CB(skb)->dccpd_opt_len += len;
+
+	to    = skb_push(skb, len);
+	*to++ = DCCPO_ELAPSED_TIME;
+	*to++ = len;
+
+	if (elapsed_time_len == 2) {
+		const u16 var16 = htons((u16)elapsed_time);
+		memcpy(to, &var16, 2);
+	} else {
+		const u32 var32 = htonl(elapsed_time);
+		memcpy(to, &var32, 4);
+	}
+
+	dccp_pr_debug("%sELAPSED_TIME=%u, len=%d, seqno=%llu\n",
+		      debug_prefix, elapsed_time,
+		      len,
+		      (unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq);
+}
+
+EXPORT_SYMBOL_GPL(dccp_insert_option_elapsed_time);
+
+static void dccp_insert_option_ack_vector(struct sock *sk, struct sk_buff *skb)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+#ifdef CONFIG_IP_DCCP_DEBUG
+	const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ?
+					"CLIENT TX opt: " : "server TX opt: ";
+#endif
+	struct dccp_ackpkts *ap = dp->dccps_hc_rx_ackpkts;
+	int len = ap->dccpap_buf_vector_len + 2;
+	const u32 elapsed_time = timeval_now_delta(&ap->dccpap_time) / 10;
+	unsigned char *to, *from;
+
+	if (elapsed_time != 0)
+		dccp_insert_option_elapsed_time(sk, skb, elapsed_time);
+
+	if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) {
+		LIMIT_NETDEBUG(KERN_INFO "DCCP: packet too small to "
+					 "insert ACK Vector!\n");
+		return;
+	}
+
+	/*
+	 * XXX: now we have just one ack vector sent record, so
+	 * we have to wait for it to be cleared.
+	 *
+	 * Of course this is not acceptable, but this is just for
+	 * basic testing now.
+	 */
+	if (ap->dccpap_ack_seqno != DCCP_MAX_SEQNO + 1)
+		return;
+
+	DCCP_SKB_CB(skb)->dccpd_opt_len += len;
+
+	to    = skb_push(skb, len);
+	*to++ = DCCPO_ACK_VECTOR_0;
+	*to++ = len;
+
+	len  = ap->dccpap_buf_vector_len;
+	from = ap->dccpap_buf + ap->dccpap_buf_head;
+
+	/* Check if buf_head wraps */
+	if (ap->dccpap_buf_head + len > ap->dccpap_buf_len) {
+		const unsigned int tailsize = (ap->dccpap_buf_len -
+					       ap->dccpap_buf_head);
+
+		memcpy(to, from, tailsize);
+		to   += tailsize;
+		len  -= tailsize;
+		from = ap->dccpap_buf;
+	}
+
+	memcpy(to, from, len);
+	/*
+	 *	From draft-ietf-dccp-spec-11.txt:
+	 *
+	 *	For each acknowledgement it sends, the HC-Receiver will add an
+	 *	acknowledgement record.  ack_seqno will equal the HC-Receiver
+	 *	sequence number it used for the ack packet; ack_ptr will equal
+	 *	buf_head; ack_ackno will equal buf_ackno; and ack_nonce will
+	 *	equal buf_nonce.
+	 *
+	 * This implemention uses just one ack record for now.
+	 */
+	ap->dccpap_ack_seqno	  = DCCP_SKB_CB(skb)->dccpd_seq;
+	ap->dccpap_ack_ptr	  = ap->dccpap_buf_head;
+	ap->dccpap_ack_ackno	  = ap->dccpap_buf_ackno;
+	ap->dccpap_ack_nonce	  = ap->dccpap_buf_nonce;
+	ap->dccpap_ack_vector_len = ap->dccpap_buf_vector_len;
+
+	dccp_pr_debug("%sACK Vector 0, len=%d, ack_seqno=%llu, "
+		      "ack_ackno=%llu\n",
+		      debug_prefix, ap->dccpap_ack_vector_len,
+		      (unsigned long long) ap->dccpap_ack_seqno,
+		      (unsigned long long) ap->dccpap_ack_ackno);
+}
+
+void dccp_insert_option_timestamp(struct sock *sk, struct sk_buff *skb)
+{
+	struct timeval tv;
+	u32 now;
+	
+	do_gettimeofday(&tv);
+	now = (tv.tv_sec * USEC_PER_SEC + tv.tv_usec) / 10;
+	/* yes this will overflow but that is the point as we want a
+	 * 10 usec 32 bit timer which mean it wraps every 11.9 hours */
+
+	now = htonl(now);
+	dccp_insert_option(sk, skb, DCCPO_TIMESTAMP, &now, sizeof(now));
+}
+
+EXPORT_SYMBOL_GPL(dccp_insert_option_timestamp);
+
+static void dccp_insert_option_timestamp_echo(struct sock *sk,
+					      struct sk_buff *skb)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+#ifdef CONFIG_IP_DCCP_DEBUG
+	const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ?
+					"CLIENT TX opt: " : "server TX opt: ";
+#endif
+	u32 tstamp_echo;
+	const u32 elapsed_time =
+			timeval_now_delta(&dp->dccps_timestamp_time) / 10;
+	const int elapsed_time_len = dccp_elapsed_time_len(elapsed_time);
+	const int len = 6 + elapsed_time_len;
+	unsigned char *to;
+
+	if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) {
+		LIMIT_NETDEBUG(KERN_INFO "DCCP: packet too small to insert "
+					 "timestamp echo!\n");
+		return;
+	}
+
+	DCCP_SKB_CB(skb)->dccpd_opt_len += len;
+
+	to    = skb_push(skb, len);
+	*to++ = DCCPO_TIMESTAMP_ECHO;
+	*to++ = len;
+
+	tstamp_echo = htonl(dp->dccps_timestamp_echo);
+	memcpy(to, &tstamp_echo, 4);
+	to += 4;
+	
+	if (elapsed_time_len == 2) {
+		const u16 var16 = htons((u16)elapsed_time);
+		memcpy(to, &var16, 2);
+	} else if (elapsed_time_len == 4) {
+		const u32 var32 = htonl(elapsed_time);
+		memcpy(to, &var32, 4);
+	}
+
+	dccp_pr_debug("%sTIMESTAMP_ECHO=%u, len=%d, seqno=%llu\n",
+		      debug_prefix, dp->dccps_timestamp_echo,
+		      len,
+		      (unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq);
+
+	dp->dccps_timestamp_echo = 0;
+	dp->dccps_timestamp_time.tv_sec = 0;
+	dp->dccps_timestamp_time.tv_usec = 0;
+}
+
+void dccp_insert_options(struct sock *sk, struct sk_buff *skb)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+
+	DCCP_SKB_CB(skb)->dccpd_opt_len = 0;
+
+	if (dp->dccps_options.dccpo_send_ndp_count)
+		dccp_insert_option_ndp(sk, skb);
+
+	if (!dccp_packet_without_ack(skb)) {
+		if (dp->dccps_options.dccpo_send_ack_vector &&
+		    (dp->dccps_hc_rx_ackpkts->dccpap_buf_ackno !=
+		     DCCP_MAX_SEQNO + 1))
+			dccp_insert_option_ack_vector(sk, skb);
+
+		if (dp->dccps_timestamp_echo != 0)
+			dccp_insert_option_timestamp_echo(sk, skb);
+	}
+
+	ccid_hc_rx_insert_options(dp->dccps_hc_rx_ccid, sk, skb);
+	ccid_hc_tx_insert_options(dp->dccps_hc_tx_ccid, sk, skb);
+
+	/* XXX: insert other options when appropriate */
+
+	if (DCCP_SKB_CB(skb)->dccpd_opt_len != 0) {
+		/* The length of all options has to be a multiple of 4 */
+		int padding = DCCP_SKB_CB(skb)->dccpd_opt_len % 4;
+
+		if (padding != 0) {
+			padding = 4 - padding;
+			memset(skb_push(skb, padding), 0, padding);
+			DCCP_SKB_CB(skb)->dccpd_opt_len += padding;
+		}
+	}
+}
+
+struct dccp_ackpkts *dccp_ackpkts_alloc(const unsigned int len,
+				        const unsigned int __nocast priority)
+{
+	struct dccp_ackpkts *ap = kmalloc(sizeof(*ap) + len, priority);
+
+	if (ap != NULL) {
+#ifdef CONFIG_IP_DCCP_DEBUG
+		memset(ap->dccpap_buf, 0xFF, len);
+#endif
+		ap->dccpap_buf_len   = len;
+		ap->dccpap_buf_head  =
+			ap->dccpap_buf_tail =
+				ap->dccpap_buf_len - 1;
+		ap->dccpap_buf_ackno =
+			ap->dccpap_ack_ackno =
+				ap->dccpap_ack_seqno = DCCP_MAX_SEQNO + 1;
+		ap->dccpap_buf_nonce = ap->dccpap_buf_nonce = 0;
+		ap->dccpap_ack_ptr   = 0;
+		ap->dccpap_time.tv_sec = 0;
+		ap->dccpap_time.tv_usec = 0;
+		ap->dccpap_buf_vector_len = ap->dccpap_ack_vector_len = 0;
+	}
+
+	return ap;
+}
+
+void dccp_ackpkts_free(struct dccp_ackpkts *ap)
+{
+	if (ap != NULL) {
+#ifdef CONFIG_IP_DCCP_DEBUG
+		memset(ap, 0xFF, sizeof(*ap) + ap->dccpap_buf_len);
+#endif
+		kfree(ap);
+	}
+}
+
+static inline u8 dccp_ackpkts_state(const struct dccp_ackpkts *ap,
+				    const unsigned int index)
+{
+	return ap->dccpap_buf[index] & DCCP_ACKPKTS_STATE_MASK;
+}
+
+static inline u8 dccp_ackpkts_len(const struct dccp_ackpkts *ap,
+				  const unsigned int index)
+{
+	return ap->dccpap_buf[index] & DCCP_ACKPKTS_LEN_MASK;
+}
+
+/*
+ * If several packets are missing, the HC-Receiver may prefer to enter multiple
+ * bytes with run length 0, rather than a single byte with a larger run length;
+ * this simplifies table updates if one of the missing packets arrives.
+ */
+static inline int dccp_ackpkts_set_buf_head_state(struct dccp_ackpkts *ap,
+						  const unsigned int packets,
+						  const unsigned char state)
+{
+	unsigned int gap;
+	signed long new_head;
+
+	if (ap->dccpap_buf_vector_len + packets > ap->dccpap_buf_len)
+		return -ENOBUFS;
+
+	gap	 = packets - 1;
+	new_head = ap->dccpap_buf_head - packets;
+
+	if (new_head < 0) {
+		if (gap > 0) {
+			memset(ap->dccpap_buf, DCCP_ACKPKTS_STATE_NOT_RECEIVED,
+			       gap + new_head + 1);
+			gap = -new_head;
+		}
+		new_head += ap->dccpap_buf_len;
+	} 
+
+	ap->dccpap_buf_head = new_head;
+
+	if (gap > 0)
+		memset(ap->dccpap_buf + ap->dccpap_buf_head + 1,
+		       DCCP_ACKPKTS_STATE_NOT_RECEIVED, gap);
+
+	ap->dccpap_buf[ap->dccpap_buf_head] = state;
+	ap->dccpap_buf_vector_len += packets;
+	return 0;
+}
+
+/*
+ * Implements the draft-ietf-dccp-spec-11.txt Appendix A
+ */
+int dccp_ackpkts_add(struct dccp_ackpkts *ap, u64 ackno, u8 state)
+{
+	/*
+	 * Check at the right places if the buffer is full, if it is, tell the
+	 * caller to start dropping packets till the HC-Sender acks our ACK
+	 * vectors, when we will free up space in dccpap_buf.
+	 *
+	 * We may well decide to do buffer compression, etc, but for now lets
+	 * just drop.
+	 *
+	 * From Appendix A:
+	 *
+	 *	Of course, the circular buffer may overflow, either when the
+	 *	HC-Sender is sending data at a very high rate, when the
+	 *	HC-Receiver's acknowledgements are not reaching the HC-Sender,
+	 *	or when the HC-Sender is forgetting to acknowledge those acks
+	 *	(so the HC-Receiver is unable to clean up old state). In this
+	 *	case, the HC-Receiver should either compress the buffer (by
+	 *	increasing run lengths when possible), transfer its state to
+	 *	a larger buffer, or, as a last resort, drop all received
+	 *	packets, without processing them whatsoever, until its buffer
+	 *	shrinks again.
+	 */
+
+	/* See if this is the first ackno being inserted */
+	if (ap->dccpap_buf_vector_len == 0) {
+		ap->dccpap_buf[ap->dccpap_buf_head] = state;
+		ap->dccpap_buf_vector_len = 1;
+	} else if (after48(ackno, ap->dccpap_buf_ackno)) {
+		const u64 delta = dccp_delta_seqno(ap->dccpap_buf_ackno,
+						   ackno);
+
+		/*
+		 * Look if the state of this packet is the same as the
+		 * previous ackno and if so if we can bump the head len.
+		 */
+		if (delta == 1 &&
+		    dccp_ackpkts_state(ap, ap->dccpap_buf_head) == state &&
+		    (dccp_ackpkts_len(ap, ap->dccpap_buf_head) <
+		     DCCP_ACKPKTS_LEN_MASK))
+			ap->dccpap_buf[ap->dccpap_buf_head]++;
+		else if (dccp_ackpkts_set_buf_head_state(ap, delta, state))
+			return -ENOBUFS;
+	} else {
+		/*
+		 * A.1.2.  Old Packets
+		 *
+		 *	When a packet with Sequence Number S arrives, and
+		 *	S <= buf_ackno, the HC-Receiver will scan the table
+		 *	for the byte corresponding to S. (Indexing structures
+		 *	could reduce the complexity of this scan.)
+		 */
+		u64 delta = dccp_delta_seqno(ackno, ap->dccpap_buf_ackno);
+		unsigned int index = ap->dccpap_buf_head;
+
+		while (1) {
+			const u8 len = dccp_ackpkts_len(ap, index);
+			const u8 state = dccp_ackpkts_state(ap, index);
+			/*
+			 * valid packets not yet in dccpap_buf have a reserved
+			 * entry, with a len equal to 0.
+			 */
+			if (state == DCCP_ACKPKTS_STATE_NOT_RECEIVED &&
+			    len == 0 && delta == 0) { /* Found our
+							 reserved seat! */
+				dccp_pr_debug("Found %llu reserved seat!\n",
+					      (unsigned long long) ackno);
+				ap->dccpap_buf[index] = state;
+				goto out;
+			}
+			/* len == 0 means one packet */
+			if (delta < len + 1)
+				goto out_duplicate;
+
+			delta -= len + 1;
+			if (++index == ap->dccpap_buf_len)
+				index = 0;
+		}
+	}
+
+	ap->dccpap_buf_ackno = ackno;
+	do_gettimeofday(&ap->dccpap_time);
+out:
+	dccp_pr_debug("");
+	dccp_ackpkts_print(ap);
+	return 0;
+
+out_duplicate:
+	/* Duplicate packet */
+	dccp_pr_debug("Received a dup or already considered lost "
+		      "packet: %llu\n", (unsigned long long) ackno);
+	return -EILSEQ;
+}
+
+#ifdef CONFIG_IP_DCCP_DEBUG
+void dccp_ackvector_print(const u64 ackno, const unsigned char *vector,
+			  int len)
+{
+	if (!dccp_debug)
+		return;
+
+	printk("ACK vector len=%d, ackno=%llu |", len,
+	       (unsigned long long) ackno);
+
+	while (len--) {
+		const u8 state = (*vector & DCCP_ACKPKTS_STATE_MASK) >> 6;
+		const u8 rl = (*vector & DCCP_ACKPKTS_LEN_MASK);
+
+		printk("%d,%d|", state, rl);
+		++vector;
+	}
+
+	printk("\n");
+}
+
+void dccp_ackpkts_print(const struct dccp_ackpkts *ap)
+{
+	dccp_ackvector_print(ap->dccpap_buf_ackno,
+			     ap->dccpap_buf + ap->dccpap_buf_head,
+			     ap->dccpap_buf_vector_len);
+}
+#endif
+
+static void dccp_ackpkts_trow_away_ack_record(struct dccp_ackpkts *ap)
+{
+	/*
+	 * As we're keeping track of the ack vector size
+	 * (dccpap_buf_vector_len) and the sent ack vector size
+	 * (dccpap_ack_vector_len) we don't need dccpap_buf_tail at all, but
+	 * keep this code here as in the future we'll implement a vector of
+	 * ack records, as suggested in draft-ietf-dccp-spec-11.txt
+	 * Appendix A. -acme
+	 */
+#if 0
+	ap->dccpap_buf_tail = ap->dccpap_ack_ptr + 1;
+	if (ap->dccpap_buf_tail >= ap->dccpap_buf_len)
+		ap->dccpap_buf_tail -= ap->dccpap_buf_len;
+#endif
+	ap->dccpap_buf_vector_len -= ap->dccpap_ack_vector_len;
+}
+
+void dccp_ackpkts_check_rcv_ackno(struct dccp_ackpkts *ap, struct sock *sk,
+				 u64 ackno)
+{
+	/* Check if we actually sent an ACK vector */
+	if (ap->dccpap_ack_seqno == DCCP_MAX_SEQNO + 1)
+		return;
+
+	if (ackno == ap->dccpap_ack_seqno) {
+#ifdef CONFIG_IP_DCCP_DEBUG
+		struct dccp_sock *dp = dccp_sk(sk);
+		const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ?
+					"CLIENT rx ack: " : "server rx ack: ";
+#endif
+		dccp_pr_debug("%sACK packet 0, len=%d, ack_seqno=%llu, "
+			      "ack_ackno=%llu, ACKED!\n",
+			      debug_prefix, 1,
+			      (unsigned long long) ap->dccpap_ack_seqno,
+			      (unsigned long long) ap->dccpap_ack_ackno);
+		dccp_ackpkts_trow_away_ack_record(ap);
+		ap->dccpap_ack_seqno = DCCP_MAX_SEQNO + 1;
+	}
+}
+
+static void dccp_ackpkts_check_rcv_ackvector(struct dccp_ackpkts *ap,
+					     struct sock *sk, u64 ackno,
+					     const unsigned char len,
+					     const unsigned char *vector)
+{
+	unsigned char i;
+
+	/* Check if we actually sent an ACK vector */
+	if (ap->dccpap_ack_seqno == DCCP_MAX_SEQNO + 1)
+		return;
+	/*
+	 * We're in the receiver half connection, so if the received an ACK
+	 * vector ackno (e.g. 50) before dccpap_ack_seqno (e.g. 52), we're
+	 * not interested.
+	 *
+	 * Extra explanation with example:
+	 * 
+	 * if we received an ACK vector with ackno 50, it can only be acking
+	 * 50, 49, 48, etc, not 52 (the seqno for the ACK vector we sent).
+	 */
+	/* dccp_pr_debug("is %llu < %llu? ", ackno, ap->dccpap_ack_seqno); */
+	if (before48(ackno, ap->dccpap_ack_seqno)) {
+		/* dccp_pr_debug_cat("yes\n"); */
+		return;
+	}
+	/* dccp_pr_debug_cat("no\n"); */
+
+	i = len;
+	while (i--) {
+		const u8 rl = (*vector & DCCP_ACKPKTS_LEN_MASK);
+		u64 ackno_end_rl;
+
+		dccp_set_seqno(&ackno_end_rl, ackno - rl);
+
+		/*
+		 * dccp_pr_debug("is %llu <= %llu <= %llu? ", ackno_end_rl,
+		 * ap->dccpap_ack_seqno, ackno);
+		 */
+		if (between48(ap->dccpap_ack_seqno, ackno_end_rl, ackno)) {
+			const u8 state = (*vector &
+					  DCCP_ACKPKTS_STATE_MASK) >> 6;
+			/* dccp_pr_debug_cat("yes\n"); */
+
+			if (state != DCCP_ACKPKTS_STATE_NOT_RECEIVED) {
+#ifdef CONFIG_IP_DCCP_DEBUG
+				struct dccp_sock *dp = dccp_sk(sk);
+				const char *debug_prefix =
+					dp->dccps_role == DCCP_ROLE_CLIENT ?
+					"CLIENT rx ack: " : "server rx ack: ";
+#endif
+				dccp_pr_debug("%sACK vector 0, len=%d, "
+					      "ack_seqno=%llu, ack_ackno=%llu, "
+					      "ACKED!\n",
+					      debug_prefix, len,
+					      (unsigned long long)
+					      ap->dccpap_ack_seqno,
+					      (unsigned long long)
+					      ap->dccpap_ack_ackno);
+				dccp_ackpkts_trow_away_ack_record(ap);
+			}
+			/*
+			 * If dccpap_ack_seqno was not received, no problem
+			 * we'll send another ACK vector.
+			 */
+			ap->dccpap_ack_seqno = DCCP_MAX_SEQNO + 1;
+			break;
+		}
+		/* dccp_pr_debug_cat("no\n"); */
+
+		dccp_set_seqno(&ackno, ackno_end_rl - 1);
+		++vector;
+	}
+}
diff --git a/net/dccp/output.c b/net/dccp/output.c
new file mode 100644
index 00000000000..28de157a432
--- /dev/null
+++ b/net/dccp/output.c
@@ -0,0 +1,528 @@
+/*
+ *  net/dccp/output.c
+ * 
+ *  An implementation of the DCCP protocol
+ *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/dccp.h>
+#include <linux/skbuff.h>
+
+#include <net/sock.h>
+
+#include "ccid.h"
+#include "dccp.h"
+
+static inline void dccp_event_ack_sent(struct sock *sk)
+{
+	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
+}
+
+/*
+ * All SKB's seen here are completely headerless. It is our
+ * job to build the DCCP header, and pass the packet down to
+ * IP so it can do the same plus pass the packet off to the
+ * device.
+ */
+int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
+{
+	if (likely(skb != NULL)) {
+		const struct inet_sock *inet = inet_sk(sk);
+		struct dccp_sock *dp = dccp_sk(sk);
+		struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
+		struct dccp_hdr *dh;
+		/* XXX For now we're using only 48 bits sequence numbers */
+		const int dccp_header_size = sizeof(*dh) +
+					     sizeof(struct dccp_hdr_ext) +
+					  dccp_packet_hdr_len(dcb->dccpd_type);
+		int err, set_ack = 1;
+		u64 ackno = dp->dccps_gsr;
+
+		dccp_inc_seqno(&dp->dccps_gss);
+
+		switch (dcb->dccpd_type) {
+		case DCCP_PKT_DATA:
+			set_ack = 0;
+			break;
+		case DCCP_PKT_SYNC:
+		case DCCP_PKT_SYNCACK:
+			ackno = dcb->dccpd_seq;
+			break;
+		}
+
+		dcb->dccpd_seq = dp->dccps_gss;
+		dccp_insert_options(sk, skb);
+		
+		skb->h.raw = skb_push(skb, dccp_header_size);
+		dh = dccp_hdr(skb);
+		/*
+		 * Data packets are not cloned as they are never retransmitted
+		 */
+		if (skb_cloned(skb))
+			skb_set_owner_w(skb, sk);
+
+		/* Build DCCP header and checksum it. */
+		memset(dh, 0, dccp_header_size);
+		dh->dccph_type	= dcb->dccpd_type;
+		dh->dccph_sport	= inet->sport;
+		dh->dccph_dport	= inet->dport;
+		dh->dccph_doff	= (dccp_header_size + dcb->dccpd_opt_len) / 4;
+		dh->dccph_ccval	= dcb->dccpd_ccval;
+		/* XXX For now we're using only 48 bits sequence numbers */
+		dh->dccph_x	= 1;
+
+		dp->dccps_awh = dp->dccps_gss;
+		dccp_hdr_set_seq(dh, dp->dccps_gss);
+		if (set_ack)
+			dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), ackno);
+
+		switch (dcb->dccpd_type) {
+		case DCCP_PKT_REQUEST:
+			dccp_hdr_request(skb)->dccph_req_service =
+							dcb->dccpd_service;
+			break;
+		case DCCP_PKT_RESET:
+			dccp_hdr_reset(skb)->dccph_reset_code =
+							dcb->dccpd_reset_code;
+			break;
+		}
+
+		dh->dccph_checksum = dccp_v4_checksum(skb, inet->saddr,
+						      inet->daddr);
+
+		if (set_ack)
+			dccp_event_ack_sent(sk);
+
+		DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
+
+		err = ip_queue_xmit(skb, 0);
+		if (err <= 0)
+			return err;
+
+		/* NET_XMIT_CN is special. It does not guarantee,
+		 * that this packet is lost. It tells that device
+		 * is about to start to drop packets or already
+		 * drops some packets of the same priority and
+		 * invokes us to send less aggressively.
+		 */
+		return err == NET_XMIT_CN ? 0 : err;
+	}
+	return -ENOBUFS;
+}
+
+unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	int mss_now;
+
+	/*
+	 * FIXME: we really should be using the af_specific thing to support
+	 * 	  IPv6.
+	 * mss_now = pmtu - tp->af_specific->net_header_len -
+	 * 	     sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext);
+	 */
+	mss_now = pmtu - sizeof(struct iphdr) - sizeof(struct dccp_hdr) -
+		  sizeof(struct dccp_hdr_ext);
+
+	/* Now subtract optional transport overhead */
+	mss_now -= dp->dccps_ext_header_len;
+
+	/*
+	 * FIXME: this should come from the CCID infrastructure, where, say,
+	 * TFRC will say it wants TIMESTAMPS, ELAPSED time, etc, for now lets
+	 * put a rough estimate for NDP + TIMESTAMP + TIMESTAMP_ECHO + ELAPSED
+	 * TIME + TFRC_OPT_LOSS_EVENT_RATE + TFRC_OPT_RECEIVE_RATE + padding to
+	 * make it a multiple of 4
+	 */
+
+	mss_now -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4;
+
+	/* And store cached results */
+	dp->dccps_pmtu_cookie = pmtu;
+	dp->dccps_mss_cache = mss_now;
+
+	return mss_now;
+}
+
+void dccp_write_space(struct sock *sk)
+{
+	read_lock(&sk->sk_callback_lock);
+
+	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
+		wake_up_interruptible(sk->sk_sleep);
+	/* Should agree with poll, otherwise some programs break */
+	if (sock_writeable(sk))
+		sk_wake_async(sk, 2, POLL_OUT);
+
+	read_unlock(&sk->sk_callback_lock);
+}
+
+/**
+ * dccp_wait_for_ccid - Wait for ccid to tell us we can send a packet
+ * @sk: socket to wait for
+ * @timeo: for how long
+ */
+static int dccp_wait_for_ccid(struct sock *sk, struct sk_buff *skb,
+			      long *timeo)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	DEFINE_WAIT(wait);
+	long delay;
+	int rc;
+
+	while (1) {
+		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+
+		if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
+			goto do_error;
+		if (!*timeo)
+			goto do_nonblock;
+		if (signal_pending(current))
+			goto do_interrupted;
+
+		rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb,
+					    skb->len);
+		if (rc <= 0)
+			break;
+		delay = msecs_to_jiffies(rc);
+		if (delay > *timeo || delay < 0)
+			goto do_nonblock;
+
+		sk->sk_write_pending++;
+		release_sock(sk);
+		*timeo -= schedule_timeout(delay);
+		lock_sock(sk);
+		sk->sk_write_pending--;
+	}
+out:
+	finish_wait(sk->sk_sleep, &wait);
+	return rc;
+
+do_error:
+	rc = -EPIPE;
+	goto out;
+do_nonblock:
+	rc = -EAGAIN;
+	goto out;
+do_interrupted:
+	rc = sock_intr_errno(*timeo);
+	goto out;
+}
+
+int dccp_write_xmit(struct sock *sk, struct sk_buff *skb, long *timeo)
+{
+	const struct dccp_sock *dp = dccp_sk(sk);
+	int err = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb,
+					 skb->len);
+
+	if (err > 0)
+		err = dccp_wait_for_ccid(sk, skb, timeo);
+
+	if (err == 0) {
+		const struct dccp_ackpkts *ap = dp->dccps_hc_rx_ackpkts;
+		struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
+		const int len = skb->len;
+
+		if (sk->sk_state == DCCP_PARTOPEN) {
+			/* See 8.1.5.  Handshake Completion */
+			inet_csk_schedule_ack(sk);
+			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+						  inet_csk(sk)->icsk_rto,
+						  DCCP_RTO_MAX);
+			dcb->dccpd_type = DCCP_PKT_DATAACK;
+			/*
+			 * FIXME: we really should have a
+			 * dccps_ack_pending or use icsk.
+			 */
+		} else if (inet_csk_ack_scheduled(sk) ||
+			   dp->dccps_timestamp_echo != 0 ||
+			   (dp->dccps_options.dccpo_send_ack_vector &&
+			    ap->dccpap_buf_ackno != DCCP_MAX_SEQNO + 1 &&
+			    ap->dccpap_ack_seqno == DCCP_MAX_SEQNO + 1))
+			dcb->dccpd_type = DCCP_PKT_DATAACK;
+		else
+			dcb->dccpd_type = DCCP_PKT_DATA;
+
+		err = dccp_transmit_skb(sk, skb);
+		ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, 0, len);
+	}
+
+	return err;
+}
+
+int dccp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
+{
+	if (inet_sk_rebuild_header(sk) != 0)
+		return -EHOSTUNREACH; /* Routing failure or similar. */
+
+	return dccp_transmit_skb(sk, (skb_cloned(skb) ?
+				      pskb_copy(skb, GFP_ATOMIC):
+				      skb_clone(skb, GFP_ATOMIC)));
+}
+
+struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst,
+				   struct request_sock *req)
+{
+	struct dccp_hdr *dh;
+	const int dccp_header_size = sizeof(struct dccp_hdr) +
+				     sizeof(struct dccp_hdr_ext) +
+				     sizeof(struct dccp_hdr_response);
+	struct sk_buff *skb = sock_wmalloc(sk, MAX_HEADER + DCCP_MAX_OPT_LEN +
+					       dccp_header_size, 1,
+					   GFP_ATOMIC);
+	if (skb == NULL)
+		return NULL;
+
+	/* Reserve space for headers. */
+	skb_reserve(skb, MAX_HEADER + DCCP_MAX_OPT_LEN + dccp_header_size);
+
+	skb->dst = dst_clone(dst);
+	skb->csum = 0;
+
+	DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE;
+	DCCP_SKB_CB(skb)->dccpd_seq  = dccp_rsk(req)->dreq_iss;
+	dccp_insert_options(sk, skb);
+
+	skb->h.raw = skb_push(skb, dccp_header_size);
+
+	dh = dccp_hdr(skb);
+	memset(dh, 0, dccp_header_size);
+
+	dh->dccph_sport	= inet_sk(sk)->sport;
+	dh->dccph_dport	= inet_rsk(req)->rmt_port;
+	dh->dccph_doff	= (dccp_header_size +
+			   DCCP_SKB_CB(skb)->dccpd_opt_len) / 4;
+	dh->dccph_type	= DCCP_PKT_RESPONSE;
+	dh->dccph_x	= 1;
+	dccp_hdr_set_seq(dh, dccp_rsk(req)->dreq_iss);
+	dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dccp_rsk(req)->dreq_isr);
+
+	dh->dccph_checksum = dccp_v4_checksum(skb, inet_rsk(req)->loc_addr,
+					      inet_rsk(req)->rmt_addr);
+
+	DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
+	return skb;
+}
+
+struct sk_buff *dccp_make_reset(struct sock *sk, struct dst_entry *dst,
+				const enum dccp_reset_codes code)
+				   
+{
+	struct dccp_hdr *dh;
+	struct dccp_sock *dp = dccp_sk(sk);
+	const int dccp_header_size = sizeof(struct dccp_hdr) +
+				     sizeof(struct dccp_hdr_ext) +
+				     sizeof(struct dccp_hdr_reset);
+	struct sk_buff *skb = sock_wmalloc(sk, MAX_HEADER + DCCP_MAX_OPT_LEN +
+					       dccp_header_size, 1,
+					   GFP_ATOMIC);
+	if (skb == NULL)
+		return NULL;
+
+	/* Reserve space for headers. */
+	skb_reserve(skb, MAX_HEADER + DCCP_MAX_OPT_LEN + dccp_header_size);
+
+	skb->dst = dst_clone(dst);
+	skb->csum = 0;
+
+	dccp_inc_seqno(&dp->dccps_gss);
+
+	DCCP_SKB_CB(skb)->dccpd_reset_code = code;
+	DCCP_SKB_CB(skb)->dccpd_type	   = DCCP_PKT_RESET;
+	DCCP_SKB_CB(skb)->dccpd_seq	   = dp->dccps_gss;
+	dccp_insert_options(sk, skb);
+
+	skb->h.raw = skb_push(skb, dccp_header_size);
+
+	dh = dccp_hdr(skb);
+	memset(dh, 0, dccp_header_size);
+
+	dh->dccph_sport	= inet_sk(sk)->sport;
+	dh->dccph_dport	= inet_sk(sk)->dport;
+	dh->dccph_doff	= (dccp_header_size +
+			   DCCP_SKB_CB(skb)->dccpd_opt_len) / 4;
+	dh->dccph_type	= DCCP_PKT_RESET;
+	dh->dccph_x	= 1;
+	dccp_hdr_set_seq(dh, dp->dccps_gss);
+	dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dp->dccps_gsr);
+
+	dccp_hdr_reset(skb)->dccph_reset_code = code;
+
+	dh->dccph_checksum = dccp_v4_checksum(skb, inet_sk(sk)->saddr,
+					      inet_sk(sk)->daddr);
+
+	DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
+	return skb;
+}
+
+/*
+ * Do all connect socket setups that can be done AF independent.
+ */
+static inline void dccp_connect_init(struct sock *sk)
+{
+	struct dst_entry *dst = __sk_dst_get(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	sk->sk_err = 0;
+	sock_reset_flag(sk, SOCK_DONE);
+	
+	dccp_sync_mss(sk, dst_mtu(dst));
+
+	/*
+	 * FIXME: set dp->{dccps_swh,dccps_swl}, with
+	 * something like dccp_inc_seq
+	 */
+
+	icsk->icsk_retransmits = 0;
+}
+
+int dccp_connect(struct sock *sk)
+{
+	struct sk_buff *skb;
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	dccp_connect_init(sk);
+
+	skb = alloc_skb(MAX_DCCP_HEADER + 15, sk->sk_allocation);
+	if (unlikely(skb == NULL))
+		return -ENOBUFS;
+
+	/* Reserve space for headers. */
+	skb_reserve(skb, MAX_DCCP_HEADER);
+
+	DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_REQUEST;
+	/* FIXME: set service to something meaningful, coming
+	 * from userspace*/
+	DCCP_SKB_CB(skb)->dccpd_service = 0;
+	skb->csum = 0;
+	skb_set_owner_w(skb, sk);
+
+	BUG_TRAP(sk->sk_send_head == NULL);
+	sk->sk_send_head = skb;
+	dccp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL));
+	DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS);
+
+	/* Timer for repeating the REQUEST until an answer. */
+	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+				  icsk->icsk_rto, DCCP_RTO_MAX);
+	return 0;
+}
+
+void dccp_send_ack(struct sock *sk)
+{
+	/* If we have been reset, we may not send again. */
+	if (sk->sk_state != DCCP_CLOSED) {
+		struct sk_buff *skb = alloc_skb(MAX_DCCP_HEADER, GFP_ATOMIC);
+
+		if (skb == NULL) {
+			inet_csk_schedule_ack(sk);
+			inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
+			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+						  TCP_DELACK_MAX,
+						  DCCP_RTO_MAX);
+			return;
+		}
+
+		/* Reserve space for headers */
+		skb_reserve(skb, MAX_DCCP_HEADER);
+		skb->csum = 0;
+		DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_ACK;
+		skb_set_owner_w(skb, sk);
+		dccp_transmit_skb(sk, skb);
+	}
+}
+
+EXPORT_SYMBOL_GPL(dccp_send_ack);
+
+void dccp_send_delayed_ack(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	/*
+	 * FIXME: tune this timer. elapsed time fixes the skew, so no problem
+	 * with using 2s, and active senders also piggyback the ACK into a
+	 * DATAACK packet, so this is really for quiescent senders.
+	 */
+	unsigned long timeout = jiffies + 2 * HZ;
+
+	/* Use new timeout only if there wasn't a older one earlier. */
+	if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
+		/* If delack timer was blocked or is about to expire,
+		 * send ACK now.
+		 *
+		 * FIXME: check the "about to expire" part
+		 */
+		if (icsk->icsk_ack.blocked) {
+			dccp_send_ack(sk);
+			return;
+		}
+
+		if (!time_before(timeout, icsk->icsk_ack.timeout))
+			timeout = icsk->icsk_ack.timeout;
+	}
+	icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
+	icsk->icsk_ack.timeout = timeout;
+	sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
+}
+
+void dccp_send_sync(struct sock *sk, const u64 seq,
+		    const enum dccp_pkt_type pkt_type)
+{
+	/*
+	 * We are not putting this on the write queue, so
+	 * dccp_transmit_skb() will set the ownership to this
+	 * sock.
+	 */
+	struct sk_buff *skb = alloc_skb(MAX_DCCP_HEADER, GFP_ATOMIC);
+
+	if (skb == NULL)
+		/* FIXME: how to make sure the sync is sent? */
+		return;
+
+	/* Reserve space for headers and prepare control bits. */
+	skb_reserve(skb, MAX_DCCP_HEADER);
+	skb->csum = 0;
+	DCCP_SKB_CB(skb)->dccpd_type = pkt_type;
+	DCCP_SKB_CB(skb)->dccpd_seq = seq;
+
+	skb_set_owner_w(skb, sk);
+	dccp_transmit_skb(sk, skb);
+}
+
+/*
+ * Send a DCCP_PKT_CLOSE/CLOSEREQ. The caller locks the socket for us. This
+ * cannot be allowed to fail queueing a DCCP_PKT_CLOSE/CLOSEREQ frame under
+ * any circumstances.
+ */
+void dccp_send_close(struct sock *sk, const int active)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct sk_buff *skb;
+	const unsigned int prio = active ? GFP_KERNEL : GFP_ATOMIC;
+
+	skb = alloc_skb(sk->sk_prot->max_header, prio);
+	if (skb == NULL)
+		return;
+
+	/* Reserve space for headers and prepare control bits. */
+	skb_reserve(skb, sk->sk_prot->max_header);
+	skb->csum = 0;
+	DCCP_SKB_CB(skb)->dccpd_type = dp->dccps_role == DCCP_ROLE_CLIENT ?
+					DCCP_PKT_CLOSE : DCCP_PKT_CLOSEREQ;
+
+	skb_set_owner_w(skb, sk);
+	if (active) {
+		BUG_TRAP(sk->sk_send_head == NULL);
+		sk->sk_send_head = skb;
+		dccp_transmit_skb(sk, skb_clone(skb, prio));
+	} else
+		dccp_transmit_skb(sk, skb);
+
+	ccid_hc_rx_exit(dp->dccps_hc_rx_ccid, sk);
+	ccid_hc_tx_exit(dp->dccps_hc_tx_ccid, sk);
+}
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
new file mode 100644
index 00000000000..18a0e69c9dc
--- /dev/null
+++ b/net/dccp/proto.c
@@ -0,0 +1,826 @@
+/*
+ *  net/dccp/proto.c
+ *
+ *  An implementation of the DCCP protocol
+ *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ *	This program is free software; you can redistribute it and/or modify it
+ *	under the terms of the GNU General Public License version 2 as
+ *	published by the Free Software Foundation.
+ */
+
+#include <linux/config.h>
+#include <linux/dccp.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/if_arp.h>
+#include <linux/init.h>
+#include <linux/random.h>
+#include <net/checksum.h>
+
+#include <net/inet_common.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/sock.h>
+#include <net/xfrm.h>
+
+#include <asm/semaphore.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/delay.h>
+#include <linux/poll.h>
+#include <linux/dccp.h>
+
+#include "ccid.h"
+#include "dccp.h"
+
+DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly;
+
+atomic_t dccp_orphan_count = ATOMIC_INIT(0);
+
+static struct net_protocol dccp_protocol = {
+	.handler	= dccp_v4_rcv,
+	.err_handler	= dccp_v4_err,
+};
+
+const char *dccp_packet_name(const int type)
+{
+	static const char *dccp_packet_names[] = {
+		[DCCP_PKT_REQUEST]  = "REQUEST",
+		[DCCP_PKT_RESPONSE] = "RESPONSE",
+		[DCCP_PKT_DATA]	    = "DATA",
+		[DCCP_PKT_ACK]	    = "ACK",
+		[DCCP_PKT_DATAACK]  = "DATAACK",
+		[DCCP_PKT_CLOSEREQ] = "CLOSEREQ",
+		[DCCP_PKT_CLOSE]    = "CLOSE",
+		[DCCP_PKT_RESET]    = "RESET",
+		[DCCP_PKT_SYNC]	    = "SYNC",
+		[DCCP_PKT_SYNCACK]  = "SYNCACK",
+	};
+
+	if (type >= DCCP_NR_PKT_TYPES)
+		return "INVALID";
+	else
+		return dccp_packet_names[type];
+}
+
+EXPORT_SYMBOL_GPL(dccp_packet_name);
+
+const char *dccp_state_name(const int state)
+{
+	static char *dccp_state_names[] = {
+	[DCCP_OPEN]	  = "OPEN",
+	[DCCP_REQUESTING] = "REQUESTING",
+	[DCCP_PARTOPEN]	  = "PARTOPEN",
+	[DCCP_LISTEN]	  = "LISTEN",
+	[DCCP_RESPOND]	  = "RESPOND",
+	[DCCP_CLOSING]	  = "CLOSING",
+	[DCCP_TIME_WAIT]  = "TIME_WAIT",
+	[DCCP_CLOSED]	  = "CLOSED",
+	};
+
+	if (state >= DCCP_MAX_STATES)
+		return "INVALID STATE!";
+	else
+		return dccp_state_names[state];
+}
+
+EXPORT_SYMBOL_GPL(dccp_state_name);
+
+static inline int dccp_listen_start(struct sock *sk)
+{
+	dccp_sk(sk)->dccps_role = DCCP_ROLE_LISTEN;
+	return inet_csk_listen_start(sk, TCP_SYNQ_HSIZE);
+}
+
+int dccp_disconnect(struct sock *sk, int flags)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct inet_sock *inet = inet_sk(sk);
+	int err = 0;
+	const int old_state = sk->sk_state;
+
+	if (old_state != DCCP_CLOSED)
+		dccp_set_state(sk, DCCP_CLOSED);
+
+	/* ABORT function of RFC793 */
+	if (old_state == DCCP_LISTEN) {
+		inet_csk_listen_stop(sk);
+	/* FIXME: do the active reset thing */
+	} else if (old_state == DCCP_REQUESTING)
+		sk->sk_err = ECONNRESET;
+
+	dccp_clear_xmit_timers(sk);
+	__skb_queue_purge(&sk->sk_receive_queue);
+	if (sk->sk_send_head != NULL) {
+		__kfree_skb(sk->sk_send_head);
+		sk->sk_send_head = NULL;
+	}
+
+	inet->dport = 0;
+
+	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
+		inet_reset_saddr(sk);
+
+	sk->sk_shutdown = 0;
+	sock_reset_flag(sk, SOCK_DONE);
+
+	icsk->icsk_backoff = 0;
+	inet_csk_delack_init(sk);
+	__sk_dst_reset(sk);
+
+	BUG_TRAP(!inet->num || icsk->icsk_bind_hash);
+
+	sk->sk_error_report(sk);
+	return err;
+}
+
+/*
+ *	Wait for a DCCP event.
+ *
+ *	Note that we don't need to lock the socket, as the upper poll layers
+ *	take care of normal races (between the test and the event) and we don't
+ *	go look at any of the socket buffers directly.
+ */
+static unsigned int dccp_poll(struct file *file, struct socket *sock,
+			      poll_table *wait)
+{
+	unsigned int mask;
+	struct sock *sk = sock->sk;
+
+	poll_wait(file, sk->sk_sleep, wait);
+	if (sk->sk_state == DCCP_LISTEN)
+		return inet_csk_listen_poll(sk);
+
+	/* Socket is not locked. We are protected from async events
+	   by poll logic and correct handling of state changes
+	   made by another threads is impossible in any case.
+	 */
+
+	mask = 0;
+	if (sk->sk_err)
+		mask = POLLERR;
+
+	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == DCCP_CLOSED)
+		mask |= POLLHUP;
+	if (sk->sk_shutdown & RCV_SHUTDOWN)
+		mask |= POLLIN | POLLRDNORM;
+
+	/* Connected? */
+	if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) {
+		if (atomic_read(&sk->sk_rmem_alloc) > 0)
+			mask |= POLLIN | POLLRDNORM;
+
+		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
+			if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
+				mask |= POLLOUT | POLLWRNORM;
+			} else {  /* send SIGIO later */
+				set_bit(SOCK_ASYNC_NOSPACE,
+					&sk->sk_socket->flags);
+				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+
+				/* Race breaker. If space is freed after
+				 * wspace test but before the flags are set,
+				 * IO signal will be lost.
+				 */
+				if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
+					mask |= POLLOUT | POLLWRNORM;
+			}
+		}
+	}
+	return mask;
+}
+
+int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+	dccp_pr_debug("entry\n");
+	return -ENOIOCTLCMD;
+}
+
+int dccp_setsockopt(struct sock *sk, int level, int optname,
+		    char __user *optval, int optlen)
+{
+	struct dccp_sock *dp;
+	int err;
+	int val;
+
+	if (level != SOL_DCCP)
+		return ip_setsockopt(sk, level, optname, optval, optlen);
+
+	if (optlen < sizeof(int))
+		return -EINVAL;
+
+	if (get_user(val, (int __user *)optval))
+		return -EFAULT;
+
+	lock_sock(sk);
+
+	dp = dccp_sk(sk);
+	err = 0;
+
+	switch (optname) {
+	case DCCP_SOCKOPT_PACKET_SIZE:
+		dp->dccps_packet_size = val;
+		break;
+	default:
+		err = -ENOPROTOOPT;
+		break;
+	}
+	
+	release_sock(sk);
+	return err;
+}
+
+int dccp_getsockopt(struct sock *sk, int level, int optname,
+		    char __user *optval, int __user *optlen)
+{
+	struct dccp_sock *dp;
+	int val, len;
+
+	if (level != SOL_DCCP)
+		return ip_getsockopt(sk, level, optname, optval, optlen);
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+
+	len = min_t(unsigned int, len, sizeof(int));
+	if (len < 0)
+		return -EINVAL;
+
+	dp = dccp_sk(sk);
+
+	switch (optname) {
+	case DCCP_SOCKOPT_PACKET_SIZE:
+		val = dp->dccps_packet_size;
+		break;
+	default:
+		return -ENOPROTOOPT;
+	}
+
+	if (put_user(len, optlen) || copy_to_user(optval, &val, len))
+		return -EFAULT;
+
+	return 0;
+}
+
+int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+		 size_t len)
+{
+	const struct dccp_sock *dp = dccp_sk(sk);
+	const int flags = msg->msg_flags;
+	const int noblock = flags & MSG_DONTWAIT;
+	struct sk_buff *skb;
+	int rc, size;
+	long timeo;
+
+	if (len > dp->dccps_mss_cache)
+		return -EMSGSIZE;
+
+	lock_sock(sk);
+	timeo = sock_sndtimeo(sk, noblock);
+
+	/*
+	 * We have to use sk_stream_wait_connect here to set sk_write_pending,
+	 * so that the trick in dccp_rcv_request_sent_state_process.
+	 */
+	/* Wait for a connection to finish. */
+	if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN | DCCPF_CLOSING))
+		if ((rc = sk_stream_wait_connect(sk, &timeo)) != 0)
+			goto out_release;
+
+	size = sk->sk_prot->max_header + len;
+	release_sock(sk);
+	skb = sock_alloc_send_skb(sk, size, noblock, &rc);
+	lock_sock(sk);
+	if (skb == NULL)
+		goto out_release;
+
+	skb_reserve(skb, sk->sk_prot->max_header);
+	rc = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
+	if (rc != 0)
+		goto out_discard;
+
+	rc = dccp_write_xmit(sk, skb, &timeo);
+	/*
+	 * XXX we don't use sk_write_queue, so just discard the packet.
+	 *     Current plan however is to _use_ sk_write_queue with
+	 *     an algorith similar to tcp_sendmsg, where the main difference
+	 *     is that in DCCP we have to respect packet boundaries, so
+	 *     no coalescing of skbs.
+	 *
+	 *     This bug was _quickly_ found & fixed by just looking at an OSTRA
+	 *     generated callgraph 8) -acme
+	 */
+	if (rc != 0)
+		goto out_discard;
+out_release:
+	release_sock(sk);
+	return rc ? : len;
+out_discard:
+	kfree_skb(skb);
+	goto out_release;
+}
+
+int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+		 size_t len, int nonblock, int flags, int *addr_len)
+{
+	const struct dccp_hdr *dh;
+	long timeo;
+
+	lock_sock(sk);
+
+	if (sk->sk_state == DCCP_LISTEN) {
+		len = -ENOTCONN;
+		goto out;
+	}
+
+	timeo = sock_rcvtimeo(sk, nonblock);
+
+	do {
+		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
+
+		if (skb == NULL)
+			goto verify_sock_status;
+
+		dh = dccp_hdr(skb);
+
+		if (dh->dccph_type == DCCP_PKT_DATA ||
+		    dh->dccph_type == DCCP_PKT_DATAACK)
+			goto found_ok_skb;
+
+		if (dh->dccph_type == DCCP_PKT_RESET ||
+		    dh->dccph_type == DCCP_PKT_CLOSE) {
+			dccp_pr_debug("found fin ok!\n");
+			len = 0;
+			goto found_fin_ok;
+		}
+		dccp_pr_debug("packet_type=%s\n",
+			      dccp_packet_name(dh->dccph_type));
+		sk_eat_skb(sk, skb);
+verify_sock_status:
+		if (sock_flag(sk, SOCK_DONE)) {
+			len = 0;
+			break;
+		}
+
+		if (sk->sk_err) {
+			len = sock_error(sk);
+			break;
+		}
+
+		if (sk->sk_shutdown & RCV_SHUTDOWN) {
+			len = 0;
+			break;
+		}
+
+		if (sk->sk_state == DCCP_CLOSED) {
+			if (!sock_flag(sk, SOCK_DONE)) {
+				/* This occurs when user tries to read
+				 * from never connected socket.
+				 */
+				len = -ENOTCONN;
+				break;
+			}
+			len = 0;
+			break;
+		}
+
+		if (!timeo) {
+			len = -EAGAIN;
+			break;
+		}
+
+		if (signal_pending(current)) {
+			len = sock_intr_errno(timeo);
+			break;
+		}
+
+		sk_wait_data(sk, &timeo);
+		continue;
+	found_ok_skb:
+		if (len > skb->len)
+			len = skb->len;
+		else if (len < skb->len)
+			msg->msg_flags |= MSG_TRUNC;
+
+		if (skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len)) {
+			/* Exception. Bailout! */
+			len = -EFAULT;
+			break;
+		}
+	found_fin_ok:
+		if (!(flags & MSG_PEEK))
+			sk_eat_skb(sk, skb);
+		break;
+	} while (1);
+out:
+	release_sock(sk);
+	return len;
+}
+
+static int inet_dccp_listen(struct socket *sock, int backlog)
+{
+	struct sock *sk = sock->sk;
+	unsigned char old_state;
+	int err;
+
+	lock_sock(sk);
+
+	err = -EINVAL;
+	if (sock->state != SS_UNCONNECTED || sock->type != SOCK_DCCP)
+		goto out;
+
+	old_state = sk->sk_state;
+	if (!((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN)))
+		goto out;
+
+	/* Really, if the socket is already in listen state
+	 * we can only allow the backlog to be adjusted.
+	 */
+	if (old_state != DCCP_LISTEN) {
+		/*
+		 * FIXME: here it probably should be sk->sk_prot->listen_start
+		 * see tcp_listen_start
+		 */
+		err = dccp_listen_start(sk);
+		if (err)
+			goto out;
+	}
+	sk->sk_max_ack_backlog = backlog;
+	err = 0;
+
+out:
+	release_sock(sk);
+	return err;
+}
+
+static const unsigned char dccp_new_state[] = {
+	/* current state:   new state:      action:	*/
+	[0]		  = DCCP_CLOSED,
+	[DCCP_OPEN] 	  = DCCP_CLOSING | DCCP_ACTION_FIN,
+	[DCCP_REQUESTING] = DCCP_CLOSED,
+	[DCCP_PARTOPEN]	  = DCCP_CLOSING | DCCP_ACTION_FIN,
+	[DCCP_LISTEN]	  = DCCP_CLOSED,
+	[DCCP_RESPOND]	  = DCCP_CLOSED,
+	[DCCP_CLOSING]	  = DCCP_CLOSED,
+	[DCCP_TIME_WAIT]  = DCCP_CLOSED,
+	[DCCP_CLOSED]	  = DCCP_CLOSED,
+};
+
+static int dccp_close_state(struct sock *sk)
+{
+	const int next = dccp_new_state[sk->sk_state];
+	const int ns = next & DCCP_STATE_MASK;
+
+	if (ns != sk->sk_state)
+		dccp_set_state(sk, ns);
+
+	return next & DCCP_ACTION_FIN;
+}
+
+void dccp_close(struct sock *sk, long timeout)
+{
+	struct sk_buff *skb;
+
+	lock_sock(sk);
+
+	sk->sk_shutdown = SHUTDOWN_MASK;
+
+	if (sk->sk_state == DCCP_LISTEN) {
+		dccp_set_state(sk, DCCP_CLOSED);
+
+		/* Special case. */
+		inet_csk_listen_stop(sk);
+
+		goto adjudge_to_death;
+	}
+
+	/*
+	 * We need to flush the recv. buffs.  We do this only on the
+	 * descriptor close, not protocol-sourced closes, because the
+	  *reader process may not have drained the data yet!
+	 */
+	/* FIXME: check for unread data */
+	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+		__kfree_skb(skb);
+	}
+
+	if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
+		/* Check zero linger _after_ checking for unread data. */
+		sk->sk_prot->disconnect(sk, 0);
+	} else if (dccp_close_state(sk)) {
+		dccp_send_close(sk, 1);
+	}
+
+	sk_stream_wait_close(sk, timeout);
+
+adjudge_to_death:
+	/*
+	 * It is the last release_sock in its life. It will remove backlog.
+	 */
+	release_sock(sk);
+	/*
+	 * Now socket is owned by kernel and we acquire BH lock
+	 * to finish close. No need to check for user refs.
+	 */
+	local_bh_disable();
+	bh_lock_sock(sk);
+	BUG_TRAP(!sock_owned_by_user(sk));
+
+	sock_hold(sk);
+	sock_orphan(sk);
+
+	/*
+	 * The last release_sock may have processed the CLOSE or RESET
+	 * packet moving sock to CLOSED state, if not we have to fire
+	 * the CLOSE/CLOSEREQ retransmission timer, see "8.3. Termination"
+	 * in draft-ietf-dccp-spec-11. -acme
+	 */
+	if (sk->sk_state == DCCP_CLOSING) {
+		/* FIXME: should start at 2 * RTT */
+		/* Timer for repeating the CLOSE/CLOSEREQ until an answer. */
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+					  inet_csk(sk)->icsk_rto,
+					  DCCP_RTO_MAX);
+#if 0
+		/* Yeah, we should use sk->sk_prot->orphan_count, etc */
+		dccp_set_state(sk, DCCP_CLOSED);
+#endif
+	}
+
+	atomic_inc(sk->sk_prot->orphan_count);
+	if (sk->sk_state == DCCP_CLOSED)
+		inet_csk_destroy_sock(sk);
+
+	/* Otherwise, socket is reprieved until protocol close. */
+
+	bh_unlock_sock(sk);
+	local_bh_enable();
+	sock_put(sk);
+}
+
+void dccp_shutdown(struct sock *sk, int how)
+{
+	dccp_pr_debug("entry\n");
+}
+
+static struct proto_ops inet_dccp_ops = {
+	.family		= PF_INET,
+	.owner		= THIS_MODULE,
+	.release	= inet_release,
+	.bind		= inet_bind,
+	.connect	= inet_stream_connect,
+	.socketpair	= sock_no_socketpair,
+	.accept		= inet_accept,
+	.getname	= inet_getname,
+	/* FIXME: work on tcp_poll to rename it to inet_csk_poll */
+	.poll		= dccp_poll,
+	.ioctl		= inet_ioctl,
+	/* FIXME: work on inet_listen to rename it to sock_common_listen */
+	.listen		= inet_dccp_listen,
+	.shutdown	= inet_shutdown,
+	.setsockopt	= sock_common_setsockopt,
+	.getsockopt	= sock_common_getsockopt,
+	.sendmsg	= inet_sendmsg,
+	.recvmsg	= sock_common_recvmsg,
+	.mmap		= sock_no_mmap,
+	.sendpage	= sock_no_sendpage,
+};
+
+extern struct net_proto_family inet_family_ops;
+
+static struct inet_protosw dccp_v4_protosw = {
+	.type		= SOCK_DCCP,
+	.protocol	= IPPROTO_DCCP,
+	.prot		= &dccp_v4_prot,
+	.ops		= &inet_dccp_ops,
+	.capability	= -1,
+	.no_check	= 0,
+	.flags		= 0,
+};
+
+/*
+ * This is the global socket data structure used for responding to
+ * the Out-of-the-blue (OOTB) packets. A control sock will be created
+ * for this socket at the initialization time.
+ */
+struct socket *dccp_ctl_socket;
+
+static char dccp_ctl_socket_err_msg[] __initdata =
+	KERN_ERR "DCCP: Failed to create the control socket.\n";
+
+static int __init dccp_ctl_sock_init(void)
+{
+	int rc = sock_create_kern(PF_INET, SOCK_DCCP, IPPROTO_DCCP,
+				  &dccp_ctl_socket);
+	if (rc < 0)
+		printk(dccp_ctl_socket_err_msg);
+	else {
+		dccp_ctl_socket->sk->sk_allocation = GFP_ATOMIC;
+		inet_sk(dccp_ctl_socket->sk)->uc_ttl = -1;
+
+		/* Unhash it so that IP input processing does not even
+		 * see it, we do not wish this socket to see incoming
+		 * packets.
+		 */
+		dccp_ctl_socket->sk->sk_prot->unhash(dccp_ctl_socket->sk);
+	}
+
+	return rc;
+}
+
+#ifdef CONFIG_IP_DCCP_UNLOAD_HACK
+void dccp_ctl_sock_exit(void)
+{
+	if (dccp_ctl_socket != NULL) {
+		sock_release(dccp_ctl_socket);
+		dccp_ctl_socket = NULL;
+	}
+}
+
+EXPORT_SYMBOL_GPL(dccp_ctl_sock_exit);
+#endif
+
+static int __init init_dccp_v4_mibs(void)
+{
+	int rc = -ENOMEM;
+
+	dccp_statistics[0] = alloc_percpu(struct dccp_mib);
+	if (dccp_statistics[0] == NULL)
+		goto out;
+
+	dccp_statistics[1] = alloc_percpu(struct dccp_mib);
+	if (dccp_statistics[1] == NULL)
+		goto out_free_one;
+
+	rc = 0;
+out:
+	return rc;
+out_free_one:
+	free_percpu(dccp_statistics[0]);
+	dccp_statistics[0] = NULL;
+	goto out;
+
+}
+
+static int thash_entries;
+module_param(thash_entries, int, 0444);
+MODULE_PARM_DESC(thash_entries, "Number of ehash buckets");
+
+#ifdef CONFIG_IP_DCCP_DEBUG
+int dccp_debug;
+module_param(dccp_debug, int, 0444);
+MODULE_PARM_DESC(dccp_debug, "Enable debug messages");
+#endif
+
+static int __init dccp_init(void)
+{
+	unsigned long goal;
+	int ehash_order, bhash_order, i;
+	int rc = proto_register(&dccp_v4_prot, 1);
+
+	if (rc)
+		goto out;
+
+	dccp_hashinfo.bind_bucket_cachep =
+		kmem_cache_create("dccp_bind_bucket",
+				  sizeof(struct inet_bind_bucket), 0,
+				  SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (!dccp_hashinfo.bind_bucket_cachep)
+		goto out_proto_unregister;
+
+	/*
+	 * Size and allocate the main established and bind bucket
+	 * hash tables.
+	 *
+	 * The methodology is similar to that of the buffer cache.
+	 */
+	if (num_physpages >= (128 * 1024))
+		goal = num_physpages >> (21 - PAGE_SHIFT);
+	else
+		goal = num_physpages >> (23 - PAGE_SHIFT);
+
+	if (thash_entries)
+		goal = (thash_entries *
+			sizeof(struct inet_ehash_bucket)) >> PAGE_SHIFT;
+	for (ehash_order = 0; (1UL << ehash_order) < goal; ehash_order++)
+		;
+	do {
+		dccp_hashinfo.ehash_size = (1UL << ehash_order) * PAGE_SIZE /
+					sizeof(struct inet_ehash_bucket);
+		dccp_hashinfo.ehash_size >>= 1;
+		while (dccp_hashinfo.ehash_size &
+		       (dccp_hashinfo.ehash_size - 1))
+			dccp_hashinfo.ehash_size--;
+		dccp_hashinfo.ehash = (struct inet_ehash_bucket *)
+			__get_free_pages(GFP_ATOMIC, ehash_order);
+	} while (!dccp_hashinfo.ehash && --ehash_order > 0);
+
+	if (!dccp_hashinfo.ehash) {
+		printk(KERN_CRIT "Failed to allocate DCCP "
+				 "established hash table\n");
+		goto out_free_bind_bucket_cachep;
+	}
+
+	for (i = 0; i < (dccp_hashinfo.ehash_size << 1); i++) {
+		rwlock_init(&dccp_hashinfo.ehash[i].lock);
+		INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].chain);
+	}
+
+	bhash_order = ehash_order;
+
+	do {
+		dccp_hashinfo.bhash_size = (1UL << bhash_order) * PAGE_SIZE /
+					sizeof(struct inet_bind_hashbucket);
+		if ((dccp_hashinfo.bhash_size > (64 * 1024)) &&
+		    bhash_order > 0)
+			continue;
+		dccp_hashinfo.bhash = (struct inet_bind_hashbucket *)
+			__get_free_pages(GFP_ATOMIC, bhash_order);
+	} while (!dccp_hashinfo.bhash && --bhash_order >= 0);
+
+	if (!dccp_hashinfo.bhash) {
+		printk(KERN_CRIT "Failed to allocate DCCP bind hash table\n");
+		goto out_free_dccp_ehash;
+	}
+
+	for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
+		spin_lock_init(&dccp_hashinfo.bhash[i].lock);
+		INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain);
+	}
+
+	if (init_dccp_v4_mibs())
+		goto out_free_dccp_bhash;
+
+	rc = -EAGAIN;
+	if (inet_add_protocol(&dccp_protocol, IPPROTO_DCCP))
+		goto out_free_dccp_v4_mibs;
+
+	inet_register_protosw(&dccp_v4_protosw);
+
+	rc = dccp_ctl_sock_init();
+	if (rc)
+		goto out_unregister_protosw;
+out:
+	return rc;
+out_unregister_protosw:
+	inet_unregister_protosw(&dccp_v4_protosw);
+	inet_del_protocol(&dccp_protocol, IPPROTO_DCCP);
+out_free_dccp_v4_mibs:
+	free_percpu(dccp_statistics[0]);
+	free_percpu(dccp_statistics[1]);
+	dccp_statistics[0] = dccp_statistics[1] = NULL;
+out_free_dccp_bhash:
+	free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
+	dccp_hashinfo.bhash = NULL;
+out_free_dccp_ehash:
+	free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
+	dccp_hashinfo.ehash = NULL;
+out_free_bind_bucket_cachep:
+	kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
+	dccp_hashinfo.bind_bucket_cachep = NULL;
+out_proto_unregister:
+	proto_unregister(&dccp_v4_prot);
+	goto out;
+}
+
+static const char dccp_del_proto_err_msg[] __exitdata =
+	KERN_ERR "can't remove dccp net_protocol\n";
+
+static void __exit dccp_fini(void)
+{
+	inet_unregister_protosw(&dccp_v4_protosw);
+
+	if (inet_del_protocol(&dccp_protocol, IPPROTO_DCCP) < 0)
+		printk(dccp_del_proto_err_msg);
+
+	free_percpu(dccp_statistics[0]);
+	free_percpu(dccp_statistics[1]);
+	free_pages((unsigned long)dccp_hashinfo.bhash,
+		   get_order(dccp_hashinfo.bhash_size *
+			     sizeof(struct inet_bind_hashbucket)));
+	free_pages((unsigned long)dccp_hashinfo.ehash,
+		   get_order(dccp_hashinfo.ehash_size *
+			     sizeof(struct inet_ehash_bucket)));
+	kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
+	proto_unregister(&dccp_v4_prot);
+}
+
+module_init(dccp_init);
+module_exit(dccp_fini);
+
+/*
+ * __stringify doesn't likes enums, so use SOCK_DCCP (6) and IPPROTO_DCCP (33)
+ * values directly, Also cover the case where the protocol is not specified,
+ * i.e. net-pf-PF_INET-proto-0-type-SOCK_DCCP
+ */
+MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-33-type-6");
+MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-0-type-6");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@conectiva.com.br>");
+MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol");
diff --git a/net/dccp/timer.c b/net/dccp/timer.c
new file mode 100644
index 00000000000..aa34b576e22
--- /dev/null
+++ b/net/dccp/timer.c
@@ -0,0 +1,255 @@
+/*
+ *  net/dccp/timer.c
+ * 
+ *  An implementation of the DCCP protocol
+ *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/dccp.h>
+#include <linux/skbuff.h>
+
+#include "dccp.h"
+
+static void dccp_write_timer(unsigned long data);
+static void dccp_keepalive_timer(unsigned long data);
+static void dccp_delack_timer(unsigned long data);
+
+void dccp_init_xmit_timers(struct sock *sk)
+{
+	inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer,
+				  &dccp_keepalive_timer);
+}
+
+static void dccp_write_err(struct sock *sk)
+{
+	sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
+	sk->sk_error_report(sk);
+
+	dccp_v4_send_reset(sk, DCCP_RESET_CODE_ABORTED);
+	dccp_done(sk);
+	DCCP_INC_STATS_BH(DCCP_MIB_ABORTONTIMEOUT);
+}
+
+/* A write timeout has occurred. Process the after effects. */
+static int dccp_write_timeout(struct sock *sk)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	int retry_until;
+
+	if (sk->sk_state == DCCP_REQUESTING || sk->sk_state == DCCP_PARTOPEN) {
+		if (icsk->icsk_retransmits != 0)
+			dst_negative_advice(&sk->sk_dst_cache);
+		retry_until = icsk->icsk_syn_retries ? :
+			    /* FIXME! */ 3 /* FIXME! sysctl_tcp_syn_retries */;
+	} else {
+		if (icsk->icsk_retransmits >=
+		     /* FIXME! sysctl_tcp_retries1 */ 5 /* FIXME! */) {
+			/* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu
+			   black hole detection. :-(
+
+			   It is place to make it. It is not made. I do not want
+			   to make it. It is disguisting. It does not work in any
+			   case. Let me to cite the same draft, which requires for
+			   us to implement this:
+
+   "The one security concern raised by this memo is that ICMP black holes
+   are often caused by over-zealous security administrators who block
+   all ICMP messages.  It is vitally important that those who design and
+   deploy security systems understand the impact of strict filtering on
+   upper-layer protocols.  The safest web site in the world is worthless
+   if most TCP implementations cannot transfer data from it.  It would
+   be far nicer to have all of the black holes fixed rather than fixing
+   all of the TCP implementations."
+
+                           Golden words :-).
+		   */
+
+			dst_negative_advice(&sk->sk_dst_cache);
+		}
+
+		retry_until = /* FIXME! */ 15 /* FIXME! sysctl_tcp_retries2 */;
+		/*
+		 * FIXME: see tcp_write_timout and tcp_out_of_resources
+		 */
+	}
+
+	if (icsk->icsk_retransmits >= retry_until) {
+		/* Has it gone just too far? */
+		dccp_write_err(sk);
+		return 1;
+	}
+	return 0;
+}
+
+/* This is the same as tcp_delack_timer, sans prequeue & mem_reclaim stuff */
+static void dccp_delack_timer(unsigned long data)
+{
+	struct sock *sk = (struct sock *)data;
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	bh_lock_sock(sk);
+	if (sock_owned_by_user(sk)) {
+		/* Try again later. */
+		icsk->icsk_ack.blocked = 1;
+		NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED);
+		sk_reset_timer(sk, &icsk->icsk_delack_timer,
+			       jiffies + TCP_DELACK_MIN);
+		goto out;
+	}
+
+	if (sk->sk_state == DCCP_CLOSED ||
+	    !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
+		goto out;
+	if (time_after(icsk->icsk_ack.timeout, jiffies)) {
+		sk_reset_timer(sk, &icsk->icsk_delack_timer,
+			       icsk->icsk_ack.timeout);
+		goto out;
+	}
+
+	icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
+
+	if (inet_csk_ack_scheduled(sk)) {
+		if (!icsk->icsk_ack.pingpong) {
+			/* Delayed ACK missed: inflate ATO. */
+			icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1,
+						 icsk->icsk_rto);
+		} else {
+			/* Delayed ACK missed: leave pingpong mode and
+			 * deflate ATO.
+			 */
+			icsk->icsk_ack.pingpong = 0;
+			icsk->icsk_ack.ato = TCP_ATO_MIN;
+		}
+		dccp_send_ack(sk);
+		NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS);
+	}
+out:
+	bh_unlock_sock(sk);
+	sock_put(sk);
+}
+
+/*
+ *	The DCCP retransmit timer.
+ */
+static void dccp_retransmit_timer(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	/*
+	 * sk->sk_send_head has to have one skb with
+	 * DCCP_SKB_CB(skb)->dccpd_type set to one of the retransmittable DCCP
+	 * packet types (REQUEST, RESPONSE, the ACK in the 3way handshake
+	 * (PARTOPEN timer), etc).
+	 */
+	BUG_TRAP(sk->sk_send_head != NULL);
+
+	/* 
+	 * More than than 4MSL (8 minutes) has passed, a RESET(aborted) was
+	 * sent, no need to retransmit, this sock is dead.
+	 */
+	if (dccp_write_timeout(sk))
+		goto out;
+
+	/*
+	 * We want to know the number of packets retransmitted, not the
+	 * total number of retransmissions of clones of original packets.
+	 */
+	if (icsk->icsk_retransmits == 0)
+		DCCP_INC_STATS_BH(DCCP_MIB_TIMEOUTS);
+
+	if (dccp_retransmit_skb(sk, sk->sk_send_head) < 0) {
+		/*
+		 * Retransmission failed because of local congestion,
+		 * do not backoff.
+		 */
+		if (icsk->icsk_retransmits == 0)
+			icsk->icsk_retransmits = 1;
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+					  min(icsk->icsk_rto,
+					      TCP_RESOURCE_PROBE_INTERVAL),
+					  DCCP_RTO_MAX);
+		goto out;
+	}
+
+	icsk->icsk_backoff++;
+	icsk->icsk_retransmits++;
+
+	icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX);
+	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto,
+				  DCCP_RTO_MAX);
+	if (icsk->icsk_retransmits > 3 /* FIXME: sysctl_dccp_retries1 */)
+		__sk_dst_reset(sk);
+out:;
+}
+
+static void dccp_write_timer(unsigned long data)
+{
+	struct sock *sk = (struct sock *)data;
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	int event = 0;
+
+	bh_lock_sock(sk);
+	if (sock_owned_by_user(sk)) {
+		/* Try again later */
+		sk_reset_timer(sk, &icsk->icsk_retransmit_timer,
+			       jiffies + (HZ / 20));
+		goto out;
+	}
+
+	if (sk->sk_state == DCCP_CLOSED || !icsk->icsk_pending)
+		goto out;
+
+	if (time_after(icsk->icsk_timeout, jiffies)) {
+		sk_reset_timer(sk, &icsk->icsk_retransmit_timer,
+			       icsk->icsk_timeout);
+		goto out;
+	}
+
+	event = icsk->icsk_pending;
+	icsk->icsk_pending = 0;
+
+	switch (event) {
+	case ICSK_TIME_RETRANS:
+		dccp_retransmit_timer(sk);
+		break;
+	}
+out:
+	bh_unlock_sock(sk);
+	sock_put(sk);
+}
+
+/*
+ *	Timer for listening sockets
+ */
+static void dccp_response_timer(struct sock *sk)
+{
+	inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL, DCCP_TIMEOUT_INIT,
+				   DCCP_RTO_MAX);
+}
+
+static void dccp_keepalive_timer(unsigned long data)
+{
+	struct sock *sk = (struct sock *)data;
+
+	/* Only process if socket is not in use. */
+	bh_lock_sock(sk);
+	if (sock_owned_by_user(sk)) {
+		/* Try again later. */ 
+		inet_csk_reset_keepalive_timer(sk, HZ / 20);
+		goto out;
+	}
+
+	if (sk->sk_state == DCCP_LISTEN) {
+		dccp_response_timer(sk);
+		goto out;
+	}
+out:
+	bh_unlock_sock(sk);
+	sock_put(sk);
+}
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index acdd18e6adb..621680f127a 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -118,7 +118,7 @@ Version 0.0.6    2.1.110   07-aug-98   Eduardo Marcelo Serrat
 #include <linux/netfilter.h>
 #include <linux/seq_file.h>
 #include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
 #include <net/flow.h>
 #include <asm/system.h>
 #include <asm/ioctls.h>
@@ -1763,7 +1763,7 @@ static int dn_recvmsg(struct kiocb *iocb, struct socket *sock,
 		nskb = skb->next;
 
 		if (skb->len == 0) {
-			skb_unlink(skb);
+			skb_unlink(skb, queue);
 			kfree_skb(skb);
 			/* 
 			 * N.B. Don't refer to skb or cb after this point
@@ -2064,7 +2064,7 @@ static struct notifier_block dn_dev_notifier = {
 	.notifier_call = dn_device_event,
 };
 
-extern int dn_route_rcv(struct sk_buff *, struct net_device *, struct packet_type *);
+extern int dn_route_rcv(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *);
 
 static struct packet_type dn_dix_packet_type = {
 	.type =		__constant_htons(ETH_P_DNA_RT),
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
index 00233ecbc9c..5610bb16dbf 100644
--- a/net/decnet/dn_dev.c
+++ b/net/decnet/dn_dev.c
@@ -752,16 +752,16 @@ static void rtmsg_ifa(int event, struct dn_ifaddr *ifa)
 
 	skb = alloc_skb(size, GFP_KERNEL);
 	if (!skb) {
-		netlink_set_err(rtnl, 0, RTMGRP_DECnet_IFADDR, ENOBUFS);
+		netlink_set_err(rtnl, 0, RTNLGRP_DECnet_IFADDR, ENOBUFS);
 		return;
 	}
 	if (dn_dev_fill_ifaddr(skb, ifa, 0, 0, event, 0) < 0) {
 		kfree_skb(skb);
-		netlink_set_err(rtnl, 0, RTMGRP_DECnet_IFADDR, EINVAL);
+		netlink_set_err(rtnl, 0, RTNLGRP_DECnet_IFADDR, EINVAL);
 		return;
 	}
-	NETLINK_CB(skb).dst_groups = RTMGRP_DECnet_IFADDR;
-	netlink_broadcast(rtnl, skb, 0, RTMGRP_DECnet_IFADDR, GFP_KERNEL);
+	NETLINK_CB(skb).dst_group = RTNLGRP_DECnet_IFADDR;
+	netlink_broadcast(rtnl, skb, 0, RTNLGRP_DECnet_IFADDR, GFP_KERNEL);
 }
 
 static int dn_dev_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c
index 202dbde9850..369f25b60f3 100644
--- a/net/decnet/dn_nsp_in.c
+++ b/net/decnet/dn_nsp_in.c
@@ -60,7 +60,7 @@
 #include <linux/inet.h>
 #include <linux/route.h>
 #include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
 #include <asm/system.h>
 #include <linux/fcntl.h>
 #include <linux/mm.h>
diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c
index 8cce1fdbda9..e0bebf4bbca 100644
--- a/net/decnet/dn_nsp_out.c
+++ b/net/decnet/dn_nsp_out.c
@@ -479,7 +479,7 @@ int dn_nsp_check_xmit_queue(struct sock *sk, struct sk_buff *skb, struct sk_buff
 		xmit_count = cb2->xmit_count;
 		segnum = cb2->segnum;
 		/* Remove and drop ack'ed packet */
-		skb_unlink(ack);
+		skb_unlink(ack, q);
 		kfree_skb(ack);
 		ack = NULL;
 
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 2399fa8a3f8..2c915f305be 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -572,7 +572,7 @@ static int dn_route_ptp_hello(struct sk_buff *skb)
 	return NET_RX_SUCCESS;
 }
 
-int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
 {
 	struct dn_skb_cb *cb;
 	unsigned char flags = 0;
diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c
index 28ba5777a25..eeba56f9932 100644
--- a/net/decnet/dn_table.c
+++ b/net/decnet/dn_table.c
@@ -79,7 +79,7 @@ for( ; ((f) = *(fp)) != NULL && dn_key_eq((f)->fn_key, (key)); (fp) = &(f)->fn_n
 static DEFINE_RWLOCK(dn_fib_tables_lock);
 struct dn_fib_table *dn_fib_tables[RT_TABLE_MAX + 1];
 
-static kmem_cache_t *dn_hash_kmem;
+static kmem_cache_t *dn_hash_kmem __read_mostly;
 static int dn_fib_hash_zombies;
 
 static inline dn_fib_idx_t dn_hash(dn_fib_key_t key, struct dn_zone *dz)
@@ -349,10 +349,10 @@ static void dn_rtmsg_fib(int event, struct dn_fib_node *f, int z, int tb_id,
                 kfree_skb(skb);
                 return;
         }
-        NETLINK_CB(skb).dst_groups = RTMGRP_DECnet_ROUTE;
+        NETLINK_CB(skb).dst_group = RTNLGRP_DECnet_ROUTE;
         if (nlh->nlmsg_flags & NLM_F_ECHO)
                 atomic_inc(&skb->users);
-        netlink_broadcast(rtnl, skb, pid, RTMGRP_DECnet_ROUTE, GFP_KERNEL);
+        netlink_broadcast(rtnl, skb, pid, RTNLGRP_DECnet_ROUTE, GFP_KERNEL);
         if (nlh->nlmsg_flags & NLM_F_ECHO)
                 netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT);
 }
diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c
index 284a9998e53..1ab94c6e22e 100644
--- a/net/decnet/netfilter/dn_rtmsg.c
+++ b/net/decnet/netfilter/dn_rtmsg.c
@@ -19,6 +19,7 @@
 #include <linux/netfilter.h>
 #include <linux/spinlock.h>
 #include <linux/netlink.h>
+#include <linux/netfilter_decnet.h>
 
 #include <net/sock.h>
 #include <net/flow.h>
@@ -71,10 +72,10 @@ static void dnrmg_send_peer(struct sk_buff *skb)
 
 	switch(flags & DN_RT_CNTL_MSK) {
 		case DN_RT_PKT_L1RT:
-			group = DNRMG_L1_GROUP;
+			group = DNRNG_NLGRP_L1;
 			break;
 		case DN_RT_PKT_L2RT:
-			group = DNRMG_L2_GROUP;
+			group = DNRNG_NLGRP_L2;
 			break;
 		default:
 			return;
@@ -83,7 +84,7 @@ static void dnrmg_send_peer(struct sk_buff *skb)
 	skb2 = dnrmg_build_message(skb, &status);
 	if (skb2 == NULL)
 		return;
-	NETLINK_CB(skb2).dst_groups = group;
+	NETLINK_CB(skb2).dst_group = group;
 	netlink_broadcast(dnrmg, skb2, 0, group, GFP_ATOMIC);
 }
 
@@ -138,7 +139,8 @@ static int __init init(void)
 {
 	int rv = 0;
 
-	dnrmg = netlink_kernel_create(NETLINK_DNRTMSG, dnrmg_receive_user_sk);
+	dnrmg = netlink_kernel_create(NETLINK_DNRTMSG, DNRNG_NLGRP_MAX,
+	                              dnrmg_receive_user_sk, THIS_MODULE);
 	if (dnrmg == NULL) {
 		printk(KERN_ERR "dn_rtmsg: Cannot create netlink socket");
 		return -ENOMEM;
@@ -162,6 +164,7 @@ static void __exit fini(void)
 MODULE_DESCRIPTION("DECnet Routing Message Grabulator");
 MODULE_AUTHOR("Steven Whitehouse <steve@chygwyn.com>");
 MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_DNRTMSG);
 
 module_init(init);
 module_exit(fini);
diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c
index de691e119e1..4a62093eb34 100644
--- a/net/econet/af_econet.c
+++ b/net/econet/af_econet.c
@@ -159,7 +159,7 @@ static int econet_recvmsg(struct kiocb *iocb, struct socket *sock,
 	err = memcpy_toiovec(msg->msg_iov, skb->data, copied);
 	if (err)
 		goto out_free;
-	sk->sk_stamp = skb->stamp;
+	skb_get_timestamp(skb, &sk->sk_stamp);
 
 	if (msg->msg_name)
 		memcpy(msg->msg_name, skb->cb, msg->msg_namelen);
@@ -869,7 +869,7 @@ static void aun_tx_ack(unsigned long seq, int result)
 
 foundit:
 	tx_result(skb->sk, eb->cookie, result);
-	skb_unlink(skb);
+	skb_unlink(skb, &aun_queue);
 	spin_unlock_irqrestore(&aun_queue_lock, flags);
 	kfree_skb(skb);
 }
@@ -947,7 +947,7 @@ static void ab_cleanup(unsigned long h)
 		{
 			tx_result(skb->sk, eb->cookie, 
 				  ECTYPE_TRANSMIT_NOT_PRESENT);
-			skb_unlink(skb);
+			skb_unlink(skb, &aun_queue);
 			kfree_skb(skb);
 		}
 		skb = newskb;
@@ -1009,7 +1009,7 @@ release:
  *	Receive an Econet frame from a device.
  */
 
-static int econet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+static int econet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
 {
 	struct ec_framehdr *hdr;
 	struct sock *sk;
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index f6dbfb99b14..87a052a9a84 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -62,8 +62,6 @@
 #include <asm/system.h>
 #include <asm/checksum.h>
 
-extern int __init netdev_boot_setup(char *str);
-
 __setup("ether=", netdev_boot_setup);
 
 /*
@@ -163,7 +161,6 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
 	skb->mac.raw=skb->data;
 	skb_pull(skb,ETH_HLEN);
 	eth = eth_hdr(skb);
-	skb->input_dev = dev;
 	
 	if(*eth->h_dest&1)
 	{
diff --git a/net/ethernet/sysctl_net_ether.c b/net/ethernet/sysctl_net_ether.c
index b81a6d53234..66b39fc342d 100644
--- a/net/ethernet/sysctl_net_ether.c
+++ b/net/ethernet/sysctl_net_ether.c
@@ -7,6 +7,7 @@
 
 #include <linux/mm.h>
 #include <linux/sysctl.h>
+#include <linux/if_ether.h>
 
 ctl_table ether_table[] = {
 	{0}
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 0b3d9f1d806..e55136ae09f 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -413,20 +413,19 @@ config INET_TUNNEL
 	  
 	  If unsure, say Y.
 
-config IP_TCPDIAG
-	tristate "IP: TCP socket monitoring interface"
+config INET_DIAG
+	tristate "INET: socket monitoring interface"
 	default y
 	---help---
-	  Support for TCP socket monitoring interface used by native Linux
-	  tools such as ss. ss is included in iproute2, currently downloadable
-	  at <http://developer.osdl.org/dev/iproute2>. If you want IPv6 support
-	  and have selected IPv6 as a module, you need to build this as a
-	  module too.
+	  Support for INET (TCP, DCCP, etc) socket monitoring interface used by
+	  native Linux tools such as ss. ss is included in iproute2, currently
+	  downloadable at <http://developer.osdl.org/dev/iproute2>. 
 	  
 	  If unsure, say Y.
 
-config IP_TCPDIAG_IPV6
-	def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6)
+config INET_TCP_DIAG
+	depends on INET_DIAG
+	def_tristate INET_DIAG
 
 config TCP_CONG_ADVANCED
 	bool "TCP: advanced congestion control"
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 55dc6cca1e7..f0435d00db6 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -4,11 +4,12 @@
 
 obj-y     := route.o inetpeer.o protocol.o \
 	     ip_input.o ip_fragment.o ip_forward.o ip_options.o \
-	     ip_output.o ip_sockglue.o \
+	     ip_output.o ip_sockglue.o inet_hashtables.o \
+	     inet_timewait_sock.o inet_connection_sock.o \
 	     tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
 	     tcp_minisocks.o tcp_cong.o \
 	     datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \
-	     sysctl_net_ipv4.o fib_frontend.o fib_semantics.o
+	     sysctl_net_ipv4.o fib_frontend.o fib_semantics.o netfilter.o
 
 obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o
 obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o
@@ -29,8 +30,9 @@ obj-$(CONFIG_IP_ROUTE_MULTIPATH_WRANDOM) += multipath_wrandom.o
 obj-$(CONFIG_IP_ROUTE_MULTIPATH_DRR) += multipath_drr.o
 obj-$(CONFIG_NETFILTER)	+= netfilter/
 obj-$(CONFIG_IP_VS) += ipvs/
-obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o 
+obj-$(CONFIG_INET_DIAG) += inet_diag.o 
 obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o
+obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
 obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
 obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
 obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 163ae4068b5..bf147f8db39 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -99,6 +99,7 @@
 #include <net/arp.h>
 #include <net/route.h>
 #include <net/ip_fib.h>
+#include <net/inet_connection_sock.h>
 #include <net/tcp.h>
 #include <net/udp.h>
 #include <linux/skbuff.h>
@@ -112,11 +113,7 @@
 #include <linux/mroute.h>
 #endif
 
-DEFINE_SNMP_STAT(struct linux_mib, net_statistics);
-
-#ifdef INET_REFCNT_DEBUG
-atomic_t inet_sock_nr;
-#endif
+DEFINE_SNMP_STAT(struct linux_mib, net_statistics) __read_mostly;
 
 extern void ip_mc_drop_socket(struct sock *sk);
 
@@ -153,11 +150,7 @@ void inet_sock_destruct(struct sock *sk)
 	if (inet->opt)
 		kfree(inet->opt);
 	dst_release(sk->sk_dst_cache);
-#ifdef INET_REFCNT_DEBUG
-	atomic_dec(&inet_sock_nr);
-	printk(KERN_DEBUG "INET socket %p released, %d are still alive\n",
-	       sk, atomic_read(&inet_sock_nr));
-#endif
+	sk_refcnt_debug_dec(sk);
 }
 
 /*
@@ -210,7 +203,7 @@ int inet_listen(struct socket *sock, int backlog)
 	 * we can only allow the backlog to be adjusted.
 	 */
 	if (old_state != TCP_LISTEN) {
-		err = tcp_listen_start(sk);
+		err = inet_csk_listen_start(sk, TCP_SYNQ_HSIZE);
 		if (err)
 			goto out;
 	}
@@ -235,12 +228,14 @@ static int inet_create(struct socket *sock, int protocol)
 	struct proto *answer_prot;
 	unsigned char answer_flags;
 	char answer_no_check;
-	int err;
+	int try_loading_module = 0;
+	int err = -ESOCKTNOSUPPORT;
 
 	sock->state = SS_UNCONNECTED;
 
 	/* Look for the requested type/protocol pair. */
 	answer = NULL;
+lookup_protocol:
 	rcu_read_lock();
 	list_for_each_rcu(p, &inetsw[sock->type]) {
 		answer = list_entry(p, struct inet_protosw, list);
@@ -261,9 +256,28 @@ static int inet_create(struct socket *sock, int protocol)
 		answer = NULL;
 	}
 
-	err = -ESOCKTNOSUPPORT;
-	if (!answer)
-		goto out_rcu_unlock;
+	if (unlikely(answer == NULL)) {
+		if (try_loading_module < 2) {
+			rcu_read_unlock();
+			/*
+			 * Be more specific, e.g. net-pf-2-proto-132-type-1
+			 * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
+			 */
+			if (++try_loading_module == 1)
+				request_module("net-pf-%d-proto-%d-type-%d",
+					       PF_INET, protocol, sock->type);
+			/*
+			 * Fall back to generic, e.g. net-pf-2-proto-132
+			 * (net-pf-PF_INET-proto-IPPROTO_SCTP)
+			 */
+			else
+				request_module("net-pf-%d-proto-%d",
+					       PF_INET, protocol);
+			goto lookup_protocol;
+		} else
+			goto out_rcu_unlock;
+	}
+
 	err = -EPERM;
 	if (answer->capability > 0 && !capable(answer->capability))
 		goto out_rcu_unlock;
@@ -317,9 +331,7 @@ static int inet_create(struct socket *sock, int protocol)
 	inet->mc_index	= 0;
 	inet->mc_list	= NULL;
 
-#ifdef INET_REFCNT_DEBUG
-	atomic_inc(&inet_sock_nr);
-#endif
+	sk_refcnt_debug_inc(sk);
 
 	if (inet->num) {
 		/* It assumes that any protocol which allows
@@ -847,10 +859,6 @@ static struct net_proto_family inet_family_ops = {
 	.owner	= THIS_MODULE,
 };
 
-
-extern void tcp_init(void);
-extern void tcp_v4_init(struct net_proto_family *);
-
 /* Upon startup we insert all the elements in inetsw_array[] into
  * the linked list inetsw.
  */
@@ -961,6 +969,119 @@ void inet_unregister_protosw(struct inet_protosw *p)
 	}
 }
 
+/*
+ *      Shall we try to damage output packets if routing dev changes?
+ */
+
+int sysctl_ip_dynaddr;
+
+static int inet_sk_reselect_saddr(struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	int err;
+	struct rtable *rt;
+	__u32 old_saddr = inet->saddr;
+	__u32 new_saddr;
+	__u32 daddr = inet->daddr;
+
+	if (inet->opt && inet->opt->srr)
+		daddr = inet->opt->faddr;
+
+	/* Query new route. */
+	err = ip_route_connect(&rt, daddr, 0,
+			       RT_CONN_FLAGS(sk),
+			       sk->sk_bound_dev_if,
+			       sk->sk_protocol,
+			       inet->sport, inet->dport, sk);
+	if (err)
+		return err;
+
+	sk_setup_caps(sk, &rt->u.dst);
+
+	new_saddr = rt->rt_src;
+
+	if (new_saddr == old_saddr)
+		return 0;
+
+	if (sysctl_ip_dynaddr > 1) {
+		printk(KERN_INFO "%s(): shifting inet->"
+				 "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
+		       __FUNCTION__,
+		       NIPQUAD(old_saddr),
+		       NIPQUAD(new_saddr));
+	}
+
+	inet->saddr = inet->rcv_saddr = new_saddr;
+
+	/*
+	 * XXX The only one ugly spot where we need to
+	 * XXX really change the sockets identity after
+	 * XXX it has entered the hashes. -DaveM
+	 *
+	 * Besides that, it does not check for connection
+	 * uniqueness. Wait for troubles.
+	 */
+	__sk_prot_rehash(sk);
+	return 0;
+}
+
+int inet_sk_rebuild_header(struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
+	u32 daddr;
+	int err;
+
+	/* Route is OK, nothing to do. */
+	if (rt)
+		return 0;
+
+	/* Reroute. */
+	daddr = inet->daddr;
+	if (inet->opt && inet->opt->srr)
+		daddr = inet->opt->faddr;
+{
+	struct flowi fl = {
+		.oif = sk->sk_bound_dev_if,
+		.nl_u = {
+			.ip4_u = {
+				.daddr	= daddr,
+				.saddr	= inet->saddr,
+				.tos	= RT_CONN_FLAGS(sk),
+			},
+		},
+		.proto = sk->sk_protocol,
+		.uli_u = {
+			.ports = {
+				.sport = inet->sport,
+				.dport = inet->dport,
+			},
+		},
+	};
+						
+	err = ip_route_output_flow(&rt, &fl, sk, 0);
+}
+	if (!err)
+		sk_setup_caps(sk, &rt->u.dst);
+	else {
+		/* Routing failed... */
+		sk->sk_route_caps = 0;
+		/*
+		 * Other protocols have to map its equivalent state to TCP_SYN_SENT.
+		 * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme
+		 */
+		if (!sysctl_ip_dynaddr ||
+		    sk->sk_state != TCP_SYN_SENT ||
+		    (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
+		    (err = inet_sk_reselect_saddr(sk)) != 0)
+			sk->sk_err_soft = -err;
+	}
+
+	return err;
+}
+
+EXPORT_SYMBOL(inet_sk_rebuild_header);
+
 #ifdef CONFIG_IP_MULTICAST
 static struct net_protocol igmp_protocol = {
 	.handler =	igmp_rcv,
@@ -1007,7 +1128,6 @@ static int __init init_ipv4_mibs(void)
 }
 
 static int ipv4_proc_init(void);
-extern void ipfrag_init(void);
 
 /*
  *	IP protocol layer initialiser
@@ -1128,19 +1248,10 @@ module_init(inet_init);
 /* ------------------------------------------------------------------------ */
 
 #ifdef CONFIG_PROC_FS
-extern int  fib_proc_init(void);
-extern void fib_proc_exit(void);
 #ifdef CONFIG_IP_FIB_TRIE
 extern int  fib_stat_proc_init(void);
 extern void fib_stat_proc_exit(void);
 #endif
-extern int  ip_misc_proc_init(void);
-extern int  raw_proc_init(void);
-extern void raw_proc_exit(void);
-extern int  tcp4_proc_init(void);
-extern void tcp4_proc_exit(void);
-extern int  udp4_proc_init(void);
-extern void udp4_proc_exit(void);
 
 static int __init ipv4_proc_init(void)
 {
@@ -1205,7 +1316,3 @@ EXPORT_SYMBOL(inet_stream_ops);
 EXPORT_SYMBOL(inet_unregister_protosw);
 EXPORT_SYMBOL(net_statistics);
 EXPORT_SYMBOL(sysctl_ip_nonlocal_bind);
-
-#ifdef INET_REFCNT_DEBUG
-EXPORT_SYMBOL(inet_sock_nr);
-#endif
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index a642fd61285..8bf312bdea1 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -700,7 +700,7 @@ void arp_send(int type, int ptype, u32 dest_ip,
 static void parp_redo(struct sk_buff *skb)
 {
 	nf_reset(skb);
-	arp_rcv(skb, skb->dev, NULL);
+	arp_rcv(skb, skb->dev, NULL, skb->dev);
 }
 
 /*
@@ -865,7 +865,7 @@ static int arp_process(struct sk_buff *skb)
 				if (n)
 					neigh_release(n);
 
-				if (skb->stamp.tv_sec == LOCALLY_ENQUEUED || 
+				if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED || 
 				    skb->pkt_type == PACKET_HOST ||
 				    in_dev->arp_parms->proxy_delay == 0) {
 					arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
@@ -927,7 +927,7 @@ out:
  *	Receive an arp request from the device layer.
  */
 
-int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
 {
 	struct arphdr *arp;
 
@@ -948,6 +948,8 @@ int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
 	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
 		goto out_of_mem;
 
+	memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
+
 	return NF_HOOK(NF_ARP, NF_ARP_IN, skb, dev, NULL, arp_process);
 
 freeskb:
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index b1db561f254..c1b42b5257f 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -16,9 +16,10 @@
 #include <linux/module.h>
 #include <linux/ip.h>
 #include <linux/in.h>
+#include <net/ip.h>
 #include <net/sock.h>
-#include <net/tcp.h>
 #include <net/route.h>
+#include <net/tcp_states.h>
 
 int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 {
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index d8a10e3dd77..ba2895ae815 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1111,13 +1111,12 @@ static void rtmsg_ifa(int event, struct in_ifaddr* ifa)
 	struct sk_buff *skb = alloc_skb(size, GFP_KERNEL);
 
 	if (!skb)
-		netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, ENOBUFS);
+		netlink_set_err(rtnl, 0, RTNLGRP_IPV4_IFADDR, ENOBUFS);
 	else if (inet_fill_ifaddr(skb, ifa, current->pid, 0, event, 0) < 0) {
 		kfree_skb(skb);
-		netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, EINVAL);
+		netlink_set_err(rtnl, 0, RTNLGRP_IPV4_IFADDR, EINVAL);
 	} else {
-		NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_IFADDR;
-		netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV4_IFADDR, GFP_KERNEL);
+		netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV4_IFADDR, GFP_KERNEL);
 	}
 }
 
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index ba57446d5d1..b31ffc5053d 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -331,8 +331,8 @@ static void esp4_err(struct sk_buff *skb, u32 info)
 	x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET);
 	if (!x)
 		return;
-	NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
-			ntohl(esph->spi), ntohl(iph->daddr)));
+	NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
+		 ntohl(esph->spi), ntohl(iph->daddr));
 	xfrm_state_put(x);
 }
 
@@ -395,10 +395,10 @@ static int esp_init_state(struct xfrm_state *x)
 
 		if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
 		    crypto_tfm_alg_digestsize(esp->auth.tfm)) {
-			NETDEBUG(printk(KERN_INFO "ESP: %s digestsize %u != %hu\n",
-			       x->aalg->alg_name,
-			       crypto_tfm_alg_digestsize(esp->auth.tfm),
-			       aalg_desc->uinfo.auth.icv_fullbits/8));
+			NETDEBUG(KERN_INFO "ESP: %s digestsize %u != %hu\n",
+				 x->aalg->alg_name,
+				 crypto_tfm_alg_digestsize(esp->auth.tfm),
+				 aalg_desc->uinfo.auth.icv_fullbits/8);
 			goto error;
 		}
 
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index cd8e45ab958..4e1379f7126 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -558,16 +558,15 @@ static void nl_fib_input(struct sock *sk, int len)
 	nl_fib_lookup(frn, tb);
 	
 	pid = nlh->nlmsg_pid;           /*pid of sending process */
-	NETLINK_CB(skb).groups = 0;     /* not in mcast group */
 	NETLINK_CB(skb).pid = 0;         /* from kernel */
 	NETLINK_CB(skb).dst_pid = pid;
-	NETLINK_CB(skb).dst_groups = 0;  /* unicast */
+	NETLINK_CB(skb).dst_group = 0;  /* unicast */
 	netlink_unicast(sk, skb, pid, MSG_DONTWAIT);
 }    
 
 static void nl_fib_lookup_init(void)
 {
-      netlink_kernel_create(NETLINK_FIB_LOOKUP, nl_fib_input);
+      netlink_kernel_create(NETLINK_FIB_LOOKUP, 0, nl_fib_input, THIS_MODULE);
 }
 
 static void fib_disable_ip(struct net_device *dev, int force)
@@ -662,5 +661,4 @@ void __init ip_fib_init(void)
 }
 
 EXPORT_SYMBOL(inet_addr_type);
-EXPORT_SYMBOL(ip_dev_find);
 EXPORT_SYMBOL(ip_rt_ioctl);
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index b10d6bb5ef3..2a8c9afc369 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -45,8 +45,8 @@
 
 #include "fib_lookup.h"
 
-static kmem_cache_t *fn_hash_kmem;
-static kmem_cache_t *fn_alias_kmem;
+static kmem_cache_t *fn_hash_kmem __read_mostly;
+static kmem_cache_t *fn_alias_kmem __read_mostly;
 
 struct fib_node {
 	struct hlist_node	fn_hash;
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index b729d97cfa9..ef6609ea0eb 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -7,6 +7,7 @@
 
 struct fib_alias {
 	struct list_head	fa_list;
+	struct rcu_head rcu;
 	struct fib_info		*fa_info;
 	u8			fa_tos;
 	u8			fa_type;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index e278cb9d007..d41219e8037 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -290,10 +290,10 @@ void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
 		kfree_skb(skb);
 		return;
 	}
-	NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_ROUTE;
+	NETLINK_CB(skb).dst_group = RTNLGRP_IPV4_ROUTE;
 	if (n->nlmsg_flags&NLM_F_ECHO)
 		atomic_inc(&skb->users);
-	netlink_broadcast(rtnl, skb, pid, RTMGRP_IPV4_ROUTE, GFP_KERNEL);
+	netlink_broadcast(rtnl, skb, pid, RTNLGRP_IPV4_ROUTE, GFP_KERNEL);
 	if (n->nlmsg_flags&NLM_F_ECHO)
 		netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT);
 }
@@ -854,6 +854,7 @@ failure:
 	return NULL;
 }
 
+/* Note! fib_semantic_match intentionally uses  RCU list functions. */
 int fib_semantic_match(struct list_head *head, const struct flowi *flp,
 		       struct fib_result *res, __u32 zone, __u32 mask, 
 			int prefixlen)
@@ -861,7 +862,7 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp,
 	struct fib_alias *fa;
 	int nh_sel = 0;
 
-	list_for_each_entry(fa, head, fa_list) {
+	list_for_each_entry_rcu(fa, head, fa_list) {
 		int err;
 
 		if (fa->fa_tos &&
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 45efd5f4741..b2dea4e5da7 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -43,7 +43,7 @@
  *		2 of the License, or (at your option) any later version.
  */
 
-#define VERSION "0.325"
+#define VERSION "0.402"
 
 #include <linux/config.h>
 #include <asm/uaccess.h>
@@ -62,6 +62,7 @@
 #include <linux/netdevice.h>
 #include <linux/if_arp.h>
 #include <linux/proc_fs.h>
+#include <linux/rcupdate.h>
 #include <linux/skbuff.h>
 #include <linux/netlink.h>
 #include <linux/init.h>
@@ -77,56 +78,55 @@
 #undef CONFIG_IP_FIB_TRIE_STATS
 #define MAX_CHILDS 16384
 
-#define EXTRACT(p, n, str) ((str)<<(p)>>(32-(n)))
 #define KEYLENGTH (8*sizeof(t_key))
 #define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
 #define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
 
-static DEFINE_RWLOCK(fib_lock);
-
 typedef unsigned int t_key;
 
 #define T_TNODE 0
 #define T_LEAF  1
 #define NODE_TYPE_MASK	0x1UL
-#define NODE_PARENT(_node) \
-	((struct tnode *)((_node)->_parent & ~NODE_TYPE_MASK))
-#define NODE_SET_PARENT(_node, _ptr) \
-	((_node)->_parent = (((unsigned long)(_ptr)) | \
-                     ((_node)->_parent & NODE_TYPE_MASK)))
-#define NODE_INIT_PARENT(_node, _type) \
-	((_node)->_parent = (_type))
-#define NODE_TYPE(_node) \
-	((_node)->_parent & NODE_TYPE_MASK)
-
-#define IS_TNODE(n) (!(n->_parent & T_LEAF))
-#define IS_LEAF(n) (n->_parent & T_LEAF)
+#define NODE_PARENT(node) \
+	((struct tnode *)rcu_dereference(((node)->parent & ~NODE_TYPE_MASK)))
+
+#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
+
+#define NODE_SET_PARENT(node, ptr)		\
+	rcu_assign_pointer((node)->parent,	\
+			   ((unsigned long)(ptr)) | NODE_TYPE(node))
+
+#define IS_TNODE(n) (!(n->parent & T_LEAF))
+#define IS_LEAF(n) (n->parent & T_LEAF)
 
 struct node {
-        t_key key;
-	unsigned long _parent;
+	t_key key;
+	unsigned long parent;
 };
 
 struct leaf {
-        t_key key;
-	unsigned long _parent;
+	t_key key;
+	unsigned long parent;
 	struct hlist_head list;
+	struct rcu_head rcu;
 };
 
 struct leaf_info {
 	struct hlist_node hlist;
+	struct rcu_head rcu;
 	int plen;
 	struct list_head falh;
 };
 
 struct tnode {
-        t_key key;
-	unsigned long _parent;
-        unsigned short pos:5;        /* 2log(KEYLENGTH) bits needed */
-        unsigned short bits:5;       /* 2log(KEYLENGTH) bits needed */
-        unsigned short full_children;  /* KEYLENGTH bits needed */
-        unsigned short empty_children; /* KEYLENGTH bits needed */
-        struct node *child[0];
+	t_key key;
+	unsigned long parent;
+	unsigned short pos:5;		/* 2log(KEYLENGTH) bits needed */
+	unsigned short bits:5;		/* 2log(KEYLENGTH) bits needed */
+	unsigned short full_children;	/* KEYLENGTH bits needed */
+	unsigned short empty_children;	/* KEYLENGTH bits needed */
+	struct rcu_head rcu;
+	struct node *child[0];
 };
 
 #ifdef CONFIG_IP_FIB_TRIE_STATS
@@ -150,77 +150,45 @@ struct trie_stat {
 };
 
 struct trie {
-        struct node *trie;
+	struct node *trie;
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 	struct trie_use_stats stats;
 #endif
-        int size;
+	int size;
 	unsigned int revision;
 };
 
-static int trie_debug = 0;
-
-static int tnode_full(struct tnode *tn, struct node *n);
 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
-static int tnode_child_length(struct tnode *tn);
 static struct node *resize(struct trie *t, struct tnode *tn);
-static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err);
-static struct tnode *halve(struct trie *t, struct tnode *tn, int *err);
+static struct tnode *inflate(struct trie *t, struct tnode *tn);
+static struct tnode *halve(struct trie *t, struct tnode *tn);
 static void tnode_free(struct tnode *tn);
 static void trie_dump_seq(struct seq_file *seq, struct trie *t);
-extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
-extern int fib_detect_death(struct fib_info *fi, int order,
-                            struct fib_info **last_resort, int *last_idx, int *dflt);
-
-extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, int z, int tb_id,
-               struct nlmsghdr *n, struct netlink_skb_parms *req);
 
-static kmem_cache_t *fn_alias_kmem;
+static kmem_cache_t *fn_alias_kmem __read_mostly;
 static struct trie *trie_local = NULL, *trie_main = NULL;
 
-static void trie_bug(char *err)
-{
-	printk("Trie Bug: %s\n", err);
-	BUG();
-}
+
+/* rcu_read_lock needs to be hold by caller from readside */
 
 static inline struct node *tnode_get_child(struct tnode *tn, int i)
 {
-        if (i >= 1<<tn->bits)
-                trie_bug("tnode_get_child");
+	BUG_ON(i >= 1 << tn->bits);
 
-        return tn->child[i];
+	return rcu_dereference(tn->child[i]);
 }
 
-static inline int tnode_child_length(struct tnode *tn)
+static inline int tnode_child_length(const struct tnode *tn)
 {
-        return 1<<tn->bits;
+	return 1 << tn->bits;
 }
 
-/*
-  _________________________________________________________________
-  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
-  ----------------------------------------------------------------
-    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
-
-  _________________________________________________________________
-  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
-  -----------------------------------------------------------------
-   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
-
-  tp->pos = 7
-  tp->bits = 3
-  n->pos = 15
-  n->bits=4
-  KEYLENGTH=32
-*/
-
 static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
 {
-        if (offset < KEYLENGTH)
+	if (offset < KEYLENGTH)
 		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
-        else
+	else
 		return 0;
 }
 
@@ -233,8 +201,8 @@ static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
 {
 	if (bits == 0 || offset >= KEYLENGTH)
 		return 1;
-        bits = bits > KEYLENGTH ? KEYLENGTH : bits;
-        return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
+	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
+	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
 }
 
 static inline int tkey_mismatch(t_key a, int offset, t_key b)
@@ -249,14 +217,6 @@ static inline int tkey_mismatch(t_key a, int offset, t_key b)
 	return i;
 }
 
-/* Candiate for fib_semantics */
-
-static void fn_free_alias(struct fib_alias *fa)
-{
-	fib_release_info(fa->fa_info);
-	kmem_cache_free(fn_alias_kmem, fa);
-}
-
 /*
   To understand this stuff, an understanding of keys and all their bits is 
   necessary. Every node in the trie has a key associated with it, but not 
@@ -295,7 +255,7 @@ static void fn_free_alias(struct fib_alias *fa)
   tp->pos = 7
   tp->bits = 3
   n->pos = 15
-  n->bits=4
+  n->bits = 4
 
   First, let's just ignore the bits that come before the parent tp, that is 
   the bits from 0 to (tp->pos-1). They are *known* but at this point we do 
@@ -320,60 +280,65 @@ static void fn_free_alias(struct fib_alias *fa)
 
 */
 
-static void check_tnode(struct tnode *tn)
+static inline void check_tnode(const struct tnode *tn)
 {
-	if (tn && tn->pos+tn->bits > 32) {
-		printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn, tn->pos, tn->bits);
-	}
+	WARN_ON(tn && tn->pos+tn->bits > 32);
 }
 
 static int halve_threshold = 25;
 static int inflate_threshold = 50;
 
-static struct leaf *leaf_new(void)
+
+static void __alias_free_mem(struct rcu_head *head)
 {
-	struct leaf *l = kmalloc(sizeof(struct leaf),  GFP_KERNEL);
-	if (l) {
-		NODE_INIT_PARENT(l, T_LEAF);
-		INIT_HLIST_HEAD(&l->list);
-	}
-	return l;
+	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
+	kmem_cache_free(fn_alias_kmem, fa);
 }
 
-static struct leaf_info *leaf_info_new(int plen)
+static inline void alias_free_mem_rcu(struct fib_alias *fa)
 {
-	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
-	if (li) {
-		li->plen = plen;
-		INIT_LIST_HEAD(&li->falh);
-	}
-	return li;
+	call_rcu(&fa->rcu, __alias_free_mem);
+}
+
+static void __leaf_free_rcu(struct rcu_head *head)
+{
+	kfree(container_of(head, struct leaf, rcu));
+}
+
+static inline void free_leaf(struct leaf *leaf)
+{
+	call_rcu(&leaf->rcu, __leaf_free_rcu);
 }
 
-static inline void free_leaf(struct leaf *l)
+static void __leaf_info_free_rcu(struct rcu_head *head)
 {
-	kfree(l);
+	kfree(container_of(head, struct leaf_info, rcu));
 }
 
-static inline void free_leaf_info(struct leaf_info *li)
+static inline void free_leaf_info(struct leaf_info *leaf)
 {
-	kfree(li);
+	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
 }
 
 static struct tnode *tnode_alloc(unsigned int size)
 {
-	if (size <= PAGE_SIZE) {
-		return kmalloc(size, GFP_KERNEL);
-	} else {
-		return (struct tnode *)
-			__get_free_pages(GFP_KERNEL, get_order(size));
-	}
+	struct page *pages;
+
+	if (size <= PAGE_SIZE)
+		return kcalloc(size, 1, GFP_KERNEL);
+
+	pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
+	if (!pages)
+		return NULL;
+
+	return page_address(pages);
 }
 
-static void __tnode_free(struct tnode *tn)
+static void __tnode_free_rcu(struct rcu_head *head)
 {
+	struct tnode *tn = container_of(head, struct tnode, rcu);
 	unsigned int size = sizeof(struct tnode) +
-	                    (1<<tn->bits) * sizeof(struct node *);
+		(1 << tn->bits) * sizeof(struct node *);
 
 	if (size <= PAGE_SIZE)
 		kfree(tn);
@@ -381,15 +346,40 @@ static void __tnode_free(struct tnode *tn)
 		free_pages((unsigned long)tn, get_order(size));
 }
 
+static inline void tnode_free(struct tnode *tn)
+{
+	call_rcu(&tn->rcu, __tnode_free_rcu);
+}
+
+static struct leaf *leaf_new(void)
+{
+	struct leaf *l = kmalloc(sizeof(struct leaf),  GFP_KERNEL);
+	if (l) {
+		l->parent = T_LEAF;
+		INIT_HLIST_HEAD(&l->list);
+	}
+	return l;
+}
+
+static struct leaf_info *leaf_info_new(int plen)
+{
+	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
+	if (li) {
+		li->plen = plen;
+		INIT_LIST_HEAD(&li->falh);
+	}
+	return li;
+}
+
 static struct tnode* tnode_new(t_key key, int pos, int bits)
 {
 	int nchildren = 1<<bits;
 	int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
 	struct tnode *tn = tnode_alloc(sz);
 
-	if (tn)  {
+	if (tn) {
 		memset(tn, 0, sz);
-		NODE_INIT_PARENT(tn, T_TNODE);
+		tn->parent = T_TNODE;
 		tn->pos = pos;
 		tn->bits = bits;
 		tn->key = key;
@@ -397,38 +387,17 @@ static struct tnode* tnode_new(t_key key, int pos, int bits)
 		tn->empty_children = 1<<bits;
 	}
 
-	if (trie_debug > 0)
-		printk("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
-		       (unsigned int) (sizeof(struct node) * 1<<bits));
+	pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
+		 (unsigned int) (sizeof(struct node) * 1<<bits));
 	return tn;
 }
 
-static void tnode_free(struct tnode *tn)
-{
-	if (!tn) {
-		trie_bug("tnode_free\n");
-	}
-	if (IS_LEAF(tn)) {
-		free_leaf((struct leaf *)tn);
-		if (trie_debug > 0 )
-			printk("FL %p \n", tn);
-	}
-	else if (IS_TNODE(tn)) {
-		__tnode_free(tn);
-		if (trie_debug > 0 )
-			printk("FT %p \n", tn);
-	}
-	else {
-		trie_bug("tnode_free\n");
-	}
-}
-
 /*
  * Check whether a tnode 'n' is "full", i.e. it is an internal node
  * and no bits are skipped. See discussion in dyntree paper p. 6
  */
 
-static inline int tnode_full(struct tnode *tn, struct node *n)
+static inline int tnode_full(const struct tnode *tn, const struct node *n)
 {
 	if (n == NULL || IS_LEAF(n))
 		return 0;
@@ -448,15 +417,11 @@ static inline void put_child(struct trie *t, struct tnode *tn, int i, struct nod
 
 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
 {
-	struct node *chi;
+	struct node *chi = tn->child[i];
 	int isfull;
 
-	if (i >= 1<<tn->bits) {
-		printk("bits=%d, i=%d\n", tn->bits, i);
-		trie_bug("tnode_put_child_reorg bits");
-	}
-	write_lock_bh(&fib_lock);
-	chi = tn->child[i];
+	BUG_ON(i >= 1<<tn->bits);
+
 
 	/* update emptyChildren */
 	if (n == NULL && chi != NULL)
@@ -465,33 +430,32 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int w
 		tn->empty_children--;
 
 	/* update fullChildren */
-        if (wasfull == -1)
+	if (wasfull == -1)
 		wasfull = tnode_full(tn, chi);
 
 	isfull = tnode_full(tn, n);
 	if (wasfull && !isfull)
 		tn->full_children--;
-
 	else if (!wasfull && isfull)
 		tn->full_children++;
+
 	if (n)
 		NODE_SET_PARENT(n, tn);
 
-	tn->child[i] = n;
-	write_unlock_bh(&fib_lock);
+	rcu_assign_pointer(tn->child[i], n);
 }
 
 static struct node *resize(struct trie *t, struct tnode *tn)
 {
 	int i;
 	int err = 0;
+	struct tnode *old_tn;
 
  	if (!tn)
 		return NULL;
 
-	if (trie_debug)
-		printk("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
-		      tn, inflate_threshold, halve_threshold);
+	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
+		 tn, inflate_threshold, halve_threshold);
 
 	/* No children */
 	if (tn->empty_children == tnode_child_length(tn)) {
@@ -501,20 +465,16 @@ static struct node *resize(struct trie *t, struct tnode *tn)
 	/* One child */
 	if (tn->empty_children == tnode_child_length(tn) - 1)
 		for (i = 0; i < tnode_child_length(tn); i++) {
+			struct node *n;
 
-			write_lock_bh(&fib_lock);
-			if (tn->child[i] != NULL) {
-
-				/* compress one level */
-				struct node *n = tn->child[i];
-				if (n)
-					NODE_INIT_PARENT(n, NODE_TYPE(n));
+			n = tn->child[i];
+			if (!n)
+				continue;
 
-				write_unlock_bh(&fib_lock);
-				tnode_free(tn);
-				return n;
-			}
-			write_unlock_bh(&fib_lock);
+			/* compress one level */
+			NODE_SET_PARENT(n, NULL);
+			tnode_free(tn);
+			return n;
 		}
 	/*
 	 * Double as long as the resulting node has a number of
@@ -566,16 +526,16 @@ static struct node *resize(struct trie *t, struct tnode *tn)
 	 *
 	 * expand not_to_be_doubled and to_be_doubled, and shorten:
 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
-	 *    tn->full_children ) >= inflate_threshold * new_child_length
+	 *    tn->full_children) >= inflate_threshold * new_child_length
 	 *
 	 * expand new_child_length:
 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
-	 *    tn->full_children ) >=
+	 *    tn->full_children) >=
 	 *      inflate_threshold * tnode_child_length(tn) * 2
 	 *
 	 * shorten again:
 	 * 50 * (tn->full_children + tnode_child_length(tn) -
-	 *    tn->empty_children ) >= inflate_threshold *
+	 *    tn->empty_children) >= inflate_threshold *
 	 *    tnode_child_length(tn)
 	 *
 	 */
@@ -587,9 +547,10 @@ static struct node *resize(struct trie *t, struct tnode *tn)
 	       50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
 				inflate_threshold * tnode_child_length(tn))) {
 
-		tn = inflate(t, tn, &err);
-
-		if (err) {
+		old_tn = tn;
+		tn = inflate(t, tn);
+		if (IS_ERR(tn)) {
+			tn = old_tn;
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 			t->stats.resize_node_skipped++;
 #endif
@@ -609,9 +570,10 @@ static struct node *resize(struct trie *t, struct tnode *tn)
 	       100 * (tnode_child_length(tn) - tn->empty_children) <
 	       halve_threshold * tnode_child_length(tn)) {
 
-		tn = halve(t, tn, &err);
-
-		if (err) {
+		old_tn = tn;
+		tn = halve(t, tn);
+		if (IS_ERR(tn)) {
+			tn = old_tn;
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 			t->stats.resize_node_skipped++;
 #endif
@@ -621,44 +583,37 @@ static struct node *resize(struct trie *t, struct tnode *tn)
 
 
 	/* Only one child remains */
-
 	if (tn->empty_children == tnode_child_length(tn) - 1)
 		for (i = 0; i < tnode_child_length(tn); i++) {
-		
-			write_lock_bh(&fib_lock);
-			if (tn->child[i] != NULL) {
-				/* compress one level */
-				struct node *n = tn->child[i];
-
-				if (n)
-					NODE_INIT_PARENT(n, NODE_TYPE(n));
-
-				write_unlock_bh(&fib_lock);
-				tnode_free(tn);
-				return n;
-			}
-			write_unlock_bh(&fib_lock);
+			struct node *n;
+
+			n = tn->child[i];
+			if (!n)
+				continue;
+
+			/* compress one level */
+
+			NODE_SET_PARENT(n, NULL);
+			tnode_free(tn);
+			return n;
 		}
 
 	return (struct node *) tn;
 }
 
-static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
+static struct tnode *inflate(struct trie *t, struct tnode *tn)
 {
 	struct tnode *inode;
 	struct tnode *oldtnode = tn;
 	int olen = tnode_child_length(tn);
 	int i;
 
-  	if (trie_debug)
-		printk("In inflate\n");
+	pr_debug("In inflate\n");
 
 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
 
-	if (!tn) {
-		*err = -ENOMEM;
-		return oldtnode;
-	}
+	if (!tn)
+		return ERR_PTR(-ENOMEM);
 
 	/*
 	 * Preallocate and store tnodes before the actual work so we
@@ -666,8 +621,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
 	 * fails. In case of failure we return the oldnode and  inflate
 	 * of tnode is ignored.
 	 */
-		
-	for(i = 0; i < olen; i++) {
+
+	for (i = 0; i < olen; i++) {
 		struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
 
 		if (inode &&
@@ -675,46 +630,30 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
 		    inode->pos == oldtnode->pos + oldtnode->bits &&
 		    inode->bits > 1) {
 			struct tnode *left, *right;
-
 			t_key m = TKEY_GET_MASK(inode->pos, 1);
 
 			left = tnode_new(inode->key&(~m), inode->pos + 1,
 					 inode->bits - 1);
+			if (!left)
+				goto nomem;
 
-			if (!left) {
-				*err = -ENOMEM;
-				break;
-			}
-		
 			right = tnode_new(inode->key|m, inode->pos + 1,
 					  inode->bits - 1);
 
-			if (!right) {
-				*err = -ENOMEM;
-				break;
-			}
+                        if (!right) {
+				tnode_free(left);
+				goto nomem;
+                        }
 
 			put_child(t, tn, 2*i, (struct node *) left);
 			put_child(t, tn, 2*i+1, (struct node *) right);
 		}
 	}
 
-	if (*err) {
-		int size = tnode_child_length(tn);
-		int j;
-
-		for(j = 0; j < size; j++)
-			if (tn->child[j])
-				tnode_free((struct tnode *)tn->child[j]);
-
-		tnode_free(tn);
-	
-		*err = -ENOMEM;
-		return oldtnode;
-	}
-
-	for(i = 0; i < olen; i++) {
+	for (i = 0; i < olen; i++) {
 		struct node *node = tnode_get_child(oldtnode, i);
+		struct tnode *left, *right;
+		int size, j;
 
 		/* An empty child */
 		if (node == NULL)
@@ -740,76 +679,82 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
 			put_child(t, tn, 2*i+1, inode->child[1]);
 
 			tnode_free(inode);
+			continue;
 		}
 
-			/* An internal node with more than two children */
-		else {
-			struct tnode *left, *right;
-			int size, j;
-
-			/* We will replace this node 'inode' with two new
-			 * ones, 'left' and 'right', each with half of the
-			 * original children. The two new nodes will have
-			 * a position one bit further down the key and this
-			 * means that the "significant" part of their keys
-			 * (see the discussion near the top of this file)
-			 * will differ by one bit, which will be "0" in
-			 * left's key and "1" in right's key. Since we are
-			 * moving the key position by one step, the bit that
-			 * we are moving away from - the bit at position
-			 * (inode->pos) - is the one that will differ between
-			 * left and right. So... we synthesize that bit in the
-			 * two  new keys.
-			 * The mask 'm' below will be a single "one" bit at
-			 * the position (inode->pos)
-			 */
-
-			/* Use the old key, but set the new significant
-			 *   bit to zero.
-			 */
+		/* An internal node with more than two children */
+
+		/* We will replace this node 'inode' with two new
+		 * ones, 'left' and 'right', each with half of the
+		 * original children. The two new nodes will have
+		 * a position one bit further down the key and this
+		 * means that the "significant" part of their keys
+		 * (see the discussion near the top of this file)
+		 * will differ by one bit, which will be "0" in
+		 * left's key and "1" in right's key. Since we are
+		 * moving the key position by one step, the bit that
+		 * we are moving away from - the bit at position
+		 * (inode->pos) - is the one that will differ between
+		 * left and right. So... we synthesize that bit in the
+		 * two  new keys.
+		 * The mask 'm' below will be a single "one" bit at
+		 * the position (inode->pos)
+		 */
 
-			left = (struct tnode *) tnode_get_child(tn, 2*i);
-			put_child(t, tn, 2*i, NULL);
+		/* Use the old key, but set the new significant
+		 *   bit to zero.
+		 */
 
-			if (!left)
-				BUG();
+		left = (struct tnode *) tnode_get_child(tn, 2*i);
+		put_child(t, tn, 2*i, NULL);
 
-			right = (struct tnode *) tnode_get_child(tn, 2*i+1);
-			put_child(t, tn, 2*i+1, NULL);
+		BUG_ON(!left);
 
-			if (!right)
-				BUG();
+		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
+		put_child(t, tn, 2*i+1, NULL);
 
-			size = tnode_child_length(left);
-			for(j = 0; j < size; j++) {
-				put_child(t, left, j, inode->child[j]);
-				put_child(t, right, j, inode->child[j + size]);
-			}
-			put_child(t, tn, 2*i, resize(t, left));
-			put_child(t, tn, 2*i+1, resize(t, right));
+		BUG_ON(!right);
 
-			tnode_free(inode);
+		size = tnode_child_length(left);
+		for (j = 0; j < size; j++) {
+			put_child(t, left, j, inode->child[j]);
+			put_child(t, right, j, inode->child[j + size]);
 		}
+		put_child(t, tn, 2*i, resize(t, left));
+		put_child(t, tn, 2*i+1, resize(t, right));
+
+		tnode_free(inode);
 	}
 	tnode_free(oldtnode);
 	return tn;
+nomem:
+	{
+		int size = tnode_child_length(tn);
+		int j;
+
+		for (j = 0; j < size; j++)
+			if (tn->child[j])
+				tnode_free((struct tnode *)tn->child[j]);
+
+		tnode_free(tn);
+
+		return ERR_PTR(-ENOMEM);
+	}
 }
 
-static struct tnode *halve(struct trie *t, struct tnode *tn, int *err)
+static struct tnode *halve(struct trie *t, struct tnode *tn)
 {
 	struct tnode *oldtnode = tn;
 	struct node *left, *right;
 	int i;
 	int olen = tnode_child_length(tn);
 
-	if (trie_debug) printk("In halve\n");
+	pr_debug("In halve\n");
 
 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
 
-	if (!tn) {
-		*err = -ENOMEM;
-		return oldtnode;
-	}
+	if (!tn)
+		return ERR_PTR(-ENOMEM);
 
 	/*
 	 * Preallocate and store tnodes before the actual work so we
@@ -818,38 +763,27 @@ static struct tnode *halve(struct trie *t, struct tnode *tn, int *err)
 	 * of tnode is ignored.
 	 */
 
-	for(i = 0; i < olen; i += 2) {
+	for (i = 0; i < olen; i += 2) {
 		left = tnode_get_child(oldtnode, i);
 		right = tnode_get_child(oldtnode, i+1);
 
 		/* Two nonempty children */
-		if (left && right)  {
-			struct tnode *newBinNode =
-				tnode_new(left->key, tn->pos + tn->bits, 1);
+		if (left && right) {
+			struct tnode *newn;
 
-			if (!newBinNode) {
-				*err = -ENOMEM;
-				break;
-			}
-			put_child(t, tn, i/2, (struct node *)newBinNode);
-		}
-	}
+			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
 
-	if (*err) {
-		int size = tnode_child_length(tn);
-		int j;
+			if (!newn)
+				goto nomem;
 
-		for(j = 0; j < size; j++)
-			if (tn->child[j])
-				tnode_free((struct tnode *)tn->child[j]);
+			put_child(t, tn, i/2, (struct node *)newn);
+		}
 
-		tnode_free(tn);
-	
-		*err = -ENOMEM;
-		return oldtnode;
 	}
 
-	for(i = 0; i < olen; i += 2) {
+	for (i = 0; i < olen; i += 2) {
+		struct tnode *newBinNode;
+
 		left = tnode_get_child(oldtnode, i);
 		right = tnode_get_child(oldtnode, i+1);
 
@@ -858,88 +792,99 @@ static struct tnode *halve(struct trie *t, struct tnode *tn, int *err)
 			if (right == NULL)    /* Both are empty */
 				continue;
 			put_child(t, tn, i/2, right);
-		} else if (right == NULL)
+			continue;
+		}
+
+		if (right == NULL) {
 			put_child(t, tn, i/2, left);
+			continue;
+		}
 
 		/* Two nonempty children */
-		else {
-			struct tnode *newBinNode =
-				(struct tnode *) tnode_get_child(tn, i/2);
-			put_child(t, tn, i/2, NULL);
-
-			if (!newBinNode)
-				BUG();
-
-			put_child(t, newBinNode, 0, left);
-			put_child(t, newBinNode, 1, right);
-			put_child(t, tn, i/2, resize(t, newBinNode));
-		}
+		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
+		put_child(t, tn, i/2, NULL);
+		put_child(t, newBinNode, 0, left);
+		put_child(t, newBinNode, 1, right);
+		put_child(t, tn, i/2, resize(t, newBinNode));
 	}
 	tnode_free(oldtnode);
 	return tn;
+nomem:
+	{
+		int size = tnode_child_length(tn);
+		int j;
+
+		for (j = 0; j < size; j++)
+			if (tn->child[j])
+				tnode_free((struct tnode *)tn->child[j]);
+
+		tnode_free(tn);
+
+		return ERR_PTR(-ENOMEM);
+	}
 }
 
-static void *trie_init(struct trie *t)
+static void trie_init(struct trie *t)
 {
-	if (t) {
-		t->size = 0;
-		t->trie = NULL;
-		t->revision = 0;
+	if (!t)
+		return;
+
+	t->size = 0;
+	rcu_assign_pointer(t->trie, NULL);
+	t->revision = 0;
 #ifdef CONFIG_IP_FIB_TRIE_STATS
-       		memset(&t->stats, 0, sizeof(struct trie_use_stats));
+	memset(&t->stats, 0, sizeof(struct trie_use_stats));
 #endif
-	}
-	return t;
 }
 
+/* readside most use rcu_read_lock currently dump routines
+ via get_fa_head and dump */
+
 static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen)
 {
 	struct hlist_node *node;
 	struct leaf_info *li;
 
-	hlist_for_each_entry(li, node, head, hlist) {
+	hlist_for_each_entry_rcu(li, node, head, hlist)
 		if (li->plen == plen)
 			return li;
-	}
+
 	return NULL;
 }
 
 static inline struct list_head * get_fa_head(struct leaf *l, int plen)
 {
-	struct list_head *fa_head = NULL;
 	struct leaf_info *li = find_leaf_info(&l->list, plen);
 
-	if (li)
-		fa_head = &li->falh;
+	if (!li)
+		return NULL;
 
-	return fa_head;
+	return &li->falh;
 }
 
 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
 {
-	struct leaf_info *li = NULL, *last = NULL;
-	struct hlist_node *node, *tmp;
-
-	write_lock_bh(&fib_lock);
-
-	if (hlist_empty(head))
-		hlist_add_head(&new->hlist, head);
-	else {
-		hlist_for_each_entry_safe(li, node, tmp, head, hlist) {
-		
-			if (new->plen > li->plen)
-				break;
-		
-			last = li;
-		}
-		if (last)
-			hlist_add_after(&last->hlist, &new->hlist);
-		else
-			hlist_add_before(&new->hlist, &li->hlist);
-	}
-	write_unlock_bh(&fib_lock);
+        struct leaf_info *li = NULL, *last = NULL;
+        struct hlist_node *node;
+
+        if (hlist_empty(head)) {
+                hlist_add_head_rcu(&new->hlist, head);
+        } else {
+                hlist_for_each_entry(li, node, head, hlist) {
+                        if (new->plen > li->plen)
+                                break;
+
+                        last = li;
+                }
+                if (last)
+                        hlist_add_after_rcu(&last->hlist, &new->hlist);
+                else
+                        hlist_add_before_rcu(&new->hlist, &li->hlist);
+        }
 }
 
+/* rcu_read_lock needs to be hold by caller from readside */
+
 static struct leaf *
 fib_find_node(struct trie *t, u32 key)
 {
@@ -948,61 +893,43 @@ fib_find_node(struct trie *t, u32 key)
 	struct node *n;
 
 	pos = 0;
-	n = t->trie;
+	n = rcu_dereference(t->trie);
 
 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
 		tn = (struct tnode *) n;
-		
+
 		check_tnode(tn);
-		
+
 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
-			pos=tn->pos + tn->bits;
+			pos = tn->pos + tn->bits;
 			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
-		}
-		else
+		} else
 			break;
 	}
 	/* Case we have found a leaf. Compare prefixes */
 
-	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
-		struct leaf *l = (struct leaf *) n;
-		return l;
-	}
+	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
+		return (struct leaf *)n;
+
 	return NULL;
 }
 
 static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
 {
-	int i = 0;
 	int wasfull;
 	t_key cindex, key;
 	struct tnode *tp = NULL;
 
-	if (!tn)
-		BUG();
-
 	key = tn->key;
-	i = 0;
 
 	while (tn != NULL && NODE_PARENT(tn) != NULL) {
 
-		if (i > 10) {
-			printk("Rebalance tn=%p \n", tn);
-			if (tn) 		printk("tn->parent=%p \n", NODE_PARENT(tn));
-		
-			printk("Rebalance tp=%p \n", tp);
-			if (tp) 		printk("tp->parent=%p \n", NODE_PARENT(tp));
-		}
-
-		if (i > 12) BUG();
-		i++;
-
 		tp = NODE_PARENT(tn);
 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
 		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
 		tn = (struct tnode *) resize (t, (struct tnode *)tn);
 		tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
-	
+
 		if (!NODE_PARENT(tn))
 			break;
 
@@ -1015,6 +942,8 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
 	return (struct node*) tn;
 }
 
+/* only used from updater-side */
+
 static  struct list_head *
 fib_insert_node(struct trie *t, int *err, u32 key, int plen)
 {
@@ -1050,20 +979,16 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen)
 
 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
 		tn = (struct tnode *) n;
-		
+
 		check_tnode(tn);
-	
+
 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
 			tp = tn;
-			pos=tn->pos + tn->bits;
+			pos = tn->pos + tn->bits;
 			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
 
-			if (n && NODE_PARENT(n) != tn) {
-				printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
-				BUG();
-			}
-		}
-		else
+			BUG_ON(n && NODE_PARENT(n) != tn);
+		} else
 			break;
 	}
 
@@ -1073,17 +998,15 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen)
 	 * tp is n's (parent) ----> NULL or TNODE
 	 */
 
-	if (tp && IS_LEAF(tp))
-		BUG();
-
+	BUG_ON(tp && IS_LEAF(tp));
 
 	/* Case 1: n is a leaf. Compare prefixes */
 
 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
-		struct leaf *l = ( struct leaf *)  n;
-	
+		struct leaf *l = (struct leaf *) n;
+
 		li = leaf_info_new(plen);
-	
+
 		if (!li) {
 			*err = -ENOMEM;
 			goto err;
@@ -1113,35 +1036,29 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen)
 	fa_head = &li->falh;
 	insert_leaf_info(&l->list, li);
 
-	/* Case 2: n is NULL, and will just insert a new leaf */
 	if (t->trie && n == NULL) {
+		/* Case 2: n is NULL, and will just insert a new leaf */
 
 		NODE_SET_PARENT(l, tp);
-	
-		if (!tp)
-			BUG();
 
-		else {
-			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
-			put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
-		}
-	}
-	/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
-	else {
+		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
+		put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
+	} else {
+		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
 		/*
 		 *  Add a new tnode here
 		 *  first tnode need some special handling
 		 */
 
 		if (tp)
-			pos=tp->pos+tp->bits;
+			pos = tp->pos+tp->bits;
 		else
-			pos=0;
+			pos = 0;
+
 		if (n) {
 			newpos = tkey_mismatch(key, pos, n->key);
 			tn = tnode_new(n->key, newpos, 1);
-		}
-		else {
+		} else {
 			newpos = 0;
 			tn = tnode_new(key, newpos, 1); /* First tnode */
 		}
@@ -1151,32 +1068,33 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen)
 			tnode_free((struct tnode *) l);
 			*err = -ENOMEM;
 			goto err;
-		}		
-		
+		}
+
 		NODE_SET_PARENT(tn, tp);
 
-		missbit=tkey_extract_bits(key, newpos, 1);
+		missbit = tkey_extract_bits(key, newpos, 1);
 		put_child(t, tn, missbit, (struct node *)l);
 		put_child(t, tn, 1-missbit, n);
 
 		if (tp) {
 			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
 			put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
-		}
-		else {
-			t->trie = (struct node*) tn; /* First tnode */
+		} else {
+			rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */
 			tp = tn;
 		}
 	}
-	if (tp && tp->pos+tp->bits > 32) {
+
+	if (tp && tp->pos + tp->bits > 32)
 		printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
 		       tp, tp->pos, tp->bits, key, plen);
-	}
+
 	/* Rebalance the trie */
-	t->trie = trie_rebalance(t, tp);
+
+	rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
 done:
 	t->revision++;
-err:;
+err:
 	return fa_head;
 }
 
@@ -1204,17 +1122,18 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
 
 	key = ntohl(key);
 
-	if (trie_debug)
-		printk("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
+	pr_debug("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
 
-	mask = ntohl( inet_make_mask(plen) );
+	mask = ntohl(inet_make_mask(plen));
 
 	if (key & ~mask)
 		return -EINVAL;
 
 	key = key & mask;
 
-	if  ((fi = fib_create_info(r, rta, nlhdr, &err)) == NULL)
+	fi = fib_create_info(r, rta, nlhdr, &err);
+
+	if (!fi)
 		goto err;
 
 	l = fib_find_node(t, key);
@@ -1236,8 +1155,7 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
 	 * and we need to allocate a new one of those as well.
 	 */
 
-	if (fa &&
-	    fa->fa_info->fib_priority == fi->fib_priority) {
+	if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
 		struct fib_alias *fa_orig;
 
 		err = -EEXIST;
@@ -1248,22 +1166,27 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
 			struct fib_info *fi_drop;
 			u8 state;
 
-			write_lock_bh(&fib_lock);
+			err = -ENOBUFS;
+			new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
+			if (new_fa == NULL)
+				goto out;
 
 			fi_drop = fa->fa_info;
-			fa->fa_info = fi;
-			fa->fa_type = type;
-			fa->fa_scope = r->rtm_scope;
+			new_fa->fa_tos = fa->fa_tos;
+			new_fa->fa_info = fi;
+			new_fa->fa_type = type;
+			new_fa->fa_scope = r->rtm_scope;
 			state = fa->fa_state;
-			fa->fa_state &= ~FA_S_ACCESSED;
+			new_fa->fa_state &= ~FA_S_ACCESSED;
 
-			write_unlock_bh(&fib_lock);
+			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
+			alias_free_mem_rcu(fa);
 
 			fib_release_info(fi_drop);
 			if (state & FA_S_ACCESSED)
-			  rt_cache_flush(-1);
+				rt_cache_flush(-1);
 
-			    goto succeeded;
+			goto succeeded;
 		}
 		/* Error if we find a perfect match which
 		 * uses the same scope, type, and nexthop
@@ -1285,7 +1208,7 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
 			fa = fa_orig;
 	}
 	err = -ENOENT;
-	if (!(nlhdr->nlmsg_flags&NLM_F_CREATE))
+	if (!(nlhdr->nlmsg_flags & NLM_F_CREATE))
 		goto out;
 
 	err = -ENOBUFS;
@@ -1298,9 +1221,6 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
 	new_fa->fa_type = type;
 	new_fa->fa_scope = r->rtm_scope;
 	new_fa->fa_state = 0;
-#if 0
-	new_fa->dst = NULL;
-#endif
 	/*
 	 * Insert new entry to the list.
 	 */
@@ -1312,12 +1232,8 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
 			goto out_free_new_fa;
 	}
 
-	write_lock_bh(&fib_lock);
-
-	list_add_tail(&new_fa->fa_list,
-		 (fa ? &fa->fa_list : fa_head));
-
-	write_unlock_bh(&fib_lock);
+	list_add_tail_rcu(&new_fa->fa_list,
+			  (fa ? &fa->fa_list : fa_head));
 
 	rt_cache_flush(-1);
 	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
@@ -1328,11 +1244,14 @@ out_free_new_fa:
 	kmem_cache_free(fn_alias_kmem, new_fa);
 out:
 	fib_release_info(fi);
-err:;
+err:
 	return err;
 }
 
-static inline int check_leaf(struct trie *t, struct leaf *l,  t_key key, int *plen, const struct flowi *flp,
+
+/* should be clalled with rcu_read_lock */
+static inline int check_leaf(struct trie *t, struct leaf *l,
+			     t_key key, int *plen, const struct flowi *flp,
 			     struct fib_result *res)
 {
 	int err, i;
@@ -1341,8 +1260,7 @@ static inline int check_leaf(struct trie *t, struct leaf *l,  t_key key, int *pl
 	struct hlist_head *hhead = &l->list;
 	struct hlist_node *node;
 
-	hlist_for_each_entry(li, node, hhead, hlist) {
-
+	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
 		i = li->plen;
 		mask = ntohl(inet_make_mask(i));
 		if (l->key != (key & mask))
@@ -1370,13 +1288,17 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
 	struct node *n;
 	struct tnode *pn;
 	int pos, bits;
-	t_key key=ntohl(flp->fl4_dst);
+	t_key key = ntohl(flp->fl4_dst);
 	int chopped_off;
 	t_key cindex = 0;
 	int current_prefix_length = KEYLENGTH;
-	n = t->trie;
+	struct tnode *cn;
+	t_key node_prefix, key_prefix, pref_mismatch;
+	int mp;
+
+	rcu_read_lock();
 
-	read_lock(&fib_lock);
+	n = rcu_dereference(t->trie);
 	if (!n)
 		goto failed;
 
@@ -1393,8 +1315,7 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
 	pn = (struct tnode *) n;
 	chopped_off = 0;
 
-        while (pn) {
-
+	while (pn) {
 		pos = pn->pos;
 		bits = pn->bits;
 
@@ -1410,130 +1331,129 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result
 			goto backtrace;
 		}
 
-		if (IS_TNODE(n)) {
+		if (IS_LEAF(n)) {
+			if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
+				goto found;
+			else
+				goto backtrace;
+		}
+
 #define HL_OPTIMIZE
 #ifdef HL_OPTIMIZE
-			struct tnode *cn = (struct tnode *)n;
-			t_key node_prefix, key_prefix, pref_mismatch;
-			int mp;
+		cn = (struct tnode *)n;
 
-			/*
-			 * It's a tnode, and we can do some extra checks here if we
-			 * like, to avoid descending into a dead-end branch.
-			 * This tnode is in the parent's child array at index
-			 * key[p_pos..p_pos+p_bits] but potentially with some bits
-			 * chopped off, so in reality the index may be just a
-			 * subprefix, padded with zero at the end.
-			 * We can also take a look at any skipped bits in this
-			 * tnode - everything up to p_pos is supposed to be ok,
-			 * and the non-chopped bits of the index (se previous
-			 * paragraph) are also guaranteed ok, but the rest is
-			 * considered unknown.
-			 *
-			 * The skipped bits are key[pos+bits..cn->pos].
-			 */
-		
-			/* If current_prefix_length < pos+bits, we are already doing
-			 * actual prefix  matching, which means everything from
-			 * pos+(bits-chopped_off) onward must be zero along some
-			 * branch of this subtree - otherwise there is *no* valid
-			 * prefix present. Here we can only check the skipped
-			 * bits. Remember, since we have already indexed into the
-			 * parent's child array, we know that the bits we chopped of
-			 * *are* zero.
-			 */
+		/*
+		 * It's a tnode, and we can do some extra checks here if we
+		 * like, to avoid descending into a dead-end branch.
+		 * This tnode is in the parent's child array at index
+		 * key[p_pos..p_pos+p_bits] but potentially with some bits
+		 * chopped off, so in reality the index may be just a
+		 * subprefix, padded with zero at the end.
+		 * We can also take a look at any skipped bits in this
+		 * tnode - everything up to p_pos is supposed to be ok,
+		 * and the non-chopped bits of the index (se previous
+		 * paragraph) are also guaranteed ok, but the rest is
+		 * considered unknown.
+		 *
+		 * The skipped bits are key[pos+bits..cn->pos].
+		 */
 
-			/* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
-		
-			if (current_prefix_length < pos+bits) {
-				if (tkey_extract_bits(cn->key, current_prefix_length,
-						      cn->pos - current_prefix_length) != 0 ||
-				    !(cn->child[0]))
-					goto backtrace;
-			}
+		/* If current_prefix_length < pos+bits, we are already doing
+		 * actual prefix  matching, which means everything from
+		 * pos+(bits-chopped_off) onward must be zero along some
+		 * branch of this subtree - otherwise there is *no* valid
+		 * prefix present. Here we can only check the skipped
+		 * bits. Remember, since we have already indexed into the
+		 * parent's child array, we know that the bits we chopped of
+		 * *are* zero.
+		 */
 
-			/*
-			 * If chopped_off=0, the index is fully validated and we
-			 * only need to look at the skipped bits for this, the new,
-			 * tnode. What we actually want to do is to find out if
-			 * these skipped bits match our key perfectly, or if we will
-			 * have to count on finding a matching prefix further down,
-			 * because if we do, we would like to have some way of
-			 * verifying the existence of such a prefix at this point.
-			 */
+		/* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
 
-			/* The only thing we can do at this point is to verify that
-			 * any such matching prefix can indeed be a prefix to our
-			 * key, and if the bits in the node we are inspecting that
-			 * do not match our key are not ZERO, this cannot be true.
-			 * Thus, find out where there is a mismatch (before cn->pos)
-			 * and verify that all the mismatching bits are zero in the
-			 * new tnode's key.
-			 */
+		if (current_prefix_length < pos+bits) {
+			if (tkey_extract_bits(cn->key, current_prefix_length,
+						cn->pos - current_prefix_length) != 0 ||
+			    !(cn->child[0]))
+				goto backtrace;
+		}
 
-			/* Note: We aren't very concerned about the piece of the key
-			 * that precede pn->pos+pn->bits, since these have already been
-			 * checked. The bits after cn->pos aren't checked since these are
-			 * by definition "unknown" at this point. Thus, what we want to
-			 * see is if we are about to enter the "prefix matching" state,
-			 * and in that case verify that the skipped bits that will prevail
-			 * throughout this subtree are zero, as they have to be if we are
-			 * to find a matching prefix.
-			 */
+		/*
+		 * If chopped_off=0, the index is fully validated and we
+		 * only need to look at the skipped bits for this, the new,
+		 * tnode. What we actually want to do is to find out if
+		 * these skipped bits match our key perfectly, or if we will
+		 * have to count on finding a matching prefix further down,
+		 * because if we do, we would like to have some way of
+		 * verifying the existence of such a prefix at this point.
+		 */
 
-			node_prefix = MASK_PFX(cn->key, cn->pos);
-			key_prefix = MASK_PFX(key, cn->pos);
-			pref_mismatch = key_prefix^node_prefix;
-			mp = 0;
+		/* The only thing we can do at this point is to verify that
+		 * any such matching prefix can indeed be a prefix to our
+		 * key, and if the bits in the node we are inspecting that
+		 * do not match our key are not ZERO, this cannot be true.
+		 * Thus, find out where there is a mismatch (before cn->pos)
+		 * and verify that all the mismatching bits are zero in the
+		 * new tnode's key.
+		 */
 
-			/* In short: If skipped bits in this node do not match the search
-			 * key, enter the "prefix matching" state.directly.
-			 */
-			if (pref_mismatch) {
-				while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
-					mp++;
-					pref_mismatch = pref_mismatch <<1;
-				}
-				key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
-			
-				if (key_prefix != 0)
-					goto backtrace;
-
-				if (current_prefix_length >= cn->pos)
-					current_prefix_length=mp;
-		       }
-#endif
-		       pn = (struct tnode *)n; /* Descend */
-		       chopped_off = 0;
-		       continue;
+		/* Note: We aren't very concerned about the piece of the key
+		 * that precede pn->pos+pn->bits, since these have already been
+		 * checked. The bits after cn->pos aren't checked since these are
+		 * by definition "unknown" at this point. Thus, what we want to
+		 * see is if we are about to enter the "prefix matching" state,
+		 * and in that case verify that the skipped bits that will prevail
+		 * throughout this subtree are zero, as they have to be if we are
+		 * to find a matching prefix.
+		 */
+
+		node_prefix = MASK_PFX(cn->key, cn->pos);
+		key_prefix = MASK_PFX(key, cn->pos);
+		pref_mismatch = key_prefix^node_prefix;
+		mp = 0;
+
+		/* In short: If skipped bits in this node do not match the search
+		 * key, enter the "prefix matching" state.directly.
+		 */
+		if (pref_mismatch) {
+			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
+				mp++;
+				pref_mismatch = pref_mismatch <<1;
+			}
+			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
+
+			if (key_prefix != 0)
+				goto backtrace;
+
+			if (current_prefix_length >= cn->pos)
+				current_prefix_length = mp;
 		}
-		if (IS_LEAF(n)) {
-			if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
-				goto found;
-	       }
+#endif
+		pn = (struct tnode *)n; /* Descend */
+		chopped_off = 0;
+		continue;
+
 backtrace:
 		chopped_off++;
 
 		/* As zero don't change the child key (cindex) */
-		while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1)))) {
+		while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
 			chopped_off++;
-		}
 
 		/* Decrease current_... with bits chopped off */
 		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
 			current_prefix_length = pn->pos + pn->bits - chopped_off;
-	
+
 		/*
 		 * Either we do the actual chop off according or if we have
 		 * chopped off all bits in this tnode walk up to our parent.
 		 */
 
-		if (chopped_off <= pn->bits)
+		if (chopped_off <= pn->bits) {
 			cindex &= ~(1 << (chopped_off-1));
-		else {
+		} else {
 			if (NODE_PARENT(pn) == NULL)
 				goto failed;
-		
+
 			/* Get Child's index */
 			cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
 			pn = NODE_PARENT(pn);
@@ -1548,10 +1468,11 @@ backtrace:
 failed:
 	ret = 1;
 found:
-	read_unlock(&fib_lock);
+	rcu_read_unlock();
 	return ret;
 }
 
+/* only called from updater side */
 static int trie_leaf_remove(struct trie *t, t_key key)
 {
 	t_key cindex;
@@ -1559,24 +1480,20 @@ static int trie_leaf_remove(struct trie *t, t_key key)
 	struct node *n = t->trie;
 	struct leaf *l;
 
-	if (trie_debug)
-		printk("entering trie_leaf_remove(%p)\n", n);
+	pr_debug("entering trie_leaf_remove(%p)\n", n);
 
 	/* Note that in the case skipped bits, those bits are *not* checked!
 	 * When we finish this, we will have NULL or a T_LEAF, and the
 	 * T_LEAF may or may not match our key.
 	 */
 
-        while (n != NULL && IS_TNODE(n)) {
+	while (n != NULL && IS_TNODE(n)) {
 		struct tnode *tn = (struct tnode *) n;
 		check_tnode(tn);
 		n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
 
-			if (n && NODE_PARENT(n) != tn) {
-				printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
-				BUG();
-			}
-        }
+		BUG_ON(n && NODE_PARENT(n) != tn);
+	}
 	l = (struct leaf *) n;
 
 	if (!n || !tkey_equals(l->key, key))
@@ -1590,23 +1507,24 @@ static int trie_leaf_remove(struct trie *t, t_key key)
 	t->revision++;
 	t->size--;
 
+	preempt_disable();
 	tp = NODE_PARENT(n);
 	tnode_free((struct tnode *) n);
 
 	if (tp) {
 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
 		put_child(t, (struct tnode *)tp, cindex, NULL);
-		t->trie = trie_rebalance(t, tp);
-	}
-	else
-		t->trie = NULL;
+		rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
+	} else
+		rcu_assign_pointer(t->trie, NULL);
+	preempt_enable();
 
 	return 1;
 }
 
 static int
 fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
-	       struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
+		struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
 {
 	struct trie *t = (struct trie *) tb->tb_data;
 	u32 key, mask;
@@ -1615,6 +1533,8 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
 	struct fib_alias *fa, *fa_to_delete;
 	struct list_head *fa_head;
 	struct leaf *l;
+	struct leaf_info *li;
+
 
 	if (plen > 32)
 		return -EINVAL;
@@ -1624,7 +1544,7 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
 		memcpy(&key, rta->rta_dst, 4);
 
 	key = ntohl(key);
-	mask = ntohl( inet_make_mask(plen) );
+	mask = ntohl(inet_make_mask(plen));
 
 	if (key & ~mask)
 		return -EINVAL;
@@ -1641,11 +1561,11 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
 	if (!fa)
 		return -ESRCH;
 
-	if (trie_debug)
-		printk("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
+	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
 
 	fa_to_delete = NULL;
 	fa_head = fa->fa_list.prev;
+
 	list_for_each_entry(fa, fa_head, fa_list) {
 		struct fib_info *fi = fa->fa_info;
 
@@ -1664,39 +1584,31 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
 		}
 	}
 
-	if (fa_to_delete) {
-		int kill_li = 0;
-		struct leaf_info *li;
-
-		fa = fa_to_delete;
-		rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
+	if (!fa_to_delete)
+		return -ESRCH;
 
-		l = fib_find_node(t, key);
-		li = find_leaf_info(&l->list, plen);
+	fa = fa_to_delete;
+	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
 
-		write_lock_bh(&fib_lock);
+	l = fib_find_node(t, key);
+	li = find_leaf_info(&l->list, plen);
 
-		list_del(&fa->fa_list);
+	list_del_rcu(&fa->fa_list);
 
-		if (list_empty(fa_head)) {
-			hlist_del(&li->hlist);
-			kill_li = 1;
-		}
-		write_unlock_bh(&fib_lock);
-	
-		if (kill_li)
-			free_leaf_info(li);
+	if (list_empty(fa_head)) {
+		hlist_del_rcu(&li->hlist);
+		free_leaf_info(li);
+	}
 
-		if (hlist_empty(&l->list))
-			trie_leaf_remove(t, key);
+	if (hlist_empty(&l->list))
+		trie_leaf_remove(t, key);
 
-		if (fa->fa_state & FA_S_ACCESSED)
-			rt_cache_flush(-1);
+	if (fa->fa_state & FA_S_ACCESSED)
+		rt_cache_flush(-1);
 
-		fn_free_alias(fa);
-		return 0;
-	}
-	return -ESRCH;
+	fib_release_info(fa->fa_info);
+	alias_free_mem_rcu(fa);
+	return 0;
 }
 
 static int trie_flush_list(struct trie *t, struct list_head *head)
@@ -1706,14 +1618,11 @@ static int trie_flush_list(struct trie *t, struct list_head *head)
 
 	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
 		struct fib_info *fi = fa->fa_info;
-	
-		if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
-
- 			write_lock_bh(&fib_lock);
-			list_del(&fa->fa_list);
-			write_unlock_bh(&fib_lock);
 
-			fn_free_alias(fa);
+		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
+			list_del_rcu(&fa->fa_list);
+			fib_release_info(fa->fa_info);
+			alias_free_mem_rcu(fa);
 			found++;
 		}
 	}
@@ -1728,37 +1637,34 @@ static int trie_flush_leaf(struct trie *t, struct leaf *l)
 	struct leaf_info *li = NULL;
 
 	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
-		
 		found += trie_flush_list(t, &li->falh);
 
 		if (list_empty(&li->falh)) {
-
- 			write_lock_bh(&fib_lock);
-			hlist_del(&li->hlist);
-			write_unlock_bh(&fib_lock);
-
+			hlist_del_rcu(&li->hlist);
 			free_leaf_info(li);
 		}
 	}
 	return found;
 }
 
+/* rcu_read_lock needs to be hold by caller from readside */
+
 static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
 {
 	struct node *c = (struct node *) thisleaf;
 	struct tnode *p;
 	int idx;
+	struct node *trie = rcu_dereference(t->trie);
 
 	if (c == NULL) {
-		if (t->trie == NULL)
+		if (trie == NULL)
 			return NULL;
 
-		if (IS_LEAF(t->trie))          /* trie w. just a leaf */
-			return (struct leaf *) t->trie;
+		if (IS_LEAF(trie))          /* trie w. just a leaf */
+			return (struct leaf *) trie;
 
-		p = (struct tnode*) t->trie;  /* Start */
-	}
-	else
+		p = (struct tnode*) trie;  /* Start */
+	} else
 		p = (struct tnode *) NODE_PARENT(c);
 
 	while (p) {
@@ -1771,29 +1677,31 @@ static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
 			pos = 0;
 
 		last = 1 << p->bits;
-		for(idx = pos; idx < last ; idx++) {
-			if (p->child[idx]) {
-
-				/* Decend if tnode */
-
-				while (IS_TNODE(p->child[idx])) {
-					p = (struct tnode*) p->child[idx];
-					idx = 0;
-				
-					/* Rightmost non-NULL branch */
-					if (p && IS_TNODE(p))
-						while (p->child[idx] == NULL && idx < (1 << p->bits)) idx++;
-
-					/* Done with this tnode? */
-					if (idx >= (1 << p->bits) || p->child[idx] == NULL )
-						goto up;
-				}
-				return (struct leaf*) p->child[idx];
+		for (idx = pos; idx < last ; idx++) {
+			c = rcu_dereference(p->child[idx]);
+
+			if (!c)
+				continue;
+
+			/* Decend if tnode */
+			while (IS_TNODE(c)) {
+				p = (struct tnode *) c;
+  				idx = 0;
+
+				/* Rightmost non-NULL branch */
+				if (p && IS_TNODE(p))
+					while (!(c = rcu_dereference(p->child[idx]))
+					       && idx < (1<<p->bits)) idx++;
+
+				/* Done with this tnode? */
+				if (idx >= (1 << p->bits) || !c)
+					goto up;
 			}
+			return (struct leaf *) c;
 		}
 up:
 		/* No more children go up one step  */
-		c = (struct node*) p;
+		c = (struct node *) p;
 		p = (struct tnode *) NODE_PARENT(p);
 	}
 	return NULL; /* Ready. Root of trie */
@@ -1807,23 +1715,24 @@ static int fn_trie_flush(struct fib_table *tb)
 
 	t->revision++;
 
-	for (h=0; (l = nextleaf(t, l)) != NULL; h++) {
+	rcu_read_lock();
+	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
 		found += trie_flush_leaf(t, l);
 
 		if (ll && hlist_empty(&ll->list))
 			trie_leaf_remove(t, ll->key);
 		ll = l;
 	}
+	rcu_read_unlock();  
 
 	if (ll && hlist_empty(&ll->list))
 		trie_leaf_remove(t, ll->key);
 
-	if (trie_debug)
-		printk("trie_flush found=%d\n", found);
+	pr_debug("trie_flush found=%d\n", found);
 	return found;
 }
 
-static int trie_last_dflt=-1;
+static int trie_last_dflt = -1;
 
 static void
 fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
@@ -1840,7 +1749,7 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib
 	last_resort = NULL;
 	order = -1;
 
-	read_lock(&fib_lock);
+	rcu_read_lock();
 
 	l = fib_find_node(t, 0);
 	if (!l)
@@ -1853,20 +1762,20 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib
 	if (list_empty(fa_head))
 		goto out;
 
-	list_for_each_entry(fa, fa_head, fa_list) {
+	list_for_each_entry_rcu(fa, fa_head, fa_list) {
 		struct fib_info *next_fi = fa->fa_info;
-	
+
 		if (fa->fa_scope != res->scope ||
 		    fa->fa_type != RTN_UNICAST)
 			continue;
-	
+
 		if (next_fi->fib_priority > res->fi->fib_priority)
 			break;
 		if (!next_fi->fib_nh[0].nh_gw ||
 		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
 			continue;
 		fa->fa_state |= FA_S_ACCESSED;
-	
+
 		if (fi == NULL) {
 			if (next_fi != res->fi)
 				break;
@@ -1904,7 +1813,7 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib
 	}
 	trie_last_dflt = last_idx;
  out:;
-	read_unlock(&fib_lock);
+	rcu_read_unlock();
 }
 
 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
@@ -1913,12 +1822,14 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fi
 	int i, s_i;
 	struct fib_alias *fa;
 
-	u32 xkey=htonl(key);
+	u32 xkey = htonl(key);
 
-	s_i=cb->args[3];
+	s_i = cb->args[3];
 	i = 0;
 
-	list_for_each_entry(fa, fah, fa_list) {
+	/* rcu_read_lock is hold by caller */
+
+	list_for_each_entry_rcu(fa, fah, fa_list) {
 		if (i < s_i) {
 			i++;
 			continue;
@@ -1946,10 +1857,10 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fi
 				  fa->fa_info, 0) < 0) {
 			cb->args[3] = i;
 			return -1;
-			}
+		}
 		i++;
 	}
-	cb->args[3]=i;
+	cb->args[3] = i;
 	return skb->len;
 }
 
@@ -1959,10 +1870,10 @@ static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, str
 	int h, s_h;
 	struct list_head *fa_head;
 	struct leaf *l = NULL;
-	s_h=cb->args[2];
 
-	for (h=0; (l = nextleaf(t, l)) != NULL; h++) {
+	s_h = cb->args[2];
 
+	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
 		if (h < s_h)
 			continue;
 		if (h > s_h)
@@ -1970,7 +1881,7 @@ static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, str
 			       sizeof(cb->args) - 3*sizeof(cb->args[0]));
 
 		fa_head = get_fa_head(l, plen);
-	
+
 		if (!fa_head)
 			continue;
 
@@ -1978,11 +1889,11 @@ static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, str
 			continue;
 
 		if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
-			cb->args[2]=h;
+			cb->args[2] = h;
 			return -1;
 		}
 	}
-	cb->args[2]=h;
+	cb->args[2] = h;
 	return skb->len;
 }
 
@@ -1993,25 +1904,24 @@ static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlin
 
 	s_m = cb->args[1];
 
-	read_lock(&fib_lock);
-	for (m=0; m<=32; m++) {
-
+	rcu_read_lock();
+	for (m = 0; m <= 32; m++) {
 		if (m < s_m)
 			continue;
 		if (m > s_m)
 			memset(&cb->args[2], 0,
-			       sizeof(cb->args) - 2*sizeof(cb->args[0]));
+				sizeof(cb->args) - 2*sizeof(cb->args[0]));
 
 		if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
 			cb->args[1] = m;
 			goto out;
 		}
 	}
-	read_unlock(&fib_lock);
+	rcu_read_unlock();
 	cb->args[1] = m;
 	return skb->len;
- out:
-	read_unlock(&fib_lock);
+out:
+	rcu_read_unlock();
 	return -1;
 }
 
@@ -2051,9 +1961,9 @@ struct fib_table * __init fib_hash_init(int id)
 	trie_init(t);
 
 	if (id == RT_TABLE_LOCAL)
-                trie_local = t;
+		trie_local = t;
 	else if (id == RT_TABLE_MAIN)
-                trie_main = t;
+		trie_main = t;
 
 	if (id == RT_TABLE_LOCAL)
 		printk("IPv4 FIB: Using LC-trie version %s\n", VERSION);
@@ -2065,7 +1975,8 @@ struct fib_table * __init fib_hash_init(int id)
 
 static void putspace_seq(struct seq_file *seq, int n)
 {
-	while (n--) seq_printf(seq, " ");
+	while (n--)
+		seq_printf(seq, " ");
 }
 
 static void printbin_seq(struct seq_file *seq, unsigned int v, int bits)
@@ -2086,29 +1997,22 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
 		seq_printf(seq, "%d/", cindex);
 		printbin_seq(seq, cindex, bits);
 		seq_printf(seq, ": ");
-	}
-	else
+	} else
 		seq_printf(seq, "<root>: ");
 	seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n);
 
-	if (IS_LEAF(n))
-		seq_printf(seq, "key=%d.%d.%d.%d\n",
-			   n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256);
-	else {
-		int plen = ((struct tnode *)n)->pos;
-		t_key prf=MASK_PFX(n->key, plen);
-		seq_printf(seq, "key=%d.%d.%d.%d/%d\n",
-			   prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen);
-	}
 	if (IS_LEAF(n)) {
-		struct leaf *l=(struct leaf *)n;
+		struct leaf *l = (struct leaf *)n;
 		struct fib_alias *fa;
 		int i;
-		for (i=32; i>=0; i--)
-		  if (find_leaf_info(&l->list, i)) {
-		
+
+		seq_printf(seq, "key=%d.%d.%d.%d\n",
+			   n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256);
+
+		for (i = 32; i >= 0; i--)
+			if (find_leaf_info(&l->list, i)) {
 				struct list_head *fa_head = get_fa_head(l, i);
-			
+
 				if (!fa_head)
 					continue;
 
@@ -2118,17 +2022,16 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
 				putspace_seq(seq, indent+2);
 				seq_printf(seq, "{/%d...dumping}\n", i);
 
-
-				list_for_each_entry(fa, fa_head, fa_list) {
+				list_for_each_entry_rcu(fa, fa_head, fa_list) {
 					putspace_seq(seq, indent+2);
-					if (fa->fa_info->fib_nh == NULL) {
-						seq_printf(seq, "Error _fib_nh=NULL\n");
-						continue;
-					}
 					if (fa->fa_info == NULL) {
 						seq_printf(seq, "Error fa_info=NULL\n");
 						continue;
 					}
+					if (fa->fa_info->fib_nh == NULL) {
+						seq_printf(seq, "Error _fib_nh=NULL\n");
+						continue;
+					}
 
 					seq_printf(seq, "{type=%d scope=%d TOS=%d}\n",
 					      fa->fa_type,
@@ -2136,11 +2039,16 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
 					      fa->fa_tos);
 				}
 			}
-	}
-	else if (IS_TNODE(n)) {
+	} else {
 		struct tnode *tn = (struct tnode *)n;
+		int plen = ((struct tnode *)n)->pos;
+		t_key prf = MASK_PFX(n->key, plen);
+
+		seq_printf(seq, "key=%d.%d.%d.%d/%d\n",
+			   prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen);
+
 		putspace_seq(seq, indent); seq_printf(seq, "|    ");
-		seq_printf(seq, "{key prefix=%08x/", tn->key&TKEY_GET_MASK(0, tn->pos));
+		seq_printf(seq, "{key prefix=%08x/", tn->key & TKEY_GET_MASK(0, tn->pos));
 		printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos);
 		seq_printf(seq, "}\n");
 		putspace_seq(seq, indent); seq_printf(seq, "|    ");
@@ -2154,194 +2062,196 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
 
 static void trie_dump_seq(struct seq_file *seq, struct trie *t)
 {
-	struct node *n = t->trie;
-	int cindex=0;
-	int indent=1;
-	int pend=0;
+	struct node *n;
+	int cindex = 0;
+	int indent = 1;
+	int pend = 0;
 	int depth = 0;
+	struct tnode *tn;
 
-  	read_lock(&fib_lock);
-
+	rcu_read_lock();
+	n = rcu_dereference(t->trie);
 	seq_printf(seq, "------ trie_dump of t=%p ------\n", t);
-	if (n) {
-		printnode_seq(seq, indent, n, pend, cindex, 0);
-		if (IS_TNODE(n)) {
-			struct tnode *tn = (struct tnode *)n;
-			pend = tn->pos+tn->bits;
-			putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
-			indent += 3;
-			depth++;
-
-			while (tn && cindex < (1 << tn->bits)) {
-				if (tn->child[cindex]) {
-				
-					/* Got a child */
-				
-					printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits);
-					if (IS_LEAF(tn->child[cindex])) {
-						cindex++;
-					
-					}
-					else {
-						/*
-						 * New tnode. Decend one level
-						 */
-					
-						depth++;
-						n = tn->child[cindex];
-						tn = (struct tnode *)n;
-						pend = tn->pos+tn->bits;
-						putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
-						indent+=3;
-						cindex=0;
-					}
-				}
-				else
-					cindex++;
 
+	if (!n) {
+		seq_printf(seq, "------ trie is empty\n");
+
+		rcu_read_unlock();
+		return;
+	}
+
+	printnode_seq(seq, indent, n, pend, cindex, 0);
+
+	if (!IS_TNODE(n)) {
+		rcu_read_unlock();
+		return;
+	}
+
+	tn = (struct tnode *)n;
+	pend = tn->pos+tn->bits;
+	putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
+	indent += 3;
+	depth++;
+
+	while (tn && cindex < (1 << tn->bits)) {
+		struct node *child = rcu_dereference(tn->child[cindex]);
+		if (!child)
+			cindex++;
+		else {
+			/* Got a child */
+			printnode_seq(seq, indent, child, pend,
+				      cindex, tn->bits);
+
+			if (IS_LEAF(child))
+				cindex++;
+
+			else {
 				/*
-				 * Test if we are done
+				 * New tnode. Decend one level
 				 */
-			
-				while (cindex >= (1 << tn->bits)) {
 
-					/*
-					 * Move upwards and test for root
-					 * pop off all traversed  nodes
-					 */
-				
-					if (NODE_PARENT(tn) == NULL) {
-						tn = NULL;
-						n = NULL;
-						break;
-					}
-					else {
-						cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
-						tn = NODE_PARENT(tn);
-						cindex++;
-						n = (struct node *)tn;
-						pend = tn->pos+tn->bits;
-						indent-=3;
-						depth--;
-					}
-				}
+				depth++;
+				n = child;
+				tn = (struct tnode *)n;
+				pend = tn->pos+tn->bits;
+				putspace_seq(seq, indent);
+				seq_printf(seq, "\\--\n");
+				indent += 3;
+				cindex = 0;
 			}
 		}
-		else n = NULL;
-	}
-	else seq_printf(seq, "------ trie is empty\n");
 
-  	read_unlock(&fib_lock);
+		/*
+		 * Test if we are done
+		 */
+
+		while (cindex >= (1 << tn->bits)) {
+			/*
+			 * Move upwards and test for root
+			 * pop off all traversed  nodes
+			 */
+
+			if (NODE_PARENT(tn) == NULL) {
+				tn = NULL;
+				break;
+			}
+
+			cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
+			cindex++;
+			tn = NODE_PARENT(tn);
+			pend = tn->pos + tn->bits;
+			indent -= 3;
+			depth--;
+		}
+	}
+	rcu_read_unlock();
 }
 
 static struct trie_stat *trie_stat_new(void)
 {
-	struct trie_stat *s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL);
+	struct trie_stat *s;
 	int i;
 
-	if (s) {
-		s->totdepth = 0;
-		s->maxdepth = 0;
-		s->tnodes = 0;
-		s->leaves = 0;
-		s->nullpointers = 0;
-	
-		for(i=0; i< MAX_CHILDS; i++)
-			s->nodesizes[i] = 0;
-	}
+	s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL);
+	if (!s)
+		return NULL;
+
+	s->totdepth = 0;
+	s->maxdepth = 0;
+	s->tnodes = 0;
+	s->leaves = 0;
+	s->nullpointers = 0;
+
+	for (i = 0; i < MAX_CHILDS; i++)
+		s->nodesizes[i] = 0;
+
 	return s;
 }
 
 static struct trie_stat *trie_collect_stats(struct trie *t)
 {
-	struct node *n = t->trie;
+	struct node *n;
 	struct trie_stat *s = trie_stat_new();
 	int cindex = 0;
-	int indent = 1;
 	int pend = 0;
 	int depth = 0;
 
-	read_lock(&fib_lock);	
+	if (!s)
+		return NULL;
 
-	if (s) {
-		if (n) {
-			if (IS_TNODE(n)) {
-				struct tnode *tn = (struct tnode *)n;
-				pend = tn->pos+tn->bits;
-				indent += 3;
-				s->nodesizes[tn->bits]++;
-				depth++;
+	rcu_read_lock();
+	n = rcu_dereference(t->trie);
 
-				while (tn && cindex < (1 << tn->bits)) {
-					if (tn->child[cindex]) {
-						/* Got a child */
-				
-						if (IS_LEAF(tn->child[cindex])) {
-							cindex++;
-					
-							/* stats */
-							if (depth > s->maxdepth)
-								s->maxdepth = depth;
-							s->totdepth += depth;
-							s->leaves++;
-						}
-				
-						else {
-							/*
-							 * New tnode. Decend one level
-							 */
-					
-							s->tnodes++;
-							s->nodesizes[tn->bits]++;
-							depth++;
-					
-							n = tn->child[cindex];
-							tn = (struct tnode *)n;
-							pend = tn->pos+tn->bits;
-
-							indent += 3;
-							cindex = 0;
-						}
-					}
-					else {
-						cindex++;
-						s->nullpointers++;
-					}
+	if (!n)
+		return s;
+
+	if (IS_TNODE(n)) {
+		struct tnode *tn = (struct tnode *)n;
+		pend = tn->pos+tn->bits;
+		s->nodesizes[tn->bits]++;
+		depth++;
+
+		while (tn && cindex < (1 << tn->bits)) {
+			struct node *ch = rcu_dereference(tn->child[cindex]);
+			if (ch) {
 
+				/* Got a child */
+
+				if (IS_LEAF(tn->child[cindex])) {
+					cindex++;
+
+					/* stats */
+					if (depth > s->maxdepth)
+						s->maxdepth = depth;
+					s->totdepth += depth;
+					s->leaves++;
+				} else {
 					/*
-					 * Test if we are done
+					 * New tnode. Decend one level
 					 */
-			
-					while (cindex >= (1 << tn->bits)) {
-
-						/*
-						 * Move upwards and test for root
-						 * pop off all traversed  nodes
-						 */
-
-					
-						if (NODE_PARENT(tn) == NULL) {
-							tn = NULL;
-							n = NULL;
-							break;
-						}
-						else {
-							cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
-							tn = NODE_PARENT(tn);
-							cindex++;
-							n = (struct node *)tn;
-							pend = tn->pos+tn->bits;
-							indent -= 3;
-							depth--;
-						}
- 					}
+
+					s->tnodes++;
+					s->nodesizes[tn->bits]++;
+					depth++;
+
+					n = ch;
+					tn = (struct tnode *)n;
+					pend = tn->pos+tn->bits;
+
+					cindex = 0;
 				}
+			} else {
+				cindex++;
+				s->nullpointers++;
 			}
-			else n = NULL;
+
+			/*
+			 * Test if we are done
+			 */
+
+			while (cindex >= (1 << tn->bits)) {
+				/*
+				 * Move upwards and test for root
+				 * pop off all traversed  nodes
+				 */
+
+				if (NODE_PARENT(tn) == NULL) {
+					tn = NULL;
+					n = NULL;
+					break;
+				}
+
+				cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
+				tn = NODE_PARENT(tn);
+				cindex++;
+				n = (struct node *)tn;
+				pend = tn->pos+tn->bits;
+				depth--;
+ 			}
 		}
 	}
 
-	read_unlock(&fib_lock);	
+	rcu_read_unlock();
 	return s;
 }
 
@@ -2359,17 +2269,22 @@ static struct fib_alias *fib_triestat_get_next(struct seq_file *seq)
 
 static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos)
 {
-	void *v = NULL;
+	if (!ip_fib_main_table)
+		return NULL;
 
-	if (ip_fib_main_table)
-		v = *pos ? fib_triestat_get_next(seq) : SEQ_START_TOKEN;
-	return v;
+	if (*pos)
+		return fib_triestat_get_next(seq);
+	else
+		return SEQ_START_TOKEN;
 }
 
 static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	++*pos;
-	return v == SEQ_START_TOKEN ? fib_triestat_get_first(seq) : fib_triestat_get_next(seq);
+	if (v == SEQ_START_TOKEN)
+		return fib_triestat_get_first(seq);
+	else
+		return fib_triestat_get_next(seq);
 }
 
 static void fib_triestat_seq_stop(struct seq_file *seq, void *v)
@@ -2388,22 +2303,22 @@ static void collect_and_show(struct trie *t, struct seq_file *seq)
 {
 	int bytes = 0; /* How many bytes are used, a ref is 4 bytes */
 	int i, max, pointers;
-        struct trie_stat *stat;
+	struct trie_stat *stat;
 	int avdepth;
 
 	stat = trie_collect_stats(t);
 
-	bytes=0;
+	bytes = 0;
 	seq_printf(seq, "trie=%p\n", t);
 
 	if (stat) {
 		if (stat->leaves)
-			avdepth=stat->totdepth*100 / stat->leaves;
+			avdepth = stat->totdepth*100 / stat->leaves;
 		else
-			avdepth=0;
-		seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100 );
+			avdepth = 0;
+		seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100);
 		seq_printf(seq, "Max depth: %4d\n", stat->maxdepth);
-			
+
 		seq_printf(seq, "Leaves: %d\n", stat->leaves);
 		bytes += sizeof(struct leaf) * stat->leaves;
 		seq_printf(seq, "Internal nodes: %d\n", stat->tnodes);
@@ -2455,11 +2370,9 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v)
 
 		if (trie_main)
 			collect_and_show(trie_main, seq);
-	}
-	else {
-		snprintf(bf, sizeof(bf),
-			 "*\t%08X\t%08X", 200, 400);
-	
+	} else {
+		snprintf(bf, sizeof(bf), "*\t%08X\t%08X", 200, 400);
+
 		seq_printf(seq, "%-127s\n", bf);
 	}
 	return 0;
@@ -2520,22 +2433,27 @@ static struct fib_alias *fib_trie_get_next(struct seq_file *seq)
 
 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
 {
-	void *v = NULL;
+	if (!ip_fib_main_table)
+		return NULL;
 
-	if (ip_fib_main_table)
-		v = *pos ? fib_trie_get_next(seq) : SEQ_START_TOKEN;
-	return v;
+	if (*pos)
+		return fib_trie_get_next(seq);
+	else
+		return SEQ_START_TOKEN;
 }
 
 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	++*pos;
-	return v == SEQ_START_TOKEN ? fib_trie_get_first(seq) : fib_trie_get_next(seq);
+	if (v == SEQ_START_TOKEN)
+		return fib_trie_get_first(seq);
+	else
+		return fib_trie_get_next(seq);
+
 }
 
 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
 {
-
 }
 
 /*
@@ -2555,9 +2473,7 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
 
 		if (trie_main)
 			trie_dump_seq(seq, trie_main);
-	}
-
-	else {
+	} else {
 		snprintf(bf, sizeof(bf),
 			 "*\t%08X\t%08X", 200, 400);
 		seq_printf(seq, "%-127s\n", bf);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index badfc584997..24eb56ae1b5 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -114,7 +114,7 @@ struct icmp_bxm {
 /*
  *	Statistics
  */
-DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics);
+DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics) __read_mostly;
 
 /* An array of errno for error messages from dest unreach. */
 /* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */
@@ -627,11 +627,10 @@ static void icmp_unreach(struct sk_buff *skb)
 			break;
 		case ICMP_FRAG_NEEDED:
 			if (ipv4_config.no_pmtu_disc) {
-				LIMIT_NETDEBUG(
-					printk(KERN_INFO "ICMP: %u.%u.%u.%u: "
+				LIMIT_NETDEBUG(KERN_INFO "ICMP: %u.%u.%u.%u: "
 							 "fragmentation needed "
 							 "and DF set.\n",
-					       NIPQUAD(iph->daddr)));
+					       NIPQUAD(iph->daddr));
 			} else {
 				info = ip_rt_frag_needed(iph,
 						     ntohs(icmph->un.frag.mtu));
@@ -640,10 +639,9 @@ static void icmp_unreach(struct sk_buff *skb)
 			}
 			break;
 		case ICMP_SR_FAILED:
-			LIMIT_NETDEBUG(
-				printk(KERN_INFO "ICMP: %u.%u.%u.%u: Source "
+			LIMIT_NETDEBUG(KERN_INFO "ICMP: %u.%u.%u.%u: Source "
 						 "Route Failed.\n",
-				       NIPQUAD(iph->daddr)));
+				       NIPQUAD(iph->daddr));
 			break;
 		default:
 			break;
@@ -936,7 +934,7 @@ int icmp_rcv(struct sk_buff *skb)
 	case CHECKSUM_HW:
 		if (!(u16)csum_fold(skb->csum))
 			break;
-		LIMIT_NETDEBUG(printk(KERN_DEBUG "icmp v4 hw csum failure\n"));
+		LIMIT_NETDEBUG(KERN_DEBUG "icmp v4 hw csum failure\n");
 	case CHECKSUM_NONE:
 		if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0)))
 			goto error;
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 5088f90835a..44607f4767b 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -904,7 +904,7 @@ int igmp_rcv(struct sk_buff *skb)
 	case IGMP_MTRACE_RESP:
 		break;
 	default:
-		NETDEBUG(printk(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type));
+		NETDEBUG(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type);
 	}
 	in_dev_put(in_dev);
 	kfree_skb(skb);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
new file mode 100644
index 00000000000..fe3c6d3d0c9
--- /dev/null
+++ b/net/ipv4/inet_connection_sock.c
@@ -0,0 +1,641 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Support for INET connection oriented protocols.
+ *
+ * Authors:	See the TCP sources
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or(at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/jhash.h>
+
+#include <net/inet_connection_sock.h>
+#include <net/inet_hashtables.h>
+#include <net/inet_timewait_sock.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/tcp_states.h>
+#include <net/xfrm.h>
+
+#ifdef INET_CSK_DEBUG
+const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
+EXPORT_SYMBOL(inet_csk_timer_bug_msg);
+#endif
+
+/*
+ * This array holds the first and last local port number.
+ * For high-usage systems, use sysctl to change this to
+ * 32768-61000
+ */
+int sysctl_local_port_range[2] = { 1024, 4999 };
+
+static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
+{
+	const u32 sk_rcv_saddr = inet_rcv_saddr(sk);
+	struct sock *sk2;
+	struct hlist_node *node;
+	int reuse = sk->sk_reuse;
+
+	sk_for_each_bound(sk2, node, &tb->owners) {
+		if (sk != sk2 &&
+		    !inet_v6_ipv6only(sk2) &&
+		    (!sk->sk_bound_dev_if ||
+		     !sk2->sk_bound_dev_if ||
+		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
+			if (!reuse || !sk2->sk_reuse ||
+			    sk2->sk_state == TCP_LISTEN) {
+				const u32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
+				if (!sk2_rcv_saddr || !sk_rcv_saddr ||
+				    sk2_rcv_saddr == sk_rcv_saddr)
+					break;
+			}
+		}
+	}
+	return node != NULL;
+}
+
+/* Obtain a reference to a local port for the given sock,
+ * if snum is zero it means select any available local port.
+ */
+int inet_csk_get_port(struct inet_hashinfo *hashinfo,
+		      struct sock *sk, unsigned short snum)
+{
+	struct inet_bind_hashbucket *head;
+	struct hlist_node *node;
+	struct inet_bind_bucket *tb;
+	int ret;
+
+	local_bh_disable();
+	if (!snum) {
+		int low = sysctl_local_port_range[0];
+		int high = sysctl_local_port_range[1];
+		int remaining = (high - low) + 1;
+		int rover;
+
+		spin_lock(&hashinfo->portalloc_lock);
+		if (hashinfo->port_rover < low)
+			rover = low;
+		else
+			rover = hashinfo->port_rover;
+		do {
+			rover++;
+			if (rover > high)
+				rover = low;
+			head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)];
+			spin_lock(&head->lock);
+			inet_bind_bucket_for_each(tb, node, &head->chain)
+				if (tb->port == rover)
+					goto next;
+			break;
+		next:
+			spin_unlock(&head->lock);
+		} while (--remaining > 0);
+		hashinfo->port_rover = rover;
+		spin_unlock(&hashinfo->portalloc_lock);
+
+		/* Exhausted local port range during search?  It is not
+		 * possible for us to be holding one of the bind hash
+		 * locks if this test triggers, because if 'remaining'
+		 * drops to zero, we broke out of the do/while loop at
+		 * the top level, not from the 'break;' statement.
+		 */
+		ret = 1;
+		if (remaining <= 0)
+			goto fail;
+
+		/* OK, here is the one we will use.  HEAD is
+		 * non-NULL and we hold it's mutex.
+		 */
+		snum = rover;
+	} else {
+		head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)];
+		spin_lock(&head->lock);
+		inet_bind_bucket_for_each(tb, node, &head->chain)
+			if (tb->port == snum)
+				goto tb_found;
+	}
+	tb = NULL;
+	goto tb_not_found;
+tb_found:
+	if (!hlist_empty(&tb->owners)) {
+		if (sk->sk_reuse > 1)
+			goto success;
+		if (tb->fastreuse > 0 &&
+		    sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
+			goto success;
+		} else {
+			ret = 1;
+			if (inet_csk_bind_conflict(sk, tb))
+				goto fail_unlock;
+		}
+	}
+tb_not_found:
+	ret = 1;
+	if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum)) == NULL)
+		goto fail_unlock;
+	if (hlist_empty(&tb->owners)) {
+		if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
+			tb->fastreuse = 1;
+		else
+			tb->fastreuse = 0;
+	} else if (tb->fastreuse &&
+		   (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
+		tb->fastreuse = 0;
+success:
+	if (!inet_csk(sk)->icsk_bind_hash)
+		inet_bind_hash(sk, tb, snum);
+	BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb);
+ 	ret = 0;
+
+fail_unlock:
+	spin_unlock(&head->lock);
+fail:
+	local_bh_enable();
+	return ret;
+}
+
+EXPORT_SYMBOL_GPL(inet_csk_get_port);
+
+/*
+ * Wait for an incoming connection, avoid race conditions. This must be called
+ * with the socket locked.
+ */
+static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	DEFINE_WAIT(wait);
+	int err;
+
+	/*
+	 * True wake-one mechanism for incoming connections: only
+	 * one process gets woken up, not the 'whole herd'.
+	 * Since we do not 'race & poll' for established sockets
+	 * anymore, the common case will execute the loop only once.
+	 *
+	 * Subtle issue: "add_wait_queue_exclusive()" will be added
+	 * after any current non-exclusive waiters, and we know that
+	 * it will always _stay_ after any new non-exclusive waiters
+	 * because all non-exclusive waiters are added at the
+	 * beginning of the wait-queue. As such, it's ok to "drop"
+	 * our exclusiveness temporarily when we get woken up without
+	 * having to remove and re-insert us on the wait queue.
+	 */
+	for (;;) {
+		prepare_to_wait_exclusive(sk->sk_sleep, &wait,
+					  TASK_INTERRUPTIBLE);
+		release_sock(sk);
+		if (reqsk_queue_empty(&icsk->icsk_accept_queue))
+			timeo = schedule_timeout(timeo);
+		lock_sock(sk);
+		err = 0;
+		if (!reqsk_queue_empty(&icsk->icsk_accept_queue))
+			break;
+		err = -EINVAL;
+		if (sk->sk_state != TCP_LISTEN)
+			break;
+		err = sock_intr_errno(timeo);
+		if (signal_pending(current))
+			break;
+		err = -EAGAIN;
+		if (!timeo)
+			break;
+	}
+	finish_wait(sk->sk_sleep, &wait);
+	return err;
+}
+
+/*
+ * This will accept the next outstanding connection.
+ */
+struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct sock *newsk;
+	int error;
+
+	lock_sock(sk);
+
+	/* We need to make sure that this socket is listening,
+	 * and that it has something pending.
+	 */
+	error = -EINVAL;
+	if (sk->sk_state != TCP_LISTEN)
+		goto out_err;
+
+	/* Find already established connection */
+	if (reqsk_queue_empty(&icsk->icsk_accept_queue)) {
+		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+
+		/* If this is a non blocking socket don't sleep */
+		error = -EAGAIN;
+		if (!timeo)
+			goto out_err;
+
+		error = inet_csk_wait_for_connect(sk, timeo);
+		if (error)
+			goto out_err;
+	}
+
+	newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk);
+	BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
+out:
+	release_sock(sk);
+	return newsk;
+out_err:
+	newsk = NULL;
+	*err = error;
+	goto out;
+}
+
+EXPORT_SYMBOL(inet_csk_accept);
+
+/*
+ * Using different timers for retransmit, delayed acks and probes
+ * We may wish use just one timer maintaining a list of expire jiffies 
+ * to optimize.
+ */
+void inet_csk_init_xmit_timers(struct sock *sk,
+			       void (*retransmit_handler)(unsigned long),
+			       void (*delack_handler)(unsigned long),
+			       void (*keepalive_handler)(unsigned long))
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	init_timer(&icsk->icsk_retransmit_timer);
+	init_timer(&icsk->icsk_delack_timer);
+	init_timer(&sk->sk_timer);
+
+	icsk->icsk_retransmit_timer.function = retransmit_handler;
+	icsk->icsk_delack_timer.function     = delack_handler;
+	sk->sk_timer.function		     = keepalive_handler;
+
+	icsk->icsk_retransmit_timer.data = 
+		icsk->icsk_delack_timer.data =
+			sk->sk_timer.data  = (unsigned long)sk;
+
+	icsk->icsk_pending = icsk->icsk_ack.pending = 0;
+}
+
+EXPORT_SYMBOL(inet_csk_init_xmit_timers);
+
+void inet_csk_clear_xmit_timers(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	icsk->icsk_pending = icsk->icsk_ack.pending = icsk->icsk_ack.blocked = 0;
+
+	sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
+	sk_stop_timer(sk, &icsk->icsk_delack_timer);
+	sk_stop_timer(sk, &sk->sk_timer);
+}
+
+EXPORT_SYMBOL(inet_csk_clear_xmit_timers);
+
+void inet_csk_delete_keepalive_timer(struct sock *sk)
+{
+	sk_stop_timer(sk, &sk->sk_timer);
+}
+
+EXPORT_SYMBOL(inet_csk_delete_keepalive_timer);
+
+void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
+{
+	sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
+}
+
+EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
+
+struct dst_entry* inet_csk_route_req(struct sock *sk,
+				     const struct request_sock *req)
+{
+	struct rtable *rt;
+	const struct inet_request_sock *ireq = inet_rsk(req);
+	struct ip_options *opt = inet_rsk(req)->opt;
+	struct flowi fl = { .oif = sk->sk_bound_dev_if,
+			    .nl_u = { .ip4_u =
+				      { .daddr = ((opt && opt->srr) ?
+						  opt->faddr :
+						  ireq->rmt_addr),
+					.saddr = ireq->loc_addr,
+					.tos = RT_CONN_FLAGS(sk) } },
+			    .proto = sk->sk_protocol,
+			    .uli_u = { .ports =
+				       { .sport = inet_sk(sk)->sport,
+					 .dport = ireq->rmt_port } } };
+
+	if (ip_route_output_flow(&rt, &fl, sk, 0)) {
+		IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
+		return NULL;
+	}
+	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
+		ip_rt_put(rt);
+		IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
+		return NULL;
+	}
+	return &rt->u.dst;
+}
+
+EXPORT_SYMBOL_GPL(inet_csk_route_req);
+
+static inline u32 inet_synq_hash(const u32 raddr, const u16 rport,
+				 const u32 rnd, const u16 synq_hsize)
+{
+	return jhash_2words(raddr, (u32)rport, rnd) & (synq_hsize - 1);
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#define AF_INET_FAMILY(fam) ((fam) == AF_INET)
+#else
+#define AF_INET_FAMILY(fam) 1
+#endif
+
+struct request_sock *inet_csk_search_req(const struct sock *sk,
+					 struct request_sock ***prevp,
+					 const __u16 rport, const __u32 raddr,
+					 const __u32 laddr)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
+	struct request_sock *req, **prev;
+
+	for (prev = &lopt->syn_table[inet_synq_hash(raddr, rport, lopt->hash_rnd,
+						    lopt->nr_table_entries)];
+	     (req = *prev) != NULL;
+	     prev = &req->dl_next) {
+		const struct inet_request_sock *ireq = inet_rsk(req);
+
+		if (ireq->rmt_port == rport &&
+		    ireq->rmt_addr == raddr &&
+		    ireq->loc_addr == laddr &&
+		    AF_INET_FAMILY(req->rsk_ops->family)) {
+			BUG_TRAP(!req->sk);
+			*prevp = prev;
+			break;
+		}
+	}
+
+	return req;
+}
+
+EXPORT_SYMBOL_GPL(inet_csk_search_req);
+
+void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
+				   const unsigned timeout)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
+	const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port,
+				     lopt->hash_rnd, lopt->nr_table_entries);
+
+	reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
+	inet_csk_reqsk_queue_added(sk, timeout);
+}
+
+/* Only thing we need from tcp.h */
+extern int sysctl_tcp_synack_retries;
+
+EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
+
+void inet_csk_reqsk_queue_prune(struct sock *parent,
+				const unsigned long interval,
+				const unsigned long timeout,
+				const unsigned long max_rto)
+{
+	struct inet_connection_sock *icsk = inet_csk(parent);
+	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
+	struct listen_sock *lopt = queue->listen_opt;
+	int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
+	int thresh = max_retries;
+	unsigned long now = jiffies;
+	struct request_sock **reqp, *req;
+	int i, budget;
+
+	if (lopt == NULL || lopt->qlen == 0)
+		return;
+
+	/* Normally all the openreqs are young and become mature
+	 * (i.e. converted to established socket) for first timeout.
+	 * If synack was not acknowledged for 3 seconds, it means
+	 * one of the following things: synack was lost, ack was lost,
+	 * rtt is high or nobody planned to ack (i.e. synflood).
+	 * When server is a bit loaded, queue is populated with old
+	 * open requests, reducing effective size of queue.
+	 * When server is well loaded, queue size reduces to zero
+	 * after several minutes of work. It is not synflood,
+	 * it is normal operation. The solution is pruning
+	 * too old entries overriding normal timeout, when
+	 * situation becomes dangerous.
+	 *
+	 * Essentially, we reserve half of room for young
+	 * embrions; and abort old ones without pity, if old
+	 * ones are about to clog our table.
+	 */
+	if (lopt->qlen>>(lopt->max_qlen_log-1)) {
+		int young = (lopt->qlen_young<<1);
+
+		while (thresh > 2) {
+			if (lopt->qlen < young)
+				break;
+			thresh--;
+			young <<= 1;
+		}
+	}
+
+	if (queue->rskq_defer_accept)
+		max_retries = queue->rskq_defer_accept;
+
+	budget = 2 * (lopt->nr_table_entries / (timeout / interval));
+	i = lopt->clock_hand;
+
+	do {
+		reqp=&lopt->syn_table[i];
+		while ((req = *reqp) != NULL) {
+			if (time_after_eq(now, req->expires)) {
+				if ((req->retrans < thresh ||
+				     (inet_rsk(req)->acked && req->retrans < max_retries))
+				    && !req->rsk_ops->rtx_syn_ack(parent, req, NULL)) {
+					unsigned long timeo;
+
+					if (req->retrans++ == 0)
+						lopt->qlen_young--;
+					timeo = min((timeout << req->retrans), max_rto);
+					req->expires = now + timeo;
+					reqp = &req->dl_next;
+					continue;
+				}
+
+				/* Drop this request */
+				inet_csk_reqsk_queue_unlink(parent, req, reqp);
+				reqsk_queue_removed(queue, req);
+				reqsk_free(req);
+				continue;
+			}
+			reqp = &req->dl_next;
+		}
+
+		i = (i + 1) & (lopt->nr_table_entries - 1);
+
+	} while (--budget > 0);
+
+	lopt->clock_hand = i;
+
+	if (lopt->qlen)
+		inet_csk_reset_keepalive_timer(parent, interval);
+}
+
+EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune);
+
+struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
+			    const unsigned int __nocast priority)
+{
+	struct sock *newsk = sk_clone(sk, priority);
+
+	if (newsk != NULL) {
+		struct inet_connection_sock *newicsk = inet_csk(newsk);
+
+		newsk->sk_state = TCP_SYN_RECV;
+		newicsk->icsk_bind_hash = NULL;
+
+		inet_sk(newsk)->dport = inet_rsk(req)->rmt_port;
+		newsk->sk_write_space = sk_stream_write_space;
+
+		newicsk->icsk_retransmits = 0;
+		newicsk->icsk_backoff	  = 0;
+		newicsk->icsk_probes_out  = 0;
+
+		/* Deinitialize accept_queue to trap illegal accesses. */
+		memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));
+	}
+	return newsk;
+}
+
+EXPORT_SYMBOL_GPL(inet_csk_clone);
+
+/*
+ * At this point, there should be no process reference to this
+ * socket, and thus no user references at all.  Therefore we
+ * can assume the socket waitqueue is inactive and nobody will
+ * try to jump onto it.
+ */
+void inet_csk_destroy_sock(struct sock *sk)
+{
+	BUG_TRAP(sk->sk_state == TCP_CLOSE);
+	BUG_TRAP(sock_flag(sk, SOCK_DEAD));
+
+	/* It cannot be in hash table! */
+	BUG_TRAP(sk_unhashed(sk));
+
+	/* If it has not 0 inet_sk(sk)->num, it must be bound */
+	BUG_TRAP(!inet_sk(sk)->num || inet_csk(sk)->icsk_bind_hash);
+
+	sk->sk_prot->destroy(sk);
+
+	sk_stream_kill_queues(sk);
+
+	xfrm_sk_free_policy(sk);
+
+	sk_refcnt_debug_release(sk);
+
+	atomic_dec(sk->sk_prot->orphan_count);
+	sock_put(sk);
+}
+
+EXPORT_SYMBOL(inet_csk_destroy_sock);
+
+int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
+
+	if (rc != 0)
+		return rc;
+
+	sk->sk_max_ack_backlog = 0;
+	sk->sk_ack_backlog = 0;
+	inet_csk_delack_init(sk);
+
+	/* There is race window here: we announce ourselves listening,
+	 * but this transition is still not validated by get_port().
+	 * It is OK, because this socket enters to hash table only
+	 * after validation is complete.
+	 */
+	sk->sk_state = TCP_LISTEN;
+	if (!sk->sk_prot->get_port(sk, inet->num)) {
+		inet->sport = htons(inet->num);
+
+		sk_dst_reset(sk);
+		sk->sk_prot->hash(sk);
+
+		return 0;
+	}
+
+	sk->sk_state = TCP_CLOSE;
+	__reqsk_queue_destroy(&icsk->icsk_accept_queue);
+	return -EADDRINUSE;
+}
+
+EXPORT_SYMBOL_GPL(inet_csk_listen_start);
+
+/*
+ *	This routine closes sockets which have been at least partially
+ *	opened, but not yet accepted.
+ */
+void inet_csk_listen_stop(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct request_sock *acc_req;
+	struct request_sock *req;
+
+	inet_csk_delete_keepalive_timer(sk);
+
+	/* make all the listen_opt local to us */
+	acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue);
+
+	/* Following specs, it would be better either to send FIN
+	 * (and enter FIN-WAIT-1, it is normal close)
+	 * or to send active reset (abort).
+	 * Certainly, it is pretty dangerous while synflood, but it is
+	 * bad justification for our negligence 8)
+	 * To be honest, we are not able to make either
+	 * of the variants now.			--ANK
+	 */
+	reqsk_queue_destroy(&icsk->icsk_accept_queue);
+
+	while ((req = acc_req) != NULL) {
+		struct sock *child = req->sk;
+
+		acc_req = req->dl_next;
+
+		local_bh_disable();
+		bh_lock_sock(child);
+		BUG_TRAP(!sock_owned_by_user(child));
+		sock_hold(child);
+
+		sk->sk_prot->disconnect(child, O_NONBLOCK);
+
+		sock_orphan(child);
+
+		atomic_inc(sk->sk_prot->orphan_count);
+
+		inet_csk_destroy_sock(child);
+
+		bh_unlock_sock(child);
+		local_bh_enable();
+		sock_put(child);
+
+		sk_acceptq_removed(sk);
+		__reqsk_free(req);
+	}
+	BUG_TRAP(!sk->sk_ack_backlog);
+}
+
+EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
new file mode 100644
index 00000000000..71f3c7350c6
--- /dev/null
+++ b/net/ipv4/inet_diag.c
@@ -0,0 +1,868 @@
+/*
+ * inet_diag.c	Module for monitoring INET transport protocols sockets.
+ *
+ * Version:	$Id: inet_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/random.h>
+#include <linux/cache.h>
+#include <linux/init.h>
+#include <linux/time.h>
+
+#include <net/icmp.h>
+#include <net/tcp.h>
+#include <net/ipv6.h>
+#include <net/inet_common.h>
+#include <net/inet_connection_sock.h>
+#include <net/inet_hashtables.h>
+#include <net/inet_timewait_sock.h>
+#include <net/inet6_hashtables.h>
+
+#include <linux/inet.h>
+#include <linux/stddef.h>
+
+#include <linux/inet_diag.h>
+
+static const struct inet_diag_handler **inet_diag_table;
+
+struct inet_diag_entry {
+	u32 *saddr;
+	u32 *daddr;
+	u16 sport;
+	u16 dport;
+	u16 family;
+	u16 userlocks;
+};
+
+static struct sock *idiagnl;
+
+#define INET_DIAG_PUT(skb, attrtype, attrlen) \
+	RTA_DATA(__RTA_PUT(skb, attrtype, attrlen))
+
+static int inet_diag_fill(struct sk_buff *skb, struct sock *sk,
+			int ext, u32 pid, u32 seq, u16 nlmsg_flags,
+			const struct nlmsghdr *unlh)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct inet_diag_msg *r;
+	struct nlmsghdr  *nlh;
+	void *info = NULL;
+	struct inet_diag_meminfo  *minfo = NULL;
+	unsigned char	 *b = skb->tail;
+	const struct inet_diag_handler *handler;
+
+	handler = inet_diag_table[unlh->nlmsg_type];
+	BUG_ON(handler == NULL);
+
+	nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r));
+	nlh->nlmsg_flags = nlmsg_flags;
+
+	r = NLMSG_DATA(nlh);
+	if (sk->sk_state != TCP_TIME_WAIT) {
+		if (ext & (1 << (INET_DIAG_MEMINFO - 1)))
+			minfo = INET_DIAG_PUT(skb, INET_DIAG_MEMINFO,
+					      sizeof(*minfo));
+		if (ext & (1 << (INET_DIAG_INFO - 1)))
+			info = INET_DIAG_PUT(skb, INET_DIAG_INFO,
+					   handler->idiag_info_size);
+		
+		if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) {
+			size_t len = strlen(icsk->icsk_ca_ops->name);
+			strcpy(INET_DIAG_PUT(skb, INET_DIAG_CONG, len + 1),
+			       icsk->icsk_ca_ops->name);
+		}
+	}
+	r->idiag_family = sk->sk_family;
+	r->idiag_state = sk->sk_state;
+	r->idiag_timer = 0;
+	r->idiag_retrans = 0;
+
+	r->id.idiag_if = sk->sk_bound_dev_if;
+	r->id.idiag_cookie[0] = (u32)(unsigned long)sk;
+	r->id.idiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1);
+
+	if (r->idiag_state == TCP_TIME_WAIT) {
+		const struct inet_timewait_sock *tw = inet_twsk(sk);
+		long tmo = tw->tw_ttd - jiffies;
+		if (tmo < 0)
+			tmo = 0;
+
+		r->id.idiag_sport = tw->tw_sport;
+		r->id.idiag_dport = tw->tw_dport;
+		r->id.idiag_src[0] = tw->tw_rcv_saddr;
+		r->id.idiag_dst[0] = tw->tw_daddr;
+		r->idiag_state = tw->tw_substate;
+		r->idiag_timer = 3;
+		r->idiag_expires = (tmo * 1000 + HZ - 1) / HZ;
+		r->idiag_rqueue = 0;
+		r->idiag_wqueue = 0;
+		r->idiag_uid = 0;
+		r->idiag_inode = 0;
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+		if (r->idiag_family == AF_INET6) {
+			const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk);
+
+			ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
+				       &tcp6tw->tw_v6_rcv_saddr);
+			ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
+				       &tcp6tw->tw_v6_daddr);
+		}
+#endif
+		nlh->nlmsg_len = skb->tail - b;
+		return skb->len;
+	}
+
+	r->id.idiag_sport = inet->sport;
+	r->id.idiag_dport = inet->dport;
+	r->id.idiag_src[0] = inet->rcv_saddr;
+	r->id.idiag_dst[0] = inet->daddr;
+
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+	if (r->idiag_family == AF_INET6) {
+		struct ipv6_pinfo *np = inet6_sk(sk);
+
+		ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
+			       &np->rcv_saddr);
+		ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
+			       &np->daddr);
+	}
+#endif
+
+#define EXPIRES_IN_MS(tmo)  ((tmo - jiffies) * 1000 + HZ - 1) / HZ
+
+	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
+		r->idiag_timer = 1;
+		r->idiag_retrans = icsk->icsk_retransmits;
+		r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
+	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
+		r->idiag_timer = 4;
+		r->idiag_retrans = icsk->icsk_probes_out;
+		r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
+	} else if (timer_pending(&sk->sk_timer)) {
+		r->idiag_timer = 2;
+		r->idiag_retrans = icsk->icsk_probes_out;
+		r->idiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires);
+	} else {
+		r->idiag_timer = 0;
+		r->idiag_expires = 0;
+	}
+#undef EXPIRES_IN_MS
+
+	r->idiag_uid = sock_i_uid(sk);
+	r->idiag_inode = sock_i_ino(sk);
+
+	if (minfo) {
+		minfo->idiag_rmem = atomic_read(&sk->sk_rmem_alloc);
+		minfo->idiag_wmem = sk->sk_wmem_queued;
+		minfo->idiag_fmem = sk->sk_forward_alloc;
+		minfo->idiag_tmem = atomic_read(&sk->sk_wmem_alloc);
+	}
+
+	handler->idiag_get_info(sk, r, info);
+
+	if (sk->sk_state < TCP_TIME_WAIT &&
+	    icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info)
+		icsk->icsk_ca_ops->get_info(sk, ext, skb);
+
+	nlh->nlmsg_len = skb->tail - b;
+	return skb->len;
+
+rtattr_failure:
+nlmsg_failure:
+	skb_trim(skb, b - skb->data);
+	return -1;
+}
+
+static int inet_diag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh)
+{
+	int err;
+	struct sock *sk;
+	struct inet_diag_req *req = NLMSG_DATA(nlh);
+	struct sk_buff *rep;
+	struct inet_hashinfo *hashinfo;
+	const struct inet_diag_handler *handler;
+
+	handler = inet_diag_table[nlh->nlmsg_type];
+	BUG_ON(handler == NULL);
+	hashinfo = handler->idiag_hashinfo;
+
+	if (req->idiag_family == AF_INET) {
+		sk = inet_lookup(hashinfo, req->id.idiag_dst[0],
+				 req->id.idiag_dport, req->id.idiag_src[0],
+				 req->id.idiag_sport, req->id.idiag_if);
+	}
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+	else if (req->idiag_family == AF_INET6) {
+		sk = inet6_lookup(hashinfo,
+				  (struct in6_addr *)req->id.idiag_dst,
+				  req->id.idiag_dport,
+				  (struct in6_addr *)req->id.idiag_src,
+				  req->id.idiag_sport,
+				  req->id.idiag_if);
+	}
+#endif
+	else {
+		return -EINVAL;
+	}
+
+	if (sk == NULL)
+		return -ENOENT;
+
+	err = -ESTALE;
+	if ((req->id.idiag_cookie[0] != INET_DIAG_NOCOOKIE ||
+	     req->id.idiag_cookie[1] != INET_DIAG_NOCOOKIE) &&
+	    ((u32)(unsigned long)sk != req->id.idiag_cookie[0] ||
+	     (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.idiag_cookie[1]))
+		goto out;
+
+	err = -ENOMEM;
+	rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) +
+				     sizeof(struct inet_diag_meminfo) +
+				     handler->idiag_info_size + 64)),
+			GFP_KERNEL);
+	if (!rep)
+		goto out;
+
+	if (inet_diag_fill(rep, sk, req->idiag_ext,
+			 NETLINK_CB(in_skb).pid,
+			 nlh->nlmsg_seq, 0, nlh) <= 0)
+		BUG();
+
+	err = netlink_unicast(idiagnl, rep, NETLINK_CB(in_skb).pid,
+			      MSG_DONTWAIT);
+	if (err > 0)
+		err = 0;
+
+out:
+	if (sk) {
+		if (sk->sk_state == TCP_TIME_WAIT)
+			inet_twsk_put((struct inet_timewait_sock *)sk);
+		else
+			sock_put(sk);
+	}
+	return err;
+}
+
+static int bitstring_match(const u32 *a1, const u32 *a2, int bits)
+{
+	int words = bits >> 5;
+
+	bits &= 0x1f;
+
+	if (words) {
+		if (memcmp(a1, a2, words << 2))
+			return 0;
+	}
+	if (bits) {
+		__u32 w1, w2;
+		__u32 mask;
+
+		w1 = a1[words];
+		w2 = a2[words];
+
+		mask = htonl((0xffffffff) << (32 - bits));
+
+		if ((w1 ^ w2) & mask)
+			return 0;
+	}
+
+	return 1;
+}
+
+
+static int inet_diag_bc_run(const void *bc, int len,
+			  const struct inet_diag_entry *entry)
+{
+	while (len > 0) {
+		int yes = 1;
+		const struct inet_diag_bc_op *op = bc;
+
+		switch (op->code) {
+		case INET_DIAG_BC_NOP:
+			break;
+		case INET_DIAG_BC_JMP:
+			yes = 0;
+			break;
+		case INET_DIAG_BC_S_GE:
+			yes = entry->sport >= op[1].no;
+			break;
+		case INET_DIAG_BC_S_LE:
+			yes = entry->dport <= op[1].no;
+			break;
+		case INET_DIAG_BC_D_GE:
+			yes = entry->dport >= op[1].no;
+			break;
+		case INET_DIAG_BC_D_LE:
+			yes = entry->dport <= op[1].no;
+			break;
+		case INET_DIAG_BC_AUTO:
+			yes = !(entry->userlocks & SOCK_BINDPORT_LOCK);
+			break;
+		case INET_DIAG_BC_S_COND:
+		case INET_DIAG_BC_D_COND: {
+			struct inet_diag_hostcond *cond;
+			u32 *addr;
+
+			cond = (struct inet_diag_hostcond *)(op + 1);
+			if (cond->port != -1 &&
+			    cond->port != (op->code == INET_DIAG_BC_S_COND ?
+					     entry->sport : entry->dport)) {
+				yes = 0;
+				break;
+			}
+			
+			if (cond->prefix_len == 0)
+				break;
+
+			if (op->code == INET_DIAG_BC_S_COND)
+				addr = entry->saddr;
+			else
+				addr = entry->daddr;
+
+			if (bitstring_match(addr, cond->addr, cond->prefix_len))
+				break;
+			if (entry->family == AF_INET6 &&
+			    cond->family == AF_INET) {
+				if (addr[0] == 0 && addr[1] == 0 &&
+				    addr[2] == htonl(0xffff) &&
+				    bitstring_match(addr + 3, cond->addr,
+					    	    cond->prefix_len))
+					break;
+			}
+			yes = 0;
+			break;
+		}
+		}
+
+		if (yes) { 
+			len -= op->yes;
+			bc += op->yes;
+		} else {
+			len -= op->no;
+			bc += op->no;
+		}
+	}
+	return (len == 0);
+}
+
+static int valid_cc(const void *bc, int len, int cc)
+{
+	while (len >= 0) {
+		const struct inet_diag_bc_op *op = bc;
+
+		if (cc > len)
+			return 0;
+		if (cc == len)
+			return 1;
+		if (op->yes < 4)
+			return 0;
+		len -= op->yes;
+		bc  += op->yes;
+	}
+	return 0;
+}
+
+static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
+{
+	const unsigned char *bc = bytecode;
+	int  len = bytecode_len;
+
+	while (len > 0) {
+		struct inet_diag_bc_op *op = (struct inet_diag_bc_op *)bc;
+
+//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len);
+		switch (op->code) {
+		case INET_DIAG_BC_AUTO:
+		case INET_DIAG_BC_S_COND:
+		case INET_DIAG_BC_D_COND:
+		case INET_DIAG_BC_S_GE:
+		case INET_DIAG_BC_S_LE:
+		case INET_DIAG_BC_D_GE:
+		case INET_DIAG_BC_D_LE:
+			if (op->yes < 4 || op->yes > len + 4)
+				return -EINVAL;
+		case INET_DIAG_BC_JMP:
+			if (op->no < 4 || op->no > len + 4)
+				return -EINVAL;
+			if (op->no < len &&
+			    !valid_cc(bytecode, bytecode_len, len - op->no))
+				return -EINVAL;
+			break;
+		case INET_DIAG_BC_NOP:
+			if (op->yes < 4 || op->yes > len + 4)
+				return -EINVAL;
+			break;
+		default:
+			return -EINVAL;
+		}
+		bc += op->yes;
+		len -= op->yes;
+	}
+	return len == 0 ? 0 : -EINVAL;
+}
+
+static int inet_diag_dump_sock(struct sk_buff *skb, struct sock *sk,
+			     struct netlink_callback *cb)
+{
+	struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
+
+	if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
+		struct inet_diag_entry entry;
+		struct rtattr *bc = (struct rtattr *)(r + 1);
+		struct inet_sock *inet = inet_sk(sk);
+
+		entry.family = sk->sk_family;
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+		if (entry.family == AF_INET6) {
+			struct ipv6_pinfo *np = inet6_sk(sk);
+
+			entry.saddr = np->rcv_saddr.s6_addr32;
+			entry.daddr = np->daddr.s6_addr32;
+		} else
+#endif
+		{
+			entry.saddr = &inet->rcv_saddr;
+			entry.daddr = &inet->daddr;
+		}
+		entry.sport = inet->num;
+		entry.dport = ntohs(inet->dport);
+		entry.userlocks = sk->sk_userlocks;
+
+		if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry))
+			return 0;
+	}
+
+	return inet_diag_fill(skb, sk, r->idiag_ext, NETLINK_CB(cb->skb).pid,
+			    cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
+}
+
+static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
+			    struct request_sock *req,
+			    u32 pid, u32 seq,
+			    const struct nlmsghdr *unlh)
+{
+	const struct inet_request_sock *ireq = inet_rsk(req);
+	struct inet_sock *inet = inet_sk(sk);
+	unsigned char *b = skb->tail;
+	struct inet_diag_msg *r;
+	struct nlmsghdr *nlh;
+	long tmo;
+
+	nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r));
+	nlh->nlmsg_flags = NLM_F_MULTI;
+	r = NLMSG_DATA(nlh);
+
+	r->idiag_family = sk->sk_family;
+	r->idiag_state = TCP_SYN_RECV;
+	r->idiag_timer = 1;
+	r->idiag_retrans = req->retrans;
+
+	r->id.idiag_if = sk->sk_bound_dev_if;
+	r->id.idiag_cookie[0] = (u32)(unsigned long)req;
+	r->id.idiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1);
+
+	tmo = req->expires - jiffies;
+	if (tmo < 0)
+		tmo = 0;
+
+	r->id.idiag_sport = inet->sport;
+	r->id.idiag_dport = ireq->rmt_port;
+	r->id.idiag_src[0] = ireq->loc_addr;
+	r->id.idiag_dst[0] = ireq->rmt_addr;
+	r->idiag_expires = jiffies_to_msecs(tmo);
+	r->idiag_rqueue = 0;
+	r->idiag_wqueue = 0;
+	r->idiag_uid = sock_i_uid(sk);
+	r->idiag_inode = 0;
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+	if (r->idiag_family == AF_INET6) {
+		ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
+			       &tcp6_rsk(req)->loc_addr);
+		ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
+			       &tcp6_rsk(req)->rmt_addr);
+	}
+#endif
+	nlh->nlmsg_len = skb->tail - b;
+
+	return skb->len;
+
+nlmsg_failure:
+	skb_trim(skb, b - skb->data);
+	return -1;
+}
+
+static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
+			     struct netlink_callback *cb)
+{
+	struct inet_diag_entry entry;
+	struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct listen_sock *lopt;
+	struct rtattr *bc = NULL;
+	struct inet_sock *inet = inet_sk(sk);
+	int j, s_j;
+	int reqnum, s_reqnum;
+	int err = 0;
+
+	s_j = cb->args[3];
+	s_reqnum = cb->args[4];
+
+	if (s_j > 0)
+		s_j--;
+
+	entry.family = sk->sk_family;
+
+	read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+
+	lopt = icsk->icsk_accept_queue.listen_opt;
+	if (!lopt || !lopt->qlen)
+		goto out;
+
+	if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
+		bc = (struct rtattr *)(r + 1);
+		entry.sport = inet->num;
+		entry.userlocks = sk->sk_userlocks;
+	}
+
+	for (j = s_j; j < lopt->nr_table_entries; j++) {
+		struct request_sock *req, *head = lopt->syn_table[j];
+
+		reqnum = 0;
+		for (req = head; req; reqnum++, req = req->dl_next) {
+			struct inet_request_sock *ireq = inet_rsk(req);
+
+			if (reqnum < s_reqnum)
+				continue;
+			if (r->id.idiag_dport != ireq->rmt_port &&
+			    r->id.idiag_dport)
+				continue;
+
+			if (bc) {
+				entry.saddr =
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+					(entry.family == AF_INET6) ?
+					tcp6_rsk(req)->loc_addr.s6_addr32 :
+#endif
+					&ireq->loc_addr;
+				entry.daddr = 
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+					(entry.family == AF_INET6) ?
+					tcp6_rsk(req)->rmt_addr.s6_addr32 :
+#endif
+					&ireq->rmt_addr;
+				entry.dport = ntohs(ireq->rmt_port);
+
+				if (!inet_diag_bc_run(RTA_DATA(bc),
+						    RTA_PAYLOAD(bc), &entry))
+					continue;
+			}
+
+			err = inet_diag_fill_req(skb, sk, req,
+					       NETLINK_CB(cb->skb).pid,
+					       cb->nlh->nlmsg_seq, cb->nlh);
+			if (err < 0) {
+				cb->args[3] = j + 1;
+				cb->args[4] = reqnum;
+				goto out;
+			}
+		}
+
+		s_reqnum = 0;
+	}
+
+out:
+	read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+
+	return err;
+}
+
+static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	int i, num;
+	int s_i, s_num;
+	struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
+	const struct inet_diag_handler *handler;
+	struct inet_hashinfo *hashinfo;
+
+	handler = inet_diag_table[cb->nlh->nlmsg_type];
+	BUG_ON(handler == NULL);
+	hashinfo = handler->idiag_hashinfo;
+		
+	s_i = cb->args[1];
+	s_num = num = cb->args[2];
+
+	if (cb->args[0] == 0) {
+		if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV)))
+			goto skip_listen_ht;
+
+		inet_listen_lock(hashinfo);
+		for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
+			struct sock *sk;
+			struct hlist_node *node;
+
+			num = 0;
+			sk_for_each(sk, node, &hashinfo->listening_hash[i]) {
+				struct inet_sock *inet = inet_sk(sk);
+
+				if (num < s_num) {
+					num++;
+					continue;
+				}
+
+				if (r->id.idiag_sport != inet->sport &&
+				    r->id.idiag_sport)
+					goto next_listen;
+
+				if (!(r->idiag_states & TCPF_LISTEN) ||
+				    r->id.idiag_dport ||
+				    cb->args[3] > 0)
+					goto syn_recv;
+
+				if (inet_diag_dump_sock(skb, sk, cb) < 0) {
+					inet_listen_unlock(hashinfo);
+					goto done;
+				}
+
+syn_recv:
+				if (!(r->idiag_states & TCPF_SYN_RECV))
+					goto next_listen;
+
+				if (inet_diag_dump_reqs(skb, sk, cb) < 0) {
+					inet_listen_unlock(hashinfo);
+					goto done;
+				}
+
+next_listen:
+				cb->args[3] = 0;
+				cb->args[4] = 0;
+				++num;
+			}
+
+			s_num = 0;
+			cb->args[3] = 0;
+			cb->args[4] = 0;
+		}
+		inet_listen_unlock(hashinfo);
+skip_listen_ht:
+		cb->args[0] = 1;
+		s_i = num = s_num = 0;
+	}
+
+	if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV)))
+		return skb->len;
+
+	for (i = s_i; i < hashinfo->ehash_size; i++) {
+		struct inet_ehash_bucket *head = &hashinfo->ehash[i];
+		struct sock *sk;
+		struct hlist_node *node;
+
+		if (i > s_i)
+			s_num = 0;
+
+		read_lock_bh(&head->lock);
+
+		num = 0;
+		sk_for_each(sk, node, &head->chain) {
+			struct inet_sock *inet = inet_sk(sk);
+
+			if (num < s_num)
+				goto next_normal;
+			if (!(r->idiag_states & (1 << sk->sk_state)))
+				goto next_normal;
+			if (r->id.idiag_sport != inet->sport &&
+			    r->id.idiag_sport)
+				goto next_normal;
+			if (r->id.idiag_dport != inet->dport && r->id.idiag_dport)
+				goto next_normal;
+			if (inet_diag_dump_sock(skb, sk, cb) < 0) {
+				read_unlock_bh(&head->lock);
+				goto done;
+			}
+next_normal:
+			++num;
+		}
+
+		if (r->idiag_states & TCPF_TIME_WAIT) {
+			sk_for_each(sk, node,
+				    &hashinfo->ehash[i + hashinfo->ehash_size].chain) {
+				struct inet_sock *inet = inet_sk(sk);
+
+				if (num < s_num)
+					goto next_dying;
+				if (r->id.idiag_sport != inet->sport &&
+				    r->id.idiag_sport)
+					goto next_dying;
+				if (r->id.idiag_dport != inet->dport &&
+				    r->id.idiag_dport)
+					goto next_dying;
+				if (inet_diag_dump_sock(skb, sk, cb) < 0) {
+					read_unlock_bh(&head->lock);
+					goto done;
+				}
+next_dying:
+				++num;
+			}
+		}
+		read_unlock_bh(&head->lock);
+	}
+
+done:
+	cb->args[1] = i;
+	cb->args[2] = num;
+	return skb->len;
+}
+
+static int inet_diag_dump_done(struct netlink_callback *cb)
+{
+	return 0;
+}
+
+
+static __inline__ int
+inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+	if (!(nlh->nlmsg_flags&NLM_F_REQUEST))
+		return 0;
+
+	if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX)
+		goto err_inval;
+
+	if (inet_diag_table[nlh->nlmsg_type] == NULL)
+		return -ENOENT;
+
+	if (NLMSG_LENGTH(sizeof(struct inet_diag_req)) > skb->len)
+		goto err_inval;
+
+	if (nlh->nlmsg_flags&NLM_F_DUMP) {
+		if (nlh->nlmsg_len >
+		    (4 + NLMSG_SPACE(sizeof(struct inet_diag_req)))) {
+			struct rtattr *rta = (void *)(NLMSG_DATA(nlh) +
+						 sizeof(struct inet_diag_req));
+			if (rta->rta_type != INET_DIAG_REQ_BYTECODE ||
+			    rta->rta_len < 8 ||
+			    rta->rta_len >
+			    (nlh->nlmsg_len -
+			     NLMSG_SPACE(sizeof(struct inet_diag_req))))
+				goto err_inval;
+			if (inet_diag_bc_audit(RTA_DATA(rta), RTA_PAYLOAD(rta)))
+				goto err_inval;
+		}
+		return netlink_dump_start(idiagnl, skb, nlh,
+					  inet_diag_dump,
+					  inet_diag_dump_done);
+	} else {
+		return inet_diag_get_exact(skb, nlh);
+	}
+
+err_inval:
+	return -EINVAL;
+}
+
+
+static inline void inet_diag_rcv_skb(struct sk_buff *skb)
+{
+	int err;
+	struct nlmsghdr * nlh;
+
+	if (skb->len >= NLMSG_SPACE(0)) {
+		nlh = (struct nlmsghdr *)skb->data;
+		if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
+			return;
+		err = inet_diag_rcv_msg(skb, nlh);
+		if (err || nlh->nlmsg_flags & NLM_F_ACK) 
+			netlink_ack(skb, nlh, err);
+	}
+}
+
+static void inet_diag_rcv(struct sock *sk, int len)
+{
+	struct sk_buff *skb;
+	unsigned int qlen = skb_queue_len(&sk->sk_receive_queue);
+
+	while (qlen-- && (skb = skb_dequeue(&sk->sk_receive_queue))) {
+		inet_diag_rcv_skb(skb);
+		kfree_skb(skb);
+	}
+}
+
+static DEFINE_SPINLOCK(inet_diag_register_lock);
+
+int inet_diag_register(const struct inet_diag_handler *h)
+{
+	const __u16 type = h->idiag_type;
+	int err = -EINVAL;
+
+	if (type >= INET_DIAG_GETSOCK_MAX)
+		goto out;
+
+	spin_lock(&inet_diag_register_lock);
+	err = -EEXIST;
+	if (inet_diag_table[type] == NULL) {
+		inet_diag_table[type] = h;
+		err = 0;
+	}
+	spin_unlock(&inet_diag_register_lock);
+out:
+	return err;
+}
+EXPORT_SYMBOL_GPL(inet_diag_register);
+
+void inet_diag_unregister(const struct inet_diag_handler *h)
+{
+	const __u16 type = h->idiag_type;
+
+	if (type >= INET_DIAG_GETSOCK_MAX)
+		return;
+
+	spin_lock(&inet_diag_register_lock);
+	inet_diag_table[type] = NULL;
+	spin_unlock(&inet_diag_register_lock);
+
+	synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(inet_diag_unregister);
+
+static int __init inet_diag_init(void)
+{
+	const int inet_diag_table_size = (INET_DIAG_GETSOCK_MAX *
+					  sizeof(struct inet_diag_handler *));
+	int err = -ENOMEM;
+
+	inet_diag_table = kmalloc(inet_diag_table_size, GFP_KERNEL);
+	if (!inet_diag_table)
+		goto out;
+
+	memset(inet_diag_table, 0, inet_diag_table_size);
+	idiagnl = netlink_kernel_create(NETLINK_INET_DIAG, 0, inet_diag_rcv,
+					THIS_MODULE);
+	if (idiagnl == NULL)
+		goto out_free_table;
+	err = 0;
+out:
+	return err;
+out_free_table:
+	kfree(inet_diag_table);
+	goto out;
+}
+
+static void __exit inet_diag_exit(void)
+{
+	sock_release(idiagnl->sk_socket);
+	kfree(inet_diag_table);
+}
+
+module_init(inet_diag_init);
+module_exit(inet_diag_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
new file mode 100644
index 00000000000..e8d29fe736d
--- /dev/null
+++ b/net/ipv4/inet_hashtables.c
@@ -0,0 +1,165 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Generic INET transport hashtables
+ *
+ * Authors:	Lotsa people, from code originally in tcp
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+
+#include <net/inet_connection_sock.h>
+#include <net/inet_hashtables.h>
+
+/*
+ * Allocate and initialize a new local port bind bucket.
+ * The bindhash mutex for snum's hash chain must be held here.
+ */
+struct inet_bind_bucket *inet_bind_bucket_create(kmem_cache_t *cachep,
+						 struct inet_bind_hashbucket *head,
+						 const unsigned short snum)
+{
+	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, SLAB_ATOMIC);
+
+	if (tb != NULL) {
+		tb->port      = snum;
+		tb->fastreuse = 0;
+		INIT_HLIST_HEAD(&tb->owners);
+		hlist_add_head(&tb->node, &head->chain);
+	}
+	return tb;
+}
+
+EXPORT_SYMBOL(inet_bind_bucket_create);
+
+/*
+ * Caller must hold hashbucket lock for this tb with local BH disabled
+ */
+void inet_bind_bucket_destroy(kmem_cache_t *cachep, struct inet_bind_bucket *tb)
+{
+	if (hlist_empty(&tb->owners)) {
+		__hlist_del(&tb->node);
+		kmem_cache_free(cachep, tb);
+	}
+}
+
+void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
+		    const unsigned short snum)
+{
+	inet_sk(sk)->num = snum;
+	sk_add_bind_node(sk, &tb->owners);
+	inet_csk(sk)->icsk_bind_hash = tb;
+}
+
+EXPORT_SYMBOL(inet_bind_hash);
+
+/*
+ * Get rid of any references to a local port held by the given sock.
+ */
+static void __inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk)
+{
+	const int bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size);
+	struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
+	struct inet_bind_bucket *tb;
+
+	spin_lock(&head->lock);
+	tb = inet_csk(sk)->icsk_bind_hash;
+	__sk_del_bind_node(sk);
+	inet_csk(sk)->icsk_bind_hash = NULL;
+	inet_sk(sk)->num = 0;
+	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+	spin_unlock(&head->lock);
+}
+
+void inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk)
+{
+	local_bh_disable();
+	__inet_put_port(hashinfo, sk);
+	local_bh_enable();
+}
+
+EXPORT_SYMBOL(inet_put_port);
+
+/*
+ * This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
+ * Look, when several writers sleep and reader wakes them up, all but one
+ * immediately hit write lock and grab all the cpus. Exclusive sleep solves
+ * this, _but_ remember, it adds useless work on UP machines (wake up each
+ * exclusive lock release). It should be ifdefed really.
+ */
+void inet_listen_wlock(struct inet_hashinfo *hashinfo)
+{
+	write_lock(&hashinfo->lhash_lock);
+
+	if (atomic_read(&hashinfo->lhash_users)) {
+		DEFINE_WAIT(wait);
+
+		for (;;) {
+			prepare_to_wait_exclusive(&hashinfo->lhash_wait,
+						  &wait, TASK_UNINTERRUPTIBLE);
+			if (!atomic_read(&hashinfo->lhash_users))
+				break;
+			write_unlock_bh(&hashinfo->lhash_lock);
+			schedule();
+			write_lock_bh(&hashinfo->lhash_lock);
+		}
+
+		finish_wait(&hashinfo->lhash_wait, &wait);
+	}
+}
+
+EXPORT_SYMBOL(inet_listen_wlock);
+
+/*
+ * Don't inline this cruft. Here are some nice properties to exploit here. The
+ * BSD API does not allow a listening sock to specify the remote port nor the
+ * remote address for the connection. So always assume those are both
+ * wildcarded during the search since they can never be otherwise.
+ */
+struct sock *__inet_lookup_listener(const struct hlist_head *head, const u32 daddr,
+				    const unsigned short hnum, const int dif)
+{
+	struct sock *result = NULL, *sk;
+	const struct hlist_node *node;
+	int hiscore = -1;
+
+	sk_for_each(sk, node, head) {
+		const struct inet_sock *inet = inet_sk(sk);
+
+		if (inet->num == hnum && !ipv6_only_sock(sk)) {
+			const __u32 rcv_saddr = inet->rcv_saddr;
+			int score = sk->sk_family == PF_INET ? 1 : 0;
+
+			if (rcv_saddr) {
+				if (rcv_saddr != daddr)
+					continue;
+				score += 2;
+			}
+			if (sk->sk_bound_dev_if) {
+				if (sk->sk_bound_dev_if != dif)
+					continue;
+				score += 2;
+			}
+			if (score == 5)
+				return sk;
+			if (score > hiscore) {
+				hiscore	= score;
+				result	= sk;
+			}
+		}
+	}
+	return result;
+}
+
+EXPORT_SYMBOL_GPL(__inet_lookup_listener);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
new file mode 100644
index 00000000000..4d1502a4985
--- /dev/null
+++ b/net/ipv4/inet_timewait_sock.c
@@ -0,0 +1,384 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Generic TIME_WAIT sockets functions
+ *
+ *		From code orinally in TCP
+ */
+
+#include <linux/config.h>
+
+#include <net/inet_hashtables.h>
+#include <net/inet_timewait_sock.h>
+#include <net/ip.h>
+
+/* Must be called with locally disabled BHs. */
+void __inet_twsk_kill(struct inet_timewait_sock *tw, struct inet_hashinfo *hashinfo)
+{
+	struct inet_bind_hashbucket *bhead;
+	struct inet_bind_bucket *tb;
+	/* Unlink from established hashes. */
+	struct inet_ehash_bucket *ehead = &hashinfo->ehash[tw->tw_hashent];
+
+	write_lock(&ehead->lock);
+	if (hlist_unhashed(&tw->tw_node)) {
+		write_unlock(&ehead->lock);
+		return;
+	}
+	__hlist_del(&tw->tw_node);
+	sk_node_init(&tw->tw_node);
+	write_unlock(&ehead->lock);
+
+	/* Disassociate with bind bucket. */
+	bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)];
+	spin_lock(&bhead->lock);
+	tb = tw->tw_tb;
+	__hlist_del(&tw->tw_bind_node);
+	tw->tw_tb = NULL;
+	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+	spin_unlock(&bhead->lock);
+#ifdef SOCK_REFCNT_DEBUG
+	if (atomic_read(&tw->tw_refcnt) != 1) {
+		printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n",
+		       tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt));
+	}
+#endif
+	inet_twsk_put(tw);
+}
+
+EXPORT_SYMBOL_GPL(__inet_twsk_kill);
+
+/*
+ * Enter the time wait state. This is called with locally disabled BH.
+ * Essentially we whip up a timewait bucket, copy the relevant info into it
+ * from the SK, and mess with hash chains and list linkage.
+ */
+void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
+			   struct inet_hashinfo *hashinfo)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct inet_ehash_bucket *ehead = &hashinfo->ehash[sk->sk_hashent];
+	struct inet_bind_hashbucket *bhead;
+	/* Step 1: Put TW into bind hash. Original socket stays there too.
+	   Note, that any socket with inet->num != 0 MUST be bound in
+	   binding cache, even if it is closed.
+	 */
+	bhead = &hashinfo->bhash[inet_bhashfn(inet->num, hashinfo->bhash_size)];
+	spin_lock(&bhead->lock);
+	tw->tw_tb = icsk->icsk_bind_hash;
+	BUG_TRAP(icsk->icsk_bind_hash);
+	inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
+	spin_unlock(&bhead->lock);
+
+	write_lock(&ehead->lock);
+
+	/* Step 2: Remove SK from established hash. */
+	if (__sk_del_node_init(sk))
+		sock_prot_dec_use(sk->sk_prot);
+
+	/* Step 3: Hash TW into TIMEWAIT half of established hash table. */
+	inet_twsk_add_node(tw, &(ehead + hashinfo->ehash_size)->chain);
+	atomic_inc(&tw->tw_refcnt);
+
+	write_unlock(&ehead->lock);
+}
+
+EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
+
+struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state)
+{
+	struct inet_timewait_sock *tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_slab,
+							 SLAB_ATOMIC);
+	if (tw != NULL) {
+		const struct inet_sock *inet = inet_sk(sk);
+
+		/* Give us an identity. */
+		tw->tw_daddr	    = inet->daddr;
+		tw->tw_rcv_saddr    = inet->rcv_saddr;
+		tw->tw_bound_dev_if = sk->sk_bound_dev_if;
+		tw->tw_num	    = inet->num;
+		tw->tw_state	    = TCP_TIME_WAIT;
+		tw->tw_substate	    = state;
+		tw->tw_sport	    = inet->sport;
+		tw->tw_dport	    = inet->dport;
+		tw->tw_family	    = sk->sk_family;
+		tw->tw_reuse	    = sk->sk_reuse;
+		tw->tw_hashent	    = sk->sk_hashent;
+		tw->tw_ipv6only	    = 0;
+		tw->tw_prot	    = sk->sk_prot_creator;
+		atomic_set(&tw->tw_refcnt, 1);
+		inet_twsk_dead_node_init(tw);
+	}
+
+	return tw;
+}
+
+EXPORT_SYMBOL_GPL(inet_twsk_alloc);
+
+/* Returns non-zero if quota exceeded.  */
+static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr,
+				    const int slot)
+{
+	struct inet_timewait_sock *tw;
+	struct hlist_node *node;
+	unsigned int killed;
+	int ret;
+
+	/* NOTE: compare this to previous version where lock
+	 * was released after detaching chain. It was racy,
+	 * because tw buckets are scheduled in not serialized context
+	 * in 2.3 (with netfilter), and with softnet it is common, because
+	 * soft irqs are not sequenced.
+	 */
+	killed = 0;
+	ret = 0;
+rescan:
+	inet_twsk_for_each_inmate(tw, node, &twdr->cells[slot]) {
+		__inet_twsk_del_dead_node(tw);
+		spin_unlock(&twdr->death_lock);
+		__inet_twsk_kill(tw, twdr->hashinfo);
+		inet_twsk_put(tw);
+		killed++;
+		spin_lock(&twdr->death_lock);
+		if (killed > INET_TWDR_TWKILL_QUOTA) {
+			ret = 1;
+			break;
+		}
+
+		/* While we dropped twdr->death_lock, another cpu may have
+		 * killed off the next TW bucket in the list, therefore
+		 * do a fresh re-read of the hlist head node with the
+		 * lock reacquired.  We still use the hlist traversal
+		 * macro in order to get the prefetches.
+		 */
+		goto rescan;
+	}
+
+	twdr->tw_count -= killed;
+	NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed);
+
+	return ret;
+}
+
+void inet_twdr_hangman(unsigned long data)
+{
+	struct inet_timewait_death_row *twdr;
+	int unsigned need_timer;
+
+	twdr = (struct inet_timewait_death_row *)data;
+	spin_lock(&twdr->death_lock);
+
+	if (twdr->tw_count == 0)
+		goto out;
+
+	need_timer = 0;
+	if (inet_twdr_do_twkill_work(twdr, twdr->slot)) {
+		twdr->thread_slots |= (1 << twdr->slot);
+		mb();
+		schedule_work(&twdr->twkill_work);
+		need_timer = 1;
+	} else {
+		/* We purged the entire slot, anything left?  */
+		if (twdr->tw_count)
+			need_timer = 1;
+	}
+	twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1));
+	if (need_timer)
+		mod_timer(&twdr->tw_timer, jiffies + twdr->period);
+out:
+	spin_unlock(&twdr->death_lock);
+}
+
+EXPORT_SYMBOL_GPL(inet_twdr_hangman);
+
+extern void twkill_slots_invalid(void);
+
+void inet_twdr_twkill_work(void *data)
+{
+	struct inet_timewait_death_row *twdr = data;
+	int i;
+
+	if ((INET_TWDR_TWKILL_SLOTS - 1) > (sizeof(twdr->thread_slots) * 8))
+		twkill_slots_invalid();
+
+	while (twdr->thread_slots) {
+		spin_lock_bh(&twdr->death_lock);
+		for (i = 0; i < INET_TWDR_TWKILL_SLOTS; i++) {
+			if (!(twdr->thread_slots & (1 << i)))
+				continue;
+
+			while (inet_twdr_do_twkill_work(twdr, i) != 0) {
+				if (need_resched()) {
+					spin_unlock_bh(&twdr->death_lock);
+					schedule();
+					spin_lock_bh(&twdr->death_lock);
+				}
+			}
+
+			twdr->thread_slots &= ~(1 << i);
+		}
+		spin_unlock_bh(&twdr->death_lock);
+	}
+}
+
+EXPORT_SYMBOL_GPL(inet_twdr_twkill_work);
+
+/* These are always called from BH context.  See callers in
+ * tcp_input.c to verify this.
+ */
+
+/* This is for handling early-kills of TIME_WAIT sockets. */
+void inet_twsk_deschedule(struct inet_timewait_sock *tw,
+			  struct inet_timewait_death_row *twdr)
+{
+	spin_lock(&twdr->death_lock);
+	if (inet_twsk_del_dead_node(tw)) {
+		inet_twsk_put(tw);
+		if (--twdr->tw_count == 0)
+			del_timer(&twdr->tw_timer);
+	}
+	spin_unlock(&twdr->death_lock);
+	__inet_twsk_kill(tw, twdr->hashinfo);
+}
+
+EXPORT_SYMBOL(inet_twsk_deschedule);
+
+void inet_twsk_schedule(struct inet_timewait_sock *tw,
+		       struct inet_timewait_death_row *twdr,
+		       const int timeo, const int timewait_len)
+{
+	struct hlist_head *list;
+	int slot;
+
+	/* timeout := RTO * 3.5
+	 *
+	 * 3.5 = 1+2+0.5 to wait for two retransmits.
+	 *
+	 * RATIONALE: if FIN arrived and we entered TIME-WAIT state,
+	 * our ACK acking that FIN can be lost. If N subsequent retransmitted
+	 * FINs (or previous seqments) are lost (probability of such event
+	 * is p^(N+1), where p is probability to lose single packet and
+	 * time to detect the loss is about RTO*(2^N - 1) with exponential
+	 * backoff). Normal timewait length is calculated so, that we
+	 * waited at least for one retransmitted FIN (maximal RTO is 120sec).
+	 * [ BTW Linux. following BSD, violates this requirement waiting
+	 *   only for 60sec, we should wait at least for 240 secs.
+	 *   Well, 240 consumes too much of resources 8)
+	 * ]
+	 * This interval is not reduced to catch old duplicate and
+	 * responces to our wandering segments living for two MSLs.
+	 * However, if we use PAWS to detect
+	 * old duplicates, we can reduce the interval to bounds required
+	 * by RTO, rather than MSL. So, if peer understands PAWS, we
+	 * kill tw bucket after 3.5*RTO (it is important that this number
+	 * is greater than TS tick!) and detect old duplicates with help
+	 * of PAWS.
+	 */
+	slot = (timeo + (1 << INET_TWDR_RECYCLE_TICK) - 1) >> INET_TWDR_RECYCLE_TICK;
+
+	spin_lock(&twdr->death_lock);
+
+	/* Unlink it, if it was scheduled */
+	if (inet_twsk_del_dead_node(tw))
+		twdr->tw_count--;
+	else
+		atomic_inc(&tw->tw_refcnt);
+
+	if (slot >= INET_TWDR_RECYCLE_SLOTS) {
+		/* Schedule to slow timer */
+		if (timeo >= timewait_len) {
+			slot = INET_TWDR_TWKILL_SLOTS - 1;
+		} else {
+			slot = (timeo + twdr->period - 1) / twdr->period;
+			if (slot >= INET_TWDR_TWKILL_SLOTS)
+				slot = INET_TWDR_TWKILL_SLOTS - 1;
+		}
+		tw->tw_ttd = jiffies + timeo;
+		slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1);
+		list = &twdr->cells[slot];
+	} else {
+		tw->tw_ttd = jiffies + (slot << INET_TWDR_RECYCLE_TICK);
+
+		if (twdr->twcal_hand < 0) {
+			twdr->twcal_hand = 0;
+			twdr->twcal_jiffie = jiffies;
+			twdr->twcal_timer.expires = twdr->twcal_jiffie +
+					      (slot << INET_TWDR_RECYCLE_TICK);
+			add_timer(&twdr->twcal_timer);
+		} else {
+			if (time_after(twdr->twcal_timer.expires,
+				       jiffies + (slot << INET_TWDR_RECYCLE_TICK)))
+				mod_timer(&twdr->twcal_timer,
+					  jiffies + (slot << INET_TWDR_RECYCLE_TICK));
+			slot = (twdr->twcal_hand + slot) & (INET_TWDR_RECYCLE_SLOTS - 1);
+		}
+		list = &twdr->twcal_row[slot];
+	}
+
+	hlist_add_head(&tw->tw_death_node, list);
+
+	if (twdr->tw_count++ == 0)
+		mod_timer(&twdr->tw_timer, jiffies + twdr->period);
+	spin_unlock(&twdr->death_lock);
+}
+
+EXPORT_SYMBOL_GPL(inet_twsk_schedule);
+
+void inet_twdr_twcal_tick(unsigned long data)
+{
+	struct inet_timewait_death_row *twdr;
+	int n, slot;
+	unsigned long j;
+	unsigned long now = jiffies;
+	int killed = 0;
+	int adv = 0;
+
+	twdr = (struct inet_timewait_death_row *)data;
+
+	spin_lock(&twdr->death_lock);
+	if (twdr->twcal_hand < 0)
+		goto out;
+
+	slot = twdr->twcal_hand;
+	j = twdr->twcal_jiffie;
+
+	for (n = 0; n < INET_TWDR_RECYCLE_SLOTS; n++) {
+		if (time_before_eq(j, now)) {
+			struct hlist_node *node, *safe;
+			struct inet_timewait_sock *tw;
+
+			inet_twsk_for_each_inmate_safe(tw, node, safe,
+						       &twdr->twcal_row[slot]) {
+				__inet_twsk_del_dead_node(tw);
+				__inet_twsk_kill(tw, twdr->hashinfo);
+				inet_twsk_put(tw);
+				killed++;
+			}
+		} else {
+			if (!adv) {
+				adv = 1;
+				twdr->twcal_jiffie = j;
+				twdr->twcal_hand = slot;
+			}
+
+			if (!hlist_empty(&twdr->twcal_row[slot])) {
+				mod_timer(&twdr->twcal_timer, j);
+				goto out;
+			}
+		}
+		j += 1 << INET_TWDR_RECYCLE_TICK;
+		slot = (slot + 1) & (INET_TWDR_RECYCLE_SLOTS - 1);
+	}
+	twdr->twcal_hand = -1;
+
+out:
+	if ((twdr->tw_count -= killed) == 0)
+		del_timer(&twdr->tw_timer);
+	NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed);
+	spin_unlock(&twdr->death_lock);
+}
+
+EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick);
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index ab18a853d7c..f84ba9c9655 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -20,6 +20,7 @@
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/net.h>
+#include <net/ip.h>
 #include <net/inetpeer.h>
 
 /*
@@ -72,7 +73,7 @@
 /* Exported for inet_getid inline function.  */
 DEFINE_SPINLOCK(inet_peer_idlock);
 
-static kmem_cache_t *peer_cachep;
+static kmem_cache_t *peer_cachep __read_mostly;
 
 #define node_height(x) x->avl_height
 static struct inet_peer peer_fake_node = {
@@ -459,5 +460,3 @@ static void peer_check_expire(unsigned long dummy)
 				peer_total / inet_peer_threshold * HZ;
 	add_timer(&peer_periodic_timer);
 }
-
-EXPORT_SYMBOL(inet_peer_idlock);
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 77094aac6c2..0923add122b 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -76,16 +76,12 @@ int ip_forward(struct sk_buff *skb)
 	 *	that reaches zero, we must reply an ICMP control message telling
 	 *	that the packet's lifetime expired.
 	 */
-
-	iph = skb->nh.iph;
-
-	if (iph->ttl <= 1)
+	if (skb->nh.iph->ttl <= 1)
                 goto too_many_hops;
 
 	if (!xfrm4_route_forward(skb))
 		goto drop;
 
-	iph = skb->nh.iph;
 	rt = (struct rtable*)skb->dst;
 
 	if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index eb377ae1530..9e6e683cc34 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -377,7 +377,7 @@ static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user)
 	return ip_frag_intern(hash, qp);
 
 out_nomem:
-	LIMIT_NETDEBUG(printk(KERN_ERR "ip_frag_create: no memory left !\n"));
+	LIMIT_NETDEBUG(KERN_ERR "ip_frag_create: no memory left !\n");
 	return NULL;
 }
 
@@ -533,7 +533,7 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
  	if (skb->dev)
  		qp->iif = skb->dev->ifindex;
 	skb->dev = NULL;
-	qp->stamp = skb->stamp;
+	skb_get_timestamp(skb, &qp->stamp);
 	qp->meat += skb->len;
 	atomic_add(skb->truesize, &ip_frag_mem);
 	if (offset == 0)
@@ -615,7 +615,7 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev)
 
 	head->next = NULL;
 	head->dev = dev;
-	head->stamp = qp->stamp;
+	skb_set_timestamp(head, &qp->stamp);
 
 	iph = head->nh.iph;
 	iph->frag_off = 0;
@@ -625,8 +625,8 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev)
 	return head;
 
 out_nomem:
- 	LIMIT_NETDEBUG(printk(KERN_ERR "IP: queue_glue: no memory for gluing "
-			      "queue %p\n", qp));
+ 	LIMIT_NETDEBUG(KERN_ERR "IP: queue_glue: no memory for gluing "
+			      "queue %p\n", qp);
 	goto out_fail;
 out_oversize:
 	if (net_ratelimit())
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index c703528e0bc..473d0f2b2e0 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -150,7 +150,7 @@
  *	SNMP management statistics
  */
 
-DEFINE_SNMP_STAT(struct ipstats_mib, ip_statistics);
+DEFINE_SNMP_STAT(struct ipstats_mib, ip_statistics) __read_mostly;
 
 /*
  *	Process Router Attention IP option
@@ -225,8 +225,8 @@ static inline int ip_local_deliver_finish(struct sk_buff *skb)
 		/* If there maybe a raw socket we must check - if not we
 		 * don't care less
 		 */
-		if (raw_sk)
-			raw_v4_input(skb, skb->nh.iph, hash);
+		if (raw_sk && !raw_v4_input(skb, skb->nh.iph, hash))
+			raw_sk = NULL;
 
 		if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) {
 			int ret;
@@ -279,18 +279,70 @@ int ip_local_deliver(struct sk_buff *skb)
 		       ip_local_deliver_finish);
 }
 
-static inline int ip_rcv_finish(struct sk_buff *skb)
+static inline int ip_rcv_options(struct sk_buff *skb)
 {
+	struct ip_options *opt;
+	struct iphdr *iph;
 	struct net_device *dev = skb->dev;
+
+	/* It looks as overkill, because not all
+	   IP options require packet mangling.
+	   But it is the easiest for now, especially taking
+	   into account that combination of IP options
+	   and running sniffer is extremely rare condition.
+					      --ANK (980813)
+	*/
+	if (skb_cow(skb, skb_headroom(skb))) {
+		IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
+		goto drop;
+	}
+
+	iph = skb->nh.iph;
+
+	if (ip_options_compile(NULL, skb)) {
+		IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
+		goto drop;
+	}
+
+	opt = &(IPCB(skb)->opt);
+	if (unlikely(opt->srr)) {
+		struct in_device *in_dev = in_dev_get(dev);
+		if (in_dev) {
+			if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
+				if (IN_DEV_LOG_MARTIANS(in_dev) &&
+				    net_ratelimit())
+					printk(KERN_INFO "source route option "
+					       "%u.%u.%u.%u -> %u.%u.%u.%u\n",
+					       NIPQUAD(iph->saddr),
+					       NIPQUAD(iph->daddr));
+				in_dev_put(in_dev);
+				goto drop;
+			}
+
+			in_dev_put(in_dev);
+		}
+
+		if (ip_options_rcv_srr(skb))
+			goto drop;
+	}
+
+	return 0;
+drop:
+	return -1;
+}
+
+static inline int ip_rcv_finish(struct sk_buff *skb)
+{
 	struct iphdr *iph = skb->nh.iph;
-	int err;
 
 	/*
 	 *	Initialise the virtual path cache for the packet. It describes
 	 *	how the packet travels inside Linux networking.
 	 */ 
-	if (skb->dst == NULL) {
-		if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) {
+	if (likely(skb->dst == NULL)) {
+		int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
+					 skb->dev);
+		if (unlikely(err)) {
 			if (err == -EHOSTUNREACH)
 				IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
 			goto drop; 
@@ -298,7 +350,7 @@ static inline int ip_rcv_finish(struct sk_buff *skb)
 	}
 
 #ifdef CONFIG_NET_CLS_ROUTE
-	if (skb->dst->tclassid) {
+	if (unlikely(skb->dst->tclassid)) {
 		struct ip_rt_acct *st = ip_rt_acct + 256*smp_processor_id();
 		u32 idx = skb->dst->tclassid;
 		st[idx&0xFF].o_packets++;
@@ -308,48 +360,11 @@ static inline int ip_rcv_finish(struct sk_buff *skb)
 	}
 #endif
 
-	if (iph->ihl > 5) {
-		struct ip_options *opt;
-
-		/* It looks as overkill, because not all
-		   IP options require packet mangling.
-		   But it is the easiest for now, especially taking
-		   into account that combination of IP options
-		   and running sniffer is extremely rare condition.
-		                                      --ANK (980813)
-		*/
-
-		if (skb_cow(skb, skb_headroom(skb))) {
-			IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
-			goto drop;
-		}
-		iph = skb->nh.iph;
-
-		if (ip_options_compile(NULL, skb))
-			goto inhdr_error;
-
-		opt = &(IPCB(skb)->opt);
-		if (opt->srr) {
-			struct in_device *in_dev = in_dev_get(dev);
-			if (in_dev) {
-				if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
-					if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
-						printk(KERN_INFO "source route option %u.%u.%u.%u -> %u.%u.%u.%u\n",
-						       NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
-					in_dev_put(in_dev);
-					goto drop;
-				}
-				in_dev_put(in_dev);
-			}
-			if (ip_options_rcv_srr(skb))
-				goto drop;
-		}
-	}
+	if (iph->ihl > 5 && ip_rcv_options(skb))
+		goto drop;
 
 	return dst_input(skb);
 
-inhdr_error:
-	IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
 drop:
         kfree_skb(skb);
         return NET_RX_DROP;
@@ -358,9 +373,10 @@ drop:
 /*
  * 	Main IP Receive routine.
  */ 
-int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
 {
 	struct iphdr *iph;
+	u32 len;
 
 	/* When the interface is in promisc. mode, drop all the crap
 	 * that it receives, do not try to analyse it.
@@ -392,29 +408,27 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
 	 */
 
 	if (iph->ihl < 5 || iph->version != 4)
-		goto inhdr_error; 
+		goto inhdr_error;
 
 	if (!pskb_may_pull(skb, iph->ihl*4))
 		goto inhdr_error;
 
 	iph = skb->nh.iph;
 
-	if (ip_fast_csum((u8 *)iph, iph->ihl) != 0)
-		goto inhdr_error; 
+	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
+		goto inhdr_error;
 
-	{
-		__u32 len = ntohs(iph->tot_len); 
-		if (skb->len < len || len < (iph->ihl<<2))
-			goto inhdr_error;
+	len = ntohs(iph->tot_len);
+	if (skb->len < len || len < (iph->ihl*4))
+		goto inhdr_error;
 
-		/* Our transport medium may have padded the buffer out. Now we know it
-		 * is IP we can trim to the true length of the frame.
-		 * Note this now means skb->len holds ntohs(iph->tot_len).
-		 */
-		if (pskb_trim_rcsum(skb, len)) {
-			IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
-			goto drop;
-		}
+	/* Our transport medium may have padded the buffer out. Now we know it
+	 * is IP we can trim to the true length of the frame.
+	 * Note this now means skb->len holds ntohs(iph->tot_len).
+	 */
+	if (pskb_trim_rcsum(skb, len)) {
+		IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
+		goto drop;
 	}
 
 	return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL,
@@ -428,5 +442,4 @@ out:
         return NET_RX_DROP;
 }
 
-EXPORT_SYMBOL(ip_rcv);
 EXPORT_SYMBOL(ip_statistics);
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 6d89f3f3e70..bce4e875193 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -489,23 +489,18 @@ void ip_options_undo(struct ip_options * opt)
 	}
 }
 
-int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen, int user)
+static struct ip_options *ip_options_get_alloc(const int optlen)
 {
-	struct ip_options *opt;
+	struct ip_options *opt = kmalloc(sizeof(*opt) + ((optlen + 3) & ~3),
+					 GFP_KERNEL);
+	if (opt)
+		memset(opt, 0, sizeof(*opt));
+	return opt;
+}
 
-	opt = kmalloc(sizeof(struct ip_options)+((optlen+3)&~3), GFP_KERNEL);
-	if (!opt)
-		return -ENOMEM;
-	memset(opt, 0, sizeof(struct ip_options));
-	if (optlen) {
-		if (user) {
-			if (copy_from_user(opt->__data, data, optlen)) {
-				kfree(opt);
-				return -EFAULT;
-			}
-		} else
-			memcpy(opt->__data, data, optlen);
-	}
+static int ip_options_get_finish(struct ip_options **optp,
+				 struct ip_options *opt, int optlen)
+{
 	while (optlen & 3)
 		opt->__data[optlen++] = IPOPT_END;
 	opt->optlen = optlen;
@@ -521,6 +516,30 @@ int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen, in
 	return 0;
 }
 
+int ip_options_get_from_user(struct ip_options **optp, unsigned char __user *data, int optlen)
+{
+	struct ip_options *opt = ip_options_get_alloc(optlen);
+
+	if (!opt)
+		return -ENOMEM;
+	if (optlen && copy_from_user(opt->__data, data, optlen)) {
+		kfree(opt);
+		return -EFAULT;
+	}
+	return ip_options_get_finish(optp, opt, optlen);
+}
+
+int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen)
+{
+	struct ip_options *opt = ip_options_get_alloc(optlen);
+
+	if (!opt)
+		return -ENOMEM;
+	if (optlen)
+		memcpy(opt->__data, data, optlen);
+	return ip_options_get_finish(optp, opt, optlen);
+}
+
 void ip_forward_options(struct sk_buff *skb)
 {
 	struct   ip_options * opt	= &(IPCB(skb)->opt);
@@ -620,6 +639,3 @@ int ip_options_rcv_srr(struct sk_buff *skb)
 	}
 	return 0;
 }
-
-EXPORT_SYMBOL(ip_options_compile);
-EXPORT_SYMBOL(ip_options_undo);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 80d13103b2b..3f1a263e124 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -69,13 +69,10 @@
 #include <net/ip.h>
 #include <net/protocol.h>
 #include <net/route.h>
-#include <net/tcp.h>
-#include <net/udp.h>
 #include <linux/skbuff.h>
 #include <net/sock.h>
 #include <net/arp.h>
 #include <net/icmp.h>
-#include <net/raw.h>
 #include <net/checksum.h>
 #include <net/inetpeer.h>
 #include <net/checksum.h>
@@ -84,12 +81,8 @@
 #include <linux/netfilter_bridge.h>
 #include <linux/mroute.h>
 #include <linux/netlink.h>
+#include <linux/tcp.h>
 
-/*
- *      Shall we try to damage output packets if routing dev changes?
- */
-
-int sysctl_ip_dynaddr;
 int sysctl_ip_default_ttl = IPDEFTTL;
 
 /* Generate a checksum for an outgoing IP datagram. */
@@ -165,6 +158,8 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
 		       dst_output);
 }
 
+EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
+
 static inline int ip_finish_output2(struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb->dst;
@@ -205,7 +200,7 @@ static inline int ip_finish_output2(struct sk_buff *skb)
 	return -EINVAL;
 }
 
-int ip_finish_output(struct sk_buff *skb)
+static inline int ip_finish_output(struct sk_buff *skb)
 {
 	struct net_device *dev = skb->dst->dev;
 
@@ -329,8 +324,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
 			if (ip_route_output_flow(&rt, &fl, sk, 0))
 				goto no_route;
 		}
-		__sk_dst_set(sk, &rt->u.dst);
-		tcp_v4_setup_caps(sk, &rt->u.dst);
+		sk_setup_caps(sk, &rt->u.dst);
 	}
 	skb->dst = dst_clone(&rt->u.dst);
 
@@ -392,7 +386,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 #endif
 #ifdef CONFIG_NETFILTER
 	to->nfmark = from->nfmark;
-	to->nfcache = from->nfcache;
 	/* Connection association is same as pre-frag packet */
 	nf_conntrack_put(to->nfct);
 	to->nfct = from->nfct;
@@ -580,7 +573,7 @@ slow_path:
 		 */
 
 		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
-			NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
+			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
 			err = -ENOMEM;
 			goto fail;
 		}
@@ -1329,12 +1322,7 @@ void __init ip_init(void)
 #endif
 }
 
-EXPORT_SYMBOL(ip_finish_output);
 EXPORT_SYMBOL(ip_fragment);
 EXPORT_SYMBOL(ip_generic_getfrag);
 EXPORT_SYMBOL(ip_queue_xmit);
 EXPORT_SYMBOL(ip_send_check);
-
-#ifdef CONFIG_SYSCTL
-EXPORT_SYMBOL(sysctl_ip_default_ttl);
-#endif
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index ff4bd067b39..2f0b47da5b3 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -153,7 +153,7 @@ int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc)
 		switch (cmsg->cmsg_type) {
 		case IP_RETOPTS:
 			err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
-			err = ip_options_get(&ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40, 0);
+			err = ip_options_get(&ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40);
 			if (err)
 				return err;
 			break;
@@ -425,7 +425,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
 			struct ip_options * opt = NULL;
 			if (optlen > 40 || optlen < 0)
 				goto e_inval;
-			err = ip_options_get(&opt, optval, optlen, 1);
+			err = ip_options_get_from_user(&opt, optval, optlen);
 			if (err)
 				break;
 			if (sk->sk_type == SOCK_STREAM) {
@@ -614,7 +614,6 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
 		}
 		case IP_MSFILTER:
 		{
-			extern int sysctl_optmem_max;
 			extern int sysctl_igmp_max_msf;
 			struct ip_msfilter *msf;
 
@@ -769,7 +768,6 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
 		}
 		case MCAST_MSFILTER:
 		{
-			extern int sysctl_optmem_max;
 			extern int sysctl_igmp_max_msf;
 			struct sockaddr_in *psin;
 			struct ip_msfilter *msf = NULL;
@@ -1090,7 +1088,5 @@ int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
 
 EXPORT_SYMBOL(ip_cmsg_recv);
 
-#ifdef CONFIG_IP_SCTP_MODULE
 EXPORT_SYMBOL(ip_getsockopt);
 EXPORT_SYMBOL(ip_setsockopt);
-#endif
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 7ded6e60f43..dcb7ee6c485 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -214,8 +214,8 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info)
 	                      spi, IPPROTO_COMP, AF_INET);
 	if (!x)
 		return;
-	NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%u.%u.%u.%u\n",
-	       spi, NIPQUAD(iph->daddr)));
+	NETDEBUG(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%u.%u.%u.%u\n",
+		 spi, NIPQUAD(iph->daddr));
 	xfrm_state_put(x);
 }
 
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index d2bf8e1930a..63e106605f2 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -393,7 +393,7 @@ static int __init ic_defaults(void)
 
 #ifdef IPCONFIG_RARP
 
-static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt);
+static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev);
 
 static struct packet_type rarp_packet_type __initdata = {
 	.type =	__constant_htons(ETH_P_RARP),
@@ -414,7 +414,7 @@ static inline void ic_rarp_cleanup(void)
  *  Process received RARP packet.
  */
 static int __init
-ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
 {
 	struct arphdr *rarp;
 	unsigned char *rarp_ptr;
@@ -555,7 +555,7 @@ struct bootp_pkt {		/* BOOTP packet format */
 #define DHCPRELEASE	7
 #define DHCPINFORM	8
 
-static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt);
+static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev);
 
 static struct packet_type bootp_packet_type __initdata = {
 	.type =	__constant_htons(ETH_P_IP),
@@ -823,7 +823,7 @@ static void __init ic_do_bootp_ext(u8 *ext)
 /*
  *  Receive BOOTP reply.
  */
-static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
 {
 	struct bootp_pkt *b;
 	struct iphdr *h;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index dc806b57842..9dbf5909f3a 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -103,7 +103,7 @@ static DEFINE_SPINLOCK(mfc_unres_lock);
    In this case data path is free of exclusive locks at all.
  */
 
-static kmem_cache_t *mrt_cachep;
+static kmem_cache_t *mrt_cachep __read_mostly;
 
 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c
index d9212addd19..6e092dadb38 100644
--- a/net/ipv4/ipvs/ip_vs_app.c
+++ b/net/ipv4/ipvs/ip_vs_app.c
@@ -26,6 +26,7 @@
 #include <linux/in.h>
 #include <linux/ip.h>
 #include <net/protocol.h>
+#include <net/tcp.h>
 #include <asm/system.h>
 #include <linux/stat.h>
 #include <linux/proc_fs.h>
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
index d0145a8b155..e11952ea17a 100644
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -40,7 +40,7 @@
 static struct list_head *ip_vs_conn_tab;
 
 /*  SLAB cache for IPVS connections */
-static kmem_cache_t *ip_vs_conn_cachep;
+static kmem_cache_t *ip_vs_conn_cachep __read_mostly;
 
 /*  counter for current IPVS connections */
 static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
index 5fb257dd07c..3ac7eeca04a 100644
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -22,6 +22,7 @@
  *
  * Changes:
  *	Paul `Rusty' Russell		properly handle non-linear skbs
+ *	Harald Welte			don't use nfcache
  *
  */
 
@@ -529,7 +530,7 @@ static unsigned int ip_vs_post_routing(unsigned int hooknum,
 				       const struct net_device *out,
 				       int (*okfn)(struct sk_buff *))
 {
-	if (!((*pskb)->nfcache & NFC_IPVS_PROPERTY))
+	if (!((*pskb)->ipvs_property))
 		return NF_ACCEPT;
 
 	/* The packet was sent from IPVS, exit this chain */
@@ -701,7 +702,7 @@ static int ip_vs_out_icmp(struct sk_buff **pskb, int *related)
 	/* do the statistics and put it back */
 	ip_vs_out_stats(cp, skb);
 
-	skb->nfcache |= NFC_IPVS_PROPERTY;
+	skb->ipvs_property = 1;
 	verdict = NF_ACCEPT;
 
   out:
@@ -739,7 +740,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **pskb,
 
 	EnterFunction(11);
 
-	if (skb->nfcache & NFC_IPVS_PROPERTY)
+	if (skb->ipvs_property)
 		return NF_ACCEPT;
 
 	iph = skb->nh.iph;
@@ -821,7 +822,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **pskb,
 	ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
 	ip_vs_conn_put(cp);
 
-	skb->nfcache |= NFC_IPVS_PROPERTY;
+	skb->ipvs_property = 1;
 
 	LeaveFunction(11);
 	return NF_ACCEPT;
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index 7d99ede2ef7..2d66848e7aa 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -1598,7 +1598,7 @@ static ctl_table vs_table[] = {
 	{ .ctl_name = 0 }
 };
 
-static ctl_table ipv4_table[] = {
+static ctl_table ipvs_ipv4_table[] = {
 	{
 		.ctl_name	= NET_IPV4,
 		.procname	= "ipv4",
@@ -1613,7 +1613,7 @@ static ctl_table vs_root_table[] = {
 		.ctl_name	= CTL_NET,
 		.procname	= "net",
 		.mode		= 0555,
-		.child		= ipv4_table,
+		.child		= ipvs_ipv4_table,
 	},
 	{ .ctl_name = 0 }
 };
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c
index c035838b780..561cda326fa 100644
--- a/net/ipv4/ipvs/ip_vs_lblc.c
+++ b/net/ipv4/ipvs/ip_vs_lblc.c
@@ -131,7 +131,7 @@ static ctl_table vs_table[] = {
 	{ .ctl_name = 0 }
 };
 
-static ctl_table ipv4_table[] = {
+static ctl_table ipvs_ipv4_table[] = {
 	{
 		.ctl_name	= NET_IPV4,
 		.procname	= "ipv4", 
@@ -146,7 +146,7 @@ static ctl_table lblc_root_table[] = {
 		.ctl_name	= CTL_NET,
 		.procname	= "net", 
 		.mode		= 0555, 
-		.child		= ipv4_table
+		.child		= ipvs_ipv4_table
 	},
 	{ .ctl_name = 0 }
 };
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
index 22b5dd55d27..ce456dbf09a 100644
--- a/net/ipv4/ipvs/ip_vs_lblcr.c
+++ b/net/ipv4/ipvs/ip_vs_lblcr.c
@@ -320,7 +320,7 @@ static ctl_table vs_table[] = {
 	{ .ctl_name = 0 }
 };
 
-static ctl_table ipv4_table[] = {
+static ctl_table ipvs_ipv4_table[] = {
 	{
 		.ctl_name	= NET_IPV4,
 		.procname	= "ipv4", 
@@ -335,7 +335,7 @@ static ctl_table lblcr_root_table[] = {
 		.ctl_name	= CTL_NET,
 		.procname	= "net", 
 		.mode		= 0555, 
-		.child		= ipv4_table
+		.child		= ipvs_ipv4_table
 	},
 	{ .ctl_name = 0 }
 };
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c
index e65de675da7..c19408973c0 100644
--- a/net/ipv4/ipvs/ip_vs_proto_tcp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c
@@ -604,14 +604,14 @@ void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
 }
 
 
-static void tcp_init(struct ip_vs_protocol *pp)
+static void ip_vs_tcp_init(struct ip_vs_protocol *pp)
 {
 	IP_VS_INIT_HASH_TABLE(tcp_apps);
 	pp->timeout_table = tcp_timeouts;
 }
 
 
-static void tcp_exit(struct ip_vs_protocol *pp)
+static void ip_vs_tcp_exit(struct ip_vs_protocol *pp)
 {
 }
 
@@ -621,8 +621,8 @@ struct ip_vs_protocol ip_vs_protocol_tcp = {
 	.protocol =		IPPROTO_TCP,
 	.dont_defrag =		0,
 	.appcnt =		ATOMIC_INIT(0),
-	.init =			tcp_init,
-	.exit =			tcp_exit,
+	.init =			ip_vs_tcp_init,
+	.exit =			ip_vs_tcp_exit,
 	.register_app =		tcp_register_app,
 	.unregister_app =	tcp_unregister_app,
 	.conn_schedule =	tcp_conn_schedule,
diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c
index a8512a3fd08..3b87482049c 100644
--- a/net/ipv4/ipvs/ip_vs_xmit.c
+++ b/net/ipv4/ipvs/ip_vs_xmit.c
@@ -127,7 +127,7 @@ ip_vs_dst_reset(struct ip_vs_dest *dest)
 
 #define IP_VS_XMIT(skb, rt)				\
 do {							\
-	(skb)->nfcache |= NFC_IPVS_PROPERTY;		\
+	(skb)->ipvs_property = 1;			\
 	(skb)->ip_summed = CHECKSUM_NONE;		\
 	NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL,	\
 		(rt)->u.dst.dev, dst_output);		\
diff --git a/net/ipv4/multipath_drr.c b/net/ipv4/multipath_drr.c
index c9cf8726051..db67373f9b3 100644
--- a/net/ipv4/multipath_drr.c
+++ b/net/ipv4/multipath_drr.c
@@ -107,7 +107,7 @@ static int drr_dev_event(struct notifier_block *this,
 	return NOTIFY_DONE;
 }
 
-struct notifier_block drr_dev_notifier = {
+static struct notifier_block drr_dev_notifier = {
 	.notifier_call	= drr_dev_event,
 };
 
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
new file mode 100644
index 00000000000..ae0779d82c5
--- /dev/null
+++ b/net/ipv4/netfilter.c
@@ -0,0 +1,139 @@
+/* IPv4 specific functions of netfilter core */
+
+#include <linux/config.h>
+#ifdef CONFIG_NETFILTER
+
+#include <linux/kernel.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/icmp.h>
+#include <net/route.h>
+#include <linux/ip.h>
+
+/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
+int ip_route_me_harder(struct sk_buff **pskb)
+{
+	struct iphdr *iph = (*pskb)->nh.iph;
+	struct rtable *rt;
+	struct flowi fl = {};
+	struct dst_entry *odst;
+	unsigned int hh_len;
+
+	/* some non-standard hacks like ipt_REJECT.c:send_reset() can cause
+	 * packets with foreign saddr to appear on the NF_IP_LOCAL_OUT hook.
+	 */
+	if (inet_addr_type(iph->saddr) == RTN_LOCAL) {
+		fl.nl_u.ip4_u.daddr = iph->daddr;
+		fl.nl_u.ip4_u.saddr = iph->saddr;
+		fl.nl_u.ip4_u.tos = RT_TOS(iph->tos);
+		fl.oif = (*pskb)->sk ? (*pskb)->sk->sk_bound_dev_if : 0;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+		fl.nl_u.ip4_u.fwmark = (*pskb)->nfmark;
+#endif
+		fl.proto = iph->protocol;
+		if (ip_route_output_key(&rt, &fl) != 0)
+			return -1;
+
+		/* Drop old route. */
+		dst_release((*pskb)->dst);
+		(*pskb)->dst = &rt->u.dst;
+	} else {
+		/* non-local src, find valid iif to satisfy
+		 * rp-filter when calling ip_route_input. */
+		fl.nl_u.ip4_u.daddr = iph->saddr;
+		if (ip_route_output_key(&rt, &fl) != 0)
+			return -1;
+
+		odst = (*pskb)->dst;
+		if (ip_route_input(*pskb, iph->daddr, iph->saddr,
+				   RT_TOS(iph->tos), rt->u.dst.dev) != 0) {
+			dst_release(&rt->u.dst);
+			return -1;
+		}
+		dst_release(&rt->u.dst);
+		dst_release(odst);
+	}
+	
+	if ((*pskb)->dst->error)
+		return -1;
+
+	/* Change in oif may mean change in hh_len. */
+	hh_len = (*pskb)->dst->dev->hard_header_len;
+	if (skb_headroom(*pskb) < hh_len) {
+		struct sk_buff *nskb;
+
+		nskb = skb_realloc_headroom(*pskb, hh_len);
+		if (!nskb) 
+			return -1;
+		if ((*pskb)->sk)
+			skb_set_owner_w(nskb, (*pskb)->sk);
+		kfree_skb(*pskb);
+		*pskb = nskb;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(ip_route_me_harder);
+
+/*
+ * Extra routing may needed on local out, as the QUEUE target never
+ * returns control to the table.
+ */
+
+struct ip_rt_info {
+	u_int32_t daddr;
+	u_int32_t saddr;
+	u_int8_t tos;
+};
+
+static void queue_save(const struct sk_buff *skb, struct nf_info *info)
+{
+	struct ip_rt_info *rt_info = nf_info_reroute(info);
+
+	if (info->hook == NF_IP_LOCAL_OUT) {
+		const struct iphdr *iph = skb->nh.iph;
+
+		rt_info->tos = iph->tos;
+		rt_info->daddr = iph->daddr;
+		rt_info->saddr = iph->saddr;
+	}
+}
+
+static int queue_reroute(struct sk_buff **pskb, const struct nf_info *info)
+{
+	const struct ip_rt_info *rt_info = nf_info_reroute(info);
+
+	if (info->hook == NF_IP_LOCAL_OUT) {
+		struct iphdr *iph = (*pskb)->nh.iph;
+
+		if (!(iph->tos == rt_info->tos
+		      && iph->daddr == rt_info->daddr
+		      && iph->saddr == rt_info->saddr))
+			return ip_route_me_harder(pskb);
+	}
+	return 0;
+}
+
+static struct nf_queue_rerouter ip_reroute = {
+	.rer_size	= sizeof(struct ip_rt_info),
+	.save		= queue_save,
+	.reroute	= queue_reroute,
+};
+
+static int init(void)
+{
+	return nf_register_queue_rerouter(PF_INET, &ip_reroute);
+}
+
+static void fini(void)
+{
+	nf_unregister_queue_rerouter(PF_INET);
+}
+
+module_init(init);
+module_exit(fini);
+
+#endif /* CONFIG_NETFILTER */
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 46d4cb1c06f..e046f552181 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -40,6 +40,16 @@ config IP_NF_CONNTRACK_MARK
 	  of packets, but this mark value is kept in the conntrack session
 	  instead of the individual packets.
 	
+config IP_NF_CONNTRACK_EVENTS
+	bool "Connection tracking events"
+	depends on IP_NF_CONNTRACK
+	help
+	  If this option is enabled, the connection tracking code will
+	  provide a notifier chain that can be used by other kernel code
+	  to get notified about changes in the connection tracking state.
+	  
+	  IF unsure, say `N'.
+
 config IP_NF_CT_PROTO_SCTP
 	tristate  'SCTP protocol connection tracking support (EXPERIMENTAL)'
 	depends on IP_NF_CONNTRACK && EXPERIMENTAL
@@ -100,11 +110,15 @@ config IP_NF_AMANDA
 	  To compile it as a module, choose M here.  If unsure, say Y.
 
 config IP_NF_QUEUE
-	tristate "Userspace queueing via NETLINK"
+	tristate "IP Userspace queueing via NETLINK (OBSOLETE)"
 	help
 	  Netfilter has the ability to queue packets to user space: the
 	  netlink device can be used to access them using this driver.
 
+	  This option enables the old IPv4-only "ip_queue" implementation
+	  which has been obsoleted by the new "nfnetlink_queue" code (see
+	  CONFIG_NETFILTER_NETLINK_QUEUE).
+
 	  To compile it as a module, choose M here.  If unsure, say N.
 
 config IP_NF_IPTABLES
@@ -340,6 +354,17 @@ config IP_NF_MATCH_SCTP
 	  If you want to compile it as a module, say M here and read
 	  <file:Documentation/modules.txt>.  If unsure, say `N'.
 
+config IP_NF_MATCH_DCCP
+	tristate  'DCCP protocol match support'
+	depends on IP_NF_IPTABLES
+	help
+	  With this option enabled, you will be able to use the iptables
+	  `dccp' match in order to match on DCCP source/destination ports
+	  and DCCP flags.
+
+	  If you want to compile it as a module, say M here and read
+	  <file:Documentation/modules.txt>.  If unsure, say `N'.
+
 config IP_NF_MATCH_COMMENT
 	tristate  'comment match support'
 	depends on IP_NF_IPTABLES
@@ -361,6 +386,16 @@ config IP_NF_MATCH_CONNMARK
 	  <file:Documentation/modules.txt>.  The module will be called
 	  ipt_connmark.o.  If unsure, say `N'.
 
+config IP_NF_MATCH_CONNBYTES
+	tristate  'Connection byte/packet counter match support'
+	depends on IP_NF_CT_ACCT && IP_NF_IPTABLES
+	help
+	  This option adds a `connbytes' match, which allows you to match the
+	  number of bytes and/or packets for each direction within a connection.
+
+	  If you want to compile it as a module, say M here and read
+	  <file:Documentation/modules.txt>.  If unsure, say `N'.
+
 config IP_NF_MATCH_HASHLIMIT
 	tristate  'hashlimit match support'
 	depends on IP_NF_IPTABLES
@@ -375,6 +410,19 @@ config IP_NF_MATCH_HASHLIMIT
 	  destination IP' or `500pps from any given source IP'  with a single
 	  IPtables rule.
 
+config IP_NF_MATCH_STRING
+	tristate  'string match support'
+	depends on IP_NF_IPTABLES 
+	select TEXTSEARCH
+	select TEXTSEARCH_KMP
+	select TEXTSEARCH_BM
+	select TEXTSEARCH_FSM
+	help
+	  This option adds a `string' match, which allows you to look for
+	  pattern matchings in packets.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 # `filter', generic and specific targets
 config IP_NF_FILTER
 	tristate "Packet filtering"
@@ -616,6 +664,20 @@ config IP_NF_TARGET_CLASSIFY
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config IP_NF_TARGET_TTL
+	tristate  'TTL target support'
+	depends on IP_NF_MANGLE
+	help
+	  This option adds a `TTL' target, which enables the user to modify
+	  the TTL value of the IP header.
+
+	  While it is safe to decrement/lower the TTL, this target also enables
+	  functionality to increment and set the TTL value of the IP header to
+	  arbitrary values.  This is EXTREMELY DANGEROUS since you can easily
+	  create immortal packets that loop forever on the network.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 config IP_NF_TARGET_CONNMARK
 	tristate  'CONNMARK target support'
 	depends on IP_NF_CONNTRACK_MARK && IP_NF_MANGLE
@@ -692,5 +754,11 @@ config IP_NF_ARP_MANGLE
 	  Allows altering the ARP packet payload: source and destination
 	  hardware and network addresses.
 
+config IP_NF_CONNTRACK_NETLINK
+        tristate 'Connection tracking netlink interface'
+        depends on IP_NF_CONNTRACK && NETFILTER_NETLINK
+        help
+          This option enables support for a netlink-based userspace interface
+
 endmenu
 
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 45796d5924d..a7bd38f5052 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -9,6 +9,10 @@ iptable_nat-objs	:= ip_nat_standalone.o ip_nat_rule.o ip_nat_core.o ip_nat_helpe
 # connection tracking
 obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o
 
+# conntrack netlink interface
+obj-$(CONFIG_IP_NF_CONNTRACK_NETLINK) += ip_conntrack_netlink.o
+
+
 # SCTP protocol connection tracking
 obj-$(CONFIG_IP_NF_CT_PROTO_SCTP) += ip_conntrack_proto_sctp.o
 
@@ -38,6 +42,7 @@ obj-$(CONFIG_IP_NF_MATCH_HELPER) += ipt_helper.o
 obj-$(CONFIG_IP_NF_MATCH_LIMIT) += ipt_limit.o
 obj-$(CONFIG_IP_NF_MATCH_HASHLIMIT) += ipt_hashlimit.o
 obj-$(CONFIG_IP_NF_MATCH_SCTP) += ipt_sctp.o
+obj-$(CONFIG_IP_NF_MATCH_DCCP) += ipt_dccp.o
 obj-$(CONFIG_IP_NF_MATCH_MARK) += ipt_mark.o
 obj-$(CONFIG_IP_NF_MATCH_MAC) += ipt_mac.o
 obj-$(CONFIG_IP_NF_MATCH_IPRANGE) += ipt_iprange.o
@@ -54,11 +59,13 @@ obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o
 obj-$(CONFIG_IP_NF_MATCH_STATE) += ipt_state.o
 obj-$(CONFIG_IP_NF_MATCH_CONNMARK) += ipt_connmark.o
 obj-$(CONFIG_IP_NF_MATCH_CONNTRACK) += ipt_conntrack.o
+obj-$(CONFIG_IP_NF_MATCH_CONNBYTES) += ipt_connbytes.o
 obj-$(CONFIG_IP_NF_MATCH_TCPMSS) += ipt_tcpmss.o
 obj-$(CONFIG_IP_NF_MATCH_REALM) += ipt_realm.o
 obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o
 obj-$(CONFIG_IP_NF_MATCH_PHYSDEV) += ipt_physdev.o
 obj-$(CONFIG_IP_NF_MATCH_COMMENT) += ipt_comment.o
+obj-$(CONFIG_IP_NF_MATCH_STRING) += ipt_string.o
 
 # targets
 obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
@@ -78,6 +85,7 @@ obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o
 obj-$(CONFIG_IP_NF_TARGET_TCPMSS) += ipt_TCPMSS.o
 obj-$(CONFIG_IP_NF_TARGET_NOTRACK) += ipt_NOTRACK.o
 obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o
+obj-$(CONFIG_IP_NF_TARGET_TTL) += ipt_TTL.o
 
 # generic ARP tables
 obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o
@@ -87,3 +95,4 @@ obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o
 obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o
 
 obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o
+obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += ipt_NFQUEUE.o
diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c
index 01e1b58322a..be4c9eb3243 100644
--- a/net/ipv4/netfilter/ip_conntrack_amanda.c
+++ b/net/ipv4/netfilter/ip_conntrack_amanda.c
@@ -40,7 +40,7 @@ MODULE_PARM_DESC(master_timeout, "timeout for the master connection");
 static char *conns[] = { "DATA ", "MESG ", "INDEX " };
 
 /* This is slow, but it's simple. --RR */
-static char amanda_buffer[65536];
+static char *amanda_buffer;
 static DEFINE_SPINLOCK(amanda_buffer_lock);
 
 unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb,
@@ -153,11 +153,25 @@ static struct ip_conntrack_helper amanda_helper = {
 static void __exit fini(void)
 {
 	ip_conntrack_helper_unregister(&amanda_helper);
+	kfree(amanda_buffer);
 }
 
 static int __init init(void)
 {
-	return ip_conntrack_helper_register(&amanda_helper);
+	int ret;
+
+	amanda_buffer = kmalloc(65536, GFP_KERNEL);
+	if (!amanda_buffer)
+		return -ENOMEM;
+
+	ret = ip_conntrack_helper_register(&amanda_helper);
+	if (ret < 0) {
+		kfree(amanda_buffer);
+		return ret;
+	}
+	return 0;
+
+
 }
 
 module_init(init);
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index a7f0c821a9b..a0648600190 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -37,6 +37,7 @@
 #include <linux/err.h>
 #include <linux/percpu.h>
 #include <linux/moduleparam.h>
+#include <linux/notifier.h>
 
 /* ip_conntrack_lock protects the main hash table, protocol/helper/expected
    registrations, conntrack timers*/
@@ -49,7 +50,7 @@
 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
 #include <linux/netfilter_ipv4/listhelp.h>
 
-#define IP_CONNTRACK_VERSION	"2.1"
+#define IP_CONNTRACK_VERSION	"2.3"
 
 #if 0
 #define DEBUGP printk
@@ -69,22 +70,81 @@ static LIST_HEAD(helpers);
 unsigned int ip_conntrack_htable_size = 0;
 int ip_conntrack_max;
 struct list_head *ip_conntrack_hash;
-static kmem_cache_t *ip_conntrack_cachep;
-static kmem_cache_t *ip_conntrack_expect_cachep;
+static kmem_cache_t *ip_conntrack_cachep __read_mostly;
+static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly;
 struct ip_conntrack ip_conntrack_untracked;
 unsigned int ip_ct_log_invalid;
 static LIST_HEAD(unconfirmed);
 static int ip_conntrack_vmalloc;
 
-DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
+static unsigned int ip_conntrack_next_id = 1;
+static unsigned int ip_conntrack_expect_next_id = 1;
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+struct notifier_block *ip_conntrack_chain;
+struct notifier_block *ip_conntrack_expect_chain;
+
+DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
 
-void 
-ip_conntrack_put(struct ip_conntrack *ct)
+/* deliver cached events and clear cache entry - must be called with locally
+ * disabled softirqs */
+static inline void
+__ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
 {
-	IP_NF_ASSERT(ct);
-	nf_conntrack_put(&ct->ct_general);
+	DEBUGP("ecache: delivering events for %p\n", ecache->ct);
+	if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
+		notifier_call_chain(&ip_conntrack_chain, ecache->events,
+				    ecache->ct);
+	ecache->events = 0;
+	ip_conntrack_put(ecache->ct);
+	ecache->ct = NULL;
 }
 
+/* Deliver all cached events for a particular conntrack. This is called
+ * by code prior to async packet handling or freeing the skb */
+void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
+{
+	struct ip_conntrack_ecache *ecache;
+	
+	local_bh_disable();
+	ecache = &__get_cpu_var(ip_conntrack_ecache);
+	if (ecache->ct == ct)
+		__ip_ct_deliver_cached_events(ecache);
+	local_bh_enable();
+}
+
+void __ip_ct_event_cache_init(struct ip_conntrack *ct)
+{
+	struct ip_conntrack_ecache *ecache;
+
+	/* take care of delivering potentially old events */
+	ecache = &__get_cpu_var(ip_conntrack_ecache);
+	BUG_ON(ecache->ct == ct);
+	if (ecache->ct)
+		__ip_ct_deliver_cached_events(ecache);
+	/* initialize for this conntrack/packet */
+	ecache->ct = ct;
+	nf_conntrack_get(&ct->ct_general);
+}
+
+/* flush the event cache - touches other CPU's data and must not be called while
+ * packets are still passing through the code */
+static void ip_ct_event_cache_flush(void)
+{
+	struct ip_conntrack_ecache *ecache;
+	int cpu;
+
+	for_each_cpu(cpu) {
+		ecache = &per_cpu(ip_conntrack_ecache, cpu);
+		if (ecache->ct)
+			ip_conntrack_put(ecache->ct);
+	}
+}
+#else
+static inline void ip_ct_event_cache_flush(void) {}
+#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
+
+DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
+
 static int ip_conntrack_hash_rnd_initted;
 static unsigned int ip_conntrack_hash_rnd;
 
@@ -144,6 +204,13 @@ static void unlink_expect(struct ip_conntrack_expect *exp)
 	list_del(&exp->list);
 	CONNTRACK_STAT_INC(expect_delete);
 	exp->master->expecting--;
+	ip_conntrack_expect_put(exp);
+}
+
+void __ip_ct_expect_unlink_destroy(struct ip_conntrack_expect *exp)
+{
+	unlink_expect(exp);
+	ip_conntrack_expect_put(exp);
 }
 
 static void expectation_timed_out(unsigned long ul_expect)
@@ -156,6 +223,33 @@ static void expectation_timed_out(unsigned long ul_expect)
 	ip_conntrack_expect_put(exp);
 }
 
+struct ip_conntrack_expect *
+__ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
+{
+	struct ip_conntrack_expect *i;
+	
+	list_for_each_entry(i, &ip_conntrack_expect_list, list) {
+		if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
+			atomic_inc(&i->use);
+			return i;
+		}
+	}
+	return NULL;
+}
+
+/* Just find a expectation corresponding to a tuple. */
+struct ip_conntrack_expect *
+ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
+{
+	struct ip_conntrack_expect *i;
+	
+	read_lock_bh(&ip_conntrack_lock);
+	i = __ip_conntrack_expect_find(tuple);
+	read_unlock_bh(&ip_conntrack_lock);
+
+	return i;
+}
+
 /* If an expectation for this connection is found, it gets delete from
  * global list then returned. */
 static struct ip_conntrack_expect *
@@ -180,7 +274,7 @@ find_expectation(const struct ip_conntrack_tuple *tuple)
 }
 
 /* delete all expectations for this conntrack */
-static void remove_expectations(struct ip_conntrack *ct)
+void ip_ct_remove_expectations(struct ip_conntrack *ct)
 {
 	struct ip_conntrack_expect *i, *tmp;
 
@@ -210,7 +304,7 @@ clean_from_lists(struct ip_conntrack *ct)
 	LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
 
 	/* Destroy all pending expectations */
-	remove_expectations(ct);
+	ip_ct_remove_expectations(ct);
 }
 
 static void
@@ -223,10 +317,13 @@ destroy_conntrack(struct nf_conntrack *nfct)
 	IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
 	IP_NF_ASSERT(!timer_pending(&ct->timeout));
 
+	ip_conntrack_event(IPCT_DESTROY, ct);
+	set_bit(IPS_DYING_BIT, &ct->status);
+
 	/* To make sure we don't get any weird locking issues here:
 	 * destroy_conntrack() MUST NOT be called with a write lock
 	 * to ip_conntrack_lock!!! -HW */
-	proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
+	proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
 	if (proto && proto->destroy)
 		proto->destroy(ct);
 
@@ -238,7 +335,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
 	 * except TFTP can create an expectation on the first packet,
 	 * before connection is in the list, so we need to clean here,
 	 * too. */
-	remove_expectations(ct);
+	ip_ct_remove_expectations(ct);
 
 	/* We overload first tuple to link into unconfirmed list. */
 	if (!is_confirmed(ct)) {
@@ -253,8 +350,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
 		ip_conntrack_put(ct->master);
 
 	DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
-	kmem_cache_free(ip_conntrack_cachep, ct);
-	atomic_dec(&ip_conntrack_count);
+	ip_conntrack_free(ct);
 }
 
 static void death_by_timeout(unsigned long ul_conntrack)
@@ -280,7 +376,7 @@ conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
 		&& ip_ct_tuple_equal(tuple, &i->tuple);
 }
 
-static struct ip_conntrack_tuple_hash *
+struct ip_conntrack_tuple_hash *
 __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
 		    const struct ip_conntrack *ignored_conntrack)
 {
@@ -315,6 +411,29 @@ ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
 	return h;
 }
 
+static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
+					unsigned int hash,
+					unsigned int repl_hash) 
+{
+	ct->id = ++ip_conntrack_next_id;
+	list_prepend(&ip_conntrack_hash[hash],
+		     &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
+	list_prepend(&ip_conntrack_hash[repl_hash],
+		     &ct->tuplehash[IP_CT_DIR_REPLY].list);
+}
+
+void ip_conntrack_hash_insert(struct ip_conntrack *ct)
+{
+	unsigned int hash, repl_hash;
+
+	hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+	repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+	write_lock_bh(&ip_conntrack_lock);
+	__ip_conntrack_hash_insert(ct, hash, repl_hash);
+	write_unlock_bh(&ip_conntrack_lock);
+}
+
 /* Confirm a connection given skb; places it in hash table */
 int
 __ip_conntrack_confirm(struct sk_buff **pskb)
@@ -361,10 +480,7 @@ __ip_conntrack_confirm(struct sk_buff **pskb)
 		/* Remove from unconfirmed list */
 		list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
 
-		list_prepend(&ip_conntrack_hash[hash],
-			     &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
-		list_prepend(&ip_conntrack_hash[repl_hash],
-			     &ct->tuplehash[IP_CT_DIR_REPLY]);
+		__ip_conntrack_hash_insert(ct, hash, repl_hash);
 		/* Timer relative to confirmation time, not original
 		   setting time, otherwise we'd get timer wrap in
 		   weird delay cases. */
@@ -374,6 +490,16 @@ __ip_conntrack_confirm(struct sk_buff **pskb)
 		set_bit(IPS_CONFIRMED_BIT, &ct->status);
 		CONNTRACK_STAT_INC(insert);
 		write_unlock_bh(&ip_conntrack_lock);
+		if (ct->helper)
+			ip_conntrack_event_cache(IPCT_HELPER, *pskb);
+#ifdef CONFIG_IP_NF_NAT_NEEDED
+		if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
+		    test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
+			ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
+#endif
+		ip_conntrack_event_cache(master_ct(ct) ?
+					 IPCT_RELATED : IPCT_NEW, *pskb);
+
 		return NF_ACCEPT;
 	}
 
@@ -438,34 +564,84 @@ static inline int helper_cmp(const struct ip_conntrack_helper *i,
 	return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
 }
 
-static struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
+static struct ip_conntrack_helper *
+__ip_conntrack_helper_find( const struct ip_conntrack_tuple *tuple)
 {
 	return LIST_FIND(&helpers, helper_cmp,
 			 struct ip_conntrack_helper *,
 			 tuple);
 }
 
-/* Allocate a new conntrack: we return -ENOMEM if classification
-   failed due to stress.  Otherwise it really is unclassifiable. */
-static struct ip_conntrack_tuple_hash *
-init_conntrack(const struct ip_conntrack_tuple *tuple,
-	       struct ip_conntrack_protocol *protocol,
-	       struct sk_buff *skb)
+struct ip_conntrack_helper *
+ip_conntrack_helper_find_get( const struct ip_conntrack_tuple *tuple)
+{
+	struct ip_conntrack_helper *helper;
+
+	/* need ip_conntrack_lock to assure that helper exists until
+	 * try_module_get() is called */
+	read_lock_bh(&ip_conntrack_lock);
+
+	helper = __ip_conntrack_helper_find(tuple);
+	if (helper) {
+		/* need to increase module usage count to assure helper will
+		 * not go away while the caller is e.g. busy putting a
+		 * conntrack in the hash that uses the helper */
+		if (!try_module_get(helper->me))
+			helper = NULL;
+	}
+
+	read_unlock_bh(&ip_conntrack_lock);
+
+	return helper;
+}
+
+void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
+{
+	module_put(helper->me);
+}
+
+struct ip_conntrack_protocol *
+__ip_conntrack_proto_find(u_int8_t protocol)
+{
+	return ip_ct_protos[protocol];
+}
+
+/* this is guaranteed to always return a valid protocol helper, since
+ * it falls back to generic_protocol */
+struct ip_conntrack_protocol *
+ip_conntrack_proto_find_get(u_int8_t protocol)
+{
+	struct ip_conntrack_protocol *p;
+
+	preempt_disable();
+	p = __ip_conntrack_proto_find(protocol);
+	if (p) {
+		if (!try_module_get(p->me))
+			p = &ip_conntrack_generic_protocol;
+	}
+	preempt_enable();
+	
+	return p;
+}
+
+void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
+{
+	module_put(p->me);
+}
+
+struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
+					struct ip_conntrack_tuple *repl)
 {
 	struct ip_conntrack *conntrack;
-	struct ip_conntrack_tuple repl_tuple;
-	size_t hash;
-	struct ip_conntrack_expect *exp;
 
 	if (!ip_conntrack_hash_rnd_initted) {
 		get_random_bytes(&ip_conntrack_hash_rnd, 4);
 		ip_conntrack_hash_rnd_initted = 1;
 	}
 
-	hash = hash_conntrack(tuple);
-
 	if (ip_conntrack_max
 	    && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
+		unsigned int hash = hash_conntrack(orig);
 		/* Try dropping from this hash chain. */
 		if (!early_drop(&ip_conntrack_hash[hash])) {
 			if (net_ratelimit())
@@ -476,11 +652,6 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
 		}
 	}
 
-	if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
-		DEBUGP("Can't invert tuple.\n");
-		return NULL;
-	}
-
 	conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
 	if (!conntrack) {
 		DEBUGP("Can't allocate conntrack.\n");
@@ -490,17 +661,50 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
 	memset(conntrack, 0, sizeof(*conntrack));
 	atomic_set(&conntrack->ct_general.use, 1);
 	conntrack->ct_general.destroy = destroy_conntrack;
-	conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
-	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
-	if (!protocol->new(conntrack, skb)) {
-		kmem_cache_free(ip_conntrack_cachep, conntrack);
-		return NULL;
-	}
+	conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
+	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
 	/* Don't set timer yet: wait for confirmation */
 	init_timer(&conntrack->timeout);
 	conntrack->timeout.data = (unsigned long)conntrack;
 	conntrack->timeout.function = death_by_timeout;
 
+	atomic_inc(&ip_conntrack_count);
+
+	return conntrack;
+}
+
+void
+ip_conntrack_free(struct ip_conntrack *conntrack)
+{
+	atomic_dec(&ip_conntrack_count);
+	kmem_cache_free(ip_conntrack_cachep, conntrack);
+}
+
+/* Allocate a new conntrack: we return -ENOMEM if classification
+ * failed due to stress.   Otherwise it really is unclassifiable */
+static struct ip_conntrack_tuple_hash *
+init_conntrack(struct ip_conntrack_tuple *tuple,
+	       struct ip_conntrack_protocol *protocol,
+	       struct sk_buff *skb)
+{
+	struct ip_conntrack *conntrack;
+	struct ip_conntrack_tuple repl_tuple;
+	struct ip_conntrack_expect *exp;
+
+	if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
+		DEBUGP("Can't invert tuple.\n");
+		return NULL;
+	}
+
+	conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
+	if (conntrack == NULL || IS_ERR(conntrack))
+		return (struct ip_conntrack_tuple_hash *)conntrack;
+
+	if (!protocol->new(conntrack, skb)) {
+		ip_conntrack_free(conntrack);
+		return NULL;
+	}
+
 	write_lock_bh(&ip_conntrack_lock);
 	exp = find_expectation(tuple);
 
@@ -521,7 +725,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
 		nf_conntrack_get(&conntrack->master->ct_general);
 		CONNTRACK_STAT_INC(expect_new);
 	} else {
-		conntrack->helper = ip_ct_find_helper(&repl_tuple);
+		conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);
 
 		CONNTRACK_STAT_INC(new);
 	}
@@ -529,7 +733,6 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
 	/* Overload tuple linked list to put us in unconfirmed list. */
 	list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
 
-	atomic_inc(&ip_conntrack_count);
 	write_unlock_bh(&ip_conntrack_lock);
 
 	if (exp) {
@@ -607,7 +810,7 @@ unsigned int ip_conntrack_in(unsigned int hooknum,
 	struct ip_conntrack *ct;
 	enum ip_conntrack_info ctinfo;
 	struct ip_conntrack_protocol *proto;
-	int set_reply;
+	int set_reply = 0;
 	int ret;
 
 	/* Previously seen (loopback or untracked)?  Ignore. */
@@ -625,9 +828,6 @@ unsigned int ip_conntrack_in(unsigned int hooknum,
 		return NF_DROP;
 	}
 
-	/* FIXME: Do this right please. --RR */
-	(*pskb)->nfcache |= NFC_UNKNOWN;
-
 /* Doesn't cover locally-generated broadcast, so not worth it. */
 #if 0
 	/* Ignore broadcast: no `connection'. */
@@ -643,7 +843,7 @@ unsigned int ip_conntrack_in(unsigned int hooknum,
 	}
 #endif
 
-	proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);
+	proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);
 
 	/* It may be an special packet, error, unclean...
 	 * inverse of the return code tells to the netfilter
@@ -679,8 +879,8 @@ unsigned int ip_conntrack_in(unsigned int hooknum,
 		return -ret;
 	}
 
-	if (set_reply)
-		set_bit(IPS_SEEN_REPLY_BIT, &ct->status);
+	if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
+		ip_conntrack_event_cache(IPCT_STATUS, *pskb);
 
 	return ret;
 }
@@ -689,7 +889,7 @@ int invert_tuplepr(struct ip_conntrack_tuple *inverse,
 		   const struct ip_conntrack_tuple *orig)
 {
 	return ip_ct_invert_tuple(inverse, orig, 
-				  ip_ct_find_proto(orig->dst.protonum));
+				  __ip_conntrack_proto_find(orig->dst.protonum));
 }
 
 /* Would two expected things clash? */
@@ -769,6 +969,8 @@ static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
 	exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
 	add_timer(&exp->timeout);
 
+	exp->id = ++ip_conntrack_expect_next_id;
+	atomic_inc(&exp->use);
 	CONNTRACK_STAT_INC(expect_create);
 }
 
@@ -827,6 +1029,7 @@ int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
 		evict_oldest_expect(expect->master);
 
 	ip_conntrack_expect_insert(expect);
+	ip_conntrack_expect_event(IPEXP_NEW, expect);
 	ret = 0;
 out:
 	write_unlock_bh(&ip_conntrack_lock);
@@ -847,7 +1050,7 @@ void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
 
 	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
 	if (!conntrack->master && conntrack->expecting == 0)
-		conntrack->helper = ip_ct_find_helper(newreply);
+		conntrack->helper = __ip_conntrack_helper_find(newreply);
 	write_unlock_bh(&ip_conntrack_lock);
 }
 
@@ -861,11 +1064,26 @@ int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
 	return 0;
 }
 
+struct ip_conntrack_helper *
+__ip_conntrack_helper_find_byname(const char *name)
+{
+	struct ip_conntrack_helper *h;
+
+	list_for_each_entry(h, &helpers, list) {
+		if (!strcmp(h->name, name))
+			return h;
+	}
+
+	return NULL;
+}
+
 static inline int unhelp(struct ip_conntrack_tuple_hash *i,
 			 const struct ip_conntrack_helper *me)
 {
-	if (tuplehash_to_ctrack(i)->helper == me)
+	if (tuplehash_to_ctrack(i)->helper == me) {
+ 		ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
 		tuplehash_to_ctrack(i)->helper = NULL;
+	}
 	return 0;
 }
 
@@ -927,12 +1145,46 @@ void ip_ct_refresh_acct(struct ip_conntrack *ct,
 		if (del_timer(&ct->timeout)) {
 			ct->timeout.expires = jiffies + extra_jiffies;
 			add_timer(&ct->timeout);
+			ip_conntrack_event_cache(IPCT_REFRESH, skb);
 		}
 		ct_add_counters(ct, ctinfo, skb);
 		write_unlock_bh(&ip_conntrack_lock);
 	}
 }
 
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+/* Generic function for tcp/udp/sctp/dccp and alike. This needs to be
+ * in ip_conntrack_core, since we don't want the protocols to autoload
+ * or depend on ctnetlink */
+int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
+			       const struct ip_conntrack_tuple *tuple)
+{
+	NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
+		&tuple->src.u.tcp.port);
+	NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
+		&tuple->dst.u.tcp.port);
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+
+int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
+			       struct ip_conntrack_tuple *t)
+{
+	if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
+		return -EINVAL;
+
+	t->src.u.tcp.port =
+		*(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
+	t->dst.u.tcp.port =
+		*(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
+
+	return 0;
+}
+#endif
+
 /* Returns new sk_buff, or NULL */
 struct sk_buff *
 ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
@@ -943,10 +1195,8 @@ ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
 	skb = ip_defrag(skb, user);
 	local_bh_enable();
 
-	if (skb) {
+	if (skb)
 		ip_send_check(skb->nh.iph);
-		skb->nfcache |= NFC_ALTERED;
-	}
 	return skb;
 }
 
@@ -1096,16 +1346,14 @@ static void free_conntrack_hash(void)
 				     * ip_conntrack_htable_size));
 }
 
-/* Mishearing the voices in his head, our hero wonders how he's
-   supposed to kill the mall. */
-void ip_conntrack_cleanup(void)
+void ip_conntrack_flush()
 {
-	ip_ct_attach = NULL;
 	/* This makes sure all current packets have passed through
            netfilter framework.  Roll on, two-stage module
            delete... */
 	synchronize_net();
- 
+
+	ip_ct_event_cache_flush();
  i_see_dead_people:
 	ip_ct_iterate_cleanup(kill_all, NULL);
 	if (atomic_read(&ip_conntrack_count) != 0) {
@@ -1115,7 +1363,14 @@ void ip_conntrack_cleanup(void)
 	/* wait until all references to ip_conntrack_untracked are dropped */
 	while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
 		schedule();
+}
 
+/* Mishearing the voices in his head, our hero wonders how he's
+   supposed to kill the mall. */
+void ip_conntrack_cleanup(void)
+{
+	ip_ct_attach = NULL;
+	ip_conntrack_flush();
 	kmem_cache_destroy(ip_conntrack_cachep);
 	kmem_cache_destroy(ip_conntrack_expect_cachep);
 	free_conntrack_hash();
diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c
index 7a3b773be3f..3a2627db172 100644
--- a/net/ipv4/netfilter/ip_conntrack_ftp.c
+++ b/net/ipv4/netfilter/ip_conntrack_ftp.c
@@ -25,8 +25,7 @@ MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
 MODULE_DESCRIPTION("ftp connection tracking helper");
 
 /* This is slow, but it's simple. --RR */
-static char ftp_buffer[65536];
-
+static char *ftp_buffer;
 static DEFINE_SPINLOCK(ip_ftp_lock);
 
 #define MAX_PORTS 8
@@ -262,7 +261,8 @@ static int find_nl_seq(u32 seq, const struct ip_ct_ftp_master *info, int dir)
 }
 
 /* We don't update if it's older than what we have. */
-static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir)
+static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir,
+			  struct sk_buff *skb)
 {
 	unsigned int i, oldest = NUM_SEQ_TO_REMEMBER;
 
@@ -276,10 +276,13 @@ static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir)
 			oldest = i;
 	}
 
-	if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER)
+	if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) {
 		info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq;
-	else if (oldest != NUM_SEQ_TO_REMEMBER)
+		ip_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb);
+	} else if (oldest != NUM_SEQ_TO_REMEMBER) {
 		info->seq_aft_nl[dir][oldest] = nl_seq;
+		ip_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb);
+	}
 }
 
 static int help(struct sk_buff **pskb,
@@ -439,7 +442,7 @@ out_update_nl:
 	/* Now if this ends in \n, update ftp info.  Seq may have been
 	 * adjusted by NAT code. */
 	if (ends_in_nl)
-		update_nl_seq(seq, ct_ftp_info,dir);
+		update_nl_seq(seq, ct_ftp_info,dir, *pskb);
  out:
 	spin_unlock_bh(&ip_ftp_lock);
 	return ret;
@@ -457,6 +460,8 @@ static void fini(void)
 				ports[i]);
 		ip_conntrack_helper_unregister(&ftp[i]);
 	}
+
+	kfree(ftp_buffer);
 }
 
 static int __init init(void)
@@ -464,6 +469,10 @@ static int __init init(void)
 	int i, ret;
 	char *tmpname;
 
+	ftp_buffer = kmalloc(65536, GFP_KERNEL);
+	if (!ftp_buffer)
+		return -ENOMEM;
+
 	if (ports_c == 0)
 		ports[ports_c++] = FTP_PORT;
 
diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c
index 4a28f297d50..25438eec21a 100644
--- a/net/ipv4/netfilter/ip_conntrack_irc.c
+++ b/net/ipv4/netfilter/ip_conntrack_irc.c
@@ -39,7 +39,7 @@ static int ports_c;
 static int max_dcc_channels = 8;
 static unsigned int dcc_timeout = 300;
 /* This is slow, but it's simple. --RR */
-static char irc_buffer[65536];
+static char *irc_buffer;
 static DEFINE_SPINLOCK(irc_buffer_lock);
 
 unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb,
@@ -257,6 +257,10 @@ static int __init init(void)
 		printk("ip_conntrack_irc: dcc_timeout must be a positive integer\n");
 		return -EBUSY;
 	}
+
+	irc_buffer = kmalloc(65536, GFP_KERNEL);
+	if (!irc_buffer)
+		return -ENOMEM;
 	
 	/* If no port given, default to standard irc port */
 	if (ports_c == 0)
@@ -304,6 +308,7 @@ static void fini(void)
 		       ports[i]);
 		ip_conntrack_helper_unregister(&irc_helpers[i]);
 	}
+	kfree(irc_buffer);
 }
 
 module_init(init);
diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c
new file mode 100644
index 00000000000..a4e9278db4e
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_netlink.c
@@ -0,0 +1,1579 @@
+/* Connection tracking via netlink socket. Allows for user space
+ * protocol helpers and general trouble making from userspace.
+ *
+ * (C) 2001 by Jay Schulist <jschlst@samba.org>
+ * (C) 2002-2005 by Harald Welte <laforge@gnumonks.org>
+ * (C) 2003 by Patrick Mchardy <kaber@trash.net>
+ * (C) 2005 by Pablo Neira Ayuso <pablo@eurodev.net>
+ *
+ * I've reworked this stuff to use attributes instead of conntrack 
+ * structures. 5.44 am. I need more tea. --pablo 05/07/11.
+ *
+ * Initial connection tracking via netlink development funded and 
+ * generally made possible by Network Robots, Inc. (www.networkrobots.com)
+ *
+ * Further development of this code funded by Astaro AG (http://www.astaro.com)
+ *
+ * This software may be used and distributed according to the terms
+ * of the GNU General Public License, incorporated herein by reference.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/netlink.h>
+#include <linux/spinlock.h>
+#include <linux/notifier.h>
+#include <linux/rtnetlink.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_core.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
+#include <linux/netfilter_ipv4/ip_nat_protocol.h>
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+
+MODULE_LICENSE("GPL");
+
+static char __initdata version[] = "0.90";
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+
+static inline int
+ctnetlink_dump_tuples_proto(struct sk_buff *skb, 
+			    const struct ip_conntrack_tuple *tuple)
+{
+	struct ip_conntrack_protocol *proto;
+
+	NFA_PUT(skb, CTA_PROTO_NUM, sizeof(u_int8_t), &tuple->dst.protonum);
+
+	proto = ip_conntrack_proto_find_get(tuple->dst.protonum);
+	if (proto && proto->tuple_to_nfattr)
+		return proto->tuple_to_nfattr(skb, tuple);
+
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+
+static inline int
+ctnetlink_dump_tuples(struct sk_buff *skb, 
+		      const struct ip_conntrack_tuple *tuple)
+{
+	struct nfattr *nest_parms;
+	
+	nest_parms = NFA_NEST(skb, CTA_TUPLE_IP);
+	NFA_PUT(skb, CTA_IP_V4_SRC, sizeof(u_int32_t), &tuple->src.ip);
+	NFA_PUT(skb, CTA_IP_V4_DST, sizeof(u_int32_t), &tuple->dst.ip);
+	NFA_NEST_END(skb, nest_parms);
+
+	nest_parms = NFA_NEST(skb, CTA_TUPLE_PROTO);
+	ctnetlink_dump_tuples_proto(skb, tuple);
+	NFA_NEST_END(skb, nest_parms);
+
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+
+static inline int
+ctnetlink_dump_status(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+	u_int32_t status = htonl((u_int32_t) ct->status);
+	NFA_PUT(skb, CTA_STATUS, sizeof(status), &status);
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+
+static inline int
+ctnetlink_dump_timeout(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+	long timeout_l = ct->timeout.expires - jiffies;
+	u_int32_t timeout;
+
+	if (timeout_l < 0)
+		timeout = 0;
+	else
+		timeout = htonl(timeout_l / HZ);
+	
+	NFA_PUT(skb, CTA_TIMEOUT, sizeof(timeout), &timeout);
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+
+static inline int
+ctnetlink_dump_protoinfo(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+	struct ip_conntrack_protocol *proto = ip_conntrack_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum);
+
+	struct nfattr *nest_proto;
+	int ret;
+	
+	if (!proto || !proto->to_nfattr)
+		return 0;
+	
+	nest_proto = NFA_NEST(skb, CTA_PROTOINFO);
+
+	ret = proto->to_nfattr(skb, nest_proto, ct);
+
+	ip_conntrack_proto_put(proto);
+
+	NFA_NEST_END(skb, nest_proto);
+
+	return ret;
+
+nfattr_failure:
+	return -1;
+}
+
+static inline int
+ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+	struct nfattr *nest_helper;
+
+	if (!ct->helper)
+		return 0;
+		
+	nest_helper = NFA_NEST(skb, CTA_HELP);
+	NFA_PUT(skb, CTA_HELP_NAME, CTA_HELP_MAXNAMESIZE, &ct->helper->name);
+
+	if (ct->helper->to_nfattr)
+		ct->helper->to_nfattr(skb, ct);
+
+	NFA_NEST_END(skb, nest_helper);
+
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+
+#ifdef CONFIG_IP_NF_CT_ACCT
+static inline int
+ctnetlink_dump_counters(struct sk_buff *skb, const struct ip_conntrack *ct,
+			enum ip_conntrack_dir dir)
+{
+	enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG;
+	struct nfattr *nest_count = NFA_NEST(skb, type);
+	u_int64_t tmp;
+
+	tmp = cpu_to_be64(ct->counters[dir].packets);
+	NFA_PUT(skb, CTA_COUNTERS_PACKETS, sizeof(u_int64_t), &tmp);
+
+	tmp = cpu_to_be64(ct->counters[dir].bytes);
+	NFA_PUT(skb, CTA_COUNTERS_BYTES, sizeof(u_int64_t), &tmp);
+
+	NFA_NEST_END(skb, nest_count);
+
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+#else
+#define ctnetlink_dump_counters(a, b, c) (0)
+#endif
+
+#ifdef CONFIG_IP_NF_CONNTRACK_MARK
+static inline int
+ctnetlink_dump_mark(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+	u_int32_t mark = htonl(ct->mark);
+
+	NFA_PUT(skb, CTA_MARK, sizeof(u_int32_t), &mark);
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+#else
+#define ctnetlink_dump_mark(a, b) (0)
+#endif
+
+static inline int
+ctnetlink_dump_id(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+	u_int32_t id = htonl(ct->id);
+	NFA_PUT(skb, CTA_ID, sizeof(u_int32_t), &id);
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+
+static inline int
+ctnetlink_dump_use(struct sk_buff *skb, const struct ip_conntrack *ct)
+{
+	unsigned int use = htonl(atomic_read(&ct->ct_general.use));
+	
+	NFA_PUT(skb, CTA_USE, sizeof(u_int32_t), &use);
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+
+#define tuple(ct, dir) (&(ct)->tuplehash[dir].tuple)
+
+static int
+ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
+		    int event, int nowait, 
+		    const struct ip_conntrack *ct)
+{
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *nfmsg;
+	struct nfattr *nest_parms;
+	unsigned char *b;
+
+	b = skb->tail;
+
+	event |= NFNL_SUBSYS_CTNETLINK << 8;
+	nlh    = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg));
+	nfmsg  = NLMSG_DATA(nlh);
+
+	nlh->nlmsg_flags    = (nowait && pid) ? NLM_F_MULTI : 0;
+	nfmsg->nfgen_family = AF_INET;
+	nfmsg->version      = NFNETLINK_V0;
+	nfmsg->res_id	    = 0;
+
+	nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG);
+	if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
+		goto nfattr_failure;
+	NFA_NEST_END(skb, nest_parms);
+	
+	nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY);
+	if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0)
+		goto nfattr_failure;
+	NFA_NEST_END(skb, nest_parms);
+
+	if (ctnetlink_dump_status(skb, ct) < 0 ||
+	    ctnetlink_dump_timeout(skb, ct) < 0 ||
+	    ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
+	    ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 ||
+	    ctnetlink_dump_protoinfo(skb, ct) < 0 ||
+	    ctnetlink_dump_helpinfo(skb, ct) < 0 ||
+	    ctnetlink_dump_mark(skb, ct) < 0 ||
+	    ctnetlink_dump_id(skb, ct) < 0 ||
+	    ctnetlink_dump_use(skb, ct) < 0)
+		goto nfattr_failure;
+
+	nlh->nlmsg_len = skb->tail - b;
+	return skb->len;
+
+nlmsg_failure:
+nfattr_failure:
+	skb_trim(skb, b - skb->data);
+	return -1;
+}
+
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+static int ctnetlink_conntrack_event(struct notifier_block *this,
+                                     unsigned long events, void *ptr)
+{
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *nfmsg;
+	struct nfattr *nest_parms;
+	struct ip_conntrack *ct = (struct ip_conntrack *)ptr;
+	struct sk_buff *skb;
+	unsigned int type;
+	unsigned char *b;
+	unsigned int flags = 0, group;
+
+	/* ignore our fake conntrack entry */
+	if (ct == &ip_conntrack_untracked)
+		return NOTIFY_DONE;
+
+	if (events & IPCT_DESTROY) {
+		type = IPCTNL_MSG_CT_DELETE;
+		group = NFNLGRP_CONNTRACK_DESTROY;
+		goto alloc_skb;
+	}
+	if (events & (IPCT_NEW | IPCT_RELATED)) {
+		type = IPCTNL_MSG_CT_NEW;
+		flags = NLM_F_CREATE|NLM_F_EXCL;
+		/* dump everything */
+		events = ~0UL;
+		group = NFNLGRP_CONNTRACK_NEW;
+		goto alloc_skb;
+	}
+	if (events & (IPCT_STATUS |
+		      IPCT_PROTOINFO |
+		      IPCT_HELPER |
+		      IPCT_HELPINFO |
+		      IPCT_NATINFO)) {
+		type = IPCTNL_MSG_CT_NEW;
+		group = NFNLGRP_CONNTRACK_UPDATE;
+		goto alloc_skb;
+	} 
+	
+	return NOTIFY_DONE;
+
+alloc_skb:
+  /* FIXME: Check if there are any listeners before, don't hurt performance */
+	
+	skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
+	if (!skb)
+		return NOTIFY_DONE;
+
+	b = skb->tail;
+
+	type |= NFNL_SUBSYS_CTNETLINK << 8;
+	nlh   = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg));
+	nfmsg = NLMSG_DATA(nlh);
+
+	nlh->nlmsg_flags    = flags;
+	nfmsg->nfgen_family = AF_INET;
+	nfmsg->version	= NFNETLINK_V0;
+	nfmsg->res_id	= 0;
+
+	nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG);
+	if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
+		goto nfattr_failure;
+	NFA_NEST_END(skb, nest_parms);
+	
+	nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY);
+	if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0)
+		goto nfattr_failure;
+	NFA_NEST_END(skb, nest_parms);
+	
+	/* NAT stuff is now a status flag */
+	if ((events & IPCT_STATUS || events & IPCT_NATINFO)
+	    && ctnetlink_dump_status(skb, ct) < 0)
+		goto nfattr_failure;
+	if (events & IPCT_REFRESH
+	    && ctnetlink_dump_timeout(skb, ct) < 0)
+		goto nfattr_failure;
+	if (events & IPCT_PROTOINFO
+	    && ctnetlink_dump_protoinfo(skb, ct) < 0)
+		goto nfattr_failure;
+	if (events & IPCT_HELPINFO
+	    && ctnetlink_dump_helpinfo(skb, ct) < 0)
+		goto nfattr_failure;
+
+	if (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
+	    ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0)
+		goto nfattr_failure;
+
+	nlh->nlmsg_len = skb->tail - b;
+	nfnetlink_send(skb, 0, group, 0);
+	return NOTIFY_DONE;
+
+nlmsg_failure:
+nfattr_failure:
+	kfree_skb(skb);
+	return NOTIFY_DONE;
+}
+#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
+
+static int ctnetlink_done(struct netlink_callback *cb)
+{
+	DEBUGP("entered %s\n", __FUNCTION__);
+	return 0;
+}
+
+static int
+ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct ip_conntrack *ct = NULL;
+	struct ip_conntrack_tuple_hash *h;
+	struct list_head *i;
+	u_int32_t *id = (u_int32_t *) &cb->args[1];
+
+	DEBUGP("entered %s, last bucket=%lu id=%u\n", __FUNCTION__, 
+			cb->args[0], *id);
+
+	read_lock_bh(&ip_conntrack_lock);
+	for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) {
+		list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) {
+			h = (struct ip_conntrack_tuple_hash *) i;
+			if (DIRECTION(h) != IP_CT_DIR_ORIGINAL)
+				continue;
+			ct = tuplehash_to_ctrack(h);
+			if (ct->id <= *id)
+				continue;
+			if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid,
+		                        	cb->nlh->nlmsg_seq,
+						IPCTNL_MSG_CT_NEW,
+						1, ct) < 0)
+				goto out;
+			*id = ct->id;
+		}
+	}
+out:	
+	read_unlock_bh(&ip_conntrack_lock);
+
+	DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id);
+
+	return skb->len;
+}
+
+#ifdef CONFIG_IP_NF_CT_ACCT
+static int
+ctnetlink_dump_table_w(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct ip_conntrack *ct = NULL;
+	struct ip_conntrack_tuple_hash *h;
+	struct list_head *i;
+	u_int32_t *id = (u_int32_t *) &cb->args[1];
+
+	DEBUGP("entered %s, last bucket=%u id=%u\n", __FUNCTION__, 
+			cb->args[0], *id);
+
+	write_lock_bh(&ip_conntrack_lock);
+	for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) {
+		list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) {
+			h = (struct ip_conntrack_tuple_hash *) i;
+			if (DIRECTION(h) != IP_CT_DIR_ORIGINAL)
+				continue;
+			ct = tuplehash_to_ctrack(h);
+			if (ct->id <= *id)
+				continue;
+			if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid,
+		                        	cb->nlh->nlmsg_seq,
+						IPCTNL_MSG_CT_NEW,
+						1, ct) < 0)
+				goto out;
+			*id = ct->id;
+
+			memset(&ct->counters, 0, sizeof(ct->counters));
+		}
+	}
+out:	
+	write_unlock_bh(&ip_conntrack_lock);
+
+	DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id);
+
+	return skb->len;
+}
+#endif
+
+static const int cta_min_ip[CTA_IP_MAX] = {
+	[CTA_IP_V4_SRC-1]	= sizeof(u_int32_t),
+	[CTA_IP_V4_DST-1]	= sizeof(u_int32_t),
+};
+
+static inline int
+ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple)
+{
+	struct nfattr *tb[CTA_IP_MAX];
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	
+	if (nfattr_parse_nested(tb, CTA_IP_MAX, attr) < 0)
+		goto nfattr_failure;
+
+	if (nfattr_bad_size(tb, CTA_IP_MAX, cta_min_ip))
+		return -EINVAL;
+
+	if (!tb[CTA_IP_V4_SRC-1])
+		return -EINVAL;
+	tuple->src.ip = *(u_int32_t *)NFA_DATA(tb[CTA_IP_V4_SRC-1]);
+
+	if (!tb[CTA_IP_V4_DST-1])
+		return -EINVAL;
+	tuple->dst.ip = *(u_int32_t *)NFA_DATA(tb[CTA_IP_V4_DST-1]);
+
+	DEBUGP("leaving\n");
+
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+
+static const int cta_min_proto[CTA_PROTO_MAX] = {
+	[CTA_PROTO_NUM-1]	= sizeof(u_int16_t),
+	[CTA_PROTO_SRC_PORT-1]	= sizeof(u_int16_t),
+	[CTA_PROTO_DST_PORT-1]	= sizeof(u_int16_t),
+	[CTA_PROTO_ICMP_TYPE-1]	= sizeof(u_int8_t),
+	[CTA_PROTO_ICMP_CODE-1]	= sizeof(u_int8_t),
+	[CTA_PROTO_ICMP_ID-1]	= sizeof(u_int16_t),
+};
+
+static inline int
+ctnetlink_parse_tuple_proto(struct nfattr *attr, 
+			    struct ip_conntrack_tuple *tuple)
+{
+	struct nfattr *tb[CTA_PROTO_MAX];
+	struct ip_conntrack_protocol *proto;
+	int ret = 0;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	if (nfattr_parse_nested(tb, CTA_PROTO_MAX, attr) < 0)
+		goto nfattr_failure;
+
+	if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto))
+		return -EINVAL;
+
+	if (!tb[CTA_PROTO_NUM-1])
+		return -EINVAL;
+	tuple->dst.protonum = *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_NUM-1]);
+
+	proto = ip_conntrack_proto_find_get(tuple->dst.protonum);
+
+	if (likely(proto && proto->nfattr_to_tuple)) {
+		ret = proto->nfattr_to_tuple(tb, tuple);
+		ip_conntrack_proto_put(proto);
+	}
+	
+	return ret;
+
+nfattr_failure:
+	return -1;
+}
+
+static inline int
+ctnetlink_parse_tuple(struct nfattr *cda[], struct ip_conntrack_tuple *tuple,
+		      enum ctattr_tuple type)
+{
+	struct nfattr *tb[CTA_TUPLE_MAX];
+	int err;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	memset(tuple, 0, sizeof(*tuple));
+
+	if (nfattr_parse_nested(tb, CTA_TUPLE_MAX, cda[type-1]) < 0)
+		goto nfattr_failure;
+
+	if (!tb[CTA_TUPLE_IP-1])
+		return -EINVAL;
+
+	err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP-1], tuple);
+	if (err < 0)
+		return err;
+
+	if (!tb[CTA_TUPLE_PROTO-1])
+		return -EINVAL;
+
+	err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO-1], tuple);
+	if (err < 0)
+		return err;
+
+	/* orig and expect tuples get DIR_ORIGINAL */
+	if (type == CTA_TUPLE_REPLY)
+		tuple->dst.dir = IP_CT_DIR_REPLY;
+	else
+		tuple->dst.dir = IP_CT_DIR_ORIGINAL;
+
+	DUMP_TUPLE(tuple);
+
+	DEBUGP("leaving\n");
+
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+
+#ifdef CONFIG_IP_NF_NAT_NEEDED
+static const int cta_min_protonat[CTA_PROTONAT_MAX] = {
+	[CTA_PROTONAT_PORT_MIN-1]	= sizeof(u_int16_t),
+	[CTA_PROTONAT_PORT_MAX-1]	= sizeof(u_int16_t),
+};
+
+static int ctnetlink_parse_nat_proto(struct nfattr *attr,
+				     const struct ip_conntrack *ct,
+				     struct ip_nat_range *range)
+{
+	struct nfattr *tb[CTA_PROTONAT_MAX];
+	struct ip_nat_protocol *npt;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	if (nfattr_parse_nested(tb, CTA_PROTONAT_MAX, attr) < 0)
+		goto nfattr_failure;
+
+	if (nfattr_bad_size(tb, CTA_PROTONAT_MAX, cta_min_protonat))
+		goto nfattr_failure;
+
+	npt = ip_nat_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum);
+	if (!npt)
+		return 0;
+
+	if (!npt->nfattr_to_range) {
+		ip_nat_proto_put(npt);
+		return 0;
+	}
+
+	/* nfattr_to_range returns 1 if it parsed, 0 if not, neg. on error */
+	if (npt->nfattr_to_range(tb, range) > 0)
+		range->flags |= IP_NAT_RANGE_PROTO_SPECIFIED;
+
+	ip_nat_proto_put(npt);
+
+	DEBUGP("leaving\n");
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+
+static inline int
+ctnetlink_parse_nat(struct nfattr *cda[],
+		    const struct ip_conntrack *ct, struct ip_nat_range *range)
+{
+	struct nfattr *tb[CTA_NAT_MAX];
+	int err;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	memset(range, 0, sizeof(*range));
+	
+	if (nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]) < 0)
+		goto nfattr_failure;
+
+	if (tb[CTA_NAT_MINIP-1])
+		range->min_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MINIP-1]);
+
+	if (!tb[CTA_NAT_MAXIP-1])
+		range->max_ip = range->min_ip;
+	else
+		range->max_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MAXIP-1]);
+
+	if (range->min_ip)
+		range->flags |= IP_NAT_RANGE_MAP_IPS;
+
+	if (!tb[CTA_NAT_PROTO-1])
+		return 0;
+
+	err = ctnetlink_parse_nat_proto(tb[CTA_NAT_PROTO-1], ct, range);
+	if (err < 0)
+		return err;
+
+	DEBUGP("leaving\n");
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+#endif
+
+static inline int
+ctnetlink_parse_help(struct nfattr *attr, char **helper_name)
+{
+	struct nfattr *tb[CTA_HELP_MAX];
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	if (nfattr_parse_nested(tb, CTA_HELP_MAX, attr) < 0)
+		goto nfattr_failure;
+
+	if (!tb[CTA_HELP_NAME-1])
+		return -EINVAL;
+
+	*helper_name = NFA_DATA(tb[CTA_HELP_NAME-1]);
+
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+
+static int
+ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, 
+			struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+{
+	struct ip_conntrack_tuple_hash *h;
+	struct ip_conntrack_tuple tuple;
+	struct ip_conntrack *ct;
+	int err = 0;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	if (cda[CTA_TUPLE_ORIG-1])
+		err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG);
+	else if (cda[CTA_TUPLE_REPLY-1])
+		err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY);
+	else {
+		/* Flush the whole table */
+		ip_conntrack_flush();
+		return 0;
+	}
+
+	if (err < 0)
+		return err;
+
+	h = ip_conntrack_find_get(&tuple, NULL);
+	if (!h) {
+		DEBUGP("tuple not found in conntrack hash\n");
+		return -ENOENT;
+	}
+
+	ct = tuplehash_to_ctrack(h);
+	
+	if (cda[CTA_ID-1]) {
+		u_int32_t id = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_ID-1]));
+		if (ct->id != id) {
+			ip_conntrack_put(ct);
+			return -ENOENT;
+		}
+	}	
+	if (del_timer(&ct->timeout)) {
+		ip_conntrack_put(ct);
+		ct->timeout.function((unsigned long)ct);
+		return 0;
+	}
+	ip_conntrack_put(ct);
+	DEBUGP("leaving\n");
+
+	return 0;
+}
+
+static int
+ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, 
+			struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+{
+	struct ip_conntrack_tuple_hash *h;
+	struct ip_conntrack_tuple tuple;
+	struct ip_conntrack *ct;
+	struct sk_buff *skb2 = NULL;
+	int err = 0;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+		struct nfgenmsg *msg = NLMSG_DATA(nlh);
+		u32 rlen;
+
+		if (msg->nfgen_family != AF_INET)
+			return -EAFNOSUPPORT;
+
+		if (NFNL_MSG_TYPE(nlh->nlmsg_type) ==
+					IPCTNL_MSG_CT_GET_CTRZERO) {
+#ifdef CONFIG_IP_NF_CT_ACCT
+			if ((*errp = netlink_dump_start(ctnl, skb, nlh,
+						ctnetlink_dump_table_w,
+						ctnetlink_done)) != 0)
+				return -EINVAL;
+#else
+			return -ENOTSUPP;
+#endif
+		} else {
+			if ((*errp = netlink_dump_start(ctnl, skb, nlh,
+		      		                        ctnetlink_dump_table,
+		                                	ctnetlink_done)) != 0)
+			return -EINVAL;
+		}
+
+		rlen = NLMSG_ALIGN(nlh->nlmsg_len);
+		if (rlen > skb->len)
+			rlen = skb->len;
+		skb_pull(skb, rlen);
+		return 0;
+	}
+
+	if (cda[CTA_TUPLE_ORIG-1])
+		err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG);
+	else if (cda[CTA_TUPLE_REPLY-1])
+		err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY);
+	else
+		return -EINVAL;
+
+	if (err < 0)
+		return err;
+
+	h = ip_conntrack_find_get(&tuple, NULL);
+	if (!h) {
+		DEBUGP("tuple not found in conntrack hash");
+		return -ENOENT;
+	}
+	DEBUGP("tuple found\n");
+	ct = tuplehash_to_ctrack(h);
+
+	err = -ENOMEM;
+	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
+	if (!skb2) {
+		ip_conntrack_put(ct);
+		return -ENOMEM;
+	}
+	NETLINK_CB(skb2).dst_pid = NETLINK_CB(skb).pid;
+
+	err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq, 
+				  IPCTNL_MSG_CT_NEW, 1, ct);
+	ip_conntrack_put(ct);
+	if (err <= 0)
+		goto out;
+
+	err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
+	if (err < 0)
+		goto out;
+
+	DEBUGP("leaving\n");
+	return 0;
+
+out:
+	if (skb2)
+		kfree_skb(skb2);
+	return -1;
+}
+
+static inline int
+ctnetlink_change_status(struct ip_conntrack *ct, struct nfattr *cda[])
+{
+	unsigned long d, status = *(u_int32_t *)NFA_DATA(cda[CTA_STATUS-1]);
+	d = ct->status ^ status;
+
+	if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING))
+		/* unchangeable */
+		return -EINVAL;
+	
+	if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY))
+		/* SEEN_REPLY bit can only be set */
+		return -EINVAL;
+
+	
+	if (d & IPS_ASSURED && !(status & IPS_ASSURED))
+		/* ASSURED bit can only be set */
+		return -EINVAL;
+
+	if (cda[CTA_NAT-1]) {
+#ifndef CONFIG_IP_NF_NAT_NEEDED
+		return -EINVAL;
+#else
+		unsigned int hooknum;
+		struct ip_nat_range range;
+
+		if (ctnetlink_parse_nat(cda, ct, &range) < 0)
+			return -EINVAL;
+
+		DEBUGP("NAT: %u.%u.%u.%u-%u.%u.%u.%u:%u-%u\n", 
+		       NIPQUAD(range.min_ip), NIPQUAD(range.max_ip),
+		       htons(range.min.all), htons(range.max.all));
+		
+		/* This is tricky but it works. ip_nat_setup_info needs the
+		 * hook number as parameter, so let's do the correct 
+		 * conversion and run away */
+		if (status & IPS_SRC_NAT_DONE)
+			hooknum = NF_IP_POST_ROUTING; /* IP_NAT_MANIP_SRC */
+		else if (status & IPS_DST_NAT_DONE)
+			hooknum = NF_IP_PRE_ROUTING;  /* IP_NAT_MANIP_DST */
+		else 
+			return -EINVAL; /* Missing NAT flags */
+
+		DEBUGP("NAT status: %lu\n", 
+		       status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK));
+		
+		if (ip_nat_initialized(ct, hooknum))
+			return -EEXIST;
+		ip_nat_setup_info(ct, &range, hooknum);
+
+                DEBUGP("NAT status after setup_info: %lu\n",
+                       ct->status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK));
+#endif
+	}
+
+	/* Be careful here, modifying NAT bits can screw up things,
+	 * so don't let users modify them directly if they don't pass
+	 * ip_nat_range. */
+	ct->status |= status & ~(IPS_NAT_DONE_MASK | IPS_NAT_MASK);
+	return 0;
+}
+
+
+static inline int
+ctnetlink_change_helper(struct ip_conntrack *ct, struct nfattr *cda[])
+{
+	struct ip_conntrack_helper *helper;
+	char *helpname;
+	int err;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	/* don't change helper of sibling connections */
+	if (ct->master)
+		return -EINVAL;
+
+	err = ctnetlink_parse_help(cda[CTA_HELP-1], &helpname);
+	if (err < 0)
+		return err;
+
+	helper = __ip_conntrack_helper_find_byname(helpname);
+	if (!helper) {
+		if (!strcmp(helpname, ""))
+			helper = NULL;
+		else
+			return -EINVAL;
+	}
+
+	if (ct->helper) {
+		if (!helper) {
+			/* we had a helper before ... */
+			ip_ct_remove_expectations(ct);
+			ct->helper = NULL;
+		} else {
+			/* need to zero data of old helper */
+			memset(&ct->help, 0, sizeof(ct->help));
+		}
+	}
+	
+	ct->helper = helper;
+
+	return 0;
+}
+
+static inline int
+ctnetlink_change_timeout(struct ip_conntrack *ct, struct nfattr *cda[])
+{
+	u_int32_t timeout = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_TIMEOUT-1]));
+	
+	if (!del_timer(&ct->timeout))
+		return -ETIME;
+
+	ct->timeout.expires = jiffies + timeout * HZ;
+	add_timer(&ct->timeout);
+
+	return 0;
+}
+
+static int
+ctnetlink_change_conntrack(struct ip_conntrack *ct, struct nfattr *cda[])
+{
+	int err;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	if (cda[CTA_HELP-1]) {
+		err = ctnetlink_change_helper(ct, cda);
+		if (err < 0)
+			return err;
+	}
+
+	if (cda[CTA_TIMEOUT-1]) {
+		err = ctnetlink_change_timeout(ct, cda);
+		if (err < 0)
+			return err;
+	}
+
+	if (cda[CTA_STATUS-1]) {
+		err = ctnetlink_change_status(ct, cda);
+		if (err < 0)
+			return err;
+	}
+
+	DEBUGP("all done\n");
+	return 0;
+}
+
+static int
+ctnetlink_create_conntrack(struct nfattr *cda[], 
+			   struct ip_conntrack_tuple *otuple,
+			   struct ip_conntrack_tuple *rtuple)
+{
+	struct ip_conntrack *ct;
+	int err = -EINVAL;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	ct = ip_conntrack_alloc(otuple, rtuple);
+	if (ct == NULL || IS_ERR(ct))
+		return -ENOMEM;	
+
+	if (!cda[CTA_TIMEOUT-1])
+		goto err;
+	ct->timeout.expires = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_TIMEOUT-1]));
+
+	ct->timeout.expires = jiffies + ct->timeout.expires * HZ;
+	ct->status |= IPS_CONFIRMED;
+
+	err = ctnetlink_change_status(ct, cda);
+	if (err < 0)
+		goto err;
+
+	ct->helper = ip_conntrack_helper_find_get(rtuple);
+
+	add_timer(&ct->timeout);
+	ip_conntrack_hash_insert(ct);
+
+	if (ct->helper)
+		ip_conntrack_helper_put(ct->helper);
+
+	DEBUGP("conntrack with id %u inserted\n", ct->id);
+	return 0;
+
+err:	
+	ip_conntrack_free(ct);
+	return err;
+}
+
+static int 
+ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, 
+			struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+{
+	struct ip_conntrack_tuple otuple, rtuple;
+	struct ip_conntrack_tuple_hash *h = NULL;
+	int err = 0;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	if (cda[CTA_TUPLE_ORIG-1]) {
+		err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG);
+		if (err < 0)
+			return err;
+	}
+
+	if (cda[CTA_TUPLE_REPLY-1]) {
+		err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY);
+		if (err < 0)
+			return err;
+	}
+
+	write_lock_bh(&ip_conntrack_lock);
+	if (cda[CTA_TUPLE_ORIG-1])
+		h = __ip_conntrack_find(&otuple, NULL);
+	else if (cda[CTA_TUPLE_REPLY-1])
+		h = __ip_conntrack_find(&rtuple, NULL);
+
+	if (h == NULL) {
+		write_unlock_bh(&ip_conntrack_lock);
+		DEBUGP("no such conntrack, create new\n");
+		err = -ENOENT;
+		if (nlh->nlmsg_flags & NLM_F_CREATE)
+			err = ctnetlink_create_conntrack(cda, &otuple, &rtuple);
+		return err;
+	}
+	/* implicit 'else' */
+
+	/* we only allow nat config for new conntracks */
+	if (cda[CTA_NAT-1]) {
+		err = -EINVAL;
+		goto out_unlock;
+	}
+
+	/* We manipulate the conntrack inside the global conntrack table lock,
+	 * so there's no need to increase the refcount */
+	DEBUGP("conntrack found\n");
+	err = -EEXIST;
+	if (!(nlh->nlmsg_flags & NLM_F_EXCL))
+		err = ctnetlink_change_conntrack(tuplehash_to_ctrack(h), cda);
+
+out_unlock:
+	write_unlock_bh(&ip_conntrack_lock);
+	return err;
+}
+
+/*********************************************************************** 
+ * EXPECT 
+ ***********************************************************************/ 
+
+static inline int
+ctnetlink_exp_dump_tuple(struct sk_buff *skb,
+			 const struct ip_conntrack_tuple *tuple,
+			 enum ctattr_expect type)
+{
+	struct nfattr *nest_parms = NFA_NEST(skb, type);
+	
+	if (ctnetlink_dump_tuples(skb, tuple) < 0)
+		goto nfattr_failure;
+
+	NFA_NEST_END(skb, nest_parms);
+
+	return 0;
+
+nfattr_failure:
+	return -1;
+}			
+
+static inline int
+ctnetlink_exp_dump_expect(struct sk_buff *skb,
+                          const struct ip_conntrack_expect *exp)
+{
+	struct ip_conntrack *master = exp->master;
+	u_int32_t timeout = htonl((exp->timeout.expires - jiffies) / HZ);
+	u_int32_t id = htonl(exp->id);
+
+	if (ctnetlink_exp_dump_tuple(skb, &exp->tuple, CTA_EXPECT_TUPLE) < 0)
+		goto nfattr_failure;
+	if (ctnetlink_exp_dump_tuple(skb, &exp->mask, CTA_EXPECT_MASK) < 0)
+		goto nfattr_failure;
+	if (ctnetlink_exp_dump_tuple(skb,
+				 &master->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+				 CTA_EXPECT_MASTER) < 0)
+		goto nfattr_failure;
+	
+	NFA_PUT(skb, CTA_EXPECT_TIMEOUT, sizeof(timeout), &timeout);
+	NFA_PUT(skb, CTA_EXPECT_ID, sizeof(u_int32_t), &id);
+
+	return 0;
+	
+nfattr_failure:
+	return -1;
+}
+
+static int
+ctnetlink_exp_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
+		    int event, 
+		    int nowait, 
+		    const struct ip_conntrack_expect *exp)
+{
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *nfmsg;
+	unsigned char *b;
+
+	b = skb->tail;
+
+	event |= NFNL_SUBSYS_CTNETLINK_EXP << 8;
+	nlh    = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg));
+	nfmsg  = NLMSG_DATA(nlh);
+
+	nlh->nlmsg_flags    = (nowait && pid) ? NLM_F_MULTI : 0;
+	nfmsg->nfgen_family = AF_INET;
+	nfmsg->version	    = NFNETLINK_V0;
+	nfmsg->res_id	    = 0;
+
+	if (ctnetlink_exp_dump_expect(skb, exp) < 0)
+		goto nfattr_failure;
+
+	nlh->nlmsg_len = skb->tail - b;
+	return skb->len;
+
+nlmsg_failure:
+nfattr_failure:
+	skb_trim(skb, b - skb->data);
+	return -1;
+}
+
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+static int ctnetlink_expect_event(struct notifier_block *this,
+				  unsigned long events, void *ptr)
+{
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *nfmsg;
+	struct ip_conntrack_expect *exp = (struct ip_conntrack_expect *)ptr;
+	struct sk_buff *skb;
+	unsigned int type;
+	unsigned char *b;
+	int flags = 0;
+	u16 proto;
+
+	if (events & IPEXP_NEW) {
+		type = IPCTNL_MSG_EXP_NEW;
+		flags = NLM_F_CREATE|NLM_F_EXCL;
+	} else
+		return NOTIFY_DONE;
+
+	skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
+	if (!skb)
+		return NOTIFY_DONE;
+
+	b = skb->tail;
+
+	type |= NFNL_SUBSYS_CTNETLINK << 8;
+	nlh   = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg));
+	nfmsg = NLMSG_DATA(nlh);
+
+	nlh->nlmsg_flags    = flags;
+	nfmsg->nfgen_family = AF_INET;
+	nfmsg->version	    = NFNETLINK_V0;
+	nfmsg->res_id	    = 0;
+
+	if (ctnetlink_exp_dump_expect(skb, exp) < 0)
+		goto nfattr_failure;
+
+	nlh->nlmsg_len = skb->tail - b;
+	proto = exp->tuple.dst.protonum;
+	nfnetlink_send(skb, 0, NFNLGRP_CONNTRACK_EXP_NEW, 0);
+	return NOTIFY_DONE;
+
+nlmsg_failure:
+nfattr_failure:
+	kfree_skb(skb);
+	return NOTIFY_DONE;
+}
+#endif
+
+static int
+ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct ip_conntrack_expect *exp = NULL;
+	struct list_head *i;
+	u_int32_t *id = (u_int32_t *) &cb->args[0];
+
+	DEBUGP("entered %s, last id=%llu\n", __FUNCTION__, *id);
+
+	read_lock_bh(&ip_conntrack_lock);
+	list_for_each_prev(i, &ip_conntrack_expect_list) {
+		exp = (struct ip_conntrack_expect *) i;
+		if (exp->id <= *id)
+			continue;
+		if (ctnetlink_exp_fill_info(skb, NETLINK_CB(cb->skb).pid,
+					    cb->nlh->nlmsg_seq,
+					    IPCTNL_MSG_EXP_NEW,
+					    1, exp) < 0)
+			goto out;
+		*id = exp->id;
+	}
+out:	
+	read_unlock_bh(&ip_conntrack_lock);
+
+	DEBUGP("leaving, last id=%llu\n", *id);
+
+	return skb->len;
+}
+
+static int
+ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, 
+		     struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+{
+	struct ip_conntrack_tuple tuple;
+	struct ip_conntrack_expect *exp;
+	struct sk_buff *skb2;
+	int err = 0;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+		struct nfgenmsg *msg = NLMSG_DATA(nlh);
+		u32 rlen;
+
+		if (msg->nfgen_family != AF_INET)
+			return -EAFNOSUPPORT;
+
+		if ((*errp = netlink_dump_start(ctnl, skb, nlh,
+		    				ctnetlink_exp_dump_table,
+						ctnetlink_done)) != 0)
+			return -EINVAL;
+		rlen = NLMSG_ALIGN(nlh->nlmsg_len);
+		if (rlen > skb->len)
+			rlen = skb->len;
+		skb_pull(skb, rlen);
+		return 0;
+	}
+
+	if (cda[CTA_EXPECT_MASTER-1])
+		err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER);
+	else
+		return -EINVAL;
+
+	if (err < 0)
+		return err;
+
+	exp = ip_conntrack_expect_find_get(&tuple);
+	if (!exp)
+		return -ENOENT;
+
+	err = -ENOMEM;
+	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb2)
+		goto out;
+	NETLINK_CB(skb2).dst_pid = NETLINK_CB(skb).pid;
+	
+	err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).pid, 
+				      nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW,
+				      1, exp);
+	if (err <= 0)
+		goto out;
+
+	ip_conntrack_expect_put(exp);
+
+	err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
+	if (err < 0)
+		goto free;
+
+	return err;
+
+out:
+	ip_conntrack_expect_put(exp);
+free:
+	if (skb2)
+		kfree_skb(skb2);
+	return err;
+}
+
+static int
+ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, 
+		     struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+{
+	struct ip_conntrack_expect *exp, *tmp;
+	struct ip_conntrack_tuple tuple;
+	struct ip_conntrack_helper *h;
+	int err;
+
+	if (cda[CTA_EXPECT_TUPLE-1]) {
+		/* delete a single expect by tuple */
+		err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE);
+		if (err < 0)
+			return err;
+
+		/* bump usage count to 2 */
+		exp = ip_conntrack_expect_find_get(&tuple);
+		if (!exp)
+			return -ENOENT;
+
+		if (cda[CTA_EXPECT_ID-1]) {
+			u_int32_t id = 
+				*(u_int32_t *)NFA_DATA(cda[CTA_EXPECT_ID-1]);
+			if (exp->id != ntohl(id)) {
+				ip_conntrack_expect_put(exp);
+				return -ENOENT;
+			}
+		}
+
+		/* after list removal, usage count == 1 */
+		ip_conntrack_unexpect_related(exp);
+		/* have to put what we 'get' above. 
+		 * after this line usage count == 0 */
+		ip_conntrack_expect_put(exp);
+	} else if (cda[CTA_EXPECT_HELP_NAME-1]) {
+		char *name = NFA_DATA(cda[CTA_EXPECT_HELP_NAME-1]);
+
+		/* delete all expectations for this helper */
+		write_lock_bh(&ip_conntrack_lock);
+		h = __ip_conntrack_helper_find_byname(name);
+		if (!h) {
+			write_unlock_bh(&ip_conntrack_lock);
+			return -EINVAL;
+		}
+		list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list,
+					 list) {
+			if (exp->master->helper == h 
+			    && del_timer(&exp->timeout))
+				__ip_ct_expect_unlink_destroy(exp);
+		}
+		write_unlock(&ip_conntrack_lock);
+	} else {
+		/* This basically means we have to flush everything*/
+		write_lock_bh(&ip_conntrack_lock);
+		list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list,
+					 list) {
+			if (del_timer(&exp->timeout))
+				__ip_ct_expect_unlink_destroy(exp);
+		}
+		write_unlock_bh(&ip_conntrack_lock);
+	}
+
+	return 0;
+}
+static int
+ctnetlink_change_expect(struct ip_conntrack_expect *x, struct nfattr *cda[])
+{
+	return -EOPNOTSUPP;
+}
+
+static int
+ctnetlink_create_expect(struct nfattr *cda[])
+{
+	struct ip_conntrack_tuple tuple, mask, master_tuple;
+	struct ip_conntrack_tuple_hash *h = NULL;
+	struct ip_conntrack_expect *exp;
+	struct ip_conntrack *ct;
+	int err = 0;
+
+	DEBUGP("entered %s\n", __FUNCTION__);
+
+	/* caller guarantees that those three CTA_EXPECT_* exist */
+	err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE);
+	if (err < 0)
+		return err;
+	err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK);
+	if (err < 0)
+		return err;
+	err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER);
+	if (err < 0)
+		return err;
+
+	/* Look for master conntrack of this expectation */
+	h = ip_conntrack_find_get(&master_tuple, NULL);
+	if (!h)
+		return -ENOENT;
+	ct = tuplehash_to_ctrack(h);
+
+	if (!ct->helper) {
+		/* such conntrack hasn't got any helper, abort */
+		err = -EINVAL;
+		goto out;
+	}
+
+	exp = ip_conntrack_expect_alloc(ct);
+	if (!exp) {
+		err = -ENOMEM;
+		goto out;
+	}
+	
+	exp->expectfn = NULL;
+	exp->master = ct;
+	memcpy(&exp->tuple, &tuple, sizeof(struct ip_conntrack_tuple));
+	memcpy(&exp->mask, &mask, sizeof(struct ip_conntrack_tuple));
+
+	err = ip_conntrack_expect_related(exp);
+	ip_conntrack_expect_put(exp);
+
+out:	
+	ip_conntrack_put(tuplehash_to_ctrack(h));
+	return err;
+}
+
+static int
+ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb,
+		     struct nlmsghdr *nlh, struct nfattr *cda[], int *errp)
+{
+	struct ip_conntrack_tuple tuple;
+	struct ip_conntrack_expect *exp;
+	int err = 0;
+
+	DEBUGP("entered %s\n", __FUNCTION__);	
+
+	if (!cda[CTA_EXPECT_TUPLE-1]
+	    || !cda[CTA_EXPECT_MASK-1]
+	    || !cda[CTA_EXPECT_MASTER-1])
+		return -EINVAL;
+
+	err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE);
+	if (err < 0)
+		return err;
+
+	write_lock_bh(&ip_conntrack_lock);
+	exp = __ip_conntrack_expect_find(&tuple);
+
+	if (!exp) {
+		write_unlock_bh(&ip_conntrack_lock);
+		err = -ENOENT;
+		if (nlh->nlmsg_flags & NLM_F_CREATE)
+			err = ctnetlink_create_expect(cda);
+		return err;
+	}
+
+	err = -EEXIST;
+	if (!(nlh->nlmsg_flags & NLM_F_EXCL))
+		err = ctnetlink_change_expect(exp, cda);
+	write_unlock_bh(&ip_conntrack_lock);
+
+	DEBUGP("leaving\n");
+	
+	return err;
+}
+
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+static struct notifier_block ctnl_notifier = {
+	.notifier_call	= ctnetlink_conntrack_event,
+};
+
+static struct notifier_block ctnl_notifier_exp = {
+	.notifier_call	= ctnetlink_expect_event,
+};
+#endif
+
+static struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = {
+	[IPCTNL_MSG_CT_NEW]		= { .call = ctnetlink_new_conntrack,
+					    .attr_count = CTA_MAX,
+					    .cap_required = CAP_NET_ADMIN },
+	[IPCTNL_MSG_CT_GET] 		= { .call = ctnetlink_get_conntrack,
+					    .attr_count = CTA_MAX,
+					    .cap_required = CAP_NET_ADMIN },
+	[IPCTNL_MSG_CT_DELETE]  	= { .call = ctnetlink_del_conntrack,
+					    .attr_count = CTA_MAX,
+					    .cap_required = CAP_NET_ADMIN },
+	[IPCTNL_MSG_CT_GET_CTRZERO] 	= { .call = ctnetlink_get_conntrack,
+					    .attr_count = CTA_MAX,
+					    .cap_required = CAP_NET_ADMIN },
+};
+
+static struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = {
+	[IPCTNL_MSG_EXP_GET]		= { .call = ctnetlink_get_expect,
+					    .attr_count = CTA_EXPECT_MAX,
+					    .cap_required = CAP_NET_ADMIN },
+	[IPCTNL_MSG_EXP_NEW]		= { .call = ctnetlink_new_expect,
+					    .attr_count = CTA_EXPECT_MAX,
+					    .cap_required = CAP_NET_ADMIN },
+	[IPCTNL_MSG_EXP_DELETE]		= { .call = ctnetlink_del_expect,
+					    .attr_count = CTA_EXPECT_MAX,
+					    .cap_required = CAP_NET_ADMIN },
+};
+
+static struct nfnetlink_subsystem ctnl_subsys = {
+	.name				= "conntrack",
+	.subsys_id			= NFNL_SUBSYS_CTNETLINK,
+	.cb_count			= IPCTNL_MSG_MAX,
+	.cb				= ctnl_cb,
+};
+
+static struct nfnetlink_subsystem ctnl_exp_subsys = {
+	.name				= "conntrack_expect",
+	.subsys_id			= NFNL_SUBSYS_CTNETLINK_EXP,
+	.cb_count			= IPCTNL_MSG_EXP_MAX,
+	.cb				= ctnl_exp_cb,
+};
+
+static int __init ctnetlink_init(void)
+{
+	int ret;
+
+	printk("ctnetlink v%s: registering with nfnetlink.\n", version);
+	ret = nfnetlink_subsys_register(&ctnl_subsys);
+	if (ret < 0) {
+		printk("ctnetlink_init: cannot register with nfnetlink.\n");
+		goto err_out;
+	}
+
+	ret = nfnetlink_subsys_register(&ctnl_exp_subsys);
+	if (ret < 0) {
+		printk("ctnetlink_init: cannot register exp with nfnetlink.\n");
+		goto err_unreg_subsys;
+	}
+
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+	ret = ip_conntrack_register_notifier(&ctnl_notifier);
+	if (ret < 0) {
+		printk("ctnetlink_init: cannot register notifier.\n");
+		goto err_unreg_exp_subsys;
+	}
+
+	ret = ip_conntrack_expect_register_notifier(&ctnl_notifier_exp);
+	if (ret < 0) {
+		printk("ctnetlink_init: cannot expect register notifier.\n");
+		goto err_unreg_notifier;
+	}
+#endif
+
+	return 0;
+
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+err_unreg_notifier:
+	ip_conntrack_unregister_notifier(&ctnl_notifier);
+err_unreg_exp_subsys:
+	nfnetlink_subsys_unregister(&ctnl_exp_subsys);
+#endif
+err_unreg_subsys:
+	nfnetlink_subsys_unregister(&ctnl_subsys);
+err_out:
+	return ret;
+}
+
+static void __exit ctnetlink_exit(void)
+{
+	printk("ctnetlink: unregistering from nfnetlink.\n");
+
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+	ip_conntrack_unregister_notifier(&ctnl_notifier_exp);
+	ip_conntrack_unregister_notifier(&ctnl_notifier);
+#endif
+
+	nfnetlink_subsys_unregister(&ctnl_exp_subsys);
+	nfnetlink_subsys_unregister(&ctnl_subsys);
+	return;
+}
+
+module_init(ctnetlink_init);
+module_exit(ctnetlink_exit);
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
index 602c74db325..838d1d69b36 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
@@ -102,22 +102,24 @@ static int icmp_packet(struct ip_conntrack *ct,
 			ct->timeout.function((unsigned long)ct);
 	} else {
 		atomic_inc(&ct->proto.icmp.count);
+		ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb);
 		ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout);
 	}
 
 	return NF_ACCEPT;
 }
 
+static u_int8_t valid_new[] = { 
+	[ICMP_ECHO] = 1,
+	[ICMP_TIMESTAMP] = 1,
+	[ICMP_INFO_REQUEST] = 1,
+	[ICMP_ADDRESS] = 1 
+};
+
 /* Called when a new connection for this protocol found. */
 static int icmp_new(struct ip_conntrack *conntrack,
 		    const struct sk_buff *skb)
 {
-	static u_int8_t valid_new[]
-		= { [ICMP_ECHO] = 1,
-		    [ICMP_TIMESTAMP] = 1,
-		    [ICMP_INFO_REQUEST] = 1,
-		    [ICMP_ADDRESS] = 1 };
-
 	if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new)
 	    || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) {
 		/* Can't create a new ICMP `conn' with this. */
@@ -158,11 +160,12 @@ icmp_error_message(struct sk_buff *skb,
 		return NF_ACCEPT;
 	}
 
-	innerproto = ip_ct_find_proto(inside->ip.protocol);
+	innerproto = ip_conntrack_proto_find_get(inside->ip.protocol);
 	dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp) + inside->ip.ihl*4;
 	/* Are they talking about one of our connections? */
 	if (!ip_ct_get_tuple(&inside->ip, skb, dataoff, &origtuple, innerproto)) {
 		DEBUGP("icmp_error: ! get_tuple p=%u", inside->ip.protocol);
+		ip_conntrack_proto_put(innerproto);
 		return NF_ACCEPT;
 	}
 
@@ -170,8 +173,10 @@ icmp_error_message(struct sk_buff *skb,
 	   been preserved inside the ICMP. */
 	if (!ip_ct_invert_tuple(&innertuple, &origtuple, innerproto)) {
 		DEBUGP("icmp_error_track: Can't invert tuple\n");
+		ip_conntrack_proto_put(innerproto);
 		return NF_ACCEPT;
 	}
+	ip_conntrack_proto_put(innerproto);
 
 	*ctinfo = IP_CT_RELATED;
 
@@ -212,7 +217,7 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
 	icmph = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_ih), &_ih);
 	if (icmph == NULL) {
 		if (LOG_INVALID(IPPROTO_ICMP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 				      "ip_ct_icmp: short packet ");
 		return -NF_ACCEPT;
 	}
@@ -226,13 +231,13 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
 		if (!(u16)csum_fold(skb->csum)) 
 			break;
 		if (LOG_INVALID(IPPROTO_ICMP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 				      "ip_ct_icmp: bad HW ICMP checksum ");
 		return -NF_ACCEPT;
 	case CHECKSUM_NONE:
 		if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) {
 			if (LOG_INVALID(IPPROTO_ICMP))
-				nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
+				nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 					      "ip_ct_icmp: bad ICMP checksum ");
 			return -NF_ACCEPT;
 		}
@@ -249,7 +254,7 @@ checksum_skipped:
 	 */
 	if (icmph->type > NR_ICMP_TYPES) {
 		if (LOG_INVALID(IPPROTO_ICMP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 				      "ip_ct_icmp: invalid ICMP type ");
 		return -NF_ACCEPT;
 	}
@@ -265,6 +270,47 @@ checksum_skipped:
 	return icmp_error_message(skb, ctinfo, hooknum);
 }
 
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+static int icmp_tuple_to_nfattr(struct sk_buff *skb,
+				const struct ip_conntrack_tuple *t)
+{
+	NFA_PUT(skb, CTA_PROTO_ICMP_ID, sizeof(u_int16_t),
+		&t->src.u.icmp.id);
+	NFA_PUT(skb, CTA_PROTO_ICMP_TYPE, sizeof(u_int8_t),
+		&t->dst.u.icmp.type);
+	NFA_PUT(skb, CTA_PROTO_ICMP_CODE, sizeof(u_int8_t),
+		&t->dst.u.icmp.code);
+
+	if (t->dst.u.icmp.type >= sizeof(valid_new) 
+	    || !valid_new[t->dst.u.icmp.type])
+		return -EINVAL;
+
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+
+static int icmp_nfattr_to_tuple(struct nfattr *tb[],
+				struct ip_conntrack_tuple *tuple)
+{
+	if (!tb[CTA_PROTO_ICMP_TYPE-1]
+	    || !tb[CTA_PROTO_ICMP_CODE-1]
+	    || !tb[CTA_PROTO_ICMP_ID-1])
+		return -1;
+
+	tuple->dst.u.icmp.type = 
+			*(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_TYPE-1]);
+	tuple->dst.u.icmp.code =
+			*(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_CODE-1]);
+	tuple->src.u.icmp.id =
+			*(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_ID-1]);
+
+	return 0;
+}
+#endif
+
 struct ip_conntrack_protocol ip_conntrack_protocol_icmp =
 {
 	.proto 			= IPPROTO_ICMP,
@@ -276,4 +322,9 @@ struct ip_conntrack_protocol ip_conntrack_protocol_icmp =
 	.packet			= icmp_packet,
 	.new			= icmp_new,
 	.error			= icmp_error,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+	.tuple_to_nfattr	= icmp_tuple_to_nfattr,
+	.nfattr_to_tuple	= icmp_nfattr_to_tuple,
+#endif
 };
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
index 31d75390bf1..a875f35e576 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
@@ -404,6 +404,8 @@ static int sctp_packet(struct ip_conntrack *conntrack,
 		}
 
 		conntrack->proto.sctp.state = newconntrack;
+		if (oldsctpstate != newconntrack)
+			ip_conntrack_event_cache(IPCT_PROTOINFO, skb);
 		write_unlock_bh(&sctp_lock);
 	}
 
@@ -503,7 +505,12 @@ static struct ip_conntrack_protocol ip_conntrack_protocol_sctp = {
 	.packet 	 = sctp_packet, 
 	.new 		 = sctp_new, 
 	.destroy 	 = NULL, 
-	.me 		 = THIS_MODULE 
+	.me 		 = THIS_MODULE,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+	.tuple_to_nfattr = ip_ct_port_tuple_to_nfattr,
+	.nfattr_to_tuple = ip_ct_port_nfattr_to_tuple,
+#endif
 };
 
 #ifdef CONFIG_SYSCTL
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
index 809dfed766d..f23ef1f88c4 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
@@ -336,6 +336,23 @@ static int tcp_print_conntrack(struct seq_file *s,
 	return seq_printf(s, "%s ", tcp_conntrack_names[state]);
 }
 
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+static int tcp_to_nfattr(struct sk_buff *skb, struct nfattr *nfa,
+			 const struct ip_conntrack *ct)
+{
+	read_lock_bh(&tcp_lock);
+	NFA_PUT(skb, CTA_PROTOINFO_TCP_STATE, sizeof(u_int8_t),
+		&ct->proto.tcp.state);
+	read_unlock_bh(&tcp_lock);
+
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+#endif
+
 static unsigned int get_conntrack_index(const struct tcphdr *tcph)
 {
 	if (tcph->rst) return TCP_RST_SET;
@@ -699,7 +716,7 @@ static int tcp_in_window(struct ip_ct_tcp *state,
 		res = 1;
 	} else {
 		if (LOG_INVALID(IPPROTO_TCP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 			"ip_ct_tcp: %s ",
 			before(seq, sender->td_maxend + 1) ?
 			after(end, sender->td_end - receiver->td_maxwin - 1) ?
@@ -798,7 +815,7 @@ static int tcp_error(struct sk_buff *skb,
 				sizeof(_tcph), &_tcph);
 	if (th == NULL) {
 		if (LOG_INVALID(IPPROTO_TCP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 				"ip_ct_tcp: short packet ");
 		return -NF_ACCEPT;
   	}
@@ -806,7 +823,7 @@ static int tcp_error(struct sk_buff *skb,
 	/* Not whole TCP header or malformed packet */
 	if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {
 		if (LOG_INVALID(IPPROTO_TCP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 				"ip_ct_tcp: truncated/malformed packet ");
 		return -NF_ACCEPT;
 	}
@@ -823,7 +840,7 @@ static int tcp_error(struct sk_buff *skb,
 			         skb->ip_summed == CHECKSUM_HW ? skb->csum
 			      	 : skb_checksum(skb, iph->ihl*4, tcplen, 0))) {
 		if (LOG_INVALID(IPPROTO_TCP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 				  "ip_ct_tcp: bad TCP checksum ");
 		return -NF_ACCEPT;
 	}
@@ -832,7 +849,7 @@ static int tcp_error(struct sk_buff *skb,
 	tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR));
 	if (!tcp_valid_flags[tcpflags]) {
 		if (LOG_INVALID(IPPROTO_TCP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 				  "ip_ct_tcp: invalid TCP flag combination ");
 		return -NF_ACCEPT;
 	}
@@ -880,8 +897,9 @@ static int tcp_packet(struct ip_conntrack *conntrack,
 			 */
 		    	write_unlock_bh(&tcp_lock);
 			if (LOG_INVALID(IPPROTO_TCP))
-				nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
-					  "ip_ct_tcp: killing out of sync session ");
+				nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+					      NULL, "ip_ct_tcp: "
+					      "killing out of sync session ");
 		    	if (del_timer(&conntrack->timeout))
 		    		conntrack->timeout.function((unsigned long)
 		    					    conntrack);
@@ -895,7 +913,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
 		
 		write_unlock_bh(&tcp_lock);
 		if (LOG_INVALID(IPPROTO_TCP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 				  "ip_ct_tcp: invalid packet ignored ");
 		return NF_ACCEPT;
 	case TCP_CONNTRACK_MAX:
@@ -905,7 +923,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
 		       old_state);
 		write_unlock_bh(&tcp_lock);
 		if (LOG_INVALID(IPPROTO_TCP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 				  "ip_ct_tcp: invalid state ");
 		return -NF_ACCEPT;
 	case TCP_CONNTRACK_SYN_SENT:
@@ -926,7 +944,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
 			write_unlock_bh(&tcp_lock);
 			if (LOG_INVALID(IPPROTO_TCP))
 				nf_log_packet(PF_INET, 0, skb, NULL, NULL,
-				              "ip_ct_tcp: invalid SYN");
+					      NULL, "ip_ct_tcp: invalid SYN");
 			return -NF_ACCEPT;
 		}
 	case TCP_CONNTRACK_CLOSE:
@@ -973,6 +991,10 @@ static int tcp_packet(struct ip_conntrack *conntrack,
 		  ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state];
 	write_unlock_bh(&tcp_lock);
 
+	ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb);
+	if (new_state != old_state)
+		ip_conntrack_event_cache(IPCT_PROTOINFO, skb);
+
 	if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) {
 		/* If only reply is a RST, we can consider ourselves not to
 		   have an established connection: this is a fairly common
@@ -1096,4 +1118,10 @@ struct ip_conntrack_protocol ip_conntrack_protocol_tcp =
 	.packet 		= tcp_packet,
 	.new 			= tcp_new,
 	.error			= tcp_error,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+	.to_nfattr		= tcp_to_nfattr,
+	.tuple_to_nfattr	= ip_ct_port_tuple_to_nfattr,
+	.nfattr_to_tuple	= ip_ct_port_nfattr_to_tuple,
+#endif
 };
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
index 8c1eaba098d..f2dcac7c766 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
@@ -73,7 +73,8 @@ static int udp_packet(struct ip_conntrack *conntrack,
 		ip_ct_refresh_acct(conntrack, ctinfo, skb, 
 				   ip_ct_udp_timeout_stream);
 		/* Also, more likely to be important, and not a probe */
-		set_bit(IPS_ASSURED_BIT, &conntrack->status);
+		if (!test_and_set_bit(IPS_ASSURED_BIT, &conntrack->status))
+			ip_conntrack_event_cache(IPCT_STATUS, skb);
 	} else
 		ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout);
 
@@ -97,7 +98,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
 	hdr = skb_header_pointer(skb, iph->ihl*4, sizeof(_hdr), &_hdr);
 	if (hdr == NULL) {
 		if (LOG_INVALID(IPPROTO_UDP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 				  "ip_ct_udp: short packet ");
 		return -NF_ACCEPT;
 	}
@@ -105,7 +106,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
 	/* Truncated/malformed packets */
 	if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) {
 		if (LOG_INVALID(IPPROTO_UDP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 				  "ip_ct_udp: truncated/malformed packet ");
 		return -NF_ACCEPT;
 	}
@@ -125,7 +126,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
 			         skb->ip_summed == CHECKSUM_HW ? skb->csum
 			      	 : skb_checksum(skb, iph->ihl*4, udplen, 0))) {
 		if (LOG_INVALID(IPPROTO_UDP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 				  "ip_ct_udp: bad UDP checksum ");
 		return -NF_ACCEPT;
 	}
@@ -144,4 +145,9 @@ struct ip_conntrack_protocol ip_conntrack_protocol_udp =
 	.packet			= udp_packet,
 	.new			= udp_new,
 	.error			= udp_error,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+	.tuple_to_nfattr	= ip_ct_port_tuple_to_nfattr,
+	.nfattr_to_tuple	= ip_ct_port_nfattr_to_tuple,
+#endif
 };
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index 61798c46e91..ee5895afd0c 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -5,7 +5,7 @@
 */
 
 /* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2002-2005 Netfilter Core Team <coreteam@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -147,8 +147,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
 	if (DIRECTION(hash))
 		return 0;
 
-	proto = ip_ct_find_proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
-			       .tuple.dst.protonum);
+	proto = __ip_conntrack_proto_find(conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum);
 	IP_NF_ASSERT(proto);
 
 	if (seq_printf(s, "%-8s %u %ld ",
@@ -185,7 +184,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
 			return -ENOSPC;
 
 #if defined(CONFIG_IP_NF_CONNTRACK_MARK)
-	if (seq_printf(s, "mark=%lu ", conntrack->mark))
+	if (seq_printf(s, "mark=%u ", conntrack->mark))
 		return -ENOSPC;
 #endif
 
@@ -283,7 +282,7 @@ static int exp_seq_show(struct seq_file *s, void *v)
 	seq_printf(s, "proto=%u ", expect->tuple.dst.protonum);
 
 	print_tuple(s, &expect->tuple,
-		    ip_ct_find_proto(expect->tuple.dst.protonum));
+		    __ip_conntrack_proto_find(expect->tuple.dst.protonum));
 	return seq_putc(s, '\n');
 }
 
@@ -889,6 +888,7 @@ static int init_or_cleanup(int init)
 	return ret;
 
  cleanup:
+	synchronize_net();
 #ifdef CONFIG_SYSCTL
  	unregister_sysctl_table(ip_ct_sysctl_header);
  cleanup_localinops:
@@ -971,6 +971,14 @@ void need_ip_conntrack(void)
 {
 }
 
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+EXPORT_SYMBOL_GPL(ip_conntrack_chain);
+EXPORT_SYMBOL_GPL(ip_conntrack_expect_chain);
+EXPORT_SYMBOL_GPL(ip_conntrack_register_notifier);
+EXPORT_SYMBOL_GPL(ip_conntrack_unregister_notifier);
+EXPORT_SYMBOL_GPL(__ip_ct_event_cache_init);
+EXPORT_PER_CPU_SYMBOL_GPL(ip_conntrack_ecache);
+#endif
 EXPORT_SYMBOL(ip_conntrack_protocol_register);
 EXPORT_SYMBOL(ip_conntrack_protocol_unregister);
 EXPORT_SYMBOL(ip_ct_get_tuple);
@@ -982,12 +990,16 @@ EXPORT_SYMBOL(ip_conntrack_helper_register);
 EXPORT_SYMBOL(ip_conntrack_helper_unregister);
 EXPORT_SYMBOL(ip_ct_iterate_cleanup);
 EXPORT_SYMBOL(ip_ct_refresh_acct);
-EXPORT_SYMBOL(ip_ct_protos);
-EXPORT_SYMBOL(ip_ct_find_proto);
+
 EXPORT_SYMBOL(ip_conntrack_expect_alloc);
 EXPORT_SYMBOL(ip_conntrack_expect_put);
+EXPORT_SYMBOL_GPL(ip_conntrack_expect_find_get);
 EXPORT_SYMBOL(ip_conntrack_expect_related);
 EXPORT_SYMBOL(ip_conntrack_unexpect_related);
+EXPORT_SYMBOL_GPL(ip_conntrack_expect_list);
+EXPORT_SYMBOL_GPL(__ip_conntrack_expect_find);
+EXPORT_SYMBOL_GPL(__ip_ct_expect_unlink_destroy);
+
 EXPORT_SYMBOL(ip_conntrack_tuple_taken);
 EXPORT_SYMBOL(ip_ct_gather_frags);
 EXPORT_SYMBOL(ip_conntrack_htable_size);
@@ -995,7 +1007,28 @@ EXPORT_SYMBOL(ip_conntrack_lock);
 EXPORT_SYMBOL(ip_conntrack_hash);
 EXPORT_SYMBOL(ip_conntrack_untracked);
 EXPORT_SYMBOL_GPL(ip_conntrack_find_get);
-EXPORT_SYMBOL_GPL(ip_conntrack_put);
 #ifdef CONFIG_IP_NF_NAT_NEEDED
 EXPORT_SYMBOL(ip_conntrack_tcp_update);
 #endif
+
+EXPORT_SYMBOL_GPL(ip_conntrack_flush);
+EXPORT_SYMBOL_GPL(__ip_conntrack_find);
+
+EXPORT_SYMBOL_GPL(ip_conntrack_alloc);
+EXPORT_SYMBOL_GPL(ip_conntrack_free);
+EXPORT_SYMBOL_GPL(ip_conntrack_hash_insert);
+
+EXPORT_SYMBOL_GPL(ip_ct_remove_expectations);
+
+EXPORT_SYMBOL_GPL(ip_conntrack_helper_find_get);
+EXPORT_SYMBOL_GPL(ip_conntrack_helper_put);
+EXPORT_SYMBOL_GPL(__ip_conntrack_helper_find_byname);
+
+EXPORT_SYMBOL_GPL(ip_conntrack_proto_find_get);
+EXPORT_SYMBOL_GPL(ip_conntrack_proto_put);
+EXPORT_SYMBOL_GPL(__ip_conntrack_proto_find);
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+EXPORT_SYMBOL_GPL(ip_ct_port_tuple_to_nfattr);
+EXPORT_SYMBOL_GPL(ip_ct_port_nfattr_to_tuple);
+#endif
diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c
index 739b6dde1c8..1adedb743f6 100644
--- a/net/ipv4/netfilter/ip_nat_core.c
+++ b/net/ipv4/netfilter/ip_nat_core.c
@@ -47,8 +47,39 @@ DEFINE_RWLOCK(ip_nat_lock);
 static unsigned int ip_nat_htable_size;
 
 static struct list_head *bysource;
+
+#define MAX_IP_NAT_PROTO 256
 struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO];
 
+static inline struct ip_nat_protocol *
+__ip_nat_proto_find(u_int8_t protonum)
+{
+	return ip_nat_protos[protonum];
+}
+
+struct ip_nat_protocol *
+ip_nat_proto_find_get(u_int8_t protonum)
+{
+	struct ip_nat_protocol *p;
+
+	/* we need to disable preemption to make sure 'p' doesn't get
+	 * removed until we've grabbed the reference */
+	preempt_disable();
+	p = __ip_nat_proto_find(protonum);
+	if (p) {
+		if (!try_module_get(p->me))
+			p = &ip_nat_unknown_protocol;
+	}
+	preempt_enable();
+
+	return p;
+}
+
+void
+ip_nat_proto_put(struct ip_nat_protocol *p)
+{
+	module_put(p->me);
+}
 
 /* We keep an extra hash for each conntrack, for fast searching. */
 static inline unsigned int
@@ -103,7 +134,8 @@ static int
 in_range(const struct ip_conntrack_tuple *tuple,
 	 const struct ip_nat_range *range)
 {
-	struct ip_nat_protocol *proto = ip_nat_find_proto(tuple->dst.protonum);
+	struct ip_nat_protocol *proto = 
+				__ip_nat_proto_find(tuple->dst.protonum);
 
 	/* If we are supposed to map IPs, then we must be in the
 	   range specified, otherwise let this drag us onto a new src IP. */
@@ -216,8 +248,7 @@ get_unique_tuple(struct ip_conntrack_tuple *tuple,
 		 struct ip_conntrack *conntrack,
 		 enum ip_nat_manip_type maniptype)
 {
-	struct ip_nat_protocol *proto
-		= ip_nat_find_proto(orig_tuple->dst.protonum);
+	struct ip_nat_protocol *proto;
 
 	/* 1) If this srcip/proto/src-proto-part is currently mapped,
 	   and that same mapping gives a unique tuple within the given
@@ -242,14 +273,20 @@ get_unique_tuple(struct ip_conntrack_tuple *tuple,
 	/* 3) The per-protocol part of the manip is made to map into
 	   the range to make a unique tuple. */
 
+	proto = ip_nat_proto_find_get(orig_tuple->dst.protonum);
+
 	/* Only bother mapping if it's not already in range and unique */
 	if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
 	     || proto->in_range(tuple, maniptype, &range->min, &range->max))
-	    && !ip_nat_used_tuple(tuple, conntrack))
+	    && !ip_nat_used_tuple(tuple, conntrack)) {
+		ip_nat_proto_put(proto);
 		return;
+	}
 
 	/* Last change: get protocol to try to obtain unique tuple. */
 	proto->unique_tuple(tuple, range, maniptype, conntrack);
+
+	ip_nat_proto_put(proto);
 }
 
 unsigned int
@@ -320,17 +357,20 @@ manip_pkt(u_int16_t proto,
 	  enum ip_nat_manip_type maniptype)
 {
 	struct iphdr *iph;
+	struct ip_nat_protocol *p;
 
-	(*pskb)->nfcache |= NFC_ALTERED;
-	if (!skb_ip_make_writable(pskb, iphdroff + sizeof(*iph)))
+	if (!skb_make_writable(pskb, iphdroff + sizeof(*iph)))
 		return 0;
 
 	iph = (void *)(*pskb)->data + iphdroff;
 
 	/* Manipulate protcol part. */
-	if (!ip_nat_find_proto(proto)->manip_pkt(pskb, iphdroff,
-	                                         target, maniptype))
+	p = ip_nat_proto_find_get(proto);
+	if (!p->manip_pkt(pskb, iphdroff, target, maniptype)) {
+		ip_nat_proto_put(p);
 		return 0;
+	}
+	ip_nat_proto_put(p);
 
 	iph = (void *)(*pskb)->data + iphdroff;
 
@@ -391,7 +431,7 @@ int icmp_reply_translation(struct sk_buff **pskb,
 	struct ip_conntrack_tuple inner, target;
 	int hdrlen = (*pskb)->nh.iph->ihl * 4;
 
-	if (!skb_ip_make_writable(pskb, hdrlen + sizeof(*inside)))
+	if (!skb_make_writable(pskb, hdrlen + sizeof(*inside)))
 		return 0;
 
 	inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
@@ -426,7 +466,8 @@ int icmp_reply_translation(struct sk_buff **pskb,
 
 	if (!ip_ct_get_tuple(&inside->ip, *pskb, (*pskb)->nh.iph->ihl*4 +
 	                     sizeof(struct icmphdr) + inside->ip.ihl*4,
-	                     &inner, ip_ct_find_proto(inside->ip.protocol)))
+	                     &inner,
+			     __ip_conntrack_proto_find(inside->ip.protocol)))
 		return 0;
 
 	/* Change inner back to look like incoming packet.  We do the
@@ -496,6 +537,49 @@ void ip_nat_protocol_unregister(struct ip_nat_protocol *proto)
 	synchronize_net();
 }
 
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+int
+ip_nat_port_range_to_nfattr(struct sk_buff *skb, 
+			    const struct ip_nat_range *range)
+{
+	NFA_PUT(skb, CTA_PROTONAT_PORT_MIN, sizeof(u_int16_t),
+		&range->min.tcp.port);
+	NFA_PUT(skb, CTA_PROTONAT_PORT_MAX, sizeof(u_int16_t),
+		&range->max.tcp.port);
+
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
+
+int
+ip_nat_port_nfattr_to_range(struct nfattr *tb[], struct ip_nat_range *range)
+{
+	int ret = 0;
+	
+	/* we have to return whether we actually parsed something or not */
+
+	if (tb[CTA_PROTONAT_PORT_MIN-1]) {
+		ret = 1;
+		range->min.tcp.port = 
+			*(u_int16_t *)NFA_DATA(tb[CTA_PROTONAT_PORT_MIN-1]);
+	}
+	
+	if (!tb[CTA_PROTONAT_PORT_MAX-1]) {
+		if (ret) 
+			range->max.tcp.port = range->min.tcp.port;
+	} else {
+		ret = 1;
+		range->max.tcp.port = 
+			*(u_int16_t *)NFA_DATA(tb[CTA_PROTONAT_PORT_MAX-1]);
+	}
+
+	return ret;
+}
+#endif
+
 int __init ip_nat_init(void)
 {
 	size_t i;
diff --git a/net/ipv4/netfilter/ip_nat_helper.c b/net/ipv4/netfilter/ip_nat_helper.c
index 158f34f32c0..d2dd5d31355 100644
--- a/net/ipv4/netfilter/ip_nat_helper.c
+++ b/net/ipv4/netfilter/ip_nat_helper.c
@@ -168,7 +168,7 @@ ip_nat_mangle_tcp_packet(struct sk_buff **pskb,
 	struct tcphdr *tcph;
 	int datalen;
 
-	if (!skb_ip_make_writable(pskb, (*pskb)->len))
+	if (!skb_make_writable(pskb, (*pskb)->len))
 		return 0;
 
 	if (rep_len > match_len
@@ -228,7 +228,7 @@ ip_nat_mangle_udp_packet(struct sk_buff **pskb,
 	                       match_offset + match_len)
 		return 0;
 
-	if (!skb_ip_make_writable(pskb, (*pskb)->len))
+	if (!skb_make_writable(pskb, (*pskb)->len))
 		return 0;
 
 	if (rep_len > match_len
@@ -315,7 +315,7 @@ ip_nat_sack_adjust(struct sk_buff **pskb,
 	optoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct tcphdr);
 	optend = (*pskb)->nh.iph->ihl*4 + tcph->doff*4;
 
-	if (!skb_ip_make_writable(pskb, optend))
+	if (!skb_make_writable(pskb, optend))
 		return 0;
 
 	dir = CTINFO2DIR(ctinfo);
@@ -363,7 +363,7 @@ ip_nat_seq_adjust(struct sk_buff **pskb,
 	this_way = &ct->nat.info.seq[dir];
 	other_way = &ct->nat.info.seq[!dir];
 
-	if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph)))
+	if (!skb_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph)))
 		return 0;
 
 	tcph = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
diff --git a/net/ipv4/netfilter/ip_nat_proto_icmp.c b/net/ipv4/netfilter/ip_nat_proto_icmp.c
index 6596c9ee165..93871904399 100644
--- a/net/ipv4/netfilter/ip_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/ip_nat_proto_icmp.c
@@ -62,7 +62,7 @@ icmp_manip_pkt(struct sk_buff **pskb,
 	struct icmphdr *hdr;
 	unsigned int hdroff = iphdroff + iph->ihl*4;
 
-	if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr)))
+	if (!skb_make_writable(pskb, hdroff + sizeof(*hdr)))
 		return 0;
 
 	hdr = (struct icmphdr *)((*pskb)->data + hdroff);
@@ -106,11 +106,18 @@ icmp_print_range(char *buffer, const struct ip_nat_range *range)
 	else return 0;
 }
 
-struct ip_nat_protocol ip_nat_protocol_icmp
-= { "ICMP", IPPROTO_ICMP,
-    icmp_manip_pkt,
-    icmp_in_range,
-    icmp_unique_tuple,
-    icmp_print,
-    icmp_print_range
+struct ip_nat_protocol ip_nat_protocol_icmp = {
+	.name			= "ICMP",
+	.protonum		= IPPROTO_ICMP,
+	.me			= THIS_MODULE,
+	.manip_pkt		= icmp_manip_pkt,
+	.in_range		= icmp_in_range,
+	.unique_tuple		= icmp_unique_tuple,
+	.print			= icmp_print,
+	.print_range		= icmp_print_range,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+	.range_to_nfattr	= ip_nat_port_range_to_nfattr,
+	.nfattr_to_range	= ip_nat_port_nfattr_to_range,
+#endif
 };
diff --git a/net/ipv4/netfilter/ip_nat_proto_tcp.c b/net/ipv4/netfilter/ip_nat_proto_tcp.c
index a98e36d2b3c..1d381bf6857 100644
--- a/net/ipv4/netfilter/ip_nat_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_nat_proto_tcp.c
@@ -12,6 +12,7 @@
 #include <linux/ip.h>
 #include <linux/tcp.h>
 #include <linux/if.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
 #include <linux/netfilter_ipv4/ip_nat.h>
 #include <linux/netfilter_ipv4/ip_nat_rule.h>
 #include <linux/netfilter_ipv4/ip_nat_protocol.h>
@@ -102,7 +103,7 @@ tcp_manip_pkt(struct sk_buff **pskb,
 	if ((*pskb)->len >= hdroff + sizeof(struct tcphdr))
 		hdrsize = sizeof(struct tcphdr);
 
-	if (!skb_ip_make_writable(pskb, hdroff + hdrsize))
+	if (!skb_make_writable(pskb, hdroff + hdrsize))
 		return 0;
 
 	iph = (struct iphdr *)((*pskb)->data + iphdroff);
@@ -169,11 +170,18 @@ tcp_print_range(char *buffer, const struct ip_nat_range *range)
 	else return 0;
 }
 
-struct ip_nat_protocol ip_nat_protocol_tcp
-= { "TCP", IPPROTO_TCP,
-    tcp_manip_pkt,
-    tcp_in_range,
-    tcp_unique_tuple,
-    tcp_print,
-    tcp_print_range
+struct ip_nat_protocol ip_nat_protocol_tcp = {
+	.name			= "TCP",
+	.protonum		= IPPROTO_TCP,
+	.me			= THIS_MODULE,
+	.manip_pkt		= tcp_manip_pkt,
+	.in_range		= tcp_in_range,
+	.unique_tuple		= tcp_unique_tuple,
+	.print			= tcp_print,
+	.print_range		= tcp_print_range,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+	.range_to_nfattr	= ip_nat_port_range_to_nfattr,
+	.nfattr_to_range	= ip_nat_port_nfattr_to_range,
+#endif
 };
diff --git a/net/ipv4/netfilter/ip_nat_proto_udp.c b/net/ipv4/netfilter/ip_nat_proto_udp.c
index 9f66e562566..c4906e1aa24 100644
--- a/net/ipv4/netfilter/ip_nat_proto_udp.c
+++ b/net/ipv4/netfilter/ip_nat_proto_udp.c
@@ -94,7 +94,7 @@ udp_manip_pkt(struct sk_buff **pskb,
 	u32 oldip, newip;
 	u16 *portptr, newport;
 
-	if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr)))
+	if (!skb_make_writable(pskb, hdroff + sizeof(*hdr)))
 		return 0;
 
 	iph = (struct iphdr *)((*pskb)->data + iphdroff);
@@ -156,11 +156,18 @@ udp_print_range(char *buffer, const struct ip_nat_range *range)
 	else return 0;
 }
 
-struct ip_nat_protocol ip_nat_protocol_udp
-= { "UDP", IPPROTO_UDP,
-    udp_manip_pkt,
-    udp_in_range,
-    udp_unique_tuple,
-    udp_print,
-    udp_print_range
+struct ip_nat_protocol ip_nat_protocol_udp = {
+	.name			= "UDP",
+	.protonum		= IPPROTO_UDP,
+	.me			= THIS_MODULE,
+	.manip_pkt		= udp_manip_pkt,
+	.in_range		= udp_in_range,
+	.unique_tuple		= udp_unique_tuple,
+	.print			= udp_print,
+	.print_range		= udp_print_range,
+#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
+    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
+	.range_to_nfattr	= ip_nat_port_range_to_nfattr,
+	.nfattr_to_range	= ip_nat_port_nfattr_to_range,
+#endif
 };
diff --git a/net/ipv4/netfilter/ip_nat_proto_unknown.c b/net/ipv4/netfilter/ip_nat_proto_unknown.c
index f5525bd58d1..99bbef56f84 100644
--- a/net/ipv4/netfilter/ip_nat_proto_unknown.c
+++ b/net/ipv4/netfilter/ip_nat_proto_unknown.c
@@ -61,10 +61,11 @@ unknown_print_range(char *buffer, const struct ip_nat_range *range)
 }
 
 struct ip_nat_protocol ip_nat_unknown_protocol = {
-	"unknown", 0,
-	unknown_manip_pkt,
-	unknown_in_range,
-	unknown_unique_tuple,
-	unknown_print,
-	unknown_print_range
+	.name			= "unknown",
+	.me			= THIS_MODULE,
+	.manip_pkt		= unknown_manip_pkt,
+	.in_range		= unknown_in_range,
+	.unique_tuple		= unknown_unique_tuple,
+	.print			= unknown_print,
+	.print_range		= unknown_print_range
 };
diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c
index 2a48b6e635a..93b2c5111bb 100644
--- a/net/ipv4/netfilter/ip_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c
@@ -1275,7 +1275,7 @@ static int help(struct sk_buff **pskb,
 		 return NF_DROP;
 	}
 
-	if (!skb_ip_make_writable(pskb, (*pskb)->len))
+	if (!skb_make_writable(pskb, (*pskb)->len))
 		return NF_DROP;
 
 	spin_lock_bh(&snmp_lock);
diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c
index 91d5ea1dbbc..89db052add8 100644
--- a/net/ipv4/netfilter/ip_nat_standalone.c
+++ b/net/ipv4/netfilter/ip_nat_standalone.c
@@ -73,8 +73,6 @@ ip_nat_fn(unsigned int hooknum,
 	IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off
 		       & htons(IP_MF|IP_OFFSET)));
 
-	(*pskb)->nfcache |= NFC_UNKNOWN;
-
 	/* If we had a hardware checksum before, it's now invalid */
 	if ((*pskb)->ip_summed == CHECKSUM_HW)
 		if (skb_checksum_help(*pskb, (out == NULL)))
@@ -396,6 +394,8 @@ module_exit(fini);
 EXPORT_SYMBOL(ip_nat_setup_info);
 EXPORT_SYMBOL(ip_nat_protocol_register);
 EXPORT_SYMBOL(ip_nat_protocol_unregister);
+EXPORT_SYMBOL_GPL(ip_nat_proto_find_get);
+EXPORT_SYMBOL_GPL(ip_nat_proto_put);
 EXPORT_SYMBOL(ip_nat_cheat_check);
 EXPORT_SYMBOL(ip_nat_mangle_tcp_packet);
 EXPORT_SYMBOL(ip_nat_mangle_udp_packet);
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
index c6baa817438..d54f14d926f 100644
--- a/net/ipv4/netfilter/ip_queue.c
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -43,17 +43,10 @@
 #define NET_IPQ_QMAX 2088
 #define NET_IPQ_QMAX_NAME "ip_queue_maxlen"
 
-struct ipq_rt_info {
-	__u8 tos;
-	__u32 daddr;
-	__u32 saddr;
-};
-
 struct ipq_queue_entry {
 	struct list_head list;
 	struct nf_info *info;
 	struct sk_buff *skb;
-	struct ipq_rt_info rt_info;
 };
 
 typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long);
@@ -247,8 +240,8 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp)
 
 	pmsg->packet_id       = (unsigned long )entry;
 	pmsg->data_len        = data_len;
-	pmsg->timestamp_sec   = entry->skb->stamp.tv_sec;
-	pmsg->timestamp_usec  = entry->skb->stamp.tv_usec;
+	pmsg->timestamp_sec   = skb_tv_base.tv_sec + entry->skb->tstamp.off_sec;
+	pmsg->timestamp_usec  = skb_tv_base.tv_usec + entry->skb->tstamp.off_usec;
 	pmsg->mark            = entry->skb->nfmark;
 	pmsg->hook            = entry->info->hook;
 	pmsg->hw_protocol     = entry->skb->protocol;
@@ -287,7 +280,8 @@ nlmsg_failure:
 }
 
 static int
-ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data)
+ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info,
+		   unsigned int queuenum, void *data)
 {
 	int status = -EINVAL;
 	struct sk_buff *nskb;
@@ -305,14 +299,6 @@ ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data)
 	entry->info = info;
 	entry->skb = skb;
 
-	if (entry->info->hook == NF_IP_LOCAL_OUT) {
-		struct iphdr *iph = skb->nh.iph;
-
-		entry->rt_info.tos = iph->tos;
-		entry->rt_info.daddr = iph->daddr;
-		entry->rt_info.saddr = iph->saddr;
-	}
-
 	nskb = ipq_build_packet_message(entry, &status);
 	if (nskb == NULL)
 		goto err_out_free;
@@ -388,24 +374,11 @@ ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct ipq_queue_entry *e)
 		}
 		skb_put(e->skb, diff);
 	}
-	if (!skb_ip_make_writable(&e->skb, v->data_len))
+	if (!skb_make_writable(&e->skb, v->data_len))
 		return -ENOMEM;
 	memcpy(e->skb->data, v->payload, v->data_len);
 	e->skb->ip_summed = CHECKSUM_NONE;
-	e->skb->nfcache |= NFC_ALTERED;
-
-	/*
-	 * Extra routing may needed on local out, as the QUEUE target never
-	 * returns control to the table.
-	 */
-	if (e->info->hook == NF_IP_LOCAL_OUT) {
-		struct iphdr *iph = e->skb->nh.iph;
-
-		if (!(iph->tos == e->rt_info.tos
-		      && iph->daddr == e->rt_info.daddr
-		      && iph->saddr == e->rt_info.saddr))
-			return ip_route_me_harder(&e->skb);
-	}
+
 	return 0;
 }
 
@@ -683,6 +656,11 @@ ipq_get_info(char *buffer, char **start, off_t offset, int length)
 }
 #endif /* CONFIG_PROC_FS */
 
+static struct nf_queue_handler nfqh = {
+	.name	= "ip_queue",
+	.outfn	= &ipq_enqueue_packet,
+};
+
 static int
 init_or_cleanup(int init)
 {
@@ -693,7 +671,8 @@ init_or_cleanup(int init)
 		goto cleanup;
 
 	netlink_register_notifier(&ipq_nl_notifier);
-	ipqnl = netlink_kernel_create(NETLINK_FIREWALL, ipq_rcv_sk);
+	ipqnl = netlink_kernel_create(NETLINK_FIREWALL, 0, ipq_rcv_sk,
+				      THIS_MODULE);
 	if (ipqnl == NULL) {
 		printk(KERN_ERR "ip_queue: failed to create netlink socket\n");
 		goto cleanup_netlink_notifier;
@@ -710,7 +689,7 @@ init_or_cleanup(int init)
 	register_netdevice_notifier(&ipq_dev_notifier);
 	ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0);
 	
-	status = nf_register_queue_handler(PF_INET, ipq_enqueue_packet, NULL);
+	status = nf_register_queue_handler(PF_INET, &nfqh);
 	if (status < 0) {
 		printk(KERN_ERR "ip_queue: failed to register queue handler\n");
 		goto cleanup_sysctl;
@@ -718,7 +697,7 @@ init_or_cleanup(int init)
 	return status;
 
 cleanup:
-	nf_unregister_queue_handler(PF_INET);
+	nf_unregister_queue_handlers(&nfqh);
 	synchronize_net();
 	ipq_flush(NF_DROP);
 	
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index c88dfcd38c5..eef99a1b5de 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -312,7 +312,6 @@ ipt_do_table(struct sk_buff **pskb,
 	do {
 		IP_NF_ASSERT(e);
 		IP_NF_ASSERT(back);
-		(*pskb)->nfcache |= e->nfcache;
 		if (ip_packet_match(ip, indev, outdev, &e->ip, offset)) {
 			struct ipt_entry_target *t;
 
@@ -341,8 +340,8 @@ ipt_do_table(struct sk_buff **pskb,
 							 back->comefrom);
 					continue;
 				}
-				if (table_base + v
-				    != (void *)e + e->next_offset) {
+				if (table_base + v != (void *)e + e->next_offset
+				    && !(e->ip.flags & IPT_F_GOTO)) {
 					/* Save old back ptr in next entry */
 					struct ipt_entry *next
 						= (void *)e + e->next_offset;
diff --git a/net/ipv4/netfilter/ipt_CLASSIFY.c b/net/ipv4/netfilter/ipt_CLASSIFY.c
index 9842e6e2318..dab78d8bd49 100644
--- a/net/ipv4/netfilter/ipt_CLASSIFY.c
+++ b/net/ipv4/netfilter/ipt_CLASSIFY.c
@@ -32,10 +32,8 @@ target(struct sk_buff **pskb,
 {
 	const struct ipt_classify_target_info *clinfo = targinfo;
 
-	if((*pskb)->priority != clinfo->priority) {
+	if((*pskb)->priority != clinfo->priority) 
 		(*pskb)->priority = clinfo->priority;
-		(*pskb)->nfcache |= NFC_ALTERED;
-	}
 
 	return IPT_CONTINUE;
 }
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 6706d3a1bc4..2d05cafec22 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -367,7 +367,7 @@ target(struct sk_buff **pskb,
 #ifdef DEBUG_CLUSTERP
 	DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
 #endif
-	DEBUGP("hash=%u ct_hash=%lu ", hash, ct->mark);
+	DEBUGP("hash=%u ct_hash=%u ", hash, ct->mark);
 	if (!clusterip_responsible(cipinfo->config, hash)) {
 		DEBUGP("not responsible\n");
 		return NF_DROP;
diff --git a/net/ipv4/netfilter/ipt_CONNMARK.c b/net/ipv4/netfilter/ipt_CONNMARK.c
index 30ddd3e18eb..13463802133 100644
--- a/net/ipv4/netfilter/ipt_CONNMARK.c
+++ b/net/ipv4/netfilter/ipt_CONNMARK.c
@@ -40,9 +40,9 @@ target(struct sk_buff **pskb,
        void *userinfo)
 {
 	const struct ipt_connmark_target_info *markinfo = targinfo;
-	unsigned long diff;
-	unsigned long nfmark;
-	unsigned long newmark;
+	u_int32_t diff;
+	u_int32_t nfmark;
+	u_int32_t newmark;
 
 	enum ip_conntrack_info ctinfo;
 	struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo);
@@ -61,10 +61,8 @@ target(struct sk_buff **pskb,
 	    case IPT_CONNMARK_RESTORE:
 		nfmark = (*pskb)->nfmark;
 		diff = (ct->mark ^ nfmark) & markinfo->mask;
-		if (diff != 0) {
+		if (diff != 0)
 		    (*pskb)->nfmark = nfmark ^ diff;
-		    (*pskb)->nfcache |= NFC_ALTERED;
-		}
 		break;
 	    }
 	}
@@ -94,6 +92,11 @@ checkentry(const char *tablename,
 	    }
 	}
 
+	if (matchinfo->mark > 0xffffffff || matchinfo->mask > 0xffffffff) {
+		printk(KERN_WARNING "CONNMARK: Only supports 32bit mark\n");
+		return 0;
+	}
+
 	return 1;
 }
 
diff --git a/net/ipv4/netfilter/ipt_DSCP.c b/net/ipv4/netfilter/ipt_DSCP.c
index 3ea4509099f..6e319570a28 100644
--- a/net/ipv4/netfilter/ipt_DSCP.c
+++ b/net/ipv4/netfilter/ipt_DSCP.c
@@ -39,7 +39,7 @@ target(struct sk_buff **pskb,
 	if (((*pskb)->nh.iph->tos & IPT_DSCP_MASK) != sh_dscp) {
 		u_int16_t diffs[2];
 
-		if (!skb_ip_make_writable(pskb, sizeof(struct iphdr)))
+		if (!skb_make_writable(pskb, sizeof(struct iphdr)))
 			return NF_DROP;
 
 		diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF;
@@ -51,7 +51,6 @@ target(struct sk_buff **pskb,
 						 sizeof(diffs),
 						 (*pskb)->nh.iph->check
 						 ^ 0xFFFF));
-		(*pskb)->nfcache |= NFC_ALTERED;
 	}
 	return IPT_CONTINUE;
 }
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c
index 94a0ce1c1c9..a1319693f64 100644
--- a/net/ipv4/netfilter/ipt_ECN.c
+++ b/net/ipv4/netfilter/ipt_ECN.c
@@ -31,7 +31,7 @@ set_ect_ip(struct sk_buff **pskb, const struct ipt_ECN_info *einfo)
 	    != (einfo->ip_ect & IPT_ECN_IP_MASK)) {
 		u_int16_t diffs[2];
 
-		if (!skb_ip_make_writable(pskb, sizeof(struct iphdr)))
+		if (!skb_make_writable(pskb, sizeof(struct iphdr)))
 			return 0;
 
 		diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF;
@@ -43,7 +43,6 @@ set_ect_ip(struct sk_buff **pskb, const struct ipt_ECN_info *einfo)
 						 sizeof(diffs),
 						 (*pskb)->nh.iph->check
 						 ^0xFFFF));
-		(*pskb)->nfcache |= NFC_ALTERED;
 	} 
 	return 1;
 }
@@ -67,7 +66,7 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward)
 	     tcph->cwr == einfo->proto.tcp.cwr)))
 		return 1;
 
-	if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph)))
+	if (!skb_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph)))
 		return 0;
 	tcph = (void *)(*pskb)->nh.iph + (*pskb)->nh.iph->ihl*4;
 
@@ -87,7 +86,6 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward)
 		tcph->check = csum_fold(csum_partial((char *)diffs,
 						     sizeof(diffs),
 						     tcph->check^0xFFFF));
-	(*pskb)->nfcache |= NFC_ALTERED;
 	return 1;
 }
 
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index ef08733d26d..92ed050fac6 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -27,10 +27,6 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
 MODULE_DESCRIPTION("iptables syslog logging module");
 
-static unsigned int nflog = 1;
-module_param(nflog, int, 0400);
-MODULE_PARM_DESC(nflog, "register as internal netfilter logging module");
- 
 #if 0
 #define DEBUGP printk
 #else
@@ -41,11 +37,17 @@ MODULE_PARM_DESC(nflog, "register as internal netfilter logging module");
 static DEFINE_SPINLOCK(log_lock);
 
 /* One level of recursion won't kill us */
-static void dump_packet(const struct ipt_log_info *info,
+static void dump_packet(const struct nf_loginfo *info,
 			const struct sk_buff *skb,
 			unsigned int iphoff)
 {
 	struct iphdr _iph, *ih;
+	unsigned int logflags;
+
+	if (info->type == NF_LOG_TYPE_LOG)
+		logflags = info->u.log.logflags;
+	else
+		logflags = NF_LOG_MASK;
 
 	ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
 	if (ih == NULL) {
@@ -76,7 +78,7 @@ static void dump_packet(const struct ipt_log_info *info,
 	if (ntohs(ih->frag_off) & IP_OFFSET)
 		printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
 
-	if ((info->logflags & IPT_LOG_IPOPT)
+	if ((logflags & IPT_LOG_IPOPT)
 	    && ih->ihl * 4 > sizeof(struct iphdr)) {
 		unsigned char _opt[4 * 15 - sizeof(struct iphdr)], *op;
 		unsigned int i, optsize;
@@ -119,7 +121,7 @@ static void dump_packet(const struct ipt_log_info *info,
 		printk("SPT=%u DPT=%u ",
 		       ntohs(th->source), ntohs(th->dest));
 		/* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
-		if (info->logflags & IPT_LOG_TCPSEQ)
+		if (logflags & IPT_LOG_TCPSEQ)
 			printk("SEQ=%u ACK=%u ",
 			       ntohl(th->seq), ntohl(th->ack_seq));
 		/* Max length: 13 "WINDOW=65535 " */
@@ -146,7 +148,7 @@ static void dump_packet(const struct ipt_log_info *info,
 		/* Max length: 11 "URGP=65535 " */
 		printk("URGP=%u ", ntohs(th->urg_ptr));
 
-		if ((info->logflags & IPT_LOG_TCPOPT)
+		if ((logflags & IPT_LOG_TCPOPT)
 		    && th->doff * 4 > sizeof(struct tcphdr)) {
 			unsigned char _opt[4 * 15 - sizeof(struct tcphdr)];
 			unsigned char *op;
@@ -328,7 +330,7 @@ static void dump_packet(const struct ipt_log_info *info,
 	}
 
 	/* Max length: 15 "UID=4294967295 " */
- 	if ((info->logflags & IPT_LOG_UID) && !iphoff && skb->sk) {
+ 	if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) {
 		read_lock_bh(&skb->sk->sk_callback_lock);
 		if (skb->sk->sk_socket && skb->sk->sk_socket->file)
  			printk("UID=%u ", skb->sk->sk_socket->file->f_uid);
@@ -349,19 +351,31 @@ static void dump_packet(const struct ipt_log_info *info,
 	/* maxlen = 230+   91  + 230 + 252 = 803 */
 }
 
+struct nf_loginfo default_loginfo = {
+	.type	= NF_LOG_TYPE_LOG,
+	.u = {
+		.log = {
+			.level    = 0,
+			.logflags = NF_LOG_MASK,
+		},
+	},
+};
+
 static void
-ipt_log_packet(unsigned int hooknum,
+ipt_log_packet(unsigned int pf,
+	       unsigned int hooknum,
 	       const struct sk_buff *skb,
 	       const struct net_device *in,
 	       const struct net_device *out,
-	       const struct ipt_log_info *loginfo,
-	       const char *level_string,
+	       const struct nf_loginfo *loginfo,
 	       const char *prefix)
 {
+	if (!loginfo)
+		loginfo = &default_loginfo;
+
 	spin_lock_bh(&log_lock);
-	printk(level_string);
-	printk("%sIN=%s OUT=%s ",
-	       prefix == NULL ? loginfo->prefix : prefix,
+	printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
+	       prefix,
 	       in ? in->name : "",
 	       out ? out->name : "");
 #ifdef CONFIG_BRIDGE_NETFILTER
@@ -405,28 +419,15 @@ ipt_log_target(struct sk_buff **pskb,
 	       void *userinfo)
 {
 	const struct ipt_log_info *loginfo = targinfo;
-	char level_string[4] = "< >";
+	struct nf_loginfo li;
 
-	level_string[1] = '0' + (loginfo->level % 8);
-	ipt_log_packet(hooknum, *pskb, in, out, loginfo, level_string, NULL);
+	li.type = NF_LOG_TYPE_LOG;
+	li.u.log.level = loginfo->level;
+	li.u.log.logflags = loginfo->logflags;
 
-	return IPT_CONTINUE;
-}
+	nf_log_packet(PF_INET, hooknum, *pskb, in, out, &li, loginfo->prefix);
 
-static void
-ipt_logfn(unsigned int hooknum,
-	  const struct sk_buff *skb,
-	  const struct net_device *in,
-	  const struct net_device *out,
-	  const char *prefix)
-{
-	struct ipt_log_info loginfo = { 
-		.level = 0, 
-		.logflags = IPT_LOG_MASK, 
-		.prefix = "" 
-	};
-
-	ipt_log_packet(hooknum, skb, in, out, &loginfo, KERN_WARNING, prefix);
+	return IPT_CONTINUE;
 }
 
 static int ipt_log_checkentry(const char *tablename,
@@ -464,20 +465,29 @@ static struct ipt_target ipt_log_reg = {
 	.me		= THIS_MODULE,
 };
 
+static struct nf_logger ipt_log_logger ={
+	.name		= "ipt_LOG",
+	.logfn		= &ipt_log_packet,
+	.me		= THIS_MODULE,
+};
+
 static int __init init(void)
 {
 	if (ipt_register_target(&ipt_log_reg))
 		return -EINVAL;
-	if (nflog)
-		nf_log_register(PF_INET, &ipt_logfn);
+	if (nf_log_register(PF_INET, &ipt_log_logger) < 0) {
+		printk(KERN_WARNING "ipt_LOG: not logging via system console "
+		       "since somebody else already registered for PF_INET\n");
+		/* we cannot make module load fail here, since otherwise
+		 * iptables userspace would abort */
+	}
 	
 	return 0;
 }
 
 static void __exit fini(void)
 {
-	if (nflog)
-		nf_log_unregister(PF_INET, &ipt_logfn);
+	nf_log_unregister_logger(&ipt_log_logger);
 	ipt_unregister_target(&ipt_log_reg);
 }
 
diff --git a/net/ipv4/netfilter/ipt_MARK.c b/net/ipv4/netfilter/ipt_MARK.c
index 33c6f9b63b8..52b4f2c296b 100644
--- a/net/ipv4/netfilter/ipt_MARK.c
+++ b/net/ipv4/netfilter/ipt_MARK.c
@@ -29,10 +29,9 @@ target_v0(struct sk_buff **pskb,
 {
 	const struct ipt_mark_target_info *markinfo = targinfo;
 
-	if((*pskb)->nfmark != markinfo->mark) {
+	if((*pskb)->nfmark != markinfo->mark)
 		(*pskb)->nfmark = markinfo->mark;
-		(*pskb)->nfcache |= NFC_ALTERED;
-	}
+
 	return IPT_CONTINUE;
 }
 
@@ -61,10 +60,9 @@ target_v1(struct sk_buff **pskb,
 		break;
 	}
 
-	if((*pskb)->nfmark != mark) {
+	if((*pskb)->nfmark != mark)
 		(*pskb)->nfmark = mark;
-		(*pskb)->nfcache |= NFC_ALTERED;
-	}
+
 	return IPT_CONTINUE;
 }
 
@@ -76,6 +74,8 @@ checkentry_v0(const char *tablename,
 	      unsigned int targinfosize,
 	      unsigned int hook_mask)
 {
+	struct ipt_mark_target_info *markinfo = targinfo;
+
 	if (targinfosize != IPT_ALIGN(sizeof(struct ipt_mark_target_info))) {
 		printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n",
 		       targinfosize,
@@ -88,6 +88,11 @@ checkentry_v0(const char *tablename,
 		return 0;
 	}
 
+	if (markinfo->mark > 0xffffffff) {
+		printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n");
+		return 0;
+	}
+
 	return 1;
 }
 
@@ -120,6 +125,11 @@ checkentry_v1(const char *tablename,
 		return 0;
 	}
 
+	if (markinfo->mark > 0xffffffff) {
+		printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n");
+		return 0;
+	}
+
 	return 1;
 }
 
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 91e74502c3d..2f3e181c8e9 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -86,11 +86,6 @@ masquerade_target(struct sk_buff **pskb,
 
 	IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING);
 
-	/* FIXME: For the moment, don't do local packets, breaks
-	   testsuite for 2.3.49 --RR */
-	if ((*pskb)->sk)
-		return NF_ACCEPT;
-
 	ct = ip_conntrack_get(*pskb, &ctinfo);
 	IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED
 	                    || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY));
diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c
index 06254b29d03..e6e7b609536 100644
--- a/net/ipv4/netfilter/ipt_NETMAP.c
+++ b/net/ipv4/netfilter/ipt_NETMAP.c
@@ -46,7 +46,8 @@ check(const char *tablename,
 		DEBUGP(MODULENAME":check: size %u.\n", targinfosize);
 		return 0;
 	}
-	if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_POST_ROUTING))) {
+	if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_POST_ROUTING) |
+	                  (1 << NF_IP_LOCAL_OUT))) {
 		DEBUGP(MODULENAME":check: bad hooks %x.\n", hook_mask);
 		return 0;
 	}
@@ -76,12 +77,13 @@ target(struct sk_buff **pskb,
 	struct ip_nat_range newrange;
 
 	IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
-		     || hooknum == NF_IP_POST_ROUTING);
+		     || hooknum == NF_IP_POST_ROUTING
+		     || hooknum == NF_IP_LOCAL_OUT);
 	ct = ip_conntrack_get(*pskb, &ctinfo);
 
 	netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip);
 
-	if (hooknum == NF_IP_PRE_ROUTING)
+	if (hooknum == NF_IP_PRE_ROUTING || hooknum == NF_IP_LOCAL_OUT)
 		new_ip = (*pskb)->nh.iph->daddr & ~netmask;
 	else
 		new_ip = (*pskb)->nh.iph->saddr & ~netmask;
diff --git a/net/ipv4/netfilter/ipt_NFQUEUE.c b/net/ipv4/netfilter/ipt_NFQUEUE.c
new file mode 100644
index 00000000000..3cedc9be880
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_NFQUEUE.c
@@ -0,0 +1,70 @@
+/* iptables module for using new netfilter netlink queue
+ *
+ * (C) 2005 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as 
+ * published by the Free Software Foundation.
+ * 
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_NFQUEUE.h>
+
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("iptables NFQUEUE target");
+MODULE_LICENSE("GPL");
+
+static unsigned int
+target(struct sk_buff **pskb,
+       const struct net_device *in,
+       const struct net_device *out,
+       unsigned int hooknum,
+       const void *targinfo,
+       void *userinfo)
+{
+	const struct ipt_NFQ_info *tinfo = targinfo;
+
+	return NF_QUEUE_NR(tinfo->queuenum);
+}
+
+static int
+checkentry(const char *tablename,
+	   const struct ipt_entry *e,
+           void *targinfo,
+           unsigned int targinfosize,
+           unsigned int hook_mask)
+{
+	if (targinfosize != IPT_ALIGN(sizeof(struct ipt_NFQ_info))) {
+		printk(KERN_WARNING "NFQUEUE: targinfosize %u != %Zu\n",
+		       targinfosize,
+		       IPT_ALIGN(sizeof(struct ipt_NFQ_info)));
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct ipt_target ipt_NFQ_reg = {
+	.name		= "NFQUEUE",
+	.target		= target,
+	.checkentry	= checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ipt_register_target(&ipt_NFQ_reg);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_target(&ipt_NFQ_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 91569644602..f115a84a4ac 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -156,7 +156,6 @@ static void send_reset(struct sk_buff *oldskb, int hook)
 
 	/* This packet will not be the same as the other: clear nf fields */
 	nf_reset(nskb);
-	nskb->nfcache = 0;
 	nskb->nfmark = 0;
 #ifdef CONFIG_BRIDGE_NETFILTER
 	nf_bridge_put(nskb->nf_bridge);
diff --git a/net/ipv4/netfilter/ipt_TCPMSS.c b/net/ipv4/netfilter/ipt_TCPMSS.c
index 7b84a254440..8db70d6908c 100644
--- a/net/ipv4/netfilter/ipt_TCPMSS.c
+++ b/net/ipv4/netfilter/ipt_TCPMSS.c
@@ -58,7 +58,7 @@ ipt_tcpmss_target(struct sk_buff **pskb,
 	unsigned int i;
 	u_int8_t *opt;
 
-	if (!skb_ip_make_writable(pskb, (*pskb)->len))
+	if (!skb_make_writable(pskb, (*pskb)->len))
 		return NF_DROP;
 
 	if ((*pskb)->ip_summed == CHECKSUM_HW &&
@@ -190,7 +190,6 @@ ipt_tcpmss_target(struct sk_buff **pskb,
 	       newmss);
 
  retmodified:
-	(*pskb)->nfcache |= NFC_UNKNOWN | NFC_ALTERED;
 	return IPT_CONTINUE;
 }
 
diff --git a/net/ipv4/netfilter/ipt_TOS.c b/net/ipv4/netfilter/ipt_TOS.c
index 85c70d240f8..deadb36d442 100644
--- a/net/ipv4/netfilter/ipt_TOS.c
+++ b/net/ipv4/netfilter/ipt_TOS.c
@@ -33,7 +33,7 @@ target(struct sk_buff **pskb,
 	if (((*pskb)->nh.iph->tos & IPTOS_TOS_MASK) != tosinfo->tos) {
 		u_int16_t diffs[2];
 
-		if (!skb_ip_make_writable(pskb, sizeof(struct iphdr)))
+		if (!skb_make_writable(pskb, sizeof(struct iphdr)))
 			return NF_DROP;
 
 		diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF;
@@ -46,7 +46,6 @@ target(struct sk_buff **pskb,
 						 sizeof(diffs),
 						 (*pskb)->nh.iph->check
 						 ^0xFFFF));
-		(*pskb)->nfcache |= NFC_ALTERED;
 	}
 	return IPT_CONTINUE;
 }
diff --git a/net/ipv4/netfilter/ipt_TTL.c b/net/ipv4/netfilter/ipt_TTL.c
new file mode 100644
index 00000000000..b9ae6a9382f
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_TTL.c
@@ -0,0 +1,119 @@
+/* TTL modification target for IP tables
+ * (C) 2000,2005 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <net/checksum.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_TTL.h>
+
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("IP tables TTL modification module");
+MODULE_LICENSE("GPL");
+
+static unsigned int 
+ipt_ttl_target(struct sk_buff **pskb, const struct net_device *in, 
+		const struct net_device *out, unsigned int hooknum, 
+		const void *targinfo, void *userinfo)
+{
+	struct iphdr *iph;
+	const struct ipt_TTL_info *info = targinfo;
+	u_int16_t diffs[2];
+	int new_ttl;
+
+	if (!skb_make_writable(pskb, (*pskb)->len))
+		return NF_DROP;
+
+	iph = (*pskb)->nh.iph;
+
+	switch (info->mode) {
+		case IPT_TTL_SET:
+			new_ttl = info->ttl;
+			break;
+		case IPT_TTL_INC:
+			new_ttl = iph->ttl + info->ttl;
+			if (new_ttl > 255)
+				new_ttl = 255;
+			break;
+		case IPT_TTL_DEC:
+			new_ttl = iph->ttl - info->ttl;
+			if (new_ttl < 0)
+				new_ttl = 0;
+			break;
+		default:
+			new_ttl = iph->ttl;
+			break;
+	}
+
+	if (new_ttl != iph->ttl) {
+		diffs[0] = htons(((unsigned)iph->ttl) << 8) ^ 0xFFFF;
+		iph->ttl = new_ttl;
+		diffs[1] = htons(((unsigned)iph->ttl) << 8);
+		iph->check = csum_fold(csum_partial((char *)diffs,
+						    sizeof(diffs),
+						    iph->check^0xFFFF));
+	}
+
+	return IPT_CONTINUE;
+}
+
+static int ipt_ttl_checkentry(const char *tablename,
+		const struct ipt_entry *e,
+		void *targinfo,
+		unsigned int targinfosize,
+		unsigned int hook_mask)
+{
+	struct ipt_TTL_info *info = targinfo;
+
+	if (targinfosize != IPT_ALIGN(sizeof(struct ipt_TTL_info))) {
+		printk(KERN_WARNING "ipt_TTL: targinfosize %u != %Zu\n",
+				targinfosize,
+				IPT_ALIGN(sizeof(struct ipt_TTL_info)));
+		return 0;
+	}
+
+	if (strcmp(tablename, "mangle")) {
+		printk(KERN_WARNING "ipt_TTL: can only be called from "
+			"\"mangle\" table, not \"%s\"\n", tablename);
+		return 0;
+	}
+
+	if (info->mode > IPT_TTL_MAXMODE) {
+		printk(KERN_WARNING "ipt_TTL: invalid or unknown Mode %u\n", 
+			info->mode);
+		return 0;
+	}
+
+	if ((info->mode != IPT_TTL_SET) && (info->ttl == 0))
+		return 0;
+
+	return 1;
+}
+
+static struct ipt_target ipt_TTL = { 
+	.name 		= "TTL",
+	.target 	= ipt_ttl_target, 
+	.checkentry 	= ipt_ttl_checkentry, 
+	.me 		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ipt_register_target(&ipt_TTL);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_target(&ipt_TTL);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index 52a0076302a..e2c14f3cb2f 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -62,6 +62,7 @@
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
 MODULE_DESCRIPTION("iptables userspace logging module");
+MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NFLOG);
 
 #define ULOG_NL_EVENT		111		/* Harald's favorite number */
 #define ULOG_MAXNLGROUPS	32		/* numer of nlgroups */
@@ -115,10 +116,10 @@ static void ulog_send(unsigned int nlgroupnum)
 	if (ub->qlen > 1)
 		ub->lastnlh->nlmsg_type = NLMSG_DONE;
 
-	NETLINK_CB(ub->skb).dst_groups = (1 << nlgroupnum);
-	DEBUGP("ipt_ULOG: throwing %d packets to netlink mask %u\n",
-		ub->qlen, nlgroupnum);
-	netlink_broadcast(nflognl, ub->skb, 0, (1 << nlgroupnum), GFP_ATOMIC);
+	NETLINK_CB(ub->skb).dst_group = nlgroupnum + 1;
+	DEBUGP("ipt_ULOG: throwing %d packets to netlink group %u\n",
+		ub->qlen, nlgroupnum + 1);
+	netlink_broadcast(nflognl, ub->skb, 0, nlgroupnum + 1, GFP_ATOMIC);
 
 	ub->qlen = 0;
 	ub->skb = NULL;
@@ -219,13 +220,13 @@ static void ipt_ulog_packet(unsigned int hooknum,
 	pm = NLMSG_DATA(nlh);
 
 	/* We might not have a timestamp, get one */
-	if (skb->stamp.tv_sec == 0)
-		do_gettimeofday((struct timeval *)&skb->stamp);
+	if (skb->tstamp.off_sec == 0)
+		__net_timestamp((struct sk_buff *)skb);
 
 	/* copy hook, prefix, timestamp, payload, etc. */
 	pm->data_len = copy_len;
-	pm->timestamp_sec = skb->stamp.tv_sec;
-	pm->timestamp_usec = skb->stamp.tv_usec;
+	pm->timestamp_sec = skb_tv_base.tv_sec + skb->tstamp.off_sec;
+	pm->timestamp_usec = skb_tv_base.tv_usec + skb->tstamp.off_usec;
 	pm->mark = skb->nfmark;
 	pm->hook = hooknum;
 	if (prefix != NULL)
@@ -303,18 +304,27 @@ static unsigned int ipt_ulog_target(struct sk_buff **pskb,
  	return IPT_CONTINUE;
 }
  
-static void ipt_logfn(unsigned int hooknum,
+static void ipt_logfn(unsigned int pf,
+		      unsigned int hooknum,
 		      const struct sk_buff *skb,
 		      const struct net_device *in,
 		      const struct net_device *out,
+		      const struct nf_loginfo *li,
 		      const char *prefix)
 {
-	struct ipt_ulog_info loginfo = { 
-		.nl_group = ULOG_DEFAULT_NLGROUP,
-		.copy_range = 0,
-		.qthreshold = ULOG_DEFAULT_QTHRESHOLD,
-		.prefix = ""
-	};
+	struct ipt_ulog_info loginfo;
+
+	if (!li || li->type != NF_LOG_TYPE_ULOG) {
+		loginfo.nl_group = ULOG_DEFAULT_NLGROUP;
+		loginfo.copy_range = 0;
+		loginfo.qthreshold = ULOG_DEFAULT_QTHRESHOLD;
+		loginfo.prefix[0] = '\0';
+	} else {
+		loginfo.nl_group = li->u.ulog.group;
+		loginfo.copy_range = li->u.ulog.copy_len;
+		loginfo.qthreshold = li->u.ulog.qthreshold;
+		strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix));
+	}
 
 	ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix);
 }
@@ -354,6 +364,12 @@ static struct ipt_target ipt_ulog_reg = {
 	.me		= THIS_MODULE,
 };
 
+static struct nf_logger ipt_ulog_logger = {
+	.name		= "ipt_ULOG",
+	.logfn		= &ipt_logfn,
+	.me		= THIS_MODULE,
+};
+
 static int __init init(void)
 {
 	int i;
@@ -372,7 +388,8 @@ static int __init init(void)
 		ulog_buffers[i].timer.data = i;
 	}
 
-	nflognl = netlink_kernel_create(NETLINK_NFLOG, NULL);
+	nflognl = netlink_kernel_create(NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL,
+	                                THIS_MODULE);
 	if (!nflognl)
 		return -ENOMEM;
 
@@ -381,7 +398,7 @@ static int __init init(void)
 		return -EINVAL;
 	}
 	if (nflog)
-		nf_log_register(PF_INET, &ipt_logfn);
+		nf_log_register(PF_INET, &ipt_ulog_logger);
 	
 	return 0;
 }
@@ -394,7 +411,7 @@ static void __exit fini(void)
 	DEBUGP("ipt_ULOG: cleanup_module\n");
 
 	if (nflog)
-		nf_log_unregister(PF_INET, &ipt_logfn);
+		nf_log_unregister_logger(&ipt_ulog_logger);
 	ipt_unregister_target(&ipt_ulog_reg);
 	sock_release(nflognl->sk_socket);
 
diff --git a/net/ipv4/netfilter/ipt_connbytes.c b/net/ipv4/netfilter/ipt_connbytes.c
new file mode 100644
index 00000000000..df4a42c6da2
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_connbytes.c
@@ -0,0 +1,162 @@
+/* Kernel module to match connection tracking byte counter.
+ * GPL (C) 2002 Martin Devera (devik@cdi.cz).
+ *
+ * 2004-07-20 Harald Welte <laforge@netfilter.org>
+ * 	- reimplemented to use per-connection accounting counters
+ * 	- add functionality to match number of packets
+ * 	- add functionality to match average packet size
+ * 	- add support to match directions seperately
+ *
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_connbytes.h>
+
+#include <asm/div64.h>
+#include <asm/bitops.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("iptables match for matching number of pkts/bytes per connection");
+
+/* 64bit divisor, dividend and result. dynamic precision */
+static u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor)
+{
+	u_int32_t d = divisor;
+
+	if (divisor > 0xffffffffULL) {
+		unsigned int shift = fls(divisor >> 32);
+
+		d = divisor >> shift;
+		dividend >>= shift;
+	}
+
+	do_div(dividend, d);
+	return dividend;
+}
+
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      int *hotdrop)
+{
+	const struct ipt_connbytes_info *sinfo = matchinfo;
+	enum ip_conntrack_info ctinfo;
+	struct ip_conntrack *ct;
+	u_int64_t what = 0;	/* initialize to make gcc happy */
+
+	if (!(ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo)))
+		return 0; /* no match */
+
+	switch (sinfo->what) {
+	case IPT_CONNBYTES_PKTS:
+		switch (sinfo->direction) {
+		case IPT_CONNBYTES_DIR_ORIGINAL:
+			what = ct->counters[IP_CT_DIR_ORIGINAL].packets;
+			break;
+		case IPT_CONNBYTES_DIR_REPLY:
+			what = ct->counters[IP_CT_DIR_REPLY].packets;
+			break;
+		case IPT_CONNBYTES_DIR_BOTH:
+			what = ct->counters[IP_CT_DIR_ORIGINAL].packets;
+			what += ct->counters[IP_CT_DIR_REPLY].packets;
+			break;
+		}
+		break;
+	case IPT_CONNBYTES_BYTES:
+		switch (sinfo->direction) {
+		case IPT_CONNBYTES_DIR_ORIGINAL:
+			what = ct->counters[IP_CT_DIR_ORIGINAL].bytes;
+			break;
+		case IPT_CONNBYTES_DIR_REPLY:
+			what = ct->counters[IP_CT_DIR_REPLY].bytes;
+			break;
+		case IPT_CONNBYTES_DIR_BOTH:
+			what = ct->counters[IP_CT_DIR_ORIGINAL].bytes;
+			what += ct->counters[IP_CT_DIR_REPLY].bytes;
+			break;
+		}
+		break;
+	case IPT_CONNBYTES_AVGPKT:
+		switch (sinfo->direction) {
+		case IPT_CONNBYTES_DIR_ORIGINAL:
+			what = div64_64(ct->counters[IP_CT_DIR_ORIGINAL].bytes,
+					ct->counters[IP_CT_DIR_ORIGINAL].packets);
+			break;
+		case IPT_CONNBYTES_DIR_REPLY:
+			what = div64_64(ct->counters[IP_CT_DIR_REPLY].bytes,
+					ct->counters[IP_CT_DIR_REPLY].packets);
+			break;
+		case IPT_CONNBYTES_DIR_BOTH:
+			{
+				u_int64_t bytes;
+				u_int64_t pkts;
+				bytes = ct->counters[IP_CT_DIR_ORIGINAL].bytes +
+					ct->counters[IP_CT_DIR_REPLY].bytes;
+				pkts = ct->counters[IP_CT_DIR_ORIGINAL].packets+
+					ct->counters[IP_CT_DIR_REPLY].packets;
+
+				/* FIXME_THEORETICAL: what to do if sum
+				 * overflows ? */
+
+				what = div64_64(bytes, pkts);
+			}
+			break;
+		}
+		break;
+	}
+
+	if (sinfo->count.to)
+		return (what <= sinfo->count.to && what >= sinfo->count.from);
+	else
+		return (what >= sinfo->count.from);
+}
+
+static int check(const char *tablename,
+		 const struct ipt_ip *ip,
+		 void *matchinfo,
+		 unsigned int matchsize,
+		 unsigned int hook_mask)
+{
+	const struct ipt_connbytes_info *sinfo = matchinfo;
+
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_connbytes_info)))
+		return 0;
+
+	if (sinfo->what != IPT_CONNBYTES_PKTS &&
+	    sinfo->what != IPT_CONNBYTES_BYTES &&
+	    sinfo->what != IPT_CONNBYTES_AVGPKT)
+		return 0;
+
+	if (sinfo->direction != IPT_CONNBYTES_DIR_ORIGINAL &&
+	    sinfo->direction != IPT_CONNBYTES_DIR_REPLY &&
+	    sinfo->direction != IPT_CONNBYTES_DIR_BOTH)
+		return 0;
+
+	return 1;
+}
+
+static struct ipt_match state_match = {
+	.name		= "connbytes",
+	.match		= &match,
+	.checkentry	= &check,
+	.me		= THIS_MODULE
+};
+
+static int __init init(void)
+{
+	return ipt_register_match(&state_match);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&state_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_connmark.c b/net/ipv4/netfilter/ipt_connmark.c
index 2706f96cea5..bf8de47ce00 100644
--- a/net/ipv4/netfilter/ipt_connmark.c
+++ b/net/ipv4/netfilter/ipt_connmark.c
@@ -54,9 +54,16 @@ checkentry(const char *tablename,
 	   unsigned int matchsize,
 	   unsigned int hook_mask)
 {
+	struct ipt_connmark_info *cm = 
+				(struct ipt_connmark_info *)matchinfo;
 	if (matchsize != IPT_ALIGN(sizeof(struct ipt_connmark_info)))
 		return 0;
 
+	if (cm->mark > 0xffffffff || cm->mask > 0xffffffff) {
+		printk(KERN_WARNING "connmark: only support 32bit mark\n");
+		return 0;
+	}
+
 	return 1;
 }
 
diff --git a/net/ipv4/netfilter/ipt_dccp.c b/net/ipv4/netfilter/ipt_dccp.c
new file mode 100644
index 00000000000..ad3278bba6c
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_dccp.c
@@ -0,0 +1,176 @@
+/*
+ * iptables module for DCCP protocol header matching
+ *
+ * (C) 2005 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <net/ip.h>
+#include <linux/dccp.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_dccp.h>
+
+#define DCCHECK(cond, option, flag, invflag) (!((flag) & (option)) \
+		                  || (!!((invflag) & (option)) ^ (cond)))
+
+static unsigned char *dccp_optbuf;
+static DEFINE_SPINLOCK(dccp_buflock);
+
+static inline int
+dccp_find_option(u_int8_t option,
+		 const struct sk_buff *skb,
+		 const struct dccp_hdr *dh,
+		 int *hotdrop)
+{
+	/* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */
+	unsigned char *op;
+	unsigned int optoff = __dccp_hdr_len(dh);
+	unsigned int optlen = dh->dccph_doff*4 - __dccp_hdr_len(dh);
+	unsigned int i;
+
+	if (dh->dccph_doff * 4 < __dccp_hdr_len(dh)) {
+		*hotdrop = 1;
+		return 0;
+	}
+
+	if (!optlen)
+		return 0;
+
+	spin_lock_bh(&dccp_buflock);
+	op = skb_header_pointer(skb,
+				skb->nh.iph->ihl*4 + optoff,
+				optlen, dccp_optbuf);
+	if (op == NULL) {
+		/* If we don't have the whole header, drop packet. */
+		spin_unlock_bh(&dccp_buflock);
+		*hotdrop = 1;
+		return 0;
+	}
+
+	for (i = 0; i < optlen; ) {
+		if (op[i] == option) {
+			spin_unlock_bh(&dccp_buflock);
+			return 1;
+		}
+
+		if (op[i] < 2) 
+			i++;
+		else 
+			i += op[i+1]?:1;
+	}
+
+	spin_unlock_bh(&dccp_buflock);
+	return 0;
+}
+
+
+static inline int
+match_types(const struct dccp_hdr *dh, u_int16_t typemask)
+{
+	return (typemask & (1 << dh->dccph_type));
+}
+
+static inline int
+match_option(u_int8_t option, const struct sk_buff *skb,
+	     const struct dccp_hdr *dh, int *hotdrop)
+{
+	return dccp_find_option(option, skb, dh, hotdrop);
+}
+
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      int *hotdrop)
+{
+	const struct ipt_dccp_info *info = 
+				(const struct ipt_dccp_info *)matchinfo;
+	struct dccp_hdr _dh, *dh;
+
+	if (offset)
+		return 0;
+	
+	dh = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_dh), &_dh);
+	if (dh == NULL) {
+		*hotdrop = 1;
+		return 0;
+       	}
+
+	return  DCCHECK(((ntohs(dh->dccph_sport) >= info->spts[0]) 
+			&& (ntohs(dh->dccph_sport) <= info->spts[1])), 
+		   	IPT_DCCP_SRC_PORTS, info->flags, info->invflags)
+		&& DCCHECK(((ntohs(dh->dccph_dport) >= info->dpts[0]) 
+			&& (ntohs(dh->dccph_dport) <= info->dpts[1])), 
+			IPT_DCCP_DEST_PORTS, info->flags, info->invflags)
+		&& DCCHECK(match_types(dh, info->typemask),
+			   IPT_DCCP_TYPE, info->flags, info->invflags)
+		&& DCCHECK(match_option(info->option, skb, dh, hotdrop),
+			   IPT_DCCP_OPTION, info->flags, info->invflags);
+}
+
+static int
+checkentry(const char *tablename,
+	   const struct ipt_ip *ip,
+	   void *matchinfo,
+	   unsigned int matchsize,
+	   unsigned int hook_mask)
+{
+	const struct ipt_dccp_info *info;
+
+	info = (const struct ipt_dccp_info *)matchinfo;
+
+	return ip->proto == IPPROTO_DCCP
+		&& !(ip->invflags & IPT_INV_PROTO)
+		&& matchsize == IPT_ALIGN(sizeof(struct ipt_dccp_info))
+		&& !(info->flags & ~IPT_DCCP_VALID_FLAGS)
+		&& !(info->invflags & ~IPT_DCCP_VALID_FLAGS)
+		&& !(info->invflags & ~info->flags);
+}
+
+static struct ipt_match dccp_match = 
+{ 
+	.name 		= "dccp",
+	.match		= &match,
+	.checkentry	= &checkentry,
+	.me 		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	int ret;
+
+	/* doff is 8 bits, so the maximum option size is (4*256).  Don't put
+	 * this in BSS since DaveM is worried about locked TLB's for kernel
+	 * BSS. */
+	dccp_optbuf = kmalloc(256 * 4, GFP_KERNEL);
+	if (!dccp_optbuf)
+		return -ENOMEM;
+	ret = ipt_register_match(&dccp_match);
+	if (ret)
+		kfree(dccp_optbuf);
+
+	return ret;
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&dccp_match);
+	kfree(dccp_optbuf);
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("Match for DCCP protocol packets");
+
diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c
index 564b49bfebc..2dd1cccbdab 100644
--- a/net/ipv4/netfilter/ipt_hashlimit.c
+++ b/net/ipv4/netfilter/ipt_hashlimit.c
@@ -94,7 +94,7 @@ struct ipt_hashlimit_htable {
 static DEFINE_SPINLOCK(hashlimit_lock);	/* protects htables list */
 static DECLARE_MUTEX(hlimit_mutex);	/* additional checkentry protection */
 static HLIST_HEAD(hashlimit_htables);
-static kmem_cache_t *hashlimit_cachep;
+static kmem_cache_t *hashlimit_cachep __read_mostly;
 
 static inline int dst_cmp(const struct dsthash_ent *ent, struct dsthash_dst *b)
 {
diff --git a/net/ipv4/netfilter/ipt_mark.c b/net/ipv4/netfilter/ipt_mark.c
index 8955728127b..00bef6cdd3f 100644
--- a/net/ipv4/netfilter/ipt_mark.c
+++ b/net/ipv4/netfilter/ipt_mark.c
@@ -37,9 +37,16 @@ checkentry(const char *tablename,
            unsigned int matchsize,
            unsigned int hook_mask)
 {
+	struct ipt_mark_info *minfo = (struct ipt_mark_info *) matchinfo;
+
 	if (matchsize != IPT_ALIGN(sizeof(struct ipt_mark_info)))
 		return 0;
 
+	if (minfo->mark > 0xffffffff || minfo->mask > 0xffffffff) {
+		printk(KERN_WARNING "mark: only supports 32bit mark\n");
+		return 0;
+	}
+
 	return 1;
 }
 
diff --git a/net/ipv4/netfilter/ipt_owner.c b/net/ipv4/netfilter/ipt_owner.c
index 3b9065e0638..c1889f88262 100644
--- a/net/ipv4/netfilter/ipt_owner.c
+++ b/net/ipv4/netfilter/ipt_owner.c
@@ -21,106 +21,6 @@ MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
 MODULE_DESCRIPTION("iptables owner match");
 
 static int
-match_comm(const struct sk_buff *skb, const char *comm)
-{
-	struct task_struct *g, *p;
-	struct files_struct *files;
-	int i;
-
-	read_lock(&tasklist_lock);
-	do_each_thread(g, p) {
-		if(strncmp(p->comm, comm, sizeof(p->comm)))
-			continue;
-
-		task_lock(p);
-		files = p->files;
-		if(files) {
-			spin_lock(&files->file_lock);
-			for (i=0; i < files->max_fds; i++) {
-				if (fcheck_files(files, i) ==
-				    skb->sk->sk_socket->file) {
-					spin_unlock(&files->file_lock);
-					task_unlock(p);
-					read_unlock(&tasklist_lock);
-					return 1;
-				}
-			}
-			spin_unlock(&files->file_lock);
-		}
-		task_unlock(p);
-	} while_each_thread(g, p);
-	read_unlock(&tasklist_lock);
-	return 0;
-}
-
-static int
-match_pid(const struct sk_buff *skb, pid_t pid)
-{
-	struct task_struct *p;
-	struct files_struct *files;
-	int i;
-
-	read_lock(&tasklist_lock);
-	p = find_task_by_pid(pid);
-	if (!p)
-		goto out;
-	task_lock(p);
-	files = p->files;
-	if(files) {
-		spin_lock(&files->file_lock);
-		for (i=0; i < files->max_fds; i++) {
-			if (fcheck_files(files, i) ==
-			    skb->sk->sk_socket->file) {
-				spin_unlock(&files->file_lock);
-				task_unlock(p);
-				read_unlock(&tasklist_lock);
-				return 1;
-			}
-		}
-		spin_unlock(&files->file_lock);
-	}
-	task_unlock(p);
-out:
-	read_unlock(&tasklist_lock);
-	return 0;
-}
-
-static int
-match_sid(const struct sk_buff *skb, pid_t sid)
-{
-	struct task_struct *g, *p;
-	struct file *file = skb->sk->sk_socket->file;
-	int i, found=0;
-
-	read_lock(&tasklist_lock);
-	do_each_thread(g, p) {
-		struct files_struct *files;
-		if (p->signal->session != sid)
-			continue;
-
-		task_lock(p);
-		files = p->files;
-		if (files) {
-			spin_lock(&files->file_lock);
-			for (i=0; i < files->max_fds; i++) {
-				if (fcheck_files(files, i) == file) {
-					found = 1;
-					break;
-				}
-			}
-			spin_unlock(&files->file_lock);
-		}
-		task_unlock(p);
-		if (found)
-			goto out;
-	} while_each_thread(g, p);
-out:
-	read_unlock(&tasklist_lock);
-
-	return found;
-}
-
-static int
 match(const struct sk_buff *skb,
       const struct net_device *in,
       const struct net_device *out,
@@ -145,24 +45,6 @@ match(const struct sk_buff *skb,
 			return 0;
 	}
 
-	if(info->match & IPT_OWNER_PID) {
-		if (!match_pid(skb, info->pid) ^
-		    !!(info->invert & IPT_OWNER_PID))
-			return 0;
-	}
-
-	if(info->match & IPT_OWNER_SID) {
-		if (!match_sid(skb, info->sid) ^
-		    !!(info->invert & IPT_OWNER_SID))
-			return 0;
-	}
-
-	if(info->match & IPT_OWNER_COMM) {
-		if (!match_comm(skb, info->comm) ^
-		    !!(info->invert & IPT_OWNER_COMM))
-			return 0;
-	}
-
 	return 1;
 }
 
@@ -173,6 +55,8 @@ checkentry(const char *tablename,
            unsigned int matchsize,
            unsigned int hook_mask)
 {
+	const struct ipt_owner_info *info = matchinfo;
+
         if (hook_mask
             & ~((1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_POST_ROUTING))) {
                 printk("ipt_owner: only valid for LOCAL_OUT or POST_ROUTING.\n");
@@ -184,15 +68,13 @@ checkentry(const char *tablename,
 		       IPT_ALIGN(sizeof(struct ipt_owner_info)));
 		return 0;
 	}
-#ifdef CONFIG_SMP
-	/* files->file_lock can not be used in a BH */
-	if (((struct ipt_owner_info *)matchinfo)->match
-	    & (IPT_OWNER_PID|IPT_OWNER_SID|IPT_OWNER_COMM)) {
-		printk("ipt_owner: pid, sid and command matching is broken "
-		       "on SMP.\n");
+
+	if (info->match & (IPT_OWNER_PID|IPT_OWNER_SID|IPT_OWNER_COMM)) {
+		printk("ipt_owner: pid, sid and command matching "
+		       "not supported anymore\n");
 		return 0;
 	}
-#endif
+
 	return 1;
 }
 
diff --git a/net/ipv4/netfilter/ipt_string.c b/net/ipv4/netfilter/ipt_string.c
new file mode 100644
index 00000000000..b5def204d79
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_string.c
@@ -0,0 +1,91 @@
+/* String matching match for iptables
+ * 
+ * (C) 2005 Pablo Neira Ayuso <pablo@eurodev.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_string.h>
+#include <linux/textsearch.h>
+
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@eurodev.net>");
+MODULE_DESCRIPTION("IP tables string match module");
+MODULE_LICENSE("GPL");
+
+static int match(const struct sk_buff *skb,
+		 const struct net_device *in,
+		 const struct net_device *out,
+		 const void *matchinfo,
+		 int offset,
+		 int *hotdrop)
+{
+	struct ts_state state;
+	struct ipt_string_info *conf = (struct ipt_string_info *) matchinfo;
+
+	memset(&state, 0, sizeof(struct ts_state));
+
+	return (skb_find_text((struct sk_buff *)skb, conf->from_offset, 
+			     conf->to_offset, conf->config, &state) 
+			     != UINT_MAX) && !conf->invert;
+}
+
+#define STRING_TEXT_PRIV(m) ((struct ipt_string_info *) m)
+
+static int checkentry(const char *tablename,
+		      const struct ipt_ip *ip,
+		      void *matchinfo,
+		      unsigned int matchsize,
+		      unsigned int hook_mask)
+{
+	struct ipt_string_info *conf = matchinfo;
+	struct ts_config *ts_conf;
+
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_string_info)))
+		return 0;
+
+	/* Damn, can't handle this case properly with iptables... */
+	if (conf->from_offset > conf->to_offset)
+		return 0;
+
+	ts_conf = textsearch_prepare(conf->algo, conf->pattern, conf->patlen,
+				     GFP_KERNEL, TS_AUTOLOAD);
+	if (IS_ERR(ts_conf))
+		return 0;
+
+	conf->config = ts_conf;
+
+	return 1;
+}
+
+static void destroy(void *matchinfo, unsigned int matchsize)
+{
+	textsearch_destroy(STRING_TEXT_PRIV(matchinfo)->config);
+}
+
+static struct ipt_match string_match = {
+	.name 		= "string",
+	.match 		= match,
+	.checkentry	= checkentry,
+	.destroy 	= destroy,
+	.me 		= THIS_MODULE
+};
+
+static int __init init(void)
+{
+	return ipt_register_match(&string_match);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&string_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 912bbcc7f41..f7943ba1f43 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -59,13 +59,10 @@ static int fold_prot_inuse(struct proto *proto)
  */
 static int sockstat_seq_show(struct seq_file *seq, void *v)
 {
-	/* From net/socket.c */
-	extern void socket_seq_show(struct seq_file *seq);
-
 	socket_seq_show(seq);
 	seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n",
 		   fold_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count),
-		   tcp_tw_count, atomic_read(&tcp_sockets_allocated),
+		   tcp_death_row.tw_count, atomic_read(&tcp_sockets_allocated),
 		   atomic_read(&tcp_memory_allocated));
 	seq_printf(seq, "UDP: inuse %d\n", fold_prot_inuse(&udp_prot));
 	seq_printf(seq, "RAW: inuse %d\n", fold_prot_inuse(&raw_prot));
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index 0db405a869f..291831e792a 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -40,7 +40,6 @@
 #include <linux/timer.h>
 #include <net/ip.h>
 #include <net/protocol.h>
-#include <net/tcp.h>
 #include <linux/skbuff.h>
 #include <net/sock.h>
 #include <net/icmp.h>
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index d1835b1bc8c..304bb0a1d4f 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -59,7 +59,6 @@
 #include <linux/netdevice.h>
 #include <linux/in_route.h>
 #include <linux/route.h>
-#include <linux/tcp.h>
 #include <linux/skbuff.h>
 #include <net/dst.h>
 #include <net/sock.h>
@@ -71,6 +70,7 @@
 #include <net/udp.h>
 #include <net/raw.h>
 #include <net/snmp.h>
+#include <net/tcp_states.h>
 #include <net/inet_common.h>
 #include <net/checksum.h>
 #include <net/xfrm.h>
@@ -150,10 +150,11 @@ static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb)
  * RFC 1122: SHOULD pass TOS value up to the transport layer.
  * -> It does. And not only TOS, but all IP header.
  */
-void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
+int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
 {
 	struct sock *sk;
 	struct hlist_head *head;
+	int delivered = 0;
 
 	read_lock(&raw_v4_lock);
 	head = &raw_v4_htable[hash];
@@ -164,6 +165,7 @@ void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
 			     skb->dev->ifindex);
 
 	while (sk) {
+		delivered = 1;
 		if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) {
 			struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
 
@@ -177,6 +179,7 @@ void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
 	}
 out:
 	read_unlock(&raw_v4_lock);
+	return delivered;
 }
 
 void raw_err (struct sock *sk, struct sk_buff *skb, u32 info)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d675ff80b04..8c0b14e3bee 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -240,7 +240,9 @@ static unsigned			rt_hash_mask;
 static int			rt_hash_log;
 static unsigned int		rt_hash_rnd;
 
-struct rt_cache_stat *rt_cache_stat;
+static struct rt_cache_stat *rt_cache_stat;
+#define RT_CACHE_STAT_INC(field)					  \
+		(per_cpu_ptr(rt_cache_stat, raw_smp_processor_id())->field++)
 
 static int rt_intern_hash(unsigned hash, struct rtable *rth,
 				struct rtable **res);
@@ -2600,6 +2602,8 @@ int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
 	return ip_route_output_slow(rp, flp);
 }
 
+EXPORT_SYMBOL_GPL(__ip_route_output_key);
+
 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
 {
 	int err;
@@ -2618,6 +2622,8 @@ int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk,
 	return 0;
 }
 
+EXPORT_SYMBOL_GPL(ip_route_output_flow);
+
 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
 {
 	return ip_route_output_flow(rp, flp, NULL, 0);
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 72d01444218..a34e60ea48a 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -169,8 +169,6 @@ static inline int cookie_check(struct sk_buff *skb, __u32 cookie)
 	return mssind < NUM_MSS ? msstab[mssind] + 1 : 0;
 }
 
-extern struct request_sock_ops tcp_request_sock_ops;
-
 static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
 					   struct request_sock *req,
 					   struct dst_entry *dst)
@@ -180,7 +178,7 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
 
 	child = tp->af_specific->syn_recv_sock(sk, skb, req, dst);
 	if (child)
-		tcp_acceptq_queue(sk, req, child);
+		inet_csk_reqsk_queue_add(sk, req, child);
 	else
 		reqsk_free(req);
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index e3289453241..65268562351 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -11,7 +11,9 @@
 #include <linux/module.h>
 #include <linux/sysctl.h>
 #include <linux/config.h>
+#include <linux/igmp.h>
 #include <net/snmp.h>
+#include <net/icmp.h>
 #include <net/ip.h>
 #include <net/route.h>
 #include <net/tcp.h>
@@ -19,36 +21,6 @@
 /* From af_inet.c */
 extern int sysctl_ip_nonlocal_bind;
 
-/* From icmp.c */
-extern int sysctl_icmp_echo_ignore_all;
-extern int sysctl_icmp_echo_ignore_broadcasts;
-extern int sysctl_icmp_ignore_bogus_error_responses;
-extern int sysctl_icmp_errors_use_inbound_ifaddr;
-
-/* From ip_fragment.c */
-extern int sysctl_ipfrag_low_thresh;
-extern int sysctl_ipfrag_high_thresh; 
-extern int sysctl_ipfrag_time;
-extern int sysctl_ipfrag_secret_interval;
-
-/* From ip_output.c */
-extern int sysctl_ip_dynaddr;
-
-/* From icmp.c */
-extern int sysctl_icmp_ratelimit;
-extern int sysctl_icmp_ratemask;
-
-/* From igmp.c */
-extern int sysctl_igmp_max_memberships;
-extern int sysctl_igmp_max_msf;
-
-/* From inetpeer.c */
-extern int inet_peer_threshold;
-extern int inet_peer_minttl;
-extern int inet_peer_maxttl;
-extern int inet_peer_gc_mintime;
-extern int inet_peer_gc_maxtime;
-
 #ifdef CONFIG_SYSCTL
 static int tcp_retr1_max = 255; 
 static int ip_local_port_range_min[] = { 1, 1 };
@@ -57,8 +29,6 @@ static int ip_local_port_range_max[] = { 65535, 65535 };
 
 struct ipv4_config ipv4_config;
 
-extern ctl_table ipv4_route_table[];
-
 #ifdef CONFIG_SYSCTL
 
 static
@@ -136,10 +106,11 @@ static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file *
 	return ret;
 }
 
-int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, int nlen,
-				  void __user *oldval, size_t __user *oldlenp,
-				  void __user *newval, size_t newlen,
-				  void **context)
+static int sysctl_tcp_congestion_control(ctl_table *table, int __user *name,
+					 int nlen, void __user *oldval,
+					 size_t __user *oldlenp,
+					 void __user *newval, size_t newlen,
+					 void **context)
 {
 	char val[TCP_CA_NAME_MAX];
 	ctl_table tbl = {
@@ -259,7 +230,7 @@ ctl_table ipv4_table[] = {
 	{
 		.ctl_name	= NET_TCP_MAX_TW_BUCKETS,
 		.procname	= "tcp_max_tw_buckets",
-		.data		= &sysctl_tcp_max_tw_buckets,
+		.data		= &tcp_death_row.sysctl_max_tw_buckets,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec
@@ -363,7 +334,7 @@ ctl_table ipv4_table[] = {
 	{
 		.ctl_name	= NET_TCP_TW_RECYCLE,
 		.procname	= "tcp_tw_recycle",
-		.data		= &sysctl_tcp_tw_recycle,
+		.data		= &tcp_death_row.sysctl_tw_recycle,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 69b1fcf7007..02fdda68718 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -269,13 +269,12 @@
 
 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
 
-DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
-
-kmem_cache_t *tcp_bucket_cachep;
-kmem_cache_t *tcp_timewait_cachep;
+DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics) __read_mostly;
 
 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
 
+EXPORT_SYMBOL_GPL(tcp_orphan_count);
+
 int sysctl_tcp_mem[3];
 int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
 int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
@@ -311,15 +310,6 @@ void tcp_enter_memory_pressure(void)
 EXPORT_SYMBOL(tcp_enter_memory_pressure);
 
 /*
- * LISTEN is a special case for poll..
- */
-static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
-					       poll_table *wait)
-{
-	return !reqsk_queue_empty(&tcp_sk(sk)->accept_queue) ? (POLLIN | POLLRDNORM) : 0;
-}
-
-/*
  *	Wait for a TCP event.
  *
  *	Note that we don't need to lock the socket, as the upper poll layers
@@ -334,7 +324,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 
 	poll_wait(file, sk->sk_sleep, wait);
 	if (sk->sk_state == TCP_LISTEN)
-		return tcp_listen_poll(sk, wait);
+		return inet_csk_listen_poll(sk);
 
 	/* Socket is not locked. We are protected from async events
 	   by poll logic and correct handling of state changes
@@ -457,109 +447,6 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 	return put_user(answ, (int __user *)arg);
 }
 
-
-int tcp_listen_start(struct sock *sk)
-{
-	struct inet_sock *inet = inet_sk(sk);
-	struct tcp_sock *tp = tcp_sk(sk);
-	int rc = reqsk_queue_alloc(&tp->accept_queue, TCP_SYNQ_HSIZE);
-
-	if (rc != 0)
-		return rc;
-
-	sk->sk_max_ack_backlog = 0;
-	sk->sk_ack_backlog = 0;
-	tcp_delack_init(tp);
-
-	/* There is race window here: we announce ourselves listening,
-	 * but this transition is still not validated by get_port().
-	 * It is OK, because this socket enters to hash table only
-	 * after validation is complete.
-	 */
-	sk->sk_state = TCP_LISTEN;
-	if (!sk->sk_prot->get_port(sk, inet->num)) {
-		inet->sport = htons(inet->num);
-
-		sk_dst_reset(sk);
-		sk->sk_prot->hash(sk);
-
-		return 0;
-	}
-
-	sk->sk_state = TCP_CLOSE;
-	reqsk_queue_destroy(&tp->accept_queue);
-	return -EADDRINUSE;
-}
-
-/*
- *	This routine closes sockets which have been at least partially
- *	opened, but not yet accepted.
- */
-
-static void tcp_listen_stop (struct sock *sk)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct listen_sock *lopt;
-	struct request_sock *acc_req;
-	struct request_sock *req;
-	int i;
-
-	tcp_delete_keepalive_timer(sk);
-
-	/* make all the listen_opt local to us */
-	lopt = reqsk_queue_yank_listen_sk(&tp->accept_queue);
-	acc_req = reqsk_queue_yank_acceptq(&tp->accept_queue);
-
-	if (lopt->qlen) {
-		for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
-			while ((req = lopt->syn_table[i]) != NULL) {
-				lopt->syn_table[i] = req->dl_next;
-				lopt->qlen--;
-				reqsk_free(req);
-
-		/* Following specs, it would be better either to send FIN
-		 * (and enter FIN-WAIT-1, it is normal close)
-		 * or to send active reset (abort).
-		 * Certainly, it is pretty dangerous while synflood, but it is
-		 * bad justification for our negligence 8)
-		 * To be honest, we are not able to make either
-		 * of the variants now.			--ANK
-		 */
-			}
-		}
-	}
-	BUG_TRAP(!lopt->qlen);
-
-	kfree(lopt);
-
-	while ((req = acc_req) != NULL) {
-		struct sock *child = req->sk;
-
-		acc_req = req->dl_next;
-
-		local_bh_disable();
-		bh_lock_sock(child);
-		BUG_TRAP(!sock_owned_by_user(child));
-		sock_hold(child);
-
-		tcp_disconnect(child, O_NONBLOCK);
-
-		sock_orphan(child);
-
-		atomic_inc(&tcp_orphan_count);
-
-		tcp_destroy_sock(child);
-
-		bh_unlock_sock(child);
-		local_bh_enable();
-		sock_put(child);
-
-		sk_acceptq_removed(sk);
-		__reqsk_free(req);
-	}
-	BUG_TRAP(!sk->sk_ack_backlog);
-}
-
 static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
 {
 	TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
@@ -975,7 +862,7 @@ do_fault:
 	if (!skb->len) {
 		if (sk->sk_send_head == skb)
 			sk->sk_send_head = NULL;
-		__skb_unlink(skb, skb->list);
+		__skb_unlink(skb, &sk->sk_write_queue);
 		sk_stream_free_skb(sk, skb);
 	}
 
@@ -1057,20 +944,21 @@ static void cleanup_rbuf(struct sock *sk, int copied)
 	BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
 #endif
 
-	if (tcp_ack_scheduled(tp)) {
+	if (inet_csk_ack_scheduled(sk)) {
+		const struct inet_connection_sock *icsk = inet_csk(sk);
 		   /* Delayed ACKs frequently hit locked sockets during bulk
 		    * receive. */
-		if (tp->ack.blocked ||
+		if (icsk->icsk_ack.blocked ||
 		    /* Once-per-two-segments ACK was not sent by tcp_input.c */
-		    tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
+		    tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
 		    /*
 		     * If this read emptied read buffer, we send ACK, if
 		     * connection is not bidirectional, user drained
 		     * receive buffer and there was a small segment
 		     * in queue.
 		     */
-		    (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
-		     !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
+		    (copied > 0 && (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
+		     !icsk->icsk_ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
 			time_to_ack = 1;
 	}
 
@@ -1572,40 +1460,6 @@ void tcp_shutdown(struct sock *sk, int how)
 	}
 }
 
-/*
- * At this point, there should be no process reference to this
- * socket, and thus no user references at all.  Therefore we
- * can assume the socket waitqueue is inactive and nobody will
- * try to jump onto it.
- */
-void tcp_destroy_sock(struct sock *sk)
-{
-	BUG_TRAP(sk->sk_state == TCP_CLOSE);
-	BUG_TRAP(sock_flag(sk, SOCK_DEAD));
-
-	/* It cannot be in hash table! */
-	BUG_TRAP(sk_unhashed(sk));
-
-	/* If it has not 0 inet_sk(sk)->num, it must be bound */
-	BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
-
-	sk->sk_prot->destroy(sk);
-
-	sk_stream_kill_queues(sk);
-
-	xfrm_sk_free_policy(sk);
-
-#ifdef INET_REFCNT_DEBUG
-	if (atomic_read(&sk->sk_refcnt) != 1) {
-		printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
-		       sk, atomic_read(&sk->sk_refcnt));
-	}
-#endif
-
-	atomic_dec(&tcp_orphan_count);
-	sock_put(sk);
-}
-
 void tcp_close(struct sock *sk, long timeout)
 {
 	struct sk_buff *skb;
@@ -1618,7 +1472,7 @@ void tcp_close(struct sock *sk, long timeout)
 		tcp_set_state(sk, TCP_CLOSE);
 
 		/* Special case. */
-		tcp_listen_stop(sk);
+		inet_csk_listen_stop(sk);
 
 		goto adjudge_to_death;
 	}
@@ -1721,12 +1575,12 @@ adjudge_to_death:
 			tcp_send_active_reset(sk, GFP_ATOMIC);
 			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
 		} else {
-			int tmo = tcp_fin_time(tp);
+			const int tmo = tcp_fin_time(sk);
 
 			if (tmo > TCP_TIMEWAIT_LEN) {
-				tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
+				inet_csk_reset_keepalive_timer(sk, tcp_fin_time(sk));
 			} else {
-				atomic_inc(&tcp_orphan_count);
+				atomic_inc(sk->sk_prot->orphan_count);
 				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
 				goto out;
 			}
@@ -1734,7 +1588,7 @@ adjudge_to_death:
 	}
 	if (sk->sk_state != TCP_CLOSE) {
 		sk_stream_mem_reclaim(sk);
-		if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
+		if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans ||
 		    (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
 		     atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
 			if (net_ratelimit())
@@ -1745,10 +1599,10 @@ adjudge_to_death:
 			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
 		}
 	}
-	atomic_inc(&tcp_orphan_count);
+	atomic_inc(sk->sk_prot->orphan_count);
 
 	if (sk->sk_state == TCP_CLOSE)
-		tcp_destroy_sock(sk);
+		inet_csk_destroy_sock(sk);
 	/* Otherwise, socket is reprieved until protocol close. */
 
 out:
@@ -1769,6 +1623,7 @@ static inline int tcp_need_reset(int state)
 int tcp_disconnect(struct sock *sk, int flags)
 {
 	struct inet_sock *inet = inet_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	int err = 0;
 	int old_state = sk->sk_state;
@@ -1778,7 +1633,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 
 	/* ABORT function of RFC793 */
 	if (old_state == TCP_LISTEN) {
-		tcp_listen_stop(sk);
+		inet_csk_listen_stop(sk);
 	} else if (tcp_need_reset(old_state) ||
 		   (tp->snd_nxt != tp->write_seq &&
 		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
@@ -1805,125 +1660,34 @@ int tcp_disconnect(struct sock *sk, int flags)
 	tp->srtt = 0;
 	if ((tp->write_seq += tp->max_window + 2) == 0)
 		tp->write_seq = 1;
-	tp->backoff = 0;
+	icsk->icsk_backoff = 0;
 	tp->snd_cwnd = 2;
-	tp->probes_out = 0;
+	icsk->icsk_probes_out = 0;
 	tp->packets_out = 0;
 	tp->snd_ssthresh = 0x7fffffff;
 	tp->snd_cwnd_cnt = 0;
-	tcp_set_ca_state(tp, TCP_CA_Open);
+	tcp_set_ca_state(sk, TCP_CA_Open);
 	tcp_clear_retrans(tp);
-	tcp_delack_init(tp);
+	inet_csk_delack_init(sk);
 	sk->sk_send_head = NULL;
 	tp->rx_opt.saw_tstamp = 0;
 	tcp_sack_reset(&tp->rx_opt);
 	__sk_dst_reset(sk);
 
-	BUG_TRAP(!inet->num || tp->bind_hash);
+	BUG_TRAP(!inet->num || icsk->icsk_bind_hash);
 
 	sk->sk_error_report(sk);
 	return err;
 }
 
 /*
- *	Wait for an incoming connection, avoid race
- *	conditions. This must be called with the socket locked.
- */
-static int wait_for_connect(struct sock *sk, long timeo)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	DEFINE_WAIT(wait);
-	int err;
-
-	/*
-	 * True wake-one mechanism for incoming connections: only
-	 * one process gets woken up, not the 'whole herd'.
-	 * Since we do not 'race & poll' for established sockets
-	 * anymore, the common case will execute the loop only once.
-	 *
-	 * Subtle issue: "add_wait_queue_exclusive()" will be added
-	 * after any current non-exclusive waiters, and we know that
-	 * it will always _stay_ after any new non-exclusive waiters
-	 * because all non-exclusive waiters are added at the
-	 * beginning of the wait-queue. As such, it's ok to "drop"
-	 * our exclusiveness temporarily when we get woken up without
-	 * having to remove and re-insert us on the wait queue.
-	 */
-	for (;;) {
-		prepare_to_wait_exclusive(sk->sk_sleep, &wait,
-					  TASK_INTERRUPTIBLE);
-		release_sock(sk);
-		if (reqsk_queue_empty(&tp->accept_queue))
-			timeo = schedule_timeout(timeo);
-		lock_sock(sk);
-		err = 0;
-		if (!reqsk_queue_empty(&tp->accept_queue))
-			break;
-		err = -EINVAL;
-		if (sk->sk_state != TCP_LISTEN)
-			break;
-		err = sock_intr_errno(timeo);
-		if (signal_pending(current))
-			break;
-		err = -EAGAIN;
-		if (!timeo)
-			break;
-	}
-	finish_wait(sk->sk_sleep, &wait);
-	return err;
-}
-
-/*
- *	This will accept the next outstanding connection.
- */
-
-struct sock *tcp_accept(struct sock *sk, int flags, int *err)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct sock *newsk;
-	int error;
-
-	lock_sock(sk);
-
-	/* We need to make sure that this socket is listening,
-	 * and that it has something pending.
-	 */
-	error = -EINVAL;
-	if (sk->sk_state != TCP_LISTEN)
-		goto out_err;
-
-	/* Find already established connection */
-	if (reqsk_queue_empty(&tp->accept_queue)) {
-		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
-
-		/* If this is a non blocking socket don't sleep */
-		error = -EAGAIN;
-		if (!timeo)
-			goto out_err;
-
-		error = wait_for_connect(sk, timeo);
-		if (error)
-			goto out_err;
-	}
-
-	newsk = reqsk_queue_get_child(&tp->accept_queue, sk);
-	BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
-out:
-	release_sock(sk);
-	return newsk;
-out_err:
-	newsk = NULL;
-	*err = error;
-	goto out;
-}
-
-/*
  *	Socket option code for TCP.
  */
 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
 		   int optlen)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	int val;
 	int err = 0;
 
@@ -1945,7 +1709,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
 		name[val] = 0;
 
 		lock_sock(sk);
-		err = tcp_set_congestion_control(tp, name);
+		err = tcp_set_congestion_control(sk, name);
 		release_sock(sk);
 		return err;
 	}
@@ -2022,7 +1786,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
 					elapsed = tp->keepalive_time - elapsed;
 				else
 					elapsed = 0;
-				tcp_reset_keepalive_timer(sk, elapsed);
+				inet_csk_reset_keepalive_timer(sk, elapsed);
 			}
 		}
 		break;
@@ -2042,7 +1806,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
 		if (val < 1 || val > MAX_TCP_SYNCNT)
 			err = -EINVAL;
 		else
-			tp->syn_retries = val;
+			icsk->icsk_syn_retries = val;
 		break;
 
 	case TCP_LINGER2:
@@ -2055,15 +1819,15 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
 		break;
 
 	case TCP_DEFER_ACCEPT:
-		tp->defer_accept = 0;
+		icsk->icsk_accept_queue.rskq_defer_accept = 0;
 		if (val > 0) {
 			/* Translate value in seconds to number of
 			 * retransmits */
-			while (tp->defer_accept < 32 &&
+			while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
 			       val > ((TCP_TIMEOUT_INIT / HZ) <<
-				       tp->defer_accept))
-				tp->defer_accept++;
-			tp->defer_accept++;
+				       icsk->icsk_accept_queue.rskq_defer_accept))
+				icsk->icsk_accept_queue.rskq_defer_accept++;
+			icsk->icsk_accept_queue.rskq_defer_accept++;
 		}
 		break;
 
@@ -2081,16 +1845,16 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
 
 	case TCP_QUICKACK:
 		if (!val) {
-			tp->ack.pingpong = 1;
+			icsk->icsk_ack.pingpong = 1;
 		} else {
-			tp->ack.pingpong = 0;
+			icsk->icsk_ack.pingpong = 0;
 			if ((1 << sk->sk_state) &
 			    (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
-			    tcp_ack_scheduled(tp)) {
-				tp->ack.pending |= TCP_ACK_PUSHED;
+			    inet_csk_ack_scheduled(sk)) {
+				icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
 				cleanup_rbuf(sk, 1);
 				if (!(val & 1))
-					tp->ack.pingpong = 1;
+					icsk->icsk_ack.pingpong = 1;
 			}
 		}
 		break;
@@ -2107,15 +1871,16 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
 void tcp_get_info(struct sock *sk, struct tcp_info *info)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
 	u32 now = tcp_time_stamp;
 
 	memset(info, 0, sizeof(*info));
 
 	info->tcpi_state = sk->sk_state;
-	info->tcpi_ca_state = tp->ca_state;
-	info->tcpi_retransmits = tp->retransmits;
-	info->tcpi_probes = tp->probes_out;
-	info->tcpi_backoff = tp->backoff;
+	info->tcpi_ca_state = icsk->icsk_ca_state;
+	info->tcpi_retransmits = icsk->icsk_retransmits;
+	info->tcpi_probes = icsk->icsk_probes_out;
+	info->tcpi_backoff = icsk->icsk_backoff;
 
 	if (tp->rx_opt.tstamp_ok)
 		info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
@@ -2130,10 +1895,10 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	if (tp->ecn_flags&TCP_ECN_OK)
 		info->tcpi_options |= TCPI_OPT_ECN;
 
-	info->tcpi_rto = jiffies_to_usecs(tp->rto);
-	info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
+	info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
+	info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
 	info->tcpi_snd_mss = tp->mss_cache;
-	info->tcpi_rcv_mss = tp->ack.rcv_mss;
+	info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
 
 	info->tcpi_unacked = tp->packets_out;
 	info->tcpi_sacked = tp->sacked_out;
@@ -2142,7 +1907,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	info->tcpi_fackets = tp->fackets_out;
 
 	info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
-	info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime);
+	info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
 	info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
 
 	info->tcpi_pmtu = tp->pmtu_cookie;
@@ -2165,6 +1930,7 @@ EXPORT_SYMBOL_GPL(tcp_get_info);
 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
 		   int __user *optlen)
 {
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	int val, len;
 
@@ -2202,7 +1968,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
 		val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
 		break;
 	case TCP_SYNCNT:
-		val = tp->syn_retries ? : sysctl_tcp_syn_retries;
+		val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
 		break;
 	case TCP_LINGER2:
 		val = tp->linger2;
@@ -2210,8 +1976,8 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
 			val = (val ? : sysctl_tcp_fin_timeout) / HZ;
 		break;
 	case TCP_DEFER_ACCEPT:
-		val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
-					       (tp->defer_accept - 1));
+		val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 :
+			((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1));
 		break;
 	case TCP_WINDOW_CLAMP:
 		val = tp->window_clamp;
@@ -2232,7 +1998,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
 		return 0;
 	}
 	case TCP_QUICKACK:
-		val = !tp->ack.pingpong;
+		val = !icsk->icsk_ack.pingpong;
 		break;
 
 	case TCP_CONGESTION:
@@ -2241,7 +2007,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
 		len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
 		if (put_user(len, optlen))
 			return -EFAULT;
-		if (copy_to_user(optval, tp->ca_ops->name, len))
+		if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
 			return -EFAULT;
 		return 0;
 	default:
@@ -2278,79 +2044,72 @@ void __init tcp_init(void)
 		__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
 					   sizeof(skb->cb));
 
-	tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
-					      sizeof(struct tcp_bind_bucket),
-					      0, SLAB_HWCACHE_ALIGN,
-					      NULL, NULL);
-	if (!tcp_bucket_cachep)
+	tcp_hashinfo.bind_bucket_cachep =
+		kmem_cache_create("tcp_bind_bucket",
+				  sizeof(struct inet_bind_bucket), 0,
+				  SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (!tcp_hashinfo.bind_bucket_cachep)
 		panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
 
-	tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
-						sizeof(struct tcp_tw_bucket),
-						0, SLAB_HWCACHE_ALIGN,
-						NULL, NULL);
-	if (!tcp_timewait_cachep)
-		panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
-
 	/* Size and allocate the main established and bind bucket
 	 * hash tables.
 	 *
 	 * The methodology is similar to that of the buffer cache.
 	 */
-	tcp_ehash = (struct tcp_ehash_bucket *)
+	tcp_hashinfo.ehash =
 		alloc_large_system_hash("TCP established",
-					sizeof(struct tcp_ehash_bucket),
+					sizeof(struct inet_ehash_bucket),
 					thash_entries,
 					(num_physpages >= 128 * 1024) ?
 						(25 - PAGE_SHIFT) :
 						(27 - PAGE_SHIFT),
 					HASH_HIGHMEM,
-					&tcp_ehash_size,
+					&tcp_hashinfo.ehash_size,
 					NULL,
 					0);
-	tcp_ehash_size = (1 << tcp_ehash_size) >> 1;
-	for (i = 0; i < (tcp_ehash_size << 1); i++) {
-		rwlock_init(&tcp_ehash[i].lock);
-		INIT_HLIST_HEAD(&tcp_ehash[i].chain);
+	tcp_hashinfo.ehash_size = (1 << tcp_hashinfo.ehash_size) >> 1;
+	for (i = 0; i < (tcp_hashinfo.ehash_size << 1); i++) {
+		rwlock_init(&tcp_hashinfo.ehash[i].lock);
+		INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
 	}
 
-	tcp_bhash = (struct tcp_bind_hashbucket *)
+	tcp_hashinfo.bhash =
 		alloc_large_system_hash("TCP bind",
-					sizeof(struct tcp_bind_hashbucket),
-					tcp_ehash_size,
+					sizeof(struct inet_bind_hashbucket),
+					tcp_hashinfo.ehash_size,
 					(num_physpages >= 128 * 1024) ?
 						(25 - PAGE_SHIFT) :
 						(27 - PAGE_SHIFT),
 					HASH_HIGHMEM,
-					&tcp_bhash_size,
+					&tcp_hashinfo.bhash_size,
 					NULL,
 					64 * 1024);
-	tcp_bhash_size = 1 << tcp_bhash_size;
-	for (i = 0; i < tcp_bhash_size; i++) {
-		spin_lock_init(&tcp_bhash[i].lock);
-		INIT_HLIST_HEAD(&tcp_bhash[i].chain);
+	tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
+	for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
+		spin_lock_init(&tcp_hashinfo.bhash[i].lock);
+		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
 	}
 
 	/* Try to be a bit smarter and adjust defaults depending
 	 * on available memory.
 	 */
 	for (order = 0; ((1 << order) << PAGE_SHIFT) <
-			(tcp_bhash_size * sizeof(struct tcp_bind_hashbucket));
+			(tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket));
 			order++)
 		;
 	if (order >= 4) {
 		sysctl_local_port_range[0] = 32768;
 		sysctl_local_port_range[1] = 61000;
-		sysctl_tcp_max_tw_buckets = 180000;
+		tcp_death_row.sysctl_max_tw_buckets = 180000;
 		sysctl_tcp_max_orphans = 4096 << (order - 4);
 		sysctl_max_syn_backlog = 1024;
 	} else if (order < 3) {
 		sysctl_local_port_range[0] = 1024 * (3 - order);
-		sysctl_tcp_max_tw_buckets >>= (3 - order);
+		tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
 		sysctl_tcp_max_orphans >>= (3 - order);
 		sysctl_max_syn_backlog = 128;
 	}
-	tcp_port_rover = sysctl_local_port_range[0] - 1;
+	tcp_hashinfo.port_rover = sysctl_local_port_range[0] - 1;
 
 	sysctl_tcp_mem[0] =  768 << order;
 	sysctl_tcp_mem[1] = 1024 << order;
@@ -2365,14 +2124,12 @@ void __init tcp_init(void)
 
 	printk(KERN_INFO "TCP: Hash tables configured "
 	       "(established %d bind %d)\n",
-	       tcp_ehash_size << 1, tcp_bhash_size);
+	       tcp_hashinfo.ehash_size << 1, tcp_hashinfo.bhash_size);
 
 	tcp_register_congestion_control(&tcp_reno);
 }
 
-EXPORT_SYMBOL(tcp_accept);
 EXPORT_SYMBOL(tcp_close);
-EXPORT_SYMBOL(tcp_destroy_sock);
 EXPORT_SYMBOL(tcp_disconnect);
 EXPORT_SYMBOL(tcp_getsockopt);
 EXPORT_SYMBOL(tcp_ioctl);
@@ -2384,4 +2141,3 @@ EXPORT_SYMBOL(tcp_sendpage);
 EXPORT_SYMBOL(tcp_setsockopt);
 EXPORT_SYMBOL(tcp_shutdown);
 EXPORT_SYMBOL(tcp_statistics);
-EXPORT_SYMBOL(tcp_timewait_cachep);
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index ec38d45d664..b940346de4e 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -86,11 +86,11 @@ static inline void bictcp_reset(struct bictcp *ca)
 	ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
 }
 
-static void bictcp_init(struct tcp_sock *tp)
+static void bictcp_init(struct sock *sk)
 {
-	bictcp_reset(tcp_ca(tp));
+	bictcp_reset(inet_csk_ca(sk));
 	if (initial_ssthresh)
-		tp->snd_ssthresh = initial_ssthresh;
+		tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
 }
 
 /*
@@ -156,9 +156,10 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
 
 
 /* Detect low utilization in congestion avoidance */
-static inline void bictcp_low_utilization(struct tcp_sock *tp, int flag)
+static inline void bictcp_low_utilization(struct sock *sk, int flag)
 {
-	struct bictcp *ca = tcp_ca(tp);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct bictcp *ca = inet_csk_ca(sk);
 	u32 dist, delay;
 
 	/* No time stamp */
@@ -208,12 +209,13 @@ static inline void bictcp_low_utilization(struct tcp_sock *tp, int flag)
 
 }
 
-static void bictcp_cong_avoid(struct tcp_sock *tp, u32 ack,
+static void bictcp_cong_avoid(struct sock *sk, u32 ack,
 			      u32 seq_rtt, u32 in_flight, int data_acked)
 {
-	struct bictcp *ca = tcp_ca(tp);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bictcp *ca = inet_csk_ca(sk);
 
-	bictcp_low_utilization(tp, data_acked);
+	bictcp_low_utilization(sk, data_acked);
 
 	if (in_flight < tp->snd_cwnd)
 		return;
@@ -242,9 +244,10 @@ static void bictcp_cong_avoid(struct tcp_sock *tp, u32 ack,
  *	behave like Reno until low_window is reached,
  *	then increase congestion window slowly
  */
-static u32 bictcp_recalc_ssthresh(struct tcp_sock *tp)
+static u32 bictcp_recalc_ssthresh(struct sock *sk)
 {
-	struct bictcp *ca = tcp_ca(tp);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct bictcp *ca = inet_csk_ca(sk);
 
 	ca->epoch_start = 0;	/* end of epoch */
 
@@ -269,31 +272,34 @@ static u32 bictcp_recalc_ssthresh(struct tcp_sock *tp)
 		return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
 }
 
-static u32 bictcp_undo_cwnd(struct tcp_sock *tp)
+static u32 bictcp_undo_cwnd(struct sock *sk)
 {
-	struct bictcp *ca = tcp_ca(tp);
-
+	const struct tcp_sock *tp = tcp_sk(sk);
+	const struct bictcp *ca = inet_csk_ca(sk);
 	return max(tp->snd_cwnd, ca->last_max_cwnd);
 }
 
-static u32 bictcp_min_cwnd(struct tcp_sock *tp)
+static u32 bictcp_min_cwnd(struct sock *sk)
 {
+	const struct tcp_sock *tp = tcp_sk(sk);
 	return tp->snd_ssthresh;
 }
 
-static void bictcp_state(struct tcp_sock *tp, u8 new_state)
+static void bictcp_state(struct sock *sk, u8 new_state)
 {
 	if (new_state == TCP_CA_Loss)
-		bictcp_reset(tcp_ca(tp));
+		bictcp_reset(inet_csk_ca(sk));
 }
 
 /* Track delayed acknowledgement ratio using sliding window
  * ratio = (15*ratio + sample) / 16
  */
-static void bictcp_acked(struct tcp_sock *tp, u32 cnt)
+static void bictcp_acked(struct sock *sk, u32 cnt)
 {
-	if (cnt > 0 && 	tp->ca_state == TCP_CA_Open) {
-		struct bictcp *ca = tcp_ca(tp);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+
+	if (cnt > 0 && 	icsk->icsk_ca_state == TCP_CA_Open) {
+		struct bictcp *ca = inet_csk_ca(sk);
 		cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
 		ca->delayed_ack += cnt;
 	}
@@ -314,7 +320,7 @@ static struct tcp_congestion_ops bictcp = {
 
 static int __init bictcp_register(void)
 {
-	BUG_ON(sizeof(struct bictcp) > TCP_CA_PRIV_SIZE);
+	BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE);
 	return tcp_register_congestion_control(&bictcp);
 }
 
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 4970d10a778..bbf2d6624e8 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -73,33 +73,36 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
 EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
 
 /* Assign choice of congestion control. */
-void tcp_init_congestion_control(struct tcp_sock *tp)
+void tcp_init_congestion_control(struct sock *sk)
 {
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_congestion_ops *ca;
 
-	if (tp->ca_ops != &tcp_init_congestion_ops)
+	if (icsk->icsk_ca_ops != &tcp_init_congestion_ops)
 		return;
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
 		if (try_module_get(ca->owner)) {
-			tp->ca_ops = ca;
+			icsk->icsk_ca_ops = ca;
 			break;
 		}
 
 	}
 	rcu_read_unlock();
 
-	if (tp->ca_ops->init)
-		tp->ca_ops->init(tp);
+	if (icsk->icsk_ca_ops->init)
+		icsk->icsk_ca_ops->init(sk);
 }
 
 /* Manage refcounts on socket close. */
-void tcp_cleanup_congestion_control(struct tcp_sock *tp)
+void tcp_cleanup_congestion_control(struct sock *sk)
 {
-	if (tp->ca_ops->release)
-		tp->ca_ops->release(tp);
-	module_put(tp->ca_ops->owner);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	if (icsk->icsk_ca_ops->release)
+		icsk->icsk_ca_ops->release(sk);
+	module_put(icsk->icsk_ca_ops->owner);
 }
 
 /* Used by sysctl to change default congestion control */
@@ -143,14 +146,15 @@ void tcp_get_default_congestion_control(char *name)
 }
 
 /* Change congestion control for socket */
-int tcp_set_congestion_control(struct tcp_sock *tp, const char *name)
+int tcp_set_congestion_control(struct sock *sk, const char *name)
 {
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_congestion_ops *ca;
 	int err = 0;
 
 	rcu_read_lock();
 	ca = tcp_ca_find(name);
-	if (ca == tp->ca_ops)
+	if (ca == icsk->icsk_ca_ops)
 		goto out;
 
 	if (!ca)
@@ -160,10 +164,10 @@ int tcp_set_congestion_control(struct tcp_sock *tp, const char *name)
 		err = -EBUSY;
 
 	else {
-		tcp_cleanup_congestion_control(tp);
-		tp->ca_ops = ca;
-		if (tp->ca_ops->init)
-			tp->ca_ops->init(tp);
+		tcp_cleanup_congestion_control(sk);
+		icsk->icsk_ca_ops = ca;
+		if (icsk->icsk_ca_ops->init)
+			icsk->icsk_ca_ops->init(sk);
 	}
  out:
 	rcu_read_unlock();
@@ -177,9 +181,11 @@ int tcp_set_congestion_control(struct tcp_sock *tp, const char *name)
 /* This is Jacobson's slow start and congestion avoidance.
  * SIGCOMM '88, p. 328.
  */
-void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight,
+void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight,
 			 int flag)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
+
 	if (in_flight < tp->snd_cwnd)
 		return;
 
@@ -202,15 +208,17 @@ void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight,
 EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
 
 /* Slow start threshold is half the congestion window (min 2) */
-u32 tcp_reno_ssthresh(struct tcp_sock *tp)
+u32 tcp_reno_ssthresh(struct sock *sk)
 {
+	const struct tcp_sock *tp = tcp_sk(sk);
 	return max(tp->snd_cwnd >> 1U, 2U);
 }
 EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
 
 /* Lower bound on congestion window. */
-u32 tcp_reno_min_cwnd(struct tcp_sock *tp)
+u32 tcp_reno_min_cwnd(struct sock *sk)
 {
+	const struct tcp_sock *tp = tcp_sk(sk);
 	return tp->snd_ssthresh/2;
 }
 EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd);
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index f66945cb158..c148c108188 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -1,5 +1,5 @@
 /*
- * tcp_diag.c	Module for monitoring TCP sockets.
+ * tcp_diag.c	Module for monitoring TCP transport protocols sockets.
  *
  * Version:	$Id: tcp_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $
  *
@@ -12,779 +12,43 @@
  */
 
 #include <linux/config.h>
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/fcntl.h>
-#include <linux/random.h>
-#include <linux/cache.h>
-#include <linux/init.h>
-#include <linux/time.h>
-
-#include <net/icmp.h>
-#include <net/tcp.h>
-#include <net/ipv6.h>
-#include <net/inet_common.h>
-
-#include <linux/inet.h>
-#include <linux/stddef.h>
-
-#include <linux/tcp_diag.h>
 
-struct tcpdiag_entry
-{
-	u32 *saddr;
-	u32 *daddr;
-	u16 sport;
-	u16 dport;
-	u16 family;
-	u16 userlocks;
-};
+#include <linux/module.h>
+#include <linux/inet_diag.h>
 
-static struct sock *tcpnl;
+#include <linux/tcp.h>
 
-#define TCPDIAG_PUT(skb, attrtype, attrlen) \
-	RTA_DATA(__RTA_PUT(skb, attrtype, attrlen))
+#include <net/tcp.h>
 
-static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
-			int ext, u32 pid, u32 seq, u16 nlmsg_flags)
+static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
+			      void *_info)
 {
-	struct inet_sock *inet = inet_sk(sk);
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct tcpdiagmsg *r;
-	struct nlmsghdr  *nlh;
-	struct tcp_info  *info = NULL;
-	struct tcpdiag_meminfo  *minfo = NULL;
-	unsigned char	 *b = skb->tail;
-
-	nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r));
-	nlh->nlmsg_flags = nlmsg_flags;
-	r = NLMSG_DATA(nlh);
-	if (sk->sk_state != TCP_TIME_WAIT) {
-		if (ext & (1<<(TCPDIAG_MEMINFO-1)))
-			minfo = TCPDIAG_PUT(skb, TCPDIAG_MEMINFO, sizeof(*minfo));
-		if (ext & (1<<(TCPDIAG_INFO-1)))
-			info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info));
-		
-		if (ext & (1<<(TCPDIAG_CONG-1))) {
-			size_t len = strlen(tp->ca_ops->name);
-			strcpy(TCPDIAG_PUT(skb, TCPDIAG_CONG, len+1),
-			       tp->ca_ops->name);
-		}
-	}
-	r->tcpdiag_family = sk->sk_family;
-	r->tcpdiag_state = sk->sk_state;
-	r->tcpdiag_timer = 0;
-	r->tcpdiag_retrans = 0;
-
-	r->id.tcpdiag_if = sk->sk_bound_dev_if;
-	r->id.tcpdiag_cookie[0] = (u32)(unsigned long)sk;
-	r->id.tcpdiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1);
-
-	if (r->tcpdiag_state == TCP_TIME_WAIT) {
-		struct tcp_tw_bucket *tw = (struct tcp_tw_bucket*)sk;
-		long tmo = tw->tw_ttd - jiffies;
-		if (tmo < 0)
-			tmo = 0;
-
-		r->id.tcpdiag_sport = tw->tw_sport;
-		r->id.tcpdiag_dport = tw->tw_dport;
-		r->id.tcpdiag_src[0] = tw->tw_rcv_saddr;
-		r->id.tcpdiag_dst[0] = tw->tw_daddr;
-		r->tcpdiag_state = tw->tw_substate;
-		r->tcpdiag_timer = 3;
-		r->tcpdiag_expires = (tmo*1000+HZ-1)/HZ;
-		r->tcpdiag_rqueue = 0;
-		r->tcpdiag_wqueue = 0;
-		r->tcpdiag_uid = 0;
-		r->tcpdiag_inode = 0;
-#ifdef CONFIG_IP_TCPDIAG_IPV6
-		if (r->tcpdiag_family == AF_INET6) {
-			ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src,
-				       &tw->tw_v6_rcv_saddr);
-			ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst,
-				       &tw->tw_v6_daddr);
-		}
-#endif
-		nlh->nlmsg_len = skb->tail - b;
-		return skb->len;
-	}
-
-	r->id.tcpdiag_sport = inet->sport;
-	r->id.tcpdiag_dport = inet->dport;
-	r->id.tcpdiag_src[0] = inet->rcv_saddr;
-	r->id.tcpdiag_dst[0] = inet->daddr;
-
-#ifdef CONFIG_IP_TCPDIAG_IPV6
-	if (r->tcpdiag_family == AF_INET6) {
-		struct ipv6_pinfo *np = inet6_sk(sk);
-
-		ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src,
-			       &np->rcv_saddr);
-		ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst,
-			       &np->daddr);
-	}
-#endif
-
-#define EXPIRES_IN_MS(tmo)  ((tmo-jiffies)*1000+HZ-1)/HZ
-
-	if (tp->pending == TCP_TIME_RETRANS) {
-		r->tcpdiag_timer = 1;
-		r->tcpdiag_retrans = tp->retransmits;
-		r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout);
-	} else if (tp->pending == TCP_TIME_PROBE0) {
-		r->tcpdiag_timer = 4;
-		r->tcpdiag_retrans = tp->probes_out;
-		r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout);
-	} else if (timer_pending(&sk->sk_timer)) {
-		r->tcpdiag_timer = 2;
-		r->tcpdiag_retrans = tp->probes_out;
-		r->tcpdiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires);
-	} else {
-		r->tcpdiag_timer = 0;
-		r->tcpdiag_expires = 0;
-	}
-#undef EXPIRES_IN_MS
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_info *info = _info;
 
-	r->tcpdiag_rqueue = tp->rcv_nxt - tp->copied_seq;
-	r->tcpdiag_wqueue = tp->write_seq - tp->snd_una;
-	r->tcpdiag_uid = sock_i_uid(sk);
-	r->tcpdiag_inode = sock_i_ino(sk);
-
-	if (minfo) {
-		minfo->tcpdiag_rmem = atomic_read(&sk->sk_rmem_alloc);
-		minfo->tcpdiag_wmem = sk->sk_wmem_queued;
-		minfo->tcpdiag_fmem = sk->sk_forward_alloc;
-		minfo->tcpdiag_tmem = atomic_read(&sk->sk_wmem_alloc);
-	}
-
-	if (info) 
+	r->idiag_rqueue = tp->rcv_nxt - tp->copied_seq;
+	r->idiag_wqueue = tp->write_seq - tp->snd_una;
+	if (info != NULL)
 		tcp_get_info(sk, info);
-
-	if (sk->sk_state < TCP_TIME_WAIT && tp->ca_ops->get_info)
-		tp->ca_ops->get_info(tp, ext, skb);
-
-	nlh->nlmsg_len = skb->tail - b;
-	return skb->len;
-
-rtattr_failure:
-nlmsg_failure:
-	skb_trim(skb, b - skb->data);
-	return -1;
-}
-
-extern struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport,
-				  int dif);
-#ifdef CONFIG_IP_TCPDIAG_IPV6
-extern struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport,
-				  struct in6_addr *daddr, u16 dport,
-				  int dif);
-#else
-static inline struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport,
-					 struct in6_addr *daddr, u16 dport,
-					 int dif)
-{
-	return NULL;
-}
-#endif
-
-static int tcpdiag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh)
-{
-	int err;
-	struct sock *sk;
-	struct tcpdiagreq *req = NLMSG_DATA(nlh);
-	struct sk_buff *rep;
-
-	if (req->tcpdiag_family == AF_INET) {
-		sk = tcp_v4_lookup(req->id.tcpdiag_dst[0], req->id.tcpdiag_dport,
-				   req->id.tcpdiag_src[0], req->id.tcpdiag_sport,
-				   req->id.tcpdiag_if);
-	}
-#ifdef CONFIG_IP_TCPDIAG_IPV6
-	else if (req->tcpdiag_family == AF_INET6) {
-		sk = tcp_v6_lookup((struct in6_addr*)req->id.tcpdiag_dst, req->id.tcpdiag_dport,
-				   (struct in6_addr*)req->id.tcpdiag_src, req->id.tcpdiag_sport,
-				   req->id.tcpdiag_if);
-	}
-#endif
-	else {
-		return -EINVAL;
-	}
-
-	if (sk == NULL)
-		return -ENOENT;
-
-	err = -ESTALE;
-	if ((req->id.tcpdiag_cookie[0] != TCPDIAG_NOCOOKIE ||
-	     req->id.tcpdiag_cookie[1] != TCPDIAG_NOCOOKIE) &&
-	    ((u32)(unsigned long)sk != req->id.tcpdiag_cookie[0] ||
-	     (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.tcpdiag_cookie[1]))
-		goto out;
-
-	err = -ENOMEM;
-	rep = alloc_skb(NLMSG_SPACE(sizeof(struct tcpdiagmsg)+
-				    sizeof(struct tcpdiag_meminfo)+
-				    sizeof(struct tcp_info)+64), GFP_KERNEL);
-	if (!rep)
-		goto out;
-
-	if (tcpdiag_fill(rep, sk, req->tcpdiag_ext,
-			 NETLINK_CB(in_skb).pid,
-			 nlh->nlmsg_seq, 0) <= 0)
-		BUG();
-
-	err = netlink_unicast(tcpnl, rep, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
-	if (err > 0)
-		err = 0;
-
-out:
-	if (sk) {
-		if (sk->sk_state == TCP_TIME_WAIT)
-			tcp_tw_put((struct tcp_tw_bucket*)sk);
-		else
-			sock_put(sk);
-	}
-	return err;
-}
-
-static int bitstring_match(const u32 *a1, const u32 *a2, int bits)
-{
-	int words = bits >> 5;
-
-	bits &= 0x1f;
-
-	if (words) {
-		if (memcmp(a1, a2, words << 2))
-			return 0;
-	}
-	if (bits) {
-		__u32 w1, w2;
-		__u32 mask;
-
-		w1 = a1[words];
-		w2 = a2[words];
-
-		mask = htonl((0xffffffff) << (32 - bits));
-
-		if ((w1 ^ w2) & mask)
-			return 0;
-	}
-
-	return 1;
-}
-
-
-static int tcpdiag_bc_run(const void *bc, int len,
-			  const struct tcpdiag_entry *entry)
-{
-	while (len > 0) {
-		int yes = 1;
-		const struct tcpdiag_bc_op *op = bc;
-
-		switch (op->code) {
-		case TCPDIAG_BC_NOP:
-			break;
-		case TCPDIAG_BC_JMP:
-			yes = 0;
-			break;
-		case TCPDIAG_BC_S_GE:
-			yes = entry->sport >= op[1].no;
-			break;
-		case TCPDIAG_BC_S_LE:
-			yes = entry->dport <= op[1].no;
-			break;
-		case TCPDIAG_BC_D_GE:
-			yes = entry->dport >= op[1].no;
-			break;
-		case TCPDIAG_BC_D_LE:
-			yes = entry->dport <= op[1].no;
-			break;
-		case TCPDIAG_BC_AUTO:
-			yes = !(entry->userlocks & SOCK_BINDPORT_LOCK);
-			break;
-		case TCPDIAG_BC_S_COND:
-		case TCPDIAG_BC_D_COND:
-		{
-			struct tcpdiag_hostcond *cond = (struct tcpdiag_hostcond*)(op+1);
-			u32 *addr;
-
-			if (cond->port != -1 &&
-			    cond->port != (op->code == TCPDIAG_BC_S_COND ?
-					     entry->sport : entry->dport)) {
-				yes = 0;
-				break;
-			}
-			
-			if (cond->prefix_len == 0)
-				break;
-
-			if (op->code == TCPDIAG_BC_S_COND)
-				addr = entry->saddr;
-			else
-				addr = entry->daddr;
-
-			if (bitstring_match(addr, cond->addr, cond->prefix_len))
-				break;
-			if (entry->family == AF_INET6 &&
-			    cond->family == AF_INET) {
-				if (addr[0] == 0 && addr[1] == 0 &&
-				    addr[2] == htonl(0xffff) &&
-				    bitstring_match(addr+3, cond->addr, cond->prefix_len))
-					break;
-			}
-			yes = 0;
-			break;
-		}
-		}
-
-		if (yes) { 
-			len -= op->yes;
-			bc += op->yes;
-		} else {
-			len -= op->no;
-			bc += op->no;
-		}
-	}
-	return (len == 0);
-}
-
-static int valid_cc(const void *bc, int len, int cc)
-{
-	while (len >= 0) {
-		const struct tcpdiag_bc_op *op = bc;
-
-		if (cc > len)
-			return 0;
-		if (cc == len)
-			return 1;
-		if (op->yes < 4)
-			return 0;
-		len -= op->yes;
-		bc  += op->yes;
-	}
-	return 0;
-}
-
-static int tcpdiag_bc_audit(const void *bytecode, int bytecode_len)
-{
-	const unsigned char *bc = bytecode;
-	int  len = bytecode_len;
-
-	while (len > 0) {
-		struct tcpdiag_bc_op *op = (struct tcpdiag_bc_op*)bc;
-
-//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len);
-		switch (op->code) {
-		case TCPDIAG_BC_AUTO:
-		case TCPDIAG_BC_S_COND:
-		case TCPDIAG_BC_D_COND:
-		case TCPDIAG_BC_S_GE:
-		case TCPDIAG_BC_S_LE:
-		case TCPDIAG_BC_D_GE:
-		case TCPDIAG_BC_D_LE:
-			if (op->yes < 4 || op->yes > len+4)
-				return -EINVAL;
-		case TCPDIAG_BC_JMP:
-			if (op->no < 4 || op->no > len+4)
-				return -EINVAL;
-			if (op->no < len &&
-			    !valid_cc(bytecode, bytecode_len, len-op->no))
-				return -EINVAL;
-			break;
-		case TCPDIAG_BC_NOP:
-			if (op->yes < 4 || op->yes > len+4)
-				return -EINVAL;
-			break;
-		default:
-			return -EINVAL;
-		}
-		bc += op->yes;
-		len -= op->yes;
-	}
-	return len == 0 ? 0 : -EINVAL;
-}
-
-static int tcpdiag_dump_sock(struct sk_buff *skb, struct sock *sk,
-			     struct netlink_callback *cb)
-{
-	struct tcpdiagreq *r = NLMSG_DATA(cb->nlh);
-
-	if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
-		struct tcpdiag_entry entry;
-		struct rtattr *bc = (struct rtattr *)(r + 1);
-		struct inet_sock *inet = inet_sk(sk);
-
-		entry.family = sk->sk_family;
-#ifdef CONFIG_IP_TCPDIAG_IPV6
-		if (entry.family == AF_INET6) {
-			struct ipv6_pinfo *np = inet6_sk(sk);
-
-			entry.saddr = np->rcv_saddr.s6_addr32;
-			entry.daddr = np->daddr.s6_addr32;
-		} else
-#endif
-		{
-			entry.saddr = &inet->rcv_saddr;
-			entry.daddr = &inet->daddr;
-		}
-		entry.sport = inet->num;
-		entry.dport = ntohs(inet->dport);
-		entry.userlocks = sk->sk_userlocks;
-
-		if (!tcpdiag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry))
-			return 0;
-	}
-
-	return tcpdiag_fill(skb, sk, r->tcpdiag_ext, NETLINK_CB(cb->skb).pid,
-			    cb->nlh->nlmsg_seq, NLM_F_MULTI);
 }
 
-static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk,
-			    struct request_sock *req,
-			    u32 pid, u32 seq)
-{
-	const struct inet_request_sock *ireq = inet_rsk(req);
-	struct inet_sock *inet = inet_sk(sk);
-	unsigned char *b = skb->tail;
-	struct tcpdiagmsg *r;
-	struct nlmsghdr *nlh;
-	long tmo;
-
-	nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r));
-	nlh->nlmsg_flags = NLM_F_MULTI;
-	r = NLMSG_DATA(nlh);
-
-	r->tcpdiag_family = sk->sk_family;
-	r->tcpdiag_state = TCP_SYN_RECV;
-	r->tcpdiag_timer = 1;
-	r->tcpdiag_retrans = req->retrans;
-
-	r->id.tcpdiag_if = sk->sk_bound_dev_if;
-	r->id.tcpdiag_cookie[0] = (u32)(unsigned long)req;
-	r->id.tcpdiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1);
-
-	tmo = req->expires - jiffies;
-	if (tmo < 0)
-		tmo = 0;
-
-	r->id.tcpdiag_sport = inet->sport;
-	r->id.tcpdiag_dport = ireq->rmt_port;
-	r->id.tcpdiag_src[0] = ireq->loc_addr;
-	r->id.tcpdiag_dst[0] = ireq->rmt_addr;
-	r->tcpdiag_expires = jiffies_to_msecs(tmo),
-	r->tcpdiag_rqueue = 0;
-	r->tcpdiag_wqueue = 0;
-	r->tcpdiag_uid = sock_i_uid(sk);
-	r->tcpdiag_inode = 0;
-#ifdef CONFIG_IP_TCPDIAG_IPV6
-	if (r->tcpdiag_family == AF_INET6) {
-		ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src,
-			       &tcp6_rsk(req)->loc_addr);
-		ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst,
-			       &tcp6_rsk(req)->rmt_addr);
-	}
-#endif
-	nlh->nlmsg_len = skb->tail - b;
-
-	return skb->len;
-
-nlmsg_failure:
-	skb_trim(skb, b - skb->data);
-	return -1;
-}
-
-static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk,
-			     struct netlink_callback *cb)
-{
-	struct tcpdiag_entry entry;
-	struct tcpdiagreq *r = NLMSG_DATA(cb->nlh);
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct listen_sock *lopt;
-	struct rtattr *bc = NULL;
-	struct inet_sock *inet = inet_sk(sk);
-	int j, s_j;
-	int reqnum, s_reqnum;
-	int err = 0;
-
-	s_j = cb->args[3];
-	s_reqnum = cb->args[4];
-
-	if (s_j > 0)
-		s_j--;
-
-	entry.family = sk->sk_family;
-
-	read_lock_bh(&tp->accept_queue.syn_wait_lock);
-
-	lopt = tp->accept_queue.listen_opt;
-	if (!lopt || !lopt->qlen)
-		goto out;
-
-	if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
-		bc = (struct rtattr *)(r + 1);
-		entry.sport = inet->num;
-		entry.userlocks = sk->sk_userlocks;
-	}
-
-	for (j = s_j; j < TCP_SYNQ_HSIZE; j++) {
-		struct request_sock *req, *head = lopt->syn_table[j];
-
-		reqnum = 0;
-		for (req = head; req; reqnum++, req = req->dl_next) {
-			struct inet_request_sock *ireq = inet_rsk(req);
-
-			if (reqnum < s_reqnum)
-				continue;
-			if (r->id.tcpdiag_dport != ireq->rmt_port &&
-			    r->id.tcpdiag_dport)
-				continue;
-
-			if (bc) {
-				entry.saddr =
-#ifdef CONFIG_IP_TCPDIAG_IPV6
-					(entry.family == AF_INET6) ?
-					tcp6_rsk(req)->loc_addr.s6_addr32 :
-#endif
-					&ireq->loc_addr;
-				entry.daddr = 
-#ifdef CONFIG_IP_TCPDIAG_IPV6
-					(entry.family == AF_INET6) ?
-					tcp6_rsk(req)->rmt_addr.s6_addr32 :
-#endif
-					&ireq->rmt_addr;
-				entry.dport = ntohs(ireq->rmt_port);
-
-				if (!tcpdiag_bc_run(RTA_DATA(bc),
-						    RTA_PAYLOAD(bc), &entry))
-					continue;
-			}
-
-			err = tcpdiag_fill_req(skb, sk, req,
-					       NETLINK_CB(cb->skb).pid,
-					       cb->nlh->nlmsg_seq);
-			if (err < 0) {
-				cb->args[3] = j + 1;
-				cb->args[4] = reqnum;
-				goto out;
-			}
-		}
-
-		s_reqnum = 0;
-	}
-
-out:
-	read_unlock_bh(&tp->accept_queue.syn_wait_lock);
-
-	return err;
-}
-
-static int tcpdiag_dump(struct sk_buff *skb, struct netlink_callback *cb)
-{
-	int i, num;
-	int s_i, s_num;
-	struct tcpdiagreq *r = NLMSG_DATA(cb->nlh);
-
-	s_i = cb->args[1];
-	s_num = num = cb->args[2];
-
-	if (cb->args[0] == 0) {
-		if (!(r->tcpdiag_states&(TCPF_LISTEN|TCPF_SYN_RECV)))
-			goto skip_listen_ht;
-		tcp_listen_lock();
-		for (i = s_i; i < TCP_LHTABLE_SIZE; i++) {
-			struct sock *sk;
-			struct hlist_node *node;
-
-			num = 0;
-			sk_for_each(sk, node, &tcp_listening_hash[i]) {
-				struct inet_sock *inet = inet_sk(sk);
-
-				if (num < s_num) {
-					num++;
-					continue;
-				}
-
-				if (r->id.tcpdiag_sport != inet->sport &&
-				    r->id.tcpdiag_sport)
-					goto next_listen;
-
-				if (!(r->tcpdiag_states&TCPF_LISTEN) ||
-				    r->id.tcpdiag_dport ||
-				    cb->args[3] > 0)
-					goto syn_recv;
-
-				if (tcpdiag_dump_sock(skb, sk, cb) < 0) {
-					tcp_listen_unlock();
-					goto done;
-				}
-
-syn_recv:
-				if (!(r->tcpdiag_states&TCPF_SYN_RECV))
-					goto next_listen;
-
-				if (tcpdiag_dump_reqs(skb, sk, cb) < 0) {
-					tcp_listen_unlock();
-					goto done;
-				}
-
-next_listen:
-				cb->args[3] = 0;
-				cb->args[4] = 0;
-				++num;
-			}
-
-			s_num = 0;
-			cb->args[3] = 0;
-			cb->args[4] = 0;
-		}
-		tcp_listen_unlock();
-skip_listen_ht:
-		cb->args[0] = 1;
-		s_i = num = s_num = 0;
-	}
-
-	if (!(r->tcpdiag_states&~(TCPF_LISTEN|TCPF_SYN_RECV)))
-		return skb->len;
-
-	for (i = s_i; i < tcp_ehash_size; i++) {
-		struct tcp_ehash_bucket *head = &tcp_ehash[i];
-		struct sock *sk;
-		struct hlist_node *node;
-
-		if (i > s_i)
-			s_num = 0;
-
-		read_lock_bh(&head->lock);
-
-		num = 0;
-		sk_for_each(sk, node, &head->chain) {
-			struct inet_sock *inet = inet_sk(sk);
-
-			if (num < s_num)
-				goto next_normal;
-			if (!(r->tcpdiag_states & (1 << sk->sk_state)))
-				goto next_normal;
-			if (r->id.tcpdiag_sport != inet->sport &&
-			    r->id.tcpdiag_sport)
-				goto next_normal;
-			if (r->id.tcpdiag_dport != inet->dport && r->id.tcpdiag_dport)
-				goto next_normal;
-			if (tcpdiag_dump_sock(skb, sk, cb) < 0) {
-				read_unlock_bh(&head->lock);
-				goto done;
-			}
-next_normal:
-			++num;
-		}
-
-		if (r->tcpdiag_states&TCPF_TIME_WAIT) {
-			sk_for_each(sk, node,
-				    &tcp_ehash[i + tcp_ehash_size].chain) {
-				struct inet_sock *inet = inet_sk(sk);
-
-				if (num < s_num)
-					goto next_dying;
-				if (r->id.tcpdiag_sport != inet->sport &&
-				    r->id.tcpdiag_sport)
-					goto next_dying;
-				if (r->id.tcpdiag_dport != inet->dport &&
-				    r->id.tcpdiag_dport)
-					goto next_dying;
-				if (tcpdiag_dump_sock(skb, sk, cb) < 0) {
-					read_unlock_bh(&head->lock);
-					goto done;
-				}
-next_dying:
-				++num;
-			}
-		}
-		read_unlock_bh(&head->lock);
-	}
-
-done:
-	cb->args[1] = i;
-	cb->args[2] = num;
-	return skb->len;
-}
-
-static int tcpdiag_dump_done(struct netlink_callback *cb)
-{
-	return 0;
-}
-
-
-static __inline__ int
-tcpdiag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
-{
-	if (!(nlh->nlmsg_flags&NLM_F_REQUEST))
-		return 0;
-
-	if (nlh->nlmsg_type != TCPDIAG_GETSOCK)
-		goto err_inval;
-
-	if (NLMSG_LENGTH(sizeof(struct tcpdiagreq)) > skb->len)
-		goto err_inval;
-
-	if (nlh->nlmsg_flags&NLM_F_DUMP) {
-		if (nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(struct tcpdiagreq))) {
-			struct rtattr *rta = (struct rtattr*)(NLMSG_DATA(nlh) + sizeof(struct tcpdiagreq));
-			if (rta->rta_type != TCPDIAG_REQ_BYTECODE ||
-			    rta->rta_len < 8 ||
-			    rta->rta_len > nlh->nlmsg_len - NLMSG_SPACE(sizeof(struct tcpdiagreq)))
-				goto err_inval;
-			if (tcpdiag_bc_audit(RTA_DATA(rta), RTA_PAYLOAD(rta)))
-				goto err_inval;
-		}
-		return netlink_dump_start(tcpnl, skb, nlh,
-					  tcpdiag_dump,
-					  tcpdiag_dump_done);
-	} else {
-		return tcpdiag_get_exact(skb, nlh);
-	}
-
-err_inval:
-	return -EINVAL;
-}
-
-
-static inline void tcpdiag_rcv_skb(struct sk_buff *skb)
-{
-	int err;
-	struct nlmsghdr * nlh;
-
-	if (skb->len >= NLMSG_SPACE(0)) {
-		nlh = (struct nlmsghdr *)skb->data;
-		if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
-			return;
-		err = tcpdiag_rcv_msg(skb, nlh);
-		if (err || nlh->nlmsg_flags & NLM_F_ACK) 
-			netlink_ack(skb, nlh, err);
-	}
-}
-
-static void tcpdiag_rcv(struct sock *sk, int len)
-{
-	struct sk_buff *skb;
-	unsigned int qlen = skb_queue_len(&sk->sk_receive_queue);
-
-	while (qlen-- && (skb = skb_dequeue(&sk->sk_receive_queue))) {
-		tcpdiag_rcv_skb(skb);
-		kfree_skb(skb);
-	}
-}
+static struct inet_diag_handler tcp_diag_handler = {
+	.idiag_hashinfo	 = &tcp_hashinfo,
+	.idiag_get_info	 = tcp_diag_get_info,
+	.idiag_type	 = TCPDIAG_GETSOCK,
+	.idiag_info_size = sizeof(struct tcp_info),
+};
 
-static int __init tcpdiag_init(void)
+static int __init tcp_diag_init(void)
 {
-	tcpnl = netlink_kernel_create(NETLINK_TCPDIAG, tcpdiag_rcv);
-	if (tcpnl == NULL)
-		return -ENOMEM;
-	return 0;
+	return inet_diag_register(&tcp_diag_handler);
 }
 
-static void __exit tcpdiag_exit(void)
+static void __exit tcp_diag_exit(void)
 {
-	sock_release(tcpnl->sk_socket);
+	inet_diag_unregister(&tcp_diag_handler);
 }
 
-module_init(tcpdiag_init);
-module_exit(tcpdiag_exit);
+module_init(tcp_diag_init);
+module_exit(tcp_diag_exit);
 MODULE_LICENSE("GPL");
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index 36c51f8136b..6acc04bde08 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -98,9 +98,10 @@ struct hstcp {
 	u32	ai;
 };
 
-static void hstcp_init(struct tcp_sock *tp)
+static void hstcp_init(struct sock *sk)
 {
-	struct hstcp *ca = tcp_ca(tp);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct hstcp *ca = inet_csk_ca(sk);
 
 	ca->ai = 0;
 
@@ -109,10 +110,11 @@ static void hstcp_init(struct tcp_sock *tp)
 	tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
 }
 
-static void hstcp_cong_avoid(struct tcp_sock *tp, u32 adk, u32 rtt,
+static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt,
 			     u32 in_flight, int good)
 {
-	struct hstcp *ca = tcp_ca(tp);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct hstcp *ca = inet_csk_ca(sk);
 
 	if (in_flight < tp->snd_cwnd)
 		return;
@@ -143,9 +145,10 @@ static void hstcp_cong_avoid(struct tcp_sock *tp, u32 adk, u32 rtt,
 	}
 }
 
-static u32 hstcp_ssthresh(struct tcp_sock *tp)
+static u32 hstcp_ssthresh(struct sock *sk)
 {
-	struct hstcp *ca = tcp_ca(tp);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	const struct hstcp *ca = inet_csk_ca(sk);
 
 	/* Do multiplicative decrease */
 	return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U);
@@ -164,7 +167,7 @@ static struct tcp_congestion_ops tcp_highspeed = {
 
 static int __init hstcp_register(void)
 {
-	BUG_ON(sizeof(struct hstcp) > TCP_CA_PRIV_SIZE);
+	BUG_ON(sizeof(struct hstcp) > ICSK_CA_PRIV_SIZE);
 	return tcp_register_congestion_control(&tcp_highspeed);
 }
 
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 40168275acf..e47b37984e9 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -55,18 +55,21 @@ static inline void htcp_reset(struct htcp *ca)
 	ca->snd_cwnd_cnt2 = 0;
 }
 
-static u32 htcp_cwnd_undo(struct tcp_sock *tp)
+static u32 htcp_cwnd_undo(struct sock *sk)
 {
-	struct htcp *ca = tcp_ca(tp);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct htcp *ca = inet_csk_ca(sk);
 	ca->ccount = ca->undo_ccount;
 	ca->maxRTT = ca->undo_maxRTT;
 	ca->old_maxB = ca->undo_old_maxB;
 	return max(tp->snd_cwnd, (tp->snd_ssthresh<<7)/ca->beta);
 }
 
-static inline void measure_rtt(struct tcp_sock *tp)
+static inline void measure_rtt(struct sock *sk)
 {
-	struct htcp *ca = tcp_ca(tp);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct htcp *ca = inet_csk_ca(sk);
 	u32 srtt = tp->srtt>>3;
 
 	/* keep track of minimum RTT seen so far, minRTT is zero at first */
@@ -74,7 +77,7 @@ static inline void measure_rtt(struct tcp_sock *tp)
 		ca->minRTT = srtt;
 
 	/* max RTT */
-	if (tp->ca_state == TCP_CA_Open && tp->snd_ssthresh < 0xFFFF && ca->ccount > 3) {
+	if (icsk->icsk_ca_state == TCP_CA_Open && tp->snd_ssthresh < 0xFFFF && ca->ccount > 3) {
 		if (ca->maxRTT < ca->minRTT)
 			ca->maxRTT = ca->minRTT;
 		if (ca->maxRTT < srtt && srtt <= ca->maxRTT+HZ/50)
@@ -82,13 +85,16 @@ static inline void measure_rtt(struct tcp_sock *tp)
 	}
 }
 
-static void measure_achieved_throughput(struct tcp_sock *tp, u32 pkts_acked)
+static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked)
 {
-	struct htcp *ca = tcp_ca(tp);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct htcp *ca = inet_csk_ca(sk);
 	u32 now = tcp_time_stamp;
 
 	/* achieved throughput calculations */
-	if (tp->ca_state != TCP_CA_Open && tp->ca_state != TCP_CA_Disorder) {
+	if (icsk->icsk_ca_state != TCP_CA_Open &&
+	    icsk->icsk_ca_state != TCP_CA_Disorder) {
 		ca->packetcount = 0;
 		ca->lasttime = now;
 		return;
@@ -173,9 +179,9 @@ static inline void htcp_alpha_update(struct htcp *ca)
  * that point do we really have a real sense of maxRTT (the queues en route
  * were getting just too full now).
  */
-static void htcp_param_update(struct tcp_sock *tp)
+static void htcp_param_update(struct sock *sk)
 {
-	struct htcp *ca = tcp_ca(tp);
+	struct htcp *ca = inet_csk_ca(sk);
 	u32 minRTT = ca->minRTT;
 	u32 maxRTT = ca->maxRTT;
 
@@ -187,17 +193,19 @@ static void htcp_param_update(struct tcp_sock *tp)
 		ca->maxRTT = minRTT + ((maxRTT-minRTT)*95)/100;
 }
 
-static u32 htcp_recalc_ssthresh(struct tcp_sock *tp)
+static u32 htcp_recalc_ssthresh(struct sock *sk)
 {
-	struct htcp *ca = tcp_ca(tp);
-	htcp_param_update(tp);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	const struct htcp *ca = inet_csk_ca(sk);
+	htcp_param_update(sk);
 	return max((tp->snd_cwnd * ca->beta) >> 7, 2U);
 }
 
-static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
+static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
 			    u32 in_flight, int data_acked)
 {
-	struct htcp *ca = tcp_ca(tp);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct htcp *ca = inet_csk_ca(sk);
 
 	if (in_flight < tp->snd_cwnd)
 		return;
@@ -207,7 +215,7 @@ static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
 		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
 			tp->snd_cwnd++;
 	} else {
-		measure_rtt(tp);
+		measure_rtt(sk);
 
 		/* keep track of number of round-trip times since last backoff event */
 		if (ca->snd_cwnd_cnt2++ > tp->snd_cwnd) {
@@ -229,28 +237,29 @@ static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
 }
 
 /* Lower bound on congestion window. */
-static u32 htcp_min_cwnd(struct tcp_sock *tp)
+static u32 htcp_min_cwnd(struct sock *sk)
 {
+	const struct tcp_sock *tp = tcp_sk(sk);
 	return tp->snd_ssthresh;
 }
 
 
-static void htcp_init(struct tcp_sock *tp)
+static void htcp_init(struct sock *sk)
 {
-	struct htcp *ca = tcp_ca(tp);
+	struct htcp *ca = inet_csk_ca(sk);
 
 	memset(ca, 0, sizeof(struct htcp));
 	ca->alpha = ALPHA_BASE;
 	ca->beta = BETA_MIN;
 }
 
-static void htcp_state(struct tcp_sock *tp, u8 new_state)
+static void htcp_state(struct sock *sk, u8 new_state)
 {
 	switch (new_state) {
 	case TCP_CA_CWR:
 	case TCP_CA_Recovery:
 	case TCP_CA_Loss:
-		htcp_reset(tcp_ca(tp));
+		htcp_reset(inet_csk_ca(sk));
 		break;
 	}
 }
@@ -269,7 +278,7 @@ static struct tcp_congestion_ops htcp = {
 
 static int __init htcp_register(void)
 {
-	BUG_ON(sizeof(struct htcp) > TCP_CA_PRIV_SIZE);
+	BUG_ON(sizeof(struct htcp) > ICSK_CA_PRIV_SIZE);
 	BUILD_BUG_ON(BETA_MIN >= BETA_MAX);
 	if (!use_bandwidth_switch)
 		htcp.pkts_acked = NULL;
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index 13a66342c30..77add63623d 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -33,19 +33,20 @@ MODULE_PARM_DESC(rtt0, "reference rout trip time (ms)");
 
 
 /* This is called to refresh values for hybla parameters */
-static inline void hybla_recalc_param (struct tcp_sock *tp)
+static inline void hybla_recalc_param (struct sock *sk)
 {
-	struct hybla *ca = tcp_ca(tp);
+	struct hybla *ca = inet_csk_ca(sk);
 
-	ca->rho_3ls = max_t(u32, tp->srtt / msecs_to_jiffies(rtt0), 8);
+	ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt / msecs_to_jiffies(rtt0), 8);
 	ca->rho = ca->rho_3ls >> 3;
 	ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1;
 	ca->rho2 = ca->rho2_7ls >>7;
 }
 
-static void hybla_init(struct tcp_sock *tp)
+static void hybla_init(struct sock *sk)
 {
-	struct hybla *ca = tcp_ca(tp);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct hybla *ca = inet_csk_ca(sk);
 
 	ca->rho = 0;
 	ca->rho2 = 0;
@@ -57,17 +58,16 @@ static void hybla_init(struct tcp_sock *tp)
 	tp->snd_cwnd_clamp = 65535;
 
 	/* 1st Rho measurement based on initial srtt */
-	hybla_recalc_param(tp);
+	hybla_recalc_param(sk);
 
 	/* set minimum rtt as this is the 1st ever seen */
 	ca->minrtt = tp->srtt;
 	tp->snd_cwnd = ca->rho;
 }
 
-static void hybla_state(struct tcp_sock *tp, u8 ca_state)
+static void hybla_state(struct sock *sk, u8 ca_state)
 {
-	struct hybla *ca = tcp_ca(tp);
-
+	struct hybla *ca = inet_csk_ca(sk);
 	ca->hybla_en = (ca_state == TCP_CA_Open);
 }
 
@@ -86,27 +86,28 @@ static inline u32 hybla_fraction(u32 odds)
  *     o Give cwnd a new value based on the model proposed
  *     o remember increments <1
  */
-static void hybla_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
+static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
 			    u32 in_flight, int flag)
 {
-	struct hybla *ca = tcp_ca(tp);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct hybla *ca = inet_csk_ca(sk);
 	u32 increment, odd, rho_fractions;
 	int is_slowstart = 0;
 
 	/*  Recalculate rho only if this srtt is the lowest */
 	if (tp->srtt < ca->minrtt){
-		hybla_recalc_param(tp);
+		hybla_recalc_param(sk);
 		ca->minrtt = tp->srtt;
 	}
 
 	if (!ca->hybla_en)
-		return tcp_reno_cong_avoid(tp, ack, rtt, in_flight, flag);
+		return tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag);
 
 	if (in_flight < tp->snd_cwnd)
 		return;
 
 	if (ca->rho == 0)
-		hybla_recalc_param(tp);
+		hybla_recalc_param(sk);
 
 	rho_fractions = ca->rho_3ls - (ca->rho << 3);
 
@@ -170,7 +171,7 @@ static struct tcp_congestion_ops tcp_hybla = {
 
 static int __init hybla_register(void)
 {
-	BUG_ON(sizeof(struct hybla) > TCP_CA_PRIV_SIZE);
+	BUG_ON(sizeof(struct hybla) > ICSK_CA_PRIV_SIZE);
 	return tcp_register_congestion_control(&tcp_hybla);
 }
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 53a8a5399f1..1afb080bdf0 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -114,20 +114,21 @@ int sysctl_tcp_moderate_rcvbuf = 1;
 /* Adapt the MSS value used to make delayed ack decision to the 
  * real world.
  */ 
-static inline void tcp_measure_rcv_mss(struct tcp_sock *tp,
-				       struct sk_buff *skb)
+static inline void tcp_measure_rcv_mss(struct sock *sk,
+				       const struct sk_buff *skb)
 {
-	unsigned int len, lss;
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	const unsigned int lss = icsk->icsk_ack.last_seg_size; 
+	unsigned int len;
 
-	lss = tp->ack.last_seg_size; 
-	tp->ack.last_seg_size = 0; 
+	icsk->icsk_ack.last_seg_size = 0; 
 
 	/* skb->len may jitter because of SACKs, even if peer
 	 * sends good full-sized frames.
 	 */
 	len = skb->len;
-	if (len >= tp->ack.rcv_mss) {
-		tp->ack.rcv_mss = len;
+	if (len >= icsk->icsk_ack.rcv_mss) {
+		icsk->icsk_ack.rcv_mss = len;
 	} else {
 		/* Otherwise, we make more careful check taking into account,
 		 * that SACKs block is variable.
@@ -147,41 +148,44 @@ static inline void tcp_measure_rcv_mss(struct tcp_sock *tp,
 			 * tcp header plus fixed timestamp option length.
 			 * Resulting "len" is MSS free of SACK jitter.
 			 */
-			len -= tp->tcp_header_len;
-			tp->ack.last_seg_size = len;
+			len -= tcp_sk(sk)->tcp_header_len;
+			icsk->icsk_ack.last_seg_size = len;
 			if (len == lss) {
-				tp->ack.rcv_mss = len;
+				icsk->icsk_ack.rcv_mss = len;
 				return;
 			}
 		}
-		tp->ack.pending |= TCP_ACK_PUSHED;
+		icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
 	}
 }
 
-static void tcp_incr_quickack(struct tcp_sock *tp)
+static void tcp_incr_quickack(struct sock *sk)
 {
-	unsigned quickacks = tp->rcv_wnd/(2*tp->ack.rcv_mss);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	unsigned quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
 
 	if (quickacks==0)
 		quickacks=2;
-	if (quickacks > tp->ack.quick)
-		tp->ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
+	if (quickacks > icsk->icsk_ack.quick)
+		icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
 }
 
-void tcp_enter_quickack_mode(struct tcp_sock *tp)
+void tcp_enter_quickack_mode(struct sock *sk)
 {
-	tcp_incr_quickack(tp);
-	tp->ack.pingpong = 0;
-	tp->ack.ato = TCP_ATO_MIN;
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	tcp_incr_quickack(sk);
+	icsk->icsk_ack.pingpong = 0;
+	icsk->icsk_ack.ato = TCP_ATO_MIN;
 }
 
 /* Send ACKs quickly, if "quick" count is not exhausted
  * and the session is not interactive.
  */
 
-static __inline__ int tcp_in_quickack_mode(struct tcp_sock *tp)
+static inline int tcp_in_quickack_mode(const struct sock *sk)
 {
-	return (tp->ack.quick && !tp->ack.pingpong);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
 }
 
 /* Buffer size and advertised window tuning.
@@ -224,8 +228,8 @@ static void tcp_fixup_sndbuf(struct sock *sk)
  */
 
 /* Slow part of check#2. */
-static int __tcp_grow_window(struct sock *sk, struct tcp_sock *tp,
-			     struct sk_buff *skb)
+static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp,
+			     const struct sk_buff *skb)
 {
 	/* Optimize this! */
 	int truesize = tcp_win_from_space(skb->truesize)/2;
@@ -233,7 +237,7 @@ static int __tcp_grow_window(struct sock *sk, struct tcp_sock *tp,
 
 	while (tp->rcv_ssthresh <= window) {
 		if (truesize <= skb->len)
-			return 2*tp->ack.rcv_mss;
+			return 2 * inet_csk(sk)->icsk_ack.rcv_mss;
 
 		truesize >>= 1;
 		window >>= 1;
@@ -260,7 +264,7 @@ static inline void tcp_grow_window(struct sock *sk, struct tcp_sock *tp,
 
 		if (incr) {
 			tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp);
-			tp->ack.quick |= 1;
+			inet_csk(sk)->icsk_ack.quick |= 1;
 		}
 	}
 }
@@ -321,11 +325,12 @@ static void tcp_init_buffer_space(struct sock *sk)
 /* 5. Recalculate window clamp after socket hit its memory bounds. */
 static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
 {
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct sk_buff *skb;
 	unsigned int app_win = tp->rcv_nxt - tp->copied_seq;
 	int ofo_win = 0;
 
-	tp->ack.quick = 0;
+	icsk->icsk_ack.quick = 0;
 
 	skb_queue_walk(&tp->out_of_order_queue, skb) {
 		ofo_win += skb->len;
@@ -346,8 +351,8 @@ static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
 		app_win += ofo_win;
 		if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf)
 			app_win >>= 1;
-		if (app_win > tp->ack.rcv_mss)
-			app_win -= tp->ack.rcv_mss;
+		if (app_win > icsk->icsk_ack.rcv_mss)
+			app_win -= icsk->icsk_ack.rcv_mss;
 		app_win = max(app_win, 2U*tp->advmss);
 
 		if (!ofo_win)
@@ -415,11 +420,12 @@ new_measure:
 	tp->rcv_rtt_est.time = tcp_time_stamp;
 }
 
-static inline void tcp_rcv_rtt_measure_ts(struct tcp_sock *tp, struct sk_buff *skb)
+static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, const struct sk_buff *skb)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
 	if (tp->rx_opt.rcv_tsecr &&
 	    (TCP_SKB_CB(skb)->end_seq -
-	     TCP_SKB_CB(skb)->seq >= tp->ack.rcv_mss))
+	     TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss))
 		tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0);
 }
 
@@ -492,41 +498,42 @@ new_measure:
  */
 static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
 {
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	u32 now;
 
-	tcp_schedule_ack(tp);
+	inet_csk_schedule_ack(sk);
 
-	tcp_measure_rcv_mss(tp, skb);
+	tcp_measure_rcv_mss(sk, skb);
 
 	tcp_rcv_rtt_measure(tp);
 	
 	now = tcp_time_stamp;
 
-	if (!tp->ack.ato) {
+	if (!icsk->icsk_ack.ato) {
 		/* The _first_ data packet received, initialize
 		 * delayed ACK engine.
 		 */
-		tcp_incr_quickack(tp);
-		tp->ack.ato = TCP_ATO_MIN;
+		tcp_incr_quickack(sk);
+		icsk->icsk_ack.ato = TCP_ATO_MIN;
 	} else {
-		int m = now - tp->ack.lrcvtime;
+		int m = now - icsk->icsk_ack.lrcvtime;
 
 		if (m <= TCP_ATO_MIN/2) {
 			/* The fastest case is the first. */
-			tp->ack.ato = (tp->ack.ato>>1) + TCP_ATO_MIN/2;
-		} else if (m < tp->ack.ato) {
-			tp->ack.ato = (tp->ack.ato>>1) + m;
-			if (tp->ack.ato > tp->rto)
-				tp->ack.ato = tp->rto;
-		} else if (m > tp->rto) {
+			icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2;
+		} else if (m < icsk->icsk_ack.ato) {
+			icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m;
+			if (icsk->icsk_ack.ato > icsk->icsk_rto)
+				icsk->icsk_ack.ato = icsk->icsk_rto;
+		} else if (m > icsk->icsk_rto) {
 			/* Too long gap. Apparently sender falled to
 			 * restart window, so that we send ACKs quickly.
 			 */
-			tcp_incr_quickack(tp);
+			tcp_incr_quickack(sk);
 			sk_stream_mem_reclaim(sk);
 		}
 	}
-	tp->ack.lrcvtime = now;
+	icsk->icsk_ack.lrcvtime = now;
 
 	TCP_ECN_check_ce(tp, skb);
 
@@ -543,8 +550,10 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
  * To save cycles in the RFC 1323 implementation it was better to break
  * it up into three procedures. -- erics
  */
-static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt)
+static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
 	long m = mrtt; /* RTT */
 
 	/*	The following amusing code comes from Jacobson's
@@ -604,15 +613,16 @@ static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt)
 		tp->rtt_seq = tp->snd_nxt;
 	}
 
-	if (tp->ca_ops->rtt_sample)
-		tp->ca_ops->rtt_sample(tp, *usrtt);
+	if (icsk->icsk_ca_ops->rtt_sample)
+		icsk->icsk_ca_ops->rtt_sample(sk, *usrtt);
 }
 
 /* Calculate rto without backoff.  This is the second half of Van Jacobson's
  * routine referred to above.
  */
-static inline void tcp_set_rto(struct tcp_sock *tp)
+static inline void tcp_set_rto(struct sock *sk)
 {
+	const struct tcp_sock *tp = tcp_sk(sk);
 	/* Old crap is replaced with new one. 8)
 	 *
 	 * More seriously:
@@ -623,7 +633,7 @@ static inline void tcp_set_rto(struct tcp_sock *tp)
 	 *    is invisible. Actually, Linux-2.4 also generates erratic
 	 *    ACKs in some curcumstances.
 	 */
-	tp->rto = (tp->srtt >> 3) + tp->rttvar;
+	inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar;
 
 	/* 2. Fixups made earlier cannot be right.
 	 *    If we do not estimate RTO correctly without them,
@@ -635,10 +645,10 @@ static inline void tcp_set_rto(struct tcp_sock *tp)
 /* NOTE: clamping at TCP_RTO_MIN is not required, current algo
  * guarantees that rto is higher.
  */
-static inline void tcp_bound_rto(struct tcp_sock *tp)
+static inline void tcp_bound_rto(struct sock *sk)
 {
-	if (tp->rto > TCP_RTO_MAX)
-		tp->rto = TCP_RTO_MAX;
+	if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX)
+		inet_csk(sk)->icsk_rto = TCP_RTO_MAX;
 }
 
 /* Save metrics learned by this TCP session.
@@ -656,9 +666,10 @@ void tcp_update_metrics(struct sock *sk)
 	dst_confirm(dst);
 
 	if (dst && (dst->flags&DST_HOST)) {
+		const struct inet_connection_sock *icsk = inet_csk(sk);
 		int m;
 
-		if (tp->backoff || !tp->srtt) {
+		if (icsk->icsk_backoff || !tp->srtt) {
 			/* This session failed to estimate rtt. Why?
 			 * Probably, no packets returned in time.
 			 * Reset our results.
@@ -707,7 +718,7 @@ void tcp_update_metrics(struct sock *sk)
 			    tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
 				dst->metrics[RTAX_CWND-1] = tp->snd_cwnd;
 		} else if (tp->snd_cwnd > tp->snd_ssthresh &&
-			   tp->ca_state == TCP_CA_Open) {
+			   icsk->icsk_ca_state == TCP_CA_Open) {
 			/* Cong. avoidance phase, cwnd is reliable. */
 			if (!dst_metric_locked(dst, RTAX_SSTHRESH))
 				dst->metrics[RTAX_SSTHRESH-1] =
@@ -801,9 +812,9 @@ static void tcp_init_metrics(struct sock *sk)
 		tp->mdev = dst_metric(dst, RTAX_RTTVAR);
 		tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN);
 	}
-	tcp_set_rto(tp);
-	tcp_bound_rto(tp);
-	if (tp->rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp)
+	tcp_set_rto(sk);
+	tcp_bound_rto(sk);
+	if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp)
 		goto reset;
 	tp->snd_cwnd = tcp_init_cwnd(tp, dst);
 	tp->snd_cwnd_stamp = tcp_time_stamp;
@@ -817,12 +828,14 @@ reset:
 	if (!tp->rx_opt.saw_tstamp && tp->srtt) {
 		tp->srtt = 0;
 		tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT;
-		tp->rto = TCP_TIMEOUT_INIT;
+		inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
 	}
 }
 
-static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts)
+static void tcp_update_reordering(struct sock *sk, const int metric,
+				  const int ts)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
 	if (metric > tp->reordering) {
 		tp->reordering = min(TCP_MAX_REORDERING, metric);
 
@@ -837,7 +850,7 @@ static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts)
 			NET_INC_STATS_BH(LINUX_MIB_TCPSACKREORDER);
 #if FASTRETRANS_DEBUG > 1
 		printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n",
-		       tp->rx_opt.sack_ok, tp->ca_state,
+		       tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
 		       tp->reordering,
 		       tp->fackets_out,
 		       tp->sacked_out,
@@ -899,6 +912,7 @@ static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts)
 static int
 tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una)
 {
+	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	unsigned char *ptr = ack_skb->h.raw + TCP_SKB_CB(ack_skb)->sacked;
 	struct tcp_sack_block *sp = (struct tcp_sack_block *)(ptr+2);
@@ -1064,7 +1078,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
 	 * we have to account for reordering! Ugly,
 	 * but should help.
 	 */
-	if (lost_retrans && tp->ca_state == TCP_CA_Recovery) {
+	if (lost_retrans && icsk->icsk_ca_state == TCP_CA_Recovery) {
 		struct sk_buff *skb;
 
 		sk_stream_for_retrans_queue(skb, sk) {
@@ -1093,8 +1107,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
 
 	tp->left_out = tp->sacked_out + tp->lost_out;
 
-	if ((reord < tp->fackets_out) && tp->ca_state != TCP_CA_Loss)
-		tcp_update_reordering(tp, ((tp->fackets_out + 1) - reord), 0);
+	if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss)
+		tcp_update_reordering(sk, ((tp->fackets_out + 1) - reord), 0);
 
 #if FASTRETRANS_DEBUG > 0
 	BUG_TRAP((int)tp->sacked_out >= 0);
@@ -1111,17 +1125,18 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
  */
 void tcp_enter_frto(struct sock *sk)
 {
+	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
 
 	tp->frto_counter = 1;
 
-	if (tp->ca_state <= TCP_CA_Disorder ||
+	if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
             tp->snd_una == tp->high_seq ||
-            (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) {
-		tp->prior_ssthresh = tcp_current_ssthresh(tp);
-		tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
-		tcp_ca_event(tp, CA_EVENT_FRTO);
+            (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
+		tp->prior_ssthresh = tcp_current_ssthresh(sk);
+		tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
+		tcp_ca_event(sk, CA_EVENT_FRTO);
 	}
 
 	/* Have to clear retransmission markers here to keep the bookkeeping
@@ -1138,7 +1153,7 @@ void tcp_enter_frto(struct sock *sk)
 	}
 	tcp_sync_left_out(tp);
 
-	tcp_set_ca_state(tp, TCP_CA_Open);
+	tcp_set_ca_state(sk, TCP_CA_Open);
 	tp->frto_highmark = tp->snd_nxt;
 }
 
@@ -1184,7 +1199,7 @@ static void tcp_enter_frto_loss(struct sock *sk)
 
 	tp->reordering = min_t(unsigned int, tp->reordering,
 					     sysctl_tcp_reordering);
-	tcp_set_ca_state(tp, TCP_CA_Loss);
+	tcp_set_ca_state(sk, TCP_CA_Loss);
 	tp->high_seq = tp->frto_highmark;
 	TCP_ECN_queue_cwr(tp);
 }
@@ -1208,16 +1223,17 @@ void tcp_clear_retrans(struct tcp_sock *tp)
  */
 void tcp_enter_loss(struct sock *sk, int how)
 {
+	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
 	int cnt = 0;
 
 	/* Reduce ssthresh if it has not yet been made inside this window. */
-	if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
-	    (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) {
-		tp->prior_ssthresh = tcp_current_ssthresh(tp);
-		tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
-		tcp_ca_event(tp, CA_EVENT_LOSS);
+	if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
+	    (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
+		tp->prior_ssthresh = tcp_current_ssthresh(sk);
+		tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
+		tcp_ca_event(sk, CA_EVENT_LOSS);
 	}
 	tp->snd_cwnd	   = 1;
 	tp->snd_cwnd_cnt   = 0;
@@ -1248,12 +1264,12 @@ void tcp_enter_loss(struct sock *sk, int how)
 
 	tp->reordering = min_t(unsigned int, tp->reordering,
 					     sysctl_tcp_reordering);
-	tcp_set_ca_state(tp, TCP_CA_Loss);
+	tcp_set_ca_state(sk, TCP_CA_Loss);
 	tp->high_seq = tp->snd_nxt;
 	TCP_ECN_queue_cwr(tp);
 }
 
-static int tcp_check_sack_reneging(struct sock *sk, struct tcp_sock *tp)
+static int tcp_check_sack_reneging(struct sock *sk)
 {
 	struct sk_buff *skb;
 
@@ -1265,12 +1281,14 @@ static int tcp_check_sack_reneging(struct sock *sk, struct tcp_sock *tp)
 	 */
 	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL &&
 	    (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
+		struct inet_connection_sock *icsk = inet_csk(sk);
 		NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING);
 
 		tcp_enter_loss(sk, 1);
-		tp->retransmits++;
+		icsk->icsk_retransmits++;
 		tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue));
-		tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+					  icsk->icsk_rto, TCP_RTO_MAX);
 		return 1;
 	}
 	return 0;
@@ -1281,15 +1299,15 @@ static inline int tcp_fackets_out(struct tcp_sock *tp)
 	return IsReno(tp) ? tp->sacked_out+1 : tp->fackets_out;
 }
 
-static inline int tcp_skb_timedout(struct tcp_sock *tp, struct sk_buff *skb)
+static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb)
 {
-	return (tcp_time_stamp - TCP_SKB_CB(skb)->when > tp->rto);
+	return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto);
 }
 
 static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp)
 {
 	return tp->packets_out &&
-	       tcp_skb_timedout(tp, skb_peek(&sk->sk_write_queue));
+	       tcp_skb_timedout(sk, skb_peek(&sk->sk_write_queue));
 }
 
 /* Linux NewReno/SACK/FACK/ECN state machine.
@@ -1423,8 +1441,9 @@ static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp)
  * in assumption of absent reordering, interpret this as reordering.
  * The only another reason could be bug in receiver TCP.
  */
-static void tcp_check_reno_reordering(struct tcp_sock *tp, int addend)
+static void tcp_check_reno_reordering(struct sock *sk, const int addend)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
 	u32 holes;
 
 	holes = max(tp->lost_out, 1U);
@@ -1432,16 +1451,17 @@ static void tcp_check_reno_reordering(struct tcp_sock *tp, int addend)
 
 	if ((tp->sacked_out + holes) > tp->packets_out) {
 		tp->sacked_out = tp->packets_out - holes;
-		tcp_update_reordering(tp, tp->packets_out+addend, 0);
+		tcp_update_reordering(sk, tp->packets_out + addend, 0);
 	}
 }
 
 /* Emulate SACKs for SACKless connection: account for a new dupack. */
 
-static void tcp_add_reno_sack(struct tcp_sock *tp)
+static void tcp_add_reno_sack(struct sock *sk)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
 	tp->sacked_out++;
-	tcp_check_reno_reordering(tp, 0);
+	tcp_check_reno_reordering(sk, 0);
 	tcp_sync_left_out(tp);
 }
 
@@ -1456,7 +1476,7 @@ static void tcp_remove_reno_sacks(struct sock *sk, struct tcp_sock *tp, int acke
 		else
 			tp->sacked_out -= acked-1;
 	}
-	tcp_check_reno_reordering(tp, acked);
+	tcp_check_reno_reordering(sk, acked);
 	tcp_sync_left_out(tp);
 }
 
@@ -1509,7 +1529,7 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp)
 		struct sk_buff *skb;
 
 		sk_stream_for_retrans_queue(skb, sk) {
-			if (tcp_skb_timedout(tp, skb) &&
+			if (tcp_skb_timedout(sk, skb) &&
 			    !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
 				TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
 				tp->lost_out += tcp_skb_pcount(skb);
@@ -1530,14 +1550,16 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
 }
 
 /* Decrease cwnd each second ack. */
-static void tcp_cwnd_down(struct tcp_sock *tp)
+static void tcp_cwnd_down(struct sock *sk)
 {
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
 	int decr = tp->snd_cwnd_cnt + 1;
 
 	tp->snd_cwnd_cnt = decr&1;
 	decr >>= 1;
 
-	if (decr && tp->snd_cwnd > tp->ca_ops->min_cwnd(tp))
+	if (decr && tp->snd_cwnd > icsk->icsk_ca_ops->min_cwnd(sk))
 		tp->snd_cwnd -= decr;
 
 	tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
@@ -1571,11 +1593,15 @@ static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg)
 #define DBGUNDO(x...) do { } while (0)
 #endif
 
-static void tcp_undo_cwr(struct tcp_sock *tp, int undo)
+static void tcp_undo_cwr(struct sock *sk, const int undo)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
+
 	if (tp->prior_ssthresh) {
-		if (tp->ca_ops->undo_cwnd)
-			tp->snd_cwnd = tp->ca_ops->undo_cwnd(tp);
+		const struct inet_connection_sock *icsk = inet_csk(sk);
+
+		if (icsk->icsk_ca_ops->undo_cwnd)
+			tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
 		else
 			tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1);
 
@@ -1603,9 +1629,9 @@ static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp)
 		/* Happy end! We did not retransmit anything
 		 * or our original transmission succeeded.
 		 */
-		DBGUNDO(sk, tp, tp->ca_state == TCP_CA_Loss ? "loss" : "retrans");
-		tcp_undo_cwr(tp, 1);
-		if (tp->ca_state == TCP_CA_Loss)
+		DBGUNDO(sk, tp, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
+		tcp_undo_cwr(sk, 1);
+		if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
 			NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO);
 		else
 			NET_INC_STATS_BH(LINUX_MIB_TCPFULLUNDO);
@@ -1618,7 +1644,7 @@ static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp)
 		tcp_moderate_cwnd(tp);
 		return 1;
 	}
-	tcp_set_ca_state(tp, TCP_CA_Open);
+	tcp_set_ca_state(sk, TCP_CA_Open);
 	return 0;
 }
 
@@ -1627,7 +1653,7 @@ static void tcp_try_undo_dsack(struct sock *sk, struct tcp_sock *tp)
 {
 	if (tp->undo_marker && !tp->undo_retrans) {
 		DBGUNDO(sk, tp, "D-SACK");
-		tcp_undo_cwr(tp, 1);
+		tcp_undo_cwr(sk, 1);
 		tp->undo_marker = 0;
 		NET_INC_STATS_BH(LINUX_MIB_TCPDSACKUNDO);
 	}
@@ -1648,10 +1674,10 @@ static int tcp_try_undo_partial(struct sock *sk, struct tcp_sock *tp,
 		if (tp->retrans_out == 0)
 			tp->retrans_stamp = 0;
 
-		tcp_update_reordering(tp, tcp_fackets_out(tp)+acked, 1);
+		tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);
 
 		DBGUNDO(sk, tp, "Hoe");
-		tcp_undo_cwr(tp, 0);
+		tcp_undo_cwr(sk, 0);
 		NET_INC_STATS_BH(LINUX_MIB_TCPPARTIALUNDO);
 
 		/* So... Do not make Hoe's retransmit yet.
@@ -1674,22 +1700,23 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp)
 		DBGUNDO(sk, tp, "partial loss");
 		tp->lost_out = 0;
 		tp->left_out = tp->sacked_out;
-		tcp_undo_cwr(tp, 1);
+		tcp_undo_cwr(sk, 1);
 		NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO);
-		tp->retransmits = 0;
+		inet_csk(sk)->icsk_retransmits = 0;
 		tp->undo_marker = 0;
 		if (!IsReno(tp))
-			tcp_set_ca_state(tp, TCP_CA_Open);
+			tcp_set_ca_state(sk, TCP_CA_Open);
 		return 1;
 	}
 	return 0;
 }
 
-static inline void tcp_complete_cwr(struct tcp_sock *tp)
+static inline void tcp_complete_cwr(struct sock *sk)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
 	tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
 	tp->snd_cwnd_stamp = tcp_time_stamp;
-	tcp_ca_event(tp, CA_EVENT_COMPLETE_CWR);
+	tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
 }
 
 static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag)
@@ -1700,21 +1727,21 @@ static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag)
 		tp->retrans_stamp = 0;
 
 	if (flag&FLAG_ECE)
-		tcp_enter_cwr(tp);
+		tcp_enter_cwr(sk);
 
-	if (tp->ca_state != TCP_CA_CWR) {
+	if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
 		int state = TCP_CA_Open;
 
 		if (tp->left_out || tp->retrans_out || tp->undo_marker)
 			state = TCP_CA_Disorder;
 
-		if (tp->ca_state != state) {
-			tcp_set_ca_state(tp, state);
+		if (inet_csk(sk)->icsk_ca_state != state) {
+			tcp_set_ca_state(sk, state);
 			tp->high_seq = tp->snd_nxt;
 		}
 		tcp_moderate_cwnd(tp);
 	} else {
-		tcp_cwnd_down(tp);
+		tcp_cwnd_down(sk);
 	}
 }
 
@@ -1733,6 +1760,7 @@ static void
 tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 		      int prior_packets, int flag)
 {
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	int is_dupack = (tp->snd_una == prior_snd_una && !(flag&FLAG_NOT_DUP));
 
@@ -1750,13 +1778,13 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 		tp->prior_ssthresh = 0;
 
 	/* B. In all the states check for reneging SACKs. */
-	if (tp->sacked_out && tcp_check_sack_reneging(sk, tp))
+	if (tp->sacked_out && tcp_check_sack_reneging(sk))
 		return;
 
 	/* C. Process data loss notification, provided it is valid. */
 	if ((flag&FLAG_DATA_LOST) &&
 	    before(tp->snd_una, tp->high_seq) &&
-	    tp->ca_state != TCP_CA_Open &&
+	    icsk->icsk_ca_state != TCP_CA_Open &&
 	    tp->fackets_out > tp->reordering) {
 		tcp_mark_head_lost(sk, tp, tp->fackets_out-tp->reordering, tp->high_seq);
 		NET_INC_STATS_BH(LINUX_MIB_TCPLOSS);
@@ -1767,14 +1795,14 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 
 	/* E. Check state exit conditions. State can be terminated
 	 *    when high_seq is ACKed. */
-	if (tp->ca_state == TCP_CA_Open) {
+	if (icsk->icsk_ca_state == TCP_CA_Open) {
 		if (!sysctl_tcp_frto)
 			BUG_TRAP(tp->retrans_out == 0);
 		tp->retrans_stamp = 0;
 	} else if (!before(tp->snd_una, tp->high_seq)) {
-		switch (tp->ca_state) {
+		switch (icsk->icsk_ca_state) {
 		case TCP_CA_Loss:
-			tp->retransmits = 0;
+			icsk->icsk_retransmits = 0;
 			if (tcp_try_undo_recovery(sk, tp))
 				return;
 			break;
@@ -1783,8 +1811,8 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 			/* CWR is to be held something *above* high_seq
 			 * is ACKed for CWR bit to reach receiver. */
 			if (tp->snd_una != tp->high_seq) {
-				tcp_complete_cwr(tp);
-				tcp_set_ca_state(tp, TCP_CA_Open);
+				tcp_complete_cwr(sk);
+				tcp_set_ca_state(sk, TCP_CA_Open);
 			}
 			break;
 
@@ -1795,7 +1823,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 			     * catching for all duplicate ACKs. */
 			    IsReno(tp) || tp->snd_una != tp->high_seq) {
 				tp->undo_marker = 0;
-				tcp_set_ca_state(tp, TCP_CA_Open);
+				tcp_set_ca_state(sk, TCP_CA_Open);
 			}
 			break;
 
@@ -1804,17 +1832,17 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 				tcp_reset_reno_sack(tp);
 			if (tcp_try_undo_recovery(sk, tp))
 				return;
-			tcp_complete_cwr(tp);
+			tcp_complete_cwr(sk);
 			break;
 		}
 	}
 
 	/* F. Process state. */
-	switch (tp->ca_state) {
+	switch (icsk->icsk_ca_state) {
 	case TCP_CA_Recovery:
 		if (prior_snd_una == tp->snd_una) {
 			if (IsReno(tp) && is_dupack)
-				tcp_add_reno_sack(tp);
+				tcp_add_reno_sack(sk);
 		} else {
 			int acked = prior_packets - tp->packets_out;
 			if (IsReno(tp))
@@ -1824,13 +1852,13 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 		break;
 	case TCP_CA_Loss:
 		if (flag&FLAG_DATA_ACKED)
-			tp->retransmits = 0;
+			icsk->icsk_retransmits = 0;
 		if (!tcp_try_undo_loss(sk, tp)) {
 			tcp_moderate_cwnd(tp);
 			tcp_xmit_retransmit_queue(sk);
 			return;
 		}
-		if (tp->ca_state != TCP_CA_Open)
+		if (icsk->icsk_ca_state != TCP_CA_Open)
 			return;
 		/* Loss is undone; fall through to processing in Open state. */
 	default:
@@ -1838,10 +1866,10 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 			if (tp->snd_una != prior_snd_una)
 				tcp_reset_reno_sack(tp);
 			if (is_dupack)
-				tcp_add_reno_sack(tp);
+				tcp_add_reno_sack(sk);
 		}
 
-		if (tp->ca_state == TCP_CA_Disorder)
+		if (icsk->icsk_ca_state == TCP_CA_Disorder)
 			tcp_try_undo_dsack(sk, tp);
 
 		if (!tcp_time_to_recover(sk, tp)) {
@@ -1861,30 +1889,28 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 		tp->undo_marker = tp->snd_una;
 		tp->undo_retrans = tp->retrans_out;
 
-		if (tp->ca_state < TCP_CA_CWR) {
+		if (icsk->icsk_ca_state < TCP_CA_CWR) {
 			if (!(flag&FLAG_ECE))
-				tp->prior_ssthresh = tcp_current_ssthresh(tp);
-			tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
+				tp->prior_ssthresh = tcp_current_ssthresh(sk);
+			tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
 			TCP_ECN_queue_cwr(tp);
 		}
 
 		tp->snd_cwnd_cnt = 0;
-		tcp_set_ca_state(tp, TCP_CA_Recovery);
+		tcp_set_ca_state(sk, TCP_CA_Recovery);
 	}
 
 	if (is_dupack || tcp_head_timedout(sk, tp))
 		tcp_update_scoreboard(sk, tp);
-	tcp_cwnd_down(tp);
+	tcp_cwnd_down(sk);
 	tcp_xmit_retransmit_queue(sk);
 }
 
 /* Read draft-ietf-tcplw-high-performance before mucking
  * with this code. (Superceeds RFC1323)
  */
-static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag)
+static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag)
 {
-	__u32 seq_rtt;
-
 	/* RTTM Rule: A TSecr value received in a segment is used to
 	 * update the averaged RTT measurement only if the segment
 	 * acknowledges some new data, i.e., only if it advances the
@@ -1900,14 +1926,15 @@ static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag)
 	 * answer arrives rto becomes 120 seconds! If at least one of segments
 	 * in window is lost... Voila.	 			--ANK (010210)
 	 */
-	seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
-	tcp_rtt_estimator(tp, seq_rtt, usrtt);
-	tcp_set_rto(tp);
-	tp->backoff = 0;
-	tcp_bound_rto(tp);
+	struct tcp_sock *tp = tcp_sk(sk);
+	const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
+	tcp_rtt_estimator(sk, seq_rtt, usrtt);
+	tcp_set_rto(sk);
+	inet_csk(sk)->icsk_backoff = 0;
+	tcp_bound_rto(sk);
 }
 
-static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int flag)
+static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag)
 {
 	/* We don't have a timestamp. Can only use
 	 * packets that are not retransmitted to determine
@@ -1921,27 +1948,29 @@ static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int
 	if (flag & FLAG_RETRANS_DATA_ACKED)
 		return;
 
-	tcp_rtt_estimator(tp, seq_rtt, usrtt);
-	tcp_set_rto(tp);
-	tp->backoff = 0;
-	tcp_bound_rto(tp);
+	tcp_rtt_estimator(sk, seq_rtt, usrtt);
+	tcp_set_rto(sk);
+	inet_csk(sk)->icsk_backoff = 0;
+	tcp_bound_rto(sk);
 }
 
-static inline void tcp_ack_update_rtt(struct tcp_sock *tp,
-				      int flag, s32 seq_rtt, u32 *usrtt)
+static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
+				      const s32 seq_rtt, u32 *usrtt)
 {
+	const struct tcp_sock *tp = tcp_sk(sk);
 	/* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
 	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
-		tcp_ack_saw_tstamp(tp, usrtt, flag);
+		tcp_ack_saw_tstamp(sk, usrtt, flag);
 	else if (seq_rtt >= 0)
-		tcp_ack_no_tstamp(tp, seq_rtt, usrtt, flag);
+		tcp_ack_no_tstamp(sk, seq_rtt, usrtt, flag);
 }
 
-static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
+static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
 				  u32 in_flight, int good)
 {
-	tp->ca_ops->cong_avoid(tp, ack, rtt, in_flight, good);
-	tp->snd_cwnd_stamp = tcp_time_stamp;
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	icsk->icsk_ca_ops->cong_avoid(sk, ack, rtt, in_flight, good);
+	tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp;
 }
 
 /* Restart timer after forward progress on connection.
@@ -1951,9 +1980,9 @@ static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
 static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp)
 {
 	if (!tp->packets_out) {
-		tcp_clear_xmit_timer(sk, TCP_TIME_RETRANS);
+		inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
 	} else {
-		tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
 	}
 }
 
@@ -2068,9 +2097,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
 				seq_rtt = -1;
 			} else if (seq_rtt < 0)
 				seq_rtt = now - scb->when;
-			if (seq_usrtt)
-				*seq_usrtt = (usnow.tv_sec - skb->stamp.tv_sec) * 1000000
-					+ (usnow.tv_usec - skb->stamp.tv_usec);
+			if (seq_usrtt) {
+				struct timeval tv;
+			
+				skb_get_timestamp(skb, &tv);
+				*seq_usrtt = (usnow.tv_sec - tv.tv_sec) * 1000000
+					+ (usnow.tv_usec - tv.tv_usec);
+			}
 
 			if (sacked & TCPCB_SACKED_ACKED)
 				tp->sacked_out -= tcp_skb_pcount(skb);
@@ -2085,16 +2118,17 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
 			seq_rtt = now - scb->when;
 		tcp_dec_pcount_approx(&tp->fackets_out, skb);
 		tcp_packets_out_dec(tp, skb);
-		__skb_unlink(skb, skb->list);
+		__skb_unlink(skb, &sk->sk_write_queue);
 		sk_stream_free_skb(sk, skb);
 	}
 
 	if (acked&FLAG_ACKED) {
-		tcp_ack_update_rtt(tp, acked, seq_rtt, seq_usrtt);
+		const struct inet_connection_sock *icsk = inet_csk(sk);
+		tcp_ack_update_rtt(sk, acked, seq_rtt, seq_usrtt);
 		tcp_ack_packets_out(sk, tp);
 
-		if (tp->ca_ops->pkts_acked)
-			tp->ca_ops->pkts_acked(tp, pkts_acked);
+		if (icsk->icsk_ca_ops->pkts_acked)
+			icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked);
 	}
 
 #if FASTRETRANS_DEBUG > 0
@@ -2102,19 +2136,20 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
 	BUG_TRAP((int)tp->lost_out >= 0);
 	BUG_TRAP((int)tp->retrans_out >= 0);
 	if (!tp->packets_out && tp->rx_opt.sack_ok) {
+		const struct inet_connection_sock *icsk = inet_csk(sk);
 		if (tp->lost_out) {
 			printk(KERN_DEBUG "Leak l=%u %d\n",
-			       tp->lost_out, tp->ca_state);
+			       tp->lost_out, icsk->icsk_ca_state);
 			tp->lost_out = 0;
 		}
 		if (tp->sacked_out) {
 			printk(KERN_DEBUG "Leak s=%u %d\n",
-			       tp->sacked_out, tp->ca_state);
+			       tp->sacked_out, icsk->icsk_ca_state);
 			tp->sacked_out = 0;
 		}
 		if (tp->retrans_out) {
 			printk(KERN_DEBUG "Leak r=%u %d\n",
-			       tp->retrans_out, tp->ca_state);
+			       tp->retrans_out, icsk->icsk_ca_state);
 			tp->retrans_out = 0;
 		}
 	}
@@ -2125,40 +2160,43 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
 
 static void tcp_ack_probe(struct sock *sk)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
 
 	/* Was it a usable window open? */
 
 	if (!after(TCP_SKB_CB(sk->sk_send_head)->end_seq,
 		   tp->snd_una + tp->snd_wnd)) {
-		tp->backoff = 0;
-		tcp_clear_xmit_timer(sk, TCP_TIME_PROBE0);
+		icsk->icsk_backoff = 0;
+		inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
 		/* Socket must be waked up by subsequent tcp_data_snd_check().
 		 * This function is not for random using!
 		 */
 	} else {
-		tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0,
-				     min(tp->rto << tp->backoff, TCP_RTO_MAX));
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
+					  min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
+					  TCP_RTO_MAX);
 	}
 }
 
-static inline int tcp_ack_is_dubious(struct tcp_sock *tp, int flag)
+static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag)
 {
 	return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
-		tp->ca_state != TCP_CA_Open);
+		inet_csk(sk)->icsk_ca_state != TCP_CA_Open);
 }
 
-static inline int tcp_may_raise_cwnd(struct tcp_sock *tp, int flag)
+static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag)
 {
+	const struct tcp_sock *tp = tcp_sk(sk);
 	return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) &&
-		!((1<<tp->ca_state)&(TCPF_CA_Recovery|TCPF_CA_CWR));
+		!((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Recovery | TCPF_CA_CWR));
 }
 
 /* Check that window update is acceptable.
  * The function assumes that snd_una<=ack<=snd_next.
  */
-static inline int tcp_may_update_window(struct tcp_sock *tp, u32 ack,
-					u32 ack_seq, u32 nwin)
+static inline int tcp_may_update_window(const struct tcp_sock *tp, const u32 ack,
+					const u32 ack_seq, const u32 nwin)
 {
 	return (after(ack, tp->snd_una) ||
 		after(ack_seq, tp->snd_wl1) ||
@@ -2241,6 +2279,7 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una)
 /* This routine deals with incoming acks, but not outgoing ones. */
 static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
 {
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	u32 prior_snd_una = tp->snd_una;
 	u32 ack_seq = TCP_SKB_CB(skb)->seq;
@@ -2268,7 +2307,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
 		tp->snd_una = ack;
 		flag |= FLAG_WIN_UPDATE;
 
-		tcp_ca_event(tp, CA_EVENT_FAST_ACK);
+		tcp_ca_event(sk, CA_EVENT_FAST_ACK);
 
 		NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS);
 	} else {
@@ -2285,7 +2324,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
 		if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th))
 			flag |= FLAG_ECE;
 
-		tcp_ca_event(tp, CA_EVENT_SLOW_ACK);
+		tcp_ca_event(sk, CA_EVENT_SLOW_ACK);
 	}
 
 	/* We passed data and got it acked, remove any soft error
@@ -2301,19 +2340,19 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
 
 	/* See if we can take anything off of the retransmit queue. */
 	flag |= tcp_clean_rtx_queue(sk, &seq_rtt,
-				    tp->ca_ops->rtt_sample ? &seq_usrtt : NULL);
+				    icsk->icsk_ca_ops->rtt_sample ? &seq_usrtt : NULL);
 
 	if (tp->frto_counter)
 		tcp_process_frto(sk, prior_snd_una);
 
-	if (tcp_ack_is_dubious(tp, flag)) {
+	if (tcp_ack_is_dubious(sk, flag)) {
 		/* Advanve CWND, if state allows this. */
-		if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(tp, flag))
-			tcp_cong_avoid(tp, ack,  seq_rtt, prior_in_flight, 0);
+		if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag))
+			tcp_cong_avoid(sk, ack,  seq_rtt, prior_in_flight, 0);
 		tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag);
 	} else {
 		if ((flag & FLAG_DATA_ACKED))
-			tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 1);
+			tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 1);
 	}
 
 	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP))
@@ -2322,7 +2361,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
 	return 1;
 
 no_queue:
-	tp->probes_out = 0;
+	icsk->icsk_probes_out = 0;
 
 	/* If this ack opens up a zero window, clear backoff.  It was
 	 * being used to time the probes, and is probably far higher than
@@ -2500,8 +2539,9 @@ static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
  * up to bandwidth of 18Gigabit/sec. 8) ]
  */
 
-static int tcp_disordered_ack(struct tcp_sock *tp, struct sk_buff *skb)
+static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
 	struct tcphdr *th = skb->h.th;
 	u32 seq = TCP_SKB_CB(skb)->seq;
 	u32 ack = TCP_SKB_CB(skb)->ack_seq;
@@ -2516,14 +2556,15 @@ static int tcp_disordered_ack(struct tcp_sock *tp, struct sk_buff *skb)
 		!tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&
 
 		/* 4. ... and sits in replay window. */
-		(s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (tp->rto*1024)/HZ);
+		(s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);
 }
 
-static inline int tcp_paws_discard(struct tcp_sock *tp, struct sk_buff *skb)
+static inline int tcp_paws_discard(const struct sock *sk, const struct sk_buff *skb)
 {
+	const struct tcp_sock *tp = tcp_sk(sk);
 	return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW &&
 		xtime.tv_sec < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS &&
-		!tcp_disordered_ack(tp, skb));
+		!tcp_disordered_ack(sk, skb));
 }
 
 /* Check segment sequence number for validity.
@@ -2586,7 +2627,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	tcp_schedule_ack(tp);
+	inet_csk_schedule_ack(sk);
 
 	sk->sk_shutdown |= RCV_SHUTDOWN;
 	sock_set_flag(sk, SOCK_DONE);
@@ -2596,7 +2637,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
 		case TCP_ESTABLISHED:
 			/* Move to CLOSE_WAIT */
 			tcp_set_state(sk, TCP_CLOSE_WAIT);
-			tp->ack.pingpong = 1;
+			inet_csk(sk)->icsk_ack.pingpong = 1;
 			break;
 
 		case TCP_CLOSE_WAIT:
@@ -2694,7 +2735,7 @@ static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb)
 	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
 	    before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
 		NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST);
-		tcp_enter_quickack_mode(tp);
+		tcp_enter_quickack_mode(sk);
 
 		if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) {
 			u32 end_seq = TCP_SKB_CB(skb)->end_seq;
@@ -2853,7 +2894,7 @@ static void tcp_ofo_queue(struct sock *sk)
 
 		if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
 			SOCK_DEBUG(sk, "ofo packet was already received \n");
-			__skb_unlink(skb, skb->list);
+			__skb_unlink(skb, &tp->out_of_order_queue);
 			__kfree_skb(skb);
 			continue;
 		}
@@ -2861,7 +2902,7 @@ static void tcp_ofo_queue(struct sock *sk)
 			   tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
 			   TCP_SKB_CB(skb)->end_seq);
 
-		__skb_unlink(skb, skb->list);
+		__skb_unlink(skb, &tp->out_of_order_queue);
 		__skb_queue_tail(&sk->sk_receive_queue, skb);
 		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
 		if(skb->h.th->fin)
@@ -2942,7 +2983,7 @@ queue_and_out:
 			 * gap in queue is filled.
 			 */
 			if (skb_queue_empty(&tp->out_of_order_queue))
-				tp->ack.pingpong = 0;
+				inet_csk(sk)->icsk_ack.pingpong = 0;
 		}
 
 		if (tp->rx_opt.num_sacks)
@@ -2963,8 +3004,8 @@ queue_and_out:
 		tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
 
 out_of_window:
-		tcp_enter_quickack_mode(tp);
-		tcp_schedule_ack(tp);
+		tcp_enter_quickack_mode(sk);
+		inet_csk_schedule_ack(sk);
 drop:
 		__kfree_skb(skb);
 		return;
@@ -2974,7 +3015,7 @@ drop:
 	if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
 		goto out_of_window;
 
-	tcp_enter_quickack_mode(tp);
+	tcp_enter_quickack_mode(sk);
 
 	if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
 		/* Partial packet, seq < rcv_next < end_seq */
@@ -3003,7 +3044,7 @@ drop:
 
 	/* Disable header prediction. */
 	tp->pred_flags = 0;
-	tcp_schedule_ack(tp);
+	inet_csk_schedule_ack(sk);
 
 	SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
 		   tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
@@ -3027,7 +3068,7 @@ drop:
 		u32 end_seq = TCP_SKB_CB(skb)->end_seq;
 
 		if (seq == TCP_SKB_CB(skb1)->end_seq) {
-			__skb_append(skb1, skb);
+			__skb_append(skb1, skb, &tp->out_of_order_queue);
 
 			if (!tp->rx_opt.num_sacks ||
 			    tp->selective_acks[0].end_seq != seq)
@@ -3071,7 +3112,7 @@ drop:
 			       tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, end_seq);
 			       break;
 		       }
-		       __skb_unlink(skb1, skb1->list);
+		       __skb_unlink(skb1, &tp->out_of_order_queue);
 		       tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq);
 		       __kfree_skb(skb1);
 		}
@@ -3088,8 +3129,9 @@ add_sack:
  * simplifies code)
  */
 static void
-tcp_collapse(struct sock *sk, struct sk_buff *head,
-	     struct sk_buff *tail, u32 start, u32 end)
+tcp_collapse(struct sock *sk, struct sk_buff_head *list,
+	     struct sk_buff *head, struct sk_buff *tail,
+	     u32 start, u32 end)
 {
 	struct sk_buff *skb;
 
@@ -3099,7 +3141,7 @@ tcp_collapse(struct sock *sk, struct sk_buff *head,
 		/* No new bits? It is possible on ofo queue. */
 		if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
 			struct sk_buff *next = skb->next;
-			__skb_unlink(skb, skb->list);
+			__skb_unlink(skb, list);
 			__kfree_skb(skb);
 			NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED);
 			skb = next;
@@ -3145,7 +3187,7 @@ tcp_collapse(struct sock *sk, struct sk_buff *head,
 		nskb->mac.raw = nskb->head + (skb->mac.raw-skb->head);
 		memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
 		TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
-		__skb_insert(nskb, skb->prev, skb, skb->list);
+		__skb_insert(nskb, skb->prev, skb, list);
 		sk_stream_set_owner_r(nskb, sk);
 
 		/* Copy data, releasing collapsed skbs. */
@@ -3164,7 +3206,7 @@ tcp_collapse(struct sock *sk, struct sk_buff *head,
 			}
 			if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
 				struct sk_buff *next = skb->next;
-				__skb_unlink(skb, skb->list);
+				__skb_unlink(skb, list);
 				__kfree_skb(skb);
 				NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED);
 				skb = next;
@@ -3200,7 +3242,8 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
 		if (skb == (struct sk_buff *)&tp->out_of_order_queue ||
 		    after(TCP_SKB_CB(skb)->seq, end) ||
 		    before(TCP_SKB_CB(skb)->end_seq, start)) {
-			tcp_collapse(sk, head, skb, start, end);
+			tcp_collapse(sk, &tp->out_of_order_queue,
+				     head, skb, start, end);
 			head = skb;
 			if (skb == (struct sk_buff *)&tp->out_of_order_queue)
 				break;
@@ -3237,7 +3280,8 @@ static int tcp_prune_queue(struct sock *sk)
 		tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
 
 	tcp_collapse_ofo_queue(sk);
-	tcp_collapse(sk, sk->sk_receive_queue.next,
+	tcp_collapse(sk, &sk->sk_receive_queue,
+		     sk->sk_receive_queue.next,
 		     (struct sk_buff*)&sk->sk_receive_queue,
 		     tp->copied_seq, tp->rcv_nxt);
 	sk_stream_mem_reclaim(sk);
@@ -3286,12 +3330,12 @@ void tcp_cwnd_application_limited(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	if (tp->ca_state == TCP_CA_Open &&
+	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
 	    sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
 		/* Limited by application or receiver window. */
 		u32 win_used = max(tp->snd_cwnd_used, 2U);
 		if (win_used < tp->snd_cwnd) {
-			tp->snd_ssthresh = tcp_current_ssthresh(tp);
+			tp->snd_ssthresh = tcp_current_ssthresh(sk);
 			tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
 		}
 		tp->snd_cwnd_used = 0;
@@ -3370,13 +3414,13 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	    /* More than one full frame received... */
-	if (((tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss
+	if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss
 	     /* ... and right edge of window advances far enough.
 	      * (tcp_recvmsg() will send ACK otherwise). Or...
 	      */
 	     && __tcp_select_window(sk) >= tp->rcv_wnd) ||
 	    /* We ACK each frame or... */
-	    tcp_in_quickack_mode(tp) ||
+	    tcp_in_quickack_mode(sk) ||
 	    /* We have out of order data. */
 	    (ofo_possible &&
 	     skb_peek(&tp->out_of_order_queue))) {
@@ -3390,8 +3434,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
 
 static __inline__ void tcp_ack_snd_check(struct sock *sk)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
-	if (!tcp_ack_scheduled(tp)) {
+	if (!inet_csk_ack_scheduled(sk)) {
 		/* We sent a data segment already. */
 		return;
 	}
@@ -3462,7 +3505,7 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
 		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
 		tp->copied_seq++;
 		if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
-			__skb_unlink(skb, skb->list);
+			__skb_unlink(skb, &sk->sk_receive_queue);
 			__kfree_skb(skb);
 		}
 	}
@@ -3645,7 +3688,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 				    tp->rcv_nxt == tp->rcv_wup)
 					tcp_store_ts_recent(tp);
 
-				tcp_rcv_rtt_measure_ts(tp, skb);
+				tcp_rcv_rtt_measure_ts(sk, skb);
 
 				/* We know that such packets are checksummed
 				 * on entry.
@@ -3678,7 +3721,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 					    tp->rcv_nxt == tp->rcv_wup)
 						tcp_store_ts_recent(tp);
 
-					tcp_rcv_rtt_measure_ts(tp, skb);
+					tcp_rcv_rtt_measure_ts(sk, skb);
 
 					__skb_pull(skb, tcp_header_len);
 					tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
@@ -3699,7 +3742,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 				    tp->rcv_nxt == tp->rcv_wup)
 					tcp_store_ts_recent(tp);
 
-				tcp_rcv_rtt_measure_ts(tp, skb);
+				tcp_rcv_rtt_measure_ts(sk, skb);
 
 				if ((int)skb->truesize > sk->sk_forward_alloc)
 					goto step5;
@@ -3719,7 +3762,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 				/* Well, only one small jumplet in fast path... */
 				tcp_ack(sk, skb, FLAG_DATA);
 				tcp_data_snd_check(sk, tp);
-				if (!tcp_ack_scheduled(tp))
+				if (!inet_csk_ack_scheduled(sk))
 					goto no_ack;
 			}
 
@@ -3741,7 +3784,7 @@ slow_path:
 	 * RFC1323: H1. Apply PAWS check first.
 	 */
 	if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
-	    tcp_paws_discard(tp, skb)) {
+	    tcp_paws_discard(sk, skb)) {
 		if (!th->rst) {
 			NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
 			tcp_send_dupack(sk, skb);
@@ -3788,7 +3831,7 @@ step5:
 	if(th->ack)
 		tcp_ack(sk, skb, FLAG_SLOWPATH);
 
-	tcp_rcv_rtt_measure_ts(tp, skb);
+	tcp_rcv_rtt_measure_ts(sk, skb);
 
 	/* Process urgent data. */
 	tcp_urg(sk, skb, th);
@@ -3817,6 +3860,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 	tcp_parse_options(skb, &tp->rx_opt, 0);
 
 	if (th->ack) {
+		struct inet_connection_sock *icsk;
 		/* rfc793:
 		 * "If the state is SYN-SENT then
 		 *    first check the ACK bit
@@ -3920,7 +3964,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 
 		tcp_init_metrics(sk);
 
-		tcp_init_congestion_control(tp);
+		tcp_init_congestion_control(sk);
 
 		/* Prevent spurious tcp_cwnd_restart() on first data
 		 * packet.
@@ -3930,7 +3974,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		tcp_init_buffer_space(sk);
 
 		if (sock_flag(sk, SOCK_KEEPOPEN))
-			tcp_reset_keepalive_timer(sk, keepalive_time_when(tp));
+			inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
 
 		if (!tp->rx_opt.snd_wscale)
 			__tcp_fast_path_on(tp, tp->snd_wnd);
@@ -3942,7 +3986,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 			sk_wake_async(sk, 0, POLL_OUT);
 		}
 
-		if (sk->sk_write_pending || tp->defer_accept || tp->ack.pingpong) {
+		icsk = inet_csk(sk);
+
+		if (sk->sk_write_pending ||
+		    icsk->icsk_accept_queue.rskq_defer_accept ||
+		    icsk->icsk_ack.pingpong) {
 			/* Save one ACK. Data will be ready after
 			 * several ticks, if write_pending is set.
 			 *
@@ -3950,12 +3998,13 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 			 * look so _wonderfully_ clever, that I was not able
 			 * to stand against the temptation 8)     --ANK
 			 */
-			tcp_schedule_ack(tp);
-			tp->ack.lrcvtime = tcp_time_stamp;
-			tp->ack.ato	 = TCP_ATO_MIN;
-			tcp_incr_quickack(tp);
-			tcp_enter_quickack_mode(tp);
-			tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
+			inet_csk_schedule_ack(sk);
+			icsk->icsk_ack.lrcvtime = tcp_time_stamp;
+			icsk->icsk_ack.ato	 = TCP_ATO_MIN;
+			tcp_incr_quickack(sk);
+			tcp_enter_quickack_mode(sk);
+			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+						  TCP_DELACK_MAX, TCP_RTO_MAX);
 
 discard:
 			__kfree_skb(skb);
@@ -4111,7 +4160,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 	}
 
 	if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
-	    tcp_paws_discard(tp, skb)) {
+	    tcp_paws_discard(sk, skb)) {
 		if (!th->rst) {
 			NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
 			tcp_send_dupack(sk, skb);
@@ -4180,7 +4229,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 				 */
 				if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
 				    !tp->srtt)
-					tcp_ack_saw_tstamp(tp, 0, 0);
+					tcp_ack_saw_tstamp(sk, NULL, 0);
 
 				if (tp->rx_opt.tstamp_ok)
 					tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
@@ -4192,7 +4241,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 
 				tcp_init_metrics(sk);
 
-				tcp_init_congestion_control(tp);
+				tcp_init_congestion_control(sk);
 
 				/* Prevent spurious tcp_cwnd_restart() on
 				 * first data packet.
@@ -4227,9 +4276,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 						return 1;
 					}
 
-					tmo = tcp_fin_time(tp);
+					tmo = tcp_fin_time(sk);
 					if (tmo > TCP_TIMEWAIT_LEN) {
-						tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
+						inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
 					} else if (th->fin || sock_owned_by_user(sk)) {
 						/* Bad case. We could lose such FIN otherwise.
 						 * It is not a big problem, but it looks confusing
@@ -4237,7 +4286,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 						 * if it spins in bh_lock_sock(), but it is really
 						 * marginal case.
 						 */
-						tcp_reset_keepalive_timer(sk, tmo);
+						inet_csk_reset_keepalive_timer(sk, tmo);
 					} else {
 						tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
 						goto discard;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 67c670886c1..13dfb391cdf 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -64,7 +64,9 @@
 #include <linux/times.h>
 
 #include <net/icmp.h>
+#include <net/inet_hashtables.h>
 #include <net/tcp.h>
+#include <net/transp_v6.h>
 #include <net/ipv6.h>
 #include <net/inet_common.h>
 #include <net/xfrm.h>
@@ -75,7 +77,6 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 
-extern int sysctl_ip_dynaddr;
 int sysctl_tcp_tw_reuse;
 int sysctl_tcp_low_latency;
 
@@ -88,463 +89,29 @@ static struct socket *tcp_socket;
 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
 		       struct sk_buff *skb);
 
-struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
-	.__tcp_lhash_lock	=	RW_LOCK_UNLOCKED,
-	.__tcp_lhash_users	=	ATOMIC_INIT(0),
-	.__tcp_lhash_wait
-	  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
-	.__tcp_portalloc_lock	=	SPIN_LOCK_UNLOCKED
+struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
+	.lhash_lock	= RW_LOCK_UNLOCKED,
+	.lhash_users	= ATOMIC_INIT(0),
+	.lhash_wait	= __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
+	.portalloc_lock	= SPIN_LOCK_UNLOCKED,
+	.port_rover	= 1024 - 1,
 };
 
-/*
- * This array holds the first and last local port number.
- * For high-usage systems, use sysctl to change this to
- * 32768-61000
- */
-int sysctl_local_port_range[2] = { 1024, 4999 };
-int tcp_port_rover = 1024 - 1;
-
-static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
-				 __u32 faddr, __u16 fport)
-{
-	int h = (laddr ^ lport) ^ (faddr ^ fport);
-	h ^= h >> 16;
-	h ^= h >> 8;
-	return h & (tcp_ehash_size - 1);
-}
-
-static __inline__ int tcp_sk_hashfn(struct sock *sk)
-{
-	struct inet_sock *inet = inet_sk(sk);
-	__u32 laddr = inet->rcv_saddr;
-	__u16 lport = inet->num;
-	__u32 faddr = inet->daddr;
-	__u16 fport = inet->dport;
-
-	return tcp_hashfn(laddr, lport, faddr, fport);
-}
-
-/* Allocate and initialize a new TCP local port bind bucket.
- * The bindhash mutex for snum's hash chain must be held here.
- */
-struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
-					  unsigned short snum)
-{
-	struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
-						      SLAB_ATOMIC);
-	if (tb) {
-		tb->port = snum;
-		tb->fastreuse = 0;
-		INIT_HLIST_HEAD(&tb->owners);
-		hlist_add_head(&tb->node, &head->chain);
-	}
-	return tb;
-}
-
-/* Caller must hold hashbucket lock for this tb with local BH disabled */
-void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
-{
-	if (hlist_empty(&tb->owners)) {
-		__hlist_del(&tb->node);
-		kmem_cache_free(tcp_bucket_cachep, tb);
-	}
-}
-
-/* Caller must disable local BH processing. */
-static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
-{
-	struct tcp_bind_hashbucket *head =
-				&tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
-	struct tcp_bind_bucket *tb;
-
-	spin_lock(&head->lock);
-	tb = tcp_sk(sk)->bind_hash;
-	sk_add_bind_node(child, &tb->owners);
-	tcp_sk(child)->bind_hash = tb;
-	spin_unlock(&head->lock);
-}
-
-inline void tcp_inherit_port(struct sock *sk, struct sock *child)
-{
-	local_bh_disable();
-	__tcp_inherit_port(sk, child);
-	local_bh_enable();
-}
-
-void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
-		   unsigned short snum)
-{
-	inet_sk(sk)->num = snum;
-	sk_add_bind_node(sk, &tb->owners);
-	tcp_sk(sk)->bind_hash = tb;
-}
-
-static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
-{
-	const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
-	struct sock *sk2;
-	struct hlist_node *node;
-	int reuse = sk->sk_reuse;
-
-	sk_for_each_bound(sk2, node, &tb->owners) {
-		if (sk != sk2 &&
-		    !tcp_v6_ipv6only(sk2) &&
-		    (!sk->sk_bound_dev_if ||
-		     !sk2->sk_bound_dev_if ||
-		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
-			if (!reuse || !sk2->sk_reuse ||
-			    sk2->sk_state == TCP_LISTEN) {
-				const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
-				if (!sk2_rcv_saddr || !sk_rcv_saddr ||
-				    sk2_rcv_saddr == sk_rcv_saddr)
-					break;
-			}
-		}
-	}
-	return node != NULL;
-}
-
-/* Obtain a reference to a local port for the given sock,
- * if snum is zero it means select any available local port.
- */
 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
 {
-	struct tcp_bind_hashbucket *head;
-	struct hlist_node *node;
-	struct tcp_bind_bucket *tb;
-	int ret;
-
-	local_bh_disable();
-	if (!snum) {
-		int low = sysctl_local_port_range[0];
-		int high = sysctl_local_port_range[1];
-		int remaining = (high - low) + 1;
-		int rover;
-
-		spin_lock(&tcp_portalloc_lock);
-		if (tcp_port_rover < low)
-			rover = low;
-		else
-			rover = tcp_port_rover;
-		do {
-			rover++;
-			if (rover > high)
-				rover = low;
-			head = &tcp_bhash[tcp_bhashfn(rover)];
-			spin_lock(&head->lock);
-			tb_for_each(tb, node, &head->chain)
-				if (tb->port == rover)
-					goto next;
-			break;
-		next:
-			spin_unlock(&head->lock);
-		} while (--remaining > 0);
-		tcp_port_rover = rover;
-		spin_unlock(&tcp_portalloc_lock);
-
-		/* Exhausted local port range during search?  It is not
-		 * possible for us to be holding one of the bind hash
-		 * locks if this test triggers, because if 'remaining'
-		 * drops to zero, we broke out of the do/while loop at
-		 * the top level, not from the 'break;' statement.
-		 */
-		ret = 1;
-		if (unlikely(remaining <= 0))
-			goto fail;
-
-		/* OK, here is the one we will use.  HEAD is
-		 * non-NULL and we hold it's mutex.
-		 */
-		snum = rover;
-	} else {
-		head = &tcp_bhash[tcp_bhashfn(snum)];
-		spin_lock(&head->lock);
-		tb_for_each(tb, node, &head->chain)
-			if (tb->port == snum)
-				goto tb_found;
-	}
-	tb = NULL;
-	goto tb_not_found;
-tb_found:
-	if (!hlist_empty(&tb->owners)) {
-		if (sk->sk_reuse > 1)
-			goto success;
-		if (tb->fastreuse > 0 &&
-		    sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
-			goto success;
-		} else {
-			ret = 1;
-			if (tcp_bind_conflict(sk, tb))
-				goto fail_unlock;
-		}
-	}
-tb_not_found:
-	ret = 1;
-	if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
-		goto fail_unlock;
-	if (hlist_empty(&tb->owners)) {
-		if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
-			tb->fastreuse = 1;
-		else
-			tb->fastreuse = 0;
-	} else if (tb->fastreuse &&
-		   (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
-		tb->fastreuse = 0;
-success:
-	if (!tcp_sk(sk)->bind_hash)
-		tcp_bind_hash(sk, tb, snum);
-	BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
- 	ret = 0;
-
-fail_unlock:
-	spin_unlock(&head->lock);
-fail:
-	local_bh_enable();
-	return ret;
-}
-
-/* Get rid of any references to a local port held by the
- * given sock.
- */
-static void __tcp_put_port(struct sock *sk)
-{
-	struct inet_sock *inet = inet_sk(sk);
-	struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
-	struct tcp_bind_bucket *tb;
-
-	spin_lock(&head->lock);
-	tb = tcp_sk(sk)->bind_hash;
-	__sk_del_bind_node(sk);
-	tcp_sk(sk)->bind_hash = NULL;
-	inet->num = 0;
-	tcp_bucket_destroy(tb);
-	spin_unlock(&head->lock);
-}
-
-void tcp_put_port(struct sock *sk)
-{
-	local_bh_disable();
-	__tcp_put_port(sk);
-	local_bh_enable();
-}
-
-/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
- * Look, when several writers sleep and reader wakes them up, all but one
- * immediately hit write lock and grab all the cpus. Exclusive sleep solves
- * this, _but_ remember, it adds useless work on UP machines (wake up each
- * exclusive lock release). It should be ifdefed really.
- */
-
-void tcp_listen_wlock(void)
-{
-	write_lock(&tcp_lhash_lock);
-
-	if (atomic_read(&tcp_lhash_users)) {
-		DEFINE_WAIT(wait);
-
-		for (;;) {
-			prepare_to_wait_exclusive(&tcp_lhash_wait,
-						&wait, TASK_UNINTERRUPTIBLE);
-			if (!atomic_read(&tcp_lhash_users))
-				break;
-			write_unlock_bh(&tcp_lhash_lock);
-			schedule();
-			write_lock_bh(&tcp_lhash_lock);
-		}
-
-		finish_wait(&tcp_lhash_wait, &wait);
-	}
-}
-
-static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
-{
-	struct hlist_head *list;
-	rwlock_t *lock;
-
-	BUG_TRAP(sk_unhashed(sk));
-	if (listen_possible && sk->sk_state == TCP_LISTEN) {
-		list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
-		lock = &tcp_lhash_lock;
-		tcp_listen_wlock();
-	} else {
-		list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
-		lock = &tcp_ehash[sk->sk_hashent].lock;
-		write_lock(lock);
-	}
-	__sk_add_node(sk, list);
-	sock_prot_inc_use(sk->sk_prot);
-	write_unlock(lock);
-	if (listen_possible && sk->sk_state == TCP_LISTEN)
-		wake_up(&tcp_lhash_wait);
+	return inet_csk_get_port(&tcp_hashinfo, sk, snum);
 }
 
 static void tcp_v4_hash(struct sock *sk)
 {
-	if (sk->sk_state != TCP_CLOSE) {
-		local_bh_disable();
-		__tcp_v4_hash(sk, 1);
-		local_bh_enable();
-	}
+	inet_hash(&tcp_hashinfo, sk);
 }
 
 void tcp_unhash(struct sock *sk)
 {
-	rwlock_t *lock;
-
-	if (sk_unhashed(sk))
-		goto ende;
-
-	if (sk->sk_state == TCP_LISTEN) {
-		local_bh_disable();
-		tcp_listen_wlock();
-		lock = &tcp_lhash_lock;
-	} else {
-		struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
-		lock = &head->lock;
-		write_lock_bh(&head->lock);
-	}
-
-	if (__sk_del_node_init(sk))
-		sock_prot_dec_use(sk->sk_prot);
-	write_unlock_bh(lock);
-
- ende:
-	if (sk->sk_state == TCP_LISTEN)
-		wake_up(&tcp_lhash_wait);
-}
-
-/* Don't inline this cruft.  Here are some nice properties to
- * exploit here.  The BSD API does not allow a listening TCP
- * to specify the remote port nor the remote address for the
- * connection.  So always assume those are both wildcarded
- * during the search since they can never be otherwise.
- */
-static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
-					     unsigned short hnum, int dif)
-{
-	struct sock *result = NULL, *sk;
-	struct hlist_node *node;
-	int score, hiscore;
-
-	hiscore=-1;
-	sk_for_each(sk, node, head) {
-		struct inet_sock *inet = inet_sk(sk);
-
-		if (inet->num == hnum && !ipv6_only_sock(sk)) {
-			__u32 rcv_saddr = inet->rcv_saddr;
-
-			score = (sk->sk_family == PF_INET ? 1 : 0);
-			if (rcv_saddr) {
-				if (rcv_saddr != daddr)
-					continue;
-				score+=2;
-			}
-			if (sk->sk_bound_dev_if) {
-				if (sk->sk_bound_dev_if != dif)
-					continue;
-				score+=2;
-			}
-			if (score == 5)
-				return sk;
-			if (score > hiscore) {
-				hiscore = score;
-				result = sk;
-			}
-		}
-	}
-	return result;
-}
-
-/* Optimize the common listener case. */
-static inline struct sock *tcp_v4_lookup_listener(u32 daddr,
-		unsigned short hnum, int dif)
-{
-	struct sock *sk = NULL;
-	struct hlist_head *head;
-
-	read_lock(&tcp_lhash_lock);
-	head = &tcp_listening_hash[tcp_lhashfn(hnum)];
-	if (!hlist_empty(head)) {
-		struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
-
-		if (inet->num == hnum && !sk->sk_node.next &&
-		    (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
-		    (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
-		    !sk->sk_bound_dev_if)
-			goto sherry_cache;
-		sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
-	}
-	if (sk) {
-sherry_cache:
-		sock_hold(sk);
-	}
-	read_unlock(&tcp_lhash_lock);
-	return sk;
+	inet_unhash(&tcp_hashinfo, sk);
 }
 
-/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
- * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
- *
- * Local BH must be disabled here.
- */
-
-static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
-						       u32 daddr, u16 hnum,
-						       int dif)
-{
-	struct tcp_ehash_bucket *head;
-	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
-	__u32 ports = TCP_COMBINED_PORTS(sport, hnum);
-	struct sock *sk;
-	struct hlist_node *node;
-	/* Optimize here for direct hit, only listening connections can
-	 * have wildcards anyways.
-	 */
-	int hash = tcp_hashfn(daddr, hnum, saddr, sport);
-	head = &tcp_ehash[hash];
-	read_lock(&head->lock);
-	sk_for_each(sk, node, &head->chain) {
-		if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
-			goto hit; /* You sunk my battleship! */
-	}
-
-	/* Must check for a TIME_WAIT'er before going to listener hash. */
-	sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
-		if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
-			goto hit;
-	}
-	sk = NULL;
-out:
-	read_unlock(&head->lock);
-	return sk;
-hit:
-	sock_hold(sk);
-	goto out;
-}
-
-static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
-					   u32 daddr, u16 hnum, int dif)
-{
-	struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
-						      daddr, hnum, dif);
-
-	return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
-}
-
-inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
-				  u16 dport, int dif)
-{
-	struct sock *sk;
-
-	local_bh_disable();
-	sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
-	local_bh_enable();
-
-	return sk;
-}
-
-EXPORT_SYMBOL_GPL(tcp_v4_lookup);
-
 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
 {
 	return secure_tcp_sequence_number(skb->nh.iph->daddr,
@@ -555,27 +122,28 @@ static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
 
 /* called with local bh disabled */
 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
-				      struct tcp_tw_bucket **twp)
+				      struct inet_timewait_sock **twp)
 {
 	struct inet_sock *inet = inet_sk(sk);
 	u32 daddr = inet->rcv_saddr;
 	u32 saddr = inet->daddr;
 	int dif = sk->sk_bound_dev_if;
-	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
-	__u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
-	int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
-	struct tcp_ehash_bucket *head = &tcp_ehash[hash];
+	INET_ADDR_COOKIE(acookie, saddr, daddr)
+	const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
+	const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_hashinfo.ehash_size);
+	struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash];
 	struct sock *sk2;
-	struct hlist_node *node;
-	struct tcp_tw_bucket *tw;
+	const struct hlist_node *node;
+	struct inet_timewait_sock *tw;
 
 	write_lock(&head->lock);
 
 	/* Check TIME-WAIT sockets first. */
-	sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
-		tw = (struct tcp_tw_bucket *)sk2;
+	sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
+		tw = inet_twsk(sk2);
 
-		if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
+		if (INET_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
+			const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
 			struct tcp_sock *tp = tcp_sk(sk);
 
 			/* With PAWS, it is safe from the viewpoint
@@ -592,15 +160,15 @@ static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
 			   fall back to VJ's scheme and use initial
 			   timestamp retrieved from peer table.
 			 */
-			if (tw->tw_ts_recent_stamp &&
+			if (tcptw->tw_ts_recent_stamp &&
 			    (!twp || (sysctl_tcp_tw_reuse &&
 				      xtime.tv_sec -
-				      tw->tw_ts_recent_stamp > 1))) {
-				if ((tp->write_seq =
-						tw->tw_snd_nxt + 65535 + 2) == 0)
+				      tcptw->tw_ts_recent_stamp > 1))) {
+				tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
+				if (tp->write_seq == 0)
 					tp->write_seq = 1;
-				tp->rx_opt.ts_recent	   = tw->tw_ts_recent;
-				tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
+				tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
+				tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 				sock_hold(sk2);
 				goto unique;
 			} else
@@ -611,7 +179,7 @@ static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
 
 	/* And established part... */
 	sk_for_each(sk2, node, &head->chain) {
-		if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
+		if (INET_MATCH(sk2, acookie, saddr, daddr, ports, dif))
 			goto not_unique;
 	}
 
@@ -631,10 +199,10 @@ unique:
 		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
 	} else if (tw) {
 		/* Silly. Should hash-dance instead... */
-		tcp_tw_deschedule(tw);
+		inet_twsk_deschedule(tw, &tcp_death_row);
 		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
 
-		tcp_tw_put(tw);
+		inet_twsk_put(tw);
 	}
 
 	return 0;
@@ -657,9 +225,9 @@ static inline u32 connect_port_offset(const struct sock *sk)
  */
 static inline int tcp_v4_hash_connect(struct sock *sk)
 {
-	unsigned short snum = inet_sk(sk)->num;
- 	struct tcp_bind_hashbucket *head;
- 	struct tcp_bind_bucket *tb;
+	const unsigned short snum = inet_sk(sk)->num;
+ 	struct inet_bind_hashbucket *head;
+ 	struct inet_bind_bucket *tb;
 	int ret;
 
  	if (!snum) {
@@ -671,19 +239,19 @@ static inline int tcp_v4_hash_connect(struct sock *sk)
 		static u32 hint;
 		u32 offset = hint + connect_port_offset(sk);
 		struct hlist_node *node;
- 		struct tcp_tw_bucket *tw = NULL;
+ 		struct inet_timewait_sock *tw = NULL;
 
  		local_bh_disable();
 		for (i = 1; i <= range; i++) {
 			port = low + (i + offset) % range;
- 			head = &tcp_bhash[tcp_bhashfn(port)];
+ 			head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
  			spin_lock(&head->lock);
 
  			/* Does not bother with rcv_saddr checks,
  			 * because the established check is already
  			 * unique enough.
  			 */
-			tb_for_each(tb, node, &head->chain) {
+			inet_bind_bucket_for_each(tb, node, &head->chain) {
  				if (tb->port == port) {
  					BUG_TRAP(!hlist_empty(&tb->owners));
  					if (tb->fastreuse >= 0)
@@ -696,7 +264,7 @@ static inline int tcp_v4_hash_connect(struct sock *sk)
  				}
  			}
 
- 			tb = tcp_bucket_create(head, port);
+ 			tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
  			if (!tb) {
  				spin_unlock(&head->lock);
  				break;
@@ -715,27 +283,27 @@ ok:
 		hint += i;
 
  		/* Head lock still held and bh's disabled */
- 		tcp_bind_hash(sk, tb, port);
+ 		inet_bind_hash(sk, tb, port);
 		if (sk_unhashed(sk)) {
  			inet_sk(sk)->sport = htons(port);
- 			__tcp_v4_hash(sk, 0);
+ 			__inet_hash(&tcp_hashinfo, sk, 0);
  		}
  		spin_unlock(&head->lock);
 
  		if (tw) {
- 			tcp_tw_deschedule(tw);
- 			tcp_tw_put(tw);
+ 			inet_twsk_deschedule(tw, &tcp_death_row);;
+ 			inet_twsk_put(tw);
  		}
 
 		ret = 0;
 		goto out;
  	}
 
- 	head  = &tcp_bhash[tcp_bhashfn(snum)];
- 	tb  = tcp_sk(sk)->bind_hash;
+ 	head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
+ 	tb  = inet_csk(sk)->icsk_bind_hash;
 	spin_lock_bh(&head->lock);
 	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
-		__tcp_v4_hash(sk, 0);
+		__inet_hash(&tcp_hashinfo, sk, 0);
 		spin_unlock_bh(&head->lock);
 		return 0;
 	} else {
@@ -798,7 +366,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 		tp->write_seq		   = 0;
 	}
 
-	if (sysctl_tcp_tw_recycle &&
+	if (tcp_death_row.sysctl_tw_recycle &&
 	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
 		struct inet_peer *peer = rt_get_peer(rt);
 
@@ -837,8 +405,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 		goto failure;
 
 	/* OK, now commit destination to socket.  */
-	__sk_dst_set(sk, &rt->u.dst);
-	tcp_v4_setup_caps(sk, &rt->u.dst);
+	sk_setup_caps(sk, &rt->u.dst);
 
 	if (!tp->write_seq)
 		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
@@ -864,53 +431,6 @@ failure:
 	return err;
 }
 
-static __inline__ int tcp_v4_iif(struct sk_buff *skb)
-{
-	return ((struct rtable *)skb->dst)->rt_iif;
-}
-
-static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
-{
-	return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
-}
-
-static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
-					      struct request_sock ***prevp,
-					      __u16 rport,
-					      __u32 raddr, __u32 laddr)
-{
-	struct listen_sock *lopt = tp->accept_queue.listen_opt;
-	struct request_sock *req, **prev;
-
-	for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
-	     (req = *prev) != NULL;
-	     prev = &req->dl_next) {
-		const struct inet_request_sock *ireq = inet_rsk(req);
-
-		if (ireq->rmt_port == rport &&
-		    ireq->rmt_addr == raddr &&
-		    ireq->loc_addr == laddr &&
-		    TCP_INET_FAMILY(req->rsk_ops->family)) {
-			BUG_TRAP(!req->sk);
-			*prevp = prev;
-			break;
-		}
-	}
-
-	return req;
-}
-
-static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct listen_sock *lopt = tp->accept_queue.listen_opt;
-	u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
-
-	reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
-	tcp_synq_added(sk);
-}
-
-
 /*
  * This routine does path mtu discovery as defined in RFC1191.
  */
@@ -993,14 +513,14 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
 		return;
 	}
 
-	sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
-			   th->source, tcp_v4_iif(skb));
+	sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
+			 th->source, inet_iif(skb));
 	if (!sk) {
 		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
 		return;
 	}
 	if (sk->sk_state == TCP_TIME_WAIT) {
-		tcp_tw_put((struct tcp_tw_bucket *)sk);
+		inet_twsk_put((struct inet_timewait_sock *)sk);
 		return;
 	}
 
@@ -1054,8 +574,8 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
 		if (sock_owned_by_user(sk))
 			goto out;
 
-		req = tcp_v4_search_req(tp, &prev, th->dest,
-					iph->daddr, iph->saddr);
+		req = inet_csk_search_req(sk, &prev, th->dest,
+					  iph->daddr, iph->saddr);
 		if (!req)
 			goto out;
 
@@ -1075,7 +595,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
 		 * created socket, and POSIX does not want network
 		 * errors returned from accept().
 		 */
-		tcp_synq_drop(sk, req, prev);
+		inet_csk_reqsk_queue_drop(sk, req, prev);
 		goto out;
 
 	case TCP_SYN_SENT:
@@ -1245,12 +765,13 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 
 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 {
-	struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
+	struct inet_timewait_sock *tw = inet_twsk(sk);
+	const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 
-	tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
-			tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
+	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
+			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent);
 
-	tcp_tw_put(tw);
+	inet_twsk_put(tw);
 }
 
 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
@@ -1259,36 +780,6 @@ static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
 			req->ts_recent);
 }
 
-static struct dst_entry* tcp_v4_route_req(struct sock *sk,
-					  struct request_sock *req)
-{
-	struct rtable *rt;
-	const struct inet_request_sock *ireq = inet_rsk(req);
-	struct ip_options *opt = inet_rsk(req)->opt;
-	struct flowi fl = { .oif = sk->sk_bound_dev_if,
-			    .nl_u = { .ip4_u =
-				      { .daddr = ((opt && opt->srr) ?
-						  opt->faddr :
-						  ireq->rmt_addr),
-					.saddr = ireq->loc_addr,
-					.tos = RT_CONN_FLAGS(sk) } },
-			    .proto = IPPROTO_TCP,
-			    .uli_u = { .ports =
-				       { .sport = inet_sk(sk)->sport,
-					 .dport = ireq->rmt_port } } };
-
-	if (ip_route_output_flow(&rt, &fl, sk, 0)) {
-		IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
-		return NULL;
-	}
-	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
-		ip_rt_put(rt);
-		IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
-		return NULL;
-	}
-	return &rt->u.dst;
-}
-
 /*
  *	Send a SYN-ACK after having received an ACK.
  *	This still operates on a request_sock only, not on a big
@@ -1302,7 +793,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
 	struct sk_buff * skb;
 
 	/* First, grab a route. */
-	if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
+	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
 		goto out;
 
 	skb = tcp_make_synack(sk, dst, req);
@@ -1404,7 +895,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	 * limitations, they conserve resources and peer is
 	 * evidently real one.
 	 */
-	if (tcp_synq_is_full(sk) && !isn) {
+	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
 #ifdef CONFIG_SYN_COOKIES
 		if (sysctl_tcp_syncookies) {
 			want_cookie = 1;
@@ -1418,7 +909,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	 * clogging syn queue with openreqs with exponentially increasing
 	 * timeout.
 	 */
-	if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
+	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
 		goto drop;
 
 	req = reqsk_alloc(&tcp_request_sock_ops);
@@ -1474,8 +965,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 		 * are made in the function processing timewait state.
 		 */
 		if (tmp_opt.saw_tstamp &&
-		    sysctl_tcp_tw_recycle &&
-		    (dst = tcp_v4_route_req(sk, req)) != NULL &&
+		    tcp_death_row.sysctl_tw_recycle &&
+		    (dst = inet_csk_route_req(sk, req)) != NULL &&
 		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
 		    peer->v4daddr == saddr) {
 			if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
@@ -1488,7 +979,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 		}
 		/* Kill the following clause, if you dislike this way. */
 		else if (!sysctl_tcp_syncookies &&
-			 (sysctl_max_syn_backlog - tcp_synq_len(sk) <
+			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
 			  (sysctl_max_syn_backlog >> 2)) &&
 			 (!peer || !peer->tcp_ts_stamp) &&
 			 (!dst || !dst_metric(dst, RTAX_RTT))) {
@@ -1499,11 +990,10 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 			 * to destinations, already remembered
 			 * to the moment of synflood.
 			 */
-			LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
-					      "request from %u.%u."
-					      "%u.%u/%u\n",
-					      NIPQUAD(saddr),
-					      ntohs(skb->h.th->source)));
+			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
+				       "request from %u.%u.%u.%u/%u\n",
+				       NIPQUAD(saddr),
+				       ntohs(skb->h.th->source));
 			dst_release(dst);
 			goto drop_and_free;
 		}
@@ -1518,7 +1008,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	if (want_cookie) {
 	   	reqsk_free(req);
 	} else {
-		tcp_v4_synq_add(sk, req);
+		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
 	}
 	return 0;
 
@@ -1546,15 +1036,14 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	if (sk_acceptq_is_full(sk))
 		goto exit_overflow;
 
-	if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
+	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
 		goto exit;
 
 	newsk = tcp_create_openreq_child(sk, req, skb);
 	if (!newsk)
 		goto exit;
 
-	newsk->sk_dst_cache = dst;
-	tcp_v4_setup_caps(newsk, dst);
+	sk_setup_caps(newsk, dst);
 
 	newtp		      = tcp_sk(newsk);
 	newinet		      = inet_sk(newsk);
@@ -1564,7 +1053,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	newinet->saddr	      = ireq->loc_addr;
 	newinet->opt	      = ireq->opt;
 	ireq->opt	      = NULL;
-	newinet->mc_index     = tcp_v4_iif(skb);
+	newinet->mc_index     = inet_iif(skb);
 	newinet->mc_ttl	      = skb->nh.iph->ttl;
 	newtp->ext_header_len = 0;
 	if (newinet->opt)
@@ -1575,8 +1064,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
 	tcp_initialize_rcv_mss(newsk);
 
-	__tcp_v4_hash(newsk, 0);
-	__tcp_inherit_port(sk, newsk);
+	__inet_hash(&tcp_hashinfo, newsk, 0);
+	__inet_inherit_port(&tcp_hashinfo, sk, newsk);
 
 	return newsk;
 
@@ -1592,27 +1081,24 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcphdr *th = skb->h.th;
 	struct iphdr *iph = skb->nh.iph;
-	struct tcp_sock *tp = tcp_sk(sk);
 	struct sock *nsk;
 	struct request_sock **prev;
 	/* Find possible connection requests. */
-	struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
-						     iph->saddr, iph->daddr);
+	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
+						       iph->saddr, iph->daddr);
 	if (req)
 		return tcp_check_req(sk, skb, req, prev);
 
-	nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
-					  th->source,
-					  skb->nh.iph->daddr,
-					  ntohs(th->dest),
-					  tcp_v4_iif(skb));
+	nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
+					th->source, skb->nh.iph->daddr,
+					ntohs(th->dest), inet_iif(skb));
 
 	if (nsk) {
 		if (nsk->sk_state != TCP_TIME_WAIT) {
 			bh_lock_sock(nsk);
 			return nsk;
 		}
-		tcp_tw_put((struct tcp_tw_bucket *)nsk);
+		inet_twsk_put((struct inet_timewait_sock *)nsk);
 		return NULL;
 	}
 
@@ -1631,7 +1117,7 @@ static int tcp_v4_checksum_init(struct sk_buff *skb)
 				  skb->nh.iph->daddr, skb->csum))
 			return 0;
 
-		LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
+		LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v4 csum failed\n");
 		skb->ip_summed = CHECKSUM_NONE;
 	}
 	if (skb->len <= 76) {
@@ -1747,9 +1233,9 @@ int tcp_v4_rcv(struct sk_buff *skb)
 	TCP_SKB_CB(skb)->flags	 = skb->nh.iph->tos;
 	TCP_SKB_CB(skb)->sacked	 = 0;
 
-	sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
-			     skb->nh.iph->daddr, ntohs(th->dest),
-			     tcp_v4_iif(skb));
+	sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
+			   skb->nh.iph->daddr, ntohs(th->dest),
+			   inet_iif(skb));
 
 	if (!sk)
 		goto no_tcp_socket;
@@ -1801,24 +1287,26 @@ discard_and_relse:
 
 do_time_wait:
 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
-		tcp_tw_put((struct tcp_tw_bucket *) sk);
+		inet_twsk_put((struct inet_timewait_sock *) sk);
 		goto discard_it;
 	}
 
 	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
 		TCP_INC_STATS_BH(TCP_MIB_INERRS);
-		tcp_tw_put((struct tcp_tw_bucket *) sk);
+		inet_twsk_put((struct inet_timewait_sock *) sk);
 		goto discard_it;
 	}
-	switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
-					   skb, th, skb->len)) {
+	switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk,
+					   skb, th)) {
 	case TCP_TW_SYN: {
-		struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
-							  ntohs(th->dest),
-							  tcp_v4_iif(skb));
+		struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
+							skb->nh.iph->daddr,
+							ntohs(th->dest),
+							inet_iif(skb));
 		if (sk2) {
-			tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
-			tcp_tw_put((struct tcp_tw_bucket *)sk);
+			inet_twsk_deschedule((struct inet_timewait_sock *)sk,
+					     &tcp_death_row);
+			inet_twsk_put((struct inet_timewait_sock *)sk);
 			sk = sk2;
 			goto process;
 		}
@@ -1834,112 +1322,6 @@ do_time_wait:
 	goto discard_it;
 }
 
-/* With per-bucket locks this operation is not-atomic, so that
- * this version is not worse.
- */
-static void __tcp_v4_rehash(struct sock *sk)
-{
-	sk->sk_prot->unhash(sk);
-	sk->sk_prot->hash(sk);
-}
-
-static int tcp_v4_reselect_saddr(struct sock *sk)
-{
-	struct inet_sock *inet = inet_sk(sk);
-	int err;
-	struct rtable *rt;
-	__u32 old_saddr = inet->saddr;
-	__u32 new_saddr;
-	__u32 daddr = inet->daddr;
-
-	if (inet->opt && inet->opt->srr)
-		daddr = inet->opt->faddr;
-
-	/* Query new route. */
-	err = ip_route_connect(&rt, daddr, 0,
-			       RT_CONN_FLAGS(sk),
-			       sk->sk_bound_dev_if,
-			       IPPROTO_TCP,
-			       inet->sport, inet->dport, sk);
-	if (err)
-		return err;
-
-	__sk_dst_set(sk, &rt->u.dst);
-	tcp_v4_setup_caps(sk, &rt->u.dst);
-
-	new_saddr = rt->rt_src;
-
-	if (new_saddr == old_saddr)
-		return 0;
-
-	if (sysctl_ip_dynaddr > 1) {
-		printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
-				 "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
-		       NIPQUAD(old_saddr),
-		       NIPQUAD(new_saddr));
-	}
-
-	inet->saddr = new_saddr;
-	inet->rcv_saddr = new_saddr;
-
-	/* XXX The only one ugly spot where we need to
-	 * XXX really change the sockets identity after
-	 * XXX it has entered the hashes. -DaveM
-	 *
-	 * Besides that, it does not check for connection
-	 * uniqueness. Wait for troubles.
-	 */
-	__tcp_v4_rehash(sk);
-	return 0;
-}
-
-int tcp_v4_rebuild_header(struct sock *sk)
-{
-	struct inet_sock *inet = inet_sk(sk);
-	struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
-	u32 daddr;
-	int err;
-
-	/* Route is OK, nothing to do. */
-	if (rt)
-		return 0;
-
-	/* Reroute. */
-	daddr = inet->daddr;
-	if (inet->opt && inet->opt->srr)
-		daddr = inet->opt->faddr;
-
-	{
-		struct flowi fl = { .oif = sk->sk_bound_dev_if,
-				    .nl_u = { .ip4_u =
-					      { .daddr = daddr,
-						.saddr = inet->saddr,
-						.tos = RT_CONN_FLAGS(sk) } },
-				    .proto = IPPROTO_TCP,
-				    .uli_u = { .ports =
-					       { .sport = inet->sport,
-						 .dport = inet->dport } } };
-						
-		err = ip_route_output_flow(&rt, &fl, sk, 0);
-	}
-	if (!err) {
-		__sk_dst_set(sk, &rt->u.dst);
-		tcp_v4_setup_caps(sk, &rt->u.dst);
-		return 0;
-	}
-
-	/* Routing failed... */
-	sk->sk_route_caps = 0;
-
-	if (!sysctl_ip_dynaddr ||
-	    sk->sk_state != TCP_SYN_SENT ||
-	    (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
-	    (err = tcp_v4_reselect_saddr(sk)) != 0)
-		sk->sk_err_soft = -err;
-
-	return err;
-}
-
 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
 {
 	struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
@@ -1988,18 +1370,18 @@ int tcp_v4_remember_stamp(struct sock *sk)
 	return 0;
 }
 
-int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
+int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
 {
-	struct inet_peer *peer = NULL;
-
-	peer = inet_getpeer(tw->tw_daddr, 1);
+	struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
 
 	if (peer) {
-		if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
+		const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
+
+		if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
 		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
-		     peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
-			peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
-			peer->tcp_ts = tw->tw_ts_recent;
+		     peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
+			peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
+			peer->tcp_ts	   = tcptw->tw_ts_recent;
 		}
 		inet_putpeer(peer);
 		return 1;
@@ -2011,7 +1393,7 @@ int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
 struct tcp_func ipv4_specific = {
 	.queue_xmit	=	ip_queue_xmit,
 	.send_check	=	tcp_v4_send_check,
-	.rebuild_header	=	tcp_v4_rebuild_header,
+	.rebuild_header	=	inet_sk_rebuild_header,
 	.conn_request	=	tcp_v4_conn_request,
 	.syn_recv_sock	=	tcp_v4_syn_recv_sock,
 	.remember_stamp	=	tcp_v4_remember_stamp,
@@ -2027,13 +1409,14 @@ struct tcp_func ipv4_specific = {
  */
 static int tcp_v4_init_sock(struct sock *sk)
 {
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	skb_queue_head_init(&tp->out_of_order_queue);
 	tcp_init_xmit_timers(sk);
 	tcp_prequeue_init(tp);
 
-	tp->rto  = TCP_TIMEOUT_INIT;
+	icsk->icsk_rto = TCP_TIMEOUT_INIT;
 	tp->mdev = TCP_TIMEOUT_INIT;
 
 	/* So many TCP implementations out there (incorrectly) count the
@@ -2051,7 +1434,7 @@ static int tcp_v4_init_sock(struct sock *sk)
 	tp->mss_cache = 536;
 
 	tp->reordering = sysctl_tcp_reordering;
-	tp->ca_ops = &tcp_init_congestion_ops;
+	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
 
 	sk->sk_state = TCP_CLOSE;
 
@@ -2074,7 +1457,7 @@ int tcp_v4_destroy_sock(struct sock *sk)
 
 	tcp_clear_xmit_timers(sk);
 
-	tcp_cleanup_congestion_control(tp);
+	tcp_cleanup_congestion_control(sk);
 
 	/* Cleanup up the write buffer. */
   	sk_stream_writequeue_purge(sk);
@@ -2086,8 +1469,8 @@ int tcp_v4_destroy_sock(struct sock *sk)
 	__skb_queue_purge(&tp->ucopy.prequeue);
 
 	/* Clean up a referenced TCP bind bucket. */
-	if (tp->bind_hash)
-		tcp_put_port(sk);
+	if (inet_csk(sk)->icsk_bind_hash)
+		inet_put_port(&tcp_hashinfo, sk);
 
 	/*
 	 * If sendmsg cached page exists, toss it.
@@ -2107,13 +1490,13 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock);
 #ifdef CONFIG_PROC_FS
 /* Proc filesystem TCP sock list dumping. */
 
-static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
+static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
 {
 	return hlist_empty(head) ? NULL :
-		list_entry(head->first, struct tcp_tw_bucket, tw_node);
+		list_entry(head->first, struct inet_timewait_sock, tw_node);
 }
 
-static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
+static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
 {
 	return tw->tw_node.next ?
 		hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
@@ -2121,14 +1504,14 @@ static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
 
 static void *listening_get_next(struct seq_file *seq, void *cur)
 {
-	struct tcp_sock *tp;
+	struct inet_connection_sock *icsk;
 	struct hlist_node *node;
 	struct sock *sk = cur;
 	struct tcp_iter_state* st = seq->private;
 
 	if (!sk) {
 		st->bucket = 0;
-		sk = sk_head(&tcp_listening_hash[0]);
+		sk = sk_head(&tcp_hashinfo.listening_hash[0]);
 		goto get_sk;
 	}
 
@@ -2137,7 +1520,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
 	if (st->state == TCP_SEQ_STATE_OPENREQ) {
 		struct request_sock *req = cur;
 
-	       	tp = tcp_sk(st->syn_wait_sk);
+	       	icsk = inet_csk(st->syn_wait_sk);
 		req = req->dl_next;
 		while (1) {
 			while (req) {
@@ -2150,17 +1533,17 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
 			if (++st->sbucket >= TCP_SYNQ_HSIZE)
 				break;
 get_req:
-			req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
+			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
 		}
 		sk	  = sk_next(st->syn_wait_sk);
 		st->state = TCP_SEQ_STATE_LISTENING;
-		read_unlock_bh(&tp->accept_queue.syn_wait_lock);
+		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
 	} else {
-	       	tp = tcp_sk(sk);
-		read_lock_bh(&tp->accept_queue.syn_wait_lock);
-		if (reqsk_queue_len(&tp->accept_queue))
+	       	icsk = inet_csk(sk);
+		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+		if (reqsk_queue_len(&icsk->icsk_accept_queue))
 			goto start_req;
-		read_unlock_bh(&tp->accept_queue.syn_wait_lock);
+		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
 		sk = sk_next(sk);
 	}
 get_sk:
@@ -2169,9 +1552,9 @@ get_sk:
 			cur = sk;
 			goto out;
 		}
-	       	tp = tcp_sk(sk);
-		read_lock_bh(&tp->accept_queue.syn_wait_lock);
-		if (reqsk_queue_len(&tp->accept_queue)) {
+	       	icsk = inet_csk(sk);
+		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
+		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
 start_req:
 			st->uid		= sock_i_uid(sk);
 			st->syn_wait_sk = sk;
@@ -2179,10 +1562,10 @@ start_req:
 			st->sbucket	= 0;
 			goto get_req;
 		}
-		read_unlock_bh(&tp->accept_queue.syn_wait_lock);
+		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
 	}
-	if (++st->bucket < TCP_LHTABLE_SIZE) {
-		sk = sk_head(&tcp_listening_hash[st->bucket]);
+	if (++st->bucket < INET_LHTABLE_SIZE) {
+		sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
 		goto get_sk;
 	}
 	cur = NULL;
@@ -2206,16 +1589,16 @@ static void *established_get_first(struct seq_file *seq)
 	struct tcp_iter_state* st = seq->private;
 	void *rc = NULL;
 
-	for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
+	for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
 		struct sock *sk;
 		struct hlist_node *node;
-		struct tcp_tw_bucket *tw;
+		struct inet_timewait_sock *tw;
 
 		/* We can reschedule _before_ having picked the target: */
 		cond_resched_softirq();
 
-		read_lock(&tcp_ehash[st->bucket].lock);
-		sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
+		read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
+		sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
 			if (sk->sk_family != st->family) {
 				continue;
 			}
@@ -2223,15 +1606,15 @@ static void *established_get_first(struct seq_file *seq)
 			goto out;
 		}
 		st->state = TCP_SEQ_STATE_TIME_WAIT;
-		tw_for_each(tw, node,
-			    &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
+		inet_twsk_for_each(tw, node,
+				   &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
 			if (tw->tw_family != st->family) {
 				continue;
 			}
 			rc = tw;
 			goto out;
 		}
-		read_unlock(&tcp_ehash[st->bucket].lock);
+		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
 		st->state = TCP_SEQ_STATE_ESTABLISHED;
 	}
 out:
@@ -2241,7 +1624,7 @@ out:
 static void *established_get_next(struct seq_file *seq, void *cur)
 {
 	struct sock *sk = cur;
-	struct tcp_tw_bucket *tw;
+	struct inet_timewait_sock *tw;
 	struct hlist_node *node;
 	struct tcp_iter_state* st = seq->private;
 
@@ -2258,15 +1641,15 @@ get_tw:
 			cur = tw;
 			goto out;
 		}
-		read_unlock(&tcp_ehash[st->bucket].lock);
+		read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
 		st->state = TCP_SEQ_STATE_ESTABLISHED;
 
 		/* We can reschedule between buckets: */
 		cond_resched_softirq();
 
-		if (++st->bucket < tcp_ehash_size) {
-			read_lock(&tcp_ehash[st->bucket].lock);
-			sk = sk_head(&tcp_ehash[st->bucket].chain);
+		if (++st->bucket < tcp_hashinfo.ehash_size) {
+			read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
+			sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
 		} else {
 			cur = NULL;
 			goto out;
@@ -2280,7 +1663,7 @@ get_tw:
 	}
 
 	st->state = TCP_SEQ_STATE_TIME_WAIT;
-	tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
+	tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
 	goto get_tw;
 found:
 	cur = sk;
@@ -2304,12 +1687,12 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
 	void *rc;
 	struct tcp_iter_state* st = seq->private;
 
-	tcp_listen_lock();
+	inet_listen_lock(&tcp_hashinfo);
 	st->state = TCP_SEQ_STATE_LISTENING;
 	rc	  = listening_get_idx(seq, &pos);
 
 	if (!rc) {
-		tcp_listen_unlock();
+		inet_listen_unlock(&tcp_hashinfo);
 		local_bh_disable();
 		st->state = TCP_SEQ_STATE_ESTABLISHED;
 		rc	  = established_get_idx(seq, pos);
@@ -2342,7 +1725,7 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	case TCP_SEQ_STATE_LISTENING:
 		rc = listening_get_next(seq, v);
 		if (!rc) {
-			tcp_listen_unlock();
+			inet_listen_unlock(&tcp_hashinfo);
 			local_bh_disable();
 			st->state = TCP_SEQ_STATE_ESTABLISHED;
 			rc	  = established_get_first(seq);
@@ -2365,17 +1748,17 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
 	switch (st->state) {
 	case TCP_SEQ_STATE_OPENREQ:
 		if (v) {
-			struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
-			read_unlock_bh(&tp->accept_queue.syn_wait_lock);
+			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
+			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
 		}
 	case TCP_SEQ_STATE_LISTENING:
 		if (v != SEQ_START_TOKEN)
-			tcp_listen_unlock();
+			inet_listen_unlock(&tcp_hashinfo);
 		break;
 	case TCP_SEQ_STATE_TIME_WAIT:
 	case TCP_SEQ_STATE_ESTABLISHED:
 		if (v)
-			read_unlock(&tcp_ehash[st->bucket].lock);
+			read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
 		local_bh_enable();
 		break;
 	}
@@ -2472,18 +1855,19 @@ static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
 	int timer_active;
 	unsigned long timer_expires;
 	struct tcp_sock *tp = tcp_sk(sp);
+	const struct inet_connection_sock *icsk = inet_csk(sp);
 	struct inet_sock *inet = inet_sk(sp);
 	unsigned int dest = inet->daddr;
 	unsigned int src = inet->rcv_saddr;
 	__u16 destp = ntohs(inet->dport);
 	__u16 srcp = ntohs(inet->sport);
 
-	if (tp->pending == TCP_TIME_RETRANS) {
+	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
 		timer_active	= 1;
-		timer_expires	= tp->timeout;
-	} else if (tp->pending == TCP_TIME_PROBE0) {
+		timer_expires	= icsk->icsk_timeout;
+	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
 		timer_active	= 4;
-		timer_expires	= tp->timeout;
+		timer_expires	= icsk->icsk_timeout;
 	} else if (timer_pending(&sp->sk_timer)) {
 		timer_active	= 2;
 		timer_expires	= sp->sk_timer.expires;
@@ -2498,17 +1882,19 @@ static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
 		tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
 		timer_active,
 		jiffies_to_clock_t(timer_expires - jiffies),
-		tp->retransmits,
+		icsk->icsk_retransmits,
 		sock_i_uid(sp),
-		tp->probes_out,
+		icsk->icsk_probes_out,
 		sock_i_ino(sp),
 		atomic_read(&sp->sk_refcnt), sp,
-		tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
+		icsk->icsk_rto,
+		icsk->icsk_ack.ato,
+		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
 		tp->snd_cwnd,
 		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
 }
 
-static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
+static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i)
 {
 	unsigned int dest, src;
 	__u16 destp, srcp;
@@ -2588,7 +1974,7 @@ struct proto tcp_prot = {
 	.close			= tcp_close,
 	.connect		= tcp_v4_connect,
 	.disconnect		= tcp_disconnect,
-	.accept			= tcp_accept,
+	.accept			= inet_csk_accept,
 	.ioctl			= tcp_ioctl,
 	.init			= tcp_v4_init_sock,
 	.destroy		= tcp_v4_destroy_sock,
@@ -2603,6 +1989,7 @@ struct proto tcp_prot = {
 	.get_port		= tcp_v4_get_port,
 	.enter_memory_pressure	= tcp_enter_memory_pressure,
 	.sockets_allocated	= &tcp_sockets_allocated,
+	.orphan_count		= &tcp_orphan_count,
 	.memory_allocated	= &tcp_memory_allocated,
 	.memory_pressure	= &tcp_memory_pressure,
 	.sysctl_mem		= sysctl_tcp_mem,
@@ -2610,6 +1997,7 @@ struct proto tcp_prot = {
 	.sysctl_rmem		= sysctl_tcp_rmem,
 	.max_header		= MAX_TCP_HEADER,
 	.obj_size		= sizeof(struct tcp_sock),
+	.twsk_obj_size		= sizeof(struct tcp_timewait_sock),
 	.rsk_prot		= &tcp_request_sock_ops,
 };
 
@@ -2631,19 +2019,13 @@ void __init tcp_v4_init(struct net_proto_family *ops)
 }
 
 EXPORT_SYMBOL(ipv4_specific);
-EXPORT_SYMBOL(tcp_bind_hash);
-EXPORT_SYMBOL(tcp_bucket_create);
+EXPORT_SYMBOL(inet_bind_bucket_create);
 EXPORT_SYMBOL(tcp_hashinfo);
-EXPORT_SYMBOL(tcp_inherit_port);
-EXPORT_SYMBOL(tcp_listen_wlock);
-EXPORT_SYMBOL(tcp_port_rover);
 EXPORT_SYMBOL(tcp_prot);
-EXPORT_SYMBOL(tcp_put_port);
 EXPORT_SYMBOL(tcp_unhash);
 EXPORT_SYMBOL(tcp_v4_conn_request);
 EXPORT_SYMBOL(tcp_v4_connect);
 EXPORT_SYMBOL(tcp_v4_do_rcv);
-EXPORT_SYMBOL(tcp_v4_rebuild_header);
 EXPORT_SYMBOL(tcp_v4_remember_stamp);
 EXPORT_SYMBOL(tcp_v4_send_check);
 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index f42a284164b..a88db28b0af 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -35,13 +35,27 @@
 #define SYNC_INIT 1
 #endif
 
-int sysctl_tcp_tw_recycle;
-int sysctl_tcp_max_tw_buckets = NR_FILE*2;
-
 int sysctl_tcp_syncookies = SYNC_INIT; 
 int sysctl_tcp_abort_on_overflow;
 
-static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo);
+struct inet_timewait_death_row tcp_death_row = {
+	.sysctl_max_tw_buckets = NR_FILE * 2,
+	.period		= TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
+	.death_lock	= SPIN_LOCK_UNLOCKED,
+	.hashinfo	= &tcp_hashinfo,
+	.tw_timer	= TIMER_INITIALIZER(inet_twdr_hangman, 0,
+					    (unsigned long)&tcp_death_row),
+	.twkill_work	= __WORK_INITIALIZER(tcp_death_row.twkill_work,
+					     inet_twdr_twkill_work,
+					     &tcp_death_row),
+/* Short-time timewait calendar */
+
+	.twcal_hand	= -1,
+	.twcal_timer	= TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
+					    (unsigned long)&tcp_death_row),
+};
+
+EXPORT_SYMBOL_GPL(tcp_death_row);
 
 static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
 {
@@ -52,47 +66,6 @@ static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
 	return (seq == e_win && seq == end_seq);
 }
 
-/* New-style handling of TIME_WAIT sockets. */
-
-int tcp_tw_count;
-
-
-/* Must be called with locally disabled BHs. */
-static void tcp_timewait_kill(struct tcp_tw_bucket *tw)
-{
-	struct tcp_ehash_bucket *ehead;
-	struct tcp_bind_hashbucket *bhead;
-	struct tcp_bind_bucket *tb;
-
-	/* Unlink from established hashes. */
-	ehead = &tcp_ehash[tw->tw_hashent];
-	write_lock(&ehead->lock);
-	if (hlist_unhashed(&tw->tw_node)) {
-		write_unlock(&ehead->lock);
-		return;
-	}
-	__hlist_del(&tw->tw_node);
-	sk_node_init(&tw->tw_node);
-	write_unlock(&ehead->lock);
-
-	/* Disassociate with bind bucket. */
-	bhead = &tcp_bhash[tcp_bhashfn(tw->tw_num)];
-	spin_lock(&bhead->lock);
-	tb = tw->tw_tb;
-	__hlist_del(&tw->tw_bind_node);
-	tw->tw_tb = NULL;
-	tcp_bucket_destroy(tb);
-	spin_unlock(&bhead->lock);
-
-#ifdef INET_REFCNT_DEBUG
-	if (atomic_read(&tw->tw_refcnt) != 1) {
-		printk(KERN_DEBUG "tw_bucket %p refcnt=%d\n", tw,
-		       atomic_read(&tw->tw_refcnt));
-	}
-#endif
-	tcp_tw_put(tw);
-}
-
 /* 
  * * Main purpose of TIME-WAIT state is to close connection gracefully,
  *   when one of ends sits in LAST-ACK or CLOSING retransmitting FIN
@@ -122,19 +95,20 @@ static void tcp_timewait_kill(struct tcp_tw_bucket *tw)
  * to avoid misread sequence numbers, states etc.  --ANK
  */
 enum tcp_tw_status
-tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
-			   struct tcphdr *th, unsigned len)
+tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
+			   const struct tcphdr *th)
 {
+	struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
 	struct tcp_options_received tmp_opt;
 	int paws_reject = 0;
 
 	tmp_opt.saw_tstamp = 0;
-	if (th->doff > (sizeof(struct tcphdr) >> 2) && tw->tw_ts_recent_stamp) {
+	if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
 		tcp_parse_options(skb, &tmp_opt, 0);
 
 		if (tmp_opt.saw_tstamp) {
-			tmp_opt.ts_recent	   = tw->tw_ts_recent;
-			tmp_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
+			tmp_opt.ts_recent	= tcptw->tw_ts_recent;
+			tmp_opt.ts_recent_stamp	= tcptw->tw_ts_recent_stamp;
 			paws_reject = tcp_paws_check(&tmp_opt, th->rst);
 		}
 	}
@@ -145,20 +119,20 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
 		/* Out of window, send ACK */
 		if (paws_reject ||
 		    !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
-				   tw->tw_rcv_nxt,
-				   tw->tw_rcv_nxt + tw->tw_rcv_wnd))
+				   tcptw->tw_rcv_nxt,
+				   tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))
 			return TCP_TW_ACK;
 
 		if (th->rst)
 			goto kill;
 
-		if (th->syn && !before(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt))
+		if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt))
 			goto kill_with_rst;
 
 		/* Dup ACK? */
-		if (!after(TCP_SKB_CB(skb)->end_seq, tw->tw_rcv_nxt) ||
+		if (!after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
 		    TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
-			tcp_tw_put(tw);
+			inet_twsk_put(tw);
 			return TCP_TW_SUCCESS;
 		}
 
@@ -166,19 +140,19 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
 		 * reset.
 		 */
 		if (!th->fin ||
-		    TCP_SKB_CB(skb)->end_seq != tw->tw_rcv_nxt + 1) {
+		    TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
 kill_with_rst:
-			tcp_tw_deschedule(tw);
-			tcp_tw_put(tw);
+			inet_twsk_deschedule(tw, &tcp_death_row);
+			inet_twsk_put(tw);
 			return TCP_TW_RST;
 		}
 
 		/* FIN arrived, enter true time-wait state. */
-		tw->tw_substate	= TCP_TIME_WAIT;
-		tw->tw_rcv_nxt	= TCP_SKB_CB(skb)->end_seq;
+		tw->tw_substate	  = TCP_TIME_WAIT;
+		tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
 		if (tmp_opt.saw_tstamp) {
-			tw->tw_ts_recent_stamp	= xtime.tv_sec;
-			tw->tw_ts_recent	= tmp_opt.rcv_tsval;
+			tcptw->tw_ts_recent_stamp = xtime.tv_sec;
+			tcptw->tw_ts_recent	  = tmp_opt.rcv_tsval;
 		}
 
 		/* I am shamed, but failed to make it more elegant.
@@ -187,11 +161,13 @@ kill_with_rst:
 		 * do not undertsnad recycling in any case, it not
 		 * a big problem in practice. --ANK */
 		if (tw->tw_family == AF_INET &&
-		    sysctl_tcp_tw_recycle && tw->tw_ts_recent_stamp &&
+		    tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp &&
 		    tcp_v4_tw_remember_stamp(tw))
-			tcp_tw_schedule(tw, tw->tw_timeout);
+			inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout,
+					   TCP_TIMEWAIT_LEN);
 		else
-			tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
+			inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
+					   TCP_TIMEWAIT_LEN);
 		return TCP_TW_ACK;
 	}
 
@@ -213,7 +189,7 @@ kill_with_rst:
 	 */
 
 	if (!paws_reject &&
-	    (TCP_SKB_CB(skb)->seq == tw->tw_rcv_nxt &&
+	    (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
 	     (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
 		/* In window segment, it may be only reset or bare ack. */
 
@@ -224,19 +200,20 @@ kill_with_rst:
 			 */
 			if (sysctl_tcp_rfc1337 == 0) {
 kill:
-				tcp_tw_deschedule(tw);
-				tcp_tw_put(tw);
+				inet_twsk_deschedule(tw, &tcp_death_row);
+				inet_twsk_put(tw);
 				return TCP_TW_SUCCESS;
 			}
 		}
-		tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
+		inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
+				   TCP_TIMEWAIT_LEN);
 
 		if (tmp_opt.saw_tstamp) {
-			tw->tw_ts_recent	= tmp_opt.rcv_tsval;
-			tw->tw_ts_recent_stamp	= xtime.tv_sec;
+			tcptw->tw_ts_recent	  = tmp_opt.rcv_tsval;
+			tcptw->tw_ts_recent_stamp = xtime.tv_sec;
 		}
 
-		tcp_tw_put(tw);
+		inet_twsk_put(tw);
 		return TCP_TW_SUCCESS;
 	}
 
@@ -258,9 +235,10 @@ kill:
 	 */
 
 	if (th->syn && !th->rst && !th->ack && !paws_reject &&
-	    (after(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt) ||
-	     (tmp_opt.saw_tstamp && (s32)(tw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
-		u32 isn = tw->tw_snd_nxt + 65535 + 2;
+	    (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) ||
+	     (tmp_opt.saw_tstamp &&
+	      (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
+		u32 isn = tcptw->tw_snd_nxt + 65535 + 2;
 		if (isn == 0)
 			isn++;
 		TCP_SKB_CB(skb)->when = isn;
@@ -278,107 +256,57 @@ kill:
 		 * Do not reschedule in the last case.
 		 */
 		if (paws_reject || th->ack)
-			tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
+			inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
+					   TCP_TIMEWAIT_LEN);
 
 		/* Send ACK. Note, we do not put the bucket,
 		 * it will be released by caller.
 		 */
 		return TCP_TW_ACK;
 	}
-	tcp_tw_put(tw);
+	inet_twsk_put(tw);
 	return TCP_TW_SUCCESS;
 }
 
-/* Enter the time wait state.  This is called with locally disabled BH.
- * Essentially we whip up a timewait bucket, copy the
- * relevant info into it from the SK, and mess with hash chains
- * and list linkage.
- */
-static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
-{
-	struct tcp_ehash_bucket *ehead = &tcp_ehash[sk->sk_hashent];
-	struct tcp_bind_hashbucket *bhead;
-
-	/* Step 1: Put TW into bind hash. Original socket stays there too.
-	   Note, that any socket with inet_sk(sk)->num != 0 MUST be bound in
-	   binding cache, even if it is closed.
-	 */
-	bhead = &tcp_bhash[tcp_bhashfn(inet_sk(sk)->num)];
-	spin_lock(&bhead->lock);
-	tw->tw_tb = tcp_sk(sk)->bind_hash;
-	BUG_TRAP(tcp_sk(sk)->bind_hash);
-	tw_add_bind_node(tw, &tw->tw_tb->owners);
-	spin_unlock(&bhead->lock);
-
-	write_lock(&ehead->lock);
-
-	/* Step 2: Remove SK from established hash. */
-	if (__sk_del_node_init(sk))
-		sock_prot_dec_use(sk->sk_prot);
-
-	/* Step 3: Hash TW into TIMEWAIT half of established hash table. */
-	tw_add_node(tw, &(ehead + tcp_ehash_size)->chain);
-	atomic_inc(&tw->tw_refcnt);
-
-	write_unlock(&ehead->lock);
-}
-
 /* 
  * Move a socket to time-wait or dead fin-wait-2 state.
  */ 
 void tcp_time_wait(struct sock *sk, int state, int timeo)
 {
-	struct tcp_tw_bucket *tw = NULL;
-	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_timewait_sock *tw = NULL;
+	const struct tcp_sock *tp = tcp_sk(sk);
 	int recycle_ok = 0;
 
-	if (sysctl_tcp_tw_recycle && tp->rx_opt.ts_recent_stamp)
+	if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
 		recycle_ok = tp->af_specific->remember_stamp(sk);
 
-	if (tcp_tw_count < sysctl_tcp_max_tw_buckets)
-		tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC);
-
-	if(tw != NULL) {
-		struct inet_sock *inet = inet_sk(sk);
-		int rto = (tp->rto<<2) - (tp->rto>>1);
-
-		/* Give us an identity. */
-		tw->tw_daddr		= inet->daddr;
-		tw->tw_rcv_saddr	= inet->rcv_saddr;
-		tw->tw_bound_dev_if	= sk->sk_bound_dev_if;
-		tw->tw_num		= inet->num;
-		tw->tw_state		= TCP_TIME_WAIT;
-		tw->tw_substate		= state;
-		tw->tw_sport		= inet->sport;
-		tw->tw_dport		= inet->dport;
-		tw->tw_family		= sk->sk_family;
-		tw->tw_reuse		= sk->sk_reuse;
-		tw->tw_rcv_wscale	= tp->rx_opt.rcv_wscale;
-		atomic_set(&tw->tw_refcnt, 1);
+	if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
+		tw = inet_twsk_alloc(sk, state);
 
-		tw->tw_hashent		= sk->sk_hashent;
-		tw->tw_rcv_nxt		= tp->rcv_nxt;
-		tw->tw_snd_nxt		= tp->snd_nxt;
-		tw->tw_rcv_wnd		= tcp_receive_window(tp);
-		tw->tw_ts_recent	= tp->rx_opt.ts_recent;
-		tw->tw_ts_recent_stamp	= tp->rx_opt.ts_recent_stamp;
-		tw_dead_node_init(tw);
+	if (tw != NULL) {
+		struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
+		const struct inet_connection_sock *icsk = inet_csk(sk);
+		const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
+
+		tw->tw_rcv_wscale	= tp->rx_opt.rcv_wscale;
+		tcptw->tw_rcv_nxt	= tp->rcv_nxt;
+		tcptw->tw_snd_nxt	= tp->snd_nxt;
+		tcptw->tw_rcv_wnd	= tcp_receive_window(tp);
+		tcptw->tw_ts_recent	= tp->rx_opt.ts_recent;
+		tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
 
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 		if (tw->tw_family == PF_INET6) {
 			struct ipv6_pinfo *np = inet6_sk(sk);
+			struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw);
 
-			ipv6_addr_copy(&tw->tw_v6_daddr, &np->daddr);
-			ipv6_addr_copy(&tw->tw_v6_rcv_saddr, &np->rcv_saddr);
-			tw->tw_v6_ipv6only = np->ipv6only;
-		} else {
-			memset(&tw->tw_v6_daddr, 0, sizeof(tw->tw_v6_daddr));
-			memset(&tw->tw_v6_rcv_saddr, 0, sizeof(tw->tw_v6_rcv_saddr));
-			tw->tw_v6_ipv6only = 0;
+			ipv6_addr_copy(&tcp6tw->tw_v6_daddr, &np->daddr);
+			ipv6_addr_copy(&tcp6tw->tw_v6_rcv_saddr, &np->rcv_saddr);
+			tw->tw_ipv6only = np->ipv6only;
 		}
 #endif
 		/* Linkage updates. */
-		__tcp_tw_hashdance(sk, tw);
+		__inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
 
 		/* Get the TIME_WAIT timeout firing. */
 		if (timeo < rto)
@@ -392,8 +320,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 				timeo = TCP_TIMEWAIT_LEN;
 		}
 
-		tcp_tw_schedule(tw, timeo);
-		tcp_tw_put(tw);
+		inet_twsk_schedule(tw, &tcp_death_row, timeo,
+				   TCP_TIMEWAIT_LEN);
+		inet_twsk_put(tw);
 	} else {
 		/* Sorry, if we're out of memory, just CLOSE this
 		 * socket up.  We've got bigger problems than
@@ -407,277 +336,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 	tcp_done(sk);
 }
 
-/* Kill off TIME_WAIT sockets once their lifetime has expired. */
-static int tcp_tw_death_row_slot;
-
-static void tcp_twkill(unsigned long);
-
-/* TIME_WAIT reaping mechanism. */
-#define TCP_TWKILL_SLOTS	8	/* Please keep this a power of 2. */
-#define TCP_TWKILL_PERIOD	(TCP_TIMEWAIT_LEN/TCP_TWKILL_SLOTS)
-
-#define TCP_TWKILL_QUOTA	100
-
-static struct hlist_head tcp_tw_death_row[TCP_TWKILL_SLOTS];
-static DEFINE_SPINLOCK(tw_death_lock);
-static struct timer_list tcp_tw_timer = TIMER_INITIALIZER(tcp_twkill, 0, 0);
-static void twkill_work(void *);
-static DECLARE_WORK(tcp_twkill_work, twkill_work, NULL);
-static u32 twkill_thread_slots;
-
-/* Returns non-zero if quota exceeded.  */
-static int tcp_do_twkill_work(int slot, unsigned int quota)
-{
-	struct tcp_tw_bucket *tw;
-	struct hlist_node *node;
-	unsigned int killed;
-	int ret;
-
-	/* NOTE: compare this to previous version where lock
-	 * was released after detaching chain. It was racy,
-	 * because tw buckets are scheduled in not serialized context
-	 * in 2.3 (with netfilter), and with softnet it is common, because
-	 * soft irqs are not sequenced.
-	 */
-	killed = 0;
-	ret = 0;
-rescan:
-	tw_for_each_inmate(tw, node, &tcp_tw_death_row[slot]) {
-		__tw_del_dead_node(tw);
-		spin_unlock(&tw_death_lock);
-		tcp_timewait_kill(tw);
-		tcp_tw_put(tw);
-		killed++;
-		spin_lock(&tw_death_lock);
-		if (killed > quota) {
-			ret = 1;
-			break;
-		}
-
-		/* While we dropped tw_death_lock, another cpu may have
-		 * killed off the next TW bucket in the list, therefore
-		 * do a fresh re-read of the hlist head node with the
-		 * lock reacquired.  We still use the hlist traversal
-		 * macro in order to get the prefetches.
-		 */
-		goto rescan;
-	}
-
-	tcp_tw_count -= killed;
-	NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed);
-
-	return ret;
-}
-
-static void tcp_twkill(unsigned long dummy)
-{
-	int need_timer, ret;
-
-	spin_lock(&tw_death_lock);
-
-	if (tcp_tw_count == 0)
-		goto out;
-
-	need_timer = 0;
-	ret = tcp_do_twkill_work(tcp_tw_death_row_slot, TCP_TWKILL_QUOTA);
-	if (ret) {
-		twkill_thread_slots |= (1 << tcp_tw_death_row_slot);
-		mb();
-		schedule_work(&tcp_twkill_work);
-		need_timer = 1;
-	} else {
-		/* We purged the entire slot, anything left?  */
-		if (tcp_tw_count)
-			need_timer = 1;
-	}
-	tcp_tw_death_row_slot =
-		((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1));
-	if (need_timer)
-		mod_timer(&tcp_tw_timer, jiffies + TCP_TWKILL_PERIOD);
-out:
-	spin_unlock(&tw_death_lock);
-}
-
-extern void twkill_slots_invalid(void);
-
-static void twkill_work(void *dummy)
-{
-	int i;
-
-	if ((TCP_TWKILL_SLOTS - 1) > (sizeof(twkill_thread_slots) * 8))
-		twkill_slots_invalid();
-
-	while (twkill_thread_slots) {
-		spin_lock_bh(&tw_death_lock);
-		for (i = 0; i < TCP_TWKILL_SLOTS; i++) {
-			if (!(twkill_thread_slots & (1 << i)))
-				continue;
-
-			while (tcp_do_twkill_work(i, TCP_TWKILL_QUOTA) != 0) {
-				if (need_resched()) {
-					spin_unlock_bh(&tw_death_lock);
-					schedule();
-					spin_lock_bh(&tw_death_lock);
-				}
-			}
-
-			twkill_thread_slots &= ~(1 << i);
-		}
-		spin_unlock_bh(&tw_death_lock);
-	}
-}
-
-/* These are always called from BH context.  See callers in
- * tcp_input.c to verify this.
- */
-
-/* This is for handling early-kills of TIME_WAIT sockets. */
-void tcp_tw_deschedule(struct tcp_tw_bucket *tw)
-{
-	spin_lock(&tw_death_lock);
-	if (tw_del_dead_node(tw)) {
-		tcp_tw_put(tw);
-		if (--tcp_tw_count == 0)
-			del_timer(&tcp_tw_timer);
-	}
-	spin_unlock(&tw_death_lock);
-	tcp_timewait_kill(tw);
-}
-
-/* Short-time timewait calendar */
-
-static int tcp_twcal_hand = -1;
-static int tcp_twcal_jiffie;
-static void tcp_twcal_tick(unsigned long);
-static struct timer_list tcp_twcal_timer =
-		TIMER_INITIALIZER(tcp_twcal_tick, 0, 0);
-static struct hlist_head tcp_twcal_row[TCP_TW_RECYCLE_SLOTS];
-
-static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo)
-{
-	struct hlist_head *list;
-	int slot;
-
-	/* timeout := RTO * 3.5
-	 *
-	 * 3.5 = 1+2+0.5 to wait for two retransmits.
-	 *
-	 * RATIONALE: if FIN arrived and we entered TIME-WAIT state,
-	 * our ACK acking that FIN can be lost. If N subsequent retransmitted
-	 * FINs (or previous seqments) are lost (probability of such event
-	 * is p^(N+1), where p is probability to lose single packet and
-	 * time to detect the loss is about RTO*(2^N - 1) with exponential
-	 * backoff). Normal timewait length is calculated so, that we
-	 * waited at least for one retransmitted FIN (maximal RTO is 120sec).
-	 * [ BTW Linux. following BSD, violates this requirement waiting
-	 *   only for 60sec, we should wait at least for 240 secs.
-	 *   Well, 240 consumes too much of resources 8)
-	 * ]
-	 * This interval is not reduced to catch old duplicate and
-	 * responces to our wandering segments living for two MSLs.
-	 * However, if we use PAWS to detect
-	 * old duplicates, we can reduce the interval to bounds required
-	 * by RTO, rather than MSL. So, if peer understands PAWS, we
-	 * kill tw bucket after 3.5*RTO (it is important that this number
-	 * is greater than TS tick!) and detect old duplicates with help
-	 * of PAWS.
-	 */
-	slot = (timeo + (1<<TCP_TW_RECYCLE_TICK) - 1) >> TCP_TW_RECYCLE_TICK;
-
-	spin_lock(&tw_death_lock);
-
-	/* Unlink it, if it was scheduled */
-	if (tw_del_dead_node(tw))
-		tcp_tw_count--;
-	else
-		atomic_inc(&tw->tw_refcnt);
-
-	if (slot >= TCP_TW_RECYCLE_SLOTS) {
-		/* Schedule to slow timer */
-		if (timeo >= TCP_TIMEWAIT_LEN) {
-			slot = TCP_TWKILL_SLOTS-1;
-		} else {
-			slot = (timeo + TCP_TWKILL_PERIOD-1) / TCP_TWKILL_PERIOD;
-			if (slot >= TCP_TWKILL_SLOTS)
-				slot = TCP_TWKILL_SLOTS-1;
-		}
-		tw->tw_ttd = jiffies + timeo;
-		slot = (tcp_tw_death_row_slot + slot) & (TCP_TWKILL_SLOTS - 1);
-		list = &tcp_tw_death_row[slot];
-	} else {
-		tw->tw_ttd = jiffies + (slot << TCP_TW_RECYCLE_TICK);
-
-		if (tcp_twcal_hand < 0) {
-			tcp_twcal_hand = 0;
-			tcp_twcal_jiffie = jiffies;
-			tcp_twcal_timer.expires = tcp_twcal_jiffie + (slot<<TCP_TW_RECYCLE_TICK);
-			add_timer(&tcp_twcal_timer);
-		} else {
-			if (time_after(tcp_twcal_timer.expires, jiffies + (slot<<TCP_TW_RECYCLE_TICK)))
-				mod_timer(&tcp_twcal_timer, jiffies + (slot<<TCP_TW_RECYCLE_TICK));
-			slot = (tcp_twcal_hand + slot)&(TCP_TW_RECYCLE_SLOTS-1);
-		}
-		list = &tcp_twcal_row[slot];
-	}
-
-	hlist_add_head(&tw->tw_death_node, list);
-
-	if (tcp_tw_count++ == 0)
-		mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD);
-	spin_unlock(&tw_death_lock);
-}
-
-void tcp_twcal_tick(unsigned long dummy)
-{
-	int n, slot;
-	unsigned long j;
-	unsigned long now = jiffies;
-	int killed = 0;
-	int adv = 0;
-
-	spin_lock(&tw_death_lock);
-	if (tcp_twcal_hand < 0)
-		goto out;
-
-	slot = tcp_twcal_hand;
-	j = tcp_twcal_jiffie;
-
-	for (n=0; n<TCP_TW_RECYCLE_SLOTS; n++) {
-		if (time_before_eq(j, now)) {
-			struct hlist_node *node, *safe;
-			struct tcp_tw_bucket *tw;
-
-			tw_for_each_inmate_safe(tw, node, safe,
-					   &tcp_twcal_row[slot]) {
-				__tw_del_dead_node(tw);
-				tcp_timewait_kill(tw);
-				tcp_tw_put(tw);
-				killed++;
-			}
-		} else {
-			if (!adv) {
-				adv = 1;
-				tcp_twcal_jiffie = j;
-				tcp_twcal_hand = slot;
-			}
-
-			if (!hlist_empty(&tcp_twcal_row[slot])) {
-				mod_timer(&tcp_twcal_timer, j);
-				goto out;
-			}
-		}
-		j += (1<<TCP_TW_RECYCLE_TICK);
-		slot = (slot+1)&(TCP_TW_RECYCLE_SLOTS-1);
-	}
-	tcp_twcal_hand = -1;
-
-out:
-	if ((tcp_tw_count -= killed) == 0)
-		del_timer(&tcp_tw_timer);
-	NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed);
-	spin_unlock(&tw_death_lock);
-}
-
 /* This is not only more efficient than what we used to do, it eliminates
  * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
  *
@@ -686,75 +344,27 @@ out:
  */
 struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
 {
-	/* allocate the newsk from the same slab of the master sock,
-	 * if not, at sk_free time we'll try to free it from the wrong
-	 * slabcache (i.e. is it TCPv4 or v6?), this is handled thru sk->sk_prot -acme */
-	struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, sk->sk_prot, 0);
+	struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC);
 
-	if(newsk != NULL) {
-		struct inet_request_sock *ireq = inet_rsk(req);
+	if (newsk != NULL) {
+		const struct inet_request_sock *ireq = inet_rsk(req);
 		struct tcp_request_sock *treq = tcp_rsk(req);
+		struct inet_connection_sock *newicsk = inet_csk(sk);
 		struct tcp_sock *newtp;
-		struct sk_filter *filter;
-
-		memcpy(newsk, sk, sizeof(struct tcp_sock));
-		newsk->sk_state = TCP_SYN_RECV;
-
-		/* SANITY */
-		sk_node_init(&newsk->sk_node);
-		tcp_sk(newsk)->bind_hash = NULL;
-
-		/* Clone the TCP header template */
-		inet_sk(newsk)->dport = ireq->rmt_port;
-
-		sock_lock_init(newsk);
-		bh_lock_sock(newsk);
-
-		rwlock_init(&newsk->sk_dst_lock);
-		atomic_set(&newsk->sk_rmem_alloc, 0);
-		skb_queue_head_init(&newsk->sk_receive_queue);
-		atomic_set(&newsk->sk_wmem_alloc, 0);
-		skb_queue_head_init(&newsk->sk_write_queue);
-		atomic_set(&newsk->sk_omem_alloc, 0);
-		newsk->sk_wmem_queued = 0;
-		newsk->sk_forward_alloc = 0;
-
-		sock_reset_flag(newsk, SOCK_DONE);
-		newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
-		newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
-		newsk->sk_send_head = NULL;
-		rwlock_init(&newsk->sk_callback_lock);
-		skb_queue_head_init(&newsk->sk_error_queue);
-		newsk->sk_write_space = sk_stream_write_space;
-
-		if ((filter = newsk->sk_filter) != NULL)
-			sk_filter_charge(newsk, filter);
-
-		if (unlikely(xfrm_sk_clone_policy(newsk))) {
-			/* It is still raw copy of parent, so invalidate
-			 * destructor and make plain sk_free() */
-			newsk->sk_destruct = NULL;
-			sk_free(newsk);
-			return NULL;
-		}
 
 		/* Now setup tcp_sock */
 		newtp = tcp_sk(newsk);
 		newtp->pred_flags = 0;
 		newtp->rcv_nxt = treq->rcv_isn + 1;
-		newtp->snd_nxt = treq->snt_isn + 1;
-		newtp->snd_una = treq->snt_isn + 1;
-		newtp->snd_sml = treq->snt_isn + 1;
+		newtp->snd_nxt = newtp->snd_una = newtp->snd_sml = treq->snt_isn + 1;
 
 		tcp_prequeue_init(newtp);
 
 		tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn);
 
-		newtp->retransmits = 0;
-		newtp->backoff = 0;
 		newtp->srtt = 0;
 		newtp->mdev = TCP_TIMEOUT_INIT;
-		newtp->rto = TCP_TIMEOUT_INIT;
+		newicsk->icsk_rto = TCP_TIMEOUT_INIT;
 
 		newtp->packets_out = 0;
 		newtp->left_out = 0;
@@ -774,9 +384,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		newtp->frto_counter = 0;
 		newtp->frto_highmark = 0;
 
-		newtp->ca_ops = &tcp_reno;
+		newicsk->icsk_ca_ops = &tcp_reno;
 
-		tcp_set_ca_state(newtp, TCP_CA_Open);
+		tcp_set_ca_state(newsk, TCP_CA_Open);
 		tcp_init_xmit_timers(newsk);
 		skb_queue_head_init(&newtp->out_of_order_queue);
 		newtp->rcv_wup = treq->rcv_isn + 1;
@@ -789,26 +399,12 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		newtp->rx_opt.dsack = 0;
 		newtp->rx_opt.eff_sacks = 0;
 
-		newtp->probes_out = 0;
 		newtp->rx_opt.num_sacks = 0;
 		newtp->urg_data = 0;
-		/* Deinitialize accept_queue to trap illegal accesses. */
-		memset(&newtp->accept_queue, 0, sizeof(newtp->accept_queue));
-
-		/* Back to base struct sock members. */
-		newsk->sk_err = 0;
-		newsk->sk_priority = 0;
-		atomic_set(&newsk->sk_refcnt, 2);
-#ifdef INET_REFCNT_DEBUG
-		atomic_inc(&inet_sock_nr);
-#endif
-		atomic_inc(&tcp_sockets_allocated);
 
 		if (sock_flag(newsk, SOCK_KEEPOPEN))
-			tcp_reset_keepalive_timer(newsk,
-						  keepalive_time_when(newtp));
-		newsk->sk_socket = NULL;
-		newsk->sk_sleep = NULL;
+			inet_csk_reset_keepalive_timer(newsk,
+						       keepalive_time_when(newtp));
 
 		newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
 		if((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) {
@@ -838,7 +434,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 			newtp->tcp_header_len = sizeof(struct tcphdr);
 		}
 		if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len)
-			newtp->ack.last_seg_size = skb->len-newtp->tcp_header_len;
+			newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
 		newtp->rx_opt.mss_clamp = req->mss;
 		TCP_ECN_openreq_child(newtp, req);
 		if (newtp->ecn_flags&TCP_ECN_OK)
@@ -934,9 +530,10 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
 	   does sequence test, SYN is truncated, and thus we consider
 	   it a bare ACK.
 
-	   If tp->defer_accept, we silently drop this bare ACK.  Otherwise,
-	   we create an established connection.  Both ends (listening sockets)
-	   accept the new incoming connection and try to talk to each other. 8-)
+	   If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this
+	   bare ACK.  Otherwise, we create an established connection.  Both
+	   ends (listening sockets) accept the new incoming connection and try
+	   to talk to each other. 8-)
 
 	   Note: This case is both harmless, and rare.  Possibility is about the
 	   same as us discovering intelligent life on another plant tomorrow.
@@ -1003,7 +600,8 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
 			return NULL;
 
 		/* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
-		if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
+		if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
+		    TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
 			inet_rsk(req)->acked = 1;
 			return NULL;
 		}
@@ -1018,10 +616,10 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
 		if (child == NULL)
 			goto listen_overflow;
 
-		tcp_synq_unlink(tp, req, prev);
-		tcp_synq_removed(sk, req);
+		inet_csk_reqsk_queue_unlink(sk, req, prev);
+		inet_csk_reqsk_queue_removed(sk, req);
 
-		tcp_acceptq_queue(sk, req, child);
+		inet_csk_reqsk_queue_add(sk, req, child);
 		return child;
 
 	listen_overflow:
@@ -1035,7 +633,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
 		if (!(flg & TCP_FLAG_RST))
 			req->rsk_ops->send_reset(skb);
 
-		tcp_synq_drop(sk, req, prev);
+		inet_csk_reqsk_queue_drop(sk, req, prev);
 		return NULL;
 }
 
@@ -1074,4 +672,3 @@ EXPORT_SYMBOL(tcp_check_req);
 EXPORT_SYMBOL(tcp_child_process);
 EXPORT_SYMBOL(tcp_create_openreq_child);
 EXPORT_SYMBOL(tcp_timewait_state_process);
-EXPORT_SYMBOL(tcp_tw_deschedule);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index dd30dd137b7..75b68116682 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -105,18 +105,19 @@ static __u16 tcp_advertise_mss(struct sock *sk)
 
 /* RFC2861. Reset CWND after idle period longer RTO to "restart window".
  * This is the first part of cwnd validation mechanism. */
-static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst)
+static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
 	s32 delta = tcp_time_stamp - tp->lsndtime;
 	u32 restart_cwnd = tcp_init_cwnd(tp, dst);
 	u32 cwnd = tp->snd_cwnd;
 
-	tcp_ca_event(tp, CA_EVENT_CWND_RESTART);
+	tcp_ca_event(sk, CA_EVENT_CWND_RESTART);
 
-	tp->snd_ssthresh = tcp_current_ssthresh(tp);
+	tp->snd_ssthresh = tcp_current_ssthresh(sk);
 	restart_cwnd = min(restart_cwnd, cwnd);
 
-	while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd)
+	while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
 		cwnd >>= 1;
 	tp->snd_cwnd = max(cwnd, restart_cwnd);
 	tp->snd_cwnd_stamp = tcp_time_stamp;
@@ -126,26 +127,25 @@ static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst)
 static inline void tcp_event_data_sent(struct tcp_sock *tp,
 				       struct sk_buff *skb, struct sock *sk)
 {
-	u32 now = tcp_time_stamp;
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	const u32 now = tcp_time_stamp;
 
-	if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto)
-		tcp_cwnd_restart(tp, __sk_dst_get(sk));
+	if (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto)
+		tcp_cwnd_restart(sk, __sk_dst_get(sk));
 
 	tp->lsndtime = now;
 
 	/* If it is a reply for ato after last received
 	 * packet, enter pingpong mode.
 	 */
-	if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato)
-		tp->ack.pingpong = 1;
+	if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
+		icsk->icsk_ack.pingpong = 1;
 }
 
 static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	tcp_dec_quickack_mode(tp, pkts);
-	tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
+	tcp_dec_quickack_mode(sk, pkts);
+	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
 }
 
 /* Determine a window scaling and initial window to offer.
@@ -265,6 +265,7 @@ static __inline__ u16 tcp_select_window(struct sock *sk)
 static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
 {
 	if (skb != NULL) {
+		const struct inet_connection_sock *icsk = inet_csk(sk);
 		struct inet_sock *inet = inet_sk(sk);
 		struct tcp_sock *tp = tcp_sk(sk);
 		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
@@ -280,8 +281,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
 #define SYSCTL_FLAG_SACK	0x4
 
 		/* If congestion control is doing timestamping */
-		if (tp->ca_ops->rtt_sample)
-			do_gettimeofday(&skb->stamp);
+		if (icsk->icsk_ca_ops->rtt_sample)
+			__net_timestamp(skb);
 
 		sysctl_flags = 0;
 		if (tcb->flags & TCPCB_FLAG_SYN) {
@@ -308,7 +309,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
 		}
 		
 		if (tcp_packets_in_flight(tp) == 0)
-			tcp_ca_event(tp, CA_EVENT_TX_START);
+			tcp_ca_event(sk, CA_EVENT_TX_START);
 
 		th = (struct tcphdr *) skb_push(skb, tcp_header_size);
 		skb->h.th = th;
@@ -366,7 +367,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
 		if (err <= 0)
 			return err;
 
-		tcp_enter_cwr(tp);
+		tcp_enter_cwr(sk);
 
 		/* NET_XMIT_CN is special. It does not guarantee,
 		 * that this packet is lost. It tells that device
@@ -482,7 +483,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned
 	 * skbs, which it never sent before. --ANK
 	 */
 	TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
-	buff->stamp = skb->stamp;
+	buff->tstamp = skb->tstamp;
 
 	if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
 		tp->lost_out -= tcp_skb_pcount(skb);
@@ -505,7 +506,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned
 
 	/* Link BUFF into the send queue. */
 	skb_header_release(buff);
-	__skb_append(skb, buff);
+	__skb_append(skb, buff, &sk->sk_write_queue);
 
 	return 0;
 }
@@ -696,7 +697,7 @@ static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
 		if (tp->packets_out > tp->snd_cwnd_used)
 			tp->snd_cwnd_used = tp->packets_out;
 
-		if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto)
+		if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto)
 			tcp_cwnd_application_limited(sk);
 	}
 }
@@ -893,7 +894,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 
 	/* Link BUFF into the send queue. */
 	skb_header_release(buff);
-	__skb_append(skb, buff);
+	__skb_append(skb, buff, &sk->sk_write_queue);
 
 	return 0;
 }
@@ -905,12 +906,13 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
  */
 static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
 {
+	const struct inet_connection_sock *icsk = inet_csk(sk);
 	u32 send_win, cong_win, limit, in_flight;
 
 	if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
 		return 0;
 
-	if (tp->ca_state != TCP_CA_Open)
+	if (icsk->icsk_ca_state != TCP_CA_Open)
 		return 0;
 
 	in_flight = tcp_packets_in_flight(tp);
@@ -1147,6 +1149,7 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now)
  */
 u32 __tcp_select_window(struct sock *sk)
 {
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	/* MSS for the peer's data.  Previous verions used mss_clamp
 	 * here.  I don't know if the value based on our guesses
@@ -1154,7 +1157,7 @@ u32 __tcp_select_window(struct sock *sk)
 	 * but may be worse for the performance because of rcv_mss
 	 * fluctuations.  --SAW  1998/11/1
 	 */
-	int mss = tp->ack.rcv_mss;
+	int mss = icsk->icsk_ack.rcv_mss;
 	int free_space = tcp_space(sk);
 	int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
 	int window;
@@ -1163,7 +1166,7 @@ u32 __tcp_select_window(struct sock *sk)
 		mss = full_space; 
 
 	if (free_space < full_space/2) {
-		tp->ack.quick = 0;
+		icsk->icsk_ack.quick = 0;
 
 		if (tcp_memory_pressure)
 			tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss);
@@ -1238,7 +1241,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
 		       tcp_skb_pcount(next_skb) != 1);
 
 		/* Ok.  We will be able to collapse the packet. */
-		__skb_unlink(next_skb, next_skb->list);
+		__skb_unlink(next_skb, &sk->sk_write_queue);
 
 		memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
 
@@ -1286,6 +1289,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
  */ 
 void tcp_simple_retransmit(struct sock *sk)
 {
+	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
 	unsigned int mss = tcp_current_mss(sk, 0);
@@ -1316,12 +1320,12 @@ void tcp_simple_retransmit(struct sock *sk)
 	 * in network, but units changed and effective
 	 * cwnd/ssthresh really reduced now.
 	 */
-	if (tp->ca_state != TCP_CA_Loss) {
+	if (icsk->icsk_ca_state != TCP_CA_Loss) {
 		tp->high_seq = tp->snd_nxt;
-		tp->snd_ssthresh = tcp_current_ssthresh(tp);
+		tp->snd_ssthresh = tcp_current_ssthresh(sk);
 		tp->prior_ssthresh = 0;
 		tp->undo_marker = 0;
-		tcp_set_ca_state(tp, TCP_CA_Loss);
+		tcp_set_ca_state(sk, TCP_CA_Loss);
 	}
 	tcp_xmit_retransmit_queue(sk);
 }
@@ -1461,6 +1465,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
  */
 void tcp_xmit_retransmit_queue(struct sock *sk)
 {
+	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
 	int packet_cnt = tp->lost_out;
@@ -1484,14 +1489,16 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 				if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
 					if (tcp_retransmit_skb(sk, skb))
 						return;
-					if (tp->ca_state != TCP_CA_Loss)
+					if (icsk->icsk_ca_state != TCP_CA_Loss)
 						NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS);
 					else
 						NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS);
 
 					if (skb ==
 					    skb_peek(&sk->sk_write_queue))
-						tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+						inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+									  inet_csk(sk)->icsk_rto,
+									  TCP_RTO_MAX);
 				}
 
 				packet_cnt -= tcp_skb_pcount(skb);
@@ -1504,7 +1511,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 	/* OK, demanded retransmission is finished. */
 
 	/* Forward retransmissions are possible only during Recovery. */
-	if (tp->ca_state != TCP_CA_Recovery)
+	if (icsk->icsk_ca_state != TCP_CA_Recovery)
 		return;
 
 	/* No forward retransmissions in Reno are possible. */
@@ -1544,7 +1551,9 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 			break;
 
 		if (skb == skb_peek(&sk->sk_write_queue))
-			tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+						  inet_csk(sk)->icsk_rto,
+						  TCP_RTO_MAX);
 
 		NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS);
 	}
@@ -1573,7 +1582,7 @@ void tcp_send_fin(struct sock *sk)
 	} else {
 		/* Socket is locked, keep trying until memory is available. */
 		for (;;) {
-			skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL);
+			skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_KERNEL);
 			if (skb)
 				break;
 			yield();
@@ -1780,8 +1789,8 @@ static inline void tcp_connect_init(struct sock *sk)
 	tp->rcv_wup = 0;
 	tp->copied_seq = 0;
 
-	tp->rto = TCP_TIMEOUT_INIT;
-	tp->retransmits = 0;
+	inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
+	inet_csk(sk)->icsk_retransmits = 0;
 	tcp_clear_retrans(tp);
 }
 
@@ -1795,7 +1804,7 @@ int tcp_connect(struct sock *sk)
 
 	tcp_connect_init(sk);
 
-	buff = alloc_skb(MAX_TCP_HEADER + 15, sk->sk_allocation);
+	buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
 	if (unlikely(buff == NULL))
 		return -ENOBUFS;
 
@@ -1824,7 +1833,8 @@ int tcp_connect(struct sock *sk)
 	TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);
 
 	/* Timer for repeating the SYN until an answer. */
-	tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+				  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
 	return 0;
 }
 
@@ -1834,20 +1844,21 @@ int tcp_connect(struct sock *sk)
  */
 void tcp_send_delayed_ack(struct sock *sk)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
-	int ato = tp->ack.ato;
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	int ato = icsk->icsk_ack.ato;
 	unsigned long timeout;
 
 	if (ato > TCP_DELACK_MIN) {
+		const struct tcp_sock *tp = tcp_sk(sk);
 		int max_ato = HZ/2;
 
-		if (tp->ack.pingpong || (tp->ack.pending&TCP_ACK_PUSHED))
+		if (icsk->icsk_ack.pingpong || (icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
 			max_ato = TCP_DELACK_MAX;
 
 		/* Slow path, intersegment interval is "high". */
 
 		/* If some rtt estimate is known, use it to bound delayed ack.
-		 * Do not use tp->rto here, use results of rtt measurements
+		 * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
 		 * directly.
 		 */
 		if (tp->srtt) {
@@ -1864,21 +1875,22 @@ void tcp_send_delayed_ack(struct sock *sk)
 	timeout = jiffies + ato;
 
 	/* Use new timeout only if there wasn't a older one earlier. */
-	if (tp->ack.pending&TCP_ACK_TIMER) {
+	if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
 		/* If delack timer was blocked or is about to expire,
 		 * send ACK now.
 		 */
-		if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) {
+		if (icsk->icsk_ack.blocked ||
+		    time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
 			tcp_send_ack(sk);
 			return;
 		}
 
-		if (!time_before(timeout, tp->ack.timeout))
-			timeout = tp->ack.timeout;
+		if (!time_before(timeout, icsk->icsk_ack.timeout))
+			timeout = icsk->icsk_ack.timeout;
 	}
-	tp->ack.pending |= TCP_ACK_SCHED|TCP_ACK_TIMER;
-	tp->ack.timeout = timeout;
-	sk_reset_timer(sk, &tp->delack_timer, timeout);
+	icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
+	icsk->icsk_ack.timeout = timeout;
+	sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
 }
 
 /* This routine sends an ack and also updates the window. */
@@ -1895,9 +1907,10 @@ void tcp_send_ack(struct sock *sk)
 		 */
 		buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
 		if (buff == NULL) {
-			tcp_schedule_ack(tp);
-			tp->ack.ato = TCP_ATO_MIN;
-			tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
+			inet_csk_schedule_ack(sk);
+			inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
+			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+						  TCP_DELACK_MAX, TCP_RTO_MAX);
 			return;
 		}
 
@@ -2011,6 +2024,7 @@ int tcp_write_wakeup(struct sock *sk)
  */
 void tcp_send_probe0(struct sock *sk)
 {
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	int err;
 
@@ -2018,28 +2032,31 @@ void tcp_send_probe0(struct sock *sk)
 
 	if (tp->packets_out || !sk->sk_send_head) {
 		/* Cancel probe timer, if it is not required. */
-		tp->probes_out = 0;
-		tp->backoff = 0;
+		icsk->icsk_probes_out = 0;
+		icsk->icsk_backoff = 0;
 		return;
 	}
 
 	if (err <= 0) {
-		if (tp->backoff < sysctl_tcp_retries2)
-			tp->backoff++;
-		tp->probes_out++;
-		tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, 
-				      min(tp->rto << tp->backoff, TCP_RTO_MAX));
+		if (icsk->icsk_backoff < sysctl_tcp_retries2)
+			icsk->icsk_backoff++;
+		icsk->icsk_probes_out++;
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, 
+					  min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
+					  TCP_RTO_MAX);
 	} else {
 		/* If packet was not sent due to local congestion,
-		 * do not backoff and do not remember probes_out.
+		 * do not backoff and do not remember icsk_probes_out.
 		 * Let local senders to fight for local resources.
 		 *
 		 * Use accumulated backoff yet.
 		 */
-		if (!tp->probes_out)
-			tp->probes_out=1;
-		tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, 
-				      min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL));
+		if (!icsk->icsk_probes_out)
+			icsk->icsk_probes_out = 1;
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, 
+					  min(icsk->icsk_rto << icsk->icsk_backoff,
+					      TCP_RESOURCE_PROBE_INTERVAL),
+					  TCP_RTO_MAX);
 	}
 }
 
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 70e108e15c7..327770bf552 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -16,9 +16,10 @@
 #define TCP_SCALABLE_AI_CNT	50U
 #define TCP_SCALABLE_MD_SCALE	3
 
-static void tcp_scalable_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
+static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
 				    u32 in_flight, int flag)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
 	if (in_flight < tp->snd_cwnd)
 		return;
 
@@ -35,8 +36,9 @@ static void tcp_scalable_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
-static u32 tcp_scalable_ssthresh(struct tcp_sock *tp)
+static u32 tcp_scalable_ssthresh(struct sock *sk)
 {
+	const struct tcp_sock *tp = tcp_sk(sk);
 	return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U);
 }
 
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 0084227438c..415ee47ac1c 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -36,49 +36,13 @@ static void tcp_write_timer(unsigned long);
 static void tcp_delack_timer(unsigned long);
 static void tcp_keepalive_timer (unsigned long data);
 
-#ifdef TCP_DEBUG
-const char tcp_timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n";
-EXPORT_SYMBOL(tcp_timer_bug_msg);
-#endif
-
-/*
- * Using different timers for retransmit, delayed acks and probes
- * We may wish use just one timer maintaining a list of expire jiffies 
- * to optimize.
- */
-
 void tcp_init_xmit_timers(struct sock *sk)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	init_timer(&tp->retransmit_timer);
-	tp->retransmit_timer.function=&tcp_write_timer;
-	tp->retransmit_timer.data = (unsigned long) sk;
-	tp->pending = 0;
-
-	init_timer(&tp->delack_timer);
-	tp->delack_timer.function=&tcp_delack_timer;
-	tp->delack_timer.data = (unsigned long) sk;
-	tp->ack.pending = 0;
-
-	init_timer(&sk->sk_timer);
-	sk->sk_timer.function	= &tcp_keepalive_timer;
-	sk->sk_timer.data	= (unsigned long)sk;
+	inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
+				  &tcp_keepalive_timer);
 }
 
-void tcp_clear_xmit_timers(struct sock *sk)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	tp->pending = 0;
-	sk_stop_timer(sk, &tp->retransmit_timer);
-
-	tp->ack.pending = 0;
-	tp->ack.blocked = 0;
-	sk_stop_timer(sk, &tp->delack_timer);
-
-	sk_stop_timer(sk, &sk->sk_timer);
-}
+EXPORT_SYMBOL(tcp_init_xmit_timers);
 
 static void tcp_write_err(struct sock *sk)
 {
@@ -155,15 +119,15 @@ static int tcp_orphan_retries(struct sock *sk, int alive)
 /* A write timeout has occurred. Process the after effects. */
 static int tcp_write_timeout(struct sock *sk)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
 	int retry_until;
 
 	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
-		if (tp->retransmits)
+		if (icsk->icsk_retransmits)
 			dst_negative_advice(&sk->sk_dst_cache);
-		retry_until = tp->syn_retries ? : sysctl_tcp_syn_retries;
+		retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
 	} else {
-		if (tp->retransmits >= sysctl_tcp_retries1) {
+		if (icsk->icsk_retransmits >= sysctl_tcp_retries1) {
 			/* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black
 			   hole detection. :-(
 
@@ -189,16 +153,16 @@ static int tcp_write_timeout(struct sock *sk)
 
 		retry_until = sysctl_tcp_retries2;
 		if (sock_flag(sk, SOCK_DEAD)) {
-			int alive = (tp->rto < TCP_RTO_MAX);
+			const int alive = (icsk->icsk_rto < TCP_RTO_MAX);
  
 			retry_until = tcp_orphan_retries(sk, alive);
 
-			if (tcp_out_of_resources(sk, alive || tp->retransmits < retry_until))
+			if (tcp_out_of_resources(sk, alive || icsk->icsk_retransmits < retry_until))
 				return 1;
 		}
 	}
 
-	if (tp->retransmits >= retry_until) {
+	if (icsk->icsk_retransmits >= retry_until) {
 		/* Has it gone just too far? */
 		tcp_write_err(sk);
 		return 1;
@@ -210,26 +174,27 @@ static void tcp_delack_timer(unsigned long data)
 {
 	struct sock *sk = (struct sock*)data;
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
 
 	bh_lock_sock(sk);
 	if (sock_owned_by_user(sk)) {
 		/* Try again later. */
-		tp->ack.blocked = 1;
+		icsk->icsk_ack.blocked = 1;
 		NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED);
-		sk_reset_timer(sk, &tp->delack_timer, jiffies + TCP_DELACK_MIN);
+		sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN);
 		goto out_unlock;
 	}
 
 	sk_stream_mem_reclaim(sk);
 
-	if (sk->sk_state == TCP_CLOSE || !(tp->ack.pending & TCP_ACK_TIMER))
+	if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
 		goto out;
 
-	if (time_after(tp->ack.timeout, jiffies)) {
-		sk_reset_timer(sk, &tp->delack_timer, tp->ack.timeout);
+	if (time_after(icsk->icsk_ack.timeout, jiffies)) {
+		sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout);
 		goto out;
 	}
-	tp->ack.pending &= ~TCP_ACK_TIMER;
+	icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
 
 	if (!skb_queue_empty(&tp->ucopy.prequeue)) {
 		struct sk_buff *skb;
@@ -242,16 +207,16 @@ static void tcp_delack_timer(unsigned long data)
 		tp->ucopy.memory = 0;
 	}
 
-	if (tcp_ack_scheduled(tp)) {
-		if (!tp->ack.pingpong) {
+	if (inet_csk_ack_scheduled(sk)) {
+		if (!icsk->icsk_ack.pingpong) {
 			/* Delayed ACK missed: inflate ATO. */
-			tp->ack.ato = min(tp->ack.ato << 1, tp->rto);
+			icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto);
 		} else {
 			/* Delayed ACK missed: leave pingpong mode and
 			 * deflate ATO.
 			 */
-			tp->ack.pingpong = 0;
-			tp->ack.ato = TCP_ATO_MIN;
+			icsk->icsk_ack.pingpong = 0;
+			icsk->icsk_ack.ato      = TCP_ATO_MIN;
 		}
 		tcp_send_ack(sk);
 		NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS);
@@ -268,11 +233,12 @@ out_unlock:
 
 static void tcp_probe_timer(struct sock *sk)
 {
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	int max_probes;
 
 	if (tp->packets_out || !sk->sk_send_head) {
-		tp->probes_out = 0;
+		icsk->icsk_probes_out = 0;
 		return;
 	}
 
@@ -283,7 +249,7 @@ static void tcp_probe_timer(struct sock *sk)
 	 * FIXME: We ought not to do it, Solaris 2.5 actually has fixing
 	 * this behaviour in Solaris down as a bug fix. [AC]
 	 *
-	 * Let me to explain. probes_out is zeroed by incoming ACKs
+	 * Let me to explain. icsk_probes_out is zeroed by incoming ACKs
 	 * even if they advertise zero window. Hence, connection is killed only
 	 * if we received no ACKs for normal connection timeout. It is not killed
 	 * only because window stays zero for some time, window may be zero
@@ -294,15 +260,15 @@ static void tcp_probe_timer(struct sock *sk)
 	max_probes = sysctl_tcp_retries2;
 
 	if (sock_flag(sk, SOCK_DEAD)) {
-		int alive = ((tp->rto<<tp->backoff) < TCP_RTO_MAX);
+		const int alive = ((icsk->icsk_rto << icsk->icsk_backoff) < TCP_RTO_MAX);
  
 		max_probes = tcp_orphan_retries(sk, alive);
 
-		if (tcp_out_of_resources(sk, alive || tp->probes_out <= max_probes))
+		if (tcp_out_of_resources(sk, alive || icsk->icsk_probes_out <= max_probes))
 			return;
 	}
 
-	if (tp->probes_out > max_probes) {
+	if (icsk->icsk_probes_out > max_probes) {
 		tcp_write_err(sk);
 	} else {
 		/* Only send another probe if we didn't close things up. */
@@ -317,6 +283,7 @@ static void tcp_probe_timer(struct sock *sk)
 static void tcp_retransmit_timer(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
 
 	if (!tp->packets_out)
 		goto out;
@@ -351,20 +318,21 @@ static void tcp_retransmit_timer(struct sock *sk)
 	if (tcp_write_timeout(sk))
 		goto out;
 
-	if (tp->retransmits == 0) {
-		if (tp->ca_state == TCP_CA_Disorder || tp->ca_state == TCP_CA_Recovery) {
+	if (icsk->icsk_retransmits == 0) {
+		if (icsk->icsk_ca_state == TCP_CA_Disorder ||
+		    icsk->icsk_ca_state == TCP_CA_Recovery) {
 			if (tp->rx_opt.sack_ok) {
-				if (tp->ca_state == TCP_CA_Recovery)
+				if (icsk->icsk_ca_state == TCP_CA_Recovery)
 					NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL);
 				else
 					NET_INC_STATS_BH(LINUX_MIB_TCPSACKFAILURES);
 			} else {
-				if (tp->ca_state == TCP_CA_Recovery)
+				if (icsk->icsk_ca_state == TCP_CA_Recovery)
 					NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERYFAIL);
 				else
 					NET_INC_STATS_BH(LINUX_MIB_TCPRENOFAILURES);
 			}
-		} else if (tp->ca_state == TCP_CA_Loss) {
+		} else if (icsk->icsk_ca_state == TCP_CA_Loss) {
 			NET_INC_STATS_BH(LINUX_MIB_TCPLOSSFAILURES);
 		} else {
 			NET_INC_STATS_BH(LINUX_MIB_TCPTIMEOUTS);
@@ -381,10 +349,11 @@ static void tcp_retransmit_timer(struct sock *sk)
 		/* Retransmission failed because of local congestion,
 		 * do not backoff.
 		 */
-		if (!tp->retransmits)
-			tp->retransmits=1;
-		tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS,
-				     min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL));
+		if (!icsk->icsk_retransmits)
+			icsk->icsk_retransmits = 1;
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+					  min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL),
+					  TCP_RTO_MAX);
 		goto out;
 	}
 
@@ -403,13 +372,13 @@ static void tcp_retransmit_timer(struct sock *sk)
 	 * implemented ftp to mars will work nicely. We will have to fix
 	 * the 120 second clamps though!
 	 */
-	tp->backoff++;
-	tp->retransmits++;
+	icsk->icsk_backoff++;
+	icsk->icsk_retransmits++;
 
 out_reset_timer:
-	tp->rto = min(tp->rto << 1, TCP_RTO_MAX);
-	tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
-	if (tp->retransmits > sysctl_tcp_retries1)
+	icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
+	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
+	if (icsk->icsk_retransmits > sysctl_tcp_retries1)
 		__sk_dst_reset(sk);
 
 out:;
@@ -418,32 +387,32 @@ out:;
 static void tcp_write_timer(unsigned long data)
 {
 	struct sock *sk = (struct sock*)data;
-	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	int event;
 
 	bh_lock_sock(sk);
 	if (sock_owned_by_user(sk)) {
 		/* Try again later */
-		sk_reset_timer(sk, &tp->retransmit_timer, jiffies + (HZ / 20));
+		sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20));
 		goto out_unlock;
 	}
 
-	if (sk->sk_state == TCP_CLOSE || !tp->pending)
+	if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending)
 		goto out;
 
-	if (time_after(tp->timeout, jiffies)) {
-		sk_reset_timer(sk, &tp->retransmit_timer, tp->timeout);
+	if (time_after(icsk->icsk_timeout, jiffies)) {
+		sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
 		goto out;
 	}
 
-	event = tp->pending;
-	tp->pending = 0;
+	event = icsk->icsk_pending;
+	icsk->icsk_pending = 0;
 
 	switch (event) {
-	case TCP_TIME_RETRANS:
+	case ICSK_TIME_RETRANS:
 		tcp_retransmit_timer(sk);
 		break;
-	case TCP_TIME_PROBE0:
+	case ICSK_TIME_PROBE0:
 		tcp_probe_timer(sk);
 		break;
 	}
@@ -462,96 +431,8 @@ out_unlock:
 
 static void tcp_synack_timer(struct sock *sk)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct listen_sock *lopt = tp->accept_queue.listen_opt;
-	int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries;
-	int thresh = max_retries;
-	unsigned long now = jiffies;
-	struct request_sock **reqp, *req;
-	int i, budget;
-
-	if (lopt == NULL || lopt->qlen == 0)
-		return;
-
-	/* Normally all the openreqs are young and become mature
-	 * (i.e. converted to established socket) for first timeout.
-	 * If synack was not acknowledged for 3 seconds, it means
-	 * one of the following things: synack was lost, ack was lost,
-	 * rtt is high or nobody planned to ack (i.e. synflood).
-	 * When server is a bit loaded, queue is populated with old
-	 * open requests, reducing effective size of queue.
-	 * When server is well loaded, queue size reduces to zero
-	 * after several minutes of work. It is not synflood,
-	 * it is normal operation. The solution is pruning
-	 * too old entries overriding normal timeout, when
-	 * situation becomes dangerous.
-	 *
-	 * Essentially, we reserve half of room for young
-	 * embrions; and abort old ones without pity, if old
-	 * ones are about to clog our table.
-	 */
-	if (lopt->qlen>>(lopt->max_qlen_log-1)) {
-		int young = (lopt->qlen_young<<1);
-
-		while (thresh > 2) {
-			if (lopt->qlen < young)
-				break;
-			thresh--;
-			young <<= 1;
-		}
-	}
-
-	if (tp->defer_accept)
-		max_retries = tp->defer_accept;
-
-	budget = 2*(TCP_SYNQ_HSIZE/(TCP_TIMEOUT_INIT/TCP_SYNQ_INTERVAL));
-	i = lopt->clock_hand;
-
-	do {
-		reqp=&lopt->syn_table[i];
-		while ((req = *reqp) != NULL) {
-			if (time_after_eq(now, req->expires)) {
-				if ((req->retrans < thresh ||
-				     (inet_rsk(req)->acked && req->retrans < max_retries))
-				    && !req->rsk_ops->rtx_syn_ack(sk, req, NULL)) {
-					unsigned long timeo;
-
-					if (req->retrans++ == 0)
-						lopt->qlen_young--;
-					timeo = min((TCP_TIMEOUT_INIT << req->retrans),
-						    TCP_RTO_MAX);
-					req->expires = now + timeo;
-					reqp = &req->dl_next;
-					continue;
-				}
-
-				/* Drop this request */
-				tcp_synq_unlink(tp, req, reqp);
-				reqsk_queue_removed(&tp->accept_queue, req);
-				reqsk_free(req);
-				continue;
-			}
-			reqp = &req->dl_next;
-		}
-
-		i = (i+1)&(TCP_SYNQ_HSIZE-1);
-
-	} while (--budget > 0);
-
-	lopt->clock_hand = i;
-
-	if (lopt->qlen)
-		tcp_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL);
-}
-
-void tcp_delete_keepalive_timer (struct sock *sk)
-{
-	sk_stop_timer(sk, &sk->sk_timer);
-}
-
-void tcp_reset_keepalive_timer (struct sock *sk, unsigned long len)
-{
-	sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
+	inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL,
+				   TCP_TIMEOUT_INIT, TCP_RTO_MAX);
 }
 
 void tcp_set_keepalive(struct sock *sk, int val)
@@ -560,15 +441,16 @@ void tcp_set_keepalive(struct sock *sk, int val)
 		return;
 
 	if (val && !sock_flag(sk, SOCK_KEEPOPEN))
-		tcp_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
+		inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
 	else if (!val)
-		tcp_delete_keepalive_timer(sk);
+		inet_csk_delete_keepalive_timer(sk);
 }
 
 
 static void tcp_keepalive_timer (unsigned long data)
 {
 	struct sock *sk = (struct sock *) data;
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	__u32 elapsed;
 
@@ -576,7 +458,7 @@ static void tcp_keepalive_timer (unsigned long data)
 	bh_lock_sock(sk);
 	if (sock_owned_by_user(sk)) {
 		/* Try again later. */ 
-		tcp_reset_keepalive_timer (sk, HZ/20);
+		inet_csk_reset_keepalive_timer (sk, HZ/20);
 		goto out;
 	}
 
@@ -587,7 +469,7 @@ static void tcp_keepalive_timer (unsigned long data)
 
 	if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
 		if (tp->linger2 >= 0) {
-			int tmo = tcp_fin_time(tp) - TCP_TIMEWAIT_LEN;
+			const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
 
 			if (tmo > 0) {
 				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
@@ -610,14 +492,14 @@ static void tcp_keepalive_timer (unsigned long data)
 	elapsed = tcp_time_stamp - tp->rcv_tstamp;
 
 	if (elapsed >= keepalive_time_when(tp)) {
-		if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) ||
-		     (tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) {
+		if ((!tp->keepalive_probes && icsk->icsk_probes_out >= sysctl_tcp_keepalive_probes) ||
+		     (tp->keepalive_probes && icsk->icsk_probes_out >= tp->keepalive_probes)) {
 			tcp_send_active_reset(sk, GFP_ATOMIC);
 			tcp_write_err(sk);
 			goto out;
 		}
 		if (tcp_write_wakeup(sk) <= 0) {
-			tp->probes_out++;
+			icsk->icsk_probes_out++;
 			elapsed = keepalive_intvl_when(tp);
 		} else {
 			/* If keepalive was lost due to local congestion,
@@ -634,7 +516,7 @@ static void tcp_keepalive_timer (unsigned long data)
 	sk_stream_mem_reclaim(sk);
 
 resched:
-	tcp_reset_keepalive_timer (sk, elapsed);
+	inet_csk_reset_keepalive_timer (sk, elapsed);
 	goto out;
 
 death:	
@@ -644,8 +526,3 @@ out:
 	bh_unlock_sock(sk);
 	sock_put(sk);
 }
-
-EXPORT_SYMBOL(tcp_clear_xmit_timers);
-EXPORT_SYMBOL(tcp_delete_keepalive_timer);
-EXPORT_SYMBOL(tcp_init_xmit_timers);
-EXPORT_SYMBOL(tcp_reset_keepalive_timer);
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 9bd443db519..93c5f92070f 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -35,7 +35,7 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/skbuff.h>
-#include <linux/tcp_diag.h>
+#include <linux/inet_diag.h>
 
 #include <net/tcp.h>
 
@@ -82,9 +82,10 @@ struct vegas {
  * Instead we must wait until the completion of an RTT during
  * which we actually receive ACKs.
  */
-static inline void vegas_enable(struct tcp_sock *tp)
+static inline void vegas_enable(struct sock *sk)
 {
-	struct vegas *vegas = tcp_ca(tp);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct vegas *vegas = inet_csk_ca(sk);
 
 	/* Begin taking Vegas samples next time we send something. */
 	vegas->doing_vegas_now = 1;
@@ -97,19 +98,19 @@ static inline void vegas_enable(struct tcp_sock *tp)
 }
 
 /* Stop taking Vegas samples for now. */
-static inline void vegas_disable(struct tcp_sock *tp)
+static inline void vegas_disable(struct sock *sk)
 {
-	struct vegas *vegas = tcp_ca(tp);
+	struct vegas *vegas = inet_csk_ca(sk);
 
 	vegas->doing_vegas_now = 0;
 }
 
-static void tcp_vegas_init(struct tcp_sock *tp)
+static void tcp_vegas_init(struct sock *sk)
 {
-	struct vegas *vegas = tcp_ca(tp);
+	struct vegas *vegas = inet_csk_ca(sk);
 
 	vegas->baseRTT = 0x7fffffff;
-	vegas_enable(tp);
+	vegas_enable(sk);
 }
 
 /* Do RTT sampling needed for Vegas.
@@ -120,9 +121,9 @@ static void tcp_vegas_init(struct tcp_sock *tp)
  *   o min-filter RTT samples from a much longer window (forever for now)
  *     to find the propagation delay (baseRTT)
  */
-static void tcp_vegas_rtt_calc(struct tcp_sock *tp, u32 usrtt)
+static void tcp_vegas_rtt_calc(struct sock *sk, u32 usrtt)
 {
-	struct vegas *vegas = tcp_ca(tp);
+	struct vegas *vegas = inet_csk_ca(sk);
 	u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */
 
 	/* Filter to find propagation delay: */
@@ -136,13 +137,13 @@ static void tcp_vegas_rtt_calc(struct tcp_sock *tp, u32 usrtt)
 	vegas->cntRTT++;
 }
 
-static void tcp_vegas_state(struct tcp_sock *tp, u8 ca_state)
+static void tcp_vegas_state(struct sock *sk, u8 ca_state)
 {
 
 	if (ca_state == TCP_CA_Open)
-		vegas_enable(tp);
+		vegas_enable(sk);
 	else
-		vegas_disable(tp);
+		vegas_disable(sk);
 }
 
 /*
@@ -154,20 +155,21 @@ static void tcp_vegas_state(struct tcp_sock *tp, u8 ca_state)
  * packets, _then_ we can make Vegas calculations
  * again.
  */
-static void tcp_vegas_cwnd_event(struct tcp_sock *tp, enum tcp_ca_event event)
+static void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event)
 {
 	if (event == CA_EVENT_CWND_RESTART ||
 	    event == CA_EVENT_TX_START)
-		tcp_vegas_init(tp);
+		tcp_vegas_init(sk);
 }
 
-static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack,
+static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
 				 u32 seq_rtt, u32 in_flight, int flag)
 {
-	struct vegas *vegas = tcp_ca(tp);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct vegas *vegas = inet_csk_ca(sk);
 
 	if (!vegas->doing_vegas_now)
-		return tcp_reno_cong_avoid(tp, ack, seq_rtt, in_flight, flag);
+		return tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag);
 
 	/* The key players are v_beg_snd_una and v_beg_snd_nxt.
 	 *
@@ -219,7 +221,7 @@ static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack,
 		 * but that's not too awful, since we're taking the min,
 		 * rather than averaging.
 		 */
-		tcp_vegas_rtt_calc(tp, seq_rtt*1000);
+		tcp_vegas_rtt_calc(sk, seq_rtt * 1000);
 
 		/* We do the Vegas calculations only if we got enough RTT
 		 * samples that we can be reasonably sure that we got
@@ -359,14 +361,14 @@ static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack,
 }
 
 /* Extract info for Tcp socket info provided via netlink. */
-static void tcp_vegas_get_info(struct tcp_sock *tp, u32 ext,
+static void tcp_vegas_get_info(struct sock *sk, u32 ext,
 			       struct sk_buff *skb)
 {
-	const struct vegas *ca = tcp_ca(tp);
-	if (ext & (1<<(TCPDIAG_VEGASINFO-1))) {
+	const struct vegas *ca = inet_csk_ca(sk);
+	if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
 		struct tcpvegas_info *info;
 
-		info = RTA_DATA(__RTA_PUT(skb, TCPDIAG_VEGASINFO,
+		info = RTA_DATA(__RTA_PUT(skb, INET_DIAG_VEGASINFO,
 					  sizeof(*info)));
 
 		info->tcpv_enabled = ca->doing_vegas_now;
@@ -393,7 +395,7 @@ static struct tcp_congestion_ops tcp_vegas = {
 
 static int __init tcp_vegas_register(void)
 {
-	BUG_ON(sizeof(struct vegas) > TCP_CA_PRIV_SIZE);
+	BUG_ON(sizeof(struct vegas) > ICSK_CA_PRIV_SIZE);
 	tcp_register_congestion_control(&tcp_vegas);
 	return 0;
 }
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index ef827242c94..0c340c3756c 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -8,7 +8,7 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/skbuff.h>
-#include <linux/tcp_diag.h>
+#include <linux/inet_diag.h>
 #include <net/tcp.h>
 
 /* TCP Westwood structure */
@@ -40,9 +40,9 @@ struct westwood {
  * way as soon as possible. It will reasonably happen within the first
  * RTT period of the connection lifetime.
  */
-static void tcp_westwood_init(struct tcp_sock *tp)
+static void tcp_westwood_init(struct sock *sk)
 {
-	struct westwood *w = tcp_ca(tp);
+	struct westwood *w = inet_csk_ca(sk);
 
 	w->bk = 0;
         w->bw_ns_est = 0;
@@ -51,7 +51,7 @@ static void tcp_westwood_init(struct tcp_sock *tp)
         w->cumul_ack = 0;
 	w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT;
 	w->rtt_win_sx = tcp_time_stamp;
-	w->snd_una = tp->snd_una;
+	w->snd_una = tcp_sk(sk)->snd_una;
 }
 
 /*
@@ -74,11 +74,11 @@ static inline void westwood_filter(struct westwood *w, u32 delta)
  * Called after processing group of packets.
  * but all westwood needs is the last sample of srtt.
  */
-static void tcp_westwood_pkts_acked(struct tcp_sock *tp, u32 cnt)
+static void tcp_westwood_pkts_acked(struct sock *sk, u32 cnt)
 {
-	struct westwood *w = tcp_ca(tp);
+	struct westwood *w = inet_csk_ca(sk);
 	if (cnt > 0)
-		w->rtt = tp->srtt >> 3;
+		w->rtt = tcp_sk(sk)->srtt >> 3;
 }
 
 /*
@@ -86,9 +86,9 @@ static void tcp_westwood_pkts_acked(struct tcp_sock *tp, u32 cnt)
  * It updates RTT evaluation window if it is the right moment to do
  * it. If so it calls filter for evaluating bandwidth.
  */
-static void westwood_update_window(struct tcp_sock *tp)
+static void westwood_update_window(struct sock *sk)
 {
-	struct westwood *w = tcp_ca(tp);
+	struct westwood *w = inet_csk_ca(sk);
 	s32 delta = tcp_time_stamp - w->rtt_win_sx;
 
 	/*
@@ -114,11 +114,12 @@ static void westwood_update_window(struct tcp_sock *tp)
  * header prediction is successful. In such case in fact update is
  * straight forward and doesn't need any particular care.
  */
-static inline void westwood_fast_bw(struct tcp_sock *tp)
+static inline void westwood_fast_bw(struct sock *sk)
 {
-	struct westwood *w = tcp_ca(tp);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct westwood *w = inet_csk_ca(sk);
 
-	westwood_update_window(tp);
+	westwood_update_window(sk);
 
 	w->bk += tp->snd_una - w->snd_una;
 	w->snd_una = tp->snd_una;
@@ -130,9 +131,10 @@ static inline void westwood_fast_bw(struct tcp_sock *tp)
  * This function evaluates cumul_ack for evaluating bk in case of
  * delayed or partial acks.
  */
-static inline u32 westwood_acked_count(struct tcp_sock *tp)
+static inline u32 westwood_acked_count(struct sock *sk)
 {
-	struct westwood *w = tcp_ca(tp);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct westwood *w = inet_csk_ca(sk);
 
 	w->cumul_ack = tp->snd_una - w->snd_una;
 
@@ -160,9 +162,10 @@ static inline u32 westwood_acked_count(struct tcp_sock *tp)
 	return w->cumul_ack;
 }
 
-static inline u32 westwood_bw_rttmin(const struct tcp_sock *tp)
+static inline u32 westwood_bw_rttmin(const struct sock *sk)
 {
-	struct westwood *w = tcp_ca(tp);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	const struct westwood *w = inet_csk_ca(sk);
 	return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2);
 }
 
@@ -172,31 +175,32 @@ static inline u32 westwood_bw_rttmin(const struct tcp_sock *tp)
  * in packets we use mss_cache). Rttmin is guaranteed to be >= 2
  * so avoids ever returning 0.
  */
-static u32 tcp_westwood_cwnd_min(struct tcp_sock *tp)
+static u32 tcp_westwood_cwnd_min(struct sock *sk)
 {
-	return westwood_bw_rttmin(tp);
+	return westwood_bw_rttmin(sk);
 }
 
-static void tcp_westwood_event(struct tcp_sock *tp, enum tcp_ca_event event)
+static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
 {
-	struct westwood *w = tcp_ca(tp);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct westwood *w = inet_csk_ca(sk);
 
 	switch(event) {
 	case CA_EVENT_FAST_ACK:
-		westwood_fast_bw(tp);
+		westwood_fast_bw(sk);
 		break;
 
 	case CA_EVENT_COMPLETE_CWR:
-		tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(tp);
+		tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(sk);
 		break;
 
 	case CA_EVENT_FRTO:
-		tp->snd_ssthresh = westwood_bw_rttmin(tp);
+		tp->snd_ssthresh = westwood_bw_rttmin(sk);
 		break;
 
 	case CA_EVENT_SLOW_ACK:
-		westwood_update_window(tp);
-		w->bk += westwood_acked_count(tp);
+		westwood_update_window(sk);
+		w->bk += westwood_acked_count(sk);
 		w->rtt_min = min(w->rtt, w->rtt_min);
 		break;
 
@@ -208,15 +212,15 @@ static void tcp_westwood_event(struct tcp_sock *tp, enum tcp_ca_event event)
 
 
 /* Extract info for Tcp socket info provided via netlink. */
-static void tcp_westwood_info(struct tcp_sock *tp, u32 ext,
+static void tcp_westwood_info(struct sock *sk, u32 ext,
 			      struct sk_buff *skb)
 {
-	const struct westwood *ca = tcp_ca(tp);
-	if (ext & (1<<(TCPDIAG_VEGASINFO-1))) {
+	const struct westwood *ca = inet_csk_ca(sk);
+	if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
 		struct rtattr *rta;
 		struct tcpvegas_info *info;
 
-		rta = __RTA_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*info));
+		rta = __RTA_PUT(skb, INET_DIAG_VEGASINFO, sizeof(*info));
 		info = RTA_DATA(rta);
 		info->tcpv_enabled = 1;
 		info->tcpv_rttcnt = 0;
@@ -242,7 +246,7 @@ static struct tcp_congestion_ops tcp_westwood = {
 
 static int __init tcp_westwood_register(void)
 {
-	BUG_ON(sizeof(struct westwood) > TCP_CA_PRIV_SIZE);
+	BUG_ON(sizeof(struct westwood) > ICSK_CA_PRIV_SIZE);
 	return tcp_register_congestion_control(&tcp_westwood);
 }
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index dc4d07357e3..e5beca7de86 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -95,7 +95,8 @@
 #include <linux/ipv6.h>
 #include <linux/netdevice.h>
 #include <net/snmp.h>
-#include <net/tcp.h>
+#include <net/ip.h>
+#include <net/tcp_states.h>
 #include <net/protocol.h>
 #include <linux/skbuff.h>
 #include <linux/proc_fs.h>
@@ -112,7 +113,7 @@
  *	Snmp MIB for the UDP layer
  */
 
-DEFINE_SNMP_STAT(struct udp_mib, udp_statistics);
+DEFINE_SNMP_STAT(struct udp_mib, udp_statistics) __read_mostly;
 
 struct hlist_head udp_hash[UDP_HTABLE_SIZE];
 DEFINE_RWLOCK(udp_hash_lock);
@@ -628,7 +629,7 @@ back_from_confirm:
 		/* ... which is an evident application bug. --ANK */
 		release_sock(sk);
 
-		LIMIT_NETDEBUG(printk(KERN_DEBUG "udp cork app bug 2\n"));
+		LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 2\n");
 		err = -EINVAL;
 		goto out;
 	}
@@ -693,7 +694,7 @@ static int udp_sendpage(struct sock *sk, struct page *page, int offset,
 	if (unlikely(!up->pending)) {
 		release_sock(sk);
 
-		LIMIT_NETDEBUG(printk(KERN_DEBUG "udp cork app bug 3\n"));
+		LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 3\n");
 		return -EINVAL;
 	}
 
@@ -1102,7 +1103,7 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
 		skb->ip_summed = CHECKSUM_UNNECESSARY;
 		if (!udp_check(uh, ulen, saddr, daddr, skb->csum))
 			return 0;
-		LIMIT_NETDEBUG(printk(KERN_DEBUG "udp v4 hw csum failure.\n"));
+		LIMIT_NETDEBUG(KERN_DEBUG "udp v4 hw csum failure.\n");
 		skb->ip_summed = CHECKSUM_NONE;
 	}
 	if (skb->ip_summed != CHECKSUM_UNNECESSARY)
@@ -1181,13 +1182,13 @@ int udp_rcv(struct sk_buff *skb)
 	return(0);
 
 short_packet:
-	LIMIT_NETDEBUG(printk(KERN_DEBUG "UDP: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n",
-			      NIPQUAD(saddr),
-			      ntohs(uh->source),
-			      ulen,
-			      len,
-			      NIPQUAD(daddr),
-			      ntohs(uh->dest)));
+	LIMIT_NETDEBUG(KERN_DEBUG "UDP: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n",
+		       NIPQUAD(saddr),
+		       ntohs(uh->source),
+		       ulen,
+		       len,
+		       NIPQUAD(daddr),
+		       ntohs(uh->dest));
 no_header:
 	UDP_INC_STATS_BH(UDP_MIB_INERRORS);
 	kfree_skb(skb);
@@ -1198,12 +1199,12 @@ csum_error:
 	 * RFC1122: OK.  Discards the bad packet silently (as far as 
 	 * the network is concerned, anyway) as per 4.1.3.4 (MUST). 
 	 */
-	LIMIT_NETDEBUG(printk(KERN_DEBUG "UDP: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n",
-			      NIPQUAD(saddr),
-			      ntohs(uh->source),
-			      NIPQUAD(daddr),
-			      ntohs(uh->dest),
-			      ulen));
+	LIMIT_NETDEBUG(KERN_DEBUG "UDP: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n",
+		       NIPQUAD(saddr),
+		       ntohs(uh->source),
+		       NIPQUAD(daddr),
+		       ntohs(uh->dest),
+		       ulen);
 drop:
 	UDP_INC_STATS_BH(UDP_MIB_INERRORS);
 	kfree_skb(skb);
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 050611d7a96..d23e07fc81f 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -128,8 +128,10 @@ void __init xfrm4_state_init(void)
 	xfrm_state_register_afinfo(&xfrm4_state_afinfo);
 }
 
+#if 0
 void __exit xfrm4_state_fini(void)
 {
 	xfrm_state_unregister_afinfo(&xfrm4_state_afinfo);
 }
+#endif  /*  0  */
 
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index b39e0494059..6460eec834b 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -8,7 +8,7 @@ ipv6-objs :=	af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o sit.o \
 		route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o raw.o \
 		protocol.o icmp.o mcast.o reassembly.o tcp_ipv6.o \
 		exthdrs.o sysctl_net_ipv6.o datagram.o proc.o \
-		ip6_flowlabel.o ipv6_syms.o
+		ip6_flowlabel.o ipv6_syms.o netfilter.o
 
 ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \
 	xfrm6_output.o
@@ -23,3 +23,5 @@ obj-$(CONFIG_NETFILTER)	+= netfilter/
 obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o
 
 obj-y += exthdrs_core.o
+
+obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 77004b9456c..937ad32db77 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1041,9 +1041,9 @@ int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2)
 	const struct in6_addr *sk_rcv_saddr6 = &inet6_sk(sk)->rcv_saddr;
 	const struct in6_addr *sk2_rcv_saddr6 = tcp_v6_rcv_saddr(sk2);
 	u32 sk_rcv_saddr = inet_sk(sk)->rcv_saddr;
-	u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
+	u32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
 	int sk_ipv6only = ipv6_only_sock(sk);
-	int sk2_ipv6only = tcp_v6_ipv6only(sk2);
+	int sk2_ipv6only = inet_v6_ipv6only(sk2);
 	int addr_type = ipv6_addr_type(sk_rcv_saddr6);
 	int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED;
 
@@ -1126,7 +1126,7 @@ void addrconf_leave_solict(struct inet6_dev *idev, struct in6_addr *addr)
 	__ipv6_dev_mc_dec(idev, &maddr);
 }
 
-void addrconf_join_anycast(struct inet6_ifaddr *ifp)
+static void addrconf_join_anycast(struct inet6_ifaddr *ifp)
 {
 	struct in6_addr addr;
 	ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len);
@@ -1135,7 +1135,7 @@ void addrconf_join_anycast(struct inet6_ifaddr *ifp)
 	ipv6_dev_ac_inc(ifp->idev->dev, &addr);
 }
 
-void addrconf_leave_anycast(struct inet6_ifaddr *ifp)
+static void addrconf_leave_anycast(struct inet6_ifaddr *ifp)
 {
 	struct in6_addr addr;
 	ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len);
@@ -2858,16 +2858,16 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa)
 
 	skb = alloc_skb(size, GFP_ATOMIC);
 	if (!skb) {
-		netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFADDR, ENOBUFS);
+		netlink_set_err(rtnl, 0, RTNLGRP_IPV6_IFADDR, ENOBUFS);
 		return;
 	}
 	if (inet6_fill_ifaddr(skb, ifa, current->pid, 0, event, 0) < 0) {
 		kfree_skb(skb);
-		netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFADDR, EINVAL);
+		netlink_set_err(rtnl, 0, RTNLGRP_IPV6_IFADDR, EINVAL);
 		return;
 	}
-	NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_IFADDR;
-	netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_IFADDR, GFP_ATOMIC);
+	NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_IFADDR;
+	netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_IFADDR, GFP_ATOMIC);
 }
 
 static void inline ipv6_store_devconf(struct ipv6_devconf *cnf,
@@ -2994,16 +2994,16 @@ void inet6_ifinfo_notify(int event, struct inet6_dev *idev)
 	
 	skb = alloc_skb(size, GFP_ATOMIC);
 	if (!skb) {
-		netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFINFO, ENOBUFS);
+		netlink_set_err(rtnl, 0, RTNLGRP_IPV6_IFINFO, ENOBUFS);
 		return;
 	}
 	if (inet6_fill_ifinfo(skb, idev, current->pid, 0, event, 0) < 0) {
 		kfree_skb(skb);
-		netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFINFO, EINVAL);
+		netlink_set_err(rtnl, 0, RTNLGRP_IPV6_IFINFO, EINVAL);
 		return;
 	}
-	NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_IFINFO;
-	netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_IFINFO, GFP_ATOMIC);
+	NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_IFINFO;
+	netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_IFINFO, GFP_ATOMIC);
 }
 
 static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev,
@@ -3054,16 +3054,16 @@ static void inet6_prefix_notify(int event, struct inet6_dev *idev,
 
 	skb = alloc_skb(size, GFP_ATOMIC);
 	if (!skb) {
-		netlink_set_err(rtnl, 0, RTMGRP_IPV6_PREFIX, ENOBUFS);
+		netlink_set_err(rtnl, 0, RTNLGRP_IPV6_PREFIX, ENOBUFS);
 		return;
 	}
 	if (inet6_fill_prefix(skb, idev, pinfo, current->pid, 0, event, 0) < 0) {
 		kfree_skb(skb);
-		netlink_set_err(rtnl, 0, RTMGRP_IPV6_PREFIX, EINVAL);
+		netlink_set_err(rtnl, 0, RTNLGRP_IPV6_PREFIX, EINVAL);
 		return;
 	}
-	NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_PREFIX;
-	netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_PREFIX, GFP_ATOMIC);
+	NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_PREFIX;
+	netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_PREFIX, GFP_ATOMIC);
 }
 
 static struct rtnetlink_link inet6_rtnetlink_table[RTM_NR_MSGTYPES] = {
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 28d9bcab097..4f8795af2ed 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -44,6 +44,7 @@
 #include <linux/netdevice.h>
 #include <linux/icmpv6.h>
 #include <linux/smp_lock.h>
+#include <linux/netfilter_ipv6.h>
 
 #include <net/ip.h>
 #include <net/ipv6.h>
@@ -66,45 +67,14 @@ MODULE_AUTHOR("Cast of dozens");
 MODULE_DESCRIPTION("IPv6 protocol stack for Linux");
 MODULE_LICENSE("GPL");
 
-/* IPv6 procfs goodies... */
-
-#ifdef CONFIG_PROC_FS
-extern int raw6_proc_init(void);
-extern void raw6_proc_exit(void);
-extern int tcp6_proc_init(void);
-extern void tcp6_proc_exit(void);
-extern int udp6_proc_init(void);
-extern void udp6_proc_exit(void);
-extern int ipv6_misc_proc_init(void);
-extern void ipv6_misc_proc_exit(void);
-extern int ac6_proc_init(void);
-extern void ac6_proc_exit(void);
-extern int if6_proc_init(void);
-extern void if6_proc_exit(void);
-#endif
-
 int sysctl_ipv6_bindv6only;
 
-#ifdef INET_REFCNT_DEBUG
-atomic_t inet6_sock_nr;
-EXPORT_SYMBOL(inet6_sock_nr);
-#endif
-
 /* The inetsw table contains everything that inet_create needs to
  * build a new socket.
  */
 static struct list_head inetsw6[SOCK_MAX];
 static DEFINE_SPINLOCK(inetsw6_lock);
 
-static void inet6_sock_destruct(struct sock *sk)
-{
-	inet_sock_destruct(sk);
-
-#ifdef INET_REFCNT_DEBUG
-	atomic_dec(&inet6_sock_nr);
-#endif
-}
-
 static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk)
 {
 	const int offset = sk->sk_prot->obj_size - sizeof(struct ipv6_pinfo);
@@ -185,7 +155,7 @@ static int inet6_create(struct socket *sock, int protocol)
 			inet->hdrincl = 1;
 	}
 
-	sk->sk_destruct		= inet6_sock_destruct;
+	sk->sk_destruct		= inet_sock_destruct;
 	sk->sk_family		= PF_INET6;
 	sk->sk_protocol		= protocol;
 
@@ -212,12 +182,17 @@ static int inet6_create(struct socket *sock, int protocol)
 		inet->pmtudisc = IP_PMTUDISC_DONT;
 	else
 		inet->pmtudisc = IP_PMTUDISC_WANT;
+	/* 
+	 * Increment only the relevant sk_prot->socks debug field, this changes
+	 * the previous behaviour of incrementing both the equivalent to
+	 * answer->prot->socks (inet6_sock_nr) and inet_sock_nr.
+	 *
+	 * This allows better debug granularity as we'll know exactly how many
+	 * UDPv6, TCPv6, etc socks were allocated, not the sum of all IPv6
+	 * transport protocol socks. -acme
+	 */
+	sk_refcnt_debug_inc(sk);
 
-
-#ifdef INET_REFCNT_DEBUG
-	atomic_inc(&inet6_sock_nr);
-	atomic_inc(&inet_sock_nr);
-#endif
 	if (inet->num) {
 		/* It assumes that any protocol which allows
 		 * the user to assign a number at socket
@@ -513,11 +488,6 @@ static struct net_proto_family inet6_family_ops = {
 	.owner	= THIS_MODULE,
 };
 
-#ifdef CONFIG_SYSCTL
-extern void ipv6_sysctl_register(void);
-extern void ipv6_sysctl_unregister(void);
-#endif
-
 /* Same as inet6_dgram_ops, sans udp_poll.  */
 static struct proto_ops inet6_sockraw_ops = {
 	.family =	PF_INET6,
@@ -684,8 +654,6 @@ static void cleanup_ipv6_mibs(void)
 	snmp6_mib_free((void **)udp_stats_in6);
 }
 
-extern int ipv6_misc_proc_init(void);
-
 static int __init inet6_init(void)
 {
 	struct sk_buff *dummy_skb;
@@ -757,6 +725,9 @@ static int __init inet6_init(void)
 	err = igmp6_init(&inet6_family_ops);
 	if (err)
 		goto igmp_fail;
+	err = ipv6_netfilter_init();
+	if (err)
+		goto netfilter_fail;
 	/* Create /proc/foo6 entries. */
 #ifdef CONFIG_PROC_FS
 	err = -ENOMEM;
@@ -813,6 +784,8 @@ proc_tcp6_fail:
 	raw6_proc_exit();
 proc_raw6_fail:
 #endif
+	ipv6_netfilter_fini();
+netfilter_fail:
 	igmp6_cleanup();
 igmp_fail:
 	ndisc_cleanup();
@@ -852,6 +825,7 @@ static void __exit inet6_exit(void)
 	ip6_route_cleanup();
 	ipv6_packet_cleanup();
 	igmp6_cleanup();
+	ipv6_netfilter_fini();
 	ndisc_cleanup();
 	icmpv6_cleanup();
 #ifdef CONFIG_SYSCTL
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index 986fdfdccbc..0ebfad907a0 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -131,10 +131,10 @@ static int ipv6_clear_mutable_options(struct ipv6hdr *iph, int len)
 		case NEXTHDR_HOP:
 		case NEXTHDR_DEST:
 			if (!zero_out_mutable_opts(exthdr.opth)) {
-				LIMIT_NETDEBUG(printk(
+				LIMIT_NETDEBUG(
 					KERN_WARNING "overrun %sopts\n",
 					nexthdr == NEXTHDR_HOP ?
-						"hop" : "dest"));
+						"hop" : "dest");
 				return -EINVAL;
 			}
 			break;
@@ -293,8 +293,7 @@ static int ah6_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struc
 		skb_push(skb, skb->data - skb->nh.raw);
 		ahp->icv(ahp, skb, ah->auth_data);
 		if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) {
-			LIMIT_NETDEBUG(
-				printk(KERN_WARNING "ipsec ah authentication error\n"));
+			LIMIT_NETDEBUG(KERN_WARNING "ipsec ah authentication error\n");
 			x->stats.integrity_failed++;
 			goto free_out;
 		}
@@ -332,9 +331,9 @@ static void ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	if (!x)
 		return;
 
-	NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/"
-			"%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
-	       ntohl(ah->spi), NIP6(iph->daddr)));
+	NETDEBUG(KERN_DEBUG "pmtu discovery on SA AH/%08x/"
+		 "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
+		 ntohl(ah->spi), NIP6(iph->daddr));
 
 	xfrm_state_put(x);
 }
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 5229365cd8b..01468fab3d3 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -29,6 +29,7 @@
 #include <net/addrconf.h>
 #include <net/transp_v6.h>
 #include <net/ip6_route.h>
+#include <net/tcp_states.h>
 
 #include <linux/errqueue.h>
 #include <asm/uaccess.h>
@@ -588,8 +589,8 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl,
 			break;
 
 		default:
-			LIMIT_NETDEBUG(
-				printk(KERN_DEBUG "invalid cmsg type: %d\n", cmsg->cmsg_type));
+			LIMIT_NETDEBUG(KERN_DEBUG "invalid cmsg type: %d\n",
+			               cmsg->cmsg_type);
 			err = -EINVAL;
 			break;
 		};
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 324db62515a..e8bff9d3d96 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -212,8 +212,7 @@ static int esp6_input(struct xfrm_state *x, struct xfrm_decap_state *decap, stru
 
 		padlen = nexthdr[0];
 		if (padlen+2 >= elen) {
-			LIMIT_NETDEBUG(
-				printk(KERN_WARNING "ipsec esp packet is garbage padlen=%d, elen=%d\n", padlen+2, elen));
+			LIMIT_NETDEBUG(KERN_WARNING "ipsec esp packet is garbage padlen=%d, elen=%d\n", padlen+2, elen);
 			ret = -EINVAL;
 			goto out;
 		}
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index e0839eafc3a..5be6da2584e 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -424,8 +424,8 @@ static int ipv6_hop_ra(struct sk_buff *skb, int optoff)
 		IP6CB(skb)->ra = optoff;
 		return 1;
 	}
-	LIMIT_NETDEBUG(
-		 printk(KERN_DEBUG "ipv6_hop_ra: wrong RA length %d\n", skb->nh.raw[optoff+1]));
+	LIMIT_NETDEBUG(KERN_DEBUG "ipv6_hop_ra: wrong RA length %d\n",
+	               skb->nh.raw[optoff+1]);
 	kfree_skb(skb);
 	return 0;
 }
@@ -437,8 +437,8 @@ static int ipv6_hop_jumbo(struct sk_buff *skb, int optoff)
 	u32 pkt_len;
 
 	if (skb->nh.raw[optoff+1] != 4 || (optoff&3) != 2) {
-		LIMIT_NETDEBUG(
-			 printk(KERN_DEBUG "ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n", skb->nh.raw[optoff+1]));
+		LIMIT_NETDEBUG(KERN_DEBUG "ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n",
+		               skb->nh.raw[optoff+1]);
 		IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
 		goto drop;
 	}
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index ff3ec9822e3..5176fc655ea 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -67,7 +67,7 @@
 #include <asm/uaccess.h>
 #include <asm/system.h>
 
-DEFINE_SNMP_STAT(struct icmpv6_mib, icmpv6_statistics);
+DEFINE_SNMP_STAT(struct icmpv6_mib, icmpv6_statistics) __read_mostly;
 
 /*
  *	The ICMP socket(s). This is the most convenient way to flow control
@@ -332,8 +332,7 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info,
 	 *	for now we don't know that.
 	 */
 	if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) {
-		LIMIT_NETDEBUG(
-			printk(KERN_DEBUG "icmpv6_send: addr_any/mcast source\n"));
+		LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: addr_any/mcast source\n");
 		return;
 	}
 
@@ -341,8 +340,7 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info,
 	 *	Never answer to a ICMP packet.
 	 */
 	if (is_ineligible(skb)) {
-		LIMIT_NETDEBUG(
-			printk(KERN_DEBUG "icmpv6_send: no reply to icmp error\n")); 
+		LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: no reply to icmp error\n");
 		return;
 	}
 
@@ -393,8 +391,7 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info,
 	len = skb->len - msg.offset;
 	len = min_t(unsigned int, len, IPV6_MIN_MTU - sizeof(struct ipv6hdr) -sizeof(struct icmp6hdr));
 	if (len < 0) {
-		LIMIT_NETDEBUG(
-			printk(KERN_DEBUG "icmp: len problem\n"));
+		LIMIT_NETDEBUG(KERN_DEBUG "icmp: len problem\n");
 		goto out_dst_release;
 	}
 
@@ -551,7 +548,8 @@ static void icmpv6_notify(struct sk_buff *skb, int type, int code, u32 info)
 
 	read_lock(&raw_v6_lock);
 	if ((sk = sk_head(&raw_v6_htable[hash])) != NULL) {
-		while((sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr))) {
+		while((sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr,
+					    skb->dev->ifindex))) {
 			rawv6_err(sk, skb, NULL, type, code, inner_offset, info);
 			sk = sk_next(sk);
 		}
@@ -583,17 +581,15 @@ static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
 		skb->ip_summed = CHECKSUM_UNNECESSARY;
 		if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6,
 				    skb->csum)) {
-			LIMIT_NETDEBUG(
-				printk(KERN_DEBUG "ICMPv6 hw checksum failed\n"));
+			LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 hw checksum failed\n");
 			skb->ip_summed = CHECKSUM_NONE;
 		}
 	}
 	if (skb->ip_summed == CHECKSUM_NONE) {
 		if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6,
 				    skb_checksum(skb, 0, skb->len, 0))) {
-			LIMIT_NETDEBUG(
-				printk(KERN_DEBUG "ICMPv6 checksum failed [%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x > %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x]\n",
-				       NIP6(*saddr), NIP6(*daddr)));
+			LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 checksum failed [%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x > %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x]\n",
+				       NIP6(*saddr), NIP6(*daddr));
 			goto discard_it;
 		}
 	}
@@ -669,8 +665,7 @@ static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
 		break;
 
 	default:
-		LIMIT_NETDEBUG(
-			printk(KERN_DEBUG "icmpv6: msg of unknown type\n"));
+		LIMIT_NETDEBUG(KERN_DEBUG "icmpv6: msg of unknown type\n");
 
 		/* informational */
 		if (type & ICMPV6_INFOMSG_MASK)
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
new file mode 100644
index 00000000000..01d5f46d4e4
--- /dev/null
+++ b/net/ipv6/inet6_hashtables.c
@@ -0,0 +1,81 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Generic INET6 transport hashtables
+ *
+ * Authors:	Lotsa people, from code originally in tcp
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+
+#include <linux/module.h>
+
+#include <net/inet_connection_sock.h>
+#include <net/inet_hashtables.h>
+#include <net/inet6_hashtables.h>
+
+struct sock *inet6_lookup_listener(struct inet_hashinfo *hashinfo,
+				   const struct in6_addr *daddr,
+				   const unsigned short hnum, const int dif)
+{
+	struct sock *sk;
+	const struct hlist_node *node;
+	struct sock *result = NULL;
+	int score, hiscore = 0;
+
+	read_lock(&hashinfo->lhash_lock);
+	sk_for_each(sk, node, &hashinfo->listening_hash[inet_lhashfn(hnum)]) {
+		if (inet_sk(sk)->num == hnum && sk->sk_family == PF_INET6) {
+			const struct ipv6_pinfo *np = inet6_sk(sk);
+			
+			score = 1;
+			if (!ipv6_addr_any(&np->rcv_saddr)) {
+				if (!ipv6_addr_equal(&np->rcv_saddr, daddr))
+					continue;
+				score++;
+			}
+			if (sk->sk_bound_dev_if) {
+				if (sk->sk_bound_dev_if != dif)
+					continue;
+				score++;
+			}
+			if (score == 3) {
+				result = sk;
+				break;
+			}
+			if (score > hiscore) {
+				hiscore = score;
+				result = sk;
+			}
+		}
+	}
+	if (result)
+		sock_hold(result);
+	read_unlock(&hashinfo->lhash_lock);
+	return result;
+}
+
+EXPORT_SYMBOL_GPL(inet6_lookup_listener);
+
+struct sock *inet6_lookup(struct inet_hashinfo *hashinfo,
+			  const struct in6_addr *saddr, const u16 sport,
+			  const struct in6_addr *daddr, const u16 dport,
+			  const int dif)
+{
+	struct sock *sk;
+
+	local_bh_disable();
+	sk = __inet6_lookup(hashinfo, saddr, sport, daddr, ntohs(dport), dif);
+	local_bh_enable();
+
+	return sk;
+}
+
+EXPORT_SYMBOL_GPL(inet6_lookup);
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 1b354aa9793..16af874c9e8 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -49,7 +49,7 @@
 
 struct rt6_statistics	rt6_stats;
 
-static kmem_cache_t * fib6_node_kmem;
+static kmem_cache_t * fib6_node_kmem __read_mostly;
 
 enum fib_walk_state_t
 {
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 10fbb50daea..6e348042693 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -56,7 +56,7 @@ static inline int ip6_rcv_finish( struct sk_buff *skb)
 	return dst_input(skb);
 }
 
-int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
 {
 	struct ipv6hdr *hdr;
 	u32 		pkt_len;
@@ -166,8 +166,8 @@ resubmit:
 	nexthdr = skb->nh.raw[nhoff];
 
 	raw_sk = sk_head(&raw_v6_htable[nexthdr & (MAX_INET_PROTOS - 1)]);
-	if (raw_sk)
-		ipv6_raw_deliver(skb, nexthdr);
+	if (raw_sk && !ipv6_raw_deliver(skb, nexthdr))
+		raw_sk = NULL;
 
 	hash = nexthdr & (MAX_INET_PROTOS - 1);
 	if ((ipprot = rcu_dereference(inet6_protos[hash])) != NULL) {
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index ae652ca14bc..01ef94f7c7f 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -153,51 +153,6 @@ int ip6_output(struct sk_buff *skb)
 		return ip6_output2(skb);
 }
 
-#ifdef CONFIG_NETFILTER
-int ip6_route_me_harder(struct sk_buff *skb)
-{
-	struct ipv6hdr *iph = skb->nh.ipv6h;
-	struct dst_entry *dst;
-	struct flowi fl = {
-		.oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
-		.nl_u =
-		{ .ip6_u =
-		  { .daddr = iph->daddr,
-		    .saddr = iph->saddr, } },
-		.proto = iph->nexthdr,
-	};
-
-	dst = ip6_route_output(skb->sk, &fl);
-
-	if (dst->error) {
-		IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
-		LIMIT_NETDEBUG(
-			printk(KERN_DEBUG "ip6_route_me_harder: No more route.\n"));
-		dst_release(dst);
-		return -EINVAL;
-	}
-
-	/* Drop old route. */
-	dst_release(skb->dst);
-
-	skb->dst = dst;
-	return 0;
-}
-#endif
-
-static inline int ip6_maybe_reroute(struct sk_buff *skb)
-{
-#ifdef CONFIG_NETFILTER
-	if (skb->nfcache & NFC_ALTERED){
-		if (ip6_route_me_harder(skb) != 0){
-			kfree_skb(skb);
-			return -EINVAL;
-		}
-	}
-#endif /* CONFIG_NETFILTER */
-	return dst_output(skb);
-}
-
 /*
  *	xmit an sk_buff (used by TCP)
  */
@@ -266,7 +221,8 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
 	mtu = dst_mtu(dst);
 	if ((skb->len <= mtu) || ipfragok) {
 		IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
-		return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, ip6_maybe_reroute);
+		return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev,
+				dst_output);
 	}
 
 	if (net_ratelimit())
@@ -321,7 +277,9 @@ static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 	read_lock(&ip6_ra_lock);
 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
 		struct sock *sk = ra->sk;
-		if (sk && ra->sel == sel) {
+		if (sk && ra->sel == sel &&
+		    (!sk->sk_bound_dev_if ||
+		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
 			if (last) {
 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 				if (skb2)
@@ -667,7 +625,7 @@ slow_path:
 		 */
 
 		if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
-			NETDEBUG(printk(KERN_INFO "IPv6: frag: no memory for new fragment!\n"));
+			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
 			IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
 			err = -ENOMEM;
 			goto fail;
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 3bc144a79fa..76466af8331 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -55,7 +55,7 @@
 
 #include <asm/uaccess.h>
 
-DEFINE_SNMP_STAT(struct ipstats_mib, ipv6_statistics);
+DEFINE_SNMP_STAT(struct ipstats_mib, ipv6_statistics) __read_mostly;
 
 static struct packet_type ipv6_packet_type = {
 	.type = __constant_htons(ETH_P_IPV6), 
@@ -109,13 +109,6 @@ int ip6_ra_control(struct sock *sk, int sel, void (*destructor)(struct sock *))
 	return 0;
 }
 
-extern int ip6_mc_source(int add, int omode, struct sock *sk,
-	struct group_source_req *pgsr);
-extern int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf);
-extern int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf,
-	struct group_filter __user *optval, int __user *optlen);
-
-
 int ipv6_setsockopt(struct sock *sk, int level, int optname,
 		    char __user *optval, int optlen)
 {
@@ -163,6 +156,13 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname,
 			fl6_free_socklist(sk);
 			ipv6_sock_mc_close(sk);
 
+			/*
+			 * Sock is moving from IPv6 to IPv4 (sk_prot), so
+			 * remove it from the refcnt debug socks count in the
+			 * original family...
+			 */
+			sk_refcnt_debug_dec(sk);
+
 			if (sk->sk_protocol == IPPROTO_TCP) {
 				struct tcp_sock *tp = tcp_sk(sk);
 
@@ -192,9 +192,11 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname,
 				kfree_skb(pktopt);
 
 			sk->sk_destruct = inet_sock_destruct;
-#ifdef INET_REFCNT_DEBUG
-			atomic_dec(&inet6_sock_nr);
-#endif
+			/*
+			 * ... and add it to the refcnt debug socks count
+			 * in the new family. -acme
+			 */
+			sk_refcnt_debug_inc(sk);
 			module_put(THIS_MODULE);
 			retv = 0;
 			break;
@@ -437,7 +439,6 @@ done:
 	}
 	case MCAST_MSFILTER:
 	{
-		extern int sysctl_optmem_max;
 		extern int sysctl_mld_max_msf;
 		struct group_filter *gsf;
 
diff --git a/net/ipv6/ipv6_syms.c b/net/ipv6/ipv6_syms.c
index 5ade5a5d199..37a4a99c9fe 100644
--- a/net/ipv6/ipv6_syms.c
+++ b/net/ipv6/ipv6_syms.c
@@ -15,9 +15,6 @@ EXPORT_SYMBOL(ndisc_mc_map);
 EXPORT_SYMBOL(register_inet6addr_notifier);
 EXPORT_SYMBOL(unregister_inet6addr_notifier);
 EXPORT_SYMBOL(ip6_route_output);
-#ifdef CONFIG_NETFILTER
-EXPORT_SYMBOL(ip6_route_me_harder);
-#endif
 EXPORT_SYMBOL(addrconf_lock);
 EXPORT_SYMBOL(ipv6_setsockopt);
 EXPORT_SYMBOL(ipv6_getsockopt);
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 7ae72d4c9bd..a7eae30f455 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -812,7 +812,7 @@ static void ndisc_recv_ns(struct sk_buff *skb)
 		if (ipv6_chk_acast_addr(dev, &msg->target) ||
 		    (idev->cnf.forwarding && 
 		     pneigh_lookup(&nd_tbl, &msg->target, dev, 0))) {
-			if (skb->stamp.tv_sec != LOCALLY_ENQUEUED &&
+			if (!(NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED) &&
 			    skb->pkt_type != PACKET_HOST &&
 			    inc != 0 &&
 			    idev->nd_parms->proxy_delay != 0) {
@@ -1487,6 +1487,8 @@ int ndisc_rcv(struct sk_buff *skb)
 		return 0;
 	}
 
+	memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
+
 	switch (msg->icmph.icmp6_type) {
 	case NDISC_NEIGHBOUR_SOLICITATION:
 		ndisc_recv_ns(skb);
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
new file mode 100644
index 00000000000..f8626ebf90f
--- /dev/null
+++ b/net/ipv6/netfilter.c
@@ -0,0 +1,104 @@
+#include <linux/config.h>
+#include <linux/init.h>
+
+#ifdef CONFIG_NETFILTER
+
+#include <linux/kernel.h>
+#include <linux/ipv6.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+#include <net/dst.h>
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+
+int ip6_route_me_harder(struct sk_buff *skb)
+{
+	struct ipv6hdr *iph = skb->nh.ipv6h;
+	struct dst_entry *dst;
+	struct flowi fl = {
+		.oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
+		.nl_u =
+		{ .ip6_u =
+		  { .daddr = iph->daddr,
+		    .saddr = iph->saddr, } },
+		.proto = iph->nexthdr,
+	};
+
+	dst = ip6_route_output(skb->sk, &fl);
+
+	if (dst->error) {
+		IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
+		LIMIT_NETDEBUG(KERN_DEBUG "ip6_route_me_harder: No more route.\n");
+		dst_release(dst);
+		return -EINVAL;
+	}
+
+	/* Drop old route. */
+	dst_release(skb->dst);
+
+	skb->dst = dst;
+	return 0;
+}
+EXPORT_SYMBOL(ip6_route_me_harder);
+
+/*
+ * Extra routing may needed on local out, as the QUEUE target never
+ * returns control to the table.
+ */
+
+struct ip6_rt_info {
+	struct in6_addr daddr;
+	struct in6_addr saddr;
+};
+
+static void save(const struct sk_buff *skb, struct nf_info *info)
+{
+	struct ip6_rt_info *rt_info = nf_info_reroute(info);
+
+	if (info->hook == NF_IP6_LOCAL_OUT) {
+		struct ipv6hdr *iph = skb->nh.ipv6h;
+
+		rt_info->daddr = iph->daddr;
+		rt_info->saddr = iph->saddr;
+	}
+}
+
+static int reroute(struct sk_buff **pskb, const struct nf_info *info)
+{
+	struct ip6_rt_info *rt_info = nf_info_reroute(info);
+
+	if (info->hook == NF_IP6_LOCAL_OUT) {
+		struct ipv6hdr *iph = (*pskb)->nh.ipv6h;
+		if (!ipv6_addr_equal(&iph->daddr, &rt_info->daddr) ||
+		    !ipv6_addr_equal(&iph->saddr, &rt_info->saddr))
+			return ip6_route_me_harder(*pskb);
+	}
+	return 0;
+}
+
+static struct nf_queue_rerouter ip6_reroute = {
+	.rer_size	= sizeof(struct ip6_rt_info),
+	.save 		= &save,
+	.reroute	= &reroute,
+};
+
+int __init ipv6_netfilter_init(void)
+{
+	return nf_register_queue_rerouter(PF_INET6, &ip6_reroute);
+}
+
+void ipv6_netfilter_fini(void)
+{
+	nf_unregister_queue_rerouter(PF_INET6);
+}
+
+#else /* CONFIG_NETFILTER */
+int __init ipv6_netfilter_init(void)
+{
+	return 0;
+}
+
+void ipv6_netfilter_fini(void)
+{
+}
+#endif /* CONFIG_NETFILTER */
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 77ec704c9ee..216fbe1ac65 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -10,13 +10,16 @@ menu "IPv6: Netfilter Configuration (EXPERIMENTAL)"
 #  dep_tristate '  FTP protocol support' CONFIG_IP6_NF_FTP $CONFIG_IP6_NF_CONNTRACK
 #fi
 config IP6_NF_QUEUE
-	tristate "Userspace queueing via NETLINK"
+	tristate "IP6 Userspace queueing via NETLINK (OBSOLETE)"
 	---help---
 
 	  This option adds a queue handler to the kernel for IPv6
-	  packets which lets us to receive the filtered packets
-	  with QUEUE target using libiptc as we can do with
-	  the IPv4 now.
+	  packets which enables users to receive the filtered packets
+	  with QUEUE target using libipq.
+
+	  THis option enables the old IPv6-only "ip6_queue" implementation
+	  which has been obsoleted by the new "nfnetlink_queue" code (see
+	  CONFIG_NETFILTER_NETLINK_QUEUE).
 
 	  (C) Fernando Anton 2001
 	  IPv64 Project - Work based in IPv64 draft by Arturo Azcorra.
@@ -196,6 +199,16 @@ config IP6_NF_TARGET_LOG
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config IP6_NF_TARGET_REJECT
+	tristate "REJECT target support"
+	depends on IP6_NF_FILTER
+	help
+	  The REJECT target allows a filtering rule to specify that an ICMPv6
+	  error should be issued in response to an incoming packet, rather
+	  than silently being dropped.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 #  if [ "$CONFIG_IP6_NF_FILTER" != "n" ]; then
 #    dep_tristate '    REJECT target support' CONFIG_IP6_NF_TARGET_REJECT $CONFIG_IP6_NF_FILTER
 #    if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
@@ -226,6 +239,22 @@ config IP6_NF_TARGET_MARK
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config IP6_NF_TARGET_HL
+	tristate  'HL (hoplimit) target support'
+	depends on IP6_NF_MANGLE
+	help
+	  This option adds a `HL' target, which enables the user to decrement
+	  the hoplimit value of the IPv6 header or set it to a given (lower)
+	  value.
+	
+	  While it is safe to decrement the hoplimit value, this option also
+	  enables functionality to increment and set the hoplimit value of the
+	  IPv6 header to arbitrary values.  This is EXTREMELY DANGEROUS since
+	  you can easily create immortal packets that loop forever on the
+	  network.  
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 #dep_tristate '  LOG target support' CONFIG_IP6_NF_TARGET_LOG $CONFIG_IP6_NF_IPTABLES
 config IP6_NF_RAW
 	tristate  'raw table support (required for TRACE)'
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index 2e51714953b..bd9a16a5cbb 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -20,7 +20,10 @@ obj-$(CONFIG_IP6_NF_MATCH_PHYSDEV) += ip6t_physdev.o
 obj-$(CONFIG_IP6_NF_FILTER) += ip6table_filter.o
 obj-$(CONFIG_IP6_NF_MANGLE) += ip6table_mangle.o
 obj-$(CONFIG_IP6_NF_TARGET_MARK) += ip6t_MARK.o
+obj-$(CONFIG_IP6_NF_TARGET_HL) += ip6t_HL.o
 obj-$(CONFIG_IP6_NF_QUEUE) += ip6_queue.o
 obj-$(CONFIG_IP6_NF_TARGET_LOG) += ip6t_LOG.o
 obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o
 obj-$(CONFIG_IP6_NF_MATCH_HL) += ip6t_hl.o
+obj-$(CONFIG_IP6_NF_TARGET_REJECT) += ip6t_REJECT.o
+obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += ip6t_NFQUEUE.o
diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c
index a16df5b27c8..aa11cf366ef 100644
--- a/net/ipv6/netfilter/ip6_queue.c
+++ b/net/ipv6/netfilter/ip6_queue.c
@@ -47,16 +47,10 @@
 #define NET_IPQ_QMAX 2088
 #define NET_IPQ_QMAX_NAME "ip6_queue_maxlen"
 
-struct ipq_rt_info {
-	struct in6_addr daddr;
-	struct in6_addr saddr;
-};
-
 struct ipq_queue_entry {
 	struct list_head list;
 	struct nf_info *info;
 	struct sk_buff *skb;
-	struct ipq_rt_info rt_info;
 };
 
 typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long);
@@ -244,8 +238,8 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp)
 
 	pmsg->packet_id       = (unsigned long )entry;
 	pmsg->data_len        = data_len;
-	pmsg->timestamp_sec   = entry->skb->stamp.tv_sec;
-	pmsg->timestamp_usec  = entry->skb->stamp.tv_usec;
+	pmsg->timestamp_sec   = skb_tv_base.tv_sec + entry->skb->tstamp.off_sec;
+	pmsg->timestamp_usec  = skb_tv_base.tv_usec + entry->skb->tstamp.off_usec;
 	pmsg->mark            = entry->skb->nfmark;
 	pmsg->hook            = entry->info->hook;
 	pmsg->hw_protocol     = entry->skb->protocol;
@@ -284,7 +278,8 @@ nlmsg_failure:
 }
 
 static int
-ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data)
+ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, 
+		   unsigned int queuenum, void *data)
 {
 	int status = -EINVAL;
 	struct sk_buff *nskb;
@@ -302,13 +297,6 @@ ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data)
 	entry->info = info;
 	entry->skb = skb;
 
-	if (entry->info->hook == NF_IP_LOCAL_OUT) {
-		struct ipv6hdr *iph = skb->nh.ipv6h;
-
-		entry->rt_info.daddr = iph->daddr;
-		entry->rt_info.saddr = iph->saddr;
-	}
-
 	nskb = ipq_build_packet_message(entry, &status);
 	if (nskb == NULL)
 		goto err_out_free;
@@ -384,23 +372,11 @@ ipq_mangle_ipv6(ipq_verdict_msg_t *v, struct ipq_queue_entry *e)
 		}
 		skb_put(e->skb, diff);
 	}
-	if (!skb_ip_make_writable(&e->skb, v->data_len))
+	if (!skb_make_writable(&e->skb, v->data_len))
 		return -ENOMEM;
 	memcpy(e->skb->data, v->payload, v->data_len);
 	e->skb->ip_summed = CHECKSUM_NONE;
-	e->skb->nfcache |= NFC_ALTERED;
-
-	/*
-	 * Extra routing may needed on local out, as the QUEUE target never
-	 * returns control to the table.
-         * Not a nice way to cmp, but works
-	 */
-	if (e->info->hook == NF_IP_LOCAL_OUT) {
-		struct ipv6hdr *iph = e->skb->nh.ipv6h;
-		if (!ipv6_addr_equal(&iph->daddr, &e->rt_info.daddr) ||
-		    !ipv6_addr_equal(&iph->saddr, &e->rt_info.saddr))
-			return ip6_route_me_harder(e->skb);
-	}
+
 	return 0;
 }
 
@@ -676,6 +652,11 @@ ipq_get_info(char *buffer, char **start, off_t offset, int length)
 	return len;
 }
 
+static struct nf_queue_handler nfqh = {
+	.name	= "ip6_queue",
+	.outfn	= &ipq_enqueue_packet,
+};
+
 static int
 init_or_cleanup(int init)
 {
@@ -686,7 +667,8 @@ init_or_cleanup(int init)
 		goto cleanup;
 
 	netlink_register_notifier(&ipq_nl_notifier);
-	ipqnl = netlink_kernel_create(NETLINK_IP6_FW, ipq_rcv_sk);
+	ipqnl = netlink_kernel_create(NETLINK_IP6_FW, 0, ipq_rcv_sk,
+	                              THIS_MODULE);
 	if (ipqnl == NULL) {
 		printk(KERN_ERR "ip6_queue: failed to create netlink socket\n");
 		goto cleanup_netlink_notifier;
@@ -703,7 +685,7 @@ init_or_cleanup(int init)
 	register_netdevice_notifier(&ipq_dev_notifier);
 	ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0);
 	
-	status = nf_register_queue_handler(PF_INET6, ipq_enqueue_packet, NULL);
+	status = nf_register_queue_handler(PF_INET6, &nfqh);
 	if (status < 0) {
 		printk(KERN_ERR "ip6_queue: failed to register queue handler\n");
 		goto cleanup_sysctl;
@@ -711,7 +693,7 @@ init_or_cleanup(int init)
 	return status;
 
 cleanup:
-	nf_unregister_queue_handler(PF_INET6);
+	nf_unregister_queue_handlers(&nfqh);
 	synchronize_net();
 	ipq_flush(NF_DROP);
 	
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 73034511c8d..1cb8adb2787 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -401,7 +401,6 @@ ip6t_do_table(struct sk_buff **pskb,
 	do {
 		IP_NF_ASSERT(e);
 		IP_NF_ASSERT(back);
-		(*pskb)->nfcache |= e->nfcache;
 		if (ip6_packet_match(*pskb, indev, outdev, &e->ipv6,
 			&protoff, &offset)) {
 			struct ip6t_entry_target *t;
@@ -434,8 +433,8 @@ ip6t_do_table(struct sk_buff **pskb,
 							 back->comefrom);
 					continue;
 				}
-				if (table_base + v
-				    != (void *)e + e->next_offset) {
+				if (table_base + v != (void *)e + e->next_offset
+				    && !(e->ipv6.flags & IP6T_F_GOTO)) {
 					/* Save old back ptr in next entry */
 					struct ip6t_entry *next
 						= (void *)e + e->next_offset;
diff --git a/net/ipv6/netfilter/ip6t_HL.c b/net/ipv6/netfilter/ip6t_HL.c
new file mode 100644
index 00000000000..8f5549b7272
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_HL.c
@@ -0,0 +1,118 @@
+/* 
+ * Hop Limit modification target for ip6tables
+ * Maciej Soltysiak <solt@dns.toxicfilms.tv>
+ * Based on HW's TTL module
+ *
+ * This software is distributed under the terms of GNU GPL
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter_ipv6/ip6t_HL.h>
+
+MODULE_AUTHOR("Maciej Soltysiak <solt@dns.toxicfilms.tv>");
+MODULE_DESCRIPTION("IP tables Hop Limit modification module");
+MODULE_LICENSE("GPL");
+
+static unsigned int ip6t_hl_target(struct sk_buff **pskb, 
+				   const struct net_device *in,
+				   const struct net_device *out,
+				   unsigned int hooknum,
+				   const void *targinfo, void *userinfo)
+{
+	struct ipv6hdr *ip6h;
+	const struct ip6t_HL_info *info = targinfo;
+	u_int16_t diffs[2];
+	int new_hl;
+
+	if (!skb_make_writable(pskb, (*pskb)->len))
+		return NF_DROP;
+
+	ip6h = (*pskb)->nh.ipv6h;
+
+	switch (info->mode) {
+		case IP6T_HL_SET:
+			new_hl = info->hop_limit;
+			break;
+		case IP6T_HL_INC:
+			new_hl = ip6h->hop_limit + info->hop_limit;
+			if (new_hl > 255)
+				new_hl = 255;
+			break;
+		case IP6T_HL_DEC:
+			new_hl = ip6h->hop_limit - info->hop_limit;
+			if (new_hl < 0)
+				new_hl = 0;
+			break;
+		default:
+			new_hl = ip6h->hop_limit;
+			break;
+	}
+
+	if (new_hl != ip6h->hop_limit) {
+		diffs[0] = htons(((unsigned)ip6h->hop_limit) << 8) ^ 0xFFFF;
+		ip6h->hop_limit = new_hl;
+		diffs[1] = htons(((unsigned)ip6h->hop_limit) << 8);
+	}
+
+	return IP6T_CONTINUE;
+}
+
+static int ip6t_hl_checkentry(const char *tablename,
+		const struct ip6t_entry *e,
+		void *targinfo,
+		unsigned int targinfosize,
+		unsigned int hook_mask)
+{
+	struct ip6t_HL_info *info = targinfo;
+
+	if (targinfosize != IP6T_ALIGN(sizeof(struct ip6t_HL_info))) {
+		printk(KERN_WARNING "ip6t_HL: targinfosize %u != %Zu\n",
+				targinfosize,
+				IP6T_ALIGN(sizeof(struct ip6t_HL_info)));
+		return 0;	
+	}	
+
+	if (strcmp(tablename, "mangle")) {
+		printk(KERN_WARNING "ip6t_HL: can only be called from "
+			"\"mangle\" table, not \"%s\"\n", tablename);
+		return 0;
+	}
+
+	if (info->mode > IP6T_HL_MAXMODE) {
+		printk(KERN_WARNING "ip6t_HL: invalid or unknown Mode %u\n", 
+			info->mode);
+		return 0;
+	}
+
+	if ((info->mode != IP6T_HL_SET) && (info->hop_limit == 0)) {
+		printk(KERN_WARNING "ip6t_HL: increment/decrement doesn't "
+			"make sense with value 0\n");
+		return 0;
+	}
+	
+	return 1;
+}
+
+static struct ip6t_target ip6t_HL = { 
+	.name 		= "HL", 
+	.target		= ip6t_hl_target, 
+	.checkentry	= ip6t_hl_checkentry, 
+	.me		= THIS_MODULE
+};
+
+static int __init init(void)
+{
+	return ip6t_register_target(&ip6t_HL);
+}
+
+static void __exit fini(void)
+{
+	ip6t_unregister_target(&ip6t_HL);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
index a692e26a4fa..0cd1d1bd903 100644
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -26,10 +26,6 @@ MODULE_AUTHOR("Jan Rekorajski <baggins@pld.org.pl>");
 MODULE_DESCRIPTION("IP6 tables LOG target module");
 MODULE_LICENSE("GPL");
 
-static unsigned int nflog = 1;
-module_param(nflog, int, 0400);
-MODULE_PARM_DESC(nflog, "register as internal netfilter logging module");
- 
 struct in_device;
 #include <net/route.h>
 #include <linux/netfilter_ipv6/ip6t_LOG.h>
@@ -44,7 +40,7 @@ struct in_device;
 static DEFINE_SPINLOCK(log_lock);
 
 /* One level of recursion won't kill us */
-static void dump_packet(const struct ip6t_log_info *info,
+static void dump_packet(const struct nf_loginfo *info,
 			const struct sk_buff *skb, unsigned int ip6hoff,
 			int recurse)
 {
@@ -53,6 +49,12 @@ static void dump_packet(const struct ip6t_log_info *info,
 	struct ipv6hdr _ip6h, *ih;
 	unsigned int ptr;
 	unsigned int hdrlen = 0;
+	unsigned int logflags;
+
+	if (info->type == NF_LOG_TYPE_LOG)
+		logflags = info->u.log.logflags;
+	else
+		logflags = NF_LOG_MASK;
 
 	ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h);
 	if (ih == NULL) {
@@ -84,7 +86,7 @@ static void dump_packet(const struct ip6t_log_info *info,
 		}
 
 		/* Max length: 48 "OPT (...) " */
-		if (info->logflags & IP6T_LOG_IPOPT)
+		if (logflags & IP6T_LOG_IPOPT)
 			printk("OPT ( ");
 
 		switch (currenthdr) {
@@ -119,7 +121,7 @@ static void dump_packet(const struct ip6t_log_info *info,
 		case IPPROTO_ROUTING:
 		case IPPROTO_HOPOPTS:
 			if (fragment) {
-				if (info->logflags & IP6T_LOG_IPOPT)
+				if (logflags & IP6T_LOG_IPOPT)
 					printk(")");
 				return;
 			}
@@ -127,7 +129,7 @@ static void dump_packet(const struct ip6t_log_info *info,
 			break;
 		/* Max Length */
 		case IPPROTO_AH:
-			if (info->logflags & IP6T_LOG_IPOPT) {
+			if (logflags & IP6T_LOG_IPOPT) {
 				struct ip_auth_hdr _ahdr, *ah;
 
 				/* Max length: 3 "AH " */
@@ -158,7 +160,7 @@ static void dump_packet(const struct ip6t_log_info *info,
 			hdrlen = (hp->hdrlen+2)<<2;
 			break;
 		case IPPROTO_ESP:
-			if (info->logflags & IP6T_LOG_IPOPT) {
+			if (logflags & IP6T_LOG_IPOPT) {
 				struct ip_esp_hdr _esph, *eh;
 
 				/* Max length: 4 "ESP " */
@@ -190,7 +192,7 @@ static void dump_packet(const struct ip6t_log_info *info,
 			printk("Unknown Ext Hdr %u", currenthdr);
 			return;
 		}
-		if (info->logflags & IP6T_LOG_IPOPT)
+		if (logflags & IP6T_LOG_IPOPT)
 			printk(") ");
 
 		currenthdr = hp->nexthdr;
@@ -218,7 +220,7 @@ static void dump_packet(const struct ip6t_log_info *info,
 		printk("SPT=%u DPT=%u ",
 		       ntohs(th->source), ntohs(th->dest));
 		/* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
-		if (info->logflags & IP6T_LOG_TCPSEQ)
+		if (logflags & IP6T_LOG_TCPSEQ)
 			printk("SEQ=%u ACK=%u ",
 			       ntohl(th->seq), ntohl(th->ack_seq));
 		/* Max length: 13 "WINDOW=65535 " */
@@ -245,7 +247,7 @@ static void dump_packet(const struct ip6t_log_info *info,
 		/* Max length: 11 "URGP=65535 " */
 		printk("URGP=%u ", ntohs(th->urg_ptr));
 
-		if ((info->logflags & IP6T_LOG_TCPOPT)
+		if ((logflags & IP6T_LOG_TCPOPT)
 		    && th->doff * 4 > sizeof(struct tcphdr)) {
 			u_int8_t _opt[60 - sizeof(struct tcphdr)], *op;
 			unsigned int i;
@@ -349,7 +351,7 @@ static void dump_packet(const struct ip6t_log_info *info,
 	}
 
 	/* Max length: 15 "UID=4294967295 " */
-	if ((info->logflags & IP6T_LOG_UID) && recurse && skb->sk) {
+	if ((logflags & IP6T_LOG_UID) && recurse && skb->sk) {
 		read_lock_bh(&skb->sk->sk_callback_lock);
 		if (skb->sk->sk_socket && skb->sk->sk_socket->file)
 			printk("UID=%u ", skb->sk->sk_socket->file->f_uid);
@@ -357,19 +359,31 @@ static void dump_packet(const struct ip6t_log_info *info,
 	}
 }
 
+static struct nf_loginfo default_loginfo = {
+	.type	= NF_LOG_TYPE_LOG,
+	.u = {
+		.log = {
+			.level	  = 0,
+			.logflags = NF_LOG_MASK,
+		},
+	},
+};
+
 static void
-ip6t_log_packet(unsigned int hooknum,
+ip6t_log_packet(unsigned int pf,
+		unsigned int hooknum,
 		const struct sk_buff *skb,
 		const struct net_device *in,
 		const struct net_device *out,
-		const struct ip6t_log_info *loginfo,
-		const char *level_string,
+		const struct nf_loginfo *loginfo,
 		const char *prefix)
 {
+	if (!loginfo)
+		loginfo = &default_loginfo;
+
 	spin_lock_bh(&log_lock);
-	printk(level_string);
-	printk("%sIN=%s OUT=%s ",
-		prefix == NULL ? loginfo->prefix : prefix,
+	printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, 
+		prefix,
 		in ? in->name : "",
 		out ? out->name : "");
 	if (in && !out) {
@@ -416,29 +430,17 @@ ip6t_log_target(struct sk_buff **pskb,
 		void *userinfo)
 {
 	const struct ip6t_log_info *loginfo = targinfo;
-	char level_string[4] = "< >";
+	struct nf_loginfo li;
+
+	li.type = NF_LOG_TYPE_LOG;
+	li.u.log.level = loginfo->level;
+	li.u.log.logflags = loginfo->logflags;
 
-	level_string[1] = '0' + (loginfo->level % 8);
-	ip6t_log_packet(hooknum, *pskb, in, out, loginfo, level_string, NULL);
+	nf_log_packet(PF_INET6, hooknum, *pskb, in, out, &li, loginfo->prefix);
 
 	return IP6T_CONTINUE;
 }
 
-static void
-ip6t_logfn(unsigned int hooknum,
-	   const struct sk_buff *skb,
-	   const struct net_device *in,
-	   const struct net_device *out,
-	   const char *prefix)
-{
-	struct ip6t_log_info loginfo = {
-		.level = 0,
-		.logflags = IP6T_LOG_MASK,
-		.prefix = ""
-	};
-
-	ip6t_log_packet(hooknum, skb, in, out, &loginfo, KERN_WARNING, prefix);
-}
 
 static int ip6t_log_checkentry(const char *tablename,
 			       const struct ip6t_entry *e,
@@ -475,20 +477,29 @@ static struct ip6t_target ip6t_log_reg = {
 	.me 		= THIS_MODULE,
 };
 
+static struct nf_logger ip6t_logger = {
+	.name		= "ip6t_LOG",
+	.logfn		= &ip6t_log_packet,
+	.me		= THIS_MODULE,
+};
+
 static int __init init(void)
 {
 	if (ip6t_register_target(&ip6t_log_reg))
 		return -EINVAL;
-	if (nflog)
-		nf_log_register(PF_INET6, &ip6t_logfn);
+	if (nf_log_register(PF_INET6, &ip6t_logger) < 0) {
+		printk(KERN_WARNING "ip6t_LOG: not logging via system console "
+		       "since somebody else already registered for PF_INET6\n");
+		/* we cannot make module load fail here, since otherwise
+		 * ip6tables userspace would abort */
+	}
 
 	return 0;
 }
 
 static void __exit fini(void)
 {
-	if (nflog)
-		nf_log_unregister(PF_INET6, &ip6t_logfn);
+	nf_log_unregister_logger(&ip6t_logger);
 	ip6t_unregister_target(&ip6t_log_reg);
 }
 
diff --git a/net/ipv6/netfilter/ip6t_MARK.c b/net/ipv6/netfilter/ip6t_MARK.c
index d09ceb05013..81924fcc585 100644
--- a/net/ipv6/netfilter/ip6t_MARK.c
+++ b/net/ipv6/netfilter/ip6t_MARK.c
@@ -28,10 +28,9 @@ target(struct sk_buff **pskb,
 {
 	const struct ip6t_mark_target_info *markinfo = targinfo;
 
-	if((*pskb)->nfmark != markinfo->mark) {
+	if((*pskb)->nfmark != markinfo->mark)
 		(*pskb)->nfmark = markinfo->mark;
-		(*pskb)->nfcache |= NFC_ALTERED;
-	}
+
 	return IP6T_CONTINUE;
 }
 
diff --git a/net/ipv6/netfilter/ip6t_NFQUEUE.c b/net/ipv6/netfilter/ip6t_NFQUEUE.c
new file mode 100644
index 00000000000..c6e3730e740
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_NFQUEUE.c
@@ -0,0 +1,70 @@
+/* ip6tables module for using new netfilter netlink queue
+ *
+ * (C) 2005 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as 
+ * published by the Free Software Foundation.
+ * 
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter_ipv4/ipt_NFQUEUE.h>
+
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("ip6tables NFQUEUE target");
+MODULE_LICENSE("GPL");
+
+static unsigned int
+target(struct sk_buff **pskb,
+       const struct net_device *in,
+       const struct net_device *out,
+       unsigned int hooknum,
+       const void *targinfo,
+       void *userinfo)
+{
+	const struct ipt_NFQ_info *tinfo = targinfo;
+
+	return NF_QUEUE_NR(tinfo->queuenum);
+}
+
+static int
+checkentry(const char *tablename,
+	   const struct ip6t_entry *e,
+           void *targinfo,
+           unsigned int targinfosize,
+           unsigned int hook_mask)
+{
+	if (targinfosize != IP6T_ALIGN(sizeof(struct ipt_NFQ_info))) {
+		printk(KERN_WARNING "NFQUEUE: targinfosize %u != %Zu\n",
+		       targinfosize,
+		       IP6T_ALIGN(sizeof(struct ipt_NFQ_info)));
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct ip6t_target ipt_NFQ_reg = {
+	.name		= "NFQUEUE",
+	.target		= target,
+	.checkentry	= checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ip6t_register_target(&ipt_NFQ_reg);
+}
+
+static void __exit fini(void)
+{
+	ip6t_unregister_target(&ipt_NFQ_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c
new file mode 100644
index 00000000000..14316c3ebde
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_REJECT.c
@@ -0,0 +1,284 @@
+/*
+ * IP6 tables REJECT target module
+ * Linux INET6 implementation
+ *
+ * Copyright (C)2003 USAGI/WIDE Project
+ *
+ * Authors:
+ *	Yasuyuki Kozakai	<yasuyuki.kozakai@toshiba.co.jp>
+ *
+ * Based on net/ipv4/netfilter/ipt_REJECT.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/icmpv6.h>
+#include <linux/netdevice.h>
+#include <net/ipv6.h>
+#include <net/tcp.h>
+#include <net/icmp.h>
+#include <net/ip6_checksum.h>
+#include <net/ip6_fib.h>
+#include <net/ip6_route.h>
+#include <net/flow.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter_ipv6/ip6t_REJECT.h>
+
+MODULE_AUTHOR("Yasuyuki KOZAKAI <yasuyuki.kozakai@toshiba.co.jp>");
+MODULE_DESCRIPTION("IP6 tables REJECT target module");
+MODULE_LICENSE("GPL");
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+/* Send RST reply */
+static void send_reset(struct sk_buff *oldskb)
+{
+	struct sk_buff *nskb;
+	struct tcphdr otcph, *tcph;
+	unsigned int otcplen, hh_len;
+	int tcphoff, needs_ack;
+	struct ipv6hdr *oip6h = oldskb->nh.ipv6h, *ip6h;
+	struct dst_entry *dst = NULL;
+	u8 proto;
+	struct flowi fl;
+
+	if ((!(ipv6_addr_type(&oip6h->saddr) & IPV6_ADDR_UNICAST)) ||
+	    (!(ipv6_addr_type(&oip6h->daddr) & IPV6_ADDR_UNICAST))) {
+		DEBUGP("ip6t_REJECT: addr is not unicast.\n");
+		return;
+	}
+
+	proto = oip6h->nexthdr;
+	tcphoff = ipv6_skip_exthdr(oldskb, ((u8*)(oip6h+1) - oldskb->data), &proto);
+
+	if ((tcphoff < 0) || (tcphoff > oldskb->len)) {
+		DEBUGP("ip6t_REJECT: Can't get TCP header.\n");
+		return;
+	}
+
+	otcplen = oldskb->len - tcphoff;
+
+	/* IP header checks: fragment, too short. */
+	if ((proto != IPPROTO_TCP) || (otcplen < sizeof(struct tcphdr))) {
+		DEBUGP("ip6t_REJECT: proto(%d) != IPPROTO_TCP, or too short. otcplen = %d\n",
+			proto, otcplen);
+		return;
+	}
+
+	if (skb_copy_bits(oldskb, tcphoff, &otcph, sizeof(struct tcphdr)))
+		BUG();
+
+	/* No RST for RST. */
+	if (otcph.rst) {
+		DEBUGP("ip6t_REJECT: RST is set\n");
+		return;
+	}
+
+	/* Check checksum. */
+	if (csum_ipv6_magic(&oip6h->saddr, &oip6h->daddr, otcplen, IPPROTO_TCP,
+			    skb_checksum(oldskb, tcphoff, otcplen, 0))) {
+		DEBUGP("ip6t_REJECT: TCP checksum is invalid\n");
+		return;
+	}
+
+	memset(&fl, 0, sizeof(fl));
+	fl.proto = IPPROTO_TCP;
+	ipv6_addr_copy(&fl.fl6_src, &oip6h->daddr);
+	ipv6_addr_copy(&fl.fl6_dst, &oip6h->saddr);
+	fl.fl_ip_sport = otcph.dest;
+	fl.fl_ip_dport = otcph.source;
+	dst = ip6_route_output(NULL, &fl);
+	if (dst == NULL)
+		return;
+	if (dst->error ||
+	    xfrm_lookup(&dst, &fl, NULL, 0)) {
+		dst_release(dst);
+		return;
+	}
+
+	hh_len = (dst->dev->hard_header_len + 15)&~15;
+	nskb = alloc_skb(hh_len + 15 + dst->header_len + sizeof(struct ipv6hdr)
+			 + sizeof(struct tcphdr) + dst->trailer_len,
+			 GFP_ATOMIC);
+
+	if (!nskb) {
+		if (net_ratelimit())
+			printk("ip6t_REJECT: Can't alloc skb\n");
+		dst_release(dst);
+		return;
+	}
+
+	nskb->dst = dst;
+
+	skb_reserve(nskb, hh_len + dst->header_len);
+
+	ip6h = nskb->nh.ipv6h = (struct ipv6hdr *)
+					skb_put(nskb, sizeof(struct ipv6hdr));
+	ip6h->version = 6;
+	ip6h->hop_limit = dst_metric(dst, RTAX_HOPLIMIT);
+	ip6h->nexthdr = IPPROTO_TCP;
+	ip6h->payload_len = htons(sizeof(struct tcphdr));
+	ipv6_addr_copy(&ip6h->saddr, &oip6h->daddr);
+	ipv6_addr_copy(&ip6h->daddr, &oip6h->saddr);
+
+	tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr));
+	/* Truncate to length (no data) */
+	tcph->doff = sizeof(struct tcphdr)/4;
+	tcph->source = otcph.dest;
+	tcph->dest = otcph.source;
+
+	if (otcph.ack) {
+		needs_ack = 0;
+		tcph->seq = otcph.ack_seq;
+		tcph->ack_seq = 0;
+	} else {
+		needs_ack = 1;
+		tcph->ack_seq = htonl(ntohl(otcph.seq) + otcph.syn + otcph.fin
+				      + otcplen - (otcph.doff<<2));
+		tcph->seq = 0;
+	}
+
+	/* Reset flags */
+	((u_int8_t *)tcph)[13] = 0;
+	tcph->rst = 1;
+	tcph->ack = needs_ack;
+	tcph->window = 0;
+	tcph->urg_ptr = 0;
+	tcph->check = 0;
+
+	/* Adjust TCP checksum */
+	tcph->check = csum_ipv6_magic(&nskb->nh.ipv6h->saddr,
+				      &nskb->nh.ipv6h->daddr,
+				      sizeof(struct tcphdr), IPPROTO_TCP,
+				      csum_partial((char *)tcph,
+						   sizeof(struct tcphdr), 0));
+
+	NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, nskb, NULL, nskb->dst->dev,
+		dst_output);
+}
+
+static inline void
+send_unreach(struct sk_buff *skb_in, unsigned char code, unsigned int hooknum)
+{
+	if (hooknum == NF_IP6_LOCAL_OUT && skb_in->dev == NULL)
+		skb_in->dev = &loopback_dev;
+
+	icmpv6_send(skb_in, ICMPV6_DEST_UNREACH, code, 0, NULL);
+}
+
+static unsigned int reject6_target(struct sk_buff **pskb,
+			   const struct net_device *in,
+			   const struct net_device *out,
+			   unsigned int hooknum,
+			   const void *targinfo,
+			   void *userinfo)
+{
+	const struct ip6t_reject_info *reject = targinfo;
+
+	DEBUGP(KERN_DEBUG "%s: medium point\n", __FUNCTION__);
+	/* WARNING: This code causes reentry within ip6tables.
+	   This means that the ip6tables jump stack is now crap.  We
+	   must return an absolute verdict. --RR */
+    	switch (reject->with) {
+    	case IP6T_ICMP6_NO_ROUTE:
+    		send_unreach(*pskb, ICMPV6_NOROUTE, hooknum);
+    		break;
+    	case IP6T_ICMP6_ADM_PROHIBITED:
+    		send_unreach(*pskb, ICMPV6_ADM_PROHIBITED, hooknum);
+    		break;
+    	case IP6T_ICMP6_NOT_NEIGHBOUR:
+    		send_unreach(*pskb, ICMPV6_NOT_NEIGHBOUR, hooknum);
+    		break;
+    	case IP6T_ICMP6_ADDR_UNREACH:
+    		send_unreach(*pskb, ICMPV6_ADDR_UNREACH, hooknum);
+    		break;
+    	case IP6T_ICMP6_PORT_UNREACH:
+    		send_unreach(*pskb, ICMPV6_PORT_UNREACH, hooknum);
+    		break;
+    	case IP6T_ICMP6_ECHOREPLY:
+		/* Do nothing */
+		break;
+	case IP6T_TCP_RESET:
+		send_reset(*pskb);
+		break;
+	default:
+		if (net_ratelimit())
+			printk(KERN_WARNING "ip6t_REJECT: case %u not handled yet\n", reject->with);
+		break;
+	}
+
+	return NF_DROP;
+}
+
+static int check(const char *tablename,
+		 const struct ip6t_entry *e,
+		 void *targinfo,
+		 unsigned int targinfosize,
+		 unsigned int hook_mask)
+{
+ 	const struct ip6t_reject_info *rejinfo = targinfo;
+
+ 	if (targinfosize != IP6T_ALIGN(sizeof(struct ip6t_reject_info))) {
+  		DEBUGP("ip6t_REJECT: targinfosize %u != 0\n", targinfosize);
+  		return 0;
+  	}
+
+	/* Only allow these for packet filtering. */
+	if (strcmp(tablename, "filter") != 0) {
+		DEBUGP("ip6t_REJECT: bad table `%s'.\n", tablename);
+		return 0;
+	}
+
+	if ((hook_mask & ~((1 << NF_IP6_LOCAL_IN)
+			   | (1 << NF_IP6_FORWARD)
+			   | (1 << NF_IP6_LOCAL_OUT))) != 0) {
+		DEBUGP("ip6t_REJECT: bad hook mask %X\n", hook_mask);
+		return 0;
+	}
+
+	if (rejinfo->with == IP6T_ICMP6_ECHOREPLY) {
+		printk("ip6t_REJECT: ECHOREPLY is not supported.\n");
+		return 0;
+	} else if (rejinfo->with == IP6T_TCP_RESET) {
+		/* Must specify that it's a TCP packet */
+		if (e->ipv6.proto != IPPROTO_TCP
+		    || (e->ipv6.invflags & IP6T_INV_PROTO)) {
+			DEBUGP("ip6t_REJECT: TCP_RESET illegal for non-tcp\n");
+			return 0;
+		}
+	}
+
+	return 1;
+}
+
+static struct ip6t_target ip6t_reject_reg = {
+	.name		= "REJECT",
+	.target		= reject6_target,
+	.checkentry	= check,
+	.me		= THIS_MODULE
+};
+
+static int __init init(void)
+{
+	if (ip6t_register_target(&ip6t_reject_reg))
+		return -EINVAL;
+	return 0;
+}
+
+static void __exit fini(void)
+{
+	ip6t_unregister_target(&ip6t_reject_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv6/netfilter/ip6t_owner.c b/net/ipv6/netfilter/ip6t_owner.c
index ab0e32d3de4..9b91decbfdd 100644
--- a/net/ipv6/netfilter/ip6t_owner.c
+++ b/net/ipv6/netfilter/ip6t_owner.c
@@ -20,71 +20,6 @@ MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
 MODULE_DESCRIPTION("IP6 tables owner matching module");
 MODULE_LICENSE("GPL");
 
-static int
-match_pid(const struct sk_buff *skb, pid_t pid)
-{
-	struct task_struct *p;
-	struct files_struct *files;
-	int i;
-
-	read_lock(&tasklist_lock);
-	p = find_task_by_pid(pid);
-	if (!p)
-		goto out;
-	task_lock(p);
-	files = p->files;
-	if(files) {
-		spin_lock(&files->file_lock);
-		for (i=0; i < files->max_fds; i++) {
-			if (fcheck_files(files, i) == skb->sk->sk_socket->file) {
-				spin_unlock(&files->file_lock);
-				task_unlock(p);
-				read_unlock(&tasklist_lock);
-				return 1;
-			}
-		}
-		spin_unlock(&files->file_lock);
-	}
-	task_unlock(p);
-out:
-	read_unlock(&tasklist_lock);
-	return 0;
-}
-
-static int
-match_sid(const struct sk_buff *skb, pid_t sid)
-{
-	struct task_struct *g, *p;
-	struct file *file = skb->sk->sk_socket->file;
-	int i, found=0;
-
-	read_lock(&tasklist_lock);
-	do_each_thread(g, p) {
-		struct files_struct *files;
-		if (p->signal->session != sid)
-			continue;
-
-		task_lock(p);
-		files = p->files;
-		if (files) {
-			spin_lock(&files->file_lock);
-			for (i=0; i < files->max_fds; i++) {
-				if (fcheck_files(files, i) == file) {
-					found = 1;
-					break;
-				}
-			}
-			spin_unlock(&files->file_lock);
-		}
-		task_unlock(p);
-		if (found)
-			goto out;
-	} while_each_thread(g, p);
-out:
-	read_unlock(&tasklist_lock);
-
-	return found;
-}
 
 static int
 match(const struct sk_buff *skb,
@@ -112,18 +47,6 @@ match(const struct sk_buff *skb,
 			return 0;
 	}
 
-	if(info->match & IP6T_OWNER_PID) {
-		if (!match_pid(skb, info->pid) ^
-		    !!(info->invert & IP6T_OWNER_PID))
-			return 0;
-	}
-
-	if(info->match & IP6T_OWNER_SID) {
-		if (!match_sid(skb, info->sid) ^
-		    !!(info->invert & IP6T_OWNER_SID))
-			return 0;
-	}
-
 	return 1;
 }
 
@@ -134,6 +57,8 @@ checkentry(const char *tablename,
            unsigned int matchsize,
            unsigned int hook_mask)
 {
+	const struct ip6t_owner_info *info = matchinfo;
+
         if (hook_mask
             & ~((1 << NF_IP6_LOCAL_OUT) | (1 << NF_IP6_POST_ROUTING))) {
                 printk("ip6t_owner: only valid for LOCAL_OUT or POST_ROUTING.\n");
@@ -142,14 +67,13 @@ checkentry(const char *tablename,
 
 	if (matchsize != IP6T_ALIGN(sizeof(struct ip6t_owner_info)))
 		return 0;
-#ifdef CONFIG_SMP
-	/* files->file_lock can not be used in a BH */
-	if (((struct ip6t_owner_info *)matchinfo)->match
-	    & (IP6T_OWNER_PID|IP6T_OWNER_SID)) {
-		printk("ip6t_owner: pid and sid matching is broken on SMP.\n");
+
+	if (info->match & (IP6T_OWNER_PID|IP6T_OWNER_SID)) {
+		printk("ipt_owner: pid and sid matching "
+		       "not supported anymore\n");
 		return 0;
 	}
-#endif
+
 	return 1;
 }
 
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 1d4d75b34d3..7a5863298f3 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -49,6 +49,7 @@
 #include <net/transp_v6.h>
 #include <net/udp.h>
 #include <net/inet_common.h>
+#include <net/tcp_states.h>
 
 #include <net/rawv6.h>
 #include <net/xfrm.h>
@@ -81,7 +82,8 @@ static void raw_v6_unhash(struct sock *sk)
 
 /* Grumble... icmp and ip_input want to get at this... */
 struct sock *__raw_v6_lookup(struct sock *sk, unsigned short num,
-			     struct in6_addr *loc_addr, struct in6_addr *rmt_addr)
+			     struct in6_addr *loc_addr, struct in6_addr *rmt_addr,
+			     int dif)
 {
 	struct hlist_node *node;
 	int is_multicast = ipv6_addr_is_multicast(loc_addr);
@@ -94,6 +96,9 @@ struct sock *__raw_v6_lookup(struct sock *sk, unsigned short num,
 			    !ipv6_addr_equal(&np->daddr, rmt_addr))
 				continue;
 
+			if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)
+				continue;
+
 			if (!ipv6_addr_any(&np->rcv_saddr)) {
 				if (ipv6_addr_equal(&np->rcv_saddr, loc_addr))
 					goto found;
@@ -137,11 +142,12 @@ static __inline__ int icmpv6_filter(struct sock *sk, struct sk_buff *skb)
  *
  *	Caller owns SKB so we must make clones.
  */
-void ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
+int ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
 {
 	struct in6_addr *saddr;
 	struct in6_addr *daddr;
 	struct sock *sk;
+	int delivered = 0;
 	__u8 hash;
 
 	saddr = &skb->nh.ipv6h->saddr;
@@ -160,9 +166,10 @@ void ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
 	if (sk == NULL)
 		goto out;
 
-	sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr);
+	sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr, skb->dev->ifindex);
 
 	while (sk) {
+		delivered = 1;
 		if (nexthdr != IPPROTO_ICMPV6 || !icmpv6_filter(sk, skb)) {
 			struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
 
@@ -170,10 +177,12 @@ void ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
 			if (clone)
 				rawv6_rcv(sk, clone);
 		}
-		sk = __raw_v6_lookup(sk_next(sk), nexthdr, daddr, saddr);
+		sk = __raw_v6_lookup(sk_next(sk), nexthdr, daddr, saddr,
+				     skb->dev->ifindex);
 	}
 out:
 	read_unlock(&raw_v6_lock);
+	return delivered;
 }
 
 /* This cleans up af_inet6 a bit. -DaveM */
@@ -334,8 +343,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb)
 			if (csum_ipv6_magic(&skb->nh.ipv6h->saddr,
 					    &skb->nh.ipv6h->daddr,
 					    skb->len, inet->num, skb->csum)) {
-				LIMIT_NETDEBUG(
-			        printk(KERN_DEBUG "raw v6 hw csum failure.\n"));
+				LIMIT_NETDEBUG(KERN_DEBUG "raw v6 hw csum failure.\n");
 				skb->ip_summed = CHECKSUM_NONE;
 			}
 		}
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 59e7c631787..9d9e04344c7 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -562,7 +562,7 @@ static void ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
 	if (skb->dev)
 		fq->iif = skb->dev->ifindex;
 	skb->dev = NULL;
-	fq->stamp = skb->stamp;
+	skb_get_timestamp(skb, &fq->stamp);
 	fq->meat += skb->len;
 	atomic_add(skb->truesize, &ip6_frag_mem);
 
@@ -664,7 +664,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff **skb_in,
 
 	head->next = NULL;
 	head->dev = dev;
-	head->stamp = fq->stamp;
+	skb_set_timestamp(head, &fq->stamp);
 	head->nh.ipv6h->payload_len = htons(payload_len);
 
 	*skb_in = head;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 878789b3122..5d5bbb49ec7 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1372,7 +1372,7 @@ int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
  *	Drop the packet on the floor
  */
 
-int ip6_pkt_discard(struct sk_buff *skb)
+static int ip6_pkt_discard(struct sk_buff *skb)
 {
 	IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
@@ -1380,7 +1380,7 @@ int ip6_pkt_discard(struct sk_buff *skb)
 	return 0;
 }
 
-int ip6_pkt_discard_out(struct sk_buff *skb)
+static int ip6_pkt_discard_out(struct sk_buff *skb)
 {
 	skb->dev = skb->dst->dev;
 	return ip6_pkt_discard(skb);
@@ -1850,16 +1850,16 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh,
 	
 	skb = alloc_skb(size, gfp_any());
 	if (!skb) {
-		netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, ENOBUFS);
+		netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, ENOBUFS);
 		return;
 	}
 	if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0) < 0) {
 		kfree_skb(skb);
-		netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, EINVAL);
+		netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, EINVAL);
 		return;
 	}
-	NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_ROUTE;
-	netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_ROUTE, gfp_any());
+	NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_ROUTE;
+	netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_ROUTE, gfp_any());
 }
 
 /*
@@ -1960,8 +1960,6 @@ static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
 	return arg.len;
 }
 
-extern struct rt6_statistics rt6_stats;
-
 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
 {
 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index e553e5b80d6..c3123c9e1a8 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -770,7 +770,7 @@ static int ipip6_tunnel_init(struct net_device *dev)
 	return 0;
 }
 
-int __init ipip6_fb_tunnel_init(struct net_device *dev)
+static int __init ipip6_fb_tunnel_init(struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = dev->priv;
 	struct iphdr *iph = &tunnel->parms.iph;
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index 3a18e0e6ffe..8eff9fa1e98 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -14,9 +14,6 @@
 #include <net/ipv6.h>
 #include <net/addrconf.h>
 
-extern ctl_table ipv6_route_table[];
-extern ctl_table ipv6_icmp_table[];
-
 #ifdef CONFIG_SYSCTL
 
 static ctl_table ipv6_table[] = {
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index ef29cfd936d..794734f1d23 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -47,6 +47,7 @@
 
 #include <net/tcp.h>
 #include <net/ndisc.h>
+#include <net/inet6_hashtables.h>
 #include <net/ipv6.h>
 #include <net/transp_v6.h>
 #include <net/addrconf.h>
@@ -75,34 +76,11 @@ static int	tcp_v6_xmit(struct sk_buff *skb, int ipfragok);
 static struct tcp_func ipv6_mapped;
 static struct tcp_func ipv6_specific;
 
-/* I have no idea if this is a good hash for v6 or not. -DaveM */
-static __inline__ int tcp_v6_hashfn(struct in6_addr *laddr, u16 lport,
-				    struct in6_addr *faddr, u16 fport)
+static inline int tcp_v6_bind_conflict(const struct sock *sk,
+				       const struct inet_bind_bucket *tb)
 {
-	int hashent = (lport ^ fport);
-
-	hashent ^= (laddr->s6_addr32[3] ^ faddr->s6_addr32[3]);
-	hashent ^= hashent>>16;
-	hashent ^= hashent>>8;
-	return (hashent & (tcp_ehash_size - 1));
-}
-
-static __inline__ int tcp_v6_sk_hashfn(struct sock *sk)
-{
-	struct inet_sock *inet = inet_sk(sk);
-	struct ipv6_pinfo *np = inet6_sk(sk);
-	struct in6_addr *laddr = &np->rcv_saddr;
-	struct in6_addr *faddr = &np->daddr;
-	__u16 lport = inet->num;
-	__u16 fport = inet->dport;
-	return tcp_v6_hashfn(laddr, lport, faddr, fport);
-}
-
-static inline int tcp_v6_bind_conflict(struct sock *sk,
-				       struct tcp_bind_bucket *tb)
-{
-	struct sock *sk2;
-	struct hlist_node *node;
+	const struct sock *sk2;
+	const struct hlist_node *node;
 
 	/* We must walk the whole port owner list in this case. -DaveM */
 	sk_for_each_bound(sk2, node, &tb->owners) {
@@ -126,8 +104,8 @@ static inline int tcp_v6_bind_conflict(struct sock *sk,
  */
 static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
 {
-	struct tcp_bind_hashbucket *head;
-	struct tcp_bind_bucket *tb;
+	struct inet_bind_hashbucket *head;
+	struct inet_bind_bucket *tb;
 	struct hlist_node *node;
 	int ret;
 
@@ -138,25 +116,25 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
 		int remaining = (high - low) + 1;
 		int rover;
 
-		spin_lock(&tcp_portalloc_lock);
-		if (tcp_port_rover < low)
+		spin_lock(&tcp_hashinfo.portalloc_lock);
+		if (tcp_hashinfo.port_rover < low)
 			rover = low;
 		else
-			rover = tcp_port_rover;
+			rover = tcp_hashinfo.port_rover;
 		do {	rover++;
 			if (rover > high)
 				rover = low;
-			head = &tcp_bhash[tcp_bhashfn(rover)];
+			head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)];
 			spin_lock(&head->lock);
-			tb_for_each(tb, node, &head->chain)
+			inet_bind_bucket_for_each(tb, node, &head->chain)
 				if (tb->port == rover)
 					goto next;
 			break;
 		next:
 			spin_unlock(&head->lock);
 		} while (--remaining > 0);
-		tcp_port_rover = rover;
-		spin_unlock(&tcp_portalloc_lock);
+		tcp_hashinfo.port_rover = rover;
+		spin_unlock(&tcp_hashinfo.portalloc_lock);
 
 		/* Exhausted local port range during search?  It is not
 		 * possible for us to be holding one of the bind hash
@@ -171,9 +149,9 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
 		/* OK, here is the one we will use. */
 		snum = rover;
 	} else {
-		head = &tcp_bhash[tcp_bhashfn(snum)];
+		head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
 		spin_lock(&head->lock);
-		tb_for_each(tb, node, &head->chain)
+		inet_bind_bucket_for_each(tb, node, &head->chain)
 			if (tb->port == snum)
 				goto tb_found;
 	}
@@ -192,8 +170,11 @@ tb_found:
 	}
 tb_not_found:
 	ret = 1;
-	if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
-		goto fail_unlock;
+	if (tb == NULL) {
+	       	tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, snum);
+		if (tb == NULL)
+			goto fail_unlock;
+	}
 	if (hlist_empty(&tb->owners)) {
 		if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
 			tb->fastreuse = 1;
@@ -204,9 +185,9 @@ tb_not_found:
 		tb->fastreuse = 0;
 
 success:
-	if (!tcp_sk(sk)->bind_hash)
-		tcp_bind_hash(sk, tb, snum);
-	BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
+	if (!inet_csk(sk)->icsk_bind_hash)
+		inet_bind_hash(sk, tb, snum);
+	BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb);
 	ret = 0;
 
 fail_unlock:
@@ -224,13 +205,13 @@ static __inline__ void __tcp_v6_hash(struct sock *sk)
 	BUG_TRAP(sk_unhashed(sk));
 
 	if (sk->sk_state == TCP_LISTEN) {
-		list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
-		lock = &tcp_lhash_lock;
-		tcp_listen_wlock();
+		list = &tcp_hashinfo.listening_hash[inet_sk_listen_hashfn(sk)];
+		lock = &tcp_hashinfo.lhash_lock;
+		inet_listen_wlock(&tcp_hashinfo);
 	} else {
-		sk->sk_hashent = tcp_v6_sk_hashfn(sk);
-		list = &tcp_ehash[sk->sk_hashent].chain;
-		lock = &tcp_ehash[sk->sk_hashent].lock;
+		sk->sk_hashent = inet6_sk_ehashfn(sk, tcp_hashinfo.ehash_size);
+		list = &tcp_hashinfo.ehash[sk->sk_hashent].chain;
+		lock = &tcp_hashinfo.ehash[sk->sk_hashent].lock;
 		write_lock(lock);
 	}
 
@@ -255,131 +236,11 @@ static void tcp_v6_hash(struct sock *sk)
 	}
 }
 
-static struct sock *tcp_v6_lookup_listener(struct in6_addr *daddr, unsigned short hnum, int dif)
-{
-	struct sock *sk;
-	struct hlist_node *node;
-	struct sock *result = NULL;
-	int score, hiscore;
-
-	hiscore=0;
-	read_lock(&tcp_lhash_lock);
-	sk_for_each(sk, node, &tcp_listening_hash[tcp_lhashfn(hnum)]) {
-		if (inet_sk(sk)->num == hnum && sk->sk_family == PF_INET6) {
-			struct ipv6_pinfo *np = inet6_sk(sk);
-			
-			score = 1;
-			if (!ipv6_addr_any(&np->rcv_saddr)) {
-				if (!ipv6_addr_equal(&np->rcv_saddr, daddr))
-					continue;
-				score++;
-			}
-			if (sk->sk_bound_dev_if) {
-				if (sk->sk_bound_dev_if != dif)
-					continue;
-				score++;
-			}
-			if (score == 3) {
-				result = sk;
-				break;
-			}
-			if (score > hiscore) {
-				hiscore = score;
-				result = sk;
-			}
-		}
-	}
-	if (result)
-		sock_hold(result);
-	read_unlock(&tcp_lhash_lock);
-	return result;
-}
-
-/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
- * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
- *
- * The sockhash lock must be held as a reader here.
- */
-
-static inline struct sock *__tcp_v6_lookup_established(struct in6_addr *saddr, u16 sport,
-						       struct in6_addr *daddr, u16 hnum,
-						       int dif)
-{
-	struct tcp_ehash_bucket *head;
-	struct sock *sk;
-	struct hlist_node *node;
-	__u32 ports = TCP_COMBINED_PORTS(sport, hnum);
-	int hash;
-
-	/* Optimize here for direct hit, only listening connections can
-	 * have wildcards anyways.
-	 */
-	hash = tcp_v6_hashfn(daddr, hnum, saddr, sport);
-	head = &tcp_ehash[hash];
-	read_lock(&head->lock);
-	sk_for_each(sk, node, &head->chain) {
-		/* For IPV6 do the cheaper port and family tests first. */
-		if(TCP_IPV6_MATCH(sk, saddr, daddr, ports, dif))
-			goto hit; /* You sunk my battleship! */
-	}
-	/* Must check for a TIME_WAIT'er before going to listener hash. */
-	sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
-		/* FIXME: acme: check this... */
-		struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
-
-		if(*((__u32 *)&(tw->tw_dport))	== ports	&&
-		   sk->sk_family		== PF_INET6) {
-			if(ipv6_addr_equal(&tw->tw_v6_daddr, saddr)	&&
-			   ipv6_addr_equal(&tw->tw_v6_rcv_saddr, daddr)	&&
-			   (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dif))
-				goto hit;
-		}
-	}
-	read_unlock(&head->lock);
-	return NULL;
-
-hit:
-	sock_hold(sk);
-	read_unlock(&head->lock);
-	return sk;
-}
-
-
-static inline struct sock *__tcp_v6_lookup(struct in6_addr *saddr, u16 sport,
-					   struct in6_addr *daddr, u16 hnum,
-					   int dif)
-{
-	struct sock *sk;
-
-	sk = __tcp_v6_lookup_established(saddr, sport, daddr, hnum, dif);
-
-	if (sk)
-		return sk;
-
-	return tcp_v6_lookup_listener(daddr, hnum, dif);
-}
-
-inline struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport,
-				  struct in6_addr *daddr, u16 dport,
-				  int dif)
-{
-	struct sock *sk;
-
-	local_bh_disable();
-	sk = __tcp_v6_lookup(saddr, sport, daddr, ntohs(dport), dif);
-	local_bh_enable();
-
-	return sk;
-}
-
-EXPORT_SYMBOL_GPL(tcp_v6_lookup);
-
-
 /*
  * Open request hash tables.
  */
 
-static u32 tcp_v6_synq_hash(struct in6_addr *raddr, u16 rport, u32 rnd)
+static u32 tcp_v6_synq_hash(const struct in6_addr *raddr, const u16 rport, const u32 rnd)
 {
 	u32 a, b, c;
 
@@ -399,14 +260,15 @@ static u32 tcp_v6_synq_hash(struct in6_addr *raddr, u16 rport, u32 rnd)
 	return c & (TCP_SYNQ_HSIZE - 1);
 }
 
-static struct request_sock *tcp_v6_search_req(struct tcp_sock *tp,
+static struct request_sock *tcp_v6_search_req(const struct sock *sk,
 					      struct request_sock ***prevp,
 					      __u16 rport,
 					      struct in6_addr *raddr,
 					      struct in6_addr *laddr,
 					      int iif)
 {
-	struct listen_sock *lopt = tp->accept_queue.listen_opt;
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
 	struct request_sock *req, **prev;  
 
 	for (prev = &lopt->syn_table[tcp_v6_synq_hash(raddr, rport, lopt->hash_rnd)];
@@ -451,44 +313,48 @@ static __u32 tcp_v6_init_sequence(struct sock *sk, struct sk_buff *skb)
 	}
 }
 
-static int __tcp_v6_check_established(struct sock *sk, __u16 lport,
-				      struct tcp_tw_bucket **twp)
+static int __tcp_v6_check_established(struct sock *sk, const __u16 lport,
+				      struct inet_timewait_sock **twp)
 {
 	struct inet_sock *inet = inet_sk(sk);
-	struct ipv6_pinfo *np = inet6_sk(sk);
-	struct in6_addr *daddr = &np->rcv_saddr;
-	struct in6_addr *saddr = &np->daddr;
-	int dif = sk->sk_bound_dev_if;
-	u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
-	int hash = tcp_v6_hashfn(daddr, inet->num, saddr, inet->dport);
-	struct tcp_ehash_bucket *head = &tcp_ehash[hash];
+	const struct ipv6_pinfo *np = inet6_sk(sk);
+	const struct in6_addr *daddr = &np->rcv_saddr;
+	const struct in6_addr *saddr = &np->daddr;
+	const int dif = sk->sk_bound_dev_if;
+	const u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
+	const int hash = inet6_ehashfn(daddr, inet->num, saddr, inet->dport,
+				       tcp_hashinfo.ehash_size);
+	struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash];
 	struct sock *sk2;
-	struct hlist_node *node;
-	struct tcp_tw_bucket *tw;
+	const struct hlist_node *node;
+	struct inet_timewait_sock *tw;
 
 	write_lock(&head->lock);
 
 	/* Check TIME-WAIT sockets first. */
-	sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
-		tw = (struct tcp_tw_bucket*)sk2;
+	sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
+		const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk2);
+
+		tw = inet_twsk(sk2);
 
 		if(*((__u32 *)&(tw->tw_dport))	== ports	&&
 		   sk2->sk_family		== PF_INET6	&&
-		   ipv6_addr_equal(&tw->tw_v6_daddr, saddr)	&&
-		   ipv6_addr_equal(&tw->tw_v6_rcv_saddr, daddr)	&&
+		   ipv6_addr_equal(&tcp6tw->tw_v6_daddr, saddr)	&&
+		   ipv6_addr_equal(&tcp6tw->tw_v6_rcv_saddr, daddr)	&&
 		   sk2->sk_bound_dev_if == sk->sk_bound_dev_if) {
+			const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
 			struct tcp_sock *tp = tcp_sk(sk);
 
-			if (tw->tw_ts_recent_stamp &&
-			    (!twp || (sysctl_tcp_tw_reuse &&
-				      xtime.tv_sec - 
-				      tw->tw_ts_recent_stamp > 1))) {
+			if (tcptw->tw_ts_recent_stamp &&
+			    (!twp ||
+			     (sysctl_tcp_tw_reuse &&
+			      xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) {
 				/* See comment in tcp_ipv4.c */
-				tp->write_seq = tw->tw_snd_nxt + 65535 + 2;
+				tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 				if (!tp->write_seq)
 					tp->write_seq = 1;
-				tp->rx_opt.ts_recent = tw->tw_ts_recent;
-				tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
+				tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
+				tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 				sock_hold(sk2);
 				goto unique;
 			} else
@@ -499,7 +365,7 @@ static int __tcp_v6_check_established(struct sock *sk, __u16 lport,
 
 	/* And established part... */
 	sk_for_each(sk2, node, &head->chain) {
-		if(TCP_IPV6_MATCH(sk2, saddr, daddr, ports, dif))
+		if (INET6_MATCH(sk2, saddr, daddr, ports, dif))
 			goto not_unique;
 	}
 
@@ -515,10 +381,10 @@ unique:
 		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
 	} else if (tw) {
 		/* Silly. Should hash-dance instead... */
-		tcp_tw_deschedule(tw);
+		inet_twsk_deschedule(tw, &tcp_death_row);
 		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
 
-		tcp_tw_put(tw);
+		inet_twsk_put(tw);
 	}
 	return 0;
 
@@ -540,8 +406,8 @@ static inline u32 tcpv6_port_offset(const struct sock *sk)
 static int tcp_v6_hash_connect(struct sock *sk)
 {
 	unsigned short snum = inet_sk(sk)->num;
- 	struct tcp_bind_hashbucket *head;
- 	struct tcp_bind_bucket *tb;
+ 	struct inet_bind_hashbucket *head;
+ 	struct inet_bind_bucket *tb;
 	int ret;
 
  	if (!snum) {
@@ -553,19 +419,19 @@ static int tcp_v6_hash_connect(struct sock *sk)
 		static u32 hint;
 		u32 offset = hint + tcpv6_port_offset(sk);
 		struct hlist_node *node;
- 		struct tcp_tw_bucket *tw = NULL;
+ 		struct inet_timewait_sock *tw = NULL;
 
  		local_bh_disable();
 		for (i = 1; i <= range; i++) {
 			port = low + (i + offset) % range;
- 			head = &tcp_bhash[tcp_bhashfn(port)];
+ 			head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
  			spin_lock(&head->lock);
 
  			/* Does not bother with rcv_saddr checks,
  			 * because the established check is already
  			 * unique enough.
  			 */
-			tb_for_each(tb, node, &head->chain) {
+			inet_bind_bucket_for_each(tb, node, &head->chain) {
  				if (tb->port == port) {
  					BUG_TRAP(!hlist_empty(&tb->owners));
  					if (tb->fastreuse >= 0)
@@ -578,7 +444,7 @@ static int tcp_v6_hash_connect(struct sock *sk)
  				}
  			}
 
- 			tb = tcp_bucket_create(head, port);
+ 			tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
  			if (!tb) {
  				spin_unlock(&head->lock);
  				break;
@@ -597,7 +463,7 @@ ok:
 		hint += i;
 
  		/* Head lock still held and bh's disabled */
- 		tcp_bind_hash(sk, tb, port);
+ 		inet_bind_hash(sk, tb, port);
 		if (sk_unhashed(sk)) {
  			inet_sk(sk)->sport = htons(port);
  			__tcp_v6_hash(sk);
@@ -605,16 +471,16 @@ ok:
  		spin_unlock(&head->lock);
 
  		if (tw) {
- 			tcp_tw_deschedule(tw);
- 			tcp_tw_put(tw);
+ 			inet_twsk_deschedule(tw, &tcp_death_row);
+ 			inet_twsk_put(tw);
  		}
 
 		ret = 0;
 		goto out;
  	}
 
- 	head  = &tcp_bhash[tcp_bhashfn(snum)];
- 	tb  = tcp_sk(sk)->bind_hash;
+ 	head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
+ 	tb   = inet_csk(sk)->icsk_bind_hash;
 	spin_lock_bh(&head->lock);
 
 	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
@@ -631,11 +497,6 @@ out:
 	}
 }
 
-static __inline__ int tcp_v6_iif(struct sk_buff *skb)
-{
-	return IP6CB(skb)->iif;
-}
-
 static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, 
 			  int addr_len)
 {
@@ -827,14 +688,15 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		int type, int code, int offset, __u32 info)
 {
 	struct ipv6hdr *hdr = (struct ipv6hdr*)skb->data;
-	struct tcphdr *th = (struct tcphdr *)(skb->data+offset);
+	const struct tcphdr *th = (struct tcphdr *)(skb->data+offset);
 	struct ipv6_pinfo *np;
 	struct sock *sk;
 	int err;
 	struct tcp_sock *tp; 
 	__u32 seq;
 
-	sk = tcp_v6_lookup(&hdr->daddr, th->dest, &hdr->saddr, th->source, skb->dev->ifindex);
+	sk = inet6_lookup(&tcp_hashinfo, &hdr->daddr, th->dest, &hdr->saddr,
+			  th->source, skb->dev->ifindex);
 
 	if (sk == NULL) {
 		ICMP6_INC_STATS_BH(__in6_dev_get(skb->dev), ICMP6_MIB_INERRORS);
@@ -842,7 +704,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	}
 
 	if (sk->sk_state == TCP_TIME_WAIT) {
-		tcp_tw_put((struct tcp_tw_bucket*)sk);
+		inet_twsk_put((struct inet_timewait_sock *)sk);
 		return;
 	}
 
@@ -920,8 +782,8 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		if (sock_owned_by_user(sk))
 			goto out;
 
-		req = tcp_v6_search_req(tp, &prev, th->dest, &hdr->daddr,
-					&hdr->saddr, tcp_v6_iif(skb));
+		req = tcp_v6_search_req(sk, &prev, th->dest, &hdr->daddr,
+					&hdr->saddr, inet6_iif(skb));
 		if (!req)
 			goto out;
 
@@ -935,7 +797,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 			goto out;
 		}
 
-		tcp_synq_drop(sk, req, prev);
+		inet_csk_reqsk_queue_drop(sk, req, prev);
 		goto out;
 
 	case TCP_SYN_SENT:
@@ -1132,7 +994,7 @@ static void tcp_v6_send_reset(struct sk_buff *skb)
 				    buff->csum);
 
 	fl.proto = IPPROTO_TCP;
-	fl.oif = tcp_v6_iif(skb);
+	fl.oif = inet6_iif(skb);
 	fl.fl_ip_dport = t1->dest;
 	fl.fl_ip_sport = t1->source;
 
@@ -1201,7 +1063,7 @@ static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32
 				    buff->csum);
 
 	fl.proto = IPPROTO_TCP;
-	fl.oif = tcp_v6_iif(skb);
+	fl.oif = inet6_iif(skb);
 	fl.fl_ip_dport = t1->dest;
 	fl.fl_ip_sport = t1->source;
 
@@ -1220,12 +1082,14 @@ static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32
 
 static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
 {
-	struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
+	struct inet_timewait_sock *tw = inet_twsk(sk);
+	const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 
-	tcp_v6_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
-			tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
+	tcp_v6_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
+			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
+			tcptw->tw_ts_recent);
 
-	tcp_tw_put(tw);
+	inet_twsk_put(tw);
 }
 
 static void tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
@@ -1237,28 +1101,25 @@ static void tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
 static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
 {
 	struct request_sock *req, **prev;
-	struct tcphdr *th = skb->h.th;
-	struct tcp_sock *tp = tcp_sk(sk);
+	const struct tcphdr *th = skb->h.th;
 	struct sock *nsk;
 
 	/* Find possible connection requests. */
-	req = tcp_v6_search_req(tp, &prev, th->source, &skb->nh.ipv6h->saddr,
-				&skb->nh.ipv6h->daddr, tcp_v6_iif(skb));
+	req = tcp_v6_search_req(sk, &prev, th->source, &skb->nh.ipv6h->saddr,
+				&skb->nh.ipv6h->daddr, inet6_iif(skb));
 	if (req)
 		return tcp_check_req(sk, skb, req, prev);
 
-	nsk = __tcp_v6_lookup_established(&skb->nh.ipv6h->saddr,
-					  th->source,
-					  &skb->nh.ipv6h->daddr,
-					  ntohs(th->dest),
-					  tcp_v6_iif(skb));
+	nsk = __inet6_lookup_established(&tcp_hashinfo, &skb->nh.ipv6h->saddr,
+					 th->source, &skb->nh.ipv6h->daddr,
+					 ntohs(th->dest), inet6_iif(skb));
 
 	if (nsk) {
 		if (nsk->sk_state != TCP_TIME_WAIT) {
 			bh_lock_sock(nsk);
 			return nsk;
 		}
-		tcp_tw_put((struct tcp_tw_bucket*)nsk);
+		inet_twsk_put((struct inet_timewait_sock *)nsk);
 		return NULL;
 	}
 
@@ -1271,12 +1132,12 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
 
 static void tcp_v6_synq_add(struct sock *sk, struct request_sock *req)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct listen_sock *lopt = tp->accept_queue.listen_opt;
-	u32 h = tcp_v6_synq_hash(&tcp6_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
+	const u32 h = tcp_v6_synq_hash(&tcp6_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
 
-	reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
-	tcp_synq_added(sk);
+	reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, TCP_TIMEOUT_INIT);
+	inet_csk_reqsk_queue_added(sk, TCP_TIMEOUT_INIT);
 }
 
 
@@ -1301,13 +1162,13 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	/*
 	 *	There are no SYN attacks on IPv6, yet...	
 	 */
-	if (tcp_synq_is_full(sk) && !isn) {
+	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
 		if (net_ratelimit())
 			printk(KERN_INFO "TCPv6: dropping request, synflood is possible\n");
 		goto drop;		
 	}
 
-	if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
+	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
 		goto drop;
 
 	req = reqsk_alloc(&tcp6_request_sock_ops);
@@ -1339,7 +1200,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	/* So that link locals have meaning */
 	if (!sk->sk_bound_dev_if &&
 	    ipv6_addr_type(&treq->rmt_addr) & IPV6_ADDR_LINKLOCAL)
-		treq->iif = tcp_v6_iif(skb);
+		treq->iif = inet6_iif(skb);
 
 	if (isn == 0) 
 		isn = tcp_v6_init_sequence(sk,skb);
@@ -1404,15 +1265,14 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 		newsk->sk_backlog_rcv = tcp_v4_do_rcv;
 		newnp->pktoptions  = NULL;
 		newnp->opt	   = NULL;
-		newnp->mcast_oif   = tcp_v6_iif(skb);
+		newnp->mcast_oif   = inet6_iif(skb);
 		newnp->mcast_hops  = skb->nh.ipv6h->hop_limit;
 
-		/* Charge newly allocated IPv6 socket. Though it is mapped,
-		 * it is IPv6 yet.
+		/*
+		 * No need to charge this sock to the relevant IPv6 refcnt debug socks count
+		 * here, tcp_create_openreq_child now does this for us, see the comment in
+		 * that function for the gory details. -acme
 		 */
-#ifdef INET_REFCNT_DEBUG
-		atomic_inc(&inet6_sock_nr);
-#endif
 
 		/* It is tricky place. Until this moment IPv4 tcp
 		   worked with IPv6 af_tcp.af_specific.
@@ -1467,10 +1327,11 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	if (newsk == NULL)
 		goto out;
 
-	/* Charge newly allocated IPv6 socket */
-#ifdef INET_REFCNT_DEBUG
-	atomic_inc(&inet6_sock_nr);
-#endif
+	/*
+	 * No need to charge this sock to the relevant IPv6 refcnt debug socks
+	 * count here, tcp_create_openreq_child now does this for us, see the
+	 * comment in that function for the gory details. -acme
+	 */
 
 	ip6_dst_store(newsk, dst, NULL);
 	newsk->sk_route_caps = dst->dev->features &
@@ -1509,7 +1370,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 			skb_set_owner_r(newnp->pktoptions, newsk);
 	}
 	newnp->opt	  = NULL;
-	newnp->mcast_oif  = tcp_v6_iif(skb);
+	newnp->mcast_oif  = inet6_iif(skb);
 	newnp->mcast_hops = skb->nh.ipv6h->hop_limit;
 
 	/* Clone native IPv6 options from listening socket (if any)
@@ -1536,7 +1397,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6;
 
 	__tcp_v6_hash(newsk);
-	tcp_inherit_port(sk, newsk);
+	inet_inherit_port(&tcp_hashinfo, sk, newsk);
 
 	return newsk;
 
@@ -1557,7 +1418,7 @@ static int tcp_v6_checksum_init(struct sk_buff *skb)
 		if (!tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr,
 				  &skb->nh.ipv6h->daddr,skb->csum))
 			return 0;
-		LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v6 csum failed\n"));
+		LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v6 csum failed\n");
 	}
 	if (skb->len <= 76) {
 		if (tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr,
@@ -1684,7 +1545,7 @@ ipv6_pktoptions:
 	if (TCP_SKB_CB(opt_skb)->end_seq == tp->rcv_nxt &&
 	    !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) {
 		if (np->rxopt.bits.rxinfo)
-			np->mcast_oif = tcp_v6_iif(opt_skb);
+			np->mcast_oif = inet6_iif(opt_skb);
 		if (np->rxopt.bits.rxhlim)
 			np->mcast_hops = opt_skb->nh.ipv6h->hop_limit;
 		if (ipv6_opt_accepted(sk, opt_skb)) {
@@ -1739,8 +1600,9 @@ static int tcp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
 	TCP_SKB_CB(skb)->flags = ipv6_get_dsfield(skb->nh.ipv6h);
 	TCP_SKB_CB(skb)->sacked = 0;
 
-	sk = __tcp_v6_lookup(&skb->nh.ipv6h->saddr, th->source,
-			     &skb->nh.ipv6h->daddr, ntohs(th->dest), tcp_v6_iif(skb));
+	sk = __inet6_lookup(&tcp_hashinfo, &skb->nh.ipv6h->saddr, th->source,
+			    &skb->nh.ipv6h->daddr, ntohs(th->dest),
+			    inet6_iif(skb));
 
 	if (!sk)
 		goto no_tcp_socket;
@@ -1795,26 +1657,29 @@ discard_and_relse:
 
 do_time_wait:
 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
-		tcp_tw_put((struct tcp_tw_bucket *) sk);
+		inet_twsk_put((struct inet_timewait_sock *)sk);
 		goto discard_it;
 	}
 
 	if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) {
 		TCP_INC_STATS_BH(TCP_MIB_INERRS);
-		tcp_tw_put((struct tcp_tw_bucket *) sk);
+		inet_twsk_put((struct inet_timewait_sock *)sk);
 		goto discard_it;
 	}
 
-	switch(tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
-					  skb, th, skb->len)) {
+	switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk,
+					   skb, th)) {
 	case TCP_TW_SYN:
 	{
 		struct sock *sk2;
 
-		sk2 = tcp_v6_lookup_listener(&skb->nh.ipv6h->daddr, ntohs(th->dest), tcp_v6_iif(skb));
+		sk2 = inet6_lookup_listener(&tcp_hashinfo,
+					    &skb->nh.ipv6h->daddr,
+					    ntohs(th->dest), inet6_iif(skb));
 		if (sk2 != NULL) {
-			tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
-			tcp_tw_put((struct tcp_tw_bucket *)sk);
+			struct inet_timewait_sock *tw = inet_twsk(sk);
+			inet_twsk_deschedule(tw, &tcp_death_row);
+			inet_twsk_put(tw);
 			sk = sk2;
 			goto process;
 		}
@@ -1983,7 +1848,7 @@ static struct tcp_func ipv6_specific = {
 static struct tcp_func ipv6_mapped = {
 	.queue_xmit	=	ip_queue_xmit,
 	.send_check	=	tcp_v4_send_check,
-	.rebuild_header	=	tcp_v4_rebuild_header,
+	.rebuild_header	=	inet_sk_rebuild_header,
 	.conn_request	=	tcp_v6_conn_request,
 	.syn_recv_sock	=	tcp_v6_syn_recv_sock,
 	.remember_stamp	=	tcp_v4_remember_stamp,
@@ -2002,13 +1867,14 @@ static struct tcp_func ipv6_mapped = {
  */
 static int tcp_v6_init_sock(struct sock *sk)
 {
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	skb_queue_head_init(&tp->out_of_order_queue);
 	tcp_init_xmit_timers(sk);
 	tcp_prequeue_init(tp);
 
-	tp->rto  = TCP_TIMEOUT_INIT;
+	icsk->icsk_rto = TCP_TIMEOUT_INIT;
 	tp->mdev = TCP_TIMEOUT_INIT;
 
 	/* So many TCP implementations out there (incorrectly) count the
@@ -2030,7 +1896,7 @@ static int tcp_v6_init_sock(struct sock *sk)
 	sk->sk_state = TCP_CLOSE;
 
 	tp->af_specific = &ipv6_specific;
-	tp->ca_ops = &tcp_init_congestion_ops;
+	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
 	sk->sk_write_space = sk_stream_write_space;
 	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
 
@@ -2044,8 +1910,6 @@ static int tcp_v6_init_sock(struct sock *sk)
 
 static int tcp_v6_destroy_sock(struct sock *sk)
 {
-	extern int tcp_v4_destroy_sock(struct sock *sk);
-
 	tcp_v4_destroy_sock(sk);
 	return inet6_destroy_sock(sk);
 }
@@ -2091,18 +1955,20 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
 	unsigned long timer_expires;
 	struct inet_sock *inet = inet_sk(sp);
 	struct tcp_sock *tp = tcp_sk(sp);
+	const struct inet_connection_sock *icsk = inet_csk(sp);
 	struct ipv6_pinfo *np = inet6_sk(sp);
 
 	dest  = &np->daddr;
 	src   = &np->rcv_saddr;
 	destp = ntohs(inet->dport);
 	srcp  = ntohs(inet->sport);
-	if (tp->pending == TCP_TIME_RETRANS) {
+
+	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
 		timer_active	= 1;
-		timer_expires	= tp->timeout;
-	} else if (tp->pending == TCP_TIME_PROBE0) {
+		timer_expires	= icsk->icsk_timeout;
+	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
 		timer_active	= 4;
-		timer_expires	= tp->timeout;
+		timer_expires	= icsk->icsk_timeout;
 	} else if (timer_pending(&sp->sk_timer)) {
 		timer_active	= 2;
 		timer_expires	= sp->sk_timer.expires;
@@ -2123,28 +1989,31 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
 		   tp->write_seq-tp->snd_una, tp->rcv_nxt-tp->copied_seq,
 		   timer_active,
 		   jiffies_to_clock_t(timer_expires - jiffies),
-		   tp->retransmits,
+		   icsk->icsk_retransmits,
 		   sock_i_uid(sp),
-		   tp->probes_out,
+		   icsk->icsk_probes_out,
 		   sock_i_ino(sp),
 		   atomic_read(&sp->sk_refcnt), sp,
-		   tp->rto, tp->ack.ato, (tp->ack.quick<<1)|tp->ack.pingpong,
+		   icsk->icsk_rto,
+		   icsk->icsk_ack.ato,
+		   (icsk->icsk_ack.quick << 1 ) | icsk->icsk_ack.pingpong,
 		   tp->snd_cwnd, tp->snd_ssthresh>=0xFFFF?-1:tp->snd_ssthresh
 		   );
 }
 
 static void get_timewait6_sock(struct seq_file *seq, 
-			       struct tcp_tw_bucket *tw, int i)
+			       struct inet_timewait_sock *tw, int i)
 {
 	struct in6_addr *dest, *src;
 	__u16 destp, srcp;
+	struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw);
 	int ttd = tw->tw_ttd - jiffies;
 
 	if (ttd < 0)
 		ttd = 0;
 
-	dest  = &tw->tw_v6_daddr;
-	src   = &tw->tw_v6_rcv_saddr;
+	dest = &tcp6tw->tw_v6_daddr;
+	src  = &tcp6tw->tw_v6_rcv_saddr;
 	destp = ntohs(tw->tw_dport);
 	srcp  = ntohs(tw->tw_sport);
 
@@ -2219,7 +2088,7 @@ struct proto tcpv6_prot = {
 	.close			= tcp_close,
 	.connect		= tcp_v6_connect,
 	.disconnect		= tcp_disconnect,
-	.accept			= tcp_accept,
+	.accept			= inet_csk_accept,
 	.ioctl			= tcp_ioctl,
 	.init			= tcp_v6_init_sock,
 	.destroy		= tcp_v6_destroy_sock,
@@ -2236,11 +2105,13 @@ struct proto tcpv6_prot = {
 	.sockets_allocated	= &tcp_sockets_allocated,
 	.memory_allocated	= &tcp_memory_allocated,
 	.memory_pressure	= &tcp_memory_pressure,
+	.orphan_count		= &tcp_orphan_count,
 	.sysctl_mem		= sysctl_tcp_mem,
 	.sysctl_wmem		= sysctl_tcp_wmem,
 	.sysctl_rmem		= sysctl_tcp_rmem,
 	.max_header		= MAX_TCP_HEADER,
 	.obj_size		= sizeof(struct tcp6_sock),
+	.twsk_obj_size		= sizeof(struct tcp6_timewait_sock),
 	.rsk_prot		= &tcp6_request_sock_ops,
 };
 
@@ -2250,8 +2121,6 @@ static struct inet6_protocol tcpv6_protocol = {
 	.flags		=	INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
 };
 
-extern struct proto_ops inet6_stream_ops;
-
 static struct inet_protosw tcpv6_protosw = {
 	.type		=	SOCK_STREAM,
 	.protocol	=	IPPROTO_TCP,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index eff050ac704..390d750449c 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -51,6 +51,7 @@
 #include <net/udp.h>
 #include <net/raw.h>
 #include <net/inet_common.h>
+#include <net/tcp_states.h>
 
 #include <net/ip6_checksum.h>
 #include <net/xfrm.h>
@@ -58,7 +59,7 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 
-DEFINE_SNMP_STAT(struct udp_mib, udp_stats_in6);
+DEFINE_SNMP_STAT(struct udp_mib, udp_stats_in6) __read_mostly;
 
 /* Grrr, addr_type already calculated by caller, but I don't want
  * to add some silly "cookie" argument to this method just for that.
@@ -477,8 +478,7 @@ static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
 		/* RFC 2460 section 8.1 says that we SHOULD log
 		   this error. Well, it is reasonable.
 		 */
-		LIMIT_NETDEBUG(
-			printk(KERN_INFO "IPv6: udp checksum is 0\n"));
+		LIMIT_NETDEBUG(KERN_INFO "IPv6: udp checksum is 0\n");
 		goto discard;
 	}
 
@@ -493,7 +493,7 @@ static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
 	if (skb->ip_summed==CHECKSUM_HW) {
 		skb->ip_summed = CHECKSUM_UNNECESSARY;
 		if (csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum)) {
-			LIMIT_NETDEBUG(printk(KERN_DEBUG "udp v6 hw csum failure.\n"));
+			LIMIT_NETDEBUG(KERN_DEBUG "udp v6 hw csum failure.\n");
 			skb->ip_summed = CHECKSUM_NONE;
 		}
 	}
@@ -825,7 +825,7 @@ back_from_confirm:
 		/* ... which is an evident application bug. --ANK */
 		release_sock(sk);
 
-		LIMIT_NETDEBUG(printk(KERN_DEBUG "udp cork app bug 2\n"));
+		LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 2\n");
 		err = -EINVAL;
 		goto out;
 	}
@@ -1054,8 +1054,6 @@ struct proto udpv6_prot = {
 	.obj_size =	sizeof(struct udp6_sock),
 };
 
-extern struct proto_ops inet6_dgram_ops;
-
 static struct inet_protosw udpv6_protosw = {
 	.type =      SOCK_DGRAM,
 	.protocol =  IPPROTO_UDP,
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
index 60c26c87277..fbef7826a74 100644
--- a/net/ipv6/xfrm6_tunnel.c
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -79,7 +79,7 @@ static u32 xfrm6_tunnel_spi;
 #define XFRM6_TUNNEL_SPI_MIN	1
 #define XFRM6_TUNNEL_SPI_MAX	0xffffffff
 
-static kmem_cache_t *xfrm6_tunnel_spi_kmem;
+static kmem_cache_t *xfrm6_tunnel_spi_kmem __read_mostly;
 
 #define XFRM6_TUNNEL_SPI_BYADDR_HSIZE 256
 #define XFRM6_TUNNEL_SPI_BYSPI_HSIZE 256
diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c
index 5a27e5df588..34b3bb86840 100644
--- a/net/ipx/af_ipx.c
+++ b/net/ipx/af_ipx.c
@@ -44,7 +44,6 @@
 #include <linux/socket.h>
 #include <linux/sockios.h>
 #include <linux/string.h>
-#include <linux/tcp.h>
 #include <linux/types.h>
 #include <linux/termios.h>
 
@@ -52,6 +51,7 @@
 #include <net/p8022.h>
 #include <net/psnap.h>
 #include <net/sock.h>
+#include <net/tcp_states.h>
 
 #include <asm/uaccess.h>
 
@@ -1627,7 +1627,7 @@ out:
 	return rc;
 }
 
-static int ipx_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+static int ipx_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
 {
 	/* NULL here for pt means the packet was looped back */
 	struct ipx_interface *intrfc;
@@ -1796,8 +1796,8 @@ static int ipx_recvmsg(struct kiocb *iocb, struct socket *sock,
 				     copied);
 	if (rc)
 		goto out_free;
-	if (skb->stamp.tv_sec)
-		sk->sk_stamp = skb->stamp;
+	if (skb->tstamp.off_sec)
+		skb_get_timestamp(skb, &sk->sk_stamp);
 
 	msg->msg_namelen = sizeof(*sipx);
 
@@ -1940,9 +1940,7 @@ static struct notifier_block ipx_dev_notifier = {
 };
 
 extern struct datalink_proto *make_EII_client(void);
-extern struct datalink_proto *make_8023_client(void);
 extern void destroy_EII_client(struct datalink_proto *);
-extern void destroy_8023_client(struct datalink_proto *);
 
 static unsigned char ipx_8022_type = 0xE0;
 static unsigned char ipx_snap_id[5] = { 0x0, 0x0, 0x0, 0x81, 0x37 };
diff --git a/net/ipx/ipx_proc.c b/net/ipx/ipx_proc.c
index b6761913445..1f73d9ea434 100644
--- a/net/ipx/ipx_proc.c
+++ b/net/ipx/ipx_proc.c
@@ -10,7 +10,7 @@
 #include <linux/proc_fs.h>
 #include <linux/spinlock.h>
 #include <linux/seq_file.h>
-#include <linux/tcp.h>
+#include <net/tcp_states.h>
 #include <net/ipx.h>
 
 static __inline__ struct ipx_interface *ipx_get_interface_idx(loff_t pos)
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index 92c6e8d4e73..6f92f9c6299 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -56,7 +56,7 @@
 #include <asm/uaccess.h>
 
 #include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
 
 #include <net/irda/af_irda.h>
 
diff --git a/net/irda/irlap_frame.c b/net/irda/irlap_frame.c
index 6dafbb43b52..3e9a06abbdd 100644
--- a/net/irda/irlap_frame.c
+++ b/net/irda/irlap_frame.c
@@ -988,9 +988,6 @@ void irlap_resend_rejected_frames(struct irlap_cb *self, int command)
 			IRDA_DEBUG(0, "%s(), unable to copy\n", __FUNCTION__);
 			return;
 		}
-		/* Unlink tx_skb from list */
-		tx_skb->next = tx_skb->prev = NULL;
-		tx_skb->list = NULL;
 
 		/* Clear old Nr field + poll bit */
 		tx_skb->data[1] &= 0x0f;
@@ -1063,9 +1060,6 @@ void irlap_resend_rejected_frame(struct irlap_cb *self, int command)
 			IRDA_DEBUG(0, "%s(), unable to copy\n", __FUNCTION__);
 			return;
 		}
-		/* Unlink tx_skb from list */
-		tx_skb->next = tx_skb->prev = NULL;
-		tx_skb->list = NULL;
 
 		/* Clear old Nr field + poll bit */
 		tx_skb->data[1] &= 0x0f;
@@ -1309,7 +1303,7 @@ static void irlap_recv_test_frame(struct irlap_cb *self, struct sk_buff *skb,
  * Jean II
  */
 int irlap_driver_rcv(struct sk_buff *skb, struct net_device *dev,
-		     struct packet_type *ptype)
+		     struct packet_type *ptype, struct net_device *orig_dev)
 {
 	struct irlap_info info;
 	struct irlap_cb *self;
diff --git a/net/irda/irlmp.c b/net/irda/irlmp.c
index 7a4a4d7fbe6..c19e9ce05a3 100644
--- a/net/irda/irlmp.c
+++ b/net/irda/irlmp.c
@@ -53,7 +53,6 @@ struct irlmp_cb *irlmp = NULL;
 /* These can be altered by the sysctl interface */
 int  sysctl_discovery         = 0;
 int  sysctl_discovery_timeout = 3; /* 3 seconds by default */
-EXPORT_SYMBOL(sysctl_discovery_timeout);
 int  sysctl_discovery_slots   = 6; /* 6 slots by default */
 int  sysctl_lap_keepalive_time = LM_IDLE_TIMEOUT * 1000 / HZ;
 char sysctl_devname[65];
@@ -67,7 +66,6 @@ const char *irlmp_reasons[] = {
 	"LM_INIT_DISCONNECT",
 	"ERROR, NOT USED",
 };
-EXPORT_SYMBOL(irlmp_reasons);
 
 /*
  * Function irlmp_init (void)
@@ -675,7 +673,6 @@ struct lsap_cb *irlmp_dup(struct lsap_cb *orig, void *instance)
 
 	return new;
 }
-EXPORT_SYMBOL(irlmp_dup);
 
 /*
  * Function irlmp_disconnect_request (handle, userdata)
diff --git a/net/irda/irmod.c b/net/irda/irmod.c
index 6ffaed4544e..634901dd156 100644
--- a/net/irda/irmod.c
+++ b/net/irda/irmod.c
@@ -54,7 +54,7 @@ extern int  irsock_init(void);
 extern void irsock_cleanup(void);
 /* irlap_frame.c */
 extern int  irlap_driver_rcv(struct sk_buff *, struct net_device *, 
-			     struct packet_type *);
+			     struct packet_type *, struct net_device *);
 
 /*
  * Module parameters
diff --git a/net/irda/irnet/irnet.h b/net/irda/irnet/irnet.h
index 9004f7349a7..b391cb3893d 100644
--- a/net/irda/irnet/irnet.h
+++ b/net/irda/irnet/irnet.h
@@ -517,9 +517,6 @@ extern int
 	irda_irnet_init(void);		/* Initialise IrDA part of IrNET */
 extern void
 	irda_irnet_cleanup(void);	/* Teardown IrDA part of IrNET */
-/* ---------------------------- MODULE ---------------------------- */
-extern int
-	irnet_init(void);		/* Initialise IrNET module */
 
 /**************************** VARIABLES ****************************/
 
diff --git a/net/irda/irnet/irnet_ppp.c b/net/irda/irnet/irnet_ppp.c
index f8f984bb992..e53bf9e0053 100644
--- a/net/irda/irnet/irnet_ppp.c
+++ b/net/irda/irnet/irnet_ppp.c
@@ -1107,7 +1107,7 @@ ppp_irnet_cleanup(void)
 /*
  * Module main entry point
  */
-int __init
+static int __init
 irnet_init(void)
 {
   int err;
diff --git a/net/irda/irqueue.c b/net/irda/irqueue.c
index b0dd3ea3599..1ba8c710663 100644
--- a/net/irda/irqueue.c
+++ b/net/irda/irqueue.c
@@ -822,7 +822,6 @@ void* hashbin_find_next( hashbin_t* hashbin, long hashv, const char* name,
 
 	return entry;
 }
-EXPORT_SYMBOL(hashbin_find_next);
 
 /*
  * Function hashbin_get_first (hashbin)
diff --git a/net/lapb/lapb_subr.c b/net/lapb/lapb_subr.c
index 5de05a0bc0f..8b5eefd70f0 100644
--- a/net/lapb/lapb_subr.c
+++ b/net/lapb/lapb_subr.c
@@ -78,7 +78,7 @@ void lapb_requeue_frames(struct lapb_cb *lapb)
 		if (!skb_prev)
 			skb_queue_head(&lapb->write_queue, skb);
 		else
-			skb_append(skb_prev, skb);
+			skb_append(skb_prev, skb, &lapb->write_queue);
 		skb_prev = skb;
 	}
 }
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 20b4cfebd74..66f55e514b5 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -23,13 +23,13 @@
 #include <linux/config.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/tcp.h>
 #include <linux/rtnetlink.h>
 #include <linux/init.h>
 #include <net/llc.h>
 #include <net/llc_sap.h>
 #include <net/llc_pdu.h>
 #include <net/llc_conn.h>
+#include <net/tcp_states.h>
 
 /* remember: uninitialized global data is zeroed because its in .bss */
 static u16 llc_ui_sap_last_autoport = LLC_SAP_DYN_START;
@@ -714,7 +714,7 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock,
 	if (uaddr)
 		memcpy(uaddr, llc_ui_skb_cb(skb), sizeof(*uaddr));
 	msg->msg_namelen = sizeof(*uaddr);
-	if (!skb->list) {
+	if (!skb->next) {
 dgram_free:
 		kfree_skb(skb);
 	}
diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c
index eba812a9c69..4c644bc70ea 100644
--- a/net/llc/llc_conn.c
+++ b/net/llc/llc_conn.c
@@ -16,7 +16,7 @@
 #include <net/llc_sap.h>
 #include <net/llc_conn.h>
 #include <net/sock.h>
-#include <linux/tcp.h>
+#include <net/tcp_states.h>
 #include <net/llc_c_ev.h>
 #include <net/llc_c_ac.h>
 #include <net/llc_c_st.h>
@@ -71,7 +71,11 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb)
 
 	if (!ev->ind_prim && !ev->cfm_prim) {
 		/* indicate or confirm not required */
-		if (!skb->list)
+		/* XXX this is not very pretty, perhaps we should store
+		 * XXX indicate/confirm-needed state in the llc_conn_state_ev
+		 * XXX control block of the SKB instead? -DaveM
+		 */
+		if (!skb->next)
 			goto out_kfree_skb;
 		goto out_skb_put;
 	}
diff --git a/net/llc/llc_core.c b/net/llc/llc_core.c
index 5ff02c080a0..9727455bf0e 100644
--- a/net/llc/llc_core.c
+++ b/net/llc/llc_core.c
@@ -103,7 +103,8 @@ out:
 struct llc_sap *llc_sap_open(unsigned char lsap,
 			     int (*func)(struct sk_buff *skb,
 					 struct net_device *dev,
-					 struct packet_type *pt))
+					 struct packet_type *pt,
+					 struct net_device *orig_dev))
 {
 	struct llc_sap *sap = llc_sap_find(lsap);
 
diff --git a/net/llc/llc_if.c b/net/llc/llc_if.c
index 0f9fc48aeaf..0f84f66018e 100644
--- a/net/llc/llc_if.c
+++ b/net/llc/llc_if.c
@@ -15,7 +15,6 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/netdevice.h>
-#include <linux/tcp.h>
 #include <asm/errno.h>
 #include <net/llc_if.h>
 #include <net/llc_sap.h>
@@ -25,6 +24,7 @@
 #include <net/llc_c_ev.h>
 #include <net/llc_c_ac.h>
 #include <net/llc_c_st.h>
+#include <net/tcp_states.h>
 
 u8 llc_mac_null_var[IFHWADDRLEN];
 
diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c
index 4da6976efc9..13b46240b7a 100644
--- a/net/llc/llc_input.c
+++ b/net/llc/llc_input.c
@@ -132,7 +132,7 @@ static inline int llc_fixup_skb(struct sk_buff *skb)
  *	data now), it queues this frame in the connection's backlog.
  */
 int llc_rcv(struct sk_buff *skb, struct net_device *dev,
-	    struct packet_type *pt)
+	    struct packet_type *pt, struct net_device *orig_dev)
 {
 	struct llc_sap *sap;
 	struct llc_pdu_sn *pdu;
@@ -165,7 +165,7 @@ int llc_rcv(struct sk_buff *skb, struct net_device *dev,
 	 * LLC functionality
 	 */
 	if (sap->rcv_func) {
-		sap->rcv_func(skb, dev, pt);
+		sap->rcv_func(skb, dev, pt, orig_dev);
 		goto out;
 	}
 	dest = llc_pdu_type(skb);
diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c
index 965c94eb4bb..34228ef1498 100644
--- a/net/llc/llc_sap.c
+++ b/net/llc/llc_sap.c
@@ -21,7 +21,7 @@
 #include <net/llc_s_ev.h>
 #include <net/llc_s_st.h>
 #include <net/sock.h>
-#include <linux/tcp.h>
+#include <net/tcp_states.h>
 #include <linux/llc.h>
 
 /**
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
new file mode 100644
index 00000000000..8296b38bf27
--- /dev/null
+++ b/net/netfilter/Kconfig
@@ -0,0 +1,24 @@
+config NETFILTER_NETLINK
+       tristate "Netfilter netlink interface"
+       help
+         If this option is enabled, the kernel will include support
+         for the new netfilter netlink interface.
+
+config NETFILTER_NETLINK_QUEUE
+	tristate "Netfilter NFQUEUE over NFNETLINK interface"
+	depends on NETFILTER_NETLINK
+	help
+	  If this option isenabled, the kernel will include support
+	  for queueing packets via NFNETLINK.
+	  
+config NETFILTER_NETLINK_LOG
+	tristate "Netfilter LOG over NFNETLINK interface"
+	depends on NETFILTER_NETLINK
+	help
+	  If this option is enabled, the kernel will include support
+	  for logging packets via NFNETLINK.
+
+	  This obsoletes the existing ipt_ULOG and ebg_ulog mechanisms,
+	  and is also scheduled to replace the old syslog-based ipt_LOG
+	  and ip6t_LOG modules.
+
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
new file mode 100644
index 00000000000..b3b44f8b415
--- /dev/null
+++ b/net/netfilter/Makefile
@@ -0,0 +1,7 @@
+netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o
+
+obj-$(CONFIG_NETFILTER) = netfilter.o
+
+obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o
+obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o
+obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
new file mode 100644
index 00000000000..1ceb1a6c254
--- /dev/null
+++ b/net/netfilter/core.c
@@ -0,0 +1,216 @@
+/* netfilter.c: look after the filters for various protocols. 
+ * Heavily influenced by the old firewall.c by David Bonn and Alan Cox.
+ *
+ * Thanks to Rob `CmdrTaco' Malda for not influencing this code in any
+ * way.
+ *
+ * Rusty Russell (C)2000 -- This code is GPL.
+ *
+ * February 2000: Modified by James Morris to have 1 queue per protocol.
+ * 15-Mar-2000:   Added NF_REPEAT --RR.
+ * 08-May-2003:	  Internal logging interface added by Jozsef Kadlecsik.
+ */
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/netfilter.h>
+#include <net/protocol.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/wait.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/if.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/proc_fs.h>
+#include <net/sock.h>
+
+#include "nf_internals.h"
+
+/* In this code, we can be waiting indefinitely for userspace to
+ * service a packet if a hook returns NF_QUEUE.  We could keep a count
+ * of skbuffs queued for userspace, and not deregister a hook unless
+ * this is zero, but that sucks.  Now, we simply check when the
+ * packets come back: if the hook is gone, the packet is discarded. */
+struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS];
+EXPORT_SYMBOL(nf_hooks);
+static DEFINE_SPINLOCK(nf_hook_lock);
+
+int nf_register_hook(struct nf_hook_ops *reg)
+{
+	struct list_head *i;
+
+	spin_lock_bh(&nf_hook_lock);
+	list_for_each(i, &nf_hooks[reg->pf][reg->hooknum]) {
+		if (reg->priority < ((struct nf_hook_ops *)i)->priority)
+			break;
+	}
+	list_add_rcu(&reg->list, i->prev);
+	spin_unlock_bh(&nf_hook_lock);
+
+	synchronize_net();
+	return 0;
+}
+EXPORT_SYMBOL(nf_register_hook);
+
+void nf_unregister_hook(struct nf_hook_ops *reg)
+{
+	spin_lock_bh(&nf_hook_lock);
+	list_del_rcu(&reg->list);
+	spin_unlock_bh(&nf_hook_lock);
+
+	synchronize_net();
+}
+EXPORT_SYMBOL(nf_unregister_hook);
+
+unsigned int nf_iterate(struct list_head *head,
+			struct sk_buff **skb,
+			int hook,
+			const struct net_device *indev,
+			const struct net_device *outdev,
+			struct list_head **i,
+			int (*okfn)(struct sk_buff *),
+			int hook_thresh)
+{
+	unsigned int verdict;
+
+	/*
+	 * The caller must not block between calls to this
+	 * function because of risk of continuing from deleted element.
+	 */
+	list_for_each_continue_rcu(*i, head) {
+		struct nf_hook_ops *elem = (struct nf_hook_ops *)*i;
+
+		if (hook_thresh > elem->priority)
+			continue;
+
+		/* Optimization: we don't need to hold module
+                   reference here, since function can't sleep. --RR */
+		verdict = elem->hook(hook, skb, indev, outdev, okfn);
+		if (verdict != NF_ACCEPT) {
+#ifdef CONFIG_NETFILTER_DEBUG
+			if (unlikely((verdict & NF_VERDICT_MASK)
+							> NF_MAX_VERDICT)) {
+				NFDEBUG("Evil return from %p(%u).\n",
+				        elem->hook, hook);
+				continue;
+			}
+#endif
+			if (verdict != NF_REPEAT)
+				return verdict;
+			*i = (*i)->prev;
+		}
+	}
+	return NF_ACCEPT;
+}
+
+
+/* Returns 1 if okfn() needs to be executed by the caller,
+ * -EPERM for NF_DROP, 0 otherwise. */
+int nf_hook_slow(int pf, unsigned int hook, struct sk_buff **pskb,
+		 struct net_device *indev,
+		 struct net_device *outdev,
+		 int (*okfn)(struct sk_buff *),
+		 int hook_thresh)
+{
+	struct list_head *elem;
+	unsigned int verdict;
+	int ret = 0;
+
+	/* We may already have this, but read-locks nest anyway */
+	rcu_read_lock();
+
+	elem = &nf_hooks[pf][hook];
+next_hook:
+	verdict = nf_iterate(&nf_hooks[pf][hook], pskb, hook, indev,
+			     outdev, &elem, okfn, hook_thresh);
+	if (verdict == NF_ACCEPT || verdict == NF_STOP) {
+		ret = 1;
+		goto unlock;
+	} else if (verdict == NF_DROP) {
+		kfree_skb(*pskb);
+		ret = -EPERM;
+	} else if ((verdict & NF_VERDICT_MASK)  == NF_QUEUE) {
+		NFDEBUG("nf_hook: Verdict = QUEUE.\n");
+		if (!nf_queue(pskb, elem, pf, hook, indev, outdev, okfn,
+			      verdict >> NF_VERDICT_BITS))
+			goto next_hook;
+	}
+unlock:
+	rcu_read_unlock();
+	return ret;
+}
+EXPORT_SYMBOL(nf_hook_slow);
+
+
+int skb_make_writable(struct sk_buff **pskb, unsigned int writable_len)
+{
+	struct sk_buff *nskb;
+
+	if (writable_len > (*pskb)->len)
+		return 0;
+
+	/* Not exclusive use of packet?  Must copy. */
+	if (skb_shared(*pskb) || skb_cloned(*pskb))
+		goto copy_skb;
+
+	return pskb_may_pull(*pskb, writable_len);
+
+copy_skb:
+	nskb = skb_copy(*pskb, GFP_ATOMIC);
+	if (!nskb)
+		return 0;
+	BUG_ON(skb_is_nonlinear(nskb));
+
+	/* Rest of kernel will get very unhappy if we pass it a
+	   suddenly-orphaned skbuff */
+	if ((*pskb)->sk)
+		skb_set_owner_w(nskb, (*pskb)->sk);
+	kfree_skb(*pskb);
+	*pskb = nskb;
+	return 1;
+}
+EXPORT_SYMBOL(skb_make_writable);
+
+
+/* This does not belong here, but locally generated errors need it if connection
+   tracking in use: without this, connection may not be in hash table, and hence
+   manufactured ICMP or RST packets will not be associated with it. */
+void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *);
+EXPORT_SYMBOL(ip_ct_attach);
+
+void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb)
+{
+	void (*attach)(struct sk_buff *, struct sk_buff *);
+
+	if (skb->nfct && (attach = ip_ct_attach) != NULL) {
+		mb(); /* Just to be sure: must be read before executing this */
+		attach(new, skb);
+	}
+}
+EXPORT_SYMBOL(nf_ct_attach);
+
+#ifdef CONFIG_PROC_FS
+struct proc_dir_entry *proc_net_netfilter;
+EXPORT_SYMBOL(proc_net_netfilter);
+#endif
+
+void __init netfilter_init(void)
+{
+	int i, h;
+	for (i = 0; i < NPROTO; i++) {
+		for (h = 0; h < NF_MAX_HOOKS; h++)
+			INIT_LIST_HEAD(&nf_hooks[i][h]);
+	}
+
+#ifdef CONFIG_PROC_FS
+	proc_net_netfilter = proc_mkdir("netfilter", proc_net);
+	if (!proc_net_netfilter)
+		panic("cannot create netfilter proc entry");
+#endif
+
+	if (netfilter_queue_init() < 0)
+		panic("cannot initialize nf_queue");
+	if (netfilter_log_init() < 0)
+		panic("cannot initialize nf_log");
+}
diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h
new file mode 100644
index 00000000000..6bdee291061
--- /dev/null
+++ b/net/netfilter/nf_internals.h
@@ -0,0 +1,39 @@
+#ifndef _NF_INTERNALS_H
+#define _NF_INTERNALS_H
+
+#include <linux/config.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+
+#ifdef CONFIG_NETFILTER_DEBUG
+#define NFDEBUG(format, args...)  printk(format , ## args)
+#else
+#define NFDEBUG(format, args...)
+#endif
+
+
+/* core.c */
+extern unsigned int nf_iterate(struct list_head *head,
+				struct sk_buff **skb,
+				int hook,
+				const struct net_device *indev,
+				const struct net_device *outdev,
+				struct list_head **i,
+				int (*okfn)(struct sk_buff *),
+				int hook_thresh);
+
+/* nf_queue.c */
+extern int nf_queue(struct sk_buff **skb, 
+		    struct list_head *elem, 
+		    int pf, unsigned int hook,
+		    struct net_device *indev,
+		    struct net_device *outdev,
+		    int (*okfn)(struct sk_buff *),
+		    unsigned int queuenum);
+extern int __init netfilter_queue_init(void);
+
+/* nf_log.c */
+extern int __init netfilter_log_init(void);
+
+#endif
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
new file mode 100644
index 00000000000..3e76bd0824a
--- /dev/null
+++ b/net/netfilter/nf_log.c
@@ -0,0 +1,178 @@
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter.h>
+#include <linux/seq_file.h>
+#include <net/protocol.h>
+
+#include "nf_internals.h"
+
+/* Internal logging interface, which relies on the real 
+   LOG target modules */
+
+#define NF_LOG_PREFIXLEN		128
+
+static struct nf_logger *nf_logging[NPROTO]; /* = NULL */
+static DEFINE_SPINLOCK(nf_log_lock);
+
+/* return EBUSY if somebody else is registered, EEXIST if the same logger
+ * is registred, 0 on success. */
+int nf_log_register(int pf, struct nf_logger *logger)
+{
+	int ret = -EBUSY;
+
+	if (pf >= NPROTO)
+		return -EINVAL;
+
+	/* Any setup of logging members must be done before
+	 * substituting pointer. */
+	spin_lock(&nf_log_lock);
+	if (!nf_logging[pf]) {
+		rcu_assign_pointer(nf_logging[pf], logger);
+		ret = 0;
+	} else if (nf_logging[pf] == logger)
+		ret = -EEXIST;
+
+	spin_unlock(&nf_log_lock);
+	return ret;
+}		
+EXPORT_SYMBOL(nf_log_register);
+
+int nf_log_unregister_pf(int pf)
+{
+	if (pf >= NPROTO)
+		return -EINVAL;
+
+	spin_lock(&nf_log_lock);
+	nf_logging[pf] = NULL;
+	spin_unlock(&nf_log_lock);
+
+	/* Give time to concurrent readers. */
+	synchronize_net();
+
+	return 0;
+}
+EXPORT_SYMBOL(nf_log_unregister_pf);
+
+void nf_log_unregister_logger(struct nf_logger *logger)
+{
+	int i;
+
+	spin_lock(&nf_log_lock);
+	for (i = 0; i < NPROTO; i++) {
+		if (nf_logging[i] == logger)
+			nf_logging[i] = NULL;
+	}
+	spin_unlock(&nf_log_lock);
+
+	synchronize_net();
+}
+EXPORT_SYMBOL(nf_log_unregister_logger);
+
+void nf_log_packet(int pf,
+		   unsigned int hooknum,
+		   const struct sk_buff *skb,
+		   const struct net_device *in,
+		   const struct net_device *out,
+		   struct nf_loginfo *loginfo,
+		   const char *fmt, ...)
+{
+	va_list args;
+	char prefix[NF_LOG_PREFIXLEN];
+	struct nf_logger *logger;
+	
+	rcu_read_lock();
+	logger = rcu_dereference(nf_logging[pf]);
+	if (logger) {
+		va_start(args, fmt);
+		vsnprintf(prefix, sizeof(prefix), fmt, args);
+		va_end(args);
+		/* We must read logging before nf_logfn[pf] */
+		logger->logfn(pf, hooknum, skb, in, out, loginfo, prefix);
+	} else if (net_ratelimit()) {
+		printk(KERN_WARNING "nf_log_packet: can\'t log since "
+		       "no backend logging module loaded in! Please either "
+		       "load one, or disable logging explicitly\n");
+	}
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL(nf_log_packet);
+
+#ifdef CONFIG_PROC_FS
+static void *seq_start(struct seq_file *seq, loff_t *pos)
+{
+	rcu_read_lock();
+
+	if (*pos >= NPROTO)
+		return NULL;
+
+	return pos;
+}
+
+static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	(*pos)++;
+
+	if (*pos >= NPROTO)
+		return NULL;
+
+	return pos;
+}
+
+static void seq_stop(struct seq_file *s, void *v)
+{
+	rcu_read_unlock();
+}
+
+static int seq_show(struct seq_file *s, void *v)
+{
+	loff_t *pos = v;
+	const struct nf_logger *logger;
+
+	logger = rcu_dereference(nf_logging[*pos]);
+
+	if (!logger)
+		return seq_printf(s, "%2lld NONE\n", *pos);
+	
+	return seq_printf(s, "%2lld %s\n", *pos, logger->name);
+}
+
+static struct seq_operations nflog_seq_ops = {
+	.start	= seq_start,
+	.next	= seq_next,
+	.stop	= seq_stop,
+	.show	= seq_show,
+};
+
+static int nflog_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &nflog_seq_ops);
+}
+
+static struct file_operations nflog_file_ops = {
+	.owner	 = THIS_MODULE,
+	.open	 = nflog_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = seq_release,
+};
+
+#endif /* PROC_FS */
+
+
+int __init netfilter_log_init(void)
+{
+#ifdef CONFIG_PROC_FS
+	struct proc_dir_entry *pde;
+
+	pde = create_proc_entry("nf_log", S_IRUGO, proc_net_netfilter);
+	if (!pde)
+		return -1;
+
+	pde->proc_fops = &nflog_file_ops;
+#endif
+	return 0;
+}
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
new file mode 100644
index 00000000000..d10d552d9c4
--- /dev/null
+++ b/net/netfilter/nf_queue.c
@@ -0,0 +1,343 @@
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter.h>
+#include <linux/seq_file.h>
+#include <net/protocol.h>
+
+#include "nf_internals.h"
+
+/* 
+ * A queue handler may be registered for each protocol.  Each is protected by
+ * long term mutex.  The handler must provide an an outfn() to accept packets
+ * for queueing and must reinject all packets it receives, no matter what.
+ */
+static struct nf_queue_handler *queue_handler[NPROTO];
+static struct nf_queue_rerouter *queue_rerouter;
+
+static DEFINE_RWLOCK(queue_handler_lock);
+
+/* return EBUSY when somebody else is registered, return EEXIST if the
+ * same handler is registered, return 0 in case of success. */
+int nf_register_queue_handler(int pf, struct nf_queue_handler *qh)
+{      
+	int ret;
+
+	if (pf >= NPROTO)
+		return -EINVAL;
+
+	write_lock_bh(&queue_handler_lock);
+	if (queue_handler[pf] == qh)
+		ret = -EEXIST;
+	else if (queue_handler[pf])
+		ret = -EBUSY;
+	else {
+		queue_handler[pf] = qh;
+		ret = 0;
+	}
+	write_unlock_bh(&queue_handler_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL(nf_register_queue_handler);
+
+/* The caller must flush their queue before this */
+int nf_unregister_queue_handler(int pf)
+{
+	if (pf >= NPROTO)
+		return -EINVAL;
+
+	write_lock_bh(&queue_handler_lock);
+	queue_handler[pf] = NULL;
+	write_unlock_bh(&queue_handler_lock);
+	
+	return 0;
+}
+EXPORT_SYMBOL(nf_unregister_queue_handler);
+
+int nf_register_queue_rerouter(int pf, struct nf_queue_rerouter *rer)
+{
+	if (pf >= NPROTO)
+		return -EINVAL;
+
+	write_lock_bh(&queue_handler_lock);
+	memcpy(&queue_rerouter[pf], rer, sizeof(queue_rerouter[pf]));
+	write_unlock_bh(&queue_handler_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nf_register_queue_rerouter);
+
+int nf_unregister_queue_rerouter(int pf)
+{
+	if (pf >= NPROTO)
+		return -EINVAL;
+
+	write_lock_bh(&queue_handler_lock);
+	memset(&queue_rerouter[pf], 0, sizeof(queue_rerouter[pf]));
+	write_unlock_bh(&queue_handler_lock);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nf_unregister_queue_rerouter);
+
+void nf_unregister_queue_handlers(struct nf_queue_handler *qh)
+{
+	int pf;
+
+	write_lock_bh(&queue_handler_lock);
+	for (pf = 0; pf < NPROTO; pf++)  {
+		if (queue_handler[pf] == qh)
+			queue_handler[pf] = NULL;
+	}
+	write_unlock_bh(&queue_handler_lock);
+}
+EXPORT_SYMBOL_GPL(nf_unregister_queue_handlers);
+
+/* 
+ * Any packet that leaves via this function must come back 
+ * through nf_reinject().
+ */
+int nf_queue(struct sk_buff **skb, 
+	     struct list_head *elem, 
+	     int pf, unsigned int hook,
+	     struct net_device *indev,
+	     struct net_device *outdev,
+	     int (*okfn)(struct sk_buff *),
+	     unsigned int queuenum)
+{
+	int status;
+	struct nf_info *info;
+#ifdef CONFIG_BRIDGE_NETFILTER
+	struct net_device *physindev = NULL;
+	struct net_device *physoutdev = NULL;
+#endif
+
+	/* QUEUE == DROP if noone is waiting, to be safe. */
+	read_lock(&queue_handler_lock);
+	if (!queue_handler[pf]->outfn) {
+		read_unlock(&queue_handler_lock);
+		kfree_skb(*skb);
+		return 1;
+	}
+
+	info = kmalloc(sizeof(*info)+queue_rerouter[pf].rer_size, GFP_ATOMIC);
+	if (!info) {
+		if (net_ratelimit())
+			printk(KERN_ERR "OOM queueing packet %p\n",
+			       *skb);
+		read_unlock(&queue_handler_lock);
+		kfree_skb(*skb);
+		return 1;
+	}
+
+	*info = (struct nf_info) { 
+		(struct nf_hook_ops *)elem, pf, hook, indev, outdev, okfn };
+
+	/* If it's going away, ignore hook. */
+	if (!try_module_get(info->elem->owner)) {
+		read_unlock(&queue_handler_lock);
+		kfree(info);
+		return 0;
+	}
+
+	/* Bump dev refs so they don't vanish while packet is out */
+	if (indev) dev_hold(indev);
+	if (outdev) dev_hold(outdev);
+
+#ifdef CONFIG_BRIDGE_NETFILTER
+	if ((*skb)->nf_bridge) {
+		physindev = (*skb)->nf_bridge->physindev;
+		if (physindev) dev_hold(physindev);
+		physoutdev = (*skb)->nf_bridge->physoutdev;
+		if (physoutdev) dev_hold(physoutdev);
+	}
+#endif
+	if (queue_rerouter[pf].save)
+		queue_rerouter[pf].save(*skb, info);
+
+	status = queue_handler[pf]->outfn(*skb, info, queuenum,
+					  queue_handler[pf]->data);
+
+	if (status >= 0 && queue_rerouter[pf].reroute)
+		status = queue_rerouter[pf].reroute(skb, info);
+
+	read_unlock(&queue_handler_lock);
+
+	if (status < 0) {
+		/* James M doesn't say fuck enough. */
+		if (indev) dev_put(indev);
+		if (outdev) dev_put(outdev);
+#ifdef CONFIG_BRIDGE_NETFILTER
+		if (physindev) dev_put(physindev);
+		if (physoutdev) dev_put(physoutdev);
+#endif
+		module_put(info->elem->owner);
+		kfree(info);
+		kfree_skb(*skb);
+
+		return 1;
+	}
+
+	return 1;
+}
+
+void nf_reinject(struct sk_buff *skb, struct nf_info *info,
+		 unsigned int verdict)
+{
+	struct list_head *elem = &info->elem->list;
+	struct list_head *i;
+
+	rcu_read_lock();
+
+	/* Release those devices we held, or Alexey will kill me. */
+	if (info->indev) dev_put(info->indev);
+	if (info->outdev) dev_put(info->outdev);
+#ifdef CONFIG_BRIDGE_NETFILTER
+	if (skb->nf_bridge) {
+		if (skb->nf_bridge->physindev)
+			dev_put(skb->nf_bridge->physindev);
+		if (skb->nf_bridge->physoutdev)
+			dev_put(skb->nf_bridge->physoutdev);
+	}
+#endif
+
+	/* Drop reference to owner of hook which queued us. */
+	module_put(info->elem->owner);
+
+	list_for_each_rcu(i, &nf_hooks[info->pf][info->hook]) {
+		if (i == elem) 
+  			break;
+  	}
+  
+	if (elem == &nf_hooks[info->pf][info->hook]) {
+		/* The module which sent it to userspace is gone. */
+		NFDEBUG("%s: module disappeared, dropping packet.\n",
+			__FUNCTION__);
+		verdict = NF_DROP;
+	}
+
+	/* Continue traversal iff userspace said ok... */
+	if (verdict == NF_REPEAT) {
+		elem = elem->prev;
+		verdict = NF_ACCEPT;
+	}
+
+	if (verdict == NF_ACCEPT) {
+	next_hook:
+		verdict = nf_iterate(&nf_hooks[info->pf][info->hook],
+				     &skb, info->hook, 
+				     info->indev, info->outdev, &elem,
+				     info->okfn, INT_MIN);
+	}
+
+	switch (verdict & NF_VERDICT_MASK) {
+	case NF_ACCEPT:
+		info->okfn(skb);
+		break;
+
+	case NF_QUEUE:
+		if (!nf_queue(&skb, elem, info->pf, info->hook, 
+			      info->indev, info->outdev, info->okfn,
+			      verdict >> NF_VERDICT_BITS))
+			goto next_hook;
+		break;
+	}
+	rcu_read_unlock();
+
+	if (verdict == NF_DROP)
+		kfree_skb(skb);
+
+	kfree(info);
+	return;
+}
+EXPORT_SYMBOL(nf_reinject);
+
+#ifdef CONFIG_PROC_FS
+static void *seq_start(struct seq_file *seq, loff_t *pos)
+{
+	if (*pos >= NPROTO)
+		return NULL;
+
+	return pos;
+}
+
+static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	(*pos)++;
+
+	if (*pos >= NPROTO)
+		return NULL;
+
+	return pos;
+}
+
+static void seq_stop(struct seq_file *s, void *v)
+{
+
+}
+
+static int seq_show(struct seq_file *s, void *v)
+{
+	int ret;
+	loff_t *pos = v;
+	struct nf_queue_handler *qh;
+
+	read_lock_bh(&queue_handler_lock);
+	qh = queue_handler[*pos];
+	if (!qh)
+		ret = seq_printf(s, "%2lld NONE\n", *pos);
+	else
+		ret = seq_printf(s, "%2lld %s\n", *pos, qh->name);
+	read_unlock_bh(&queue_handler_lock);
+
+	return ret;
+}
+
+static struct seq_operations nfqueue_seq_ops = {
+	.start	= seq_start,
+	.next	= seq_next,
+	.stop	= seq_stop,
+	.show	= seq_show,
+};
+
+static int nfqueue_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &nfqueue_seq_ops);
+}
+
+static struct file_operations nfqueue_file_ops = {
+	.owner	 = THIS_MODULE,
+	.open	 = nfqueue_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = seq_release,
+};
+#endif /* PROC_FS */
+
+
+int __init netfilter_queue_init(void)
+{
+#ifdef CONFIG_PROC_FS
+	struct proc_dir_entry *pde;
+#endif
+	queue_rerouter = kmalloc(NPROTO * sizeof(struct nf_queue_rerouter),
+				 GFP_KERNEL);
+	if (!queue_rerouter)
+		return -ENOMEM;
+
+#ifdef CONFIG_PROC_FS
+	pde = create_proc_entry("nf_queue", S_IRUGO, proc_net_netfilter);
+	if (!pde) {
+		kfree(queue_rerouter);
+		return -1;
+	}
+	pde->proc_fops = &nfqueue_file_ops;
+#endif
+	memset(queue_rerouter, 0, NPROTO * sizeof(struct nf_queue_rerouter));
+
+	return 0;
+}
+
diff --git a/net/netfilter/nf_sockopt.c b/net/netfilter/nf_sockopt.c
new file mode 100644
index 00000000000..61a833a9caa
--- /dev/null
+++ b/net/netfilter/nf_sockopt.c
@@ -0,0 +1,132 @@
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter.h>
+#include <net/sock.h>
+
+#include "nf_internals.h"
+
+/* Sockopts only registered and called from user context, so
+   net locking would be overkill.  Also, [gs]etsockopt calls may
+   sleep. */
+static DECLARE_MUTEX(nf_sockopt_mutex);
+static LIST_HEAD(nf_sockopts);
+
+/* Do exclusive ranges overlap? */
+static inline int overlap(int min1, int max1, int min2, int max2)
+{
+	return max1 > min2 && min1 < max2;
+}
+
+/* Functions to register sockopt ranges (exclusive). */
+int nf_register_sockopt(struct nf_sockopt_ops *reg)
+{
+	struct list_head *i;
+	int ret = 0;
+
+	if (down_interruptible(&nf_sockopt_mutex) != 0)
+		return -EINTR;
+
+	list_for_each(i, &nf_sockopts) {
+		struct nf_sockopt_ops *ops = (struct nf_sockopt_ops *)i;
+		if (ops->pf == reg->pf
+		    && (overlap(ops->set_optmin, ops->set_optmax, 
+				reg->set_optmin, reg->set_optmax)
+			|| overlap(ops->get_optmin, ops->get_optmax, 
+				   reg->get_optmin, reg->get_optmax))) {
+			NFDEBUG("nf_sock overlap: %u-%u/%u-%u v %u-%u/%u-%u\n",
+				ops->set_optmin, ops->set_optmax, 
+				ops->get_optmin, ops->get_optmax, 
+				reg->set_optmin, reg->set_optmax,
+				reg->get_optmin, reg->get_optmax);
+			ret = -EBUSY;
+			goto out;
+		}
+	}
+
+	list_add(&reg->list, &nf_sockopts);
+out:
+	up(&nf_sockopt_mutex);
+	return ret;
+}
+EXPORT_SYMBOL(nf_register_sockopt);
+
+void nf_unregister_sockopt(struct nf_sockopt_ops *reg)
+{
+	/* No point being interruptible: we're probably in cleanup_module() */
+ restart:
+	down(&nf_sockopt_mutex);
+	if (reg->use != 0) {
+		/* To be woken by nf_sockopt call... */
+		/* FIXME: Stuart Young's name appears gratuitously. */
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		reg->cleanup_task = current;
+		up(&nf_sockopt_mutex);
+		schedule();
+		goto restart;
+	}
+	list_del(&reg->list);
+	up(&nf_sockopt_mutex);
+}
+EXPORT_SYMBOL(nf_unregister_sockopt);
+
+/* Call get/setsockopt() */
+static int nf_sockopt(struct sock *sk, int pf, int val, 
+		      char __user *opt, int *len, int get)
+{
+	struct list_head *i;
+	struct nf_sockopt_ops *ops;
+	int ret;
+
+	if (down_interruptible(&nf_sockopt_mutex) != 0)
+		return -EINTR;
+
+	list_for_each(i, &nf_sockopts) {
+		ops = (struct nf_sockopt_ops *)i;
+		if (ops->pf == pf) {
+			if (get) {
+				if (val >= ops->get_optmin
+				    && val < ops->get_optmax) {
+					ops->use++;
+					up(&nf_sockopt_mutex);
+					ret = ops->get(sk, val, opt, len);
+					goto out;
+				}
+			} else {
+				if (val >= ops->set_optmin
+				    && val < ops->set_optmax) {
+					ops->use++;
+					up(&nf_sockopt_mutex);
+					ret = ops->set(sk, val, opt, *len);
+					goto out;
+				}
+			}
+		}
+	}
+	up(&nf_sockopt_mutex);
+	return -ENOPROTOOPT;
+	
+ out:
+	down(&nf_sockopt_mutex);
+	ops->use--;
+	if (ops->cleanup_task)
+		wake_up_process(ops->cleanup_task);
+	up(&nf_sockopt_mutex);
+	return ret;
+}
+
+int nf_setsockopt(struct sock *sk, int pf, int val, char __user *opt,
+		  int len)
+{
+	return nf_sockopt(sk, pf, val, opt, &len, 0);
+}
+EXPORT_SYMBOL(nf_setsockopt);
+
+int nf_getsockopt(struct sock *sk, int pf, int val, char __user *opt, int *len)
+{
+	return nf_sockopt(sk, pf, val, opt, len, 1);
+}
+EXPORT_SYMBOL(nf_getsockopt);
+
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
new file mode 100644
index 00000000000..e089f17bb80
--- /dev/null
+++ b/net/netfilter/nfnetlink.c
@@ -0,0 +1,376 @@
+/* Netfilter messages via netlink socket. Allows for user space
+ * protocol helpers and general trouble making from userspace.
+ *
+ * (C) 2001 by Jay Schulist <jschlst@samba.org>,
+ * (C) 2002-2005 by Harald Welte <laforge@gnumonks.org>
+ * (C) 2005 by Pablo Neira Ayuso <pablo@eurodev.net>
+ *
+ * Initial netfilter messages via netlink development funded and
+ * generally made possible by Network Robots, Inc. (www.networkrobots.com)
+ *
+ * Further development of this code funded by Astaro AG (http://www.astaro.com)
+ *
+ * This software may be used and distributed according to the terms
+ * of the GNU General Public License, incorporated herein by reference.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/major.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/fcntl.h>
+#include <linux/skbuff.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <net/sock.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+
+#include <linux/netfilter.h>
+#include <linux/netlink.h>
+#include <linux/netfilter/nfnetlink.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER);
+
+static char __initdata nfversion[] = "0.30";
+
+#if 0
+#define DEBUGP(format, args...)	\
+		printk(KERN_DEBUG "%s(%d):%s(): " format, __FILE__, \
+			__LINE__, __FUNCTION__, ## args)
+#else
+#define DEBUGP(format, args...)
+#endif
+
+static struct sock *nfnl = NULL;
+static struct nfnetlink_subsystem *subsys_table[NFNL_SUBSYS_COUNT];
+DECLARE_MUTEX(nfnl_sem);
+
+void nfnl_lock(void)
+{
+	nfnl_shlock();
+}
+
+void nfnl_unlock(void)
+{
+	nfnl_shunlock();
+}
+
+int nfnetlink_subsys_register(struct nfnetlink_subsystem *n)
+{
+	DEBUGP("registering subsystem ID %u\n", n->subsys_id);
+
+	nfnl_lock();
+	if (subsys_table[n->subsys_id]) {
+		nfnl_unlock();
+		return -EBUSY;
+	}
+	subsys_table[n->subsys_id] = n;
+	nfnl_unlock();
+
+	return 0;
+}
+
+int nfnetlink_subsys_unregister(struct nfnetlink_subsystem *n)
+{
+	DEBUGP("unregistering subsystem ID %u\n", n->subsys_id);
+
+	nfnl_lock();
+	subsys_table[n->subsys_id] = NULL;
+	nfnl_unlock();
+
+	return 0;
+}
+
+static inline struct nfnetlink_subsystem *nfnetlink_get_subsys(u_int16_t type)
+{
+	u_int8_t subsys_id = NFNL_SUBSYS_ID(type);
+
+	if (subsys_id >= NFNL_SUBSYS_COUNT
+	    || subsys_table[subsys_id] == NULL)
+		return NULL;
+
+	return subsys_table[subsys_id];
+}
+
+static inline struct nfnl_callback *
+nfnetlink_find_client(u_int16_t type, struct nfnetlink_subsystem *ss)
+{
+	u_int8_t cb_id = NFNL_MSG_TYPE(type);
+	
+	if (cb_id >= ss->cb_count) {
+		DEBUGP("msgtype %u >= %u, returning\n", type, ss->cb_count);
+		return NULL;
+	}
+
+	return &ss->cb[cb_id];
+}
+
+void __nfa_fill(struct sk_buff *skb, int attrtype, int attrlen,
+		const void *data)
+{
+	struct nfattr *nfa;
+	int size = NFA_LENGTH(attrlen);
+
+	nfa = (struct nfattr *)skb_put(skb, NFA_ALIGN(size));
+	nfa->nfa_type = attrtype;
+	nfa->nfa_len  = size;
+	memcpy(NFA_DATA(nfa), data, attrlen);
+	memset(NFA_DATA(nfa) + attrlen, 0, NFA_ALIGN(size) - size);
+}
+
+int nfattr_parse(struct nfattr *tb[], int maxattr, struct nfattr *nfa, int len)
+{
+	memset(tb, 0, sizeof(struct nfattr *) * maxattr);
+
+	while (NFA_OK(nfa, len)) {
+		unsigned flavor = nfa->nfa_type;
+		if (flavor && flavor <= maxattr)
+			tb[flavor-1] = nfa;
+		nfa = NFA_NEXT(nfa, len);
+	}
+
+	return 0;
+}
+
+/**
+ * nfnetlink_check_attributes - check and parse nfnetlink attributes
+ *
+ * subsys: nfnl subsystem for which this message is to be parsed
+ * nlmsghdr: netlink message to be checked/parsed
+ * cda: array of pointers, needs to be at least subsys->attr_count big
+ *
+ */
+static int
+nfnetlink_check_attributes(struct nfnetlink_subsystem *subsys,
+			   struct nlmsghdr *nlh, struct nfattr *cda[])
+{
+	int min_len;
+	u_int16_t attr_count;
+	u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
+
+	if (unlikely(cb_id >= subsys->cb_count)) {
+		DEBUGP("msgtype %u >= %u, returning\n",
+			cb_id, subsys->cb_count);
+		return -EINVAL;
+	}
+
+	min_len = NLMSG_ALIGN(sizeof(struct nfgenmsg));
+	if (unlikely(nlh->nlmsg_len < min_len))
+		return -EINVAL;
+
+	attr_count = subsys->cb[cb_id].attr_count;
+	memset(cda, 0, sizeof(struct nfattr *) * attr_count);
+
+	/* check attribute lengths. */
+	if (likely(nlh->nlmsg_len > min_len)) {
+		struct nfattr *attr = NFM_NFA(NLMSG_DATA(nlh));
+		int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len);
+
+		while (NFA_OK(attr, attrlen)) {
+			unsigned flavor = attr->nfa_type;
+			if (flavor) {
+				if (flavor > attr_count)
+					return -EINVAL;
+				cda[flavor - 1] = attr;
+			}
+			attr = NFA_NEXT(attr, attrlen);
+		}
+	}
+
+	/* implicit: if nlmsg_len == min_len, we return 0, and an empty
+	 * (zeroed) cda[] array. The message is valid, but empty. */
+
+        return 0;
+}
+
+int nfnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo)
+{
+	int allocation = in_interrupt() ? GFP_ATOMIC : GFP_KERNEL;
+	int err = 0;
+
+	NETLINK_CB(skb).dst_group = group;
+	if (echo)
+		atomic_inc(&skb->users);
+	netlink_broadcast(nfnl, skb, pid, group, allocation);
+	if (echo)
+		err = netlink_unicast(nfnl, skb, pid, MSG_DONTWAIT);
+
+	return err;
+}
+
+int nfnetlink_unicast(struct sk_buff *skb, u_int32_t pid, int flags)
+{
+	return netlink_unicast(nfnl, skb, pid, flags);
+}
+
+/* Process one complete nfnetlink message. */
+static inline int nfnetlink_rcv_msg(struct sk_buff *skb,
+				    struct nlmsghdr *nlh, int *errp)
+{
+	struct nfnl_callback *nc;
+	struct nfnetlink_subsystem *ss;
+	int type, err = 0;
+
+	DEBUGP("entered; subsys=%u, msgtype=%u\n",
+		 NFNL_SUBSYS_ID(nlh->nlmsg_type),
+		 NFNL_MSG_TYPE(nlh->nlmsg_type));
+
+	/* Only requests are handled by kernel now. */
+	if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) {
+		DEBUGP("received non-request message\n");
+		return 0;
+	}
+
+	/* All the messages must at least contain nfgenmsg */
+	if (nlh->nlmsg_len < 
+			NLMSG_LENGTH(NLMSG_ALIGN(sizeof(struct nfgenmsg)))) {
+		DEBUGP("received message was too short\n");
+		return 0;
+	}
+
+	type = nlh->nlmsg_type;
+	ss = nfnetlink_get_subsys(type);
+	if (!ss) {
+#ifdef CONFIG_KMOD
+		/* don't call nfnl_shunlock, since it would reenter
+		 * with further packet processing */
+		up(&nfnl_sem);
+		request_module("nfnetlink-subsys-%d", NFNL_SUBSYS_ID(type));
+		nfnl_shlock();
+		ss = nfnetlink_get_subsys(type);
+		if (!ss)
+#endif
+		goto err_inval;
+	}
+
+	nc = nfnetlink_find_client(type, ss);
+	if (!nc) {
+		DEBUGP("unable to find client for type %d\n", type);
+		goto err_inval;
+	}
+
+	if (nc->cap_required && 
+	    !cap_raised(NETLINK_CB(skb).eff_cap, nc->cap_required)) {
+		DEBUGP("permission denied for type %d\n", type);
+		*errp = -EPERM;
+		return -1;
+	}
+
+	{
+		u_int16_t attr_count = 
+			ss->cb[NFNL_MSG_TYPE(nlh->nlmsg_type)].attr_count;
+		struct nfattr *cda[attr_count];
+
+		memset(cda, 0, sizeof(struct nfattr *) * attr_count);
+		
+		err = nfnetlink_check_attributes(ss, nlh, cda);
+		if (err < 0)
+			goto err_inval;
+
+		DEBUGP("calling handler\n");
+		err = nc->call(nfnl, skb, nlh, cda, errp);
+		*errp = err;
+		return err;
+	}
+
+err_inval:
+	DEBUGP("returning -EINVAL\n");
+	*errp = -EINVAL;
+	return -1;
+}
+
+/* Process one packet of messages. */
+static inline int nfnetlink_rcv_skb(struct sk_buff *skb)
+{
+	int err;
+	struct nlmsghdr *nlh;
+
+	while (skb->len >= NLMSG_SPACE(0)) {
+		u32 rlen;
+
+		nlh = (struct nlmsghdr *)skb->data;
+		if (nlh->nlmsg_len < sizeof(struct nlmsghdr)
+		    || skb->len < nlh->nlmsg_len)
+			return 0;
+		rlen = NLMSG_ALIGN(nlh->nlmsg_len);
+		if (rlen > skb->len)
+			rlen = skb->len;
+		if (nfnetlink_rcv_msg(skb, nlh, &err)) {
+			if (!err)
+				return -1;
+			netlink_ack(skb, nlh, err);
+		} else
+			if (nlh->nlmsg_flags & NLM_F_ACK)
+				netlink_ack(skb, nlh, 0);
+		skb_pull(skb, rlen);
+	}
+
+	return 0;
+}
+
+static void nfnetlink_rcv(struct sock *sk, int len)
+{
+	do {
+		struct sk_buff *skb;
+
+		if (nfnl_shlock_nowait())
+			return;
+
+		while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+			if (nfnetlink_rcv_skb(skb)) {
+				if (skb->len)
+					skb_queue_head(&sk->sk_receive_queue,
+						       skb);
+				else
+					kfree_skb(skb);
+				break;
+			}
+			kfree_skb(skb);
+		}
+
+		/* don't call nfnl_shunlock, since it would reenter
+		 * with further packet processing */
+		up(&nfnl_sem);
+	} while(nfnl && nfnl->sk_receive_queue.qlen);
+}
+
+void __exit nfnetlink_exit(void)
+{
+	printk("Removing netfilter NETLINK layer.\n");
+	sock_release(nfnl->sk_socket);
+	return;
+}
+
+int __init nfnetlink_init(void)
+{
+	printk("Netfilter messages via NETLINK v%s.\n", nfversion);
+
+	nfnl = netlink_kernel_create(NETLINK_NETFILTER, NFNLGRP_MAX,
+	                             nfnetlink_rcv, THIS_MODULE);
+	if (!nfnl) {
+		printk(KERN_ERR "cannot initialize nfnetlink!\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+module_init(nfnetlink_init);
+module_exit(nfnetlink_exit);
+
+EXPORT_SYMBOL_GPL(nfnetlink_subsys_register);
+EXPORT_SYMBOL_GPL(nfnetlink_subsys_unregister);
+EXPORT_SYMBOL_GPL(nfnetlink_send);
+EXPORT_SYMBOL_GPL(nfnetlink_unicast);
+EXPORT_SYMBOL_GPL(nfattr_parse);
+EXPORT_SYMBOL_GPL(__nfa_fill);
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
new file mode 100644
index 00000000000..ff5601ceedc
--- /dev/null
+++ b/net/netfilter/nfnetlink_log.c
@@ -0,0 +1,1055 @@
+/*
+ * This is a module which is used for logging packets to userspace via
+ * nfetlink.
+ *
+ * (C) 2005 by Harald Welte <laforge@netfilter.org>
+ *
+ * Based on the old ipv4-only ipt_ULOG.c:
+ * (C) 2000-2004 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/netdevice.h>
+#include <linux/netfilter.h>
+#include <linux/netlink.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_log.h>
+#include <linux/spinlock.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+#include <linux/security.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+#include <net/sock.h>
+
+#include <asm/atomic.h>
+
+#ifdef CONFIG_BRIDGE_NETFILTER
+#include "../bridge/br_private.h"
+#endif
+
+#define NFULNL_NLBUFSIZ_DEFAULT	4096
+#define NFULNL_TIMEOUT_DEFAULT 	100	/* every second */
+#define NFULNL_QTHRESH_DEFAULT 	100	/* 100 packets */
+
+#define PRINTR(x, args...)	do { if (net_ratelimit()) \
+				     printk(x, ## args); } while (0);
+
+#if 0
+#define UDEBUG(x, args ...)	printk(KERN_DEBUG "%s(%d):%s():	" x, 	   \
+					__FILE__, __LINE__, __FUNCTION__,  \
+					## args)
+#else
+#define UDEBUG(x, ...)
+#endif
+
+struct nfulnl_instance {
+	struct hlist_node hlist;	/* global list of instances */
+	spinlock_t lock;
+	atomic_t use;			/* use count */
+
+	unsigned int qlen;		/* number of nlmsgs in skb */
+	struct sk_buff *skb;		/* pre-allocatd skb */
+	struct nlmsghdr *lastnlh;	/* netlink header of last msg in skb */
+	struct timer_list timer;
+	int peer_pid;			/* PID of the peer process */
+
+	/* configurable parameters */
+	unsigned int flushtimeout;	/* timeout until queue flush */
+	unsigned int nlbufsiz;		/* netlink buffer allocation size */
+	unsigned int qthreshold;	/* threshold of the queue */
+	u_int32_t copy_range;
+	u_int16_t group_num;		/* number of this queue */
+	u_int8_t copy_mode;	
+};
+
+static DEFINE_RWLOCK(instances_lock);
+
+#define INSTANCE_BUCKETS	16
+static struct hlist_head instance_table[INSTANCE_BUCKETS];
+static unsigned int hash_init;
+
+static inline u_int8_t instance_hashfn(u_int16_t group_num)
+{
+	return ((group_num & 0xff) % INSTANCE_BUCKETS);
+}
+
+static struct nfulnl_instance *
+__instance_lookup(u_int16_t group_num)
+{
+	struct hlist_head *head;
+	struct hlist_node *pos;
+	struct nfulnl_instance *inst;
+
+	UDEBUG("entering (group_num=%u)\n", group_num);
+
+	head = &instance_table[instance_hashfn(group_num)];
+	hlist_for_each_entry(inst, pos, head, hlist) {
+		if (inst->group_num == group_num)
+			return inst;
+	}
+	return NULL;
+}
+
+static inline void
+instance_get(struct nfulnl_instance *inst)
+{
+	atomic_inc(&inst->use);
+}
+
+static struct nfulnl_instance *
+instance_lookup_get(u_int16_t group_num)
+{
+	struct nfulnl_instance *inst;
+
+	read_lock_bh(&instances_lock);
+	inst = __instance_lookup(group_num);
+	if (inst)
+		instance_get(inst);
+	read_unlock_bh(&instances_lock);
+
+	return inst;
+}
+
+static void
+instance_put(struct nfulnl_instance *inst)
+{
+	if (inst && atomic_dec_and_test(&inst->use)) {
+		UDEBUG("kfree(inst=%p)\n", inst);
+		kfree(inst);
+	}
+}
+
+static void nfulnl_timer(unsigned long data);
+
+static struct nfulnl_instance *
+instance_create(u_int16_t group_num, int pid)
+{
+	struct nfulnl_instance *inst;
+
+	UDEBUG("entering (group_num=%u, pid=%d)\n", group_num,
+		pid);
+
+	write_lock_bh(&instances_lock);	
+	if (__instance_lookup(group_num)) {
+		inst = NULL;
+		UDEBUG("aborting, instance already exists\n");
+		goto out_unlock;
+	}
+
+	inst = kmalloc(sizeof(*inst), GFP_ATOMIC);
+	if (!inst)
+		goto out_unlock;
+
+	memset(inst, 0, sizeof(*inst));
+	INIT_HLIST_NODE(&inst->hlist);
+	inst->lock = SPIN_LOCK_UNLOCKED;
+	/* needs to be two, since we _put() after creation */
+	atomic_set(&inst->use, 2);
+
+	init_timer(&inst->timer);
+	inst->timer.function = nfulnl_timer;
+	inst->timer.data = (unsigned long)inst;
+	/* don't start timer yet. (re)start it  with every packet */
+
+	inst->peer_pid = pid;
+	inst->group_num = group_num;
+
+	inst->qthreshold 	= NFULNL_QTHRESH_DEFAULT;
+	inst->flushtimeout 	= NFULNL_TIMEOUT_DEFAULT;
+	inst->nlbufsiz 		= NFULNL_NLBUFSIZ_DEFAULT;
+	inst->copy_mode 	= NFULNL_COPY_PACKET;
+	inst->copy_range 	= 0xffff;
+
+	if (!try_module_get(THIS_MODULE))
+		goto out_free;
+
+	hlist_add_head(&inst->hlist, 
+		       &instance_table[instance_hashfn(group_num)]);
+
+	UDEBUG("newly added node: %p, next=%p\n", &inst->hlist, 
+		inst->hlist.next);
+
+	write_unlock_bh(&instances_lock);
+
+	return inst;
+
+out_free:
+	instance_put(inst);
+out_unlock:
+	write_unlock_bh(&instances_lock);
+	return NULL;
+}
+
+static int __nfulnl_send(struct nfulnl_instance *inst);
+
+static void
+_instance_destroy2(struct nfulnl_instance *inst, int lock)
+{
+	/* first pull it out of the global list */
+	if (lock)
+		write_lock_bh(&instances_lock);
+
+	UDEBUG("removing instance %p (queuenum=%u) from hash\n",
+		inst, inst->group_num);
+
+	hlist_del(&inst->hlist);
+
+	if (lock)
+		write_unlock_bh(&instances_lock);
+
+	/* then flush all pending packets from skb */
+
+	spin_lock_bh(&inst->lock);
+	if (inst->skb) {
+		if (inst->qlen)
+			__nfulnl_send(inst);
+		if (inst->skb) {
+			kfree_skb(inst->skb);
+			inst->skb = NULL;
+		}
+	}
+	spin_unlock_bh(&inst->lock);
+
+	/* and finally put the refcount */
+	instance_put(inst);
+
+	module_put(THIS_MODULE);
+}
+
+static inline void
+__instance_destroy(struct nfulnl_instance *inst)
+{
+	_instance_destroy2(inst, 0);
+}
+
+static inline void
+instance_destroy(struct nfulnl_instance *inst)
+{
+	_instance_destroy2(inst, 1);
+}
+
+static int
+nfulnl_set_mode(struct nfulnl_instance *inst, u_int8_t mode,
+		  unsigned int range)
+{
+	int status = 0;
+
+	spin_lock_bh(&inst->lock);
+	
+	switch (mode) {
+	case NFULNL_COPY_NONE:
+	case NFULNL_COPY_META:
+		inst->copy_mode = mode;
+		inst->copy_range = 0;
+		break;
+		
+	case NFULNL_COPY_PACKET:
+		inst->copy_mode = mode;
+		/* we're using struct nfattr which has 16bit nfa_len */
+		if (range > 0xffff)
+			inst->copy_range = 0xffff;
+		else
+			inst->copy_range = range;
+		break;
+		
+	default:
+		status = -EINVAL;
+		break;
+	}
+
+	spin_unlock_bh(&inst->lock);
+
+	return status;
+}
+
+static int
+nfulnl_set_nlbufsiz(struct nfulnl_instance *inst, u_int32_t nlbufsiz)
+{
+	int status;
+
+	spin_lock_bh(&inst->lock);
+	if (nlbufsiz < NFULNL_NLBUFSIZ_DEFAULT)
+		status = -ERANGE;
+	else if (nlbufsiz > 131072)
+		status = -ERANGE;
+	else {
+		inst->nlbufsiz = nlbufsiz;
+		status = 0;
+	}
+	spin_unlock_bh(&inst->lock);
+
+	return status;
+}
+
+static int
+nfulnl_set_timeout(struct nfulnl_instance *inst, u_int32_t timeout)
+{
+	spin_lock_bh(&inst->lock);
+	inst->flushtimeout = timeout;
+	spin_unlock_bh(&inst->lock);
+
+	return 0;
+}
+
+static int
+nfulnl_set_qthresh(struct nfulnl_instance *inst, u_int32_t qthresh)
+{
+	spin_lock_bh(&inst->lock);
+	inst->qthreshold = qthresh;
+	spin_unlock_bh(&inst->lock);
+
+	return 0;
+}
+
+static struct sk_buff *nfulnl_alloc_skb(unsigned int inst_size, 
+					unsigned int pkt_size)
+{
+	struct sk_buff *skb;
+
+	UDEBUG("entered (%u, %u)\n", inst_size, pkt_size);
+
+	/* alloc skb which should be big enough for a whole multipart
+	 * message.  WARNING: has to be <= 128k due to slab restrictions */
+
+	skb = alloc_skb(inst_size, GFP_ATOMIC);
+	if (!skb) {
+		PRINTR("nfnetlink_log: can't alloc whole buffer (%u bytes)\n",
+			inst_size);
+
+		/* try to allocate only as much as we need for current
+		 * packet */
+
+		skb = alloc_skb(pkt_size, GFP_ATOMIC);
+		if (!skb)
+			PRINTR("nfnetlink_log: can't even alloc %u bytes\n",
+				pkt_size);
+	}
+
+	return skb;
+}
+
+static int
+__nfulnl_send(struct nfulnl_instance *inst)
+{
+	int status;
+
+	if (timer_pending(&inst->timer))
+		del_timer(&inst->timer);
+
+	if (inst->qlen > 1)
+		inst->lastnlh->nlmsg_type = NLMSG_DONE;
+
+	status = nfnetlink_unicast(inst->skb, inst->peer_pid, MSG_DONTWAIT);
+	if (status < 0) {
+		UDEBUG("netlink_unicast() failed\n");
+		/* FIXME: statistics */
+	}
+
+	inst->qlen = 0;
+	inst->skb = NULL;
+	inst->lastnlh = NULL;
+
+	return status;
+}
+
+static void nfulnl_timer(unsigned long data)
+{
+	struct nfulnl_instance *inst = (struct nfulnl_instance *)data; 
+
+	UDEBUG("timer function called, flushing buffer\n");
+
+	spin_lock_bh(&inst->lock);
+	__nfulnl_send(inst);
+	instance_put(inst);
+	spin_unlock_bh(&inst->lock);
+}
+
+static inline int 
+__build_packet_message(struct nfulnl_instance *inst,
+			const struct sk_buff *skb, 
+			unsigned int data_len,
+			unsigned int pf,
+			unsigned int hooknum,
+			const struct net_device *indev,
+			const struct net_device *outdev,
+			const struct nf_loginfo *li,
+			const char *prefix)
+{
+	unsigned char *old_tail;
+	struct nfulnl_msg_packet_hdr pmsg;
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *nfmsg;
+	u_int32_t tmp_uint;
+
+	UDEBUG("entered\n");
+		
+	old_tail = inst->skb->tail;
+	nlh = NLMSG_PUT(inst->skb, 0, 0, 
+			NFNL_SUBSYS_ULOG << 8 | NFULNL_MSG_PACKET,
+			sizeof(struct nfgenmsg));
+	nfmsg = NLMSG_DATA(nlh);
+	nfmsg->nfgen_family = pf;
+	nfmsg->version = NFNETLINK_V0;
+	nfmsg->res_id = htons(inst->group_num);
+
+	pmsg.hw_protocol	= htons(skb->protocol);
+	pmsg.hook		= hooknum;
+
+	NFA_PUT(inst->skb, NFULA_PACKET_HDR, sizeof(pmsg), &pmsg);
+
+	if (prefix) {
+		int slen = strlen(prefix);
+		if (slen > NFULNL_PREFIXLEN)
+			slen = NFULNL_PREFIXLEN;
+		NFA_PUT(inst->skb, NFULA_PREFIX, slen, prefix);
+	}
+
+	if (indev) {
+		tmp_uint = htonl(indev->ifindex);
+#ifndef CONFIG_BRIDGE_NETFILTER
+		NFA_PUT(inst->skb, NFULA_IFINDEX_INDEV, sizeof(tmp_uint),
+			&tmp_uint);
+#else
+		if (pf == PF_BRIDGE) {
+			/* Case 1: outdev is physical input device, we need to
+			 * look for bridge group (when called from
+			 * netfilter_bridge) */
+			NFA_PUT(inst->skb, NFULA_IFINDEX_PHYSINDEV,
+				sizeof(tmp_uint), &tmp_uint);
+			/* this is the bridge group "brX" */
+			tmp_uint = htonl(indev->br_port->br->dev->ifindex);
+			NFA_PUT(inst->skb, NFULA_IFINDEX_INDEV,
+				sizeof(tmp_uint), &tmp_uint);
+		} else {
+			/* Case 2: indev is bridge group, we need to look for
+			 * physical device (when called from ipv4) */
+			NFA_PUT(inst->skb, NFULA_IFINDEX_INDEV,
+				sizeof(tmp_uint), &tmp_uint);
+			if (skb->nf_bridge && skb->nf_bridge->physindev) {
+				tmp_uint = 
+				    htonl(skb->nf_bridge->physindev->ifindex);
+				NFA_PUT(inst->skb, NFULA_IFINDEX_PHYSINDEV,
+					sizeof(tmp_uint), &tmp_uint);
+			}
+		}
+#endif
+	}
+
+	if (outdev) {
+		tmp_uint = htonl(outdev->ifindex);
+#ifndef CONFIG_BRIDGE_NETFILTER
+		NFA_PUT(inst->skb, NFULA_IFINDEX_OUTDEV, sizeof(tmp_uint),
+			&tmp_uint);
+#else
+		if (pf == PF_BRIDGE) {
+			/* Case 1: outdev is physical output device, we need to
+			 * look for bridge group (when called from
+			 * netfilter_bridge) */
+			NFA_PUT(inst->skb, NFULA_IFINDEX_PHYSOUTDEV,
+				sizeof(tmp_uint), &tmp_uint);
+			/* this is the bridge group "brX" */
+			tmp_uint = htonl(outdev->br_port->br->dev->ifindex);
+			NFA_PUT(inst->skb, NFULA_IFINDEX_OUTDEV,
+				sizeof(tmp_uint), &tmp_uint);
+		} else {
+			/* Case 2: indev is a bridge group, we need to look
+			 * for physical device (when called from ipv4) */
+			NFA_PUT(inst->skb, NFULA_IFINDEX_OUTDEV,
+				sizeof(tmp_uint), &tmp_uint);
+			if (skb->nf_bridge) {
+				tmp_uint = 
+				    htonl(skb->nf_bridge->physoutdev->ifindex);
+				NFA_PUT(inst->skb, NFULA_IFINDEX_PHYSOUTDEV,
+					sizeof(tmp_uint), &tmp_uint);
+			}
+		}
+#endif
+	}
+
+	if (skb->nfmark) {
+		tmp_uint = htonl(skb->nfmark);
+		NFA_PUT(inst->skb, NFULA_MARK, sizeof(tmp_uint), &tmp_uint);
+	}
+
+	if (indev && skb->dev && skb->dev->hard_header_parse) {
+		struct nfulnl_msg_packet_hw phw;
+
+		phw.hw_addrlen = 
+			skb->dev->hard_header_parse((struct sk_buff *)skb, 
+						    phw.hw_addr);
+		phw.hw_addrlen = htons(phw.hw_addrlen);
+		NFA_PUT(inst->skb, NFULA_HWADDR, sizeof(phw), &phw);
+	}
+
+	if (skb->tstamp.off_sec) {
+		struct nfulnl_msg_packet_timestamp ts;
+
+		ts.sec = cpu_to_be64(skb_tv_base.tv_sec + skb->tstamp.off_sec);
+		ts.usec = cpu_to_be64(skb_tv_base.tv_usec + skb->tstamp.off_usec);
+
+		NFA_PUT(inst->skb, NFULA_TIMESTAMP, sizeof(ts), &ts);
+	}
+
+	/* UID */
+	if (skb->sk) {
+		read_lock_bh(&skb->sk->sk_callback_lock);
+		if (skb->sk->sk_socket && skb->sk->sk_socket->file) {
+			u_int32_t uid = htonl(skb->sk->sk_socket->file->f_uid);
+			/* need to unlock here since NFA_PUT may goto */
+			read_unlock_bh(&skb->sk->sk_callback_lock);
+			NFA_PUT(inst->skb, NFULA_UID, sizeof(uid), &uid);
+		} else
+			read_unlock_bh(&skb->sk->sk_callback_lock);
+	}
+
+	if (data_len) {
+		struct nfattr *nfa;
+		int size = NFA_LENGTH(data_len);
+
+		if (skb_tailroom(inst->skb) < (int)NFA_SPACE(data_len)) {
+			printk(KERN_WARNING "nfnetlink_log: no tailroom!\n");
+			goto nlmsg_failure;
+		}
+
+		nfa = (struct nfattr *)skb_put(inst->skb, NFA_ALIGN(size));
+		nfa->nfa_type = NFULA_PAYLOAD;
+		nfa->nfa_len = size;
+
+		if (skb_copy_bits(skb, 0, NFA_DATA(nfa), data_len))
+			BUG();
+	}
+		
+	nlh->nlmsg_len = inst->skb->tail - old_tail;
+	return 0;
+
+nlmsg_failure:
+	UDEBUG("nlmsg_failure\n");
+nfattr_failure:
+	PRINTR(KERN_ERR "nfnetlink_log: error creating log nlmsg\n");
+	return -1;
+}
+
+#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
+
+static struct nf_loginfo default_loginfo = {
+	.type =		NF_LOG_TYPE_ULOG,
+	.u = {
+		.ulog = {
+			.copy_len	= 0xffff,
+			.group		= 0,
+			.qthreshold	= 1,
+		},
+	},
+};
+
+/* log handler for internal netfilter logging api */
+static void
+nfulnl_log_packet(unsigned int pf,
+		  unsigned int hooknum,
+		  const struct sk_buff *skb,
+		  const struct net_device *in,
+		  const struct net_device *out,
+		  const struct nf_loginfo *li_user,
+		  const char *prefix)
+{
+	unsigned int size, data_len;
+	struct nfulnl_instance *inst;
+	const struct nf_loginfo *li;
+	unsigned int qthreshold;
+	unsigned int nlbufsiz;
+
+	if (li_user && li_user->type == NF_LOG_TYPE_ULOG) 
+		li = li_user;
+	else
+		li = &default_loginfo;
+
+	inst = instance_lookup_get(li->u.ulog.group);
+	if (!inst)
+		inst = instance_lookup_get(0);
+	if (!inst) {
+		PRINTR("nfnetlink_log: trying to log packet, "
+			"but no instance for group %u\n", li->u.ulog.group);
+		return;
+	}
+
+	/* all macros expand to constant values at compile time */
+	/* FIXME: do we want to make the size calculation conditional based on
+	 * what is actually present?  way more branches and checks, but more
+	 * memory efficient... */
+	size =    NLMSG_SPACE(sizeof(struct nfgenmsg))
+		+ NFA_SPACE(sizeof(struct nfulnl_msg_packet_hdr))
+		+ NFA_SPACE(sizeof(u_int32_t))	/* ifindex */
+		+ NFA_SPACE(sizeof(u_int32_t))	/* ifindex */
+#ifdef CONFIG_BRIDGE_NETFILTER
+		+ NFA_SPACE(sizeof(u_int32_t))	/* ifindex */
+		+ NFA_SPACE(sizeof(u_int32_t))	/* ifindex */
+#endif
+		+ NFA_SPACE(sizeof(u_int32_t))	/* mark */
+		+ NFA_SPACE(sizeof(u_int32_t))	/* uid */
+		+ NFA_SPACE(NFULNL_PREFIXLEN)	/* prefix */
+		+ NFA_SPACE(sizeof(struct nfulnl_msg_packet_hw))
+		+ NFA_SPACE(sizeof(struct nfulnl_msg_packet_timestamp));
+
+	UDEBUG("initial size=%u\n", size);
+
+	spin_lock_bh(&inst->lock);
+
+	qthreshold = inst->qthreshold;
+	/* per-rule qthreshold overrides per-instance */
+	if (qthreshold > li->u.ulog.qthreshold)
+		qthreshold = li->u.ulog.qthreshold;
+	
+	switch (inst->copy_mode) {
+	case NFULNL_COPY_META:
+	case NFULNL_COPY_NONE:
+		data_len = 0;
+		break;
+	
+	case NFULNL_COPY_PACKET:
+		if (inst->copy_range == 0 
+		    || inst->copy_range > skb->len)
+			data_len = skb->len;
+		else
+			data_len = inst->copy_range;
+		
+		size += NFA_SPACE(data_len);
+		UDEBUG("copy_packet, therefore size now %u\n", size);
+		break;
+	
+	default:
+		spin_unlock_bh(&inst->lock);
+		instance_put(inst);
+		return;
+	}
+
+	if (size > inst->nlbufsiz)
+		nlbufsiz = size;
+	else
+		nlbufsiz = inst->nlbufsiz;
+
+	if (!inst->skb) {
+		if (!(inst->skb = nfulnl_alloc_skb(nlbufsiz, size))) {
+			UDEBUG("error in nfulnl_alloc_skb(%u, %u)\n",
+				inst->nlbufsiz, size);
+			goto alloc_failure;
+		}
+	} else if (inst->qlen >= qthreshold ||
+		   size > skb_tailroom(inst->skb)) {
+		/* either the queue len is too high or we don't have
+		 * enough room in the skb left. flush to userspace. */
+		UDEBUG("flushing old skb\n");
+
+		__nfulnl_send(inst);
+
+		if (!(inst->skb = nfulnl_alloc_skb(nlbufsiz, size))) {
+			UDEBUG("error in nfulnl_alloc_skb(%u, %u)\n",
+				inst->nlbufsiz, size);
+			goto alloc_failure;
+		}
+	}
+
+	UDEBUG("qlen %d, qthreshold %d\n", inst->qlen, qthreshold);
+	inst->qlen++;
+
+	__build_packet_message(inst, skb, data_len, pf,
+				hooknum, in, out, li, prefix);
+
+	/* timer_pending always called within inst->lock, so there
+	 * is no chance of a race here */
+	if (!timer_pending(&inst->timer)) {
+		instance_get(inst);
+		inst->timer.expires = jiffies + (inst->flushtimeout*HZ/100);
+		add_timer(&inst->timer);
+	}
+	spin_unlock_bh(&inst->lock);
+
+	return;
+
+alloc_failure:
+	spin_unlock_bh(&inst->lock);
+	instance_put(inst);
+	UDEBUG("error allocating skb\n");
+	/* FIXME: statistics */
+}
+
+static int
+nfulnl_rcv_nl_event(struct notifier_block *this,
+		   unsigned long event, void *ptr)
+{
+	struct netlink_notify *n = ptr;
+
+	if (event == NETLINK_URELEASE &&
+	    n->protocol == NETLINK_NETFILTER && n->pid) {
+		int i;
+
+		/* destroy all instances for this pid */
+		write_lock_bh(&instances_lock);
+		for  (i = 0; i < INSTANCE_BUCKETS; i++) {
+			struct hlist_node *tmp, *t2;
+			struct nfulnl_instance *inst;
+			struct hlist_head *head = &instance_table[i];
+
+			hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) {
+				UDEBUG("node = %p\n", inst);
+				if (n->pid == inst->peer_pid)
+					__instance_destroy(inst);
+			}
+		}
+		write_unlock_bh(&instances_lock);
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block nfulnl_rtnl_notifier = {
+	.notifier_call	= nfulnl_rcv_nl_event,
+};
+
+static int
+nfulnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb,
+		  struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp)
+{
+	return -ENOTSUPP;
+}
+
+static struct nf_logger nfulnl_logger = {
+	.name	= "nfnetlink_log",
+	.logfn	= &nfulnl_log_packet,
+	.me	= THIS_MODULE,
+};
+
+static const int nfula_min[NFULA_MAX] = {
+	[NFULA_PACKET_HDR-1]	= sizeof(struct nfulnl_msg_packet_hdr),
+	[NFULA_MARK-1]		= sizeof(u_int32_t),
+	[NFULA_TIMESTAMP-1]	= sizeof(struct nfulnl_msg_packet_timestamp),
+	[NFULA_IFINDEX_INDEV-1]	= sizeof(u_int32_t),
+	[NFULA_IFINDEX_OUTDEV-1]= sizeof(u_int32_t),
+	[NFULA_HWADDR-1]	= sizeof(struct nfulnl_msg_packet_hw),
+	[NFULA_PAYLOAD-1]	= 0,
+	[NFULA_PREFIX-1]	= 0,
+	[NFULA_UID-1]		= sizeof(u_int32_t),
+};
+
+static const int nfula_cfg_min[NFULA_CFG_MAX] = {
+	[NFULA_CFG_CMD-1]	= sizeof(struct nfulnl_msg_config_cmd),
+	[NFULA_CFG_MODE-1]	= sizeof(struct nfulnl_msg_config_mode),
+	[NFULA_CFG_TIMEOUT-1]	= sizeof(u_int32_t),
+	[NFULA_CFG_QTHRESH-1]	= sizeof(u_int32_t),
+	[NFULA_CFG_NLBUFSIZ-1]	= sizeof(u_int32_t),
+};
+
+static int
+nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
+		   struct nlmsghdr *nlh, struct nfattr *nfula[], int *errp)
+{
+	struct nfgenmsg *nfmsg = NLMSG_DATA(nlh);
+	u_int16_t group_num = ntohs(nfmsg->res_id);
+	struct nfulnl_instance *inst;
+	int ret = 0;
+
+	UDEBUG("entering for msg %u\n", NFNL_MSG_TYPE(nlh->nlmsg_type));
+
+	if (nfattr_bad_size(nfula, NFULA_CFG_MAX, nfula_cfg_min)) {
+		UDEBUG("bad attribute size\n");
+		return -EINVAL;
+	}
+
+	inst = instance_lookup_get(group_num);
+	if (nfula[NFULA_CFG_CMD-1]) {
+		u_int8_t pf = nfmsg->nfgen_family;
+		struct nfulnl_msg_config_cmd *cmd;
+		cmd = NFA_DATA(nfula[NFULA_CFG_CMD-1]);
+		UDEBUG("found CFG_CMD for\n");
+
+		switch (cmd->command) {
+		case NFULNL_CFG_CMD_BIND:
+			if (inst) {
+				ret = -EBUSY;
+				goto out_put;
+			}
+
+			inst = instance_create(group_num,
+					       NETLINK_CB(skb).pid);
+			if (!inst) {
+				ret = -EINVAL;
+				goto out_put;
+			}
+			break;
+		case NFULNL_CFG_CMD_UNBIND:
+			if (!inst) {
+				ret = -ENODEV;
+				goto out_put;
+			}
+
+			if (inst->peer_pid != NETLINK_CB(skb).pid) {
+				ret = -EPERM;
+				goto out_put;
+			}
+
+			instance_destroy(inst);
+			break;
+		case NFULNL_CFG_CMD_PF_BIND:
+			UDEBUG("registering log handler for pf=%u\n", pf);
+			ret = nf_log_register(pf, &nfulnl_logger);
+			break;
+		case NFULNL_CFG_CMD_PF_UNBIND:
+			UDEBUG("unregistering log handler for pf=%u\n", pf);
+			/* This is a bug and a feature.  We cannot unregister
+			 * other handlers, like nfnetlink_inst can */
+			nf_log_unregister_pf(pf);
+			break;
+		default:
+			ret = -EINVAL;
+			break;
+		}
+	} else {
+		if (!inst) {
+			UDEBUG("no config command, and no instance for "
+				"group=%u pid=%u =>ENOENT\n",
+				group_num, NETLINK_CB(skb).pid);
+			ret = -ENOENT;
+			goto out_put;
+		}
+
+		if (inst->peer_pid != NETLINK_CB(skb).pid) {
+			UDEBUG("no config command, and wrong pid\n");
+			ret = -EPERM;
+			goto out_put;
+		}
+	}
+
+	if (nfula[NFULA_CFG_MODE-1]) {
+		struct nfulnl_msg_config_mode *params;
+		params = NFA_DATA(nfula[NFULA_CFG_MODE-1]);
+
+		nfulnl_set_mode(inst, params->copy_mode,
+				ntohs(params->copy_range));
+	}
+
+	if (nfula[NFULA_CFG_TIMEOUT-1]) {
+		u_int32_t timeout = 
+			*(u_int32_t *)NFA_DATA(nfula[NFULA_CFG_TIMEOUT-1]);
+
+		nfulnl_set_timeout(inst, ntohl(timeout));
+	}
+
+	if (nfula[NFULA_CFG_NLBUFSIZ-1]) {
+		u_int32_t nlbufsiz = 
+			*(u_int32_t *)NFA_DATA(nfula[NFULA_CFG_NLBUFSIZ-1]);
+
+		nfulnl_set_nlbufsiz(inst, ntohl(nlbufsiz));
+	}
+
+	if (nfula[NFULA_CFG_QTHRESH-1]) {
+		u_int32_t qthresh = 
+			*(u_int16_t *)NFA_DATA(nfula[NFULA_CFG_QTHRESH-1]);
+
+		nfulnl_set_qthresh(inst, ntohl(qthresh));
+	}
+
+out_put:
+	instance_put(inst);
+	return ret;
+}
+
+static struct nfnl_callback nfulnl_cb[NFULNL_MSG_MAX] = {
+	[NFULNL_MSG_PACKET]	= { .call = nfulnl_recv_unsupp,
+				    .attr_count = NFULA_MAX,
+				    .cap_required = CAP_NET_ADMIN, },
+	[NFULNL_MSG_CONFIG]	= { .call = nfulnl_recv_config,
+				    .attr_count = NFULA_CFG_MAX,
+				    .cap_required = CAP_NET_ADMIN },
+};
+
+static struct nfnetlink_subsystem nfulnl_subsys = {
+	.name		= "log",
+	.subsys_id	= NFNL_SUBSYS_ULOG,
+	.cb_count	= NFULNL_MSG_MAX,
+	.cb		= nfulnl_cb,
+};
+
+#ifdef CONFIG_PROC_FS
+struct iter_state {
+	unsigned int bucket;
+};
+
+static struct hlist_node *get_first(struct seq_file *seq)
+{
+	struct iter_state *st = seq->private;
+
+	if (!st)
+		return NULL;
+
+	for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) {
+		if (!hlist_empty(&instance_table[st->bucket]))
+			return instance_table[st->bucket].first;
+	}
+	return NULL;
+}
+
+static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h)
+{
+	struct iter_state *st = seq->private;
+
+	h = h->next;
+	while (!h) {
+		if (++st->bucket >= INSTANCE_BUCKETS)
+			return NULL;
+
+		h = instance_table[st->bucket].first;
+	}
+	return h;
+}
+
+static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct hlist_node *head;
+	head = get_first(seq);
+
+	if (head)
+		while (pos && (head = get_next(seq, head)))
+			pos--;
+	return pos ? NULL : head;
+}
+
+static void *seq_start(struct seq_file *seq, loff_t *pos)
+{
+	read_lock_bh(&instances_lock);
+	return get_idx(seq, *pos);
+}
+
+static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	(*pos)++;
+	return get_next(s, v);
+}
+
+static void seq_stop(struct seq_file *s, void *v)
+{
+	read_unlock_bh(&instances_lock);
+}
+
+static int seq_show(struct seq_file *s, void *v)
+{
+	const struct nfulnl_instance *inst = v;
+
+	return seq_printf(s, "%5d %6d %5d %1d %5d %6d %2d\n", 
+			  inst->group_num,
+			  inst->peer_pid, inst->qlen, 
+			  inst->copy_mode, inst->copy_range,
+			  inst->flushtimeout, atomic_read(&inst->use));
+}
+
+static struct seq_operations nful_seq_ops = {
+	.start	= seq_start,
+	.next	= seq_next,
+	.stop	= seq_stop,
+	.show	= seq_show,
+};
+
+static int nful_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq;
+	struct iter_state *is;
+	int ret;
+
+	is = kmalloc(sizeof(*is), GFP_KERNEL);
+	if (!is)
+		return -ENOMEM;
+	memset(is, 0, sizeof(*is));
+	ret = seq_open(file, &nful_seq_ops);
+	if (ret < 0)
+		goto out_free;
+	seq = file->private_data;
+	seq->private = is;
+	return ret;
+out_free:
+	kfree(is);
+	return ret;
+}
+
+static struct file_operations nful_file_ops = {
+	.owner	 = THIS_MODULE,
+	.open	 = nful_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = seq_release_private,
+};
+
+#endif /* PROC_FS */
+
+static int
+init_or_cleanup(int init)
+{
+	int i, status = -ENOMEM;
+#ifdef CONFIG_PROC_FS
+	struct proc_dir_entry *proc_nful;
+#endif
+	
+	if (!init)
+		goto cleanup;
+
+	for (i = 0; i < INSTANCE_BUCKETS; i++)
+		INIT_HLIST_HEAD(&instance_table[i]);
+	
+	/* it's not really all that important to have a random value, so
+	 * we can do this from the init function, even if there hasn't
+	 * been that much entropy yet */
+	get_random_bytes(&hash_init, sizeof(hash_init));
+
+	netlink_register_notifier(&nfulnl_rtnl_notifier);
+	status = nfnetlink_subsys_register(&nfulnl_subsys);
+	if (status < 0) {
+		printk(KERN_ERR "log: failed to create netlink socket\n");
+		goto cleanup_netlink_notifier;
+	}
+
+#ifdef CONFIG_PROC_FS
+	proc_nful = create_proc_entry("nfnetlink_log", 0440,
+				      proc_net_netfilter);
+	if (!proc_nful)
+		goto cleanup_subsys;
+	proc_nful->proc_fops = &nful_file_ops;
+#endif
+
+	return status;
+
+cleanup:
+	nf_log_unregister_logger(&nfulnl_logger);
+#ifdef CONFIG_PROC_FS
+	remove_proc_entry("nfnetlink_log", proc_net_netfilter);
+cleanup_subsys:
+#endif
+	nfnetlink_subsys_unregister(&nfulnl_subsys);
+cleanup_netlink_notifier:
+	netlink_unregister_notifier(&nfulnl_rtnl_notifier);
+	return status;
+}
+
+static int __init init(void)
+{
+	
+	return init_or_cleanup(1);
+}
+
+static void __exit fini(void)
+{
+	init_or_cleanup(0);
+}
+
+MODULE_DESCRIPTION("netfilter userspace logging");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ULOG);
+
+module_init(init);
+module_exit(fini);
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
new file mode 100644
index 00000000000..e3a5285329a
--- /dev/null
+++ b/net/netfilter/nfnetlink_queue.c
@@ -0,0 +1,1132 @@
+/*
+ * This is a module which is used for queueing packets and communicating with
+ * userspace via nfetlink.
+ *
+ * (C) 2005 by Harald Welte <laforge@netfilter.org>
+ *
+ * Based on the old ipv4-only ip_queue.c:
+ * (C) 2000-2002 James Morris <jmorris@intercode.com.au>
+ * (C) 2003-2005 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/notifier.h>
+#include <linux/netdevice.h>
+#include <linux/netfilter.h>
+#include <linux/proc_fs.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_queue.h>
+#include <linux/list.h>
+#include <net/sock.h>
+
+#include <asm/atomic.h>
+
+#ifdef CONFIG_BRIDGE_NETFILTER
+#include "../bridge/br_private.h"
+#endif
+
+#define NFQNL_QMAX_DEFAULT 1024
+
+#if 0
+#define QDEBUG(x, args ...)	printk(KERN_DEBUG "%s(%d):%s():	" x, 	   \
+					__FILE__, __LINE__, __FUNCTION__,  \
+					## args)
+#else
+#define QDEBUG(x, ...)
+#endif
+
+struct nfqnl_queue_entry {
+	struct list_head list;
+	struct nf_info *info;
+	struct sk_buff *skb;
+	unsigned int id;
+};
+
+struct nfqnl_instance {
+	struct hlist_node hlist;		/* global list of queues */
+	atomic_t use;
+
+	int peer_pid;
+	unsigned int queue_maxlen;
+	unsigned int copy_range;
+	unsigned int queue_total;
+	unsigned int queue_dropped;
+	unsigned int queue_user_dropped;
+
+	atomic_t id_sequence;			/* 'sequence' of pkt ids */
+
+	u_int16_t queue_num;			/* number of this queue */
+	u_int8_t copy_mode;
+
+	spinlock_t lock;
+
+	struct list_head queue_list;		/* packets in queue */
+};
+
+typedef int (*nfqnl_cmpfn)(struct nfqnl_queue_entry *, unsigned long);
+
+static DEFINE_RWLOCK(instances_lock);
+
+u_int64_t htonll(u_int64_t in)
+{
+	u_int64_t out;
+	int i;
+
+	for (i = 0; i < sizeof(u_int64_t); i++)
+		((u_int8_t *)&out)[sizeof(u_int64_t)-1] = ((u_int8_t *)&in)[i];
+
+	return out;
+}
+
+#define INSTANCE_BUCKETS	16
+static struct hlist_head instance_table[INSTANCE_BUCKETS];
+
+static inline u_int8_t instance_hashfn(u_int16_t queue_num)
+{
+	return ((queue_num >> 8) | queue_num) % INSTANCE_BUCKETS;
+}
+
+static struct nfqnl_instance *
+__instance_lookup(u_int16_t queue_num)
+{
+	struct hlist_head *head;
+	struct hlist_node *pos;
+	struct nfqnl_instance *inst;
+
+	head = &instance_table[instance_hashfn(queue_num)];
+	hlist_for_each_entry(inst, pos, head, hlist) {
+		if (inst->queue_num == queue_num)
+			return inst;
+	}
+	return NULL;
+}
+
+static struct nfqnl_instance *
+instance_lookup_get(u_int16_t queue_num)
+{
+	struct nfqnl_instance *inst;
+
+	read_lock_bh(&instances_lock);
+	inst = __instance_lookup(queue_num);
+	if (inst)
+		atomic_inc(&inst->use);
+	read_unlock_bh(&instances_lock);
+
+	return inst;
+}
+
+static void
+instance_put(struct nfqnl_instance *inst)
+{
+	if (inst && atomic_dec_and_test(&inst->use)) {
+		QDEBUG("kfree(inst=%p)\n", inst);
+		kfree(inst);
+	}
+}
+
+static struct nfqnl_instance *
+instance_create(u_int16_t queue_num, int pid)
+{
+	struct nfqnl_instance *inst;
+
+	QDEBUG("entering for queue_num=%u, pid=%d\n", queue_num, pid);
+
+	write_lock_bh(&instances_lock);	
+	if (__instance_lookup(queue_num)) {
+		inst = NULL;
+		QDEBUG("aborting, instance already exists\n");
+		goto out_unlock;
+	}
+
+	inst = kmalloc(sizeof(*inst), GFP_ATOMIC);
+	if (!inst)
+		goto out_unlock;
+
+	memset(inst, 0, sizeof(*inst));
+	inst->queue_num = queue_num;
+	inst->peer_pid = pid;
+	inst->queue_maxlen = NFQNL_QMAX_DEFAULT;
+	inst->copy_range = 0xfffff;
+	inst->copy_mode = NFQNL_COPY_NONE;
+	atomic_set(&inst->id_sequence, 0);
+	/* needs to be two, since we _put() after creation */
+	atomic_set(&inst->use, 2);
+	inst->lock = SPIN_LOCK_UNLOCKED;
+	INIT_LIST_HEAD(&inst->queue_list);
+
+	if (!try_module_get(THIS_MODULE))
+		goto out_free;
+
+	hlist_add_head(&inst->hlist, 
+		       &instance_table[instance_hashfn(queue_num)]);
+
+	write_unlock_bh(&instances_lock);
+
+	QDEBUG("successfully created new instance\n");
+
+	return inst;
+
+out_free:
+	kfree(inst);
+out_unlock:
+	write_unlock_bh(&instances_lock);
+	return NULL;
+}
+
+static void nfqnl_flush(struct nfqnl_instance *queue, int verdict);
+
+static void
+_instance_destroy2(struct nfqnl_instance *inst, int lock)
+{
+	/* first pull it out of the global list */
+	if (lock)
+		write_lock_bh(&instances_lock);
+
+	QDEBUG("removing instance %p (queuenum=%u) from hash\n",
+		inst, inst->queue_num);
+	hlist_del(&inst->hlist);
+
+	if (lock)
+		write_unlock_bh(&instances_lock);
+
+	/* then flush all pending skbs from the queue */
+	nfqnl_flush(inst, NF_DROP);
+
+	/* and finally put the refcount */
+	instance_put(inst);
+
+	module_put(THIS_MODULE);
+}
+
+static inline void
+__instance_destroy(struct nfqnl_instance *inst)
+{
+	_instance_destroy2(inst, 0);
+}
+
+static inline void
+instance_destroy(struct nfqnl_instance *inst)
+{
+	_instance_destroy2(inst, 1);
+}
+
+
+
+static void
+issue_verdict(struct nfqnl_queue_entry *entry, int verdict)
+{
+	QDEBUG("entering for entry %p, verdict %u\n", entry, verdict);
+
+	/* TCP input path (and probably other bits) assume to be called
+	 * from softirq context, not from syscall, like issue_verdict is
+	 * called.  TCP input path deadlocks with locks taken from timer
+	 * softirq, e.g.  We therefore emulate this by local_bh_disable() */
+
+	local_bh_disable();
+	nf_reinject(entry->skb, entry->info, verdict);
+	local_bh_enable();
+
+	kfree(entry);
+}
+
+static inline void
+__enqueue_entry(struct nfqnl_instance *queue,
+		      struct nfqnl_queue_entry *entry)
+{
+       list_add(&entry->list, &queue->queue_list);
+       queue->queue_total++;
+}
+
+/*
+ * Find and return a queued entry matched by cmpfn, or return the last
+ * entry if cmpfn is NULL.
+ */
+static inline struct nfqnl_queue_entry *
+__find_entry(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, 
+		   unsigned long data)
+{
+	struct list_head *p;
+
+	list_for_each_prev(p, &queue->queue_list) {
+		struct nfqnl_queue_entry *entry = (struct nfqnl_queue_entry *)p;
+		
+		if (!cmpfn || cmpfn(entry, data))
+			return entry;
+	}
+	return NULL;
+}
+
+static inline void
+__dequeue_entry(struct nfqnl_instance *q, struct nfqnl_queue_entry *entry)
+{
+	list_del(&entry->list);
+	q->queue_total--;
+}
+
+static inline struct nfqnl_queue_entry *
+__find_dequeue_entry(struct nfqnl_instance *queue,
+		     nfqnl_cmpfn cmpfn, unsigned long data)
+{
+	struct nfqnl_queue_entry *entry;
+
+	entry = __find_entry(queue, cmpfn, data);
+	if (entry == NULL)
+		return NULL;
+
+	__dequeue_entry(queue, entry);
+	return entry;
+}
+
+
+static inline void
+__nfqnl_flush(struct nfqnl_instance *queue, int verdict)
+{
+	struct nfqnl_queue_entry *entry;
+	
+	while ((entry = __find_dequeue_entry(queue, NULL, 0)))
+		issue_verdict(entry, verdict);
+}
+
+static inline int
+__nfqnl_set_mode(struct nfqnl_instance *queue,
+		 unsigned char mode, unsigned int range)
+{
+	int status = 0;
+	
+	switch (mode) {
+	case NFQNL_COPY_NONE:
+	case NFQNL_COPY_META:
+		queue->copy_mode = mode;
+		queue->copy_range = 0;
+		break;
+		
+	case NFQNL_COPY_PACKET:
+		queue->copy_mode = mode;
+		/* we're using struct nfattr which has 16bit nfa_len */
+		if (range > 0xffff)
+			queue->copy_range = 0xffff;
+		else
+			queue->copy_range = range;
+		break;
+		
+	default:
+		status = -EINVAL;
+
+	}
+	return status;
+}
+
+static struct nfqnl_queue_entry *
+find_dequeue_entry(struct nfqnl_instance *queue,
+			 nfqnl_cmpfn cmpfn, unsigned long data)
+{
+	struct nfqnl_queue_entry *entry;
+	
+	spin_lock_bh(&queue->lock);
+	entry = __find_dequeue_entry(queue, cmpfn, data);
+	spin_unlock_bh(&queue->lock);
+
+	return entry;
+}
+
+static void
+nfqnl_flush(struct nfqnl_instance *queue, int verdict)
+{
+	spin_lock_bh(&queue->lock);
+	__nfqnl_flush(queue, verdict);
+	spin_unlock_bh(&queue->lock);
+}
+
+static struct sk_buff *
+nfqnl_build_packet_message(struct nfqnl_instance *queue,
+			   struct nfqnl_queue_entry *entry, int *errp)
+{
+	unsigned char *old_tail;
+	size_t size;
+	size_t data_len = 0;
+	struct sk_buff *skb;
+	struct nfqnl_msg_packet_hdr pmsg;
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *nfmsg;
+	unsigned int tmp_uint;
+
+	QDEBUG("entered\n");
+
+	/* all macros expand to constant values at compile time */
+	size =    NLMSG_SPACE(sizeof(struct nfqnl_msg_packet_hdr))
+		+ NLMSG_SPACE(sizeof(u_int32_t))	/* ifindex */
+		+ NLMSG_SPACE(sizeof(u_int32_t))	/* ifindex */
+#ifdef CONFIG_BRIDGE_NETFILTER
+		+ NLMSG_SPACE(sizeof(u_int32_t))	/* ifindex */
+		+ NLMSG_SPACE(sizeof(u_int32_t))	/* ifindex */
+#endif
+		+ NLMSG_SPACE(sizeof(u_int32_t))	/* mark */
+		+ NLMSG_SPACE(sizeof(struct nfqnl_msg_packet_hw))
+		+ NLMSG_SPACE(sizeof(struct nfqnl_msg_packet_timestamp));
+
+	spin_lock_bh(&queue->lock);
+	
+	switch (queue->copy_mode) {
+	case NFQNL_COPY_META:
+	case NFQNL_COPY_NONE:
+		data_len = 0;
+		break;
+	
+	case NFQNL_COPY_PACKET:
+		if (queue->copy_range == 0 
+		    || queue->copy_range > entry->skb->len)
+			data_len = entry->skb->len;
+		else
+			data_len = queue->copy_range;
+		
+		size += NLMSG_SPACE(data_len);
+		break;
+	
+	default:
+		*errp = -EINVAL;
+		spin_unlock_bh(&queue->lock);
+		return NULL;
+	}
+
+	spin_unlock_bh(&queue->lock);
+
+	skb = alloc_skb(size, GFP_ATOMIC);
+	if (!skb)
+		goto nlmsg_failure;
+		
+	old_tail= skb->tail;
+	nlh = NLMSG_PUT(skb, 0, 0, 
+			NFNL_SUBSYS_QUEUE << 8 | NFQNL_MSG_PACKET,
+			sizeof(struct nfgenmsg));
+	nfmsg = NLMSG_DATA(nlh);
+	nfmsg->nfgen_family = entry->info->pf;
+	nfmsg->version = NFNETLINK_V0;
+	nfmsg->res_id = htons(queue->queue_num);
+
+	pmsg.packet_id 		= htonl(entry->id);
+	pmsg.hw_protocol	= htons(entry->skb->protocol);
+	pmsg.hook		= entry->info->hook;
+
+	NFA_PUT(skb, NFQA_PACKET_HDR, sizeof(pmsg), &pmsg);
+
+	if (entry->info->indev) {
+		tmp_uint = htonl(entry->info->indev->ifindex);
+#ifndef CONFIG_BRIDGE_NETFILTER
+		NFA_PUT(skb, NFQA_IFINDEX_INDEV, sizeof(tmp_uint), &tmp_uint);
+#else
+		if (entry->info->pf == PF_BRIDGE) {
+			/* Case 1: indev is physical input device, we need to
+			 * look for bridge group (when called from 
+			 * netfilter_bridge) */
+			NFA_PUT(skb, NFQA_IFINDEX_PHYSINDEV, sizeof(tmp_uint), 
+				&tmp_uint);
+			/* this is the bridge group "brX" */
+			tmp_uint = htonl(entry->info->indev->br_port->br->dev->ifindex);
+			NFA_PUT(skb, NFQA_IFINDEX_INDEV, sizeof(tmp_uint),
+				&tmp_uint);
+		} else {
+			/* Case 2: indev is bridge group, we need to look for
+			 * physical device (when called from ipv4) */
+			NFA_PUT(skb, NFQA_IFINDEX_INDEV, sizeof(tmp_uint),
+				&tmp_uint);
+			if (entry->skb->nf_bridge
+			    && entry->skb->nf_bridge->physindev) {
+				tmp_uint = htonl(entry->skb->nf_bridge->physindev->ifindex);
+				NFA_PUT(skb, NFQA_IFINDEX_PHYSINDEV,
+					sizeof(tmp_uint), &tmp_uint);
+			}
+		}
+#endif
+	}
+
+	if (entry->info->outdev) {
+		tmp_uint = htonl(entry->info->outdev->ifindex);
+#ifndef CONFIG_BRIDGE_NETFILTER
+		NFA_PUT(skb, NFQA_IFINDEX_OUTDEV, sizeof(tmp_uint), &tmp_uint);
+#else
+		if (entry->info->pf == PF_BRIDGE) {
+			/* Case 1: outdev is physical output device, we need to
+			 * look for bridge group (when called from 
+			 * netfilter_bridge) */
+			NFA_PUT(skb, NFQA_IFINDEX_PHYSOUTDEV, sizeof(tmp_uint),
+				&tmp_uint);
+			/* this is the bridge group "brX" */
+			tmp_uint = htonl(entry->info->outdev->br_port->br->dev->ifindex);
+			NFA_PUT(skb, NFQA_IFINDEX_OUTDEV, sizeof(tmp_uint),
+				&tmp_uint);
+		} else {
+			/* Case 2: outdev is bridge group, we need to look for
+			 * physical output device (when called from ipv4) */
+			NFA_PUT(skb, NFQA_IFINDEX_OUTDEV, sizeof(tmp_uint),
+				&tmp_uint);
+			if (entry->skb->nf_bridge
+			    && entry->skb->nf_bridge->physoutdev) {
+				tmp_uint = htonl(entry->skb->nf_bridge->physoutdev->ifindex);
+				NFA_PUT(skb, NFQA_IFINDEX_PHYSOUTDEV,
+					sizeof(tmp_uint), &tmp_uint);
+			}
+		}
+#endif
+	}
+
+	if (entry->skb->nfmark) {
+		tmp_uint = htonl(entry->skb->nfmark);
+		NFA_PUT(skb, NFQA_MARK, sizeof(u_int32_t), &tmp_uint);
+	}
+
+	if (entry->info->indev && entry->skb->dev
+	    && entry->skb->dev->hard_header_parse) {
+		struct nfqnl_msg_packet_hw phw;
+
+		phw.hw_addrlen =
+			entry->skb->dev->hard_header_parse(entry->skb,
+			                                   phw.hw_addr);
+		phw.hw_addrlen = htons(phw.hw_addrlen);
+		NFA_PUT(skb, NFQA_HWADDR, sizeof(phw), &phw);
+	}
+
+	if (entry->skb->tstamp.off_sec) {
+		struct nfqnl_msg_packet_timestamp ts;
+
+		ts.sec = htonll(skb_tv_base.tv_sec + entry->skb->tstamp.off_sec);
+		ts.usec = htonll(skb_tv_base.tv_usec + entry->skb->tstamp.off_usec);
+
+		NFA_PUT(skb, NFQA_TIMESTAMP, sizeof(ts), &ts);
+	}
+
+	if (data_len) {
+		struct nfattr *nfa;
+		int size = NFA_LENGTH(data_len);
+
+		if (skb_tailroom(skb) < (int)NFA_SPACE(data_len)) {
+			printk(KERN_WARNING "nf_queue: no tailroom!\n");
+			goto nlmsg_failure;
+		}
+
+		nfa = (struct nfattr *)skb_put(skb, NFA_ALIGN(size));
+		nfa->nfa_type = NFQA_PAYLOAD;
+		nfa->nfa_len = size;
+
+		if (skb_copy_bits(entry->skb, 0, NFA_DATA(nfa), data_len))
+			BUG();
+	}
+		
+	nlh->nlmsg_len = skb->tail - old_tail;
+	return skb;
+
+nlmsg_failure:
+nfattr_failure:
+	if (skb)
+		kfree_skb(skb);
+	*errp = -EINVAL;
+	if (net_ratelimit())
+		printk(KERN_ERR "nf_queue: error creating packet message\n");
+	return NULL;
+}
+
+static int
+nfqnl_enqueue_packet(struct sk_buff *skb, struct nf_info *info, 
+		     unsigned int queuenum, void *data)
+{
+	int status = -EINVAL;
+	struct sk_buff *nskb;
+	struct nfqnl_instance *queue;
+	struct nfqnl_queue_entry *entry;
+
+	QDEBUG("entered\n");
+
+	queue = instance_lookup_get(queuenum);
+	if (!queue) {
+		QDEBUG("no queue instance matching\n");
+		return -EINVAL;
+	}
+
+	if (queue->copy_mode == NFQNL_COPY_NONE) {
+		QDEBUG("mode COPY_NONE, aborting\n");
+		status = -EAGAIN;
+		goto err_out_put;
+	}
+
+	entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
+	if (entry == NULL) {
+		if (net_ratelimit())
+			printk(KERN_ERR 
+				"nf_queue: OOM in nfqnl_enqueue_packet()\n");
+		status = -ENOMEM;
+		goto err_out_put;
+	}
+
+	entry->info = info;
+	entry->skb = skb;
+	entry->id = atomic_inc_return(&queue->id_sequence);
+
+	nskb = nfqnl_build_packet_message(queue, entry, &status);
+	if (nskb == NULL)
+		goto err_out_free;
+		
+	spin_lock_bh(&queue->lock);
+	
+	if (!queue->peer_pid)
+		goto err_out_free_nskb; 
+
+	if (queue->queue_total >= queue->queue_maxlen) {
+                queue->queue_dropped++;
+		status = -ENOSPC;
+		if (net_ratelimit())
+		          printk(KERN_WARNING "ip_queue: full at %d entries, "
+				 "dropping packets(s). Dropped: %d\n", 
+				 queue->queue_total, queue->queue_dropped);
+		goto err_out_free_nskb;
+	}
+
+	/* nfnetlink_unicast will either free the nskb or add it to a socket */
+	status = nfnetlink_unicast(nskb, queue->peer_pid, MSG_DONTWAIT);
+	if (status < 0) {
+	        queue->queue_user_dropped++;
+		goto err_out_unlock;
+	}
+
+	__enqueue_entry(queue, entry);
+
+	spin_unlock_bh(&queue->lock);
+	instance_put(queue);
+	return status;
+
+err_out_free_nskb:
+	kfree_skb(nskb); 
+	
+err_out_unlock:
+	spin_unlock_bh(&queue->lock);
+
+err_out_free:
+	kfree(entry);
+err_out_put:
+	instance_put(queue);
+	return status;
+}
+
+static int
+nfqnl_mangle(void *data, int data_len, struct nfqnl_queue_entry *e)
+{
+	int diff;
+
+	diff = data_len - e->skb->len;
+	if (diff < 0)
+		skb_trim(e->skb, data_len);
+	else if (diff > 0) {
+		if (data_len > 0xFFFF)
+			return -EINVAL;
+		if (diff > skb_tailroom(e->skb)) {
+			struct sk_buff *newskb;
+			
+			newskb = skb_copy_expand(e->skb,
+			                         skb_headroom(e->skb),
+			                         diff,
+			                         GFP_ATOMIC);
+			if (newskb == NULL) {
+				printk(KERN_WARNING "ip_queue: OOM "
+				      "in mangle, dropping packet\n");
+				return -ENOMEM;
+			}
+			if (e->skb->sk)
+				skb_set_owner_w(newskb, e->skb->sk);
+			kfree_skb(e->skb);
+			e->skb = newskb;
+		}
+		skb_put(e->skb, diff);
+	}
+	if (!skb_make_writable(&e->skb, data_len))
+		return -ENOMEM;
+	memcpy(e->skb->data, data, data_len);
+
+	return 0;
+}
+
+static inline int
+id_cmp(struct nfqnl_queue_entry *e, unsigned long id)
+{
+	return (id == e->id);
+}
+
+static int
+nfqnl_set_mode(struct nfqnl_instance *queue,
+	       unsigned char mode, unsigned int range)
+{
+	int status;
+
+	spin_lock_bh(&queue->lock);
+	status = __nfqnl_set_mode(queue, mode, range);
+	spin_unlock_bh(&queue->lock);
+
+	return status;
+}
+
+static int
+dev_cmp(struct nfqnl_queue_entry *entry, unsigned long ifindex)
+{
+	if (entry->info->indev)
+		if (entry->info->indev->ifindex == ifindex)
+			return 1;
+			
+	if (entry->info->outdev)
+		if (entry->info->outdev->ifindex == ifindex)
+			return 1;
+
+	return 0;
+}
+
+/* drop all packets with either indev or outdev == ifindex from all queue
+ * instances */
+static void
+nfqnl_dev_drop(int ifindex)
+{
+	int i;
+	
+	QDEBUG("entering for ifindex %u\n", ifindex);
+
+	/* this only looks like we have to hold the readlock for a way too long
+	 * time, issue_verdict(),  nf_reinject(), ... - but we always only
+	 * issue NF_DROP, which is processed directly in nf_reinject() */
+	read_lock_bh(&instances_lock);
+
+	for  (i = 0; i < INSTANCE_BUCKETS; i++) {
+		struct hlist_node *tmp;
+		struct nfqnl_instance *inst;
+		struct hlist_head *head = &instance_table[i];
+
+		hlist_for_each_entry(inst, tmp, head, hlist) {
+			struct nfqnl_queue_entry *entry;
+			while ((entry = find_dequeue_entry(inst, dev_cmp, 
+							   ifindex)) != NULL)
+				issue_verdict(entry, NF_DROP);
+		}
+	}
+
+	read_unlock_bh(&instances_lock);
+}
+
+#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
+
+static int
+nfqnl_rcv_dev_event(struct notifier_block *this,
+		    unsigned long event, void *ptr)
+{
+	struct net_device *dev = ptr;
+
+	/* Drop any packets associated with the downed device */
+	if (event == NETDEV_DOWN)
+		nfqnl_dev_drop(dev->ifindex);
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block nfqnl_dev_notifier = {
+	.notifier_call	= nfqnl_rcv_dev_event,
+};
+
+static int
+nfqnl_rcv_nl_event(struct notifier_block *this,
+		   unsigned long event, void *ptr)
+{
+	struct netlink_notify *n = ptr;
+
+	if (event == NETLINK_URELEASE &&
+	    n->protocol == NETLINK_NETFILTER && n->pid) {
+		int i;
+
+		/* destroy all instances for this pid */
+		write_lock_bh(&instances_lock);
+		for  (i = 0; i < INSTANCE_BUCKETS; i++) {
+			struct hlist_node *tmp, *t2;
+			struct nfqnl_instance *inst;
+			struct hlist_head *head = &instance_table[i];
+
+			hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) {
+				if (n->pid == inst->peer_pid)
+					__instance_destroy(inst);
+			}
+		}
+		write_unlock_bh(&instances_lock);
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block nfqnl_rtnl_notifier = {
+	.notifier_call	= nfqnl_rcv_nl_event,
+};
+
+static const int nfqa_verdict_min[NFQA_MAX] = {
+	[NFQA_VERDICT_HDR-1]	= sizeof(struct nfqnl_msg_verdict_hdr),
+	[NFQA_MARK-1]		= sizeof(u_int32_t),
+	[NFQA_PAYLOAD-1]	= 0,
+};
+
+static int
+nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb,
+		   struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp)
+{
+	struct nfgenmsg *nfmsg = NLMSG_DATA(nlh);
+	u_int16_t queue_num = ntohs(nfmsg->res_id);
+
+	struct nfqnl_msg_verdict_hdr *vhdr;
+	struct nfqnl_instance *queue;
+	unsigned int verdict;
+	struct nfqnl_queue_entry *entry;
+	int err;
+
+	if (nfattr_bad_size(nfqa, NFQA_MAX, nfqa_verdict_min)) {
+		QDEBUG("bad attribute size\n");
+		return -EINVAL;
+	}
+
+	queue = instance_lookup_get(queue_num);
+	if (!queue)
+		return -ENODEV;
+
+	if (queue->peer_pid != NETLINK_CB(skb).pid) {
+		err = -EPERM;
+		goto err_out_put;
+	}
+
+	if (!nfqa[NFQA_VERDICT_HDR-1]) {
+		err = -EINVAL;
+		goto err_out_put;
+	}
+
+	vhdr = NFA_DATA(nfqa[NFQA_VERDICT_HDR-1]);
+	verdict = ntohl(vhdr->verdict);
+
+	if ((verdict & NF_VERDICT_MASK) > NF_MAX_VERDICT) {
+		err = -EINVAL;
+		goto err_out_put;
+	}
+
+	entry = find_dequeue_entry(queue, id_cmp, ntohl(vhdr->id));
+	if (entry == NULL) {
+		err = -ENOENT;
+		goto err_out_put;
+	}
+
+	if (nfqa[NFQA_PAYLOAD-1]) {
+		if (nfqnl_mangle(NFA_DATA(nfqa[NFQA_PAYLOAD-1]),
+				 NFA_PAYLOAD(nfqa[NFQA_PAYLOAD-1]), entry) < 0)
+			verdict = NF_DROP;
+	}
+
+	if (nfqa[NFQA_MARK-1])
+		skb->nfmark = ntohl(*(u_int32_t *)NFA_DATA(nfqa[NFQA_MARK-1]));
+		
+	issue_verdict(entry, verdict);
+	instance_put(queue);
+	return 0;
+
+err_out_put:
+	instance_put(queue);
+	return err;
+}
+
+static int
+nfqnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb,
+		  struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp)
+{
+	return -ENOTSUPP;
+}
+
+static const int nfqa_cfg_min[NFQA_CFG_MAX] = {
+	[NFQA_CFG_CMD-1]	= sizeof(struct nfqnl_msg_config_cmd),
+	[NFQA_CFG_PARAMS-1]	= sizeof(struct nfqnl_msg_config_params),
+};
+
+static struct nf_queue_handler nfqh = {
+	.name 	= "nf_queue",
+	.outfn	= &nfqnl_enqueue_packet,
+};
+
+static int
+nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
+		  struct nlmsghdr *nlh, struct nfattr *nfqa[], int *errp)
+{
+	struct nfgenmsg *nfmsg = NLMSG_DATA(nlh);
+	u_int16_t queue_num = ntohs(nfmsg->res_id);
+	struct nfqnl_instance *queue;
+	int ret = 0;
+
+	QDEBUG("entering for msg %u\n", NFNL_MSG_TYPE(nlh->nlmsg_type));
+
+	if (nfattr_bad_size(nfqa, NFQA_CFG_MAX, nfqa_cfg_min)) {
+		QDEBUG("bad attribute size\n");
+		return -EINVAL;
+	}
+
+	queue = instance_lookup_get(queue_num);
+	if (nfqa[NFQA_CFG_CMD-1]) {
+		struct nfqnl_msg_config_cmd *cmd;
+		cmd = NFA_DATA(nfqa[NFQA_CFG_CMD-1]);
+		QDEBUG("found CFG_CMD\n");
+
+		switch (cmd->command) {
+		case NFQNL_CFG_CMD_BIND:
+			if (queue)
+				return -EBUSY;
+
+			queue = instance_create(queue_num, NETLINK_CB(skb).pid);
+			if (!queue)
+				return -EINVAL;
+			break;
+		case NFQNL_CFG_CMD_UNBIND:
+			if (!queue)
+				return -ENODEV;
+
+			if (queue->peer_pid != NETLINK_CB(skb).pid) {
+				ret = -EPERM;
+				goto out_put;
+			}
+
+			instance_destroy(queue);
+			break;
+		case NFQNL_CFG_CMD_PF_BIND:
+			QDEBUG("registering queue handler for pf=%u\n",
+				ntohs(cmd->pf));
+			ret = nf_register_queue_handler(ntohs(cmd->pf), &nfqh);
+			break;
+		case NFQNL_CFG_CMD_PF_UNBIND:
+			QDEBUG("unregistering queue handler for pf=%u\n",
+				ntohs(cmd->pf));
+			/* This is a bug and a feature.  We can unregister
+			 * other handlers(!) */
+			ret = nf_unregister_queue_handler(ntohs(cmd->pf));
+			break;
+		default:
+			ret = -EINVAL;
+			break;
+		}
+	} else {
+		if (!queue) {
+			QDEBUG("no config command, and no instance ENOENT\n");
+			ret = -ENOENT;
+			goto out_put;
+		}
+
+		if (queue->peer_pid != NETLINK_CB(skb).pid) {
+			QDEBUG("no config command, and wrong pid\n");
+			ret = -EPERM;
+			goto out_put;
+		}
+	}
+
+	if (nfqa[NFQA_CFG_PARAMS-1]) {
+		struct nfqnl_msg_config_params *params;
+		params = NFA_DATA(nfqa[NFQA_CFG_PARAMS-1]);
+
+		nfqnl_set_mode(queue, params->copy_mode,
+				ntohl(params->copy_range));
+	}
+
+out_put:
+	instance_put(queue);
+	return ret;
+}
+
+static struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = {
+	[NFQNL_MSG_PACKET]	= { .call = nfqnl_recv_unsupp,
+				    .attr_count = NFQA_MAX,
+				    .cap_required = CAP_NET_ADMIN },
+	[NFQNL_MSG_VERDICT]	= { .call = nfqnl_recv_verdict,
+				    .attr_count = NFQA_MAX,
+				    .cap_required = CAP_NET_ADMIN },
+	[NFQNL_MSG_CONFIG]	= { .call = nfqnl_recv_config,
+				    .attr_count = NFQA_CFG_MAX,
+				    .cap_required = CAP_NET_ADMIN },
+};
+
+static struct nfnetlink_subsystem nfqnl_subsys = {
+	.name		= "nf_queue",
+	.subsys_id	= NFNL_SUBSYS_QUEUE,
+	.cb_count	= NFQNL_MSG_MAX,
+	.cb		= nfqnl_cb,
+};
+
+#ifdef CONFIG_PROC_FS
+struct iter_state {
+	unsigned int bucket;
+};
+
+static struct hlist_node *get_first(struct seq_file *seq)
+{
+	struct iter_state *st = seq->private;
+
+	if (!st)
+		return NULL;
+
+	for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) {
+		if (!hlist_empty(&instance_table[st->bucket]))
+			return instance_table[st->bucket].first;
+	}
+	return NULL;
+}
+
+static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h)
+{
+	struct iter_state *st = seq->private;
+
+	h = h->next;
+	while (!h) {
+		if (++st->bucket >= INSTANCE_BUCKETS)
+			return NULL;
+
+		h = instance_table[st->bucket].first;
+	}
+	return h;
+}
+
+static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct hlist_node *head;
+	head = get_first(seq);
+
+	if (head)
+		while (pos && (head = get_next(seq, head)))
+			pos--;
+	return pos ? NULL : head;
+}
+
+static void *seq_start(struct seq_file *seq, loff_t *pos)
+{
+	read_lock_bh(&instances_lock);
+	return get_idx(seq, *pos);
+}
+
+static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	(*pos)++;
+	return get_next(s, v);
+}
+
+static void seq_stop(struct seq_file *s, void *v)
+{
+	read_unlock_bh(&instances_lock);
+}
+
+static int seq_show(struct seq_file *s, void *v)
+{
+	const struct nfqnl_instance *inst = v;
+
+	return seq_printf(s, "%5d %6d %5d %1d %5d %5d %5d %8d %2d\n",
+			  inst->queue_num,
+			  inst->peer_pid, inst->queue_total,
+			  inst->copy_mode, inst->copy_range,
+			  inst->queue_dropped, inst->queue_user_dropped,
+			  atomic_read(&inst->id_sequence),
+			  atomic_read(&inst->use));
+}
+
+static struct seq_operations nfqnl_seq_ops = {
+	.start	= seq_start,
+	.next	= seq_next,
+	.stop	= seq_stop,
+	.show	= seq_show,
+};
+
+static int nfqnl_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq;
+	struct iter_state *is;
+	int ret;
+
+	is = kmalloc(sizeof(*is), GFP_KERNEL);
+	if (!is)
+		return -ENOMEM;
+	memset(is, 0, sizeof(*is));
+	ret = seq_open(file, &nfqnl_seq_ops);
+	if (ret < 0)
+		goto out_free;
+	seq = file->private_data;
+	seq->private = is;
+	return ret;
+out_free:
+	kfree(is);
+	return ret;
+}
+
+static struct file_operations nfqnl_file_ops = {
+	.owner	 = THIS_MODULE,
+	.open	 = nfqnl_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = seq_release_private,
+};
+
+#endif /* PROC_FS */
+
+static int
+init_or_cleanup(int init)
+{
+	int i, status = -ENOMEM;
+#ifdef CONFIG_PROC_FS
+	struct proc_dir_entry *proc_nfqueue;
+#endif
+	
+	if (!init)
+		goto cleanup;
+
+	for (i = 0; i < INSTANCE_BUCKETS; i++)
+		INIT_HLIST_HEAD(&instance_table[i]);
+
+	netlink_register_notifier(&nfqnl_rtnl_notifier);
+	status = nfnetlink_subsys_register(&nfqnl_subsys);
+	if (status < 0) {
+		printk(KERN_ERR "nf_queue: failed to create netlink socket\n");
+		goto cleanup_netlink_notifier;
+	}
+
+#ifdef CONFIG_PROC_FS
+	proc_nfqueue = create_proc_entry("nfnetlink_queue", 0440,
+					 proc_net_netfilter);
+	if (!proc_nfqueue)
+		goto cleanup_subsys;
+	proc_nfqueue->proc_fops = &nfqnl_file_ops;
+#endif
+
+	register_netdevice_notifier(&nfqnl_dev_notifier);
+
+	return status;
+
+cleanup:
+	nf_unregister_queue_handlers(&nfqh);
+	unregister_netdevice_notifier(&nfqnl_dev_notifier);
+#ifdef CONFIG_PROC_FS
+	remove_proc_entry("nfnetlink_queue", proc_net_netfilter);
+cleanup_subsys:
+#endif	
+	nfnetlink_subsys_unregister(&nfqnl_subsys);
+cleanup_netlink_notifier:
+	netlink_unregister_notifier(&nfqnl_rtnl_notifier);
+	return status;
+}
+
+static int __init init(void)
+{
+	
+	return init_or_cleanup(1);
+}
+
+static void __exit fini(void)
+{
+	init_or_cleanup(0);
+}
+
+MODULE_DESCRIPTION("netfilter packet queue handler");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_QUEUE);
+
+module_init(init);
+module_exit(fini);
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index ff774a06c89..62435ffc618 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -13,7 +13,12 @@
  *                               added netlink_proto_exit
  * Tue Jan 22 18:32:44 BRST 2002 Arnaldo C. de Melo <acme@conectiva.com.br>
  * 				 use nlk_sk, as sk->protinfo is on a diet 8)
- *
+ * Fri Jul 22 19:51:12 MEST 2005 Harald Welte <laforge@gnumonks.org>
+ * 				 - inc module use count of module that owns
+ * 				   the kernel socket in case userspace opens
+ * 				   socket of same protocol
+ * 				 - remove all module support, since netlink is
+ * 				   mandatory if CONFIG_NET=y these days
  */
 
 #include <linux/config.h>
@@ -55,21 +60,29 @@
 #include <net/scm.h>
 
 #define Nprintk(a...)
+#define NLGRPSZ(x)	(ALIGN(x, sizeof(unsigned long) * 8) / 8)
 
 struct netlink_sock {
 	/* struct sock has to be the first member of netlink_sock */
 	struct sock		sk;
 	u32			pid;
-	unsigned int		groups;
 	u32			dst_pid;
-	unsigned int		dst_groups;
+	u32			dst_group;
+	u32			flags;
+	u32			subscriptions;
+	u32			ngroups;
+	unsigned long		*groups;
 	unsigned long		state;
 	wait_queue_head_t	wait;
 	struct netlink_callback	*cb;
 	spinlock_t		cb_lock;
 	void			(*data_ready)(struct sock *sk, int bytes);
+	struct module		*module;
 };
 
+#define NETLINK_KERNEL_SOCKET	0x1
+#define NETLINK_RECV_PKTINFO	0x2
+
 static inline struct netlink_sock *nlk_sk(struct sock *sk)
 {
 	return (struct netlink_sock *)sk;
@@ -92,6 +105,9 @@ struct netlink_table {
 	struct nl_pid_hash hash;
 	struct hlist_head mc_list;
 	unsigned int nl_nonroot;
+	unsigned int groups;
+	struct module *module;
+	int registered;
 };
 
 static struct netlink_table *nl_table;
@@ -106,6 +122,11 @@ static atomic_t nl_table_users = ATOMIC_INIT(0);
 
 static struct notifier_block *netlink_chain;
 
+static u32 netlink_group_mask(u32 group)
+{
+	return group ? 1 << (group - 1) : 0;
+}
+
 static struct hlist_head *nl_pid_hashfn(struct nl_pid_hash *hash, u32 pid)
 {
 	return &hash->table[jhash_1word(pid, hash->rnd) & hash->mask];
@@ -122,6 +143,7 @@ static void netlink_sock_destruct(struct sock *sk)
 	BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
 	BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
 	BUG_TRAP(!nlk_sk(sk)->cb);
+	BUG_TRAP(!nlk_sk(sk)->groups);
 }
 
 /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on SMP.
@@ -317,7 +339,7 @@ static void netlink_remove(struct sock *sk)
 	netlink_table_grab();
 	if (sk_del_node_init(sk))
 		nl_table[sk->sk_protocol].hash.entries--;
-	if (nlk_sk(sk)->groups)
+	if (nlk_sk(sk)->subscriptions)
 		__sk_del_bind_node(sk);
 	netlink_table_ungrab();
 }
@@ -328,19 +350,11 @@ static struct proto netlink_proto = {
 	.obj_size = sizeof(struct netlink_sock),
 };
 
-static int netlink_create(struct socket *sock, int protocol)
+static int __netlink_create(struct socket *sock, int protocol)
 {
 	struct sock *sk;
 	struct netlink_sock *nlk;
 
-	sock->state = SS_UNCONNECTED;
-
-	if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
-		return -ESOCKTNOSUPPORT;
-
-	if (protocol<0 || protocol >= MAX_LINKS)
-		return -EPROTONOSUPPORT;
-
 	sock->ops = &netlink_ops;
 
 	sk = sk_alloc(PF_NETLINK, GFP_KERNEL, &netlink_proto, 1);
@@ -350,15 +364,67 @@ static int netlink_create(struct socket *sock, int protocol)
 	sock_init_data(sock, sk);
 
 	nlk = nlk_sk(sk);
-
 	spin_lock_init(&nlk->cb_lock);
 	init_waitqueue_head(&nlk->wait);
-	sk->sk_destruct = netlink_sock_destruct;
 
+	sk->sk_destruct = netlink_sock_destruct;
 	sk->sk_protocol = protocol;
 	return 0;
 }
 
+static int netlink_create(struct socket *sock, int protocol)
+{
+	struct module *module = NULL;
+	struct netlink_sock *nlk;
+	unsigned int groups;
+	int err = 0;
+
+	sock->state = SS_UNCONNECTED;
+
+	if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
+		return -ESOCKTNOSUPPORT;
+
+	if (protocol<0 || protocol >= MAX_LINKS)
+		return -EPROTONOSUPPORT;
+
+	netlink_lock_table();
+#ifdef CONFIG_KMOD
+	if (!nl_table[protocol].registered) {
+		netlink_unlock_table();
+		request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol);
+		netlink_lock_table();
+	}
+#endif
+	if (nl_table[protocol].registered &&
+	    try_module_get(nl_table[protocol].module))
+		module = nl_table[protocol].module;
+	else
+		err = -EPROTONOSUPPORT;
+	groups = nl_table[protocol].groups;
+	netlink_unlock_table();
+
+	if (err || (err = __netlink_create(sock, protocol) < 0))
+		goto out_module;
+
+	nlk = nlk_sk(sock->sk);
+
+	nlk->groups = kmalloc(NLGRPSZ(groups), GFP_KERNEL);
+	if (nlk->groups == NULL) {
+		err = -ENOMEM;
+		goto out_module;
+	}
+	memset(nlk->groups, 0, NLGRPSZ(groups));
+	nlk->ngroups = groups;
+
+	nlk->module = module;
+out:
+	return err;
+
+out_module:
+	module_put(module);
+	goto out;
+}
+
 static int netlink_release(struct socket *sock)
 {
 	struct sock *sk = sock->sk;
@@ -387,14 +453,27 @@ static int netlink_release(struct socket *sock)
 
 	skb_queue_purge(&sk->sk_write_queue);
 
-	if (nlk->pid && !nlk->groups) {
+	if (nlk->pid && !nlk->subscriptions) {
 		struct netlink_notify n = {
 						.protocol = sk->sk_protocol,
 						.pid = nlk->pid,
 					  };
 		notifier_call_chain(&netlink_chain, NETLINK_URELEASE, &n);
 	}	
-	
+
+	if (nlk->module)
+		module_put(nlk->module);
+
+	if (nlk->flags & NETLINK_KERNEL_SOCKET) {
+		netlink_table_grab();
+		nl_table[sk->sk_protocol].module = NULL;
+		nl_table[sk->sk_protocol].registered = 0;
+		netlink_table_ungrab();
+	}
+
+	kfree(nlk->groups);
+	nlk->groups = NULL;
+
 	sock_put(sk);
 	return 0;
 }
@@ -443,6 +522,18 @@ static inline int netlink_capable(struct socket *sock, unsigned int flag)
 	       capable(CAP_NET_ADMIN);
 } 
 
+static void
+netlink_update_subscriptions(struct sock *sk, unsigned int subscriptions)
+{
+	struct netlink_sock *nlk = nlk_sk(sk);
+
+	if (nlk->subscriptions && !subscriptions)
+		__sk_del_bind_node(sk);
+	else if (!nlk->subscriptions && subscriptions)
+		sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list);
+	nlk->subscriptions = subscriptions;
+}
+
 static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 {
 	struct sock *sk = sock->sk;
@@ -468,15 +559,14 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len
 			return err;
 	}
 
-	if (!nladdr->nl_groups && !nlk->groups)
+	if (!nladdr->nl_groups && !(u32)nlk->groups[0])
 		return 0;
 
 	netlink_table_grab();
-	if (nlk->groups && !nladdr->nl_groups)
-		__sk_del_bind_node(sk);
-	else if (!nlk->groups && nladdr->nl_groups)
-		sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list);
-	nlk->groups = nladdr->nl_groups;
+	netlink_update_subscriptions(sk, nlk->subscriptions +
+	                                 hweight32(nladdr->nl_groups) -
+	                                 hweight32(nlk->groups[0]));
+	nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | nladdr->nl_groups; 
 	netlink_table_ungrab();
 
 	return 0;
@@ -493,7 +583,7 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr,
 	if (addr->sa_family == AF_UNSPEC) {
 		sk->sk_state	= NETLINK_UNCONNECTED;
 		nlk->dst_pid	= 0;
-		nlk->dst_groups = 0;
+		nlk->dst_group  = 0;
 		return 0;
 	}
 	if (addr->sa_family != AF_NETLINK)
@@ -509,7 +599,7 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr,
 	if (err == 0) {
 		sk->sk_state	= NETLINK_CONNECTED;
 		nlk->dst_pid 	= nladdr->nl_pid;
-		nlk->dst_groups = nladdr->nl_groups;
+		nlk->dst_group  = ffs(nladdr->nl_groups);
 	}
 
 	return err;
@@ -527,10 +617,10 @@ static int netlink_getname(struct socket *sock, struct sockaddr *addr, int *addr
 
 	if (peer) {
 		nladdr->nl_pid = nlk->dst_pid;
-		nladdr->nl_groups = nlk->dst_groups;
+		nladdr->nl_groups = netlink_group_mask(nlk->dst_group);
 	} else {
 		nladdr->nl_pid = nlk->pid;
-		nladdr->nl_groups = nlk->groups;
+		nladdr->nl_groups = nlk->groups[0];
 	}
 	return 0;
 }
@@ -731,7 +821,8 @@ static inline int do_one_broadcast(struct sock *sk,
 	if (p->exclude_sk == sk)
 		goto out;
 
-	if (nlk->pid == p->pid || !(nlk->groups & p->group))
+	if (nlk->pid == p->pid || p->group - 1 >= nlk->ngroups ||
+	    !test_bit(p->group - 1, nlk->groups))
 		goto out;
 
 	if (p->failure) {
@@ -770,7 +861,7 @@ out:
 }
 
 int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid,
-		      u32 group, int allocation)
+		      u32 group, unsigned int __nocast allocation)
 {
 	struct netlink_broadcast_data info;
 	struct hlist_node *node;
@@ -827,7 +918,8 @@ static inline int do_one_set_err(struct sock *sk,
 	if (sk == p->exclude_sk)
 		goto out;
 
-	if (nlk->pid == p->pid || !(nlk->groups & p->group))
+	if (nlk->pid == p->pid || p->group - 1 >= nlk->ngroups ||
+	    !test_bit(p->group - 1, nlk->groups))
 		goto out;
 
 	sk->sk_err = p->code;
@@ -855,6 +947,94 @@ void netlink_set_err(struct sock *ssk, u32 pid, u32 group, int code)
 	read_unlock(&nl_table_lock);
 }
 
+static int netlink_setsockopt(struct socket *sock, int level, int optname,
+                              char __user *optval, int optlen)
+{
+	struct sock *sk = sock->sk;
+	struct netlink_sock *nlk = nlk_sk(sk);
+	int val = 0, err;
+
+	if (level != SOL_NETLINK)
+		return -ENOPROTOOPT;
+
+	if (optlen >= sizeof(int) &&
+	    get_user(val, (int __user *)optval))
+		return -EFAULT;
+
+	switch (optname) {
+	case NETLINK_PKTINFO:
+		if (val)
+			nlk->flags |= NETLINK_RECV_PKTINFO;
+		else
+			nlk->flags &= ~NETLINK_RECV_PKTINFO;
+		err = 0;
+		break;
+	case NETLINK_ADD_MEMBERSHIP:
+	case NETLINK_DROP_MEMBERSHIP: {
+		unsigned int subscriptions;
+		int old, new = optname == NETLINK_ADD_MEMBERSHIP ? 1 : 0;
+
+		if (!netlink_capable(sock, NL_NONROOT_RECV))
+			return -EPERM;
+		if (!val || val - 1 >= nlk->ngroups)
+			return -EINVAL;
+		netlink_table_grab();
+		old = test_bit(val - 1, nlk->groups);
+		subscriptions = nlk->subscriptions - old + new;
+		if (new)
+			__set_bit(val - 1, nlk->groups);
+		else
+			__clear_bit(val - 1, nlk->groups);
+		netlink_update_subscriptions(sk, subscriptions);
+		netlink_table_ungrab();
+		err = 0;
+		break;
+	}
+	default:
+		err = -ENOPROTOOPT;
+	}
+	return err;
+}
+
+static int netlink_getsockopt(struct socket *sock, int level, int optname,
+                              char __user *optval, int __user *optlen)
+{
+	struct sock *sk = sock->sk;
+	struct netlink_sock *nlk = nlk_sk(sk);
+	int len, val, err;
+
+	if (level != SOL_NETLINK)
+		return -ENOPROTOOPT;
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+	if (len < 0)
+		return -EINVAL;
+
+	switch (optname) {
+	case NETLINK_PKTINFO:
+		if (len < sizeof(int))
+			return -EINVAL;
+		len = sizeof(int);
+		val = nlk->flags & NETLINK_RECV_PKTINFO ? 1 : 0;
+		put_user(len, optlen);
+		put_user(val, optval);
+		err = 0;
+		break;
+	default:
+		err = -ENOPROTOOPT;
+	}
+	return err;
+}
+
+static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
+{
+	struct nl_pktinfo info;
+
+	info.group = NETLINK_CB(skb).dst_group;
+	put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info);
+}
+
 static inline void netlink_rcv_wake(struct sock *sk)
 {
 	struct netlink_sock *nlk = nlk_sk(sk);
@@ -873,7 +1053,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
 	struct netlink_sock *nlk = nlk_sk(sk);
 	struct sockaddr_nl *addr=msg->msg_name;
 	u32 dst_pid;
-	u32 dst_groups;
+	u32 dst_group;
 	struct sk_buff *skb;
 	int err;
 	struct scm_cookie scm;
@@ -891,12 +1071,12 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		if (addr->nl_family != AF_NETLINK)
 			return -EINVAL;
 		dst_pid = addr->nl_pid;
-		dst_groups = addr->nl_groups;
-		if (dst_groups && !netlink_capable(sock, NL_NONROOT_SEND))
+		dst_group = ffs(addr->nl_groups);
+		if (dst_group && !netlink_capable(sock, NL_NONROOT_SEND))
 			return -EPERM;
 	} else {
 		dst_pid = nlk->dst_pid;
-		dst_groups = nlk->dst_groups;
+		dst_group = nlk->dst_group;
 	}
 
 	if (!nlk->pid) {
@@ -914,9 +1094,8 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		goto out;
 
 	NETLINK_CB(skb).pid	= nlk->pid;
-	NETLINK_CB(skb).groups	= nlk->groups;
 	NETLINK_CB(skb).dst_pid = dst_pid;
-	NETLINK_CB(skb).dst_groups = dst_groups;
+	NETLINK_CB(skb).dst_group = dst_group;
 	NETLINK_CB(skb).loginuid = audit_get_loginuid(current->audit_context);
 	memcpy(NETLINK_CREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
 
@@ -938,9 +1117,9 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		goto out;
 	}
 
-	if (dst_groups) {
+	if (dst_group) {
 		atomic_inc(&skb->users);
-		netlink_broadcast(sk, skb, dst_pid, dst_groups, GFP_KERNEL);
+		netlink_broadcast(sk, skb, dst_pid, dst_group, GFP_KERNEL);
 	}
 	err = netlink_unicast(sk, skb, dst_pid, msg->msg_flags&MSG_DONTWAIT);
 
@@ -986,7 +1165,7 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
 		addr->nl_family = AF_NETLINK;
 		addr->nl_pad    = 0;
 		addr->nl_pid	= NETLINK_CB(skb).pid;
-		addr->nl_groups	= NETLINK_CB(skb).dst_groups;
+		addr->nl_groups	= netlink_group_mask(NETLINK_CB(skb).dst_group);
 		msg->msg_namelen = sizeof(*addr);
 	}
 
@@ -1001,6 +1180,8 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
 		netlink_dump(sk);
 
 	scm_recv(sock, msg, siocb->scm, flags);
+	if (nlk->flags & NETLINK_RECV_PKTINFO)
+		netlink_cmsg_recv_pktinfo(msg, skb);
 
 out:
 	netlink_rcv_wake(sk);
@@ -1023,10 +1204,13 @@ static void netlink_data_ready(struct sock *sk, int len)
  */
 
 struct sock *
-netlink_kernel_create(int unit, void (*input)(struct sock *sk, int len))
+netlink_kernel_create(int unit, unsigned int groups,
+                      void (*input)(struct sock *sk, int len),
+                      struct module *module)
 {
 	struct socket *sock;
 	struct sock *sk;
+	struct netlink_sock *nlk;
 
 	if (!nl_table)
 		return NULL;
@@ -1037,20 +1221,31 @@ netlink_kernel_create(int unit, void (*input)(struct sock *sk, int len))
 	if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock))
 		return NULL;
 
-	if (netlink_create(sock, unit) < 0) {
-		sock_release(sock);
-		return NULL;
-	}
+	if (__netlink_create(sock, unit) < 0)
+		goto out_sock_release;
+
 	sk = sock->sk;
 	sk->sk_data_ready = netlink_data_ready;
 	if (input)
 		nlk_sk(sk)->data_ready = input;
 
-	if (netlink_insert(sk, 0)) {
-		sock_release(sock);
-		return NULL;
-	}
+	if (netlink_insert(sk, 0))
+		goto out_sock_release;
+
+	nlk = nlk_sk(sk);
+	nlk->flags |= NETLINK_KERNEL_SOCKET;
+
+	netlink_table_grab();
+	nl_table[unit].groups = groups < 32 ? 32 : groups;
+	nl_table[unit].module = module;
+	nl_table[unit].registered = 1;
+	netlink_table_ungrab();
+
 	return sk;
+
+out_sock_release:
+	sock_release(sock);
+	return NULL;
 }
 
 void netlink_set_nonroot(int protocol, unsigned int flags)
@@ -1288,7 +1483,8 @@ static int netlink_seq_show(struct seq_file *seq, void *v)
 			   s,
 			   s->sk_protocol,
 			   nlk->pid,
-			   nlk->groups,
+			   nlk->flags & NETLINK_KERNEL_SOCKET ?
+				0 : (unsigned int)nlk->groups[0],
 			   atomic_read(&s->sk_rmem_alloc),
 			   atomic_read(&s->sk_wmem_alloc),
 			   nlk->cb,
@@ -1362,8 +1558,8 @@ static struct proto_ops netlink_ops = {
 	.ioctl =	sock_no_ioctl,
 	.listen =	sock_no_listen,
 	.shutdown =	sock_no_shutdown,
-	.setsockopt =	sock_no_setsockopt,
-	.getsockopt =	sock_no_getsockopt,
+	.setsockopt =	netlink_setsockopt,
+	.getsockopt =	netlink_getsockopt,
 	.sendmsg =	netlink_sendmsg,
 	.recvmsg =	netlink_recvmsg,
 	.mmap =		sock_no_mmap,
@@ -1438,21 +1634,7 @@ out:
 	return err;
 }
 
-static void __exit netlink_proto_exit(void)
-{
-	sock_unregister(PF_NETLINK);
-	proc_net_remove("netlink");
-	kfree(nl_table);
-	nl_table = NULL;
-	proto_unregister(&netlink_proto);
-}
-
 core_initcall(netlink_proto_init);
-module_exit(netlink_proto_exit);
-
-MODULE_LICENSE("GPL");
-
-MODULE_ALIAS_NETPROTO(PF_NETLINK);
 
 EXPORT_SYMBOL(netlink_ack);
 EXPORT_SYMBOL(netlink_broadcast);
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index 162a85fed15..4b53de98211 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -39,7 +39,7 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <net/ip.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
 #include <net/arp.h>
 #include <linux/init.h>
 
@@ -858,17 +858,16 @@ int nr_rx_frame(struct sk_buff *skb, struct net_device *dev)
 	frametype          = skb->data[19] & 0x0F;
 	flags              = skb->data[19] & 0xF0;
 
-#ifdef CONFIG_INET
 	/*
 	 * Check for an incoming IP over NET/ROM frame.
 	 */
-	if (frametype == NR_PROTOEXT && circuit_index == NR_PROTO_IP && circuit_id == NR_PROTO_IP) {
+	if (frametype == NR_PROTOEXT &&
+	    circuit_index == NR_PROTO_IP && circuit_id == NR_PROTO_IP) {
 		skb_pull(skb, NR_NETWORK_LEN + NR_TRANSPORT_LEN);
 		skb->h.raw = skb->data;
 
 		return nr_rx_ip(skb, dev);
 	}
-#endif
 
 	/*
 	 * Find an existing socket connection, based on circuit ID, if it's
diff --git a/net/netrom/nr_dev.c b/net/netrom/nr_dev.c
index 220bf7494f7..263da4c2649 100644
--- a/net/netrom/nr_dev.c
+++ b/net/netrom/nr_dev.c
@@ -38,8 +38,6 @@
 #include <net/ax25.h>
 #include <net/netrom.h>
 
-#ifdef CONFIG_INET
-
 /*
  *	Only allow IP over NET/ROM frames through if the netrom device is up.
  */
@@ -64,11 +62,12 @@ int nr_rx_ip(struct sk_buff *skb, struct net_device *dev)
 	skb->nh.raw   = skb->data;
 	skb->pkt_type = PACKET_HOST;
 
-	ip_rcv(skb, skb->dev, NULL);
+	netif_rx(skb);
 
 	return 1;
 }
 
+#ifdef CONFIG_INET
 
 static int nr_rebuild_header(struct sk_buff *skb)
 {
diff --git a/net/netrom/nr_in.c b/net/netrom/nr_in.c
index 9c44b379412..64b81a79690 100644
--- a/net/netrom/nr_in.c
+++ b/net/netrom/nr_in.c
@@ -22,8 +22,7 @@
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>
 #include <net/sock.h>
-#include <net/tcp.h>
-#include <net/ip.h>			/* For ip_rcv */
+#include <net/tcp_states.h>
 #include <asm/uaccess.h>
 #include <asm/system.h>
 #include <linux/fcntl.h>
diff --git a/net/netrom/nr_subr.c b/net/netrom/nr_subr.c
index 0627347b14b..587bed2674b 100644
--- a/net/netrom/nr_subr.c
+++ b/net/netrom/nr_subr.c
@@ -21,7 +21,7 @@
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>
 #include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
 #include <asm/uaccess.h>
 #include <asm/system.h>
 #include <linux/fcntl.h>
@@ -77,7 +77,7 @@ void nr_requeue_frames(struct sock *sk)
 		if (skb_prev == NULL)
 			skb_queue_head(&sk->sk_write_queue, skb);
 		else
-			skb_append(skb_prev, skb);
+			skb_append(skb_prev, skb, &sk->sk_write_queue);
 		skb_prev = skb;
 	}
 }
diff --git a/net/netrom/nr_timer.c b/net/netrom/nr_timer.c
index faabda8088b..75b72d389ba 100644
--- a/net/netrom/nr_timer.c
+++ b/net/netrom/nr_timer.c
@@ -22,7 +22,7 @@
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>
 #include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
 #include <asm/uaccess.h>
 #include <asm/system.h>
 #include <linux/fcntl.h>
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index c9d5980aa4d..ba997095f08 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -241,7 +241,7 @@ static struct proto_ops packet_ops;
 #ifdef CONFIG_SOCK_PACKET
 static struct proto_ops packet_ops_spkt;
 
-static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt)
+static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt, struct net_device *orig_dev)
 {
 	struct sock *sk;
 	struct sockaddr_pkt *spkt;
@@ -441,7 +441,7 @@ static inline unsigned run_filter(struct sk_buff *skb, struct sock *sk, unsigned
    we will not harm anyone.
  */
 
-static int packet_rcv(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt)
+static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
 {
 	struct sock *sk;
 	struct sockaddr_ll *sll;
@@ -546,7 +546,7 @@ drop:
 }
 
 #ifdef CONFIG_PACKET_MMAP
-static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt)
+static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
 {
 	struct sock *sk;
 	struct packet_sock *po;
@@ -635,12 +635,12 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,  struct pack
 	h->tp_snaplen = snaplen;
 	h->tp_mac = macoff;
 	h->tp_net = netoff;
-	if (skb->stamp.tv_sec == 0) { 
-		do_gettimeofday(&skb->stamp);
+	if (skb->tstamp.off_sec == 0) { 
+		__net_timestamp(skb);
 		sock_enable_timestamp(sk);
 	}
-	h->tp_sec = skb->stamp.tv_sec;
-	h->tp_usec = skb->stamp.tv_usec;
+	h->tp_sec = skb_tv_base.tv_sec + skb->tstamp.off_sec;
+	h->tp_usec = skb_tv_base.tv_usec + skb->tstamp.off_usec;
 
 	sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h)));
 	sll->sll_halen = 0;
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index 5480caf8ccc..c6e59f84c3a 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -41,7 +41,7 @@
 #include <net/rose.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
 #include <net/ip.h>
 #include <net/arp.h>
 
diff --git a/net/rose/rose_in.c b/net/rose/rose_in.c
index ef475a1bb1b..8348d33f1ef 100644
--- a/net/rose/rose_in.c
+++ b/net/rose/rose_in.c
@@ -26,8 +26,7 @@
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>
 #include <net/sock.h>
-#include <net/ip.h>			/* For ip_rcv */
-#include <net/tcp.h>
+#include <net/tcp_states.h>
 #include <asm/system.h>
 #include <linux/fcntl.h>
 #include <linux/mm.h>
diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c
index 25da6f699fd..4510cd7613e 100644
--- a/net/rose/rose_route.c
+++ b/net/rose/rose_route.c
@@ -24,7 +24,7 @@
 #include <linux/if_arp.h>
 #include <linux/skbuff.h>
 #include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
 #include <asm/system.h>
 #include <asm/uaccess.h>
 #include <linux/fcntl.h>
diff --git a/net/rose/rose_subr.c b/net/rose/rose_subr.c
index 7db7e1cedc3..a29a3a960fd 100644
--- a/net/rose/rose_subr.c
+++ b/net/rose/rose_subr.c
@@ -21,7 +21,7 @@
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>
 #include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
 #include <asm/system.h>
 #include <linux/fcntl.h>
 #include <linux/mm.h>
@@ -74,7 +74,7 @@ void rose_requeue_frames(struct sock *sk)
 		if (skb_prev == NULL)
 			skb_queue_head(&sk->sk_write_queue, skb);
 		else
-			skb_append(skb_prev, skb);
+			skb_append(skb_prev, skb, &sk->sk_write_queue);
 		skb_prev = skb;
 	}
 }
diff --git a/net/rose/rose_timer.c b/net/rose/rose_timer.c
index 84dd4403f79..50ae0371dab 100644
--- a/net/rose/rose_timer.c
+++ b/net/rose/rose_timer.c
@@ -22,7 +22,7 @@
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>
 #include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
 #include <asm/system.h>
 #include <linux/fcntl.h>
 #include <linux/mm.h>
diff --git a/net/rxrpc/transport.c b/net/rxrpc/transport.c
index 9bce7794130..122c086ee2d 100644
--- a/net/rxrpc/transport.c
+++ b/net/rxrpc/transport.c
@@ -330,7 +330,7 @@ static int rxrpc_incoming_msg(struct rxrpc_transport *trans,
 
 	msg->trans = trans;
 	msg->state = RXRPC_MSG_RECEIVED;
-	msg->stamp = pkt->stamp;
+	skb_get_timestamp(pkt, &msg->stamp);
 	if (msg->stamp.tv_sec == 0) {
 		do_gettimeofday(&msg->stamp); 
 		if (pkt->sk) 
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 59d3e71f8b8..45d3bc0812c 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -491,6 +491,7 @@ config NET_EMATCH_TEXT
 	depends on NET_EMATCH
 	select TEXTSEARCH
 	select TEXTSEARCH_KMP
+	select TEXTSEARCH_BM
 	select TEXTSEARCH_FSM
 	---help---
 	  Say Y here if you want to be ablt to classify packets based on
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 249c61936ea..8aebe8f6d27 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -165,7 +165,7 @@ int tcf_action_exec(struct sk_buff *skb, struct tc_action *act,
 	while ((a = act) != NULL) {
 repeat:
 		if (a->ops && a->ops->act) {
-			ret = a->ops->act(&skb, a);
+			ret = a->ops->act(&skb, a, res);
 			if (TC_MUNGED & skb->tc_verd) {
 				/* copied already, allow trampling */
 				skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
@@ -179,11 +179,6 @@ repeat:
 		act = a->next;
 	}
 exec_done:
-	if (skb->tc_classid > 0) {
-		res->classid = skb->tc_classid;
-		res->class = 0;
-		skb->tc_classid = 0;
-	}
 	return ret;
 }
 
@@ -598,7 +593,7 @@ static int tca_action_flush(struct rtattr *rta, struct nlmsghdr *n, u32 pid)
 	nlh->nlmsg_flags |= NLM_F_ROOT;
 	module_put(a->ops->owner);
 	kfree(a);
-	err = rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
+	err = rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
 	if (err > 0)
 		return 0;
 
@@ -661,7 +656,7 @@ tca_action_gd(struct rtattr *rta, struct nlmsghdr *n, u32 pid, int event)
 
 		/* now do the delete */
 		tcf_action_destroy(head, 0);
-		ret = rtnetlink_send(skb, pid, RTMGRP_TC,
+		ret = rtnetlink_send(skb, pid, RTNLGRP_TC,
 		                     n->nlmsg_flags&NLM_F_ECHO);
 		if (ret > 0)
 			return 0;
@@ -703,9 +698,9 @@ static int tcf_add_notify(struct tc_action *a, u32 pid, u32 seq, int event,
 	x->rta_len = skb->tail - (u8*)x;
 	
 	nlh->nlmsg_len = skb->tail - b;
-	NETLINK_CB(skb).dst_groups = RTMGRP_TC;
+	NETLINK_CB(skb).dst_group = RTNLGRP_TC;
 	
-	err = rtnetlink_send(skb, pid, RTMGRP_TC, flags&NLM_F_ECHO);
+	err = rtnetlink_send(skb, pid, RTNLGRP_TC, flags&NLM_F_ECHO);
 	if (err > 0)
 		err = 0;
 	return err;
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 3b5714ef4d1..b4d89fbb378 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -367,7 +367,7 @@ static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n,
 		return -EINVAL;
 	}
 
-	return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
+	return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
 }
 
 struct tcf_dump_args
diff --git a/net/sched/gact.c b/net/sched/gact.c
index a811c89fef7..d1c6d542912 100644
--- a/net/sched/gact.c
+++ b/net/sched/gact.c
@@ -135,7 +135,7 @@ tcf_gact_cleanup(struct tc_action *a, int bind)
 }
 
 static int
-tcf_gact(struct sk_buff **pskb, struct tc_action *a)
+tcf_gact(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res)
 {
 	struct tcf_gact *p = PRIV(a, gact);
 	struct sk_buff *skb = *pskb;
diff --git a/net/sched/ipt.c b/net/sched/ipt.c
index b114d994d52..f50136eed21 100644
--- a/net/sched/ipt.c
+++ b/net/sched/ipt.c
@@ -201,7 +201,7 @@ tcf_ipt_cleanup(struct tc_action *a, int bind)
 }
 
 static int
-tcf_ipt(struct sk_buff **pskb, struct tc_action *a)
+tcf_ipt(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res)
 {
 	int ret = 0, result = 0;
 	struct tcf_ipt *p = PRIV(a, ipt);
diff --git a/net/sched/mirred.c b/net/sched/mirred.c
index f309ce33680..20d06916dc0 100644
--- a/net/sched/mirred.c
+++ b/net/sched/mirred.c
@@ -158,7 +158,7 @@ tcf_mirred_cleanup(struct tc_action *a, int bind)
 }
 
 static int
-tcf_mirred(struct sk_buff **pskb, struct tc_action *a)
+tcf_mirred(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res)
 {
 	struct tcf_mirred *p = PRIV(a, mirred);
 	struct net_device *dev;
diff --git a/net/sched/pedit.c b/net/sched/pedit.c
index 678be6a645f..767d24f4610 100644
--- a/net/sched/pedit.c
+++ b/net/sched/pedit.c
@@ -130,7 +130,7 @@ tcf_pedit_cleanup(struct tc_action *a, int bind)
 }
 
 static int
-tcf_pedit(struct sk_buff **pskb, struct tc_action *a)
+tcf_pedit(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res)
 {
 	struct tcf_pedit *p = PRIV(a, pedit);
 	struct sk_buff *skb = *pskb;
diff --git a/net/sched/police.c b/net/sched/police.c
index c03545faf52..eb39fb2f39b 100644
--- a/net/sched/police.c
+++ b/net/sched/police.c
@@ -284,7 +284,8 @@ static int tcf_act_police_cleanup(struct tc_action *a, int bind)
 	return 0;
 }
 
-static int tcf_act_police(struct sk_buff **pskb, struct tc_action *a)
+static int tcf_act_police(struct sk_buff **pskb, struct tc_action *a,
+                          struct tcf_result *res)
 {
 	psched_time_t now;
 	struct sk_buff *skb = *pskb;
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index b9a069af4a0..737681cb9a9 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -816,7 +816,7 @@ static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
 	}
 
 	if (skb->len)
-		return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
+		return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
 
 err_out:
 	kfree_skb(skb);
@@ -1040,7 +1040,7 @@ static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
 		return -EINVAL;
 	}
 
-	return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
+	return rtnetlink_send(skb, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
 }
 
 struct qdisc_dump_args
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 0d066c96534..99ceb91f015 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -238,6 +238,20 @@ static void dev_watchdog_down(struct net_device *dev)
 	spin_unlock_bh(&dev->xmit_lock);
 }
 
+void netif_carrier_on(struct net_device *dev)
+{
+	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
+		linkwatch_fire_event(dev);
+	if (netif_running(dev))
+		__netdev_watchdog_up(dev);
+}
+
+void netif_carrier_off(struct net_device *dev)
+{
+	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
+		linkwatch_fire_event(dev);
+}
+
 /* "NOOP" scheduler: the best scheduler, recommended for all interfaces
    under all circumstances. It is difficult to invent anything faster or
    cheaper.
@@ -600,6 +614,8 @@ void dev_shutdown(struct net_device *dev)
 }
 
 EXPORT_SYMBOL(__netdev_watchdog_up);
+EXPORT_SYMBOL(netif_carrier_on);
+EXPORT_SYMBOL(netif_carrier_off);
 EXPORT_SYMBOL(noop_qdisc);
 EXPORT_SYMBOL(noop_qdisc_ops);
 EXPORT_SYMBOL(qdisc_create_dflt);
diff --git a/net/sched/simple.c b/net/sched/simple.c
index 3ab4c675ab5..8a6ae4f491e 100644
--- a/net/sched/simple.c
+++ b/net/sched/simple.c
@@ -44,7 +44,7 @@ static DEFINE_RWLOCK(simp_lock);
 #include <net/pkt_act.h>
 #include <net/act_generic.h>
 
-static int tcf_simp(struct sk_buff **pskb, struct tc_action *a)
+static int tcf_simp(struct sk_buff **pskb, struct tc_action *a, struct tcf_result *res)
 {
 	struct sk_buff *skb = *pskb;
 	struct tcf_defact *p = PRIV(a, defact);
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 742be9171b7..28f32243397 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -236,8 +236,8 @@ int sctp_rcv(struct sk_buff *skb)
 	}
 
 	/* SCTP seems to always need a timestamp right now (FIXME) */
-	if (skb->stamp.tv_sec == 0) {
-		do_gettimeofday(&skb->stamp);
+	if (skb->tstamp.off_sec == 0) {
+		__net_timestamp(skb);
 		sock_enable_timestamp(sk); 
 	}
 
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index e9b2fd480d6..fa3be2b8fb5 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -66,8 +66,8 @@
 #include <linux/seq_file.h>
 
 #include <net/protocol.h>
-#include <net/tcp.h>
 #include <net/ndisc.h>
+#include <net/ip.h>
 #include <net/ipv6.h>
 #include <net/transp_v6.h>
 #include <net/addrconf.h>
@@ -641,10 +641,7 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk,
 	else
 		newinet->pmtudisc = IP_PMTUDISC_WANT;
 
-#ifdef INET_REFCNT_DEBUG
-	atomic_inc(&inet6_sock_nr);
-	atomic_inc(&inet_sock_nr);
-#endif
+	sk_refcnt_debug_inc(newsk);
 
 	if (newsk->sk_prot->init(newsk)) {
 		sk_common_release(newsk);
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index ce9245e71fc..e7025be7769 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -62,7 +62,7 @@
 /* Global data structures. */
 struct sctp_globals sctp_globals;
 struct proc_dir_entry	*proc_net_sctp;
-DEFINE_SNMP_STAT(struct sctp_mib, sctp_statistics);
+DEFINE_SNMP_STAT(struct sctp_mib, sctp_statistics) __read_mostly;
 
 struct idr sctp_assocs_id;
 DEFINE_SPINLOCK(sctp_assocs_id_lock);
@@ -78,8 +78,8 @@ static struct sctp_pf *sctp_pf_inet_specific;
 static struct sctp_af *sctp_af_v4_specific;
 static struct sctp_af *sctp_af_v6_specific;
 
-kmem_cache_t *sctp_chunk_cachep;
-kmem_cache_t *sctp_bucket_cachep;
+kmem_cache_t *sctp_chunk_cachep __read_mostly;
+kmem_cache_t *sctp_bucket_cachep __read_mostly;
 
 extern int sctp_snmp_proc_init(void);
 extern int sctp_snmp_proc_exit(void);
@@ -593,9 +593,7 @@ static struct sock *sctp_v4_create_accept_sk(struct sock *sk,
 	newinet->mc_index = 0;
 	newinet->mc_list = NULL;
 
-#ifdef INET_REFCNT_DEBUG
-	atomic_inc(&inet_sock_nr);
-#endif
+	sk_refcnt_debug_inc(newsk);
 
 	if (newsk->sk_prot->init(newsk)) {
 		sk_common_release(newsk);
@@ -1244,6 +1242,10 @@ SCTP_STATIC __exit void sctp_exit(void)
 module_init(sctp_init);
 module_exit(sctp_exit);
 
+/*
+ * __stringify doesn't likes enums, so use IPPROTO_SCTP value (132) directly.
+ */
+MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-132");
 MODULE_AUTHOR("Linux Kernel SCTP developers <lksctp-developers@lists.sourceforge.net>");
 MODULE_DESCRIPTION("Support for the SCTP protocol (RFC2960)");
 MODULE_LICENSE("GPL");
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 00d32b7c826..3868a8d70cc 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -1362,6 +1362,7 @@ struct sctp_association *sctp_unpack_cookie(
 	char *key;
 	sctp_scope_t scope;
 	struct sk_buff *skb = chunk->skb;
+	struct timeval tv;
 
 	headersize = sizeof(sctp_chunkhdr_t) + SCTP_SECRET_SIZE;
 	bodysize = ntohs(chunk->chunk_hdr->length) - headersize;
@@ -1434,7 +1435,8 @@ no_hmac:
 	 * an association, there is no need to check cookie's expiration
 	 * for init collision case of lost COOKIE ACK.
 	 */
-	if (!asoc && tv_lt(bear_cookie->expiration, skb->stamp)) {
+	skb_get_timestamp(skb, &tv);
+	if (!asoc && tv_lt(bear_cookie->expiration, tv)) {
 		__u16 len;
 		/*
 		 * Section 3.3.10.3 Stale Cookie Error (3)
@@ -1447,10 +1449,9 @@ no_hmac:
 		len = ntohs(chunk->chunk_hdr->length);
 		*errp = sctp_make_op_error_space(asoc, chunk, len);
 		if (*errp) {
-			suseconds_t usecs = (skb->stamp.tv_sec -
+			suseconds_t usecs = (tv.tv_sec -
 				bear_cookie->expiration.tv_sec) * 1000000L +
-				skb->stamp.tv_usec -
-				bear_cookie->expiration.tv_usec;
+				tv.tv_usec - bear_cookie->expiration.tv_usec;
 
 			usecs = htonl(usecs);
 			sctp_init_cause(*errp, SCTP_ERROR_STALE_COOKIE,
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 091a66f06a3..4454afe4727 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4892,7 +4892,7 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
 	sctp_skb_for_each(skb, &oldsk->sk_receive_queue, tmp) {
 		event = sctp_skb2event(skb);
 		if (event->asoc == assoc) {
-			__skb_unlink(skb, skb->list);
+			__skb_unlink(skb, &oldsk->sk_receive_queue);
 			__skb_queue_tail(&newsk->sk_receive_queue, skb);
 		}
 	}
@@ -4921,7 +4921,7 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
 		sctp_skb_for_each(skb, &oldsp->pd_lobby, tmp) {
 			event = sctp_skb2event(skb);
 			if (event->asoc == assoc) {
-				__skb_unlink(skb, skb->list);
+				__skb_unlink(skb, &oldsp->pd_lobby);
 				__skb_queue_tail(queue, skb);
 			}
 		}
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index 8bbc279d6c9..ec2c857eae7 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -50,9 +50,9 @@
 
 /* Forward declarations for internal helpers.  */
 static struct sctp_ulpevent * sctp_ulpq_reasm(struct sctp_ulpq *ulpq,
-						struct sctp_ulpevent *);
+					      struct sctp_ulpevent *);
 static struct sctp_ulpevent * sctp_ulpq_order(struct sctp_ulpq *,
-						struct sctp_ulpevent *);
+					      struct sctp_ulpevent *);
 
 /* 1st Level Abstractions */
 
@@ -125,7 +125,9 @@ int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
 		event = sctp_ulpq_order(ulpq, event);
 	}
 
-	/* Send event to the ULP.  */
+	/* Send event to the ULP.  'event' is the sctp_ulpevent for
+	 * very first SKB on the 'temp' list.
+	 */
 	if (event)
 		sctp_ulpq_tail_event(ulpq, event);
 
@@ -158,14 +160,18 @@ static int sctp_ulpq_clear_pd(struct sctp_ulpq *ulpq)
 	return sctp_clear_pd(ulpq->asoc->base.sk);
 }
 
-
-
+/* If the SKB of 'event' is on a list, it is the first such member
+ * of that list.
+ */
 int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
 {
 	struct sock *sk = ulpq->asoc->base.sk;
-	struct sk_buff_head *queue;
+	struct sk_buff_head *queue, *skb_list;
+	struct sk_buff *skb = sctp_event2skb(event);
 	int clear_pd = 0;
 
+	skb_list = (struct sk_buff_head *) skb->prev;
+
 	/* If the socket is just going to throw this away, do not
 	 * even try to deliver it.
 	 */
@@ -197,10 +203,10 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
 	/* If we are harvesting multiple skbs they will be
 	 * collected on a list.
 	 */
-	if (sctp_event2skb(event)->list)
-		sctp_skb_list_tail(sctp_event2skb(event)->list, queue);
+	if (skb_list)
+		sctp_skb_list_tail(skb_list, queue);
 	else
-		__skb_queue_tail(queue, sctp_event2skb(event));
+		__skb_queue_tail(queue, skb);
 
 	/* Did we just complete partial delivery and need to get
 	 * rolling again?  Move pending data to the receive
@@ -214,10 +220,11 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
 	return 1;
 
 out_free:
-	if (sctp_event2skb(event)->list)
-		sctp_queue_purge_ulpevents(sctp_event2skb(event)->list);
+	if (skb_list)
+		sctp_queue_purge_ulpevents(skb_list);
 	else
 		sctp_ulpevent_free(event);
+
 	return 0;
 }
 
@@ -269,7 +276,7 @@ static inline void sctp_ulpq_store_reasm(struct sctp_ulpq *ulpq,
  * payload was fragmented on the way and ip had to reassemble them.
  * We add the rest of skb's to the first skb's fraglist.
  */
-static struct sctp_ulpevent *sctp_make_reassembled_event(struct sk_buff *f_frag, struct sk_buff *l_frag)
+static struct sctp_ulpevent *sctp_make_reassembled_event(struct sk_buff_head *queue, struct sk_buff *f_frag, struct sk_buff *l_frag)
 {
 	struct sk_buff *pos;
 	struct sctp_ulpevent *event;
@@ -294,7 +301,7 @@ static struct sctp_ulpevent *sctp_make_reassembled_event(struct sk_buff *f_frag,
 		skb_shinfo(f_frag)->frag_list = pos;
 
 	/* Remove the first fragment from the reassembly queue.  */
-	__skb_unlink(f_frag, f_frag->list);
+	__skb_unlink(f_frag, queue);
 	while (pos) {
 
 		pnext = pos->next;
@@ -304,7 +311,7 @@ static struct sctp_ulpevent *sctp_make_reassembled_event(struct sk_buff *f_frag,
 		f_frag->data_len += pos->len;
 
 		/* Remove the fragment from the reassembly queue.  */
-		__skb_unlink(pos, pos->list);
+		__skb_unlink(pos, queue);
 	
 		/* Break if we have reached the last fragment.  */
 		if (pos == l_frag)
@@ -375,7 +382,7 @@ static inline struct sctp_ulpevent *sctp_ulpq_retrieve_reassembled(struct sctp_u
 done:
 	return retval;
 found:
-	retval = sctp_make_reassembled_event(first_frag, pos);
+	retval = sctp_make_reassembled_event(&ulpq->reasm, first_frag, pos);
 	if (retval)
 		retval->msg_flags |= MSG_EOR;
 	goto done;
@@ -435,7 +442,7 @@ static inline struct sctp_ulpevent *sctp_ulpq_retrieve_partial(struct sctp_ulpq
 	 * further.
 	 */
 done:
-	retval = sctp_make_reassembled_event(first_frag, last_frag);
+	retval = sctp_make_reassembled_event(&ulpq->reasm, first_frag, last_frag);
 	if (retval && is_last)
 		retval->msg_flags |= MSG_EOR;
 
@@ -527,7 +534,7 @@ static inline struct sctp_ulpevent *sctp_ulpq_retrieve_first(struct sctp_ulpq *u
 	 * further.
 	 */
 done:
-	retval = sctp_make_reassembled_event(first_frag, last_frag);
+	retval = sctp_make_reassembled_event(&ulpq->reasm, first_frag, last_frag);
 	return retval;
 }
 
@@ -537,6 +544,7 @@ done:
 static inline void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq,
 					      struct sctp_ulpevent *event)
 {
+	struct sk_buff_head *event_list;
 	struct sk_buff *pos, *tmp;
 	struct sctp_ulpevent *cevent;
 	struct sctp_stream *in;
@@ -547,6 +555,8 @@ static inline void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq,
 	ssn = event->ssn;
 	in  = &ulpq->asoc->ssnmap->in;
 
+	event_list = (struct sk_buff_head *) sctp_event2skb(event)->prev;
+
 	/* We are holding the chunks by stream, by SSN.  */
 	sctp_skb_for_each(pos, &ulpq->lobby, tmp) {
 		cevent = (struct sctp_ulpevent *) pos->cb;
@@ -567,10 +577,10 @@ static inline void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq,
 		/* Found it, so mark in the ssnmap. */
 		sctp_ssn_next(in, sid);
 
-		__skb_unlink(pos, pos->list);
+		__skb_unlink(pos, &ulpq->lobby);
 
 		/* Attach all gathered skbs to the event.  */
-		__skb_queue_tail(sctp_event2skb(event)->list, pos);
+		__skb_queue_tail(event_list, pos);
 	}
 }
 
@@ -626,7 +636,7 @@ static inline void sctp_ulpq_store_ordered(struct sctp_ulpq *ulpq,
 }
 
 static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq,
-						struct sctp_ulpevent *event)
+					     struct sctp_ulpevent *event)
 {
 	__u16 sid, ssn;
 	struct sctp_stream *in;
@@ -667,7 +677,7 @@ static inline void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq)
 {
 	struct sk_buff *pos, *tmp;
 	struct sctp_ulpevent *cevent;
-	struct sctp_ulpevent *event = NULL;
+	struct sctp_ulpevent *event;
 	struct sctp_stream *in;
 	struct sk_buff_head temp;
 	__u16 csid, cssn;
@@ -675,6 +685,8 @@ static inline void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq)
 	in  = &ulpq->asoc->ssnmap->in;
 
 	/* We are holding the chunks by stream, by SSN.  */
+	skb_queue_head_init(&temp);
+	event = NULL;
 	sctp_skb_for_each(pos, &ulpq->lobby, tmp) {
 		cevent = (struct sctp_ulpevent *) pos->cb;
 		csid = cevent->stream;
@@ -686,19 +698,20 @@ static inline void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq)
 		/* Found it, so mark in the ssnmap. */	       
 		sctp_ssn_next(in, csid);
 
-		__skb_unlink(pos, pos->list);
+		__skb_unlink(pos, &ulpq->lobby);
 		if (!event) {						
 			/* Create a temporary list to collect chunks on.  */
 			event = sctp_skb2event(pos);
-			skb_queue_head_init(&temp);
 			__skb_queue_tail(&temp, sctp_event2skb(event));
 		} else {
 			/* Attach all gathered skbs to the event.  */
-			__skb_queue_tail(sctp_event2skb(event)->list, pos);
+			__skb_queue_tail(&temp, pos);
 		}
 	}
 
-	/* Send event to the ULP.  */
+	/* Send event to the ULP.  'event' is the sctp_ulpevent for
+	 * very first SKB on the 'temp' list.
+	 */
 	if (event)
 		sctp_ulpq_tail_event(ulpq, event);
 }
diff --git a/net/socket.c b/net/socket.c
index 6f2a1788197..94fe638b4d7 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -70,6 +70,8 @@
 #include <linux/seq_file.h>
 #include <linux/wanrouter.h>
 #include <linux/if_bridge.h>
+#include <linux/if_frad.h>
+#include <linux/if_vlan.h>
 #include <linux/init.h>
 #include <linux/poll.h>
 #include <linux/cache.h>
@@ -272,7 +274,7 @@ int move_addr_to_user(void *kaddr, int klen, void __user *uaddr, int __user *ule
 
 #define SOCKFS_MAGIC 0x534F434B
 
-static kmem_cache_t * sock_inode_cachep;
+static kmem_cache_t * sock_inode_cachep __read_mostly;
 
 static struct inode *sock_alloc_inode(struct super_block *sb)
 {
@@ -331,7 +333,7 @@ static struct super_block *sockfs_get_sb(struct file_system_type *fs_type,
 	return get_sb_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC);
 }
 
-static struct vfsmount *sock_mnt;
+static struct vfsmount *sock_mnt __read_mostly;
 
 static struct file_system_type sock_fs_type = {
 	.name =		"sockfs",
@@ -404,6 +406,7 @@ int sock_map_fd(struct socket *sock)
 		file->f_mode = FMODE_READ | FMODE_WRITE;
 		file->f_flags = O_RDWR;
 		file->f_pos = 0;
+		file->private_data = sock;
 		fd_install(fd, file);
 	}
 
@@ -436,6 +439,9 @@ struct socket *sockfd_lookup(int fd, int *err)
 		return NULL;
 	}
 
+	if (file->f_op == &socket_file_ops)
+		return file->private_data;	/* set in sock_map_fd */
+
 	inode = file->f_dentry->d_inode;
 	if (!S_ISSOCK(inode->i_mode)) {
 		*err = -ENOTSOCK;
@@ -720,8 +726,8 @@ static ssize_t sock_aio_write(struct kiocb *iocb, const char __user *ubuf,
 	return __sock_sendmsg(iocb, sock, &x->async_msg, size);
 }
 
-ssize_t sock_sendpage(struct file *file, struct page *page,
-		      int offset, size_t size, loff_t *ppos, int more)
+static ssize_t sock_sendpage(struct file *file, struct page *page,
+			     int offset, size_t size, loff_t *ppos, int more)
 {
 	struct socket *sock;
 	int flags;
@@ -944,7 +950,7 @@ static int sock_mmap(struct file * file, struct vm_area_struct * vma)
 	return sock->ops->mmap(file, sock, vma);
 }
 
-int sock_close(struct inode *inode, struct file *filp)
+static int sock_close(struct inode *inode, struct file *filp)
 {
 	/*
 	 *	It was possible the inode is NULL we were 
@@ -2023,9 +2029,6 @@ int sock_unregister(int family)
 	return 0;
 }
 
-
-extern void sk_init(void);
-
 void __init sock_init(void)
 {
 	/*
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 554f224c044..fe1a73ce6cf 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -28,13 +28,13 @@
 #include <linux/workqueue.h>
 #include <linux/sunrpc/rpc_pipe_fs.h>
 
-static struct vfsmount *rpc_mount;
+static struct vfsmount *rpc_mount __read_mostly;
 static int rpc_mount_count;
 
 static struct file_system_type rpc_pipe_fs_type;
 
 
-static kmem_cache_t *rpc_inode_cachep;
+static kmem_cache_t *rpc_inode_cachep __read_mostly;
 
 #define RPC_UPCALL_TIMEOUT (30*HZ)
 
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 2d9eb7fbd52..f3104035e35 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -34,10 +34,10 @@ static int			rpc_task_id;
 #define RPC_BUFFER_MAXSIZE	(2048)
 #define RPC_BUFFER_POOLSIZE	(8)
 #define RPC_TASK_POOLSIZE	(8)
-static kmem_cache_t	*rpc_task_slabp;
-static kmem_cache_t	*rpc_buffer_slabp;
-static mempool_t	*rpc_task_mempool;
-static mempool_t	*rpc_buffer_mempool;
+static kmem_cache_t	*rpc_task_slabp __read_mostly;
+static kmem_cache_t	*rpc_buffer_slabp __read_mostly;
+static mempool_t	*rpc_task_mempool __read_mostly;
+static mempool_t	*rpc_buffer_mempool __read_mostly;
 
 static void			__rpc_default_timer(struct rpc_task *task);
 static void			rpciod_killall(void);
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index d0c3120d023..05fe2e73553 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -34,7 +34,7 @@
 #include <net/sock.h>
 #include <net/checksum.h>
 #include <net/ip.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
 
@@ -584,13 +584,16 @@ svc_udp_recvfrom(struct svc_rqst *rqstp)
 		/* possibly an icmp error */
 		dprintk("svc: recvfrom returned error %d\n", -err);
 	}
-	if (skb->stamp.tv_sec == 0) {
-		skb->stamp.tv_sec = xtime.tv_sec; 
-		skb->stamp.tv_usec = xtime.tv_nsec / NSEC_PER_USEC; 
+	if (skb->tstamp.off_sec == 0) {
+		struct timeval tv;
+
+		tv.tv_sec = xtime.tv_sec;
+		tv.tv_usec = xtime.tv_nsec * 1000;
+		skb_set_timestamp(skb, &tv);
 		/* Don't enable netstamp, sunrpc doesn't 
 		   need that much accuracy */
 	}
-	svsk->sk_sk->sk_stamp = skb->stamp;
+	skb_get_timestamp(skb, &svsk->sk_sk->sk_stamp);
 	set_bit(SK_DATA, &svsk->sk_flags); /* there may be more data... */
 
 	/*
diff --git a/net/sysctl_net.c b/net/sysctl_net.c
index 3f6e31069c5..c5241fcbb96 100644
--- a/net/sysctl_net.c
+++ b/net/sysctl_net.c
@@ -17,17 +17,15 @@
 #include <linux/sysctl.h>
 
 #ifdef CONFIG_INET
-extern struct ctl_table ipv4_table[];
+#include <net/ip.h>
 #endif
 
-extern struct ctl_table core_table[];
-
 #ifdef CONFIG_NET
-extern struct ctl_table ether_table[];
+#include <linux/if_ether.h>
 #endif
 
 #ifdef CONFIG_TR
-extern struct ctl_table tr_table[];
+#include <linux/if_tr.h>
 #endif
 
 struct ctl_table net_table[] = {
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index d403e34088a..41feca3bef8 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -105,7 +105,7 @@
 #include <linux/skbuff.h>
 #include <linux/netdevice.h>
 #include <net/sock.h>
-#include <linux/tcp.h>
+#include <net/tcp_states.h>
 #include <net/af_unix.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
@@ -2026,14 +2026,6 @@ static struct net_proto_family unix_family_ops = {
 	.owner	= THIS_MODULE,
 };
 
-#ifdef CONFIG_SYSCTL
-extern void unix_sysctl_register(void);
-extern void unix_sysctl_unregister(void);
-#else
-static inline void unix_sysctl_register(void) {}
-static inline void unix_sysctl_unregister(void) {}
-#endif
-
 static int __init af_unix_init(void)
 {
 	int rc = -1;
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 4bd95c8f593..6ffc64e1712 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -76,11 +76,11 @@
 #include <linux/netdevice.h>
 #include <linux/file.h>
 #include <linux/proc_fs.h>
-#include <linux/tcp.h>
 
 #include <net/sock.h>
 #include <net/af_unix.h>
 #include <net/scm.h>
+#include <net/tcp_states.h>
 
 /* Internal data structures and random procedures: */
 
@@ -286,16 +286,16 @@ void unix_gc(void)
 			skb = skb_peek(&s->sk_receive_queue);
 			while (skb &&
 			       skb != (struct sk_buff *)&s->sk_receive_queue) {
-				nextsk=skb->next;
+				nextsk = skb->next;
 				/*
 				 *	Do we have file descriptors ?
 				 */
-				if(UNIXCB(skb).fp)
-				{
-					__skb_unlink(skb, skb->list);
-					__skb_queue_tail(&hitlist,skb);
+				if (UNIXCB(skb).fp) {
+					__skb_unlink(skb,
+						     &s->sk_receive_queue);
+					__skb_queue_tail(&hitlist, skb);
 				}
-				skb=nextsk;
+				skb = nextsk;
 			}
 			spin_unlock(&s->sk_receive_queue.lock);
 		}
diff --git a/net/unix/sysctl_net_unix.c b/net/unix/sysctl_net_unix.c
index c974dac4580..690ffa5d5bf 100644
--- a/net/unix/sysctl_net_unix.c
+++ b/net/unix/sysctl_net_unix.c
@@ -12,7 +12,7 @@
 #include <linux/mm.h>
 #include <linux/sysctl.h>
 
-extern int sysctl_unix_max_dgram_qlen;
+#include <net/af_unix.h>
 
 static ctl_table unix_table[] = {
 	{
diff --git a/net/wanrouter/af_wanpipe.c b/net/wanrouter/af_wanpipe.c
index d93b19faaab..596cb96e5f4 100644
--- a/net/wanrouter/af_wanpipe.c
+++ b/net/wanrouter/af_wanpipe.c
@@ -57,7 +57,7 @@
 #include <linux/wanpipe.h>
 #include <linux/if_wanpipe.h>
 #include <linux/pkt_sched.h>
-#include <linux/tcp.h>
+#include <linux/tcp_states.h>
 #include <linux/if_wanpipe_common.h>
 #include <linux/sdla_x25.h>
 
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 04bec047fa9..020d73cc841 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -47,7 +47,7 @@
 #include <linux/if_arp.h>
 #include <linux/skbuff.h>
 #include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
 #include <asm/uaccess.h>
 #include <linux/fcntl.h>
 #include <linux/termios.h>	/* For TIOCINQ/OUTQ */
diff --git a/net/x25/x25_dev.c b/net/x25/x25_dev.c
index 36fc3bf6d88..adfe7b8df35 100644
--- a/net/x25/x25_dev.c
+++ b/net/x25/x25_dev.c
@@ -81,7 +81,7 @@ static int x25_receive_data(struct sk_buff *skb, struct x25_neigh *nb)
 }
 
 int x25_lapb_receive_frame(struct sk_buff *skb, struct net_device *dev,
-			   struct packet_type *ptype)
+			   struct packet_type *ptype, struct net_device *orig_dev)
 {
 	struct sk_buff *nskb;
 	struct x25_neigh *nb;
diff --git a/net/x25/x25_in.c b/net/x25/x25_in.c
index b0197c70a9f..26146874b83 100644
--- a/net/x25/x25_in.c
+++ b/net/x25/x25_in.c
@@ -28,7 +28,7 @@
 #include <linux/string.h>
 #include <linux/skbuff.h>
 #include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
 #include <net/x25.h>
 
 static int x25_queue_rx_frame(struct sock *sk, struct sk_buff *skb, int more)
diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c
index 7fd872ad0c2..8be9b8fbc24 100644
--- a/net/x25/x25_subr.c
+++ b/net/x25/x25_subr.c
@@ -27,7 +27,7 @@
 #include <linux/string.h>
 #include <linux/skbuff.h>
 #include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
 #include <net/x25.h>
 
 /*
@@ -80,7 +80,7 @@ void x25_requeue_frames(struct sock *sk)
 		if (!skb_prev)
 			skb_queue_head(&sk->sk_write_queue, skb);
 		else
-			skb_append(skb_prev, skb);
+			skb_append(skb_prev, skb, &sk->sk_write_queue);
 		skb_prev = skb;
 	}
 }
diff --git a/net/x25/x25_timer.c b/net/x25/x25_timer.c
index d6a21a3ad80..0a92e1da392 100644
--- a/net/x25/x25_timer.c
+++ b/net/x25/x25_timer.c
@@ -23,7 +23,7 @@
 #include <linux/jiffies.h>
 #include <linux/timer.h>
 #include <net/sock.h>
-#include <net/tcp.h>
+#include <net/tcp_states.h>
 #include <net/x25.h>
 
 static void x25_heartbeat_expiry(unsigned long);
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index c58a6f05a0b..2407a707232 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -12,7 +12,7 @@
 #include <net/ip.h>
 #include <net/xfrm.h>
 
-static kmem_cache_t *secpath_cachep;
+static kmem_cache_t *secpath_cachep __read_mostly;
 
 void __secpath_destroy(struct sec_path *sp)
 {
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index d65ed8684fc..83c8135e176 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -37,7 +37,7 @@ EXPORT_SYMBOL(xfrm_policy_list);
 static DEFINE_RWLOCK(xfrm_policy_afinfo_lock);
 static struct xfrm_policy_afinfo *xfrm_policy_afinfo[NPROTO];
 
-static kmem_cache_t *xfrm_dst_cache;
+static kmem_cache_t *xfrm_dst_cache __read_mostly;
 
 static struct work_struct xfrm_policy_gc_work;
 static struct list_head xfrm_policy_gc_list =
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 8da3e25b2c4..c35336a0f71 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -1125,9 +1125,8 @@ static int xfrm_exp_state_notify(struct xfrm_state *x, struct km_event *c)
 	if (build_expire(skb, x, c->data.hard) < 0)
 		BUG();
 
-	NETLINK_CB(skb).dst_groups = XFRMGRP_EXPIRE;
-
-	return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_EXPIRE, GFP_ATOMIC);
+	NETLINK_CB(skb).dst_group = XFRMNLGRP_EXPIRE;
+	return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_EXPIRE, GFP_ATOMIC);
 }
 
 static int xfrm_notify_sa_flush(struct km_event *c)
@@ -1152,7 +1151,8 @@ static int xfrm_notify_sa_flush(struct km_event *c)
 
 	nlh->nlmsg_len = skb->tail - b;
 
-	return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_SA, GFP_ATOMIC);
+	NETLINK_CB(skb).dst_group = XFRMNLGRP_SA;
+	return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_SA, GFP_ATOMIC);
 
 nlmsg_failure:
 	kfree_skb(skb);
@@ -1226,7 +1226,8 @@ static int xfrm_notify_sa(struct xfrm_state *x, struct km_event *c)
 
 	nlh->nlmsg_len = skb->tail - b;
 
-	return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_SA, GFP_ATOMIC);
+	NETLINK_CB(skb).dst_group = XFRMNLGRP_SA;
+	return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_SA, GFP_ATOMIC);
 
 nlmsg_failure:
 rtattr_failure:
@@ -1304,9 +1305,8 @@ static int xfrm_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *xt,
 	if (build_acquire(skb, x, xt, xp, dir) < 0)
 		BUG();
 
-	NETLINK_CB(skb).dst_groups = XFRMGRP_ACQUIRE;
-
-	return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_ACQUIRE, GFP_ATOMIC);
+	NETLINK_CB(skb).dst_group = XFRMNLGRP_ACQUIRE;
+	return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_ACQUIRE, GFP_ATOMIC);
 }
 
 /* User gives us xfrm_user_policy_info followed by an array of 0
@@ -1405,9 +1405,8 @@ static int xfrm_exp_policy_notify(struct xfrm_policy *xp, int dir, struct km_eve
 	if (build_polexpire(skb, xp, dir, c->data.hard) < 0)
 		BUG();
 
-	NETLINK_CB(skb).dst_groups = XFRMGRP_EXPIRE;
-
-	return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_EXPIRE, GFP_ATOMIC);
+	NETLINK_CB(skb).dst_group = XFRMNLGRP_EXPIRE;
+	return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_EXPIRE, GFP_ATOMIC);
 }
 
 static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, struct km_event *c)
@@ -1455,7 +1454,8 @@ static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, struct km_event *
 
 	nlh->nlmsg_len = skb->tail - b;
 
-	return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_POLICY, GFP_ATOMIC);
+	NETLINK_CB(skb).dst_group = XFRMNLGRP_POLICY;
+	return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_POLICY, GFP_ATOMIC);
 
 nlmsg_failure:
 rtattr_failure:
@@ -1480,7 +1480,8 @@ static int xfrm_notify_policy_flush(struct km_event *c)
 
 	nlh->nlmsg_len = skb->tail - b;
 
-	return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_POLICY, GFP_ATOMIC);
+	NETLINK_CB(skb).dst_group = XFRMNLGRP_POLICY;
+	return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_POLICY, GFP_ATOMIC);
 
 nlmsg_failure:
 	kfree_skb(skb);
@@ -1519,7 +1520,8 @@ static int __init xfrm_user_init(void)
 {
 	printk(KERN_INFO "Initializing IPsec netlink socket\n");
 
-	xfrm_nl = netlink_kernel_create(NETLINK_XFRM, xfrm_netlink_rcv);
+	xfrm_nl = netlink_kernel_create(NETLINK_XFRM, XFRMNLGRP_MAX,
+	                                xfrm_netlink_rcv, THIS_MODULE);
 	if (xfrm_nl == NULL)
 		return -ENOMEM;
 
@@ -1537,3 +1539,4 @@ static void __exit xfrm_user_exit(void)
 module_init(xfrm_user_init);
 module_exit(xfrm_user_exit);
 MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_XFRM);
diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c
index 5180405c1a8..d8ee38aede2 100644
--- a/scripts/mod/file2alias.c
+++ b/scripts/mod/file2alias.c
@@ -341,6 +341,22 @@ static int do_of_entry (const char *filename, struct of_device_id *of, char *ali
     return 1;
 }
 
+static int do_vio_entry(const char *filename, struct vio_device_id *vio,
+		char *alias)
+{
+	char *tmp;
+
+	sprintf(alias, "vio:T%sS%s", vio->type[0] ? vio->type : "*",
+			vio->compat[0] ? vio->compat : "*");
+
+	/* Replace all whitespace with underscores */
+	for (tmp = alias; tmp && *tmp; tmp++)
+		if (isspace (*tmp))
+			*tmp = '_';
+
+	return 1;
+}
+
 /* Ignore any prefix, eg. v850 prepends _ */
 static inline int sym_is(const char *symbol, const char *name)
 {
@@ -422,6 +438,9 @@ void handle_moddevtable(struct module *mod, struct elf_info *info,
         else if (sym_is(symname, "__mod_of_device_table"))
 		do_table(symval, sym->st_size, sizeof(struct of_device_id),
 			 do_of_entry, mod);
+        else if (sym_is(symname, "__mod_vio_device_table"))
+		do_table(symval, sym->st_size, sizeof(struct vio_device_id),
+			 do_vio_entry, mod);
 
 }
 
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 2253f388234..8641f8894b4 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -659,7 +659,7 @@ static inline u16 socket_type_to_security_class(int family, int type, int protoc
 			return SECCLASS_NETLINK_ROUTE_SOCKET;
 		case NETLINK_FIREWALL:
 			return SECCLASS_NETLINK_FIREWALL_SOCKET;
-		case NETLINK_TCPDIAG:
+		case NETLINK_INET_DIAG:
 			return SECCLASS_NETLINK_TCPDIAG_SOCKET;
 		case NETLINK_NFLOG:
 			return SECCLASS_NETLINK_NFLOG_SOCKET;
diff --git a/security/selinux/netlink.c b/security/selinux/netlink.c
index 18d08acafa7..e203883406d 100644
--- a/security/selinux/netlink.c
+++ b/security/selinux/netlink.c
@@ -80,7 +80,8 @@ static void selnl_notify(int msgtype, void *data)
 	nlh = NLMSG_PUT(skb, 0, 0, msgtype, len);
 	selnl_add_payload(nlh, len, msgtype, data);
 	nlh->nlmsg_len = skb->tail - tmp;
-	netlink_broadcast(selnl, skb, 0, SELNL_GRP_AVC, GFP_USER);
+	NETLINK_CB(skb).dst_group = SELNLGRP_AVC;
+	netlink_broadcast(selnl, skb, 0, SELNLGRP_AVC, GFP_USER);
 out:
 	return;
 	
@@ -103,7 +104,8 @@ void selnl_notify_policyload(u32 seqno)
 
 static int __init selnl_init(void)
 {
-	selnl = netlink_kernel_create(NETLINK_SELINUX, NULL);
+	selnl = netlink_kernel_create(NETLINK_SELINUX, SELNLGRP_MAX, NULL,
+	                              THIS_MODULE);
 	if (selnl == NULL)
 		panic("SELinux:  Cannot create netlink socket.");
 	netlink_set_nonroot(NETLINK_SELINUX, NL_NONROOT_RECV);	
diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c
index 92b057becb4..69b9329b205 100644
--- a/security/selinux/nlmsgtab.c
+++ b/security/selinux/nlmsgtab.c
@@ -16,7 +16,7 @@
 #include <linux/rtnetlink.h>
 #include <linux/if.h>
 #include <linux/netfilter_ipv4/ip_queue.h>
-#include <linux/tcp_diag.h>
+#include <linux/inet_diag.h>
 #include <linux/xfrm.h>
 #include <linux/audit.h>
 
@@ -76,6 +76,7 @@ static struct nlmsg_perm nlmsg_firewall_perms[] =
 static struct nlmsg_perm nlmsg_tcpdiag_perms[] =
 {
 	{ TCPDIAG_GETSOCK,	NETLINK_TCPDIAG_SOCKET__NLMSG_READ },
+	{ DCCPDIAG_GETSOCK,	NETLINK_TCPDIAG_SOCKET__NLMSG_READ },
 };
 
 static struct nlmsg_perm nlmsg_xfrm_perms[] =