From 00e4d116a7ef94eb910be037912b0b2fc09f608b Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Fri, 22 Sep 2006 09:33:58 +0100
Subject: [DCCP]: Allow default/fallback service code.

This has been discussed on dccp@vger and removes the necessity for applications
to supply service codes in each and every case.

If an application does not want to provide a service code, that's fine, it will
be given 0. Otherwise, service codes can be set via socket options as before.

This patch has been tested using various client/server configurations
(including listening on multiple service codes).

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
---
 Documentation/networking/dccp.txt |  8 +++++---
 include/linux/dccp.h              |  6 +-----
 net/dccp/ipv4.c                   |  3 ---
 net/dccp/proto.c                  | 11 +----------
 4 files changed, 7 insertions(+), 21 deletions(-)

diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt
index c45daabd3bf..74563b38ffd 100644
--- a/Documentation/networking/dccp.txt
+++ b/Documentation/networking/dccp.txt
@@ -1,7 +1,6 @@
 DCCP protocol
 ============
 
-Last updated: 10 November 2005
 
 Contents
 ========
@@ -42,8 +41,11 @@ Socket options
 DCCP_SOCKOPT_PACKET_SIZE is used for CCID3 to set default packet size for
 calculations.
 
-DCCP_SOCKOPT_SERVICE sets the service. This is compulsory as per the
-specification. If you don't set it you will get EPROTO.
+DCCP_SOCKOPT_SERVICE sets the service. The specification mandates use of
+service codes (RFC 4340, sec. 8.1.2); if this socket option is not set,
+the socket will fall back to 0 (which means that no meaningful service code
+is present). Connecting sockets set at most one service option; for
+listening sockets, multiple service codes can be specified.
 
 Notes
 =====
diff --git a/include/linux/dccp.h b/include/linux/dccp.h
index 2d7671c92c0..29f9291e45c 100644
--- a/include/linux/dccp.h
+++ b/include/linux/dccp.h
@@ -404,6 +404,7 @@ struct dccp_service_list {
 };
 
 #define DCCP_SERVICE_INVALID_VALUE htonl((__u32)-1)
+#define DCCP_SERVICE_CODE_IS_ABSENT 		 0
 
 static inline int dccp_list_has_service(const struct dccp_service_list *sl,
 					const __be32 service)
@@ -484,11 +485,6 @@ static inline struct dccp_minisock *dccp_msk(const struct sock *sk)
 	return (struct dccp_minisock *)&dccp_sk(sk)->dccps_minisock;
 }
 
-static inline int dccp_service_not_initialized(const struct sock *sk)
-{
-	return dccp_sk(sk)->dccps_service == DCCP_SERVICE_INVALID_VALUE;
-}
-
 static inline const char *dccp_role(const struct sock *sk)
 {
 	switch (dccp_sk(sk)->dccps_role) {
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 9a1a76a7dc4..66be29b6f50 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -56,9 +56,6 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 
 	dp->dccps_role = DCCP_ROLE_CLIENT;
 
-	if (dccp_service_not_initialized(sk))
-		return -EPROTO;
-
 	if (addr_len < sizeof(struct sockaddr_in))
 		return -EINVAL;
 
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 962df0ea31a..72cbdcfc2c6 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -217,7 +217,7 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
 	icsk->icsk_sync_mss	= dccp_sync_mss;
 	dp->dccps_mss_cache	= 536;
 	dp->dccps_role		= DCCP_ROLE_UNDEFINED;
-	dp->dccps_service	= DCCP_SERVICE_INVALID_VALUE;
+	dp->dccps_service	= DCCP_SERVICE_CODE_IS_ABSENT;
 	dp->dccps_l_ack_ratio	= dp->dccps_r_ack_ratio = 1;
 
 	return 0;
@@ -267,12 +267,6 @@ static inline int dccp_listen_start(struct sock *sk)
 	struct dccp_sock *dp = dccp_sk(sk);
 
 	dp->dccps_role = DCCP_ROLE_LISTEN;
-	/*
-	 * Apps need to use setsockopt(DCCP_SOCKOPT_SERVICE)
-	 * before calling listen()
-	 */
-	if (dccp_service_not_initialized(sk))
-		return -EPROTO;
 	return inet_csk_listen_start(sk, TCP_SYNQ_HSIZE);
 }
 
@@ -540,9 +534,6 @@ static int dccp_getsockopt_service(struct sock *sk, int len,
 	int err = -ENOENT, slen = 0, total_len = sizeof(u32);
 
 	lock_sock(sk);
-	if (dccp_service_not_initialized(sk))
-		goto out;
-
 	if ((sl = dp->dccps_service_list) != NULL) {
 		slen = sl->dccpsl_nr * sizeof(u32);
 		total_len += slen;
-- 
cgit v1.2.3


From b83eff641ed39bd631535b9a8971e161b056f541 Mon Sep 17 00:00:00 2001
From: Ian McDonald <ian.mcdonald@jandi.co.nz>
Date: Fri, 22 Sep 2006 14:25:36 +1200
Subject: [DCCP]: Introduce constants for CCID numbers

Signed-off-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
---
 include/linux/dccp.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/include/linux/dccp.h b/include/linux/dccp.h
index 29f9291e45c..d6f4ec467a4 100644
--- a/include/linux/dccp.h
+++ b/include/linux/dccp.h
@@ -169,6 +169,12 @@ enum {
 	DCCPO_MAX_CCID_SPECIFIC = 255,
 };
 
+/* DCCP CCIDS */
+enum {
+	DCCPC_CCID2 = 2,
+	DCCPC_CCID3 = 3,
+};
+
 /* DCCP features */
 enum {
 	DCCPF_RESERVED = 0,
@@ -320,7 +326,7 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb)
 /* initial values for each feature */
 #define DCCPF_INITIAL_SEQUENCE_WINDOW		100
 #define DCCPF_INITIAL_ACK_RATIO			2
-#define DCCPF_INITIAL_CCID			2
+#define DCCPF_INITIAL_CCID			DCCPC_CCID2
 #define DCCPF_INITIAL_SEND_ACK_VECTOR		1
 /* FIXME: for now we're default to 1 but it should really be 0 */
 #define DCCPF_INITIAL_SEND_NDP_COUNT		1
-- 
cgit v1.2.3


From 3dd9a7c3a155ee96160876cba92439fdc96d7e0b Mon Sep 17 00:00:00 2001
From: Ian McDonald <ian.mcdonald@jandi.co.nz>
Date: Fri, 22 Sep 2006 14:26:44 +1200
Subject: [DCCP]: Use constants for CCIDs

With constants for CCID numbers this now uses them in some places.

Signed-off-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
---
 net/dccp/ccids/ccid2.c | 2 +-
 net/dccp/ccids/ccid3.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index 457dd3db7f4..2efb505aeb3 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -808,7 +808,7 @@ static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
 }
 
 static struct ccid_operations ccid2 = {
-	.ccid_id		= 2,
+	.ccid_id		= DCCPC_CCID2,
 	.ccid_name		= "ccid2",
 	.ccid_owner		= THIS_MODULE,
 	.ccid_hc_tx_obj_size	= sizeof(struct ccid2_hc_tx_sock),
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index 195aa956622..67d2dc0e7c6 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -1240,7 +1240,7 @@ static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len,
 }
 
 static struct ccid_operations ccid3 = {
-	.ccid_id		   = 3,
+	.ccid_id		   = DCCPC_CCID3,
 	.ccid_name		   = "ccid3",
 	.ccid_owner		   = THIS_MODULE,
 	.ccid_hc_tx_obj_size	   = sizeof(struct ccid3_hc_tx_sock),
-- 
cgit v1.2.3


From e41542f5167d6b506607f8dd111fa0a3e468ccb8 Mon Sep 17 00:00:00 2001
From: Ian McDonald <ian.mcdonald@jandi.co.nz>
Date: Fri, 22 Sep 2006 14:28:01 +1200
Subject: [DCCP]: Introduce dccp_probe

This adds DCCP probing shamelessly ripped off from TCP probes by Stephen
Hemminger.

I've put in here support for further CCID3 variables as well.
Andrea/Arnaldo might look to extend for CCID2.

Signed-off-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
---
 net/dccp/Kconfig  |  16 +++++
 net/dccp/Makefile |   2 +
 net/dccp/probe.c  | 198 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 216 insertions(+)
 create mode 100644 net/dccp/probe.c

diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig
index 859e3359fcd..e2a095d0fd8 100644
--- a/net/dccp/Kconfig
+++ b/net/dccp/Kconfig
@@ -40,6 +40,22 @@ config IP_DCCP_DEBUG
 
 	  Just say N.
 
+config NET_DCCPPROBE
+	tristate "DCCP connection probing"
+	depends on PROC_FS && KPROBES
+	---help---
+	This module allows for capturing the changes to DCCP connection
+	state in response to incoming packets. It is used for debugging
+	DCCP congestion avoidance modules. If you don't understand
+	what was just said, you don't need it: say N.
+
+	Documentation on how to use the packet generator can be found
+	at http://linux-net.osdl.org/index.php/DccpProbe
+
+	To compile this code as a module, choose M here: the
+	module will be called dccp_probe.
+
+
 endmenu
 
 endmenu
diff --git a/net/dccp/Makefile b/net/dccp/Makefile
index 7696e219b05..17ed99c4661 100644
--- a/net/dccp/Makefile
+++ b/net/dccp/Makefile
@@ -11,9 +11,11 @@ dccp_ipv4-y := ipv4.o
 dccp-$(CONFIG_IP_DCCP_ACKVEC) += ackvec.o
 
 obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o
+obj-$(CONFIG_NET_DCCPPROBE) += dccp_probe.o
 
 dccp-$(CONFIG_SYSCTL) += sysctl.o
 
 dccp_diag-y := diag.o
+dccp_probe-y := probe.o
 
 obj-y += ccids/
diff --git a/net/dccp/probe.c b/net/dccp/probe.c
new file mode 100644
index 00000000000..146496fce2e
--- /dev/null
+++ b/net/dccp/probe.c
@@ -0,0 +1,198 @@
+/*
+ * dccp_probe - Observe the DCCP flow with kprobes.
+ *
+ * The idea for this came from Werner Almesberger's umlsim
+ * Copyright (C) 2004, Stephen Hemminger <shemminger@osdl.org>
+ *
+ * Modified for DCCP from Stephen Hemminger's code
+ * Copyright (C) 2006, Ian McDonald <ian.mcdonald@jandi.co.nz>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kprobes.h>
+#include <linux/socket.h>
+#include <linux/dccp.h>
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+#include <linux/kfifo.h>
+#include <linux/vmalloc.h>
+
+#include "dccp.h"
+#include "ccid.h"
+#include "ccids/ccid3.h"
+
+static int port;
+
+static int bufsize = 64 * 1024;
+
+static const char procname[] = "dccpprobe";
+
+struct {
+	struct kfifo	  *fifo;
+	spinlock_t	  lock;
+	wait_queue_head_t wait;
+	struct timeval	  tstart;
+} dccpw;
+
+static void printl(const char *fmt, ...)
+{
+	va_list args;
+	int len;
+	struct timeval now;
+	char tbuf[256];
+
+	va_start(args, fmt);
+	do_gettimeofday(&now);
+
+	now.tv_sec -= dccpw.tstart.tv_sec;
+	now.tv_usec -= dccpw.tstart.tv_usec;
+	if (now.tv_usec < 0) {
+		--now.tv_sec;
+		now.tv_usec += 1000000;
+	}
+
+	len = sprintf(tbuf, "%lu.%06lu ",
+		      (unsigned long) now.tv_sec,
+		      (unsigned long) now.tv_usec);
+	len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args);
+	va_end(args);
+
+	kfifo_put(dccpw.fifo, tbuf, len);
+	wake_up(&dccpw.wait);
+}
+
+static int jdccp_sendmsg(struct kiocb *iocb, struct sock *sk,
+			 struct msghdr *msg, size_t size)
+{
+	const struct dccp_minisock *dmsk = dccp_msk(sk);
+	const struct inet_sock *inet = inet_sk(sk);
+	const struct ccid3_hc_tx_sock *hctx;
+
+	if (dmsk->dccpms_tx_ccid == DCCPC_CCID3)
+		hctx = ccid3_hc_tx_sk(sk);
+	else
+		hctx = NULL;
+
+	if (port == 0 || ntohs(inet->dport) == port ||
+	    ntohs(inet->sport) == port) {
+		if (hctx)
+			printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d %d %d %d %d\n",
+			   NIPQUAD(inet->saddr), ntohs(inet->sport),
+			   NIPQUAD(inet->daddr), ntohs(inet->dport), size,
+			   hctx->ccid3hctx_s, hctx->ccid3hctx_rtt,
+			   hctx->ccid3hctx_p, hctx->ccid3hctx_t_ipi);
+		else
+			printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d\n",
+			   NIPQUAD(inet->saddr), ntohs(inet->sport),
+			   NIPQUAD(inet->daddr), ntohs(inet->dport), size);
+	}
+
+	jprobe_return();
+	return 0;
+}
+
+static struct jprobe dccp_send_probe = {
+	.kp	= { .addr = (kprobe_opcode_t *)&dccp_sendmsg, },
+	.entry	= (kprobe_opcode_t *)&jdccp_sendmsg,
+};
+
+static int dccpprobe_open(struct inode *inode, struct file *file)
+{
+	kfifo_reset(dccpw.fifo);
+	do_gettimeofday(&dccpw.tstart);
+	return 0;
+}
+
+static ssize_t dccpprobe_read(struct file *file, char __user *buf,
+			      size_t len, loff_t *ppos)
+{
+	int error = 0, cnt = 0;
+	unsigned char *tbuf;
+
+	if (!buf || len < 0)
+		return -EINVAL;
+
+	if (len == 0)
+		return 0;
+
+	tbuf = vmalloc(len);
+	if (!tbuf)
+		return -ENOMEM;
+
+	error = wait_event_interruptible(dccpw.wait,
+					 __kfifo_len(dccpw.fifo) != 0);
+	if (error)
+		goto out_free;
+
+	cnt = kfifo_get(dccpw.fifo, tbuf, len);
+	error = copy_to_user(buf, tbuf, cnt);
+
+out_free:
+	vfree(tbuf);
+
+	return error ? error : cnt;
+}
+
+static struct file_operations dccpprobe_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = dccpprobe_open,
+	.read    = dccpprobe_read,
+};
+
+static __init int dccpprobe_init(void)
+{
+	int ret = -ENOMEM;
+
+	init_waitqueue_head(&dccpw.wait);
+	spin_lock_init(&dccpw.lock);
+	dccpw.fifo = kfifo_alloc(bufsize, GFP_KERNEL, &dccpw.lock);
+
+	if (!proc_net_fops_create(procname, S_IRUSR, &dccpprobe_fops))
+		goto err0;
+
+	ret = register_jprobe(&dccp_send_probe);
+	if (ret)
+		goto err1;
+
+	pr_info("DCCP watch registered (port=%d)\n", port);
+	return 0;
+err1:
+	proc_net_remove(procname);
+err0:
+	kfifo_free(dccpw.fifo);
+	return ret;
+}
+module_init(dccpprobe_init);
+
+static __exit void dccpprobe_exit(void)
+{
+	kfifo_free(dccpw.fifo);
+	proc_net_remove(procname);
+	unregister_jprobe(&dccp_send_probe);
+
+}
+module_exit(dccpprobe_exit);
+
+MODULE_PARM_DESC(port, "Port to match (0=all)");
+module_param(port, int, 0);
+
+MODULE_PARM_DESC(bufsize, "Log buffer size (default 64k)");
+module_param(bufsize, int, 0);
+
+MODULE_AUTHOR("Ian McDonald <ian.mcdonald@jandi.co.nz>");
+MODULE_DESCRIPTION("DCCP snooper");
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 0d6c7ae22d4fadbc83677ea1a6144eea8911e319 Mon Sep 17 00:00:00 2001
From: Yasuyuki Kozakai <yasuyuki.kozakai@toshiba.co.jp>
Date: Sun, 24 Sep 2006 19:28:47 -0700
Subject: [NETFILTER]: Add dscp,DSCP headers to header-y

This patch adds xt_dscp.h and xt_DSCP.h to the kernel headers which are
exported via 'make headers_install'. These are necessary for userspace
to add rules using dscp match and DSCP target.

Signed-off-by: Yasuyuki Kozakai <yasuyuki.kozakai@toshiba.co.jp>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/Kbuild | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/linux/netfilter/Kbuild b/include/linux/netfilter/Kbuild
index 9a285cecf24..312bd2ffee3 100644
--- a/include/linux/netfilter/Kbuild
+++ b/include/linux/netfilter/Kbuild
@@ -10,6 +10,8 @@ header-y += xt_connmark.h
 header-y += xt_CONNMARK.h
 header-y += xt_conntrack.h
 header-y += xt_dccp.h
+header-y += xt_dscp.h
+header-y += xt_DSCP.h
 header-y += xt_esp.h
 header-y += xt_helper.h
 header-y += xt_length.h
-- 
cgit v1.2.3


From 5b7c714ec27584b18279b741b6043016f8adb9de Mon Sep 17 00:00:00 2001
From: Roland Dreier <roland@digitalvampire.org>
Date: Sun, 24 Sep 2006 20:09:33 -0700
Subject: [ATM] he: Fix __init/__devinit conflict

he_init_one() is declared __devinit, but calls lots of init functions
that are marked __init.  However, if CONFIG_HOTPLUG is enabled,
__devinit functions go into normal .text, which leads to

    WARNING: drivers/atm/he.o - Section mismatch: reference to .init.text: from .text between 'he_start' (at offset 0x2130) and 'he_service_tbrq'

Fix this by changing the __init functions to __devinit.

Signed-off-by: Roland Dreier <roland@digitalvampire.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/atm/he.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/atm/he.c b/drivers/atm/he.c
index 41e052fecd7..f2511b42dba 100644
--- a/drivers/atm/he.c
+++ b/drivers/atm/he.c
@@ -454,7 +454,7 @@ rate_to_atmf(unsigned rate)		/* cps to atm forum format */
 	return (NONZERO | (exp << 9) | (rate & 0x1ff));
 }
 
-static void __init
+static void __devinit
 he_init_rx_lbfp0(struct he_dev *he_dev)
 {
 	unsigned i, lbm_offset, lbufd_index, lbuf_addr, lbuf_count;
@@ -485,7 +485,7 @@ he_init_rx_lbfp0(struct he_dev *he_dev)
 	he_writel(he_dev, he_dev->r0_numbuffs, RLBF0_C);
 }
 
-static void __init
+static void __devinit
 he_init_rx_lbfp1(struct he_dev *he_dev)
 {
 	unsigned i, lbm_offset, lbufd_index, lbuf_addr, lbuf_count;
@@ -516,7 +516,7 @@ he_init_rx_lbfp1(struct he_dev *he_dev)
 	he_writel(he_dev, he_dev->r1_numbuffs, RLBF1_C);
 }
 
-static void __init
+static void __devinit
 he_init_tx_lbfp(struct he_dev *he_dev)
 {
 	unsigned i, lbm_offset, lbufd_index, lbuf_addr, lbuf_count;
@@ -546,7 +546,7 @@ he_init_tx_lbfp(struct he_dev *he_dev)
 	he_writel(he_dev, lbufd_index - 1, TLBF_T);
 }
 
-static int __init
+static int __devinit
 he_init_tpdrq(struct he_dev *he_dev)
 {
 	he_dev->tpdrq_base = pci_alloc_consistent(he_dev->pci_dev,
@@ -568,7 +568,7 @@ he_init_tpdrq(struct he_dev *he_dev)
 	return 0;
 }
 
-static void __init
+static void __devinit
 he_init_cs_block(struct he_dev *he_dev)
 {
 	unsigned clock, rate, delta;
@@ -664,7 +664,7 @@ he_init_cs_block(struct he_dev *he_dev)
 
 }
 
-static int __init
+static int __devinit
 he_init_cs_block_rcm(struct he_dev *he_dev)
 {
 	unsigned (*rategrid)[16][16];
@@ -785,7 +785,7 @@ he_init_cs_block_rcm(struct he_dev *he_dev)
 	return 0;
 }
 
-static int __init
+static int __devinit
 he_init_group(struct he_dev *he_dev, int group)
 {
 	int i;
@@ -955,7 +955,7 @@ he_init_group(struct he_dev *he_dev, int group)
 	return 0;
 }
 
-static int __init
+static int __devinit
 he_init_irq(struct he_dev *he_dev)
 {
 	int i;
-- 
cgit v1.2.3


From 3d2573f7ebe507e372a23cdd3c8b03305d6e90aa Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Sun, 24 Sep 2006 20:11:58 -0700
Subject: [TCP]: default congestion control menu

Change how default TCP congestion control is chosen. Don't just use
last installed module, instead allow selection during configuration,
and make sure and use the default regardless of load order.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/Kconfig           | 45 ++++++++++++++++++++++++++++++++++++++++-----
 net/ipv4/sysctl_net_ipv4.c |  6 ++++++
 net/ipv4/tcp_cong.c        |  2 +-
 3 files changed, 47 insertions(+), 6 deletions(-)

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 1650b64415a..1bbc3689cd8 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -448,7 +448,7 @@ config INET_TCP_DIAG
 	depends on INET_DIAG
 	def_tristate INET_DIAG
 
-config TCP_CONG_ADVANCED
+menuconfig TCP_CONG_ADVANCED
 	bool "TCP: advanced congestion control"
 	---help---
 	  Support for selection of various TCP congestion control
@@ -459,9 +459,7 @@ config TCP_CONG_ADVANCED
 
 	  If unsure, say N.
 
-# TCP Reno is builtin (required as fallback)
-menu "TCP congestion control"
-	depends on TCP_CONG_ADVANCED
+if TCP_CONG_ADVANCED
 
 config TCP_CONG_BIC
 	tristate "Binary Increase Congestion (BIC) control"
@@ -574,12 +572,49 @@ config TCP_CONG_VENO
 	loss packets.
 	See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf
 
-endmenu
+choice
+	prompt "Default TCP congestion control"
+	default DEFAULT_BIC
+	help
+	  Select the TCP congestion control that will be used by default
+	  for all connections.
+
+	config DEFAULT_BIC
+		bool "Bic" if TCP_CONG_BIC=y
+
+	config DEFAULT_CUBIC
+		bool "Cubic" if TCP_CONG_CUBIC=y
+
+	config DEFAULT_HTCP
+		bool "Htcp" if TCP_CONG_HTCP=y
+
+	config DEFAULT_VEGAS
+		bool "Vegas" if TCP_CONG_VEGAS=y
+
+	config DEFAULT_WESTWOOD
+		bool "Westwood" if TCP_CONG_WESTWOOD=y
+
+	config DEFAULT_RENO
+		bool "Reno"
+
+endchoice
+
+endif
 
 config TCP_CONG_BIC
 	tristate
 	depends on !TCP_CONG_ADVANCED
 	default y
 
+config DEFAULT_TCP_CONG
+	string
+	default "bic" if DEFAULT_BIC
+	default "cubic" if DEFAULT_CUBIC
+	default "htcp" if DEFAULT_HTCP
+	default "vegas" if DEFAULT_VEGAS
+	default "westwood" if DEFAULT_WESTWOOD
+	default "reno" if DEFAULT_RENO
+	default "bic"
+
 source "net/ipv4/ipvs/Kconfig"
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 19b2071ff31..e82a5be894b 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -129,6 +129,12 @@ static int sysctl_tcp_congestion_control(ctl_table *table, int __user *name,
 	return ret;
 }
 
+static int __init tcp_congestion_default(void)
+{
+	return tcp_set_default_congestion_control(CONFIG_DEFAULT_TCP_CONG);
+}
+
+late_initcall(tcp_congestion_default);
 
 ctl_table ipv4_table[] = {
         {
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 7ff2e4273a7..af0aca1e6be 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -48,7 +48,7 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
 		printk(KERN_NOTICE "TCP %s already registered\n", ca->name);
 		ret = -EEXIST;
 	} else {
-		list_add_rcu(&ca->list, &tcp_cong_list);
+		list_add_tail_rcu(&ca->list, &tcp_cong_list);
 		printk(KERN_INFO "TCP %s registered\n", ca->name);
 	}
 	spin_unlock(&tcp_cong_list_lock);
-- 
cgit v1.2.3


From 597811ec167fa01c926a0957a91d9e39baa30e64 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Sun, 24 Sep 2006 20:13:03 -0700
Subject: [TCP]: make cubic the default

Change default congestion control used from BIC to the newer CUBIC
which it the successor to BIC but has better properties over long delay links.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/Kconfig | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 1bbc3689cd8..30af4a4dfcc 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -455,7 +455,7 @@ menuconfig TCP_CONG_ADVANCED
 	  modules.
 
 	  Nearly all users can safely say no here, and a safe default
-	  selection will be made (BIC-TCP with new Reno as a fallback).
+	  selection will be made (CUBIC with new Reno as a fallback).
 
 	  If unsure, say N.
 
@@ -463,7 +463,7 @@ if TCP_CONG_ADVANCED
 
 config TCP_CONG_BIC
 	tristate "Binary Increase Congestion (BIC) control"
-	default y
+	default m
 	---help---
 	BIC-TCP is a sender-side only change that ensures a linear RTT
 	fairness under large windows while offering both scalability and
@@ -477,7 +477,7 @@ config TCP_CONG_BIC
 
 config TCP_CONG_CUBIC
 	tristate "CUBIC TCP"
-	default m
+	default y
 	---help---
 	This is version 2.0 of BIC-TCP which uses a cubic growth function
 	among other techniques.
@@ -574,7 +574,7 @@ config TCP_CONG_VENO
 
 choice
 	prompt "Default TCP congestion control"
-	default DEFAULT_BIC
+	default DEFAULT_CUBIC
 	help
 	  Select the TCP congestion control that will be used by default
 	  for all connections.
@@ -601,7 +601,7 @@ endchoice
 
 endif
 
-config TCP_CONG_BIC
+config TCP_CONG_CUBIC
 	tristate
 	depends on !TCP_CONG_ADVANCED
 	default y
@@ -614,7 +614,7 @@ config DEFAULT_TCP_CONG
 	default "vegas" if DEFAULT_VEGAS
 	default "westwood" if DEFAULT_WESTWOOD
 	default "reno" if DEFAULT_RENO
-	default "bic"
+	default "cubic"
 
 source "net/ipv4/ipvs/Kconfig"
 
-- 
cgit v1.2.3


From 14a72f53fb1bb5d5c2bdd8cf172219519664729a Mon Sep 17 00:00:00 2001
From: Paul Moore <paul.moore@hp.com>
Date: Mon, 25 Sep 2006 15:52:01 -0700
Subject: [NetLabel]: correct improper handling of non-NetLabel peer contexts

Fix a problem where NetLabel would always set the value of
sk_security_struct->peer_sid in selinux_netlbl_sock_graft() to the context of
the socket, causing problems when users would query the context of the
connection.  This patch fixes this so that the value in
sk_security_struct->peer_sid is only set when the connection is NetLabel based,
otherwise the value is untouched.

Signed-off-by: Paul Moore <paul.moore@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/cipso_ipv4.h       |  7 ++++++
 include/net/netlabel.h         |  8 +++++++
 net/ipv4/cipso_ipv4.c          | 48 +++++++++++++++++++++++++++++-------------
 net/netlabel/netlabel_kapi.c   | 23 ++++++++++++++++++++
 security/selinux/ss/services.c | 12 ++++++++++-
 5 files changed, 82 insertions(+), 16 deletions(-)

diff --git a/include/net/cipso_ipv4.h b/include/net/cipso_ipv4.h
index 59406e0dc5b..6718452a5cd 100644
--- a/include/net/cipso_ipv4.h
+++ b/include/net/cipso_ipv4.h
@@ -205,6 +205,7 @@ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway);
 int cipso_v4_socket_setattr(const struct socket *sock,
 			    const struct cipso_v4_doi *doi_def,
 			    const struct netlbl_lsm_secattr *secattr);
+int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr);
 int cipso_v4_socket_getattr(const struct socket *sock,
 			    struct netlbl_lsm_secattr *secattr);
 int cipso_v4_skbuff_getattr(const struct sk_buff *skb,
@@ -225,6 +226,12 @@ static inline int cipso_v4_socket_setattr(const struct socket *sock,
 	return -ENOSYS;
 }
 
+static inline int cipso_v4_sock_getattr(struct sock *sk,
+					struct netlbl_lsm_secattr *secattr)
+{
+	return -ENOSYS;
+}
+
 static inline int cipso_v4_socket_getattr(const struct socket *sock,
 					  struct netlbl_lsm_secattr *secattr)
 {
diff --git a/include/net/netlabel.h b/include/net/netlabel.h
index dd5780b3691..bf7b564e354 100644
--- a/include/net/netlabel.h
+++ b/include/net/netlabel.h
@@ -238,6 +238,8 @@ static inline void netlbl_secattr_free(struct netlbl_lsm_secattr *secattr,
 #ifdef CONFIG_NETLABEL
 int netlbl_socket_setattr(const struct socket *sock,
 			  const struct netlbl_lsm_secattr *secattr);
+int netlbl_sock_getattr(struct sock *sk,
+			struct netlbl_lsm_secattr *secattr);
 int netlbl_socket_getattr(const struct socket *sock,
 			  struct netlbl_lsm_secattr *secattr);
 int netlbl_skbuff_getattr(const struct sk_buff *skb,
@@ -250,6 +252,12 @@ static inline int netlbl_socket_setattr(const struct socket *sock,
 	return -ENOSYS;
 }
 
+static inline int netlbl_sock_getattr(struct sock *sk,
+				      struct netlbl_lsm_secattr *secattr)
+{
+	return -ENOSYS;
+}
+
 static inline int netlbl_socket_getattr(const struct socket *sock,
 					struct netlbl_lsm_secattr *secattr)
 {
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 80a2a0911b4..a3bae2ca8ac 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -1486,43 +1486,40 @@ socket_setattr_failure:
 }
 
 /**
- * cipso_v4_socket_getattr - Get the security attributes from a socket
- * @sock: the socket
+ * cipso_v4_sock_getattr - Get the security attributes from a sock
+ * @sk: the sock
  * @secattr: the security attributes
  *
  * Description:
- * Query @sock to see if there is a CIPSO option attached to the socket and if
- * there is return the CIPSO security attributes in @secattr.  Returns zero on
- * success and negative values on failure.
+ * Query @sk to see if there is a CIPSO option attached to the sock and if
+ * there is return the CIPSO security attributes in @secattr.  This function
+ * requires that @sk be locked, or privately held, but it does not do any
+ * locking itself.  Returns zero on success and negative values on failure.
  *
  */
-int cipso_v4_socket_getattr(const struct socket *sock,
-			    struct netlbl_lsm_secattr *secattr)
+int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr)
 {
 	int ret_val = -ENOMSG;
-	struct sock *sk;
 	struct inet_sock *sk_inet;
 	unsigned char *cipso_ptr;
 	u32 doi;
 	struct cipso_v4_doi *doi_def;
 
-	sk = sock->sk;
-	lock_sock(sk);
 	sk_inet = inet_sk(sk);
 	if (sk_inet->opt == NULL || sk_inet->opt->cipso == 0)
-		goto socket_getattr_return;
+		return -ENOMSG;
 	cipso_ptr = sk_inet->opt->__data + sk_inet->opt->cipso -
 		sizeof(struct iphdr);
 	ret_val = cipso_v4_cache_check(cipso_ptr, cipso_ptr[1], secattr);
 	if (ret_val == 0)
-		goto socket_getattr_return;
+		return ret_val;
 
 	doi = ntohl(*(u32 *)&cipso_ptr[2]);
 	rcu_read_lock();
 	doi_def = cipso_v4_doi_getdef(doi);
 	if (doi_def == NULL) {
 		rcu_read_unlock();
-		goto socket_getattr_return;
+		return -ENOMSG;
 	}
 	switch (cipso_ptr[6]) {
 	case CIPSO_V4_TAG_RBITMAP:
@@ -1533,8 +1530,29 @@ int cipso_v4_socket_getattr(const struct socket *sock,
 	}
 	rcu_read_unlock();
 
-socket_getattr_return:
-	release_sock(sk);
+	return ret_val;
+}
+
+/**
+ * cipso_v4_socket_getattr - Get the security attributes from a socket
+ * @sock: the socket
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Query @sock to see if there is a CIPSO option attached to the socket and if
+ * there is return the CIPSO security attributes in @secattr.  Returns zero on
+ * success and negative values on failure.
+ *
+ */
+int cipso_v4_socket_getattr(const struct socket *sock,
+			    struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val;
+
+	lock_sock(sock->sk);
+	ret_val = cipso_v4_sock_getattr(sock->sk, secattr);
+	release_sock(sock->sk);
+
 	return ret_val;
 }
 
diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c
index 0fd8aaafe23..54fb7de3c2b 100644
--- a/net/netlabel/netlabel_kapi.c
+++ b/net/netlabel/netlabel_kapi.c
@@ -84,6 +84,29 @@ socket_setattr_return:
 	return ret_val;
 }
 
+/**
+ * netlbl_sock_getattr - Determine the security attributes of a sock
+ * @sk: the sock
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Examines the given sock to see any NetLabel style labeling has been
+ * applied to the sock, if so it parses the socket label and returns the
+ * security attributes in @secattr.  Returns zero on success, negative values
+ * on failure.
+ *
+ */
+int netlbl_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val;
+
+	ret_val = cipso_v4_sock_getattr(sk, secattr);
+	if (ret_val == 0)
+		return 0;
+
+	return netlbl_unlabel_getattr(secattr);
+}
+
 /**
  * netlbl_socket_getattr - Determine the security attributes of a socket
  * @sock: the socket
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index 7eb69a602d8..d67f7e65852 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -2502,14 +2502,24 @@ void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock)
 {
 	struct inode_security_struct *isec = SOCK_INODE(sock)->i_security;
 	struct sk_security_struct *sksec = sk->sk_security;
+	struct netlbl_lsm_secattr secattr;
+	u32 nlbl_peer_sid;
 
 	sksec->sclass = isec->sclass;
 
 	if (sk->sk_family != PF_INET)
 		return;
 
+	netlbl_secattr_init(&secattr);
+	if (netlbl_sock_getattr(sk, &secattr) == 0 &&
+	    selinux_netlbl_secattr_to_sid(NULL,
+					  &secattr,
+					  sksec->sid,
+					  &nlbl_peer_sid) == 0)
+		sksec->peer_sid = nlbl_peer_sid;
+	netlbl_secattr_destroy(&secattr, 0);
+
 	sksec->nlbl_state = NLBL_REQUIRE;
-	sksec->peer_sid = sksec->sid;
 
 	/* Try to set the NetLabel on the socket to save time later, if we fail
 	 * here we will pick up the pieces in later calls to
-- 
cgit v1.2.3


From 609c92feea5652809319bb77f19d24a44615687d Mon Sep 17 00:00:00 2001
From: Paul Moore <paul.moore@hp.com>
Date: Mon, 25 Sep 2006 15:52:37 -0700
Subject: [NetLabel]: make the CIPSOv4 cache spinlocks bottom half safe

The CIPSOv4 cache traversal routines are triggered both the userspace events
(cache invalidation due to DOI removal or updated SELinux policy) and network
packet processing events.  As a result there is a problem with the existing
CIPSOv4 cache spinlocks as they are not bottom-half/softirq safe.  This patch
converts the CIPSOv4 cache spin_[un]lock() calls into spin_[un]lock_bh() calls
to address this problem.

Signed-off-by: Paul Moore <paul.moore@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/cipso_ipv4.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index a3bae2ca8ac..87e71563335 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -259,7 +259,7 @@ void cipso_v4_cache_invalidate(void)
 	u32 iter;
 
 	for (iter = 0; iter < CIPSO_V4_CACHE_BUCKETS; iter++) {
-		spin_lock(&cipso_v4_cache[iter].lock);
+		spin_lock_bh(&cipso_v4_cache[iter].lock);
 		list_for_each_entry_safe(entry,
 					 tmp_entry,
 					 &cipso_v4_cache[iter].list, list) {
@@ -267,7 +267,7 @@ void cipso_v4_cache_invalidate(void)
 			cipso_v4_cache_entry_free(entry);
 		}
 		cipso_v4_cache[iter].size = 0;
-		spin_unlock(&cipso_v4_cache[iter].lock);
+		spin_unlock_bh(&cipso_v4_cache[iter].lock);
 	}
 
 	return;
@@ -309,7 +309,7 @@ static int cipso_v4_cache_check(const unsigned char *key,
 
 	hash = cipso_v4_map_cache_hash(key, key_len);
 	bkt = hash & (CIPSO_V4_CACHE_BUCKETBITS - 1);
-	spin_lock(&cipso_v4_cache[bkt].lock);
+	spin_lock_bh(&cipso_v4_cache[bkt].lock);
 	list_for_each_entry(entry, &cipso_v4_cache[bkt].list, list) {
 		if (entry->hash == hash &&
 		    entry->key_len == key_len &&
@@ -318,7 +318,7 @@ static int cipso_v4_cache_check(const unsigned char *key,
 			secattr->cache.free = entry->lsm_data.free;
 			secattr->cache.data = entry->lsm_data.data;
 			if (prev_entry == NULL) {
-				spin_unlock(&cipso_v4_cache[bkt].lock);
+				spin_unlock_bh(&cipso_v4_cache[bkt].lock);
 				return 0;
 			}
 
@@ -333,12 +333,12 @@ static int cipso_v4_cache_check(const unsigned char *key,
 					   &prev_entry->list);
 			}
 
-			spin_unlock(&cipso_v4_cache[bkt].lock);
+			spin_unlock_bh(&cipso_v4_cache[bkt].lock);
 			return 0;
 		}
 		prev_entry = entry;
 	}
-	spin_unlock(&cipso_v4_cache[bkt].lock);
+	spin_unlock_bh(&cipso_v4_cache[bkt].lock);
 
 	return -ENOENT;
 }
@@ -387,7 +387,7 @@ int cipso_v4_cache_add(const struct sk_buff *skb,
 	entry->lsm_data.data = secattr->cache.data;
 
 	bkt = entry->hash & (CIPSO_V4_CACHE_BUCKETBITS - 1);
-	spin_lock(&cipso_v4_cache[bkt].lock);
+	spin_lock_bh(&cipso_v4_cache[bkt].lock);
 	if (cipso_v4_cache[bkt].size < cipso_v4_cache_bucketsize) {
 		list_add(&entry->list, &cipso_v4_cache[bkt].list);
 		cipso_v4_cache[bkt].size += 1;
@@ -398,7 +398,7 @@ int cipso_v4_cache_add(const struct sk_buff *skb,
 		list_add(&entry->list, &cipso_v4_cache[bkt].list);
 		cipso_v4_cache_entry_free(old_entry);
 	}
-	spin_unlock(&cipso_v4_cache[bkt].lock);
+	spin_unlock_bh(&cipso_v4_cache[bkt].lock);
 
 	return 0;
 
-- 
cgit v1.2.3


From df2115c3134d0d1a18c1f37f5192394e7f64d1e0 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul.moore@hp.com>
Date: Mon, 25 Sep 2006 15:53:13 -0700
Subject: [NetLabel]: change the SELinux permissions

Change NetLabel to use the 'recvfrom' socket permission and the
SECINITSID_NETMSG SELinux SID as the NetLabel base SID for incoming packets.
This patch effectively makes the old, and currently unused, SELinux NETMSG
permissions NetLabel permissions.

Signed-of-by: Paul Moore <paul.moore@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 security/selinux/ss/services.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index d67f7e65852..22ed17c1771 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -2611,7 +2611,7 @@ int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec,
 	u32 netlbl_sid;
 	u32 recv_perm;
 
-	rc = selinux_netlbl_skbuff_getsid(skb, sksec->sid, &netlbl_sid);
+	rc = selinux_netlbl_skbuff_getsid(skb, SECINITSID_NETMSG, &netlbl_sid);
 	if (rc != 0)
 		return rc;
 
@@ -2620,13 +2620,13 @@ int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec,
 
 	switch (sksec->sclass) {
 	case SECCLASS_UDP_SOCKET:
-		recv_perm = UDP_SOCKET__RECV_MSG;
+		recv_perm = UDP_SOCKET__RECVFROM;
 		break;
 	case SECCLASS_TCP_SOCKET:
-		recv_perm = TCP_SOCKET__RECV_MSG;
+		recv_perm = TCP_SOCKET__RECVFROM;
 		break;
 	default:
-		recv_perm = RAWIP_SOCKET__RECV_MSG;
+		recv_perm = RAWIP_SOCKET__RECVFROM;
 	}
 
 	rc = avc_has_perm(sksec->sid,
-- 
cgit v1.2.3


From 22acb19a91d2b551ea37647747972e5286284b22 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul.moore@hp.com>
Date: Mon, 25 Sep 2006 15:53:37 -0700
Subject: [NETLINK]: add nla_for_each_nested() to the interface list

At the top of include/net/netlink.h is a list of Netlink interfaces, however,
the nla_for_each_nested() macro was not listed.  This patch adds this interface
to the list at the top of the header file.

Signed-off-by: Paul Moore <paul.moore@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netlink.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/net/netlink.h b/include/net/netlink.h
index 11dc2e7f679..2897a73f81e 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -151,6 +151,7 @@
  *   nla_parse()			parse and validate stream of attrs
  *   nla_parse_nested()			parse nested attribuets
  *   nla_for_each_attr()		loop over all attributes
+ *   nla_for_each_nested()		loop over the nested attributes
  *=========================================================================
  */
 
-- 
cgit v1.2.3


From 4fe5d5c07ab615a52fd1b0ceba5aeed7c612821a Mon Sep 17 00:00:00 2001
From: Paul Moore <paul.moore@hp.com>
Date: Mon, 25 Sep 2006 15:54:03 -0700
Subject: [Netlink]: add nla_validate_nested()

Add a new function, nla_validate_nested(), to validate nested Netlink
attributes.

Signed-off-by: Paul Moore <paul.moore@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netlink.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/include/net/netlink.h b/include/net/netlink.h
index 2897a73f81e..4ab68a7a636 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -146,6 +146,7 @@
  *   nla_ok(nla, remaining)		does nla fit into remaining bytes?
  *   nla_next(nla, remaining)		get next netlink attribute
  *   nla_validate()			validate a stream of attributes
+ *   nla_validate_nested()		validate a stream of nested attributes
  *   nla_find()				find attribute in stream of attributes
  *   nla_find_nested()			find attribute in nested attributes
  *   nla_parse()			parse and validate stream of attrs
@@ -950,6 +951,24 @@ static inline int nla_nest_cancel(struct sk_buff *skb, struct nlattr *start)
 	return nlmsg_trim(skb, start);
 }
 
+/**
+ * nla_validate_nested - Validate a stream of nested attributes
+ * @start: container attribute
+ * @maxtype: maximum attribute type to be expected
+ * @policy: validation policy
+ *
+ * Validates all attributes in the nested attribute stream against the
+ * specified policy. Attributes with a type exceeding maxtype will be
+ * ignored. See documenation of struct nla_policy for more details.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static inline int nla_validate_nested(struct nlattr *start, int maxtype,
+				      struct nla_policy *policy)
+{
+	return nla_validate(nla_data(start), nla_len(start), maxtype, policy);
+}
+
 /**
  * nla_for_each_attr - iterate over a stream of attributes
  * @pos: loop counter, set to current attribute
-- 
cgit v1.2.3


From fcd48280643e92ec6cb29a04e9079dd7b6b5bfef Mon Sep 17 00:00:00 2001
From: Paul Moore <paul.moore@hp.com>
Date: Mon, 25 Sep 2006 15:56:09 -0700
Subject: [NetLabel]: rework the Netlink attribute handling (part 1)

At the suggestion of Thomas Graf, rewrite NetLabel's use of Netlink attributes
to better follow the common Netlink attribute usage.

Signed-off-by: Paul Moore <paul.moore@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/cipso_ipv4.h           |  16 ++-
 include/net/netlabel.h             |  49 +--------
 net/ipv4/cipso_ipv4.c              | 203 +++++--------------------------------
 net/netlabel/netlabel_domainhash.c | 183 +++++++--------------------------
 net/netlabel/netlabel_domainhash.h |   6 +-
 net/netlabel/netlabel_user.c       |  82 ---------------
 net/netlabel/netlabel_user.h       | 141 +-------------------------
 7 files changed, 76 insertions(+), 604 deletions(-)

diff --git a/include/net/cipso_ipv4.h b/include/net/cipso_ipv4.h
index 6718452a5cd..2d72496c202 100644
--- a/include/net/cipso_ipv4.h
+++ b/include/net/cipso_ipv4.h
@@ -130,8 +130,9 @@ extern int cipso_v4_rbm_strictvalid;
 int cipso_v4_doi_add(struct cipso_v4_doi *doi_def);
 int cipso_v4_doi_remove(u32 doi, void (*callback) (struct rcu_head * head));
 struct cipso_v4_doi *cipso_v4_doi_getdef(u32 doi);
-struct sk_buff *cipso_v4_doi_dump_all(size_t headroom);
-struct sk_buff *cipso_v4_doi_dump(u32 doi, size_t headroom);
+int cipso_v4_doi_walk(u32 *skip_cnt,
+		     int (*callback) (struct cipso_v4_doi *doi_def, void *arg),
+	             void *cb_arg);
 int cipso_v4_doi_domhsh_add(struct cipso_v4_doi *doi_def, const char *domain);
 int cipso_v4_doi_domhsh_remove(struct cipso_v4_doi *doi_def,
 			       const char *domain);
@@ -152,14 +153,11 @@ static inline struct cipso_v4_doi *cipso_v4_doi_getdef(u32 doi)
 	return NULL;
 }
 
-static inline struct sk_buff *cipso_v4_doi_dump_all(size_t headroom)
+static inline int cipso_v4_doi_walk(u32 *skip_cnt,
+		     int (*callback) (struct cipso_v4_doi *doi_def, void *arg),
+		     void *cb_arg)
 {
-	return NULL;
-}
-
-static inline struct sk_buff *cipso_v4_doi_dump(u32 doi, size_t headroom)
-{
-	return NULL;
+	return 0;
 }
 
 static inline int cipso_v4_doi_domhsh_add(struct cipso_v4_doi *doi_def,
diff --git a/include/net/netlabel.h b/include/net/netlabel.h
index bf7b564e354..6692430063f 100644
--- a/include/net/netlabel.h
+++ b/include/net/netlabel.h
@@ -57,9 +57,8 @@
  * The payload is dependent on the subsystem specified in the
  * 'nlmsghdr->nlmsg_type' and should be defined below, supporting functions
  * should be defined in the corresponding net/netlabel/netlabel_<subsys>.h|c
- * file.  All of the fields in the NetLabel payload are NETLINK attributes, the
- * length of each field is the length of the NETLINK attribute payload, see
- * include/net/netlink.h for more information on NETLINK attributes.
+ * file.  All of the fields in the NetLabel payload are NETLINK attributes, see
+ * the include/net/netlink.h file for more information on NETLINK attributes.
  *
  */
 
@@ -82,50 +81,6 @@
 #define NETLBL_NLTYPE_UNLABELED         5
 #define NETLBL_NLTYPE_UNLABELED_NAME    "NLBL_UNLBL"
 
-/* NetLabel return codes */
-#define NETLBL_E_OK                     0
-
-/*
- * Helper functions
- */
-
-#define NETLBL_LEN_U8                   nla_total_size(sizeof(u8))
-#define NETLBL_LEN_U16                  nla_total_size(sizeof(u16))
-#define NETLBL_LEN_U32                  nla_total_size(sizeof(u32))
-
-/**
- * netlbl_netlink_alloc_skb - Allocate a NETLINK message buffer
- * @head: the amount of headroom in bytes
- * @body: the desired size (minus headroom) in bytes
- * @gfp_flags: the alloc flags to pass to alloc_skb()
- *
- * Description:
- * Allocate a NETLINK message buffer based on the sizes given in @head and
- * @body.  If @head is greater than zero skb_reserve() is called to reserve
- * @head bytes at the start of the buffer.  Returns a valid sk_buff pointer on
- * success, NULL on failure.
- *
- */
-static inline struct sk_buff *netlbl_netlink_alloc_skb(size_t head,
-						       size_t body,
-						       gfp_t gfp_flags)
-{
-	struct sk_buff *skb;
-
-	skb = alloc_skb(NLMSG_ALIGN(head + body), gfp_flags);
-	if (skb == NULL)
-		return NULL;
-	if (head > 0) {
-		skb_reserve(skb, head);
-		if (skb_tailroom(skb) < body) {
-			kfree_skb(skb);
-			return NULL;
-		}
-	}
-
-	return skb;
-}
-
 /*
  * NetLabel - Kernel API for accessing the network packet label mappings.
  *
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 87e71563335..e6ce0b3ba62 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -530,197 +530,42 @@ struct cipso_v4_doi *cipso_v4_doi_getdef(u32 doi)
 }
 
 /**
- * cipso_v4_doi_dump_all - Dump all the CIPSO DOI definitions into a sk_buff
- * @headroom: the amount of headroom to allocate for the sk_buff
+ * cipso_v4_doi_walk - Iterate through the DOI definitions
+ * @skip_cnt: skip past this number of DOI definitions, updated
+ * @callback: callback for each DOI definition
+ * @cb_arg: argument for the callback function
  *
  * Description:
- * Dump a list of all the configured DOI values into a sk_buff.  The returned
- * sk_buff has room at the front of the sk_buff for @headroom bytes.  See
- * net/netlabel/netlabel_cipso_v4.h for the LISTALL message format.  This
- * function may fail if another process is changing the DOI list at the same
- * time.  Returns a pointer to a sk_buff on success, NULL on error.
+ * Iterate over the DOI definition list, skipping the first @skip_cnt entries.
+ * For each entry call @callback, if @callback returns a negative value stop
+ * 'walking' through the list and return.  Updates the value in @skip_cnt upon
+ * return.  Returns zero on success, negative values on failure.
  *
  */
-struct sk_buff *cipso_v4_doi_dump_all(size_t headroom)
+int cipso_v4_doi_walk(u32 *skip_cnt,
+		     int (*callback) (struct cipso_v4_doi *doi_def, void *arg),
+		     void *cb_arg)
 {
-	struct sk_buff *skb = NULL;
-	struct cipso_v4_doi *iter;
+	int ret_val = -ENOENT;
 	u32 doi_cnt = 0;
-	ssize_t buf_len;
-
-	buf_len = NETLBL_LEN_U32;
-	rcu_read_lock();
-	list_for_each_entry_rcu(iter, &cipso_v4_doi_list, list)
-		if (iter->valid) {
-			doi_cnt += 1;
-			buf_len += 2 * NETLBL_LEN_U32;
-		}
-
-	skb = netlbl_netlink_alloc_skb(headroom, buf_len, GFP_ATOMIC);
-	if (skb == NULL)
-		goto doi_dump_all_failure;
-
-	if (nla_put_u32(skb, NLA_U32, doi_cnt) != 0)
-		goto doi_dump_all_failure;
-	buf_len -= NETLBL_LEN_U32;
-	list_for_each_entry_rcu(iter, &cipso_v4_doi_list, list)
-		if (iter->valid) {
-			if (buf_len < 2 * NETLBL_LEN_U32)
-				goto doi_dump_all_failure;
-			if (nla_put_u32(skb, NLA_U32, iter->doi) != 0)
-				goto doi_dump_all_failure;
-			if (nla_put_u32(skb, NLA_U32, iter->type) != 0)
-				goto doi_dump_all_failure;
-			buf_len -= 2 * NETLBL_LEN_U32;
-		}
-	rcu_read_unlock();
-
-	return skb;
-
-doi_dump_all_failure:
-	rcu_read_unlock();
-	kfree(skb);
-	return NULL;
-}
-
-/**
- * cipso_v4_doi_dump - Dump a CIPSO DOI definition into a sk_buff
- * @doi: the DOI value
- * @headroom: the amount of headroom to allocate for the sk_buff
- *
- * Description:
- * Lookup the DOI definition matching @doi and dump it's contents into a
- * sk_buff.  The returned sk_buff has room at the front of the sk_buff for
- * @headroom bytes.  See net/netlabel/netlabel_cipso_v4.h for the LIST message
- * format.  This function may fail if another process is changing the DOI list
- * at the same time.  Returns a pointer to a sk_buff on success, NULL on error.
- *
- */
-struct sk_buff *cipso_v4_doi_dump(u32 doi, size_t headroom)
-{
-	struct sk_buff *skb = NULL;
-	struct cipso_v4_doi *iter;
-	u32 tag_cnt = 0;
-	u32 lvl_cnt = 0;
-	u32 cat_cnt = 0;
-	ssize_t buf_len;
-	ssize_t tmp;
+	struct cipso_v4_doi *iter_doi;
 
 	rcu_read_lock();
-	iter = cipso_v4_doi_getdef(doi);
-	if (iter == NULL)
-		goto doi_dump_failure;
-	buf_len = NETLBL_LEN_U32;
-	switch (iter->type) {
-	case CIPSO_V4_MAP_PASS:
-		buf_len += NETLBL_LEN_U32;
-		while(tag_cnt < CIPSO_V4_TAG_MAXCNT &&
-		      iter->tags[tag_cnt] != CIPSO_V4_TAG_INVALID) {
-			tag_cnt += 1;
-			buf_len += NETLBL_LEN_U8;
-		}
-		break;
-	case CIPSO_V4_MAP_STD:
-		buf_len += 3 * NETLBL_LEN_U32;
-		while (tag_cnt < CIPSO_V4_TAG_MAXCNT &&
-		       iter->tags[tag_cnt] != CIPSO_V4_TAG_INVALID) {
-			tag_cnt += 1;
-			buf_len += NETLBL_LEN_U8;
-		}
-		for (tmp = 0; tmp < iter->map.std->lvl.local_size; tmp++)
-			if (iter->map.std->lvl.local[tmp] !=
-			    CIPSO_V4_INV_LVL) {
-				lvl_cnt += 1;
-				buf_len += NETLBL_LEN_U32 + NETLBL_LEN_U8;
+	list_for_each_entry_rcu(iter_doi, &cipso_v4_doi_list, list)
+		if (iter_doi->valid) {
+			if (doi_cnt++ < *skip_cnt)
+				continue;
+			ret_val = callback(iter_doi, cb_arg);
+			if (ret_val < 0) {
+				doi_cnt--;
+				goto doi_walk_return;
 			}
-		for (tmp = 0; tmp < iter->map.std->cat.local_size; tmp++)
-			if (iter->map.std->cat.local[tmp] !=
-			    CIPSO_V4_INV_CAT) {
-				cat_cnt += 1;
-				buf_len += NETLBL_LEN_U32 + NETLBL_LEN_U16;
-			}
-		break;
-	}
-
-	skb = netlbl_netlink_alloc_skb(headroom, buf_len, GFP_ATOMIC);
-	if (skb == NULL)
-		goto doi_dump_failure;
-
-	if (nla_put_u32(skb, NLA_U32, iter->type) != 0)
-		goto doi_dump_failure;
-	buf_len -= NETLBL_LEN_U32;
-	if (iter != cipso_v4_doi_getdef(doi))
-		goto doi_dump_failure;
-	switch (iter->type) {
-	case CIPSO_V4_MAP_PASS:
-		if (nla_put_u32(skb, NLA_U32, tag_cnt) != 0)
-			goto doi_dump_failure;
-		buf_len -= NETLBL_LEN_U32;
-		for (tmp = 0;
-		     tmp < CIPSO_V4_TAG_MAXCNT &&
-			     iter->tags[tmp] != CIPSO_V4_TAG_INVALID;
-		     tmp++) {
-			if (buf_len < NETLBL_LEN_U8)
-				goto doi_dump_failure;
-			if (nla_put_u8(skb, NLA_U8, iter->tags[tmp]) != 0)
-				goto doi_dump_failure;
-			buf_len -= NETLBL_LEN_U8;
 		}
-		break;
-	case CIPSO_V4_MAP_STD:
-		if (nla_put_u32(skb, NLA_U32, tag_cnt) != 0)
-			goto doi_dump_failure;
-		if (nla_put_u32(skb, NLA_U32, lvl_cnt) != 0)
-			goto doi_dump_failure;
-		if (nla_put_u32(skb, NLA_U32, cat_cnt) != 0)
-			goto doi_dump_failure;
-		buf_len -= 3 * NETLBL_LEN_U32;
-		for (tmp = 0;
-		     tmp < CIPSO_V4_TAG_MAXCNT &&
-			     iter->tags[tmp] != CIPSO_V4_TAG_INVALID;
-		     tmp++) {
-			if (buf_len < NETLBL_LEN_U8)
-				goto doi_dump_failure;
-			if (nla_put_u8(skb, NLA_U8, iter->tags[tmp]) != 0)
-				goto doi_dump_failure;
-			buf_len -= NETLBL_LEN_U8;
-		}
-		for (tmp = 0; tmp < iter->map.std->lvl.local_size; tmp++)
-			if (iter->map.std->lvl.local[tmp] !=
-			    CIPSO_V4_INV_LVL) {
-				if (buf_len < NETLBL_LEN_U32 + NETLBL_LEN_U8)
-					goto doi_dump_failure;
-				if (nla_put_u32(skb, NLA_U32, tmp) != 0)
-					goto doi_dump_failure;
-				if (nla_put_u8(skb,
-					   NLA_U8,
-					   iter->map.std->lvl.local[tmp]) != 0)
-					goto doi_dump_failure;
-				buf_len -= NETLBL_LEN_U32 + NETLBL_LEN_U8;
-			}
-		for (tmp = 0; tmp < iter->map.std->cat.local_size; tmp++)
-			if (iter->map.std->cat.local[tmp] !=
-			    CIPSO_V4_INV_CAT) {
-				if (buf_len < NETLBL_LEN_U32 + NETLBL_LEN_U16)
-					goto doi_dump_failure;
-				if (nla_put_u32(skb, NLA_U32, tmp) != 0)
-					goto doi_dump_failure;
-				if (nla_put_u16(skb,
-					   NLA_U16,
-					   iter->map.std->cat.local[tmp]) != 0)
-					goto doi_dump_failure;
-				buf_len -= NETLBL_LEN_U32 + NETLBL_LEN_U16;
-			}
-		break;
-	}
-	rcu_read_unlock();
-
-	return skb;
 
-doi_dump_failure:
+doi_walk_return:
 	rcu_read_unlock();
-	kfree(skb);
-	return NULL;
+	*skip_cnt = doi_cnt;
+	return ret_val;
 }
 
 /**
diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c
index 0489a137810..f56d7a8ac7b 100644
--- a/net/netlabel/netlabel_domainhash.c
+++ b/net/netlabel/netlabel_domainhash.c
@@ -354,160 +354,51 @@ struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain)
 }
 
 /**
- * netlbl_domhsh_dump - Dump the domain hash table into a sk_buff
+ * netlbl_domhsh_walk - Iterate through the domain mapping hash table
+ * @skip_bkt: the number of buckets to skip at the start
+ * @skip_chain: the number of entries to skip in the first iterated bucket
+ * @callback: callback for each entry
+ * @cb_arg: argument for the callback function
  *
  * Description:
- * Dump the domain hash table into a buffer suitable for returning to an
- * application in response to a NetLabel management DOMAIN message.  This
- * function may fail if another process is growing the hash table at the same
- * time.  The returned sk_buff has room at the front of the sk_buff for
- * @headroom bytes.  See netlabel.h for the DOMAIN message format.  Returns a
- * pointer to a sk_buff on success, NULL on error.
+ * Interate over the domain mapping hash table, skipping the first @skip_bkt
+ * buckets and @skip_chain entries.  For each entry in the table call
+ * @callback, if @callback returns a negative value stop 'walking' through the
+ * table and return.  Updates the values in @skip_bkt and @skip_chain on
+ * return.  Returns zero on succcess, negative values on failure.
  *
  */
-struct sk_buff *netlbl_domhsh_dump(size_t headroom)
+int netlbl_domhsh_walk(u32 *skip_bkt,
+		     u32 *skip_chain,
+		     int (*callback) (struct netlbl_dom_map *entry, void *arg),
+		     void *cb_arg)
 {
-	struct sk_buff *skb = NULL;
-	ssize_t buf_len;
-	u32 bkt_iter;
-	u32 dom_cnt = 0;
-	struct netlbl_domhsh_tbl *hsh_tbl;
-	struct netlbl_dom_map *list_iter;
-	ssize_t tmp_len;
+	int ret_val = -ENOENT;
+	u32 iter_bkt;
+	struct netlbl_dom_map *iter_entry;
+	u32 chain_cnt = 0;
 
-	buf_len = NETLBL_LEN_U32;
 	rcu_read_lock();
-	hsh_tbl = rcu_dereference(netlbl_domhsh);
-	for (bkt_iter = 0; bkt_iter < hsh_tbl->size; bkt_iter++)
-		list_for_each_entry_rcu(list_iter,
-					&hsh_tbl->tbl[bkt_iter], list) {
-			buf_len += NETLBL_LEN_U32 +
-				nla_total_size(strlen(list_iter->domain) + 1);
-			switch (list_iter->type) {
-			case NETLBL_NLTYPE_UNLABELED:
-				break;
-			case NETLBL_NLTYPE_CIPSOV4:
-				buf_len += 2 * NETLBL_LEN_U32;
-				break;
-			}
-			dom_cnt++;
-		}
-
-	skb = netlbl_netlink_alloc_skb(headroom, buf_len, GFP_ATOMIC);
-	if (skb == NULL)
-		goto dump_failure;
-
-	if (nla_put_u32(skb, NLA_U32, dom_cnt) != 0)
-		goto dump_failure;
-	buf_len -= NETLBL_LEN_U32;
-	hsh_tbl = rcu_dereference(netlbl_domhsh);
-	for (bkt_iter = 0; bkt_iter < hsh_tbl->size; bkt_iter++)
-		list_for_each_entry_rcu(list_iter,
-					&hsh_tbl->tbl[bkt_iter], list) {
-			tmp_len = nla_total_size(strlen(list_iter->domain) +
-						 1);
-			if (buf_len < NETLBL_LEN_U32 + tmp_len)
-				goto dump_failure;
-			if (nla_put_string(skb,
-					   NLA_STRING,
-					   list_iter->domain) != 0)
-				goto dump_failure;
-			if (nla_put_u32(skb, NLA_U32, list_iter->type) != 0)
-				goto dump_failure;
-			buf_len -= NETLBL_LEN_U32 + tmp_len;
-			switch (list_iter->type) {
-			case NETLBL_NLTYPE_UNLABELED:
-				break;
-			case NETLBL_NLTYPE_CIPSOV4:
-				if (buf_len < 2 * NETLBL_LEN_U32)
-					goto dump_failure;
-				if (nla_put_u32(skb,
-				       NLA_U32,
-				       list_iter->type_def.cipsov4->type) != 0)
-					goto dump_failure;
-				if (nla_put_u32(skb,
-				       NLA_U32,
-				       list_iter->type_def.cipsov4->doi) != 0)
-					goto dump_failure;
-				buf_len -= 2 * NETLBL_LEN_U32;
-				break;
+	for (iter_bkt = *skip_bkt;
+	     iter_bkt < rcu_dereference(netlbl_domhsh)->size;
+	     iter_bkt++, chain_cnt = 0) {
+		list_for_each_entry_rcu(iter_entry,
+					&netlbl_domhsh->tbl[iter_bkt],
+					list)
+			if (iter_entry->valid) {
+				if (chain_cnt++ < *skip_chain)
+					continue;
+				ret_val = callback(iter_entry, cb_arg);
+				if (ret_val < 0) {
+					chain_cnt--;
+					goto walk_return;
+				}
 			}
-		}
-	rcu_read_unlock();
-
-	return skb;
-
-dump_failure:
-	rcu_read_unlock();
-	kfree_skb(skb);
-	return NULL;
-}
-
-/**
- * netlbl_domhsh_dump_default - Dump the default domain mapping into a sk_buff
- *
- * Description:
- * Dump the default domain mapping into a buffer suitable for returning to an
- * application in response to a NetLabel management DEFDOMAIN message.  This
- * function may fail if another process is changing the default domain mapping
- * at the same time.  The returned sk_buff has room at the front of the
- * skb_buff for @headroom bytes.  See netlabel.h for the DEFDOMAIN message
- * format.  Returns a pointer to a sk_buff on success, NULL on error.
- *
- */
-struct sk_buff *netlbl_domhsh_dump_default(size_t headroom)
-{
-	struct sk_buff *skb;
-	ssize_t buf_len;
-	struct netlbl_dom_map *entry;
-
-	buf_len = NETLBL_LEN_U32;
-	rcu_read_lock();
-	entry = rcu_dereference(netlbl_domhsh_def);
-	if (entry != NULL)
-		switch (entry->type) {
-		case NETLBL_NLTYPE_UNLABELED:
-			break;
-		case NETLBL_NLTYPE_CIPSOV4:
-			buf_len += 2 * NETLBL_LEN_U32;
-			break;
-		}
-
-	skb = netlbl_netlink_alloc_skb(headroom, buf_len, GFP_ATOMIC);
-	if (skb == NULL)
-		goto dump_default_failure;
-
-	if (entry != rcu_dereference(netlbl_domhsh_def))
-		goto dump_default_failure;
-	if (entry != NULL) {
-		if (nla_put_u32(skb, NLA_U32, entry->type) != 0)
-			goto dump_default_failure;
-		buf_len -= NETLBL_LEN_U32;
-		switch (entry->type) {
-		case NETLBL_NLTYPE_UNLABELED:
-			break;
-		case NETLBL_NLTYPE_CIPSOV4:
-			if (buf_len < 2 * NETLBL_LEN_U32)
-				goto dump_default_failure;
-			if (nla_put_u32(skb,
-					NLA_U32,
-					entry->type_def.cipsov4->type) != 0)
-				goto dump_default_failure;
-			if (nla_put_u32(skb,
-					NLA_U32,
-					entry->type_def.cipsov4->doi) != 0)
-				goto dump_default_failure;
-			buf_len -= 2 * NETLBL_LEN_U32;
-			break;
-		}
-	} else
-		nla_put_u32(skb, NLA_U32, NETLBL_NLTYPE_NONE);
-	rcu_read_unlock();
-
-	return skb;
+	}
 
-dump_default_failure:
+walk_return:
 	rcu_read_unlock();
-	kfree_skb(skb);
-	return NULL;
+	*skip_bkt = iter_bkt;
+	*skip_chain = chain_cnt;
+	return ret_val;
 }
diff --git a/net/netlabel/netlabel_domainhash.h b/net/netlabel/netlabel_domainhash.h
index 99a2287de24..02af72a7877 100644
--- a/net/netlabel/netlabel_domainhash.h
+++ b/net/netlabel/netlabel_domainhash.h
@@ -61,7 +61,9 @@ int netlbl_domhsh_add(struct netlbl_dom_map *entry);
 int netlbl_domhsh_add_default(struct netlbl_dom_map *entry);
 int netlbl_domhsh_remove_default(void);
 struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain);
-struct sk_buff *netlbl_domhsh_dump(size_t headroom);
-struct sk_buff *netlbl_domhsh_dump_default(size_t headroom);
+int netlbl_domhsh_walk(u32 *skip_bkt,
+		     u32 *skip_chain,
+		     int (*callback) (struct netlbl_dom_map *entry, void *arg),
+		     void *cb_arg);
 
 #endif
diff --git a/net/netlabel/netlabel_user.c b/net/netlabel/netlabel_user.c
index 73cbe66e42f..eeb7d768d2b 100644
--- a/net/netlabel/netlabel_user.c
+++ b/net/netlabel/netlabel_user.c
@@ -74,85 +74,3 @@ int netlbl_netlink_init(void)
 
 	return 0;
 }
-
-/*
- * NetLabel Common Protocol Functions
- */
-
-/**
- * netlbl_netlink_send_ack - Send an ACK message
- * @info: the generic NETLINK information
- * @genl_family: the generic NETLINK family ID value
- * @ack_cmd: the generic NETLINK family ACK command value
- * @ret_code: return code to use
- *
- * Description:
- * This function sends an ACK message to the sender of the NETLINK message
- * specified by @info.
- *
- */
-void netlbl_netlink_send_ack(const struct genl_info *info,
-			     u32 genl_family,
-			     u8 ack_cmd,
-			     u32 ret_code)
-{
-	size_t data_size;
-	struct sk_buff *skb;
-
-	data_size = GENL_HDRLEN + 2 * NETLBL_LEN_U32;
-	skb = netlbl_netlink_alloc_skb(0, data_size, GFP_KERNEL);
-	if (skb == NULL)
-		return;
-
-	if (netlbl_netlink_hdr_put(skb,
-				   info->snd_pid,
-				   0,
-				   genl_family,
-				   ack_cmd) == NULL)
-		goto send_ack_failure;
-
-	if (nla_put_u32(skb, NLA_U32, info->snd_seq) != 0)
-		goto send_ack_failure;
-	if (nla_put_u32(skb, NLA_U32, ret_code) != 0)
-		goto send_ack_failure;
-
-	netlbl_netlink_snd(skb, info->snd_pid);
-	return;
-
-send_ack_failure:
-	kfree_skb(skb);
-}
-
-/*
- * NETLINK I/O Functions
- */
-
-/**
- * netlbl_netlink_snd - Send a NetLabel message
- * @skb: NetLabel message
- * @pid: destination PID
- *
- * Description:
- * Sends a unicast NetLabel message over the NETLINK socket.
- *
- */
-int netlbl_netlink_snd(struct sk_buff *skb, u32 pid)
-{
-	return genlmsg_unicast(skb, pid);
-}
-
-/**
- * netlbl_netlink_snd - Send a NetLabel message
- * @skb: NetLabel message
- * @pid: sending PID
- * @group: multicast group id
- *
- * Description:
- * Sends a multicast NetLabel message over the NETLINK socket to all members
- * of @group except @pid.
- *
- */
-int netlbl_netlink_snd_multicast(struct sk_buff *skb, u32 pid, u32 group)
-{
-	return genlmsg_multicast(skb, pid, group, GFP_KERNEL);
-}
diff --git a/net/netlabel/netlabel_user.h b/net/netlabel/netlabel_user.h
index 385a6c7488c..3f9386b917d 100644
--- a/net/netlabel/netlabel_user.h
+++ b/net/netlabel/netlabel_user.h
@@ -40,72 +40,6 @@
 
 /* NetLabel NETLINK helper functions */
 
-/**
- * netlbl_netlink_cap_check - Check the NETLINK msg capabilities
- * @skb: the NETLINK buffer
- * @req_cap: the required capability
- *
- * Description:
- * Check the NETLINK buffer's capabilities against the required capabilities.
- * Returns zero on success, negative values on failure.
- *
- */
-static inline int netlbl_netlink_cap_check(const struct sk_buff *skb,
-					   kernel_cap_t req_cap)
-{
-	if (cap_raised(NETLINK_CB(skb).eff_cap, req_cap))
-		return 0;
-	return -EPERM;
-}
-
-/**
- * netlbl_getinc_u8 - Read a u8 value from a nlattr stream and move on
- * @nla: the attribute
- * @rem_len: remaining length
- *
- * Description:
- * Return a u8 value pointed to by @nla and advance it to the next attribute.
- *
- */
-static inline u8 netlbl_getinc_u8(struct nlattr **nla, int *rem_len)
-{
-	u8 val = nla_get_u8(*nla);
-	*nla = nla_next(*nla, rem_len);
-	return val;
-}
-
-/**
- * netlbl_getinc_u16 - Read a u16 value from a nlattr stream and move on
- * @nla: the attribute
- * @rem_len: remaining length
- *
- * Description:
- * Return a u16 value pointed to by @nla and advance it to the next attribute.
- *
- */
-static inline u16 netlbl_getinc_u16(struct nlattr **nla, int *rem_len)
-{
-	u16 val = nla_get_u16(*nla);
-	*nla = nla_next(*nla, rem_len);
-	return val;
-}
-
-/**
- * netlbl_getinc_u32 - Read a u32 value from a nlattr stream and move on
- * @nla: the attribute
- * @rem_len: remaining length
- *
- * Description:
- * Return a u32 value pointed to by @nla and advance it to the next attribute.
- *
- */
-static inline u32 netlbl_getinc_u32(struct nlattr **nla, int *rem_len)
-{
-	u32 val = nla_get_u32(*nla);
-	*nla = nla_next(*nla, rem_len);
-	return val;
-}
-
 /**
  * netlbl_netlink_hdr_put - Write the NETLINK buffers into a sk_buff
  * @skb: the packet
@@ -124,6 +58,7 @@ static inline void *netlbl_netlink_hdr_put(struct sk_buff *skb,
 					   u32 pid,
 					   u32 seq,
 					   int type,
+					   int flags,
 					   u8 cmd)
 {
 	return genlmsg_put(skb,
@@ -131,85 +66,13 @@ static inline void *netlbl_netlink_hdr_put(struct sk_buff *skb,
 			   seq,
 			   type,
 			   0,
-			   0,
+			   flags,
 			   cmd,
 			   NETLBL_PROTO_VERSION);
 }
 
-/**
- * netlbl_netlink_hdr_push - Write the NETLINK buffers into a sk_buff
- * @skb: the packet
- * @pid: the PID of the receipient
- * @seq: the sequence number
- * @type: the generic NETLINK message family type
- * @cmd: command
- *
- * Description:
- * Write both a NETLINK nlmsghdr structure and a Generic NETLINK genlmsghdr
- * struct to the packet.
- *
- */
-static inline void netlbl_netlink_hdr_push(struct sk_buff *skb,
-					   u32 pid,
-					   u32 seq,
-					   int type,
-					   u8 cmd)
-
-{
-	struct nlmsghdr *nlh;
-	struct genlmsghdr *hdr;
-
-	nlh = (struct nlmsghdr *)skb_push(skb, NLMSG_SPACE(GENL_HDRLEN));
-	nlh->nlmsg_type = type;
-	nlh->nlmsg_len = skb->len;
-	nlh->nlmsg_flags = 0;
-	nlh->nlmsg_pid = pid;
-	nlh->nlmsg_seq = seq;
-
-	hdr = nlmsg_data(nlh);
-	hdr->cmd = cmd;
-	hdr->version = NETLBL_PROTO_VERSION;
-	hdr->reserved = 0;
-}
-
-/**
- * netlbl_netlink_payload_len - Return the length of the payload
- * @skb: the NETLINK buffer
- *
- * Description:
- * This function returns the length of the NetLabel payload.
- *
- */
-static inline u32 netlbl_netlink_payload_len(const struct sk_buff *skb)
-{
-	return nlmsg_len((struct nlmsghdr *)skb->data) - GENL_HDRLEN;
-}
-
-/**
- * netlbl_netlink_payload_data - Returns a pointer to the start of the payload
- * @skb: the NETLINK buffer
- *
- * Description:
- * This function returns a pointer to the start of the NetLabel payload.
- *
- */
-static inline void *netlbl_netlink_payload_data(const struct sk_buff *skb)
-{
-  return (unsigned char *)nlmsg_data((struct nlmsghdr *)skb->data) +
-	  GENL_HDRLEN;
-}
-
-/* NetLabel common protocol functions */
-
-void netlbl_netlink_send_ack(const struct genl_info *info,
-			     u32 genl_family,
-			     u8 ack_cmd,
-			     u32 ret_code);
-
 /* NetLabel NETLINK I/O functions */
 
 int netlbl_netlink_init(void);
-int netlbl_netlink_snd(struct sk_buff *skb, u32 pid);
-int netlbl_netlink_snd_multicast(struct sk_buff *skb, u32 pid, u32 group);
 
 #endif
-- 
cgit v1.2.3


From fd3858554b62c3af6b7664b5c58ad864c87116c9 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul.moore@hp.com>
Date: Mon, 25 Sep 2006 15:56:37 -0700
Subject: [NetLabel]: rework the Netlink attribute handling (part 2)

At the suggestion of Thomas Graf, rewrite NetLabel's use of Netlink attributes
to better follow the common Netlink attribute usage.

Signed-off-by: Paul Moore <paul.moore@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netlabel/netlabel_cipso_v4.c  | 628 +++++++++++++++++++++++++-------------
 net/netlabel/netlabel_cipso_v4.h  | 225 ++++++--------
 net/netlabel/netlabel_mgmt.c      | 541 ++++++++++++++++----------------
 net/netlabel/netlabel_mgmt.h      | 211 +++++--------
 net/netlabel/netlabel_unlabeled.c |  79 +++--
 net/netlabel/netlabel_unlabeled.h |  41 +--
 6 files changed, 895 insertions(+), 830 deletions(-)

diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c
index a4f40adc447..4125a55f469 100644
--- a/net/netlabel/netlabel_cipso_v4.c
+++ b/net/netlabel/netlabel_cipso_v4.c
@@ -41,15 +41,37 @@
 #include "netlabel_user.h"
 #include "netlabel_cipso_v4.h"
 
+/* Argument struct for cipso_v4_doi_walk() */
+struct netlbl_cipsov4_doiwalk_arg {
+	struct netlink_callback *nl_cb;
+	struct sk_buff *skb;
+	u32 seq;
+};
+
 /* NetLabel Generic NETLINK CIPSOv4 family */
 static struct genl_family netlbl_cipsov4_gnl_family = {
 	.id = GENL_ID_GENERATE,
 	.hdrsize = 0,
 	.name = NETLBL_NLTYPE_CIPSOV4_NAME,
 	.version = NETLBL_PROTO_VERSION,
-	.maxattr = 0,
+	.maxattr = NLBL_CIPSOV4_A_MAX,
 };
 
+/* NetLabel Netlink attribute policy */
+static struct nla_policy netlbl_cipsov4_genl_policy[NLBL_CIPSOV4_A_MAX + 1] = {
+	[NLBL_CIPSOV4_A_DOI] = { .type = NLA_U32 },
+	[NLBL_CIPSOV4_A_MTYPE] = { .type = NLA_U32 },
+	[NLBL_CIPSOV4_A_TAG] = { .type = NLA_U8 },
+	[NLBL_CIPSOV4_A_TAGLST] = { .type = NLA_NESTED },
+	[NLBL_CIPSOV4_A_MLSLVLLOC] = { .type = NLA_U32 },
+	[NLBL_CIPSOV4_A_MLSLVLREM] = { .type = NLA_U32 },
+	[NLBL_CIPSOV4_A_MLSLVL] = { .type = NLA_NESTED },
+	[NLBL_CIPSOV4_A_MLSLVLLST] = { .type = NLA_NESTED },
+	[NLBL_CIPSOV4_A_MLSCATLOC] = { .type = NLA_U32 },
+	[NLBL_CIPSOV4_A_MLSCATREM] = { .type = NLA_U32 },
+	[NLBL_CIPSOV4_A_MLSCAT] = { .type = NLA_NESTED },
+	[NLBL_CIPSOV4_A_MLSCATLST] = { .type = NLA_NESTED },
+};
 
 /*
  * Helper Functions
@@ -81,6 +103,41 @@ static void netlbl_cipsov4_doi_free(struct rcu_head *entry)
 	kfree(ptr);
 }
 
+/**
+ * netlbl_cipsov4_add_common - Parse the common sections of a ADD message
+ * @info: the Generic NETLINK info block
+ * @doi_def: the CIPSO V4 DOI definition
+ *
+ * Description:
+ * Parse the common sections of a ADD message and fill in the related values
+ * in @doi_def.  Returns zero on success, negative values on failure.
+ *
+ */
+static int netlbl_cipsov4_add_common(struct genl_info *info,
+				     struct cipso_v4_doi *doi_def)
+{
+	struct nlattr *nla;
+	int nla_rem;
+	u32 iter = 0;
+
+	doi_def->doi = nla_get_u32(info->attrs[NLBL_CIPSOV4_A_DOI]);
+
+	if (nla_validate_nested(info->attrs[NLBL_CIPSOV4_A_TAGLST],
+				NLBL_CIPSOV4_A_MAX,
+				netlbl_cipsov4_genl_policy) != 0)
+		return -EINVAL;
+
+	nla_for_each_nested(nla, info->attrs[NLBL_CIPSOV4_A_TAGLST], nla_rem)
+		if (nla->nla_type == NLBL_CIPSOV4_A_TAG) {
+			if (iter > CIPSO_V4_TAG_MAXCNT)
+				return -EINVAL;
+			doi_def->tags[iter++] = nla_get_u8(nla);
+		}
+	if (iter < CIPSO_V4_TAG_MAXCNT)
+		doi_def->tags[iter] = CIPSO_V4_TAG_INVALID;
+
+	return 0;
+}
 
 /*
  * NetLabel Command Handlers
@@ -88,9 +145,7 @@ static void netlbl_cipsov4_doi_free(struct rcu_head *entry)
 
 /**
  * netlbl_cipsov4_add_std - Adds a CIPSO V4 DOI definition
- * @doi: the DOI value
- * @msg: the ADD message data
- * @msg_size: the size of the ADD message buffer
+ * @info: the Generic NETLINK info block
  *
  * Description:
  * Create a new CIPSO_V4_MAP_STD DOI definition based on the given ADD message
@@ -98,29 +153,28 @@ static void netlbl_cipsov4_doi_free(struct rcu_head *entry)
  * error.
  *
  */
-static int netlbl_cipsov4_add_std(u32 doi, struct nlattr *msg, size_t msg_size)
+static int netlbl_cipsov4_add_std(struct genl_info *info)
 {
 	int ret_val = -EINVAL;
-	int msg_len = msg_size;
-	u32 num_tags;
-	u32 num_lvls;
-	u32 num_cats;
 	struct cipso_v4_doi *doi_def = NULL;
-	u32 iter;
-	u32 tmp_val_a;
-	u32 tmp_val_b;
+	struct nlattr *nla_a;
+	struct nlattr *nla_b;
+	int nla_a_rem;
+	int nla_b_rem;
 
-	if (msg_len < NETLBL_LEN_U32)
-		goto add_std_failure;
-	num_tags = netlbl_getinc_u32(&msg, &msg_len);
-	if (num_tags == 0 || num_tags > CIPSO_V4_TAG_MAXCNT)
-		goto add_std_failure;
+	if (!info->attrs[NLBL_CIPSOV4_A_DOI] ||
+	    !info->attrs[NLBL_CIPSOV4_A_TAGLST] ||
+	    !info->attrs[NLBL_CIPSOV4_A_MLSLVLLST])
+		return -EINVAL;
+
+	if (nla_validate_nested(info->attrs[NLBL_CIPSOV4_A_MLSLVLLST],
+				NLBL_CIPSOV4_A_MAX,
+				netlbl_cipsov4_genl_policy) != 0)
+		return -EINVAL;
 
 	doi_def = kmalloc(sizeof(*doi_def), GFP_KERNEL);
-	if (doi_def == NULL) {
-		ret_val = -ENOMEM;
-		goto add_std_failure;
-	}
+	if (doi_def == NULL)
+		return -ENOMEM;
 	doi_def->map.std = kzalloc(sizeof(*doi_def->map.std), GFP_KERNEL);
 	if (doi_def->map.std == NULL) {
 		ret_val = -ENOMEM;
@@ -128,28 +182,32 @@ static int netlbl_cipsov4_add_std(u32 doi, struct nlattr *msg, size_t msg_size)
 	}
 	doi_def->type = CIPSO_V4_MAP_STD;
 
-	for (iter = 0; iter < num_tags; iter++) {
-		if (msg_len < NETLBL_LEN_U8)
-			goto add_std_failure;
-		doi_def->tags[iter] = netlbl_getinc_u8(&msg, &msg_len);
-		switch (doi_def->tags[iter]) {
-		case CIPSO_V4_TAG_RBITMAP:
-			break;
-		default:
-			goto add_std_failure;
-		}
-	}
-	if (iter < CIPSO_V4_TAG_MAXCNT)
-		doi_def->tags[iter] = CIPSO_V4_TAG_INVALID;
-
-	if (msg_len < 6 * NETLBL_LEN_U32)
+	ret_val = netlbl_cipsov4_add_common(info, doi_def);
+	if (ret_val != 0)
 		goto add_std_failure;
 
-	num_lvls = netlbl_getinc_u32(&msg, &msg_len);
-	if (num_lvls == 0)
-		goto add_std_failure;
-	doi_def->map.std->lvl.local_size = netlbl_getinc_u32(&msg, &msg_len);
-	if (doi_def->map.std->lvl.local_size > CIPSO_V4_MAX_LOC_LVLS)
+	nla_for_each_nested(nla_a,
+			    info->attrs[NLBL_CIPSOV4_A_MLSLVLLST],
+			    nla_a_rem)
+		if (nla_a->nla_type == NLBL_CIPSOV4_A_MLSLVL) {
+			nla_for_each_nested(nla_b, nla_a, nla_b_rem)
+				switch (nla_b->nla_type) {
+				case NLBL_CIPSOV4_A_MLSLVLLOC:
+					if (nla_get_u32(nla_b) >=
+					    doi_def->map.std->lvl.local_size)
+					     doi_def->map.std->lvl.local_size =
+						     nla_get_u32(nla_b) + 1;
+					break;
+				case NLBL_CIPSOV4_A_MLSLVLREM:
+					if (nla_get_u32(nla_b) >=
+					    doi_def->map.std->lvl.cipso_size)
+					     doi_def->map.std->lvl.cipso_size =
+						     nla_get_u32(nla_b) + 1;
+					break;
+				}
+		}
+	if (doi_def->map.std->lvl.local_size > CIPSO_V4_MAX_LOC_LVLS ||
+	    doi_def->map.std->lvl.cipso_size > CIPSO_V4_MAX_REM_LVLS)
 		goto add_std_failure;
 	doi_def->map.std->lvl.local = kcalloc(doi_def->map.std->lvl.local_size,
 					      sizeof(u32),
@@ -158,9 +216,6 @@ static int netlbl_cipsov4_add_std(u32 doi, struct nlattr *msg, size_t msg_size)
 		ret_val = -ENOMEM;
 		goto add_std_failure;
 	}
-	doi_def->map.std->lvl.cipso_size = netlbl_getinc_u8(&msg, &msg_len);
-	if (doi_def->map.std->lvl.cipso_size > CIPSO_V4_MAX_REM_LVLS)
-		goto add_std_failure;
 	doi_def->map.std->lvl.cipso = kcalloc(doi_def->map.std->lvl.cipso_size,
 					      sizeof(u32),
 					      GFP_KERNEL);
@@ -168,68 +223,101 @@ static int netlbl_cipsov4_add_std(u32 doi, struct nlattr *msg, size_t msg_size)
 		ret_val = -ENOMEM;
 		goto add_std_failure;
 	}
+	nla_for_each_nested(nla_a,
+			    info->attrs[NLBL_CIPSOV4_A_MLSLVLLST],
+			    nla_a_rem)
+		if (nla_a->nla_type == NLBL_CIPSOV4_A_MLSLVL) {
+			struct nlattr *lvl_loc;
+			struct nlattr *lvl_rem;
+
+			if (nla_validate_nested(nla_a,
+					      NLBL_CIPSOV4_A_MAX,
+					      netlbl_cipsov4_genl_policy) != 0)
+				goto add_std_failure;
+
+			lvl_loc = nla_find_nested(nla_a,
+						  NLBL_CIPSOV4_A_MLSLVLLOC);
+			lvl_rem = nla_find_nested(nla_a,
+						  NLBL_CIPSOV4_A_MLSLVLREM);
+			if (lvl_loc == NULL || lvl_rem == NULL)
+				goto add_std_failure;
+			doi_def->map.std->lvl.local[nla_get_u32(lvl_loc)] =
+				nla_get_u32(lvl_rem);
+			doi_def->map.std->lvl.cipso[nla_get_u32(lvl_rem)] =
+				nla_get_u32(lvl_loc);
+		}
 
-	num_cats = netlbl_getinc_u32(&msg, &msg_len);
-	doi_def->map.std->cat.local_size = netlbl_getinc_u32(&msg, &msg_len);
-	if (doi_def->map.std->cat.local_size > CIPSO_V4_MAX_LOC_CATS)
-		goto add_std_failure;
-	doi_def->map.std->cat.local = kcalloc(doi_def->map.std->cat.local_size,
+	if (info->attrs[NLBL_CIPSOV4_A_MLSCATLST]) {
+		if (nla_validate_nested(info->attrs[NLBL_CIPSOV4_A_MLSCATLST],
+					NLBL_CIPSOV4_A_MAX,
+					netlbl_cipsov4_genl_policy) != 0)
+			goto add_std_failure;
+
+		nla_for_each_nested(nla_a,
+				    info->attrs[NLBL_CIPSOV4_A_MLSCATLST],
+				    nla_a_rem)
+			if (nla_a->nla_type == NLBL_CIPSOV4_A_MLSCAT) {
+				if (nla_validate_nested(nla_a,
+					      NLBL_CIPSOV4_A_MAX,
+					      netlbl_cipsov4_genl_policy) != 0)
+					goto add_std_failure;
+				nla_for_each_nested(nla_b, nla_a, nla_b_rem)
+					switch (nla_b->nla_type) {
+					case NLBL_CIPSOV4_A_MLSCATLOC:
+						if (nla_get_u32(nla_b) >=
+					      doi_def->map.std->cat.local_size)
+					     doi_def->map.std->cat.local_size =
+						     nla_get_u32(nla_b) + 1;
+						break;
+					case NLBL_CIPSOV4_A_MLSCATREM:
+						if (nla_get_u32(nla_b) >=
+					      doi_def->map.std->cat.cipso_size)
+					     doi_def->map.std->cat.cipso_size =
+						     nla_get_u32(nla_b) + 1;
+						break;
+					}
+			}
+		if (doi_def->map.std->cat.local_size > CIPSO_V4_MAX_LOC_CATS ||
+		    doi_def->map.std->cat.cipso_size > CIPSO_V4_MAX_REM_CATS)
+			goto add_std_failure;
+		doi_def->map.std->cat.local = kcalloc(
+			                      doi_def->map.std->cat.local_size,
 					      sizeof(u32),
 					      GFP_KERNEL);
-	if (doi_def->map.std->cat.local == NULL) {
-		ret_val = -ENOMEM;
-		goto add_std_failure;
-	}
-	doi_def->map.std->cat.cipso_size = netlbl_getinc_u16(&msg, &msg_len);
-	if (doi_def->map.std->cat.cipso_size > CIPSO_V4_MAX_REM_CATS)
-		goto add_std_failure;
-	doi_def->map.std->cat.cipso = kcalloc(doi_def->map.std->cat.cipso_size,
+		if (doi_def->map.std->cat.local == NULL) {
+			ret_val = -ENOMEM;
+			goto add_std_failure;
+		}
+		doi_def->map.std->cat.cipso = kcalloc(
+			                      doi_def->map.std->cat.cipso_size,
 					      sizeof(u32),
 					      GFP_KERNEL);
-	if (doi_def->map.std->cat.cipso == NULL) {
-		ret_val = -ENOMEM;
-		goto add_std_failure;
-	}
-
-	if (msg_len <
-	    num_lvls * (NETLBL_LEN_U32 + NETLBL_LEN_U8) +
-	    num_cats * (NETLBL_LEN_U32 + NETLBL_LEN_U16))
-		goto add_std_failure;
-
-	for (iter = 0; iter < doi_def->map.std->lvl.cipso_size; iter++)
-		doi_def->map.std->lvl.cipso[iter] = CIPSO_V4_INV_LVL;
-	for (iter = 0; iter < doi_def->map.std->lvl.local_size; iter++)
-		doi_def->map.std->lvl.local[iter] = CIPSO_V4_INV_LVL;
-	for (iter = 0; iter < doi_def->map.std->cat.cipso_size; iter++)
-		doi_def->map.std->cat.cipso[iter] = CIPSO_V4_INV_CAT;
-	for (iter = 0; iter < doi_def->map.std->cat.local_size; iter++)
-		doi_def->map.std->cat.local[iter] = CIPSO_V4_INV_CAT;
-
-	for (iter = 0; iter < num_lvls; iter++) {
-		tmp_val_a = netlbl_getinc_u32(&msg, &msg_len);
-		tmp_val_b = netlbl_getinc_u8(&msg, &msg_len);
-
-		if (tmp_val_a >= doi_def->map.std->lvl.local_size ||
-		    tmp_val_b >= doi_def->map.std->lvl.cipso_size)
-			goto add_std_failure;
-
-		doi_def->map.std->lvl.cipso[tmp_val_b] = tmp_val_a;
-		doi_def->map.std->lvl.local[tmp_val_a] = tmp_val_b;
-	}
-
-	for (iter = 0; iter < num_cats; iter++) {
-		tmp_val_a = netlbl_getinc_u32(&msg, &msg_len);
-		tmp_val_b = netlbl_getinc_u16(&msg, &msg_len);
-
-		if (tmp_val_a >= doi_def->map.std->cat.local_size ||
-		    tmp_val_b >= doi_def->map.std->cat.cipso_size)
+		if (doi_def->map.std->cat.cipso == NULL) {
+			ret_val = -ENOMEM;
 			goto add_std_failure;
-
-		doi_def->map.std->cat.cipso[tmp_val_b] = tmp_val_a;
-		doi_def->map.std->cat.local[tmp_val_a] = tmp_val_b;
+		}
+		nla_for_each_nested(nla_a,
+				    info->attrs[NLBL_CIPSOV4_A_MLSCATLST],
+				    nla_a_rem)
+			if (nla_a->nla_type == NLBL_CIPSOV4_A_MLSCAT) {
+				struct nlattr *cat_loc;
+				struct nlattr *cat_rem;
+
+				cat_loc = nla_find_nested(nla_a,
+						     NLBL_CIPSOV4_A_MLSCATLOC);
+				cat_rem = nla_find_nested(nla_a,
+						     NLBL_CIPSOV4_A_MLSCATREM);
+				if (cat_loc == NULL || cat_rem == NULL)
+					goto add_std_failure;
+				doi_def->map.std->cat.local[
+				                        nla_get_u32(cat_loc)] =
+					nla_get_u32(cat_rem);
+				doi_def->map.std->cat.cipso[
+					                nla_get_u32(cat_rem)] =
+					nla_get_u32(cat_loc);
+			}
 	}
 
-	doi_def->doi = doi;
 	ret_val = cipso_v4_doi_add(doi_def);
 	if (ret_val != 0)
 		goto add_std_failure;
@@ -243,9 +331,7 @@ add_std_failure:
 
 /**
  * netlbl_cipsov4_add_pass - Adds a CIPSO V4 DOI definition
- * @doi: the DOI value
- * @msg: the ADD message data
- * @msg_size: the size of the ADD message buffer
+ * @info: the Generic NETLINK info block
  *
  * Description:
  * Create a new CIPSO_V4_MAP_PASS DOI definition based on the given ADD message
@@ -253,52 +339,31 @@ add_std_failure:
  * error.
  *
  */
-static int netlbl_cipsov4_add_pass(u32 doi,
-				   struct nlattr *msg,
-				   size_t msg_size)
+static int netlbl_cipsov4_add_pass(struct genl_info *info)
 {
-	int ret_val = -EINVAL;
-	int msg_len = msg_size;
-	u32 num_tags;
+	int ret_val;
 	struct cipso_v4_doi *doi_def = NULL;
-	u32 iter;
 
-	if (msg_len < NETLBL_LEN_U32)
-		goto add_pass_failure;
-	num_tags = netlbl_getinc_u32(&msg, &msg_len);
-	if (num_tags == 0 || num_tags > CIPSO_V4_TAG_MAXCNT)
-		goto add_pass_failure;
+	if (!info->attrs[NLBL_CIPSOV4_A_DOI] ||
+	    !info->attrs[NLBL_CIPSOV4_A_TAGLST])
+		return -EINVAL;
 
 	doi_def = kmalloc(sizeof(*doi_def), GFP_KERNEL);
-	if (doi_def == NULL) {
-		ret_val = -ENOMEM;
-		goto add_pass_failure;
-	}
+	if (doi_def == NULL)
+		return -ENOMEM;
 	doi_def->type = CIPSO_V4_MAP_PASS;
 
-	for (iter = 0; iter < num_tags; iter++) {
-		if (msg_len < NETLBL_LEN_U8)
-			goto add_pass_failure;
-		doi_def->tags[iter] = netlbl_getinc_u8(&msg, &msg_len);
-		switch (doi_def->tags[iter]) {
-		case CIPSO_V4_TAG_RBITMAP:
-			break;
-		default:
-			goto add_pass_failure;
-		}
-	}
-	if (iter < CIPSO_V4_TAG_MAXCNT)
-		doi_def->tags[iter] = CIPSO_V4_TAG_INVALID;
+	ret_val = netlbl_cipsov4_add_common(info, doi_def);
+	if (ret_val != 0)
+		goto add_pass_failure;
 
-	doi_def->doi = doi;
 	ret_val = cipso_v4_doi_add(doi_def);
 	if (ret_val != 0)
 		goto add_pass_failure;
 	return 0;
 
 add_pass_failure:
-	if (doi_def)
-		netlbl_cipsov4_doi_free(&doi_def->rcu);
+	netlbl_cipsov4_doi_free(&doi_def->rcu);
 	return ret_val;
 }
 
@@ -316,34 +381,21 @@ static int netlbl_cipsov4_add(struct sk_buff *skb, struct genl_info *info)
 
 {
 	int ret_val = -EINVAL;
-	u32 doi;
 	u32 map_type;
-	int msg_len = netlbl_netlink_payload_len(skb);
-	struct nlattr *msg = netlbl_netlink_payload_data(skb);
-
-	ret_val = netlbl_netlink_cap_check(skb, CAP_NET_ADMIN);
-	if (ret_val != 0)
-		goto add_return;
 
-	if (msg_len < 2 * NETLBL_LEN_U32)
-		goto add_return;
+	if (!info->attrs[NLBL_CIPSOV4_A_MTYPE])
+		return -EINVAL;
 
-	doi = netlbl_getinc_u32(&msg, &msg_len);
-	map_type = netlbl_getinc_u32(&msg, &msg_len);
+	map_type = nla_get_u32(info->attrs[NLBL_CIPSOV4_A_MTYPE]);
 	switch (map_type) {
 	case CIPSO_V4_MAP_STD:
-		ret_val = netlbl_cipsov4_add_std(doi, msg, msg_len);
+		ret_val = netlbl_cipsov4_add_std(info);
 		break;
 	case CIPSO_V4_MAP_PASS:
-		ret_val = netlbl_cipsov4_add_pass(doi, msg, msg_len);
+		ret_val = netlbl_cipsov4_add_pass(info);
 		break;
 	}
 
-add_return:
-	netlbl_netlink_send_ack(info,
-				netlbl_cipsov4_gnl_family.id,
-				NLBL_CIPSOV4_C_ACK,
-				-ret_val);
 	return ret_val;
 }
 
@@ -353,84 +405,239 @@ add_return:
  * @info: the Generic NETLINK info block
  *
  * Description:
- * Process a user generated LIST message and respond accordingly.  Returns
- * zero on success and negative values on error.
+ * Process a user generated LIST message and respond accordingly.  While the
+ * response message generated by the kernel is straightforward, determining
+ * before hand the size of the buffer to allocate is not (we have to generate
+ * the message to know the size).  In order to keep this function sane what we
+ * do is allocate a buffer of NLMSG_GOODSIZE and try to fit the response in
+ * that size, if we fail then we restart with a larger buffer and try again.
+ * We continue in this manner until we hit a limit of failed attempts then we
+ * give up and just send an error message.  Returns zero on success and
+ * negative values on error.
  *
  */
 static int netlbl_cipsov4_list(struct sk_buff *skb, struct genl_info *info)
 {
-	int ret_val = -EINVAL;
+	int ret_val;
+	struct sk_buff *ans_skb = NULL;
+	u32 nlsze_mult = 1;
+	void *data;
 	u32 doi;
-	struct nlattr *msg = netlbl_netlink_payload_data(skb);
-	struct sk_buff *ans_skb;
+	struct nlattr *nla_a;
+	struct nlattr *nla_b;
+	struct cipso_v4_doi *doi_def;
+	u32 iter;
 
-	if (netlbl_netlink_payload_len(skb) != NETLBL_LEN_U32)
+	if (!info->attrs[NLBL_CIPSOV4_A_DOI]) {
+		ret_val = -EINVAL;
 		goto list_failure;
+	}
 
-	doi = nla_get_u32(msg);
-	ans_skb = cipso_v4_doi_dump(doi, NLMSG_SPACE(GENL_HDRLEN));
+list_start:
+	ans_skb = nlmsg_new(NLMSG_GOODSIZE * nlsze_mult, GFP_KERNEL);
 	if (ans_skb == NULL) {
 		ret_val = -ENOMEM;
 		goto list_failure;
 	}
-	netlbl_netlink_hdr_push(ans_skb,
-				info->snd_pid,
-				0,
-				netlbl_cipsov4_gnl_family.id,
-				NLBL_CIPSOV4_C_LIST);
+	data = netlbl_netlink_hdr_put(ans_skb,
+				      info->snd_pid,
+				      info->snd_seq,
+				      netlbl_cipsov4_gnl_family.id,
+				      0,
+				      NLBL_CIPSOV4_C_LIST);
+	if (data == NULL) {
+		ret_val = -ENOMEM;
+		goto list_failure;
+	}
+
+	doi = nla_get_u32(info->attrs[NLBL_CIPSOV4_A_DOI]);
+
+	rcu_read_lock();
+	doi_def = cipso_v4_doi_getdef(doi);
+	if (doi_def == NULL) {
+		ret_val = -EINVAL;
+		goto list_failure;
+	}
+
+	ret_val = nla_put_u32(ans_skb, NLBL_CIPSOV4_A_MTYPE, doi_def->type);
+	if (ret_val != 0)
+		goto list_failure_lock;
+
+	nla_a = nla_nest_start(ans_skb, NLBL_CIPSOV4_A_TAGLST);
+	if (nla_a == NULL) {
+		ret_val = -ENOMEM;
+		goto list_failure_lock;
+	}
+	for (iter = 0;
+	     iter < CIPSO_V4_TAG_MAXCNT &&
+	       doi_def->tags[iter] != CIPSO_V4_TAG_INVALID;
+	     iter++) {
+		ret_val = nla_put_u8(ans_skb,
+				     NLBL_CIPSOV4_A_TAG,
+				     doi_def->tags[iter]);
+		if (ret_val != 0)
+			goto list_failure_lock;
+	}
+	nla_nest_end(ans_skb, nla_a);
+
+	switch (doi_def->type) {
+	case CIPSO_V4_MAP_STD:
+		nla_a = nla_nest_start(ans_skb, NLBL_CIPSOV4_A_MLSLVLLST);
+		if (nla_a == NULL) {
+			ret_val = -ENOMEM;
+			goto list_failure_lock;
+		}
+		for (iter = 0;
+		     iter < doi_def->map.std->lvl.local_size;
+		     iter++) {
+			if (doi_def->map.std->lvl.local[iter] ==
+			    CIPSO_V4_INV_LVL)
+				continue;
+
+			nla_b = nla_nest_start(ans_skb, NLBL_CIPSOV4_A_MLSLVL);
+			if (nla_b == NULL) {
+				ret_val = -ENOMEM;
+				goto list_retry;
+			}
+			ret_val = nla_put_u32(ans_skb,
+					      NLBL_CIPSOV4_A_MLSLVLLOC,
+					      iter);
+			if (ret_val != 0)
+				goto list_retry;
+			ret_val = nla_put_u32(ans_skb,
+					    NLBL_CIPSOV4_A_MLSLVLREM,
+					    doi_def->map.std->lvl.local[iter]);
+			if (ret_val != 0)
+				goto list_retry;
+			nla_nest_end(ans_skb, nla_b);
+		}
+		nla_nest_end(ans_skb, nla_a);
+
+		nla_a = nla_nest_start(ans_skb, NLBL_CIPSOV4_A_MLSCATLST);
+		if (nla_a == NULL) {
+			ret_val = -ENOMEM;
+			goto list_retry;
+		}
+		for (iter = 0;
+		     iter < doi_def->map.std->cat.local_size;
+		     iter++) {
+			if (doi_def->map.std->cat.local[iter] ==
+			    CIPSO_V4_INV_CAT)
+				continue;
+
+			nla_b = nla_nest_start(ans_skb, NLBL_CIPSOV4_A_MLSCAT);
+			if (nla_b == NULL) {
+				ret_val = -ENOMEM;
+				goto list_retry;
+			}
+			ret_val = nla_put_u32(ans_skb,
+					      NLBL_CIPSOV4_A_MLSCATLOC,
+					      iter);
+			if (ret_val != 0)
+				goto list_retry;
+			ret_val = nla_put_u32(ans_skb,
+					    NLBL_CIPSOV4_A_MLSCATREM,
+					    doi_def->map.std->cat.local[iter]);
+			if (ret_val != 0)
+				goto list_retry;
+			nla_nest_end(ans_skb, nla_b);
+		}
+		nla_nest_end(ans_skb, nla_a);
+
+		break;
+	}
+	rcu_read_unlock();
 
-	ret_val = netlbl_netlink_snd(ans_skb, info->snd_pid);
+	genlmsg_end(ans_skb, data);
+
+	ret_val = genlmsg_unicast(ans_skb, info->snd_pid);
 	if (ret_val != 0)
 		goto list_failure;
 
 	return 0;
 
+list_retry:
+	/* XXX - this limit is a guesstimate */
+	if (nlsze_mult < 4) {
+		rcu_read_unlock();
+		kfree_skb(ans_skb);
+		nlsze_mult++;
+		goto list_start;
+	}
+list_failure_lock:
+	rcu_read_unlock();
 list_failure:
-	netlbl_netlink_send_ack(info,
-				netlbl_cipsov4_gnl_family.id,
-				NLBL_CIPSOV4_C_ACK,
-				-ret_val);
+	kfree_skb(ans_skb);
+	return ret_val;
+}
+
+/**
+ * netlbl_cipsov4_listall_cb - cipso_v4_doi_walk() callback for LISTALL
+ * @doi_def: the CIPSOv4 DOI definition
+ * @arg: the netlbl_cipsov4_doiwalk_arg structure
+ *
+ * Description:
+ * This function is designed to be used as a callback to the
+ * cipso_v4_doi_walk() function for use in generating a response for a LISTALL
+ * message.  Returns the size of the message on success, negative values on
+ * failure.
+ *
+ */
+static int netlbl_cipsov4_listall_cb(struct cipso_v4_doi *doi_def, void *arg)
+{
+	int ret_val = -ENOMEM;
+	struct netlbl_cipsov4_doiwalk_arg *cb_arg = arg;
+	void *data;
+
+	data = netlbl_netlink_hdr_put(cb_arg->skb,
+				      NETLINK_CB(cb_arg->nl_cb->skb).pid,
+				      cb_arg->seq,
+				      netlbl_cipsov4_gnl_family.id,
+				      NLM_F_MULTI,
+				      NLBL_CIPSOV4_C_LISTALL);
+	if (data == NULL)
+		goto listall_cb_failure;
+
+	ret_val = nla_put_u32(cb_arg->skb, NLBL_CIPSOV4_A_DOI, doi_def->doi);
+	if (ret_val != 0)
+		goto listall_cb_failure;
+	ret_val = nla_put_u32(cb_arg->skb,
+			      NLBL_CIPSOV4_A_MTYPE,
+			      doi_def->type);
+	if (ret_val != 0)
+		goto listall_cb_failure;
+
+	return genlmsg_end(cb_arg->skb, data);
+
+listall_cb_failure:
+	genlmsg_cancel(cb_arg->skb, data);
 	return ret_val;
 }
 
 /**
  * netlbl_cipsov4_listall - Handle a LISTALL message
  * @skb: the NETLINK buffer
- * @info: the Generic NETLINK info block
+ * @cb: the NETLINK callback
  *
  * Description:
  * Process a user generated LISTALL message and respond accordingly.  Returns
  * zero on success and negative values on error.
  *
  */
-static int netlbl_cipsov4_listall(struct sk_buff *skb, struct genl_info *info)
+static int netlbl_cipsov4_listall(struct sk_buff *skb,
+				  struct netlink_callback *cb)
 {
-	int ret_val = -EINVAL;
-	struct sk_buff *ans_skb;
+	struct netlbl_cipsov4_doiwalk_arg cb_arg;
+	int doi_skip = cb->args[0];
 
-	ans_skb = cipso_v4_doi_dump_all(NLMSG_SPACE(GENL_HDRLEN));
-	if (ans_skb == NULL) {
-		ret_val = -ENOMEM;
-		goto listall_failure;
-	}
-	netlbl_netlink_hdr_push(ans_skb,
-				info->snd_pid,
-				0,
-				netlbl_cipsov4_gnl_family.id,
-				NLBL_CIPSOV4_C_LISTALL);
+	cb_arg.nl_cb = cb;
+	cb_arg.skb = skb;
+	cb_arg.seq = cb->nlh->nlmsg_seq;
 
-	ret_val = netlbl_netlink_snd(ans_skb, info->snd_pid);
-	if (ret_val != 0)
-		goto listall_failure;
-
-	return 0;
+	cipso_v4_doi_walk(&doi_skip, netlbl_cipsov4_listall_cb, &cb_arg);
 
-listall_failure:
-	netlbl_netlink_send_ack(info,
-				netlbl_cipsov4_gnl_family.id,
-				NLBL_CIPSOV4_C_ACK,
-				-ret_val);
-	return ret_val;
+	cb->args[0] = doi_skip;
+	return skb->len;
 }
 
 /**
@@ -445,27 +652,14 @@ listall_failure:
  */
 static int netlbl_cipsov4_remove(struct sk_buff *skb, struct genl_info *info)
 {
-	int ret_val;
+	int ret_val = -EINVAL;
 	u32 doi;
-	struct nlattr *msg = netlbl_netlink_payload_data(skb);
 
-	ret_val = netlbl_netlink_cap_check(skb, CAP_NET_ADMIN);
-	if (ret_val != 0)
-		goto remove_return;
-
-	if (netlbl_netlink_payload_len(skb) != NETLBL_LEN_U32) {
-		ret_val = -EINVAL;
-		goto remove_return;
+	if (info->attrs[NLBL_CIPSOV4_A_DOI]) {
+		doi = nla_get_u32(info->attrs[NLBL_CIPSOV4_A_DOI]);
+		ret_val = cipso_v4_doi_remove(doi, netlbl_cipsov4_doi_free);
 	}
 
-	doi = nla_get_u32(msg);
-	ret_val = cipso_v4_doi_remove(doi, netlbl_cipsov4_doi_free);
-
-remove_return:
-	netlbl_netlink_send_ack(info,
-				netlbl_cipsov4_gnl_family.id,
-				NLBL_CIPSOV4_C_ACK,
-				-ret_val);
 	return ret_val;
 }
 
@@ -475,14 +669,16 @@ remove_return:
 
 static struct genl_ops netlbl_cipsov4_genl_c_add = {
 	.cmd = NLBL_CIPSOV4_C_ADD,
-	.flags = 0,
+	.flags = GENL_ADMIN_PERM,
+	.policy = netlbl_cipsov4_genl_policy,
 	.doit = netlbl_cipsov4_add,
 	.dumpit = NULL,
 };
 
 static struct genl_ops netlbl_cipsov4_genl_c_remove = {
 	.cmd = NLBL_CIPSOV4_C_REMOVE,
-	.flags = 0,
+	.flags = GENL_ADMIN_PERM,
+	.policy = netlbl_cipsov4_genl_policy,
 	.doit = netlbl_cipsov4_remove,
 	.dumpit = NULL,
 };
@@ -490,6 +686,7 @@ static struct genl_ops netlbl_cipsov4_genl_c_remove = {
 static struct genl_ops netlbl_cipsov4_genl_c_list = {
 	.cmd = NLBL_CIPSOV4_C_LIST,
 	.flags = 0,
+	.policy = netlbl_cipsov4_genl_policy,
 	.doit = netlbl_cipsov4_list,
 	.dumpit = NULL,
 };
@@ -497,8 +694,9 @@ static struct genl_ops netlbl_cipsov4_genl_c_list = {
 static struct genl_ops netlbl_cipsov4_genl_c_listall = {
 	.cmd = NLBL_CIPSOV4_C_LISTALL,
 	.flags = 0,
-	.doit = netlbl_cipsov4_listall,
-	.dumpit = NULL,
+	.policy = netlbl_cipsov4_genl_policy,
+	.doit = NULL,
+	.dumpit = netlbl_cipsov4_listall,
 };
 
 /*
diff --git a/net/netlabel/netlabel_cipso_v4.h b/net/netlabel/netlabel_cipso_v4.h
index 4c6ff4b9300..f03cf9b7828 100644
--- a/net/netlabel/netlabel_cipso_v4.h
+++ b/net/netlabel/netlabel_cipso_v4.h
@@ -34,175 +34,71 @@
 #include <net/netlabel.h>
 
 /*
- * The following NetLabel payloads are supported by the CIPSO subsystem, all
- * of which are preceeded by the nlmsghdr struct.
+ * The following NetLabel payloads are supported by the CIPSO subsystem.
  *
- * o ACK:
- *   Sent by the kernel in response to an applications message, applications
- *   should never send this message.
+ * o ADD:
+ *   Sent by an application to add a new DOI mapping table.
  *
- *   +----------------------+-----------------------+
- *   | seq number (32 bits) | return code (32 bits) |
- *   +----------------------+-----------------------+
+ *   Required attributes:
  *
- *     seq number:  the sequence number of the original message, taken from the
- *                  nlmsghdr structure
- *     return code: return value, based on errno values
+ *     NLBL_CIPSOV4_A_DOI
+ *     NLBL_CIPSOV4_A_MTYPE
+ *     NLBL_CIPSOV4_A_TAGLST
  *
- * o ADD:
- *   Sent by an application to add a new DOI mapping table, after completion
- *   of the task the kernel should ACK this message.
- *
- *   +---------------+--------------------+---------------------+
- *   | DOI (32 bits) | map type (32 bits) | tag count (32 bits) | ...
- *   +---------------+--------------------+---------------------+
- *
- *   +-----------------+
- *   | tag #X (8 bits) | ... repeated
- *   +-----------------+
- *
- *   +-------------- ---- --- -- -
- *   | mapping data
- *   +-------------- ---- --- -- -
- *
- *     DOI:          the DOI value
- *     map type:     the mapping table type (defined in the cipso_ipv4.h header
- *                   as CIPSO_V4_MAP_*)
- *     tag count:    the number of tags, must be greater than zero
- *     tag:          the CIPSO tag for the DOI, tags listed first are given
- *                   higher priorirty when sending packets
- *     mapping data: specific to the map type (see below)
- *
- *   CIPSO_V4_MAP_STD
- *
- *   +------------------+-----------------------+----------------------+
- *   | levels (32 bits) | max l level (32 bits) | max r level (8 bits) | ...
- *   +------------------+-----------------------+----------------------+
- *
- *   +----------------------+---------------------+---------------------+
- *   | categories (32 bits) | max l cat (32 bits) | max r cat (16 bits) | ...
- *   +----------------------+---------------------+---------------------+
- *
- *   +--------------------------+-------------------------+
- *   | local level #X (32 bits) | CIPSO level #X (8 bits) | ... repeated
- *   +--------------------------+-------------------------+
- *
- *   +-----------------------------+-----------------------------+
- *   | local category #X (32 bits) | CIPSO category #X (16 bits) | ... repeated
- *   +-----------------------------+-----------------------------+
- *
- *     levels:         the number of level mappings
- *     max l level:    the highest local level
- *     max r level:    the highest remote/CIPSO level
- *     categories:     the number of category mappings
- *     max l cat:      the highest local category
- *     max r cat:      the highest remote/CIPSO category
- *     local level:    the local part of a level mapping
- *     CIPSO level:    the remote/CIPSO part of a level mapping
- *     local category: the local part of a category mapping
- *     CIPSO category: the remote/CIPSO part of a category mapping
- *
- *   CIPSO_V4_MAP_PASS
- *
- *   No mapping data is needed for this map type.
+ *   If using CIPSO_V4_MAP_STD the following attributes are required:
+ *
+ *     NLBL_CIPSOV4_A_MLSLVLLST
+ *     NLBL_CIPSOV4_A_MLSCATLST
+ *
+ *   If using CIPSO_V4_MAP_PASS no additional attributes are required.
  *
  * o REMOVE:
  *   Sent by an application to remove a specific DOI mapping table from the
- *   CIPSO V4 system.  The kernel should ACK this message.
+ *   CIPSO V4 system.
  *
- *   +---------------+
- *   | DOI (32 bits) |
- *   +---------------+
+ *   Required attributes:
  *
- *     DOI:          the DOI value
+ *     NLBL_CIPSOV4_A_DOI
  *
  * o LIST:
- *   Sent by an application to list the details of a DOI definition.  The
- *   kernel should send an ACK on error or a response as indicated below.  The
- *   application generated message format is shown below.
+ *   Sent by an application to list the details of a DOI definition.  On
+ *   success the kernel should send a response using the following format.
  *
- *   +---------------+
- *   | DOI (32 bits) |
- *   +---------------+
+ *   Required attributes:
  *
- *     DOI:          the DOI value
+ *     NLBL_CIPSOV4_A_DOI
  *
  *   The valid response message format depends on the type of the DOI mapping,
- *   the known formats are shown below.
- *
- *   +--------------------+
- *   | map type (32 bits) | ...
- *   +--------------------+
- *
- *     map type:       the DOI mapping table type (defined in the cipso_ipv4.h
- *                     header as CIPSO_V4_MAP_*)
- *
- *   (map type == CIPSO_V4_MAP_STD)
- *
- *   +----------------+------------------+----------------------+
- *   | tags (32 bits) | levels (32 bits) | categories (32 bits) | ...
- *   +----------------+------------------+----------------------+
+ *   the defined formats are shown below.
  *
- *   +-----------------+
- *   | tag #X (8 bits) | ... repeated
- *   +-----------------+
+ *   Required attributes:
  *
- *   +--------------------------+-------------------------+
- *   | local level #X (32 bits) | CIPSO level #X (8 bits) | ... repeated
- *   +--------------------------+-------------------------+
+ *     NLBL_CIPSOV4_A_MTYPE
+ *     NLBL_CIPSOV4_A_TAGLST
  *
- *   +-----------------------------+-----------------------------+
- *   | local category #X (32 bits) | CIPSO category #X (16 bits) | ... repeated
- *   +-----------------------------+-----------------------------+
+ *   If using CIPSO_V4_MAP_STD the following attributes are required:
  *
- *     tags:           the number of CIPSO tag types
- *     levels:         the number of level mappings
- *     categories:     the number of category mappings
- *     tag:            the tag number, tags listed first are given higher
- *                     priority when sending packets
- *     local level:    the local part of a level mapping
- *     CIPSO level:    the remote/CIPSO part of a level mapping
- *     local category: the local part of a category mapping
- *     CIPSO category: the remote/CIPSO part of a category mapping
+ *     NLBL_CIPSOV4_A_MLSLVLLST
+ *     NLBL_CIPSOV4_A_MLSCATLST
  *
- *   (map type == CIPSO_V4_MAP_PASS)
- *
- *   +----------------+
- *   | tags (32 bits) | ...
- *   +----------------+
- *
- *   +-----------------+
- *   | tag #X (8 bits) | ... repeated
- *   +-----------------+
- *
- *     tags:           the number of CIPSO tag types
- *     tag:            the tag number, tags listed first are given higher
- *                     priority when sending packets
+ *   If using CIPSO_V4_MAP_PASS no additional attributes are required.
  *
  * o LISTALL:
  *   This message is sent by an application to list the valid DOIs on the
- *   system.  There is no payload and the kernel should respond with an ACK
- *   or the following message.
- *
- *   +---------------------+------------------+-----------------------+
- *   | DOI count (32 bits) | DOI #X (32 bits) | map type #X (32 bits) |
- *   +---------------------+------------------+-----------------------+
+ *   system.  When sent by an application there is no payload and the
+ *   NLM_F_DUMP flag should be set.  The kernel should respond with a series of
+ *   the following messages.
  *
- *   +-----------------------+
- *   | map type #X (32 bits) | ...
- *   +-----------------------+
+ *   Required attributes:
  *
- *     DOI count:      the number of DOIs
- *     DOI:            the DOI value
- *     map type:       the DOI mapping table type (defined in the cipso_ipv4.h
- *                     header as CIPSO_V4_MAP_*)
+ *    NLBL_CIPSOV4_A_DOI
+ *    NLBL_CIPSOV4_A_MTYPE
  *
  */
 
 /* NetLabel CIPSOv4 commands */
 enum {
 	NLBL_CIPSOV4_C_UNSPEC,
-	NLBL_CIPSOV4_C_ACK,
 	NLBL_CIPSOV4_C_ADD,
 	NLBL_CIPSOV4_C_REMOVE,
 	NLBL_CIPSOV4_C_LIST,
@@ -211,6 +107,59 @@ enum {
 };
 #define NLBL_CIPSOV4_C_MAX (__NLBL_CIPSOV4_C_MAX - 1)
 
+/* NetLabel CIPSOv4 attributes */
+enum {
+	NLBL_CIPSOV4_A_UNSPEC,
+	NLBL_CIPSOV4_A_DOI,
+	/* (NLA_U32)
+	 * the DOI value */
+	NLBL_CIPSOV4_A_MTYPE,
+	/* (NLA_U32)
+	 * the mapping table type (defined in the cipso_ipv4.h header as
+	 * CIPSO_V4_MAP_*) */
+	NLBL_CIPSOV4_A_TAG,
+	/* (NLA_U8)
+	 * a CIPSO tag type, meant to be used within a NLBL_CIPSOV4_A_TAGLST
+	 * attribute */
+	NLBL_CIPSOV4_A_TAGLST,
+	/* (NLA_NESTED)
+	 * the CIPSO tag list for the DOI, there must be at least one
+	 * NLBL_CIPSOV4_A_TAG attribute, tags listed first are given higher
+	 * priorirty when sending packets */
+	NLBL_CIPSOV4_A_MLSLVLLOC,
+	/* (NLA_U32)
+	 * the local MLS sensitivity level */
+	NLBL_CIPSOV4_A_MLSLVLREM,
+	/* (NLA_U32)
+	 * the remote MLS sensitivity level */
+	NLBL_CIPSOV4_A_MLSLVL,
+	/* (NLA_NESTED)
+	 * a MLS sensitivity level mapping, must contain only one attribute of
+	 * each of the following types: NLBL_CIPSOV4_A_MLSLVLLOC and
+	 * NLBL_CIPSOV4_A_MLSLVLREM */
+	NLBL_CIPSOV4_A_MLSLVLLST,
+	/* (NLA_NESTED)
+	 * the CIPSO level mappings, there must be at least one
+	 * NLBL_CIPSOV4_A_MLSLVL attribute */
+	NLBL_CIPSOV4_A_MLSCATLOC,
+	/* (NLA_U32)
+	 * the local MLS category */
+	NLBL_CIPSOV4_A_MLSCATREM,
+	/* (NLA_U32)
+	 * the remote MLS category */
+	NLBL_CIPSOV4_A_MLSCAT,
+	/* (NLA_NESTED)
+	 * a MLS category mapping, must contain only one attribute of each of
+	 * the following types: NLBL_CIPSOV4_A_MLSCATLOC and
+	 * NLBL_CIPSOV4_A_MLSCATREM */
+	NLBL_CIPSOV4_A_MLSCATLST,
+	/* (NLA_NESTED)
+	 * the CIPSO category mappings, there must be at least one
+	 * NLBL_CIPSOV4_A_MLSCAT attribute */
+	__NLBL_CIPSOV4_A_MAX,
+};
+#define NLBL_CIPSOV4_A_MAX (__NLBL_CIPSOV4_A_MAX - 1)
+
 /* NetLabel protocol functions */
 int netlbl_cipsov4_genl_init(void);
 
diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c
index 85bc11a1fc4..8626c9f678e 100644
--- a/net/netlabel/netlabel_mgmt.c
+++ b/net/netlabel/netlabel_mgmt.c
@@ -42,15 +42,29 @@
 #include "netlabel_user.h"
 #include "netlabel_mgmt.h"
 
+/* Argument struct for netlbl_domhsh_walk() */
+struct netlbl_domhsh_walk_arg {
+	struct netlink_callback *nl_cb;
+	struct sk_buff *skb;
+	u32 seq;
+};
+
 /* NetLabel Generic NETLINK CIPSOv4 family */
 static struct genl_family netlbl_mgmt_gnl_family = {
 	.id = GENL_ID_GENERATE,
 	.hdrsize = 0,
 	.name = NETLBL_NLTYPE_MGMT_NAME,
 	.version = NETLBL_PROTO_VERSION,
-	.maxattr = 0,
+	.maxattr = NLBL_MGMT_A_MAX,
 };
 
+/* NetLabel Netlink attribute policy */
+static struct nla_policy netlbl_mgmt_genl_policy[NLBL_MGMT_A_MAX + 1] = {
+	[NLBL_MGMT_A_DOMAIN] = { .type = NLA_NUL_STRING },
+	[NLBL_MGMT_A_PROTOCOL] = { .type = NLA_U32 },
+	[NLBL_MGMT_A_VERSION] = { .type = NLA_U32 },
+	[NLBL_MGMT_A_CV4DOI] = { .type = NLA_U32 },
+};
 
 /*
  * NetLabel Command Handlers
@@ -70,97 +84,62 @@ static struct genl_family netlbl_mgmt_gnl_family = {
 static int netlbl_mgmt_add(struct sk_buff *skb, struct genl_info *info)
 {
 	int ret_val = -EINVAL;
-	struct nlattr *msg_ptr = netlbl_netlink_payload_data(skb);
-	int msg_len = netlbl_netlink_payload_len(skb);
-	u32 count;
 	struct netlbl_dom_map *entry = NULL;
-	u32 iter;
+	size_t tmp_size;
 	u32 tmp_val;
-	int tmp_size;
 
-	ret_val = netlbl_netlink_cap_check(skb, CAP_NET_ADMIN);
-	if (ret_val != 0)
+	if (!info->attrs[NLBL_MGMT_A_DOMAIN] ||
+	    !info->attrs[NLBL_MGMT_A_PROTOCOL])
 		goto add_failure;
 
-	if (msg_len < NETLBL_LEN_U32)
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (entry == NULL) {
+		ret_val = -ENOMEM;
+		goto add_failure;
+	}
+	tmp_size = nla_len(info->attrs[NLBL_MGMT_A_DOMAIN]);
+	entry->domain = kmalloc(tmp_size, GFP_KERNEL);
+	if (entry->domain == NULL) {
+		ret_val = -ENOMEM;
 		goto add_failure;
-	count = netlbl_getinc_u32(&msg_ptr, &msg_len);
+	}
+	entry->type = nla_get_u32(info->attrs[NLBL_MGMT_A_PROTOCOL]);
+	nla_strlcpy(entry->domain, info->attrs[NLBL_MGMT_A_DOMAIN], tmp_size);
 
-	for (iter = 0; iter < count && msg_len > 0; iter++, entry = NULL) {
-		if (msg_len <= 0) {
-			ret_val = -EINVAL;
-			goto add_failure;
-		}
-		entry = kzalloc(sizeof(*entry), GFP_KERNEL);
-		if (entry == NULL) {
-			ret_val = -ENOMEM;
-			goto add_failure;
-		}
-		tmp_size = nla_len(msg_ptr);
-		if (tmp_size <= 0 || tmp_size > msg_len) {
-			ret_val = -EINVAL;
-			goto add_failure;
-		}
-		entry->domain = kmalloc(tmp_size, GFP_KERNEL);
-		if (entry->domain == NULL) {
-			ret_val = -ENOMEM;
+	switch (entry->type) {
+	case NETLBL_NLTYPE_UNLABELED:
+		ret_val = netlbl_domhsh_add(entry);
+		break;
+	case NETLBL_NLTYPE_CIPSOV4:
+		if (!info->attrs[NLBL_MGMT_A_CV4DOI])
 			goto add_failure;
-		}
-		nla_strlcpy(entry->domain, msg_ptr, tmp_size);
-		entry->domain[tmp_size - 1] = '\0';
-		msg_ptr = nla_next(msg_ptr, &msg_len);
 
-		if (msg_len < NETLBL_LEN_U32) {
-			ret_val = -EINVAL;
-			goto add_failure;
-		}
-		tmp_val = netlbl_getinc_u32(&msg_ptr, &msg_len);
-		entry->type = tmp_val;
-		switch (tmp_val) {
-		case NETLBL_NLTYPE_UNLABELED:
-			ret_val = netlbl_domhsh_add(entry);
-			break;
-		case NETLBL_NLTYPE_CIPSOV4:
-			if (msg_len < NETLBL_LEN_U32) {
-				ret_val = -EINVAL;
-				goto add_failure;
-			}
-			tmp_val = netlbl_getinc_u32(&msg_ptr, &msg_len);
-			/* We should be holding a rcu_read_lock() here
-			 * while we hold the result but since the entry
-			 * will always be deleted when the CIPSO DOI
-			 * is deleted we aren't going to keep the lock. */
-			rcu_read_lock();
-			entry->type_def.cipsov4 = cipso_v4_doi_getdef(tmp_val);
-			if (entry->type_def.cipsov4 == NULL) {
-				rcu_read_unlock();
-				ret_val = -EINVAL;
-				goto add_failure;
-			}
-			ret_val = netlbl_domhsh_add(entry);
+		tmp_val = nla_get_u32(info->attrs[NLBL_MGMT_A_CV4DOI]);
+		/* We should be holding a rcu_read_lock() here while we hold
+		 * the result but since the entry will always be deleted when
+		 * the CIPSO DOI is deleted we aren't going to keep the
+		 * lock. */
+		rcu_read_lock();
+		entry->type_def.cipsov4 = cipso_v4_doi_getdef(tmp_val);
+		if (entry->type_def.cipsov4 == NULL) {
 			rcu_read_unlock();
-			break;
-		default:
-			ret_val = -EINVAL;
-		}
-		if (ret_val != 0)
 			goto add_failure;
+		}
+		ret_val = netlbl_domhsh_add(entry);
+		rcu_read_unlock();
+		break;
+	default:
+		goto add_failure;
 	}
+	if (ret_val != 0)
+		goto add_failure;
 
-	netlbl_netlink_send_ack(info,
-				netlbl_mgmt_gnl_family.id,
-				NLBL_MGMT_C_ACK,
-				NETLBL_E_OK);
 	return 0;
 
 add_failure:
 	if (entry)
 		kfree(entry->domain);
 	kfree(entry);
-	netlbl_netlink_send_ack(info,
-				netlbl_mgmt_gnl_family.id,
-				NLBL_MGMT_C_ACK,
-				-ret_val);
 	return ret_val;
 }
 
@@ -176,87 +155,98 @@ add_failure:
  */
 static int netlbl_mgmt_remove(struct sk_buff *skb, struct genl_info *info)
 {
-	int ret_val = -EINVAL;
-	struct nlattr *msg_ptr = netlbl_netlink_payload_data(skb);
-	int msg_len = netlbl_netlink_payload_len(skb);
-	u32 count;
-	u32 iter;
-	int tmp_size;
-	unsigned char *domain;
-
-	ret_val = netlbl_netlink_cap_check(skb, CAP_NET_ADMIN);
-	if (ret_val != 0)
-		goto remove_return;
+	char *domain;
 
-	if (msg_len < NETLBL_LEN_U32)
-		goto remove_return;
-	count = netlbl_getinc_u32(&msg_ptr, &msg_len);
+	if (!info->attrs[NLBL_MGMT_A_DOMAIN])
+		return -EINVAL;
 
-	for (iter = 0; iter < count && msg_len > 0; iter++) {
-		if (msg_len <= 0) {
-			ret_val = -EINVAL;
-			goto remove_return;
-		}
-		tmp_size = nla_len(msg_ptr);
-		domain = nla_data(msg_ptr);
-		if (tmp_size <= 0 || tmp_size > msg_len ||
-		    domain[tmp_size - 1] != '\0') {
-			ret_val = -EINVAL;
-			goto remove_return;
-		}
-		ret_val = netlbl_domhsh_remove(domain);
+	domain = nla_data(info->attrs[NLBL_MGMT_A_DOMAIN]);
+	return netlbl_domhsh_remove(domain);
+}
+
+/**
+ * netlbl_mgmt_listall_cb - netlbl_domhsh_walk() callback for LISTALL
+ * @entry: the domain mapping hash table entry
+ * @arg: the netlbl_domhsh_walk_arg structure
+ *
+ * Description:
+ * This function is designed to be used as a callback to the
+ * netlbl_domhsh_walk() function for use in generating a response for a LISTALL
+ * message.  Returns the size of the message on success, negative values on
+ * failure.
+ *
+ */
+static int netlbl_mgmt_listall_cb(struct netlbl_dom_map *entry, void *arg)
+{
+	int ret_val = -ENOMEM;
+	struct netlbl_domhsh_walk_arg *cb_arg = arg;
+	void *data;
+
+	data = netlbl_netlink_hdr_put(cb_arg->skb,
+				      NETLINK_CB(cb_arg->nl_cb->skb).pid,
+				      cb_arg->seq,
+				      netlbl_mgmt_gnl_family.id,
+				      NLM_F_MULTI,
+				      NLBL_MGMT_C_LISTALL);
+	if (data == NULL)
+		goto listall_cb_failure;
+
+	ret_val = nla_put_string(cb_arg->skb,
+				 NLBL_MGMT_A_DOMAIN,
+				 entry->domain);
+	if (ret_val != 0)
+		goto listall_cb_failure;
+	ret_val = nla_put_u32(cb_arg->skb, NLBL_MGMT_A_PROTOCOL, entry->type);
+	if (ret_val != 0)
+		goto listall_cb_failure;
+	switch (entry->type) {
+	case NETLBL_NLTYPE_CIPSOV4:
+		ret_val = nla_put_u32(cb_arg->skb,
+				      NLBL_MGMT_A_CV4DOI,
+				      entry->type_def.cipsov4->doi);
 		if (ret_val != 0)
-			goto remove_return;
-		msg_ptr = nla_next(msg_ptr, &msg_len);
+			goto listall_cb_failure;
+		break;
 	}
 
-	ret_val = 0;
+	cb_arg->seq++;
+	return genlmsg_end(cb_arg->skb, data);
 
-remove_return:
-	netlbl_netlink_send_ack(info,
-				netlbl_mgmt_gnl_family.id,
-				NLBL_MGMT_C_ACK,
-				-ret_val);
+listall_cb_failure:
+	genlmsg_cancel(cb_arg->skb, data);
 	return ret_val;
 }
 
 /**
- * netlbl_mgmt_list - Handle a LIST message
+ * netlbl_mgmt_listall - Handle a LISTALL message
  * @skb: the NETLINK buffer
- * @info: the Generic NETLINK info block
+ * @cb: the NETLINK callback
  *
  * Description:
- * Process a user generated LIST message and dumps the domain hash table in a
- * form suitable for use in a kernel generated LIST message.  Returns zero on
- * success, negative values on failure.
+ * Process a user generated LISTALL message and dumps the domain hash table in
+ * a form suitable for use in a kernel generated LISTALL message.  Returns zero
+ * on success, negative values on failure.
  *
  */
-static int netlbl_mgmt_list(struct sk_buff *skb, struct genl_info *info)
+static int netlbl_mgmt_listall(struct sk_buff *skb,
+			       struct netlink_callback *cb)
 {
-	int ret_val = -ENOMEM;
-	struct sk_buff *ans_skb;
-
-	ans_skb = netlbl_domhsh_dump(NLMSG_SPACE(GENL_HDRLEN));
-	if (ans_skb == NULL)
-		goto list_failure;
-	netlbl_netlink_hdr_push(ans_skb,
-				info->snd_pid,
-				0,
-				netlbl_mgmt_gnl_family.id,
-				NLBL_MGMT_C_LIST);
-
-	ret_val = netlbl_netlink_snd(ans_skb, info->snd_pid);
-	if (ret_val != 0)
-		goto list_failure;
-
-	return 0;
-
-list_failure:
-	netlbl_netlink_send_ack(info,
-				netlbl_mgmt_gnl_family.id,
-				NLBL_MGMT_C_ACK,
-				-ret_val);
-	return ret_val;
+	struct netlbl_domhsh_walk_arg cb_arg;
+	u32 skip_bkt = cb->args[0];
+	u32 skip_chain = cb->args[1];
+
+	cb_arg.nl_cb = cb;
+	cb_arg.skb = skb;
+	cb_arg.seq = cb->nlh->nlmsg_seq;
+
+	netlbl_domhsh_walk(&skip_bkt,
+			   &skip_chain,
+			   netlbl_mgmt_listall_cb,
+			   &cb_arg);
+
+	cb->args[0] = skip_bkt;
+	cb->args[1] = skip_chain;
+	return skb->len;
 }
 
 /**
@@ -272,68 +262,51 @@ list_failure:
 static int netlbl_mgmt_adddef(struct sk_buff *skb, struct genl_info *info)
 {
 	int ret_val = -EINVAL;
-	struct nlattr *msg_ptr = netlbl_netlink_payload_data(skb);
-	int msg_len = netlbl_netlink_payload_len(skb);
 	struct netlbl_dom_map *entry = NULL;
 	u32 tmp_val;
 
-	ret_val = netlbl_netlink_cap_check(skb, CAP_NET_ADMIN);
-	if (ret_val != 0)
-		goto adddef_failure;
-
-	if (msg_len < NETLBL_LEN_U32)
+	if (!info->attrs[NLBL_MGMT_A_PROTOCOL])
 		goto adddef_failure;
-	tmp_val = netlbl_getinc_u32(&msg_ptr, &msg_len);
 
 	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
 	if (entry == NULL) {
 		ret_val = -ENOMEM;
 		goto adddef_failure;
 	}
+	entry->type = nla_get_u32(info->attrs[NLBL_MGMT_A_PROTOCOL]);
 
-	entry->type = tmp_val;
 	switch (entry->type) {
 	case NETLBL_NLTYPE_UNLABELED:
 		ret_val = netlbl_domhsh_add_default(entry);
 		break;
 	case NETLBL_NLTYPE_CIPSOV4:
-		if (msg_len < NETLBL_LEN_U32) {
-			ret_val = -EINVAL;
+		if (!info->attrs[NLBL_MGMT_A_CV4DOI])
 			goto adddef_failure;
-		}
-		tmp_val = netlbl_getinc_u32(&msg_ptr, &msg_len);
-		/* We should be holding a rcu_read_lock here while we
-		 * hold the result but since the entry will always be
-		 * deleted when the CIPSO DOI is deleted we are going
-		 * to skip the lock. */
+
+		tmp_val = nla_get_u32(info->attrs[NLBL_MGMT_A_CV4DOI]);
+		/* We should be holding a rcu_read_lock() here while we hold
+		 * the result but since the entry will always be deleted when
+		 * the CIPSO DOI is deleted we aren't going to keep the
+		 * lock. */
 		rcu_read_lock();
 		entry->type_def.cipsov4 = cipso_v4_doi_getdef(tmp_val);
 		if (entry->type_def.cipsov4 == NULL) {
 			rcu_read_unlock();
-			ret_val = -EINVAL;
 			goto adddef_failure;
 		}
 		ret_val = netlbl_domhsh_add_default(entry);
 		rcu_read_unlock();
 		break;
 	default:
-		ret_val = -EINVAL;
+		goto adddef_failure;
 	}
 	if (ret_val != 0)
 		goto adddef_failure;
 
-	netlbl_netlink_send_ack(info,
-				netlbl_mgmt_gnl_family.id,
-				NLBL_MGMT_C_ACK,
-				NETLBL_E_OK);
 	return 0;
 
 adddef_failure:
 	kfree(entry);
-	netlbl_netlink_send_ack(info,
-				netlbl_mgmt_gnl_family.id,
-				NLBL_MGMT_C_ACK,
-				-ret_val);
 	return ret_val;
 }
 
@@ -349,20 +322,7 @@ adddef_failure:
  */
 static int netlbl_mgmt_removedef(struct sk_buff *skb, struct genl_info *info)
 {
-	int ret_val;
-
-	ret_val = netlbl_netlink_cap_check(skb, CAP_NET_ADMIN);
-	if (ret_val != 0)
-		goto removedef_return;
-
-	ret_val = netlbl_domhsh_remove_default();
-
-removedef_return:
-	netlbl_netlink_send_ack(info,
-				netlbl_mgmt_gnl_family.id,
-				NLBL_MGMT_C_ACK,
-				-ret_val);
-	return ret_val;
+	return netlbl_domhsh_remove_default();
 }
 
 /**
@@ -379,87 +339,130 @@ removedef_return:
 static int netlbl_mgmt_listdef(struct sk_buff *skb, struct genl_info *info)
 {
 	int ret_val = -ENOMEM;
-	struct sk_buff *ans_skb;
+	struct sk_buff *ans_skb = NULL;
+	void *data;
+	struct netlbl_dom_map *entry;
 
-	ans_skb = netlbl_domhsh_dump_default(NLMSG_SPACE(GENL_HDRLEN));
+	ans_skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
 	if (ans_skb == NULL)
+		return -ENOMEM;
+	data = netlbl_netlink_hdr_put(ans_skb,
+				      info->snd_pid,
+				      info->snd_seq,
+				      netlbl_mgmt_gnl_family.id,
+				      0,
+				      NLBL_MGMT_C_LISTDEF);
+	if (data == NULL)
 		goto listdef_failure;
-	netlbl_netlink_hdr_push(ans_skb,
-				info->snd_pid,
-				0,
-				netlbl_mgmt_gnl_family.id,
-				NLBL_MGMT_C_LISTDEF);
 
-	ret_val = netlbl_netlink_snd(ans_skb, info->snd_pid);
+	rcu_read_lock();
+	entry = netlbl_domhsh_getentry(NULL);
+	if (entry == NULL) {
+		ret_val = -ENOENT;
+		goto listdef_failure_lock;
+	}
+	ret_val = nla_put_u32(ans_skb, NLBL_MGMT_A_PROTOCOL, entry->type);
 	if (ret_val != 0)
-		goto listdef_failure;
+		goto listdef_failure_lock;
+	switch (entry->type) {
+	case NETLBL_NLTYPE_CIPSOV4:
+		ret_val = nla_put_u32(ans_skb,
+				      NLBL_MGMT_A_CV4DOI,
+				      entry->type_def.cipsov4->doi);
+		if (ret_val != 0)
+			goto listdef_failure_lock;
+		break;
+	}
+	rcu_read_unlock();
 
+	genlmsg_end(ans_skb, data);
+
+	ret_val = genlmsg_unicast(ans_skb, info->snd_pid);
+	if (ret_val != 0)
+		goto listdef_failure;
 	return 0;
 
+listdef_failure_lock:
+	rcu_read_unlock();
 listdef_failure:
-	netlbl_netlink_send_ack(info,
-				netlbl_mgmt_gnl_family.id,
-				NLBL_MGMT_C_ACK,
-				-ret_val);
+	kfree_skb(ans_skb);
 	return ret_val;
 }
 
 /**
- * netlbl_mgmt_modules - Handle a MODULES message
- * @skb: the NETLINK buffer
- * @info: the Generic NETLINK info block
+ * netlbl_mgmt_protocols_cb - Write an individual PROTOCOL message response
+ * @skb: the skb to write to
+ * @seq: the NETLINK sequence number
+ * @cb: the NETLINK callback
+ * @protocol: the NetLabel protocol to use in the message
  *
  * Description:
- * Process a user generated MODULES message and respond accordingly.
+ * This function is to be used in conjunction with netlbl_mgmt_protocols() to
+ * answer a application's PROTOCOLS message.  Returns the size of the message
+ * on success, negative values on failure.
  *
  */
-static int netlbl_mgmt_modules(struct sk_buff *skb, struct genl_info *info)
+static int netlbl_mgmt_protocols_cb(struct sk_buff *skb,
+				    struct netlink_callback *cb,
+				    u32 protocol)
 {
 	int ret_val = -ENOMEM;
-	size_t data_size;
-	u32 mod_count;
-	struct sk_buff *ans_skb = NULL;
-
-	/* unlabeled + cipsov4 */
-	mod_count = 2;
-
-	data_size = GENL_HDRLEN + NETLBL_LEN_U32 + mod_count * NETLBL_LEN_U32;
-	ans_skb = netlbl_netlink_alloc_skb(0, data_size, GFP_KERNEL);
-	if (ans_skb == NULL)
-		goto modules_failure;
-
-	if (netlbl_netlink_hdr_put(ans_skb,
-				   info->snd_pid,
-				   0,
-				   netlbl_mgmt_gnl_family.id,
-				   NLBL_MGMT_C_MODULES) == NULL)
-		goto modules_failure;
-
-	ret_val = nla_put_u32(ans_skb, NLA_U32, mod_count);
-	if (ret_val != 0)
-		goto modules_failure;
-	ret_val = nla_put_u32(ans_skb, NLA_U32, NETLBL_NLTYPE_UNLABELED);
+	void *data;
+
+	data = netlbl_netlink_hdr_put(skb,
+				      NETLINK_CB(cb->skb).pid,
+				      cb->nlh->nlmsg_seq,
+				      netlbl_mgmt_gnl_family.id,
+				      NLM_F_MULTI,
+				      NLBL_MGMT_C_PROTOCOLS);
+	if (data == NULL)
+		goto protocols_cb_failure;
+
+	ret_val = nla_put_u32(skb, NLBL_MGMT_A_PROTOCOL, protocol);
 	if (ret_val != 0)
-		goto modules_failure;
-	ret_val = nla_put_u32(ans_skb, NLA_U32, NETLBL_NLTYPE_CIPSOV4);
-	if (ret_val != 0)
-		goto modules_failure;
-
-	ret_val = netlbl_netlink_snd(ans_skb, info->snd_pid);
-	if (ret_val != 0)
-		goto modules_failure;
+		goto protocols_cb_failure;
 
-	return 0;
+	return genlmsg_end(skb, data);
 
-modules_failure:
-	kfree_skb(ans_skb);
-	netlbl_netlink_send_ack(info,
-				netlbl_mgmt_gnl_family.id,
-				NLBL_MGMT_C_ACK,
-				-ret_val);
+protocols_cb_failure:
+	genlmsg_cancel(skb, data);
 	return ret_val;
 }
 
+/**
+ * netlbl_mgmt_protocols - Handle a PROTOCOLS message
+ * @skb: the NETLINK buffer
+ * @cb: the NETLINK callback
+ *
+ * Description:
+ * Process a user generated PROTOCOLS message and respond accordingly.
+ *
+ */
+static int netlbl_mgmt_protocols(struct sk_buff *skb,
+				 struct netlink_callback *cb)
+{
+	u32 protos_sent = cb->args[0];
+
+	if (protos_sent == 0) {
+		if (netlbl_mgmt_protocols_cb(skb,
+					     cb,
+					     NETLBL_NLTYPE_UNLABELED) < 0)
+			goto protocols_return;
+		protos_sent++;
+	}
+	if (protos_sent == 1) {
+		if (netlbl_mgmt_protocols_cb(skb,
+					     cb,
+					     NETLBL_NLTYPE_CIPSOV4) < 0)
+			goto protocols_return;
+		protos_sent++;
+	}
+
+protocols_return:
+	cb->args[0] = protos_sent;
+	return skb->len;
+}
+
 /**
  * netlbl_mgmt_version - Handle a VERSION message
  * @skb: the NETLINK buffer
@@ -474,35 +477,35 @@ static int netlbl_mgmt_version(struct sk_buff *skb, struct genl_info *info)
 {
 	int ret_val = -ENOMEM;
 	struct sk_buff *ans_skb = NULL;
+	void *data;
 
-	ans_skb = netlbl_netlink_alloc_skb(0,
-					   GENL_HDRLEN + NETLBL_LEN_U32,
-					   GFP_KERNEL);
+	ans_skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
 	if (ans_skb == NULL)
-		goto version_failure;
-	if (netlbl_netlink_hdr_put(ans_skb,
-				   info->snd_pid,
-				   0,
-				   netlbl_mgmt_gnl_family.id,
-				   NLBL_MGMT_C_VERSION) == NULL)
+		return -ENOMEM;
+	data = netlbl_netlink_hdr_put(ans_skb,
+				      info->snd_pid,
+				      info->snd_seq,
+				      netlbl_mgmt_gnl_family.id,
+				      0,
+				      NLBL_MGMT_C_VERSION);
+	if (data == NULL)
 		goto version_failure;
 
-	ret_val = nla_put_u32(ans_skb, NLA_U32, NETLBL_PROTO_VERSION);
+	ret_val = nla_put_u32(ans_skb,
+			      NLBL_MGMT_A_VERSION,
+			      NETLBL_PROTO_VERSION);
 	if (ret_val != 0)
 		goto version_failure;
 
-	ret_val = netlbl_netlink_snd(ans_skb, info->snd_pid);
+	genlmsg_end(ans_skb, data);
+
+	ret_val = genlmsg_unicast(ans_skb, info->snd_pid);
 	if (ret_val != 0)
 		goto version_failure;
-
 	return 0;
 
 version_failure:
 	kfree_skb(ans_skb);
-	netlbl_netlink_send_ack(info,
-				netlbl_mgmt_gnl_family.id,
-				NLBL_MGMT_C_ACK,
-				-ret_val);
 	return ret_val;
 }
 
@@ -513,35 +516,40 @@ version_failure:
 
 static struct genl_ops netlbl_mgmt_genl_c_add = {
 	.cmd = NLBL_MGMT_C_ADD,
-	.flags = 0,
+	.flags = GENL_ADMIN_PERM,
+	.policy = netlbl_mgmt_genl_policy,
 	.doit = netlbl_mgmt_add,
 	.dumpit = NULL,
 };
 
 static struct genl_ops netlbl_mgmt_genl_c_remove = {
 	.cmd = NLBL_MGMT_C_REMOVE,
-	.flags = 0,
+	.flags = GENL_ADMIN_PERM,
+	.policy = netlbl_mgmt_genl_policy,
 	.doit = netlbl_mgmt_remove,
 	.dumpit = NULL,
 };
 
-static struct genl_ops netlbl_mgmt_genl_c_list = {
-	.cmd = NLBL_MGMT_C_LIST,
+static struct genl_ops netlbl_mgmt_genl_c_listall = {
+	.cmd = NLBL_MGMT_C_LISTALL,
 	.flags = 0,
-	.doit = netlbl_mgmt_list,
-	.dumpit = NULL,
+	.policy = netlbl_mgmt_genl_policy,
+	.doit = NULL,
+	.dumpit = netlbl_mgmt_listall,
 };
 
 static struct genl_ops netlbl_mgmt_genl_c_adddef = {
 	.cmd = NLBL_MGMT_C_ADDDEF,
-	.flags = 0,
+	.flags = GENL_ADMIN_PERM,
+	.policy = netlbl_mgmt_genl_policy,
 	.doit = netlbl_mgmt_adddef,
 	.dumpit = NULL,
 };
 
 static struct genl_ops netlbl_mgmt_genl_c_removedef = {
 	.cmd = NLBL_MGMT_C_REMOVEDEF,
-	.flags = 0,
+	.flags = GENL_ADMIN_PERM,
+	.policy = netlbl_mgmt_genl_policy,
 	.doit = netlbl_mgmt_removedef,
 	.dumpit = NULL,
 };
@@ -549,20 +557,23 @@ static struct genl_ops netlbl_mgmt_genl_c_removedef = {
 static struct genl_ops netlbl_mgmt_genl_c_listdef = {
 	.cmd = NLBL_MGMT_C_LISTDEF,
 	.flags = 0,
+	.policy = netlbl_mgmt_genl_policy,
 	.doit = netlbl_mgmt_listdef,
 	.dumpit = NULL,
 };
 
-static struct genl_ops netlbl_mgmt_genl_c_modules = {
-	.cmd = NLBL_MGMT_C_MODULES,
+static struct genl_ops netlbl_mgmt_genl_c_protocols = {
+	.cmd = NLBL_MGMT_C_PROTOCOLS,
 	.flags = 0,
-	.doit = netlbl_mgmt_modules,
-	.dumpit = NULL,
+	.policy = netlbl_mgmt_genl_policy,
+	.doit = NULL,
+	.dumpit = netlbl_mgmt_protocols,
 };
 
 static struct genl_ops netlbl_mgmt_genl_c_version = {
 	.cmd = NLBL_MGMT_C_VERSION,
 	.flags = 0,
+	.policy = netlbl_mgmt_genl_policy,
 	.doit = netlbl_mgmt_version,
 	.dumpit = NULL,
 };
@@ -596,7 +607,7 @@ int netlbl_mgmt_genl_init(void)
 	if (ret_val != 0)
 		return ret_val;
 	ret_val = genl_register_ops(&netlbl_mgmt_gnl_family,
-				    &netlbl_mgmt_genl_c_list);
+				    &netlbl_mgmt_genl_c_listall);
 	if (ret_val != 0)
 		return ret_val;
 	ret_val = genl_register_ops(&netlbl_mgmt_gnl_family,
@@ -612,7 +623,7 @@ int netlbl_mgmt_genl_init(void)
 	if (ret_val != 0)
 		return ret_val;
 	ret_val = genl_register_ops(&netlbl_mgmt_gnl_family,
-				    &netlbl_mgmt_genl_c_modules);
+				    &netlbl_mgmt_genl_c_protocols);
 	if (ret_val != 0)
 		return ret_val;
 	ret_val = genl_register_ops(&netlbl_mgmt_gnl_family,
diff --git a/net/netlabel/netlabel_mgmt.h b/net/netlabel/netlabel_mgmt.h
index fd6c6acbfa0..3642d3bfc8e 100644
--- a/net/netlabel/netlabel_mgmt.h
+++ b/net/netlabel/netlabel_mgmt.h
@@ -34,212 +34,137 @@
 #include <net/netlabel.h>
 
 /*
- * The following NetLabel payloads are supported by the management interface,
- * all of which are preceeded by the nlmsghdr struct.
- *
- * o ACK:
- *   Sent by the kernel in response to an applications message, applications
- *   should never send this message.
- *
- *   +----------------------+-----------------------+
- *   | seq number (32 bits) | return code (32 bits) |
- *   +----------------------+-----------------------+
- *
- *     seq number:  the sequence number of the original message, taken from the
- *                  nlmsghdr structure
- *     return code: return value, based on errno values
+ * The following NetLabel payloads are supported by the management interface.
  *
  * o ADD:
  *   Sent by an application to add a domain mapping to the NetLabel system.
- *   The kernel should respond with an ACK.
- *
- *   +-------------------+
- *   | domains (32 bits) | ...
- *   +-------------------+
- *
- *     domains: the number of domains in the message
- *
- *   +--------------------------+-------------------------+
- *   | domain string (variable) | protocol type (32 bits) | ...
- *   +--------------------------+-------------------------+
  *
- *   +-------------- ---- --- -- -
- *   | mapping data                ... repeated
- *   +-------------- ---- --- -- -
+ *   Required attributes:
  *
- *     domain string: the domain string, NULL terminated
- *     protocol type: the protocol type (defined by NETLBL_NLTYPE_*)
- *     mapping data:  specific to the map type (see below)
+ *     NLBL_MGMT_A_DOMAIN
+ *     NLBL_MGMT_A_PROTOCOL
  *
- *   NETLBL_NLTYPE_UNLABELED
+ *   If using NETLBL_NLTYPE_CIPSOV4 the following attributes are required:
  *
- *     No mapping data for this protocol type.
+ *     NLBL_MGMT_A_CV4DOI
  *
- *   NETLBL_NLTYPE_CIPSOV4
- *
- *   +---------------+
- *   | doi (32 bits) |
- *   +---------------+
- *
- *     doi:  the CIPSO DOI value
+ *   If using NETLBL_NLTYPE_UNLABELED no other attributes are required.
  *
  * o REMOVE:
  *   Sent by an application to remove a domain mapping from the NetLabel
- *   system.  The kernel should ACK this message.
- *
- *   +-------------------+
- *   | domains (32 bits) | ...
- *   +-------------------+
+ *   system.
  *
- *     domains: the number of domains in the message
+ *   Required attributes:
  *
- *   +--------------------------+
- *   | domain string (variable) | ...
- *   +--------------------------+
+ *     NLBL_MGMT_A_DOMAIN
  *
- *     domain string: the domain string, NULL terminated
- *
- * o LIST:
+ * o LISTALL:
  *   This message can be sent either from an application or by the kernel in
- *   response to an application generated LIST message.  When sent by an
- *   application there is no payload.  The kernel should respond to a LIST
- *   message either with a LIST message on success or an ACK message on
- *   failure.
- *
- *   +-------------------+
- *   | domains (32 bits) | ...
- *   +-------------------+
- *
- *     domains: the number of domains in the message
+ *   response to an application generated LISTALL message.  When sent by an
+ *   application there is no payload and the NLM_F_DUMP flag should be set.
+ *   The kernel should respond with a series of the following messages.
  *
- *   +--------------------------+
- *   | domain string (variable) | ...
- *   +--------------------------+
+ *   Required attributes:
  *
- *   +-------------------------+-------------- ---- --- -- -
- *   | protocol type (32 bits) | mapping data                ... repeated
- *   +-------------------------+-------------- ---- --- -- -
+ *     NLBL_MGMT_A_DOMAIN
+ *     NLBL_MGMT_A_PROTOCOL
  *
- *     domain string: the domain string, NULL terminated
- *     protocol type: the protocol type (defined by NETLBL_NLTYPE_*)
- *     mapping data:  specific to the map type (see below)
+ *   If using NETLBL_NLTYPE_CIPSOV4 the following attributes are required:
  *
- *   NETLBL_NLTYPE_UNLABELED
+ *     NLBL_MGMT_A_CV4DOI
  *
- *     No mapping data for this protocol type.
- *
- *   NETLBL_NLTYPE_CIPSOV4
- *
- *   +----------------+---------------+
- *   | type (32 bits) | doi (32 bits) |
- *   +----------------+---------------+
- *
- *     type: the CIPSO mapping table type (defined in the cipso_ipv4.h header
- *           as CIPSO_V4_MAP_*)
- *     doi:  the CIPSO DOI value
+ *   If using NETLBL_NLTYPE_UNLABELED no other attributes are required.
  *
  * o ADDDEF:
  *   Sent by an application to set the default domain mapping for the NetLabel
- *   system.  The kernel should respond with an ACK.
+ *   system.
  *
- *   +-------------------------+-------------- ---- --- -- -
- *   | protocol type (32 bits) | mapping data                ... repeated
- *   +-------------------------+-------------- ---- --- -- -
+ *   Required attributes:
  *
- *     protocol type: the protocol type (defined by NETLBL_NLTYPE_*)
- *     mapping data:  specific to the map type (see below)
+ *     NLBL_MGMT_A_PROTOCOL
  *
- *   NETLBL_NLTYPE_UNLABELED
+ *   If using NETLBL_NLTYPE_CIPSOV4 the following attributes are required:
  *
- *     No mapping data for this protocol type.
+ *     NLBL_MGMT_A_CV4DOI
  *
- *   NETLBL_NLTYPE_CIPSOV4
- *
- *   +---------------+
- *   | doi (32 bits) |
- *   +---------------+
- *
- *     doi:  the CIPSO DOI value
+ *   If using NETLBL_NLTYPE_UNLABELED no other attributes are required.
  *
  * o REMOVEDEF:
  *   Sent by an application to remove the default domain mapping from the
- *   NetLabel system, there is no payload.  The kernel should ACK this message.
+ *   NetLabel system, there is no payload.
  *
  * o LISTDEF:
  *   This message can be sent either from an application or by the kernel in
  *   response to an application generated LISTDEF message.  When sent by an
- *   application there is no payload.  The kernel should respond to a
- *   LISTDEF message either with a LISTDEF message on success or an ACK message
- *   on failure.
- *
- *   +-------------------------+-------------- ---- --- -- -
- *   | protocol type (32 bits) | mapping data                ... repeated
- *   +-------------------------+-------------- ---- --- -- -
+ *   application there is no payload.  On success the kernel should send a
+ *   response using the following format.
  *
- *     protocol type: the protocol type (defined by NETLBL_NLTYPE_*)
- *     mapping data:  specific to the map type (see below)
+ *   Required attributes:
  *
- *   NETLBL_NLTYPE_UNLABELED
+ *     NLBL_MGMT_A_PROTOCOL
  *
- *     No mapping data for this protocol type.
+ *   If using NETLBL_NLTYPE_CIPSOV4 the following attributes are required:
  *
- *   NETLBL_NLTYPE_CIPSOV4
+ *     NLBL_MGMT_A_CV4DOI
  *
- *   +----------------+---------------+
- *   | type (32 bits) | doi (32 bits) |
- *   +----------------+---------------+
+ *   If using NETLBL_NLTYPE_UNLABELED no other attributes are required.
  *
- *     type: the CIPSO mapping table type (defined in the cipso_ipv4.h header
- *           as CIPSO_V4_MAP_*)
- *     doi:  the CIPSO DOI value
+ * o PROTOCOLS:
+ *   Sent by an application to request a list of configured NetLabel protocols
+ *   in the kernel.  When sent by an application there is no payload and the
+ *   NLM_F_DUMP flag should be set.  The kernel should respond with a series of
+ *   the following messages.
  *
- * o MODULES:
- *   Sent by an application to request a list of configured NetLabel modules
- *   in the kernel.  When sent by an application there is no payload.
+ *   Required attributes:
  *
- *   +-------------------+
- *   | modules (32 bits) | ...
- *   +-------------------+
- *
- *     modules: the number of modules in the message, if this is an application
- *              generated message and the value is zero then return a list of
- *              the configured modules
- *
- *   +------------------+
- *   | module (32 bits) | ... repeated
- *   +------------------+
- *
- *     module: the module number as defined by NETLBL_NLTYPE_*
+ *     NLBL_MGMT_A_PROTOCOL
  *
  * o VERSION:
- *   Sent by an application to request the NetLabel version string.  When sent
- *   by an application there is no payload.  This message type is also used by
- *   the kernel to respond to an VERSION request.
+ *   Sent by an application to request the NetLabel version.  When sent by an
+ *   application there is no payload.  This message type is also used by the
+ *   kernel to respond to an VERSION request.
  *
- *   +-------------------+
- *   | version (32 bits) |
- *   +-------------------+
+ *   Required attributes:
  *
- *     version: the protocol version number
+ *     NLBL_MGMT_A_VERSION
  *
  */
 
 /* NetLabel Management commands */
 enum {
 	NLBL_MGMT_C_UNSPEC,
-	NLBL_MGMT_C_ACK,
 	NLBL_MGMT_C_ADD,
 	NLBL_MGMT_C_REMOVE,
-	NLBL_MGMT_C_LIST,
+	NLBL_MGMT_C_LISTALL,
 	NLBL_MGMT_C_ADDDEF,
 	NLBL_MGMT_C_REMOVEDEF,
 	NLBL_MGMT_C_LISTDEF,
-	NLBL_MGMT_C_MODULES,
+	NLBL_MGMT_C_PROTOCOLS,
 	NLBL_MGMT_C_VERSION,
 	__NLBL_MGMT_C_MAX,
 };
 #define NLBL_MGMT_C_MAX (__NLBL_MGMT_C_MAX - 1)
 
+/* NetLabel Management attributes */
+enum {
+	NLBL_MGMT_A_UNSPEC,
+	NLBL_MGMT_A_DOMAIN,
+	/* (NLA_NUL_STRING)
+	 * the NULL terminated LSM domain string */
+	NLBL_MGMT_A_PROTOCOL,
+	/* (NLA_U32)
+	 * the NetLabel protocol type (defined by NETLBL_NLTYPE_*) */
+	NLBL_MGMT_A_VERSION,
+	/* (NLA_U32)
+	 * the NetLabel protocol version number (defined by
+	 * NETLBL_PROTO_VERSION) */
+	NLBL_MGMT_A_CV4DOI,
+	/* (NLA_U32)
+	 * the CIPSOv4 DOI value */
+	__NLBL_MGMT_A_MAX,
+};
+#define NLBL_MGMT_A_MAX (__NLBL_MGMT_A_MAX - 1)
+
 /* NetLabel protocol functions */
 int netlbl_mgmt_genl_init(void);
 
diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c
index 785f4960e0d..440f5c4e1e2 100644
--- a/net/netlabel/netlabel_unlabeled.c
+++ b/net/netlabel/netlabel_unlabeled.c
@@ -55,9 +55,13 @@ static struct genl_family netlbl_unlabel_gnl_family = {
 	.hdrsize = 0,
 	.name = NETLBL_NLTYPE_UNLABELED_NAME,
 	.version = NETLBL_PROTO_VERSION,
-	.maxattr = 0,
+	.maxattr = NLBL_UNLABEL_A_MAX,
 };
 
+/* NetLabel Netlink attribute policy */
+static struct nla_policy netlbl_unlabel_genl_policy[NLBL_UNLABEL_A_MAX + 1] = {
+	[NLBL_UNLABEL_A_ACPTFLG] = { .type = NLA_U8 },
+};
 
 /*
  * NetLabel Command Handlers
@@ -75,31 +79,18 @@ static struct genl_family netlbl_unlabel_gnl_family = {
  */
 static int netlbl_unlabel_accept(struct sk_buff *skb, struct genl_info *info)
 {
-	int ret_val;
-	struct nlattr *data = netlbl_netlink_payload_data(skb);
-	u32 value;
-
-	ret_val = netlbl_netlink_cap_check(skb, CAP_NET_ADMIN);
-	if (ret_val != 0)
-		return ret_val;
+	int ret_val = -EINVAL;
+	u8 value;
 
-	if (netlbl_netlink_payload_len(skb) == NETLBL_LEN_U32) {
-		value = nla_get_u32(data);
+	if (info->attrs[NLBL_UNLABEL_A_ACPTFLG]) {
+		value = nla_get_u8(info->attrs[NLBL_UNLABEL_A_ACPTFLG]);
 		if (value == 1 || value == 0) {
 			atomic_set(&netlabel_unlabel_accept_flg, value);
-			netlbl_netlink_send_ack(info,
-						netlbl_unlabel_gnl_family.id,
-						NLBL_UNLABEL_C_ACK,
-						NETLBL_E_OK);
-			return 0;
+			ret_val = 0;
 		}
 	}
 
-	netlbl_netlink_send_ack(info,
-				netlbl_unlabel_gnl_family.id,
-				NLBL_UNLABEL_C_ACK,
-				EINVAL);
-	return -EINVAL;
+	return ret_val;
 }
 
 /**
@@ -114,39 +105,39 @@ static int netlbl_unlabel_accept(struct sk_buff *skb, struct genl_info *info)
  */
 static int netlbl_unlabel_list(struct sk_buff *skb, struct genl_info *info)
 {
-	int ret_val = -ENOMEM;
+	int ret_val = -EINVAL;
 	struct sk_buff *ans_skb;
+	void *data;
 
-	ans_skb = netlbl_netlink_alloc_skb(0,
-					   GENL_HDRLEN + NETLBL_LEN_U32,
-					   GFP_KERNEL);
+	ans_skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
 	if (ans_skb == NULL)
 		goto list_failure;
-
-	if (netlbl_netlink_hdr_put(ans_skb,
-				   info->snd_pid,
-				   0,
-				   netlbl_unlabel_gnl_family.id,
-				   NLBL_UNLABEL_C_LIST) == NULL)
+	data = netlbl_netlink_hdr_put(ans_skb,
+				      info->snd_pid,
+				      info->snd_seq,
+				      netlbl_unlabel_gnl_family.id,
+				      0,
+				      NLBL_UNLABEL_C_LIST);
+	if (data == NULL) {
+		ret_val = -ENOMEM;
 		goto list_failure;
+	}
 
-	ret_val = nla_put_u32(ans_skb,
-			      NLA_U32,
-			      atomic_read(&netlabel_unlabel_accept_flg));
+	ret_val = nla_put_u8(ans_skb,
+			     NLBL_UNLABEL_A_ACPTFLG,
+			     atomic_read(&netlabel_unlabel_accept_flg));
 	if (ret_val != 0)
 		goto list_failure;
 
-	ret_val = netlbl_netlink_snd(ans_skb, info->snd_pid);
+	genlmsg_end(ans_skb, data);
+
+	ret_val = genlmsg_unicast(ans_skb, info->snd_pid);
 	if (ret_val != 0)
 		goto list_failure;
-
 	return 0;
 
 list_failure:
-	netlbl_netlink_send_ack(info,
-				netlbl_unlabel_gnl_family.id,
-				NLBL_UNLABEL_C_ACK,
-				-ret_val);
+	kfree(ans_skb);
 	return ret_val;
 }
 
@@ -157,7 +148,8 @@ list_failure:
 
 static struct genl_ops netlbl_unlabel_genl_c_accept = {
 	.cmd = NLBL_UNLABEL_C_ACCEPT,
-	.flags = 0,
+	.flags = GENL_ADMIN_PERM,
+	.policy = netlbl_unlabel_genl_policy,
 	.doit = netlbl_unlabel_accept,
 	.dumpit = NULL,
 };
@@ -165,6 +157,7 @@ static struct genl_ops netlbl_unlabel_genl_c_accept = {
 static struct genl_ops netlbl_unlabel_genl_c_list = {
 	.cmd = NLBL_UNLABEL_C_LIST,
 	.flags = 0,
+	.policy = netlbl_unlabel_genl_policy,
 	.doit = netlbl_unlabel_list,
 	.dumpit = NULL,
 };
@@ -218,10 +211,8 @@ int netlbl_unlabel_genl_init(void)
  */
 int netlbl_unlabel_getattr(struct netlbl_lsm_secattr *secattr)
 {
-	if (atomic_read(&netlabel_unlabel_accept_flg) == 1) {
-		memset(secattr, 0, sizeof(*secattr));
-		return 0;
-	}
+	if (atomic_read(&netlabel_unlabel_accept_flg) == 1)
+		return netlbl_secattr_init(secattr);
 
 	return -ENOMSG;
 }
diff --git a/net/netlabel/netlabel_unlabeled.h b/net/netlabel/netlabel_unlabeled.h
index f300e54e14b..c2917fbb42c 100644
--- a/net/netlabel/netlabel_unlabeled.h
+++ b/net/netlabel/netlabel_unlabeled.h
@@ -36,56 +36,47 @@
 /*
  * The following NetLabel payloads are supported by the Unlabeled subsystem.
  *
- * o ACK:
- *   Sent by the kernel in response to an applications message, applications
- *   should never send this message.
- *
- *   +----------------------+-----------------------+
- *   | seq number (32 bits) | return code (32 bits) |
- *   +----------------------+-----------------------+
- *
- *     seq number:  the sequence number of the original message, taken from the
- *                  nlmsghdr structure
- *     return code: return value, based on errno values
- *
  * o ACCEPT
  *   This message is sent from an application to specify if the kernel should
  *   allow unlabled packets to pass if they do not match any of the static
  *   mappings defined in the unlabeled module.
  *
- *   +-----------------+
- *   | allow (32 bits) |
- *   +-----------------+
+ *   Required attributes:
  *
- *     allow: if true (1) then allow the packets to pass, if false (0) then
- *            reject the packets
+ *     NLBL_UNLABEL_A_ACPTFLG
  *
  * o LIST
  *   This message can be sent either from an application or by the kernel in
  *   response to an application generated LIST message.  When sent by an
  *   application there is no payload.  The kernel should respond to a LIST
- *   message either with a LIST message on success or an ACK message on
- *   failure.
+ *   message with a LIST message on success.
  *
- *   +-----------------------+
- *   | accept flag (32 bits) |
- *   +-----------------------+
+ *   Required attributes:
  *
- *     accept flag: if true (1) then unlabeled packets are allowed to pass,
- *                  if false (0) then unlabeled packets are rejected
+ *     NLBL_UNLABEL_A_ACPTFLG
  *
  */
 
 /* NetLabel Unlabeled commands */
 enum {
 	NLBL_UNLABEL_C_UNSPEC,
-	NLBL_UNLABEL_C_ACK,
 	NLBL_UNLABEL_C_ACCEPT,
 	NLBL_UNLABEL_C_LIST,
 	__NLBL_UNLABEL_C_MAX,
 };
 #define NLBL_UNLABEL_C_MAX (__NLBL_UNLABEL_C_MAX - 1)
 
+/* NetLabel Unlabeled attributes */
+enum {
+	NLBL_UNLABEL_A_UNSPEC,
+	NLBL_UNLABEL_A_ACPTFLG,
+	/* (NLA_U8)
+	 * if true then unlabeled packets are allowed to pass, else unlabeled
+	 * packets are rejected */
+	__NLBL_UNLABEL_A_MAX,
+};
+#define NLBL_UNLABEL_A_MAX (__NLBL_UNLABEL_A_MAX - 1)
+
 /* NetLabel protocol functions */
 int netlbl_unlabel_genl_init(void);
 
-- 
cgit v1.2.3


From 4cc6773508299377099aa30cf30e6a2196c5872d Mon Sep 17 00:00:00 2001
From: Paul Moore <paul.moore@hp.com>
Date: Mon, 25 Sep 2006 15:57:13 -0700
Subject: [NetLabel]: update docs with website information

Now that all of the supporting pieces of NetLabel have a home at SourceForge
update the Kconfig help text and add an entry to the MAINTAINERS file.

Signed-off-by: Paul Moore <paul.moore@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS          | 7 +++++++
 net/netlabel/Kconfig | 5 ++++-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 16af51ba44a..23348c0d37b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2031,6 +2031,13 @@ L:	netfilter@lists.netfilter.org
 L:	netfilter-devel@lists.netfilter.org
 S:	Supported
 
+NETLABEL
+P:	Paul Moore
+M:	paul.moore@hp.com
+W:	http://netlabel.sf.net
+L:	netdev@vger.kernel.org
+S:	Supported
+
 NETROM NETWORK LAYER
 P:	Ralf Baechle
 M:	ralf@linux-mips.org
diff --git a/net/netlabel/Kconfig b/net/netlabel/Kconfig
index fe23cb7f1e8..9f7121ae13e 100644
--- a/net/netlabel/Kconfig
+++ b/net/netlabel/Kconfig
@@ -9,6 +9,9 @@ config NETLABEL
 	---help---
 	  NetLabel provides support for explicit network packet labeling
 	  protocols such as CIPSO and RIPSO.  For more information see
-	  Documentation/netlabel.
+	  Documentation/netlabel as well as the NetLabel SourceForge project
+	  for configuration tools and additional documentation.
+
+	   * http://netlabel.sf.net
 
 	  If you are unsure, say N.
-- 
cgit v1.2.3


From a4c0291aa942dceddabe23bf2b74addb958d0964 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Mon, 25 Sep 2006 14:00:45 -0700
Subject: [SPARC64]: Fix section-mismatch errors in solaris emul module.

init_socksys() was marked __init but invoked from a
non-__init function.

Use the correct module_{init,exit}() faciltiies while we're
here and eliminate some seriously bogus ifdefs.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc64/solaris/misc.c    | 20 ++++----------------
 arch/sparc64/solaris/socksys.c |  6 ++----
 2 files changed, 6 insertions(+), 20 deletions(-)

diff --git a/arch/sparc64/solaris/misc.c b/arch/sparc64/solaris/misc.c
index 8135ec322c9..642541769a1 100644
--- a/arch/sparc64/solaris/misc.c
+++ b/arch/sparc64/solaris/misc.c
@@ -736,20 +736,15 @@ struct exec_domain solaris_exec_domain = {
 
 extern int init_socksys(void);
 
-#ifdef MODULE
-
 MODULE_AUTHOR("Jakub Jelinek (jj@ultra.linux.cz), Patrik Rak (prak3264@ss1000.ms.mff.cuni.cz)");
 MODULE_DESCRIPTION("Solaris binary emulation module");
 MODULE_LICENSE("GPL");
 
-#ifdef __sparc_v9__
 extern u32 tl0_solaris[8];
 #define update_ttable(x) 										\
 	tl0_solaris[3] = (((long)(x) - (long)tl0_solaris - 3) >> 2) | 0x40000000;			\
 	wmb();		\
 	__asm__ __volatile__ ("flush %0" : : "r" (&tl0_solaris[3]))
-#else
-#endif	
 
 extern u32 solaris_sparc_syscall[];
 extern u32 solaris_syscall[];
@@ -757,7 +752,7 @@ extern void cleanup_socksys(void);
 
 extern u32 entry64_personality_patch;
 
-int init_module(void)
+static int __init solaris_init(void)
 {
 	int ret;
 
@@ -777,19 +772,12 @@ int init_module(void)
 	return 0;
 }
 
-void cleanup_module(void)
+static void __exit solaris_exit(void)
 {
 	update_ttable(solaris_syscall);
 	cleanup_socksys();
 	unregister_exec_domain(&solaris_exec_domain);
 }
 
-#else
-int init_solaris_emul(void)
-{
-	register_exec_domain(&solaris_exec_domain);
-	init_socksys();
-	return 0;
-}
-#endif
-
+module_init(solaris_init);
+module_exit(solaris_exit);
diff --git a/arch/sparc64/solaris/socksys.c b/arch/sparc64/solaris/socksys.c
index bc3df95bc05..7c90e41fd3b 100644
--- a/arch/sparc64/solaris/socksys.c
+++ b/arch/sparc64/solaris/socksys.c
@@ -168,8 +168,7 @@ static struct file_operations socksys_fops = {
 	.release =	socksys_release,
 };
 
-int __init
-init_socksys(void)
+int __init init_socksys(void)
 {
 	int ret;
 	struct file * file;
@@ -199,8 +198,7 @@ init_socksys(void)
 	return 0;
 }
 
-void
-cleanup_socksys(void)
+void __exit cleanup_socksys(void)
 {
 	if (unregister_chrdev(30, "socksys"))
 		printk ("Couldn't unregister socksys character device\n");
-- 
cgit v1.2.3


From efdbc1a7caf770b1b312000a42c630597f06973d Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Mon, 25 Sep 2006 14:04:49 -0700
Subject: [SUNLANCE]: Mark sparc_lance_probe_one as __devinit.

Fixes section mismatch warnings when built as a module.

Also, mark find_ledma and sun4 init function as __devinit
too.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/sunlance.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/net/sunlance.c b/drivers/net/sunlance.c
index 77670741e10..feb42db10ee 100644
--- a/drivers/net/sunlance.c
+++ b/drivers/net/sunlance.c
@@ -1323,9 +1323,9 @@ static const struct ethtool_ops sparc_lance_ethtool_ops = {
 	.get_link		= sparc_lance_get_link,
 };
 
-static int __init sparc_lance_probe_one(struct sbus_dev *sdev,
-					struct sbus_dma *ledma,
-					struct sbus_dev *lebuffer)
+static int __devinit sparc_lance_probe_one(struct sbus_dev *sdev,
+					   struct sbus_dma *ledma,
+					   struct sbus_dev *lebuffer)
 {
 	static unsigned version_printed;
 	struct net_device *dev;
@@ -1515,7 +1515,7 @@ fail:
 }
 
 /* On 4m, find the associated dma for the lance chip */
-static inline struct sbus_dma *find_ledma(struct sbus_dev *sdev)
+static struct sbus_dma * __devinit find_ledma(struct sbus_dev *sdev)
 {
 	struct sbus_dma *p;
 
@@ -1533,7 +1533,7 @@ static inline struct sbus_dma *find_ledma(struct sbus_dev *sdev)
 
 /* Find all the lance cards on the system and initialize them */
 static struct sbus_dev sun4_sdev;
-static int __init sparc_lance_init(void)
+static int __devinit sparc_lance_init(void)
 {
 	if ((idprom->id_machtype == (SM_SUN4|SM_4_330)) ||
 	    (idprom->id_machtype == (SM_SUN4|SM_4_470))) {
-- 
cgit v1.2.3


From be5b6d3d6cb7311893c9fbeebf094591d5f760a8 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Mon, 25 Sep 2006 14:08:37 -0700
Subject: [SOUND] sparc/amd7930: Use __devinit and __devinitdata as needed.

Fixes section-mismatch errors.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 sound/sparc/amd7930.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/sound/sparc/amd7930.c b/sound/sparc/amd7930.c
index 2bd8e40b854..be0bd503f01 100644
--- a/sound/sparc/amd7930.c
+++ b/sound/sparc/amd7930.c
@@ -755,7 +755,7 @@ static struct snd_pcm_ops snd_amd7930_capture_ops = {
 	.pointer	=	snd_amd7930_capture_pointer,
 };
 
-static int __init snd_amd7930_pcm(struct snd_amd7930 *amd)
+static int __devinit snd_amd7930_pcm(struct snd_amd7930 *amd)
 {
 	struct snd_pcm *pcm;
 	int err;
@@ -870,7 +870,7 @@ static int snd_amd7930_put_volume(struct snd_kcontrol *kctl, struct snd_ctl_elem
 	return change;
 }
 
-static struct snd_kcontrol_new amd7930_controls[] __initdata = {
+static struct snd_kcontrol_new amd7930_controls[] __devinitdata = {
 	{
 		.iface		=	SNDRV_CTL_ELEM_IFACE_MIXER,
 		.name		=	"Monitor Volume",
@@ -900,7 +900,7 @@ static struct snd_kcontrol_new amd7930_controls[] __initdata = {
 	},
 };
 
-static int __init snd_amd7930_mixer(struct snd_amd7930 *amd)
+static int __devinit snd_amd7930_mixer(struct snd_amd7930 *amd)
 {
 	struct snd_card *card;
 	int idx, err;
@@ -945,11 +945,11 @@ static struct snd_device_ops snd_amd7930_dev_ops = {
 	.dev_free	=	snd_amd7930_dev_free,
 };
 
-static int __init snd_amd7930_create(struct snd_card *card,
-				     struct resource *rp,
-				     unsigned int reg_size,
-				     int irq, int dev,
-				     struct snd_amd7930 **ramd)
+static int __devinit snd_amd7930_create(struct snd_card *card,
+					struct resource *rp,
+					unsigned int reg_size,
+					int irq, int dev,
+					struct snd_amd7930 **ramd)
 {
 	unsigned long flags;
 	struct snd_amd7930 *amd;
@@ -1013,7 +1013,7 @@ static int __init snd_amd7930_create(struct snd_card *card,
 	return 0;
 }
 
-static int __init amd7930_attach_common(struct resource *rp, int irq)
+static int __devinit amd7930_attach_common(struct resource *rp, int irq)
 {
 	static int dev_num;
 	struct snd_card *card;
@@ -1065,7 +1065,7 @@ out_err:
 	return err;
 }
 
-static int __init amd7930_obio_attach(struct device_node *dp)
+static int __devinit amd7930_obio_attach(struct device_node *dp)
 {
 	struct linux_prom_registers *regs;
 	struct linux_prom_irqs *irqp;
-- 
cgit v1.2.3


From c32a8fd7cb33f30bcd855188dc8e243b144c5cee Mon Sep 17 00:00:00 2001
From: Henne <henne@nachtwindheim.de>
Date: Mon, 25 Sep 2006 22:00:46 +0200
Subject: [PATCH] ata-piix: fixes kerneldoc error

Fixes an error in kerneldoc of ata_piix.c.
Signed-off-by: Henrik Kretzschmar <henne@nachtwindheim.de>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/ata/ata_piix.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/ata/ata_piix.c b/drivers/ata/ata_piix.c
index ab2ecccf779..ffa111eea9d 100644
--- a/drivers/ata/ata_piix.c
+++ b/drivers/ata/ata_piix.c
@@ -851,7 +851,7 @@ static void piix_set_piomode (struct ata_port *ap, struct ata_device *adev)
  *	@ap: Port whose timings we are configuring
  *	@adev: Drive in question
  *	@udma: udma mode, 0 - 6
- *	@is_ich: set if the chip is an ICH device
+ *	@isich: set if the chip is an ICH device
  *
  *	Set UDMA mode for device, in host controller PCI config space.
  *
-- 
cgit v1.2.3


From c0ba7e5147829eaa607351997bccd06200a8db12 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Mon, 25 Sep 2006 16:24:16 -0700
Subject: [PATCH] autofs4: zero timeout prevents shutdown

If the timeout of an autofs mount is set to zero then umounts are disabled.
 This works fine, however the kernel module checks the expire timeout and
goes no further if it is zero.  This is not the right thing to do at
shutdown as the module is passed an option to expire mounts regardless of
their timeout setting.

This patch allows autofs to honor the force expire option.

Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/autofs4/expire.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 8dbd44f10e9..d96e5c14a9c 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -32,7 +32,7 @@ static inline int autofs4_can_expire(struct dentry *dentry,
 
 	if (!do_now) {
 		/* Too young to die */
-		if (time_after(ino->last_used + timeout, now))
+		if (!timeout || time_after(ino->last_used + timeout, now))
 			return 0;
 
 		/* update last_used here :-
@@ -253,7 +253,7 @@ static struct dentry *autofs4_expire_direct(struct super_block *sb,
 	struct dentry *root = dget(sb->s_root);
 	int do_now = how & AUTOFS_EXP_IMMEDIATE;
 
-	if (!sbi->exp_timeout || !root)
+	if (!root)
 		return NULL;
 
 	now = jiffies;
@@ -293,7 +293,7 @@ static struct dentry *autofs4_expire_indirect(struct super_block *sb,
 	int do_now = how & AUTOFS_EXP_IMMEDIATE;
 	int exp_leaves = how & AUTOFS_EXP_LEAVES;
 
-	if ( !sbi->exp_timeout || !root )
+	if (!root)
 		return NULL;
 
 	now = jiffies;
-- 
cgit v1.2.3


From 0b16f21f144010aa627c58b3a33be49ebfd685dc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 25 Sep 2006 16:24:23 -0700
Subject: [PATCH] rtc: lockdep fix/workaround

BUG: warning at kernel/lockdep.c:1816/trace_hardirqs_on() (Not tainted)
 [<c04051ee>] show_trace_log_lvl+0x58/0x171
 [<c0405802>] show_trace+0xd/0x10
 [<c040591b>] dump_stack+0x19/0x1b
 [<c043abee>] trace_hardirqs_on+0xa2/0x11e
 [<c06143c3>] _spin_unlock_irq+0x22/0x26
 [<c0541540>] rtc_get_rtc_time+0x32/0x176
 [<c0419ba4>] hpet_rtc_interrupt+0x92/0x14d
 [<c0450f94>] handle_IRQ_event+0x20/0x4d
 [<c0451055>] __do_IRQ+0x94/0xef
 [<c040678d>] do_IRQ+0x9e/0xbd
 [<c0404a49>] common_interrupt+0x25/0x2c
DWARF2 unwinder stuck at common_interrupt+0x25/0x2c

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/char/rtc.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/char/rtc.c b/drivers/char/rtc.c
index 6e6a7c7a7ef..ab6429b4a84 100644
--- a/drivers/char/rtc.c
+++ b/drivers/char/rtc.c
@@ -209,11 +209,12 @@ static const unsigned char days_in_mo[] =
  */
 static inline unsigned char rtc_is_updating(void)
 {
+	unsigned long flags;
 	unsigned char uip;
 
-	spin_lock_irq(&rtc_lock);
+	spin_lock_irqsave(&rtc_lock, flags);
 	uip = (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP);
-	spin_unlock_irq(&rtc_lock);
+	spin_unlock_irqrestore(&rtc_lock, flags);
 	return uip;
 }
 
-- 
cgit v1.2.3


From 24fd425edd53ea580cad917e825c1f6715e9b939 Mon Sep 17 00:00:00 2001
From: keith mannthey <kmannth@us.ibm.com>
Date: Mon, 25 Sep 2006 16:24:39 -0700
Subject: [PATCH] i386 bootioremap / kexec fix

With CONFIG_PHYSICAL_START set to a non default values the i386
boot_ioremap code calculated its pte index wrong and users of boot_ioremap
have their areas incorrectly mapped (for me SRAT table not mapped during
early boot).  This patch removes the addr < BOOT_PTE_PTRS constraint.

[ Keith says this is applicable to 2.6.16 and 2.6.17 as well ]

Signed-off-by: Keith Mannthey<kmannth@us.ibm.com>
Cc: Vivek Goyal <vgoyal@in.ibm.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: <stable@kernel.org>
Cc: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/mm/boot_ioremap.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/i386/mm/boot_ioremap.c b/arch/i386/mm/boot_ioremap.c
index 5d44f4f5ff5..4de11f508c3 100644
--- a/arch/i386/mm/boot_ioremap.c
+++ b/arch/i386/mm/boot_ioremap.c
@@ -29,8 +29,11 @@
  */
 
 #define BOOT_PTE_PTRS (PTRS_PER_PTE*2)
-#define boot_pte_index(address) \
-	     (((address) >> PAGE_SHIFT) & (BOOT_PTE_PTRS - 1))
+
+static unsigned long boot_pte_index(unsigned long vaddr) 
+{
+	return __pa(vaddr) >> PAGE_SHIFT;
+}
 
 static inline boot_pte_t* boot_vaddr_to_pte(void *address)
 {
-- 
cgit v1.2.3


From f3ef9ead31ae995251b420ac98398bd7545bf4e1 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@cs.washington.edu>
Date: Mon, 25 Sep 2006 16:24:57 -0700
Subject: [PATCH] do not free non slab allocated per_cpu_pageset

Stops panic associated with attempting to free a non slab-allocated
per_cpu_pageset.

Signed-off-by: David Rientjes <rientjes@cs.washington.edu>
Acked-by: Christoph Lameter <clameter@sgi.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/page_alloc.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3b5358a0561..8a52ba9fe69 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1845,8 +1845,10 @@ static inline void free_zone_pagesets(int cpu)
 	for_each_zone(zone) {
 		struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
 
+		/* Free per_cpu_pageset if it is slab allocated */
+		if (pset != &boot_pageset[cpu])
+			kfree(pset);
 		zone_pcp(zone, cpu) = NULL;
-		kfree(pset);
 	}
 }
 
-- 
cgit v1.2.3


From 25981de5b836581364612a4b1fe27db3b5d1f861 Mon Sep 17 00:00:00 2001
From: Michael Hanselmann <linux-kernel@hansmi.ch>
Date: Mon, 25 Sep 2006 16:25:07 -0700
Subject: [PATCH] backlight: fix oops in __mutex_lock_slowpath during head
 /sys/class/graphics/fb0/*

Seems like not all drivers use the framebuffer_alloc() function and won't
have an initialized mutex.  But those don't have a backlight, anyway.

Signed-off-by: Michael Hanselmann <linux-kernel@hansmi.ch>
Cc: Olaf Hering <olaf@aepfle.de>
Cc: "Antonino A. Daplas" <adaplas@pol.net>
Cc: Daniel R Thompson <daniel.thompson@st.com>
Cc: Jon Smirl <jonsmirl@gmail.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/video/fbsysfs.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/drivers/video/fbsysfs.c b/drivers/video/fbsysfs.c
index 4f78f234473..c151dcf6878 100644
--- a/drivers/video/fbsysfs.c
+++ b/drivers/video/fbsysfs.c
@@ -397,6 +397,12 @@ static ssize_t store_bl_curve(struct class_device *class_device,
 	u8 tmp_curve[FB_BACKLIGHT_LEVELS];
 	unsigned int i;
 
+	/* Some drivers don't use framebuffer_alloc(), but those also
+	 * don't have backlights.
+	 */
+	if (!fb_info || !fb_info->bl_dev)
+		return -ENODEV;
+
 	if (count != (FB_BACKLIGHT_LEVELS / 8 * 24))
 		return -EINVAL;
 
@@ -430,6 +436,12 @@ static ssize_t show_bl_curve(struct class_device *class_device, char *buf)
 	ssize_t len = 0;
 	unsigned int i;
 
+	/* Some drivers don't use framebuffer_alloc(), but those also
+	 * don't have backlights.
+	 */
+	if (!fb_info || !fb_info->bl_dev)
+		return -ENODEV;
+
 	mutex_lock(&fb_info->bl_mutex);
 	for (i = 0; i < FB_BACKLIGHT_LEVELS; i += 8)
 		len += snprintf(&buf[len], PAGE_SIZE,
-- 
cgit v1.2.3


From 08992986497471ce575f23796268fb1b50b5c2ab Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Mon, 25 Sep 2006 16:25:21 -0700
Subject: [PATCH] cpu to node relationship fixup: acpi_map_cpu2node

Problem description:

  We have additional_cpus= option for allocating possible_cpus.  But nid
  for possible cpus are not fixed at boot time.  cpus which is offlined at
  boot or cpus which is not on SRAT is not tied to its node.  This will
  cause panic at cpu onlining.

Usually, pxm_to_nid() mapping is fixed at boot time by SRAT.

But, unfortunately, some system (my system!) do not include
full SRAT table for possible cpus.  (Then, I use
additiona_cpus= option.)

For such possible cpus, pxm<->nid should be fixed at
hot-add.  We now have acpi_map_pxm_to_node() which is also
used at boot.  It's suitable here.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ia64/kernel/acpi.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
index 0176556aeec..32c3abededc 100644
--- a/arch/ia64/kernel/acpi.c
+++ b/arch/ia64/kernel/acpi.c
@@ -771,16 +771,19 @@ int acpi_map_cpu2node(acpi_handle handle, int cpu, long physid)
 {
 #ifdef CONFIG_ACPI_NUMA
 	int pxm_id;
+	int nid;
 
 	pxm_id = acpi_get_pxm(handle);
-
 	/*
-	 * Assuming that the container driver would have set the proximity
-	 * domain and would have initialized pxm_to_node(pxm_id) && pxm_flag
+	 * We don't have cpu-only-node hotadd. But if the system equips
+	 * SRAT table, pxm is already found and node is ready.
+  	 * So, just pxm_to_nid(pxm) is OK.
+	 * This code here is for the system which doesn't have full SRAT
+  	 * table for possible cpus.
 	 */
-	node_cpuid[cpu].nid = (pxm_id < 0) ? 0 : pxm_to_node(pxm_id);
-
+	nid = acpi_map_pxm_to_node(pxm_id);
 	node_cpuid[cpu].phys_id = physid;
+	node_cpuid[cpu].nid = nid;
 #endif
 	return (0);
 }
-- 
cgit v1.2.3


From 3212fe1594e577463bc8601d28aa008f520c3377 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Mon, 25 Sep 2006 16:25:31 -0700
Subject: [PATCH] cpu to node relationship fixup: map cpu to node

Assume that a cpu is *physically* offlined at boot time...

Because smpboot.c::smp_boot_cpu_map() canoot find cpu's sapicid,
numa.c::build_cpu_to_node_map() cannot build cpu<->node map for
offlined cpu.

For such cpus, cpu_to_node map should be fixed at cpu-hot-add.
This mapping should be done before cpu onlining.

This patch also handles cpu hotremove case.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ia64/kernel/numa.c     | 34 +++++++++++++++++++++++++++++++---
 arch/ia64/kernel/topology.c |  6 +++++-
 include/asm-ia64/numa.h     |  4 ++++
 3 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/arch/ia64/kernel/numa.c b/arch/ia64/kernel/numa.c
index 1cc360c83e7..20340631179 100644
--- a/arch/ia64/kernel/numa.c
+++ b/arch/ia64/kernel/numa.c
@@ -29,6 +29,36 @@ EXPORT_SYMBOL(cpu_to_node_map);
 
 cpumask_t node_to_cpu_mask[MAX_NUMNODES] __cacheline_aligned;
 
+void __cpuinit map_cpu_to_node(int cpu, int nid)
+{
+	int oldnid;
+	if (nid < 0) { /* just initialize by zero */
+		cpu_to_node_map[cpu] = 0;
+		return;
+	}
+	/* sanity check first */
+	oldnid = cpu_to_node_map[cpu];
+	if (cpu_isset(cpu, node_to_cpu_mask[oldnid])) {
+		return; /* nothing to do */
+	}
+	/* we don't have cpu-driven node hot add yet...
+	   In usual case, node is created from SRAT at boot time. */
+	if (!node_online(nid))
+		nid = first_online_node;
+	cpu_to_node_map[cpu] = nid;
+	cpu_set(cpu, node_to_cpu_mask[nid]);
+	return;
+}
+
+void __cpuinit unmap_cpu_from_node(int cpu, int nid)
+{
+	WARN_ON(!cpu_isset(cpu, node_to_cpu_mask[nid]));
+	WARN_ON(cpu_to_node_map[cpu] != nid);
+	cpu_to_node_map[cpu] = 0;
+	cpu_clear(cpu, node_to_cpu_mask[nid]);
+}
+
+
 /**
  * build_cpu_to_node_map - setup cpu to node and node to cpumask arrays
  *
@@ -49,8 +79,6 @@ void __init build_cpu_to_node_map(void)
 				node = node_cpuid[i].nid;
 				break;
 			}
-		cpu_to_node_map[cpu] = (node >= 0) ? node : 0;
-		if (node >= 0)
-			cpu_set(cpu, node_to_cpu_mask[node]);
+		map_cpu_to_node(cpu, node);
 	}
 }
diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c
index f648c610b10..05bdf7affb4 100644
--- a/arch/ia64/kernel/topology.c
+++ b/arch/ia64/kernel/topology.c
@@ -36,6 +36,9 @@ int arch_register_cpu(int num)
 	 */
 	if (!can_cpei_retarget() && is_cpu_cpei_target(num))
 		sysfs_cpus[num].cpu.no_control = 1;
+#ifdef CONFIG_NUMA
+	map_cpu_to_node(num, node_cpuid[num].nid);
+#endif
 #endif
 
 	return register_cpu(&sysfs_cpus[num].cpu, num);
@@ -45,7 +48,8 @@ int arch_register_cpu(int num)
 
 void arch_unregister_cpu(int num)
 {
-	return unregister_cpu(&sysfs_cpus[num].cpu);
+	unregister_cpu(&sysfs_cpus[num].cpu);
+	unmap_cpu_from_node(num, cpu_to_node(num));
 }
 EXPORT_SYMBOL(arch_register_cpu);
 EXPORT_SYMBOL(arch_unregister_cpu);
diff --git a/include/asm-ia64/numa.h b/include/asm-ia64/numa.h
index e5a8260593a..e0a1d173e42 100644
--- a/include/asm-ia64/numa.h
+++ b/include/asm-ia64/numa.h
@@ -64,6 +64,10 @@ extern int paddr_to_nid(unsigned long paddr);
 
 #define local_nodeid (cpu_to_node_map[smp_processor_id()])
 
+extern void map_cpu_to_node(int cpu, int nid);
+extern void unmap_cpu_from_node(int cpu, int nid);
+
+
 #else /* !CONFIG_NUMA */
 
 #define paddr_to_nid(addr)	0
-- 
cgit v1.2.3


From bfa0e9a07cd31f3858239dbc93011b82780acf4b Mon Sep 17 00:00:00 2001
From: keith mannthey <kmannth@us.ibm.com>
Date: Mon, 25 Sep 2006 16:25:35 -0700
Subject: [PATCH] i386: fix flat mode numa on a real numa system

If there is only 1 node in the system cpus should think they are apart of
some other node.

If cases where a real numa system boots the Flat numa option make sure the
cpus don't claim to be apart on a non-existent node.

Signed-off-by: Keith Mannthey <kmannth@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andi Kleen <ak@suse.de>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/smpboot.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index f948419c888..efe07990e7f 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -642,9 +642,13 @@ static void map_cpu_to_logical_apicid(void)
 {
 	int cpu = smp_processor_id();
 	int apicid = logical_smp_processor_id();
+	int node = apicid_to_node(apicid);
+
+	if (!node_online(node))
+		node = first_online_node;
 
 	cpu_2_logical_apicid[cpu] = apicid;
-	map_cpu_to_node(cpu, apicid_to_node(apicid));
+	map_cpu_to_node(cpu, node);
 }
 
 static void unmap_cpu_to_logical_apicid(int cpu)
-- 
cgit v1.2.3


From 1cc5f7142eca352109895fe20b1fc6405dd17727 Mon Sep 17 00:00:00 2001
From: Ed Swierk <eswierk@arastra.com>
Date: Mon, 25 Sep 2006 16:25:36 -0700
Subject: [PATCH] load_module: no BUG if module_subsys uninitialized

Invoking load_module() before param_sysfs_init() is called crashes in
mod_sysfs_setup(), since the kset in module_subsys is not initialized yet.

In my case, net-pf-1 is getting modprobed as a result of hotplug trying to
create a UNIX socket.  Calls to hotplug begin after the topology_init
initcall.

Another patch for the same symptom (module_subsys-initialize-earlier.patch)
moves param_sysfs_init() to the subsys initcalls, but this is still not
early enough in the boot process in some cases.  In particular,
topology_init() causes /sbin/hotplug to run, which requests net-pf-1 (the
UNIX socket protocol) which can be compiled as a module.  Moving
param_sysfs_init() to the postcore initcalls fixes this particular race,
but there might well be other cases where a usermodehelper causes a module
to load earlier still.

The patch makes load_module() return an error rather than crashing the
kernel if invoked before module_subsys is initialized.

Cc: Mark Huang <mlhuang@cs.princeton.edu>
Cc: Greg KH <greg@kroah.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/module.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/kernel/module.c b/kernel/module.c
index 2a19cd47c04..b7fe6e84096 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1054,6 +1054,12 @@ static int mod_sysfs_setup(struct module *mod,
 {
 	int err;
 
+	if (!module_subsys.kset.subsys) {
+		printk(KERN_ERR "%s: module_subsys not initialized\n",
+		       mod->name);
+		err = -EINVAL;
+		goto out;
+	}
 	memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj));
 	err = kobject_set_name(&mod->mkobj.kobj, "%s", mod->name);
 	if (err)
-- 
cgit v1.2.3


From b7de567bf3187ccf776e2fe0e241593cdcba5459 Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet-v4l@lwn.net>
Date: Mon, 25 Sep 2006 16:25:37 -0700
Subject: [PATCH] VIDIOC_ENUMSTD bug

The v4l2 API documentation for VIDIOC_ENUMSTD says:

	To enumerate all standards applications shall begin at index
	zero, incrementing by one until the driver returns EINVAL.

The actual code, however, tests the index this way:

               if (index<=0 || index >= vfd->tvnormsize) {
                        ret=-EINVAL;

So any application which passes in index=0 gets EINVAL right off the bat
- and, in fact, this is what happens to mplayer.  So I think the
following patch is called for, and maybe even appropriate for a 2.6.18.x
stable release.

Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Cc: Mauro Carvalho Chehab <mchehab@infradead.org>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/media/video/videodev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/media/video/videodev.c b/drivers/media/video/videodev.c
index 88bf2af2a0e..edd7b83c346 100644
--- a/drivers/media/video/videodev.c
+++ b/drivers/media/video/videodev.c
@@ -836,7 +836,7 @@ static int __video_do_ioctl(struct inode *inode, struct file *file,
 			break;
 		}
 
-		if (index<=0 || index >= vfd->tvnormsize) {
+		if (index < 0 || index >= vfd->tvnormsize) {
 			ret=-EINVAL;
 			break;
 		}
-- 
cgit v1.2.3


From 29da9f6d9cc3685ae7f6c8b817f6ed8864c78a4c Mon Sep 17 00:00:00 2001
From: Jeff Garzik <jeff@garzik.org>
Date: Mon, 25 Sep 2006 21:56:33 -0400
Subject: [libata] Fix oops introduced in non-uniform port handling fix

Noticed by several people.

Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/ata/sata_nv.c  | 6 +++---
 drivers/ata/sata_sis.c | 6 +++---
 drivers/ata/sata_uli.c | 6 +++---
 drivers/ata/sata_via.c | 7 ++++---
 4 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/drivers/ata/sata_nv.c b/drivers/ata/sata_nv.c
index 27c22feebf3..8cd730fe5dd 100644
--- a/drivers/ata/sata_nv.c
+++ b/drivers/ata/sata_nv.c
@@ -484,7 +484,7 @@ static void nv_error_handler(struct ata_port *ap)
 static int nv_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
 {
 	static int printed_version = 0;
-	struct ata_port_info *ppi;
+	struct ata_port_info *ppi[2];
 	struct ata_probe_ent *probe_ent;
 	int pci_dev_busy = 0;
 	int rc;
@@ -520,8 +520,8 @@ static int nv_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	rc = -ENOMEM;
 
-	ppi = &nv_port_info[ent->driver_data];
-	probe_ent = ata_pci_init_native_mode(pdev, &ppi, ATA_PORT_PRIMARY | ATA_PORT_SECONDARY);
+	ppi[0] = ppi[1] = &nv_port_info[ent->driver_data];
+	probe_ent = ata_pci_init_native_mode(pdev, ppi, ATA_PORT_PRIMARY | ATA_PORT_SECONDARY);
 	if (!probe_ent)
 		goto err_out_regions;
 
diff --git a/drivers/ata/sata_sis.c b/drivers/ata/sata_sis.c
index 9b17375d805..18d49fff8dc 100644
--- a/drivers/ata/sata_sis.c
+++ b/drivers/ata/sata_sis.c
@@ -240,7 +240,7 @@ static int sis_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
 	struct ata_probe_ent *probe_ent = NULL;
 	int rc;
 	u32 genctl;
-	struct ata_port_info *ppi;
+	struct ata_port_info *ppi[2];
 	int pci_dev_busy = 0;
 	u8 pmr;
 	u8 port2_start;
@@ -265,8 +265,8 @@ static int sis_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
 	if (rc)
 		goto err_out_regions;
 
-	ppi = &sis_port_info;
-	probe_ent = ata_pci_init_native_mode(pdev, &ppi, ATA_PORT_PRIMARY | ATA_PORT_SECONDARY);
+	ppi[0] = ppi[1] = &sis_port_info;
+	probe_ent = ata_pci_init_native_mode(pdev, ppi, ATA_PORT_PRIMARY | ATA_PORT_SECONDARY);
 	if (!probe_ent) {
 		rc = -ENOMEM;
 		goto err_out_regions;
diff --git a/drivers/ata/sata_uli.c b/drivers/ata/sata_uli.c
index 8fc6e800011..dd76f37be18 100644
--- a/drivers/ata/sata_uli.c
+++ b/drivers/ata/sata_uli.c
@@ -185,7 +185,7 @@ static int uli_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
 {
 	static int printed_version;
 	struct ata_probe_ent *probe_ent;
-	struct ata_port_info *ppi;
+	struct ata_port_info *ppi[2];
 	int rc;
 	unsigned int board_idx = (unsigned int) ent->driver_data;
 	int pci_dev_busy = 0;
@@ -211,8 +211,8 @@ static int uli_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
 	if (rc)
 		goto err_out_regions;
 
-	ppi = &uli_port_info;
-	probe_ent = ata_pci_init_native_mode(pdev, &ppi, ATA_PORT_PRIMARY | ATA_PORT_SECONDARY);
+	ppi[0] = ppi[1] = &uli_port_info;
+	probe_ent = ata_pci_init_native_mode(pdev, ppi, ATA_PORT_PRIMARY | ATA_PORT_SECONDARY);
 	if (!probe_ent) {
 		rc = -ENOMEM;
 		goto err_out_regions;
diff --git a/drivers/ata/sata_via.c b/drivers/ata/sata_via.c
index 7f087aef99d..a72a2389a11 100644
--- a/drivers/ata/sata_via.c
+++ b/drivers/ata/sata_via.c
@@ -318,9 +318,10 @@ static void vt6421_init_addrs(struct ata_probe_ent *probe_ent,
 static struct ata_probe_ent *vt6420_init_probe_ent(struct pci_dev *pdev)
 {
 	struct ata_probe_ent *probe_ent;
-	struct ata_port_info *ppi = &vt6420_port_info;
-
-	probe_ent = ata_pci_init_native_mode(pdev, &ppi, ATA_PORT_PRIMARY | ATA_PORT_SECONDARY);
+	struct ata_port_info *ppi[2];
+	
+	ppi[0] = ppi[1] = &vt6420_port_info;
+	probe_ent = ata_pci_init_native_mode(pdev, ppi, ATA_PORT_PRIMARY | ATA_PORT_SECONDARY);
 	if (!probe_ent)
 		return NULL;
 
-- 
cgit v1.2.3


From 0a2966b48fb784e437520e400ddc94874ddbd4e8 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <christoph@sgi.com>
Date: Mon, 25 Sep 2006 23:30:51 -0700
Subject: [PATCH] Fix longstanding load balancing bug in the scheduler

The scheduler will stop load balancing if the most busy processor contains
processes pinned via processor affinity.

The scheduler currently only does one search for busiest cpu.  If it cannot
pull any tasks away from the busiest cpu because they were pinned then the
scheduler goes into a corner and sulks leaving the idle processors idle.

F.e.  If you have processor 0 busy running four tasks pinned via taskset,
there are none on processor 1 and one just started two processes on
processor 2 then the scheduler will not move one of the two processes away
from processor 2.

This patch fixes that issue by forcing the scheduler to come out of its
corner and retrying the load balancing by considering other processors for
load balancing.

This patch was originally developed by John Hawkes and discussed at

    http://marc.theaimsgroup.com/?l=linux-kernel&m=113901368523205&w=2.

I have removed extraneous material and gone back to equipping struct rq
with the cpu the queue is associated with since this makes the patch much
easier and it is likely that others in the future will have the same
difficulty of figuring out which processor owns which runqueue.

The overhead added through these patches is a single word on the stack if
the kernel is configured to support 32 cpus or less (32 bit).  For 32 bit
environments the maximum number of cpus that can be configued is 255 which
would result in the use of 32 bytes additional on the stack.  On IA64 up to
1k cpus can be configured which will result in the use of 128 additional
bytes on the stack.  The maximum additional cache footprint is one
cacheline.  Typically memory use will be much less than a cacheline and the
additional cpumask will be placed on the stack in a cacheline that already
contains other local variable.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: John Hawkes <hawkes@sgi.com>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Peter Williams <pwil3058@bigpond.net.au>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 46 insertions(+), 8 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index a234fbee123..5c848fd4e46 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -238,6 +238,7 @@ struct rq {
 	/* For active balancing */
 	int active_balance;
 	int push_cpu;
+	int cpu;		/* cpu of this runqueue */
 
 	struct task_struct *migration_thread;
 	struct list_head migration_queue;
@@ -267,6 +268,15 @@ struct rq {
 
 static DEFINE_PER_CPU(struct rq, runqueues);
 
+static inline int cpu_of(struct rq *rq)
+{
+#ifdef CONFIG_SMP
+	return rq->cpu;
+#else
+	return 0;
+#endif
+}
+
 /*
  * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
  * See detach_destroy_domains: synchronize_sched for details.
@@ -2211,7 +2221,8 @@ out:
  */
 static struct sched_group *
 find_busiest_group(struct sched_domain *sd, int this_cpu,
-		   unsigned long *imbalance, enum idle_type idle, int *sd_idle)
+		   unsigned long *imbalance, enum idle_type idle, int *sd_idle,
+		   cpumask_t *cpus)
 {
 	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
 	unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -2248,7 +2259,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		sum_weighted_load = sum_nr_running = avg_load = 0;
 
 		for_each_cpu_mask(i, group->cpumask) {
-			struct rq *rq = cpu_rq(i);
+			struct rq *rq;
+
+			if (!cpu_isset(i, *cpus))
+				continue;
+
+			rq = cpu_rq(i);
 
 			if (*sd_idle && !idle_cpu(i))
 				*sd_idle = 0;
@@ -2466,13 +2482,17 @@ ret:
  */
 static struct rq *
 find_busiest_queue(struct sched_group *group, enum idle_type idle,
-		   unsigned long imbalance)
+		   unsigned long imbalance, cpumask_t *cpus)
 {
 	struct rq *busiest = NULL, *rq;
 	unsigned long max_load = 0;
 	int i;
 
 	for_each_cpu_mask(i, group->cpumask) {
+
+		if (!cpu_isset(i, *cpus))
+			continue;
+
 		rq = cpu_rq(i);
 
 		if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance)
@@ -2511,6 +2531,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	struct sched_group *group;
 	unsigned long imbalance;
 	struct rq *busiest;
+	cpumask_t cpus = CPU_MASK_ALL;
 
 	if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
 	    !sched_smt_power_savings)
@@ -2518,13 +2539,15 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 
 	schedstat_inc(sd, lb_cnt[idle]);
 
-	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle);
+redo:
+	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
+							&cpus);
 	if (!group) {
 		schedstat_inc(sd, lb_nobusyg[idle]);
 		goto out_balanced;
 	}
 
-	busiest = find_busiest_queue(group, idle, imbalance);
+	busiest = find_busiest_queue(group, idle, imbalance, &cpus);
 	if (!busiest) {
 		schedstat_inc(sd, lb_nobusyq[idle]);
 		goto out_balanced;
@@ -2549,8 +2572,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		double_rq_unlock(this_rq, busiest);
 
 		/* All tasks on this runqueue were pinned by CPU affinity */
-		if (unlikely(all_pinned))
+		if (unlikely(all_pinned)) {
+			cpu_clear(cpu_of(busiest), cpus);
+			if (!cpus_empty(cpus))
+				goto redo;
 			goto out_balanced;
+		}
 	}
 
 	if (!nr_moved) {
@@ -2639,18 +2666,22 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
 	unsigned long imbalance;
 	int nr_moved = 0;
 	int sd_idle = 0;
+	cpumask_t cpus = CPU_MASK_ALL;
 
 	if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
 		sd_idle = 1;
 
 	schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
-	group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle);
+redo:
+	group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE,
+				&sd_idle, &cpus);
 	if (!group) {
 		schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
 		goto out_balanced;
 	}
 
-	busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance);
+	busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance,
+				&cpus);
 	if (!busiest) {
 		schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
 		goto out_balanced;
@@ -2668,6 +2699,12 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
 					minus_1_or_zero(busiest->nr_running),
 					imbalance, sd, NEWLY_IDLE, NULL);
 		spin_unlock(&busiest->lock);
+
+		if (!nr_moved) {
+			cpu_clear(cpu_of(busiest), cpus);
+			if (!cpus_empty(cpus))
+				goto redo;
+		}
 	}
 
 	if (!nr_moved) {
@@ -6747,6 +6784,7 @@ void __init sched_init(void)
 			rq->cpu_load[j] = 0;
 		rq->active_balance = 0;
 		rq->push_cpu = 0;
+		rq->cpu = i;
 		rq->migration_thread = NULL;
 		INIT_LIST_HEAD(&rq->migration_queue);
 #endif
-- 
cgit v1.2.3


From 632bbfeee4f042c05bc65150b4433a297d3fe387 Mon Sep 17 00:00:00 2001
From: Jan Blunck <jblunck@suse.de>
Date: Mon, 25 Sep 2006 23:30:53 -0700
Subject: [PATCH] trigger a syntax error if percpu macros are incorrectly used

get_cpu_var()/per_cpu()/__get_cpu_var() arguments must be simple
identifiers.  Otherwise the arch dependent implementations might break.

This patch enforces the correct usage of the macros by producing a syntax
error if the variable is not a simple identifier.

Signed-off-by: Jan Blunck <jblunck@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-generic/percpu.h |  4 +++-
 include/asm-s390/percpu.h    | 20 +++++++++++---------
 include/asm-x86_64/percpu.h  | 12 +++++++++---
 include/linux/percpu.h       | 10 ++++++++--
 4 files changed, 31 insertions(+), 15 deletions(-)

diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h
index e160e04290f..6d45ee5472a 100644
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -14,7 +14,9 @@ extern unsigned long __per_cpu_offset[NR_CPUS];
     __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
 
 /* var is in discarded region: offset to particular copy we want */
-#define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset[cpu]))
+#define per_cpu(var, cpu) (*({				\
+	extern int simple_indentifier_##var(void);	\
+	RELOC_HIDE(&per_cpu__##var, __per_cpu_offset[cpu]); }))
 #define __get_cpu_var(var) per_cpu(var, smp_processor_id())
 #define __raw_get_cpu_var(var) per_cpu(var, raw_smp_processor_id())
 
diff --git a/include/asm-s390/percpu.h b/include/asm-s390/percpu.h
index 28b3517e787..495ad99c763 100644
--- a/include/asm-s390/percpu.h
+++ b/include/asm-s390/percpu.h
@@ -15,18 +15,20 @@
  */
 #if defined(__s390x__) && defined(MODULE)
 
-#define __reloc_hide(var,offset) \
-  (*({ unsigned long *__ptr; \
-       asm ( "larl %0,per_cpu__"#var"@GOTENT" \
-             : "=a" (__ptr) : "X" (per_cpu__##var) ); \
-       (typeof(&per_cpu__##var))((*__ptr) + (offset)); }))
+#define __reloc_hide(var,offset) (*({			\
+	extern int simple_indentifier_##var(void);	\
+	unsigned long *__ptr;				\
+	asm ( "larl %0,per_cpu__"#var"@GOTENT"		\
+	    : "=a" (__ptr) : "X" (per_cpu__##var) );	\
+	(typeof(&per_cpu__##var))((*__ptr) + (offset));	}))
 
 #else
 
-#define __reloc_hide(var, offset) \
-  (*({ unsigned long __ptr; \
-       asm ( "" : "=a" (__ptr) : "0" (&per_cpu__##var) ); \
-       (typeof(&per_cpu__##var)) (__ptr + (offset)); }))
+#define __reloc_hide(var, offset) (*({				\
+	extern int simple_indentifier_##var(void);		\
+	unsigned long __ptr;					\
+	asm ( "" : "=a" (__ptr) : "0" (&per_cpu__##var) );	\
+	(typeof(&per_cpu__##var)) (__ptr + (offset)); }))
 
 #endif
 
diff --git a/include/asm-x86_64/percpu.h b/include/asm-x86_64/percpu.h
index 08dd9f9dda8..bffb2f886a5 100644
--- a/include/asm-x86_64/percpu.h
+++ b/include/asm-x86_64/percpu.h
@@ -21,9 +21,15 @@
     __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
 
 /* var is in discarded region: offset to particular copy we want */
-#define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset(cpu)))
-#define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __my_cpu_offset()))
-#define __raw_get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __my_cpu_offset()))
+#define per_cpu(var, cpu) (*({				\
+	extern int simple_indentifier_##var(void);	\
+	RELOC_HIDE(&per_cpu__##var, __per_cpu_offset(cpu)); }))
+#define __get_cpu_var(var) (*({				\
+	extern int simple_indentifier_##var(void);	\
+	RELOC_HIDE(&per_cpu__##var, __my_cpu_offset()); }))
+#define __raw_get_cpu_var(var) (*({			\
+	extern int simple_indentifier_##var(void);	\
+	RELOC_HIDE(&per_cpu__##var, __my_cpu_offset()); }))
 
 /* A macro to avoid #include hell... */
 #define percpu_modcopy(pcpudst, src, size)			\
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index cb9039a21f2..f926490a7d8 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -11,8 +11,14 @@
 #define PERCPU_ENOUGH_ROOM 32768
 #endif
 
-/* Must be an lvalue. */
-#define get_cpu_var(var) (*({ preempt_disable(); &__get_cpu_var(var); }))
+/*
+ * Must be an lvalue. Since @var must be a simple identifier,
+ * we force a syntax error here if it isn't.
+ */
+#define get_cpu_var(var) (*({				\
+	extern int simple_indentifier_##var(void);	\
+	preempt_disable();				\
+	&__get_cpu_var(var); }))
 #define put_cpu_var(var) preempt_enable()
 
 #ifdef CONFIG_SMP
-- 
cgit v1.2.3


From 3998b9301d3d55be8373add22b6bc5e11c1d9b71 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 25 Sep 2006 23:30:53 -0700
Subject: [PATCH] jbd: fix commit of ordered data buffers

Original commit code assumes, that when a buffer on BJ_SyncData list is
locked, it is being written to disk.  But this is not true and hence it can
lead to a potential data loss on crash.  Also the code didn't count with
the fact that journal_dirty_data() can steal buffers from committing
transaction and hence could write buffers that no longer belong to the
committing transaction.  Finally it could possibly happen that we tried
writing out one buffer several times.

The patch below tries to solve these problems by a complete rewrite of the
data commit code.  We go through buffers on t_sync_datalist, lock buffers
needing write out and store them in an array.  Buffers are also immediately
refiled to BJ_Locked list or unfiled (if the write out is completed).  When
the array is full or we have to block on buffer lock, we submit all
accumulated buffers for IO.

[suitable for 2.6.18.x around the 2.6.19-rc2 timeframe]

Signed-off-by: Jan Kara <jack@suse.cz>
Cc: Badari Pulavarty <pbadari@us.ibm.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/jbd/commit.c | 182 +++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 113 insertions(+), 69 deletions(-)

diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 42da6078431..32a8caf0c41 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -160,6 +160,117 @@ static int journal_write_commit_record(journal_t *journal,
 	return (ret == -EIO);
 }
 
+static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
+{
+	int i;
+
+	for (i = 0; i < bufs; i++) {
+		wbuf[i]->b_end_io = end_buffer_write_sync;
+		/* We use-up our safety reference in submit_bh() */
+		submit_bh(WRITE, wbuf[i]);
+	}
+}
+
+/*
+ *  Submit all the data buffers to disk
+ */
+static void journal_submit_data_buffers(journal_t *journal,
+				transaction_t *commit_transaction)
+{
+	struct journal_head *jh;
+	struct buffer_head *bh;
+	int locked;
+	int bufs = 0;
+	struct buffer_head **wbuf = journal->j_wbuf;
+
+	/*
+	 * Whenever we unlock the journal and sleep, things can get added
+	 * onto ->t_sync_datalist, so we have to keep looping back to
+	 * write_out_data until we *know* that the list is empty.
+	 *
+	 * Cleanup any flushed data buffers from the data list.  Even in
+	 * abort mode, we want to flush this out as soon as possible.
+	 */
+write_out_data:
+	cond_resched();
+	spin_lock(&journal->j_list_lock);
+
+	while (commit_transaction->t_sync_datalist) {
+		jh = commit_transaction->t_sync_datalist;
+		bh = jh2bh(jh);
+		locked = 0;
+
+		/* Get reference just to make sure buffer does not disappear
+		 * when we are forced to drop various locks */
+		get_bh(bh);
+		/* If the buffer is dirty, we need to submit IO and hence
+		 * we need the buffer lock. We try to lock the buffer without
+		 * blocking. If we fail, we need to drop j_list_lock and do
+		 * blocking lock_buffer().
+		 */
+		if (buffer_dirty(bh)) {
+			if (test_set_buffer_locked(bh)) {
+				BUFFER_TRACE(bh, "needs blocking lock");
+				spin_unlock(&journal->j_list_lock);
+				/* Write out all data to prevent deadlocks */
+				journal_do_submit_data(wbuf, bufs);
+				bufs = 0;
+				lock_buffer(bh);
+				spin_lock(&journal->j_list_lock);
+			}
+			locked = 1;
+		}
+		/* We have to get bh_state lock. Again out of order, sigh. */
+		if (!inverted_lock(journal, bh)) {
+			jbd_lock_bh_state(bh);
+			spin_lock(&journal->j_list_lock);
+		}
+		/* Someone already cleaned up the buffer? */
+		if (!buffer_jbd(bh)
+			|| jh->b_transaction != commit_transaction
+			|| jh->b_jlist != BJ_SyncData) {
+			jbd_unlock_bh_state(bh);
+			if (locked)
+				unlock_buffer(bh);
+			BUFFER_TRACE(bh, "already cleaned up");
+			put_bh(bh);
+			continue;
+		}
+		if (locked && test_clear_buffer_dirty(bh)) {
+			BUFFER_TRACE(bh, "needs writeout, adding to array");
+			wbuf[bufs++] = bh;
+			__journal_file_buffer(jh, commit_transaction,
+						BJ_Locked);
+			jbd_unlock_bh_state(bh);
+			if (bufs == journal->j_wbufsize) {
+				spin_unlock(&journal->j_list_lock);
+				journal_do_submit_data(wbuf, bufs);
+				bufs = 0;
+				goto write_out_data;
+			}
+		}
+		else {
+			BUFFER_TRACE(bh, "writeout complete: unfile");
+			__journal_unfile_buffer(jh);
+			jbd_unlock_bh_state(bh);
+			if (locked)
+				unlock_buffer(bh);
+			journal_remove_journal_head(bh);
+			/* Once for our safety reference, once for
+			 * journal_remove_journal_head() */
+			put_bh(bh);
+			put_bh(bh);
+		}
+
+		if (lock_need_resched(&journal->j_list_lock)) {
+			spin_unlock(&journal->j_list_lock);
+			goto write_out_data;
+		}
+	}
+	spin_unlock(&journal->j_list_lock);
+	journal_do_submit_data(wbuf, bufs);
+}
+
 /*
  * journal_commit_transaction
  *
@@ -313,80 +424,13 @@ void journal_commit_transaction(journal_t *journal)
 	 * Now start flushing things to disk, in the order they appear
 	 * on the transaction lists.  Data blocks go first.
 	 */
-
 	err = 0;
-	/*
-	 * Whenever we unlock the journal and sleep, things can get added
-	 * onto ->t_sync_datalist, so we have to keep looping back to
-	 * write_out_data until we *know* that the list is empty.
-	 */
-	bufs = 0;
-	/*
-	 * Cleanup any flushed data buffers from the data list.  Even in
-	 * abort mode, we want to flush this out as soon as possible.
-	 */
-write_out_data:
-	cond_resched();
-	spin_lock(&journal->j_list_lock);
-
-	while (commit_transaction->t_sync_datalist) {
-		struct buffer_head *bh;
-
-		jh = commit_transaction->t_sync_datalist;
-		commit_transaction->t_sync_datalist = jh->b_tnext;
-		bh = jh2bh(jh);
-		if (buffer_locked(bh)) {
-			BUFFER_TRACE(bh, "locked");
-			if (!inverted_lock(journal, bh))
-				goto write_out_data;
-			__journal_temp_unlink_buffer(jh);
-			__journal_file_buffer(jh, commit_transaction,
-						BJ_Locked);
-			jbd_unlock_bh_state(bh);
-			if (lock_need_resched(&journal->j_list_lock)) {
-				spin_unlock(&journal->j_list_lock);
-				goto write_out_data;
-			}
-		} else {
-			if (buffer_dirty(bh)) {
-				BUFFER_TRACE(bh, "start journal writeout");
-				get_bh(bh);
-				wbuf[bufs++] = bh;
-				if (bufs == journal->j_wbufsize) {
-					jbd_debug(2, "submit %d writes\n",
-							bufs);
-					spin_unlock(&journal->j_list_lock);
-					ll_rw_block(SWRITE, bufs, wbuf);
-					journal_brelse_array(wbuf, bufs);
-					bufs = 0;
-					goto write_out_data;
-				}
-			} else {
-				BUFFER_TRACE(bh, "writeout complete: unfile");
-				if (!inverted_lock(journal, bh))
-					goto write_out_data;
-				__journal_unfile_buffer(jh);
-				jbd_unlock_bh_state(bh);
-				journal_remove_journal_head(bh);
-				put_bh(bh);
-				if (lock_need_resched(&journal->j_list_lock)) {
-					spin_unlock(&journal->j_list_lock);
-					goto write_out_data;
-				}
-			}
-		}
-	}
-
-	if (bufs) {
-		spin_unlock(&journal->j_list_lock);
-		ll_rw_block(SWRITE, bufs, wbuf);
-		journal_brelse_array(wbuf, bufs);
-		spin_lock(&journal->j_list_lock);
-	}
+	journal_submit_data_buffers(journal, commit_transaction);
 
 	/*
 	 * Wait for all previously submitted IO to complete.
 	 */
+	spin_lock(&journal->j_list_lock);
 	while (commit_transaction->t_locked_list) {
 		struct buffer_head *bh;
 
-- 
cgit v1.2.3


From a6ca1b99ed434f3fb41bbed647ed36c0420501e5 Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@SteelEye.com>
Date: Mon, 25 Sep 2006 23:30:55 -0700
Subject: [PATCH] update to the kernel kmap/kunmap API

Give non-highmem architectures access to the kmap API for the purposes of
overriding (this is what the attached patch does).

The proposal is that we should now require all architectures with coherence
issues to manage data coherence via the kmap/kunmap API.  Thus driver
writers never have to write code like

    kmap(page)
    modify data in page
    flush_kernel_dcache_page(page)
    kunmap(page)

instead, kmap/kunmap will manage the coherence and driver (and filesystem)
writers don't need to worry about how to flush between kmap and kunmap.

For most architectures, the page only needs to be flushed if it was
actually written to *and* there are user mappings of it, so the best
implementation looks to be: clear the page dirty pte bit in the kernel page
tables on kmap and on kunmap, check page->mappings for user maps, and then
the dirty bit, and only flush if it both has user mappings and is dirty.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/highmem.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 85ce7ef9a51..42620e723ab 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -29,6 +29,7 @@ unsigned int nr_free_highpages(void);
 
 static inline unsigned int nr_free_highpages(void) { return 0; }
 
+#ifndef ARCH_HAS_KMAP
 static inline void *kmap(struct page *page)
 {
 	might_sleep();
@@ -41,6 +42,7 @@ static inline void *kmap(struct page *page)
 #define kunmap_atomic(addr, idx)	do { } while (0)
 #define kmap_atomic_pfn(pfn, idx)	page_address(pfn_to_page(pfn))
 #define kmap_atomic_to_page(ptr)	virt_to_page(ptr)
+#endif
 
 #endif /* CONFIG_HIGHMEM */
 
-- 
cgit v1.2.3


From 725d704ecaca4a43f067092c140d4f3271cf2856 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Mon, 25 Sep 2006 23:30:55 -0700
Subject: [PATCH] mm: VM_BUG_ON

Introduce a VM_BUG_ON, which is turned on with CONFIG_DEBUG_VM.  Use this
in the lightweight, inline refcounting functions; PageLRU and PageActive
checks in vmscan, because they're pretty well confined to vmscan.  And in
page allocate/free fastpaths which can be the hottest parts of the kernel
for kbuilds.

Unlike BUG_ON, VM_BUG_ON must not be used to execute statements with
side-effects, and should not be used outside core mm code.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h | 10 +++++++++-
 mm/internal.h      |  4 ++--
 mm/page_alloc.c    | 23 +++++++++++------------
 mm/swap.c          | 12 ++++++------
 mm/vmscan.c        | 16 ++++++++--------
 5 files changed, 36 insertions(+), 29 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 224178a000d..7d20b25c58f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -278,6 +278,12 @@ struct page {
  */
 #include <linux/page-flags.h>
 
+#ifdef CONFIG_DEBUG_VM
+#define VM_BUG_ON(cond) BUG_ON(cond)
+#else
+#define VM_BUG_ON(condition) do { } while(0)
+#endif
+
 /*
  * Methods to modify the page usage count.
  *
@@ -297,7 +303,7 @@ struct page {
  */
 static inline int put_page_testzero(struct page *page)
 {
-	BUG_ON(atomic_read(&page->_count) == 0);
+	VM_BUG_ON(atomic_read(&page->_count) == 0);
 	return atomic_dec_and_test(&page->_count);
 }
 
@@ -307,6 +313,7 @@ static inline int put_page_testzero(struct page *page)
  */
 static inline int get_page_unless_zero(struct page *page)
 {
+	VM_BUG_ON(PageCompound(page));
 	return atomic_inc_not_zero(&page->_count);
 }
 
@@ -323,6 +330,7 @@ static inline void get_page(struct page *page)
 {
 	if (unlikely(PageCompound(page)))
 		page = (struct page *)page_private(page);
+	VM_BUG_ON(atomic_read(&page->_count) == 0);
 	atomic_inc(&page->_count);
 }
 
diff --git a/mm/internal.h b/mm/internal.h
index d20e3cc4aef..d527b80b292 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -24,8 +24,8 @@ static inline void set_page_count(struct page *page, int v)
  */
 static inline void set_page_refcounted(struct page *page)
 {
-	BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page);
-	BUG_ON(atomic_read(&page->_count));
+	VM_BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page);
+	VM_BUG_ON(atomic_read(&page->_count));
 	set_page_count(page, 1);
 }
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8a52ba9fe69..4b33878e948 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -127,7 +127,6 @@ static int bad_range(struct zone *zone, struct page *page)
 
 	return 0;
 }
-
 #else
 static inline int bad_range(struct zone *zone, struct page *page)
 {
@@ -218,12 +217,12 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
 {
 	int i;
 
-	BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
+	VM_BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
 	/*
 	 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
 	 * and __GFP_HIGHMEM from hard or soft interrupt context.
 	 */
-	BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
+	VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
 	for (i = 0; i < (1 << order); i++)
 		clear_highpage(page + i);
 }
@@ -347,8 +346,8 @@ static inline void __free_one_page(struct page *page,
 
 	page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
 
-	BUG_ON(page_idx & (order_size - 1));
-	BUG_ON(bad_range(zone, page));
+	VM_BUG_ON(page_idx & (order_size - 1));
+	VM_BUG_ON(bad_range(zone, page));
 
 	zone->free_pages += order_size;
 	while (order < MAX_ORDER-1) {
@@ -421,7 +420,7 @@ static void free_pages_bulk(struct zone *zone, int count,
 	while (count--) {
 		struct page *page;
 
-		BUG_ON(list_empty(list));
+		VM_BUG_ON(list_empty(list));
 		page = list_entry(list->prev, struct page, lru);
 		/* have to delete it as __free_one_page list manipulates */
 		list_del(&page->lru);
@@ -512,7 +511,7 @@ static inline void expand(struct zone *zone, struct page *page,
 		area--;
 		high--;
 		size >>= 1;
-		BUG_ON(bad_range(zone, &page[size]));
+		VM_BUG_ON(bad_range(zone, &page[size]));
 		list_add(&page[size].lru, &area->free_list);
 		area->nr_free++;
 		set_page_order(&page[size], high);
@@ -761,8 +760,8 @@ void split_page(struct page *page, unsigned int order)
 {
 	int i;
 
-	BUG_ON(PageCompound(page));
-	BUG_ON(!page_count(page));
+	VM_BUG_ON(PageCompound(page));
+	VM_BUG_ON(!page_count(page));
 	for (i = 1; i < (1 << order); i++)
 		set_page_refcounted(page + i);
 }
@@ -809,7 +808,7 @@ again:
 	local_irq_restore(flags);
 	put_cpu();
 
-	BUG_ON(bad_range(zone, page));
+	VM_BUG_ON(bad_range(zone, page));
 	if (prep_new_page(page, order, gfp_flags))
 		goto again;
 	return page;
@@ -1083,7 +1082,7 @@ fastcall unsigned long get_zeroed_page(gfp_t gfp_mask)
 	 * get_zeroed_page() returns a 32-bit address, which cannot represent
 	 * a highmem page
 	 */
-	BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
+	VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
 
 	page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
 	if (page)
@@ -1116,7 +1115,7 @@ EXPORT_SYMBOL(__free_pages);
 fastcall void free_pages(unsigned long addr, unsigned int order)
 {
 	if (addr != 0) {
-		BUG_ON(!virt_addr_valid((void *)addr));
+		VM_BUG_ON(!virt_addr_valid((void *)addr));
 		__free_pages(virt_to_page((void *)addr), order);
 	}
 }
diff --git a/mm/swap.c b/mm/swap.c
index 687686a61f7..600235e4370 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -233,7 +233,7 @@ void fastcall __page_cache_release(struct page *page)
 		struct zone *zone = page_zone(page);
 
 		spin_lock_irqsave(&zone->lru_lock, flags);
-		BUG_ON(!PageLRU(page));
+		VM_BUG_ON(!PageLRU(page));
 		__ClearPageLRU(page);
 		del_page_from_lru(zone, page);
 		spin_unlock_irqrestore(&zone->lru_lock, flags);
@@ -284,7 +284,7 @@ void release_pages(struct page **pages, int nr, int cold)
 				zone = pagezone;
 				spin_lock_irq(&zone->lru_lock);
 			}
-			BUG_ON(!PageLRU(page));
+			VM_BUG_ON(!PageLRU(page));
 			__ClearPageLRU(page);
 			del_page_from_lru(zone, page);
 		}
@@ -337,7 +337,7 @@ void __pagevec_release_nonlru(struct pagevec *pvec)
 	for (i = 0; i < pagevec_count(pvec); i++) {
 		struct page *page = pvec->pages[i];
 
-		BUG_ON(PageLRU(page));
+		VM_BUG_ON(PageLRU(page));
 		if (put_page_testzero(page))
 			pagevec_add(&pages_to_free, page);
 	}
@@ -364,7 +364,7 @@ void __pagevec_lru_add(struct pagevec *pvec)
 			zone = pagezone;
 			spin_lock_irq(&zone->lru_lock);
 		}
-		BUG_ON(PageLRU(page));
+		VM_BUG_ON(PageLRU(page));
 		SetPageLRU(page);
 		add_page_to_inactive_list(zone, page);
 	}
@@ -391,9 +391,9 @@ void __pagevec_lru_add_active(struct pagevec *pvec)
 			zone = pagezone;
 			spin_lock_irq(&zone->lru_lock);
 		}
-		BUG_ON(PageLRU(page));
+		VM_BUG_ON(PageLRU(page));
 		SetPageLRU(page);
-		BUG_ON(PageActive(page));
+		VM_BUG_ON(PageActive(page));
 		SetPageActive(page);
 		add_page_to_active_list(zone, page);
 	}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5d4c4d02254..41a3da3d6cc 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -440,7 +440,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		if (TestSetPageLocked(page))
 			goto keep;
 
-		BUG_ON(PageActive(page));
+		VM_BUG_ON(PageActive(page));
 
 		sc->nr_scanned++;
 
@@ -564,7 +564,7 @@ keep_locked:
 		unlock_page(page);
 keep:
 		list_add(&page->lru, &ret_pages);
-		BUG_ON(PageLRU(page));
+		VM_BUG_ON(PageLRU(page));
 	}
 	list_splice(&ret_pages, page_list);
 	if (pagevec_count(&freed_pvec))
@@ -603,7 +603,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 		page = lru_to_page(src);
 		prefetchw_prev_lru_page(page, src, flags);
 
-		BUG_ON(!PageLRU(page));
+		VM_BUG_ON(!PageLRU(page));
 
 		list_del(&page->lru);
 		target = src;
@@ -674,7 +674,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 		 */
 		while (!list_empty(&page_list)) {
 			page = lru_to_page(&page_list);
-			BUG_ON(PageLRU(page));
+			VM_BUG_ON(PageLRU(page));
 			SetPageLRU(page);
 			list_del(&page->lru);
 			if (PageActive(page))
@@ -797,9 +797,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	while (!list_empty(&l_inactive)) {
 		page = lru_to_page(&l_inactive);
 		prefetchw_prev_lru_page(page, &l_inactive, flags);
-		BUG_ON(PageLRU(page));
+		VM_BUG_ON(PageLRU(page));
 		SetPageLRU(page);
-		BUG_ON(!PageActive(page));
+		VM_BUG_ON(!PageActive(page));
 		ClearPageActive(page);
 
 		list_move(&page->lru, &zone->inactive_list);
@@ -827,9 +827,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	while (!list_empty(&l_active)) {
 		page = lru_to_page(&l_active);
 		prefetchw_prev_lru_page(page, &l_active, flags);
-		BUG_ON(PageLRU(page));
+		VM_BUG_ON(PageLRU(page));
 		SetPageLRU(page);
-		BUG_ON(!PageActive(page));
+		VM_BUG_ON(!PageActive(page));
 		list_move(&page->lru, &zone->active_list);
 		pgmoved++;
 		if (!pagevec_add(&pvec, page)) {
-- 
cgit v1.2.3


From d08b3851da41d0ee60851f2c75b118e1f7a5fc89 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 25 Sep 2006 23:30:57 -0700
Subject: [PATCH] mm: tracking shared dirty pages

Tracking of dirty pages in shared writeable mmap()s.

The idea is simple: write protect clean shared writeable pages, catch the
write-fault, make writeable and set dirty.  On page write-back clean all the
PTE dirty bits and write protect them once again.

The implementation is a tad harder, mainly because the default
backing_dev_info capabilities were too loosely maintained.  Hence it is not
enough to test the backing_dev_info for cap_account_dirty.

The current heuristic is as follows, a VMA is eligible when:
 - its shared writeable
    (vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED)
 - it is not a 'special' mapping
    (vm_flags & (VM_PFNMAP|VM_INSERTPAGE)) == 0
 - the backing_dev_info is cap_account_dirty
    mapping_cap_account_dirty(vma->vm_file->f_mapping)
 - f_op->mmap() didn't change the default page protection

Page from remap_pfn_range() are explicitly excluded because their COW
semantics are already horrid enough (see vm_normal_page() in do_wp_page()) and
because they don't have a backing store anyway.

mprotect() is taught about the new behaviour as well.  However it overrides
the last condition.

Cleaning the pages on write-back is done with page_mkclean() a new rmap call.
It can be called on any page, but is currently only implemented for mapped
pages, if the page is found the be of a VMA that accounts dirty pages it will
also wrprotect the PTE.

Finally, in fs/buffers.c:try_to_free_buffers(); remove clear_page_dirty() from
under ->private_lock.  This seems to be safe, since ->private_lock is used to
serialize access to the buffers, not the page itself.  This is needed because
clear_page_dirty() will call into page_mkclean() and would thereby violate
locking order.

[dhowells@redhat.com: Provide a page_mkclean() implementation for NOMMU]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/buffer.c          |  2 +-
 include/linux/mm.h   | 34 +++++++++++++++++++++++++++
 include/linux/rmap.h | 14 +++++++++++
 mm/memory.c          | 29 ++++++++++++++++++-----
 mm/mmap.c            | 10 ++++----
 mm/mprotect.c        | 21 +++++++----------
 mm/page-writeback.c  | 17 ++++++++++----
 mm/rmap.c            | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 162 insertions(+), 30 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index 71649ef9b65..3b6d701073e 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2987,6 +2987,7 @@ int try_to_free_buffers(struct page *page)
 
 	spin_lock(&mapping->private_lock);
 	ret = drop_buffers(page, &buffers_to_free);
+	spin_unlock(&mapping->private_lock);
 	if (ret) {
 		/*
 		 * If the filesystem writes its buffers by hand (eg ext3)
@@ -2998,7 +2999,6 @@ int try_to_free_buffers(struct page *page)
 		 */
 		clear_page_dirty(page);
 	}
-	spin_unlock(&mapping->private_lock);
 out:
 	if (buffers_to_free) {
 		struct buffer_head *bh = buffers_to_free;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7d20b25c58f..449841413cf 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -15,6 +15,7 @@
 #include <linux/fs.h>
 #include <linux/mutex.h>
 #include <linux/debug_locks.h>
+#include <linux/backing-dev.h>
 
 struct mempolicy;
 struct anon_vma;
@@ -810,6 +811,39 @@ struct shrinker;
 extern struct shrinker *set_shrinker(int, shrinker_t);
 extern void remove_shrinker(struct shrinker *shrinker);
 
+/*
+ * Some shared mappigns will want the pages marked read-only
+ * to track write events. If so, we'll downgrade vm_page_prot
+ * to the private version (using protection_map[] without the
+ * VM_SHARED bit).
+ */
+static inline int vma_wants_writenotify(struct vm_area_struct *vma)
+{
+	unsigned int vm_flags = vma->vm_flags;
+
+	/* If it was private or non-writable, the write bit is already clear */
+	if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED)))
+		return 0;
+
+	/* The backer wishes to know when pages are first written to? */
+	if (vma->vm_ops && vma->vm_ops->page_mkwrite)
+		return 1;
+
+	/* The open routine did something to the protections already? */
+	if (pgprot_val(vma->vm_page_prot) !=
+	    pgprot_val(protection_map[vm_flags &
+		    (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]))
+		return 0;
+
+	/* Specialty mapping? */
+	if (vm_flags & (VM_PFNMAP|VM_INSERTPAGE))
+		return 0;
+
+	/* Can the mapping track the dirty pages? */
+	return vma->vm_file && vma->vm_file->f_mapping &&
+		mapping_cap_account_dirty(vma->vm_file->f_mapping);
+}
+
 extern pte_t *FASTCALL(get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl));
 
 int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address);
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index bf97b090001..db2c1df4fef 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -103,6 +103,14 @@ pte_t *page_check_address(struct page *, struct mm_struct *,
  */
 unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);
 
+/*
+ * Cleans the PTEs of shared mappings.
+ * (and since clean PTEs should also be readonly, write protects them too)
+ *
+ * returns the number of cleaned PTEs.
+ */
+int page_mkclean(struct page *);
+
 #else	/* !CONFIG_MMU */
 
 #define anon_vma_init()		do {} while (0)
@@ -112,6 +120,12 @@ unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);
 #define page_referenced(page,l) TestClearPageReferenced(page)
 #define try_to_unmap(page, refs) SWAP_FAIL
 
+static inline int page_mkclean(struct page *page)
+{
+	return 0;
+}
+
+
 #endif	/* CONFIG_MMU */
 
 /*
diff --git a/mm/memory.c b/mm/memory.c
index 109e9866237..fa941b16907 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1458,14 +1458,19 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	struct page *old_page, *new_page;
 	pte_t entry;
-	int reuse, ret = VM_FAULT_MINOR;
+	int reuse = 0, ret = VM_FAULT_MINOR;
+	struct page *dirty_page = NULL;
 
 	old_page = vm_normal_page(vma, address, orig_pte);
 	if (!old_page)
 		goto gotten;
 
-	if (unlikely((vma->vm_flags & (VM_SHARED|VM_WRITE)) ==
-				(VM_SHARED|VM_WRITE))) {
+	/*
+	 * Only catch write-faults on shared writable pages, read-only
+	 * shared pages can get COWed by get_user_pages(.write=1, .force=1).
+	 */
+	if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+					(VM_WRITE|VM_SHARED))) {
 		if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
 			/*
 			 * Notify the address space that the page is about to
@@ -1494,13 +1499,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			if (!pte_same(*page_table, orig_pte))
 				goto unlock;
 		}
-
+		dirty_page = old_page;
+		get_page(dirty_page);
 		reuse = 1;
 	} else if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
 		reuse = can_share_swap_page(old_page);
 		unlock_page(old_page);
-	} else {
-		reuse = 0;
 	}
 
 	if (reuse) {
@@ -1566,6 +1570,10 @@ gotten:
 		page_cache_release(old_page);
 unlock:
 	pte_unmap_unlock(page_table, ptl);
+	if (dirty_page) {
+		set_page_dirty(dirty_page);
+		put_page(dirty_page);
+	}
 	return ret;
 oom:
 	if (old_page)
@@ -2098,6 +2106,7 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	unsigned int sequence = 0;
 	int ret = VM_FAULT_MINOR;
 	int anon = 0;
+	struct page *dirty_page = NULL;
 
 	pte_unmap(page_table);
 	BUG_ON(vma->vm_flags & VM_PFNMAP);
@@ -2192,6 +2201,10 @@ retry:
 		} else {
 			inc_mm_counter(mm, file_rss);
 			page_add_file_rmap(new_page);
+			if (write_access) {
+				dirty_page = new_page;
+				get_page(dirty_page);
+			}
 		}
 	} else {
 		/* One of our sibling threads was faster, back out. */
@@ -2204,6 +2217,10 @@ retry:
 	lazy_mmu_prot_update(entry);
 unlock:
 	pte_unmap_unlock(page_table, ptl);
+	if (dirty_page) {
+		set_page_dirty(dirty_page);
+		put_page(dirty_page);
+	}
 	return ret;
 oom:
 	page_cache_release(new_page);
diff --git a/mm/mmap.c b/mm/mmap.c
index d799d896d74..8507ee9cd57 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1105,12 +1105,6 @@ munmap_back:
 			goto free_vma;
 	}
 
-	/* Don't make the VMA automatically writable if it's shared, but the
-	 * backer wishes to know when pages are first written to */
-	if (vma->vm_ops && vma->vm_ops->page_mkwrite)
-		vma->vm_page_prot =
-			protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)];
-
 	/* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
 	 * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
 	 * that memory reservation must be checked; but that reservation
@@ -1128,6 +1122,10 @@ munmap_back:
 	pgoff = vma->vm_pgoff;
 	vm_flags = vma->vm_flags;
 
+	if (vma_wants_writenotify(vma))
+		vma->vm_page_prot =
+			protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)];
+
 	if (!file || !vma_merge(mm, prev, addr, vma->vm_end,
 			vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) {
 		file = vma->vm_file;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 638edabaff7..367b7f6c063 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -123,8 +123,6 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 	unsigned long oldflags = vma->vm_flags;
 	long nrpages = (end - start) >> PAGE_SHIFT;
 	unsigned long charged = 0;
-	unsigned int mask;
-	pgprot_t newprot;
 	pgoff_t pgoff;
 	int error;
 
@@ -176,24 +174,21 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 	}
 
 success:
-	/* Don't make the VMA automatically writable if it's shared, but the
-	 * backer wishes to know when pages are first written to */
-	mask = VM_READ|VM_WRITE|VM_EXEC|VM_SHARED;
-	if (vma->vm_ops && vma->vm_ops->page_mkwrite)
-		mask &= ~VM_SHARED;
-
-	newprot = protection_map[newflags & mask];
-
 	/*
 	 * vm_flags and vm_page_prot are protected by the mmap_sem
 	 * held in write mode.
 	 */
 	vma->vm_flags = newflags;
-	vma->vm_page_prot = newprot;
+	vma->vm_page_prot = protection_map[newflags &
+		(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
+	if (vma_wants_writenotify(vma))
+		vma->vm_page_prot = protection_map[newflags &
+			(VM_READ|VM_WRITE|VM_EXEC)];
+
 	if (is_vm_hugetlb_page(vma))
-		hugetlb_change_protection(vma, start, end, newprot);
+		hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
 	else
-		change_protection(vma, start, end, newprot);
+		change_protection(vma, start, end, vma->vm_page_prot);
 	vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
 	vm_stat_account(mm, newflags, vma->vm_file, nrpages);
 	return 0;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 77a0bc4e261..1c87430b7a2 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -23,6 +23,7 @@
 #include <linux/backing-dev.h>
 #include <linux/blkdev.h>
 #include <linux/mpage.h>
+#include <linux/rmap.h>
 #include <linux/percpu.h>
 #include <linux/notifier.h>
 #include <linux/smp.h>
@@ -550,7 +551,7 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
 		return 0;
 	wbc->for_writepages = 1;
 	if (mapping->a_ops->writepages)
-		ret =  mapping->a_ops->writepages(mapping, wbc);
+		ret = mapping->a_ops->writepages(mapping, wbc);
 	else
 		ret = generic_writepages(mapping, wbc);
 	wbc->for_writepages = 0;
@@ -712,9 +713,15 @@ int test_clear_page_dirty(struct page *page)
 			radix_tree_tag_clear(&mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_DIRTY);
-			if (mapping_cap_account_dirty(mapping))
-				__dec_zone_page_state(page, NR_FILE_DIRTY);
 			write_unlock_irqrestore(&mapping->tree_lock, flags);
+			/*
+			 * We can continue to use `mapping' here because the
+			 * page is locked, which pins the address_space
+			 */
+			if (mapping_cap_account_dirty(mapping)) {
+				page_mkclean(page);
+				dec_zone_page_state(page, NR_FILE_DIRTY);
+			}
 			return 1;
 		}
 		write_unlock_irqrestore(&mapping->tree_lock, flags);
@@ -744,8 +751,10 @@ int clear_page_dirty_for_io(struct page *page)
 
 	if (mapping) {
 		if (TestClearPageDirty(page)) {
-			if (mapping_cap_account_dirty(mapping))
+			if (mapping_cap_account_dirty(mapping)) {
+				page_mkclean(page);
 				dec_zone_page_state(page, NR_FILE_DIRTY);
+			}
 			return 1;
 		}
 		return 0;
diff --git a/mm/rmap.c b/mm/rmap.c
index 40158b59729..e2155d791d9 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -434,6 +434,71 @@ int page_referenced(struct page *page, int is_locked)
 	return referenced;
 }
 
+static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long address;
+	pte_t *pte, entry;
+	spinlock_t *ptl;
+	int ret = 0;
+
+	address = vma_address(page, vma);
+	if (address == -EFAULT)
+		goto out;
+
+	pte = page_check_address(page, mm, address, &ptl);
+	if (!pte)
+		goto out;
+
+	if (!pte_dirty(*pte) && !pte_write(*pte))
+		goto unlock;
+
+	entry = ptep_get_and_clear(mm, address, pte);
+	entry = pte_mkclean(entry);
+	entry = pte_wrprotect(entry);
+	ptep_establish(vma, address, pte, entry);
+	lazy_mmu_prot_update(entry);
+	ret = 1;
+
+unlock:
+	pte_unmap_unlock(pte, ptl);
+out:
+	return ret;
+}
+
+static int page_mkclean_file(struct address_space *mapping, struct page *page)
+{
+	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	struct vm_area_struct *vma;
+	struct prio_tree_iter iter;
+	int ret = 0;
+
+	BUG_ON(PageAnon(page));
+
+	spin_lock(&mapping->i_mmap_lock);
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+		if (vma->vm_flags & VM_SHARED)
+			ret += page_mkclean_one(page, vma);
+	}
+	spin_unlock(&mapping->i_mmap_lock);
+	return ret;
+}
+
+int page_mkclean(struct page *page)
+{
+	int ret = 0;
+
+	BUG_ON(!PageLocked(page));
+
+	if (page_mapped(page)) {
+		struct address_space *mapping = page_mapping(page);
+		if (mapping)
+			ret = page_mkclean_file(mapping, page);
+	}
+
+	return ret;
+}
+
 /**
  * page_set_anon_rmap - setup new anonymous rmap
  * @page:	the page to add the mapping to
-- 
cgit v1.2.3


From edc79b2a46ed854595e40edcf3f8b37f9f14aa3f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 25 Sep 2006 23:30:58 -0700
Subject: [PATCH] mm: balance dirty pages

Now that we can detect writers of shared mappings, throttle them.  Avoids OOM
by surprise.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/writeback.h |  1 +
 mm/memory.c               |  5 +++--
 mm/page-writeback.c       | 10 ++++++++++
 3 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 0422036af4e..56a23a0e7f2 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -116,6 +116,7 @@ int sync_page_range(struct inode *inode, struct address_space *mapping,
 			loff_t pos, loff_t count);
 int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
 			   loff_t pos, loff_t count);
+void set_page_dirty_balance(struct page *page);
 
 /* pdflush.c */
 extern int nr_pdflush_threads;	/* Global so it can be exported to sysctl
diff --git a/mm/memory.c b/mm/memory.c
index fa941b16907..dd7d7fc5ed6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -49,6 +49,7 @@
 #include <linux/module.h>
 #include <linux/delayacct.h>
 #include <linux/init.h>
+#include <linux/writeback.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -1571,7 +1572,7 @@ gotten:
 unlock:
 	pte_unmap_unlock(page_table, ptl);
 	if (dirty_page) {
-		set_page_dirty(dirty_page);
+		set_page_dirty_balance(dirty_page);
 		put_page(dirty_page);
 	}
 	return ret;
@@ -2218,7 +2219,7 @@ retry:
 unlock:
 	pte_unmap_unlock(page_table, ptl);
 	if (dirty_page) {
-		set_page_dirty(dirty_page);
+		set_page_dirty_balance(dirty_page);
 		put_page(dirty_page);
 	}
 	return ret;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 1c87430b7a2..b9f4c6f1be8 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -244,6 +244,16 @@ static void balance_dirty_pages(struct address_space *mapping)
 		pdflush_operation(background_writeout, 0);
 }
 
+void set_page_dirty_balance(struct page *page)
+{
+	if (set_page_dirty(page)) {
+		struct address_space *mapping = page_mapping(page);
+
+		if (mapping)
+			balance_dirty_pages_ratelimited(mapping);
+	}
+}
+
 /**
  * balance_dirty_pages_ratelimited_nr - balance dirty memory state
  * @mapping: address_space which was dirtied
-- 
cgit v1.2.3


From c1e6098b23bb46e2b488fe9a26f831f867157483 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 25 Sep 2006 23:30:59 -0700
Subject: [PATCH] mm: optimize the new mprotect() code a bit

mprotect() resets the page protections, which could result in extra write
faults for those pages whose dirty state we track using write faults and are
dirty already.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/mprotect.c | 34 ++++++++++++++++++++++++----------
 1 file changed, 24 insertions(+), 10 deletions(-)

diff --git a/mm/mprotect.c b/mm/mprotect.c
index 367b7f6c063..955f9d0e38a 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -27,7 +27,8 @@
 #include <asm/tlbflush.h>
 
 static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
-		unsigned long addr, unsigned long end, pgprot_t newprot)
+		unsigned long addr, unsigned long end, pgprot_t newprot,
+		int dirty_accountable)
 {
 	pte_t *pte, oldpte;
 	spinlock_t *ptl;
@@ -42,7 +43,14 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
 			 * bits by wiping the pte and then setting the new pte
 			 * into place.
 			 */
-			ptent = pte_modify(ptep_get_and_clear(mm, addr, pte), newprot);
+			ptent = ptep_get_and_clear(mm, addr, pte);
+			ptent = pte_modify(ptent, newprot);
+			/*
+			 * Avoid taking write faults for pages we know to be
+			 * dirty.
+			 */
+			if (dirty_accountable && pte_dirty(ptent))
+				ptent = pte_mkwrite(ptent);
 			set_pte_at(mm, addr, pte, ptent);
 			lazy_mmu_prot_update(ptent);
 #ifdef CONFIG_MIGRATION
@@ -66,7 +74,8 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
 }
 
 static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
-		unsigned long addr, unsigned long end, pgprot_t newprot)
+		unsigned long addr, unsigned long end, pgprot_t newprot,
+		int dirty_accountable)
 {
 	pmd_t *pmd;
 	unsigned long next;
@@ -76,12 +85,13 @@ static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		change_pte_range(mm, pmd, addr, next, newprot);
+		change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable);
 	} while (pmd++, addr = next, addr != end);
 }
 
 static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd,
-		unsigned long addr, unsigned long end, pgprot_t newprot)
+		unsigned long addr, unsigned long end, pgprot_t newprot,
+		int dirty_accountable)
 {
 	pud_t *pud;
 	unsigned long next;
@@ -91,12 +101,13 @@ static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd,
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		change_pmd_range(mm, pud, addr, next, newprot);
+		change_pmd_range(mm, pud, addr, next, newprot, dirty_accountable);
 	} while (pud++, addr = next, addr != end);
 }
 
 static void change_protection(struct vm_area_struct *vma,
-		unsigned long addr, unsigned long end, pgprot_t newprot)
+		unsigned long addr, unsigned long end, pgprot_t newprot,
+		int dirty_accountable)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	pgd_t *pgd;
@@ -110,7 +121,7 @@ static void change_protection(struct vm_area_struct *vma,
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		change_pud_range(mm, pgd, addr, next, newprot);
+		change_pud_range(mm, pgd, addr, next, newprot, dirty_accountable);
 	} while (pgd++, addr = next, addr != end);
 	flush_tlb_range(vma, start, end);
 }
@@ -125,6 +136,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 	unsigned long charged = 0;
 	pgoff_t pgoff;
 	int error;
+	int dirty_accountable = 0;
 
 	if (newflags == oldflags) {
 		*pprev = vma;
@@ -181,14 +193,16 @@ success:
 	vma->vm_flags = newflags;
 	vma->vm_page_prot = protection_map[newflags &
 		(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
-	if (vma_wants_writenotify(vma))
+	if (vma_wants_writenotify(vma)) {
 		vma->vm_page_prot = protection_map[newflags &
 			(VM_READ|VM_WRITE|VM_EXEC)];
+		dirty_accountable = 1;
+	}
 
 	if (is_vm_hugetlb_page(vma))
 		hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
 	else
-		change_protection(vma, start, end, vma->vm_page_prot);
+		change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
 	vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
 	vm_stat_account(mm, newflags, vma->vm_file, nrpages);
 	return 0;
-- 
cgit v1.2.3


From e88dd6c11c5aef74d8b74a062767add53315533b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 25 Sep 2006 23:30:59 -0700
Subject: [PATCH] mm: small cleanup of install_page()

Smallish cleanup to install_page(), could save a memory read (haven't checked
the asm output) and sure looks nicer.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/fremap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/fremap.c b/mm/fremap.c
index 21b7d0cbc98..aa30618ec6b 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -79,9 +79,9 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		inc_mm_counter(mm, file_rss);
 
 	flush_icache_page(vma, page);
-	set_pte_at(mm, addr, pte, mk_pte(page, prot));
+	pte_val = mk_pte(page, prot);
+	set_pte_at(mm, addr, pte, pte_val);
 	page_add_file_rmap(page);
-	pte_val = *pte;
 	update_mmu_cache(vma, addr, pte_val);
 	lazy_mmu_prot_update(pte_val);
 	err = 0;
-- 
cgit v1.2.3


From ee6a6457886a80415db209e87033b63f2b06558c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 25 Sep 2006 23:31:00 -0700
Subject: [PATCH] mm: fixup do_wp_page()

Wrt. the recent modifications in do_wp_page() Hugh Dickins pointed out:

  "I now realize it's right to the first order (normal case) and to the
   second order (ptrace poke), but not to the third order (ptrace poke
   anon page here to be COWed - perhaps can't occur without intervening
   mprotects)."

This patch restores the old COW behaviour for anonymous pages.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index dd7d7fc5ed6..65962534b4e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1467,11 +1467,21 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto gotten;
 
 	/*
-	 * Only catch write-faults on shared writable pages, read-only
-	 * shared pages can get COWed by get_user_pages(.write=1, .force=1).
+	 * Take out anonymous pages first, anonymous shared vmas are
+	 * not dirty accountable.
 	 */
-	if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+	if (PageAnon(old_page)) {
+		if (!TestSetPageLocked(old_page)) {
+			reuse = can_share_swap_page(old_page);
+			unlock_page(old_page);
+		}
+	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
 					(VM_WRITE|VM_SHARED))) {
+		/*
+		 * Only catch write-faults on shared writable pages,
+		 * read-only shared pages can get COWed by
+		 * get_user_pages(.write=1, .force=1).
+		 */
 		if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
 			/*
 			 * Notify the address space that the page is about to
@@ -1503,9 +1513,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		dirty_page = old_page;
 		get_page(dirty_page);
 		reuse = 1;
-	} else if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
-		reuse = can_share_swap_page(old_page);
-		unlock_page(old_page);
 	}
 
 	if (reuse) {
-- 
cgit v1.2.3


From 204ec841fbea3e5138168edbc3a76d46747cc987 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 25 Sep 2006 23:31:01 -0700
Subject: [PATCH] mm: msync() cleanup

With the tracking of dirty pages properly done now, msync doesn't need to scan
the PTEs anymore to determine the dirty status.

From: Hugh Dickins <hugh@veritas.com>

In looking to do that, I made some other tidyups: can remove several
#includes, and sys_msync loop termination not quite right.

Most of those points are criticisms of the existing sys_msync, not of your
patch.  In particular, the loop termination errors were introduced in 2.6.17:
I did notice this shortly before it came out, but decided I was more likely to
get it wrong myself, and make matters worse if I tried to rush a last-minute
fix in.  And it's not terribly likely to go wrong, nor disastrous if it does
go wrong (may miss reporting an unmapped area; may also fsync file of a
following vma).

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/msync.c | 196 +++++++++++--------------------------------------------------
 1 file changed, 33 insertions(+), 163 deletions(-)

diff --git a/mm/msync.c b/mm/msync.c
index d083544df21..358d73cf7b7 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -7,149 +7,33 @@
 /*
  * The msync() system call.
  */
-#include <linux/slab.h>
-#include <linux/pagemap.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
-#include <linux/hugetlb.h>
-#include <linux/writeback.h>
 #include <linux/file.h>
 #include <linux/syscalls.h>
 
-#include <asm/pgtable.h>
-#include <asm/tlbflush.h>
-
-static unsigned long msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
-				unsigned long addr, unsigned long end)
-{
-	pte_t *pte;
-	spinlock_t *ptl;
-	int progress = 0;
-	unsigned long ret = 0;
-
-again:
-	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
-	do {
-		struct page *page;
-
-		if (progress >= 64) {
-			progress = 0;
-			if (need_resched() || need_lockbreak(ptl))
-				break;
-		}
-		progress++;
-		if (!pte_present(*pte))
-			continue;
-		if (!pte_maybe_dirty(*pte))
-			continue;
-		page = vm_normal_page(vma, addr, *pte);
-		if (!page)
-			continue;
-		if (ptep_clear_flush_dirty(vma, addr, pte) ||
-				page_test_and_clear_dirty(page))
-			ret += set_page_dirty(page);
-		progress += 3;
-	} while (pte++, addr += PAGE_SIZE, addr != end);
-	pte_unmap_unlock(pte - 1, ptl);
-	cond_resched();
-	if (addr != end)
-		goto again;
-	return ret;
-}
-
-static inline unsigned long msync_pmd_range(struct vm_area_struct *vma,
-			pud_t *pud, unsigned long addr, unsigned long end)
-{
-	pmd_t *pmd;
-	unsigned long next;
-	unsigned long ret = 0;
-
-	pmd = pmd_offset(pud, addr);
-	do {
-		next = pmd_addr_end(addr, end);
-		if (pmd_none_or_clear_bad(pmd))
-			continue;
-		ret += msync_pte_range(vma, pmd, addr, next);
-	} while (pmd++, addr = next, addr != end);
-	return ret;
-}
-
-static inline unsigned long msync_pud_range(struct vm_area_struct *vma,
-			pgd_t *pgd, unsigned long addr, unsigned long end)
-{
-	pud_t *pud;
-	unsigned long next;
-	unsigned long ret = 0;
-
-	pud = pud_offset(pgd, addr);
-	do {
-		next = pud_addr_end(addr, end);
-		if (pud_none_or_clear_bad(pud))
-			continue;
-		ret += msync_pmd_range(vma, pud, addr, next);
-	} while (pud++, addr = next, addr != end);
-	return ret;
-}
-
-static unsigned long msync_page_range(struct vm_area_struct *vma,
-				unsigned long addr, unsigned long end)
-{
-	pgd_t *pgd;
-	unsigned long next;
-	unsigned long ret = 0;
-
-	/* For hugepages we can't go walking the page table normally,
-	 * but that's ok, hugetlbfs is memory based, so we don't need
-	 * to do anything more on an msync().
-	 */
-	if (vma->vm_flags & VM_HUGETLB)
-		return 0;
-
-	BUG_ON(addr >= end);
-	pgd = pgd_offset(vma->vm_mm, addr);
-	flush_cache_range(vma, addr, end);
-	do {
-		next = pgd_addr_end(addr, end);
-		if (pgd_none_or_clear_bad(pgd))
-			continue;
-		ret += msync_pud_range(vma, pgd, addr, next);
-	} while (pgd++, addr = next, addr != end);
-	return ret;
-}
-
 /*
  * MS_SYNC syncs the entire file - including mappings.
  *
- * MS_ASYNC does not start I/O (it used to, up to 2.5.67).  Instead, it just
- * marks the relevant pages dirty.  The application may now run fsync() to
+ * MS_ASYNC does not start I/O (it used to, up to 2.5.67).
+ * Nor does it marks the relevant pages dirty (it used to up to 2.6.17).
+ * Now it doesn't do anything, since dirty pages are properly tracked.
+ *
+ * The application may now run fsync() to
  * write out the dirty pages and wait on the writeout and check the result.
  * Or the application may run fadvise(FADV_DONTNEED) against the fd to start
  * async writeout immediately.
  * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to
  * applications.
  */
-static int msync_interval(struct vm_area_struct *vma, unsigned long addr,
-			unsigned long end, int flags,
-			unsigned long *nr_pages_dirtied)
-{
-	struct file *file = vma->vm_file;
-
-	if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED))
-		return -EBUSY;
-
-	if (file && (vma->vm_flags & VM_SHARED))
-		*nr_pages_dirtied = msync_page_range(vma, addr, end);
-	return 0;
-}
-
 asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
 {
 	unsigned long end;
+	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
 	int unmapped_error = 0;
 	int error = -EINVAL;
-	int done = 0;
 
 	if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
 		goto out;
@@ -169,64 +53,50 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
 	 * If the interval [start,end) covers some unmapped address ranges,
 	 * just ignore them, but return -ENOMEM at the end.
 	 */
-	down_read(&current->mm->mmap_sem);
-	vma = find_vma(current->mm, start);
-	if (!vma) {
-		error = -ENOMEM;
-		goto out_unlock;
-	}
-	do {
-		unsigned long nr_pages_dirtied = 0;
+	down_read(&mm->mmap_sem);
+	vma = find_vma(mm, start);
+	for (;;) {
 		struct file *file;
 
+		/* Still start < end. */
+		error = -ENOMEM;
+		if (!vma)
+			goto out_unlock;
 		/* Here start < vma->vm_end. */
 		if (start < vma->vm_start) {
-			unmapped_error = -ENOMEM;
 			start = vma->vm_start;
+			if (start >= end)
+				goto out_unlock;
+			unmapped_error = -ENOMEM;
 		}
 		/* Here vma->vm_start <= start < vma->vm_end. */
-		if (end <= vma->vm_end) {
-			if (start < end) {
-				error = msync_interval(vma, start, end, flags,
-							&nr_pages_dirtied);
-				if (error)
-					goto out_unlock;
-			}
-			error = unmapped_error;
-			done = 1;
-		} else {
-			/* Here vma->vm_start <= start < vma->vm_end < end. */
-			error = msync_interval(vma, start, vma->vm_end, flags,
-						&nr_pages_dirtied);
-			if (error)
-				goto out_unlock;
+		if ((flags & MS_INVALIDATE) &&
+				(vma->vm_flags & VM_LOCKED)) {
+			error = -EBUSY;
+			goto out_unlock;
 		}
 		file = vma->vm_file;
 		start = vma->vm_end;
-		if ((flags & MS_ASYNC) && file && nr_pages_dirtied) {
-			get_file(file);
-			up_read(&current->mm->mmap_sem);
-			balance_dirty_pages_ratelimited_nr(file->f_mapping,
-							nr_pages_dirtied);
-			fput(file);
-			down_read(&current->mm->mmap_sem);
-			vma = find_vma(current->mm, start);
-		} else if ((flags & MS_SYNC) && file &&
+		if ((flags & MS_SYNC) && file &&
 				(vma->vm_flags & VM_SHARED)) {
 			get_file(file);
-			up_read(&current->mm->mmap_sem);
+			up_read(&mm->mmap_sem);
 			error = do_fsync(file, 0);
 			fput(file);
-			down_read(&current->mm->mmap_sem);
-			if (error)
-				goto out_unlock;
-			vma = find_vma(current->mm, start);
+			if (error || start >= end)
+				goto out;
+			down_read(&mm->mmap_sem);
+			vma = find_vma(mm, start);
 		} else {
+			if (start >= end) {
+				error = 0;
+				goto out_unlock;
+			}
 			vma = vma->vm_next;
 		}
-	} while (vma && !done);
+	}
 out_unlock:
-	up_read(&current->mm->mmap_sem);
+	up_read(&mm->mmap_sem);
 out:
-	return error;
+	return error ? : unmapped_error;
 }
-- 
cgit v1.2.3


From b221385bc41d6789edde3d2fa0cb20d5045730eb Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Mon, 25 Sep 2006 23:31:02 -0700
Subject: [PATCH] mm/: make functions static

This patch makes the following needlessly global functions static:
 - slab.c: kmem_find_general_cachep()
 - swap.c: __page_cache_release()
 - vmalloc.c: __vmalloc_node()

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h      |  2 --
 include/linux/slab.h    |  2 --
 include/linux/vmalloc.h |  2 --
 mm/slab.c               |  3 +--
 mm/swap.c               | 39 +++++++++++++++++++--------------------
 mm/vmalloc.c            |  8 +++++---
 6 files changed, 25 insertions(+), 31 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 449841413cf..45678b03695 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -318,8 +318,6 @@ static inline int get_page_unless_zero(struct page *page)
 	return atomic_inc_not_zero(&page->_count);
 }
 
-extern void FASTCALL(__page_cache_release(struct page *));
-
 static inline int page_count(struct page *page)
 {
 	if (unlikely(PageCompound(page)))
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 45ad55b70d1..193c03c547e 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -67,7 +67,6 @@ extern void *kmem_cache_zalloc(struct kmem_cache *, gfp_t);
 extern void kmem_cache_free(kmem_cache_t *, void *);
 extern unsigned int kmem_cache_size(kmem_cache_t *);
 extern const char *kmem_cache_name(kmem_cache_t *);
-extern kmem_cache_t *kmem_find_general_cachep(size_t size, gfp_t gfpflags);
 
 /* Size description struct for general caches. */
 struct cache_sizes {
@@ -223,7 +222,6 @@ extern int FASTCALL(kmem_ptr_validate(kmem_cache_t *cachep, void *ptr));
 /* SLOB allocator routines */
 
 void kmem_cache_init(void);
-struct kmem_cache *kmem_find_general_cachep(size_t, gfp_t gfpflags);
 struct kmem_cache *kmem_cache_create(const char *c, size_t, size_t,
 	unsigned long,
 	void (*)(void *, struct kmem_cache *, unsigned long),
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 71b6363caaa..dee88c6b6fa 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -44,8 +44,6 @@ extern void *vmalloc_32_user(unsigned long size);
 extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot);
 extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask,
 				pgprot_t prot);
-extern void *__vmalloc_node(unsigned long size, gfp_t gfp_mask,
-				pgprot_t prot, int node);
 extern void vfree(void *addr);
 
 extern void *vmap(struct page **pages, unsigned int count,
diff --git a/mm/slab.c b/mm/slab.c
index 21ba0603570..5870bcbd33c 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -768,11 +768,10 @@ static inline struct kmem_cache *__find_general_cachep(size_t size,
 	return csizep->cs_cachep;
 }
 
-struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
+static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
 {
 	return __find_general_cachep(size, gfpflags);
 }
-EXPORT_SYMBOL(kmem_find_general_cachep);
 
 static size_t slab_mgmt_size(size_t nr_objs, size_t align)
 {
diff --git a/mm/swap.c b/mm/swap.c
index 600235e4370..2e0e871f542 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -34,6 +34,25 @@
 /* How many pages do we try to swap or page in/out together? */
 int page_cluster;
 
+/*
+ * This path almost never happens for VM activity - pages are normally
+ * freed via pagevecs.  But it gets used by networking.
+ */
+static void fastcall __page_cache_release(struct page *page)
+{
+	if (PageLRU(page)) {
+		unsigned long flags;
+		struct zone *zone = page_zone(page);
+
+		spin_lock_irqsave(&zone->lru_lock, flags);
+		VM_BUG_ON(!PageLRU(page));
+		__ClearPageLRU(page);
+		del_page_from_lru(zone, page);
+		spin_unlock_irqrestore(&zone->lru_lock, flags);
+	}
+	free_hot_page(page);
+}
+
 static void put_compound_page(struct page *page)
 {
 	page = (struct page *)page_private(page);
@@ -222,26 +241,6 @@ int lru_add_drain_all(void)
 }
 #endif
 
-/*
- * This path almost never happens for VM activity - pages are normally
- * freed via pagevecs.  But it gets used by networking.
- */
-void fastcall __page_cache_release(struct page *page)
-{
-	if (PageLRU(page)) {
-		unsigned long flags;
-		struct zone *zone = page_zone(page);
-
-		spin_lock_irqsave(&zone->lru_lock, flags);
-		VM_BUG_ON(!PageLRU(page));
-		__ClearPageLRU(page);
-		del_page_from_lru(zone, page);
-		spin_unlock_irqrestore(&zone->lru_lock, flags);
-	}
-	free_hot_page(page);
-}
-EXPORT_SYMBOL(__page_cache_release);
-
 /*
  * Batched page_cache_release().  Decrement the reference count on all the
  * passed pages.  If it fell to zero then remove the page from the LRU and
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 266162d2ba2..9aad8b0cc6e 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -24,6 +24,9 @@
 DEFINE_RWLOCK(vmlist_lock);
 struct vm_struct *vmlist;
 
+static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
+			    int node);
+
 static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
 {
 	pte_t *pte;
@@ -478,8 +481,8 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
  *	allocator with @gfp_mask flags.  Map them into contiguous
  *	kernel virtual space, using a pagetable protection of @prot.
  */
-void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
-			int node)
+static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
+			    int node)
 {
 	struct vm_struct *area;
 
@@ -493,7 +496,6 @@ void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
 
 	return __vmalloc_area_node(area, gfp_mask, prot, node);
 }
-EXPORT_SYMBOL(__vmalloc_node);
 
 void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
 {
-- 
cgit v1.2.3


From 91023300057e96de7f46e95166a3e02394ae72f9 Mon Sep 17 00:00:00 2001
From: keith mannthey <kmannth@us.ibm.com>
Date: Mon, 25 Sep 2006 23:31:03 -0700
Subject: [PATCH] convert i386 NUMA KVA space to bootmem

Address a long standing issue of booting with an initrd on an i386 numa
system.  Currently (and always) the numa kva area is mapped into low memory
by finding the end of low memory and moving that mark down (thus creating
space for the kva).  The issue with this is that Grub loads initrds into
this similar space so when the kernel check the initrd it finds it outside
max_low_pfn and disables it (it thinks the initrd is not mapped into usable
memory) thus initrd enabled kernels can't boot i386 numa :(

My solution to the problem just converts the numa kva area to use the
bootmem allocator to save it's area (instead of moving the end of low
memory).  Using bootmem allows the kva area to be mapped into more diverse
addresses (not just the end of low memory) and enables the kva area to be
mapped below the initrd if present.

I have tested this patch on numaq(no initrd) and summit(initrd) i386 numa
based systems.

[akpm@osdl.org: cleanups]
Signed-off-by: Keith Mannthey <kmannth@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/setup.c  |  3 ++-
 arch/i386/mm/discontig.c  | 29 +++++++++++++++++++++--------
 include/asm-i386/mmzone.h |  6 ++++++
 3 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index f1682206d30..27d4dc0d3ef 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -53,6 +53,7 @@
 #include <asm/apic.h>
 #include <asm/e820.h>
 #include <asm/mpspec.h>
+#include <asm/mmzone.h>
 #include <asm/setup.h>
 #include <asm/arch_hooks.h>
 #include <asm/sections.h>
@@ -1258,7 +1259,7 @@ void __init setup_bootmem_allocator(void)
 	 */
 	find_smp_config();
 #endif
-
+	numa_kva_reserve();
 #ifdef CONFIG_BLK_DEV_INITRD
 	if (LOADER_TYPE && INITRD_START) {
 		if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
diff --git a/arch/i386/mm/discontig.c b/arch/i386/mm/discontig.c
index 7c392dc553b..2e36eff8aff 100644
--- a/arch/i386/mm/discontig.c
+++ b/arch/i386/mm/discontig.c
@@ -117,7 +117,8 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
 
 void *node_remap_end_vaddr[MAX_NUMNODES];
 void *node_remap_alloc_vaddr[MAX_NUMNODES];
-
+static unsigned long kva_start_pfn;
+static unsigned long kva_pages;
 /*
  * FLAT - support for basic PC memory model with discontig enabled, essentially
  *        a single node with all available processors in it with a flat
@@ -286,7 +287,6 @@ unsigned long __init setup_memory(void)
 {
 	int nid;
 	unsigned long system_start_pfn, system_max_low_pfn;
-	unsigned long reserve_pages;
 
 	/*
 	 * When mapping a NUMA machine we allocate the node_mem_map arrays
@@ -298,14 +298,23 @@ unsigned long __init setup_memory(void)
 	find_max_pfn();
 	get_memcfg_numa();
 
-	reserve_pages = calculate_numa_remap_pages();
+	kva_pages = calculate_numa_remap_pages();
 
 	/* partially used pages are not usable - thus round upwards */
 	system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end);
 
-	system_max_low_pfn = max_low_pfn = find_max_low_pfn() - reserve_pages;
-	printk("reserve_pages = %ld find_max_low_pfn() ~ %ld\n",
-			reserve_pages, max_low_pfn + reserve_pages);
+	kva_start_pfn = find_max_low_pfn() - kva_pages;
+
+#ifdef CONFIG_BLK_DEV_INITRD
+	/* Numa kva area is below the initrd */
+	if (LOADER_TYPE && INITRD_START)
+		kva_start_pfn = PFN_DOWN(INITRD_START)  - kva_pages;
+#endif
+	kva_start_pfn -= kva_start_pfn & (PTRS_PER_PTE-1);
+
+	system_max_low_pfn = max_low_pfn = find_max_low_pfn();
+	printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n",
+		kva_start_pfn, max_low_pfn);
 	printk("max_pfn = %ld\n", max_pfn);
 #ifdef CONFIG_HIGHMEM
 	highstart_pfn = highend_pfn = max_pfn;
@@ -323,7 +332,7 @@ unsigned long __init setup_memory(void)
 			(ulong) pfn_to_kaddr(max_low_pfn));
 	for_each_online_node(nid) {
 		node_remap_start_vaddr[nid] = pfn_to_kaddr(
-				highstart_pfn + node_remap_offset[nid]);
+				kva_start_pfn + node_remap_offset[nid]);
 		/* Init the node remap allocator */
 		node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] +
 			(node_remap_size[nid] * PAGE_SIZE);
@@ -338,7 +347,6 @@ unsigned long __init setup_memory(void)
 	}
 	printk("High memory starts at vaddr %08lx\n",
 			(ulong) pfn_to_kaddr(highstart_pfn));
-	vmalloc_earlyreserve = reserve_pages * PAGE_SIZE;
 	for_each_online_node(nid)
 		find_max_pfn_node(nid);
 
@@ -348,6 +356,11 @@ unsigned long __init setup_memory(void)
 	return max_low_pfn;
 }
 
+void __init numa_kva_reserve(void)
+{
+	reserve_bootmem(PFN_PHYS(kva_start_pfn),PFN_PHYS(kva_pages));
+}
+
 void __init zone_sizes_init(void)
 {
 	int nid;
diff --git a/include/asm-i386/mmzone.h b/include/asm-i386/mmzone.h
index 22cb07cc8f3..61b07332200 100644
--- a/include/asm-i386/mmzone.h
+++ b/include/asm-i386/mmzone.h
@@ -38,10 +38,16 @@ static inline void get_memcfg_numa(void)
 }
 
 extern int early_pfn_to_nid(unsigned long pfn);
+extern void numa_kva_reserve(void);
 
 #else /* !CONFIG_NUMA */
+
 #define get_memcfg_numa get_memcfg_numa_flat
 #define get_zholes_size(n) (0)
+
+static inline void numa_kva_reserve(void)
+{
+}
 #endif /* CONFIG_NUMA */
 
 #ifdef CONFIG_DISCONTIGMEM
-- 
cgit v1.2.3


From 2d1a07d487d8b36658404839cdf03a974968cefd Mon Sep 17 00:00:00 2001
From: Franck Bui-Huu <vagabon.xyz@gmail.com>
Date: Mon, 25 Sep 2006 23:31:03 -0700
Subject: [PATCH] bootmem: remove useless __init in header file

__init in headers is pretty useless because the compiler doesn't check it, and
they get out of sync relatively frequently.  So if you see an __init in a
header file, it's quite unreliable and you need to check the definition
anyway.

Signed-off-by: Franck Bui-Huu <vagabon.xyz@gmail.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/bootmem.h | 46 +++++++++++++++++++++++-----------------------
 1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index e319c649e4f..c7124d4ce7c 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -41,23 +41,23 @@ typedef struct bootmem_data {
 	struct list_head list;
 } bootmem_data_t;
 
-extern unsigned long __init bootmem_bootmap_pages (unsigned long);
-extern unsigned long __init init_bootmem (unsigned long addr, unsigned long memend);
-extern void __init free_bootmem (unsigned long addr, unsigned long size);
-extern void * __init __alloc_bootmem (unsigned long size, unsigned long align, unsigned long goal);
-extern void * __init __alloc_bootmem_nopanic (unsigned long size, unsigned long align, unsigned long goal);
-extern void * __init __alloc_bootmem_low(unsigned long size,
+extern unsigned long bootmem_bootmap_pages (unsigned long);
+extern unsigned long init_bootmem (unsigned long addr, unsigned long memend);
+extern void free_bootmem (unsigned long addr, unsigned long size);
+extern void * __alloc_bootmem (unsigned long size, unsigned long align, unsigned long goal);
+extern void * __alloc_bootmem_nopanic (unsigned long size, unsigned long align, unsigned long goal);
+extern void * __alloc_bootmem_low(unsigned long size,
 					 unsigned long align,
 					 unsigned long goal);
-extern void * __init __alloc_bootmem_low_node(pg_data_t *pgdat,
+extern void * __alloc_bootmem_low_node(pg_data_t *pgdat,
 					      unsigned long size,
 					      unsigned long align,
 					      unsigned long goal);
-extern void * __init __alloc_bootmem_core(struct bootmem_data *bdata,
+extern void * __alloc_bootmem_core(struct bootmem_data *bdata,
 		unsigned long size, unsigned long align, unsigned long goal,
 		unsigned long limit);
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
-extern void __init reserve_bootmem (unsigned long addr, unsigned long size);
+extern void reserve_bootmem (unsigned long addr, unsigned long size);
 #define alloc_bootmem(x) \
 	__alloc_bootmem((x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_low(x) \
@@ -67,12 +67,12 @@ extern void __init reserve_bootmem (unsigned long addr, unsigned long size);
 #define alloc_bootmem_low_pages(x) \
 	__alloc_bootmem_low((x), PAGE_SIZE, 0)
 #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
-extern unsigned long __init free_all_bootmem (void);
-extern void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal);
-extern unsigned long __init init_bootmem_node (pg_data_t *pgdat, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn);
-extern void __init reserve_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size);
-extern void __init free_bootmem_node (pg_data_t *pgdat, unsigned long addr, unsigned long size);
-extern unsigned long __init free_all_bootmem_node (pg_data_t *pgdat);
+extern unsigned long free_all_bootmem (void);
+extern void * __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal);
+extern unsigned long init_bootmem_node (pg_data_t *pgdat, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn);
+extern void reserve_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size);
+extern void free_bootmem_node (pg_data_t *pgdat, unsigned long addr, unsigned long size);
+extern unsigned long free_all_bootmem_node (pg_data_t *pgdat);
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
 #define alloc_bootmem_node(pgdat, x) \
 	__alloc_bootmem_node((pgdat), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
@@ -94,14 +94,14 @@ static inline void *alloc_remap(int nid, unsigned long size)
 extern unsigned long __meminitdata nr_kernel_pages;
 extern unsigned long nr_all_pages;
 
-extern void *__init alloc_large_system_hash(const char *tablename,
-					    unsigned long bucketsize,
-					    unsigned long numentries,
-					    int scale,
-					    int flags,
-					    unsigned int *_hash_shift,
-					    unsigned int *_hash_mask,
-					    unsigned long limit);
+extern void * alloc_large_system_hash(const char *tablename,
+				      unsigned long bucketsize,
+				      unsigned long numentries,
+				      int scale,
+				      int flags,
+				      unsigned int *_hash_shift,
+				      unsigned int *_hash_mask,
+				      unsigned long limit);
 
 #define HASH_HIGHMEM	0x00000001	/* Consider highmem? */
 #define HASH_EARLY	0x00000002	/* Allocating during early boot? */
-- 
cgit v1.2.3


From 69d49e681d7c7ed864a1ba45efc1e78433df8b9a Mon Sep 17 00:00:00 2001
From: Franck Bui-Huu <vagabon.xyz@gmail.com>
Date: Mon, 25 Sep 2006 23:31:04 -0700
Subject: [PATCH] bootmem: mark link_bootmem() as part of the __init section

Signed-off-by: Franck Bui-Huu <vagabon.xyz@gmail.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/bootmem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/bootmem.c b/mm/bootmem.c
index 50353e0dac1..70f1528a3c8 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -54,7 +54,7 @@ unsigned long __init bootmem_bootmap_pages (unsigned long pages)
 /*
  * link bdata in order
  */
-static void link_bootmem(bootmem_data_t *bdata)
+static void __init link_bootmem(bootmem_data_t *bdata)
 {
 	bootmem_data_t *ent;
 	if (list_empty(&bdata_list)) {
-- 
cgit v1.2.3


From 71fb2e8f8753b66b1f4295aa264a2eb4e69381e8 Mon Sep 17 00:00:00 2001
From: Franck Bui-Huu <vagabon.xyz@gmail.com>
Date: Mon, 25 Sep 2006 23:31:05 -0700
Subject: [PATCH] bootmem: remove useless parentheses in bootmem header file

Signed-off-by: Franck Bui-Huu <vagabon.xyz@gmail.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/bootmem.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index c7124d4ce7c..fa2523f4762 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -59,13 +59,13 @@ extern void * __alloc_bootmem_core(struct bootmem_data *bdata,
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
 extern void reserve_bootmem (unsigned long addr, unsigned long size);
 #define alloc_bootmem(x) \
-	__alloc_bootmem((x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
+	__alloc_bootmem(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_low(x) \
-	__alloc_bootmem_low((x), SMP_CACHE_BYTES, 0)
+	__alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
 #define alloc_bootmem_pages(x) \
-	__alloc_bootmem((x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
+	__alloc_bootmem(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_low_pages(x) \
-	__alloc_bootmem_low((x), PAGE_SIZE, 0)
+	__alloc_bootmem_low(x, PAGE_SIZE, 0)
 #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
 extern unsigned long free_all_bootmem (void);
 extern void * __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal);
@@ -75,11 +75,11 @@ extern void free_bootmem_node (pg_data_t *pgdat, unsigned long addr, unsigned lo
 extern unsigned long free_all_bootmem_node (pg_data_t *pgdat);
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
 #define alloc_bootmem_node(pgdat, x) \
-	__alloc_bootmem_node((pgdat), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
+	__alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_pages_node(pgdat, x) \
-	__alloc_bootmem_node((pgdat), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
+	__alloc_bootmem_node(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_low_pages_node(pgdat, x) \
-	__alloc_bootmem_low_node((pgdat), (x), PAGE_SIZE, 0)
+	__alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0)
 #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
 
 #ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP
-- 
cgit v1.2.3


From bb0923a66820718f636736b22ce47372f79e3400 Mon Sep 17 00:00:00 2001
From: Franck Bui-Huu <vagabon.xyz@gmail.com>
Date: Mon, 25 Sep 2006 23:31:05 -0700
Subject: [PATCH] bootmem: limit to 80 columns width

Signed-off-by: Franck Bui-Huu <vagabon.xyz@gmail.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/bootmem.h | 42 +++++++++++++++++++++++++++++-------------
 mm/bootmem.c            | 28 ++++++++++++++++++----------
 2 files changed, 47 insertions(+), 23 deletions(-)

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index fa2523f4762..1e8dddfb308 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -44,18 +44,24 @@ typedef struct bootmem_data {
 extern unsigned long bootmem_bootmap_pages (unsigned long);
 extern unsigned long init_bootmem (unsigned long addr, unsigned long memend);
 extern void free_bootmem (unsigned long addr, unsigned long size);
-extern void * __alloc_bootmem (unsigned long size, unsigned long align, unsigned long goal);
-extern void * __alloc_bootmem_nopanic (unsigned long size, unsigned long align, unsigned long goal);
+extern void * __alloc_bootmem (unsigned long size,
+			       unsigned long align,
+			       unsigned long goal);
+extern void * __alloc_bootmem_nopanic (unsigned long size,
+				       unsigned long align,
+				       unsigned long goal);
 extern void * __alloc_bootmem_low(unsigned long size,
-					 unsigned long align,
-					 unsigned long goal);
+				  unsigned long align,
+				  unsigned long goal);
 extern void * __alloc_bootmem_low_node(pg_data_t *pgdat,
-					      unsigned long size,
-					      unsigned long align,
-					      unsigned long goal);
+				       unsigned long size,
+				       unsigned long align,
+				       unsigned long goal);
 extern void * __alloc_bootmem_core(struct bootmem_data *bdata,
-		unsigned long size, unsigned long align, unsigned long goal,
-		unsigned long limit);
+				   unsigned long size,
+				   unsigned long align,
+				   unsigned long goal,
+				   unsigned long limit);
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
 extern void reserve_bootmem (unsigned long addr, unsigned long size);
 #define alloc_bootmem(x) \
@@ -68,10 +74,20 @@ extern void reserve_bootmem (unsigned long addr, unsigned long size);
 	__alloc_bootmem_low(x, PAGE_SIZE, 0)
 #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
 extern unsigned long free_all_bootmem (void);
-extern void * __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal);
-extern unsigned long init_bootmem_node (pg_data_t *pgdat, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn);
-extern void reserve_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size);
-extern void free_bootmem_node (pg_data_t *pgdat, unsigned long addr, unsigned long size);
+extern void * __alloc_bootmem_node (pg_data_t *pgdat,
+				    unsigned long size,
+				    unsigned long align,
+				    unsigned long goal);
+extern unsigned long init_bootmem_node (pg_data_t *pgdat,
+					unsigned long freepfn,
+					unsigned long startpfn,
+					unsigned long endpfn);
+extern void reserve_bootmem_node (pg_data_t *pgdat,
+				  unsigned long physaddr,
+				  unsigned long size);
+extern void free_bootmem_node (pg_data_t *pgdat,
+			       unsigned long addr,
+			       unsigned long size);
 extern unsigned long free_all_bootmem_node (pg_data_t *pgdat);
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
 #define alloc_bootmem_node(pgdat, x) \
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 70f1528a3c8..9083f502967 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -102,7 +102,8 @@ static unsigned long __init init_bootmem_core (pg_data_t *pgdat,
  * might be used for boot-time allocations - or it might get added
  * to the free page pool later on.
  */
-static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size)
+static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
+					unsigned long size)
 {
 	unsigned long i;
 	/*
@@ -127,7 +128,8 @@ static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long add
 		}
 }
 
-static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size)
+static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
+				     unsigned long size)
 {
 	unsigned long i;
 	unsigned long start;
@@ -355,17 +357,20 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
 	return total;
 }
 
-unsigned long __init init_bootmem_node (pg_data_t *pgdat, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn)
+unsigned long __init init_bootmem_node (pg_data_t *pgdat, unsigned long freepfn,
+				unsigned long startpfn, unsigned long endpfn)
 {
 	return(init_bootmem_core(pgdat, freepfn, startpfn, endpfn));
 }
 
-void __init reserve_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size)
+void __init reserve_bootmem_node (pg_data_t *pgdat, unsigned long physaddr,
+				  unsigned long size)
 {
 	reserve_bootmem_core(pgdat->bdata, physaddr, size);
 }
 
-void __init free_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size)
+void __init free_bootmem_node (pg_data_t *pgdat, unsigned long physaddr,
+			       unsigned long size)
 {
 	free_bootmem_core(pgdat->bdata, physaddr, size);
 }
@@ -399,7 +404,8 @@ unsigned long __init free_all_bootmem (void)
 	return(free_all_bootmem_core(NODE_DATA(0)));
 }
 
-void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal)
+void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
+				      unsigned long goal)
 {
 	bootmem_data_t *bdata;
 	void *ptr;
@@ -410,7 +416,8 @@ void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, u
 	return NULL;
 }
 
-void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal)
+void * __init __alloc_bootmem(unsigned long size, unsigned long align,
+			      unsigned long goal)
 {
 	void *mem = __alloc_bootmem_nopanic(size,align,goal);
 	if (mem)
@@ -424,8 +431,8 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned
 }
 
 
-void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align,
-				   unsigned long goal)
+void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
+				   unsigned long align, unsigned long goal)
 {
 	void *ptr;
 
@@ -438,7 +445,8 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigne
 
 #define LOW32LIMIT 0xffffffff
 
-void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal)
+void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
+				  unsigned long goal)
 {
 	bootmem_data_t *bdata;
 	void *ptr;
-- 
cgit v1.2.3


From e786e86a542ccc1133f333402526ad00b9c088ae Mon Sep 17 00:00:00 2001
From: Franck Bui-Huu <vagabon.xyz@gmail.com>
Date: Mon, 25 Sep 2006 23:31:06 -0700
Subject: [PATCH] bootmem: remove useless headers inclusions

Signed-off-by: Franck Bui-Huu <vagabon.xyz@gmail.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/bootmem.h |  5 +----
 mm/bootmem.c            | 10 +++-------
 2 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 1e8dddfb308..b2067e4a448 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -4,11 +4,8 @@
 #ifndef _LINUX_BOOTMEM_H
 #define _LINUX_BOOTMEM_H
 
-#include <asm/pgtable.h>
-#include <asm/dma.h>
-#include <linux/cache.h>
-#include <linux/init.h>
 #include <linux/mmzone.h>
+#include <asm/dma.h>
 
 /*
  *  simple boot-time physical memory area allocator.
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 9083f502967..e136c086fd9 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -8,17 +8,13 @@
  *  free memory collector. It's used to deal with reserved
  *  system memory and memory holes as well.
  */
-
-#include <linux/mm.h>
-#include <linux/kernel_stat.h>
-#include <linux/swap.h>
-#include <linux/interrupt.h>
 #include <linux/init.h>
 #include <linux/bootmem.h>
-#include <linux/mmzone.h>
 #include <linux/module.h>
-#include <asm/dma.h>
+
+#include <asm/bug.h>
 #include <asm/io.h>
+
 #include "internal.h"
 
 /*
-- 
cgit v1.2.3


From bbc7b92e337ac349ca917f9bf0b6be4743c14f3a Mon Sep 17 00:00:00 2001
From: Franck Bui-Huu <vagabon.xyz@gmail.com>
Date: Mon, 25 Sep 2006 23:31:07 -0700
Subject: [PATCH] bootmem: use pfn/page conversion macros

It also creates get_mapsize() helper in order to make the code more readable
when it calculates the boot bitmap size.

Signed-off-by: Franck Bui-Huu <vagabon.xyz@gmail.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/bootmem.c | 82 +++++++++++++++++++++++++++++++++---------------------------
 1 file changed, 45 insertions(+), 37 deletions(-)

diff --git a/mm/bootmem.c b/mm/bootmem.c
index e136c086fd9..23c3317c09e 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -9,6 +9,7 @@
  *  system memory and memory holes as well.
  */
 #include <linux/init.h>
+#include <linux/pfn.h>
 #include <linux/bootmem.h>
 #include <linux/module.h>
 
@@ -68,6 +69,18 @@ static void __init link_bootmem(bootmem_data_t *bdata)
 	return;
 }
 
+/*
+ * Given an initialised bdata, it returns the size of the boot bitmap
+ */
+static unsigned long __init get_mapsize(bootmem_data_t *bdata)
+{
+	unsigned long mapsize;
+	unsigned long start = PFN_DOWN(bdata->node_boot_start);
+	unsigned long end = bdata->node_low_pfn;
+
+	mapsize = ((end - start) + 7) / 8;
+	return ALIGN(mapsize, sizeof(long));
+}
 
 /*
  * Called once to set up the allocator itself.
@@ -76,11 +89,10 @@ static unsigned long __init init_bootmem_core (pg_data_t *pgdat,
 	unsigned long mapstart, unsigned long start, unsigned long end)
 {
 	bootmem_data_t *bdata = pgdat->bdata;
-	unsigned long mapsize = ((end - start)+7)/8;
+	unsigned long mapsize;
 
-	mapsize = ALIGN(mapsize, sizeof(long));
-	bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT);
-	bdata->node_boot_start = (start << PAGE_SHIFT);
+	bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart));
+	bdata->node_boot_start = PFN_PHYS(start);
 	bdata->node_low_pfn = end;
 	link_bootmem(bdata);
 
@@ -88,6 +100,7 @@ static unsigned long __init init_bootmem_core (pg_data_t *pgdat,
 	 * Initially all pages are reserved - setup_arch() has to
 	 * register free RAM areas explicitly.
 	 */
+	mapsize = get_mapsize(bdata);
 	memset(bdata->node_bootmem_map, 0xff, mapsize);
 
 	return mapsize;
@@ -101,20 +114,19 @@ static unsigned long __init init_bootmem_core (pg_data_t *pgdat,
 static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
 					unsigned long size)
 {
+	unsigned long sidx, eidx;
 	unsigned long i;
+
 	/*
 	 * round up, partially reserved pages are considered
 	 * fully reserved.
 	 */
-	unsigned long sidx = (addr - bdata->node_boot_start)/PAGE_SIZE;
-	unsigned long eidx = (addr + size - bdata->node_boot_start + 
-							PAGE_SIZE-1)/PAGE_SIZE;
-	unsigned long end = (addr + size + PAGE_SIZE-1)/PAGE_SIZE;
-
 	BUG_ON(!size);
-	BUG_ON(sidx >= eidx);
-	BUG_ON((addr >> PAGE_SHIFT) >= bdata->node_low_pfn);
-	BUG_ON(end > bdata->node_low_pfn);
+	BUG_ON(PFN_DOWN(addr) >= bdata->node_low_pfn);
+	BUG_ON(PFN_UP(addr + size) > bdata->node_low_pfn);
+
+	sidx = PFN_DOWN(addr - bdata->node_boot_start);
+	eidx = PFN_UP(addr + size - bdata->node_boot_start);
 
 	for (i = sidx; i < eidx; i++)
 		if (test_and_set_bit(i, bdata->node_bootmem_map)) {
@@ -127,18 +139,15 @@ static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long add
 static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
 				     unsigned long size)
 {
+	unsigned long sidx, eidx;
 	unsigned long i;
-	unsigned long start;
+
 	/*
 	 * round down end of usable mem, partially free pages are
 	 * considered reserved.
 	 */
-	unsigned long sidx;
-	unsigned long eidx = (addr + size - bdata->node_boot_start)/PAGE_SIZE;
-	unsigned long end = (addr + size)/PAGE_SIZE;
-
 	BUG_ON(!size);
-	BUG_ON(end > bdata->node_low_pfn);
+	BUG_ON(PFN_DOWN(addr + size) > bdata->node_low_pfn);
 
 	if (addr < bdata->last_success)
 		bdata->last_success = addr;
@@ -146,8 +155,8 @@ static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
 	/*
 	 * Round up the beginning of the address.
 	 */
-	start = (addr + PAGE_SIZE-1) / PAGE_SIZE;
-	sidx = start - (bdata->node_boot_start/PAGE_SIZE);
+	sidx = PFN_UP(addr) - PFN_DOWN(bdata->node_boot_start);
+	eidx = PFN_DOWN(addr + size - bdata->node_boot_start);
 
 	for (i = sidx; i < eidx; i++) {
 		if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map)))
@@ -173,7 +182,7 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
 	      unsigned long align, unsigned long goal, unsigned long limit)
 {
 	unsigned long offset, remaining_size, areasize, preferred;
-	unsigned long i, start = 0, incr, eidx, end_pfn = bdata->node_low_pfn;
+	unsigned long i, start = 0, incr, eidx, end_pfn;
 	void *ret;
 
 	if(!size) {
@@ -185,23 +194,22 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
 	if (limit && bdata->node_boot_start >= limit)
 		return NULL;
 
-        limit >>=PAGE_SHIFT;
+	end_pfn = bdata->node_low_pfn;
+	limit = PFN_DOWN(limit);
 	if (limit && end_pfn > limit)
 		end_pfn = limit;
 
-	eidx = end_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
+	eidx = end_pfn - PFN_DOWN(bdata->node_boot_start);
 	offset = 0;
-	if (align &&
-	    (bdata->node_boot_start & (align - 1UL)) != 0)
-		offset = (align - (bdata->node_boot_start & (align - 1UL)));
-	offset >>= PAGE_SHIFT;
+	if (align && (bdata->node_boot_start & (align - 1UL)) != 0)
+		offset = align - (bdata->node_boot_start & (align - 1UL));
+	offset = PFN_DOWN(offset);
 
 	/*
 	 * We try to allocate bootmem pages above 'goal'
 	 * first, then we try to allocate lower pages.
 	 */
-	if (goal && (goal >= bdata->node_boot_start) && 
-	    ((goal >> PAGE_SHIFT) < end_pfn)) {
+	if (goal && goal >= bdata->node_boot_start && PFN_DOWN(goal) < end_pfn) {
 		preferred = goal - bdata->node_boot_start;
 
 		if (bdata->last_success >= preferred)
@@ -210,9 +218,8 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
 	} else
 		preferred = 0;
 
-	preferred = ALIGN(preferred, align) >> PAGE_SHIFT;
-	preferred += offset;
-	areasize = (size+PAGE_SIZE-1)/PAGE_SIZE;
+	preferred = PFN_DOWN(ALIGN(preferred, align)) + offset;
+	areasize = (size + PAGE_SIZE-1) / PAGE_SIZE;
 	incr = align >> PAGE_SHIFT ? : 1;
 
 restart_scan:
@@ -243,7 +250,7 @@ restart_scan:
 	return NULL;
 
 found:
-	bdata->last_success = start << PAGE_SHIFT;
+	bdata->last_success = PFN_PHYS(start);
 	BUG_ON(start >= eidx);
 
 	/*
@@ -301,8 +308,8 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
 
 	count = 0;
 	/* first extant page of the node */
-	pfn = bdata->node_boot_start >> PAGE_SHIFT;
-	idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
+	pfn = PFN_DOWN(bdata->node_boot_start);
+	idx = bdata->node_low_pfn - pfn;
 	map = bdata->node_bootmem_map;
 	/* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */
 	if (bdata->node_boot_start == 0 ||
@@ -343,9 +350,10 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
 	 */
 	page = virt_to_page(bdata->node_bootmem_map);
 	count = 0;
-	for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) {
-		count++;
+	idx = (get_mapsize(bdata) + PAGE_SIZE-1) >> PAGE_SHIFT;
+	for (i = 0; i < idx; i++, page++) {
 		__free_pages_bootmem(page, 0);
+		count++;
 	}
 	total += count;
 	bdata->node_bootmem_map = NULL;
-- 
cgit v1.2.3


From f71bf0cac730ccb5ebcdf21747db75ae0445ccde Mon Sep 17 00:00:00 2001
From: Franck Bui-Huu <vagabon.xyz@gmail.com>
Date: Mon, 25 Sep 2006 23:31:08 -0700
Subject: [PATCH] bootmem: miscellaneous coding style fixes

It fixes various coding style issues, specially when spaces are useless.  For
example '*' go next to the function name.

Signed-off-by: Franck Bui-Huu <vagabon.xyz@gmail.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/bootmem.h | 95 +++++++++++++++++++++++++------------------------
 mm/bootmem.c            | 81 ++++++++++++++++++++++-------------------
 2 files changed, 93 insertions(+), 83 deletions(-)

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index b2067e4a448..31e9abb6d97 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -38,29 +38,30 @@ typedef struct bootmem_data {
 	struct list_head list;
 } bootmem_data_t;
 
-extern unsigned long bootmem_bootmap_pages (unsigned long);
-extern unsigned long init_bootmem (unsigned long addr, unsigned long memend);
-extern void free_bootmem (unsigned long addr, unsigned long size);
-extern void * __alloc_bootmem (unsigned long size,
-			       unsigned long align,
-			       unsigned long goal);
-extern void * __alloc_bootmem_nopanic (unsigned long size,
-				       unsigned long align,
-				       unsigned long goal);
-extern void * __alloc_bootmem_low(unsigned long size,
+extern unsigned long bootmem_bootmap_pages(unsigned long);
+extern unsigned long init_bootmem(unsigned long addr, unsigned long memend);
+extern void free_bootmem(unsigned long addr, unsigned long size);
+extern void *__alloc_bootmem(unsigned long size,
+			     unsigned long align,
+			     unsigned long goal);
+extern void *__alloc_bootmem_nopanic(unsigned long size,
+				     unsigned long align,
+				     unsigned long goal);
+extern void *__alloc_bootmem_low(unsigned long size,
+				 unsigned long align,
+				 unsigned long goal);
+extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
+				      unsigned long size,
+				      unsigned long align,
+				      unsigned long goal);
+extern void *__alloc_bootmem_core(struct bootmem_data *bdata,
+				  unsigned long size,
 				  unsigned long align,
-				  unsigned long goal);
-extern void * __alloc_bootmem_low_node(pg_data_t *pgdat,
-				       unsigned long size,
-				       unsigned long align,
-				       unsigned long goal);
-extern void * __alloc_bootmem_core(struct bootmem_data *bdata,
-				   unsigned long size,
-				   unsigned long align,
-				   unsigned long goal,
-				   unsigned long limit);
+				  unsigned long goal,
+				  unsigned long limit);
+
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
-extern void reserve_bootmem (unsigned long addr, unsigned long size);
+extern void reserve_bootmem(unsigned long addr, unsigned long size);
 #define alloc_bootmem(x) \
 	__alloc_bootmem(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_low(x) \
@@ -70,22 +71,24 @@ extern void reserve_bootmem (unsigned long addr, unsigned long size);
 #define alloc_bootmem_low_pages(x) \
 	__alloc_bootmem_low(x, PAGE_SIZE, 0)
 #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
-extern unsigned long free_all_bootmem (void);
-extern void * __alloc_bootmem_node (pg_data_t *pgdat,
-				    unsigned long size,
-				    unsigned long align,
-				    unsigned long goal);
-extern unsigned long init_bootmem_node (pg_data_t *pgdat,
-					unsigned long freepfn,
-					unsigned long startpfn,
-					unsigned long endpfn);
-extern void reserve_bootmem_node (pg_data_t *pgdat,
-				  unsigned long physaddr,
-				  unsigned long size);
-extern void free_bootmem_node (pg_data_t *pgdat,
-			       unsigned long addr,
-			       unsigned long size);
-extern unsigned long free_all_bootmem_node (pg_data_t *pgdat);
+
+extern unsigned long free_all_bootmem(void);
+extern unsigned long free_all_bootmem_node(pg_data_t *pgdat);
+extern void *__alloc_bootmem_node(pg_data_t *pgdat,
+				  unsigned long size,
+				  unsigned long align,
+				  unsigned long goal);
+extern unsigned long init_bootmem_node(pg_data_t *pgdat,
+				       unsigned long freepfn,
+				       unsigned long startpfn,
+				       unsigned long endpfn);
+extern void reserve_bootmem_node(pg_data_t *pgdat,
+				 unsigned long physaddr,
+				 unsigned long size);
+extern void free_bootmem_node(pg_data_t *pgdat,
+			      unsigned long addr,
+			      unsigned long size);
+
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
 #define alloc_bootmem_node(pgdat, x) \
 	__alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
@@ -102,19 +105,19 @@ static inline void *alloc_remap(int nid, unsigned long size)
 {
 	return NULL;
 }
-#endif
+#endif /* CONFIG_HAVE_ARCH_ALLOC_REMAP */
 
 extern unsigned long __meminitdata nr_kernel_pages;
 extern unsigned long nr_all_pages;
 
-extern void * alloc_large_system_hash(const char *tablename,
-				      unsigned long bucketsize,
-				      unsigned long numentries,
-				      int scale,
-				      int flags,
-				      unsigned int *_hash_shift,
-				      unsigned int *_hash_mask,
-				      unsigned long limit);
+extern void *alloc_large_system_hash(const char *tablename,
+				     unsigned long bucketsize,
+				     unsigned long numentries,
+				     int scale,
+				     int flags,
+				     unsigned int *_hash_shift,
+				     unsigned int *_hash_mask,
+				     unsigned long limit);
 
 #define HASH_HIGHMEM	0x00000001	/* Consider highmem? */
 #define HASH_EARLY	0x00000002	/* Allocating during early boot? */
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 23c3317c09e..f0f85fa713c 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -38,7 +38,7 @@ unsigned long saved_max_pfn;
 #endif
 
 /* return the number of _pages_ that will be allocated for the boot bitmap */
-unsigned long __init bootmem_bootmap_pages (unsigned long pages)
+unsigned long __init bootmem_bootmap_pages(unsigned long pages)
 {
 	unsigned long mapsize;
 
@@ -48,12 +48,14 @@ unsigned long __init bootmem_bootmap_pages (unsigned long pages)
 
 	return mapsize;
 }
+
 /*
  * link bdata in order
  */
 static void __init link_bootmem(bootmem_data_t *bdata)
 {
 	bootmem_data_t *ent;
+
 	if (list_empty(&bdata_list)) {
 		list_add(&bdata->list, &bdata_list);
 		return;
@@ -66,7 +68,6 @@ static void __init link_bootmem(bootmem_data_t *bdata)
 		}
 	}
 	list_add_tail(&bdata->list, &bdata_list);
-	return;
 }
 
 /*
@@ -85,7 +86,7 @@ static unsigned long __init get_mapsize(bootmem_data_t *bdata)
 /*
  * Called once to set up the allocator itself.
  */
-static unsigned long __init init_bootmem_core (pg_data_t *pgdat,
+static unsigned long __init init_bootmem_core(pg_data_t *pgdat,
 	unsigned long mapstart, unsigned long start, unsigned long end)
 {
 	bootmem_data_t *bdata = pgdat->bdata;
@@ -185,7 +186,7 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
 	unsigned long i, start = 0, incr, eidx, end_pfn;
 	void *ret;
 
-	if(!size) {
+	if (!size) {
 		printk("__alloc_bootmem_core(): zero-sized request\n");
 		BUG();
 	}
@@ -234,7 +235,7 @@ restart_scan:
 		for (j = i + 1; j < i + areasize; ++j) {
 			if (j >= eidx)
 				goto fail_block;
-			if (test_bit (j, bdata->node_bootmem_map))
+			if (test_bit(j, bdata->node_bootmem_map))
 				goto fail_block;
 		}
 		start = i;
@@ -262,19 +263,21 @@ found:
 	    bdata->last_offset && bdata->last_pos+1 == start) {
 		offset = ALIGN(bdata->last_offset, align);
 		BUG_ON(offset > PAGE_SIZE);
-		remaining_size = PAGE_SIZE-offset;
+		remaining_size = PAGE_SIZE - offset;
 		if (size < remaining_size) {
 			areasize = 0;
 			/* last_pos unchanged */
-			bdata->last_offset = offset+size;
-			ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset +
-						bdata->node_boot_start);
+			bdata->last_offset = offset + size;
+			ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
+					   offset +
+					   bdata->node_boot_start);
 		} else {
 			remaining_size = size - remaining_size;
-			areasize = (remaining_size+PAGE_SIZE-1)/PAGE_SIZE;
-			ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset +
-						bdata->node_boot_start);
-			bdata->last_pos = start+areasize-1;
+			areasize = (remaining_size + PAGE_SIZE-1) / PAGE_SIZE;
+			ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
+					   offset +
+					   bdata->node_boot_start);
+			bdata->last_pos = start + areasize - 1;
 			bdata->last_offset = remaining_size;
 		}
 		bdata->last_offset &= ~PAGE_MASK;
@@ -287,7 +290,7 @@ found:
 	/*
 	 * Reserve the area now:
 	 */
-	for (i = start; i < start+areasize; i++)
+	for (i = start; i < start + areasize; i++)
 		if (unlikely(test_and_set_bit(i, bdata->node_bootmem_map)))
 			BUG();
 	memset(ret, 0, size);
@@ -338,7 +341,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
 				}
 			}
 		} else {
-			i+=BITS_PER_LONG;
+			i += BITS_PER_LONG;
 		}
 		pfn += BITS_PER_LONG;
 	}
@@ -361,51 +364,51 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
 	return total;
 }
 
-unsigned long __init init_bootmem_node (pg_data_t *pgdat, unsigned long freepfn,
+unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
 				unsigned long startpfn, unsigned long endpfn)
 {
-	return(init_bootmem_core(pgdat, freepfn, startpfn, endpfn));
+	return init_bootmem_core(pgdat, freepfn, startpfn, endpfn);
 }
 
-void __init reserve_bootmem_node (pg_data_t *pgdat, unsigned long physaddr,
-				  unsigned long size)
+void __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
+				 unsigned long size)
 {
 	reserve_bootmem_core(pgdat->bdata, physaddr, size);
 }
 
-void __init free_bootmem_node (pg_data_t *pgdat, unsigned long physaddr,
-			       unsigned long size)
+void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
+			      unsigned long size)
 {
 	free_bootmem_core(pgdat->bdata, physaddr, size);
 }
 
-unsigned long __init free_all_bootmem_node (pg_data_t *pgdat)
+unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
 {
-	return(free_all_bootmem_core(pgdat));
+	return free_all_bootmem_core(pgdat);
 }
 
-unsigned long __init init_bootmem (unsigned long start, unsigned long pages)
+unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
 {
 	max_low_pfn = pages;
 	min_low_pfn = start;
-	return(init_bootmem_core(NODE_DATA(0), start, 0, pages));
+	return init_bootmem_core(NODE_DATA(0), start, 0, pages);
 }
 
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
-void __init reserve_bootmem (unsigned long addr, unsigned long size)
+void __init reserve_bootmem(unsigned long addr, unsigned long size)
 {
 	reserve_bootmem_core(NODE_DATA(0)->bdata, addr, size);
 }
 #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
 
-void __init free_bootmem (unsigned long addr, unsigned long size)
+void __init free_bootmem(unsigned long addr, unsigned long size)
 {
 	free_bootmem_core(NODE_DATA(0)->bdata, addr, size);
 }
 
-unsigned long __init free_all_bootmem (void)
+unsigned long __init free_all_bootmem(void)
 {
-	return(free_all_bootmem_core(NODE_DATA(0)));
+	return free_all_bootmem_core(NODE_DATA(0));
 }
 
 void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
@@ -414,9 +417,11 @@ void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
 	bootmem_data_t *bdata;
 	void *ptr;
 
-	list_for_each_entry(bdata, &bdata_list, list)
-		if ((ptr = __alloc_bootmem_core(bdata, size, align, goal, 0)))
-			return(ptr);
+	list_for_each_entry(bdata, &bdata_list, list) {
+		ptr = __alloc_bootmem_core(bdata, size, align, goal, 0);
+		if (ptr)
+			return ptr;
+	}
 	return NULL;
 }
 
@@ -424,6 +429,7 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
 			      unsigned long goal)
 {
 	void *mem = __alloc_bootmem_nopanic(size,align,goal);
+
 	if (mem)
 		return mem;
 	/*
@@ -442,7 +448,7 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
 
 	ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
 	if (ptr)
-		return (ptr);
+		return ptr;
 
 	return __alloc_bootmem(size, align, goal);
 }
@@ -455,10 +461,11 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
 	bootmem_data_t *bdata;
 	void *ptr;
 
-	list_for_each_entry(bdata, &bdata_list, list)
-		if ((ptr = __alloc_bootmem_core(bdata, size,
-						 align, goal, LOW32LIMIT)))
-			return(ptr);
+	list_for_each_entry(bdata, &bdata_list, list) {
+		ptr = __alloc_bootmem_core(bdata, size, align, goal, LOW32LIMIT);
+		if (ptr)
+			return ptr;
+	}
 
 	/*
 	 * Whoops, we cannot satisfy the allocation request.
-- 
cgit v1.2.3


From 776ed98b842ee8551793f842fe028c8091f3633e Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:09 -0700
Subject: [PATCH] reduce MAX_NR_ZONES: remove two strange uses of MAX_NR_ZONES

I keep seeing zones on various platforms that are never used and wonder why we
compile support for them into the kernel.  Counters show up for HIGHMEM and
DMA32 that are alway zero.

This patch allows the removal of ZONE_DMA32 for non x86_64 architectures and
it will get rid of ZONE_HIGHMEM for arches not using highmem (like 64 bit
architectures).  If an arch does not define CONFIG_HIGHMEM then ZONE_HIGHMEM
will not be defined.  Similarly if an arch does not define CONFIG_ZONE_DMA32
then ZONE_DMA32 will not be defined.

No current architecture uses all the 4 zones (DMA,DMA32,NORMAL,HIGH) that we
have now.  The patchset will reduce the number of zones for all platforms.

On many platforms that do not have DMA32 or HIGHMEM this will reduce the
number of zones by 50%.  F.e.  ia64 only uses DMA and NORMAL.

Large amounts of memory can be saved for larger systemss that may have a few
hundred NUMA nodes.

With ZONE_DMA32 and ZONE_HIGHMEM support optional MAX_NR_ZONES will be 2 for
many non i386 platforms and even for i386 without CONFIG_HIGHMEM set.

Tested on ia64, x86_64 and on i386 with and without highmem.

The patchset consists of 11 patches that are following this message.

One could go even further than this patchset and also make ZONE_DMA optional
because some platforms do not need a separate DMA zone and can do DMA to all
of memory.  This could reduce MAX_NR_ZONES to 1.  Such a patchset will
hopefully follow soon.

This patch:

Fix strange uses of MAX_NR_ZONES

Sometimes we use MAX_NR_ZONES - x to refer to a zone.  Make that explicit.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/mm/init.c   | 2 +-
 arch/x86_64/mm/init.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index 89e8486aac3..353453d550a 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -657,7 +657,7 @@ void __init mem_init(void)
 int arch_add_memory(int nid, u64 start, u64 size)
 {
 	struct pglist_data *pgdata = &contig_page_data;
-	struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1;
+	struct zone *zone = pgdata->node_zones + ZONE_HIGHMEM;
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index d14fb2dfbfc..52fd42c40c8 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -536,7 +536,7 @@ int memory_add_physaddr_to_nid(u64 start)
 int arch_add_memory(int nid, u64 start, u64 size)
 {
 	struct pglist_data *pgdat = NODE_DATA(nid);
-	struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2;
+	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 	int ret;
-- 
cgit v1.2.3


From f06a96844a577c43249fce25809a4fae07407f46 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:10 -0700
Subject: [PATCH] reduce MAX_NR_ZONES: fix MAX_NR_ZONES array initializations

Fix array initialization in lots of arches

The number of zones may now be reduced from 4 to 2 for many arches.  Fix the
array initialization for the zones array for all architectures so that it is
not initializing a fixed number of elements.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/alpha/mm/init.c             | 2 +-
 arch/frv/mm/init.c               | 2 +-
 arch/h8300/mm/init.c             | 2 +-
 arch/i386/kernel/setup.c         | 2 +-
 arch/i386/mm/discontig.c         | 2 +-
 arch/m32r/mm/init.c              | 2 +-
 arch/m68knommu/mm/init.c         | 2 +-
 arch/mips/mm/init.c              | 4 ++--
 arch/mips/sgi-ip27/ip27-memory.c | 2 +-
 arch/parisc/mm/init.c            | 2 +-
 arch/sh64/mm/init.c              | 2 +-
 11 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c
index 917dad1b74c..550f4907d61 100644
--- a/arch/alpha/mm/init.c
+++ b/arch/alpha/mm/init.c
@@ -270,7 +270,7 @@ callback_init(void * kernel_end)
 void
 paging_init(void)
 {
-	unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
+	unsigned long zones_size[MAX_NR_ZONES] = {0, };
 	unsigned long dma_pfn, high_pfn;
 
 	dma_pfn = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
diff --git a/arch/frv/mm/init.c b/arch/frv/mm/init.c
index b5b4286f9dd..3f3a0ed3539 100644
--- a/arch/frv/mm/init.c
+++ b/arch/frv/mm/init.c
@@ -98,7 +98,7 @@ void show_mem(void)
  */
 void __init paging_init(void)
 {
-	unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
+	unsigned long zones_size[MAX_NR_ZONES] = {0, };
 
 	/* allocate some pages for kernel housekeeping tasks */
 	empty_bad_page_table	= (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
diff --git a/arch/h8300/mm/init.c b/arch/h8300/mm/init.c
index d3d40bdc2d6..e4f4199f97a 100644
--- a/arch/h8300/mm/init.c
+++ b/arch/h8300/mm/init.c
@@ -138,7 +138,7 @@ void paging_init(void)
 #endif
 
 	{
-		unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
+		unsigned long zones_size[MAX_NR_ZONES] = {0, };
 
 		zones_size[ZONE_DMA]     = 0 >> PAGE_SHIFT;
 		zones_size[ZONE_NORMAL]  = (end_mem - PAGE_OFFSET) >> PAGE_SHIFT;
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index 27d4dc0d3ef..060c68004be 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -1182,7 +1182,7 @@ static unsigned long __init setup_memory(void)
 
 void __init zone_sizes_init(void)
 {
-	unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
+	unsigned long zones_size[MAX_NR_ZONES] = { 0, };
 	unsigned int max_dma, low;
 
 	max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
diff --git a/arch/i386/mm/discontig.c b/arch/i386/mm/discontig.c
index 2e36eff8aff..07c300f9376 100644
--- a/arch/i386/mm/discontig.c
+++ b/arch/i386/mm/discontig.c
@@ -367,7 +367,7 @@ void __init zone_sizes_init(void)
 
 
 	for_each_online_node(nid) {
-		unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
+		unsigned long zones_size[MAX_NR_ZONES] = {0, };
 		unsigned long *zholes_size;
 		unsigned int max_dma;
 
diff --git a/arch/m32r/mm/init.c b/arch/m32r/mm/init.c
index b71348fec1f..bbd97c85bc5 100644
--- a/arch/m32r/mm/init.c
+++ b/arch/m32r/mm/init.c
@@ -100,7 +100,7 @@ void free_initrd_mem(unsigned long, unsigned long);
 #ifndef CONFIG_DISCONTIGMEM
 unsigned long __init zone_sizes_init(void)
 {
-	unsigned long  zones_size[MAX_NR_ZONES] = {0, 0, 0};
+	unsigned long  zones_size[MAX_NR_ZONES] = {0, };
 	unsigned long  max_dma;
 	unsigned long  low;
 	unsigned long  start_pfn;
diff --git a/arch/m68knommu/mm/init.c b/arch/m68knommu/mm/init.c
index e4c233eef19..06e538d1be3 100644
--- a/arch/m68knommu/mm/init.c
+++ b/arch/m68knommu/mm/init.c
@@ -136,7 +136,7 @@ void paging_init(void)
 #endif
 
 	{
-		unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
+		unsigned long zones_size[MAX_NR_ZONES] = {0, };
 
 		zones_size[ZONE_DMA] = 0 >> PAGE_SHIFT;
 		zones_size[ZONE_NORMAL] = (end_mem - PAGE_OFFSET) >> PAGE_SHIFT;
diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c
index c52497bb102..5b06349af2d 100644
--- a/arch/mips/mm/init.c
+++ b/arch/mips/mm/init.c
@@ -163,10 +163,10 @@ static int __init page_is_ram(unsigned long pagenr)
 
 void __init paging_init(void)
 {
-	unsigned long zones_size[] = { [0 ... MAX_NR_ZONES - 1] = 0 };
+	unsigned long zones_size[] = { 0, };
 	unsigned long max_dma, high, low;
 #ifndef CONFIG_FLATMEM
-	unsigned long zholes_size[] = { [0 ... MAX_NR_ZONES - 1] = 0 };
+	unsigned long zholes_size[] = { 0, };
 	unsigned long i, j, pfn;
 #endif
 
diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c
index efe6971fc80..59bfc0fc3f4 100644
--- a/arch/mips/sgi-ip27/ip27-memory.c
+++ b/arch/mips/sgi-ip27/ip27-memory.c
@@ -508,7 +508,7 @@ extern unsigned long setup_zero_pages(void);
 
 void __init paging_init(void)
 {
-	unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
+	unsigned long zones_size[MAX_NR_ZONES] = {0, };
 	unsigned node;
 
 	pagetable_init();
diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c
index f2b96f1e0da..c7329615ef9 100644
--- a/arch/parisc/mm/init.c
+++ b/arch/parisc/mm/init.c
@@ -809,7 +809,7 @@ void __init paging_init(void)
 	flush_tlb_all_local(NULL);
 
 	for (i = 0; i < npmem_ranges; i++) {
-		unsigned long zones_size[MAX_NR_ZONES] = { 0, 0, 0 };
+		unsigned long zones_size[MAX_NR_ZONES] = { 0, };
 
 		/* We have an IOMMU, so all memory can go into a single
 		   ZONE_DMA zone. */
diff --git a/arch/sh64/mm/init.c b/arch/sh64/mm/init.c
index 1169757fb38..83295bd21aa 100644
--- a/arch/sh64/mm/init.c
+++ b/arch/sh64/mm/init.c
@@ -110,7 +110,7 @@ void show_mem(void)
  */
 void __init paging_init(void)
 {
-	unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
+	unsigned long zones_size[MAX_NR_ZONES] = {0, };
 
 	pgd_init((unsigned long)swapper_pg_dir);
 	pgd_init((unsigned long)swapper_pg_dir +
-- 
cgit v1.2.3


From 182e8e237349e7b6354f45aee4780b6423fd6a50 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:10 -0700
Subject: [PATCH] reduce MAX_NR_ZONES: make display of highmem counters
 conditional on CONFIG_HIGHMEM

Do not display HIGHMEM memory sizes if CONFIG_HIGHMEM is not set.

Make HIGHMEM dependent texts and make display of highmem counters optional

Some texts are depending on CONFIG_HIGHMEM.

Remove those strings and remove the display of highmem counter values if
CONFIG_HIGHMEM is not set.

[akpm@osdl.org: remove some ifdefs]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/base/node.c | 4 ++++
 fs/proc/proc_misc.c | 4 ++++
 mm/page_alloc.c     | 4 ----
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index e9b0957f15d..e09f5c2c11e 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -54,10 +54,12 @@ static ssize_t node_read_meminfo(struct sys_device * dev, char * buf)
 		       "Node %d MemUsed:      %8lu kB\n"
 		       "Node %d Active:       %8lu kB\n"
 		       "Node %d Inactive:     %8lu kB\n"
+#ifdef CONFIG_HIGHMEM
 		       "Node %d HighTotal:    %8lu kB\n"
 		       "Node %d HighFree:     %8lu kB\n"
 		       "Node %d LowTotal:     %8lu kB\n"
 		       "Node %d LowFree:      %8lu kB\n"
+#endif
 		       "Node %d Dirty:        %8lu kB\n"
 		       "Node %d Writeback:    %8lu kB\n"
 		       "Node %d FilePages:    %8lu kB\n"
@@ -72,10 +74,12 @@ static ssize_t node_read_meminfo(struct sys_device * dev, char * buf)
 		       nid, K(i.totalram - i.freeram),
 		       nid, K(active),
 		       nid, K(inactive),
+#ifdef CONFIG_HIGHMEM
 		       nid, K(i.totalhigh),
 		       nid, K(i.freehigh),
 		       nid, K(i.totalram - i.totalhigh),
 		       nid, K(i.freeram - i.freehigh),
+#endif
 		       nid, K(node_page_state(nid, NR_FILE_DIRTY)),
 		       nid, K(node_page_state(nid, NR_WRITEBACK)),
 		       nid, K(node_page_state(nid, NR_FILE_PAGES)),
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 94215622544..caa0a51560a 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -157,10 +157,12 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 		"SwapCached:   %8lu kB\n"
 		"Active:       %8lu kB\n"
 		"Inactive:     %8lu kB\n"
+#ifdef CONFIG_HIGHMEM
 		"HighTotal:    %8lu kB\n"
 		"HighFree:     %8lu kB\n"
 		"LowTotal:     %8lu kB\n"
 		"LowFree:      %8lu kB\n"
+#endif
 		"SwapTotal:    %8lu kB\n"
 		"SwapFree:     %8lu kB\n"
 		"Dirty:        %8lu kB\n"
@@ -183,10 +185,12 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 		K(total_swapcache_pages),
 		K(active),
 		K(inactive),
+#ifdef CONFIG_HIGHMEM
 		K(i.totalhigh),
 		K(i.freehigh),
 		K(i.totalram-i.totalhigh),
 		K(i.freeram-i.freehigh),
+#endif
 		K(i.totalswap),
 		K(i.freeswap),
 		K(global_page_state(NR_FILE_DIRTY)),
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4b33878e948..e26491cb5a2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1281,10 +1281,6 @@ void show_free_areas(void)
 
 	get_zone_counts(&active, &inactive, &free);
 
-	printk("Free pages: %11ukB (%ukB HighMem)\n",
-		K(nr_free_pages()),
-		K(nr_free_highpages()));
-
 	printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
 		"unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n",
 		active,
-- 
cgit v1.2.3


From c1f60a5a419cc60aff27daffb150f5a3a3a79ef4 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:11 -0700
Subject: [PATCH] reduce MAX_NR_ZONES: move HIGHMEM counters into highmem.c/.h

Move totalhigh_pages and nr_free_highpages() into highmem.c/.h

Move the totalhigh_pages definition into highmem.c/.h.  Move the
nr_free_highpages function into highmem.c

[yoichi_yuasa@tripeaks.co.jp: build fix]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Yoichi Yuasa <yoichi_yuasa@tripeaks.co.jp>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/mips/sgi-ip27/ip27-memory.c |  1 +
 arch/um/kernel/mem.c             |  2 ++
 include/linux/highmem.h          |  3 +++
 include/linux/swap.h             |  1 -
 mm/highmem.c                     | 13 +++++++++++++
 mm/page_alloc.c                  | 15 ---------------
 mm/shmem.c                       |  1 +
 7 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c
index 59bfc0fc3f4..16e5682b01f 100644
--- a/arch/mips/sgi-ip27/ip27-memory.c
+++ b/arch/mips/sgi-ip27/ip27-memory.c
@@ -19,6 +19,7 @@
 #include <linux/swap.h>
 #include <linux/bootmem.h>
 #include <linux/pfn.h>
+#include <linux/highmem.h>
 #include <asm/page.h>
 #include <asm/sections.h>
 
diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c
index 61280167c56..b39e624c329 100644
--- a/arch/um/kernel/mem.c
+++ b/arch/um/kernel/mem.c
@@ -79,8 +79,10 @@ void mem_init(void)
 
 	/* this will put all low memory onto the freelists */
 	totalram_pages = free_all_bootmem();
+#ifdef CONFIG_HIGHMEM
 	totalhigh_pages = highmem >> PAGE_SHIFT;
 	totalram_pages += totalhigh_pages;
+#endif
 	num_physpages = totalram_pages;
 	max_pfn = totalram_pages;
 	printk(KERN_INFO "Memory: %luk available\n", 
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 42620e723ab..fd7d12daa94 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -24,11 +24,14 @@ static inline void flush_kernel_dcache_page(struct page *page)
 
 /* declarations for linux/mm/highmem.c */
 unsigned int nr_free_highpages(void);
+extern unsigned long totalhigh_pages;
 
 #else /* CONFIG_HIGHMEM */
 
 static inline unsigned int nr_free_highpages(void) { return 0; }
 
+#define totalhigh_pages 0
+
 #ifndef ARCH_HAS_KMAP
 static inline void *kmap(struct page *page)
 {
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 5e59184c909..34a6bc3e6cf 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -162,7 +162,6 @@ extern void swapin_readahead(swp_entry_t, unsigned long, struct vm_area_struct *
 
 /* linux/mm/page_alloc.c */
 extern unsigned long totalram_pages;
-extern unsigned long totalhigh_pages;
 extern unsigned long totalreserve_pages;
 extern long nr_swap_pages;
 extern unsigned int nr_free_pages(void);
diff --git a/mm/highmem.c b/mm/highmem.c
index 9b2a5403c44..ee5519b176e 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -46,6 +46,19 @@ static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data)
  */
 #ifdef CONFIG_HIGHMEM
 
+unsigned long totalhigh_pages __read_mostly;
+
+unsigned int nr_free_highpages (void)
+{
+	pg_data_t *pgdat;
+	unsigned int pages = 0;
+
+	for_each_online_pgdat(pgdat)
+		pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
+
+	return pages;
+}
+
 static int pkmap_count[LAST_PKMAP];
 static unsigned int last_pkmap_nr;
 static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e26491cb5a2..5cde54695cf 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -51,7 +51,6 @@ EXPORT_SYMBOL(node_online_map);
 nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
 EXPORT_SYMBOL(node_possible_map);
 unsigned long totalram_pages __read_mostly;
-unsigned long totalhigh_pages __read_mostly;
 unsigned long totalreserve_pages __read_mostly;
 long nr_swap_pages;
 int percpu_pagelist_fraction;
@@ -1185,20 +1184,6 @@ unsigned int nr_free_pagecache_pages(void)
 {
 	return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER));
 }
-
-#ifdef CONFIG_HIGHMEM
-unsigned int nr_free_highpages (void)
-{
-	pg_data_t *pgdat;
-	unsigned int pages = 0;
-
-	for_each_online_pgdat(pgdat)
-		pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
-
-	return pages;
-}
-#endif
-
 #ifdef CONFIG_NUMA
 static void show_node(struct zone *zone)
 {
diff --git a/mm/shmem.c b/mm/shmem.c
index db21c51531c..8631be45b40 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -45,6 +45,7 @@
 #include <linux/namei.h>
 #include <linux/ctype.h>
 #include <linux/migrate.h>
+#include <linux/highmem.h>
 
 #include <asm/uaccess.h>
 #include <asm/div64.h>
-- 
cgit v1.2.3


From 98d2b0ebda72fc39cdefd3720d50b9b3ce409085 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:12 -0700
Subject: [PATCH] reduce MAX_NR_ZONES: page allocator ZONE_HIGHMEM cleanup

page allocator ZONE_HIGHMEM fixups

1. We do not need to do an #ifdef in si_meminfo since both counters
   in use are zero if !CONFIG_HIGHMEM.

2. Add #ifdef in si_meminfo_node instead to avoid referencing zone
   information for ZONE_HIGHMEM if we do not have HIGHMEM
   (may not be there after the following patches).

3. Replace the use of ZONE_HIGHMEM with MAX_NR_ZONES in build_zonelists_node

4. build_zonelists_node: Remove BUG_ON for ZONE_HIGHMEM. Zone will
   be optional soon and thus BUG_ON cannot be triggered anymore.

5. init_free_area_core: Replace a use of ZONE_HIGHMEM with NR_MAX_ZONES.

[akpm@osdl.org: cleanups]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/page_alloc.c | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5cde54695cf..13c102b95c5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1199,13 +1199,8 @@ void si_meminfo(struct sysinfo *val)
 	val->sharedram = 0;
 	val->freeram = nr_free_pages();
 	val->bufferram = nr_blockdev_pages();
-#ifdef CONFIG_HIGHMEM
 	val->totalhigh = totalhigh_pages;
 	val->freehigh = nr_free_highpages();
-#else
-	val->totalhigh = 0;
-	val->freehigh = 0;
-#endif
 	val->mem_unit = PAGE_SIZE;
 }
 
@@ -1218,8 +1213,13 @@ void si_meminfo_node(struct sysinfo *val, int nid)
 
 	val->totalram = pgdat->node_present_pages;
 	val->freeram = nr_free_pages_pgdat(pgdat);
+#ifdef CONFIG_HIGHMEM
 	val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
 	val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages;
+#else
+	val->totalhigh = 0;
+	val->freehigh = 0;
+#endif
 	val->mem_unit = PAGE_SIZE;
 }
 #endif
@@ -1344,14 +1344,11 @@ static int __meminit build_zonelists_node(pg_data_t *pgdat,
 {
 	struct zone *zone;
 
-	BUG_ON(zone_type > ZONE_HIGHMEM);
+	BUG_ON(zone_type >= MAX_NR_ZONES);
 
 	do {
 		zone = pgdat->node_zones + zone_type;
 		if (populated_zone(zone)) {
-#ifndef CONFIG_HIGHMEM
-			BUG_ON(zone_type > ZONE_NORMAL);
-#endif
 			zonelist->zones[nr_zones++] = zone;
 			check_highest_zone(zone_type);
 		}
@@ -1981,7 +1978,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
 		if (zholes_size)
 			realsize -= zholes_size[j];
 
-		if (j < ZONE_HIGHMEM)
+		if (!is_highmem_idx(j))
 			nr_kernel_pages += realsize;
 		nr_all_pages += realsize;
 
-- 
cgit v1.2.3


From 2f1b6248682f8b39ca3c7e549dfc216d26c4109b Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:13 -0700
Subject: [PATCH] reduce MAX_NR_ZONES: use enum to define zones, reformat and
 comment

Use enum for zones and reformat zones dependent information

Add comments explaning the use of zones and add a zones_t type for zone
numbers.

Line up information that will be #ifdefd by the following patches.

[akpm@osdl.org: comment cleanups]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h     |  7 +++---
 include/linux/mmzone.h | 66 +++++++++++++++++++++++++++++++++++---------------
 mm/page_alloc.c        | 24 +++++++++++++-----
 3 files changed, 69 insertions(+), 28 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 45678b03695..2db4229a006 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -470,7 +470,7 @@ void split_page(struct page *page, unsigned int order);
 #define SECTIONS_MASK		((1UL << SECTIONS_WIDTH) - 1)
 #define ZONETABLE_MASK		((1UL << ZONETABLE_SHIFT) - 1)
 
-static inline unsigned long page_zonenum(struct page *page)
+static inline enum zone_type page_zonenum(struct page *page)
 {
 	return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
 }
@@ -499,11 +499,12 @@ static inline unsigned long page_to_section(struct page *page)
 	return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK;
 }
 
-static inline void set_page_zone(struct page *page, unsigned long zone)
+static inline void set_page_zone(struct page *page, enum zone_type zone)
 {
 	page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
 	page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT;
 }
+
 static inline void set_page_node(struct page *page, unsigned long node)
 {
 	page->flags &= ~(NODES_MASK << NODES_PGSHIFT);
@@ -515,7 +516,7 @@ static inline void set_page_section(struct page *page, unsigned long section)
 	page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT;
 }
 
-static inline void set_page_links(struct page *page, unsigned long zone,
+static inline void set_page_links(struct page *page, enum zone_type zone,
 	unsigned long node, unsigned long pfn)
 {
 	set_page_zone(page, zone);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index f45163c528e..03a5a6eb0ff 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -88,14 +88,53 @@ struct per_cpu_pageset {
 #define zone_pcp(__z, __cpu) (&(__z)->pageset[(__cpu)])
 #endif
 
-#define ZONE_DMA		0
-#define ZONE_DMA32		1
-#define ZONE_NORMAL		2
-#define ZONE_HIGHMEM		3
+enum zone_type {
+	/*
+	 * ZONE_DMA is used when there are devices that are not able
+	 * to do DMA to all of addressable memory (ZONE_NORMAL). Then we
+	 * carve out the portion of memory that is needed for these devices.
+	 * The range is arch specific.
+	 *
+	 * Some examples
+	 *
+	 * Architecture		Limit
+	 * ---------------------------
+	 * parisc, ia64, sparc	<4G
+	 * s390			<2G
+	 * arm26		<48M
+	 * arm			Various
+	 * alpha		Unlimited or 0-16MB.
+	 *
+	 * i386, x86_64 and multiple other arches
+	 * 			<16M.
+	 */
+	ZONE_DMA,
+	/*
+	 * x86_64 needs two ZONE_DMAs because it supports devices that are
+	 * only able to do DMA to the lower 16M but also 32 bit devices that
+	 * can only do DMA areas below 4G.
+	 */
+	ZONE_DMA32,
+	/*
+	 * Normal addressable memory is in ZONE_NORMAL. DMA operations can be
+	 * performed on pages in ZONE_NORMAL if the DMA devices support
+	 * transfers to all addressable memory.
+	 */
+	ZONE_NORMAL,
+	/*
+	 * A memory area that is only addressable by the kernel through
+	 * mapping portions into its own address space. This is for example
+	 * used by i386 to allow the kernel to address the memory beyond
+	 * 900MB. The kernel will set up special mappings (page
+	 * table entries on i386) for each page that the kernel needs to
+	 * access.
+	 */
+	ZONE_HIGHMEM,
 
-#define MAX_NR_ZONES		4	/* Sync this with ZONES_SHIFT */
-#define ZONES_SHIFT		2	/* ceil(log2(MAX_NR_ZONES)) */
+	MAX_NR_ZONES
+};
 
+#define	ZONES_SHIFT		2	/* ceil(log2(MAX_NR_ZONES)) */
 
 /*
  * When a memory allocation must conform to specific limitations (such
@@ -126,16 +165,6 @@ struct per_cpu_pageset {
 /* #define GFP_ZONETYPES       (GFP_ZONEMASK + 1) */           /* Non-loner */
 #define GFP_ZONETYPES  ((GFP_ZONEMASK + 1) / 2 + 1)            /* Loner */
 
-/*
- * On machines where it is needed (eg PCs) we divide physical memory
- * into multiple physical zones. On a 32bit PC we have 4 zones:
- *
- * ZONE_DMA	  < 16 MB	ISA DMA capable memory
- * ZONE_DMA32	     0 MB 	Empty
- * ZONE_NORMAL	16-896 MB	direct mapped by the kernel
- * ZONE_HIGHMEM	 > 896 MB	only page cache and user processes
- */
-
 struct zone {
 	/* Fields commonly accessed by the page allocator */
 	unsigned long		free_pages;
@@ -266,7 +295,6 @@ struct zone {
 	char			*name;
 } ____cacheline_internodealigned_in_smp;
 
-
 /*
  * The "priority" of VM scanning is how much of the queues we will scan in one
  * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
@@ -373,12 +401,12 @@ static inline int populated_zone(struct zone *zone)
 	return (!!zone->present_pages);
 }
 
-static inline int is_highmem_idx(int idx)
+static inline int is_highmem_idx(enum zone_type idx)
 {
 	return (idx == ZONE_HIGHMEM);
 }
 
-static inline int is_normal_idx(int idx)
+static inline int is_normal_idx(enum zone_type idx)
 {
 	return (idx == ZONE_NORMAL);
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 13c102b95c5..2410a3cb1c5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -68,7 +68,11 @@ static void __free_pages_ok(struct page *page, unsigned int order);
  * TBD: should special case ZONE_DMA32 machines here - in those we normally
  * don't need any ZONE_NORMAL reservation
  */
-int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 };
+int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
+	 256,
+	 256,
+	 32
+};
 
 EXPORT_SYMBOL(totalram_pages);
 
@@ -79,7 +83,13 @@ EXPORT_SYMBOL(totalram_pages);
 struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
 EXPORT_SYMBOL(zone_table);
 
-static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
+static char *zone_names[MAX_NR_ZONES] = {
+	 "DMA",
+	 "DMA32",
+	 "Normal",
+	 "HighMem"
+};
+
 int min_free_kbytes = 1024;
 
 unsigned long __meminitdata nr_kernel_pages;
@@ -1487,7 +1497,9 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
 
 static void __meminit build_zonelists(pg_data_t *pgdat)
 {
-	int i, j, k, node, local_node;
+	int i, node, local_node;
+	enum zone_type k;
+	enum zone_type j;
 
 	local_node = pgdat->node_id;
 	for (i = 0; i < GFP_ZONETYPES; i++) {
@@ -1675,8 +1687,8 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
 }
 
 #define ZONETABLE_INDEX(x, zone_nr)	((x << ZONES_SHIFT) | zone_nr)
-void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
-		unsigned long size)
+void zonetable_add(struct zone *zone, int nid, enum zone_type zid,
+		unsigned long pfn, unsigned long size)
 {
 	unsigned long snum = pfn_to_section_nr(pfn);
 	unsigned long end = pfn_to_section_nr(pfn + size);
@@ -1960,7 +1972,7 @@ __meminit int init_currently_empty_zone(struct zone *zone,
 static void __meminit free_area_init_core(struct pglist_data *pgdat,
 		unsigned long *zones_size, unsigned long *zholes_size)
 {
-	unsigned long j;
+	enum zone_type j;
 	int nid = pgdat->node_id;
 	unsigned long zone_start_pfn = pgdat->node_start_pfn;
 	int ret;
-- 
cgit v1.2.3


From fb0e7942bdcbbd2f90e61cb4cfa4fa892a873f8a Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:13 -0700
Subject: [PATCH] reduce MAX_NR_ZONES: make ZONE_DMA32 optional

Make ZONE_DMA32 optional

- Add #ifdefs around ZONE_DMA32 specific code and definitions.

- Add CONFIG_ZONE_DMA32 config option and use that for x86_64
  that alone needs this zone.

- Remove the use of CONFIG_DMA_IS_DMA32 and CONFIG_DMA_IS_NORMAL
  for ia64 and fix up the way per node ZVCs are calculated.

- Fall back to prior GFP_ZONEMASK of 0x03 if there is no
  DMA32 zone.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ia64/Kconfig      |  9 ---------
 arch/x86_64/Kconfig    |  4 ++++
 include/linux/gfp.h    |  2 +-
 include/linux/mmzone.h | 16 +++++++++++++---
 include/linux/vmstat.h |  4 +---
 mm/page_alloc.c        |  6 ++++++
 6 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index db274da7dba..f521f2f60a7 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -66,15 +66,6 @@ config IA64_UNCACHED_ALLOCATOR
 	bool
 	select GENERIC_ALLOCATOR
 
-config DMA_IS_DMA32
-	bool
-	default y
-
-config DMA_IS_NORMAL
-	bool
-	depends on IA64_SGI_SN2
-	default y
-
 config AUDIT_ARCH
 	bool
 	default y
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 6cd4878625f..581ce9af0ec 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -24,6 +24,10 @@ config X86
 	bool
 	default y
 
+config ZONE_DMA32
+	bool
+	default y
+
 config LOCKDEP_SUPPORT
 	bool
 	default y
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index cc9e6084448..14610b56c13 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -13,7 +13,7 @@ struct vm_area_struct;
 /* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low three bits) */
 #define __GFP_DMA	((__force gfp_t)0x01u)
 #define __GFP_HIGHMEM	((__force gfp_t)0x02u)
-#ifdef CONFIG_DMA_IS_DMA32
+#ifndef CONFIG_ZONE_DMA32
 #define __GFP_DMA32	((__force gfp_t)0x01)	/* ZONE_DMA is ZONE_DMA32 */
 #elif BITS_PER_LONG < 64
 #define __GFP_DMA32	((__force gfp_t)0x00)	/* ZONE_NORMAL is ZONE_DMA32 */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 03a5a6eb0ff..adae3c91593 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -109,12 +109,14 @@ enum zone_type {
 	 * 			<16M.
 	 */
 	ZONE_DMA,
+#ifdef CONFIG_ZONE_DMA32
 	/*
 	 * x86_64 needs two ZONE_DMAs because it supports devices that are
 	 * only able to do DMA to the lower 16M but also 32 bit devices that
 	 * can only do DMA areas below 4G.
 	 */
 	ZONE_DMA32,
+#endif
 	/*
 	 * Normal addressable memory is in ZONE_NORMAL. DMA operations can be
 	 * performed on pages in ZONE_NORMAL if the DMA devices support
@@ -161,9 +163,13 @@ enum zone_type {
  *
  * NOTE! Make sure this matches the zones in <linux/gfp.h>
  */
-#define GFP_ZONEMASK	0x07
-/* #define GFP_ZONETYPES       (GFP_ZONEMASK + 1) */           /* Non-loner */
-#define GFP_ZONETYPES  ((GFP_ZONEMASK + 1) / 2 + 1)            /* Loner */
+#define GFP_ZONETYPES		((GFP_ZONEMASK + 1) / 2 + 1)    /* Loner */
+
+#ifdef CONFIG_ZONE_DMA32
+#define GFP_ZONEMASK		0x07
+#else
+#define GFP_ZONEMASK		0x03
+#endif
 
 struct zone {
 	/* Fields commonly accessed by the page allocator */
@@ -429,7 +435,11 @@ static inline int is_normal(struct zone *zone)
 
 static inline int is_dma32(struct zone *zone)
 {
+#ifdef CONFIG_ZONE_DMA32
 	return zone == zone->zone_pgdat->node_zones + ZONE_DMA32;
+#else
+	return 0;
+#endif
 }
 
 static inline int is_dma(struct zone *zone)
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 2d9b1b60798..9c6e62c56ec 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -124,12 +124,10 @@ static inline unsigned long node_page_state(int node,
 	struct zone *zones = NODE_DATA(node)->node_zones;
 
 	return
-#ifndef CONFIG_DMA_IS_NORMAL
-#if !defined(CONFIG_DMA_IS_DMA32) && BITS_PER_LONG >= 64
+#ifdef CONFIG_ZONE_DMA32
 		zone_page_state(&zones[ZONE_DMA32], item) +
 #endif
 		zone_page_state(&zones[ZONE_NORMAL], item) +
-#endif
 #ifdef CONFIG_HIGHMEM
 		zone_page_state(&zones[ZONE_HIGHMEM], item) +
 #endif
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2410a3cb1c5..5b5cbb5e181 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -70,7 +70,9 @@ static void __free_pages_ok(struct page *page, unsigned int order);
  */
 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
 	 256,
+#ifdef CONFIG_ZONE_DMA32
 	 256,
+#endif
 	 32
 };
 
@@ -85,7 +87,9 @@ EXPORT_SYMBOL(zone_table);
 
 static char *zone_names[MAX_NR_ZONES] = {
 	 "DMA",
+#ifdef CONFIG_ZONE_DMA32
 	 "DMA32",
+#endif
 	 "Normal",
 	 "HighMem"
 };
@@ -1373,8 +1377,10 @@ static inline int highest_zone(int zone_bits)
 	int res = ZONE_NORMAL;
 	if (zone_bits & (__force int)__GFP_HIGHMEM)
 		res = ZONE_HIGHMEM;
+#ifdef CONFIG_ZONE_DMA32
 	if (zone_bits & (__force int)__GFP_DMA32)
 		res = ZONE_DMA32;
+#endif
 	if (zone_bits & (__force int)__GFP_DMA)
 		res = ZONE_DMA;
 	return res;
-- 
cgit v1.2.3


From e53ef38d05dd59ed281a35590e4a5b64d8ff4c52 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:14 -0700
Subject: [PATCH] reduce MAX_NR_ZONES: make ZONE_HIGHMEM optional

Make ZONE_HIGHMEM optional

- ifdef out code and definitions related to CONFIG_HIGHMEM

- __GFP_HIGHMEM falls back to normal allocations if there is no
  ZONE_HIGHMEM

- GFP_ZONEMASK becomes 0x01 if there is no DMA32 and no HIGHMEM
  zone.

[jdike@addtoit.com: build fix]
Signed-off-by: Jeff Dike <jdike@addtoit.com>
Signed-off-by: Christoph Lameter <clameter@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/kernel/mem.c   |  2 ++
 include/linux/gfp.h    | 18 ++++++++++--------
 include/linux/mmzone.h | 36 +++++++++++++++++++++++++++++++++---
 mm/page_alloc.c        |  6 ++++++
 4 files changed, 51 insertions(+), 11 deletions(-)

diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c
index b39e624c329..b1cd5c6e468 100644
--- a/arch/um/kernel/mem.c
+++ b/arch/um/kernel/mem.c
@@ -226,7 +226,9 @@ void paging_init(void)
 	for(i=0;i<sizeof(zones_size)/sizeof(zones_size[0]);i++) 
 		zones_size[i] = 0;
 	zones_size[ZONE_DMA] = (end_iomem >> PAGE_SHIFT) - (uml_physmem >> PAGE_SHIFT);
+#ifdef CONFIG_HIGHMEM
 	zones_size[ZONE_HIGHMEM] = highmem >> PAGE_SHIFT;
+#endif
 	free_area_init(zones_size);
 
 	/*
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 14610b56c13..2a2153ebfe0 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -9,17 +9,19 @@ struct vm_area_struct;
 
 /*
  * GFP bitmasks..
+ *
+ * Zone modifiers (see linux/mmzone.h - low three bits)
+ *
+ * These may be masked by GFP_ZONEMASK to make allocations with this bit
+ * set fall back to ZONE_NORMAL.
+ *
+ * Do not put any conditional on these. If necessary modify the definitions
+ * without the underscores and use the consistently. The definitions here may
+ * be used in bit comparisons.
  */
-/* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low three bits) */
 #define __GFP_DMA	((__force gfp_t)0x01u)
 #define __GFP_HIGHMEM	((__force gfp_t)0x02u)
-#ifndef CONFIG_ZONE_DMA32
-#define __GFP_DMA32	((__force gfp_t)0x01)	/* ZONE_DMA is ZONE_DMA32 */
-#elif BITS_PER_LONG < 64
-#define __GFP_DMA32	((__force gfp_t)0x00)	/* ZONE_NORMAL is ZONE_DMA32 */
-#else
-#define __GFP_DMA32	((__force gfp_t)0x04)	/* Has own ZONE_DMA32 */
-#endif
+#define __GFP_DMA32	((__force gfp_t)0x04u)
 
 /*
  * Action modifiers - doesn't change the zoning
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index adae3c91593..76d33e68859 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -123,6 +123,7 @@ enum zone_type {
 	 * transfers to all addressable memory.
 	 */
 	ZONE_NORMAL,
+#ifdef CONFIG_HIGHMEM
 	/*
 	 * A memory area that is only addressable by the kernel through
 	 * mapping portions into its own address space. This is for example
@@ -132,11 +133,10 @@ enum zone_type {
 	 * access.
 	 */
 	ZONE_HIGHMEM,
-
+#endif
 	MAX_NR_ZONES
 };
 
-#define	ZONES_SHIFT		2	/* ceil(log2(MAX_NR_ZONES)) */
 
 /*
  * When a memory allocation must conform to specific limitations (such
@@ -163,12 +163,34 @@ enum zone_type {
  *
  * NOTE! Make sure this matches the zones in <linux/gfp.h>
  */
-#define GFP_ZONETYPES		((GFP_ZONEMASK + 1) / 2 + 1)    /* Loner */
 
 #ifdef CONFIG_ZONE_DMA32
+
+#ifdef CONFIG_HIGHMEM
+#define GFP_ZONETYPES		((GFP_ZONEMASK + 1) / 2 + 1)    /* Loner */
 #define GFP_ZONEMASK		0x07
+#define ZONES_SHIFT		2	/* ceil(log2(MAX_NR_ZONES)) */
 #else
+#define GFP_ZONETYPES		((0x07 + 1) / 2 + 1)    /* Loner */
+/* Mask __GFP_HIGHMEM */
+#define GFP_ZONEMASK		0x05
+#define ZONES_SHIFT		2
+#endif
+
+#else
+#ifdef CONFIG_HIGHMEM
+
 #define GFP_ZONEMASK		0x03
+#define ZONES_SHIFT		2
+#define GFP_ZONETYPES		3
+
+#else
+
+#define GFP_ZONEMASK		0x01
+#define ZONES_SHIFT		1
+#define GFP_ZONETYPES		2
+
+#endif
 #endif
 
 struct zone {
@@ -409,7 +431,11 @@ static inline int populated_zone(struct zone *zone)
 
 static inline int is_highmem_idx(enum zone_type idx)
 {
+#ifdef CONFIG_HIGHMEM
 	return (idx == ZONE_HIGHMEM);
+#else
+	return 0;
+#endif
 }
 
 static inline int is_normal_idx(enum zone_type idx)
@@ -425,7 +451,11 @@ static inline int is_normal_idx(enum zone_type idx)
  */
 static inline int is_highmem(struct zone *zone)
 {
+#ifdef CONFIG_HIGHMEM
 	return zone == zone->zone_pgdat->node_zones + ZONE_HIGHMEM;
+#else
+	return 0;
+#endif
 }
 
 static inline int is_normal(struct zone *zone)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5b5cbb5e181..6c7c2dd1b3e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -73,7 +73,9 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
 #ifdef CONFIG_ZONE_DMA32
 	 256,
 #endif
+#ifdef CONFIG_HIGHMEM
 	 32
+#endif
 };
 
 EXPORT_SYMBOL(totalram_pages);
@@ -91,7 +93,9 @@ static char *zone_names[MAX_NR_ZONES] = {
 	 "DMA32",
 #endif
 	 "Normal",
+#ifdef CONFIG_HIGHMEM
 	 "HighMem"
+#endif
 };
 
 int min_free_kbytes = 1024;
@@ -1375,8 +1379,10 @@ static int __meminit build_zonelists_node(pg_data_t *pgdat,
 static inline int highest_zone(int zone_bits)
 {
 	int res = ZONE_NORMAL;
+#ifdef CONFIG_HIGHMEM
 	if (zone_bits & (__force int)__GFP_HIGHMEM)
 		res = ZONE_HIGHMEM;
+#endif
 #ifdef CONFIG_ZONE_DMA32
 	if (zone_bits & (__force int)__GFP_DMA32)
 		res = ZONE_DMA32;
-- 
cgit v1.2.3


From 27bf71c2a7e596ed34e9bf2d4a5030321a09a1ad Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:15 -0700
Subject: [PATCH] reduce MAX_NR_ZONES: remove display of counters for
 unconfigured zones

eventcounters: Do not display counters for zones that are not available on an
arch

Do not define or display counters for the DMA32 and the HIGHMEM zone if such
zones were not configured.

[akpm@osdl.org: s390 fix]
[heiko.carstens@de.ibm.com: s390 fix]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/s390/appldata/appldata_mem.c |  3 +--
 include/linux/vmstat.h            | 14 ++++++++++++-
 mm/vmstat.c                       | 43 ++++++++++++++++++---------------------
 3 files changed, 34 insertions(+), 26 deletions(-)

diff --git a/arch/s390/appldata/appldata_mem.c b/arch/s390/appldata/appldata_mem.c
index ab3b0765a64..8aea3698a77 100644
--- a/arch/s390/appldata/appldata_mem.c
+++ b/arch/s390/appldata/appldata_mem.c
@@ -117,8 +117,7 @@ static void appldata_get_mem_data(void *data)
 	mem_data->pgpgout    = ev[PGPGOUT] >> 1;
 	mem_data->pswpin     = ev[PSWPIN];
 	mem_data->pswpout    = ev[PSWPOUT];
-	mem_data->pgalloc    = ev[PGALLOC_HIGH] + ev[PGALLOC_NORMAL] +
-			       ev[PGALLOC_DMA];
+	mem_data->pgalloc    = ev[PGALLOC_NORMAL] + ev[PGALLOC_DMA];
 	mem_data->pgfault    = ev[PGFAULT];
 	mem_data->pgmajfault = ev[PGMAJFAULT];
 
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 9c6e62c56ec..176c7f79733 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -18,7 +18,19 @@
  * generated will simply be the increment of a global address.
  */
 
-#define FOR_ALL_ZONES(x) x##_DMA, x##_DMA32, x##_NORMAL, x##_HIGH
+#ifdef CONFIG_ZONE_DMA32
+#define DMA32_ZONE(xx) xx##_DMA32,
+#else
+#define DMA32_ZONE(xx)
+#endif
+
+#ifdef CONFIG_HIGHMEM
+#define HIGHMEM_ZONE(xx) , xx##_HIGH
+#else
+#define HIGHMEM_ZONE(xx)
+#endif
+
+#define FOR_ALL_ZONES(xx) xx##_DMA, DMA32_ZONE(xx) xx##_NORMAL HIGHMEM_ZONE(xx)
 
 enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		FOR_ALL_ZONES(PGALLOC),
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c1b5f4106b3..04a9093f649 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -435,6 +435,21 @@ struct seq_operations fragmentation_op = {
 	.show	= frag_show,
 };
 
+#ifdef CONFIG_ZONE_DMA32
+#define TEXT_FOR_DMA32(xx) xx "_dma32",
+#else
+#define TEXT_FOR_DMA32(xx)
+#endif
+
+#ifdef CONFIG_HIGHMEM
+#define TEXT_FOR_HIGHMEM(xx) xx "_high",
+#else
+#define TEXT_FOR_HIGHMEM(xx)
+#endif
+
+#define TEXTS_FOR_ZONES(xx) xx "_dma", TEXT_FOR_DMA32(xx) xx "_normal", \
+					TEXT_FOR_HIGHMEM(xx)
+
 static char *vmstat_text[] = {
 	/* Zoned VM counters */
 	"nr_anon_pages",
@@ -462,10 +477,7 @@ static char *vmstat_text[] = {
 	"pswpin",
 	"pswpout",
 
-	"pgalloc_dma",
-	"pgalloc_dma32",
-	"pgalloc_normal",
-	"pgalloc_high",
+	TEXTS_FOR_ZONES("pgalloc")
 
 	"pgfree",
 	"pgactivate",
@@ -474,25 +486,10 @@ static char *vmstat_text[] = {
 	"pgfault",
 	"pgmajfault",
 
-	"pgrefill_dma",
-	"pgrefill_dma32",
-	"pgrefill_normal",
-	"pgrefill_high",
-
-	"pgsteal_dma",
-	"pgsteal_dma32",
-	"pgsteal_normal",
-	"pgsteal_high",
-
-	"pgscan_kswapd_dma",
-	"pgscan_kswapd_dma32",
-	"pgscan_kswapd_normal",
-	"pgscan_kswapd_high",
-
-	"pgscan_direct_dma",
-	"pgscan_direct_dma32",
-	"pgscan_direct_normal",
-	"pgscan_direct_high",
+	TEXTS_FOR_ZONES("pgrefill")
+	TEXTS_FOR_ZONES("pgsteal")
+	TEXTS_FOR_ZONES("pgscan_kswapd")
+	TEXTS_FOR_ZONES("pgscan_direct")
 
 	"pginodesteal",
 	"slabs_scanned",
-- 
cgit v1.2.3


From b9b15780f808efa2c897f337644ba7a2bec03ecc Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:16 -0700
Subject: [PATCH] reduce MAX_NR_ZONES: fix i386 SRAT check for MAX_NR_ZONES

We cannot check MAX_NR_ZONES since it not defined in the preprocessor
anymore.

So remove the check.

The maximum number of zones per node for i386 is 3 since i386 does not
support ZONE_DMA32.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/srat.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/arch/i386/kernel/srat.c b/arch/i386/kernel/srat.c
index b1809c9a089..83db411b3aa 100644
--- a/arch/i386/kernel/srat.c
+++ b/arch/i386/kernel/srat.c
@@ -42,7 +42,7 @@
 #define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8) 
 static u8 pxm_bitmap[PXM_BITMAP_LEN];	/* bitmap of proximity domains */
 
-#define MAX_CHUNKS_PER_NODE	4
+#define MAX_CHUNKS_PER_NODE	3
 #define MAXCHUNKS		(MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
 struct node_memory_chunk_s {
 	unsigned long	start_pfn;
@@ -135,9 +135,6 @@ static void __init parse_memory_affinity_structure (char *sratp)
 		 "enabled and removable" : "enabled" ) );
 }
 
-#if MAX_NR_ZONES != 4
-#error "MAX_NR_ZONES != 4, chunk_to_zone requires review"
-#endif
 /* Take a chunk of pages from page frame cstart to cend and count the number
  * of pages in each zone, returned via zones[].
  */
-- 
cgit v1.2.3


From 4e4785bcf0c8503224fa6c17d8e0228de781bff6 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:17 -0700
Subject: [PATCH] mempolicies: fix policy_zone check

There is a check in zonelist_policy that compares pieces of the bitmap
obtained from a gfp mask via GFP_ZONETYPES with a zone number in function
zonelist_policy().

The bitmap is an ORed mask of __GFP_DMA, __GFP_DMA32 and __GFP_HIGHMEM.
The policy_zone is a zone number with the possible values of ZONE_DMA,
ZONE_DMA32, ZONE_HIGHMEM and ZONE_NORMAL. These are two different domains
of values.

For some reason seemed to work before the zone reduction patchset (It
definitely works on SGI boxes since we just have one zone and the check
cannot fail).

With the zone reduction patchset this check definitely fails on systems
with two zones if the system actually has memory in both zones.

This is because ZONE_NORMAL is selected using no __GFP flag at
all and thus gfp_zone(gfpmask) == 0. ZONE_DMA is selected when __GFP_DMA
is set. __GFP_DMA is 0x01.  So gfp_zone(gfpmask) == 1.

policy_zone is set to ZONE_NORMAL (==1) if ZONE_NORMAL and ZONE_DMA are
populated.

For ZONE_NORMAL gfp_zone(<no _GFP_DMA>) yields 0 which is <
policy_zone(ZONE_NORMAL) and so policy is not applied to regular memory
allocations!

Instead gfp_zone(__GFP_DMA) == 1 which results in policy being applied
to DMA allocations!

What we realy want in that place is to establish the highest allowable
zone for a given gfp_mask. If the highest zone is higher or equal to the
policy_zone then memory policies need to be applied. We have such
a highest_zone() function in page_alloc.c.

So move the highest_zone() function from mm/page_alloc.c into
include/linux/gfp.h.  On the way we simplify the function and use the new
zone_type that was also introduced with the zone reduction patchset plus we
also specify the right type for the gfp flags parameter.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/gfp.h | 15 +++++++++++++++
 mm/mempolicy.c      |  2 +-
 mm/page_alloc.c     | 16 ----------------
 3 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 2a2153ebfe0..a0992d392d7 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -85,6 +85,21 @@ static inline int gfp_zone(gfp_t gfp)
 	return zone;
 }
 
+static inline enum zone_type highest_zone(gfp_t flags)
+{
+	if (flags & __GFP_DMA)
+		return ZONE_DMA;
+#ifdef CONFIG_ZONE_DMA32
+	if (flags & __GFP_DMA32)
+		return ZONE_DMA32;
+#endif
+#ifdef CONFIG_HIGHMEM
+	if (flags & __GFP_HIGHMEM)
+		return ZONE_HIGHMEM;
+#endif
+	return ZONE_NORMAL;
+}
+
 /*
  * There is only one page-allocator function, and two main namespaces to
  * it. The alloc_page*() variants return 'struct page *' and as such
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index a9963ceddd6..9870624d72a 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1096,7 +1096,7 @@ static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
 	case MPOL_BIND:
 		/* Lower zones don't get a policy applied */
 		/* Careful: current->mems_allowed might have moved */
-		if (gfp_zone(gfp) >= policy_zone)
+		if (highest_zone(gfp) >= policy_zone)
 			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
 				return policy->v.zonelist;
 		/*FALL THROUGH*/
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6c7c2dd1b3e..25f39865e32 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1376,22 +1376,6 @@ static int __meminit build_zonelists_node(pg_data_t *pgdat,
 	return nr_zones;
 }
 
-static inline int highest_zone(int zone_bits)
-{
-	int res = ZONE_NORMAL;
-#ifdef CONFIG_HIGHMEM
-	if (zone_bits & (__force int)__GFP_HIGHMEM)
-		res = ZONE_HIGHMEM;
-#endif
-#ifdef CONFIG_ZONE_DMA32
-	if (zone_bits & (__force int)__GFP_DMA32)
-		res = ZONE_DMA32;
-#endif
-	if (zone_bits & (__force int)__GFP_DMA)
-		res = ZONE_DMA;
-	return res;
-}
-
 #ifdef CONFIG_NUMA
 #define MAX_NODE_LOAD (num_online_nodes())
 static int __meminitdata node_load[MAX_NUMNODES];
-- 
cgit v1.2.3


From 2f6726e54a9410e2e4cee864947c05e954051916 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:18 -0700
Subject: [PATCH] Apply type enum zone_type

After we have done this we can now do some typing cleanup.

The memory policy layer keeps a policy_zone that specifies
the zone that gets memory policies applied. This variable
can now be of type enum zone_type.

The check_highest_zone function and the build_zonelists funnctionm must
then also take a enum zone_type parameter.

Plus there are a number of loops over zones that also should use
zone_type.

We run into some troubles at some points with functions that need a
zone_type variable to become -1. Fix that up.

[pj@sgi.com: fix set_mempolicy() crash]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mempolicy.h |  4 ++--
 mm/mempolicy.c            | 11 ++++++++---
 mm/page_alloc.c           | 27 +++++++++++++++++----------
 3 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 72440f0a443..09f0f575ddf 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -162,9 +162,9 @@ extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
 		unsigned long addr);
 extern unsigned slab_node(struct mempolicy *policy);
 
-extern int policy_zone;
+extern enum zone_type policy_zone;
 
-static inline void check_highest_zone(int k)
+static inline void check_highest_zone(enum zone_type k)
 {
 	if (k > policy_zone)
 		policy_zone = k;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 9870624d72a..7da4142ce96 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -105,7 +105,7 @@ static struct kmem_cache *sn_cache;
 
 /* Highest zone. An specific allocation for a zone below that is not
    policied. */
-int policy_zone = ZONE_DMA;
+enum zone_type policy_zone = ZONE_DMA;
 
 struct mempolicy default_policy = {
 	.refcnt = ATOMIC_INIT(1), /* never free it */
@@ -137,7 +137,8 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 {
 	struct zonelist *zl;
-	int num, max, nd, k;
+	int num, max, nd;
+	enum zone_type k;
 
 	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
 	zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
@@ -148,12 +149,16 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
 	   lower zones etc. Avoid empty zones because the memory allocator
 	   doesn't like them. If you implement node hot removal you
 	   have to fix that. */
-	for (k = policy_zone; k >= 0; k--) { 
+	k = policy_zone;
+	while (1) {
 		for_each_node_mask(nd, *nodes) { 
 			struct zone *z = &NODE_DATA(nd)->node_zones[k];
 			if (z->present_pages > 0) 
 				zl->zones[num++] = z;
 		}
+		if (k == 0)
+			break;
+		k--;
 	}
 	zl->zones[num] = NULL;
 	return zl;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 25f39865e32..9c44b9a39d3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -637,7 +637,8 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
  */
 void drain_node_pages(int nodeid)
 {
-	int i, z;
+	int i;
+	enum zone_type z;
 	unsigned long flags;
 
 	for (z = 0; z < MAX_NR_ZONES; z++) {
@@ -1158,7 +1159,8 @@ EXPORT_SYMBOL(nr_free_pages);
 #ifdef CONFIG_NUMA
 unsigned int nr_free_pages_pgdat(pg_data_t *pgdat)
 {
-	unsigned int i, sum = 0;
+	unsigned int sum = 0;
+	enum zone_type i;
 
 	for (i = 0; i < MAX_NR_ZONES; i++)
 		sum += pgdat->node_zones[i].free_pages;
@@ -1358,21 +1360,22 @@ void show_free_areas(void)
  * Add all populated zones of a node to the zonelist.
  */
 static int __meminit build_zonelists_node(pg_data_t *pgdat,
-			struct zonelist *zonelist, int nr_zones, int zone_type)
+			struct zonelist *zonelist, int nr_zones, enum zone_type zone_type)
 {
 	struct zone *zone;
 
 	BUG_ON(zone_type >= MAX_NR_ZONES);
+	zone_type++;
 
 	do {
+		zone_type--;
 		zone = pgdat->node_zones + zone_type;
 		if (populated_zone(zone)) {
 			zonelist->zones[nr_zones++] = zone;
 			check_highest_zone(zone_type);
 		}
-		zone_type--;
 
-	} while (zone_type >= 0);
+	} while (zone_type);
 	return nr_zones;
 }
 
@@ -1441,10 +1444,11 @@ static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
 
 static void __meminit build_zonelists(pg_data_t *pgdat)
 {
-	int i, j, k, node, local_node;
+	int i, j, node, local_node;
 	int prev_node, load;
 	struct zonelist *zonelist;
 	nodemask_t used_mask;
+	enum zone_type k;
 
 	/* initialize zonelists */
 	for (i = 0; i < GFP_ZONETYPES; i++) {
@@ -1628,7 +1632,7 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
 		unsigned long *zones_size, unsigned long *zholes_size)
 {
 	unsigned long realtotalpages, totalpages = 0;
-	int i;
+	enum zone_type i;
 
 	for (i = 0; i < MAX_NR_ZONES; i++)
 		totalpages += zones_size[i];
@@ -2116,7 +2120,7 @@ static void calculate_totalreserve_pages(void)
 {
 	struct pglist_data *pgdat;
 	unsigned long reserve_pages = 0;
-	int i, j;
+	enum zone_type i, j;
 
 	for_each_online_pgdat(pgdat) {
 		for (i = 0; i < MAX_NR_ZONES; i++) {
@@ -2149,7 +2153,7 @@ static void calculate_totalreserve_pages(void)
 static void setup_per_zone_lowmem_reserve(void)
 {
 	struct pglist_data *pgdat;
-	int j, idx;
+	enum zone_type j, idx;
 
 	for_each_online_pgdat(pgdat) {
 		for (j = 0; j < MAX_NR_ZONES; j++) {
@@ -2158,9 +2162,12 @@ static void setup_per_zone_lowmem_reserve(void)
 
 			zone->lowmem_reserve[j] = 0;
 
-			for (idx = j-1; idx >= 0; idx--) {
+			idx = j;
+			while (idx) {
 				struct zone *lower_zone;
 
+				idx--;
+
 				if (sysctl_lowmem_reserve_ratio[idx] < 1)
 					sysctl_lowmem_reserve_ratio[idx] = 1;
 
-- 
cgit v1.2.3


From 19655d3487001d7df0e10e9cbfc27c758b77c2b5 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:19 -0700
Subject: [PATCH] linearly index zone->node_zonelists[]

I wonder why we need this bitmask indexing into zone->node_zonelists[]?

We always start with the highest zone and then include all lower zones
if we build zonelists.

Are there really cases where we need allocation from ZONE_DMA or
ZONE_HIGHMEM but not ZONE_NORMAL? It seems that the current implementation
of highest_zone() makes that already impossible.

If we go linear on the index then gfp_zone() == highest_zone() and a lot
of definitions fall by the wayside.

We can now revert back to the use of gfp_zone() in mempolicy.c ;-)

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/gfp.h    | 12 +-----------
 include/linux/mmzone.h | 52 +++++---------------------------------------------
 mm/mempolicy.c         |  2 +-
 mm/page_alloc.c        | 27 +++++++++++---------------
 4 files changed, 18 insertions(+), 75 deletions(-)

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index a0992d392d7..63ab88a36f3 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -12,9 +12,6 @@ struct vm_area_struct;
  *
  * Zone modifiers (see linux/mmzone.h - low three bits)
  *
- * These may be masked by GFP_ZONEMASK to make allocations with this bit
- * set fall back to ZONE_NORMAL.
- *
  * Do not put any conditional on these. If necessary modify the definitions
  * without the underscores and use the consistently. The definitions here may
  * be used in bit comparisons.
@@ -78,14 +75,7 @@ struct vm_area_struct;
 #define GFP_DMA32	__GFP_DMA32
 
 
-static inline int gfp_zone(gfp_t gfp)
-{
-	int zone = GFP_ZONEMASK & (__force int) gfp;
-	BUG_ON(zone >= GFP_ZONETYPES);
-	return zone;
-}
-
-static inline enum zone_type highest_zone(gfp_t flags)
+static inline enum zone_type gfp_zone(gfp_t flags)
 {
 	if (flags & __GFP_DMA)
 		return ZONE_DMA;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 76d33e68859..7fe317164b7 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -137,60 +137,18 @@ enum zone_type {
 	MAX_NR_ZONES
 };
 
-
 /*
  * When a memory allocation must conform to specific limitations (such
  * as being suitable for DMA) the caller will pass in hints to the
  * allocator in the gfp_mask, in the zone modifier bits.  These bits
  * are used to select a priority ordered list of memory zones which
- * match the requested limits.  GFP_ZONEMASK defines which bits within
- * the gfp_mask should be considered as zone modifiers.  Each valid
- * combination of the zone modifier bits has a corresponding list
- * of zones (in node_zonelists).  Thus for two zone modifiers there
- * will be a maximum of 4 (2 ** 2) zonelists, for 3 modifiers there will
- * be 8 (2 ** 3) zonelists.  GFP_ZONETYPES defines the number of possible
- * combinations of zone modifiers in "zone modifier space".
- *
- * As an optimisation any zone modifier bits which are only valid when
- * no other zone modifier bits are set (loners) should be placed in
- * the highest order bits of this field.  This allows us to reduce the
- * extent of the zonelists thus saving space.  For example in the case
- * of three zone modifier bits, we could require up to eight zonelists.
- * If the left most zone modifier is a "loner" then the highest valid
- * zonelist would be four allowing us to allocate only five zonelists.
- * Use the first form for GFP_ZONETYPES when the left most bit is not
- * a "loner", otherwise use the second.
- *
- * NOTE! Make sure this matches the zones in <linux/gfp.h>
+ * match the requested limits. See gfp_zone() in include/linux/gfp.h
  */
 
-#ifdef CONFIG_ZONE_DMA32
-
-#ifdef CONFIG_HIGHMEM
-#define GFP_ZONETYPES		((GFP_ZONEMASK + 1) / 2 + 1)    /* Loner */
-#define GFP_ZONEMASK		0x07
-#define ZONES_SHIFT		2	/* ceil(log2(MAX_NR_ZONES)) */
-#else
-#define GFP_ZONETYPES		((0x07 + 1) / 2 + 1)    /* Loner */
-/* Mask __GFP_HIGHMEM */
-#define GFP_ZONEMASK		0x05
-#define ZONES_SHIFT		2
-#endif
-
-#else
-#ifdef CONFIG_HIGHMEM
-
-#define GFP_ZONEMASK		0x03
-#define ZONES_SHIFT		2
-#define GFP_ZONETYPES		3
-
+#if !defined(CONFIG_ZONE_DMA32) && !defined(CONFIG_HIGHMEM)
+#define ZONES_SHIFT 1
 #else
-
-#define GFP_ZONEMASK		0x01
-#define ZONES_SHIFT		1
-#define GFP_ZONETYPES		2
-
-#endif
+#define ZONES_SHIFT 2
 #endif
 
 struct zone {
@@ -360,7 +318,7 @@ struct zonelist {
 struct bootmem_data;
 typedef struct pglist_data {
 	struct zone node_zones[MAX_NR_ZONES];
-	struct zonelist node_zonelists[GFP_ZONETYPES];
+	struct zonelist node_zonelists[MAX_NR_ZONES];
 	int nr_zones;
 #ifdef CONFIG_FLAT_NODE_MEM_MAP
 	struct page *node_mem_map;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 7da4142ce96..c3429a710ab 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1101,7 +1101,7 @@ static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
 	case MPOL_BIND:
 		/* Lower zones don't get a policy applied */
 		/* Careful: current->mems_allowed might have moved */
-		if (highest_zone(gfp) >= policy_zone)
+		if (gfp_zone(gfp) >= policy_zone)
 			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
 				return policy->v.zonelist;
 		/*FALL THROUGH*/
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9c44b9a39d3..208a6b03aa7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1444,14 +1444,14 @@ static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
 
 static void __meminit build_zonelists(pg_data_t *pgdat)
 {
-	int i, j, node, local_node;
+	int j, node, local_node;
+	enum zone_type i;
 	int prev_node, load;
 	struct zonelist *zonelist;
 	nodemask_t used_mask;
-	enum zone_type k;
 
 	/* initialize zonelists */
-	for (i = 0; i < GFP_ZONETYPES; i++) {
+	for (i = 0; i < MAX_NR_ZONES; i++) {
 		zonelist = pgdat->node_zonelists + i;
 		zonelist->zones[0] = NULL;
 	}
@@ -1481,13 +1481,11 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
 			node_load[node] += load;
 		prev_node = node;
 		load--;
-		for (i = 0; i < GFP_ZONETYPES; i++) {
+		for (i = 0; i < MAX_NR_ZONES; i++) {
 			zonelist = pgdat->node_zonelists + i;
 			for (j = 0; zonelist->zones[j] != NULL; j++);
 
-			k = highest_zone(i);
-
-	 		j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+	 		j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
 			zonelist->zones[j] = NULL;
 		}
 	}
@@ -1497,19 +1495,16 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
 
 static void __meminit build_zonelists(pg_data_t *pgdat)
 {
-	int i, node, local_node;
-	enum zone_type k;
-	enum zone_type j;
+	int node, local_node;
+	enum zone_type i,j;
 
 	local_node = pgdat->node_id;
-	for (i = 0; i < GFP_ZONETYPES; i++) {
+	for (i = 0; i < MAX_NR_ZONES; i++) {
 		struct zonelist *zonelist;
 
 		zonelist = pgdat->node_zonelists + i;
 
-		j = 0;
-		k = highest_zone(i);
- 		j = build_zonelists_node(pgdat, zonelist, j, k);
+ 		j = build_zonelists_node(pgdat, zonelist, 0, i);
  		/*
  		 * Now we build the zonelist so that it contains the zones
  		 * of all the other nodes.
@@ -1521,12 +1516,12 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
 		for (node = local_node + 1; node < MAX_NUMNODES; node++) {
 			if (!node_online(node))
 				continue;
-			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+			j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
 		}
 		for (node = 0; node < local_node; node++) {
 			if (!node_online(node))
 				continue;
-			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+			j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
 		}
 
 		zonelist->zones[j] = NULL;
-- 
cgit v1.2.3


From 8bc719d3cab8414938f9ea6e33b58d8810d18068 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Mon, 25 Sep 2006 23:31:20 -0700
Subject: [PATCH] out of memory notifier

Add a notifer chain to the out of memory killer.  If one of the registered
callbacks could release some memory, do not kill the process but return and
retry the allocation that forced the oom killer to run.

The purpose of the notifier is to add a safety net in the presence of
memory ballooners.  If the resource manager inflated the balloon to a size
where memory allocations can not be satisfied anymore, it is better to
deflate the balloon a bit instead of killing processes.

The implementation for the s390 ballooner is included.

[akpm@osdl.org: cleanups]
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/s390/mm/cmm.c   | 155 +++++++++++++++++++++++++++++++--------------------
 include/linux/swap.h |   4 ++
 mm/oom_kill.c        |  22 ++++++++
 3 files changed, 121 insertions(+), 60 deletions(-)

diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c
index 786a44dba5b..f2e00df99ba 100644
--- a/arch/s390/mm/cmm.c
+++ b/arch/s390/mm/cmm.c
@@ -15,6 +15,7 @@
 #include <linux/sched.h>
 #include <linux/sysctl.h>
 #include <linux/ctype.h>
+#include <linux/swap.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -34,17 +35,18 @@ struct cmm_page_array {
 	unsigned long pages[CMM_NR_PAGES];
 };
 
-static long cmm_pages = 0;
-static long cmm_timed_pages = 0;
-static volatile long cmm_pages_target = 0;
-static volatile long cmm_timed_pages_target = 0;
-static long cmm_timeout_pages = 0;
-static long cmm_timeout_seconds = 0;
+static long cmm_pages;
+static long cmm_timed_pages;
+static volatile long cmm_pages_target;
+static volatile long cmm_timed_pages_target;
+static long cmm_timeout_pages;
+static long cmm_timeout_seconds;
 
-static struct cmm_page_array *cmm_page_list = NULL;
-static struct cmm_page_array *cmm_timed_page_list = NULL;
+static struct cmm_page_array *cmm_page_list;
+static struct cmm_page_array *cmm_timed_page_list;
+static DEFINE_SPINLOCK(cmm_lock);
 
-static unsigned long cmm_thread_active = 0;
+static unsigned long cmm_thread_active;
 static struct work_struct cmm_thread_starter;
 static wait_queue_head_t cmm_thread_wait;
 static struct timer_list cmm_timer;
@@ -53,58 +55,89 @@ static void cmm_timer_fn(unsigned long);
 static void cmm_set_timer(void);
 
 static long
-cmm_alloc_pages(long pages, long *counter, struct cmm_page_array **list)
+cmm_alloc_pages(long nr, long *counter, struct cmm_page_array **list)
 {
-	struct cmm_page_array *pa;
-	unsigned long page;
+	struct cmm_page_array *pa, *npa;
+	unsigned long addr;
 
-	pa = *list;
-	while (pages) {
-		page = __get_free_page(GFP_NOIO);
-		if (!page)
+	while (nr) {
+		addr = __get_free_page(GFP_NOIO);
+		if (!addr)
 			break;
+		spin_lock(&cmm_lock);
+		pa = *list;
 		if (!pa || pa->index >= CMM_NR_PAGES) {
 			/* Need a new page for the page list. */
-			pa = (struct cmm_page_array *)
+			spin_unlock(&cmm_lock);
+			npa = (struct cmm_page_array *)
 				__get_free_page(GFP_NOIO);
-			if (!pa) {
-				free_page(page);
+			if (!npa) {
+				free_page(addr);
 				break;
 			}
-			pa->next = *list;
-			pa->index = 0;
-			*list = pa;
+			spin_lock(&cmm_lock);
+			pa = *list;
+			if (!pa || pa->index >= CMM_NR_PAGES) {
+				npa->next = pa;
+				npa->index = 0;
+				pa = npa;
+				*list = pa;
+			} else
+				free_page((unsigned long) npa);
 		}
-		diag10(page);
-		pa->pages[pa->index++] = page;
+		diag10(addr);
+		pa->pages[pa->index++] = addr;
 		(*counter)++;
-		pages--;
+		spin_unlock(&cmm_lock);
+		nr--;
 	}
-	return pages;
+	return nr;
 }
 
-static void
-cmm_free_pages(long pages, long *counter, struct cmm_page_array **list)
+static long
+cmm_free_pages(long nr, long *counter, struct cmm_page_array **list)
 {
 	struct cmm_page_array *pa;
-	unsigned long page;
+	unsigned long addr;
 
+	spin_lock(&cmm_lock);
 	pa = *list;
-	while (pages) {
+	while (nr) {
 		if (!pa || pa->index <= 0)
 			break;
-		page = pa->pages[--pa->index];
+		addr = pa->pages[--pa->index];
 		if (pa->index == 0) {
 			pa = pa->next;
 			free_page((unsigned long) *list);
 			*list = pa;
 		}
-		free_page(page);
+		free_page(addr);
 		(*counter)--;
-		pages--;
+		nr--;
 	}
+	spin_unlock(&cmm_lock);
+	return nr;
 }
 
+static int cmm_oom_notify(struct notifier_block *self,
+			  unsigned long dummy, void *parm)
+{
+	unsigned long *freed = parm;
+	long nr = 256;
+
+	nr = cmm_free_pages(nr, &cmm_timed_pages, &cmm_timed_page_list);
+	if (nr > 0)
+		nr = cmm_free_pages(nr, &cmm_pages, &cmm_page_list);
+	cmm_pages_target = cmm_pages;
+	cmm_timed_pages_target = cmm_timed_pages;
+	*freed += 256 - nr;
+	return NOTIFY_OK;
+}
+
+static struct notifier_block cmm_oom_nb = {
+	.notifier_call = cmm_oom_notify
+};
+
 static int
 cmm_thread(void *dummy)
 {
@@ -177,21 +210,21 @@ cmm_set_timer(void)
 static void
 cmm_timer_fn(unsigned long ignored)
 {
-	long pages;
+	long nr;
 
-	pages = cmm_timed_pages_target - cmm_timeout_pages;
-	if (pages < 0)
+	nr = cmm_timed_pages_target - cmm_timeout_pages;
+	if (nr < 0)
 		cmm_timed_pages_target = 0;
 	else
-		cmm_timed_pages_target = pages;
+		cmm_timed_pages_target = nr;
 	cmm_kick_thread();
 	cmm_set_timer();
 }
 
 void
-cmm_set_pages(long pages)
+cmm_set_pages(long nr)
 {
-	cmm_pages_target = pages;
+	cmm_pages_target = nr;
 	cmm_kick_thread();
 }
 
@@ -202,9 +235,9 @@ cmm_get_pages(void)
 }
 
 void
-cmm_add_timed_pages(long pages)
+cmm_add_timed_pages(long nr)
 {
-	cmm_timed_pages_target += pages;
+	cmm_timed_pages_target += nr;
 	cmm_kick_thread();
 }
 
@@ -215,9 +248,9 @@ cmm_get_timed_pages(void)
 }
 
 void
-cmm_set_timeout(long pages, long seconds)
+cmm_set_timeout(long nr, long seconds)
 {
-	cmm_timeout_pages = pages;
+	cmm_timeout_pages = nr;
 	cmm_timeout_seconds = seconds;
 	cmm_set_timer();
 }
@@ -245,7 +278,7 @@ cmm_pages_handler(ctl_table *ctl, int write, struct file *filp,
 		  void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	char buf[16], *p;
-	long pages;
+	long nr;
 	int len;
 
 	if (!*lenp || (*ppos && !write)) {
@@ -260,17 +293,17 @@ cmm_pages_handler(ctl_table *ctl, int write, struct file *filp,
 			return -EFAULT;
 		buf[sizeof(buf) - 1] = '\0';
 		cmm_skip_blanks(buf, &p);
-		pages = simple_strtoul(p, &p, 0);
+		nr = simple_strtoul(p, &p, 0);
 		if (ctl == &cmm_table[0])
-			cmm_set_pages(pages);
+			cmm_set_pages(nr);
 		else
-			cmm_add_timed_pages(pages);
+			cmm_add_timed_pages(nr);
 	} else {
 		if (ctl == &cmm_table[0])
-			pages = cmm_get_pages();
+			nr = cmm_get_pages();
 		else
-			pages = cmm_get_timed_pages();
-		len = sprintf(buf, "%ld\n", pages);
+			nr = cmm_get_timed_pages();
+		len = sprintf(buf, "%ld\n", nr);
 		if (len > *lenp)
 			len = *lenp;
 		if (copy_to_user(buffer, buf, len))
@@ -286,7 +319,7 @@ cmm_timeout_handler(ctl_table *ctl, int write, struct file *filp,
 		    void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	char buf[64], *p;
-	long pages, seconds;
+	long nr, seconds;
 	int len;
 
 	if (!*lenp || (*ppos && !write)) {
@@ -301,10 +334,10 @@ cmm_timeout_handler(ctl_table *ctl, int write, struct file *filp,
 			return -EFAULT;
 		buf[sizeof(buf) - 1] = '\0';
 		cmm_skip_blanks(buf, &p);
-		pages = simple_strtoul(p, &p, 0);
+		nr = simple_strtoul(p, &p, 0);
 		cmm_skip_blanks(p, &p);
 		seconds = simple_strtoul(p, &p, 0);
-		cmm_set_timeout(pages, seconds);
+		cmm_set_timeout(nr, seconds);
 	} else {
 		len = sprintf(buf, "%ld %ld\n",
 			      cmm_timeout_pages, cmm_timeout_seconds);
@@ -357,7 +390,7 @@ static struct ctl_table cmm_dir_table[] = {
 static void
 cmm_smsg_target(char *from, char *msg)
 {
-	long pages, seconds;
+	long nr, seconds;
 
 	if (strlen(sender) > 0 && strcmp(from, sender) != 0)
 		return;
@@ -366,27 +399,27 @@ cmm_smsg_target(char *from, char *msg)
 	if (strncmp(msg, "SHRINK", 6) == 0) {
 		if (!cmm_skip_blanks(msg + 6, &msg))
 			return;
-		pages = simple_strtoul(msg, &msg, 0);
+		nr = simple_strtoul(msg, &msg, 0);
 		cmm_skip_blanks(msg, &msg);
 		if (*msg == '\0')
-			cmm_set_pages(pages);
+			cmm_set_pages(nr);
 	} else if (strncmp(msg, "RELEASE", 7) == 0) {
 		if (!cmm_skip_blanks(msg + 7, &msg))
 			return;
-		pages = simple_strtoul(msg, &msg, 0);
+		nr = simple_strtoul(msg, &msg, 0);
 		cmm_skip_blanks(msg, &msg);
 		if (*msg == '\0')
-			cmm_add_timed_pages(pages);
+			cmm_add_timed_pages(nr);
 	} else if (strncmp(msg, "REUSE", 5) == 0) {
 		if (!cmm_skip_blanks(msg + 5, &msg))
 			return;
-		pages = simple_strtoul(msg, &msg, 0);
+		nr = simple_strtoul(msg, &msg, 0);
 		if (!cmm_skip_blanks(msg, &msg))
 			return;
 		seconds = simple_strtoul(msg, &msg, 0);
 		cmm_skip_blanks(msg, &msg);
 		if (*msg == '\0')
-			cmm_set_timeout(pages, seconds);
+			cmm_set_timeout(nr, seconds);
 	}
 }
 #endif
@@ -402,6 +435,7 @@ cmm_init (void)
 #ifdef CONFIG_CMM_IUCV
 	smsg_register_callback(SMSG_PREFIX, cmm_smsg_target);
 #endif
+	register_oom_notifier(&cmm_oom_nb);
 	INIT_WORK(&cmm_thread_starter, (void *) cmm_start_thread, NULL);
 	init_waitqueue_head(&cmm_thread_wait);
 	init_timer(&cmm_timer);
@@ -411,6 +445,7 @@ cmm_init (void)
 static void
 cmm_exit(void)
 {
+	unregister_oom_notifier(&cmm_oom_nb);
 	cmm_free_pages(cmm_pages, &cmm_pages, &cmm_page_list);
 	cmm_free_pages(cmm_timed_pages, &cmm_timed_pages, &cmm_timed_page_list);
 #ifdef CONFIG_CMM_PROC
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 34a6bc3e6cf..32db06c8ffe 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -10,6 +10,8 @@
 #include <asm/atomic.h>
 #include <asm/page.h>
 
+struct notifier_block;
+
 #define SWAP_FLAG_PREFER	0x8000	/* set if swap priority specified */
 #define SWAP_FLAG_PRIO_MASK	0x7fff
 #define SWAP_FLAG_PRIO_SHIFT	0
@@ -156,6 +158,8 @@ struct swap_list_t {
 
 /* linux/mm/oom_kill.c */
 extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order);
+extern int register_oom_notifier(struct notifier_block *nb);
+extern int unregister_oom_notifier(struct notifier_block *nb);
 
 /* linux/mm/memory.c */
 extern void swapin_readahead(swp_entry_t, unsigned long, struct vm_area_struct *);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index b9af136e5cf..7d056843fa2 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -21,6 +21,8 @@
 #include <linux/timex.h>
 #include <linux/jiffies.h>
 #include <linux/cpuset.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
 
 int sysctl_panic_on_oom;
 /* #define DEBUG */
@@ -306,6 +308,20 @@ static int oom_kill_process(struct task_struct *p, unsigned long points,
 	return oom_kill_task(p, message);
 }
 
+static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
+
+int register_oom_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_register(&oom_notify_list, nb);
+}
+EXPORT_SYMBOL_GPL(register_oom_notifier);
+
+int unregister_oom_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_unregister(&oom_notify_list, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_oom_notifier);
+
 /**
  * out_of_memory - kill the "best" process when we run out of memory
  *
@@ -318,6 +334,12 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
 {
 	struct task_struct *p;
 	unsigned long points = 0;
+	unsigned long freed = 0;
+
+	blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
+	if (freed > 0)
+		/* Got some memory back in the last second. */
+		return;
 
 	if (printk_ratelimit()) {
 		printk("oom-killer: gfp_mask=0x%x, order=%d\n",
-- 
cgit v1.2.3


From 7ff6f08295d90ab20d25200ef485ebb45b1b8d71 Mon Sep 17 00:00:00 2001
From: Martin Peschke <mp3@de.ibm.com>
Date: Mon, 25 Sep 2006 23:31:21 -0700
Subject: [PATCH] CPU hotplug compatible alloc_percpu()

This patch splits alloc_percpu() up into two phases.  Likewise for
free_percpu().  This allows clients to limit initial allocations to online
cpu's, and to populate or depopulate per-cpu data at run time as needed:

  struct my_struct *obj;

  /* initial allocation for online cpu's */
  obj = percpu_alloc(sizeof(struct my_struct), GFP_KERNEL);

  ...

  /* populate per-cpu data for cpu coming online */
  ptr = percpu_populate(obj, sizeof(struct my_struct), GFP_KERNEL, cpu);

  ...

  /* access per-cpu object */
  ptr = percpu_ptr(obj, smp_processor_id());

  ...

  /* depopulate per-cpu data for cpu going offline */
  percpu_depopulate(obj, cpu);

  ...

  /* final removal */
  percpu_free(obj);

Signed-off-by: Martin Peschke <mp3@de.ibm.com>
Cc: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/percpu.h |  79 +++++++++++++++++------
 mm/slab.c              | 166 ++++++++++++++++++++++++++++++++-----------------
 2 files changed, 169 insertions(+), 76 deletions(-)

diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index f926490a7d8..3835a9642f1 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -1,9 +1,12 @@
 #ifndef __LINUX_PERCPU_H
 #define __LINUX_PERCPU_H
+
 #include <linux/spinlock.h> /* For preempt_disable() */
 #include <linux/slab.h> /* For kmalloc() */
 #include <linux/smp.h>
 #include <linux/string.h> /* For memset() */
+#include <linux/cpumask.h>
+
 #include <asm/percpu.h>
 
 /* Enough to cover all DEFINE_PER_CPUs in kernel, including modules. */
@@ -27,39 +30,77 @@ struct percpu_data {
 	void *ptrs[NR_CPUS];
 };
 
+#define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata)
 /* 
- * Use this to get to a cpu's version of the per-cpu object allocated using
- * alloc_percpu.  Non-atomic access to the current CPU's version should
+ * Use this to get to a cpu's version of the per-cpu object dynamically
+ * allocated. Non-atomic access to the current CPU's version should
  * probably be combined with get_cpu()/put_cpu().
  */ 
-#define per_cpu_ptr(ptr, cpu)                   \
-({                                              \
-        struct percpu_data *__p = (struct percpu_data *)~(unsigned long)(ptr); \
-        (__typeof__(ptr))__p->ptrs[(cpu)];	\
+#define percpu_ptr(ptr, cpu)                              \
+({                                                        \
+        struct percpu_data *__p = __percpu_disguise(ptr); \
+        (__typeof__(ptr))__p->ptrs[(cpu)];	          \
 })
 
-extern void *__alloc_percpu(size_t size);
-extern void free_percpu(const void *);
+extern void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu);
+extern void percpu_depopulate(void *__pdata, int cpu);
+extern int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
+				  cpumask_t *mask);
+extern void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask);
+extern void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask);
+extern void percpu_free(void *__pdata);
 
 #else /* CONFIG_SMP */
 
-#define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); })
+#define percpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); })
+
+static inline void percpu_depopulate(void *__pdata, int cpu)
+{
+}
+
+static inline void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask)
+{
+}
 
-static inline void *__alloc_percpu(size_t size)
+static inline void *percpu_populate(void *__pdata, size_t size, gfp_t gfp,
+				    int cpu)
 {
-	void *ret = kmalloc(size, GFP_KERNEL);
-	if (ret)
-		memset(ret, 0, size);
-	return ret;
+	return percpu_ptr(__pdata, cpu);
 }
-static inline void free_percpu(const void *ptr)
-{	
-	kfree(ptr);
+
+static inline int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
+					 cpumask_t *mask)
+{
+	return 0;
+}
+
+static inline void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask)
+{
+	return kzalloc(size, gfp);
+}
+
+static inline void percpu_free(void *__pdata)
+{
+	kfree(__pdata);
 }
 
 #endif /* CONFIG_SMP */
 
-/* Simple wrapper for the common case: zeros memory. */
-#define alloc_percpu(type)	((type *)(__alloc_percpu(sizeof(type))))
+#define percpu_populate_mask(__pdata, size, gfp, mask) \
+	__percpu_populate_mask((__pdata), (size), (gfp), &(mask))
+#define percpu_depopulate_mask(__pdata, mask) \
+	__percpu_depopulate_mask((__pdata), &(mask))
+#define percpu_alloc_mask(size, gfp, mask) \
+	__percpu_alloc_mask((size), (gfp), &(mask))
+
+#define percpu_alloc(size, gfp) percpu_alloc_mask((size), (gfp), cpu_online_map)
+
+/* (legacy) interface for use without CPU hotplug handling */
+
+#define __alloc_percpu(size)	percpu_alloc_mask((size), GFP_KERNEL, \
+						  cpu_possible_map)
+#define alloc_percpu(type)	(type *)__alloc_percpu(sizeof(type))
+#define free_percpu(ptr)	percpu_free((ptr))
+#define per_cpu_ptr(ptr, cpu)	percpu_ptr((ptr), (cpu))
 
 #endif /* __LINUX_PERCPU_H */
diff --git a/mm/slab.c b/mm/slab.c
index 5870bcbd33c..00584dbbec0 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3371,52 +3371,127 @@ EXPORT_SYMBOL(__kmalloc_track_caller);
 
 #ifdef CONFIG_SMP
 /**
- * __alloc_percpu - allocate one copy of the object for every present
- * cpu in the system, zeroing them.
- * Objects should be dereferenced using the per_cpu_ptr macro only.
+ * percpu_depopulate - depopulate per-cpu data for given cpu
+ * @__pdata: per-cpu data to depopulate
+ * @cpu: depopulate per-cpu data for this cpu
  *
- * @size: how many bytes of memory are required.
+ * Depopulating per-cpu data for a cpu going offline would be a typical
+ * use case. You need to register a cpu hotplug handler for that purpose.
  */
-void *__alloc_percpu(size_t size)
+void percpu_depopulate(void *__pdata, int cpu)
 {
-	int i;
-	struct percpu_data *pdata = kmalloc(sizeof(*pdata), GFP_KERNEL);
+	struct percpu_data *pdata = __percpu_disguise(__pdata);
+	if (pdata->ptrs[cpu]) {
+		kfree(pdata->ptrs[cpu]);
+		pdata->ptrs[cpu] = NULL;
+	}
+}
+EXPORT_SYMBOL_GPL(percpu_depopulate);
 
-	if (!pdata)
-		return NULL;
+/**
+ * percpu_depopulate_mask - depopulate per-cpu data for some cpu's
+ * @__pdata: per-cpu data to depopulate
+ * @mask: depopulate per-cpu data for cpu's selected through mask bits
+ */
+void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask)
+{
+	int cpu;
+	for_each_cpu_mask(cpu, *mask)
+		percpu_depopulate(__pdata, cpu);
+}
+EXPORT_SYMBOL_GPL(__percpu_depopulate_mask);
 
-	/*
-	 * Cannot use for_each_online_cpu since a cpu may come online
-	 * and we have no way of figuring out how to fix the array
-	 * that we have allocated then....
-	 */
-	for_each_possible_cpu(i) {
-		int node = cpu_to_node(i);
+/**
+ * percpu_populate - populate per-cpu data for given cpu
+ * @__pdata: per-cpu data to populate further
+ * @size: size of per-cpu object
+ * @gfp: may sleep or not etc.
+ * @cpu: populate per-data for this cpu
+ *
+ * Populating per-cpu data for a cpu coming online would be a typical
+ * use case. You need to register a cpu hotplug handler for that purpose.
+ * Per-cpu object is populated with zeroed buffer.
+ */
+void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
+{
+	struct percpu_data *pdata = __percpu_disguise(__pdata);
+	int node = cpu_to_node(cpu);
 
-		if (node_online(node))
-			pdata->ptrs[i] = kmalloc_node(size, GFP_KERNEL, node);
-		else
-			pdata->ptrs[i] = kmalloc(size, GFP_KERNEL);
+	BUG_ON(pdata->ptrs[cpu]);
+	if (node_online(node)) {
+		/* FIXME: kzalloc_node(size, gfp, node) */
+		pdata->ptrs[cpu] = kmalloc_node(size, gfp, node);
+		if (pdata->ptrs[cpu])
+			memset(pdata->ptrs[cpu], 0, size);
+	} else
+		pdata->ptrs[cpu] = kzalloc(size, gfp);
+	return pdata->ptrs[cpu];
+}
+EXPORT_SYMBOL_GPL(percpu_populate);
 
-		if (!pdata->ptrs[i])
-			goto unwind_oom;
-		memset(pdata->ptrs[i], 0, size);
-	}
+/**
+ * percpu_populate_mask - populate per-cpu data for more cpu's
+ * @__pdata: per-cpu data to populate further
+ * @size: size of per-cpu object
+ * @gfp: may sleep or not etc.
+ * @mask: populate per-cpu data for cpu's selected through mask bits
+ *
+ * Per-cpu objects are populated with zeroed buffers.
+ */
+int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
+			   cpumask_t *mask)
+{
+	cpumask_t populated = CPU_MASK_NONE;
+	int cpu;
+
+	for_each_cpu_mask(cpu, *mask)
+		if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) {
+			__percpu_depopulate_mask(__pdata, &populated);
+			return -ENOMEM;
+		} else
+			cpu_set(cpu, populated);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(__percpu_populate_mask);
 
-	/* Catch derefs w/o wrappers */
-	return (void *)(~(unsigned long)pdata);
+/**
+ * percpu_alloc_mask - initial setup of per-cpu data
+ * @size: size of per-cpu object
+ * @gfp: may sleep or not etc.
+ * @mask: populate per-data for cpu's selected through mask bits
+ *
+ * Populating per-cpu data for all online cpu's would be a typical use case,
+ * which is simplified by the percpu_alloc() wrapper.
+ * Per-cpu objects are populated with zeroed buffers.
+ */
+void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask)
+{
+	void *pdata = kzalloc(sizeof(struct percpu_data), gfp);
+	void *__pdata = __percpu_disguise(pdata);
 
-unwind_oom:
-	while (--i >= 0) {
-		if (!cpu_possible(i))
-			continue;
-		kfree(pdata->ptrs[i]);
-	}
+	if (unlikely(!pdata))
+		return NULL;
+	if (likely(!__percpu_populate_mask(__pdata, size, gfp, mask)))
+		return __pdata;
 	kfree(pdata);
 	return NULL;
 }
-EXPORT_SYMBOL(__alloc_percpu);
-#endif
+EXPORT_SYMBOL_GPL(__percpu_alloc_mask);
+
+/**
+ * percpu_free - final cleanup of per-cpu data
+ * @__pdata: object to clean up
+ *
+ * We simply clean up any per-cpu object left. No need for the client to
+ * track and specify through a bis mask which per-cpu objects are to free.
+ */
+void percpu_free(void *__pdata)
+{
+	__percpu_depopulate_mask(__pdata, &cpu_possible_map);
+	kfree(__percpu_disguise(__pdata));
+}
+EXPORT_SYMBOL_GPL(percpu_free);
+#endif	/* CONFIG_SMP */
 
 /**
  * kmem_cache_free - Deallocate an object
@@ -3463,29 +3538,6 @@ void kfree(const void *objp)
 }
 EXPORT_SYMBOL(kfree);
 
-#ifdef CONFIG_SMP
-/**
- * free_percpu - free previously allocated percpu memory
- * @objp: pointer returned by alloc_percpu.
- *
- * Don't free memory not originally allocated by alloc_percpu()
- * The complemented objp is to check for that.
- */
-void free_percpu(const void *objp)
-{
-	int i;
-	struct percpu_data *p = (struct percpu_data *)(~(unsigned long)objp);
-
-	/*
-	 * We allocate for all cpus so we cannot use for online cpu here.
-	 */
-	for_each_possible_cpu(i)
-	    kfree(p->ptrs[i]);
-	kfree(p);
-}
-EXPORT_SYMBOL(free_percpu);
-#endif
-
 unsigned int kmem_cache_size(struct kmem_cache *cachep)
 {
 	return obj_size(cachep);
-- 
cgit v1.2.3


From bfa5bf6d6446f0028187a727f792fbc7934228ad Mon Sep 17 00:00:00 2001
From: Rolf Eike Beer <eike-kernel@sf-tec.de>
Date: Mon, 25 Sep 2006 23:31:22 -0700
Subject: [PATCH] Add kerneldocs for some functions in mm/memory.c

These functions are already documented quite well with long comments.  Now
add kerneldoc style header to make this turn up in everyones favorite doc
format.

Signed-off-by: Rolf Eike Beer <eike-kernel@sf-tec.de>
Cc: "Randy.Dunlap" <rdunlap@xenotime.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory.c | 34 +++++++++++++++++++++++++++-------
 1 file changed, 27 insertions(+), 7 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 65962534b4e..92a3ebd8d79 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1227,7 +1227,12 @@ out:
 	return retval;
 }
 
-/*
+/**
+ * vm_insert_page - insert single page into user vma
+ * @vma: user vma to map to
+ * @addr: target user address of this page
+ * @page: source kernel page
+ *
  * This allows drivers to insert individual pages they've allocated
  * into a user vma.
  *
@@ -1319,7 +1324,16 @@ static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
 	return 0;
 }
 
-/*  Note: this is only safe if the mm semaphore is held when called. */
+/**
+ * remap_pfn_range - remap kernel memory to userspace
+ * @vma: user vma to map to
+ * @addr: target user address to start at
+ * @pfn: physical address of kernel memory
+ * @size: size of map area
+ * @prot: page protection flags for this mapping
+ *
+ *  Note: this is only safe if the mm semaphore is held when called.
+ */
 int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 		    unsigned long pfn, unsigned long size, pgprot_t prot)
 {
@@ -1801,9 +1815,10 @@ void unmap_mapping_range(struct address_space *mapping,
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
-/*
- * Handle all mappings that got truncated by a "truncate()"
- * system call.
+/**
+ * vmtruncate - unmap mappings "freed" by truncate() syscall
+ * @inode: inode of the file used
+ * @offset: file offset to start truncating
  *
  * NOTE! We have to be ready to update the memory sharing
  * between the file and the memory map for a potential last
@@ -1872,11 +1887,16 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
 }
 EXPORT_UNUSED_SYMBOL(vmtruncate_range);  /*  June 2006  */
 
-/* 
+/**
+ * swapin_readahead - swap in pages in hope we need them soon
+ * @entry: swap entry of this memory
+ * @addr: address to start
+ * @vma: user vma this addresses belong to
+ *
  * Primitive swap readahead code. We simply read an aligned block of
  * (1 << page_cluster) entries in the swap area. This method is chosen
  * because it doesn't cost us any seek time.  We also make sure to queue
- * the 'original' request together with the readahead ones...  
+ * the 'original' request together with the readahead ones...
  *
  * This has been extended to use the NUMA policies from the mm triggering
  * the readahead.
-- 
cgit v1.2.3


From 28e4d965e6131ace1e813e93aebca89ac6b82dc1 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Mon, 25 Sep 2006 23:31:23 -0700
Subject: [PATCH] mm: remove_mapping() safeness

Some users of remove_mapping had been unsafe.

Modify the remove_mapping precondition to ensure the caller has locked the
page and obtained the correct mapping.  Modify callers to ensure the
mapping is the correct one.

[hugh@veritas.com: swapper_space fix]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/vmscan.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 41a3da3d6cc..16180587fd7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -377,8 +377,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
 
 int remove_mapping(struct address_space *mapping, struct page *page)
 {
-	if (!mapping)
-		return 0;		/* truncate got there first */
+	BUG_ON(!PageLocked(page));
+	BUG_ON(mapping != page_mapping(page));
 
 	write_lock_irq(&mapping->tree_lock);
 
@@ -547,7 +547,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 				goto free_it;
 		}
 
-		if (!remove_mapping(mapping, page))
+		if (!mapping || !remove_mapping(mapping, page))
 			goto keep_locked;
 
 free_it:
-- 
cgit v1.2.3


From db37648cd6ce9b828abd6d49aa3d269926ee7b7d Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Mon, 25 Sep 2006 23:31:24 -0700
Subject: [PATCH] mm: non syncing lock_page()

lock_page needs the caller to have a reference on the page->mapping inode
due to sync_page, ergo set_page_dirty_lock is obviously buggy according to
its comments.

Solve it by introducing a new lock_page_nosync which does not do a sync_page.

akpm: unpleasant solution to an unpleasant problem.  If it goes wrong it could
cause great slowdowns while the lock_page() caller waits for kblockd to
perform the unplug.  And if a filesystem has special sync_page() requirements
(none presently do), permanent hangs are possible.

otoh, set_page_dirty_lock() is usually (always?) called against userspace
pages.  They are always up-to-date, so there shouldn't be any pending read I/O
against these pages.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/pagemap.h | 15 +++++++++++++++
 mm/filemap.c            | 17 +++++++++++++++++
 mm/page-writeback.c     |  2 +-
 3 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 0a2f5d27f60..64f95092515 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -130,14 +130,29 @@ static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
 }
 
 extern void FASTCALL(__lock_page(struct page *page));
+extern void FASTCALL(__lock_page_nosync(struct page *page));
 extern void FASTCALL(unlock_page(struct page *page));
 
+/*
+ * lock_page may only be called if we have the page's inode pinned.
+ */
 static inline void lock_page(struct page *page)
 {
 	might_sleep();
 	if (TestSetPageLocked(page))
 		__lock_page(page);
 }
+
+/*
+ * lock_page_nosync should only be used if we can't pin the page's inode.
+ * Doesn't play quite so well with block device plugging.
+ */
+static inline void lock_page_nosync(struct page *page)
+{
+	might_sleep();
+	if (TestSetPageLocked(page))
+		__lock_page_nosync(page);
+}
 	
 /*
  * This is exported only for wait_on_page_locked/wait_on_page_writeback.
diff --git a/mm/filemap.c b/mm/filemap.c
index b9a60c43b61..d5af1cab426 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -488,6 +488,12 @@ struct page *page_cache_alloc_cold(struct address_space *x)
 EXPORT_SYMBOL(page_cache_alloc_cold);
 #endif
 
+static int __sleep_on_page_lock(void *word)
+{
+	io_schedule();
+	return 0;
+}
+
 /*
  * In order to wait for pages to become available there must be
  * waitqueues associated with pages. By using a hash table of
@@ -577,6 +583,17 @@ void fastcall __lock_page(struct page *page)
 }
 EXPORT_SYMBOL(__lock_page);
 
+/*
+ * Variant of lock_page that does not require the caller to hold a reference
+ * on the page's mapping.
+ */
+void fastcall __lock_page_nosync(struct page *page)
+{
+	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+	__wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
+							TASK_UNINTERRUPTIBLE);
+}
+
 /**
  * find_get_page - find and get a page reference
  * @mapping: the address_space to search
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index b9f4c6f1be8..555752907dc 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -701,7 +701,7 @@ int set_page_dirty_lock(struct page *page)
 {
 	int ret;
 
-	lock_page(page);
+	lock_page_nosync(page);
 	ret = set_page_dirty(page);
 	unlock_page(page);
 	return ret;
-- 
cgit v1.2.3


From ca5f9703dffa012cc46166e6206c5a992910e041 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Mon, 25 Sep 2006 23:31:25 -0700
Subject: [PATCH] slab: respect architecture and caller mandated alignment

As explained by Heiko, on s390 (32-bit) ARCH_KMALLOC_MINALIGN is set to
eight because their common I/O layer allocates data structures that need to
have an eight byte alignment.  This does not work when CONFIG_SLAB_DEBUG is
enabled because kmem_cache_create will override alignment to BYTES_PER_WORD
which is four.

So change kmem_cache_create to ensure cache alignment is always at minimum
what the architecture or caller mandates even if slab debugging is enabled.

Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/mm/slab.c b/mm/slab.c
index 00584dbbec0..d47d0e18697 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2096,6 +2096,15 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	} else {
 		ralign = BYTES_PER_WORD;
 	}
+
+	/*
+	 * Redzoning and user store require word alignment. Note this will be
+	 * overridden by architecture or caller mandated alignment if either
+	 * is greater than BYTES_PER_WORD.
+	 */
+	if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER)
+		ralign = BYTES_PER_WORD;
+
 	/* 2) arch mandated alignment: disables debug if necessary */
 	if (ralign < ARCH_SLAB_MINALIGN) {
 		ralign = ARCH_SLAB_MINALIGN;
@@ -2109,8 +2118,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 			flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
 	}
 	/*
-	 * 4) Store it. Note that the debug code below can reduce
-	 *    the alignment to BYTES_PER_WORD.
+	 * 4) Store it.
 	 */
 	align = ralign;
 
@@ -2122,20 +2130,19 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 #if DEBUG
 	cachep->obj_size = size;
 
+	/*
+	 * Both debugging options require word-alignment which is calculated
+	 * into align above.
+	 */
 	if (flags & SLAB_RED_ZONE) {
-		/* redzoning only works with word aligned caches */
-		align = BYTES_PER_WORD;
-
 		/* add space for red zone words */
 		cachep->obj_offset += BYTES_PER_WORD;
 		size += 2 * BYTES_PER_WORD;
 	}
 	if (flags & SLAB_STORE_USER) {
-		/* user store requires word alignment and
-		 * one word storage behind the end of the real
-		 * object.
+		/* user store requires one word storage behind the end of
+		 * the real object.
 		 */
-		align = BYTES_PER_WORD;
 		size += BYTES_PER_WORD;
 	}
 #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
-- 
cgit v1.2.3


From 6ddab3b9ebebc88bfdd8107c64f12d7e4480c559 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 25 Sep 2006 23:31:26 -0700
Subject: [PATCH] mm: swap write failure fixup

Currently we can silently drop data if the write to swap failed.  It
usually doesn't result in data-corruption because on page-in the process
will receive SIGBUS (assuming write-failure implies read-failure).

This assumption might or might not be valid.

This patch will avoid the page being discarded after a failed write.  But
will print a warning the sysadmin _should_ take to heart, if a lot of swap
space becomes un-writeable, OOM is not far off.

Tested by making the write fail 'randomly' once every 50 writes or so.

[akpm@osdl.org: printk warning fix]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/page_io.c | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/mm/page_io.c b/mm/page_io.c
index 88029948d00..d2f0a578337 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -52,8 +52,23 @@ static int end_swap_bio_write(struct bio *bio, unsigned int bytes_done, int err)
 	if (bio->bi_size)
 		return 1;
 
-	if (!uptodate)
+	if (!uptodate) {
 		SetPageError(page);
+		/*
+		 * We failed to write the page out to swap-space.
+		 * Re-dirty the page in order to avoid it being reclaimed.
+		 * Also print a dire warning that things will go BAD (tm)
+		 * very quickly.
+		 *
+		 * Also clear PG_reclaim to avoid rotate_reclaimable_page()
+		 */
+		set_page_dirty(page);
+		printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n",
+				imajor(bio->bi_bdev->bd_inode),
+				iminor(bio->bi_bdev->bd_inode),
+				(unsigned long long)bio->bi_sector);
+		ClearPageReclaim(page);
+	}
 	end_page_writeback(page);
 	bio_put(bio);
 	return 0;
@@ -70,6 +85,10 @@ static int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err)
 	if (!uptodate) {
 		SetPageError(page);
 		ClearPageUptodate(page);
+		printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
+				imajor(bio->bi_bdev->bd_inode),
+				iminor(bio->bi_bdev->bd_inode),
+				(unsigned long long)bio->bi_sector);
 	} else {
 		SetPageUptodate(page);
 	}
-- 
cgit v1.2.3


From 408d85441cd5a9bd6bc851d677a10c605ed8db5f Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Mon, 25 Sep 2006 23:31:27 -0700
Subject: [PATCH] oom: use unreclaimable info

__alloc_pages currently starts shooting if page reclaim has failed to free up
swap_cluster_max pages in one run through the priorities.  This is not always
a good indicator on its own, so make use of the all_unreclaimable logic as
well: don't consider going OOM until all zones we're interested in are
unreclaimable.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/vmscan.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 16180587fd7..ba18d0c36b8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -62,6 +62,8 @@ struct scan_control {
 	int swap_cluster_max;
 
 	int swappiness;
+
+	int all_unreclaimable;
 };
 
 /*
@@ -925,6 +927,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
 	unsigned long nr_reclaimed = 0;
 	int i;
 
+	sc->all_unreclaimable = 1;
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *zone = zones[i];
 
@@ -941,6 +944,8 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
 		if (zone->all_unreclaimable && priority != DEF_PRIORITY)
 			continue;	/* Let kswapd poll it */
 
+		sc->all_unreclaimable = 0;
+
 		nr_reclaimed += shrink_zone(priority, zone, sc);
 	}
 	return nr_reclaimed;
@@ -1021,6 +1026,9 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 		if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
 			blk_congestion_wait(WRITE, HZ/10);
 	}
+	/* top priority shrink_caches still had more to do? don't OOM, then */
+	if (!sc.all_unreclaimable)
+		ret = 1;
 out:
 	for (i = 0; zones[i] != 0; i++) {
 		struct zone *zone = zones[i];
-- 
cgit v1.2.3


From 4ff1ffb4870b007b86f21e5f27eeb11498c4c077 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Mon, 25 Sep 2006 23:31:28 -0700
Subject: [PATCH] oom: reclaim_mapped on oom

Potentially it takes several scans of the lru lists before we can even start
reclaiming pages.

mapped pages, with young ptes can take 2 passes on the active list + one on
the inactive list.  But reclaim_mapped may not always kick in instantly, so it
could take even more than that.

Raise the threshold for marking a zone as all_unreclaimable from a factor of 4
time the pages in the zone to 6.  Introduce a mechanism to force
reclaim_mapped if we've reached a factor 3 and still haven't made progress.

Previously, a customer doing stress testing was able to easily OOM the box
after using only a small fraction of its swap (~100MB).  After the patches, it
would only OOM after having used up all swap (~800MB).

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/vmscan.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index ba18d0c36b8..8f35d7d585c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -697,6 +697,11 @@ done:
 	return nr_reclaimed;
 }
 
+static inline int zone_is_near_oom(struct zone *zone)
+{
+	return zone->pages_scanned >= (zone->nr_active + zone->nr_inactive)*3;
+}
+
 /*
  * This moves pages from the active list to the inactive list.
  *
@@ -732,6 +737,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		long distress;
 		long swap_tendency;
 
+		if (zone_is_near_oom(zone))
+			goto force_reclaim_mapped;
+
 		/*
 		 * `distress' is a measure of how much trouble we're having
 		 * reclaiming pages.  0 -> no problems.  100 -> great trouble.
@@ -767,6 +775,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		 * memory onto the inactive list.
 		 */
 		if (swap_tendency >= 100)
+force_reclaim_mapped:
 			reclaim_mapped = 1;
 	}
 
@@ -1161,7 +1170,7 @@ scan:
 			if (zone->all_unreclaimable)
 				continue;
 			if (nr_slab == 0 && zone->pages_scanned >=
-				    (zone->nr_active + zone->nr_inactive) * 4)
+				    (zone->nr_active + zone->nr_inactive) * 6)
 				zone->all_unreclaimable = 1;
 			/*
 			 * If we've done a decent amount of scanning and
-- 
cgit v1.2.3


From 7887a3da753e1ba8244556cc9a2b38c815bfe256 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Mon, 25 Sep 2006 23:31:29 -0700
Subject: [PATCH] oom: cpuset hint

cpuset_excl_nodes_overlap does not always indicate that killing a task will
not free any memory we for us.  For example, we may be asking for an
allocation from _anywhere_ in the machine, or the task in question may be
pinning memory that is outside its cpuset.  Fix this by just causing
cpuset_excl_nodes_overlap to reduce the badness rather than disallow it.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Acked-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/oom_kill.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 7d056843fa2..4f815b06ac1 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -128,6 +128,14 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 	if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO))
 		points /= 4;
 
+	/*
+	 * If p's nodes don't overlap ours, it may still help to kill p
+	 * because p may have allocated or otherwise mapped memory on
+	 * this node before. However it will be less likely.
+	 */
+	if (!cpuset_excl_nodes_overlap(p))
+		points /= 8;
+
 	/*
 	 * Adjust the score by oomkilladj.
 	 */
@@ -198,9 +206,6 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
 			continue;
 		if (p->oomkilladj == OOM_DISABLE)
 			continue;
-		/* If p's nodes don't overlap ours, it won't help to kill p. */
-		if (!cpuset_excl_nodes_overlap(p))
-			continue;
 
 		/*
 		 * This is in the process of releasing memory so wait for it
-- 
cgit v1.2.3


From 50ec3bbffbe8a96347c54832d48110a5bc9e9ff8 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Mon, 25 Sep 2006 23:31:29 -0700
Subject: [PATCH] oom: handle current exiting

If current *is* exiting, it should actually be allowed to access reserved
memory rather than OOM kill something else.  Can't do this via a straight
check in page_alloc.c because that would allow multiple tasks to use up
reserves.  Instead cause current to OOM-kill itself which will mark it as
TIF_MEMDIE.

The current procedure of simply aborting the OOM-kill if a task is exiting can
lead to OOM deadlocks.

In the case of killing a PF_EXITING task, don't make a lot of noise about it.
This becomes more important in future patches, where we can "kill" OOM_DISABLE
tasks.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/oom_kill.c | 35 +++++++++++++++++++++++++++++++----
 1 file changed, 31 insertions(+), 4 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 4f815b06ac1..0131bae2a16 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -210,11 +210,26 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
 		/*
 		 * This is in the process of releasing memory so wait for it
 		 * to finish before killing some other task by mistake.
+		 *
+		 * However, if p is the current task, we allow the 'kill' to
+		 * go ahead if it is exiting: this will simply set TIF_MEMDIE,
+		 * which will allow it to gain access to memory reserves in
+		 * the process of exiting and releasing its resources.
+		 * Otherwise we could get an OOM deadlock.
 		 */
 		releasing = test_tsk_thread_flag(p, TIF_MEMDIE) ||
 						p->flags & PF_EXITING;
-		if (releasing && !(p->flags & PF_DEAD))
+		if (releasing) {
+			/* PF_DEAD tasks have already released their mm */
+			if (p->flags & PF_DEAD)
+				continue;
+			if (p->flags & PF_EXITING && p == current) {
+				chosen = p;
+				*ppoints = ULONG_MAX;
+				break;
+			}
 			return ERR_PTR(-1UL);
+		}
 		if (p->flags & PF_SWAPOFF)
 			return p;
 
@@ -248,8 +263,11 @@ static void __oom_kill_task(struct task_struct *p, const char *message)
 		return;
 	}
 	task_unlock(p);
-	printk(KERN_ERR "%s: Killed process %d (%s).\n",
+
+	if (message) {
+		printk(KERN_ERR "%s: Killed process %d (%s).\n",
 				message, p->pid, p->comm);
+	}
 
 	/*
 	 * We give our sacrificial lamb high priority and access to
@@ -300,8 +318,17 @@ static int oom_kill_process(struct task_struct *p, unsigned long points,
 	struct task_struct *c;
 	struct list_head *tsk;
 
-	printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li and "
-		"children.\n", p->pid, p->comm, points);
+	/*
+	 * If the task is already exiting, don't alarm the sysadmin or kill
+	 * its children or threads, just set TIF_MEMDIE so it can die quickly
+	 */
+	if (p->flags & PF_EXITING) {
+		__oom_kill_task(p, NULL);
+		return 0;
+	}
+
+	printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li"
+			" and children.\n", p->pid, p->comm, points);
 	/* Try to kill a child first */
 	list_for_each(tsk, &p->children) {
 		c = list_entry(tsk, struct task_struct, sibling);
-- 
cgit v1.2.3


From 4a3ede107e422a0c53d28024b0aa902ca22a8768 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Mon, 25 Sep 2006 23:31:30 -0700
Subject: [PATCH] oom: handle oom_disable exiting

Having the oomkilladj == OOM_DISABLE check before the releasing check means
that oomkilladj == OOM_DISABLE tasks exiting will not stop the OOM killer.

Moving the test down will give the desired behaviour.  Also: it will allow
them to "OOM-kill" themselves if they are exiting.  As per the previous patch,
this is required to prevent OOM killer deadlocks (and they don't actually get
killed, because they're already exiting -- they're simply allowed access to
memory reserves).

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/oom_kill.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 0131bae2a16..55a05f1ef76 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -204,8 +204,6 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
 		/* skip the init task with pid == 1 */
 		if (p->pid == 1)
 			continue;
-		if (p->oomkilladj == OOM_DISABLE)
-			continue;
 
 		/*
 		 * This is in the process of releasing memory so wait for it
@@ -230,6 +228,8 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
 			}
 			return ERR_PTR(-1UL);
 		}
+		if (p->oomkilladj == OOM_DISABLE)
+			continue;
 		if (p->flags & PF_SWAPOFF)
 			return p;
 
-- 
cgit v1.2.3


From af5b912435de32fbede08cee949429823ed49781 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Mon, 25 Sep 2006 23:31:31 -0700
Subject: [PATCH] oom: swapoff tasks tweak

PF_SWAPOFF processes currently cause select_bad_process to return straight
away.  Instead, give them high priority, so we will kill them first, however
we also first ensure no parallel OOM kills are happening at the same time.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/oom_kill.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 55a05f1ef76..f1aba7e7b76 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -59,6 +59,12 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 		return 0;
 	}
 
+	/*
+	 * swapoff can easily use up all memory, so kill those first.
+	 */
+	if (p->flags & PF_SWAPOFF)
+		return ULONG_MAX;
+
 	/*
 	 * The memory size of the process is the basis for the badness.
 	 */
@@ -230,8 +236,6 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
 		}
 		if (p->oomkilladj == OOM_DISABLE)
 			continue;
-		if (p->flags & PF_SWAPOFF)
-			return p;
 
 		points = badness(p, uptime.tv_sec);
 		if (points > *ppoints || !chosen) {
-- 
cgit v1.2.3


From 5081dde33f7a61d28d9b185cc386f12cb837c7a4 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Mon, 25 Sep 2006 23:31:32 -0700
Subject: [PATCH] oom: kthread infinite loop fix

Skip kernel threads, rather than having them return 0 from badness.
Theoretically, badness might truncate all results to 0, thus a kernel thread
might be picked first, causing an infinite loop.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/oom_kill.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index f1aba7e7b76..12cd4735dc2 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -207,6 +207,9 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
 		unsigned long points;
 		int releasing;
 
+		/* skip kernel threads */
+		if (!p->mm)
+			continue;
 		/* skip the init task with pid == 1 */
 		if (p->pid == 1)
 			continue;
-- 
cgit v1.2.3


From b72f160443cb78b2f8addae6e331d2adaa70f869 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Mon, 25 Sep 2006 23:31:32 -0700
Subject: [PATCH] oom: more printk

Print the name of the task invoking the OOM killer.  Could make debugging
easier.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/oom_kill.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 12cd4735dc2..c5e38400058 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -381,8 +381,9 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
 		return;
 
 	if (printk_ratelimit()) {
-		printk("oom-killer: gfp_mask=0x%x, order=%d\n",
-			gfp_mask, order);
+		printk(KERN_WARNING "%s invoked oom-killer: "
+			"gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
+			current->comm, gfp_mask, order, current->oomkilladj);
 		dump_stack();
 		show_mem();
 	}
-- 
cgit v1.2.3


From dfd54cbcc0b834652389ce99b5e656ea5f44a3c1 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Mon, 25 Sep 2006 23:31:33 -0700
Subject: [PATCH] bootmem: use MAX_DMA_ADDRESS instead of LOW32LIMIT

Introduce ARCH_LOW_ADDRESS_LIMIT which can be set per architecture to
override the 4GB default limit used by the bootmem allocater within
__alloc_bootmem_low() and __alloc_bootmem_low_node().  E.g.  s390 needs a
2GB limit instead of 4GB.

Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-s390/processor.h |  2 ++
 mm/bootmem.c                 | 11 ++++++++---
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/include/asm-s390/processor.h b/include/asm-s390/processor.h
index a3a4e5fd30d..578c2209fa7 100644
--- a/include/asm-s390/processor.h
+++ b/include/asm-s390/processor.h
@@ -337,6 +337,8 @@ struct notifier_block;
 int register_idle_notifier(struct notifier_block *nb);
 int unregister_idle_notifier(struct notifier_block *nb);
 
+#define ARCH_LOW_ADDRESS_LIMIT	0x7fffffffUL
+
 #endif
 
 /*
diff --git a/mm/bootmem.c b/mm/bootmem.c
index f0f85fa713c..d53112fcb40 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -15,6 +15,7 @@
 
 #include <asm/bug.h>
 #include <asm/io.h>
+#include <asm/processor.h>
 
 #include "internal.h"
 
@@ -453,7 +454,9 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
 	return __alloc_bootmem(size, align, goal);
 }
 
-#define LOW32LIMIT 0xffffffff
+#ifndef ARCH_LOW_ADDRESS_LIMIT
+#define ARCH_LOW_ADDRESS_LIMIT	0xffffffffUL
+#endif
 
 void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
 				  unsigned long goal)
@@ -462,7 +465,8 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
 	void *ptr;
 
 	list_for_each_entry(bdata, &bdata_list, list) {
-		ptr = __alloc_bootmem_core(bdata, size, align, goal, LOW32LIMIT);
+		ptr = __alloc_bootmem_core(bdata, size, align, goal,
+						ARCH_LOW_ADDRESS_LIMIT);
 		if (ptr)
 			return ptr;
 	}
@@ -478,5 +482,6 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
 void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
 				       unsigned long align, unsigned long goal)
 {
-	return __alloc_bootmem_core(pgdat->bdata, size, align, goal, LOW32LIMIT);
+	return __alloc_bootmem_core(pgdat->bdata, size, align, goal,
+				    ARCH_LOW_ADDRESS_LIMIT);
 }
-- 
cgit v1.2.3


From e5ac9c5aec7c4bc57fa93f2d37d760a22cb7bd33 Mon Sep 17 00:00:00 2001
From: Ravikiran G Thirumalai <kiran@scalex86.org>
Date: Mon, 25 Sep 2006 23:31:34 -0700
Subject: [PATCH] Add some comments to slab.c

Also, checks if we get a valid slabp_cache for off slab slab-descriptors.
We should always get this.  If we don't, then in that case we, will have to
disable off-slab descriptors for this cache and do the calculations again.
This is a rare case, so add a BUG_ON, for now, just in case.

Signed-off-by: Alok N Kataria <alok.kataria@calsoftinc.com>
Signed-off-by: Ravikiran Thirumalai <kiran@scalex86.org>
Signed-off-by: Shai Fultheim <shai@scalex86.org>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Manfred Spraul <manfred@colorfullife.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/mm/slab.c b/mm/slab.c
index d47d0e18697..3ad2f64998f 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2206,8 +2206,17 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 		cachep->gfpflags |= GFP_DMA;
 	cachep->buffer_size = size;
 
-	if (flags & CFLGS_OFF_SLAB)
+	if (flags & CFLGS_OFF_SLAB) {
 		cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
+		/*
+		 * This is a possibility for one of the malloc_sizes caches.
+		 * But since we go off slab only for object size greater than
+		 * PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
+		 * this should not happen at all.
+		 * But leave a BUG_ON for some lucky dude.
+		 */
+		BUG_ON(!cachep->slabp_cache);
+	}
 	cachep->ctor = ctor;
 	cachep->dtor = dtor;
 	cachep->name = name;
@@ -2441,7 +2450,17 @@ int kmem_cache_destroy(struct kmem_cache *cachep)
 }
 EXPORT_SYMBOL(kmem_cache_destroy);
 
-/* Get the memory for a slab management obj. */
+/*
+ * Get the memory for a slab management obj.
+ * For a slab cache when the slab descriptor is off-slab, slab descriptors
+ * always come from malloc_sizes caches.  The slab descriptor cannot
+ * come from the same cache which is getting created because,
+ * when we are searching for an appropriate cache for these
+ * descriptors in kmem_cache_create, we search through the malloc_sizes array.
+ * If we are creating a malloc_sizes cache here it would not be visible to
+ * kmem_find_general_cachep till the initialization is complete.
+ * Hence we cannot have slabp_cache same as the original cache.
+ */
 static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
 				   int colour_off, gfp_t local_flags,
 				   int nodeid)
@@ -3125,6 +3144,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
 		if (slabp->inuse == 0) {
 			if (l3->free_objects > l3->free_limit) {
 				l3->free_objects -= cachep->num;
+				/* No need to drop any previously held
+				 * lock here, even if we have a off-slab slab
+				 * descriptor it is guaranteed to come from
+				 * a different cache, refer to comments before
+				 * alloc_slabmgmt.
+				 */
 				slab_destroy(cachep, slabp);
 			} else {
 				list_add(&slabp->list, &l3->slabs_free);
-- 
cgit v1.2.3


From da6052f7b33abe55fbfd7d2213815f58c00a88d4 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Mon, 25 Sep 2006 23:31:35 -0700
Subject: [PATCH] update some mm/ comments

Let's try to keep mm/ comments more useful and up to date. This is a start.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h         | 68 +++++++++++++++++++++++++++-------------------
 include/linux/page-flags.h | 35 ++++++++++++++----------
 mm/filemap.c               |  8 +++---
 3 files changed, 64 insertions(+), 47 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2db4229a006..f2018775b99 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -219,7 +219,8 @@ struct inode;
  * Each physical page in the system has a struct page associated with
  * it to keep track of whatever it is we are using the page for at the
  * moment. Note that we have no way to track which tasks are using
- * a page.
+ * a page, though if it is a pagecache page, rmap structures can tell us
+ * who is mapping it.
  */
 struct page {
 	unsigned long flags;		/* Atomic flags, some possibly
@@ -299,8 +300,7 @@ struct page {
  */
 
 /*
- * Drop a ref, return true if the logical refcount fell to zero (the page has
- * no users)
+ * Drop a ref, return true if the refcount fell to zero (the page has no users)
  */
 static inline int put_page_testzero(struct page *page)
 {
@@ -356,43 +356,55 @@ void split_page(struct page *page, unsigned int order);
  * For the non-reserved pages, page_count(page) denotes a reference count.
  *   page_count() == 0 means the page is free. page->lru is then used for
  *   freelist management in the buddy allocator.
- *   page_count() == 1 means the page is used for exactly one purpose
- *   (e.g. a private data page of one process).
+ *   page_count() > 0  means the page has been allocated.
  *
- * A page may be used for kmalloc() or anyone else who does a
- * __get_free_page(). In this case the page_count() is at least 1, and
- * all other fields are unused but should be 0 or NULL. The
- * management of this page is the responsibility of the one who uses
- * it.
+ * Pages are allocated by the slab allocator in order to provide memory
+ * to kmalloc and kmem_cache_alloc. In this case, the management of the
+ * page, and the fields in 'struct page' are the responsibility of mm/slab.c
+ * unless a particular usage is carefully commented. (the responsibility of
+ * freeing the kmalloc memory is the caller's, of course).
  *
- * The other pages (we may call them "process pages") are completely
+ * A page may be used by anyone else who does a __get_free_page().
+ * In this case, page_count still tracks the references, and should only
+ * be used through the normal accessor functions. The top bits of page->flags
+ * and page->virtual store page management information, but all other fields
+ * are unused and could be used privately, carefully. The management of this
+ * page is the responsibility of the one who allocated it, and those who have
+ * subsequently been given references to it.
+ *
+ * The other pages (we may call them "pagecache pages") are completely
  * managed by the Linux memory manager: I/O, buffers, swapping etc.
  * The following discussion applies only to them.
  *
- * A page may belong to an inode's memory mapping. In this case,
- * page->mapping is the pointer to the inode, and page->index is the
- * file offset of the page, in units of PAGE_CACHE_SIZE.
+ * A pagecache page contains an opaque `private' member, which belongs to the
+ * page's address_space. Usually, this is the address of a circular list of
+ * the page's disk buffers. PG_private must be set to tell the VM to call
+ * into the filesystem to release these pages.
  *
- * A page contains an opaque `private' member, which belongs to the
- * page's address_space.  Usually, this is the address of a circular
- * list of the page's disk buffers.
+ * A page may belong to an inode's memory mapping. In this case, page->mapping
+ * is the pointer to the inode, and page->index is the file offset of the page,
+ * in units of PAGE_CACHE_SIZE.
  *
- * For pages belonging to inodes, the page_count() is the number of
- * attaches, plus 1 if `private' contains something, plus one for
- * the page cache itself.
+ * If pagecache pages are not associated with an inode, they are said to be
+ * anonymous pages. These may become associated with the swapcache, and in that
+ * case PG_swapcache is set, and page->private is an offset into the swapcache.
  *
- * Instead of keeping dirty/clean pages in per address-space lists, we instead
- * now tag pages as dirty/under writeback in the radix tree.
+ * In either case (swapcache or inode backed), the pagecache itself holds one
+ * reference to the page. Setting PG_private should also increment the
+ * refcount. The each user mapping also has a reference to the page.
  *
- * There is also a per-mapping radix tree mapping index to the page
- * in memory if present. The tree is rooted at mapping->root.  
+ * The pagecache pages are stored in a per-mapping radix tree, which is
+ * rooted at mapping->page_tree, and indexed by offset.
+ * Where 2.4 and early 2.6 kernels kept dirty/clean pages in per-address_space
+ * lists, we instead now tag pages as dirty/writeback in the radix tree.
  *
- * All process pages can do I/O:
+ * All pagecache pages may be subject to I/O:
  * - inode pages may need to be read from disk,
  * - inode pages which have been modified and are MAP_SHARED may need
- *   to be written to disk,
- * - private pages which have been modified may need to be swapped out
- *   to swap space and (later) to be read back into memory.
+ *   to be written back to the inode on disk,
+ * - anonymous pages (including MAP_PRIVATE file mappings) which have been
+ *   modified may need to be swapped out to swap space and (later) to be read
+ *   back into memory.
  */
 
 /*
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 5748642e9f3..9d7921dd50f 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -13,24 +13,25 @@
  * PG_reserved is set for special pages, which can never be swapped out. Some
  * of them might not even exist (eg empty_bad_page)...
  *
- * The PG_private bitflag is set if page->private contains a valid value.
+ * The PG_private bitflag is set on pagecache pages if they contain filesystem
+ * specific data (which is normally at page->private). It can be used by
+ * private allocations for its own usage.
  *
- * During disk I/O, PG_locked is used. This bit is set before I/O and
- * reset when I/O completes. page_waitqueue(page) is a wait queue of all tasks
- * waiting for the I/O on this page to complete.
+ * During initiation of disk I/O, PG_locked is set. This bit is set before I/O
+ * and cleared when writeback _starts_ or when read _completes_. PG_writeback
+ * is set before writeback starts and cleared when it finishes.
+ *
+ * PG_locked also pins a page in pagecache, and blocks truncation of the file
+ * while it is held.
+ *
+ * page_waitqueue(page) is a wait queue of all tasks waiting for the page
+ * to become unlocked.
  *
  * PG_uptodate tells whether the page's contents is valid.  When a read
  * completes, the page becomes uptodate, unless a disk I/O error happened.
  *
- * For choosing which pages to swap out, inode pages carry a PG_referenced bit,
- * which is set any time the system accesses that page through the (mapping,
- * index) hash table.  This referenced bit, together with the referenced bit
- * in the page tables, is used to manipulate page->age and move the page across
- * the active, inactive_dirty and inactive_clean lists.
- *
- * Note that the referenced bit, the page->lru list_head and the active,
- * inactive_dirty and inactive_clean lists are protected by the
- * zone->lru_lock, and *NOT* by the usual PG_locked bit!
+ * PG_referenced, PG_reclaim are used for page reclaim for anonymous and
+ * file-backed pagecache (see mm/vmscan.c).
  *
  * PG_error is set to indicate that an I/O error occurred on this page.
  *
@@ -42,6 +43,10 @@
  * space, they need to be kmapped separately for doing IO on the pages.  The
  * struct page (these bits with information) are always mapped into kernel
  * address space...
+ *
+ * PG_buddy is set to indicate that the page is free and in the buddy system
+ * (see mm/page_alloc.c).
+ *
  */
 
 /*
@@ -74,7 +79,7 @@
 #define PG_checked		 8	/* kill me in 2.5.<early>. */
 #define PG_arch_1		 9
 #define PG_reserved		10
-#define PG_private		11	/* Has something at ->private */
+#define PG_private		11	/* If pagecache, has fs-private data */
 
 #define PG_writeback		12	/* Page is under writeback */
 #define PG_nosave		13	/* Used for system suspend/resume */
@@ -83,7 +88,7 @@
 
 #define PG_mappedtodisk		16	/* Has blocks allocated on-disk */
 #define PG_reclaim		17	/* To be reclaimed asap */
-#define PG_nosave_free		18	/* Free, should not be written */
+#define PG_nosave_free		18	/* Used for system suspend/resume */
 #define PG_buddy		19	/* Page is free, on buddy lists */
 
 
diff --git a/mm/filemap.c b/mm/filemap.c
index d5af1cab426..afcdc72b5e9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -599,8 +599,8 @@ void fastcall __lock_page_nosync(struct page *page)
  * @mapping: the address_space to search
  * @offset: the page index
  *
- * A rather lightweight function, finding and getting a reference to a
- * hashed page atomically.
+ * Is there a pagecache struct page at the given (mapping, offset) tuple?
+ * If yes, increment its refcount and return it; if no, return NULL.
  */
 struct page * find_get_page(struct address_space *mapping, unsigned long offset)
 {
@@ -987,7 +987,7 @@ page_not_up_to_date:
 		/* Get exclusive access to the page ... */
 		lock_page(page);
 
-		/* Did it get unhashed before we got the lock? */
+		/* Did it get truncated before we got the lock? */
 		if (!page->mapping) {
 			unlock_page(page);
 			page_cache_release(page);
@@ -1627,7 +1627,7 @@ no_cached_page:
 page_not_uptodate:
 	lock_page(page);
 
-	/* Did it get unhashed while we waited for it? */
+	/* Did it get truncated while we waited for it? */
 	if (!page->mapping) {
 		unlock_page(page);
 		goto err;
-- 
cgit v1.2.3


From dbe5e69d2d6e591996ea2b817b887d03b60bb143 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 25 Sep 2006 23:31:36 -0700
Subject: [PATCH] slab: optimize kmalloc_node the same way as kmalloc

[akpm@osdl.org: export fix]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/slab.h | 25 ++++++++++++++++++++++++-
 mm/slab.c            |  4 ++--
 2 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 193c03c547e..2f6bef6a98c 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -202,7 +202,30 @@ extern int slab_is_available(void);
 
 #ifdef CONFIG_NUMA
 extern void *kmem_cache_alloc_node(kmem_cache_t *, gfp_t flags, int node);
-extern void *kmalloc_node(size_t size, gfp_t flags, int node);
+extern void *__kmalloc_node(size_t size, gfp_t flags, int node);
+
+static inline void *kmalloc_node(size_t size, gfp_t flags, int node)
+{
+	if (__builtin_constant_p(size)) {
+		int i = 0;
+#define CACHE(x) \
+		if (size <= x) \
+			goto found; \
+		else \
+			i++;
+#include "kmalloc_sizes.h"
+#undef CACHE
+		{
+			extern void __you_cannot_kmalloc_that_much(void);
+			__you_cannot_kmalloc_that_much();
+		}
+found:
+		return kmem_cache_alloc_node((flags & GFP_DMA) ?
+			malloc_sizes[i].cs_dmacachep :
+			malloc_sizes[i].cs_cachep, flags, node);
+	}
+	return __kmalloc_node(size, flags, node);
+}
 #else
 static inline void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int node)
 {
diff --git a/mm/slab.c b/mm/slab.c
index 3ad2f64998f..5e59ce7a46c 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3348,7 +3348,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 }
 EXPORT_SYMBOL(kmem_cache_alloc_node);
 
-void *kmalloc_node(size_t size, gfp_t flags, int node)
+void *__kmalloc_node(size_t size, gfp_t flags, int node)
 {
 	struct kmem_cache *cachep;
 
@@ -3357,7 +3357,7 @@ void *kmalloc_node(size_t size, gfp_t flags, int node)
 		return NULL;
 	return kmem_cache_alloc_node(cachep, flags, node);
 }
-EXPORT_SYMBOL(kmalloc_node);
+EXPORT_SYMBOL(__kmalloc_node);
 #endif
 
 /**
-- 
cgit v1.2.3


From 117f6eb1d8b8deb6f19fc88fc15bdb413c2a0c79 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:37 -0700
Subject: [PATCH] slab: extract __kmem_cache_destroy from kmem_cache_destroy

The ability to free memory allocated to a slab cache is also useful if an
error occurs during setup of a slab.  So extract the function.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Manfred Spraul <manfred@colorfullife.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 38 ++++++++++++++++++++++----------------
 1 file changed, 22 insertions(+), 16 deletions(-)

diff --git a/mm/slab.c b/mm/slab.c
index 5e59ce7a46c..c714741b253 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1833,6 +1833,27 @@ static void set_up_list3s(struct kmem_cache *cachep, int index)
 	}
 }
 
+static void __kmem_cache_destroy(struct kmem_cache *cachep)
+{
+	int i;
+	struct kmem_list3 *l3;
+
+	for_each_online_cpu(i)
+	    kfree(cachep->array[i]);
+
+	/* NUMA: free the list3 structures */
+	for_each_online_node(i) {
+		l3 = cachep->nodelists[i];
+		if (l3) {
+			kfree(l3->shared);
+			free_alien_cache(l3->alien);
+			kfree(l3);
+		}
+	}
+	kmem_cache_free(&cache_cache, cachep);
+}
+
+
 /**
  * calculate_slab_order - calculate size (page order) of slabs
  * @cachep: pointer to the cache that is being created
@@ -2404,9 +2425,6 @@ EXPORT_SYMBOL(kmem_cache_shrink);
  */
 int kmem_cache_destroy(struct kmem_cache *cachep)
 {
-	int i;
-	struct kmem_list3 *l3;
-
 	BUG_ON(!cachep || in_interrupt());
 
 	/* Don't let CPUs to come and go */
@@ -2432,19 +2450,7 @@ int kmem_cache_destroy(struct kmem_cache *cachep)
 	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
 		synchronize_rcu();
 
-	for_each_online_cpu(i)
-	    kfree(cachep->array[i]);
-
-	/* NUMA: free the list3 structures */
-	for_each_online_node(i) {
-		l3 = cachep->nodelists[i];
-		if (l3) {
-			kfree(l3->shared);
-			free_alien_cache(l3->alien);
-			kfree(l3);
-		}
-	}
-	kmem_cache_free(&cache_cache, cachep);
+	__kmem_cache_destroy(cachep);
 	unlock_cpu_hotplug();
 	return 0;
 }
-- 
cgit v1.2.3


From 2ed3a4ef95ef1a13a424378c34ebd9b7e593f212 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:38 -0700
Subject: [PATCH] slab: do not panic when alloc_kmemlist fails and slab is up

It is fairly easy to get a system to oops by simply sizing a cache via
/proc in such a way that one of the chaches (shared is easiest) becomes
bigger than the maximum allowed slab allocation size.  This occurs because
enable_cpucache() fails if it cannot reallocate some caches.

However, enable_cpucache() is used for multiple purposes: resizing caches,
cache creation and bootstrap.

If the slab is already up then we already have working caches.  The resize
can fail without a problem.  We just need to return the proper error code.
F.e.  after this patch:

# echo "size-64 10000 50 1000" >/proc/slabinfo
-bash: echo: write error: Cannot allocate memory

notice no OOPS.

If we are doing a kmem_cache_create() then we also should not panic but
return -ENOMEM.

If on the other hand we do not have a fully bootstrapped slab allocator yet
then we should indeed panic since we are unable to bring up the slab to its
full functionality.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Manfred Spraul <manfred@colorfullife.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 35 +++++++++++++++++------------------
 1 file changed, 17 insertions(+), 18 deletions(-)

diff --git a/mm/slab.c b/mm/slab.c
index c714741b253..3233c4c7cbc 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -313,7 +313,7 @@ static int drain_freelist(struct kmem_cache *cache,
 			struct kmem_list3 *l3, int tofree);
 static void free_block(struct kmem_cache *cachep, void **objpp, int len,
 			int node);
-static void enable_cpucache(struct kmem_cache *cachep);
+static int enable_cpucache(struct kmem_cache *cachep);
 static void cache_reap(void *unused);
 
 /*
@@ -1490,7 +1490,8 @@ void __init kmem_cache_init(void)
 		struct kmem_cache *cachep;
 		mutex_lock(&cache_chain_mutex);
 		list_for_each_entry(cachep, &cache_chain, next)
-			enable_cpucache(cachep);
+			if (enable_cpucache(cachep))
+				BUG();
 		mutex_unlock(&cache_chain_mutex);
 	}
 
@@ -1924,12 +1925,11 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
 	return left_over;
 }
 
-static void setup_cpu_cache(struct kmem_cache *cachep)
+static int setup_cpu_cache(struct kmem_cache *cachep)
 {
-	if (g_cpucache_up == FULL) {
-		enable_cpucache(cachep);
-		return;
-	}
+	if (g_cpucache_up == FULL)
+		return enable_cpucache(cachep);
+
 	if (g_cpucache_up == NONE) {
 		/*
 		 * Note: the first kmem_cache_create must create the cache
@@ -1976,6 +1976,7 @@ static void setup_cpu_cache(struct kmem_cache *cachep)
 	cpu_cache_get(cachep)->touched = 0;
 	cachep->batchcount = 1;
 	cachep->limit = BOOT_CPUCACHE_ENTRIES;
+	return 0;
 }
 
 /**
@@ -2242,8 +2243,11 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	cachep->dtor = dtor;
 	cachep->name = name;
 
-
-	setup_cpu_cache(cachep);
+	if (setup_cpu_cache(cachep)) {
+		__kmem_cache_destroy(cachep);
+		cachep = NULL;
+		goto oops;
+	}
 
 	/* cache setup completed, link it into the list */
 	list_add(&cachep->next, &cache_chain);
@@ -3693,7 +3697,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
 				int batchcount, int shared)
 {
 	struct ccupdate_struct new;
-	int i, err;
+	int i;
 
 	memset(&new.new, 0, sizeof(new.new));
 	for_each_online_cpu(i) {
@@ -3724,17 +3728,11 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
 		kfree(ccold);
 	}
 
-	err = alloc_kmemlist(cachep);
-	if (err) {
-		printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n",
-		       cachep->name, -err);
-		BUG();
-	}
-	return 0;
+	return alloc_kmemlist(cachep);
 }
 
 /* Called with cache_chain_mutex held always */
-static void enable_cpucache(struct kmem_cache *cachep)
+static int enable_cpucache(struct kmem_cache *cachep)
 {
 	int err;
 	int limit, shared;
@@ -3786,6 +3784,7 @@ static void enable_cpucache(struct kmem_cache *cachep)
 	if (err)
 		printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
 		       cachep->name, -err);
+	return err;
 }
 
 /*
-- 
cgit v1.2.3


From 056c62418cc639bf2fe962c6a6ee56054b838bc7 Mon Sep 17 00:00:00 2001
From: Ravikiran G Thirumalai <kiran@scalex86.org>
Date: Mon, 25 Sep 2006 23:31:38 -0700
Subject: [PATCH] slab: fix lockdep warnings

Place the alien array cache locks of on slab malloc slab caches on a
seperate lockdep class.  This avoids false positives from lockdep

[akpm@osdl.org: build fix]
Signed-off-by: Ravikiran Thirumalai <kiran@scalex86.org>
Signed-off-by: Shai Fultheim <shai@scalex86.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 57 +++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 43 insertions(+), 14 deletions(-)

diff --git a/mm/slab.c b/mm/slab.c
index 3233c4c7cbc..2b37a62f631 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -674,6 +674,8 @@ static struct kmem_cache cache_cache = {
 #endif
 };
 
+#define BAD_ALIEN_MAGIC 0x01020304ul
+
 #ifdef CONFIG_LOCKDEP
 
 /*
@@ -682,29 +684,53 @@ static struct kmem_cache cache_cache = {
  * The locking for this is tricky in that it nests within the locks
  * of all other slabs in a few places; to deal with this special
  * locking we put on-slab caches into a separate lock-class.
+ *
+ * We set lock class for alien array caches which are up during init.
+ * The lock annotation will be lost if all cpus of a node goes down and
+ * then comes back up during hotplug
  */
-static struct lock_class_key on_slab_key;
+static struct lock_class_key on_slab_l3_key;
+static struct lock_class_key on_slab_alc_key;
+
+static inline void init_lock_keys(void)
 
-static inline void init_lock_keys(struct cache_sizes *s)
 {
 	int q;
-
-	for (q = 0; q < MAX_NUMNODES; q++) {
-		if (!s->cs_cachep->nodelists[q] || OFF_SLAB(s->cs_cachep))
-			continue;
-		lockdep_set_class(&s->cs_cachep->nodelists[q]->list_lock,
-				  &on_slab_key);
+	struct cache_sizes *s = malloc_sizes;
+
+	while (s->cs_size != ULONG_MAX) {
+		for_each_node(q) {
+			struct array_cache **alc;
+			int r;
+			struct kmem_list3 *l3 = s->cs_cachep->nodelists[q];
+			if (!l3 || OFF_SLAB(s->cs_cachep))
+				continue;
+			lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
+			alc = l3->alien;
+			/*
+			 * FIXME: This check for BAD_ALIEN_MAGIC
+			 * should go away when common slab code is taught to
+			 * work even without alien caches.
+			 * Currently, non NUMA code returns BAD_ALIEN_MAGIC
+			 * for alloc_alien_cache,
+			 */
+			if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
+				continue;
+			for_each_node(r) {
+				if (alc[r])
+					lockdep_set_class(&alc[r]->lock,
+					     &on_slab_alc_key);
+			}
+		}
+		s++;
 	}
 }
-
 #else
-static inline void init_lock_keys(struct cache_sizes *s)
+static inline void init_lock_keys(void)
 {
 }
 #endif
 
-
-
 /* Guard access to the cache-chain. */
 static DEFINE_MUTEX(cache_chain_mutex);
 static struct list_head cache_chain;
@@ -1091,7 +1117,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
 
 static inline struct array_cache **alloc_alien_cache(int node, int limit)
 {
-	return (struct array_cache **) 0x01020304ul;
+	return (struct array_cache **)BAD_ALIEN_MAGIC;
 }
 
 static inline void free_alien_cache(struct array_cache **ac_ptr)
@@ -1421,7 +1447,6 @@ void __init kmem_cache_init(void)
 					ARCH_KMALLOC_FLAGS|SLAB_PANIC,
 					NULL, NULL);
 		}
-		init_lock_keys(sizes);
 
 		sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
 					sizes->cs_size,
@@ -1495,6 +1520,10 @@ void __init kmem_cache_init(void)
 		mutex_unlock(&cache_chain_mutex);
 	}
 
+	/* Annotate slab for lockdep -- annotate the malloc caches */
+	init_lock_keys();
+
+
 	/* Done! */
 	g_cpucache_up = FULL;
 
-- 
cgit v1.2.3


From 9b819d204cf602eab1a53a9ec4b8d2ca51e02a1d Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:40 -0700
Subject: [PATCH] Add __GFP_THISNODE to avoid fallback to other nodes and
 ignore cpuset/memory policy restrictions

Add a new gfp flag __GFP_THISNODE to avoid fallback to other nodes.  This
flag is essential if a kernel component requires memory to be located on a
certain node.  It will be needed for alloc_pages_node() to force allocation
on the indicated node and for alloc_pages() to force allocation on the
current node.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/gfp.h | 3 ++-
 kernel/cpuset.c     | 2 +-
 mm/mempolicy.c      | 2 +-
 mm/page_alloc.c     | 3 +++
 4 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 63ab88a36f3..0eda5b6dedb 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -45,6 +45,7 @@ struct vm_area_struct;
 #define __GFP_ZERO	((__force gfp_t)0x8000u)/* Return zeroed page on success */
 #define __GFP_NOMEMALLOC ((__force gfp_t)0x10000u) /* Don't use emergency reserves */
 #define __GFP_HARDWALL   ((__force gfp_t)0x20000u) /* Enforce hardwall cpuset memory allocs */
+#define __GFP_THISNODE	((__force gfp_t)0x40000u)/* No fallback, no policies */
 
 #define __GFP_BITS_SHIFT 20	/* Room for 20 __GFP_FOO bits */
 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
@@ -53,7 +54,7 @@ struct vm_area_struct;
 #define GFP_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS| \
 			__GFP_COLD|__GFP_NOWARN|__GFP_REPEAT| \
 			__GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP| \
-			__GFP_NOMEMALLOC|__GFP_HARDWALL)
+			__GFP_NOMEMALLOC|__GFP_HARDWALL|__GFP_THISNODE)
 
 /* This equals 0, but use constants in case they ever change */
 #define GFP_NOWAIT	(GFP_ATOMIC & ~__GFP_HIGH)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4ea6f0dc2fc..76940361273 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2316,7 +2316,7 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 	const struct cpuset *cs;	/* current cpuset ancestors */
 	int allowed;			/* is allocation in zone z allowed? */
 
-	if (in_interrupt())
+	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
 		return 1;
 	node = z->zone_pgdat->node_id;
 	might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index c3429a710ab..8002e1faccd 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1290,7 +1290,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 
 	if ((gfp & __GFP_WAIT) && !in_interrupt())
 		cpuset_update_task_memory_state();
-	if (!pol || in_interrupt())
+	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
 		pol = &default_policy;
 	if (pol->policy == MPOL_INTERLEAVE)
 		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 208a6b03aa7..ea498788af5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -893,6 +893,9 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
 	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 	 */
 	do {
+		if (unlikely((gfp_mask & __GFP_THISNODE) &&
+			(*z)->zone_pgdat != zonelist->zones[0]->zone_pgdat))
+				break;
 		if ((alloc_flags & ALLOC_CPUSET) &&
 				!cpuset_zone_allowed(*z, gfp_mask))
 			continue;
-- 
cgit v1.2.3


From 3d99cfb5f46191fc68f1343feeb2cf835001f7d7 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:43 -0700
Subject: [PATCH] sys_move_pages: Do not fall back to other nodes

If the user specified a node where we should move the page to then we
really do not want any other node.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/migrate.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/mm/migrate.c b/mm/migrate.c
index 3f1e0c2c942..6196f45c526 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -741,7 +741,9 @@ static struct page *new_page_node(struct page *p, unsigned long private,
 
 	*result = &pm->status;
 
-	return alloc_pages_node(pm->node, GFP_HIGHUSER, 0);
+	return alloc_pages_node(pm->node,
+		GFP_HIGHUSER | __GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY,
+		0);
 }
 
 /*
-- 
cgit v1.2.3


From bd1b1677b5d4f39deda068bf5cc43ce3aaec839f Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:44 -0700
Subject: [PATCH] Guarantee that the uncached allocator gets pages on the
 correct node

The uncached allocator manages per node pools.  Specify __GFP_THISNODE in
order to force allocation on the indicated node or fail.  The uncached
allocator has already logic to deal with failing allocations.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ia64/kernel/uncached.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/ia64/kernel/uncached.c b/arch/ia64/kernel/uncached.c
index 4c73a676366..f813caa8057 100644
--- a/arch/ia64/kernel/uncached.c
+++ b/arch/ia64/kernel/uncached.c
@@ -98,7 +98,8 @@ static int uncached_add_chunk(struct uncached_pool *uc_pool, int nid)
 
 	/* attempt to allocate a granule's worth of cached memory pages */
 
-	page = alloc_pages_node(nid, GFP_KERNEL | __GFP_ZERO,
+	page = alloc_pages_node(nid, GFP_KERNEL | __GFP_ZERO |
+				 __GFP_THISNODE | __GFP_NORETRY | __GFP_NOWARN,
 				IA64_GRANULE_SHIFT-PAGE_SHIFT);
 	if (!page) {
 		mutex_unlock(&uc_pool->add_chunk_mutex);
-- 
cgit v1.2.3


From 1192d526412b1b8ccb1493064cea06efc12c772b Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:45 -0700
Subject: [PATCH] Cleanup: Add zone pointer to get_page_from_freelist

There are frequent references to *z in get_page_from_freelist.

Add an explicit zone variable that can be used in all these places.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/page_alloc.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ea498788af5..e8a71657ac4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -887,35 +887,37 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
 	struct zone **z = zonelist->zones;
 	struct page *page = NULL;
 	int classzone_idx = zone_idx(*z);
+	struct zone *zone;
 
 	/*
 	 * Go through the zonelist once, looking for a zone with enough free.
 	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 	 */
 	do {
+		zone = *z;
 		if (unlikely((gfp_mask & __GFP_THISNODE) &&
-			(*z)->zone_pgdat != zonelist->zones[0]->zone_pgdat))
+			zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
 				break;
 		if ((alloc_flags & ALLOC_CPUSET) &&
-				!cpuset_zone_allowed(*z, gfp_mask))
+				!cpuset_zone_allowed(zone, gfp_mask))
 			continue;
 
 		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
 			unsigned long mark;
 			if (alloc_flags & ALLOC_WMARK_MIN)
-				mark = (*z)->pages_min;
+				mark = zone->pages_min;
 			else if (alloc_flags & ALLOC_WMARK_LOW)
-				mark = (*z)->pages_low;
+				mark = zone->pages_low;
 			else
-				mark = (*z)->pages_high;
-			if (!zone_watermark_ok(*z, order, mark,
+				mark = zone->pages_high;
+			if (!zone_watermark_ok(zone , order, mark,
 				    classzone_idx, alloc_flags))
 				if (!zone_reclaim_mode ||
-				    !zone_reclaim(*z, gfp_mask, order))
+				    !zone_reclaim(zone, gfp_mask, order))
 					continue;
 		}
 
-		page = buffered_rmqueue(zonelist, *z, order, gfp_mask);
+		page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
 		if (page) {
 			break;
 		}
-- 
cgit v1.2.3


From fbd98167e653535c5816be154f2149c0efa7757d Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:45 -0700
Subject: [PATCH] Profiling: require buffer allocation on the correct node

Profiling really suffers with off node buffers.  Fail if no memory is
available on the nodes.  The profiling code can deal with these failures
should they occur.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/profile.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/kernel/profile.c b/kernel/profile.c
index d5bd75e7501..fb660c7d35b 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -309,13 +309,17 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
 		node = cpu_to_node(cpu);
 		per_cpu(cpu_profile_flip, cpu) = 0;
 		if (!per_cpu(cpu_profile_hits, cpu)[1]) {
-			page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+			page = alloc_pages_node(node,
+					GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
+					0);
 			if (!page)
 				return NOTIFY_BAD;
 			per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
 		}
 		if (!per_cpu(cpu_profile_hits, cpu)[0]) {
-			page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+			page = alloc_pages_node(node,
+					GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
+					0);
 			if (!page)
 				goto out_free;
 			per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
@@ -491,12 +495,16 @@ static int __init create_hash_tables(void)
 		int node = cpu_to_node(cpu);
 		struct page *page;
 
-		page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+		page = alloc_pages_node(node,
+				GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
+				0);
 		if (!page)
 			goto out_cleanup;
 		per_cpu(cpu_profile_hits, cpu)[1]
 				= (struct profile_hit *)page_address(page);
-		page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+		page = alloc_pages_node(node,
+				GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
+				0);
 		if (!page)
 			goto out_cleanup;
 		per_cpu(cpu_profile_hits, cpu)[0]
-- 
cgit v1.2.3


From 980128f223fa3c75e3ebdde650c9f1bcabd4c0a2 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:46 -0700
Subject: [PATCH] Define easier to handle GFP_THISNODE

In many places we will need to use the same combination of flags.  Specify
a single GFP_THISNODE definition for ease of use in gfp.h.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ia64/kernel/uncached.c | 3 +--
 include/linux/gfp.h         | 2 ++
 mm/migrate.c                | 4 +---
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/arch/ia64/kernel/uncached.c b/arch/ia64/kernel/uncached.c
index f813caa8057..c58e933694d 100644
--- a/arch/ia64/kernel/uncached.c
+++ b/arch/ia64/kernel/uncached.c
@@ -98,8 +98,7 @@ static int uncached_add_chunk(struct uncached_pool *uc_pool, int nid)
 
 	/* attempt to allocate a granule's worth of cached memory pages */
 
-	page = alloc_pages_node(nid, GFP_KERNEL | __GFP_ZERO |
-				 __GFP_THISNODE | __GFP_NORETRY | __GFP_NOWARN,
+	page = alloc_pages_node(nid, GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
 				IA64_GRANULE_SHIFT-PAGE_SHIFT);
 	if (!page) {
 		mutex_unlock(&uc_pool->add_chunk_mutex);
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 0eda5b6dedb..8b34aabfe4c 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -67,6 +67,8 @@ struct vm_area_struct;
 #define GFP_HIGHUSER	(__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | \
 			 __GFP_HIGHMEM)
 
+#define GFP_THISNODE	(__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY)
+
 /* Flag - indicates that the buffer will be suitable for DMA.  Ignored on some
    platforms, used as appropriate on others */
 
diff --git a/mm/migrate.c b/mm/migrate.c
index 6196f45c526..20a8c2687b1 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -741,9 +741,7 @@ static struct page *new_page_node(struct page *p, unsigned long private,
 
 	*result = &pm->status;
 
-	return alloc_pages_node(pm->node,
-		GFP_HIGHUSER | __GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY,
-		0);
+	return alloc_pages_node(pm->node, GFP_HIGHUSER | GFP_THISNODE, 0);
 }
 
 /*
-- 
cgit v1.2.3


From d2e7b7d0aa021847c59f882b066e7d3812902870 Mon Sep 17 00:00:00 2001
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Date: Mon, 25 Sep 2006 23:31:47 -0700
Subject: [PATCH] fix potential stack overflow in mm/slab.c

On High end systems (1024 or so cpus) this can potentially cause stack
overflow. Fix the stack usage.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/mm/slab.c b/mm/slab.c
index 2b37a62f631..619337a5cb2 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3725,22 +3725,26 @@ static void do_ccupdate_local(void *info)
 static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
 				int batchcount, int shared)
 {
-	struct ccupdate_struct new;
+	struct ccupdate_struct *new;
 	int i;
 
-	memset(&new.new, 0, sizeof(new.new));
+	new = kzalloc(sizeof(*new), GFP_KERNEL);
+	if (!new)
+		return -ENOMEM;
+
 	for_each_online_cpu(i) {
-		new.new[i] = alloc_arraycache(cpu_to_node(i), limit,
+		new->new[i] = alloc_arraycache(cpu_to_node(i), limit,
 						batchcount);
-		if (!new.new[i]) {
+		if (!new->new[i]) {
 			for (i--; i >= 0; i--)
-				kfree(new.new[i]);
+				kfree(new->new[i]);
+			kfree(new);
 			return -ENOMEM;
 		}
 	}
-	new.cachep = cachep;
+	new->cachep = cachep;
 
-	on_each_cpu(do_ccupdate_local, (void *)&new, 1, 1);
+	on_each_cpu(do_ccupdate_local, (void *)new, 1, 1);
 
 	check_irq_on();
 	cachep->batchcount = batchcount;
@@ -3748,7 +3752,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
 	cachep->shared = shared;
 
 	for_each_online_cpu(i) {
-		struct array_cache *ccold = new.new[i];
+		struct array_cache *ccold = new->new[i];
 		if (!ccold)
 			continue;
 		spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
@@ -3756,7 +3760,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
 		spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
 		kfree(ccold);
 	}
-
+	kfree(new);
 	return alloc_kmemlist(cachep);
 }
 
@@ -4274,6 +4278,7 @@ static int leaks_show(struct seq_file *m, void *p)
 		show_symbol(m, n[2*i+2]);
 		seq_putc(m, '\n');
 	}
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 46a82b2d5591335277ed2930611f6acb4ce654ed Mon Sep 17 00:00:00 2001
From: Dave McCracken <dmccr@us.ibm.com>
Date: Mon, 25 Sep 2006 23:31:48 -0700
Subject: [PATCH] Standardize pxx_page macros

One of the changes necessary for shared page tables is to standardize the
pxx_page macros.  pte_page and pmd_page have always returned the struct
page associated with their entry, while pte_page_kernel and pmd_page_kernel
have returned the kernel virtual address.  pud_page and pgd_page, on the
other hand, return the kernel virtual address.

Shared page tables needs pud_page and pgd_page to return the actual page
structures.  There are very few actual users of these functions, so it is
simple to standardize their usage.

Since this is basic cleanup, I am submitting these changes as a standalone
patch.  Per Hugh Dickins' comments about it, I am also changing the
pxx_page_kernel macros to pxx_page_vaddr to clarify their meaning.

Signed-off-by: Dave McCracken <dmccr@us.ibm.com>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/arm/mm/ioremap.c               |  2 +-
 arch/sh/mm/cache-sh7705.c           |  2 +-
 arch/sparc/mm/srmmu.c               |  2 +-
 arch/sparc/mm/sun4c.c               |  2 +-
 arch/um/kernel/skas/mmu.c           |  2 +-
 arch/x86_64/mm/fault.c              |  6 +++---
 include/asm-alpha/mmzone.h          |  1 +
 include/asm-alpha/pgtable.h         |  9 +++++----
 include/asm-arm/pgtable.h           |  8 ++++----
 include/asm-arm26/pgtable.h         |  8 ++++----
 include/asm-cris/pgtable.h          |  4 ++--
 include/asm-frv/pgtable.h           |  8 ++++----
 include/asm-generic/4level-fixup.h  |  4 ++++
 include/asm-generic/pgtable-nopmd.h |  2 +-
 include/asm-generic/pgtable-nopud.h |  2 +-
 include/asm-i386/pgtable-3level.h   |  2 +-
 include/asm-i386/pgtable.h          |  4 ++--
 include/asm-ia64/pgtable.h          | 14 ++++++++------
 include/asm-m32r/pgtable-2level.h   |  6 +++++-
 include/asm-m32r/pgtable.h          |  4 ++--
 include/asm-m68k/motorola_pgtable.h |  1 +
 include/asm-mips/pgtable-32.h       |  4 ++--
 include/asm-mips/pgtable-64.h       | 10 ++++++----
 include/asm-mips/pgtable.h          |  2 +-
 include/asm-parisc/pgtable.h        |  9 +++++----
 include/asm-powerpc/pgtable-4k.h    |  5 +++--
 include/asm-powerpc/pgtable.h       | 11 ++++++-----
 include/asm-ppc/pgtable.h           |  8 ++++----
 include/asm-s390/pgtable.h          | 10 ++++++----
 include/asm-sh/pgtable-2level.h     |  5 ++++-
 include/asm-sh/pgtable.h            |  4 ++--
 include/asm-sh64/pgtable.h          |  6 ++++--
 include/asm-sparc/pgtable.h         |  4 ++--
 include/asm-sparc64/pgtable.h       |  5 +++--
 include/asm-um/pgtable-2level.h     |  2 +-
 include/asm-um/pgtable-3level.h     |  5 +++--
 include/asm-um/pgtable.h            |  4 ++--
 include/asm-x86_64/pgtable.h        | 16 ++++++++--------
 include/asm-xtensa/pgtable.h        |  4 ++--
 39 files changed, 117 insertions(+), 90 deletions(-)

diff --git a/arch/arm/mm/ioremap.c b/arch/arm/mm/ioremap.c
index 88a999df0ab..591fc3187c7 100644
--- a/arch/arm/mm/ioremap.c
+++ b/arch/arm/mm/ioremap.c
@@ -177,7 +177,7 @@ static void unmap_area_sections(unsigned long virt, unsigned long size)
 			 * Free the page table, if there was one.
 			 */
 			if ((pmd_val(pmd) & PMD_TYPE_MASK) == PMD_TYPE_TABLE)
-				pte_free_kernel(pmd_page_kernel(pmd));
+				pte_free_kernel(pmd_page_vaddr(pmd));
 		}
 
 		addr += PGDIR_SIZE;
diff --git a/arch/sh/mm/cache-sh7705.c b/arch/sh/mm/cache-sh7705.c
index ad8ed7d41e1..bf94eedb0a8 100644
--- a/arch/sh/mm/cache-sh7705.c
+++ b/arch/sh/mm/cache-sh7705.c
@@ -30,7 +30,7 @@
 
 #define __pte_offset(address) \
 		((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
-#define pte_offset(dir, address) ((pte_t *) pmd_page_kernel(*(dir)) + \
+#define pte_offset(dir, address) ((pte_t *) pmd_page_vaddr(*(dir)) + \
 		__pte_offset(address))
 
 static inline void cache_wback_all(void)
diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c
index 16e13f663ab..b27a506309e 100644
--- a/arch/sparc/mm/srmmu.c
+++ b/arch/sparc/mm/srmmu.c
@@ -2175,7 +2175,7 @@ void __init ld_mmu_srmmu(void)
 
 	BTFIXUPSET_CALL(pte_pfn, srmmu_pte_pfn, BTFIXUPCALL_NORM);
 	BTFIXUPSET_CALL(pmd_page, srmmu_pmd_page, BTFIXUPCALL_NORM);
-	BTFIXUPSET_CALL(pgd_page, srmmu_pgd_page, BTFIXUPCALL_NORM);
+	BTFIXUPSET_CALL(pgd_page_vaddr, srmmu_pgd_page, BTFIXUPCALL_NORM);
 
 	BTFIXUPSET_SETHI(none_mask, 0xF0000000);
 
diff --git a/arch/sparc/mm/sun4c.c b/arch/sparc/mm/sun4c.c
index 7fdddf3c7e1..436021ceb2e 100644
--- a/arch/sparc/mm/sun4c.c
+++ b/arch/sparc/mm/sun4c.c
@@ -2280,5 +2280,5 @@ void __init ld_mmu_sun4c(void)
 
 	/* These should _never_ get called with two level tables. */
 	BTFIXUPSET_CALL(pgd_set, sun4c_pgd_set, BTFIXUPCALL_NOP);
-	BTFIXUPSET_CALL(pgd_page, sun4c_pgd_page, BTFIXUPCALL_RETO0);
+	BTFIXUPSET_CALL(pgd_page_vaddr, sun4c_pgd_page, BTFIXUPCALL_RETO0);
 }
diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c
index 624ca238d1f..79c22707a63 100644
--- a/arch/um/kernel/skas/mmu.c
+++ b/arch/um/kernel/skas/mmu.c
@@ -55,7 +55,7 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc,
 	 * destroy_context_skas.
 	 */
 
-        mm->context.skas.last_page_table = pmd_page_kernel(*pmd);
+        mm->context.skas.last_page_table = pmd_page_vaddr(*pmd);
 #ifdef CONFIG_3_LEVEL_PGTABLES
         mm->context.skas.last_pmd = (unsigned long) __va(pud_val(*pud));
 #endif
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index ac8ea66ccb9..4198798e146 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -299,7 +299,7 @@ static int vmalloc_fault(unsigned long address)
 	if (pgd_none(*pgd))
 		set_pgd(pgd, *pgd_ref);
 	else
-		BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));
+		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
 
 	/* Below here mismatches are bugs because these lower tables
 	   are shared */
@@ -308,7 +308,7 @@ static int vmalloc_fault(unsigned long address)
 	pud_ref = pud_offset(pgd_ref, address);
 	if (pud_none(*pud_ref))
 		return -1;
-	if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref))
+	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
 		BUG();
 	pmd = pmd_offset(pud, address);
 	pmd_ref = pmd_offset(pud_ref, address);
@@ -641,7 +641,7 @@ void vmalloc_sync_all(void)
 				if (pgd_none(*pgd))
 					set_pgd(pgd, *pgd_ref);
 				else
-					BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));
+					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
 			}
 			spin_unlock(&pgd_lock);
 			set_bit(pgd_index(address), insync);
diff --git a/include/asm-alpha/mmzone.h b/include/asm-alpha/mmzone.h
index 64d0ab98fcd..8af56ce346a 100644
--- a/include/asm-alpha/mmzone.h
+++ b/include/asm-alpha/mmzone.h
@@ -75,6 +75,7 @@ PLAT_NODE_DATA_LOCALNR(unsigned long p, int n)
 #define VALID_PAGE(page)	(((page) - mem_map) < max_mapnr)
 
 #define pmd_page(pmd)		(pfn_to_page(pmd_val(pmd) >> 32))
+#define pgd_page(pgd)		(pfn_to_page(pgd_val(pgd) >> 32))
 #define pte_pfn(pte)		(pte_val(pte) >> 32)
 
 #define mk_pte(page, pgprot)						     \
diff --git a/include/asm-alpha/pgtable.h b/include/asm-alpha/pgtable.h
index 93eaa58b796..49ac9bee7ce 100644
--- a/include/asm-alpha/pgtable.h
+++ b/include/asm-alpha/pgtable.h
@@ -230,16 +230,17 @@ extern inline void pgd_set(pgd_t * pgdp, pmd_t * pmdp)
 
 
 extern inline unsigned long
-pmd_page_kernel(pmd_t pmd)
+pmd_page_vaddr(pmd_t pmd)
 {
 	return ((pmd_val(pmd) & _PFN_MASK) >> (32-PAGE_SHIFT)) + PAGE_OFFSET;
 }
 
 #ifndef CONFIG_DISCONTIGMEM
 #define pmd_page(pmd)	(mem_map + ((pmd_val(pmd) & _PFN_MASK) >> 32))
+#define pgd_page(pgd)	(mem_map + ((pgd_val(pgd) & _PFN_MASK) >> 32))
 #endif
 
-extern inline unsigned long pgd_page(pgd_t pgd)
+extern inline unsigned long pgd_page_vaddr(pgd_t pgd)
 { return PAGE_OFFSET + ((pgd_val(pgd) & _PFN_MASK) >> (32-PAGE_SHIFT)); }
 
 extern inline int pte_none(pte_t pte)		{ return !pte_val(pte); }
@@ -293,13 +294,13 @@ extern inline pte_t pte_mkyoung(pte_t pte)	{ pte_val(pte) |= __ACCESS_BITS; retu
 /* Find an entry in the second-level page table.. */
 extern inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address)
 {
-	return (pmd_t *) pgd_page(*dir) + ((address >> PMD_SHIFT) & (PTRS_PER_PAGE - 1));
+	return (pmd_t *) pgd_page_vaddr(*dir) + ((address >> PMD_SHIFT) & (PTRS_PER_PAGE - 1));
 }
 
 /* Find an entry in the third-level page table.. */
 extern inline pte_t * pte_offset_kernel(pmd_t * dir, unsigned long address)
 {
-	return (pte_t *) pmd_page_kernel(*dir)
+	return (pte_t *) pmd_page_vaddr(*dir)
 		+ ((address >> PAGE_SHIFT) & (PTRS_PER_PAGE - 1));
 }
 
diff --git a/include/asm-arm/pgtable.h b/include/asm-arm/pgtable.h
index 8d3919c6458..4d10d319fa3 100644
--- a/include/asm-arm/pgtable.h
+++ b/include/asm-arm/pgtable.h
@@ -224,9 +224,9 @@ extern struct page *empty_zero_page;
 #define pte_none(pte)		(!pte_val(pte))
 #define pte_clear(mm,addr,ptep)	set_pte_at((mm),(addr),(ptep), __pte(0))
 #define pte_page(pte)		(pfn_to_page(pte_pfn(pte)))
-#define pte_offset_kernel(dir,addr)	(pmd_page_kernel(*(dir)) + __pte_index(addr))
-#define pte_offset_map(dir,addr)	(pmd_page_kernel(*(dir)) + __pte_index(addr))
-#define pte_offset_map_nested(dir,addr)	(pmd_page_kernel(*(dir)) + __pte_index(addr))
+#define pte_offset_kernel(dir,addr)	(pmd_page_vaddr(*(dir)) + __pte_index(addr))
+#define pte_offset_map(dir,addr)	(pmd_page_vaddr(*(dir)) + __pte_index(addr))
+#define pte_offset_map_nested(dir,addr)	(pmd_page_vaddr(*(dir)) + __pte_index(addr))
 #define pte_unmap(pte)		do { } while (0)
 #define pte_unmap_nested(pte)	do { } while (0)
 
@@ -291,7 +291,7 @@ PTE_BIT_FUNC(mkyoung,   |= L_PTE_YOUNG);
 		clean_pmd_entry(pmdp);	\
 	} while (0)
 
-static inline pte_t *pmd_page_kernel(pmd_t pmd)
+static inline pte_t *pmd_page_vaddr(pmd_t pmd)
 {
 	unsigned long ptr;
 
diff --git a/include/asm-arm26/pgtable.h b/include/asm-arm26/pgtable.h
index 19ac9101a6b..63a8881fae1 100644
--- a/include/asm-arm26/pgtable.h
+++ b/include/asm-arm26/pgtable.h
@@ -186,12 +186,12 @@ extern struct page *empty_zero_page;
  * return a pointer to memory (no special alignment)
  */
 #define pmd_page(pmd)  ((struct page *)(pmd_val((pmd)) & ~_PMD_PRESENT))
-#define pmd_page_kernel(pmd) ((pte_t *)(pmd_val((pmd)) & ~_PMD_PRESENT))
+#define pmd_page_vaddr(pmd) ((pte_t *)(pmd_val((pmd)) & ~_PMD_PRESENT))
 
-#define pte_offset_kernel(dir,addr)     (pmd_page_kernel(*(dir)) + __pte_index(addr))
+#define pte_offset_kernel(dir,addr)     (pmd_page_vaddr(*(dir)) + __pte_index(addr))
 
-#define pte_offset_map(dir,addr)        (pmd_page_kernel(*(dir)) + __pte_index(addr))
-#define pte_offset_map_nested(dir,addr) (pmd_page_kernel(*(dir)) + __pte_index(addr))
+#define pte_offset_map(dir,addr)        (pmd_page_vaddr(*(dir)) + __pte_index(addr))
+#define pte_offset_map_nested(dir,addr) (pmd_page_vaddr(*(dir)) + __pte_index(addr))
 #define pte_unmap(pte)                  do { } while (0)
 #define pte_unmap_nested(pte)           do { } while (0)
 
diff --git a/include/asm-cris/pgtable.h b/include/asm-cris/pgtable.h
index 5d76c1c0d6c..c94a7107019 100644
--- a/include/asm-cris/pgtable.h
+++ b/include/asm-cris/pgtable.h
@@ -253,7 +253,7 @@ static inline void pmd_set(pmd_t * pmdp, pte_t * ptep)
 { pmd_val(*pmdp) = _PAGE_TABLE | (unsigned long) ptep; }
 
 #define pmd_page(pmd)		(pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
-#define pmd_page_kernel(pmd)	((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
+#define pmd_page_vaddr(pmd)	((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
 
 /* to find an entry in a page-table-directory. */
 #define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
@@ -271,7 +271,7 @@ static inline pgd_t * pgd_offset(struct mm_struct * mm, unsigned long address)
 #define __pte_offset(address) \
 	(((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
 #define pte_offset_kernel(dir, address) \
-	((pte_t *) pmd_page_kernel(*(dir)) +  __pte_offset(address))
+	((pte_t *) pmd_page_vaddr(*(dir)) +  __pte_offset(address))
 #define pte_offset_map(dir, address) \
 	((pte_t *)page_address(pmd_page(*(dir))) + __pte_offset(address))
 #define pte_offset_map_nested(dir, address) pte_offset_map(dir, address)
diff --git a/include/asm-frv/pgtable.h b/include/asm-frv/pgtable.h
index 7af7485e889..2fb3c6f05e0 100644
--- a/include/asm-frv/pgtable.h
+++ b/include/asm-frv/pgtable.h
@@ -217,7 +217,7 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
 }
 
 #define pgd_page(pgd)				(pud_page((pud_t){ pgd }))
-#define pgd_page_kernel(pgd)			(pud_page_kernel((pud_t){ pgd }))
+#define pgd_page_vaddr(pgd)			(pud_page_vaddr((pud_t){ pgd }))
 
 /*
  * allocating and freeing a pud is trivial: the 1-entry pud is
@@ -246,7 +246,7 @@ static inline void pud_clear(pud_t *pud)	{ }
 #define set_pud(pudptr, pudval)			set_pmd((pmd_t *)(pudptr), (pmd_t) { pudval })
 
 #define pud_page(pud)				(pmd_page((pmd_t){ pud }))
-#define pud_page_kernel(pud)			(pmd_page_kernel((pmd_t){ pud }))
+#define pud_page_vaddr(pud)			(pmd_page_vaddr((pmd_t){ pud }))
 
 /*
  * (pmds are folded into pgds so this doesn't get actually called,
@@ -362,7 +362,7 @@ static inline pmd_t *pmd_offset(pud_t *dir, unsigned long address)
 #define	pmd_bad(x)	(pmd_val(x) & xAMPRx_SS)
 #define pmd_clear(xp)	do { __set_pmd(xp, 0); } while(0)
 
-#define pmd_page_kernel(pmd) \
+#define pmd_page_vaddr(pmd) \
 	((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
 
 #ifndef CONFIG_DISCONTIGMEM
@@ -458,7 +458,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 #define pte_index(address) \
 		(((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
 #define pte_offset_kernel(dir, address) \
-	((pte_t *) pmd_page_kernel(*(dir)) +  pte_index(address))
+	((pte_t *) pmd_page_vaddr(*(dir)) +  pte_index(address))
 
 #if defined(CONFIG_HIGHPTE)
 #define pte_offset_map(dir, address) \
diff --git a/include/asm-generic/4level-fixup.h b/include/asm-generic/4level-fixup.h
index 68c6fea994d..7b88d3931e3 100644
--- a/include/asm-generic/4level-fixup.h
+++ b/include/asm-generic/4level-fixup.h
@@ -21,6 +21,10 @@
 #define pud_present(pud)		1
 #define pud_ERROR(pud)			do { } while (0)
 #define pud_clear(pud)			pgd_clear(pud)
+#define pud_val(pud)			pgd_val(pud)
+#define pud_populate(mm, pud, pmd)	pgd_populate(mm, pud, pmd)
+#define pud_page(pud)			pgd_page(pud)
+#define pud_page_vaddr(pud)		pgd_page_vaddr(pud)
 
 #undef pud_free_tlb
 #define pud_free_tlb(tlb, x)            do { } while (0)
diff --git a/include/asm-generic/pgtable-nopmd.h b/include/asm-generic/pgtable-nopmd.h
index c8d53ba20e1..29ff5d84d8c 100644
--- a/include/asm-generic/pgtable-nopmd.h
+++ b/include/asm-generic/pgtable-nopmd.h
@@ -47,7 +47,7 @@ static inline pmd_t * pmd_offset(pud_t * pud, unsigned long address)
 #define __pmd(x)				((pmd_t) { __pud(x) } )
 
 #define pud_page(pud)				(pmd_page((pmd_t){ pud }))
-#define pud_page_kernel(pud)			(pmd_page_kernel((pmd_t){ pud }))
+#define pud_page_vaddr(pud)			(pmd_page_vaddr((pmd_t){ pud }))
 
 /*
  * allocating and freeing a pmd is trivial: the 1-entry pmd is
diff --git a/include/asm-generic/pgtable-nopud.h b/include/asm-generic/pgtable-nopud.h
index 82e29f0ce46..56646450055 100644
--- a/include/asm-generic/pgtable-nopud.h
+++ b/include/asm-generic/pgtable-nopud.h
@@ -44,7 +44,7 @@ static inline pud_t * pud_offset(pgd_t * pgd, unsigned long address)
 #define __pud(x)				((pud_t) { __pgd(x) } )
 
 #define pgd_page(pgd)				(pud_page((pud_t){ pgd }))
-#define pgd_page_kernel(pgd)			(pud_page_kernel((pud_t){ pgd }))
+#define pgd_page_vaddr(pgd)			(pud_page_vaddr((pud_t){ pgd }))
 
 /*
  * allocating and freeing a pud is trivial: the 1-entry pud is
diff --git a/include/asm-i386/pgtable-3level.h b/include/asm-i386/pgtable-3level.h
index dccb1b3337a..807ed9e366d 100644
--- a/include/asm-i386/pgtable-3level.h
+++ b/include/asm-i386/pgtable-3level.h
@@ -77,7 +77,7 @@ static inline void pud_clear (pud_t * pud) { }
 #define pud_page(pud) \
 ((struct page *) __va(pud_val(pud) & PAGE_MASK))
 
-#define pud_page_kernel(pud) \
+#define pud_page_vaddr(pud) \
 ((unsigned long) __va(pud_val(pud) & PAGE_MASK))
 
 
diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h
index 09697fec3d2..c04b3d0f484 100644
--- a/include/asm-i386/pgtable.h
+++ b/include/asm-i386/pgtable.h
@@ -364,11 +364,11 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 #define pte_index(address) \
 		(((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
 #define pte_offset_kernel(dir, address) \
-	((pte_t *) pmd_page_kernel(*(dir)) +  pte_index(address))
+	((pte_t *) pmd_page_vaddr(*(dir)) +  pte_index(address))
 
 #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
 
-#define pmd_page_kernel(pmd) \
+#define pmd_page_vaddr(pmd) \
 		((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
 
 /*
diff --git a/include/asm-ia64/pgtable.h b/include/asm-ia64/pgtable.h
index 228981cadf8..55318274772 100644
--- a/include/asm-ia64/pgtable.h
+++ b/include/asm-ia64/pgtable.h
@@ -275,21 +275,23 @@ ia64_phys_addr_valid (unsigned long addr)
 #define pmd_bad(pmd)			(!ia64_phys_addr_valid(pmd_val(pmd)))
 #define pmd_present(pmd)		(pmd_val(pmd) != 0UL)
 #define pmd_clear(pmdp)			(pmd_val(*(pmdp)) = 0UL)
-#define pmd_page_kernel(pmd)		((unsigned long) __va(pmd_val(pmd) & _PFN_MASK))
+#define pmd_page_vaddr(pmd)		((unsigned long) __va(pmd_val(pmd) & _PFN_MASK))
 #define pmd_page(pmd)			virt_to_page((pmd_val(pmd) + PAGE_OFFSET))
 
 #define pud_none(pud)			(!pud_val(pud))
 #define pud_bad(pud)			(!ia64_phys_addr_valid(pud_val(pud)))
 #define pud_present(pud)		(pud_val(pud) != 0UL)
 #define pud_clear(pudp)			(pud_val(*(pudp)) = 0UL)
-#define pud_page(pud)			((unsigned long) __va(pud_val(pud) & _PFN_MASK))
+#define pud_page_vaddr(pud)		((unsigned long) __va(pud_val(pud) & _PFN_MASK))
+#define pud_page(pud)			virt_to_page((pud_val(pud) + PAGE_OFFSET))
 
 #ifdef CONFIG_PGTABLE_4
 #define pgd_none(pgd)			(!pgd_val(pgd))
 #define pgd_bad(pgd)			(!ia64_phys_addr_valid(pgd_val(pgd)))
 #define pgd_present(pgd)		(pgd_val(pgd) != 0UL)
 #define pgd_clear(pgdp)			(pgd_val(*(pgdp)) = 0UL)
-#define pgd_page(pgd)			((unsigned long) __va(pgd_val(pgd) & _PFN_MASK))
+#define pgd_page_vaddr(pgd)		((unsigned long) __va(pgd_val(pgd) & _PFN_MASK))
+#define pgd_page(pgd)			virt_to_page((pgd_val(pgd) + PAGE_OFFSET))
 #endif
 
 /*
@@ -360,19 +362,19 @@ pgd_offset (struct mm_struct *mm, unsigned long address)
 #ifdef CONFIG_PGTABLE_4
 /* Find an entry in the second-level page table.. */
 #define pud_offset(dir,addr) \
-	((pud_t *) pgd_page(*(dir)) + (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)))
+	((pud_t *) pgd_page_vaddr(*(dir)) + (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)))
 #endif
 
 /* Find an entry in the third-level page table.. */
 #define pmd_offset(dir,addr) \
-	((pmd_t *) pud_page(*(dir)) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)))
+	((pmd_t *) pud_page_vaddr(*(dir)) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)))
 
 /*
  * Find an entry in the third-level page table.  This looks more complicated than it
  * should be because some platforms place page tables in high memory.
  */
 #define pte_index(addr)	 	(((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
-#define pte_offset_kernel(dir,addr)	((pte_t *) pmd_page_kernel(*(dir)) + pte_index(addr))
+#define pte_offset_kernel(dir,addr)	((pte_t *) pmd_page_vaddr(*(dir)) + pte_index(addr))
 #define pte_offset_map(dir,addr)	pte_offset_kernel(dir, addr)
 #define pte_offset_map_nested(dir,addr)	pte_offset_map(dir, addr)
 #define pte_unmap(pte)			do { } while (0)
diff --git a/include/asm-m32r/pgtable-2level.h b/include/asm-m32r/pgtable-2level.h
index be0f167e344..6a674e3d37a 100644
--- a/include/asm-m32r/pgtable-2level.h
+++ b/include/asm-m32r/pgtable-2level.h
@@ -52,9 +52,13 @@ static inline int pgd_present(pgd_t pgd)	{ return 1; }
 #define set_pmd(pmdptr, pmdval) (*(pmdptr) = pmdval)
 #define set_pgd(pgdptr, pgdval) (*(pgdptr) = pgdval)
 
-#define pgd_page(pgd) \
+#define pgd_page_vaddr(pgd) \
 ((unsigned long) __va(pgd_val(pgd) & PAGE_MASK))
 
+#ifndef CONFIG_DISCONTIGMEM
+#define pgd_page(pgd)	(mem_map + ((pgd_val(pgd) >> PAGE_SHIFT) - PFN_BASE))
+#endif /* !CONFIG_DISCONTIGMEM */
+
 static inline pmd_t *pmd_offset(pgd_t * dir, unsigned long address)
 {
 	return (pmd_t *) dir;
diff --git a/include/asm-m32r/pgtable.h b/include/asm-m32r/pgtable.h
index 1983b7f4527..1c15ba7ce31 100644
--- a/include/asm-m32r/pgtable.h
+++ b/include/asm-m32r/pgtable.h
@@ -336,7 +336,7 @@ static inline void pmd_set(pmd_t * pmdp, pte_t * ptep)
 	pmd_val(*pmdp) = (((unsigned long) ptep) & PAGE_MASK);
 }
 
-#define pmd_page_kernel(pmd)	\
+#define pmd_page_vaddr(pmd)	\
 	((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
 
 #ifndef CONFIG_DISCONTIGMEM
@@ -358,7 +358,7 @@ static inline void pmd_set(pmd_t * pmdp, pte_t * ptep)
 #define pte_index(address)	\
 	(((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
 #define pte_offset_kernel(dir, address)	\
-	((pte_t *)pmd_page_kernel(*(dir)) + pte_index(address))
+	((pte_t *)pmd_page_vaddr(*(dir)) + pte_index(address))
 #define pte_offset_map(dir, address)	\
 	((pte_t *)page_address(pmd_page(*(dir))) + pte_index(address))
 #define pte_offset_map_nested(dir, address)	pte_offset_map(dir, address)
diff --git a/include/asm-m68k/motorola_pgtable.h b/include/asm-m68k/motorola_pgtable.h
index 1ccc7338a54..61e4406ed96 100644
--- a/include/asm-m68k/motorola_pgtable.h
+++ b/include/asm-m68k/motorola_pgtable.h
@@ -150,6 +150,7 @@ static inline void pgd_set(pgd_t *pgdp, pmd_t *pmdp)
 #define pgd_bad(pgd)		((pgd_val(pgd) & _DESCTYPE_MASK) != _PAGE_TABLE)
 #define pgd_present(pgd)	(pgd_val(pgd) & _PAGE_TABLE)
 #define pgd_clear(pgdp)		({ pgd_val(*pgdp) = 0; })
+#define pgd_page(pgd)		(mem_map + ((unsigned long)(__va(pgd_val(pgd)) - PAGE_OFFSET) >> PAGE_SHIFT))
 
 #define pte_ERROR(e) \
 	printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e))
diff --git a/include/asm-mips/pgtable-32.h b/include/asm-mips/pgtable-32.h
index 4b26d852813..d20f2e9b28b 100644
--- a/include/asm-mips/pgtable-32.h
+++ b/include/asm-mips/pgtable-32.h
@@ -156,9 +156,9 @@ pfn_pte(unsigned long pfn, pgprot_t prot)
 #define __pte_offset(address)						\
 	(((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
 #define pte_offset(dir, address)					\
-	((pte_t *) (pmd_page_kernel(*dir)) + __pte_offset(address))
+	((pte_t *) (pmd_page_vaddr(*dir)) + __pte_offset(address))
 #define pte_offset_kernel(dir, address) \
-	((pte_t *) pmd_page_kernel(*(dir)) +  __pte_offset(address))
+	((pte_t *) pmd_page_vaddr(*(dir)) +  __pte_offset(address))
 
 #define pte_offset_map(dir, address)                                    \
 	((pte_t *)page_address(pmd_page(*(dir))) + __pte_offset(address))
diff --git a/include/asm-mips/pgtable-64.h b/include/asm-mips/pgtable-64.h
index e3db93212ea..c59a1e21f5b 100644
--- a/include/asm-mips/pgtable-64.h
+++ b/include/asm-mips/pgtable-64.h
@@ -178,24 +178,26 @@ static inline void pud_clear(pud_t *pudp)
 /* to find an entry in a page-table-directory */
 #define pgd_offset(mm,addr)	((mm)->pgd + pgd_index(addr))
 
-static inline unsigned long pud_page(pud_t pud)
+static inline unsigned long pud_page_vaddr(pud_t pud)
 {
 	return pud_val(pud);
 }
+#define pud_phys(pud)		(pud_val(pud) - PAGE_OFFSET)
+#define pud_page(pud)		(pfn_to_page(pud_phys(pud) >> PAGE_SHIFT))
 
 /* Find an entry in the second-level page table.. */
 static inline pmd_t *pmd_offset(pud_t * pud, unsigned long address)
 {
-	return (pmd_t *) pud_page(*pud) + pmd_index(address);
+	return (pmd_t *) pud_page_vaddr(*pud) + pmd_index(address);
 }
 
 /* Find an entry in the third-level page table.. */
 #define __pte_offset(address)						\
 	(((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
 #define pte_offset(dir, address)					\
-	((pte_t *) (pmd_page_kernel(*dir)) + __pte_offset(address))
+	((pte_t *) (pmd_page_vaddr(*dir)) + __pte_offset(address))
 #define pte_offset_kernel(dir, address)					\
-	((pte_t *) pmd_page_kernel(*(dir)) +  __pte_offset(address))
+	((pte_t *) pmd_page_vaddr(*(dir)) +  __pte_offset(address))
 #define pte_offset_map(dir, address)					\
 	((pte_t *)page_address(pmd_page(*(dir))) + __pte_offset(address))
 #define pte_offset_map_nested(dir, address)				\
diff --git a/include/asm-mips/pgtable.h b/include/asm-mips/pgtable.h
index a36ca1be17f..1ca4d1e185c 100644
--- a/include/asm-mips/pgtable.h
+++ b/include/asm-mips/pgtable.h
@@ -87,7 +87,7 @@ extern void paging_init(void);
  */
 #define pmd_phys(pmd)		(pmd_val(pmd) - PAGE_OFFSET)
 #define pmd_page(pmd)		(pfn_to_page(pmd_phys(pmd) >> PAGE_SHIFT))
-#define pmd_page_kernel(pmd)	pmd_val(pmd)
+#define pmd_page_vaddr(pmd)	pmd_val(pmd)
 
 #if defined(CONFIG_64BIT_PHYS_ADDR) && defined(CONFIG_CPU_MIPS32_R1)
 
diff --git a/include/asm-parisc/pgtable.h b/include/asm-parisc/pgtable.h
index 5066c54dae0..c0b61e0d149 100644
--- a/include/asm-parisc/pgtable.h
+++ b/include/asm-parisc/pgtable.h
@@ -303,7 +303,8 @@ static inline void pmd_clear(pmd_t *pmd) {
 
 
 #if PT_NLEVELS == 3
-#define pgd_page(pgd) ((unsigned long) __va(pgd_address(pgd)))
+#define pgd_page_vaddr(pgd) ((unsigned long) __va(pgd_address(pgd)))
+#define pgd_page(pgd)	virt_to_page((void *)pgd_page_vaddr(pgd))
 
 /* For 64 bit we have three level tables */
 
@@ -382,7 +383,7 @@ extern inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 
 #define pte_page(pte)		(pfn_to_page(pte_pfn(pte)))
 
-#define pmd_page_kernel(pmd)	((unsigned long) __va(pmd_address(pmd)))
+#define pmd_page_vaddr(pmd)	((unsigned long) __va(pmd_address(pmd)))
 
 #define __pmd_page(pmd) ((unsigned long) __va(pmd_address(pmd)))
 #define pmd_page(pmd)	virt_to_page((void *)__pmd_page(pmd))
@@ -400,7 +401,7 @@ extern inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 
 #if PT_NLEVELS == 3
 #define pmd_offset(dir,address) \
-((pmd_t *) pgd_page(*(dir)) + (((address)>>PMD_SHIFT) & (PTRS_PER_PMD-1)))
+((pmd_t *) pgd_page_vaddr(*(dir)) + (((address)>>PMD_SHIFT) & (PTRS_PER_PMD-1)))
 #else
 #define pmd_offset(dir,addr) ((pmd_t *) dir)
 #endif
@@ -408,7 +409,7 @@ extern inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 /* Find an entry in the third-level page table.. */ 
 #define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE-1))
 #define pte_offset_kernel(pmd, address) \
-	((pte_t *) pmd_page_kernel(*(pmd)) + pte_index(address))
+	((pte_t *) pmd_page_vaddr(*(pmd)) + pte_index(address))
 #define pte_offset_map(pmd, address) pte_offset_kernel(pmd, address)
 #define pte_offset_map_nested(pmd, address) pte_offset_kernel(pmd, address)
 #define pte_unmap(pte) do { } while (0)
diff --git a/include/asm-powerpc/pgtable-4k.h b/include/asm-powerpc/pgtable-4k.h
index e7036155672..345d9b07b3e 100644
--- a/include/asm-powerpc/pgtable-4k.h
+++ b/include/asm-powerpc/pgtable-4k.h
@@ -88,10 +88,11 @@
 #define pgd_bad(pgd)		(pgd_val(pgd) == 0)
 #define pgd_present(pgd)	(pgd_val(pgd) != 0)
 #define pgd_clear(pgdp)		(pgd_val(*(pgdp)) = 0)
-#define pgd_page(pgd)		(pgd_val(pgd) & ~PGD_MASKED_BITS)
+#define pgd_page_vaddr(pgd)	(pgd_val(pgd) & ~PGD_MASKED_BITS)
+#define pgd_page(pgd)		virt_to_page(pgd_page_vaddr(pgd))
 
 #define pud_offset(pgdp, addr)	\
-  (((pud_t *) pgd_page(*(pgdp))) + \
+  (((pud_t *) pgd_page_vaddr(*(pgdp))) + \
     (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)))
 
 #define pud_ERROR(e) \
diff --git a/include/asm-powerpc/pgtable.h b/include/asm-powerpc/pgtable.h
index 8dbf5ad8150..10f52743f4f 100644
--- a/include/asm-powerpc/pgtable.h
+++ b/include/asm-powerpc/pgtable.h
@@ -196,8 +196,8 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot)
 				 || (pmd_val(pmd) & PMD_BAD_BITS))
 #define	pmd_present(pmd)	(pmd_val(pmd) != 0)
 #define	pmd_clear(pmdp)		(pmd_val(*(pmdp)) = 0)
-#define pmd_page_kernel(pmd)	(pmd_val(pmd) & ~PMD_MASKED_BITS)
-#define pmd_page(pmd)		virt_to_page(pmd_page_kernel(pmd))
+#define pmd_page_vaddr(pmd)	(pmd_val(pmd) & ~PMD_MASKED_BITS)
+#define pmd_page(pmd)		virt_to_page(pmd_page_vaddr(pmd))
 
 #define pud_set(pudp, pudval)	(pud_val(*(pudp)) = (pudval))
 #define pud_none(pud)		(!pud_val(pud))
@@ -205,7 +205,8 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot)
 				 || (pud_val(pud) & PUD_BAD_BITS))
 #define pud_present(pud)	(pud_val(pud) != 0)
 #define pud_clear(pudp)		(pud_val(*(pudp)) = 0)
-#define pud_page(pud)		(pud_val(pud) & ~PUD_MASKED_BITS)
+#define pud_page_vaddr(pud)	(pud_val(pud) & ~PUD_MASKED_BITS)
+#define pud_page(pud)		virt_to_page(pud_page_vaddr(pud))
 
 #define pgd_set(pgdp, pudp)	({pgd_val(*(pgdp)) = (unsigned long)(pudp);})
 
@@ -219,10 +220,10 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot)
 #define pgd_offset(mm, address)	 ((mm)->pgd + pgd_index(address))
 
 #define pmd_offset(pudp,addr) \
-  (((pmd_t *) pud_page(*(pudp))) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)))
+  (((pmd_t *) pud_page_vaddr(*(pudp))) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)))
 
 #define pte_offset_kernel(dir,addr) \
-  (((pte_t *) pmd_page_kernel(*(dir))) + (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)))
+  (((pte_t *) pmd_page_vaddr(*(dir))) + (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)))
 
 #define pte_offset_map(dir,addr)	pte_offset_kernel((dir), (addr))
 #define pte_offset_map_nested(dir,addr)	pte_offset_kernel((dir), (addr))
diff --git a/include/asm-ppc/pgtable.h b/include/asm-ppc/pgtable.h
index 51fa7c66291..b1fdbf40dba 100644
--- a/include/asm-ppc/pgtable.h
+++ b/include/asm-ppc/pgtable.h
@@ -526,7 +526,7 @@ static inline int pgd_bad(pgd_t pgd)		{ return 0; }
 static inline int pgd_present(pgd_t pgd)	{ return 1; }
 #define pgd_clear(xp)				do { } while (0)
 
-#define pgd_page(pgd) \
+#define pgd_page_vaddr(pgd) \
 	((unsigned long) __va(pgd_val(pgd) & PAGE_MASK))
 
 /*
@@ -720,12 +720,12 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
  * of the pte page.  -- paulus
  */
 #ifndef CONFIG_BOOKE
-#define pmd_page_kernel(pmd)	\
+#define pmd_page_vaddr(pmd)	\
 	((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
 #define pmd_page(pmd)		\
 	(mem_map + (pmd_val(pmd) >> PAGE_SHIFT))
 #else
-#define pmd_page_kernel(pmd)	\
+#define pmd_page_vaddr(pmd)	\
 	((unsigned long) (pmd_val(pmd) & PAGE_MASK))
 #define pmd_page(pmd)		\
 	(mem_map + (__pa(pmd_val(pmd)) >> PAGE_SHIFT))
@@ -748,7 +748,7 @@ static inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address)
 #define pte_index(address)		\
 	(((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
 #define pte_offset_kernel(dir, addr)	\
-	((pte_t *) pmd_page_kernel(*(dir)) + pte_index(addr))
+	((pte_t *) pmd_page_vaddr(*(dir)) + pte_index(addr))
 #define pte_offset_map(dir, addr)		\
 	((pte_t *) kmap_atomic(pmd_page(*(dir)), KM_PTE0) + pte_index(addr))
 #define pte_offset_map_nested(dir, addr)	\
diff --git a/include/asm-s390/pgtable.h b/include/asm-s390/pgtable.h
index 1a07028d575..e965309feda 100644
--- a/include/asm-s390/pgtable.h
+++ b/include/asm-s390/pgtable.h
@@ -664,11 +664,13 @@ static inline pte_t mk_pte_phys(unsigned long physpage, pgprot_t pgprot)
 #define pte_pfn(x) (pte_val(x) >> PAGE_SHIFT)
 #define pte_page(x) pfn_to_page(pte_pfn(x))
 
-#define pmd_page_kernel(pmd) (pmd_val(pmd) & PAGE_MASK)
+#define pmd_page_vaddr(pmd) (pmd_val(pmd) & PAGE_MASK)
 
 #define pmd_page(pmd) (mem_map+(pmd_val(pmd) >> PAGE_SHIFT))
 
-#define pgd_page_kernel(pgd) (pgd_val(pgd) & PAGE_MASK)
+#define pgd_page_vaddr(pgd) (pgd_val(pgd) & PAGE_MASK)
+
+#define pgd_page(pgd) (mem_map+(pgd_val(pgd) >> PAGE_SHIFT))
 
 /* to find an entry in a page-table-directory */
 #define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
@@ -690,14 +692,14 @@ static inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address)
 /* Find an entry in the second-level page table.. */
 #define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
 #define pmd_offset(dir,addr) \
-	((pmd_t *) pgd_page_kernel(*(dir)) + pmd_index(addr))
+	((pmd_t *) pgd_page_vaddr(*(dir)) + pmd_index(addr))
 
 #endif /* __s390x__ */
 
 /* Find an entry in the third-level page table.. */
 #define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE-1))
 #define pte_offset_kernel(pmd, address) \
-	((pte_t *) pmd_page_kernel(*(pmd)) + pte_index(address))
+	((pte_t *) pmd_page_vaddr(*(pmd)) + pte_index(address))
 #define pte_offset_map(pmd, address) pte_offset_kernel(pmd, address)
 #define pte_offset_map_nested(pmd, address) pte_offset_kernel(pmd, address)
 #define pte_unmap(pte) do { } while (0)
diff --git a/include/asm-sh/pgtable-2level.h b/include/asm-sh/pgtable-2level.h
index b0528aa3cb1..b525db6f61c 100644
--- a/include/asm-sh/pgtable-2level.h
+++ b/include/asm-sh/pgtable-2level.h
@@ -50,9 +50,12 @@ static inline void pgd_clear (pgd_t * pgdp) 	{ }
 #define set_pmd(pmdptr, pmdval) (*(pmdptr) = pmdval)
 #define set_pgd(pgdptr, pgdval) (*(pgdptr) = pgdval)
 
-#define pgd_page(pgd) \
+#define pgd_page_vaddr(pgd) \
 ((unsigned long) __va(pgd_val(pgd) & PAGE_MASK))
 
+#define pgd_page(pgd) \
+	(phys_to_page(pgd_val(pgd)))
+
 static inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address)
 {
 	return (pmd_t *) dir;
diff --git a/include/asm-sh/pgtable.h b/include/asm-sh/pgtable.h
index dcd23a03683..40d41a78041 100644
--- a/include/asm-sh/pgtable.h
+++ b/include/asm-sh/pgtable.h
@@ -225,7 +225,7 @@ static inline pgprot_t pgprot_noncached(pgprot_t _prot)
 static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 { set_pte(&pte, __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot))); return pte; }
 
-#define pmd_page_kernel(pmd) \
+#define pmd_page_vaddr(pmd) \
 ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
 
 #define pmd_page(pmd) \
@@ -242,7 +242,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 #define pte_index(address) \
 		((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
 #define pte_offset_kernel(dir, address) \
-	((pte_t *) pmd_page_kernel(*(dir)) + pte_index(address))
+	((pte_t *) pmd_page_vaddr(*(dir)) + pte_index(address))
 #define pte_offset_map(dir, address) pte_offset_kernel(dir, address)
 #define pte_offset_map_nested(dir, address) pte_offset_kernel(dir, address)
 #define pte_unmap(pte)		do { } while (0)
diff --git a/include/asm-sh64/pgtable.h b/include/asm-sh64/pgtable.h
index 54c7821893f..6b97c4cb1d6 100644
--- a/include/asm-sh64/pgtable.h
+++ b/include/asm-sh64/pgtable.h
@@ -190,7 +190,9 @@ static inline int pgd_bad(pgd_t pgd)		{ return 0; }
 #endif
 
 
-#define pgd_page(pgd_entry)	((unsigned long) (pgd_val(pgd_entry) & PAGE_MASK))
+#define pgd_page_vaddr(pgd_entry)	((unsigned long) (pgd_val(pgd_entry) & PAGE_MASK))
+#define pgd_page(pgd)	(virt_to_page(pgd_val(pgd)))
+
 
 /*
  * PMD defines. Middle level.
@@ -219,7 +221,7 @@ static inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address)
 #define pmd_none(pmd_entry)	(pmd_val((pmd_entry)) == _PMD_EMPTY)
 #define pmd_bad(pmd_entry)	((pmd_val(pmd_entry) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
 
-#define pmd_page_kernel(pmd_entry) \
+#define pmd_page_vaddr(pmd_entry) \
 	((unsigned long) __va(pmd_val(pmd_entry) & PAGE_MASK))
 
 #define pmd_page(pmd) \
diff --git a/include/asm-sparc/pgtable.h b/include/asm-sparc/pgtable.h
index 226c6475c9a..4f0a5ba0d6a 100644
--- a/include/asm-sparc/pgtable.h
+++ b/include/asm-sparc/pgtable.h
@@ -143,10 +143,10 @@ extern unsigned long empty_zero_page;
 /*
  */
 BTFIXUPDEF_CALL_CONST(struct page *, pmd_page, pmd_t)
-BTFIXUPDEF_CALL_CONST(unsigned long, pgd_page, pgd_t)
+BTFIXUPDEF_CALL_CONST(unsigned long, pgd_page_vaddr, pgd_t)
 
 #define pmd_page(pmd) BTFIXUP_CALL(pmd_page)(pmd)
-#define pgd_page(pgd) BTFIXUP_CALL(pgd_page)(pgd)
+#define pgd_page_vaddr(pgd) BTFIXUP_CALL(pgd_page_vaddr)(pgd)
 
 BTFIXUPDEF_SETHI(none_mask)
 BTFIXUPDEF_CALL_CONST(int, pte_present, pte_t)
diff --git a/include/asm-sparc64/pgtable.h b/include/asm-sparc64/pgtable.h
index ebfe395cfb8..b12be7a869f 100644
--- a/include/asm-sparc64/pgtable.h
+++ b/include/asm-sparc64/pgtable.h
@@ -630,8 +630,9 @@ static inline unsigned long pte_present(pte_t pte)
 #define __pmd_page(pmd)		\
 	((unsigned long) __va((((unsigned long)pmd_val(pmd))<<11UL)))
 #define pmd_page(pmd) 			virt_to_page((void *)__pmd_page(pmd))
-#define pud_page(pud)		\
+#define pud_page_vaddr(pud)		\
 	((unsigned long) __va((((unsigned long)pud_val(pud))<<11UL)))
+#define pud_page(pud) 			virt_to_page((void *)pud_page_vaddr(pud))
 #define pmd_none(pmd)			(!pmd_val(pmd))
 #define pmd_bad(pmd)			(0)
 #define pmd_present(pmd)		(pmd_val(pmd) != 0U)
@@ -653,7 +654,7 @@ static inline unsigned long pte_present(pte_t pte)
 
 /* Find an entry in the second-level page table.. */
 #define pmd_offset(pudp, address)	\
-	((pmd_t *) pud_page(*(pudp)) + \
+	((pmd_t *) pud_page_vaddr(*(pudp)) + \
 	 (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)))
 
 /* Find an entry in the third-level page table.. */
diff --git a/include/asm-um/pgtable-2level.h b/include/asm-um/pgtable-2level.h
index ffe017f6b64..6050e0eb257 100644
--- a/include/asm-um/pgtable-2level.h
+++ b/include/asm-um/pgtable-2level.h
@@ -41,7 +41,7 @@ static inline void pgd_mkuptodate(pgd_t pgd)	{ }
 #define pfn_pte(pfn, prot) __pte(pfn_to_phys(pfn) | pgprot_val(prot))
 #define pfn_pmd(pfn, prot) __pmd(pfn_to_phys(pfn) | pgprot_val(prot))
 
-#define pmd_page_kernel(pmd) \
+#define pmd_page_vaddr(pmd) \
 	((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
 
 /*
diff --git a/include/asm-um/pgtable-3level.h b/include/asm-um/pgtable-3level.h
index 786c2572728..ca0c2a92a11 100644
--- a/include/asm-um/pgtable-3level.h
+++ b/include/asm-um/pgtable-3level.h
@@ -74,11 +74,12 @@ extern inline void pud_clear (pud_t *pud)
         set_pud(pud, __pud(0));
 }
 
-#define pud_page(pud) \
+#define pud_page(pud) phys_to_page(pud_val(pud) & PAGE_MASK)
+#define pud_page_vaddr(pud) \
 	((struct page *) __va(pud_val(pud) & PAGE_MASK))
 
 /* Find an entry in the second-level page table.. */
-#define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \
+#define pmd_offset(pud, address) ((pmd_t *) pud_page_vaddr(*(pud)) + \
 			pmd_index(address))
 
 static inline unsigned long pte_pfn(pte_t pte)
diff --git a/include/asm-um/pgtable.h b/include/asm-um/pgtable.h
index ac64eb95586..4862daf8b90 100644
--- a/include/asm-um/pgtable.h
+++ b/include/asm-um/pgtable.h
@@ -349,7 +349,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 	return pte; 
 }
 
-#define pmd_page_kernel(pmd) ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
+#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
 
 /*
  * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD]
@@ -389,7 +389,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
  */
 #define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
 #define pte_offset_kernel(dir, address) \
-	((pte_t *) pmd_page_kernel(*(dir)) +  pte_index(address))
+	((pte_t *) pmd_page_vaddr(*(dir)) +  pte_index(address))
 #define pte_offset_map(dir, address) \
 	((pte_t *)page_address(pmd_page(*(dir))) + pte_index(address))
 #define pte_offset_map_nested(dir, address) pte_offset_map(dir, address)
diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h
index a31ab4e68a9..51eba239517 100644
--- a/include/asm-x86_64/pgtable.h
+++ b/include/asm-x86_64/pgtable.h
@@ -101,9 +101,6 @@ static inline void pgd_clear (pgd_t * pgd)
 	set_pgd(pgd, __pgd(0));
 }
 
-#define pud_page(pud) \
-((unsigned long) __va(pud_val(pud) & PHYSICAL_PAGE_MASK))
-
 #define ptep_get_and_clear(mm,addr,xp)	__pte(xchg(&(xp)->pte, 0))
 
 struct mm_struct;
@@ -326,7 +323,8 @@ static inline int pmd_large(pmd_t pte) {
 /*
  * Level 4 access.
  */
-#define pgd_page(pgd) ((unsigned long) __va((unsigned long)pgd_val(pgd) & PTE_MASK))
+#define pgd_page_vaddr(pgd) ((unsigned long) __va((unsigned long)pgd_val(pgd) & PTE_MASK))
+#define pgd_page(pgd)		(pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT))
 #define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
 #define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr))
 #define pgd_offset_k(address) (init_level4_pgt + pgd_index(address))
@@ -335,16 +333,18 @@ static inline int pmd_large(pmd_t pte) {
 
 /* PUD - Level3 access */
 /* to find an entry in a page-table-directory. */
+#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PHYSICAL_PAGE_MASK))
+#define pud_page(pud)		(pfn_to_page(pud_val(pud) >> PAGE_SHIFT))
 #define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
-#define pud_offset(pgd, address) ((pud_t *) pgd_page(*(pgd)) + pud_index(address))
+#define pud_offset(pgd, address) ((pud_t *) pgd_page_vaddr(*(pgd)) + pud_index(address))
 #define pud_present(pud) (pud_val(pud) & _PAGE_PRESENT)
 
 /* PMD  - Level 2 access */
-#define pmd_page_kernel(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK))
+#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK))
 #define pmd_page(pmd)		(pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
 
 #define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
-#define pmd_offset(dir, address) ((pmd_t *) pud_page(*(dir)) + \
+#define pmd_offset(dir, address) ((pmd_t *) pud_page_vaddr(*(dir)) + \
 			pmd_index(address))
 #define pmd_none(x)	(!pmd_val(x))
 #define pmd_present(x)	(pmd_val(x) & _PAGE_PRESENT)
@@ -382,7 +382,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 
 #define pte_index(address) \
 		(((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
-#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_kernel(*(dir)) + \
+#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_vaddr(*(dir)) + \
 			pte_index(address))
 
 /* x86-64 always has all page tables mapped. */
diff --git a/include/asm-xtensa/pgtable.h b/include/asm-xtensa/pgtable.h
index 7b15afb70c5..a47cc734c20 100644
--- a/include/asm-xtensa/pgtable.h
+++ b/include/asm-xtensa/pgtable.h
@@ -218,7 +218,7 @@ extern pgd_t swapper_pg_dir[PAGE_SIZE/sizeof(pgd_t)];
 /*
  * The pmd contains the kernel virtual address of the pte page.
  */
-#define pmd_page_kernel(pmd) ((unsigned long)(pmd_val(pmd) & PAGE_MASK))
+#define pmd_page_vaddr(pmd) ((unsigned long)(pmd_val(pmd) & PAGE_MASK))
 #define pmd_page(pmd) virt_to_page(pmd_val(pmd))
 
 /*
@@ -349,7 +349,7 @@ ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 /* Find an entry in the third-level page table.. */
 #define pte_index(address)	(((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
 #define pte_offset_kernel(dir,addr) 					\
-	((pte_t*) pmd_page_kernel(*(dir)) + pte_index(addr))
+	((pte_t*) pmd_page_vaddr(*(dir)) + pte_index(addr))
 #define pte_offset_map(dir,addr)	pte_offset_kernel((dir),(addr))
 #define pte_offset_map_nested(dir,addr)	pte_offset_kernel((dir),(addr))
 
-- 
cgit v1.2.3


From 006d22d9bbb7e66279ba5cc4556b54eeaf8fd556 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:48 -0700
Subject: [PATCH] Optimize free_one_page

Free one_page currently adds the page to a fake list and calls
free_page_bulk.  Fee_page_bulk takes it off again and then calles
__free_one_page.

Make free_one_page go directly to __free_one_page.  Saves list on / off and
a temporary list in free_one_page for higher ordered pages.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/page_alloc.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e8a71657ac4..cc648304756 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -448,9 +448,11 @@ static void free_pages_bulk(struct zone *zone, int count,
 
 static void free_one_page(struct zone *zone, struct page *page, int order)
 {
-	LIST_HEAD(list);
-	list_add(&page->lru, &list);
-	free_pages_bulk(zone, 1, &list, order);
+	spin_lock(&zone->lock);
+	zone->all_unreclaimable = 0;
+	zone->pages_scanned = 0;
+	__free_one_page(page, zone ,order);
+	spin_unlock(&zone->lock);
 }
 
 static void __free_pages_ok(struct page *page, unsigned int order)
-- 
cgit v1.2.3


From 39bbcb8f88154c4ac9853baf3f1134af4c987517 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:49 -0700
Subject: [PATCH] mm: do not check unpopulated zones for draining and counter
 updates

If a zone is unpopulated then we do not need to check for pages that are to
be drained and also not for vm counters that may need to be updated.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/page_alloc.c | 5 ++++-
 mm/vmstat.c     | 3 +++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cc648304756..f7ea020c23e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -633,7 +633,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 #ifdef CONFIG_NUMA
 /*
  * Called from the slab reaper to drain pagesets on a particular node that
- * belong to the currently executing processor.
+ * belongs to the currently executing processor.
  * Note that this function must be called with the thread pinned to
  * a single processor.
  */
@@ -647,6 +647,9 @@ void drain_node_pages(int nodeid)
 		struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
 		struct per_cpu_pageset *pset;
 
+		if (!populated_zone(zone))
+			continue;
+
 		pset = zone_pcp(zone, smp_processor_id());
 		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
 			struct per_cpu_pages *pcp;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 04a9093f649..968c0072e19 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -321,6 +321,9 @@ void refresh_cpu_vm_stats(int cpu)
 	for_each_zone(zone) {
 		struct per_cpu_pageset *pcp;
 
+		if (!populated_zone(zone))
+			continue;
+
 		pcp = zone_pcp(zone, cpu);
 
 		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
-- 
cgit v1.2.3


From d00bcc98d7ec2c87391c9d9e1cca519ef64d33ef Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:50 -0700
Subject: [PATCH] Extract the allocpercpu functions from the slab allocator

The allocpercpu functions __alloc_percpu and __free_percpu() are heavily
using the slab allocator.  However, they are conceptually slab.  This also
simplifies SLOB (at this point slob may be broken in mm.  This should fix
it).

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/Makefile      |   2 +-
 mm/allocpercpu.c | 129 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 mm/slab.c        | 124 ----------------------------------------------------
 mm/slob.c        |  45 -------------------
 4 files changed, 130 insertions(+), 170 deletions(-)
 create mode 100644 mm/allocpercpu.c

diff --git a/mm/Makefile b/mm/Makefile
index 9dd824c11ee..60c56c0b5e1 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -23,4 +23,4 @@ obj-$(CONFIG_SLAB) += slab.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
-
+obj-$(CONFIG_SMP) += allocpercpu.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
new file mode 100644
index 00000000000..eaa9abeea53
--- /dev/null
+++ b/mm/allocpercpu.c
@@ -0,0 +1,129 @@
+/*
+ * linux/mm/allocpercpu.c
+ *
+ * Separated from slab.c August 11, 2006 Christoph Lameter <clameter@sgi.com>
+ */
+#include <linux/mm.h>
+#include <linux/module.h>
+
+/**
+ * percpu_depopulate - depopulate per-cpu data for given cpu
+ * @__pdata: per-cpu data to depopulate
+ * @cpu: depopulate per-cpu data for this cpu
+ *
+ * Depopulating per-cpu data for a cpu going offline would be a typical
+ * use case. You need to register a cpu hotplug handler for that purpose.
+ */
+void percpu_depopulate(void *__pdata, int cpu)
+{
+	struct percpu_data *pdata = __percpu_disguise(__pdata);
+	if (pdata->ptrs[cpu]) {
+		kfree(pdata->ptrs[cpu]);
+		pdata->ptrs[cpu] = NULL;
+	}
+}
+EXPORT_SYMBOL_GPL(percpu_depopulate);
+
+/**
+ * percpu_depopulate_mask - depopulate per-cpu data for some cpu's
+ * @__pdata: per-cpu data to depopulate
+ * @mask: depopulate per-cpu data for cpu's selected through mask bits
+ */
+void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask)
+{
+	int cpu;
+	for_each_cpu_mask(cpu, *mask)
+		percpu_depopulate(__pdata, cpu);
+}
+EXPORT_SYMBOL_GPL(__percpu_depopulate_mask);
+
+/**
+ * percpu_populate - populate per-cpu data for given cpu
+ * @__pdata: per-cpu data to populate further
+ * @size: size of per-cpu object
+ * @gfp: may sleep or not etc.
+ * @cpu: populate per-data for this cpu
+ *
+ * Populating per-cpu data for a cpu coming online would be a typical
+ * use case. You need to register a cpu hotplug handler for that purpose.
+ * Per-cpu object is populated with zeroed buffer.
+ */
+void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
+{
+	struct percpu_data *pdata = __percpu_disguise(__pdata);
+	int node = cpu_to_node(cpu);
+
+	BUG_ON(pdata->ptrs[cpu]);
+	if (node_online(node)) {
+		/* FIXME: kzalloc_node(size, gfp, node) */
+		pdata->ptrs[cpu] = kmalloc_node(size, gfp, node);
+		if (pdata->ptrs[cpu])
+			memset(pdata->ptrs[cpu], 0, size);
+	} else
+		pdata->ptrs[cpu] = kzalloc(size, gfp);
+	return pdata->ptrs[cpu];
+}
+EXPORT_SYMBOL_GPL(percpu_populate);
+
+/**
+ * percpu_populate_mask - populate per-cpu data for more cpu's
+ * @__pdata: per-cpu data to populate further
+ * @size: size of per-cpu object
+ * @gfp: may sleep or not etc.
+ * @mask: populate per-cpu data for cpu's selected through mask bits
+ *
+ * Per-cpu objects are populated with zeroed buffers.
+ */
+int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
+			   cpumask_t *mask)
+{
+	cpumask_t populated = CPU_MASK_NONE;
+	int cpu;
+
+	for_each_cpu_mask(cpu, *mask)
+		if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) {
+			__percpu_depopulate_mask(__pdata, &populated);
+			return -ENOMEM;
+		} else
+			cpu_set(cpu, populated);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(__percpu_populate_mask);
+
+/**
+ * percpu_alloc_mask - initial setup of per-cpu data
+ * @size: size of per-cpu object
+ * @gfp: may sleep or not etc.
+ * @mask: populate per-data for cpu's selected through mask bits
+ *
+ * Populating per-cpu data for all online cpu's would be a typical use case,
+ * which is simplified by the percpu_alloc() wrapper.
+ * Per-cpu objects are populated with zeroed buffers.
+ */
+void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask)
+{
+	void *pdata = kzalloc(sizeof(struct percpu_data), gfp);
+	void *__pdata = __percpu_disguise(pdata);
+
+	if (unlikely(!pdata))
+		return NULL;
+	if (likely(!__percpu_populate_mask(__pdata, size, gfp, mask)))
+		return __pdata;
+	kfree(pdata);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(__percpu_alloc_mask);
+
+/**
+ * percpu_free - final cleanup of per-cpu data
+ * @__pdata: object to clean up
+ *
+ * We simply clean up any per-cpu object left. No need for the client to
+ * track and specify through a bis mask which per-cpu objects are to free.
+ */
+void percpu_free(void *__pdata)
+{
+	__percpu_depopulate_mask(__pdata, &cpu_possible_map);
+	kfree(__percpu_disguise(__pdata));
+}
+EXPORT_SYMBOL_GPL(percpu_free);
diff --git a/mm/slab.c b/mm/slab.c
index 619337a5cb2..13b5050f84c 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3440,130 +3440,6 @@ void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller)
 EXPORT_SYMBOL(__kmalloc_track_caller);
 #endif
 
-#ifdef CONFIG_SMP
-/**
- * percpu_depopulate - depopulate per-cpu data for given cpu
- * @__pdata: per-cpu data to depopulate
- * @cpu: depopulate per-cpu data for this cpu
- *
- * Depopulating per-cpu data for a cpu going offline would be a typical
- * use case. You need to register a cpu hotplug handler for that purpose.
- */
-void percpu_depopulate(void *__pdata, int cpu)
-{
-	struct percpu_data *pdata = __percpu_disguise(__pdata);
-	if (pdata->ptrs[cpu]) {
-		kfree(pdata->ptrs[cpu]);
-		pdata->ptrs[cpu] = NULL;
-	}
-}
-EXPORT_SYMBOL_GPL(percpu_depopulate);
-
-/**
- * percpu_depopulate_mask - depopulate per-cpu data for some cpu's
- * @__pdata: per-cpu data to depopulate
- * @mask: depopulate per-cpu data for cpu's selected through mask bits
- */
-void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask)
-{
-	int cpu;
-	for_each_cpu_mask(cpu, *mask)
-		percpu_depopulate(__pdata, cpu);
-}
-EXPORT_SYMBOL_GPL(__percpu_depopulate_mask);
-
-/**
- * percpu_populate - populate per-cpu data for given cpu
- * @__pdata: per-cpu data to populate further
- * @size: size of per-cpu object
- * @gfp: may sleep or not etc.
- * @cpu: populate per-data for this cpu
- *
- * Populating per-cpu data for a cpu coming online would be a typical
- * use case. You need to register a cpu hotplug handler for that purpose.
- * Per-cpu object is populated with zeroed buffer.
- */
-void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
-{
-	struct percpu_data *pdata = __percpu_disguise(__pdata);
-	int node = cpu_to_node(cpu);
-
-	BUG_ON(pdata->ptrs[cpu]);
-	if (node_online(node)) {
-		/* FIXME: kzalloc_node(size, gfp, node) */
-		pdata->ptrs[cpu] = kmalloc_node(size, gfp, node);
-		if (pdata->ptrs[cpu])
-			memset(pdata->ptrs[cpu], 0, size);
-	} else
-		pdata->ptrs[cpu] = kzalloc(size, gfp);
-	return pdata->ptrs[cpu];
-}
-EXPORT_SYMBOL_GPL(percpu_populate);
-
-/**
- * percpu_populate_mask - populate per-cpu data for more cpu's
- * @__pdata: per-cpu data to populate further
- * @size: size of per-cpu object
- * @gfp: may sleep or not etc.
- * @mask: populate per-cpu data for cpu's selected through mask bits
- *
- * Per-cpu objects are populated with zeroed buffers.
- */
-int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
-			   cpumask_t *mask)
-{
-	cpumask_t populated = CPU_MASK_NONE;
-	int cpu;
-
-	for_each_cpu_mask(cpu, *mask)
-		if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) {
-			__percpu_depopulate_mask(__pdata, &populated);
-			return -ENOMEM;
-		} else
-			cpu_set(cpu, populated);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(__percpu_populate_mask);
-
-/**
- * percpu_alloc_mask - initial setup of per-cpu data
- * @size: size of per-cpu object
- * @gfp: may sleep or not etc.
- * @mask: populate per-data for cpu's selected through mask bits
- *
- * Populating per-cpu data for all online cpu's would be a typical use case,
- * which is simplified by the percpu_alloc() wrapper.
- * Per-cpu objects are populated with zeroed buffers.
- */
-void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask)
-{
-	void *pdata = kzalloc(sizeof(struct percpu_data), gfp);
-	void *__pdata = __percpu_disguise(pdata);
-
-	if (unlikely(!pdata))
-		return NULL;
-	if (likely(!__percpu_populate_mask(__pdata, size, gfp, mask)))
-		return __pdata;
-	kfree(pdata);
-	return NULL;
-}
-EXPORT_SYMBOL_GPL(__percpu_alloc_mask);
-
-/**
- * percpu_free - final cleanup of per-cpu data
- * @__pdata: object to clean up
- *
- * We simply clean up any per-cpu object left. No need for the client to
- * track and specify through a bis mask which per-cpu objects are to free.
- */
-void percpu_free(void *__pdata)
-{
-	__percpu_depopulate_mask(__pdata, &cpu_possible_map);
-	kfree(__percpu_disguise(__pdata));
-}
-EXPORT_SYMBOL_GPL(percpu_free);
-#endif	/* CONFIG_SMP */
-
 /**
  * kmem_cache_free - Deallocate an object
  * @cachep: The cache the allocation was from.
diff --git a/mm/slob.c b/mm/slob.c
index 7b52b20b960..4c28a421b27 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -343,48 +343,3 @@ void kmem_cache_init(void)
 atomic_t slab_reclaim_pages = ATOMIC_INIT(0);
 EXPORT_SYMBOL(slab_reclaim_pages);
 
-#ifdef CONFIG_SMP
-
-void *__alloc_percpu(size_t size)
-{
-	int i;
-	struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL);
-
-	if (!pdata)
-		return NULL;
-
-	for_each_possible_cpu(i) {
-		pdata->ptrs[i] = kmalloc(size, GFP_KERNEL);
-		if (!pdata->ptrs[i])
-			goto unwind_oom;
-		memset(pdata->ptrs[i], 0, size);
-	}
-
-	/* Catch derefs w/o wrappers */
-	return (void *) (~(unsigned long) pdata);
-
-unwind_oom:
-	while (--i >= 0) {
-		if (!cpu_possible(i))
-			continue;
-		kfree(pdata->ptrs[i]);
-	}
-	kfree(pdata);
-	return NULL;
-}
-EXPORT_SYMBOL(__alloc_percpu);
-
-void
-free_percpu(const void *objp)
-{
-	int i;
-	struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp);
-
-	for_each_possible_cpu(i)
-		kfree(p->ptrs[i]);
-
-	kfree(p);
-}
-EXPORT_SYMBOL(free_percpu);
-
-#endif
-- 
cgit v1.2.3


From 8417bba4b151346ed475fcc923693c9e3be89063 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:51 -0700
Subject: [PATCH] Replace min_unmapped_ratio by min_unmapped_pages in struct
 zone

*_pages is a better description of the role of the variable.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h | 2 +-
 mm/page_alloc.c        | 4 ++--
 mm/vmscan.c            | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 7fe317164b7..a703527e2b4 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -169,7 +169,7 @@ struct zone {
 	/*
 	 * zone reclaim becomes active if more unmapped pages exist.
 	 */
-	unsigned long		min_unmapped_ratio;
+	unsigned long		min_unmapped_pages;
 	struct per_cpu_pageset	*pageset[NR_CPUS];
 #else
 	struct per_cpu_pageset	pageset[NR_CPUS];
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f7ea020c23e..5da6bc4e0a6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2002,7 +2002,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
 		zone->spanned_pages = size;
 		zone->present_pages = realsize;
 #ifdef CONFIG_NUMA
-		zone->min_unmapped_ratio = (realsize*sysctl_min_unmapped_ratio)
+		zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
 						/ 100;
 #endif
 		zone->name = zone_names[j];
@@ -2313,7 +2313,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
 		return rc;
 
 	for_each_zone(zone)
-		zone->min_unmapped_ratio = (zone->present_pages *
+		zone->min_unmapped_pages = (zone->present_pages *
 				sysctl_min_unmapped_ratio) / 100;
 	return 0;
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8f35d7d585c..5154c25e844 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1618,7 +1618,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	 * unmapped file backed pages.
 	 */
 	if (zone_page_state(zone, NR_FILE_PAGES) -
-	    zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_ratio)
+	    zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages)
 		return 0;
 
 	/*
-- 
cgit v1.2.3


From 972d1a7b140569084439a81265a0f15b74e924e0 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:51 -0700
Subject: [PATCH] ZVC: Support NR_SLAB_RECLAIMABLE / NR_SLAB_UNRECLAIMABLE

Remove the atomic counter for slab_reclaim_pages and replace the counter
and NR_SLAB with two ZVC counter that account for unreclaimable and
reclaimable slab pages: NR_SLAB_RECLAIMABLE and NR_SLAB_UNRECLAIMABLE.

Change the check in vmscan.c to refer to to NR_SLAB_RECLAIMABLE.  The
intend seems to be to check for slab pages that could be freed.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/mm/pgtable.c |  4 +++-
 drivers/base/node.c    |  9 +++++++--
 fs/proc/proc_misc.c    |  7 ++++++-
 include/linux/mmzone.h |  3 ++-
 include/linux/slab.h   |  2 --
 mm/mmap.c              |  2 +-
 mm/nommu.c             |  2 +-
 mm/page_alloc.c        |  3 ++-
 mm/slab.c              | 24 +++++++++++-------------
 mm/slob.c              |  4 ----
 mm/vmscan.c            |  2 +-
 mm/vmstat.c            |  3 ++-
 12 files changed, 36 insertions(+), 29 deletions(-)

diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c
index bd98768d876..a9f4910a22f 100644
--- a/arch/i386/mm/pgtable.c
+++ b/arch/i386/mm/pgtable.c
@@ -60,7 +60,9 @@ void show_mem(void)
 	printk(KERN_INFO "%lu pages writeback\n",
 					global_page_state(NR_WRITEBACK));
 	printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
-	printk(KERN_INFO "%lu pages slab\n", global_page_state(NR_SLAB));
+	printk(KERN_INFO "%lu pages slab\n",
+		global_page_state(NR_SLAB_RECLAIMABLE) +
+		global_page_state(NR_SLAB_UNRECLAIMABLE));
 	printk(KERN_INFO "%lu pages pagetables\n",
 					global_page_state(NR_PAGETABLE));
 }
diff --git a/drivers/base/node.c b/drivers/base/node.c
index e09f5c2c11e..001e6f6b9c1 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -68,7 +68,9 @@ static ssize_t node_read_meminfo(struct sys_device * dev, char * buf)
 		       "Node %d PageTables:   %8lu kB\n"
 		       "Node %d NFS_Unstable: %8lu kB\n"
 		       "Node %d Bounce:       %8lu kB\n"
-		       "Node %d Slab:         %8lu kB\n",
+		       "Node %d Slab:         %8lu kB\n"
+		       "Node %d SReclaimable: %8lu kB\n"
+		       "Node %d SUnreclaim:   %8lu kB\n",
 		       nid, K(i.totalram),
 		       nid, K(i.freeram),
 		       nid, K(i.totalram - i.freeram),
@@ -88,7 +90,10 @@ static ssize_t node_read_meminfo(struct sys_device * dev, char * buf)
 		       nid, K(node_page_state(nid, NR_PAGETABLE)),
 		       nid, K(node_page_state(nid, NR_UNSTABLE_NFS)),
 		       nid, K(node_page_state(nid, NR_BOUNCE)),
-		       nid, K(node_page_state(nid, NR_SLAB)));
+		       nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE) +
+				node_page_state(nid, NR_SLAB_UNRECLAIMABLE)),
+		       nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE)),
+		       nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE)));
 	n += hugetlb_report_node_meminfo(nid, buf + n);
 	return n;
 }
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index caa0a51560a..5bbd6089605 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -170,6 +170,8 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 		"AnonPages:    %8lu kB\n"
 		"Mapped:       %8lu kB\n"
 		"Slab:         %8lu kB\n"
+		"SReclaimable: %8lu kB\n"
+		"SUnreclaim:   %8lu kB\n"
 		"PageTables:   %8lu kB\n"
 		"NFS_Unstable: %8lu kB\n"
 		"Bounce:       %8lu kB\n"
@@ -197,7 +199,10 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 		K(global_page_state(NR_WRITEBACK)),
 		K(global_page_state(NR_ANON_PAGES)),
 		K(global_page_state(NR_FILE_MAPPED)),
-		K(global_page_state(NR_SLAB)),
+		K(global_page_state(NR_SLAB_RECLAIMABLE) +
+				global_page_state(NR_SLAB_UNRECLAIMABLE)),
+		K(global_page_state(NR_SLAB_RECLAIMABLE)),
+		K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
 		K(global_page_state(NR_PAGETABLE)),
 		K(global_page_state(NR_UNSTABLE_NFS)),
 		K(global_page_state(NR_BOUNCE)),
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index a703527e2b4..08c41b9f92e 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -51,7 +51,8 @@ enum zone_stat_item {
 	NR_FILE_MAPPED,	/* pagecache pages mapped into pagetables.
 			   only modified from process context */
 	NR_FILE_PAGES,
-	NR_SLAB,	/* Pages used by slab allocator */
+	NR_SLAB_RECLAIMABLE,
+	NR_SLAB_UNRECLAIMABLE,
 	NR_PAGETABLE,	/* used for pagetables */
 	NR_FILE_DIRTY,
 	NR_WRITEBACK,
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 2f6bef6a98c..66d6eb78d1c 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -284,8 +284,6 @@ extern kmem_cache_t	*fs_cachep;
 extern kmem_cache_t	*sighand_cachep;
 extern kmem_cache_t	*bio_cachep;
 
-extern atomic_t slab_reclaim_pages;
-
 #endif	/* __KERNEL__ */
 
 #endif	/* _LINUX_SLAB_H */
diff --git a/mm/mmap.c b/mm/mmap.c
index 8507ee9cd57..eea8eefd51a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -116,7 +116,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin)
 		 * which are reclaimable, under pressure.  The dentry
 		 * cache and most inode caches should fall into this
 		 */
-		free += atomic_read(&slab_reclaim_pages);
+		free += global_page_state(NR_SLAB_RECLAIMABLE);
 
 		/*
 		 * Leave the last 3% for root
diff --git a/mm/nommu.c b/mm/nommu.c
index c576df71e3b..d99dea31e44 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1133,7 +1133,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin)
 		 * which are reclaimable, under pressure.  The dentry
 		 * cache and most inode caches should fall into this
 		 */
-		free += atomic_read(&slab_reclaim_pages);
+		free += global_page_state(NR_SLAB_RECLAIMABLE);
 
 		/*
 		 * Leave the last 3% for root
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5da6bc4e0a6..47e98423b30 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1304,7 +1304,8 @@ void show_free_areas(void)
 		global_page_state(NR_WRITEBACK),
 		global_page_state(NR_UNSTABLE_NFS),
 		nr_free_pages(),
-		global_page_state(NR_SLAB),
+		global_page_state(NR_SLAB_RECLAIMABLE) +
+			global_page_state(NR_SLAB_UNRECLAIMABLE),
 		global_page_state(NR_FILE_MAPPED),
 		global_page_state(NR_PAGETABLE));
 
diff --git a/mm/slab.c b/mm/slab.c
index 13b5050f84c..7a48eb1a60c 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -735,14 +735,6 @@ static inline void init_lock_keys(void)
 static DEFINE_MUTEX(cache_chain_mutex);
 static struct list_head cache_chain;
 
-/*
- * vm_enough_memory() looks at this to determine how many slab-allocated pages
- * are possibly freeable under pressure
- *
- * SLAB_RECLAIM_ACCOUNT turns this on per-slab
- */
-atomic_t slab_reclaim_pages;
-
 /*
  * chicken and egg problem: delay the per-cpu array allocation
  * until the general caches are up.
@@ -1580,8 +1572,11 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 
 	nr_pages = (1 << cachep->gfporder);
 	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
-		atomic_add(nr_pages, &slab_reclaim_pages);
-	add_zone_page_state(page_zone(page), NR_SLAB, nr_pages);
+		add_zone_page_state(page_zone(page),
+			NR_SLAB_RECLAIMABLE, nr_pages);
+	else
+		add_zone_page_state(page_zone(page),
+			NR_SLAB_UNRECLAIMABLE, nr_pages);
 	for (i = 0; i < nr_pages; i++)
 		__SetPageSlab(page + i);
 	return page_address(page);
@@ -1596,7 +1591,12 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
 	struct page *page = virt_to_page(addr);
 	const unsigned long nr_freed = i;
 
-	sub_zone_page_state(page_zone(page), NR_SLAB, nr_freed);
+	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
+		sub_zone_page_state(page_zone(page),
+				NR_SLAB_RECLAIMABLE, nr_freed);
+	else
+		sub_zone_page_state(page_zone(page),
+				NR_SLAB_UNRECLAIMABLE, nr_freed);
 	while (i--) {
 		BUG_ON(!PageSlab(page));
 		__ClearPageSlab(page);
@@ -1605,8 +1605,6 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
 	if (current->reclaim_state)
 		current->reclaim_state->reclaimed_slab += nr_freed;
 	free_pages((unsigned long)addr, cachep->gfporder);
-	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
-		atomic_sub(1 << cachep->gfporder, &slab_reclaim_pages);
 }
 
 static void kmem_rcu_free(struct rcu_head *head)
diff --git a/mm/slob.c b/mm/slob.c
index 4c28a421b27..20188627347 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -339,7 +339,3 @@ void kmem_cache_init(void)
 
 	mod_timer(&slob_timer, jiffies + HZ);
 }
-
-atomic_t slab_reclaim_pages = ATOMIC_INIT(0);
-EXPORT_SYMBOL(slab_reclaim_pages);
-
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5154c25e844..349797ba4ba 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1378,7 +1378,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 	for_each_zone(zone)
 		lru_pages += zone->nr_active + zone->nr_inactive;
 
-	nr_slab = global_page_state(NR_SLAB);
+	nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
 	/* If slab caches are huge, it's better to hit them first */
 	while (nr_slab >= lru_pages) {
 		reclaim_state.reclaimed_slab = 0;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 968c0072e19..490d8c1a0de 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -458,7 +458,8 @@ static char *vmstat_text[] = {
 	"nr_anon_pages",
 	"nr_mapped",
 	"nr_file_pages",
-	"nr_slab",
+	"nr_slab_reclaimable",
+	"nr_slab_unreclaimable",
 	"nr_page_table_pages",
 	"nr_dirty",
 	"nr_writeback",
-- 
cgit v1.2.3


From 0ff38490c836dc379ff7ec45b10a15a662f4e5f6 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:52 -0700
Subject: [PATCH] zone_reclaim: dynamic slab reclaim

Currently one can enable slab reclaim by setting an explicit option in
/proc/sys/vm/zone_reclaim_mode.  Slab reclaim is then used as a final
option if the freeing of unmapped file backed pages is not enough to free
enough pages to allow a local allocation.

However, that means that the slab can grow excessively and that most memory
of a node may be used by slabs.  We have had a case where a machine with
46GB of memory was using 40-42GB for slab.  Zone reclaim was effective in
dealing with pagecache pages.  However, slab reclaim was only done during
global reclaim (which is a bit rare on NUMA systems).

This patch implements slab reclaim during zone reclaim.  Zone reclaim
occurs if there is a danger of an off node allocation.  At that point we

1. Shrink the per node page cache if the number of pagecache
   pages is more than min_unmapped_ratio percent of pages in a zone.

2. Shrink the slab cache if the number of the nodes reclaimable slab pages
   (patch depends on earlier one that implements that counter)
   are more than min_slab_ratio (a new /proc/sys/vm tunable).

The shrinking of the slab cache is a bit problematic since it is not node
specific.  So we simply calculate what point in the slab we want to reach
(current per node slab use minus the number of pages that neeed to be
allocated) and then repeately run the global reclaim until that is
unsuccessful or we have reached the limit.  I hope we will have zone based
slab reclaim at some point which will make that easier.

The default for the min_slab_ratio is 5%

Also remove the slab option from /proc/sys/vm/zone_reclaim_mode.

[akpm@osdl.org: cleanups]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/sysctl/vm.txt | 27 +++++++++++++++------
 include/linux/mmzone.h      |  3 +++
 include/linux/swap.h        |  1 +
 include/linux/sysctl.h      |  1 +
 kernel/sysctl.c             | 11 +++++++++
 mm/page_alloc.c             | 17 +++++++++++++
 mm/vmscan.c                 | 58 +++++++++++++++++++++++++++++----------------
 7 files changed, 90 insertions(+), 28 deletions(-)

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 7cee90223d3..20d0d797f53 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -29,6 +29,7 @@ Currently, these files are in /proc/sys/vm:
 - drop-caches
 - zone_reclaim_mode
 - min_unmapped_ratio
+- min_slab_ratio
 - panic_on_oom
 
 ==============================================================
@@ -138,7 +139,6 @@ This is value ORed together of
 1	= Zone reclaim on
 2	= Zone reclaim writes dirty pages out
 4	= Zone reclaim swaps pages
-8	= Also do a global slab reclaim pass
 
 zone_reclaim_mode is set during bootup to 1 if it is determined that pages
 from remote zones will cause a measurable performance reduction. The
@@ -162,18 +162,13 @@ Allowing regular swap effectively restricts allocations to the local
 node unless explicitly overridden by memory policies or cpuset
 configurations.
 
-It may be advisable to allow slab reclaim if the system makes heavy
-use of files and builds up large slab caches. However, the slab
-shrink operation is global, may take a long time and free slabs
-in all nodes of the system.
-
 =============================================================
 
 min_unmapped_ratio:
 
 This is available only on NUMA kernels.
 
-A percentage of the file backed pages in each zone.  Zone reclaim will only
+A percentage of the total pages in each zone.  Zone reclaim will only
 occur if more than this percentage of pages are file backed and unmapped.
 This is to insure that a minimal amount of local pages is still available for
 file I/O even if the node is overallocated.
@@ -182,6 +177,24 @@ The default is 1 percent.
 
 =============================================================
 
+min_slab_ratio:
+
+This is available only on NUMA kernels.
+
+A percentage of the total pages in each zone.  On Zone reclaim
+(fallback from the local zone occurs) slabs will be reclaimed if more
+than this percentage of pages in a zone are reclaimable slab pages.
+This insures that the slab growth stays under control even in NUMA
+systems that rarely perform global reclaim.
+
+The default is 5 percent.
+
+Note that slab reclaim is triggered in a per zone / node fashion.
+The process of reclaiming slab memory is currently not node specific
+and may not be fast.
+
+=============================================================
+
 panic_on_oom
 
 This enables or disables panic on out-of-memory feature.  If this is set to 1,
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 08c41b9f92e..3693f1a5278 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -171,6 +171,7 @@ struct zone {
 	 * zone reclaim becomes active if more unmapped pages exist.
 	 */
 	unsigned long		min_unmapped_pages;
+	unsigned long		min_slab_pages;
 	struct per_cpu_pageset	*pageset[NR_CPUS];
 #else
 	struct per_cpu_pageset	pageset[NR_CPUS];
@@ -448,6 +449,8 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, struct file
 					void __user *, size_t *, loff_t *);
 int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int,
 			struct file *, void __user *, size_t *, loff_t *);
+int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
+			struct file *, void __user *, size_t *, loff_t *);
 
 #include <linux/topology.h>
 /* Returns the number of the current Node. */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 32db06c8ffe..a2f5ad7c2d2 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -193,6 +193,7 @@ extern long vm_total_pages;
 #ifdef CONFIG_NUMA
 extern int zone_reclaim_mode;
 extern int sysctl_min_unmapped_ratio;
+extern int sysctl_min_slab_ratio;
 extern int zone_reclaim(struct zone *, gfp_t, unsigned int);
 #else
 #define zone_reclaim_mode 0
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 736ed917a4f..eca555781d0 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -191,6 +191,7 @@ enum
 	VM_MIN_UNMAPPED=32,	/* Set min percent of unmapped pages */
 	VM_PANIC_ON_OOM=33,	/* panic at out-of-memory */
 	VM_VDSO_ENABLED=34,	/* map VDSO into new processes? */
+	VM_MIN_SLAB=35,		 /* Percent pages ignored by zone reclaim */
 };
 
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 362a0cc3713..fd43c3e6786 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -943,6 +943,17 @@ static ctl_table vm_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one_hundred,
 	},
+	{
+		.ctl_name	= VM_MIN_SLAB,
+		.procname	= "min_slab_ratio",
+		.data		= &sysctl_min_slab_ratio,
+		.maxlen		= sizeof(sysctl_min_slab_ratio),
+		.mode		= 0644,
+		.proc_handler	= &sysctl_min_slab_ratio_sysctl_handler,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+		.extra2		= &one_hundred,
+	},
 #endif
 #ifdef CONFIG_X86_32
 	{
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 47e98423b30..cf913bdd433 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2005,6 +2005,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
 #ifdef CONFIG_NUMA
 		zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
 						/ 100;
+		zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
 #endif
 		zone->name = zone_names[j];
 		spin_lock_init(&zone->lock);
@@ -2318,6 +2319,22 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
 				sysctl_min_unmapped_ratio) / 100;
 	return 0;
 }
+
+int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
+	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+	struct zone *zone;
+	int rc;
+
+	rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+	if (rc)
+		return rc;
+
+	for_each_zone(zone)
+		zone->min_slab_pages = (zone->present_pages *
+				sysctl_min_slab_ratio) / 100;
+	return 0;
+}
 #endif
 
 /*
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 349797ba4ba..089e943c4d3 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1527,7 +1527,6 @@ int zone_reclaim_mode __read_mostly;
 #define RECLAIM_ZONE (1<<0)	/* Run shrink_cache on the zone */
 #define RECLAIM_WRITE (1<<1)	/* Writeout pages during reclaim */
 #define RECLAIM_SWAP (1<<2)	/* Swap pages out during reclaim */
-#define RECLAIM_SLAB (1<<3)	/* Do a global slab shrink if the zone is out of memory */
 
 /*
  * Priority for ZONE_RECLAIM. This determines the fraction of pages
@@ -1542,6 +1541,12 @@ int zone_reclaim_mode __read_mostly;
  */
 int sysctl_min_unmapped_ratio = 1;
 
+/*
+ * If the number of slab pages in a zone grows beyond this percentage then
+ * slab reclaim needs to occur.
+ */
+int sysctl_min_slab_ratio = 5;
+
 /*
  * Try to free up some pages from this zone through reclaim.
  */
@@ -1573,29 +1578,37 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
-	/*
-	 * Free memory by calling shrink zone with increasing priorities
-	 * until we have enough memory freed.
-	 */
-	priority = ZONE_RECLAIM_PRIORITY;
-	do {
-		nr_reclaimed += shrink_zone(priority, zone, &sc);
-		priority--;
-	} while (priority >= 0 && nr_reclaimed < nr_pages);
+	if (zone_page_state(zone, NR_FILE_PAGES) -
+		zone_page_state(zone, NR_FILE_MAPPED) >
+		zone->min_unmapped_pages) {
+		/*
+		 * Free memory by calling shrink zone with increasing
+		 * priorities until we have enough memory freed.
+		 */
+		priority = ZONE_RECLAIM_PRIORITY;
+		do {
+			nr_reclaimed += shrink_zone(priority, zone, &sc);
+			priority--;
+		} while (priority >= 0 && nr_reclaimed < nr_pages);
+	}
 
-	if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
+	if (zone_page_state(zone, NR_SLAB_RECLAIMABLE) > zone->min_slab_pages) {
 		/*
 		 * shrink_slab() does not currently allow us to determine how
-		 * many pages were freed in this zone. So we just shake the slab
-		 * a bit and then go off node for this particular allocation
-		 * despite possibly having freed enough memory to allocate in
-		 * this zone.  If we freed local memory then the next
-		 * allocations will be local again.
+		 * many pages were freed in this zone. So we take the current
+		 * number of slab pages and shake the slab until it is reduced
+		 * by the same nr_pages that we used for reclaiming unmapped
+		 * pages.
 		 *
-		 * shrink_slab will free memory on all zones and may take
-		 * a long time.
+		 * Note that shrink_slab will free memory on all zones and may
+		 * take a long time.
 		 */
-		shrink_slab(sc.nr_scanned, gfp_mask, order);
+		unsigned long limit = zone_page_state(zone,
+				NR_SLAB_RECLAIMABLE) - nr_pages;
+
+		while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
+			zone_page_state(zone, NR_SLAB_RECLAIMABLE) > limit)
+			;
 	}
 
 	p->reclaim_state = NULL;
@@ -1609,7 +1622,8 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	int node_id;
 
 	/*
-	 * Zone reclaim reclaims unmapped file backed pages.
+	 * Zone reclaim reclaims unmapped file backed pages and
+	 * slab pages if we are over the defined limits.
 	 *
 	 * A small portion of unmapped file backed pages is needed for
 	 * file I/O otherwise pages read by file I/O will be immediately
@@ -1618,7 +1632,9 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	 * unmapped file backed pages.
 	 */
 	if (zone_page_state(zone, NR_FILE_PAGES) -
-	    zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages)
+	    zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages
+	    && zone_page_state(zone, NR_SLAB_RECLAIMABLE)
+			<= zone->min_slab_pages)
 		return 0;
 
 	/*
-- 
cgit v1.2.3


From 83e33a4711760469f5c3861b8ffea4947656d4eb Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:53 -0700
Subject: [PATCH] zone reclaim with slab: avoid unecessary off node allocations

Minor performance fix.

If we reclaimed enough slab pages from a zone then we can avoid going off
node with the current allocation.  Take care of updating nr_reclaimed when
reclaiming from the slab.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/vmscan.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 089e943c4d3..b950f193816 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1566,6 +1566,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		.gfp_mask = gfp_mask,
 		.swappiness = vm_swappiness,
 	};
+	unsigned long slab_reclaimable;
 
 	disable_swap_token();
 	cond_resched();
@@ -1592,7 +1593,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		} while (priority >= 0 && nr_reclaimed < nr_pages);
 	}
 
-	if (zone_page_state(zone, NR_SLAB_RECLAIMABLE) > zone->min_slab_pages) {
+	slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+	if (slab_reclaimable > zone->min_slab_pages) {
 		/*
 		 * shrink_slab() does not currently allow us to determine how
 		 * many pages were freed in this zone. So we take the current
@@ -1603,12 +1605,17 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		 * Note that shrink_slab will free memory on all zones and may
 		 * take a long time.
 		 */
-		unsigned long limit = zone_page_state(zone,
-				NR_SLAB_RECLAIMABLE) - nr_pages;
-
 		while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
-			zone_page_state(zone, NR_SLAB_RECLAIMABLE) > limit)
+			zone_page_state(zone, NR_SLAB_RECLAIMABLE) >
+				slab_reclaimable - nr_pages)
 			;
+
+		/*
+		 * Update nr_reclaimed by the number of slab pages we
+		 * reclaimed from this zone.
+		 */
+		nr_reclaimed += slab_reclaimable -
+			zone_page_state(zone, NR_SLAB_RECLAIMABLE);
 	}
 
 	p->reclaim_state = NULL;
-- 
cgit v1.2.3


From 5a291b98b2116d669449885abef3000f747504b3 Mon Sep 17 00:00:00 2001
From: Ram Gupta <ram.gupta5@gmail.com>
Date: Mon, 25 Sep 2006 23:31:54 -0700
Subject: [PATCH] oom-kill: update comments to reflect current code

Update the comments for __oom_kill_task() to reflect the code changes.

Signed-off-by: Ram Gupta <r.gupta@astronautics.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/oom_kill.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index c5e38400058..f1c0ef1fd21 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -250,9 +250,9 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
 }
 
 /**
- * We must be careful though to never send SIGKILL a process with
- * CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that
- * we select a process with CAP_SYS_RAW_IO set).
+ * Send SIGKILL to the selected  process irrespective of  CAP_SYS_RAW_IO
+ * flag though it's unlikely that  we select a process with CAP_SYS_RAW_IO
+ * set.
  */
 static void __oom_kill_task(struct task_struct *p, const char *message)
 {
-- 
cgit v1.2.3


From 4415cc8df630b05d3a54267d5f3e5c0b63a4ec05 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:55 -0700
Subject: [PATCH] Hugepages: Use page_to_nid rather than traversing zone
 pointers

I found two location in hugetlb.c where we chase pointer instead of using
page_to_nid().  Page_to_nid is more effective and can get the node directly
from page flags.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/hugetlb.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index df499973255..3aceadce1a7 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -177,7 +177,7 @@ static void update_and_free_page(struct page *page)
 {
 	int i;
 	nr_huge_pages--;
-	nr_huge_pages_node[page_zone(page)->zone_pgdat->node_id]--;
+	nr_huge_pages_node[page_to_nid(page)]--;
 	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
 		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
 				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
@@ -191,7 +191,8 @@ static void update_and_free_page(struct page *page)
 #ifdef CONFIG_HIGHMEM
 static void try_to_free_low(unsigned long count)
 {
-	int i, nid;
+	int i;
+
 	for (i = 0; i < MAX_NUMNODES; ++i) {
 		struct page *page, *next;
 		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
@@ -199,9 +200,8 @@ static void try_to_free_low(unsigned long count)
 				continue;
 			list_del(&page->lru);
 			update_and_free_page(page);
-			nid = page_zone(page)->zone_pgdat->node_id;
 			free_huge_pages--;
-			free_huge_pages_node[nid]--;
+			free_huge_pages_node[page_to_nid(page)]--;
 			if (count >= nr_huge_pages)
 				return;
 		}
-- 
cgit v1.2.3


From 89fa30242facca249aead2aac03c4c69764f911c Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Mon, 25 Sep 2006 23:31:55 -0700
Subject: [PATCH] NUMA: Add zone_to_nid function

There are many places where we need to determine the node of a zone.
Currently we use a difficult to read sequence of pointer dereferencing.
Put that into an inline function and use throughout VM.  Maybe we can find
a way to optimize the lookup in the future.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/mm/discontig.c | 2 +-
 arch/parisc/mm/init.c    | 2 +-
 include/linux/mm.h       | 7 ++++++-
 kernel/cpuset.c          | 4 ++--
 mm/hugetlb.c             | 2 +-
 mm/mempolicy.c           | 6 +++---
 mm/oom_kill.c            | 3 +--
 mm/page_alloc.c          | 2 +-
 mm/vmscan.c              | 2 +-
 9 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/arch/i386/mm/discontig.c b/arch/i386/mm/discontig.c
index 07c300f9376..fb5d8b747de 100644
--- a/arch/i386/mm/discontig.c
+++ b/arch/i386/mm/discontig.c
@@ -422,7 +422,7 @@ void __init set_highmem_pages_init(int bad_ppro)
 		zone_end_pfn = zone_start_pfn + zone->spanned_pages;
 
 		printk("Initializing %s for node %d (%08lx:%08lx)\n",
-				zone->name, zone->zone_pgdat->node_id,
+				zone->name, zone_to_nid(zone),
 				zone_start_pfn, zone_end_pfn);
 
 		for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) {
diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c
index c7329615ef9..25ad28d63e8 100644
--- a/arch/parisc/mm/init.c
+++ b/arch/parisc/mm/init.c
@@ -551,7 +551,7 @@ void show_mem(void)
 
 				printk("Zone list for zone %d on node %d: ", j, i);
 				for (k = 0; zl->zones[k] != NULL; k++) 
-					printk("[%d/%s] ", zl->zones[k]->zone_pgdat->node_id, zl->zones[k]->name);
+					printk("[%d/%s] ", zone_to_nid(zl->zones[k]), zl->zones[k]->name);
 				printk("\n");
 			}
 		}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f2018775b99..856f0ee7e84 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -499,12 +499,17 @@ static inline struct zone *page_zone(struct page *page)
 	return zone_table[page_zone_id(page)];
 }
 
+static inline unsigned long zone_to_nid(struct zone *zone)
+{
+	return zone->zone_pgdat->node_id;
+}
+
 static inline unsigned long page_to_nid(struct page *page)
 {
 	if (FLAGS_HAS_NODE)
 		return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
 	else
-		return page_zone(page)->zone_pgdat->node_id;
+		return zone_to_nid(page_zone(page));
 }
 static inline unsigned long page_to_section(struct page *page)
 {
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 76940361273..cff41511269 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2245,7 +2245,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
 	int i;
 
 	for (i = 0; zl->zones[i]; i++) {
-		int nid = zl->zones[i]->zone_pgdat->node_id;
+		int nid = zone_to_nid(zl->zones[i]);
 
 		if (node_isset(nid, current->mems_allowed))
 			return 1;
@@ -2318,7 +2318,7 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 
 	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
 		return 1;
-	node = z->zone_pgdat->node_id;
+	node = zone_to_nid(z);
 	might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
 	if (node_isset(node, current->mems_allowed))
 		return 1;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3aceadce1a7..7c7d03dbf73 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -72,7 +72,7 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
 	struct zone **z;
 
 	for (z = zonelist->zones; *z; z++) {
-		nid = (*z)->zone_pgdat->node_id;
+		nid = zone_to_nid(*z);
 		if (cpuset_zone_allowed(*z, GFP_HIGHUSER) &&
 		    !list_empty(&hugepage_freelists[nid]))
 			break;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 8002e1faccd..38f89650bc8 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -487,7 +487,7 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 	switch (p->policy) {
 	case MPOL_BIND:
 		for (i = 0; p->v.zonelist->zones[i]; i++)
-			node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
+			node_set(zone_to_nid(p->v.zonelist->zones[i]),
 				*nodes);
 		break;
 	case MPOL_DEFAULT:
@@ -1145,7 +1145,7 @@ unsigned slab_node(struct mempolicy *policy)
 		 * Follow bind policy behavior and start allocation at the
 		 * first node.
 		 */
-		return policy->v.zonelist->zones[0]->zone_pgdat->node_id;
+		return zone_to_nid(policy->v.zonelist->zones[0]);
 
 	case MPOL_PREFERRED:
 		if (policy->v.preferred_node >= 0)
@@ -1649,7 +1649,7 @@ void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
 
 		nodes_clear(nodes);
 		for (z = pol->v.zonelist->zones; *z; z++)
-			node_set((*z)->zone_pgdat->node_id, nodes);
+			node_set(zone_to_nid(*z), nodes);
 		nodes_remap(tmp, nodes, *mpolmask, *newmask);
 		nodes = tmp;
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index f1c0ef1fd21..bada3d03119 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -177,8 +177,7 @@ static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask)
 
 	for (z = zonelist->zones; *z; z++)
 		if (cpuset_zone_allowed(*z, gfp_mask))
-			node_clear((*z)->zone_pgdat->node_id,
-					nodes);
+			node_clear(zone_to_nid(*z), nodes);
 		else
 			return CONSTRAINT_CPUSET;
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cf913bdd433..51070b6d593 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1217,7 +1217,7 @@ unsigned int nr_free_pagecache_pages(void)
 #ifdef CONFIG_NUMA
 static void show_node(struct zone *zone)
 {
-	printk("Node %d ", zone->zone_pgdat->node_id);
+	printk("Node %ld ", zone_to_nid(zone));
 }
 #else
 #define show_node(zone)	do { } while (0)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b950f193816..87779dda4ec 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1661,7 +1661,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	 * over remote processors and spread off node memory allocations
 	 * as wide as possible.
 	 */
-	node_id = zone->zone_pgdat->node_id;
+	node_id = zone_to_nid(zone);
 	mask = node_to_cpumask(node_id);
 	if (!cpus_empty(mask) && node_id != numa_node_id())
 		return 0;
-- 
cgit v1.2.3


From 62bac0185ad3dfef11d9602980445c54d45199c6 Mon Sep 17 00:00:00 2001
From: Stephen Smalley <sds@tycho.nsa.gov>
Date: Mon, 25 Sep 2006 23:31:56 -0700
Subject: [PATCH] selinux: eliminate selinux_task_ctxid

Eliminate selinux_task_ctxid since it duplicates selinux_task_get_sid.

Signed-off-by: Stephen Smalley <sds@tycho.nsa.gov>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/selinux.h    | 15 ---------------
 kernel/auditsc.c           |  2 +-
 security/selinux/exports.c |  9 ---------
 3 files changed, 1 insertion(+), 25 deletions(-)

diff --git a/include/linux/selinux.h b/include/linux/selinux.h
index aad4e390d6a..79e4707ca77 100644
--- a/include/linux/selinux.h
+++ b/include/linux/selinux.h
@@ -69,16 +69,6 @@ int selinux_audit_rule_match(u32 ctxid, u32 field, u32 op,
  */
 void selinux_audit_set_callback(int (*callback)(void));
 
-/**
- *	selinux_task_ctxid - determine a context ID for a process.
- *	@tsk: the task object
- *	@ctxid: ID value returned via this
- *
- *	On return, ctxid will contain an ID for the context.  This value
- *	should only be used opaquely.
- */
-void selinux_task_ctxid(struct task_struct *tsk, u32 *ctxid);
-
 /**
  *     selinux_ctxid_to_string - map a security context ID to a string
  *     @ctxid: security context ID to be converted.
@@ -166,11 +156,6 @@ static inline void selinux_audit_set_callback(int (*callback)(void))
 	return;
 }
 
-static inline void selinux_task_ctxid(struct task_struct *tsk, u32 *ctxid)
-{
-	*ctxid = 0;
-}
-
 static inline int selinux_ctxid_to_string(u32 ctxid, char **ctx, u32 *ctxlen)
 {
        *ctx = NULL;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 1bd8827a010..331e1701039 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -385,7 +385,7 @@ static int audit_filter_rules(struct task_struct *tsk,
 			   logged upon error */
 			if (f->se_rule) {
 				if (need_sid) {
-					selinux_task_ctxid(tsk, &sid);
+					selinux_get_task_sid(tsk, &sid);
 					need_sid = 0;
 				}
 				result = selinux_audit_rule_match(sid, f->type,
diff --git a/security/selinux/exports.c b/security/selinux/exports.c
index 9d7737db5e5..ee0fb47f81a 100644
--- a/security/selinux/exports.c
+++ b/security/selinux/exports.c
@@ -21,15 +21,6 @@
 #include "security.h"
 #include "objsec.h"
 
-void selinux_task_ctxid(struct task_struct *tsk, u32 *ctxid)
-{
-	struct task_security_struct *tsec = tsk->security;
-	if (selinux_enabled)
-		*ctxid = tsec->sid;
-	else
-		*ctxid = 0;
-}
-
 int selinux_ctxid_to_string(u32 ctxid, char **ctx, u32 *ctxlen)
 {
 	if (selinux_enabled)
-- 
cgit v1.2.3


From 1a70cd40cb291c25b67ec0da715a49d76719329d Mon Sep 17 00:00:00 2001
From: Stephen Smalley <sds@tycho.nsa.gov>
Date: Mon, 25 Sep 2006 23:31:57 -0700
Subject: [PATCH] selinux: rename selinux_ctxid_to_string

Rename selinux_ctxid_to_string to selinux_sid_to_string to be
consistent with other interfaces.

Signed-off-by: Stephen Smalley <sds@tycho.nsa.gov>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/selinux.h    |  8 ++++----
 kernel/audit.c             | 14 +++++++-------
 kernel/auditfilter.c       |  2 +-
 kernel/auditsc.c           |  4 ++--
 security/selinux/exports.c |  4 ++--
 5 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/include/linux/selinux.h b/include/linux/selinux.h
index 79e4707ca77..df9098de4c9 100644
--- a/include/linux/selinux.h
+++ b/include/linux/selinux.h
@@ -70,8 +70,8 @@ int selinux_audit_rule_match(u32 ctxid, u32 field, u32 op,
 void selinux_audit_set_callback(int (*callback)(void));
 
 /**
- *     selinux_ctxid_to_string - map a security context ID to a string
- *     @ctxid: security context ID to be converted.
+ *     selinux_sid_to_string - map a security context ID to a string
+ *     @sid: security context ID to be converted.
  *     @ctx: address of context string to be returned
  *     @ctxlen: length of returned context string.
  *
@@ -79,7 +79,7 @@ void selinux_audit_set_callback(int (*callback)(void));
  *     string will be allocated internally, and the caller must call
  *     kfree() on it after use.
  */
-int selinux_ctxid_to_string(u32 ctxid, char **ctx, u32 *ctxlen);
+int selinux_sid_to_string(u32 sid, char **ctx, u32 *ctxlen);
 
 /**
  *     selinux_get_inode_sid - get the inode's security context ID
@@ -156,7 +156,7 @@ static inline void selinux_audit_set_callback(int (*callback)(void))
 	return;
 }
 
-static inline int selinux_ctxid_to_string(u32 ctxid, char **ctx, u32 *ctxlen)
+static inline int selinux_sid_to_string(u32 sid, char **ctx, u32 *ctxlen)
 {
        *ctx = NULL;
        *ctxlen = 0;
diff --git a/kernel/audit.c b/kernel/audit.c
index 963fd15c962..f9889ee7782 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -244,7 +244,7 @@ static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid)
 		char *ctx = NULL;
 		u32 len;
 		int rc;
-		if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
+		if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
 			return rc;
 		else
 			audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
@@ -267,7 +267,7 @@ static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid)
 		char *ctx = NULL;
 		u32 len;
 		int rc;
-		if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
+		if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
 			return rc;
 		else
 			audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
@@ -293,7 +293,7 @@ static int audit_set_enabled(int state, uid_t loginuid, u32 sid)
 		char *ctx = NULL;
 		u32 len;
 		int rc;
-		if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
+		if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
 			return rc;
 		else
 			audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
@@ -321,7 +321,7 @@ static int audit_set_failure(int state, uid_t loginuid, u32 sid)
 		char *ctx = NULL;
 		u32 len;
 		int rc;
-		if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
+		if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
 			return rc;
 		else
 			audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
@@ -538,7 +538,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		if (status_get->mask & AUDIT_STATUS_PID) {
 			int old   = audit_pid;
 			if (sid) {
-				if ((err = selinux_ctxid_to_string(
+				if ((err = selinux_sid_to_string(
 						sid, &ctx, &len)))
 					return err;
 				else
@@ -576,7 +576,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 						 "user pid=%d uid=%u auid=%u",
 						 pid, uid, loginuid);
 				if (sid) {
-					if (selinux_ctxid_to_string(
+					if (selinux_sid_to_string(
 							sid, &ctx, &len)) {
 						audit_log_format(ab, 
 							" ssid=%u", sid);
@@ -614,7 +614,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 					   loginuid, sid);
 		break;
 	case AUDIT_SIGNAL_INFO:
-		err = selinux_ctxid_to_string(audit_sig_sid, &ctx, &len);
+		err = selinux_sid_to_string(audit_sig_sid, &ctx, &len);
 		if (err)
 			return err;
 		sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index a44879b0c72..1a58a81fb09 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1398,7 +1398,7 @@ static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action,
 	if (sid) {
 		char *ctx = NULL;
 		u32 len;
-		if (selinux_ctxid_to_string(sid, &ctx, &len))
+		if (selinux_sid_to_string(sid, &ctx, &len))
 			audit_log_format(ab, " ssid=%u", sid);
 		else
 			audit_log_format(ab, " subj=%s", ctx);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 331e1701039..fb83c5cb8c3 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -898,7 +898,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 			if (axi->osid != 0) {
 				char *ctx = NULL;
 				u32 len;
-				if (selinux_ctxid_to_string(
+				if (selinux_sid_to_string(
 						axi->osid, &ctx, &len)) {
 					audit_log_format(ab, " osid=%u",
 							axi->osid);
@@ -1005,7 +1005,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 		if (n->osid != 0) {
 			char *ctx = NULL;
 			u32 len;
-			if (selinux_ctxid_to_string(
+			if (selinux_sid_to_string(
 				n->osid, &ctx, &len)) {
 				audit_log_format(ab, " osid=%u", n->osid);
 				call_panic = 2;
diff --git a/security/selinux/exports.c b/security/selinux/exports.c
index ee0fb47f81a..b6f96943be1 100644
--- a/security/selinux/exports.c
+++ b/security/selinux/exports.c
@@ -21,10 +21,10 @@
 #include "security.h"
 #include "objsec.h"
 
-int selinux_ctxid_to_string(u32 ctxid, char **ctx, u32 *ctxlen)
+int selinux_sid_to_string(u32 sid, char **ctx, u32 *ctxlen)
 {
 	if (selinux_enabled)
-		return security_sid_to_context(ctxid, ctx, ctxlen);
+		return security_sid_to_context(sid, ctx, ctxlen);
 	else {
 		*ctx = NULL;
 		*ctxlen = 0;
-- 
cgit v1.2.3


From 9a2f44f01a67a6ecca71515af999895b45a2aeb0 Mon Sep 17 00:00:00 2001
From: Stephen Smalley <sds@tycho.nsa.gov>
Date: Mon, 25 Sep 2006 23:31:58 -0700
Subject: [PATCH] selinux: replace ctxid with sid in selinux_audit_rule_match
 interface

Replace ctxid with sid in selinux_audit_rule_match interface for
consistency with other interfaces.

Signed-off-by: Stephen Smalley <sds@tycho.nsa.gov>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/selinux.h        | 6 +++---
 security/selinux/ss/services.c | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/linux/selinux.h b/include/linux/selinux.h
index df9098de4c9..d1b7ca6c1c5 100644
--- a/include/linux/selinux.h
+++ b/include/linux/selinux.h
@@ -46,7 +46,7 @@ void selinux_audit_rule_free(struct selinux_audit_rule *rule);
 
 /**
  *	selinux_audit_rule_match - determine if a context ID matches a rule.
- *	@ctxid: the context ID to check
+ *	@sid: the context ID to check
  *	@field: the field this rule refers to
  *	@op: the operater the rule uses
  *	@rule: pointer to the audit rule to check against
@@ -55,7 +55,7 @@ void selinux_audit_rule_free(struct selinux_audit_rule *rule);
  *	Returns 1 if the context id matches the rule, 0 if it does not, and
  *	-errno on failure.
  */
-int selinux_audit_rule_match(u32 ctxid, u32 field, u32 op,
+int selinux_audit_rule_match(u32 sid, u32 field, u32 op,
                              struct selinux_audit_rule *rule,
                              struct audit_context *actx);
 
@@ -144,7 +144,7 @@ static inline void selinux_audit_rule_free(struct selinux_audit_rule *rule)
 	return;
 }
 
-static inline int selinux_audit_rule_match(u32 ctxid, u32 field, u32 op,
+static inline int selinux_audit_rule_match(u32 sid, u32 field, u32 op,
                                            struct selinux_audit_rule *rule,
                                            struct audit_context *actx)
 {
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index 22ed17c1771..988079f4529 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -2003,7 +2003,7 @@ int selinux_audit_rule_init(u32 field, u32 op, char *rulestr,
 	return rc;
 }
 
-int selinux_audit_rule_match(u32 ctxid, u32 field, u32 op,
+int selinux_audit_rule_match(u32 sid, u32 field, u32 op,
                              struct selinux_audit_rule *rule,
                              struct audit_context *actx)
 {
@@ -2026,11 +2026,11 @@ int selinux_audit_rule_match(u32 ctxid, u32 field, u32 op,
 		goto out;
 	}
 
-	ctxt = sidtab_search(&sidtab, ctxid);
+	ctxt = sidtab_search(&sidtab, sid);
 	if (!ctxt) {
 		audit_log(actx, GFP_ATOMIC, AUDIT_SELINUX_ERR,
 		          "selinux_audit_rule_match: unrecognized SID %d\n",
-		          ctxid);
+		          sid);
 		match = -ENOENT;
 		goto out;
 	}
-- 
cgit v1.2.3


From 016b9bdb81d9c9c7800e4e224ade38d8b37669d3 Mon Sep 17 00:00:00 2001
From: Stephen Smalley <sds@tycho.nsa.gov>
Date: Mon, 25 Sep 2006 23:31:58 -0700
Subject: [PATCH] selinux: enable configuration of max policy version

Enable configuration of SELinux maximum supported policy version to support
legacy userland (init) that does not gracefully handle kernels that support
newer policy versions two or more beyond the installed policy, as in FC3
and FC4.

[bunk@stusta.de: improve Kconfig help text]
Signed-off-by: Stephen Smalley <sds@tycho.nsa.gov>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 security/selinux/Kconfig            | 37 +++++++++++++++++++++++++++++++++++++
 security/selinux/include/security.h |  6 +++++-
 2 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/security/selinux/Kconfig b/security/selinux/Kconfig
index 814ddc42f1f..5c64c746b06 100644
--- a/security/selinux/Kconfig
+++ b/security/selinux/Kconfig
@@ -124,3 +124,40 @@ config SECURITY_SELINUX_ENABLE_SECMARK_DEFAULT
 
 	  If you are unsure what do do here, select N.
 
+config SECURITY_SELINUX_POLICYDB_VERSION_MAX
+	bool "NSA SELinux maximum supported policy format version"
+	depends on SECURITY_SELINUX
+	default n
+	help
+	  This option enables the maximum policy format version supported
+	  by SELinux to be set to a particular value.  This value is reported
+	  to userspace via /selinux/policyvers and used at policy load time.
+	  It can be adjusted downward to support legacy userland (init) that
+	  does not correctly handle kernels that support newer policy versions.
+
+	  Examples:
+	  For the Fedora Core 3 or 4 Linux distributions, enable this option
+	  and set the value via the next option. For Fedore Core 5 and later,
+	  do not enable this option.
+
+	  If you are unsure how to answer this question, answer N.
+
+config SECURITY_SELINUX_POLICYDB_VERSION_MAX_VALUE
+	int "NSA SELinux maximum supported policy format version value"
+	depends on SECURITY_SELINUX_POLICYDB_VERSION_MAX
+	range 15 20
+	default 19
+	help
+	  This option sets the value for the maximum policy format version
+	  supported by SELinux.
+
+	  Examples:
+	  For Fedora Core 3, use 18.
+	  For Fedora Core 4, use 19.
+
+	  If you are unsure how to answer this question, look for the
+	  policy format version supported by your policy toolchain, by
+	  running 'checkpolicy -V'. Or look at what policy you have
+	  installed under /etc/selinux/$SELINUXTYPE/policy, where
+	  SELINUXTYPE is defined in your /etc/selinux/config.
+
diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h
index 911954a692f..aa21ca1721a 100644
--- a/security/selinux/include/security.h
+++ b/security/selinux/include/security.h
@@ -27,7 +27,11 @@
 
 /* Range of policy versions we understand*/
 #define POLICYDB_VERSION_MIN   POLICYDB_VERSION_BASE
-#define POLICYDB_VERSION_MAX   POLICYDB_VERSION_AVTAB
+#ifdef CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX
+#define POLICYDB_VERSION_MAX	CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX_VALUE
+#else
+#define POLICYDB_VERSION_MAX	POLICYDB_VERSION_AVTAB
+#endif
 
 extern int selinux_enabled;
 extern int selinux_mls_enabled;
-- 
cgit v1.2.3


From f3f8771420737004da55159c2f2dc0b6f483a4ef Mon Sep 17 00:00:00 2001
From: Darrel Goeddel <dgoeddel@TrustedCS.com>
Date: Mon, 25 Sep 2006 23:31:59 -0700
Subject: [PATCH] selinux: add support for range transitions on object classes

Introduces support for policy version 21.  This version of the binary
kernel policy allows for defining range transitions on security classes
other than the process security class.  As always, backwards compatibility
for older formats is retained.  The security class is read in as specified
when using the new format, while the "process" security class is assumed
when using an older policy format.

Signed-off-by: Darrel Goeddel <dgoeddel@trustedcs.com>
Signed-off-by: Stephen Smalley <sds@tycho.nsa.gov>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 security/selinux/Kconfig            |  2 +-
 security/selinux/include/security.h |  3 ++-
 security/selinux/ss/mls.c           | 21 ++++++++++-----------
 security/selinux/ss/policydb.c      | 27 ++++++++++++++++++++-------
 security/selinux/ss/policydb.h      |  7 ++++---
 5 files changed, 37 insertions(+), 23 deletions(-)

diff --git a/security/selinux/Kconfig b/security/selinux/Kconfig
index 5c64c746b06..293dbd6246c 100644
--- a/security/selinux/Kconfig
+++ b/security/selinux/Kconfig
@@ -145,7 +145,7 @@ config SECURITY_SELINUX_POLICYDB_VERSION_MAX
 config SECURITY_SELINUX_POLICYDB_VERSION_MAX_VALUE
 	int "NSA SELinux maximum supported policy format version value"
 	depends on SECURITY_SELINUX_POLICYDB_VERSION_MAX
-	range 15 20
+	range 15 21
 	default 19
 	help
 	  This option sets the value for the maximum policy format version
diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h
index aa21ca1721a..1ef79172cc8 100644
--- a/security/selinux/include/security.h
+++ b/security/selinux/include/security.h
@@ -24,13 +24,14 @@
 #define POLICYDB_VERSION_VALIDATETRANS	19
 #define POLICYDB_VERSION_MLS		19
 #define POLICYDB_VERSION_AVTAB		20
+#define POLICYDB_VERSION_RANGETRANS	21
 
 /* Range of policy versions we understand*/
 #define POLICYDB_VERSION_MIN   POLICYDB_VERSION_BASE
 #ifdef CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX
 #define POLICYDB_VERSION_MAX	CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX_VALUE
 #else
-#define POLICYDB_VERSION_MAX	POLICYDB_VERSION_AVTAB
+#define POLICYDB_VERSION_MAX	POLICYDB_VERSION_RANGETRANS
 #endif
 
 extern int selinux_enabled;
diff --git a/security/selinux/ss/mls.c b/security/selinux/ss/mls.c
index 119bd6078ba..c713af23250 100644
--- a/security/selinux/ss/mls.c
+++ b/security/selinux/ss/mls.c
@@ -530,22 +530,21 @@ int mls_compute_sid(struct context *scontext,
 		    u32 specified,
 		    struct context *newcontext)
 {
+	struct range_trans *rtr;
+
 	if (!selinux_mls_enabled)
 		return 0;
 
 	switch (specified) {
 	case AVTAB_TRANSITION:
-		if (tclass == SECCLASS_PROCESS) {
-			struct range_trans *rangetr;
-			/* Look for a range transition rule. */
-			for (rangetr = policydb.range_tr; rangetr;
-			     rangetr = rangetr->next) {
-				if (rangetr->dom == scontext->type &&
-				    rangetr->type == tcontext->type) {
-					/* Set the range from the rule */
-					return mls_range_set(newcontext,
-					                     &rangetr->range);
-				}
+		/* Look for a range transition rule. */
+		for (rtr = policydb.range_tr; rtr; rtr = rtr->next) {
+			if (rtr->source_type == scontext->type &&
+			    rtr->target_type == tcontext->type &&
+			    rtr->target_class == tclass) {
+				/* Set the range from the rule */
+				return mls_range_set(newcontext,
+				                     &rtr->target_range);
 			}
 		}
 		/* Fallthrough */
diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c
index f03960e697c..b1889530255 100644
--- a/security/selinux/ss/policydb.c
+++ b/security/selinux/ss/policydb.c
@@ -96,6 +96,11 @@ static struct policydb_compat_info policydb_compat[] = {
 		.sym_num        = SYM_NUM,
 		.ocon_num       = OCON_NUM,
 	},
+	{
+		.version        = POLICYDB_VERSION_RANGETRANS,
+		.sym_num        = SYM_NUM,
+		.ocon_num       = OCON_NUM,
+	},
 };
 
 static struct policydb_compat_info *policydb_lookup_compat(int version)
@@ -645,15 +650,15 @@ void policydb_destroy(struct policydb *p)
 
 	for (rt = p->range_tr; rt; rt = rt -> next) {
 		if (lrt) {
-			ebitmap_destroy(&lrt->range.level[0].cat);
-			ebitmap_destroy(&lrt->range.level[1].cat);
+			ebitmap_destroy(&lrt->target_range.level[0].cat);
+			ebitmap_destroy(&lrt->target_range.level[1].cat);
 			kfree(lrt);
 		}
 		lrt = rt;
 	}
 	if (lrt) {
-		ebitmap_destroy(&lrt->range.level[0].cat);
-		ebitmap_destroy(&lrt->range.level[1].cat);
+		ebitmap_destroy(&lrt->target_range.level[0].cat);
+		ebitmap_destroy(&lrt->target_range.level[1].cat);
 		kfree(lrt);
 	}
 
@@ -1829,6 +1834,7 @@ int policydb_read(struct policydb *p, void *fp)
 	}
 
 	if (p->policyvers >= POLICYDB_VERSION_MLS) {
+		int new_rangetr = p->policyvers >= POLICYDB_VERSION_RANGETRANS;
 		rc = next_entry(buf, fp, sizeof(u32));
 		if (rc < 0)
 			goto bad;
@@ -1847,9 +1853,16 @@ int policydb_read(struct policydb *p, void *fp)
 			rc = next_entry(buf, fp, (sizeof(u32) * 2));
 			if (rc < 0)
 				goto bad;
-			rt->dom = le32_to_cpu(buf[0]);
-			rt->type = le32_to_cpu(buf[1]);
-			rc = mls_read_range_helper(&rt->range, fp);
+			rt->source_type = le32_to_cpu(buf[0]);
+			rt->target_type = le32_to_cpu(buf[1]);
+			if (new_rangetr) {
+				rc = next_entry(buf, fp, sizeof(u32));
+				if (rc < 0)
+					goto bad;
+				rt->target_class = le32_to_cpu(buf[0]);
+			} else
+				rt->target_class = SECCLASS_PROCESS;
+			rc = mls_read_range_helper(&rt->target_range, fp);
 			if (rc)
 				goto bad;
 			lrt = rt;
diff --git a/security/selinux/ss/policydb.h b/security/selinux/ss/policydb.h
index b1340711f72..8319d5ff594 100644
--- a/security/selinux/ss/policydb.h
+++ b/security/selinux/ss/policydb.h
@@ -106,9 +106,10 @@ struct cat_datum {
 };
 
 struct range_trans {
-	u32 dom;			/* current process domain */
-	u32 type;			/* program executable type */
-	struct mls_range range;		/* new range */
+	u32 source_type;
+	u32 target_type;
+	u32 target_class;
+	struct mls_range target_range;
 	struct range_trans *next;
 };
 
-- 
cgit v1.2.3


From 296fddf7513c155adbd3a443d12add1f62b5cddb Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Mon, 25 Sep 2006 23:32:00 -0700
Subject: [PATCH] SELinux: eliminate inode_security_set_security

inode_security_set_sid is only called by security_inode_init_security, which
is called when a new file is being created and needs to have its incore
security state initialized and its security xattr set.  This helper used to be
called in other places in the past, but now only has the one.  So this patch
rolls inode_security_set_sid directly back into security_inode_init_security.
There also is no need to hold the isec->sem while doing this, as the inode is
not available to other threads at this point in time.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Stephen Smalley <sds@tycho.nsa.gov>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 security/selinux/hooks.c | 27 +++++++--------------------
 1 file changed, 7 insertions(+), 20 deletions(-)

diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 5a66c4c09f7..14a78199ee1 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1364,25 +1364,6 @@ static inline u32 file_to_av(struct file *file)
 	return av;
 }
 
-/* Set an inode's SID to a specified value. */
-static int inode_security_set_sid(struct inode *inode, u32 sid)
-{
-	struct inode_security_struct *isec = inode->i_security;
-	struct superblock_security_struct *sbsec = inode->i_sb->s_security;
-
-	if (!sbsec->initialized) {
-		/* Defer initialization to selinux_complete_init. */
-		return 0;
-	}
-
-	down(&isec->sem);
-	isec->sclass = inode_mode_to_security_class(inode->i_mode);
-	isec->sid = sid;
-	isec->initialized = 1;
-	up(&isec->sem);
-	return 0;
-}
-
 /* Hook functions begin here. */
 
 static int selinux_ptrace(struct task_struct *parent, struct task_struct *child)
@@ -2091,7 +2072,13 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir,
 		}
 	}
 
-	inode_security_set_sid(inode, newsid);
+	/* Possibly defer initialization to selinux_complete_init. */
+	if (sbsec->initialized) {
+		struct inode_security_struct *isec = inode->i_security;
+		isec->sclass = inode_mode_to_security_class(inode->i_mode);
+		isec->sid = newsid;
+		isec->initialized = 1;
+	}
 
 	if (!ss_initialized || sbsec->behavior == SECURITY_FS_USE_MNTPOINT)
 		return -EOPNOTSUPP;
-- 
cgit v1.2.3


From 23970741720360de9dd0a4e87fbeb1d5927aa474 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Mon, 25 Sep 2006 23:32:01 -0700
Subject: [PATCH] SELinux: change isec semaphore to a mutex

This patch converts the remaining isec->sem into a mutex.  Very similar
locking is provided as before only in the faster smaller mutex rather than a
semaphore.  An out_unlock path is introduced rather than the conditional
unlocking found in the original code.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Stephen Smalley <sds@tycho.nsa.gov>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 security/selinux/hooks.c          | 30 ++++++++++++++----------------
 security/selinux/include/objsec.h |  2 +-
 security/selinux/ss/services.c    |  4 ++--
 3 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 14a78199ee1..63ad57ab44f 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -71,6 +71,7 @@
 #include <linux/audit.h>
 #include <linux/string.h>
 #include <linux/selinux.h>
+#include <linux/mutex.h>
 
 #include "avc.h"
 #include "objsec.h"
@@ -185,7 +186,7 @@ static int inode_alloc_security(struct inode *inode)
 		return -ENOMEM;
 
 	memset(isec, 0, sizeof(*isec));
-	init_MUTEX(&isec->sem);
+	mutex_init(&isec->lock);
 	INIT_LIST_HEAD(&isec->list);
 	isec->inode = inode;
 	isec->sid = SECINITSID_UNLABELED;
@@ -843,15 +844,13 @@ static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dent
 	char *context = NULL;
 	unsigned len = 0;
 	int rc = 0;
-	int hold_sem = 0;
 
 	if (isec->initialized)
 		goto out;
 
-	down(&isec->sem);
-	hold_sem = 1;
+	mutex_lock(&isec->lock);
 	if (isec->initialized)
-		goto out;
+		goto out_unlock;
 
 	sbsec = inode->i_sb->s_security;
 	if (!sbsec->initialized) {
@@ -862,7 +861,7 @@ static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dent
 		if (list_empty(&isec->list))
 			list_add(&isec->list, &sbsec->isec_head);
 		spin_unlock(&sbsec->isec_lock);
-		goto out;
+		goto out_unlock;
 	}
 
 	switch (sbsec->behavior) {
@@ -885,7 +884,7 @@ static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dent
 			printk(KERN_WARNING "%s:  no dentry for dev=%s "
 			       "ino=%ld\n", __FUNCTION__, inode->i_sb->s_id,
 			       inode->i_ino);
-			goto out;
+			goto out_unlock;
 		}
 
 		len = INITCONTEXTLEN;
@@ -893,7 +892,7 @@ static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dent
 		if (!context) {
 			rc = -ENOMEM;
 			dput(dentry);
-			goto out;
+			goto out_unlock;
 		}
 		rc = inode->i_op->getxattr(dentry, XATTR_NAME_SELINUX,
 					   context, len);
@@ -903,7 +902,7 @@ static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dent
 						   NULL, 0);
 			if (rc < 0) {
 				dput(dentry);
-				goto out;
+				goto out_unlock;
 			}
 			kfree(context);
 			len = rc;
@@ -911,7 +910,7 @@ static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dent
 			if (!context) {
 				rc = -ENOMEM;
 				dput(dentry);
-				goto out;
+				goto out_unlock;
 			}
 			rc = inode->i_op->getxattr(dentry,
 						   XATTR_NAME_SELINUX,
@@ -924,7 +923,7 @@ static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dent
 				       "%d for dev=%s ino=%ld\n", __FUNCTION__,
 				       -rc, inode->i_sb->s_id, inode->i_ino);
 				kfree(context);
-				goto out;
+				goto out_unlock;
 			}
 			/* Map ENODATA to the default file SID */
 			sid = sbsec->def_sid;
@@ -960,7 +959,7 @@ static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dent
 					     isec->sclass,
 					     &sid);
 		if (rc)
-			goto out;
+			goto out_unlock;
 		isec->sid = sid;
 		break;
 	case SECURITY_FS_USE_MNTPOINT:
@@ -978,7 +977,7 @@ static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dent
 							  isec->sclass,
 							  &sid);
 				if (rc)
-					goto out;
+					goto out_unlock;
 				isec->sid = sid;
 			}
 		}
@@ -987,12 +986,11 @@ static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dent
 
 	isec->initialized = 1;
 
+out_unlock:
+	mutex_unlock(&isec->lock);
 out:
 	if (isec->sclass == SECCLASS_FILE)
 		isec->sclass = inode_mode_to_security_class(inode->i_mode);
-
-	if (hold_sem)
-		up(&isec->sem);
 	return rc;
 }
 
diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h
index 0a39bfd1319..7d5a0289878 100644
--- a/security/selinux/include/objsec.h
+++ b/security/selinux/include/objsec.h
@@ -44,7 +44,7 @@ struct inode_security_struct {
 	u32 sid;             /* SID of this object */
 	u16 sclass;       /* security class of this object */
 	unsigned char initialized;     /* initialization flag */
-	struct semaphore sem;
+	struct mutex lock;
 	unsigned char inherit;         /* inherit SID from parent entry */
 };
 
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index 988079f4529..0c219a1b324 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -2578,7 +2578,7 @@ int selinux_netlbl_inode_permission(struct inode *inode, int mask)
 	sock = SOCKET_I(inode);
 	isec = inode->i_security;
 	sksec = sock->sk->sk_security;
-	down(&isec->sem);
+	mutex_lock(&isec->lock);
 	if (unlikely(sksec->nlbl_state == NLBL_REQUIRE &&
 		     (mask & (MAY_WRITE | MAY_APPEND)))) {
 		lock_sock(sock->sk);
@@ -2586,7 +2586,7 @@ int selinux_netlbl_inode_permission(struct inode *inode, int mask)
 		release_sock(sock->sk);
 	} else
 		rc = 0;
-	up(&isec->sem);
+	mutex_unlock(&isec->lock);
 
 	return rc;
 }
-- 
cgit v1.2.3


From bc7e982b84aceef0a040c88ff659eb5c83818f72 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Mon, 25 Sep 2006 23:32:02 -0700
Subject: [PATCH] SELinux: convert sbsec semaphore to a mutex

This patch converts the semaphore in the superblock security struct to a
mutex.  No locking changes or other code changes are done.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Stephen Smalley <sds@tycho.nsa.gov>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 security/selinux/hooks.c          | 7 +++----
 security/selinux/include/objsec.h | 2 +-
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 63ad57ab44f..55cec4d6f11 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -51,7 +51,6 @@
 #include <net/ip.h>		/* for sysctl_local_port_range[] */
 #include <net/tcp.h>		/* struct or_callable used in sock_rcv_skb */
 #include <asm/uaccess.h>
-#include <asm/semaphore.h>
 #include <asm/ioctls.h>
 #include <linux/bitops.h>
 #include <linux/interrupt.h>
@@ -243,7 +242,7 @@ static int superblock_alloc_security(struct super_block *sb)
 	if (!sbsec)
 		return -ENOMEM;
 
-	init_MUTEX(&sbsec->sem);
+	mutex_init(&sbsec->lock);
 	INIT_LIST_HEAD(&sbsec->list);
 	INIT_LIST_HEAD(&sbsec->isec_head);
 	spin_lock_init(&sbsec->isec_lock);
@@ -595,7 +594,7 @@ static int superblock_doinit(struct super_block *sb, void *data)
 	struct inode *inode = root->d_inode;
 	int rc = 0;
 
-	down(&sbsec->sem);
+	mutex_lock(&sbsec->lock);
 	if (sbsec->initialized)
 		goto out;
 
@@ -690,7 +689,7 @@ next_inode:
 	}
 	spin_unlock(&sbsec->isec_lock);
 out:
-	up(&sbsec->sem);
+	mutex_unlock(&sbsec->lock);
 	return rc;
 }
 
diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h
index 7d5a0289878..ef2267fea8b 100644
--- a/security/selinux/include/objsec.h
+++ b/security/selinux/include/objsec.h
@@ -63,7 +63,7 @@ struct superblock_security_struct {
 	unsigned int behavior;          /* labeling behavior */
 	unsigned char initialized;      /* initialization flag */
 	unsigned char proc;             /* proc fs */
-	struct semaphore sem;
+	struct mutex lock;
 	struct list_head isec_head;
 	spinlock_t isec_lock;
 };
-- 
cgit v1.2.3


From b20c8122a3204496fca8b5343c93b60fe11dad04 Mon Sep 17 00:00:00 2001
From: Stephen Smalley <sds@tycho.nsa.gov>
Date: Mon, 25 Sep 2006 23:32:03 -0700
Subject: [PATCH] selinux: fix tty locking

Take tty_mutex when accessing ->signal->tty in selinux code.  Noted by Alan
Cox.  Longer term, we are looking at refactoring the code to provide better
encapsulation of the tty layer, but this is a simple fix that addresses the
immediate bug.

Signed-off-by: Stephen Smalley <sds@tycho.nsa.gov>
Acked-by: Alan Cox <alan@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 security/selinux/hooks.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 55cec4d6f11..e4d81a42fca 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1689,10 +1689,12 @@ static inline void flush_unauthorized_files(struct files_struct * files)
 {
 	struct avc_audit_data ad;
 	struct file *file, *devnull = NULL;
-	struct tty_struct *tty = current->signal->tty;
+	struct tty_struct *tty;
 	struct fdtable *fdt;
 	long j = -1;
 
+	mutex_lock(&tty_mutex);
+	tty = current->signal->tty;
 	if (tty) {
 		file_list_lock();
 		file = list_entry(tty->tty_files.next, typeof(*file), f_u.fu_list);
@@ -1712,6 +1714,7 @@ static inline void flush_unauthorized_files(struct files_struct * files)
 		}
 		file_list_unlock();
 	}
+	mutex_unlock(&tty_mutex);
 
 	/* Revalidate access to inherited open files. */
 
-- 
cgit v1.2.3


From 8d6b5eeea5eb644232cbbbe1c927fdf051e60fa5 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 25 Sep 2006 23:32:04 -0700
Subject: [PATCH] binfmt_elf: consistently use loff_t

As David Howells <dhowells@redhat.com> points out, binfmt_elf sometimes uses
off_t, sometimes uses loff_t.  Use loff_t throughout.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/binfmt_elf.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 672a3b90bc5..64802aabd1a 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1262,7 +1262,7 @@ static void fill_elf_header(struct elfhdr *elf, int segs)
 	return;
 }
 
-static void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, off_t offset)
+static void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset)
 {
 	phdr->p_type = PT_NOTE;
 	phdr->p_offset = offset;
@@ -1428,7 +1428,7 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
 	int i;
 	struct vm_area_struct *vma;
 	struct elfhdr *elf = NULL;
-	off_t offset = 0, dataoff;
+	loff_t offset = 0, dataoff;
 	unsigned long limit = current->signal->rlim[RLIMIT_CORE].rlim_cur;
 	int numnote;
 	struct memelfnote *notes = NULL;
@@ -1661,11 +1661,11 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
 	ELF_CORE_WRITE_EXTRA_DATA;
 #endif
 
-	if ((off_t)file->f_pos != offset) {
+	if (file->f_pos != offset) {
 		/* Sanity check */
 		printk(KERN_WARNING
-		       "elf_core_dump: file->f_pos (%ld) != offset (%ld)\n",
-		       (off_t)file->f_pos, offset);
+		       "elf_core_dump: file->f_pos (%Ld) != offset (%Ld)\n",
+		       file->f_pos, offset);
 	}
 
 end_coredump:
-- 
cgit v1.2.3


From 1bcbba306048ed86b935d57a95d887c23d52c94b Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 25 Sep 2006 23:32:04 -0700
Subject: [PATCH] FRV: Use the generic IRQ stuff

Make the FRV arch use the generic IRQ code rather than having its own
routines for doing so.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/frv/Kconfig                    |   8 +-
 arch/frv/kernel/Makefile            |   5 +-
 arch/frv/kernel/irq-mb93091.c       | 157 +++++---
 arch/frv/kernel/irq-mb93093.c       | 115 ++++--
 arch/frv/kernel/irq-mb93493.c       | 167 +++++---
 arch/frv/kernel/irq-routing.c       | 291 --------------
 arch/frv/kernel/irq.c               | 750 ++++++------------------------------
 arch/frv/kernel/setup.c             |   1 -
 arch/frv/kernel/time.c              |   1 -
 arch/frv/mb93090-mb00/pci-irq.c     |   1 -
 include/asm-frv/cpu-irqs.h          |  54 ++-
 include/asm-frv/hardirq.h           |   5 +
 include/asm-frv/irq-routing.h       |  70 ----
 include/asm-frv/irq.h               |  26 +-
 include/asm-frv/mb93091-fpga-irqs.h |   6 +-
 include/asm-frv/mb93093-fpga-irqs.h |   6 +-
 include/asm-frv/mb93493-irqs.h      |   6 +-
 include/asm-frv/mb93493-regs.h      |   2 +
 18 files changed, 450 insertions(+), 1221 deletions(-)
 delete mode 100644 arch/frv/kernel/irq-routing.c
 delete mode 100644 include/asm-frv/irq-routing.h

diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig
index a601a17cf56..5833808fb82 100644
--- a/arch/frv/Kconfig
+++ b/arch/frv/Kconfig
@@ -27,7 +27,7 @@ config GENERIC_CALIBRATE_DELAY
 
 config GENERIC_HARDIRQS
 	bool
-	default n
+	default y
 
 config GENERIC_TIME
 	bool
@@ -251,6 +251,12 @@ config MB93091_NO_MB
 endchoice
 endif
 
+config FUJITSU_MB93493
+	bool "MB93493 Multimedia chip"
+	help
+	  Select this option if the MB93493 multimedia chip is going to be
+	  used.
+
 choice
 	prompt "GP-Relative data support"
 	default GPREL_DATA_8
diff --git a/arch/frv/kernel/Makefile b/arch/frv/kernel/Makefile
index 5a827b349b5..32db3499c46 100644
--- a/arch/frv/kernel/Makefile
+++ b/arch/frv/kernel/Makefile
@@ -10,15 +10,14 @@ extra-y:= head.o init_task.o vmlinux.lds
 obj-y := $(heads-y) entry.o entry-table.o break.o switch_to.o kernel_thread.o \
 	 process.o traps.o ptrace.o signal.o dma.o \
 	 sys_frv.o time.o semaphore.o setup.o frv_ksyms.o \
-	 debug-stub.o irq.o irq-routing.o sleep.o uaccess.o
+	 debug-stub.o irq.o sleep.o uaccess.o
 
 obj-$(CONFIG_GDBSTUB)		+= gdb-stub.o gdb-io.o
 
 obj-$(CONFIG_MB93091_VDK)	+= irq-mb93091.o
-obj-$(CONFIG_MB93093_PDK)	+= irq-mb93093.o
-obj-$(CONFIG_FUJITSU_MB93493)	+= irq-mb93493.o
 obj-$(CONFIG_PM)		+= pm.o cmode.o
 obj-$(CONFIG_MB93093_PDK)	+= pm-mb93093.o
+obj-$(CONFIG_FUJITSU_MB93493)	+= irq-mb93493.o
 obj-$(CONFIG_SYSCTL)		+= sysctl.o
 obj-$(CONFIG_FUTEX)		+= futex.o
 obj-$(CONFIG_MODULES)		+= module.o
diff --git a/arch/frv/kernel/irq-mb93091.c b/arch/frv/kernel/irq-mb93091.c
index 1381abcd5cc..635d2343766 100644
--- a/arch/frv/kernel/irq-mb93091.c
+++ b/arch/frv/kernel/irq-mb93091.c
@@ -24,7 +24,6 @@
 #include <asm/delay.h>
 #include <asm/irq.h>
 #include <asm/irc-regs.h>
-#include <asm/irq-routing.h>
 
 #define __reg16(ADDR) (*(volatile unsigned short *)(ADDR))
 
@@ -33,83 +32,129 @@
 #define __get_IFR()	({ __reg16(0xffc0000c); })
 #define __clr_IFR(M)	do { __reg16(0xffc0000c) = ~(M); wmb(); } while(0)
 
-static void frv_fpga_doirq(struct irq_source *source);
-static void frv_fpga_control(struct irq_group *group, int irq, int on);
 
-/*****************************************************************************/
 /*
- * FPGA IRQ multiplexor
+ * on-motherboard FPGA PIC operations
  */
-static struct irq_source frv_fpga[4] = {
-#define __FPGA(X, M)					\
-	[X] = {						\
-		.muxname	= "fpga."#X,		\
-		.irqmask	= M,			\
-		.doirq		= frv_fpga_doirq,	\
-	}
+static void frv_fpga_enable(unsigned int irq)
+{
+	uint16_t imr = __get_IMR();
 
-	__FPGA(0, 0x0028),
-	__FPGA(1, 0x0050),
-	__FPGA(2, 0x1c00),
-	__FPGA(3, 0x6386),
-};
-
-static struct irq_group frv_fpga_irqs = {
-	.first_irq	= IRQ_BASE_FPGA,
-	.control	= frv_fpga_control,
-	.sources = {
-		[ 1] = &frv_fpga[3],
-		[ 2] = &frv_fpga[3],
-		[ 3] = &frv_fpga[0],
-		[ 4] = &frv_fpga[1],
-		[ 5] = &frv_fpga[0],
-		[ 6] = &frv_fpga[1],
-		[ 7] = &frv_fpga[3],
-		[ 8] = &frv_fpga[3],
-		[ 9] = &frv_fpga[3],
-		[10] = &frv_fpga[2],
-		[11] = &frv_fpga[2],
-		[12] = &frv_fpga[2],
-		[13] = &frv_fpga[3],
-		[14] = &frv_fpga[3],
-	},
-};
+	imr &= ~(1 << (irq - IRQ_BASE_FPGA));
 
+	__set_IMR(imr);
+}
 
-static void frv_fpga_control(struct irq_group *group, int index, int on)
+static void frv_fpga_disable(unsigned int irq)
 {
 	uint16_t imr = __get_IMR();
 
-	if (on)
-		imr &= ~(1 << index);
-	else
-		imr |= 1 << index;
+	imr |= 1 << (irq - IRQ_BASE_FPGA);
 
 	__set_IMR(imr);
 }
 
-static void frv_fpga_doirq(struct irq_source *source)
+static void frv_fpga_ack(unsigned int irq)
+{
+	__clr_IFR(1 << (irq - IRQ_BASE_FPGA));
+}
+
+static void frv_fpga_end(unsigned int irq)
+{
+}
+
+static struct irq_chip frv_fpga_pic = {
+	.name		= "mb93091",
+	.enable		= frv_fpga_enable,
+	.disable	= frv_fpga_disable,
+	.ack		= frv_fpga_ack,
+	.mask		= frv_fpga_disable,
+	.unmask		= frv_fpga_enable,
+	.end		= frv_fpga_end,
+};
+
+/*
+ * FPGA PIC interrupt handler
+ */
+static irqreturn_t fpga_interrupt(int irq, void *_mask, struct pt_regs *regs)
 {
-	uint16_t mask, imr;
+	uint16_t imr, mask = (unsigned long) _mask;
+	irqreturn_t iret = 0;
 
 	imr = __get_IMR();
-	mask = source->irqmask & ~imr & __get_IFR();
-	if (mask) {
-		__set_IMR(imr | mask);
-		__clr_IFR(mask);
-		distribute_irqs(&frv_fpga_irqs, mask);
-		__set_IMR(imr);
+	mask = mask & ~imr & __get_IFR();
+
+	/* poll all the triggered IRQs */
+	while (mask) {
+		int irq;
+
+		asm("scan %1,gr0,%0" : "=r"(irq) : "r"(mask));
+		irq = 31 - irq;
+		mask &= ~(1 << irq);
+
+		if (__do_IRQ(IRQ_BASE_FPGA + irq, regs))
+			iret |= IRQ_HANDLED;
 	}
+
+	return iret;
 }
 
+/*
+ * define an interrupt action for each FPGA PIC output
+ * - use dev_id to indicate the FPGA PIC input to output mappings
+ */
+static struct irqaction fpga_irq[4]  = {
+	[0] = {
+		.handler	= fpga_interrupt,
+		.flags		= IRQF_DISABLED | IRQF_SHARED,
+		.mask		= CPU_MASK_NONE,
+		.name		= "fpga.0",
+		.dev_id		= (void *) 0x0028UL,
+	},
+	[1] = {
+		.handler	= fpga_interrupt,
+		.flags		= IRQF_DISABLED | IRQF_SHARED,
+		.mask		= CPU_MASK_NONE,
+		.name		= "fpga.1",
+		.dev_id		= (void *) 0x0050UL,
+	},
+	[2] = {
+		.handler	= fpga_interrupt,
+		.flags		= IRQF_DISABLED | IRQF_SHARED,
+		.mask		= CPU_MASK_NONE,
+		.name		= "fpga.2",
+		.dev_id		= (void *) 0x1c00UL,
+	},
+	[3] = {
+		.handler	= fpga_interrupt,
+		.flags		= IRQF_DISABLED | IRQF_SHARED,
+		.mask		= CPU_MASK_NONE,
+		.name		= "fpga.3",
+		.dev_id		= (void *) 0x6386UL,
+	}
+};
+
+/*
+ * initialise the motherboard FPGA's PIC
+ */
 void __init fpga_init(void)
 {
+	int irq;
+
+	/* all PIC inputs are all set to be low-level driven, apart from the
+	 * NMI button (15) which is fixed at falling-edge
+	 */
 	__set_IMR(0x7ffe);
 	__clr_IFR(0x0000);
 
-	frv_irq_route_external(&frv_fpga[0], IRQ_CPU_EXTERNAL0);
-	frv_irq_route_external(&frv_fpga[1], IRQ_CPU_EXTERNAL1);
-	frv_irq_route_external(&frv_fpga[2], IRQ_CPU_EXTERNAL2);
-	frv_irq_route_external(&frv_fpga[3], IRQ_CPU_EXTERNAL3);
-	frv_irq_set_group(&frv_fpga_irqs);
+	for (irq = IRQ_BASE_FPGA + 1; irq <= IRQ_BASE_FPGA + 14; irq++)
+		set_irq_chip_and_handler(irq, &frv_fpga_pic, handle_level_irq);
+
+	set_irq_chip_and_handler(IRQ_FPGA_NMI, &frv_fpga_pic, handle_edge_irq);
+
+	/* the FPGA drives the first four external IRQ inputs on the CPU PIC */
+	setup_irq(IRQ_CPU_EXTERNAL0, &fpga_irq[0]);
+	setup_irq(IRQ_CPU_EXTERNAL1, &fpga_irq[1]);
+	setup_irq(IRQ_CPU_EXTERNAL2, &fpga_irq[2]);
+	setup_irq(IRQ_CPU_EXTERNAL3, &fpga_irq[3]);
 }
diff --git a/arch/frv/kernel/irq-mb93093.c b/arch/frv/kernel/irq-mb93093.c
index 48b2a642088..f60db7988e7 100644
--- a/arch/frv/kernel/irq-mb93093.c
+++ b/arch/frv/kernel/irq-mb93093.c
@@ -1,6 +1,6 @@
 /* irq-mb93093.c: MB93093 FPGA interrupt handling
  *
- * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -24,7 +24,6 @@
 #include <asm/delay.h>
 #include <asm/irq.h>
 #include <asm/irc-regs.h>
-#include <asm/irq-routing.h>
 
 #define __reg16(ADDR) (*(volatile unsigned short *)(__region_CS2 + (ADDR)))
 
@@ -33,66 +32,100 @@
 #define __get_IFR()	({ __reg16(0x02); })
 #define __clr_IFR(M)	do { __reg16(0x02) = ~(M); wmb(); } while(0)
 
-static void frv_fpga_doirq(struct irq_source *source);
-static void frv_fpga_control(struct irq_group *group, int irq, int on);
-
-/*****************************************************************************/
 /*
- * FPGA IRQ multiplexor
+ * off-CPU FPGA PIC operations
  */
-static struct irq_source frv_fpga[4] = {
-#define __FPGA(X, M)					\
-	[X] = {						\
-		.muxname	= "fpga."#X,		\
-		.irqmask	= M,			\
-		.doirq		= frv_fpga_doirq,	\
-	}
-
-	__FPGA(0, 0x0700),
-};
+static void frv_fpga_enable(unsigned int irq)
+{
+	uint16_t imr = __get_IMR();
 
-static struct irq_group frv_fpga_irqs = {
-	.first_irq	= IRQ_BASE_FPGA,
-	.control	= frv_fpga_control,
-	.sources = {
-		[ 8] = &frv_fpga[0],
-		[ 9] = &frv_fpga[0],
-		[10] = &frv_fpga[0],
-	},
-};
+	imr &= ~(1 << (irq - IRQ_BASE_FPGA));
 
+	__set_IMR(imr);
+}
 
-static void frv_fpga_control(struct irq_group *group, int index, int on)
+static void frv_fpga_disable(unsigned int irq)
 {
 	uint16_t imr = __get_IMR();
 
-	if (on)
-		imr &= ~(1 << index);
-	else
-		imr |= 1 << index;
+	imr |= 1 << (irq - IRQ_BASE_FPGA);
 
 	__set_IMR(imr);
 }
 
-static void frv_fpga_doirq(struct irq_source *source)
+static void frv_fpga_ack(unsigned int irq)
+{
+	__clr_IFR(1 << (irq - IRQ_BASE_FPGA));
+}
+
+static void frv_fpga_end(unsigned int irq)
+{
+}
+
+static struct irq_chip frv_fpga_pic = {
+	.name		= "mb93093",
+	.enable		= frv_fpga_enable,
+	.disable	= frv_fpga_disable,
+	.ack		= frv_fpga_ack,
+	.mask		= frv_fpga_disable,
+	.unmask		= frv_fpga_enable,
+	.end		= frv_fpga_end,
+};
+
+/*
+ * FPGA PIC interrupt handler
+ */
+static irqreturn_t fpga_interrupt(int irq, void *_mask, struct pt_regs *regs)
 {
-	uint16_t mask, imr;
+	uint16_t imr, mask = (unsigned long) _mask;
+	irqreturn_t iret = 0;
 
 	imr = __get_IMR();
-	mask = source->irqmask & ~imr & __get_IFR();
-	if (mask) {
-		__set_IMR(imr | mask);
-		__clr_IFR(mask);
-		distribute_irqs(&frv_fpga_irqs, mask);
-		__set_IMR(imr);
+	mask = mask & ~imr & __get_IFR();
+
+	/* poll all the triggered IRQs */
+	while (mask) {
+		int irq;
+
+		asm("scan %1,gr0,%0" : "=r"(irq) : "r"(mask));
+		irq = 31 - irq;
+		mask &= ~(1 << irq);
+
+		if (__do_IRQ(IRQ_BASE_FPGA + irq, regs))
+			iret |= IRQ_HANDLED;
 	}
+
+	return iret;
 }
 
+/*
+ * define an interrupt action for each FPGA PIC output
+ * - use dev_id to indicate the FPGA PIC input to output mappings
+ */
+static struct irqaction fpga_irq[1]  = {
+	[0] = {
+		.handler	= fpga_interrupt,
+		.flags		= IRQF_DISABLED,
+		.mask		= CPU_MASK_NONE,
+		.name		= "fpga.0",
+		.dev_id		= (void *) 0x0700UL,
+	}
+};
+
+/*
+ * initialise the motherboard FPGA's PIC
+ */
 void __init fpga_init(void)
 {
+	int irq;
+
+	/* all PIC inputs are all set to be edge triggered */
 	__set_IMR(0x0700);
 	__clr_IFR(0x0000);
 
-	frv_irq_route_external(&frv_fpga[0], IRQ_CPU_EXTERNAL2);
-	frv_irq_set_group(&frv_fpga_irqs);
+	for (irq = IRQ_BASE_FPGA + 8; irq <= IRQ_BASE_FPGA + 10; irq++)
+		set_irq_chip_and_handler(irq, &frv_fpga_pic, handle_edge_irq);
+
+	/* the FPGA drives external IRQ input #2 on the CPU PIC */
+	setup_irq(IRQ_CPU_EXTERNAL2, &fpga_irq[0]);
 }
diff --git a/arch/frv/kernel/irq-mb93493.c b/arch/frv/kernel/irq-mb93493.c
index 988d035640e..8ad9abfc7c1 100644
--- a/arch/frv/kernel/irq-mb93493.c
+++ b/arch/frv/kernel/irq-mb93493.c
@@ -1,6 +1,6 @@
 /* irq-mb93493.c: MB93493 companion chip interrupt handler
  *
- * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -24,84 +24,133 @@
 #include <asm/delay.h>
 #include <asm/irq.h>
 #include <asm/irc-regs.h>
-#include <asm/irq-routing.h>
 #include <asm/mb93493-irqs.h>
+#include <asm/mb93493-regs.h>
 
-static void frv_mb93493_doirq(struct irq_source *source);
+#define IRQ_ROUTE_ONE(X) (X##_ROUTE << (X - IRQ_BASE_MB93493))
+
+#define IRQ_ROUTING					\
+	(IRQ_ROUTE_ONE(IRQ_MB93493_VDC)		|	\
+	 IRQ_ROUTE_ONE(IRQ_MB93493_VCC)		|	\
+	 IRQ_ROUTE_ONE(IRQ_MB93493_AUDIO_OUT)	|	\
+	 IRQ_ROUTE_ONE(IRQ_MB93493_I2C_0)	|	\
+	 IRQ_ROUTE_ONE(IRQ_MB93493_I2C_1)	|	\
+	 IRQ_ROUTE_ONE(IRQ_MB93493_USB)		|	\
+	 IRQ_ROUTE_ONE(IRQ_MB93493_LOCAL_BUS)	|	\
+	 IRQ_ROUTE_ONE(IRQ_MB93493_PCMCIA)	|	\
+	 IRQ_ROUTE_ONE(IRQ_MB93493_GPIO)	|	\
+	 IRQ_ROUTE_ONE(IRQ_MB93493_AUDIO_IN))
 
-/*****************************************************************************/
 /*
- * MB93493 companion chip IRQ multiplexor
+ * daughter board PIC operations
  */
-static struct irq_source frv_mb93493[2] = {
-	[0] = {
-		.muxname		= "mb93493.0",
-		.muxdata		= __region_CS3 + 0x3d0,
-		.doirq			= frv_mb93493_doirq,
-		.irqmask		= 0x0000,
-	},
-	[1] = {
-		.muxname		= "mb93493.1",
-		.muxdata		= __region_CS3 + 0x3d4,
-		.doirq			= frv_mb93493_doirq,
-		.irqmask		= 0x0000,
-	},
-};
-
-static void frv_mb93493_control(struct irq_group *group, int index, int on)
+static void frv_mb93493_enable(unsigned int irq)
 {
-	struct irq_source *source;
 	uint32_t iqsr;
+	volatile void *piqsr;
 
-	if ((frv_mb93493[0].irqmask & (1 << index)))
-		source = &frv_mb93493[0];
+	if (IRQ_ROUTING & (1 << (irq - IRQ_BASE_MB93493)))
+		piqsr = __addr_MB93493_IQSR(1);
 	else
-		source = &frv_mb93493[1];
+		piqsr = __addr_MB93493_IQSR(0);
 
-	iqsr = readl(source->muxdata);
-	if (on)
-		iqsr |= 1 << (index + 16);
+	iqsr = readl(piqsr);
+	iqsr |= 1 << (irq - IRQ_BASE_MB93493 + 16);
+	writel(iqsr, piqsr);
+}
+
+static void frv_mb93493_disable(unsigned int irq)
+{
+	uint32_t iqsr;
+	volatile void *piqsr;
+
+	if (IRQ_ROUTING & (1 << (irq - IRQ_BASE_MB93493)))
+		piqsr = __addr_MB93493_IQSR(1);
 	else
-		iqsr &= ~(1 << (index + 16));
+		piqsr = __addr_MB93493_IQSR(0);
 
-	writel(iqsr, source->muxdata);
+	iqsr = readl(piqsr);
+	iqsr &= ~(1 << (irq - IRQ_BASE_MB93493 + 16));
+	writel(iqsr, piqsr);
 }
 
-static struct irq_group frv_mb93493_irqs = {
-	.first_irq	= IRQ_BASE_MB93493,
-	.control	= frv_mb93493_control,
-};
-
-static void frv_mb93493_doirq(struct irq_source *source)
+static void frv_mb93493_ack(unsigned int irq)
 {
-	uint32_t mask = readl(source->muxdata);
-	mask = mask & (mask >> 16) & 0xffff;
+}
 
-	if (mask)
-		distribute_irqs(&frv_mb93493_irqs, mask);
+static void frv_mb93493_end(unsigned int irq)
+{
 }
 
-static void __init mb93493_irq_route(int irq, int source)
+static struct irq_chip frv_mb93493_pic = {
+	.name		= "mb93093",
+	.enable		= frv_mb93493_enable,
+	.disable	= frv_mb93493_disable,
+	.ack		= frv_mb93493_ack,
+	.mask		= frv_mb93493_disable,
+	.unmask		= frv_mb93493_enable,
+	.end		= frv_mb93493_end,
+};
+
+/*
+ * MB93493 PIC interrupt handler
+ */
+static irqreturn_t mb93493_interrupt(int irq, void *_piqsr, struct pt_regs *regs)
 {
-	frv_mb93493[source].irqmask |= 1 << (irq - IRQ_BASE_MB93493);
-	frv_mb93493_irqs.sources[irq - IRQ_BASE_MB93493] = &frv_mb93493[source];
+	volatile void *piqsr = _piqsr;
+	irqreturn_t iret = 0;
+	uint32_t iqsr;
+
+	iqsr = readl(piqsr);
+	iqsr = iqsr & (iqsr >> 16) & 0xffff;
+
+	/* poll all the triggered IRQs */
+	while (iqsr) {
+		int irq;
+
+		asm("scan %1,gr0,%0" : "=r"(irq) : "r"(iqsr));
+		irq = 31 - irq;
+		iqsr &= ~(1 << irq);
+
+		if (__do_IRQ(IRQ_BASE_MB93493 + irq, regs))
+			iret |= IRQ_HANDLED;
+	}
+
+	return iret;
 }
 
-void __init route_mb93493_irqs(void)
+/*
+ * define an interrupt action for each MB93493 PIC output
+ * - use dev_id to indicate the MB93493 PIC input to output mappings
+ */
+static struct irqaction mb93493_irq[2]  = {
+	[0] = {
+		.handler	= mb93493_interrupt,
+		.flags		= IRQF_DISABLED | IRQF_SHARED,
+		.mask		= CPU_MASK_NONE,
+		.name		= "mb93493.0",
+		.dev_id		= (void *) __addr_MB93493_IQSR(0),
+	},
+	[1] = {
+		.handler	= mb93493_interrupt,
+		.flags		= IRQF_DISABLED | IRQF_SHARED,
+		.mask		= CPU_MASK_NONE,
+		.name		= "mb93493.1",
+		.dev_id		= (void *) __addr_MB93493_IQSR(1),
+	}
+};
+
+/*
+ * initialise the motherboard MB93493's PIC
+ */
+void __init mb93493_init(void)
 {
-	frv_irq_route_external(&frv_mb93493[0], IRQ_CPU_MB93493_0);
-	frv_irq_route_external(&frv_mb93493[1], IRQ_CPU_MB93493_1);
-
-	frv_irq_set_group(&frv_mb93493_irqs);
-
-	mb93493_irq_route(IRQ_MB93493_VDC,		IRQ_MB93493_VDC_ROUTE);
-	mb93493_irq_route(IRQ_MB93493_VCC,		IRQ_MB93493_VCC_ROUTE);
-	mb93493_irq_route(IRQ_MB93493_AUDIO_IN,		IRQ_MB93493_AUDIO_IN_ROUTE);
-	mb93493_irq_route(IRQ_MB93493_I2C_0,		IRQ_MB93493_I2C_0_ROUTE);
-	mb93493_irq_route(IRQ_MB93493_I2C_1,		IRQ_MB93493_I2C_1_ROUTE);
-	mb93493_irq_route(IRQ_MB93493_USB,		IRQ_MB93493_USB_ROUTE);
-	mb93493_irq_route(IRQ_MB93493_LOCAL_BUS,	IRQ_MB93493_LOCAL_BUS_ROUTE);
-	mb93493_irq_route(IRQ_MB93493_PCMCIA,		IRQ_MB93493_PCMCIA_ROUTE);
-	mb93493_irq_route(IRQ_MB93493_GPIO,		IRQ_MB93493_GPIO_ROUTE);
-	mb93493_irq_route(IRQ_MB93493_AUDIO_OUT,	IRQ_MB93493_AUDIO_OUT_ROUTE);
+	int irq;
+
+	for (irq = IRQ_BASE_MB93493 + 0; irq <= IRQ_BASE_MB93493 + 10; irq++)
+		set_irq_chip_and_handler(irq, &frv_mb93493_pic, handle_edge_irq);
+
+	/* the MB93493 drives external IRQ inputs on the CPU PIC */
+	setup_irq(IRQ_CPU_MB93493_0, &mb93493_irq[0]);
+	setup_irq(IRQ_CPU_MB93493_1, &mb93493_irq[1]);
 }
diff --git a/arch/frv/kernel/irq-routing.c b/arch/frv/kernel/irq-routing.c
deleted file mode 100644
index 53886adf47d..00000000000
--- a/arch/frv/kernel/irq-routing.c
+++ /dev/null
@@ -1,291 +0,0 @@
-/* irq-routing.c: IRQ routing
- *
- * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/sched.h>
-#include <linux/random.h>
-#include <linux/init.h>
-#include <linux/serial_reg.h>
-#include <asm/io.h>
-#include <asm/irq-routing.h>
-#include <asm/irc-regs.h>
-#include <asm/serial-regs.h>
-#include <asm/dma.h>
-
-struct irq_level frv_irq_levels[16] = {
-	[0 ... 15] = {
-		.lock	= SPIN_LOCK_UNLOCKED,
-	}
-};
-
-struct irq_group *irq_groups[NR_IRQ_GROUPS];
-
-extern struct irq_group frv_cpu_irqs;
-
-void __init frv_irq_route(struct irq_source *source, int irqlevel)
-{
-	source->level = &frv_irq_levels[irqlevel];
-	source->next = frv_irq_levels[irqlevel].sources;
-	frv_irq_levels[irqlevel].sources = source;
-}
-
-void __init frv_irq_route_external(struct irq_source *source, int irq)
-{
-	int irqlevel = 0;
-
-	switch (irq) {
-	case IRQ_CPU_EXTERNAL0:	irqlevel = IRQ_XIRQ0_LEVEL; break;
-	case IRQ_CPU_EXTERNAL1:	irqlevel = IRQ_XIRQ1_LEVEL; break;
-	case IRQ_CPU_EXTERNAL2:	irqlevel = IRQ_XIRQ2_LEVEL; break;
-	case IRQ_CPU_EXTERNAL3:	irqlevel = IRQ_XIRQ3_LEVEL; break;
-	case IRQ_CPU_EXTERNAL4:	irqlevel = IRQ_XIRQ4_LEVEL; break;
-	case IRQ_CPU_EXTERNAL5:	irqlevel = IRQ_XIRQ5_LEVEL; break;
-	case IRQ_CPU_EXTERNAL6:	irqlevel = IRQ_XIRQ6_LEVEL; break;
-	case IRQ_CPU_EXTERNAL7:	irqlevel = IRQ_XIRQ7_LEVEL; break;
-	default: BUG();
-	}
-
-	source->level = &frv_irq_levels[irqlevel];
-	source->next = frv_irq_levels[irqlevel].sources;
-	frv_irq_levels[irqlevel].sources = source;
-}
-
-void __init frv_irq_set_group(struct irq_group *group)
-{
-	irq_groups[group->first_irq >> NR_IRQ_LOG2_ACTIONS_PER_GROUP] = group;
-}
-
-void distribute_irqs(struct irq_group *group, unsigned long irqmask)
-{
-	struct irqaction *action;
-	int irq;
-
-	while (irqmask) {
-		asm("scan %1,gr0,%0" : "=r"(irq) : "r"(irqmask));
-		if (irq < 0 || irq > 31)
-			asm volatile("break");
-		irq = 31 - irq;
-
-		irqmask &= ~(1 << irq);
-		action = group->actions[irq];
-
-		irq += group->first_irq;
-
-		if (action) {
-			int status = 0;
-
-//			if (!(action->flags & IRQF_DISABLED))
-//				local_irq_enable();
-
-			do {
-				status |= action->flags;
-				action->handler(irq, action->dev_id, __frame);
-				action = action->next;
-			} while (action);
-
-			if (status & IRQF_SAMPLE_RANDOM)
-				add_interrupt_randomness(irq);
-			local_irq_disable();
-		}
-	}
-}
-
-/*****************************************************************************/
-/*
- * CPU UART interrupts
- */
-static void frv_cpuuart_doirq(struct irq_source *source)
-{
-//	uint8_t iir = readb(source->muxdata + UART_IIR * 8);
-//	if ((iir & 0x0f) != UART_IIR_NO_INT)
-		distribute_irqs(&frv_cpu_irqs, source->irqmask);
-}
-
-struct irq_source frv_cpuuart[2] = {
-#define __CPUUART(X, A)						\
-	[X] = {							\
-		.muxname	= "uart",			\
-		.muxdata	= (volatile void __iomem *)(unsigned long)A,\
-		.irqmask	= 1 << IRQ_CPU_UART##X,		\
-		.doirq		= frv_cpuuart_doirq,		\
-	}
-
-	__CPUUART(0, UART0_BASE),
-	__CPUUART(1, UART1_BASE),
-};
-
-/*****************************************************************************/
-/*
- * CPU DMA interrupts
- */
-static void frv_cpudma_doirq(struct irq_source *source)
-{
-	uint32_t cstr = readl(source->muxdata + DMAC_CSTRx);
-	if (cstr & DMAC_CSTRx_INT)
-		distribute_irqs(&frv_cpu_irqs, source->irqmask);
-}
-
-struct irq_source frv_cpudma[8] = {
-#define __CPUDMA(X, A)						\
-	[X] = {							\
-		.muxname	= "dma",			\
-		.muxdata	= (volatile void __iomem *)(unsigned long)A,\
-		.irqmask	= 1 << IRQ_CPU_DMA##X,		\
-		.doirq		= frv_cpudma_doirq,		\
-	}
-
-	__CPUDMA(0, 0xfe000900),
-	__CPUDMA(1, 0xfe000980),
-	__CPUDMA(2, 0xfe000a00),
-	__CPUDMA(3, 0xfe000a80),
-	__CPUDMA(4, 0xfe001000),
-	__CPUDMA(5, 0xfe001080),
-	__CPUDMA(6, 0xfe001100),
-	__CPUDMA(7, 0xfe001180),
-};
-
-/*****************************************************************************/
-/*
- * CPU timer interrupts - can't tell whether they've generated an interrupt or not
- */
-static void frv_cputimer_doirq(struct irq_source *source)
-{
-	distribute_irqs(&frv_cpu_irqs, source->irqmask);
-}
-
-struct irq_source frv_cputimer[3] = {
-#define __CPUTIMER(X)						\
-	[X] = {							\
-		.muxname	= "timer",			\
-		.muxdata	= NULL,				\
-		.irqmask	= 1 << IRQ_CPU_TIMER##X,	\
-		.doirq		= frv_cputimer_doirq,		\
-	}
-
-	__CPUTIMER(0),
-	__CPUTIMER(1),
-	__CPUTIMER(2),
-};
-
-/*****************************************************************************/
-/*
- * external CPU interrupts - can't tell directly whether they've generated an interrupt or not
- */
-static void frv_cpuexternal_doirq(struct irq_source *source)
-{
-	distribute_irqs(&frv_cpu_irqs, source->irqmask);
-}
-
-struct irq_source frv_cpuexternal[8] = {
-#define __CPUEXTERNAL(X)					\
-	[X] = {							\
-		.muxname	= "ext",			\
-		.muxdata	= NULL,				\
-		.irqmask	= 1 << IRQ_CPU_EXTERNAL##X,	\
-		.doirq		= frv_cpuexternal_doirq,	\
-	}
-
-	__CPUEXTERNAL(0),
-	__CPUEXTERNAL(1),
-	__CPUEXTERNAL(2),
-	__CPUEXTERNAL(3),
-	__CPUEXTERNAL(4),
-	__CPUEXTERNAL(5),
-	__CPUEXTERNAL(6),
-	__CPUEXTERNAL(7),
-};
-
-#define set_IRR(N,A,B,C,D) __set_IRR(N, (A << 28) | (B << 24) | (C << 20) | (D << 16))
-
-struct irq_group frv_cpu_irqs = {
-	.sources = {
-		[IRQ_CPU_UART0]		= &frv_cpuuart[0],
-		[IRQ_CPU_UART1]		= &frv_cpuuart[1],
-		[IRQ_CPU_TIMER0]	= &frv_cputimer[0],
-		[IRQ_CPU_TIMER1]	= &frv_cputimer[1],
-		[IRQ_CPU_TIMER2]	= &frv_cputimer[2],
-		[IRQ_CPU_DMA0]		= &frv_cpudma[0],
-		[IRQ_CPU_DMA1]		= &frv_cpudma[1],
-		[IRQ_CPU_DMA2]		= &frv_cpudma[2],
-		[IRQ_CPU_DMA3]		= &frv_cpudma[3],
-		[IRQ_CPU_DMA4]		= &frv_cpudma[4],
-		[IRQ_CPU_DMA5]		= &frv_cpudma[5],
-		[IRQ_CPU_DMA6]		= &frv_cpudma[6],
-		[IRQ_CPU_DMA7]		= &frv_cpudma[7],
-		[IRQ_CPU_EXTERNAL0]	= &frv_cpuexternal[0],
-		[IRQ_CPU_EXTERNAL1]	= &frv_cpuexternal[1],
-		[IRQ_CPU_EXTERNAL2]	= &frv_cpuexternal[2],
-		[IRQ_CPU_EXTERNAL3]	= &frv_cpuexternal[3],
-		[IRQ_CPU_EXTERNAL4]	= &frv_cpuexternal[4],
-		[IRQ_CPU_EXTERNAL5]	= &frv_cpuexternal[5],
-		[IRQ_CPU_EXTERNAL6]	= &frv_cpuexternal[6],
-		[IRQ_CPU_EXTERNAL7]	= &frv_cpuexternal[7],
-	},
-};
-
-/*****************************************************************************/
-/*
- * route the CPU's interrupt sources
- */
-void __init route_cpu_irqs(void)
-{
-	frv_irq_set_group(&frv_cpu_irqs);
-
-	__set_IITMR(0, 0x003f0000);	/* DMA0-3, TIMER0-2 IRQ detect levels */
-	__set_IITMR(1, 0x20000000);	/* ERR0-1, UART0-1, DMA4-7 IRQ detect levels */
-
-	/* route UART and error interrupts */
-	frv_irq_route(&frv_cpuuart[0],	IRQ_UART0_LEVEL);
-	frv_irq_route(&frv_cpuuart[1],	IRQ_UART1_LEVEL);
-
-	set_IRR(6, IRQ_GDBSTUB_LEVEL, IRQ_GDBSTUB_LEVEL, IRQ_UART1_LEVEL, IRQ_UART0_LEVEL);
-
-	/* route DMA channel interrupts */
-	frv_irq_route(&frv_cpudma[0],	IRQ_DMA0_LEVEL);
-	frv_irq_route(&frv_cpudma[1],	IRQ_DMA1_LEVEL);
-	frv_irq_route(&frv_cpudma[2],	IRQ_DMA2_LEVEL);
-	frv_irq_route(&frv_cpudma[3],	IRQ_DMA3_LEVEL);
-	frv_irq_route(&frv_cpudma[4],	IRQ_DMA4_LEVEL);
-	frv_irq_route(&frv_cpudma[5],	IRQ_DMA5_LEVEL);
-	frv_irq_route(&frv_cpudma[6],	IRQ_DMA6_LEVEL);
-	frv_irq_route(&frv_cpudma[7],	IRQ_DMA7_LEVEL);
-
-	set_IRR(4, IRQ_DMA3_LEVEL, IRQ_DMA2_LEVEL, IRQ_DMA1_LEVEL, IRQ_DMA0_LEVEL);
-	set_IRR(7, IRQ_DMA7_LEVEL, IRQ_DMA6_LEVEL, IRQ_DMA5_LEVEL, IRQ_DMA4_LEVEL);
-
-	/* route timer interrupts */
-	frv_irq_route(&frv_cputimer[0],	IRQ_TIMER0_LEVEL);
-	frv_irq_route(&frv_cputimer[1],	IRQ_TIMER1_LEVEL);
-	frv_irq_route(&frv_cputimer[2],	IRQ_TIMER2_LEVEL);
-
-	set_IRR(5, 0, IRQ_TIMER2_LEVEL, IRQ_TIMER1_LEVEL, IRQ_TIMER0_LEVEL);
-
-	/* route external interrupts */
-	frv_irq_route(&frv_cpuexternal[0], IRQ_XIRQ0_LEVEL);
-	frv_irq_route(&frv_cpuexternal[1], IRQ_XIRQ1_LEVEL);
-	frv_irq_route(&frv_cpuexternal[2], IRQ_XIRQ2_LEVEL);
-	frv_irq_route(&frv_cpuexternal[3], IRQ_XIRQ3_LEVEL);
-	frv_irq_route(&frv_cpuexternal[4], IRQ_XIRQ4_LEVEL);
-	frv_irq_route(&frv_cpuexternal[5], IRQ_XIRQ5_LEVEL);
-	frv_irq_route(&frv_cpuexternal[6], IRQ_XIRQ6_LEVEL);
-	frv_irq_route(&frv_cpuexternal[7], IRQ_XIRQ7_LEVEL);
-
-	set_IRR(2, IRQ_XIRQ7_LEVEL, IRQ_XIRQ6_LEVEL, IRQ_XIRQ5_LEVEL, IRQ_XIRQ4_LEVEL);
-	set_IRR(3, IRQ_XIRQ3_LEVEL, IRQ_XIRQ2_LEVEL, IRQ_XIRQ1_LEVEL, IRQ_XIRQ0_LEVEL);
-
-#if defined(CONFIG_MB93091_VDK)
-	__set_TM1(0x55550000);		/* XIRQ7-0 all active low */
-#elif defined(CONFIG_MB93093_PDK)
-	__set_TM1(0x15550000);		/* XIRQ7 active high, 6-0 all active low */
-#else
-#error dont know external IRQ trigger levels for this setup
-#endif
-
-} /* end route_cpu_irqs() */
diff --git a/arch/frv/kernel/irq.c b/arch/frv/kernel/irq.c
index 08967010be0..e1ab9f2e43f 100644
--- a/arch/frv/kernel/irq.c
+++ b/arch/frv/kernel/irq.c
@@ -1,6 +1,6 @@
 /* irq.c: FRV IRQ handling
  *
- * Copyright (C) 2003, 2004 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2003, 2004, 2006 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -9,13 +9,6 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-/*
- * (mostly architecture independent, will move to kernel/irq.c in 2.5.)
- *
- * IRQs are in fact implemented a bit like signal handlers for the kernel.
- * Naturally it's not a 1:1 relation, but there are similarities.
- */
-
 #include <linux/ptrace.h>
 #include <linux/errno.h>
 #include <linux/signal.h>
@@ -43,19 +36,16 @@
 #include <asm/delay.h>
 #include <asm/irq.h>
 #include <asm/irc-regs.h>
-#include <asm/irq-routing.h>
 #include <asm/gdb-stub.h>
 
-extern void __init fpga_init(void);
-extern void __init route_mb93493_irqs(void);
-
-static void register_irq_proc (unsigned int irq);
+#define set_IRR(N,A,B,C,D) __set_IRR(N, (A << 28) | (B << 24) | (C << 20) | (D << 16))
 
-/*
- * Special irq handlers.
- */
+extern void __init fpga_init(void);
+#ifdef CONFIG_FUJITSU_MB93493
+extern void __init mb93493_init(void);
+#endif
 
-irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs) { return IRQ_HANDLED; }
+#define __reg16(ADDR) (*(volatile unsigned short *)(ADDR))
 
 atomic_t irq_err_count;
 
@@ -64,215 +54,99 @@ atomic_t irq_err_count;
  */
 int show_interrupts(struct seq_file *p, void *v)
 {
-	struct irqaction *action;
-	struct irq_group *group;
+	int i = *(loff_t *) v, cpu;
+	struct irqaction * action;
 	unsigned long flags;
-	int level, grp, ix, i, j;
-
-	i = *(loff_t *) v;
-
-	switch (i) {
-	case 0:
-		seq_printf(p, "           ");
-		for_each_online_cpu(j)
-			seq_printf(p, "CPU%d       ",j);
-
-		seq_putc(p, '\n');
-		break;
 
-	case 1 ... NR_IRQ_GROUPS * NR_IRQ_ACTIONS_PER_GROUP:
-		local_irq_save(flags);
-
-		grp = (i - 1) / NR_IRQ_ACTIONS_PER_GROUP;
-		group = irq_groups[grp];
-		if (!group)
-			goto skip;
-
-		ix = (i - 1) % NR_IRQ_ACTIONS_PER_GROUP;
-		action = group->actions[ix];
-		if (!action)
-			goto skip;
-
-		seq_printf(p, "%3d: ", i - 1);
-
-#ifndef CONFIG_SMP
-		seq_printf(p, "%10u ", kstat_irqs(i));
-#else
-		for_each_online_cpu(j)
-			seq_printf(p, "%10u ", kstat_cpu(j).irqs[i - 1]);
-#endif
-
-		level = group->sources[ix]->level - frv_irq_levels;
-
-		seq_printf(p, " %12s@%x", group->sources[ix]->muxname, level);
-		seq_printf(p, "  %s", action->name);
-
-		for (action = action->next; action; action = action->next)
-			seq_printf(p, ", %s", action->name);
+	if (i == 0) {
+		char cpuname[12];
 
+		seq_printf(p, "    ");
+		for_each_present_cpu(cpu) {
+			sprintf(cpuname, "CPU%d", cpu);
+			seq_printf(p, " %10s", cpuname);
+		}
 		seq_putc(p, '\n');
-skip:
-		local_irq_restore(flags);
-		break;
+	}
 
-	case NR_IRQ_GROUPS * NR_IRQ_ACTIONS_PER_GROUP + 1:
-		seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
-		break;
+	if (i < NR_IRQS) {
+		spin_lock_irqsave(&irq_desc[i].lock, flags);
+		action = irq_desc[i].action;
+		if (action) {
+			seq_printf(p, "%3d: ", i);
+			for_each_present_cpu(cpu)
+				seq_printf(p, "%10u ", kstat_cpu(cpu).irqs[i]);
+			seq_printf(p, " %10s", irq_desc[i].chip->name ? : "-");
+			seq_printf(p, "  %s", action->name);
+			for (action = action->next;
+			     action;
+			     action = action->next)
+				seq_printf(p, ", %s", action->name);
+
+			seq_putc(p, '\n');
+		}
 
-	default:
-		break;
+		spin_unlock_irqrestore(&irq_desc[i].lock, flags);
+	} else if (i == NR_IRQS) {
+		seq_printf(p, "Err: %10u\n", atomic_read(&irq_err_count));
 	}
 
 	return 0;
 }
 
-
 /*
- * Generic enable/disable code: this just calls
- * down into the PIC-specific version for the actual
- * hardware disable after having gotten the irq
- * controller lock.
+ * on-CPU PIC operations
  */
-
-/**
- *	disable_irq_nosync - disable an irq without waiting
- *	@irq: Interrupt to disable
- *
- *	Disable the selected interrupt line.  Disables and Enables are
- *	nested.
- *	Unlike disable_irq(), this function does not ensure existing
- *	instances of the IRQ handler have completed before returning.
- *
- *	This function may be called from IRQ context.
- */
-
-void disable_irq_nosync(unsigned int irq)
+static void frv_cpupic_enable(unsigned int irqlevel)
 {
-	struct irq_source *source;
-	struct irq_group *group;
-	struct irq_level *level;
-	unsigned long flags;
-	int idx = irq & (NR_IRQ_ACTIONS_PER_GROUP - 1);
-
-	group = irq_groups[irq >> NR_IRQ_LOG2_ACTIONS_PER_GROUP];
-	if (!group)
-		BUG();
-
-	source = group->sources[idx];
-	if (!source)
-		BUG();
-
-	level = source->level;
-
-	spin_lock_irqsave(&level->lock, flags);
-
-	if (group->control) {
-		if (!group->disable_cnt[idx]++)
-			group->control(group, idx, 0);
-	} else if (!level->disable_count++) {
-		__set_MASK(level - frv_irq_levels);
-	}
-
-	spin_unlock_irqrestore(&level->lock, flags);
+	__clr_MASK(irqlevel);
 }
 
-EXPORT_SYMBOL(disable_irq_nosync);
-
-/**
- *	disable_irq - disable an irq and wait for completion
- *	@irq: Interrupt to disable
- *
- *	Disable the selected interrupt line.  Enables and Disables are
- *	nested.
- *	This function waits for any pending IRQ handlers for this interrupt
- *	to complete before returning. If you use this function while
- *	holding a resource the IRQ handler may need you will deadlock.
- *
- *	This function may be called - with care - from IRQ context.
- */
-
-void disable_irq(unsigned int irq)
+static void frv_cpupic_disable(unsigned int irqlevel)
 {
-	disable_irq_nosync(irq);
-
-#ifdef CONFIG_SMP
-	if (!local_irq_count(smp_processor_id())) {
-		do {
-			barrier();
-		} while (irq_desc[irq].status & IRQ_INPROGRESS);
-	}
-#endif
+	__set_MASK(irqlevel);
 }
 
-EXPORT_SYMBOL(disable_irq);
-
-/**
- *	enable_irq - enable handling of an irq
- *	@irq: Interrupt to enable
- *
- *	Undoes the effect of one call to disable_irq().  If this
- *	matches the last disable, processing of interrupts on this
- *	IRQ line is re-enabled.
- *
- *	This function may be called from IRQ context.
- */
-
-void enable_irq(unsigned int irq)
+static void frv_cpupic_ack(unsigned int irqlevel)
 {
-	struct irq_source *source;
-	struct irq_group *group;
-	struct irq_level *level;
-	unsigned long flags;
-	int idx = irq & (NR_IRQ_ACTIONS_PER_GROUP - 1);
-	int count;
-
-	group = irq_groups[irq >> NR_IRQ_LOG2_ACTIONS_PER_GROUP];
-	if (!group)
-		BUG();
-
-	source = group->sources[idx];
-	if (!source)
-		BUG();
-
-	level = source->level;
-
-	spin_lock_irqsave(&level->lock, flags);
-
-	if (group->control)
-		count = group->disable_cnt[idx];
-	else
-		count = level->disable_count;
-
-	switch (count) {
-	case 1:
-		if (group->control) {
-			if (group->actions[idx])
-				group->control(group, idx, 1);
-		} else {
-			if (level->usage)
-				__clr_MASK(level - frv_irq_levels);
-		}
-		/* fall-through */
+	__set_MASK(irqlevel);
+	__clr_RC(irqlevel);
+	__clr_IRL();
+}
 
-	default:
-		count--;
-		break;
+static void frv_cpupic_mask(unsigned int irqlevel)
+{
+	__set_MASK(irqlevel);
+}
 
-	case 0:
-		printk("enable_irq(%u) unbalanced from %p\n", irq, __builtin_return_address(0));
-	}
+static void frv_cpupic_mask_ack(unsigned int irqlevel)
+{
+	__set_MASK(irqlevel);
+	__clr_RC(irqlevel);
+	__clr_IRL();
+}
 
-	if (group->control)
-		group->disable_cnt[idx] = count;
-	else
-		level->disable_count = count;
+static void frv_cpupic_unmask(unsigned int irqlevel)
+{
+	__clr_MASK(irqlevel);
+}
 
-	spin_unlock_irqrestore(&level->lock, flags);
+static void frv_cpupic_end(unsigned int irqlevel)
+{
+	__clr_MASK(irqlevel);
 }
 
-EXPORT_SYMBOL(enable_irq);
+static struct irq_chip frv_cpu_pic = {
+	.name		= "cpu",
+	.enable		= frv_cpupic_enable,
+	.disable	= frv_cpupic_disable,
+	.ack		= frv_cpupic_ack,
+	.mask		= frv_cpupic_mask,
+	.mask_ack	= frv_cpupic_mask_ack,
+	.unmask		= frv_cpupic_unmask,
+	.end		= frv_cpupic_end,
+};
 
-/*****************************************************************************/
 /*
  * handles all normal device IRQ's
  * - registers are referred to by the __frame variable (GR28)
@@ -281,463 +155,65 @@ EXPORT_SYMBOL(enable_irq);
  */
 asmlinkage void do_IRQ(void)
 {
-	struct irq_source *source;
-	int level, cpu;
-
 	irq_enter();
-
-	level = (__frame->tbr >> 4) & 0xf;
-	cpu = smp_processor_id();
-
-	if ((unsigned long) __frame - (unsigned long) (current + 1) < 512)
-		BUG();
-
-	__set_MASK(level);
-	__clr_RC(level);
-	__clr_IRL();
-
-	kstat_this_cpu.irqs[level]++;
-
-	for (source = frv_irq_levels[level].sources; source; source = source->next)
-		source->doirq(source);
-
-	__clr_MASK(level);
-
+	__do_IRQ(__get_IRL(), __frame);
 	irq_exit();
+}
 
-} /* end do_IRQ() */
-
-/*****************************************************************************/
 /*
  * handles all NMIs when not co-opted by the debugger
  * - registers are referred to by the __frame variable (GR28)
  */
 asmlinkage void do_NMI(void)
 {
-} /* end do_NMI() */
-
-/*****************************************************************************/
-/**
- *	request_irq - allocate an interrupt line
- *	@irq: Interrupt line to allocate
- *	@handler: Function to be called when the IRQ occurs
- *	@irqflags: Interrupt type flags
- *	@devname: An ascii name for the claiming device
- *	@dev_id: A cookie passed back to the handler function
- *
- *	This call allocates interrupt resources and enables the
- *	interrupt line and IRQ handling. From the point this
- *	call is made your handler function may be invoked. Since
- *	your handler function must clear any interrupt the board
- *	raises, you must take care both to initialise your hardware
- *	and to set up the interrupt handler in the right order.
- *
- *	Dev_id must be globally unique. Normally the address of the
- *	device data structure is used as the cookie. Since the handler
- *	receives this value it makes sense to use it.
- *
- *	If your interrupt is shared you must pass a non NULL dev_id
- *	as this is required when freeing the interrupt.
- *
- *	Flags:
- *
- *	IRQF_SHARED		Interrupt is shared
- *
- *	IRQF_DISABLED	Disable local interrupts while processing
- *
- *	IRQF_SAMPLE_RANDOM	The interrupt can be used for entropy
- *
- */
-
-int request_irq(unsigned int irq,
-		irqreturn_t (*handler)(int, void *, struct pt_regs *),
-		unsigned long irqflags,
-		const char * devname,
-		void *dev_id)
-{
-	int retval;
-	struct irqaction *action;
-
-#if 1
-	/*
-	 * Sanity-check: shared interrupts should REALLY pass in
-	 * a real dev-ID, otherwise we'll have trouble later trying
-	 * to figure out which interrupt is which (messes up the
-	 * interrupt freeing logic etc).
-	 */
-	if (irqflags & IRQF_SHARED) {
-		if (!dev_id)
-			printk("Bad boy: %s (at 0x%x) called us without a dev_id!\n",
-			       devname, (&irq)[-1]);
-	}
-#endif
-
-	if ((irq >> NR_IRQ_LOG2_ACTIONS_PER_GROUP) >= NR_IRQ_GROUPS)
-		return -EINVAL;
-	if (!handler)
-		return -EINVAL;
-
-	action = (struct irqaction *) kmalloc(sizeof(struct irqaction), GFP_KERNEL);
-	if (!action)
-		return -ENOMEM;
-
-	action->handler = handler;
-	action->flags = irqflags;
-	action->mask = CPU_MASK_NONE;
-	action->name = devname;
-	action->next = NULL;
-	action->dev_id = dev_id;
-
-	retval = setup_irq(irq, action);
-	if (retval)
-		kfree(action);
-	return retval;
-}
-
-EXPORT_SYMBOL(request_irq);
-
-/**
- *	free_irq - free an interrupt
- *	@irq: Interrupt line to free
- *	@dev_id: Device identity to free
- *
- *	Remove an interrupt handler. The handler is removed and if the
- *	interrupt line is no longer in use by any driver it is disabled.
- *	On a shared IRQ the caller must ensure the interrupt is disabled
- *	on the card it drives before calling this function. The function
- *	does not return until any executing interrupts for this IRQ
- *	have completed.
- *
- *	This function may be called from interrupt context.
- *
- *	Bugs: Attempting to free an irq in a handler for the same irq hangs
- *	      the machine.
- */
-
-void free_irq(unsigned int irq, void *dev_id)
-{
-	struct irq_source *source;
-	struct irq_group *group;
-	struct irq_level *level;
-	struct irqaction **p, **pp;
-	unsigned long flags;
-
-	if ((irq >> NR_IRQ_LOG2_ACTIONS_PER_GROUP) >= NR_IRQ_GROUPS)
-		return;
-
-	group = irq_groups[irq >> NR_IRQ_LOG2_ACTIONS_PER_GROUP];
-	if (!group)
-		BUG();
-
-	source = group->sources[irq & (NR_IRQ_ACTIONS_PER_GROUP - 1)];
-	if (!source)
-		BUG();
-
-	level = source->level;
-	p = &group->actions[irq & (NR_IRQ_ACTIONS_PER_GROUP - 1)];
-
-	spin_lock_irqsave(&level->lock, flags);
-
-	for (pp = p; *pp; pp = &(*pp)->next) {
-		struct irqaction *action = *pp;
-
-		if (action->dev_id != dev_id)
-			continue;
-
-		/* found it - remove from the list of entries */
-		*pp = action->next;
-
-		level->usage--;
-
-		if (p == pp && group->control)
-			group->control(group, irq & (NR_IRQ_ACTIONS_PER_GROUP - 1), 0);
-
-		if (level->usage == 0)
-			__set_MASK(level - frv_irq_levels);
-
-		spin_unlock_irqrestore(&level->lock,flags);
-
-#ifdef CONFIG_SMP
-		/* Wait to make sure it's not being used on another CPU */
-		while (desc->status & IRQ_INPROGRESS)
-			barrier();
-#endif
-		kfree(action);
-		return;
-	}
-}
-
-EXPORT_SYMBOL(free_irq);
-
-/*
- * IRQ autodetection code..
- *
- * This depends on the fact that any interrupt that comes in on to an
- * unassigned IRQ will cause GxICR_DETECT to be set
- */
-
-static DECLARE_MUTEX(probe_sem);
-
-/**
- *	probe_irq_on	- begin an interrupt autodetect
- *
- *	Commence probing for an interrupt. The interrupts are scanned
- *	and a mask of potential interrupt lines is returned.
- *
- */
-
-unsigned long probe_irq_on(void)
-{
-	down(&probe_sem);
-	return 0;
 }
 
-EXPORT_SYMBOL(probe_irq_on);
-
-/*
- * Return a mask of triggered interrupts (this
- * can handle only legacy ISA interrupts).
- */
-
-/**
- *	probe_irq_mask - scan a bitmap of interrupt lines
- *	@val:	mask of interrupts to consider
- *
- *	Scan the ISA bus interrupt lines and return a bitmap of
- *	active interrupts. The interrupt probe logic state is then
- *	returned to its previous value.
- *
- *	Note: we need to scan all the irq's even though we will
- *	only return ISA irq numbers - just so that we reset them
- *	all to a known state.
- */
-unsigned int probe_irq_mask(unsigned long xmask)
-{
-	up(&probe_sem);
-	return 0;
-}
-
-EXPORT_SYMBOL(probe_irq_mask);
-
 /*
- * Return the one interrupt that triggered (this can
- * handle any interrupt source).
- */
-
-/**
- *	probe_irq_off	- end an interrupt autodetect
- *	@xmask: mask of potential interrupts (unused)
- *
- *	Scans the unused interrupt lines and returns the line which
- *	appears to have triggered the interrupt. If no interrupt was
- *	found then zero is returned. If more than one interrupt is
- *	found then minus the first candidate is returned to indicate
- *	their is doubt.
- *
- *	The interrupt probe logic state is returned to its previous
- *	value.
- *
- *	BUGS: When used in a module (which arguably shouldnt happen)
- *	nothing prevents two IRQ probe callers from overlapping. The
- *	results of this are non-optimal.
+ * initialise the interrupt system
  */
-
-int probe_irq_off(unsigned long xmask)
-{
-	up(&probe_sem);
-	return -1;
-}
-
-EXPORT_SYMBOL(probe_irq_off);
-
-/* this was setup_x86_irq but it seems pretty generic */
-int setup_irq(unsigned int irq, struct irqaction *new)
-{
-	struct irq_source *source;
-	struct irq_group *group;
-	struct irq_level *level;
-	struct irqaction **p, **pp;
-	unsigned long flags;
-
-	group = irq_groups[irq >> NR_IRQ_LOG2_ACTIONS_PER_GROUP];
-	if (!group)
-		BUG();
-
-	source = group->sources[irq & (NR_IRQ_ACTIONS_PER_GROUP - 1)];
-	if (!source)
-		BUG();
-
-	level = source->level;
-
-	p = &group->actions[irq & (NR_IRQ_ACTIONS_PER_GROUP - 1)];
-
-	/*
-	 * Some drivers like serial.c use request_irq() heavily,
-	 * so we have to be careful not to interfere with a
-	 * running system.
-	 */
-	if (new->flags & IRQF_SAMPLE_RANDOM) {
-		/*
-		 * This function might sleep, we want to call it first,
-		 * outside of the atomic block.
-		 * Yes, this might clear the entropy pool if the wrong
-		 * driver is attempted to be loaded, without actually
-		 * installing a new handler, but is this really a problem,
-		 * only the sysadmin is able to do this.
-		 */
-		rand_initialize_irq(irq);
-	}
-
-	/* must juggle the interrupt processing stuff with interrupts disabled */
-	spin_lock_irqsave(&level->lock, flags);
-
-	/* can't share interrupts unless all parties agree to */
-	if (level->usage != 0 && !(level->flags & new->flags & IRQF_SHARED)) {
-		spin_unlock_irqrestore(&level->lock,flags);
-		return -EBUSY;
-	}
-
-	/* add new interrupt at end of irq queue */
-	pp = p;
-	while (*pp)
-		pp = &(*pp)->next;
-
-	*pp = new;
-
-	level->usage++;
-	level->flags = new->flags;
-
-	/* turn the interrupts on */
-	if (level->usage == 1)
-		__clr_MASK(level - frv_irq_levels);
-
-	if (p == pp && group->control)
-		group->control(group, irq & (NR_IRQ_ACTIONS_PER_GROUP - 1), 1);
-
-	spin_unlock_irqrestore(&level->lock, flags);
-	register_irq_proc(irq);
-	return 0;
-}
-
-static struct proc_dir_entry * root_irq_dir;
-static struct proc_dir_entry * irq_dir [NR_IRQS];
-
-#define HEX_DIGITS 8
-
-static unsigned int parse_hex_value (const char __user *buffer,
-				     unsigned long count, unsigned long *ret)
-{
-	unsigned char hexnum [HEX_DIGITS];
-	unsigned long value;
-	int i;
-
-	if (!count)
-		return -EINVAL;
-	if (count > HEX_DIGITS)
-		count = HEX_DIGITS;
-	if (copy_from_user(hexnum, buffer, count))
-		return -EFAULT;
-
-	/*
-	 * Parse the first 8 characters as a hex string, any non-hex char
-	 * is end-of-string. '00e1', 'e1', '00E1', 'E1' are all the same.
-	 */
-	value = 0;
-
-	for (i = 0; i < count; i++) {
-		unsigned int c = hexnum[i];
-
-		switch (c) {
-			case '0' ... '9': c -= '0'; break;
-			case 'a' ... 'f': c -= 'a'-10; break;
-			case 'A' ... 'F': c -= 'A'-10; break;
-		default:
-			goto out;
-		}
-		value = (value << 4) | c;
-	}
-out:
-	*ret = value;
-	return 0;
-}
-
-
-static int prof_cpu_mask_read_proc (char *page, char **start, off_t off,
-			int count, int *eof, void *data)
-{
-	unsigned long *mask = (unsigned long *) data;
-	if (count < HEX_DIGITS+1)
-		return -EINVAL;
-	return sprintf (page, "%08lx\n", *mask);
-}
-
-static int prof_cpu_mask_write_proc (struct file *file, const char __user *buffer,
-					unsigned long count, void *data)
-{
-	unsigned long *mask = (unsigned long *) data, full_count = count, err;
-	unsigned long new_value;
-
-	show_state();
-	err = parse_hex_value(buffer, count, &new_value);
-	if (err)
-		return err;
-
-	*mask = new_value;
-	return full_count;
-}
-
-#define MAX_NAMELEN 10
-
-static void register_irq_proc (unsigned int irq)
-{
-	char name [MAX_NAMELEN];
-
-	if (!root_irq_dir || irq_dir[irq])
-		return;
-
-	memset(name, 0, MAX_NAMELEN);
-	sprintf(name, "%d", irq);
-
-	/* create /proc/irq/1234 */
-	irq_dir[irq] = proc_mkdir(name, root_irq_dir);
-}
-
-unsigned long prof_cpu_mask = -1;
-
-void init_irq_proc (void)
+void __init init_IRQ(void)
 {
-	struct proc_dir_entry *entry;
-	int i;
+	int level;
 
-	/* create /proc/irq */
-	root_irq_dir = proc_mkdir("irq", NULL);
+	for (level = 1; level <= 14; level++)
+		set_irq_chip_and_handler(level, &frv_cpu_pic,
+					 handle_level_irq);
 
-	/* create /proc/irq/prof_cpu_mask */
-	entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir);
-	if (!entry)
-	    return;
+	set_irq_handler(IRQ_CPU_TIMER0, handle_edge_irq);
 
-	entry->nlink = 1;
-	entry->data = (void *)&prof_cpu_mask;
-	entry->read_proc = prof_cpu_mask_read_proc;
-	entry->write_proc = prof_cpu_mask_write_proc;
-
-	/*
-	 * Create entries for all existing IRQs.
+	/* set the trigger levels for internal interrupt sources
+	 * - timers all falling-edge
+	 * - ERR0 is rising-edge
+	 * - all others are high-level
 	 */
-	for (i = 0; i < NR_IRQS; i++)
-		register_irq_proc(i);
-}
+	__set_IITMR(0, 0x003f0000);	/* DMA0-3, TIMER0-2 */
+	__set_IITMR(1, 0x20000000);	/* ERR0-1, UART0-1, DMA4-7 */
+
+	/* route internal interrupts */
+	set_IRR(4, IRQ_DMA3_LEVEL, IRQ_DMA2_LEVEL, IRQ_DMA1_LEVEL,
+		IRQ_DMA0_LEVEL);
+	set_IRR(5, 0, IRQ_TIMER2_LEVEL, IRQ_TIMER1_LEVEL, IRQ_TIMER0_LEVEL);
+	set_IRR(6, IRQ_GDBSTUB_LEVEL, IRQ_GDBSTUB_LEVEL,
+		IRQ_UART1_LEVEL, IRQ_UART0_LEVEL);
+	set_IRR(7, IRQ_DMA7_LEVEL, IRQ_DMA6_LEVEL, IRQ_DMA5_LEVEL,
+		IRQ_DMA4_LEVEL);
+
+	/* route external interrupts */
+	set_IRR(2, IRQ_XIRQ7_LEVEL, IRQ_XIRQ6_LEVEL, IRQ_XIRQ5_LEVEL,
+		IRQ_XIRQ4_LEVEL);
+	set_IRR(3, IRQ_XIRQ3_LEVEL, IRQ_XIRQ2_LEVEL, IRQ_XIRQ1_LEVEL,
+		IRQ_XIRQ0_LEVEL);
+
+#if defined(CONFIG_MB93091_VDK)
+	__set_TM1(0x55550000);		/* XIRQ7-0 all active low */
+#elif defined(CONFIG_MB93093_PDK)
+	__set_TM1(0x15550000);		/* XIRQ7 active high, 6-0 all active low */
+#else
+#error dont know external IRQ trigger levels for this setup
+#endif
 
-/*****************************************************************************/
-/*
- * initialise the interrupt system
- */
-void __init init_IRQ(void)
-{
-	route_cpu_irqs();
 	fpga_init();
 #ifdef CONFIG_FUJITSU_MB93493
-	route_mb93493_irqs();
+	mb93493_init();
 #endif
-} /* end init_IRQ() */
+}
diff --git a/arch/frv/kernel/setup.c b/arch/frv/kernel/setup.c
index af08ccd4ed6..d96a57e5f03 100644
--- a/arch/frv/kernel/setup.c
+++ b/arch/frv/kernel/setup.c
@@ -43,7 +43,6 @@
 #include <asm/mb-regs.h>
 #include <asm/mb93493-regs.h>
 #include <asm/gdb-stub.h>
-#include <asm/irq-routing.h>
 #include <asm/io.h>
 
 #ifdef CONFIG_BLK_DEV_INITRD
diff --git a/arch/frv/kernel/time.c b/arch/frv/kernel/time.c
index 68a77fe3bb4..3d0284bccb9 100644
--- a/arch/frv/kernel/time.c
+++ b/arch/frv/kernel/time.c
@@ -26,7 +26,6 @@
 #include <asm/timer-regs.h>
 #include <asm/mb-regs.h>
 #include <asm/mb86943a.h>
-#include <asm/irq-routing.h>
 
 #include <linux/timex.h>
 
diff --git a/arch/frv/mb93090-mb00/pci-irq.c b/arch/frv/mb93090-mb00/pci-irq.c
index 2278c80bd88..ba587523c01 100644
--- a/arch/frv/mb93090-mb00/pci-irq.c
+++ b/arch/frv/mb93090-mb00/pci-irq.c
@@ -15,7 +15,6 @@
 
 #include <asm/io.h>
 #include <asm/smp.h>
-#include <asm/irq-routing.h>
 
 #include "pci-frv.h"
 
diff --git a/include/asm-frv/cpu-irqs.h b/include/asm-frv/cpu-irqs.h
index 5cd691e1f8c..478f3498fcf 100644
--- a/include/asm-frv/cpu-irqs.h
+++ b/include/asm-frv/cpu-irqs.h
@@ -14,36 +14,6 @@
 
 #ifndef __ASSEMBLY__
 
-#include <asm/irq-routing.h>
-
-#define IRQ_BASE_CPU		(NR_IRQ_ACTIONS_PER_GROUP * 0)
-
-/* IRQ IDs presented to drivers */
-enum {
-	IRQ_CPU__UNUSED = IRQ_BASE_CPU,
-	IRQ_CPU_UART0,
-	IRQ_CPU_UART1,
-	IRQ_CPU_TIMER0,
-	IRQ_CPU_TIMER1,
-	IRQ_CPU_TIMER2,
-	IRQ_CPU_DMA0,
-	IRQ_CPU_DMA1,
-	IRQ_CPU_DMA2,
-	IRQ_CPU_DMA3,
-	IRQ_CPU_DMA4,
-	IRQ_CPU_DMA5,
-	IRQ_CPU_DMA6,
-	IRQ_CPU_DMA7,
-	IRQ_CPU_EXTERNAL0,
-	IRQ_CPU_EXTERNAL1,
-	IRQ_CPU_EXTERNAL2,
-	IRQ_CPU_EXTERNAL3,
-	IRQ_CPU_EXTERNAL4,
-	IRQ_CPU_EXTERNAL5,
-	IRQ_CPU_EXTERNAL6,
-	IRQ_CPU_EXTERNAL7,
-};
-
 /* IRQ to level mappings */
 #define IRQ_GDBSTUB_LEVEL	15
 #define IRQ_UART_LEVEL		13
@@ -82,6 +52,30 @@ enum {
 #define IRQ_XIRQ6_LEVEL		7
 #define IRQ_XIRQ7_LEVEL		8
 
+/* IRQ IDs presented to drivers */
+#define IRQ_CPU__UNUSED		IRQ_BASE_CPU
+#define IRQ_CPU_UART0		(IRQ_BASE_CPU + IRQ_UART0_LEVEL)
+#define IRQ_CPU_UART1		(IRQ_BASE_CPU + IRQ_UART1_LEVEL)
+#define IRQ_CPU_TIMER0		(IRQ_BASE_CPU + IRQ_TIMER0_LEVEL)
+#define IRQ_CPU_TIMER1		(IRQ_BASE_CPU + IRQ_TIMER1_LEVEL)
+#define IRQ_CPU_TIMER2		(IRQ_BASE_CPU + IRQ_TIMER2_LEVEL)
+#define IRQ_CPU_DMA0		(IRQ_BASE_CPU + IRQ_DMA0_LEVEL)
+#define IRQ_CPU_DMA1		(IRQ_BASE_CPU + IRQ_DMA1_LEVEL)
+#define IRQ_CPU_DMA2		(IRQ_BASE_CPU + IRQ_DMA2_LEVEL)
+#define IRQ_CPU_DMA3		(IRQ_BASE_CPU + IRQ_DMA3_LEVEL)
+#define IRQ_CPU_DMA4		(IRQ_BASE_CPU + IRQ_DMA4_LEVEL)
+#define IRQ_CPU_DMA5		(IRQ_BASE_CPU + IRQ_DMA5_LEVEL)
+#define IRQ_CPU_DMA6		(IRQ_BASE_CPU + IRQ_DMA6_LEVEL)
+#define IRQ_CPU_DMA7		(IRQ_BASE_CPU + IRQ_DMA7_LEVEL)
+#define IRQ_CPU_EXTERNAL0	(IRQ_BASE_CPU + IRQ_XIRQ0_LEVEL)
+#define IRQ_CPU_EXTERNAL1	(IRQ_BASE_CPU + IRQ_XIRQ1_LEVEL)
+#define IRQ_CPU_EXTERNAL2	(IRQ_BASE_CPU + IRQ_XIRQ2_LEVEL)
+#define IRQ_CPU_EXTERNAL3	(IRQ_BASE_CPU + IRQ_XIRQ3_LEVEL)
+#define IRQ_CPU_EXTERNAL4	(IRQ_BASE_CPU + IRQ_XIRQ4_LEVEL)
+#define IRQ_CPU_EXTERNAL5	(IRQ_BASE_CPU + IRQ_XIRQ5_LEVEL)
+#define IRQ_CPU_EXTERNAL6	(IRQ_BASE_CPU + IRQ_XIRQ6_LEVEL)
+#define IRQ_CPU_EXTERNAL7	(IRQ_BASE_CPU + IRQ_XIRQ7_LEVEL)
+
 #endif /* !__ASSEMBLY__ */
 
 #endif /* _ASM_CPU_IRQS_H */
diff --git a/include/asm-frv/hardirq.h b/include/asm-frv/hardirq.h
index 7581b5a7559..fc47515822a 100644
--- a/include/asm-frv/hardirq.h
+++ b/include/asm-frv/hardirq.h
@@ -26,5 +26,10 @@ typedef struct {
 #error SMP not available on FR-V
 #endif /* CONFIG_SMP */
 
+extern atomic_t irq_err_count;
+static inline void ack_bad_irq(int irq)
+{
+	atomic_inc(&irq_err_count);
+}
 
 #endif
diff --git a/include/asm-frv/irq-routing.h b/include/asm-frv/irq-routing.h
deleted file mode 100644
index ac3ab900a1d..00000000000
--- a/include/asm-frv/irq-routing.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/* irq-routing.h: multiplexed IRQ routing
- *
- * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#ifndef _ASM_IRQ_ROUTING_H
-#define _ASM_IRQ_ROUTING_H
-
-#ifndef __ASSEMBLY__
-
-#include <linux/spinlock.h>
-#include <asm/irq.h>
-
-struct irq_source;
-struct irq_level;
-
-/*
- * IRQ action distribution sets
- */
-struct irq_group {
-	int			first_irq;	/* first IRQ distributed here */
-	void (*control)(struct irq_group *group, int index, int on);
-
-	struct irqaction	*actions[NR_IRQ_ACTIONS_PER_GROUP];	/* IRQ action chains */
-	struct irq_source	*sources[NR_IRQ_ACTIONS_PER_GROUP];	/* IRQ sources */
-	int			disable_cnt[NR_IRQ_ACTIONS_PER_GROUP];	/* disable counts */
-};
-
-/*
- * IRQ source manager
- */
-struct irq_source {
-	struct irq_source	*next;
-	struct irq_level	*level;
-	const char		*muxname;
-	volatile void __iomem	*muxdata;
-	unsigned long		irqmask;
-
-	void (*doirq)(struct irq_source *source);
-};
-
-/*
- * IRQ level management (per CPU IRQ priority / entry vector)
- */
-struct irq_level {
-	int			usage;
-	int			disable_count;
-	unsigned long		flags;		/* current IRQF_DISABLED and IRQF_SHARED settings */
-	spinlock_t		lock;
-	struct irq_source	*sources;
-};
-
-extern struct irq_level frv_irq_levels[16];
-extern struct irq_group *irq_groups[NR_IRQ_GROUPS];
-
-extern void frv_irq_route(struct irq_source *source, int irqlevel);
-extern void frv_irq_route_external(struct irq_source *source, int irq);
-extern void frv_irq_set_group(struct irq_group *group);
-extern void distribute_irqs(struct irq_group *group, unsigned long irqmask);
-extern void route_cpu_irqs(void);
-
-#endif /* !__ASSEMBLY__ */
-
-#endif /* _ASM_IRQ_ROUTING_H */
diff --git a/include/asm-frv/irq.h b/include/asm-frv/irq.h
index 58b619215a5..8fefd6b827a 100644
--- a/include/asm-frv/irq.h
+++ b/include/asm-frv/irq.h
@@ -1,6 +1,6 @@
 /* irq.h: FRV IRQ definitions
  *
- * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -12,32 +12,22 @@
 #ifndef _ASM_IRQ_H_
 #define _ASM_IRQ_H_
 
-
-/*
- * the system has an on-CPU PIC and another PIC on the FPGA and other PICs on other peripherals,
- * so we do some routing in irq-routing.[ch] to reduce the number of false-positives seen by
- * drivers
- */
-
 /* this number is used when no interrupt has been assigned */
 #define NO_IRQ				(-1)
 
-#define NR_IRQ_LOG2_ACTIONS_PER_GROUP	5
-#define NR_IRQ_ACTIONS_PER_GROUP	(1 << NR_IRQ_LOG2_ACTIONS_PER_GROUP)
-#define NR_IRQ_GROUPS			4
-#define NR_IRQS				(NR_IRQ_ACTIONS_PER_GROUP * NR_IRQ_GROUPS)
+#define NR_IRQS				48
+#define IRQ_BASE_CPU			(0 * 16)
+#define IRQ_BASE_FPGA			(1 * 16)
+#define IRQ_BASE_MB93493		(2 * 16)
 
 /* probe returns a 32-bit IRQ mask:-/ */
-#define MIN_PROBE_IRQ	(NR_IRQS - 32)
+#define MIN_PROBE_IRQ			(NR_IRQS - 32)
 
+#ifndef __ASSEMBLY__
 static inline int irq_canonicalize(int irq)
 {
 	return irq;
 }
-
-extern void disable_irq_nosync(unsigned int irq);
-extern void disable_irq(unsigned int irq);
-extern void enable_irq(unsigned int irq);
-
+#endif
 
 #endif /* _ASM_IRQ_H_ */
diff --git a/include/asm-frv/mb93091-fpga-irqs.h b/include/asm-frv/mb93091-fpga-irqs.h
index 341bfc52a0e..19778c5ba9d 100644
--- a/include/asm-frv/mb93091-fpga-irqs.h
+++ b/include/asm-frv/mb93091-fpga-irqs.h
@@ -12,11 +12,9 @@
 #ifndef _ASM_MB93091_FPGA_IRQS_H
 #define _ASM_MB93091_FPGA_IRQS_H
 
-#ifndef __ASSEMBLY__
-
-#include <asm/irq-routing.h>
+#include <asm/irq.h>
 
-#define IRQ_BASE_FPGA		(NR_IRQ_ACTIONS_PER_GROUP * 1)
+#ifndef __ASSEMBLY__
 
 /* IRQ IDs presented to drivers */
 enum {
diff --git a/include/asm-frv/mb93093-fpga-irqs.h b/include/asm-frv/mb93093-fpga-irqs.h
index 1e0f11c2fcd..590266b1a6d 100644
--- a/include/asm-frv/mb93093-fpga-irqs.h
+++ b/include/asm-frv/mb93093-fpga-irqs.h
@@ -12,11 +12,9 @@
 #ifndef _ASM_MB93093_FPGA_IRQS_H
 #define _ASM_MB93093_FPGA_IRQS_H
 
-#ifndef __ASSEMBLY__
-
-#include <asm/irq-routing.h>
+#include <asm/irq.h>
 
-#define IRQ_BASE_FPGA		(NR_IRQ_ACTIONS_PER_GROUP * 1)
+#ifndef __ASSEMBLY__
 
 /* IRQ IDs presented to drivers */
 enum {
diff --git a/include/asm-frv/mb93493-irqs.h b/include/asm-frv/mb93493-irqs.h
index 15096e73132..82c7aeddd33 100644
--- a/include/asm-frv/mb93493-irqs.h
+++ b/include/asm-frv/mb93493-irqs.h
@@ -12,11 +12,9 @@
 #ifndef _ASM_MB93493_IRQS_H
 #define _ASM_MB93493_IRQS_H
 
-#ifndef __ASSEMBLY__
-
-#include <asm/irq-routing.h>
+#include <asm/irq.h>
 
-#define IRQ_BASE_MB93493	(NR_IRQ_ACTIONS_PER_GROUP * 2)
+#ifndef __ASSEMBLY__
 
 /* IRQ IDs presented to drivers */
 enum {
diff --git a/include/asm-frv/mb93493-regs.h b/include/asm-frv/mb93493-regs.h
index c54aa9d1446..8a1f6aac8cf 100644
--- a/include/asm-frv/mb93493-regs.h
+++ b/include/asm-frv/mb93493-regs.h
@@ -15,6 +15,7 @@
 #include <asm/mb-regs.h>
 #include <asm/mb93493-irqs.h>
 
+#define __addr_MB93493(X)	((volatile unsigned long *)(__region_CS3 + (X)))
 #define __get_MB93493(X)	({ *(volatile unsigned long *)(__region_CS3 + (X)); })
 
 #define __set_MB93493(X,V)						\
@@ -26,6 +27,7 @@ do {									\
 #define __set_MB93493_STSR(X,V)	__set_MB93493(0x3c0 + (X) * 4, (V))
 #define MB93493_STSR_EN
 
+#define __addr_MB93493_IQSR(X)	__addr_MB93493(0x3d0 + (X) * 4)
 #define __get_MB93493_IQSR(X)	__get_MB93493(0x3d0 + (X) * 4)
 #define __set_MB93493_IQSR(X,V)	__set_MB93493(0x3d0 + (X) * 4, (V))
 
-- 
cgit v1.2.3


From 88d6e19900366781739df033e9c0e2532e715fa5 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 25 Sep 2006 23:32:06 -0700
Subject: [PATCH] FRV: improve FRV's use of generic IRQ handling

Improve FRV's use of generic IRQ handling:

 (*) Use generic_handle_irq() rather than __do_IRQ() as the latter is obsolete.

 (*) Don't implement enable() and disable() ops as these will fall back to
     using unmask() and mask().

 (*) Provide mask_ack() functions to avoid a call each to mask() and ack().

 (*) Make the cascade handlers always return IRQ_HANDLED.

 (*) Implement the mask() and unmask() functions in the same order as they're
     listed in the ops table.

Signed-off-by: David Howells <dhowells@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/frv/kernel/irq-mb93091.c | 36 +++++++++++++++++++-----------------
 arch/frv/kernel/irq-mb93093.c | 36 +++++++++++++++++++-----------------
 arch/frv/kernel/irq-mb93493.c | 35 ++++++++++++++---------------------
 arch/frv/kernel/irq.c         | 15 +--------------
 4 files changed, 53 insertions(+), 69 deletions(-)

diff --git a/arch/frv/kernel/irq-mb93091.c b/arch/frv/kernel/irq-mb93091.c
index 635d2343766..369bc0a7443 100644
--- a/arch/frv/kernel/irq-mb93091.c
+++ b/arch/frv/kernel/irq-mb93091.c
@@ -36,41 +36,45 @@
 /*
  * on-motherboard FPGA PIC operations
  */
-static void frv_fpga_enable(unsigned int irq)
+static void frv_fpga_mask(unsigned int irq)
 {
 	uint16_t imr = __get_IMR();
 
-	imr &= ~(1 << (irq - IRQ_BASE_FPGA));
+	imr |= 1 << (irq - IRQ_BASE_FPGA);
 
 	__set_IMR(imr);
 }
 
-static void frv_fpga_disable(unsigned int irq)
+static void frv_fpga_ack(unsigned int irq)
+{
+	__clr_IFR(1 << (irq - IRQ_BASE_FPGA));
+}
+
+static void frv_fpga_mask_ack(unsigned int irq)
 {
 	uint16_t imr = __get_IMR();
 
 	imr |= 1 << (irq - IRQ_BASE_FPGA);
-
 	__set_IMR(imr);
-}
 
-static void frv_fpga_ack(unsigned int irq)
-{
 	__clr_IFR(1 << (irq - IRQ_BASE_FPGA));
 }
 
-static void frv_fpga_end(unsigned int irq)
+static void frv_fpga_unmask(unsigned int irq)
 {
+	uint16_t imr = __get_IMR();
+
+	imr &= ~(1 << (irq - IRQ_BASE_FPGA));
+
+	__set_IMR(imr);
 }
 
 static struct irq_chip frv_fpga_pic = {
 	.name		= "mb93091",
-	.enable		= frv_fpga_enable,
-	.disable	= frv_fpga_disable,
 	.ack		= frv_fpga_ack,
-	.mask		= frv_fpga_disable,
-	.unmask		= frv_fpga_enable,
-	.end		= frv_fpga_end,
+	.mask		= frv_fpga_mask,
+	.mask_ack	= frv_fpga_mask_ack,
+	.unmask		= frv_fpga_unmask,
 };
 
 /*
@@ -79,7 +83,6 @@ static struct irq_chip frv_fpga_pic = {
 static irqreturn_t fpga_interrupt(int irq, void *_mask, struct pt_regs *regs)
 {
 	uint16_t imr, mask = (unsigned long) _mask;
-	irqreturn_t iret = 0;
 
 	imr = __get_IMR();
 	mask = mask & ~imr & __get_IFR();
@@ -92,11 +95,10 @@ static irqreturn_t fpga_interrupt(int irq, void *_mask, struct pt_regs *regs)
 		irq = 31 - irq;
 		mask &= ~(1 << irq);
 
-		if (__do_IRQ(IRQ_BASE_FPGA + irq, regs))
-			iret |= IRQ_HANDLED;
+		generic_handle_irq(IRQ_BASE_FPGA + irq, regs);
 	}
 
-	return iret;
+	return IRQ_HANDLED;
 }
 
 /*
diff --git a/arch/frv/kernel/irq-mb93093.c b/arch/frv/kernel/irq-mb93093.c
index f60db7988e7..a43a2215895 100644
--- a/arch/frv/kernel/irq-mb93093.c
+++ b/arch/frv/kernel/irq-mb93093.c
@@ -35,40 +35,44 @@
 /*
  * off-CPU FPGA PIC operations
  */
-static void frv_fpga_enable(unsigned int irq)
+static void frv_fpga_mask(unsigned int irq)
 {
 	uint16_t imr = __get_IMR();
 
-	imr &= ~(1 << (irq - IRQ_BASE_FPGA));
-
+	imr |= 1 << (irq - IRQ_BASE_FPGA);
 	__set_IMR(imr);
 }
 
-static void frv_fpga_disable(unsigned int irq)
+static void frv_fpga_ack(unsigned int irq)
+{
+	__clr_IFR(1 << (irq - IRQ_BASE_FPGA));
+}
+
+static void frv_fpga_mask_ack(unsigned int irq)
 {
 	uint16_t imr = __get_IMR();
 
 	imr |= 1 << (irq - IRQ_BASE_FPGA);
-
 	__set_IMR(imr);
-}
 
-static void frv_fpga_ack(unsigned int irq)
-{
 	__clr_IFR(1 << (irq - IRQ_BASE_FPGA));
 }
 
-static void frv_fpga_end(unsigned int irq)
+static void frv_fpga_unmask(unsigned int irq)
 {
+	uint16_t imr = __get_IMR();
+
+	imr &= ~(1 << (irq - IRQ_BASE_FPGA));
+
+	__set_IMR(imr);
 }
 
 static struct irq_chip frv_fpga_pic = {
 	.name		= "mb93093",
-	.enable		= frv_fpga_enable,
-	.disable	= frv_fpga_disable,
 	.ack		= frv_fpga_ack,
-	.mask		= frv_fpga_disable,
-	.unmask		= frv_fpga_enable,
+	.mask		= frv_fpga_mask,
+	.mask_ack	= frv_fpga_mask_ack,
+	.unmask		= frv_fpga_unmask,
 	.end		= frv_fpga_end,
 };
 
@@ -78,7 +82,6 @@ static struct irq_chip frv_fpga_pic = {
 static irqreturn_t fpga_interrupt(int irq, void *_mask, struct pt_regs *regs)
 {
 	uint16_t imr, mask = (unsigned long) _mask;
-	irqreturn_t iret = 0;
 
 	imr = __get_IMR();
 	mask = mask & ~imr & __get_IFR();
@@ -91,11 +94,10 @@ static irqreturn_t fpga_interrupt(int irq, void *_mask, struct pt_regs *regs)
 		irq = 31 - irq;
 		mask &= ~(1 << irq);
 
-		if (__do_IRQ(IRQ_BASE_FPGA + irq, regs))
-			iret |= IRQ_HANDLED;
+		generic_irq_handle(IRQ_BASE_FPGA + irq, regs);
 	}
 
-	return iret;
+	return IRQ_HANDLED;
 }
 
 /*
diff --git a/arch/frv/kernel/irq-mb93493.c b/arch/frv/kernel/irq-mb93493.c
index 8ad9abfc7c1..39c0188a349 100644
--- a/arch/frv/kernel/irq-mb93493.c
+++ b/arch/frv/kernel/irq-mb93493.c
@@ -43,8 +43,9 @@
 
 /*
  * daughter board PIC operations
+ * - there is no way to ACK interrupts in the MB93493 chip
  */
-static void frv_mb93493_enable(unsigned int irq)
+static void frv_mb93493_mask(unsigned int irq)
 {
 	uint32_t iqsr;
 	volatile void *piqsr;
@@ -55,11 +56,15 @@ static void frv_mb93493_enable(unsigned int irq)
 		piqsr = __addr_MB93493_IQSR(0);
 
 	iqsr = readl(piqsr);
-	iqsr |= 1 << (irq - IRQ_BASE_MB93493 + 16);
+	iqsr &= ~(1 << (irq - IRQ_BASE_MB93493 + 16));
 	writel(iqsr, piqsr);
 }
 
-static void frv_mb93493_disable(unsigned int irq)
+static void frv_mb93493_ack(unsigned int irq)
+{
+}
+
+static void frv_mb93493_unmask(unsigned int irq)
 {
 	uint32_t iqsr;
 	volatile void *piqsr;
@@ -70,26 +75,16 @@ static void frv_mb93493_disable(unsigned int irq)
 		piqsr = __addr_MB93493_IQSR(0);
 
 	iqsr = readl(piqsr);
-	iqsr &= ~(1 << (irq - IRQ_BASE_MB93493 + 16));
+	iqsr |= 1 << (irq - IRQ_BASE_MB93493 + 16);
 	writel(iqsr, piqsr);
 }
 
-static void frv_mb93493_ack(unsigned int irq)
-{
-}
-
-static void frv_mb93493_end(unsigned int irq)
-{
-}
-
 static struct irq_chip frv_mb93493_pic = {
 	.name		= "mb93093",
-	.enable		= frv_mb93493_enable,
-	.disable	= frv_mb93493_disable,
 	.ack		= frv_mb93493_ack,
-	.mask		= frv_mb93493_disable,
-	.unmask		= frv_mb93493_enable,
-	.end		= frv_mb93493_end,
+	.mask		= frv_mb93493_mask,
+	.mask_ack	= frv_mb93493_mask,
+	.unmask		= frv_mb93493_unmask,
 };
 
 /*
@@ -98,7 +93,6 @@ static struct irq_chip frv_mb93493_pic = {
 static irqreturn_t mb93493_interrupt(int irq, void *_piqsr, struct pt_regs *regs)
 {
 	volatile void *piqsr = _piqsr;
-	irqreturn_t iret = 0;
 	uint32_t iqsr;
 
 	iqsr = readl(piqsr);
@@ -112,11 +106,10 @@ static irqreturn_t mb93493_interrupt(int irq, void *_piqsr, struct pt_regs *regs
 		irq = 31 - irq;
 		iqsr &= ~(1 << irq);
 
-		if (__do_IRQ(IRQ_BASE_MB93493 + irq, regs))
-			iret |= IRQ_HANDLED;
+		generic_handle_irq(IRQ_BASE_MB93493 + irq, regs);
 	}
 
-	return iret;
+	return IRQ_HANDLED;
 }
 
 /*
diff --git a/arch/frv/kernel/irq.c b/arch/frv/kernel/irq.c
index e1ab9f2e43f..5ac041c7c0a 100644
--- a/arch/frv/kernel/irq.c
+++ b/arch/frv/kernel/irq.c
@@ -97,19 +97,8 @@ int show_interrupts(struct seq_file *p, void *v)
 /*
  * on-CPU PIC operations
  */
-static void frv_cpupic_enable(unsigned int irqlevel)
-{
-	__clr_MASK(irqlevel);
-}
-
-static void frv_cpupic_disable(unsigned int irqlevel)
-{
-	__set_MASK(irqlevel);
-}
-
 static void frv_cpupic_ack(unsigned int irqlevel)
 {
-	__set_MASK(irqlevel);
 	__clr_RC(irqlevel);
 	__clr_IRL();
 }
@@ -138,8 +127,6 @@ static void frv_cpupic_end(unsigned int irqlevel)
 
 static struct irq_chip frv_cpu_pic = {
 	.name		= "cpu",
-	.enable		= frv_cpupic_enable,
-	.disable	= frv_cpupic_disable,
 	.ack		= frv_cpupic_ack,
 	.mask		= frv_cpupic_mask,
 	.mask_ack	= frv_cpupic_mask_ack,
@@ -156,7 +143,7 @@ static struct irq_chip frv_cpu_pic = {
 asmlinkage void do_IRQ(void)
 {
 	irq_enter();
-	__do_IRQ(__get_IRL(), __frame);
+	generic_handle_irq(__get_IRL(), __frame);
 	irq_exit();
 }
 
-- 
cgit v1.2.3


From af8c65b57aaa4ae321af34dbfc5ca7f5625263fe Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 25 Sep 2006 23:32:07 -0700
Subject: [PATCH] FRV: permit __do_IRQ() to be dispensed with

Permit __do_IRQ() to be dispensed with based on a configuration option.

Signed-off-by: David Howells <dhowells@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/frv/Kconfig    | 4 ++++
 include/linux/irq.h | 6 ++++++
 kernel/irq/handle.c | 2 ++
 3 files changed, 12 insertions(+)

diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig
index 5833808fb82..f7b171b92ea 100644
--- a/arch/frv/Kconfig
+++ b/arch/frv/Kconfig
@@ -29,6 +29,10 @@ config GENERIC_HARDIRQS
 	bool
 	default y
 
+config GENERIC_HARDIRQS_NO__DO_IRQ
+	bool
+	default y
+
 config GENERIC_TIME
 	bool
 	default y
diff --git a/include/linux/irq.h b/include/linux/irq.h
index fbf6d901e9c..48d3cb3b6a4 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -320,7 +320,9 @@ handle_irq_name(void fastcall (*handle)(unsigned int, struct irq_desc *,
  * Monolithic do_IRQ implementation.
  * (is an explicit fastcall, because i386 4KSTACKS calls it from assembly)
  */
+#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
 extern fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs);
+#endif
 
 /*
  * Architectures call this to let the generic IRQ layer
@@ -332,10 +334,14 @@ static inline void generic_handle_irq(unsigned int irq, struct pt_regs *regs)
 {
 	struct irq_desc *desc = irq_desc + irq;
 
+#ifdef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
+	desc->handle_irq(irq, desc, regs);
+#else
 	if (likely(desc->handle_irq))
 		desc->handle_irq(irq, desc, regs);
 	else
 		__do_IRQ(irq, regs);
+#endif
 }
 
 /* Handling of unhandled and spurious interrupts: */
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 48a53f68af9..4c6cdbaed66 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -154,6 +154,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
 	return retval;
 }
 
+#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
 /**
  * __do_IRQ - original all in one highlevel IRQ handler
  * @irq:	the interrupt number
@@ -253,6 +254,7 @@ out:
 
 	return 1;
 }
+#endif
 
 #ifdef CONFIG_TRACE_IRQFLAGS
 
-- 
cgit v1.2.3


From 92fc707208bb2e601c24b5ab65db37bcb361b658 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 25 Sep 2006 23:32:07 -0700
Subject: [PATCH] FRV: Fix fls() to handle bit 31 being set correctly

Fix FRV fls() to handle bit 31 being set correctly (it should return 32 not 0).

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-frv/bitops.h | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/include/asm-frv/bitops.h b/include/asm-frv/bitops.h
index 980ae1b0cd2..97fb746f76c 100644
--- a/include/asm-frv/bitops.h
+++ b/include/asm-frv/bitops.h
@@ -161,16 +161,29 @@ static inline int __test_bit(int nr, const volatile void * addr)
 #include <asm-generic/bitops/__ffs.h>
 #include <asm-generic/bitops/find.h>
 
-/*
- * fls: find last bit set.
+/**
+ * fls - find last bit set
+ * @x: the word to search
+ *
+ * This is defined the same way as ffs:
+ * - return 32..1 to indicate bit 31..0 most significant bit set
+ * - return 0 to indicate no bits set
  */
 #define fls(x)						\
 ({							\
 	int bit;					\
 							\
-	asm("scan %1,gr0,%0" : "=r"(bit) : "r"(x));	\
+	asm("	subcc	%1,gr0,gr0,icc0		\n"	\
+	    "	ckne	icc0,cc4		\n"	\
+	    "	cscan.p	%1,gr0,%0	,cc4,#1	\n"	\
+	    "	csub	%0,%0,%0	,cc4,#0	\n"	\
+	    "   csub    %2,%0,%0	,cc4,#1	\n"	\
+	    : "=&r"(bit)				\
+	    : "r"(x), "r"(32)				\
+	    : "icc0", "cc4"				\
+	    );						\
 							\
-	bit ? 33 - bit : bit;				\
+	bit;						\
 })
 
 #include <asm-generic/bitops/fls64.h>
-- 
cgit v1.2.3


From a8ad27d03f17e6154c61e81d4a7028c56ca6390d Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 25 Sep 2006 23:32:08 -0700
Subject: [PATCH] FRV: Implement fls64()

Implement fls64() for FRV without recource to conditional jumps.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-frv/bitops.h | 42 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 41 insertions(+), 1 deletion(-)

diff --git a/include/asm-frv/bitops.h b/include/asm-frv/bitops.h
index 97fb746f76c..591eecc1f8c 100644
--- a/include/asm-frv/bitops.h
+++ b/include/asm-frv/bitops.h
@@ -186,7 +186,47 @@ static inline int __test_bit(int nr, const volatile void * addr)
 	bit;						\
 })
 
-#include <asm-generic/bitops/fls64.h>
+/**
+ * fls64 - find last bit set in a 64-bit value
+ * @n: the value to search
+ *
+ * This is defined the same way as ffs:
+ * - return 64..1 to indicate bit 63..0 most significant bit set
+ * - return 0 to indicate no bits set
+ */
+static inline __attribute__((const))
+int fls64(u64 n)
+{
+	union {
+		u64 ll;
+		struct { u32 h, l; };
+	} _;
+	int bit, x, y;
+
+	_.ll = n;
+
+	asm("	subcc.p		%3,gr0,gr0,icc0		\n"
+	    "	subcc		%4,gr0,gr0,icc1		\n"
+	    "	ckne		icc0,cc4		\n"
+	    "	ckne		icc1,cc5		\n"
+	    "	norcr		cc4,cc5,cc6		\n"
+	    "	csub.p		%0,%0,%0	,cc6,1	\n"
+	    "	orcr		cc5,cc4,cc4		\n"
+	    "	andcr		cc4,cc5,cc4		\n"
+	    "	cscan.p		%3,gr0,%0	,cc4,0	\n"
+	    "   setlos		#64,%1			\n"
+	    "	cscan.p		%4,gr0,%0	,cc4,1	\n"
+	    "   setlos		#32,%2			\n"
+	    "	csub.p		%1,%0,%0	,cc4,0	\n"
+	    "	csub		%2,%0,%0	,cc4,1	\n"
+	    : "=&r"(bit), "=r"(x), "=r"(y)
+	    : "0r"(_.h), "r"(_.l)
+	    : "icc0", "icc1", "cc4", "cc5", "cc6"
+	    );
+	return bit;
+
+}
+
 #include <asm-generic/bitops/sched.h>
 #include <asm-generic/bitops/hweight.h>
 
-- 
cgit v1.2.3


From cf134483b2cd657039b305777215c531a1009947 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 25 Sep 2006 23:32:09 -0700
Subject: [PATCH] FRV: Optimise ffs()

Optimise ffs(x) by using fls(x & x - 1) which we optimise to use the SCAN
instruction.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-frv/bitops.h | 33 +++++++++++++++++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)

diff --git a/include/asm-frv/bitops.h b/include/asm-frv/bitops.h
index 591eecc1f8c..1f70d47148b 100644
--- a/include/asm-frv/bitops.h
+++ b/include/asm-frv/bitops.h
@@ -157,8 +157,6 @@ static inline int __test_bit(int nr, const volatile void * addr)
  __constant_test_bit((nr),(addr)) : \
  __test_bit((nr),(addr)))
 
-#include <asm-generic/bitops/ffs.h>
-#include <asm-generic/bitops/__ffs.h>
 #include <asm-generic/bitops/find.h>
 
 /**
@@ -227,6 +225,37 @@ int fls64(u64 n)
 
 }
 
+/**
+ * ffs - find first bit set
+ * @x: the word to search
+ *
+ * - return 32..1 to indicate bit 31..0 most least significant bit set
+ * - return 0 to indicate no bits set
+ */
+static inline __attribute__((const))
+int ffs(int x)
+{
+	/* Note: (x & -x) gives us a mask that is the least significant
+	 * (rightmost) 1-bit of the value in x.
+	 */
+	return fls(x & -x);
+}
+
+/**
+ * __ffs - find first bit set
+ * @x: the word to search
+ *
+ * - return 31..0 to indicate bit 31..0 most least significant bit set
+ * - if no bits are set in x, the result is undefined
+ */
+static inline __attribute__((const))
+int __ffs(unsigned long x)
+{
+	int bit;
+	asm("scan %1,gr0,%0" : "=r"(bit) : "r"(x & -x));
+	return 31 - bit;
+}
+
 #include <asm-generic/bitops/sched.h>
 #include <asm-generic/bitops/hweight.h>
 
-- 
cgit v1.2.3


From 53e62d3aaa60590d4a69b4e07c29f448b5151047 Mon Sep 17 00:00:00 2001
From: Ralf Baechle <ralf@linux-mips.org>
Date: Mon, 25 Sep 2006 23:32:10 -0700
Subject: [PATCH] Alchemy: Delete unused pt_regs * argument from
 au1xxx_dbdma_chan_alloc

The third argument of au1xxx_dbdma_chan_alloc's callback function is not
used anywhere.

Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/mips/au1000/common/dbdma.c             | 10 ++++++----
 drivers/ide/mips/au1xxx-ide.c               |  4 ++--
 drivers/mmc/au1xmmc.c                       |  2 +-
 include/asm-mips/mach-au1x00/au1xxx_dbdma.h |  6 +++---
 sound/oss/au1550_ac97.c                     |  6 ++----
 5 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/arch/mips/au1000/common/dbdma.c b/arch/mips/au1000/common/dbdma.c
index 98244d51c15..c4fae8ff467 100644
--- a/arch/mips/au1000/common/dbdma.c
+++ b/arch/mips/au1000/common/dbdma.c
@@ -230,7 +230,7 @@ EXPORT_SYMBOL(au1xxx_ddma_add_device);
 */
 u32
 au1xxx_dbdma_chan_alloc(u32 srcid, u32 destid,
-       void (*callback)(int, void *, struct pt_regs *), void *callparam)
+       void (*callback)(int, void *), void *callparam)
 {
 	unsigned long   flags;
 	u32		used, chan, rv;
@@ -248,8 +248,10 @@ au1xxx_dbdma_chan_alloc(u32 srcid, u32 destid,
 		au1xxx_dbdma_init();
 	dbdma_initialized = 1;
 
-	if ((stp = find_dbdev_id(srcid)) == NULL) return 0;
-	if ((dtp = find_dbdev_id(destid)) == NULL) return 0;
+	if ((stp = find_dbdev_id(srcid)) == NULL)
+		return 0;
+	if ((dtp = find_dbdev_id(destid)) == NULL)
+		return 0;
 
 	used = 0;
 	rv = 0;
@@ -869,7 +871,7 @@ dbdma_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 	au_sync();
 
 	if (ctp->chan_callback)
-		(ctp->chan_callback)(irq, ctp->chan_callparam, regs);
+		(ctp->chan_callback)(irq, ctp->chan_callparam);
 
 	ctp->cur_ptr = phys_to_virt(DSCR_GET_NXTPTR(dp->dscr_nxtptr));
 	return IRQ_RETVAL(1);
diff --git a/drivers/ide/mips/au1xxx-ide.c b/drivers/ide/mips/au1xxx-ide.c
index 71f27e955d8..c7854ea57b5 100644
--- a/drivers/ide/mips/au1xxx-ide.c
+++ b/drivers/ide/mips/au1xxx-ide.c
@@ -476,13 +476,13 @@ static int auide_dma_lostirq(ide_drive_t *drive)
 	return 0;
 }
 
-static void auide_ddma_tx_callback(int irq, void *param, struct pt_regs *regs)
+static void auide_ddma_tx_callback(int irq, void *param)
 {
 	_auide_hwif *ahwif = (_auide_hwif*)param;
 	ahwif->drive->waiting_for_dma = 0;
 }
 
-static void auide_ddma_rx_callback(int irq, void *param, struct pt_regs *regs)
+static void auide_ddma_rx_callback(int irq, void *param)
 {
 	_auide_hwif *ahwif = (_auide_hwif*)param;
 	ahwif->drive->waiting_for_dma = 0;
diff --git a/drivers/mmc/au1xmmc.c b/drivers/mmc/au1xmmc.c
index fb606165af3..61268da1395 100644
--- a/drivers/mmc/au1xmmc.c
+++ b/drivers/mmc/au1xmmc.c
@@ -731,7 +731,7 @@ static void au1xmmc_set_ios(struct mmc_host* mmc, struct mmc_ios* ios)
 	}
 }
 
-static void au1xmmc_dma_callback(int irq, void *dev_id, struct pt_regs *regs)
+static void au1xmmc_dma_callback(int irq, void *dev_id)
 {
 	struct au1xmmc_host *host = (struct au1xmmc_host *) dev_id;
 
diff --git a/include/asm-mips/mach-au1x00/au1xxx_dbdma.h b/include/asm-mips/mach-au1x00/au1xxx_dbdma.h
index d5b38a247e5..eeb0c3115b6 100644
--- a/include/asm-mips/mach-au1x00/au1xxx_dbdma.h
+++ b/include/asm-mips/mach-au1x00/au1xxx_dbdma.h
@@ -316,7 +316,7 @@ typedef struct dbdma_chan_config {
 	au1x_ddma_desc_t	*chan_desc_base;
 	au1x_ddma_desc_t	*get_ptr, *put_ptr, *cur_ptr;
 	void			*chan_callparam;
-	void (*chan_callback)(int, void *, struct pt_regs *);
+	void (*chan_callback)(int, void *);
 } chan_tab_t;
 
 #define DEV_FLAGS_INUSE		(1 << 0)
@@ -334,8 +334,8 @@ typedef struct dbdma_chan_config {
  * meaningful name.  The 'callback' is called during dma completion
  * interrupt.
  */
-u32 au1xxx_dbdma_chan_alloc(u32 srcid, u32 destid,
-       void (*callback)(int, void *, struct pt_regs *), void *callparam);
+extern u32 au1xxx_dbdma_chan_alloc(u32 srcid, u32 destid,
+	void (*callback)(int, void *), void *callparam);
 
 #define DBDMA_MEM_CHAN	DSCR_CMD0_ALWAYS
 
diff --git a/sound/oss/au1550_ac97.c b/sound/oss/au1550_ac97.c
index 4cdb86252d6..219795171c7 100644
--- a/sound/oss/au1550_ac97.c
+++ b/sound/oss/au1550_ac97.c
@@ -719,8 +719,7 @@ prog_dmabuf_dac(struct au1550_state *s)
 }
 
 
-static void
-dac_dma_interrupt(int irq, void *dev_id, struct pt_regs *regs)
+static void dac_dma_interrupt(int irq, void *dev_id)
 {
 	struct au1550_state *s = (struct au1550_state *) dev_id;
 	struct dmabuf  *db = &s->dma_dac;
@@ -754,8 +753,7 @@ dac_dma_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 }
 
 
-static void
-adc_dma_interrupt(int irq, void *dev_id, struct pt_regs *regs)
+static void adc_dma_interrupt(int irq, void *dev_id)
 {
 	struct	au1550_state *s = (struct au1550_state *)dev_id;
 	struct	dmabuf  *dp = &s->dma_adc;
-- 
cgit v1.2.3


From 5f97f7f9400de47ae837170bb274e90ad3934386 Mon Sep 17 00:00:00 2001
From: Haavard Skinnemoen <hskinnemoen@atmel.com>
Date: Mon, 25 Sep 2006 23:32:13 -0700
Subject: [PATCH] avr32 architecture

This adds support for the Atmel AVR32 architecture as well as the AT32AP7000
CPU and the AT32STK1000 development board.

AVR32 is a new high-performance 32-bit RISC microprocessor core, designed for
cost-sensitive embedded applications, with particular emphasis on low power
consumption and high code density.  The AVR32 architecture is not binary
compatible with earlier 8-bit AVR architectures.

The AVR32 architecture, including the instruction set, is described by the
AVR32 Architecture Manual, available from

http://www.atmel.com/dyn/resources/prod_documents/doc32000.pdf

The Atmel AT32AP7000 is the first CPU implementing the AVR32 architecture.  It
features a 7-stage pipeline, 16KB instruction and data caches and a full
Memory Management Unit.  It also comes with a large set of integrated
peripherals, many of which are shared with the AT91 ARM-based controllers from
Atmel.

Full data sheet is available from

http://www.atmel.com/dyn/resources/prod_documents/doc32003.pdf

while the CPU core implementation including caches and MMU is documented by
the AVR32 AP Technical Reference, available from

http://www.atmel.com/dyn/resources/prod_documents/doc32001.pdf

Information about the AT32STK1000 development board can be found at

http://www.atmel.com/dyn/products/tools_card.asp?tool_id=3918

including a BSP CD image with an earlier version of this patch, development
tools (binaries and source/patches) and a root filesystem image suitable for
booting from SD card.

Alternatively, there's a preliminary "getting started" guide available at
http://avr32linux.org/twiki/bin/view/Main/GettingStarted which provides links
to the sources and patches you will need in order to set up a cross-compiling
environment for avr32-linux.

This patch, as well as the other patches included with the BSP and the
toolchain patches, is actively supported by Atmel Corporation.

[dmccr@us.ibm.com: Fix more pxx_page macro locations]
[bunk@stusta.de: fix `make defconfig']
Signed-off-by: Haavard Skinnemoen <hskinnemoen@atmel.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Dave McCracken <dmccr@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 MAINTAINERS                                      |  17 +
 arch/avr32/Kconfig                               | 196 +++++
 arch/avr32/Kconfig.debug                         |  19 +
 arch/avr32/Makefile                              |  84 +++
 arch/avr32/boards/atstk1000/Makefile             |   2 +
 arch/avr32/boards/atstk1000/atstk1002.c          |  37 +
 arch/avr32/boards/atstk1000/setup.c              |  59 ++
 arch/avr32/boards/atstk1000/spi.c                |  27 +
 arch/avr32/boot/images/Makefile                  |  62 ++
 arch/avr32/boot/u-boot/Makefile                  |   3 +
 arch/avr32/boot/u-boot/empty.S                   |   1 +
 arch/avr32/boot/u-boot/head.S                    |  60 ++
 arch/avr32/configs/atstk1002_defconfig           | 754 ++++++++++++++++++++
 arch/avr32/kernel/Makefile                       |  18 +
 arch/avr32/kernel/asm-offsets.c                  |  25 +
 arch/avr32/kernel/avr32_ksyms.c                  |  55 ++
 arch/avr32/kernel/cpu.c                          | 327 +++++++++
 arch/avr32/kernel/entry-avr32b.S                 | 678 ++++++++++++++++++
 arch/avr32/kernel/head.S                         |  45 ++
 arch/avr32/kernel/init_task.c                    |  38 +
 arch/avr32/kernel/irq.c                          |  71 ++
 arch/avr32/kernel/kprobes.c                      | 270 +++++++
 arch/avr32/kernel/module.c                       | 324 +++++++++
 arch/avr32/kernel/process.c                      | 276 ++++++++
 arch/avr32/kernel/ptrace.c                       | 371 ++++++++++
 arch/avr32/kernel/semaphore.c                    | 148 ++++
 arch/avr32/kernel/setup.c                        | 335 +++++++++
 arch/avr32/kernel/signal.c                       | 328 +++++++++
 arch/avr32/kernel/switch_to.S                    |  35 +
 arch/avr32/kernel/sys_avr32.c                    |  51 ++
 arch/avr32/kernel/syscall-stubs.S                | 102 +++
 arch/avr32/kernel/syscall_table.S                | 289 ++++++++
 arch/avr32/kernel/time.c                         | 238 +++++++
 arch/avr32/kernel/traps.c                        | 425 +++++++++++
 arch/avr32/kernel/vmlinux.lds.c                  | 139 ++++
 arch/avr32/lib/Makefile                          |  10 +
 arch/avr32/lib/__avr32_asr64.S                   |  31 +
 arch/avr32/lib/__avr32_lsl64.S                   |  31 +
 arch/avr32/lib/__avr32_lsr64.S                   |  31 +
 arch/avr32/lib/clear_user.S                      |  76 ++
 arch/avr32/lib/copy_user.S                       | 119 ++++
 arch/avr32/lib/csum_partial.S                    |  47 ++
 arch/avr32/lib/csum_partial_copy_generic.S       |  99 +++
 arch/avr32/lib/delay.c                           |  55 ++
 arch/avr32/lib/findbit.S                         | 154 ++++
 arch/avr32/lib/io-readsl.S                       |  24 +
 arch/avr32/lib/io-readsw.S                       |  43 ++
 arch/avr32/lib/io-writesl.S                      |  20 +
 arch/avr32/lib/io-writesw.S                      |  38 +
 arch/avr32/lib/libgcc.h                          |  33 +
 arch/avr32/lib/longlong.h                        |  98 +++
 arch/avr32/lib/memcpy.S                          |  62 ++
 arch/avr32/lib/memset.S                          |  72 ++
 arch/avr32/lib/strncpy_from_user.S               |  60 ++
 arch/avr32/lib/strnlen_user.S                    |  67 ++
 arch/avr32/mach-at32ap/Makefile                  |   2 +
 arch/avr32/mach-at32ap/at32ap.c                  |  90 +++
 arch/avr32/mach-at32ap/at32ap7000.c              | 866 +++++++++++++++++++++++
 arch/avr32/mach-at32ap/clock.c                   | 148 ++++
 arch/avr32/mach-at32ap/clock.h                   |  30 +
 arch/avr32/mach-at32ap/extint.c                  | 171 +++++
 arch/avr32/mach-at32ap/intc.c                    | 133 ++++
 arch/avr32/mach-at32ap/intc.h                    | 327 +++++++++
 arch/avr32/mach-at32ap/pio.c                     | 118 +++
 arch/avr32/mach-at32ap/pio.h                     | 178 +++++
 arch/avr32/mach-at32ap/sm.c                      | 289 ++++++++
 arch/avr32/mach-at32ap/sm.h                      | 240 +++++++
 arch/avr32/mm/Makefile                           |   6 +
 arch/avr32/mm/cache.c                            | 150 ++++
 arch/avr32/mm/clear_page.S                       |  25 +
 arch/avr32/mm/copy_page.S                        |  28 +
 arch/avr32/mm/dma-coherent.c                     | 139 ++++
 arch/avr32/mm/fault.c                            | 315 +++++++++
 arch/avr32/mm/init.c                             | 480 +++++++++++++
 arch/avr32/mm/ioremap.c                          | 197 ++++++
 arch/avr32/mm/tlb.c                              | 378 ++++++++++
 include/asm-avr32/Kbuild                         |   3 +
 include/asm-avr32/a.out.h                        |  26 +
 include/asm-avr32/addrspace.h                    |  43 ++
 include/asm-avr32/arch-at32ap/at91rm9200_pdc.h   |  36 +
 include/asm-avr32/arch-at32ap/at91rm9200_usart.h | 123 ++++
 include/asm-avr32/arch-at32ap/board.h            |  35 +
 include/asm-avr32/arch-at32ap/init.h             |  21 +
 include/asm-avr32/arch-at32ap/portmux.h          |  16 +
 include/asm-avr32/arch-at32ap/sm.h               |  27 +
 include/asm-avr32/asm.h                          | 102 +++
 include/asm-avr32/atomic.h                       | 201 ++++++
 include/asm-avr32/auxvec.h                       |   4 +
 include/asm-avr32/bitops.h                       | 296 ++++++++
 include/asm-avr32/bug.h                          |  47 ++
 include/asm-avr32/bugs.h                         |  15 +
 include/asm-avr32/byteorder.h                    |  25 +
 include/asm-avr32/cache.h                        |  29 +
 include/asm-avr32/cachectl.h                     |  11 +
 include/asm-avr32/cacheflush.h                   | 129 ++++
 include/asm-avr32/checksum.h                     | 156 ++++
 include/asm-avr32/cputime.h                      |   6 +
 include/asm-avr32/current.h                      |  15 +
 include/asm-avr32/delay.h                        |  26 +
 include/asm-avr32/div64.h                        |   6 +
 include/asm-avr32/dma-mapping.h                  | 320 +++++++++
 include/asm-avr32/dma.h                          |   8 +
 include/asm-avr32/elf.h                          | 110 +++
 include/asm-avr32/emergency-restart.h            |   6 +
 include/asm-avr32/errno.h                        |   6 +
 include/asm-avr32/fcntl.h                        |   6 +
 include/asm-avr32/futex.h                        |   6 +
 include/asm-avr32/hardirq.h                      |  34 +
 include/asm-avr32/hw_irq.h                       |   9 +
 include/asm-avr32/intc.h                         | 128 ++++
 include/asm-avr32/io.h                           | 253 +++++++
 include/asm-avr32/ioctl.h                        |   6 +
 include/asm-avr32/ioctls.h                       |  83 +++
 include/asm-avr32/ipcbuf.h                       |  29 +
 include/asm-avr32/irq.h                          |  10 +
 include/asm-avr32/irqflags.h                     |  68 ++
 include/asm-avr32/kdebug.h                       |  38 +
 include/asm-avr32/kmap_types.h                   |  30 +
 include/asm-avr32/kprobes.h                      |  34 +
 include/asm-avr32/linkage.h                      |   7 +
 include/asm-avr32/local.h                        |   6 +
 include/asm-avr32/mach/serial_at91.h             |  33 +
 include/asm-avr32/mman.h                         |  17 +
 include/asm-avr32/mmu.h                          |  10 +
 include/asm-avr32/mmu_context.h                  | 148 ++++
 include/asm-avr32/module.h                       |  28 +
 include/asm-avr32/msgbuf.h                       |  31 +
 include/asm-avr32/mutex.h                        |   9 +
 include/asm-avr32/namei.h                        |   7 +
 include/asm-avr32/numnodes.h                     |   7 +
 include/asm-avr32/ocd.h                          |  78 ++
 include/asm-avr32/page.h                         | 112 +++
 include/asm-avr32/param.h                        |  23 +
 include/asm-avr32/pci.h                          |   8 +
 include/asm-avr32/percpu.h                       |   6 +
 include/asm-avr32/pgalloc.h                      |  96 +++
 include/asm-avr32/pgtable-2level.h               |  47 ++
 include/asm-avr32/pgtable.h                      | 408 +++++++++++
 include/asm-avr32/poll.h                         |  27 +
 include/asm-avr32/posix_types.h                  | 129 ++++
 include/asm-avr32/processor.h                    | 147 ++++
 include/asm-avr32/ptrace.h                       | 154 ++++
 include/asm-avr32/resource.h                     |   6 +
 include/asm-avr32/scatterlist.h                  |  21 +
 include/asm-avr32/sections.h                     |   6 +
 include/asm-avr32/semaphore.h                    | 109 +++
 include/asm-avr32/sembuf.h                       |  25 +
 include/asm-avr32/setup.h                        | 141 ++++
 include/asm-avr32/shmbuf.h                       |  42 ++
 include/asm-avr32/shmparam.h                     |   6 +
 include/asm-avr32/sigcontext.h                   |  34 +
 include/asm-avr32/siginfo.h                      |   6 +
 include/asm-avr32/signal.h                       | 168 +++++
 include/asm-avr32/socket.h                       |  53 ++
 include/asm-avr32/sockios.h                      |  12 +
 include/asm-avr32/stat.h                         |  79 +++
 include/asm-avr32/statfs.h                       |   6 +
 include/asm-avr32/string.h                       |  17 +
 include/asm-avr32/sysreg.h                       | 332 +++++++++
 include/asm-avr32/system.h                       | 155 ++++
 include/asm-avr32/termbits.h                     | 173 +++++
 include/asm-avr32/termios.h                      |  80 +++
 include/asm-avr32/thread_info.h                  | 106 +++
 include/asm-avr32/timex.h                        |  40 ++
 include/asm-avr32/tlb.h                          |  32 +
 include/asm-avr32/tlbflush.h                     |  40 ++
 include/asm-avr32/topology.h                     |   6 +
 include/asm-avr32/traps.h                        |  23 +
 include/asm-avr32/types.h                        |  70 ++
 include/asm-avr32/uaccess.h                      | 335 +++++++++
 include/asm-avr32/ucontext.h                     |  12 +
 include/asm-avr32/unaligned.h                    |  25 +
 include/asm-avr32/unistd.h                       | 387 ++++++++++
 include/asm-avr32/user.h                         |  65 ++
 include/linux/elf-em.h                           |   1 +
 lib/Kconfig.debug                                |   4 +-
 176 files changed, 18124 insertions(+), 2 deletions(-)
 create mode 100644 arch/avr32/Kconfig
 create mode 100644 arch/avr32/Kconfig.debug
 create mode 100644 arch/avr32/Makefile
 create mode 100644 arch/avr32/boards/atstk1000/Makefile
 create mode 100644 arch/avr32/boards/atstk1000/atstk1002.c
 create mode 100644 arch/avr32/boards/atstk1000/setup.c
 create mode 100644 arch/avr32/boards/atstk1000/spi.c
 create mode 100644 arch/avr32/boot/images/Makefile
 create mode 100644 arch/avr32/boot/u-boot/Makefile
 create mode 100644 arch/avr32/boot/u-boot/empty.S
 create mode 100644 arch/avr32/boot/u-boot/head.S
 create mode 100644 arch/avr32/configs/atstk1002_defconfig
 create mode 100644 arch/avr32/kernel/Makefile
 create mode 100644 arch/avr32/kernel/asm-offsets.c
 create mode 100644 arch/avr32/kernel/avr32_ksyms.c
 create mode 100644 arch/avr32/kernel/cpu.c
 create mode 100644 arch/avr32/kernel/entry-avr32b.S
 create mode 100644 arch/avr32/kernel/head.S
 create mode 100644 arch/avr32/kernel/init_task.c
 create mode 100644 arch/avr32/kernel/irq.c
 create mode 100644 arch/avr32/kernel/kprobes.c
 create mode 100644 arch/avr32/kernel/module.c
 create mode 100644 arch/avr32/kernel/process.c
 create mode 100644 arch/avr32/kernel/ptrace.c
 create mode 100644 arch/avr32/kernel/semaphore.c
 create mode 100644 arch/avr32/kernel/setup.c
 create mode 100644 arch/avr32/kernel/signal.c
 create mode 100644 arch/avr32/kernel/switch_to.S
 create mode 100644 arch/avr32/kernel/sys_avr32.c
 create mode 100644 arch/avr32/kernel/syscall-stubs.S
 create mode 100644 arch/avr32/kernel/syscall_table.S
 create mode 100644 arch/avr32/kernel/time.c
 create mode 100644 arch/avr32/kernel/traps.c
 create mode 100644 arch/avr32/kernel/vmlinux.lds.c
 create mode 100644 arch/avr32/lib/Makefile
 create mode 100644 arch/avr32/lib/__avr32_asr64.S
 create mode 100644 arch/avr32/lib/__avr32_lsl64.S
 create mode 100644 arch/avr32/lib/__avr32_lsr64.S
 create mode 100644 arch/avr32/lib/clear_user.S
 create mode 100644 arch/avr32/lib/copy_user.S
 create mode 100644 arch/avr32/lib/csum_partial.S
 create mode 100644 arch/avr32/lib/csum_partial_copy_generic.S
 create mode 100644 arch/avr32/lib/delay.c
 create mode 100644 arch/avr32/lib/findbit.S
 create mode 100644 arch/avr32/lib/io-readsl.S
 create mode 100644 arch/avr32/lib/io-readsw.S
 create mode 100644 arch/avr32/lib/io-writesl.S
 create mode 100644 arch/avr32/lib/io-writesw.S
 create mode 100644 arch/avr32/lib/libgcc.h
 create mode 100644 arch/avr32/lib/longlong.h
 create mode 100644 arch/avr32/lib/memcpy.S
 create mode 100644 arch/avr32/lib/memset.S
 create mode 100644 arch/avr32/lib/strncpy_from_user.S
 create mode 100644 arch/avr32/lib/strnlen_user.S
 create mode 100644 arch/avr32/mach-at32ap/Makefile
 create mode 100644 arch/avr32/mach-at32ap/at32ap.c
 create mode 100644 arch/avr32/mach-at32ap/at32ap7000.c
 create mode 100644 arch/avr32/mach-at32ap/clock.c
 create mode 100644 arch/avr32/mach-at32ap/clock.h
 create mode 100644 arch/avr32/mach-at32ap/extint.c
 create mode 100644 arch/avr32/mach-at32ap/intc.c
 create mode 100644 arch/avr32/mach-at32ap/intc.h
 create mode 100644 arch/avr32/mach-at32ap/pio.c
 create mode 100644 arch/avr32/mach-at32ap/pio.h
 create mode 100644 arch/avr32/mach-at32ap/sm.c
 create mode 100644 arch/avr32/mach-at32ap/sm.h
 create mode 100644 arch/avr32/mm/Makefile
 create mode 100644 arch/avr32/mm/cache.c
 create mode 100644 arch/avr32/mm/clear_page.S
 create mode 100644 arch/avr32/mm/copy_page.S
 create mode 100644 arch/avr32/mm/dma-coherent.c
 create mode 100644 arch/avr32/mm/fault.c
 create mode 100644 arch/avr32/mm/init.c
 create mode 100644 arch/avr32/mm/ioremap.c
 create mode 100644 arch/avr32/mm/tlb.c
 create mode 100644 include/asm-avr32/Kbuild
 create mode 100644 include/asm-avr32/a.out.h
 create mode 100644 include/asm-avr32/addrspace.h
 create mode 100644 include/asm-avr32/arch-at32ap/at91rm9200_pdc.h
 create mode 100644 include/asm-avr32/arch-at32ap/at91rm9200_usart.h
 create mode 100644 include/asm-avr32/arch-at32ap/board.h
 create mode 100644 include/asm-avr32/arch-at32ap/init.h
 create mode 100644 include/asm-avr32/arch-at32ap/portmux.h
 create mode 100644 include/asm-avr32/arch-at32ap/sm.h
 create mode 100644 include/asm-avr32/asm.h
 create mode 100644 include/asm-avr32/atomic.h
 create mode 100644 include/asm-avr32/auxvec.h
 create mode 100644 include/asm-avr32/bitops.h
 create mode 100644 include/asm-avr32/bug.h
 create mode 100644 include/asm-avr32/bugs.h
 create mode 100644 include/asm-avr32/byteorder.h
 create mode 100644 include/asm-avr32/cache.h
 create mode 100644 include/asm-avr32/cachectl.h
 create mode 100644 include/asm-avr32/cacheflush.h
 create mode 100644 include/asm-avr32/checksum.h
 create mode 100644 include/asm-avr32/cputime.h
 create mode 100644 include/asm-avr32/current.h
 create mode 100644 include/asm-avr32/delay.h
 create mode 100644 include/asm-avr32/div64.h
 create mode 100644 include/asm-avr32/dma-mapping.h
 create mode 100644 include/asm-avr32/dma.h
 create mode 100644 include/asm-avr32/elf.h
 create mode 100644 include/asm-avr32/emergency-restart.h
 create mode 100644 include/asm-avr32/errno.h
 create mode 100644 include/asm-avr32/fcntl.h
 create mode 100644 include/asm-avr32/futex.h
 create mode 100644 include/asm-avr32/hardirq.h
 create mode 100644 include/asm-avr32/hw_irq.h
 create mode 100644 include/asm-avr32/intc.h
 create mode 100644 include/asm-avr32/io.h
 create mode 100644 include/asm-avr32/ioctl.h
 create mode 100644 include/asm-avr32/ioctls.h
 create mode 100644 include/asm-avr32/ipcbuf.h
 create mode 100644 include/asm-avr32/irq.h
 create mode 100644 include/asm-avr32/irqflags.h
 create mode 100644 include/asm-avr32/kdebug.h
 create mode 100644 include/asm-avr32/kmap_types.h
 create mode 100644 include/asm-avr32/kprobes.h
 create mode 100644 include/asm-avr32/linkage.h
 create mode 100644 include/asm-avr32/local.h
 create mode 100644 include/asm-avr32/mach/serial_at91.h
 create mode 100644 include/asm-avr32/mman.h
 create mode 100644 include/asm-avr32/mmu.h
 create mode 100644 include/asm-avr32/mmu_context.h
 create mode 100644 include/asm-avr32/module.h
 create mode 100644 include/asm-avr32/msgbuf.h
 create mode 100644 include/asm-avr32/mutex.h
 create mode 100644 include/asm-avr32/namei.h
 create mode 100644 include/asm-avr32/numnodes.h
 create mode 100644 include/asm-avr32/ocd.h
 create mode 100644 include/asm-avr32/page.h
 create mode 100644 include/asm-avr32/param.h
 create mode 100644 include/asm-avr32/pci.h
 create mode 100644 include/asm-avr32/percpu.h
 create mode 100644 include/asm-avr32/pgalloc.h
 create mode 100644 include/asm-avr32/pgtable-2level.h
 create mode 100644 include/asm-avr32/pgtable.h
 create mode 100644 include/asm-avr32/poll.h
 create mode 100644 include/asm-avr32/posix_types.h
 create mode 100644 include/asm-avr32/processor.h
 create mode 100644 include/asm-avr32/ptrace.h
 create mode 100644 include/asm-avr32/resource.h
 create mode 100644 include/asm-avr32/scatterlist.h
 create mode 100644 include/asm-avr32/sections.h
 create mode 100644 include/asm-avr32/semaphore.h
 create mode 100644 include/asm-avr32/sembuf.h
 create mode 100644 include/asm-avr32/setup.h
 create mode 100644 include/asm-avr32/shmbuf.h
 create mode 100644 include/asm-avr32/shmparam.h
 create mode 100644 include/asm-avr32/sigcontext.h
 create mode 100644 include/asm-avr32/siginfo.h
 create mode 100644 include/asm-avr32/signal.h
 create mode 100644 include/asm-avr32/socket.h
 create mode 100644 include/asm-avr32/sockios.h
 create mode 100644 include/asm-avr32/stat.h
 create mode 100644 include/asm-avr32/statfs.h
 create mode 100644 include/asm-avr32/string.h
 create mode 100644 include/asm-avr32/sysreg.h
 create mode 100644 include/asm-avr32/system.h
 create mode 100644 include/asm-avr32/termbits.h
 create mode 100644 include/asm-avr32/termios.h
 create mode 100644 include/asm-avr32/thread_info.h
 create mode 100644 include/asm-avr32/timex.h
 create mode 100644 include/asm-avr32/tlb.h
 create mode 100644 include/asm-avr32/tlbflush.h
 create mode 100644 include/asm-avr32/topology.h
 create mode 100644 include/asm-avr32/traps.h
 create mode 100644 include/asm-avr32/types.h
 create mode 100644 include/asm-avr32/uaccess.h
 create mode 100644 include/asm-avr32/ucontext.h
 create mode 100644 include/asm-avr32/unaligned.h
 create mode 100644 include/asm-avr32/unistd.h
 create mode 100644 include/asm-avr32/user.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 23348c0d37b..767a43434ae 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -443,6 +443,23 @@ W:	http://people.redhat.com/sgrubb/audit/
 T:	git kernel.org:/pub/scm/linux/kernel/git/dwmw2/audit-2.6.git
 S:	Maintained
 
+AVR32 ARCHITECTURE
+P:	Atmel AVR32 Support Team
+M:	avr32@atmel.com
+P:	Haavard Skinnemoen
+M:	hskinnemoen@atmel.com
+W:	http://www.atmel.com/products/AVR32/
+W:	http://avr32linux.org/
+W:	http://avrfreaks.net/
+S:	Supported
+
+AVR32/AT32AP MACHINE SUPPORT
+P:	Atmel AVR32 Support Team
+M:	avr32@atmel.com
+P:	Haavard Skinnemoen
+M:	hskinnemoen@atmel.com
+S:	Supported
+
 AX.25 NETWORK LAYER
 P:	Ralf Baechle
 M:	ralf@linux-mips.org
diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig
new file mode 100644
index 00000000000..5f1694eea84
--- /dev/null
+++ b/arch/avr32/Kconfig
@@ -0,0 +1,196 @@
+#
+# For a description of the syntax of this configuration file,
+# see Documentation/kbuild/kconfig-language.txt.
+#
+
+mainmenu "Linux Kernel Configuration"
+
+config AVR32
+	bool
+	default y
+	# With EMBEDDED=n, we get lots of stuff automatically selected
+	# that we usually don't need on AVR32.
+	select EMBEDDED
+	help
+	  AVR32 is a high-performance 32-bit RISC microprocessor core,
+	  designed for cost-sensitive embedded applications, with particular
+	  emphasis on low power consumption and high code density.
+
+	  There is an AVR32 Linux project with a web page at
+	  http://avr32linux.org/.
+
+config UID16
+	bool
+
+config GENERIC_HARDIRQS
+	bool
+	default y
+
+config HARDIRQS_SW_RESEND
+	bool
+	default y
+
+config GENERIC_IRQ_PROBE
+	bool
+	default y
+
+config RWSEM_GENERIC_SPINLOCK
+	bool
+	default y
+
+config GENERIC_TIME
+	bool
+	default y
+
+config RWSEM_XCHGADD_ALGORITHM
+	bool
+
+config GENERIC_BUST_SPINLOCK
+	bool
+
+config GENERIC_HWEIGHT
+	bool
+	default y
+
+config GENERIC_CALIBRATE_DELAY
+	bool
+	default y
+
+source "init/Kconfig"
+
+menu "System Type and features"
+
+config SUBARCH_AVR32B
+	bool
+config MMU
+	bool
+config PERFORMANCE_COUNTERS
+	bool
+
+config PLATFORM_AT32AP
+	bool
+	select SUBARCH_AVR32B
+	select MMU
+	select PERFORMANCE_COUNTERS
+
+choice
+	prompt "AVR32 CPU type"
+	default CPU_AT32AP7000
+
+config CPU_AT32AP7000
+	bool "AT32AP7000"
+	select PLATFORM_AT32AP
+endchoice
+
+#
+# CPU Daughterboards for ATSTK1000
+config BOARD_ATSTK1002
+	bool
+
+choice
+	prompt "AVR32 board type"
+	default BOARD_ATSTK1000
+
+config BOARD_ATSTK1000
+	bool "ATSTK1000 evaluation board"
+	select BOARD_ATSTK1002 if CPU_AT32AP7000
+endchoice
+
+choice
+	prompt "Boot loader type"
+	default LOADER_U_BOOT
+
+config	LOADER_U_BOOT
+	bool "U-Boot (or similar) bootloader"
+endchoice
+
+config LOAD_ADDRESS
+	hex
+	default 0x10000000 if LOADER_U_BOOT=y && CPU_AT32AP7000=y
+
+config ENTRY_ADDRESS
+	hex
+	default 0x90000000 if LOADER_U_BOOT=y && CPU_AT32AP7000=y
+
+config PHYS_OFFSET
+	hex
+	default 0x10000000 if CPU_AT32AP7000=y
+
+source "kernel/Kconfig.preempt"
+
+config HAVE_ARCH_BOOTMEM_NODE
+	bool
+	default n
+
+config ARCH_HAVE_MEMORY_PRESENT
+	bool
+	default n
+
+config NEED_NODE_MEMMAP_SIZE
+	bool
+	default n
+
+config ARCH_FLATMEM_ENABLE
+	bool
+	default y
+
+config ARCH_DISCONTIGMEM_ENABLE
+	bool
+	default n
+
+config ARCH_SPARSEMEM_ENABLE
+	bool
+	default n
+
+source "mm/Kconfig"
+
+config OWNERSHIP_TRACE
+	bool "Ownership trace support"
+	default y
+	help
+	  Say Y to generate an Ownership Trace message on every context switch,
+	  enabling Nexus-compliant debuggers to keep track of the PID of the
+	  currently executing task.
+
+# FPU emulation goes here
+
+source "kernel/Kconfig.hz"
+
+config CMDLINE
+	string "Default kernel command line"
+	default ""
+	help
+	  If you don't have a boot loader capable of passing a command line string
+	  to the kernel, you may specify one here. As a minimum, you should specify
+	  the memory size and the root device (e.g., mem=8M, root=/dev/nfs).
+
+endmenu
+
+menu "Bus options"
+
+config PCI
+	bool
+
+source "drivers/pci/Kconfig"
+
+source "drivers/pcmcia/Kconfig"
+
+endmenu
+
+menu "Executable file formats"
+source "fs/Kconfig.binfmt"
+endmenu
+
+source "net/Kconfig"
+
+source "drivers/Kconfig"
+
+source "fs/Kconfig"
+
+source "arch/avr32/Kconfig.debug"
+
+source "security/Kconfig"
+
+source "crypto/Kconfig"
+
+source "lib/Kconfig"
diff --git a/arch/avr32/Kconfig.debug b/arch/avr32/Kconfig.debug
new file mode 100644
index 00000000000..64ace00fe6c
--- /dev/null
+++ b/arch/avr32/Kconfig.debug
@@ -0,0 +1,19 @@
+menu "Kernel hacking"
+
+config TRACE_IRQFLAGS_SUPPORT
+	bool
+	default y
+
+source "lib/Kconfig.debug"
+
+config KPROBES
+	bool "Kprobes"
+	depends on DEBUG_KERNEL
+	help
+	  Kprobes allows you to trap at almost any kernel address and
+          execute a callback function.  register_kprobe() establishes
+          a probepoint and specifies the callback.  Kprobes is useful
+          for kernel debugging, non-intrusive instrumentation and testing.
+          If in doubt, say "N".
+
+endmenu
diff --git a/arch/avr32/Makefile b/arch/avr32/Makefile
new file mode 100644
index 00000000000..cefc95a7398
--- /dev/null
+++ b/arch/avr32/Makefile
@@ -0,0 +1,84 @@
+#
+# This file is subject to the terms and conditions of the GNU General Public
+# License.  See the file "COPYING" in the main directory of this archive
+# for more details.
+#
+# Copyright (C) 2004-2006 Atmel Corporation.
+
+# Default target when executing plain make
+.PHONY: all
+all: uImage vmlinux.elf linux.lst
+
+KBUILD_DEFCONFIG	:= atstk1002_defconfig
+
+CFLAGS		+= -pipe -fno-builtin -mno-pic
+AFLAGS		+= -mrelax -mno-pic
+CFLAGS_MODULE	+= -mno-relax
+LDFLAGS_vmlinux	+= --relax
+
+cpuflags-$(CONFIG_CPU_AP7000)	+= -mcpu=ap7000
+
+CFLAGS		+= $(cpuflags-y)
+AFLAGS		+= $(cpuflags-y)
+
+CHECKFLAGS	+= -D__avr32__
+
+LIBGCC		:= $(shell $(CC) $(CFLAGS) -print-libgcc-file-name)
+
+head-$(CONFIG_LOADER_U_BOOT)		+= arch/avr32/boot/u-boot/head.o
+head-y					+= arch/avr32/kernel/head.o
+core-$(CONFIG_PLATFORM_AT32AP)		+= arch/avr32/mach-at32ap/
+core-$(CONFIG_BOARD_ATSTK1000)		+= arch/avr32/boards/atstk1000/
+core-$(CONFIG_LOADER_U_BOOT)		+= arch/avr32/boot/u-boot/
+core-y					+= arch/avr32/kernel/
+core-y					+= arch/avr32/mm/
+libs-y					+= arch/avr32/lib/ #$(LIBGCC)
+
+archincdir-$(CONFIG_PLATFORM_AT32AP)	:= arch-at32ap
+
+include/asm-avr32/.arch: $(wildcard include/config/platform/*.h) include/config/auto.conf
+	@echo '  SYMLINK include/asm-avr32/arch -> include/asm-avr32/$(archincdir-y)'
+ifneq ($(KBUILD_SRC),)
+	$(Q)mkdir -p include/asm-avr32
+	$(Q)ln -fsn $(srctree)/include/asm-avr32/$(archincdir-y) include/asm-avr32/arch
+else
+	$(Q)ln -fsn $(archincdir-y) include/asm-avr32/arch
+endif
+	@touch $@
+
+archprepare: include/asm-avr32/.arch
+
+BOOT_TARGETS := vmlinux.elf vmlinux.bin uImage uImage.srec
+
+.PHONY: $(BOOT_TARGETS) install
+
+boot := arch/$(ARCH)/boot/images
+
+             KBUILD_IMAGE := $(boot)/uImage
+vmlinux.elf: KBUILD_IMAGE := $(boot)/vmlinux.elf
+vmlinux.cso: KBUILD_IMAGE := $(boot)/vmlinux.cso
+uImage.srec: KBUILD_IMAGE := $(boot)/uImage.srec
+uImage:      KBUILD_IMAGE := $(boot)/uImage
+
+quiet_cmd_listing = LST     $@
+      cmd_listing = avr32-linux-objdump $(OBJDUMPFLAGS) -lS $< > $@
+quiet_cmd_disasm  = DIS     $@
+      cmd_disasm  = avr32-linux-objdump $(OBJDUMPFLAGS) -d $< > $@
+
+vmlinux.elf vmlinux.bin uImage.srec uImage vmlinux.cso: vmlinux
+	$(Q)$(MAKE) $(build)=$(boot) $(boot)/$@
+
+install: vmlinux
+	$(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) $@
+
+linux.s: vmlinux
+	$(call if_changed,disasm)
+
+linux.lst: vmlinux
+	$(call if_changed,listing)
+
+define archhelp
+  @echo '* vmlinux.elf		- ELF image with load address 0'
+  @echo '  vmlinux.cso		- PathFinder CSO image'
+  @echo '  uImage		- Create a bootable image for U-Boot'
+endef
diff --git a/arch/avr32/boards/atstk1000/Makefile b/arch/avr32/boards/atstk1000/Makefile
new file mode 100644
index 00000000000..c3e36ece865
--- /dev/null
+++ b/arch/avr32/boards/atstk1000/Makefile
@@ -0,0 +1,2 @@
+obj-y				+= setup.o spi.o
+obj-$(CONFIG_BOARD_ATSTK1002)	+= atstk1002.o
diff --git a/arch/avr32/boards/atstk1000/atstk1002.c b/arch/avr32/boards/atstk1000/atstk1002.c
new file mode 100644
index 00000000000..49164e9aadd
--- /dev/null
+++ b/arch/avr32/boards/atstk1000/atstk1002.c
@@ -0,0 +1,37 @@
+/*
+ * ATSTK1002 daughterboard-specific init code
+ *
+ * Copyright (C) 2005-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/init.h>
+
+#include <asm/arch/board.h>
+
+struct eth_platform_data __initdata eth0_data = {
+	.valid		= 1,
+	.mii_phy_addr	= 0x10,
+	.is_rmii	= 0,
+	.hw_addr	= { 0x6a, 0x87, 0x71, 0x14, 0xcd, 0xcb },
+};
+
+extern struct lcdc_platform_data atstk1000_fb0_data;
+
+static int __init atstk1002_init(void)
+{
+	at32_add_system_devices();
+
+	at32_add_device_usart(1);	/* /dev/ttyS0 */
+	at32_add_device_usart(2);	/* /dev/ttyS1 */
+	at32_add_device_usart(3);	/* /dev/ttyS2 */
+
+	at32_add_device_eth(0, &eth0_data);
+	at32_add_device_spi(0);
+	at32_add_device_lcdc(0, &atstk1000_fb0_data);
+
+	return 0;
+}
+postcore_initcall(atstk1002_init);
diff --git a/arch/avr32/boards/atstk1000/setup.c b/arch/avr32/boards/atstk1000/setup.c
new file mode 100644
index 00000000000..191ab85de9a
--- /dev/null
+++ b/arch/avr32/boards/atstk1000/setup.c
@@ -0,0 +1,59 @@
+/*
+ * ATSTK1000 board-specific setup code.
+ *
+ * Copyright (C) 2005-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/bootmem.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/linkage.h>
+
+#include <asm/setup.h>
+
+#include <asm/arch/board.h>
+
+/* Initialized by bootloader-specific startup code. */
+struct tag *bootloader_tags __initdata;
+
+struct lcdc_platform_data __initdata atstk1000_fb0_data;
+
+asmlinkage void __init board_early_init(void)
+{
+	extern void sdram_init(void);
+
+#ifdef CONFIG_LOADER_STANDALONE
+	sdram_init();
+#endif
+}
+
+void __init board_setup_fbmem(unsigned long fbmem_start,
+			      unsigned long fbmem_size)
+{
+	if (!fbmem_size)
+		return;
+
+	if (!fbmem_start) {
+		void *fbmem;
+
+		fbmem = alloc_bootmem_low_pages(fbmem_size);
+		fbmem_start = __pa(fbmem);
+	} else {
+		pg_data_t *pgdat;
+
+		for_each_online_pgdat(pgdat) {
+			if (fbmem_start >= pgdat->bdata->node_boot_start
+			    && fbmem_start <= pgdat->bdata->node_low_pfn)
+				reserve_bootmem_node(pgdat, fbmem_start,
+						     fbmem_size);
+		}
+	}
+
+	printk("%luKiB framebuffer memory at address 0x%08lx\n",
+	       fbmem_size >> 10, fbmem_start);
+	atstk1000_fb0_data.fbmem_start = fbmem_start;
+	atstk1000_fb0_data.fbmem_size = fbmem_size;
+}
diff --git a/arch/avr32/boards/atstk1000/spi.c b/arch/avr32/boards/atstk1000/spi.c
new file mode 100644
index 00000000000..567726c82c6
--- /dev/null
+++ b/arch/avr32/boards/atstk1000/spi.c
@@ -0,0 +1,27 @@
+/*
+ * ATSTK1000 SPI devices
+ *
+ * Copyright (C) 2005 Atmel Norway
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/device.h>
+#include <linux/spi/spi.h>
+
+static struct spi_board_info spi_board_info[] __initdata = {
+	{
+		.modalias	= "ltv350qv",
+		.max_speed_hz	= 16000000,
+		.bus_num	= 0,
+		.chip_select	= 1,
+	},
+};
+
+static int board_init_spi(void)
+{
+	spi_register_board_info(spi_board_info, ARRAY_SIZE(spi_board_info));
+	return 0;
+}
+arch_initcall(board_init_spi);
diff --git a/arch/avr32/boot/images/Makefile b/arch/avr32/boot/images/Makefile
new file mode 100644
index 00000000000..ccd74eeecec
--- /dev/null
+++ b/arch/avr32/boot/images/Makefile
@@ -0,0 +1,62 @@
+#
+# Copyright (C) 2004-2006 Atmel Corporation
+#
+# This file is subject to the terms and conditions of the GNU General Public
+# License.  See the file "COPYING" in the main directory of this archive
+# for more details.
+#
+
+MKIMAGE		:= $(srctree)/scripts/mkuboot.sh
+
+extra-y		:= vmlinux.bin vmlinux.gz
+
+OBJCOPYFLAGS_vmlinux.bin := -O binary
+$(obj)/vmlinux.bin: vmlinux FORCE
+	$(call if_changed,objcopy)
+
+$(obj)/vmlinux.gz: $(obj)/vmlinux.bin FORCE
+	$(call if_changed,gzip)
+
+quiet_cmd_uimage = UIMAGE $@
+      cmd_uimage = $(CONFIG_SHELL) $(MKIMAGE) -A avr32 -O linux -T kernel	\
+		-C gzip -a $(CONFIG_LOAD_ADDRESS) -e $(CONFIG_ENTRY_ADDRESS)	\
+		-n 'Linux-$(KERNELRELEASE)' -d $< $@
+
+targets += uImage uImage.srec
+$(obj)/uImage: $(obj)/vmlinux.gz
+	$(call if_changed,uimage)
+	@echo '  Image $@ is ready'
+
+OBJCOPYFLAGS_uImage.srec := -I binary -O srec
+$(obj)/uImage.srec: $(obj)/uImage
+	$(call if_changed,objcopy)
+
+OBJCOPYFLAGS_vmlinux.elf := --change-section-lma .text-0x80000000 \
+			    --change-section-lma __ex_table-0x80000000 \
+			    --change-section-lma .rodata-0x80000000 \
+			    --change-section-lma .data-0x80000000 \
+			    --change-section-lma .init-0x80000000 \
+			    --change-section-lma .bss-0x80000000 \
+			    --change-section-lma .initrd-0x80000000 \
+			    --change-section-lma __param-0x80000000 \
+			    --change-section-lma __ksymtab-0x80000000 \
+			    --change-section-lma __ksymtab_gpl-0x80000000 \
+			    --change-section-lma __kcrctab-0x80000000 \
+			    --change-section-lma __kcrctab_gpl-0x80000000 \
+			    --change-section-lma __ksymtab_strings-0x80000000 \
+			    --change-section-lma .got-0x80000000 \
+			    --set-start 0xa0000000
+$(obj)/vmlinux.elf: vmlinux FORCE
+	$(call if_changed,objcopy)
+
+quiet_cmd_sfdwarf = SFDWARF $@
+      cmd_sfdwarf = sfdwarf $< TO $@ GNUAVR IW $(SFDWARF_FLAGS) > $(obj)/sfdwarf.log
+
+$(obj)/vmlinux.cso: $(obj)/vmlinux.elf FORCE
+	$(call if_changed,sfdwarf)
+
+install: $(BOOTIMAGE)
+	sh $(srctree)/install-kernel.sh $<
+
+# Generated files to be removed upon make clean
+clean-files	:= vmlinux* uImage uImage.srec
diff --git a/arch/avr32/boot/u-boot/Makefile b/arch/avr32/boot/u-boot/Makefile
new file mode 100644
index 00000000000..125ddc96c27
--- /dev/null
+++ b/arch/avr32/boot/u-boot/Makefile
@@ -0,0 +1,3 @@
+extra-y		:= head.o
+
+obj-y		:= empty.o
diff --git a/arch/avr32/boot/u-boot/empty.S b/arch/avr32/boot/u-boot/empty.S
new file mode 100644
index 00000000000..8ac91a5f12f
--- /dev/null
+++ b/arch/avr32/boot/u-boot/empty.S
@@ -0,0 +1 @@
+/* Empty file */
diff --git a/arch/avr32/boot/u-boot/head.S b/arch/avr32/boot/u-boot/head.S
new file mode 100644
index 00000000000..4488fa27fe9
--- /dev/null
+++ b/arch/avr32/boot/u-boot/head.S
@@ -0,0 +1,60 @@
+/*
+ * Startup code for use with the u-boot bootloader.
+ *
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <asm/setup.h>
+
+	/*
+	 * The kernel is loaded where we want it to be and all caches
+	 * have just been flushed. We get two parameters from u-boot:
+	 *
+	 * r12 contains a magic number (ATAG_MAGIC)
+	 * r11 points to a tag table providing information about
+	 *     the system.
+	 */
+	.section .init.text,"ax"
+	.global _start
+_start:
+	/* Check if the boot loader actually provided a tag table */
+	lddpc	r0, magic_number
+	cp.w	r12, r0
+	brne	no_tag_table
+
+	/* Initialize .bss */
+	lddpc	r2, bss_start_addr
+	lddpc   r3, end_addr
+	mov	r0, 0
+	mov	r1, 0
+1:      st.d    r2++, r0
+	cp      r2, r3
+	brlo    1b
+
+	/*
+	 * Save the tag table address for later use. This must be done
+	 * _after_ .bss has been initialized...
+	 */
+	lddpc	r0, tag_table_addr
+	st.w	r0[0], r11
+
+	/* Jump to loader-independent setup code */
+	rjmp	kernel_entry
+
+	.align	2
+magic_number:
+	.long	ATAG_MAGIC
+tag_table_addr:
+	.long	bootloader_tags
+bss_start_addr:
+	.long   __bss_start
+end_addr:
+	.long   _end
+
+no_tag_table:
+	sub	r12, pc, (. - 2f)
+	bral	panic
+2:	.asciz	"Boot loader didn't provide correct magic number\n"
diff --git a/arch/avr32/configs/atstk1002_defconfig b/arch/avr32/configs/atstk1002_defconfig
new file mode 100644
index 00000000000..1d22255009f
--- /dev/null
+++ b/arch/avr32/configs/atstk1002_defconfig
@@ -0,0 +1,754 @@
+#
+# Automatically generated make config: don't edit
+# Linux kernel version: 2.6.18-rc1
+# Tue Jul 11 12:41:36 2006
+#
+CONFIG_AVR32=y
+CONFIG_GENERIC_HARDIRQS=y
+CONFIG_HARDIRQS_SW_RESEND=y
+CONFIG_GENERIC_IRQ_PROBE=y
+CONFIG_RWSEM_GENERIC_SPINLOCK=y
+CONFIG_GENERIC_HWEIGHT=y
+CONFIG_GENERIC_CALIBRATE_DELAY=y
+CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
+
+#
+# Code maturity level options
+#
+CONFIG_EXPERIMENTAL=y
+CONFIG_BROKEN_ON_SMP=y
+CONFIG_INIT_ENV_ARG_LIMIT=32
+
+#
+# General setup
+#
+CONFIG_LOCALVERSION=""
+# CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_SWAP=y
+# CONFIG_SYSVIPC is not set
+# CONFIG_POSIX_MQUEUE is not set
+# CONFIG_BSD_PROCESS_ACCT is not set
+CONFIG_SYSCTL=y
+# CONFIG_AUDIT is not set
+# CONFIG_IKCONFIG is not set
+# CONFIG_RELAY is not set
+CONFIG_INITRAMFS_SOURCE=""
+CONFIG_CC_OPTIMIZE_FOR_SIZE=y
+CONFIG_EMBEDDED=y
+CONFIG_KALLSYMS=y
+# CONFIG_KALLSYMS_ALL is not set
+# CONFIG_KALLSYMS_EXTRA_PASS is not set
+CONFIG_HOTPLUG=y
+CONFIG_PRINTK=y
+CONFIG_BUG=y
+CONFIG_ELF_CORE=y
+# CONFIG_BASE_FULL is not set
+# CONFIG_FUTEX is not set
+# CONFIG_EPOLL is not set
+CONFIG_SHMEM=y
+# CONFIG_SLAB is not set
+# CONFIG_VM_EVENT_COUNTERS is not set
+# CONFIG_TINY_SHMEM is not set
+CONFIG_BASE_SMALL=1
+CONFIG_SLOB=y
+
+#
+# Loadable module support
+#
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
+# CONFIG_MODULE_FORCE_UNLOAD is not set
+# CONFIG_MODVERSIONS is not set
+# CONFIG_MODULE_SRCVERSION_ALL is not set
+# CONFIG_KMOD is not set
+
+#
+# Block layer
+#
+# CONFIG_BLK_DEV_IO_TRACE is not set
+
+#
+# IO Schedulers
+#
+CONFIG_IOSCHED_NOOP=y
+# CONFIG_IOSCHED_AS is not set
+# CONFIG_IOSCHED_DEADLINE is not set
+# CONFIG_IOSCHED_CFQ is not set
+# CONFIG_DEFAULT_AS is not set
+# CONFIG_DEFAULT_DEADLINE is not set
+# CONFIG_DEFAULT_CFQ is not set
+CONFIG_DEFAULT_NOOP=y
+CONFIG_DEFAULT_IOSCHED="noop"
+
+#
+# System Type and features
+#
+CONFIG_SUBARCH_AVR32B=y
+CONFIG_MMU=y
+CONFIG_PERFORMANCE_COUNTERS=y
+CONFIG_PLATFORM_AT32AP=y
+CONFIG_CPU_AT32AP7000=y
+CONFIG_BOARD_ATSTK1002=y
+CONFIG_BOARD_ATSTK1000=y
+CONFIG_LOADER_U_BOOT=y
+CONFIG_LOAD_ADDRESS=0x10000000
+CONFIG_ENTRY_ADDRESS=0x90000000
+CONFIG_PHYS_OFFSET=0x10000000
+CONFIG_PREEMPT_NONE=y
+# CONFIG_PREEMPT_VOLUNTARY is not set
+# CONFIG_PREEMPT is not set
+# CONFIG_HAVE_ARCH_BOOTMEM_NODE is not set
+# CONFIG_ARCH_HAVE_MEMORY_PRESENT is not set
+# CONFIG_NEED_NODE_MEMMAP_SIZE is not set
+CONFIG_ARCH_FLATMEM_ENABLE=y
+# CONFIG_ARCH_DISCONTIGMEM_ENABLE is not set
+# CONFIG_ARCH_SPARSEMEM_ENABLE is not set
+CONFIG_SELECT_MEMORY_MODEL=y
+CONFIG_FLATMEM_MANUAL=y
+# CONFIG_DISCONTIGMEM_MANUAL is not set
+# CONFIG_SPARSEMEM_MANUAL is not set
+CONFIG_FLATMEM=y
+CONFIG_FLAT_NODE_MEM_MAP=y
+# CONFIG_SPARSEMEM_STATIC is not set
+CONFIG_SPLIT_PTLOCK_CPUS=4
+# CONFIG_RESOURCES_64BIT is not set
+# CONFIG_OWNERSHIP_TRACE is not set
+# CONFIG_HZ_100 is not set
+CONFIG_HZ_250=y
+# CONFIG_HZ_1000 is not set
+CONFIG_HZ=250
+CONFIG_CMDLINE=""
+
+#
+# Bus options
+#
+
+#
+# PCCARD (PCMCIA/CardBus) support
+#
+# CONFIG_PCCARD is not set
+
+#
+# Executable file formats
+#
+CONFIG_BINFMT_ELF=y
+# CONFIG_BINFMT_MISC is not set
+
+#
+# Networking
+#
+CONFIG_NET=y
+
+#
+# Networking options
+#
+# CONFIG_NETDEBUG is not set
+CONFIG_PACKET=y
+CONFIG_PACKET_MMAP=y
+CONFIG_UNIX=y
+# CONFIG_NET_KEY is not set
+CONFIG_INET=y
+# CONFIG_IP_MULTICAST is not set
+# CONFIG_IP_ADVANCED_ROUTER is not set
+CONFIG_IP_FIB_HASH=y
+CONFIG_IP_PNP=y
+CONFIG_IP_PNP_DHCP=y
+# CONFIG_IP_PNP_BOOTP is not set
+# CONFIG_IP_PNP_RARP is not set
+# CONFIG_NET_IPIP is not set
+# CONFIG_NET_IPGRE is not set
+# CONFIG_ARPD is not set
+# CONFIG_SYN_COOKIES is not set
+# CONFIG_INET_AH is not set
+# CONFIG_INET_ESP is not set
+# CONFIG_INET_IPCOMP is not set
+# CONFIG_INET_XFRM_TUNNEL is not set
+# CONFIG_INET_TUNNEL is not set
+# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
+# CONFIG_INET_XFRM_MODE_TUNNEL is not set
+CONFIG_INET_DIAG=y
+CONFIG_INET_TCP_DIAG=y
+# CONFIG_TCP_CONG_ADVANCED is not set
+CONFIG_TCP_CONG_BIC=y
+# CONFIG_IPV6 is not set
+# CONFIG_INET6_XFRM_TUNNEL is not set
+# CONFIG_INET6_TUNNEL is not set
+# CONFIG_NETWORK_SECMARK is not set
+# CONFIG_NETFILTER is not set
+
+#
+# DCCP Configuration (EXPERIMENTAL)
+#
+# CONFIG_IP_DCCP is not set
+
+#
+# SCTP Configuration (EXPERIMENTAL)
+#
+# CONFIG_IP_SCTP is not set
+
+#
+# TIPC Configuration (EXPERIMENTAL)
+#
+# CONFIG_TIPC is not set
+# CONFIG_ATM is not set
+# CONFIG_BRIDGE is not set
+# CONFIG_VLAN_8021Q is not set
+# CONFIG_DECNET is not set
+# CONFIG_LLC2 is not set
+# CONFIG_IPX is not set
+# CONFIG_ATALK is not set
+# CONFIG_X25 is not set
+# CONFIG_LAPB is not set
+# CONFIG_NET_DIVERT is not set
+# CONFIG_ECONET is not set
+# CONFIG_WAN_ROUTER is not set
+
+#
+# QoS and/or fair queueing
+#
+# CONFIG_NET_SCHED is not set
+
+#
+# Network testing
+#
+# CONFIG_NET_PKTGEN is not set
+# CONFIG_NET_TCPPROBE is not set
+# CONFIG_HAMRADIO is not set
+# CONFIG_IRDA is not set
+# CONFIG_BT is not set
+# CONFIG_IEEE80211 is not set
+
+#
+# Device Drivers
+#
+
+#
+# Generic Driver Options
+#
+CONFIG_STANDALONE=y
+# CONFIG_PREVENT_FIRMWARE_BUILD is not set
+# CONFIG_FW_LOADER is not set
+# CONFIG_DEBUG_DRIVER is not set
+# CONFIG_SYS_HYPERVISOR is not set
+
+#
+# Connector - unified userspace <-> kernelspace linker
+#
+# CONFIG_CONNECTOR is not set
+
+#
+# Memory Technology Devices (MTD)
+#
+# CONFIG_MTD is not set
+
+#
+# Parallel port support
+#
+# CONFIG_PARPORT is not set
+
+#
+# Plug and Play support
+#
+
+#
+# Block devices
+#
+# CONFIG_BLK_DEV_COW_COMMON is not set
+CONFIG_BLK_DEV_LOOP=m
+# CONFIG_BLK_DEV_CRYPTOLOOP is not set
+CONFIG_BLK_DEV_NBD=m
+CONFIG_BLK_DEV_RAM=m
+CONFIG_BLK_DEV_RAM_COUNT=16
+CONFIG_BLK_DEV_RAM_SIZE=4096
+CONFIG_BLK_DEV_INITRD=y
+# CONFIG_CDROM_PKTCDVD is not set
+# CONFIG_ATA_OVER_ETH is not set
+
+#
+# ATA/ATAPI/MFM/RLL support
+#
+# CONFIG_IDE is not set
+
+#
+# SCSI device support
+#
+# CONFIG_RAID_ATTRS is not set
+# CONFIG_SCSI is not set
+
+#
+# Multi-device support (RAID and LVM)
+#
+# CONFIG_MD is not set
+
+#
+# Fusion MPT device support
+#
+# CONFIG_FUSION is not set
+
+#
+# IEEE 1394 (FireWire) support
+#
+
+#
+# I2O device support
+#
+
+#
+# Network device support
+#
+CONFIG_NETDEVICES=y
+CONFIG_DUMMY=y
+# CONFIG_BONDING is not set
+# CONFIG_EQUALIZER is not set
+CONFIG_TUN=m
+
+#
+# PHY device support
+#
+# CONFIG_PHYLIB is not set
+
+#
+# Ethernet (10 or 100Mbit)
+#
+CONFIG_NET_ETHERNET=y
+CONFIG_MII=y
+CONFIG_MACB=y
+
+#
+# Ethernet (1000 Mbit)
+#
+
+#
+# Ethernet (10000 Mbit)
+#
+
+#
+# Token Ring devices
+#
+
+#
+# Wireless LAN (non-hamradio)
+#
+# CONFIG_NET_RADIO is not set
+
+#
+# Wan interfaces
+#
+# CONFIG_WAN is not set
+CONFIG_PPP=m
+# CONFIG_PPP_MULTILINK is not set
+# CONFIG_PPP_FILTER is not set
+CONFIG_PPP_ASYNC=m
+# CONFIG_PPP_SYNC_TTY is not set
+CONFIG_PPP_DEFLATE=m
+# CONFIG_PPP_BSDCOMP is not set
+# CONFIG_PPP_MPPE is not set
+# CONFIG_PPPOE is not set
+# CONFIG_SLIP is not set
+# CONFIG_SHAPER is not set
+# CONFIG_NETCONSOLE is not set
+# CONFIG_NETPOLL is not set
+# CONFIG_NET_POLL_CONTROLLER is not set
+
+#
+# ISDN subsystem
+#
+# CONFIG_ISDN is not set
+
+#
+# Telephony Support
+#
+# CONFIG_PHONE is not set
+
+#
+# Input device support
+#
+# CONFIG_INPUT is not set
+
+#
+# Hardware I/O ports
+#
+# CONFIG_SERIO is not set
+# CONFIG_GAMEPORT is not set
+
+#
+# Character devices
+#
+# CONFIG_VT is not set
+# CONFIG_SERIAL_NONSTANDARD is not set
+
+#
+# Serial drivers
+#
+# CONFIG_SERIAL_8250 is not set
+
+#
+# Non-8250 serial port support
+#
+CONFIG_SERIAL_AT91=y
+CONFIG_SERIAL_AT91_CONSOLE=y
+# CONFIG_SERIAL_AT91_TTYAT is not set
+CONFIG_SERIAL_CORE=y
+CONFIG_SERIAL_CORE_CONSOLE=y
+CONFIG_UNIX98_PTYS=y
+# CONFIG_LEGACY_PTYS is not set
+
+#
+# IPMI
+#
+# CONFIG_IPMI_HANDLER is not set
+
+#
+# Watchdog Cards
+#
+# CONFIG_WATCHDOG is not set
+# CONFIG_HW_RANDOM is not set
+# CONFIG_RTC is not set
+# CONFIG_GEN_RTC is not set
+# CONFIG_DTLK is not set
+# CONFIG_R3964 is not set
+
+#
+# Ftape, the floppy tape device driver
+#
+# CONFIG_RAW_DRIVER is not set
+
+#
+# TPM devices
+#
+# CONFIG_TCG_TPM is not set
+# CONFIG_TELCLOCK is not set
+
+#
+# I2C support
+#
+# CONFIG_I2C is not set
+
+#
+# SPI support
+#
+CONFIG_SPI=y
+# CONFIG_SPI_DEBUG is not set
+CONFIG_SPI_MASTER=y
+
+#
+# SPI Master Controller Drivers
+#
+CONFIG_SPI_ATMEL=m
+# CONFIG_SPI_BITBANG is not set
+
+#
+# SPI Protocol Masters
+#
+
+#
+# Dallas's 1-wire bus
+#
+
+#
+# Hardware Monitoring support
+#
+# CONFIG_HWMON is not set
+# CONFIG_HWMON_VID is not set
+
+#
+# Misc devices
+#
+
+#
+# Multimedia devices
+#
+# CONFIG_VIDEO_DEV is not set
+CONFIG_VIDEO_V4L2=y
+
+#
+# Digital Video Broadcasting Devices
+#
+# CONFIG_DVB is not set
+
+#
+# Graphics support
+#
+# CONFIG_FIRMWARE_EDID is not set
+CONFIG_FB=m
+CONFIG_FB_CFB_FILLRECT=m
+CONFIG_FB_CFB_COPYAREA=m
+CONFIG_FB_CFB_IMAGEBLIT=m
+# CONFIG_FB_MACMODES is not set
+# CONFIG_FB_BACKLIGHT is not set
+# CONFIG_FB_MODE_HELPERS is not set
+# CONFIG_FB_TILEBLITTING is not set
+CONFIG_FB_SIDSA=m
+CONFIG_FB_SIDSA_DEFAULT_BPP=24
+# CONFIG_FB_S1D13XXX is not set
+# CONFIG_FB_VIRTUAL is not set
+
+#
+# Logo configuration
+#
+# CONFIG_LOGO is not set
+CONFIG_BACKLIGHT_LCD_SUPPORT=y
+# CONFIG_BACKLIGHT_CLASS_DEVICE is not set
+CONFIG_LCD_CLASS_DEVICE=m
+CONFIG_LCD_DEVICE=y
+CONFIG_LCD_LTV350QV=m
+
+#
+# Sound
+#
+# CONFIG_SOUND is not set
+
+#
+# USB support
+#
+# CONFIG_USB_ARCH_HAS_HCD is not set
+# CONFIG_USB_ARCH_HAS_OHCI is not set
+# CONFIG_USB_ARCH_HAS_EHCI is not set
+
+#
+# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support'
+#
+
+#
+# USB Gadget Support
+#
+# CONFIG_USB_GADGET is not set
+
+#
+# MMC/SD Card support
+#
+# CONFIG_MMC is not set
+
+#
+# LED devices
+#
+# CONFIG_NEW_LEDS is not set
+
+#
+# LED drivers
+#
+
+#
+# LED Triggers
+#
+
+#
+# InfiniBand support
+#
+
+#
+# EDAC - error detection and reporting (RAS) (EXPERIMENTAL)
+#
+
+#
+# Real Time Clock
+#
+# CONFIG_RTC_CLASS is not set
+
+#
+# DMA Engine support
+#
+# CONFIG_DMA_ENGINE is not set
+
+#
+# DMA Clients
+#
+
+#
+# DMA Devices
+#
+
+#
+# File systems
+#
+CONFIG_EXT2_FS=y
+# CONFIG_EXT2_FS_XATTR is not set
+# CONFIG_EXT2_FS_XIP is not set
+# CONFIG_EXT3_FS is not set
+# CONFIG_REISERFS_FS is not set
+# CONFIG_JFS_FS is not set
+# CONFIG_FS_POSIX_ACL is not set
+# CONFIG_XFS_FS is not set
+# CONFIG_OCFS2_FS is not set
+CONFIG_MINIX_FS=m
+CONFIG_ROMFS_FS=m
+# CONFIG_INOTIFY is not set
+# CONFIG_QUOTA is not set
+# CONFIG_DNOTIFY is not set
+# CONFIG_AUTOFS_FS is not set
+# CONFIG_AUTOFS4_FS is not set
+# CONFIG_FUSE_FS is not set
+
+#
+# CD-ROM/DVD Filesystems
+#
+# CONFIG_ISO9660_FS is not set
+# CONFIG_UDF_FS is not set
+
+#
+# DOS/FAT/NT Filesystems
+#
+CONFIG_FAT_FS=m
+CONFIG_MSDOS_FS=m
+CONFIG_VFAT_FS=m
+CONFIG_FAT_DEFAULT_CODEPAGE=437
+CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1"
+# CONFIG_NTFS_FS is not set
+
+#
+# Pseudo filesystems
+#
+CONFIG_PROC_FS=y
+CONFIG_PROC_KCORE=y
+CONFIG_SYSFS=y
+CONFIG_TMPFS=y
+# CONFIG_HUGETLB_PAGE is not set
+CONFIG_RAMFS=y
+CONFIG_CONFIGFS_FS=m
+
+#
+# Miscellaneous filesystems
+#
+# CONFIG_ADFS_FS is not set
+# CONFIG_AFFS_FS is not set
+# CONFIG_HFS_FS is not set
+# CONFIG_HFSPLUS_FS is not set
+# CONFIG_BEFS_FS is not set
+# CONFIG_BFS_FS is not set
+# CONFIG_EFS_FS is not set
+# CONFIG_CRAMFS is not set
+# CONFIG_VXFS_FS is not set
+# CONFIG_HPFS_FS is not set
+# CONFIG_QNX4FS_FS is not set
+# CONFIG_SYSV_FS is not set
+# CONFIG_UFS_FS is not set
+
+#
+# Network File Systems
+#
+CONFIG_NFS_FS=y
+CONFIG_NFS_V3=y
+# CONFIG_NFS_V3_ACL is not set
+# CONFIG_NFS_V4 is not set
+# CONFIG_NFS_DIRECTIO is not set
+# CONFIG_NFSD is not set
+CONFIG_ROOT_NFS=y
+CONFIG_LOCKD=y
+CONFIG_LOCKD_V4=y
+CONFIG_NFS_COMMON=y
+CONFIG_SUNRPC=y
+# CONFIG_RPCSEC_GSS_KRB5 is not set
+# CONFIG_RPCSEC_GSS_SPKM3 is not set
+# CONFIG_SMB_FS is not set
+CONFIG_CIFS=m
+# CONFIG_CIFS_STATS is not set
+# CONFIG_CIFS_WEAK_PW_HASH is not set
+# CONFIG_CIFS_XATTR is not set
+# CONFIG_CIFS_DEBUG2 is not set
+# CONFIG_CIFS_EXPERIMENTAL is not set
+# CONFIG_NCP_FS is not set
+# CONFIG_CODA_FS is not set
+# CONFIG_AFS_FS is not set
+# CONFIG_9P_FS is not set
+
+#
+# Partition Types
+#
+# CONFIG_PARTITION_ADVANCED is not set
+CONFIG_MSDOS_PARTITION=y
+
+#
+# Native Language Support
+#
+CONFIG_NLS=m
+CONFIG_NLS_DEFAULT="iso8859-1"
+CONFIG_NLS_CODEPAGE_437=m
+# CONFIG_NLS_CODEPAGE_737 is not set
+# CONFIG_NLS_CODEPAGE_775 is not set
+CONFIG_NLS_CODEPAGE_850=m
+# CONFIG_NLS_CODEPAGE_852 is not set
+# CONFIG_NLS_CODEPAGE_855 is not set
+# CONFIG_NLS_CODEPAGE_857 is not set
+# CONFIG_NLS_CODEPAGE_860 is not set
+# CONFIG_NLS_CODEPAGE_861 is not set
+# CONFIG_NLS_CODEPAGE_862 is not set
+# CONFIG_NLS_CODEPAGE_863 is not set
+# CONFIG_NLS_CODEPAGE_864 is not set
+# CONFIG_NLS_CODEPAGE_865 is not set
+# CONFIG_NLS_CODEPAGE_866 is not set
+# CONFIG_NLS_CODEPAGE_869 is not set
+# CONFIG_NLS_CODEPAGE_936 is not set
+# CONFIG_NLS_CODEPAGE_950 is not set
+# CONFIG_NLS_CODEPAGE_932 is not set
+# CONFIG_NLS_CODEPAGE_949 is not set
+# CONFIG_NLS_CODEPAGE_874 is not set
+# CONFIG_NLS_ISO8859_8 is not set
+# CONFIG_NLS_CODEPAGE_1250 is not set
+# CONFIG_NLS_CODEPAGE_1251 is not set
+# CONFIG_NLS_ASCII is not set
+CONFIG_NLS_ISO8859_1=m
+# CONFIG_NLS_ISO8859_2 is not set
+# CONFIG_NLS_ISO8859_3 is not set
+# CONFIG_NLS_ISO8859_4 is not set
+# CONFIG_NLS_ISO8859_5 is not set
+# CONFIG_NLS_ISO8859_6 is not set
+# CONFIG_NLS_ISO8859_7 is not set
+# CONFIG_NLS_ISO8859_9 is not set
+# CONFIG_NLS_ISO8859_13 is not set
+# CONFIG_NLS_ISO8859_14 is not set
+# CONFIG_NLS_ISO8859_15 is not set
+# CONFIG_NLS_KOI8_R is not set
+# CONFIG_NLS_KOI8_U is not set
+CONFIG_NLS_UTF8=m
+
+#
+# Kernel hacking
+#
+CONFIG_TRACE_IRQFLAGS_SUPPORT=y
+CONFIG_PRINTK_TIME=y
+CONFIG_MAGIC_SYSRQ=y
+# CONFIG_UNUSED_SYMBOLS is not set
+CONFIG_DEBUG_KERNEL=y
+CONFIG_LOG_BUF_SHIFT=14
+CONFIG_DETECT_SOFTLOCKUP=y
+# CONFIG_SCHEDSTATS is not set
+# CONFIG_DEBUG_SPINLOCK is not set
+# CONFIG_DEBUG_MUTEXES is not set
+# CONFIG_DEBUG_RWSEMS is not set
+# CONFIG_DEBUG_SPINLOCK_SLEEP is not set
+# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
+# CONFIG_DEBUG_KOBJECT is not set
+CONFIG_DEBUG_BUGVERBOSE=y
+# CONFIG_DEBUG_INFO is not set
+CONFIG_DEBUG_FS=y
+# CONFIG_DEBUG_VM is not set
+CONFIG_FRAME_POINTER=y
+# CONFIG_UNWIND_INFO is not set
+CONFIG_FORCED_INLINING=y
+# CONFIG_RCU_TORTURE_TEST is not set
+CONFIG_KPROBES=y
+
+#
+# Security options
+#
+# CONFIG_KEYS is not set
+# CONFIG_SECURITY is not set
+
+#
+# Cryptographic options
+#
+# CONFIG_CRYPTO is not set
+
+#
+# Hardware crypto devices
+#
+
+#
+# Library routines
+#
+CONFIG_CRC_CCITT=m
+# CONFIG_CRC16 is not set
+CONFIG_CRC32=m
+# CONFIG_LIBCRC32C is not set
+CONFIG_ZLIB_INFLATE=m
+CONFIG_ZLIB_DEFLATE=m
diff --git a/arch/avr32/kernel/Makefile b/arch/avr32/kernel/Makefile
new file mode 100644
index 00000000000..90e5afff54a
--- /dev/null
+++ b/arch/avr32/kernel/Makefile
@@ -0,0 +1,18 @@
+#
+# Makefile for the Linux/AVR32 kernel.
+#
+
+extra-y				:= head.o vmlinux.lds
+
+obj-$(CONFIG_SUBARCH_AVR32B)	+= entry-avr32b.o
+obj-y				+= syscall_table.o syscall-stubs.o irq.o
+obj-y				+= setup.o traps.o semaphore.o ptrace.o
+obj-y				+= signal.o sys_avr32.o process.o time.o
+obj-y				+= init_task.o switch_to.o cpu.o
+obj-$(CONFIG_MODULES)		+= module.o avr32_ksyms.o
+obj-$(CONFIG_KPROBES)		+= kprobes.o
+
+USE_STANDARD_AS_RULE		:= true
+
+%.lds: %.lds.c FORCE
+	$(call if_changed_dep,cpp_lds_S)
diff --git a/arch/avr32/kernel/asm-offsets.c b/arch/avr32/kernel/asm-offsets.c
new file mode 100644
index 00000000000..97d86586566
--- /dev/null
+++ b/arch/avr32/kernel/asm-offsets.c
@@ -0,0 +1,25 @@
+/*
+ * Generate definitions needed by assembly language modules.
+ * This code generates raw asm output which is post-processed
+ * to extract and format the required data.
+ */
+
+#include <linux/thread_info.h>
+
+#define DEFINE(sym, val) \
+        asm volatile("\n->" #sym " %0 " #val : : "i" (val))
+
+#define BLANK() asm volatile("\n->" : : )
+
+#define OFFSET(sym, str, mem) \
+        DEFINE(sym, offsetof(struct str, mem));
+
+void foo(void)
+{
+	OFFSET(TI_task, thread_info, task);
+	OFFSET(TI_exec_domain, thread_info, exec_domain);
+	OFFSET(TI_flags, thread_info, flags);
+	OFFSET(TI_cpu, thread_info, cpu);
+	OFFSET(TI_preempt_count, thread_info, preempt_count);
+	OFFSET(TI_restart_block, thread_info, restart_block);
+}
diff --git a/arch/avr32/kernel/avr32_ksyms.c b/arch/avr32/kernel/avr32_ksyms.c
new file mode 100644
index 00000000000..04f767a272b
--- /dev/null
+++ b/arch/avr32/kernel/avr32_ksyms.c
@@ -0,0 +1,55 @@
+/*
+ * Export AVR32-specific functions for loadable modules.
+ *
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+
+#include <asm/checksum.h>
+#include <asm/uaccess.h>
+#include <asm/delay.h>
+
+/*
+ * GCC functions
+ */
+extern unsigned long long __avr32_lsl64(unsigned long long u, unsigned long b);
+extern unsigned long long __avr32_lsr64(unsigned long long u, unsigned long b);
+extern unsigned long long __avr32_asr64(unsigned long long u, unsigned long b);
+EXPORT_SYMBOL(__avr32_lsl64);
+EXPORT_SYMBOL(__avr32_lsr64);
+EXPORT_SYMBOL(__avr32_asr64);
+
+/*
+ * String functions
+ */
+EXPORT_SYMBOL(memset);
+EXPORT_SYMBOL(memcpy);
+
+/*
+ * Userspace access stuff.
+ */
+EXPORT_SYMBOL(copy_from_user);
+EXPORT_SYMBOL(copy_to_user);
+EXPORT_SYMBOL(__copy_user);
+EXPORT_SYMBOL(strncpy_from_user);
+EXPORT_SYMBOL(__strncpy_from_user);
+EXPORT_SYMBOL(clear_user);
+EXPORT_SYMBOL(__clear_user);
+EXPORT_SYMBOL(csum_partial);
+EXPORT_SYMBOL(csum_partial_copy_generic);
+
+/* Delay loops (lib/delay.S) */
+EXPORT_SYMBOL(__ndelay);
+EXPORT_SYMBOL(__udelay);
+EXPORT_SYMBOL(__const_udelay);
+
+/* Bit operations (lib/findbit.S) */
+EXPORT_SYMBOL(find_first_zero_bit);
+EXPORT_SYMBOL(find_next_zero_bit);
+EXPORT_SYMBOL(find_first_bit);
+EXPORT_SYMBOL(find_next_bit);
+EXPORT_SYMBOL(generic_find_next_zero_le_bit);
diff --git a/arch/avr32/kernel/cpu.c b/arch/avr32/kernel/cpu.c
new file mode 100644
index 00000000000..342452ba204
--- /dev/null
+++ b/arch/avr32/kernel/cpu.c
@@ -0,0 +1,327 @@
+/*
+ * Copyright (C) 2005-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/init.h>
+#include <linux/sysdev.h>
+#include <linux/seq_file.h>
+#include <linux/cpu.h>
+#include <linux/percpu.h>
+#include <linux/param.h>
+#include <linux/errno.h>
+
+#include <asm/setup.h>
+#include <asm/sysreg.h>
+
+static DEFINE_PER_CPU(struct cpu, cpu_devices);
+
+#ifdef CONFIG_PERFORMANCE_COUNTERS
+
+/*
+ * XXX: If/when a SMP-capable implementation of AVR32 will ever be
+ * made, we must make sure that the code executes on the correct CPU.
+ */
+static ssize_t show_pc0event(struct sys_device *dev, char *buf)
+{
+	unsigned long pccr;
+
+	pccr = sysreg_read(PCCR);
+	return sprintf(buf, "0x%lx\n", (pccr >> 12) & 0x3f);
+}
+static ssize_t store_pc0event(struct sys_device *dev, const char *buf,
+			      size_t count)
+{
+	unsigned long val;
+	char *endp;
+
+	val = simple_strtoul(buf, &endp, 0);
+	if (endp == buf || val > 0x3f)
+		return -EINVAL;
+	val = (val << 12) | (sysreg_read(PCCR) & 0xfffc0fff);
+	sysreg_write(PCCR, val);
+	return count;
+}
+static ssize_t show_pc0count(struct sys_device *dev, char *buf)
+{
+	unsigned long pcnt0;
+
+	pcnt0 = sysreg_read(PCNT0);
+	return sprintf(buf, "%lu\n", pcnt0);
+}
+static ssize_t store_pc0count(struct sys_device *dev, const char *buf,
+			      size_t count)
+{
+	unsigned long val;
+	char *endp;
+
+	val = simple_strtoul(buf, &endp, 0);
+	if (endp == buf)
+		return -EINVAL;
+	sysreg_write(PCNT0, val);
+
+	return count;
+}
+
+static ssize_t show_pc1event(struct sys_device *dev, char *buf)
+{
+	unsigned long pccr;
+
+	pccr = sysreg_read(PCCR);
+	return sprintf(buf, "0x%lx\n", (pccr >> 18) & 0x3f);
+}
+static ssize_t store_pc1event(struct sys_device *dev, const char *buf,
+			      size_t count)
+{
+	unsigned long val;
+	char *endp;
+
+	val = simple_strtoul(buf, &endp, 0);
+	if (endp == buf || val > 0x3f)
+		return -EINVAL;
+	val = (val << 18) | (sysreg_read(PCCR) & 0xff03ffff);
+	sysreg_write(PCCR, val);
+	return count;
+}
+static ssize_t show_pc1count(struct sys_device *dev, char *buf)
+{
+	unsigned long pcnt1;
+
+	pcnt1 = sysreg_read(PCNT1);
+	return sprintf(buf, "%lu\n", pcnt1);
+}
+static ssize_t store_pc1count(struct sys_device *dev, const char *buf,
+			      size_t count)
+{
+	unsigned long val;
+	char *endp;
+
+	val = simple_strtoul(buf, &endp, 0);
+	if (endp == buf)
+		return -EINVAL;
+	sysreg_write(PCNT1, val);
+
+	return count;
+}
+
+static ssize_t show_pccycles(struct sys_device *dev, char *buf)
+{
+	unsigned long pccnt;
+
+	pccnt = sysreg_read(PCCNT);
+	return sprintf(buf, "%lu\n", pccnt);
+}
+static ssize_t store_pccycles(struct sys_device *dev, const char *buf,
+			      size_t count)
+{
+	unsigned long val;
+	char *endp;
+
+	val = simple_strtoul(buf, &endp, 0);
+	if (endp == buf)
+		return -EINVAL;
+	sysreg_write(PCCNT, val);
+
+	return count;
+}
+
+static ssize_t show_pcenable(struct sys_device *dev, char *buf)
+{
+	unsigned long pccr;
+
+	pccr = sysreg_read(PCCR);
+	return sprintf(buf, "%c\n", (pccr & 1)?'1':'0');
+}
+static ssize_t store_pcenable(struct sys_device *dev, const char *buf,
+			      size_t count)
+{
+	unsigned long pccr, val;
+	char *endp;
+
+	val = simple_strtoul(buf, &endp, 0);
+	if (endp == buf)
+		return -EINVAL;
+	if (val)
+		val = 1;
+
+	pccr = sysreg_read(PCCR);
+	pccr = (pccr & ~1UL) | val;
+	sysreg_write(PCCR, pccr);
+
+	return count;
+}
+
+static SYSDEV_ATTR(pc0event, 0600, show_pc0event, store_pc0event);
+static SYSDEV_ATTR(pc0count, 0600, show_pc0count, store_pc0count);
+static SYSDEV_ATTR(pc1event, 0600, show_pc1event, store_pc1event);
+static SYSDEV_ATTR(pc1count, 0600, show_pc1count, store_pc1count);
+static SYSDEV_ATTR(pccycles, 0600, show_pccycles, store_pccycles);
+static SYSDEV_ATTR(pcenable, 0600, show_pcenable, store_pcenable);
+
+#endif /* CONFIG_PERFORMANCE_COUNTERS */
+
+static int __init topology_init(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct cpu *c = &per_cpu(cpu_devices, cpu);
+
+		register_cpu(c, cpu);
+
+#ifdef CONFIG_PERFORMANCE_COUNTERS
+		sysdev_create_file(&c->sysdev, &attr_pc0event);
+		sysdev_create_file(&c->sysdev, &attr_pc0count);
+		sysdev_create_file(&c->sysdev, &attr_pc1event);
+		sysdev_create_file(&c->sysdev, &attr_pc1count);
+		sysdev_create_file(&c->sysdev, &attr_pccycles);
+		sysdev_create_file(&c->sysdev, &attr_pcenable);
+#endif
+	}
+
+	return 0;
+}
+
+subsys_initcall(topology_init);
+
+static const char *cpu_names[] = {
+	"Morgan",
+	"AP7000",
+};
+#define NR_CPU_NAMES ARRAY_SIZE(cpu_names)
+
+static const char *arch_names[] = {
+	"AVR32A",
+	"AVR32B",
+};
+#define NR_ARCH_NAMES ARRAY_SIZE(arch_names)
+
+static const char *mmu_types[] = {
+	"No MMU",
+	"ITLB and DTLB",
+	"Shared TLB",
+	"MPU"
+};
+
+void __init setup_processor(void)
+{
+	unsigned long config0, config1;
+	unsigned cpu_id, cpu_rev, arch_id, arch_rev, mmu_type;
+	unsigned tmp;
+
+	config0 = sysreg_read(CONFIG0); /* 0x0000013e; */
+	config1 = sysreg_read(CONFIG1); /* 0x01f689a2; */
+	cpu_id = config0 >> 24;
+	cpu_rev = (config0 >> 16) & 0xff;
+	arch_id = (config0 >> 13) & 0x07;
+	arch_rev = (config0 >> 10) & 0x07;
+	mmu_type = (config0 >> 7) & 0x03;
+
+	boot_cpu_data.arch_type = arch_id;
+	boot_cpu_data.cpu_type = cpu_id;
+	boot_cpu_data.arch_revision = arch_rev;
+	boot_cpu_data.cpu_revision = cpu_rev;
+	boot_cpu_data.tlb_config = mmu_type;
+
+	tmp = (config1 >> 13) & 0x07;
+	if (tmp) {
+		boot_cpu_data.icache.ways = 1 << ((config1 >> 10) & 0x07);
+		boot_cpu_data.icache.sets = 1 << ((config1 >> 16) & 0x0f);
+		boot_cpu_data.icache.linesz = 1 << (tmp + 1);
+	}
+	tmp = (config1 >> 3) & 0x07;
+	if (tmp) {
+		boot_cpu_data.dcache.ways = 1 << (config1 & 0x07);
+		boot_cpu_data.dcache.sets = 1 << ((config1 >> 6) & 0x0f);
+		boot_cpu_data.dcache.linesz = 1 << (tmp + 1);
+	}
+
+	if ((cpu_id >= NR_CPU_NAMES) || (arch_id >= NR_ARCH_NAMES)) {
+		printk ("Unknown CPU configuration (ID %02x, arch %02x), "
+			"continuing anyway...\n",
+			cpu_id, arch_id);
+		return;
+	}
+
+	printk ("CPU: %s [%02x] revision %d (%s revision %d)\n",
+		cpu_names[cpu_id], cpu_id, cpu_rev,
+		arch_names[arch_id], arch_rev);
+	printk ("CPU: MMU configuration: %s\n", mmu_types[mmu_type]);
+	printk ("CPU: features:");
+	if (config0 & (1 << 6))
+		printk(" fpu");
+	if (config0 & (1 << 5))
+		printk(" java");
+	if (config0 & (1 << 4))
+		printk(" perfctr");
+	if (config0 & (1 << 3))
+		printk(" ocd");
+	printk("\n");
+}
+
+#ifdef CONFIG_PROC_FS
+static int c_show(struct seq_file *m, void *v)
+{
+	unsigned int icache_size, dcache_size;
+	unsigned int cpu = smp_processor_id();
+
+	icache_size = boot_cpu_data.icache.ways *
+		boot_cpu_data.icache.sets *
+		boot_cpu_data.icache.linesz;
+	dcache_size = boot_cpu_data.dcache.ways *
+		boot_cpu_data.dcache.sets *
+		boot_cpu_data.dcache.linesz;
+
+	seq_printf(m, "processor\t: %d\n", cpu);
+
+	if (boot_cpu_data.arch_type < NR_ARCH_NAMES)
+		seq_printf(m, "cpu family\t: %s revision %d\n",
+			   arch_names[boot_cpu_data.arch_type],
+			   boot_cpu_data.arch_revision);
+	if (boot_cpu_data.cpu_type < NR_CPU_NAMES)
+		seq_printf(m, "cpu type\t: %s revision %d\n",
+			   cpu_names[boot_cpu_data.cpu_type],
+			   boot_cpu_data.cpu_revision);
+
+	seq_printf(m, "i-cache\t\t: %dK (%u ways x %u sets x %u)\n",
+		   icache_size >> 10,
+		   boot_cpu_data.icache.ways,
+		   boot_cpu_data.icache.sets,
+		   boot_cpu_data.icache.linesz);
+	seq_printf(m, "d-cache\t\t: %dK (%u ways x %u sets x %u)\n",
+		   dcache_size >> 10,
+		   boot_cpu_data.dcache.ways,
+		   boot_cpu_data.dcache.sets,
+		   boot_cpu_data.dcache.linesz);
+	seq_printf(m, "bogomips\t: %lu.%02lu\n",
+		   boot_cpu_data.loops_per_jiffy / (500000/HZ),
+		   (boot_cpu_data.loops_per_jiffy / (5000/HZ)) % 100);
+
+	return 0;
+}
+
+static void *c_start(struct seq_file *m, loff_t *pos)
+{
+	return *pos < 1 ? (void *)1 : NULL;
+}
+
+static void *c_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	++*pos;
+	return NULL;
+}
+
+static void c_stop(struct seq_file *m, void *v)
+{
+
+}
+
+struct seq_operations cpuinfo_op = {
+	.start	= c_start,
+	.next	= c_next,
+	.stop	= c_stop,
+	.show	= c_show
+};
+#endif /* CONFIG_PROC_FS */
diff --git a/arch/avr32/kernel/entry-avr32b.S b/arch/avr32/kernel/entry-avr32b.S
new file mode 100644
index 00000000000..eeb66792bc3
--- /dev/null
+++ b/arch/avr32/kernel/entry-avr32b.S
@@ -0,0 +1,678 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/*
+ * This file contains the low-level entry-points into the kernel, that is,
+ * exception handlers, debug trap handlers, interrupt handlers and the
+ * system call handler.
+ */
+#include <linux/errno.h>
+
+#include <asm/asm.h>
+#include <asm/hardirq.h>
+#include <asm/irq.h>
+#include <asm/ocd.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/ptrace.h>
+#include <asm/sysreg.h>
+#include <asm/thread_info.h>
+#include <asm/unistd.h>
+
+#ifdef CONFIG_PREEMPT
+# define preempt_stop		mask_interrupts
+#else
+# define preempt_stop
+# define fault_resume_kernel	fault_restore_all
+#endif
+
+#define __MASK(x)	((1 << (x)) - 1)
+#define IRQ_MASK	((__MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT) | \
+			 (__MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT))
+
+	.section .ex.text,"ax",@progbits
+	.align	2
+exception_vectors:
+	bral	handle_critical
+	.align	2
+	bral	handle_critical
+	.align	2
+	bral	do_bus_error_write
+	.align	2
+	bral	do_bus_error_read
+	.align	2
+	bral	do_nmi_ll
+	.align	2
+	bral	handle_address_fault
+	.align	2
+	bral	handle_protection_fault
+	.align	2
+	bral	handle_debug
+	.align	2
+	bral	do_illegal_opcode_ll
+	.align	2
+	bral	do_illegal_opcode_ll
+	.align	2
+	bral	do_illegal_opcode_ll
+	.align	2
+	bral	do_fpe_ll
+	.align	2
+	bral	do_illegal_opcode_ll
+	.align	2
+	bral	handle_address_fault
+	.align	2
+	bral	handle_address_fault
+	.align	2
+	bral	handle_protection_fault
+	.align	2
+	bral	handle_protection_fault
+	.align	2
+	bral	do_dtlb_modified
+
+	/*
+	 * r0 :	PGD/PT/PTE
+	 * r1 : Offending address
+	 * r2 : Scratch register
+	 * r3 : Cause (5, 12 or 13)
+	 */
+#define	tlbmiss_save	pushm	r0-r3
+#define tlbmiss_restore	popm	r0-r3
+
+	.section .tlbx.ex.text,"ax",@progbits
+	.global	itlb_miss
+itlb_miss:
+	tlbmiss_save
+	rjmp	tlb_miss_common
+
+	.section .tlbr.ex.text,"ax",@progbits
+dtlb_miss_read:
+	tlbmiss_save
+	rjmp	tlb_miss_common
+
+	.section .tlbw.ex.text,"ax",@progbits
+dtlb_miss_write:
+	tlbmiss_save
+
+	.global	tlb_miss_common
+tlb_miss_common:
+	mfsr	r0, SYSREG_PTBR
+	mfsr	r1, SYSREG_TLBEAR
+
+	/* Is it the vmalloc space? */
+	bld	r1, 31
+	brcs	handle_vmalloc_miss
+
+	/* First level lookup */
+pgtbl_lookup:
+	lsr	r2, r1, PGDIR_SHIFT
+	ld.w	r0, r0[r2 << 2]
+	bld	r0, _PAGE_BIT_PRESENT
+	brcc	page_table_not_present
+
+	/* TODO: Check access rights on page table if necessary */
+
+	/* Translate to virtual address in P1. */
+	andl	r0, 0xf000
+	sbr	r0, 31
+
+	/* Second level lookup */
+	lsl	r1, (32 - PGDIR_SHIFT)
+	lsr	r1, (32 - PGDIR_SHIFT) + PAGE_SHIFT
+	add	r2, r0, r1 << 2
+	ld.w	r1, r2[0]
+	bld	r1, _PAGE_BIT_PRESENT
+	brcc	page_not_present
+
+	/* Mark the page as accessed */
+	sbr	r1, _PAGE_BIT_ACCESSED
+	st.w	r2[0], r1
+
+	/* Drop software flags */
+	andl	r1, _PAGE_FLAGS_HARDWARE_MASK & 0xffff
+	mtsr	SYSREG_TLBELO, r1
+
+	/* Figure out which entry we want to replace */
+	mfsr	r0, SYSREG_TLBARLO
+	clz	r2, r0
+	brcc	1f
+	mov	r1, -1			/* All entries have been accessed, */
+	mtsr	SYSREG_TLBARLO, r1	/* so reset TLBAR */
+	mov	r2, 0			/* and start at 0 */
+1:	mfsr	r1, SYSREG_MMUCR
+	lsl	r2, 14
+	andl	r1, 0x3fff, COH
+	or	r1, r2
+	mtsr	SYSREG_MMUCR, r1
+
+	tlbw
+
+	tlbmiss_restore
+	rete
+
+handle_vmalloc_miss:
+	/* Simply do the lookup in init's page table */
+	mov	r0, lo(swapper_pg_dir)
+	orh	r0, hi(swapper_pg_dir)
+	rjmp	pgtbl_lookup
+
+
+	/* ---                    System Call                    --- */
+
+	.section .scall.text,"ax",@progbits
+system_call:
+	pushm	r12		/* r12_orig */
+	stmts	--sp, r0-lr
+	zero_fp
+	mfsr	r0, SYSREG_RAR_SUP
+	mfsr	r1, SYSREG_RSR_SUP
+	stm	--sp, r0-r1
+
+	/* check for syscall tracing */
+	get_thread_info r0
+	ld.w	r1, r0[TI_flags]
+	bld	r1, TIF_SYSCALL_TRACE
+	brcs	syscall_trace_enter
+
+syscall_trace_cont:
+	cp.w	r8, NR_syscalls
+	brhs	syscall_badsys
+
+	lddpc   lr, syscall_table_addr
+	ld.w    lr, lr[r8 << 2]
+	mov	r8, r5		/* 5th argument (6th is pushed by stub) */
+	icall   lr
+
+	.global	syscall_return
+syscall_return:
+	get_thread_info r0
+	mask_interrupts		/* make sure we don't miss an interrupt
+				   setting need_resched or sigpending
+				   between sampling and the rets */
+
+	/* Store the return value so that the correct value is loaded below */
+	stdsp   sp[REG_R12], r12
+
+	ld.w	r1, r0[TI_flags]
+	andl	r1, _TIF_ALLWORK_MASK, COH
+	brne	syscall_exit_work
+
+syscall_exit_cont:
+	popm	r8-r9
+	mtsr	SYSREG_RAR_SUP, r8
+	mtsr	SYSREG_RSR_SUP, r9
+	ldmts	sp++, r0-lr
+	sub	sp, -4		/* r12_orig */
+	rets
+
+	.align	2
+syscall_table_addr:
+	.long   sys_call_table
+
+syscall_badsys:
+	mov	r12, -ENOSYS
+	rjmp	syscall_return
+
+	.global ret_from_fork
+ret_from_fork:
+	rcall   schedule_tail
+
+	/* check for syscall tracing */
+	get_thread_info r0
+	ld.w	r1, r0[TI_flags]
+	andl	r1, _TIF_ALLWORK_MASK, COH
+	brne	syscall_exit_work
+	rjmp    syscall_exit_cont
+
+syscall_trace_enter:
+	pushm	r8-r12
+	rcall	syscall_trace
+	popm	r8-r12
+	rjmp	syscall_trace_cont
+
+syscall_exit_work:
+	bld	r1, TIF_SYSCALL_TRACE
+	brcc	1f
+	unmask_interrupts
+	rcall	syscall_trace
+	mask_interrupts
+	ld.w	r1, r0[TI_flags]
+
+1:	bld	r1, TIF_NEED_RESCHED
+	brcc	2f
+	unmask_interrupts
+	rcall	schedule
+	mask_interrupts
+	ld.w	r1, r0[TI_flags]
+	rjmp	1b
+
+2:	mov	r2, _TIF_SIGPENDING | _TIF_RESTORE_SIGMASK
+	tst	r1, r2
+	breq	3f
+	unmask_interrupts
+	mov	r12, sp
+	mov	r11, r0
+	rcall	do_notify_resume
+	mask_interrupts
+	ld.w	r1, r0[TI_flags]
+	rjmp	1b
+
+3:	bld	r1, TIF_BREAKPOINT
+	brcc	syscall_exit_cont
+	mfsr	r3, SYSREG_TLBEHI
+	lddsp	r2, sp[REG_PC]
+	andl	r3, 0xff, COH
+	lsl	r3, 1
+	sbr	r3, 30
+	sbr	r3, 0
+	mtdr	DBGREG_BWA2A, r2
+	mtdr	DBGREG_BWC2A, r3
+	rjmp	syscall_exit_cont
+
+
+	/* The slow path of the TLB miss handler */
+page_table_not_present:
+page_not_present:
+	tlbmiss_restore
+	sub	sp, 4
+	stmts	--sp, r0-lr
+	rcall	save_full_context_ex
+	mfsr	r12, SYSREG_ECR
+	mov	r11, sp
+	rcall	do_page_fault
+	rjmp	ret_from_exception
+
+	/* This function expects to find offending PC in SYSREG_RAR_EX */
+save_full_context_ex:
+	mfsr	r8, SYSREG_RSR_EX
+	mov	r12, r8
+	andh	r8, (MODE_MASK >> 16), COH
+	mfsr	r11, SYSREG_RAR_EX
+	brne	2f
+
+1:	pushm	r11, r12	/* PC and SR */
+	unmask_exceptions
+	ret	r12
+
+2:	sub	r10, sp, -(FRAME_SIZE_FULL - REG_LR)
+	stdsp	sp[4], r10	/* replace saved SP */
+	rjmp	1b
+
+	/* Low-level exception handlers */
+handle_critical:
+	pushm	r12
+	pushm	r0-r12
+	rcall	save_full_context_ex
+	mfsr	r12, SYSREG_ECR
+	mov	r11, sp
+	rcall	do_critical_exception
+
+	/* We should never get here... */
+bad_return:
+	sub	r12, pc, (. - 1f)
+	bral	panic
+	.align	2
+1:	.asciz	"Return from critical exception!"
+
+	.align	1
+do_bus_error_write:
+	sub	sp, 4
+	stmts	--sp, r0-lr
+	rcall	save_full_context_ex
+	mov	r11, 1
+	rjmp	1f
+
+do_bus_error_read:
+	sub	sp, 4
+	stmts	--sp, r0-lr
+	rcall	save_full_context_ex
+	mov	r11, 0
+1:	mfsr	r12, SYSREG_BEAR
+	mov	r10, sp
+	rcall	do_bus_error
+	rjmp	ret_from_exception
+
+	.align	1
+do_nmi_ll:
+	sub	sp, 4
+	stmts	--sp, r0-lr
+	/* FIXME: Make sure RAR_NMI and RSR_NMI are pushed instead of *_EX */
+	rcall	save_full_context_ex
+	mfsr	r12, SYSREG_ECR
+	mov	r11, sp
+	rcall	do_nmi
+	rjmp	bad_return
+
+handle_address_fault:
+	sub	sp, 4
+	stmts	--sp, r0-lr
+	rcall	save_full_context_ex
+	mfsr	r12, SYSREG_ECR
+	mov	r11, sp
+	rcall	do_address_exception
+	rjmp	ret_from_exception
+
+handle_protection_fault:
+	sub	sp, 4
+	stmts	--sp, r0-lr
+	rcall	save_full_context_ex
+	mfsr	r12, SYSREG_ECR
+	mov	r11, sp
+	rcall	do_page_fault
+	rjmp	ret_from_exception
+
+	.align	1
+do_illegal_opcode_ll:
+	sub	sp, 4
+	stmts	--sp, r0-lr
+	rcall	save_full_context_ex
+	mfsr	r12, SYSREG_ECR
+	mov	r11, sp
+	rcall	do_illegal_opcode
+	rjmp	ret_from_exception
+
+do_dtlb_modified:
+	pushm	r0-r3
+	mfsr	r1, SYSREG_TLBEAR
+	mfsr	r0, SYSREG_PTBR
+	lsr	r2, r1, PGDIR_SHIFT
+	ld.w	r0, r0[r2 << 2]
+	lsl	r1, (32 - PGDIR_SHIFT)
+	lsr	r1, (32 - PGDIR_SHIFT) + PAGE_SHIFT
+
+	/* Translate to virtual address in P1 */
+	andl	r0, 0xf000
+	sbr	r0, 31
+	add	r2, r0, r1 << 2
+	ld.w	r3, r2[0]
+	sbr	r3, _PAGE_BIT_DIRTY
+	mov	r0, r3
+	st.w	r2[0], r3
+
+	/* The page table is up-to-date. Update the TLB entry as well */
+	andl	r0, lo(_PAGE_FLAGS_HARDWARE_MASK)
+	mtsr	SYSREG_TLBELO, r0
+
+	/* MMUCR[DRP] is updated automatically, so let's go... */
+	tlbw
+
+	popm	r0-r3
+	rete
+
+do_fpe_ll:
+	sub	sp, 4
+	stmts	--sp, r0-lr
+	rcall	save_full_context_ex
+	unmask_interrupts
+	mov	r12, 26
+	mov	r11, sp
+	rcall	do_fpe
+	rjmp	ret_from_exception
+
+ret_from_exception:
+	mask_interrupts
+	lddsp	r4, sp[REG_SR]
+	andh	r4, (MODE_MASK >> 16), COH
+	brne	fault_resume_kernel
+
+	get_thread_info r0
+	ld.w	r1, r0[TI_flags]
+	andl	r1, _TIF_WORK_MASK, COH
+	brne	fault_exit_work
+
+fault_resume_user:
+	popm	r8-r9
+	mask_exceptions
+	mtsr	SYSREG_RAR_EX, r8
+	mtsr	SYSREG_RSR_EX, r9
+	ldmts	sp++, r0-lr
+	sub	sp, -4
+	rete
+
+fault_resume_kernel:
+#ifdef CONFIG_PREEMPT
+	get_thread_info	r0
+	ld.w	r2, r0[TI_preempt_count]
+	cp.w	r2, 0
+	brne	1f
+	ld.w	r1, r0[TI_flags]
+	bld	r1, TIF_NEED_RESCHED
+	brcc	1f
+	lddsp	r4, sp[REG_SR]
+	bld	r4, SYSREG_GM_OFFSET
+	brcs	1f
+	rcall	preempt_schedule_irq
+1:
+#endif
+
+	popm	r8-r9
+	mask_exceptions
+	mfsr	r1, SYSREG_SR
+	mtsr	SYSREG_RAR_EX, r8
+	mtsr	SYSREG_RSR_EX, r9
+	popm	lr
+	sub	sp, -4		/* ignore SP */
+	popm	r0-r12
+	sub	sp, -4		/* ignore r12_orig */
+	rete
+
+irq_exit_work:
+	/* Switch to exception mode so that we can share the same code. */
+	mfsr	r8, SYSREG_SR
+	cbr	r8, SYSREG_M0_OFFSET
+	orh	r8, hi(SYSREG_BIT(M1) | SYSREG_BIT(M2))
+	mtsr	SYSREG_SR, r8
+	sub	pc, -2
+	get_thread_info r0
+	ld.w	r1, r0[TI_flags]
+
+fault_exit_work:
+	bld	r1, TIF_NEED_RESCHED
+	brcc	1f
+	unmask_interrupts
+	rcall	schedule
+	mask_interrupts
+	ld.w	r1, r0[TI_flags]
+	rjmp	fault_exit_work
+
+1:	mov	r2, _TIF_SIGPENDING | _TIF_RESTORE_SIGMASK
+	tst	r1, r2
+	breq	2f
+	unmask_interrupts
+	mov	r12, sp
+	mov	r11, r0
+	rcall	do_notify_resume
+	mask_interrupts
+	ld.w	r1, r0[TI_flags]
+	rjmp	fault_exit_work
+
+2:	bld	r1, TIF_BREAKPOINT
+	brcc	fault_resume_user
+	mfsr	r3, SYSREG_TLBEHI
+	lddsp	r2, sp[REG_PC]
+	andl	r3, 0xff, COH
+	lsl	r3, 1
+	sbr	r3, 30
+	sbr	r3, 0
+	mtdr	DBGREG_BWA2A, r2
+	mtdr	DBGREG_BWC2A, r3
+	rjmp	fault_resume_user
+
+	/* If we get a debug trap from privileged context we end up here */
+handle_debug_priv:
+	/* Fix up LR and SP in regs. r11 contains the mode we came from */
+	mfsr	r8, SYSREG_SR
+	mov	r9, r8
+	andh	r8, hi(~MODE_MASK)
+	or	r8, r11
+	mtsr	SYSREG_SR, r8
+	sub	pc, -2
+	stdsp	sp[REG_LR], lr
+	mtsr	SYSREG_SR, r9
+	sub	pc, -2
+	sub	r10, sp, -FRAME_SIZE_FULL
+	stdsp	sp[REG_SP], r10
+	mov	r12, sp
+	rcall	do_debug_priv
+
+	/* Now, put everything back */
+	ssrf	SR_EM_BIT
+	popm	r10, r11
+	mtsr	SYSREG_RAR_DBG, r10
+	mtsr	SYSREG_RSR_DBG, r11
+	mfsr	r8, SYSREG_SR
+	mov	r9, r8
+	andh	r8, hi(~MODE_MASK)
+	andh	r11, hi(MODE_MASK)
+	or	r8, r11
+	mtsr	SYSREG_SR, r8
+	sub	pc, -2
+	popm	lr
+	mtsr	SYSREG_SR, r9
+	sub	pc, -2
+	sub	sp, -4		/* skip SP */
+	popm	r0-r12
+	sub	sp, -4
+	retd
+
+	/*
+	 * At this point, everything is masked, that is, interrupts,
+	 * exceptions and debugging traps. We might get called from
+	 * interrupt or exception context in some rare cases, but this
+	 * will be taken care of by do_debug(), so we're not going to
+	 * do a 100% correct context save here.
+	 */
+handle_debug:
+	sub	sp, 4		/* r12_orig */
+	stmts	--sp, r0-lr
+	mfsr	r10, SYSREG_RAR_DBG
+	mfsr	r11, SYSREG_RSR_DBG
+	unmask_exceptions
+	pushm	r10,r11
+	andh	r11, (MODE_MASK >> 16), COH
+	brne	handle_debug_priv
+
+	mov	r12, sp
+	rcall	do_debug
+
+	lddsp	r10, sp[REG_SR]
+	andh	r10, (MODE_MASK >> 16), COH
+	breq	debug_resume_user
+
+debug_restore_all:
+	popm	r10,r11
+	mask_exceptions
+	mtsr	SYSREG_RSR_DBG, r11
+	mtsr	SYSREG_RAR_DBG, r10
+	ldmts	sp++, r0-lr
+	sub	sp, -4
+	retd
+
+debug_resume_user:
+	get_thread_info r0
+	mask_interrupts
+
+	ld.w	r1, r0[TI_flags]
+	andl	r1, _TIF_DBGWORK_MASK, COH
+	breq	debug_restore_all
+
+1:	bld	r1, TIF_NEED_RESCHED
+	brcc	2f
+	unmask_interrupts
+	rcall	schedule
+	mask_interrupts
+	ld.w	r1, r0[TI_flags]
+	rjmp	1b
+
+2:	mov	r2, _TIF_SIGPENDING | _TIF_RESTORE_SIGMASK
+	tst	r1, r2
+	breq	3f
+	unmask_interrupts
+	mov	r12, sp
+	mov	r11, r0
+	rcall	do_notify_resume
+	mask_interrupts
+	ld.w	r1, r0[TI_flags]
+	rjmp	1b
+
+3:	bld	r1, TIF_SINGLE_STEP
+	brcc	debug_restore_all
+	mfdr	r2, DBGREG_DC
+	sbr	r2, DC_SS_BIT
+	mtdr	DBGREG_DC, r2
+	rjmp	debug_restore_all
+
+	.set	rsr_int0,	SYSREG_RSR_INT0
+	.set	rsr_int1,	SYSREG_RSR_INT1
+	.set	rsr_int2,	SYSREG_RSR_INT2
+	.set	rsr_int3,	SYSREG_RSR_INT3
+	.set	rar_int0,	SYSREG_RAR_INT0
+	.set	rar_int1,	SYSREG_RAR_INT1
+	.set	rar_int2,	SYSREG_RAR_INT2
+	.set	rar_int3,	SYSREG_RAR_INT3
+
+	.macro	IRQ_LEVEL level
+	.type	irq_level\level, @function
+irq_level\level:
+	sub	sp, 4		/* r12_orig */
+	stmts	--sp,r0-lr
+	mfsr	r8, rar_int\level
+	mfsr	r9, rsr_int\level
+	pushm	r8-r9
+
+	mov	r11, sp
+	mov	r12, \level
+
+	rcall	do_IRQ
+
+	lddsp	r4, sp[REG_SR]
+	andh	r4, (MODE_MASK >> 16), COH
+#ifdef CONFIG_PREEMPT
+	brne	2f
+#else
+	brne	1f
+#endif
+
+	get_thread_info	r0
+	ld.w	r1, r0[TI_flags]
+	andl	r1, _TIF_WORK_MASK, COH
+	brne	irq_exit_work
+
+1:	popm	r8-r9
+	mtsr	rar_int\level, r8
+	mtsr	rsr_int\level, r9
+	ldmts	sp++,r0-lr
+	sub	sp, -4		/* ignore r12_orig */
+	rete
+
+#ifdef CONFIG_PREEMPT
+2:
+	get_thread_info	r0
+	ld.w	r2, r0[TI_preempt_count]
+	cp.w	r2, 0
+	brne	1b
+	ld.w	r1, r0[TI_flags]
+	bld	r1, TIF_NEED_RESCHED
+	brcc	1b
+	lddsp	r4, sp[REG_SR]
+	bld	r4, SYSREG_GM_OFFSET
+	brcs	1b
+	rcall	preempt_schedule_irq
+	rjmp	1b
+#endif
+	.endm
+
+	.section .irq.text,"ax",@progbits
+
+	.global	irq_level0
+	.global	irq_level1
+	.global	irq_level2
+	.global	irq_level3
+	IRQ_LEVEL 0
+	IRQ_LEVEL 1
+	IRQ_LEVEL 2
+	IRQ_LEVEL 3
diff --git a/arch/avr32/kernel/head.S b/arch/avr32/kernel/head.S
new file mode 100644
index 00000000000..773b7ad87be
--- /dev/null
+++ b/arch/avr32/kernel/head.S
@@ -0,0 +1,45 @@
+/*
+ * Non-board-specific low-level startup code
+ *
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/linkage.h>
+
+#include <asm/page.h>
+#include <asm/thread_info.h>
+#include <asm/sysreg.h>
+
+	.section .init.text,"ax"
+	.global kernel_entry
+kernel_entry:
+	/* Initialize status register */
+	lddpc   r0, init_sr
+	mtsr	SYSREG_SR, r0
+
+	/* Set initial stack pointer */
+	lddpc   sp, stack_addr
+	sub	sp, -THREAD_SIZE
+
+#ifdef CONFIG_FRAME_POINTER
+	/* Mark last stack frame */
+	mov	lr, 0
+	mov	r7, 0
+#endif
+
+	/* Set up the PIO, SDRAM controller, early printk, etc. */
+	rcall	board_early_init
+
+	/* Start the show */
+	lddpc   pc, kernel_start_addr
+
+	.align  2
+init_sr:
+	.long   0x007f0000	/* Supervisor mode, everything masked */
+stack_addr:
+	.long   init_thread_union
+kernel_start_addr:
+	.long   start_kernel
diff --git a/arch/avr32/kernel/init_task.c b/arch/avr32/kernel/init_task.c
new file mode 100644
index 00000000000..effcacf9d1a
--- /dev/null
+++ b/arch/avr32/kernel/init_task.c
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/init_task.h>
+#include <linux/mqueue.h>
+
+#include <asm/pgtable.h>
+
+static struct fs_struct init_fs = INIT_FS;
+static struct files_struct init_files = INIT_FILES;
+static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
+static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
+struct mm_struct init_mm = INIT_MM(init_mm);
+
+EXPORT_SYMBOL(init_mm);
+
+/*
+ * Initial thread structure. Must be aligned on an 8192-byte boundary.
+ */
+union thread_union init_thread_union
+	__attribute__((__section__(".data.init_task"))) =
+		{ INIT_THREAD_INFO(init_task) };
+
+/*
+ * Initial task structure.
+ *
+ * All other task structs will be allocated on slabs in fork.c
+ */
+struct task_struct init_task = INIT_TASK(init_task);
+
+EXPORT_SYMBOL(init_task);
diff --git a/arch/avr32/kernel/irq.c b/arch/avr32/kernel/irq.c
new file mode 100644
index 00000000000..856f3548e66
--- /dev/null
+++ b/arch/avr32/kernel/irq.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * Based on arch/i386/kernel/irq.c
+ *   Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This file contains the code used by various IRQ handling routines:
+ * asking for different IRQ's should be done through these routines
+ * instead of just grabbing them. Thus setups with different IRQ numbers
+ * shouldn't result in any weird surprises, and installing new handlers
+ * should be easier.
+ *
+ * IRQ's are in fact implemented a bit like signal handlers for the kernel.
+ * Naturally it's not a 1:1 relation, but there are similarities.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/kernel_stat.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/sysdev.h>
+
+/*
+ * 'what should we do if we get a hw irq event on an illegal vector'.
+ * each architecture has to answer this themselves.
+ */
+void ack_bad_irq(unsigned int irq)
+{
+	printk("unexpected IRQ %u\n", irq);
+}
+
+#ifdef CONFIG_PROC_FS
+int show_interrupts(struct seq_file *p, void *v)
+{
+	int i = *(loff_t *)v, cpu;
+	struct irqaction *action;
+	unsigned long flags;
+
+	if (i == 0) {
+		seq_puts(p, "           ");
+		for_each_online_cpu(cpu)
+			seq_printf(p, "CPU%d       ", cpu);
+		seq_putc(p, '\n');
+	}
+
+	if (i < NR_IRQS) {
+		spin_lock_irqsave(&irq_desc[i].lock, flags);
+		action = irq_desc[i].action;
+		if (!action)
+			goto unlock;
+
+		seq_printf(p, "%3d: ", i);
+		for_each_online_cpu(cpu)
+			seq_printf(p, "%10u ", kstat_cpu(cpu).irqs[i]);
+		seq_printf(p, "  %s", action->name);
+		for (action = action->next; action; action = action->next)
+			seq_printf(p, ", %s", action->name);
+
+		seq_putc(p, '\n');
+	unlock:
+		spin_unlock_irqrestore(&irq_desc[i].lock, flags);
+	}
+
+	return 0;
+}
+#endif
diff --git a/arch/avr32/kernel/kprobes.c b/arch/avr32/kernel/kprobes.c
new file mode 100644
index 00000000000..6caf9e8d808
--- /dev/null
+++ b/arch/avr32/kernel/kprobes.c
@@ -0,0 +1,270 @@
+/*
+ *  Kernel Probes (KProbes)
+ *
+ * Copyright (C) 2005-2006 Atmel Corporation
+ *
+ * Based on arch/ppc64/kernel/kprobes.c
+ *  Copyright (C) IBM Corporation, 2002, 2004
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kprobes.h>
+#include <linux/ptrace.h>
+
+#include <asm/cacheflush.h>
+#include <asm/kdebug.h>
+#include <asm/ocd.h>
+
+DEFINE_PER_CPU(struct kprobe *, current_kprobe);
+static unsigned long kprobe_status;
+static struct pt_regs jprobe_saved_regs;
+
+int __kprobes arch_prepare_kprobe(struct kprobe *p)
+{
+	int ret = 0;
+
+	if ((unsigned long)p->addr & 0x01) {
+		printk("Attempt to register kprobe at an unaligned address\n");
+		ret = -EINVAL;
+	}
+
+	/* XXX: Might be a good idea to check if p->addr is a valid
+	 * kernel address as well... */
+
+	if (!ret) {
+		pr_debug("copy kprobe at %p\n", p->addr);
+		memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
+		p->opcode = *p->addr;
+	}
+
+	return ret;
+}
+
+void __kprobes arch_arm_kprobe(struct kprobe *p)
+{
+	pr_debug("arming kprobe at %p\n", p->addr);
+	*p->addr = BREAKPOINT_INSTRUCTION;
+	flush_icache_range((unsigned long)p->addr,
+			   (unsigned long)p->addr + sizeof(kprobe_opcode_t));
+}
+
+void __kprobes arch_disarm_kprobe(struct kprobe *p)
+{
+	pr_debug("disarming kprobe at %p\n", p->addr);
+	*p->addr = p->opcode;
+	flush_icache_range((unsigned long)p->addr,
+			   (unsigned long)p->addr + sizeof(kprobe_opcode_t));
+}
+
+static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
+{
+	unsigned long dc;
+
+	pr_debug("preparing to singlestep over %p (PC=%08lx)\n",
+		 p->addr, regs->pc);
+
+	BUG_ON(!(sysreg_read(SR) & SYSREG_BIT(SR_D)));
+
+	dc = __mfdr(DBGREG_DC);
+	dc |= DC_SS;
+	__mtdr(DBGREG_DC, dc);
+
+	/*
+	 * We must run the instruction from its original location
+	 * since it may actually reference PC.
+	 *
+	 * TODO: Do the instruction replacement directly in icache.
+	 */
+	*p->addr = p->opcode;
+	flush_icache_range((unsigned long)p->addr,
+			   (unsigned long)p->addr + sizeof(kprobe_opcode_t));
+}
+
+static void __kprobes resume_execution(struct kprobe *p, struct pt_regs *regs)
+{
+	unsigned long dc;
+
+	pr_debug("resuming execution at PC=%08lx\n", regs->pc);
+
+	dc = __mfdr(DBGREG_DC);
+	dc &= ~DC_SS;
+	__mtdr(DBGREG_DC, dc);
+
+	*p->addr = BREAKPOINT_INSTRUCTION;
+	flush_icache_range((unsigned long)p->addr,
+			   (unsigned long)p->addr + sizeof(kprobe_opcode_t));
+}
+
+static void __kprobes set_current_kprobe(struct kprobe *p)
+{
+	__get_cpu_var(current_kprobe) = p;
+}
+
+static int __kprobes kprobe_handler(struct pt_regs *regs)
+{
+	struct kprobe *p;
+	void *addr = (void *)regs->pc;
+	int ret = 0;
+
+	pr_debug("kprobe_handler: kprobe_running=%d\n",
+		 kprobe_running());
+
+	/*
+	 * We don't want to be preempted for the entire
+	 * duration of kprobe processing
+	 */
+	preempt_disable();
+
+	/* Check that we're not recursing */
+	if (kprobe_running()) {
+		p = get_kprobe(addr);
+		if (p) {
+			if (kprobe_status == KPROBE_HIT_SS) {
+				printk("FIXME: kprobe hit while single-stepping!\n");
+				goto no_kprobe;
+			}
+
+			printk("FIXME: kprobe hit while handling another kprobe\n");
+			goto no_kprobe;
+		} else {
+			p = kprobe_running();
+			if (p->break_handler && p->break_handler(p, regs))
+				goto ss_probe;
+		}
+		/* If it's not ours, can't be delete race, (we hold lock). */
+		goto no_kprobe;
+	}
+
+	p = get_kprobe(addr);
+	if (!p)
+		goto no_kprobe;
+
+	kprobe_status = KPROBE_HIT_ACTIVE;
+	set_current_kprobe(p);
+	if (p->pre_handler && p->pre_handler(p, regs))
+		/* handler has already set things up, so skip ss setup */
+		return 1;
+
+ss_probe:
+	prepare_singlestep(p, regs);
+	kprobe_status = KPROBE_HIT_SS;
+	return 1;
+
+no_kprobe:
+	return ret;
+}
+
+static int __kprobes post_kprobe_handler(struct pt_regs *regs)
+{
+	struct kprobe *cur = kprobe_running();
+
+	pr_debug("post_kprobe_handler, cur=%p\n", cur);
+
+	if (!cur)
+		return 0;
+
+	if (cur->post_handler) {
+		kprobe_status = KPROBE_HIT_SSDONE;
+		cur->post_handler(cur, regs, 0);
+	}
+
+	resume_execution(cur, regs);
+	reset_current_kprobe();
+	preempt_enable_no_resched();
+
+	return 1;
+}
+
+static int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
+{
+	struct kprobe *cur = kprobe_running();
+
+	pr_debug("kprobe_fault_handler: trapnr=%d\n", trapnr);
+
+	if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
+		return 1;
+
+	if (kprobe_status & KPROBE_HIT_SS) {
+		resume_execution(cur, regs);
+		preempt_enable_no_resched();
+	}
+	return 0;
+}
+
+/*
+ * Wrapper routine to for handling exceptions.
+ */
+int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
+				       unsigned long val, void *data)
+{
+	struct die_args *args = (struct die_args *)data;
+	int ret = NOTIFY_DONE;
+
+	pr_debug("kprobe_exceptions_notify: val=%lu, data=%p\n",
+		 val, data);
+
+	switch (val) {
+	case DIE_BREAKPOINT:
+		if (kprobe_handler(args->regs))
+			ret = NOTIFY_STOP;
+		break;
+	case DIE_SSTEP:
+		if (post_kprobe_handler(args->regs))
+			ret = NOTIFY_STOP;
+		break;
+	case DIE_FAULT:
+		if (kprobe_running()
+		    && kprobe_fault_handler(args->regs, args->trapnr))
+			ret = NOTIFY_STOP;
+		break;
+	default:
+		break;
+	}
+
+	return ret;
+}
+
+int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
+{
+	struct jprobe *jp = container_of(p, struct jprobe, kp);
+
+	memcpy(&jprobe_saved_regs, regs, sizeof(struct pt_regs));
+
+	/*
+	 * TODO: We should probably save some of the stack here as
+	 * well, since gcc may pass arguments on the stack for certain
+	 * functions (lots of arguments, large aggregates, varargs)
+	 */
+
+	/* setup return addr to the jprobe handler routine */
+	regs->pc = (unsigned long)jp->entry;
+	return 1;
+}
+
+void __kprobes jprobe_return(void)
+{
+	asm volatile("breakpoint" ::: "memory");
+}
+
+int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
+{
+	/*
+	 * FIXME - we should ideally be validating that we got here 'cos
+	 * of the "trap" in jprobe_return() above, before restoring the
+	 * saved regs...
+	 */
+	memcpy(regs, &jprobe_saved_regs, sizeof(struct pt_regs));
+	return 1;
+}
+
+int __init arch_init_kprobes(void)
+{
+	printk("KPROBES: Enabling monitor mode (MM|DBE)...\n");
+	__mtdr(DBGREG_DC, DC_MM | DC_DBE);
+
+	/* TODO: Register kretprobe trampoline */
+	return 0;
+}
diff --git a/arch/avr32/kernel/module.c b/arch/avr32/kernel/module.c
new file mode 100644
index 00000000000..dfc32f2817b
--- /dev/null
+++ b/arch/avr32/kernel/module.c
@@ -0,0 +1,324 @@
+/*
+ * AVR32-specific kernel module loader
+ *
+ * Copyright (C) 2005-2006 Atmel Corporation
+ *
+ * GOT initialization parts are based on the s390 version
+ *   Copyright (C) 2002, 2003 IBM Deutschland Entwicklung GmbH,
+ *                            IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/moduleloader.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/elf.h>
+#include <linux/vmalloc.h>
+
+void *module_alloc(unsigned long size)
+{
+	if (size == 0)
+		return NULL;
+	return vmalloc(size);
+}
+
+void module_free(struct module *mod, void *module_region)
+{
+	vfree(mod->arch.syminfo);
+	mod->arch.syminfo = NULL;
+
+	vfree(module_region);
+	/* FIXME: if module_region == mod->init_region, trim exception
+	 * table entries. */
+}
+
+static inline int check_rela(Elf32_Rela *rela, struct module *module,
+			     char *strings, Elf32_Sym *symbols)
+{
+	struct mod_arch_syminfo *info;
+
+	info = module->arch.syminfo + ELF32_R_SYM(rela->r_info);
+	switch (ELF32_R_TYPE(rela->r_info)) {
+	case R_AVR32_GOT32:
+	case R_AVR32_GOT16:
+	case R_AVR32_GOT8:
+	case R_AVR32_GOT21S:
+	case R_AVR32_GOT18SW:	/* mcall */
+	case R_AVR32_GOT16S:	/* ld.w */
+		if (rela->r_addend != 0) {
+			printk(KERN_ERR
+			       "GOT relocation against %s at offset %u with addend\n",
+			       strings + symbols[ELF32_R_SYM(rela->r_info)].st_name,
+			       rela->r_offset);
+			return -ENOEXEC;
+		}
+		if (info->got_offset == -1UL) {
+			info->got_offset = module->arch.got_size;
+			module->arch.got_size += sizeof(void *);
+		}
+		pr_debug("GOT[%3lu] %s\n", info->got_offset,
+			 strings + symbols[ELF32_R_SYM(rela->r_info)].st_name);
+		break;
+	}
+
+	return 0;
+}
+
+int module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
+			      char *secstrings, struct module *module)
+{
+	Elf32_Shdr *symtab;
+	Elf32_Sym *symbols;
+	Elf32_Rela *rela;
+	char *strings;
+	int nrela, i, j;
+	int ret;
+
+	/* Find the symbol table */
+	symtab = NULL;
+	for (i = 0; i < hdr->e_shnum; i++)
+		switch (sechdrs[i].sh_type) {
+		case SHT_SYMTAB:
+			symtab = &sechdrs[i];
+			break;
+		}
+	if (!symtab) {
+		printk(KERN_ERR "module %s: no symbol table\n", module->name);
+		return -ENOEXEC;
+	}
+
+	/* Allocate room for one syminfo structure per symbol. */
+	module->arch.nsyms = symtab->sh_size / sizeof(Elf_Sym);
+	module->arch.syminfo = vmalloc(module->arch.nsyms
+				   * sizeof(struct mod_arch_syminfo));
+	if (!module->arch.syminfo)
+		return -ENOMEM;
+
+	symbols = (void *)hdr + symtab->sh_offset;
+	strings = (void *)hdr + sechdrs[symtab->sh_link].sh_offset;
+	for (i = 0; i < module->arch.nsyms; i++) {
+		if (symbols[i].st_shndx == SHN_UNDEF &&
+		    strcmp(strings + symbols[i].st_name,
+			   "_GLOBAL_OFFSET_TABLE_") == 0)
+			/* "Define" it as absolute. */
+			symbols[i].st_shndx = SHN_ABS;
+		module->arch.syminfo[i].got_offset = -1UL;
+		module->arch.syminfo[i].got_initialized = 0;
+	}
+
+	/* Allocate GOT entries for symbols that need it. */
+	module->arch.got_size = 0;
+	for (i = 0; i < hdr->e_shnum; i++) {
+		if (sechdrs[i].sh_type != SHT_RELA)
+			continue;
+		nrela = sechdrs[i].sh_size / sizeof(Elf32_Rela);
+		rela = (void *)hdr + sechdrs[i].sh_offset;
+		for (j = 0; j < nrela; j++) {
+			ret = check_rela(rela + j, module,
+					 strings, symbols);
+			if (ret)
+				goto out_free_syminfo;
+		}
+	}
+
+	/*
+	 * Increase core size to make room for GOT and set start
+	 * offset for GOT.
+	 */
+	module->core_size = ALIGN(module->core_size, 4);
+	module->arch.got_offset = module->core_size;
+	module->core_size += module->arch.got_size;
+
+	return 0;
+
+out_free_syminfo:
+	vfree(module->arch.syminfo);
+	module->arch.syminfo = NULL;
+
+	return ret;
+}
+
+static inline int reloc_overflow(struct module *module, const char *reloc_name,
+				 Elf32_Addr relocation)
+{
+	printk(KERN_ERR "module %s: Value %lx does not fit relocation %s\n",
+	       module->name, (unsigned long)relocation, reloc_name);
+	return -ENOEXEC;
+}
+
+#define get_u16(loc)		(*((uint16_t *)loc))
+#define put_u16(loc, val)	(*((uint16_t *)loc) = (val))
+
+int apply_relocate_add(Elf32_Shdr *sechdrs, const char *strtab,
+		       unsigned int symindex, unsigned int relindex,
+		       struct module *module)
+{
+	Elf32_Shdr *symsec = sechdrs + symindex;
+	Elf32_Shdr *relsec = sechdrs + relindex;
+	Elf32_Shdr *dstsec = sechdrs + relsec->sh_info;
+	Elf32_Rela *rel = (void *)relsec->sh_addr;
+	unsigned int i;
+	int ret = 0;
+
+	for (i = 0; i < relsec->sh_size / sizeof(Elf32_Rela); i++, rel++) {
+		struct mod_arch_syminfo *info;
+		Elf32_Sym *sym;
+		Elf32_Addr relocation;
+		uint32_t *location;
+		uint32_t value;
+
+		location = (void *)dstsec->sh_addr + rel->r_offset;
+		sym = (Elf32_Sym *)symsec->sh_addr + ELF32_R_SYM(rel->r_info);
+		relocation = sym->st_value + rel->r_addend;
+
+		info = module->arch.syminfo + ELF32_R_SYM(rel->r_info);
+
+		/* Initialize GOT entry if necessary */
+		switch (ELF32_R_TYPE(rel->r_info)) {
+		case R_AVR32_GOT32:
+		case R_AVR32_GOT16:
+		case R_AVR32_GOT8:
+		case R_AVR32_GOT21S:
+		case R_AVR32_GOT18SW:
+		case R_AVR32_GOT16S:
+			if (!info->got_initialized) {
+				Elf32_Addr *gotent;
+
+				gotent = (module->module_core
+					  + module->arch.got_offset
+					  + info->got_offset);
+				*gotent = relocation;
+				info->got_initialized = 1;
+			}
+
+			relocation = info->got_offset;
+			break;
+		}
+
+		switch (ELF32_R_TYPE(rel->r_info)) {
+		case R_AVR32_32:
+		case R_AVR32_32_CPENT:
+			*location = relocation;
+			break;
+		case R_AVR32_22H_PCREL:
+			relocation -= (Elf32_Addr)location;
+			if ((relocation & 0xffe00001) != 0
+			    && (relocation & 0xffc00001) != 0xffc00000)
+				return reloc_overflow(module,
+						      "R_AVR32_22H_PCREL",
+						      relocation);
+			relocation >>= 1;
+
+			value = *location;
+			value = ((value & 0xe1ef0000)
+				 | (relocation & 0xffff)
+				 | ((relocation & 0x10000) << 4)
+				 | ((relocation & 0x1e0000) << 8));
+			*location = value;
+			break;
+		case R_AVR32_11H_PCREL:
+			relocation -= (Elf32_Addr)location;
+			if ((relocation & 0xfffffc01) != 0
+			    && (relocation & 0xfffff801) != 0xfffff800)
+				return reloc_overflow(module,
+						      "R_AVR32_11H_PCREL",
+						      relocation);
+			value = get_u16(location);
+			value = ((value & 0xf00c)
+				 | ((relocation & 0x1fe) << 3)
+				 | ((relocation & 0x600) >> 9));
+			put_u16(location, value);
+			break;
+		case R_AVR32_9H_PCREL:
+			relocation -= (Elf32_Addr)location;
+			if ((relocation & 0xffffff01) != 0
+			    && (relocation & 0xfffffe01) != 0xfffffe00)
+				return reloc_overflow(module,
+						      "R_AVR32_9H_PCREL",
+						      relocation);
+			value = get_u16(location);
+			value = ((value & 0xf00f)
+				 | ((relocation & 0x1fe) << 3));
+			put_u16(location, value);
+			break;
+		case R_AVR32_9UW_PCREL:
+			relocation -= ((Elf32_Addr)location) & 0xfffffffc;
+			if ((relocation & 0xfffffc03) != 0)
+				return reloc_overflow(module,
+						      "R_AVR32_9UW_PCREL",
+						      relocation);
+			value = get_u16(location);
+			value = ((value & 0xf80f)
+				 | ((relocation & 0x1fc) << 2));
+			put_u16(location, value);
+			break;
+		case R_AVR32_GOTPC:
+			/*
+			 * R6 = PC - (PC - GOT)
+			 *
+			 * At this point, relocation contains the
+			 * value of PC.  Just subtract the value of
+			 * GOT, and we're done.
+			 */
+			pr_debug("GOTPC: PC=0x%lx, got_offset=0x%lx, core=0x%p\n",
+				 relocation, module->arch.got_offset,
+				 module->module_core);
+			relocation -= ((unsigned long)module->module_core
+				       + module->arch.got_offset);
+			*location = relocation;
+			break;
+		case R_AVR32_GOT18SW:
+			if ((relocation & 0xfffe0003) != 0
+			    && (relocation & 0xfffc0003) != 0xffff0000)
+				return reloc_overflow(module, "R_AVR32_GOT18SW",
+						     relocation);
+			relocation >>= 2;
+			/* fall through */
+		case R_AVR32_GOT16S:
+			if ((relocation & 0xffff8000) != 0
+			    && (relocation & 0xffff0000) != 0xffff0000)
+				return reloc_overflow(module, "R_AVR32_GOT16S",
+						      relocation);
+			pr_debug("GOT reloc @ 0x%lx -> %lu\n",
+				 rel->r_offset, relocation);
+			value = *location;
+			value = ((value & 0xffff0000)
+				 | (relocation & 0xffff));
+			*location = value;
+			break;
+
+		default:
+			printk(KERN_ERR "module %s: Unknown relocation: %u\n",
+			       module->name, ELF32_R_TYPE(rel->r_info));
+			return -ENOEXEC;
+		}
+	}
+
+	return ret;
+}
+
+int apply_relocate(Elf32_Shdr *sechdrs, const char *strtab,
+		   unsigned int symindex, unsigned int relindex,
+		   struct module *module)
+{
+	printk(KERN_ERR "module %s: REL relocations are not supported\n",
+		module->name);
+	return -ENOEXEC;
+}
+
+int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs,
+		    struct module *module)
+{
+	vfree(module->arch.syminfo);
+	module->arch.syminfo = NULL;
+
+	return 0;
+}
+
+void module_arch_cleanup(struct module *module)
+{
+
+}
diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c
new file mode 100644
index 00000000000..317dc50945f
--- /dev/null
+++ b/arch/avr32/kernel/process.c
@@ -0,0 +1,276 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/fs.h>
+#include <linux/ptrace.h>
+#include <linux/reboot.h>
+#include <linux/unistd.h>
+
+#include <asm/sysreg.h>
+#include <asm/ocd.h>
+
+void (*pm_power_off)(void) = NULL;
+EXPORT_SYMBOL(pm_power_off);
+
+/*
+ * This file handles the architecture-dependent parts of process handling..
+ */
+
+void cpu_idle(void)
+{
+	/* endless idle loop with no priority at all */
+	while (1) {
+		/* TODO: Enter sleep mode */
+		while (!need_resched())
+			cpu_relax();
+		preempt_enable_no_resched();
+		schedule();
+		preempt_disable();
+	}
+}
+
+void machine_halt(void)
+{
+}
+
+void machine_power_off(void)
+{
+}
+
+void machine_restart(char *cmd)
+{
+	__mtdr(DBGREG_DC, DC_DBE);
+	__mtdr(DBGREG_DC, DC_RES);
+	while (1) ;
+}
+
+/*
+ * PC is actually discarded when returning from a system call -- the
+ * return address must be stored in LR. This function will make sure
+ * LR points to do_exit before starting the thread.
+ *
+ * Also, when returning from fork(), r12 is 0, so we must copy the
+ * argument as well.
+ *
+ *  r0 : The argument to the main thread function
+ *  r1 : The address of do_exit
+ *  r2 : The address of the main thread function
+ */
+asmlinkage extern void kernel_thread_helper(void);
+__asm__("	.type	kernel_thread_helper, @function\n"
+	"kernel_thread_helper:\n"
+	"	mov	r12, r0\n"
+	"	mov	lr, r2\n"
+	"	mov	pc, r1\n"
+	"	.size	kernel_thread_helper, . - kernel_thread_helper");
+
+int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
+{
+	struct pt_regs regs;
+
+	memset(&regs, 0, sizeof(regs));
+
+	regs.r0 = (unsigned long)arg;
+	regs.r1 = (unsigned long)fn;
+	regs.r2 = (unsigned long)do_exit;
+	regs.lr = (unsigned long)kernel_thread_helper;
+	regs.pc = (unsigned long)kernel_thread_helper;
+	regs.sr = MODE_SUPERVISOR;
+
+	return do_fork(flags | CLONE_VM | CLONE_UNTRACED,
+		       0, &regs, 0, NULL, NULL);
+}
+EXPORT_SYMBOL(kernel_thread);
+
+/*
+ * Free current thread data structures etc
+ */
+void exit_thread(void)
+{
+	/* nothing to do */
+}
+
+void flush_thread(void)
+{
+	/* nothing to do */
+}
+
+void release_thread(struct task_struct *dead_task)
+{
+	/* do nothing */
+}
+
+static const char *cpu_modes[] = {
+	"Application", "Supervisor", "Interrupt level 0", "Interrupt level 1",
+	"Interrupt level 2", "Interrupt level 3", "Exception", "NMI"
+};
+
+void show_regs(struct pt_regs *regs)
+{
+	unsigned long sp = regs->sp;
+	unsigned long lr = regs->lr;
+	unsigned long mode = (regs->sr & MODE_MASK) >> MODE_SHIFT;
+
+	if (!user_mode(regs))
+		sp = (unsigned long)regs + FRAME_SIZE_FULL;
+
+	print_symbol("PC is at %s\n", instruction_pointer(regs));
+	print_symbol("LR is at %s\n", lr);
+	printk("pc : [<%08lx>]    lr : [<%08lx>]    %s\n"
+	       "sp : %08lx  r12: %08lx  r11: %08lx\n",
+	       instruction_pointer(regs),
+	       lr, print_tainted(), sp, regs->r12, regs->r11);
+	printk("r10: %08lx  r9 : %08lx  r8 : %08lx\n",
+	       regs->r10, regs->r9, regs->r8);
+	printk("r7 : %08lx  r6 : %08lx  r5 : %08lx  r4 : %08lx\n",
+	       regs->r7, regs->r6, regs->r5, regs->r4);
+	printk("r3 : %08lx  r2 : %08lx  r1 : %08lx  r0 : %08lx\n",
+	       regs->r3, regs->r2, regs->r1, regs->r0);
+	printk("Flags: %c%c%c%c%c\n",
+	       regs->sr & SR_Q ? 'Q' : 'q',
+	       regs->sr & SR_V ? 'V' : 'v',
+	       regs->sr & SR_N ? 'N' : 'n',
+	       regs->sr & SR_Z ? 'Z' : 'z',
+	       regs->sr & SR_C ? 'C' : 'c');
+	printk("Mode bits: %c%c%c%c%c%c%c%c%c\n",
+	       regs->sr & SR_H ? 'H' : 'h',
+	       regs->sr & SR_R ? 'R' : 'r',
+	       regs->sr & SR_J ? 'J' : 'j',
+	       regs->sr & SR_EM ? 'E' : 'e',
+	       regs->sr & SR_I3M ? '3' : '.',
+	       regs->sr & SR_I2M ? '2' : '.',
+	       regs->sr & SR_I1M ? '1' : '.',
+	       regs->sr & SR_I0M ? '0' : '.',
+	       regs->sr & SR_GM ? 'G' : 'g');
+	printk("CPU Mode: %s\n", cpu_modes[mode]);
+
+	show_trace(NULL, (unsigned long *)sp, regs);
+}
+EXPORT_SYMBOL(show_regs);
+
+/* Fill in the fpu structure for a core dump. This is easy -- we don't have any */
+int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu)
+{
+	/* Not valid */
+	return 0;
+}
+
+asmlinkage void ret_from_fork(void);
+
+int copy_thread(int nr, unsigned long clone_flags, unsigned long usp,
+		unsigned long unused,
+		struct task_struct *p, struct pt_regs *regs)
+{
+	struct pt_regs *childregs;
+
+	childregs = ((struct pt_regs *)(THREAD_SIZE + (unsigned long)p->thread_info)) - 1;
+	*childregs = *regs;
+
+	if (user_mode(regs))
+		childregs->sp = usp;
+	else
+		childregs->sp = (unsigned long)p->thread_info + THREAD_SIZE;
+
+	childregs->r12 = 0; /* Set return value for child */
+
+	p->thread.cpu_context.sr = MODE_SUPERVISOR | SR_GM;
+	p->thread.cpu_context.ksp = (unsigned long)childregs;
+	p->thread.cpu_context.pc = (unsigned long)ret_from_fork;
+
+	return 0;
+}
+
+/* r12-r8 are dummy parameters to force the compiler to use the stack */
+asmlinkage int sys_fork(struct pt_regs *regs)
+{
+	return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
+}
+
+asmlinkage int sys_clone(unsigned long clone_flags, unsigned long newsp,
+			 unsigned long parent_tidptr,
+			 unsigned long child_tidptr, struct pt_regs *regs)
+{
+	if (!newsp)
+		newsp = regs->sp;
+	return do_fork(clone_flags, newsp, regs, 0,
+		       (int __user *)parent_tidptr,
+		       (int __user *)child_tidptr);
+}
+
+asmlinkage int sys_vfork(struct pt_regs *regs)
+{
+	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs,
+		       0, NULL, NULL);
+}
+
+asmlinkage int sys_execve(char __user *ufilename, char __user *__user *uargv,
+			  char __user *__user *uenvp, struct pt_regs *regs)
+{
+	int error;
+	char *filename;
+
+	filename = getname(ufilename);
+	error = PTR_ERR(filename);
+	if (IS_ERR(filename))
+		goto out;
+
+	error = do_execve(filename, uargv, uenvp, regs);
+	if (error == 0)
+		current->ptrace &= ~PT_DTRACE;
+	putname(filename);
+
+out:
+	return error;
+}
+
+
+/*
+ * This function is supposed to answer the question "who called
+ * schedule()?"
+ */
+unsigned long get_wchan(struct task_struct *p)
+{
+	unsigned long pc;
+	unsigned long stack_page;
+
+	if (!p || p == current || p->state == TASK_RUNNING)
+		return 0;
+
+	stack_page = (unsigned long)p->thread_info;
+	BUG_ON(!stack_page);
+
+	/*
+	 * The stored value of PC is either the address right after
+	 * the call to __switch_to() or ret_from_fork.
+	 */
+	pc = thread_saved_pc(p);
+	if (in_sched_functions(pc)) {
+#ifdef CONFIG_FRAME_POINTER
+		unsigned long fp = p->thread.cpu_context.r7;
+		BUG_ON(fp < stack_page || fp > (THREAD_SIZE + stack_page));
+		pc = *(unsigned long *)fp;
+#else
+		/*
+		 * We depend on the frame size of schedule here, which
+		 * is actually quite ugly. It might be possible to
+		 * determine the frame size automatically at build
+		 * time by doing this:
+		 *   - compile sched.c
+		 *   - disassemble the resulting sched.o
+		 *   - look for 'sub sp,??' shortly after '<schedule>:'
+		 */
+		unsigned long sp = p->thread.cpu_context.ksp + 16;
+		BUG_ON(sp < stack_page || sp > (THREAD_SIZE + stack_page));
+		pc = *(unsigned long *)sp;
+#endif
+	}
+
+	return pc;
+}
diff --git a/arch/avr32/kernel/ptrace.c b/arch/avr32/kernel/ptrace.c
new file mode 100644
index 00000000000..3c89e59029a
--- /dev/null
+++ b/arch/avr32/kernel/ptrace.c
@@ -0,0 +1,371 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#undef DEBUG
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/smp_lock.h>
+#include <linux/ptrace.h>
+#include <linux/errno.h>
+#include <linux/user.h>
+#include <linux/security.h>
+#include <linux/unistd.h>
+#include <linux/notifier.h>
+
+#include <asm/traps.h>
+#include <asm/uaccess.h>
+#include <asm/ocd.h>
+#include <asm/mmu_context.h>
+#include <asm/kdebug.h>
+
+static struct pt_regs *get_user_regs(struct task_struct *tsk)
+{
+	return (struct pt_regs *)((unsigned long) tsk->thread_info +
+				  THREAD_SIZE - sizeof(struct pt_regs));
+}
+
+static void ptrace_single_step(struct task_struct *tsk)
+{
+	pr_debug("ptrace_single_step: pid=%u, SR=0x%08lx\n",
+		 tsk->pid, tsk->thread.cpu_context.sr);
+	if (!(tsk->thread.cpu_context.sr & SR_D)) {
+		/*
+		 * Set a breakpoint at the current pc to force the
+		 * process into debug mode.  The syscall/exception
+		 * exit code will set a breakpoint at the return
+		 * address when this flag is set.
+		 */
+		pr_debug("ptrace_single_step: Setting TIF_BREAKPOINT\n");
+		set_tsk_thread_flag(tsk, TIF_BREAKPOINT);
+	}
+
+	/* The monitor code will do the actual step for us */
+	set_tsk_thread_flag(tsk, TIF_SINGLE_STEP);
+}
+
+/*
+ * Called by kernel/ptrace.c when detaching
+ *
+ * Make sure any single step bits, etc. are not set
+ */
+void ptrace_disable(struct task_struct *child)
+{
+	clear_tsk_thread_flag(child, TIF_SINGLE_STEP);
+}
+
+/*
+ * Handle hitting a breakpoint
+ */
+static void ptrace_break(struct task_struct *tsk, struct pt_regs *regs)
+{
+	siginfo_t info;
+
+	info.si_signo = SIGTRAP;
+	info.si_errno = 0;
+	info.si_code  = TRAP_BRKPT;
+	info.si_addr  = (void __user *)instruction_pointer(regs);
+
+	pr_debug("ptrace_break: Sending SIGTRAP to PID %u (pc = 0x%p)\n",
+		 tsk->pid, info.si_addr);
+	force_sig_info(SIGTRAP, &info, tsk);
+}
+
+/*
+ * Read the word at offset "offset" into the task's "struct user". We
+ * actually access the pt_regs struct stored on the kernel stack.
+ */
+static int ptrace_read_user(struct task_struct *tsk, unsigned long offset,
+			    unsigned long __user *data)
+{
+	unsigned long *regs;
+	unsigned long value;
+
+	pr_debug("ptrace_read_user(%p, %#lx, %p)\n",
+		 tsk, offset, data);
+
+	if (offset & 3 || offset >= sizeof(struct user)) {
+		printk("ptrace_read_user: invalid offset 0x%08lx\n", offset);
+		return -EIO;
+	}
+
+	regs = (unsigned long *)get_user_regs(tsk);
+
+	value = 0;
+	if (offset < sizeof(struct pt_regs))
+		value = regs[offset / sizeof(regs[0])];
+
+	return put_user(value, data);
+}
+
+/*
+ * Write the word "value" to offset "offset" into the task's "struct
+ * user". We actually access the pt_regs struct stored on the kernel
+ * stack.
+ */
+static int ptrace_write_user(struct task_struct *tsk, unsigned long offset,
+			     unsigned long value)
+{
+	unsigned long *regs;
+
+	if (offset & 3 || offset >= sizeof(struct user)) {
+		printk("ptrace_write_user: invalid offset 0x%08lx\n", offset);
+		return -EIO;
+	}
+
+	if (offset >= sizeof(struct pt_regs))
+		return 0;
+
+	regs = (unsigned long *)get_user_regs(tsk);
+	regs[offset / sizeof(regs[0])] = value;
+
+	return 0;
+}
+
+static int ptrace_getregs(struct task_struct *tsk, void __user *uregs)
+{
+	struct pt_regs *regs = get_user_regs(tsk);
+
+	return copy_to_user(uregs, regs, sizeof(*regs)) ? -EFAULT : 0;
+}
+
+static int ptrace_setregs(struct task_struct *tsk, const void __user *uregs)
+{
+	struct pt_regs newregs;
+	int ret;
+
+	ret = -EFAULT;
+	if (copy_from_user(&newregs, uregs, sizeof(newregs)) == 0) {
+		struct pt_regs *regs = get_user_regs(tsk);
+
+		ret = -EINVAL;
+		if (valid_user_regs(&newregs)) {
+			*regs = newregs;
+			ret = 0;
+		}
+	}
+
+	return ret;
+}
+
+long arch_ptrace(struct task_struct *child, long request, long addr, long data)
+{
+	unsigned long tmp;
+	int ret;
+
+	pr_debug("arch_ptrace(%ld, %ld, %#lx, %#lx)\n",
+		 request, child->pid, addr, data);
+
+	pr_debug("ptrace: Enabling monitor mode...\n");
+	__mtdr(DBGREG_DC, __mfdr(DBGREG_DC) | DC_MM | DC_DBE);
+
+	switch (request) {
+	/* Read the word at location addr in the child process */
+	case PTRACE_PEEKTEXT:
+	case PTRACE_PEEKDATA:
+		ret = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
+		if (ret == sizeof(tmp))
+			ret = put_user(tmp, (unsigned long __user *)data);
+		else
+			ret = -EIO;
+		break;
+
+	case PTRACE_PEEKUSR:
+		ret = ptrace_read_user(child, addr,
+				       (unsigned long __user *)data);
+		break;
+
+	/* Write the word in data at location addr */
+	case PTRACE_POKETEXT:
+	case PTRACE_POKEDATA:
+		ret = access_process_vm(child, addr, &data, sizeof(data), 1);
+		if (ret == sizeof(data))
+			ret = 0;
+		else
+			ret = -EIO;
+		break;
+
+	case PTRACE_POKEUSR:
+		ret = ptrace_write_user(child, addr, data);
+		break;
+
+	/* continue and stop at next (return from) syscall */
+	case PTRACE_SYSCALL:
+	/* restart after signal */
+	case PTRACE_CONT:
+		ret = -EIO;
+		if (!valid_signal(data))
+			break;
+		if (request == PTRACE_SYSCALL)
+			set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+		else
+			clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+		child->exit_code = data;
+		/* XXX: Are we sure no breakpoints are active here? */
+		wake_up_process(child);
+		ret = 0;
+		break;
+
+	/*
+	 * Make the child exit. Best I can do is send it a
+	 * SIGKILL. Perhaps it should be put in the status that it
+	 * wants to exit.
+	 */
+	case PTRACE_KILL:
+		ret = 0;
+		if (child->exit_state == EXIT_ZOMBIE)
+			break;
+		child->exit_code = SIGKILL;
+		wake_up_process(child);
+		break;
+
+	/*
+	 * execute single instruction.
+	 */
+	case PTRACE_SINGLESTEP:
+		ret = -EIO;
+		if (!valid_signal(data))
+			break;
+		clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+		ptrace_single_step(child);
+		child->exit_code = data;
+		wake_up_process(child);
+		ret = 0;
+		break;
+
+	/* Detach a process that was attached */
+	case PTRACE_DETACH:
+		ret = ptrace_detach(child, data);
+		break;
+
+	case PTRACE_GETREGS:
+		ret = ptrace_getregs(child, (void __user *)data);
+		break;
+
+	case PTRACE_SETREGS:
+		ret = ptrace_setregs(child, (const void __user *)data);
+		break;
+
+	default:
+		ret = ptrace_request(child, request, addr, data);
+		break;
+	}
+
+	pr_debug("sys_ptrace returning %d (DC = 0x%08lx)\n", ret, __mfdr(DBGREG_DC));
+	return ret;
+}
+
+asmlinkage void syscall_trace(void)
+{
+	pr_debug("syscall_trace called\n");
+	if (!test_thread_flag(TIF_SYSCALL_TRACE))
+		return;
+	if (!(current->ptrace & PT_PTRACED))
+		return;
+
+	pr_debug("syscall_trace: notifying parent\n");
+	/* The 0x80 provides a way for the tracing parent to
+	 * distinguish between a syscall stop and SIGTRAP delivery */
+	ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
+				 ? 0x80 : 0));
+
+	/*
+	 * this isn't the same as continuing with a signal, but it
+	 * will do for normal use.  strace only continues with a
+	 * signal if the stopping signal is not SIGTRAP.  -brl
+	 */
+	if (current->exit_code) {
+		pr_debug("syscall_trace: sending signal %d to PID %u\n",
+			 current->exit_code, current->pid);
+		send_sig(current->exit_code, current, 1);
+		current->exit_code = 0;
+	}
+}
+
+asmlinkage void do_debug_priv(struct pt_regs *regs)
+{
+	unsigned long dc, ds;
+	unsigned long die_val;
+
+	ds = __mfdr(DBGREG_DS);
+
+	pr_debug("do_debug_priv: pc = %08lx, ds = %08lx\n", regs->pc, ds);
+
+	if (ds & DS_SSS)
+		die_val = DIE_SSTEP;
+	else
+		die_val = DIE_BREAKPOINT;
+
+	if (notify_die(die_val, regs, 0, SIGTRAP) == NOTIFY_STOP)
+		return;
+
+	if (likely(ds & DS_SSS)) {
+		extern void itlb_miss(void);
+		extern void tlb_miss_common(void);
+		struct thread_info *ti;
+
+		dc = __mfdr(DBGREG_DC);
+		dc &= ~DC_SS;
+		__mtdr(DBGREG_DC, dc);
+
+		ti = current_thread_info();
+		ti->flags |= _TIF_BREAKPOINT;
+
+		/* The TLB miss handlers don't check thread flags */
+		if ((regs->pc >= (unsigned long)&itlb_miss)
+		    && (regs->pc <= (unsigned long)&tlb_miss_common)) {
+			__mtdr(DBGREG_BWA2A, sysreg_read(RAR_EX));
+			__mtdr(DBGREG_BWC2A, 0x40000001 | (get_asid() << 1));
+		}
+
+		/*
+		 * If we're running in supervisor mode, the breakpoint
+		 * will take us where we want directly, no need to
+		 * single step.
+		 */
+		if ((regs->sr & MODE_MASK) != MODE_SUPERVISOR)
+			ti->flags |= TIF_SINGLE_STEP;
+	} else {
+		panic("Unable to handle debug trap at pc = %08lx\n",
+		      regs->pc);
+	}
+}
+
+/*
+ * Handle breakpoints, single steps and other debuggy things. To keep
+ * things simple initially, we run with interrupts and exceptions
+ * disabled all the time.
+ */
+asmlinkage void do_debug(struct pt_regs *regs)
+{
+	unsigned long dc, ds;
+
+	ds = __mfdr(DBGREG_DS);
+	pr_debug("do_debug: pc = %08lx, ds = %08lx\n", regs->pc, ds);
+
+	if (test_thread_flag(TIF_BREAKPOINT)) {
+		pr_debug("TIF_BREAKPOINT set\n");
+		/* We're taking care of it */
+		clear_thread_flag(TIF_BREAKPOINT);
+		__mtdr(DBGREG_BWC2A, 0);
+	}
+
+	if (test_thread_flag(TIF_SINGLE_STEP)) {
+		pr_debug("TIF_SINGLE_STEP set, ds = 0x%08lx\n", ds);
+		if (ds & DS_SSS) {
+			dc = __mfdr(DBGREG_DC);
+			dc &= ~DC_SS;
+			__mtdr(DBGREG_DC, dc);
+
+			clear_thread_flag(TIF_SINGLE_STEP);
+			ptrace_break(current, regs);
+		}
+	} else {
+		/* regular breakpoint */
+		ptrace_break(current, regs);
+	}
+}
diff --git a/arch/avr32/kernel/semaphore.c b/arch/avr32/kernel/semaphore.c
new file mode 100644
index 00000000000..1e2705a0501
--- /dev/null
+++ b/arch/avr32/kernel/semaphore.c
@@ -0,0 +1,148 @@
+/*
+ * AVR32 sempahore implementation.
+ *
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * Based on linux/arch/i386/kernel/semaphore.c
+ *  Copyright (C) 1999 Linus Torvalds
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+
+#include <asm/semaphore.h>
+#include <asm/atomic.h>
+
+/*
+ * Semaphores are implemented using a two-way counter:
+ * The "count" variable is decremented for each process
+ * that tries to acquire the semaphore, while the "sleeping"
+ * variable is a count of such acquires.
+ *
+ * Notably, the inline "up()" and "down()" functions can
+ * efficiently test if they need to do any extra work (up
+ * needs to do something only if count was negative before
+ * the increment operation.
+ *
+ * "sleeping" and the contention routine ordering is protected
+ * by the spinlock in the semaphore's waitqueue head.
+ *
+ * Note that these functions are only called when there is
+ * contention on the lock, and as such all this is the
+ * "non-critical" part of the whole semaphore business. The
+ * critical part is the inline stuff in <asm/semaphore.h>
+ * where we want to avoid any extra jumps and calls.
+ */
+
+/*
+ * Logic:
+ *  - only on a boundary condition do we need to care. When we go
+ *    from a negative count to a non-negative, we wake people up.
+ *  - when we go from a non-negative count to a negative do we
+ *    (a) synchronize with the "sleeper" count and (b) make sure
+ *    that we're on the wakeup list before we synchronize so that
+ *    we cannot lose wakeup events.
+ */
+
+void __up(struct semaphore *sem)
+{
+	wake_up(&sem->wait);
+}
+EXPORT_SYMBOL(__up);
+
+void __sched __down(struct semaphore *sem)
+{
+	struct task_struct *tsk = current;
+        DECLARE_WAITQUEUE(wait, tsk);
+        unsigned long flags;
+
+        tsk->state = TASK_UNINTERRUPTIBLE;
+        spin_lock_irqsave(&sem->wait.lock, flags);
+        add_wait_queue_exclusive_locked(&sem->wait, &wait);
+
+        sem->sleepers++;
+        for (;;) {
+                int sleepers = sem->sleepers;
+
+                /*
+                 * Add "everybody else" into it. They aren't
+                 * playing, because we own the spinlock in
+                 * the wait_queue_head.
+                 */
+                if (atomic_add_return(sleepers - 1, &sem->count) >= 0) {
+                        sem->sleepers = 0;
+                        break;
+                }
+                sem->sleepers = 1;      /* us - see -1 above */
+                spin_unlock_irqrestore(&sem->wait.lock, flags);
+
+                schedule();
+
+                spin_lock_irqsave(&sem->wait.lock, flags);
+                tsk->state = TASK_UNINTERRUPTIBLE;
+        }
+        remove_wait_queue_locked(&sem->wait, &wait);
+        wake_up_locked(&sem->wait);
+        spin_unlock_irqrestore(&sem->wait.lock, flags);
+        tsk->state = TASK_RUNNING;
+}
+EXPORT_SYMBOL(__down);
+
+int __sched __down_interruptible(struct semaphore *sem)
+{
+	int retval = 0;
+	struct task_struct *tsk = current;
+        DECLARE_WAITQUEUE(wait, tsk);
+        unsigned long flags;
+
+        tsk->state = TASK_INTERRUPTIBLE;
+        spin_lock_irqsave(&sem->wait.lock, flags);
+        add_wait_queue_exclusive_locked(&sem->wait, &wait);
+
+        sem->sleepers++;
+        for (;;) {
+                int sleepers = sem->sleepers;
+
+		/*
+		 * With signals pending, this turns into the trylock
+		 * failure case - we won't be sleeping, and we can't
+		 * get the lock as it has contention. Just correct the
+		 * count and exit.
+		 */
+		if (signal_pending(current)) {
+			retval = -EINTR;
+			sem->sleepers = 0;
+			atomic_add(sleepers, &sem->count);
+			break;
+		}
+
+                /*
+                 * Add "everybody else" into it. They aren't
+                 * playing, because we own the spinlock in
+                 * the wait_queue_head.
+                 */
+                if (atomic_add_return(sleepers - 1, &sem->count) >= 0) {
+                        sem->sleepers = 0;
+                        break;
+                }
+                sem->sleepers = 1;      /* us - see -1 above */
+                spin_unlock_irqrestore(&sem->wait.lock, flags);
+
+                schedule();
+
+                spin_lock_irqsave(&sem->wait.lock, flags);
+                tsk->state = TASK_INTERRUPTIBLE;
+        }
+        remove_wait_queue_locked(&sem->wait, &wait);
+        wake_up_locked(&sem->wait);
+        spin_unlock_irqrestore(&sem->wait.lock, flags);
+
+        tsk->state = TASK_RUNNING;
+	return retval;
+}
+EXPORT_SYMBOL(__down_interruptible);
diff --git a/arch/avr32/kernel/setup.c b/arch/avr32/kernel/setup.c
new file mode 100644
index 00000000000..5d68f3c6990
--- /dev/null
+++ b/arch/avr32/kernel/setup.c
@@ -0,0 +1,335 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/clk.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/console.h>
+#include <linux/ioport.h>
+#include <linux/bootmem.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/root_dev.h>
+#include <linux/cpu.h>
+
+#include <asm/sections.h>
+#include <asm/processor.h>
+#include <asm/pgtable.h>
+#include <asm/setup.h>
+#include <asm/sysreg.h>
+
+#include <asm/arch/board.h>
+#include <asm/arch/init.h>
+
+extern int root_mountflags;
+
+/*
+ * Bootloader-provided information about physical memory
+ */
+struct tag_mem_range *mem_phys;
+struct tag_mem_range *mem_reserved;
+struct tag_mem_range *mem_ramdisk;
+
+/*
+ * Initialize loops_per_jiffy as 5000000 (500MIPS).
+ * Better make it too large than too small...
+ */
+struct avr32_cpuinfo boot_cpu_data = {
+	.loops_per_jiffy = 5000000
+};
+EXPORT_SYMBOL(boot_cpu_data);
+
+static char command_line[COMMAND_LINE_SIZE];
+
+/*
+ * Should be more than enough, but if you have a _really_ complex
+ * setup, you might need to increase the size of this...
+ */
+static struct tag_mem_range __initdata mem_range_cache[32];
+static unsigned mem_range_next_free;
+
+/*
+ * Standard memory resources
+ */
+static struct resource mem_res[] = {
+	{
+		.name	= "Kernel code",
+		.start	= 0,
+		.end	= 0,
+		.flags	= IORESOURCE_MEM
+	},
+	{
+		.name	= "Kernel data",
+		.start	= 0,
+		.end	= 0,
+		.flags	= IORESOURCE_MEM,
+	},
+};
+
+#define kernel_code	mem_res[0]
+#define kernel_data	mem_res[1]
+
+/*
+ * Early framebuffer allocation. Works as follows:
+ *   - If fbmem_size is zero, nothing will be allocated or reserved.
+ *   - If fbmem_start is zero when setup_bootmem() is called,
+ *     fbmem_size bytes will be allocated from the bootmem allocator.
+ *   - If fbmem_start is nonzero, an area of size fbmem_size will be
+ *     reserved at the physical address fbmem_start if necessary. If
+ *     the area isn't in a memory region known to the kernel, it will
+ *     be left alone.
+ *
+ * Board-specific code may use these variables to set up platform data
+ * for the framebuffer driver if fbmem_size is nonzero.
+ */
+static unsigned long __initdata fbmem_start;
+static unsigned long __initdata fbmem_size;
+
+/*
+ * "fbmem=xxx[kKmM]" allocates the specified amount of boot memory for
+ * use as framebuffer.
+ *
+ * "fbmem=xxx[kKmM]@yyy[kKmM]" defines a memory region of size xxx and
+ * starting at yyy to be reserved for use as framebuffer.
+ *
+ * The kernel won't verify that the memory region starting at yyy
+ * actually contains usable RAM.
+ */
+static int __init early_parse_fbmem(char *p)
+{
+	fbmem_size = memparse(p, &p);
+	if (*p == '@')
+		fbmem_start = memparse(p, &p);
+	return 0;
+}
+early_param("fbmem", early_parse_fbmem);
+
+static inline void __init resource_init(void)
+{
+	struct tag_mem_range *region;
+
+	kernel_code.start = __pa(init_mm.start_code);
+	kernel_code.end = __pa(init_mm.end_code - 1);
+	kernel_data.start = __pa(init_mm.end_code);
+	kernel_data.end = __pa(init_mm.brk - 1);
+
+	for (region = mem_phys; region; region = region->next) {
+		struct resource *res;
+		unsigned long phys_start, phys_end;
+
+		if (region->size == 0)
+			continue;
+
+		phys_start = region->addr;
+		phys_end = phys_start + region->size - 1;
+
+		res = alloc_bootmem_low(sizeof(*res));
+		res->name = "System RAM";
+		res->start = phys_start;
+		res->end = phys_end;
+		res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+
+		request_resource (&iomem_resource, res);
+
+		if (kernel_code.start >= res->start &&
+		    kernel_code.end <= res->end)
+			request_resource (res, &kernel_code);
+		if (kernel_data.start >= res->start &&
+		    kernel_data.end <= res->end)
+			request_resource (res, &kernel_data);
+	}
+}
+
+static int __init parse_tag_core(struct tag *tag)
+{
+	if (tag->hdr.size > 2) {
+		if ((tag->u.core.flags & 1) == 0)
+			root_mountflags &= ~MS_RDONLY;
+		ROOT_DEV = new_decode_dev(tag->u.core.rootdev);
+	}
+	return 0;
+}
+__tagtable(ATAG_CORE, parse_tag_core);
+
+static int __init parse_tag_mem_range(struct tag *tag,
+				      struct tag_mem_range **root)
+{
+	struct tag_mem_range *cur, **pprev;
+	struct tag_mem_range *new;
+
+	/*
+	 * Ignore zero-sized entries. If we're running standalone, the
+	 * SDRAM code may emit such entries if something goes
+	 * wrong...
+	 */
+	if (tag->u.mem_range.size == 0)
+		return 0;
+
+	/*
+	 * Copy the data so the bootmem init code doesn't need to care
+	 * about it.
+	 */
+	if (mem_range_next_free >=
+	    (sizeof(mem_range_cache) / sizeof(mem_range_cache[0])))
+		panic("Physical memory map too complex!\n");
+
+	new = &mem_range_cache[mem_range_next_free++];
+	*new = tag->u.mem_range;
+
+	pprev = root;
+	cur = *root;
+	while (cur) {
+		pprev = &cur->next;
+		cur = cur->next;
+	}
+
+	*pprev = new;
+	new->next = NULL;
+
+	return 0;
+}
+
+static int __init parse_tag_mem(struct tag *tag)
+{
+	return parse_tag_mem_range(tag, &mem_phys);
+}
+__tagtable(ATAG_MEM, parse_tag_mem);
+
+static int __init parse_tag_cmdline(struct tag *tag)
+{
+	strlcpy(saved_command_line, tag->u.cmdline.cmdline, COMMAND_LINE_SIZE);
+	return 0;
+}
+__tagtable(ATAG_CMDLINE, parse_tag_cmdline);
+
+static int __init parse_tag_rdimg(struct tag *tag)
+{
+	return parse_tag_mem_range(tag, &mem_ramdisk);
+}
+__tagtable(ATAG_RDIMG, parse_tag_rdimg);
+
+static int __init parse_tag_clock(struct tag *tag)
+{
+	/*
+	 * We'll figure out the clocks by peeking at the system
+	 * manager regs directly.
+	 */
+	return 0;
+}
+__tagtable(ATAG_CLOCK, parse_tag_clock);
+
+static int __init parse_tag_rsvd_mem(struct tag *tag)
+{
+	return parse_tag_mem_range(tag, &mem_reserved);
+}
+__tagtable(ATAG_RSVD_MEM, parse_tag_rsvd_mem);
+
+static int __init parse_tag_ethernet(struct tag *tag)
+{
+#if 0
+	const struct platform_device *pdev;
+
+	/*
+	 * We really need a bus type that supports "classes"...this
+	 * will do for now (until we must handle other kinds of
+	 * ethernet controllers)
+	 */
+	pdev = platform_get_device("macb", tag->u.ethernet.mac_index);
+	if (pdev && pdev->dev.platform_data) {
+		struct eth_platform_data *data = pdev->dev.platform_data;
+
+		data->valid = 1;
+		data->mii_phy_addr = tag->u.ethernet.mii_phy_addr;
+		memcpy(data->hw_addr, tag->u.ethernet.hw_address,
+		       sizeof(data->hw_addr));
+	}
+#endif
+	return 0;
+}
+__tagtable(ATAG_ETHERNET, parse_tag_ethernet);
+
+/*
+ * Scan the tag table for this tag, and call its parse function. The
+ * tag table is built by the linker from all the __tagtable
+ * declarations.
+ */
+static int __init parse_tag(struct tag *tag)
+{
+	extern struct tagtable __tagtable_begin, __tagtable_end;
+	struct tagtable *t;
+
+	for (t = &__tagtable_begin; t < &__tagtable_end; t++)
+		if (tag->hdr.tag == t->tag) {
+			t->parse(tag);
+			break;
+		}
+
+	return t < &__tagtable_end;
+}
+
+/*
+ * Parse all tags in the list we got from the boot loader
+ */
+static void __init parse_tags(struct tag *t)
+{
+	for (; t->hdr.tag != ATAG_NONE; t = tag_next(t))
+		if (!parse_tag(t))
+			printk(KERN_WARNING
+			       "Ignoring unrecognised tag 0x%08x\n",
+			       t->hdr.tag);
+}
+
+void __init setup_arch (char **cmdline_p)
+{
+	struct clk *cpu_clk;
+
+	parse_tags(bootloader_tags);
+
+	setup_processor();
+	setup_platform();
+
+	cpu_clk = clk_get(NULL, "cpu");
+	if (IS_ERR(cpu_clk)) {
+		printk(KERN_WARNING "Warning: Unable to get CPU clock\n");
+	} else {
+		unsigned long cpu_hz = clk_get_rate(cpu_clk);
+
+		/*
+		 * Well, duh, but it's probably a good idea to
+		 * increment the use count.
+		 */
+		clk_enable(cpu_clk);
+
+		boot_cpu_data.clk = cpu_clk;
+		boot_cpu_data.loops_per_jiffy = cpu_hz * 4;
+		printk("CPU: Running at %lu.%03lu MHz\n",
+		       ((cpu_hz + 500) / 1000) / 1000,
+		       ((cpu_hz + 500) / 1000) % 1000);
+	}
+
+	init_mm.start_code = (unsigned long) &_text;
+	init_mm.end_code = (unsigned long) &_etext;
+	init_mm.end_data = (unsigned long) &_edata;
+	init_mm.brk = (unsigned long) &_end;
+
+	strlcpy(command_line, saved_command_line, COMMAND_LINE_SIZE);
+	*cmdline_p = command_line;
+	parse_early_param();
+
+	setup_bootmem();
+
+	board_setup_fbmem(fbmem_start, fbmem_size);
+
+#ifdef CONFIG_VT
+	conswitchp = &dummy_con;
+#endif
+
+	paging_init();
+
+	resource_init();
+}
diff --git a/arch/avr32/kernel/signal.c b/arch/avr32/kernel/signal.c
new file mode 100644
index 00000000000..33096651c24
--- /dev/null
+++ b/arch/avr32/kernel/signal.c
@@ -0,0 +1,328 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * Based on linux/arch/sh/kernel/signal.c
+ *  Copyright (C) 1999, 2000  Niibe Yutaka & Kaz Kojima
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/ptrace.h>
+#include <linux/unistd.h>
+#include <linux/suspend.h>
+
+#include <asm/uaccess.h>
+#include <asm/ucontext.h>
+
+#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
+
+asmlinkage int sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
+			       struct pt_regs *regs)
+{
+	return do_sigaltstack(uss, uoss, regs->sp);
+}
+
+struct rt_sigframe
+{
+	struct siginfo info;
+	struct ucontext uc;
+	unsigned long retcode;
+};
+
+static int
+restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc)
+{
+	int err = 0;
+
+#define COPY(x)		err |= __get_user(regs->x, &sc->x)
+	COPY(sr);
+	COPY(pc);
+	COPY(lr);
+	COPY(sp);
+	COPY(r12);
+	COPY(r11);
+	COPY(r10);
+	COPY(r9);
+	COPY(r8);
+	COPY(r7);
+	COPY(r6);
+	COPY(r5);
+	COPY(r4);
+	COPY(r3);
+	COPY(r2);
+	COPY(r1);
+	COPY(r0);
+#undef	COPY
+
+	/*
+	 * Don't allow anyone to pretend they're running in supervisor
+	 * mode or something...
+	 */
+	err |= !valid_user_regs(regs);
+
+	return err;
+}
+
+
+asmlinkage int sys_rt_sigreturn(struct pt_regs *regs)
+{
+	struct rt_sigframe __user *frame;
+	sigset_t set;
+
+	frame = (struct rt_sigframe __user *)regs->sp;
+	pr_debug("SIG return: frame = %p\n", frame);
+
+	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+		goto badframe;
+
+	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
+		goto badframe;
+
+	sigdelsetmask(&set, ~_BLOCKABLE);
+	spin_lock_irq(&current->sighand->siglock);
+	current->blocked = set;
+	recalc_sigpending();
+	spin_unlock_irq(&current->sighand->siglock);
+
+	if (restore_sigcontext(regs, &frame->uc.uc_mcontext))
+		goto badframe;
+
+	pr_debug("Context restored: pc = %08lx, lr = %08lx, sp = %08lx\n",
+		 regs->pc, regs->lr, regs->sp);
+
+	return regs->r12;
+
+badframe:
+	force_sig(SIGSEGV, current);
+	return 0;
+}
+
+static int
+setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs)
+{
+	int err = 0;
+
+#define COPY(x)		err |= __put_user(regs->x, &sc->x)
+	COPY(sr);
+	COPY(pc);
+	COPY(lr);
+	COPY(sp);
+	COPY(r12);
+	COPY(r11);
+	COPY(r10);
+	COPY(r9);
+	COPY(r8);
+	COPY(r7);
+	COPY(r6);
+	COPY(r5);
+	COPY(r4);
+	COPY(r3);
+	COPY(r2);
+	COPY(r1);
+	COPY(r0);
+#undef	COPY
+
+	return err;
+}
+
+static inline void __user *
+get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, int framesize)
+{
+	unsigned long sp = regs->sp;
+
+	if ((ka->sa.sa_flags & SA_ONSTACK) && !sas_ss_flags(sp))
+		sp = current->sas_ss_sp + current->sas_ss_size;
+
+	return (void __user *)((sp - framesize) & ~3);
+}
+
+static int
+setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+	       sigset_t *set, struct pt_regs *regs)
+{
+	struct rt_sigframe __user *frame;
+	int err = 0;
+
+	frame = get_sigframe(ka, regs, sizeof(*frame));
+	err = -EFAULT;
+	if (!access_ok(VERIFY_WRITE, frame, sizeof (*frame)))
+		goto out;
+
+	/*
+	 * Set up the return code:
+	 *
+	 *	mov	r8, __NR_rt_sigreturn
+	 *	scall
+	 *
+	 * Note: This will blow up since we're using a non-executable
+	 * stack. Better use SA_RESTORER.
+	 */
+#if __NR_rt_sigreturn > 127
+# error __NR_rt_sigreturn must be < 127 to fit in a short mov
+#endif
+	err = __put_user(0x3008d733 | (__NR_rt_sigreturn << 20),
+			 &frame->retcode);
+
+	err |= copy_siginfo_to_user(&frame->info, info);
+
+	/* Set up the ucontext */
+	err |= __put_user(0, &frame->uc.uc_flags);
+	err |= __put_user(NULL, &frame->uc.uc_link);
+	err |= __put_user((void __user *)current->sas_ss_sp,
+			  &frame->uc.uc_stack.ss_sp);
+	err |= __put_user(sas_ss_flags(regs->sp),
+			  &frame->uc.uc_stack.ss_flags);
+	err |= __put_user(current->sas_ss_size,
+			  &frame->uc.uc_stack.ss_size);
+	err |= setup_sigcontext(&frame->uc.uc_mcontext, regs);
+	err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
+
+	if (err)
+		goto out;
+
+	regs->r12 = sig;
+	regs->r11 = (unsigned long) &frame->info;
+	regs->r10 = (unsigned long) &frame->uc;
+	regs->sp = (unsigned long) frame;
+	if (ka->sa.sa_flags & SA_RESTORER)
+		regs->lr = (unsigned long)ka->sa.sa_restorer;
+	else {
+		printk(KERN_NOTICE "[%s:%d] did not set SA_RESTORER\n",
+		       current->comm, current->pid);
+		regs->lr = (unsigned long) &frame->retcode;
+	}
+
+	pr_debug("SIG deliver [%s:%d]: sig=%d sp=0x%lx pc=0x%lx->0x%p lr=0x%lx\n",
+		 current->comm, current->pid, sig, regs->sp,
+		 regs->pc, ka->sa.sa_handler, regs->lr);
+
+	regs->pc = (unsigned long) ka->sa.sa_handler;
+
+out:
+	return err;
+}
+
+static inline void restart_syscall(struct pt_regs *regs)
+{
+	if (regs->r12 == -ERESTART_RESTARTBLOCK)
+		regs->r8 = __NR_restart_syscall;
+	else
+		regs->r12 = regs->r12_orig;
+	regs->pc -= 2;
+}
+
+static inline void
+handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info,
+	      sigset_t *oldset, struct pt_regs *regs, int syscall)
+{
+	int ret;
+
+	/*
+	 * Set up the stack frame
+	 */
+	ret = setup_rt_frame(sig, ka, info, oldset, regs);
+
+	/*
+	 * Check that the resulting registers are sane
+	 */
+	ret |= !valid_user_regs(regs);
+
+	/*
+	 * Block the signal if we were unsuccessful.
+	 */
+	if (ret != 0 || !(ka->sa.sa_flags & SA_NODEFER)) {
+		spin_lock_irq(&current->sighand->siglock);
+		sigorsets(&current->blocked, &current->blocked,
+			  &ka->sa.sa_mask);
+		sigaddset(&current->blocked, sig);
+		recalc_sigpending();
+		spin_unlock_irq(&current->sighand->siglock);
+	}
+
+	if (ret == 0)
+		return;
+
+	force_sigsegv(sig, current);
+}
+
+/*
+ * Note that 'init' is a special process: it doesn't get signals it
+ * doesn't want to handle. Thus you cannot kill init even with a
+ * SIGKILL even by mistake.
+ */
+int do_signal(struct pt_regs *regs, sigset_t *oldset, int syscall)
+{
+	siginfo_t info;
+	int signr;
+	struct k_sigaction ka;
+
+	/*
+	 * We want the common case to go fast, which is why we may in
+	 * certain cases get here from kernel mode. Just return
+	 * without doing anything if so.
+	 */
+	if (!user_mode(regs))
+		return 0;
+
+	if (try_to_freeze()) {
+		signr = 0;
+		if (!signal_pending(current))
+			goto no_signal;
+	}
+
+	if (test_thread_flag(TIF_RESTORE_SIGMASK))
+		oldset = &current->saved_sigmask;
+	else if (!oldset)
+		oldset = &current->blocked;
+
+	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
+no_signal:
+	if (syscall) {
+		switch (regs->r12) {
+		case -ERESTART_RESTARTBLOCK:
+		case -ERESTARTNOHAND:
+			if (signr > 0) {
+				regs->r12 = -EINTR;
+				break;
+			}
+			/* fall through */
+		case -ERESTARTSYS:
+			if (signr > 0 && !(ka.sa.sa_flags & SA_RESTART)) {
+				regs->r12 = -EINTR;
+				break;
+			}
+			/* fall through */
+		case -ERESTARTNOINTR:
+			restart_syscall(regs);
+		}
+	}
+
+	if (signr == 0) {
+		/* No signal to deliver -- put the saved sigmask back */
+		if (test_thread_flag(TIF_RESTORE_SIGMASK)) {
+			clear_thread_flag(TIF_RESTORE_SIGMASK);
+			sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
+		}
+		return 0;
+	}
+
+	handle_signal(signr, &ka, &info, oldset, regs, syscall);
+	return 1;
+}
+
+asmlinkage void do_notify_resume(struct pt_regs *regs, struct thread_info *ti)
+{
+	int syscall = 0;
+
+	if ((sysreg_read(SR) & MODE_MASK) == MODE_SUPERVISOR)
+		syscall = 1;
+
+	if (ti->flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK))
+		do_signal(regs, &current->blocked, syscall);
+}
diff --git a/arch/avr32/kernel/switch_to.S b/arch/avr32/kernel/switch_to.S
new file mode 100644
index 00000000000..a48d046723c
--- /dev/null
+++ b/arch/avr32/kernel/switch_to.S
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/sysreg.h>
+
+	.text
+	.global	__switch_to
+	.type	__switch_to, @function
+
+	/* Switch thread context from "prev" to "next", returning "last"
+	 *   r12 :	prev
+	 *   r11 :	&prev->thread + 1
+	 *   r10 :	&next->thread
+	 */
+__switch_to:
+	stm	--r11, r0,r1,r2,r3,r4,r5,r6,r7,sp,lr
+	mfsr	r9, SYSREG_SR
+	st.w	--r11, r9
+	ld.w	r8, r10++
+	/*
+	 * schedule() may have been called from a mode with a different
+	 * set of registers. Make sure we don't lose anything here.
+	 */
+	pushm	r10,r12
+	mtsr	SYSREG_SR, r8
+	frs			/* flush the return stack */
+	sub	pc, -2		/* flush the pipeline */
+	popm	r10,r12
+	ldm	r10++, r0,r1,r2,r3,r4,r5,r6,r7,sp,pc
+	.size	__switch_to, . - __switch_to
diff --git a/arch/avr32/kernel/sys_avr32.c b/arch/avr32/kernel/sys_avr32.c
new file mode 100644
index 00000000000..6ec5693da44
--- /dev/null
+++ b/arch/avr32/kernel/sys_avr32.c
@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/unistd.h>
+
+#include <asm/mman.h>
+#include <asm/uaccess.h>
+
+asmlinkage int sys_pipe(unsigned long __user *filedes)
+{
+	int fd[2];
+	int error;
+
+	error = do_pipe(fd);
+	if (!error) {
+		if (copy_to_user(filedes, fd, sizeof(fd)))
+			error = -EFAULT;
+	}
+	return error;
+}
+
+asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
+			  unsigned long prot, unsigned long flags,
+			  unsigned long fd, off_t offset)
+{
+	int error = -EBADF;
+	struct file *file = NULL;
+
+	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
+	if (!(flags & MAP_ANONYMOUS)) {
+		file = fget(fd);
+		if (!file)
+			return error;
+	}
+
+	down_write(&current->mm->mmap_sem);
+	error = do_mmap_pgoff(file, addr, len, prot, flags, offset);
+	up_write(&current->mm->mmap_sem);
+
+	if (file)
+		fput(file);
+	return error;
+}
diff --git a/arch/avr32/kernel/syscall-stubs.S b/arch/avr32/kernel/syscall-stubs.S
new file mode 100644
index 00000000000..7589a9b426c
--- /dev/null
+++ b/arch/avr32/kernel/syscall-stubs.S
@@ -0,0 +1,102 @@
+/*
+ * Copyright (C) 2005-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/*
+ * Stubs for syscalls that require access to pt_regs or that take more
+ * than five parameters.
+ */
+
+#define ARG6	r3
+
+	.text
+	.global __sys_rt_sigsuspend
+	.type	__sys_rt_sigsuspend,@function
+__sys_rt_sigsuspend:
+	mov	r10, sp
+	rjmp	sys_rt_sigsuspend
+
+	.global	__sys_sigaltstack
+	.type	__sys_sigaltstack,@function
+__sys_sigaltstack:
+	mov	r10, sp
+	rjmp	sys_sigaltstack
+
+	.global	__sys_rt_sigreturn
+	.type	__sys_rt_sigreturn,@function
+__sys_rt_sigreturn:
+	mov	r12, sp
+	rjmp	sys_rt_sigreturn
+
+	.global	__sys_fork
+	.type	__sys_fork,@function
+__sys_fork:
+	mov	r12, sp
+	rjmp	sys_fork
+
+	.global	__sys_clone
+	.type	__sys_clone,@function
+__sys_clone:
+	mov	r8, sp
+	rjmp	sys_clone
+
+	.global	__sys_vfork
+	.type	__sys_vfork,@function
+__sys_vfork:
+	mov	r12, sp
+	rjmp	sys_vfork
+
+	.global	__sys_execve
+	.type	__sys_execve,@function
+__sys_execve:
+	mov	r9, sp
+	rjmp	sys_execve
+
+	.global	__sys_mmap2
+	.type	__sys_mmap2,@function
+__sys_mmap2:
+	pushm	lr
+	st.w	--sp, ARG6
+	rcall	sys_mmap2
+	sub	sp, -4
+	popm	pc
+
+	.global	__sys_sendto
+	.type	__sys_sendto,@function
+__sys_sendto:
+	pushm	lr
+	st.w	--sp, ARG6
+	rcall	sys_sendto
+	sub	sp, -4
+	popm	pc
+
+	.global	__sys_recvfrom
+	.type	__sys_recvfrom,@function
+__sys_recvfrom:
+	pushm	lr
+	st.w	--sp, ARG6
+	rcall	sys_recvfrom
+	sub	sp, -4
+	popm	pc
+
+	.global	__sys_pselect6
+	.type	__sys_pselect6,@function
+__sys_pselect6:
+	pushm	lr
+	st.w	--sp, ARG6
+	rcall	sys_pselect6
+	sub	sp, -4
+	popm	pc
+
+	.global	__sys_splice
+	.type	__sys_splice,@function
+__sys_splice:
+	pushm	lr
+	st.w	--sp, ARG6
+	rcall	sys_splice
+	sub	sp, -4
+	popm	pc
diff --git a/arch/avr32/kernel/syscall_table.S b/arch/avr32/kernel/syscall_table.S
new file mode 100644
index 00000000000..63b206965d0
--- /dev/null
+++ b/arch/avr32/kernel/syscall_table.S
@@ -0,0 +1,289 @@
+/*
+ * AVR32 system call table
+ *
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#if !defined(CONFIG_NFSD) && !defined(CONFIG_NFSD_MODULE)
+#define sys_nfsservctl sys_ni_syscall
+#endif
+
+#if !defined(CONFIG_SYSV_IPC)
+# define sys_ipc	sys_ni_syscall
+#endif
+
+	.section .rodata,"a",@progbits
+	.type	sys_call_table,@object
+	.global	sys_call_table
+	.align	2
+sys_call_table:
+	.long	sys_restart_syscall
+	.long	sys_exit
+	.long	__sys_fork
+	.long	sys_read
+	.long	sys_write
+	.long	sys_open		/* 5 */
+	.long	sys_close
+	.long	sys_umask
+	.long	sys_creat
+	.long	sys_link
+	.long	sys_unlink		/* 10 */
+	.long	__sys_execve
+	.long	sys_chdir
+	.long	sys_time
+	.long	sys_mknod
+	.long	sys_chmod		/* 15 */
+	.long	sys_chown
+	.long	sys_lchown
+	.long	sys_lseek
+	.long	sys_llseek
+	.long	sys_getpid		/* 20 */
+	.long	sys_mount
+	.long	sys_umount
+	.long	sys_setuid
+	.long	sys_getuid
+	.long	sys_stime		/* 25 */
+	.long	sys_ptrace
+	.long	sys_alarm
+	.long	sys_pause
+	.long	sys_utime
+	.long	sys_newstat		/* 30 */
+	.long	sys_newfstat
+	.long	sys_newlstat
+	.long	sys_access
+	.long	sys_chroot
+	.long	sys_sync		/* 35 */
+	.long	sys_fsync
+	.long	sys_kill
+	.long	sys_rename
+	.long	sys_mkdir
+	.long	sys_rmdir		/* 40 */
+	.long	sys_dup
+	.long	sys_pipe
+	.long	sys_times
+	.long	__sys_clone
+	.long	sys_brk			/* 45 */
+	.long	sys_setgid
+	.long	sys_getgid
+	.long	sys_getcwd
+	.long	sys_geteuid
+	.long	sys_getegid		/* 50 */
+	.long	sys_acct
+	.long	sys_setfsuid
+	.long	sys_setfsgid
+	.long	sys_ioctl
+	.long	sys_fcntl		/* 55 */
+	.long	sys_setpgid
+	.long	sys_mremap
+	.long	sys_setresuid
+	.long	sys_getresuid
+	.long	sys_setreuid		/* 60 */
+	.long	sys_setregid
+	.long	sys_ustat
+	.long	sys_dup2
+	.long	sys_getppid
+	.long	sys_getpgrp		/* 65 */
+	.long	sys_setsid
+	.long	sys_rt_sigaction
+	.long	__sys_rt_sigreturn
+	.long	sys_rt_sigprocmask
+	.long	sys_rt_sigpending	/* 70 */
+	.long	sys_rt_sigtimedwait
+	.long	sys_rt_sigqueueinfo
+	.long	__sys_rt_sigsuspend
+	.long	sys_sethostname
+	.long	sys_setrlimit		/* 75 */
+	.long	sys_getrlimit
+	.long	sys_getrusage
+	.long	sys_gettimeofday
+	.long	sys_settimeofday
+	.long	sys_getgroups		/* 80 */
+	.long	sys_setgroups
+	.long	sys_select
+	.long	sys_symlink
+	.long	sys_fchdir
+	.long	sys_readlink		/* 85 */
+	.long	sys_pread64
+	.long	sys_pwrite64
+	.long	sys_swapon
+	.long	sys_reboot
+	.long	__sys_mmap2		/* 90 */
+	.long	sys_munmap
+	.long	sys_truncate
+	.long	sys_ftruncate
+	.long	sys_fchmod
+	.long	sys_fchown		/* 95 */
+	.long	sys_getpriority
+	.long	sys_setpriority
+	.long	sys_wait4
+	.long	sys_statfs
+	.long	sys_fstatfs		/* 100 */
+	.long	sys_vhangup
+	.long	__sys_sigaltstack
+	.long	sys_syslog
+	.long	sys_setitimer
+	.long	sys_getitimer		/* 105 */
+	.long	sys_swapoff
+	.long	sys_sysinfo
+	.long	sys_ipc
+	.long	sys_sendfile
+	.long	sys_setdomainname	/* 110 */
+	.long	sys_newuname
+	.long	sys_adjtimex
+	.long	sys_mprotect
+	.long	__sys_vfork
+	.long	sys_init_module		/* 115 */
+	.long	sys_delete_module
+	.long	sys_quotactl
+	.long	sys_getpgid
+	.long	sys_bdflush
+	.long	sys_sysfs		/* 120 */
+	.long	sys_personality
+	.long	sys_ni_syscall		/* reserved for afs_syscall */
+	.long	sys_getdents
+	.long	sys_flock
+	.long	sys_msync		/* 125 */
+	.long	sys_readv
+	.long	sys_writev
+	.long	sys_getsid
+	.long	sys_fdatasync
+	.long	sys_sysctl		/* 130 */
+	.long	sys_mlock
+	.long	sys_munlock
+	.long	sys_mlockall
+	.long	sys_munlockall
+	.long	sys_sched_setparam		/* 135 */
+	.long	sys_sched_getparam
+	.long	sys_sched_setscheduler
+	.long	sys_sched_getscheduler
+	.long	sys_sched_yield
+	.long	sys_sched_get_priority_max	/* 140 */
+	.long	sys_sched_get_priority_min
+	.long	sys_sched_rr_get_interval
+	.long	sys_nanosleep
+	.long	sys_poll
+	.long	sys_nfsservctl		/* 145 */
+	.long	sys_setresgid
+	.long	sys_getresgid
+	.long	sys_prctl
+	.long	sys_socket
+	.long	sys_bind		/* 150 */
+	.long	sys_connect
+	.long	sys_listen
+	.long	sys_accept
+	.long	sys_getsockname
+	.long	sys_getpeername		/* 155 */
+	.long	sys_socketpair
+	.long	sys_send
+	.long	sys_recv
+	.long	__sys_sendto
+	.long	__sys_recvfrom		/* 160 */
+	.long	sys_shutdown
+	.long	sys_setsockopt
+	.long	sys_getsockopt
+	.long	sys_sendmsg
+	.long	sys_recvmsg		/* 165 */
+	.long	sys_truncate64
+	.long	sys_ftruncate64
+	.long	sys_stat64
+	.long	sys_lstat64
+	.long	sys_fstat64		/* 170 */
+	.long	sys_pivot_root
+	.long	sys_mincore
+	.long	sys_madvise
+	.long	sys_getdents64
+	.long	sys_fcntl64		/* 175 */
+	.long	sys_gettid
+	.long	sys_readahead
+	.long	sys_setxattr
+	.long	sys_lsetxattr
+	.long	sys_fsetxattr		/* 180 */
+	.long	sys_getxattr
+	.long	sys_lgetxattr
+	.long	sys_fgetxattr
+	.long	sys_listxattr
+	.long	sys_llistxattr		/* 185 */
+	.long	sys_flistxattr
+	.long	sys_removexattr
+	.long	sys_lremovexattr
+	.long	sys_fremovexattr
+	.long	sys_tkill		/* 190 */
+	.long	sys_sendfile64
+	.long	sys_futex
+	.long	sys_sched_setaffinity
+	.long	sys_sched_getaffinity
+	.long	sys_capget		/* 195 */
+	.long	sys_capset
+	.long	sys_io_setup
+	.long	sys_io_destroy
+	.long	sys_io_getevents
+	.long	sys_io_submit		/* 200 */
+	.long	sys_io_cancel
+	.long	sys_fadvise64
+	.long	sys_exit_group
+	.long	sys_lookup_dcookie
+	.long	sys_epoll_create	/* 205 */
+	.long	sys_epoll_ctl
+	.long	sys_epoll_wait
+	.long	sys_remap_file_pages
+	.long	sys_set_tid_address
+	.long	sys_timer_create	/* 210 */
+	.long	sys_timer_settime
+	.long	sys_timer_gettime
+	.long	sys_timer_getoverrun
+	.long	sys_timer_delete
+	.long	sys_clock_settime	/* 215 */
+	.long	sys_clock_gettime
+	.long	sys_clock_getres
+	.long	sys_clock_nanosleep
+	.long	sys_statfs64
+	.long	sys_fstatfs64		/* 220 */
+	.long	sys_tgkill
+	.long	sys_ni_syscall		/* reserved for TUX */
+	.long	sys_utimes
+	.long	sys_fadvise64_64
+	.long	sys_cacheflush		/* 225 */
+	.long	sys_ni_syscall		/* sys_vserver */
+	.long	sys_mq_open
+	.long	sys_mq_unlink
+	.long	sys_mq_timedsend
+	.long	sys_mq_timedreceive	/* 230 */
+	.long	sys_mq_notify
+	.long	sys_mq_getsetattr
+	.long	sys_kexec_load
+	.long	sys_waitid
+	.long	sys_add_key		/* 235 */
+	.long	sys_request_key
+	.long	sys_keyctl
+	.long	sys_ioprio_set
+	.long	sys_ioprio_get
+	.long	sys_inotify_init	/* 240 */
+	.long	sys_inotify_add_watch
+	.long	sys_inotify_rm_watch
+	.long	sys_openat
+	.long	sys_mkdirat
+	.long	sys_mknodat		/* 245 */
+	.long	sys_fchownat
+	.long	sys_futimesat
+	.long	sys_fstatat64
+	.long	sys_unlinkat
+	.long	sys_renameat		/* 250 */
+	.long	sys_linkat
+	.long	sys_symlinkat
+	.long	sys_readlinkat
+	.long	sys_fchmodat
+	.long	sys_faccessat		/* 255 */
+	.long	__sys_pselect6
+	.long	sys_ppoll
+	.long	sys_unshare
+	.long	sys_set_robust_list
+	.long	sys_get_robust_list	/* 260 */
+	.long	__sys_splice
+	.long	sys_sync_file_range
+	.long	sys_tee
+	.long	sys_vmsplice
+	.long	sys_ni_syscall		/* r8 is saturated at nr_syscalls */
diff --git a/arch/avr32/kernel/time.c b/arch/avr32/kernel/time.c
new file mode 100644
index 00000000000..b0e6b5855a3
--- /dev/null
+++ b/arch/avr32/kernel/time.c
@@ -0,0 +1,238 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * Based on MIPS implementation arch/mips/kernel/time.c
+ *   Copyright 2001 MontaVista Software Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/clk.h>
+#include <linux/clocksource.h>
+#include <linux/time.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/kernel_stat.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/profile.h>
+#include <linux/sysdev.h>
+
+#include <asm/div64.h>
+#include <asm/sysreg.h>
+#include <asm/io.h>
+#include <asm/sections.h>
+
+static cycle_t read_cycle_count(void)
+{
+	return (cycle_t)sysreg_read(COUNT);
+}
+
+static struct clocksource clocksource_avr32 = {
+	.name		= "avr32",
+	.rating		= 350,
+	.read		= read_cycle_count,
+	.mask		= CLOCKSOURCE_MASK(32),
+	.shift		= 16,
+	.is_continuous	= 1,
+};
+
+/*
+ * By default we provide the null RTC ops
+ */
+static unsigned long null_rtc_get_time(void)
+{
+	return mktime(2004, 1, 1, 0, 0, 0);
+}
+
+static int null_rtc_set_time(unsigned long sec)
+{
+	return 0;
+}
+
+static unsigned long (*rtc_get_time)(void) = null_rtc_get_time;
+static int (*rtc_set_time)(unsigned long) = null_rtc_set_time;
+
+/* how many counter cycles in a jiffy? */
+static unsigned long cycles_per_jiffy;
+
+/* cycle counter value at the previous timer interrupt */
+static unsigned int timerhi, timerlo;
+
+/* the count value for the next timer interrupt */
+static unsigned int expirelo;
+
+static void avr32_timer_ack(void)
+{
+	unsigned int count;
+
+	/* Ack this timer interrupt and set the next one */
+	expirelo += cycles_per_jiffy;
+	if (expirelo == 0) {
+		printk(KERN_DEBUG "expirelo == 0\n");
+		sysreg_write(COMPARE, expirelo + 1);
+	} else {
+		sysreg_write(COMPARE, expirelo);
+	}
+
+	/* Check to see if we have missed any timer interrupts */
+	count = sysreg_read(COUNT);
+	if ((count - expirelo) < 0x7fffffff) {
+		expirelo = count + cycles_per_jiffy;
+		sysreg_write(COMPARE, expirelo);
+	}
+}
+
+static unsigned int avr32_hpt_read(void)
+{
+	return sysreg_read(COUNT);
+}
+
+/*
+ * Taken from MIPS c0_hpt_timer_init().
+ *
+ * Why is it so complicated, and what is "count"?  My assumption is
+ * that `count' specifies the "reference cycle", i.e. the cycle since
+ * reset that should mean "zero". The reason COUNT is written twice is
+ * probably to make sure we don't get any timer interrupts while we
+ * are messing with the counter.
+ */
+static void avr32_hpt_init(unsigned int count)
+{
+	count = sysreg_read(COUNT) - count;
+	expirelo = (count / cycles_per_jiffy + 1) * cycles_per_jiffy;
+	sysreg_write(COUNT, expirelo - cycles_per_jiffy);
+	sysreg_write(COMPARE, expirelo);
+	sysreg_write(COUNT, count);
+}
+
+/*
+ * Scheduler clock - returns current time in nanosec units.
+ */
+unsigned long long sched_clock(void)
+{
+	/* There must be better ways...? */
+	return (unsigned long long)jiffies * (1000000000 / HZ);
+}
+
+/*
+ * local_timer_interrupt() does profiling and process accounting on a
+ * per-CPU basis.
+ *
+ * In UP mode, it is invoked from the (global) timer_interrupt.
+ */
+static void local_timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
+{
+	if (current->pid)
+		profile_tick(CPU_PROFILING, regs);
+	update_process_times(user_mode(regs));
+}
+
+static irqreturn_t
+timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
+{
+	unsigned int count;
+
+	/* ack timer interrupt and try to set next interrupt */
+	count = avr32_hpt_read();
+	avr32_timer_ack();
+
+	/* Update timerhi/timerlo for intra-jiffy calibration */
+	timerhi += count < timerlo;	/* Wrap around */
+	timerlo = count;
+
+	/*
+	 * Call the generic timer interrupt handler
+	 */
+	write_seqlock(&xtime_lock);
+	do_timer(regs);
+	write_sequnlock(&xtime_lock);
+
+	/*
+	 * In UP mode, we call local_timer_interrupt() to do profiling
+	 * and process accounting.
+	 *
+	 * SMP is not supported yet.
+	 */
+	local_timer_interrupt(irq, dev_id, regs);
+
+	return IRQ_HANDLED;
+}
+
+static struct irqaction timer_irqaction = {
+	.handler	= timer_interrupt,
+	.flags		= IRQF_DISABLED,
+	.name		= "timer",
+};
+
+void __init time_init(void)
+{
+	unsigned long mult, shift, count_hz;
+	int ret;
+
+	xtime.tv_sec = rtc_get_time();
+	xtime.tv_nsec = 0;
+
+	set_normalized_timespec(&wall_to_monotonic,
+				-xtime.tv_sec, -xtime.tv_nsec);
+
+	printk("Before time_init: count=%08lx, compare=%08lx\n",
+	       (unsigned long)sysreg_read(COUNT),
+	       (unsigned long)sysreg_read(COMPARE));
+
+	count_hz = clk_get_rate(boot_cpu_data.clk);
+	shift = clocksource_avr32.shift;
+	mult = clocksource_hz2mult(count_hz, shift);
+	clocksource_avr32.mult = mult;
+
+	printk("Cycle counter: mult=%lu, shift=%lu\n", mult, shift);
+
+	{
+		u64 tmp;
+
+		tmp = TICK_NSEC;
+		tmp <<= shift;
+		tmp += mult / 2;
+		do_div(tmp, mult);
+
+		cycles_per_jiffy = tmp;
+	}
+
+	/* This sets up the high precision timer for the first interrupt. */
+	avr32_hpt_init(avr32_hpt_read());
+
+	printk("After time_init: count=%08lx, compare=%08lx\n",
+	       (unsigned long)sysreg_read(COUNT),
+	       (unsigned long)sysreg_read(COMPARE));
+
+	ret = clocksource_register(&clocksource_avr32);
+	if (ret)
+		printk(KERN_ERR
+		       "timer: could not register clocksource: %d\n", ret);
+
+	ret = setup_irq(0, &timer_irqaction);
+	if (ret)
+		printk("timer: could not request IRQ 0: %d\n", ret);
+}
+
+static struct sysdev_class timer_class = {
+	set_kset_name("timer"),
+};
+
+static struct sys_device timer_device = {
+	.id	= 0,
+	.cls	= &timer_class,
+};
+
+static int __init init_timer_sysfs(void)
+{
+	int err = sysdev_class_register(&timer_class);
+	if (!err)
+		err = sysdev_register(&timer_device);
+	return err;
+}
+
+device_initcall(init_timer_sysfs);
diff --git a/arch/avr32/kernel/traps.c b/arch/avr32/kernel/traps.c
new file mode 100644
index 00000000000..7e803f4d7a1
--- /dev/null
+++ b/arch/avr32/kernel/traps.c
@@ -0,0 +1,425 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#undef DEBUG
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/notifier.h>
+
+#include <asm/traps.h>
+#include <asm/sysreg.h>
+#include <asm/addrspace.h>
+#include <asm/ocd.h>
+#include <asm/mmu_context.h>
+#include <asm/uaccess.h>
+
+static void dump_mem(const char *str, unsigned long bottom, unsigned long top)
+{
+	unsigned long p;
+	int i;
+
+	printk("%s(0x%08lx to 0x%08lx)\n", str, bottom, top);
+
+	for (p = bottom & ~31; p < top; ) {
+		printk("%04lx: ", p & 0xffff);
+
+		for (i = 0; i < 8; i++, p += 4) {
+			unsigned int val;
+
+			if (p < bottom || p >= top)
+				printk("         ");
+			else {
+				if (__get_user(val, (unsigned int __user *)p)) {
+					printk("\n");
+					goto out;
+				}
+				printk("%08x ", val);
+			}
+		}
+		printk("\n");
+	}
+
+out:
+	return;
+}
+
+#ifdef CONFIG_FRAME_POINTER
+static inline void __show_trace(struct task_struct *tsk, unsigned long *sp,
+				struct pt_regs *regs)
+{
+	unsigned long __user *fp;
+	unsigned long __user *last_fp = NULL;
+
+	if (regs) {
+		fp = (unsigned long __user *)regs->r7;
+	} else if (tsk == current) {
+		register unsigned long __user *real_fp __asm__("r7");
+		fp = real_fp;
+	} else {
+		fp = (unsigned long __user *)tsk->thread.cpu_context.r7;
+	}
+
+	/*
+	 * Walk the stack until (a) we get an exception, (b) the frame
+	 * pointer becomes zero, or (c) the frame pointer gets stuck
+	 * at the same value.
+	 */
+	while (fp && fp != last_fp) {
+		unsigned long lr, new_fp = 0;
+
+		last_fp = fp;
+		if (__get_user(lr, fp))
+			break;
+		if (fp && __get_user(new_fp, fp + 1))
+			break;
+		fp = (unsigned long __user *)new_fp;
+
+		printk(" [<%08lx>] ", lr);
+		print_symbol("%s\n", lr);
+	}
+	printk("\n");
+}
+#else
+static inline void __show_trace(struct task_struct *tsk, unsigned long *sp,
+				struct pt_regs *regs)
+{
+	unsigned long addr;
+
+	while (!kstack_end(sp)) {
+		addr = *sp++;
+		if (kernel_text_address(addr)) {
+			printk(" [<%08lx>] ", addr);
+			print_symbol("%s\n", addr);
+		}
+	}
+}
+#endif
+
+void show_trace(struct task_struct *tsk, unsigned long *sp,
+		       struct pt_regs *regs)
+{
+	if (regs &&
+	    (((regs->sr & MODE_MASK) == MODE_EXCEPTION) ||
+	     ((regs->sr & MODE_MASK) == MODE_USER)))
+		return;
+
+	printk ("Call trace:");
+#ifdef CONFIG_KALLSYMS
+	printk("\n");
+#endif
+
+	__show_trace(tsk, sp, regs);
+	printk("\n");
+}
+
+void show_stack(struct task_struct *tsk, unsigned long *sp)
+{
+	unsigned long stack;
+
+	if (!tsk)
+		tsk = current;
+	if (sp == 0) {
+		if (tsk == current) {
+			register unsigned long *real_sp __asm__("sp");
+			sp = real_sp;
+		} else {
+			sp = (unsigned long *)tsk->thread.cpu_context.ksp;
+		}
+	}
+
+	stack = (unsigned long)sp;
+	dump_mem("Stack: ", stack,
+		 THREAD_SIZE + (unsigned long)tsk->thread_info);
+	show_trace(tsk, sp, NULL);
+}
+
+void dump_stack(void)
+{
+	show_stack(NULL, NULL);
+}
+EXPORT_SYMBOL(dump_stack);
+
+ATOMIC_NOTIFIER_HEAD(avr32_die_chain);
+
+int register_die_notifier(struct notifier_block *nb)
+{
+	pr_debug("register_die_notifier: %p\n", nb);
+
+	return atomic_notifier_chain_register(&avr32_die_chain, nb);
+}
+EXPORT_SYMBOL(register_die_notifier);
+
+int unregister_die_notifier(struct notifier_block *nb)
+{
+	return atomic_notifier_chain_unregister(&avr32_die_chain, nb);
+}
+EXPORT_SYMBOL(unregister_die_notifier);
+
+static DEFINE_SPINLOCK(die_lock);
+
+void __die(const char *str, struct pt_regs *regs, unsigned long err,
+	   const char *file, const char *func, unsigned long line)
+{
+	struct task_struct *tsk = current;
+	static int die_counter;
+
+	console_verbose();
+	spin_lock_irq(&die_lock);
+	bust_spinlocks(1);
+
+	printk(KERN_ALERT "%s", str);
+	if (file && func)
+		printk(" in %s:%s, line %ld", file, func, line);
+	printk("[#%d]:\n", ++die_counter);
+	print_modules();
+	show_regs(regs);
+	printk("Process %s (pid: %d, stack limit = 0x%p)\n",
+	       tsk->comm, tsk->pid, tsk->thread_info + 1);
+
+	if (!user_mode(regs) || in_interrupt()) {
+		dump_mem("Stack: ", regs->sp,
+			 THREAD_SIZE + (unsigned long)tsk->thread_info);
+	}
+
+	bust_spinlocks(0);
+	spin_unlock_irq(&die_lock);
+	do_exit(SIGSEGV);
+}
+
+void __die_if_kernel(const char *str, struct pt_regs *regs, unsigned long err,
+		     const char *file, const char *func, unsigned long line)
+{
+	if (!user_mode(regs))
+		__die(str, regs, err, file, func, line);
+}
+
+asmlinkage void do_nmi(unsigned long ecr, struct pt_regs *regs)
+{
+#ifdef CONFIG_SUBARCH_AVR32B
+	/*
+	 * The exception entry always saves RSR_EX. For NMI, this is
+	 * wrong; it should be RSR_NMI
+	 */
+	regs->sr = sysreg_read(RSR_NMI);
+#endif
+
+	printk("NMI taken!!!!\n");
+	die("NMI", regs, ecr);
+	BUG();
+}
+
+asmlinkage void do_critical_exception(unsigned long ecr, struct pt_regs *regs)
+{
+	printk("Unable to handle critical exception %lu at pc = %08lx!\n",
+	       ecr, regs->pc);
+	die("Oops", regs, ecr);
+	BUG();
+}
+
+asmlinkage void do_address_exception(unsigned long ecr, struct pt_regs *regs)
+{
+	siginfo_t info;
+
+	die_if_kernel("Oops: Address exception in kernel mode", regs, ecr);
+
+#ifdef DEBUG
+	if (ecr == ECR_ADDR_ALIGN_X)
+		pr_debug("Instruction Address Exception at pc = %08lx\n",
+			 regs->pc);
+	else if (ecr == ECR_ADDR_ALIGN_R)
+		pr_debug("Data Address Exception (Read) at pc = %08lx\n",
+			 regs->pc);
+	else if (ecr == ECR_ADDR_ALIGN_W)
+		pr_debug("Data Address Exception (Write) at pc = %08lx\n",
+			 regs->pc);
+	else
+		BUG();
+
+	show_regs(regs);
+#endif
+
+	info.si_signo = SIGBUS;
+	info.si_errno = 0;
+	info.si_code = BUS_ADRALN;
+	info.si_addr = (void __user *)regs->pc;
+
+	force_sig_info(SIGBUS, &info, current);
+}
+
+/* This way of handling undefined instructions is stolen from ARM */
+static LIST_HEAD(undef_hook);
+static spinlock_t undef_lock = SPIN_LOCK_UNLOCKED;
+
+void register_undef_hook(struct undef_hook *hook)
+{
+	spin_lock_irq(&undef_lock);
+	list_add(&hook->node, &undef_hook);
+	spin_unlock_irq(&undef_lock);
+}
+
+void unregister_undef_hook(struct undef_hook *hook)
+{
+	spin_lock_irq(&undef_lock);
+	list_del(&hook->node);
+	spin_unlock_irq(&undef_lock);
+}
+
+static int do_cop_absent(u32 insn)
+{
+	int cop_nr;
+	u32 cpucr;
+	if ( (insn & 0xfdf00000) == 0xf1900000 )
+		/* LDC0 */
+		cop_nr = 0;
+	else
+		cop_nr = (insn >> 13) & 0x7;
+
+	/* Try enabling the coprocessor */
+	cpucr = sysreg_read(CPUCR);
+	cpucr |= (1 << (24 + cop_nr));
+	sysreg_write(CPUCR, cpucr);
+
+	cpucr = sysreg_read(CPUCR);
+	if ( !(cpucr & (1 << (24 + cop_nr))) ){
+		printk("Coprocessor #%i not found!\n", cop_nr);
+		return -1;
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_BUG
+#ifdef CONFIG_DEBUG_BUGVERBOSE
+static inline void do_bug_verbose(struct pt_regs *regs, u32 insn)
+{
+	char *file;
+	u16 line;
+	char c;
+
+	if (__get_user(line, (u16 __user *)(regs->pc + 2)))
+		return;
+	if (__get_user(file, (char * __user *)(regs->pc + 4))
+	    || (unsigned long)file < PAGE_OFFSET
+	    || __get_user(c, file))
+		file = "<bad filename>";
+
+	printk(KERN_ALERT "kernel BUG at %s:%d!\n", file, line);
+}
+#else
+static inline void do_bug_verbose(struct pt_regs *regs, u32 insn)
+{
+
+}
+#endif
+#endif
+
+asmlinkage void do_illegal_opcode(unsigned long ecr, struct pt_regs *regs)
+{
+	u32 insn;
+	struct undef_hook *hook;
+	siginfo_t info;
+	void __user *pc;
+
+	if (!user_mode(regs))
+		goto kernel_trap;
+
+	local_irq_enable();
+
+	pc = (void __user *)instruction_pointer(regs);
+	if (__get_user(insn, (u32 __user *)pc))
+		goto invalid_area;
+
+        if (ecr == ECR_COPROC_ABSENT) {
+		if (do_cop_absent(insn) == 0)
+			return;
+        }
+
+	spin_lock_irq(&undef_lock);
+	list_for_each_entry(hook, &undef_hook, node) {
+		if ((insn & hook->insn_mask) == hook->insn_val) {
+			if (hook->fn(regs, insn) == 0) {
+				spin_unlock_irq(&undef_lock);
+				return;
+			}
+		}
+	}
+	spin_unlock_irq(&undef_lock);
+
+invalid_area:
+
+#ifdef DEBUG
+	printk("Illegal instruction at pc = %08lx\n", regs->pc);
+	if (regs->pc < TASK_SIZE) {
+		unsigned long ptbr, pgd, pte, *p;
+
+		ptbr = sysreg_read(PTBR);
+		p = (unsigned long *)ptbr;
+		pgd = p[regs->pc >> 22];
+		p = (unsigned long *)((pgd & 0x1ffff000) | 0x80000000);
+		pte = p[(regs->pc >> 12) & 0x3ff];
+		printk("page table: 0x%08lx -> 0x%08lx -> 0x%08lx\n", ptbr, pgd, pte);
+	}
+#endif
+
+	info.si_signo = SIGILL;
+	info.si_errno = 0;
+	info.si_addr = (void __user *)regs->pc;
+	switch (ecr) {
+	case ECR_ILLEGAL_OPCODE:
+	case ECR_UNIMPL_INSTRUCTION:
+		info.si_code = ILL_ILLOPC;
+		break;
+	case ECR_PRIVILEGE_VIOLATION:
+		info.si_code = ILL_PRVOPC;
+		break;
+	case ECR_COPROC_ABSENT:
+		info.si_code = ILL_COPROC;
+		break;
+	default:
+		BUG();
+	}
+
+	force_sig_info(SIGILL, &info, current);
+	return;
+
+kernel_trap:
+#ifdef CONFIG_BUG
+	if (__kernel_text_address(instruction_pointer(regs))) {
+		insn = *(u16 *)instruction_pointer(regs);
+		if (insn == AVR32_BUG_OPCODE) {
+			do_bug_verbose(regs, insn);
+			die("Kernel BUG", regs, 0);
+			return;
+		}
+	}
+#endif
+
+	die("Oops: Illegal instruction in kernel code", regs, ecr);
+}
+
+asmlinkage void do_fpe(unsigned long ecr, struct pt_regs *regs)
+{
+	siginfo_t info;
+
+	printk("Floating-point exception at pc = %08lx\n", regs->pc);
+
+	/* We have no FPU... */
+	info.si_signo = SIGILL;
+	info.si_errno = 0;
+	info.si_addr = (void __user *)regs->pc;
+	info.si_code = ILL_COPROC;
+
+	force_sig_info(SIGILL, &info, current);
+}
+
+
+void __init trap_init(void)
+{
+
+}
diff --git a/arch/avr32/kernel/vmlinux.lds.c b/arch/avr32/kernel/vmlinux.lds.c
new file mode 100644
index 00000000000..cdd627c6b7d
--- /dev/null
+++ b/arch/avr32/kernel/vmlinux.lds.c
@@ -0,0 +1,139 @@
+/*
+ * AVR32 linker script for the Linux kernel
+ *
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define LOAD_OFFSET 0x00000000
+#include <asm-generic/vmlinux.lds.h>
+
+OUTPUT_FORMAT("elf32-avr32", "elf32-avr32", "elf32-avr32")
+OUTPUT_ARCH(avr32)
+ENTRY(_start)
+
+/* Big endian */
+jiffies = jiffies_64 + 4;
+
+SECTIONS
+{
+	. = CONFIG_ENTRY_ADDRESS;
+	.init		: AT(ADDR(.init) - LOAD_OFFSET) {
+		_stext = .;
+		__init_begin = .;
+			_sinittext = .;
+			*(.text.reset)
+			*(.init.text)
+			_einittext = .;
+		. = ALIGN(4);
+		__tagtable_begin = .;
+			*(.taglist)
+		__tagtable_end = .;
+			*(.init.data)
+		. = ALIGN(16);
+		__setup_start = .;
+			*(.init.setup)
+		__setup_end = .;
+		. = ALIGN(4);
+		__initcall_start = .;
+			*(.initcall1.init)
+			*(.initcall2.init)
+			*(.initcall3.init)
+			*(.initcall4.init)
+			*(.initcall5.init)
+			*(.initcall6.init)
+			*(.initcall7.init)
+		__initcall_end = .;
+		__con_initcall_start = .;
+			*(.con_initcall.init)
+		__con_initcall_end = .;
+		__security_initcall_start = .;
+			*(.security_initcall.init)
+		__security_initcall_end = .;
+		. = ALIGN(32);
+		__initramfs_start = .;
+			*(.init.ramfs)
+		__initramfs_end = .;
+		. = ALIGN(4096);
+		__init_end = .;
+	}
+
+	. = ALIGN(8192);
+	.text		: AT(ADDR(.text) - LOAD_OFFSET) {
+		_evba = .;
+		_text = .;
+		*(.ex.text)
+		. = 0x50;
+		*(.tlbx.ex.text)
+		. = 0x60;
+		*(.tlbr.ex.text)
+		. = 0x70;
+		*(.tlbw.ex.text)
+		. = 0x100;
+		*(.scall.text)
+		*(.irq.text)
+		*(.text)
+		SCHED_TEXT
+		LOCK_TEXT
+		KPROBES_TEXT
+		*(.fixup)
+		*(.gnu.warning)
+		_etext = .;
+	} = 0xd703d703
+
+	. = ALIGN(4);
+	__ex_table	: AT(ADDR(__ex_table) - LOAD_OFFSET) {
+		__start___ex_table = .;
+		*(__ex_table)
+		__stop___ex_table = .;
+	}
+
+	RODATA
+
+	. = ALIGN(8192);
+
+	.data		: AT(ADDR(.data) - LOAD_OFFSET) {
+		_data = .;
+		_sdata = .;
+		/*
+		 * First, the init task union, aligned to an 8K boundary.
+		 */
+		*(.data.init_task)
+
+		/* Then, the cacheline aligned data */
+		. = ALIGN(32);
+		*(.data.cacheline_aligned)
+
+		/* And the rest... */
+		*(.data.rel*)
+		*(.data)
+		CONSTRUCTORS
+
+		_edata = .;
+	}
+
+
+	. = ALIGN(8);
+	.bss    	: AT(ADDR(.bss) - LOAD_OFFSET) {
+		__bss_start = .;
+		*(.bss)
+		*(COMMON)
+		. = ALIGN(8);
+		__bss_stop = .;
+		_end = .;
+	}
+
+	/* When something in the kernel is NOT compiled as a module, the module
+	 * cleanup code and data are put into these segments. Both can then be
+	 * thrown away, as cleanup code is never called unless it's a module.
+	 */
+	/DISCARD/       	: {
+		*(.exit.text)
+		*(.exit.data)
+		*(.exitcall.exit)
+	}
+
+	DWARF_DEBUG
+}
diff --git a/arch/avr32/lib/Makefile b/arch/avr32/lib/Makefile
new file mode 100644
index 00000000000..09ac43e4052
--- /dev/null
+++ b/arch/avr32/lib/Makefile
@@ -0,0 +1,10 @@
+#
+# Makefile for AVR32-specific library files
+#
+
+lib-y	:= copy_user.o clear_user.o
+lib-y	+= strncpy_from_user.o strnlen_user.o
+lib-y	+= delay.o memset.o memcpy.o findbit.o
+lib-y	+= csum_partial.o csum_partial_copy_generic.o
+lib-y	+= io-readsw.o io-readsl.o io-writesw.o io-writesl.o
+lib-y	+= __avr32_lsl64.o __avr32_lsr64.o __avr32_asr64.o
diff --git a/arch/avr32/lib/__avr32_asr64.S b/arch/avr32/lib/__avr32_asr64.S
new file mode 100644
index 00000000000..368b6bca4c7
--- /dev/null
+++ b/arch/avr32/lib/__avr32_asr64.S
@@ -0,0 +1,31 @@
+/*
+ * Copyright (C) 2005-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+	/*
+	 * DWtype __avr32_asr64(DWtype u, word_type b)
+	 */
+	.text
+	.global	__avr32_asr64
+	.type	__avr32_asr64,@function
+__avr32_asr64:
+	cp.w	r12, 0
+	reteq	r12
+
+	rsub	r9, r12, 32
+	brle	1f
+
+	lsl	r8, r11, r9
+	lsr	r10, r10, r12
+	asr	r11, r11, r12
+	or	r10, r8
+	retal	r12
+
+1:	neg	r9
+	asr	r10, r11, r9
+	asr	r11, 31
+	retal	r12
diff --git a/arch/avr32/lib/__avr32_lsl64.S b/arch/avr32/lib/__avr32_lsl64.S
new file mode 100644
index 00000000000..f1dbc2b3625
--- /dev/null
+++ b/arch/avr32/lib/__avr32_lsl64.S
@@ -0,0 +1,31 @@
+/*
+ * Copyright (C) 2005-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+	/*
+	 * DWtype __avr32_lsl64(DWtype u, word_type b)
+	 */
+	.text
+	.global	__avr32_lsl64
+	.type	__avr32_lsl64,@function
+__avr32_lsl64:
+	cp.w	r12, 0
+	reteq	r12
+
+	rsub	r9, r12, 32
+	brle	1f
+
+	lsr	r8, r10, r9
+	lsl	r10, r10, r12
+	lsl	r11, r11, r12
+	or	r11, r8
+	retal	r12
+
+1:	neg	r9
+	lsl	r11, r10, r9
+	mov	r10, 0
+	retal	r12
diff --git a/arch/avr32/lib/__avr32_lsr64.S b/arch/avr32/lib/__avr32_lsr64.S
new file mode 100644
index 00000000000..e65bb7f0d24
--- /dev/null
+++ b/arch/avr32/lib/__avr32_lsr64.S
@@ -0,0 +1,31 @@
+/*
+ * Copyright (C) 2005-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+	/*
+	 * DWtype __avr32_lsr64(DWtype u, word_type b)
+	 */
+	.text
+	.global	__avr32_lsr64
+	.type	__avr32_lsr64,@function
+__avr32_lsr64:
+	cp.w	r12, 0
+	reteq	r12
+
+	rsub	r9, r12, 32
+	brle	1f
+
+	lsl	r8, r11, r9
+	lsr	r11, r11, r12
+	lsr	r10, r10, r12
+	or	r10, r8
+	retal	r12
+
+1:	neg	r9
+	lsr	r10, r11, r9
+	mov	r11, 0
+	retal	r12
diff --git a/arch/avr32/lib/clear_user.S b/arch/avr32/lib/clear_user.S
new file mode 100644
index 00000000000..d8991b6f8eb
--- /dev/null
+++ b/arch/avr32/lib/clear_user.S
@@ -0,0 +1,76 @@
+/*
+ * Copyright 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <asm/page.h>
+#include <asm/thread_info.h>
+#include <asm/asm.h>
+
+	.text
+	.align	1
+	.global	clear_user
+	.type	clear_user, "function"
+clear_user:
+	branch_if_kernel r8, __clear_user
+	ret_if_privileged r8, r12, r11, r11
+
+	.global	__clear_user
+	.type	__clear_user, "function"
+__clear_user:
+	mov	r9, r12
+	mov	r8, 0
+	andl	r9, 3, COH
+	brne	5f
+
+1:	sub	r11, 4
+	brlt	2f
+
+10:	st.w	r12++, r8
+	sub	r11, 4
+	brge	10b
+
+2:	sub	r11, -4
+	reteq	0
+
+	/* Unaligned count or address */
+	bld	r11, 1
+	brcc	12f
+11:	st.h	r12++, r8
+	sub	r11, 2
+	reteq	0
+12:	st.b	r12++, r8
+	retal	0
+
+	/* Unaligned address */
+5:	cp.w	r11, 4
+	brlt	2b
+
+	lsl	r9, 2
+	add	pc, pc, r9
+13:	st.b	r12++, r8
+	sub	r11, 1
+14:	st.b	r12++, r8
+	sub	r11, 1
+15:	st.b	r12++, r8
+	sub	r11, 1
+	rjmp	1b
+
+	.size	clear_user, . - clear_user
+	.size	__clear_user, . - __clear_user
+
+	.section .fixup, "ax"
+	.align	1
+18:	sub	r11, -4
+19:	retal	r11
+
+	.section __ex_table, "a"
+	.align	2
+	.long	10b, 18b
+	.long	11b, 19b
+	.long	12b, 19b
+	.long	13b, 19b
+	.long	14b, 19b
+	.long	15b, 19b
diff --git a/arch/avr32/lib/copy_user.S b/arch/avr32/lib/copy_user.S
new file mode 100644
index 00000000000..ea59c04b07d
--- /dev/null
+++ b/arch/avr32/lib/copy_user.S
@@ -0,0 +1,119 @@
+/*
+ * Copy to/from userspace with optional address space checking.
+ *
+ * Copyright 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <asm/page.h>
+#include <asm/thread_info.h>
+#include <asm/asm.h>
+
+	/*
+	 * __kernel_size_t
+	 * __copy_user(void *to, const void *from, __kernel_size_t n)
+	 *
+	 * Returns the number of bytes not copied. Might be off by
+	 * max 3 bytes if we get a fault in the main loop.
+	 *
+	 * The address-space checking functions simply fall through to
+	 * the non-checking version.
+	 */
+	.text
+	.align	1
+	.global	copy_from_user
+	.type	copy_from_user, @function
+copy_from_user:
+	branch_if_kernel r8, __copy_user
+	ret_if_privileged r8, r11, r10, r10
+	rjmp	__copy_user
+	.size	copy_from_user, . - copy_from_user
+
+	.global	copy_to_user
+	.type	copy_to_user, @function
+copy_to_user:
+	branch_if_kernel r8, __copy_user
+	ret_if_privileged r8, r12, r10, r10
+	.size	copy_to_user, . - copy_to_user
+
+	.global	__copy_user
+	.type	__copy_user, @function
+__copy_user:
+	mov	r9, r11
+	andl	r9, 3, COH
+	brne	6f
+
+	/* At this point, from is word-aligned */
+1:	sub	r10, 4
+	brlt	3f
+
+2:
+10:	ld.w	r8, r11++
+11:	st.w	r12++, r8
+	sub	r10, 4
+	brge	2b
+
+3:	sub	r10, -4
+	reteq	0
+
+	/*
+	 * Handle unaligned count. Need to be careful with r10 here so
+	 * that we return the correct value even if we get a fault
+	 */
+4:
+20:	ld.ub	r8, r11++
+21:	st.b	r12++, r8
+	sub	r10, 1
+	reteq	0
+22:	ld.ub	r8, r11++
+23:	st.b	r12++, r8
+	sub	r10, 1
+	reteq	0
+24:	ld.ub	r8, r11++
+25:	st.b	r12++, r8
+	retal	0
+
+	/* Handle unaligned from-pointer */
+6:	cp.w	r10, 4
+	brlt	4b
+	rsub	r9, r9, 4
+
+30:	ld.ub	r8, r11++
+31:	st.b	r12++, r8
+	sub	r10, 1
+	sub	r9, 1
+	breq	1b
+32:	ld.ub	r8, r11++
+33:	st.b	r12++, r8
+	sub	r10, 1
+	sub	r9, 1
+	breq	1b
+34:	ld.ub	r8, r11++
+35:	st.b	r12++, r8
+	sub	r10, 1
+	rjmp	1b
+	.size	__copy_user, . - __copy_user
+
+	.section .fixup,"ax"
+	.align	1
+19:	sub	r10, -4
+29:	retal	r10
+
+	.section __ex_table,"a"
+	.align	2
+	.long	10b, 19b
+	.long	11b, 19b
+	.long	20b, 29b
+	.long	21b, 29b
+	.long	22b, 29b
+	.long	23b, 29b
+	.long	24b, 29b
+	.long	25b, 29b
+	.long	30b, 29b
+	.long	31b, 29b
+	.long	32b, 29b
+	.long	33b, 29b
+	.long	34b, 29b
+	.long	35b, 29b
diff --git a/arch/avr32/lib/csum_partial.S b/arch/avr32/lib/csum_partial.S
new file mode 100644
index 00000000000..6a262b528eb
--- /dev/null
+++ b/arch/avr32/lib/csum_partial.S
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+	/*
+	 * unsigned int csum_partial(const unsigned char *buff,
+	 *			     int len, unsigned int sum)
+	 */
+	.text
+	.global	csum_partial
+	.type	csum_partial,"function"
+	.align	1
+csum_partial:
+	/* checksum complete words, aligned or not */
+3:	sub	r11, 4
+	brlt	5f
+4:	ld.w	r9, r12++
+	add	r10, r9
+	acr	r10
+	sub	r11, 4
+	brge	4b
+
+	/* return if we had a whole number of words */
+5:	sub	r11, -4
+	reteq	r10
+
+	/* checksum any remaining bytes at the end */
+	mov	r9, 0
+	mov	r8, 0
+	cp	r11, 2
+	brlt	6f
+	ld.uh	r9, r12++
+	sub	r11, 2
+	breq	7f
+	lsl	r9, 16
+6:	ld.ub	r8, r12++
+	lsl	r8, 8
+7:	or	r9, r8
+	add	r10, r9
+	acr	r10
+
+	retal	r10
+	.size	csum_partial, . - csum_partial
diff --git a/arch/avr32/lib/csum_partial_copy_generic.S b/arch/avr32/lib/csum_partial_copy_generic.S
new file mode 100644
index 00000000000..a3a0f9b8929
--- /dev/null
+++ b/arch/avr32/lib/csum_partial_copy_generic.S
@@ -0,0 +1,99 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <asm/errno.h>
+#include <asm/asm.h>
+
+	/*
+	 * unsigned int csum_partial_copy_generic(const char *src, char *dst, int len
+	 *					  int sum, int *src_err_ptr,
+	 *					  int *dst_err_ptr)
+	 *
+	 * Copy src to dst while checksumming, otherwise like csum_partial.
+	 */
+
+	.macro ld_src size, reg, ptr
+9999:	ld.\size \reg, \ptr
+	.section __ex_table, "a"
+	.long	9999b, fixup_ld_src
+	.previous
+	.endm
+
+	.macro st_dst size, ptr, reg
+9999:	st.\size \ptr, \reg
+	.section __ex_table, "a"
+	.long	9999b, fixup_st_dst
+	.previous
+	.endm
+
+	.text
+	.global	csum_partial_copy_generic
+	.type	csum_partial_copy_generic,"function"
+	.align	1
+csum_partial_copy_generic:
+	pushm	r4-r7,lr
+
+	/* The inner loop */
+1:	sub	r10, 4
+	brlt	5f
+2:	ld_src	w, r5, r12++
+	st_dst	w, r11++, r5
+	add	r9, r5
+	acr	r9
+	sub	r10, 4
+	brge	2b
+
+	/* return if we had a whole number of words */
+5:	sub	r10, -4
+	brne	7f
+
+6:	mov	r12, r9
+	popm	r4-r7,pc
+
+	/* handle additional bytes at the tail */
+7:	mov	r5, 0
+	mov	r4, 32
+8:	ld_src	ub, r6, r12++
+	st_dst	b, r11++, r6
+	lsl	r5, 8
+	sub	r4, 8
+	bfins	r5, r6, 0, 8
+	sub	r10, 1
+	brne	8b
+
+	lsl	r5, r5, r4
+	add	r9, r5
+	acr	r9
+	rjmp	6b
+
+	/* Exception handler */
+	.section .fixup,"ax"
+	.align	1
+fixup_ld_src:
+	mov	r9, -EFAULT
+	cp.w	r8, 0
+	breq	1f
+	st.w	r8[0], r9
+
+1:	/*
+	 * TODO: zero the complete destination - computing the rest
+	 * is too much work
+	 */
+
+	mov	r9, 0
+	rjmp	6b
+
+fixup_st_dst:
+	mov	r9, -EFAULT
+	lddsp	r8, sp[20]
+	cp.w	r8, 0
+	breq	1f
+	st.w	r8[0], r9
+1:	mov	r9, 0
+	rjmp	6b
+
+	.previous
diff --git a/arch/avr32/lib/delay.c b/arch/avr32/lib/delay.c
new file mode 100644
index 00000000000..462c8307b68
--- /dev/null
+++ b/arch/avr32/lib/delay.c
@@ -0,0 +1,55 @@
+/*
+ *      Precise Delay Loops for avr32
+ *
+ *      Copyright (C) 1993 Linus Torvalds
+ *      Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
+ *	Copyright (C) 2005-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+#include <asm/delay.h>
+#include <asm/processor.h>
+#include <asm/sysreg.h>
+
+int read_current_timer(unsigned long *timer_value)
+{
+	*timer_value = sysreg_read(COUNT);
+	return 0;
+}
+
+void __delay(unsigned long loops)
+{
+	unsigned bclock, now;
+
+	bclock = sysreg_read(COUNT);
+	do {
+		now = sysreg_read(COUNT);
+	} while ((now - bclock) < loops);
+}
+
+inline void __const_udelay(unsigned long xloops)
+{
+	unsigned long long loops;
+
+	asm("mulu.d %0, %1, %2"
+	    : "=r"(loops)
+	    : "r"(current_cpu_data.loops_per_jiffy * HZ), "r"(xloops));
+	__delay(loops >> 32);
+}
+
+void __udelay(unsigned long usecs)
+{
+	__const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */
+}
+
+void __ndelay(unsigned long nsecs)
+{
+	__const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */
+}
diff --git a/arch/avr32/lib/findbit.S b/arch/avr32/lib/findbit.S
new file mode 100644
index 00000000000..2b4856f4bf7
--- /dev/null
+++ b/arch/avr32/lib/findbit.S
@@ -0,0 +1,154 @@
+/*
+ * Copyright (C) 2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/linkage.h>
+
+	.text
+	/*
+	 * unsigned long find_first_zero_bit(const unsigned long *addr,
+	 *				     unsigned long size)
+	 */
+ENTRY(find_first_zero_bit)
+	cp.w	r11, 0
+	reteq	r11
+	mov	r9, r11
+1:	ld.w	r8, r12[0]
+	com	r8
+	brne	.L_found
+	sub	r12, -4
+	sub	r9, 32
+	brgt	1b
+	retal	r11
+
+	/*
+	 * unsigned long find_next_zero_bit(const unsigned long *addr,
+	 *				    unsigned long size,
+	 *				    unsigned long offset)
+	 */
+ENTRY(find_next_zero_bit)
+	lsr	r8, r10, 5
+	sub	r9, r11, r10
+	retle	r11
+
+	lsl	r8, 2
+	add	r12, r8
+	andl	r10, 31, COH
+	breq	1f
+
+	/* offset is not word-aligned. Handle the first (32 - r10) bits */
+	ld.w	r8, r12[0]
+	com	r8
+	sub	r12, -4
+	lsr	r8, r8, r10
+	brne	.L_found
+
+	/* r9 = r9 - (32 - r10) = r9 + r10 - 32 */
+	add	r9, r10
+	sub	r9, 32
+	retle	r11
+
+	/* Main loop. offset must be word-aligned */
+1:	ld.w	r8, r12[0]
+	com	r8
+	brne	.L_found
+	sub	r12, -4
+	sub	r9, 32
+	brgt	1b
+	retal	r11
+
+	/* Common return path for when a bit is actually found. */
+.L_found:
+	brev	r8
+	clz	r10, r8
+	rsub	r9, r11
+	add	r10, r9
+
+	/* XXX: If we don't have to return exactly "size" when the bit
+	   is not found, we may drop this "min" thing */
+	min	r12, r11, r10
+	retal	r12
+
+	/*
+	 * unsigned long find_first_bit(const unsigned long *addr,
+	 *				unsigned long size)
+	 */
+ENTRY(find_first_bit)
+	cp.w	r11, 0
+	reteq	r11
+	mov	r9, r11
+1:	ld.w	r8, r12[0]
+	cp.w	r8, 0
+	brne	.L_found
+	sub	r12, -4
+	sub	r9, 32
+	brgt	1b
+	retal	r11
+
+	/*
+	 * unsigned long find_next_bit(const unsigned long *addr,
+	 *			       unsigned long size,
+	 *			       unsigned long offset)
+	 */
+ENTRY(find_next_bit)
+	lsr	r8, r10, 5
+	sub	r9, r11, r10
+	retle	r11
+
+	lsl	r8, 2
+	add	r12, r8
+	andl	r10, 31, COH
+	breq	1f
+
+	/* offset is not word-aligned. Handle the first (32 - r10) bits */
+	ld.w	r8, r12[0]
+	sub	r12, -4
+	lsr	r8, r8, r10
+	brne	.L_found
+
+	/* r9 = r9 - (32 - r10) = r9 + r10 - 32 */
+	add	r9, r10
+	sub	r9, 32
+	retle	r11
+
+	/* Main loop. offset must be word-aligned */
+1:	ld.w	r8, r12[0]
+	cp.w	r8, 0
+	brne	.L_found
+	sub	r12, -4
+	sub	r9, 32
+	brgt	1b
+	retal	r11
+
+ENTRY(generic_find_next_zero_le_bit)
+	lsr	r8, r10, 5
+	sub	r9, r11, r10
+	retle	r11
+
+	lsl	r8, 2
+	add	r12, r8
+	andl	r10, 31, COH
+	breq	1f
+
+	/* offset is not word-aligned. Handle the first (32 - r10) bits */
+	ldswp.w	r8, r12[0]
+	sub	r12, -4
+	lsr	r8, r8, r10
+	brne	.L_found
+
+	/* r9 = r9 - (32 - r10) = r9 + r10 - 32 */
+	add	r9, r10
+	sub	r9, 32
+	retle	r11
+
+	/* Main loop. offset must be word-aligned */
+1:	ldswp.w	r8, r12[0]
+	cp.w	r8, 0
+	brne	.L_found
+	sub	r12, -4
+	sub	r9, 32
+	brgt	1b
+	retal	r11
diff --git a/arch/avr32/lib/io-readsl.S b/arch/avr32/lib/io-readsl.S
new file mode 100644
index 00000000000..b103511ed6c
--- /dev/null
+++ b/arch/avr32/lib/io-readsl.S
@@ -0,0 +1,24 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+	.global	__raw_readsl
+	.type	__raw_readsl,@function
+__raw_readsl:
+	cp.w	r10, 0
+	reteq	r12
+
+	/*
+	 * If r11 isn't properly aligned, we might get an exception on
+	 * some implementations. But there's not much we can do about it.
+	 */
+1:	ld.w	r8, r12[0]
+	sub	r10, 1
+	st.w	r11++, r8
+	brne	1b
+
+	retal	r12
diff --git a/arch/avr32/lib/io-readsw.S b/arch/avr32/lib/io-readsw.S
new file mode 100644
index 00000000000..456be990902
--- /dev/null
+++ b/arch/avr32/lib/io-readsw.S
@@ -0,0 +1,43 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+.Lnot_word_aligned:
+	/*
+	 * Bad alignment will cause a hardware exception, which is as
+	 * good as anything. No need for us to check for proper alignment.
+	 */
+	ld.uh	r8, r12[0]
+	sub	r10, 1
+	st.h	r11++, r8
+
+	/* fall through */
+
+	.global	__raw_readsw
+	.type	__raw_readsw,@function
+__raw_readsw:
+	cp.w	r10, 0
+	reteq	r12
+	mov	r9, 3
+	tst	r11, r9
+	brne	.Lnot_word_aligned
+
+	sub	r10, 2
+	brlt	2f
+
+1:	ldins.h	r8:t, r12[0]
+	ldins.h	r8:b, r12[0]
+	st.w	r11++, r8
+	sub	r10, 2
+	brge	1b
+
+2:	sub	r10, -2
+	reteq	r12
+
+	ld.uh	r8, r12[0]
+	st.h	r11++, r8
+	retal	r12
diff --git a/arch/avr32/lib/io-writesl.S b/arch/avr32/lib/io-writesl.S
new file mode 100644
index 00000000000..22138b3a16e
--- /dev/null
+++ b/arch/avr32/lib/io-writesl.S
@@ -0,0 +1,20 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+	.global	__raw_writesl
+	.type	__raw_writesl,@function
+__raw_writesl:
+	cp.w	r10, 0
+	reteq	r12
+
+1:	ld.w	r8, r11++
+	sub	r10, 1
+	st.w	r12[0], r8
+	brne	1b
+
+	retal	r12
diff --git a/arch/avr32/lib/io-writesw.S b/arch/avr32/lib/io-writesw.S
new file mode 100644
index 00000000000..8c4a53f1c52
--- /dev/null
+++ b/arch/avr32/lib/io-writesw.S
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+.Lnot_word_aligned:
+	ld.uh	r8, r11++
+	sub	r10, 1
+	st.h	r12[0], r8
+
+	.global	__raw_writesw
+	.type	__raw_writesw,@function
+__raw_writesw:
+	cp.w	r10, 0
+	mov	r9, 3
+	reteq	r12
+	tst	r11, r9
+	brne	.Lnot_word_aligned
+
+	sub	r10, 2
+	brlt	2f
+
+1:	ld.w	r8, r11++
+	bfextu	r9, r8, 16, 16
+	st.h	r12[0], r9
+	st.h	r12[0], r8
+	sub	r10, 2
+	brge	1b
+
+2:	sub	r10, -2
+	reteq	r12
+
+	ld.uh	r8, r11++
+	st.h	r12[0], r8
+	retal	r12
diff --git a/arch/avr32/lib/libgcc.h b/arch/avr32/lib/libgcc.h
new file mode 100644
index 00000000000..5a091b5e361
--- /dev/null
+++ b/arch/avr32/lib/libgcc.h
@@ -0,0 +1,33 @@
+/* Definitions for various functions 'borrowed' from gcc-3.4.3 */
+
+#define BITS_PER_UNIT	8
+
+typedef		 int QItype	__attribute__ ((mode (QI)));
+typedef unsigned int UQItype	__attribute__ ((mode (QI)));
+typedef		 int HItype	__attribute__ ((mode (HI)));
+typedef unsigned int UHItype	__attribute__ ((mode (HI)));
+typedef 	 int SItype	__attribute__ ((mode (SI)));
+typedef unsigned int USItype	__attribute__ ((mode (SI)));
+typedef		 int DItype	__attribute__ ((mode (DI)));
+typedef unsigned int UDItype	__attribute__ ((mode (DI)));
+typedef 	float SFtype	__attribute__ ((mode (SF)));
+typedef		float DFtype	__attribute__ ((mode (DF)));
+typedef int word_type __attribute__ ((mode (__word__)));
+
+#define W_TYPE_SIZE (4 * BITS_PER_UNIT)
+#define Wtype	SItype
+#define UWtype	USItype
+#define HWtype	SItype
+#define UHWtype	USItype
+#define DWtype	DItype
+#define UDWtype	UDItype
+#define __NW(a,b)	__ ## a ## si ## b
+#define __NDW(a,b)	__ ## a ## di ## b
+
+struct DWstruct {Wtype high, low;};
+
+typedef union
+{
+  struct DWstruct s;
+  DWtype ll;
+} DWunion;
diff --git a/arch/avr32/lib/longlong.h b/arch/avr32/lib/longlong.h
new file mode 100644
index 00000000000..cd5e369ac43
--- /dev/null
+++ b/arch/avr32/lib/longlong.h
@@ -0,0 +1,98 @@
+/* longlong.h -- definitions for mixed size 32/64 bit arithmetic.
+   Copyright (C) 1991, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000
+   Free Software Foundation, Inc.
+
+   This definition file is free software; you can redistribute it
+   and/or modify it under the terms of the GNU General Public
+   License as published by the Free Software Foundation; either
+   version 2, or (at your option) any later version.
+
+   This definition file is distributed in the hope that it will be
+   useful, but WITHOUT ANY WARRANTY; without even the implied
+   warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+   See the GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place - Suite 330,
+   Boston, MA 02111-1307, USA.  */
+
+/* Borrowed from gcc-3.4.3 */
+
+#define __BITS4 (W_TYPE_SIZE / 4)
+#define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
+#define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
+#define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))
+
+#define count_leading_zeros(count, x) ((count) = __builtin_clz(x))
+
+#define __udiv_qrnnd_c(q, r, n1, n0, d) \
+  do {									\
+    UWtype __d1, __d0, __q1, __q0;					\
+    UWtype __r1, __r0, __m;						\
+    __d1 = __ll_highpart (d);						\
+    __d0 = __ll_lowpart (d);						\
+									\
+    __r1 = (n1) % __d1;							\
+    __q1 = (n1) / __d1;							\
+    __m = (UWtype) __q1 * __d0;						\
+    __r1 = __r1 * __ll_B | __ll_highpart (n0);				\
+    if (__r1 < __m)							\
+      {									\
+	__q1--, __r1 += (d);						\
+	if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\
+	  if (__r1 < __m)						\
+	    __q1--, __r1 += (d);					\
+      }									\
+    __r1 -= __m;							\
+									\
+    __r0 = __r1 % __d1;							\
+    __q0 = __r1 / __d1;							\
+    __m = (UWtype) __q0 * __d0;						\
+    __r0 = __r0 * __ll_B | __ll_lowpart (n0);				\
+    if (__r0 < __m)							\
+      {									\
+	__q0--, __r0 += (d);						\
+	if (__r0 >= (d))						\
+	  if (__r0 < __m)						\
+	    __q0--, __r0 += (d);					\
+      }									\
+    __r0 -= __m;							\
+									\
+    (q) = (UWtype) __q1 * __ll_B | __q0;				\
+    (r) = __r0;								\
+  } while (0)
+
+#define udiv_qrnnd __udiv_qrnnd_c
+
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  do {									\
+    UWtype __x;								\
+    __x = (al) - (bl);							\
+    (sh) = (ah) - (bh) - (__x > (al));					\
+    (sl) = __x;								\
+  } while (0)
+
+#define umul_ppmm(w1, w0, u, v)						\
+  do {									\
+    UWtype __x0, __x1, __x2, __x3;					\
+    UHWtype __ul, __vl, __uh, __vh;					\
+									\
+    __ul = __ll_lowpart (u);						\
+    __uh = __ll_highpart (u);						\
+    __vl = __ll_lowpart (v);						\
+    __vh = __ll_highpart (v);						\
+									\
+    __x0 = (UWtype) __ul * __vl;					\
+    __x1 = (UWtype) __ul * __vh;					\
+    __x2 = (UWtype) __uh * __vl;					\
+    __x3 = (UWtype) __uh * __vh;					\
+									\
+    __x1 += __ll_highpart (__x0);/* this can't give carry */		\
+    __x1 += __x2;		/* but this indeed can */		\
+    if (__x1 < __x2)		/* did we get it? */			\
+      __x3 += __ll_B;		/* yes, add it in the proper pos.  */	\
+									\
+    (w1) = __x3 + __ll_highpart (__x1);					\
+    (w0) = __ll_lowpart (__x1) * __ll_B + __ll_lowpart (__x0);		\
+  } while (0)
diff --git a/arch/avr32/lib/memcpy.S b/arch/avr32/lib/memcpy.S
new file mode 100644
index 00000000000..0abb26142b6
--- /dev/null
+++ b/arch/avr32/lib/memcpy.S
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+	/*
+	 * void *memcpy(void *to, const void *from, unsigned long n)
+	 *
+	 * This implementation does word-aligned loads in the main loop,
+	 * possibly sacrificing alignment of stores.
+	 *
+	 * Hopefully, in most cases, both "to" and "from" will be
+	 * word-aligned to begin with.
+	 */
+	.text
+	.global	memcpy
+	.type	memcpy, @function
+memcpy:
+	mov	r9, r11
+	andl	r9, 3, COH
+	brne	1f
+
+	/* At this point, "from" is word-aligned */
+2:	sub	r10, 4
+	mov	r9, r12
+	brlt	4f
+
+3:	ld.w	r8, r11++
+	sub	r10, 4
+	st.w	r12++, r8
+	brge	3b
+
+4:	neg	r10
+	reteq	r9
+
+	/* Handle unaligned count */
+	lsl	r10, 2
+	add	pc, pc, r10
+	ld.ub	r8, r11++
+	st.b	r12++, r8
+	ld.ub	r8, r11++
+	st.b	r12++, r8
+	ld.ub	r8, r11++
+	st.b	r12++, r8
+	retal	r9
+
+	/* Handle unaligned "from" pointer */
+1:	sub	r10, 4
+	brlt	4b
+	add	r10, r9
+	lsl	r9, 2
+	add	pc, pc, r9
+	ld.ub	r8, r11++
+	st.b	r12++, r8
+	ld.ub	r8, r11++
+	st.b	r12++, r8
+	ld.ub	r8, r11++
+	st.b	r12++, r8
+	rjmp	2b
diff --git a/arch/avr32/lib/memset.S b/arch/avr32/lib/memset.S
new file mode 100644
index 00000000000..40da32c0480
--- /dev/null
+++ b/arch/avr32/lib/memset.S
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * Based on linux/arch/arm/lib/memset.S
+ *   Copyright (C) 1995-2000 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * ASM optimised string functions
+ */
+#include <asm/asm.h>
+
+	/*
+	 * r12:	void *b
+	 * r11:	int c
+	 * r10:	size_t len
+	 *
+	 * Returns b in r12
+	 */
+	.text
+	.global	memset
+	.type	memset, @function
+	.align	5
+memset:
+	mov	r9, r12
+	mov	r8, r12
+	or	r11, r11, r11 << 8
+	andl	r9, 3, COH
+	brne	1f
+
+2:	or	r11, r11, r11 << 16
+	sub	r10, 4
+	brlt	5f
+
+	/* Let's do some real work */
+4:	st.w	r8++, r11
+	sub	r10, 4
+	brge	4b
+
+	/*
+	 * When we get here, we've got less than 4 bytes to set. r10
+	 * might be negative.
+	 */
+5:	sub	r10, -4
+	reteq	r12
+
+	/* Fastpath ends here, exactly 32 bytes from memset */
+
+	/* Handle unaligned count or pointer */
+	bld	r10, 1
+	brcc	6f
+	st.b	r8++, r11
+	st.b	r8++, r11
+	bld	r10, 0
+	retcc	r12
+6:	st.b	r8++, r11
+	retal	r12
+
+	/* Handle unaligned pointer */
+1:	sub	r10, 4
+	brlt	5b
+	add	r10, r9
+	lsl	r9, 1
+	add	pc, r9
+	st.b	r8++, r11
+	st.b	r8++, r11
+	st.b	r8++, r11
+	rjmp	2b
+
+	.size	memset, . - memset
diff --git a/arch/avr32/lib/strncpy_from_user.S b/arch/avr32/lib/strncpy_from_user.S
new file mode 100644
index 00000000000..72bd50599ec
--- /dev/null
+++ b/arch/avr32/lib/strncpy_from_user.S
@@ -0,0 +1,60 @@
+/*
+ * Copy to/from userspace with optional address space checking.
+ *
+ * Copyright 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/errno.h>
+
+#include <asm/page.h>
+#include <asm/thread_info.h>
+#include <asm/asm.h>
+
+	/*
+	 * long strncpy_from_user(char *dst, const char *src, long count)
+	 *
+	 * On success, returns the length of the string, not including
+	 * the terminating NUL.
+	 *
+	 * If the string is longer than count, returns count
+	 *
+	 * If userspace access fails, returns -EFAULT
+	 */
+	.text
+	.align	1
+	.global	strncpy_from_user
+	.type	strncpy_from_user, "function"
+strncpy_from_user:
+	mov	r9, -EFAULT
+	branch_if_kernel r8, __strncpy_from_user
+	ret_if_privileged r8, r11, r10, r9
+
+	.global	__strncpy_from_user
+	.type	__strncpy_from_user, "function"
+__strncpy_from_user:
+	cp.w	r10, 0
+	reteq	0
+
+	mov	r9, r10
+
+1:	ld.ub	r8, r11++
+	st.b	r12++, r8
+	cp.w	r8, 0
+	breq	2f
+	sub	r9, 1
+	brne	1b
+
+2:	sub	r10, r9
+	retal	r10
+
+	.section .fixup, "ax"
+	.align	1
+3:	mov	r12, -EFAULT
+	retal	r12
+
+	.section __ex_table, "a"
+	.align	2
+	.long	1b, 3b
diff --git a/arch/avr32/lib/strnlen_user.S b/arch/avr32/lib/strnlen_user.S
new file mode 100644
index 00000000000..65ce11afa66
--- /dev/null
+++ b/arch/avr32/lib/strnlen_user.S
@@ -0,0 +1,67 @@
+/*
+ * Copy to/from userspace with optional address space checking.
+ *
+ * Copyright 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <asm/page.h>
+#include <asm/thread_info.h>
+#include <asm/processor.h>
+#include <asm/asm.h>
+
+	.text
+	.align	1
+	.global	strnlen_user
+	.type	strnlen_user, "function"
+strnlen_user:
+	branch_if_kernel r8, __strnlen_user
+	sub	r8, r11, 1
+	add	r8, r12
+	retcs	0
+	brmi	adjust_length	/* do a closer inspection */
+
+	.global	__strnlen_user
+	.type	__strnlen_user, "function"
+__strnlen_user:
+	mov	r10, r12
+
+10:	ld.ub	r8, r12++
+	cp.w	r8, 0
+	breq	2f
+	sub	r11, 1
+	brne	10b
+
+	sub	r12, -1
+2:	sub	r12, r10
+	retal	r12
+
+
+	.type	adjust_length, "function"
+adjust_length:
+	cp.w	r12, 0		/* addr must always be < TASK_SIZE */
+	retmi	0
+
+	pushm	lr
+	lddpc	lr, _task_size
+	sub	r11, lr, r12
+	mov	r9, r11
+	rcall	__strnlen_user
+	cp.w	r12, r9
+	brgt	1f
+	popm	pc
+1:	popm	pc, r12=0
+
+	.align	2
+_task_size:
+	.long	TASK_SIZE
+
+	.section .fixup, "ax"
+	.align	1
+19:	retal	0
+
+	.section __ex_table, "a"
+	.align	2
+	.long	10b, 19b
diff --git a/arch/avr32/mach-at32ap/Makefile b/arch/avr32/mach-at32ap/Makefile
new file mode 100644
index 00000000000..4b10853eb61
--- /dev/null
+++ b/arch/avr32/mach-at32ap/Makefile
@@ -0,0 +1,2 @@
+obj-y				+= at32ap.o clock.o pio.o intc.o extint.o
+obj-$(CONFIG_CPU_AT32AP7000)	+= at32ap7000.o
diff --git a/arch/avr32/mach-at32ap/at32ap.c b/arch/avr32/mach-at32ap/at32ap.c
new file mode 100644
index 00000000000..f7cedf5aabe
--- /dev/null
+++ b/arch/avr32/mach-at32ap/at32ap.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright (C) 2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/clk.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/platform_device.h>
+
+#include <asm/io.h>
+
+#include <asm/arch/init.h>
+#include <asm/arch/sm.h>
+
+struct at32_sm system_manager;
+
+static int __init at32_sm_init(void)
+{
+	struct resource *regs;
+	struct at32_sm *sm = &system_manager;
+	int ret = -ENXIO;
+
+	regs = platform_get_resource(&at32_sm_device, IORESOURCE_MEM, 0);
+	if (!regs)
+		goto fail;
+
+	spin_lock_init(&sm->lock);
+	sm->pdev = &at32_sm_device;
+
+	ret = -ENOMEM;
+	sm->regs = ioremap(regs->start, regs->end - regs->start + 1);
+	if (!sm->regs)
+		goto fail;
+
+	return 0;
+
+fail:
+	printk(KERN_ERR "Failed to initialize System Manager: %d\n", ret);
+	return ret;
+}
+
+void __init setup_platform(void)
+{
+	at32_sm_init();
+	at32_clock_init();
+	at32_portmux_init();
+
+	/* FIXME: This doesn't belong here */
+	at32_setup_serial_console(1);
+}
+
+static int __init pdc_probe(struct platform_device *pdev)
+{
+	struct clk *pclk, *hclk;
+
+	pclk = clk_get(&pdev->dev, "pclk");
+	if (IS_ERR(pclk)) {
+		dev_err(&pdev->dev, "no pclk defined\n");
+		return PTR_ERR(pclk);
+	}
+	hclk = clk_get(&pdev->dev, "hclk");
+	if (IS_ERR(hclk)) {
+		dev_err(&pdev->dev, "no hclk defined\n");
+		clk_put(pclk);
+		return PTR_ERR(hclk);
+	}
+
+	clk_enable(pclk);
+	clk_enable(hclk);
+
+	dev_info(&pdev->dev, "Atmel Peripheral DMA Controller enabled\n");
+	return 0;
+}
+
+static struct platform_driver pdc_driver = {
+	.probe		= pdc_probe,
+	.driver		= {
+		.name	= "pdc",
+	},
+};
+
+static int __init pdc_init(void)
+{
+	return platform_driver_register(&pdc_driver);
+}
+arch_initcall(pdc_init);
diff --git a/arch/avr32/mach-at32ap/at32ap7000.c b/arch/avr32/mach-at32ap/at32ap7000.c
new file mode 100644
index 00000000000..e8c6893a1c2
--- /dev/null
+++ b/arch/avr32/mach-at32ap/at32ap7000.c
@@ -0,0 +1,866 @@
+/*
+ * Copyright (C) 2005-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/clk.h>
+#include <linux/init.h>
+#include <linux/platform_device.h>
+
+#include <asm/io.h>
+
+#include <asm/arch/board.h>
+#include <asm/arch/portmux.h>
+#include <asm/arch/sm.h>
+
+#include "clock.h"
+#include "pio.h"
+#include "sm.h"
+
+#define PBMEM(base)					\
+	{						\
+		.start		= base,			\
+		.end		= base + 0x3ff,		\
+		.flags		= IORESOURCE_MEM,	\
+	}
+#define IRQ(num)					\
+	{						\
+		.start		= num,			\
+		.end		= num,			\
+		.flags		= IORESOURCE_IRQ,	\
+	}
+#define NAMED_IRQ(num, _name)				\
+	{						\
+		.start		= num,			\
+		.end		= num,			\
+		.name		= _name,		\
+		.flags		= IORESOURCE_IRQ,	\
+	}
+
+#define DEFINE_DEV(_name, _id)					\
+static struct platform_device _name##_id##_device = {		\
+	.name		= #_name,				\
+	.id		= _id,					\
+	.resource	= _name##_id##_resource,		\
+	.num_resources	= ARRAY_SIZE(_name##_id##_resource),	\
+}
+#define DEFINE_DEV_DATA(_name, _id)				\
+static struct platform_device _name##_id##_device = {		\
+	.name		= #_name,				\
+	.id		= _id,					\
+	.dev		= {					\
+		.platform_data	= &_name##_id##_data,		\
+	},							\
+	.resource	= _name##_id##_resource,		\
+	.num_resources	= ARRAY_SIZE(_name##_id##_resource),	\
+}
+
+#define DEV_CLK(_name, devname, bus, _index)			\
+static struct clk devname##_##_name = {				\
+	.name		= #_name,				\
+	.dev		= &devname##_device.dev,		\
+	.parent		= &bus##_clk,				\
+	.mode		= bus##_clk_mode,			\
+	.get_rate	= bus##_clk_get_rate,			\
+	.index		= _index,				\
+}
+
+enum {
+	PIOA,
+	PIOB,
+	PIOC,
+	PIOD,
+};
+
+enum {
+	FUNC_A,
+	FUNC_B,
+};
+
+unsigned long at32ap7000_osc_rates[3] = {
+	[0] = 32768,
+	/* FIXME: these are ATSTK1002-specific */
+	[1] = 20000000,
+	[2] = 12000000,
+};
+
+static unsigned long osc_get_rate(struct clk *clk)
+{
+	return at32ap7000_osc_rates[clk->index];
+}
+
+static unsigned long pll_get_rate(struct clk *clk, unsigned long control)
+{
+	unsigned long div, mul, rate;
+
+	if (!(control & SM_BIT(PLLEN)))
+		return 0;
+
+	div = SM_BFEXT(PLLDIV, control) + 1;
+	mul = SM_BFEXT(PLLMUL, control) + 1;
+
+	rate = clk->parent->get_rate(clk->parent);
+	rate = (rate + div / 2) / div;
+	rate *= mul;
+
+	return rate;
+}
+
+static unsigned long pll0_get_rate(struct clk *clk)
+{
+	u32 control;
+
+	control = sm_readl(&system_manager, PM_PLL0);
+
+	return pll_get_rate(clk, control);
+}
+
+static unsigned long pll1_get_rate(struct clk *clk)
+{
+	u32 control;
+
+	control = sm_readl(&system_manager, PM_PLL1);
+
+	return pll_get_rate(clk, control);
+}
+
+/*
+ * The AT32AP7000 has five primary clock sources: One 32kHz
+ * oscillator, two crystal oscillators and two PLLs.
+ */
+static struct clk osc32k = {
+	.name		= "osc32k",
+	.get_rate	= osc_get_rate,
+	.users		= 1,
+	.index		= 0,
+};
+static struct clk osc0 = {
+	.name		= "osc0",
+	.get_rate	= osc_get_rate,
+	.users		= 1,
+	.index		= 1,
+};
+static struct clk osc1 = {
+	.name		= "osc1",
+	.get_rate	= osc_get_rate,
+	.index		= 2,
+};
+static struct clk pll0 = {
+	.name		= "pll0",
+	.get_rate	= pll0_get_rate,
+	.parent		= &osc0,
+};
+static struct clk pll1 = {
+	.name		= "pll1",
+	.get_rate	= pll1_get_rate,
+	.parent		= &osc0,
+};
+
+/*
+ * The main clock can be either osc0 or pll0.  The boot loader may
+ * have chosen one for us, so we don't really know which one until we
+ * have a look at the SM.
+ */
+static struct clk *main_clock;
+
+/*
+ * Synchronous clocks are generated from the main clock. The clocks
+ * must satisfy the constraint
+ *   fCPU >= fHSB >= fPB
+ * i.e. each clock must not be faster than its parent.
+ */
+static unsigned long bus_clk_get_rate(struct clk *clk, unsigned int shift)
+{
+	return main_clock->get_rate(main_clock) >> shift;
+};
+
+static void cpu_clk_mode(struct clk *clk, int enabled)
+{
+	struct at32_sm *sm = &system_manager;
+	unsigned long flags;
+	u32 mask;
+
+	spin_lock_irqsave(&sm->lock, flags);
+	mask = sm_readl(sm, PM_CPU_MASK);
+	if (enabled)
+		mask |= 1 << clk->index;
+	else
+		mask &= ~(1 << clk->index);
+	sm_writel(sm, PM_CPU_MASK, mask);
+	spin_unlock_irqrestore(&sm->lock, flags);
+}
+
+static unsigned long cpu_clk_get_rate(struct clk *clk)
+{
+	unsigned long cksel, shift = 0;
+
+	cksel = sm_readl(&system_manager, PM_CKSEL);
+	if (cksel & SM_BIT(CPUDIV))
+		shift = SM_BFEXT(CPUSEL, cksel) + 1;
+
+	return bus_clk_get_rate(clk, shift);
+}
+
+static void hsb_clk_mode(struct clk *clk, int enabled)
+{
+	struct at32_sm *sm = &system_manager;
+	unsigned long flags;
+	u32 mask;
+
+	spin_lock_irqsave(&sm->lock, flags);
+	mask = sm_readl(sm, PM_HSB_MASK);
+	if (enabled)
+		mask |= 1 << clk->index;
+	else
+		mask &= ~(1 << clk->index);
+	sm_writel(sm, PM_HSB_MASK, mask);
+	spin_unlock_irqrestore(&sm->lock, flags);
+}
+
+static unsigned long hsb_clk_get_rate(struct clk *clk)
+{
+	unsigned long cksel, shift = 0;
+
+	cksel = sm_readl(&system_manager, PM_CKSEL);
+	if (cksel & SM_BIT(HSBDIV))
+		shift = SM_BFEXT(HSBSEL, cksel) + 1;
+
+	return bus_clk_get_rate(clk, shift);
+}
+
+static void pba_clk_mode(struct clk *clk, int enabled)
+{
+	struct at32_sm *sm = &system_manager;
+	unsigned long flags;
+	u32 mask;
+
+	spin_lock_irqsave(&sm->lock, flags);
+	mask = sm_readl(sm, PM_PBA_MASK);
+	if (enabled)
+		mask |= 1 << clk->index;
+	else
+		mask &= ~(1 << clk->index);
+	sm_writel(sm, PM_PBA_MASK, mask);
+	spin_unlock_irqrestore(&sm->lock, flags);
+}
+
+static unsigned long pba_clk_get_rate(struct clk *clk)
+{
+	unsigned long cksel, shift = 0;
+
+	cksel = sm_readl(&system_manager, PM_CKSEL);
+	if (cksel & SM_BIT(PBADIV))
+		shift = SM_BFEXT(PBASEL, cksel) + 1;
+
+	return bus_clk_get_rate(clk, shift);
+}
+
+static void pbb_clk_mode(struct clk *clk, int enabled)
+{
+	struct at32_sm *sm = &system_manager;
+	unsigned long flags;
+	u32 mask;
+
+	spin_lock_irqsave(&sm->lock, flags);
+	mask = sm_readl(sm, PM_PBB_MASK);
+	if (enabled)
+		mask |= 1 << clk->index;
+	else
+		mask &= ~(1 << clk->index);
+	sm_writel(sm, PM_PBB_MASK, mask);
+	spin_unlock_irqrestore(&sm->lock, flags);
+}
+
+static unsigned long pbb_clk_get_rate(struct clk *clk)
+{
+	unsigned long cksel, shift = 0;
+
+	cksel = sm_readl(&system_manager, PM_CKSEL);
+	if (cksel & SM_BIT(PBBDIV))
+		shift = SM_BFEXT(PBBSEL, cksel) + 1;
+
+	return bus_clk_get_rate(clk, shift);
+}
+
+static struct clk cpu_clk = {
+	.name		= "cpu",
+	.get_rate	= cpu_clk_get_rate,
+	.users		= 1,
+};
+static struct clk hsb_clk = {
+	.name		= "hsb",
+	.parent		= &cpu_clk,
+	.get_rate	= hsb_clk_get_rate,
+};
+static struct clk pba_clk = {
+	.name		= "pba",
+	.parent		= &hsb_clk,
+	.mode		= hsb_clk_mode,
+	.get_rate	= pba_clk_get_rate,
+	.index		= 1,
+};
+static struct clk pbb_clk = {
+	.name		= "pbb",
+	.parent		= &hsb_clk,
+	.mode		= hsb_clk_mode,
+	.get_rate	= pbb_clk_get_rate,
+	.users		= 1,
+	.index		= 2,
+};
+
+/* --------------------------------------------------------------------
+ *  Generic Clock operations
+ * -------------------------------------------------------------------- */
+
+static void genclk_mode(struct clk *clk, int enabled)
+{
+	u32 control;
+
+	BUG_ON(clk->index > 7);
+
+	control = sm_readl(&system_manager, PM_GCCTRL + 4 * clk->index);
+	if (enabled)
+		control |= SM_BIT(CEN);
+	else
+		control &= ~SM_BIT(CEN);
+	sm_writel(&system_manager, PM_GCCTRL + 4 * clk->index, control);
+}
+
+static unsigned long genclk_get_rate(struct clk *clk)
+{
+	u32 control;
+	unsigned long div = 1;
+
+	BUG_ON(clk->index > 7);
+
+	if (!clk->parent)
+		return 0;
+
+	control = sm_readl(&system_manager, PM_GCCTRL + 4 * clk->index);
+	if (control & SM_BIT(DIVEN))
+		div = 2 * (SM_BFEXT(DIV, control) + 1);
+
+	return clk->parent->get_rate(clk->parent) / div;
+}
+
+static long genclk_set_rate(struct clk *clk, unsigned long rate, int apply)
+{
+	u32 control;
+	unsigned long parent_rate, actual_rate, div;
+
+	BUG_ON(clk->index > 7);
+
+	if (!clk->parent)
+		return 0;
+
+	parent_rate = clk->parent->get_rate(clk->parent);
+	control = sm_readl(&system_manager, PM_GCCTRL + 4 * clk->index);
+
+	if (rate > 3 * parent_rate / 4) {
+		actual_rate = parent_rate;
+		control &= ~SM_BIT(DIVEN);
+	} else {
+		div = (parent_rate + rate) / (2 * rate) - 1;
+		control = SM_BFINS(DIV, div, control) | SM_BIT(DIVEN);
+		actual_rate = parent_rate / (2 * (div + 1));
+	}
+
+	printk("clk %s: new rate %lu (actual rate %lu)\n",
+	       clk->name, rate, actual_rate);
+
+	if (apply)
+		sm_writel(&system_manager, PM_GCCTRL + 4 * clk->index,
+			  control);
+
+	return actual_rate;
+}
+
+int genclk_set_parent(struct clk *clk, struct clk *parent)
+{
+	u32 control;
+
+	BUG_ON(clk->index > 7);
+
+	printk("clk %s: new parent %s (was %s)\n",
+	       clk->name, parent->name,
+	       clk->parent ? clk->parent->name : "(null)");
+
+	control = sm_readl(&system_manager, PM_GCCTRL + 4 * clk->index);
+
+	if (parent == &osc1 || parent == &pll1)
+		control |= SM_BIT(OSCSEL);
+	else if (parent == &osc0 || parent == &pll0)
+		control &= ~SM_BIT(OSCSEL);
+	else
+		return -EINVAL;
+
+	if (parent == &pll0 || parent == &pll1)
+		control |= SM_BIT(PLLSEL);
+	else
+		control &= ~SM_BIT(PLLSEL);
+
+	sm_writel(&system_manager, PM_GCCTRL + 4 * clk->index, control);
+	clk->parent = parent;
+
+	return 0;
+}
+
+/* --------------------------------------------------------------------
+ *  System peripherals
+ * -------------------------------------------------------------------- */
+static struct resource sm_resource[] = {
+	PBMEM(0xfff00000),
+	NAMED_IRQ(19, "eim"),
+	NAMED_IRQ(20, "pm"),
+	NAMED_IRQ(21, "rtc"),
+};
+struct platform_device at32_sm_device = {
+	.name		= "sm",
+	.id		= 0,
+	.resource	= sm_resource,
+	.num_resources	= ARRAY_SIZE(sm_resource),
+};
+DEV_CLK(pclk, at32_sm, pbb, 0);
+
+static struct resource intc0_resource[] = {
+	PBMEM(0xfff00400),
+};
+struct platform_device at32_intc0_device = {
+	.name		= "intc",
+	.id		= 0,
+	.resource	= intc0_resource,
+	.num_resources	= ARRAY_SIZE(intc0_resource),
+};
+DEV_CLK(pclk, at32_intc0, pbb, 1);
+
+static struct clk ebi_clk = {
+	.name		= "ebi",
+	.parent		= &hsb_clk,
+	.mode		= hsb_clk_mode,
+	.get_rate	= hsb_clk_get_rate,
+	.users		= 1,
+};
+static struct clk hramc_clk = {
+	.name		= "hramc",
+	.parent		= &hsb_clk,
+	.mode		= hsb_clk_mode,
+	.get_rate	= hsb_clk_get_rate,
+	.users		= 1,
+};
+
+static struct platform_device pdc_device = {
+	.name		= "pdc",
+	.id		= 0,
+};
+DEV_CLK(hclk, pdc, hsb, 4);
+DEV_CLK(pclk, pdc, pba, 16);
+
+static struct clk pico_clk = {
+	.name		= "pico",
+	.parent		= &cpu_clk,
+	.mode		= cpu_clk_mode,
+	.get_rate	= cpu_clk_get_rate,
+	.users		= 1,
+};
+
+/* --------------------------------------------------------------------
+ *  PIO
+ * -------------------------------------------------------------------- */
+
+static struct resource pio0_resource[] = {
+	PBMEM(0xffe02800),
+	IRQ(13),
+};
+DEFINE_DEV(pio, 0);
+DEV_CLK(mck, pio0, pba, 10);
+
+static struct resource pio1_resource[] = {
+	PBMEM(0xffe02c00),
+	IRQ(14),
+};
+DEFINE_DEV(pio, 1);
+DEV_CLK(mck, pio1, pba, 11);
+
+static struct resource pio2_resource[] = {
+	PBMEM(0xffe03000),
+	IRQ(15),
+};
+DEFINE_DEV(pio, 2);
+DEV_CLK(mck, pio2, pba, 12);
+
+static struct resource pio3_resource[] = {
+	PBMEM(0xffe03400),
+	IRQ(16),
+};
+DEFINE_DEV(pio, 3);
+DEV_CLK(mck, pio3, pba, 13);
+
+void __init at32_add_system_devices(void)
+{
+	system_manager.eim_first_irq = NR_INTERNAL_IRQS;
+
+	platform_device_register(&at32_sm_device);
+	platform_device_register(&at32_intc0_device);
+	platform_device_register(&pdc_device);
+
+	platform_device_register(&pio0_device);
+	platform_device_register(&pio1_device);
+	platform_device_register(&pio2_device);
+	platform_device_register(&pio3_device);
+}
+
+/* --------------------------------------------------------------------
+ *  USART
+ * -------------------------------------------------------------------- */
+
+static struct resource usart0_resource[] = {
+	PBMEM(0xffe00c00),
+	IRQ(7),
+};
+DEFINE_DEV(usart, 0);
+DEV_CLK(usart, usart0, pba, 4);
+
+static struct resource usart1_resource[] = {
+	PBMEM(0xffe01000),
+	IRQ(7),
+};
+DEFINE_DEV(usart, 1);
+DEV_CLK(usart, usart1, pba, 4);
+
+static struct resource usart2_resource[] = {
+	PBMEM(0xffe01400),
+	IRQ(8),
+};
+DEFINE_DEV(usart, 2);
+DEV_CLK(usart, usart2, pba, 5);
+
+static struct resource usart3_resource[] = {
+	PBMEM(0xffe01800),
+	IRQ(9),
+};
+DEFINE_DEV(usart, 3);
+DEV_CLK(usart, usart3, pba, 6);
+
+static inline void configure_usart0_pins(void)
+{
+	portmux_set_func(PIOA,  8, FUNC_B);	/* RXD	*/
+	portmux_set_func(PIOA,  9, FUNC_B);	/* TXD	*/
+}
+
+static inline void configure_usart1_pins(void)
+{
+	portmux_set_func(PIOA, 17, FUNC_A);	/* RXD	*/
+	portmux_set_func(PIOA, 18, FUNC_A);	/* TXD	*/
+}
+
+static inline void configure_usart2_pins(void)
+{
+	portmux_set_func(PIOB, 26, FUNC_B);	/* RXD	*/
+	portmux_set_func(PIOB, 27, FUNC_B);	/* TXD	*/
+}
+
+static inline void configure_usart3_pins(void)
+{
+	portmux_set_func(PIOB, 18, FUNC_B);	/* RXD	*/
+	portmux_set_func(PIOB, 17, FUNC_B);	/* TXD	*/
+}
+
+static struct platform_device *setup_usart(unsigned int id)
+{
+	struct platform_device *pdev;
+
+	switch (id) {
+	case 0:
+		pdev = &usart0_device;
+		configure_usart0_pins();
+		break;
+	case 1:
+		pdev = &usart1_device;
+		configure_usart1_pins();
+		break;
+	case 2:
+		pdev = &usart2_device;
+		configure_usart2_pins();
+		break;
+	case 3:
+		pdev = &usart3_device;
+		configure_usart3_pins();
+		break;
+	default:
+		pdev = NULL;
+		break;
+	}
+
+	return pdev;
+}
+
+struct platform_device *__init at32_add_device_usart(unsigned int id)
+{
+	struct platform_device *pdev;
+
+	pdev = setup_usart(id);
+	if (pdev)
+		platform_device_register(pdev);
+
+	return pdev;
+}
+
+struct platform_device *at91_default_console_device;
+
+void __init at32_setup_serial_console(unsigned int usart_id)
+{
+	at91_default_console_device = setup_usart(usart_id);
+}
+
+/* --------------------------------------------------------------------
+ *  Ethernet
+ * -------------------------------------------------------------------- */
+
+static struct eth_platform_data macb0_data;
+static struct resource macb0_resource[] = {
+	PBMEM(0xfff01800),
+	IRQ(25),
+};
+DEFINE_DEV_DATA(macb, 0);
+DEV_CLK(hclk, macb0, hsb, 8);
+DEV_CLK(pclk, macb0, pbb, 6);
+
+struct platform_device *__init
+at32_add_device_eth(unsigned int id, struct eth_platform_data *data)
+{
+	struct platform_device *pdev;
+
+	switch (id) {
+	case 0:
+		pdev = &macb0_device;
+
+		portmux_set_func(PIOC,  3, FUNC_A);	/* TXD0	*/
+		portmux_set_func(PIOC,  4, FUNC_A);	/* TXD1	*/
+		portmux_set_func(PIOC,  7, FUNC_A);	/* TXEN	*/
+		portmux_set_func(PIOC,  8, FUNC_A);	/* TXCK */
+		portmux_set_func(PIOC,  9, FUNC_A);	/* RXD0	*/
+		portmux_set_func(PIOC, 10, FUNC_A);	/* RXD1	*/
+		portmux_set_func(PIOC, 13, FUNC_A);	/* RXER	*/
+		portmux_set_func(PIOC, 15, FUNC_A);	/* RXDV	*/
+		portmux_set_func(PIOC, 16, FUNC_A);	/* MDC	*/
+		portmux_set_func(PIOC, 17, FUNC_A);	/* MDIO	*/
+
+		if (!data->is_rmii) {
+			portmux_set_func(PIOC,  0, FUNC_A);	/* COL	*/
+			portmux_set_func(PIOC,  1, FUNC_A);	/* CRS	*/
+			portmux_set_func(PIOC,  2, FUNC_A);	/* TXER	*/
+			portmux_set_func(PIOC,  5, FUNC_A);	/* TXD2	*/
+			portmux_set_func(PIOC,  6, FUNC_A);	/* TXD3 */
+			portmux_set_func(PIOC, 11, FUNC_A);	/* RXD2	*/
+			portmux_set_func(PIOC, 12, FUNC_A);	/* RXD3	*/
+			portmux_set_func(PIOC, 14, FUNC_A);	/* RXCK	*/
+			portmux_set_func(PIOC, 18, FUNC_A);	/* SPD	*/
+		}
+		break;
+
+	default:
+		return NULL;
+	}
+
+	memcpy(pdev->dev.platform_data, data, sizeof(struct eth_platform_data));
+	platform_device_register(pdev);
+
+	return pdev;
+}
+
+/* --------------------------------------------------------------------
+ *  SPI
+ * -------------------------------------------------------------------- */
+static struct resource spi0_resource[] = {
+	PBMEM(0xffe00000),
+	IRQ(3),
+};
+DEFINE_DEV(spi, 0);
+DEV_CLK(mck, spi0, pba, 0);
+
+struct platform_device *__init at32_add_device_spi(unsigned int id)
+{
+	struct platform_device *pdev;
+
+	switch (id) {
+	case 0:
+		pdev = &spi0_device;
+		portmux_set_func(PIOA,  0, FUNC_A);	/* MISO	 */
+		portmux_set_func(PIOA,  1, FUNC_A);	/* MOSI	 */
+		portmux_set_func(PIOA,  2, FUNC_A);	/* SCK	 */
+		portmux_set_func(PIOA,  3, FUNC_A);	/* NPCS0 */
+		portmux_set_func(PIOA,  4, FUNC_A);	/* NPCS1 */
+		portmux_set_func(PIOA,  5, FUNC_A);	/* NPCS2 */
+		break;
+
+	default:
+		return NULL;
+	}
+
+	platform_device_register(pdev);
+	return pdev;
+}
+
+/* --------------------------------------------------------------------
+ *  LCDC
+ * -------------------------------------------------------------------- */
+static struct lcdc_platform_data lcdc0_data;
+static struct resource lcdc0_resource[] = {
+	{
+		.start		= 0xff000000,
+		.end		= 0xff000fff,
+		.flags		= IORESOURCE_MEM,
+	},
+	IRQ(1),
+};
+DEFINE_DEV_DATA(lcdc, 0);
+DEV_CLK(hclk, lcdc0, hsb, 7);
+static struct clk lcdc0_pixclk = {
+	.name		= "pixclk",
+	.dev		= &lcdc0_device.dev,
+	.mode		= genclk_mode,
+	.get_rate	= genclk_get_rate,
+	.set_rate	= genclk_set_rate,
+	.set_parent	= genclk_set_parent,
+	.index		= 7,
+};
+
+struct platform_device *__init
+at32_add_device_lcdc(unsigned int id, struct lcdc_platform_data *data)
+{
+	struct platform_device *pdev;
+
+	switch (id) {
+	case 0:
+		pdev = &lcdc0_device;
+		portmux_set_func(PIOC, 19, FUNC_A);	/* CC	  */
+		portmux_set_func(PIOC, 20, FUNC_A);	/* HSYNC  */
+		portmux_set_func(PIOC, 21, FUNC_A);	/* PCLK	  */
+		portmux_set_func(PIOC, 22, FUNC_A);	/* VSYNC  */
+		portmux_set_func(PIOC, 23, FUNC_A);	/* DVAL	  */
+		portmux_set_func(PIOC, 24, FUNC_A);	/* MODE	  */
+		portmux_set_func(PIOC, 25, FUNC_A);	/* PWR	  */
+		portmux_set_func(PIOC, 26, FUNC_A);	/* DATA0  */
+		portmux_set_func(PIOC, 27, FUNC_A);	/* DATA1  */
+		portmux_set_func(PIOC, 28, FUNC_A);	/* DATA2  */
+		portmux_set_func(PIOC, 29, FUNC_A);	/* DATA3  */
+		portmux_set_func(PIOC, 30, FUNC_A);	/* DATA4  */
+		portmux_set_func(PIOC, 31, FUNC_A);	/* DATA5  */
+		portmux_set_func(PIOD,  0, FUNC_A);	/* DATA6  */
+		portmux_set_func(PIOD,  1, FUNC_A);	/* DATA7  */
+		portmux_set_func(PIOD,  2, FUNC_A);	/* DATA8  */
+		portmux_set_func(PIOD,  3, FUNC_A);	/* DATA9  */
+		portmux_set_func(PIOD,  4, FUNC_A);	/* DATA10 */
+		portmux_set_func(PIOD,  5, FUNC_A);	/* DATA11 */
+		portmux_set_func(PIOD,  6, FUNC_A);	/* DATA12 */
+		portmux_set_func(PIOD,  7, FUNC_A);	/* DATA13 */
+		portmux_set_func(PIOD,  8, FUNC_A);	/* DATA14 */
+		portmux_set_func(PIOD,  9, FUNC_A);	/* DATA15 */
+		portmux_set_func(PIOD, 10, FUNC_A);	/* DATA16 */
+		portmux_set_func(PIOD, 11, FUNC_A);	/* DATA17 */
+		portmux_set_func(PIOD, 12, FUNC_A);	/* DATA18 */
+		portmux_set_func(PIOD, 13, FUNC_A);	/* DATA19 */
+		portmux_set_func(PIOD, 14, FUNC_A);	/* DATA20 */
+		portmux_set_func(PIOD, 15, FUNC_A);	/* DATA21 */
+		portmux_set_func(PIOD, 16, FUNC_A);	/* DATA22 */
+		portmux_set_func(PIOD, 17, FUNC_A);	/* DATA23 */
+
+		clk_set_parent(&lcdc0_pixclk, &pll0);
+		clk_set_rate(&lcdc0_pixclk, clk_get_rate(&pll0));
+		break;
+
+	default:
+		return NULL;
+	}
+
+	memcpy(pdev->dev.platform_data, data,
+	       sizeof(struct lcdc_platform_data));
+
+	platform_device_register(pdev);
+	return pdev;
+}
+
+struct clk *at32_clock_list[] = {
+	&osc32k,
+	&osc0,
+	&osc1,
+	&pll0,
+	&pll1,
+	&cpu_clk,
+	&hsb_clk,
+	&pba_clk,
+	&pbb_clk,
+	&at32_sm_pclk,
+	&at32_intc0_pclk,
+	&ebi_clk,
+	&hramc_clk,
+	&pdc_hclk,
+	&pdc_pclk,
+	&pico_clk,
+	&pio0_mck,
+	&pio1_mck,
+	&pio2_mck,
+	&pio3_mck,
+	&usart0_usart,
+	&usart1_usart,
+	&usart2_usart,
+	&usart3_usart,
+	&macb0_hclk,
+	&macb0_pclk,
+	&spi0_mck,
+	&lcdc0_hclk,
+	&lcdc0_pixclk,
+};
+unsigned int at32_nr_clocks = ARRAY_SIZE(at32_clock_list);
+
+void __init at32_portmux_init(void)
+{
+	at32_init_pio(&pio0_device);
+	at32_init_pio(&pio1_device);
+	at32_init_pio(&pio2_device);
+	at32_init_pio(&pio3_device);
+}
+
+void __init at32_clock_init(void)
+{
+	struct at32_sm *sm = &system_manager;
+	u32 cpu_mask = 0, hsb_mask = 0, pba_mask = 0, pbb_mask = 0;
+	int i;
+
+	if (sm_readl(sm, PM_MCCTRL) & SM_BIT(PLLSEL))
+		main_clock = &pll0;
+	else
+		main_clock = &osc0;
+
+	if (sm_readl(sm, PM_PLL0) & SM_BIT(PLLOSC))
+		pll0.parent = &osc1;
+	if (sm_readl(sm, PM_PLL1) & SM_BIT(PLLOSC))
+		pll1.parent = &osc1;
+
+	/*
+	 * Turn on all clocks that have at least one user already, and
+	 * turn off everything else. We only do this for module
+	 * clocks, and even though it isn't particularly pretty to
+	 * check the address of the mode function, it should do the
+	 * trick...
+	 */
+	for (i = 0; i < ARRAY_SIZE(at32_clock_list); i++) {
+		struct clk *clk = at32_clock_list[i];
+
+		if (clk->mode == &cpu_clk_mode)
+			cpu_mask |= 1 << clk->index;
+		else if (clk->mode == &hsb_clk_mode)
+			hsb_mask |= 1 << clk->index;
+		else if (clk->mode == &pba_clk_mode)
+			pba_mask |= 1 << clk->index;
+		else if (clk->mode == &pbb_clk_mode)
+			pbb_mask |= 1 << clk->index;
+	}
+
+	sm_writel(sm, PM_CPU_MASK, cpu_mask);
+	sm_writel(sm, PM_HSB_MASK, hsb_mask);
+	sm_writel(sm, PM_PBA_MASK, pba_mask);
+	sm_writel(sm, PM_PBB_MASK, pbb_mask);
+}
diff --git a/arch/avr32/mach-at32ap/clock.c b/arch/avr32/mach-at32ap/clock.c
new file mode 100644
index 00000000000..3d0d1097389
--- /dev/null
+++ b/arch/avr32/mach-at32ap/clock.c
@@ -0,0 +1,148 @@
+/*
+ * Clock management for AT32AP CPUs
+ *
+ * Copyright (C) 2006 Atmel Corporation
+ *
+ * Based on arch/arm/mach-at91rm9200/clock.c
+ *   Copyright (C) 2005 David Brownell
+ *   Copyright (C) 2005 Ivan Kokshaysky
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/clk.h>
+#include <linux/err.h>
+#include <linux/device.h>
+#include <linux/string.h>
+
+#include "clock.h"
+
+static spinlock_t clk_lock = SPIN_LOCK_UNLOCKED;
+
+struct clk *clk_get(struct device *dev, const char *id)
+{
+	int i;
+
+	for (i = 0; i < at32_nr_clocks; i++) {
+		struct clk *clk = at32_clock_list[i];
+
+		if (clk->dev == dev && strcmp(id, clk->name) == 0)
+			return clk;
+	}
+
+	return ERR_PTR(-ENOENT);
+}
+EXPORT_SYMBOL(clk_get);
+
+void clk_put(struct clk *clk)
+{
+	/* clocks are static for now, we can't free them */
+}
+EXPORT_SYMBOL(clk_put);
+
+static void __clk_enable(struct clk *clk)
+{
+	if (clk->parent)
+		__clk_enable(clk->parent);
+	if (clk->users++ == 0 && clk->mode)
+		clk->mode(clk, 1);
+}
+
+int clk_enable(struct clk *clk)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&clk_lock, flags);
+	__clk_enable(clk);
+	spin_unlock_irqrestore(&clk_lock, flags);
+
+	return 0;
+}
+EXPORT_SYMBOL(clk_enable);
+
+static void __clk_disable(struct clk *clk)
+{
+	BUG_ON(clk->users == 0);
+
+	if (--clk->users == 0 && clk->mode)
+		clk->mode(clk, 0);
+	if (clk->parent)
+		__clk_disable(clk->parent);
+}
+
+void clk_disable(struct clk *clk)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&clk_lock, flags);
+	__clk_disable(clk);
+	spin_unlock_irqrestore(&clk_lock, flags);
+}
+EXPORT_SYMBOL(clk_disable);
+
+unsigned long clk_get_rate(struct clk *clk)
+{
+	unsigned long flags;
+	unsigned long rate;
+
+	spin_lock_irqsave(&clk_lock, flags);
+	rate = clk->get_rate(clk);
+	spin_unlock_irqrestore(&clk_lock, flags);
+
+	return rate;
+}
+EXPORT_SYMBOL(clk_get_rate);
+
+long clk_round_rate(struct clk *clk, unsigned long rate)
+{
+	unsigned long flags, actual_rate;
+
+	if (!clk->set_rate)
+		return -ENOSYS;
+
+	spin_lock_irqsave(&clk_lock, flags);
+	actual_rate = clk->set_rate(clk, rate, 0);
+	spin_unlock_irqrestore(&clk_lock, flags);
+
+	return actual_rate;
+}
+EXPORT_SYMBOL(clk_round_rate);
+
+int clk_set_rate(struct clk *clk, unsigned long rate)
+{
+	unsigned long flags;
+	long ret;
+
+	if (!clk->set_rate)
+		return -ENOSYS;
+
+	spin_lock_irqsave(&clk_lock, flags);
+	ret = clk->set_rate(clk, rate, 1);
+	spin_unlock_irqrestore(&clk_lock, flags);
+
+	return (ret < 0) ? ret : 0;
+}
+EXPORT_SYMBOL(clk_set_rate);
+
+int clk_set_parent(struct clk *clk, struct clk *parent)
+{
+	unsigned long flags;
+	int ret;
+
+	if (!clk->set_parent)
+		return -ENOSYS;
+
+	spin_lock_irqsave(&clk_lock, flags);
+	ret = clk->set_parent(clk, parent);
+	spin_unlock_irqrestore(&clk_lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(clk_set_parent);
+
+struct clk *clk_get_parent(struct clk *clk)
+{
+	return clk->parent;
+}
+EXPORT_SYMBOL(clk_get_parent);
diff --git a/arch/avr32/mach-at32ap/clock.h b/arch/avr32/mach-at32ap/clock.h
new file mode 100644
index 00000000000..f953f044ba4
--- /dev/null
+++ b/arch/avr32/mach-at32ap/clock.h
@@ -0,0 +1,30 @@
+/*
+ * Clock management for AT32AP CPUs
+ *
+ * Copyright (C) 2006 Atmel Corporation
+ *
+ * Based on arch/arm/mach-at91rm9200/clock.c
+ *   Copyright (C) 2005 David Brownell
+ *   Copyright (C) 2005 Ivan Kokshaysky
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/clk.h>
+
+struct clk {
+	const char	*name;		/* Clock name/function */
+	struct device	*dev;		/* Device the clock is used by */
+	struct clk	*parent;	/* Parent clock, if any */
+	void		(*mode)(struct clk *clk, int enabled);
+	unsigned long	(*get_rate)(struct clk *clk);
+	long		(*set_rate)(struct clk *clk, unsigned long rate,
+				    int apply);
+	int		(*set_parent)(struct clk *clk, struct clk *parent);
+	u16		users;		/* Enabled if non-zero */
+	u16		index;		/* Sibling index */
+};
+
+extern struct clk *at32_clock_list[];
+extern unsigned int at32_nr_clocks;
diff --git a/arch/avr32/mach-at32ap/extint.c b/arch/avr32/mach-at32ap/extint.c
new file mode 100644
index 00000000000..7da9c5f7a0e
--- /dev/null
+++ b/arch/avr32/mach-at32ap/extint.c
@@ -0,0 +1,171 @@
+/*
+ * External interrupt handling for AT32AP CPUs
+ *
+ * Copyright (C) 2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/platform_device.h>
+#include <linux/random.h>
+
+#include <asm/io.h>
+
+#include <asm/arch/sm.h>
+
+#include "sm.h"
+
+static void eim_ack_irq(unsigned int irq)
+{
+	struct at32_sm *sm = get_irq_chip_data(irq);
+	sm_writel(sm, EIM_ICR, 1 << (irq - sm->eim_first_irq));
+}
+
+static void eim_mask_irq(unsigned int irq)
+{
+	struct at32_sm *sm = get_irq_chip_data(irq);
+	sm_writel(sm, EIM_IDR, 1 << (irq - sm->eim_first_irq));
+}
+
+static void eim_mask_ack_irq(unsigned int irq)
+{
+	struct at32_sm *sm = get_irq_chip_data(irq);
+	sm_writel(sm, EIM_ICR, 1 << (irq - sm->eim_first_irq));
+	sm_writel(sm, EIM_IDR, 1 << (irq - sm->eim_first_irq));
+}
+
+static void eim_unmask_irq(unsigned int irq)
+{
+	struct at32_sm *sm = get_irq_chip_data(irq);
+	sm_writel(sm, EIM_IER, 1 << (irq - sm->eim_first_irq));
+}
+
+static int eim_set_irq_type(unsigned int irq, unsigned int flow_type)
+{
+	struct at32_sm *sm = get_irq_chip_data(irq);
+	unsigned int i = irq - sm->eim_first_irq;
+	u32 mode, edge, level;
+	unsigned long flags;
+	int ret = 0;
+
+	flow_type &= IRQ_TYPE_SENSE_MASK;
+
+	spin_lock_irqsave(&sm->lock, flags);
+
+	mode = sm_readl(sm, EIM_MODE);
+	edge = sm_readl(sm, EIM_EDGE);
+	level = sm_readl(sm, EIM_LEVEL);
+
+	switch (flow_type) {
+	case IRQ_TYPE_LEVEL_LOW:
+		mode |= 1 << i;
+		level &= ~(1 << i);
+		break;
+	case IRQ_TYPE_LEVEL_HIGH:
+		mode |= 1 << i;
+		level |= 1 << i;
+		break;
+	case IRQ_TYPE_EDGE_RISING:
+		mode &= ~(1 << i);
+		edge |= 1 << i;
+		break;
+	case IRQ_TYPE_EDGE_FALLING:
+		mode &= ~(1 << i);
+		edge &= ~(1 << i);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	sm_writel(sm, EIM_MODE, mode);
+	sm_writel(sm, EIM_EDGE, edge);
+	sm_writel(sm, EIM_LEVEL, level);
+
+	spin_unlock_irqrestore(&sm->lock, flags);
+
+	return ret;
+}
+
+struct irq_chip eim_chip = {
+	.name		= "eim",
+	.ack		= eim_ack_irq,
+	.mask		= eim_mask_irq,
+	.mask_ack	= eim_mask_ack_irq,
+	.unmask		= eim_unmask_irq,
+	.set_type	= eim_set_irq_type,
+};
+
+static void demux_eim_irq(unsigned int irq, struct irq_desc *desc,
+			  struct pt_regs *regs)
+{
+	struct at32_sm *sm = desc->handler_data;
+	struct irq_desc *ext_desc;
+	unsigned long status, pending;
+	unsigned int i, ext_irq;
+
+	spin_lock(&sm->lock);
+
+	status = sm_readl(sm, EIM_ISR);
+	pending = status & sm_readl(sm, EIM_IMR);
+
+	while (pending) {
+		i = fls(pending) - 1;
+		pending &= ~(1 << i);
+
+		ext_irq = i + sm->eim_first_irq;
+		ext_desc = irq_desc + ext_irq;
+		ext_desc->handle_irq(ext_irq, ext_desc, regs);
+	}
+
+	spin_unlock(&sm->lock);
+}
+
+static int __init eim_init(void)
+{
+	struct at32_sm *sm = &system_manager;
+	unsigned int i;
+	unsigned int nr_irqs;
+	unsigned int int_irq;
+	u32 pattern;
+
+	/*
+	 * The EIM is really the same module as SM, so register
+	 * mapping, etc. has been taken care of already.
+	 */
+
+	/*
+	 * Find out how many interrupt lines that are actually
+	 * implemented in hardware.
+	 */
+	sm_writel(sm, EIM_IDR, ~0UL);
+	sm_writel(sm, EIM_MODE, ~0UL);
+	pattern = sm_readl(sm, EIM_MODE);
+	nr_irqs = fls(pattern);
+
+	sm->eim_chip = &eim_chip;
+
+	for (i = 0; i < nr_irqs; i++) {
+		set_irq_chip(sm->eim_first_irq + i, &eim_chip);
+		set_irq_chip_data(sm->eim_first_irq + i, sm);
+	}
+
+	int_irq = platform_get_irq_byname(sm->pdev, "eim");
+
+	set_irq_chained_handler(int_irq, demux_eim_irq);
+	set_irq_data(int_irq, sm);
+
+	printk("EIM: External Interrupt Module at 0x%p, IRQ %u\n",
+	       sm->regs, int_irq);
+	printk("EIM: Handling %u external IRQs, starting with IRQ %u\n",
+	       nr_irqs, sm->eim_first_irq);
+
+	return 0;
+}
+arch_initcall(eim_init);
diff --git a/arch/avr32/mach-at32ap/intc.c b/arch/avr32/mach-at32ap/intc.c
new file mode 100644
index 00000000000..74f8c9f2f03
--- /dev/null
+++ b/arch/avr32/mach-at32ap/intc.c
@@ -0,0 +1,133 @@
+/*
+ * Copyright (C) 2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/clk.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/platform_device.h>
+
+#include <asm/io.h>
+
+#include "intc.h"
+
+struct intc {
+	void __iomem	*regs;
+	struct irq_chip	chip;
+};
+
+extern struct platform_device at32_intc0_device;
+
+/*
+ * TODO: We may be able to implement mask/unmask by setting IxM flags
+ * in the status register.
+ */
+static void intc_mask_irq(unsigned int irq)
+{
+
+}
+
+static void intc_unmask_irq(unsigned int irq)
+{
+
+}
+
+static struct intc intc0 = {
+	.chip = {
+		.name		= "intc",
+		.mask		= intc_mask_irq,
+		.unmask		= intc_unmask_irq,
+	},
+};
+
+/*
+ * All interrupts go via intc at some point.
+ */
+asmlinkage void do_IRQ(int level, struct pt_regs *regs)
+{
+	struct irq_desc *desc;
+	unsigned int irq;
+	unsigned long status_reg;
+
+	local_irq_disable();
+
+	irq_enter();
+
+	irq = intc_readl(&intc0, INTCAUSE0 - 4 * level);
+	desc = irq_desc + irq;
+	desc->handle_irq(irq, desc, regs);
+
+	/*
+	 * Clear all interrupt level masks so that we may handle
+	 * interrupts during softirq processing.  If this is a nested
+	 * interrupt, interrupts must stay globally disabled until we
+	 * return.
+	 */
+	status_reg = sysreg_read(SR);
+	status_reg &= ~(SYSREG_BIT(I0M) | SYSREG_BIT(I1M)
+			| SYSREG_BIT(I2M) | SYSREG_BIT(I3M));
+	sysreg_write(SR, status_reg);
+
+	irq_exit();
+}
+
+void __init init_IRQ(void)
+{
+	extern void _evba(void);
+	extern void irq_level0(void);
+	struct resource *regs;
+	struct clk *pclk;
+	unsigned int i;
+	u32 offset, readback;
+
+	regs = platform_get_resource(&at32_intc0_device, IORESOURCE_MEM, 0);
+	if (!regs) {
+		printk(KERN_EMERG "intc: no mmio resource defined\n");
+		goto fail;
+	}
+	pclk = clk_get(&at32_intc0_device.dev, "pclk");
+	if (IS_ERR(pclk)) {
+		printk(KERN_EMERG "intc: no clock defined\n");
+		goto fail;
+	}
+
+	clk_enable(pclk);
+
+	intc0.regs = ioremap(regs->start, regs->end - regs->start + 1);
+	if (!intc0.regs) {
+		printk(KERN_EMERG "intc: failed to map registers (0x%08lx)\n",
+		       (unsigned long)regs->start);
+		goto fail;
+	}
+
+	/*
+	 * Initialize all interrupts to level 0 (lowest priority). The
+	 * priority level may be changed by calling
+	 * irq_set_priority().
+	 *
+	 */
+	offset = (unsigned long)&irq_level0 - (unsigned long)&_evba;
+	for (i = 0; i < NR_INTERNAL_IRQS; i++) {
+		intc_writel(&intc0, INTPR0 + 4 * i, offset);
+		readback = intc_readl(&intc0, INTPR0 + 4 * i);
+		if (readback == offset)
+			set_irq_chip_and_handler(i, &intc0.chip,
+						 handle_simple_irq);
+	}
+
+	/* Unmask all interrupt levels */
+	sysreg_write(SR, (sysreg_read(SR)
+			  & ~(SR_I3M | SR_I2M | SR_I1M | SR_I0M)));
+
+	return;
+
+fail:
+	panic("Interrupt controller initialization failed!\n");
+}
+
diff --git a/arch/avr32/mach-at32ap/intc.h b/arch/avr32/mach-at32ap/intc.h
new file mode 100644
index 00000000000..d289ca2fff1
--- /dev/null
+++ b/arch/avr32/mach-at32ap/intc.h
@@ -0,0 +1,327 @@
+/*
+ * Automatically generated by gen-header.xsl
+ */
+#ifndef __ASM_AVR32_PERIHP_INTC_H__
+#define __ASM_AVR32_PERIHP_INTC_H__
+
+#define INTC_NUM_INT_GRPS            33
+
+#define INTC_INTPR0                  0x0
+# define INTC_INTPR0_INTLEV_OFFSET   30
+# define INTC_INTPR0_INTLEV_SIZE     2
+# define INTC_INTPR0_OFFSET_OFFSET   0
+# define INTC_INTPR0_OFFSET_SIZE     24
+#define INTC_INTREQ0                 0x100
+# define INTC_INTREQ0_IREQUEST0_OFFSET 0
+# define INTC_INTREQ0_IREQUEST0_SIZE 1
+# define INTC_INTREQ0_IREQUEST1_OFFSET 1
+# define INTC_INTREQ0_IREQUEST1_SIZE 1
+#define INTC_INTPR1                  0x4
+# define INTC_INTPR1_INTLEV_OFFSET   30
+# define INTC_INTPR1_INTLEV_SIZE     2
+# define INTC_INTPR1_OFFSET_OFFSET   0
+# define INTC_INTPR1_OFFSET_SIZE     24
+#define INTC_INTREQ1                 0x104
+# define INTC_INTREQ1_IREQUEST32_OFFSET 0
+# define INTC_INTREQ1_IREQUEST32_SIZE 1
+# define INTC_INTREQ1_IREQUEST33_OFFSET 1
+# define INTC_INTREQ1_IREQUEST33_SIZE 1
+# define INTC_INTREQ1_IREQUEST34_OFFSET 2
+# define INTC_INTREQ1_IREQUEST34_SIZE 1
+# define INTC_INTREQ1_IREQUEST35_OFFSET 3
+# define INTC_INTREQ1_IREQUEST35_SIZE 1
+# define INTC_INTREQ1_IREQUEST36_OFFSET 4
+# define INTC_INTREQ1_IREQUEST36_SIZE 1
+# define INTC_INTREQ1_IREQUEST37_OFFSET 5
+# define INTC_INTREQ1_IREQUEST37_SIZE 1
+#define INTC_INTPR2                  0x8
+# define INTC_INTPR2_INTLEV_OFFSET   30
+# define INTC_INTPR2_INTLEV_SIZE     2
+# define INTC_INTPR2_OFFSET_OFFSET   0
+# define INTC_INTPR2_OFFSET_SIZE     24
+#define INTC_INTREQ2                 0x108
+# define INTC_INTREQ2_IREQUEST64_OFFSET 0
+# define INTC_INTREQ2_IREQUEST64_SIZE 1
+# define INTC_INTREQ2_IREQUEST65_OFFSET 1
+# define INTC_INTREQ2_IREQUEST65_SIZE 1
+# define INTC_INTREQ2_IREQUEST66_OFFSET 2
+# define INTC_INTREQ2_IREQUEST66_SIZE 1
+# define INTC_INTREQ2_IREQUEST67_OFFSET 3
+# define INTC_INTREQ2_IREQUEST67_SIZE 1
+# define INTC_INTREQ2_IREQUEST68_OFFSET 4
+# define INTC_INTREQ2_IREQUEST68_SIZE 1
+#define INTC_INTPR3                  0xc
+# define INTC_INTPR3_INTLEV_OFFSET   30
+# define INTC_INTPR3_INTLEV_SIZE     2
+# define INTC_INTPR3_OFFSET_OFFSET   0
+# define INTC_INTPR3_OFFSET_SIZE     24
+#define INTC_INTREQ3                 0x10c
+# define INTC_INTREQ3_IREQUEST96_OFFSET 0
+# define INTC_INTREQ3_IREQUEST96_SIZE 1
+#define INTC_INTPR4                  0x10
+# define INTC_INTPR4_INTLEV_OFFSET   30
+# define INTC_INTPR4_INTLEV_SIZE     2
+# define INTC_INTPR4_OFFSET_OFFSET   0
+# define INTC_INTPR4_OFFSET_SIZE     24
+#define INTC_INTREQ4                 0x110
+# define INTC_INTREQ4_IREQUEST128_OFFSET 0
+# define INTC_INTREQ4_IREQUEST128_SIZE 1
+#define INTC_INTPR5                  0x14
+# define INTC_INTPR5_INTLEV_OFFSET   30
+# define INTC_INTPR5_INTLEV_SIZE     2
+# define INTC_INTPR5_OFFSET_OFFSET   0
+# define INTC_INTPR5_OFFSET_SIZE     24
+#define INTC_INTREQ5                 0x114
+# define INTC_INTREQ5_IREQUEST160_OFFSET 0
+# define INTC_INTREQ5_IREQUEST160_SIZE 1
+#define INTC_INTPR6                  0x18
+# define INTC_INTPR6_INTLEV_OFFSET   30
+# define INTC_INTPR6_INTLEV_SIZE     2
+# define INTC_INTPR6_OFFSET_OFFSET   0
+# define INTC_INTPR6_OFFSET_SIZE     24
+#define INTC_INTREQ6                 0x118
+# define INTC_INTREQ6_IREQUEST192_OFFSET 0
+# define INTC_INTREQ6_IREQUEST192_SIZE 1
+#define INTC_INTPR7                  0x1c
+# define INTC_INTPR7_INTLEV_OFFSET   30
+# define INTC_INTPR7_INTLEV_SIZE     2
+# define INTC_INTPR7_OFFSET_OFFSET   0
+# define INTC_INTPR7_OFFSET_SIZE     24
+#define INTC_INTREQ7                 0x11c
+# define INTC_INTREQ7_IREQUEST224_OFFSET 0
+# define INTC_INTREQ7_IREQUEST224_SIZE 1
+#define INTC_INTPR8                  0x20
+# define INTC_INTPR8_INTLEV_OFFSET   30
+# define INTC_INTPR8_INTLEV_SIZE     2
+# define INTC_INTPR8_OFFSET_OFFSET   0
+# define INTC_INTPR8_OFFSET_SIZE     24
+#define INTC_INTREQ8                 0x120
+# define INTC_INTREQ8_IREQUEST256_OFFSET 0
+# define INTC_INTREQ8_IREQUEST256_SIZE 1
+#define INTC_INTPR9                  0x24
+# define INTC_INTPR9_INTLEV_OFFSET   30
+# define INTC_INTPR9_INTLEV_SIZE     2
+# define INTC_INTPR9_OFFSET_OFFSET   0
+# define INTC_INTPR9_OFFSET_SIZE     24
+#define INTC_INTREQ9                 0x124
+# define INTC_INTREQ9_IREQUEST288_OFFSET 0
+# define INTC_INTREQ9_IREQUEST288_SIZE 1
+#define INTC_INTPR10                 0x28
+# define INTC_INTPR10_INTLEV_OFFSET  30
+# define INTC_INTPR10_INTLEV_SIZE    2
+# define INTC_INTPR10_OFFSET_OFFSET  0
+# define INTC_INTPR10_OFFSET_SIZE    24
+#define INTC_INTREQ10                0x128
+# define INTC_INTREQ10_IREQUEST320_OFFSET 0
+# define INTC_INTREQ10_IREQUEST320_SIZE 1
+#define INTC_INTPR11                 0x2c
+# define INTC_INTPR11_INTLEV_OFFSET  30
+# define INTC_INTPR11_INTLEV_SIZE    2
+# define INTC_INTPR11_OFFSET_OFFSET  0
+# define INTC_INTPR11_OFFSET_SIZE    24
+#define INTC_INTREQ11                0x12c
+# define INTC_INTREQ11_IREQUEST352_OFFSET 0
+# define INTC_INTREQ11_IREQUEST352_SIZE 1
+#define INTC_INTPR12                 0x30
+# define INTC_INTPR12_INTLEV_OFFSET  30
+# define INTC_INTPR12_INTLEV_SIZE    2
+# define INTC_INTPR12_OFFSET_OFFSET  0
+# define INTC_INTPR12_OFFSET_SIZE    24
+#define INTC_INTREQ12                0x130
+# define INTC_INTREQ12_IREQUEST384_OFFSET 0
+# define INTC_INTREQ12_IREQUEST384_SIZE 1
+#define INTC_INTPR13                 0x34
+# define INTC_INTPR13_INTLEV_OFFSET  30
+# define INTC_INTPR13_INTLEV_SIZE    2
+# define INTC_INTPR13_OFFSET_OFFSET  0
+# define INTC_INTPR13_OFFSET_SIZE    24
+#define INTC_INTREQ13                0x134
+# define INTC_INTREQ13_IREQUEST416_OFFSET 0
+# define INTC_INTREQ13_IREQUEST416_SIZE 1
+#define INTC_INTPR14                 0x38
+# define INTC_INTPR14_INTLEV_OFFSET  30
+# define INTC_INTPR14_INTLEV_SIZE    2
+# define INTC_INTPR14_OFFSET_OFFSET  0
+# define INTC_INTPR14_OFFSET_SIZE    24
+#define INTC_INTREQ14                0x138
+# define INTC_INTREQ14_IREQUEST448_OFFSET 0
+# define INTC_INTREQ14_IREQUEST448_SIZE 1
+#define INTC_INTPR15                 0x3c
+# define INTC_INTPR15_INTLEV_OFFSET  30
+# define INTC_INTPR15_INTLEV_SIZE    2
+# define INTC_INTPR15_OFFSET_OFFSET  0
+# define INTC_INTPR15_OFFSET_SIZE    24
+#define INTC_INTREQ15                0x13c
+# define INTC_INTREQ15_IREQUEST480_OFFSET 0
+# define INTC_INTREQ15_IREQUEST480_SIZE 1
+#define INTC_INTPR16                 0x40
+# define INTC_INTPR16_INTLEV_OFFSET  30
+# define INTC_INTPR16_INTLEV_SIZE    2
+# define INTC_INTPR16_OFFSET_OFFSET  0
+# define INTC_INTPR16_OFFSET_SIZE    24
+#define INTC_INTREQ16                0x140
+# define INTC_INTREQ16_IREQUEST512_OFFSET 0
+# define INTC_INTREQ16_IREQUEST512_SIZE 1
+#define INTC_INTPR17                 0x44
+# define INTC_INTPR17_INTLEV_OFFSET  30
+# define INTC_INTPR17_INTLEV_SIZE    2
+# define INTC_INTPR17_OFFSET_OFFSET  0
+# define INTC_INTPR17_OFFSET_SIZE    24
+#define INTC_INTREQ17                0x144
+# define INTC_INTREQ17_IREQUEST544_OFFSET 0
+# define INTC_INTREQ17_IREQUEST544_SIZE 1
+#define INTC_INTPR18                 0x48
+# define INTC_INTPR18_INTLEV_OFFSET  30
+# define INTC_INTPR18_INTLEV_SIZE    2
+# define INTC_INTPR18_OFFSET_OFFSET  0
+# define INTC_INTPR18_OFFSET_SIZE    24
+#define INTC_INTREQ18                0x148
+# define INTC_INTREQ18_IREQUEST576_OFFSET 0
+# define INTC_INTREQ18_IREQUEST576_SIZE 1
+#define INTC_INTPR19                 0x4c
+# define INTC_INTPR19_INTLEV_OFFSET  30
+# define INTC_INTPR19_INTLEV_SIZE    2
+# define INTC_INTPR19_OFFSET_OFFSET  0
+# define INTC_INTPR19_OFFSET_SIZE    24
+#define INTC_INTREQ19                0x14c
+# define INTC_INTREQ19_IREQUEST608_OFFSET 0
+# define INTC_INTREQ19_IREQUEST608_SIZE 1
+# define INTC_INTREQ19_IREQUEST609_OFFSET 1
+# define INTC_INTREQ19_IREQUEST609_SIZE 1
+# define INTC_INTREQ19_IREQUEST610_OFFSET 2
+# define INTC_INTREQ19_IREQUEST610_SIZE 1
+# define INTC_INTREQ19_IREQUEST611_OFFSET 3
+# define INTC_INTREQ19_IREQUEST611_SIZE 1
+#define INTC_INTPR20                 0x50
+# define INTC_INTPR20_INTLEV_OFFSET  30
+# define INTC_INTPR20_INTLEV_SIZE    2
+# define INTC_INTPR20_OFFSET_OFFSET  0
+# define INTC_INTPR20_OFFSET_SIZE    24
+#define INTC_INTREQ20                0x150
+# define INTC_INTREQ20_IREQUEST640_OFFSET 0
+# define INTC_INTREQ20_IREQUEST640_SIZE 1
+#define INTC_INTPR21                 0x54
+# define INTC_INTPR21_INTLEV_OFFSET  30
+# define INTC_INTPR21_INTLEV_SIZE    2
+# define INTC_INTPR21_OFFSET_OFFSET  0
+# define INTC_INTPR21_OFFSET_SIZE    24
+#define INTC_INTREQ21                0x154
+# define INTC_INTREQ21_IREQUEST672_OFFSET 0
+# define INTC_INTREQ21_IREQUEST672_SIZE 1
+#define INTC_INTPR22                 0x58
+# define INTC_INTPR22_INTLEV_OFFSET  30
+# define INTC_INTPR22_INTLEV_SIZE    2
+# define INTC_INTPR22_OFFSET_OFFSET  0
+# define INTC_INTPR22_OFFSET_SIZE    24
+#define INTC_INTREQ22                0x158
+# define INTC_INTREQ22_IREQUEST704_OFFSET 0
+# define INTC_INTREQ22_IREQUEST704_SIZE 1
+# define INTC_INTREQ22_IREQUEST705_OFFSET 1
+# define INTC_INTREQ22_IREQUEST705_SIZE 1
+# define INTC_INTREQ22_IREQUEST706_OFFSET 2
+# define INTC_INTREQ22_IREQUEST706_SIZE 1
+#define INTC_INTPR23                 0x5c
+# define INTC_INTPR23_INTLEV_OFFSET  30
+# define INTC_INTPR23_INTLEV_SIZE    2
+# define INTC_INTPR23_OFFSET_OFFSET  0
+# define INTC_INTPR23_OFFSET_SIZE    24
+#define INTC_INTREQ23                0x15c
+# define INTC_INTREQ23_IREQUEST736_OFFSET 0
+# define INTC_INTREQ23_IREQUEST736_SIZE 1
+# define INTC_INTREQ23_IREQUEST737_OFFSET 1
+# define INTC_INTREQ23_IREQUEST737_SIZE 1
+# define INTC_INTREQ23_IREQUEST738_OFFSET 2
+# define INTC_INTREQ23_IREQUEST738_SIZE 1
+#define INTC_INTPR24                 0x60
+# define INTC_INTPR24_INTLEV_OFFSET  30
+# define INTC_INTPR24_INTLEV_SIZE    2
+# define INTC_INTPR24_OFFSET_OFFSET  0
+# define INTC_INTPR24_OFFSET_SIZE    24
+#define INTC_INTREQ24                0x160
+# define INTC_INTREQ24_IREQUEST768_OFFSET 0
+# define INTC_INTREQ24_IREQUEST768_SIZE 1
+#define INTC_INTPR25                 0x64
+# define INTC_INTPR25_INTLEV_OFFSET  30
+# define INTC_INTPR25_INTLEV_SIZE    2
+# define INTC_INTPR25_OFFSET_OFFSET  0
+# define INTC_INTPR25_OFFSET_SIZE    24
+#define INTC_INTREQ25                0x164
+# define INTC_INTREQ25_IREQUEST800_OFFSET 0
+# define INTC_INTREQ25_IREQUEST800_SIZE 1
+#define INTC_INTPR26                 0x68
+# define INTC_INTPR26_INTLEV_OFFSET  30
+# define INTC_INTPR26_INTLEV_SIZE    2
+# define INTC_INTPR26_OFFSET_OFFSET  0
+# define INTC_INTPR26_OFFSET_SIZE    24
+#define INTC_INTREQ26                0x168
+# define INTC_INTREQ26_IREQUEST832_OFFSET 0
+# define INTC_INTREQ26_IREQUEST832_SIZE 1
+#define INTC_INTPR27                 0x6c
+# define INTC_INTPR27_INTLEV_OFFSET  30
+# define INTC_INTPR27_INTLEV_SIZE    2
+# define INTC_INTPR27_OFFSET_OFFSET  0
+# define INTC_INTPR27_OFFSET_SIZE    24
+#define INTC_INTREQ27                0x16c
+# define INTC_INTREQ27_IREQUEST864_OFFSET 0
+# define INTC_INTREQ27_IREQUEST864_SIZE 1
+#define INTC_INTPR28                 0x70
+# define INTC_INTPR28_INTLEV_OFFSET  30
+# define INTC_INTPR28_INTLEV_SIZE    2
+# define INTC_INTPR28_OFFSET_OFFSET  0
+# define INTC_INTPR28_OFFSET_SIZE    24
+#define INTC_INTREQ28                0x170
+# define INTC_INTREQ28_IREQUEST896_OFFSET 0
+# define INTC_INTREQ28_IREQUEST896_SIZE 1
+#define INTC_INTPR29                 0x74
+# define INTC_INTPR29_INTLEV_OFFSET  30
+# define INTC_INTPR29_INTLEV_SIZE    2
+# define INTC_INTPR29_OFFSET_OFFSET  0
+# define INTC_INTPR29_OFFSET_SIZE    24
+#define INTC_INTREQ29                0x174
+# define INTC_INTREQ29_IREQUEST928_OFFSET 0
+# define INTC_INTREQ29_IREQUEST928_SIZE 1
+#define INTC_INTPR30                 0x78
+# define INTC_INTPR30_INTLEV_OFFSET  30
+# define INTC_INTPR30_INTLEV_SIZE    2
+# define INTC_INTPR30_OFFSET_OFFSET  0
+# define INTC_INTPR30_OFFSET_SIZE    24
+#define INTC_INTREQ30                0x178
+# define INTC_INTREQ30_IREQUEST960_OFFSET 0
+# define INTC_INTREQ30_IREQUEST960_SIZE 1
+#define INTC_INTPR31                 0x7c
+# define INTC_INTPR31_INTLEV_OFFSET  30
+# define INTC_INTPR31_INTLEV_SIZE    2
+# define INTC_INTPR31_OFFSET_OFFSET  0
+# define INTC_INTPR31_OFFSET_SIZE    24
+#define INTC_INTREQ31                0x17c
+# define INTC_INTREQ31_IREQUEST992_OFFSET 0
+# define INTC_INTREQ31_IREQUEST992_SIZE 1
+#define INTC_INTPR32                 0x80
+# define INTC_INTPR32_INTLEV_OFFSET  30
+# define INTC_INTPR32_INTLEV_SIZE    2
+# define INTC_INTPR32_OFFSET_OFFSET  0
+# define INTC_INTPR32_OFFSET_SIZE    24
+#define INTC_INTREQ32                0x180
+# define INTC_INTREQ32_IREQUEST1024_OFFSET 0
+# define INTC_INTREQ32_IREQUEST1024_SIZE 1
+#define INTC_INTCAUSE0               0x20c
+# define INTC_INTCAUSE0_CAUSEGRP_OFFSET 0
+# define INTC_INTCAUSE0_CAUSEGRP_SIZE 6
+#define INTC_INTCAUSE1               0x208
+# define INTC_INTCAUSE1_CAUSEGRP_OFFSET 0
+# define INTC_INTCAUSE1_CAUSEGRP_SIZE 6
+#define INTC_INTCAUSE2               0x204
+# define INTC_INTCAUSE2_CAUSEGRP_OFFSET 0
+# define INTC_INTCAUSE2_CAUSEGRP_SIZE 6
+#define INTC_INTCAUSE3               0x200
+# define INTC_INTCAUSE3_CAUSEGRP_OFFSET 0
+# define INTC_INTCAUSE3_CAUSEGRP_SIZE 6
+
+#define INTC_BIT(name)               (1 << INTC_##name##_OFFSET)
+#define INTC_MKBF(name, value)       (((value) & ((1 << INTC_##name##_SIZE) - 1)) << INTC_##name##_OFFSET)
+#define INTC_GETBF(name, value)      (((value) >> INTC_##name##_OFFSET) & ((1 << INTC_##name##_SIZE) - 1))
+
+#define intc_readl(port,reg)         readl((port)->regs + INTC_##reg)
+#define intc_writel(port,reg,value)  writel((value), (port)->regs + INTC_##reg)
+
+#endif /* __ASM_AVR32_PERIHP_INTC_H__ */
diff --git a/arch/avr32/mach-at32ap/pio.c b/arch/avr32/mach-at32ap/pio.c
new file mode 100644
index 00000000000..d3aabfca859
--- /dev/null
+++ b/arch/avr32/mach-at32ap/pio.c
@@ -0,0 +1,118 @@
+/*
+ * Atmel PIO2 Port Multiplexer support
+ *
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/clk.h>
+#include <linux/debugfs.h>
+#include <linux/fs.h>
+#include <linux/platform_device.h>
+
+#include <asm/io.h>
+
+#include <asm/arch/portmux.h>
+
+#include "pio.h"
+
+#define MAX_NR_PIO_DEVICES		8
+
+struct pio_device {
+	void __iomem *regs;
+	const struct platform_device *pdev;
+	struct clk *clk;
+	u32 alloc_mask;
+	char name[32];
+};
+
+static struct pio_device pio_dev[MAX_NR_PIO_DEVICES];
+
+void portmux_set_func(unsigned int portmux_id, unsigned int pin_id,
+		      unsigned int function_id)
+{
+	struct pio_device *pio;
+	u32 mask = 1 << pin_id;
+
+	BUG_ON(portmux_id >= MAX_NR_PIO_DEVICES);
+
+	pio = &pio_dev[portmux_id];
+
+	if (function_id)
+		pio_writel(pio, BSR, mask);
+	else
+		pio_writel(pio, ASR, mask);
+	pio_writel(pio, PDR, mask);
+}
+
+static int __init pio_probe(struct platform_device *pdev)
+{
+	struct pio_device *pio = NULL;
+
+	BUG_ON(pdev->id >= MAX_NR_PIO_DEVICES);
+	pio = &pio_dev[pdev->id];
+	BUG_ON(!pio->regs);
+
+	/* TODO: Interrupts */
+
+	platform_set_drvdata(pdev, pio);
+
+	printk(KERN_INFO "%s: Atmel Port Multiplexer at 0x%p (irq %d)\n",
+	       pio->name, pio->regs, platform_get_irq(pdev, 0));
+
+	return 0;
+}
+
+static struct platform_driver pio_driver = {
+	.probe		= pio_probe,
+	.driver		= {
+		.name		= "pio",
+	},
+};
+
+static int __init pio_init(void)
+{
+	return platform_driver_register(&pio_driver);
+}
+subsys_initcall(pio_init);
+
+void __init at32_init_pio(struct platform_device *pdev)
+{
+	struct resource *regs;
+	struct pio_device *pio;
+
+	if (pdev->id > MAX_NR_PIO_DEVICES) {
+		dev_err(&pdev->dev, "only %d PIO devices supported\n",
+			MAX_NR_PIO_DEVICES);
+		return;
+	}
+
+	pio = &pio_dev[pdev->id];
+	snprintf(pio->name, sizeof(pio->name), "pio%d", pdev->id);
+
+	regs = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!regs) {
+		dev_err(&pdev->dev, "no mmio resource defined\n");
+		return;
+	}
+
+	pio->clk = clk_get(&pdev->dev, "mck");
+	if (IS_ERR(pio->clk))
+		/*
+		 * This is a fatal error, but if we continue we might
+		 * be so lucky that we manage to initialize the
+		 * console and display this message...
+		 */
+		dev_err(&pdev->dev, "no mck clock defined\n");
+	else
+		clk_enable(pio->clk);
+
+	pio->pdev = pdev;
+	pio->regs = ioremap(regs->start, regs->end - regs->start + 1);
+
+	pio_writel(pio, ODR, ~0UL);
+	pio_writel(pio, PER, ~0UL);
+}
diff --git a/arch/avr32/mach-at32ap/pio.h b/arch/avr32/mach-at32ap/pio.h
new file mode 100644
index 00000000000..cfea1235159
--- /dev/null
+++ b/arch/avr32/mach-at32ap/pio.h
@@ -0,0 +1,178 @@
+/*
+ * Atmel PIO2 Port Multiplexer support
+ *
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ARCH_AVR32_AT32AP_PIO_H__
+#define __ARCH_AVR32_AT32AP_PIO_H__
+
+/* PIO register offsets */
+#define PIO_PER                                0x0000
+#define PIO_PDR                                0x0004
+#define PIO_PSR                                0x0008
+#define PIO_OER                                0x0010
+#define PIO_ODR                                0x0014
+#define PIO_OSR                                0x0018
+#define PIO_IFER                               0x0020
+#define PIO_IFDR                               0x0024
+#define PIO_ISFR                               0x0028
+#define PIO_SODR                               0x0030
+#define PIO_CODR                               0x0034
+#define PIO_ODSR                               0x0038
+#define PIO_PDSR                               0x003c
+#define PIO_IER                                0x0040
+#define PIO_IDR                                0x0044
+#define PIO_IMR                                0x0048
+#define PIO_ISR                                0x004c
+#define PIO_MDER                               0x0050
+#define PIO_MDDR                               0x0054
+#define PIO_MDSR                               0x0058
+#define PIO_PUDR                               0x0060
+#define PIO_PUER                               0x0064
+#define PIO_PUSR                               0x0068
+#define PIO_ASR                                0x0070
+#define PIO_BSR                                0x0074
+#define PIO_ABSR                               0x0078
+#define PIO_OWER                               0x00a0
+#define PIO_OWDR                               0x00a4
+#define PIO_OWSR                               0x00a8
+
+/* Bitfields in PER */
+
+/* Bitfields in PDR */
+
+/* Bitfields in PSR */
+
+/* Bitfields in OER */
+
+/* Bitfields in ODR */
+
+/* Bitfields in OSR */
+
+/* Bitfields in IFER */
+
+/* Bitfields in IFDR */
+
+/* Bitfields in ISFR */
+
+/* Bitfields in SODR */
+
+/* Bitfields in CODR */
+
+/* Bitfields in ODSR */
+
+/* Bitfields in PDSR */
+
+/* Bitfields in IER */
+
+/* Bitfields in IDR */
+
+/* Bitfields in IMR */
+
+/* Bitfields in ISR */
+
+/* Bitfields in MDER */
+
+/* Bitfields in MDDR */
+
+/* Bitfields in MDSR */
+
+/* Bitfields in PUDR */
+
+/* Bitfields in PUER */
+
+/* Bitfields in PUSR */
+
+/* Bitfields in ASR */
+
+/* Bitfields in BSR */
+
+/* Bitfields in ABSR */
+#define PIO_P0_OFFSET                          0
+#define PIO_P0_SIZE                            1
+#define PIO_P1_OFFSET                          1
+#define PIO_P1_SIZE                            1
+#define PIO_P2_OFFSET                          2
+#define PIO_P2_SIZE                            1
+#define PIO_P3_OFFSET                          3
+#define PIO_P3_SIZE                            1
+#define PIO_P4_OFFSET                          4
+#define PIO_P4_SIZE                            1
+#define PIO_P5_OFFSET                          5
+#define PIO_P5_SIZE                            1
+#define PIO_P6_OFFSET                          6
+#define PIO_P6_SIZE                            1
+#define PIO_P7_OFFSET                          7
+#define PIO_P7_SIZE                            1
+#define PIO_P8_OFFSET                          8
+#define PIO_P8_SIZE                            1
+#define PIO_P9_OFFSET                          9
+#define PIO_P9_SIZE                            1
+#define PIO_P10_OFFSET                         10
+#define PIO_P10_SIZE                           1
+#define PIO_P11_OFFSET                         11
+#define PIO_P11_SIZE                           1
+#define PIO_P12_OFFSET                         12
+#define PIO_P12_SIZE                           1
+#define PIO_P13_OFFSET                         13
+#define PIO_P13_SIZE                           1
+#define PIO_P14_OFFSET                         14
+#define PIO_P14_SIZE                           1
+#define PIO_P15_OFFSET                         15
+#define PIO_P15_SIZE                           1
+#define PIO_P16_OFFSET                         16
+#define PIO_P16_SIZE                           1
+#define PIO_P17_OFFSET                         17
+#define PIO_P17_SIZE                           1
+#define PIO_P18_OFFSET                         18
+#define PIO_P18_SIZE                           1
+#define PIO_P19_OFFSET                         19
+#define PIO_P19_SIZE                           1
+#define PIO_P20_OFFSET                         20
+#define PIO_P20_SIZE                           1
+#define PIO_P21_OFFSET                         21
+#define PIO_P21_SIZE                           1
+#define PIO_P22_OFFSET                         22
+#define PIO_P22_SIZE                           1
+#define PIO_P23_OFFSET                         23
+#define PIO_P23_SIZE                           1
+#define PIO_P24_OFFSET                         24
+#define PIO_P24_SIZE                           1
+#define PIO_P25_OFFSET                         25
+#define PIO_P25_SIZE                           1
+#define PIO_P26_OFFSET                         26
+#define PIO_P26_SIZE                           1
+#define PIO_P27_OFFSET                         27
+#define PIO_P27_SIZE                           1
+#define PIO_P28_OFFSET                         28
+#define PIO_P28_SIZE                           1
+#define PIO_P29_OFFSET                         29
+#define PIO_P29_SIZE                           1
+#define PIO_P30_OFFSET                         30
+#define PIO_P30_SIZE                           1
+#define PIO_P31_OFFSET                         31
+#define PIO_P31_SIZE                           1
+
+/* Bitfields in OWER */
+
+/* Bitfields in OWDR */
+
+/* Bitfields in OWSR */
+
+/* Bit manipulation macros */
+#define PIO_BIT(name)                          (1 << PIO_##name##_OFFSET)
+#define PIO_BF(name,value)                     (((value) & ((1 << PIO_##name##_SIZE) - 1)) << PIO_##name##_OFFSET)
+#define PIO_BFEXT(name,value)                  (((value) >> PIO_##name##_OFFSET) & ((1 << PIO_##name##_SIZE) - 1))
+#define PIO_BFINS(name,value,old)              (((old) & ~(((1 << PIO_##name##_SIZE) - 1) << PIO_##name##_OFFSET)) | PIO_BF(name,value))
+
+/* Register access macros */
+#define pio_readl(port,reg)                    readl((port)->regs + PIO_##reg)
+#define pio_writel(port,reg,value)             writel((value), (port)->regs + PIO_##reg)
+
+void at32_init_pio(struct platform_device *pdev);
+
+#endif /* __ARCH_AVR32_AT32AP_PIO_H__ */
diff --git a/arch/avr32/mach-at32ap/sm.c b/arch/avr32/mach-at32ap/sm.c
new file mode 100644
index 00000000000..03306eb0345
--- /dev/null
+++ b/arch/avr32/mach-at32ap/sm.c
@@ -0,0 +1,289 @@
+/*
+ * System Manager driver for AT32AP CPUs
+ *
+ * Copyright (C) 2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/platform_device.h>
+#include <linux/random.h>
+#include <linux/spinlock.h>
+
+#include <asm/intc.h>
+#include <asm/io.h>
+#include <asm/irq.h>
+
+#include <asm/arch/sm.h>
+
+#include "sm.h"
+
+#define SM_EIM_IRQ_RESOURCE	1
+#define SM_PM_IRQ_RESOURCE	2
+#define SM_RTC_IRQ_RESOURCE	3
+
+#define to_eim(irqc) container_of(irqc, struct at32_sm, irqc)
+
+struct at32_sm system_manager;
+
+int __init at32_sm_init(void)
+{
+	struct resource *regs;
+	struct at32_sm *sm = &system_manager;
+	int ret = -ENXIO;
+
+	regs = platform_get_resource(&at32_sm_device, IORESOURCE_MEM, 0);
+	if (!regs)
+		goto fail;
+
+	spin_lock_init(&sm->lock);
+	sm->pdev = &at32_sm_device;
+
+	ret = -ENOMEM;
+	sm->regs = ioremap(regs->start, regs->end - regs->start + 1);
+	if (!sm->regs)
+		goto fail;
+
+	return 0;
+
+fail:
+	printk(KERN_ERR "Failed to initialize System Manager: %d\n", ret);
+	return ret;
+}
+
+/*
+ * External Interrupt Module (EIM).
+ *
+ * EIM gets level- or edge-triggered interrupts of either polarity
+ * from the outside and converts it to active-high level-triggered
+ * interrupts that the internal interrupt controller can handle. EIM
+ * also provides masking/unmasking of interrupts, as well as
+ * acknowledging of edge-triggered interrupts.
+ */
+
+static irqreturn_t spurious_eim_interrupt(int irq, void *dev_id,
+					  struct pt_regs *regs)
+{
+	printk(KERN_WARNING "Spurious EIM interrupt %d\n", irq);
+	disable_irq(irq);
+	return IRQ_NONE;
+}
+
+static struct irqaction eim_spurious_action = {
+	.handler = spurious_eim_interrupt,
+};
+
+static irqreturn_t eim_handle_irq(int irq, void *dev_id, struct pt_regs *regs)
+{
+	struct irq_controller * irqc = dev_id;
+	struct at32_sm *sm = to_eim(irqc);
+	unsigned long pending;
+
+	/*
+	 * No need to disable interrupts globally.  The interrupt
+	 * level relevant to this group must be masked all the time,
+	 * so we know that this particular EIM instance will not be
+	 * re-entered.
+	 */
+	spin_lock(&sm->lock);
+
+	pending = intc_get_pending(sm->irqc.irq_group);
+	if (unlikely(!pending)) {
+		printk(KERN_ERR "EIM (group %u): No interrupts pending!\n",
+		       sm->irqc.irq_group);
+		goto unlock;
+	}
+
+	do {
+		struct irqaction *action;
+		unsigned int i;
+
+		i = fls(pending) - 1;
+		pending &= ~(1 << i);
+		action = sm->action[i];
+
+		/* Acknowledge the interrupt */
+		sm_writel(sm, EIM_ICR, 1 << i);
+
+		spin_unlock(&sm->lock);
+
+		if (action->flags & SA_INTERRUPT)
+			local_irq_disable();
+		action->handler(sm->irqc.first_irq + i, action->dev_id, regs);
+		local_irq_enable();
+		spin_lock(&sm->lock);
+		if (action->flags & SA_SAMPLE_RANDOM)
+			add_interrupt_randomness(sm->irqc.first_irq + i);
+	} while (pending);
+
+unlock:
+	spin_unlock(&sm->lock);
+	return IRQ_HANDLED;
+}
+
+static void eim_mask(struct irq_controller *irqc, unsigned int irq)
+{
+	struct at32_sm *sm = to_eim(irqc);
+	unsigned int i;
+
+	i = irq - sm->irqc.first_irq;
+	sm_writel(sm, EIM_IDR, 1 << i);
+}
+
+static void eim_unmask(struct irq_controller *irqc, unsigned int irq)
+{
+	struct at32_sm *sm = to_eim(irqc);
+	unsigned int i;
+
+	i = irq - sm->irqc.first_irq;
+	sm_writel(sm, EIM_IER, 1 << i);
+}
+
+static int eim_setup(struct irq_controller *irqc, unsigned int irq,
+		struct irqaction *action)
+{
+	struct at32_sm *sm = to_eim(irqc);
+	sm->action[irq - sm->irqc.first_irq] = action;
+	/* Acknowledge earlier interrupts */
+	sm_writel(sm, EIM_ICR, (1<<(irq - sm->irqc.first_irq)));
+	eim_unmask(irqc, irq);
+	return 0;
+}
+
+static void eim_free(struct irq_controller *irqc, unsigned int irq,
+		void *dev)
+{
+	struct at32_sm *sm = to_eim(irqc);
+	eim_mask(irqc, irq);
+	sm->action[irq - sm->irqc.first_irq] = &eim_spurious_action;
+}
+
+static int eim_set_type(struct irq_controller *irqc, unsigned int irq,
+			unsigned int type)
+{
+	struct at32_sm *sm = to_eim(irqc);
+	unsigned long flags;
+	u32 value, pattern;
+
+	spin_lock_irqsave(&sm->lock, flags);
+
+	pattern = 1 << (irq - sm->irqc.first_irq);
+
+	value = sm_readl(sm, EIM_MODE);
+	if (type & IRQ_TYPE_LEVEL)
+		value |= pattern;
+	else
+		value &= ~pattern;
+	sm_writel(sm, EIM_MODE, value);
+	value = sm_readl(sm, EIM_EDGE);
+	if (type & IRQ_EDGE_RISING)
+		value |= pattern;
+	else
+		value &= ~pattern;
+	sm_writel(sm, EIM_EDGE, value);
+	value = sm_readl(sm, EIM_LEVEL);
+	if (type & IRQ_LEVEL_HIGH)
+		value |= pattern;
+	else
+		value &= ~pattern;
+	sm_writel(sm, EIM_LEVEL, value);
+
+	spin_unlock_irqrestore(&sm->lock, flags);
+
+	return 0;
+}
+
+static unsigned int eim_get_type(struct irq_controller *irqc,
+				 unsigned int irq)
+{
+	struct at32_sm *sm = to_eim(irqc);
+	unsigned long flags;
+	unsigned int type = 0;
+	u32 mode, edge, level, pattern;
+
+	pattern = 1 << (irq - sm->irqc.first_irq);
+
+	spin_lock_irqsave(&sm->lock, flags);
+	mode = sm_readl(sm, EIM_MODE);
+	edge = sm_readl(sm, EIM_EDGE);
+	level = sm_readl(sm, EIM_LEVEL);
+	spin_unlock_irqrestore(&sm->lock, flags);
+
+	if (mode & pattern)
+		type |= IRQ_TYPE_LEVEL;
+	if (edge & pattern)
+		type |= IRQ_EDGE_RISING;
+	if (level & pattern)
+		type |= IRQ_LEVEL_HIGH;
+
+	return type;
+}
+
+static struct irq_controller_class eim_irq_class = {
+	.typename	= "EIM",
+	.handle		= eim_handle_irq,
+	.setup		= eim_setup,
+	.free		= eim_free,
+	.mask		= eim_mask,
+	.unmask		= eim_unmask,
+	.set_type	= eim_set_type,
+	.get_type	= eim_get_type,
+};
+
+static int __init eim_init(void)
+{
+	struct at32_sm *sm = &system_manager;
+	unsigned int i;
+	u32 pattern;
+	int ret;
+
+	/*
+	 * The EIM is really the same module as SM, so register
+	 * mapping, etc. has been taken care of already.
+	 */
+
+	/*
+	 * Find out how many interrupt lines that are actually
+	 * implemented in hardware.
+	 */
+	sm_writel(sm, EIM_IDR, ~0UL);
+	sm_writel(sm, EIM_MODE, ~0UL);
+	pattern = sm_readl(sm, EIM_MODE);
+	sm->irqc.nr_irqs = fls(pattern);
+
+	ret = -ENOMEM;
+	sm->action = kmalloc(sizeof(*sm->action) * sm->irqc.nr_irqs,
+			     GFP_KERNEL);
+	if (!sm->action)
+		goto out;
+
+	for (i = 0; i < sm->irqc.nr_irqs; i++)
+		sm->action[i] = &eim_spurious_action;
+
+	spin_lock_init(&sm->lock);
+	sm->irqc.irq_group = sm->pdev->resource[SM_EIM_IRQ_RESOURCE].start;
+	sm->irqc.class = &eim_irq_class;
+
+	ret = intc_register_controller(&sm->irqc);
+	if (ret < 0)
+		goto out_free_actions;
+
+	printk("EIM: External Interrupt Module at 0x%p, IRQ group %u\n",
+	       sm->regs, sm->irqc.irq_group);
+	printk("EIM: Handling %u external IRQs, starting with IRQ%u\n",
+	       sm->irqc.nr_irqs, sm->irqc.first_irq);
+
+	return 0;
+
+out_free_actions:
+	kfree(sm->action);
+out:
+	return ret;
+}
+arch_initcall(eim_init);
diff --git a/arch/avr32/mach-at32ap/sm.h b/arch/avr32/mach-at32ap/sm.h
new file mode 100644
index 00000000000..27565822ae2
--- /dev/null
+++ b/arch/avr32/mach-at32ap/sm.h
@@ -0,0 +1,240 @@
+/*
+ * Register definitions for SM
+ *
+ * System Manager
+ */
+#ifndef __ASM_AVR32_SM_H__
+#define __ASM_AVR32_SM_H__
+
+/* SM register offsets */
+#define SM_PM_MCCTRL                            0x0000
+#define SM_PM_CKSEL                             0x0004
+#define SM_PM_CPU_MASK                          0x0008
+#define SM_PM_HSB_MASK                          0x000c
+#define SM_PM_PBA_MASK                         0x0010
+#define SM_PM_PBB_MASK                         0x0014
+#define SM_PM_PLL0                              0x0020
+#define SM_PM_PLL1                              0x0024
+#define SM_PM_VCTRL                             0x0030
+#define SM_PM_VMREF                             0x0034
+#define SM_PM_VMV                               0x0038
+#define SM_PM_IER                               0x0040
+#define SM_PM_IDR                               0x0044
+#define SM_PM_IMR                               0x0048
+#define SM_PM_ISR                               0x004c
+#define SM_PM_ICR                               0x0050
+#define SM_PM_GCCTRL                            0x0060
+#define SM_RTC_CTRL                             0x0080
+#define SM_RTC_VAL                              0x0084
+#define SM_RTC_TOP                              0x0088
+#define SM_RTC_IER                              0x0090
+#define SM_RTC_IDR                              0x0094
+#define SM_RTC_IMR                              0x0098
+#define SM_RTC_ISR                              0x009c
+#define SM_RTC_ICR                              0x00a0
+#define SM_WDT_CTRL                             0x00b0
+#define SM_WDT_CLR                              0x00b4
+#define SM_WDT_EXT                              0x00b8
+#define SM_RC_RCAUSE                            0x00c0
+#define SM_EIM_IER                              0x0100
+#define SM_EIM_IDR                              0x0104
+#define SM_EIM_IMR                              0x0108
+#define SM_EIM_ISR                              0x010c
+#define SM_EIM_ICR                              0x0110
+#define SM_EIM_MODE                             0x0114
+#define SM_EIM_EDGE                             0x0118
+#define SM_EIM_LEVEL                            0x011c
+#define SM_EIM_TEST                             0x0120
+#define SM_EIM_NMIC                             0x0124
+
+/* Bitfields in PM_MCCTRL */
+
+/* Bitfields in PM_CKSEL */
+#define SM_CPUSEL_OFFSET                        0
+#define SM_CPUSEL_SIZE                          3
+#define SM_CPUDIV_OFFSET                        7
+#define SM_CPUDIV_SIZE                          1
+#define SM_HSBSEL_OFFSET                        8
+#define SM_HSBSEL_SIZE                          3
+#define SM_HSBDIV_OFFSET                        15
+#define SM_HSBDIV_SIZE                          1
+#define SM_PBASEL_OFFSET                       16
+#define SM_PBASEL_SIZE                         3
+#define SM_PBADIV_OFFSET                       23
+#define SM_PBADIV_SIZE                         1
+#define SM_PBBSEL_OFFSET                       24
+#define SM_PBBSEL_SIZE                         3
+#define SM_PBBDIV_OFFSET                       31
+#define SM_PBBDIV_SIZE                         1
+
+/* Bitfields in PM_CPU_MASK */
+
+/* Bitfields in PM_HSB_MASK */
+
+/* Bitfields in PM_PBA_MASK */
+
+/* Bitfields in PM_PBB_MASK */
+
+/* Bitfields in PM_PLL0 */
+#define SM_PLLEN_OFFSET                         0
+#define SM_PLLEN_SIZE                           1
+#define SM_PLLOSC_OFFSET                        1
+#define SM_PLLOSC_SIZE                          1
+#define SM_PLLOPT_OFFSET                        2
+#define SM_PLLOPT_SIZE                          3
+#define SM_PLLDIV_OFFSET                        8
+#define SM_PLLDIV_SIZE                          8
+#define SM_PLLMUL_OFFSET                        16
+#define SM_PLLMUL_SIZE                          8
+#define SM_PLLCOUNT_OFFSET                      24
+#define SM_PLLCOUNT_SIZE                        6
+#define SM_PLLTEST_OFFSET                       31
+#define SM_PLLTEST_SIZE                         1
+
+/* Bitfields in PM_PLL1 */
+
+/* Bitfields in PM_VCTRL */
+#define SM_VAUTO_OFFSET                         0
+#define SM_VAUTO_SIZE                           1
+#define SM_PM_VCTRL_VAL_OFFSET                  8
+#define SM_PM_VCTRL_VAL_SIZE                    7
+
+/* Bitfields in PM_VMREF */
+#define SM_REFSEL_OFFSET                        0
+#define SM_REFSEL_SIZE                          4
+
+/* Bitfields in PM_VMV */
+#define SM_PM_VMV_VAL_OFFSET                    0
+#define SM_PM_VMV_VAL_SIZE                      8
+
+/* Bitfields in PM_IER */
+
+/* Bitfields in PM_IDR */
+
+/* Bitfields in PM_IMR */
+
+/* Bitfields in PM_ISR */
+
+/* Bitfields in PM_ICR */
+#define SM_LOCK0_OFFSET                         0
+#define SM_LOCK0_SIZE                           1
+#define SM_LOCK1_OFFSET                         1
+#define SM_LOCK1_SIZE                           1
+#define SM_WAKE_OFFSET                          2
+#define SM_WAKE_SIZE                            1
+#define SM_VOK_OFFSET                           3
+#define SM_VOK_SIZE                             1
+#define SM_VMRDY_OFFSET                         4
+#define SM_VMRDY_SIZE                           1
+#define SM_CKRDY_OFFSET                         5
+#define SM_CKRDY_SIZE                           1
+
+/* Bitfields in PM_GCCTRL */
+#define SM_OSCSEL_OFFSET                        0
+#define SM_OSCSEL_SIZE                          1
+#define SM_PLLSEL_OFFSET                        1
+#define SM_PLLSEL_SIZE                          1
+#define SM_CEN_OFFSET                           2
+#define SM_CEN_SIZE                             1
+#define SM_CPC_OFFSET                           3
+#define SM_CPC_SIZE                             1
+#define SM_DIVEN_OFFSET                         4
+#define SM_DIVEN_SIZE                           1
+#define SM_DIV_OFFSET                           8
+#define SM_DIV_SIZE                             8
+
+/* Bitfields in RTC_CTRL */
+#define SM_PCLR_OFFSET                          1
+#define SM_PCLR_SIZE                            1
+#define SM_TOPEN_OFFSET                         2
+#define SM_TOPEN_SIZE                           1
+#define SM_CLKEN_OFFSET                         3
+#define SM_CLKEN_SIZE                           1
+#define SM_PSEL_OFFSET                          8
+#define SM_PSEL_SIZE                            16
+
+/* Bitfields in RTC_VAL */
+#define SM_RTC_VAL_VAL_OFFSET                   0
+#define SM_RTC_VAL_VAL_SIZE                     31
+
+/* Bitfields in RTC_TOP */
+#define SM_RTC_TOP_VAL_OFFSET                   0
+#define SM_RTC_TOP_VAL_SIZE                     32
+
+/* Bitfields in RTC_IER */
+
+/* Bitfields in RTC_IDR */
+
+/* Bitfields in RTC_IMR */
+
+/* Bitfields in RTC_ISR */
+
+/* Bitfields in RTC_ICR */
+#define SM_TOPI_OFFSET                          0
+#define SM_TOPI_SIZE                            1
+
+/* Bitfields in WDT_CTRL */
+#define SM_KEY_OFFSET                           24
+#define SM_KEY_SIZE                             8
+
+/* Bitfields in WDT_CLR */
+
+/* Bitfields in WDT_EXT */
+
+/* Bitfields in RC_RCAUSE */
+#define SM_POR_OFFSET                           0
+#define SM_POR_SIZE                             1
+#define SM_BOD_OFFSET                           1
+#define SM_BOD_SIZE                             1
+#define SM_EXT_OFFSET                           2
+#define SM_EXT_SIZE                             1
+#define SM_WDT_OFFSET                           3
+#define SM_WDT_SIZE                             1
+#define SM_NTAE_OFFSET                          4
+#define SM_NTAE_SIZE                            1
+#define SM_SERP_OFFSET                          5
+#define SM_SERP_SIZE                            1
+
+/* Bitfields in EIM_IER */
+
+/* Bitfields in EIM_IDR */
+
+/* Bitfields in EIM_IMR */
+
+/* Bitfields in EIM_ISR */
+
+/* Bitfields in EIM_ICR */
+
+/* Bitfields in EIM_MODE */
+
+/* Bitfields in EIM_EDGE */
+#define SM_INT0_OFFSET                          0
+#define SM_INT0_SIZE                            1
+#define SM_INT1_OFFSET                          1
+#define SM_INT1_SIZE                            1
+#define SM_INT2_OFFSET                          2
+#define SM_INT2_SIZE                            1
+#define SM_INT3_OFFSET                          3
+#define SM_INT3_SIZE                            1
+
+/* Bitfields in EIM_LEVEL */
+
+/* Bitfields in EIM_TEST */
+#define SM_TESTEN_OFFSET                        31
+#define SM_TESTEN_SIZE                          1
+
+/* Bitfields in EIM_NMIC */
+#define SM_EN_OFFSET                            0
+#define SM_EN_SIZE                              1
+
+/* Bit manipulation macros */
+#define SM_BIT(name)                            (1 << SM_##name##_OFFSET)
+#define SM_BF(name,value)                       (((value) & ((1 << SM_##name##_SIZE) - 1)) << SM_##name##_OFFSET)
+#define SM_BFEXT(name,value)                    (((value) >> SM_##name##_OFFSET) & ((1 << SM_##name##_SIZE) - 1))
+#define SM_BFINS(name,value,old)                (((old) & ~(((1 << SM_##name##_SIZE) - 1) << SM_##name##_OFFSET)) | SM_BF(name,value))
+
+/* Register access macros */
+#define sm_readl(port,reg)                      readl((port)->regs + SM_##reg)
+#define sm_writel(port,reg,value)               writel((value), (port)->regs + SM_##reg)
+
+#endif /* __ASM_AVR32_SM_H__ */
diff --git a/arch/avr32/mm/Makefile b/arch/avr32/mm/Makefile
new file mode 100644
index 00000000000..0066491f90d
--- /dev/null
+++ b/arch/avr32/mm/Makefile
@@ -0,0 +1,6 @@
+#
+# Makefile for the Linux/AVR32 kernel.
+#
+
+obj-y				+= init.o clear_page.o copy_page.o dma-coherent.o
+obj-y				+= ioremap.o cache.o fault.o tlb.o
diff --git a/arch/avr32/mm/cache.c b/arch/avr32/mm/cache.c
new file mode 100644
index 00000000000..450515b245a
--- /dev/null
+++ b/arch/avr32/mm/cache.c
@@ -0,0 +1,150 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/highmem.h>
+#include <linux/unistd.h>
+
+#include <asm/cacheflush.h>
+#include <asm/cachectl.h>
+#include <asm/processor.h>
+#include <asm/uaccess.h>
+
+/*
+ * If you attempt to flush anything more than this, you need superuser
+ * privileges.  The value is completely arbitrary.
+ */
+#define CACHEFLUSH_MAX_LEN	1024
+
+void invalidate_dcache_region(void *start, size_t size)
+{
+	unsigned long v, begin, end, linesz;
+
+	linesz = boot_cpu_data.dcache.linesz;
+
+	//printk("invalidate dcache: %p + %u\n", start, size);
+
+	/* You asked for it, you got it */
+	begin = (unsigned long)start & ~(linesz - 1);
+	end = ((unsigned long)start + size + linesz - 1) & ~(linesz - 1);
+
+	for (v = begin; v < end; v += linesz)
+		invalidate_dcache_line((void *)v);
+}
+
+void clean_dcache_region(void *start, size_t size)
+{
+	unsigned long v, begin, end, linesz;
+
+	linesz = boot_cpu_data.dcache.linesz;
+	begin = (unsigned long)start & ~(linesz - 1);
+	end = ((unsigned long)start + size + linesz - 1) & ~(linesz - 1);
+
+	for (v = begin; v < end; v += linesz)
+		clean_dcache_line((void *)v);
+	flush_write_buffer();
+}
+
+void flush_dcache_region(void *start, size_t size)
+{
+	unsigned long v, begin, end, linesz;
+
+	linesz = boot_cpu_data.dcache.linesz;
+	begin = (unsigned long)start & ~(linesz - 1);
+	end = ((unsigned long)start + size + linesz - 1) & ~(linesz - 1);
+
+	for (v = begin; v < end; v += linesz)
+		flush_dcache_line((void *)v);
+	flush_write_buffer();
+}
+
+void invalidate_icache_region(void *start, size_t size)
+{
+	unsigned long v, begin, end, linesz;
+
+	linesz = boot_cpu_data.icache.linesz;
+	begin = (unsigned long)start & ~(linesz - 1);
+	end = ((unsigned long)start + size + linesz - 1) & ~(linesz - 1);
+
+	for (v = begin; v < end; v += linesz)
+		invalidate_icache_line((void *)v);
+}
+
+static inline void __flush_icache_range(unsigned long start, unsigned long end)
+{
+	unsigned long v, linesz;
+
+	linesz = boot_cpu_data.dcache.linesz;
+	for (v = start; v < end; v += linesz) {
+		clean_dcache_line((void *)v);
+		invalidate_icache_line((void *)v);
+	}
+
+	flush_write_buffer();
+}
+
+/*
+ * This one is called after a module has been loaded.
+ */
+void flush_icache_range(unsigned long start, unsigned long end)
+{
+	unsigned long linesz;
+
+	linesz = boot_cpu_data.dcache.linesz;
+	__flush_icache_range(start & ~(linesz - 1),
+			     (end + linesz - 1) & ~(linesz - 1));
+}
+
+/*
+ * This one is called from do_no_page(), do_swap_page() and install_page().
+ */
+void flush_icache_page(struct vm_area_struct *vma, struct page *page)
+{
+	if (vma->vm_flags & VM_EXEC) {
+		void *v = kmap(page);
+		__flush_icache_range((unsigned long)v, (unsigned long)v + PAGE_SIZE);
+		kunmap(v);
+	}
+}
+
+/*
+ * This one is used by copy_to_user_page()
+ */
+void flush_icache_user_range(struct vm_area_struct *vma, struct page *page,
+			     unsigned long addr, int len)
+{
+	if (vma->vm_flags & VM_EXEC)
+		flush_icache_range(addr, addr + len);
+}
+
+asmlinkage int sys_cacheflush(int operation, void __user *addr, size_t len)
+{
+	int ret;
+
+	if (len > CACHEFLUSH_MAX_LEN) {
+		ret = -EPERM;
+		if (!capable(CAP_SYS_ADMIN))
+			goto out;
+	}
+
+	ret = -EFAULT;
+	if (!access_ok(VERIFY_WRITE, addr, len))
+		goto out;
+
+	switch (operation) {
+	case CACHE_IFLUSH:
+		flush_icache_range((unsigned long)addr,
+				   (unsigned long)addr + len);
+		ret = 0;
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+out:
+	return ret;
+}
diff --git a/arch/avr32/mm/clear_page.S b/arch/avr32/mm/clear_page.S
new file mode 100644
index 00000000000..5d70dca0069
--- /dev/null
+++ b/arch/avr32/mm/clear_page.S
@@ -0,0 +1,25 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/page.h>
+
+/*
+ * clear_page
+ * r12: P1 address (to)
+ */
+	.text
+	.global clear_page
+clear_page:
+	sub	r9, r12, -PAGE_SIZE
+	mov     r10, 0
+	mov	r11, 0
+0:      st.d    r12++, r10
+	cp      r12, r9
+	brne	0b
+	mov     pc, lr
diff --git a/arch/avr32/mm/copy_page.S b/arch/avr32/mm/copy_page.S
new file mode 100644
index 00000000000..c2b3752946b
--- /dev/null
+++ b/arch/avr32/mm/copy_page.S
@@ -0,0 +1,28 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/linkage.h>
+#include <asm/page.h>
+
+/*
+ * copy_page
+ *
+ * r12		to (P1 address)
+ * r11		from (P1 address)
+ * r8-r10	scratch
+ */
+	.text
+	.global copy_page
+copy_page:
+	sub	r10, r11, -(1 << PAGE_SHIFT)
+	/* pref	r11[0] */
+1:	/* pref	r11[8] */
+	ld.d	r8, r11++
+	st.d	r12++, r8
+	cp	r11, r10
+	brlo	1b
+	mov	pc, lr
diff --git a/arch/avr32/mm/dma-coherent.c b/arch/avr32/mm/dma-coherent.c
new file mode 100644
index 00000000000..44ab8a7bdae
--- /dev/null
+++ b/arch/avr32/mm/dma-coherent.c
@@ -0,0 +1,139 @@
+/*
+ *  Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/dma-mapping.h>
+
+#include <asm/addrspace.h>
+#include <asm/cacheflush.h>
+
+void dma_cache_sync(void *vaddr, size_t size, int direction)
+{
+	/*
+	 * No need to sync an uncached area
+	 */
+	if (PXSEG(vaddr) == P2SEG)
+		return;
+
+	switch (direction) {
+	case DMA_FROM_DEVICE:		/* invalidate only */
+		dma_cache_inv(vaddr, size);
+		break;
+	case DMA_TO_DEVICE:		/* writeback only */
+		dma_cache_wback(vaddr, size);
+		break;
+	case DMA_BIDIRECTIONAL:		/* writeback and invalidate */
+		dma_cache_wback_inv(vaddr, size);
+		break;
+	default:
+		BUG();
+	}
+}
+EXPORT_SYMBOL(dma_cache_sync);
+
+static struct page *__dma_alloc(struct device *dev, size_t size,
+				dma_addr_t *handle, gfp_t gfp)
+{
+	struct page *page, *free, *end;
+	int order;
+
+	size = PAGE_ALIGN(size);
+	order = get_order(size);
+
+	page = alloc_pages(gfp, order);
+	if (!page)
+		return NULL;
+	split_page(page, order);
+
+	/*
+	 * When accessing physical memory with valid cache data, we
+	 * get a cache hit even if the virtual memory region is marked
+	 * as uncached.
+	 *
+	 * Since the memory is newly allocated, there is no point in
+	 * doing a writeback. If the previous owner cares, he should
+	 * have flushed the cache before releasing the memory.
+	 */
+	invalidate_dcache_region(phys_to_virt(page_to_phys(page)), size);
+
+	*handle = page_to_bus(page);
+	free = page + (size >> PAGE_SHIFT);
+	end = page + (1 << order);
+
+	/*
+	 * Free any unused pages
+	 */
+	while (free < end) {
+		__free_page(free);
+		free++;
+	}
+
+	return page;
+}
+
+static void __dma_free(struct device *dev, size_t size,
+		       struct page *page, dma_addr_t handle)
+{
+	struct page *end = page + (PAGE_ALIGN(size) >> PAGE_SHIFT);
+
+	while (page < end)
+		__free_page(page++);
+}
+
+void *dma_alloc_coherent(struct device *dev, size_t size,
+			 dma_addr_t *handle, gfp_t gfp)
+{
+	struct page *page;
+	void *ret = NULL;
+
+	page = __dma_alloc(dev, size, handle, gfp);
+	if (page)
+		ret = phys_to_uncached(page_to_phys(page));
+
+	return ret;
+}
+EXPORT_SYMBOL(dma_alloc_coherent);
+
+void dma_free_coherent(struct device *dev, size_t size,
+		       void *cpu_addr, dma_addr_t handle)
+{
+	void *addr = phys_to_cached(uncached_to_phys(cpu_addr));
+	struct page *page;
+
+	pr_debug("dma_free_coherent addr %p (phys %08lx) size %u\n",
+		 cpu_addr, (unsigned long)handle, (unsigned)size);
+	BUG_ON(!virt_addr_valid(addr));
+	page = virt_to_page(addr);
+	__dma_free(dev, size, page, handle);
+}
+EXPORT_SYMBOL(dma_free_coherent);
+
+#if 0
+void *dma_alloc_writecombine(struct device *dev, size_t size,
+			     dma_addr_t *handle, gfp_t gfp)
+{
+	struct page *page;
+
+	page = __dma_alloc(dev, size, handle, gfp);
+
+	/* Now, map the page into P3 with write-combining turned on */
+	return __ioremap(page_to_phys(page), size, _PAGE_BUFFER);
+}
+EXPORT_SYMBOL(dma_alloc_writecombine);
+
+void dma_free_writecombine(struct device *dev, size_t size,
+			   void *cpu_addr, dma_addr_t handle)
+{
+	struct page *page;
+
+	iounmap(cpu_addr);
+
+	page = bus_to_page(handle);
+	__dma_free(dev, size, page, handle);
+}
+EXPORT_SYMBOL(dma_free_writecombine);
+#endif
diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c
new file mode 100644
index 00000000000..678557260a3
--- /dev/null
+++ b/arch/avr32/mm/fault.c
@@ -0,0 +1,315 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * Based on linux/arch/sh/mm/fault.c:
+ *   Copyright (C) 1999  Niibe Yutaka
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/pagemap.h>
+
+#include <asm/kdebug.h>
+#include <asm/mmu_context.h>
+#include <asm/sysreg.h>
+#include <asm/uaccess.h>
+#include <asm/tlb.h>
+
+#ifdef DEBUG
+static void dump_code(unsigned long pc)
+{
+	char *p = (char *)pc;
+	char val;
+	int i;
+
+
+	printk(KERN_DEBUG "Code:");
+	for (i = 0; i < 16; i++) {
+		if (__get_user(val, p + i))
+			break;
+		printk(" %02x", val);
+	}
+	printk("\n");
+}
+#endif
+
+#ifdef CONFIG_KPROBES
+ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
+
+/* Hook to register for page fault notifications */
+int register_page_fault_notifier(struct notifier_block *nb)
+{
+	return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
+}
+
+int unregister_page_fault_notifier(struct notifier_block *nb)
+{
+	return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
+}
+
+static inline int notify_page_fault(enum die_val val, struct pt_regs *regs,
+				    int trap, int sig)
+{
+	struct die_args args = {
+		.regs = regs,
+		.trapnr = trap,
+	};
+	return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args);
+}
+#else
+static inline int notify_page_fault(enum die_val val, struct pt_regs *regs,
+				    int trap, int sig)
+{
+	return NOTIFY_DONE;
+}
+#endif
+
+/*
+ * This routine handles page faults. It determines the address and the
+ * problem, and then passes it off to one of the appropriate routines.
+ *
+ * ecr is the Exception Cause Register. Possible values are:
+ *   5:  Page not found (instruction access)
+ *   6:  Protection fault (instruction access)
+ *   12: Page not found (read access)
+ *   13: Page not found (write access)
+ *   14: Protection fault (read access)
+ *   15: Protection fault (write access)
+ */
+asmlinkage void do_page_fault(unsigned long ecr, struct pt_regs *regs)
+{
+	struct task_struct *tsk;
+	struct mm_struct *mm;
+	struct vm_area_struct *vma;
+	const struct exception_table_entry *fixup;
+	unsigned long address;
+	unsigned long page;
+	int writeaccess = 0;
+
+	if (notify_page_fault(DIE_PAGE_FAULT, regs,
+			      ecr, SIGSEGV) == NOTIFY_STOP)
+		return;
+
+	address = sysreg_read(TLBEAR);
+
+	tsk = current;
+	mm = tsk->mm;
+
+	/*
+	 * If we're in an interrupt or have no user context, we must
+	 * not take the fault...
+	 */
+	if (in_atomic() || !mm || regs->sr & SYSREG_BIT(GM))
+		goto no_context;
+
+	local_irq_enable();
+
+	down_read(&mm->mmap_sem);
+
+	vma = find_vma(mm, address);
+	if (!vma)
+		goto bad_area;
+	if (vma->vm_start <= address)
+		goto good_area;
+	if (!(vma->vm_flags & VM_GROWSDOWN))
+		goto bad_area;
+	if (expand_stack(vma, address))
+		goto bad_area;
+
+	/*
+	 * Ok, we have a good vm_area for this memory access, so we
+	 * can handle it...
+	 */
+good_area:
+	//pr_debug("good area: vm_flags = 0x%lx\n", vma->vm_flags);
+	switch (ecr) {
+	case ECR_PROTECTION_X:
+	case ECR_TLB_MISS_X:
+		if (!(vma->vm_flags & VM_EXEC))
+			goto bad_area;
+		break;
+	case ECR_PROTECTION_R:
+	case ECR_TLB_MISS_R:
+		if (!(vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)))
+			goto bad_area;
+		break;
+	case ECR_PROTECTION_W:
+	case ECR_TLB_MISS_W:
+		if (!(vma->vm_flags & VM_WRITE))
+			goto bad_area;
+		writeaccess = 1;
+		break;
+	default:
+		panic("Unhandled case %lu in do_page_fault!", ecr);
+	}
+
+	/*
+	 * If for any reason at all we couldn't handle the fault, make
+	 * sure we exit gracefully rather than endlessly redo the
+	 * fault.
+	 */
+survive:
+	switch (handle_mm_fault(mm, vma, address, writeaccess)) {
+	case VM_FAULT_MINOR:
+		tsk->min_flt++;
+		break;
+	case VM_FAULT_MAJOR:
+		tsk->maj_flt++;
+		break;
+	case VM_FAULT_SIGBUS:
+		goto do_sigbus;
+	case VM_FAULT_OOM:
+		goto out_of_memory;
+	default:
+		BUG();
+	}
+
+	up_read(&mm->mmap_sem);
+	return;
+
+	/*
+	 * Something tried to access memory that isn't in our memory
+	 * map. Fix it, but check if it's kernel or user first...
+	 */
+bad_area:
+	pr_debug("Bad area [%s:%u]: addr %08lx, ecr %lu\n",
+		 tsk->comm, tsk->pid, address, ecr);
+
+	up_read(&mm->mmap_sem);
+
+	if (user_mode(regs)) {
+		/* Hmm...we have to pass address and ecr somehow... */
+		/* tsk->thread.address = address;
+		   tsk->thread.error_code = ecr; */
+#ifdef DEBUG
+		show_regs(regs);
+		dump_code(regs->pc);
+
+		page = sysreg_read(PTBR);
+		printk("ptbr = %08lx", page);
+		if (page) {
+			page = ((unsigned long *)page)[address >> 22];
+			printk(" pgd = %08lx", page);
+			if (page & _PAGE_PRESENT) {
+				page &= PAGE_MASK;
+				address &= 0x003ff000;
+				page = ((unsigned long *)__va(page))[address >> PAGE_SHIFT];
+				printk(" pte = %08lx\n", page);
+			}
+		}
+#endif
+		pr_debug("Sending SIGSEGV to PID %d...\n",
+			tsk->pid);
+		force_sig(SIGSEGV, tsk);
+		return;
+	}
+
+no_context:
+	pr_debug("No context\n");
+
+	/* Are we prepared to handle this kernel fault? */
+	fixup = search_exception_tables(regs->pc);
+	if (fixup) {
+		regs->pc = fixup->fixup;
+		pr_debug("Found fixup at %08lx\n", fixup->fixup);
+		return;
+	}
+
+	/*
+	 * Oops. The kernel tried to access some bad page. We'll have
+	 * to terminate things with extreme prejudice.
+	 */
+	if (address < PAGE_SIZE)
+		printk(KERN_ALERT
+		       "Unable to handle kernel NULL pointer dereference");
+	else
+		printk(KERN_ALERT
+		       "Unable to handle kernel paging request");
+	printk(" at virtual address %08lx\n", address);
+	printk(KERN_ALERT "pc = %08lx\n", regs->pc);
+
+	page = sysreg_read(PTBR);
+	printk(KERN_ALERT "ptbr = %08lx", page);
+	if (page) {
+		page = ((unsigned long *)page)[address >> 22];
+		printk(" pgd = %08lx", page);
+		if (page & _PAGE_PRESENT) {
+			page &= PAGE_MASK;
+			address &= 0x003ff000;
+			page = ((unsigned long *)__va(page))[address >> PAGE_SHIFT];
+			printk(" pte = %08lx\n", page);
+		}
+	}
+	die("\nOops", regs, ecr);
+	do_exit(SIGKILL);
+
+	/*
+	 * We ran out of memory, or some other thing happened to us
+	 * that made us unable to handle the page fault gracefully.
+	 */
+out_of_memory:
+	printk("Out of memory\n");
+	up_read(&mm->mmap_sem);
+	if (current->pid == 1) {
+		yield();
+		down_read(&mm->mmap_sem);
+		goto survive;
+	}
+	printk("VM: Killing process %s\n", tsk->comm);
+	if (user_mode(regs))
+		do_exit(SIGKILL);
+	goto no_context;
+
+do_sigbus:
+	up_read(&mm->mmap_sem);
+
+	/*
+	 * Send a sigbus, regardless of whether we were in kernel or
+	 * user mode.
+	 */
+	/* address, error_code, trap_no, ... */
+#ifdef DEBUG
+	show_regs(regs);
+	dump_code(regs->pc);
+#endif
+	pr_debug("Sending SIGBUS to PID %d...\n", tsk->pid);
+	force_sig(SIGBUS, tsk);
+
+	/* Kernel mode? Handle exceptions or die */
+	if (!user_mode(regs))
+		goto no_context;
+}
+
+asmlinkage void do_bus_error(unsigned long addr, int write_access,
+			     struct pt_regs *regs)
+{
+	printk(KERN_ALERT
+	       "Bus error at physical address 0x%08lx (%s access)\n",
+	       addr, write_access ? "write" : "read");
+	printk(KERN_INFO "DTLB dump:\n");
+	dump_dtlb();
+	die("Bus Error", regs, write_access);
+	do_exit(SIGKILL);
+}
+
+/*
+ * This functionality is currently not possible to implement because
+ * we're using segmentation to ensure a fixed mapping of the kernel
+ * virtual address space.
+ *
+ * It would be possible to implement this, but it would require us to
+ * disable segmentation at startup and load the kernel mappings into
+ * the TLB like any other pages. There will be lots of trickery to
+ * avoid recursive invocation of the TLB miss handler, though...
+ */
+#ifdef CONFIG_DEBUG_PAGEALLOC
+void kernel_map_pages(struct page *page, int numpages, int enable)
+{
+
+}
+EXPORT_SYMBOL(kernel_map_pages);
+#endif
diff --git a/arch/avr32/mm/init.c b/arch/avr32/mm/init.c
new file mode 100644
index 00000000000..3e6c4103980
--- /dev/null
+++ b/arch/avr32/mm/init.c
@@ -0,0 +1,480 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/init.h>
+#include <linux/initrd.h>
+#include <linux/mmzone.h>
+#include <linux/bootmem.h>
+#include <linux/pagemap.h>
+#include <linux/pfn.h>
+#include <linux/nodemask.h>
+
+#include <asm/page.h>
+#include <asm/mmu_context.h>
+#include <asm/tlb.h>
+#include <asm/io.h>
+#include <asm/dma.h>
+#include <asm/setup.h>
+#include <asm/sections.h>
+
+DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+
+pgd_t swapper_pg_dir[PTRS_PER_PGD];
+
+struct page *empty_zero_page;
+
+/*
+ * Cache of MMU context last used.
+ */
+unsigned long mmu_context_cache = NO_CONTEXT;
+
+#define START_PFN	(NODE_DATA(0)->bdata->node_boot_start >> PAGE_SHIFT)
+#define MAX_LOW_PFN	(NODE_DATA(0)->bdata->node_low_pfn)
+
+void show_mem(void)
+{
+	int total = 0, reserved = 0, cached = 0;
+	int slab = 0, free = 0, shared = 0;
+	pg_data_t *pgdat;
+
+	printk("Mem-info:\n");
+	show_free_areas();
+
+	for_each_online_pgdat(pgdat) {
+		struct page *page, *end;
+
+		page = pgdat->node_mem_map;
+		end = page + pgdat->node_spanned_pages;
+
+		do {
+			total++;
+			if (PageReserved(page))
+				reserved++;
+			else if (PageSwapCache(page))
+				cached++;
+			else if (PageSlab(page))
+				slab++;
+			else if (!page_count(page))
+				free++;
+			else
+				shared += page_count(page) - 1;
+			page++;
+		} while (page < end);
+	}
+
+	printk ("%d pages of RAM\n", total);
+	printk ("%d free pages\n", free);
+	printk ("%d reserved pages\n", reserved);
+	printk ("%d slab pages\n", slab);
+	printk ("%d pages shared\n", shared);
+	printk ("%d pages swap cached\n", cached);
+}
+
+static void __init print_memory_map(const char *what,
+				    struct tag_mem_range *mem)
+{
+	printk ("%s:\n", what);
+	for (; mem; mem = mem->next) {
+		printk ("  %08lx - %08lx\n",
+			(unsigned long)mem->addr,
+			(unsigned long)(mem->addr + mem->size));
+	}
+}
+
+#define MAX_LOWMEM	HIGHMEM_START
+#define MAX_LOWMEM_PFN	PFN_DOWN(MAX_LOWMEM)
+
+/*
+ * Sort a list of memory regions in-place by ascending address.
+ *
+ * We're using bubble sort because we only have singly linked lists
+ * with few elements.
+ */
+static void __init sort_mem_list(struct tag_mem_range **pmem)
+{
+	int done;
+	struct tag_mem_range **a, **b;
+
+	if (!*pmem)
+		return;
+
+	do {
+		done = 1;
+		a = pmem, b = &(*pmem)->next;
+		while (*b) {
+			if ((*a)->addr > (*b)->addr) {
+				struct tag_mem_range *tmp;
+				tmp = (*b)->next;
+				(*b)->next = *a;
+				*a = *b;
+				*b = tmp;
+				done = 0;
+			}
+			a = &(*a)->next;
+			b = &(*a)->next;
+		}
+	} while (!done);
+}
+
+/*
+ * Find a free memory region large enough for storing the
+ * bootmem bitmap.
+ */
+static unsigned long __init
+find_bootmap_pfn(const struct tag_mem_range *mem)
+{
+	unsigned long bootmap_pages, bootmap_len;
+	unsigned long node_pages = PFN_UP(mem->size);
+	unsigned long bootmap_addr = mem->addr;
+	struct tag_mem_range *reserved = mem_reserved;
+	struct tag_mem_range *ramdisk = mem_ramdisk;
+	unsigned long kern_start = virt_to_phys(_stext);
+	unsigned long kern_end = virt_to_phys(_end);
+
+	bootmap_pages = bootmem_bootmap_pages(node_pages);
+	bootmap_len = bootmap_pages << PAGE_SHIFT;
+
+	/*
+	 * Find a large enough region without reserved pages for
+	 * storing the bootmem bitmap. We can take advantage of the
+	 * fact that all lists have been sorted.
+	 *
+	 * We have to check explicitly reserved regions as well as the
+	 * kernel image and any RAMDISK images...
+	 *
+	 * Oh, and we have to make sure we don't overwrite the taglist
+	 * since we're going to use it until the bootmem allocator is
+	 * fully up and running.
+	 */
+	while (1) {
+		if ((bootmap_addr < kern_end) &&
+		    ((bootmap_addr + bootmap_len) > kern_start))
+			bootmap_addr = kern_end;
+
+		while (reserved &&
+		       (bootmap_addr >= (reserved->addr + reserved->size)))
+			reserved = reserved->next;
+
+		if (reserved &&
+		    ((bootmap_addr + bootmap_len) >= reserved->addr)) {
+			bootmap_addr = reserved->addr + reserved->size;
+			continue;
+		}
+
+		while (ramdisk &&
+		       (bootmap_addr >= (ramdisk->addr + ramdisk->size)))
+			ramdisk = ramdisk->next;
+
+		if (!ramdisk ||
+		    ((bootmap_addr + bootmap_len) < ramdisk->addr))
+			break;
+
+		bootmap_addr = ramdisk->addr + ramdisk->size;
+	}
+
+	if ((PFN_UP(bootmap_addr) + bootmap_len) >= (mem->addr + mem->size))
+		return ~0UL;
+
+	return PFN_UP(bootmap_addr);
+}
+
+void __init setup_bootmem(void)
+{
+	unsigned bootmap_size;
+	unsigned long first_pfn, bootmap_pfn, pages;
+	unsigned long max_pfn, max_low_pfn;
+	unsigned long kern_start = virt_to_phys(_stext);
+	unsigned long kern_end = virt_to_phys(_end);
+	unsigned node = 0;
+	struct tag_mem_range *bank, *res;
+
+	sort_mem_list(&mem_phys);
+	sort_mem_list(&mem_reserved);
+
+	print_memory_map("Physical memory", mem_phys);
+	print_memory_map("Reserved memory", mem_reserved);
+
+	nodes_clear(node_online_map);
+
+	if (mem_ramdisk) {
+#ifdef CONFIG_BLK_DEV_INITRD
+		initrd_start = __va(mem_ramdisk->addr);
+		initrd_end = initrd_start + mem_ramdisk->size;
+
+		print_memory_map("RAMDISK images", mem_ramdisk);
+		if (mem_ramdisk->next)
+			printk(KERN_WARNING
+			       "Warning: Only the first RAMDISK image "
+			       "will be used\n");
+		sort_mem_list(&mem_ramdisk);
+#else
+		printk(KERN_WARNING "RAM disk image present, but "
+		       "no initrd support in kernel!\n");
+#endif
+	}
+
+	if (mem_phys->next)
+		printk(KERN_WARNING "Only using first memory bank\n");
+
+	for (bank = mem_phys; bank; bank = NULL) {
+		first_pfn = PFN_UP(bank->addr);
+		max_low_pfn = max_pfn = PFN_DOWN(bank->addr + bank->size);
+		bootmap_pfn = find_bootmap_pfn(bank);
+		if (bootmap_pfn > max_pfn)
+			panic("No space for bootmem bitmap!\n");
+
+		if (max_low_pfn > MAX_LOWMEM_PFN) {
+			max_low_pfn = MAX_LOWMEM_PFN;
+#ifndef CONFIG_HIGHMEM
+			/*
+			 * Lowmem is memory that can be addressed
+			 * directly through P1/P2
+			 */
+			printk(KERN_WARNING
+			       "Node %u: Only %ld MiB of memory will be used.\n",
+			       node, MAX_LOWMEM >> 20);
+			printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
+#else
+#error HIGHMEM is not supported by AVR32 yet
+#endif
+		}
+
+		/* Initialize the boot-time allocator with low memory only. */
+		bootmap_size = init_bootmem_node(NODE_DATA(node), bootmap_pfn,
+						 first_pfn, max_low_pfn);
+
+		printk("Node %u: bdata = %p, bdata->node_bootmem_map = %p\n",
+		       node, NODE_DATA(node)->bdata,
+		       NODE_DATA(node)->bdata->node_bootmem_map);
+
+		/*
+		 * Register fully available RAM pages with the bootmem
+		 * allocator.
+		 */
+		pages = max_low_pfn - first_pfn;
+		free_bootmem_node (NODE_DATA(node), PFN_PHYS(first_pfn),
+				   PFN_PHYS(pages));
+
+		/*
+		 * Reserve space for the kernel image (if present in
+		 * this node)...
+		 */
+		if ((kern_start >= PFN_PHYS(first_pfn)) &&
+		    (kern_start < PFN_PHYS(max_pfn))) {
+			printk("Node %u: Kernel image %08lx - %08lx\n",
+			       node, kern_start, kern_end);
+			reserve_bootmem_node(NODE_DATA(node), kern_start,
+					     kern_end - kern_start);
+		}
+
+		/* ...the bootmem bitmap... */
+		reserve_bootmem_node(NODE_DATA(node),
+				     PFN_PHYS(bootmap_pfn),
+				     bootmap_size);
+
+		/* ...any RAMDISK images... */
+		for (res = mem_ramdisk; res; res = res->next) {
+			if (res->addr > PFN_PHYS(max_pfn))
+				break;
+
+			if (res->addr >= PFN_PHYS(first_pfn)) {
+				printk("Node %u: RAMDISK %08lx - %08lx\n",
+				       node,
+				       (unsigned long)res->addr,
+				       (unsigned long)(res->addr + res->size));
+				reserve_bootmem_node(NODE_DATA(node),
+						     res->addr, res->size);
+			}
+		}
+
+		/* ...and any other reserved regions. */
+		for (res = mem_reserved; res; res = res->next) {
+			if (res->addr > PFN_PHYS(max_pfn))
+				break;
+
+			if (res->addr >= PFN_PHYS(first_pfn)) {
+				printk("Node %u: Reserved %08lx - %08lx\n",
+				       node,
+				       (unsigned long)res->addr,
+				       (unsigned long)(res->addr + res->size));
+				reserve_bootmem_node(NODE_DATA(node),
+						     res->addr, res->size);
+			}
+		}
+
+		node_set_online(node);
+	}
+}
+
+/*
+ * paging_init() sets up the page tables
+ *
+ * This routine also unmaps the page at virtual kernel address 0, so
+ * that we can trap those pesky NULL-reference errors in the kernel.
+ */
+void __init paging_init(void)
+{
+	extern unsigned long _evba;
+	void *zero_page;
+	int nid;
+
+	/*
+	 * Make sure we can handle exceptions before enabling
+	 * paging. Not that we should ever _get_ any exceptions this
+	 * early, but you never know...
+	 */
+	printk("Exception vectors start at %p\n", &_evba);
+	sysreg_write(EVBA, (unsigned long)&_evba);
+
+	/*
+	 * Since we are ready to handle exceptions now, we should let
+	 * the CPU generate them...
+	 */
+	__asm__ __volatile__ ("csrf %0" : : "i"(SR_EM_BIT));
+
+	/*
+	 * Allocate the zero page. The allocator will panic if it
+	 * can't satisfy the request, so no need to check.
+	 */
+	zero_page = alloc_bootmem_low_pages_node(NODE_DATA(0),
+						 PAGE_SIZE);
+
+	{
+		pgd_t *pg_dir;
+		int i;
+
+		pg_dir = swapper_pg_dir;
+		sysreg_write(PTBR, (unsigned long)pg_dir);
+
+		for (i = 0; i < PTRS_PER_PGD; i++)
+			pgd_val(pg_dir[i]) = 0;
+
+		enable_mmu();
+		printk ("CPU: Paging enabled\n");
+	}
+
+	for_each_online_node(nid) {
+		pg_data_t *pgdat = NODE_DATA(nid);
+		unsigned long zones_size[MAX_NR_ZONES];
+		unsigned long low, start_pfn;
+
+		start_pfn = pgdat->bdata->node_boot_start;
+		start_pfn >>= PAGE_SHIFT;
+		low = pgdat->bdata->node_low_pfn;
+
+		memset(zones_size, 0, sizeof(zones_size));
+		zones_size[ZONE_NORMAL] = low - start_pfn;
+
+		printk("Node %u: start_pfn = 0x%lx, low = 0x%lx\n",
+		       nid, start_pfn, low);
+
+		free_area_init_node(nid, pgdat, zones_size, start_pfn, NULL);
+
+		printk("Node %u: mem_map starts at %p\n",
+		       pgdat->node_id, pgdat->node_mem_map);
+	}
+
+	mem_map = NODE_DATA(0)->node_mem_map;
+
+	memset(zero_page, 0, PAGE_SIZE);
+	empty_zero_page = virt_to_page(zero_page);
+	flush_dcache_page(empty_zero_page);
+}
+
+void __init mem_init(void)
+{
+	int codesize, reservedpages, datasize, initsize;
+	int nid, i;
+
+	reservedpages = 0;
+	high_memory = NULL;
+
+	/* this will put all low memory onto the freelists */
+	for_each_online_node(nid) {
+		pg_data_t *pgdat = NODE_DATA(nid);
+		unsigned long node_pages = 0;
+		void *node_high_memory;
+
+		num_physpages += pgdat->node_present_pages;
+
+		if (pgdat->node_spanned_pages != 0)
+			node_pages = free_all_bootmem_node(pgdat);
+
+		totalram_pages += node_pages;
+
+		for (i = 0; i < node_pages; i++)
+			if (PageReserved(pgdat->node_mem_map + i))
+				reservedpages++;
+
+		node_high_memory = (void *)((pgdat->node_start_pfn
+					     + pgdat->node_spanned_pages)
+					    << PAGE_SHIFT);
+		if (node_high_memory > high_memory)
+			high_memory = node_high_memory;
+	}
+
+	max_mapnr = MAP_NR(high_memory);
+
+	codesize = (unsigned long)_etext - (unsigned long)_text;
+	datasize = (unsigned long)_edata - (unsigned long)_data;
+	initsize = (unsigned long)__init_end - (unsigned long)__init_begin;
+
+	printk ("Memory: %luk/%luk available (%dk kernel code, "
+		"%dk reserved, %dk data, %dk init)\n",
+		(unsigned long)nr_free_pages() << (PAGE_SHIFT - 10),
+		totalram_pages << (PAGE_SHIFT - 10),
+		codesize >> 10,
+		reservedpages << (PAGE_SHIFT - 10),
+		datasize >> 10,
+		initsize >> 10);
+}
+
+static inline void free_area(unsigned long addr, unsigned long end, char *s)
+{
+	unsigned int size = (end - addr) >> 10;
+
+	for (; addr < end; addr += PAGE_SIZE) {
+		struct page *page = virt_to_page(addr);
+		ClearPageReserved(page);
+		init_page_count(page);
+		free_page(addr);
+		totalram_pages++;
+	}
+
+	if (size && s)
+		printk(KERN_INFO "Freeing %s memory: %dK (%lx - %lx)\n",
+		       s, size, end - (size << 10), end);
+}
+
+void free_initmem(void)
+{
+	free_area((unsigned long)__init_begin, (unsigned long)__init_end,
+		  "init");
+}
+
+#ifdef CONFIG_BLK_DEV_INITRD
+
+static int keep_initrd;
+
+void free_initrd_mem(unsigned long start, unsigned long end)
+{
+	if (!keep_initrd)
+		free_area(start, end, "initrd");
+}
+
+static int __init keepinitrd_setup(char *__unused)
+{
+	keep_initrd = 1;
+	return 1;
+}
+
+__setup("keepinitrd", keepinitrd_setup);
+#endif
diff --git a/arch/avr32/mm/ioremap.c b/arch/avr32/mm/ioremap.c
new file mode 100644
index 00000000000..536021877df
--- /dev/null
+++ b/arch/avr32/mm/ioremap.c
@@ -0,0 +1,197 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+
+#include <asm/io.h>
+#include <asm/pgtable.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#include <asm/addrspace.h>
+
+static inline int remap_area_pte(pte_t *pte, unsigned long address,
+				  unsigned long end, unsigned long phys_addr,
+				  pgprot_t prot)
+{
+	unsigned long pfn;
+
+	pfn = phys_addr >> PAGE_SHIFT;
+	do {
+		WARN_ON(!pte_none(*pte));
+
+		set_pte(pte, pfn_pte(pfn, prot));
+		address += PAGE_SIZE;
+		pfn++;
+		pte++;
+	} while (address && (address < end));
+
+	return 0;
+}
+
+static inline int remap_area_pmd(pmd_t *pmd, unsigned long address,
+				 unsigned long end, unsigned long phys_addr,
+				 pgprot_t prot)
+{
+	unsigned long next;
+
+	phys_addr -= address;
+
+	do {
+		pte_t *pte = pte_alloc_kernel(pmd, address);
+		if (!pte)
+			return -ENOMEM;
+
+		next = (address + PMD_SIZE) & PMD_MASK;
+		if (remap_area_pte(pte, address, next,
+				   address + phys_addr, prot))
+			return -ENOMEM;
+
+		address = next;
+		pmd++;
+	} while (address && (address < end));
+	return 0;
+}
+
+static int remap_area_pud(pud_t *pud, unsigned long address,
+			  unsigned long end, unsigned long phys_addr,
+			  pgprot_t prot)
+{
+	unsigned long next;
+
+	phys_addr -= address;
+
+	do {
+		pmd_t *pmd = pmd_alloc(&init_mm, pud, address);
+		if (!pmd)
+			return -ENOMEM;
+		next = (address + PUD_SIZE) & PUD_MASK;
+		if (remap_area_pmd(pmd, address, next,
+				   phys_addr + address, prot))
+			return -ENOMEM;
+
+		address = next;
+		pud++;
+	} while (address && address < end);
+
+	return 0;
+}
+
+static int remap_area_pages(unsigned long address, unsigned long phys_addr,
+			    size_t size, pgprot_t prot)
+{
+	unsigned long end = address + size;
+	unsigned long next;
+	pgd_t *pgd;
+	int err = 0;
+
+	phys_addr -= address;
+
+	pgd = pgd_offset_k(address);
+	flush_cache_all();
+	BUG_ON(address >= end);
+
+	spin_lock(&init_mm.page_table_lock);
+	do {
+		pud_t *pud = pud_alloc(&init_mm, pgd, address);
+
+		err = -ENOMEM;
+		if (!pud)
+			break;
+
+		next = (address + PGDIR_SIZE) & PGDIR_MASK;
+		if (next < address || next > end)
+			next = end;
+		err = remap_area_pud(pud, address, next,
+				     phys_addr + address, prot);
+		if (err)
+			break;
+
+		address = next;
+		pgd++;
+	} while (address && (address < end));
+
+	spin_unlock(&init_mm.page_table_lock);
+	flush_tlb_all();
+	return err;
+}
+
+/*
+ * Re-map an arbitrary physical address space into the kernel virtual
+ * address space. Needed when the kernel wants to access physical
+ * memory directly.
+ */
+void __iomem *__ioremap(unsigned long phys_addr, size_t size,
+			unsigned long flags)
+{
+	void *addr;
+	struct vm_struct *area;
+	unsigned long offset, last_addr;
+	pgprot_t prot;
+
+	/*
+	 * Check if we can simply use the P4 segment. This area is
+	 * uncacheable, so if caching/buffering is requested, we can't
+	 * use it.
+	 */
+	if ((phys_addr >= P4SEG) && (flags == 0))
+		return (void __iomem *)phys_addr;
+
+	/* Don't allow wraparound or zero size */
+	last_addr = phys_addr + size - 1;
+	if (!size || last_addr < phys_addr)
+		return NULL;
+
+	/*
+	 * XXX: When mapping regular RAM, we'd better make damn sure
+	 * it's never used for anything else.  But this is really the
+	 * caller's responsibility...
+	 */
+	if (PHYSADDR(P2SEGADDR(phys_addr)) == phys_addr)
+		return (void __iomem *)P2SEGADDR(phys_addr);
+
+	/* Mappings have to be page-aligned */
+	offset = phys_addr & ~PAGE_MASK;
+	phys_addr &= PAGE_MASK;
+	size = PAGE_ALIGN(last_addr + 1) - phys_addr;
+
+	prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY
+			| _PAGE_ACCESSED | _PAGE_TYPE_SMALL | flags);
+
+	/*
+	 * Ok, go for it..
+	 */
+	area = get_vm_area(size, VM_IOREMAP);
+	if (!area)
+		return NULL;
+	area->phys_addr = phys_addr;
+	addr = area->addr;
+	if (remap_area_pages((unsigned long)addr, phys_addr, size, prot)) {
+		vunmap(addr);
+		return NULL;
+	}
+
+	return (void __iomem *)(offset + (char *)addr);
+}
+EXPORT_SYMBOL(__ioremap);
+
+void __iounmap(void __iomem *addr)
+{
+	struct vm_struct *p;
+
+	if ((unsigned long)addr >= P4SEG)
+		return;
+
+	p = remove_vm_area((void *)(PAGE_MASK & (unsigned long __force)addr));
+	if (unlikely(!p)) {
+		printk (KERN_ERR "iounmap: bad address %p\n", addr);
+		return;
+	}
+
+	kfree (p);
+}
+EXPORT_SYMBOL(__iounmap);
diff --git a/arch/avr32/mm/tlb.c b/arch/avr32/mm/tlb.c
new file mode 100644
index 00000000000..5d0523bbe29
--- /dev/null
+++ b/arch/avr32/mm/tlb.c
@@ -0,0 +1,378 @@
+/*
+ * AVR32 TLB operations
+ *
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/mm.h>
+
+#include <asm/mmu_context.h>
+
+#define _TLBEHI_I	0x100
+
+void show_dtlb_entry(unsigned int index)
+{
+	unsigned int tlbehi, tlbehi_save, tlbelo, mmucr, mmucr_save, flags;
+
+	local_irq_save(flags);
+	mmucr_save = sysreg_read(MMUCR);
+	tlbehi_save = sysreg_read(TLBEHI);
+	mmucr = mmucr_save & 0x13;
+	mmucr |= index << 14;
+	sysreg_write(MMUCR, mmucr);
+
+	asm volatile("tlbr" : : : "memory");
+	cpu_sync_pipeline();
+
+	tlbehi = sysreg_read(TLBEHI);
+	tlbelo = sysreg_read(TLBELO);
+
+	printk("%2u: %c %c %02x   %05x %05x %o  %o  %c %c %c %c\n",
+	       index,
+	       (tlbehi & 0x200)?'1':'0',
+	       (tlbelo & 0x100)?'1':'0',
+	       (tlbehi & 0xff),
+	       (tlbehi >> 12), (tlbelo >> 12),
+	       (tlbelo >> 4) & 7, (tlbelo >> 2) & 3,
+	       (tlbelo & 0x200)?'1':'0',
+	       (tlbelo & 0x080)?'1':'0',
+	       (tlbelo & 0x001)?'1':'0',
+	       (tlbelo & 0x002)?'1':'0');
+
+	sysreg_write(MMUCR, mmucr_save);
+	sysreg_write(TLBEHI, tlbehi_save);
+	cpu_sync_pipeline();
+	local_irq_restore(flags);
+}
+
+void dump_dtlb(void)
+{
+	unsigned int i;
+
+	printk("ID  V G ASID VPN   PFN   AP SZ C B W D\n");
+	for (i = 0; i < 32; i++)
+		show_dtlb_entry(i);
+}
+
+static unsigned long last_mmucr;
+
+static inline void set_replacement_pointer(unsigned shift)
+{
+	unsigned long mmucr, mmucr_save;
+
+	mmucr = mmucr_save = sysreg_read(MMUCR);
+
+	/* Does this mapping already exist? */
+	__asm__ __volatile__(
+		"	tlbs\n"
+		"	mfsr %0, %1"
+		: "=r"(mmucr)
+		: "i"(SYSREG_MMUCR));
+
+	if (mmucr & SYSREG_BIT(MMUCR_N)) {
+		/* Not found -- pick a not-recently-accessed entry */
+		unsigned long rp;
+		unsigned long tlbar = sysreg_read(TLBARLO);
+
+		rp = 32 - fls(tlbar);
+		if (rp == 32) {
+			rp = 0;
+			sysreg_write(TLBARLO, -1L);
+		}
+
+		mmucr &= 0x13;
+		mmucr |= (rp << shift);
+
+		sysreg_write(MMUCR, mmucr);
+	}
+
+	last_mmucr = mmucr;
+}
+
+static void update_dtlb(unsigned long address, pte_t pte, unsigned long asid)
+{
+	unsigned long vpn;
+
+	vpn = (address & MMU_VPN_MASK) | _TLBEHI_VALID | asid;
+	sysreg_write(TLBEHI, vpn);
+	cpu_sync_pipeline();
+
+	set_replacement_pointer(14);
+
+	sysreg_write(TLBELO, pte_val(pte) & _PAGE_FLAGS_HARDWARE_MASK);
+
+	/* Let's go */
+	asm volatile("nop\n\ttlbw" : : : "memory");
+	cpu_sync_pipeline();
+}
+
+void update_mmu_cache(struct vm_area_struct *vma,
+		      unsigned long address, pte_t pte)
+{
+	unsigned long flags;
+
+	/* ptrace may call this routine */
+	if (vma && current->active_mm != vma->vm_mm)
+		return;
+
+	local_irq_save(flags);
+	update_dtlb(address, pte, get_asid());
+	local_irq_restore(flags);
+}
+
+void __flush_tlb_page(unsigned long asid, unsigned long page)
+{
+	unsigned long mmucr, tlbehi;
+
+	page |= asid;
+	sysreg_write(TLBEHI, page);
+	cpu_sync_pipeline();
+	asm volatile("tlbs");
+	mmucr = sysreg_read(MMUCR);
+
+	if (!(mmucr & SYSREG_BIT(MMUCR_N))) {
+		unsigned long tlbarlo;
+		unsigned long entry;
+
+		/* Clear the "valid" bit */
+		tlbehi = sysreg_read(TLBEHI);
+		tlbehi &= ~_TLBEHI_VALID;
+		sysreg_write(TLBEHI, tlbehi);
+		cpu_sync_pipeline();
+
+		/* mark the entry as "not accessed" */
+		entry = (mmucr >> 14) & 0x3f;
+		tlbarlo = sysreg_read(TLBARLO);
+		tlbarlo |= (0x80000000 >> entry);
+		sysreg_write(TLBARLO, tlbarlo);
+
+		/* update the entry with valid bit clear */
+		asm volatile("tlbw");
+		cpu_sync_pipeline();
+	}
+}
+
+void flush_tlb_page(struct vm_area_struct *vma, unsigned long page)
+{
+	if (vma->vm_mm && vma->vm_mm->context != NO_CONTEXT) {
+		unsigned long flags, asid;
+		unsigned long saved_asid = MMU_NO_ASID;
+
+		asid = vma->vm_mm->context & MMU_CONTEXT_ASID_MASK;
+		page &= PAGE_MASK;
+
+		local_irq_save(flags);
+		if (vma->vm_mm != current->mm) {
+			saved_asid = get_asid();
+			set_asid(asid);
+		}
+
+		__flush_tlb_page(asid, page);
+
+		if (saved_asid != MMU_NO_ASID)
+			set_asid(saved_asid);
+		local_irq_restore(flags);
+	}
+}
+
+void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
+		     unsigned long end)
+{
+	struct mm_struct *mm = vma->vm_mm;
+
+	if (mm->context != NO_CONTEXT) {
+		unsigned long flags;
+		int size;
+
+		local_irq_save(flags);
+		size = (end - start + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
+		if (size > (MMU_DTLB_ENTRIES / 4)) { /* Too many entries to flush */
+			mm->context = NO_CONTEXT;
+			if (mm == current->mm)
+				activate_context(mm);
+		} else {
+			unsigned long asid = mm->context & MMU_CONTEXT_ASID_MASK;
+			unsigned long saved_asid = MMU_NO_ASID;
+
+			start &= PAGE_MASK;
+			end += (PAGE_SIZE - 1);
+			end &= PAGE_MASK;
+			if (mm != current->mm) {
+				saved_asid = get_asid();
+				set_asid(asid);
+			}
+
+			while (start < end) {
+				__flush_tlb_page(asid, start);
+				start += PAGE_SIZE;
+			}
+			if (saved_asid != MMU_NO_ASID)
+				set_asid(saved_asid);
+		}
+		local_irq_restore(flags);
+	}
+}
+
+/*
+ * TODO: If this is only called for addresses > TASK_SIZE, we can probably
+ * skip the ASID stuff and just use the Global bit...
+ */
+void flush_tlb_kernel_range(unsigned long start, unsigned long end)
+{
+	unsigned long flags;
+	int size;
+
+	local_irq_save(flags);
+	size = (end - start + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
+	if (size > (MMU_DTLB_ENTRIES / 4)) { /* Too many entries to flush */
+		flush_tlb_all();
+	} else {
+		unsigned long asid = init_mm.context & MMU_CONTEXT_ASID_MASK;
+		unsigned long saved_asid = get_asid();
+
+		start &= PAGE_MASK;
+		end += (PAGE_SIZE - 1);
+		end &= PAGE_MASK;
+		set_asid(asid);
+		while (start < end) {
+			__flush_tlb_page(asid, start);
+			start += PAGE_SIZE;
+		}
+		set_asid(saved_asid);
+	}
+	local_irq_restore(flags);
+}
+
+void flush_tlb_mm(struct mm_struct *mm)
+{
+	/* Invalidate all TLB entries of this process by getting a new ASID */
+	if (mm->context != NO_CONTEXT) {
+		unsigned long flags;
+
+		local_irq_save(flags);
+		mm->context = NO_CONTEXT;
+		if (mm == current->mm)
+			activate_context(mm);
+		local_irq_restore(flags);
+	}
+}
+
+void flush_tlb_all(void)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	sysreg_write(MMUCR, sysreg_read(MMUCR) | SYSREG_BIT(MMUCR_I));
+	local_irq_restore(flags);
+}
+
+#ifdef CONFIG_PROC_FS
+
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+
+static void *tlb_start(struct seq_file *tlb, loff_t *pos)
+{
+	static unsigned long tlb_index;
+
+	if (*pos >= 32)
+		return NULL;
+
+	tlb_index = 0;
+	return &tlb_index;
+}
+
+static void *tlb_next(struct seq_file *tlb, void *v, loff_t *pos)
+{
+	unsigned long *index = v;
+
+	if (*index >= 31)
+		return NULL;
+
+	++*pos;
+	++*index;
+	return index;
+}
+
+static void tlb_stop(struct seq_file *tlb, void *v)
+{
+
+}
+
+static int tlb_show(struct seq_file *tlb, void *v)
+{
+	unsigned int tlbehi, tlbehi_save, tlbelo, mmucr, mmucr_save, flags;
+	unsigned long *index = v;
+
+	if (*index == 0)
+		seq_puts(tlb, "ID  V G ASID VPN   PFN   AP SZ C B W D\n");
+
+	BUG_ON(*index >= 32);
+
+	local_irq_save(flags);
+	mmucr_save = sysreg_read(MMUCR);
+	tlbehi_save = sysreg_read(TLBEHI);
+	mmucr = mmucr_save & 0x13;
+	mmucr |= *index << 14;
+	sysreg_write(MMUCR, mmucr);
+
+	asm volatile("tlbr" : : : "memory");
+	cpu_sync_pipeline();
+
+	tlbehi = sysreg_read(TLBEHI);
+	tlbelo = sysreg_read(TLBELO);
+
+	sysreg_write(MMUCR, mmucr_save);
+	sysreg_write(TLBEHI, tlbehi_save);
+	cpu_sync_pipeline();
+	local_irq_restore(flags);
+
+	seq_printf(tlb, "%2lu: %c %c %02x   %05x %05x %o  %o  %c %c %c %c\n",
+	       *index,
+	       (tlbehi & 0x200)?'1':'0',
+	       (tlbelo & 0x100)?'1':'0',
+	       (tlbehi & 0xff),
+	       (tlbehi >> 12), (tlbelo >> 12),
+	       (tlbelo >> 4) & 7, (tlbelo >> 2) & 3,
+	       (tlbelo & 0x200)?'1':'0',
+	       (tlbelo & 0x080)?'1':'0',
+	       (tlbelo & 0x001)?'1':'0',
+	       (tlbelo & 0x002)?'1':'0');
+
+	return 0;
+}
+
+static struct seq_operations tlb_ops = {
+	.start		= tlb_start,
+	.next		= tlb_next,
+	.stop		= tlb_stop,
+	.show		= tlb_show,
+};
+
+static int tlb_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &tlb_ops);
+}
+
+static struct file_operations proc_tlb_operations = {
+	.open		= tlb_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static int __init proctlb_init(void)
+{
+	struct proc_dir_entry *entry;
+
+	entry = create_proc_entry("tlb", 0, NULL);
+	if (entry)
+		entry->proc_fops = &proc_tlb_operations;
+	return 0;
+}
+late_initcall(proctlb_init);
+#endif /* CONFIG_PROC_FS */
diff --git a/include/asm-avr32/Kbuild b/include/asm-avr32/Kbuild
new file mode 100644
index 00000000000..8770e73ce93
--- /dev/null
+++ b/include/asm-avr32/Kbuild
@@ -0,0 +1,3 @@
+include include/asm-generic/Kbuild.asm
+
+headers-y	+= cachectl.h
diff --git a/include/asm-avr32/a.out.h b/include/asm-avr32/a.out.h
new file mode 100644
index 00000000000..50bf6e31a14
--- /dev/null
+++ b/include/asm-avr32/a.out.h
@@ -0,0 +1,26 @@
+#ifndef __ASM_AVR32_A_OUT_H
+#define __ASM_AVR32_A_OUT_H
+
+struct exec
+{
+  unsigned long a_info;		/* Use macros N_MAGIC, etc for access */
+  unsigned a_text;		/* length of text, in bytes */
+  unsigned a_data;		/* length of data, in bytes */
+  unsigned a_bss;		/* length of uninitialized data area for file, in bytes */
+  unsigned a_syms;		/* length of symbol table data in file, in bytes */
+  unsigned a_entry;		/* start address */
+  unsigned a_trsize;		/* length of relocation info for text, in bytes */
+  unsigned a_drsize;		/* length of relocation info for data, in bytes */
+};
+
+#define N_TRSIZE(a)	((a).a_trsize)
+#define N_DRSIZE(a)	((a).a_drsize)
+#define N_SYMSIZE(a)	((a).a_syms)
+
+#ifdef __KERNEL__
+
+#define STACK_TOP	TASK_SIZE
+
+#endif
+
+#endif /* __ASM_AVR32_A_OUT_H */
diff --git a/include/asm-avr32/addrspace.h b/include/asm-avr32/addrspace.h
new file mode 100644
index 00000000000..366794858ec
--- /dev/null
+++ b/include/asm-avr32/addrspace.h
@@ -0,0 +1,43 @@
+/*
+ * Defitions for the address spaces of the AVR32 CPUs. Heavily based on
+ * include/asm-sh/addrspace.h
+ *
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ASM_AVR32_ADDRSPACE_H
+#define __ASM_AVR32_ADDRSPACE_H
+
+#ifdef CONFIG_MMU
+
+/* Memory segments when segmentation is enabled */
+#define P0SEG		0x00000000
+#define P1SEG		0x80000000
+#define P2SEG		0xa0000000
+#define P3SEG		0xc0000000
+#define P4SEG		0xe0000000
+
+/* Returns the privileged segment base of a given address */
+#define PXSEG(a)	(((unsigned long)(a)) & 0xe0000000)
+
+/* Returns the physical address of a PnSEG (n=1,2) address */
+#define PHYSADDR(a)	(((unsigned long)(a)) & 0x1fffffff)
+
+/*
+ * Map an address to a certain privileged segment
+ */
+#define P1SEGADDR(a) ((__typeof__(a))(((unsigned long)(a) & 0x1fffffff) \
+				      | P1SEG))
+#define P2SEGADDR(a) ((__typeof__(a))(((unsigned long)(a) & 0x1fffffff) \
+				      | P2SEG))
+#define P3SEGADDR(a) ((__typeof__(a))(((unsigned long)(a) & 0x1fffffff) \
+				      | P3SEG))
+#define P4SEGADDR(a) ((__typeof__(a))(((unsigned long)(a) & 0x1fffffff) \
+				      | P4SEG))
+
+#endif /* CONFIG_MMU */
+
+#endif /* __ASM_AVR32_ADDRSPACE_H */
diff --git a/include/asm-avr32/arch-at32ap/at91rm9200_pdc.h b/include/asm-avr32/arch-at32ap/at91rm9200_pdc.h
new file mode 100644
index 00000000000..ce1150d4438
--- /dev/null
+++ b/include/asm-avr32/arch-at32ap/at91rm9200_pdc.h
@@ -0,0 +1,36 @@
+/*
+ * include/asm-arm/arch-at91rm9200/at91rm9200_pdc.h
+ *
+ * Copyright (C) 2005 Ivan Kokshaysky
+ * Copyright (C) SAN People
+ *
+ * Peripheral Data Controller (PDC) registers.
+ * Based on AT91RM9200 datasheet revision E.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef AT91RM9200_PDC_H
+#define AT91RM9200_PDC_H
+
+#define AT91_PDC_RPR		0x100	/* Receive Pointer Register */
+#define AT91_PDC_RCR		0x104	/* Receive Counter Register */
+#define AT91_PDC_TPR		0x108	/* Transmit Pointer Register */
+#define AT91_PDC_TCR		0x10c	/* Transmit Counter Register */
+#define AT91_PDC_RNPR		0x110	/* Receive Next Pointer Register */
+#define AT91_PDC_RNCR		0x114	/* Receive Next Counter Register */
+#define AT91_PDC_TNPR		0x118	/* Transmit Next Pointer Register */
+#define AT91_PDC_TNCR		0x11c	/* Transmit Next Counter Register */
+
+#define AT91_PDC_PTCR		0x120	/* Transfer Control Register */
+#define		AT91_PDC_RXTEN		(1 << 0)	/* Receiver Transfer Enable */
+#define		AT91_PDC_RXTDIS		(1 << 1)	/* Receiver Transfer Disable */
+#define		AT91_PDC_TXTEN		(1 << 8)	/* Transmitter Transfer Enable */
+#define		AT91_PDC_TXTDIS		(1 << 9)	/* Transmitter Transfer Disable */
+
+#define AT91_PDC_PTSR		0x124	/* Transfer Status Register */
+
+#endif
diff --git a/include/asm-avr32/arch-at32ap/at91rm9200_usart.h b/include/asm-avr32/arch-at32ap/at91rm9200_usart.h
new file mode 100644
index 00000000000..79f851e31b9
--- /dev/null
+++ b/include/asm-avr32/arch-at32ap/at91rm9200_usart.h
@@ -0,0 +1,123 @@
+/*
+ * include/asm-arm/arch-at91rm9200/at91rm9200_usart.h
+ *
+ * Copyright (C) 2005 Ivan Kokshaysky
+ * Copyright (C) SAN People
+ *
+ * USART registers.
+ * Based on AT91RM9200 datasheet revision E.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef AT91RM9200_USART_H
+#define AT91RM9200_USART_H
+
+#define AT91_US_CR		0x00			/* Control Register */
+#define		AT91_US_RSTRX		(1 <<  2)		/* Reset Receiver */
+#define		AT91_US_RSTTX		(1 <<  3)		/* Reset Transmitter */
+#define		AT91_US_RXEN		(1 <<  4)		/* Receiver Enable */
+#define		AT91_US_RXDIS		(1 <<  5)		/* Receiver Disable */
+#define		AT91_US_TXEN		(1 <<  6)		/* Transmitter Enable */
+#define		AT91_US_TXDIS		(1 <<  7)		/* Transmitter Disable */
+#define		AT91_US_RSTSTA		(1 <<  8)		/* Reset Status Bits */
+#define		AT91_US_STTBRK		(1 <<  9)		/* Start Break */
+#define		AT91_US_STPBRK		(1 << 10)		/* Stop Break */
+#define		AT91_US_STTTO		(1 << 11)		/* Start Time-out */
+#define		AT91_US_SENDA		(1 << 12)		/* Send Address */
+#define		AT91_US_RSTIT		(1 << 13)		/* Reset Iterations */
+#define		AT91_US_RSTNACK		(1 << 14)		/* Reset Non Acknowledge */
+#define		AT91_US_RETTO		(1 << 15)		/* Rearm Time-out */
+#define		AT91_US_DTREN		(1 << 16)		/* Data Terminal Ready Enable */
+#define		AT91_US_DTRDIS		(1 << 17)		/* Data Terminal Ready Disable */
+#define		AT91_US_RTSEN		(1 << 18)		/* Request To Send Enable */
+#define		AT91_US_RTSDIS		(1 << 19)		/* Request To Send Disable */
+
+#define AT91_US_MR		0x04			/* Mode Register */
+#define		AT91_US_USMODE		(0xf <<  0)		/* Mode of the USART */
+#define			AT91_US_USMODE_NORMAL		0
+#define			AT91_US_USMODE_RS485		1
+#define			AT91_US_USMODE_HWHS		2
+#define			AT91_US_USMODE_MODEM		3
+#define			AT91_US_USMODE_ISO7816_T0	4
+#define			AT91_US_USMODE_ISO7816_T1	6
+#define			AT91_US_USMODE_IRDA		8
+#define		AT91_US_USCLKS		(3   <<  4)		/* Clock Selection */
+#define		AT91_US_CHRL		(3   <<  6)		/* Character Length */
+#define			AT91_US_CHRL_5			(0 <<  6)
+#define			AT91_US_CHRL_6			(1 <<  6)
+#define			AT91_US_CHRL_7			(2 <<  6)
+#define			AT91_US_CHRL_8			(3 <<  6)
+#define		AT91_US_SYNC		(1 <<  8)		/* Synchronous Mode Select */
+#define		AT91_US_PAR		(7 <<  9)		/* Parity Type */
+#define			AT91_US_PAR_EVEN		(0 <<  9)
+#define			AT91_US_PAR_ODD			(1 <<  9)
+#define			AT91_US_PAR_SPACE		(2 <<  9)
+#define			AT91_US_PAR_MARK		(3 <<  9)
+#define			AT91_US_PAR_NONE		(4 <<  9)
+#define			AT91_US_PAR_MULTI_DROP		(6 <<  9)
+#define		AT91_US_NBSTOP		(3 << 12)		/* Number of Stop Bits */
+#define			AT91_US_NBSTOP_1		(0 << 12)
+#define			AT91_US_NBSTOP_1_5		(1 << 12)
+#define			AT91_US_NBSTOP_2		(2 << 12)
+#define		AT91_US_CHMODE		(3 << 14)		/* Channel Mode */
+#define			AT91_US_CHMODE_NORMAL		(0 << 14)
+#define			AT91_US_CHMODE_ECHO		(1 << 14)
+#define			AT91_US_CHMODE_LOC_LOOP		(2 << 14)
+#define			AT91_US_CHMODE_REM_LOOP		(3 << 14)
+#define		AT91_US_MSBF		(1 << 16)		/* Bit Order */
+#define		AT91_US_MODE9		(1 << 17)		/* 9-bit Character Length */
+#define		AT91_US_CLKO		(1 << 18)		/* Clock Output Select */
+#define		AT91_US_OVER		(1 << 19)		/* Oversampling Mode */
+#define		AT91_US_INACK		(1 << 20)		/* Inhibit Non Acknowledge */
+#define		AT91_US_DSNACK		(1 << 21)		/* Disable Successive NACK */
+#define		AT91_US_MAX_ITER	(7 << 24)		/* Max Iterations */
+#define		AT91_US_FILTER		(1 << 28)		/* Infrared Receive Line Filter */
+
+#define AT91_US_IER		0x08			/* Interrupt Enable Register */
+#define		AT91_US_RXRDY		(1 <<  0)		/* Receiver Ready */
+#define		AT91_US_TXRDY		(1 <<  1)		/* Transmitter Ready */
+#define		AT91_US_RXBRK		(1 <<  2)		/* Break Received / End of Break */
+#define		AT91_US_ENDRX		(1 <<  3)		/* End of Receiver Transfer */
+#define		AT91_US_ENDTX		(1 <<  4)		/* End of Transmitter Transfer */
+#define		AT91_US_OVRE		(1 <<  5)		/* Overrun Error */
+#define		AT91_US_FRAME		(1 <<  6)		/* Framing Error */
+#define		AT91_US_PARE		(1 <<  7)		/* Parity Error */
+#define		AT91_US_TIMEOUT		(1 <<  8)		/* Receiver Time-out */
+#define		AT91_US_TXEMPTY		(1 <<  9)		/* Transmitter Empty */
+#define		AT91_US_ITERATION	(1 << 10)		/* Max number of Repetitions Reached */
+#define		AT91_US_TXBUFE		(1 << 11)		/* Transmission Buffer Empty */
+#define		AT91_US_RXBUFF		(1 << 12)		/* Reception Buffer Full */
+#define		AT91_US_NACK		(1 << 13)		/* Non Acknowledge */
+#define		AT91_US_RIIC		(1 << 16)		/* Ring Indicator Input Change */
+#define		AT91_US_DSRIC		(1 << 17)		/* Data Set Ready Input Change */
+#define		AT91_US_DCDIC		(1 << 18)		/* Data Carrier Detect Input Change */
+#define		AT91_US_CTSIC		(1 << 19)		/* Clear to Send Input Change */
+#define		AT91_US_RI		(1 << 20)		/* RI */
+#define		AT91_US_DSR		(1 << 21)		/* DSR */
+#define		AT91_US_DCD		(1 << 22)		/* DCD */
+#define		AT91_US_CTS		(1 << 23)		/* CTS */
+
+#define AT91_US_IDR		0x0c			/* Interrupt Disable Register */
+#define AT91_US_IMR		0x10			/* Interrupt Mask Register */
+#define AT91_US_CSR		0x14			/* Channel Status Register */
+#define AT91_US_RHR		0x18			/* Receiver Holding Register */
+#define AT91_US_THR		0x1c			/* Transmitter Holding Register */
+
+#define AT91_US_BRGR		0x20			/* Baud Rate Generator Register */
+#define		AT91_US_CD		(0xffff << 0)		/* Clock Divider */
+
+#define AT91_US_RTOR		0x24			/* Receiver Time-out Register */
+#define		AT91_US_TO		(0xffff << 0)		/* Time-out Value */
+
+#define AT91_US_TTGR		0x28			/* Transmitter Timeguard Register */
+#define		AT91_US_TG		(0xff << 0)		/* Timeguard Value */
+
+#define AT91_US_FIDI		0x40			/* FI DI Ratio Register */
+#define AT91_US_NER		0x44			/* Number of Errors Register */
+#define AT91_US_IF		0x4c			/* IrDA Filter Register */
+
+#endif
diff --git a/include/asm-avr32/arch-at32ap/board.h b/include/asm-avr32/arch-at32ap/board.h
new file mode 100644
index 00000000000..39368e18ab2
--- /dev/null
+++ b/include/asm-avr32/arch-at32ap/board.h
@@ -0,0 +1,35 @@
+/*
+ * Platform data definitions.
+ */
+#ifndef __ASM_ARCH_BOARD_H
+#define __ASM_ARCH_BOARD_H
+
+#include <linux/types.h>
+
+/* Add basic devices: system manager, interrupt controller, portmuxes, etc. */
+void at32_add_system_devices(void);
+
+#define AT91_NR_UART	4
+extern struct platform_device *at91_default_console_device;
+
+struct platform_device *at32_add_device_usart(unsigned int id);
+
+struct eth_platform_data {
+	u8	valid;
+	u8	mii_phy_addr;
+	u8	is_rmii;
+	u8	hw_addr[6];
+};
+struct platform_device *
+at32_add_device_eth(unsigned int id, struct eth_platform_data *data);
+
+struct platform_device *at32_add_device_spi(unsigned int id);
+
+struct lcdc_platform_data {
+	unsigned long fbmem_start;
+	unsigned long fbmem_size;
+};
+struct platform_device *
+at32_add_device_lcdc(unsigned int id, struct lcdc_platform_data *data);
+
+#endif /* __ASM_ARCH_BOARD_H */
diff --git a/include/asm-avr32/arch-at32ap/init.h b/include/asm-avr32/arch-at32ap/init.h
new file mode 100644
index 00000000000..43722634e06
--- /dev/null
+++ b/include/asm-avr32/arch-at32ap/init.h
@@ -0,0 +1,21 @@
+/*
+ * AT32AP platform initialization calls.
+ *
+ * Copyright (C) 2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ASM_AVR32_AT32AP_INIT_H__
+#define __ASM_AVR32_AT32AP_INIT_H__
+
+void setup_platform(void);
+
+/* Called by setup_platform */
+void at32_clock_init(void);
+void at32_portmux_init(void);
+
+void at32_setup_serial_console(unsigned int usart_id);
+
+#endif /* __ASM_AVR32_AT32AP_INIT_H__ */
diff --git a/include/asm-avr32/arch-at32ap/portmux.h b/include/asm-avr32/arch-at32ap/portmux.h
new file mode 100644
index 00000000000..4d50421262a
--- /dev/null
+++ b/include/asm-avr32/arch-at32ap/portmux.h
@@ -0,0 +1,16 @@
+/*
+ * AT32 portmux interface.
+ *
+ * Copyright (C) 2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ASM_AVR32_AT32_PORTMUX_H__
+#define __ASM_AVR32_AT32_PORTMUX_H__
+
+void portmux_set_func(unsigned int portmux_id, unsigned int pin_id,
+		      unsigned int function_id);
+
+#endif /* __ASM_AVR32_AT32_PORTMUX_H__ */
diff --git a/include/asm-avr32/arch-at32ap/sm.h b/include/asm-avr32/arch-at32ap/sm.h
new file mode 100644
index 00000000000..265a9ead20b
--- /dev/null
+++ b/include/asm-avr32/arch-at32ap/sm.h
@@ -0,0 +1,27 @@
+/*
+ * AT32 System Manager interface.
+ *
+ * Copyright (C) 2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ASM_AVR32_AT32_SM_H__
+#define __ASM_AVR32_AT32_SM_H__
+
+struct irq_chip;
+struct platform_device;
+
+struct at32_sm {
+	spinlock_t lock;
+	void __iomem *regs;
+	struct irq_chip *eim_chip;
+	unsigned int eim_first_irq;
+	struct platform_device *pdev;
+};
+
+extern struct platform_device at32_sm_device;
+extern struct at32_sm system_manager;
+
+#endif /* __ASM_AVR32_AT32_SM_H__ */
diff --git a/include/asm-avr32/asm.h b/include/asm-avr32/asm.h
new file mode 100644
index 00000000000..515c7618952
--- /dev/null
+++ b/include/asm-avr32/asm.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ASM_AVR32_ASM_H__
+#define __ASM_AVR32_ASM_H__
+
+#include <asm/sysreg.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+
+#define mask_interrupts		ssrf	SR_GM_BIT
+#define mask_exceptions		ssrf	SR_EM_BIT
+#define unmask_interrupts	csrf	SR_GM_BIT
+#define unmask_exceptions	csrf	SR_EM_BIT
+
+#ifdef CONFIG_FRAME_POINTER
+	.macro	save_fp
+	st.w	--sp, r7
+	.endm
+	.macro	restore_fp
+	ld.w	r7, sp++
+	.endm
+	.macro	zero_fp
+	mov	r7, 0
+	.endm
+#else
+	.macro	save_fp
+	.endm
+	.macro	restore_fp
+	.endm
+	.macro	zero_fp
+	.endm
+#endif
+	.macro	get_thread_info reg
+	mov	\reg, sp
+	andl	\reg, ~(THREAD_SIZE - 1) & 0xffff
+	.endm
+
+	/* Save and restore registers */
+	.macro	save_min sr, tmp=lr
+	pushm	lr
+	mfsr	\tmp, \sr
+	zero_fp
+	st.w	--sp, \tmp
+	.endm
+
+	.macro	restore_min sr, tmp=lr
+	ld.w	\tmp, sp++
+	mtsr	\sr, \tmp
+	popm	lr
+	.endm
+
+	.macro	save_half sr, tmp=lr
+	save_fp
+	pushm	r8-r9,r10,r11,r12,lr
+	zero_fp
+	mfsr	\tmp, \sr
+	st.w	--sp, \tmp
+	.endm
+
+	.macro	restore_half sr, tmp=lr
+	ld.w	\tmp, sp++
+	mtsr	\sr, \tmp
+	popm	r8-r9,r10,r11,r12,lr
+	restore_fp
+	.endm
+
+	.macro	save_full_user sr, tmp=lr
+	stmts	--sp, r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,sp,lr
+	st.w	--sp, lr
+	zero_fp
+	mfsr	\tmp, \sr
+	st.w	--sp, \tmp
+	.endm
+
+	.macro	restore_full_user sr, tmp=lr
+	ld.w	\tmp, sp++
+	mtsr	\sr, \tmp
+	ld.w	lr, sp++
+	ldmts	sp++, r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,sp,lr
+	.endm
+
+	/* uaccess macros */
+	.macro branch_if_kernel scratch, label
+	get_thread_info \scratch
+	ld.w	\scratch, \scratch[TI_flags]
+	bld	\scratch, TIF_USERSPACE
+	brcc	\label
+	.endm
+
+	.macro ret_if_privileged scratch, addr, size, ret
+	sub	\scratch, \size, 1
+	add	\scratch, \addr
+	retcs	\ret
+	retmi	\ret
+	.endm
+
+#endif /* __ASM_AVR32_ASM_H__ */
diff --git a/include/asm-avr32/atomic.h b/include/asm-avr32/atomic.h
new file mode 100644
index 00000000000..e0b9c44c126
--- /dev/null
+++ b/include/asm-avr32/atomic.h
@@ -0,0 +1,201 @@
+/*
+ * Atomic operations that C can't guarantee us.  Useful for
+ * resource counting etc.
+ *
+ * But use these as seldom as possible since they are slower than
+ * regular operations.
+ *
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ASM_AVR32_ATOMIC_H
+#define __ASM_AVR32_ATOMIC_H
+
+#include <asm/system.h>
+
+typedef struct { volatile int counter; } atomic_t;
+#define ATOMIC_INIT(i)  { (i) }
+
+#define atomic_read(v)		((v)->counter)
+#define atomic_set(v, i)	(((v)->counter) = i)
+
+/*
+ * atomic_sub_return - subtract the atomic variable
+ * @i: integer value to subtract
+ * @v: pointer of type atomic_t
+ *
+ * Atomically subtracts @i from @v. Returns the resulting value.
+ */
+static inline int atomic_sub_return(int i, atomic_t *v)
+{
+	int result;
+
+	asm volatile(
+		"/* atomic_sub_return */\n"
+		"1:	ssrf	5\n"
+		"	ld.w	%0, %2\n"
+		"	sub	%0, %3\n"
+		"	stcond	%1, %0\n"
+		"	brne	1b"
+		: "=&r"(result), "=o"(v->counter)
+		: "m"(v->counter), "ir"(i)
+		: "cc");
+
+	return result;
+}
+
+/*
+ * atomic_add_return - add integer to atomic variable
+ * @i: integer value to add
+ * @v: pointer of type atomic_t
+ *
+ * Atomically adds @i to @v. Returns the resulting value.
+ */
+static inline int atomic_add_return(int i, atomic_t *v)
+{
+	int result;
+
+	if (__builtin_constant_p(i))
+		result = atomic_sub_return(-i, v);
+	else
+		asm volatile(
+			"/* atomic_add_return */\n"
+			"1:	ssrf	5\n"
+			"	ld.w	%0, %1\n"
+			"	add	%0, %3\n"
+			"	stcond	%2, %0\n"
+			"	brne	1b"
+			: "=&r"(result), "=o"(v->counter)
+			: "m"(v->counter), "r"(i)
+			: "cc", "memory");
+
+	return result;
+}
+
+/*
+ * atomic_sub_unless - sub unless the number is a given value
+ * @v: pointer of type atomic_t
+ * @a: the amount to add to v...
+ * @u: ...unless v is equal to u.
+ *
+ * If the atomic value v is not equal to u, this function subtracts a
+ * from v, and returns non zero. If v is equal to u then it returns
+ * zero. This is done as an atomic operation.
+*/
+static inline int atomic_sub_unless(atomic_t *v, int a, int u)
+{
+	int tmp, result = 0;
+
+	asm volatile(
+		"/* atomic_sub_unless */\n"
+		"1:	ssrf	5\n"
+		"	ld.w	%0, %3\n"
+		"	cp.w	%0, %5\n"
+		"	breq	1f\n"
+		"	sub	%0, %4\n"
+		"	stcond	%2, %0\n"
+		"	brne	1b\n"
+		"	mov	%1, 1\n"
+		"1:"
+		: "=&r"(tmp), "=&r"(result), "=o"(v->counter)
+		: "m"(v->counter), "ir"(a), "ir"(u)
+		: "cc", "memory");
+
+	return result;
+}
+
+/*
+ * atomic_add_unless - add unless the number is a given value
+ * @v: pointer of type atomic_t
+ * @a: the amount to add to v...
+ * @u: ...unless v is equal to u.
+ *
+ * If the atomic value v is not equal to u, this function adds a to v,
+ * and returns non zero. If v is equal to u then it returns zero. This
+ * is done as an atomic operation.
+*/
+static inline int atomic_add_unless(atomic_t *v, int a, int u)
+{
+	int tmp, result;
+
+	if (__builtin_constant_p(a))
+		result = atomic_sub_unless(v, -a, u);
+	else {
+		result = 0;
+		asm volatile(
+			"/* atomic_add_unless */\n"
+			"1:	ssrf	5\n"
+			"	ld.w	%0, %3\n"
+			"	cp.w	%0, %5\n"
+			"	breq	1f\n"
+			"	add	%0, %4\n"
+			"	stcond	%2, %0\n"
+			"	brne	1b\n"
+			"	mov	%1, 1\n"
+			"1:"
+			: "=&r"(tmp), "=&r"(result), "=o"(v->counter)
+			: "m"(v->counter), "r"(a), "ir"(u)
+			: "cc", "memory");
+	}
+
+	return result;
+}
+
+/*
+ * atomic_sub_if_positive - conditionally subtract integer from atomic variable
+ * @i: integer value to subtract
+ * @v: pointer of type atomic_t
+ *
+ * Atomically test @v and subtract @i if @v is greater or equal than @i.
+ * The function returns the old value of @v minus @i.
+ */
+static inline int atomic_sub_if_positive(int i, atomic_t *v)
+{
+	int result;
+
+	asm volatile(
+		"/* atomic_sub_if_positive */\n"
+		"1:	ssrf	5\n"
+		"	ld.w	%0, %2\n"
+		"	sub	%0, %3\n"
+		"	brlt	1f\n"
+		"	stcond	%1, %0\n"
+		"	brne	1b\n"
+		"1:"
+		: "=&r"(result), "=o"(v->counter)
+		: "m"(v->counter), "ir"(i)
+		: "cc", "memory");
+
+	return result;
+}
+
+#define atomic_xchg(v, new)	(xchg(&((v)->counter), new))
+#define atomic_cmpxchg(v, o, n)	((int)cmpxchg(&((v)->counter), (o), (n)))
+
+#define atomic_sub(i, v)	(void)atomic_sub_return(i, v)
+#define atomic_add(i, v)	(void)atomic_add_return(i, v)
+#define atomic_dec(v)		atomic_sub(1, (v))
+#define atomic_inc(v)		atomic_add(1, (v))
+
+#define atomic_dec_return(v)	atomic_sub_return(1, v)
+#define atomic_inc_return(v)	atomic_add_return(1, v)
+
+#define atomic_sub_and_test(i, v) (atomic_sub_return(i, v) == 0)
+#define atomic_inc_and_test(v) (atomic_add_return(1, v) == 0)
+#define atomic_dec_and_test(v) (atomic_sub_return(1, v) == 0)
+#define atomic_add_negative(i, v) (atomic_add_return(i, v) < 0)
+
+#define atomic_inc_not_zero(v)	atomic_add_unless(v, 1, 0)
+#define atomic_dec_if_positive(v) atomic_sub_if_positive(1, v)
+
+#define smp_mb__before_atomic_dec()	barrier()
+#define smp_mb__after_atomic_dec()	barrier()
+#define smp_mb__before_atomic_inc()	barrier()
+#define smp_mb__after_atomic_inc()	barrier()
+
+#include <asm-generic/atomic.h>
+
+#endif /*  __ASM_AVR32_ATOMIC_H */
diff --git a/include/asm-avr32/auxvec.h b/include/asm-avr32/auxvec.h
new file mode 100644
index 00000000000..d5dd435bf8f
--- /dev/null
+++ b/include/asm-avr32/auxvec.h
@@ -0,0 +1,4 @@
+#ifndef __ASM_AVR32_AUXVEC_H
+#define __ASM_AVR32_AUXVEC_H
+
+#endif /* __ASM_AVR32_AUXVEC_H */
diff --git a/include/asm-avr32/bitops.h b/include/asm-avr32/bitops.h
new file mode 100644
index 00000000000..5299f8c8e11
--- /dev/null
+++ b/include/asm-avr32/bitops.h
@@ -0,0 +1,296 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ASM_AVR32_BITOPS_H
+#define __ASM_AVR32_BITOPS_H
+
+#include <asm/byteorder.h>
+#include <asm/system.h>
+
+/*
+ * clear_bit() doesn't provide any barrier for the compiler
+ */
+#define smp_mb__before_clear_bit()	barrier()
+#define smp_mb__after_clear_bit()	barrier()
+
+/*
+ * set_bit - Atomically set a bit in memory
+ * @nr: the bit to set
+ * @addr: the address to start counting from
+ *
+ * This function is atomic and may not be reordered.  See __set_bit()
+ * if you do not require the atomic guarantees.
+ *
+ * Note that @nr may be almost arbitrarily large; this function is not
+ * restricted to acting on a single-word quantity.
+ */
+static inline void set_bit(int nr, volatile void * addr)
+{
+	unsigned long *p = ((unsigned long *)addr) + nr / BITS_PER_LONG;
+	unsigned long tmp;
+
+	if (__builtin_constant_p(nr)) {
+		asm volatile(
+			"1:	ssrf	5\n"
+			"	ld.w	%0, %2\n"
+			"	sbr	%0, %3\n"
+			"	stcond	%1, %0\n"
+			"	brne	1b"
+			: "=&r"(tmp), "=o"(*p)
+			: "m"(*p), "i"(nr)
+			: "cc");
+	} else {
+		unsigned long mask = 1UL << (nr % BITS_PER_LONG);
+		asm volatile(
+			"1:	ssrf	5\n"
+			"	ld.w	%0, %2\n"
+			"	or	%0, %3\n"
+			"	stcond	%1, %0\n"
+			"	brne	1b"
+			: "=&r"(tmp), "=o"(*p)
+			: "m"(*p), "r"(mask)
+			: "cc");
+	}
+}
+
+/*
+ * clear_bit - Clears a bit in memory
+ * @nr: Bit to clear
+ * @addr: Address to start counting from
+ *
+ * clear_bit() is atomic and may not be reordered.  However, it does
+ * not contain a memory barrier, so if it is used for locking purposes,
+ * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
+ * in order to ensure changes are visible on other processors.
+ */
+static inline void clear_bit(int nr, volatile void * addr)
+{
+	unsigned long *p = ((unsigned long *)addr) + nr / BITS_PER_LONG;
+	unsigned long tmp;
+
+	if (__builtin_constant_p(nr)) {
+		asm volatile(
+			"1:	ssrf	5\n"
+			"	ld.w	%0, %2\n"
+			"	cbr	%0, %3\n"
+			"	stcond	%1, %0\n"
+			"	brne	1b"
+			: "=&r"(tmp), "=o"(*p)
+			: "m"(*p), "i"(nr)
+			: "cc");
+	} else {
+		unsigned long mask = 1UL << (nr % BITS_PER_LONG);
+		asm volatile(
+			"1:	ssrf	5\n"
+			"	ld.w	%0, %2\n"
+			"	andn	%0, %3\n"
+			"	stcond	%1, %0\n"
+			"	brne	1b"
+			: "=&r"(tmp), "=o"(*p)
+			: "m"(*p), "r"(mask)
+			: "cc");
+	}
+}
+
+/*
+ * change_bit - Toggle a bit in memory
+ * @nr: Bit to change
+ * @addr: Address to start counting from
+ *
+ * change_bit() is atomic and may not be reordered.
+ * Note that @nr may be almost arbitrarily large; this function is not
+ * restricted to acting on a single-word quantity.
+ */
+static inline void change_bit(int nr, volatile void * addr)
+{
+	unsigned long *p = ((unsigned long *)addr) + nr / BITS_PER_LONG;
+	unsigned long mask = 1UL << (nr % BITS_PER_LONG);
+	unsigned long tmp;
+
+	asm volatile(
+		"1:	ssrf	5\n"
+		"	ld.w	%0, %2\n"
+		"	eor	%0, %3\n"
+		"	stcond	%1, %0\n"
+		"	brne	1b"
+		: "=&r"(tmp), "=o"(*p)
+		: "m"(*p), "r"(mask)
+		: "cc");
+}
+
+/*
+ * test_and_set_bit - Set a bit and return its old value
+ * @nr: Bit to set
+ * @addr: Address to count from
+ *
+ * This operation is atomic and cannot be reordered.
+ * It also implies a memory barrier.
+ */
+static inline int test_and_set_bit(int nr, volatile void * addr)
+{
+	unsigned long *p = ((unsigned long *)addr) + nr / BITS_PER_LONG;
+	unsigned long mask = 1UL << (nr % BITS_PER_LONG);
+	unsigned long tmp, old;
+
+	if (__builtin_constant_p(nr)) {
+		asm volatile(
+			"1:	ssrf	5\n"
+			"	ld.w	%0, %3\n"
+			"	mov	%2, %0\n"
+			"	sbr	%0, %4\n"
+			"	stcond	%1, %0\n"
+			"	brne	1b"
+			: "=&r"(tmp), "=o"(*p), "=&r"(old)
+			: "m"(*p), "i"(nr)
+			: "memory", "cc");
+	} else {
+		asm volatile(
+			"1:	ssrf	5\n"
+			"	ld.w	%2, %3\n"
+			"	or	%0, %2, %4\n"
+			"	stcond	%1, %0\n"
+			"	brne	1b"
+			: "=&r"(tmp), "=o"(*p), "=&r"(old)
+			: "m"(*p), "r"(mask)
+			: "memory", "cc");
+	}
+
+	return (old & mask) != 0;
+}
+
+/*
+ * test_and_clear_bit - Clear a bit and return its old value
+ * @nr: Bit to clear
+ * @addr: Address to count from
+ *
+ * This operation is atomic and cannot be reordered.
+ * It also implies a memory barrier.
+ */
+static inline int test_and_clear_bit(int nr, volatile void * addr)
+{
+	unsigned long *p = ((unsigned long *)addr) + nr / BITS_PER_LONG;
+	unsigned long mask = 1UL << (nr % BITS_PER_LONG);
+	unsigned long tmp, old;
+
+	if (__builtin_constant_p(nr)) {
+		asm volatile(
+			"1:	ssrf	5\n"
+			"	ld.w	%0, %3\n"
+			"	mov	%2, %0\n"
+			"	cbr	%0, %4\n"
+			"	stcond	%1, %0\n"
+			"	brne	1b"
+			: "=&r"(tmp), "=o"(*p), "=&r"(old)
+			: "m"(*p), "i"(nr)
+			: "memory", "cc");
+	} else {
+		asm volatile(
+			"1:	ssrf	5\n"
+			"	ld.w	%0, %3\n"
+			"	mov	%2, %0\n"
+			"	andn	%0, %4\n"
+			"	stcond	%1, %0\n"
+			"	brne	1b"
+			: "=&r"(tmp), "=o"(*p), "=&r"(old)
+			: "m"(*p), "r"(mask)
+			: "memory", "cc");
+	}
+
+	return (old & mask) != 0;
+}
+
+/*
+ * test_and_change_bit - Change a bit and return its old value
+ * @nr: Bit to change
+ * @addr: Address to count from
+ *
+ * This operation is atomic and cannot be reordered.
+ * It also implies a memory barrier.
+ */
+static inline int test_and_change_bit(int nr, volatile void * addr)
+{
+	unsigned long *p = ((unsigned long *)addr) + nr / BITS_PER_LONG;
+	unsigned long mask = 1UL << (nr % BITS_PER_LONG);
+	unsigned long tmp, old;
+
+	asm volatile(
+		"1:	ssrf	5\n"
+		"	ld.w	%2, %3\n"
+		"	eor	%0, %2, %4\n"
+		"	stcond	%1, %0\n"
+		"	brne	1b"
+		: "=&r"(tmp), "=o"(*p), "=&r"(old)
+		: "m"(*p), "r"(mask)
+		: "memory", "cc");
+
+	return (old & mask) != 0;
+}
+
+#include <asm-generic/bitops/non-atomic.h>
+
+/* Find First bit Set */
+static inline unsigned long __ffs(unsigned long word)
+{
+	unsigned long result;
+
+	asm("brev %1\n\t"
+	    "clz %0,%1"
+	    : "=r"(result), "=&r"(word)
+	    : "1"(word));
+	return result;
+}
+
+/* Find First Zero */
+static inline unsigned long ffz(unsigned long word)
+{
+	return __ffs(~word);
+}
+
+/* Find Last bit Set */
+static inline int fls(unsigned long word)
+{
+	unsigned long result;
+
+	asm("clz %0,%1" : "=r"(result) : "r"(word));
+	return 32 - result;
+}
+
+unsigned long find_first_zero_bit(const unsigned long *addr,
+				  unsigned long size);
+unsigned long find_next_zero_bit(const unsigned long *addr,
+				 unsigned long size,
+				 unsigned long offset);
+unsigned long find_first_bit(const unsigned long *addr,
+			     unsigned long size);
+unsigned long find_next_bit(const unsigned long *addr,
+				 unsigned long size,
+				 unsigned long offset);
+
+/*
+ * ffs: find first bit set. This is defined the same way as
+ * the libc and compiler builtin ffs routines, therefore
+ * differs in spirit from the above ffz (man ffs).
+ *
+ * The difference is that bit numbering starts at 1, and if no bit is set,
+ * the function returns 0.
+ */
+static inline int ffs(unsigned long word)
+{
+	if(word == 0)
+		return 0;
+	return __ffs(word) + 1;
+}
+
+#include <asm-generic/bitops/fls64.h>
+#include <asm-generic/bitops/sched.h>
+#include <asm-generic/bitops/hweight.h>
+
+#include <asm-generic/bitops/ext2-non-atomic.h>
+#include <asm-generic/bitops/ext2-atomic.h>
+#include <asm-generic/bitops/minix-le.h>
+
+#endif /* __ASM_AVR32_BITOPS_H */
diff --git a/include/asm-avr32/bug.h b/include/asm-avr32/bug.h
new file mode 100644
index 00000000000..521766bc936
--- /dev/null
+++ b/include/asm-avr32/bug.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ASM_AVR32_BUG_H
+#define __ASM_AVR32_BUG_H
+
+#ifdef CONFIG_BUG
+
+/*
+ * According to our Chief Architect, this compact opcode is very
+ * unlikely to ever be implemented.
+ */
+#define AVR32_BUG_OPCODE	0x5df0
+
+#ifdef CONFIG_DEBUG_BUGVERBOSE
+
+#define BUG()								\
+	do {								\
+		asm volatile(".hword	%0\n\t"				\
+			     ".hword	%1\n\t"				\
+			     ".long	%2"				\
+			     :						\
+			     : "n"(AVR32_BUG_OPCODE),			\
+			       "i"(__LINE__), "X"(__FILE__));		\
+	} while (0)
+
+#else
+
+#define BUG()								\
+	do {								\
+		asm volatile(".hword	%0\n\t"				\
+			     : : "n"(AVR32_BUG_OPCODE));		\
+	} while (0)
+
+#endif /* CONFIG_DEBUG_BUGVERBOSE */
+
+#define HAVE_ARCH_BUG
+
+#endif /* CONFIG_BUG */
+
+#include <asm-generic/bug.h>
+
+#endif /* __ASM_AVR32_BUG_H */
diff --git a/include/asm-avr32/bugs.h b/include/asm-avr32/bugs.h
new file mode 100644
index 00000000000..7635e770622
--- /dev/null
+++ b/include/asm-avr32/bugs.h
@@ -0,0 +1,15 @@
+/*
+ * This is included by init/main.c to check for architecture-dependent bugs.
+ *
+ * Needs:
+ *      void check_bugs(void);
+ */
+#ifndef __ASM_AVR32_BUGS_H
+#define __ASM_AVR32_BUGS_H
+
+static void __init check_bugs(void)
+{
+	cpu_data->loops_per_jiffy = loops_per_jiffy;
+}
+
+#endif /* __ASM_AVR32_BUGS_H */
diff --git a/include/asm-avr32/byteorder.h b/include/asm-avr32/byteorder.h
new file mode 100644
index 00000000000..402ff4125cd
--- /dev/null
+++ b/include/asm-avr32/byteorder.h
@@ -0,0 +1,25 @@
+/*
+ * AVR32 endian-conversion functions.
+ */
+#ifndef __ASM_AVR32_BYTEORDER_H
+#define __ASM_AVR32_BYTEORDER_H
+
+#include <asm/types.h>
+#include <linux/compiler.h>
+
+#ifdef __CHECKER__
+extern unsigned long __builtin_bswap_32(unsigned long x);
+extern unsigned short __builtin_bswap_16(unsigned short x);
+#endif
+
+#define __arch__swab32(x) __builtin_bswap_32(x)
+#define __arch__swab16(x) __builtin_bswap_16(x)
+
+#if !defined(__STRICT_ANSI__) || defined(__KERNEL__)
+# define __BYTEORDER_HAS_U64__
+# define __SWAB_64_THRU_32__
+#endif
+
+#include <linux/byteorder/big_endian.h>
+
+#endif /* __ASM_AVR32_BYTEORDER_H */
diff --git a/include/asm-avr32/cache.h b/include/asm-avr32/cache.h
new file mode 100644
index 00000000000..dabb955f3c0
--- /dev/null
+++ b/include/asm-avr32/cache.h
@@ -0,0 +1,29 @@
+#ifndef __ASM_AVR32_CACHE_H
+#define __ASM_AVR32_CACHE_H
+
+#define L1_CACHE_SHIFT 5
+#define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT)
+
+#ifndef __ASSEMBLER__
+struct cache_info {
+	unsigned int ways;
+	unsigned int sets;
+	unsigned int linesz;
+};
+#endif /* __ASSEMBLER */
+
+/* Cache operation constants */
+#define ICACHE_FLUSH		0x00
+#define ICACHE_INVALIDATE	0x01
+#define ICACHE_LOCK		0x02
+#define ICACHE_UNLOCK		0x03
+#define ICACHE_PREFETCH		0x04
+
+#define DCACHE_FLUSH		0x08
+#define DCACHE_LOCK		0x09
+#define DCACHE_UNLOCK		0x0a
+#define DCACHE_INVALIDATE	0x0b
+#define DCACHE_CLEAN		0x0c
+#define DCACHE_CLEAN_INVAL	0x0d
+
+#endif /* __ASM_AVR32_CACHE_H */
diff --git a/include/asm-avr32/cachectl.h b/include/asm-avr32/cachectl.h
new file mode 100644
index 00000000000..4faf1ce6006
--- /dev/null
+++ b/include/asm-avr32/cachectl.h
@@ -0,0 +1,11 @@
+#ifndef __ASM_AVR32_CACHECTL_H
+#define __ASM_AVR32_CACHECTL_H
+
+/*
+ * Operations that can be performed through the cacheflush system call
+ */
+
+/* Clean the data cache, then invalidate the icache */
+#define CACHE_IFLUSH	0
+
+#endif /* __ASM_AVR32_CACHECTL_H */
diff --git a/include/asm-avr32/cacheflush.h b/include/asm-avr32/cacheflush.h
new file mode 100644
index 00000000000..f1bf1708980
--- /dev/null
+++ b/include/asm-avr32/cacheflush.h
@@ -0,0 +1,129 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ASM_AVR32_CACHEFLUSH_H
+#define __ASM_AVR32_CACHEFLUSH_H
+
+/* Keep includes the same across arches.  */
+#include <linux/mm.h>
+
+#define CACHE_OP_ICACHE_INVALIDATE	0x01
+#define CACHE_OP_DCACHE_INVALIDATE	0x0b
+#define CACHE_OP_DCACHE_CLEAN		0x0c
+#define CACHE_OP_DCACHE_CLEAN_INVAL	0x0d
+
+/*
+ * Invalidate any cacheline containing virtual address vaddr without
+ * writing anything back to memory.
+ *
+ * Note that this function may corrupt unrelated data structures when
+ * applied on buffers that are not cacheline aligned in both ends.
+ */
+static inline void invalidate_dcache_line(void *vaddr)
+{
+	asm volatile("cache %0[0], %1"
+		     :
+		     : "r"(vaddr), "n"(CACHE_OP_DCACHE_INVALIDATE)
+		     : "memory");
+}
+
+/*
+ * Make sure any cacheline containing virtual address vaddr is written
+ * to memory.
+ */
+static inline void clean_dcache_line(void *vaddr)
+{
+	asm volatile("cache %0[0], %1"
+		     :
+		     : "r"(vaddr), "n"(CACHE_OP_DCACHE_CLEAN)
+		     : "memory");
+}
+
+/*
+ * Make sure any cacheline containing virtual address vaddr is written
+ * to memory and then invalidate it.
+ */
+static inline void flush_dcache_line(void *vaddr)
+{
+	asm volatile("cache %0[0], %1"
+		     :
+		     : "r"(vaddr), "n"(CACHE_OP_DCACHE_CLEAN_INVAL)
+		     : "memory");
+}
+
+/*
+ * Invalidate any instruction cacheline containing virtual address
+ * vaddr.
+ */
+static inline void invalidate_icache_line(void *vaddr)
+{
+	asm volatile("cache %0[0], %1"
+		     :
+		     : "r"(vaddr), "n"(CACHE_OP_ICACHE_INVALIDATE)
+		     : "memory");
+}
+
+/*
+ * Applies the above functions on all lines that are touched by the
+ * specified virtual address range.
+ */
+void invalidate_dcache_region(void *start, size_t len);
+void clean_dcache_region(void *start, size_t len);
+void flush_dcache_region(void *start, size_t len);
+void invalidate_icache_region(void *start, size_t len);
+
+/*
+ * Make sure any pending writes are completed before continuing.
+ */
+#define flush_write_buffer() asm volatile("sync 0" : : : "memory")
+
+/*
+ * The following functions are called when a virtual mapping changes.
+ * We do not need to flush anything in this case.
+ */
+#define flush_cache_all()			do { } while (0)
+#define flush_cache_mm(mm)			do { } while (0)
+#define flush_cache_range(vma, start, end)	do { } while (0)
+#define flush_cache_page(vma, vmaddr, pfn)	do { } while (0)
+#define flush_cache_vmap(start, end)		do { } while (0)
+#define flush_cache_vunmap(start, end)		do { } while (0)
+
+/*
+ * I think we need to implement this one to be able to reliably
+ * execute pages from RAMDISK. However, if we implement the
+ * flush_dcache_*() functions, it might not be needed anymore.
+ *
+ * #define flush_icache_page(vma, page)		do { } while (0)
+ */
+extern void flush_icache_page(struct vm_area_struct *vma, struct page *page);
+
+/*
+ * These are (I think) related to D-cache aliasing.  We might need to
+ * do something here, but only for certain configurations.  No such
+ * configurations exist at this time.
+ */
+#define flush_dcache_page(page)			do { } while (0)
+#define flush_dcache_mmap_lock(page)		do { } while (0)
+#define flush_dcache_mmap_unlock(page)		do { } while (0)
+
+/*
+ * These are for I/D cache coherency. In this case, we do need to
+ * flush with all configurations.
+ */
+extern void flush_icache_range(unsigned long start, unsigned long end);
+extern void flush_icache_user_range(struct vm_area_struct *vma,
+				    struct page *page,
+				    unsigned long addr, int len);
+
+#define copy_to_user_page(vma, page, vaddr, dst, src, len) do {	\
+	memcpy(dst, src, len);					\
+	flush_icache_user_range(vma, page, vaddr, len);		\
+} while(0)
+#define copy_from_user_page(vma, page, vaddr, dst, src, len)	\
+	memcpy(dst, src, len)
+
+#endif /* __ASM_AVR32_CACHEFLUSH_H */
diff --git a/include/asm-avr32/checksum.h b/include/asm-avr32/checksum.h
new file mode 100644
index 00000000000..41b7af09edc
--- /dev/null
+++ b/include/asm-avr32/checksum.h
@@ -0,0 +1,156 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ASM_AVR32_CHECKSUM_H
+#define __ASM_AVR32_CHECKSUM_H
+
+/*
+ * computes the checksum of a memory block at buff, length len,
+ * and adds in "sum" (32-bit)
+ *
+ * returns a 32-bit number suitable for feeding into itself
+ * or csum_tcpudp_magic
+ *
+ * this function must be called with even lengths, except
+ * for the last fragment, which may be odd
+ *
+ * it's best to have buff aligned on a 32-bit boundary
+ */
+unsigned int csum_partial(const unsigned char * buff, int len,
+			  unsigned int sum);
+
+/*
+ * the same as csum_partial, but copies from src while it
+ * checksums, and handles user-space pointer exceptions correctly, when needed.
+ *
+ * here even more important to align src and dst on a 32-bit (or even
+ * better 64-bit) boundary
+ */
+unsigned int csum_partial_copy_generic(const char *src, char *dst, int len,
+				       int sum, int *src_err_ptr,
+				       int *dst_err_ptr);
+
+/*
+ *	Note: when you get a NULL pointer exception here this means someone
+ *	passed in an incorrect kernel address to one of these functions.
+ *
+ *	If you use these functions directly please don't forget the
+ *	verify_area().
+ */
+static inline
+unsigned int csum_partial_copy_nocheck(const char *src, char *dst,
+				       int len, int sum)
+{
+	return csum_partial_copy_generic(src, dst, len, sum, NULL, NULL);
+}
+
+static inline
+unsigned int csum_partial_copy_from_user (const char __user *src, char *dst,
+					  int len, int sum, int *err_ptr)
+{
+	return csum_partial_copy_generic((const char __force *)src, dst, len,
+					 sum, err_ptr, NULL);
+}
+
+/*
+ *	This is a version of ip_compute_csum() optimized for IP headers,
+ *	which always checksum on 4 octet boundaries.
+ */
+static inline unsigned short ip_fast_csum(unsigned char *iph,
+					  unsigned int ihl)
+{
+	unsigned int sum, tmp;
+
+	__asm__ __volatile__(
+		"	ld.w	%0, %1++\n"
+		"	ld.w	%3, %1++\n"
+		"	sub	%2, 4\n"
+		"	add	%0, %3\n"
+		"	ld.w	%3, %1++\n"
+		"	adc	%0, %0, %3\n"
+		"	ld.w	%3, %1++\n"
+		"	adc	%0, %0, %3\n"
+		"	acr	%0\n"
+		"1:	ld.w	%3, %1++\n"
+		"	add	%0, %3\n"
+		"	acr	%0\n"
+		"	sub	%2, 1\n"
+		"	brne	1b\n"
+		"	lsl	%3, %0, 16\n"
+		"	andl	%0, 0\n"
+		"	mov	%2, 0xffff\n"
+		"	add	%0, %3\n"
+		"	adc	%0, %0, %2\n"
+		"	com	%0\n"
+		"	lsr	%0, 16\n"
+		: "=r"(sum), "=r"(iph), "=r"(ihl), "=r"(tmp)
+		: "1"(iph), "2"(ihl)
+		: "memory", "cc");
+	return sum;
+}
+
+/*
+ *	Fold a partial checksum
+ */
+
+static inline unsigned int csum_fold(unsigned int sum)
+{
+	unsigned int tmp;
+
+	asm("	bfextu	%1, %0, 0, 16\n"
+	    "	lsr	%0, 16\n"
+	    "	add	%0, %1\n"
+	    "	bfextu	%1, %0, 16, 16\n"
+	    "	add	%0, %1"
+	    : "=&r"(sum), "=&r"(tmp)
+	    : "0"(sum));
+
+	return ~sum;
+}
+
+static inline unsigned long csum_tcpudp_nofold(unsigned long saddr,
+					       unsigned long daddr,
+					       unsigned short len,
+					       unsigned short proto,
+					       unsigned int sum)
+{
+	asm("	add	%0, %1\n"
+	    "	adc	%0, %0, %2\n"
+	    "	adc	%0, %0, %3\n"
+	    "	acr	%0"
+	    : "=r"(sum)
+	    : "r"(daddr), "r"(saddr), "r"(ntohs(len) | (proto << 16)),
+	      "0"(sum)
+	    : "cc");
+
+	return sum;
+}
+
+/*
+ * computes the checksum of the TCP/UDP pseudo-header
+ * returns a 16-bit checksum, already complemented
+ */
+static inline unsigned short int csum_tcpudp_magic(unsigned long saddr,
+						   unsigned long daddr,
+						   unsigned short len,
+						   unsigned short proto,
+						   unsigned int sum)
+{
+	return csum_fold(csum_tcpudp_nofold(saddr,daddr,len,proto,sum));
+}
+
+/*
+ * this routine is used for miscellaneous IP-like checksums, mainly
+ * in icmp.c
+ */
+
+static inline unsigned short ip_compute_csum(unsigned char * buff, int len)
+{
+    return csum_fold(csum_partial(buff, len, 0));
+}
+
+#endif /* __ASM_AVR32_CHECKSUM_H */
diff --git a/include/asm-avr32/cputime.h b/include/asm-avr32/cputime.h
new file mode 100644
index 00000000000..e87e0f81cbe
--- /dev/null
+++ b/include/asm-avr32/cputime.h
@@ -0,0 +1,6 @@
+#ifndef __ASM_AVR32_CPUTIME_H
+#define __ASM_AVR32_CPUTIME_H
+
+#include <asm-generic/cputime.h>
+
+#endif /* __ASM_AVR32_CPUTIME_H */
diff --git a/include/asm-avr32/current.h b/include/asm-avr32/current.h
new file mode 100644
index 00000000000..c7b0549eab8
--- /dev/null
+++ b/include/asm-avr32/current.h
@@ -0,0 +1,15 @@
+#ifndef __ASM_AVR32_CURRENT_H
+#define __ASM_AVR32_CURRENT_H
+
+#include <linux/thread_info.h>
+
+struct task_struct;
+
+inline static struct task_struct * get_current(void)
+{
+	return current_thread_info()->task;
+}
+
+#define current get_current()
+
+#endif /* __ASM_AVR32_CURRENT_H */
diff --git a/include/asm-avr32/delay.h b/include/asm-avr32/delay.h
new file mode 100644
index 00000000000..cc3b2e3343b
--- /dev/null
+++ b/include/asm-avr32/delay.h
@@ -0,0 +1,26 @@
+#ifndef __ASM_AVR32_DELAY_H
+#define __ASM_AVR32_DELAY_H
+
+/*
+ * Copyright (C) 1993 Linus Torvalds
+ *
+ * Delay routines calling functions in arch/avr32/lib/delay.c
+ */
+
+extern void __bad_udelay(void);
+extern void __bad_ndelay(void);
+
+extern void __udelay(unsigned long usecs);
+extern void __ndelay(unsigned long nsecs);
+extern void __const_udelay(unsigned long usecs);
+extern void __delay(unsigned long loops);
+
+#define udelay(n) (__builtin_constant_p(n) ? \
+	((n) > 20000 ? __bad_udelay() : __const_udelay((n) * 0x10c6ul)) : \
+	__udelay(n))
+
+#define ndelay(n) (__builtin_constant_p(n) ? \
+	((n) > 20000 ? __bad_ndelay() : __const_udelay((n) * 5ul)) : \
+	__ndelay(n))
+
+#endif /* __ASM_AVR32_DELAY_H */
diff --git a/include/asm-avr32/div64.h b/include/asm-avr32/div64.h
new file mode 100644
index 00000000000..d7ddd4fdeca
--- /dev/null
+++ b/include/asm-avr32/div64.h
@@ -0,0 +1,6 @@
+#ifndef __ASM_AVR32_DIV64_H
+#define __ASM_AVR32_DIV64_H
+
+#include <asm-generic/div64.h>
+
+#endif /* __ASM_AVR32_DIV64_H */
diff --git a/include/asm-avr32/dma-mapping.h b/include/asm-avr32/dma-mapping.h
new file mode 100644
index 00000000000..4c40cb41cdf
--- /dev/null
+++ b/include/asm-avr32/dma-mapping.h
@@ -0,0 +1,320 @@
+#ifndef __ASM_AVR32_DMA_MAPPING_H
+#define __ASM_AVR32_DMA_MAPPING_H
+
+#include <linux/mm.h>
+#include <linux/device.h>
+#include <asm/scatterlist.h>
+#include <asm/processor.h>
+#include <asm/cacheflush.h>
+#include <asm/io.h>
+
+extern void dma_cache_sync(void *vaddr, size_t size, int direction);
+
+/*
+ * Return whether the given device DMA address mask can be supported
+ * properly.  For example, if your device can only drive the low 24-bits
+ * during bus mastering, then you would pass 0x00ffffff as the mask
+ * to this function.
+ */
+static inline int dma_supported(struct device *dev, u64 mask)
+{
+	/* Fix when needed. I really don't know of any limitations */
+	return 1;
+}
+
+static inline int dma_set_mask(struct device *dev, u64 dma_mask)
+{
+	if (!dev->dma_mask || !dma_supported(dev, dma_mask))
+		return -EIO;
+
+	*dev->dma_mask = dma_mask;
+	return 0;
+}
+
+/**
+ * dma_alloc_coherent - allocate consistent memory for DMA
+ * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
+ * @size: required memory size
+ * @handle: bus-specific DMA address
+ *
+ * Allocate some uncached, unbuffered memory for a device for
+ * performing DMA.  This function allocates pages, and will
+ * return the CPU-viewed address, and sets @handle to be the
+ * device-viewed address.
+ */
+extern void *dma_alloc_coherent(struct device *dev, size_t size,
+				dma_addr_t *handle, gfp_t gfp);
+
+/**
+ * dma_free_coherent - free memory allocated by dma_alloc_coherent
+ * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
+ * @size: size of memory originally requested in dma_alloc_coherent
+ * @cpu_addr: CPU-view address returned from dma_alloc_coherent
+ * @handle: device-view address returned from dma_alloc_coherent
+ *
+ * Free (and unmap) a DMA buffer previously allocated by
+ * dma_alloc_coherent().
+ *
+ * References to memory and mappings associated with cpu_addr/handle
+ * during and after this call executing are illegal.
+ */
+extern void dma_free_coherent(struct device *dev, size_t size,
+			      void *cpu_addr, dma_addr_t handle);
+
+/**
+ * dma_alloc_writecombine - allocate write-combining memory for DMA
+ * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
+ * @size: required memory size
+ * @handle: bus-specific DMA address
+ *
+ * Allocate some uncached, buffered memory for a device for
+ * performing DMA.  This function allocates pages, and will
+ * return the CPU-viewed address, and sets @handle to be the
+ * device-viewed address.
+ */
+extern void *dma_alloc_writecombine(struct device *dev, size_t size,
+				    dma_addr_t *handle, gfp_t gfp);
+
+/**
+ * dma_free_coherent - free memory allocated by dma_alloc_writecombine
+ * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
+ * @size: size of memory originally requested in dma_alloc_writecombine
+ * @cpu_addr: CPU-view address returned from dma_alloc_writecombine
+ * @handle: device-view address returned from dma_alloc_writecombine
+ *
+ * Free (and unmap) a DMA buffer previously allocated by
+ * dma_alloc_writecombine().
+ *
+ * References to memory and mappings associated with cpu_addr/handle
+ * during and after this call executing are illegal.
+ */
+extern void dma_free_writecombine(struct device *dev, size_t size,
+				  void *cpu_addr, dma_addr_t handle);
+
+/**
+ * dma_map_single - map a single buffer for streaming DMA
+ * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
+ * @cpu_addr: CPU direct mapped address of buffer
+ * @size: size of buffer to map
+ * @dir: DMA transfer direction
+ *
+ * Ensure that any data held in the cache is appropriately discarded
+ * or written back.
+ *
+ * The device owns this memory once this call has completed.  The CPU
+ * can regain ownership by calling dma_unmap_single() or dma_sync_single().
+ */
+static inline dma_addr_t
+dma_map_single(struct device *dev, void *cpu_addr, size_t size,
+	       enum dma_data_direction direction)
+{
+	dma_cache_sync(cpu_addr, size, direction);
+	return virt_to_bus(cpu_addr);
+}
+
+/**
+ * dma_unmap_single - unmap a single buffer previously mapped
+ * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
+ * @handle: DMA address of buffer
+ * @size: size of buffer to map
+ * @dir: DMA transfer direction
+ *
+ * Unmap a single streaming mode DMA translation.  The handle and size
+ * must match what was provided in the previous dma_map_single() call.
+ * All other usages are undefined.
+ *
+ * After this call, reads by the CPU to the buffer are guaranteed to see
+ * whatever the device wrote there.
+ */
+static inline void
+dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
+		 enum dma_data_direction direction)
+{
+
+}
+
+/**
+ * dma_map_page - map a portion of a page for streaming DMA
+ * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
+ * @page: page that buffer resides in
+ * @offset: offset into page for start of buffer
+ * @size: size of buffer to map
+ * @dir: DMA transfer direction
+ *
+ * Ensure that any data held in the cache is appropriately discarded
+ * or written back.
+ *
+ * The device owns this memory once this call has completed.  The CPU
+ * can regain ownership by calling dma_unmap_page() or dma_sync_single().
+ */
+static inline dma_addr_t
+dma_map_page(struct device *dev, struct page *page,
+	     unsigned long offset, size_t size,
+	     enum dma_data_direction direction)
+{
+	return dma_map_single(dev, page_address(page) + offset,
+			      size, direction);
+}
+
+/**
+ * dma_unmap_page - unmap a buffer previously mapped through dma_map_page()
+ * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
+ * @handle: DMA address of buffer
+ * @size: size of buffer to map
+ * @dir: DMA transfer direction
+ *
+ * Unmap a single streaming mode DMA translation.  The handle and size
+ * must match what was provided in the previous dma_map_single() call.
+ * All other usages are undefined.
+ *
+ * After this call, reads by the CPU to the buffer are guaranteed to see
+ * whatever the device wrote there.
+ */
+static inline void
+dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
+	       enum dma_data_direction direction)
+{
+	dma_unmap_single(dev, dma_address, size, direction);
+}
+
+/**
+ * dma_map_sg - map a set of SG buffers for streaming mode DMA
+ * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
+ * @sg: list of buffers
+ * @nents: number of buffers to map
+ * @dir: DMA transfer direction
+ *
+ * Map a set of buffers described by scatterlist in streaming
+ * mode for DMA.  This is the scatter-gather version of the
+ * above pci_map_single interface.  Here the scatter gather list
+ * elements are each tagged with the appropriate dma address
+ * and length.  They are obtained via sg_dma_{address,length}(SG).
+ *
+ * NOTE: An implementation may be able to use a smaller number of
+ *       DMA address/length pairs than there are SG table elements.
+ *       (for example via virtual mapping capabilities)
+ *       The routine returns the number of addr/length pairs actually
+ *       used, at most nents.
+ *
+ * Device ownership issues as mentioned above for pci_map_single are
+ * the same here.
+ */
+static inline int
+dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
+	   enum dma_data_direction direction)
+{
+	int i;
+
+	for (i = 0; i < nents; i++) {
+		char *virt;
+
+		sg[i].dma_address = page_to_bus(sg[i].page) + sg[i].offset;
+		virt = page_address(sg[i].page) + sg[i].offset;
+		dma_cache_sync(virt, sg[i].length, direction);
+	}
+
+	return nents;
+}
+
+/**
+ * dma_unmap_sg - unmap a set of SG buffers mapped by dma_map_sg
+ * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
+ * @sg: list of buffers
+ * @nents: number of buffers to map
+ * @dir: DMA transfer direction
+ *
+ * Unmap a set of streaming mode DMA translations.
+ * Again, CPU read rules concerning calls here are the same as for
+ * pci_unmap_single() above.
+ */
+static inline void
+dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nhwentries,
+	     enum dma_data_direction direction)
+{
+
+}
+
+/**
+ * dma_sync_single_for_cpu
+ * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
+ * @handle: DMA address of buffer
+ * @size: size of buffer to map
+ * @dir: DMA transfer direction
+ *
+ * Make physical memory consistent for a single streaming mode DMA
+ * translation after a transfer.
+ *
+ * If you perform a dma_map_single() but wish to interrogate the
+ * buffer using the cpu, yet do not wish to teardown the DMA mapping,
+ * you must call this function before doing so.  At the next point you
+ * give the DMA address back to the card, you must first perform a
+ * dma_sync_single_for_device, and then the device again owns the
+ * buffer.
+ */
+static inline void
+dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle,
+			size_t size, enum dma_data_direction direction)
+{
+	dma_cache_sync(bus_to_virt(dma_handle), size, direction);
+}
+
+static inline void
+dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle,
+			   size_t size, enum dma_data_direction direction)
+{
+	dma_cache_sync(bus_to_virt(dma_handle), size, direction);
+}
+
+/**
+ * dma_sync_sg_for_cpu
+ * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
+ * @sg: list of buffers
+ * @nents: number of buffers to map
+ * @dir: DMA transfer direction
+ *
+ * Make physical memory consistent for a set of streaming
+ * mode DMA translations after a transfer.
+ *
+ * The same as dma_sync_single_for_* but for a scatter-gather list,
+ * same rules and usage.
+ */
+static inline void
+dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
+		    int nents, enum dma_data_direction direction)
+{
+	int i;
+
+	for (i = 0; i < nents; i++) {
+		dma_cache_sync(page_address(sg[i].page) + sg[i].offset,
+			       sg[i].length, direction);
+	}
+}
+
+static inline void
+dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
+		       int nents, enum dma_data_direction direction)
+{
+	int i;
+
+	for (i = 0; i < nents; i++) {
+		dma_cache_sync(page_address(sg[i].page) + sg[i].offset,
+			       sg[i].length, direction);
+	}
+}
+
+/* Now for the API extensions over the pci_ one */
+
+#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
+#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
+
+static inline int dma_is_consistent(dma_addr_t dma_addr)
+{
+	return 1;
+}
+
+static inline int dma_get_cache_alignment(void)
+{
+	return boot_cpu_data.dcache.linesz;
+}
+
+#endif /* __ASM_AVR32_DMA_MAPPING_H */
diff --git a/include/asm-avr32/dma.h b/include/asm-avr32/dma.h
new file mode 100644
index 00000000000..9e91205590a
--- /dev/null
+++ b/include/asm-avr32/dma.h
@@ -0,0 +1,8 @@
+#ifndef __ASM_AVR32_DMA_H
+#define __ASM_AVR32_DMA_H
+
+/* The maximum address that we can perform a DMA transfer to on this platform.
+ * Not really applicable to AVR32, but some functions need it. */
+#define MAX_DMA_ADDRESS		0xffffffff
+
+#endif /* __ASM_AVR32_DMA_H */
diff --git a/include/asm-avr32/elf.h b/include/asm-avr32/elf.h
new file mode 100644
index 00000000000..d334b4994d2
--- /dev/null
+++ b/include/asm-avr32/elf.h
@@ -0,0 +1,110 @@
+#ifndef __ASM_AVR32_ELF_H
+#define __ASM_AVR32_ELF_H
+
+/* AVR32 relocation numbers */
+#define R_AVR32_NONE		0
+#define R_AVR32_32		1
+#define R_AVR32_16		2
+#define R_AVR32_8		3
+#define R_AVR32_32_PCREL	4
+#define R_AVR32_16_PCREL	5
+#define R_AVR32_8_PCREL		6
+#define R_AVR32_DIFF32		7
+#define R_AVR32_DIFF16		8
+#define R_AVR32_DIFF8		9
+#define R_AVR32_GOT32		10
+#define R_AVR32_GOT16		11
+#define R_AVR32_GOT8		12
+#define R_AVR32_21S		13
+#define R_AVR32_16U		14
+#define R_AVR32_16S		15
+#define R_AVR32_8S		16
+#define R_AVR32_8S_EXT		17
+#define R_AVR32_22H_PCREL	18
+#define R_AVR32_18W_PCREL	19
+#define R_AVR32_16B_PCREL	20
+#define R_AVR32_16N_PCREL	21
+#define R_AVR32_14UW_PCREL	22
+#define R_AVR32_11H_PCREL	23
+#define R_AVR32_10UW_PCREL	24
+#define R_AVR32_9H_PCREL	25
+#define R_AVR32_9UW_PCREL	26
+#define R_AVR32_HI16		27
+#define R_AVR32_LO16		28
+#define R_AVR32_GOTPC		29
+#define R_AVR32_GOTCALL		30
+#define R_AVR32_LDA_GOT		31
+#define R_AVR32_GOT21S		32
+#define R_AVR32_GOT18SW		33
+#define R_AVR32_GOT16S		34
+#define R_AVR32_GOT7UW		35
+#define R_AVR32_32_CPENT	36
+#define R_AVR32_CPCALL		37
+#define R_AVR32_16_CP		38
+#define R_AVR32_9W_CP		39
+#define R_AVR32_RELATIVE	40
+#define R_AVR32_GLOB_DAT	41
+#define R_AVR32_JMP_SLOT	42
+#define R_AVR32_ALIGN		43
+
+/*
+ * ELF register definitions..
+ */
+
+#include <asm/ptrace.h>
+#include <asm/user.h>
+
+typedef unsigned long elf_greg_t;
+
+#define ELF_NGREG (sizeof (struct pt_regs) / sizeof (elf_greg_t))
+typedef elf_greg_t elf_gregset_t[ELF_NGREG];
+
+typedef struct user_fpu_struct elf_fpregset_t;
+
+/*
+ * This is used to ensure we don't load something for the wrong architecture.
+ */
+#define elf_check_arch(x) ( (x)->e_machine == EM_AVR32 )
+
+/*
+ * These are used to set parameters in the core dumps.
+ */
+#define ELF_CLASS	ELFCLASS32
+#ifdef __LITTLE_ENDIAN__
+#define ELF_DATA	ELFDATA2LSB
+#else
+#define ELF_DATA	ELFDATA2MSB
+#endif
+#define ELF_ARCH	EM_AVR32
+
+#define USE_ELF_CORE_DUMP
+#define ELF_EXEC_PAGESIZE	4096
+
+/* This is the location that an ET_DYN program is loaded if exec'ed.  Typical
+   use of this is to invoke "./ld.so someprog" to test out a new version of
+   the loader.  We need to make sure that it is out of the way of the program
+   that it will "exec", and that there is sufficient room for the brk.  */
+
+#define ELF_ET_DYN_BASE         (2 * TASK_SIZE / 3)
+
+
+/* This yields a mask that user programs can use to figure out what
+   instruction set this CPU supports.  This could be done in user space,
+   but it's not easy, and we've already done it here.  */
+
+#define ELF_HWCAP	(0)
+
+/* This yields a string that ld.so will use to load implementation
+   specific libraries for optimization.  This is more specific in
+   intent than poking at uname or /proc/cpuinfo.
+
+   For the moment, we have only optimizations for the Intel generations,
+   but that could change... */
+
+#define ELF_PLATFORM  (NULL)
+
+#ifdef __KERNEL__
+#define SET_PERSONALITY(ex, ibcs2) set_personality(PER_LINUX_32BIT)
+#endif
+
+#endif /* __ASM_AVR32_ELF_H */
diff --git a/include/asm-avr32/emergency-restart.h b/include/asm-avr32/emergency-restart.h
new file mode 100644
index 00000000000..3e7e014776b
--- /dev/null
+++ b/include/asm-avr32/emergency-restart.h
@@ -0,0 +1,6 @@
+#ifndef __ASM_AVR32_EMERGENCY_RESTART_H
+#define __ASM_AVR32_EMERGENCY_RESTART_H
+
+#include <asm-generic/emergency-restart.h>
+
+#endif /* __ASM_AVR32_EMERGENCY_RESTART_H */
diff --git a/include/asm-avr32/errno.h b/include/asm-avr32/errno.h
new file mode 100644
index 00000000000..558a7249f06
--- /dev/null
+++ b/include/asm-avr32/errno.h
@@ -0,0 +1,6 @@
+#ifndef __ASM_AVR32_ERRNO_H
+#define __ASM_AVR32_ERRNO_H
+
+#include <asm-generic/errno.h>
+
+#endif /* __ASM_AVR32_ERRNO_H */
diff --git a/include/asm-avr32/fcntl.h b/include/asm-avr32/fcntl.h
new file mode 100644
index 00000000000..14c0c4402b1
--- /dev/null
+++ b/include/asm-avr32/fcntl.h
@@ -0,0 +1,6 @@
+#ifndef __ASM_AVR32_FCNTL_H
+#define __ASM_AVR32_FCNTL_H
+
+#include <asm-generic/fcntl.h>
+
+#endif /* __ASM_AVR32_FCNTL_H */
diff --git a/include/asm-avr32/futex.h b/include/asm-avr32/futex.h
new file mode 100644
index 00000000000..10419f14a68
--- /dev/null
+++ b/include/asm-avr32/futex.h
@@ -0,0 +1,6 @@
+#ifndef __ASM_AVR32_FUTEX_H
+#define __ASM_AVR32_FUTEX_H
+
+#include <asm-generic/futex.h>
+
+#endif /* __ASM_AVR32_FUTEX_H */
diff --git a/include/asm-avr32/hardirq.h b/include/asm-avr32/hardirq.h
new file mode 100644
index 00000000000..267354356f6
--- /dev/null
+++ b/include/asm-avr32/hardirq.h
@@ -0,0 +1,34 @@
+#ifndef __ASM_AVR32_HARDIRQ_H
+#define __ASM_AVR32_HARDIRQ_H
+
+#include <linux/threads.h>
+#include <asm/irq.h>
+
+#ifndef __ASSEMBLY__
+
+#include <linux/cache.h>
+
+/* entry.S is sensitive to the offsets of these fields */
+typedef struct {
+	unsigned int __softirq_pending;
+} ____cacheline_aligned irq_cpustat_t;
+
+void ack_bad_irq(unsigned int irq);
+
+/* Standard mappings for irq_cpustat_t above */
+#include <linux/irq_cpustat.h>
+
+#endif /* __ASSEMBLY__ */
+
+#define HARDIRQ_BITS	12
+
+/*
+ * The hardirq mask has to be large enough to have
+ * space for potentially all IRQ sources in the system
+ * nesting on a single CPU:
+ */
+#if (1 << HARDIRQ_BITS) < NR_IRQS
+# error HARDIRQ_BITS is too low!
+#endif
+
+#endif /* __ASM_AVR32_HARDIRQ_H */
diff --git a/include/asm-avr32/hw_irq.h b/include/asm-avr32/hw_irq.h
new file mode 100644
index 00000000000..218b0a6bfd1
--- /dev/null
+++ b/include/asm-avr32/hw_irq.h
@@ -0,0 +1,9 @@
+#ifndef __ASM_AVR32_HW_IRQ_H
+#define __ASM_AVR32_HW_IRQ_H
+
+static inline void hw_resend_irq(struct hw_interrupt_type *h, unsigned int i)
+{
+	/* Nothing to do */
+}
+
+#endif /* __ASM_AVR32_HW_IRQ_H */
diff --git a/include/asm-avr32/intc.h b/include/asm-avr32/intc.h
new file mode 100644
index 00000000000..1ac9ca75e8f
--- /dev/null
+++ b/include/asm-avr32/intc.h
@@ -0,0 +1,128 @@
+#ifndef __ASM_AVR32_INTC_H
+#define __ASM_AVR32_INTC_H
+
+#include <linux/sysdev.h>
+#include <linux/interrupt.h>
+
+struct irq_controller;
+struct irqaction;
+struct pt_regs;
+
+struct platform_device;
+
+/* Information about the internal interrupt controller */
+struct intc_device {
+	/* ioremapped address of configuration block */
+	void __iomem *regs;
+
+	/* the physical device */
+	struct platform_device *pdev;
+
+	/* Number of interrupt lines per group. */
+	unsigned int irqs_per_group;
+
+	/* The highest group ID + 1 */
+	unsigned int nr_groups;
+
+	/*
+	 * Bitfield indicating which groups are actually in use.  The
+	 * size of the array is
+	 * ceil(group_max / (8 * sizeof(unsigned int))).
+	 */
+	unsigned int group_mask[];
+};
+
+struct irq_controller_class {
+	/*
+	 * A short name identifying this kind of controller.
+	 */
+	const char *typename;
+	/*
+	 * Handle the IRQ.  Must do any necessary acking and masking.
+	 */
+	irqreturn_t (*handle)(int irq, void *dev_id, struct pt_regs *regs);
+	/*
+	 * Register a new IRQ handler.
+	 */
+	int (*setup)(struct irq_controller *ctrl, unsigned int irq,
+		     struct irqaction *action);
+	/*
+	 * Unregister a IRQ handler.
+	 */
+	void (*free)(struct irq_controller *ctrl, unsigned int irq,
+		     void *dev_id);
+	/*
+	 * Mask the IRQ in the interrupt controller.
+	 */
+	void (*mask)(struct irq_controller *ctrl, unsigned int irq);
+	/*
+	 * Unmask the IRQ in the interrupt controller.
+	 */
+	void (*unmask)(struct irq_controller *ctrl, unsigned int irq);
+	/*
+	 * Set the type of the IRQ. See below for possible types.
+	 * Return -EINVAL if a given type is not supported
+	 */
+	int (*set_type)(struct irq_controller *ctrl, unsigned int irq,
+			unsigned int type);
+	/*
+	 * Return the IRQ type currently set
+	 */
+	unsigned int (*get_type)(struct irq_controller *ctrl, unsigned int irq);
+};
+
+struct irq_controller {
+	struct irq_controller_class *class;
+	unsigned int irq_group;
+	unsigned int first_irq;
+	unsigned int nr_irqs;
+	struct list_head list;
+};
+
+struct intc_group_desc {
+	struct irq_controller *ctrl;
+	irqreturn_t (*handle)(int, void *, struct pt_regs *);
+	unsigned long flags;
+	void *dev_id;
+	const char *devname;
+};
+
+/*
+ * The internal interrupt controller.  Defined in board/part-specific
+ * devices.c.
+ * TODO: Should probably be defined per-cpu.
+ */
+extern struct intc_device intc;
+
+extern int request_internal_irq(unsigned int irq,
+				irqreturn_t (*handler)(int, void *, struct pt_regs *),
+				unsigned long irqflags,
+				const char *devname, void *dev_id);
+extern void free_internal_irq(unsigned int irq);
+
+/* Only used by time_init() */
+extern int setup_internal_irq(unsigned int irq, struct intc_group_desc *desc);
+
+/*
+ * Set interrupt priority for a given group. `group' can be found by
+ * using irq_to_group(irq). Priority can be from 0 (lowest) to 3
+ * (highest). Higher-priority interrupts will preempt lower-priority
+ * interrupts (unless interrupts are masked globally).
+ *
+ * This function does not check for conflicts within a group.
+ */
+extern int intc_set_priority(unsigned int group,
+			     unsigned int priority);
+
+/*
+ * Returns a bitmask of pending interrupts in a group.
+ */
+extern unsigned long intc_get_pending(unsigned int group);
+
+/*
+ * Register a new external interrupt controller.  Returns the first
+ * external IRQ number that is assigned to the new controller.
+ */
+extern int intc_register_controller(struct irq_controller *ctrl);
+
+#endif /* __ASM_AVR32_INTC_H */
diff --git a/include/asm-avr32/io.h b/include/asm-avr32/io.h
new file mode 100644
index 00000000000..2fc8f111dce
--- /dev/null
+++ b/include/asm-avr32/io.h
@@ -0,0 +1,253 @@
+#ifndef __ASM_AVR32_IO_H
+#define __ASM_AVR32_IO_H
+
+#include <linux/string.h>
+
+#ifdef __KERNEL__
+
+#include <asm/addrspace.h>
+#include <asm/byteorder.h>
+
+/* virt_to_phys will only work when address is in P1 or P2 */
+static __inline__ unsigned long virt_to_phys(volatile void *address)
+{
+	return PHYSADDR(address);
+}
+
+static __inline__ void * phys_to_virt(unsigned long address)
+{
+	return (void *)P1SEGADDR(address);
+}
+
+#define cached_to_phys(addr)	((unsigned long)PHYSADDR(addr))
+#define uncached_to_phys(addr)	((unsigned long)PHYSADDR(addr))
+#define phys_to_cached(addr)	((void *)P1SEGADDR(addr))
+#define phys_to_uncached(addr)	((void *)P2SEGADDR(addr))
+
+/*
+ * Generic IO read/write.  These perform native-endian accesses.  Note
+ * that some architectures will want to re-define __raw_{read,write}w.
+ */
+extern void __raw_writesb(unsigned int addr, const void *data, int bytelen);
+extern void __raw_writesw(unsigned int addr, const void *data, int wordlen);
+extern void __raw_writesl(unsigned int addr, const void *data, int longlen);
+
+extern void __raw_readsb(unsigned int addr, void *data, int bytelen);
+extern void __raw_readsw(unsigned int addr, void *data, int wordlen);
+extern void __raw_readsl(unsigned int addr, void *data, int longlen);
+
+static inline void writeb(unsigned char b, volatile void __iomem *addr)
+{
+	*(volatile unsigned char __force *)addr = b;
+}
+static inline void writew(unsigned short b, volatile void __iomem *addr)
+{
+	*(volatile unsigned short __force *)addr = b;
+}
+static inline void writel(unsigned int b, volatile void __iomem *addr)
+{
+	*(volatile unsigned int __force *)addr = b;
+}
+#define __raw_writeb writeb
+#define __raw_writew writew
+#define __raw_writel writel
+
+static inline unsigned char readb(const volatile void __iomem *addr)
+{
+	return *(const volatile unsigned char __force *)addr;
+}
+static inline unsigned short readw(const volatile void __iomem *addr)
+{
+	return *(const volatile unsigned short __force *)addr;
+}
+static inline unsigned int readl(const volatile void __iomem *addr)
+{
+	return *(const volatile unsigned int __force *)addr;
+}
+#define __raw_readb readb
+#define __raw_readw readw
+#define __raw_readl readl
+
+#define writesb(p, d, l)	__raw_writesb((unsigned int)p, d, l)
+#define writesw(p, d, l)	__raw_writesw((unsigned int)p, d, l)
+#define writesl(p, d, l)	__raw_writesl((unsigned int)p, d, l)
+
+#define readsb(p, d, l)		__raw_readsb((unsigned int)p, d, l)
+#define readsw(p, d, l)		__raw_readsw((unsigned int)p, d, l)
+#define readsl(p, d, l)		__raw_readsl((unsigned int)p, d, l)
+
+/*
+ * These two are only here because ALSA _thinks_ it needs them...
+ */
+static inline void memcpy_fromio(void * to, const volatile void __iomem *from,
+				 unsigned long count)
+{
+	char *p = to;
+	while (count) {
+		count--;
+		*p = readb(from);
+		p++;
+		from++;
+	}
+}
+
+static inline void  memcpy_toio(volatile void __iomem *to, const void * from,
+				unsigned long count)
+{
+	const char *p = from;
+	while (count) {
+		count--;
+		writeb(*p, to);
+		p++;
+		to++;
+	}
+}
+
+static inline void memset_io(volatile void __iomem *addr, unsigned char val,
+			     unsigned long count)
+{
+	memset((void __force *)addr, val, count);
+}
+
+/*
+ * Bad read/write accesses...
+ */
+extern void __readwrite_bug(const char *fn);
+
+#define IO_SPACE_LIMIT	0xffffffff
+
+/* Convert I/O port address to virtual address */
+#define __io(p)		((void __iomem *)phys_to_uncached(p))
+
+/*
+ *  IO port access primitives
+ *  -------------------------
+ *
+ * The AVR32 doesn't have special IO access instructions; all IO is memory
+ * mapped. Note that these are defined to perform little endian accesses
+ * only. Their primary purpose is to access PCI and ISA peripherals.
+ *
+ * Note that for a big endian machine, this implies that the following
+ * big endian mode connectivity is in place.
+ *
+ * The machine specific io.h include defines __io to translate an "IO"
+ * address to a memory address.
+ *
+ * Note that we prevent GCC re-ordering or caching values in expressions
+ * by introducing sequence points into the in*() definitions.  Note that
+ * __raw_* do not guarantee this behaviour.
+ *
+ * The {in,out}[bwl] macros are for emulating x86-style PCI/ISA IO space.
+ */
+#define outb(v, p)		__raw_writeb(v, __io(p))
+#define outw(v, p)		__raw_writew(cpu_to_le16(v), __io(p))
+#define outl(v, p)		__raw_writel(cpu_to_le32(v), __io(p))
+
+#define inb(p)			__raw_readb(__io(p))
+#define inw(p)			le16_to_cpu(__raw_readw(__io(p)))
+#define inl(p)			le32_to_cpu(__raw_readl(__io(p)))
+
+static inline void __outsb(unsigned long port, void *addr, unsigned int count)
+{
+	while (count--) {
+		outb(*(u8 *)addr, port);
+		addr++;
+	}
+}
+
+static inline void __insb(unsigned long port, void *addr, unsigned int count)
+{
+	while (count--) {
+		*(u8 *)addr = inb(port);
+		addr++;
+	}
+}
+
+static inline void __outsw(unsigned long port, void *addr, unsigned int count)
+{
+	while (count--) {
+		outw(*(u16 *)addr, port);
+		addr += 2;
+	}
+}
+
+static inline void __insw(unsigned long port, void *addr, unsigned int count)
+{
+	while (count--) {
+		*(u16 *)addr = inw(port);
+		addr += 2;
+	}
+}
+
+static inline void __outsl(unsigned long port, void *addr, unsigned int count)
+{
+	while (count--) {
+		outl(*(u32 *)addr, port);
+		addr += 4;
+	}
+}
+
+static inline void __insl(unsigned long port, void *addr, unsigned int count)
+{
+	while (count--) {
+		*(u32 *)addr = inl(port);
+		addr += 4;
+	}
+}
+
+#define outsb(port, addr, count)	__outsb(port, addr, count)
+#define insb(port, addr, count)		__insb(port, addr, count)
+#define outsw(port, addr, count)	__outsw(port, addr, count)
+#define insw(port, addr, count)		__insw(port, addr, count)
+#define outsl(port, addr, count)	__outsl(port, addr, count)
+#define insl(port, addr, count)		__insl(port, addr, count)
+
+extern void __iomem *__ioremap(unsigned long offset, size_t size,
+			       unsigned long flags);
+extern void __iounmap(void __iomem *addr);
+
+/*
+ * ioremap	-   map bus memory into CPU space
+ * @offset	bus address of the memory
+ * @size	size of the resource to map
+ *
+ * ioremap performs a platform specific sequence of operations to make
+ * bus memory CPU accessible via the readb/.../writel functions and
+ * the other mmio helpers. The returned address is not guaranteed to
+ * be usable directly as a virtual address.
+ */
+#define ioremap(offset, size)			\
+	__ioremap((offset), (size), 0)
+
+#define iounmap(addr)				\
+	__iounmap(addr)
+
+#define cached(addr) P1SEGADDR(addr)
+#define uncached(addr) P2SEGADDR(addr)
+
+#define virt_to_bus virt_to_phys
+#define bus_to_virt phys_to_virt
+#define page_to_bus page_to_phys
+#define bus_to_page phys_to_page
+
+#define dma_cache_wback_inv(_start, _size)	\
+	flush_dcache_region(_start, _size)
+#define dma_cache_inv(_start, _size)		\
+	invalidate_dcache_region(_start, _size)
+#define dma_cache_wback(_start, _size)		\
+	clean_dcache_region(_start, _size)
+
+/*
+ * Convert a physical pointer to a virtual kernel pointer for /dev/mem
+ * access
+ */
+#define xlate_dev_mem_ptr(p)    __va(p)
+
+/*
+ * Convert a virtual cached pointer to an uncached pointer
+ */
+#define xlate_dev_kmem_ptr(p)   p
+
+#endif /* __KERNEL__ */
+
+#endif /* __ASM_AVR32_IO_H */
diff --git a/include/asm-avr32/ioctl.h b/include/asm-avr32/ioctl.h
new file mode 100644
index 00000000000..c8472c1398e
--- /dev/null
+++ b/include/asm-avr32/ioctl.h
@@ -0,0 +1,6 @@
+#ifndef __ASM_AVR32_IOCTL_H
+#define __ASM_AVR32_IOCTL_H
+
+#include <asm-generic/ioctl.h>
+
+#endif /* __ASM_AVR32_IOCTL_H */
diff --git a/include/asm-avr32/ioctls.h b/include/asm-avr32/ioctls.h
new file mode 100644
index 00000000000..0500426b718
--- /dev/null
+++ b/include/asm-avr32/ioctls.h
@@ -0,0 +1,83 @@
+#ifndef __ASM_AVR32_IOCTLS_H
+#define __ASM_AVR32_IOCTLS_H
+
+#include <asm/ioctl.h>
+
+/* 0x54 is just a magic number to make these relatively unique ('T') */
+
+#define TCGETS		0x5401
+#define TCSETS		0x5402 /* Clashes with SNDCTL_TMR_START sound ioctl */
+#define TCSETSW		0x5403
+#define TCSETSF		0x5404
+#define TCGETA		0x5405
+#define TCSETA		0x5406
+#define TCSETAW		0x5407
+#define TCSETAF		0x5408
+#define TCSBRK		0x5409
+#define TCXONC		0x540A
+#define TCFLSH		0x540B
+#define TIOCEXCL	0x540C
+#define TIOCNXCL	0x540D
+#define TIOCSCTTY	0x540E
+#define TIOCGPGRP	0x540F
+#define TIOCSPGRP	0x5410
+#define TIOCOUTQ	0x5411
+#define TIOCSTI		0x5412
+#define TIOCGWINSZ	0x5413
+#define TIOCSWINSZ	0x5414
+#define TIOCMGET	0x5415
+#define TIOCMBIS	0x5416
+#define TIOCMBIC	0x5417
+#define TIOCMSET	0x5418
+#define TIOCGSOFTCAR	0x5419
+#define TIOCSSOFTCAR	0x541A
+#define FIONREAD	0x541B
+#define TIOCINQ		FIONREAD
+#define TIOCLINUX	0x541C
+#define TIOCCONS	0x541D
+#define TIOCGSERIAL	0x541E
+#define TIOCSSERIAL	0x541F
+#define TIOCPKT		0x5420
+#define FIONBIO		0x5421
+#define TIOCNOTTY	0x5422
+#define TIOCSETD	0x5423
+#define TIOCGETD	0x5424
+#define TCSBRKP		0x5425	/* Needed for POSIX tcsendbreak() */
+/* #define TIOCTTYGSTRUCT 0x5426 - Former debugging-only ioctl */
+#define TIOCSBRK	0x5427  /* BSD compatibility */
+#define TIOCCBRK	0x5428  /* BSD compatibility */
+#define TIOCGSID	0x5429  /* Return the session ID of FD */
+#define TIOCGPTN	_IOR('T',0x30, unsigned int) /* Get Pty Number (of pty-mux device) */
+#define TIOCSPTLCK	_IOW('T',0x31, int)  /* Lock/unlock Pty */
+
+#define FIONCLEX	0x5450
+#define FIOCLEX		0x5451
+#define FIOASYNC	0x5452
+#define TIOCSERCONFIG	0x5453
+#define TIOCSERGWILD	0x5454
+#define TIOCSERSWILD	0x5455
+#define TIOCGLCKTRMIOS	0x5456
+#define TIOCSLCKTRMIOS	0x5457
+#define TIOCSERGSTRUCT	0x5458 /* For debugging only */
+#define TIOCSERGETLSR   0x5459 /* Get line status register */
+#define TIOCSERGETMULTI 0x545A /* Get multiport config  */
+#define TIOCSERSETMULTI 0x545B /* Set multiport config */
+
+#define TIOCMIWAIT	0x545C	/* wait for a change on serial input line(s) */
+#define TIOCGICOUNT	0x545D	/* read serial port inline interrupt counts */
+#define TIOCGHAYESESP   0x545E  /* Get Hayes ESP configuration */
+#define TIOCSHAYESESP   0x545F  /* Set Hayes ESP configuration */
+#define FIOQSIZE	0x5460
+
+/* Used for packet mode */
+#define TIOCPKT_DATA		 0
+#define TIOCPKT_FLUSHREAD	 1
+#define TIOCPKT_FLUSHWRITE	 2
+#define TIOCPKT_STOP		 4
+#define TIOCPKT_START		 8
+#define TIOCPKT_NOSTOP		16
+#define TIOCPKT_DOSTOP		32
+
+#define TIOCSER_TEMT    0x01	/* Transmitter physically empty */
+
+#endif /* __ASM_AVR32_IOCTLS_H */
diff --git a/include/asm-avr32/ipcbuf.h b/include/asm-avr32/ipcbuf.h
new file mode 100644
index 00000000000..1552c9698f5
--- /dev/null
+++ b/include/asm-avr32/ipcbuf.h
@@ -0,0 +1,29 @@
+#ifndef __ASM_AVR32_IPCBUF_H
+#define __ASM_AVR32_IPCBUF_H
+
+/*
+* The user_ipc_perm structure for AVR32 architecture.
+* Note extra padding because this structure is passed back and forth
+* between kernel and user space.
+*
+* Pad space is left for:
+* - 32-bit mode_t and seq
+* - 2 miscellaneous 32-bit values
+*/
+
+struct ipc64_perm
+{
+        __kernel_key_t          key;
+        __kernel_uid32_t        uid;
+        __kernel_gid32_t        gid;
+        __kernel_uid32_t        cuid;
+        __kernel_gid32_t        cgid;
+        __kernel_mode_t         mode;
+        unsigned short          __pad1;
+        unsigned short          seq;
+        unsigned short          __pad2;
+        unsigned long           __unused1;
+        unsigned long           __unused2;
+};
+
+#endif /* __ASM_AVR32_IPCBUF_H */
diff --git a/include/asm-avr32/irq.h b/include/asm-avr32/irq.h
new file mode 100644
index 00000000000..f7e725707dd
--- /dev/null
+++ b/include/asm-avr32/irq.h
@@ -0,0 +1,10 @@
+#ifndef __ASM_AVR32_IRQ_H
+#define __ASM_AVR32_IRQ_H
+
+#define NR_INTERNAL_IRQS	64
+#define NR_EXTERNAL_IRQS	64
+#define NR_IRQS			(NR_INTERNAL_IRQS + NR_EXTERNAL_IRQS)
+
+#define irq_canonicalize(i)	(i)
+
+#endif /* __ASM_AVR32_IOCTLS_H */
diff --git a/include/asm-avr32/irqflags.h b/include/asm-avr32/irqflags.h
new file mode 100644
index 00000000000..93570daac38
--- /dev/null
+++ b/include/asm-avr32/irqflags.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ASM_AVR32_IRQFLAGS_H
+#define __ASM_AVR32_IRQFLAGS_H
+
+#include <asm/sysreg.h>
+
+static inline unsigned long __raw_local_save_flags(void)
+{
+	return sysreg_read(SR);
+}
+
+#define raw_local_save_flags(x)					\
+	do { (x) = __raw_local_save_flags(); } while (0)
+
+/*
+ * This will restore ALL status register flags, not only the interrupt
+ * mask flag.
+ *
+ * The empty asm statement informs the compiler of this fact while
+ * also serving as a barrier.
+ */
+static inline void raw_local_irq_restore(unsigned long flags)
+{
+	sysreg_write(SR, flags);
+	asm volatile("" : : : "memory", "cc");
+}
+
+static inline void raw_local_irq_disable(void)
+{
+	asm volatile("ssrf %0" : : "n"(SYSREG_GM_OFFSET) : "memory");
+}
+
+static inline void raw_local_irq_enable(void)
+{
+	asm volatile("csrf %0" : : "n"(SYSREG_GM_OFFSET) : "memory");
+}
+
+static inline int raw_irqs_disabled_flags(unsigned long flags)
+{
+	return (flags & SYSREG_BIT(GM)) != 0;
+}
+
+static inline int raw_irqs_disabled(void)
+{
+	unsigned long flags = __raw_local_save_flags();
+
+	return raw_irqs_disabled_flags(flags);
+}
+
+static inline unsigned long __raw_local_irq_save(void)
+{
+	unsigned long flags = __raw_local_save_flags();
+
+	raw_local_irq_disable();
+
+	return flags;
+}
+
+#define raw_local_irq_save(flags)				\
+	do { (flags) = __raw_local_irq_save(); } while (0)
+
+#endif /* __ASM_AVR32_IRQFLAGS_H */
diff --git a/include/asm-avr32/kdebug.h b/include/asm-avr32/kdebug.h
new file mode 100644
index 00000000000..f583b643ffb
--- /dev/null
+++ b/include/asm-avr32/kdebug.h
@@ -0,0 +1,38 @@
+#ifndef __ASM_AVR32_KDEBUG_H
+#define __ASM_AVR32_KDEBUG_H
+
+#include <linux/notifier.h>
+
+struct pt_regs;
+
+struct die_args {
+	struct pt_regs *regs;
+	int trapnr;
+};
+
+int register_die_notifier(struct notifier_block *nb);
+int unregister_die_notifier(struct notifier_block *nb);
+int register_page_fault_notifier(struct notifier_block *nb);
+int unregister_page_fault_notifier(struct notifier_block *nb);
+extern struct atomic_notifier_head avr32_die_chain;
+
+/* Grossly misnamed. */
+enum die_val {
+	DIE_FAULT,
+	DIE_BREAKPOINT,
+	DIE_SSTEP,
+	DIE_PAGE_FAULT,
+};
+
+static inline int notify_die(enum die_val val, struct pt_regs *regs,
+			     int trap, int sig)
+{
+	struct die_args args = {
+		.regs = regs,
+		.trapnr = trap,
+	};
+
+	return atomic_notifier_call_chain(&avr32_die_chain, val, &args);
+}
+
+#endif /* __ASM_AVR32_KDEBUG_H */
diff --git a/include/asm-avr32/kmap_types.h b/include/asm-avr32/kmap_types.h
new file mode 100644
index 00000000000..b7f5c687010
--- /dev/null
+++ b/include/asm-avr32/kmap_types.h
@@ -0,0 +1,30 @@
+#ifndef __ASM_AVR32_KMAP_TYPES_H
+#define __ASM_AVR32_KMAP_TYPES_H
+
+#ifdef CONFIG_DEBUG_HIGHMEM
+# define D(n) __KM_FENCE_##n ,
+#else
+# define D(n)
+#endif
+
+enum km_type {
+D(0)	KM_BOUNCE_READ,
+D(1)	KM_SKB_SUNRPC_DATA,
+D(2)	KM_SKB_DATA_SOFTIRQ,
+D(3)	KM_USER0,
+D(4)	KM_USER1,
+D(5)	KM_BIO_SRC_IRQ,
+D(6)	KM_BIO_DST_IRQ,
+D(7)	KM_PTE0,
+D(8)	KM_PTE1,
+D(9)	KM_PTE2,
+D(10)	KM_IRQ0,
+D(11)	KM_IRQ1,
+D(12)	KM_SOFTIRQ0,
+D(13)	KM_SOFTIRQ1,
+D(14)	KM_TYPE_NR
+};
+
+#undef D
+
+#endif /* __ASM_AVR32_KMAP_TYPES_H */
diff --git a/include/asm-avr32/kprobes.h b/include/asm-avr32/kprobes.h
new file mode 100644
index 00000000000..09a5cbe2f89
--- /dev/null
+++ b/include/asm-avr32/kprobes.h
@@ -0,0 +1,34 @@
+/*
+ * Kernel Probes (KProbes)
+ *
+ * Copyright (C) 2005-2006 Atmel Corporation
+ * Copyright (C) IBM Corporation, 2002, 2004
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ASM_AVR32_KPROBES_H
+#define __ASM_AVR32_KPROBES_H
+
+#include <linux/types.h>
+
+typedef u16	kprobe_opcode_t;
+#define BREAKPOINT_INSTRUCTION	0xd673	/* breakpoint */
+#define MAX_INSN_SIZE		2
+
+#define ARCH_INACTIVE_KPROBE_COUNT 1
+
+#define arch_remove_kprobe(p)	do { } while (0)
+
+/* Architecture specific copy of original instruction */
+struct arch_specific_insn {
+	kprobe_opcode_t	insn[MAX_INSN_SIZE];
+};
+
+extern int kprobe_exceptions_notify(struct notifier_block *self,
+				    unsigned long val, void *data);
+
+#define flush_insn_slot(p)	do { } while (0)
+
+#endif /* __ASM_AVR32_KPROBES_H */
diff --git a/include/asm-avr32/linkage.h b/include/asm-avr32/linkage.h
new file mode 100644
index 00000000000..f7b285e910d
--- /dev/null
+++ b/include/asm-avr32/linkage.h
@@ -0,0 +1,7 @@
+#ifndef __ASM_LINKAGE_H
+#define __ASM_LINKAGE_H
+
+#define __ALIGN .balign 2
+#define __ALIGN_STR ".balign 2"
+
+#endif /* __ASM_LINKAGE_H */
diff --git a/include/asm-avr32/local.h b/include/asm-avr32/local.h
new file mode 100644
index 00000000000..1c1619694da
--- /dev/null
+++ b/include/asm-avr32/local.h
@@ -0,0 +1,6 @@
+#ifndef __ASM_AVR32_LOCAL_H
+#define __ASM_AVR32_LOCAL_H
+
+#include <asm-generic/local.h>
+
+#endif /* __ASM_AVR32_LOCAL_H */
diff --git a/include/asm-avr32/mach/serial_at91.h b/include/asm-avr32/mach/serial_at91.h
new file mode 100644
index 00000000000..1290bb32802
--- /dev/null
+++ b/include/asm-avr32/mach/serial_at91.h
@@ -0,0 +1,33 @@
+/*
+ *  linux/include/asm-arm/mach/serial_at91.h
+ *
+ *  Based on serial_sa1100.h  by Nicolas Pitre
+ *
+ *  Copyright (C) 2002 ATMEL Rousset
+ *
+ *  Low level machine dependent UART functions.
+ */
+
+struct uart_port;
+
+/*
+ * This is a temporary structure for registering these
+ * functions; it is intended to be discarded after boot.
+ */
+struct at91_port_fns {
+	void	(*set_mctrl)(struct uart_port *, u_int);
+	u_int	(*get_mctrl)(struct uart_port *);
+	void	(*enable_ms)(struct uart_port *);
+	void	(*pm)(struct uart_port *, u_int, u_int);
+	int	(*set_wake)(struct uart_port *, u_int);
+	int	(*open)(struct uart_port *);
+	void	(*close)(struct uart_port *);
+};
+
+#if defined(CONFIG_SERIAL_AT91)
+void at91_register_uart_fns(struct at91_port_fns *fns);
+#else
+#define at91_register_uart_fns(fns) do { } while (0)
+#endif
+
+
diff --git a/include/asm-avr32/mman.h b/include/asm-avr32/mman.h
new file mode 100644
index 00000000000..648f91e7187
--- /dev/null
+++ b/include/asm-avr32/mman.h
@@ -0,0 +1,17 @@
+#ifndef __ASM_AVR32_MMAN_H__
+#define __ASM_AVR32_MMAN_H__
+
+#include <asm-generic/mman.h>
+
+#define MAP_GROWSDOWN	0x0100		/* stack-like segment */
+#define MAP_DENYWRITE	0x0800		/* ETXTBSY */
+#define MAP_EXECUTABLE	0x1000		/* mark it as an executable */
+#define MAP_LOCKED	0x2000		/* pages are locked */
+#define MAP_NORESERVE	0x4000		/* don't check for reservations */
+#define MAP_POPULATE	0x8000		/* populate (prefault) page tables */
+#define MAP_NONBLOCK	0x10000		/* do not block on IO */
+
+#define MCL_CURRENT	1		/* lock all current mappings */
+#define MCL_FUTURE	2		/* lock all future mappings */
+
+#endif /* __ASM_AVR32_MMAN_H__ */
diff --git a/include/asm-avr32/mmu.h b/include/asm-avr32/mmu.h
new file mode 100644
index 00000000000..60c2d2650d3
--- /dev/null
+++ b/include/asm-avr32/mmu.h
@@ -0,0 +1,10 @@
+#ifndef __ASM_AVR32_MMU_H
+#define __ASM_AVR32_MMU_H
+
+/* Default "unsigned long" context */
+typedef unsigned long mm_context_t;
+
+#define MMU_ITLB_ENTRIES	64
+#define MMU_DTLB_ENTRIES	64
+
+#endif /* __ASM_AVR32_MMU_H */
diff --git a/include/asm-avr32/mmu_context.h b/include/asm-avr32/mmu_context.h
new file mode 100644
index 00000000000..31add1ae808
--- /dev/null
+++ b/include/asm-avr32/mmu_context.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * ASID handling taken from SH implementation.
+ *   Copyright (C) 1999 Niibe Yutaka
+ *   Copyright (C) 2003 Paul Mundt
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ASM_AVR32_MMU_CONTEXT_H
+#define __ASM_AVR32_MMU_CONTEXT_H
+
+#include <asm/tlbflush.h>
+#include <asm/pgalloc.h>
+#include <asm/sysreg.h>
+
+/*
+ * The MMU "context" consists of two things:
+ *    (a) TLB cache version
+ *    (b) ASID (Address Space IDentifier)
+ */
+#define MMU_CONTEXT_ASID_MASK		0x000000ff
+#define MMU_CONTEXT_VERSION_MASK	0xffffff00
+#define MMU_CONTEXT_FIRST_VERSION       0x00000100
+#define NO_CONTEXT			0
+
+#define MMU_NO_ASID			0x100
+
+/* Virtual Page Number mask */
+#define MMU_VPN_MASK	0xfffff000
+
+/* Cache of MMU context last used */
+extern unsigned long mmu_context_cache;
+
+/*
+ * Get MMU context if needed
+ */
+static inline void
+get_mmu_context(struct mm_struct *mm)
+{
+	unsigned long mc = mmu_context_cache;
+
+	if (((mm->context ^ mc) & MMU_CONTEXT_VERSION_MASK) == 0)
+		/* It's up to date, do nothing */
+		return;
+
+	/* It's old, we need to get new context with new version */
+	mc = ++mmu_context_cache;
+	if (!(mc & MMU_CONTEXT_ASID_MASK)) {
+		/*
+		 * We have exhausted all ASIDs of this version.
+		 * Flush the TLB and start new cycle.
+		 */
+		flush_tlb_all();
+		/*
+		 * Fix version. Note that we avoid version #0
+		 * to distinguish NO_CONTEXT.
+		 */
+		if (!mc)
+			mmu_context_cache = mc = MMU_CONTEXT_FIRST_VERSION;
+	}
+	mm->context = mc;
+}
+
+/*
+ * Initialize the context related info for a new mm_struct
+ * instance.
+ */
+static inline int init_new_context(struct task_struct *tsk,
+				       struct mm_struct *mm)
+{
+	mm->context = NO_CONTEXT;
+	return 0;
+}
+
+/*
+ * Destroy context related info for an mm_struct that is about
+ * to be put to rest.
+ */
+static inline void destroy_context(struct mm_struct *mm)
+{
+	/* Do nothing */
+}
+
+static inline void set_asid(unsigned long asid)
+{
+	/* XXX: We're destroying TLBEHI[8:31] */
+	sysreg_write(TLBEHI, asid & MMU_CONTEXT_ASID_MASK);
+	cpu_sync_pipeline();
+}
+
+static inline unsigned long get_asid(void)
+{
+	unsigned long asid;
+
+	asid = sysreg_read(TLBEHI);
+	return asid & MMU_CONTEXT_ASID_MASK;
+}
+
+static inline void activate_context(struct mm_struct *mm)
+{
+	get_mmu_context(mm);
+	set_asid(mm->context & MMU_CONTEXT_ASID_MASK);
+}
+
+static inline void switch_mm(struct mm_struct *prev,
+				 struct mm_struct *next,
+				 struct task_struct *tsk)
+{
+	if (likely(prev != next)) {
+		unsigned long __pgdir = (unsigned long)next->pgd;
+
+		sysreg_write(PTBR, __pgdir);
+		activate_context(next);
+	}
+}
+
+#define deactivate_mm(tsk,mm) do { } while(0)
+
+#define activate_mm(prev, next) switch_mm((prev), (next), NULL)
+
+static inline void
+enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
+{
+}
+
+
+static inline void enable_mmu(void)
+{
+	sysreg_write(MMUCR, (SYSREG_BIT(MMUCR_S)
+			     | SYSREG_BIT(E)
+			     | SYSREG_BIT(MMUCR_I)));
+	nop(); nop(); nop(); nop(); nop(); nop(); nop(); nop();
+
+	if (mmu_context_cache == NO_CONTEXT)
+		mmu_context_cache = MMU_CONTEXT_FIRST_VERSION;
+
+	set_asid(mmu_context_cache & MMU_CONTEXT_ASID_MASK);
+}
+
+static inline void disable_mmu(void)
+{
+	sysreg_write(MMUCR, SYSREG_BIT(MMUCR_S));
+}
+
+#endif /* __ASM_AVR32_MMU_CONTEXT_H */
diff --git a/include/asm-avr32/module.h b/include/asm-avr32/module.h
new file mode 100644
index 00000000000..451444538a1
--- /dev/null
+++ b/include/asm-avr32/module.h
@@ -0,0 +1,28 @@
+#ifndef __ASM_AVR32_MODULE_H
+#define __ASM_AVR32_MODULE_H
+
+struct mod_arch_syminfo {
+	unsigned long got_offset;
+	int got_initialized;
+};
+
+struct mod_arch_specific {
+	/* Starting offset of got in the module core memory. */
+	unsigned long got_offset;
+	/* Size of the got. */
+	unsigned long got_size;
+	/* Number of symbols in syminfo. */
+	int nsyms;
+	/* Additional symbol information (got offsets). */
+	struct mod_arch_syminfo *syminfo;
+};
+
+#define Elf_Shdr		Elf32_Shdr
+#define Elf_Sym			Elf32_Sym
+#define Elf_Ehdr		Elf32_Ehdr
+
+#define MODULE_PROC_FAMILY "AVR32v1"
+
+#define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY
+
+#endif /* __ASM_AVR32_MODULE_H */
diff --git a/include/asm-avr32/msgbuf.h b/include/asm-avr32/msgbuf.h
new file mode 100644
index 00000000000..ac18bc4da7f
--- /dev/null
+++ b/include/asm-avr32/msgbuf.h
@@ -0,0 +1,31 @@
+#ifndef __ASM_AVR32_MSGBUF_H
+#define __ASM_AVR32_MSGBUF_H
+
+/*
+ * The msqid64_ds structure for i386 architecture.
+ * Note extra padding because this structure is passed back and forth
+ * between kernel and user space.
+ *
+ * Pad space is left for:
+ * - 64-bit time_t to solve y2038 problem
+ * - 2 miscellaneous 32-bit values
+ */
+
+struct msqid64_ds {
+	struct ipc64_perm msg_perm;
+	__kernel_time_t msg_stime;	/* last msgsnd time */
+	unsigned long	__unused1;
+	__kernel_time_t msg_rtime;	/* last msgrcv time */
+	unsigned long	__unused2;
+	__kernel_time_t msg_ctime;	/* last change time */
+	unsigned long	__unused3;
+	unsigned long  msg_cbytes;	/* current number of bytes on queue */
+	unsigned long  msg_qnum;	/* number of messages in queue */
+	unsigned long  msg_qbytes;	/* max number of bytes on queue */
+	__kernel_pid_t msg_lspid;	/* pid of last msgsnd */
+	__kernel_pid_t msg_lrpid;	/* last receive pid */
+	unsigned long  __unused4;
+	unsigned long  __unused5;
+};
+
+#endif /* __ASM_AVR32_MSGBUF_H */
diff --git a/include/asm-avr32/mutex.h b/include/asm-avr32/mutex.h
new file mode 100644
index 00000000000..458c1f7fbc1
--- /dev/null
+++ b/include/asm-avr32/mutex.h
@@ -0,0 +1,9 @@
+/*
+ * Pull in the generic implementation for the mutex fastpath.
+ *
+ * TODO: implement optimized primitives instead, or leave the generic
+ * implementation in place, or pick the atomic_xchg() based generic
+ * implementation. (see asm-generic/mutex-xchg.h for details)
+ */
+
+#include <asm-generic/mutex-dec.h>
diff --git a/include/asm-avr32/namei.h b/include/asm-avr32/namei.h
new file mode 100644
index 00000000000..f0a26de06ca
--- /dev/null
+++ b/include/asm-avr32/namei.h
@@ -0,0 +1,7 @@
+#ifndef __ASM_AVR32_NAMEI_H
+#define __ASM_AVR32_NAMEI_H
+
+/* This dummy routine may be changed to something useful */
+#define __emul_prefix() NULL
+
+#endif /* __ASM_AVR32_NAMEI_H */
diff --git a/include/asm-avr32/numnodes.h b/include/asm-avr32/numnodes.h
new file mode 100644
index 00000000000..0b864d7ce33
--- /dev/null
+++ b/include/asm-avr32/numnodes.h
@@ -0,0 +1,7 @@
+#ifndef __ASM_AVR32_NUMNODES_H
+#define __ASM_AVR32_NUMNODES_H
+
+/* Max 4 nodes */
+#define NODES_SHIFT	2
+
+#endif /* __ASM_AVR32_NUMNODES_H */
diff --git a/include/asm-avr32/ocd.h b/include/asm-avr32/ocd.h
new file mode 100644
index 00000000000..46f73180a12
--- /dev/null
+++ b/include/asm-avr32/ocd.h
@@ -0,0 +1,78 @@
+/*
+ * AVR32 OCD Registers
+ *
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ASM_AVR32_OCD_H
+#define __ASM_AVR32_OCD_H
+
+/* Debug Registers */
+#define DBGREG_DID		  0
+#define DBGREG_DC		  8
+#define DBGREG_DS		 16
+#define DBGREG_RWCS		 28
+#define DBGREG_RWA		 36
+#define DBGREG_RWD		 40
+#define DBGREG_WT		 44
+#define DBGREG_DTC		 52
+#define DBGREG_DTSA0		 56
+#define DBGREG_DTSA1		 60
+#define DBGREG_DTEA0		 72
+#define DBGREG_DTEA1		 76
+#define DBGREG_BWC0A		 88
+#define DBGREG_BWC0B		 92
+#define DBGREG_BWC1A		 96
+#define DBGREG_BWC1B		100
+#define DBGREG_BWC2A		104
+#define DBGREG_BWC2B		108
+#define DBGREG_BWC3A		112
+#define DBGREG_BWC3B		116
+#define DBGREG_BWA0A		120
+#define DBGREG_BWA0B		124
+#define DBGREG_BWA1A		128
+#define DBGREG_BWA1B		132
+#define DBGREG_BWA2A		136
+#define DBGREG_BWA2B		140
+#define DBGREG_BWA3A		144
+#define DBGREG_BWA3B		148
+#define DBGREG_BWD3A		153
+#define DBGREG_BWD3B		156
+
+#define DBGREG_PID		284
+
+#define SABAH_OCD		0x01
+#define SABAH_ICACHE		0x02
+#define SABAH_MEM_CACHED	0x04
+#define SABAH_MEM_UNCACHED	0x05
+
+/* Fields in the Development Control register */
+#define DC_SS_BIT		8
+
+#define DC_SS			(1 <<  DC_SS_BIT)
+#define DC_DBE			(1 << 13)
+#define DC_RID			(1 << 27)
+#define DC_ORP			(1 << 28)
+#define DC_MM			(1 << 29)
+#define DC_RES			(1 << 30)
+
+/* Fields in the Development Status register */
+#define DS_SSS			(1 <<  0)
+#define DS_SWB			(1 <<  1)
+#define DS_HWB			(1 <<  2)
+#define DS_BP_SHIFT		8
+#define DS_BP_MASK		(0xff << DS_BP_SHIFT)
+
+#define __mfdr(addr)							\
+({									\
+	register unsigned long value;					\
+	asm volatile("mfdr	%0, %1" : "=r"(value) : "i"(addr));	\
+	value;								\
+})
+#define __mtdr(addr, value)						\
+	asm volatile("mtdr	%0, %1" : : "i"(addr), "r"(value))
+
+#endif /* __ASM_AVR32_OCD_H */
diff --git a/include/asm-avr32/page.h b/include/asm-avr32/page.h
new file mode 100644
index 00000000000..0f630b3e993
--- /dev/null
+++ b/include/asm-avr32/page.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ASM_AVR32_PAGE_H
+#define __ASM_AVR32_PAGE_H
+
+#ifdef __KERNEL__
+
+/* PAGE_SHIFT determines the page size */
+#define PAGE_SHIFT	12
+#ifdef __ASSEMBLY__
+#define PAGE_SIZE	(1 << PAGE_SHIFT)
+#else
+#define PAGE_SIZE	(1UL << PAGE_SHIFT)
+#endif
+#define PAGE_MASK	(~(PAGE_SIZE-1))
+#define PTE_MASK	PAGE_MASK
+
+#ifndef __ASSEMBLY__
+
+#include <asm/addrspace.h>
+
+extern void clear_page(void *to);
+extern void copy_page(void *to, void *from);
+
+#define clear_user_page(page, vaddr, pg)	clear_page(page)
+#define copy_user_page(to, from, vaddr, pg)	copy_page(to, from)
+
+/*
+ * These are used to make use of C type-checking..
+ */
+typedef struct { unsigned long pte; } pte_t;
+typedef struct { unsigned long pgd; } pgd_t;
+typedef struct { unsigned long pgprot; } pgprot_t;
+
+#define pte_val(x)		((x).pte)
+#define pgd_val(x)		((x).pgd)
+#define pgprot_val(x)		((x).pgprot)
+
+#define __pte(x)		((pte_t) { (x) })
+#define __pgd(x)		((pgd_t) { (x) })
+#define __pgprot(x)		((pgprot_t) { (x) })
+
+/* FIXME: These should be removed soon */
+extern unsigned long memory_start, memory_end;
+
+/* Pure 2^n version of get_order */
+static inline int get_order(unsigned long size)
+{
+	unsigned lz;
+
+	size = (size - 1) >> PAGE_SHIFT;
+	asm("clz %0, %1" : "=r"(lz) : "r"(size));
+	return 32 - lz;
+}
+
+#endif /* !__ASSEMBLY__ */
+
+/* Align the pointer to the (next) page boundary */
+#define PAGE_ALIGN(addr)	(((addr) + PAGE_SIZE - 1) & PAGE_MASK)
+
+/*
+ * The hardware maps the virtual addresses 0x80000000 -> 0x9fffffff
+ * permanently to the physical addresses 0x00000000 -> 0x1fffffff when
+ * segmentation is enabled. We want to make use of this in order to
+ * minimize TLB pressure.
+ */
+#define PAGE_OFFSET		(0x80000000UL)
+
+/*
+ * ALSA uses virt_to_page() on DMA pages, which I'm not entirely sure
+ * is a good idea. Anyway, we can't simply subtract PAGE_OFFSET here
+ * in that case, so we'll have to mask out the three most significant
+ * bits of the address instead...
+ *
+ * What's the difference between __pa() and virt_to_phys() anyway?
+ */
+#define __pa(x)		PHYSADDR(x)
+#define __va(x)		((void *)(P1SEGADDR(x)))
+
+#define MAP_NR(addr)	(((unsigned long)(addr) - PAGE_OFFSET) >> PAGE_SHIFT)
+
+#define phys_to_page(phys)	(pfn_to_page(phys >> PAGE_SHIFT))
+#define page_to_phys(page)	(page_to_pfn(page) << PAGE_SHIFT)
+
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+
+#define PHYS_PFN_OFFSET		(CONFIG_PHYS_OFFSET >> PAGE_SHIFT)
+
+#define pfn_to_page(pfn)	(mem_map + ((pfn) - PHYS_PFN_OFFSET))
+#define page_to_pfn(page)	((unsigned long)((page) - mem_map) + PHYS_PFN_OFFSET)
+#define pfn_valid(pfn)		((pfn) >= PHYS_PFN_OFFSET && (pfn) < (PHYS_PFN_OFFSET + max_mapnr))
+#endif /* CONFIG_NEED_MULTIPLE_NODES */
+
+#define virt_to_page(kaddr)	pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
+#define virt_addr_valid(kaddr)	pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
+
+#define VM_DATA_DEFAULT_FLAGS	(VM_READ | VM_WRITE |	\
+				 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+
+/*
+ * Memory above this physical address will be considered highmem.
+ */
+#define HIGHMEM_START		0x20000000UL
+
+#endif /* __KERNEL__ */
+
+#endif /* __ASM_AVR32_PAGE_H */
diff --git a/include/asm-avr32/param.h b/include/asm-avr32/param.h
new file mode 100644
index 00000000000..34bc8d4c3b2
--- /dev/null
+++ b/include/asm-avr32/param.h
@@ -0,0 +1,23 @@
+#ifndef __ASM_AVR32_PARAM_H
+#define __ASM_AVR32_PARAM_H
+
+#ifdef __KERNEL__
+# define HZ		CONFIG_HZ
+# define USER_HZ	100		/* User interfaces are in "ticks" */
+# define CLOCKS_PER_SEC	(USER_HZ)	/* frequency at which times() counts */
+#endif
+
+#ifndef HZ
+# define HZ		100
+#endif
+
+/* TODO: Should be configurable */
+#define EXEC_PAGESIZE	4096
+
+#ifndef NOGROUP
+# define NOGROUP	(-1)
+#endif
+
+#define MAXHOSTNAMELEN	64
+
+#endif /* __ASM_AVR32_PARAM_H */
diff --git a/include/asm-avr32/pci.h b/include/asm-avr32/pci.h
new file mode 100644
index 00000000000..0f5f134b896
--- /dev/null
+++ b/include/asm-avr32/pci.h
@@ -0,0 +1,8 @@
+#ifndef __ASM_AVR32_PCI_H__
+#define __ASM_AVR32_PCI_H__
+
+/* We don't support PCI yet, but some drivers require this file anyway */
+
+#define PCI_DMA_BUS_IS_PHYS	(1)
+
+#endif /* __ASM_AVR32_PCI_H__ */
diff --git a/include/asm-avr32/percpu.h b/include/asm-avr32/percpu.h
new file mode 100644
index 00000000000..69227b4cd0d
--- /dev/null
+++ b/include/asm-avr32/percpu.h
@@ -0,0 +1,6 @@
+#ifndef __ASM_AVR32_PERCPU_H
+#define __ASM_AVR32_PERCPU_H
+
+#include <asm-generic/percpu.h>
+
+#endif /* __ASM_AVR32_PERCPU_H */
diff --git a/include/asm-avr32/pgalloc.h b/include/asm-avr32/pgalloc.h
new file mode 100644
index 00000000000..7492cfb92ce
--- /dev/null
+++ b/include/asm-avr32/pgalloc.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ASM_AVR32_PGALLOC_H
+#define __ASM_AVR32_PGALLOC_H
+
+#include <asm/processor.h>
+#include <linux/threads.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+
+#define pmd_populate_kernel(mm, pmd, pte) \
+	set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)))
+
+static __inline__ void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
+				    struct page *pte)
+{
+	set_pmd(pmd, __pmd(_PAGE_TABLE + page_to_phys(pte)));
+}
+
+/*
+ * Allocate and free page tables
+ */
+static __inline__ pgd_t *pgd_alloc(struct mm_struct *mm)
+{
+	unsigned int pgd_size = (USER_PTRS_PER_PGD * sizeof(pgd_t));
+	pgd_t *pgd = (pgd_t *)kmalloc(pgd_size, GFP_KERNEL);
+
+	if (pgd)
+		memset(pgd, 0, pgd_size);
+
+	return pgd;
+}
+
+static inline void pgd_free(pgd_t *pgd)
+{
+	kfree(pgd);
+}
+
+static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
+					  unsigned long address)
+{
+	int count = 0;
+	pte_t *pte;
+
+	do {
+		pte = (pte_t *) __get_free_page(GFP_KERNEL | __GFP_REPEAT);
+		if (pte)
+			clear_page(pte);
+		else {
+			current->state = TASK_UNINTERRUPTIBLE;
+			schedule_timeout(HZ);
+		}
+	} while (!pte && (count++ < 10));
+
+	return pte;
+}
+
+static inline struct page *pte_alloc_one(struct mm_struct *mm,
+					 unsigned long address)
+{
+	int count = 0;
+	struct page *pte;
+
+	do {
+		pte = alloc_pages(GFP_KERNEL, 0);
+		if (pte)
+			clear_page(page_address(pte));
+		else {
+			current->state = TASK_UNINTERRUPTIBLE;
+			schedule_timeout(HZ);
+		}
+	} while (!pte && (count++ < 10));
+
+	return pte;
+}
+
+static inline void pte_free_kernel(pte_t *pte)
+{
+	free_page((unsigned long)pte);
+}
+
+static inline void pte_free(struct page *pte)
+{
+	__free_page(pte);
+}
+
+#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte))
+
+#define check_pgt_cache() do { } while(0)
+
+#endif /* __ASM_AVR32_PGALLOC_H */
diff --git a/include/asm-avr32/pgtable-2level.h b/include/asm-avr32/pgtable-2level.h
new file mode 100644
index 00000000000..425dd567b5b
--- /dev/null
+++ b/include/asm-avr32/pgtable-2level.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ASM_AVR32_PGTABLE_2LEVEL_H
+#define __ASM_AVR32_PGTABLE_2LEVEL_H
+
+#include <asm-generic/pgtable-nopmd.h>
+
+/*
+ * Traditional 2-level paging structure
+ */
+#define PGDIR_SHIFT	22
+#define PTRS_PER_PGD	1024
+
+#define PTRS_PER_PTE	1024
+
+#ifndef __ASSEMBLY__
+#define pte_ERROR(e) \
+	printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e))
+#define pgd_ERROR(e) \
+	printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
+
+/*
+ * Certain architectures need to do special things when PTEs
+ * within a page table are directly modified.  Thus, the following
+ * hook is made available.
+ */
+#define set_pte(pteptr, pteval) (*(pteptr) = pteval)
+#define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep, pteval)
+
+/*
+ * (pmds are folded into pgds so this doesn't get actually called,
+ * but the define is needed for a generic inline function.)
+ */
+#define set_pmd(pmdptr, pmdval) (*(pmdptr) = pmdval)
+
+#define pte_pfn(x)		((unsigned long)(((x).pte >> PAGE_SHIFT)))
+#define pfn_pte(pfn, prot)	__pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
+#define pfn_pmd(pfn, prot)	__pmd(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* __ASM_AVR32_PGTABLE_2LEVEL_H */
diff --git a/include/asm-avr32/pgtable.h b/include/asm-avr32/pgtable.h
new file mode 100644
index 00000000000..6b8ca9db2bd
--- /dev/null
+++ b/include/asm-avr32/pgtable.h
@@ -0,0 +1,408 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ASM_AVR32_PGTABLE_H
+#define __ASM_AVR32_PGTABLE_H
+
+#include <asm/addrspace.h>
+
+#ifndef __ASSEMBLY__
+#include <linux/sched.h>
+
+#endif /* !__ASSEMBLY__ */
+
+/*
+ * Use two-level page tables just as the i386 (without PAE)
+ */
+#include <asm/pgtable-2level.h>
+
+/*
+ * The following code might need some cleanup when the values are
+ * final...
+ */
+#define PMD_SIZE	(1UL << PMD_SHIFT)
+#define PMD_MASK	(~(PMD_SIZE-1))
+#define PGDIR_SIZE	(1UL << PGDIR_SHIFT)
+#define PGDIR_MASK	(~(PGDIR_SIZE-1))
+
+#define USER_PTRS_PER_PGD	(TASK_SIZE / PGDIR_SIZE)
+#define FIRST_USER_ADDRESS	0
+
+#define PTE_PHYS_MASK	0x1ffff000
+
+#ifndef __ASSEMBLY__
+extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
+extern void paging_init(void);
+
+/*
+ * ZERO_PAGE is a global shared page that is always zero: used for
+ * zero-mapped memory areas etc.
+ */
+extern struct page *empty_zero_page;
+#define ZERO_PAGE(vaddr) (empty_zero_page)
+
+/*
+ * Just any arbitrary offset to the start of the vmalloc VM area: the
+ * current 8 MiB value just means that there will be a 8 MiB "hole"
+ * after the uncached physical memory (P2 segment) until the vmalloc
+ * area starts. That means that any out-of-bounds memory accesses will
+ * hopefully be caught; we don't know if the end of the P1/P2 segments
+ * are actually used for anything, but it is anyway safer to let the
+ * MMU catch these kinds of errors than to rely on the memory bus.
+ *
+ * A "hole" of the same size is added to the end of the P3 segment as
+ * well. It might seem wasteful to use 16 MiB of virtual address space
+ * on this, but we do have 512 MiB of it...
+ *
+ * The vmalloc() routines leave a hole of 4 KiB between each vmalloced
+ * area for the same reason.
+ */
+#define VMALLOC_OFFSET	(8 * 1024 * 1024)
+#define VMALLOC_START	(P3SEG + VMALLOC_OFFSET)
+#define VMALLOC_END	(P4SEG - VMALLOC_OFFSET)
+#endif /* !__ASSEMBLY__ */
+
+/*
+ * Page flags. Some of these flags are not directly supported by
+ * hardware, so we have to emulate them.
+ */
+#define _TLBEHI_BIT_VALID	9
+#define _TLBEHI_VALID		(1 << _TLBEHI_BIT_VALID)
+
+#define _PAGE_BIT_WT		0  /* W-bit   : write-through */
+#define _PAGE_BIT_DIRTY		1  /* D-bit   : page changed */
+#define _PAGE_BIT_SZ0		2  /* SZ0-bit : Size of page */
+#define _PAGE_BIT_SZ1		3  /* SZ1-bit : Size of page */
+#define _PAGE_BIT_EXECUTE	4  /* X-bit   : execute access allowed */
+#define _PAGE_BIT_RW		5  /* AP0-bit : write access allowed */
+#define _PAGE_BIT_USER		6  /* AP1-bit : user space access allowed */
+#define _PAGE_BIT_BUFFER	7  /* B-bit   : bufferable */
+#define _PAGE_BIT_GLOBAL	8  /* G-bit   : global (ignore ASID) */
+#define _PAGE_BIT_CACHABLE	9  /* C-bit   : cachable */
+
+/* If we drop support for 1K pages, we get two extra bits */
+#define _PAGE_BIT_PRESENT	10
+#define _PAGE_BIT_ACCESSED	11 /* software: page was accessed */
+
+/* The following flags are only valid when !PRESENT */
+#define _PAGE_BIT_FILE		0 /* software: pagecache or swap? */
+
+#define _PAGE_WT		(1 << _PAGE_BIT_WT)
+#define _PAGE_DIRTY		(1 << _PAGE_BIT_DIRTY)
+#define _PAGE_EXECUTE		(1 << _PAGE_BIT_EXECUTE)
+#define _PAGE_RW		(1 << _PAGE_BIT_RW)
+#define _PAGE_USER		(1 << _PAGE_BIT_USER)
+#define _PAGE_BUFFER		(1 << _PAGE_BIT_BUFFER)
+#define _PAGE_GLOBAL		(1 << _PAGE_BIT_GLOBAL)
+#define _PAGE_CACHABLE		(1 << _PAGE_BIT_CACHABLE)
+
+/* Software flags */
+#define _PAGE_ACCESSED		(1 << _PAGE_BIT_ACCESSED)
+#define _PAGE_PRESENT		(1 << _PAGE_BIT_PRESENT)
+#define _PAGE_FILE		(1 << _PAGE_BIT_FILE)
+
+/*
+ * Page types, i.e. sizes. _PAGE_TYPE_NONE corresponds to what is
+ * usually called _PAGE_PROTNONE on other architectures.
+ *
+ * XXX: Find out if _PAGE_PROTNONE is equivalent with !_PAGE_USER. If
+ * so, we can encode all possible page sizes (although we can't really
+ * support 1K pages anyway due to the _PAGE_PRESENT and _PAGE_ACCESSED
+ * bits)
+ *
+ */
+#define _PAGE_TYPE_MASK		((1 << _PAGE_BIT_SZ0) | (1 << _PAGE_BIT_SZ1))
+#define _PAGE_TYPE_NONE		(0 << _PAGE_BIT_SZ0)
+#define _PAGE_TYPE_SMALL	(1 << _PAGE_BIT_SZ0)
+#define _PAGE_TYPE_MEDIUM	(2 << _PAGE_BIT_SZ0)
+#define _PAGE_TYPE_LARGE	(3 << _PAGE_BIT_SZ0)
+
+/*
+ * Mask which drop software flags. We currently can't handle more than
+ * 512 MiB of physical memory, so we can use bits 29-31 for other
+ * stuff.  With a fixed 4K page size, we can use bits 10-11 as well as
+ * bits 2-3 (SZ)
+ */
+#define _PAGE_FLAGS_HARDWARE_MASK	0xfffff3ff
+
+#define _PAGE_FLAGS_CACHE_MASK	(_PAGE_CACHABLE | _PAGE_BUFFER | _PAGE_WT)
+
+/* TODO: Check for saneness */
+/* User-mode page table flags (to be set in a pgd or pmd entry) */
+#define _PAGE_TABLE		(_PAGE_PRESENT | _PAGE_TYPE_SMALL | _PAGE_RW \
+				 | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
+/* Kernel-mode page table flags */
+#define _KERNPG_TABLE		(_PAGE_PRESENT | _PAGE_TYPE_SMALL | _PAGE_RW \
+				 | _PAGE_ACCESSED | _PAGE_DIRTY)
+/* Flags that may be modified by software */
+#define _PAGE_CHG_MASK		(PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY \
+				 | _PAGE_FLAGS_CACHE_MASK)
+
+#define _PAGE_FLAGS_READ	(_PAGE_CACHABLE	| _PAGE_BUFFER)
+#define _PAGE_FLAGS_WRITE	(_PAGE_FLAGS_READ | _PAGE_RW | _PAGE_DIRTY)
+
+#define _PAGE_NORMAL(x)	__pgprot((x) | _PAGE_PRESENT | _PAGE_TYPE_SMALL	\
+				 | _PAGE_ACCESSED)
+
+#define PAGE_NONE	(_PAGE_ACCESSED | _PAGE_TYPE_NONE)
+#define PAGE_READ	(_PAGE_FLAGS_READ | _PAGE_USER)
+#define PAGE_EXEC	(_PAGE_FLAGS_READ | _PAGE_EXECUTE | _PAGE_USER)
+#define PAGE_WRITE	(_PAGE_FLAGS_WRITE | _PAGE_USER)
+#define PAGE_KERNEL	_PAGE_NORMAL(_PAGE_FLAGS_WRITE | _PAGE_EXECUTE | _PAGE_GLOBAL)
+#define PAGE_KERNEL_RO	_PAGE_NORMAL(_PAGE_FLAGS_READ | _PAGE_EXECUTE | _PAGE_GLOBAL)
+
+#define _PAGE_P(x)	_PAGE_NORMAL((x) & ~(_PAGE_RW | _PAGE_DIRTY))
+#define _PAGE_S(x)	_PAGE_NORMAL(x)
+
+#define PAGE_COPY	_PAGE_P(PAGE_WRITE | PAGE_READ)
+
+#ifndef __ASSEMBLY__
+/*
+ * The hardware supports flags for write- and execute access. Read is
+ * always allowed if the page is loaded into the TLB, so the "-w-",
+ * "--x" and "-wx" mappings are implemented as "rw-", "r-x" and "rwx",
+ * respectively.
+ *
+ * The "---" case is handled by software; the page will simply not be
+ * loaded into the TLB if the page type is _PAGE_TYPE_NONE.
+ */
+
+#define __P000	__pgprot(PAGE_NONE)
+#define __P001	_PAGE_P(PAGE_READ)
+#define __P010	_PAGE_P(PAGE_WRITE)
+#define __P011	_PAGE_P(PAGE_WRITE | PAGE_READ)
+#define __P100	_PAGE_P(PAGE_EXEC)
+#define __P101	_PAGE_P(PAGE_EXEC | PAGE_READ)
+#define __P110	_PAGE_P(PAGE_EXEC | PAGE_WRITE)
+#define __P111	_PAGE_P(PAGE_EXEC | PAGE_WRITE | PAGE_READ)
+
+#define __S000	__pgprot(PAGE_NONE)
+#define __S001	_PAGE_S(PAGE_READ)
+#define __S010	_PAGE_S(PAGE_WRITE)
+#define __S011	_PAGE_S(PAGE_WRITE | PAGE_READ)
+#define __S100	_PAGE_S(PAGE_EXEC)
+#define __S101	_PAGE_S(PAGE_EXEC | PAGE_READ)
+#define __S110	_PAGE_S(PAGE_EXEC | PAGE_WRITE)
+#define __S111	_PAGE_S(PAGE_EXEC | PAGE_WRITE | PAGE_READ)
+
+#define pte_none(x)	(!pte_val(x))
+#define pte_present(x)	(pte_val(x) & _PAGE_PRESENT)
+
+#define pte_clear(mm,addr,xp)					\
+	do {							\
+		set_pte_at(mm, addr, xp, __pte(0));		\
+	} while (0)
+
+/*
+ * The following only work if pte_present() is true.
+ * Undefined behaviour if not..
+ */
+static inline int pte_read(pte_t pte)
+{
+	return pte_val(pte) & _PAGE_USER;
+}
+static inline int pte_write(pte_t pte)
+{
+	return pte_val(pte) & _PAGE_RW;
+}
+static inline int pte_exec(pte_t pte)
+{
+	return pte_val(pte) & _PAGE_EXECUTE;
+}
+static inline int pte_dirty(pte_t pte)
+{
+	return pte_val(pte) & _PAGE_DIRTY;
+}
+static inline int pte_young(pte_t pte)
+{
+	return pte_val(pte) & _PAGE_ACCESSED;
+}
+
+/*
+ * The following only work if pte_present() is not true.
+ */
+static inline int pte_file(pte_t pte)
+{
+	return pte_val(pte) & _PAGE_FILE;
+}
+
+/* Mutator functions for PTE bits */
+static inline pte_t pte_rdprotect(pte_t pte)
+{
+	set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_USER));
+	return pte;
+}
+static inline pte_t pte_wrprotect(pte_t pte)
+{
+	set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_RW));
+	return pte;
+}
+static inline pte_t pte_exprotect(pte_t pte)
+{
+	set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_EXECUTE));
+	return pte;
+}
+static inline pte_t pte_mkclean(pte_t pte)
+{
+	set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_DIRTY));
+	return pte;
+}
+static inline pte_t pte_mkold(pte_t pte)
+{
+	set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_ACCESSED));
+	return pte;
+}
+static inline pte_t pte_mkread(pte_t pte)
+{
+	set_pte(&pte, __pte(pte_val(pte) | _PAGE_USER));
+	return pte;
+}
+static inline pte_t pte_mkwrite(pte_t pte)
+{
+	set_pte(&pte, __pte(pte_val(pte) | _PAGE_RW));
+	return pte;
+}
+static inline pte_t pte_mkexec(pte_t pte)
+{
+	set_pte(&pte, __pte(pte_val(pte) | _PAGE_EXECUTE));
+	return pte;
+}
+static inline pte_t pte_mkdirty(pte_t pte)
+{
+	set_pte(&pte, __pte(pte_val(pte) | _PAGE_DIRTY));
+	return pte;
+}
+static inline pte_t pte_mkyoung(pte_t pte)
+{
+	set_pte(&pte, __pte(pte_val(pte) | _PAGE_ACCESSED));
+	return pte;
+}
+
+#define pmd_none(x)	(!pmd_val(x))
+#define pmd_present(x)	(pmd_val(x) & _PAGE_PRESENT)
+#define pmd_clear(xp)	do { set_pmd(xp, __pmd(0)); } while (0)
+#define	pmd_bad(x)	((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER))	\
+			 != _KERNPG_TABLE)
+
+/*
+ * Permanent address of a page. We don't support highmem, so this is
+ * trivial.
+ */
+#define pages_to_mb(x)	((x) >> (20-PAGE_SHIFT))
+#define pte_page(x) 	phys_to_page(pte_val(x) & PTE_PHYS_MASK)
+
+/*
+ * Mark the prot value as uncacheable and unbufferable
+ */
+#define pgprot_noncached(prot)						\
+	__pgprot(pgprot_val(prot) & ~(_PAGE_BUFFER | _PAGE_CACHABLE))
+
+/*
+ * Mark the prot value as uncacheable but bufferable
+ */
+#define pgprot_writecombine(prot)					\
+	__pgprot((pgprot_val(prot) & ~_PAGE_CACHABLE) | _PAGE_BUFFER)
+
+/*
+ * Conversion functions: convert a page and protection to a page entry,
+ * and a page entry and page directory to the page they refer to.
+ *
+ * extern pte_t mk_pte(struct page *page, pgprot_t pgprot)
+ */
+#define mk_pte(page, pgprot)	pfn_pte(page_to_pfn(page), (pgprot))
+
+static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
+{
+	set_pte(&pte, __pte((pte_val(pte) & _PAGE_CHG_MASK)
+			    | pgprot_val(newprot)));
+	return pte;
+}
+
+#define page_pte(page)	page_pte_prot(page, __pgprot(0))
+
+#define pmd_page_vaddr(pmd)					\
+	((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
+
+#define pmd_page(pmd)	(phys_to_page(pmd_val(pmd)))
+
+/* to find an entry in a page-table-directory. */
+#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
+#define pgd_offset(mm, address) ((mm)->pgd+pgd_index(address))
+#define pgd_offset_current(address)				\
+	((pgd_t *)__mfsr(SYSREG_PTBR) + pgd_index(address))
+
+/* to find an entry in a kernel page-table-directory */
+#define pgd_offset_k(address) pgd_offset(&init_mm, address)
+
+/* Find an entry in the third-level page table.. */
+#define pte_index(address)				\
+	((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
+#define pte_offset(dir, address)					\
+	((pte_t *) pmd_page_vaddr(*(dir)) + pte_index(address))
+#define pte_offset_kernel(dir, address)					\
+	((pte_t *) pmd_page_vaddr(*(dir)) + pte_index(address))
+#define pte_offset_map(dir, address) pte_offset_kernel(dir, address)
+#define pte_offset_map_nested(dir, address) pte_offset_kernel(dir, address)
+#define pte_unmap(pte)		do { } while (0)
+#define pte_unmap_nested(pte)	do { } while (0)
+
+struct vm_area_struct;
+extern void update_mmu_cache(struct vm_area_struct * vma,
+			     unsigned long address, pte_t pte);
+
+/*
+ * Encode and decode a swap entry
+ *
+ * Constraints:
+ *   _PAGE_FILE at bit 0
+ *   _PAGE_TYPE_* at bits 2-3 (for emulating _PAGE_PROTNONE)
+ *   _PAGE_PRESENT at bit 10
+ *
+ * We encode the type into bits 4-9 and offset into bits 11-31. This
+ * gives us a 21 bits offset, or 2**21 * 4K = 8G usable swap space per
+ * device, and 64 possible types.
+ *
+ * NOTE: We should set ZEROs at the position of _PAGE_PRESENT
+ *       and _PAGE_PROTNONE bits
+ */
+#define __swp_type(x)		(((x).val >> 4) & 0x3f)
+#define __swp_offset(x)		((x).val >> 11)
+#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 4) | ((offset) << 11) })
+#define __pte_to_swp_entry(pte)	((swp_entry_t) { pte_val(pte) })
+#define __swp_entry_to_pte(x)	((pte_t) { (x).val })
+
+/*
+ * Encode and decode a nonlinear file mapping entry. We have to
+ * preserve _PAGE_FILE and _PAGE_PRESENT here. _PAGE_TYPE_* isn't
+ * necessary, since _PAGE_FILE implies !_PAGE_PROTNONE (?)
+ */
+#define PTE_FILE_MAX_BITS	30
+#define pte_to_pgoff(pte)	(((pte_val(pte) >> 1) & 0x1ff)		\
+				 | ((pte_val(pte) >> 11) << 9))
+#define pgoff_to_pte(off)	((pte_t) { ((((off) & 0x1ff) << 1)	\
+					    | (((off) >> 9) << 11)	\
+					    | _PAGE_FILE) })
+
+typedef pte_t *pte_addr_t;
+
+#define kern_addr_valid(addr)	(1)
+
+#define io_remap_pfn_range(vma, vaddr, pfn, size, prot)	\
+	remap_pfn_range(vma, vaddr, pfn, size, prot)
+
+#define MK_IOSPACE_PFN(space, pfn)	(pfn)
+#define GET_IOSPACE(pfn)		0
+#define GET_PFN(pfn)			(pfn)
+
+/* No page table caches to initialize (?) */
+#define pgtable_cache_init()	do { } while(0)
+
+#include <asm-generic/pgtable.h>
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* __ASM_AVR32_PGTABLE_H */
diff --git a/include/asm-avr32/poll.h b/include/asm-avr32/poll.h
new file mode 100644
index 00000000000..736e29755df
--- /dev/null
+++ b/include/asm-avr32/poll.h
@@ -0,0 +1,27 @@
+#ifndef __ASM_AVR32_POLL_H
+#define __ASM_AVR32_POLL_H
+
+/* These are specified by iBCS2 */
+#define POLLIN		0x0001
+#define POLLPRI		0x0002
+#define POLLOUT		0x0004
+#define POLLERR		0x0008
+#define POLLHUP		0x0010
+#define POLLNVAL	0x0020
+
+/* The rest seem to be more-or-less nonstandard. Check them! */
+#define POLLRDNORM	0x0040
+#define POLLRDBAND	0x0080
+#define POLLWRNORM	0x0100
+#define POLLWRBAND	0x0200
+#define POLLMSG		0x0400
+#define POLLREMOVE	0x1000
+#define POLLRDHUP	0x2000
+
+struct pollfd {
+	int fd;
+	short events;
+	short revents;
+};
+
+#endif /* __ASM_AVR32_POLL_H */
diff --git a/include/asm-avr32/posix_types.h b/include/asm-avr32/posix_types.h
new file mode 100644
index 00000000000..2831b039b34
--- /dev/null
+++ b/include/asm-avr32/posix_types.h
@@ -0,0 +1,129 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ASM_AVR32_POSIX_TYPES_H
+#define __ASM_AVR32_POSIX_TYPES_H
+
+/*
+ * This file is generally used by user-level software, so you need to
+ * be a little careful about namespace pollution etc.  Also, we cannot
+ * assume GCC is being used.
+ */
+
+typedef unsigned long   __kernel_ino_t;
+typedef unsigned short  __kernel_mode_t;
+typedef unsigned short  __kernel_nlink_t;
+typedef long            __kernel_off_t;
+typedef int             __kernel_pid_t;
+typedef unsigned short  __kernel_ipc_pid_t;
+typedef unsigned int	__kernel_uid_t;
+typedef unsigned int	__kernel_gid_t;
+typedef unsigned long	__kernel_size_t;
+typedef int             __kernel_ssize_t;
+typedef int             __kernel_ptrdiff_t;
+typedef long            __kernel_time_t;
+typedef long            __kernel_suseconds_t;
+typedef long            __kernel_clock_t;
+typedef int             __kernel_timer_t;
+typedef int             __kernel_clockid_t;
+typedef int             __kernel_daddr_t;
+typedef char *          __kernel_caddr_t;
+typedef unsigned short  __kernel_uid16_t;
+typedef unsigned short  __kernel_gid16_t;
+typedef unsigned int    __kernel_uid32_t;
+typedef unsigned int    __kernel_gid32_t;
+
+typedef unsigned short  __kernel_old_uid_t;
+typedef unsigned short  __kernel_old_gid_t;
+typedef unsigned short  __kernel_old_dev_t;
+
+#ifdef __GNUC__
+typedef long long       __kernel_loff_t;
+#endif
+
+typedef struct {
+#if defined(__KERNEL__) || defined(__USE_ALL)
+    int     val[2];
+#else /* !defined(__KERNEL__) && !defined(__USE_ALL) */
+    int     __val[2];
+#endif /* !defined(__KERNEL__) && !defined(__USE_ALL) */
+} __kernel_fsid_t;
+
+#if defined(__KERNEL__)
+
+#undef  __FD_SET
+static __inline__ void __FD_SET(unsigned long __fd, __kernel_fd_set *__fdsetp)
+{
+    unsigned long __tmp = __fd / __NFDBITS;
+    unsigned long __rem = __fd % __NFDBITS;
+    __fdsetp->fds_bits[__tmp] |= (1UL<<__rem);
+}
+
+#undef  __FD_CLR
+static __inline__ void __FD_CLR(unsigned long __fd, __kernel_fd_set *__fdsetp)
+{
+    unsigned long __tmp = __fd / __NFDBITS;
+    unsigned long __rem = __fd % __NFDBITS;
+    __fdsetp->fds_bits[__tmp] &= ~(1UL<<__rem);
+}
+
+
+#undef  __FD_ISSET
+static __inline__ int __FD_ISSET(unsigned long __fd, const __kernel_fd_set *__p)
+{
+    unsigned long __tmp = __fd / __NFDBITS;
+    unsigned long __rem = __fd % __NFDBITS;
+    return (__p->fds_bits[__tmp] & (1UL<<__rem)) != 0;
+}
+
+/*
+ * This will unroll the loop for the normal constant case (8 ints,
+ * for a 256-bit fd_set)
+ */
+#undef  __FD_ZERO
+static __inline__ void __FD_ZERO(__kernel_fd_set *__p)
+{
+    unsigned long *__tmp = __p->fds_bits;
+    int __i;
+
+    if (__builtin_constant_p(__FDSET_LONGS)) {
+        switch (__FDSET_LONGS) {
+            case 16:
+                __tmp[ 0] = 0; __tmp[ 1] = 0;
+                __tmp[ 2] = 0; __tmp[ 3] = 0;
+                __tmp[ 4] = 0; __tmp[ 5] = 0;
+                __tmp[ 6] = 0; __tmp[ 7] = 0;
+                __tmp[ 8] = 0; __tmp[ 9] = 0;
+                __tmp[10] = 0; __tmp[11] = 0;
+                __tmp[12] = 0; __tmp[13] = 0;
+                __tmp[14] = 0; __tmp[15] = 0;
+                return;
+
+            case 8:
+                __tmp[ 0] = 0; __tmp[ 1] = 0;
+                __tmp[ 2] = 0; __tmp[ 3] = 0;
+                __tmp[ 4] = 0; __tmp[ 5] = 0;
+                __tmp[ 6] = 0; __tmp[ 7] = 0;
+                return;
+
+            case 4:
+                __tmp[ 0] = 0; __tmp[ 1] = 0;
+                __tmp[ 2] = 0; __tmp[ 3] = 0;
+                return;
+        }
+    }
+    __i = __FDSET_LONGS;
+    while (__i) {
+        __i--;
+        *__tmp = 0;
+        __tmp++;
+    }
+}
+
+#endif /* defined(__KERNEL__) */
+
+#endif /* __ASM_AVR32_POSIX_TYPES_H */
diff --git a/include/asm-avr32/processor.h b/include/asm-avr32/processor.h
new file mode 100644
index 00000000000..f6913778a45
--- /dev/null
+++ b/include/asm-avr32/processor.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ASM_AVR32_PROCESSOR_H
+#define __ASM_AVR32_PROCESSOR_H
+
+#include <asm/page.h>
+#include <asm/cache.h>
+
+#define TASK_SIZE	0x80000000
+
+#ifndef __ASSEMBLY__
+
+static inline void *current_text_addr(void)
+{
+	register void *pc asm("pc");
+	return pc;
+}
+
+enum arch_type {
+	ARCH_AVR32A,
+	ARCH_AVR32B,
+	ARCH_MAX
+};
+
+enum cpu_type {
+	CPU_MORGAN,
+	CPU_AT32AP,
+	CPU_MAX
+};
+
+enum tlb_config {
+	TLB_NONE,
+	TLB_SPLIT,
+	TLB_UNIFIED,
+	TLB_INVALID
+};
+
+struct avr32_cpuinfo {
+	struct clk *clk;
+	unsigned long loops_per_jiffy;
+	enum arch_type arch_type;
+	enum cpu_type cpu_type;
+	unsigned short arch_revision;
+	unsigned short cpu_revision;
+	enum tlb_config tlb_config;
+
+	struct cache_info icache;
+	struct cache_info dcache;
+};
+
+extern struct avr32_cpuinfo boot_cpu_data;
+
+#ifdef CONFIG_SMP
+extern struct avr32_cpuinfo cpu_data[];
+#define current_cpu_data cpu_data[smp_processor_id()]
+#else
+#define cpu_data (&boot_cpu_data)
+#define current_cpu_data boot_cpu_data
+#endif
+
+/* This decides where the kernel will search for a free chunk of vm
+ * space during mmap's
+ */
+#define TASK_UNMAPPED_BASE	(PAGE_ALIGN(TASK_SIZE / 3))
+
+#define cpu_relax()		barrier()
+#define cpu_sync_pipeline()	asm volatile("sub pc, -2" : : : "memory")
+
+struct cpu_context {
+	unsigned long sr;
+	unsigned long pc;
+	unsigned long ksp;	/* Kernel stack pointer */
+	unsigned long r7;
+	unsigned long r6;
+	unsigned long r5;
+	unsigned long r4;
+	unsigned long r3;
+	unsigned long r2;
+	unsigned long r1;
+	unsigned long r0;
+};
+
+/* This struct contains the CPU context as stored by switch_to() */
+struct thread_struct {
+	struct cpu_context cpu_context;
+	unsigned long single_step_addr;
+	u16 single_step_insn;
+};
+
+#define INIT_THREAD {						\
+	.cpu_context = {					\
+		.ksp = sizeof(init_stack) + (long)&init_stack,	\
+	},							\
+}
+
+/*
+ * Do necessary setup to start up a newly executed thread.
+ */
+#define start_thread(regs, new_pc, new_sp)	 \
+	do {					 \
+		set_fs(USER_DS);		 \
+		memset(regs, 0, sizeof(*regs));	 \
+		regs->sr = MODE_USER;		 \
+		regs->pc = new_pc & ~1;		 \
+		regs->sp = new_sp;		 \
+	} while(0)
+
+struct task_struct;
+
+/* Free all resources held by a thread */
+extern void release_thread(struct task_struct *);
+
+/* Create a kernel thread without removing it from tasklists */
+extern int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
+
+/* Prepare to copy thread state - unlazy all lazy status */
+#define prepare_to_copy(tsk) do { } while(0)
+
+/* Return saved PC of a blocked thread */
+#define thread_saved_pc(tsk)    ((tsk)->thread.cpu_context.pc)
+
+struct pt_regs;
+void show_trace(struct task_struct *task, unsigned long *stack,
+		struct pt_regs *regs);
+
+extern unsigned long get_wchan(struct task_struct *p);
+
+#define KSTK_EIP(tsk)	((tsk)->thread.cpu_context.pc)
+#define KSTK_ESP(tsk)	((tsk)->thread.cpu_context.ksp)
+
+#define ARCH_HAS_PREFETCH
+
+static inline void prefetch(const void *x)
+{
+	const char *c = x;
+	asm volatile("pref %0" : : "r"(c));
+}
+#define PREFETCH_STRIDE	L1_CACHE_BYTES
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* __ASM_AVR32_PROCESSOR_H */
diff --git a/include/asm-avr32/ptrace.h b/include/asm-avr32/ptrace.h
new file mode 100644
index 00000000000..60f0f19a81f
--- /dev/null
+++ b/include/asm-avr32/ptrace.h
@@ -0,0 +1,154 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ASM_AVR32_PTRACE_H
+#define __ASM_AVR32_PTRACE_H
+
+#define PTRACE_GETREGS		12
+#define PTRACE_SETREGS		13
+
+/*
+ * Status Register bits
+ */
+#define SR_H		0x40000000
+#define SR_R		0x20000000
+#define SR_J		0x10000000
+#define SR_DM		0x08000000
+#define SR_D		0x04000000
+#define MODE_NMI	0x01c00000
+#define MODE_EXCEPTION	0x01800000
+#define MODE_INT3	0x01400000
+#define MODE_INT2	0x01000000
+#define MODE_INT1	0x00c00000
+#define MODE_INT0	0x00800000
+#define MODE_SUPERVISOR	0x00400000
+#define MODE_USER	0x00000000
+#define MODE_MASK	0x01c00000
+#define SR_EM		0x00200000
+#define SR_I3M		0x00100000
+#define SR_I2M		0x00080000
+#define SR_I1M		0x00040000
+#define SR_I0M		0x00020000
+#define SR_GM		0x00010000
+
+#define SR_H_BIT	30
+#define SR_R_BIT	29
+#define SR_J_BIT	28
+#define SR_DM_BIT	27
+#define SR_D_BIT	26
+#define MODE_SHIFT	22
+#define SR_EM_BIT	21
+#define SR_I3M_BIT	20
+#define SR_I2M_BIT	19
+#define SR_I1M_BIT	18
+#define SR_I0M_BIT	17
+#define SR_GM_BIT	16
+
+/* The user-visible part */
+#define SR_L		0x00000020
+#define SR_Q		0x00000010
+#define SR_V		0x00000008
+#define SR_N		0x00000004
+#define SR_Z		0x00000002
+#define SR_C		0x00000001
+
+#define SR_L_BIT	5
+#define SR_Q_BIT	4
+#define SR_V_BIT	3
+#define SR_N_BIT	2
+#define SR_Z_BIT	1
+#define SR_C_BIT	0
+
+/*
+ * The order is defined by the stmts instruction. r0 is stored first,
+ * so it gets the highest address.
+ *
+ * Registers 0-12 are general-purpose registers (r12 is normally used for
+ * the function return value).
+ * Register 13 is the stack pointer
+ * Register 14 is the link register
+ * Register 15 is the program counter (retrieved from the RAR sysreg)
+ */
+#define FRAME_SIZE_FULL 72
+#define REG_R12_ORIG	68
+#define REG_R0		64
+#define REG_R1		60
+#define REG_R2		56
+#define REG_R3		52
+#define REG_R4		48
+#define REG_R5		44
+#define REG_R6		40
+#define REG_R7		36
+#define REG_R8		32
+#define REG_R9		28
+#define REG_R10		24
+#define REG_R11		20
+#define REG_R12		16
+#define REG_SP		12
+#define REG_LR		 8
+
+#define FRAME_SIZE_MIN	 8
+#define REG_PC		 4
+#define REG_SR		 0
+
+#ifndef __ASSEMBLY__
+struct pt_regs {
+	/* These are always saved */
+	unsigned long sr;
+	unsigned long pc;
+
+	/* These are sometimes saved */
+	unsigned long lr;
+	unsigned long sp;
+	unsigned long r12;
+	unsigned long r11;
+	unsigned long r10;
+	unsigned long r9;
+	unsigned long r8;
+	unsigned long r7;
+	unsigned long r6;
+	unsigned long r5;
+	unsigned long r4;
+	unsigned long r3;
+	unsigned long r2;
+	unsigned long r1;
+	unsigned long r0;
+
+	/* Only saved on system call */
+	unsigned long r12_orig;
+};
+
+#ifdef __KERNEL__
+# define user_mode(regs) (((regs)->sr & MODE_MASK) == MODE_USER)
+extern void show_regs (struct pt_regs *);
+
+static __inline__ int valid_user_regs(struct pt_regs *regs)
+{
+	/*
+	 * Some of the Java bits might be acceptable if/when we
+	 * implement some support for that stuff...
+	 */
+	if ((regs->sr & 0xffff0000) == 0)
+		return 1;
+
+	/*
+	 * Force status register flags to be sane and report this
+	 * illegal behaviour...
+	 */
+	regs->sr &= 0x0000ffff;
+	return 0;
+}
+
+#define instruction_pointer(regs) ((regs)->pc)
+
+#define profile_pc(regs) instruction_pointer(regs)
+
+#endif /* __KERNEL__ */
+
+#endif /* ! __ASSEMBLY__ */
+
+#endif /* __ASM_AVR32_PTRACE_H */
diff --git a/include/asm-avr32/resource.h b/include/asm-avr32/resource.h
new file mode 100644
index 00000000000..c6dd101472b
--- /dev/null
+++ b/include/asm-avr32/resource.h
@@ -0,0 +1,6 @@
+#ifndef __ASM_AVR32_RESOURCE_H
+#define __ASM_AVR32_RESOURCE_H
+
+#include <asm-generic/resource.h>
+
+#endif /* __ASM_AVR32_RESOURCE_H */
diff --git a/include/asm-avr32/scatterlist.h b/include/asm-avr32/scatterlist.h
new file mode 100644
index 00000000000..bfe7d753423
--- /dev/null
+++ b/include/asm-avr32/scatterlist.h
@@ -0,0 +1,21 @@
+#ifndef __ASM_AVR32_SCATTERLIST_H
+#define __ASM_AVR32_SCATTERLIST_H
+
+struct scatterlist {
+    struct page		*page;
+    unsigned int	offset;
+    dma_addr_t		dma_address;
+    unsigned int	length;
+};
+
+/* These macros should be used after a pci_map_sg call has been done
+ * to get bus addresses of each of the SG entries and their lengths.
+ * You should only work with the number of sg entries pci_map_sg
+ * returns.
+ */
+#define sg_dma_address(sg)	((sg)->dma_address)
+#define sg_dma_len(sg)		((sg)->length)
+
+#define ISA_DMA_THRESHOLD (0xffffffff)
+
+#endif /* __ASM_AVR32_SCATTERLIST_H */
diff --git a/include/asm-avr32/sections.h b/include/asm-avr32/sections.h
new file mode 100644
index 00000000000..aa14252e418
--- /dev/null
+++ b/include/asm-avr32/sections.h
@@ -0,0 +1,6 @@
+#ifndef __ASM_AVR32_SECTIONS_H
+#define __ASM_AVR32_SECTIONS_H
+
+#include <asm-generic/sections.h>
+
+#endif /* __ASM_AVR32_SECTIONS_H */
diff --git a/include/asm-avr32/semaphore.h b/include/asm-avr32/semaphore.h
new file mode 100644
index 00000000000..ef99ddccc10
--- /dev/null
+++ b/include/asm-avr32/semaphore.h
@@ -0,0 +1,109 @@
+/*
+ * SMP- and interrupt-safe semaphores.
+ *
+ * Copyright (C) 2006 Atmel Corporation
+ *
+ * Based on include/asm-i386/semaphore.h
+ *   Copyright (C) 1996 Linus Torvalds
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ASM_AVR32_SEMAPHORE_H
+#define __ASM_AVR32_SEMAPHORE_H
+
+#include <linux/linkage.h>
+
+#include <asm/system.h>
+#include <asm/atomic.h>
+#include <linux/wait.h>
+#include <linux/rwsem.h>
+
+struct semaphore {
+	atomic_t count;
+	int sleepers;
+	wait_queue_head_t wait;
+};
+
+#define __SEMAPHORE_INITIALIZER(name, n)				\
+{									\
+	.count		= ATOMIC_INIT(n),				\
+	.wait		= __WAIT_QUEUE_HEAD_INITIALIZER((name).wait)	\
+}
+
+#define __DECLARE_SEMAPHORE_GENERIC(name,count) \
+	struct semaphore name = __SEMAPHORE_INITIALIZER(name,count)
+
+#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name,1)
+#define DECLARE_MUTEX_LOCKED(name) __DECLARE_SEMAPHORE_GENERIC(name,0)
+
+static inline void sema_init (struct semaphore *sem, int val)
+{
+	atomic_set(&sem->count, val);
+	sem->sleepers = 0;
+	init_waitqueue_head(&sem->wait);
+}
+
+static inline void init_MUTEX (struct semaphore *sem)
+{
+	sema_init(sem, 1);
+}
+
+static inline void init_MUTEX_LOCKED (struct semaphore *sem)
+{
+	sema_init(sem, 0);
+}
+
+void __down(struct semaphore * sem);
+int  __down_interruptible(struct semaphore * sem);
+void __up(struct semaphore * sem);
+
+/*
+ * This is ugly, but we want the default case to fall through.
+ * "__down_failed" is a special asm handler that calls the C
+ * routine that actually waits. See arch/i386/kernel/semaphore.c
+ */
+static inline void down(struct semaphore * sem)
+{
+	might_sleep();
+	if (unlikely(atomic_dec_return (&sem->count) < 0))
+		__down (sem);
+}
+
+/*
+ * Interruptible try to acquire a semaphore.  If we obtained
+ * it, return zero.  If we were interrupted, returns -EINTR
+ */
+static inline int down_interruptible(struct semaphore * sem)
+{
+	int ret = 0;
+
+	might_sleep();
+	if (unlikely(atomic_dec_return (&sem->count) < 0))
+		ret = __down_interruptible (sem);
+	return ret;
+}
+
+/*
+ * Non-blockingly attempt to down() a semaphore.
+ * Returns zero if we acquired it
+ */
+static inline int down_trylock(struct semaphore * sem)
+{
+	return atomic_dec_if_positive(&sem->count) < 0;
+}
+
+/*
+ * Note! This is subtle. We jump to wake people up only if
+ * the semaphore was negative (== somebody was waiting on it).
+ * The default case (no contention) will result in NO
+ * jumps for both down() and up().
+ */
+static inline void up(struct semaphore * sem)
+{
+	if (unlikely(atomic_inc_return (&sem->count) <= 0))
+		__up (sem);
+}
+
+#endif /*__ASM_AVR32_SEMAPHORE_H */
diff --git a/include/asm-avr32/sembuf.h b/include/asm-avr32/sembuf.h
new file mode 100644
index 00000000000..e472216e0c9
--- /dev/null
+++ b/include/asm-avr32/sembuf.h
@@ -0,0 +1,25 @@
+#ifndef __ASM_AVR32_SEMBUF_H
+#define __ASM_AVR32_SEMBUF_H
+
+/*
+* The semid64_ds structure for AVR32 architecture.
+ * Note extra padding because this structure is passed back and forth
+ * between kernel and user space.
+ *
+ * Pad space is left for:
+ * - 64-bit time_t to solve y2038 problem
+ * - 2 miscellaneous 32-bit values
+ */
+
+struct semid64_ds {
+        struct ipc64_perm sem_perm;             /* permissions .. see ipc.h */
+        __kernel_time_t sem_otime;              /* last semop time */
+        unsigned long   __unused1;
+        __kernel_time_t sem_ctime;              /* last change time */
+        unsigned long   __unused2;
+        unsigned long   sem_nsems;              /* no. of semaphores in array */
+        unsigned long   __unused3;
+        unsigned long   __unused4;
+};
+
+#endif /* __ASM_AVR32_SEMBUF_H */
diff --git a/include/asm-avr32/setup.h b/include/asm-avr32/setup.h
new file mode 100644
index 00000000000..10193da4113
--- /dev/null
+++ b/include/asm-avr32/setup.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * Based on linux/include/asm-arm/setup.h
+ *   Copyright (C) 1997-1999 Russel King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ASM_AVR32_SETUP_H__
+#define __ASM_AVR32_SETUP_H__
+
+#define COMMAND_LINE_SIZE 256
+
+/* Magic number indicating that a tag table is present */
+#define ATAG_MAGIC	0xa2a25441
+
+#ifndef __ASSEMBLY__
+
+/*
+ * Generic memory range, used by several tags.
+ *
+ *   addr is always physical.
+ *   size is measured in bytes.
+ *   next is for use by the OS, e.g. for grouping regions into
+ *        linked lists.
+ */
+struct tag_mem_range {
+	u32			addr;
+	u32			size;
+	struct tag_mem_range *	next;
+};
+
+/* The list ends with an ATAG_NONE node. */
+#define ATAG_NONE	0x00000000
+
+struct tag_header {
+	u32 size;
+	u32 tag;
+};
+
+/* The list must start with an ATAG_CORE node */
+#define ATAG_CORE	0x54410001
+
+struct tag_core {
+	u32 flags;
+	u32 pagesize;
+	u32 rootdev;
+};
+
+/* it is allowed to have multiple ATAG_MEM nodes */
+#define ATAG_MEM	0x54410002
+/* ATAG_MEM uses tag_mem_range */
+
+/* command line: \0 terminated string */
+#define ATAG_CMDLINE	0x54410003
+
+struct tag_cmdline {
+	char	cmdline[1];	/* this is the minimum size */
+};
+
+/* Ramdisk image (may be compressed) */
+#define ATAG_RDIMG	0x54410004
+/* ATAG_RDIMG uses tag_mem_range */
+
+/* Information about various clocks present in the system */
+#define ATAG_CLOCK	0x54410005
+
+struct tag_clock {
+	u32	clock_id;	/* Which clock are we talking about? */
+	u32	clock_flags;	/* Special features */
+	u64	clock_hz;	/* Clock speed in Hz */
+};
+
+/* The clock types we know about */
+#define CLOCK_BOOTCPU	0
+
+/* Memory reserved for the system (e.g. the bootloader) */
+#define ATAG_RSVD_MEM	0x54410006
+/* ATAG_RSVD_MEM uses tag_mem_range */
+
+/* Ethernet information */
+
+#define ATAG_ETHERNET	0x54410007
+
+struct tag_ethernet {
+	u8	mac_index;
+	u8	mii_phy_addr;
+	u8	hw_address[6];
+};
+
+#define ETH_INVALID_PHY	0xff
+
+struct tag {
+	struct tag_header hdr;
+	union {
+		struct tag_core core;
+		struct tag_mem_range mem_range;
+		struct tag_cmdline cmdline;
+		struct tag_clock clock;
+		struct tag_ethernet ethernet;
+	} u;
+};
+
+struct tagtable {
+	u32	tag;
+	int	(*parse)(struct tag *);
+};
+
+#define __tag __attribute_used__ __attribute__((__section__(".taglist")))
+#define __tagtable(tag, fn)						\
+	static struct tagtable __tagtable_##fn __tag = { tag, fn }
+
+#define tag_member_present(tag,member)					\
+	((unsigned long)(&((struct tag *)0L)->member + 1)		\
+	 <= (tag)->hdr.size * 4)
+
+#define tag_next(t)	((struct tag *)((u32 *)(t) + (t)->hdr.size))
+#define tag_size(type)	((sizeof(struct tag_header) + sizeof(struct type)) >> 2)
+
+#define for_each_tag(t,base)						\
+	for (t = base; t->hdr.size; t = tag_next(t))
+
+extern struct tag_mem_range *mem_phys;
+extern struct tag_mem_range *mem_reserved;
+extern struct tag_mem_range *mem_ramdisk;
+
+extern struct tag *bootloader_tags;
+
+extern void setup_bootmem(void);
+extern void setup_processor(void);
+extern void board_setup_fbmem(unsigned long fbmem_start,
+			      unsigned long fbmem_size);
+
+/* Chip-specific hook to enable the use of SDRAM */
+void chip_enable_sdram(void);
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* __ASM_AVR32_SETUP_H__ */
diff --git a/include/asm-avr32/shmbuf.h b/include/asm-avr32/shmbuf.h
new file mode 100644
index 00000000000..c62fba41739
--- /dev/null
+++ b/include/asm-avr32/shmbuf.h
@@ -0,0 +1,42 @@
+#ifndef __ASM_AVR32_SHMBUF_H
+#define __ASM_AVR32_SHMBUF_H
+
+/*
+ * The shmid64_ds structure for i386 architecture.
+ * Note extra padding because this structure is passed back and forth
+ * between kernel and user space.
+ *
+ * Pad space is left for:
+ * - 64-bit time_t to solve y2038 problem
+ * - 2 miscellaneous 32-bit values
+ */
+
+struct shmid64_ds {
+	struct ipc64_perm	shm_perm;	/* operation perms */
+	size_t			shm_segsz;	/* size of segment (bytes) */
+	__kernel_time_t		shm_atime;	/* last attach time */
+	unsigned long		__unused1;
+	__kernel_time_t		shm_dtime;	/* last detach time */
+	unsigned long		__unused2;
+	__kernel_time_t		shm_ctime;	/* last change time */
+	unsigned long		__unused3;
+	__kernel_pid_t		shm_cpid;	/* pid of creator */
+	__kernel_pid_t		shm_lpid;	/* pid of last operator */
+	unsigned long		shm_nattch;	/* no. of current attaches */
+	unsigned long		__unused4;
+	unsigned long		__unused5;
+};
+
+struct shminfo64 {
+	unsigned long	shmmax;
+	unsigned long	shmmin;
+	unsigned long	shmmni;
+	unsigned long	shmseg;
+	unsigned long	shmall;
+	unsigned long	__unused1;
+	unsigned long	__unused2;
+	unsigned long	__unused3;
+	unsigned long	__unused4;
+};
+
+#endif /* __ASM_AVR32_SHMBUF_H */
diff --git a/include/asm-avr32/shmparam.h b/include/asm-avr32/shmparam.h
new file mode 100644
index 00000000000..3681266c77f
--- /dev/null
+++ b/include/asm-avr32/shmparam.h
@@ -0,0 +1,6 @@
+#ifndef __ASM_AVR32_SHMPARAM_H
+#define __ASM_AVR32_SHMPARAM_H
+
+#define SHMLBA PAGE_SIZE	/* attach addr a multiple of this */
+
+#endif /* __ASM_AVR32_SHMPARAM_H */
diff --git a/include/asm-avr32/sigcontext.h b/include/asm-avr32/sigcontext.h
new file mode 100644
index 00000000000..e04062b5f39
--- /dev/null
+++ b/include/asm-avr32/sigcontext.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ASM_AVR32_SIGCONTEXT_H
+#define __ASM_AVR32_SIGCONTEXT_H
+
+struct sigcontext {
+	unsigned long	oldmask;
+
+	/* CPU registers */
+	unsigned long	sr;
+	unsigned long	pc;
+	unsigned long	lr;
+	unsigned long	sp;
+	unsigned long	r12;
+	unsigned long	r11;
+	unsigned long	r10;
+	unsigned long	r9;
+	unsigned long	r8;
+	unsigned long	r7;
+	unsigned long	r6;
+	unsigned long	r5;
+	unsigned long	r4;
+	unsigned long	r3;
+	unsigned long	r2;
+	unsigned long	r1;
+	unsigned long	r0;
+};
+
+#endif /* __ASM_AVR32_SIGCONTEXT_H */
diff --git a/include/asm-avr32/siginfo.h b/include/asm-avr32/siginfo.h
new file mode 100644
index 00000000000..5ee93f40a8a
--- /dev/null
+++ b/include/asm-avr32/siginfo.h
@@ -0,0 +1,6 @@
+#ifndef _AVR32_SIGINFO_H
+#define _AVR32_SIGINFO_H
+
+#include <asm-generic/siginfo.h>
+
+#endif
diff --git a/include/asm-avr32/signal.h b/include/asm-avr32/signal.h
new file mode 100644
index 00000000000..caffefeeba1
--- /dev/null
+++ b/include/asm-avr32/signal.h
@@ -0,0 +1,168 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ASM_AVR32_SIGNAL_H
+#define __ASM_AVR32_SIGNAL_H
+
+#include <linux/types.h>
+
+/* Avoid too many header ordering problems.  */
+struct siginfo;
+
+#ifdef __KERNEL__
+/* Most things should be clean enough to redefine this at will, if care
+   is taken to make libc match.  */
+
+#define _NSIG		64
+#define _NSIG_BPW	32
+#define _NSIG_WORDS	(_NSIG / _NSIG_BPW)
+
+typedef unsigned long old_sigset_t;		/* at least 32 bits */
+
+typedef struct {
+	unsigned long sig[_NSIG_WORDS];
+} sigset_t;
+
+#else
+/* Here we must cater to libcs that poke about in kernel headers.  */
+
+#define NSIG		32
+typedef unsigned long sigset_t;
+
+#endif /* __KERNEL__ */
+
+#define SIGHUP		 1
+#define SIGINT		 2
+#define SIGQUIT		 3
+#define SIGILL		 4
+#define SIGTRAP		 5
+#define SIGABRT		 6
+#define SIGIOT		 6
+#define SIGBUS		 7
+#define SIGFPE		 8
+#define SIGKILL		 9
+#define SIGUSR1		10
+#define SIGSEGV		11
+#define SIGUSR2		12
+#define SIGPIPE		13
+#define SIGALRM		14
+#define SIGTERM		15
+#define SIGSTKFLT	16
+#define SIGCHLD		17
+#define SIGCONT		18
+#define SIGSTOP		19
+#define SIGTSTP		20
+#define SIGTTIN		21
+#define SIGTTOU		22
+#define SIGURG		23
+#define SIGXCPU		24
+#define SIGXFSZ		25
+#define SIGVTALRM	26
+#define SIGPROF		27
+#define SIGWINCH	28
+#define SIGIO		29
+#define SIGPOLL		SIGIO
+/*
+#define SIGLOST		29
+*/
+#define SIGPWR		30
+#define SIGSYS		31
+#define	SIGUNUSED	31
+
+/* These should not be considered constants from userland.  */
+#define SIGRTMIN	32
+#define SIGRTMAX	(_NSIG-1)
+
+/*
+ * SA_FLAGS values:
+ *
+ * SA_NOCLDSTOP		flag to turn off SIGCHLD when children stop.
+ * SA_NOCLDWAIT		flag on SIGCHLD to inhibit zombies.
+ * SA_SIGINFO		deliver the signal with SIGINFO structs
+ * SA_ONSTACK		indicates that a registered stack_t will be used.
+ * SA_RESTART		flag to get restarting signals (which were the default long ago)
+ * SA_NODEFER		prevents the current signal from being masked in the handler.
+ * SA_RESETHAND		clears the handler when the signal is delivered.
+ *
+ * SA_ONESHOT and SA_NOMASK are the historical Linux names for the Single
+ * Unix names RESETHAND and NODEFER respectively.
+ */
+#define SA_NOCLDSTOP	0x00000001
+#define SA_NOCLDWAIT	0x00000002
+#define SA_SIGINFO	0x00000004
+#define SA_RESTORER	0x04000000
+#define SA_ONSTACK	0x08000000
+#define SA_RESTART	0x10000000
+#define SA_NODEFER	0x40000000
+#define SA_RESETHAND	0x80000000
+
+#define SA_NOMASK	SA_NODEFER
+#define SA_ONESHOT	SA_RESETHAND
+
+/*
+ * sigaltstack controls
+ */
+#define SS_ONSTACK	1
+#define SS_DISABLE	2
+
+#define MINSIGSTKSZ	2048
+#define SIGSTKSZ	8192
+
+#include <asm-generic/signal.h>
+
+#ifdef __KERNEL__
+struct old_sigaction {
+	__sighandler_t sa_handler;
+	old_sigset_t sa_mask;
+	unsigned long sa_flags;
+	__sigrestore_t sa_restorer;
+};
+
+struct sigaction {
+	__sighandler_t sa_handler;
+	unsigned long sa_flags;
+	__sigrestore_t sa_restorer;
+	sigset_t sa_mask;		/* mask last for extensibility */
+};
+
+struct k_sigaction {
+	struct sigaction sa;
+};
+#else
+/* Here we must cater to libcs that poke about in kernel headers.  */
+
+struct sigaction {
+	union {
+		__sighandler_t _sa_handler;
+		void (*_sa_sigaction)(int, struct siginfo *, void *);
+	} _u;
+	sigset_t sa_mask;
+	unsigned long sa_flags;
+	void (*sa_restorer)(void);
+};
+
+#define sa_handler	_u._sa_handler
+#define sa_sigaction	_u._sa_sigaction
+
+#endif /* __KERNEL__ */
+
+typedef struct sigaltstack {
+	void __user *ss_sp;
+	int ss_flags;
+	size_t ss_size;
+} stack_t;
+
+#ifdef __KERNEL__
+
+#include <asm/sigcontext.h>
+#undef __HAVE_ARCH_SIG_BITOPS
+
+#define ptrace_signal_deliver(regs, cookie) do { } while (0)
+
+#endif /* __KERNEL__ */
+
+#endif
diff --git a/include/asm-avr32/socket.h b/include/asm-avr32/socket.h
new file mode 100644
index 00000000000..543229de817
--- /dev/null
+++ b/include/asm-avr32/socket.h
@@ -0,0 +1,53 @@
+#ifndef __ASM_AVR32_SOCKET_H
+#define __ASM_AVR32_SOCKET_H
+
+#include <asm/sockios.h>
+
+/* For setsockopt(2) */
+#define SOL_SOCKET	1
+
+#define SO_DEBUG	1
+#define SO_REUSEADDR	2
+#define SO_TYPE		3
+#define SO_ERROR	4
+#define SO_DONTROUTE	5
+#define SO_BROADCAST	6
+#define SO_SNDBUF	7
+#define SO_RCVBUF	8
+#define SO_SNDBUFFORCE	32
+#define SO_RCVBUFFORCE	33
+#define SO_KEEPALIVE	9
+#define SO_OOBINLINE	10
+#define SO_NO_CHECK	11
+#define SO_PRIORITY	12
+#define SO_LINGER	13
+#define SO_BSDCOMPAT	14
+/* To add :#define SO_REUSEPORT 15 */
+#define SO_PASSCRED	16
+#define SO_PEERCRED	17
+#define SO_RCVLOWAT	18
+#define SO_SNDLOWAT	19
+#define SO_RCVTIMEO	20
+#define SO_SNDTIMEO	21
+
+/* Security levels - as per NRL IPv6 - don't actually do anything */
+#define SO_SECURITY_AUTHENTICATION		22
+#define SO_SECURITY_ENCRYPTION_TRANSPORT	23
+#define SO_SECURITY_ENCRYPTION_NETWORK		24
+
+#define SO_BINDTODEVICE	25
+
+/* Socket filtering */
+#define SO_ATTACH_FILTER        26
+#define SO_DETACH_FILTER        27
+
+#define SO_PEERNAME		28
+#define SO_TIMESTAMP		29
+#define SCM_TIMESTAMP		SO_TIMESTAMP
+
+#define SO_ACCEPTCONN		30
+
+#define SO_PEERSEC		31
+#define SO_PASSSEC		34
+
+#endif /* __ASM_AVR32_SOCKET_H */
diff --git a/include/asm-avr32/sockios.h b/include/asm-avr32/sockios.h
new file mode 100644
index 00000000000..84f3d65b3b3
--- /dev/null
+++ b/include/asm-avr32/sockios.h
@@ -0,0 +1,12 @@
+#ifndef __ASM_AVR32_SOCKIOS_H
+#define __ASM_AVR32_SOCKIOS_H
+
+/* Socket-level I/O control calls. */
+#define FIOSETOWN 	0x8901
+#define SIOCSPGRP	0x8902
+#define FIOGETOWN	0x8903
+#define SIOCGPGRP	0x8904
+#define SIOCATMARK	0x8905
+#define SIOCGSTAMP	0x8906		/* Get stamp */
+
+#endif /* __ASM_AVR32_SOCKIOS_H */
diff --git a/include/asm-avr32/stat.h b/include/asm-avr32/stat.h
new file mode 100644
index 00000000000..e72881e1023
--- /dev/null
+++ b/include/asm-avr32/stat.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ASM_AVR32_STAT_H
+#define __ASM_AVR32_STAT_H
+
+struct __old_kernel_stat {
+        unsigned short st_dev;
+        unsigned short st_ino;
+        unsigned short st_mode;
+        unsigned short st_nlink;
+        unsigned short st_uid;
+        unsigned short st_gid;
+        unsigned short st_rdev;
+        unsigned long  st_size;
+        unsigned long  st_atime;
+        unsigned long  st_mtime;
+        unsigned long  st_ctime;
+};
+
+struct stat {
+        unsigned long st_dev;
+        unsigned long st_ino;
+        unsigned short st_mode;
+        unsigned short st_nlink;
+        unsigned short st_uid;
+        unsigned short st_gid;
+        unsigned long  st_rdev;
+        unsigned long  st_size;
+        unsigned long  st_blksize;
+        unsigned long  st_blocks;
+        unsigned long  st_atime;
+        unsigned long  st_atime_nsec;
+        unsigned long  st_mtime;
+        unsigned long  st_mtime_nsec;
+        unsigned long  st_ctime;
+        unsigned long  st_ctime_nsec;
+        unsigned long  __unused4;
+        unsigned long  __unused5;
+};
+
+#define STAT_HAVE_NSEC 1
+
+struct stat64 {
+	unsigned long long st_dev;
+
+	unsigned long long st_ino;
+	unsigned int	st_mode;
+	unsigned int	st_nlink;
+
+	unsigned long	st_uid;
+	unsigned long	st_gid;
+
+	unsigned long long st_rdev;
+
+	long long	st_size;
+	unsigned long	__pad1;		/* align 64-bit st_blocks */
+	unsigned long	st_blksize;
+
+	unsigned long long st_blocks;	/* Number 512-byte blocks allocated. */
+
+	unsigned long	st_atime;
+	unsigned long	st_atime_nsec;
+
+	unsigned long	st_mtime;
+	unsigned long	st_mtime_nsec;
+
+	unsigned long	st_ctime;
+	unsigned long	st_ctime_nsec;
+
+	unsigned long	__unused1;
+	unsigned long	__unused2;
+};
+
+#endif /* __ASM_AVR32_STAT_H */
diff --git a/include/asm-avr32/statfs.h b/include/asm-avr32/statfs.h
new file mode 100644
index 00000000000..2961bd18c50
--- /dev/null
+++ b/include/asm-avr32/statfs.h
@@ -0,0 +1,6 @@
+#ifndef __ASM_AVR32_STATFS_H
+#define __ASM_AVR32_STATFS_H
+
+#include <asm-generic/statfs.h>
+
+#endif /* __ASM_AVR32_STATFS_H */
diff --git a/include/asm-avr32/string.h b/include/asm-avr32/string.h
new file mode 100644
index 00000000000..c91a623cd58
--- /dev/null
+++ b/include/asm-avr32/string.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ASM_AVR32_STRING_H
+#define __ASM_AVR32_STRING_H
+
+#define __HAVE_ARCH_MEMSET
+extern void *memset(void *b, int c, size_t len);
+
+#define __HAVE_ARCH_MEMCPY
+extern void *memcpy(void *to, const void *from, size_t len);
+
+#endif /* __ASM_AVR32_STRING_H */
diff --git a/include/asm-avr32/sysreg.h b/include/asm-avr32/sysreg.h
new file mode 100644
index 00000000000..f91975f330f
--- /dev/null
+++ b/include/asm-avr32/sysreg.h
@@ -0,0 +1,332 @@
+/*
+ * AVR32 System Registers
+ *
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ASM_AVR32_SYSREG_H__
+#define __ASM_AVR32_SYSREG_H__
+
+/* sysreg register offsets */
+#define SYSREG_SR                               0x0000
+#define SYSREG_EVBA                             0x0004
+#define SYSREG_ACBA                             0x0008
+#define SYSREG_CPUCR                            0x000c
+#define SYSREG_ECR                              0x0010
+#define SYSREG_RSR_SUP                          0x0014
+#define SYSREG_RSR_INT0                         0x0018
+#define SYSREG_RSR_INT1                         0x001c
+#define SYSREG_RSR_INT2                         0x0020
+#define SYSREG_RSR_INT3                         0x0024
+#define SYSREG_RSR_EX                           0x0028
+#define SYSREG_RSR_NMI                          0x002c
+#define SYSREG_RSR_DBG                          0x0030
+#define SYSREG_RAR_SUP                          0x0034
+#define SYSREG_RAR_INT0                         0x0038
+#define SYSREG_RAR_INT1                         0x003c
+#define SYSREG_RAR_INT2                         0x0040
+#define SYSREG_RAR_INT3                         0x0044
+#define SYSREG_RAR_EX                           0x0048
+#define SYSREG_RAR_NMI                          0x004c
+#define SYSREG_RAR_DBG                          0x0050
+#define SYSREG_JECR                             0x0054
+#define SYSREG_JOSP                             0x0058
+#define SYSREG_JAVA_LV0                         0x005c
+#define SYSREG_JAVA_LV1                         0x0060
+#define SYSREG_JAVA_LV2                         0x0064
+#define SYSREG_JAVA_LV3                         0x0068
+#define SYSREG_JAVA_LV4                         0x006c
+#define SYSREG_JAVA_LV5                         0x0070
+#define SYSREG_JAVA_LV6                         0x0074
+#define SYSREG_JAVA_LV7                         0x0078
+#define SYSREG_JTBA                             0x007c
+#define SYSREG_JBCR                             0x0080
+#define SYSREG_CONFIG0                          0x0100
+#define SYSREG_CONFIG1                          0x0104
+#define SYSREG_COUNT                            0x0108
+#define SYSREG_COMPARE                          0x010c
+#define SYSREG_TLBEHI                           0x0110
+#define SYSREG_TLBELO                           0x0114
+#define SYSREG_PTBR                             0x0118
+#define SYSREG_TLBEAR                           0x011c
+#define SYSREG_MMUCR                            0x0120
+#define SYSREG_TLBARLO                          0x0124
+#define SYSREG_TLBARHI                          0x0128
+#define SYSREG_PCCNT                            0x012c
+#define SYSREG_PCNT0                            0x0130
+#define SYSREG_PCNT1                            0x0134
+#define SYSREG_PCCR                             0x0138
+#define SYSREG_BEAR                             0x013c
+
+/* Bitfields in SR */
+#define SYSREG_SR_C_OFFSET                      0
+#define SYSREG_SR_C_SIZE                        1
+#define SYSREG_Z_OFFSET                         1
+#define SYSREG_Z_SIZE                           1
+#define SYSREG_SR_N_OFFSET                      2
+#define SYSREG_SR_N_SIZE                        1
+#define SYSREG_SR_V_OFFSET                      3
+#define SYSREG_SR_V_SIZE                        1
+#define SYSREG_Q_OFFSET                         4
+#define SYSREG_Q_SIZE                           1
+#define SYSREG_GM_OFFSET                        16
+#define SYSREG_GM_SIZE                          1
+#define SYSREG_I0M_OFFSET                       17
+#define SYSREG_I0M_SIZE                         1
+#define SYSREG_I1M_OFFSET                       18
+#define SYSREG_I1M_SIZE                         1
+#define SYSREG_I2M_OFFSET                       19
+#define SYSREG_I2M_SIZE                         1
+#define SYSREG_I3M_OFFSET                       20
+#define SYSREG_I3M_SIZE                         1
+#define SYSREG_EM_OFFSET                        21
+#define SYSREG_EM_SIZE                          1
+#define SYSREG_M0_OFFSET                        22
+#define SYSREG_M0_SIZE                          1
+#define SYSREG_M1_OFFSET                        23
+#define SYSREG_M1_SIZE                          1
+#define SYSREG_M2_OFFSET                        24
+#define SYSREG_M2_SIZE                          1
+#define SYSREG_SR_D_OFFSET                      26
+#define SYSREG_SR_D_SIZE                        1
+#define SYSREG_DM_OFFSET                        27
+#define SYSREG_DM_SIZE                          1
+#define SYSREG_SR_J_OFFSET                      28
+#define SYSREG_SR_J_SIZE                        1
+#define SYSREG_R_OFFSET                         29
+#define SYSREG_R_SIZE                           1
+#define SYSREG_H_OFFSET                         30
+#define SYSREG_H_SIZE                           1
+
+/* Bitfields in EVBA */
+
+/* Bitfields in ACBA */
+
+/* Bitfields in CPUCR */
+#define SYSREG_BI_OFFSET                        0
+#define SYSREG_BI_SIZE                          1
+#define SYSREG_BE_OFFSET                        1
+#define SYSREG_BE_SIZE                          1
+#define SYSREG_FE_OFFSET                        2
+#define SYSREG_FE_SIZE                          1
+#define SYSREG_RE_OFFSET                        3
+#define SYSREG_RE_SIZE                          1
+#define SYSREG_IBE_OFFSET                       4
+#define SYSREG_IBE_SIZE                         1
+#define SYSREG_IEE_OFFSET                       5
+#define SYSREG_IEE_SIZE                         1
+
+/* Bitfields in ECR */
+#define SYSREG_ECR_OFFSET                       0
+#define SYSREG_ECR_SIZE                         32
+
+/* Bitfields in RSR_SUP */
+
+/* Bitfields in RSR_INT0 */
+
+/* Bitfields in RSR_INT1 */
+
+/* Bitfields in RSR_INT2 */
+
+/* Bitfields in RSR_INT3 */
+
+/* Bitfields in RSR_EX */
+
+/* Bitfields in RSR_NMI */
+
+/* Bitfields in RSR_DBG */
+
+/* Bitfields in RAR_SUP */
+
+/* Bitfields in RAR_INT0 */
+
+/* Bitfields in RAR_INT1 */
+
+/* Bitfields in RAR_INT2 */
+
+/* Bitfields in RAR_INT3 */
+
+/* Bitfields in RAR_EX */
+
+/* Bitfields in RAR_NMI */
+
+/* Bitfields in RAR_DBG */
+
+/* Bitfields in JECR */
+
+/* Bitfields in JOSP */
+
+/* Bitfields in JAVA_LV0 */
+
+/* Bitfields in JAVA_LV1 */
+
+/* Bitfields in JAVA_LV2 */
+
+/* Bitfields in JAVA_LV3 */
+
+/* Bitfields in JAVA_LV4 */
+
+/* Bitfields in JAVA_LV5 */
+
+/* Bitfields in JAVA_LV6 */
+
+/* Bitfields in JAVA_LV7 */
+
+/* Bitfields in JTBA */
+
+/* Bitfields in JBCR */
+
+/* Bitfields in CONFIG0 */
+#define SYSREG_CONFIG0_D_OFFSET                 1
+#define SYSREG_CONFIG0_D_SIZE                   1
+#define SYSREG_CONFIG0_S_OFFSET                 2
+#define SYSREG_CONFIG0_S_SIZE                   1
+#define SYSREG_O_OFFSET                         3
+#define SYSREG_O_SIZE                           1
+#define SYSREG_P_OFFSET                         4
+#define SYSREG_P_SIZE                           1
+#define SYSREG_CONFIG0_J_OFFSET                 5
+#define SYSREG_CONFIG0_J_SIZE                   1
+#define SYSREG_F_OFFSET                         6
+#define SYSREG_F_SIZE                           1
+#define SYSREG_MMUT_OFFSET                      7
+#define SYSREG_MMUT_SIZE                        3
+#define SYSREG_AR_OFFSET                        10
+#define SYSREG_AR_SIZE                          3
+#define SYSREG_AT_OFFSET                        13
+#define SYSREG_AT_SIZE                          3
+#define SYSREG_PROCESSORREVISION_OFFSET         16
+#define SYSREG_PROCESSORREVISION_SIZE           8
+#define SYSREG_PROCESSORID_OFFSET               24
+#define SYSREG_PROCESSORID_SIZE                 8
+
+/* Bitfields in CONFIG1 */
+#define SYSREG_DASS_OFFSET                      0
+#define SYSREG_DASS_SIZE                        3
+#define SYSREG_DLSZ_OFFSET                      3
+#define SYSREG_DLSZ_SIZE                        3
+#define SYSREG_DSET_OFFSET                      6
+#define SYSREG_DSET_SIZE                        4
+#define SYSREG_IASS_OFFSET                      10
+#define SYSREG_IASS_SIZE                        2
+#define SYSREG_ILSZ_OFFSET                      13
+#define SYSREG_ILSZ_SIZE                        3
+#define SYSREG_ISET_OFFSET                      16
+#define SYSREG_ISET_SIZE                        4
+#define SYSREG_DMMUSZ_OFFSET                    20
+#define SYSREG_DMMUSZ_SIZE                      6
+#define SYSREG_IMMUSZ_OFFSET                    26
+#define SYSREG_IMMUSZ_SIZE                      6
+
+/* Bitfields in COUNT */
+
+/* Bitfields in COMPARE */
+
+/* Bitfields in TLBEHI */
+#define SYSREG_ASID_OFFSET                      0
+#define SYSREG_ASID_SIZE                        8
+#define SYSREG_TLBEHI_I_OFFSET                  8
+#define SYSREG_TLBEHI_I_SIZE                    1
+#define SYSREG_TLBEHI_V_OFFSET                  9
+#define SYSREG_TLBEHI_V_SIZE                    1
+#define SYSREG_VPN_OFFSET                       10
+#define SYSREG_VPN_SIZE                         22
+
+/* Bitfields in TLBELO */
+#define SYSREG_W_OFFSET                         0
+#define SYSREG_W_SIZE                           1
+#define SYSREG_TLBELO_D_OFFSET                  1
+#define SYSREG_TLBELO_D_SIZE                    1
+#define SYSREG_SZ_OFFSET                        2
+#define SYSREG_SZ_SIZE                          2
+#define SYSREG_AP_OFFSET                        4
+#define SYSREG_AP_SIZE                          3
+#define SYSREG_B_OFFSET                         7
+#define SYSREG_B_SIZE                           1
+#define SYSREG_G_OFFSET                         8
+#define SYSREG_G_SIZE                           1
+#define SYSREG_TLBELO_C_OFFSET                  9
+#define SYSREG_TLBELO_C_SIZE                    1
+#define SYSREG_PFN_OFFSET                       10
+#define SYSREG_PFN_SIZE                         22
+
+/* Bitfields in PTBR */
+
+/* Bitfields in TLBEAR */
+
+/* Bitfields in MMUCR */
+#define SYSREG_E_OFFSET                         0
+#define SYSREG_E_SIZE                           1
+#define SYSREG_M_OFFSET                         1
+#define SYSREG_M_SIZE                           1
+#define SYSREG_MMUCR_I_OFFSET                   2
+#define SYSREG_MMUCR_I_SIZE                     1
+#define SYSREG_MMUCR_N_OFFSET                   3
+#define SYSREG_MMUCR_N_SIZE                     1
+#define SYSREG_MMUCR_S_OFFSET                   4
+#define SYSREG_MMUCR_S_SIZE                     1
+#define SYSREG_DLA_OFFSET                       8
+#define SYSREG_DLA_SIZE                         6
+#define SYSREG_DRP_OFFSET                       14
+#define SYSREG_DRP_SIZE                         6
+#define SYSREG_ILA_OFFSET                       20
+#define SYSREG_ILA_SIZE                         6
+#define SYSREG_IRP_OFFSET                       26
+#define SYSREG_IRP_SIZE                         6
+
+/* Bitfields in TLBARLO */
+
+/* Bitfields in TLBARHI */
+
+/* Bitfields in PCCNT */
+
+/* Bitfields in PCNT0 */
+
+/* Bitfields in PCNT1 */
+
+/* Bitfields in PCCR */
+
+/* Bitfields in BEAR */
+
+/* Constants for ECR */
+#define ECR_UNRECOVERABLE                       0
+#define ECR_TLB_MULTIPLE                        1
+#define ECR_BUS_ERROR_WRITE                     2
+#define ECR_BUS_ERROR_READ                      3
+#define ECR_NMI                                 4
+#define ECR_ADDR_ALIGN_X                        5
+#define ECR_PROTECTION_X                        6
+#define ECR_DEBUG                               7
+#define ECR_ILLEGAL_OPCODE                      8
+#define ECR_UNIMPL_INSTRUCTION                  9
+#define ECR_PRIVILEGE_VIOLATION                 10
+#define ECR_FPE                                 11
+#define ECR_COPROC_ABSENT                       12
+#define ECR_ADDR_ALIGN_R                        13
+#define ECR_ADDR_ALIGN_W                        14
+#define ECR_PROTECTION_R                        15
+#define ECR_PROTECTION_W                        16
+#define ECR_DTLB_MODIFIED                       17
+#define ECR_TLB_MISS_X                          20
+#define ECR_TLB_MISS_R                          24
+#define ECR_TLB_MISS_W                          28
+
+/* Bit manipulation macros */
+#define SYSREG_BIT(name)                        (1 << SYSREG_##name##_OFFSET)
+#define SYSREG_BF(name,value)                   (((value) & ((1 << SYSREG_##name##_SIZE) - 1)) << SYSREG_##name##_OFFSET)
+#define SYSREG_BFEXT(name,value)                (((value) >> SYSREG_##name##_OFFSET) & ((1 << SYSREG_##name##_SIZE) - 1))
+#define SYSREG_BFINS(name,value,old)            (((old) & ~(((1 << SYSREG_##name##_SIZE) - 1) << SYSREG_##name##_OFFSET)) | SYSREG_BF(name,value))
+
+#ifdef __CHECKER__
+extern unsigned long __builtin_mfsr(unsigned long reg);
+extern void __builtin_mtsr(unsigned long reg, unsigned long value);
+#endif
+
+/* Register access macros */
+#define sysreg_read(reg)                        __builtin_mfsr(SYSREG_##reg)
+#define sysreg_write(reg, value)                __builtin_mtsr(SYSREG_##reg, value)
+
+#endif /* __ASM_AVR32_SYSREG_H__ */
diff --git a/include/asm-avr32/system.h b/include/asm-avr32/system.h
new file mode 100644
index 00000000000..ac596058697
--- /dev/null
+++ b/include/asm-avr32/system.h
@@ -0,0 +1,155 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ASM_AVR32_SYSTEM_H
+#define __ASM_AVR32_SYSTEM_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+#include <asm/ptrace.h>
+#include <asm/sysreg.h>
+
+#define xchg(ptr,x) \
+	((__typeof__(*(ptr)))__xchg((unsigned long)(x),(ptr),sizeof(*(ptr))))
+
+#define nop() asm volatile("nop")
+
+#define mb()			asm volatile("" : : : "memory")
+#define rmb()			mb()
+#define wmb()			asm volatile("sync 0" : : : "memory")
+#define read_barrier_depends()  do { } while(0)
+#define set_mb(var, value)      do { var = value; mb(); } while(0)
+
+/*
+ * Help PathFinder and other Nexus-compliant debuggers keep track of
+ * the current PID by emitting an Ownership Trace Message each time we
+ * switch task.
+ */
+#ifdef CONFIG_OWNERSHIP_TRACE
+#include <asm/ocd.h>
+#define finish_arch_switch(prev)			\
+	do {						\
+		__mtdr(DBGREG_PID, prev->pid);		\
+		__mtdr(DBGREG_PID, current->pid);	\
+	} while(0)
+#endif
+
+/*
+ * switch_to(prev, next, last) should switch from task `prev' to task
+ * `next'. `prev' will never be the same as `next'.
+ *
+ * We just delegate everything to the __switch_to assembly function,
+ * which is implemented in arch/avr32/kernel/switch_to.S
+ *
+ * mb() tells GCC not to cache `current' across this call.
+ */
+struct cpu_context;
+struct task_struct;
+extern struct task_struct *__switch_to(struct task_struct *,
+				       struct cpu_context *,
+				       struct cpu_context *);
+#define switch_to(prev, next, last)					\
+	do {								\
+		last = __switch_to(prev, &prev->thread.cpu_context + 1,	\
+				   &next->thread.cpu_context);		\
+	} while (0)
+
+#ifdef CONFIG_SMP
+# error "The AVR32 port does not support SMP"
+#else
+# define smp_mb()		barrier()
+# define smp_rmb()		barrier()
+# define smp_wmb()		barrier()
+# define smp_read_barrier_depends() do { } while(0)
+#endif
+
+#include <linux/irqflags.h>
+
+extern void __xchg_called_with_bad_pointer(void);
+
+#ifdef __CHECKER__
+extern unsigned long __builtin_xchg(void *ptr, unsigned long x);
+#endif
+
+#define xchg_u32(val, m) __builtin_xchg((void *)m, val)
+
+static inline unsigned long __xchg(unsigned long x,
+				       volatile void *ptr,
+				       int size)
+{
+	switch(size) {
+	case 4:
+		return xchg_u32(x, ptr);
+	default:
+		__xchg_called_with_bad_pointer();
+		return x;
+	}
+}
+
+static inline unsigned long __cmpxchg_u32(volatile int *m, unsigned long old,
+					  unsigned long new)
+{
+	__u32 ret;
+
+	asm volatile(
+		"1:	ssrf	5\n"
+		"	ld.w	%[ret], %[m]\n"
+		"	cp.w	%[ret], %[old]\n"
+		"	brne	2f\n"
+		"	stcond	%[m], %[new]\n"
+		"	brne	1b\n"
+		"2:\n"
+		: [ret] "=&r"(ret), [m] "=m"(*m)
+		: "m"(m), [old] "ir"(old), [new] "r"(new)
+		: "memory", "cc");
+	return ret;
+}
+
+extern unsigned long __cmpxchg_u64_unsupported_on_32bit_kernels(
+        volatile int * m, unsigned long old, unsigned long new);
+#define __cmpxchg_u64 __cmpxchg_u64_unsupported_on_32bit_kernels
+
+/* This function doesn't exist, so you'll get a linker error
+   if something tries to do an invalid cmpxchg().  */
+extern void __cmpxchg_called_with_bad_pointer(void);
+
+#define __HAVE_ARCH_CMPXCHG 1
+
+static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
+				      unsigned long new, int size)
+{
+	switch (size) {
+	case 4:
+		return __cmpxchg_u32(ptr, old, new);
+	case 8:
+		return __cmpxchg_u64(ptr, old, new);
+	}
+
+	__cmpxchg_called_with_bad_pointer();
+	return old;
+}
+
+#define cmpxchg(ptr, old, new)					\
+	((typeof(*(ptr)))__cmpxchg((ptr), (unsigned long)(old),	\
+				   (unsigned long)(new),	\
+				   sizeof(*(ptr))))
+
+struct pt_regs;
+extern void __die(const char *, struct pt_regs *, unsigned long,
+		  const char *, const char *, unsigned long);
+extern void __die_if_kernel(const char *, struct pt_regs *, unsigned long,
+			    const char *, const char *, unsigned long);
+
+#define die(msg, regs, err)					\
+	__die(msg, regs, err, __FILE__ ":", __FUNCTION__, __LINE__)
+#define die_if_kernel(msg, regs, err)					\
+	__die_if_kernel(msg, regs, err, __FILE__ ":", __FUNCTION__, __LINE__)
+
+#define arch_align_stack(x)	(x)
+
+#endif /* __ASM_AVR32_SYSTEM_H */
diff --git a/include/asm-avr32/termbits.h b/include/asm-avr32/termbits.h
new file mode 100644
index 00000000000..9dc6eacafa3
--- /dev/null
+++ b/include/asm-avr32/termbits.h
@@ -0,0 +1,173 @@
+#ifndef __ASM_AVR32_TERMBITS_H
+#define __ASM_AVR32_TERMBITS_H
+
+#include <linux/posix_types.h>
+
+typedef unsigned char	cc_t;
+typedef unsigned int	speed_t;
+typedef unsigned int	tcflag_t;
+
+#define NCCS 19
+struct termios {
+	tcflag_t c_iflag;		/* input mode flags */
+	tcflag_t c_oflag;		/* output mode flags */
+	tcflag_t c_cflag;		/* control mode flags */
+	tcflag_t c_lflag;		/* local mode flags */
+	cc_t c_line;			/* line discipline */
+	cc_t c_cc[NCCS];		/* control characters */
+};
+
+/* c_cc characters */
+#define VINTR 0
+#define VQUIT 1
+#define VERASE 2
+#define VKILL 3
+#define VEOF 4
+#define VTIME 5
+#define VMIN 6
+#define VSWTC 7
+#define VSTART 8
+#define VSTOP 9
+#define VSUSP 10
+#define VEOL 11
+#define VREPRINT 12
+#define VDISCARD 13
+#define VWERASE 14
+#define VLNEXT 15
+#define VEOL2 16
+
+/* c_iflag bits */
+#define IGNBRK	0000001
+#define BRKINT	0000002
+#define IGNPAR	0000004
+#define PARMRK	0000010
+#define INPCK	0000020
+#define ISTRIP	0000040
+#define INLCR	0000100
+#define IGNCR	0000200
+#define ICRNL	0000400
+#define IUCLC	0001000
+#define IXON	0002000
+#define IXANY	0004000
+#define IXOFF	0010000
+#define IMAXBEL	0020000
+#define IUTF8	0040000
+
+/* c_oflag bits */
+#define OPOST	0000001
+#define OLCUC	0000002
+#define ONLCR	0000004
+#define OCRNL	0000010
+#define ONOCR	0000020
+#define ONLRET	0000040
+#define OFILL	0000100
+#define OFDEL	0000200
+#define NLDLY	0000400
+#define   NL0	0000000
+#define   NL1	0000400
+#define CRDLY	0003000
+#define   CR0	0000000
+#define   CR1	0001000
+#define   CR2	0002000
+#define   CR3	0003000
+#define TABDLY	0014000
+#define   TAB0	0000000
+#define   TAB1	0004000
+#define   TAB2	0010000
+#define   TAB3	0014000
+#define   XTABS	0014000
+#define BSDLY	0020000
+#define   BS0	0000000
+#define   BS1	0020000
+#define VTDLY	0040000
+#define   VT0	0000000
+#define   VT1	0040000
+#define FFDLY	0100000
+#define   FF0	0000000
+#define   FF1	0100000
+
+/* c_cflag bit meaning */
+#define CBAUD	0010017
+#define  B0	0000000		/* hang up */
+#define  B50	0000001
+#define  B75	0000002
+#define  B110	0000003
+#define  B134	0000004
+#define  B150	0000005
+#define  B200	0000006
+#define  B300	0000007
+#define  B600	0000010
+#define  B1200	0000011
+#define  B1800	0000012
+#define  B2400	0000013
+#define  B4800	0000014
+#define  B9600	0000015
+#define  B19200	0000016
+#define  B38400	0000017
+#define EXTA B19200
+#define EXTB B38400
+#define CSIZE	0000060
+#define   CS5	0000000
+#define   CS6	0000020
+#define   CS7	0000040
+#define   CS8	0000060
+#define CSTOPB	0000100
+#define CREAD	0000200
+#define PARENB	0000400
+#define PARODD	0001000
+#define HUPCL	0002000
+#define CLOCAL	0004000
+#define CBAUDEX 0010000
+#define    B57600 0010001
+#define   B115200 0010002
+#define   B230400 0010003
+#define   B460800 0010004
+#define   B500000 0010005
+#define   B576000 0010006
+#define   B921600 0010007
+#define  B1000000 0010010
+#define  B1152000 0010011
+#define  B1500000 0010012
+#define  B2000000 0010013
+#define  B2500000 0010014
+#define  B3000000 0010015
+#define  B3500000 0010016
+#define  B4000000 0010017
+#define CIBAUD	  002003600000	/* input baud rate (not used) */
+#define CMSPAR	  010000000000		/* mark or space (stick) parity */
+#define CRTSCTS	  020000000000		/* flow control */
+
+/* c_lflag bits */
+#define ISIG	0000001
+#define ICANON	0000002
+#define XCASE	0000004
+#define ECHO	0000010
+#define ECHOE	0000020
+#define ECHOK	0000040
+#define ECHONL	0000100
+#define NOFLSH	0000200
+#define TOSTOP	0000400
+#define ECHOCTL	0001000
+#define ECHOPRT	0002000
+#define ECHOKE	0004000
+#define FLUSHO	0010000
+#define PENDIN	0040000
+#define IEXTEN	0100000
+
+/* tcflow() and TCXONC use these */
+#define	TCOOFF		0
+#define	TCOON		1
+#define	TCIOFF		2
+#define	TCION		3
+
+/* tcflush() and TCFLSH use these */
+#define	TCIFLUSH	0
+#define	TCOFLUSH	1
+#define	TCIOFLUSH	2
+
+/* tcsetattr uses these */
+#define	TCSANOW		0
+#define	TCSADRAIN	1
+#define	TCSAFLUSH	2
+
+#endif /* __ASM_AVR32_TERMBITS_H */
diff --git a/include/asm-avr32/termios.h b/include/asm-avr32/termios.h
new file mode 100644
index 00000000000..615bc0639e5
--- /dev/null
+++ b/include/asm-avr32/termios.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ASM_AVR32_TERMIOS_H
+#define __ASM_AVR32_TERMIOS_H
+
+#include <asm/termbits.h>
+#include <asm/ioctls.h>
+
+struct winsize {
+	unsigned short ws_row;
+	unsigned short ws_col;
+	unsigned short ws_xpixel;
+	unsigned short ws_ypixel;
+};
+
+#define NCC 8
+struct termio {
+	unsigned short c_iflag;		/* input mode flags */
+	unsigned short c_oflag;		/* output mode flags */
+	unsigned short c_cflag;		/* control mode flags */
+	unsigned short c_lflag;		/* local mode flags */
+	unsigned char c_line;		/* line discipline */
+	unsigned char c_cc[NCC];	/* control characters */
+};
+
+/* modem lines */
+#define TIOCM_LE	0x001
+#define TIOCM_DTR	0x002
+#define TIOCM_RTS	0x004
+#define TIOCM_ST	0x008
+#define TIOCM_SR	0x010
+#define TIOCM_CTS	0x020
+#define TIOCM_CAR	0x040
+#define TIOCM_RNG	0x080
+#define TIOCM_DSR	0x100
+#define TIOCM_CD	TIOCM_CAR
+#define TIOCM_RI	TIOCM_RNG
+#define TIOCM_OUT1	0x2000
+#define TIOCM_OUT2	0x4000
+#define TIOCM_LOOP	0x8000
+
+/* ioctl (fd, TIOCSERGETLSR, &result) where result may be as below */
+
+/* line disciplines */
+#define N_TTY		0
+#define N_SLIP		1
+#define N_MOUSE		2
+#define N_PPP		3
+#define N_STRIP		4
+#define N_AX25		5
+#define N_X25		6	/* X.25 async */
+#define N_6PACK		7
+#define N_MASC		8	/* Reserved for Mobitex module <kaz@cafe.net> */
+#define N_R3964		9	/* Reserved for Simatic R3964 module */
+#define N_PROFIBUS_FDL	10	/* Reserved for Profibus <Dave@mvhi.com> */
+#define N_IRDA		11	/* Linux IR - http://irda.sourceforge.net/ */
+#define N_SMSBLOCK	12	/* SMS block mode - for talking to GSM data cards about SMS messages */
+#define N_HDLC		13	/* synchronous HDLC */
+#define N_SYNC_PPP	14	/* synchronous PPP */
+#define N_HCI		15  /* Bluetooth HCI UART */
+
+#ifdef __KERNEL__
+/*	intr=^C		quit=^\		erase=del	kill=^U
+	eof=^D		vtime=\0	vmin=\1		sxtc=\0
+	start=^Q	stop=^S		susp=^Z		eol=\0
+	reprint=^R	discard=^U	werase=^W	lnext=^V
+	eol2=\0
+*/
+#define INIT_C_CC "\003\034\177\025\004\0\1\0\021\023\032\0\022\017\027\026\0"
+
+#include <asm-generic/termios.h>
+
+#endif	/* __KERNEL__ */
+
+#endif	/* __ASM_AVR32_TERMIOS_H */
diff --git a/include/asm-avr32/thread_info.h b/include/asm-avr32/thread_info.h
new file mode 100644
index 00000000000..d1f5b35ebd5
--- /dev/null
+++ b/include/asm-avr32/thread_info.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ASM_AVR32_THREAD_INFO_H
+#define __ASM_AVR32_THREAD_INFO_H
+
+#include <asm/page.h>
+
+#define THREAD_SIZE_ORDER	1
+#define THREAD_SIZE		(PAGE_SIZE << THREAD_SIZE_ORDER)
+
+#ifndef __ASSEMBLY__
+#include <asm/types.h>
+
+struct task_struct;
+struct exec_domain;
+
+struct thread_info {
+	struct task_struct	*task;		/* main task structure */
+	struct exec_domain	*exec_domain;	/* execution domain */
+	unsigned long		flags;		/* low level flags */
+	__u32			cpu;
+	__s32			preempt_count;	/* 0 => preemptable, <0 => BUG */
+	struct restart_block	restart_block;
+	__u8			supervisor_stack[0];
+};
+
+#define INIT_THREAD_INFO(tsk)						\
+{									\
+	.task		= &tsk,						\
+	.exec_domain	= &default_exec_domain,				\
+	.flags		= 0,						\
+	.cpu		= 0,						\
+	.preempt_count	= 1,						\
+	.restart_block	= {						\
+		.fn	= do_no_restart_syscall				\
+	}								\
+}
+
+#define init_thread_info	(init_thread_union.thread_info)
+#define init_stack		(init_thread_union.stack)
+
+/*
+ * Get the thread information struct from C.
+ * We do the usual trick and use the lower end of the stack for this
+ */
+static inline struct thread_info *current_thread_info(void)
+{
+	unsigned long addr = ~(THREAD_SIZE - 1);
+
+	asm("and %0, sp" : "=r"(addr) : "0"(addr));
+	return (struct thread_info *)addr;
+}
+
+/* thread information allocation */
+#define alloc_thread_info(ti) \
+	((struct thread_info *) __get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER))
+#define free_thread_info(ti) free_pages((unsigned long)(ti), 1)
+#define get_thread_info(ti) get_task_struct((ti)->task)
+#define put_thread_info(ti) put_task_struct((ti)->task)
+
+#endif /* !__ASSEMBLY__ */
+
+#define PREEMPT_ACTIVE		0x40000000
+
+/*
+ * Thread information flags
+ * - these are process state flags that various assembly files may need to access
+ * - pending work-to-be-done flags are in LSW
+ * - other flags in MSW
+ */
+#define TIF_SYSCALL_TRACE       0       /* syscall trace active */
+#define TIF_NOTIFY_RESUME       1       /* resumption notification requested */
+#define TIF_SIGPENDING          2       /* signal pending */
+#define TIF_NEED_RESCHED        3       /* rescheduling necessary */
+#define TIF_POLLING_NRFLAG      4       /* true if poll_idle() is polling
+					   TIF_NEED_RESCHED */
+#define TIF_BREAKPOINT		5	/* true if we should break after return */
+#define TIF_SINGLE_STEP		6	/* single step after next break */
+#define TIF_MEMDIE		7
+#define TIF_RESTORE_SIGMASK	8	/* restore signal mask in do_signal */
+#define TIF_USERSPACE		31      /* true if FS sets userspace */
+
+#define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
+#define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
+#define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
+#define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
+#define _TIF_POLLING_NRFLAG	(1 << TIF_POLLING_NRFLAG)
+#define _TIF_BREAKPOINT		(1 << TIF_BREAKPOINT)
+#define _TIF_SINGLE_STEP	(1 << TIF_SINGLE_STEP)
+#define _TIF_MEMDIE		(1 << TIF_MEMDIE)
+#define _TIF_RESTORE_SIGMASK	(1 << TIF_RESTORE_SIGMASK)
+
+/* XXX: These two masks must never span more than 16 bits! */
+/* work to do on interrupt/exception return */
+#define _TIF_WORK_MASK		0x0000013e
+/* work to do on any return to userspace */
+#define _TIF_ALLWORK_MASK	0x0000013f
+/* work to do on return from debug mode */
+#define _TIF_DBGWORK_MASK	0x0000017e
+
+#endif /* __ASM_AVR32_THREAD_INFO_H */
diff --git a/include/asm-avr32/timex.h b/include/asm-avr32/timex.h
new file mode 100644
index 00000000000..5e44ecb3ce0
--- /dev/null
+++ b/include/asm-avr32/timex.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ASM_AVR32_TIMEX_H
+#define __ASM_AVR32_TIMEX_H
+
+/*
+ * This is the frequency of the timer used for Linux's timer interrupt.
+ * The value should be defined as accurate as possible or under certain
+ * circumstances Linux timekeeping might become inaccurate or fail.
+ *
+ * For many system the exact clockrate of the timer isn't known but due to
+ * the way this value is used we can get away with a wrong value as long
+ * as this value is:
+ *
+ *  - a multiple of HZ
+ *  - a divisor of the actual rate
+ *
+ * 500000 is a good such cheat value.
+ *
+ * The obscure number 1193182 is the same as used by the original i8254
+ * time in legacy PC hardware; the chip is never found in AVR32 systems.
+ */
+#define CLOCK_TICK_RATE		500000	/* Underlying HZ */
+
+typedef unsigned long cycles_t;
+
+static inline cycles_t get_cycles (void)
+{
+	return 0;
+}
+
+extern int read_current_timer(unsigned long *timer_value);
+#define ARCH_HAS_READ_CURRENT_TIMER	1
+
+#endif /* __ASM_AVR32_TIMEX_H */
diff --git a/include/asm-avr32/tlb.h b/include/asm-avr32/tlb.h
new file mode 100644
index 00000000000..5c55f9ce7c7
--- /dev/null
+++ b/include/asm-avr32/tlb.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ASM_AVR32_TLB_H
+#define __ASM_AVR32_TLB_H
+
+#define tlb_start_vma(tlb, vma) \
+	flush_cache_range(vma, vma->vm_start, vma->vm_end)
+
+#define tlb_end_vma(tlb, vma) \
+	flush_tlb_range(vma, vma->vm_start, vma->vm_end)
+
+#define __tlb_remove_tlb_entry(tlb, pte, address) do { } while(0)
+
+/*
+ * Flush whole TLB for MM
+ */
+#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)
+
+#include <asm-generic/tlb.h>
+
+/*
+ * For debugging purposes
+ */
+extern void show_dtlb_entry(unsigned int index);
+extern void dump_dtlb(void);
+
+#endif /* __ASM_AVR32_TLB_H */
diff --git a/include/asm-avr32/tlbflush.h b/include/asm-avr32/tlbflush.h
new file mode 100644
index 00000000000..730e268f81f
--- /dev/null
+++ b/include/asm-avr32/tlbflush.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ASM_AVR32_TLBFLUSH_H
+#define __ASM_AVR32_TLBFLUSH_H
+
+#include <asm/mmu.h>
+
+/*
+ * TLB flushing:
+ *
+ *  - flush_tlb() flushes the current mm struct TLBs
+ *  - flush_tlb_all() flushes all processes' TLB entries
+ *  - flush_tlb_mm(mm) flushes the specified mm context TLBs
+ *  - flush_tlb_page(vma, vmaddr) flushes one page
+ *  - flush_tlb_range(vma, start, end) flushes a range of pages
+ *  - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
+ *  - flush_tlb_pgtables(mm, start, end) flushes a range of page tables
+ */
+extern void flush_tlb(void);
+extern void flush_tlb_all(void);
+extern void flush_tlb_mm(struct mm_struct *mm);
+extern void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
+			    unsigned long end);
+extern void flush_tlb_page(struct vm_area_struct *vma, unsigned long page);
+extern void __flush_tlb_page(unsigned long asid, unsigned long page);
+
+static inline void flush_tlb_pgtables(struct mm_struct *mm,
+				      unsigned long start, unsigned long end)
+{
+	/* Nothing to do */
+}
+
+extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
+
+#endif /* __ASM_AVR32_TLBFLUSH_H */
diff --git a/include/asm-avr32/topology.h b/include/asm-avr32/topology.h
new file mode 100644
index 00000000000..5b766cbb480
--- /dev/null
+++ b/include/asm-avr32/topology.h
@@ -0,0 +1,6 @@
+#ifndef __ASM_AVR32_TOPOLOGY_H
+#define __ASM_AVR32_TOPOLOGY_H
+
+#include <asm-generic/topology.h>
+
+#endif /* __ASM_AVR32_TOPOLOGY_H */
diff --git a/include/asm-avr32/traps.h b/include/asm-avr32/traps.h
new file mode 100644
index 00000000000..6a8fb944f41
--- /dev/null
+++ b/include/asm-avr32/traps.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ASM_AVR32_TRAPS_H
+#define __ASM_AVR32_TRAPS_H
+
+#include <linux/list.h>
+
+struct undef_hook {
+	struct list_head node;
+	u32 insn_mask;
+	u32 insn_val;
+	int (*fn)(struct pt_regs *regs, u32 insn);
+};
+
+void register_undef_hook(struct undef_hook *hook);
+void unregister_undef_hook(struct undef_hook *hook);
+
+#endif /* __ASM_AVR32_TRAPS_H */
diff --git a/include/asm-avr32/types.h b/include/asm-avr32/types.h
new file mode 100644
index 00000000000..3f47db9675a
--- /dev/null
+++ b/include/asm-avr32/types.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ASM_AVR32_TYPES_H
+#define __ASM_AVR32_TYPES_H
+
+#ifndef __ASSEMBLY__
+
+typedef unsigned short umode_t;
+
+/*
+ * __xx is ok: it doesn't pollute the POSIX namespace. Use these in the
+ * header files exported to user space
+ */
+typedef __signed__ char __s8;
+typedef unsigned char __u8;
+
+typedef __signed__ short __s16;
+typedef unsigned short __u16;
+
+typedef __signed__ int __s32;
+typedef unsigned int __u32;
+
+#if defined(__GNUC__) && !defined(__STRICT_ANSI__)
+typedef __signed__ long long __s64;
+typedef unsigned long long __u64;
+#endif
+
+#endif /* __ASSEMBLY__ */
+
+/*
+ * These aren't exported outside the kernel to avoid name space clashes
+ */
+#ifdef __KERNEL__
+
+#define BITS_PER_LONG 32
+
+#ifndef __ASSEMBLY__
+
+typedef signed char s8;
+typedef unsigned char u8;
+
+typedef signed short s16;
+typedef unsigned short u16;
+
+typedef signed int s32;
+typedef unsigned int u32;
+
+typedef signed long long s64;
+typedef unsigned long long u64;
+
+/* Dma addresses are 32-bits wide.  */
+
+typedef u32 dma_addr_t;
+
+#ifdef CONFIG_LBD
+typedef u64 sector_t;
+#define HAVE_SECTOR_T
+#endif
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* __KERNEL__ */
+
+
+#endif /* __ASM_AVR32_TYPES_H */
diff --git a/include/asm-avr32/uaccess.h b/include/asm-avr32/uaccess.h
new file mode 100644
index 00000000000..821deb5a9d2
--- /dev/null
+++ b/include/asm-avr32/uaccess.h
@@ -0,0 +1,335 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ASM_AVR32_UACCESS_H
+#define __ASM_AVR32_UACCESS_H
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+
+#define VERIFY_READ	0
+#define VERIFY_WRITE	1
+
+typedef struct {
+	unsigned int is_user_space;
+} mm_segment_t;
+
+/*
+ * The fs value determines whether argument validity checking should be
+ * performed or not.  If get_fs() == USER_DS, checking is performed, with
+ * get_fs() == KERNEL_DS, checking is bypassed.
+ *
+ * For historical reasons (Data Segment Register?), these macros are misnamed.
+ */
+#define MAKE_MM_SEG(s)	((mm_segment_t) { (s) })
+#define segment_eq(a,b)	((a).is_user_space == (b).is_user_space)
+
+#define USER_ADDR_LIMIT 0x80000000
+
+#define KERNEL_DS	MAKE_MM_SEG(0)
+#define USER_DS		MAKE_MM_SEG(1)
+
+#define get_ds()	(KERNEL_DS)
+
+static inline mm_segment_t get_fs(void)
+{
+	return MAKE_MM_SEG(test_thread_flag(TIF_USERSPACE));
+}
+
+static inline void set_fs(mm_segment_t s)
+{
+	if (s.is_user_space)
+		set_thread_flag(TIF_USERSPACE);
+	else
+		clear_thread_flag(TIF_USERSPACE);
+}
+
+/*
+ * Test whether a block of memory is a valid user space address.
+ * Returns 0 if the range is valid, nonzero otherwise.
+ *
+ * We do the following checks:
+ *   1. Is the access from kernel space?
+ *   2. Does (addr + size) set the carry bit?
+ *   3. Is (addr + size) a negative number (i.e. >= 0x80000000)?
+ *
+ * If yes on the first check, access is granted.
+ * If no on any of the others, access is denied.
+ */
+#define __range_ok(addr, size)						\
+	(test_thread_flag(TIF_USERSPACE)				\
+	 && (((unsigned long)(addr) >= 0x80000000)			\
+	     || ((unsigned long)(size) > 0x80000000)			\
+	     || (((unsigned long)(addr) + (unsigned long)(size)) > 0x80000000)))
+
+#define access_ok(type, addr, size) (likely(__range_ok(addr, size) == 0))
+
+static inline int
+verify_area(int type, const void __user *addr, unsigned long size)
+{
+	return access_ok(type, addr, size) ? 0 : -EFAULT;
+}
+
+/* Generic arbitrary sized copy. Return the number of bytes NOT copied */
+extern __kernel_size_t __copy_user(void *to, const void *from,
+				   __kernel_size_t n);
+
+extern __kernel_size_t copy_to_user(void __user *to, const void *from,
+				    __kernel_size_t n);
+extern __kernel_size_t copy_from_user(void *to, const void __user *from,
+				      __kernel_size_t n);
+
+static inline __kernel_size_t __copy_to_user(void __user *to, const void *from,
+					     __kernel_size_t n)
+{
+	return __copy_user((void __force *)to, from, n);
+}
+static inline __kernel_size_t __copy_from_user(void *to,
+					       const void __user *from,
+					       __kernel_size_t n)
+{
+	return __copy_user(to, (const void __force *)from, n);
+}
+
+#define __copy_to_user_inatomic __copy_to_user
+#define __copy_from_user_inatomic __copy_from_user
+
+/*
+ * put_user: - Write a simple value into user space.
+ * @x:   Value to copy to user space.
+ * @ptr: Destination address, in user space.
+ *
+ * Context: User context only.  This function may sleep.
+ *
+ * This macro copies a single simple value from kernel space to user
+ * space.  It supports simple types like char and int, but not larger
+ * data types like structures or arrays.
+ *
+ * @ptr must have pointer-to-simple-variable type, and @x must be assignable
+ * to the result of dereferencing @ptr.
+ *
+ * Returns zero on success, or -EFAULT on error.
+ */
+#define put_user(x,ptr)	\
+	__put_user_check((x),(ptr),sizeof(*(ptr)))
+
+/*
+ * get_user: - Get a simple variable from user space.
+ * @x:   Variable to store result.
+ * @ptr: Source address, in user space.
+ *
+ * Context: User context only.  This function may sleep.
+ *
+ * This macro copies a single simple variable from user space to kernel
+ * space.  It supports simple types like char and int, but not larger
+ * data types like structures or arrays.
+ *
+ * @ptr must have pointer-to-simple-variable type, and the result of
+ * dereferencing @ptr must be assignable to @x without a cast.
+ *
+ * Returns zero on success, or -EFAULT on error.
+ * On error, the variable @x is set to zero.
+ */
+#define get_user(x,ptr) \
+	__get_user_check((x),(ptr),sizeof(*(ptr)))
+
+/*
+ * __put_user: - Write a simple value into user space, with less checking.
+ * @x:   Value to copy to user space.
+ * @ptr: Destination address, in user space.
+ *
+ * Context: User context only.  This function may sleep.
+ *
+ * This macro copies a single simple value from kernel space to user
+ * space.  It supports simple types like char and int, but not larger
+ * data types like structures or arrays.
+ *
+ * @ptr must have pointer-to-simple-variable type, and @x must be assignable
+ * to the result of dereferencing @ptr.
+ *
+ * Caller must check the pointer with access_ok() before calling this
+ * function.
+ *
+ * Returns zero on success, or -EFAULT on error.
+ */
+#define __put_user(x,ptr) \
+	__put_user_nocheck((x),(ptr),sizeof(*(ptr)))
+
+/*
+ * __get_user: - Get a simple variable from user space, with less checking.
+ * @x:   Variable to store result.
+ * @ptr: Source address, in user space.
+ *
+ * Context: User context only.  This function may sleep.
+ *
+ * This macro copies a single simple variable from user space to kernel
+ * space.  It supports simple types like char and int, but not larger
+ * data types like structures or arrays.
+ *
+ * @ptr must have pointer-to-simple-variable type, and the result of
+ * dereferencing @ptr must be assignable to @x without a cast.
+ *
+ * Caller must check the pointer with access_ok() before calling this
+ * function.
+ *
+ * Returns zero on success, or -EFAULT on error.
+ * On error, the variable @x is set to zero.
+ */
+#define __get_user(x,ptr) \
+	__get_user_nocheck((x),(ptr),sizeof(*(ptr)))
+
+extern int __get_user_bad(void);
+extern int __put_user_bad(void);
+
+#define __get_user_nocheck(x, ptr, size)				\
+({									\
+	typeof(*(ptr)) __gu_val = (typeof(*(ptr)) __force)0;		\
+	int __gu_err = 0;						\
+									\
+	switch (size) {							\
+	case 1: __get_user_asm("ub", __gu_val, ptr, __gu_err); break;	\
+	case 2: __get_user_asm("uh", __gu_val, ptr, __gu_err); break;	\
+	case 4: __get_user_asm("w", __gu_val, ptr, __gu_err); break;	\
+	case 8: __get_user_asm("d", __gu_val, ptr, __gu_err); break;	\
+	default: __gu_err = __get_user_bad(); break;			\
+	}								\
+									\
+	x = __gu_val;							\
+	__gu_err;							\
+})
+
+#define __get_user_check(x, ptr, size)					\
+({									\
+	typeof(*(ptr)) __gu_val = (typeof(*(ptr)) __force)0;		\
+	const typeof(*(ptr)) __user * __gu_addr = (ptr);		\
+	int __gu_err = 0;						\
+									\
+	if (access_ok(VERIFY_READ, __gu_addr, size)) {			\
+		switch (size) {						\
+		case 1:							\
+			__get_user_asm("ub", __gu_val, __gu_addr,	\
+				       __gu_err);			\
+			break;						\
+		case 2:							\
+			__get_user_asm("uh", __gu_val, __gu_addr,	\
+				       __gu_err);			\
+			break;						\
+		case 4:							\
+			__get_user_asm("w", __gu_val, __gu_addr,	\
+				       __gu_err);			\
+			break;						\
+		case 8:							\
+			__get_user_asm("d", __gu_val, __gu_addr,	\
+				       __gu_err);			\
+			break;						\
+		default:						\
+			__gu_err = __get_user_bad();			\
+			break;						\
+		}							\
+	} else {							\
+		__gu_err = -EFAULT;					\
+	}								\
+	x = __gu_val;							\
+	__gu_err;							\
+})
+
+#define __get_user_asm(suffix, __gu_val, ptr, __gu_err)			\
+	asm volatile(							\
+		"1:	ld." suffix "	%1, %3			\n"	\
+		"2:						\n"	\
+		"	.section .fixup, \"ax\"			\n"	\
+		"3:	mov	%0, %4				\n"	\
+		"	rjmp	2b				\n"	\
+		"	.previous				\n"	\
+		"	.section __ex_table, \"a\"		\n"	\
+		"	.long	1b, 3b				\n"	\
+		"	.previous				\n"	\
+		: "=r"(__gu_err), "=r"(__gu_val)			\
+		: "0"(__gu_err), "m"(*(ptr)), "i"(-EFAULT))
+
+#define __put_user_nocheck(x, ptr, size)				\
+({									\
+	typeof(*(ptr)) __pu_val;					\
+	int __pu_err = 0;						\
+									\
+	__pu_val = (x);							\
+	switch (size) {							\
+	case 1: __put_user_asm("b", ptr, __pu_val, __pu_err); break;	\
+	case 2: __put_user_asm("h", ptr, __pu_val, __pu_err); break;	\
+	case 4: __put_user_asm("w", ptr, __pu_val, __pu_err); break;	\
+	case 8: __put_user_asm("d", ptr, __pu_val, __pu_err); break;	\
+	default: __pu_err = __put_user_bad(); break;			\
+	}								\
+	__pu_err;							\
+})
+
+#define __put_user_check(x, ptr, size)					\
+({									\
+	typeof(*(ptr)) __pu_val;					\
+	typeof(*(ptr)) __user *__pu_addr = (ptr);			\
+	int __pu_err = 0;						\
+									\
+	__pu_val = (x);							\
+	if (access_ok(VERIFY_WRITE, __pu_addr, size)) {			\
+		switch (size) {						\
+		case 1:							\
+			__put_user_asm("b", __pu_addr, __pu_val,	\
+				       __pu_err);			\
+			break;						\
+		case 2:							\
+			__put_user_asm("h", __pu_addr, __pu_val,	\
+				       __pu_err);			\
+			break;						\
+		case 4:							\
+			__put_user_asm("w", __pu_addr, __pu_val,	\
+				       __pu_err);			\
+			break;						\
+		case 8:							\
+			__put_user_asm("d", __pu_addr, __pu_val,		\
+				       __pu_err);			\
+			break;						\
+		default:						\
+			__pu_err = __put_user_bad();			\
+			break;						\
+		}							\
+	} else {							\
+		__pu_err = -EFAULT;					\
+	}								\
+	__pu_err;							\
+})
+
+#define __put_user_asm(suffix, ptr, __pu_val, __gu_err)			\
+	asm volatile(							\
+		"1:	st." suffix "	%1, %3			\n"	\
+		"2:						\n"	\
+		"	.section .fixup, \"ax\"			\n"	\
+		"3:	mov	%0, %4				\n"	\
+		"	rjmp	2b				\n"	\
+		"	.previous				\n"	\
+		"	.section __ex_table, \"a\"		\n"	\
+		"	.long	1b, 3b				\n"	\
+		"	.previous				\n"	\
+		: "=r"(__gu_err), "=m"(*(ptr))				\
+		: "0"(__gu_err), "r"(__pu_val), "i"(-EFAULT))
+
+extern __kernel_size_t clear_user(void __user *addr, __kernel_size_t size);
+extern __kernel_size_t __clear_user(void __user *addr, __kernel_size_t size);
+
+extern long strncpy_from_user(char *dst, const char __user *src, long count);
+extern long __strncpy_from_user(char *dst, const char __user *src, long count);
+
+extern long strnlen_user(const char __user *__s, long __n);
+extern long __strnlen_user(const char __user *__s, long __n);
+
+#define strlen_user(s) strnlen_user(s, ~0UL >> 1)
+
+struct exception_table_entry
+{
+	unsigned long insn, fixup;
+};
+
+#endif /* __ASM_AVR32_UACCESS_H */
diff --git a/include/asm-avr32/ucontext.h b/include/asm-avr32/ucontext.h
new file mode 100644
index 00000000000..ac7259c2a79
--- /dev/null
+++ b/include/asm-avr32/ucontext.h
@@ -0,0 +1,12 @@
+#ifndef __ASM_AVR32_UCONTEXT_H
+#define __ASM_AVR32_UCONTEXT_H
+
+struct ucontext {
+	unsigned long		uc_flags;
+	struct ucontext	*	uc_link;
+	stack_t			uc_stack;
+	struct sigcontext	uc_mcontext;
+	sigset_t		uc_sigmask;
+};
+
+#endif /* __ASM_AVR32_UCONTEXT_H */
diff --git a/include/asm-avr32/unaligned.h b/include/asm-avr32/unaligned.h
new file mode 100644
index 00000000000..3042723fcbf
--- /dev/null
+++ b/include/asm-avr32/unaligned.h
@@ -0,0 +1,25 @@
+#ifndef __ASM_AVR32_UNALIGNED_H
+#define __ASM_AVR32_UNALIGNED_H
+
+/*
+ * AVR32 can handle some unaligned accesses, depending on the
+ * implementation.  The AVR32 AP implementation can handle unaligned
+ * words, but halfwords must be halfword-aligned, and doublewords must
+ * be word-aligned.
+ *
+ * TODO: Make all this CPU-specific and optimize.
+ */
+
+#include <linux/string.h>
+
+/* Use memmove here, so gcc does not insert a __builtin_memcpy. */
+
+#define get_unaligned(ptr) \
+  ({ __typeof__(*(ptr)) __tmp; memmove(&__tmp, (ptr), sizeof(*(ptr))); __tmp; })
+
+#define put_unaligned(val, ptr)				\
+  ({ __typeof__(*(ptr)) __tmp = (val);			\
+     memmove((ptr), &__tmp, sizeof(*(ptr)));		\
+     (void)0; })
+
+#endif /* __ASM_AVR32_UNALIGNED_H */
diff --git a/include/asm-avr32/unistd.h b/include/asm-avr32/unistd.h
new file mode 100644
index 00000000000..1f528f92690
--- /dev/null
+++ b/include/asm-avr32/unistd.h
@@ -0,0 +1,387 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ASM_AVR32_UNISTD_H
+#define __ASM_AVR32_UNISTD_H
+
+/*
+ * This file contains the system call numbers.
+ */
+
+#define __NR_restart_syscall      0
+#define __NR_exit		  1
+#define __NR_fork		  2
+#define __NR_read		  3
+#define __NR_write		  4
+#define __NR_open		  5
+#define __NR_close		  6
+#define __NR_umask		  7
+#define __NR_creat		  8
+#define __NR_link		  9
+#define __NR_unlink		 10
+#define __NR_execve		 11
+#define __NR_chdir		 12
+#define __NR_time		 13
+#define __NR_mknod		 14
+#define __NR_chmod		 15
+#define __NR_chown		 16
+#define __NR_lchown		 17
+#define __NR_lseek		 18
+#define __NR__llseek		 19
+#define __NR_getpid		 20
+#define __NR_mount		 21
+#define __NR_umount2		 22
+#define __NR_setuid		 23
+#define __NR_getuid		 24
+#define __NR_stime		 25
+#define __NR_ptrace		 26
+#define __NR_alarm		 27
+#define __NR_pause		 28
+#define __NR_utime		 29
+#define __NR_stat		 30
+#define __NR_fstat		 31
+#define __NR_lstat		 32
+#define __NR_access		 33
+#define __NR_chroot		 34
+#define __NR_sync		 35
+#define __NR_fsync		 36
+#define __NR_kill		 37
+#define __NR_rename		 38
+#define __NR_mkdir		 39
+#define __NR_rmdir		 40
+#define __NR_dup		 41
+#define __NR_pipe		 42
+#define __NR_times		 43
+#define __NR_clone		 44
+#define __NR_brk		 45
+#define __NR_setgid		 46
+#define __NR_getgid		 47
+#define __NR_getcwd		 48
+#define __NR_geteuid		 49
+#define __NR_getegid		 50
+#define __NR_acct		 51
+#define __NR_setfsuid		 52
+#define __NR_setfsgid		 53
+#define __NR_ioctl		 54
+#define __NR_fcntl		 55
+#define __NR_setpgid		 56
+#define __NR_mremap		 57
+#define __NR_setresuid		 58
+#define __NR_getresuid		 59
+#define __NR_setreuid		 60
+#define __NR_setregid		 61
+#define __NR_ustat		 62
+#define __NR_dup2		 63
+#define __NR_getppid		 64
+#define __NR_getpgrp		 65
+#define __NR_setsid		 66
+#define __NR_rt_sigaction	 67
+#define __NR_rt_sigreturn	 68
+#define __NR_rt_sigprocmask	 69
+#define __NR_rt_sigpending	 70
+#define __NR_rt_sigtimedwait	 71
+#define __NR_rt_sigqueueinfo	 72
+#define __NR_rt_sigsuspend	 73
+#define __NR_sethostname	 74
+#define __NR_setrlimit		 75
+#define __NR_getrlimit		 76	/* SuS compliant getrlimit */
+#define __NR_getrusage		 77
+#define __NR_gettimeofday	 78
+#define __NR_settimeofday	 79
+#define __NR_getgroups		 80
+#define __NR_setgroups		 81
+#define __NR_select		 82
+#define __NR_symlink		 83
+#define __NR_fchdir		 84
+#define __NR_readlink		 85
+#define __NR_pread		 86
+#define __NR_pwrite		 87
+#define __NR_swapon		 88
+#define __NR_reboot		 89
+#define __NR_mmap2		 90
+#define __NR_munmap		 91
+#define __NR_truncate		 92
+#define __NR_ftruncate		 93
+#define __NR_fchmod		 94
+#define __NR_fchown		 95
+#define __NR_getpriority	 96
+#define __NR_setpriority	 97
+#define __NR_wait4		 98
+#define __NR_statfs		 99
+#define __NR_fstatfs		100
+#define __NR_vhangup		101
+#define __NR_sigaltstack	102
+#define __NR_syslog		103
+#define __NR_setitimer		104
+#define __NR_getitimer		105
+#define __NR_swapoff		106
+#define __NR_sysinfo		107
+#define __NR_ipc		108
+#define __NR_sendfile		109
+#define __NR_setdomainname	110
+#define __NR_uname		111
+#define __NR_adjtimex		112
+#define __NR_mprotect		113
+#define __NR_vfork		114
+#define __NR_init_module	115
+#define __NR_delete_module	116
+#define __NR_quotactl		117
+#define __NR_getpgid		118
+#define __NR_bdflush		119
+#define __NR_sysfs		120
+#define __NR_personality	121
+#define __NR_afs_syscall	122 /* Syscall for Andrew File System */
+#define __NR_getdents		123
+#define __NR_flock		124
+#define __NR_msync		125
+#define __NR_readv		126
+#define __NR_writev		127
+#define __NR_getsid		128
+#define __NR_fdatasync		129
+#define __NR__sysctl		130
+#define __NR_mlock		131
+#define __NR_munlock		132
+#define __NR_mlockall		133
+#define __NR_munlockall		134
+#define __NR_sched_setparam		135
+#define __NR_sched_getparam		136
+#define __NR_sched_setscheduler		137
+#define __NR_sched_getscheduler		138
+#define __NR_sched_yield		139
+#define __NR_sched_get_priority_max	140
+#define __NR_sched_get_priority_min	141
+#define __NR_sched_rr_get_interval	142
+#define __NR_nanosleep		143
+#define __NR_poll		144
+#define __NR_nfsservctl		145
+#define __NR_setresgid		146
+#define __NR_getresgid		147
+#define __NR_prctl              148
+#define __NR_socket		149
+#define __NR_bind		150
+#define __NR_connect		151
+#define __NR_listen		152
+#define __NR_accept		153
+#define __NR_getsockname	154
+#define __NR_getpeername	155
+#define __NR_socketpair		156
+#define __NR_send		157
+#define __NR_recv		158
+#define __NR_sendto		159
+#define __NR_recvfrom		160
+#define __NR_shutdown		161
+#define __NR_setsockopt		162
+#define __NR_getsockopt		163
+#define __NR_sendmsg		164
+#define __NR_recvmsg		165
+#define __NR_truncate64		166
+#define __NR_ftruncate64	167
+#define __NR_stat64		168
+#define __NR_lstat64		169
+#define __NR_fstat64		170
+#define __NR_pivot_root		171
+#define __NR_mincore		172
+#define __NR_madvise		173
+#define __NR_getdents64		174
+#define __NR_fcntl64		175
+#define __NR_gettid		176
+#define __NR_readahead		177
+#define __NR_setxattr		178
+#define __NR_lsetxattr		179
+#define __NR_fsetxattr		180
+#define __NR_getxattr		181
+#define __NR_lgetxattr		182
+#define __NR_fgetxattr		183
+#define __NR_listxattr		184
+#define __NR_llistxattr		185
+#define __NR_flistxattr		186
+#define __NR_removexattr	187
+#define __NR_lremovexattr	188
+#define __NR_fremovexattr	189
+#define __NR_tkill		190
+#define __NR_sendfile64		191
+#define __NR_futex		192
+#define __NR_sched_setaffinity	193
+#define __NR_sched_getaffinity	194
+#define __NR_capget		195
+#define __NR_capset		196
+#define __NR_io_setup		197
+#define __NR_io_destroy		198
+#define __NR_io_getevents	199
+#define __NR_io_submit		200
+#define __NR_io_cancel		201
+#define __NR_fadvise64		202
+#define __NR_exit_group		203
+#define __NR_lookup_dcookie	204
+#define __NR_epoll_create	205
+#define __NR_epoll_ctl		206
+#define __NR_epoll_wait		207
+#define __NR_remap_file_pages	208
+#define __NR_set_tid_address	209
+
+#define __NR_timer_create	210
+#define __NR_timer_settime	211
+#define __NR_timer_gettime	212
+#define __NR_timer_getoverrun	213
+#define __NR_timer_delete	214
+#define __NR_clock_settime	215
+#define __NR_clock_gettime	216
+#define __NR_clock_getres	217
+#define __NR_clock_nanosleep	218
+#define __NR_statfs64		219
+#define __NR_fstatfs64		220
+#define __NR_tgkill		221
+				/* 222 reserved for tux */
+#define __NR_utimes		223
+#define __NR_fadvise64_64	224
+
+#define __NR_cacheflush		225
+
+#define __NR_vserver		226
+#define __NR_mq_open		227
+#define __NR_mq_unlink		228
+#define __NR_mq_timedsend	229
+#define __NR_mq_timedreceive	230
+#define __NR_mq_notify		231
+#define __NR_mq_getsetattr	232
+#define __NR_kexec_load		233
+#define __NR_waitid		234
+#define __NR_add_key		235
+#define __NR_request_key	236
+#define __NR_keyctl		237
+#define __NR_ioprio_set		238
+#define __NR_ioprio_get		239
+#define __NR_inotify_init	240
+#define __NR_inotify_add_watch	241
+#define __NR_inotify_rm_watch	242
+#define __NR_openat		243
+#define __NR_mkdirat		244
+#define __NR_mknodat		245
+#define __NR_fchownat		246
+#define __NR_futimesat		247
+#define __NR_fstatat64		248
+#define __NR_unlinkat		249
+#define __NR_renameat		250
+#define __NR_linkat		251
+#define __NR_symlinkat		252
+#define __NR_readlinkat		253
+#define __NR_fchmodat		254
+#define __NR_faccessat		255
+#define __NR_pselect6		256
+#define __NR_ppoll		257
+#define __NR_unshare		258
+#define __NR_set_robust_list	259
+#define __NR_get_robust_list	260
+#define __NR_splice		261
+#define __NR_sync_file_range	262
+#define __NR_tee		263
+#define __NR_vmsplice		264
+
+#define NR_syscalls		265
+
+
+/*
+ * AVR32 calling convention for system calls:
+ *   - System call number in r8
+ *   - Parameters in r12 and downwards to r9 as well as r6 and r5.
+ *   - Return value in r12
+ */
+
+/*
+ * user-visible error numbers are in the range -1 - -124: see
+ * <asm-generic/errno.h>
+ */
+
+#define __syscall_return(type, res) do {				\
+		if ((unsigned long)(res) >= (unsigned long)(-125)) {	\
+			errno = -(res);					\
+			res = -1;					\
+		}							\
+		return (type) (res);					\
+	} while (0)
+
+#ifdef __KERNEL__
+#define __ARCH_WANT_IPC_PARSE_VERSION
+#define __ARCH_WANT_STAT64
+#define __ARCH_WANT_SYS_ALARM
+#define __ARCH_WANT_SYS_GETHOSTNAME
+#define __ARCH_WANT_SYS_PAUSE
+#define __ARCH_WANT_SYS_TIME
+#define __ARCH_WANT_SYS_UTIME
+#define __ARCH_WANT_SYS_WAITPID
+#define __ARCH_WANT_SYS_FADVISE64
+#define __ARCH_WANT_SYS_GETPGRP
+#define __ARCH_WANT_SYS_LLSEEK
+#define __ARCH_WANT_SYS_GETPGRP
+#define __ARCH_WANT_SYS_RT_SIGACTION
+#define __ARCH_WANT_SYS_RT_SIGSUSPEND
+#endif
+
+#if defined(__KERNEL_SYSCALLS__) || defined(__CHECKER__)
+
+#include <linux/types.h>
+#include <linux/linkage.h>
+#include <asm/signal.h>
+
+struct pt_regs;
+
+/*
+ * we need this inline - forking from kernel space will result
+ * in NO COPY ON WRITE (!!!), until an execve is executed. This
+ * is no problem, but for the stack. This is handled by not letting
+ * main() use the stack at all after fork(). Thus, no function
+ * calls - which means inline code for fork too, as otherwise we
+ * would use the stack upon exit from 'fork()'.
+ *
+ * Actually only pause and fork are needed inline, so that there
+ * won't be any messing with the stack from main(), but we define
+ * some others too.
+ */
+static inline int execve(const char *file, char **argv, char **envp)
+{
+	register long scno asm("r8") = __NR_execve;
+	register long sc1 asm("r12") = (long)file;
+	register long sc2 asm("r11") = (long)argv;
+	register long sc3 asm("r10") = (long)envp;
+	int res;
+
+	asm volatile("scall"
+		     : "=r"(sc1)
+		     : "r"(scno), "0"(sc1), "r"(sc2), "r"(sc3)
+		     : "lr", "memory");
+	res = sc1;
+	__syscall_return(int, res);
+}
+
+asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize);
+asmlinkage int sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
+			       struct pt_regs *regs);
+asmlinkage int sys_rt_sigreturn(struct pt_regs *regs);
+asmlinkage int sys_pipe(unsigned long __user *filedes);
+asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
+			  unsigned long prot, unsigned long flags,
+			  unsigned long fd, off_t offset);
+asmlinkage int sys_cacheflush(int operation, void __user *addr, size_t len);
+asmlinkage int sys_fork(struct pt_regs *regs);
+asmlinkage int sys_clone(unsigned long clone_flags, unsigned long newsp,
+			 unsigned long parent_tidptr,
+			 unsigned long child_tidptr, struct pt_regs *regs);
+asmlinkage int sys_vfork(struct pt_regs *regs);
+asmlinkage int sys_execve(char __user *ufilename, char __user *__user *uargv,
+			  char __user *__user *uenvp, struct pt_regs *regs);
+
+#endif
+
+/*
+ * "Conditional" syscalls
+ *
+ * What we want is __attribute__((weak,alias("sys_ni_syscall"))),
+ * but it doesn't work on all toolchains, so we just do it by hand
+ */
+#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall");
+
+#endif /* __ASM_AVR32_UNISTD_H */
diff --git a/include/asm-avr32/user.h b/include/asm-avr32/user.h
new file mode 100644
index 00000000000..060fb3acee4
--- /dev/null
+++ b/include/asm-avr32/user.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (C) 2004-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Note: We may not need these definitions for AVR32, as we don't
+ * support a.out.
+ */
+#ifndef __ASM_AVR32_USER_H
+#define __ASM_AVR32_USER_H
+
+#include <linux/types.h>
+#include <asm/ptrace.h>
+#include <asm/page.h>
+
+/*
+ * Core file format: The core file is written in such a way that gdb
+ * can understand it and provide useful information to the user (under
+ * linux we use the `trad-core' bfd).  The file contents are as follows:
+ *
+ *  upage: 1 page consisting of a user struct that tells gdb
+ *	what is present in the file.  Directly after this is a
+ *	copy of the task_struct, which is currently not used by gdb,
+ *	but it may come in handy at some point.  All of the registers
+ *	are stored as part of the upage.  The upage should always be
+ *	only one page long.
+ *  data: The data segment follows next.  We use current->end_text to
+ *	current->brk to pick up all of the user variables, plus any memory
+ *	that may have been sbrk'ed.  No attempt is made to determine if a
+ *	page is demand-zero or if a page is totally unused, we just cover
+ *	the entire range.  All of the addresses are rounded in such a way
+ *	that an integral number of pages is written.
+ *  stack: We need the stack information in order to get a meaningful
+ *	backtrace.  We need to write the data from usp to
+ *	current->start_stack, so we round each of these in order to be able
+ *	to write an integer number of pages.
+ */
+
+struct user_fpu_struct {
+	/* We have no FPU (yet) */
+};
+
+struct user {
+	struct pt_regs	regs;			/* entire machine state */
+	size_t		u_tsize;		/* text size (pages) */
+	size_t		u_dsize;		/* data size (pages) */
+	size_t		u_ssize;		/* stack size (pages) */
+	unsigned long	start_code;		/* text starting address */
+	unsigned long	start_data;		/* data starting address */
+	unsigned long	start_stack;		/* stack starting address */
+	long int	signal;			/* signal causing core dump */
+	struct regs *	u_ar0;			/* help gdb find registers */
+	unsigned long	magic;			/* identifies a core file */
+	char		u_comm[32];		/* user command name */
+};
+
+#define NBPG			PAGE_SIZE
+#define UPAGES			1
+#define HOST_TEXT_START_ADDR	(u.start_code)
+#define HOST_DATA_START_ADDR	(u.start_data)
+#define HOST_STACK_END_ADDR	(u.start_stack + u.u_ssize * NBPG)
+
+#endif /* __ASM_AVR32_USER_H */
diff --git a/include/linux/elf-em.h b/include/linux/elf-em.h
index 6a5796c81c9..666e0a5f00f 100644
--- a/include/linux/elf-em.h
+++ b/include/linux/elf-em.h
@@ -31,6 +31,7 @@
 #define EM_M32R		88	/* Renesas M32R */
 #define EM_H8_300	46	/* Renesas H8/300,300H,H8S */
 #define EM_FRV		0x5441	/* Fujitsu FR-V */
+#define EM_AVR32	0x18ad	/* Atmel AVR32 */
 
 /*
  * This is an interim value that we will use until the committee comes
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 554ee688a9f..3f21cc79a13 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -277,7 +277,7 @@ config DEBUG_HIGHMEM
 config DEBUG_BUGVERBOSE
 	bool "Verbose BUG() reporting (adds 70K)" if DEBUG_KERNEL && EMBEDDED
 	depends on BUG
-	depends on ARM || ARM26 || M32R || M68K || SPARC32 || SPARC64 || X86_32 || FRV
+	depends on ARM || ARM26 || AVR32 || M32R || M68K || SPARC32 || SPARC64 || X86_32 || FRV
 	default !EMBEDDED
 	help
 	  Say Y here to make BUG() panics output the file name and line number
@@ -315,7 +315,7 @@ config DEBUG_VM
 
 config FRAME_POINTER
 	bool "Compile the kernel with frame pointers"
-	depends on DEBUG_KERNEL && (X86 || CRIS || M68K || M68KNOMMU || FRV || UML || S390)
+	depends on DEBUG_KERNEL && (X86 || CRIS || M68K || M68KNOMMU || FRV || UML || S390 || AVR32)
 	default y if DEBUG_INFO && UML
 	help
 	  If you say Y here the resulting kernel image will be slightly larger
-- 
cgit v1.2.3


From bc157b75960f1f33566074e820342690216629b9 Mon Sep 17 00:00:00 2001
From: Haavard Skinnemoen <hskinnemoen@atmel.com>
Date: Mon, 25 Sep 2006 23:32:16 -0700
Subject: [PATCH] AVR32 MTD: Static Memory Controller driver

This patchset adds the necessary drivers and infrastructure to access the
external flash on the ATSTK1000 board through the MTD subsystem.  With this
stuff in place, it will be possible to use a jffs2 filesystem stored in the
external flash as a root filesystem.  It might also be possible to update the
boot loader if you drop the write protection of partition 0.

As suggested by David Woodhouse, I reworked the patches to use the physmap
driver instead of introducing a separate mapping driver for the ATSTK1000.
I've also cleaned up the hsmc header by removing useless comments and
converting spaces to tabs (my headerfile generator needs some work.)

Unfortunately, I couldn't unlock the flash in fixup_use_atmel_lock because the
erase regions hadn't been set up yet, so I had to do it from cfi_amdstd_setup
instead.

This patch:

This adds a simple API for configuring the static memory controller along with
an implementation for the Atmel HSMC.

Signed-off-by: Haavard Skinnemoen <hskinnemoen@atmel.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/avr32/mach-at32ap/Makefile     |   2 +-
 arch/avr32/mach-at32ap/at32ap7000.c |  10 +++
 arch/avr32/mach-at32ap/hsmc.c       | 164 ++++++++++++++++++++++++++++++++++++
 arch/avr32/mach-at32ap/hsmc.h       | 127 ++++++++++++++++++++++++++++
 include/asm-avr32/arch-at32ap/smc.h |  60 +++++++++++++
 5 files changed, 362 insertions(+), 1 deletion(-)
 create mode 100644 arch/avr32/mach-at32ap/hsmc.c
 create mode 100644 arch/avr32/mach-at32ap/hsmc.h
 create mode 100644 include/asm-avr32/arch-at32ap/smc.h

diff --git a/arch/avr32/mach-at32ap/Makefile b/arch/avr32/mach-at32ap/Makefile
index 4b10853eb61..f62eb691551 100644
--- a/arch/avr32/mach-at32ap/Makefile
+++ b/arch/avr32/mach-at32ap/Makefile
@@ -1,2 +1,2 @@
-obj-y				+= at32ap.o clock.o pio.o intc.o extint.o
+obj-y				+= at32ap.o clock.o pio.o intc.o extint.o hsmc.o
 obj-$(CONFIG_CPU_AT32AP7000)	+= at32ap7000.o
diff --git a/arch/avr32/mach-at32ap/at32ap7000.c b/arch/avr32/mach-at32ap/at32ap7000.c
index e8c6893a1c2..37982b60398 100644
--- a/arch/avr32/mach-at32ap/at32ap7000.c
+++ b/arch/avr32/mach-at32ap/at32ap7000.c
@@ -450,6 +450,13 @@ static struct clk hramc_clk = {
 	.users		= 1,
 };
 
+static struct resource smc0_resource[] = {
+	PBMEM(0xfff03400),
+};
+DEFINE_DEV(smc, 0);
+DEV_CLK(pclk, smc0, pbb, 13);
+DEV_CLK(mck, smc0, hsb, 0);
+
 static struct platform_device pdc_device = {
 	.name		= "pdc",
 	.id		= 0,
@@ -503,6 +510,7 @@ void __init at32_add_system_devices(void)
 
 	platform_device_register(&at32_sm_device);
 	platform_device_register(&at32_intc0_device);
+	platform_device_register(&smc0_device);
 	platform_device_register(&pdc_device);
 
 	platform_device_register(&pio0_device);
@@ -796,6 +804,8 @@ struct clk *at32_clock_list[] = {
 	&at32_intc0_pclk,
 	&ebi_clk,
 	&hramc_clk,
+	&smc0_pclk,
+	&smc0_mck,
 	&pdc_hclk,
 	&pdc_pclk,
 	&pico_clk,
diff --git a/arch/avr32/mach-at32ap/hsmc.c b/arch/avr32/mach-at32ap/hsmc.c
new file mode 100644
index 00000000000..7691721928a
--- /dev/null
+++ b/arch/avr32/mach-at32ap/hsmc.c
@@ -0,0 +1,164 @@
+/*
+ * Static Memory Controller for AT32 chips
+ *
+ * Copyright (C) 2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define DEBUG
+#include <linux/clk.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+
+#include <asm/io.h>
+#include <asm/arch/smc.h>
+
+#include "hsmc.h"
+
+#define NR_CHIP_SELECTS 6
+
+struct hsmc {
+	void __iomem *regs;
+	struct clk *pclk;
+	struct clk *mck;
+};
+
+static struct hsmc *hsmc;
+
+int smc_set_configuration(int cs, const struct smc_config *config)
+{
+	unsigned long mul;
+	unsigned long offset;
+	u32 setup, pulse, cycle, mode;
+
+	if (!hsmc)
+		return -ENODEV;
+	if (cs >= NR_CHIP_SELECTS)
+		return -EINVAL;
+
+	/*
+	 * cycles = x / T = x * f
+	 *   = ((x * 1000000000) * ((f * 65536) / 1000000000)) / 65536
+	 *   = ((x * 1000000000) * (((f / 10000) * 65536) / 100000)) / 65536
+	 */
+	mul = (clk_get_rate(hsmc->mck) / 10000) << 16;
+	mul /= 100000;
+
+#define ns2cyc(x) ((((x) * mul) + 65535) >> 16)
+
+	setup = (HSMC_BF(NWE_SETUP, ns2cyc(config->nwe_setup))
+		 | HSMC_BF(NCS_WR_SETUP, ns2cyc(config->ncs_write_setup))
+		 | HSMC_BF(NRD_SETUP, ns2cyc(config->nrd_setup))
+		 | HSMC_BF(NCS_RD_SETUP, ns2cyc(config->ncs_read_setup)));
+	pulse = (HSMC_BF(NWE_PULSE, ns2cyc(config->nwe_pulse))
+		 | HSMC_BF(NCS_WR_PULSE, ns2cyc(config->ncs_write_pulse))
+		 | HSMC_BF(NRD_PULSE, ns2cyc(config->nrd_pulse))
+		 | HSMC_BF(NCS_RD_PULSE, ns2cyc(config->ncs_read_pulse)));
+	cycle = (HSMC_BF(NWE_CYCLE, ns2cyc(config->write_cycle))
+		 | HSMC_BF(NRD_CYCLE, ns2cyc(config->read_cycle)));
+
+	switch (config->bus_width) {
+	case 1:
+		mode = HSMC_BF(DBW, HSMC_DBW_8_BITS);
+		break;
+	case 2:
+		mode = HSMC_BF(DBW, HSMC_DBW_16_BITS);
+		break;
+	case 4:
+		mode = HSMC_BF(DBW, HSMC_DBW_32_BITS);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (config->nrd_controlled)
+		mode |= HSMC_BIT(READ_MODE);
+	if (config->nwe_controlled)
+		mode |= HSMC_BIT(WRITE_MODE);
+	if (config->byte_write)
+		mode |= HSMC_BIT(BAT);
+
+	pr_debug("smc cs%d: setup/%08x pulse/%08x cycle/%08x mode/%08x\n",
+		 cs, setup, pulse, cycle, mode);
+
+	offset = cs * 0x10;
+	hsmc_writel(hsmc, SETUP0 + offset, setup);
+	hsmc_writel(hsmc, PULSE0 + offset, pulse);
+	hsmc_writel(hsmc, CYCLE0 + offset, cycle);
+	hsmc_writel(hsmc, MODE0 + offset, mode);
+	hsmc_readl(hsmc, MODE0); /* I/O barrier */
+
+	return 0;
+}
+EXPORT_SYMBOL(smc_set_configuration);
+
+static int hsmc_probe(struct platform_device *pdev)
+{
+	struct resource *regs;
+	struct clk *pclk, *mck;
+	int ret;
+
+	if (hsmc)
+		return -EBUSY;
+
+	regs = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!regs)
+		return -ENXIO;
+	pclk = clk_get(&pdev->dev, "pclk");
+	if (IS_ERR(pclk))
+		return PTR_ERR(pclk);
+	mck = clk_get(&pdev->dev, "mck");
+	if (IS_ERR(mck)) {
+		ret = PTR_ERR(mck);
+		goto out_put_pclk;
+	}
+
+	ret = -ENOMEM;
+	hsmc = kzalloc(sizeof(struct hsmc), GFP_KERNEL);
+	if (!hsmc)
+		goto out_put_clocks;
+
+	clk_enable(pclk);
+	clk_enable(mck);
+
+	hsmc->pclk = pclk;
+	hsmc->mck = mck;
+	hsmc->regs = ioremap(regs->start, regs->end - regs->start + 1);
+	if (!hsmc->regs)
+		goto out_disable_clocks;
+
+	dev_info(&pdev->dev, "Atmel Static Memory Controller at 0x%08lx\n",
+		 (unsigned long)regs->start);
+
+	platform_set_drvdata(pdev, hsmc);
+
+	return 0;
+
+out_disable_clocks:
+	clk_disable(mck);
+	clk_disable(pclk);
+	kfree(hsmc);
+out_put_clocks:
+	clk_put(mck);
+out_put_pclk:
+	clk_put(pclk);
+	hsmc = NULL;
+	return ret;
+}
+
+static struct platform_driver hsmc_driver = {
+	.probe		= hsmc_probe,
+	.driver		= {
+		.name	= "smc",
+	},
+};
+
+static int __init hsmc_init(void)
+{
+	return platform_driver_register(&hsmc_driver);
+}
+arch_initcall(hsmc_init);
diff --git a/arch/avr32/mach-at32ap/hsmc.h b/arch/avr32/mach-at32ap/hsmc.h
new file mode 100644
index 00000000000..5681276fafd
--- /dev/null
+++ b/arch/avr32/mach-at32ap/hsmc.h
@@ -0,0 +1,127 @@
+/*
+ * Register definitions for Atmel Static Memory Controller (SMC)
+ *
+ * Copyright (C) 2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ASM_AVR32_HSMC_H__
+#define __ASM_AVR32_HSMC_H__
+
+/* HSMC register offsets */
+#define HSMC_SETUP0				0x0000
+#define HSMC_PULSE0				0x0004
+#define HSMC_CYCLE0				0x0008
+#define HSMC_MODE0				0x000c
+#define HSMC_SETUP1				0x0010
+#define HSMC_PULSE1				0x0014
+#define HSMC_CYCLE1				0x0018
+#define HSMC_MODE1				0x001c
+#define HSMC_SETUP2				0x0020
+#define HSMC_PULSE2				0x0024
+#define HSMC_CYCLE2				0x0028
+#define HSMC_MODE2				0x002c
+#define HSMC_SETUP3				0x0030
+#define HSMC_PULSE3				0x0034
+#define HSMC_CYCLE3				0x0038
+#define HSMC_MODE3				0x003c
+#define HSMC_SETUP4				0x0040
+#define HSMC_PULSE4				0x0044
+#define HSMC_CYCLE4				0x0048
+#define HSMC_MODE4				0x004c
+#define HSMC_SETUP5				0x0050
+#define HSMC_PULSE5				0x0054
+#define HSMC_CYCLE5				0x0058
+#define HSMC_MODE5				0x005c
+
+/* Bitfields in SETUP0 */
+#define HSMC_NWE_SETUP_OFFSET			0
+#define HSMC_NWE_SETUP_SIZE			6
+#define HSMC_NCS_WR_SETUP_OFFSET		8
+#define HSMC_NCS_WR_SETUP_SIZE			6
+#define HSMC_NRD_SETUP_OFFSET			16
+#define HSMC_NRD_SETUP_SIZE			6
+#define HSMC_NCS_RD_SETUP_OFFSET		24
+#define HSMC_NCS_RD_SETUP_SIZE			6
+
+/* Bitfields in PULSE0 */
+#define HSMC_NWE_PULSE_OFFSET			0
+#define HSMC_NWE_PULSE_SIZE			7
+#define HSMC_NCS_WR_PULSE_OFFSET		8
+#define HSMC_NCS_WR_PULSE_SIZE			7
+#define HSMC_NRD_PULSE_OFFSET			16
+#define HSMC_NRD_PULSE_SIZE			7
+#define HSMC_NCS_RD_PULSE_OFFSET		24
+#define HSMC_NCS_RD_PULSE_SIZE			7
+
+/* Bitfields in CYCLE0 */
+#define HSMC_NWE_CYCLE_OFFSET			0
+#define HSMC_NWE_CYCLE_SIZE			9
+#define HSMC_NRD_CYCLE_OFFSET			16
+#define HSMC_NRD_CYCLE_SIZE			9
+
+/* Bitfields in MODE0 */
+#define HSMC_READ_MODE_OFFSET			0
+#define HSMC_READ_MODE_SIZE			1
+#define HSMC_WRITE_MODE_OFFSET			1
+#define HSMC_WRITE_MODE_SIZE			1
+#define HSMC_EXNW_MODE_OFFSET			4
+#define HSMC_EXNW_MODE_SIZE			2
+#define HSMC_BAT_OFFSET				8
+#define HSMC_BAT_SIZE				1
+#define HSMC_DBW_OFFSET				12
+#define HSMC_DBW_SIZE				2
+#define HSMC_TDF_CYCLES_OFFSET			16
+#define HSMC_TDF_CYCLES_SIZE			4
+#define HSMC_TDF_MODE_OFFSET			20
+#define HSMC_TDF_MODE_SIZE			1
+#define HSMC_PMEN_OFFSET			24
+#define HSMC_PMEN_SIZE				1
+#define HSMC_PS_OFFSET				28
+#define HSMC_PS_SIZE				2
+
+/* Constants for READ_MODE */
+#define HSMC_READ_MODE_NCS_CONTROLLED		0
+#define HSMC_READ_MODE_NRD_CONTROLLED		1
+
+/* Constants for WRITE_MODE */
+#define HSMC_WRITE_MODE_NCS_CONTROLLED		0
+#define HSMC_WRITE_MODE_NWE_CONTROLLED		1
+
+/* Constants for EXNW_MODE */
+#define HSMC_EXNW_MODE_DISABLED			0
+#define HSMC_EXNW_MODE_RESERVED			1
+#define HSMC_EXNW_MODE_FROZEN			2
+#define HSMC_EXNW_MODE_READY			3
+
+/* Constants for BAT */
+#define HSMC_BAT_BYTE_SELECT			0
+#define HSMC_BAT_BYTE_WRITE			1
+
+/* Constants for DBW */
+#define HSMC_DBW_8_BITS				0
+#define HSMC_DBW_16_BITS			1
+#define HSMC_DBW_32_BITS			2
+
+/* Bit manipulation macros */
+#define HSMC_BIT(name)							\
+	(1 << HSMC_##name##_OFFSET)
+#define HSMC_BF(name,value)						\
+	(((value) & ((1 << HSMC_##name##_SIZE) - 1))			\
+	 << HSMC_##name##_OFFSET)
+#define HSMC_BFEXT(name,value)						\
+	(((value) >> HSMC_##name##_OFFSET)				\
+	 & ((1 << HSMC_##name##_SIZE) - 1))
+#define HSMC_BFINS(name,value,old)					\
+	(((old) & ~(((1 << HSMC_##name##_SIZE) - 1)			\
+		    << HSMC_##name##_OFFSET)) | HSMC_BF(name,value))
+
+/* Register access macros */
+#define hsmc_readl(port,reg)						\
+	readl((port)->regs + HSMC_##reg)
+#define hsmc_writel(port,reg,value)					\
+	writel((value), (port)->regs + HSMC_##reg)
+
+#endif /* __ASM_AVR32_HSMC_H__ */
diff --git a/include/asm-avr32/arch-at32ap/smc.h b/include/asm-avr32/arch-at32ap/smc.h
new file mode 100644
index 00000000000..3732b328303
--- /dev/null
+++ b/include/asm-avr32/arch-at32ap/smc.h
@@ -0,0 +1,60 @@
+/*
+ * Static Memory Controller for AT32 chips
+ *
+ * Copyright (C) 2006 Atmel Corporation
+ *
+ * Inspired by the OMAP2 General-Purpose Memory Controller interface
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __ARCH_AT32AP_SMC_H
+#define __ARCH_AT32AP_SMC_H
+
+/*
+ * All timing parameters are in nanoseconds.
+ */
+struct smc_config {
+	/* Delay from address valid to assertion of given strobe */
+	u16		ncs_read_setup;
+	u16		nrd_setup;
+	u16		ncs_write_setup;
+	u16		nwe_setup;
+
+	/* Pulse length of given strobe */
+	u16		ncs_read_pulse;
+	u16		nrd_pulse;
+	u16		ncs_write_pulse;
+	u16		nwe_pulse;
+
+	/* Total cycle length of given operation */
+	u16		read_cycle;
+	u16		write_cycle;
+
+	/* Bus width in bytes */
+	u8		bus_width;
+
+	/*
+	 * 0: Data is sampled on rising edge of NCS
+	 * 1: Data is sampled on rising edge of NRD
+	 */
+	unsigned int	nrd_controlled:1;
+
+	/*
+	 * 0: Data is driven on falling edge of NCS
+	 * 1: Data is driven on falling edge of NWR
+	 */
+	unsigned int	nwe_controlled:1;
+
+	/*
+	 * 0: Byte select access type
+	 * 1: Byte write access type
+	 */
+	unsigned int	byte_write:1;
+};
+
+extern int smc_set_configuration(int cs, const struct smc_config *config);
+extern struct smc_config *smc_get_configuration(int cs);
+
+#endif /* __ARCH_AT32AP_SMC_H */
-- 
cgit v1.2.3


From 2514183dff2d5282cb745af34f56d1b98e5b2df8 Mon Sep 17 00:00:00 2001
From: Haavard Skinnemoen <hskinnemoen@atmel.com>
Date: Mon, 25 Sep 2006 23:32:17 -0700
Subject: [PATCH] AVR32 MTD: AT49BV6416 platform device for ATSTK1000

FRegister a platform device for the AT49BV6416 NOR flash chip on the ATSTK1000
development board for use by the physmap MTD driver.

The SMC timings are set up before the platform device is registered so that no
board-specific mapping driver is necessary.

Signed-off-by: Haavard Skinnemoen <hskinnemoen@atmel.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/avr32/boards/atstk1000/Makefile |  2 +-
 arch/avr32/boards/atstk1000/flash.c  | 95 ++++++++++++++++++++++++++++++++++++
 2 files changed, 96 insertions(+), 1 deletion(-)
 create mode 100644 arch/avr32/boards/atstk1000/flash.c

diff --git a/arch/avr32/boards/atstk1000/Makefile b/arch/avr32/boards/atstk1000/Makefile
index c3e36ece865..df949948053 100644
--- a/arch/avr32/boards/atstk1000/Makefile
+++ b/arch/avr32/boards/atstk1000/Makefile
@@ -1,2 +1,2 @@
-obj-y				+= setup.o spi.o
+obj-y				+= setup.o spi.o flash.o
 obj-$(CONFIG_BOARD_ATSTK1002)	+= atstk1002.o
diff --git a/arch/avr32/boards/atstk1000/flash.c b/arch/avr32/boards/atstk1000/flash.c
new file mode 100644
index 00000000000..aac4300cca1
--- /dev/null
+++ b/arch/avr32/boards/atstk1000/flash.c
@@ -0,0 +1,95 @@
+/*
+ * ATSTK1000 board-specific flash initialization
+ *
+ * Copyright (C) 2005-2006 Atmel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/init.h>
+#include <linux/platform_device.h>
+#include <linux/mtd/mtd.h>
+#include <linux/mtd/partitions.h>
+#include <linux/mtd/physmap.h>
+
+#include <asm/arch/smc.h>
+
+static struct smc_config flash_config __initdata = {
+	.ncs_read_setup		= 0,
+	.nrd_setup		= 40,
+	.ncs_write_setup	= 0,
+	.nwe_setup		= 10,
+
+	.ncs_read_pulse		= 80,
+	.nrd_pulse		= 40,
+	.ncs_write_pulse	= 65,
+	.nwe_pulse		= 55,
+
+	.read_cycle		= 120,
+	.write_cycle		= 120,
+
+	.bus_width		= 2,
+	.nrd_controlled		= 1,
+	.nwe_controlled		= 1,
+	.byte_write		= 1,
+};
+
+static struct mtd_partition flash_parts[] = {
+	{
+		.name           = "u-boot",
+		.offset         = 0x00000000,
+		.size           = 0x00020000,           /* 128 KiB */
+		.mask_flags     = MTD_WRITEABLE,
+	},
+	{
+		.name           = "root",
+		.offset         = 0x00020000,
+		.size           = 0x007d0000,
+	},
+	{
+		.name           = "env",
+		.offset         = 0x007f0000,
+		.size           = 0x00010000,
+		.mask_flags     = MTD_WRITEABLE,
+	},
+};
+
+static struct physmap_flash_data flash_data = {
+	.width		= 2,
+	.nr_parts	= ARRAY_SIZE(flash_parts),
+	.parts		= flash_parts,
+};
+
+static struct resource flash_resource = {
+	.start		= 0x00000000,
+	.end		= 0x007fffff,
+	.flags		= IORESOURCE_MEM,
+};
+
+static struct platform_device flash_device = {
+	.name		= "physmap-flash",
+	.id		= 0,
+	.resource	= &flash_resource,
+	.num_resources	= 1,
+	.dev		= {
+		.platform_data = &flash_data,
+	},
+};
+
+/* This needs to be called after the SMC has been initialized */
+static int __init atstk1000_flash_init(void)
+{
+	int ret;
+
+	ret = smc_set_configuration(0, &flash_config);
+	if (ret < 0) {
+		printk(KERN_ERR "atstk1000: failed to set NOR flash timing\n");
+		return ret;
+	}
+
+	platform_device_register(&flash_device);
+
+	return 0;
+}
+device_initcall(atstk1000_flash_init);
-- 
cgit v1.2.3


From 1447c27d38faf8fb03d4599e8082e507453ea3cf Mon Sep 17 00:00:00 2001
From: Clemens Ladisch <clemens@ladisch.de>
Date: Mon, 25 Sep 2006 23:32:17 -0700
Subject: [PATCH] hpet rtc emulation: add watchdog timer

To prevent the emulated RTC timer from stopping when interrupts are delayed
for too long, disable interrupts around all of the register initialization,
and check that the interrupt handler did not schedule the next interrupt in
the past.

Signed-off-by: Clemens Ladisch <clemens@ladisch.de>
Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Cc: Andi Kleen <ak@muc.de>
Cc: Vojtech Pavlik <vojtech@suse.cz>
Cc: Robert Picco <Robert.Picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/time_hpet.c | 37 +++++++++++++++++++++++++++++++------
 arch/x86_64/kernel/time.c    | 37 +++++++++++++++++++++++++++++++------
 2 files changed, 62 insertions(+), 12 deletions(-)

diff --git a/arch/i386/kernel/time_hpet.c b/arch/i386/kernel/time_hpet.c
index 14a1376fedd..6bf14a4e995 100644
--- a/arch/i386/kernel/time_hpet.c
+++ b/arch/i386/kernel/time_hpet.c
@@ -301,23 +301,25 @@ int hpet_rtc_timer_init(void)
 		hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
 
 	local_irq_save(flags);
+
 	cnt = hpet_readl(HPET_COUNTER);
 	cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq);
 	hpet_writel(cnt, HPET_T1_CMP);
 	hpet_t1_cmp = cnt;
-	local_irq_restore(flags);
 
 	cfg = hpet_readl(HPET_T1_CFG);
 	cfg &= ~HPET_TN_PERIODIC;
 	cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
 	hpet_writel(cfg, HPET_T1_CFG);
 
+	local_irq_restore(flags);
+
 	return 1;
 }
 
 static void hpet_rtc_timer_reinit(void)
 {
-	unsigned int cfg, cnt;
+	unsigned int cfg, cnt, ticks_per_int, lost_ints;
 
 	if (unlikely(!(PIE_on | AIE_on | UIE_on))) {
 		cfg = hpet_readl(HPET_T1_CFG);
@@ -332,10 +334,33 @@ static void hpet_rtc_timer_reinit(void)
 		hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
 
 	/* It is more accurate to use the comparator value than current count.*/
-	cnt = hpet_t1_cmp;
-	cnt += hpet_tick*HZ/hpet_rtc_int_freq;
-	hpet_writel(cnt, HPET_T1_CMP);
-	hpet_t1_cmp = cnt;
+	ticks_per_int = hpet_tick * HZ / hpet_rtc_int_freq;
+	hpet_t1_cmp += ticks_per_int;
+	hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
+
+	/*
+	 * If the interrupt handler was delayed too long, the write above tries
+	 * to schedule the next interrupt in the past and the hardware would
+	 * not interrupt until the counter had wrapped around.
+	 * So we have to check that the comparator wasn't set to a past time.
+	 */
+	cnt = hpet_readl(HPET_COUNTER);
+	if (unlikely((int)(cnt - hpet_t1_cmp) > 0)) {
+		lost_ints = (cnt - hpet_t1_cmp) / ticks_per_int + 1;
+		/* Make sure that, even with the time needed to execute
+		 * this code, the next scheduled interrupt has been moved
+		 * back to the future: */
+		lost_ints++;
+
+		hpet_t1_cmp += lost_ints * ticks_per_int;
+		hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
+
+		if (PIE_on)
+			PIE_count += lost_ints;
+
+		printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n",
+		       hpet_rtc_int_freq);
+	}
 }
 
 /*
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 7a9b1822418..7700e6cd2bd 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -1148,23 +1148,25 @@ int hpet_rtc_timer_init(void)
 		hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
 
 	local_irq_save(flags);
+
 	cnt = hpet_readl(HPET_COUNTER);
 	cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq);
 	hpet_writel(cnt, HPET_T1_CMP);
 	hpet_t1_cmp = cnt;
-	local_irq_restore(flags);
 
 	cfg = hpet_readl(HPET_T1_CFG);
 	cfg &= ~HPET_TN_PERIODIC;
 	cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
 	hpet_writel(cfg, HPET_T1_CFG);
 
+	local_irq_restore(flags);
+
 	return 1;
 }
 
 static void hpet_rtc_timer_reinit(void)
 {
-	unsigned int cfg, cnt;
+	unsigned int cfg, cnt, ticks_per_int, lost_ints;
 
 	if (unlikely(!(PIE_on | AIE_on | UIE_on))) {
 		cfg = hpet_readl(HPET_T1_CFG);
@@ -1179,10 +1181,33 @@ static void hpet_rtc_timer_reinit(void)
 		hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
 
 	/* It is more accurate to use the comparator value than current count.*/
-	cnt = hpet_t1_cmp;
-	cnt += hpet_tick*HZ/hpet_rtc_int_freq;
-	hpet_writel(cnt, HPET_T1_CMP);
-	hpet_t1_cmp = cnt;
+	ticks_per_int = hpet_tick * HZ / hpet_rtc_int_freq;
+	hpet_t1_cmp += ticks_per_int;
+	hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
+
+	/*
+	 * If the interrupt handler was delayed too long, the write above tries
+	 * to schedule the next interrupt in the past and the hardware would
+	 * not interrupt until the counter had wrapped around.
+	 * So we have to check that the comparator wasn't set to a past time.
+	 */
+	cnt = hpet_readl(HPET_COUNTER);
+	if (unlikely((int)(cnt - hpet_t1_cmp) > 0)) {
+		lost_ints = (cnt - hpet_t1_cmp) / ticks_per_int + 1;
+		/* Make sure that, even with the time needed to execute
+		 * this code, the next scheduled interrupt has been moved
+		 * back to the future: */
+		lost_ints++;
+
+		hpet_t1_cmp += lost_ints * ticks_per_int;
+		hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
+
+		if (PIE_on)
+			PIE_count += lost_ints;
+
+		printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n",
+		       hpet_rtc_int_freq);
+	}
 }
 
 /*
-- 
cgit v1.2.3


From 99325326a57b6a56595bb097655bee9fd27d77b0 Mon Sep 17 00:00:00 2001
From: Chuck Ebbert <76306.1226@compuserve.com>
Date: Mon, 25 Sep 2006 23:32:19 -0700
Subject: [PATCH] i386: show_registers(): try harder to print failing code

show_registers() tries to dump failing code starting 43 bytes before the
offending instruction, but this address can be bad, for example in a device
driver where the failing instruction is less than 43 bytes from the start
of the driver's code.  When that happens, try to dump code starting at the
failing instruction instead of printing no code at all.

Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com>
Cc: Andi Kleen <ak@muc.de>
Cc: Keith Owens <kaos@ocs.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/traps.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 7e9edafffd8..4fcc6690be9 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -313,6 +313,8 @@ void show_registers(struct pt_regs *regs)
 	 */
 	if (in_kernel) {
 		u8 __user *eip;
+		int code_bytes = 64;
+		unsigned char c;
 
 		printk("\n" KERN_EMERG "Stack: ");
 		show_stack_log_lvl(NULL, regs, (unsigned long *)esp, KERN_EMERG);
@@ -320,9 +322,12 @@ void show_registers(struct pt_regs *regs)
 		printk(KERN_EMERG "Code: ");
 
 		eip = (u8 __user *)regs->eip - 43;
-		for (i = 0; i < 64; i++, eip++) {
-			unsigned char c;
-
+		if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) {
+			/* try starting at EIP */
+			eip = (u8 __user *)regs->eip;
+			code_bytes = 32;
+		}
+		for (i = 0; i < code_bytes; i++, eip++) {
 			if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) {
 				printk(" Bad EIP value.");
 				break;
-- 
cgit v1.2.3


From 3a750363e6075a28e5542ce93a69c620c0cfd605 Mon Sep 17 00:00:00 2001
From: Rolf Eike Beer <eike-kernel@sf-tec.de>
Date: Mon, 25 Sep 2006 23:32:20 -0700
Subject: [PATCH] Use BUG_ON(foo) instead of "if (foo) BUG()" in
 include/asm-i386/dma-mapping.h

Signed-off-by: Rolf Eike Beer <eike-kernel@sf-tec.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/dma-mapping.h | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/include/asm-i386/dma-mapping.h b/include/asm-i386/dma-mapping.h
index 9cf20cacf76..576ae01d71c 100644
--- a/include/asm-i386/dma-mapping.h
+++ b/include/asm-i386/dma-mapping.h
@@ -21,8 +21,7 @@ static inline dma_addr_t
 dma_map_single(struct device *dev, void *ptr, size_t size,
 	       enum dma_data_direction direction)
 {
-	if (direction == DMA_NONE)
-		BUG();
+	BUG_ON(direction == DMA_NONE);
 	WARN_ON(size == 0);
 	flush_write_buffers();
 	return virt_to_phys(ptr);
@@ -32,8 +31,7 @@ static inline void
 dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
 		 enum dma_data_direction direction)
 {
-	if (direction == DMA_NONE)
-		BUG();
+	BUG_ON(direction == DMA_NONE);
 }
 
 static inline int
@@ -42,8 +40,7 @@ dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
 {
 	int i;
 
-	if (direction == DMA_NONE)
-		BUG();
+	BUG_ON(direction == DMA_NONE);
 	WARN_ON(nents == 0 || sg[0].length == 0);
 
 	for (i = 0; i < nents; i++ ) {
-- 
cgit v1.2.3


From c94a62aae6ebc99b416e55c023b6f5a1d19a400b Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Mon, 25 Sep 2006 23:32:21 -0700
Subject: [PATCH] apm: clean up module initalization

Clean up module initalization for apm.c.  I had started by auditing for
proper return code checks in misc_register, but I found that in the event
of an initalization failure, a proc file and a kernel thread were left
hanging out.  this patch properly cleans up those loose ends on any
initalization failure.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Acked-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/apm.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c
index 8591f2fa920..24fd577861f 100644
--- a/arch/i386/kernel/apm.c
+++ b/arch/i386/kernel/apm.c
@@ -2339,6 +2339,7 @@ static int __init apm_init(void)
 	ret = kernel_thread(apm, NULL, CLONE_KERNEL | SIGCHLD);
 	if (ret < 0) {
 		printk(KERN_ERR "apm: disabled - Unable to start kernel thread.\n");
+		remove_proc_entry("apm", NULL);
 		return -ENOMEM;
 	}
 
@@ -2348,7 +2349,13 @@ static int __init apm_init(void)
 		return 0;
 	}
 
-	misc_register(&apm_device);
+	/*
+	 * Note we don't actually care if the misc_device cannot be registered.
+	 * this driver can do its job without it, even if userspace can't
+	 * control it.  just log the error
+	 */
+	if (misc_register(&apm_device))
+		printk(KERN_WARNING "apm: Could not register misc device.\n");
 
 	if (HZ != 100)
 		idle_period = (idle_period * HZ) / 100;
-- 
cgit v1.2.3


From 05f4a3ec94281347e05c81eafefcfe5ea545c94c Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 25 Sep 2006 23:32:22 -0700
Subject: [PATCH] x86: remove locally-defined ldt structure in favour of
 standard type

arch/i386/kernel/reboot.c defines its own struct to describe an ldt entry: it
should use struct Xgt_desc_struct (currently load_ldt is a macro, so doesn't
complain: paravirt patches make it warn).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/reboot.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c
index 54cfeabbc5e..84278e0093a 100644
--- a/arch/i386/kernel/reboot.c
+++ b/arch/i386/kernel/reboot.c
@@ -145,14 +145,10 @@ real_mode_gdt_entries [3] =
 	0x000092000100ffffULL	/* 16-bit real-mode 64k data at 0x00000100 */
 };
 
-static struct
-{
-	unsigned short       size __attribute__ ((packed));
-	unsigned long long * base __attribute__ ((packed));
-}
-real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, real_mode_gdt_entries },
-real_mode_idt = { 0x3ff, NULL },
-no_idt = { 0, NULL };
+static struct Xgt_desc_struct
+real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries },
+real_mode_idt = { 0x3ff, 0 },
+no_idt = { 0, 0 };
 
 
 /* This is 16-bit protected mode code to disable paging and the cache,
-- 
cgit v1.2.3


From 027a8c7e6067a1bcdef6775d1b1c08729dfbae51 Mon Sep 17 00:00:00 2001
From: Chris Wright <chrisw@sous-sol.org>
Date: Mon, 25 Sep 2006 23:32:23 -0700
Subject: [PATCH] x86: implement always-locked bit ops, for memory shared with
 an SMP hypervisor

Add "always lock'd" implementations of set_bit, clear_bit and change_bit and
the corresponding test_and_ functions.  Also add "always lock'd"
implementation of cmpxchg.  These give guaranteed strong synchronisation and
are required for non-SMP kernels running on an SMP hypervisor.

Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/sync_bitops.h | 156 +++++++++++++++++++++++++++++++++++++++++
 include/asm-i386/system.h      |  36 ++++++++++
 2 files changed, 192 insertions(+)
 create mode 100644 include/asm-i386/sync_bitops.h

diff --git a/include/asm-i386/sync_bitops.h b/include/asm-i386/sync_bitops.h
new file mode 100644
index 00000000000..c94d51c993e
--- /dev/null
+++ b/include/asm-i386/sync_bitops.h
@@ -0,0 +1,156 @@
+#ifndef _I386_SYNC_BITOPS_H
+#define _I386_SYNC_BITOPS_H
+
+/*
+ * Copyright 1992, Linus Torvalds.
+ */
+
+/*
+ * These have to be done with inline assembly: that way the bit-setting
+ * is guaranteed to be atomic. All bit operations return 0 if the bit
+ * was cleared before the operation and != 0 if it was not.
+ *
+ * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
+ */
+
+#define ADDR (*(volatile long *) addr)
+
+/**
+ * sync_set_bit - Atomically set a bit in memory
+ * @nr: the bit to set
+ * @addr: the address to start counting from
+ *
+ * This function is atomic and may not be reordered.  See __set_bit()
+ * if you do not require the atomic guarantees.
+ *
+ * Note: there are no guarantees that this function will not be reordered
+ * on non x86 architectures, so if you are writting portable code,
+ * make sure not to rely on its reordering guarantees.
+ *
+ * Note that @nr may be almost arbitrarily large; this function is not
+ * restricted to acting on a single-word quantity.
+ */
+static inline void sync_set_bit(int nr, volatile unsigned long * addr)
+{
+	__asm__ __volatile__("lock; btsl %1,%0"
+			     :"+m" (ADDR)
+			     :"Ir" (nr)
+			     : "memory");
+}
+
+/**
+ * sync_clear_bit - Clears a bit in memory
+ * @nr: Bit to clear
+ * @addr: Address to start counting from
+ *
+ * sync_clear_bit() is atomic and may not be reordered.  However, it does
+ * not contain a memory barrier, so if it is used for locking purposes,
+ * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
+ * in order to ensure changes are visible on other processors.
+ */
+static inline void sync_clear_bit(int nr, volatile unsigned long * addr)
+{
+	__asm__ __volatile__("lock; btrl %1,%0"
+			     :"+m" (ADDR)
+			     :"Ir" (nr)
+			     : "memory");
+}
+
+/**
+ * sync_change_bit - Toggle a bit in memory
+ * @nr: Bit to change
+ * @addr: Address to start counting from
+ *
+ * change_bit() is atomic and may not be reordered. It may be
+ * reordered on other architectures than x86.
+ * Note that @nr may be almost arbitrarily large; this function is not
+ * restricted to acting on a single-word quantity.
+ */
+static inline void sync_change_bit(int nr, volatile unsigned long * addr)
+{
+	__asm__ __volatile__("lock; btcl %1,%0"
+			     :"+m" (ADDR)
+			     :"Ir" (nr)
+			     : "memory");
+}
+
+/**
+ * sync_test_and_set_bit - Set a bit and return its old value
+ * @nr: Bit to set
+ * @addr: Address to count from
+ *
+ * This operation is atomic and cannot be reordered.
+ * It may be reordered on other architectures than x86.
+ * It also implies a memory barrier.
+ */
+static inline int sync_test_and_set_bit(int nr, volatile unsigned long * addr)
+{
+	int oldbit;
+
+	__asm__ __volatile__("lock; btsl %2,%1\n\tsbbl %0,%0"
+			     :"=r" (oldbit),"+m" (ADDR)
+			     :"Ir" (nr) : "memory");
+	return oldbit;
+}
+
+/**
+ * sync_test_and_clear_bit - Clear a bit and return its old value
+ * @nr: Bit to clear
+ * @addr: Address to count from
+ *
+ * This operation is atomic and cannot be reordered.
+ * It can be reorderdered on other architectures other than x86.
+ * It also implies a memory barrier.
+ */
+static inline int sync_test_and_clear_bit(int nr, volatile unsigned long * addr)
+{
+	int oldbit;
+
+	__asm__ __volatile__("lock; btrl %2,%1\n\tsbbl %0,%0"
+			     :"=r" (oldbit),"+m" (ADDR)
+			     :"Ir" (nr) : "memory");
+	return oldbit;
+}
+
+/**
+ * sync_test_and_change_bit - Change a bit and return its old value
+ * @nr: Bit to change
+ * @addr: Address to count from
+ *
+ * This operation is atomic and cannot be reordered.
+ * It also implies a memory barrier.
+ */
+static inline int sync_test_and_change_bit(int nr, volatile unsigned long* addr)
+{
+	int oldbit;
+
+	__asm__ __volatile__("lock; btcl %2,%1\n\tsbbl %0,%0"
+			     :"=r" (oldbit),"+m" (ADDR)
+			     :"Ir" (nr) : "memory");
+	return oldbit;
+}
+
+static __always_inline int sync_const_test_bit(int nr, const volatile unsigned long *addr)
+{
+	return ((1UL << (nr & 31)) &
+		(((const volatile unsigned int *)addr)[nr >> 5])) != 0;
+}
+
+static inline int sync_var_test_bit(int nr, const volatile unsigned long * addr)
+{
+	int oldbit;
+
+	__asm__ __volatile__("btl %2,%1\n\tsbbl %0,%0"
+			     :"=r" (oldbit)
+			     :"m" (ADDR),"Ir" (nr));
+	return oldbit;
+}
+
+#define sync_test_bit(nr,addr)			\
+	(__builtin_constant_p(nr) ?		\
+	 sync_constant_test_bit((nr),(addr)) :	\
+	 sync_var_test_bit((nr),(addr)))
+
+#undef ADDR
+
+#endif /* _I386_SYNC_BITOPS_H */
diff --git a/include/asm-i386/system.h b/include/asm-i386/system.h
index 098bcee94e3..a6dabbcd6e6 100644
--- a/include/asm-i386/system.h
+++ b/include/asm-i386/system.h
@@ -267,6 +267,9 @@ static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int siz
 #define cmpxchg(ptr,o,n)\
 	((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
 					(unsigned long)(n),sizeof(*(ptr))))
+#define sync_cmpxchg(ptr,o,n)\
+	((__typeof__(*(ptr)))__sync_cmpxchg((ptr),(unsigned long)(o),\
+					(unsigned long)(n),sizeof(*(ptr))))
 #endif
 
 static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
@@ -296,6 +299,39 @@ static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
 	return old;
 }
 
+/*
+ * Always use locked operations when touching memory shared with a
+ * hypervisor, since the system may be SMP even if the guest kernel
+ * isn't.
+ */
+static inline unsigned long __sync_cmpxchg(volatile void *ptr,
+					    unsigned long old,
+					    unsigned long new, int size)
+{
+	unsigned long prev;
+	switch (size) {
+	case 1:
+		__asm__ __volatile__("lock; cmpxchgb %b1,%2"
+				     : "=a"(prev)
+				     : "q"(new), "m"(*__xg(ptr)), "0"(old)
+				     : "memory");
+		return prev;
+	case 2:
+		__asm__ __volatile__("lock; cmpxchgw %w1,%2"
+				     : "=a"(prev)
+				     : "r"(new), "m"(*__xg(ptr)), "0"(old)
+				     : "memory");
+		return prev;
+	case 4:
+		__asm__ __volatile__("lock; cmpxchgl %1,%2"
+				     : "=a"(prev)
+				     : "r"(new), "m"(*__xg(ptr)), "0"(old)
+				     : "memory");
+		return prev;
+	}
+	return old;
+}
+
 #ifndef CONFIG_X86_CMPXCHG
 /*
  * Building a kernel capable running on 80386. It may be necessary to
-- 
cgit v1.2.3


From 9f093394d75cd9c5df82c7a99c5eb5d7ce7ba199 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 25 Sep 2006 23:32:24 -0700
Subject: [PATCH] x86: roll all the cpuid asm into one __cpuid call

It's a little neater, and also means only one place to patch for
paravirtualization.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/processor.h | 60 ++++++++++++++++++++------------------------
 1 file changed, 27 insertions(+), 33 deletions(-)

diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h
index b32346d62e1..2277127696d 100644
--- a/include/asm-i386/processor.h
+++ b/include/asm-i386/processor.h
@@ -143,6 +143,18 @@ static inline void detect_ht(struct cpuinfo_x86 *c) {}
 #define X86_EFLAGS_VIP	0x00100000 /* Virtual Interrupt Pending */
 #define X86_EFLAGS_ID	0x00200000 /* CPUID detection flag */
 
+static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
+			   unsigned int *ecx, unsigned int *edx)
+{
+	/* ecx is often an input as well as an output. */
+	__asm__("cpuid"
+		: "=a" (*eax),
+		  "=b" (*ebx),
+		  "=c" (*ecx),
+		  "=d" (*edx)
+		: "0" (*eax), "2" (*ecx));
+}
+
 /*
  * Generic CPUID function
  * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
@@ -150,24 +162,18 @@ static inline void detect_ht(struct cpuinfo_x86 *c) {}
  */
 static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx)
 {
-	__asm__("cpuid"
-		: "=a" (*eax),
-		  "=b" (*ebx),
-		  "=c" (*ecx),
-		  "=d" (*edx)
-		: "0" (op), "c"(0));
+	*eax = op;
+	*ecx = 0;
+	__cpuid(eax, ebx, ecx, edx);
 }
 
 /* Some CPUID calls want 'count' to be placed in ecx */
 static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx,
-	       	int *edx)
+			       int *edx)
 {
-	__asm__("cpuid"
-		: "=a" (*eax),
-		  "=b" (*ebx),
-		  "=c" (*ecx),
-		  "=d" (*edx)
-		: "0" (op), "c" (count));
+	*eax = op;
+	*ecx = count;
+	__cpuid(eax, ebx, ecx, edx);
 }
 
 /*
@@ -175,42 +181,30 @@ static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx,
  */
 static inline unsigned int cpuid_eax(unsigned int op)
 {
-	unsigned int eax;
+	unsigned int eax, ebx, ecx, edx;
 
-	__asm__("cpuid"
-		: "=a" (eax)
-		: "0" (op)
-		: "bx", "cx", "dx");
+	cpuid(op, &eax, &ebx, &ecx, &edx);
 	return eax;
 }
 static inline unsigned int cpuid_ebx(unsigned int op)
 {
-	unsigned int eax, ebx;
+	unsigned int eax, ebx, ecx, edx;
 
-	__asm__("cpuid"
-		: "=a" (eax), "=b" (ebx)
-		: "0" (op)
-		: "cx", "dx" );
+	cpuid(op, &eax, &ebx, &ecx, &edx);
 	return ebx;
 }
 static inline unsigned int cpuid_ecx(unsigned int op)
 {
-	unsigned int eax, ecx;
+	unsigned int eax, ebx, ecx, edx;
 
-	__asm__("cpuid"
-		: "=a" (eax), "=c" (ecx)
-		: "0" (op)
-		: "bx", "dx" );
+	cpuid(op, &eax, &ebx, &ecx, &edx);
 	return ecx;
 }
 static inline unsigned int cpuid_edx(unsigned int op)
 {
-	unsigned int eax, edx;
+	unsigned int eax, ebx, ecx, edx;
 
-	__asm__("cpuid"
-		: "=a" (eax), "=d" (edx)
-		: "0" (op)
-		: "bx", "cx");
+	cpuid(op, &eax, &ebx, &ecx, &edx);
 	return edx;
 }
 
-- 
cgit v1.2.3


From 052e79941a042e5be4feffa03b1fd60d93fb9e9a Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Mon, 25 Sep 2006 23:32:25 -0700
Subject: [PATCH] x86: make __FIXADDR_TOP variable to allow it to make space
 for a hypervisor

Make __FIXADDR_TOP a variable, so that it can be set to not get in the way of
address space a hypervisor may want to reserve.

Original patch by Gerd Hoffmann <kraxel@suse.de>

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Cc: Gerd Hoffmann <kraxel@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/Kconfig         |  1 +
 arch/i386/mm/init.c       | 42 ++++++++++++++++++++++++++++++++++++++++++
 arch/i386/mm/pgtable.c    | 26 ++++++++++++++++++++++++++
 include/asm-i386/fixmap.h |  7 ++++++-
 4 files changed, 75 insertions(+), 1 deletion(-)

diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index b2751eadbc5..1f83efb7a60 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -794,6 +794,7 @@ config HOTPLUG_CPU
 config COMPAT_VDSO
 	bool "Compat VDSO support"
 	default y
+	depends on !PARAVIRT
 	help
 	  Map the VDSO to the predictable old-style address too.
 	---help---
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index 353453d550a..efd0bcdac65 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -629,6 +629,48 @@ void __init mem_init(void)
 		(unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
 	       );
 
+#if 1 /* double-sanity-check paranoia */
+	printk("virtual kernel memory layout:\n"
+	       "    fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n"
+#ifdef CONFIG_HIGHMEM
+	       "    pkmap   : 0x%08lx - 0x%08lx   (%4ld kB)\n"
+#endif
+	       "    vmalloc : 0x%08lx - 0x%08lx   (%4ld MB)\n"
+	       "    lowmem  : 0x%08lx - 0x%08lx   (%4ld MB)\n"
+	       "      .init : 0x%08lx - 0x%08lx   (%4ld kB)\n"
+	       "      .data : 0x%08lx - 0x%08lx   (%4ld kB)\n"
+	       "      .text : 0x%08lx - 0x%08lx   (%4ld kB)\n",
+	       FIXADDR_START, FIXADDR_TOP,
+	       (FIXADDR_TOP - FIXADDR_START) >> 10,
+
+#ifdef CONFIG_HIGHMEM
+	       PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
+	       (LAST_PKMAP*PAGE_SIZE) >> 10,
+#endif
+
+	       VMALLOC_START, VMALLOC_END,
+	       (VMALLOC_END - VMALLOC_START) >> 20,
+
+	       (unsigned long)__va(0), (unsigned long)high_memory,
+	       ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
+
+	       (unsigned long)&__init_begin, (unsigned long)&__init_end,
+	       ((unsigned long)&__init_end - (unsigned long)&__init_begin) >> 10,
+
+	       (unsigned long)&_etext, (unsigned long)&_edata,
+	       ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
+
+	       (unsigned long)&_text, (unsigned long)&_etext,
+	       ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
+
+#ifdef CONFIG_HIGHMEM
+	BUG_ON(PKMAP_BASE+LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
+	BUG_ON(VMALLOC_END                     > PKMAP_BASE);
+#endif
+	BUG_ON(VMALLOC_START                   > VMALLOC_END);
+	BUG_ON((unsigned long)high_memory      > VMALLOC_START);
+#endif /* double-sanity-check paranoia */
+
 #ifdef CONFIG_X86_PAE
 	if (!cpu_has_pae)
 		panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c
index a9f4910a22f..10126e3f817 100644
--- a/arch/i386/mm/pgtable.c
+++ b/arch/i386/mm/pgtable.c
@@ -12,6 +12,7 @@
 #include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/spinlock.h>
+#include <linux/module.h>
 
 #include <asm/system.h>
 #include <asm/pgtable.h>
@@ -139,6 +140,12 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
 	__flush_tlb_one(vaddr);
 }
 
+static int fixmaps;
+#ifndef CONFIG_COMPAT_VDSO
+unsigned long __FIXADDR_TOP = 0xfffff000;
+EXPORT_SYMBOL(__FIXADDR_TOP);
+#endif
+
 void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
 {
 	unsigned long address = __fix_to_virt(idx);
@@ -148,6 +155,25 @@ void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
 		return;
 	}
 	set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
+	fixmaps++;
+}
+
+/**
+ * reserve_top_address - reserves a hole in the top of kernel address space
+ * @reserve - size of hole to reserve
+ *
+ * Can be used to relocate the fixmap area and poke a hole in the top
+ * of kernel address space to make room for a hypervisor.
+ */
+void reserve_top_address(unsigned long reserve)
+{
+	BUG_ON(fixmaps > 0);
+#ifdef CONFIG_COMPAT_VDSO
+	BUG_ON(reserve != 0);
+#else
+	__FIXADDR_TOP = -reserve - PAGE_SIZE;
+	__VMALLOC_RESERVE += reserve;
+#endif
 }
 
 pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
diff --git a/include/asm-i386/fixmap.h b/include/asm-i386/fixmap.h
index a48cc3f7ccc..02428cb3662 100644
--- a/include/asm-i386/fixmap.h
+++ b/include/asm-i386/fixmap.h
@@ -19,7 +19,11 @@
  * Leave one empty page between vmalloc'ed areas and
  * the start of the fixmap.
  */
-#define __FIXADDR_TOP	0xfffff000
+#ifndef CONFIG_COMPAT_VDSO
+extern unsigned long __FIXADDR_TOP;
+#else
+#define __FIXADDR_TOP  0xfffff000
+#endif
 
 #ifndef __ASSEMBLY__
 #include <linux/kernel.h>
@@ -93,6 +97,7 @@ enum fixed_addresses {
 
 extern void __set_fixmap (enum fixed_addresses idx,
 					unsigned long phys, pgprot_t flags);
+extern void reserve_top_address(unsigned long reserve);
 
 #define set_fixmap(idx, phys) \
 		__set_fixmap(idx, phys, PAGE_KERNEL)
-- 
cgit v1.2.3


From 461a9afff5e731d6337c0f5b08a1e727ccd57e0a Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zach@vmware.com>
Date: Mon, 25 Sep 2006 23:32:25 -0700
Subject: [PATCH] x86: add a bootparameter to reserve high linear address space

Add a boot parameter to reserve high linear address space for hypervisors.
This is necessary to allow dynamically loaded hypervisor modules, which might
not happen until userspace is already running, and also provides a useful tool
to benchmark the performance impact of reduced lowmem address space.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/kernel-parameters.txt |  5 +++++
 arch/i386/kernel/setup.c            | 18 ++++++++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 71d05f48172..766abdab94e 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1363,6 +1363,11 @@ running once the system is up.
 
 	reserve=	[KNL,BUGS] Force the kernel to ignore some iomem area
 
+	reservetop=	[IA-32]
+			Format: nn[KMG]
+			Reserves a hole at the top of the kernel virtual
+			address space.
+
 	resume=		[SWSUSP]
 			Specify the partition device for software suspend
 
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index 060c68004be..16d99444cf6 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -934,6 +934,24 @@ static void __init parse_cmdline_early (char ** cmdline_p)
 	}
 }
 
+/*
+ * reservetop=size reserves a hole at the top of the kernel address space which
+ * a hypervisor can load into later.  Needed for dynamically loaded hypervisors,
+ * so relocating the fixmap can be done before paging initialization.
+ */
+static int __init parse_reservetop(char *arg)
+{
+	unsigned long address;
+
+	if (!arg)
+		return -EINVAL;
+
+	address = memparse(arg, &arg);
+	reserve_top_address(address);
+	return 0;
+}
+early_param("reservetop", parse_reservetop);
+
 /*
  * Callback for efi_memory_walk.
  */
-- 
cgit v1.2.3


From 9c9b8b388296ad5a306ab238dc677cfe6ff4cb12 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Mon, 25 Sep 2006 23:32:26 -0700
Subject: [PATCH] x86: put .note.* sections into a PT_NOTE segment in vmlinux

This patch will pack any .note.* section into a PT_NOTE segment in the output
file.

To do this, we tell ld that we need a PT_NOTE segment.  This requires us to
start explicitly mapping sections to segments, so we also need to explicitly
create PT_LOAD segments for text and data, and map the sections to them
appropriately.  Fortunately, each section will default to its previous
section's segment, so it doesn't take many changes to vmlinux.lds.S.

This only changes i386 for now, but I presume the corresponding changes for
other architectures will be as simple.

This change also adds <linux/elfnote.h>, which defines C and Assembler macros
for actually creating ELF notes.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Hollis Blanchard <hollisb@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/vmlinux.lds.S    | 12 +++++-
 include/asm-generic/vmlinux.lds.h |  3 ++
 include/linux/elfnote.h           | 88 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 101 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/elfnote.h

diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index 2d4f1386e2b..1e7ac1c44dd 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -13,6 +13,12 @@ OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
 OUTPUT_ARCH(i386)
 ENTRY(phys_startup_32)
 jiffies = jiffies_64;
+
+PHDRS {
+	text PT_LOAD FLAGS(5);	/* R_E */
+	data PT_LOAD FLAGS(7);	/* RWE */
+	note PT_NOTE FLAGS(4);	/* R__ */
+}
 SECTIONS
 {
   . = __KERNEL_START;
@@ -26,7 +32,7 @@ SECTIONS
 	KPROBES_TEXT
 	*(.fixup)
 	*(.gnu.warning)
-	} = 0x9090
+	} :text = 0x9090
 
   _etext = .;			/* End of text section */
 
@@ -48,7 +54,7 @@ SECTIONS
   .data : AT(ADDR(.data) - LOAD_OFFSET) {	/* Data */
 	*(.data)
 	CONSTRUCTORS
-	}
+	} :data
 
   . = ALIGN(4096);
   __nosave_begin = .;
@@ -184,4 +190,6 @@ SECTIONS
   STABS_DEBUG
 
   DWARF_DEBUG
+
+  NOTES
 }
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index db5a3732f10..253ae132827 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -194,3 +194,6 @@
 		.stab.index 0 : { *(.stab.index) }			\
 		.stab.indexstr 0 : { *(.stab.indexstr) }		\
 		.comment 0 : { *(.comment) }
+
+#define NOTES								\
+		.notes : { *(.note.*) } :note
diff --git a/include/linux/elfnote.h b/include/linux/elfnote.h
new file mode 100644
index 00000000000..16f9f8ebffd
--- /dev/null
+++ b/include/linux/elfnote.h
@@ -0,0 +1,88 @@
+#ifndef _LINUX_ELFNOTE_H
+#define _LINUX_ELFNOTE_H
+/*
+ * Helper macros to generate ELF Note structures, which are put into a
+ * PT_NOTE segment of the final vmlinux image.  These are useful for
+ * including name-value pairs of metadata into the kernel binary (or
+ * modules?) for use by external programs.
+ *
+ * Each note has three parts: a name, a type and a desc.  The name is
+ * intended to distinguish the note's originator, so it would be a
+ * company, project, subsystem, etc; it must be in a suitable form for
+ * use in a section name.  The type is an integer which is used to tag
+ * the data, and is considered to be within the "name" namespace (so
+ * "FooCo"'s type 42 is distinct from "BarProj"'s type 42).  The
+ * "desc" field is the actual data.  There are no constraints on the
+ * desc field's contents, though typically they're fairly small.
+ *
+ * All notes from a given NAME are put into a section named
+ * .note.NAME.  When the kernel image is finally linked, all the notes
+ * are packed into a single .notes section, which is mapped into the
+ * PT_NOTE segment.  Because notes for a given name are grouped into
+ * the same section, they'll all be adjacent the output file.
+ *
+ * This file defines macros for both C and assembler use.  Their
+ * syntax is slightly different, but they're semantically similar.
+ *
+ * See the ELF specification for more detail about ELF notes.
+ */
+
+#ifdef __ASSEMBLER__
+/*
+ * Generate a structure with the same shape as Elf{32,64}_Nhdr (which
+ * turn out to be the same size and shape), followed by the name and
+ * desc data with appropriate padding.  The 'desc' argument includes
+ * the assembler pseudo op defining the type of the data: .asciz
+ * "hello, world"
+ */
+.macro ELFNOTE name type desc:vararg
+.pushsection ".note.\name"
+  .align 4
+  .long 2f - 1f			/* namesz */
+  .long 4f - 3f			/* descsz */
+  .long \type
+1:.asciz "\name"
+2:.align 4
+3:\desc
+4:.align 4
+.popsection
+.endm
+#else	/* !__ASSEMBLER__ */
+#include <linux/elf.h>
+/*
+ * Use an anonymous structure which matches the shape of
+ * Elf{32,64}_Nhdr, but includes the name and desc data.  The size and
+ * type of name and desc depend on the macro arguments.  "name" must
+ * be a literal string, and "desc" must be passed by value.  You may
+ * only define one note per line, since __LINE__ is used to generate
+ * unique symbols.
+ */
+#define _ELFNOTE_PASTE(a,b)	a##b
+#define _ELFNOTE(size, name, unique, type, desc)			\
+	static const struct {						\
+		struct elf##size##_note _nhdr;				\
+		unsigned char _name[sizeof(name)]			\
+		__attribute__((aligned(sizeof(Elf##size##_Word))));	\
+		typeof(desc) _desc					\
+			     __attribute__((aligned(sizeof(Elf##size##_Word)))); \
+	} _ELFNOTE_PASTE(_note_, unique)				\
+		__attribute_used__					\
+		__attribute__((section(".note." name),			\
+			       aligned(sizeof(Elf##size##_Word)),	\
+			       unused)) = {				\
+		{							\
+			sizeof(name),					\
+			sizeof(desc),					\
+			type,						\
+		},							\
+		name,							\
+		desc							\
+	}
+#define ELFNOTE(size, name, type, desc)		\
+	_ELFNOTE(size, name, __LINE__, type, desc)
+
+#define ELFNOTE32(name, type, desc) ELFNOTE(32, name, type, desc)
+#define ELFNOTE64(name, type, desc) ELFNOTE(64, name, type, desc)
+#endif	/* __ASSEMBLER__ */
+
+#endif /* _LINUX_ELFNOTE_H */
-- 
cgit v1.2.3


From 5091e746848f74c9a2c0579b4ef8d8cd1a6b135d Mon Sep 17 00:00:00 2001
From: Ian Campbell <Ian.Campbell@xensource.com>
Date: Mon, 25 Sep 2006 23:32:28 -0700
Subject: [PATCH] Translate asm version of ELFNOTE macro into preprocessor
 macro

I've come across some problems with the assembly version of the ELFNOTE
macro currently in -mm. (in
x86-put-note-sections-into-a-pt_note-segment-in-vmlinux.patch)

The first is that older gas does not support :varargs in .macro
definitions (in my testing 2.17 does while 2.15 does not, I don't know
when it became supported). The Changes file says binutils >= 2.12 so I
think we need to avoid using it. There are no other uses in mainline or
-mm. Old gas appears to just ignore it so you get "too many arguments"
type errors.

Secondly it seems that passing strings as arguments to assembler macros
is broken without varargs. It looks like they get unquoted or each
character is treated as a separate argument or something and this causes
all manner of grief. I think this is because of the use of -traditional
when compiling assembly files.

Therefore I have translated the assembler macro into a pre-processor
macro.

I added the desctype as a separate argument instead of including it with
the descdata as the previous version did since -traditional means the
ELFNOTE definition after the #else needs to have the same number of
arguments (I think so anyway, the -traditional CPP semantics are pretty
fscking strange!).

With this patch I am able to define elfnotes in assembly like this with
both old and new assemblers.

	ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS,       .asciz, "linux")
	ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION,  .asciz, "2.6")
	ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION,    .asciz, "xen-3.0")
	ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE,      .long,  __PAGE_OFFSET)

Which seems reasonable enough.

Signed-off-by: Ian Campbell <ian.campbell@xensource.com>
Acked-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/elfnote.h | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/include/linux/elfnote.h b/include/linux/elfnote.h
index 16f9f8ebffd..67396db141e 100644
--- a/include/linux/elfnote.h
+++ b/include/linux/elfnote.h
@@ -31,22 +31,24 @@
 /*
  * Generate a structure with the same shape as Elf{32,64}_Nhdr (which
  * turn out to be the same size and shape), followed by the name and
- * desc data with appropriate padding.  The 'desc' argument includes
- * the assembler pseudo op defining the type of the data: .asciz
- * "hello, world"
+ * desc data with appropriate padding.  The 'desctype' argument is the
+ * assembler pseudo op defining the type of the data e.g. .asciz while
+ * 'descdata' is the data itself e.g.  "hello, world".
+ *
+ * e.g. ELFNOTE(XYZCo, 42, .asciz, "forty-two")
+ *      ELFNOTE(XYZCo, 12, .long, 0xdeadbeef)
  */
-.macro ELFNOTE name type desc:vararg
-.pushsection ".note.\name"
-  .align 4
-  .long 2f - 1f			/* namesz */
-  .long 4f - 3f			/* descsz */
-  .long \type
-1:.asciz "\name"
-2:.align 4
-3:\desc
-4:.align 4
-.popsection
-.endm
+#define ELFNOTE(name, type, desctype, descdata)	\
+.pushsection .note.name			;	\
+  .align 4				;	\
+  .long 2f - 1f		/* namesz */	;	\
+  .long 4f - 3f		/* descsz */	;	\
+  .long type				;	\
+1:.asciz "name"				;	\
+2:.align 4				;	\
+3:desctype descdata			;	\
+4:.align 4				;	\
+.popsection				;
 #else	/* !__ASSEMBLER__ */
 #include <linux/elf.h>
 /*
-- 
cgit v1.2.3


From 753b9f86e7aef76c2beda32668ce528f90cb1733 Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Mon, 25 Sep 2006 23:32:29 -0700
Subject: [PATCH] x86: enable VMSPLIT for highmem kernels

The current VMSPLIT Kconfig option is disabled whenever highmem is on.
This is a bit screwy because the people who need to change VMSPLIT the most
tend to be the ones with highmem and constrained lowmem.

So, remove the highmem dependency.  But, re-include the dependency for the
"full 1GB of lowmem" option.  You can't have the full 1GB of lowmem and
highmem because of the need for the vmalloc(), kmap(), etc...  areas.

I thought there would be at least a bit of tweaking to do to
get it to work, but everything seems OK.

Boot tested on a 4GB x86 machine, and a 12GB 3-node NUMA-Q:

elm3b82:~# cat /proc/meminfo
MemTotal:      3695412 kB
MemFree:       3659540 kB
...
LowTotal:      2909008 kB
LowFree:       2892324 kB
...
elm3b82:~# zgrep PAE /proc/config.gz
CONFIG_X86_PAE=y

larry:~# cat /proc/meminfo
MemTotal:     11845900 kB
MemFree:      11786748 kB
...
LowTotal:      2855180 kB
LowFree:       2830092 kB

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/Kconfig | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 1f83efb7a60..6189b0c28d6 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -494,7 +494,7 @@ config HIGHMEM64G
 endchoice
 
 choice
-	depends on EXPERIMENTAL && !X86_PAE
+	depends on EXPERIMENTAL
 	prompt "Memory split" if EMBEDDED
 	default VMSPLIT_3G
 	help
@@ -516,6 +516,7 @@ choice
 	config VMSPLIT_3G
 		bool "3G/1G user/kernel split"
 	config VMSPLIT_3G_OPT
+		depends on !HIGHMEM
 		bool "3G/1G user/kernel split (for full 1G low memory)"
 	config VMSPLIT_2G
 		bool "2G/2G user/kernel split"
-- 
cgit v1.2.3


From 673eae8230a192f07b8715b872d6925521e9738d Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 25 Sep 2006 23:32:29 -0700
Subject: [PATCH] x86: trivial pgtable.h __ASSEMBLY__ move

Parsing generic pgtable.h in assembler is simply crazy.  None of this file is
needed in assembler code, and C inline functions and structures routine break
one or more different compiles.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-generic/pgtable.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index c2059a3a062..349260cd86e 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -1,6 +1,8 @@
 #ifndef _ASM_GENERIC_PGTABLE_H
 #define _ASM_GENERIC_PGTABLE_H
 
+#ifndef __ASSEMBLY__
+
 #ifndef __HAVE_ARCH_PTEP_ESTABLISH
 /*
  * Establish a new mapping:
@@ -188,7 +190,6 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres
 })
 #endif
 
-#ifndef __ASSEMBLY__
 /*
  * When walking page tables, we usually want to skip any p?d_none entries;
  * and any p?d_bad entries - reporting the error before resetting to none.
-- 
cgit v1.2.3


From 6049742dbcecf170e903638a029f4dc280b9d53d Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 25 Sep 2006 23:32:30 -0700
Subject: [PATCH] x86: trivial move of __HAVE macros in i386 pagetable headers

Move the __HAVE_ARCH_PTEP defines to accompany the function definitions.
Anything else is just a complete nightmare to track through the 2/3-level
paging code, and this caused duplicate definitions to be needed (pte_same),
which could have easily been taken care of with the asm-generic pgtable
functions.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/pgtable-2level.h |  3 ++-
 include/asm-i386/pgtable-3level.h |  2 ++
 include/asm-i386/pgtable.h        | 10 ++++------
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/include/asm-i386/pgtable-2level.h b/include/asm-i386/pgtable-2level.h
index 2756d4b04c2..201c86a6711 100644
--- a/include/asm-i386/pgtable-2level.h
+++ b/include/asm-i386/pgtable-2level.h
@@ -21,8 +21,9 @@
 #define pte_clear(mm,addr,xp)	do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
 #define pmd_clear(xp)	do { set_pmd(xp, __pmd(0)); } while (0)
 
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
 #define ptep_get_and_clear(mm,addr,xp)	__pte(xchg(&(xp)->pte_low, 0))
-#define pte_same(a, b)		((a).pte_low == (b).pte_low)
+
 #define pte_page(x)		pfn_to_page(pte_pfn(x))
 #define pte_none(x)		(!(x).pte_low)
 #define pte_pfn(x)		((unsigned long)(((x).pte_low >> PAGE_SHIFT)))
diff --git a/include/asm-i386/pgtable-3level.h b/include/asm-i386/pgtable-3level.h
index 807ed9e366d..0d899173232 100644
--- a/include/asm-i386/pgtable-3level.h
+++ b/include/asm-i386/pgtable-3level.h
@@ -105,6 +105,7 @@ static inline void pmd_clear(pmd_t *pmd)
 	*(tmp + 1) = 0;
 }
 
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
 static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
 	pte_t res;
@@ -117,6 +118,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
 	return res;
 }
 
+#define __HAVE_ARCH_PTE_SAME
 static inline int pte_same(pte_t a, pte_t b)
 {
 	return a.pte_low == b.pte_low && a.pte_high == b.pte_high;
diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h
index c04b3d0f484..38bf9751443 100644
--- a/include/asm-i386/pgtable.h
+++ b/include/asm-i386/pgtable.h
@@ -246,6 +246,7 @@ static inline pte_t pte_mkhuge(pte_t pte)	{ (pte).pte_low |= _PAGE_PSE; return p
 # include <asm/pgtable-2level.h>
 #endif
 
+#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
 static inline int ptep_test_and_clear_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
 {
 	if (!pte_dirty(*ptep))
@@ -253,6 +254,7 @@ static inline int ptep_test_and_clear_dirty(struct vm_area_struct *vma, unsigned
 	return test_and_clear_bit(_PAGE_BIT_DIRTY, &ptep->pte_low);
 }
 
+#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
 static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
 {
 	if (!pte_young(*ptep))
@@ -260,6 +262,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned
 	return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte_low);
 }
 
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
 static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full)
 {
 	pte_t pte;
@@ -272,6 +275,7 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long
 	return pte;
 }
 
+#define __HAVE_ARCH_PTEP_SET_WRPROTECT
 static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
 	clear_bit(_PAGE_BIT_RW, &ptep->pte_low);
@@ -441,12 +445,6 @@ extern void noexec_setup(const char *str);
 #define GET_IOSPACE(pfn)		0
 #define GET_PFN(pfn)			(pfn)
 
-#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
-#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
-#define __HAVE_ARCH_PTEP_SET_WRPROTECT
-#define __HAVE_ARCH_PTE_SAME
 #include <asm-generic/pgtable.h>
 
 #endif /* _I386_PGTABLE_H */
-- 
cgit v1.2.3


From 2965a0e6da0ccd8971ccf2c00a02bfa6e212acdb Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 25 Sep 2006 23:32:31 -0700
Subject: [PATCH] x86: trivial move of ptep_set_access_flags

Move ptep_set_access_flags to be closer to the other ptep accessors, and make
the indentation standard.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/pgtable.h | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h
index 38bf9751443..0dc051a8078 100644
--- a/include/asm-i386/pgtable.h
+++ b/include/asm-i386/pgtable.h
@@ -246,6 +246,22 @@ static inline pte_t pte_mkhuge(pte_t pte)	{ (pte).pte_low |= _PAGE_PSE; return p
 # include <asm/pgtable-2level.h>
 #endif
 
+/*
+ * We only update the dirty/accessed state if we set
+ * the dirty bit by hand in the kernel, since the hardware
+ * will do the accessed bit for us, and we don't want to
+ * race with other CPU's that might be updating the dirty
+ * bit at the same time.
+ */
+#define  __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
+#define ptep_set_access_flags(vma, address, ptep, entry, dirty)		\
+do {									\
+	if (dirty) {							\
+		(ptep)->pte_low = (entry).pte_low;			\
+		flush_tlb_page(vma, address);				\
+	}								\
+} while (0)
+
 #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
 static inline int ptep_test_and_clear_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
 {
@@ -415,23 +431,8 @@ extern void noexec_setup(const char *str);
 /*
  * The i386 doesn't have any external MMU info: the kernel page
  * tables contain all the necessary information.
- *
- * Also, we only update the dirty/accessed state if we set
- * the dirty bit by hand in the kernel, since the hardware
- * will do the accessed bit for us, and we don't want to
- * race with other CPU's that might be updating the dirty
- * bit at the same time.
  */
 #define update_mmu_cache(vma,address,pte) do { } while (0)
-#define  __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
-#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \
-	do {								  \
-		if (__dirty) {						  \
-			(__ptep)->pte_low = (__entry).pte_low;	  	  \
-			flush_tlb_page(__vma, __address);		  \
-		}							  \
-	} while (0)
-
 #endif /* !__ASSEMBLY__ */
 
 #ifdef CONFIG_FLATMEM
-- 
cgit v1.2.3


From 27d26666fc495166036f4ee2208df25ae7f89ab8 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 25 Sep 2006 23:32:32 -0700
Subject: [PATCH] x86: remove unused include from efi_stub.S

Remove unnecessary include from efi_stub.S

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/efi_stub.S | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/i386/kernel/efi_stub.S b/arch/i386/kernel/efi_stub.S
index d3ee73a3eee..ef00bb77d7e 100644
--- a/arch/i386/kernel/efi_stub.S
+++ b/arch/i386/kernel/efi_stub.S
@@ -7,7 +7,6 @@
 
 #include <linux/linkage.h>
 #include <asm/page.h>
-#include <asm/pgtable.h>
 
 /*
  * efi_call_phys(void *, ...) is a function with variable parameters.
-- 
cgit v1.2.3


From eaa70773e750cc09d60938bceacd028bc76b8e3a Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@hpl.hp.com>
Date: Mon, 25 Sep 2006 23:32:32 -0700
Subject: [PATCH] i386: add smp_call_function_single

Continiung the series of small patches necessary for the perfmon subsystem,
here is a patch that adds support for the smp_call_function_single()
function for i386.  It exists for almost all other architectures but i386.
The perfmon subsystem needs it in one case to free some state on a
designated remote CPU.

Signed-off-by: Stephane Eranian <eranian@hpl.hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/smp.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
index c10789d7a9d..304243ed7a3 100644
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -634,3 +634,69 @@ fastcall void smp_call_function_interrupt(struct pt_regs *regs)
 	}
 }
 
+/*
+ * this function sends a 'generic call function' IPI to one other CPU
+ * in the system.
+ *
+ * cpu is a standard Linux logical CPU number.
+ */
+static void
+__smp_call_function_single(int cpu, void (*func) (void *info), void *info,
+				int nonatomic, int wait)
+{
+	struct call_data_struct data;
+	int cpus = 1;
+
+	data.func = func;
+	data.info = info;
+	atomic_set(&data.started, 0);
+	data.wait = wait;
+	if (wait)
+		atomic_set(&data.finished, 0);
+
+	call_data = &data;
+	wmb();
+	/* Send a message to all other CPUs and wait for them to respond */
+	send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR);
+
+	/* Wait for response */
+	while (atomic_read(&data.started) != cpus)
+		cpu_relax();
+
+	if (!wait)
+		return;
+
+	while (atomic_read(&data.finished) != cpus)
+		cpu_relax();
+}
+
+/*
+ * smp_call_function_single - Run a function on another CPU
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @nonatomic: Currently unused.
+ * @wait: If true, wait until function has completed on other CPUs.
+ *
+ * Retrurns 0 on success, else a negative status code.
+ *
+ * Does not return until the remote CPU is nearly ready to execute <func>
+ * or is or has executed.
+ */
+
+int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
+	int nonatomic, int wait)
+{
+	/* prevent preemption and reschedule on another processor */
+	int me = get_cpu();
+	if (cpu == me) {
+		WARN_ON(1);
+		put_cpu();
+		return -EBUSY;
+	}
+	spin_lock_bh(&call_lock);
+	__smp_call_function_single(cpu, func, info, nonatomic, wait);
+	spin_unlock_bh(&call_lock);
+	put_cpu();
+	return 0;
+}
+EXPORT_SYMBOL(smp_call_function_single);
-- 
cgit v1.2.3


From a3bc0dbc81d36fd38991b4373f6de8e1a507605a Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 25 Sep 2006 23:32:33 -0700
Subject: [PATCH] smp_call_function_single() cleanup

If we're going to implement smp_call_function_single() on three architecture
with the same prototype then it should have a declaration in a
non-arch-specific header file.

Move it into <linux/smp.h>.

Cc: Stephane Eranian <eranian@hpl.hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/smp.c              | 4 ++--
 arch/ia64/kernel/perfmon.c          | 1 +
 arch/ia64/sn/kernel/sn2/sn_hwperf.c | 3 ++-
 arch/x86_64/kernel/smpboot.c        | 3 ++-
 include/asm-ia64/smp.h              | 2 --
 include/asm-x86_64/smp.h            | 2 --
 include/linux/smp.h                 | 3 +++
 7 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
index 304243ed7a3..465188e2d70 100644
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -683,8 +683,8 @@ __smp_call_function_single(int cpu, void (*func) (void *info), void *info,
  * or is or has executed.
  */
 
-int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
-	int nonatomic, int wait)
+int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
+			int nonatomic, int wait)
 {
 	/* prevent preemption and reschedule on another processor */
 	int me = get_cpu();
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 84a7e52f56f..7bb7696e4ce 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -34,6 +34,7 @@
 #include <linux/file.h>
 #include <linux/poll.h>
 #include <linux/vfs.h>
+#include <linux/smp.h>
 #include <linux/pagemap.h>
 #include <linux/mount.h>
 #include <linux/bitops.h>
diff --git a/arch/ia64/sn/kernel/sn2/sn_hwperf.c b/arch/ia64/sn/kernel/sn2/sn_hwperf.c
index 9a8a29339d2..b632b9c1e3b 100644
--- a/arch/ia64/sn/kernel/sn2/sn_hwperf.c
+++ b/arch/ia64/sn/kernel/sn2/sn_hwperf.c
@@ -32,9 +32,10 @@
 #include <linux/cpumask.h>
 #include <linux/smp_lock.h>
 #include <linux/nodemask.h>
+#include <linux/smp.h>
+
 #include <asm/processor.h>
 #include <asm/topology.h>
-#include <asm/smp.h>
 #include <asm/semaphore.h>
 #include <asm/uaccess.h>
 #include <asm/sal.h>
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index 975380207b4..3ae9ffddddc 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -46,9 +46,10 @@
 #include <linux/bootmem.h>
 #include <linux/thread_info.h>
 #include <linux/module.h>
-
 #include <linux/delay.h>
 #include <linux/mc146818rtc.h>
+#include <linux/smp.h>
+
 #include <asm/mtrr.h>
 #include <asm/pgalloc.h>
 #include <asm/desc.h>
diff --git a/include/asm-ia64/smp.h b/include/asm-ia64/smp.h
index 719ff309ce0..74bde1c2bb1 100644
--- a/include/asm-ia64/smp.h
+++ b/include/asm-ia64/smp.h
@@ -122,8 +122,6 @@ extern void __init smp_build_cpu_map(void);
 extern void __init init_smp_config (void);
 extern void smp_do_timer (struct pt_regs *regs);
 
-extern int smp_call_function_single (int cpuid, void (*func) (void *info), void *info,
-				     int retry, int wait);
 extern void smp_send_reschedule (int cpu);
 extern void lock_ipi_calllock(void);
 extern void unlock_ipi_calllock(void);
diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h
index 6805e1feb30..ce97f65e1d1 100644
--- a/include/asm-x86_64/smp.h
+++ b/include/asm-x86_64/smp.h
@@ -48,8 +48,6 @@ extern void unlock_ipi_call_lock(void);
 extern int smp_num_siblings;
 extern void smp_send_reschedule(int cpu);
 void smp_stop_cpu(void);
-extern int smp_call_function_single(int cpuid, void (*func) (void *info),
-				void *info, int retry, int wait);
 
 extern cpumask_t cpu_sibling_map[NR_CPUS];
 extern cpumask_t cpu_core_map[NR_CPUS];
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 837e8bce134..51649987f69 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -53,6 +53,9 @@ extern void smp_cpus_done(unsigned int max_cpus);
  */
 int smp_call_function(void(*func)(void *info), void *info, int retry, int wait);
 
+int smp_call_function_single(int cpuid, void (*func) (void *info), void *info,
+				int retry, int wait);
+
 /*
  * Call a function on all processors
  */
-- 
cgit v1.2.3


From 0f0f1b400ce3c4780b9d974bc69e8b558d99aba4 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Mon, 25 Sep 2006 23:32:34 -0700
Subject: [PATCH] Voyager: tty locking

Voyager fiddles with current->signal.tty without locking.  It turns out
that the code in question has already cleared current->signal.tty correctly
because daemonize() does the right thing already.

The signal handling also appears to be incorrect as it does an unprotected
sigfillset that also appears unneccessary.  As I don't have a bowtie and am
therefore not a qualified voyager maintainer I leave that to James.

Signed-off-by: Alan Cox <alan@redhat.com>
Acked-by: James Bottomley <James.Bottomley@steeleye.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/mach-voyager/voyager_thread.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/i386/mach-voyager/voyager_thread.c b/arch/i386/mach-voyager/voyager_thread.c
index 50f6de6ff64..f39887359e8 100644
--- a/arch/i386/mach-voyager/voyager_thread.c
+++ b/arch/i386/mach-voyager/voyager_thread.c
@@ -130,7 +130,6 @@ thread(void *unused)
 	init_timer(&wakeup_timer);
 
 	sigfillset(&current->blocked);
-	current->signal->tty = NULL;
 
 	printk(KERN_NOTICE "Voyager starting monitor thread\n");
 
-- 
cgit v1.2.3


From c7f40ff15aba95bc09a759024d62b2c344ef0856 Mon Sep 17 00:00:00 2001
From: john stultz <johnstul@us.ibm.com>
Date: Mon, 25 Sep 2006 23:32:35 -0700
Subject: [PATCH] i386: Kill references to xtime

Remove all references to xtime in i386 and replace them w/
get/set_timeofday().  Requires some ugly and uncertain changes to APM, but
has been lightly tested to work.

Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Mikael Pettersson <mikpe@it.uu.se>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/apm.c  | 17 +++++------------
 arch/i386/kernel/time.c | 26 +++++++++++++++-----------
 2 files changed, 20 insertions(+), 23 deletions(-)

diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c
index 24fd577861f..ff9ce4b5eaa 100644
--- a/arch/i386/kernel/apm.c
+++ b/arch/i386/kernel/apm.c
@@ -1154,9 +1154,11 @@ out:
 
 static void set_time(void)
 {
+	struct timespec ts;
 	if (got_clock_diff) {	/* Must know time zone in order to set clock */
-		xtime.tv_sec = get_cmos_time() + clock_cmos_diff;
-		xtime.tv_nsec = 0; 
+		ts.tv_sec = get_cmos_time() + clock_cmos_diff;
+		ts.tv_nsec = 0;
+		do_settimeofday(&ts);
 	} 
 }
 
@@ -1232,13 +1234,8 @@ static int suspend(int vetoable)
 	restore_processor_state();
 
 	local_irq_disable();
-	write_seqlock(&xtime_lock);
-	spin_lock(&i8253_lock);
-	reinit_timer();
 	set_time();
-
-	spin_unlock(&i8253_lock);
-	write_sequnlock(&xtime_lock);
+	reinit_timer();
 
 	if (err == APM_NO_ERROR)
 		err = APM_SUCCESS;
@@ -1365,9 +1362,7 @@ static void check_events(void)
 			ignore_bounce = 1;
 			if ((event != APM_NORMAL_RESUME)
 			    || (ignore_normal_resume == 0)) {
-				write_seqlock_irq(&xtime_lock);
 				set_time();
-				write_sequnlock_irq(&xtime_lock);
 				device_resume();
 				pm_send_all(PM_RESUME, (void *)0);
 				queue_event(event, NULL);
@@ -1383,9 +1378,7 @@ static void check_events(void)
 			break;
 
 		case APM_UPDATE_TIME:
-			write_seqlock_irq(&xtime_lock);
 			set_time();
-			write_sequnlock_irq(&xtime_lock);
 			break;
 
 		case APM_CRITICAL_SUSPEND:
diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
index edd00f6cee3..6f333e7fb23 100644
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -288,7 +288,7 @@ static int timer_resume(struct sys_device *dev)
 	unsigned long flags;
 	unsigned long sec;
 	unsigned long sleep_length;
-
+	struct timespec ts;
 #ifdef CONFIG_HPET_TIMER
 	if (is_hpet_enabled())
 		hpet_reenable();
@@ -296,9 +296,11 @@ static int timer_resume(struct sys_device *dev)
 	setup_pit_timer();
 	sec = get_cmos_time() + clock_cmos_diff;
 	sleep_length = (get_cmos_time() - sleep_start) * HZ;
+
+	ts.tv_sec = sec;
+	ts.tv_nsec = 0;
+	do_settimeofday(&ts);
 	write_seqlock_irqsave(&xtime_lock, flags);
-	xtime.tv_sec = sec;
-	xtime.tv_nsec = 0;
 	jiffies_64 += sleep_length;
 	wall_jiffies += sleep_length;
 	write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -334,10 +336,11 @@ extern void (*late_time_init)(void);
 /* Duplicate of time_init() below, with hpet_enable part added */
 static void __init hpet_time_init(void)
 {
-	xtime.tv_sec = get_cmos_time();
-	xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
-	set_normalized_timespec(&wall_to_monotonic,
-		-xtime.tv_sec, -xtime.tv_nsec);
+	struct timespec ts;
+	ts.tv_sec = get_cmos_time();
+	ts.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
+
+	do_settimeofday(&ts);
 
 	if ((hpet_enable() >= 0) && hpet_use_timer) {
 		printk("Using HPET for base-timer\n");
@@ -349,6 +352,7 @@ static void __init hpet_time_init(void)
 
 void __init time_init(void)
 {
+	struct timespec ts;
 #ifdef CONFIG_HPET_TIMER
 	if (is_hpet_capable()) {
 		/*
@@ -359,10 +363,10 @@ void __init time_init(void)
 		return;
 	}
 #endif
-	xtime.tv_sec = get_cmos_time();
-	xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
-	set_normalized_timespec(&wall_to_monotonic,
-		-xtime.tv_sec, -xtime.tv_nsec);
+	ts.tv_sec = get_cmos_time();
+	ts.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
+
+	do_settimeofday(&ts);
 
 	time_init_hook();
 }
-- 
cgit v1.2.3


From 182daa55679d9b3557d27609e7ab4bfab749ac5b Mon Sep 17 00:00:00 2001
From: Josh Triplett <josht@us.ibm.com>
Date: Mon, 25 Sep 2006 23:32:36 -0700
Subject: [PATCH] mtrr: Add lock annotations for prepare_set and post_set

The functions prepare_set and post_set in kernel/cpu/mtrr/generic.c wrap
the spinlock set_atomicity_lock: prepare_set returns with the lock held,
and post_set releases the lock without acquiring it.  Add lock annotations
to these two functions so that sparse can check callers for lock pairing,
and so that sparse will not complain about these functions since they
intentionally use locks in this manner.

Signed-off-by: Josh Triplett <josh@freedesktop.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/cpu/mtrr/generic.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/i386/kernel/cpu/mtrr/generic.c b/arch/i386/kernel/cpu/mtrr/generic.c
index 169ac8e0db6..0b61eed8bbd 100644
--- a/arch/i386/kernel/cpu/mtrr/generic.c
+++ b/arch/i386/kernel/cpu/mtrr/generic.c
@@ -243,7 +243,7 @@ static DEFINE_SPINLOCK(set_atomicity_lock);
  * has been called.
  */
 
-static void prepare_set(void)
+static void prepare_set(void) __acquires(set_atomicity_lock)
 {
 	unsigned long cr0;
 
@@ -274,7 +274,7 @@ static void prepare_set(void)
 	mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & 0xf300UL, deftype_hi);
 }
 
-static void post_set(void)
+static void post_set(void) __releases(set_atomicity_lock)
 {
 	/*  Flush TLBs (no need to flush caches - they are disabled)  */
 	__flush_tlb();
-- 
cgit v1.2.3


From 060ec3d52db417a4fa554b6e14594ca62418c326 Mon Sep 17 00:00:00 2001
From: "Fernando J. Pereda" <ferdy@gentoo.org>
Date: Mon, 25 Sep 2006 23:32:37 -0700
Subject: [PATCH] alpha: Fix ALPHA_EV56 dependencies typo

There appears to be a typo in the EV56 config option. NORITAKE and PRIMO are
be able to set a variation of either.

Signed-off-by: Daniel Drake <dsd@gentoo.org>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/alpha/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 213c7850d5f..2b36afd8e96 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -381,7 +381,7 @@ config ALPHA_EV56
 
 config ALPHA_EV56
 	prompt "EV56 CPU (speed >= 333MHz)?"
-	depends on ALPHA_NORITAKE && ALPHA_PRIMO
+	depends on ALPHA_NORITAKE || ALPHA_PRIMO
 
 config ALPHA_EV56
 	prompt "EV56 CPU (speed >= 400MHz)?"
-- 
cgit v1.2.3


From 930631edd4b1fe2781d9fe90edbe35d89dfc94cc Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 25 Sep 2006 23:32:40 -0700
Subject: [PATCH] add DIV_ROUND_UP()

Add the DIV_ROUND_UP() helper macro: divide `n' by `d', rounding up.

Stolen from the gfs2 tree(!) because the swsusp patches need it.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/kernel.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 2b2ae4fdce8..e44a37e2c71 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -33,6 +33,7 @@ extern const char linux_banner[];
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
 #define ALIGN(x,a) (((x)+(a)-1UL)&~((a)-1UL))
 #define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f))
+#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
 #define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))
 
 #define	KERN_EMERG	"<0>"	/* system is unusable			*/
-- 
cgit v1.2.3


From 3a4f7577c9ef393ca80c783f02ffbc125de771c7 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 25 Sep 2006 23:32:41 -0700
Subject: [PATCH] swsusp: add write-speed instrumentation

Add some instrumentation to the swsusp writeout code to show what bandwidth
we're achieving.

Cc: Pavel Machek <pavel@ucw.cz>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/swap.c | 32 +++++++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index f1dd146bd64..79b66e734bd 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -146,6 +146,26 @@ static void release_swap_writer(struct swap_map_handle *handle)
 	handle->bitmap = NULL;
 }
 
+static void show_speed(struct timeval *start, struct timeval *stop,
+			unsigned nr_pages, char *msg)
+{
+	s64 elapsed_centisecs64;
+	int centisecs;
+	int k;
+	int kps;
+
+	elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
+	do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
+	centisecs = elapsed_centisecs64;
+	if (centisecs == 0)
+		centisecs = 1;	/* avoid div-by-zero */
+	k = nr_pages * (PAGE_SIZE / 1024);
+	kps = (k * 100) / centisecs;
+	printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k,
+			centisecs / 100, centisecs % 100,
+			kps / 1000, (kps % 1000) / 10);
+}
+
 static int get_swap_writer(struct swap_map_handle *handle)
 {
 	handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
@@ -206,17 +226,21 @@ static int flush_swap_writer(struct swap_map_handle *handle)
 
 static int save_image(struct swap_map_handle *handle,
                       struct snapshot_handle *snapshot,
-                      unsigned int nr_pages)
+                      unsigned int nr_to_write)
 {
 	unsigned int m;
 	int ret;
 	int error = 0;
+	int nr_pages;
+	struct timeval start;
+	struct timeval stop;
 
-	printk("Saving image data pages (%u pages) ...     ", nr_pages);
-	m = nr_pages / 100;
+	printk("Saving image data pages (%u pages) ...     ", nr_to_write);
+	m = nr_to_write / 100;
 	if (!m)
 		m = 1;
 	nr_pages = 0;
+	do_gettimeofday(&start);
 	do {
 		ret = snapshot_read_next(snapshot, PAGE_SIZE);
 		if (ret > 0) {
@@ -228,8 +252,10 @@ static int save_image(struct swap_map_handle *handle,
 			nr_pages++;
 		}
 	} while (ret > 0);
+	do_gettimeofday(&stop);
 	if (!error)
 		printk("\b\b\b\bdone\n");
+	show_speed(&start, &stop, nr_to_write, "Wrote");
 	return error;
 }
 
-- 
cgit v1.2.3


From ab954160350c91c77ae03740ef90458c3ad5412c Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 25 Sep 2006 23:32:42 -0700
Subject: [PATCH] swsusp: write speedup

Switch the swsusp writeout code from 4k-at-a-time to 4MB-at-a-time.

Crufty old PIII testbox:
	12.9 MB/s -> 20.9 MB/s

Sony Vaio:
	14.7 MB/s -> 26.5 MB/s

The implementation is crude.  A better one would use larger BIOs, but wouldn't
gain any performance.

The memcpys will be mostly pipelined with the IO and basically come for free.

The ENOMEM path has not been tested.  It should be.

Cc: Pavel Machek <pavel@ucw.cz>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/swap.h |  5 ++-
 kernel/power/swap.c  | 93 ++++++++++++++++++++++++++++++++++++++++++----------
 mm/page_io.c         | 25 ++++++++++----
 3 files changed, 98 insertions(+), 25 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index a2f5ad7c2d2..3d434cbffe2 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -12,6 +12,8 @@
 
 struct notifier_block;
 
+struct bio;
+
 #define SWAP_FLAG_PREFER	0x8000	/* set if swap priority specified */
 #define SWAP_FLAG_PRIO_MASK	0x7fff
 #define SWAP_FLAG_PRIO_SHIFT	0
@@ -216,7 +218,8 @@ extern void swap_unplug_io_fn(struct backing_dev_info *, struct page *);
 /* linux/mm/page_io.c */
 extern int swap_readpage(struct file *, struct page *);
 extern int swap_writepage(struct page *page, struct writeback_control *wbc);
-extern int rw_swap_page_sync(int, swp_entry_t, struct page *);
+extern int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page,
+				struct bio **bio_chain);
 
 /* linux/mm/swap_state.c */
 extern struct address_space swapper_space;
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 79b66e734bd..2dc883d361d 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -49,18 +49,16 @@ static int mark_swapfiles(swp_entry_t start)
 {
 	int error;
 
-	rw_swap_page_sync(READ,
-			  swp_entry(root_swap, 0),
-			  virt_to_page((unsigned long)&swsusp_header));
+	rw_swap_page_sync(READ, swp_entry(root_swap, 0),
+			  virt_to_page((unsigned long)&swsusp_header), NULL);
 	if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
 	    !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
 		memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
 		memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
 		swsusp_header.image = start;
-		error = rw_swap_page_sync(WRITE,
-					  swp_entry(root_swap, 0),
-					  virt_to_page((unsigned long)
-						       &swsusp_header));
+		error = rw_swap_page_sync(WRITE, swp_entry(root_swap, 0),
+				virt_to_page((unsigned long)&swsusp_header),
+				NULL);
 	} else {
 		pr_debug("swsusp: Partition is not swap space.\n");
 		error = -ENODEV;
@@ -88,16 +86,37 @@ static int swsusp_swap_check(void) /* This is called before saving image */
  *	write_page - Write one page to given swap location.
  *	@buf:		Address we're writing.
  *	@offset:	Offset of the swap page we're writing to.
+ *	@bio_chain:	Link the next write BIO here
  */
 
-static int write_page(void *buf, unsigned long offset)
+static int write_page(void *buf, unsigned long offset, struct bio **bio_chain)
 {
 	swp_entry_t entry;
 	int error = -ENOSPC;
 
 	if (offset) {
+		struct page *page = virt_to_page(buf);
+
+		if (bio_chain) {
+			/*
+			 * Whether or not we successfully allocated a copy page,
+			 * we take a ref on the page here.  It gets undone in
+			 * wait_on_bio_chain().
+			 */
+			struct page *page_copy;
+			page_copy = alloc_page(GFP_ATOMIC);
+			if (page_copy == NULL) {
+				WARN_ON_ONCE(1);
+				bio_chain = NULL;	/* Go synchronous */
+				get_page(page);
+			} else {
+				memcpy(page_address(page_copy),
+					page_address(page), PAGE_SIZE);
+				page = page_copy;
+			}
+		}
 		entry = swp_entry(root_swap, offset);
-		error = rw_swap_page_sync(WRITE, entry, virt_to_page(buf));
+		error = rw_swap_page_sync(WRITE, entry, page, bio_chain);
 	}
 	return error;
 }
@@ -185,37 +204,68 @@ static int get_swap_writer(struct swap_map_handle *handle)
 	return 0;
 }
 
-static int swap_write_page(struct swap_map_handle *handle, void *buf)
+static int wait_on_bio_chain(struct bio **bio_chain)
 {
-	int error;
+	struct bio *bio;
+	struct bio *next_bio;
+	int ret = 0;
+
+	if (bio_chain == NULL)
+		return 0;
+
+	bio = *bio_chain;
+	while (bio) {
+		struct page *page;
+
+		next_bio = bio->bi_private;
+		page = bio->bi_io_vec[0].bv_page;
+		wait_on_page_locked(page);
+		if (!PageUptodate(page) || PageError(page))
+			ret = -EIO;
+		put_page(page);
+		bio_put(bio);
+		bio = next_bio;
+	}
+	*bio_chain = NULL;
+	return ret;
+}
+
+static int swap_write_page(struct swap_map_handle *handle, void *buf,
+				struct bio **bio_chain)
+{
+	int error = 0;
 	unsigned long offset;
 
 	if (!handle->cur)
 		return -EINVAL;
 	offset = alloc_swap_page(root_swap, handle->bitmap);
-	error = write_page(buf, offset);
+	error = write_page(buf, offset, bio_chain);
 	if (error)
 		return error;
 	handle->cur->entries[handle->k++] = offset;
 	if (handle->k >= MAP_PAGE_ENTRIES) {
+		error = wait_on_bio_chain(bio_chain);
+		if (error)
+			goto out;
 		offset = alloc_swap_page(root_swap, handle->bitmap);
 		if (!offset)
 			return -ENOSPC;
 		handle->cur->next_swap = offset;
-		error = write_page(handle->cur, handle->cur_swap);
+		error = write_page(handle->cur, handle->cur_swap, NULL);
 		if (error)
-			return error;
+			goto out;
 		memset(handle->cur, 0, PAGE_SIZE);
 		handle->cur_swap = offset;
 		handle->k = 0;
 	}
-	return 0;
+out:
+	return error;
 }
 
 static int flush_swap_writer(struct swap_map_handle *handle)
 {
 	if (handle->cur && handle->cur_swap)
-		return write_page(handle->cur, handle->cur_swap);
+		return write_page(handle->cur, handle->cur_swap, NULL);
 	else
 		return -EINVAL;
 }
@@ -232,6 +282,8 @@ static int save_image(struct swap_map_handle *handle,
 	int ret;
 	int error = 0;
 	int nr_pages;
+	int err2;
+	struct bio *bio;
 	struct timeval start;
 	struct timeval stop;
 
@@ -240,11 +292,13 @@ static int save_image(struct swap_map_handle *handle,
 	if (!m)
 		m = 1;
 	nr_pages = 0;
+	bio = NULL;
 	do_gettimeofday(&start);
 	do {
 		ret = snapshot_read_next(snapshot, PAGE_SIZE);
 		if (ret > 0) {
-			error = swap_write_page(handle, data_of(*snapshot));
+			error = swap_write_page(handle, data_of(*snapshot),
+						&bio);
 			if (error)
 				break;
 			if (!(nr_pages % m))
@@ -252,7 +306,10 @@ static int save_image(struct swap_map_handle *handle,
 			nr_pages++;
 		}
 	} while (ret > 0);
+	err2 = wait_on_bio_chain(&bio);
 	do_gettimeofday(&stop);
+	if (!error)
+		error = err2;
 	if (!error)
 		printk("\b\b\b\bdone\n");
 	show_speed(&start, &stop, nr_to_write, "Wrote");
@@ -307,7 +364,7 @@ int swsusp_write(void)
 	error = get_swap_writer(&handle);
 	if (!error) {
 		unsigned long start = handle.cur_swap;
-		error = swap_write_page(&handle, header);
+		error = swap_write_page(&handle, header, NULL);
 		if (!error)
 			error = save_image(&handle, &snapshot,
 					header->pages - 1);
diff --git a/mm/page_io.c b/mm/page_io.c
index d2f0a578337..f46a9862b7e 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -156,10 +156,12 @@ out:
  * We use end_swap_bio_read() even for writes, because it happens to do what
  * we want.
  */
-int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page)
+int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page,
+			struct bio **bio_chain)
 {
 	struct bio *bio;
 	int ret = 0;
+	int bio_rw;
 
 	lock_page(page);
 
@@ -170,11 +172,22 @@ int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page)
 		goto out;
 	}
 
-	submit_bio(rw | (1 << BIO_RW_SYNC), bio);
-	wait_on_page_locked(page);
-
-	if (!PageUptodate(page) || PageError(page))
-		ret = -EIO;
+	bio_rw = rw;
+	if (!bio_chain)
+		bio_rw |= (1 << BIO_RW_SYNC);
+	if (bio_chain)
+		bio_get(bio);
+	submit_bio(bio_rw, bio);
+	if (bio_chain == NULL) {
+		wait_on_page_locked(page);
+
+		if (!PageUptodate(page) || PageError(page))
+			ret = -EIO;
+	}
+	if (bio_chain) {
+		bio->bi_private = *bio_chain;
+		*bio_chain = bio;
+	}
 out:
 	return ret;
 }
-- 
cgit v1.2.3


From 8c002494b55119a3fd1dddee83b4fb75cfda47e5 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 25 Sep 2006 23:32:43 -0700
Subject: [PATCH] swsusp: add read-speed instrumentation

Add some instrumentation to the swsusp readin code to show what bandwidth
we're achieving.

Cc: Pavel Machek <pavel@ucw.cz>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/swap.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 2dc883d361d..9ab98957216 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -522,12 +522,15 @@ static int load_image(struct swap_map_handle *handle,
 	unsigned int m;
 	int ret;
 	int error = 0;
+	struct timeval start;
+	struct timeval stop;
 
 	printk("Loading image data pages (%u pages) ...     ", nr_pages);
 	m = nr_pages / 100;
 	if (!m)
 		m = 1;
 	nr_pages = 0;
+	do_gettimeofday(&start);
 	do {
 		ret = snapshot_write_next(snapshot, PAGE_SIZE);
 		if (ret > 0) {
@@ -539,11 +542,13 @@ static int load_image(struct swap_map_handle *handle,
 			nr_pages++;
 		}
 	} while (ret > 0);
+	do_gettimeofday(&stop);
 	if (!error) {
 		printk("\b\b\b\bdone\n");
 		if (!snapshot_image_loaded(snapshot))
 			error = -ENODATA;
 	}
+	show_speed(&start, &stop, nr_pages, "Read");
 	return error;
 }
 
-- 
cgit v1.2.3


From 546e0d271941dd1ff6961e2a1f7eac75f1fc277e Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 25 Sep 2006 23:32:44 -0700
Subject: [PATCH] swsusp: read speedup

Implement async reads for swsusp resuming.

Crufty old PIII testbox:
	15.7 MB/s -> 20.3 MB/s

Sony Vaio:
	14.6 MB/s -> 33.3 MB/s

I didn't implement the post-resume bio_set_pages_dirty().  I don't really
understand why resume needs to run set_page_dirty() against these pages.

It might be a worry that this code modifies PG_Uptodate, PG_Error and
PG_Locked against the image pages.  Can this possibly affect the resumed-into
kernel?  Hopefully not, if we're atomically restoring its mem_map?

Cc: Pavel Machek <pavel@ucw.cz>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Jens Axboe <axboe@suse.de>
Cc: Laurent Riffard <laurent.riffard@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/swap.h    |   1 +
 kernel/power/power.h    |   1 +
 kernel/power/snapshot.c |  11 ++--
 kernel/power/swap.c     | 138 ++++++++++++++++++++++++------------------------
 mm/page_io.c            |   2 +-
 5 files changed, 81 insertions(+), 72 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 3d434cbffe2..e7c36ba2a2d 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -220,6 +220,7 @@ extern int swap_readpage(struct file *, struct page *);
 extern int swap_writepage(struct page *page, struct writeback_control *wbc);
 extern int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page,
 				struct bio **bio_chain);
+extern int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err);
 
 /* linux/mm/swap_state.c */
 extern struct address_space swapper_space;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 57a792982fb..59ce712f082 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -58,6 +58,7 @@ struct snapshot_handle {
 	struct pbe	*pbe, *last_pbe;
 	void		*buffer;
 	unsigned int	buf_offset;
+	int		sync_read;
 };
 
 #define data_of(handle)	((handle).buffer + (handle).buf_offset)
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 75d4886e648..591301ae8b7 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -314,7 +314,7 @@ static unsigned int unsafe_pages;
  *	and we count them using unsafe_pages
  */
 
-static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
+static void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
 {
 	void *res;
 
@@ -828,13 +828,16 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
 	}
 	if (!handle->offset)
 		handle->buffer = buffer;
+	handle->sync_read = 1;
 	if (handle->prev < handle->page) {
 		if (!handle->prev) {
-			error = load_header(handle, (struct swsusp_info *)buffer);
+			error = load_header(handle,
+					(struct swsusp_info *)buffer);
 			if (error)
 				return error;
 		} else if (handle->prev <= nr_meta_pages) {
-			handle->pbe = unpack_orig_addresses(buffer, handle->pbe);
+			handle->pbe = unpack_orig_addresses(buffer,
+							handle->pbe);
 			if (!handle->pbe) {
 				error = prepare_image(handle);
 				if (error)
@@ -842,10 +845,12 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
 				handle->pbe = pagedir_nosave;
 				handle->last_pbe = NULL;
 				handle->buffer = get_buffer(handle);
+				handle->sync_read = 0;
 			}
 		} else {
 			handle->pbe = handle->pbe->next;
 			handle->buffer = get_buffer(handle);
+			handle->sync_read = 0;
 		}
 		handle->prev = handle->page;
 	}
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 9ab98957216..8309d20b256 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -22,6 +22,7 @@
 #include <linux/device.h>
 #include <linux/buffer_head.h>
 #include <linux/bio.h>
+#include <linux/blkdev.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
 #include <linux/pm.h>
@@ -214,6 +215,8 @@ static int wait_on_bio_chain(struct bio **bio_chain)
 		return 0;
 
 	bio = *bio_chain;
+	if (bio == NULL)
+		return 0;
 	while (bio) {
 		struct page *page;
 
@@ -349,7 +352,8 @@ int swsusp_write(void)
 	int error;
 
 	if ((error = swsusp_swap_check())) {
-		printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n");
+		printk(KERN_ERR "swsusp: Cannot find swap device, try "
+				"swapon -a.\n");
 		return error;
 	}
 	memset(&snapshot, 0, sizeof(struct snapshot_handle));
@@ -381,27 +385,6 @@ int swsusp_write(void)
 	return error;
 }
 
-/*
- *	Using bio to read from swap.
- *	This code requires a bit more work than just using buffer heads
- *	but, it is the recommended way for 2.5/2.6.
- *	The following are to signal the beginning and end of I/O. Bios
- *	finish asynchronously, while we want them to happen synchronously.
- *	A simple atomic_t, and a wait loop take care of this problem.
- */
-
-static atomic_t io_done = ATOMIC_INIT(0);
-
-static int end_io(struct bio *bio, unsigned int num, int err)
-{
-	if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
-		printk(KERN_ERR "I/O error reading swsusp image.\n");
-		return -EIO;
-	}
-	atomic_set(&io_done, 0);
-	return 0;
-}
-
 static struct block_device *resume_bdev;
 
 /**
@@ -409,15 +392,15 @@ static struct block_device *resume_bdev;
  *	@rw:	READ or WRITE.
  *	@off	physical offset of page.
  *	@page:	page we're reading or writing.
+ *	@bio_chain: list of pending biod (for async reading)
  *
  *	Straight from the textbook - allocate and initialize the bio.
- *	If we're writing, make sure the page is marked as dirty.
- *	Then submit it and wait.
+ *	If we're reading, make sure the page is marked as dirty.
+ *	Then submit it and, if @bio_chain == NULL, wait.
  */
-
-static int submit(int rw, pgoff_t page_off, void *page)
+static int submit(int rw, pgoff_t page_off, struct page *page,
+			struct bio **bio_chain)
 {
-	int error = 0;
 	struct bio *bio;
 
 	bio = bio_alloc(GFP_ATOMIC, 1);
@@ -425,33 +408,40 @@ static int submit(int rw, pgoff_t page_off, void *page)
 		return -ENOMEM;
 	bio->bi_sector = page_off * (PAGE_SIZE >> 9);
 	bio->bi_bdev = resume_bdev;
-	bio->bi_end_io = end_io;
+	bio->bi_end_io = end_swap_bio_read;
 
-	if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) {
-		printk("swsusp: ERROR: adding page to bio at %ld\n",page_off);
-		error = -EFAULT;
-		goto Done;
+	if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
+		printk("swsusp: ERROR: adding page to bio at %ld\n", page_off);
+		bio_put(bio);
+		return -EFAULT;
 	}
 
-	atomic_set(&io_done, 1);
-	submit_bio(rw | (1 << BIO_RW_SYNC), bio);
-	while (atomic_read(&io_done))
-		yield();
-	if (rw == READ)
-		bio_set_pages_dirty(bio);
- Done:
-	bio_put(bio);
-	return error;
+	lock_page(page);
+	bio_get(bio);
+
+	if (bio_chain == NULL) {
+		submit_bio(rw | (1 << BIO_RW_SYNC), bio);
+		wait_on_page_locked(page);
+		if (rw == READ)
+			bio_set_pages_dirty(bio);
+		bio_put(bio);
+	} else {
+		get_page(page);
+		bio->bi_private = *bio_chain;
+		*bio_chain = bio;
+		submit_bio(rw | (1 << BIO_RW_SYNC), bio);
+	}
+	return 0;
 }
 
-static int bio_read_page(pgoff_t page_off, void *page)
+static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
 {
-	return submit(READ, page_off, page);
+	return submit(READ, page_off, virt_to_page(addr), bio_chain);
 }
 
-static int bio_write_page(pgoff_t page_off, void *page)
+static int bio_write_page(pgoff_t page_off, void *addr)
 {
-	return submit(WRITE, page_off, page);
+	return submit(WRITE, page_off, virt_to_page(addr), NULL);
 }
 
 /**
@@ -476,7 +466,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
 	handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
 	if (!handle->cur)
 		return -ENOMEM;
-	error = bio_read_page(swp_offset(start), handle->cur);
+	error = bio_read_page(swp_offset(start), handle->cur, NULL);
 	if (error) {
 		release_swap_reader(handle);
 		return error;
@@ -485,7 +475,8 @@ static int get_swap_reader(struct swap_map_handle *handle,
 	return 0;
 }
 
-static int swap_read_page(struct swap_map_handle *handle, void *buf)
+static int swap_read_page(struct swap_map_handle *handle, void *buf,
+				struct bio **bio_chain)
 {
 	unsigned long offset;
 	int error;
@@ -495,16 +486,17 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf)
 	offset = handle->cur->entries[handle->k];
 	if (!offset)
 		return -EFAULT;
-	error = bio_read_page(offset, buf);
+	error = bio_read_page(offset, buf, bio_chain);
 	if (error)
 		return error;
 	if (++handle->k >= MAP_PAGE_ENTRIES) {
+		error = wait_on_bio_chain(bio_chain);
 		handle->k = 0;
 		offset = handle->cur->next_swap;
 		if (!offset)
 			release_swap_reader(handle);
-		else
-			error = bio_read_page(offset, handle->cur);
+		else if (!error)
+			error = bio_read_page(offset, handle->cur, NULL);
 	}
 	return error;
 }
@@ -517,38 +509,48 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf)
 
 static int load_image(struct swap_map_handle *handle,
                       struct snapshot_handle *snapshot,
-                      unsigned int nr_pages)
+                      unsigned int nr_to_read)
 {
 	unsigned int m;
-	int ret;
 	int error = 0;
 	struct timeval start;
 	struct timeval stop;
+	struct bio *bio;
+	int err2;
+	unsigned nr_pages;
 
-	printk("Loading image data pages (%u pages) ...     ", nr_pages);
-	m = nr_pages / 100;
+	printk("Loading image data pages (%u pages) ...     ", nr_to_read);
+	m = nr_to_read / 100;
 	if (!m)
 		m = 1;
 	nr_pages = 0;
+	bio = NULL;
 	do_gettimeofday(&start);
-	do {
-		ret = snapshot_write_next(snapshot, PAGE_SIZE);
-		if (ret > 0) {
-			error = swap_read_page(handle, data_of(*snapshot));
-			if (error)
-				break;
-			if (!(nr_pages % m))
-				printk("\b\b\b\b%3d%%", nr_pages / m);
-			nr_pages++;
-		}
-	} while (ret > 0);
+	for ( ; ; ) {
+		error = snapshot_write_next(snapshot, PAGE_SIZE);
+		if (error <= 0)
+			break;
+		error = swap_read_page(handle, data_of(*snapshot), &bio);
+		if (error)
+			break;
+		if (snapshot->sync_read)
+			error = wait_on_bio_chain(&bio);
+		if (error)
+			break;
+		if (!(nr_pages % m))
+			printk("\b\b\b\b%3d%%", nr_pages / m);
+		nr_pages++;
+	}
+	err2 = wait_on_bio_chain(&bio);
 	do_gettimeofday(&stop);
+	if (!error)
+		error = err2;
 	if (!error) {
 		printk("\b\b\b\bdone\n");
 		if (!snapshot_image_loaded(snapshot))
 			error = -ENODATA;
 	}
-	show_speed(&start, &stop, nr_pages, "Read");
+	show_speed(&start, &stop, nr_to_read, "Read");
 	return error;
 }
 
@@ -571,7 +573,7 @@ int swsusp_read(void)
 	header = (struct swsusp_info *)data_of(snapshot);
 	error = get_swap_reader(&handle, swsusp_header.image);
 	if (!error)
-		error = swap_read_page(&handle, header);
+		error = swap_read_page(&handle, header, NULL);
 	if (!error)
 		error = load_image(&handle, &snapshot, header->pages - 1);
 	release_swap_reader(&handle);
@@ -597,7 +599,7 @@ int swsusp_check(void)
 	if (!IS_ERR(resume_bdev)) {
 		set_blocksize(resume_bdev, PAGE_SIZE);
 		memset(&swsusp_header, 0, sizeof(swsusp_header));
-		if ((error = bio_read_page(0, &swsusp_header)))
+		if ((error = bio_read_page(0, &swsusp_header, NULL)))
 			return error;
 		if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
 			memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
diff --git a/mm/page_io.c b/mm/page_io.c
index f46a9862b7e..d4840ecbf8f 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -74,7 +74,7 @@ static int end_swap_bio_write(struct bio *bio, unsigned int bytes_done, int err)
 	return 0;
 }
 
-static int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err)
+int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err)
 {
 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct page *page = bio->bi_io_vec[0].bv_page;
-- 
cgit v1.2.3


From ae83c5eef59ffe6eb61110b8c8fd1ea3e0881712 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 25 Sep 2006 23:32:45 -0700
Subject: [PATCH] swsusp: clean up browsing of pfns

Clean up some loops over pfns for each zone in snapshot.c: reduce the
number of additions to perform, rework detection of saveable pages and make
the code a bit less difficult to understand, hopefully.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/snapshot.c | 62 +++++++++++++++++++++++++------------------------
 1 file changed, 32 insertions(+), 30 deletions(-)

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 591301ae8b7..979096c2777 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -156,7 +156,7 @@ static inline int save_highmem(void) {return 0;}
 static inline int restore_highmem(void) {return 0;}
 #endif
 
-static int pfn_is_nosave(unsigned long pfn)
+static inline int pfn_is_nosave(unsigned long pfn)
 {
 	unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
 	unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT;
@@ -167,43 +167,43 @@ static int pfn_is_nosave(unsigned long pfn)
  *	saveable - Determine whether a page should be cloned or not.
  *	@pfn:	The page
  *
- *	We save a page if it's Reserved, and not in the range of pages
- *	statically defined as 'unsaveable', or if it isn't reserved, and
- *	isn't part of a free chunk of pages.
+ *	We save a page if it isn't Nosave, and is not in the range of pages
+ *	statically defined as 'unsaveable', and it
+ *	isn't a part of a free chunk of pages.
  */
 
-static int saveable(struct zone *zone, unsigned long *zone_pfn)
+static struct page *saveable_page(unsigned long pfn)
 {
-	unsigned long pfn = *zone_pfn + zone->zone_start_pfn;
 	struct page *page;
 
 	if (!pfn_valid(pfn))
-		return 0;
+		return NULL;
 
 	page = pfn_to_page(pfn);
-	BUG_ON(PageReserved(page) && PageNosave(page));
+
 	if (PageNosave(page))
-		return 0;
+		return NULL;
 	if (PageReserved(page) && pfn_is_nosave(pfn))
-		return 0;
+		return NULL;
 	if (PageNosaveFree(page))
-		return 0;
+		return NULL;
 
-	return 1;
+	return page;
 }
 
 unsigned int count_data_pages(void)
 {
 	struct zone *zone;
-	unsigned long zone_pfn;
+	unsigned long pfn, max_zone_pfn;
 	unsigned int n = 0;
 
 	for_each_zone (zone) {
 		if (is_highmem(zone))
 			continue;
 		mark_free_pages(zone);
-		for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
-			n += saveable(zone, &zone_pfn);
+		max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
+		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
+			n += !!saveable_page(pfn);
 	}
 	return n;
 }
@@ -211,7 +211,7 @@ unsigned int count_data_pages(void)
 static void copy_data_pages(struct pbe *pblist)
 {
 	struct zone *zone;
-	unsigned long zone_pfn;
+	unsigned long pfn, max_zone_pfn;
 	struct pbe *pbe, *p;
 
 	pbe = pblist;
@@ -224,13 +224,14 @@ static void copy_data_pages(struct pbe *pblist)
 			SetPageNosaveFree(virt_to_page(p));
 		for_each_pbe (p, pblist)
 			SetPageNosaveFree(virt_to_page(p->address));
-		for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
-			if (saveable(zone, &zone_pfn)) {
-				struct page *page;
+		max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
+		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
+			struct page *page = saveable_page(pfn);
+
+			if (page) {
 				long *src, *dst;
 				int n;
 
-				page = pfn_to_page(zone_pfn + zone->zone_start_pfn);
 				BUG_ON(!pbe);
 				pbe->orig_address = (unsigned long)page_address(page);
 				/* copy_page and memcpy are not usable for copying task structs. */
@@ -383,13 +384,14 @@ static struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask,
 void swsusp_free(void)
 {
 	struct zone *zone;
-	unsigned long zone_pfn;
+	unsigned long pfn, max_zone_pfn;
 
 	for_each_zone(zone) {
-		for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
-			if (pfn_valid(zone_pfn + zone->zone_start_pfn)) {
-				struct page *page;
-				page = pfn_to_page(zone_pfn + zone->zone_start_pfn);
+		max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
+		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
+			if (pfn_valid(pfn)) {
+				struct page *page = pfn_to_page(pfn);
+
 				if (PageNosave(page) && PageNosaveFree(page)) {
 					ClearPageNosave(page);
 					ClearPageNosaveFree(page);
@@ -598,7 +600,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
 static int mark_unsafe_pages(struct pbe *pblist)
 {
 	struct zone *zone;
-	unsigned long zone_pfn;
+	unsigned long pfn, max_zone_pfn;
 	struct pbe *p;
 
 	if (!pblist) /* a sanity check */
@@ -606,10 +608,10 @@ static int mark_unsafe_pages(struct pbe *pblist)
 
 	/* Clear page flags */
 	for_each_zone (zone) {
-		for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
-			if (pfn_valid(zone_pfn + zone->zone_start_pfn))
-				ClearPageNosaveFree(pfn_to_page(zone_pfn +
-					zone->zone_start_pfn));
+		max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
+		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
+			if (pfn_valid(pfn))
+				ClearPageNosaveFree(pfn_to_page(pfn));
 	}
 
 	/* Mark orig addresses */
-- 
cgit v1.2.3


From fb13a28b0f5ada60861868c4fa48a12bd0cb8dea Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 25 Sep 2006 23:32:46 -0700
Subject: [PATCH] swsusp: struct snapshot_handle cleanup

Add comments describing struct snapshot_handle and its members, change the
confusing name of its member 'page' to 'cur'.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/power.h    | 64 ++++++++++++++++++++++++++++++++++++++++++-------
 kernel/power/snapshot.c | 40 +++++++++++++++----------------
 2 files changed, 76 insertions(+), 28 deletions(-)

diff --git a/kernel/power/power.h b/kernel/power/power.h
index 59ce712f082..1cefcf87a69 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -50,17 +50,65 @@ extern asmlinkage int swsusp_arch_resume(void);
 
 extern unsigned int count_data_pages(void);
 
+/**
+ *	Auxiliary structure used for reading the snapshot image data and
+ *	metadata from and writing them to the list of page backup entries
+ *	(PBEs) which is the main data structure of swsusp.
+ *
+ *	Using struct snapshot_handle we can transfer the image, including its
+ *	metadata, as a continuous sequence of bytes with the help of
+ *	snapshot_read_next() and snapshot_write_next().
+ *
+ *	The code that writes the image to a storage or transfers it to
+ *	the user land is required to use snapshot_read_next() for this
+ *	purpose and it should not make any assumptions regarding the internal
+ *	structure of the image.  Similarly, the code that reads the image from
+ *	a storage or transfers it from the user land is required to use
+ *	snapshot_write_next().
+ *
+ *	This may allow us to change the internal structure of the image
+ *	in the future with considerably less effort.
+ */
+
 struct snapshot_handle {
-	loff_t		offset;
-	unsigned int	page;
-	unsigned int	page_offset;
-	unsigned int	prev;
-	struct pbe	*pbe, *last_pbe;
-	void		*buffer;
-	unsigned int	buf_offset;
-	int		sync_read;
+	loff_t		offset;	/* number of the last byte ready for reading
+				 * or writing in the sequence
+				 */
+	unsigned int	cur;	/* number of the block of PAGE_SIZE bytes the
+				 * next operation will refer to (ie. current)
+				 */
+	unsigned int	cur_offset;	/* offset with respect to the current
+					 * block (for the next operation)
+					 */
+	unsigned int	prev;	/* number of the block of PAGE_SIZE bytes that
+				 * was the current one previously
+				 */
+	struct pbe	*pbe;	/* PBE that corresponds to 'buffer' */
+	struct pbe	*last_pbe;	/* When the image is restored (eg. read
+					 * from disk) we can store some image
+					 * data directly in the page frames
+					 * in which they were before suspend.
+					 * In such a case the PBEs that
+					 * correspond to them will be unused.
+					 * This is the last PBE, so far, that
+					 * does not correspond to such data.
+					 */
+	void		*buffer;	/* address of the block to read from
+					 * or write to
+					 */
+	unsigned int	buf_offset;	/* location to read from or write to,
+					 * given as a displacement from 'buffer'
+					 */
+	int		sync_read;	/* Set to one to notify the caller of
+					 * snapshot_write_next() that it may
+					 * need to call wait_on_bio_chain()
+					 */
 };
 
+/* This macro returns the address from/to which the caller of
+ * snapshot_read_next()/snapshot_write_next() is allowed to
+ * read/write data after the function returns
+ */
 #define data_of(handle)	((handle).buffer + (handle).buf_offset)
 
 extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 979096c2777..81fe8de9e60 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -555,7 +555,7 @@ static inline struct pbe *pack_orig_addresses(unsigned long *buf, struct pbe *pb
 
 int snapshot_read_next(struct snapshot_handle *handle, size_t count)
 {
-	if (handle->page > nr_meta_pages + nr_copy_pages)
+	if (handle->cur > nr_meta_pages + nr_copy_pages)
 		return 0;
 	if (!buffer) {
 		/* This makes the buffer be freed by swsusp_free() */
@@ -568,8 +568,8 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
 		handle->buffer = buffer;
 		handle->pbe = pagedir_nosave;
 	}
-	if (handle->prev < handle->page) {
-		if (handle->page <= nr_meta_pages) {
+	if (handle->prev < handle->cur) {
+		if (handle->cur <= nr_meta_pages) {
 			handle->pbe = pack_orig_addresses(buffer, handle->pbe);
 			if (!handle->pbe)
 				handle->pbe = pagedir_nosave;
@@ -577,15 +577,15 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
 			handle->buffer = (void *)handle->pbe->address;
 			handle->pbe = handle->pbe->next;
 		}
-		handle->prev = handle->page;
+		handle->prev = handle->cur;
 	}
-	handle->buf_offset = handle->page_offset;
-	if (handle->page_offset + count >= PAGE_SIZE) {
-		count = PAGE_SIZE - handle->page_offset;
-		handle->page_offset = 0;
-		handle->page++;
+	handle->buf_offset = handle->cur_offset;
+	if (handle->cur_offset + count >= PAGE_SIZE) {
+		count = PAGE_SIZE - handle->cur_offset;
+		handle->cur_offset = 0;
+		handle->cur++;
 	} else {
-		handle->page_offset += count;
+		handle->cur_offset += count;
 	}
 	handle->offset += count;
 	return count;
@@ -820,7 +820,7 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
 {
 	int error = 0;
 
-	if (handle->prev && handle->page > nr_meta_pages + nr_copy_pages)
+	if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages)
 		return 0;
 	if (!buffer) {
 		/* This makes the buffer be freed by swsusp_free() */
@@ -831,7 +831,7 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
 	if (!handle->offset)
 		handle->buffer = buffer;
 	handle->sync_read = 1;
-	if (handle->prev < handle->page) {
+	if (handle->prev < handle->cur) {
 		if (!handle->prev) {
 			error = load_header(handle,
 					(struct swsusp_info *)buffer);
@@ -854,15 +854,15 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
 			handle->buffer = get_buffer(handle);
 			handle->sync_read = 0;
 		}
-		handle->prev = handle->page;
+		handle->prev = handle->cur;
 	}
-	handle->buf_offset = handle->page_offset;
-	if (handle->page_offset + count >= PAGE_SIZE) {
-		count = PAGE_SIZE - handle->page_offset;
-		handle->page_offset = 0;
-		handle->page++;
+	handle->buf_offset = handle->cur_offset;
+	if (handle->cur_offset + count >= PAGE_SIZE) {
+		count = PAGE_SIZE - handle->cur_offset;
+		handle->cur_offset = 0;
+		handle->cur++;
 	} else {
-		handle->page_offset += count;
+		handle->cur_offset += count;
 	}
 	handle->offset += count;
 	return count;
@@ -871,5 +871,5 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
 int snapshot_image_loaded(struct snapshot_handle *handle)
 {
 	return !(!handle->pbe || handle->pbe->next || !nr_copy_pages ||
-		handle->page <= nr_meta_pages + nr_copy_pages);
+		handle->cur <= nr_meta_pages + nr_copy_pages);
 }
-- 
cgit v1.2.3


From e8eff5ac294e12531c4195e0c15a222d3c9015e5 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 25 Sep 2006 23:32:46 -0700
Subject: [PATCH] Make swsusp avoid memory holes and reserved memory regions on
 x86_64

On x86_64 machines with more than 2 GB of RAM there are large memory gaps
(with no corresponding kernel virtual addresses) and reserved memory
regions between areas of usable physical RAM.  Moreover, if CONFIG_FLATMEM
is set, they appear within the normal zone.  swsusp should not try to save
them, so the corresponding page structs have to be marked as 'nosave'.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Mel Gorman <mel@csn.ul.ie>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/e820.c  | 48 ++++++++++++++++++++++++++++++++++++++++++++++
 arch/x86_64/kernel/setup.c |  1 +
 include/asm-x86_64/e820.h  |  1 +
 3 files changed, 50 insertions(+)

diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index d6d7f731f6f..708a3cd9a27 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -16,6 +16,7 @@
 #include <linux/string.h>
 #include <linux/kexec.h>
 #include <linux/module.h>
+#include <linux/mm.h>
 
 #include <asm/pgtable.h>
 #include <asm/page.h>
@@ -297,6 +298,53 @@ void __init e820_reserve_resources(void)
 	}
 }
 
+/* Mark pages corresponding to given address range as nosave */
+static void __init
+e820_mark_nosave_range(unsigned long start, unsigned long end)
+{
+	unsigned long pfn, max_pfn;
+
+	if (start >= end)
+		return;
+
+	printk("Nosave address range: %016lx - %016lx\n", start, end);
+	max_pfn = end >> PAGE_SHIFT;
+	for (pfn = start >> PAGE_SHIFT; pfn < max_pfn; pfn++)
+		if (pfn_valid(pfn))
+			SetPageNosave(pfn_to_page(pfn));
+}
+
+/*
+ * Find the ranges of physical addresses that do not correspond to
+ * e820 RAM areas and mark the corresponding pages as nosave for software
+ * suspend and suspend to RAM.
+ *
+ * This function requires the e820 map to be sorted and without any
+ * overlapping entries and assumes the first e820 area to be RAM.
+ */
+void __init e820_mark_nosave_regions(void)
+{
+	int i;
+	unsigned long paddr;
+
+	paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE);
+	for (i = 1; i < e820.nr_map; i++) {
+		struct e820entry *ei = &e820.map[i];
+
+		if (paddr < ei->addr)
+			e820_mark_nosave_range(paddr,
+					round_up(ei->addr, PAGE_SIZE));
+
+		paddr = round_down(ei->addr + ei->size, PAGE_SIZE);
+		if (ei->type != E820_RAM)
+			e820_mark_nosave_range(round_up(ei->addr, PAGE_SIZE),
+					paddr);
+
+		if (paddr >= (end_pfn << PAGE_SHIFT))
+			break;
+	}
+}
+
 /* 
  * Add a memory region to the kernel e820 map.
  */ 
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 34afad70482..4b39f0da17f 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -689,6 +689,7 @@ void __init setup_arch(char **cmdline_p)
 	 */
 	probe_roms();
 	e820_reserve_resources(); 
+	e820_mark_nosave_regions();
 
 	request_resource(&iomem_resource, &video_ram_resource);
 
diff --git a/include/asm-x86_64/e820.h b/include/asm-x86_64/e820.h
index 670a3388e70..f6567483231 100644
--- a/include/asm-x86_64/e820.h
+++ b/include/asm-x86_64/e820.h
@@ -46,6 +46,7 @@ extern void setup_memory_region(void);
 extern void contig_e820_setup(void); 
 extern unsigned long e820_end_of_ram(void);
 extern void e820_reserve_resources(void);
+extern void e820_mark_nosave_regions(void);
 extern void e820_print_map(char *who);
 extern int e820_any_mapped(unsigned long start, unsigned long end, unsigned type);
 extern int e820_all_mapped(unsigned long start, unsigned long end, unsigned type);
-- 
cgit v1.2.3


From e3920fb42c8ddfe63befb54d95c0e13eabacea9b Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 25 Sep 2006 23:32:48 -0700
Subject: [PATCH] Disable CPU hotplug during suspend

The current suspend code has to be run on one CPU, so we use the CPU
hotplug to take the non-boot CPUs offline on SMP machines.  However, we
should also make sure that these CPUs will not be enabled by someone else
after we have disabled them.

The functions disable_nonboot_cpus() and enable_nonboot_cpus() are moved to
kernel/cpu.c, because they now refer to some stuff in there that should
better be static.  Also it's better if disable_nonboot_cpus() returns an
error instead of panicking if something goes wrong, and
enable_nonboot_cpus() has no reason to panic(), because the CPUs may have
been enabled by the userland before it tries to take them online.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/cpu.h     |   8 +++
 include/linux/suspend.h |   8 ---
 kernel/cpu.c            | 138 +++++++++++++++++++++++++++++++++++++++++-------
 kernel/power/Makefile   |   2 -
 kernel/power/disk.c     |   7 ++-
 kernel/power/main.c     |  10 ++--
 kernel/power/smp.c      |  62 ----------------------
 kernel/power/user.c     |  14 +++--
 8 files changed, 145 insertions(+), 104 deletions(-)
 delete mode 100644 kernel/power/smp.c

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 8fb344a9abd..3fef7d67aed 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -89,4 +89,12 @@ int cpu_down(unsigned int cpu);
 static inline int cpu_is_offline(int cpu) { return 0; }
 #endif
 
+#ifdef CONFIG_SUSPEND_SMP
+extern int disable_nonboot_cpus(void);
+extern void enable_nonboot_cpus(void);
+#else
+static inline int disable_nonboot_cpus(void) { return 0; }
+static inline void enable_nonboot_cpus(void) {}
+#endif
+
 #endif /* _LINUX_CPU_H_ */
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 96e31aa64cc..6e8a06c950f 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -57,14 +57,6 @@ static inline int software_suspend(void)
 }
 #endif /* CONFIG_PM */
 
-#ifdef CONFIG_SUSPEND_SMP
-extern void disable_nonboot_cpus(void);
-extern void enable_nonboot_cpus(void);
-#else
-static inline void disable_nonboot_cpus(void) {}
-static inline void enable_nonboot_cpus(void) {}
-#endif
-
 void save_processor_state(void);
 void restore_processor_state(void);
 struct saved_context;
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f230f9ae01c..32c96628463 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -21,6 +21,11 @@ static DEFINE_MUTEX(cpu_bitmask_lock);
 
 static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain);
 
+/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
+ * Should always be manipulated under cpu_add_remove_lock
+ */
+static int cpu_hotplug_disabled;
+
 #ifdef CONFIG_HOTPLUG_CPU
 
 /* Crappy recursive lock-takers in cpufreq! Complain loudly about idiots */
@@ -108,30 +113,25 @@ static int take_cpu_down(void *unused)
 	return 0;
 }
 
-int cpu_down(unsigned int cpu)
+/* Requires cpu_add_remove_lock to be held */
+static int _cpu_down(unsigned int cpu)
 {
 	int err;
 	struct task_struct *p;
 	cpumask_t old_allowed, tmp;
 
-	mutex_lock(&cpu_add_remove_lock);
-	if (num_online_cpus() == 1) {
-		err = -EBUSY;
-		goto out;
-	}
+	if (num_online_cpus() == 1)
+		return -EBUSY;
 
-	if (!cpu_online(cpu)) {
-		err = -EINVAL;
-		goto out;
-	}
+	if (!cpu_online(cpu))
+		return -EINVAL;
 
 	err = blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
 						(void *)(long)cpu);
 	if (err == NOTIFY_BAD) {
 		printk("%s: attempt to take down CPU %u failed\n",
 				__FUNCTION__, cpu);
-		err = -EINVAL;
-		goto out;
+		return -EINVAL;
 	}
 
 	/* Ensure that we are not runnable on dying cpu */
@@ -179,22 +179,32 @@ out_thread:
 	err = kthread_stop(p);
 out_allowed:
 	set_cpus_allowed(current, old_allowed);
-out:
+	return err;
+}
+
+int cpu_down(unsigned int cpu)
+{
+	int err = 0;
+
+	mutex_lock(&cpu_add_remove_lock);
+	if (cpu_hotplug_disabled)
+		err = -EBUSY;
+	else
+		err = _cpu_down(cpu);
+
 	mutex_unlock(&cpu_add_remove_lock);
 	return err;
 }
 #endif /*CONFIG_HOTPLUG_CPU*/
 
-int __devinit cpu_up(unsigned int cpu)
+/* Requires cpu_add_remove_lock to be held */
+static int __devinit _cpu_up(unsigned int cpu)
 {
 	int ret;
 	void *hcpu = (void *)(long)cpu;
 
-	mutex_lock(&cpu_add_remove_lock);
-	if (cpu_online(cpu) || !cpu_present(cpu)) {
-		ret = -EINVAL;
-		goto out;
-	}
+	if (cpu_online(cpu) || !cpu_present(cpu))
+		return -EINVAL;
 
 	ret = blocking_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
 	if (ret == NOTIFY_BAD) {
@@ -219,7 +229,95 @@ out_notify:
 	if (ret != 0)
 		blocking_notifier_call_chain(&cpu_chain,
 				CPU_UP_CANCELED, hcpu);
+
+	return ret;
+}
+
+int __devinit cpu_up(unsigned int cpu)
+{
+	int err = 0;
+
+	mutex_lock(&cpu_add_remove_lock);
+	if (cpu_hotplug_disabled)
+		err = -EBUSY;
+	else
+		err = _cpu_up(cpu);
+
+	mutex_unlock(&cpu_add_remove_lock);
+	return err;
+}
+
+#ifdef CONFIG_SUSPEND_SMP
+static cpumask_t frozen_cpus;
+
+int disable_nonboot_cpus(void)
+{
+	int cpu, first_cpu, error;
+
+	mutex_lock(&cpu_add_remove_lock);
+	first_cpu = first_cpu(cpu_present_map);
+	if (!cpu_online(first_cpu)) {
+		error = _cpu_up(first_cpu);
+		if (error) {
+			printk(KERN_ERR "Could not bring CPU%d up.\n",
+				first_cpu);
+			goto out;
+		}
+	}
+	error = set_cpus_allowed(current, cpumask_of_cpu(first_cpu));
+	if (error) {
+		printk(KERN_ERR "Could not run on CPU%d\n", first_cpu);
+		goto out;
+	}
+	/* We take down all of the non-boot CPUs in one shot to avoid races
+	 * with the userspace trying to use the CPU hotplug at the same time
+	 */
+	cpus_clear(frozen_cpus);
+	printk("Disabling non-boot CPUs ...\n");
+	for_each_online_cpu(cpu) {
+		if (cpu == first_cpu)
+			continue;
+		error = _cpu_down(cpu);
+		if (!error) {
+			cpu_set(cpu, frozen_cpus);
+			printk("CPU%d is down\n", cpu);
+		} else {
+			printk(KERN_ERR "Error taking CPU%d down: %d\n",
+				cpu, error);
+			break;
+		}
+	}
+	if (!error) {
+		BUG_ON(num_online_cpus() > 1);
+		/* Make sure the CPUs won't be enabled by someone else */
+		cpu_hotplug_disabled = 1;
+	} else {
+		printk(KERN_ERR "Non-boot CPUs are not disabled");
+	}
 out:
 	mutex_unlock(&cpu_add_remove_lock);
-	return ret;
+	return error;
+}
+
+void enable_nonboot_cpus(void)
+{
+	int cpu, error;
+
+	/* Allow everyone to use the CPU hotplug again */
+	mutex_lock(&cpu_add_remove_lock);
+	cpu_hotplug_disabled = 0;
+	mutex_unlock(&cpu_add_remove_lock);
+
+	printk("Enabling non-boot CPUs ...\n");
+	for_each_cpu_mask(cpu, frozen_cpus) {
+		error = cpu_up(cpu);
+		if (!error) {
+			printk("CPU%d is up\n", cpu);
+			continue;
+		}
+		printk(KERN_WARNING "Error taking CPU%d up: %d\n",
+			cpu, error);
+	}
+	cpus_clear(frozen_cpus);
 }
+#endif
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 8d0af3d37a4..38725f526af 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -7,6 +7,4 @@ obj-y				:= main.o process.o console.o
 obj-$(CONFIG_PM_LEGACY)		+= pm.o
 obj-$(CONFIG_SOFTWARE_SUSPEND)	+= swsusp.o disk.o snapshot.o swap.o user.o
 
-obj-$(CONFIG_SUSPEND_SMP)	+= smp.o
-
 obj-$(CONFIG_MAGIC_SYSRQ)	+= poweroff.o
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index e13e7406784..7c7b9b65e36 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -18,6 +18,7 @@
 #include <linux/fs.h>
 #include <linux/mount.h>
 #include <linux/pm.h>
+#include <linux/cpu.h>
 
 #include "power.h"
 
@@ -72,7 +73,10 @@ static int prepare_processes(void)
 	int error;
 
 	pm_prepare_console();
-	disable_nonboot_cpus();
+
+	error = disable_nonboot_cpus();
+	if (error)
+		goto enable_cpus;
 
 	if (freeze_processes()) {
 		error = -EBUSY;
@@ -84,6 +88,7 @@ static int prepare_processes(void)
 		return 0;
 thaw:
 	thaw_processes();
+enable_cpus:
 	enable_nonboot_cpus();
 	pm_restore_console();
 	return error;
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 6d295c77679..4d403323a7b 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -16,6 +16,7 @@
 #include <linux/init.h>
 #include <linux/pm.h>
 #include <linux/console.h>
+#include <linux/cpu.h>
 
 #include "power.h"
 
@@ -51,7 +52,7 @@ void pm_set_ops(struct pm_ops * ops)
 
 static int suspend_prepare(suspend_state_t state)
 {
-	int error = 0;
+	int error;
 	unsigned int free_pages;
 
 	if (!pm_ops || !pm_ops->enter)
@@ -59,12 +60,9 @@ static int suspend_prepare(suspend_state_t state)
 
 	pm_prepare_console();
 
-	disable_nonboot_cpus();
-
-	if (num_online_cpus() != 1) {
-		error = -EPERM;
+	error = disable_nonboot_cpus();
+	if (error)
 		goto Enable_cpu;
-	}
 
 	if (freeze_processes()) {
 		error = -EAGAIN;
diff --git a/kernel/power/smp.c b/kernel/power/smp.c
deleted file mode 100644
index 5957312b2d6..00000000000
--- a/kernel/power/smp.c
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * drivers/power/smp.c - Functions for stopping other CPUs.
- *
- * Copyright 2004 Pavel Machek <pavel@suse.cz>
- * Copyright (C) 2002-2003 Nigel Cunningham <ncunningham@clear.net.nz>
- *
- * This file is released under the GPLv2.
- */
-
-#undef DEBUG
-
-#include <linux/smp_lock.h>
-#include <linux/interrupt.h>
-#include <linux/suspend.h>
-#include <linux/module.h>
-#include <linux/cpu.h>
-#include <asm/atomic.h>
-#include <asm/tlbflush.h>
-
-/* This is protected by pm_sem semaphore */
-static cpumask_t frozen_cpus;
-
-void disable_nonboot_cpus(void)
-{
-	int cpu, error;
-
-	error = 0;
-	cpus_clear(frozen_cpus);
-	printk("Freezing cpus ...\n");
-	for_each_online_cpu(cpu) {
-		if (cpu == 0)
-			continue;
-		error = cpu_down(cpu);
-		if (!error) {
-			cpu_set(cpu, frozen_cpus);
-			printk("CPU%d is down\n", cpu);
-			continue;
-		}
-		printk("Error taking cpu %d down: %d\n", cpu, error);
-	}
-	BUG_ON(raw_smp_processor_id() != 0);
-	if (error)
-		panic("cpus not sleeping");
-}
-
-void enable_nonboot_cpus(void)
-{
-	int cpu, error;
-
-	printk("Thawing cpus ...\n");
-	for_each_cpu_mask(cpu, frozen_cpus) {
-		error = cpu_up(cpu);
-		if (!error) {
-			printk("CPU%d is up\n", cpu);
-			continue;
-		}
-		printk("Error taking cpu %d up: %d\n", cpu, error);
-		panic("Not enough cpus");
-	}
-	cpus_clear(frozen_cpus);
-}
-
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 3f1539fbe48..0ef5e4ba39e 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -19,6 +19,7 @@
 #include <linux/swapops.h>
 #include <linux/pm.h>
 #include <linux/fs.h>
+#include <linux/cpu.h>
 
 #include <asm/uaccess.h>
 
@@ -139,12 +140,15 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 		if (data->frozen)
 			break;
 		down(&pm_sem);
-		disable_nonboot_cpus();
-		if (freeze_processes()) {
-			thaw_processes();
-			enable_nonboot_cpus();
-			error = -EBUSY;
+		error = disable_nonboot_cpus();
+		if (!error) {
+			error = freeze_processes();
+			if (error) {
+				thaw_processes();
+				error = -EBUSY;
+			}
 		}
+		enable_nonboot_cpus();
 		up(&pm_sem);
 		if (!error)
 			data->frozen = 1;
-- 
cgit v1.2.3


From f623f0db8e6aa86a37be86167e4ff478821a9f4f Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 25 Sep 2006 23:32:49 -0700
Subject: [PATCH] swsusp: Fix mark_free_pages

Clean up mm/page_alloc.c#mark_free_pages() and make it avoid clearing
PageNosaveFree for PageNosave pages.  This allows us to get rid of an ugly
hack in kernel/power/snapshot.c#copy_data_pages().

Additionally, the page-copying loop in copy_data_pages() is moved to an
inline function.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/snapshot.c | 27 +++++++++++++--------------
 mm/page_alloc.c         | 24 ++++++++++++++++--------
 2 files changed, 29 insertions(+), 22 deletions(-)

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 81fe8de9e60..4afb7900002 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -208,37 +208,36 @@ unsigned int count_data_pages(void)
 	return n;
 }
 
+static inline void copy_data_page(long *dst, long *src)
+{
+	int n;
+
+	/* copy_page and memcpy are not usable for copying task structs. */
+	for (n = PAGE_SIZE / sizeof(long); n; n--)
+		*dst++ = *src++;
+}
+
 static void copy_data_pages(struct pbe *pblist)
 {
 	struct zone *zone;
 	unsigned long pfn, max_zone_pfn;
-	struct pbe *pbe, *p;
+	struct pbe *pbe;
 
 	pbe = pblist;
 	for_each_zone (zone) {
 		if (is_highmem(zone))
 			continue;
 		mark_free_pages(zone);
-		/* This is necessary for swsusp_free() */
-		for_each_pb_page (p, pblist)
-			SetPageNosaveFree(virt_to_page(p));
-		for_each_pbe (p, pblist)
-			SetPageNosaveFree(virt_to_page(p->address));
 		max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
 		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
 			struct page *page = saveable_page(pfn);
 
 			if (page) {
-				long *src, *dst;
-				int n;
+				void *ptr = page_address(page);
 
 				BUG_ON(!pbe);
-				pbe->orig_address = (unsigned long)page_address(page);
-				/* copy_page and memcpy are not usable for copying task structs. */
-				dst = (long *)pbe->address;
-				src = (long *)pbe->orig_address;
-				for (n = PAGE_SIZE / sizeof(long); n; n--)
-					*dst++ = *src++;
+				copy_data_page((void *)pbe->address, ptr);
+				pbe->orig_address = (unsigned long)ptr;
 				pbe = pbe->next;
 			}
 		}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 51070b6d593..9810f0a60db 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -694,7 +694,8 @@ static void __drain_pages(unsigned int cpu)
 
 void mark_free_pages(struct zone *zone)
 {
-	unsigned long zone_pfn, flags;
+	unsigned long pfn, max_zone_pfn;
+	unsigned long flags;
 	int order;
 	struct list_head *curr;
 
@@ -702,18 +703,25 @@ void mark_free_pages(struct zone *zone)
 		return;
 
 	spin_lock_irqsave(&zone->lock, flags);
-	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
-		ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn));
+
+	max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
+	for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
+		if (pfn_valid(pfn)) {
+			struct page *page = pfn_to_page(pfn);
+
+			if (!PageNosave(page))
+				ClearPageNosaveFree(page);
+		}
 
 	for (order = MAX_ORDER - 1; order >= 0; --order)
 		list_for_each(curr, &zone->free_area[order].free_list) {
-			unsigned long start_pfn, i;
+			unsigned long i;
 
-			start_pfn = page_to_pfn(list_entry(curr, struct page, lru));
+			pfn = page_to_pfn(list_entry(curr, struct page, lru));
+			for (i = 0; i < (1UL << order); i++)
+				SetPageNosaveFree(pfn_to_page(pfn + i));
+		}
 
-			for (i=0; i < (1<<order); i++)
-				SetPageNosaveFree(pfn_to_page(start_pfn+i));
-	}
 	spin_unlock_irqrestore(&zone->lock, flags);
 }
 
-- 
cgit v1.2.3


From f6143aa60ed71e58578bc92cc64d98158a694d99 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 25 Sep 2006 23:32:50 -0700
Subject: [PATCH] swsusp: Reorder memory-allocating functions

Move some functions in kernel/power/snapshot.c to a better place (in the
same file) and introduce free_image_page() (will be necessary in the
future).

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/snapshot.c | 93 ++++++++++++++++++++++++++++---------------------
 1 file changed, 53 insertions(+), 40 deletions(-)

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 4afb7900002..7ad0c046552 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -156,6 +156,58 @@ static inline int save_highmem(void) {return 0;}
 static inline int restore_highmem(void) {return 0;}
 #endif
 
+/**
+ *	@safe_needed - on resume, for storing the PBE list and the image,
+ *	we can only use memory pages that do not conflict with the pages
+ *	used before suspend.
+ *
+ *	The unsafe pages are marked with the PG_nosave_free flag
+ *	and we count them using unsafe_pages
+ */
+
+static unsigned int unsafe_pages;
+
+static void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
+{
+	void *res;
+
+	res = (void *)get_zeroed_page(gfp_mask);
+	if (safe_needed)
+		while (res && PageNosaveFree(virt_to_page(res))) {
+			/* The page is unsafe, mark it for swsusp_free() */
+			SetPageNosave(virt_to_page(res));
+			unsafe_pages++;
+			res = (void *)get_zeroed_page(gfp_mask);
+		}
+	if (res) {
+		SetPageNosave(virt_to_page(res));
+		SetPageNosaveFree(virt_to_page(res));
+	}
+	return res;
+}
+
+unsigned long get_safe_page(gfp_t gfp_mask)
+{
+	return (unsigned long)alloc_image_page(gfp_mask, 1);
+}
+
+/**
+ *	free_image_page - free page represented by @addr, allocated with
+ *	alloc_image_page (page flags set by it must be cleared)
+ */
+
+static inline void free_image_page(void *addr, int clear_nosave_free)
+{
+	ClearPageNosave(virt_to_page(addr));
+	if (clear_nosave_free)
+		ClearPageNosaveFree(virt_to_page(addr));
+	free_page((unsigned long)addr);
+}
+
+/**
+ *	pfn_is_nosave - check if given pfn is in the 'nosave' section
+ */
+
 static inline int pfn_is_nosave(unsigned long pfn)
 {
 	unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
@@ -245,7 +297,6 @@ static void copy_data_pages(struct pbe *pblist)
 	BUG_ON(pbe);
 }
 
-
 /**
  *	free_pagedir - free pages allocated with alloc_pagedir()
  */
@@ -256,10 +307,7 @@ static void free_pagedir(struct pbe *pblist, int clear_nosave_free)
 
 	while (pblist) {
 		pbe = (pblist + PB_PAGE_SKIP)->next;
-		ClearPageNosave(virt_to_page(pblist));
-		if (clear_nosave_free)
-			ClearPageNosaveFree(virt_to_page(pblist));
-		free_page((unsigned long)pblist);
+		free_image_page(pblist, clear_nosave_free);
 		pblist = pbe;
 	}
 }
@@ -303,41 +351,6 @@ static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages)
 	}
 }
 
-static unsigned int unsafe_pages;
-
-/**
- *	@safe_needed - on resume, for storing the PBE list and the image,
- *	we can only use memory pages that do not conflict with the pages
- *	used before suspend.
- *
- *	The unsafe pages are marked with the PG_nosave_free flag
- *	and we count them using unsafe_pages
- */
-
-static void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
-{
-	void *res;
-
-	res = (void *)get_zeroed_page(gfp_mask);
-	if (safe_needed)
-		while (res && PageNosaveFree(virt_to_page(res))) {
-			/* The page is unsafe, mark it for swsusp_free() */
-			SetPageNosave(virt_to_page(res));
-			unsafe_pages++;
-			res = (void *)get_zeroed_page(gfp_mask);
-		}
-	if (res) {
-		SetPageNosave(virt_to_page(res));
-		SetPageNosaveFree(virt_to_page(res));
-	}
-	return res;
-}
-
-unsigned long get_safe_page(gfp_t gfp_mask)
-{
-	return (unsigned long)alloc_image_page(gfp_mask, 1);
-}
-
 /**
  *	alloc_pagedir - Allocate the page directory.
  *
-- 
cgit v1.2.3


From cd560bb2f9e2cd451bb3942af43da19632ba4a8e Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 25 Sep 2006 23:32:50 -0700
Subject: [PATCH] swsusp: Fix alloc_pagedir

Get rid of the FIXME in kernel/power/snapshot.c#alloc_pagedir() and
simplify the functions called by it.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/snapshot.c | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 7ad0c046552..4ca372f2bc1 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -316,12 +316,12 @@ static void free_pagedir(struct pbe *pblist, int clear_nosave_free)
  *	fill_pb_page - Create a list of PBEs on a given memory page
  */
 
-static inline void fill_pb_page(struct pbe *pbpage)
+static inline void fill_pb_page(struct pbe *pbpage, unsigned int n)
 {
 	struct pbe *p;
 
 	p = pbpage;
-	pbpage += PB_PAGE_SKIP;
+	pbpage += n - 1;
 	do
 		p->next = p + 1;
 	while (++p < pbpage);
@@ -330,24 +330,26 @@ static inline void fill_pb_page(struct pbe *pbpage)
 /**
  *	create_pbe_list - Create a list of PBEs on top of a given chain
  *	of memory pages allocated with alloc_pagedir()
+ *
+ *	This function assumes that pages allocated by alloc_image_page() will
+ *	always be zeroed.
  */
 
 static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages)
 {
-	struct pbe *pbpage, *p;
+	struct pbe *pbpage;
 	unsigned int num = PBES_PER_PAGE;
 
 	for_each_pb_page (pbpage, pblist) {
 		if (num >= nr_pages)
 			break;
 
-		fill_pb_page(pbpage);
+		fill_pb_page(pbpage, PBES_PER_PAGE);
 		num += PBES_PER_PAGE;
 	}
 	if (pbpage) {
-		for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++)
-			p->next = p + 1;
-		p->next = NULL;
+		num -= PBES_PER_PAGE;
+		fill_pb_page(pbpage, nr_pages - num);
 	}
 }
 
@@ -374,17 +376,17 @@ static struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask,
 		return NULL;
 
 	pblist = alloc_image_page(gfp_mask, safe_needed);
-	/* FIXME: rewrite this ugly loop */
-	for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages;
-        		pbe = pbe->next, num += PBES_PER_PAGE) {
+	pbe = pblist;
+	for (num = PBES_PER_PAGE; num < nr_pages; num += PBES_PER_PAGE) {
+		if (!pbe) {
+			free_pagedir(pblist, 1);
+			return NULL;
+		}
 		pbe += PB_PAGE_SKIP;
 		pbe->next = alloc_image_page(gfp_mask, safe_needed);
+		pbe = pbe->next;
 	}
-	if (!pbe) { /* get_zeroed_page() failed */
-		free_pagedir(pblist, 1);
-		pblist = NULL;
-        } else
-		create_pbe_list(pblist, nr_pages);
+	create_pbe_list(pblist, nr_pages);
 	return pblist;
 }
 
-- 
cgit v1.2.3


From dcbb5a54f6e3984efa24772394f2225b11495c55 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 25 Sep 2006 23:32:51 -0700
Subject: [PATCH] swsusp: clean up suspend header

Remove some things that are no longer used or defined elsewhere from suspend.h
and make the inline version of software_suspend() return the right error code.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/suspend.h | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 6e8a06c950f..c11cacf1a13 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -10,11 +10,11 @@
 #include <linux/pm.h>
 
 /* page backup entry */
-typedef struct pbe {
+struct pbe {
 	unsigned long address;		/* address of the copy */
 	unsigned long orig_address;	/* original address of page */
 	struct pbe *next;
-} suspend_pagedir_t;
+};
 
 #define for_each_pbe(pbe, pblist) \
 	for (pbe = pblist ; pbe ; pbe = pbe->next)
@@ -25,15 +25,6 @@ typedef struct pbe {
 #define for_each_pb_page(pbe, pblist) \
 	for (pbe = pblist ; pbe ; pbe = (pbe+PB_PAGE_SKIP)->next)
 
-
-#define SWAP_FILENAME_MAXLENGTH	32
-
-
-extern dev_t swsusp_resume_device;
-   
-/* mm/vmscan.c */
-extern int shrink_mem(void);
-
 /* mm/page_alloc.c */
 extern void drain_local_pages(void);
 extern void mark_free_pages(struct zone *zone);
@@ -53,7 +44,7 @@ static inline void pm_restore_console(void) {}
 static inline int software_suspend(void)
 {
 	printk("Warning: fake suspend called\n");
-	return -EPERM;
+	return -ENOSYS;
 }
 #endif /* CONFIG_PM */
 
-- 
cgit v1.2.3


From 75534b50cc658e951bcb213c2763c81e9f7b0b48 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 25 Sep 2006 23:32:52 -0700
Subject: [PATCH] Change the name of pagedir_nosave

The name of the pagedir_nosave variable does not make sense any more, so it
seems reasonable to change it to something more meaningful.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/power/swsusp.S         |  2 +-
 arch/powerpc/kernel/swsusp_32.S  |  4 ++--
 arch/x86_64/kernel/suspend_asm.S |  2 +-
 kernel/power/power.h             |  2 --
 kernel/power/snapshot.c          | 26 ++++++++++++++------------
 5 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/arch/i386/power/swsusp.S b/arch/i386/power/swsusp.S
index c893b897217..8a2b50a0aaa 100644
--- a/arch/i386/power/swsusp.S
+++ b/arch/i386/power/swsusp.S
@@ -32,7 +32,7 @@ ENTRY(swsusp_arch_resume)
 	movl	$swsusp_pg_dir-__PAGE_OFFSET, %ecx
 	movl	%ecx, %cr3
 
-	movl	pagedir_nosave, %edx
+	movl	restore_pblist, %edx
 	.p2align 4,,7
 
 copy_loop:
diff --git a/arch/powerpc/kernel/swsusp_32.S b/arch/powerpc/kernel/swsusp_32.S
index 7369f9a6ad2..69e8f86aa4f 100644
--- a/arch/powerpc/kernel/swsusp_32.S
+++ b/arch/powerpc/kernel/swsusp_32.S
@@ -159,8 +159,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
 	isync
 
 	/* Load ptr the list of pages to copy in r3 */
-	lis	r11,(pagedir_nosave - KERNELBASE)@h
-	ori	r11,r11,pagedir_nosave@l
+	lis	r11,(restore_pblist - KERNELBASE)@h
+	ori	r11,r11,restore_pblist@l
 	lwz	r10,0(r11)
 
 	/* Copy the pages. This is a very basic implementation, to
diff --git a/arch/x86_64/kernel/suspend_asm.S b/arch/x86_64/kernel/suspend_asm.S
index 320b6fb00cc..bfbe00763c6 100644
--- a/arch/x86_64/kernel/suspend_asm.S
+++ b/arch/x86_64/kernel/suspend_asm.S
@@ -54,7 +54,7 @@ ENTRY(restore_image)
 	movq	%rcx, %cr3;
 	movq	%rax, %cr4;  # turn PGE back on
 
-	movq	pagedir_nosave(%rip), %rdx
+	movq	restore_pblist(%rip), %rdx
 loop:
 	testq	%rdx, %rdx
 	jz	done
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 1cefcf87a69..e18ba207e78 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -38,8 +38,6 @@ extern struct subsystem power_subsys;
 /* References to section boundaries */
 extern const void __nosave_begin, __nosave_end;
 
-extern struct pbe *pagedir_nosave;
-
 /* Preferred image size in bytes (default 500 MB) */
 extern unsigned long image_size;
 extern int in_suspend;
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 4ca372f2bc1..1d276b3ae15 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -34,7 +34,9 @@
 
 #include "power.h"
 
-struct pbe *pagedir_nosave;
+/* List of PBEs used for creating and restoring the suspend image */
+struct pbe *restore_pblist;
+
 static unsigned int nr_copy_pages;
 static unsigned int nr_meta_pages;
 static unsigned long *buffer;
@@ -415,7 +417,7 @@ void swsusp_free(void)
 	}
 	nr_copy_pages = 0;
 	nr_meta_pages = 0;
-	pagedir_nosave = NULL;
+	restore_pblist = NULL;
 	buffer = NULL;
 }
 
@@ -490,15 +492,15 @@ asmlinkage int swsusp_save(void)
 		return -ENOMEM;
 	}
 
-	pagedir_nosave = swsusp_alloc(nr_pages);
-	if (!pagedir_nosave)
+	restore_pblist = swsusp_alloc(nr_pages);
+	if (!restore_pblist)
 		return -ENOMEM;
 
 	/* During allocating of suspend pagedir, new cold pages may appear.
 	 * Kill them.
 	 */
 	drain_local_pages();
-	copy_data_pages(pagedir_nosave);
+	copy_data_pages(restore_pblist);
 
 	/*
 	 * End of critical section. From now on, we can write to memory,
@@ -580,13 +582,13 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
 	if (!handle->offset) {
 		init_header((struct swsusp_info *)buffer);
 		handle->buffer = buffer;
-		handle->pbe = pagedir_nosave;
+		handle->pbe = restore_pblist;
 	}
 	if (handle->prev < handle->cur) {
 		if (handle->cur <= nr_meta_pages) {
 			handle->pbe = pack_orig_addresses(buffer, handle->pbe);
 			if (!handle->pbe)
-				handle->pbe = pagedir_nosave;
+				handle->pbe = restore_pblist;
 		} else {
 			handle->buffer = (void *)handle->pbe->address;
 			handle->pbe = handle->pbe->next;
@@ -689,7 +691,7 @@ static int load_header(struct snapshot_handle *handle,
 		pblist = alloc_pagedir(info->image_pages, GFP_ATOMIC, 0);
 		if (!pblist)
 			return -ENOMEM;
-		pagedir_nosave = pblist;
+		restore_pblist = pblist;
 		handle->pbe = pblist;
 		nr_copy_pages = info->image_pages;
 		nr_meta_pages = info->pages - info->image_pages - 1;
@@ -716,7 +718,7 @@ static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
 
 /**
  *	prepare_image - use metadata contained in the PBE list
- *	pointed to by pagedir_nosave to mark the pages that will
+ *	pointed to by restore_pblist to mark the pages that will
  *	be overwritten in the process of restoring the system
  *	memory state from the image ("unsafe" pages) and allocate
  *	memory for the image
@@ -741,7 +743,7 @@ static int prepare_image(struct snapshot_handle *handle)
 	unsigned int nr_pages = nr_copy_pages;
 	struct pbe *p, *pblist = NULL;
 
-	p = pagedir_nosave;
+	p = restore_pblist;
 	error = mark_unsafe_pages(p);
 	if (!error) {
 		pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1);
@@ -773,7 +775,7 @@ static int prepare_image(struct snapshot_handle *handle)
 		}
 	}
 	if (!error) {
-		pagedir_nosave = pblist;
+		restore_pblist = pblist;
 	} else {
 		handle->pbe = NULL;
 		swsusp_free();
@@ -858,7 +860,7 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
 				error = prepare_image(handle);
 				if (error)
 					return error;
-				handle->pbe = pagedir_nosave;
+				handle->pbe = restore_pblist;
 				handle->last_pbe = NULL;
 				handle->buffer = get_buffer(handle);
 				handle->sync_read = 0;
-- 
cgit v1.2.3


From 0bcd888d64684f896ffa70c1d16a42b00753c184 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 25 Sep 2006 23:32:52 -0700
Subject: [PATCH] swsusp: Introduce some helpful constants

Introduce some constants that hopefully will help improve the readability of
code in kernel/power/snapshot.c.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/snapshot.c | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 1d276b3ae15..d0d691f976d 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -167,6 +167,11 @@ static inline int restore_highmem(void) {return 0;}
  *	and we count them using unsafe_pages
  */
 
+#define PG_ANY		0
+#define PG_SAFE		1
+#define PG_UNSAFE_CLEAR	1
+#define PG_UNSAFE_KEEP	0
+
 static unsigned int unsafe_pages;
 
 static void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
@@ -190,7 +195,7 @@ static void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
 
 unsigned long get_safe_page(gfp_t gfp_mask)
 {
-	return (unsigned long)alloc_image_page(gfp_mask, 1);
+	return (unsigned long)alloc_image_page(gfp_mask, PG_SAFE);
 }
 
 /**
@@ -381,7 +386,7 @@ static struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask,
 	pbe = pblist;
 	for (num = PBES_PER_PAGE; num < nr_pages; num += PBES_PER_PAGE) {
 		if (!pbe) {
-			free_pagedir(pblist, 1);
+			free_pagedir(pblist, PG_UNSAFE_CLEAR);
 			return NULL;
 		}
 		pbe += PB_PAGE_SKIP;
@@ -458,12 +463,13 @@ static struct pbe *swsusp_alloc(unsigned int nr_pages)
 {
 	struct pbe *pblist;
 
-	if (!(pblist = alloc_pagedir(nr_pages, GFP_ATOMIC | __GFP_COLD, 0))) {
+	pblist = alloc_pagedir(nr_pages, GFP_ATOMIC | __GFP_COLD, PG_ANY);
+	if (!pblist) {
 		printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
 		return NULL;
 	}
 
-	if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, 0)) {
+	if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, PG_ANY)) {
 		printk(KERN_ERR "suspend: Allocating image pages failed.\n");
 		swsusp_free();
 		return NULL;
@@ -575,7 +581,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
 		return 0;
 	if (!buffer) {
 		/* This makes the buffer be freed by swsusp_free() */
-		buffer = alloc_image_page(GFP_ATOMIC, 0);
+		buffer = alloc_image_page(GFP_ATOMIC, PG_ANY);
 		if (!buffer)
 			return -ENOMEM;
 	}
@@ -688,7 +694,7 @@ static int load_header(struct snapshot_handle *handle,
 
 	error = check_header(info);
 	if (!error) {
-		pblist = alloc_pagedir(info->image_pages, GFP_ATOMIC, 0);
+		pblist = alloc_pagedir(info->image_pages, GFP_ATOMIC, PG_ANY);
 		if (!pblist)
 			return -ENOMEM;
 		restore_pblist = pblist;
@@ -746,10 +752,10 @@ static int prepare_image(struct snapshot_handle *handle)
 	p = restore_pblist;
 	error = mark_unsafe_pages(p);
 	if (!error) {
-		pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1);
+		pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, PG_SAFE);
 		if (pblist)
 			copy_page_backup_list(pblist, p);
-		free_pagedir(p, 0);
+		free_pagedir(p, PG_UNSAFE_KEEP);
 		if (!pblist)
 			error = -ENOMEM;
 	}
@@ -840,7 +846,7 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
 		return 0;
 	if (!buffer) {
 		/* This makes the buffer be freed by swsusp_free() */
-		buffer = alloc_image_page(GFP_ATOMIC, 0);
+		buffer = alloc_image_page(GFP_ATOMIC, PG_ANY);
 		if (!buffer)
 			return -ENOMEM;
 	}
-- 
cgit v1.2.3


From b788db79896ef2a5817b9395ad63573b254a6d93 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 25 Sep 2006 23:32:54 -0700
Subject: [PATCH] swsusp: Introduce memory bitmaps

Introduce the memory bitmap data structure and make swsusp use in the suspend
phase.

The current swsusp's internal data structure is not very efficient from the
memory usage point of view, so it seems reasonable to replace it with a data
structure that will require less memory, such as a pair of bitmaps.

The idea is to use bitmaps that may be allocated as sets of individual pages,
so that we can avoid making allocations of order greater than 0.  For this
reason the memory bitmap structure consists of several linked lists of objects
that contain pointers to memory pages with the actual bitmap data.  Still, for
a typical system all of these lists fit in a single page, so it's reasonable
to introduce an additional mechanism allowing us to allocate all of them
efficiently without sacrificing the generality of the design.  This is done
with the help of the chain_allocator structure and associated functions.

We need to use two memory bitmaps during the suspend phase of the
suspend-resume cycle.  One of them is necessary for marking the saveable
pages, and the second is used to mark the pages in which to store the copies
of them (aka image pages).

First, the bitmaps are created and we allocate as many image pages as needed
(the corresponding bits in the second bitmap are set as soon as the pages are
allocated).  Second, the bits corresponding to the saveable pages are set in
the first bitmap and the saveable pages are copied to the image pages.
Finally, the first bitmap is used to save the kernel virtual addresses of the
saveable pages and the second one is used to save the contents of the image
pages.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/power.h    |   3 +-
 kernel/power/snapshot.c | 602 ++++++++++++++++++++++++++++++++++++++++++------
 kernel/power/swsusp.c   |   5 +-
 3 files changed, 540 insertions(+), 70 deletions(-)

diff --git a/kernel/power/power.h b/kernel/power/power.h
index e18ba207e78..6e9e2acc34f 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -109,9 +109,10 @@ struct snapshot_handle {
  */
 #define data_of(handle)	((handle).buffer + (handle).buf_offset)
 
+extern unsigned int snapshot_additional_pages(struct zone *zone);
 extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
 extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
-int snapshot_image_loaded(struct snapshot_handle *handle);
+extern int snapshot_image_loaded(struct snapshot_handle *handle);
 
 #define SNAPSHOT_IOC_MAGIC	'3'
 #define SNAPSHOT_FREEZE			_IO(SNAPSHOT_IOC_MAGIC, 1)
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index d0d691f976d..852e0df4171 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -211,6 +211,467 @@ static inline void free_image_page(void *addr, int clear_nosave_free)
 	free_page((unsigned long)addr);
 }
 
+/* struct linked_page is used to build chains of pages */
+
+#define LINKED_PAGE_DATA_SIZE	(PAGE_SIZE - sizeof(void *))
+
+struct linked_page {
+	struct linked_page *next;
+	char data[LINKED_PAGE_DATA_SIZE];
+} __attribute__((packed));
+
+static inline void
+free_list_of_pages(struct linked_page *list, int clear_page_nosave)
+{
+	while (list) {
+		struct linked_page *lp = list->next;
+
+		free_image_page(list, clear_page_nosave);
+		list = lp;
+	}
+}
+
+/**
+  *	struct chain_allocator is used for allocating small objects out of
+  *	a linked list of pages called 'the chain'.
+  *
+  *	The chain grows each time when there is no room for a new object in
+  *	the current page.  The allocated objects cannot be freed individually.
+  *	It is only possible to free them all at once, by freeing the entire
+  *	chain.
+  *
+  *	NOTE: The chain allocator may be inefficient if the allocated objects
+  *	are not much smaller than PAGE_SIZE.
+  */
+
+struct chain_allocator {
+	struct linked_page *chain;	/* the chain */
+	unsigned int used_space;	/* total size of objects allocated out
+					 * of the current page
+					 */
+	gfp_t gfp_mask;		/* mask for allocating pages */
+	int safe_needed;	/* if set, only "safe" pages are allocated */
+};
+
+static void
+chain_init(struct chain_allocator *ca, gfp_t gfp_mask, int safe_needed)
+{
+	ca->chain = NULL;
+	ca->used_space = LINKED_PAGE_DATA_SIZE;
+	ca->gfp_mask = gfp_mask;
+	ca->safe_needed = safe_needed;
+}
+
+static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
+{
+	void *ret;
+
+	if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) {
+		struct linked_page *lp;
+
+		lp = alloc_image_page(ca->gfp_mask, ca->safe_needed);
+		if (!lp)
+			return NULL;
+
+		lp->next = ca->chain;
+		ca->chain = lp;
+		ca->used_space = 0;
+	}
+	ret = ca->chain->data + ca->used_space;
+	ca->used_space += size;
+	return ret;
+}
+
+static void chain_free(struct chain_allocator *ca, int clear_page_nosave)
+{
+	free_list_of_pages(ca->chain, clear_page_nosave);
+	memset(ca, 0, sizeof(struct chain_allocator));
+}
+
+/**
+ *	Data types related to memory bitmaps.
+ *
+ *	Memory bitmap is a structure consiting of many linked lists of
+ *	objects.  The main list's elements are of type struct zone_bitmap
+ *	and each of them corresonds to one zone.  For each zone bitmap
+ *	object there is a list of objects of type struct bm_block that
+ *	represent each blocks of bit chunks in which information is
+ *	stored.
+ *
+ *	struct memory_bitmap contains a pointer to the main list of zone
+ *	bitmap objects, a struct bm_position used for browsing the bitmap,
+ *	and a pointer to the list of pages used for allocating all of the
+ *	zone bitmap objects and bitmap block objects.
+ *
+ *	NOTE: It has to be possible to lay out the bitmap in memory
+ *	using only allocations of order 0.  Additionally, the bitmap is
+ *	designed to work with arbitrary number of zones (this is over the
+ *	top for now, but let's avoid making unnecessary assumptions ;-).
+ *
+ *	struct zone_bitmap contains a pointer to a list of bitmap block
+ *	objects and a pointer to the bitmap block object that has been
+ *	most recently used for setting bits.  Additionally, it contains the
+ *	pfns that correspond to the start and end of the represented zone.
+ *
+ *	struct bm_block contains a pointer to the memory page in which
+ *	information is stored (in the form of a block of bit chunks
+ *	of type unsigned long each).  It also contains the pfns that
+ *	correspond to the start and end of the represented memory area and
+ *	the number of bit chunks in the block.
+ *
+ *	NOTE: Memory bitmaps are used for two types of operations only:
+ *	"set a bit" and "find the next bit set".  Moreover, the searching
+ *	is always carried out after all of the "set a bit" operations
+ *	on given bitmap.
+ */
+
+#define BM_END_OF_MAP	(~0UL)
+
+#define BM_CHUNKS_PER_BLOCK	(PAGE_SIZE / sizeof(long))
+#define BM_BITS_PER_CHUNK	(sizeof(long) << 3)
+#define BM_BITS_PER_BLOCK	(PAGE_SIZE << 3)
+
+struct bm_block {
+	struct bm_block *next;		/* next element of the list */
+	unsigned long start_pfn;	/* pfn represented by the first bit */
+	unsigned long end_pfn;	/* pfn represented by the last bit plus 1 */
+	unsigned int size;	/* number of bit chunks */
+	unsigned long *data;	/* chunks of bits representing pages */
+};
+
+struct zone_bitmap {
+	struct zone_bitmap *next;	/* next element of the list */
+	unsigned long start_pfn;	/* minimal pfn in this zone */
+	unsigned long end_pfn;		/* maximal pfn in this zone plus 1 */
+	struct bm_block *bm_blocks;	/* list of bitmap blocks */
+	struct bm_block *cur_block;	/* recently used bitmap block */
+};
+
+/* strcut bm_position is used for browsing memory bitmaps */
+
+struct bm_position {
+	struct zone_bitmap *zone_bm;
+	struct bm_block *block;
+	int chunk;
+	int bit;
+};
+
+struct memory_bitmap {
+	struct zone_bitmap *zone_bm_list;	/* list of zone bitmaps */
+	struct linked_page *p_list;	/* list of pages used to store zone
+					 * bitmap objects and bitmap block
+					 * objects
+					 */
+	struct bm_position cur;	/* most recently used bit position */
+};
+
+/* Functions that operate on memory bitmaps */
+
+static inline void memory_bm_reset_chunk(struct memory_bitmap *bm)
+{
+	bm->cur.chunk = 0;
+	bm->cur.bit = -1;
+}
+
+static void memory_bm_position_reset(struct memory_bitmap *bm)
+{
+	struct zone_bitmap *zone_bm;
+
+	zone_bm = bm->zone_bm_list;
+	bm->cur.zone_bm = zone_bm;
+	bm->cur.block = zone_bm->bm_blocks;
+	memory_bm_reset_chunk(bm);
+}
+
+static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
+
+/**
+ *	create_bm_block_list - create a list of block bitmap objects
+ */
+
+static inline struct bm_block *
+create_bm_block_list(unsigned int nr_blocks, struct chain_allocator *ca)
+{
+	struct bm_block *bblist = NULL;
+
+	while (nr_blocks-- > 0) {
+		struct bm_block *bb;
+
+		bb = chain_alloc(ca, sizeof(struct bm_block));
+		if (!bb)
+			return NULL;
+
+		bb->next = bblist;
+		bblist = bb;
+	}
+	return bblist;
+}
+
+/**
+ *	create_zone_bm_list - create a list of zone bitmap objects
+ */
+
+static inline struct zone_bitmap *
+create_zone_bm_list(unsigned int nr_zones, struct chain_allocator *ca)
+{
+	struct zone_bitmap *zbmlist = NULL;
+
+	while (nr_zones-- > 0) {
+		struct zone_bitmap *zbm;
+
+		zbm = chain_alloc(ca, sizeof(struct zone_bitmap));
+		if (!zbm)
+			return NULL;
+
+		zbm->next = zbmlist;
+		zbmlist = zbm;
+	}
+	return zbmlist;
+}
+
+/**
+  *	memory_bm_create - allocate memory for a memory bitmap
+  */
+
+static int
+memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
+{
+	struct chain_allocator ca;
+	struct zone *zone;
+	struct zone_bitmap *zone_bm;
+	struct bm_block *bb;
+	unsigned int nr;
+
+	chain_init(&ca, gfp_mask, safe_needed);
+
+	/* Compute the number of zones */
+	nr = 0;
+	for_each_zone (zone)
+		if (populated_zone(zone) && !is_highmem(zone))
+			nr++;
+
+	/* Allocate the list of zones bitmap objects */
+	zone_bm = create_zone_bm_list(nr, &ca);
+	bm->zone_bm_list = zone_bm;
+	if (!zone_bm) {
+		chain_free(&ca, PG_UNSAFE_CLEAR);
+		return -ENOMEM;
+	}
+
+	/* Initialize the zone bitmap objects */
+	for_each_zone (zone) {
+		unsigned long pfn;
+
+		if (!populated_zone(zone) || is_highmem(zone))
+			continue;
+
+		zone_bm->start_pfn = zone->zone_start_pfn;
+		zone_bm->end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+		/* Allocate the list of bitmap block objects */
+		nr = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
+		bb = create_bm_block_list(nr, &ca);
+		zone_bm->bm_blocks = bb;
+		zone_bm->cur_block = bb;
+		if (!bb)
+			goto Free;
+
+		nr = zone->spanned_pages;
+		pfn = zone->zone_start_pfn;
+		/* Initialize the bitmap block objects */
+		while (bb) {
+			unsigned long *ptr;
+
+			ptr = alloc_image_page(gfp_mask, safe_needed);
+			bb->data = ptr;
+			if (!ptr)
+				goto Free;
+
+			bb->start_pfn = pfn;
+			if (nr >= BM_BITS_PER_BLOCK) {
+				pfn += BM_BITS_PER_BLOCK;
+				bb->size = BM_CHUNKS_PER_BLOCK;
+				nr -= BM_BITS_PER_BLOCK;
+			} else {
+				/* This is executed only once in the loop */
+				pfn += nr;
+				bb->size = DIV_ROUND_UP(nr, BM_BITS_PER_CHUNK);
+			}
+			bb->end_pfn = pfn;
+			bb = bb->next;
+		}
+		zone_bm = zone_bm->next;
+	}
+	bm->p_list = ca.chain;
+	memory_bm_position_reset(bm);
+	return 0;
+
+Free:
+	bm->p_list = ca.chain;
+	memory_bm_free(bm, PG_UNSAFE_CLEAR);
+	return -ENOMEM;
+}
+
+/**
+  *	memory_bm_free - free memory occupied by the memory bitmap @bm
+  */
+
+static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
+{
+	struct zone_bitmap *zone_bm;
+
+	/* Free the list of bit blocks for each zone_bitmap object */
+	zone_bm = bm->zone_bm_list;
+	while (zone_bm) {
+		struct bm_block *bb;
+
+		bb = zone_bm->bm_blocks;
+		while (bb) {
+			if (bb->data)
+				free_image_page(bb->data, clear_nosave_free);
+			bb = bb->next;
+		}
+		zone_bm = zone_bm->next;
+	}
+	free_list_of_pages(bm->p_list, clear_nosave_free);
+	bm->zone_bm_list = NULL;
+}
+
+/**
+ *	memory_bm_set_bit - set the bit in the bitmap @bm that corresponds
+ *	to given pfn.  The cur_zone_bm member of @bm and the cur_block member
+ *	of @bm->cur_zone_bm are updated.
+ *
+ *	If the bit cannot be set, the function returns -EINVAL .
+ */
+
+static int
+memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn)
+{
+	struct zone_bitmap *zone_bm;
+	struct bm_block *bb;
+
+	/* Check if the pfn is from the current zone */
+	zone_bm = bm->cur.zone_bm;
+	if (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) {
+		zone_bm = bm->zone_bm_list;
+		/* We don't assume that the zones are sorted by pfns */
+		while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) {
+			zone_bm = zone_bm->next;
+			if (unlikely(!zone_bm))
+				return -EINVAL;
+		}
+		bm->cur.zone_bm = zone_bm;
+	}
+	/* Check if the pfn corresponds to the current bitmap block */
+	bb = zone_bm->cur_block;
+	if (pfn < bb->start_pfn)
+		bb = zone_bm->bm_blocks;
+
+	while (pfn >= bb->end_pfn) {
+		bb = bb->next;
+		if (unlikely(!bb))
+			return -EINVAL;
+	}
+	zone_bm->cur_block = bb;
+	pfn -= bb->start_pfn;
+	set_bit(pfn % BM_BITS_PER_CHUNK, bb->data + pfn / BM_BITS_PER_CHUNK);
+	return 0;
+}
+
+/* Two auxiliary functions for memory_bm_next_pfn */
+
+/* Find the first set bit in the given chunk, if there is one */
+
+static inline int next_bit_in_chunk(int bit, unsigned long *chunk_p)
+{
+	bit++;
+	while (bit < BM_BITS_PER_CHUNK) {
+		if (test_bit(bit, chunk_p))
+			return bit;
+
+		bit++;
+	}
+	return -1;
+}
+
+/* Find a chunk containing some bits set in given block of bits */
+
+static inline int next_chunk_in_block(int n, struct bm_block *bb)
+{
+	n++;
+	while (n < bb->size) {
+		if (bb->data[n])
+			return n;
+
+		n++;
+	}
+	return -1;
+}
+
+/**
+ *	memory_bm_next_pfn - find the pfn that corresponds to the next set bit
+ *	in the bitmap @bm.  If the pfn cannot be found, BM_END_OF_MAP is
+ *	returned.
+ *
+ *	It is required to run memory_bm_position_reset() before the first call to
+ *	this function.
+ */
+
+static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
+{
+	struct zone_bitmap *zone_bm;
+	struct bm_block *bb;
+	int chunk;
+	int bit;
+
+	do {
+		bb = bm->cur.block;
+		do {
+			chunk = bm->cur.chunk;
+			bit = bm->cur.bit;
+			do {
+				bit = next_bit_in_chunk(bit, bb->data + chunk);
+				if (bit >= 0)
+					goto Return_pfn;
+
+				chunk = next_chunk_in_block(chunk, bb);
+				bit = -1;
+			} while (chunk >= 0);
+			bb = bb->next;
+			bm->cur.block = bb;
+			memory_bm_reset_chunk(bm);
+		} while (bb);
+		zone_bm = bm->cur.zone_bm->next;
+		if (zone_bm) {
+			bm->cur.zone_bm = zone_bm;
+			bm->cur.block = zone_bm->bm_blocks;
+			memory_bm_reset_chunk(bm);
+		}
+	} while (zone_bm);
+	memory_bm_position_reset(bm);
+	return BM_END_OF_MAP;
+
+Return_pfn:
+	bm->cur.chunk = chunk;
+	bm->cur.bit = bit;
+	return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit;
+}
+
+/**
+ *	snapshot_additional_pages - estimate the number of additional pages
+ *	be needed for setting up the suspend image data structures for given
+ *	zone (usually the returned value is greater than the exact number)
+ */
+
+unsigned int snapshot_additional_pages(struct zone *zone)
+{
+	unsigned int res;
+
+	res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
+	res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE);
+	return res;
+}
+
 /**
  *	pfn_is_nosave - check if given pfn is in the 'nosave' section
  */
@@ -276,32 +737,38 @@ static inline void copy_data_page(long *dst, long *src)
 		*dst++ = *src++;
 }
 
-static void copy_data_pages(struct pbe *pblist)
+static void
+copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
 {
 	struct zone *zone;
-	unsigned long pfn, max_zone_pfn;
-	struct pbe *pbe;
+	unsigned long pfn;
 
-	pbe = pblist;
 	for_each_zone (zone) {
+		unsigned long max_zone_pfn;
+
 		if (is_highmem(zone))
 			continue;
+
 		mark_free_pages(zone);
 		max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
-		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
-			struct page *page = saveable_page(pfn);
-
-			if (page) {
-				void *ptr = page_address(page);
-
-				BUG_ON(!pbe);
-				copy_data_page((void *)pbe->address, ptr);
-				pbe->orig_address = (unsigned long)ptr;
-				pbe = pbe->next;
-			}
-		}
+		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
+			if (saveable_page(pfn))
+				memory_bm_set_bit(orig_bm, pfn);
 	}
-	BUG_ON(pbe);
+	memory_bm_position_reset(orig_bm);
+	memory_bm_position_reset(copy_bm);
+	do {
+		pfn = memory_bm_next_pfn(orig_bm);
+		if (likely(pfn != BM_END_OF_MAP)) {
+			struct page *page;
+			void *src;
+
+			page = pfn_to_page(pfn);
+			src = page_address(page);
+			page = pfn_to_page(memory_bm_next_pfn(copy_bm));
+			copy_data_page(page_address(page), src);
+		}
+	} while (pfn != BM_END_OF_MAP);
 }
 
 /**
@@ -447,37 +914,43 @@ static int enough_free_mem(unsigned int nr_pages)
 		(nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
 }
 
-static int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed)
+static int
+swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
+		unsigned int nr_pages)
 {
-	struct pbe *p;
+	int error;
 
-	for_each_pbe (p, pblist) {
-		p->address = (unsigned long)alloc_image_page(gfp_mask, safe_needed);
-		if (!p->address)
-			return -ENOMEM;
-	}
-	return 0;
-}
+	error = memory_bm_create(orig_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY);
+	if (error)
+		goto Free;
 
-static struct pbe *swsusp_alloc(unsigned int nr_pages)
-{
-	struct pbe *pblist;
+	error = memory_bm_create(copy_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY);
+	if (error)
+		goto Free;
 
-	pblist = alloc_pagedir(nr_pages, GFP_ATOMIC | __GFP_COLD, PG_ANY);
-	if (!pblist) {
-		printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
-		return NULL;
-	}
+	while (nr_pages-- > 0) {
+		struct page *page = alloc_page(GFP_ATOMIC | __GFP_COLD);
+		if (!page)
+			goto Free;
 
-	if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, PG_ANY)) {
-		printk(KERN_ERR "suspend: Allocating image pages failed.\n");
-		swsusp_free();
-		return NULL;
+		SetPageNosave(page);
+		SetPageNosaveFree(page);
+		memory_bm_set_bit(copy_bm, page_to_pfn(page));
 	}
+	return 0;
 
-	return pblist;
+Free:
+	swsusp_free();
+	return -ENOMEM;
 }
 
+/* Memory bitmap used for marking saveable pages */
+static struct memory_bitmap orig_bm;
+/* Memory bitmap used for marking allocated pages that will contain the copies
+ * of saveable pages
+ */
+static struct memory_bitmap copy_bm;
+
 asmlinkage int swsusp_save(void)
 {
 	unsigned int nr_pages;
@@ -498,15 +971,14 @@ asmlinkage int swsusp_save(void)
 		return -ENOMEM;
 	}
 
-	restore_pblist = swsusp_alloc(nr_pages);
-	if (!restore_pblist)
+	if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages))
 		return -ENOMEM;
 
 	/* During allocating of suspend pagedir, new cold pages may appear.
 	 * Kill them.
 	 */
 	drain_local_pages();
-	copy_data_pages(restore_pblist);
+	copy_data_pages(&copy_bm, &orig_bm);
 
 	/*
 	 * End of critical section. From now on, we can write to memory,
@@ -535,22 +1007,23 @@ static void init_header(struct swsusp_info *info)
 }
 
 /**
- *	pack_orig_addresses - the .orig_address fields of the PBEs from the
- *	list starting at @pbe are stored in the array @buf[] (1 page)
+ *	pack_addresses - the addresses corresponding to pfns found in the
+ *	bitmap @bm are stored in the array @buf[] (1 page)
  */
 
-static inline struct pbe *pack_orig_addresses(unsigned long *buf, struct pbe *pbe)
+static inline void
+pack_addresses(unsigned long *buf, struct memory_bitmap *bm)
 {
 	int j;
 
-	for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
-		buf[j] = pbe->orig_address;
-		pbe = pbe->next;
+	for (j = 0; j < PAGE_SIZE / sizeof(long); j++) {
+		unsigned long pfn = memory_bm_next_pfn(bm);
+
+		if (unlikely(pfn == BM_END_OF_MAP))
+			break;
+
+		buf[j] = (unsigned long)page_address(pfn_to_page(pfn));
 	}
-	if (!pbe)
-		for (; j < PAGE_SIZE / sizeof(long); j++)
-			buf[j] = 0;
-	return pbe;
 }
 
 /**
@@ -579,6 +1052,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
 {
 	if (handle->cur > nr_meta_pages + nr_copy_pages)
 		return 0;
+
 	if (!buffer) {
 		/* This makes the buffer be freed by swsusp_free() */
 		buffer = alloc_image_page(GFP_ATOMIC, PG_ANY);
@@ -588,16 +1062,17 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
 	if (!handle->offset) {
 		init_header((struct swsusp_info *)buffer);
 		handle->buffer = buffer;
-		handle->pbe = restore_pblist;
+		memory_bm_position_reset(&orig_bm);
+		memory_bm_position_reset(&copy_bm);
 	}
 	if (handle->prev < handle->cur) {
 		if (handle->cur <= nr_meta_pages) {
-			handle->pbe = pack_orig_addresses(buffer, handle->pbe);
-			if (!handle->pbe)
-				handle->pbe = restore_pblist;
+			memset(buffer, 0, PAGE_SIZE);
+			pack_addresses(buffer, &orig_bm);
 		} else {
-			handle->buffer = (void *)handle->pbe->address;
-			handle->pbe = handle->pbe->next;
+			unsigned long pfn = memory_bm_next_pfn(&copy_bm);
+
+			handle->buffer = page_address(pfn_to_page(pfn));
 		}
 		handle->prev = handle->cur;
 	}
@@ -736,12 +1211,7 @@ static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
  *	of "safe" which will be used later
  */
 
-struct safe_page {
-	struct safe_page *next;
-	char padding[PAGE_SIZE - sizeof(void *)];
-};
-
-static struct safe_page *safe_pages;
+static struct linked_page *safe_pages;
 
 static int prepare_image(struct snapshot_handle *handle)
 {
@@ -763,9 +1233,9 @@ static int prepare_image(struct snapshot_handle *handle)
 	if (!error && nr_pages > unsafe_pages) {
 		nr_pages -= unsafe_pages;
 		while (nr_pages--) {
-			struct safe_page *ptr;
+			struct linked_page *ptr;
 
-			ptr = (struct safe_page *)get_zeroed_page(GFP_ATOMIC);
+			ptr = (void *)get_zeroed_page(GFP_ATOMIC);
 			if (!ptr) {
 				error = -ENOMEM;
 				break;
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 17f669c8301..8ef677ea0ce 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -193,14 +193,13 @@ int swsusp_shrink_memory(void)
 	printk("Shrinking memory...  ");
 	do {
 		size = 2 * count_highmem_pages();
-		size += size / 50 + count_data_pages();
-		size += (size + PBES_PER_PAGE - 1) / PBES_PER_PAGE +
-			PAGES_FOR_IO;
+		size += size / 50 + count_data_pages() + PAGES_FOR_IO;
 		tmp = size;
 		for_each_zone (zone)
 			if (!is_highmem(zone) && populated_zone(zone)) {
 				tmp -= zone->free_pages;
 				tmp += zone->lowmem_reserve[ZONE_NORMAL];
+				tmp += snapshot_additional_pages(zone);
 			}
 		if (tmp > 0) {
 			tmp = __shrink_memory(tmp);
-- 
cgit v1.2.3


From 940864ddabdb180e02041c4dcd46ba6f9eee732f Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 25 Sep 2006 23:32:55 -0700
Subject: [PATCH] swsusp: Use memory bitmaps during resume

Make swsusp use memory bitmaps to store its internal information during the
resume phase of the suspend-resume cycle.

If the pfns of saveable pages are saved during the suspend phase instead of
the kernel virtual addresses of these pages, we can use them during the resume
phase directly to set the corresponding bits in a memory bitmap.  Then, this
bitmap is used to mark the page frames corresponding to the pages that were
saveable before the suspend (aka "unsafe" page frames).

Next, we allocate as many page frames as needed to store the entire suspend
image and make sure that there will be some extra free "safe" page frames for
the list of PBEs constructed later.  Subsequently, the image is loaded and, if
possible, the data loaded from it are written into their "original" page
frames (ie.  the ones they had occupied before the suspend).

The image data that cannot be written into their "original" page frames are
loaded into "safe" page frames and their "original" kernel virtual addresses,
as well as the addresses of the "safe" pages containing their copies, are
stored in a list of PBEs.  Finally, the list of PBEs is used to copy the
remaining image data into their "original" page frames (this is done
atomically, by the architecture-dependent parts of swsusp).

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/suspend.h |   9 --
 kernel/power/power.h    |  11 +-
 kernel/power/snapshot.c | 416 +++++++++++++++++++++---------------------------
 kernel/power/swap.c     |   4 +-
 kernel/power/user.c     |   1 +
 5 files changed, 186 insertions(+), 255 deletions(-)

diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index c11cacf1a13..b1237f16ecd 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -16,15 +16,6 @@ struct pbe {
 	struct pbe *next;
 };
 
-#define for_each_pbe(pbe, pblist) \
-	for (pbe = pblist ; pbe ; pbe = pbe->next)
-
-#define PBES_PER_PAGE      (PAGE_SIZE/sizeof(struct pbe))
-#define PB_PAGE_SKIP       (PBES_PER_PAGE-1)
-
-#define for_each_pb_page(pbe, pblist) \
-	for (pbe = pblist ; pbe ; pbe = (pbe+PB_PAGE_SKIP)->next)
-
 /* mm/page_alloc.c */
 extern void drain_local_pages(void);
 extern void mark_free_pages(struct zone *zone);
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 6e9e2acc34f..bfe999f7b27 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -81,16 +81,6 @@ struct snapshot_handle {
 	unsigned int	prev;	/* number of the block of PAGE_SIZE bytes that
 				 * was the current one previously
 				 */
-	struct pbe	*pbe;	/* PBE that corresponds to 'buffer' */
-	struct pbe	*last_pbe;	/* When the image is restored (eg. read
-					 * from disk) we can store some image
-					 * data directly in the page frames
-					 * in which they were before suspend.
-					 * In such a case the PBEs that
-					 * correspond to them will be unused.
-					 * This is the last PBE, so far, that
-					 * does not correspond to such data.
-					 */
 	void		*buffer;	/* address of the block to read from
 					 * or write to
 					 */
@@ -113,6 +103,7 @@ extern unsigned int snapshot_additional_pages(struct zone *zone);
 extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
 extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
 extern int snapshot_image_loaded(struct snapshot_handle *handle);
+extern void snapshot_free_unused_memory(struct snapshot_handle *handle);
 
 #define SNAPSHOT_IOC_MAGIC	'3'
 #define SNAPSHOT_FREEZE			_IO(SNAPSHOT_IOC_MAGIC, 1)
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 852e0df4171..1b84313cbab 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -39,7 +39,7 @@ struct pbe *restore_pblist;
 
 static unsigned int nr_copy_pages;
 static unsigned int nr_meta_pages;
-static unsigned long *buffer;
+static void *buffer;
 
 #ifdef CONFIG_HIGHMEM
 unsigned int count_highmem_pages(void)
@@ -172,7 +172,7 @@ static inline int restore_highmem(void) {return 0;}
 #define PG_UNSAFE_CLEAR	1
 #define PG_UNSAFE_KEEP	0
 
-static unsigned int unsafe_pages;
+static unsigned int allocated_unsafe_pages;
 
 static void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
 {
@@ -183,7 +183,7 @@ static void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
 		while (res && PageNosaveFree(virt_to_page(res))) {
 			/* The page is unsafe, mark it for swsusp_free() */
 			SetPageNosave(virt_to_page(res));
-			unsafe_pages++;
+			allocated_unsafe_pages++;
 			res = (void *)get_zeroed_page(gfp_mask);
 		}
 	if (res) {
@@ -772,101 +772,10 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
 }
 
 /**
- *	free_pagedir - free pages allocated with alloc_pagedir()
- */
-
-static void free_pagedir(struct pbe *pblist, int clear_nosave_free)
-{
-	struct pbe *pbe;
-
-	while (pblist) {
-		pbe = (pblist + PB_PAGE_SKIP)->next;
-		free_image_page(pblist, clear_nosave_free);
-		pblist = pbe;
-	}
-}
-
-/**
- *	fill_pb_page - Create a list of PBEs on a given memory page
- */
-
-static inline void fill_pb_page(struct pbe *pbpage, unsigned int n)
-{
-	struct pbe *p;
-
-	p = pbpage;
-	pbpage += n - 1;
-	do
-		p->next = p + 1;
-	while (++p < pbpage);
-}
-
-/**
- *	create_pbe_list - Create a list of PBEs on top of a given chain
- *	of memory pages allocated with alloc_pagedir()
+ *	swsusp_free - free pages allocated for the suspend.
  *
- *	This function assumes that pages allocated by alloc_image_page() will
- *	always be zeroed.
- */
-
-static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages)
-{
-	struct pbe *pbpage;
-	unsigned int num = PBES_PER_PAGE;
-
-	for_each_pb_page (pbpage, pblist) {
-		if (num >= nr_pages)
-			break;
-
-		fill_pb_page(pbpage, PBES_PER_PAGE);
-		num += PBES_PER_PAGE;
-	}
-	if (pbpage) {
-		num -= PBES_PER_PAGE;
-		fill_pb_page(pbpage, nr_pages - num);
-	}
-}
-
-/**
- *	alloc_pagedir - Allocate the page directory.
- *
- *	First, determine exactly how many pages we need and
- *	allocate them.
- *
- *	We arrange the pages in a chain: each page is an array of PBES_PER_PAGE
- *	struct pbe elements (pbes) and the last element in the page points
- *	to the next page.
- *
- *	On each page we set up a list of struct_pbe elements.
- */
-
-static struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask,
-				 int safe_needed)
-{
-	unsigned int num;
-	struct pbe *pblist, *pbe;
-
-	if (!nr_pages)
-		return NULL;
-
-	pblist = alloc_image_page(gfp_mask, safe_needed);
-	pbe = pblist;
-	for (num = PBES_PER_PAGE; num < nr_pages; num += PBES_PER_PAGE) {
-		if (!pbe) {
-			free_pagedir(pblist, PG_UNSAFE_CLEAR);
-			return NULL;
-		}
-		pbe += PB_PAGE_SKIP;
-		pbe->next = alloc_image_page(gfp_mask, safe_needed);
-		pbe = pbe->next;
-	}
-	create_pbe_list(pblist, nr_pages);
-	return pblist;
-}
-
-/**
- * Free pages we allocated for suspend. Suspend pages are alocated
- * before atomic copy, so we need to free them after resume.
+ *	Suspend pages are alocated before the atomic copy is made, so we
+ *	need to release them after the resume.
  */
 
 void swsusp_free(void)
@@ -904,14 +813,18 @@ void swsusp_free(void)
 static int enough_free_mem(unsigned int nr_pages)
 {
 	struct zone *zone;
-	unsigned int n = 0;
+	unsigned int free = 0, meta = 0;
 
 	for_each_zone (zone)
-		if (!is_highmem(zone))
-			n += zone->free_pages;
-	pr_debug("swsusp: available memory: %u pages\n", n);
-	return n > (nr_pages + PAGES_FOR_IO +
-		(nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
+		if (!is_highmem(zone)) {
+			free += zone->free_pages;
+			meta += snapshot_additional_pages(zone);
+		}
+
+	pr_debug("swsusp: pages needed: %u + %u + %u, available pages: %u\n",
+		nr_pages, PAGES_FOR_IO, meta, free);
+
+	return free > nr_pages + PAGES_FOR_IO + meta;
 }
 
 static int
@@ -961,11 +874,6 @@ asmlinkage int swsusp_save(void)
 	nr_pages = count_data_pages();
 	printk("swsusp: Need to copy %u pages\n", nr_pages);
 
-	pr_debug("swsusp: pages needed: %u + %lu + %u, free: %u\n",
-		 nr_pages,
-		 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE,
-		 PAGES_FOR_IO, nr_free_pages());
-
 	if (!enough_free_mem(nr_pages)) {
 		printk(KERN_ERR "swsusp: Not enough free memory\n");
 		return -ENOMEM;
@@ -1007,22 +915,19 @@ static void init_header(struct swsusp_info *info)
 }
 
 /**
- *	pack_addresses - the addresses corresponding to pfns found in the
- *	bitmap @bm are stored in the array @buf[] (1 page)
+ *	pack_pfns - pfns corresponding to the set bits found in the bitmap @bm
+ *	are stored in the array @buf[] (1 page at a time)
  */
 
 static inline void
-pack_addresses(unsigned long *buf, struct memory_bitmap *bm)
+pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
 {
 	int j;
 
 	for (j = 0; j < PAGE_SIZE / sizeof(long); j++) {
-		unsigned long pfn = memory_bm_next_pfn(bm);
-
-		if (unlikely(pfn == BM_END_OF_MAP))
+		buf[j] = memory_bm_next_pfn(bm);
+		if (unlikely(buf[j] == BM_END_OF_MAP))
 			break;
-
-		buf[j] = (unsigned long)page_address(pfn_to_page(pfn));
 	}
 }
 
@@ -1068,7 +973,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
 	if (handle->prev < handle->cur) {
 		if (handle->cur <= nr_meta_pages) {
 			memset(buffer, 0, PAGE_SIZE);
-			pack_addresses(buffer, &orig_bm);
+			pack_pfns(buffer, &orig_bm);
 		} else {
 			unsigned long pfn = memory_bm_next_pfn(&copy_bm);
 
@@ -1094,14 +999,10 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
  *	had been used before suspend
  */
 
-static int mark_unsafe_pages(struct pbe *pblist)
+static int mark_unsafe_pages(struct memory_bitmap *bm)
 {
 	struct zone *zone;
 	unsigned long pfn, max_zone_pfn;
-	struct pbe *p;
-
-	if (!pblist) /* a sanity check */
-		return -EINVAL;
 
 	/* Clear page flags */
 	for_each_zone (zone) {
@@ -1111,30 +1012,37 @@ static int mark_unsafe_pages(struct pbe *pblist)
 				ClearPageNosaveFree(pfn_to_page(pfn));
 	}
 
-	/* Mark orig addresses */
-	for_each_pbe (p, pblist) {
-		if (virt_addr_valid(p->orig_address))
-			SetPageNosaveFree(virt_to_page(p->orig_address));
-		else
-			return -EFAULT;
-	}
+	/* Mark pages that correspond to the "original" pfns as "unsafe" */
+	memory_bm_position_reset(bm);
+	do {
+		pfn = memory_bm_next_pfn(bm);
+		if (likely(pfn != BM_END_OF_MAP)) {
+			if (likely(pfn_valid(pfn)))
+				SetPageNosaveFree(pfn_to_page(pfn));
+			else
+				return -EFAULT;
+		}
+	} while (pfn != BM_END_OF_MAP);
 
-	unsafe_pages = 0;
+	allocated_unsafe_pages = 0;
 
 	return 0;
 }
 
-static void copy_page_backup_list(struct pbe *dst, struct pbe *src)
+static void
+duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src)
 {
-	/* We assume both lists contain the same number of elements */
-	while (src) {
-		dst->orig_address = src->orig_address;
-		dst = dst->next;
-		src = src->next;
+	unsigned long pfn;
+
+	memory_bm_position_reset(src);
+	pfn = memory_bm_next_pfn(src);
+	while (pfn != BM_END_OF_MAP) {
+		memory_bm_set_bit(dst, pfn);
+		pfn = memory_bm_next_pfn(src);
 	}
 }
 
-static int check_header(struct swsusp_info *info)
+static inline int check_header(struct swsusp_info *info)
 {
 	char *reason = NULL;
 
@@ -1161,19 +1069,14 @@ static int check_header(struct swsusp_info *info)
  *	load header - check the image header and copy data from it
  */
 
-static int load_header(struct snapshot_handle *handle,
-                              struct swsusp_info *info)
+static int
+load_header(struct swsusp_info *info)
 {
 	int error;
-	struct pbe *pblist;
 
+	restore_pblist = NULL;
 	error = check_header(info);
 	if (!error) {
-		pblist = alloc_pagedir(info->image_pages, GFP_ATOMIC, PG_ANY);
-		if (!pblist)
-			return -ENOMEM;
-		restore_pblist = pblist;
-		handle->pbe = pblist;
 		nr_copy_pages = info->image_pages;
 		nr_meta_pages = info->pages - info->image_pages - 1;
 	}
@@ -1181,108 +1084,137 @@ static int load_header(struct snapshot_handle *handle,
 }
 
 /**
- *	unpack_orig_addresses - copy the elements of @buf[] (1 page) to
- *	the PBEs in the list starting at @pbe
+ *	unpack_orig_pfns - for each element of @buf[] (1 page at a time) set
+ *	the corresponding bit in the memory bitmap @bm
  */
 
-static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
-                                                struct pbe *pbe)
+static inline void
+unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
 {
 	int j;
 
-	for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
-		pbe->orig_address = buf[j];
-		pbe = pbe->next;
+	for (j = 0; j < PAGE_SIZE / sizeof(long); j++) {
+		if (unlikely(buf[j] == BM_END_OF_MAP))
+			break;
+
+		memory_bm_set_bit(bm, buf[j]);
 	}
-	return pbe;
 }
 
 /**
- *	prepare_image - use metadata contained in the PBE list
- *	pointed to by restore_pblist to mark the pages that will
- *	be overwritten in the process of restoring the system
- *	memory state from the image ("unsafe" pages) and allocate
- *	memory for the image
+ *	prepare_image - use the memory bitmap @bm to mark the pages that will
+ *	be overwritten in the process of restoring the system memory state
+ *	from the suspend image ("unsafe" pages) and allocate memory for the
+ *	image.
  *
- *	The idea is to allocate the PBE list first and then
- *	allocate as many pages as it's needed for the image data,
- *	but not to assign these pages to the PBEs initially.
- *	Instead, we just mark them as allocated and create a list
- *	of "safe" which will be used later
+ *	The idea is to allocate a new memory bitmap first and then allocate
+ *	as many pages as needed for the image data, but not to assign these
+ *	pages to specific tasks initially.  Instead, we just mark them as
+ *	allocated and create a list of "safe" pages that will be used later.
  */
 
-static struct linked_page *safe_pages;
+#define PBES_PER_LINKED_PAGE	(LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
+
+static struct linked_page *safe_pages_list;
 
-static int prepare_image(struct snapshot_handle *handle)
+static int
+prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
 {
-	int error = 0;
-	unsigned int nr_pages = nr_copy_pages;
-	struct pbe *p, *pblist = NULL;
+	unsigned int nr_pages;
+	struct linked_page *sp_list, *lp;
+	int error;
 
-	p = restore_pblist;
-	error = mark_unsafe_pages(p);
-	if (!error) {
-		pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, PG_SAFE);
-		if (pblist)
-			copy_page_backup_list(pblist, p);
-		free_pagedir(p, PG_UNSAFE_KEEP);
-		if (!pblist)
+	error = mark_unsafe_pages(bm);
+	if (error)
+		goto Free;
+
+	error = memory_bm_create(new_bm, GFP_ATOMIC, PG_SAFE);
+	if (error)
+		goto Free;
+
+	duplicate_memory_bitmap(new_bm, bm);
+	memory_bm_free(bm, PG_UNSAFE_KEEP);
+	/* Reserve some safe pages for potential later use.
+	 *
+	 * NOTE: This way we make sure there will be enough safe pages for the
+	 * chain_alloc() in get_buffer().  It is a bit wasteful, but
+	 * nr_copy_pages cannot be greater than 50% of the memory anyway.
+	 */
+	sp_list = NULL;
+	/* nr_copy_pages cannot be lesser than allocated_unsafe_pages */
+	nr_pages = nr_copy_pages - allocated_unsafe_pages;
+	nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE);
+	while (nr_pages > 0) {
+		lp = alloc_image_page(GFP_ATOMIC, PG_SAFE);
+		if (!lp) {
 			error = -ENOMEM;
+			goto Free;
+		}
+		lp->next = sp_list;
+		sp_list = lp;
+		nr_pages--;
 	}
-	safe_pages = NULL;
-	if (!error && nr_pages > unsafe_pages) {
-		nr_pages -= unsafe_pages;
-		while (nr_pages--) {
-			struct linked_page *ptr;
-
-			ptr = (void *)get_zeroed_page(GFP_ATOMIC);
-			if (!ptr) {
-				error = -ENOMEM;
-				break;
-			}
-			if (!PageNosaveFree(virt_to_page(ptr))) {
-				/* The page is "safe", add it to the list */
-				ptr->next = safe_pages;
-				safe_pages = ptr;
-			}
-			/* Mark the page as allocated */
-			SetPageNosave(virt_to_page(ptr));
-			SetPageNosaveFree(virt_to_page(ptr));
+	/* Preallocate memory for the image */
+	safe_pages_list = NULL;
+	nr_pages = nr_copy_pages - allocated_unsafe_pages;
+	while (nr_pages > 0) {
+		lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC);
+		if (!lp) {
+			error = -ENOMEM;
+			goto Free;
+		}
+		if (!PageNosaveFree(virt_to_page(lp))) {
+			/* The page is "safe", add it to the list */
+			lp->next = safe_pages_list;
+			safe_pages_list = lp;
 		}
+		/* Mark the page as allocated */
+		SetPageNosave(virt_to_page(lp));
+		SetPageNosaveFree(virt_to_page(lp));
+		nr_pages--;
 	}
-	if (!error) {
-		restore_pblist = pblist;
-	} else {
-		handle->pbe = NULL;
-		swsusp_free();
+	/* Free the reserved safe pages so that chain_alloc() can use them */
+	while (sp_list) {
+		lp = sp_list->next;
+		free_image_page(sp_list, PG_UNSAFE_CLEAR);
+		sp_list = lp;
 	}
+	return 0;
+
+Free:
+	swsusp_free();
 	return error;
 }
 
-static void *get_buffer(struct snapshot_handle *handle)
+/**
+ *	get_buffer - compute the address that snapshot_write_next() should
+ *	set for its caller to write to.
+ */
+
+static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
 {
-	struct pbe *pbe = handle->pbe, *last = handle->last_pbe;
-	struct page *page = virt_to_page(pbe->orig_address);
+	struct pbe *pbe;
+	struct page *page = pfn_to_page(memory_bm_next_pfn(bm));
 
-	if (PageNosave(page) && PageNosaveFree(page)) {
-		/*
-		 * We have allocated the "original" page frame and we can
-		 * use it directly to store the read page
+	if (PageNosave(page) && PageNosaveFree(page))
+		/* We have allocated the "original" page frame and we can
+		 * use it directly to store the loaded page.
 		 */
-		pbe->address = 0;
-		if (last && last->next)
-			last->next = NULL;
-		return (void *)pbe->orig_address;
-	}
-	/*
-	 * The "original" page frame has not been allocated and we have to
-	 * use a "safe" page frame to store the read page
+		return page_address(page);
+
+	/* The "original" page frame has not been allocated and we have to
+	 * use a "safe" page frame to store the loaded page.
 	 */
-	pbe->address = (unsigned long)safe_pages;
-	safe_pages = safe_pages->next;
-	if (last)
-		last->next = pbe;
-	handle->last_pbe = pbe;
+	pbe = chain_alloc(ca, sizeof(struct pbe));
+	if (!pbe) {
+		swsusp_free();
+		return NULL;
+	}
+	pbe->orig_address = (unsigned long)page_address(page);
+	pbe->address = (unsigned long)safe_pages_list;
+	safe_pages_list = safe_pages_list->next;
+	pbe->next = restore_pblist;
+	restore_pblist = pbe;
 	return (void *)pbe->address;
 }
 
@@ -1310,10 +1242,13 @@ static void *get_buffer(struct snapshot_handle *handle)
 
 int snapshot_write_next(struct snapshot_handle *handle, size_t count)
 {
+	static struct chain_allocator ca;
 	int error = 0;
 
+	/* Check if we have already loaded the entire image */
 	if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages)
 		return 0;
+
 	if (!buffer) {
 		/* This makes the buffer be freed by swsusp_free() */
 		buffer = alloc_image_page(GFP_ATOMIC, PG_ANY);
@@ -1324,26 +1259,32 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
 		handle->buffer = buffer;
 	handle->sync_read = 1;
 	if (handle->prev < handle->cur) {
-		if (!handle->prev) {
-			error = load_header(handle,
-					(struct swsusp_info *)buffer);
+		if (handle->prev == 0) {
+			error = load_header(buffer);
+			if (error)
+				return error;
+
+			error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY);
 			if (error)
 				return error;
+
 		} else if (handle->prev <= nr_meta_pages) {
-			handle->pbe = unpack_orig_addresses(buffer,
-							handle->pbe);
-			if (!handle->pbe) {
-				error = prepare_image(handle);
+			unpack_orig_pfns(buffer, &copy_bm);
+			if (handle->prev == nr_meta_pages) {
+				error = prepare_image(&orig_bm, &copy_bm);
 				if (error)
 					return error;
-				handle->pbe = restore_pblist;
-				handle->last_pbe = NULL;
-				handle->buffer = get_buffer(handle);
+
+				chain_init(&ca, GFP_ATOMIC, PG_SAFE);
+				memory_bm_position_reset(&orig_bm);
+				restore_pblist = NULL;
+				handle->buffer = get_buffer(&orig_bm, &ca);
 				handle->sync_read = 0;
+				if (!handle->buffer)
+					return -ENOMEM;
 			}
 		} else {
-			handle->pbe = handle->pbe->next;
-			handle->buffer = get_buffer(handle);
+			handle->buffer = get_buffer(&orig_bm, &ca);
 			handle->sync_read = 0;
 		}
 		handle->prev = handle->cur;
@@ -1362,6 +1303,13 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
 
 int snapshot_image_loaded(struct snapshot_handle *handle)
 {
-	return !(!handle->pbe || handle->pbe->next || !nr_copy_pages ||
-		handle->cur <= nr_meta_pages + nr_copy_pages);
+	return !(!nr_copy_pages ||
+			handle->cur <= nr_meta_pages + nr_copy_pages);
+}
+
+void snapshot_free_unused_memory(struct snapshot_handle *handle)
+{
+	/* Free only if we have loaded the image entirely */
+	if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages)
+		memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
 }
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 8309d20b256..9b2ee5344de 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -331,8 +331,7 @@ static int enough_swap(unsigned int nr_pages)
 	unsigned int free_swap = count_swap_pages(root_swap, 1);
 
 	pr_debug("swsusp: free swap pages: %u\n", free_swap);
-	return free_swap > (nr_pages + PAGES_FOR_IO +
-		(nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
+	return free_swap > nr_pages + PAGES_FOR_IO;
 }
 
 /**
@@ -547,6 +546,7 @@ static int load_image(struct swap_map_handle *handle,
 		error = err2;
 	if (!error) {
 		printk("\b\b\b\bdone\n");
+		snapshot_free_unused_memory(snapshot);
 		if (!snapshot_image_loaded(snapshot))
 			error = -ENODATA;
 	}
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 0ef5e4ba39e..2e4499f3e4d 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -193,6 +193,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 			error = -EPERM;
 			break;
 		}
+		snapshot_free_unused_memory(&data->handle);
 		down(&pm_sem);
 		pm_prepare_console();
 		error = device_suspend(PMSG_FREEZE);
-- 
cgit v1.2.3


From c8eb8b4025175f967af0ba8e933f23aa9954dc35 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 25 Sep 2006 23:32:56 -0700
Subject: [PATCH] PM: make it possible to disable console suspending

Change suspend_console() so that it waits for all consoles to flush the
remaining messages and make it possible to switch the console suspending off
with the help of a Kconfig option.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Cc: Stefan Seyfried <seife@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/console.h |  5 +++++
 kernel/power/Kconfig    | 11 +++++++++++
 kernel/printk.c         |  3 +++
 3 files changed, 19 insertions(+)

diff --git a/include/linux/console.h b/include/linux/console.h
index 3bdf2155e56..76a1807726e 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -120,9 +120,14 @@ extern void console_stop(struct console *);
 extern void console_start(struct console *);
 extern int is_console_locked(void);
 
+#ifndef CONFIG_DISABLE_CONSOLE_SUSPEND
 /* Suspend and resume console messages over PM events */
 extern void suspend_console(void);
 extern void resume_console(void);
+#else
+static inline void suspend_console(void) {}
+static inline void resume_console(void) {}
+#endif /* CONFIG_DISABLE_CONSOLE_SUSPEND */
 
 /* Some debug stub to catch some of the obvious races in the VT code */
 #if 1
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 619ecabf7c5..4b6e2f18e05 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -36,6 +36,17 @@ config PM_DEBUG
 	code. This is helpful when debugging and reporting various PM bugs, 
 	like suspend support.
 
+config DISABLE_CONSOLE_SUSPEND
+	bool "Keep console(s) enabled during suspend/resume (DANGEROUS)"
+	depends on PM && PM_DEBUG
+	default n
+	---help---
+	This option turns off the console suspend mechanism that prevents
+	debug messages from reaching the console during the suspend/resume
+	operations.  This may be helpful when debugging device drivers'
+	suspend/resume routines, but may itself lead to problems, for example
+	if netconsole is used.
+
 config PM_TRACE
 	bool "Suspend/resume event tracing"
 	depends on PM && PM_DEBUG && X86_32 && EXPERIMENTAL
diff --git a/kernel/printk.c b/kernel/printk.c
index 1149365e989..771f5e861bc 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -721,6 +721,7 @@ int __init add_preferred_console(char *name, int idx, char *options)
 	return 0;
 }
 
+#ifndef CONFIG_DISABLE_CONSOLE_SUSPEND
 /**
  * suspend_console - suspend the console subsystem
  *
@@ -728,6 +729,7 @@ int __init add_preferred_console(char *name, int idx, char *options)
  */
 void suspend_console(void)
 {
+	printk("Suspending console(s)\n");
 	acquire_console_sem();
 	console_suspended = 1;
 }
@@ -737,6 +739,7 @@ void resume_console(void)
 	console_suspended = 0;
 	release_console_sem();
 }
+#endif /* CONFIG_DISABLE_CONSOLE_SUSPEND */
 
 /**
  * acquire_console_sem - lock the console system for exclusive use.
-- 
cgit v1.2.3


From e1da95ae38afdcda83328300c4aed755d9fc01a6 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 25 Sep 2006 23:32:57 -0700
Subject: [PATCH] suspend: make it possible to disable serial console suspend

Hack uart_suspend_port() and uart_resume_port() so that serial console
ports are not suspended if CONFIG_DISABLE_CONSOLE_SUSPEND is set.

This makes it possible to debug the suspend and resume routines of all
device drivers as well as the lowest-level swsusp code with the help of the
serial console.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/serial/serial_core.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/drivers/serial/serial_core.c b/drivers/serial/serial_core.c
index 372e47f7d59..5f7ba1adb30 100644
--- a/drivers/serial/serial_core.c
+++ b/drivers/serial/serial_core.c
@@ -1929,6 +1929,13 @@ int uart_suspend_port(struct uart_driver *drv, struct uart_port *port)
 
 	mutex_lock(&state->mutex);
 
+#ifdef CONFIG_DISABLE_CONSOLE_SUSPEND
+	if (uart_console(port)) {
+		mutex_unlock(&state->mutex);
+		return 0;
+	}
+#endif
+
 	if (state->info && state->info->flags & UIF_INITIALIZED) {
 		const struct uart_ops *ops = port->ops;
 
@@ -1967,6 +1974,13 @@ int uart_resume_port(struct uart_driver *drv, struct uart_port *port)
 
 	mutex_lock(&state->mutex);
 
+#ifdef CONFIG_DISABLE_CONSOLE_SUSPEND
+	if (uart_console(port)) {
+		mutex_unlock(&state->mutex);
+		return 0;
+	}
+#endif
+
 	uart_change_pm(state, 0);
 
 	/*
-- 
cgit v1.2.3


From 7d145aa3abf4d96c91f37c012facd5cfbb9010d1 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 25 Sep 2006 23:32:57 -0700
Subject: [PATCH] i386: Detect clock skew during suspend

Detect the situations in which the time after a resume from disk would be
earlier than the time before the suspend and prevent them from happening on
i386.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/time.c | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
index 6f333e7fb23..1302e4ab3c4 100644
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -270,16 +270,19 @@ void notify_arch_cmos_timer(void)
 	mod_timer(&sync_cmos_timer, jiffies + 1);
 }
 
-static long clock_cmos_diff, sleep_start;
+static long clock_cmos_diff;
+static unsigned long sleep_start;
 
 static int timer_suspend(struct sys_device *dev, pm_message_t state)
 {
 	/*
 	 * Estimate time zone so that set_time can update the clock
 	 */
-	clock_cmos_diff = -get_cmos_time();
+	unsigned long ctime =  get_cmos_time();
+
+	clock_cmos_diff = -ctime;
 	clock_cmos_diff += get_seconds();
-	sleep_start = get_cmos_time();
+	sleep_start = ctime;
 	return 0;
 }
 
@@ -287,16 +290,25 @@ static int timer_resume(struct sys_device *dev)
 {
 	unsigned long flags;
 	unsigned long sec;
-	unsigned long sleep_length;
+	unsigned long ctime = get_cmos_time();
+	long sleep_length = (ctime - sleep_start) * HZ;
 	struct timespec ts;
+
+	if (sleep_length < 0) {
+		printk(KERN_WARNING "CMOS clock skew detected in timer resume!\n");
+		/* The time after the resume must not be earlier than the time
+		 * before the suspend or some nasty things will happen
+		 */
+		sleep_length = 0;
+		ctime = sleep_start;
+	}
 #ifdef CONFIG_HPET_TIMER
 	if (is_hpet_enabled())
 		hpet_reenable();
 #endif
 	setup_pit_timer();
-	sec = get_cmos_time() + clock_cmos_diff;
-	sleep_length = (get_cmos_time() - sleep_start) * HZ;
 
+	sec = ctime + clock_cmos_diff;
 	ts.tv_sec = sec;
 	ts.tv_nsec = 0;
 	do_settimeofday(&ts);
-- 
cgit v1.2.3


From c5c6ba4e08ab9c9e390a0f3a7d9a5c332f5cc6ef Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 25 Sep 2006 23:32:58 -0700
Subject: [PATCH] PM: Add pm_trace switch

Add the pm_trace attribute in /sys/power which has to be explicitly set to
one to really enable the "PM tracing" code compiled in when CONFIG_PM_TRACE
is set (which modifies the machine's CMOS clock in unpredictable ways).

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/power/interface.txt | 15 +++++++++++++++
 include/linux/resume-trace.h      | 24 ++++++++++++++----------
 kernel/power/main.c               | 30 ++++++++++++++++++++++++++++++
 3 files changed, 59 insertions(+), 10 deletions(-)

diff --git a/Documentation/power/interface.txt b/Documentation/power/interface.txt
index 4117802af0f..a66bec222b1 100644
--- a/Documentation/power/interface.txt
+++ b/Documentation/power/interface.txt
@@ -52,3 +52,18 @@ suspend image will be as small as possible.
 
 Reading from this file will display the current image size limit, which
 is set to 500 MB by default.
+
+/sys/power/pm_trace controls the code which saves the last PM event point in
+the RTC across reboots, so that you can debug a machine that just hangs
+during suspend (or more commonly, during resume).  Namely, the RTC is only
+used to save the last PM event point if this file contains '1'.  Initially it
+contains '0' which may be changed to '1' by writing a string representing a
+nonzero integer into it.
+
+To use this debugging feature you should attempt to suspend the machine, then
+reboot it and run
+
+	dmesg -s 1000000 | grep 'hash matches'
+
+CAUTION: Using it will cause your machine's real-time (CMOS) clock to be
+set to a random invalid time after a resume.
diff --git a/include/linux/resume-trace.h b/include/linux/resume-trace.h
index a376bd4ade3..81e9299ca14 100644
--- a/include/linux/resume-trace.h
+++ b/include/linux/resume-trace.h
@@ -3,21 +3,25 @@
 
 #ifdef CONFIG_PM_TRACE
 
+extern int pm_trace_enabled;
+
 struct device;
 extern void set_trace_device(struct device *);
 extern void generate_resume_trace(void *tracedata, unsigned int user);
 
 #define TRACE_DEVICE(dev) set_trace_device(dev)
-#define TRACE_RESUME(user) do {				\
-	void *tracedata;				\
-	asm volatile("movl $1f,%0\n"			\
-		".section .tracedata,\"a\"\n"		\
-		"1:\t.word %c1\n"			\
-		"\t.long %c2\n"				\
-		".previous"				\
-		:"=r" (tracedata)			\
-		: "i" (__LINE__), "i" (__FILE__));	\
-	generate_resume_trace(tracedata, user);		\
+#define TRACE_RESUME(user) do {					\
+	if (pm_trace_enabled) {					\
+		void *tracedata;				\
+		asm volatile("movl $1f,%0\n"			\
+			".section .tracedata,\"a\"\n"		\
+			"1:\t.word %c1\n"			\
+			"\t.long %c2\n"				\
+			".previous"				\
+			:"=r" (tracedata)			\
+			: "i" (__LINE__), "i" (__FILE__));	\
+		generate_resume_trace(tracedata, user);		\
+	}							\
 } while (0)
 
 #else
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 4d403323a7b..873228c71da 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -17,6 +17,7 @@
 #include <linux/pm.h>
 #include <linux/console.h>
 #include <linux/cpu.h>
+#include <linux/resume-trace.h>
 
 #include "power.h"
 
@@ -281,10 +282,39 @@ static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n
 
 power_attr(state);
 
+#ifdef CONFIG_PM_TRACE
+int pm_trace_enabled;
+
+static ssize_t pm_trace_show(struct subsystem * subsys, char * buf)
+{
+	return sprintf(buf, "%d\n", pm_trace_enabled);
+}
+
+static ssize_t
+pm_trace_store(struct subsystem * subsys, const char * buf, size_t n)
+{
+	int val;
+
+	if (sscanf(buf, "%d", &val) == 1) {
+		pm_trace_enabled = !!val;
+		return n;
+	}
+	return -EINVAL;
+}
+
+power_attr(pm_trace);
+
+static struct attribute * g[] = {
+	&state_attr.attr,
+	&pm_trace_attr.attr,
+	NULL,
+};
+#else
 static struct attribute * g[] = {
 	&state_attr.attr,
 	NULL,
 };
+#endif /* CONFIG_PM_TRACE */
 
 static struct attribute_group attr_group = {
 	.attrs = g,
-- 
cgit v1.2.3


From 13c06be399902c9ebda08e092edb1614bb4a3761 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 25 Sep 2006 23:32:59 -0700
Subject: [PATCH] uml: Use klibc setjmp/longjmp

This patch adds an implementation of setjmp and longjmp to UML, allowing
access to the inside of a jmpbuf without needing the access macros formerly
provided by libc.

The implementation is stolen from klibc.  I copy the relevant files into
arch/um.  I have another patch which avoids the copying, but requires klibc be
in the tree.

setjmp and longjmp users required some tweaking.  Includes of <setjmp.h> were
removed and includes of the UML longjmp.h were added where necessary.  There
are also replacements of siglongjmp with UML_LONGJMP which I somehow missed
earlier.

Signed-off-by: Jeff Dike <jdike@addtoit.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/include/longjmp.h                  |  5 ++-
 arch/um/include/sysdep-i386/archsetjmp.h   | 19 ++++++++++
 arch/um/include/sysdep-x86_64/archsetjmp.h | 21 +++++++++++
 arch/um/os-Linux/process.c                 |  1 -
 arch/um/os-Linux/skas/process.c            |  3 +-
 arch/um/os-Linux/sys-i386/registers.c      | 10 +++---
 arch/um/os-Linux/sys-x86_64/registers.c    | 10 +++---
 arch/um/os-Linux/trap.c                    |  1 -
 arch/um/os-Linux/uaccess.c                 |  3 +-
 arch/um/os-Linux/util.c                    |  5 ++-
 arch/um/sys-i386/Makefile                  |  2 +-
 arch/um/sys-i386/setjmp.S                  | 58 ++++++++++++++++++++++++++++++
 arch/um/sys-x86_64/Makefile                |  4 +--
 arch/um/sys-x86_64/setjmp.S                | 54 ++++++++++++++++++++++++++++
 14 files changed, 173 insertions(+), 23 deletions(-)
 create mode 100644 arch/um/include/sysdep-i386/archsetjmp.h
 create mode 100644 arch/um/include/sysdep-x86_64/archsetjmp.h
 create mode 100644 arch/um/sys-i386/setjmp.S
 create mode 100644 arch/um/sys-x86_64/setjmp.S

diff --git a/arch/um/include/longjmp.h b/arch/um/include/longjmp.h
index 1b5c0131a12..e93c6d3e893 100644
--- a/arch/um/include/longjmp.h
+++ b/arch/um/include/longjmp.h
@@ -1,9 +1,12 @@
 #ifndef __UML_LONGJMP_H
 #define __UML_LONGJMP_H
 
-#include <setjmp.h>
+#include "sysdep/archsetjmp.h"
 #include "os.h"
 
+extern int setjmp(jmp_buf);
+extern void longjmp(jmp_buf, int);
+
 #define UML_LONGJMP(buf, val) do { \
 	longjmp(*buf, val);	\
 } while(0)
diff --git a/arch/um/include/sysdep-i386/archsetjmp.h b/arch/um/include/sysdep-i386/archsetjmp.h
new file mode 100644
index 00000000000..ea1ba3d42ae
--- /dev/null
+++ b/arch/um/include/sysdep-i386/archsetjmp.h
@@ -0,0 +1,19 @@
+/*
+ * arch/i386/include/klibc/archsetjmp.h
+ */
+
+#ifndef _KLIBC_ARCHSETJMP_H
+#define _KLIBC_ARCHSETJMP_H
+
+struct __jmp_buf {
+	unsigned int __ebx;
+	unsigned int __esp;
+	unsigned int __ebp;
+	unsigned int __esi;
+	unsigned int __edi;
+	unsigned int __eip;
+};
+
+typedef struct __jmp_buf jmp_buf[1];
+
+#endif				/* _SETJMP_H */
diff --git a/arch/um/include/sysdep-x86_64/archsetjmp.h b/arch/um/include/sysdep-x86_64/archsetjmp.h
new file mode 100644
index 00000000000..454fc60aff6
--- /dev/null
+++ b/arch/um/include/sysdep-x86_64/archsetjmp.h
@@ -0,0 +1,21 @@
+/*
+ * arch/x86_64/include/klibc/archsetjmp.h
+ */
+
+#ifndef _KLIBC_ARCHSETJMP_H
+#define _KLIBC_ARCHSETJMP_H
+
+struct __jmp_buf {
+	unsigned long __rbx;
+	unsigned long __rsp;
+	unsigned long __rbp;
+	unsigned long __r12;
+	unsigned long __r13;
+	unsigned long __r14;
+	unsigned long __r15;
+	unsigned long __rip;
+};
+
+typedef struct __jmp_buf jmp_buf[1];
+
+#endif				/* _SETJMP_H */
diff --git a/arch/um/os-Linux/process.c b/arch/um/os-Linux/process.c
index b98d3ca2cd1..3afde92ad2c 100644
--- a/arch/um/os-Linux/process.c
+++ b/arch/um/os-Linux/process.c
@@ -7,7 +7,6 @@
 #include <stdio.h>
 #include <errno.h>
 #include <signal.h>
-#include <setjmp.h>
 #include <linux/unistd.h>
 #include <sys/mman.h>
 #include <sys/wait.h>
diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c
index 7baf90fda58..50418a5e713 100644
--- a/arch/um/os-Linux/skas/process.c
+++ b/arch/um/os-Linux/skas/process.c
@@ -8,7 +8,6 @@
 #include <unistd.h>
 #include <errno.h>
 #include <signal.h>
-#include <setjmp.h>
 #include <sched.h>
 #include "ptrace_user.h"
 #include <sys/wait.h>
@@ -470,7 +469,7 @@ void thread_wait(void *sw, void *fb)
 	*switch_buf = &buf;
 	fork_buf = fb;
 	if(UML_SETJMP(&buf) == 0)
-		siglongjmp(*fork_buf, INIT_JMP_REMOVE_SIGSTACK);
+		UML_LONGJMP(fork_buf, INIT_JMP_REMOVE_SIGSTACK);
 }
 
 void switch_threads(void *me, void *next)
diff --git a/arch/um/os-Linux/sys-i386/registers.c b/arch/um/os-Linux/sys-i386/registers.c
index 516f66dd87e..1f90a2d7138 100644
--- a/arch/um/os-Linux/sys-i386/registers.c
+++ b/arch/um/os-Linux/sys-i386/registers.c
@@ -5,12 +5,12 @@
 
 #include <errno.h>
 #include <string.h>
-#include <setjmp.h>
 #include "sysdep/ptrace_user.h"
 #include "sysdep/ptrace.h"
 #include "uml-config.h"
 #include "skas_ptregs.h"
 #include "registers.h"
+#include "longjmp.h"
 #include "user.h"
 
 /* These are set once at boot time and not changed thereafter */
@@ -132,9 +132,9 @@ void get_safe_registers(unsigned long *regs, unsigned long *fp_regs)
 
 void get_thread_regs(union uml_pt_regs *uml_regs, void *buffer)
 {
-	struct __jmp_buf_tag *jmpbuf = buffer;
+	struct __jmp_buf *jmpbuf = buffer;
 
-	UPT_SET(uml_regs, EIP, jmpbuf->__jmpbuf[JB_PC]);
-	UPT_SET(uml_regs, UESP, jmpbuf->__jmpbuf[JB_SP]);
-	UPT_SET(uml_regs, EBP, jmpbuf->__jmpbuf[JB_BP]);
+	UPT_SET(uml_regs, EIP, jmpbuf->__eip);
+	UPT_SET(uml_regs, UESP, jmpbuf->__esp);
+	UPT_SET(uml_regs, EBP, jmpbuf->__ebp);
 }
diff --git a/arch/um/os-Linux/sys-x86_64/registers.c b/arch/um/os-Linux/sys-x86_64/registers.c
index becd898d939..e730447d6c0 100644
--- a/arch/um/os-Linux/sys-x86_64/registers.c
+++ b/arch/um/os-Linux/sys-x86_64/registers.c
@@ -5,11 +5,11 @@
 
 #include <errno.h>
 #include <string.h>
-#include <setjmp.h>
 #include "ptrace_user.h"
 #include "uml-config.h"
 #include "skas_ptregs.h"
 #include "registers.h"
+#include "longjmp.h"
 #include "user.h"
 
 /* These are set once at boot time and not changed thereafter */
@@ -80,9 +80,9 @@ void get_safe_registers(unsigned long *regs, unsigned long *fp_regs)
 
 void get_thread_regs(union uml_pt_regs *uml_regs, void *buffer)
 {
-	struct __jmp_buf_tag *jmpbuf = buffer;
+	struct __jmp_buf *jmpbuf = buffer;
 
-	UPT_SET(uml_regs, RIP, jmpbuf->__jmpbuf[JB_PC]);
-	UPT_SET(uml_regs, RSP, jmpbuf->__jmpbuf[JB_RSP]);
-	UPT_SET(uml_regs, RBP, jmpbuf->__jmpbuf[JB_RBP]);
+	UPT_SET(uml_regs, RIP, jmpbuf->__rip);
+	UPT_SET(uml_regs, RSP, jmpbuf->__rsp);
+	UPT_SET(uml_regs, RBP, jmpbuf->__rbp);
 }
diff --git a/arch/um/os-Linux/trap.c b/arch/um/os-Linux/trap.c
index 90b29ae9af4..1df231a2624 100644
--- a/arch/um/os-Linux/trap.c
+++ b/arch/um/os-Linux/trap.c
@@ -5,7 +5,6 @@
 
 #include <stdlib.h>
 #include <signal.h>
-#include <setjmp.h>
 #include "kern_util.h"
 #include "user_util.h"
 #include "os.h"
diff --git a/arch/um/os-Linux/uaccess.c b/arch/um/os-Linux/uaccess.c
index 865f6a6a259..bbb73a65037 100644
--- a/arch/um/os-Linux/uaccess.c
+++ b/arch/um/os-Linux/uaccess.c
@@ -4,8 +4,7 @@
  * Licensed under the GPL
  */
 
-#include <setjmp.h>
-#include <string.h>
+#include <stddef.h>
 #include "longjmp.h"
 
 unsigned long __do_user_copy(void *to, const void *from, int n,
diff --git a/arch/um/os-Linux/util.c b/arch/um/os-Linux/util.c
index c47a2a7ce70..3f5b1514e8a 100644
--- a/arch/um/os-Linux/util.c
+++ b/arch/um/os-Linux/util.c
@@ -7,7 +7,6 @@
 #include <stdlib.h>
 #include <unistd.h>
 #include <limits.h>
-#include <setjmp.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <sys/utsname.h>
@@ -107,11 +106,11 @@ int setjmp_wrapper(void (*proc)(void *, void *), ...)
 	jmp_buf buf;
 	int n;
 
-	n = sigsetjmp(buf, 1);
+	n = UML_SETJMP(&buf);
 	if(n == 0){
 		va_start(args, proc);
 		(*proc)(&buf, &args);
 	}
 	va_end(args);
-	return(n);
+	return n;
 }
diff --git a/arch/um/sys-i386/Makefile b/arch/um/sys-i386/Makefile
index 374d61a1943..59cc7027575 100644
--- a/arch/um/sys-i386/Makefile
+++ b/arch/um/sys-i386/Makefile
@@ -1,5 +1,5 @@
 obj-y = bugs.o checksum.o delay.o fault.o ksyms.o ldt.o ptrace.o \
-	ptrace_user.o signal.o sigcontext.o syscalls.o sysrq.o \
+	ptrace_user.o setjmp.o signal.o sigcontext.o syscalls.o sysrq.o \
 	sys_call_table.o tls.o
 
 obj-$(CONFIG_MODE_SKAS) += stub.o stub_segv.o
diff --git a/arch/um/sys-i386/setjmp.S b/arch/um/sys-i386/setjmp.S
new file mode 100644
index 00000000000..b766792c993
--- /dev/null
+++ b/arch/um/sys-i386/setjmp.S
@@ -0,0 +1,58 @@
+#
+# arch/i386/setjmp.S
+#
+# setjmp/longjmp for the i386 architecture
+#
+
+#
+# The jmp_buf is assumed to contain the following, in order:
+#	%ebx
+#	%esp
+#	%ebp
+#	%esi
+#	%edi
+#	<return address>
+#
+
+	.text
+	.align 4
+	.globl setjmp
+	.type setjmp, @function
+setjmp:
+#ifdef _REGPARM
+	movl %eax,%edx
+#else
+	movl 4(%esp),%edx
+#endif
+	popl %ecx			# Return address, and adjust the stack
+	xorl %eax,%eax			# Return value
+	movl %ebx,(%edx)
+	movl %esp,4(%edx)		# Post-return %esp!
+	pushl %ecx			# Make the call/return stack happy
+	movl %ebp,8(%edx)
+	movl %esi,12(%edx)
+	movl %edi,16(%edx)
+	movl %ecx,20(%edx)		# Return address
+	ret
+
+	.size setjmp,.-setjmp
+
+	.text
+	.align 4
+	.globl longjmp
+	.type longjmp, @function
+longjmp:
+#ifdef _REGPARM
+	xchgl %eax,%edx
+#else
+	movl 4(%esp),%edx		# jmp_ptr address
+	movl 8(%esp),%eax		# Return value
+#endif
+	movl (%edx),%ebx
+	movl 4(%edx),%esp
+	movl 8(%edx),%ebp
+	movl 12(%edx),%esi
+	movl 16(%edx),%edi
+	jmp *20(%edx)
+
+	.size longjmp,.-longjmp
diff --git a/arch/um/sys-x86_64/Makefile b/arch/um/sys-x86_64/Makefile
index c19794d435d..f41768b8e25 100644
--- a/arch/um/sys-x86_64/Makefile
+++ b/arch/um/sys-x86_64/Makefile
@@ -5,8 +5,8 @@
 #
 
 obj-y = bugs.o delay.o fault.o ldt.o mem.o ptrace.o ptrace_user.o \
-	sigcontext.o signal.o syscalls.o syscall_table.o sysrq.o ksyms.o \
-	tls.o
+	setjmp.o sigcontext.o signal.o syscalls.o syscall_table.o sysrq.o \
+	ksyms.o tls.o
 
 obj-$(CONFIG_MODE_SKAS) += stub.o stub_segv.o
 obj-$(CONFIG_MODULES) += um_module.o
diff --git a/arch/um/sys-x86_64/setjmp.S b/arch/um/sys-x86_64/setjmp.S
new file mode 100644
index 00000000000..45f547b4043
--- /dev/null
+++ b/arch/um/sys-x86_64/setjmp.S
@@ -0,0 +1,54 @@
+#
+# arch/x86_64/setjmp.S
+#
+# setjmp/longjmp for the x86-64 architecture
+#
+
+#
+# The jmp_buf is assumed to contain the following, in order:
+#	%rbx
+#	%rsp (post-return)
+#	%rbp
+#	%r12
+#	%r13
+#	%r14
+#	%r15
+#	<return address>
+#
+
+	.text
+	.align 4
+	.globl setjmp
+	.type setjmp, @function
+setjmp:
+	pop  %rsi			# Return address, and adjust the stack
+	xorl %eax,%eax			# Return value
+	movq %rbx,(%rdi)
+	movq %rsp,8(%rdi)		# Post-return %rsp!
+	push %rsi			# Make the call/return stack happy
+	movq %rbp,16(%rdi)
+	movq %r12,24(%rdi)
+	movq %r13,32(%rdi)
+	movq %r14,40(%rdi)
+	movq %r15,48(%rdi)
+	movq %rsi,56(%rdi)		# Return address
+	ret
+
+	.size setjmp,.-setjmp
+
+	.text
+	.align 4
+	.globl longjmp
+	.type longjmp, @function
+longjmp:
+	movl %esi,%eax			# Return value (int)
+	movq (%rdi),%rbx
+	movq 8(%rdi),%rsp
+	movq 16(%rdi),%rbp
+	movq 24(%rdi),%r12
+	movq 32(%rdi),%r13
+	movq 40(%rdi),%r14
+	movq 48(%rdi),%r15
+	jmp *56(%rdi)
+
+	.size longjmp,.-longjmp
-- 
cgit v1.2.3


From 91b165c0594ab78c64f26d26e3174e6dfd60ed9d Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 25 Sep 2006 23:33:00 -0700
Subject: [PATCH] uml: Use ARRAY_SIZE more assiduously

There were a bunch of missed ARRAY_SIZE opportunities.

Also, some formatting fixes in the affected areas of code.

Signed-off-by: Jeff Dike <jdike@addtoit.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/drivers/chan_kern.c     |  2 +-
 arch/um/drivers/mconsole_kern.c |  2 +-
 arch/um/drivers/mconsole_user.c |  7 ++++---
 arch/um/drivers/pcap_kern.c     |  2 +-
 arch/um/kernel/mem.c            |  3 ++-
 arch/um/kernel/reboot.c         | 13 +------------
 arch/um/kernel/tlb.c            |  5 +++--
 arch/um/os-Linux/mem.c          |  6 +++---
 arch/um/sys-i386/bugs.c         |  9 +++++----
 arch/um/sys-i386/ldt.c          |  3 +--
 arch/um/sys-i386/ptrace_user.c  |  5 +++--
 11 files changed, 25 insertions(+), 32 deletions(-)

diff --git a/arch/um/drivers/chan_kern.c b/arch/um/drivers/chan_kern.c
index 7218c754505..e82764f75e7 100644
--- a/arch/um/drivers/chan_kern.c
+++ b/arch/um/drivers/chan_kern.c
@@ -544,7 +544,7 @@ static struct chan *parse_chan(struct line *line, char *str, int device,
 
 	ops = NULL;
 	data = NULL;
-	for(i = 0; i < sizeof(chan_table)/sizeof(chan_table[0]); i++){
+	for(i = 0; i < ARRAY_SIZE(chan_table); i++){
 		entry = &chan_table[i];
 		if(!strncmp(str, entry->key, strlen(entry->key))){
 			ops = entry->ops;
diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index b414522f768..79610b5ce67 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -497,7 +497,7 @@ static void mconsole_get_config(int (*get_config)(char *, char *, int,
 	}
 
 	error = NULL;
-	size = sizeof(default_buf)/sizeof(default_buf[0]);
+	size = ARRAY_SIZE(default_buf);
 	buf = default_buf;
 
 	while(1){
diff --git a/arch/um/drivers/mconsole_user.c b/arch/um/drivers/mconsole_user.c
index 9bfd405c3bd..5b2f5fe9e42 100644
--- a/arch/um/drivers/mconsole_user.c
+++ b/arch/um/drivers/mconsole_user.c
@@ -16,6 +16,7 @@
 #include "user.h"
 #include "mconsole.h"
 #include "umid.h"
+#include "user_util.h"
 
 static struct mconsole_command commands[] = {
 	/* With uts namespaces, uts information becomes process-specific, so
@@ -65,14 +66,14 @@ static struct mconsole_command *mconsole_parse(struct mc_request *req)
 	struct mconsole_command *cmd;
 	int i;
 
-	for(i=0;i<sizeof(commands)/sizeof(commands[0]);i++){
+	for(i = 0; i < ARRAY_SIZE(commands); i++){
 		cmd = &commands[i];
 		if(!strncmp(req->request.data, cmd->command, 
 			    strlen(cmd->command))){
-			return(cmd);
+			return cmd;
 		}
 	}
-	return(NULL);
+	return NULL;
 }
 
 #define MIN(a,b) ((a)<(b) ? (a):(b))
diff --git a/arch/um/drivers/pcap_kern.c b/arch/um/drivers/pcap_kern.c
index 466ff2c2f91..4c767c7adb9 100644
--- a/arch/um/drivers/pcap_kern.c
+++ b/arch/um/drivers/pcap_kern.c
@@ -76,7 +76,7 @@ int pcap_setup(char *str, char **mac_out, void *data)
 	if(host_if != NULL)
 		init->host_if = host_if;
 
-	for(i = 0; i < sizeof(options)/sizeof(options[0]); i++){
+	for(i = 0; i < ARRAY_SIZE(options); i++){
 		if(options[i] == NULL)
 			continue;
 		if(!strcmp(options[i], "promisc"))
diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c
index b1cd5c6e468..93121c6d26e 100644
--- a/arch/um/kernel/mem.c
+++ b/arch/um/kernel/mem.c
@@ -223,8 +223,9 @@ void paging_init(void)
 
 	empty_zero_page = (unsigned long *) alloc_bootmem_low_pages(PAGE_SIZE);
 	empty_bad_page = (unsigned long *) alloc_bootmem_low_pages(PAGE_SIZE);
-	for(i=0;i<sizeof(zones_size)/sizeof(zones_size[0]);i++) 
+	for(i = 0; i < ARRAY_SIZE(zones_size); i++)
 		zones_size[i] = 0;
+
 	zones_size[ZONE_DMA] = (end_iomem >> PAGE_SHIFT) - (uml_physmem >> PAGE_SHIFT);
 #ifdef CONFIG_HIGHMEM
 	zones_size[ZONE_HIGHMEM] = highmem >> PAGE_SHIFT;
diff --git a/arch/um/kernel/reboot.c b/arch/um/kernel/reboot.c
index 3ef73bf2e78..f602623644a 100644
--- a/arch/um/kernel/reboot.c
+++ b/arch/um/kernel/reboot.c
@@ -22,7 +22,7 @@ static void kill_idlers(int me)
 	struct task_struct *p;
 	int i;
 
-	for(i = 0; i < sizeof(idle_threads)/sizeof(idle_threads[0]); i++){
+	for(i = 0; i < ARRAY_SIZE(idle_threads); i++){
 		p = idle_threads[i];
 		if((p != NULL) && (p->thread.mode.tt.extern_pid != me))
 			os_kill_process(p->thread.mode.tt.extern_pid, 0);
@@ -62,14 +62,3 @@ void machine_halt(void)
 {
 	machine_power_off();
 }
-
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c
index f5b0636f9ad..cca330edf71 100644
--- a/arch/um/kernel/tlb.c
+++ b/arch/um/kernel/tlb.c
@@ -137,10 +137,11 @@ void fix_range_common(struct mm_struct *mm, unsigned long start_addr,
         int r, w, x;
         struct host_vm_op ops[1];
         void *flush = NULL;
-        int op_index = -1, last_op = sizeof(ops) / sizeof(ops[0]) - 1;
+        int op_index = -1, last_op = ARRAY_SIZE(ops) - 1;
         int ret = 0;
 
-        if(mm == NULL) return;
+        if(mm == NULL)
+		return;
 
         ops[0].type = NONE;
         for(addr = start_addr; addr < end_addr && !ret;){
diff --git a/arch/um/os-Linux/mem.c b/arch/um/os-Linux/mem.c
index 560c8063c77..b170b4704dc 100644
--- a/arch/um/os-Linux/mem.c
+++ b/arch/um/os-Linux/mem.c
@@ -114,14 +114,14 @@ static void which_tmpdir(void)
 	}
 
 	while(1){
-		found = next(fd, buf, sizeof(buf) / sizeof(buf[0]), ' ');
+		found = next(fd, buf, ARRAY_SIZE(buf), ' ');
 		if(found != 1)
 			break;
 
 		if(!strncmp(buf, "/dev/shm", strlen("/dev/shm")))
 			goto found;
 
-		found = next(fd, buf, sizeof(buf) / sizeof(buf[0]), '\n');
+		found = next(fd, buf, ARRAY_SIZE(buf), '\n');
 		if(found != 1)
 			break;
 	}
@@ -135,7 +135,7 @@ err:
 	return;
 
 found:
-	found = next(fd, buf, sizeof(buf) / sizeof(buf[0]), ' ');
+	found = next(fd, buf, ARRAY_SIZE(buf), ' ');
 	if(found != 1)
 		goto err;
 
diff --git a/arch/um/sys-i386/bugs.c b/arch/um/sys-i386/bugs.c
index 41b0ab2fe83..f1bcd399ac9 100644
--- a/arch/um/sys-i386/bugs.c
+++ b/arch/um/sys-i386/bugs.c
@@ -13,6 +13,7 @@
 #include "sysdep/ptrace.h"
 #include "task.h"
 #include "os.h"
+#include "user_util.h"
 
 #define MAXTOKEN 64
 
@@ -104,17 +105,17 @@ int cpu_feature(char *what, char *buf, int len)
 static int check_cpu_flag(char *feature, int *have_it)
 {
 	char buf[MAXTOKEN], c;
-	int fd, len = sizeof(buf)/sizeof(buf[0]);
+	int fd, len = ARRAY_SIZE(buf);
 
 	printk("Checking for host processor %s support...", feature);
 	fd = os_open_file("/proc/cpuinfo", of_read(OPENFLAGS()), 0);
 	if(fd < 0){
 		printk("Couldn't open /proc/cpuinfo, err = %d\n", -fd);
-		return(0);
+		return 0;
 	}
 
 	*have_it = 0;
-	if(!find_cpuinfo_line(fd, "flags", buf, sizeof(buf) / sizeof(buf[0])))
+	if(!find_cpuinfo_line(fd, "flags", buf, ARRAY_SIZE(buf)))
 		goto out;
 
 	c = token(fd, buf, len - 1, ' ');
@@ -138,7 +139,7 @@ static int check_cpu_flag(char *feature, int *have_it)
 	if(*have_it == 0) printk("No\n");
 	else if(*have_it == 1) printk("Yes\n");
 	os_close_file(fd);
-	return(1);
+	return 1;
 }
 
 #if 0 /* This doesn't work in tt mode, plus it's causing compilation problems
diff --git a/arch/um/sys-i386/ldt.c b/arch/um/sys-i386/ldt.c
index fe0877b3509..69971b78bea 100644
--- a/arch/um/sys-i386/ldt.c
+++ b/arch/um/sys-i386/ldt.c
@@ -424,9 +424,8 @@ void ldt_get_host_info(void)
 			size++;
 	}
 
-	if(size < sizeof(dummy_list)/sizeof(dummy_list[0])) {
+	if(size < ARRAY_SIZE(dummy_list))
 		host_ldt_entries = dummy_list;
-	}
 	else {
 		size = (size + 1) * sizeof(dummy_list[0]);
 		host_ldt_entries = (short *)kmalloc(size, GFP_KERNEL);
diff --git a/arch/um/sys-i386/ptrace_user.c b/arch/um/sys-i386/ptrace_user.c
index 40aa8853144..5f3cc668582 100644
--- a/arch/um/sys-i386/ptrace_user.c
+++ b/arch/um/sys-i386/ptrace_user.c
@@ -15,6 +15,7 @@
 #include "user.h"
 #include "os.h"
 #include "uml-config.h"
+#include "user_util.h"
 
 int ptrace_getregs(long pid, unsigned long *regs_out)
 {
@@ -51,7 +52,7 @@ static void write_debugregs(int pid, unsigned long *regs)
 	int nregs, i;
 
 	dummy = NULL;
-	nregs = sizeof(dummy->u_debugreg)/sizeof(dummy->u_debugreg[0]);
+	nregs = ARRAY_SIZE(dummy->u_debugreg);
 	for(i = 0; i < nregs; i++){
 		if((i == 4) || (i == 5)) continue;
 		if(ptrace(PTRACE_POKEUSR, pid, &dummy->u_debugreg[i],
@@ -68,7 +69,7 @@ static void read_debugregs(int pid, unsigned long *regs)
 	int nregs, i;
 
 	dummy = NULL;
-	nregs = sizeof(dummy->u_debugreg)/sizeof(dummy->u_debugreg[0]);
+	nregs = ARRAY_SIZE(dummy->u_debugreg);
 	for(i = 0; i < nregs; i++){
 		regs[i] = ptrace(PTRACE_PEEKUSR, pid,
 				 &dummy->u_debugreg[i], 0);
-- 
cgit v1.2.3


From 8f80e9466e18288df7391c9d21532c4125ac9c62 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 25 Sep 2006 23:33:01 -0700
Subject: [PATCH] uml: Fix stack alignment

Stack randomization needs to be conditional on the personality allowing it.

Signed-off-by: Jeff Dike <jdike@addtoit.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/kernel/process_kern.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/um/kernel/process_kern.c b/arch/um/kernel/process_kern.c
index f6a5a502120..537895d68ad 100644
--- a/arch/um/kernel/process_kern.c
+++ b/arch/um/kernel/process_kern.c
@@ -23,6 +23,7 @@
 #include "linux/proc_fs.h"
 #include "linux/ptrace.h"
 #include "linux/random.h"
+#include "linux/personality.h"
 #include "asm/unistd.h"
 #include "asm/mman.h"
 #include "asm/segment.h"
@@ -476,7 +477,7 @@ int singlestepping(void * t)
 #ifndef arch_align_stack
 unsigned long arch_align_stack(unsigned long sp)
 {
-	if (randomize_va_space)
+	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
 		sp -= get_random_int() % 8192;
 	return sp & ~0xf;
 }
-- 
cgit v1.2.3


From 5e1f65a67d76341795ea527d30bfdca03999d46b Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 25 Sep 2006 23:33:01 -0700
Subject: [PATCH] uml: Whitespace fixes

arch/um/kernel/tlb.c had some pretty serious whitespace problems.  I also
fixed some returns.

Signed-off-by: Jeff Dike <jdike@addtoit.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/kernel/tlb.c | 367 +++++++++++++++++++++++++--------------------------
 1 file changed, 183 insertions(+), 184 deletions(-)

diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c
index cca330edf71..54a5ff25645 100644
--- a/arch/um/kernel/tlb.c
+++ b/arch/um/kernel/tlb.c
@@ -1,4 +1,4 @@
-/* 
+/*
  * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
  * Licensed under the GPL
  */
@@ -16,12 +16,12 @@
 #include "os.h"
 
 static int add_mmap(unsigned long virt, unsigned long phys, unsigned long len,
- 		    int r, int w, int x, struct host_vm_op *ops, int *index,
+		    int r, int w, int x, struct host_vm_op *ops, int *index,
 		    int last_filled, union mm_context *mmu, void **flush,
 		    int (*do_ops)(union mm_context *, struct host_vm_op *,
 				  int, int, void **))
 {
-        __u64 offset;
+	__u64 offset;
 	struct host_vm_op *last;
 	int fd, ret = 0;
 
@@ -89,7 +89,7 @@ static int add_munmap(unsigned long addr, unsigned long len,
 static int add_mprotect(unsigned long addr, unsigned long len, int r, int w,
 			int x, struct host_vm_op *ops, int *index,
 			int last_filled, union mm_context *mmu, void **flush,
- 			int (*do_ops)(union mm_context *, struct host_vm_op *,
+			int (*do_ops)(union mm_context *, struct host_vm_op *,
 				      int, int, void **))
 {
 	struct host_vm_op *last;
@@ -124,106 +124,105 @@ static int add_mprotect(unsigned long addr, unsigned long len, int r, int w,
 #define ADD_ROUND(n, inc) (((n) + (inc)) & ~((inc) - 1))
 
 void fix_range_common(struct mm_struct *mm, unsigned long start_addr,
-                      unsigned long end_addr, int force,
+		      unsigned long end_addr, int force,
 		      int (*do_ops)(union mm_context *, struct host_vm_op *,
 				    int, int, void **))
 {
-        pgd_t *npgd;
-        pud_t *npud;
-        pmd_t *npmd;
-        pte_t *npte;
-        union mm_context *mmu = &mm->context;
-        unsigned long addr, end;
-        int r, w, x;
-        struct host_vm_op ops[1];
-        void *flush = NULL;
-        int op_index = -1, last_op = ARRAY_SIZE(ops) - 1;
-        int ret = 0;
-
-        if(mm == NULL)
+	pgd_t *npgd;
+	pud_t *npud;
+	pmd_t *npmd;
+	pte_t *npte;
+	union mm_context *mmu = &mm->context;
+	unsigned long addr, end;
+	int r, w, x;
+	struct host_vm_op ops[1];
+	void *flush = NULL;
+	int op_index = -1, last_op = ARRAY_SIZE(ops) - 1;
+	int ret = 0;
+
+	if(mm == NULL)
 		return;
 
-        ops[0].type = NONE;
-        for(addr = start_addr; addr < end_addr && !ret;){
-                npgd = pgd_offset(mm, addr);
-                if(!pgd_present(*npgd)){
-                        end = ADD_ROUND(addr, PGDIR_SIZE);
-                        if(end > end_addr)
-                                end = end_addr;
-                        if(force || pgd_newpage(*npgd)){
-                                ret = add_munmap(addr, end - addr, ops,
-                                                 &op_index, last_op, mmu,
-                                                 &flush, do_ops);
-                                pgd_mkuptodate(*npgd);
-                        }
-                        addr = end;
-                        continue;
-                }
-
-                npud = pud_offset(npgd, addr);
-                if(!pud_present(*npud)){
-                        end = ADD_ROUND(addr, PUD_SIZE);
-                        if(end > end_addr)
-                                end = end_addr;
-                        if(force || pud_newpage(*npud)){
-                                ret = add_munmap(addr, end - addr, ops,
-                                                 &op_index, last_op, mmu,
-                                                 &flush, do_ops);
-                                pud_mkuptodate(*npud);
-                        }
-                        addr = end;
-                        continue;
-                }
-
-                npmd = pmd_offset(npud, addr);
-                if(!pmd_present(*npmd)){
-                        end = ADD_ROUND(addr, PMD_SIZE);
-                        if(end > end_addr)
-                                end = end_addr;
-                        if(force || pmd_newpage(*npmd)){
-                                ret = add_munmap(addr, end - addr, ops,
-                                                 &op_index, last_op, mmu,
-                                                 &flush, do_ops);
-                                pmd_mkuptodate(*npmd);
-                        }
-                        addr = end;
-                        continue;
-                }
-
-                npte = pte_offset_kernel(npmd, addr);
-                r = pte_read(*npte);
-                w = pte_write(*npte);
-                x = pte_exec(*npte);
+	ops[0].type = NONE;
+	for(addr = start_addr; addr < end_addr && !ret;){
+		npgd = pgd_offset(mm, addr);
+		if(!pgd_present(*npgd)){
+			end = ADD_ROUND(addr, PGDIR_SIZE);
+			if(end > end_addr)
+				end = end_addr;
+			if(force || pgd_newpage(*npgd)){
+				ret = add_munmap(addr, end - addr, ops,
+						 &op_index, last_op, mmu,
+						 &flush, do_ops);
+				pgd_mkuptodate(*npgd);
+			}
+			addr = end;
+			continue;
+		}
+
+		npud = pud_offset(npgd, addr);
+		if(!pud_present(*npud)){
+			end = ADD_ROUND(addr, PUD_SIZE);
+			if(end > end_addr)
+				end = end_addr;
+			if(force || pud_newpage(*npud)){
+				ret = add_munmap(addr, end - addr, ops,
+						 &op_index, last_op, mmu,
+						 &flush, do_ops);
+				pud_mkuptodate(*npud);
+			}
+			addr = end;
+			continue;
+		}
+
+		npmd = pmd_offset(npud, addr);
+		if(!pmd_present(*npmd)){
+			end = ADD_ROUND(addr, PMD_SIZE);
+			if(end > end_addr)
+				end = end_addr;
+			if(force || pmd_newpage(*npmd)){
+				ret = add_munmap(addr, end - addr, ops,
+						 &op_index, last_op, mmu,
+						 &flush, do_ops);
+				pmd_mkuptodate(*npmd);
+			}
+			addr = end;
+			continue;
+		}
+
+		npte = pte_offset_kernel(npmd, addr);
+		r = pte_read(*npte);
+		w = pte_write(*npte);
+		x = pte_exec(*npte);
 		if (!pte_young(*npte)) {
 			r = 0;
 			w = 0;
 		} else if (!pte_dirty(*npte)) {
 			w = 0;
 		}
-                if(force || pte_newpage(*npte)){
-                        if(pte_present(*npte))
-			  ret = add_mmap(addr,
-					 pte_val(*npte) & PAGE_MASK,
-					 PAGE_SIZE, r, w, x, ops,
-					 &op_index, last_op, mmu,
-					 &flush, do_ops);
+		if(force || pte_newpage(*npte)){
+			if(pte_present(*npte))
+				ret = add_mmap(addr,
+					       pte_val(*npte) & PAGE_MASK,
+					       PAGE_SIZE, r, w, x, ops,
+					       &op_index, last_op, mmu,
+					       &flush, do_ops);
 			else ret = add_munmap(addr, PAGE_SIZE, ops,
 					      &op_index, last_op, mmu,
 					      &flush, do_ops);
-                }
-                else if(pte_newprot(*npte))
+		}
+		else if(pte_newprot(*npte))
 			ret = add_mprotect(addr, PAGE_SIZE, r, w, x, ops,
 					   &op_index, last_op, mmu,
 					   &flush, do_ops);
 
-                *npte = pte_mkuptodate(*npte);
-                addr += PAGE_SIZE;
-        }
-
+		*npte = pte_mkuptodate(*npte);
+		addr += PAGE_SIZE;
+	}
 	if(!ret)
 		ret = (*do_ops)(mmu, ops, op_index, 1, &flush);
 
-	/* This is not an else because ret is modified above */
+/* This is not an else because ret is modified above */
 	if(ret) {
 		printk("fix_range_common: failed, killing current process\n");
 		force_sig(SIGKILL, current);
@@ -232,160 +231,160 @@ void fix_range_common(struct mm_struct *mm, unsigned long start_addr,
 
 int flush_tlb_kernel_range_common(unsigned long start, unsigned long end)
 {
-        struct mm_struct *mm;
-        pgd_t *pgd;
-        pud_t *pud;
-        pmd_t *pmd;
-        pte_t *pte;
-        unsigned long addr, last;
-        int updated = 0, err;
-
-        mm = &init_mm;
-        for(addr = start; addr < end;){
-                pgd = pgd_offset(mm, addr);
-                if(!pgd_present(*pgd)){
-                        last = ADD_ROUND(addr, PGDIR_SIZE);
-                        if(last > end)
-                                last = end;
-                        if(pgd_newpage(*pgd)){
-                                updated = 1;
-                                err = os_unmap_memory((void *) addr,
-                                                      last - addr);
-                                if(err < 0)
-                                        panic("munmap failed, errno = %d\n",
-                                              -err);
-                        }
-                        addr = last;
-                        continue;
-                }
-
-                pud = pud_offset(pgd, addr);
-                if(!pud_present(*pud)){
-                        last = ADD_ROUND(addr, PUD_SIZE);
-                        if(last > end)
-                                last = end;
-                        if(pud_newpage(*pud)){
-                                updated = 1;
-                                err = os_unmap_memory((void *) addr,
-                                                      last - addr);
-                                if(err < 0)
-                                        panic("munmap failed, errno = %d\n",
-                                              -err);
-                        }
-                        addr = last;
-                        continue;
-                }
-
-                pmd = pmd_offset(pud, addr);
-                if(!pmd_present(*pmd)){
-                        last = ADD_ROUND(addr, PMD_SIZE);
-                        if(last > end)
-                                last = end;
-                        if(pmd_newpage(*pmd)){
-                                updated = 1;
-                                err = os_unmap_memory((void *) addr,
-                                                      last - addr);
-                                if(err < 0)
-                                        panic("munmap failed, errno = %d\n",
-                                              -err);
-                        }
-                        addr = last;
-                        continue;
-                }
-
-                pte = pte_offset_kernel(pmd, addr);
-                if(!pte_present(*pte) || pte_newpage(*pte)){
-                        updated = 1;
-                        err = os_unmap_memory((void *) addr,
-                                              PAGE_SIZE);
-                        if(err < 0)
-                                panic("munmap failed, errno = %d\n",
-                                      -err);
-                        if(pte_present(*pte))
-                                map_memory(addr,
-                                           pte_val(*pte) & PAGE_MASK,
-                                           PAGE_SIZE, 1, 1, 1);
-                }
-                else if(pte_newprot(*pte)){
-                        updated = 1;
-                        os_protect_memory((void *) addr, PAGE_SIZE, 1, 1, 1);
-                }
-                addr += PAGE_SIZE;
-        }
-        return(updated);
+	struct mm_struct *mm;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	unsigned long addr, last;
+	int updated = 0, err;
+
+	mm = &init_mm;
+	for(addr = start; addr < end;){
+		pgd = pgd_offset(mm, addr);
+		if(!pgd_present(*pgd)){
+			last = ADD_ROUND(addr, PGDIR_SIZE);
+			if(last > end)
+				last = end;
+			if(pgd_newpage(*pgd)){
+				updated = 1;
+				err = os_unmap_memory((void *) addr,
+						      last - addr);
+				if(err < 0)
+					panic("munmap failed, errno = %d\n",
+					      -err);
+			}
+			addr = last;
+			continue;
+		}
+
+		pud = pud_offset(pgd, addr);
+		if(!pud_present(*pud)){
+			last = ADD_ROUND(addr, PUD_SIZE);
+			if(last > end)
+				last = end;
+			if(pud_newpage(*pud)){
+				updated = 1;
+				err = os_unmap_memory((void *) addr,
+						      last - addr);
+				if(err < 0)
+					panic("munmap failed, errno = %d\n",
+					      -err);
+			}
+			addr = last;
+			continue;
+		}
+
+		pmd = pmd_offset(pud, addr);
+		if(!pmd_present(*pmd)){
+			last = ADD_ROUND(addr, PMD_SIZE);
+			if(last > end)
+				last = end;
+			if(pmd_newpage(*pmd)){
+				updated = 1;
+				err = os_unmap_memory((void *) addr,
+						      last - addr);
+				if(err < 0)
+					panic("munmap failed, errno = %d\n",
+					      -err);
+			}
+			addr = last;
+			continue;
+		}
+
+		pte = pte_offset_kernel(pmd, addr);
+		if(!pte_present(*pte) || pte_newpage(*pte)){
+			updated = 1;
+			err = os_unmap_memory((void *) addr,
+					      PAGE_SIZE);
+			if(err < 0)
+				panic("munmap failed, errno = %d\n",
+				      -err);
+			if(pte_present(*pte))
+				map_memory(addr,
+					   pte_val(*pte) & PAGE_MASK,
+					   PAGE_SIZE, 1, 1, 1);
+		}
+		else if(pte_newprot(*pte)){
+			updated = 1;
+			os_protect_memory((void *) addr, PAGE_SIZE, 1, 1, 1);
+		}
+		addr += PAGE_SIZE;
+	}
+	return(updated);
 }
 
 pgd_t *pgd_offset_proc(struct mm_struct *mm, unsigned long address)
 {
-        return(pgd_offset(mm, address));
+	return(pgd_offset(mm, address));
 }
 
 pud_t *pud_offset_proc(pgd_t *pgd, unsigned long address)
 {
-        return(pud_offset(pgd, address));
+	return(pud_offset(pgd, address));
 }
 
 pmd_t *pmd_offset_proc(pud_t *pud, unsigned long address)
 {
-        return(pmd_offset(pud, address));
+	return(pmd_offset(pud, address));
 }
 
 pte_t *pte_offset_proc(pmd_t *pmd, unsigned long address)
 {
-        return(pte_offset_kernel(pmd, address));
+	return(pte_offset_kernel(pmd, address));
 }
 
 pte_t *addr_pte(struct task_struct *task, unsigned long addr)
 {
-        pgd_t *pgd = pgd_offset(task->mm, addr);
-        pud_t *pud = pud_offset(pgd, addr);
-        pmd_t *pmd = pmd_offset(pud, addr);
+	pgd_t *pgd = pgd_offset(task->mm, addr);
+	pud_t *pud = pud_offset(pgd, addr);
+	pmd_t *pmd = pmd_offset(pud, addr);
 
-        return(pte_offset_map(pmd, addr));
+	return(pte_offset_map(pmd, addr));
 }
 
 void flush_tlb_page(struct vm_area_struct *vma, unsigned long address)
 {
-        address &= PAGE_MASK;
-        flush_tlb_range(vma, address, address + PAGE_SIZE);
+	address &= PAGE_MASK;
+	flush_tlb_range(vma, address, address + PAGE_SIZE);
 }
 
 void flush_tlb_all(void)
 {
-        flush_tlb_mm(current->mm);
+	flush_tlb_mm(current->mm);
 }
 
 void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
-        CHOOSE_MODE_PROC(flush_tlb_kernel_range_tt,
-                         flush_tlb_kernel_range_common, start, end);
+	CHOOSE_MODE_PROC(flush_tlb_kernel_range_tt,
+			 flush_tlb_kernel_range_common, start, end);
 }
 
 void flush_tlb_kernel_vm(void)
 {
-        CHOOSE_MODE(flush_tlb_kernel_vm_tt(),
-                    flush_tlb_kernel_range_common(start_vm, end_vm));
+	CHOOSE_MODE(flush_tlb_kernel_vm_tt(),
+		    flush_tlb_kernel_range_common(start_vm, end_vm));
 }
 
 void __flush_tlb_one(unsigned long addr)
 {
-        CHOOSE_MODE_PROC(__flush_tlb_one_tt, __flush_tlb_one_skas, addr);
+	CHOOSE_MODE_PROC(__flush_tlb_one_tt, __flush_tlb_one_skas, addr);
 }
 
 void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 		     unsigned long end)
 {
-        CHOOSE_MODE_PROC(flush_tlb_range_tt, flush_tlb_range_skas, vma, start,
-                         end);
+	CHOOSE_MODE_PROC(flush_tlb_range_tt, flush_tlb_range_skas, vma, start,
+			 end);
 }
 
 void flush_tlb_mm(struct mm_struct *mm)
 {
-        CHOOSE_MODE_PROC(flush_tlb_mm_tt, flush_tlb_mm_skas, mm);
+	CHOOSE_MODE_PROC(flush_tlb_mm_tt, flush_tlb_mm_skas, mm);
 }
 
 void force_flush_all(void)
 {
-        CHOOSE_MODE(force_flush_all_tt(), force_flush_all_skas());
+	CHOOSE_MODE(force_flush_all_tt(), force_flush_all_skas());
 }
 
-- 
cgit v1.2.3


From 6b7aaad9ba4f2a059a70014be12a921eceebfc47 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 25 Sep 2006 23:33:02 -0700
Subject: [PATCH] uml: Fix handling of failed execs of helpers

There were some bugs in handling failures to exec helper programs.  errno was
passed back from the child with the wrong sign.  It was also ignored.  In the
case where it mattered, the errno from the (successful) read in the parent was
used instead.

Signed-off-by: Jeff Dike <jdike@addtoit.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/os-Linux/helper.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/arch/um/os-Linux/helper.c b/arch/um/os-Linux/helper.c
index 6987d1d247a..cd15b9df5b5 100644
--- a/arch/um/os-Linux/helper.c
+++ b/arch/um/os-Linux/helper.c
@@ -42,7 +42,7 @@ static int helper_child(void *arg)
 	if(data->pre_exec != NULL)
 		(*data->pre_exec)(data->pre_data);
 	execvp(argv[0], argv);
-	errval = errno;
+	errval = -errno;
 	printk("helper_child - execve of '%s' failed - errno = %d\n", argv[0], errno);
 	os_write_file(data->fd, &errval, sizeof(errval));
 	kill(os_getpid(), SIGKILL);
@@ -62,7 +62,7 @@ int run_helper(void (*pre_exec)(void *), void *pre_data, char **argv,
 		stack = *stack_out;
 	else stack = alloc_stack(0, __cant_sleep());
 	if(stack == 0)
-		return(-ENOMEM);
+		return -ENOMEM;
 
 	ret = os_pipe(fds, 1, 0);
 	if(ret < 0){
@@ -95,16 +95,16 @@ int run_helper(void (*pre_exec)(void *), void *pre_data, char **argv,
 	/* Read the errno value from the child, if the exec failed, or get 0 if
 	 * the exec succeeded because the pipe fd was set as close-on-exec. */
 	n = os_read_file(fds[0], &ret, sizeof(ret));
-	if (n < 0) {
-		printk("run_helper : read on pipe failed, ret = %d\n", -n);
-		ret = n;
-		kill(pid, SIGKILL);
-		CATCH_EINTR(waitpid(pid, NULL, 0));
-	} else if(n != 0){
-		CATCH_EINTR(n = waitpid(pid, NULL, 0));
-		ret = -errno;
-	} else {
+	if(n == 0)
 		ret = pid;
+	else {
+		if(n < 0){
+			printk("run_helper : read on pipe failed, ret = %d\n",
+			       -n);
+			ret = n;
+			kill(pid, SIGKILL);
+		}
+		CATCH_EINTR(waitpid(pid, NULL, 0));
 	}
 
 out_close:
-- 
cgit v1.2.3


From 6edf428ed177e333863a8e5c37751a9ec176f241 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 25 Sep 2006 23:33:03 -0700
Subject: [PATCH] uml: Improve SIGBUS diagnostics

UML can get a SIGBUS anywhere if the tmpfs mount being used for its memory
runs out of space.  This patch adds a printk before the panic to provide a
clue as to what likely went wrong.

Signed-off-by: Jeff Dike <jdike@addtoit.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/kernel/trap.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index ac70fa5a2e2..e5eeaf2b6af 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -227,9 +227,16 @@ void bad_segv(struct faultinfo fi, unsigned long ip)
 
 void relay_signal(int sig, union uml_pt_regs *regs)
 {
-	if(arch_handle_signal(sig, regs)) return;
-	if(!UPT_IS_USER(regs))
+	if(arch_handle_signal(sig, regs))
+		return;
+
+	if(!UPT_IS_USER(regs)){
+		if(sig == SIGBUS)
+			printk("Bus error - the /dev/shm or /tmp mount likely "
+			       "just ran out of space\n");
 		panic("Kernel mode signal %d", sig);
+	}
+
         current->thread.arch.faultinfo = *UPT_FAULTINFO(regs);
 	force_sig(sig, current);
 }
-- 
cgit v1.2.3


From 19bdf0409f25a85a45874a5a8da6f3e4edcf4a49 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 25 Sep 2006 23:33:04 -0700
Subject: [PATCH] uml: SIGIO cleanups

- Various cleanups in the sigio code.

- Removed explicit zero-initializations of a few structures.

- Improved some error messages.

- An API change - there was an asymmetry between reactivate_fd calling
  maybe_sigio_broken, which goes through all the machinery of figuring out if
  a file descriptor supports SIGIO and applying the workaround to it if not,
  and deactivate_fd, which just turns off the descriptor.

  This is changed so that only activate_fd calls maybe_sigio_broken, when
  the descriptor is first seen.  reactivate_fd now calls add_sigio_fd, which
  is symmetric with ignore_sigio_fd.

  This removes a recursion which makes a critical section look more critical
  than it really was, obsoleting a big comment to that effect.  This requires
  keeping track of all descriptors which are getting the SIGIO treatment, not
  just the ones being polled at any given moment, so that reactivate_fd,
  through add_sigio_fd, doesn't try to tell the SIGIO thread about descriptors
  it doesn't care about.

Signed-off-by: Jeff Dike <jdike@addtoit.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/include/os.h     |   1 +
 arch/um/kernel/irq.c     |  34 +++++-----------
 arch/um/os-Linux/sigio.c | 103 ++++++++++++++++++++++++++---------------------
 3 files changed, 67 insertions(+), 71 deletions(-)

diff --git a/arch/um/include/os.h b/arch/um/include/os.h
index 5316e8a4a4f..c73dfa78f94 100644
--- a/arch/um/include/os.h
+++ b/arch/um/include/os.h
@@ -329,6 +329,7 @@ extern void os_set_ioignore(void);
 extern void init_irq_signals(int on_sigstack);
 
 /* sigio.c */
+extern int add_sigio_fd(int fd);
 extern int ignore_sigio_fd(int fd);
 extern void maybe_sigio_broken(int fd, int read);
 
diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c
index 589c69a7504..ce7f233fc49 100644
--- a/arch/um/kernel/irq.c
+++ b/arch/um/kernel/irq.c
@@ -142,19 +142,6 @@ int activate_fd(int irq, int fd, int type, void *dev_id)
 				     .events 		= events,
 				     .current_events 	= 0 } );
 
-	/* Critical section - locked by a spinlock because this stuff can
-	 * be changed from interrupt handlers.  The stuff above is done
-	 * outside the lock because it allocates memory.
-	 */
-
-	/* Actually, it only looks like it can be called from interrupt
-	 * context.  The culprit is reactivate_fd, which calls
-	 * maybe_sigio_broken, which calls write_sigio_workaround,
-	 * which calls activate_fd.  However, write_sigio_workaround should
-	 * only be called once, at boot time.  That would make it clear that
-	 * this is called only from process context, and can be locked with
-	 * a semaphore.
-	 */
 	spin_lock_irqsave(&irq_lock, flags);
 	for (irq_fd = active_fds; irq_fd != NULL; irq_fd = irq_fd->next) {
 		if ((irq_fd->fd == fd) && (irq_fd->type == type)) {
@@ -165,7 +152,6 @@ int activate_fd(int irq, int fd, int type, void *dev_id)
 		}
 	}
 
-	/*-------------*/
 	if (type == IRQ_WRITE)
 		fd = -1;
 
@@ -198,7 +184,6 @@ int activate_fd(int irq, int fd, int type, void *dev_id)
 
 		spin_lock_irqsave(&irq_lock, flags);
 	}
-	/*-------------*/
 
 	*last_irq_ptr = new_fd;
 	last_irq_ptr = &new_fd->next;
@@ -210,14 +195,14 @@ int activate_fd(int irq, int fd, int type, void *dev_id)
 	 */
 	maybe_sigio_broken(fd, (type == IRQ_READ));
 
-	return(0);
+	return 0;
 
  out_unlock:
 	spin_unlock_irqrestore(&irq_lock, flags);
  out_kfree:
 	kfree(new_fd);
  out:
-	return(err);
+	return err;
 }
 
 static void free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg)
@@ -302,10 +287,7 @@ void reactivate_fd(int fd, int irqnum)
 	os_set_pollfd(i, irq->fd);
 	spin_unlock_irqrestore(&irq_lock, flags);
 
-	/* This calls activate_fd, so it has to be outside the critical
-	 * section.
-	 */
-	maybe_sigio_broken(fd, (irq->type == IRQ_READ));
+	add_sigio_fd(fd);
 }
 
 void deactivate_fd(int fd, int irqnum)
@@ -316,11 +298,15 @@ void deactivate_fd(int fd, int irqnum)
 
 	spin_lock_irqsave(&irq_lock, flags);
 	irq = find_irq_by_fd(fd, irqnum, &i);
-	if (irq == NULL)
-		goto out;
+	if(irq == NULL){
+		spin_unlock_irqrestore(&irq_lock, flags);
+		return;
+	}
+
 	os_set_pollfd(i, -1);
- out:
 	spin_unlock_irqrestore(&irq_lock, flags);
+
+	ignore_sigio_fd(fd);
 }
 
 int deactivate_all_fds(void)
diff --git a/arch/um/os-Linux/sigio.c b/arch/um/os-Linux/sigio.c
index 0ecac563c7b..f6457765b17 100644
--- a/arch/um/os-Linux/sigio.c
+++ b/arch/um/os-Linux/sigio.c
@@ -43,17 +43,9 @@ struct pollfds {
 /* Protected by sigio_lock().  Used by the sigio thread, but the UML thread
  * synchronizes with it.
  */
-static struct pollfds current_poll = {
-	.poll  		= NULL,
-	.size 		= 0,
-	.used 		= 0
-};
-
-static struct pollfds next_poll = {
-	.poll  		= NULL,
-	.size 		= 0,
-	.used 		= 0
-};
+static struct pollfds current_poll;
+static struct pollfds next_poll;
+static struct pollfds all_sigio_fds;
 
 static int write_sigio_thread(void *unused)
 {
@@ -78,7 +70,8 @@ static int write_sigio_thread(void *unused)
 				n = os_read_file(sigio_private[1], &c, sizeof(c));
 				if(n != sizeof(c))
 					printk("write_sigio_thread : "
-					       "read failed, err = %d\n", -n);
+					       "read on socket failed, "
+					       "err = %d\n", -n);
 				tmp = current_poll;
 				current_poll = next_poll;
 				next_poll = tmp;
@@ -93,35 +86,36 @@ static int write_sigio_thread(void *unused)
 
 			n = os_write_file(respond_fd, &c, sizeof(c));
 			if(n != sizeof(c))
-				printk("write_sigio_thread : write failed, "
-				       "err = %d\n", -n);
+				printk("write_sigio_thread : write on socket "
+				       "failed, err = %d\n", -n);
 		}
 	}
 
 	return 0;
 }
 
-static int need_poll(int n)
+static int need_poll(struct pollfds *polls, int n)
 {
-	if(n <= next_poll.size){
-		next_poll.used = n;
-		return(0);
+	if(n <= polls->size){
+		polls->used = n;
+		return 0;
 	}
-	kfree(next_poll.poll);
-	next_poll.poll = um_kmalloc_atomic(n * sizeof(struct pollfd));
-	if(next_poll.poll == NULL){
+	kfree(polls->poll);
+	polls->poll = um_kmalloc_atomic(n * sizeof(struct pollfd));
+	if(polls->poll == NULL){
 		printk("need_poll : failed to allocate new pollfds\n");
-		next_poll.size = 0;
-		next_poll.used = 0;
-		return(-1);
+		polls->size = 0;
+		polls->used = 0;
+		return -ENOMEM;
 	}
-	next_poll.size = n;
-	next_poll.used = n;
-	return(0);
+	polls->size = n;
+	polls->used = n;
+	return 0;
 }
 
 /* Must be called with sigio_lock held, because it's needed by the marked
- * critical section. */
+ * critical section.
+ */
 static void update_thread(void)
 {
 	unsigned long flags;
@@ -156,34 +150,39 @@ static void update_thread(void)
 	set_signals(flags);
 }
 
-static int add_sigio_fd(int fd, int read)
+int add_sigio_fd(int fd)
 {
-	int err = 0, i, n, events;
+	struct pollfd *p;
+	int err = 0, i, n;
 
 	sigio_lock();
+	for(i = 0; i < all_sigio_fds.used; i++){
+		if(all_sigio_fds.poll[i].fd == fd)
+			break;
+	}
+	if(i == all_sigio_fds.used)
+		goto out;
+
+	p = &all_sigio_fds.poll[i];
+
 	for(i = 0; i < current_poll.used; i++){
 		if(current_poll.poll[i].fd == fd)
 			goto out;
 	}
 
 	n = current_poll.used + 1;
-	err = need_poll(n);
+	err = need_poll(&next_poll, n);
 	if(err)
 		goto out;
 
 	for(i = 0; i < current_poll.used; i++)
 		next_poll.poll[i] = current_poll.poll[i];
 
-	if(read) events = POLLIN;
-	else events = POLLOUT;
-
-	next_poll.poll[n - 1] = ((struct pollfd) { .fd  	= fd,
-						   .events 	= events,
-						   .revents 	= 0 });
+	next_poll.poll[n - 1] = *p;
 	update_thread();
  out:
 	sigio_unlock();
-	return(err);
+	return err;
 }
 
 int ignore_sigio_fd(int fd)
@@ -205,18 +204,14 @@ int ignore_sigio_fd(int fd)
 	if(i == current_poll.used)
 		goto out;
 
-	err = need_poll(current_poll.used - 1);
+	err = need_poll(&next_poll, current_poll.used - 1);
 	if(err)
 		goto out;
 
 	for(i = 0; i < current_poll.used; i++){
 		p = &current_poll.poll[i];
-		if(p->fd != fd) next_poll.poll[n++] = current_poll.poll[i];
-	}
-	if(n == i){
-		printk("ignore_sigio_fd : fd %d not found\n", fd);
-		err = -1;
-		goto out;
+		if(p->fd != fd)
+			next_poll.poll[n++] = *p;
 	}
 
 	update_thread();
@@ -234,7 +229,7 @@ static struct pollfd *setup_initial_poll(int fd)
 		printk("setup_initial_poll : failed to allocate poll\n");
 		return NULL;
 	}
-	*p = ((struct pollfd) { .fd  	= fd,
+	*p = ((struct pollfd) { .fd		= fd,
 				.events 	= POLLIN,
 				.revents 	= 0 });
 	return p;
@@ -323,6 +318,8 @@ out_close1:
 
 void maybe_sigio_broken(int fd, int read)
 {
+	int err;
+
 	if(!isatty(fd))
 		return;
 
@@ -330,7 +327,19 @@ void maybe_sigio_broken(int fd, int read)
 		return;
 
 	write_sigio_workaround();
-	add_sigio_fd(fd, read);
+
+	sigio_lock();
+	err = need_poll(&all_sigio_fds, all_sigio_fds.used + 1);
+	if(err){
+		printk("maybe_sigio_broken - failed to add pollfd\n");
+		goto out;
+	}
+	all_sigio_fds.poll[all_sigio_fds.used++] =
+		((struct pollfd) { .fd  	= fd,
+				   .events 	= read ? POLLIN : POLLOUT,
+				   .revents 	= 0 });
+out:
+	sigio_unlock();
 }
 
 static void sigio_cleanup(void)
-- 
cgit v1.2.3


From 4b84c69b5f6c08a540e3683f1360a6cdef2806c7 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 25 Sep 2006 23:33:04 -0700
Subject: [PATCH] uml: Move signal handlers to arch code

Have most signals go through an arch-provided handler which recovers the
sigcontext and then calls a generic handler.  This replaces the
ARCH_GET_SIGCONTEXT macro, which was somewhat fragile.  On x86_64, recovering
%rdx (which holds the sigcontext pointer) must be the first thing that
happens.  sig_handler duly invokes that first, but there is no guarantee that
I can see that instructions won't be reordered such that %rdx is used before
that.  Having the arch provide the handler seems much more robust.

Some signals in some parts of UML require their own handlers - these places
don't call set_handler any more.  They call sigaction or signal themselves.

Signed-off-by: Jeff Dike <jdike@addtoit.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/include/sysdep-i386/signal.h   | 27 ---------------------------
 arch/um/include/sysdep-x86_64/signal.h | 29 -----------------------------
 arch/um/os-Linux/irq.c                 |  2 +-
 arch/um/os-Linux/main.c                | 34 ++++++++++++++++++++++++++++------
 arch/um/os-Linux/process.c             | 12 +++++++++++-
 arch/um/os-Linux/signal.c              | 31 +++++++++++++------------------
 arch/um/os-Linux/skas/process.c        | 17 ++++++++++++++---
 arch/um/os-Linux/sys-i386/Makefile     |  2 +-
 arch/um/os-Linux/sys-i386/signal.c     | 15 +++++++++++++++
 arch/um/os-Linux/sys-x86_64/Makefile   |  2 +-
 arch/um/os-Linux/sys-x86_64/signal.c   | 16 ++++++++++++++++
 arch/um/os-Linux/time.c                |  4 ++--
 12 files changed, 102 insertions(+), 89 deletions(-)
 delete mode 100644 arch/um/include/sysdep-i386/signal.h
 delete mode 100644 arch/um/include/sysdep-x86_64/signal.h
 create mode 100644 arch/um/os-Linux/sys-i386/signal.c
 create mode 100644 arch/um/os-Linux/sys-x86_64/signal.c

diff --git a/arch/um/include/sysdep-i386/signal.h b/arch/um/include/sysdep-i386/signal.h
deleted file mode 100644
index 07518b16213..00000000000
--- a/arch/um/include/sysdep-i386/signal.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Copyright (C) 2004 PathScale, Inc
- * Licensed under the GPL
- */
-
-#ifndef __I386_SIGNAL_H_
-#define __I386_SIGNAL_H_
-
-#include <signal.h>
-
-#define ARCH_SIGHDLR_PARAM int sig
-
-#define ARCH_GET_SIGCONTEXT(sc, sig) \
-	do sc = (struct sigcontext *) (&sig + 1); while(0)
-
-#endif
-
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/arch/um/include/sysdep-x86_64/signal.h b/arch/um/include/sysdep-x86_64/signal.h
deleted file mode 100644
index 6142897af3d..00000000000
--- a/arch/um/include/sysdep-x86_64/signal.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Copyright (C) 2004 PathScale, Inc
- * Licensed under the GPL
- */
-
-#ifndef __X86_64_SIGNAL_H_
-#define __X86_64_SIGNAL_H_
-
-#define ARCH_SIGHDLR_PARAM int sig
-
-#define ARCH_GET_SIGCONTEXT(sc, sig_addr) \
-	do { \
-		struct ucontext *__uc; \
-		asm("movq %%rdx, %0" : "=r" (__uc)); \
-		sc = (struct sigcontext *) &__uc->uc_mcontext; \
-	} while(0)
-
-#endif
-
-/*
- * Overrides for Emacs so that we follow Linus's tabbing style.
- * Emacs will notice this stuff at the end of the file and automatically
- * adjust the settings for this buffer only.  This must remain at the end
- * of the file.
- * ---------------------------------------------------------------------------
- * Local variables:
- * c-file-style: "linux"
- * End:
- */
diff --git a/arch/um/os-Linux/irq.c b/arch/um/os-Linux/irq.c
index 7555bf9c33d..a97206df5b5 100644
--- a/arch/um/os-Linux/irq.c
+++ b/arch/um/os-Linux/irq.c
@@ -132,7 +132,7 @@ void os_set_pollfd(int i, int fd)
 
 void os_set_ioignore(void)
 {
-	set_handler(SIGIO, SIG_IGN, 0, -1);
+	signal(SIGIO, SIG_IGN);
 }
 
 void init_irq_signals(int on_sigstack)
diff --git a/arch/um/os-Linux/main.c b/arch/um/os-Linux/main.c
index 90912aaca7a..d1c5670787d 100644
--- a/arch/um/os-Linux/main.c
+++ b/arch/um/os-Linux/main.c
@@ -67,13 +67,32 @@ static __init void do_uml_initcalls(void)
 
 static void last_ditch_exit(int sig)
 {
-	signal(SIGINT, SIG_DFL);
-	signal(SIGTERM, SIG_DFL);
-	signal(SIGHUP, SIG_DFL);
 	uml_cleanup();
 	exit(1);
 }
 
+static void install_fatal_handler(int sig)
+{
+	struct sigaction action;
+
+	/* All signals are enabled in this handler ... */
+	sigemptyset(&action.sa_mask);
+
+	/* ... including the signal being handled, plus we want the
+	 * handler reset to the default behavior, so that if an exit
+	 * handler is hanging for some reason, the UML will just die
+	 * after this signal is sent a second time.
+	 */
+	action.sa_flags = SA_RESETHAND | SA_NODEFER;
+	action.sa_restorer = NULL;
+	action.sa_handler = last_ditch_exit;
+	if(sigaction(sig, &action, NULL) < 0){
+		printf("failed to install handler for signal %d - errno = %d\n",
+		       errno);
+		exit(1);
+	}
+}
+
 #define UML_LIB_PATH	":/usr/lib/uml"
 
 static void setup_env_path(void)
@@ -158,9 +177,12 @@ int main(int argc, char **argv, char **envp)
 	}
 	new_argv[argc] = NULL;
 
-	set_handler(SIGINT, last_ditch_exit, SA_ONESHOT | SA_NODEFER, -1);
-	set_handler(SIGTERM, last_ditch_exit, SA_ONESHOT | SA_NODEFER, -1);
-	set_handler(SIGHUP, last_ditch_exit, SA_ONESHOT | SA_NODEFER, -1);
+	/* Allow these signals to bring down a UML if all other
+	 * methods of control fail.
+	 */
+	install_fatal_handler(SIGINT);
+	install_fatal_handler(SIGTERM);
+	install_fatal_handler(SIGHUP);
 
 	scan_elf_aux( envp);
 
diff --git a/arch/um/os-Linux/process.c b/arch/um/os-Linux/process.c
index 3afde92ad2c..ff203625a4b 100644
--- a/arch/um/os-Linux/process.c
+++ b/arch/um/os-Linux/process.c
@@ -246,7 +246,17 @@ void init_new_thread_stack(void *sig_stack, void (*usr1_handler)(int))
 		set_sigstack(sig_stack, pages * page_size());
 		flags = SA_ONSTACK;
 	}
-	if(usr1_handler) set_handler(SIGUSR1, usr1_handler, flags, -1);
+	if(usr1_handler){
+		struct sigaction sa;
+
+		sa.sa_handler = usr1_handler;
+		sigemptyset(&sa.sa_mask);
+		sa.sa_flags = flags;
+		sa.sa_restorer = NULL;
+		if(sigaction(SIGUSR1, &sa, NULL) < 0)
+			panic("init_new_thread_stack - sigaction failed - "
+			      "errno = %d\n", errno);
+	}
 }
 
 void init_new_thread_signals(void)
diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c
index 60e4faedf25..55b62e2b8f4 100644
--- a/arch/um/os-Linux/signal.c
+++ b/arch/um/os-Linux/signal.c
@@ -15,7 +15,6 @@
 #include "user.h"
 #include "signal_kern.h"
 #include "sysdep/sigcontext.h"
-#include "sysdep/signal.h"
 #include "sigcontext.h"
 #include "mode.h"
 #include "os.h"
@@ -38,18 +37,10 @@
 static int signals_enabled = 1;
 static int pending = 0;
 
-void sig_handler(ARCH_SIGHDLR_PARAM)
+void sig_handler(int sig, struct sigcontext *sc)
 {
-	struct sigcontext *sc;
 	int enabled;
 
-	/* Must be the first thing that this handler does - x86_64 stores
-	 * the sigcontext in %rdx, and we need to save it before it has a
-	 * chance to get trashed.
-	 */
-
-	ARCH_GET_SIGCONTEXT(sc, sig);
-
 	enabled = signals_enabled;
 	if(!enabled && (sig == SIGIO)){
 		pending |= SIGIO_MASK;
@@ -84,13 +75,10 @@ static void real_alarm_handler(int sig, struct sigcontext *sc)
 
 }
 
-void alarm_handler(ARCH_SIGHDLR_PARAM)
+void alarm_handler(int sig, struct sigcontext *sc)
 {
-	struct sigcontext *sc;
 	int enabled;
 
-	ARCH_GET_SIGCONTEXT(sc, sig);
-
 	enabled = signals_enabled;
 	if(!signals_enabled){
 		if(sig == SIGVTALRM)
@@ -126,6 +114,10 @@ void remove_sigstack(void)
 		panic("disabling signal stack failed, errno = %d\n", errno);
 }
 
+void (*handlers[_NSIG])(int sig, struct sigcontext *sc);
+
+extern void hard_handler(int sig);
+
 void set_handler(int sig, void (*handler)(int), int flags, ...)
 {
 	struct sigaction action;
@@ -133,13 +125,16 @@ void set_handler(int sig, void (*handler)(int), int flags, ...)
 	sigset_t sig_mask;
 	int mask;
 
-	va_start(ap, flags);
-	action.sa_handler = handler;
+	handlers[sig] = (void (*)(int, struct sigcontext *)) handler;
+	action.sa_handler = hard_handler;
+
 	sigemptyset(&action.sa_mask);
-	while((mask = va_arg(ap, int)) != -1){
+
+	va_start(ap, flags);
+	while((mask = va_arg(ap, int)) != -1)
 		sigaddset(&action.sa_mask, mask);
-	}
 	va_end(ap);
+
 	action.sa_flags = flags;
 	action.sa_restorer = NULL;
 	if(sigaction(sig, &action, NULL) < 0)
diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c
index 50418a5e713..88ff0de95cd 100644
--- a/arch/um/os-Linux/skas/process.c
+++ b/arch/um/os-Linux/skas/process.c
@@ -189,14 +189,25 @@ static int userspace_tramp(void *stack)
 		}
 	}
 	if(!ptrace_faultinfo && (stack != NULL)){
+		struct sigaction sa;
+
 		unsigned long v = UML_CONFIG_STUB_CODE +
 				  (unsigned long) stub_segv_handler -
 				  (unsigned long) &__syscall_stub_start;
 
 		set_sigstack((void *) UML_CONFIG_STUB_DATA, page_size());
-		set_handler(SIGSEGV, (void *) v, SA_ONSTACK,
-			    SIGIO, SIGWINCH, SIGALRM, SIGVTALRM,
-			    SIGUSR1, -1);
+		sigemptyset(&sa.sa_mask);
+		sigaddset(&sa.sa_mask, SIGIO);
+		sigaddset(&sa.sa_mask, SIGWINCH);
+		sigaddset(&sa.sa_mask, SIGALRM);
+		sigaddset(&sa.sa_mask, SIGVTALRM);
+		sigaddset(&sa.sa_mask, SIGUSR1);
+		sa.sa_flags = SA_ONSTACK;
+		sa.sa_handler = (void *) v;
+		sa.sa_restorer = NULL;
+		if(sigaction(SIGSEGV, &sa, NULL) < 0)
+			panic("userspace_tramp - setting SIGSEGV handler "
+			      "failed - errno = %d\n", errno);
 	}
 
 	os_stop_process(os_getpid());
diff --git a/arch/um/os-Linux/sys-i386/Makefile b/arch/um/os-Linux/sys-i386/Makefile
index b3213613c41..37806621b25 100644
--- a/arch/um/os-Linux/sys-i386/Makefile
+++ b/arch/um/os-Linux/sys-i386/Makefile
@@ -3,7 +3,7 @@
 # Licensed under the GPL
 #
 
-obj-$(CONFIG_MODE_SKAS) = registers.o tls.o
+obj-$(CONFIG_MODE_SKAS) = registers.o signal.o tls.o
 
 USER_OBJS := $(obj-y)
 
diff --git a/arch/um/os-Linux/sys-i386/signal.c b/arch/um/os-Linux/sys-i386/signal.c
new file mode 100644
index 00000000000..0d3eae51835
--- /dev/null
+++ b/arch/um/os-Linux/sys-i386/signal.c
@@ -0,0 +1,15 @@
+/*
+ * Copyright (C) 2006 Jeff Dike (jdike@addtoit.com)
+ * Licensed under the GPL
+ */
+
+#include <signal.h>
+
+extern void (*handlers[])(int sig, struct sigcontext *sc);
+
+void hard_handler(int sig)
+{
+	struct sigcontext *sc = (struct sigcontext *) (&sig + 1);
+
+	(*handlers[sig])(sig, sc);
+}
diff --git a/arch/um/os-Linux/sys-x86_64/Makefile b/arch/um/os-Linux/sys-x86_64/Makefile
index 340ef26f594..f67842a7735 100644
--- a/arch/um/os-Linux/sys-x86_64/Makefile
+++ b/arch/um/os-Linux/sys-x86_64/Makefile
@@ -3,7 +3,7 @@
 # Licensed under the GPL
 #
 
-obj-$(CONFIG_MODE_SKAS) = registers.o
+obj-$(CONFIG_MODE_SKAS) = registers.o signal.o
 
 USER_OBJS := $(obj-y)
 
diff --git a/arch/um/os-Linux/sys-x86_64/signal.c b/arch/um/os-Linux/sys-x86_64/signal.c
new file mode 100644
index 00000000000..3f369e5f976
--- /dev/null
+++ b/arch/um/os-Linux/sys-x86_64/signal.c
@@ -0,0 +1,16 @@
+/*
+ * Copyright (C) 2006 Jeff Dike (jdike@addtoit.com)
+ * Licensed under the GPL
+ */
+
+#include <signal.h>
+
+extern void (*handlers[])(int sig, struct sigcontext *sc);
+
+void hard_handler(int sig)
+{
+	struct ucontext *uc;
+	asm("movq %%rdx, %0" : "=r" (uc));
+
+	(*handlers[sig])(sig, (struct sigcontext *) &uc->uc_mcontext);
+}
diff --git a/arch/um/os-Linux/time.c b/arch/um/os-Linux/time.c
index 4ae73c0e548..7f5ebbadca6 100644
--- a/arch/um/os-Linux/time.c
+++ b/arch/um/os-Linux/time.c
@@ -40,8 +40,8 @@ void disable_timer(void)
 		printk("disnable_timer - setitimer failed, errno = %d\n",
 		       errno);
 	/* If there are signals already queued, after unblocking ignore them */
-	set_handler(SIGALRM, SIG_IGN, 0, -1);
-	set_handler(SIGVTALRM, SIG_IGN, 0, -1);
+	signal(SIGALRM, SIG_IGN);
+	signal(SIGVTALRM, SIG_IGN);
 }
 
 void switch_timers(int to_real)
-- 
cgit v1.2.3


From 537ae946e808d0f22d660f7a3500832fe0c07d14 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 25 Sep 2006 23:33:05 -0700
Subject: [PATCH] uml: timer cleanups

set_interval returns an error instead of panicing if setitimer fails.  Some of
its callers now check the return.

enable_timer is largely tt-mode-specific, so it is marked as such, and the
only skas-mode caller is made to call set-interval instead.

user_time_init was a no-value-added wrapper around set_interval, so it is
gone.

Since set_interval is now called from kernel code, callers no longer pass
ITIMER_* to it.  Instead, they pass a flag which is converted into ITIMER_REAL
or ITIMER_VIRTUAL.

Signed-off-by: Jeff Dike <jdike@addtoit.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/include/os.h            |  4 +++-
 arch/um/kernel/time.c           |  7 +++++--
 arch/um/os-Linux/skas/process.c |  6 +++++-
 arch/um/os-Linux/time.c         | 18 +++++++++---------
 4 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/arch/um/include/os.h b/arch/um/include/os.h
index c73dfa78f94..24fb6d8680e 100644
--- a/arch/um/include/os.h
+++ b/arch/um/include/os.h
@@ -276,9 +276,11 @@ extern int setjmp_wrapper(void (*proc)(void *, void *), ...);
 
 extern void switch_timers(int to_real);
 extern void idle_sleep(int secs);
+extern int set_interval(int is_virtual);
+#ifdef CONFIG_MODE_TT
 extern void enable_timer(void);
+#endif
 extern void disable_timer(void);
-extern void user_time_init(void);
 extern void uml_idle_timer(void);
 extern unsigned long long os_nsecs(void);
 
diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c
index 552ca1cb984..d1d799346e0 100644
--- a/arch/um/kernel/time.c
+++ b/arch/um/kernel/time.c
@@ -113,12 +113,15 @@ static void register_timer(void)
 
 	err = request_irq(TIMER_IRQ, um_timer, IRQF_DISABLED, "timer", NULL);
 	if(err != 0)
-		printk(KERN_ERR "timer_init : request_irq failed - "
+		printk(KERN_ERR "register_timer : request_irq failed - "
 		       "errno = %d\n", -err);
 
 	timer_irq_inited = 1;
 
-	user_time_init();
+	err = set_interval(1);
+	if(err != 0)
+		printk(KERN_ERR "register_timer : set_interval failed - "
+		       "errno = %d\n", -err);
 }
 
 extern void (*late_time_init)(void);
diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c
index 88ff0de95cd..42e3d1ed802 100644
--- a/arch/um/os-Linux/skas/process.c
+++ b/arch/um/os-Linux/skas/process.c
@@ -155,11 +155,15 @@ extern int __syscall_stub_start;
 static int userspace_tramp(void *stack)
 {
 	void *addr;
+	int err;
 
 	ptrace(PTRACE_TRACEME, 0, 0, 0);
 
 	init_new_thread_signals();
-	enable_timer();
+	err = set_interval(1);
+	if(err)
+		panic("userspace_tramp - setting timer failed, errno = %d\n",
+		      err);
 
 	if(!proc_mm){
 		/* This has a pte, but it can't be mapped in with the usual
diff --git a/arch/um/os-Linux/time.c b/arch/um/os-Linux/time.c
index 7f5ebbadca6..38be096e750 100644
--- a/arch/um/os-Linux/time.c
+++ b/arch/um/os-Linux/time.c
@@ -17,20 +17,25 @@
 #include "kern_constants.h"
 #include "os.h"
 
-static void set_interval(int timer_type)
+int set_interval(int is_virtual)
 {
 	int usec = 1000000/hz();
+	int timer_type = is_virtual ? ITIMER_VIRTUAL : ITIMER_REAL;
 	struct itimerval interval = ((struct itimerval) { { 0, usec },
 							  { 0, usec } });
 
 	if(setitimer(timer_type, &interval, NULL) == -1)
-		panic("setitimer failed - errno = %d\n", errno);
+		return -errno;
+
+	return 0;
 }
 
+#ifdef CONFIG_MODE_TT
 void enable_timer(void)
 {
-	set_interval(ITIMER_VIRTUAL);
+	set_interval(1);
 }
+#endif
 
 void disable_timer(void)
 {
@@ -74,7 +79,7 @@ void uml_idle_timer(void)
 
 	set_handler(SIGALRM, (__sighandler_t) alarm_handler,
 		    SA_RESTART, SIGUSR1, SIGIO, SIGWINCH, SIGVTALRM, -1);
-	set_interval(ITIMER_REAL);
+	set_interval(0);
 }
 #endif
 
@@ -94,8 +99,3 @@ void idle_sleep(int secs)
 	ts.tv_nsec = 0;
 	nanosleep(&ts, NULL);
 }
-
-void user_time_init(void)
-{
-	set_interval(ITIMER_VIRTUAL);
-}
-- 
cgit v1.2.3


From 602cc2418177a5b80f533f569e5a42c4495988c9 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 25 Sep 2006 23:33:06 -0700
Subject: [PATCH] uml: Remove unused variable

timer_irq_inited was useless, so it is removed.

Signed-off-by: Jeff Dike <jdike@addtoit.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/include/kern_util.h | 1 -
 arch/um/kernel/time.c       | 5 -----
 arch/um/os-Linux/signal.c   | 7 -------
 3 files changed, 13 deletions(-)

diff --git a/arch/um/include/kern_util.h b/arch/um/include/kern_util.h
index b98bdd8e052..89e1dc835a5 100644
--- a/arch/um/include/kern_util.h
+++ b/arch/um/include/kern_util.h
@@ -27,7 +27,6 @@ extern int ncpus;
 extern char *linux_prog;
 extern char *gdb_init;
 extern int kmalloc_ok;
-extern int timer_irq_inited;
 extern int jail;
 extern int nsyscalls;
 
diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c
index d1d799346e0..2454bbd9555 100644
--- a/arch/um/kernel/time.c
+++ b/arch/um/kernel/time.c
@@ -35,9 +35,6 @@ unsigned long long sched_clock(void)
 	return (unsigned long long)jiffies_64 * (1000000000 / HZ);
 }
 
-/* Changed at early boot */
-int timer_irq_inited = 0;
-
 static unsigned long long prev_nsecs;
 #ifdef CONFIG_UML_REAL_TIME_CLOCK
 static long long delta;   		/* Deviation per interval */
@@ -116,8 +113,6 @@ static void register_timer(void)
 		printk(KERN_ERR "register_timer : request_irq failed - "
 		       "errno = %d\n", -err);
 
-	timer_irq_inited = 1;
-
 	err = set_interval(1);
 	if(err != 0)
 		printk(KERN_ERR "register_timer : set_interval failed - "
diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c
index 55b62e2b8f4..6b81739279d 100644
--- a/arch/um/os-Linux/signal.c
+++ b/arch/um/os-Linux/signal.c
@@ -55,15 +55,8 @@ void sig_handler(int sig, struct sigcontext *sc)
 	set_signals(enabled);
 }
 
-extern int timer_irq_inited;
-
 static void real_alarm_handler(int sig, struct sigcontext *sc)
 {
-	if(!timer_irq_inited){
-		signals_enabled = 1;
-		return;
-	}
-
 	if(sig == SIGALRM)
 		switch_timers(0);
 
-- 
cgit v1.2.3


From bf61f50d63b4d9e30d7a86a2d44bb300ae7c1dd4 Mon Sep 17 00:00:00 2001
From: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Date: Mon, 25 Sep 2006 23:33:07 -0700
Subject: [PATCH] uml: clean our set_ether_mac

Clean set_ether_mac usage.  Maybe could also be removed, but surely it can't
be a global function taking a void* argument.

Signed-off-by: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Acked-by: Jeff Dike <jdike@addtoit.com>
Cc: Jeff Garzik <jeff@garzik.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/drivers/net_kern.c | 14 ++++++--------
 arch/um/include/net_user.h |  1 -
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c
index 501f95675d8..4a7966b2193 100644
--- a/arch/um/drivers/net_kern.c
+++ b/arch/um/drivers/net_kern.c
@@ -31,6 +31,11 @@
 #include "irq_user.h"
 #include "irq_kern.h"
 
+static inline void set_ether_mac(struct net_device *dev, unsigned char *addr)
+{
+	memcpy(dev->dev_addr, addr, ETH_ALEN);
+}
+
 #define DRIVER_NAME "uml-netdev"
 
 static DEFINE_SPINLOCK(opened_lock);
@@ -242,7 +247,7 @@ static int uml_net_set_mac(struct net_device *dev, void *addr)
 	struct sockaddr *hwaddr = addr;
 
 	spin_lock(&lp->lock);
-	memcpy(dev->dev_addr, hwaddr->sa_data, ETH_ALEN);
+	set_ether_mac(dev, hwaddr->sa_data);
 	spin_unlock(&lp->lock);
 
 	return(0);
@@ -790,13 +795,6 @@ void dev_ip_addr(void *d, unsigned char *bin_buf)
 	memcpy(bin_buf, &in->ifa_address, sizeof(in->ifa_address));
 }
 
-void set_ether_mac(void *d, unsigned char *addr)
-{
-	struct net_device *dev = d;
-
-	memcpy(dev->dev_addr, addr, ETH_ALEN);	
-}
-
 struct sk_buff *ether_adjust_skb(struct sk_buff *skb, int extra)
 {
 	if((skb != NULL) && (skb_tailroom(skb) < extra)){
diff --git a/arch/um/include/net_user.h b/arch/um/include/net_user.h
index 800c403920b..47ef7cb49a8 100644
--- a/arch/um/include/net_user.h
+++ b/arch/um/include/net_user.h
@@ -26,7 +26,6 @@ struct net_user_info {
 
 extern void ether_user_init(void *data, void *dev);
 extern void dev_ip_addr(void *d, unsigned char *bin_buf);
-extern void set_ether_mac(void *d, unsigned char *addr);
 extern void iter_addresses(void *d, void (*cb)(unsigned char *, 
 					       unsigned char *, void *), 
 			   void *arg);
-- 
cgit v1.2.3


From 75e29b18d9a46bf3193278e92dc95609a8cca2ab Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 25 Sep 2006 23:33:08 -0700
Subject: [PATCH] uml: stack usage reduction

The KSTK_* macros used an inordinate amount of stack.  In order to overcome
an impedance mismatch between their interface, which just returns a single
register value, and the interface of get_thread_regs, which took a full
pt_regs, the implementation created an on-stack pt_regs, filled it in, and
returned one field.  do_task_stat calls KSTK_* twice, resulting in two
local pt_regs, blowing out the stack.

This patch changes the interface (and name) of get_thread_regs to just
return a single register from a jmp_buf.

The include of archsetjmp.h" in registers.h to get the definition of
jmp_buf exposed a bogus include of <setjmp.h> in start_up.c.  <setjmp.h>
shouldn't be used anywhere any more since UML uses the klibc
setjmp/longjmp.

Signed-off-by: Jeff Dike <jdike@addtoit.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/include/registers.h             |  3 ++-
 arch/um/os-Linux/start_up.c             |  1 -
 arch/um/os-Linux/sys-i386/registers.c   | 15 +++++++++------
 arch/um/os-Linux/sys-x86_64/registers.c | 15 +++++++++------
 include/asm-um/processor-generic.h      |  4 +---
 5 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/arch/um/include/registers.h b/arch/um/include/registers.h
index 83b688ca198..f845b3629a6 100644
--- a/arch/um/include/registers.h
+++ b/arch/um/include/registers.h
@@ -7,6 +7,7 @@
 #define __REGISTERS_H
 
 #include "sysdep/ptrace.h"
+#include "sysdep/archsetjmp.h"
 
 extern void init_thread_registers(union uml_pt_regs *to);
 extern int save_fp_registers(int pid, unsigned long *fp_regs);
@@ -15,6 +16,6 @@ extern void save_registers(int pid, union uml_pt_regs *regs);
 extern void restore_registers(int pid, union uml_pt_regs *regs);
 extern void init_registers(int pid);
 extern void get_safe_registers(unsigned long * regs, unsigned long * fp_regs);
-extern void get_thread_regs(union uml_pt_regs *uml_regs, void *buffer);
+extern unsigned long get_thread_reg(int reg, jmp_buf *buf);
 
 #endif
diff --git a/arch/um/os-Linux/start_up.c b/arch/um/os-Linux/start_up.c
index 50314850400..7fe92680c7d 100644
--- a/arch/um/os-Linux/start_up.c
+++ b/arch/um/os-Linux/start_up.c
@@ -14,7 +14,6 @@
 #include <sched.h>
 #include <fcntl.h>
 #include <errno.h>
-#include <setjmp.h>
 #include <sys/time.h>
 #include <sys/wait.h>
 #include <sys/mman.h>
diff --git a/arch/um/os-Linux/sys-i386/registers.c b/arch/um/os-Linux/sys-i386/registers.c
index 1f90a2d7138..7cd0369e02b 100644
--- a/arch/um/os-Linux/sys-i386/registers.c
+++ b/arch/um/os-Linux/sys-i386/registers.c
@@ -130,11 +130,14 @@ void get_safe_registers(unsigned long *regs, unsigned long *fp_regs)
 		       HOST_FP_SIZE * sizeof(unsigned long));
 }
 
-void get_thread_regs(union uml_pt_regs *uml_regs, void *buffer)
+unsigned long get_thread_reg(int reg, jmp_buf *buf)
 {
-	struct __jmp_buf *jmpbuf = buffer;
-
-	UPT_SET(uml_regs, EIP, jmpbuf->__eip);
-	UPT_SET(uml_regs, UESP, jmpbuf->__esp);
-	UPT_SET(uml_regs, EBP, jmpbuf->__ebp);
+	switch(reg){
+	case EIP: return buf[0]->__eip;
+	case UESP: return buf[0]->__esp;
+	case EBP: return buf[0]->__ebp;
+	default:
+		printk("get_thread_regs - unknown register %d\n", reg);
+		return 0;
+	}
 }
diff --git a/arch/um/os-Linux/sys-x86_64/registers.c b/arch/um/os-Linux/sys-x86_64/registers.c
index e730447d6c0..cb8e8a26328 100644
--- a/arch/um/os-Linux/sys-x86_64/registers.c
+++ b/arch/um/os-Linux/sys-x86_64/registers.c
@@ -78,11 +78,14 @@ void get_safe_registers(unsigned long *regs, unsigned long *fp_regs)
 		       HOST_FP_SIZE * sizeof(unsigned long));
 }
 
-void get_thread_regs(union uml_pt_regs *uml_regs, void *buffer)
+unsigned long get_thread_reg(int reg, jmp_buf *buf)
 {
-	struct __jmp_buf *jmpbuf = buffer;
-
-	UPT_SET(uml_regs, RIP, jmpbuf->__rip);
-	UPT_SET(uml_regs, RSP, jmpbuf->__rsp);
-	UPT_SET(uml_regs, RBP, jmpbuf->__rbp);
+	switch(reg){
+	case RIP: return buf[0]->__rip;
+	case RSP: return buf[0]->__rsp;
+	case RBP: return buf[0]->__rbp;
+	default:
+		printk("get_thread_regs - unknown register %d\n", reg);
+		return 0;
+	}
 }
diff --git a/include/asm-um/processor-generic.h b/include/asm-um/processor-generic.h
index 824c2889638..afa4fe1ca9f 100644
--- a/include/asm-um/processor-generic.h
+++ b/include/asm-um/processor-generic.h
@@ -138,9 +138,7 @@ extern struct cpuinfo_um cpu_data[];
 
 #ifdef CONFIG_MODE_SKAS
 #define KSTK_REG(tsk, reg) \
-	({ union uml_pt_regs regs; \
-	   get_thread_regs(&regs, tsk->thread.mode.skas.switch_buf); \
-	   UPT_REG(&regs, reg); })
+	get_thread_reg(reg, tsk->thread.mode.skas.switch_buf)
 #else
 #define KSTK_REG(tsk, reg) (0xbadbabe)
 #endif
-- 
cgit v1.2.3


From b1fc0b1f21c4082d24d1f456a846b4fa7d16a70b Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Mon, 25 Sep 2006 23:33:08 -0700
Subject: [PATCH] UML: tty locking

Ensure current->signal->tty doesn't get freed during log_exec().

Signed-off-by: Alan Cox <alan@redhat.com>
Acked-by: Jeff Dike <jdike@addtoit.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/kernel/exec.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/um/kernel/exec.c b/arch/um/kernel/exec.c
index fc38a6d5906..0561c43b468 100644
--- a/arch/um/kernel/exec.c
+++ b/arch/um/kernel/exec.c
@@ -41,9 +41,11 @@ static long execve1(char *file, char __user * __user *argv,
         long error;
 
 #ifdef CONFIG_TTY_LOG
-	task_lock(current);
+	mutex_lock(&tty_mutex);
+	task_lock(current);	/* FIXME:  is this needed ? */
 	log_exec(argv, current->signal->tty);
 	task_unlock(current);
+	mutex_unlock(&tty_mutex);
 #endif
         error = do_execve(file, argv, env, &current->thread.regs);
         if (error == 0){
-- 
cgit v1.2.3


From 70e0eb8ef143f3729065c504177413ffe165af22 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 25 Sep 2006 23:33:09 -0700
Subject: [PATCH] Split i386 and x86_64 ptrace.h

The use of SEGMENT_RPL_MASK in the i386 ptrace.h introduced by
x86-allow-a-kernel-to-not-be-in-ring-0.patch broke the UML build, as UML
includes the underlying architecture's ptrace.h, but has no easy access to the
x86 segment definitions.

Rather than kludging around this, as in the past, this patch splits the
userspace-usable parts, which are the bits that UML needs, of ptrace.h into
ptrace-abi.h, which is included back into ptrace.h.  Thus, there is no net
effect on i386.

As a side-effect, this creates a ptrace header which is close to being usable
in /usr/include.

x86_64 is also treated in this way for consistency.  There was some trailing
whitespace there, which is cleaned up.

Signed-off-by: Jeff Dike <jdike@addtoit.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/Kbuild         |  1 +
 include/asm-i386/ptrace-abi.h   | 39 +++++++++++++++++++++++++++
 include/asm-i386/ptrace.h       | 35 +-----------------------
 include/asm-x86_64/Kbuild       |  1 +
 include/asm-x86_64/ptrace-abi.h | 51 +++++++++++++++++++++++++++++++++++
 include/asm-x86_64/ptrace.h     | 59 +++++------------------------------------
 6 files changed, 100 insertions(+), 86 deletions(-)
 create mode 100644 include/asm-i386/ptrace-abi.h
 create mode 100644 include/asm-x86_64/ptrace-abi.h

diff --git a/include/asm-i386/Kbuild b/include/asm-i386/Kbuild
index b75a348d0c1..147e4ac1ebf 100644
--- a/include/asm-i386/Kbuild
+++ b/include/asm-i386/Kbuild
@@ -3,6 +3,7 @@ include include/asm-generic/Kbuild.asm
 header-y += boot.h
 header-y += debugreg.h
 header-y += ldt.h
+header-y += ptrace-abi.h
 header-y += ucontext.h
 
 unifdef-y += mtrr.h
diff --git a/include/asm-i386/ptrace-abi.h b/include/asm-i386/ptrace-abi.h
new file mode 100644
index 00000000000..a44901817a2
--- /dev/null
+++ b/include/asm-i386/ptrace-abi.h
@@ -0,0 +1,39 @@
+#ifndef I386_PTRACE_ABI_H
+#define I386_PTRACE_ABI_H
+
+#define EBX 0
+#define ECX 1
+#define EDX 2
+#define ESI 3
+#define EDI 4
+#define EBP 5
+#define EAX 6
+#define DS 7
+#define ES 8
+#define FS 9
+#define GS 10
+#define ORIG_EAX 11
+#define EIP 12
+#define CS  13
+#define EFL 14
+#define UESP 15
+#define SS   16
+#define FRAME_SIZE 17
+
+/* Arbitrarily choose the same ptrace numbers as used by the Sparc code. */
+#define PTRACE_GETREGS            12
+#define PTRACE_SETREGS            13
+#define PTRACE_GETFPREGS          14
+#define PTRACE_SETFPREGS          15
+#define PTRACE_GETFPXREGS         18
+#define PTRACE_SETFPXREGS         19
+
+#define PTRACE_OLDSETOPTIONS         21
+
+#define PTRACE_GET_THREAD_AREA    25
+#define PTRACE_SET_THREAD_AREA    26
+
+#define PTRACE_SYSEMU		  31
+#define PTRACE_SYSEMU_SINGLESTEP  32
+
+#endif
diff --git a/include/asm-i386/ptrace.h b/include/asm-i386/ptrace.h
index f324c53b6f9..1910880fcd4 100644
--- a/include/asm-i386/ptrace.h
+++ b/include/asm-i386/ptrace.h
@@ -1,24 +1,7 @@
 #ifndef _I386_PTRACE_H
 #define _I386_PTRACE_H
 
-#define EBX 0
-#define ECX 1
-#define EDX 2
-#define ESI 3
-#define EDI 4
-#define EBP 5
-#define EAX 6
-#define DS 7
-#define ES 8
-#define FS 9
-#define GS 10
-#define ORIG_EAX 11
-#define EIP 12
-#define CS  13
-#define EFL 14
-#define UESP 15
-#define SS   16
-#define FRAME_SIZE 17
+#include <asm/ptrace-abi.h>
 
 /* this struct defines the way the registers are stored on the 
    stack during a system call. */
@@ -41,22 +24,6 @@ struct pt_regs {
 	int  xss;
 };
 
-/* Arbitrarily choose the same ptrace numbers as used by the Sparc code. */
-#define PTRACE_GETREGS            12
-#define PTRACE_SETREGS            13
-#define PTRACE_GETFPREGS          14
-#define PTRACE_SETFPREGS          15
-#define PTRACE_GETFPXREGS         18
-#define PTRACE_SETFPXREGS         19
-
-#define PTRACE_OLDSETOPTIONS         21
-
-#define PTRACE_GET_THREAD_AREA    25
-#define PTRACE_SET_THREAD_AREA    26
-
-#define PTRACE_SYSEMU		  31
-#define PTRACE_SYSEMU_SINGLESTEP  32
-
 #ifdef __KERNEL__
 
 #include <asm/vm86.h>
diff --git a/include/asm-x86_64/Kbuild b/include/asm-x86_64/Kbuild
index 40f2f13fe17..1ee9b07f3fe 100644
--- a/include/asm-x86_64/Kbuild
+++ b/include/asm-x86_64/Kbuild
@@ -11,6 +11,7 @@ header-y += debugreg.h
 header-y += ldt.h
 header-y += msr.h
 header-y += prctl.h
+header-y += ptrace-abi.h
 header-y += setup.h
 header-y += sigcontext32.h
 header-y += ucontext.h
diff --git a/include/asm-x86_64/ptrace-abi.h b/include/asm-x86_64/ptrace-abi.h
new file mode 100644
index 00000000000..19184b0806b
--- /dev/null
+++ b/include/asm-x86_64/ptrace-abi.h
@@ -0,0 +1,51 @@
+#ifndef _X86_64_PTRACE_ABI_H
+#define _X86_64_PTRACE_ABI_H
+
+#if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS)
+#define R15 0
+#define R14 8
+#define R13 16
+#define R12 24
+#define RBP 32
+#define RBX 40
+/* arguments: interrupts/non tracing syscalls only save upto here*/
+#define R11 48
+#define R10 56
+#define R9 64
+#define R8 72
+#define RAX 80
+#define RCX 88
+#define RDX 96
+#define RSI 104
+#define RDI 112
+#define ORIG_RAX 120       /* = ERROR */
+/* end of arguments */
+/* cpu exception frame or undefined in case of fast syscall. */
+#define RIP 128
+#define CS 136
+#define EFLAGS 144
+#define RSP 152
+#define SS 160
+#define ARGOFFSET R11
+#endif /* __ASSEMBLY__ */
+
+/* top of stack page */
+#define FRAME_SIZE 168
+
+#define PTRACE_OLDSETOPTIONS         21
+
+/* Arbitrarily choose the same ptrace numbers as used by the Sparc code. */
+#define PTRACE_GETREGS            12
+#define PTRACE_SETREGS            13
+#define PTRACE_GETFPREGS          14
+#define PTRACE_SETFPREGS          15
+#define PTRACE_GETFPXREGS         18
+#define PTRACE_SETFPXREGS         19
+
+/* only useful for access 32bit programs */
+#define PTRACE_GET_THREAD_AREA    25
+#define PTRACE_SET_THREAD_AREA    26
+
+#define PTRACE_ARCH_PRCTL	  30	/* arch_prctl for child */
+
+#endif
diff --git a/include/asm-x86_64/ptrace.h b/include/asm-x86_64/ptrace.h
index ca6f15ff61d..ab827dc381d 100644
--- a/include/asm-x86_64/ptrace.h
+++ b/include/asm-x86_64/ptrace.h
@@ -1,40 +1,9 @@
 #ifndef _X86_64_PTRACE_H
 #define _X86_64_PTRACE_H
 
-#if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS) 
-#define R15 0
-#define R14 8
-#define R13 16
-#define R12 24
-#define RBP 32
-#define RBX 40
-/* arguments: interrupts/non tracing syscalls only save upto here*/
-#define R11 48
-#define R10 56	
-#define R9 64
-#define R8 72
-#define RAX 80
-#define RCX 88
-#define RDX 96
-#define RSI 104
-#define RDI 112
-#define ORIG_RAX 120       /* = ERROR */ 
-/* end of arguments */ 	
-/* cpu exception frame or undefined in case of fast syscall. */
-#define RIP 128
-#define CS 136
-#define EFLAGS 144
-#define RSP 152
-#define SS 160
-#define ARGOFFSET R11
-#endif /* __ASSEMBLY__ */
+#include <asm/ptrace-abi.h>
 
-/* top of stack page */ 
-#define FRAME_SIZE 168
-
-#define PTRACE_OLDSETOPTIONS         21
-
-#ifndef __ASSEMBLY__ 
+#ifndef __ASSEMBLY__
 
 struct pt_regs {
 	unsigned long r15;
@@ -45,7 +14,7 @@ struct pt_regs {
 	unsigned long rbx;
 /* arguments: non interrupts/non tracing syscalls only save upto here*/
  	unsigned long r11;
-	unsigned long r10;	
+	unsigned long r10;
 	unsigned long r9;
 	unsigned long r8;
 	unsigned long rax;
@@ -54,32 +23,18 @@ struct pt_regs {
 	unsigned long rsi;
 	unsigned long rdi;
 	unsigned long orig_rax;
-/* end of arguments */ 	
+/* end of arguments */
 /* cpu exception frame or undefined */
 	unsigned long rip;
 	unsigned long cs;
-	unsigned long eflags; 
-	unsigned long rsp; 
+	unsigned long eflags;
+	unsigned long rsp;
 	unsigned long ss;
-/* top of stack page */ 
+/* top of stack page */
 };
 
 #endif
 
-/* Arbitrarily choose the same ptrace numbers as used by the Sparc code. */
-#define PTRACE_GETREGS            12
-#define PTRACE_SETREGS            13
-#define PTRACE_GETFPREGS          14
-#define PTRACE_SETFPREGS          15
-#define PTRACE_GETFPXREGS         18
-#define PTRACE_SETFPXREGS         19
-
-/* only useful for access 32bit programs */
-#define PTRACE_GET_THREAD_AREA    25
-#define PTRACE_SET_THREAD_AREA    26
-
-#define PTRACE_ARCH_PRCTL	  30	/* arch_prctl for child */
-
 #if defined(__KERNEL__) && !defined(__ASSEMBLY__) 
 #define user_mode(regs) (!!((regs)->cs & 3))
 #define user_mode_vm(regs) user_mode(regs)
-- 
cgit v1.2.3


From e8df8c3304cfc7d68e9a6a7b6df6c64783cd6991 Mon Sep 17 00:00:00 2001
From: Jeff Dike <jdike@addtoit.com>
Date: Mon, 25 Sep 2006 23:33:10 -0700
Subject: [PATCH] Make UML use ptrace-abi.h

Include the host architecture's ptrace-abi.h instead of ptrace.h.

There was some cpp mangling of names around the ptrace.h include to avoid
symbol clashes between UML and the host architecture.  Most of these can go
away.  The exception is struct pt_regs, which is convenient to have in
userspace, but must be renamed in order that UML can define its own.

ptrace-x86_64.h needed to have some now-obsolete cpp cruft and a declaration
removed.

Signed-off-by: Jeff Dike <jdike@addtoit.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-um/ptrace-generic.h | 14 +-------------
 include/asm-um/ptrace-x86_64.h  |  4 ----
 2 files changed, 1 insertion(+), 17 deletions(-)

diff --git a/include/asm-um/ptrace-generic.h b/include/asm-um/ptrace-generic.h
index a36f5371b36..99c87c5ce99 100644
--- a/include/asm-um/ptrace-generic.h
+++ b/include/asm-um/ptrace-generic.h
@@ -8,19 +8,7 @@
 
 #ifndef __ASSEMBLY__
 
-
-#define pt_regs pt_regs_subarch
-#define show_regs show_regs_subarch
-#define send_sigtrap send_sigtrap_subarch
-
-#include "asm/arch/ptrace.h"
-
-#undef pt_regs
-#undef show_regs
-#undef send_sigtrap
-#undef user_mode
-#undef instruction_pointer
-
+#include "asm/arch/ptrace-abi.h"
 #include "sysdep/ptrace.h"
 
 struct pt_regs {
diff --git a/include/asm-um/ptrace-x86_64.h b/include/asm-um/ptrace-x86_64.h
index c894e68b1f9..2074483e6ca 100644
--- a/include/asm-um/ptrace-x86_64.h
+++ b/include/asm-um/ptrace-x86_64.h
@@ -11,15 +11,11 @@
 #include "asm/errno.h"
 #include "asm/host_ldt.h"
 
-#define signal_fault signal_fault_x86_64
 #define __FRAME_OFFSETS /* Needed to get the R* macros */
 #include "asm/ptrace-generic.h"
-#undef signal_fault
 
 #define HOST_AUDIT_ARCH AUDIT_ARCH_X86_64
 
-void signal_fault(struct pt_regs_subarch *regs, void *frame, char *where);
-
 #define FS_BASE (21 * sizeof(unsigned long))
 #define GS_BASE (22 * sizeof(unsigned long))
 #define DS (23 * sizeof(unsigned long))
-- 
cgit v1.2.3


From e8216dee838c09776680a6f1a2e54d81f3cdfa14 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Mon, 25 Sep 2006 23:33:11 -0700
Subject: [PATCH] s390: fix cmm kernel thread handling

Convert cmm's usage of kernel_thread to kthread_run.  Also create the
cmmthread at module load time, so it is possible to check if creation of
the thread fails.

In addition the cmmthread now gets terminated when the module gets unloaded
instead of leaving a stale kernel thread.  Also check the return values of
other registration functions at module load and handle their return values
appropriately.

Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/s390/mm/cmm.c | 54 +++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 35 insertions(+), 19 deletions(-)

diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c
index f2e00df99ba..607f50ead1f 100644
--- a/arch/s390/mm/cmm.c
+++ b/arch/s390/mm/cmm.c
@@ -16,6 +16,7 @@
 #include <linux/sysctl.h>
 #include <linux/ctype.h>
 #include <linux/swap.h>
+#include <linux/kthread.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -46,8 +47,7 @@ static struct cmm_page_array *cmm_page_list;
 static struct cmm_page_array *cmm_timed_page_list;
 static DEFINE_SPINLOCK(cmm_lock);
 
-static unsigned long cmm_thread_active;
-static struct work_struct cmm_thread_starter;
+static struct task_struct *cmm_thread_ptr;
 static wait_queue_head_t cmm_thread_wait;
 static struct timer_list cmm_timer;
 
@@ -143,14 +143,12 @@ cmm_thread(void *dummy)
 {
 	int rc;
 
-	daemonize("cmmthread");
 	while (1) {
 		rc = wait_event_interruptible(cmm_thread_wait,
 			(cmm_pages != cmm_pages_target ||
-			 cmm_timed_pages != cmm_timed_pages_target));
-		if (rc == -ERESTARTSYS) {
-			/* Got kill signal. End thread. */
-			clear_bit(0, &cmm_thread_active);
+			 cmm_timed_pages != cmm_timed_pages_target ||
+			 kthread_should_stop()));
+		if (kthread_should_stop() || rc == -ERESTARTSYS) {
 			cmm_pages_target = cmm_pages;
 			cmm_timed_pages_target = cmm_timed_pages;
 			break;
@@ -175,17 +173,9 @@ cmm_thread(void *dummy)
 	return 0;
 }
 
-static void
-cmm_start_thread(void)
-{
-	kernel_thread(cmm_thread, NULL, 0);
-}
-
 static void
 cmm_kick_thread(void)
 {
-	if (!test_and_set_bit(0, &cmm_thread_active))
-		schedule_work(&cmm_thread_starter);
 	wake_up(&cmm_thread_wait);
 }
 
@@ -429,22 +419,48 @@ struct ctl_table_header *cmm_sysctl_header;
 static int
 cmm_init (void)
 {
+	int rc = -ENOMEM;
+
 #ifdef CONFIG_CMM_PROC
 	cmm_sysctl_header = register_sysctl_table(cmm_dir_table, 1);
+	if (!cmm_sysctl_header)
+		goto out;
 #endif
 #ifdef CONFIG_CMM_IUCV
-	smsg_register_callback(SMSG_PREFIX, cmm_smsg_target);
+	rc = smsg_register_callback(SMSG_PREFIX, cmm_smsg_target);
+	if (rc < 0)
+		goto out_smsg;
 #endif
-	register_oom_notifier(&cmm_oom_nb);
-	INIT_WORK(&cmm_thread_starter, (void *) cmm_start_thread, NULL);
+	rc = register_oom_notifier(&cmm_oom_nb);
+	if (rc < 0)
+		goto out_oom_notify;
 	init_waitqueue_head(&cmm_thread_wait);
 	init_timer(&cmm_timer);
-	return 0;
+	cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread");
+	rc = IS_ERR(cmm_thread_ptr) ? PTR_ERR(cmm_thread_ptr) : 0;
+	if (!rc)
+		goto out;
+	/*
+	 * kthread_create failed. undo all the stuff from above again.
+	 */
+	unregister_oom_notifier(&cmm_oom_nb);
+
+out_oom_notify:
+#ifdef CONFIG_CMM_IUCV
+	smsg_unregister_callback(SMSG_PREFIX, cmm_smsg_target);
+out_smsg:
+#endif
+#ifdef CONFIG_CMM_PROC
+	unregister_sysctl_table(cmm_sysctl_header);
+#endif
+out:
+	return rc;
 }
 
 static void
 cmm_exit(void)
 {
+	kthread_stop(cmm_thread_ptr);
 	unregister_oom_notifier(&cmm_oom_nb);
 	cmm_free_pages(cmm_pages, &cmm_pages, &cmm_page_list);
 	cmm_free_pages(cmm_timed_pages, &cmm_timed_pages, &cmm_timed_page_list);
-- 
cgit v1.2.3