From 8d00a6c8f6b08e7167bc03bf955cdc7e47c5132e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 22 Jul 2008 08:39:57 +0200
Subject: genirq: remove last NO_IDLE_HZ leftovers

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irq.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 8ccb462ea42..f3047df2d23 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -197,10 +197,6 @@ extern int setup_irq(unsigned int irq, struct irqaction *new);
 
 #ifdef CONFIG_GENERIC_HARDIRQS
 
-#ifndef handle_dynamic_tick
-# define handle_dynamic_tick(a)		do { } while (0)
-#endif
-
 #ifdef CONFIG_SMP
 
 #if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
-- 
cgit v1.2.3


From 6902c0bead4ce266226fc0c5b3828b850bdc884a Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Fri, 6 Jun 2008 01:33:22 -0400
Subject: Input: gameport - make gameport_register_driver() return errors

Perform actual driver registration right in gameport_register_driver()
instead of offloading it to kgameportd and return proper error code to
callers if driver registration fails.

Note that driver <-> port matching is still done by kgameportd.

Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 include/linux/gameport.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gameport.h b/include/linux/gameport.h
index f64e29c0ef3..5126125afd4 100644
--- a/include/linux/gameport.h
+++ b/include/linux/gameport.h
@@ -146,10 +146,11 @@ static inline void gameport_unpin_driver(struct gameport *gameport)
 	mutex_unlock(&gameport->drv_mutex);
 }
 
-void __gameport_register_driver(struct gameport_driver *drv, struct module *owner);
-static inline void gameport_register_driver(struct gameport_driver *drv)
+int __gameport_register_driver(struct gameport_driver *drv,
+				struct module *owner, const char *mod_name);
+static inline int gameport_register_driver(struct gameport_driver *drv)
 {
-	__gameport_register_driver(drv, THIS_MODULE);
+	return __gameport_register_driver(drv, THIS_MODULE, KBUILD_MODNAME);
 }
 
 void gameport_unregister_driver(struct gameport_driver *drv);
-- 
cgit v1.2.3


From 8c4b3c29329eb7ffded3023e6d65bc415cb4e215 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Fri, 6 Jun 2008 01:33:51 -0400
Subject: Input: gameport - mark gameport_register_driver() __must_check

Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 include/linux/gameport.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/gameport.h b/include/linux/gameport.h
index 5126125afd4..0cd825f7363 100644
--- a/include/linux/gameport.h
+++ b/include/linux/gameport.h
@@ -148,7 +148,7 @@ static inline void gameport_unpin_driver(struct gameport *gameport)
 
 int __gameport_register_driver(struct gameport_driver *drv,
 				struct module *owner, const char *mod_name);
-static inline int gameport_register_driver(struct gameport_driver *drv)
+static inline int __must_check gameport_register_driver(struct gameport_driver *drv)
 {
 	return __gameport_register_driver(drv, THIS_MODULE, KBUILD_MODNAME);
 }
-- 
cgit v1.2.3


From 03bac96fae0efdb25e2059e5accbe4f3ee6328dd Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Mon, 23 Jun 2008 10:47:34 -0400
Subject: Input: expand keycode space

Expand the number of potential key codes from 512 to 768 since people
are coming up with more and more keys.

Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 include/linux/input.h           | 2 +-
 include/linux/mod_devicetable.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/input.h b/include/linux/input.h
index a5802c9c81a..7fae1dee356 100644
--- a/include/linux/input.h
+++ b/include/linux/input.h
@@ -579,7 +579,7 @@ struct input_absinfo {
 
 /* We avoid low common keys in module aliases so they don't get huge. */
 #define KEY_MIN_INTERESTING	KEY_MUTE
-#define KEY_MAX			0x1ff
+#define KEY_MAX			0x2ff
 #define KEY_CNT			(KEY_MAX+1)
 
 /*
diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index c4db5827963..0dddfa44ec1 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -274,7 +274,7 @@ struct pcmcia_device_id {
 /* Input */
 #define INPUT_DEVICE_ID_EV_MAX		0x1f
 #define INPUT_DEVICE_ID_KEY_MIN_INTERESTING	0x71
-#define INPUT_DEVICE_ID_KEY_MAX		0x1ff
+#define INPUT_DEVICE_ID_KEY_MAX		0x2ff
 #define INPUT_DEVICE_ID_REL_MAX		0x0f
 #define INPUT_DEVICE_ID_ABS_MAX		0x3f
 #define INPUT_DEVICE_ID_MSC_MAX		0x07
-- 
cgit v1.2.3


From 5a599a15182ed48e5bf54111feb3b21e425e194d Mon Sep 17 00:00:00 2001
From: Aristeu Rozanski <aris@ruivo.org>
Date: Mon, 23 Jun 2008 10:47:53 -0400
Subject: Input: add keycodes for remote controls/phone keypads

The new keys are separate from normal numeric keys and standard
numeric keypads. The userspace should not attempt to apply modifiers
like shift and NumLock to these so they work properly regardless of
the language mapping used.

Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 include/linux/input.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/input.h b/include/linux/input.h
index 7fae1dee356..b86fb5581ce 100644
--- a/include/linux/input.h
+++ b/include/linux/input.h
@@ -577,6 +577,19 @@ struct input_absinfo {
 #define KEY_BRL_DOT9		0x1f9
 #define KEY_BRL_DOT10		0x1fa
 
+#define KEY_NUMERIC_0		0x200	/* used by phones, remote controls, */
+#define KEY_NUMERIC_1		0x201	/* and other keypads */
+#define KEY_NUMERIC_2		0x202
+#define KEY_NUMERIC_3		0x203
+#define KEY_NUMERIC_4		0x204
+#define KEY_NUMERIC_5		0x205
+#define KEY_NUMERIC_6		0x206
+#define KEY_NUMERIC_7		0x207
+#define KEY_NUMERIC_8		0x208
+#define KEY_NUMERIC_9		0x209
+#define KEY_NUMERIC_STAR	0x20a
+#define KEY_NUMERIC_POUND	0x20b
+
 /* We avoid low common keys in module aliases so they don't get huge. */
 #define KEY_MIN_INTERESTING	KEY_MUTE
 #define KEY_MAX			0x2ff
-- 
cgit v1.2.3


From feb2f55db45919aa80731f8877b60cab454b7b94 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <ext-adrian.hunter@nokia.com>
Date: Fri, 1 Aug 2008 11:53:29 +0300
Subject: [MTD] [OneNAND] Add defines for HF and sync write

Signed-off-by: Adrian Hunter <ext-adrian.hunter@nokia.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 include/linux/mtd/onenand_regs.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mtd/onenand_regs.h b/include/linux/mtd/onenand_regs.h
index d1b310c92eb..0c6bbe28f38 100644
--- a/include/linux/mtd/onenand_regs.h
+++ b/include/linux/mtd/onenand_regs.h
@@ -152,6 +152,8 @@
 #define ONENAND_SYS_CFG1_INT		(1 << 6)
 #define ONENAND_SYS_CFG1_IOBE		(1 << 5)
 #define ONENAND_SYS_CFG1_RDY_CONF	(1 << 4)
+#define ONENAND_SYS_CFG1_HF		(1 << 2)
+#define ONENAND_SYS_CFG1_SYNC_WRITE	(1 << 1)
 
 /*
  * Controller Status Register F240h (R)
-- 
cgit v1.2.3


From 2e489e077a6ad118c4f247faedf330117b107cce Mon Sep 17 00:00:00 2001
From: Alexey Korolev <akorolev@infradead.org>
Date: Tue, 5 Aug 2008 16:39:42 +0100
Subject: [MTD] [NOR] Add qry_mode_on()/qry_mode_off() to deal with odd chips

There are some CFI chips which require non-standard procedures to get
into QRY mode. A possible way to support them is to try different
modes until QRY is read. This patch introduces two new functions,
qry_mode_on and qry_mode_off. qry_mode_on tries different commands
in order to switch the chip into QRY mode.

So if we have one more "odd" chip, we can just add several lines to
qry_mode_on. Using these functions also removes unnecessary code
duplication in the probe procedure.

Currently there are two "odd" cases
1. Some old intel chips which require 0xFF before 0x98
2. ST M29DW chip which requires 0x98 to be sent at 0x555 (according to
CFI should be 0x55)

This patch is partially based on the patch from Uwe
(see "[PATCH 2/4] [RFC][MTD] cfi_probe: remove Intel chip workaround"
thread )

Signed-off-by: Alexey Korolev <akorolev@infradead.org>
Signed-off-by: Alexander Belyakov <abelyako@gmail.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 include/linux/mtd/cfi.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mtd/cfi.h b/include/linux/mtd/cfi.h
index d6fb115f5a0..3058917d7b9 100644
--- a/include/linux/mtd/cfi.h
+++ b/include/linux/mtd/cfi.h
@@ -12,6 +12,7 @@
 #include <linux/mtd/flashchip.h>
 #include <linux/mtd/map.h>
 #include <linux/mtd/cfi_endian.h>
+#include <linux/mtd/xip.h>
 
 #ifdef CONFIG_MTD_CFI_I1
 #define cfi_interleave(cfi) 1
@@ -430,7 +431,6 @@ static inline uint32_t cfi_send_gen_cmd(u_char cmd, uint32_t cmd_addr, uint32_t
 {
 	map_word val;
 	uint32_t addr = base + cfi_build_cmd_addr(cmd_addr, cfi_interleave(cfi), type);
-
 	val = cfi_build_cmd(cmd, map, cfi);
 
 	if (prev_val)
@@ -483,6 +483,13 @@ static inline void cfi_udelay(int us)
 	}
 }
 
+int __xipram qry_present(struct map_info *map, __u32 base,
+				struct cfi_private *cfi);
+int __xipram qry_mode_on(uint32_t base, struct map_info *map,
+				struct cfi_private *cfi);
+void __xipram qry_mode_off(uint32_t base, struct map_info *map,
+				struct cfi_private *cfi);
+
 struct cfi_extquery *cfi_read_pri(struct map_info *map, uint16_t adr, uint16_t size,
 			     const char* name);
 struct cfi_fixup {
-- 
cgit v1.2.3


From e93cafe45fd74935e0aca2b79e533f0e3ed9640f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Anders=20Grafstr=C3=B6m?= <grfstrm@users.sourceforge.net>
Date: Tue, 5 Aug 2008 18:37:41 +0200
Subject: [MTD] [NOR] cfi_cmdset_0001: Timeouts for erase, write and unlock
 operations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Timeouts are currently given by the typical operation time times 8.
It works in the general well-behaved case but not when an erase block is
failing. For erase operations, it seems that a failing erase block will
keep the device state machine in erasing state until the vendor
specified maximum timeout period has passed. By this time the driver
would have long since timed out, left erasing state and attempted
further operations which all fail. This patch implements timeouts using
values from the CFI Query structure when available.
The patch also sets a longer timeout for locking operations. The current
value used for locking/unlocking given by 1000000/HZ microseconds is too
short for devices like J3 and J5 Strataflash which have a typical clear
lock-bits time of 0.5 seconds.

Signed-off-by: Anders Grafström <grfstrm@users.sourceforge.net>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 include/linux/mtd/flashchip.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mtd/flashchip.h b/include/linux/mtd/flashchip.h
index 08dd131301c..d4f38c5fd44 100644
--- a/include/linux/mtd/flashchip.h
+++ b/include/linux/mtd/flashchip.h
@@ -73,6 +73,10 @@ struct flchip {
 	int buffer_write_time;
 	int erase_time;
 
+	int word_write_time_max;
+	int buffer_write_time_max;
+	int erase_time_max;
+
 	void *priv;
 };
 
-- 
cgit v1.2.3


From c314dfdc358847eef0fc07ec8682e1acc8cadd00 Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Thu, 7 Aug 2008 11:55:07 +0100
Subject: [MTD] [NOR] Rename and export new cfi_qry_*() functions

They need to be exported, so let's give them less generic-sounding names
while we're at it.

Original export patch, along with the suggestion about the nomenclature,
from Stephen Rothwell.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 include/linux/mtd/cfi.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mtd/cfi.h b/include/linux/mtd/cfi.h
index 3058917d7b9..ee5124ec319 100644
--- a/include/linux/mtd/cfi.h
+++ b/include/linux/mtd/cfi.h
@@ -483,12 +483,12 @@ static inline void cfi_udelay(int us)
 	}
 }
 
-int __xipram qry_present(struct map_info *map, __u32 base,
-				struct cfi_private *cfi);
-int __xipram qry_mode_on(uint32_t base, struct map_info *map,
-				struct cfi_private *cfi);
-void __xipram qry_mode_off(uint32_t base, struct map_info *map,
-				struct cfi_private *cfi);
+int __xipram cfi_qry_present(struct map_info *map, __u32 base,
+			     struct cfi_private *cfi);
+int __xipram cfi_qry_mode_on(uint32_t base, struct map_info *map,
+			     struct cfi_private *cfi);
+void __xipram cfi_qry_mode_off(uint32_t base, struct map_info *map,
+			       struct cfi_private *cfi);
 
 struct cfi_extquery *cfi_read_pri(struct map_info *map, uint16_t adr, uint16_t size,
 			     const char* name);
-- 
cgit v1.2.3


From b845b517b5e3706a3729f6ea83b88ab85f0725b0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 8 Aug 2008 21:47:09 +0200
Subject: printk: robustify printk

Avoid deadlocks against rq->lock and xtime_lock by deferring the klogd
wakeup by polling from the timer tick.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/kernel.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index aaa998f65c7..113ac8d0425 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -200,6 +200,8 @@ extern struct ratelimit_state printk_ratelimit_state;
 extern int printk_ratelimit(void);
 extern bool printk_timed_ratelimit(unsigned long *caller_jiffies,
 				   unsigned int interval_msec);
+extern void printk_tick(void);
+extern int printk_needs_cpu(int);
 #else
 static inline int vprintk(const char *s, va_list args)
 	__attribute__ ((format (printf, 1, 0)));
@@ -211,6 +213,8 @@ static inline int printk_ratelimit(void) { return 0; }
 static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies, \
 					  unsigned int interval_msec)	\
 		{ return false; }
+static inline void printk_tick(void) { }
+static inline int printk_needs_cpu(int) { return 0; }
 #endif
 
 extern void asmlinkage __attribute__((format(printf, 1, 2)))
-- 
cgit v1.2.3


From ced9cd40ac14111befd6b0c73ec90106c22a3fd7 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 11 Aug 2008 14:38:12 +0200
Subject: printk: robustify printk, fix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

fix:

 include/linux/kernel.h: In function ‘printk_needs_cpu’:
 include/linux/kernel.h:217: error: parameter name omitted

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/kernel.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 113ac8d0425..3652a456412 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -200,8 +200,6 @@ extern struct ratelimit_state printk_ratelimit_state;
 extern int printk_ratelimit(void);
 extern bool printk_timed_ratelimit(unsigned long *caller_jiffies,
 				   unsigned int interval_msec);
-extern void printk_tick(void);
-extern int printk_needs_cpu(int);
 #else
 static inline int vprintk(const char *s, va_list args)
 	__attribute__ ((format (printf, 1, 0)));
@@ -213,10 +211,11 @@ static inline int printk_ratelimit(void) { return 0; }
 static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies, \
 					  unsigned int interval_msec)	\
 		{ return false; }
-static inline void printk_tick(void) { }
-static inline int printk_needs_cpu(int) { return 0; }
 #endif
 
+extern int printk_needs_cpu(int cpu);
+extern void printk_tick(void);
+
 extern void asmlinkage __attribute__((format(printf, 1, 2)))
 	early_printk(const char *fmt, ...);
 
-- 
cgit v1.2.3


From bb0eb217c980d50c45f3e793b4dcc70ab9ee820d Mon Sep 17 00:00:00 2001
From: Adrian Hunter <ext-adrian.hunter@nokia.com>
Date: Tue, 12 Aug 2008 12:40:50 +0300
Subject: [MTD] Define and use MTD_FAIL_ADDR_UNKNOWN instead of 0xffffffff

Signed-off-by: Adrian Hunter <ext-adrian.hunter@nokia.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 include/linux/mtd/mtd.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index 92263654855..eae26bb6430 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -25,8 +25,10 @@
 #define MTD_ERASE_DONE          0x08
 #define MTD_ERASE_FAILED        0x10
 
+#define MTD_FAIL_ADDR_UNKNOWN 0xffffffff
+
 /* If the erase fails, fail_addr might indicate exactly which block failed.  If
-   fail_addr = 0xffffffff, the failure was not at the device level or was not
+   fail_addr = MTD_FAIL_ADDR_UNKNOWN, the failure was not at the device level or was not
    specific to any particular block. */
 struct erase_info {
 	struct mtd_info *mtd;
-- 
cgit v1.2.3


From 17c1d2be28e485c0c8b09661db39d5bf2605069d Mon Sep 17 00:00:00 2001
From: Alexey Korolev <akorolev@infradead.org>
Date: Wed, 20 Aug 2008 22:32:08 +0100
Subject: [MTD] [NAND] Fix missing kernel-doc

[Reported by Randy Dunlap]

Signed-off-by: Alexey Korolev <akorolev@infradead.org>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 include/linux/mtd/nand.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index 81774e5facf..733d3f3b4eb 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -248,6 +248,7 @@ struct nand_hw_control {
  * @read_page_raw:	function to read a raw page without ECC
  * @write_page_raw:	function to write a raw page without ECC
  * @read_page:	function to read a page according to the ecc generator requirements
+ * @read_subpage:	function to read parts of the page covered by ECC.
  * @write_page:	function to write a page according to the ecc generator requirements
  * @read_oob:	function to read chip OOB data
  * @write_oob:	function to write chip OOB data
-- 
cgit v1.2.3


From 1aa5dfb751d275ae7117d3b73ac423b4a46f2a73 Mon Sep 17 00:00:00 2001
From: John Stultz <johnstul@us.ibm.com>
Date: Wed, 20 Aug 2008 16:37:28 -0700
Subject: clocksource: keep track of original clocksource frequency

The clocksource frequency is represented by
clocksource->mult/2^(clocksource->shift).  Currently, when NTP makes
adjustments to the clock frequency, they are made directly to the mult
value.

This has the drawback that once changed, we cannot know what the original
mult value was, or how much adjustment has been applied.

This property causes problems in calculating proper ntp intervals when
switching back and forth between clocksources.

This patch separates the current mult value into a mult and mult_orig
pair.  The mult_orig value stays constant, while the ntp clocksource
adjustments are done only to the mult value.

This allows for correct ntp interval calculation and additionally lays the
groundwork for a new notion of time, what I'm calling the monotonic-raw
time, which is introduced in a following patch.

Signed-off-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/clocksource.h | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 55e434feec9..f0a7fb98441 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -45,7 +45,8 @@ struct clocksource;
  * @read:		returns a cycle value
  * @mask:		bitmask for two's complement
  *			subtraction of non 64 bit counters
- * @mult:		cycle to nanosecond multiplier
+ * @mult:		cycle to nanosecond multiplier (adjusted by NTP)
+ * @mult_orig:		cycle to nanosecond multiplier (unadjusted by NTP)
  * @shift:		cycle to nanosecond divisor (power of two)
  * @flags:		flags describing special properties
  * @vread:		vsyscall based read
@@ -63,6 +64,7 @@ struct clocksource {
 	cycle_t (*read)(void);
 	cycle_t mask;
 	u32 mult;
+	u32 mult_orig;
 	u32 shift;
 	unsigned long flags;
 	cycle_t (*vread)(void);
@@ -201,16 +203,17 @@ static inline void clocksource_calculate_interval(struct clocksource *c,
 {
 	u64 tmp;
 
-	/* XXX - All of this could use a whole lot of optimization */
+	/* Do the ns -> cycle conversion first, using original mult */
 	tmp = length_nsec;
 	tmp <<= c->shift;
-	tmp += c->mult/2;
-	do_div(tmp, c->mult);
+	tmp += c->mult_orig/2;
+	do_div(tmp, c->mult_orig);
 
 	c->cycle_interval = (cycle_t)tmp;
 	if (c->cycle_interval == 0)
 		c->cycle_interval = 1;
 
+	/* Go back from cycles -> shifted ns, this time use ntp adjused mult */
 	c->xtime_interval = (u64)c->cycle_interval * c->mult;
 }
 
-- 
cgit v1.2.3


From 2d42244ae71d6c7b0884b5664cf2eda30fb2ae68 Mon Sep 17 00:00:00 2001
From: John Stultz <johnstul@us.ibm.com>
Date: Wed, 20 Aug 2008 16:37:30 -0700
Subject: clocksource: introduce CLOCK_MONOTONIC_RAW

In talking with Josip Loncaric, and his work on clock synchronization (see
btime.sf.net), he mentioned that for really close synchronization, it is
useful to have access to "hardware time", that is a notion of time that is
not in any way adjusted by the clock slewing done to keep close time sync.

Part of the issue is if we are using the kernel's ntp adjusted
representation of time in order to measure how we should correct time, we
can run into what Paul McKenney aptly described as "Painting a road using
the lines we're painting as the guide".

I had been thinking of a similar problem, and was trying to come up with a
way to give users access to a purely hardware based time representation
that avoided users having to know the underlying frequency and mask values
needed to deal with the wide variety of possible underlying hardware
counters.

My solution is to introduce CLOCK_MONOTONIC_RAW.  This exposes a
nanosecond based time value, that increments starting at bootup and has no
frequency adjustments made to it whatsoever.

The time is accessed from userspace via the posix_clock_gettime() syscall,
passing CLOCK_MONOTONIC_RAW as the clock_id.

Signed-off-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/clocksource.h | 3 +++
 include/linux/time.h        | 2 ++
 2 files changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index f0a7fb98441..f88d32f8ff7 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -79,6 +79,7 @@ struct clocksource {
 	/* timekeeping specific data, ignore */
 	cycle_t cycle_interval;
 	u64	xtime_interval;
+	u32	raw_interval;
 	/*
 	 * Second part is written at each timer interrupt
 	 * Keep it in a different cache line to dirty no
@@ -87,6 +88,7 @@ struct clocksource {
 	cycle_t cycle_last ____cacheline_aligned_in_smp;
 	u64 xtime_nsec;
 	s64 error;
+	struct timespec raw_time;
 
 #ifdef CONFIG_CLOCKSOURCE_WATCHDOG
 	/* Watchdog related data, used by the framework */
@@ -215,6 +217,7 @@ static inline void clocksource_calculate_interval(struct clocksource *c,
 
 	/* Go back from cycles -> shifted ns, this time use ntp adjused mult */
 	c->xtime_interval = (u64)c->cycle_interval * c->mult;
+	c->raw_interval = ((u64)c->cycle_interval * c->mult_orig) >> c->shift;
 }
 
 
diff --git a/include/linux/time.h b/include/linux/time.h
index e15206a7e82..205f974b9eb 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -117,6 +117,7 @@ extern int do_setitimer(int which, struct itimerval *value,
 extern unsigned int alarm_setitimer(unsigned int seconds);
 extern int do_getitimer(int which, struct itimerval *value);
 extern void getnstimeofday(struct timespec *tv);
+extern void getrawmonotonic(struct timespec *ts);
 extern void getboottime(struct timespec *ts);
 extern void monotonic_to_bootbased(struct timespec *ts);
 
@@ -214,6 +215,7 @@ struct itimerval {
 #define CLOCK_MONOTONIC			1
 #define CLOCK_PROCESS_CPUTIME_ID	2
 #define CLOCK_THREAD_CPUTIME_ID		3
+#define CLOCK_MONOTONIC_RAW		4
 
 /*
  * The IDs of various hardware clocks:
-- 
cgit v1.2.3


From 916c7a855174e3b53d182b97a26b2e27a29726a1 Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Wed, 20 Aug 2008 16:46:08 -0700
Subject: ntp: fix ADJ_OFFSET_SS_READ bug and do_adjtimex() cleanup

Thanks to the review by Michael Kerrisk a bug in the recent
ADJ_OFFSET_SS_READ option was discovered, where the ntp time_offset was
inadvertently set by it.  This fixes this by making the adjtime code
more separate from the ntp_adjtime code (both of which really want to
be separate syscalls).

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/timex.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/timex.h b/include/linux/timex.h
index fc6035d29d5..c00bcdd3ae4 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -141,8 +141,15 @@ struct timex {
 #define ADJ_MICRO		0x1000	/* select microsecond resolution */
 #define ADJ_NANO		0x2000	/* select nanosecond resolution */
 #define ADJ_TICK		0x4000	/* tick value */
+
+#ifdef __KERNEL__
+#define ADJ_ADJTIME		0x8000	/* switch between adjtime/adjtimex modes */
+#define ADJ_OFFSET_SINGLESHOT	0x0001	/* old-fashioned adjtime */
+#define ADJ_OFFSET_READONLY	0x2000	/* read-only adjtime */
+#else
 #define ADJ_OFFSET_SINGLESHOT	0x8001	/* old-fashioned adjtime */
-#define ADJ_OFFSET_SS_READ	0xa001  /* read-only adjtime */
+#define ADJ_OFFSET_SS_READ	0xa001	/* read-only adjtime */
+#endif
 
 /* xntp 3.4 compatibility names */
 #define MOD_OFFSET	ADJ_OFFSET
-- 
cgit v1.2.3


From 942ed161944b3476639916cf544e6975b29c985a Mon Sep 17 00:00:00 2001
From: Matthew Garrett <mjg59@srcf.ucam.org>
Date: Tue, 26 Aug 2008 21:09:59 +0100
Subject: power_supply: Add function to return system-wide power state

Certain drivers benefit from knowing whether the system is on ac or
battery, for instance when determining which backlight registers to
read. This adds a simple call to determine whether there's an online
power supply other than any batteries.

Signed-off-by: Matthew Garrett <mjg@redhat.com>
Signed-off-by: Anton Vorontsov <cbouatmailru@gmail.com>
---
 include/linux/power_supply.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index ea96ead1d39..f9348cba6dc 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -165,6 +165,12 @@ struct power_supply_info {
 extern void power_supply_changed(struct power_supply *psy);
 extern int power_supply_am_i_supplied(struct power_supply *psy);
 
+#if defined(CONFIG_POWER_SUPPLY) || defined(CONFIG_POWER_SUPPLY_MODULE)
+extern int power_supply_is_system_supplied(void);
+#else
+static inline int power_supply_is_system_supplied(void) { return -ENOSYS; }
+#endif
+
 extern int power_supply_register(struct device *parent,
 				 struct power_supply *psy);
 extern void power_supply_unregister(struct power_supply *psy);
-- 
cgit v1.2.3


From 978b0116cd225682a29e3d1d5010319bf2de32c2 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sat, 6 Sep 2008 20:04:36 +0200
Subject: softirq: allocate less vectors

We don't need all 32 of them, only NR_SOFTIRQS.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/interrupt.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 58ff4e74b2f..54b3623434e 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -252,6 +252,8 @@ enum
 	HRTIMER_SOFTIRQ,
 #endif
 	RCU_SOFTIRQ, 	/* Preferable RCU should always be the last softirq */
+
+	NR_SOFTIRQS
 };
 
 /* softirq mask and active fields moved to irq_cpustat_t in
-- 
cgit v1.2.3


From 4d5975e5016a9025814b92981de21eaf9203caa6 Mon Sep 17 00:00:00 2001
From: Eric Miao <eric.y.miao@gmail.com>
Date: Wed, 10 Sep 2008 12:06:15 -0400
Subject: Input: ads7846 - introduce .gpio_pendown to get pendown state

The GPIO connected to ADS7846 nPENIRQ signal is usually used to get
the pendown state as well. Introduce a .gpio_pendown, and use this
to decide the pendown state if .get_pendown_state is NULL.

Signed-off-by: Eric Miao <eric.miao@marvell.com>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 include/linux/spi/ads7846.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/spi/ads7846.h b/include/linux/spi/ads7846.h
index daf744017a3..05eab2f11e6 100644
--- a/include/linux/spi/ads7846.h
+++ b/include/linux/spi/ads7846.h
@@ -43,6 +43,9 @@ struct ads7846_platform_data {
 	u16	debounce_tol;		/* tolerance used for filtering */
 	u16	debounce_rep;		/* additional consecutive good readings
 					 * required after the first two */
+	int	gpio_pendown;		/* the GPIO used to decide the pendown
+					 * state if get_pendown_state == NULL
+					 */
 	int	(*get_pendown_state)(void);
 	int	(*filter_init)	(struct ads7846_platform_data *pdata,
 				 void **filter_data);
-- 
cgit v1.2.3


From f06febc96ba8e0af80bcc3eaec0a109e88275fac Mon Sep 17 00:00:00 2001
From: Frank Mayhar <fmayhar@google.com>
Date: Fri, 12 Sep 2008 09:54:39 -0700
Subject: timers: fix itimer/many thread hang

Overview

This patch reworks the handling of POSIX CPU timers, including the
ITIMER_PROF, ITIMER_VIRT timers and rlimit handling.  It was put together
with the help of Roland McGrath, the owner and original writer of this code.

The problem we ran into, and the reason for this rework, has to do with using
a profiling timer in a process with a large number of threads.  It appears
that the performance of the old implementation of run_posix_cpu_timers() was
at least O(n*3) (where "n" is the number of threads in a process) or worse.
Everything is fine with an increasing number of threads until the time taken
for that routine to run becomes the same as or greater than the tick time, at
which point things degrade rather quickly.

This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF."

Code Changes

This rework corrects the implementation of run_posix_cpu_timers() to make it
run in constant time for a particular machine.  (Performance may vary between
one machine and another depending upon whether the kernel is built as single-
or multiprocessor and, in the latter case, depending upon the number of
running processors.)  To do this, at each tick we now update fields in
signal_struct as well as task_struct.  The run_posix_cpu_timers() function
uses those fields to make its decisions.

We define a new structure, "task_cputime," to contain user, system and
scheduler times and use these in appropriate places:

struct task_cputime {
	cputime_t utime;
	cputime_t stime;
	unsigned long long sum_exec_runtime;
};

This is included in the structure "thread_group_cputime," which is a new
substructure of signal_struct and which varies for uniprocessor versus
multiprocessor kernels.  For uniprocessor kernels, it uses "task_cputime" as
a simple substructure, while for multiprocessor kernels it is a pointer:

struct thread_group_cputime {
	struct task_cputime totals;
};

struct thread_group_cputime {
	struct task_cputime *totals;
};

We also add a new task_cputime substructure directly to signal_struct, to
cache the earliest expiration of process-wide timers, and task_cputime also
replaces the it_*_expires fields of task_struct (used for earliest expiration
of thread timers).  The "thread_group_cputime" structure contains process-wide
timers that are updated via account_user_time() and friends.  In the non-SMP
case the structure is a simple aggregator; unfortunately in the SMP case that
simplicity was not achievable due to cache-line contention between CPUs (in
one measured case performance was actually _worse_ on a 16-cpu system than
the same test on a 4-cpu system, due to this contention).  For SMP, the
thread_group_cputime counters are maintained as a per-cpu structure allocated
using alloc_percpu().  The timer functions update only the timer field in
the structure corresponding to the running CPU, obtained using per_cpu_ptr().

We define a set of inline functions in sched.h that we use to maintain the
thread_group_cputime structure and hide the differences between UP and SMP
implementations from the rest of the kernel.  The thread_group_cputime_init()
function initializes the thread_group_cputime structure for the given task.
The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the
out-of-line function thread_group_cputime_alloc_smp() to allocate and fill
in the per-cpu structures and fields.  The thread_group_cputime_free()
function, also a no-op for UP, in SMP frees the per-cpu structures.  The
thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls
thread_group_cputime_alloc() if the per-cpu structures haven't yet been
allocated.  The thread_group_cputime() function fills the task_cputime
structure it is passed with the contents of the thread_group_cputime fields;
in UP it's that simple but in SMP it must also safely check that tsk->signal
is non-NULL (if it is NULL, it just uses the appropriate fields of task_struct)
and, if it is not, sums the per-cpu values for each online CPU.  Finally, the three
functions account_group_user_time(), account_group_system_time() and
account_group_exec_runtime() are used by timer functions to update the
respective fields of the thread_group_cputime structure.

Non-SMP operation is trivial and will not be mentioned further.

The per-cpu structure is always allocated when a task creates its first new
thread, via a call to thread_group_cputime_clone_thread() from copy_signal().
It is freed at process exit via a call to thread_group_cputime_free() from
cleanup_signal().

All functions that formerly summed utime/stime/sum_sched_runtime values from
all threads in the thread group now use thread_group_cputime() to
snapshot the values in the thread_group_cputime structure or the values in
the task structure itself if the per-cpu structure hasn't been allocated.

Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit.
The run_posix_cpu_timers() function has been split into a fast path and a
slow path; the former safely checks whether there are any expired thread
timers and, if not, just returns, while the slow path does the heavy lifting.
With the dedicated thread group fields, timers are no longer "rebalanced" and
the process_timer_rebalance() function and related code has gone away.  All
summing loops are gone and all code that used them now uses the
thread_group_cputime() inline.  When process-wide timers are set, the new
task_cputime structure in signal_struct is used to cache the earliest
expiration; this is checked in the fast path.

Performance

The fix appears not to add significant overhead to existing operations.  It
generally performs the same as the current code except in two cases, one in
which it performs slightly worse (Case 5 below) and one in which it performs
very significantly better (Case 2 below).  Overall it's a wash except in those
two cases.

I've since done somewhat more involved testing on a dual-core Opteron system.

Case 1: With no itimer running, for a test with 100,000 threads, the fixed
	kernel took 1428.5 seconds, 513 seconds more than the unfixed system,
	all of which was spent in the system.  There were twice as many
	voluntary context switches with the fix as without it.

Case 2: With an itimer running at .01 second ticks and 4000 threads (the most
	an unmodified kernel can handle), the fixed kernel ran the test in
	eight percent of the time (5.8 seconds as opposed to 70 seconds) and
	had better tick accuracy (.012 seconds per tick as opposed to .023
	seconds per tick).

Case 3: A 4000-thread test with an initial timer tick of .01 second and an
	interval of 10,000 seconds (i.e. a timer that ticks only once) had
	very nearly the same performance in both cases:  6.3 seconds elapsed
	for the fixed kernel versus 5.5 seconds for the unfixed kernel.

With fewer threads (eight in these tests), the Case 1 test ran in essentially
the same time on both the modified and unmodified kernels (5.2 seconds versus
5.8 seconds).  The Case 2 test ran in about the same time as well, 5.9 seconds
versus 5.4 seconds but again with much better tick accuracy, .013 seconds per
tick versus .025 seconds per tick for the unmodified kernel.

Since the fix affected the rlimit code, I also tested soft and hard CPU limits.

Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer
	running), the modified kernel was very slightly favored in that while
	it killed the process in 19.997 seconds of CPU time (5.002 seconds of
	wall time), only .003 seconds of that was system time, the rest was
	user time.  The unmodified kernel killed the process in 20.001 seconds
	of CPU (5.014 seconds of wall time) of which .016 seconds was system
	time.  Really, though, the results were too close to call.  The results
	were essentially the same with no itimer running.

Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds
	(where the hard limit would never be reached) and an itimer running,
	the modified kernel exhibited worse tick accuracy than the unmodified
	kernel: .050 seconds/tick versus .028 seconds/tick.  Otherwise,
	performance was almost indistinguishable.  With no itimer running this
	test exhibited virtually identical behavior and times in both cases.

In times past I did some limited performance testing.  Those results are below.

On a four-cpu Opteron system without this fix, a sixteen-thread test executed
in 3569.991 seconds, of which user was 3568.435s and system was 1.556s.  On
the same system with the fix, user and elapsed time were about the same, but
system time dropped to 0.007 seconds.  Performance with eight, four and one
thread was comparable.  Interestingly, the timer ticks with the fix seemed
more accurate:  The sixteen-thread test with the fix received 149543 ticks
for 0.024 seconds per tick, while the same test without the fix received 58720
for 0.061 seconds per tick.  Both cases were configured for an interval of
0.01 seconds.  Again, the other tests were comparable.  Each thread in this
test computed the primes up to 25,000,000.

I also did a test with a large number of threads, 100,000 threads, which is
impossible without the fix.  In this case each thread computed the primes only
up to 10,000 (to make the runtime manageable).  System time dominated, at
1546.968 seconds out of a total 2176.906 seconds (giving a user time of
629.938s).  It received 147651 ticks for 0.015 seconds per tick, still quite
accurate.  There is obviously no comparable test without the fix.

Signed-off-by: Frank Mayhar <fmayhar@google.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/posix-timers.h |   2 +
 include/linux/sched.h        | 257 ++++++++++++++++++++++++++++++++++++++++---
 include/linux/time.h         |   3 +
 3 files changed, 249 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index a7dd38f30ad..f9d8e9e94e9 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -115,4 +115,6 @@ void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx,
 
 long clock_nanosleep_restart(struct restart_block *restart_block);
 
+void update_rlimit_cpu(unsigned long rlim_new);
+
 #endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3d9120c5ad1..26d7a5f2d0b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -425,6 +425,45 @@ struct pacct_struct {
 	unsigned long		ac_minflt, ac_majflt;
 };
 
+/**
+ * struct task_cputime - collected CPU time counts
+ * @utime:		time spent in user mode, in &cputime_t units
+ * @stime:		time spent in kernel mode, in &cputime_t units
+ * @sum_exec_runtime:	total time spent on the CPU, in nanoseconds
+ * 
+ * This structure groups together three kinds of CPU time that are
+ * tracked for threads and thread groups.  Most things considering
+ * CPU time want to group these counts together and treat all three
+ * of them in parallel.
+ */
+struct task_cputime {
+	cputime_t utime;
+	cputime_t stime;
+	unsigned long long sum_exec_runtime;
+};
+/* Alternate field names when used to cache expirations. */
+#define prof_exp	stime
+#define virt_exp	utime
+#define sched_exp	sum_exec_runtime
+
+/**
+ * struct thread_group_cputime - thread group interval timer counts
+ * @totals:		thread group interval timers; substructure for
+ *			uniprocessor kernel, per-cpu for SMP kernel.
+ *
+ * This structure contains the version of task_cputime, above, that is
+ * used for thread group CPU clock calculations.
+ */
+#ifdef CONFIG_SMP
+struct thread_group_cputime {
+	struct task_cputime *totals;
+};
+#else
+struct thread_group_cputime {
+	struct task_cputime totals;
+};
+#endif
+
 /*
  * NOTE! "signal_struct" does not have it's own
  * locking, because a shared signal_struct always
@@ -470,6 +509,17 @@ struct signal_struct {
 	cputime_t it_prof_expires, it_virt_expires;
 	cputime_t it_prof_incr, it_virt_incr;
 
+	/*
+	 * Thread group totals for process CPU clocks.
+	 * See thread_group_cputime(), et al, for details.
+	 */
+	struct thread_group_cputime cputime;
+
+	/* Earliest-expiration cache. */
+	struct task_cputime cputime_expires;
+
+	struct list_head cpu_timers[3];
+
 	/* job control IDs */
 
 	/*
@@ -500,7 +550,7 @@ struct signal_struct {
 	 * Live threads maintain their own counters and add to these
 	 * in __exit_signal, except for the group leader.
 	 */
-	cputime_t utime, stime, cutime, cstime;
+	cputime_t cutime, cstime;
 	cputime_t gtime;
 	cputime_t cgtime;
 	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
@@ -508,14 +558,6 @@ struct signal_struct {
 	unsigned long inblock, oublock, cinblock, coublock;
 	struct task_io_accounting ioac;
 
-	/*
-	 * Cumulative ns of scheduled CPU time for dead threads in the
-	 * group, not including a zombie group leader.  (This only differs
-	 * from jiffies_to_ns(utime + stime) if sched_clock uses something
-	 * other than jiffies.)
-	 */
-	unsigned long long sum_sched_runtime;
-
 	/*
 	 * We don't bother to synchronize most readers of this at all,
 	 * because there is no reader checking a limit that actually needs
@@ -527,8 +569,6 @@ struct signal_struct {
 	 */
 	struct rlimit rlim[RLIM_NLIMITS];
 
-	struct list_head cpu_timers[3];
-
 	/* keep the process-shared keyrings here so that they do the right
 	 * thing in threads created with CLONE_THREAD */
 #ifdef CONFIG_KEYS
@@ -1134,8 +1174,7 @@ struct task_struct {
 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
 	unsigned long min_flt, maj_flt;
 
-  	cputime_t it_prof_expires, it_virt_expires;
-	unsigned long long it_sched_expires;
+	struct task_cputime cputime_expires;
 	struct list_head cpu_timers[3];
 
 /* process credentials */
@@ -1585,6 +1624,7 @@ extern unsigned long long cpu_clock(int cpu);
 
 extern unsigned long long
 task_sched_runtime(struct task_struct *task);
+extern unsigned long long thread_group_sched_runtime(struct task_struct *task);
 
 /* sched_exec is called by processes performing an exec */
 #ifdef CONFIG_SMP
@@ -2081,6 +2121,197 @@ static inline int spin_needbreak(spinlock_t *lock)
 #endif
 }
 
+/*
+ * Thread group CPU time accounting.
+ */
+#ifdef CONFIG_SMP
+
+extern int thread_group_cputime_alloc_smp(struct task_struct *);
+extern void thread_group_cputime_smp(struct task_struct *, struct task_cputime *);
+
+static inline void thread_group_cputime_init(struct signal_struct *sig)
+{
+	sig->cputime.totals = NULL;
+}
+
+static inline int thread_group_cputime_clone_thread(struct task_struct *curr,
+						    struct task_struct *new)
+{
+	if (curr->signal->cputime.totals)
+		return 0;
+	return thread_group_cputime_alloc_smp(curr);
+}
+
+static inline void thread_group_cputime_free(struct signal_struct *sig)
+{
+	free_percpu(sig->cputime.totals);
+}
+
+/**
+ * thread_group_cputime - Sum the thread group time fields across all CPUs.
+ *
+ * This is a wrapper for the real routine, thread_group_cputime_smp().  See
+ * that routine for details.
+ */
+static inline void thread_group_cputime(
+	struct task_struct *tsk,
+	struct task_cputime *times)
+{
+	thread_group_cputime_smp(tsk, times);
+}
+
+/**
+ * thread_group_cputime_account_user - Maintain utime for a thread group.
+ *
+ * @tgtimes:	Pointer to thread_group_cputime structure.
+ * @cputime:	Time value by which to increment the utime field of that
+ *		structure.
+ *
+ * If thread group time is being maintained, get the structure for the
+ * running CPU and update the utime field there.
+ */
+static inline void thread_group_cputime_account_user(
+	struct thread_group_cputime *tgtimes,
+	cputime_t cputime)
+{
+	if (tgtimes->totals) {
+		struct task_cputime *times;
+
+		times = per_cpu_ptr(tgtimes->totals, get_cpu());
+		times->utime = cputime_add(times->utime, cputime);
+		put_cpu_no_resched();
+	}
+}
+
+/**
+ * thread_group_cputime_account_system - Maintain stime for a thread group.
+ *
+ * @tgtimes:	Pointer to thread_group_cputime structure.
+ * @cputime:	Time value by which to increment the stime field of that
+ *		structure.
+ *
+ * If thread group time is being maintained, get the structure for the
+ * running CPU and update the stime field there.
+ */
+static inline void thread_group_cputime_account_system(
+	struct thread_group_cputime *tgtimes,
+	cputime_t cputime)
+{
+	if (tgtimes->totals) {
+		struct task_cputime *times;
+
+		times = per_cpu_ptr(tgtimes->totals, get_cpu());
+		times->stime = cputime_add(times->stime, cputime);
+		put_cpu_no_resched();
+	}
+}
+
+/**
+ * thread_group_cputime_account_exec_runtime - Maintain exec runtime for a
+ *						thread group.
+ *
+ * @tgtimes:	Pointer to thread_group_cputime structure.
+ * @ns:		Time value by which to increment the sum_exec_runtime field
+ *		of that structure.
+ *
+ * If thread group time is being maintained, get the structure for the
+ * running CPU and update the sum_exec_runtime field there.
+ */
+static inline void thread_group_cputime_account_exec_runtime(
+	struct thread_group_cputime *tgtimes,
+	unsigned long long ns)
+{
+	if (tgtimes->totals) {
+		struct task_cputime *times;
+
+		times = per_cpu_ptr(tgtimes->totals, get_cpu());
+		times->sum_exec_runtime += ns;
+		put_cpu_no_resched();
+	}
+}
+
+#else /* CONFIG_SMP */
+
+static inline void thread_group_cputime_init(struct signal_struct *sig)
+{
+	sig->cputime.totals.utime = cputime_zero;
+	sig->cputime.totals.stime = cputime_zero;
+	sig->cputime.totals.sum_exec_runtime = 0;
+}
+
+static inline int thread_group_cputime_alloc(struct task_struct *tsk)
+{
+	return 0;
+}
+
+static inline void thread_group_cputime_free(struct signal_struct *sig)
+{
+}
+
+static inline int thread_group_cputime_clone_thread(struct task_struct *curr,
+						     struct task_struct *tsk)
+{
+}
+
+static inline void thread_group_cputime(struct task_struct *tsk,
+					 struct task_cputime *cputime)
+{
+	*cputime = tsk->signal->cputime.totals;
+}
+
+static inline void thread_group_cputime_account_user(
+	struct thread_group_cputime *tgtimes,
+	cputime_t cputime)
+{
+	tgtimes->totals->utime = cputime_add(tgtimes->totals->utime, cputime);
+}
+
+static inline void thread_group_cputime_account_system(
+	struct thread_group_cputime *tgtimes,
+	cputime_t cputime)
+{
+	tgtimes->totals->stime = cputime_add(tgtimes->totals->stime, cputime);
+}
+
+static inline void thread_group_cputime_account_exec_runtime(
+	struct thread_group_cputime *tgtimes,
+	unsigned long long ns)
+{
+	tgtimes->totals->sum_exec_runtime += ns;
+}
+
+#endif /* CONFIG_SMP */
+
+static inline void account_group_user_time(struct task_struct *tsk,
+					    cputime_t cputime)
+{
+	struct signal_struct *sig;
+
+	sig = tsk->signal;
+	if (likely(sig))
+		thread_group_cputime_account_user(&sig->cputime, cputime);
+}
+
+static inline void account_group_system_time(struct task_struct *tsk,
+					      cputime_t cputime)
+{
+	struct signal_struct *sig;
+
+	sig = tsk->signal;
+	if (likely(sig))
+		thread_group_cputime_account_system(&sig->cputime, cputime);
+}
+
+static inline void account_group_exec_runtime(struct task_struct *tsk,
+					       unsigned long long ns)
+{
+	struct signal_struct *sig;
+
+	sig = tsk->signal;
+	if (likely(sig))
+		thread_group_cputime_account_exec_runtime(&sig->cputime, ns);
+}
+
 /*
  * Reevaluate whether the task has signals pending delivery.
  * Wake the task if so.
diff --git a/include/linux/time.h b/include/linux/time.h
index e15206a7e82..1b70b3c293e 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -125,6 +125,9 @@ extern int timekeeping_valid_for_hres(void);
 extern void update_wall_time(void);
 extern void update_xtime_cache(u64 nsec);
 
+struct tms;
+extern void do_sys_times(struct tms *);
+
 /**
  * timespec_to_ns - Convert timespec to nanoseconds
  * @ts:		pointer to the timespec variable to be converted
-- 
cgit v1.2.3


From 0a8eaa4f9b58759595a1bfe13a1295fdc25ba026 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 14 Sep 2008 17:03:52 +0200
Subject: timers: fix itimer/many thread hang, fix #2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

fix the UP build:

In file included from arch/x86/kernel/asm-offsets_32.c:9,
                 from arch/x86/kernel/asm-offsets.c:3:
include/linux/sched.h: In function ‘thread_group_cputime_clone_thread’:
include/linux/sched.h:2272: warning: no return statement in function returning non-void
include/linux/sched.h: In function ‘thread_group_cputime_account_user’:
include/linux/sched.h:2284: error: invalid type argument of ‘->’ (have ‘struct task_cputime’)
include/linux/sched.h:2284: error: invalid type argument of ‘->’ (have ‘struct task_cputime’)
include/linux/sched.h: In function ‘thread_group_cputime_account_system’:
include/linux/sched.h:2291: error: invalid type argument of ‘->’ (have ‘struct task_cputime’)
include/linux/sched.h:2291: error: invalid type argument of ‘->’ (have ‘struct task_cputime’)
include/linux/sched.h: In function ‘thread_group_cputime_account_exec_runtime’:
include/linux/sched.h:2298: error: invalid type argument of ‘->’ (have ‘struct task_cputime’)
distcc[14501] ERROR: compile arch/x86/kernel/asm-offsets.c on a/30 failed
make[1]: *** [arch/x86/kernel/asm-offsets.s] Error 1

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 26d7a5f2d0b..ed355f02d32 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2251,6 +2251,7 @@ static inline void thread_group_cputime_free(struct signal_struct *sig)
 static inline int thread_group_cputime_clone_thread(struct task_struct *curr,
 						     struct task_struct *tsk)
 {
+	return 0;
 }
 
 static inline void thread_group_cputime(struct task_struct *tsk,
@@ -2263,21 +2264,21 @@ static inline void thread_group_cputime_account_user(
 	struct thread_group_cputime *tgtimes,
 	cputime_t cputime)
 {
-	tgtimes->totals->utime = cputime_add(tgtimes->totals->utime, cputime);
+	tgtimes->totals.utime = cputime_add(tgtimes->totals.utime, cputime);
 }
 
 static inline void thread_group_cputime_account_system(
 	struct thread_group_cputime *tgtimes,
 	cputime_t cputime)
 {
-	tgtimes->totals->stime = cputime_add(tgtimes->totals->stime, cputime);
+	tgtimes->totals.stime = cputime_add(tgtimes->totals.stime, cputime);
 }
 
 static inline void thread_group_cputime_account_exec_runtime(
 	struct thread_group_cputime *tgtimes,
 	unsigned long long ns)
 {
-	tgtimes->totals->sum_exec_runtime += ns;
+	tgtimes->totals.sum_exec_runtime += ns;
 }
 
 #endif /* CONFIG_SMP */
-- 
cgit v1.2.3


From 5ce73a4a5a4893a1aa4cdeed1b1a5a6de42c43b6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 14 Sep 2008 17:11:46 +0200
Subject: timers: fix itimer/many thread hang, cleanups

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index ed355f02d32..7ce8d4e5356 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -430,7 +430,7 @@ struct pacct_struct {
  * @utime:		time spent in user mode, in &cputime_t units
  * @stime:		time spent in kernel mode, in &cputime_t units
  * @sum_exec_runtime:	total time spent on the CPU, in nanoseconds
- * 
+ *
  * This structure groups together three kinds of CPU time that are
  * tracked for threads and thread groups.  Most things considering
  * CPU time want to group these counts together and treat all three
-- 
cgit v1.2.3


From 600715dcdf567c86f8b2c6173fcfb4b873e25a19 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Thu, 11 Sep 2008 01:31:45 -0700
Subject: generic: add phys_addr_t for holding physical addresses

Add a kernel-wide "phys_addr_t" which is guaranteed to be able to hold
any physical address.  By default it equals the word size of the
architecture, but a 32-bit architecture can set ARCH_PHYS_ADDR_T_64BIT
if it needs a 64-bit phys_addr_t.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/types.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/types.h b/include/linux/types.h
index d4a9ce6e276..022c668496d 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -197,6 +197,12 @@ typedef u64 resource_size_t;
 typedef u32 resource_size_t;
 #endif
 
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+typedef u64 phys_addr_t;
+#else
+typedef u32 phys_addr_t;
+#endif
+
 struct ustat {
 	__kernel_daddr_t	f_tfree;
 	__kernel_ino_t		f_tinode;
-- 
cgit v1.2.3


From 947d0496cf3e12ebfa70b3eaf561c25403247ce9 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Thu, 11 Sep 2008 01:31:48 -0700
Subject: generic: make PFN_PHYS explicitly return phys_addr_t

PFN_PHYS, as its name suggests, turns a pfn into a physical address.
However, it is a macro which just operates on its argument without
modifying its type.  pfns are typed unsigned long, but an unsigned
long may not be long enough to hold a physical address (32-bit systems
with more than 32 bits of physical address).

Make sure we cast to phys_addr_t to return a complete result.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/pfn.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pfn.h b/include/linux/pfn.h
index bb01f8b92b5..7646637221f 100644
--- a/include/linux/pfn.h
+++ b/include/linux/pfn.h
@@ -1,9 +1,13 @@
 #ifndef _LINUX_PFN_H_
 #define _LINUX_PFN_H_
 
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+#endif
+
 #define PFN_ALIGN(x)	(((unsigned long)(x) + (PAGE_SIZE - 1)) & PAGE_MASK)
 #define PFN_UP(x)	(((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
 #define PFN_DOWN(x)	((x) >> PAGE_SHIFT)
-#define PFN_PHYS(x)	((x) << PAGE_SHIFT)
+#define PFN_PHYS(x)	((phys_addr_t)(x) << PAGE_SHIFT)
 
 #endif
-- 
cgit v1.2.3


From 8308c54d7e312f7a03e2ce2057d0837e6fe3843f Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Thu, 11 Sep 2008 01:31:50 -0700
Subject: generic: redefine resource_size_t as phys_addr_t

There's no good reason why a resource_size_t shouldn't just be a
physical address, so simply redefine it in terms of phys_addr_t.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/types.h | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/types.h b/include/linux/types.h
index 022c668496d..f24f7beb47d 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -191,18 +191,14 @@ typedef __u32 __bitwise __wsum;
 #ifdef __KERNEL__
 typedef unsigned __bitwise__ gfp_t;
 
-#ifdef CONFIG_RESOURCES_64BIT
-typedef u64 resource_size_t;
-#else
-typedef u32 resource_size_t;
-#endif
-
 #ifdef CONFIG_PHYS_ADDR_T_64BIT
 typedef u64 phys_addr_t;
 #else
 typedef u32 phys_addr_t;
 #endif
 
+typedef phys_addr_t resource_size_t;
+
 struct ustat {
 	__kernel_daddr_t	f_tfree;
 	__kernel_ino_t		f_tinode;
-- 
cgit v1.2.3


From d7cfb60c5cf904ecf1e0ae23ec178175b86f0d4a Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Fri, 19 Sep 2008 13:13:44 +0100
Subject: hrtimer: remove hrtimer_clock_base::get_softirq_time()

Peter Zijlstra noticed this 8 months ago and I just noticed
it again.

hrtimer_clock_base::get_softirq_time() is currently unused
in the entire tree. In fact, looking at the logs, it appears
as if it was never used. Remove it.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/hrtimer.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 6d93dce61cb..1b079bd29c3 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -145,7 +145,6 @@ struct hrtimer_sleeper {
  * @first:		pointer to the timer node which expires first
  * @resolution:		the resolution of the clock, in nanoseconds
  * @get_time:		function to retrieve the current time of the clock
- * @get_softirq_time:	function to retrieve the current time from the softirq
  * @softirq_time:	the time when running the hrtimer queue in the softirq
  * @offset:		offset of this clock to the monotonic base
  * @reprogram:		function to reprogram the timer event
@@ -157,7 +156,6 @@ struct hrtimer_clock_base {
 	struct rb_node		*first;
 	ktime_t			resolution;
 	ktime_t			(*get_time)(void);
-	ktime_t			(*get_softirq_time)(void);
 	ktime_t			softirq_time;
 #ifdef CONFIG_HIGH_RES_TIMERS
 	ktime_t			offset;
-- 
cgit v1.2.3


From b91c4996df56fcd201f85c392a1de7bc3f6641f5 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Fri, 19 Sep 2008 13:13:48 +0100
Subject: hrtimer: remove hrtimer_clock_base::reprogram()

hrtimer_clock_base::reprogram() also appears to never
have been used, so remove it.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/hrtimer.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 1b079bd29c3..68b0196d869 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -147,7 +147,6 @@ struct hrtimer_sleeper {
  * @get_time:		function to retrieve the current time of the clock
  * @softirq_time:	the time when running the hrtimer queue in the softirq
  * @offset:		offset of this clock to the monotonic base
- * @reprogram:		function to reprogram the timer event
  */
 struct hrtimer_clock_base {
 	struct hrtimer_cpu_base	*cpu_base;
@@ -159,9 +158,6 @@ struct hrtimer_clock_base {
 	ktime_t			softirq_time;
 #ifdef CONFIG_HIGH_RES_TIMERS
 	ktime_t			offset;
-	int			(*reprogram)(struct hrtimer *t,
-					     struct hrtimer_clock_base *b,
-					     ktime_t n);
 #endif
 };
 
-- 
cgit v1.2.3


From bb34d92f643086d546b49cef680f6f305ed84414 Mon Sep 17 00:00:00 2001
From: Frank Mayhar <fmayhar@google.com>
Date: Fri, 12 Sep 2008 09:54:39 -0700
Subject: timers: fix itimer/many thread hang, v2

This is the second resubmission of the posix timer rework patch, posted
a few days ago.

This includes the changes from the previous resubmission, which addressed
Oleg Nesterov's comments, removing the RCU stuff from the patch and
un-inlining the thread_group_cputime() function for SMP.

In addition, per Ingo Molnar it simplifies the UP code, consolidating much
of it with the SMP version and depending on lower-level SMP/UP handling to
take care of the differences.

It also cleans up some UP compile errors, moves the scheduler stats-related
macros into kernel/sched_stats.h, cleans up a merge error in
kernel/fork.c and has a few other minor fixes and cleanups as suggested
by Oleg and Ingo. Thanks for the review, guys.

Signed-off-by: Frank Mayhar <fmayhar@google.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/kernel_stat.h |   1 +
 include/linux/sched.h       | 183 ++------------------------------------------
 2 files changed, 6 insertions(+), 178 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index cf9f40a91c9..cac3750cd65 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -52,6 +52,7 @@ static inline int kstat_irqs(int irq)
 	return sum;
 }
 
+extern unsigned long long task_delta_exec(struct task_struct *);
 extern void account_user_time(struct task_struct *, cputime_t);
 extern void account_user_time_scaled(struct task_struct *, cputime_t);
 extern void account_system_time(struct task_struct *, int, cputime_t);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7ce8d4e5356..b982fb48c8f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -454,15 +454,9 @@ struct task_cputime {
  * This structure contains the version of task_cputime, above, that is
  * used for thread group CPU clock calculations.
  */
-#ifdef CONFIG_SMP
 struct thread_group_cputime {
 	struct task_cputime *totals;
 };
-#else
-struct thread_group_cputime {
-	struct task_cputime totals;
-};
-#endif
 
 /*
  * NOTE! "signal_struct" does not have it's own
@@ -2124,193 +2118,26 @@ static inline int spin_needbreak(spinlock_t *lock)
 /*
  * Thread group CPU time accounting.
  */
-#ifdef CONFIG_SMP
 
-extern int thread_group_cputime_alloc_smp(struct task_struct *);
-extern void thread_group_cputime_smp(struct task_struct *, struct task_cputime *);
+extern int thread_group_cputime_alloc(struct task_struct *);
+extern void thread_group_cputime(struct task_struct *, struct task_cputime *);
 
 static inline void thread_group_cputime_init(struct signal_struct *sig)
 {
 	sig->cputime.totals = NULL;
 }
 
-static inline int thread_group_cputime_clone_thread(struct task_struct *curr,
-						    struct task_struct *new)
+static inline int thread_group_cputime_clone_thread(struct task_struct *curr)
 {
 	if (curr->signal->cputime.totals)
 		return 0;
-	return thread_group_cputime_alloc_smp(curr);
+	return thread_group_cputime_alloc(curr);
 }
 
-static inline void thread_group_cputime_free(struct signal_struct *sig)
-{
-	free_percpu(sig->cputime.totals);
-}
-
-/**
- * thread_group_cputime - Sum the thread group time fields across all CPUs.
- *
- * This is a wrapper for the real routine, thread_group_cputime_smp().  See
- * that routine for details.
- */
-static inline void thread_group_cputime(
-	struct task_struct *tsk,
-	struct task_cputime *times)
-{
-	thread_group_cputime_smp(tsk, times);
-}
-
-/**
- * thread_group_cputime_account_user - Maintain utime for a thread group.
- *
- * @tgtimes:	Pointer to thread_group_cputime structure.
- * @cputime:	Time value by which to increment the utime field of that
- *		structure.
- *
- * If thread group time is being maintained, get the structure for the
- * running CPU and update the utime field there.
- */
-static inline void thread_group_cputime_account_user(
-	struct thread_group_cputime *tgtimes,
-	cputime_t cputime)
-{
-	if (tgtimes->totals) {
-		struct task_cputime *times;
-
-		times = per_cpu_ptr(tgtimes->totals, get_cpu());
-		times->utime = cputime_add(times->utime, cputime);
-		put_cpu_no_resched();
-	}
-}
-
-/**
- * thread_group_cputime_account_system - Maintain stime for a thread group.
- *
- * @tgtimes:	Pointer to thread_group_cputime structure.
- * @cputime:	Time value by which to increment the stime field of that
- *		structure.
- *
- * If thread group time is being maintained, get the structure for the
- * running CPU and update the stime field there.
- */
-static inline void thread_group_cputime_account_system(
-	struct thread_group_cputime *tgtimes,
-	cputime_t cputime)
-{
-	if (tgtimes->totals) {
-		struct task_cputime *times;
-
-		times = per_cpu_ptr(tgtimes->totals, get_cpu());
-		times->stime = cputime_add(times->stime, cputime);
-		put_cpu_no_resched();
-	}
-}
-
-/**
- * thread_group_cputime_account_exec_runtime - Maintain exec runtime for a
- *						thread group.
- *
- * @tgtimes:	Pointer to thread_group_cputime structure.
- * @ns:		Time value by which to increment the sum_exec_runtime field
- *		of that structure.
- *
- * If thread group time is being maintained, get the structure for the
- * running CPU and update the sum_exec_runtime field there.
- */
-static inline void thread_group_cputime_account_exec_runtime(
-	struct thread_group_cputime *tgtimes,
-	unsigned long long ns)
-{
-	if (tgtimes->totals) {
-		struct task_cputime *times;
-
-		times = per_cpu_ptr(tgtimes->totals, get_cpu());
-		times->sum_exec_runtime += ns;
-		put_cpu_no_resched();
-	}
-}
-
-#else /* CONFIG_SMP */
-
-static inline void thread_group_cputime_init(struct signal_struct *sig)
-{
-	sig->cputime.totals.utime = cputime_zero;
-	sig->cputime.totals.stime = cputime_zero;
-	sig->cputime.totals.sum_exec_runtime = 0;
-}
-
-static inline int thread_group_cputime_alloc(struct task_struct *tsk)
-{
-	return 0;
-}
 
 static inline void thread_group_cputime_free(struct signal_struct *sig)
 {
-}
-
-static inline int thread_group_cputime_clone_thread(struct task_struct *curr,
-						     struct task_struct *tsk)
-{
-	return 0;
-}
-
-static inline void thread_group_cputime(struct task_struct *tsk,
-					 struct task_cputime *cputime)
-{
-	*cputime = tsk->signal->cputime.totals;
-}
-
-static inline void thread_group_cputime_account_user(
-	struct thread_group_cputime *tgtimes,
-	cputime_t cputime)
-{
-	tgtimes->totals.utime = cputime_add(tgtimes->totals.utime, cputime);
-}
-
-static inline void thread_group_cputime_account_system(
-	struct thread_group_cputime *tgtimes,
-	cputime_t cputime)
-{
-	tgtimes->totals.stime = cputime_add(tgtimes->totals.stime, cputime);
-}
-
-static inline void thread_group_cputime_account_exec_runtime(
-	struct thread_group_cputime *tgtimes,
-	unsigned long long ns)
-{
-	tgtimes->totals.sum_exec_runtime += ns;
-}
-
-#endif /* CONFIG_SMP */
-
-static inline void account_group_user_time(struct task_struct *tsk,
-					    cputime_t cputime)
-{
-	struct signal_struct *sig;
-
-	sig = tsk->signal;
-	if (likely(sig))
-		thread_group_cputime_account_user(&sig->cputime, cputime);
-}
-
-static inline void account_group_system_time(struct task_struct *tsk,
-					      cputime_t cputime)
-{
-	struct signal_struct *sig;
-
-	sig = tsk->signal;
-	if (likely(sig))
-		thread_group_cputime_account_system(&sig->cputime, cputime);
-}
-
-static inline void account_group_exec_runtime(struct task_struct *tsk,
-					       unsigned long long ns)
-{
-	struct signal_struct *sig;
-
-	sig = tsk->signal;
-	if (likely(sig))
-		thread_group_cputime_account_exec_runtime(&sig->cputime, ns);
+	free_percpu(sig->cputime.totals);
 }
 
 /*
-- 
cgit v1.2.3


From 5a9fa73072854981a5c05eb7ba18a96d49c2804f Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Mon, 22 Sep 2008 14:42:50 -0700
Subject: posix-timers: kill ->it_sigev_signo and ->it_sigev_value

With the recent changes ->it_sigev_signo and ->it_sigev_value are only
used in sys_timer_create(), kill them.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: mingo@elte.hu
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/posix-timers.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index f9d8e9e94e9..a7c72135554 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -45,8 +45,6 @@ struct k_itimer {
 	int it_requeue_pending;		/* waiting to requeue this timer */
 #define REQUEUE_PENDING 1
 	int it_sigev_notify;		/* notify word of sigevent struct */
-	int it_sigev_signo;		/* signo word of sigevent struct */
-	sigval_t it_sigev_value;	/* value word of sigevent struct */
 	struct task_struct *it_process;	/* process to send signal to */
 	struct sigqueue *sigq;		/* signal queue entry. */
 	union {
-- 
cgit v1.2.3


From 1b02469088ac7a13d7e622b618b7410d0f1ce5ec Mon Sep 17 00:00:00 2001
From: Richard Kennedy <richard@rsk.demon.co.uk>
Date: Mon, 22 Sep 2008 14:42:43 -0700
Subject: hrtimer: reorder struct hrtimer to save 8 bytes on 64bit builds

reorder struct hrtimer to save 8 bytes on 64 bit builds when
CONFIG_TIMER_STATS selected.  (also removes 8 bytes from signal_struct)

Signed-off-by: Richard Kennedy <richard@rsk.demon.co.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 68b0196d869..8730b60c943 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -115,12 +115,12 @@ struct hrtimer {
 	enum hrtimer_restart		(*function)(struct hrtimer *);
 	struct hrtimer_clock_base	*base;
 	unsigned long			state;
-	enum hrtimer_cb_mode		cb_mode;
 	struct list_head		cb_entry;
+	enum hrtimer_cb_mode		cb_mode;
 #ifdef CONFIG_TIMER_STATS
+	int				start_pid;
 	void				*start_site;
 	char				start_comm[16];
-	int				start_pid;
 #endif
 };
 
-- 
cgit v1.2.3


From d40e944c25fb4642adb2a4c580a48218a9f3f824 Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Mon, 22 Sep 2008 14:42:44 -0700
Subject: ntp: improve adjtimex frequency rounding

Change PPM_SCALE_INV_SHIFT so that it doesn't throw away any input bits
(19 is the amount of the factor 2 in PPM_SCALE), the output frequency
can then be calculated back to its input value, as the inverse divide
produces a slightly larger value, which is then correctly rounded by the
final shift.

Reported-by: Martin Ziegler <ziegler@uni-freiburg.de>
Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Cc: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/timex.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/timex.h b/include/linux/timex.h
index c00bcdd3ae4..9007313b5b7 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -82,7 +82,7 @@
  */
 #define SHIFT_USEC 16		/* frequency offset scale (shift) */
 #define PPM_SCALE (NSEC_PER_USEC << (NTP_SCALE_SHIFT - SHIFT_USEC))
-#define PPM_SCALE_INV_SHIFT 20
+#define PPM_SCALE_INV_SHIFT 19
 #define PPM_SCALE_INV ((1ll << (PPM_SCALE_INV_SHIFT + NTP_SCALE_SHIFT)) / \
 		       PPM_SCALE + 1)
 
-- 
cgit v1.2.3


From 379daf6290814e41f14880094b7b773640df2461 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Thu, 25 Sep 2008 18:43:34 -0700
Subject: IO resources, x86: ioremap sanity check to catch mapping requests
 exceeding the BAR sizes

Go through the iomem resource tree to check if any of the ioremap()
requests span more than any slot in the iomem resource tree and do
a WARN_ON() if we hit this check.

This will raise a red-flag, if some driver is mapping more than what
is needed. And hopefully identify possible corruptions much earlier.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ioport.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index fded376b94e..01712cf1a38 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -169,6 +169,7 @@ extern struct resource * __devm_request_region(struct device *dev,
 
 extern void __devm_release_region(struct device *dev, struct resource *parent,
 				  resource_size_t start, resource_size_t n);
+extern int iomem_map_sanity_check(resource_size_t addr, unsigned long size);
 
 #endif /* __ASSEMBLY__ */
 #endif	/* _LINUX_IOPORT_H */
-- 
cgit v1.2.3


From e416de5e61e1a9b7f987804cbb67230b5f5293c6 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Tue, 23 Sep 2008 17:25:10 +0100
Subject: Export the ROM enable/disable helpers

.... so that they can be used by MTD map drivers. Lets us close #9420

Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 include/linux/pci.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index c0e14008a3c..7a4cee00c1d 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -631,6 +631,8 @@ int __must_check pci_assign_resource(struct pci_dev *dev, int i);
 int pci_select_bars(struct pci_dev *dev, unsigned long flags);
 
 /* ROM control related routines */
+int pci_enable_rom(struct pci_dev *pdev);
+void pci_disable_rom(struct pci_dev *pdev);
 void __iomem __must_check *pci_map_rom(struct pci_dev *pdev, size_t *size);
 void pci_unmap_rom(struct pci_dev *pdev, void __iomem *rom);
 size_t pci_get_rom_size(void __iomem *rom, size_t size);
-- 
cgit v1.2.3


From 7086efe1c1536f6bc160e7d60a9bfd645b91f279 Mon Sep 17 00:00:00 2001
From: Frank Mayhar <fmayhar@google.com>
Date: Fri, 12 Sep 2008 09:54:39 -0700
Subject: timers: fix itimer/many thread hang, v3

- fix UP lockup
- another set of UP/SMP cleanups and simplifications

Signed-off-by: Frank Mayhar <fmayhar@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b982fb48c8f..23d9d546454 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2134,7 +2134,6 @@ static inline int thread_group_cputime_clone_thread(struct task_struct *curr)
 	return thread_group_cputime_alloc(curr);
 }
 
-
 static inline void thread_group_cputime_free(struct signal_struct *sig)
 {
 	free_percpu(sig->cputime.totals);
-- 
cgit v1.2.3


From bbfbd8b151fe35c9a1180a7f5254c5d6b8387cc0 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Wed, 1 Oct 2008 16:13:54 +0900
Subject: sh: Move the shared INTC code out to drivers/sh/

The INTC code will be re-used across different architectures, so move
this out to drivers/sh/ and include/linux/sh_intc.h respectively.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 include/linux/sh_intc.h | 91 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 91 insertions(+)
 create mode 100644 include/linux/sh_intc.h

(limited to 'include/linux')

diff --git a/include/linux/sh_intc.h b/include/linux/sh_intc.h
new file mode 100644
index 00000000000..68e212ff9dd
--- /dev/null
+++ b/include/linux/sh_intc.h
@@ -0,0 +1,91 @@
+#ifndef __SH_INTC_H
+#define __SH_INTC_H
+
+typedef unsigned char intc_enum;
+
+struct intc_vect {
+	intc_enum enum_id;
+	unsigned short vect;
+};
+
+#define INTC_VECT(enum_id, vect) { enum_id, vect }
+#define INTC_IRQ(enum_id, irq) INTC_VECT(enum_id, irq2evt(irq))
+
+struct intc_group {
+	intc_enum enum_id;
+	intc_enum enum_ids[32];
+};
+
+#define INTC_GROUP(enum_id, ids...) { enum_id, { ids } }
+
+struct intc_mask_reg {
+	unsigned long set_reg, clr_reg, reg_width;
+	intc_enum enum_ids[32];
+#ifdef CONFIG_SMP
+	unsigned long smp;
+#endif
+};
+
+struct intc_prio_reg {
+	unsigned long set_reg, clr_reg, reg_width, field_width;
+	intc_enum enum_ids[16];
+#ifdef CONFIG_SMP
+	unsigned long smp;
+#endif
+};
+
+struct intc_sense_reg {
+	unsigned long reg, reg_width, field_width;
+	intc_enum enum_ids[16];
+};
+
+#ifdef CONFIG_SMP
+#define INTC_SMP(stride, nr) .smp = (stride) | ((nr) << 8)
+#else
+#define INTC_SMP(stride, nr)
+#endif
+
+struct intc_desc {
+	struct intc_vect *vectors;
+	unsigned int nr_vectors;
+	struct intc_group *groups;
+	unsigned int nr_groups;
+	struct intc_mask_reg *mask_regs;
+	unsigned int nr_mask_regs;
+	struct intc_prio_reg *prio_regs;
+	unsigned int nr_prio_regs;
+	struct intc_sense_reg *sense_regs;
+	unsigned int nr_sense_regs;
+	char *name;
+#if defined(CONFIG_CPU_SH3) || defined(CONFIG_CPU_SH4A)
+	struct intc_mask_reg *ack_regs;
+	unsigned int nr_ack_regs;
+#endif
+};
+
+#define _INTC_ARRAY(a) a, sizeof(a)/sizeof(*a)
+#define DECLARE_INTC_DESC(symbol, chipname, vectors, groups,		\
+	mask_regs, prio_regs, sense_regs)				\
+struct intc_desc symbol __initdata = {					\
+	_INTC_ARRAY(vectors), _INTC_ARRAY(groups),			\
+	_INTC_ARRAY(mask_regs), _INTC_ARRAY(prio_regs),			\
+	_INTC_ARRAY(sense_regs),					\
+	chipname,							\
+}
+
+#if defined(CONFIG_CPU_SH3) || defined(CONFIG_CPU_SH4A)
+#define DECLARE_INTC_DESC_ACK(symbol, chipname, vectors, groups,	\
+	mask_regs, prio_regs, sense_regs, ack_regs)			\
+struct intc_desc symbol __initdata = {					\
+	_INTC_ARRAY(vectors), _INTC_ARRAY(groups),			\
+	_INTC_ARRAY(mask_regs), _INTC_ARRAY(prio_regs),			\
+	_INTC_ARRAY(sense_regs),					\
+	chipname,							\
+	_INTC_ARRAY(ack_regs),						\
+}
+#endif
+
+void __init register_intc_controller(struct intc_desc *desc);
+int intc_set_priority(unsigned int irq, unsigned int prio);
+
+#endif /* __SH_INTC_H */
-- 
cgit v1.2.3


From 1daef0a868370c5a96d031b9202e3354bea060e6 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 27 Jul 2008 18:19:01 -0400
Subject: NFS: Clean up nfs_sb_active/nfs_sb_deactive

Instead of causing umount requests to block on server->active_wq while the
asynchronous sillyrename deletes are executing, we can use the sb->s_active
counter to obtain a reference to the super_block, and then release that
reference in nfs_async_unlink_release().

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/nfs_fs_sb.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index c9beacd16c0..4e477ae5869 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -119,7 +119,6 @@ struct nfs_server {
 	void (*destroy)(struct nfs_server *);
 
 	atomic_t active; /* Keep trace of any activity to this server */
-	wait_queue_head_t active_wq;  /* Wait for any activity to stop  */
 
 	/* mountd-related mount options */
 	struct sockaddr_storage	mountd_address;
-- 
cgit v1.2.3


From 4eec952e42314b53e48fef1f54dd89cbf9789734 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 15 Jul 2008 17:58:13 -0400
Subject: NFS: Add options for finer control of the lookup cache

Add the flag NFS_MOUNT_LOOKUP_CACHE_NONEG to turn off the caching of
negative dentries. In reality what we do is to force
nfs_lookup_revalidate() to always discard negative dentries.

Add the flag NFS_MOUNT_LOOKUP_CACHE_NONE for enforcing stricter
revalidation of dentries. It forces the revalidate code to always do a
lookup instead of just checking the cached mtime of the parent directory.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/nfs_mount.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nfs_mount.h b/include/linux/nfs_mount.h
index df7c6b7a7eb..6549a06ac16 100644
--- a/include/linux/nfs_mount.h
+++ b/include/linux/nfs_mount.h
@@ -65,4 +65,8 @@ struct nfs_mount_data {
 #define NFS_MOUNT_UNSHARED	0x8000	/* 5 */
 #define NFS_MOUNT_FLAGMASK	0xFFFF
 
+/* The following are for internal use only */
+#define NFS_MOUNT_LOOKUP_CACHE_NONEG	0x10000
+#define NFS_MOUNT_LOOKUP_CACHE_NONE	0x20000
+
 #endif
-- 
cgit v1.2.3


From 691beb13cdc88358334ef0ba867c080a247a760f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 5 Oct 2008 14:48:22 -0400
Subject: NFS: Allow concurrent inode revalidation

Currently, if two processes are both trying to revalidate metadata for the
same inode, they will find themselves being serialised. There is no good
justification for this now that we have improved our ability to detect
stale attribute data, so we should remove that serialisation.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/nfs_fs.h | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 78a5922a2f1..ca563ee13e3 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -200,11 +200,10 @@ struct nfs_inode {
 /*
  * Bit offsets in flags field
  */
-#define NFS_INO_REVALIDATING	(0)		/* revalidating attrs */
-#define NFS_INO_ADVISE_RDPLUS	(1)		/* advise readdirplus */
-#define NFS_INO_STALE		(2)		/* possible stale inode */
-#define NFS_INO_ACL_LRU_SET	(3)		/* Inode is on the LRU list */
-#define NFS_INO_MOUNTPOINT	(4)		/* inode is remote mountpoint */
+#define NFS_INO_ADVISE_RDPLUS	(0)		/* advise readdirplus */
+#define NFS_INO_STALE		(1)		/* possible stale inode */
+#define NFS_INO_ACL_LRU_SET	(2)		/* Inode is on the LRU list */
+#define NFS_INO_MOUNTPOINT	(3)		/* inode is remote mountpoint */
 
 static inline struct nfs_inode *NFS_I(const struct inode *inode)
 {
-- 
cgit v1.2.3


From 9fa8d66f1e55bf197568c8c689043c2aad1ffc97 Mon Sep 17 00:00:00 2001
From: Richard Kennedy <richard@rsk.demon.co.uk>
Date: Tue, 26 Aug 2008 16:23:20 +0100
Subject: NFS: remove 8 bytes of padding from struct nfs_fattr on 64 bit builds

remove 8 bytes of padding from struct nfs_fattr on 64 bit builds

This also removes padding from several nfs structures, including
16 bytes from nfs4_opendata, nfs4_createdata, nfs3_createdata
& 8 bytes from nfs_read_data, nfs_write_data, nfs_removeres, nfs4_closedata

This also reduces the reported stack usage of many nfs functions (30+).

Signed-off-by: Richard Kennedy <richard@rsk.demon.co.uk>
----

This patch is against the latest git 2.6.27-rc4.
I've built & run this on my AMD64 desktop, & successfully run _simple_
tests with a 64 bit client => 32 bit server & 32 bit client to 64 bit
server.

On fedora with gcc (GCC) 4.3.0 20080428 (Red Hat 4.3.0-8) checkstack
reports 33 functions with reduced stack usage.
e.g.
__nfs_revalidate_inode [nfs] 216 => 200
_nfs4_proc_access [nfs] 304 => 288
_nfs4_proc_link [nfs] 536 => 504
_nfs4_proc_remove [nfs] 304 => 288
_nfs4_proc_rename [nfs] 584 => 552
nfs3_proc_access [nfs] 272 => 256
nfs3_proc_getacl [nfs] 384 => 368
nfs3_proc_link [nfs] 496 => 464
etc
I can supply the complete list if anyone is interested.

regards
Richard
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/nfs_xdr.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 8c77c11224d..9cabbb3a9e6 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -36,6 +36,7 @@ struct nfs_fattr {
 	__u32			nlink;
 	__u32			uid;
 	__u32			gid;
+	dev_t			rdev;
 	__u64			size;
 	union {
 		struct {
@@ -46,7 +47,6 @@ struct nfs_fattr {
 			__u64	used;
 		} nfs3;
 	} du;
-	dev_t			rdev;
 	struct nfs_fsid		fsid;
 	__u64			fileid;
 	struct timespec		atime;
-- 
cgit v1.2.3


From d1ce02e1689dff9d413138f60a79b4e3affb4708 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 25 Sep 2008 11:57:12 -0400
Subject: NFS: SETCLIENTID truncates client ID and netid

The sc_name field is currently 56 bytes long.  This is not large enough
to hold a pair of IPv6 addresses, the authentication type, the protocol
name, and a uniquifier number.  The maximum possible size of the name
string using IPv6 addresses is just under 110 bytes, so I increased the
size of the sc_name field to accommodate this maximum.

In addition, the strings in the nfs4_setclientid structure are
constructed with scnprintf(), which wants to terminate its output with
'\0'.  The sc_netid field was large enough only for a three byte netid
string and a '\0' so inet6 netids were being truncated.  Perhaps we
don't need the overhead of scnprintf() to do a simple string copy, but
I fixed this by increasing the size of the buffer by one byte.

Since all three of the string buffers in nfs4_setclientid are
constructed with scnprintf(), I increased the size of all three by one
byte to document the requirement, although I don't think either the
universal address field or the name field will be so small that these
strings get truncated in this way.

The size of the Linux client's client ID on the wire will be larger
than before.  RFC 3530 suggests the size limit for client IDs is 1024,
and we are still well below that.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/nfs_xdr.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 9cabbb3a9e6..f6e95bfad5d 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -672,16 +672,16 @@ struct nfs4_rename_res {
 	struct nfs_fattr *		new_fattr;
 };
 
-#define NFS4_SETCLIENTID_NAMELEN	(56)
+#define NFS4_SETCLIENTID_NAMELEN	(128)
 struct nfs4_setclientid {
 	const nfs4_verifier *		sc_verifier;
 	unsigned int			sc_name_len;
-	char				sc_name[NFS4_SETCLIENTID_NAMELEN];
+	char				sc_name[NFS4_SETCLIENTID_NAMELEN + 1];
 	u32				sc_prog;
 	unsigned int			sc_netid_len;
-	char				sc_netid[RPCBIND_MAXNETIDLEN];
+	char				sc_netid[RPCBIND_MAXNETIDLEN + 1];
 	unsigned int			sc_uaddr_len;
-	char				sc_uaddr[RPCBIND_MAXUADDRLEN];
+	char				sc_uaddr[RPCBIND_MAXUADDRLEN + 1];
 	u32				sc_cb_ident;
 };
 
-- 
cgit v1.2.3


From 19d771f3caccaf66ce2fb539319222139e5b4e88 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 8 Oct 2008 13:54:52 -0400
Subject: NFS: Save padding bytes in struct nfs4_setclientid

Peter Staubach suggested reducing NFS4_SETCLIENTID_NAMELEN by one byte so
as to avoid 7 bytes of unnecessary padding.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/nfs_xdr.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index f6e95bfad5d..6ee6ae3f095 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -672,7 +672,7 @@ struct nfs4_rename_res {
 	struct nfs_fattr *		new_fattr;
 };
 
-#define NFS4_SETCLIENTID_NAMELEN	(128)
+#define NFS4_SETCLIENTID_NAMELEN	(127)
 struct nfs4_setclientid {
 	const nfs4_verifier *		sc_verifier;
 	unsigned int			sc_name_len;
-- 
cgit v1.2.3


From fe9053b30bb48b99f7b45541249f5cfe96bdf7f7 Mon Sep 17 00:00:00 2001
From: Tom Talpey <talpey@netapp.com>
Date: Thu, 9 Oct 2008 14:59:59 -0400
Subject: RPC/RDMA: add data types and new FRMR memory registration enum.

Internal RPC/RDMA structure updates in preparation for FRMR support.

Signed-off-by: Tom Talpey <talpey@netapp.com>
Acked-by: Tom Tucker <tom@opengridcomputing.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/xprtrdma.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h
index 4de56b1d372..55a5d92ca1e 100644
--- a/include/linux/sunrpc/xprtrdma.h
+++ b/include/linux/sunrpc/xprtrdma.h
@@ -78,6 +78,7 @@ enum rpcrdma_memreg {
 	RPCRDMA_MEMWINDOWS,
 	RPCRDMA_MEMWINDOWS_ASYNC,
 	RPCRDMA_MTHCAFMR,
+	RPCRDMA_FRMR,
 	RPCRDMA_ALLPHYSICAL,
 	RPCRDMA_LAST
 };
-- 
cgit v1.2.3


From 5675add36e76b9487e7f9e689f854cb8d6afd9b4 Mon Sep 17 00:00:00 2001
From: Tom Talpey <talpey@netapp.com>
Date: Thu, 9 Oct 2008 15:01:41 -0400
Subject: RPC/RDMA: harden connection logic against missing/late rdma_cm
 upcalls.

Add defensive timeouts to wait_for_completion() calls in RDMA
address resolution, and make them interruptible. Fix the timeout
units to milliseconds (formerly jiffies) and move to private header.

Signed-off-by: Tom Talpey <talpey@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/xprtrdma.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h
index 55a5d92ca1e..54a379c9e8e 100644
--- a/include/linux/sunrpc/xprtrdma.h
+++ b/include/linux/sunrpc/xprtrdma.h
@@ -66,9 +66,6 @@
 
 #define RPCRDMA_INLINE_PAD_THRESH  (512)/* payload threshold to pad (bytes) */
 
-#define RDMA_RESOLVE_TIMEOUT	(5*HZ)	/* TBD 5 seconds */
-#define RDMA_CONNECT_RETRY_MAX	(2)	/* retries if no listener backlog */
-
 /* memory registration strategies */
 #define RPCRDMA_PERSISTENT_REGISTRATION (1)
 
-- 
cgit v1.2.3


From 061b1bd394ca8628b7c24eb4658ba3535da4249a Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Wed, 24 Sep 2008 14:46:44 -0700
Subject: Staging: add TAINT_CRAP for all drivers/staging code

We need to add a flag for all code that is in the drivers/staging/
directory to prevent all other kernel developers from worrying about
issues here, and to notify users that the drivers might not be as good
as they are normally used to.

Based on code from Andreas Gruenbacher and Jeff Mahoney to provide a
TAINT flag for the support level of a kernel module in the Novell
enterprise kernel release.

This is the kernel portion of this feature, the ability for the flag to
be set needs to be done in the build process and will happen in a
follow-up patch.

Cc: Andreas Gruenbacher <agruen@suse.de>
Cc: Jeff Mahoney <jeffm@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/kernel.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 2651f805ba6..b36805cb95f 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -260,6 +260,7 @@ extern enum system_states {
 #define TAINT_DIE			(1<<7)
 #define TAINT_OVERRIDDEN_ACPI_TABLE	(1<<8)
 #define TAINT_WARN			(1<<9)
+#define TAINT_CRAP			(1<<10)
 
 extern void dump_stack(void) __cold;
 
-- 
cgit v1.2.3


From f9da8d157b60d8c5bfc5a21fc50538fdb754a65b Mon Sep 17 00:00:00 2001
From: Atsushi Nemoto <anemo@mba.ocn.ne.jp>
Date: Fri, 10 Oct 2008 23:14:14 -0400
Subject: Input: move map_to_7segment.h to include/linux

The map_to_7segment.h provides generic 7segment LED mappings and is
designed to be used by other drivers.  Moving it to common area will
make it more usable.  Also exporting it to userspace will help users
of sysfs interface.

Signed-off-by: Atsushi Nemoto <anemo@mba.ocn.ne.jp>
Acked-by: Henk Vergonet <henk.vergonet@gmail.com>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 include/linux/Kbuild            |   1 +
 include/linux/map_to_7segment.h | 187 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 188 insertions(+)
 create mode 100644 include/linux/map_to_7segment.h

(limited to 'include/linux')

diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 4c4142c5aa6..0b136c5990c 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -106,6 +106,7 @@ header-y += keyctl.h
 header-y += limits.h
 header-y += magic.h
 header-y += major.h
+header-y += map_to_7segment.h
 header-y += matroxfb.h
 header-y += meye.h
 header-y += minix_fs.h
diff --git a/include/linux/map_to_7segment.h b/include/linux/map_to_7segment.h
new file mode 100644
index 00000000000..7df8432c440
--- /dev/null
+++ b/include/linux/map_to_7segment.h
@@ -0,0 +1,187 @@
+/*
+ * Copyright (c) 2005 Henk Vergonet <Henk.Vergonet@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef MAP_TO_7SEGMENT_H
+#define MAP_TO_7SEGMENT_H
+
+/* This file provides translation primitives and tables for the conversion
+ * of (ASCII) characters to a 7-segments notation.
+ *
+ * The 7 segment's wikipedia notation below is used as standard.
+ * See: http://en.wikipedia.org/wiki/Seven_segment_display
+ *
+ * Notation:	+-a-+
+ *		f   b
+ *		+-g-+
+ *		e   c
+ *		+-d-+
+ *
+ * Usage:
+ *
+ *   Register a map variable, and fill it with a character set:
+ *	static SEG7_DEFAULT_MAP(map_seg7);
+ *
+ *
+ *   Then use for conversion:
+ *	seg7 = map_to_seg7(&map_seg7, some_char);
+ *	...
+ *
+ * In device drivers it is recommended, if required, to make the char map
+ * accessible via the sysfs interface using the following scheme:
+ *
+ * static ssize_t show_map(struct device *dev, char *buf) {
+ *	memcpy(buf, &map_seg7, sizeof(map_seg7));
+ *	return sizeof(map_seg7);
+ * }
+ * static ssize_t store_map(struct device *dev, const char *buf, size_t cnt) {
+ *	if(cnt != sizeof(map_seg7))
+ *		return -EINVAL;
+ *	memcpy(&map_seg7, buf, cnt);
+ *	return cnt;
+ * }
+ * static DEVICE_ATTR(map_seg7, PERMS_RW, show_map, store_map);
+ *
+ * History:
+ * 2005-05-31	RFC linux-kernel@vger.kernel.org
+ */
+#include <linux/errno.h>
+
+
+#define BIT_SEG7_A		0
+#define BIT_SEG7_B		1
+#define BIT_SEG7_C		2
+#define BIT_SEG7_D		3
+#define BIT_SEG7_E		4
+#define BIT_SEG7_F		5
+#define BIT_SEG7_G		6
+#define BIT_SEG7_RESERVED	7
+
+struct seg7_conversion_map {
+	unsigned char	table[128];
+};
+
+static inline int map_to_seg7(struct seg7_conversion_map *map, int c)
+{
+	return c >= 0 && c < sizeof(map->table) ? map->table[c] : -EINVAL;
+}
+
+#define SEG7_CONVERSION_MAP(_name, _map)	\
+	struct seg7_conversion_map _name = { .table = { _map } }
+
+/*
+ * It is recommended to use a facility that allows user space to redefine
+ * custom character sets for LCD devices. Please use a sysfs interface
+ * as described above.
+ */
+#define MAP_TO_SEG7_SYSFS_FILE	"map_seg7"
+
+/*******************************************************************************
+ * ASCII conversion table
+ ******************************************************************************/
+
+#define _SEG7(l,a,b,c,d,e,f,g)	\
+      (	a<<BIT_SEG7_A |	b<<BIT_SEG7_B |	c<<BIT_SEG7_C |	d<<BIT_SEG7_D |	\
+	e<<BIT_SEG7_E |	f<<BIT_SEG7_F |	g<<BIT_SEG7_G )
+
+#define _MAP_0_32_ASCII_SEG7_NON_PRINTABLE	\
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+
+#define _MAP_33_47_ASCII_SEG7_SYMBOL		\
+ _SEG7('!',0,0,0,0,1,1,0), _SEG7('"',0,1,0,0,0,1,0), _SEG7('#',0,1,1,0,1,1,0),\
+ _SEG7('$',1,0,1,1,0,1,1), _SEG7('%',0,0,1,0,0,1,0), _SEG7('&',1,0,1,1,1,1,1),\
+ _SEG7('\'',0,0,0,0,0,1,0),_SEG7('(',1,0,0,1,1,1,0), _SEG7(')',1,1,1,1,0,0,0),\
+ _SEG7('*',0,1,1,0,1,1,1), _SEG7('+',0,1,1,0,0,0,1), _SEG7(',',0,0,0,0,1,0,0),\
+ _SEG7('-',0,0,0,0,0,0,1), _SEG7('.',0,0,0,0,1,0,0), _SEG7('/',0,1,0,0,1,0,1),
+
+#define _MAP_48_57_ASCII_SEG7_NUMERIC		\
+ _SEG7('0',1,1,1,1,1,1,0), _SEG7('1',0,1,1,0,0,0,0), _SEG7('2',1,1,0,1,1,0,1),\
+ _SEG7('3',1,1,1,1,0,0,1), _SEG7('4',0,1,1,0,0,1,1), _SEG7('5',1,0,1,1,0,1,1),\
+ _SEG7('6',1,0,1,1,1,1,1), _SEG7('7',1,1,1,0,0,0,0), _SEG7('8',1,1,1,1,1,1,1),\
+ _SEG7('9',1,1,1,1,0,1,1),
+
+#define _MAP_58_64_ASCII_SEG7_SYMBOL		\
+ _SEG7(':',0,0,0,1,0,0,1), _SEG7(';',0,0,0,1,0,0,1), _SEG7('<',1,0,0,0,0,1,1),\
+ _SEG7('=',0,0,0,1,0,0,1), _SEG7('>',1,1,0,0,0,0,1), _SEG7('?',1,1,1,0,0,1,0),\
+ _SEG7('@',1,1,0,1,1,1,1),
+
+#define _MAP_65_90_ASCII_SEG7_ALPHA_UPPR	\
+ _SEG7('A',1,1,1,0,1,1,1), _SEG7('B',1,1,1,1,1,1,1), _SEG7('C',1,0,0,1,1,1,0),\
+ _SEG7('D',1,1,1,1,1,1,0), _SEG7('E',1,0,0,1,1,1,1), _SEG7('F',1,0,0,0,1,1,1),\
+ _SEG7('G',1,1,1,1,0,1,1), _SEG7('H',0,1,1,0,1,1,1), _SEG7('I',0,1,1,0,0,0,0),\
+ _SEG7('J',0,1,1,1,0,0,0), _SEG7('K',0,1,1,0,1,1,1), _SEG7('L',0,0,0,1,1,1,0),\
+ _SEG7('M',1,1,1,0,1,1,0), _SEG7('N',1,1,1,0,1,1,0), _SEG7('O',1,1,1,1,1,1,0),\
+ _SEG7('P',1,1,0,0,1,1,1), _SEG7('Q',1,1,1,1,1,1,0), _SEG7('R',1,1,1,0,1,1,1),\
+ _SEG7('S',1,0,1,1,0,1,1), _SEG7('T',0,0,0,1,1,1,1), _SEG7('U',0,1,1,1,1,1,0),\
+ _SEG7('V',0,1,1,1,1,1,0), _SEG7('W',0,1,1,1,1,1,1), _SEG7('X',0,1,1,0,1,1,1),\
+ _SEG7('Y',0,1,1,0,0,1,1), _SEG7('Z',1,1,0,1,1,0,1),
+
+#define _MAP_91_96_ASCII_SEG7_SYMBOL		\
+ _SEG7('[',1,0,0,1,1,1,0), _SEG7('\\',0,0,1,0,0,1,1),_SEG7(']',1,1,1,1,0,0,0),\
+ _SEG7('^',1,1,0,0,0,1,0), _SEG7('_',0,0,0,1,0,0,0), _SEG7('`',0,1,0,0,0,0,0),
+
+#define _MAP_97_122_ASCII_SEG7_ALPHA_LOWER	\
+ _SEG7('A',1,1,1,0,1,1,1), _SEG7('b',0,0,1,1,1,1,1), _SEG7('c',0,0,0,1,1,0,1),\
+ _SEG7('d',0,1,1,1,1,0,1), _SEG7('E',1,0,0,1,1,1,1), _SEG7('F',1,0,0,0,1,1,1),\
+ _SEG7('G',1,1,1,1,0,1,1), _SEG7('h',0,0,1,0,1,1,1), _SEG7('i',0,0,1,0,0,0,0),\
+ _SEG7('j',0,0,1,1,0,0,0), _SEG7('k',0,0,1,0,1,1,1), _SEG7('L',0,0,0,1,1,1,0),\
+ _SEG7('M',1,1,1,0,1,1,0), _SEG7('n',0,0,1,0,1,0,1), _SEG7('o',0,0,1,1,1,0,1),\
+ _SEG7('P',1,1,0,0,1,1,1), _SEG7('q',1,1,1,0,0,1,1), _SEG7('r',0,0,0,0,1,0,1),\
+ _SEG7('S',1,0,1,1,0,1,1), _SEG7('T',0,0,0,1,1,1,1), _SEG7('u',0,0,1,1,1,0,0),\
+ _SEG7('v',0,0,1,1,1,0,0), _SEG7('W',0,1,1,1,1,1,1), _SEG7('X',0,1,1,0,1,1,1),\
+ _SEG7('y',0,1,1,1,0,1,1), _SEG7('Z',1,1,0,1,1,0,1),
+
+#define _MAP_123_126_ASCII_SEG7_SYMBOL		\
+ _SEG7('{',1,0,0,1,1,1,0), _SEG7('|',0,0,0,0,1,1,0), _SEG7('}',1,1,1,1,0,0,0),\
+ _SEG7('~',1,0,0,0,0,0,0),
+
+/* Maps */
+
+/* This set tries to map as close as possible to the visible characteristics
+ * of the ASCII symbol, lowercase and uppercase letters may differ in
+ * presentation on the display.
+ */
+#define MAP_ASCII7SEG_ALPHANUM			\
+	_MAP_0_32_ASCII_SEG7_NON_PRINTABLE	\
+	_MAP_33_47_ASCII_SEG7_SYMBOL		\
+	_MAP_48_57_ASCII_SEG7_NUMERIC		\
+	_MAP_58_64_ASCII_SEG7_SYMBOL		\
+	_MAP_65_90_ASCII_SEG7_ALPHA_UPPR	\
+	_MAP_91_96_ASCII_SEG7_SYMBOL		\
+	_MAP_97_122_ASCII_SEG7_ALPHA_LOWER	\
+	_MAP_123_126_ASCII_SEG7_SYMBOL
+
+/* This set tries to map as close as possible to the symbolic characteristics
+ * of the ASCII character for maximum discrimination.
+ * For now this means all alpha chars are in lower case representations.
+ * (This for example facilitates the use of hex numbers with uppercase input.)
+ */
+#define MAP_ASCII7SEG_ALPHANUM_LC			\
+	_MAP_0_32_ASCII_SEG7_NON_PRINTABLE	\
+	_MAP_33_47_ASCII_SEG7_SYMBOL		\
+	_MAP_48_57_ASCII_SEG7_NUMERIC		\
+	_MAP_58_64_ASCII_SEG7_SYMBOL		\
+	_MAP_97_122_ASCII_SEG7_ALPHA_LOWER	\
+	_MAP_91_96_ASCII_SEG7_SYMBOL		\
+	_MAP_97_122_ASCII_SEG7_ALPHA_LOWER	\
+	_MAP_123_126_ASCII_SEG7_SYMBOL
+
+#define SEG7_DEFAULT_MAP(_name)		\
+	SEG7_CONVERSION_MAP(_name,MAP_ASCII7SEG_ALPHANUM)
+
+#endif	/* MAP_TO_7SEGMENT_H */
+
-- 
cgit v1.2.3


From 6283815d1853b7daf31dc4adb83e5c1dc9568251 Mon Sep 17 00:00:00 2001
From: Andre Noll <maan@systemlinux.org>
Date: Mon, 13 Oct 2008 11:55:12 +1100
Subject: md: linear: Represent dev_info->size and dev_info->offset in sectors.

Rename them to num_sectors and start_sector which is more descriptive.

Signed-off-by: Andre Noll <maan@systemlinux.org>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 include/linux/raid/linear.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/raid/linear.h b/include/linux/raid/linear.h
index 7e375111d00..87090e98529 100644
--- a/include/linux/raid/linear.h
+++ b/include/linux/raid/linear.h
@@ -5,8 +5,8 @@
 
 struct dev_info {
 	mdk_rdev_t	*rdev;
-	sector_t	size;
-	sector_t	offset;
+	sector_t	num_sectors;
+	sector_t	start_sector;
 };
 
 typedef struct dev_info dev_info_t;
-- 
cgit v1.2.3


From ab5bd5cbc8d4b868378d062eed3d4240930fbb86 Mon Sep 17 00:00:00 2001
From: Andre Noll <maan@systemlinux.org>
Date: Mon, 13 Oct 2008 11:55:12 +1100
Subject: md: Convert remaining 1k representations in linear.c to sectors.

This patch renames hash_spacing and preshift to  spacing and
sector_shift respectively with the following change of semantics:

Case 1: (sizeof(sector_t) <= sizeof(u32)).
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

In this case, we have sector_shift = preshift = 0 and spacing =
2 * hash_spacing.

Hence, the index for the hash table which is computed by the new code
in which_dev() as sector / spacing equals the old value which was
(sector/2) / hash_spacing.

Note also that the value of nb_zone stays the same because both sz
and base double.

Case 2: (sizeof(sector_t) > sizeof(u32)).
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

(aka the shifting dance case). Here we have sector_shift = preshift +
1 and

spacing = 2 * hash_spacing

during the computation of nb_zone and curr_sector, but

spacing = hash_spacing

in which_dev() because in the last hunk of the patch for linear.c we
shift down conf->spacing (= 2 * hash_spacing) by one more bit than
in the old code.

Hence in the computation of nb_zone, sz and base have the same value
as before, so nb_zone is not affected. Also curr_sector in the next
hunk stays the same.

In which_dev() the hash table index is computed as

(sector >> sector_shift) / spacing

In view of sector_shift = preshift + 1 and spacing = hash_spacing,
this equals

((sector/2) >> preshift) / hash_spacing

which is the value computed by the old code.

Signed-off-by: Andre Noll <maan@systemlinux.org>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 include/linux/raid/linear.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/raid/linear.h b/include/linux/raid/linear.h
index 87090e98529..f38b9c586af 100644
--- a/include/linux/raid/linear.h
+++ b/include/linux/raid/linear.h
@@ -15,9 +15,11 @@ struct linear_private_data
 {
 	struct linear_private_data *prev;	/* earlier version */
 	dev_info_t		**hash_table;
-	sector_t		hash_spacing;
+	sector_t		spacing;
 	sector_t		array_sectors;
-	int			preshift; /* shift before dividing by hash_spacing */
+	int			sector_shift;	/* shift before dividing
+						 * by spacing
+						 */
 	dev_info_t		disks[0];
 };
 
-- 
cgit v1.2.3


From fb4d8c76e56a887b9eee99fbc55fe82b18625d30 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 13 Oct 2008 11:55:12 +1100
Subject: md: Remove unnecessary #includes, #defines, and function
 declarations.

A lot of cruft has gathered over the years.  Time to remove it.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 include/linux/raid/md.h | 22 ----------------------
 1 file changed, 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h
index dc0e3fcb9f2..bb727fa1ce7 100644
--- a/include/linux/raid/md.h
+++ b/include/linux/raid/md.h
@@ -19,27 +19,7 @@
 #define _MD_H
 
 #include <linux/blkdev.h>
-#include <linux/major.h>
-#include <linux/ioctl.h>
-#include <linux/types.h>
-#include <linux/bitops.h>
-#include <linux/module.h>
-#include <linux/hdreg.h>
-#include <linux/proc_fs.h>
 #include <linux/seq_file.h>
-#include <linux/smp_lock.h>
-#include <linux/delay.h>
-#include <net/checksum.h>
-#include <linux/random.h>
-#include <linux/kernel_stat.h>
-#include <asm/io.h>
-#include <linux/completion.h>
-#include <linux/mempool.h>
-#include <linux/list.h>
-#include <linux/reboot.h>
-#include <linux/vmalloc.h>
-#include <linux/blkpg.h>
-#include <linux/bio.h>
 
 /*
  * 'md_p.h' holds the 'physical' layout of RAID devices
@@ -83,10 +63,8 @@ extern void md_wakeup_thread(mdk_thread_t *thread);
 extern void md_check_recovery(mddev_t *mddev);
 extern void md_write_start(mddev_t *mddev, struct bio *bi);
 extern void md_write_end(mddev_t *mddev);
-extern void md_handle_safemode(mddev_t *mddev);
 extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
 extern void md_error (mddev_t *mddev, mdk_rdev_t *rdev);
-extern void md_unplug_mddev(mddev_t *mddev);
 
 extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 			   sector_t sector, int size, struct page *page);
-- 
cgit v1.2.3


From d710e13812600037a723a673dc5c96a071de98d3 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 13 Oct 2008 11:55:12 +1100
Subject: md: remove space after function name in declaration and call.

Having
   function (args)
instead of
   function(args)

makes it harder to search for calls of particular functions.
So remove all those spaces.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 include/linux/raid/md.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h
index bb727fa1ce7..82bea14cae1 100644
--- a/include/linux/raid/md.h
+++ b/include/linux/raid/md.h
@@ -54,17 +54,17 @@
 
 extern int mdp_major;
 
-extern int register_md_personality (struct mdk_personality *p);
-extern int unregister_md_personality (struct mdk_personality *p);
-extern mdk_thread_t * md_register_thread (void (*run) (mddev_t *mddev),
+extern int register_md_personality(struct mdk_personality *p);
+extern int unregister_md_personality(struct mdk_personality *p);
+extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev),
 				mddev_t *mddev, const char *name);
-extern void md_unregister_thread (mdk_thread_t *thread);
+extern void md_unregister_thread(mdk_thread_t *thread);
 extern void md_wakeup_thread(mdk_thread_t *thread);
 extern void md_check_recovery(mddev_t *mddev);
 extern void md_write_start(mddev_t *mddev, struct bio *bi);
 extern void md_write_end(mddev_t *mddev);
 extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
-extern void md_error (mddev_t *mddev, mdk_rdev_t *rdev);
+extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);
 
 extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 			   sector_t sector, int size, struct page *page);
-- 
cgit v1.2.3


From 6000a368cd8e6da1caf101411bdb494cd6fb8b09 Mon Sep 17 00:00:00 2001
From: Mike Christie <michaelc@cs.wisc.edu>
Date: Tue, 19 Aug 2008 18:45:30 -0500
Subject: [SCSI] block: separate failfast into multiple bits.

Multipath is best at handling transport errors. If it gets a device
error then there is not much the multipath layer can do. It will just
access the same device but from a different path.

This patch breaks up failfast into device, transport and driver errors.
The multipath layers (md and dm multipath) only ask the lower levels to
fast fail transport errors. The user of failfast, read ahead, will ask
to fast fail on all errors.

Note that blk_noretry_request will return true if any failfast bit
is set. This allows drivers that do not support the multipath failfast
bits to continue to fail on any failfast error like before. Drivers
like scsi that are able to fail fast specific errors can check
for the specific fail fast type. In the next patch I will convert
scsi.

Signed-off-by: Mike Christie <michaelc@cs.wisc.edu>
Cc: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
---
 include/linux/bio.h    | 26 +++++++++++++++++---------
 include/linux/blkdev.h | 15 ++++++++++++---
 2 files changed, 29 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index ff5b4cf9e2d..1beda208cbf 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -129,25 +129,30 @@ struct bio {
  * bit 2 -- barrier
  *	Insert a serialization point in the IO queue, forcing previously
  *	submitted IO to be completed before this oen is issued.
- * bit 3 -- fail fast, don't want low level driver retries
- * bit 4 -- synchronous I/O hint: the block layer will unplug immediately
+ * bit 3 -- synchronous I/O hint: the block layer will unplug immediately
  *	Note that this does NOT indicate that the IO itself is sync, just
  *	that the block layer will not postpone issue of this IO by plugging.
- * bit 5 -- metadata request
+ * bit 4 -- metadata request
  *	Used for tracing to differentiate metadata and data IO. May also
  *	get some preferential treatment in the IO scheduler
- * bit 6 -- discard sectors
+ * bit 5 -- discard sectors
  *	Informs the lower level device that this range of sectors is no longer
  *	used by the file system and may thus be freed by the device. Used
  *	for flash based storage.
+ * bit 6 -- fail fast device errors
+ * bit 7 -- fail fast transport errors
+ * bit 8 -- fail fast driver errors
+ *	Don't want driver retries for any fast fail whatever the reason.
  */
 #define BIO_RW		0	/* Must match RW in req flags (blkdev.h) */
 #define BIO_RW_AHEAD	1	/* Must match FAILFAST in req flags */
 #define BIO_RW_BARRIER	2
-#define BIO_RW_FAILFAST	3
-#define BIO_RW_SYNC	4
-#define BIO_RW_META	5
-#define BIO_RW_DISCARD	6
+#define BIO_RW_SYNC	3
+#define BIO_RW_META	4
+#define BIO_RW_DISCARD	5
+#define BIO_RW_FAILFAST_DEV		6
+#define BIO_RW_FAILFAST_TRANSPORT	7
+#define BIO_RW_FAILFAST_DRIVER		8
 
 /*
  * upper 16 bits of bi_rw define the io priority of this bio
@@ -174,7 +179,10 @@ struct bio {
 #define bio_sectors(bio)	((bio)->bi_size >> 9)
 #define bio_barrier(bio)	((bio)->bi_rw & (1 << BIO_RW_BARRIER))
 #define bio_sync(bio)		((bio)->bi_rw & (1 << BIO_RW_SYNC))
-#define bio_failfast(bio)	((bio)->bi_rw & (1 << BIO_RW_FAILFAST))
+#define bio_failfast_dev(bio)	((bio)->bi_rw &	(1 << BIO_RW_FAILFAST_DEV))
+#define bio_failfast_transport(bio)	\
+	((bio)->bi_rw & (1 << BIO_RW_FAILFAST_TRANSPORT))
+#define bio_failfast_driver(bio) ((bio)->bi_rw & (1 << BIO_RW_FAILFAST_DRIVER))
 #define bio_rw_ahead(bio)	((bio)->bi_rw & (1 << BIO_RW_AHEAD))
 #define bio_rw_meta(bio)	((bio)->bi_rw & (1 << BIO_RW_META))
 #define bio_discard(bio)	((bio)->bi_rw & (1 << BIO_RW_DISCARD))
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index a92d9e4ea96..f3491d22526 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -87,7 +87,9 @@ enum {
  */
 enum rq_flag_bits {
 	__REQ_RW,		/* not set, read. set, write */
-	__REQ_FAILFAST,		/* no low level driver retries */
+	__REQ_FAILFAST_DEV,	/* no driver retries of device errors */
+	__REQ_FAILFAST_TRANSPORT, /* no driver retries of transport errors */
+	__REQ_FAILFAST_DRIVER,	/* no driver retries of driver errors */
 	__REQ_DISCARD,		/* request to discard sectors */
 	__REQ_SORTED,		/* elevator knows about this request */
 	__REQ_SOFTBARRIER,	/* may not be passed by ioscheduler */
@@ -111,8 +113,10 @@ enum rq_flag_bits {
 };
 
 #define REQ_RW		(1 << __REQ_RW)
+#define REQ_FAILFAST_DEV	(1 << __REQ_FAILFAST_DEV)
+#define REQ_FAILFAST_TRANSPORT	(1 << __REQ_FAILFAST_TRANSPORT)
+#define REQ_FAILFAST_DRIVER	(1 << __REQ_FAILFAST_DRIVER)
 #define REQ_DISCARD	(1 << __REQ_DISCARD)
-#define REQ_FAILFAST	(1 << __REQ_FAILFAST)
 #define REQ_SORTED	(1 << __REQ_SORTED)
 #define REQ_SOFTBARRIER	(1 << __REQ_SOFTBARRIER)
 #define REQ_HARDBARRIER	(1 << __REQ_HARDBARRIER)
@@ -560,7 +564,12 @@ enum {
 #define blk_special_request(rq)	((rq)->cmd_type == REQ_TYPE_SPECIAL)
 #define blk_sense_request(rq)	((rq)->cmd_type == REQ_TYPE_SENSE)
 
-#define blk_noretry_request(rq)	((rq)->cmd_flags & REQ_FAILFAST)
+#define blk_failfast_dev(rq)	((rq)->cmd_flags & REQ_FAILFAST_DEV)
+#define blk_failfast_transport(rq) ((rq)->cmd_flags & REQ_FAILFAST_TRANSPORT)
+#define blk_failfast_driver(rq)	((rq)->cmd_flags & REQ_FAILFAST_DRIVER)
+#define blk_noretry_request(rq)	(blk_failfast_dev(rq) ||	\
+				 blk_failfast_transport(rq) ||	\
+				 blk_failfast_driver(rq))
 #define blk_rq_started(rq)	((rq)->cmd_flags & REQ_STARTED)
 
 #define blk_account_rq(rq)	(blk_rq_started(rq) && (blk_fs_request(rq) || blk_discard_rq(rq))) 
-- 
cgit v1.2.3


From 69fd3a8d098faf41a04930afa83757c0555ee360 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Sun, 12 Oct 2008 16:18:36 +0200
Subject: [MTD] remove unused mtd parameter in of_mtd_parse_partitions()

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 include/linux/mtd/partitions.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mtd/partitions.h b/include/linux/mtd/partitions.h
index 5014f7a9f5d..c92b4d43960 100644
--- a/include/linux/mtd/partitions.h
+++ b/include/linux/mtd/partitions.h
@@ -73,7 +73,6 @@ struct device;
 struct device_node;
 
 int __devinit of_mtd_parse_partitions(struct device *dev,
-                                      struct mtd_info *mtd,
                                       struct device_node *node,
                                       struct mtd_partition **pparts);
 
-- 
cgit v1.2.3


From 97e1c18e8d17bd87e1e383b2e9d9fc740332c8e2 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Fri, 18 Jul 2008 12:16:16 -0400
Subject: tracing: Kernel Tracepoints

Implementation of kernel tracepoints. Inspired from the Linux Kernel
Markers. Allows complete typing verification by declaring both tracing
statement inline functions and probe registration/unregistration static
inline functions within the same macro "DEFINE_TRACE". No format string
is required. See the tracepoint Documentation and Samples patches for
usage examples.

Taken from the documentation patch :

"A tracepoint placed in code provides a hook to call a function (probe)
that you can provide at runtime. A tracepoint can be "on" (a probe is
connected to it) or "off" (no probe is attached). When a tracepoint is
"off" it has no effect, except for adding a tiny time penalty (checking
a condition for a branch) and space penalty (adding a few bytes for the
function call at the end of the instrumented function and adds a data
structure in a separate section).  When a tracepoint is "on", the
function you provide is called each time the tracepoint is executed, in
the execution context of the caller. When the function provided ends its
execution, it returns to the caller (continuing from the tracepoint
site).

You can put tracepoints at important locations in the code. They are
lightweight hooks that can pass an arbitrary number of parameters, which
prototypes are described in a tracepoint declaration placed in a header
file."

Addition and removal of tracepoints is synchronized by RCU using the
scheduler (and preempt_disable) as guarantees to find a quiescent state
(this is really RCU "classic"). The update side uses rcu_barrier_sched()
with call_rcu_sched() and the read/execute side uses
"preempt_disable()/preempt_enable()".

We make sure the previous array containing probes, which has been
scheduled for deletion by the rcu callback, is indeed freed before we
proceed to the next update. It therefore limits the rate of modification
of a single tracepoint to one update per RCU period. The objective here
is to permit fast batch add/removal of probes on _different_
tracepoints.

Changelog :
- Use #name ":" #proto as string to identify the tracepoint in the
  tracepoint table. This will make sure no type mismatch happens due to
  connexion of a probe with the wrong type to a tracepoint declared with
  the same name in a different header.
- Add tracepoint_entry_free_old.
- Change __TO_TRACE to get rid of the 'i' iterator.

Masami Hiramatsu <mhiramat@redhat.com> :
Tested on x86-64.

Performance impact of a tracepoint : same as markers, except that it
adds about 70 bytes of instructions in an unlikely branch of each
instrumented function (the for loop, the stack setup and the function
call). It currently adds a memory read, a test and a conditional branch
at the instrumentation site (in the hot path). Immediate values will
eventually change this into a load immediate, test and branch, which
removes the memory read which will make the i-cache impact smaller
(changing the memory read for a load immediate removes 3-4 bytes per
site on x86_32 (depending on mov prefixes), or 7-8 bytes on x86_64, it
also saves the d-cache hit).

About the performance impact of tracepoints (which is comparable to
markers), even without immediate values optimizations, tests done by
Hideo Aoki on ia64 show no regression. His test case was using hackbench
on a kernel where scheduler instrumentation (about 5 events in the
scheduler code) was added.

Quoting Hideo Aoki about Markers :

I evaluated overhead of kernel marker using linux-2.6-sched-fixes git
tree, which includes several markers for LTTng, using an ia64 server.

While the immediate trace mark feature isn't implemented on ia64, there
is no major performance regression. So, I think that we don't have any
issues to propose merging marker point patches into Linus's tree from
the viewpoint of performance impact.

I prepared two kernels to evaluate. The first one was compiled without
CONFIG_MARKERS. The second one was enabled CONFIG_MARKERS.

I downloaded the original hackbench from the following URL:
http://devresources.linux-foundation.org/craiger/hackbench/src/hackbench.c

I ran hackbench 5 times in each condition and calculated the average and
difference between the kernels.

    The parameter of hackbench: every 50 from 50 to 800
    The number of CPUs of the server: 2, 4, and 8

Below are the results. As you can see, no major performance regression
was found in any case. Even if the number of processes increases, the
differences between the marker-enabled kernel and the marker-disabled
kernel don't increase. Moreover, if the number of CPUs increases, the
differences don't increase either.

Curiously, marker-enabled kernel is better than marker-disabled kernel
in more than half cases, although I guess it comes from the difference
of memory access pattern.

* 2 CPUs

Number of | without      | with         | diff     | diff    |
processes | Marker [Sec] | Marker [Sec] |   [Sec]  |   [%]   |
--------------------------------------------------------------
       50 |      4.811   |       4.872  |  +0.061  |  +1.27  |
      100 |      9.854   |      10.309  |  +0.454  |  +4.61  |
      150 |     15.602   |      15.040  |  -0.562  |  -3.6   |
      200 |     20.489   |      20.380  |  -0.109  |  -0.53  |
      250 |     25.798   |      25.652  |  -0.146  |  -0.56  |
      300 |     31.260   |      30.797  |  -0.463  |  -1.48  |
      350 |     36.121   |      35.770  |  -0.351  |  -0.97  |
      400 |     42.288   |      42.102  |  -0.186  |  -0.44  |
      450 |     47.778   |      47.253  |  -0.526  |  -1.1   |
      500 |     51.953   |      52.278  |  +0.325  |  +0.63  |
      550 |     58.401   |      57.700  |  -0.701  |  -1.2   |
      600 |     63.334   |      63.222  |  -0.112  |  -0.18  |
      650 |     68.816   |      68.511  |  -0.306  |  -0.44  |
      700 |     74.667   |      74.088  |  -0.579  |  -0.78  |
      750 |     78.612   |      79.582  |  +0.970  |  +1.23  |
      800 |     85.431   |      85.263  |  -0.168  |  -0.2   |
--------------------------------------------------------------

* 4 CPUs

Number of | without      | with         | diff     | diff    |
processes | Marker [Sec] | Marker [Sec] |   [Sec]  |   [%]   |
--------------------------------------------------------------
       50 |      2.586   |       2.584  |  -0.003  |  -0.1   |
      100 |      5.254   |       5.283  |  +0.030  |  +0.56  |
      150 |      8.012   |       8.074  |  +0.061  |  +0.76  |
      200 |     11.172   |      11.000  |  -0.172  |  -1.54  |
      250 |     13.917   |      14.036  |  +0.119  |  +0.86  |
      300 |     16.905   |      16.543  |  -0.362  |  -2.14  |
      350 |     19.901   |      20.036  |  +0.135  |  +0.68  |
      400 |     22.908   |      23.094  |  +0.186  |  +0.81  |
      450 |     26.273   |      26.101  |  -0.172  |  -0.66  |
      500 |     29.554   |      29.092  |  -0.461  |  -1.56  |
      550 |     32.377   |      32.274  |  -0.103  |  -0.32  |
      600 |     35.855   |      35.322  |  -0.533  |  -1.49  |
      650 |     39.192   |      38.388  |  -0.804  |  -2.05  |
      700 |     41.744   |      41.719  |  -0.025  |  -0.06  |
      750 |     45.016   |      44.496  |  -0.520  |  -1.16  |
      800 |     48.212   |      47.603  |  -0.609  |  -1.26  |
--------------------------------------------------------------

* 8 CPUs

Number of | without      | with         | diff     | diff    |
processes | Marker [Sec] | Marker [Sec] |   [Sec]  |   [%]   |
--------------------------------------------------------------
       50 |      2.094   |       2.072  |  -0.022  |  -1.07  |
      100 |      4.162   |       4.273  |  +0.111  |  +2.66  |
      150 |      6.485   |       6.540  |  +0.055  |  +0.84  |
      200 |      8.556   |       8.478  |  -0.078  |  -0.91  |
      250 |     10.458   |      10.258  |  -0.200  |  -1.91  |
      300 |     12.425   |      12.750  |  +0.325  |  +2.62  |
      350 |     14.807   |      14.839  |  +0.032  |  +0.22  |
      400 |     16.801   |      16.959  |  +0.158  |  +0.94  |
      450 |     19.478   |      19.009  |  -0.470  |  -2.41  |
      500 |     21.296   |      21.504  |  +0.208  |  +0.98  |
      550 |     23.842   |      23.979  |  +0.137  |  +0.57  |
      600 |     26.309   |      26.111  |  -0.198  |  -0.75  |
      650 |     28.705   |      28.446  |  -0.259  |  -0.9   |
      700 |     31.233   |      31.394  |  +0.161  |  +0.52  |
      750 |     34.064   |      33.720  |  -0.344  |  -1.01  |
      800 |     36.320   |      36.114  |  -0.206  |  -0.57  |
--------------------------------------------------------------

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Acked-by: Masami Hiramatsu <mhiramat@redhat.com>
Acked-by: 'Peter Zijlstra' <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/module.h     |  17 ++++++
 include/linux/tracepoint.h | 127 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 144 insertions(+)
 create mode 100644 include/linux/tracepoint.h

(limited to 'include/linux')

diff --git a/include/linux/module.h b/include/linux/module.h
index 68e09557c95..8b611350386 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -16,6 +16,7 @@
 #include <linux/kobject.h>
 #include <linux/moduleparam.h>
 #include <linux/marker.h>
+#include <linux/tracepoint.h>
 #include <asm/local.h>
 
 #include <asm/module.h>
@@ -331,6 +332,10 @@ struct module
 	struct marker *markers;
 	unsigned int num_markers;
 #endif
+#ifdef CONFIG_TRACEPOINTS
+	struct tracepoint *tracepoints;
+	unsigned int num_tracepoints;
+#endif
 
 #ifdef CONFIG_MODULE_UNLOAD
 	/* What modules depend on me? */
@@ -454,6 +459,9 @@ extern void print_modules(void);
 
 extern void module_update_markers(void);
 
+extern void module_update_tracepoints(void);
+extern int module_get_iter_tracepoints(struct tracepoint_iter *iter);
+
 #else /* !CONFIG_MODULES... */
 #define EXPORT_SYMBOL(sym)
 #define EXPORT_SYMBOL_GPL(sym)
@@ -558,6 +566,15 @@ static inline void module_update_markers(void)
 {
 }
 
+static inline void module_update_tracepoints(void)
+{
+}
+
+static inline int module_get_iter_tracepoints(struct tracepoint_iter *iter)
+{
+	return 0;
+}
+
 #endif /* CONFIG_MODULES */
 
 struct device_driver;
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
new file mode 100644
index 00000000000..e623a6fca5c
--- /dev/null
+++ b/include/linux/tracepoint.h
@@ -0,0 +1,127 @@
+#ifndef _LINUX_TRACEPOINT_H
+#define _LINUX_TRACEPOINT_H
+
+/*
+ * Kernel Tracepoint API.
+ *
+ * See Documentation/tracepoint.txt.
+ *
+ * (C) Copyright 2008 Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
+ *
+ * Heavily inspired from the Linux Kernel Markers.
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+#include <linux/types.h>
+#include <linux/rcupdate.h>
+
+struct module;
+struct tracepoint;
+
+struct tracepoint {
+	const char *name;		/* Tracepoint name */
+	int state;			/* State. */
+	void **funcs;
+} __attribute__((aligned(8)));
+
+
+#define TPPROTO(args...)	args
+#define TPARGS(args...)		args
+
+#ifdef CONFIG_TRACEPOINTS
+
+/*
+ * it_func[0] is never NULL because there is at least one element in the array
+ * when the array itself is non NULL.
+ */
+#define __DO_TRACE(tp, proto, args)					\
+	do {								\
+		void **it_func;						\
+									\
+		rcu_read_lock_sched();					\
+		it_func = rcu_dereference((tp)->funcs);			\
+		if (it_func) {						\
+			do {						\
+				((void(*)(proto))(*it_func))(args);	\
+			} while (*(++it_func));				\
+		}							\
+		rcu_read_unlock_sched();				\
+	} while (0)
+
+/*
+ * Make sure the alignment of the structure in the __tracepoints section will
+ * not add unwanted padding between the beginning of the section and the
+ * structure. Force alignment to the same alignment as the section start.
+ */
+#define DEFINE_TRACE(name, proto, args)					\
+	static inline void trace_##name(proto)				\
+	{								\
+		static const char __tpstrtab_##name[]			\
+		__attribute__((section("__tracepoints_strings")))	\
+		= #name ":" #proto;					\
+		static struct tracepoint __tracepoint_##name		\
+		__attribute__((section("__tracepoints"), aligned(8))) =	\
+		{ __tpstrtab_##name, 0, NULL };				\
+		if (unlikely(__tracepoint_##name.state))		\
+			__DO_TRACE(&__tracepoint_##name,		\
+				TPPROTO(proto), TPARGS(args));		\
+	}								\
+	static inline int register_trace_##name(void (*probe)(proto))	\
+	{								\
+		return tracepoint_probe_register(#name ":" #proto,	\
+			(void *)probe);					\
+	}								\
+	static inline void unregister_trace_##name(void (*probe)(proto))\
+	{								\
+		tracepoint_probe_unregister(#name ":" #proto,		\
+			(void *)probe);					\
+	}
+
+extern void tracepoint_update_probe_range(struct tracepoint *begin,
+	struct tracepoint *end);
+
+#else /* !CONFIG_TRACEPOINTS */
+#define DEFINE_TRACE(name, proto, args)			\
+	static inline void _do_trace_##name(struct tracepoint *tp, proto) \
+	{ }								\
+	static inline void trace_##name(proto)				\
+	{ }								\
+	static inline int register_trace_##name(void (*probe)(proto))	\
+	{								\
+		return -ENOSYS;						\
+	}								\
+	static inline void unregister_trace_##name(void (*probe)(proto))\
+	{ }
+
+static inline void tracepoint_update_probe_range(struct tracepoint *begin,
+	struct tracepoint *end)
+{ }
+#endif /* CONFIG_TRACEPOINTS */
+
+/*
+ * Connect a probe to a tracepoint.
+ * Internal API, should not be used directly.
+ */
+extern int tracepoint_probe_register(const char *name, void *probe);
+
+/*
+ * Disconnect a probe from a tracepoint.
+ * Internal API, should not be used directly.
+ */
+extern int tracepoint_probe_unregister(const char *name, void *probe);
+
+struct tracepoint_iter {
+	struct module *module;
+	struct tracepoint *tracepoint;
+};
+
+extern void tracepoint_iter_start(struct tracepoint_iter *iter);
+extern void tracepoint_iter_next(struct tracepoint_iter *iter);
+extern void tracepoint_iter_stop(struct tracepoint_iter *iter);
+extern void tracepoint_iter_reset(struct tracepoint_iter *iter);
+extern int tracepoint_get_iter_range(struct tracepoint **tracepoint,
+	struct tracepoint *begin, struct tracepoint *end);
+
+#endif
-- 
cgit v1.2.3


From 36dcd67ae994fece615b7c700958d215e884b9ae Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 29 Jul 2008 12:00:59 +0200
Subject: ftrace: ignore functions that cannot be kprobe-ed

kprobes already has an extensive list of annotations for functions
that should not be instrumented. Add notrace annotations to these
functions as well.

This is particularly useful for functions called by the NMI path.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/kprobes.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 0be7795655f..497b1d1f7a0 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -29,6 +29,7 @@
  *		<jkenisto@us.ibm.com>  and Prasanna S Panchamukhi
  *		<prasanna@in.ibm.com> added function-return probes.
  */
+#include <linux/linkage.h>
 #include <linux/list.h>
 #include <linux/notifier.h>
 #include <linux/smp.h>
@@ -47,7 +48,7 @@
 #define KPROBE_HIT_SSDONE	0x00000008
 
 /* Attach to insert probes on any functions which should be ignored*/
-#define __kprobes	__attribute__((__section__(".kprobes.text")))
+#define __kprobes	__attribute__((__section__(".kprobes.text"))) notrace
 
 struct kprobe;
 struct pt_regs;
@@ -256,7 +257,7 @@ void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head);
 
 #else /* CONFIG_KPROBES */
 
-#define __kprobes	/**/
+#define __kprobes	notrace
 struct jprobe;
 struct kretprobe;
 
-- 
cgit v1.2.3


From 68bf21aa15c85d2e9b623dcda2b1ed8893275fa1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 14 Aug 2008 15:45:08 -0400
Subject: ftrace: mcount call site on boot nops core

This is the infrastructure for converting the mcount call sites
recorded by the __mcount_loc section into nops on boot. It also allows
for using these sites to enable tracing as normal. When the __mcount_loc
section is used, the "ftraced" kernel thread is disabled.

This uses the current infrastructure to record the mcount call sites
as well as convert them to nops. The mcount function is kept as a stub
on boot up and not converted to the ftrace_record_ip function. We use the
ftrace_record_ip to only record from the table.

This patch does not handle modules. That comes with a later patch.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index bb384068272..d4d6ab453b7 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -162,4 +162,10 @@ static inline void
 ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { }
 #endif
 
+#ifdef CONFIG_FTRACE_MCOUNT_RECORD
+extern void ftrace_init(void);
+#else
+static inline void ftrace_init(void) { }
+#endif
+
 #endif /* _LINUX_FTRACE_H */
-- 
cgit v1.2.3


From 90d595fe5ca4b685465c068907e6e554760abea8 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 14 Aug 2008 15:45:09 -0400
Subject: ftrace: enable mcount recording for modules

This patch enables the loading of the __mcount_loc section of modules and
changing all the callers of mcount into nops.

The modification is done before the init_module function is called, so
again, we do not need to use kstop_machine to make these changes.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index d4d6ab453b7..4936489f9ed 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -164,8 +164,11 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { }
 
 #ifdef CONFIG_FTRACE_MCOUNT_RECORD
 extern void ftrace_init(void);
+extern void ftrace_init_module(unsigned long *start, unsigned long *end);
 #else
 static inline void ftrace_init(void) { }
+static inline void
+ftrace_init_module(unsigned long *start, unsigned long *end) { }
 #endif
 
 #endif /* _LINUX_FTRACE_H */
-- 
cgit v1.2.3


From 29e71abf56cebc5c5a4e184a6eb4360cc58554ad Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 14 Aug 2008 15:45:10 -0400
Subject: ftrace: rebuild everything on change to FTRACE_MCOUNT_RECORD

When enabling or disabling CONFIG_FTRACE_MCOUNT_RECORD, we want a full
kernel compile to handle the adding of the __mcount_loc sections.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/kernel.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 75d81f157d2..ecce4a4ccd5 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -486,4 +486,9 @@ struct sysinfo {
 #define NUMA_BUILD 0
 #endif
 
+/* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */
+#ifdef CONFIG_FTRACE_MCOUNT_RECORD
+# define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD
+#endif
+
 #endif
-- 
cgit v1.2.3


From 28614889bcb2558a47d02d52394b7fd9795a9547 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 14 Aug 2008 22:47:18 -0400
Subject: ftrace: move notrace to compiler.h

The notrace define belongs in compiler.h so that it can be used in
init.h

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/compiler.h | 2 ++
 include/linux/linkage.h  | 2 --
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 8322141ee48..98115d9d04d 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -44,6 +44,8 @@ extern void __chk_io_ptr(const volatile void __iomem *);
 # error Sorry, your compiler is too old/not recognized.
 #endif
 
+#define notrace __attribute__((no_instrument_function))
+
 /* Intel compiler defines __GNUC__. So we will overwrite implementations
  * coming from above header files here
  */
diff --git a/include/linux/linkage.h b/include/linux/linkage.h
index 56ba3739465..9fd1f859021 100644
--- a/include/linux/linkage.h
+++ b/include/linux/linkage.h
@@ -4,8 +4,6 @@
 #include <linux/compiler.h>
 #include <asm/linkage.h>
 
-#define notrace __attribute__((no_instrument_function))
-
 #ifdef __cplusplus
 #define CPP_ASMLINKAGE extern "C"
 #else
-- 
cgit v1.2.3


From fed1939c64d2288938fdc1c367d49082da65e195 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 14 Aug 2008 22:47:19 -0400
Subject: ftrace: remove old pointers to mcount

When a mcount pointer is recorded into a table, it is used to add or
remove calls to mcount (replacing them with nops). If the code is removed
via removing a module, the pointers still exist.  When modifying the code,
a check is always made to make sure the code being replaced is the code
expected. In other words, the code being replaced is compared to what
it is expected to be before being replaced.

There is a very small chance that the code being replaced just happens
to look like code that calls mcount (very small since the call to mcount
is relative). To remove this chance, this patch adds ftrace_release to
allow module unloading to remove the pointers to mcount within the module.

Another change for init calls is made to not trace calls marked with
__init. The tracing can not be started until after init is done anyway.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h | 2 ++
 include/linux/init.h   | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 4936489f9ed..6b232a2460c 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -165,10 +165,12 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { }
 #ifdef CONFIG_FTRACE_MCOUNT_RECORD
 extern void ftrace_init(void);
 extern void ftrace_init_module(unsigned long *start, unsigned long *end);
+extern void ftrace_release(void *start, unsigned long size);
 #else
 static inline void ftrace_init(void) { }
 static inline void
 ftrace_init_module(unsigned long *start, unsigned long *end) { }
+static inline void ftrace_release(void *start, unsigned long size) { }
 #endif
 
 #endif /* _LINUX_FTRACE_H */
diff --git a/include/linux/init.h b/include/linux/init.h
index 93538b696e3..27f61f6b3cb 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -40,7 +40,7 @@
 
 /* These are for everybody (although not all archs will actually
    discard it in modules) */
-#define __init		__section(.init.text) __cold
+#define __init		__section(.init.text) __cold notrace
 #define __initdata	__section(.init.data)
 #define __initconst	__section(.init.rodata)
 #define __exitdata	__section(.exit.data)
-- 
cgit v1.2.3


From dd0e545f061f90099a3dcc13aa77e29c6295cf23 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Fri, 1 Aug 2008 12:26:41 -0400
Subject: ftrace: printk formatting infrastructure

This patch adds a feature that can help kernel developers debug their
code using ftrace.

  int ftrace_printk(const char *fmt, ...);

This records into the ftrace buffer using printf formatting. The entry
size in the buffers is still a fixed length. A new type has been added
that allows for more entries to be used for a single recording.

The start of the print is still the same as the other entries.

It returns the number of characters written to the ftrace buffer.

For example:

Having a module with the following code:

static int __init ftrace_print_test(void)
{
        ftrace_printk("jiffies are %ld\n", jiffies);
        return 0;
}

Gives me:

  insmod-5441  3...1 7569us : ftrace_print_test: jiffies are 4296626666

for the latency_trace file and:

          insmod-5441  [03]  1959.370498: ftrace_print_test jiffies are 4296626666

for the trace file.

Note: Only the infrastructure should go into the kernel. It is to help
facilitate debugging for other kernel developers. Calls to ftrace_printk
are not intended to be left in the kernel, and should be frowned upon just
like scattering printks around in the code.

But having this easily at your fingertips helps the debugging go faster
and bugs be solved quicker.

Maybe later on, we can hook this with markers and have their printf format
be sucked into ftrace output.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 6b232a2460c..f53b975e32f 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -157,9 +157,18 @@ static inline void __ftrace_enabled_restore(int enabled)
 #ifdef CONFIG_TRACING
 extern void
 ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3);
+# define ftrace_printk(x...) __ftrace_printk(_THIS_IP_, x)
+extern int
+__ftrace_printk(unsigned long ip, const char *fmt, ...)
+	__attribute__ ((format (printf, 2, 3)));
 #else
 static inline void
 ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { }
+static inline int
+ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 0)))
+{
+	return 0;
+}
 #endif
 
 #ifdef CONFIG_FTRACE_MCOUNT_RECORD
@@ -173,4 +182,5 @@ ftrace_init_module(unsigned long *start, unsigned long *end) { }
 static inline void ftrace_release(void *start, unsigned long size) { }
 #endif
 
+
 #endif /* _LINUX_FTRACE_H */
-- 
cgit v1.2.3


From 2f2c99dba2398ef7d9c21f7c793180a50e68b1f0 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Fri, 1 Aug 2008 16:45:49 -0400
Subject: ftrace: ftrace_printk doc moved

Based on Randy Dunlap's suggestion, the ftrace_printk kernel-doc belongs
with the ftrace_printk macro that should be used. Not with the
__ftrace_printk internal function.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Acked-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index f53b975e32f..018af16bce5 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -157,7 +157,24 @@ static inline void __ftrace_enabled_restore(int enabled)
 #ifdef CONFIG_TRACING
 extern void
 ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3);
-# define ftrace_printk(x...) __ftrace_printk(_THIS_IP_, x)
+
+/**
+ * ftrace_printk - printf formatting in the ftrace buffer
+ * @fmt: the printf format for printing
+ *
+ * Note: __ftrace_printk is an internal function for ftrace_printk and
+ *       the @ip is passed in via the ftrace_printk macro.
+ *
+ * This function allows a kernel developer to debug fast path sections
+ * that printk is not appropriate for. By scattering in various
+ * printk like tracing in the code, a developer can quickly see
+ * where problems are occurring.
+ *
+ * This is intended as a debugging tool for the developer only.
+ * Please refrain from leaving ftrace_printks scattered around in
+ * your code.
+ */
+# define ftrace_printk(fmt...) __ftrace_printk(_THIS_IP_, fmt)
 extern int
 __ftrace_printk(unsigned long ip, const char *fmt, ...)
 	__attribute__ ((format (printf, 2, 3)));
-- 
cgit v1.2.3


From 3f5a54e371ca20b119b73704f6c01b71295c1714 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 30 Jul 2008 22:36:46 -0400
Subject: ftrace: dump out ftrace buffers to console on panic

At OLS I had a lot of interest to be able to have the ftrace buffers
dumped on panic.  Usually one would expect to use kexec and examine
the buffers after a new kernel is loaded. But sometimes the resources
do not permit kdump and kexec, so having an option to still see the
sequence of events up to the crash is very advantageous.

This patch adds the option to have the ftrace buffers dumped to the
console in the latency_trace format on a panic. When the option is set,
the default entries per CPU buffer are lowered to 16384, since the writing
to the serial (if that is the console) may take an awful long time
otherwise.

[
 Changes since -v1:
  Got alpine to send correctly (as well as spell check working).
  Removed config option.
  Moved the static variables into ftrace_dump itself.
  Gave printk a log level.
]

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 018af16bce5..f7fb92045bf 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -178,6 +178,7 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3);
 extern int
 __ftrace_printk(unsigned long ip, const char *fmt, ...)
 	__attribute__ ((format (printf, 2, 3)));
+extern void ftrace_dump(void);
 #else
 static inline void
 ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { }
@@ -186,6 +187,7 @@ ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 0)))
 {
 	return 0;
 }
+static inline void ftrace_dump(void) { }
 #endif
 
 #ifdef CONFIG_FTRACE_MCOUNT_RECORD
-- 
cgit v1.2.3


From 7b928c23fa3e9fa37d1d4ba52ba963f41ee5aae0 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 15 Aug 2008 17:48:02 +0200
Subject: ftrace: build fix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

fix:

 In file included from init/main.c:65:
 include/linux/ftrace.h:166: error: expected ‘,’ or ‘;’ before ‘{’ token
 make[1]: *** [init/main.o] Error 1
 make: *** [init/main.o] Error 2

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index f7fb92045bf..ce929cb5543 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -183,7 +183,10 @@ extern void ftrace_dump(void);
 static inline void
 ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { }
 static inline int
-ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 0)))
+ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 0)));
+
+static inline int
+ftrace_printk(const char *fmt, ...)
 {
 	return 0;
 }
-- 
cgit v1.2.3


From c5131ad6c3cbe8f6674993e29a76cecf8deb4384 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 15 Aug 2008 18:22:09 +0200
Subject: ftrace: ftrace_kill_atomic() build fix

fix:

 kernel/built-in.o: In function `ftrace_dump':
 (.text+0x2e2ea): undefined reference to `ftrace_kill_atomic'

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index ce929cb5543..36c439927ff 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -36,6 +36,7 @@ extern void ftrace_stub(unsigned long a0, unsigned long a1);
 # define register_ftrace_function(ops) do { } while (0)
 # define unregister_ftrace_function(ops) do { } while (0)
 # define clear_ftrace_function(ops) do { } while (0)
+static inline void ftrace_kill_atomic(void) { }
 #endif /* CONFIG_FTRACE */
 
 #ifdef CONFIG_DYNAMIC_FTRACE
-- 
cgit v1.2.3


From 3700273586ee6a58b95dd07d9f8a02db4a9b476f Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Mon, 18 Aug 2008 16:24:56 +0800
Subject: ftrace: fix incorrect comment style of __ftrace_enabled_save()

This patch fixes incorrect comment style of __ftrace_enabled_save().

Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 36c439927ff..8b4cf38c80d 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -99,9 +99,11 @@ static inline void tracer_disable(void)
 #endif
 }
 
-/* Ftrace disable/restore without lock. Some synchronization mechanism
+/*
+ * Ftrace disable/restore without lock. Some synchronization mechanism
  * must be used to prevent ftrace_enabled to be changed between
- * disable/restore. */
+ * disable/restore.
+ */
 static inline int __ftrace_enabled_save(void)
 {
 #ifdef CONFIG_FTRACE
-- 
cgit v1.2.3


From c0719e5a4b1ccc04180b7a7b71095c9fb7131919 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Sat, 6 Sep 2008 01:06:03 -0400
Subject: ftrace: use ftrace_release for all dynamic ftrace functions

ftrace_release is necessary for all uses of dynamic ftrace and not just
the archs that have CONFIG_FTRACE_MCOUNT_RECORD defined.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 8b4cf38c80d..5de9903645d 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -77,8 +77,10 @@ extern void mcount_call(void);
 
 extern int skip_trace(unsigned long ip);
 
-void ftrace_disable_daemon(void);
-void ftrace_enable_daemon(void);
+extern void ftrace_release(void *start, unsigned long size);
+
+extern void ftrace_disable_daemon(void);
+extern void ftrace_enable_daemon(void);
 
 #else
 # define skip_trace(ip)				({ 0; })
@@ -86,6 +88,7 @@ void ftrace_enable_daemon(void);
 # define ftrace_set_filter(buf, len, reset)	do { } while (0)
 # define ftrace_disable_daemon()		do { } while (0)
 # define ftrace_enable_daemon()			do { } while (0)
+static inline void ftrace_release(void *start, unsigned long size) { }
 #endif /* CONFIG_DYNAMIC_FTRACE */
 
 /* totally disable ftrace - can not re-enable after this */
@@ -199,12 +202,10 @@ static inline void ftrace_dump(void) { }
 #ifdef CONFIG_FTRACE_MCOUNT_RECORD
 extern void ftrace_init(void);
 extern void ftrace_init_module(unsigned long *start, unsigned long *end);
-extern void ftrace_release(void *start, unsigned long size);
 #else
 static inline void ftrace_init(void) { }
 static inline void
 ftrace_init_module(unsigned long *start, unsigned long *end) { }
-static inline void ftrace_release(void *start, unsigned long size) { }
 #endif
 
 
-- 
cgit v1.2.3


From 9e57fb35d711331a9b1410c5c56ebeb3733428a0 Mon Sep 17 00:00:00 2001
From: Pekka Paalanen <pq@iki.fi>
Date: Tue, 16 Sep 2008 22:00:34 +0300
Subject: x86 mmiotrace: implement mmiotrace_printk()

Offer mmiotrace users a function to inject markers from inside the kernel.
This depends on the trace_vprintk() patch.

Signed-off-by: Pekka Paalanen <pq@iki.fi>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/mmiotrace.h | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h
index 61d19e1b7a0..60cc3bf5c53 100644
--- a/include/linux/mmiotrace.h
+++ b/include/linux/mmiotrace.h
@@ -34,11 +34,15 @@ extern void unregister_kmmio_probe(struct kmmio_probe *p);
 /* Called from page fault handler. */
 extern int kmmio_handler(struct pt_regs *regs, unsigned long addr);
 
-/* Called from ioremap.c */
 #ifdef CONFIG_MMIOTRACE
+/* Called from ioremap.c */
 extern void mmiotrace_ioremap(resource_size_t offset, unsigned long size,
 							void __iomem *addr);
 extern void mmiotrace_iounmap(volatile void __iomem *addr);
+
+/* For anyone to insert markers. Remember trailing newline. */
+extern int mmiotrace_printk(const char *fmt, ...)
+				__attribute__ ((format (printf, 1, 2)));
 #else
 static inline void mmiotrace_ioremap(resource_size_t offset,
 					unsigned long size, void __iomem *addr)
@@ -48,7 +52,15 @@ static inline void mmiotrace_ioremap(resource_size_t offset,
 static inline void mmiotrace_iounmap(volatile void __iomem *addr)
 {
 }
-#endif /* CONFIG_MMIOTRACE_HOOKS */
+
+static inline int mmiotrace_printk(const char *fmt, ...)
+				__attribute__ ((format (printf, 1, 0)));
+
+static inline int mmiotrace_printk(const char *fmt, ...)
+{
+	return 0;
+}
+#endif /* CONFIG_MMIOTRACE */
 
 enum mm_io_opcode {
 	MMIO_READ = 0x1,     /* struct mmiotrace_rw */
@@ -81,5 +93,6 @@ extern void enable_mmiotrace(void);
 extern void disable_mmiotrace(void);
 extern void mmio_trace_rw(struct mmiotrace_rw *rw);
 extern void mmio_trace_mapping(struct mmiotrace_map *map);
+extern int mmio_trace_printk(const char *fmt, va_list args);
 
 #endif /* MMIOTRACE_H */
-- 
cgit v1.2.3


From 4427414170a63331a9cc36b9598502c5cdfe453b Mon Sep 17 00:00:00 2001
From: Pekka Paalanen <pq@iki.fi>
Date: Tue, 16 Sep 2008 22:03:56 +0300
Subject: mmiotrace: remove left-over marker cruft

Signed-off-by: Pekka Paalanen <pq@iki.fi>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/mmiotrace.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h
index 60cc3bf5c53..139d7c88d9c 100644
--- a/include/linux/mmiotrace.h
+++ b/include/linux/mmiotrace.h
@@ -67,8 +67,7 @@ enum mm_io_opcode {
 	MMIO_WRITE = 0x2,    /* struct mmiotrace_rw */
 	MMIO_PROBE = 0x3,    /* struct mmiotrace_map */
 	MMIO_UNPROBE = 0x4,  /* struct mmiotrace_map */
-	MMIO_MARKER = 0x5,   /* raw char data */
-	MMIO_UNKNOWN_OP = 0x6, /* struct mmiotrace_rw */
+	MMIO_UNKNOWN_OP = 0x5, /* struct mmiotrace_rw */
 };
 
 struct mmiotrace_rw {
-- 
cgit v1.2.3


From e98d0eabef2748d88fa58760d104e8e68517406b Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Mon, 29 Sep 2008 11:05:13 -0400
Subject: markers: marker_synchronize_unregister()

Create marker_synchronize_unregister() which must be called before the end of
exit() to make sure all probe callers have exited the non-preemptible section
and thus are not executing the probe code anymore.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/marker.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/marker.h b/include/linux/marker.h
index 1290653f924..889196c7fbb 100644
--- a/include/linux/marker.h
+++ b/include/linux/marker.h
@@ -160,4 +160,11 @@ extern int marker_probe_unregister_private_data(marker_probe_func *probe,
 extern void *marker_get_private_data(const char *name, marker_probe_func *probe,
 	int num);
 
+/*
+ * marker_synchronize_unregister must be called between the last marker probe
+ * unregistration and the end of module exit to make sure there is no caller
+ * executing a probe when it is freed.
+ */
+#define marker_synchronize_unregister() synchronize_sched()
+
 #endif
-- 
cgit v1.2.3


From 53c8c8fdfd2d2d515bdcb3d0f2a11d1f3f42ece1 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Fri, 3 Oct 2008 11:52:54 -0400
Subject: markers: turn marker_synchronize_unregister() into an inline

Turn marker synchronize unregister into a static inline. There is no
reason to keep it as a macro over a static inline.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/marker.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/marker.h b/include/linux/marker.h
index 889196c7fbb..38e32e781ed 100644
--- a/include/linux/marker.h
+++ b/include/linux/marker.h
@@ -13,6 +13,7 @@
  */
 
 #include <linux/types.h>
+#include <linux/rcupdate.h>
 
 struct module;
 struct marker;
@@ -165,6 +166,9 @@ extern void *marker_get_private_data(const char *name, marker_probe_func *probe,
  * unregistration and the end of module exit to make sure there is no caller
  * executing a probe when it is freed.
  */
-#define marker_synchronize_unregister() synchronize_sched()
+static inline void marker_synchronize_unregister(void)
+{
+	synchronize_sched();
+}
 
 #endif
-- 
cgit v1.2.3


From d13744cd6e3fef373a3fe656ac349b4e7c49ff79 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Weisbecker?= <fweisbec@gmail.com>
Date: Tue, 23 Sep 2008 11:32:08 +0100
Subject: tracing/ftrace: add the boot tracer

Add the boot/initcall tracer.

Its primary purpose is to be able to trace the initcalls.

It is intended to be used with scripts/bootgraph.pl after some small
improvements.

Note that it is not active after its init. To avoid tracing (and so
crashing) before the whole tracing engine init, you have to explicitly
call start_boot_trace() after do_pre_smp_initcalls() to enable it.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 5de9903645d..91954eb6460 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -5,6 +5,8 @@
 
 #include <linux/linkage.h>
 #include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/types.h>
 
 extern int ftrace_enabled;
 extern int
@@ -209,4 +211,21 @@ ftrace_init_module(unsigned long *start, unsigned long *end) { }
 #endif
 
 
+struct boot_trace {
+	pid_t			caller;
+	initcall_t		func;
+	int			result;
+	unsigned long long	duration;
+};
+
+#ifdef CONFIG_BOOT_TRACER
+extern void trace_boot(struct boot_trace *it);
+extern void start_boot_trace(void);
+#else
+static inline void trace_boot(struct boot_trace *it) { }
+static inline void start_boot_trace(void) { }
+#endif
+
+
+
 #endif /* _LINUX_FTRACE_H */
-- 
cgit v1.2.3


From 7a8e76a3829f1067b70f715771ff88baf2fbf3c3 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 29 Sep 2008 23:02:38 -0400
Subject: tracing: unified trace buffer

This is a unified tracing buffer that implements a ring buffer that
hopefully everyone will eventually be able to use.

The events recorded into the buffer have the following structure:

  struct ring_buffer_event {
	u32 type:2, len:3, time_delta:27;
	u32 array[];
  };

The minimum size of an event is 8 bytes. All events are 4 byte
aligned inside the buffer.

There are 4 types (all internal use for the ring buffer, only
the data type is exported to the interface users).

 RINGBUF_TYPE_PADDING: this type is used to note extra space at the end
	of a buffer page.

 RINGBUF_TYPE_TIME_EXTEND: This type is used when the time between events
	is greater than the 27 bit delta can hold. We add another
	32 bits, and record that in its own event (8 byte size).

 RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to
	help keep the buffer timestamps in sync.

RINGBUF_TYPE_DATA: The event actually holds user data.

The "len" field is only three bits. Since the data must be
4 byte aligned, this field is shifted left by 2, giving a
max length of 28 bytes. If the data load is greater than 28
bytes, the first array field holds the full length of the
data load and the len field is set to zero.

Example, data size of 7 bytes:

	type = RINGBUF_TYPE_DATA
	len = 2
	time_delta: <time-stamp> - <prev_event-time-stamp>
	array[0..1]: <7 bytes of data> <1 byte empty>

This event is saved in 12 bytes of the buffer.

An event with 82 bytes of data:

	type = RINGBUF_TYPE_DATA
	len = 0
	time_delta: <time-stamp> - <prev_event-time-stamp>
	array[0]: 84 (Note the alignment)
	array[1..21]: <82 bytes of data> <2 bytes empty>

The above event is saved in 92 bytes (if my math is correct).
82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length.

Do not reference the above event struct directly. Use the following
functions to gain access to the event table, since the
ring_buffer_event structure may change in the future.

ring_buffer_event_length(event): get the length of the event.
	This is the size of the memory used to record this
	event, and not the size of the data pay load.

ring_buffer_event_time_delta(event): get the time delta of the event
	This returns the delta time stamp since the last event.
	Note: Even though this is in the header, there should
		be no reason to access this directly, except
		for debugging.

ring_buffer_event_data(event): get the data from the event
	This is the function to use to get the actual data
	from the event. Note, it is only a pointer to the
	data inside the buffer. This data must be copied to
	another location otherwise you risk it being written
	over in the buffer.

ring_buffer_lock: A way to lock the entire buffer.
ring_buffer_unlock: unlock the buffer.

ring_buffer_alloc: create a new ring buffer. Can choose between
	overwrite or consumer/producer mode. Overwrite will
	overwrite old data, whereas consumer/producer will
	throw away new data if the consumer catches up with the
	producer.  The consumer/producer is the default.

ring_buffer_free: free the ring buffer.

ring_buffer_resize: resize the buffer. Changes the size of each cpu
	buffer. Note, it is up to the caller to provide that
	the buffer is not being used while this is happening.
	This requirement may go away but do not count on it.

ring_buffer_lock_reserve: locks the ring buffer and allocates an
	entry on the buffer to write to.
ring_buffer_unlock_commit: unlocks the ring buffer and commits it to
	the buffer.

ring_buffer_write: writes some data into the ring buffer.

ring_buffer_peek: Look at a next item in the cpu buffer.
ring_buffer_consume: get the next item in the cpu buffer and
	consume it. That is, this function increments the head
	pointer.

ring_buffer_read_start: Start an iterator of a cpu buffer.
	For now, this disables the cpu buffer, until you issue
	a finish. This is just because we do not want the iterator
	to be overwritten. This restriction may change in the future.
	But note, this is used for static reading of a buffer which
	is usually done "after" a trace. Live readings would want
	to use the ring_buffer_consume above, which will not
	disable the ring buffer.

ring_buffer_read_finish: Finishes the read iterator and reenables
	the ring buffer.

ring_buffer_iter_peek: Look at the next item in the cpu iterator.
ring_buffer_read: Read the iterator and increment it.
ring_buffer_iter_reset: Reset the iterator to point to the beginning
	of the cpu buffer.
ring_buffer_iter_empty: Returns true if the iterator is at the end
	of the cpu buffer.

ring_buffer_size: returns the size in bytes of each cpu buffer.
	Note, the real size is this times the number of CPUs.

ring_buffer_reset_cpu: Sets the cpu buffer to empty
ring_buffer_reset: sets all cpu buffers to empty

ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a
	cpu buffer of another buffer. This is handy when you
	want to take a snap shot of a running trace on just one
	cpu. Having a backup buffer, to swap with facilitates this.
	Ftrace max latencies use this.

ring_buffer_empty: Returns true if the ring buffer is empty.
ring_buffer_empty_cpu: Returns true if the cpu buffer is empty.

ring_buffer_record_disable: disable all cpu buffers (read only)
ring_buffer_record_disable_cpu: disable a single cpu buffer (read only)
ring_buffer_record_enable: enable all cpu buffers.
ring_buffer_record_enable_cpu: enable a single cpu buffer.

ring_buffer_entries: The number of entries in a ring buffer.
ring_buffer_overruns: The number of entries removed due to writing wrap.

ring_buffer_time_stamp: Get the time stamp used by the ring buffer
ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp
	into nanosecs.

I still need to implement the GTOD feature. But we need support from
the cpu frequency infrastructure.  But this can be done at a later
time without affecting the ring buffer interface.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ring_buffer.h | 130 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 130 insertions(+)
 create mode 100644 include/linux/ring_buffer.h

(limited to 'include/linux')

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
new file mode 100644
index 00000000000..c52375b8330
--- /dev/null
+++ b/include/linux/ring_buffer.h
@@ -0,0 +1,130 @@
+#ifndef _LINUX_RING_BUFFER_H
+#define _LINUX_RING_BUFFER_H
+
+#include <linux/mm.h>
+#include <linux/seq_file.h>
+
+struct ring_buffer;
+struct ring_buffer_iter;
+
+/*
+ * Don't reference this struct directly, use functions below.
+ */
+struct ring_buffer_event {
+	u32		type:2, len:3, time_delta:27;
+	u32		array[];
+};
+
+/**
+ * enum ring_buffer_type - internal ring buffer types
+ *
+ * @RINGBUF_TYPE_PADDING:	Left over page padding
+ *				 array is ignored
+ *				 size is variable depending on how much
+ *				  padding is needed
+ *
+ * @RINGBUF_TYPE_TIME_EXTEND:	Extend the time delta
+ *				 array[0] = time delta (28 .. 59)
+ *				 size = 8 bytes
+ *
+ * @RINGBUF_TYPE_TIME_STAMP:	Sync time stamp with external clock
+ *				 array[0] = tv_nsec
+ *				 array[1] = tv_sec
+ *				 size = 16 bytes
+ *
+ * @RINGBUF_TYPE_DATA:		Data record
+ *				 If len is zero:
+ *				  array[0] holds the actual length
+ *				  array[1..(length+3)/4] holds data
+ *				 else
+ *				  length = len << 2
+ *				  array[0..(length+3)/4-1] holds data
+ */
+enum ring_buffer_type {
+	RINGBUF_TYPE_PADDING,
+	RINGBUF_TYPE_TIME_EXTEND,
+	/* FIXME: RINGBUF_TYPE_TIME_STAMP not implemented */
+	RINGBUF_TYPE_TIME_STAMP,
+	RINGBUF_TYPE_DATA,
+};
+
+unsigned ring_buffer_event_length(struct ring_buffer_event *event);
+void *ring_buffer_event_data(struct ring_buffer_event *event);
+
+/**
+ * ring_buffer_event_time_delta - return the delta timestamp of the event
+ * @event: the event to get the delta timestamp of
+ *
+ * The delta timestamp is the 27 bit timestamp since the last event.
+ */
+static inline unsigned
+ring_buffer_event_time_delta(struct ring_buffer_event *event)
+{
+	return event->time_delta;
+}
+
+void ring_buffer_lock(struct ring_buffer *buffer, unsigned long *flags);
+void ring_buffer_unlock(struct ring_buffer *buffer, unsigned long flags);
+
+/*
+ * size is in bytes for each per CPU buffer.
+ */
+struct ring_buffer *
+ring_buffer_alloc(unsigned long size, unsigned flags);
+void ring_buffer_free(struct ring_buffer *buffer);
+
+int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size);
+
+struct ring_buffer_event *
+ring_buffer_lock_reserve(struct ring_buffer *buffer,
+			 unsigned long length,
+			 unsigned long *flags);
+int ring_buffer_unlock_commit(struct ring_buffer *buffer,
+			      struct ring_buffer_event *event,
+			      unsigned long flags);
+int ring_buffer_write(struct ring_buffer *buffer,
+		      unsigned long length, void *data);
+
+struct ring_buffer_event *
+ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts);
+struct ring_buffer_event *
+ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts);
+
+struct ring_buffer_iter *
+ring_buffer_read_start(struct ring_buffer *buffer, int cpu);
+void ring_buffer_read_finish(struct ring_buffer_iter *iter);
+
+struct ring_buffer_event *
+ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts);
+struct ring_buffer_event *
+ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts);
+void ring_buffer_iter_reset(struct ring_buffer_iter *iter);
+int ring_buffer_iter_empty(struct ring_buffer_iter *iter);
+
+unsigned long ring_buffer_size(struct ring_buffer *buffer);
+
+void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu);
+void ring_buffer_reset(struct ring_buffer *buffer);
+
+int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
+			 struct ring_buffer *buffer_b, int cpu);
+
+int ring_buffer_empty(struct ring_buffer *buffer);
+int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu);
+
+void ring_buffer_record_disable(struct ring_buffer *buffer);
+void ring_buffer_record_enable(struct ring_buffer *buffer);
+void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu);
+void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu);
+
+unsigned long ring_buffer_entries(struct ring_buffer *buffer);
+unsigned long ring_buffer_overruns(struct ring_buffer *buffer);
+
+u64 ring_buffer_time_stamp(int cpu);
+void ring_buffer_normalize_time_stamp(int cpu, u64 *ts);
+
+enum ring_buffer_flags {
+	RB_FL_OVERWRITE		= 1 << 0,
+};
+
+#endif /* _LINUX_RING_BUFFER_H */
-- 
cgit v1.2.3


From d769041f865330034131525ee6a7f72eb4af2a24 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 1 Oct 2008 00:29:53 -0400
Subject: ring_buffer: implement new locking

The old "lock always" scheme had issues with lockdep, and was not very
efficient anyways.

This patch does a new design to be partially lockless on writes.
Writes will add new entries to the per cpu pages by simply disabling
interrupts. When a write needs to go to another page, then it will
grab the lock.

A new "read page" has been added so that the reader can pull out a page
from the ring buffer to read without worrying about the writer writing over
it. This allows us to not take the lock for all reads. The lock is
now only taken when a read needs to go to a new page.

This is far from lockless, and interrupts still need to be disabled,
but it is a step towards a more lockless solution, and it also
solves a lot of the issues that were noticed by the first conversion
of ftrace to the ring buffers.

Note: the ring_buffer_{un}lock API has been removed.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ring_buffer.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index c52375b8330..536b0ca46a0 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -63,9 +63,6 @@ ring_buffer_event_time_delta(struct ring_buffer_event *event)
 	return event->time_delta;
 }
 
-void ring_buffer_lock(struct ring_buffer *buffer, unsigned long *flags);
-void ring_buffer_unlock(struct ring_buffer *buffer, unsigned long flags);
-
 /*
  * size is in bytes for each per CPU buffer.
  */
-- 
cgit v1.2.3


From cb5ab74204a6e2579d1119bf1348eb806526b12b Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 2 Oct 2008 12:59:20 +0200
Subject: tracing/fastboot: change the printing of boot tracer according to
 bootgraph.pl

Change the boot tracer printing to make it parsable for
the scripts/bootgraph.pl script.

We have now to output two lines for each initcall, according to the
printk in do_one_initcall() in init/main.c
We need now the call's time and the return's time.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 91954eb6460..4455490d91b 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -216,6 +216,8 @@ struct boot_trace {
 	initcall_t		func;
 	int			result;
 	unsigned long long	duration;
+	ktime_t			calltime;
+	ktime_t			rettime;
 };
 
 #ifdef CONFIG_BOOT_TRACER
-- 
cgit v1.2.3


From 5601020feb0c3010e9e3e0131e9697ac6a06777b Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 2 Oct 2008 13:26:05 +0200
Subject: tracing/fastboot: get the initcall name before it disappears

After some initcall traces, some initcall names may be inconsistent.
That's because these functions will disappear from the .init section
and also their name from the symbols table.

So we have to copy the name of the function in a buffer large enough
during the trace appending. It is not costly for the ring_buffer because
the number of initcall entries is commonly not really large.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 4455490d91b..e672e51c40a 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -7,6 +7,7 @@
 #include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/types.h>
+#include <linux/kallsyms.h>
 
 extern int ftrace_enabled;
 extern int
@@ -213,7 +214,7 @@ ftrace_init_module(unsigned long *start, unsigned long *end) { }
 
 struct boot_trace {
 	pid_t			caller;
-	initcall_t		func;
+	char 			func[KSYM_NAME_LEN];
 	int			result;
 	unsigned long long	duration;
 	ktime_t			calltime;
@@ -221,10 +222,10 @@ struct boot_trace {
 };
 
 #ifdef CONFIG_BOOT_TRACER
-extern void trace_boot(struct boot_trace *it);
+extern void trace_boot(struct boot_trace *it, initcall_t fn);
 extern void start_boot_trace(void);
 #else
-static inline void trace_boot(struct boot_trace *it) { }
+static inline void trace_boot(struct boot_trace *it, initcall_t fn) { }
 static inline void start_boot_trace(void) { }
 #endif
 
-- 
cgit v1.2.3


From 3e1932ad59726d794a865cc159c0593d54bf0cb6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 2 Oct 2008 17:45:47 +0200
Subject: tracing/fastboot: build fix

fix:

 In file included from kernel/sysctl.c:52:
 include/linux/ftrace.h:217: error: 'KSYM_NAME_LEN' undeclared here (not in a function)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index e672e51c40a..deded114dff 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -1,14 +1,14 @@
 #ifndef _LINUX_FTRACE_H
 #define _LINUX_FTRACE_H
 
-#ifdef CONFIG_FTRACE
-
 #include <linux/linkage.h>
 #include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/types.h>
 #include <linux/kallsyms.h>
 
+#ifdef CONFIG_FTRACE
+
 extern int ftrace_enabled;
 extern int
 ftrace_enable_sysctl(struct ctl_table *table, int write,
-- 
cgit v1.2.3


From eb7fa935274bb233686fdf7a53f40c5d9ee76ed6 Mon Sep 17 00:00:00 2001
From: Steven Noonan <steven@uplinklabs.net>
Date: Thu, 2 Oct 2008 12:00:07 -0700
Subject: ftrace: ktime.h not included in ftrace.h

Including <linux/ktime.h> eliminates the following error:

include/linux/ftrace.h:220: error: expected specifier-qualifier-list
before 'ktime_t'

Signed-off-by: Steven Noonan <steven@uplinklabs.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index deded114dff..ed53265d1f6 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -3,6 +3,7 @@
 
 #include <linux/linkage.h>
 #include <linux/fs.h>
+#include <linux/ktime.h>
 #include <linux/init.h>
 #include <linux/types.h>
 #include <linux/kallsyms.h>
-- 
cgit v1.2.3


From 097d036a2f25eecc42435c57e010aaf4a2eed2d9 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 3 Oct 2008 15:39:21 +0200
Subject: tracing/fastboot: only trace non-module initcalls

At this time, only built-in initcalls interest us.
We can't really produce a relevant graph if we include
the module initcalls too.

I had good results after this patch (see svg in attachment).

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index ed53265d1f6..5812dba4ee2 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -225,9 +225,11 @@ struct boot_trace {
 #ifdef CONFIG_BOOT_TRACER
 extern void trace_boot(struct boot_trace *it, initcall_t fn);
 extern void start_boot_trace(void);
+extern void stop_boot_trace(void);
 #else
 static inline void trace_boot(struct boot_trace *it, initcall_t fn) { }
 static inline void start_boot_trace(void) { }
+static inline void stop_boot_trace(void) { }
 #endif
 
 
-- 
cgit v1.2.3


From ca538f6bbe583406f941f3041d40c41f9a13d1de Mon Sep 17 00:00:00 2001
From: Tim Bird <tim.bird@am.sony.com>
Date: Thu, 9 Oct 2008 15:23:05 -0700
Subject: tracing/fastboot: add better resolution to initcall debug/tracing

Change the time resolution for initcall_debug to microseconds, from
milliseconds.  This is handy to determine which initcalls you want to work
on for faster booting.

On one of my test machines, over 90% of the initcalls are less than a
millisecond and (without this patch) these are all reported as 0 msecs.
Working on the 900 us ones is more important than the 4 us ones.

With 'quiet' on the kernel command line, this adds no significant overhead
to kernel boot time.

Signed-off-by: Tim Bird <tim.bird@am.sony.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 5812dba4ee2..a3d46151be1 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -215,9 +215,9 @@ ftrace_init_module(unsigned long *start, unsigned long *end) { }
 
 struct boot_trace {
 	pid_t			caller;
-	char 			func[KSYM_NAME_LEN];
+	char			func[KSYM_NAME_LEN];
 	int			result;
-	unsigned long long	duration;
+	unsigned long long	duration;		/* usecs */
 	ktime_t			calltime;
 	ktime_t			rettime;
 };
-- 
cgit v1.2.3


From bfadadfccc19e36f7d600c5ce7b3e5ba5197fbf0 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Fri, 10 Oct 2008 03:48:25 -0400
Subject: markers: fix synchronize marker unregister static inline

Use a #define for synchronize marker unregister to fix include dependencies.

Fixes the slab circular inclusion which triggers when slab.git is combined
with tracing.git, where rcupdate includes slab, which includes markers
which includes rcupdate.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/marker.h | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/marker.h b/include/linux/marker.h
index 38e32e781ed..889196c7fbb 100644
--- a/include/linux/marker.h
+++ b/include/linux/marker.h
@@ -13,7 +13,6 @@
  */
 
 #include <linux/types.h>
-#include <linux/rcupdate.h>
 
 struct module;
 struct marker;
@@ -166,9 +165,6 @@ extern void *marker_get_private_data(const char *name, marker_probe_func *probe,
  * unregistration and the end of module exit to make sure there is no caller
  * executing a probe when it is freed.
  */
-static inline void marker_synchronize_unregister(void)
-{
-	synchronize_sched();
-}
+#define marker_synchronize_unregister() synchronize_sched()
 
 #endif
-- 
cgit v1.2.3


From f2461fc82a083dd60062e05e704c5fcc1c658ba1 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Mon, 6 Oct 2008 10:33:00 -0400
Subject: tracepoints: tracepoint_synchronize_unregister()

Create tracepoint_synchronize_unregister() which must be called before the end
of exit() to make sure every probe caller has exited the non-preemptible
section and thus is not executing the probe code anymore.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/tracepoint.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index e623a6fca5c..199f4c207c1 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -124,4 +124,11 @@ extern void tracepoint_iter_reset(struct tracepoint_iter *iter);
 extern int tracepoint_get_iter_range(struct tracepoint **tracepoint,
 	struct tracepoint *begin, struct tracepoint *end);
 
+/*
+ * tracepoint_synchronize_unregister must be called between the last tracepoint
+ * probe unregistration and the end of module exit to make sure there is no
+ * caller executing a probe when it is freed.
+ */
+#define tracepoint_synchronize_unregister() synchronize_sched()
+
 #endif
-- 
cgit v1.2.3


From 231375cc5cc3549bb413f94a164bdcbd5f9ce943 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Fri, 3 Oct 2008 15:01:33 -0400
Subject: tracepoints: synchronize unregister static inline

Turn tracepoint synchronize unregister into a static inline. There is no
reason to keep it as a macro over a static inline.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/tracepoint.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 199f4c207c1..c5bb39c7a77 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -129,6 +129,9 @@ extern int tracepoint_get_iter_range(struct tracepoint **tracepoint,
  * probe unregistration and the end of module exit to make sure there is no
  * caller executing a probe when it is freed.
  */
-#define tracepoint_synchronize_unregister() synchronize_sched()
+static inline void tracepoint_synchronize_unregister(void)
+{
+	synchronize_sched();
+}
 
 #endif
-- 
cgit v1.2.3


From 6028aa01f759a1dae11e5d0e495b3dc9d2b0a47b Mon Sep 17 00:00:00 2001
From: Yoshihiro Shimoda <shimoda.yoshihiro@renesas.com>
Date: Tue, 14 Oct 2008 21:23:26 +0900
Subject: [MTD] [NAND] sh_flctl: add support for Renesas SuperH FLCTL

Several Renesas SuperH CPUs have an FLCTL. The FLCTL supports NAND flash.
This driver supports the SH7723.

Signed-off-by: Yoshihiro Shimoda <shimoda.yoshihiro@renesas.com>
Acked-by: Paul Mundt <lethal@linux-sh.org>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 include/linux/mtd/sh_flctl.h | 125 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 125 insertions(+)
 create mode 100644 include/linux/mtd/sh_flctl.h

(limited to 'include/linux')

diff --git a/include/linux/mtd/sh_flctl.h b/include/linux/mtd/sh_flctl.h
new file mode 100644
index 00000000000..e77c1cea404
--- /dev/null
+++ b/include/linux/mtd/sh_flctl.h
@@ -0,0 +1,125 @@
+/*
+ * SuperH FLCTL nand controller
+ *
+ * Copyright © 2008 Renesas Solutions Corp.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#ifndef __SH_FLCTL_H__
+#define __SH_FLCTL_H__
+
+#include <linux/mtd/mtd.h>
+#include <linux/mtd/nand.h>
+#include <linux/mtd/partitions.h>
+
+/* FLCTL registers: address of each one, as an offset from (f)->reg */
+#define FLCMNCR(f)		((f)->reg + 0x0)
+#define FLCMDCR(f)		((f)->reg + 0x4)
+#define FLCMCDR(f)		((f)->reg + 0x8)
+#define FLADR(f)		((f)->reg + 0xC)
+#define FLADR2(f)		((f)->reg + 0x3C)
+#define FLDATAR(f)		((f)->reg + 0x10)
+#define FLDTCNTR(f)		((f)->reg + 0x14)
+#define FLINTDMACR(f)		((f)->reg + 0x18)
+#define FLBSYTMR(f)		((f)->reg + 0x1C)
+#define FLBSYCNT(f)		((f)->reg + 0x20)
+#define FLDTFIFO(f)		((f)->reg + 0x24)
+#define FLECFIFO(f)		((f)->reg + 0x28)
+#define FLTRCR(f)		((f)->reg + 0x2C)
+#define	FL4ECCRESULT0(f)	((f)->reg + 0x80)
+#define	FL4ECCRESULT1(f)	((f)->reg + 0x84)
+#define	FL4ECCRESULT2(f)	((f)->reg + 0x88)
+#define	FL4ECCRESULT3(f)	((f)->reg + 0x8C)
+#define	FL4ECCCR(f)		((f)->reg + 0x90)
+#define	FL4ECCCNT(f)		((f)->reg + 0x94)
+#define	FLERRADR(f)		((f)->reg + 0x98)
+
+/* FLCMNCR control bits */
+#define ECCPOS2		(0x1 << 25)
+#define _4ECCCNTEN	(0x1 << 24)
+#define _4ECCEN		(0x1 << 23)
+#define _4ECCCORRECT	(0x1 << 22)
+#define SNAND_E		(0x1 << 18)	/* SNAND (0=512 1=2048)*/
+#define QTSEL_E		(0x1 << 17)
+#define ENDIAN		(0x1 << 16)	/* 1 = little endian */
+#define FCKSEL_E	(0x1 << 15)
+#define ECCPOS_00	(0x00 << 12)
+#define ECCPOS_01	(0x01 << 12)
+#define ECCPOS_02	(0x02 << 12)
+#define ACM_SACCES_MODE	(0x01 << 10)
+#define NANWF_E		(0x1 << 9)
+#define SE_D		(0x1 << 8)	/* Spare area disable */
+#define	CE1_ENABLE	(0x1 << 4)	/* Chip Enable 1 */
+#define	CE0_ENABLE	(0x1 << 3)	/* Chip Enable 0 */
+#define	TYPESEL_SET	(0x1 << 0)
+
+/* FLCMDCR control bits */
+#define ADRCNT2_E	(0x1 << 31)	/* 5byte address enable */
+#define ADRMD_E		(0x1 << 26)	/* Sector address access */
+#define CDSRC_E		(0x1 << 25)	/* Data buffer selection */
+#define DOSR_E		(0x1 << 24)	/* Status read check */
+#define SELRW		(0x1 << 21)	/*  0:read 1:write */
+#define DOADR_E		(0x1 << 20)	/* Address stage execute */
+#define ADRCNT_1	(0x00 << 18)	/* Address data bytes: 1byte */
+#define ADRCNT_2	(0x01 << 18)	/* Address data bytes: 2byte */
+#define ADRCNT_3	(0x02 << 18)	/* Address data bytes: 3byte */
+#define ADRCNT_4	(0x03 << 18)	/* Address data bytes: 4byte */
+#define DOCMD2_E	(0x1 << 17)	/* 2nd cmd stage execute */
+#define DOCMD1_E	(0x1 << 16)	/* 1st cmd stage execute */
+
+/* FLTRCR control bits */
+#define TRSTRT		(0x1 << 0)	/* translation start */
+#define TREND		(0x1 << 1)	/* translation end */
+
+/* FL4ECCCR control bits */
+#define	_4ECCFA		(0x1 << 2)	/* 4 symbols correct fault */
+#define	_4ECCEND	(0x1 << 1)	/* 4 symbols end */
+#define	_4ECCEXST	(0x1 << 0)	/* 4 symbols exist */
+
+#define INIT_FL4ECCRESULT_VAL	0x03FF03FF
+#define LOOP_TIMEOUT_MAX	0x00010000
+/* Get the sh_flctl that embeds this mtd_info (member "mtd"). */
+#define mtd_to_flctl(mtdinfo)	container_of(mtdinfo, struct sh_flctl, mtd)
+
+struct sh_flctl {
+	struct mtd_info		mtd;
+	struct nand_chip	chip;
+	void __iomem		*reg;
+
+	uint8_t	done_buff[2048 + 64];	/* max size 2048 + 64 */
+	int	read_bytes;
+	int	index;
+	int	seqin_column;		/* column in SEQIN cmd */
+	int	seqin_page_addr;	/* page_addr in SEQIN cmd */
+	uint32_t seqin_read_cmd;		/* read cmd in SEQIN cmd */
+	int	erase1_page_addr;	/* page_addr in ERASE1 cmd */
+	uint32_t erase_ADRCNT;		/* bits of FLCMDCR in ERASE1 cmd */
+	uint32_t rw_ADRCNT;	/* bits of FLCMDCR in READ WRITE cmd */
+
+	int	hwecc_cant_correct[4];
+
+	unsigned page_size:1;	/* NAND page size (0 = 512, 1 = 2048) */
+	unsigned hwecc:1;	/* Hardware ECC (0 = disabled, 1 = enabled) */
+};
+
+struct sh_flctl_platform_data {
+	struct mtd_partition	*parts;
+	int			nr_parts;
+	unsigned long		flcmncr_val;
+
+	unsigned has_hwecc:1;
+};
+
+#endif	/* __SH_FLCTL_H__ */
-- 
cgit v1.2.3


From 74baaaaec8b4f22e1ae279f5ecca4ff705b28912 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 14 Oct 2008 09:21:02 -0400
Subject: vfs: Remove the range_cont writeback mode.

Ext4 was the only user of range_cont writeback mode and ext4 switched
to a different method. So remove the range_cont mode which is not used
in the kernel.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
CC: linux-fsdevel@vger.kernel.org
---
 include/linux/writeback.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 12b15c561a1..bd91987c065 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -63,7 +63,6 @@ struct writeback_control {
 	unsigned for_writepages:1;	/* This is a writepages() call */
 	unsigned range_cyclic:1;	/* range_start is cyclic */
 	unsigned more_io:1;		/* more io to be dispatched */
-	unsigned range_cont:1;
 };
 
 /*
-- 
cgit v1.2.3


From e6a7d3c04f8fe49099521e6dc9a46b0272381f2f Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 14 Oct 2008 11:58:31 -0700
Subject: netfilter: ctnetlink: remove bogus module dependency between
 ctnetlink and nf_nat

This patch removes the module dependency between ctnetlink and
nf_nat by means of an indirect call that is initialized when
nf_nat is loaded. Now, nf_conntrack_netlink only requires
nf_conntrack and nfnetlink.

This patch puts nfnetlink_parse_nat_setup_hook into the
nf_conntrack_core to avoid dependencies between ctnetlink,
nf_conntrack_ipv4 and nf_conntrack_ipv6.

This patch also introduces the function ctnetlink_change_nat
that is only invoked from the creation path. Actually, the
nat handling cannot be invoked from the update path since
this is not allowed. By introducing this function, we remove
the useless nat handling in the update path and we avoid
deadlock-prone code.

This patch also adds the required EAGAIN logic for nfnetlink.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/nfnetlink.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h
index 0d8424f7689..7d8e0455cca 100644
--- a/include/linux/netfilter/nfnetlink.h
+++ b/include/linux/netfilter/nfnetlink.h
@@ -78,6 +78,9 @@ extern int nfnetlink_send(struct sk_buff *skb, u32 pid, unsigned group,
 			  int echo);
 extern int nfnetlink_unicast(struct sk_buff *skb, u_int32_t pid, int flags);
 
+extern void nfnl_lock(void);
+extern void nfnl_unlock(void);
+
 #define MODULE_ALIAS_NFNL_SUBSYS(subsys) \
 	MODULE_ALIAS("nfnetlink-subsys-" __stringify(subsys))
 
-- 
cgit v1.2.3


From 4704f0e274829e3af00737d2d9adace2d71a9605 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 14 Oct 2008 19:16:07 -0400
Subject: NFS: Fix the resolution problem with nfs_inode_attrs_need_update()

It appears that 'jiffies' timestamps do not have high enough resolution for
nfs_inode_attrs_need_update(). One problem is that a GETATTR can be
launched within < 1 jiffy of the last operation that updated the attribute.
Another problem is that RPC calls can take < 1 jiffy to execute.

We can fix this by switching the variables to use a simple global counter
that gets incremented every time we start another GETATTR call.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/nfs_fs.h  | 10 +++-------
 include/linux/nfs_xdr.h |  1 +
 2 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index ca563ee13e3..ac8d0233b05 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -137,7 +137,7 @@ struct nfs_inode {
 	unsigned long		attrtimeo_timestamp;
 	__u64			change_attr;		/* v4 only */
 
-	unsigned long		last_updated;
+	unsigned long		attr_gencount;
 	/* "Generation counter" for the attribute cache. This is
 	 * bumped whenever we update the metadata on the
 	 * server.
@@ -344,15 +344,11 @@ extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ct
 extern void put_nfs_open_context(struct nfs_open_context *ctx);
 extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, int mode);
 extern u64 nfs_compat_user_ino64(u64 fileid);
+extern void nfs_fattr_init(struct nfs_fattr *fattr);
 
 /* linux/net/ipv4/ipconfig.c: trims ip addr off front of name, too. */
 extern __be32 root_nfs_parse_addr(char *name); /*__init*/
-
-static inline void nfs_fattr_init(struct nfs_fattr *fattr)
-{
-	fattr->valid = 0;
-	fattr->time_start = jiffies;
-}
+extern unsigned long nfs_inc_attr_generation_counter(void);
 
 /*
  * linux/fs/nfs/file.c
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 6ee6ae3f095..c1c31acb8a2 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -56,6 +56,7 @@ struct nfs_fattr {
 	__u64			change_attr;	/* NFSv4 change attribute */
 	__u64			pre_change_attr;/* pre-op NFSv4 change attribute */
 	unsigned long		time_start;
+	unsigned long		gencount;
 };
 
 #define NFS_ATTR_WCC		0x0001		/* pre-op WCC data    */
-- 
cgit v1.2.3


From d98e6346350ac909f095768beb28b82368bd126f Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollisb@us.ibm.com>
Date: Tue, 1 Jul 2008 16:23:49 -0500
Subject: KVM: Move KVM TRACE DEFINITIONS to common header

Move KVM trace definitions from x86 specific kvm headers to common kvm
headers to create a cross-architecture numbering scheme for trace
events. This means the kvmtrace_format userspace tool won't need to know
which architecture produced the log file being processed.

Signed-off-by: Jerone Young <jyoung5@us.ibm.com>
Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 include/linux/kvm.h      | 21 +++++++++++++++++++++
 include/linux/kvm_host.h | 19 +++++++++++++++++++
 2 files changed, 40 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 70a30651cd1..8a3ceadb136 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -440,4 +440,25 @@ struct kvm_trace_rec {
 #define KVM_GET_MP_STATE          _IOR(KVMIO,  0x98, struct kvm_mp_state)
 #define KVM_SET_MP_STATE          _IOW(KVMIO,  0x99, struct kvm_mp_state)
 
+#define KVM_TRC_INJ_VIRQ         (KVM_TRC_HANDLER + 0x02)
+#define KVM_TRC_REDELIVER_EVT    (KVM_TRC_HANDLER + 0x03)
+#define KVM_TRC_PEND_INTR        (KVM_TRC_HANDLER + 0x04)
+#define KVM_TRC_IO_READ          (KVM_TRC_HANDLER + 0x05)
+#define KVM_TRC_IO_WRITE         (KVM_TRC_HANDLER + 0x06)
+#define KVM_TRC_CR_READ          (KVM_TRC_HANDLER + 0x07)
+#define KVM_TRC_CR_WRITE         (KVM_TRC_HANDLER + 0x08)
+#define KVM_TRC_DR_READ          (KVM_TRC_HANDLER + 0x09)
+#define KVM_TRC_DR_WRITE         (KVM_TRC_HANDLER + 0x0A)
+#define KVM_TRC_MSR_READ         (KVM_TRC_HANDLER + 0x0B)
+#define KVM_TRC_MSR_WRITE        (KVM_TRC_HANDLER + 0x0C)
+#define KVM_TRC_CPUID            (KVM_TRC_HANDLER + 0x0D)
+#define KVM_TRC_INTR             (KVM_TRC_HANDLER + 0x0E)
+#define KVM_TRC_NMI              (KVM_TRC_HANDLER + 0x0F)
+#define KVM_TRC_VMMCALL          (KVM_TRC_HANDLER + 0x10)
+#define KVM_TRC_HLT              (KVM_TRC_HANDLER + 0x11)
+#define KVM_TRC_CLTS             (KVM_TRC_HANDLER + 0x12)
+#define KVM_TRC_LMSW             (KVM_TRC_HANDLER + 0x13)
+#define KVM_TRC_APIC_ACCESS      (KVM_TRC_HANDLER + 0x14)
+#define KVM_TRC_TDP_FAULT        (KVM_TRC_HANDLER + 0x15)
+
 #endif
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 8525afc5310..a18aaad2ab7 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -326,6 +326,25 @@ struct kvm_stats_debugfs_item {
 extern struct kvm_stats_debugfs_item debugfs_entries[];
 extern struct dentry *kvm_debugfs_dir;
 
+#define KVMTRACE_5D(evt, vcpu, d1, d2, d3, d4, d5, name) \
+	trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
+						vcpu, 5, d1, d2, d3, d4, d5)
+#define KVMTRACE_4D(evt, vcpu, d1, d2, d3, d4, name) \
+	trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
+						vcpu, 4, d1, d2, d3, d4, 0)
+#define KVMTRACE_3D(evt, vcpu, d1, d2, d3, name) \
+	trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
+						vcpu, 3, d1, d2, d3, 0, 0)
+#define KVMTRACE_2D(evt, vcpu, d1, d2, name) \
+	trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
+						vcpu, 2, d1, d2, 0, 0, 0)
+#define KVMTRACE_1D(evt, vcpu, d1, name) \
+	trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
+						vcpu, 1, d1, 0, 0, 0, 0)
+#define KVMTRACE_0D(evt, vcpu, name) \
+	trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
+						vcpu, 0, 0, 0, 0, 0, 0)
+
 #ifdef CONFIG_KVM_TRACE
 int kvm_trace_ioctl(unsigned int ioctl, unsigned long arg);
 void kvm_trace_cleanup(void);
-- 
cgit v1.2.3


From e32c8f2c0720fb21c6f4a5f6ccbebdadc878f707 Mon Sep 17 00:00:00 2001
From: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Date: Mon, 14 Jul 2008 14:00:00 +0200
Subject: KVM: kvmtrace: Remove use of bit fields in kvm trace structure

This patch fixes kvmtrace use on big endian systems. When using bit fields the
compiler will lay data out in a different order than expected when the record
is written to a file.
This fixes it by using one variable instead of using bit fields.

Signed-off-by: Jerone Young <jyoung5@us.ibm.com>
Signed-off-by: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 include/linux/kvm.h | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 8a3ceadb136..8a16b083df2 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -311,9 +311,13 @@ struct kvm_s390_interrupt {
 
 /* This structure represents a single trace buffer record. */
 struct kvm_trace_rec {
-	__u32 event:28;
-	__u32 extra_u32:3;
-	__u32 cycle_in:1;
+	/* variable rec_val
+	 * is split into:
+	 * bits 0 - 27  -> event id
+	 * bits 28 -30  -> number of extra data args of size u32
+	 * bits 31      -> binary indicator for if tsc is in record
+	 */
+	__u32 rec_val;
 	__u32 pid;
 	__u32 vcpu_id;
 	union {
@@ -327,6 +331,13 @@ struct kvm_trace_rec {
 	} u;
 };
 
+#define TRACE_REC_EVENT_ID(val) \
+		(0x0fffffff & (val))
+#define TRACE_REC_NUM_DATA_ARGS(val) \
+		(0x70000000 & ((val) << 28))
+#define TRACE_REC_TCS(val) \
+		(0x80000000 & ((val) << 31))
+
 #define KVMIO 0xAE
 
 /*
-- 
cgit v1.2.3


From 3f7f95c65ef6a89472a28da1b9436eaeee288831 Mon Sep 17 00:00:00 2001
From: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Date: Mon, 14 Jul 2008 14:00:01 +0200
Subject: KVM: kvmtrace: replace get_cycles with ktime_get v3

The current kvmtrace code uses get_cycles() while the interpretation would be
easier using nanoseconds. ktime_get() should give at least the same
accuracy as get_cycles on all architectures (even better on 32bit archs) but
at a better unit (e.g. comparable between hosts with different frequencies).

[avi: avoid ktime_t in public header]

Signed-off-by: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 include/linux/kvm.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 8a16b083df2..5d08f11bb27 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -322,12 +322,12 @@ struct kvm_trace_rec {
 	__u32 vcpu_id;
 	union {
 		struct {
-			__u64 cycle_u64;
+			__u64 timestamp;
 			__u32 extra_u32[KVM_TRC_EXTRA_MAX];
-		} __attribute__((packed)) cycle;
+		} __attribute__((packed)) timestamp;
 		struct {
 			__u32 extra_u32[KVM_TRC_EXTRA_MAX];
-		} nocycle;
+		} notimestamp;
 	} u;
 };
 
-- 
cgit v1.2.3


From 31711f2294b38d8334efaf7dbac6da4781fd151e Mon Sep 17 00:00:00 2001
From: Jerone Young <jyoung5@us.ibm.com>
Date: Mon, 14 Jul 2008 14:00:03 +0200
Subject: KVM: ppc: adds trace points for ppc tlb activity

This patch adds trace points to track powerpc TLB activities using the
KVM_TRACE infrastructure.

Signed-off-by: Jerone Young <jyoung5@us.ibm.com>
Signed-off-by: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 include/linux/kvm.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 5d08f11bb27..e21a5050d4d 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -471,5 +471,8 @@ struct kvm_trace_rec {
 #define KVM_TRC_LMSW             (KVM_TRC_HANDLER + 0x13)
 #define KVM_TRC_APIC_ACCESS      (KVM_TRC_HANDLER + 0x14)
 #define KVM_TRC_TDP_FAULT        (KVM_TRC_HANDLER + 0x15)
+#define KVM_TRC_GTLB_WRITE       (KVM_TRC_HANDLER + 0x16)
+#define KVM_TRC_STLB_WRITE       (KVM_TRC_HANDLER + 0x17)
+#define KVM_TRC_STLB_INVAL       (KVM_TRC_HANDLER + 0x18)
 
 #endif
-- 
cgit v1.2.3


From 3b4bd7969f7b61a1ab455bff084ee4f0a2411055 Mon Sep 17 00:00:00 2001
From: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Date: Mon, 14 Jul 2008 14:00:04 +0200
Subject: KVM: ppc: trace powerpc instruction emulation

This patch adds a trace point for the instruction emulation on embedded powerpc
utilizing the KVM_TRACE interface.

Signed-off-by: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 include/linux/kvm.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index e21a5050d4d..d29b6488144 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -474,5 +474,6 @@ struct kvm_trace_rec {
 #define KVM_TRC_GTLB_WRITE       (KVM_TRC_HANDLER + 0x16)
 #define KVM_TRC_STLB_WRITE       (KVM_TRC_HANDLER + 0x17)
 #define KVM_TRC_STLB_INVAL       (KVM_TRC_HANDLER + 0x18)
+#define KVM_TRC_PPC_INSTR        (KVM_TRC_HANDLER + 0x19)
 
 #endif
-- 
cgit v1.2.3


From 4d5c5d0fe89c921336b95f5e7e4f529a9df92f53 Mon Sep 17 00:00:00 2001
From: Ben-Ami Yassour <benami@il.ibm.com>
Date: Mon, 28 Jul 2008 19:26:26 +0300
Subject: KVM: pci device assignment

Based on a patch from: Amit Shah <amit.shah@qumranet.com>

This patch adds support for handling PCI devices that are assigned to
the guest.

The device to be assigned to the guest is registered in the host kernel
and interrupt delivery is handled.  If a device is already assigned, or
the device driver for it is still loaded on the host, the device
assignment fails and -EBUSY is returned to userspace.

Devices that share their interrupt line are not supported at the moment.

By itself, this patch will not make devices work within the guest.
The VT-d extension is required to enable the device to perform DMA.
Another alternative is PVDMA.

Signed-off-by: Amit Shah <amit.shah@qumranet.com>
Signed-off-by: Ben-Ami Yassour <benami@il.ibm.com>
Signed-off-by: Weidong Han <weidong.han@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 include/linux/kvm.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index d29b6488144..ef4bc6f8977 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -383,6 +383,7 @@ struct kvm_trace_rec {
 #define KVM_CAP_MP_STATE 14
 #define KVM_CAP_COALESCED_MMIO 15
 #define KVM_CAP_SYNC_MMU 16  /* Changes to host mmap are reflected in guest */
+#define KVM_CAP_DEVICE_ASSIGNMENT 17
 
 /*
  * ioctls for VM fds
@@ -412,6 +413,10 @@ struct kvm_trace_rec {
 			_IOW(KVMIO,  0x67, struct kvm_coalesced_mmio_zone)
 #define KVM_UNREGISTER_COALESCED_MMIO \
 			_IOW(KVMIO,  0x68, struct kvm_coalesced_mmio_zone)
+#define KVM_ASSIGN_PCI_DEVICE _IOR(KVMIO, 0x69, \
+				   struct kvm_assigned_pci_dev)
+#define KVM_ASSIGN_IRQ _IOR(KVMIO, 0x70, \
+			    struct kvm_assigned_irq)
 
 /*
  * ioctls for vcpu fds
@@ -476,4 +481,18 @@ struct kvm_trace_rec {
 #define KVM_TRC_STLB_INVAL       (KVM_TRC_HANDLER + 0x18)
 #define KVM_TRC_PPC_INSTR        (KVM_TRC_HANDLER + 0x19)
 
+struct kvm_assigned_pci_dev {
+	__u32 assigned_dev_id;
+	__u32 busnr;
+	__u32 devfn;
+	__u32 flags;
+};
+
+struct kvm_assigned_irq {
+	__u32 assigned_dev_id;
+	__u32 host_irq;
+	__u32 guest_irq;
+	__u32 flags;
+};
+
 #endif
-- 
cgit v1.2.3


From d76901750ab9f71091d33ef3d2b5909d8a9a4ad4 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Mon, 8 Sep 2008 15:23:48 -0300
Subject: KVM: x86: do not execute halted vcpus

Offline or uninitialized vcpus can be executed if requested to perform
userspace work.

Follow Avi's suggestion to handle halted vcpus in the main loop,
simplifying kvm_emulate_halt(). Introduce a new vcpu->requests bit to
indicate events that promote state from halted to running.

Also standardize vcpu wake sites.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 include/linux/kvm_host.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index a18aaad2ab7..4b036430ea2 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -34,6 +34,7 @@
 #define KVM_REQ_MMU_RELOAD         3
 #define KVM_REQ_TRIPLE_FAULT       4
 #define KVM_REQ_PENDING_TIMER      5
+#define KVM_REQ_UNHALT             6
 
 struct kvm_vcpu;
 extern struct kmem_cache *kvm_vcpu_cache;
-- 
cgit v1.2.3


From 387179464257921eb9aa3d15cc3ff194f6945a7c Mon Sep 17 00:00:00 2001
From: "Kay, Allen M" <allen.m.kay@intel.com>
Date: Tue, 9 Sep 2008 18:37:29 +0300
Subject: VT-d: Changes to support KVM

This patch extends the VT-d driver to support KVM.

[Ben: fixed memory pinning]
[avi: move dma_remapping.h as well]

Signed-off-by: Kay, Allen M <allen.m.kay@intel.com>
Signed-off-by: Weidong Han <weidong.han@intel.com>
Signed-off-by: Ben-Ami Yassour <benami@il.ibm.com>
Signed-off-by: Amit Shah <amit.shah@qumranet.com>
Acked-by: Mark Gross <mgross@linux.intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 include/linux/dma_remapping.h | 157 ++++++++++++++++++++
 include/linux/intel-iommu.h   | 327 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/iova.h          |  52 +++++++
 3 files changed, 536 insertions(+)
 create mode 100644 include/linux/dma_remapping.h
 create mode 100644 include/linux/intel-iommu.h
 create mode 100644 include/linux/iova.h

(limited to 'include/linux')

diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h
new file mode 100644
index 00000000000..bff5c65f81d
--- /dev/null
+++ b/include/linux/dma_remapping.h
@@ -0,0 +1,157 @@
+#ifndef _DMA_REMAPPING_H
+#define _DMA_REMAPPING_H
+
+/*
+ * We need a fixed PAGE_SIZE of 4K irrespective of
+ * arch PAGE_SIZE for IOMMU page tables.
+ */
+#define PAGE_SHIFT_4K		(12)
+#define PAGE_SIZE_4K		(1UL << PAGE_SHIFT_4K)
+#define PAGE_MASK_4K		(((u64)-1) << PAGE_SHIFT_4K)
+#define PAGE_ALIGN_4K(addr)	(((addr) + PAGE_SIZE_4K - 1) & PAGE_MASK_4K)
+
+#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT_4K)
+#define DMA_32BIT_PFN		IOVA_PFN(DMA_32BIT_MASK)
+#define DMA_64BIT_PFN		IOVA_PFN(DMA_64BIT_MASK)
+
+
+/*
+ * 0: Present
+ * 1-11: Reserved
+ * 12-63: Context Ptr (12 - (haw-1))
+ * 64-127: Reserved
+ */
+struct root_entry {
+	u64	val;
+	u64	rsvd1;
+};
+#define ROOT_ENTRY_NR (PAGE_SIZE_4K/sizeof(struct root_entry))
+static inline bool root_present(struct root_entry *root)
+{
+	return (root->val & 1);
+}
+static inline void set_root_present(struct root_entry *root)
+{
+	root->val |= 1;
+}
+static inline void set_root_value(struct root_entry *root, unsigned long value)
+{
+	root->val |= value & PAGE_MASK_4K;
+}
+
+struct context_entry;
+static inline struct context_entry *
+get_context_addr_from_root(struct root_entry *root)
+{
+	return (struct context_entry *)
+		(root_present(root)?phys_to_virt(
+		root->val & PAGE_MASK_4K):
+		NULL);
+}
+
+/*
+ * low 64 bits:
+ * 0: present
+ * 1: fault processing disable
+ * 2-3: translation type
+ * 12-63: address space root
+ * high 64 bits:
+ * 0-2: address width
+ * 3-6: aval
+ * 8-23: domain id
+ */
+struct context_entry {
+	u64 lo;
+	u64 hi;
+};
+#define context_present(c) ((c).lo & 1)
+#define context_fault_disable(c) (((c).lo >> 1) & 1)
+#define context_translation_type(c) (((c).lo >> 2) & 3)
+#define context_address_root(c) ((c).lo & PAGE_MASK_4K)
+#define context_address_width(c) ((c).hi &  7)
+#define context_domain_id(c) (((c).hi >> 8) & ((1 << 16) - 1))
+
+#define context_set_present(c) do {(c).lo |= 1;} while (0)
+#define context_set_fault_enable(c) \
+	do {(c).lo &= (((u64)-1) << 2) | 1;} while (0)
+#define context_set_translation_type(c, val) \
+	do { \
+		(c).lo &= (((u64)-1) << 4) | 3; \
+		(c).lo |= ((val) & 3) << 2; \
+	} while (0)
+#define CONTEXT_TT_MULTI_LEVEL 0
+#define context_set_address_root(c, val) \
+	do {(c).lo |= (val) & PAGE_MASK_4K;} while (0)
+#define context_set_address_width(c, val) do {(c).hi |= (val) & 7;} while (0)
+#define context_set_domain_id(c, val) \
+	do {(c).hi |= ((val) & ((1 << 16) - 1)) << 8;} while (0)
+#define context_clear_entry(c) do {(c).lo = 0; (c).hi = 0;} while (0)
+
+/*
+ * 0: readable
+ * 1: writable
+ * 2-6: reserved
+ * 7: super page
+ * 8-11: available
+ * 12-63: Host physical address
+ */
+struct dma_pte {
+	u64 val;
+};
+#define dma_clear_pte(p)	do {(p).val = 0;} while (0)
+
+#define DMA_PTE_READ (1)
+#define DMA_PTE_WRITE (2)
+
+#define dma_set_pte_readable(p) do {(p).val |= DMA_PTE_READ;} while (0)
+#define dma_set_pte_writable(p) do {(p).val |= DMA_PTE_WRITE;} while (0)
+#define dma_set_pte_prot(p, prot) \
+		do {(p).val = ((p).val & ~3) | ((prot) & 3); } while (0)
+#define dma_pte_addr(p) ((p).val & PAGE_MASK_4K)
+#define dma_set_pte_addr(p, addr) do {\
+		(p).val |= ((addr) & PAGE_MASK_4K); } while (0)
+#define dma_pte_present(p) (((p).val & 3) != 0)
+
+struct intel_iommu;
+
+struct dmar_domain {
+	int	id;			/* domain id */
+	struct intel_iommu *iommu;	/* back pointer to owning iommu */
+
+	struct list_head devices; 	/* all devices' list */
+	struct iova_domain iovad;	/* iova's that belong to this domain */
+
+	struct dma_pte	*pgd;		/* virtual address */
+	spinlock_t	mapping_lock;	/* page table lock */
+	int		gaw;		/* max guest address width */
+
+	/* adjusted guest address width, 0 is level 2 30-bit */
+	int		agaw;
+
+#define DOMAIN_FLAG_MULTIPLE_DEVICES 1
+	int		flags;
+};
+
+/* PCI domain-device relationship */
+struct device_domain_info {
+	struct list_head link;	/* link to domain siblings */
+	struct list_head global; /* link to global list */
+	u8 bus;			/* PCI bus number */
+	u8 devfn;		/* PCI devfn number */
+	struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
+	struct dmar_domain *domain; /* pointer to domain */
+};
+
+extern int init_dmars(void);
+extern void free_dmar_iommu(struct intel_iommu *iommu);
+
+extern int dmar_disabled;
+
+#ifndef CONFIG_DMAR_GFX_WA
+static inline void iommu_prepare_gfx_mapping(void)
+{
+	return;
+}
+#endif /* !CONFIG_DMAR_GFX_WA */
+
+#endif
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
new file mode 100644
index 00000000000..2e117f30a76
--- /dev/null
+++ b/include/linux/intel-iommu.h
@@ -0,0 +1,327 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) 2006-2008 Intel Corporation
+ * Author: Ashok Raj <ashok.raj@intel.com>
+ * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
+ */
+
+#ifndef _INTEL_IOMMU_H_
+#define _INTEL_IOMMU_H_
+
+#include <linux/types.h>
+#include <linux/msi.h>
+#include <linux/sysdev.h>
+#include <linux/iova.h>
+#include <linux/io.h>
+#include <linux/dma_remapping.h>
+#include <asm/cacheflush.h>
+
+/*
+ * Intel IOMMU register specification per version 1.0 public spec.
+ */
+
+#define	DMAR_VER_REG	0x0	/* Arch version supported by this IOMMU */
+#define	DMAR_CAP_REG	0x8	/* Hardware supported capabilities */
+#define	DMAR_ECAP_REG	0x10	/* Extended capabilities supported */
+#define	DMAR_GCMD_REG	0x18	/* Global command register */
+#define	DMAR_GSTS_REG	0x1c	/* Global status register */
+#define	DMAR_RTADDR_REG	0x20	/* Root entry table */
+#define	DMAR_CCMD_REG	0x28	/* Context command reg */
+#define	DMAR_FSTS_REG	0x34	/* Fault Status register */
+#define	DMAR_FECTL_REG	0x38	/* Fault control register */
+#define	DMAR_FEDATA_REG	0x3c	/* Fault event interrupt data register */
+#define	DMAR_FEADDR_REG	0x40	/* Fault event interrupt addr register */
+#define	DMAR_FEUADDR_REG 0x44	/* Upper address register */
+#define	DMAR_AFLOG_REG	0x58	/* Advanced Fault control */
+#define	DMAR_PMEN_REG	0x64	/* Enable Protected Memory Region */
+#define	DMAR_PLMBASE_REG 0x68	/* PMRR Low addr */
+#define	DMAR_PLMLIMIT_REG 0x6c	/* PMRR low limit */
+#define	DMAR_PHMBASE_REG 0x70	/* pmrr high base addr */
+#define	DMAR_PHMLIMIT_REG 0x78	/* pmrr high limit */
+#define DMAR_IQH_REG	0x80	/* Invalidation queue head register */
+#define DMAR_IQT_REG	0x88	/* Invalidation queue tail register */
+#define DMAR_IQA_REG	0x90	/* Invalidation queue addr register */
+#define DMAR_ICS_REG	0x98	/* Invalidation complete status register */
+#define DMAR_IRTA_REG	0xb8    /* Interrupt remapping table addr register */
+
+#define OFFSET_STRIDE		(9)
+/*
+#define dmar_readl(dmar, reg) readl(dmar + reg)
+#define dmar_readq(dmar, reg) ({ \
+		u32 lo, hi; \
+		lo = readl(dmar + reg); \
+		hi = readl(dmar + reg + 4); \
+		(((u64) hi) << 32) + lo; })
+*/
+static inline u64 dmar_readq(void __iomem *addr)
+{
+	u32 lo, hi;
+	lo = readl(addr);
+	hi = readl(addr + 4);
+	return (((u64) hi) << 32) + lo;
+}
+
+static inline void dmar_writeq(void __iomem *addr, u64 val)
+{
+	writel((u32)val, addr);
+	writel((u32)(val >> 32), addr + 4);
+}
+
+#define DMAR_VER_MAJOR(v)		(((v) & 0xf0) >> 4)
+#define DMAR_VER_MINOR(v)		((v) & 0x0f)
+
+/*
+ * Decoding Capability Register
+ */
+#define cap_read_drain(c)	(((c) >> 55) & 1)
+#define cap_write_drain(c)	(((c) >> 54) & 1)
+#define cap_max_amask_val(c)	(((c) >> 48) & 0x3f)
+#define cap_num_fault_regs(c)	((((c) >> 40) & 0xff) + 1)
+#define cap_pgsel_inv(c)	(((c) >> 39) & 1)
+
+#define cap_super_page_val(c)	(((c) >> 34) & 0xf)
+#define cap_super_offset(c)	(((find_first_bit(&cap_super_page_val(c), 4)) \
+					* OFFSET_STRIDE) + 21)
+
+#define cap_fault_reg_offset(c)	((((c) >> 24) & 0x3ff) * 16)
+#define cap_max_fault_reg_offset(c) \
+	(cap_fault_reg_offset(c) + cap_num_fault_regs(c) * 16)
+
+#define cap_zlr(c)		(((c) >> 22) & 1)
+#define cap_isoch(c)		(((c) >> 23) & 1)
+#define cap_mgaw(c)		((((c) >> 16) & 0x3f) + 1)
+#define cap_sagaw(c)		(((c) >> 8) & 0x1f)
+#define cap_caching_mode(c)	(((c) >> 7) & 1)
+#define cap_phmr(c)		(((c) >> 6) & 1)
+#define cap_plmr(c)		(((c) >> 5) & 1)
+#define cap_rwbf(c)		(((c) >> 4) & 1)
+#define cap_afl(c)		(((c) >> 3) & 1)
+#define cap_ndoms(c)		(((unsigned long)1) << (4 + 2 * ((c) & 0x7)))
+/*
+ * Extended Capability Register
+ */
+
+#define ecap_niotlb_iunits(e)	((((e) >> 24) & 0xff) + 1)
+#define ecap_iotlb_offset(e) 	((((e) >> 8) & 0x3ff) * 16)
+#define ecap_max_iotlb_offset(e) \
+	(ecap_iotlb_offset(e) + ecap_niotlb_iunits(e) * 16)
+#define ecap_coherent(e)	((e) & 0x1)
+#define ecap_qis(e)		((e) & 0x2)
+#define ecap_eim_support(e)	((e >> 4) & 0x1)
+#define ecap_ir_support(e)	((e >> 3) & 0x1)
+#define ecap_max_handle_mask(e) ((e >> 20) & 0xf)
+
+
+/* IOTLB_REG */
+#define DMA_TLB_GLOBAL_FLUSH (((u64)1) << 60)
+#define DMA_TLB_DSI_FLUSH (((u64)2) << 60)
+#define DMA_TLB_PSI_FLUSH (((u64)3) << 60)
+#define DMA_TLB_IIRG(type) ((type >> 60) & 7)
+#define DMA_TLB_IAIG(val) (((val) >> 57) & 7)
+#define DMA_TLB_READ_DRAIN (((u64)1) << 49)
+#define DMA_TLB_WRITE_DRAIN (((u64)1) << 48)
+#define DMA_TLB_DID(id)	(((u64)((id) & 0xffff)) << 32)
+#define DMA_TLB_IVT (((u64)1) << 63)
+#define DMA_TLB_IH_NONLEAF (((u64)1) << 6)
+#define DMA_TLB_MAX_SIZE (0x3f)
+
+/* INVALID_DESC */
+#define DMA_ID_TLB_GLOBAL_FLUSH	(((u64)1) << 3)
+#define DMA_ID_TLB_DSI_FLUSH	(((u64)2) << 3)
+#define DMA_ID_TLB_PSI_FLUSH	(((u64)3) << 3)
+#define DMA_ID_TLB_READ_DRAIN	(((u64)1) << 7)
+#define DMA_ID_TLB_WRITE_DRAIN	(((u64)1) << 6)
+#define DMA_ID_TLB_DID(id)	(((u64)((id & 0xffff) << 16)))
+#define DMA_ID_TLB_IH_NONLEAF	(((u64)1) << 6)
+#define DMA_ID_TLB_ADDR(addr)	(addr)
+#define DMA_ID_TLB_ADDR_MASK(mask)	(mask)
+
+/* PMEN_REG */
+#define DMA_PMEN_EPM (((u32)1)<<31)
+#define DMA_PMEN_PRS (((u32)1)<<0)
+
+/* GCMD_REG */
+#define DMA_GCMD_TE (((u32)1) << 31)
+#define DMA_GCMD_SRTP (((u32)1) << 30)
+#define DMA_GCMD_SFL (((u32)1) << 29)
+#define DMA_GCMD_EAFL (((u32)1) << 28)
+#define DMA_GCMD_WBF (((u32)1) << 27)
+#define DMA_GCMD_QIE (((u32)1) << 26)
+#define DMA_GCMD_SIRTP (((u32)1) << 24)
+#define DMA_GCMD_IRE (((u32) 1) << 25)
+
+/* GSTS_REG */
+#define DMA_GSTS_TES (((u32)1) << 31)
+#define DMA_GSTS_RTPS (((u32)1) << 30)
+#define DMA_GSTS_FLS (((u32)1) << 29)
+#define DMA_GSTS_AFLS (((u32)1) << 28)
+#define DMA_GSTS_WBFS (((u32)1) << 27)
+#define DMA_GSTS_QIES (((u32)1) << 26)
+#define DMA_GSTS_IRTPS (((u32)1) << 24)
+#define DMA_GSTS_IRES (((u32)1) << 25)
+
+/* CCMD_REG */
+#define DMA_CCMD_ICC (((u64)1) << 63)
+#define DMA_CCMD_GLOBAL_INVL (((u64)1) << 61)
+#define DMA_CCMD_DOMAIN_INVL (((u64)2) << 61)
+#define DMA_CCMD_DEVICE_INVL (((u64)3) << 61)
+#define DMA_CCMD_FM(m) (((u64)((m) & 0x3)) << 32)
+#define DMA_CCMD_MASK_NOBIT 0
+#define DMA_CCMD_MASK_1BIT 1
+#define DMA_CCMD_MASK_2BIT 2
+#define DMA_CCMD_MASK_3BIT 3
+#define DMA_CCMD_SID(s) (((u64)((s) & 0xffff)) << 16)
+#define DMA_CCMD_DID(d) ((u64)((d) & 0xffff))
+
+/* FECTL_REG */
+#define DMA_FECTL_IM (((u32)1) << 31)
+
+/* FSTS_REG */
+#define DMA_FSTS_PPF ((u32)2)
+#define DMA_FSTS_PFO ((u32)1)
+#define dma_fsts_fault_record_index(s) (((s) >> 8) & 0xff)
+
+/* FRCD_REG, 32 bits access */
+#define DMA_FRCD_F (((u32)1) << 31)
+#define dma_frcd_type(d) ((d >> 30) & 1)
+#define dma_frcd_fault_reason(c) (c & 0xff)
+#define dma_frcd_source_id(c) (c & 0xffff)
+#define dma_frcd_page_addr(d) (d & (((u64)-1) << 12)) /* low 64 bit */
+
+#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) /* 10sec */
+
+#define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \
+{\
+	cycles_t start_time = get_cycles();\
+	while (1) {\
+		sts = op (iommu->reg + offset);\
+		if (cond)\
+			break;\
+		if (DMAR_OPERATION_TIMEOUT < (get_cycles() - start_time))\
+			panic("DMAR hardware is malfunctioning\n");\
+		cpu_relax();\
+	}\
+}
+
+#define QI_LENGTH	256	/* queue length */
+
+enum {
+	QI_FREE,
+	QI_IN_USE,
+	QI_DONE
+};
+
+#define QI_CC_TYPE		0x1
+#define QI_IOTLB_TYPE		0x2
+#define QI_DIOTLB_TYPE		0x3
+#define QI_IEC_TYPE		0x4
+#define QI_IWD_TYPE		0x5
+
+#define QI_IEC_SELECTIVE	(((u64)1) << 4)
+#define QI_IEC_IIDEX(idx)	(((u64)(idx & 0xffff) << 32))
+#define QI_IEC_IM(m)		(((u64)(m & 0x1f) << 27))
+
+#define QI_IWD_STATUS_DATA(d)	(((u64)d) << 32)
+#define QI_IWD_STATUS_WRITE	(((u64)1) << 5)
+
+struct qi_desc {
+	u64 low, high;
+};
+
+struct q_inval {
+	spinlock_t      q_lock;
+	struct qi_desc  *desc;          /* invalidation queue */
+	int             *desc_status;   /* desc status */
+	int             free_head;      /* first free entry */
+	int             free_tail;      /* last free entry */
+	int             free_cnt;
+};
+
+#ifdef CONFIG_INTR_REMAP
+/* 1MB - maximum possible interrupt remapping table size */
+#define INTR_REMAP_PAGE_ORDER	8
+#define INTR_REMAP_TABLE_REG_SIZE	0xf
+
+#define INTR_REMAP_TABLE_ENTRIES	65536
+
+struct ir_table {
+	struct irte *base;
+};
+#endif
+
+struct intel_iommu {
+	void __iomem	*reg; /* Pointer to hardware regs, virtual addr */
+	u64		cap;
+	u64		ecap;
+	int		seg;
+	u32		gcmd; /* Holds TE, EAFL. Don't need SRTP, SFL, WBF */
+	spinlock_t	register_lock; /* protect register handling */
+	int		seq_id;	/* sequence id of the iommu */
+
+#ifdef CONFIG_DMAR
+	unsigned long 	*domain_ids; /* bitmap of domains */
+	struct dmar_domain **domains; /* ptr to domains */
+	spinlock_t	lock; /* protect context, domain ids */
+	struct root_entry *root_entry; /* virtual address */
+
+	unsigned int irq;
+	unsigned char name[7];    /* Device Name */
+	struct msi_msg saved_msg;
+	struct sys_device sysdev;
+#endif
+	struct q_inval  *qi;            /* Queued invalidation info */
+#ifdef CONFIG_INTR_REMAP
+	struct ir_table *ir_table;	/* Interrupt remapping info */
+#endif
+};
+
+static inline void __iommu_flush_cache(
+	struct intel_iommu *iommu, void *addr, int size)
+{
+	if (!ecap_coherent(iommu->ecap))
+		clflush_cache_range(addr, size);
+}
+
+extern struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev *dev);
+
+extern int alloc_iommu(struct dmar_drhd_unit *drhd);
+extern void free_iommu(struct intel_iommu *iommu);
+extern int dmar_enable_qi(struct intel_iommu *iommu);
+extern void qi_global_iec(struct intel_iommu *iommu);
+
+extern void qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu);
+
+void intel_iommu_domain_exit(struct dmar_domain *domain);
+struct dmar_domain *intel_iommu_domain_alloc(struct pci_dev *pdev);
+int intel_iommu_context_mapping(struct dmar_domain *domain,
+				struct pci_dev *pdev);
+int intel_iommu_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
+			     u64 hpa, size_t size, int prot);
+void intel_iommu_detach_dev(struct dmar_domain *domain, u8 bus, u8 devfn);
+struct dmar_domain *intel_iommu_find_domain(struct pci_dev *pdev);
+u64 intel_iommu_iova_to_pfn(struct dmar_domain *domain, u64 iova);
+
+#ifdef CONFIG_DMAR
+int intel_iommu_found(void);
+#else /* CONFIG_DMAR */
+static inline int intel_iommu_found(void)
+{
+	return 0;
+}
+#endif /* CONFIG_DMAR */
+
+#endif
diff --git a/include/linux/iova.h b/include/linux/iova.h
new file mode 100644
index 00000000000..228f6c94b69
--- /dev/null
+++ b/include/linux/iova.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This file is released under the GPLv2.
+ *
+ * Copyright (C) 2006-2008 Intel Corporation
+ * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
+ *
+ */
+
+#ifndef _IOVA_H_
+#define _IOVA_H_
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/rbtree.h>
+#include <linux/dma-mapping.h>
+
+/* IO virtual address start page frame number */
+#define IOVA_START_PFN		(1)
+
+/* iova structure */
+struct iova {
+	struct rb_node	node;
+	unsigned long	pfn_hi; /* IOMMU dish out addr hi */
+	unsigned long	pfn_lo; /* IOMMU dish out addr lo */
+};
+
+/* holds all the iova translations for a domain */
+struct iova_domain {
+	spinlock_t	iova_alloc_lock;/* Lock to protect iova  allocation */
+	spinlock_t	iova_rbtree_lock; /* Lock to protect update of rbtree */
+	struct rb_root	rbroot;		/* iova domain rbtree root */
+	struct rb_node	*cached32_node; /* Save last alloced node */
+	unsigned long	dma_32bit_pfn;
+};
+
+struct iova *alloc_iova_mem(void);
+void free_iova_mem(struct iova *iova);
+void free_iova(struct iova_domain *iovad, unsigned long pfn);
+void __free_iova(struct iova_domain *iovad, struct iova *iova);
+struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size,
+	unsigned long limit_pfn,
+	bool size_aligned);
+struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo,
+	unsigned long pfn_hi);
+void copy_reserved_iova(struct iova_domain *from, struct iova_domain *to);
+void init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit);
+struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn);
+void put_iova_domain(struct iova_domain *iovad);
+
+#endif
-- 
cgit v1.2.3


From 62c476c7c7f25a5b245b9902a935636e6316e58c Mon Sep 17 00:00:00 2001
From: Ben-Ami Yassour <benami@il.ibm.com>
Date: Sun, 14 Sep 2008 03:48:28 +0300
Subject: KVM: Device Assignment with VT-d

Based on a patch by: Kay, Allen M <allen.m.kay@intel.com>

This patch enables PCI device assignment based on VT-d support.
When a device is assigned to the guest, the guest memory is pinned and
the mapping is updated in the VT-d IOMMU.

[Amit: Expose KVM_CAP_IOMMU so we can check if an IOMMU is present
and also control enable/disable from userspace]

Signed-off-by: Kay, Allen M <allen.m.kay@intel.com>
Signed-off-by: Weidong Han <weidong.han@intel.com>
Signed-off-by: Ben-Ami Yassour <benami@il.ibm.com>
Signed-off-by: Amit Shah <amit.shah@qumranet.com>

Acked-by: Mark Gross <mgross@linux.intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 include/linux/kvm.h      |  3 +++
 include/linux/kvm_host.h | 52 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index ef4bc6f8977..4269be171fa 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -384,6 +384,7 @@ struct kvm_trace_rec {
 #define KVM_CAP_COALESCED_MMIO 15
 #define KVM_CAP_SYNC_MMU 16  /* Changes to host mmap are reflected in guest */
 #define KVM_CAP_DEVICE_ASSIGNMENT 17
+#define KVM_CAP_IOMMU 18
 
 /*
  * ioctls for VM fds
@@ -495,4 +496,6 @@ struct kvm_assigned_irq {
 	__u32 flags;
 };
 
+#define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
+
 #endif
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 4b036430ea2..6252802c3cc 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -286,6 +286,53 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v);
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
 
+struct kvm_irq_ack_notifier {
+	struct hlist_node link;
+	unsigned gsi;
+	void (*irq_acked)(struct kvm_irq_ack_notifier *kian);
+};
+
+struct kvm_assigned_dev_kernel {
+	struct kvm_irq_ack_notifier ack_notifier;
+	struct work_struct interrupt_work;
+	struct list_head list;
+	int assigned_dev_id;
+	int host_busnr;
+	int host_devfn;
+	int host_irq;
+	int guest_irq;
+	int irq_requested;
+	struct pci_dev *dev;
+	struct kvm *kvm;
+};
+
+#ifdef CONFIG_DMAR
+int kvm_iommu_map_pages(struct kvm *kvm, gfn_t base_gfn,
+			unsigned long npages);
+int kvm_iommu_map_guest(struct kvm *kvm,
+			struct kvm_assigned_dev_kernel *assigned_dev);
+int kvm_iommu_unmap_guest(struct kvm *kvm);
+#else /* CONFIG_DMAR */
+static inline int kvm_iommu_map_pages(struct kvm *kvm,
+				      gfn_t base_gfn,
+				      unsigned long npages)
+{
+	return 0;
+}
+
+static inline int kvm_iommu_map_guest(struct kvm *kvm,
+				      struct kvm_assigned_dev_kernel
+				      *assigned_dev)
+{
+	return -ENODEV;
+}
+
+static inline int kvm_iommu_unmap_guest(struct kvm *kvm)
+{
+	return 0;
+}
+#endif /* CONFIG_DMAR */
+
 static inline void kvm_guest_enter(void)
 {
 	account_system_vtime(current);
@@ -308,6 +355,11 @@ static inline gpa_t gfn_to_gpa(gfn_t gfn)
 	return (gpa_t)gfn << PAGE_SHIFT;
 }
 
+static inline hpa_t pfn_to_hpa(pfn_t pfn)
+{
+	return (hpa_t)pfn << PAGE_SHIFT;
+}
+
 static inline void kvm_migrate_timers(struct kvm_vcpu *vcpu)
 {
 	set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests);
-- 
cgit v1.2.3


From 4731d4c7a07769cf2926c327177b97bb8c68cafc Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 23 Sep 2008 13:18:39 -0300
Subject: KVM: MMU: out of sync shadow core

Allow guest pagetables to go out of sync.  Instead of emulating write
accesses to guest pagetables, or unshadowing them, we un-write-protect
the page table and allow the guest to modify it at will.  We rely on
invlpg executions to synchronize individual ptes, and will synchronize
the entire pagetable on tlb flushes.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 include/linux/kvm_host.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 6252802c3cc..73b7c52b949 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -35,6 +35,7 @@
 #define KVM_REQ_TRIPLE_FAULT       4
 #define KVM_REQ_PENDING_TIMER      5
 #define KVM_REQ_UNHALT             6
+#define KVM_REQ_MMU_SYNC           7
 
 struct kvm_vcpu;
 extern struct kmem_cache *kvm_vcpu_cache;
-- 
cgit v1.2.3


From 8a98f6648a2b0756d8f26d6c13332f5526355fec Mon Sep 17 00:00:00 2001
From: Xiantao Zhang <xiantao.zhang@intel.com>
Date: Mon, 6 Oct 2008 13:47:38 +0800
Subject: KVM: Move device assignment logic to common code

To share with other archs, this patch moves device assignment
logic to common parts.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 include/linux/kvm.h      | 2 ++
 include/linux/kvm_host.h | 1 +
 2 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 4269be171fa..9acf34a6dfb 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -383,7 +383,9 @@ struct kvm_trace_rec {
 #define KVM_CAP_MP_STATE 14
 #define KVM_CAP_COALESCED_MMIO 15
 #define KVM_CAP_SYNC_MMU 16  /* Changes to host mmap are reflected in guest */
+#ifdef CONFIG_X86
 #define KVM_CAP_DEVICE_ASSIGNMENT 17
+#endif
 #define KVM_CAP_IOMMU 18
 
 /*
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 73b7c52b949..10c1146cd00 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -281,6 +281,7 @@ void kvm_free_physmem(struct kvm *kvm);
 
 struct  kvm *kvm_arch_create_vm(void);
 void kvm_arch_destroy_vm(struct kvm *kvm);
+void kvm_free_all_assigned_devices(struct kvm *kvm);
 
 int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
 int kvm_cpu_has_interrupt(struct kvm_vcpu *v);
-- 
cgit v1.2.3


From c77fb9dc7a0383c86eabef30272a763a482403e1 Mon Sep 17 00:00:00 2001
From: Xiantao Zhang <xiantao.zhang@intel.com>
Date: Sat, 27 Sep 2008 10:55:40 +0800
Subject: KVM: Change is_mmio_pfn to kvm_is_mmio_pfn, and make it common for
 all archs

Add a kvm_ prefix to avoid polluting kernel's name space.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 include/linux/kvm_host.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 10c1146cd00..b3b7598b4d9 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -288,6 +288,8 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v);
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
 
+int kvm_is_mmio_pfn(pfn_t pfn);
+
 struct kvm_irq_ack_notifier {
 	struct hlist_node link;
 	unsigned gsi;
-- 
cgit v1.2.3


From 3de42dc094ecd313dc7d551e007a134b52f8663d Mon Sep 17 00:00:00 2001
From: Xiantao Zhang <xiantao.zhang@intel.com>
Date: Mon, 6 Oct 2008 13:48:45 +0800
Subject: KVM: Separate irq ack notification out of arch/x86/kvm/irq.c

Move the irq ack notification logic into common code so that
it can be shared with the ia64 side.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 include/linux/kvm_host.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b3b7598b4d9..3833c48fae3 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -309,6 +309,12 @@ struct kvm_assigned_dev_kernel {
 	struct pci_dev *dev;
 	struct kvm *kvm;
 };
+void kvm_set_irq(struct kvm *kvm, int irq, int level);
+void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi);
+void kvm_register_irq_ack_notifier(struct kvm *kvm,
+				   struct kvm_irq_ack_notifier *kian);
+void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
+				     struct kvm_irq_ack_notifier *kian);
 
 #ifdef CONFIG_DMAR
 int kvm_iommu_map_pages(struct kvm *kvm, gfn_t base_gfn,
-- 
cgit v1.2.3


From 2381ad241d0bea1253a37f314b270848067640bb Mon Sep 17 00:00:00 2001
From: Xiantao Zhang <xiantao.zhang@intel.com>
Date: Wed, 8 Oct 2008 08:29:33 +0800
Subject: KVM: ia64: Add intel iommu support for guests.

With intel iommu hardware, we can assign devices to kvm/ia64 guests.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 include/linux/kvm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 9acf34a6dfb..797fcd78124 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -383,7 +383,7 @@ struct kvm_trace_rec {
 #define KVM_CAP_MP_STATE 14
 #define KVM_CAP_COALESCED_MMIO 15
 #define KVM_CAP_SYNC_MMU 16  /* Changes to host mmap are reflected in guest */
-#ifdef CONFIG_X86
+#if defined(CONFIG_X86)||defined(CONFIG_IA64)
 #define KVM_CAP_DEVICE_ASSIGNMENT 17
 #endif
 #define KVM_CAP_IOMMU 18
-- 
cgit v1.2.3


From be585c07dd577faac26014db4246e6d7c7a131e7 Mon Sep 17 00:00:00 2001
From: Jay Fenlason <fenlason@redhat.com>
Date: Wed, 1 Oct 2008 18:13:20 -0400
Subject: firewire: Add more documentation to firewire-cdev.h

Signed-off-by: Jay Fenlason <fenlason@redhat.com>
Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
---
 include/linux/firewire-cdev.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/firewire-cdev.h b/include/linux/firewire-cdev.h
index 0f0e271f97f..4d078e99c01 100644
--- a/include/linux/firewire-cdev.h
+++ b/include/linux/firewire-cdev.h
@@ -154,8 +154,13 @@ struct fw_cdev_event_iso_interrupt {
  * @request:       Valid if @common.type == %FW_CDEV_EVENT_REQUEST
  * @iso_interrupt: Valid if @common.type == %FW_CDEV_EVENT_ISO_INTERRUPT
  *
- * Convenience union for userspace use.  Events could be read(2) into a char
- * buffer and then cast to this union for further processing.
+ * Convenience union for userspace use.  Events could be read(2) into an
+ * appropriately aligned char buffer and then cast to this union for further
+ * processing.  Note that for a request, response or iso_interrupt event,
+ * the data[] or header[] may make the size of the full event larger than
+ * sizeof(union fw_cdev_event).  Also note that if you attempt to read(2)
+ * an event into a buffer that is not large enough for it, the data that does
+ * not fit will be discarded so that the next read(2) will return a new event.
  */
 union fw_cdev_event {
 	struct fw_cdev_event_common common;
-- 
cgit v1.2.3


From 22441cfa0c70dcd457f3c081fcf285c3bd155824 Mon Sep 17 00:00:00 2001
From: Pedro Ribeiro <pribeiro@net.ipl.pt>
Date: Wed, 15 Oct 2008 15:47:49 -0700
Subject: IPV6: Fix default gateway criteria wrt. HIGH/LOW preference radv
 option

Problem observed:
               In IPv6, in the presence of multiple routers that are
               candidates for default gateway on one segment, each sending
               a different preference value, the Linux hosts connected to
               the segment weren't selecting the right one in all the
               possible combinations of LOW/MEDIUM/HIGH preference.

This patch changes two files:
include/linux/icmpv6.h
               Get the "router_pref" bitfield in the right place
               (as RFC4191 says), and name the bit freed by this fix
               "home_agent" (RFC3775 says that is its function)

net/ipv6/ndisc.c
               Corrects the binary logic behind the updating of the
               router preference in the flags of the routing table

Result:
               With these two fixes applied, the default route used by
               the system is consistent with the rules mentioned
               in RFC4191 when the preference value in router
               advertisements changes

Signed-off-by: Pedro Ribeiro <pribeiro@net.ipl.pt>
Acked-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/icmpv6.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/icmpv6.h b/include/linux/icmpv6.h
index 03067443198..a93a8dd3311 100644
--- a/include/linux/icmpv6.h
+++ b/include/linux/icmpv6.h
@@ -40,16 +40,18 @@ struct icmp6hdr {
                 struct icmpv6_nd_ra {
 			__u8		hop_limit;
 #if defined(__LITTLE_ENDIAN_BITFIELD)
-			__u8		reserved:4,
+			__u8		reserved:3,
 					router_pref:2,
+					home_agent:1,
 					other:1,
 					managed:1;
 
 #elif defined(__BIG_ENDIAN_BITFIELD)
 			__u8		managed:1,
 					other:1,
+					home_agent:1,
 					router_pref:2,
-					reserved:4;
+					reserved:3;
 #else
 #error	"Please fix <asm/byteorder.h>"
 #endif
-- 
cgit v1.2.3


From 29d434b39c807320fbe4bcdce0ab98a0b9fcb285 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 16 Oct 2008 16:08:57 +0200
Subject: fuse: add include protectors

Add include protectors to include/linux/fuse.h and fs/fuse/fuse_i.h.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 include/linux/fuse.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fuse.h b/include/linux/fuse.h
index 265635dc990..8bc1101e9b3 100644
--- a/include/linux/fuse.h
+++ b/include/linux/fuse.h
@@ -19,6 +19,9 @@
  *  - add file flags field to fuse_read_in and fuse_write_in
  */
 
+#ifndef _LINUX_FUSE_H
+#define _LINUX_FUSE_H
+
 #include <asm/types.h>
 #include <linux/major.h>
 
@@ -409,3 +412,5 @@ struct fuse_dirent {
 #define FUSE_DIRENT_ALIGN(x) (((x) + sizeof(__u64) - 1) & ~(sizeof(__u64) - 1))
 #define FUSE_DIRENT_SIZE(d) \
 	FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + (d)->namelen)
+
+#endif /* _LINUX_FUSE_H */
-- 
cgit v1.2.3


From a7c1b990f71574e077b94ce4582e2cf11cb891fe Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 16 Oct 2008 16:08:57 +0200
Subject: fuse: implement nonseekable open

Let the client request nonseekable open using FOPEN_NONSEEKABLE and
call nonseekable_open() on the file if requested.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 include/linux/fuse.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fuse.h b/include/linux/fuse.h
index 8bc1101e9b3..350fe9767bb 100644
--- a/include/linux/fuse.h
+++ b/include/linux/fuse.h
@@ -17,6 +17,9 @@
  *  - add lock_owner field to fuse_setattr_in, fuse_read_in and fuse_write_in
  *  - add blksize field to fuse_attr
  *  - add file flags field to fuse_read_in and fuse_write_in
+ *
+ * 7.10
+ *  - add nonseekable open flag
  */
 
 #ifndef _LINUX_FUSE_H
@@ -29,7 +32,7 @@
 #define FUSE_KERNEL_VERSION 7
 
 /** Minor version number of this interface */
-#define FUSE_KERNEL_MINOR_VERSION 9
+#define FUSE_KERNEL_MINOR_VERSION 10
 
 /** The node ID of the root inode */
 #define FUSE_ROOT_ID 1
@@ -101,9 +104,11 @@ struct fuse_file_lock {
  *
  * FOPEN_DIRECT_IO: bypass page cache for this open file
  * FOPEN_KEEP_CACHE: don't invalidate the data cache on open
+ * FOPEN_NONSEEKABLE: the file is not seekable
  */
 #define FOPEN_DIRECT_IO		(1 << 0)
 #define FOPEN_KEEP_CACHE	(1 << 1)
+#define FOPEN_NONSEEKABLE	(1 << 2)
 
 /**
  * INIT request/reply flags
-- 
cgit v1.2.3


From 17bc6c30cf6bfffd816bdc53682dd46fc34a2cf4 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Thu, 16 Oct 2008 10:09:17 -0400
Subject: vfs: Add no_nrwrite_index_update writeback control flag

If no_nrwrite_index_update is set we don't update nr_to_write and
address space writeback_index in write_cache_pages.  This change
enables a file system to skip these updates in write_cache_pages and do
them in the writepages() callback.  This patch will be followed by an
ext4 patch that makes use of this new flag.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
CC: linux-fsdevel@vger.kernel.org
---
 include/linux/writeback.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index bd91987c065..e585657e983 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -63,6 +63,15 @@ struct writeback_control {
 	unsigned for_writepages:1;	/* This is a writepages() call */
 	unsigned range_cyclic:1;	/* range_start is cyclic */
 	unsigned more_io:1;		/* more io to be dispatched */
+	/*
+	 * write_cache_pages() won't update wbc->nr_to_write and
+	 * mapping->writeback_index if no_nrwrite_index_update
+	 * is set.  write_cache_pages() may write more than we
+	 * requested and we want to make sure nr_to_write and
+	 * writeback_index are updated in a consistent manner
+	 * so we use a single control to update them
+	 */
+	unsigned no_nrwrite_index_update:1;
 };
 
 /*
-- 
cgit v1.2.3


From 3ddfda11861d305b02ed810b522dcf48b74ca808 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:49:43 -0700
Subject: generic: add dyn_array support

Allow crazy big arrays via bootmem at init stage.
Architectures use CONFIG_HAVE_DYN_ARRAY to enable it.

usage:

| static struct irq_desc irq_desc_init __initdata = {
|        .status = IRQ_DISABLED,
|        .chip = &no_irq_chip,
|        .handle_irq = handle_bad_irq,
|        .depth = 1,
|        .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock),
| #ifdef CONFIG_SMP
|        .affinity = CPU_MASK_ALL
| #endif
| };
|
| static void __init init_work(void *data)
| {
|        struct dyn_array *da = data;
|        struct  irq_desc *desc;
|        int i;
|
|        desc = *da->name;
|
|        for (i = 0; i < *da->nr; i++)
|                memcpy(&desc[i], &irq_desc_init, sizeof(struct irq_desc));
| }
|
| struct irq_desc *irq_desc;
| DEFINE_DYN_ARRAY(irq_desc, sizeof(struct irq_desc), nr_irqs, PAGE_SIZE, init_work);

after pre_alloc_dyn_array() after setup_arch(), the array is ready to be
used.

Via this facility we can replace irq_desc[NR_IRQS] array with dyn_array
irq_desc[nr_irqs].

v2: remove _nopanic in pre_alloc_dyn_array()

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/init.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/init.h b/include/linux/init.h
index 93538b696e3..cf9fa7f174a 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -246,6 +246,29 @@ struct obs_kernel_param {
 
 /* Relies on boot_command_line being set */
 void __init parse_early_param(void);
+
+struct dyn_array {
+	void **name;
+	unsigned long size;
+	unsigned int *nr;
+	unsigned long align;
+	void (*init_work)(void *);
+};
+extern struct dyn_array *__dyn_array_start[], *__dyn_array_end[];
+
+#define DEFINE_DYN_ARRAY(nameX, sizeX, nrX, alignX, init_workX) \
+		static struct dyn_array __dyn_array_##nameX __initdata = \
+		{	.name = (void **)&nameX,\
+			.size = sizeX,\
+			.nr   = &nrX,\
+			.align = alignX,\
+			.init_work = init_workX,\
+		}; \
+		static struct dyn_array *__dyn_array_ptr_##nameX __used \
+		__attribute__((__section__(".dyn_array.init"))) = \
+			&__dyn_array_##nameX
+
+extern void pre_alloc_dyn_array(void);
 #endif /* __ASSEMBLY__ */
 
 /**
-- 
cgit v1.2.3


From 1f3fcd4b1adc972d5c6a34cfed98931c46575b49 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:49:44 -0700
Subject: add per_cpu_dyn_array support

allow dyn-array in per_cpu area, allocated dynamically.

usage:

|  /* in .h */
| struct kernel_stat {
|        struct cpu_usage_stat   cpustat;
|        unsigned int *irqs;
| };
|
|  /* in .c */
| DEFINE_PER_CPU(struct kernel_stat, kstat);
|
| DEFINE_PER_CPU_DYN_ARRAY_ADDR(per_cpu__kstat_irqs, per_cpu__kstat.irqs, sizeof(unsigned int), nr_irqs, sizeof(unsigned long), NULL);

after setup_percpu()/per_cpu_alloc_dyn_array(), the dyn_array in
per_cpu area is ready to use.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/init.h | 27 ++++++++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/init.h b/include/linux/init.h
index cf9fa7f174a..332806826b8 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -255,12 +255,13 @@ struct dyn_array {
 	void (*init_work)(void *);
 };
 extern struct dyn_array *__dyn_array_start[], *__dyn_array_end[];
+extern struct dyn_array *__per_cpu_dyn_array_start[], *__per_cpu_dyn_array_end[];
 
-#define DEFINE_DYN_ARRAY(nameX, sizeX, nrX, alignX, init_workX) \
+#define DEFINE_DYN_ARRAY_ADDR(nameX, addrX, sizeX, nrX, alignX, init_workX) \
 		static struct dyn_array __dyn_array_##nameX __initdata = \
-		{	.name = (void **)&nameX,\
+		{	.name = (void **)&(nameX),\
 			.size = sizeX,\
-			.nr   = &nrX,\
+			.nr   = &(nrX),\
 			.align = alignX,\
 			.init_work = init_workX,\
 		}; \
@@ -268,7 +269,27 @@ extern struct dyn_array *__dyn_array_start[], *__dyn_array_end[];
 		__attribute__((__section__(".dyn_array.init"))) = \
 			&__dyn_array_##nameX
 
+#define DEFINE_DYN_ARRAY(nameX, sizeX, nrX, alignX, init_workX) \
+	DEFINE_DYN_ARRAY_ADDR(nameX, nameX, sizeX, nrX, alignX, init_workX)
+
+#define DEFINE_PER_CPU_DYN_ARRAY_ADDR(nameX, addrX, sizeX, nrX, alignX, init_workX) \
+		static struct dyn_array __per_cpu_dyn_array_##nameX __initdata = \
+		{	.name = (void **)&(addrX),\
+			.size = sizeX,\
+			.nr   = &(nrX),\
+			.align = alignX,\
+			.init_work = init_workX,\
+		}; \
+		static struct dyn_array *__per_cpu_dyn_array_ptr_##nameX __used \
+		__attribute__((__section__(".per_cpu_dyn_array.init"))) = \
+			&__per_cpu_dyn_array_##nameX
+
+#define DEFINE_PER_CPU_DYN_ARRAY(nameX, sizeX, nrX, alignX, init_workX) \
+	DEFINE_PER_CPU_DYN_ARRAY_ADDR(nameX, nameX, nrX, alignX, init_workX)
+
 extern void pre_alloc_dyn_array(void);
+extern unsigned long per_cpu_dyn_array_size(void);
+extern void per_cpu_alloc_dyn_array(int cpu, char *ptr);
 #endif /* __ASSEMBLY__ */
 
 /**
-- 
cgit v1.2.3


From 1f8ff037a871690c762d267d8a052529d3102fc9 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:49:45 -0700
Subject: x86: alloc dyn_array all together

so we can save some memory with small alignment in bootmem

also tighten the alignment checking, and print out less debug info.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/init.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/init.h b/include/linux/init.h
index 332806826b8..59fbb4aaba6 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -288,7 +288,7 @@ extern struct dyn_array *__per_cpu_dyn_array_start[], *__per_cpu_dyn_array_end[]
 	DEFINE_PER_CPU_DYN_ARRAY_ADDR(nameX, nameX, nrX, alignX, init_workX)
 
 extern void pre_alloc_dyn_array(void);
-extern unsigned long per_cpu_dyn_array_size(void);
+extern unsigned long per_cpu_dyn_array_size(unsigned long *align);
 extern void per_cpu_alloc_dyn_array(int cpu, char *ptr);
 #endif /* __ASSEMBLY__ */
 
-- 
cgit v1.2.3


From 85c0f90978bf50596dbd23854648020f1f9b5bfd Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:49:47 -0700
Subject: irq: introduce nr_irqs

at this point nr_irqs is equal to NR_IRQS

convert a few easy users from NR_IRQS to dynamic nr_irqs.

v2: according to Eric, we need to take care of arch without generic_hardirqs

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/interrupt.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 58ff4e74b2f..511803853a5 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -15,6 +15,8 @@
 #include <asm/ptrace.h>
 #include <asm/system.h>
 
+extern int nr_irqs;
+
 /*
  * These correspond to the IORESOURCE_IRQ_* defines in
  * linux/ioport.h to select the interrupt line behaviour.  When
-- 
cgit v1.2.3


From d60458b224d6b997a582a05cb8c4b9bed9e17a1d Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:50:00 -0700
Subject: irq: make irq_desc to use dyn_array

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/irq.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 1d73d1abb83..5f4b013624d 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -179,7 +179,11 @@ struct irq_desc {
 	const char		*name;
 } ____cacheline_internodealigned_in_smp;
 
+#ifdef CONFIG_HAVE_DYN_ARRAY
+extern struct irq_desc *irq_desc;
+#else
 extern struct irq_desc irq_desc[NR_IRQS];
+#endif
 
 /*
  * Migration helpers for obsolete names, they will go away:
-- 
cgit v1.2.3


From d17a55ded3393ad3878010bb3a8243a15a8d8df5 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:50:01 -0700
Subject: irq: make irqs in kernel stat use per_cpu_dyn_array

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/kernel_stat.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index cf9f40a91c9..fe1f7fe534b 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -28,7 +28,11 @@ struct cpu_usage_stat {
 
 struct kernel_stat {
 	struct cpu_usage_stat	cpustat;
+#ifdef CONFIG_HAVE_DYN_ARRAY
+	unsigned int *irqs;
+#else
 	unsigned int irqs[NR_IRQS];
+#endif
 };
 
 DECLARE_PER_CPU(struct kernel_stat, kstat);
-- 
cgit v1.2.3


From 08678b0841267c1d00d771fe01548d86043d065e Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:50:05 -0700
Subject: generic: sparse irqs: use irq_desc() together with dyn_array, instead
 of irq_desc[]

add CONFIG_HAVE_SPARSE_IRQ to use a condensed array.
Get rid of irq_desc[] array assumptions.

Preallocate 32 irq_desc, and irq_desc() will try to get more.

( No change in functionality is expected anywhere, except the odd build
  failure where we missed a code site or where a crossing commit introduces
  new irq_desc[] usage. )

v2: according to Eric, change get_irq_desc() to irq_desc()

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/irq.h | 32 +++++++++++++++++++++-----------
 1 file changed, 21 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 5f4b013624d..80b8200f2ad 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -152,6 +152,10 @@ struct irq_chip {
  * @name:		flow handler name for /proc/interrupts output
  */
 struct irq_desc {
+	unsigned int		irq;
+#ifdef CONFIG_HAVE_SPARSE_IRQ
+	struct irq_desc		*next;
+#endif
 	irq_flow_handler_t	handle_irq;
 	struct irq_chip		*chip;
 	struct msi_desc		*msi_desc;
@@ -179,9 +183,9 @@ struct irq_desc {
 	const char		*name;
 } ____cacheline_internodealigned_in_smp;
 
-#ifdef CONFIG_HAVE_DYN_ARRAY
-extern struct irq_desc *irq_desc;
-#else
+extern struct irq_desc *irq_to_desc(unsigned int irq);
+#ifndef CONFIG_HAVE_DYN_ARRAY
+/* could be removed if we get rid of all irq_desc reference */
 extern struct irq_desc irq_desc[NR_IRQS];
 #endif
 
@@ -249,7 +253,10 @@ extern int no_irq_affinity;
 
 static inline int irq_balancing_disabled(unsigned int irq)
 {
-	return irq_desc[irq].status & IRQ_NO_BALANCING_MASK;
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+	return desc->status & IRQ_NO_BALANCING_MASK;
 }
 
 /* Handle irq action chains: */
@@ -281,7 +288,7 @@ extern unsigned int __do_IRQ(unsigned int irq);
  */
 static inline void generic_handle_irq(unsigned int irq)
 {
-	struct irq_desc *desc = irq_desc + irq;
+	struct irq_desc *desc = irq_to_desc(irq);
 
 #ifdef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
 	desc->handle_irq(irq, desc);
@@ -325,7 +332,10 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
 static inline void __set_irq_handler_unlocked(int irq,
 					      irq_flow_handler_t handler)
 {
-	irq_desc[irq].handle_irq = handler;
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+	desc->handle_irq = handler;
 }
 
 /*
@@ -359,7 +369,7 @@ extern void destroy_irq(unsigned int irq);
 /* Test to see if a driver has successfully requested an irq */
 static inline int irq_has_action(unsigned int irq)
 {
-	struct irq_desc *desc = irq_desc + irq;
+	struct irq_desc *desc = irq_to_desc(irq);
 	return desc->action != NULL;
 }
 
@@ -374,10 +384,10 @@ extern int set_irq_chip_data(unsigned int irq, void *data);
 extern int set_irq_type(unsigned int irq, unsigned int type);
 extern int set_irq_msi(unsigned int irq, struct msi_desc *entry);
 
-#define get_irq_chip(irq)	(irq_desc[irq].chip)
-#define get_irq_chip_data(irq)	(irq_desc[irq].chip_data)
-#define get_irq_data(irq)	(irq_desc[irq].handler_data)
-#define get_irq_msi(irq)	(irq_desc[irq].msi_desc)
+#define get_irq_chip(irq)	(irq_to_desc(irq)->chip)
+#define get_irq_chip_data(irq)	(irq_to_desc(irq)->chip_data)
+#define get_irq_data(irq)	(irq_to_desc(irq)->handler_data)
+#define get_irq_msi(irq)	(irq_to_desc(irq)->msi_desc)
 
 #endif /* CONFIG_GENERIC_HARDIRQS */
 
-- 
cgit v1.2.3


From 3060d6fe28570640c2d7d66d38b9eaa848c3b9e3 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:50:08 -0700
Subject: x86: put timer_rand_state pointer into irq_desc

irq_timer_state[] is a NR_IRQS sized array that is a side-by array to
the real irq_desc[] array.

Integrate that field into the (now dynamic) irq_desc dynamic array and
save some RAM.

v2: keep the old way to support archs that do not support irq_desc

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/irq.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 80b8200f2ad..60c856aaac0 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -127,6 +127,7 @@ struct irq_chip {
 	const char	*typename;
 };
 
+struct timer_rand_state;
 /**
  * struct irq_desc - interrupt descriptor
  *
@@ -155,6 +156,7 @@ struct irq_desc {
 	unsigned int		irq;
 #ifdef CONFIG_HAVE_SPARSE_IRQ
 	struct irq_desc		*next;
+	struct timer_rand_state *timer_rand_state;
 #endif
 	irq_flow_handler_t	handle_irq;
 	struct irq_chip		*chip;
-- 
cgit v1.2.3


From 7f95ec9e4c12fd067febfd57532da1166d75d858 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:50:09 -0700
Subject: x86: move kstat_irqs from kstat to irq_desc

based on Eric's patch ...

Mold it together with dyn_array for irq_desc; this will allocate kstat_irqs
for nr_irq_desc all together if needed -- at that point nr_cpus is already known.

v2: make sure systems without generic_hardirqs work; they don't have irq_desc
v3: fix merging
v4: [mingo@elte.hu] fix typo

[ mingo@elte.hu ] irq: build fix

fix:

 arch/x86/xen/spinlock.c: In function 'xen_spin_lock_slow':
 arch/x86/xen/spinlock.c:90: error: 'struct kernel_stat' has no member named 'irqs'

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/irq.h         |  7 +++++++
 include/linux/kernel_stat.h | 22 +++++++++++++++-------
 2 files changed, 22 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 60c856aaac0..cbf471aee1c 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -157,6 +157,11 @@ struct irq_desc {
 #ifdef CONFIG_HAVE_SPARSE_IRQ
 	struct irq_desc		*next;
 	struct timer_rand_state *timer_rand_state;
+#endif
+#ifdef CONFIG_HAVE_DYN_ARRAY
+	unsigned int            *kstat_irqs;
+#else
+	unsigned int            kstat_irqs[NR_CPUS];
 #endif
 	irq_flow_handler_t	handle_irq;
 	struct irq_chip		*chip;
@@ -190,6 +195,8 @@ extern struct irq_desc *irq_to_desc(unsigned int irq);
 /* could be removed if we get rid of all irq_desc reference */
 extern struct irq_desc irq_desc[NR_IRQS];
 #endif
+#define kstat_irqs_this_cpu(DESC) \
+	((DESC)->kstat_irqs[smp_processor_id()])
 
 /*
  * Migration helpers for obsolete names, they will go away:
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index fe1f7fe534b..f10616712de 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -28,10 +28,8 @@ struct cpu_usage_stat {
 
 struct kernel_stat {
 	struct cpu_usage_stat	cpustat;
-#ifdef CONFIG_HAVE_DYN_ARRAY
-	unsigned int *irqs;
-#else
-	unsigned int irqs[NR_IRQS];
+#ifndef CONFIG_GENERIC_HARDIRQS
+       unsigned int irqs[NR_IRQS];
 #endif
 };
 
@@ -43,15 +41,25 @@ DECLARE_PER_CPU(struct kernel_stat, kstat);
 
 extern unsigned long long nr_context_switches(void);
 
+#ifndef CONFIG_GENERIC_HARDIRQS
+static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
+{
+       return kstat_cpu(cpu).irqs[irq];
+}
+#else
+extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu);
+#endif
+
 /*
  * Number of interrupts per specific IRQ source, since bootup
  */
-static inline int kstat_irqs(int irq)
+static inline unsigned int kstat_irqs(unsigned int irq)
 {
-	int cpu, sum = 0;
+	unsigned int sum = 0;
+	int cpu;
 
 	for_each_possible_cpu(cpu)
-		sum += kstat_cpu(cpu).irqs[irq];
+		sum += kstat_irqs_cpu(irq, cpu);
 
 	return sum;
 }
-- 
cgit v1.2.3


From 9059d8fa4a3a9153da53da890039f7f956cc9d19 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:50:10 -0700
Subject: irq: add irq_desc_without_new

add an irq_desc accessor that will not allocate any sparse entry
but returns failure if there's no entry present.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/irq.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index cbf471aee1c..c9ffef7c3b4 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -191,10 +191,23 @@ struct irq_desc {
 } ____cacheline_internodealigned_in_smp;
 
 extern struct irq_desc *irq_to_desc(unsigned int irq);
+extern struct irq_desc *__irq_to_desc(unsigned int irq);
+
+#ifndef CONFIG_HAVE_SPARSE_IRQ
+
 #ifndef CONFIG_HAVE_DYN_ARRAY
 /* could be removed if we get rid of all irq_desc reference */
 extern struct irq_desc irq_desc[NR_IRQS];
+#else
+extern struct irq_desc *irq_desc;
 #endif
+
+#else
+
+extern struct irq_desc *sparse_irqs;
+
+#endif
+
 #define kstat_irqs_this_cpu(DESC) \
 	((DESC)->kstat_irqs[smp_processor_id()])
 
-- 
cgit v1.2.3


From 2c6927a38f65b53b62f86158fba29a068c4e8b6a Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:50:11 -0700
Subject: irq: replace loop with nr_irqs with for_each_irq_desc

There are a handful of loops that go from 0 to nr_irqs and use
get_irq_desc() on them. These would allocate all the irq_desc
entries, regardless of the need for them.

Use the smarter for_each_irq_desc() iterator that will only iterate
over the present ones.

v2: make sure arch without GENERIC_HARDIRQS work too

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/irq.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index c9ffef7c3b4..9de16ca8b8e 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -202,9 +202,16 @@ extern struct irq_desc irq_desc[NR_IRQS];
 extern struct irq_desc *irq_desc;
 #endif
 
+#ifdef CONFIG_GENERIC_HARDIRQS
+#define for_each_irq_desc(irq, desc)		\
+	for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc = &irq_desc[irq])
+#endif
+
 #else
 
 extern struct irq_desc *sparse_irqs;
+#define for_each_irq_desc(irqX, desc)		\
+	for (desc = sparse_irqs, irqX = desc->irq; desc && irqX != -1U; desc = desc->next, irqX = desc ? desc->irq : -1U)
 
 #endif
 
-- 
cgit v1.2.3


From c7fb03a475bd80c642c1345d85c7c550f63514b8 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:50:12 -0700
Subject: irq, fs/proc: replace loop with nr_irqs for proc/stat

Replace another nr_irqs loop to avoid the allocation of all sparse
irq entries - use for_each_irq_desc instead.

v2: make sure arch without GENERIC_HARDIRQS works too

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/interrupt.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 511803853a5..d4039a0b23f 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -17,6 +17,11 @@
 
 extern int nr_irqs;
 
+#ifndef CONFIG_GENERIC_HARDIRQS
+#define for_each_irq_desc(irq, desc)		\
+	for (irq = 0; irq < nr_irqs; irq++)
+#endif
+
 /*
  * These correspond to the IORESOURCE_IRQ_* defines in
  * linux/ioport.h to select the interrupt line behaviour.  When
-- 
cgit v1.2.3


From 46926b67fc663d357a1a8174328998a9e49da0b8 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:50:15 -0700
Subject: generic: add irq_desc in function in parameter

So we can remove some duplicated calls to irq_desc

v2: make sure irq_desc in init/main.c is not used without generic_hardirqs

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/irq.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 9de16ca8b8e..7b59e193a11 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -315,10 +315,8 @@ extern unsigned int __do_IRQ(unsigned int irq);
  * irqchip-style controller then we call the ->handle_irq() handler,
  * and it calls __do_IRQ() if it's attached to an irqtype-style controller.
  */
-static inline void generic_handle_irq(unsigned int irq)
+static inline void generic_handle_irq_desc(unsigned int irq, struct irq_desc *desc)
 {
-	struct irq_desc *desc = irq_to_desc(irq);
-
 #ifdef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
 	desc->handle_irq(irq, desc);
 #else
@@ -329,6 +327,11 @@ static inline void generic_handle_irq(unsigned int irq)
 #endif
 }
 
+static inline void generic_handle_irq(unsigned int irq)
+{
+	generic_handle_irq_desc(irq, irq_to_desc(irq));
+}
+
 /* Handling of unhandled and spurious interrupts: */
 extern void note_interrupt(unsigned int irq, struct irq_desc *desc,
 			   int action_ret);
-- 
cgit v1.2.3


From cb5bc83225a86ca53bbb889ed8439e4fd6cf44ac Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:50:17 -0700
Subject: x86_64: rename irq_desc/irq_desc_alloc

change names:

          irq_desc() ==> irq_desc_alloc
	__irq_desc() ==> irq_desc

Also split a few of the uses in lowlevel x86 code.

v2: need to check if desc is null in smp_irq_move_cleanup

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/irq.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 7b59e193a11..5fe1b01c11f 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -191,7 +191,7 @@ struct irq_desc {
 } ____cacheline_internodealigned_in_smp;
 
 extern struct irq_desc *irq_to_desc(unsigned int irq);
-extern struct irq_desc *__irq_to_desc(unsigned int irq);
+extern struct irq_desc *irq_to_desc_alloc(unsigned int irq);
 
 #ifndef CONFIG_HAVE_SPARSE_IRQ
 
-- 
cgit v1.2.3


From 67fb283e148e9bd761f73691d3173b6eab9ba8db Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:50:18 -0700
Subject: irq: separate sparse_irqs from sparse_irqs_free

so that later code doesn't need to compare with -1U

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/irq.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 5fe1b01c11f..d5749852ee6 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -211,7 +211,7 @@ extern struct irq_desc *irq_desc;
 
 extern struct irq_desc *sparse_irqs;
 #define for_each_irq_desc(irqX, desc)		\
-	for (desc = sparse_irqs, irqX = desc->irq; desc && irqX != -1U; desc = desc->next, irqX = desc ? desc->irq : -1U)
+	for (desc = sparse_irqs, irqX = desc->irq; desc; desc = desc->next, irqX = desc ? desc->irq : -1U)
 
 #endif
 
-- 
cgit v1.2.3


From e420dfb40c453a9760b86c7f338052bdb4dfa755 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:50:21 -0700
Subject: x86: put irq_2_iommu pointer into irq_desc

When CONFIG_HAVE_SPARSE_IRQ is set, preallocate some irq_2_iommu entries
and use get_one_free_irq_2_iommu to get a new one and link it to irq_desc
if needed.

Otherwise, use dyn_array or a static array.

v2: <= nr_irqs fix

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/irq.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index d5749852ee6..788d5a35a58 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -128,6 +128,7 @@ struct irq_chip {
 };
 
 struct timer_rand_state;
+struct irq_2_iommu;
 /**
  * struct irq_desc - interrupt descriptor
  *
@@ -162,6 +163,9 @@ struct irq_desc {
 	unsigned int            *kstat_irqs;
 #else
 	unsigned int            kstat_irqs[NR_CPUS];
+#endif
+#if defined(CONFIG_INTR_REMAP) && defined(CONFIG_HAVE_SPARSE_IRQ)
+       struct irq_2_iommu      *irq_2_iommu;
 #endif
 	irq_flow_handler_t	handle_irq;
 	struct irq_chip		*chip;
-- 
cgit v1.2.3


From 6d50bc26836e16a9589e0b128d527c29e30d722a Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:50:22 -0700
Subject: x86: use 28 bits irq NR for pci msi/msix and ht

Also print out the irq number in /proc/interrupts and /proc/stat in hex, so
one can tell bus/dev/func.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/irq.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 788d5a35a58..704136138dc 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -399,6 +399,7 @@ extern void set_irq_noprobe(unsigned int irq);
 extern void set_irq_probe(unsigned int irq);
 
 /* Handle dynamic irq creation and destruction */
+extern unsigned int create_irq_nr(unsigned int irq_want);
 extern int create_irq(void);
 extern void destroy_irq(unsigned int irq);
 
-- 
cgit v1.2.3


From 8b8e8c1bf7275eca859fe551dfa484134eaf013b Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:50:23 -0700
Subject: x86: remove irqbalance in kernel for 32 bit

This has been deprecated for years, the user space irqbalanced utility
works better with numa, has configurable policies, etc...

Signed-off-by: Yinghai Lu <yhlu.kernel@gmai.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/irq.h | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 704136138dc..2445d2b3d5d 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -185,7 +185,7 @@ struct irq_desc {
 	cpumask_t		affinity;
 	unsigned int		cpu;
 #endif
-#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
+#ifdef CONFIG_GENERIC_PENDING_IRQ
 	cpumask_t		pending_mask;
 #endif
 #ifdef CONFIG_PROC_FS
@@ -241,13 +241,13 @@ extern int setup_irq(unsigned int irq, struct irqaction *new);
 
 #ifdef CONFIG_SMP
 
-#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
+#ifdef CONFIG_GENERIC_PENDING_IRQ
 
 void set_pending_irq(unsigned int irq, cpumask_t mask);
 void move_native_irq(int irq);
 void move_masked_irq(int irq);
 
-#else /* CONFIG_GENERIC_PENDING_IRQ || CONFIG_IRQBALANCE */
+#else /* CONFIG_GENERIC_PENDING_IRQ */
 
 static inline void move_irq(int irq)
 {
@@ -274,14 +274,6 @@ static inline void set_pending_irq(unsigned int irq, cpumask_t mask)
 
 #endif /* CONFIG_SMP */
 
-#ifdef CONFIG_IRQBALANCE
-extern void set_balance_irq_affinity(unsigned int irq, cpumask_t mask);
-#else
-static inline void set_balance_irq_affinity(unsigned int irq, cpumask_t mask)
-{
-}
-#endif
-
 extern int no_irq_affinity;
 
 static inline int irq_balancing_disabled(unsigned int irq)
-- 
cgit v1.2.3


From 42379b1122bab7f9aefdbd4b7004a6fa89dfbae5 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Tue, 19 Aug 2008 20:50:45 -0700
Subject: pci: change msi-x vector to 32bit

we are using 28bit pci (bus/dev/fn + 12 bits) as irq number, so the
cache for irq number should be 32 bit too.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/pci.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 98dc6243a70..1f8db240ca4 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -723,7 +723,7 @@ enum pci_dma_burst_strategy {
 };
 
 struct msix_entry {
-	u16 	vector;	/* kernel uses to write allocated vector */
+	u32	vector;	/* kernel uses to write allocated vector */
 	u16	entry;	/* driver uses to specify entry, OS writes */
 };
 
-- 
cgit v1.2.3


From 8c464a4b23ca283b414022ebc77787f3c7040fa7 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Mon, 25 Aug 2008 12:41:19 -0700
Subject: sparseirq: move kstat_irqs from kstat to irq_desc - fix

fix non-sparseirq architectures.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/irq.h         |  4 ++--
 include/linux/kernel_stat.h | 10 ++++++++--
 2 files changed, 10 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 2445d2b3d5d..93fe9a943e7 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -161,8 +161,6 @@ struct irq_desc {
 #endif
 #ifdef CONFIG_HAVE_DYN_ARRAY
 	unsigned int            *kstat_irqs;
-#else
-	unsigned int            kstat_irqs[NR_CPUS];
 #endif
 #if defined(CONFIG_INTR_REMAP) && defined(CONFIG_HAVE_SPARSE_IRQ)
        struct irq_2_iommu      *irq_2_iommu;
@@ -219,8 +217,10 @@ extern struct irq_desc *sparse_irqs;
 
 #endif
 
+#ifdef CONFIG_HAVE_DYN_ARRAY
 #define kstat_irqs_this_cpu(DESC) \
 	((DESC)->kstat_irqs[smp_processor_id()])
+#endif
 
 /*
  * Migration helpers for obsolete names, they will go away:
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index f10616712de..21249d8c129 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -28,7 +28,7 @@ struct cpu_usage_stat {
 
 struct kernel_stat {
 	struct cpu_usage_stat	cpustat;
-#ifndef CONFIG_GENERIC_HARDIRQS
+#ifndef CONFIG_HAVE_DYN_ARRAY
        unsigned int irqs[NR_IRQS];
 #endif
 };
@@ -41,7 +41,13 @@ DECLARE_PER_CPU(struct kernel_stat, kstat);
 
 extern unsigned long long nr_context_switches(void);
 
-#ifndef CONFIG_GENERIC_HARDIRQS
+#ifndef CONFIG_HAVE_DYN_ARRAY
+#define kstat_irqs_this_cpu(irq) \
+	(kstat_this_cpu.irqs[irq])
+#endif
+
+
+#ifndef CONFIG_HAVE_DYN_ARRAY
 static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
 {
        return kstat_cpu(cpu).irqs[irq];
-- 
cgit v1.2.3


From f6dd5c3106fb283e37d915eeb33019ef40510f85 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Wed, 3 Sep 2008 16:58:32 -0700
Subject: dmar: fix using early fixmap mapping for DMAR table parsing

Very early detection of the DMAR tables will set up a fixmap mapping. For
parsing these tables later (while enabling dma and/or interrupt remapping),
the early fixmap mapping shouldn't be used. Fix it by calling the table detection
routines again, which will call the generic acpi_get_table() to set up
the correct mapping.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/dmar.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index c360c558e59..f1984fc3e06 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -45,7 +45,6 @@ extern struct list_head dmar_drhd_units;
 	list_for_each_entry(drhd, &dmar_drhd_units, list)
 
 extern int dmar_table_init(void);
-extern int early_dmar_detect(void);
 extern int dmar_dev_scope_init(void);
 
 /* Intel IOMMU detection */
-- 
cgit v1.2.3


From a50f70b17541c0060967c6df61133e968bad3652 Mon Sep 17 00:00:00 2001
From: Russ Anderson <rja@sgi.com>
Date: Fri, 3 Oct 2008 11:58:54 -0500
Subject: x86: Add UV EFI table entry v4

Look for a UV entry in the EFI tables.

Signed-off-by: Russ Anderson <rja@sgi.com>
Signed-off-by: Paul Jackson <pj@sgi.com>
Acked-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/efi.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/efi.h b/include/linux/efi.h
index 807373d467f..bb66feb164b 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -208,6 +208,9 @@ typedef efi_status_t efi_set_virtual_address_map_t (unsigned long memory_map_siz
 #define EFI_GLOBAL_VARIABLE_GUID \
     EFI_GUID(  0x8be4df61, 0x93ca, 0x11d2, 0xaa, 0x0d, 0x00, 0xe0, 0x98, 0x03, 0x2b, 0x8c )
 
+#define UV_SYSTEM_TABLE_GUID \
+    EFI_GUID(  0x3b13a7d4, 0x633e, 0x11dd, 0x93, 0xec, 0xda, 0x25, 0x56, 0xd8, 0x95, 0x93 )
+
 typedef struct {
 	efi_guid_t guid;
 	unsigned long table;
@@ -255,6 +258,7 @@ extern struct efi {
 	unsigned long boot_info;	/* boot info table */
 	unsigned long hcdp;		/* HCDP table */
 	unsigned long uga;		/* UGA table */
+	unsigned long uv_systab;	/* UV system table */
 	efi_get_time_t *get_time;
 	efi_set_time_t *set_time;
 	efi_get_wakeup_time_t *get_wakeup_time;
-- 
cgit v1.2.3


From 7ef0c30dbf96a8d9a234e90c248eb19df3c031be Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 15 Oct 2008 13:07:35 +0200
Subject: genirq: define nr_irqs for architectures with GENERIC_HARDIRQS=n

Revert the sparse irq changes in m68k/s390/sparc and just define
nr_irqs as NR_IRQS for those architectures.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/interrupt.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index d4039a0b23f..5a57df2ee92 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -15,11 +15,13 @@
 #include <asm/ptrace.h>
 #include <asm/system.h>
 
-extern int nr_irqs;
-
 #ifndef CONFIG_GENERIC_HARDIRQS
-#define for_each_irq_desc(irq, desc)		\
+# define for_each_irq_desc(irq, desc)		\
 	for (irq = 0; irq < nr_irqs; irq++)
+
+# define nr_irqs		NR_IRQS
+#else
+extern int nr_irqs;
 #endif
 
 /*
-- 
cgit v1.2.3


From 70dd4d992ab324a59cdcd6bedc3f4e729863d514 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 15 Oct 2008 15:39:27 +0200
Subject: genirq: consolidate nr_irqs and for_each_irq_desc()

Move all of those to linux/irq.h where they belong.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/interrupt.h |  9 ---------
 include/linux/irq.h       | 17 ++++++++++++-----
 2 files changed, 12 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 5a57df2ee92..58ff4e74b2f 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -15,15 +15,6 @@
 #include <asm/ptrace.h>
 #include <asm/system.h>
 
-#ifndef CONFIG_GENERIC_HARDIRQS
-# define for_each_irq_desc(irq, desc)		\
-	for (irq = 0; irq < nr_irqs; irq++)
-
-# define nr_irqs		NR_IRQS
-#else
-extern int nr_irqs;
-#endif
-
 /*
  * These correspond to the IORESOURCE_IRQ_* defines in
  * linux/ioport.h to select the interrupt line behaviour.  When
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 93fe9a943e7..dbe8734ae86 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -11,6 +11,18 @@
 
 #include <linux/smp.h>
 
+#ifndef CONFIG_GENERIC_HARDIRQS
+# define nr_irqs		NR_IRQS
+
+# define for_each_irq_desc(irq, desc)		\
+	for (irq = 0; irq < nr_irqs; irq++)
+#else
+extern int nr_irqs;
+
+# define for_each_irq_desc(irq, desc)		\
+	for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++)
+#endif
+
 #ifndef CONFIG_S390
 
 #include <linux/linkage.h>
@@ -204,11 +216,6 @@ extern struct irq_desc irq_desc[NR_IRQS];
 extern struct irq_desc *irq_desc;
 #endif
 
-#ifdef CONFIG_GENERIC_HARDIRQS
-#define for_each_irq_desc(irq, desc)		\
-	for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc = &irq_desc[irq])
-#endif
-
 #else
 
 extern struct irq_desc *sparse_irqs;
-- 
cgit v1.2.3


From c6b7674f323622d86316bf7951ad9cae1ce24642 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 15 Oct 2008 14:31:29 +0200
Subject: genirq: use inline function for irq_to_desc

For the non sparse irq case an inline function is perfectly fine.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irq.h | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index dbe8734ae86..7d1adacaadb 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -204,8 +204,6 @@ struct irq_desc {
 	const char		*name;
 } ____cacheline_internodealigned_in_smp;
 
-extern struct irq_desc *irq_to_desc(unsigned int irq);
-extern struct irq_desc *irq_to_desc_alloc(unsigned int irq);
 
 #ifndef CONFIG_HAVE_SPARSE_IRQ
 
@@ -216,8 +214,21 @@ extern struct irq_desc irq_desc[NR_IRQS];
 extern struct irq_desc *irq_desc;
 #endif
 
+static inline struct irq_desc *irq_to_desc(unsigned int irq)
+{
+	return (irq < nr_irqs) ? irq_desc + irq : NULL;
+}
+
+static inline struct irq_desc *irq_to_desc_alloc(unsigned int irq)
+{
+	return irq_to_desc(irq);
+}
+
 #else
 
+extern struct irq_desc *irq_to_desc(unsigned int irq);
+extern struct irq_desc *irq_to_desc_alloc(unsigned int irq);
+
 extern struct irq_desc *sparse_irqs;
 #define for_each_irq_desc(irqX, desc)		\
 	for (desc = sparse_irqs, irqX = desc->irq; desc; desc = desc->next, irqX = desc ? desc->irq : -1U)
-- 
cgit v1.2.3


From 2cc21ef843d4fb7da122239b644a1f6f0aca60a6 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 15 Oct 2008 14:16:55 +0200
Subject: genirq: remove sparse irq code

This code is not ready, but we need to rip it out instead of rebasing
as we would lose the APIC/IO_APIC unification otherwise.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irq.h | 20 --------------------
 1 file changed, 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 7d1adacaadb..68e0f3f9df3 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -167,15 +167,8 @@ struct irq_2_iommu;
  */
 struct irq_desc {
 	unsigned int		irq;
-#ifdef CONFIG_HAVE_SPARSE_IRQ
-	struct irq_desc		*next;
-	struct timer_rand_state *timer_rand_state;
-#endif
 #ifdef CONFIG_HAVE_DYN_ARRAY
 	unsigned int            *kstat_irqs;
-#endif
-#if defined(CONFIG_INTR_REMAP) && defined(CONFIG_HAVE_SPARSE_IRQ)
-       struct irq_2_iommu      *irq_2_iommu;
 #endif
 	irq_flow_handler_t	handle_irq;
 	struct irq_chip		*chip;
@@ -205,8 +198,6 @@ struct irq_desc {
 } ____cacheline_internodealigned_in_smp;
 
 
-#ifndef CONFIG_HAVE_SPARSE_IRQ
-
 #ifndef CONFIG_HAVE_DYN_ARRAY
 /* could be removed if we get rid of all irq_desc reference */
 extern struct irq_desc irq_desc[NR_IRQS];
@@ -224,17 +215,6 @@ static inline struct irq_desc *irq_to_desc_alloc(unsigned int irq)
 	return irq_to_desc(irq);
 }
 
-#else
-
-extern struct irq_desc *irq_to_desc(unsigned int irq);
-extern struct irq_desc *irq_to_desc_alloc(unsigned int irq);
-
-extern struct irq_desc *sparse_irqs;
-#define for_each_irq_desc(irqX, desc)		\
-	for (desc = sparse_irqs, irqX = desc->irq; desc; desc = desc->next, irqX = desc ? desc->irq : -1U)
-
-#endif
-
 #ifdef CONFIG_HAVE_DYN_ARRAY
 #define kstat_irqs_this_cpu(DESC) \
 	((DESC)->kstat_irqs[smp_processor_id()])
-- 
cgit v1.2.3


From ee32c9732244bde4b9b59eeac2814c23e2b71f8d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 15 Oct 2008 14:34:09 +0200
Subject: genirq: remove irq_to_desc_alloc

Remove the leftover of sparseirqs.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irq.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 68e0f3f9df3..3f33c779030 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -210,11 +210,6 @@ static inline struct irq_desc *irq_to_desc(unsigned int irq)
 	return (irq < nr_irqs) ? irq_desc + irq : NULL;
 }
 
-static inline struct irq_desc *irq_to_desc_alloc(unsigned int irq)
-{
-	return irq_to_desc(irq);
-}
-
 #ifdef CONFIG_HAVE_DYN_ARRAY
 #define kstat_irqs_this_cpu(DESC) \
 	((DESC)->kstat_irqs[smp_processor_id()])
-- 
cgit v1.2.3


From d6c88a507ef0b6afdb013cba4e7804ba7324d99a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 15 Oct 2008 15:27:23 +0200
Subject: genirq: revert dynarray

Revert the dynarray changes. They need more thought and polishing.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/init.h        | 43 -------------------------------------------
 include/linux/irq.h         | 15 ---------------
 include/linux/kernel_stat.h | 16 ++++++----------
 3 files changed, 6 insertions(+), 68 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/init.h b/include/linux/init.h
index 59fbb4aaba6..70ad53e1eab 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -247,49 +247,6 @@ struct obs_kernel_param {
 /* Relies on boot_command_line being set */
 void __init parse_early_param(void);
 
-struct dyn_array {
-	void **name;
-	unsigned long size;
-	unsigned int *nr;
-	unsigned long align;
-	void (*init_work)(void *);
-};
-extern struct dyn_array *__dyn_array_start[], *__dyn_array_end[];
-extern struct dyn_array *__per_cpu_dyn_array_start[], *__per_cpu_dyn_array_end[];
-
-#define DEFINE_DYN_ARRAY_ADDR(nameX, addrX, sizeX, nrX, alignX, init_workX) \
-		static struct dyn_array __dyn_array_##nameX __initdata = \
-		{	.name = (void **)&(nameX),\
-			.size = sizeX,\
-			.nr   = &(nrX),\
-			.align = alignX,\
-			.init_work = init_workX,\
-		}; \
-		static struct dyn_array *__dyn_array_ptr_##nameX __used \
-		__attribute__((__section__(".dyn_array.init"))) = \
-			&__dyn_array_##nameX
-
-#define DEFINE_DYN_ARRAY(nameX, sizeX, nrX, alignX, init_workX) \
-	DEFINE_DYN_ARRAY_ADDR(nameX, nameX, sizeX, nrX, alignX, init_workX)
-
-#define DEFINE_PER_CPU_DYN_ARRAY_ADDR(nameX, addrX, sizeX, nrX, alignX, init_workX) \
-		static struct dyn_array __per_cpu_dyn_array_##nameX __initdata = \
-		{	.name = (void **)&(addrX),\
-			.size = sizeX,\
-			.nr   = &(nrX),\
-			.align = alignX,\
-			.init_work = init_workX,\
-		}; \
-		static struct dyn_array *__per_cpu_dyn_array_ptr_##nameX __used \
-		__attribute__((__section__(".per_cpu_dyn_array.init"))) = \
-			&__per_cpu_dyn_array_##nameX
-
-#define DEFINE_PER_CPU_DYN_ARRAY(nameX, sizeX, nrX, alignX, init_workX) \
-	DEFINE_PER_CPU_DYN_ARRAY_ADDR(nameX, nameX, nrX, alignX, init_workX)
-
-extern void pre_alloc_dyn_array(void);
-extern unsigned long per_cpu_dyn_array_size(unsigned long *align);
-extern void per_cpu_alloc_dyn_array(int cpu, char *ptr);
 #endif /* __ASSEMBLY__ */
 
 /**
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 3f33c779030..38bf89f2ade 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -139,8 +139,6 @@ struct irq_chip {
 	const char	*typename;
 };
 
-struct timer_rand_state;
-struct irq_2_iommu;
 /**
  * struct irq_desc - interrupt descriptor
  *
@@ -167,9 +165,6 @@ struct irq_2_iommu;
  */
 struct irq_desc {
 	unsigned int		irq;
-#ifdef CONFIG_HAVE_DYN_ARRAY
-	unsigned int            *kstat_irqs;
-#endif
 	irq_flow_handler_t	handle_irq;
 	struct irq_chip		*chip;
 	struct msi_desc		*msi_desc;
@@ -198,23 +193,13 @@ struct irq_desc {
 } ____cacheline_internodealigned_in_smp;
 
 
-#ifndef CONFIG_HAVE_DYN_ARRAY
-/* could be removed if we get rid of all irq_desc reference */
 extern struct irq_desc irq_desc[NR_IRQS];
-#else
-extern struct irq_desc *irq_desc;
-#endif
 
 static inline struct irq_desc *irq_to_desc(unsigned int irq)
 {
 	return (irq < nr_irqs) ? irq_desc + irq : NULL;
 }
 
-#ifdef CONFIG_HAVE_DYN_ARRAY
-#define kstat_irqs_this_cpu(DESC) \
-	((DESC)->kstat_irqs[smp_processor_id()])
-#endif
-
 /*
  * Migration helpers for obsolete names, they will go away:
  */
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 21249d8c129..a9d0d360b77 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -28,9 +28,7 @@ struct cpu_usage_stat {
 
 struct kernel_stat {
 	struct cpu_usage_stat	cpustat;
-#ifndef CONFIG_HAVE_DYN_ARRAY
        unsigned int irqs[NR_IRQS];
-#endif
 };
 
 DECLARE_PER_CPU(struct kernel_stat, kstat);
@@ -41,20 +39,18 @@ DECLARE_PER_CPU(struct kernel_stat, kstat);
 
 extern unsigned long long nr_context_switches(void);
 
-#ifndef CONFIG_HAVE_DYN_ARRAY
-#define kstat_irqs_this_cpu(irq) \
-	(kstat_this_cpu.irqs[irq])
-#endif
+struct irq_desc;
 
+static inline void kstat_incr_irqs_this_cpu(unsigned int irq,
+					    struct irq_desc *desc)
+{
+	kstat_this_cpu.irqs[irq]++;
+}
 
-#ifndef CONFIG_HAVE_DYN_ARRAY
 static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
 {
        return kstat_cpu(cpu).irqs[irq];
 }
-#else
-extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu);
-#endif
 
 /*
  * Number of interrupts per specific IRQ source, since bootup
-- 
cgit v1.2.3


From a1aca5de08a0cb840a90fb3f729a5940f8d21185 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 15 Oct 2008 19:29:15 +0200
Subject: genirq: remove artifacts from sparseirq removal

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/init.h        | 1 -
 include/linux/kernel_stat.h | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/init.h b/include/linux/init.h
index 70ad53e1eab..93538b696e3 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -246,7 +246,6 @@ struct obs_kernel_param {
 
 /* Relies on boot_command_line being set */
 void __init parse_early_param(void);
-
 #endif /* __ASSEMBLY__ */
 
 /**
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index a9d0d360b77..89b6ecd4147 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -28,7 +28,7 @@ struct cpu_usage_stat {
 
 struct kernel_stat {
 	struct cpu_usage_stat	cpustat;
-       unsigned int irqs[NR_IRQS];
+	unsigned int irqs[NR_IRQS];
 };
 
 DECLARE_PER_CPU(struct kernel_stat, kstat);
-- 
cgit v1.2.3


From 811410fdb6b9d82a518542289efe9b2a51e3cbfb Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 16 Oct 2008 14:16:11 +0200
Subject: genirq: add reverse iterator for irq_desc

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irq.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 38bf89f2ade..31632aa65d1 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -21,6 +21,10 @@ extern int nr_irqs;
 
 # define for_each_irq_desc(irq, desc)		\
 	for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++)
+
+# define for_each_irq_desc_reverse(irq, desc)			\
+	for (irq = nr_irqs -1, desc = irq_desc + (nr_irqs -1 );	\
+	     irq > 0; irq--, desc--)
 #endif
 
 #ifndef CONFIG_S390
-- 
cgit v1.2.3


From 2be3b52a5785a6a5c5349fbd315f57595f7074be Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 16 Oct 2008 14:50:27 +0200
Subject: proc: fixup irq iterator

There is no need for irq_desc here. Even for sparse_irq we can
handle this cleverly in for_each_irq_nr().

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irq.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 31632aa65d1..0618fb362cb 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -27,6 +27,9 @@ extern int nr_irqs;
 	     irq > 0; irq--, desc--)
 #endif
 
+#define for_each_irq_nr(irq)			\
+	for (irq = 0; irq < nr_irqs; irq++)
+
 #ifndef CONFIG_S390
 
 #include <linux/linkage.h>
-- 
cgit v1.2.3


From ae87221d3ce49d9de1e43756da834fd0bf05a2ad Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Fri, 24 Aug 2007 16:11:54 -0700
Subject: sysfs: crash debugging

Print the name of the last-accessed sysfs file when we oops, to help track
down oopses which occur in sysfs store/read handlers, because these oopses
tend not to leave any trace of the offending code in the stack traces.

Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/sysfs.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index 37fa24152bd..8ec406afb3e 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -119,6 +119,8 @@ void sysfs_remove_file_from_group(struct kobject *kobj,
 
 void sysfs_notify(struct kobject *kobj, char *dir, char *attr);
 
+void sysfs_printk_last_file(void);
+
 extern int __must_check sysfs_init(void);
 
 #else /* CONFIG_SYSFS */
@@ -231,6 +233,10 @@ static inline int __must_check sysfs_init(void)
 	return 0;
 }
 
+static inline void sysfs_printk_last_file(void)
+{
+}
+
 #endif /* CONFIG_SYSFS */
 
 #endif /* _SYSFS_H_ */
-- 
cgit v1.2.3


From 7fb6b5d51daf3613045258ee8add07022d8c39d3 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 21 Jul 2008 20:03:34 -0700
Subject: device create: remove device_create_drvdata

Now that the tree is cleaned up, device_create_drvdata can be safely
removed.

Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/device.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/device.h b/include/linux/device.h
index 246937c9cbc..60f6456691a 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -502,7 +502,6 @@ extern struct device *device_create(struct class *cls, struct device *parent,
 				    dev_t devt, void *drvdata,
 				    const char *fmt, ...)
 				    __attribute__((format(printf, 5, 6)));
-#define device_create_drvdata	device_create
 extern void device_destroy(struct class *cls, dev_t devt);
 
 /*
-- 
cgit v1.2.3


From 346e15beb5343c2eb8216d820f2ed8f150822b08 Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@redhat.com>
Date: Tue, 12 Aug 2008 16:46:19 -0400
Subject: driver core: basic infrastructure for per-module dynamic debug
 messages

Base infrastructure to enable per-module debug messages.

I've introduced CONFIG_DYNAMIC_PRINTK_DEBUG, which when enabled centralizes
control of debugging statements on a per-module basis in one debugfs file,
currently <debugfs>/dynamic_printk/modules. When CONFIG_DYNAMIC_PRINTK_DEBUG
is not set, debugging statements can still be enabled as before, often by
defining 'DEBUG' for the proper compilation unit. Thus, this patch set has no
effect when CONFIG_DYNAMIC_PRINTK_DEBUG is not set.

The infrastructure currently ties into all pr_debug() and dev_dbg() calls. That
is, if CONFIG_DYNAMIC_PRINTK_DEBUG is set, all pr_debug() and dev_dbg() calls
can be dynamically enabled/disabled on a per-module basis.

Future plans include extending this functionality to subsystems, that define
their own debug levels and flags.

Usage:

Dynamic debugging is controlled by the debugfs file,
<debugfs>/dynamic_printk/modules. This file contains a list of the modules that
can be enabled. The format of the file is as follows:

	<module_name> <enabled=0/1>
		.
		.
		.

	<module_name> : Name of the module in which the debug call resides
	<enabled=0/1> : whether the messages are enabled or not

For example:

	snd_hda_intel enabled=0
	fixup enabled=1
	driver enabled=0

Enable a module:

	$echo "set enabled=1 <module_name>" > dynamic_printk/modules

Disable a module:

	$echo "set enabled=0 <module_name>" > dynamic_printk/modules

Enable all modules:

	$echo "set enabled=1 all" > dynamic_printk/modules

Disable all modules:

	$echo "set enabled=0 all" > dynamic_printk/modules

Finally, passing "dynamic_printk" at the command line enables
debugging for all modules. This mode can be turned off via the above
disable command.

[gkh: minor cleanups and tweaks to make the build work quietly]

Signed-off-by: Jason Baron <jbaron@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/device.h         |  6 ++-
 include/linux/dynamic_printk.h | 93 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/kernel.h         |  7 +++-
 include/linux/module.h         |  1 -
 4 files changed, 104 insertions(+), 3 deletions(-)
 create mode 100644 include/linux/dynamic_printk.h

(limited to 'include/linux')

diff --git a/include/linux/device.h b/include/linux/device.h
index 60f6456691a..fb034461b39 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -550,7 +550,11 @@ extern const char *dev_driver_string(const struct device *dev);
 #define dev_info(dev, format, arg...)		\
 	dev_printk(KERN_INFO , dev , format , ## arg)
 
-#ifdef DEBUG
+#if defined(CONFIG_DYNAMIC_PRINTK_DEBUG)
+#define dev_dbg(dev, format, ...) do { \
+	dynamic_dev_dbg(dev, format, ##__VA_ARGS__); \
+	} while (0)
+#elif defined(DEBUG)
 #define dev_dbg(dev, format, arg...)		\
 	dev_printk(KERN_DEBUG , dev , format , ## arg)
 #else
diff --git a/include/linux/dynamic_printk.h b/include/linux/dynamic_printk.h
new file mode 100644
index 00000000000..2d528d00907
--- /dev/null
+++ b/include/linux/dynamic_printk.h
@@ -0,0 +1,93 @@
+#ifndef _DYNAMIC_PRINTK_H
+#define _DYNAMIC_PRINTK_H
+
+#define DYNAMIC_DEBUG_HASH_BITS 6
+#define DEBUG_HASH_TABLE_SIZE (1 << DYNAMIC_DEBUG_HASH_BITS)
+
+#define TYPE_BOOLEAN 1
+
+#define DYNAMIC_ENABLED_ALL 0
+#define DYNAMIC_ENABLED_NONE 1
+#define DYNAMIC_ENABLED_SOME 2
+
+extern int dynamic_enabled;
+
+/* dynamic_printk_enabled, and dynamic_printk_enabled2 are bitmasks in which
+ * bit n is set to 1 if any modname hashes into the bucket n, 0 otherwise. They
+ * use independent hash functions, to reduce the chance of false positives.
+ */
+extern long long dynamic_printk_enabled;
+extern long long dynamic_printk_enabled2;
+
+struct mod_debug {
+	char *modname;
+	char *logical_modname;
+	char *flag_names;
+	int type;
+	int hash;
+	int hash2;
+} __attribute__((aligned(8)));
+
+int register_dynamic_debug_module(char *mod_name, int type, char *share_name,
+					char *flags, int hash, int hash2);
+
+#if defined(CONFIG_DYNAMIC_PRINTK_DEBUG)
+extern int unregister_dynamic_debug_module(char *mod_name);
+extern int __dynamic_dbg_enabled_helper(char *modname, int type,
+					int value, int hash);
+
+#define __dynamic_dbg_enabled(module, type, value, level, hash)  ({	     \
+	int __ret = 0;							     \
+	if (unlikely((dynamic_printk_enabled & (1LL << DEBUG_HASH)) &&	     \
+			(dynamic_printk_enabled2 & (1LL << DEBUG_HASH2))))   \
+			__ret = __dynamic_dbg_enabled_helper(module, type,   \
+								value, hash);\
+	__ret; })
+
+#define dynamic_pr_debug(fmt, ...) do {					    \
+	static char mod_name[]						    \
+	__attribute__((section("__verbose_strings")))			    \
+	 = KBUILD_MODNAME;						    \
+	static struct mod_debug descriptor				    \
+	__used								    \
+	__attribute__((section("__verbose"), aligned(8))) =		    \
+	{ mod_name, mod_name, NULL, TYPE_BOOLEAN, DEBUG_HASH, DEBUG_HASH2 };\
+	if (__dynamic_dbg_enabled(KBUILD_MODNAME, TYPE_BOOLEAN,		    \
+						0, 0, DEBUG_HASH))	    \
+		printk(KERN_DEBUG KBUILD_MODNAME ":" fmt,		    \
+				##__VA_ARGS__);				    \
+	} while (0)
+
+#define dynamic_dev_dbg(dev, format, ...) do {				    \
+	static char mod_name[]						    \
+	__attribute__((section("__verbose_strings")))			    \
+	 = KBUILD_MODNAME;						    \
+	static struct mod_debug descriptor				    \
+	__used								    \
+	__attribute__((section("__verbose"), aligned(8))) =		    \
+	{ mod_name, mod_name, NULL, TYPE_BOOLEAN, DEBUG_HASH, DEBUG_HASH2 };\
+	if (__dynamic_dbg_enabled(KBUILD_MODNAME, TYPE_BOOLEAN,		    \
+						0, 0, DEBUG_HASH))	    \
+			dev_printk(KERN_DEBUG, dev,			    \
+					KBUILD_MODNAME ": " format,	    \
+					##__VA_ARGS__);			    \
+	} while (0)
+
+#else
+
+static inline int unregister_dynamic_debug_module(const char *mod_name)
+{
+	return 0;
+}
+static inline int __dynamic_dbg_enabled_helper(char *modname, int type,
+						int value, int hash)
+{
+	return 0;
+}
+
+#define __dynamic_dbg_enabled(module, type, value, level, hash)  ({ 0; })
+#define dynamic_pr_debug(fmt, ...)  do { } while (0)
+#define dynamic_dev_dbg(dev, format, ...)  do { } while (0)
+#endif
+
+#endif
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 75d81f157d2..ededb6e83b4 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -16,6 +16,7 @@
 #include <linux/log2.h>
 #include <linux/typecheck.h>
 #include <linux/ratelimit.h>
+#include <linux/dynamic_printk.h>
 #include <asm/byteorder.h>
 #include <asm/bug.h>
 
@@ -303,8 +304,12 @@ static inline char *pack_hex_byte(char *buf, u8 byte)
 #define pr_info(fmt, arg...) \
 	printk(KERN_INFO fmt, ##arg)
 
-#ifdef DEBUG
 /* If you are writing a driver, please use dev_dbg instead */
+#if defined(CONFIG_DYNAMIC_PRINTK_DEBUG)
+#define pr_debug(fmt, ...) do { \
+	dynamic_pr_debug(fmt, ##__VA_ARGS__); \
+	} while (0)
+#elif defined(DEBUG)
 #define pr_debug(fmt, arg...) \
 	printk(KERN_DEBUG fmt, ##arg)
 #else
diff --git a/include/linux/module.h b/include/linux/module.h
index 68e09557c95..a41555cbe00 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -345,7 +345,6 @@ struct module
 	/* Reference counts */
 	struct module_ref ref[NR_CPUS];
 #endif
-
 };
 #ifndef MODULE_ARCH_INIT
 #define MODULE_ARCH_INIT {}
-- 
cgit v1.2.3


From f1282c844e86db5a041afa41335b5f9eea6cec0c Mon Sep 17 00:00:00 2001
From: Neil Brown <neilb@suse.de>
Date: Wed, 16 Jul 2008 08:58:04 +1000
Subject: sysfs: Support sysfs_notify from atomic context with new
 sysfs_notify_dirent

Support sysfs_notify from atomic context with new sysfs_notify_dirent

sysfs_notify currently takes sysfs_mutex.
This means that it cannot be called in atomic context.
sysfs_mutex  is sometimes held over a malloc (sysfs_rename_dir)
so it can block on low memory.

In md I want to be able to notify on a sysfs attribute from
atomic context, and I don't want to block on low memory because I
could be in the writeout path for freeing memory.

So:
 - export the "sysfs_dirent" structure along with sysfs_get, sysfs_put
   and sysfs_get_dirent so I can get the sysfs_dirent that I want to
   notify on and hold it in an md structure.
 - split sysfs_notify_dirent out of sysfs_notify so the sysfs_dirent
   can be notified on with no blocking (just a spinlock).

Signed-off-by: Neil Brown <neilb@suse.de>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/sysfs.h | 27 ++++++++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index 8ec406afb3e..d8e0230f1e6 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -78,6 +78,8 @@ struct sysfs_ops {
 	ssize_t	(*store)(struct kobject *,struct attribute *,const char *, size_t);
 };
 
+struct sysfs_dirent;
+
 #ifdef CONFIG_SYSFS
 
 int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *),
@@ -118,10 +120,13 @@ void sysfs_remove_file_from_group(struct kobject *kobj,
 			const struct attribute *attr, const char *group);
 
 void sysfs_notify(struct kobject *kobj, char *dir, char *attr);
-
+void sysfs_notify_dirent(struct sysfs_dirent *sd);
+struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
+				      const unsigned char *name);
+struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd);
+void sysfs_put(struct sysfs_dirent *sd);
 void sysfs_printk_last_file(void);
-
-extern int __must_check sysfs_init(void);
+int __must_check sysfs_init(void);
 
 #else /* CONFIG_SYSFS */
 
@@ -227,6 +232,22 @@ static inline void sysfs_remove_file_from_group(struct kobject *kobj,
 static inline void sysfs_notify(struct kobject *kobj, char *dir, char *attr)
 {
 }
+static inline void sysfs_notify_dirent(struct sysfs_dirent *sd)
+{
+}
+static inline
+struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
+				      const unsigned char *name)
+{
+	return NULL;
+}
+static inline struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd)
+{
+	return NULL;
+}
+static inline void sysfs_put(struct sysfs_dirent *sd)
+{
+}
 
 static inline int __must_check sysfs_init(void)
 {
-- 
cgit v1.2.3


From e61396627f91abb855ddd8925be9172fb5871944 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sat, 20 Sep 2008 19:08:39 -0700
Subject: debug: Introduce a dev_WARN() function

In the vein of dev_printk(), this patch introduces a dev_WARN() function
that takes a struct device and then a printk format/args set of arguments.
Unlike dev_printk(), the effect is that of WARN() in that a full warning
message (including filename/line, module list, versions and a backtrace)
is printed in addition to the device name and the arguments.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/device.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/device.h b/include/linux/device.h
index fb034461b39..ec90e79f6a0 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -570,6 +570,14 @@ extern const char *dev_driver_string(const struct device *dev);
 	({ if (0) dev_printk(KERN_DEBUG, dev, format, ##arg); 0; })
 #endif
 
+/*
+ * dev_WARN() acts like dev_printk(), but uses WARN() to get the message
+ * out, including the file/line information and a backtrace.  There is no
+ * trailing semicolon here, so it is safe as the body of an if/else.
+ */
+#define dev_WARN(dev, format, arg...) \
+	WARN(1, "Device: %s\n" format, dev_driver_string(dev), ## arg)
+
 /* Create alias, so I can be autoloaded. */
 #define MODULE_ALIAS_CHARDEV(major,minor) \
 	MODULE_ALIAS("char-major-" __stringify(major) "-" __stringify(minor))
-- 
cgit v1.2.3


From d8bf254089a6c31d7d01a4d1d2f1861662900855 Mon Sep 17 00:00:00 2001
From: Dmitry Baryshkov <dbaryshkov@gmail.com>
Date: Mon, 22 Sep 2008 14:41:40 -0700
Subject: platform: add new device registration helper

Add a helper that registers simple platform_device w/o resources but with
parent and device data.

This is useful for cleaning up platform code that registers such
simple devices as leds-gpio, generic-bl, etc.

Signed-off-by: Dmitry Baryshkov <dbaryshkov@gmail.com>
Cc: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/platform_device.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h
index 95ac21ab3a0..4b8cc6a3247 100644
--- a/include/linux/platform_device.h
+++ b/include/linux/platform_device.h
@@ -37,6 +37,8 @@ extern int platform_add_devices(struct platform_device **, int);
 
 extern struct platform_device *platform_device_register_simple(const char *, int id,
 					struct resource *, unsigned int);
+extern struct platform_device *platform_device_register_data(struct device *,
+		const char *, int, const void *, size_t);
 
 extern struct platform_device *platform_device_alloc(const char *name, int id);
 extern int platform_device_add_resources(struct platform_device *pdev, struct resource *res, unsigned int num);
-- 
cgit v1.2.3


From 8c0e3998f5b71e68fe6b6e489a92e052715e563c Mon Sep 17 00:00:00 2001
From: Trent Piepho <tpiepho@freescale.com>
Date: Thu, 25 Sep 2008 16:45:13 -0700
Subject: sysfs: Make dir and name args to sysfs_notify() const

Because they can be, and because code like this produces a warning if
they're not:

struct device_attribute dev_attr;

sysfs_notify(&kobj, NULL, dev_attr.attr.name);

Signed-off-by: Trent Piepho <tpiepho@freescale.com>
CC: Neil Brown <neilb@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/sysfs.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index d8e0230f1e6..b330e289d71 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -119,7 +119,7 @@ int sysfs_add_file_to_group(struct kobject *kobj,
 void sysfs_remove_file_from_group(struct kobject *kobj,
 			const struct attribute *attr, const char *group);
 
-void sysfs_notify(struct kobject *kobj, char *dir, char *attr);
+void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr);
 void sysfs_notify_dirent(struct sysfs_dirent *sd);
 struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
 				      const unsigned char *name);
@@ -229,7 +229,8 @@ static inline void sysfs_remove_file_from_group(struct kobject *kobj,
 {
 }
 
-static inline void sysfs_notify(struct kobject *kobj, char *dir, char *attr)
+static inline void sysfs_notify(struct kobject *kobj, const char *dir,
+				const char *attr)
 {
 }
 static inline void sysfs_notify_dirent(struct sysfs_dirent *sd)
-- 
cgit v1.2.3


From 030c1d2bfcc2187650fb975456ca0b61a5bb77f4 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 8 May 2008 14:41:00 -0700
Subject: kobject: Fix kobject_rename and !CONFIG_SYSFS

When looking at kobject_rename I found two bugs
that exist when sysfs support is disabled in the kernel.

kobject_rename does not change the name on the kobject when
sysfs support is not compiled in.

kobject_rename without locking attempts to check the
validity of a rename operation, which the kobject layer
simply does not have the infrastructure to do.

This patch documents the previously unstated requirement of
kobject_rename that it is the responsibility of the caller to
provide mutual exclusion and to be certain that the new_name
for the kobject is valid.

This patch modifies sysfs_rename_dir in !CONFIG_SYSFS case
to call kobject_set_name to actually change the kobject_name.

This patch removes the bogus and misleading check in kobject_rename
that attempts to see if a rename is valid.  The check is bogus
because we do not have the proper locking.  The check is misleading
because it looks like we can and do perform checking at the kobject
level that we don't.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/sysfs.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index b330e289d71..39924a96220 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -20,6 +20,8 @@
 struct kobject;
 struct module;
 
+extern int kobject_set_name(struct kobject *kobj, const char *name, ...)
+			    __attribute__((format(printf, 2, 3)));
 /* FIXME
  * The *owner field is no longer used, but leave around
  * until the tree gets cleaned up fully.
@@ -147,7 +149,7 @@ static inline void sysfs_remove_dir(struct kobject *kobj)
 
 static inline int sysfs_rename_dir(struct kobject *kobj, const char *new_name)
 {
-	return 0;
+	return kobject_set_name(kobj, "%s", new_name);
 }
 
 static inline int sysfs_move_dir(struct kobject *kobj,
-- 
cgit v1.2.3


From 0b4a4fea253e1296222603ccc55430ed7cd9413a Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 3 Jul 2008 18:05:28 -0700
Subject: kobject: Cleanup kobject_rename and !CONFIG_SYSFS

It finally dawned on me what the clean fix to sysfs_rename_dir
calling kobject_set_name is.  Move the work into kobject_rename
where it belongs.  The callers serialize us anyway so this is
safe.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/sysfs.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index 39924a96220..b330e289d71 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -20,8 +20,6 @@
 struct kobject;
 struct module;
 
-extern int kobject_set_name(struct kobject *kobj, const char *name, ...)
-			    __attribute__((format(printf, 2, 3)));
 /* FIXME
  * The *owner field is no longer used, but leave around
  * until the tree gets cleaned up fully.
@@ -149,7 +147,7 @@ static inline void sysfs_remove_dir(struct kobject *kobj)
 
 static inline int sysfs_rename_dir(struct kobject *kobj, const char *new_name)
 {
-	return kobject_set_name(kobj, "%s", new_name);
+	return 0;
 }
 
 static inline int sysfs_move_dir(struct kobject *kobj,
-- 
cgit v1.2.3


From 99178b036c97293a65004ff5ec5cff9f833aaecd Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Tue, 26 Aug 2008 11:00:57 -0500
Subject: Driver core: add bus_sort_breadthfirst() function

The PCI core wants to reorder the devices in the bus list.  So move this
functionality out of the pci core and into the driver core so that
anyone else can also do this if needed.  This also lets us change how
struct device is attached to drivers in the future without messing with
the PCI core.

Acked-by: Jesse Barnes <jbarnes@virtuousgeek.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/device.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/device.h b/include/linux/device.h
index ec90e79f6a0..987f5912720 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -90,6 +90,9 @@ int __must_check bus_for_each_drv(struct bus_type *bus,
 				  struct device_driver *start, void *data,
 				  int (*fn)(struct device_driver *, void *));
 
+void bus_sort_breadthfirst(struct bus_type *bus,
+			   int (*compare)(const struct device *a,
+					  const struct device *b));
 /*
  * Bus notifiers: Get notified of addition/removal of devices
  * and binding/unbinding of drivers to devices.
-- 
cgit v1.2.3


From 1648993fb05c487947c1cec6307aca29d8002abe Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 15 Oct 2008 22:01:03 -0700
Subject: introduce generic header file for the software IO/TLB

A series of patches introduce a generic header file for the software
IO/TLB implementation in lib/swiotlb.c.  Currently each architecture using
this code defines the prototypes itself.  The prototypes are moved to
include/linux/swiotlb.h and this file is included in architecture specific
code for X86 and IA64.

This patch:

Create include/linux/swiotlb.h file which contains all function prototypes
for the lib/swiotlb.c file.

(akpm: the dependent patches will be trickled through arch trees)

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "Luck, Tony" <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swiotlb.h | 83 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 83 insertions(+)
 create mode 100644 include/linux/swiotlb.h

(limited to 'include/linux')

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
new file mode 100644
index 00000000000..b18ec5533e8
--- /dev/null
+++ b/include/linux/swiotlb.h
@@ -0,0 +1,83 @@
+#ifndef __LINUX_SWIOTLB_H
+#define __LINUX_SWIOTLB_H
+
+#include <linux/types.h>
+
+struct device;
+struct dma_attrs;
+struct scatterlist;
+
+extern void
+swiotlb_init(void);
+
+extern void
+*swiotlb_alloc_coherent(struct device *hwdev, size_t size,
+			dma_addr_t *dma_handle, gfp_t flags);
+
+extern void
+swiotlb_free_coherent(struct device *hwdev, size_t size,
+		      void *vaddr, dma_addr_t dma_handle);
+
+extern dma_addr_t
+swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, int dir);
+
+extern void
+swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr,
+		     size_t size, int dir);
+
+extern dma_addr_t
+swiotlb_map_single_attrs(struct device *hwdev, void *ptr, size_t size,
+			 int dir, struct dma_attrs *attrs);
+
+extern void
+swiotlb_unmap_single_attrs(struct device *hwdev, dma_addr_t dev_addr,
+			   size_t size, int dir, struct dma_attrs *attrs);
+
+extern int
+swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg, int nents,
+	       int direction);
+
+extern void
+swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
+		 int direction);
+
+extern int
+swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
+		     int dir, struct dma_attrs *attrs);
+
+extern void
+swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
+		       int nelems, int dir, struct dma_attrs *attrs);
+
+extern void
+swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
+			    size_t size, int dir);
+
+extern void
+swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
+			int nelems, int dir);
+
+extern void
+swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
+			       size_t size, int dir);
+
+extern void
+swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
+			   int nelems, int dir);
+
+extern void
+swiotlb_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
+				  unsigned long offset, size_t size, int dir);
+
+extern void
+swiotlb_sync_single_range_for_device(struct device *hwdev, dma_addr_t dev_addr,
+				     unsigned long offset, size_t size,
+				     int dir);
+
+extern int
+swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr);
+
+extern int
+swiotlb_dma_supported(struct device *hwdev, u64 mask);
+
+#endif /* __LINUX_SWIOTLB_H */
-- 
cgit v1.2.3


From 9363b9f23c9cc36cc8ef6c05fdf879ee4a96ae92 Mon Sep 17 00:00:00 2001
From: Balbir Singh <balbir@linux.vnet.ibm.com>
Date: Wed, 15 Oct 2008 22:01:05 -0700
Subject: memrlimit: cgroup mm owner callback changes to add task info

This patch adds an additional field to the mm_owner callbacks. This field
is required to get to the mm that changed. Hold mmap_sem in write mode
before calling the mm_owner_changed callback.

[hugh@veritas.com: fix mmap_sem deadlock]
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Sudhir Kumar <skumar@linux.vnet.ibm.com>
Cc: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
Cc: Paul Menage <menage@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Pavel Emelianov <xemul@openvz.org>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cgroup.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index c98dd7cb707..30934e4bfaa 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -326,7 +326,8 @@ struct cgroup_subsys {
 	 */
 	void (*mm_owner_changed)(struct cgroup_subsys *ss,
 					struct cgroup *old,
-					struct cgroup *new);
+					struct cgroup *new,
+					struct task_struct *p);
 	int subsys_id;
 	int active;
 	int disabled;
-- 
cgit v1.2.3


From 1bfcf1304ea79c46efc3724e548b13b4b442b418 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 15 Oct 2008 22:01:21 -0700
Subject: pm: rework disabling of user mode helpers during suspend/hibernation

We currently use a PM notifier to disable user mode helpers before suspend
and hibernation and to re-enable them during resume.  However, this is not
an ideal solution, because if any drivers want to upload firmware into
memory before suspend, they have to use a PM notifier for this purpose and
there is no guarantee that the ordering of PM notifiers will be as
expected (ie.  the notifier that disables user mode helpers has to be run
after the driver's notifier used for uploading the firmware).

For this reason, it seems better to move the disabling and enabling of
user mode helpers to separate functions that will be called by the PM core
as necessary.

[akpm@linux-foundation.org: remove unneeded ifdefs]
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Alan Stern <stern@rowland.harvard.edu>
Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kmod.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kmod.h b/include/linux/kmod.h
index a1a91577813..92213a9194e 100644
--- a/include/linux/kmod.h
+++ b/include/linux/kmod.h
@@ -99,4 +99,7 @@ struct file;
 extern int call_usermodehelper_pipe(char *path, char *argv[], char *envp[],
 				    struct file **filp);
 
+extern int usermodehelper_disable(void);
+extern void usermodehelper_enable(void);
+
 #endif /* __LINUX_KMOD_H__ */
-- 
cgit v1.2.3


From d5c003b4d1690e666dbab02bc8e705947baa848c Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Wed, 15 Oct 2008 22:01:24 -0700
Subject: include: replace __FUNCTION__ with __func__

__FUNCTION__ is gcc-specific, use __func__

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/ext2_fs.h     |  2 +-
 include/linux/ext3_fs.h     |  4 ++--
 include/linux/ext3_jbd.h    | 14 +++++++-------
 include/linux/jbd.h         |  4 ++--
 include/linux/jbd2.h        |  4 ++--
 include/linux/pm.h          |  2 +-
 include/linux/reiserfs_fs.h |  2 +-
 include/linux/rtmutex.h     |  2 +-
 8 files changed, 17 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ext2_fs.h b/include/linux/ext2_fs.h
index 2efe7b863cf..78c775a83f7 100644
--- a/include/linux/ext2_fs.h
+++ b/include/linux/ext2_fs.h
@@ -47,7 +47,7 @@
 #ifdef EXT2FS_DEBUG
 #	define ext2_debug(f, a...)	{ \
 					printk ("EXT2-fs DEBUG (%s, %d): %s:", \
-						__FILE__, __LINE__, __FUNCTION__); \
+						__FILE__, __LINE__, __func__); \
 				  	printk (f, ## a); \
 					}
 #else
diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
index 8120fa1bc23..159d9b476cd 100644
--- a/include/linux/ext3_fs.h
+++ b/include/linux/ext3_fs.h
@@ -43,7 +43,7 @@
 #define ext3_debug(f, a...)						\
 	do {								\
 		printk (KERN_DEBUG "EXT3-fs DEBUG (%s, %d): %s:",	\
-			__FILE__, __LINE__, __FUNCTION__);		\
+			__FILE__, __LINE__, __func__);		\
 		printk (KERN_DEBUG f, ## a);				\
 	} while (0)
 #else
@@ -871,7 +871,7 @@ extern void ext3_update_dynamic_rev (struct super_block *sb);
 #define ext3_std_error(sb, errno)				\
 do {								\
 	if ((errno))						\
-		__ext3_std_error((sb), __FUNCTION__, (errno));	\
+		__ext3_std_error((sb), __func__, (errno));	\
 } while (0)
 
 /*
diff --git a/include/linux/ext3_jbd.h b/include/linux/ext3_jbd.h
index 8c43b13a02f..cf82d519be4 100644
--- a/include/linux/ext3_jbd.h
+++ b/include/linux/ext3_jbd.h
@@ -137,17 +137,17 @@ int __ext3_journal_dirty_metadata(const char *where,
 				handle_t *handle, struct buffer_head *bh);
 
 #define ext3_journal_get_undo_access(handle, bh) \
-	__ext3_journal_get_undo_access(__FUNCTION__, (handle), (bh))
+	__ext3_journal_get_undo_access(__func__, (handle), (bh))
 #define ext3_journal_get_write_access(handle, bh) \
-	__ext3_journal_get_write_access(__FUNCTION__, (handle), (bh))
+	__ext3_journal_get_write_access(__func__, (handle), (bh))
 #define ext3_journal_revoke(handle, blocknr, bh) \
-	__ext3_journal_revoke(__FUNCTION__, (handle), (blocknr), (bh))
+	__ext3_journal_revoke(__func__, (handle), (blocknr), (bh))
 #define ext3_journal_get_create_access(handle, bh) \
-	__ext3_journal_get_create_access(__FUNCTION__, (handle), (bh))
+	__ext3_journal_get_create_access(__func__, (handle), (bh))
 #define ext3_journal_dirty_metadata(handle, bh) \
-	__ext3_journal_dirty_metadata(__FUNCTION__, (handle), (bh))
+	__ext3_journal_dirty_metadata(__func__, (handle), (bh))
 #define ext3_journal_forget(handle, bh) \
-	__ext3_journal_forget(__FUNCTION__, (handle), (bh))
+	__ext3_journal_forget(__func__, (handle), (bh))
 
 int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh);
 
@@ -160,7 +160,7 @@ static inline handle_t *ext3_journal_start(struct inode *inode, int nblocks)
 }
 
 #define ext3_journal_stop(handle) \
-	__ext3_journal_stop(__FUNCTION__, (handle))
+	__ext3_journal_stop(__func__, (handle))
 
 static inline handle_t *ext3_journal_current_handle(void)
 {
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index 07a9b52a265..7ebbcb1c9ba 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -61,7 +61,7 @@ extern u8 journal_enable_debug;
 	do {								\
 		if ((n) <= journal_enable_debug) {			\
 			printk (KERN_DEBUG "(%s, %d): %s: ",		\
-				__FILE__, __LINE__, __FUNCTION__);	\
+				__FILE__, __LINE__, __func__);	\
 			printk (f, ## a);				\
 		}							\
 	} while (0)
@@ -984,7 +984,7 @@ extern int	cleanup_journal_tail(journal_t *);
 
 #define jbd_ENOSYS() \
 do {								           \
-	printk (KERN_ERR "JBD unimplemented function %s\n", __FUNCTION__); \
+	printk (KERN_ERR "JBD unimplemented function %s\n", __func__); \
 	current->state = TASK_UNINTERRUPTIBLE;			           \
 	schedule();						           \
 } while (1)
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index d2e91ea998f..463d6f10b64 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -61,7 +61,7 @@ extern u8 jbd2_journal_enable_debug;
 	do {								\
 		if ((n) <= jbd2_journal_enable_debug) {			\
 			printk (KERN_DEBUG "(%s, %d): %s: ",		\
-				__FILE__, __LINE__, __FUNCTION__);	\
+				__FILE__, __LINE__, __func__);	\
 			printk (f, ## a);				\
 		}							\
 	} while (0)
@@ -1143,7 +1143,7 @@ extern int	jbd2_cleanup_journal_tail(journal_t *);
 
 #define jbd_ENOSYS() \
 do {								           \
-	printk (KERN_ERR "JBD unimplemented function %s\n", __FUNCTION__); \
+	printk (KERN_ERR "JBD unimplemented function %s\n", __func__); \
 	current->state = TASK_UNINTERRUPTIBLE;			           \
 	schedule();						           \
 } while (1)
diff --git a/include/linux/pm.h b/include/linux/pm.h
index 4dcce54b6d7..42de4003c4e 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -419,7 +419,7 @@ extern void __suspend_report_result(const char *function, void *fn, int ret);
 
 #define suspend_report_result(fn, ret)					\
 	do {								\
-		__suspend_report_result(__FUNCTION__, fn, ret);		\
+		__suspend_report_result(__func__, fn, ret);		\
 	} while (0)
 
 #else /* !CONFIG_PM_SLEEP */
diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h
index e9963af16cd..bc5114d35e9 100644
--- a/include/linux/reiserfs_fs.h
+++ b/include/linux/reiserfs_fs.h
@@ -87,7 +87,7 @@ void reiserfs_warning(struct super_block *s, const char *fmt, ...);
 if( !( cond ) ) 								\
   reiserfs_panic( NULL, "reiserfs[%i]: assertion " scond " failed at "	\
 		  __FILE__ ":%i:%s: " format "\n",		\
-		  in_interrupt() ? -1 : task_pid_nr(current), __LINE__ , __FUNCTION__ , ##args )
+		  in_interrupt() ? -1 : task_pid_nr(current), __LINE__ , __func__ , ##args )
 
 #define RASSERT(cond, format, args...) __RASSERT(cond, #cond, format, ##args)
 
diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
index 382bb795116..f19b00b7d53 100644
--- a/include/linux/rtmutex.h
+++ b/include/linux/rtmutex.h
@@ -54,7 +54,7 @@ struct hrtimer_sleeper;
 #ifdef CONFIG_DEBUG_RT_MUTEXES
 # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
 	, .name = #mutexname, .file = __FILE__, .line = __LINE__
-# define rt_mutex_init(mutex)			__rt_mutex_init(mutex, __FUNCTION__)
+# define rt_mutex_init(mutex)			__rt_mutex_init(mutex, __func__)
  extern void rt_mutex_debug_task_free(struct task_struct *tsk);
 #else
 # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
-- 
cgit v1.2.3


From 693ac389326a87d608baa2902c45a6e78ed46681 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Wed, 15 Oct 2008 22:01:25 -0700
Subject: include/linux/mount.h: remove CVS keyword

Remove a CVS keyword that wasn't updated for a long time from a comment.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mount.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mount.h b/include/linux/mount.h
index 30a1d63b6fb..cab2a85e2ee 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -5,8 +5,6 @@
  *
  * Author:  Marco van Wieringen <mvw@planets.elm.net>
  *
- * Version: $Id: mount.h,v 2.0 1996/11/17 16:48:14 mvw Exp mvw $
- *
  */
 #ifndef _LINUX_MOUNT_H
 #define _LINUX_MOUNT_H
-- 
cgit v1.2.3


From 1ecfea06386c6b1344e83c8f909c87c88262ba1d Mon Sep 17 00:00:00 2001
From: "Robert P. J. Day" <rpjday@crashcourse.ca>
Date: Wed, 15 Oct 2008 22:01:27 -0700
Subject: init.h: remove long-dead __setup_null_param() macro

This macro appears to have been unused for ages, and there are no
invocations of it anywhere in the source tree.

Signed-off-by: Robert P. J. Day <rpjday@crashcourse.ca>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/init.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/init.h b/include/linux/init.h
index 93538b696e3..ad63824460e 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -233,9 +233,6 @@ struct obs_kernel_param {
 		__attribute__((aligned((sizeof(long)))))	\
 		= { __setup_str_##unique_id, fn, early }
 
-#define __setup_null_param(str, unique_id)			\
-	__setup_param(str, unique_id, NULL, 0)
-
 #define __setup(str, fn)					\
 	__setup_param(str, fn, fn, 0)
 
@@ -296,7 +293,6 @@ void __init parse_early_param(void);
 	void cleanup_module(void) __attribute__((alias(#exitfn)));
 
 #define __setup_param(str, unique_id, fn)	/* nothing */
-#define __setup_null_param(str, unique_id) 	/* nothing */
 #define __setup(str, func) 			/* nothing */
 #endif
 
-- 
cgit v1.2.3


From c80cfb0406c01bb5da91bfe30f5cb1fd96831138 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bjorn.helgaas@hp.com>
Date: Wed, 15 Oct 2008 22:01:35 -0700
Subject: vsprintf: use new vsprintf symbolic function pointer format

Use the '%pF' format to get rid of an "#ifdef DEBUG" and make some printks
atomic.

This removes the last in-tree uses of print_fn_descriptor_symbol().  I
marked print_fn_descriptor_symbol() deprecated and scheduled it for
removal next year to give time for out-of-tree modules to be updated.

parisc's print_fn_descriptor_symbol() is currently broken there (it needs
to dereference the function pointer similar to ia64 and power).  This
patch shouldn't make anything worse, but it means we need to fix
dereference_function_descriptor() instead of print_fn_descriptor_symbol()
to get meaningful initcall_debug output.

Signed-off-by: Bjorn Helgaas <bjorn.helgaas@hp.com>
Cc: Jesse Barnes <jbarnes@virtuousgeek.org>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kallsyms.h | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h
index b9614488744..f3fe34391d8 100644
--- a/include/linux/kallsyms.h
+++ b/include/linux/kallsyms.h
@@ -93,12 +93,10 @@ static inline void print_symbol(const char *fmt, unsigned long addr)
 }
 
 /*
- * Pretty-print a function pointer.
- *
- * ia64 and ppc64 function pointers are really function descriptors,
- * which contain a pointer the real address.
+ * Pretty-print a function pointer.  This function is deprecated.
+ * Please use the "%pF" vsprintf format instead.
  */
-static inline void print_fn_descriptor_symbol(const char *fmt, void *addr)
+static inline void __deprecated print_fn_descriptor_symbol(const char *fmt, void *addr)
 {
 #if defined(CONFIG_IA64) || defined(CONFIG_PPC64)
 	addr = *(void **)addr;
-- 
cgit v1.2.3


From a25d644fc0e232f242d1f3baa63c149c42536ff0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 15 Oct 2008 22:01:38 -0700
Subject: wait: kill is_sync_wait()

is_sync_wait() is used to distinguish between sync and async waits.
Basically sync waits are the ones initialized with init_waitqueue_entry()
and async ones with init_waitqueue_func_entry().  The sync/async
distinction is used only in prepare_to_wait[_exclusive]() and its only
function is to skip setting the current task state if the wait is async.
This has a few problems.

* No one uses it.  None of func_entry users use prepare_to_wait()
  functions, so the code path never gets executed.

* The distinction is bogus.  Maybe it made sense back when func_entry
  was used only by aio, but it's now also used by epoll and in future
  possibly by 9p and poll/select.

* Taking @state as argument and ignoring it silently depending on how
  @wait is initialized is just a bad error-prone API.

* It prevents func_entry waits from using wait->private for no good
  reason.

This patch kills is_sync_wait() and the associated code paths from
prepare_to_wait[_exclusive]().  As there was no user of these code paths,
this patch doesn't cause any behavior difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/wait.h | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 0081147a9fe..ef609f842fa 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -108,15 +108,6 @@ static inline int waitqueue_active(wait_queue_head_t *q)
 	return !list_empty(&q->task_list);
 }
 
-/*
- * Used to distinguish between sync and async io wait context:
- * sync i/o typically specifies a NULL wait queue entry or a wait
- * queue entry bound to a task (current task) to wake up.
- * aio specifies a wait queue entry with an async notification
- * callback routine, not associated with any task.
- */
-#define is_sync_wait(wait)	(!(wait) || ((wait)->private))
-
 extern void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait);
 extern void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait);
 extern void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait);
-- 
cgit v1.2.3


From 25ddbb18aae33ad255eb9f35aacebe3af01e1e9c Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Wed, 15 Oct 2008 22:01:41 -0700
Subject: Make the taint flags reliable

It's somewhat unlikely that it happens, but right now a race window
between interrupts or machine checks or oopses could corrupt the tainted
bitmap because it is modified in a non atomic fashion.

Convert the taint variable to an unsigned long and use only atomic bit
operations on it.

Unfortunately this means the intvec sysctl functions cannot be used on it
anymore.

It turned out the taint sysctl handler could actually be simplified a bit
(since it only increases capabilities) so this patch actually removes
code.

[akpm@linux-foundation.org: remove unneeded include]
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kernel.h | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 75d81f157d2..e971c55f45a 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -235,9 +235,10 @@ extern int oops_in_progress;		/* If set, an oops, panic(), BUG() or die() is in
 extern int panic_timeout;
 extern int panic_on_oops;
 extern int panic_on_unrecovered_nmi;
-extern int tainted;
 extern const char *print_tainted(void);
-extern void add_taint(unsigned);
+extern void add_taint(unsigned flag);
+extern int test_taint(unsigned flag);
+extern unsigned long get_taint(void);
 extern int root_mountflags;
 
 /* Values used for system_state */
@@ -250,16 +251,16 @@ extern enum system_states {
 	SYSTEM_SUSPEND_DISK,
 } system_state;
 
-#define TAINT_PROPRIETARY_MODULE	(1<<0)
-#define TAINT_FORCED_MODULE		(1<<1)
-#define TAINT_UNSAFE_SMP		(1<<2)
-#define TAINT_FORCED_RMMOD		(1<<3)
-#define TAINT_MACHINE_CHECK		(1<<4)
-#define TAINT_BAD_PAGE			(1<<5)
-#define TAINT_USER			(1<<6)
-#define TAINT_DIE			(1<<7)
-#define TAINT_OVERRIDDEN_ACPI_TABLE	(1<<8)
-#define TAINT_WARN			(1<<9)
+#define TAINT_PROPRIETARY_MODULE	0
+#define TAINT_FORCED_MODULE		1
+#define TAINT_UNSAFE_SMP		2
+#define TAINT_FORCED_RMMOD		3
+#define TAINT_MACHINE_CHECK		4
+#define TAINT_BAD_PAGE			5
+#define TAINT_USER			6
+#define TAINT_DIE			7
+#define TAINT_OVERRIDDEN_ACPI_TABLE	8
+#define TAINT_WARN			9
 
 extern void dump_stack(void) __cold;
 
-- 
cgit v1.2.3


From 22b8ce94708f7cdf0b04965c6f7443dfd374c35c Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave@linux.vnet.ibm.com>
Date: Wed, 15 Oct 2008 22:01:46 -0700
Subject: profiling: dynamically enable readprofile at runtime

Way too often, I have a machine that exhibits some kind of crappy
behavior.  The CPU looks wedged in the kernel or it is spending way too
much system time and I wonder what is responsible.

I try to run readprofile.  But, of course, Ubuntu doesn't enable it by
default.  Dang!

The reason we boot-time enable it is that it takes a big buffer that we
generally can only bootmem alloc.  But, does it hurt to at least try and
runtime-alloc it?

To use:
echo 2 > /sys/kernel/profile

Then run readprofile like normal.

This should fix the compile issue with allmodconfig.  I've compile-tested
on a bunch more configs now including a few more architectures.

Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/profile.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/profile.h b/include/linux/profile.h
index 7e7087239af..570045053ce 100644
--- a/include/linux/profile.h
+++ b/include/linux/profile.h
@@ -35,7 +35,9 @@ enum profile_type {
 extern int prof_on __read_mostly;
 
 /* init basic kernel profiler */
-void __init profile_init(void);
+int profile_init(void);
+int profile_setup(char *str);
+int create_proc_profile(void);
 void profile_tick(int type);
 
 /*
@@ -84,9 +86,9 @@ struct pt_regs;
 
 #define prof_on 0
 
-static inline void profile_init(void)
+static inline int profile_init(void)
 {
-	return;
+	return 0;
 }
 
 static inline void profile_tick(int type)
-- 
cgit v1.2.3


From e1f8e87449147ffe5ea3de64a46af7de450ce279 Mon Sep 17 00:00:00 2001
From: Francois Cami <francois.cami@free.fr>
Date: Wed, 15 Oct 2008 22:01:59 -0700
Subject: Remove Andrew Morton's old email accounts

People can use the real name as an index into MAINTAINERS to find the
current email address.

Signed-off-by: Francois Cami <francois.cami@free.fr>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/journal-head.h       | 2 +-
 include/linux/task_io_accounting.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/journal-head.h b/include/linux/journal-head.h
index 8a62d1e84b9..bb70ebb6a2d 100644
--- a/include/linux/journal-head.h
+++ b/include/linux/journal-head.h
@@ -3,7 +3,7 @@
  *
  * buffer_head fields for JBD
  *
- * 27 May 2001 Andrew Morton <akpm@digeo.com>
+ * 27 May 2001 Andrew Morton
  *	Created - pulled out of fs.h
  */
 
diff --git a/include/linux/task_io_accounting.h b/include/linux/task_io_accounting.h
index 5e88afc9a2f..bdf855c2856 100644
--- a/include/linux/task_io_accounting.h
+++ b/include/linux/task_io_accounting.h
@@ -5,7 +5,7 @@
  * Don't include this header file directly - it is designed to be dragged in via
  * sched.h.
  *
- * Blame akpm@osdl.org for all this.
+ * Blame Andrew Morton for all this.
  */
 
 struct task_io_accounting {
-- 
cgit v1.2.3


From f7ad160b49c49dc9cd383b9184c6fa4a9b4f7ebb Mon Sep 17 00:00:00 2001
From: Alex Raimondi <raimondi@miromico.ch>
Date: Wed, 15 Oct 2008 22:02:03 -0700
Subject: include/linux/clk.h: fix comment

clk_get and clk_put may not be used from within interrupt context.  Update
the comments on both functions to say so.

Signed-off-by: Alex Raimondi <raimondi@miromico.ch>
Signed-off-by: Haavard Skinnemoen <haavard.skinnemoen@atmel.com>
Acked-by: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: john stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/clk.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/clk.h b/include/linux/clk.h
index 5ca8c6fddb5..778777316ea 100644
--- a/include/linux/clk.h
+++ b/include/linux/clk.h
@@ -35,6 +35,8 @@ struct clk;
  * clk_get may return different clock producers depending on @dev.)
  *
  * Drivers must assume that the clock source is not enabled.
+ *
+ * clk_get should not be called from within interrupt context.
  */
 struct clk *clk_get(struct device *dev, const char *id);
 
@@ -76,6 +78,8 @@ unsigned long clk_get_rate(struct clk *clk);
  * Note: drivers must ensure that all clk_enable calls made on this
  * clock source are balanced by clk_disable calls prior to calling
  * this function.
+ *
+ * clk_put should not be called from within interrupt context.
  */
 void clk_put(struct clk *clk);
 
-- 
cgit v1.2.3


From f7a5000f7a8924e9c5fad1801616601d6dc65a17 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 15 Oct 2008 22:02:05 -0700
Subject: compat: move cp_compat_stat to common code

struct stat / compat_stat is the same on all architectures, so
cp_compat_stat should be, too.

Turns out it is, except that various architectures differ slightly and some
use high2lowuid/high2lowgid or the direct assignment instead of the
SET_UID/SET_GID that expands to the correct one anyway.

This patch replaces the arch-specific cp_compat_stat implementations with
a common one based on the x86-64 one.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: David S. Miller <davem@davemloft.net> [ sparc bits ]
Acked-by: Kyle McMartin <kyle@mcmartin.ca> [ parisc bits ]
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compat.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/compat.h b/include/linux/compat.h
index cf8d11cad5a..999dddd8d93 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -78,7 +78,6 @@ typedef struct {
 	compat_sigset_word	sig[_COMPAT_NSIG_WORDS];
 } compat_sigset_t;
 
-extern int cp_compat_stat(struct kstat *, struct compat_stat __user *);
 extern int get_compat_timespec(struct timespec *, const struct compat_timespec __user *);
 extern int put_compat_timespec(const struct timespec *, struct compat_timespec __user *);
 
-- 
cgit v1.2.3


From b418da16dd44810e5d5a22bba377cca80512a524 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 15 Oct 2008 22:02:06 -0700
Subject: compat: generic compat get/settimeofday

Nothing arch specific in get/settimeofday.  The details of the timeval
conversion varied a little from arch to arch, but all with the same
results.

Also add an extern declaration for sys_tz to linux/time.h because externs
in .c files are frowned upon.  I'll kill the externs in various other files
in a separate patch.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: David S. Miller <davem@davemloft.net> [ sparc bits ]
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Acked-by: Kyle McMartin <kyle@mcmartin.ca>
Cc: Matthew Wilcox <matthew@wil.cx>
Cc: Grant Grundler <grundler@parisc-linux.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compat.h | 5 +++++
 include/linux/time.h   | 2 ++
 2 files changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/compat.h b/include/linux/compat.h
index 999dddd8d93..f061a1ea1b7 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -234,6 +234,11 @@ extern int get_compat_itimerspec(struct itimerspec *dst,
 extern int put_compat_itimerspec(struct compat_itimerspec __user *dst,
 				 const struct itimerspec *src);
 
+asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
+		struct timezone __user *tz);
+asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
+		struct timezone __user *tz);
+
 asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp);
 
 extern int compat_printk(const char *fmt, ...);
diff --git a/include/linux/time.h b/include/linux/time.h
index e15206a7e82..51e883df0fa 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -29,6 +29,8 @@ struct timezone {
 
 #ifdef __KERNEL__
 
+extern struct timezone sys_tz;
+
 /* Parameters used to convert the timespec values: */
 #define MSEC_PER_SEC	1000L
 #define USEC_PER_MSEC	1000L
-- 
cgit v1.2.3


From 56d936607408d71c4141b2ed501410b072f1e211 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 15 Oct 2008 22:02:10 -0700
Subject: introduce generic iommu_num_pages function

This patch introduces the generic iommu_num_pages function. It can be used to
calculate the number of IO pages needed to map a given memory area.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Muli Ben-Yehuda <muli@il.ibm.com>
Cc: Dave Airlie <airlied@linux.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/iommu-helper.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iommu-helper.h b/include/linux/iommu-helper.h
index a6d0586e2bf..3b068e5b567 100644
--- a/include/linux/iommu-helper.h
+++ b/include/linux/iommu-helper.h
@@ -23,4 +23,7 @@ extern unsigned long iommu_area_alloc(unsigned long *map, unsigned long size,
 extern void iommu_area_free(unsigned long *map, unsigned long start,
 			    unsigned int nr);
 
+extern unsigned long iommu_num_pages(unsigned long addr, unsigned long len,
+				     unsigned long io_page_size);
+
 #endif
-- 
cgit v1.2.3


From 53112488bebe25c0f5f8a002470046c0fe9a6c61 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill@shutemov.name>
Date: Wed, 15 Oct 2008 22:02:37 -0700
Subject: alpha: introduce field 'taso' into struct linux_binprm

This change is Alpha-specific.  It adds field 'taso' into struct
linux_binprm to remember if the application is TASO.  Previously, field
sh_bang was used for this purpose.

Signed-off-by: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/binfmts.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 826f6235080..54980a3c760 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -36,6 +36,9 @@ struct linux_binprm{
 	unsigned long p; /* current top of mem */
 	unsigned int sh_bang:1,
 		     misc_bang:1;
+#ifdef __alpha__
+	unsigned int taso:1;
+#endif
 	struct file * file;
 	int e_uid, e_gid;
 	kernel_cap_t cap_post_exec_permitted;
-- 
cgit v1.2.3


From bf2a9a39639b8b51377905397a5005f444e9a892 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill@shutemov.name>
Date: Wed, 15 Oct 2008 22:02:39 -0700
Subject: Allow recursion in binfmt_script and binfmt_misc

binfmt_script and binfmt_misc disallow recursion to avoid stack overflow
using sh_bang and misc_bang.  This causes problems in some cases:

$ echo '#!/bin/ls' > /tmp/t0
$ echo '#!/tmp/t0' > /tmp/t1
$ echo '#!/tmp/t1' > /tmp/t2
$ chmod +x /tmp/t*
$ /tmp/t2
zsh: exec format error: /tmp/t2

Similar problem with binfmt_misc.

This patch introduces field 'recursion_depth' into struct linux_binprm to
track recursion level in binfmt_misc and binfmt_script.  If the recursion
level is more than BINPRM_MAX_RECURSION, it generates -ENOEXEC.

[akpm@linux-foundation.org: make linux_binprm.recursion_depth a uint]
Signed-off-by: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/binfmts.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 54980a3c760..7394b5b349f 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -39,6 +39,7 @@ struct linux_binprm{
 #ifdef __alpha__
 	unsigned int taso:1;
 #endif
+	unsigned int recursion_depth;
 	struct file * file;
 	int e_uid, e_gid;
 	kernel_cap_t cap_post_exec_permitted;
@@ -61,6 +62,7 @@ struct linux_binprm{
 #define BINPRM_FLAGS_EXECFD_BIT 1
 #define BINPRM_FLAGS_EXECFD (1 << BINPRM_FLAGS_EXECFD_BIT)
 
+#define BINPRM_MAX_RECURSION 4
 
 /*
  * This structure defines the functions that are used to load the binary formats that
-- 
cgit v1.2.3


From 2bec19feabd53cba75e9dab0e79afbe868a37113 Mon Sep 17 00:00:00 2001
From: Lennert Buytenhek <buytenh@wantstofly.org>
Date: Wed, 15 Oct 2008 22:02:44 -0700
Subject: orion_spi: handle 88F6183 erratum

Add support to orion_spi for the 88F6183 ARM SoC by adding code to work
around a 6183-specific erratum.

Signed-off-by: Lennert Buytenhek <buytenh@marvell.com>
Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/spi/orion_spi.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/spi/orion_spi.h b/include/linux/spi/orion_spi.h
index b4d9fa6f797..decf6d8c77b 100644
--- a/include/linux/spi/orion_spi.h
+++ b/include/linux/spi/orion_spi.h
@@ -11,6 +11,7 @@
 
 struct orion_spi_info {
 	u32	tclk;		/* no <linux/clk.h> support yet */
+	u32	enable_clock_fix;
 };
 
 
-- 
cgit v1.2.3


From 9d793b0bcbbbc37d80241862dfa5257963d5415e Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@redhat.com>
Date: Wed, 15 Oct 2008 22:02:47 -0700
Subject: i2o: Fix 32/64bit DMA locking

The I2O ioctls assume 32bits.  In itself that is fine as they are old
cards and nobody uses 64bit.  However on LKML it was noted this
assumption is also made for allocated memory and is unsafe on 64bit
systems.

Fixing this is a mess.  It turns out there is tons of crap buried in a
header file that does racy 32/64bit filtering on the masks.

So we:
- Verify all callers of the racy code can sleep (i2o_dma_[re]alloc)
- Move the code into a new i2o/memory.c file
- Remove the gfp_mask argument so nobody can try and misuse the function
- Wrap a mutex around the problem area (a single mutex is easy to do and
  none of this is performance relevant)
- Switch the remaining problem kmalloc holdout to use i2o_dma_alloc

Cc: Markus Lidel <Markus.Lidel@shadowconnect.com>
Cc: Vasily Averin <vvs@sw.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/i2o.h | 292 +++-------------------------------------------------
 1 file changed, 12 insertions(+), 280 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/i2o.h b/include/linux/i2o.h
index 75ae6d8aba4..4c4e57d1f19 100644
--- a/include/linux/i2o.h
+++ b/include/linux/i2o.h
@@ -570,7 +570,6 @@ struct i2o_controller {
 #endif
 	spinlock_t lock;	/* lock for controller
 				   configuration */
-
 	void *driver_data[I2O_MAX_DRIVERS];	/* storage for drivers */
 };
 
@@ -691,289 +690,22 @@ static inline u32 i2o_dma_high(dma_addr_t dma_addr)
 };
 #endif
 
-/**
- *	i2o_sg_tablesize - Calculate the maximum number of elements in a SGL
- *	@c: I2O controller for which the calculation should be done
- *	@body_size: maximum body size used for message in 32-bit words.
- *
- *	Return the maximum number of SG elements in a SG list.
- */
-static inline u16 i2o_sg_tablesize(struct i2o_controller *c, u16 body_size)
-{
-	i2o_status_block *sb = c->status_block.virt;
-	u16 sg_count =
-	    (sb->inbound_frame_size - sizeof(struct i2o_message) / 4) -
-	    body_size;
-
-	if (c->pae_support) {
-		/*
-		 * for 64-bit a SG attribute element must be added and each
-		 * SG element needs 12 bytes instead of 8.
-		 */
-		sg_count -= 2;
-		sg_count /= 3;
-	} else
-		sg_count /= 2;
-
-	if (c->short_req && (sg_count > 8))
-		sg_count = 8;
-
-	return sg_count;
-};
-
-/**
- *	i2o_dma_map_single - Map pointer to controller and fill in I2O message.
- *	@c: I2O controller
- *	@ptr: pointer to the data which should be mapped
- *	@size: size of data in bytes
- *	@direction: DMA_TO_DEVICE / DMA_FROM_DEVICE
- *	@sg_ptr: pointer to the SG list inside the I2O message
- *
- *	This function does all necessary DMA handling and also writes the I2O
- *	SGL elements into the I2O message. For details on DMA handling see also
- *	dma_map_single(). The pointer sg_ptr will only be set to the end of the
- *	SG list if the allocation was successful.
- *
- *	Returns DMA address which must be checked for failures using
- *	dma_mapping_error().
- */
-static inline dma_addr_t i2o_dma_map_single(struct i2o_controller *c, void *ptr,
+extern u16 i2o_sg_tablesize(struct i2o_controller *c, u16 body_size);
+extern dma_addr_t i2o_dma_map_single(struct i2o_controller *c, void *ptr,
 					    size_t size,
 					    enum dma_data_direction direction,
-					    u32 ** sg_ptr)
-{
-	u32 sg_flags;
-	u32 *mptr = *sg_ptr;
-	dma_addr_t dma_addr;
-
-	switch (direction) {
-	case DMA_TO_DEVICE:
-		sg_flags = 0xd4000000;
-		break;
-	case DMA_FROM_DEVICE:
-		sg_flags = 0xd0000000;
-		break;
-	default:
-		return 0;
-	}
-
-	dma_addr = dma_map_single(&c->pdev->dev, ptr, size, direction);
-	if (!dma_mapping_error(&c->pdev->dev, dma_addr)) {
-#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
-		if ((sizeof(dma_addr_t) > 4) && c->pae_support) {
-			*mptr++ = cpu_to_le32(0x7C020002);
-			*mptr++ = cpu_to_le32(PAGE_SIZE);
-		}
-#endif
-
-		*mptr++ = cpu_to_le32(sg_flags | size);
-		*mptr++ = cpu_to_le32(i2o_dma_low(dma_addr));
-#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
-		if ((sizeof(dma_addr_t) > 4) && c->pae_support)
-			*mptr++ = cpu_to_le32(i2o_dma_high(dma_addr));
-#endif
-		*sg_ptr = mptr;
-	}
-	return dma_addr;
-};
-
-/**
- *	i2o_dma_map_sg - Map a SG List to controller and fill in I2O message.
- *	@c: I2O controller
- *	@sg: SG list to be mapped
- *	@sg_count: number of elements in the SG list
- *	@direction: DMA_TO_DEVICE / DMA_FROM_DEVICE
- *	@sg_ptr: pointer to the SG list inside the I2O message
- *
- *	This function does all necessary DMA handling and also writes the I2O
- *	SGL elements into the I2O message. For details on DMA handling see also
- *	dma_map_sg(). The pointer sg_ptr will only be set to the end of the SG
- *	list if the allocation was successful.
- *
- *	Returns 0 on failure or 1 on success.
- */
-static inline int i2o_dma_map_sg(struct i2o_controller *c,
+					    u32 ** sg_ptr);
+extern int i2o_dma_map_sg(struct i2o_controller *c,
 				 struct scatterlist *sg, int sg_count,
 				 enum dma_data_direction direction,
-				 u32 ** sg_ptr)
-{
-	u32 sg_flags;
-	u32 *mptr = *sg_ptr;
-
-	switch (direction) {
-	case DMA_TO_DEVICE:
-		sg_flags = 0x14000000;
-		break;
-	case DMA_FROM_DEVICE:
-		sg_flags = 0x10000000;
-		break;
-	default:
-		return 0;
-	}
-
-	sg_count = dma_map_sg(&c->pdev->dev, sg, sg_count, direction);
-	if (!sg_count)
-		return 0;
-
-#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
-	if ((sizeof(dma_addr_t) > 4) && c->pae_support) {
-		*mptr++ = cpu_to_le32(0x7C020002);
-		*mptr++ = cpu_to_le32(PAGE_SIZE);
-	}
-#endif
-
-	while (sg_count-- > 0) {
-		if (!sg_count)
-			sg_flags |= 0xC0000000;
-		*mptr++ = cpu_to_le32(sg_flags | sg_dma_len(sg));
-		*mptr++ = cpu_to_le32(i2o_dma_low(sg_dma_address(sg)));
-#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
-		if ((sizeof(dma_addr_t) > 4) && c->pae_support)
-			*mptr++ = cpu_to_le32(i2o_dma_high(sg_dma_address(sg)));
-#endif
-		sg = sg_next(sg);
-	}
-	*sg_ptr = mptr;
-
-	return 1;
-};
-
-/**
- *	i2o_dma_alloc - Allocate DMA memory
- *	@dev: struct device pointer to the PCI device of the I2O controller
- *	@addr: i2o_dma struct which should get the DMA buffer
- *	@len: length of the new DMA memory
- *	@gfp_mask: GFP mask
- *
- *	Allocate a coherent DMA memory and write the pointers into addr.
- *
- *	Returns 0 on success or -ENOMEM on failure.
- */
-static inline int i2o_dma_alloc(struct device *dev, struct i2o_dma *addr,
-				size_t len, gfp_t gfp_mask)
-{
-	struct pci_dev *pdev = to_pci_dev(dev);
-	int dma_64 = 0;
-
-	if ((sizeof(dma_addr_t) > 4) && (pdev->dma_mask == DMA_64BIT_MASK)) {
-		dma_64 = 1;
-		if (pci_set_dma_mask(pdev, DMA_32BIT_MASK))
-			return -ENOMEM;
-	}
-
-	addr->virt = dma_alloc_coherent(dev, len, &addr->phys, gfp_mask);
-
-	if ((sizeof(dma_addr_t) > 4) && dma_64)
-		if (pci_set_dma_mask(pdev, DMA_64BIT_MASK))
-			printk(KERN_WARNING "i2o: unable to set 64-bit DMA");
-
-	if (!addr->virt)
-		return -ENOMEM;
-
-	memset(addr->virt, 0, len);
-	addr->len = len;
-
-	return 0;
-};
-
-/**
- *	i2o_dma_free - Free DMA memory
- *	@dev: struct device pointer to the PCI device of the I2O controller
- *	@addr: i2o_dma struct which contains the DMA buffer
- *
- *	Free a coherent DMA memory and set virtual address of addr to NULL.
- */
-static inline void i2o_dma_free(struct device *dev, struct i2o_dma *addr)
-{
-	if (addr->virt) {
-		if (addr->phys)
-			dma_free_coherent(dev, addr->len, addr->virt,
-					  addr->phys);
-		else
-			kfree(addr->virt);
-		addr->virt = NULL;
-	}
-};
-
-/**
- *	i2o_dma_realloc - Realloc DMA memory
- *	@dev: struct device pointer to the PCI device of the I2O controller
- *	@addr: pointer to a i2o_dma struct DMA buffer
- *	@len: new length of memory
- *	@gfp_mask: GFP mask
- *
- *	If there was something allocated in the addr, free it first. If len > 0
- *	than try to allocate it and write the addresses back to the addr
- *	structure. If len == 0 set the virtual address to NULL.
- *
- *	Returns the 0 on success or negative error code on failure.
- */
-static inline int i2o_dma_realloc(struct device *dev, struct i2o_dma *addr,
-				  size_t len, gfp_t gfp_mask)
-{
-	i2o_dma_free(dev, addr);
-
-	if (len)
-		return i2o_dma_alloc(dev, addr, len, gfp_mask);
-
-	return 0;
-};
-
-/*
- *	i2o_pool_alloc - Allocate an slab cache and mempool
- *	@mempool: pointer to struct i2o_pool to write data into.
- *	@name: name which is used to identify cache
- *	@size: size of each object
- *	@min_nr: minimum number of objects
- *
- *	First allocates a slab cache with name and size. Then allocates a
- *	mempool which uses the slab cache for allocation and freeing.
- *
- *	Returns 0 on success or negative error code on failure.
- */
-static inline int i2o_pool_alloc(struct i2o_pool *pool, const char *name,
-				 size_t size, int min_nr)
-{
-	pool->name = kmalloc(strlen(name) + 1, GFP_KERNEL);
-	if (!pool->name)
-		goto exit;
-	strcpy(pool->name, name);
-
-	pool->slab =
-	    kmem_cache_create(pool->name, size, 0, SLAB_HWCACHE_ALIGN, NULL);
-	if (!pool->slab)
-		goto free_name;
-
-	pool->mempool = mempool_create_slab_pool(min_nr, pool->slab);
-	if (!pool->mempool)
-		goto free_slab;
-
-	return 0;
-
-      free_slab:
-	kmem_cache_destroy(pool->slab);
-
-      free_name:
-	kfree(pool->name);
-
-      exit:
-	return -ENOMEM;
-};
-
-/*
- *	i2o_pool_free - Free slab cache and mempool again
- *	@mempool: pointer to struct i2o_pool which should be freed
- *
- *	Note that you have to return all objects to the mempool again before
- *	calling i2o_pool_free().
- */
-static inline void i2o_pool_free(struct i2o_pool *pool)
-{
-	mempool_destroy(pool->mempool);
-	kmem_cache_destroy(pool->slab);
-	kfree(pool->name);
-};
-
+				 u32 ** sg_ptr);
+extern int i2o_dma_alloc(struct device *dev, struct i2o_dma *addr, size_t len);
+extern void i2o_dma_free(struct device *dev, struct i2o_dma *addr);
+extern int i2o_dma_realloc(struct device *dev, struct i2o_dma *addr,
+								size_t len);
+extern int i2o_pool_alloc(struct i2o_pool *pool, const char *name,
+				 size_t size, int min_nr);
+extern void i2o_pool_free(struct i2o_pool *pool);
 /* I2O driver (OSM) functions */
 extern int i2o_driver_register(struct i2o_driver *);
 extern void i2o_driver_unregister(struct i2o_driver *);
-- 
cgit v1.2.3


From bb979d7fc360bc37cbaff43a6fafceb897cb5e47 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Wed, 15 Oct 2008 22:02:52 -0700
Subject: autofs4: cleanup autofs mount type usage

Usage of the AUTOFS_TYPE_* defines is a little confusing and appears
inconsistent.

Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/auto_fs4.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/auto_fs4.h b/include/linux/auto_fs4.h
index b785c6f8644..aa96a04ae02 100644
--- a/include/linux/auto_fs4.h
+++ b/include/linux/auto_fs4.h
@@ -29,6 +29,11 @@
 #define AUTOFS_EXP_IMMEDIATE		1
 #define AUTOFS_EXP_LEAVES		2
 
+#define AUTOFS_TYPE_ANY			0x0000
+#define AUTOFS_TYPE_INDIRECT		0x0001
+#define AUTOFS_TYPE_DIRECT		0x0002
+#define AUTOFS_TYPE_OFFSET		0x0004
+
 /* Daemon notification packet types */
 enum autofs_notify {
 	NFY_NONE,
-- 
cgit v1.2.3


From 8d7b48e0bc5fa01a818eac713d4cb0763090cd0e Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Wed, 15 Oct 2008 22:02:54 -0700
Subject: autofs4: add miscellaneous device for ioctls

Add a miscellaneous device to the autofs4 module for routing ioctls.  This
provides the ability to obtain an ioctl file handle for an autofs mount
point that is possibly covered by another mount.

The actual problem with autofs is that it can't reconnect to existing
mounts.  Immediately one thinks that just adding the ability to remount
autofs file systems would solve it, but alas, that can't work.  This is
because autofs direct mounts and the implementation of "on demand mount
and expire" of nested mount trees have the file system mounted on top of
the mount trigger dentry.

To resolve this a miscellaneous device node for routing ioctl commands to
these mount points has been implemented in the autofs4 kernel module and a
library added to autofs.  This provides the ability to open a file
descriptor for these over mounted autofs mount points.

Please refer to Documentation/filesystems/autofs4-mount-control.txt for a
discussion of the problem, implementation alternatives considered and a
description of the interface.

[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: build fix]
Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/auto_dev-ioctl.h | 157 +++++++++++++++++++++++++++++++++++++++++
 include/linux/auto_fs4.h       |   2 +-
 2 files changed, 158 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/auto_dev-ioctl.h

(limited to 'include/linux')

diff --git a/include/linux/auto_dev-ioctl.h b/include/linux/auto_dev-ioctl.h
new file mode 100644
index 00000000000..f4d05ccd731
--- /dev/null
+++ b/include/linux/auto_dev-ioctl.h
@@ -0,0 +1,157 @@
+/*
+ * Copyright 2008 Red Hat, Inc. All rights reserved.
+ * Copyright 2008 Ian Kent <raven@themaw.net>
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ */
+
+#ifndef _LINUX_AUTO_DEV_IOCTL_H
+#define _LINUX_AUTO_DEV_IOCTL_H
+
+#include <linux/types.h>
+
+#define AUTOFS_DEVICE_NAME		"autofs"
+
+#define AUTOFS_DEV_IOCTL_VERSION_MAJOR	1
+#define AUTOFS_DEV_IOCTL_VERSION_MINOR	0
+
+#define AUTOFS_DEVID_LEN		16
+
+#define AUTOFS_DEV_IOCTL_SIZE		sizeof(struct autofs_dev_ioctl)
+
+/*
+ * An ioctl interface for autofs mount point control.
+ */
+
+/*
+ * All the ioctls use this structure.
+ * When sending a path, size must account for the total length
+ * of the chunk of memory; otherwise it is the size of the
+ * structure.
+ */
+
+struct autofs_dev_ioctl {
+	__u32 ver_major;
+	__u32 ver_minor;
+	__u32 size;		/* total size of data passed in
+				 * including this struct */
+	__s32 ioctlfd;		/* automount command fd */
+
+	__u32 arg1;		/* Command parameters */
+	__u32 arg2;
+
+	char path[0];
+};
+
+static inline void init_autofs_dev_ioctl(struct autofs_dev_ioctl *in)
+{
+	in->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR;
+	in->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR;
+	in->size = sizeof(struct autofs_dev_ioctl);
+	in->ioctlfd = -1;
+	in->arg1 = 0;
+	in->arg2 = 0;
+	return;
+}
+
+/*
+ * If you change this make sure you make the corresponding change
+ * to autofs-dev-ioctl.c:lookup_ioctl()
+ */
+enum {
+	/* Get various version info */
+	AUTOFS_DEV_IOCTL_VERSION_CMD = 0x71,
+	AUTOFS_DEV_IOCTL_PROTOVER_CMD,
+	AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD,
+
+	/* Open mount ioctl fd */
+	AUTOFS_DEV_IOCTL_OPENMOUNT_CMD,
+
+	/* Close mount ioctl fd */
+	AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD,
+
+	/* Mount/expire status returns */
+	AUTOFS_DEV_IOCTL_READY_CMD,
+	AUTOFS_DEV_IOCTL_FAIL_CMD,
+
+	/* Activate/deactivate autofs mount */
+	AUTOFS_DEV_IOCTL_SETPIPEFD_CMD,
+	AUTOFS_DEV_IOCTL_CATATONIC_CMD,
+
+	/* Expiry timeout */
+	AUTOFS_DEV_IOCTL_TIMEOUT_CMD,
+
+	/* Get mount last requesting uid and gid */
+	AUTOFS_DEV_IOCTL_REQUESTER_CMD,
+
+	/* Check for eligible expire candidates */
+	AUTOFS_DEV_IOCTL_EXPIRE_CMD,
+
+	/* Request busy status */
+	AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD,
+
+	/* Check if path is a mountpoint */
+	AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD,
+};
+
+#define AUTOFS_IOCTL 0x93
+
+#define AUTOFS_DEV_IOCTL_VERSION \
+	_IOWR(AUTOFS_IOCTL, \
+	      AUTOFS_DEV_IOCTL_VERSION_CMD, struct autofs_dev_ioctl)
+
+#define AUTOFS_DEV_IOCTL_PROTOVER \
+	_IOWR(AUTOFS_IOCTL, \
+	      AUTOFS_DEV_IOCTL_PROTOVER_CMD, struct autofs_dev_ioctl)
+
+#define AUTOFS_DEV_IOCTL_PROTOSUBVER \
+	_IOWR(AUTOFS_IOCTL, \
+	      AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD, struct autofs_dev_ioctl)
+
+#define AUTOFS_DEV_IOCTL_OPENMOUNT \
+	_IOWR(AUTOFS_IOCTL, \
+	      AUTOFS_DEV_IOCTL_OPENMOUNT_CMD, struct autofs_dev_ioctl)
+
+#define AUTOFS_DEV_IOCTL_CLOSEMOUNT \
+	_IOWR(AUTOFS_IOCTL, \
+	      AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD, struct autofs_dev_ioctl)
+
+#define AUTOFS_DEV_IOCTL_READY \
+	_IOWR(AUTOFS_IOCTL, \
+	      AUTOFS_DEV_IOCTL_READY_CMD, struct autofs_dev_ioctl)
+
+#define AUTOFS_DEV_IOCTL_FAIL \
+	_IOWR(AUTOFS_IOCTL, \
+	      AUTOFS_DEV_IOCTL_FAIL_CMD, struct autofs_dev_ioctl)
+
+#define AUTOFS_DEV_IOCTL_SETPIPEFD \
+	_IOWR(AUTOFS_IOCTL, \
+	      AUTOFS_DEV_IOCTL_SETPIPEFD_CMD, struct autofs_dev_ioctl)
+
+#define AUTOFS_DEV_IOCTL_CATATONIC \
+	_IOWR(AUTOFS_IOCTL, \
+	      AUTOFS_DEV_IOCTL_CATATONIC_CMD, struct autofs_dev_ioctl)
+
+#define AUTOFS_DEV_IOCTL_TIMEOUT \
+	_IOWR(AUTOFS_IOCTL, \
+	      AUTOFS_DEV_IOCTL_TIMEOUT_CMD, struct autofs_dev_ioctl)
+
+#define AUTOFS_DEV_IOCTL_REQUESTER \
+	_IOWR(AUTOFS_IOCTL, \
+	      AUTOFS_DEV_IOCTL_REQUESTER_CMD, struct autofs_dev_ioctl)
+
+#define AUTOFS_DEV_IOCTL_EXPIRE \
+	_IOWR(AUTOFS_IOCTL, \
+	      AUTOFS_DEV_IOCTL_EXPIRE_CMD, struct autofs_dev_ioctl)
+
+#define AUTOFS_DEV_IOCTL_ASKUMOUNT \
+	_IOWR(AUTOFS_IOCTL, \
+	      AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD, struct autofs_dev_ioctl)
+
+#define AUTOFS_DEV_IOCTL_ISMOUNTPOINT \
+	_IOWR(AUTOFS_IOCTL, \
+	      AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD, struct autofs_dev_ioctl)
+
+#endif	/* _LINUX_AUTO_DEV_IOCTL_H */
diff --git a/include/linux/auto_fs4.h b/include/linux/auto_fs4.h
index aa96a04ae02..2253716d4b9 100644
--- a/include/linux/auto_fs4.h
+++ b/include/linux/auto_fs4.h
@@ -23,7 +23,7 @@
 #define AUTOFS_MIN_PROTO_VERSION	3
 #define AUTOFS_MAX_PROTO_VERSION	5
 
-#define AUTOFS_PROTO_SUBVERSION		0
+#define AUTOFS_PROTO_SUBVERSION		1
 
 /* Mask for expire behaviour */
 #define AUTOFS_EXP_IMMEDIATE		1
-- 
cgit v1.2.3


From 3d599d1ca57f443e5c4ff5af1e69d90350082f77 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?=
 <ukleinek@informatik.uni-freiburg.de>
Date: Wed, 15 Oct 2008 22:03:12 -0700
Subject: gpio_free might sleep, generic part
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

According to the documentation, gpio_free should be called from task
context only.  To make this more explicit, add a might_sleep() to all
implementations.

This is the generic part which changes gpiolib and the fallback
implementation only.

Signed-off-by: Uwe Kleine-König <ukleinek@informatik.uni-freiburg.de>
Cc: David Brownell <david-b@pacbell.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gpio.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/gpio.h b/include/linux/gpio.h
index 730a20b8357..e10c49a5b96 100644
--- a/include/linux/gpio.h
+++ b/include/linux/gpio.h
@@ -8,6 +8,7 @@
 
 #else
 
+#include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/errno.h>
 
@@ -32,6 +33,8 @@ static inline int gpio_request(unsigned gpio, const char *label)
 
 static inline void gpio_free(unsigned gpio)
 {
+	might_sleep();
+
 	/* GPIO can never have been requested */
 	WARN_ON(1);
 }
-- 
cgit v1.2.3


From e3a1938805d2e81b27d3d348788644f3bad004f2 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong@us.ibm.com>
Date: Wed, 15 Oct 2008 22:03:52 -0700
Subject: matroxfb: support G200eV chip

Support the Matrox G200eV chip, based on timings that I found in the X.org
matrox driver.

Signed-off-by: Darrick J. Wong <djwong@us.ibm.com>
Acked-by: Krzysztof Helt <krzysztof.h1@wp.pl>
Cc: Petr Vandrovec <vandrove@vc.cvut.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pci_ids.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 1176f1f177e..8edddc240e4 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -587,6 +587,7 @@
 #define PCI_DEVICE_ID_MATROX_G200_PCI	0x0520
 #define PCI_DEVICE_ID_MATROX_G200_AGP	0x0521
 #define	PCI_DEVICE_ID_MATROX_G400	0x0525
+#define	PCI_DEVICE_ID_MATROX_G200EV_PCI	0x0530
 #define PCI_DEVICE_ID_MATROX_G550	0x2527
 #define PCI_DEVICE_ID_MATROX_VIA	0x4536
 
-- 
cgit v1.2.3


From b53cde3557b8f97e6a635782875d442551a89bf1 Mon Sep 17 00:00:00 2001
From: Dmitry Baryshkov <dbaryshkov@gmail.com>
Date: Wed, 15 Oct 2008 22:03:55 -0700
Subject: fbdev: add new TMIO framebuffer driver

Add driver for TMIO framebuffer cells as found e.g. in Toshiba TC6393XB
chips.

Signed-off-by: Dmitry Baryshkov <dbaryshkov@gmail.com>
Cc: Ian Molton <spyro@f2s.com>
Acked-by: Samuel Ortiz <sameo@openedhand.com>
Acked-by: Krzysztof Helt <krzysztof.h1@wp.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mfd/tmio.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h
index ec612e66391..516d955ab8a 100644
--- a/include/linux/mfd/tmio.h
+++ b/include/linux/mfd/tmio.h
@@ -1,6 +1,8 @@
 #ifndef MFD_TMIO_H
 #define MFD_TMIO_H
 
+#include <linux/fb.h>
+
 #define tmio_ioread8(addr) readb(addr)
 #define tmio_ioread16(addr) readw(addr)
 #define tmio_ioread16_rep(r, b, l) readsw(r, b, l)
@@ -25,4 +27,21 @@ struct tmio_nand_data {
 	unsigned int		num_partitions;
 };
 
+#define FBIO_TMIO_ACC_WRITE	0x7C639300
+#define FBIO_TMIO_ACC_SYNC	0x7C639301
+
+struct tmio_fb_data {
+	int			(*lcd_set_power)(struct platform_device *fb_dev,
+								bool on);
+	int			(*lcd_mode)(struct platform_device *fb_dev,
+					const struct fb_videomode *mode);
+	int			num_modes;
+	struct fb_videomode	*modes;
+
+	/* in mm: size of screen */
+	int			height;
+	int			width;
+};
+
+
 #endif
-- 
cgit v1.2.3


From b563cf59c4d67da7d671788a9848416bfa4180ab Mon Sep 17 00:00:00 2001
From: Rene Herman <rene.herman@keyaccess.nl>
Date: Wed, 15 Oct 2008 22:03:58 -0700
Subject: pnp: make the resource type an unsigned long

PnP encodes the resource type directly as its struct resource->flags value
which is an unsigned long.  Make it so...

Signed-off-by: Rene Herman <rene.herman@gmail.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Acked-by: Bjorn Helgaas <bjorn.helgaas@hp.com>
Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pnp.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pnp.h b/include/linux/pnp.h
index be764e514e3..53b70fd1d9a 100644
--- a/include/linux/pnp.h
+++ b/include/linux/pnp.h
@@ -22,9 +22,11 @@ struct pnp_dev;
  * Resource Management
  */
 #ifdef CONFIG_PNP
-struct resource *pnp_get_resource(struct pnp_dev *, unsigned int, unsigned int);
+struct resource *pnp_get_resource(struct pnp_dev *dev, unsigned long type,
+				unsigned int num);
 #else
-static inline struct resource *pnp_get_resource(struct pnp_dev *dev, unsigned int type, unsigned int num)
+static inline struct resource *pnp_get_resource(struct pnp_dev *dev,
+			unsigned long type, unsigned int num)
 {
 	return NULL;
 }
-- 
cgit v1.2.3


From 60836eb63b941f407dc2a609f3f0f34fd74ef6c3 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Wed, 15 Oct 2008 22:04:00 -0700
Subject: telephony: remove CVS keywords

Remove CVS keywords that weren't updated for a long time from comments.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/telephony.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/telephony.h b/include/linux/telephony.h
index 0d0cf2a1e7b..5b2b6261f19 100644
--- a/include/linux/telephony.h
+++ b/include/linux/telephony.h
@@ -28,10 +28,6 @@
  * ON AN "AS IS" BASIS, AND QUICKNET TECHNOLOGIES, INC. HAS NO OBLIGATION
  * TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
  *
- * Version:       $Revision: 4.2 $
- *
- * $Id: telephony.h,v 4.2 2001/08/06 07:09:43 craigs Exp $
- *
  *****************************************************************************/
 
 #ifndef TELEPHONY_H
-- 
cgit v1.2.3


From b73c29f6b0ddbcf07b43c5c5e6354e5839b5e68d Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Wed, 15 Oct 2008 22:04:13 -0700
Subject: quota: remove CVS keywords

Remove CVS keywords that weren't updated for a long time from comments.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/quota.h    | 2 --
 include/linux/quotaops.h | 3 ---
 2 files changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/quota.h b/include/linux/quota.h
index 376a05048bc..40401b55448 100644
--- a/include/linux/quota.h
+++ b/include/linux/quota.h
@@ -28,8 +28,6 @@
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
- *
- * Version: $Id: quota.h,v 2.0 1996/11/17 16:48:14 mvw Exp mvw $
  */
 
 #ifndef _LINUX_QUOTA_
diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
index ca6b9b5c8d5..a558a4c1d35 100644
--- a/include/linux/quotaops.h
+++ b/include/linux/quotaops.h
@@ -3,9 +3,6 @@
  * macros expand to the right source-code.
  *
  * Author:  Marco van Wieringen <mvw@planets.elm.net>
- *
- * Version: $Id: quotaops.h,v 1.2 1998/01/15 16:22:26 ecd Exp $
- *
  */
 #ifndef _LINUX_QUOTAOPS_
 #define _LINUX_QUOTAOPS_
-- 
cgit v1.2.3


From f221e726bf4e082a05dcd573379ac859bfba7126 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 15 Oct 2008 22:04:23 -0700
Subject: sysctl: simplify ->strategy

The name and nlen parameters passed to the ->strategy hook are unused, so
remove them.  In general a ->strategy hook should know what it's doing and
not do anything tricky for which, say, a pointer to the original userspace
array (name) may be needed.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net> [ networking bits ]
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Matt Mackall <mpm@selenic.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sysctl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index d0437f36921..39d471d1163 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -972,7 +972,7 @@ extern int sysctl_perm(struct ctl_table_root *root,
 
 typedef struct ctl_table ctl_table;
 
-typedef int ctl_handler (struct ctl_table *table, int __user *name, int nlen,
+typedef int ctl_handler (struct ctl_table *table,
 			 void __user *oldval, size_t __user *oldlenp,
 			 void __user *newval, size_t newlen);
 
-- 
cgit v1.2.3


From 25cbe53ef1cb828ae012f3955a5aa18117114439 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 15 Oct 2008 22:04:25 -0700
Subject: pid_ns: kill the now unused task_child_reaper()

task_child_reaper() has no callers anymore, kill it.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Acked-by: Pavel Emelyanov <xemul@openvz.org>
Acked-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pid_namespace.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 1af82c4e17d..d82fe825d62 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -84,12 +84,6 @@ static inline struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
 	return tsk->nsproxy->pid_ns;
 }
 
-static inline struct task_struct *task_child_reaper(struct task_struct *tsk)
-{
-	BUG_ON(tsk != current);
-	return tsk->nsproxy->pid_ns->child_reaper;
-}
-
 void pidhash_init(void);
 void pidmap_init(void);
 
-- 
cgit v1.2.3


From 612de10db06c0704a66bbe7fd13990cb1c2cb958 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Wed, 15 Oct 2008 22:04:33 -0700
Subject: parport: remove CVS keywords

Remove CVS keywords that weren't updated for a long time from comments.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/parport.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/parport.h b/include/linux/parport.h
index 6a0d7cdb577..e1f83c5065c 100644
--- a/include/linux/parport.h
+++ b/include/linux/parport.h
@@ -1,5 +1,3 @@
-/* $Id: parport.h,v 1.1 1998/05/17 10:57:52 andrea Exp andrea $ */
-
 /*
  * Any part of this program may be used in documents licensed under
  * the GNU Free Documentation License, Version 1.1 or any later version
-- 
cgit v1.2.3


From ebf3f09c634906d371f2bfd71b41c7e0c52efe7e Mon Sep 17 00:00:00 2001
From: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
Date: Wed, 15 Oct 2008 22:05:12 -0700
Subject: Configure out AIO support

This patch adds the CONFIG_AIO option, which allows removing support
for asynchronous I/O operations, which are not necessarily used by
applications, particularly on embedded devices. As this is a
size-reduction option, it depends on CONFIG_EMBEDDED. It saves
~7 kilobytes of kernel code/data:

   text	   data	    bss	    dec	    hex	filename
1115067	 119180	 217088	1451335	 162547	vmlinux
1108025	 119048	 217088	1444161	 160941	vmlinux.new
  -7042    -132       0   -7174   -1C06 +/-

This patch has been originally written by Matt Mackall
<mpm@selenic.com>, and is part of the Linux Tiny project.

[randy.dunlap@oracle.com: build fix]
Signed-off-by: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Cc: Zach Brown <zach.brown@oracle.com>
Signed-off-by: Matt Mackall <mpm@selenic.com>
Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/aio.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/aio.h b/include/linux/aio.h
index 09b276c3522..f6b8cf99b59 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -204,12 +204,21 @@ struct kioctx {
 /* prototypes */
 extern unsigned aio_max_size;
 
+#ifdef CONFIG_AIO
 extern ssize_t wait_on_sync_kiocb(struct kiocb *iocb);
 extern int aio_put_req(struct kiocb *iocb);
 extern void kick_iocb(struct kiocb *iocb);
 extern int aio_complete(struct kiocb *iocb, long res, long res2);
 struct mm_struct;
 extern void exit_aio(struct mm_struct *mm);
+#else
+static inline ssize_t wait_on_sync_kiocb(struct kiocb *iocb) { return 0; }
+static inline int aio_put_req(struct kiocb *iocb) { return 0; }
+static inline void kick_iocb(struct kiocb *iocb) { }
+static inline int aio_complete(struct kiocb *iocb, long res, long res2) { return 0; }
+struct mm_struct;
+static inline void exit_aio(struct mm_struct *mm) { }
+#endif /* CONFIG_AIO */
 
 #define io_wait_to_kiocb(wait) container_of(wait, struct kiocb, ki_wait)
 
-- 
cgit v1.2.3


From c9f66169f1c696f9489503d7de92daff135c1efd Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 15 Oct 2008 22:05:15 -0700
Subject: resource: add resource_type() and IORESOURCE_TYPE_BITS

Add resource_type() and IORESOURCE_TYPE_BITS.  They make it easier to add
more resource types without having to rewrite tons of code.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Cc: Ben Dooks <ben-linux@fluff.org>
Cc: Jean Delvare <khali@linux-fr.org>
Cc: Paul Mundt <lethal@linux-sh.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/ioport.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index ee9bcc6f32b..0dde77272d7 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -34,7 +34,8 @@ struct resource_list {
  */
 #define IORESOURCE_BITS		0x000000ff	/* Bus-specific bits */
 
-#define IORESOURCE_IO		0x00000100	/* Resource type */
+#define IORESOURCE_TYPE_BITS	0x00000f00	/* Resource type */
+#define IORESOURCE_IO		0x00000100
 #define IORESOURCE_MEM		0x00000200
 #define IORESOURCE_IRQ		0x00000400
 #define IORESOURCE_DMA		0x00000800
@@ -126,6 +127,10 @@ static inline resource_size_t resource_size(struct resource *res)
 {
 	return res->end - res->start + 1;
 }
+static inline unsigned long resource_type(struct resource *res)
+{
+	return res->flags & IORESOURCE_TYPE_BITS;
+}
 
 /* Convenience shorthand with allocation */
 #define request_region(start,n,name)	__request_region(&ioport_resource, (start), (n), (name))
-- 
cgit v1.2.3


From 3e624fc72fba09b6f999a9fbb87b64efccd38036 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 16 Oct 2008 20:00:24 -0400
Subject: ext4: Replace hackish ext4_mb_poll_new_transaction with commit
 callback

The multiblock allocator needs to be able to release blocks (and issue
a blkdev discard request) when the transaction which freed those
blocks is committed.  Previously this was done via a polling mechanism
when blocks are allocated or freed.  A much better way of doing things
is to create a jbd2 callback function and attach the list of blocks
to be freed directly to the transaction structure.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 include/linux/jbd2.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 463d6f10b64..c7d106ef22e 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -641,6 +641,11 @@ struct transaction_s
 	 */
 	int t_handle_count;
 
+	/*
+	 * For use by the filesystem to store fs-specific data
+	 * structures associated with the transaction
+	 */
+	struct list_head	t_private_list;
 };
 
 struct transaction_run_stats_s {
@@ -935,6 +940,10 @@ struct journal_s
 
 	pid_t			j_last_sync_writer;
 
+	/* This function is called when a transaction is closed */
+	void			(*j_commit_callback)(journal_t *,
+						     transaction_t *);
+
 	/*
 	 * Journal statistics
 	 */
-- 
cgit v1.2.3


From 54514a70adefe356afe854e2d3912d46668068e6 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 23 Sep 2008 22:15:57 -0700
Subject: softirq: Add support for triggering softirq work on softirqs.

This is basically a genericization of Jens Axboe's block layer
remote softirq changes.

Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 include/linux/interrupt.h | 21 +++++++++++++++++++++
 include/linux/smp.h       |  4 +++-
 2 files changed, 24 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 54b3623434e..35a61dc60d5 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -11,6 +11,8 @@
 #include <linux/hardirq.h>
 #include <linux/sched.h>
 #include <linux/irqflags.h>
+#include <linux/smp.h>
+#include <linux/percpu.h>
 #include <asm/atomic.h>
 #include <asm/ptrace.h>
 #include <asm/system.h>
@@ -273,6 +275,25 @@ extern void softirq_init(void);
 extern void raise_softirq_irqoff(unsigned int nr);
 extern void raise_softirq(unsigned int nr);
 
+/* This is the worklist that queues up per-cpu softirq work.
+ *
+ * send_remote_softirq() adds work to these lists, and
+ * the softirq handler itself dequeues from them.  The queues
+ * are protected by disabling local cpu interrupts and they must
+ * only be accessed by the local cpu that they are for.
+ */
+DECLARE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list);
+
+/* Try to send a softirq to a remote cpu.  If this cannot be done, the
+ * work will be queued to the local cpu.
+ */
+extern void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq);
+
+/* Like send_remote_softirq(), but the caller must disable local cpu interrupts
+ * and compute the current cpu, passed in as 'this_cpu'.
+ */
+extern void __send_remote_softirq(struct call_single_data *cp, int cpu,
+				  int this_cpu, int softirq);
 
 /* Tasklets --- multithreaded analogue of BHs.
 
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 66484d4a845..2e4d58b26c0 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -7,6 +7,7 @@
  */
 
 #include <linux/errno.h>
+#include <linux/types.h>
 #include <linux/list.h>
 #include <linux/cpumask.h>
 
@@ -16,7 +17,8 @@ struct call_single_data {
 	struct list_head list;
 	void (*func) (void *info);
 	void *info;
-	unsigned int flags;
+	u16 flags;
+	u16 priv;
 };
 
 #ifdef CONFIG_SMP
-- 
cgit v1.2.3


From 8677142710516d986d932d6f1fba7be8382c1fec Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Mon, 13 Oct 2008 14:19:05 +0200
Subject: block: fix nr_phys_segments miscalculation bug

This fixes the bug reported by Nikanth Karthikesan <knikanth@suse.de>:

http://lkml.org/lkml/2008/10/2/203

The root cause of the bug is that blk_phys_contig_segment
miscalculates q->max_segment_size.

blk_phys_contig_segment checks:

req->biotail->bi_size + next_req->bio->bi_size > q->max_segment_size

But blk_recalc_rq_segments might expect that req->biotail and the
previous bio in the req are supposed to be merged into one
segment. blk_recalc_rq_segments might also expect that next_req->bio
and the next bio in the next_req are supposed to be merged into one
segment. In such a case, we merge two requests that can't be merged
here. Later, blk_rq_map_sg gives more segments than it should.

We need to keep track of segment size in blk_recalc_rq_segments and
use it to see if two requests can be merged. This patch implements it
in a way similar to what we used to do for hw merging (virtual
merging).

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 include/linux/bio.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index ff5b4cf9e2d..dc3cec386a9 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -79,6 +79,13 @@ struct bio {
 
 	unsigned int		bi_size;	/* residual I/O count */
 
+	/*
+	 * To keep track of the max segment size, we account for the
+	 * sizes of the first and last mergeable segments in this bio.
+	 */
+	unsigned int		bi_seg_front_size;
+	unsigned int		bi_seg_back_size;
+
 	unsigned int		bi_max_vecs;	/* max bvl_vecs we can hold */
 
 	unsigned int		bi_comp_cpu;	/* completion CPU */
-- 
cgit v1.2.3


From 756f8243188e013bd067811cdc0cc60760abfdf9 Mon Sep 17 00:00:00 2001
From: Stefan Raspl <raspl@linux.vnet.ibm.com>
Date: Thu, 16 Oct 2008 08:23:21 +0200
Subject: blktrace: add support for driver data

This patch adds the new api call blk_add_driver_data() to blktrace.
It allows tracing of device driver-specific binary data.

Signed-off-by: Stefan Raspl <raspl@linux.vnet.ibm.com>
Signed-off-by: Martin Peschke <mp3@de.ibm.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 include/linux/blktrace_api.h | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 3a31eb50616..bdf505d33e7 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -24,6 +24,7 @@ enum blktrace_cat {
 	BLK_TC_AHEAD	= 1 << 11,	/* readahead */
 	BLK_TC_META	= 1 << 12,	/* metadata */
 	BLK_TC_DISCARD	= 1 << 13,	/* discard requests */
+	BLK_TC_DRV_DATA	= 1 << 14,	/* binary per-driver data */
 
 	BLK_TC_END	= 1 << 15,	/* only 16-bits, reminder */
 };
@@ -51,6 +52,7 @@ enum blktrace_act {
 	__BLK_TA_BOUNCE,		/* bio was bounced */
 	__BLK_TA_REMAP,			/* bio was remapped */
 	__BLK_TA_ABORT,			/* request aborted */
+	__BLK_TA_DRV_DATA,		/* driver-specific binary data */
 };
 
 /*
@@ -82,6 +84,7 @@ enum blktrace_notify {
 #define BLK_TA_BOUNCE		(__BLK_TA_BOUNCE)
 #define BLK_TA_REMAP		(__BLK_TA_REMAP | BLK_TC_ACT(BLK_TC_QUEUE))
 #define BLK_TA_ABORT		(__BLK_TA_ABORT | BLK_TC_ACT(BLK_TC_QUEUE))
+#define BLK_TA_DRV_DATA	(__BLK_TA_DRV_DATA | BLK_TC_ACT(BLK_TC_DRV_DATA))
 
 #define BLK_TN_PROCESS		(__BLK_TN_PROCESS | BLK_TC_ACT(BLK_TC_NOTIFY))
 #define BLK_TN_TIMESTAMP	(__BLK_TN_TIMESTAMP | BLK_TC_ACT(BLK_TC_NOTIFY))
@@ -317,6 +320,34 @@ static inline void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
 	__blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
 }
 
+/**
+ * blk_add_driver_data - Add binary message with driver-specific data
+ * @q:		queue the io is for
+ * @rq:		io request
+ * @data:	driver-specific data
+ * @len:	length of driver-specific data
+ *
+ * Description:
+ *     Some drivers might want to write driver-specific data per request.
+ *
+ **/
+static inline void blk_add_driver_data(struct request_queue *q,
+				       struct request *rq,
+				       void *data, size_t len)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	if (blk_pc_request(rq))
+		__blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA,
+				rq->errors, len, data);
+	else
+		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
+				0, BLK_TA_DRV_DATA, rq->errors, len, data);
+}
+
 extern int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 			   char __user *arg);
 extern int blk_trace_startstop(struct request_queue *q, int start);
@@ -330,6 +361,7 @@ extern int blk_trace_remove(struct request_queue *q);
 #define blk_add_trace_generic(q, rq, rw, what)	do { } while (0)
 #define blk_add_trace_pdu_int(q, what, bio, pdu)	do { } while (0)
 #define blk_add_trace_remap(q, bio, dev, f, t)	do {} while (0)
+#define blk_add_driver_data(q, rq, data, len)	do {} while (0)
 #define do_blk_trace_setup(q, name, dev, buts)	(-ENOTTY)
 #define blk_trace_setup(q, name, dev, arg)	(-ENOTTY)
 #define blk_trace_startstop(q, start)		(-ENOTTY)
-- 
cgit v1.2.3


From f73e2d13a16cc88c4faa4729967f92bfeec8a142 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 17 Oct 2008 14:03:08 +0200
Subject: block: remove __generic_unplug_device() from exports

The only out-of-core user is IDE, and that should be using
blk_start_queueing() instead.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 include/linux/blkdev.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index a92d9e4ea96..8eed8b15f99 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -856,7 +856,6 @@ extern void blk_ordered_complete_seq(struct request_queue *, unsigned, int);
 extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
 extern void blk_dump_rq_flags(struct request *, char *);
 extern void generic_unplug_device(struct request_queue *);
-extern void __generic_unplug_device(struct request_queue *);
 extern long nr_blockdev_pages(void);
 
 int blk_get_queue(struct request_queue *);
-- 
cgit v1.2.3


From fe11edfaabf1787c05d782a7b33e6497d1118b1d Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Fri, 17 Oct 2008 18:09:11 +0200
Subject: ide: IDE_AFLAG_MEDIA_CHANGED -> IDE_DFLAG_MEDIA_CHANGED

There should be no functional changes caused by this patch.

Acked-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 include/linux/ide.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ide.h b/include/linux/ide.h
index c47e371554c..155a57f55c6 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -464,7 +464,6 @@ struct ide_acpi_hwif_link;
 /* ATAPI device flags */
 enum {
 	IDE_AFLAG_DRQ_INTERRUPT		= (1 << 0),
-	IDE_AFLAG_MEDIA_CHANGED		= (1 << 1),
 	/* Drive cannot lock the door. */
 	IDE_AFLAG_NO_DOORLOCK		= (1 << 2),
 
@@ -578,7 +577,8 @@ enum {
 	/* don't unload heads */
 	IDE_DFLAG_NO_UNLOAD		= (1 << 27),
 	/* heads unloaded, please don't reset port */
-	IDE_DFLAG_PARKED		= (1 << 28)
+	IDE_DFLAG_PARKED		= (1 << 28),
+	IDE_DFLAG_MEDIA_CHANGED		= (1 << 29),
 };
 
 struct ide_drive_s {
-- 
cgit v1.2.3


From da167876bd0f71f1c646e5dd98997544d8d90e8e Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Fri, 17 Oct 2008 18:09:11 +0200
Subject: ide: IDE_AFLAG_WP -> IDE_DFLAG_WP

There should be no functional changes caused by this patch.

Acked-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 include/linux/ide.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ide.h b/include/linux/ide.h
index 155a57f55c6..bd0a4d36b6d 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -503,8 +503,6 @@ enum {
 	IDE_AFLAG_CLIK_DRIVE		= (1 << 19),
 	/* Requires BH algorithm for packets */
 	IDE_AFLAG_ZIP_DRIVE		= (1 << 20),
-	/* Write protect */
-	IDE_AFLAG_WP			= (1 << 21),
 	/* Supports format progress report */
 	IDE_AFLAG_SRFP			= (1 << 22),
 
@@ -579,6 +577,8 @@ enum {
 	/* heads unloaded, please don't reset port */
 	IDE_DFLAG_PARKED		= (1 << 28),
 	IDE_DFLAG_MEDIA_CHANGED		= (1 << 29),
+	/* write protect */
+	IDE_DFLAG_WP			= (1 << 30),
 };
 
 struct ide_drive_s {
-- 
cgit v1.2.3


From e01286282eef85e4783b06fb2e0ed84fc111eb32 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Fri, 17 Oct 2008 18:09:11 +0200
Subject: ide: IDE_AFLAG_FORMAT_IN_PROGRESS -> IDE_DFLAG_FORMAT_IN_PROGRESS

There should be no functional changes caused by this patch.

Acked-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 include/linux/ide.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ide.h b/include/linux/ide.h
index bd0a4d36b6d..d111c3ebbba 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -497,8 +497,6 @@ enum {
 	IDE_AFLAG_LE_SPEED_FIELDS	= (1 << 17),
 
 	/* ide-floppy */
-	/* Format in progress */
-	IDE_AFLAG_FORMAT_IN_PROGRESS	= (1 << 18),
 	/* Avoid commands not supported in Clik drive */
 	IDE_AFLAG_CLIK_DRIVE		= (1 << 19),
 	/* Requires BH algorithm for packets */
@@ -579,6 +577,7 @@ enum {
 	IDE_DFLAG_MEDIA_CHANGED		= (1 << 29),
 	/* write protect */
 	IDE_DFLAG_WP			= (1 << 30),
+	IDE_DFLAG_FORMAT_IN_PROGRESS	= (1 << 31),
 };
 
 struct ide_drive_s {
-- 
cgit v1.2.3


From 42619d35c7af2f88cad56425fe3981f1f65ff0bd Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Fri, 17 Oct 2008 18:09:11 +0200
Subject: ide: remove IDE_AFLAG_NO_DOORLOCKING

Just use IDE_DFLAG_DOORLOCKING instead.

There should be no functional changes caused by this patch.

Acked-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 include/linux/ide.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ide.h b/include/linux/ide.h
index d111c3ebbba..ba51a93fa54 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -464,8 +464,6 @@ struct ide_acpi_hwif_link;
 /* ATAPI device flags */
 enum {
 	IDE_AFLAG_DRQ_INTERRUPT		= (1 << 0),
-	/* Drive cannot lock the door. */
-	IDE_AFLAG_NO_DOORLOCK		= (1 << 2),
 
 	/* ide-cd */
 	/* Drive cannot eject the disc. */
-- 
cgit v1.2.3


From 79cb380397c834a35952d8497651d93b543ef968 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Fri, 17 Oct 2008 18:09:13 +0200
Subject: ide: allow device drivers to specify per-device type /proc settings

Turn ide_driver_t's 'proc' field into ->proc_entries method
(and also 'settings' field into ->proc_devsets method).  Then
update all device drivers accordingly.

There should be no functional changes caused by this patch.

Acked-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 include/linux/ide.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ide.h b/include/linux/ide.h
index ba51a93fa54..488808891ac 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -1120,8 +1120,8 @@ struct ide_driver_s {
 	void		(*resume)(ide_drive_t *);
 	void		(*shutdown)(ide_drive_t *);
 #ifdef CONFIG_IDE_PROC_FS
-	ide_proc_entry_t		*proc;
-	const struct ide_proc_devset	*settings;
+	ide_proc_entry_t *		(*proc_entries)(ide_drive_t *);
+	const struct ide_proc_devset *	(*proc_devsets)(ide_drive_t *);
 #endif
 };
 
-- 
cgit v1.2.3


From 806f80a6fc203ad0bde84e5a9e94572617d2ae45 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Fri, 17 Oct 2008 18:09:14 +0200
Subject: ide: add generic ATA/ATAPI disk driver

* Add struct ide_disk_ops containing protocol specific methods.

* Add 'struct ide_disk_ops *' to ide_drive_t.

* Convert ide-{disk,floppy} drivers to use struct ide_disk_ops.

* Merge ide-{disk,floppy} drivers into generic ide-gd driver.

While at it:
- ide_disk_init_capacity() -> ide_disk_get_capacity()

Acked-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 include/linux/ide.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ide.h b/include/linux/ide.h
index 488808891ac..89e53cfbc78 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -461,6 +461,23 @@ struct ide_acpi_drive_link;
 struct ide_acpi_hwif_link;
 #endif
 
+struct ide_drive_s;
+
+struct ide_disk_ops {
+	int		(*check)(struct ide_drive_s *, const char *);
+	int		(*get_capacity)(struct ide_drive_s *);
+	void		(*setup)(struct ide_drive_s *);
+	void		(*flush)(struct ide_drive_s *);
+	int		(*init_media)(struct ide_drive_s *, struct gendisk *);
+	int		(*set_doorlock)(struct ide_drive_s *, struct gendisk *,
+					int);
+	ide_startstop_t	(*do_request)(struct ide_drive_s *, struct request *,
+				      sector_t);
+	int		(*end_request)(struct ide_drive_s *, int, int);
+	int		(*ioctl)(struct ide_drive_s *, struct inode *,
+				 struct file *, unsigned int, unsigned long);
+};
+
 /* ATAPI device flags */
 enum {
 	IDE_AFLAG_DRQ_INTERRUPT		= (1 << 0),
@@ -594,6 +611,8 @@ struct ide_drive_s {
 #endif
 	struct hwif_s		*hwif;	/* actually (ide_hwif_t *) */
 
+	const struct ide_disk_ops *disk_ops;
+
 	unsigned long dev_flags;
 
 	unsigned long sleep;		/* sleep until this time */
-- 
cgit v1.2.3


From 719254faa17ffedc87ba0fadb9b34e535c9758d5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 17 Oct 2008 09:59:47 +0200
Subject: NOHZ: unify the nohz function calls in irq_enter()

We have two separate nohz function calls in irq_enter() for no good
reason. Just call a single NOHZ function from irq_enter() and call
the bits in the tick code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/tick.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tick.h b/include/linux/tick.h
index 98921a3e1aa..b6ec8189ac0 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -96,9 +96,11 @@ extern cpumask_t *tick_get_broadcast_oneshot_mask(void);
 extern void tick_clock_notify(void);
 extern int tick_check_oneshot_change(int allow_nohz);
 extern struct tick_sched *tick_get_tick_sched(int cpu);
+extern void tick_check_idle(int cpu);
 # else
 static inline void tick_clock_notify(void) { }
 static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }
+static inline void tick_check_idle(int cpu) { }
 # endif
 
 #else /* CONFIG_GENERIC_CLOCKEVENTS */
@@ -106,26 +108,23 @@ static inline void tick_init(void) { }
 static inline void tick_cancel_sched_timer(int cpu) { }
 static inline void tick_clock_notify(void) { }
 static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }
+static inline void tick_check_idle(int cpu) { }
 #endif /* !CONFIG_GENERIC_CLOCKEVENTS */
 
 # ifdef CONFIG_NO_HZ
 extern void tick_nohz_stop_sched_tick(int inidle);
 extern void tick_nohz_restart_sched_tick(void);
-extern void tick_nohz_update_jiffies(void);
 extern ktime_t tick_nohz_get_sleep_length(void);
-extern void tick_nohz_stop_idle(int cpu);
 extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
 # else
 static inline void tick_nohz_stop_sched_tick(int inidle) { }
 static inline void tick_nohz_restart_sched_tick(void) { }
-static inline void tick_nohz_update_jiffies(void) { }
 static inline ktime_t tick_nohz_get_sleep_length(void)
 {
 	ktime_t len = { .tv64 = NSEC_PER_SEC/HZ };
 
 	return len;
 }
-static inline void tick_nohz_stop_idle(int cpu) { }
 static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; }
 # endif /* !NO_HZ */
 
-- 
cgit v1.2.3


From 504e518953a330c8d44a95bdd65a5c9f50f1012e Mon Sep 17 00:00:00 2001
From: Neil Brown <neilb@suse.de>
Date: Thu, 16 Oct 2008 14:15:16 +1100
Subject: Make nfs_file_cred more robust.

As not all files have an associated open_context (e.g. device special
files), it is safest to test for the existence of the open context
before de-referencing it.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/nfs_fs.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index ac8d0233b05..4eaa8347a0d 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -367,8 +367,12 @@ static inline struct nfs_open_context *nfs_file_open_context(struct file *filp)
 
 static inline struct rpc_cred *nfs_file_cred(struct file *file)
 {
-	if (file != NULL)
-		return nfs_file_open_context(file)->cred;
+	if (file != NULL) {
+		struct nfs_open_context *ctx =
+			nfs_file_open_context(file);
+		if (ctx)
+			return ctx->cred;
+	}
 	return NULL;
 }
 
-- 
cgit v1.2.3


From 97854829b97093ae172144a2597fc49ea203dcf3 Mon Sep 17 00:00:00 2001
From: Manu Abraham <abraham.manu@gmail.com>
Date: Tue, 14 Oct 2008 19:48:07 -0300
Subject: V4L/DVB (9195): Frontend API Fix: 32APSK is a valid modulation for
 the DVB-S2 delivery

Signed-off-by: Manu Abraham <manu@linuxtv.org>
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 include/linux/dvb/frontend.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/dvb/frontend.h b/include/linux/dvb/frontend.h
index 6e4ace27027..38942f731b9 100644
--- a/include/linux/dvb/frontend.h
+++ b/include/linux/dvb/frontend.h
@@ -166,6 +166,7 @@ typedef enum fe_modulation {
 	VSB_16,
 	PSK_8,
 	APSK_16,
+	APSK_32,
 	DQPSK,
 } fe_modulation_t;
 
-- 
cgit v1.2.3


From 5ba4ecc8b0166de4363cc31aa68d52abe0dff8de Mon Sep 17 00:00:00 2001
From: Manu Abraham <abraham.manu@gmail.com>
Date: Tue, 14 Oct 2008 19:50:03 -0300
Subject: V4L/DVB (9196): Add support for DSS delivery

Signed-off-by: Manu Abraham <manu@linuxtv.org>
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 include/linux/dvb/frontend.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/dvb/frontend.h b/include/linux/dvb/frontend.h
index 38942f731b9..79a8ed8e6a7 100644
--- a/include/linux/dvb/frontend.h
+++ b/include/linux/dvb/frontend.h
@@ -296,6 +296,7 @@ typedef enum fe_delivery_system {
 	SYS_DVBC_ANNEX_AC,
 	SYS_DVBC_ANNEX_B,
 	SYS_DVBT,
+	SYS_DSS,
 	SYS_DVBS,
 	SYS_DVBS2,
 	SYS_DVBH,
-- 
cgit v1.2.3


From 2a1d245b70f3f966f96767aaea1a2db6823e2f6e Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Wed, 15 Oct 2008 14:47:36 -0300
Subject: V4L/DVB (9240): saa7127: Fix two typos

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 include/linux/i2c-id.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/i2c-id.h b/include/linux/i2c-id.h
index 493435bcdbe..01d67ba9e98 100644
--- a/include/linux/i2c-id.h
+++ b/include/linux/i2c-id.h
@@ -60,7 +60,7 @@
 #define I2C_DRIVERID_WM8775	69	/* wm8775 audio processor	*/
 #define I2C_DRIVERID_CS53L32A	70	/* cs53l32a audio processor	*/
 #define I2C_DRIVERID_CX25840	71	/* cx2584x video encoder	*/
-#define I2C_DRIVERID_SAA7127	72	/* saa7124 video encoder	*/
+#define I2C_DRIVERID_SAA7127	72	/* saa7127 video encoder	*/
 #define I2C_DRIVERID_SAA711X	73	/* saa711x video encoders	*/
 #define I2C_DRIVERID_AKITAIOEXP	74	/* IO Expander on Sharp SL-C1000 */
 #define I2C_DRIVERID_INFRARED	75	/* I2C InfraRed on Video boards */
-- 
cgit v1.2.3


From 5b775f672cc993ba9dba5626811ab1f2ac42883b Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Tue, 26 Aug 2008 16:22:06 -0700
Subject: USB: add USB test and measurement class driver

This driver was originally written by Stefan Kopp, but massively
reworked by Greg for submission.

Thanks to Felipe Balbi <me@felipebalbi.com> for lots of work in cleaning
up this driver.

Thanks to Oliver Neukum <oliver@neukum.org> for reviewing previous
versions and pointing out problems.


Cc: Stefan Kopp <stefan_kopp@agilent.com>
Cc: Marcel Janssen <korgull@home.nl>
Cc: Felipe Balbi <me@felipebalbi.com>
Cc: Oliver Neukum <oliver@neukum.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/usb/Kbuild |  2 +-
 include/linux/usb/tmc.h  | 43 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/usb/tmc.h

(limited to 'include/linux')

diff --git a/include/linux/usb/Kbuild b/include/linux/usb/Kbuild
index 42e84fc315e..29fd73b0bff 100644
--- a/include/linux/usb/Kbuild
+++ b/include/linux/usb/Kbuild
@@ -4,4 +4,4 @@ header-y += ch9.h
 header-y += gadgetfs.h
 header-y += midi.h
 header-y += g_printer.h
-
+header-y += tmc.h
diff --git a/include/linux/usb/tmc.h b/include/linux/usb/tmc.h
new file mode 100644
index 00000000000..c045ae12556
--- /dev/null
+++ b/include/linux/usb/tmc.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (C) 2007 Stefan Kopp, Gechingen, Germany
+ * Copyright (C) 2008 Novell, Inc.
+ * Copyright (C) 2008 Greg Kroah-Hartman <gregkh@suse.de>
+ *
+ * This file holds USB constants defined by the USB Device Class
+ * Definition for Test and Measurement devices published by the USB-IF.
+ *
+ * It also has the ioctl definitions for the usbtmc kernel driver that
+ * userspace needs to know about.
+ */
+
+#ifndef __LINUX_USB_TMC_H
+#define __LINUX_USB_TMC_H
+
+/* USB TMC status values */
+#define USBTMC_STATUS_SUCCESS				0x01
+#define USBTMC_STATUS_PENDING				0x02
+#define USBTMC_STATUS_FAILED				0x80
+#define USBTMC_STATUS_TRANSFER_NOT_IN_PROGRESS		0x81
+#define USBTMC_STATUS_SPLIT_NOT_IN_PROGRESS		0x82
+#define USBTMC_STATUS_SPLIT_IN_PROGRESS			0x83
+
+/* USB TMC requests values */
+#define USBTMC_REQUEST_INITIATE_ABORT_BULK_OUT		1
+#define USBTMC_REQUEST_CHECK_ABORT_BULK_OUT_STATUS	2
+#define USBTMC_REQUEST_INITIATE_ABORT_BULK_IN		3
+#define USBTMC_REQUEST_CHECK_ABORT_BULK_IN_STATUS	4
+#define USBTMC_REQUEST_INITIATE_CLEAR			5
+#define USBTMC_REQUEST_CHECK_CLEAR_STATUS		6
+#define USBTMC_REQUEST_GET_CAPABILITIES			7
+#define USBTMC_REQUEST_INDICATOR_PULSE			64
+
+/* Request values for USBTMC driver's ioctl entry point */
+#define USBTMC_IOC_NR			91
+#define USBTMC_IOCTL_INDICATOR_PULSE	_IO(USBTMC_IOC_NR, 1)
+#define USBTMC_IOCTL_CLEAR		_IO(USBTMC_IOC_NR, 2)
+#define USBTMC_IOCTL_ABORT_BULK_OUT	_IO(USBTMC_IOC_NR, 3)
+#define USBTMC_IOCTL_ABORT_BULK_IN	_IO(USBTMC_IOC_NR, 4)
+#define USBTMC_IOCTL_CLEAR_OUT_HALT	_IO(USBTMC_IOC_NR, 6)
+#define USBTMC_IOCTL_CLEAR_IN_HALT	_IO(USBTMC_IOC_NR, 7)
+
+#endif
-- 
cgit v1.2.3


From 55b447bf79ad25591437d24b78caa9d0ae4fec82 Mon Sep 17 00:00:00 2001
From: Oliver Neukum <oliver@neukum.org>
Date: Tue, 29 Jul 2008 15:26:15 +0200
Subject: USB: kill URBs permanently

Looking at usb_kill_urb(), it seems to me that it is unnecessarily lenient.
In the use case of disconnect() you never want to use the URB again
(for the same device). But leaving urb->reject elevated will make it easier
to avoid races between read/write and disconnect.

Signed-off-by: Oliver Neukum <oneukum@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/usb.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/usb.h b/include/linux/usb.h
index 94ac74aba6b..3371c91e7ff 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -1459,6 +1459,8 @@ extern struct urb *usb_get_urb(struct urb *urb);
 extern int usb_submit_urb(struct urb *urb, gfp_t mem_flags);
 extern int usb_unlink_urb(struct urb *urb);
 extern void usb_kill_urb(struct urb *urb);
+extern void usb_poison_urb(struct urb *urb);
+extern void usb_unpoison_urb(struct urb *urb);
 extern void usb_kill_anchored_urbs(struct usb_anchor *anchor);
 extern void usb_unlink_anchored_urbs(struct usb_anchor *anchor);
 extern void usb_anchor_urb(struct urb *urb, struct usb_anchor *anchor);
-- 
cgit v1.2.3


From 6a2839bedc1502b3f0366cc3ad1099a1d92cf8fb Mon Sep 17 00:00:00 2001
From: Oliver Neukum <oliver@neukum.org>
Date: Tue, 29 Jul 2008 16:18:47 +0200
Subject: USB: extend poisoning to anchors

This extends the poisoning concept to anchors. This way poisoning
will work with fire-and-forget drivers.

Signed-off-by: Oliver Neukum <oneukum@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/usb.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/usb.h b/include/linux/usb.h
index 3371c91e7ff..d97927970f5 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -1135,6 +1135,7 @@ struct usb_anchor {
 	struct list_head urb_list;
 	wait_queue_head_t wait;
 	spinlock_t lock;
+	unsigned int poisoned:1;
 };
 
 static inline void init_usb_anchor(struct usb_anchor *anchor)
@@ -1462,6 +1463,7 @@ extern void usb_kill_urb(struct urb *urb);
 extern void usb_poison_urb(struct urb *urb);
 extern void usb_unpoison_urb(struct urb *urb);
 extern void usb_kill_anchored_urbs(struct usb_anchor *anchor);
+extern void usb_poison_anchored_urbs(struct usb_anchor *anchor);
 extern void usb_unlink_anchored_urbs(struct usb_anchor *anchor);
 extern void usb_anchor_urb(struct urb *urb, struct usb_anchor *anchor);
 extern void usb_unanchor_urb(struct urb *urb);
-- 
cgit v1.2.3


From 60beed95e38793c0baff7f94433c1f639d8d5efd Mon Sep 17 00:00:00 2001
From: David Brownell <dbrownell@users.sourceforge.net>
Date: Mon, 18 Aug 2008 17:38:22 -0700
Subject: usb gadget: function activation/deactivation

Add a new mechanism to the composite gadget framework, letting
functions deactivate (and reactivate) themselves.  Think of it
as a refcounted wrapper for the software pullup control.

A key example of why to use this mechanism involves functions that
require a userspace daemon.  Those functions should use this new
mechanism to prevent the gadget from enumerating until those daemons
are activated.  Without this mechanism, hosts would see devices that
malfunction until the relevant daemons start.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/usb/composite.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/usb/composite.h b/include/linux/usb/composite.h
index c932390c6da..935c380ffe4 100644
--- a/include/linux/usb/composite.h
+++ b/include/linux/usb/composite.h
@@ -130,6 +130,9 @@ struct usb_function {
 
 int usb_add_function(struct usb_configuration *, struct usb_function *);
 
+int usb_function_deactivate(struct usb_function *);
+int usb_function_activate(struct usb_function *);
+
 int usb_interface_id(struct usb_configuration *, struct usb_function *);
 
 /**
@@ -316,9 +319,13 @@ struct usb_composite_dev {
 	struct usb_composite_driver	*driver;
 	u8				next_string_id;
 
-	spinlock_t			lock;
+	/* the gadget driver won't enable the data pullup
+	 * while the deactivation count is nonzero.
+	 */
+	unsigned			deactivations;
 
-	/* REVISIT use and existence of lock ... */
+	/* protects at least deactivation count */
+	spinlock_t			lock;
 };
 
 extern int usb_string_id(struct usb_composite_dev *c);
-- 
cgit v1.2.3


From 3086775a4916b0fe128d924d83f4e7d7c39e4d0e Mon Sep 17 00:00:00 2001
From: Felipe Balbi <felipe.balbi@nokia.com>
Date: Mon, 18 Aug 2008 17:39:30 -0700
Subject: usb gadget: cdc obex glue

The following patch introduces a new f_obex.c function driver.
It allows userspace obex servers to use usb as transport layer
for their messages.

[ dbrownell@users.sourceforge.net: various fixes and cleanups ]

Signed-off-by: Felipe Balbi <felipe.balbi@nokia.com>
Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/usb/cdc.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/usb/cdc.h b/include/linux/usb/cdc.h
index ca228bb9421..18a729343ff 100644
--- a/include/linux/usb/cdc.h
+++ b/include/linux/usb/cdc.h
@@ -160,6 +160,15 @@ struct usb_cdc_mdlm_detail_desc {
 	__u8	bDetailData[0];
 } __attribute__ ((packed));
 
+/* "OBEX Control Model Functional Descriptor" */
+struct usb_cdc_obex_desc {
+	__u8	bLength;
+	__u8	bDescriptorType;
+	__u8	bDescriptorSubType;
+
+	__le16	bcdVersion;
+} __attribute__ ((packed));
+
 /*-------------------------------------------------------------------------*/
 
 /*
-- 
cgit v1.2.3


From 0b14c3881d4b91272b779f4037e263d392de058f Mon Sep 17 00:00:00 2001
From: Geoff Levand <geoff@infradead.org>
Date: Sat, 20 Sep 2008 14:41:47 -0700
Subject: USB: Fix spelling in usb/serial.h

Fixes a minor typo in the comments for usb_set_serial_data.

Signed-off-by: Geoff Levand <geoff@infradead.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/usb/serial.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h
index 655341d0f53..0b8617a9176 100644
--- a/include/linux/usb/serial.h
+++ b/include/linux/usb/serial.h
@@ -192,7 +192,7 @@ static inline void usb_set_serial_data(struct usb_serial *serial, void *data)
  * The driver.owner field should be set to the module owner of this driver.
  * The driver.name field should be set to the name of this driver (remember
  * it will show up in sysfs, so it needs to be short and to the point.
- * Useing the module name is a good idea.)
+ * Using the module name is a good idea.)
  */
 struct usb_serial_driver {
 	const char *description;
-- 
cgit v1.2.3


From cbc30118d7a376dab4113f299c0c8f035737a5c3 Mon Sep 17 00:00:00 2001
From: Stephen Ware <stephen.ware@eqware.net>
Date: Tue, 30 Sep 2008 11:39:38 -0700
Subject: usb: vstusb.c : new driver for spectrometers used by Vernier Software
 & Technology, Inc.

This patch adds the vstusb driver to the drivers/usb/misc directory.
This driver provides support for Vernier Software & Technology
spectrometers, all made by Ocean Optics. The driver provides both IOCTL
and read()/write() methods for sending raw data to spectrometers across
the bulk channel. Each method allows for a configured timeout.

From: Stephen Ware <stephen.ware@eqware.net>
Signed-off-by: Dennis O'Brien <dennis.obrien@eqware.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/usb/Kbuild   |  1 +
 include/linux/usb/vstusb.h | 71 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 72 insertions(+)
 create mode 100644 include/linux/usb/vstusb.h

(limited to 'include/linux')

diff --git a/include/linux/usb/Kbuild b/include/linux/usb/Kbuild
index 29fd73b0bff..54c446309a2 100644
--- a/include/linux/usb/Kbuild
+++ b/include/linux/usb/Kbuild
@@ -5,3 +5,4 @@ header-y += gadgetfs.h
 header-y += midi.h
 header-y += g_printer.h
 header-y += tmc.h
+header-y += vstusb.h
diff --git a/include/linux/usb/vstusb.h b/include/linux/usb/vstusb.h
new file mode 100644
index 00000000000..1cfac67191f
--- /dev/null
+++ b/include/linux/usb/vstusb.h
@@ -0,0 +1,71 @@
+/*****************************************************************************
+ *  File: include/linux/usb/vstusb.h
+ *
+ *  Purpose: Support for the bulk USB Vernier Spectrophotometers
+ *
+ *  Author:     EQware Engineering, Inc.
+ *              Oregon City, OR, USA 97045
+ *
+ *  Copyright:  2007, 2008
+ *              Vernier Software & Technology
+ *              Beaverton, OR, USA 97005
+ *
+ *  Web:        www.vernier.com
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ *
+ *****************************************************************************/
+/*****************************************************************************
+ *
+ *  The vstusb module is a standard usb 'client' driver running on top of the
+ *  standard usb host controller stack.
+ *
+ *  In general, vstusb supports standard bulk usb pipes.  It supports multiple
+ *  devices and multiple pipes per device.
+ *
+ *  The vstusb driver supports two interfaces:
+ *  1 - ioctl SEND_PIPE/RECV_PIPE - a general bulk write/read msg
+ *  	interface to any pipe with timeout support;
+ *  2 - standard read/write with ioctl config - offers standard read/write
+ *  	interface with ioctl configured pipes and timeouts.
+ *
+ *  Both interfaces can be interrupted by a signal from another process, which
+ *  aborts the i/o operation in progress.
+ *
+ *  A timeout of 0 means NO timeout.  The user can still terminate the read via
+ *  signal.
+ *
+ *  If using multiple threads with this driver, the user should ensure that
+ *  any reads, writes, or ioctls are complete before closing the device.
+ *  Changing read/write timeouts or pipes takes effect on next read/write.
+ *
+ *****************************************************************************/
+
+struct vstusb_args {
+	union {
+		/* this struct is used for IOCTL_VSTUSB_SEND_PIPE,	*
+		 * IOCTL_VSTUSB_RECV_PIPE, and read()/write() fops	*/
+		struct {
+			void __user	*buffer;
+			size_t          count;
+			unsigned int    timeout_ms;
+			int             pipe;
+		};
+
+		/* this one is used for IOCTL_VSTUSB_CONFIG_RW  	*/
+		struct {
+			int rd_pipe;
+			int rd_timeout_ms;
+			int wr_pipe;
+			int wr_timeout_ms;
+		};
+	};
+};
+
+#define VST_IOC_MAGIC 'L'
+#define VST_IOC_FIRST 0x20
+#define IOCTL_VSTUSB_SEND_PIPE	_IO(VST_IOC_MAGIC, VST_IOC_FIRST)
+#define IOCTL_VSTUSB_RECV_PIPE	_IO(VST_IOC_MAGIC, VST_IOC_FIRST + 1)
+#define IOCTL_VSTUSB_CONFIG_RW	_IO(VST_IOC_MAGIC, VST_IOC_FIRST + 2)
-- 
cgit v1.2.3


From 1987625226a918cd20c334ffce5e2a224cba0718 Mon Sep 17 00:00:00 2001
From: Oliver Neukum <oliver@neukum.org>
Date: Mon, 25 Aug 2008 22:40:25 +0200
Subject: USB: anchor API changes needed for btusb

This extends the anchor API as btusb needs for autosuspend.


Signed-off-by: Oliver Neukum <oneukum@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/usb.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/usb.h b/include/linux/usb.h
index d97927970f5..8fa973bede5 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -1469,6 +1469,9 @@ extern void usb_anchor_urb(struct urb *urb, struct usb_anchor *anchor);
 extern void usb_unanchor_urb(struct urb *urb);
 extern int usb_wait_anchor_empty_timeout(struct usb_anchor *anchor,
 					 unsigned int timeout);
+extern struct urb *usb_get_from_anchor(struct usb_anchor *anchor);
+extern void usb_scuttle_anchored_urbs(struct usb_anchor *anchor);
+extern int usb_anchor_empty(struct usb_anchor *anchor);
 
 /**
  * usb_urb_dir_in - check if an URB describes an IN transfer
-- 
cgit v1.2.3


From aaf7ea20000436df3cbb397ccb734ad1e2e5164d Mon Sep 17 00:00:00 2001
From: Mike Rapoport <mike@compulab.co.il>
Date: Wed, 15 Oct 2008 08:38:49 +0200
Subject: [MTD] [NAND] GPIO NAND flash driver

The patch adds support for NAND flashes connected to GPIOs.

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
Signed-off-by: Mike Rapoport <mike@compulab.co.il>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 include/linux/mtd/nand-gpio.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100644 include/linux/mtd/nand-gpio.h

(limited to 'include/linux')

diff --git a/include/linux/mtd/nand-gpio.h b/include/linux/mtd/nand-gpio.h
new file mode 100644
index 00000000000..51534e50f7f
--- /dev/null
+++ b/include/linux/mtd/nand-gpio.h
@@ -0,0 +1,19 @@
+#ifndef __LINUX_MTD_NAND_GPIO_H
+#define __LINUX_MTD_NAND_GPIO_H
+
+#include <linux/mtd/nand.h>
+
+struct gpio_nand_platdata {
+	int	gpio_nce;
+	int	gpio_nwp;
+	int	gpio_cle;
+	int	gpio_ale;
+	int	gpio_rdy;
+	void	(*adjust_parts)(struct gpio_nand_platdata *, size_t);
+	struct mtd_partition *parts;
+	unsigned int num_parts;
+	unsigned int options;
+	int	chip_delay;
+};
+
+#endif
-- 
cgit v1.2.3


From dd3a1db900f2a215a7d7dd71b836e149a6cf5fed Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 16 Oct 2008 18:20:58 +0200
Subject: genirq: improve include files

Move the irq_desc related iterators out of irq.h, into irqnr.h, also
available via interrupt.h.

This way non-genirq (and even non-hardirq) architectures get the
common definitions and iterators.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/interrupt.h |  1 +
 include/linux/irq.h       | 20 +-------------------
 include/linux/irqnr.h     | 24 ++++++++++++++++++++++++
 3 files changed, 26 insertions(+), 19 deletions(-)
 create mode 100644 include/linux/irqnr.h

(limited to 'include/linux')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 58ff4e74b2f..72fcfcff563 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -8,6 +8,7 @@
 #include <linux/preempt.h>
 #include <linux/cpumask.h>
 #include <linux/irqreturn.h>
+#include <linux/irqnr.h>
 #include <linux/hardirq.h>
 #include <linux/sched.h>
 #include <linux/irqflags.h>
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 0618fb362cb..d058c57be02 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -11,25 +11,6 @@
 
 #include <linux/smp.h>
 
-#ifndef CONFIG_GENERIC_HARDIRQS
-# define nr_irqs		NR_IRQS
-
-# define for_each_irq_desc(irq, desc)		\
-	for (irq = 0; irq < nr_irqs; irq++)
-#else
-extern int nr_irqs;
-
-# define for_each_irq_desc(irq, desc)		\
-	for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++)
-
-# define for_each_irq_desc_reverse(irq, desc)			\
-	for (irq = nr_irqs -1, desc = irq_desc + (nr_irqs -1 );	\
-	     irq > 0; irq--, desc--)
-#endif
-
-#define for_each_irq_nr(irq)			\
-	for (irq = 0; irq < nr_irqs; irq++)
-
 #ifndef CONFIG_S390
 
 #include <linux/linkage.h>
@@ -37,6 +18,7 @@ extern int nr_irqs;
 #include <linux/spinlock.h>
 #include <linux/cpumask.h>
 #include <linux/irqreturn.h>
+#include <linux/irqnr.h>
 #include <linux/errno.h>
 
 #include <asm/irq.h>
diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h
new file mode 100644
index 00000000000..3171ddc3b39
--- /dev/null
+++ b/include/linux/irqnr.h
@@ -0,0 +1,24 @@
+#ifndef _LINUX_IRQNR_H
+#define _LINUX_IRQNR_H
+
+#ifndef CONFIG_GENERIC_HARDIRQS
+#include <asm/irq.h>
+# define nr_irqs		NR_IRQS
+
+# define for_each_irq_desc(irq, desc)		\
+	for (irq = 0; irq < nr_irqs; irq++)
+#else
+extern int nr_irqs;
+
+# define for_each_irq_desc(irq, desc)		\
+	for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++)
+
+# define for_each_irq_desc_reverse(irq, desc)			\
+	for (irq = nr_irqs -1, desc = irq_desc + (nr_irqs -1 );	\
+	     irq > 0; irq--, desc--)
+#endif
+
+#define for_each_irq_nr(irq)			\
+	for (irq = 0; irq < nr_irqs; irq++)
+
+#endif
-- 
cgit v1.2.3


From 1c1b6ffce5737d764cc474b9bd6677bb9a344094 Mon Sep 17 00:00:00 2001
From: Dmitry Baryshkov <dbaryshkov@gmail.com>
Date: Wed, 24 Sep 2008 23:36:23 +0200
Subject: mfd: provide and use setup hook for tc6393xb

Instead of using bitfields for initial gpio setup,
provide generic setup/teardown hooks that can be used
to set the gpio states, register child devices, etc.

Signed-off-by: Dmitry Baryshkov <dbaryshkov@gmail.com>
Signed-off-by: Samuel Ortiz <sameo@openedhand.com>
---
 include/linux/mfd/tc6393xb.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/tc6393xb.h b/include/linux/mfd/tc6393xb.h
index fec7b3f7a81..1fa820646d9 100644
--- a/include/linux/mfd/tc6393xb.h
+++ b/include/linux/mfd/tc6393xb.h
@@ -21,8 +21,6 @@
 struct tc6393xb_platform_data {
 	u16	scr_pll2cr;	/* PLL2 Control */
 	u16	scr_gper;	/* GP Enable */
-	u32	scr_gpo_doecr;	/* GPO Data OE Control */
-	u32	scr_gpo_dsr;	/* GPO Data Set */
 
 	int	(*enable)(struct platform_device *dev);
 	int	(*disable)(struct platform_device *dev);
@@ -31,6 +29,8 @@ struct tc6393xb_platform_data {
 
 	int	irq_base;	/* base for subdevice irqs */
 	int	gpio_base;
+	int	(*setup)(struct platform_device *dev);
+	void	(*teardown)(struct platform_device *dev);
 
 	struct tmio_nand_data	*nand_data;
 };
-- 
cgit v1.2.3


From f98a0bd0e4b77b12e49ce01f4c9f04503931c291 Mon Sep 17 00:00:00 2001
From: Dmitry Baryshkov <dbaryshkov@gmail.com>
Date: Wed, 24 Sep 2008 23:46:10 +0200
Subject: mfd: do tc6393xb state restore on resume only if requested

As requested by Ian make state restore only if it's requested
by platform data: some platforms do correctly save the state of
the chip during suspend/resume, but some (like tosa) incorrectly
power off the chip at suspend, so the driver supports restoring
some bits of the tc6393xb state (not full, merely enough to support
resume on tosa). With this patch this code is disabled by default.

Signed-off-by: Dmitry Baryshkov <dbaryshkov@gmail.com>
Acked-by: Ian Molton <spyro@f2s.com>
Signed-off-by: Samuel Ortiz <sameo@openedhand.com>
---
 include/linux/mfd/tc6393xb.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/tc6393xb.h b/include/linux/mfd/tc6393xb.h
index 1fa820646d9..3ce10ae0f39 100644
--- a/include/linux/mfd/tc6393xb.h
+++ b/include/linux/mfd/tc6393xb.h
@@ -33,6 +33,10 @@ struct tc6393xb_platform_data {
 	void	(*teardown)(struct platform_device *dev);
 
 	struct tmio_nand_data	*nand_data;
+
+	unsigned resume_restore : 1; /* make special actions
+					to preserve the state
+					on suspend/resume */
 };
 
 /*
-- 
cgit v1.2.3


From 51a55623565c6ca864f7cf19e87c2d4bde1c0c5e Mon Sep 17 00:00:00 2001
From: Dmitry Baryshkov <dbaryshkov@gmail.com>
Date: Fri, 3 Oct 2008 20:11:36 +0200
Subject: mfd: add OHCI cell to tc6393xb

Add information regarding OHCI cell of the tc6393xb

Signed-off-by: Dmitry Baryshkov <dbaryshkov@gmail.com>
Acked-by: Ian Molton <spyro@f2s.com>
Signed-off-by: Samuel Ortiz <sameo@openedhand.com>
---
 include/linux/mfd/tc6393xb.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/tc6393xb.h b/include/linux/mfd/tc6393xb.h
index 3ce10ae0f39..4437736ebe1 100644
--- a/include/linux/mfd/tc6393xb.h
+++ b/include/linux/mfd/tc6393xb.h
@@ -44,6 +44,7 @@ struct tc6393xb_platform_data {
  */
 #define	IRQ_TC6393_NAND		0
 #define	IRQ_TC6393_MMC		1
+#define	IRQ_TC6393_OHCI		2
 
 #define	TC6393XB_NR_IRQS	8
 
-- 
cgit v1.2.3


From 9e78cfe53f3c2bc1b37870697c3cde1543fefa8b Mon Sep 17 00:00:00 2001
From: Dmitry Baryshkov <dbaryshkov@gmail.com>
Date: Sat, 4 Oct 2008 00:50:36 +0200
Subject: mfd: support tmiofb cell on tc6393xb

Add support for tmiofb cell found in tc6393xb chip.

Signed-off-by: Dmitry Baryshkov <dbaryshkov@gmail.com>
Cc: Ian Molton <spyro@f2s.com>
Signed-off-by: Samuel Ortiz <sameo@openedhand.com>
---
 include/linux/mfd/tc6393xb.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/tc6393xb.h b/include/linux/mfd/tc6393xb.h
index 4437736ebe1..626e448205c 100644
--- a/include/linux/mfd/tc6393xb.h
+++ b/include/linux/mfd/tc6393xb.h
@@ -17,6 +17,8 @@
 #ifndef MFD_TC6393XB_H
 #define MFD_TC6393XB_H
 
+#include <linux/fb.h>
+
 /* Also one should provide the CK3P6MI clock */
 struct tc6393xb_platform_data {
 	u16	scr_pll2cr;	/* PLL2 Control */
@@ -33,18 +35,24 @@ struct tc6393xb_platform_data {
 	void	(*teardown)(struct platform_device *dev);
 
 	struct tmio_nand_data	*nand_data;
+	struct tmio_fb_data	*fb_data;
 
 	unsigned resume_restore : 1; /* make special actions
 					to preserve the state
 					on suspend/resume */
 };
 
+extern int tc6393xb_lcd_mode(struct platform_device *fb,
+			     const struct fb_videomode *mode);
+extern int tc6393xb_lcd_set_power(struct platform_device *fb, bool on);
+
 /*
  * Relative to irq_base
  */
 #define	IRQ_TC6393_NAND		0
 #define	IRQ_TC6393_MMC		1
 #define	IRQ_TC6393_OHCI		2
+#define	IRQ_TC6393_FB		4
 
 #define	TC6393XB_NR_IRQS	8
 
-- 
cgit v1.2.3


From a603a7fa8717fb778bba91b5a879babf333dc6a3 Mon Sep 17 00:00:00 2001
From: David Brownell <dbrownell@users.sourceforge.net>
Date: Wed, 15 Oct 2008 12:15:39 +0200
Subject: mfd: TWL4030 core driver

This patch adds the core of the TWL4030 driver, which supports
chips including the TPS65950.  These chips are multi-function; see

  http://focus.ti.com/docs/prod/folders/print/tps65950.html

Public specs are in the works.  For now, the block diagram on
the second page of the datasheet is fairly informative.

There are some known issues with this core code.  Most notably,
the IRQ dispatching needs simplification (to use more of genirq),
generalization (integrating support for secondary IRQ dispatch
as well as primary, and removing the build dependency on OMAP),
and then probably updating to leverage threaded IRQ support
(expected to arrive in mainline "soon").

Once the core is in mainline, drivers for other parts of this
chip can follow its lead and start swimming upstream too.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Samuel Ortiz <sameo@openedhand.com>
---
 include/linux/i2c/twl4030.h | 339 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 339 insertions(+)
 create mode 100644 include/linux/i2c/twl4030.h

(limited to 'include/linux')

diff --git a/include/linux/i2c/twl4030.h b/include/linux/i2c/twl4030.h
new file mode 100644
index 00000000000..cdb453162a9
--- /dev/null
+++ b/include/linux/i2c/twl4030.h
@@ -0,0 +1,339 @@
+/*
+ * twl4030.h - header for TWL4030 PM and audio CODEC device
+ *
+ * Copyright (C) 2005-2006 Texas Instruments, Inc.
+ *
+ * Based on tlv320aic23.c:
+ * Copyright (c) by Kai Svahn <kai.svahn@nokia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *
+ */
+
+#ifndef __TWL4030_H_
+#define __TWL4030_H_
+
+/*
+ * Using the twl4030 core we address registers using a pair
+ *	{ module id, relative register offset }
+ * which that core then maps to the relevant
+ *	{ i2c slave, absolute register address }
+ *
+ * The module IDs are meaningful only to the twl4030 core code,
+ * which uses them as array indices to look up the first register
+ * address each module uses within a given i2c slave.
+ */
+
+/* Slave 0 (i2c address 0x48) */
+#define TWL4030_MODULE_USB		0x00
+
+/* Slave 1 (i2c address 0x49) */
+#define TWL4030_MODULE_AUDIO_VOICE	0x01
+#define TWL4030_MODULE_GPIO		0x02
+#define TWL4030_MODULE_INTBR		0x03
+#define TWL4030_MODULE_PIH		0x04
+#define TWL4030_MODULE_TEST		0x05
+
+/* Slave 2 (i2c address 0x4a) */
+#define TWL4030_MODULE_KEYPAD		0x06
+#define TWL4030_MODULE_MADC		0x07
+#define TWL4030_MODULE_INTERRUPTS	0x08
+#define TWL4030_MODULE_LED		0x09
+#define TWL4030_MODULE_MAIN_CHARGE	0x0A
+#define TWL4030_MODULE_PRECHARGE	0x0B
+#define TWL4030_MODULE_PWM0		0x0C
+#define TWL4030_MODULE_PWM1		0x0D
+#define TWL4030_MODULE_PWMA		0x0E
+#define TWL4030_MODULE_PWMB		0x0F
+
+/* Slave 3 (i2c address 0x4b) */
+#define TWL4030_MODULE_BACKUP		0x10
+#define TWL4030_MODULE_INT		0x11
+#define TWL4030_MODULE_PM_MASTER	0x12
+#define TWL4030_MODULE_PM_RECEIVER	0x13
+#define TWL4030_MODULE_RTC		0x14
+#define TWL4030_MODULE_SECURED_REG	0x15
+
+/*
+ * Read and write single 8-bit registers
+ */
+int twl4030_i2c_write_u8(u8 mod_no, u8 val, u8 reg);
+int twl4030_i2c_read_u8(u8 mod_no, u8 *val, u8 reg);
+
+/*
+ * Read and write several 8-bit registers at once.
+ *
+ * IMPORTANT:  For twl4030_i2c_write(), allocate num_bytes + 1
+ * for the value, and populate your data starting at offset 1.
+ */
+int twl4030_i2c_write(u8 mod_no, u8 *value, u8 reg, u8 num_bytes);
+int twl4030_i2c_read(u8 mod_no, u8 *value, u8 reg, u8 num_bytes);
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * NOTE:  at up to 1024 registers, this is a big chip.
+ *
+ * Avoid putting register declarations in this file, instead of into
+ * a driver-private file, unless some of the registers in a block
+ * need to be shared with other drivers.  One example is blocks that
+ * have Secondary IRQ Handler (SIH) registers.
+ */
+
+#define TWL4030_SIH_CTRL_EXCLEN_MASK	BIT(0)
+#define TWL4030_SIH_CTRL_PENDDIS_MASK	BIT(1)
+#define TWL4030_SIH_CTRL_COR_MASK	BIT(2)
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * GPIO Block Register offsets (use TWL4030_MODULE_GPIO)
+ */
+
+#define REG_GPIODATAIN1			0x0
+#define REG_GPIODATAIN2			0x1
+#define REG_GPIODATAIN3			0x2
+#define REG_GPIODATADIR1		0x3
+#define REG_GPIODATADIR2		0x4
+#define REG_GPIODATADIR3		0x5
+#define REG_GPIODATAOUT1		0x6
+#define REG_GPIODATAOUT2		0x7
+#define REG_GPIODATAOUT3		0x8
+#define REG_CLEARGPIODATAOUT1		0x9
+#define REG_CLEARGPIODATAOUT2		0xA
+#define REG_CLEARGPIODATAOUT3		0xB
+#define REG_SETGPIODATAOUT1		0xC
+#define REG_SETGPIODATAOUT2		0xD
+#define REG_SETGPIODATAOUT3		0xE
+#define REG_GPIO_DEBEN1			0xF
+#define REG_GPIO_DEBEN2			0x10
+#define REG_GPIO_DEBEN3			0x11
+#define REG_GPIO_CTRL			0x12
+#define REG_GPIOPUPDCTR1		0x13
+#define REG_GPIOPUPDCTR2		0x14
+#define REG_GPIOPUPDCTR3		0x15
+#define REG_GPIOPUPDCTR4		0x16
+#define REG_GPIOPUPDCTR5		0x17
+#define REG_GPIO_ISR1A			0x19
+#define REG_GPIO_ISR2A			0x1A
+#define REG_GPIO_ISR3A			0x1B
+#define REG_GPIO_IMR1A			0x1C
+#define REG_GPIO_IMR2A			0x1D
+#define REG_GPIO_IMR3A			0x1E
+#define REG_GPIO_ISR1B			0x1F
+#define REG_GPIO_ISR2B			0x20
+#define REG_GPIO_ISR3B			0x21
+#define REG_GPIO_IMR1B			0x22
+#define REG_GPIO_IMR2B			0x23
+#define REG_GPIO_IMR3B			0x24
+#define REG_GPIO_EDR1			0x28
+#define REG_GPIO_EDR2			0x29
+#define REG_GPIO_EDR3			0x2A
+#define REG_GPIO_EDR4			0x2B
+#define REG_GPIO_EDR5			0x2C
+#define REG_GPIO_SIH_CTRL		0x2D
+
+/* Up to 18 signals are available as GPIOs, when their
+ * pins are not assigned to another use (such as ULPI/USB).
+ */
+#define TWL4030_GPIO_MAX		18
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * Keypad register offsets (use TWL4030_MODULE_KEYPAD)
+ * ... SIH/interrupt only
+ */
+
+#define TWL4030_KEYPAD_KEYP_ISR1	0x11
+#define TWL4030_KEYPAD_KEYP_IMR1	0x12
+#define TWL4030_KEYPAD_KEYP_ISR2	0x13
+#define TWL4030_KEYPAD_KEYP_IMR2	0x14
+#define TWL4030_KEYPAD_KEYP_SIR		0x15	/* test register */
+#define TWL4030_KEYPAD_KEYP_EDR		0x16
+#define TWL4030_KEYPAD_KEYP_SIH_CTRL	0x17
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * Multichannel ADC register offsets (use TWL4030_MODULE_MADC)
+ * ... SIH/interrupt only
+ */
+
+#define TWL4030_MADC_ISR1		0x61
+#define TWL4030_MADC_IMR1		0x62
+#define TWL4030_MADC_ISR2		0x63
+#define TWL4030_MADC_IMR2		0x64
+#define TWL4030_MADC_SIR		0x65	/* test register */
+#define TWL4030_MADC_EDR		0x66
+#define TWL4030_MADC_SIH_CTRL		0x67
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * Battery charger register offsets (use TWL4030_MODULE_INTERRUPTS)
+ */
+
+#define TWL4030_INTERRUPTS_BCIISR1A	0x0
+#define TWL4030_INTERRUPTS_BCIISR2A	0x1
+#define TWL4030_INTERRUPTS_BCIIMR1A	0x2
+#define TWL4030_INTERRUPTS_BCIIMR2A	0x3
+#define TWL4030_INTERRUPTS_BCIISR1B	0x4
+#define TWL4030_INTERRUPTS_BCIISR2B	0x5
+#define TWL4030_INTERRUPTS_BCIIMR1B	0x6
+#define TWL4030_INTERRUPTS_BCIIMR2B	0x7
+#define TWL4030_INTERRUPTS_BCISIR1	0x8	/* test register */
+#define TWL4030_INTERRUPTS_BCISIR2	0x9	/* test register */
+#define TWL4030_INTERRUPTS_BCIEDR1	0xa
+#define TWL4030_INTERRUPTS_BCIEDR2	0xb
+#define TWL4030_INTERRUPTS_BCIEDR3	0xc
+#define TWL4030_INTERRUPTS_BCISIHCTRL	0xd
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * Power Interrupt block register offsets (use TWL4030_MODULE_INT)
+ */
+
+#define TWL4030_INT_PWR_ISR1		0x0
+#define TWL4030_INT_PWR_IMR1		0x1
+#define TWL4030_INT_PWR_ISR2		0x2
+#define TWL4030_INT_PWR_IMR2		0x3
+#define TWL4030_INT_PWR_SIR		0x4	/* test register */
+#define TWL4030_INT_PWR_EDR1		0x5
+#define TWL4030_INT_PWR_EDR2		0x6
+#define TWL4030_INT_PWR_SIH_CTRL	0x7
+
+/*----------------------------------------------------------------------*/
+
+struct twl4030_bci_platform_data {
+	int *battery_tmp_tbl;
+	unsigned int tblsize;
+};
+
+/* TWL4030_GPIO_MAX (18) GPIOs, with interrupts */
+struct twl4030_gpio_platform_data {
+	int		gpio_base;
+	unsigned	irq_base, irq_end;
+
+	/* For gpio-N, bit (1 << N) in "pullups" is set if that pullup
+	 * should be enabled.  Else, if that bit is set in "pulldowns",
+	 * that pulldown is enabled.  Don't waste power by letting any
+	 * digital inputs float...
+	 */
+	u32		pullups;
+	u32		pulldowns;
+
+	int		(*setup)(struct device *dev,
+				unsigned gpio, unsigned ngpio);
+	int		(*teardown)(struct device *dev,
+				unsigned gpio, unsigned ngpio);
+};
+
+struct twl4030_madc_platform_data {
+	int		irq_line;
+};
+
+struct twl4030_keypad_data {
+	int rows;
+	int cols;
+	int *keymap;
+	int irq;
+	unsigned int keymapsize;
+	unsigned int rep:1;
+};
+
+enum twl4030_usb_mode {
+	T2_USB_MODE_ULPI = 1,
+	T2_USB_MODE_CEA2011_3PIN = 2,
+};
+
+struct twl4030_usb_data {
+	enum twl4030_usb_mode	usb_mode;
+};
+
+struct twl4030_platform_data {
+	unsigned				irq_base, irq_end;
+	struct twl4030_bci_platform_data	*bci;
+	struct twl4030_gpio_platform_data	*gpio;
+	struct twl4030_madc_platform_data	*madc;
+	struct twl4030_keypad_data		*keypad;
+	struct twl4030_usb_data			*usb;
+
+	/* REVISIT more to come ... _nothing_ should be hard-wired */
+};
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * FIXME completely stop using TWL4030_IRQ_BASE ... instead, pass the
+ * IRQ data to subsidiary devices using platform device resources.
+ */
+
+/* IRQ information-need base */
+#include <mach/irqs.h>
+/* TWL4030 interrupts */
+
+/* #define TWL4030_MODIRQ_GPIO		(TWL4030_IRQ_BASE + 0) */
+#define TWL4030_MODIRQ_KEYPAD		(TWL4030_IRQ_BASE + 1)
+#define TWL4030_MODIRQ_BCI		(TWL4030_IRQ_BASE + 2)
+#define TWL4030_MODIRQ_MADC		(TWL4030_IRQ_BASE + 3)
+/* #define TWL4030_MODIRQ_USB		(TWL4030_IRQ_BASE + 4) */
+#define TWL4030_MODIRQ_PWR		(TWL4030_IRQ_BASE + 5)
+
+#define TWL4030_PWRIRQ_PWRBTN		(TWL4030_PWR_IRQ_BASE + 0)
+#define TWL4030_PWRIRQ_CHG_PRES		(TWL4030_PWR_IRQ_BASE + 1)
+#define TWL4030_PWRIRQ_USB_PRES		(TWL4030_PWR_IRQ_BASE + 2)
+#define TWL4030_PWRIRQ_RTC		(TWL4030_PWR_IRQ_BASE + 3)
+#define TWL4030_PWRIRQ_HOT_DIE		(TWL4030_PWR_IRQ_BASE + 4)
+#define TWL4030_PWRIRQ_PWROK_TIMEOUT	(TWL4030_PWR_IRQ_BASE + 5)
+#define TWL4030_PWRIRQ_MBCHG		(TWL4030_PWR_IRQ_BASE + 6)
+#define TWL4030_PWRIRQ_SC_DETECT	(TWL4030_PWR_IRQ_BASE + 7)
+
+/* Rest are unused currently */
+
+/* Offsets to Power Registers */
+#define TWL4030_VDAC_DEV_GRP		0x3B
+#define TWL4030_VDAC_DEDICATED		0x3E
+#define TWL4030_VAUX1_DEV_GRP		0x17
+#define TWL4030_VAUX1_DEDICATED		0x1A
+#define TWL4030_VAUX2_DEV_GRP		0x1B
+#define TWL4030_VAUX2_DEDICATED		0x1E
+#define TWL4030_VAUX3_DEV_GRP		0x1F
+#define TWL4030_VAUX3_DEDICATED		0x22
+
+/* TWL4030 GPIO interrupt definitions */
+
+#define TWL4030_GPIO_IRQ_NO(n)		(TWL4030_GPIO_IRQ_BASE + (n))
+#define TWL4030_GPIO_IS_ENABLE		1
+
+/*
+ * Exported TWL4030 GPIO APIs
+ *
+ * WARNING -- use standard GPIO and IRQ calls instead; these will vanish.
+ */
+int twl4030_get_gpio_datain(int gpio);
+int twl4030_request_gpio(int gpio);
+int twl4030_set_gpio_debounce(int gpio, int enable);
+int twl4030_free_gpio(int gpio);
+
+#if defined(CONFIG_TWL4030_BCI_BATTERY) || \
+	defined(CONFIG_TWL4030_BCI_BATTERY_MODULE)
+	extern int twl4030charger_usb_en(int enable);
+#else
+	static inline int twl4030charger_usb_en(int enable) { return 0; }
+#endif
+
+#endif /* End of __TWL4030_H_ */
-- 
cgit v1.2.3


From 26b8f5e1e2d1229c186d8e61d26513c43a058c5e Mon Sep 17 00:00:00 2001
From: Eric Miao <eric.miao@marvell.com>
Date: Wed, 15 Oct 2008 12:20:06 +0200
Subject: mfd: add base support for Dialog DA9030/DA9034 PMICs

DA9030 (a.k.a ARAVA) and DA9034 (a.k.a MICCO) are PMICs designed by
Dialog Semiconductor, usually found on PXA-based platforms. These
PMICs are I2C-based, multi-function devices, usually with LEDs, PWMs
for backlight, BUCKs and LDOs, ADCs and touchscreen controller (on
DA9034).

This is the base support for the I2C operations, event registration
and handling, sub-devices management.

Signed-off-by: Mike Rapoport <mike@compulab.co.il>
Signed-off-by: Eric Miao <eric.miao@marvell.com>
Signed-off-by: Liam Girdwood <lrg@kernel.org>
Signed-off-by: Samuel Ortiz <sameo@openedhand.com>
---
 include/linux/mfd/da903x.h | 201 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 201 insertions(+)
 create mode 100644 include/linux/mfd/da903x.h

(limited to 'include/linux')

diff --git a/include/linux/mfd/da903x.h b/include/linux/mfd/da903x.h
new file mode 100644
index 00000000000..cad314c1243
--- /dev/null
+++ b/include/linux/mfd/da903x.h
@@ -0,0 +1,201 @@
+#ifndef __LINUX_PMIC_DA903X_H
+#define __LINUX_PMIC_DA903X_H
+
+/* Unified sub device IDs for DA9030/DA9034 */
+enum {
+	DA9030_ID_LED_1,
+	DA9030_ID_LED_2,
+	DA9030_ID_LED_3,
+	DA9030_ID_LED_4,
+	DA9030_ID_LED_PC,
+	DA9030_ID_VIBRA,
+	DA9030_ID_WLED,
+	DA9030_ID_BUCK1,
+	DA9030_ID_BUCK2,
+	DA9030_ID_LDO1,
+	DA9030_ID_LDO2,
+	DA9030_ID_LDO3,
+	DA9030_ID_LDO4,
+	DA9030_ID_LDO5,
+	DA9030_ID_LDO6,
+	DA9030_ID_LDO7,
+	DA9030_ID_LDO8,
+	DA9030_ID_LDO9,
+	DA9030_ID_LDO10,
+	DA9030_ID_LDO11,
+	DA9030_ID_LDO12,
+	DA9030_ID_LDO13,
+	DA9030_ID_LDO14,
+	DA9030_ID_LDO15,
+	DA9030_ID_LDO16,
+	DA9030_ID_LDO17,
+	DA9030_ID_LDO18,
+	DA9030_ID_LDO19,
+	DA9030_ID_LDO_INT,	/* LDO Internal */
+
+	DA9034_ID_LED_1,
+	DA9034_ID_LED_2,
+	DA9034_ID_VIBRA,
+	DA9034_ID_WLED,
+	DA9034_ID_TOUCH,
+
+	DA9034_ID_BUCK1,
+	DA9034_ID_BUCK2,
+	DA9034_ID_LDO1,
+	DA9034_ID_LDO2,
+	DA9034_ID_LDO3,
+	DA9034_ID_LDO4,
+	DA9034_ID_LDO5,
+	DA9034_ID_LDO6,
+	DA9034_ID_LDO7,
+	DA9034_ID_LDO8,
+	DA9034_ID_LDO9,
+	DA9034_ID_LDO10,
+	DA9034_ID_LDO11,
+	DA9034_ID_LDO12,
+	DA9034_ID_LDO13,
+	DA9034_ID_LDO14,
+	DA9034_ID_LDO15,
+};
+
+/*
+ * DA9030/DA9034 LED sub-devices use the generic "struct led_info"
+ * as the platform_data
+ */
+
+/* DA9030 flags for "struct led_info"
+ */
+#define DA9030_LED_RATE_ON	(0 << 5)
+#define DA9030_LED_RATE_052S	(1 << 5)
+#define DA9030_LED_DUTY_1_16	(0 << 3)
+#define DA9030_LED_DUTY_1_8	(1 << 3)
+#define DA9030_LED_DUTY_1_4	(2 << 3)
+#define DA9030_LED_DUTY_1_2	(3 << 3)
+
+#define DA9030_VIBRA_MODE_1P3V	(0 << 1)
+#define DA9030_VIBRA_MODE_2P7V	(1 << 1)
+#define DA9030_VIBRA_FREQ_1HZ	(0 << 2)
+#define DA9030_VIBRA_FREQ_2HZ	(1 << 2)
+#define DA9030_VIBRA_FREQ_4HZ	(2 << 2)
+#define DA9030_VIBRA_FREQ_8HZ	(3 << 2)
+#define DA9030_VIBRA_DUTY_ON	(0 << 4)
+#define DA9030_VIBRA_DUTY_75P	(1 << 4)
+#define DA9030_VIBRA_DUTY_50P	(2 << 4)
+#define DA9030_VIBRA_DUTY_25P	(3 << 4)
+
+/* DA9034 flags for "struct led_info" */
+#define DA9034_LED_RAMP		(1 << 7)
+
+/* DA9034 touch screen platform data */
+struct da9034_touch_pdata {
+	int	interval_ms;	/* sampling interval while pen down */
+	int	x_inverted;
+	int	y_inverted;
+};
+
+struct da903x_subdev_info {
+	int		id;
+	const char	*name;
+	void		*platform_data;
+};
+
+struct da903x_platform_data {
+	int num_subdevs;
+	struct da903x_subdev_info *subdevs;
+};
+
+/* bit definitions for DA9030 events */
+#define DA9030_EVENT_ONKEY		(1 << 0)
+#define	DA9030_EVENT_PWREN		(1 << 1)
+#define	DA9030_EVENT_EXTON		(1 << 2)
+#define	DA9030_EVENT_CHDET		(1 << 3)
+#define	DA9030_EVENT_TBAT		(1 << 4)
+#define	DA9030_EVENT_VBATMON		(1 << 5)
+#define	DA9030_EVENT_VBATMON_TXON	(1 << 6)
+#define	DA9030_EVENT_CHIOVER		(1 << 7)
+#define	DA9030_EVENT_TCTO		(1 << 8)
+#define	DA9030_EVENT_CCTO		(1 << 9)
+#define	DA9030_EVENT_ADC_READY		(1 << 10)
+#define	DA9030_EVENT_VBUS_4P4		(1 << 11)
+#define	DA9030_EVENT_VBUS_4P0		(1 << 12)
+#define	DA9030_EVENT_SESS_VALID		(1 << 13)
+#define	DA9030_EVENT_SRP_DETECT		(1 << 14)
+#define	DA9030_EVENT_WATCHDOG		(1 << 15)
+#define	DA9030_EVENT_LDO15		(1 << 16)
+#define	DA9030_EVENT_LDO16		(1 << 17)
+#define	DA9030_EVENT_LDO17		(1 << 18)
+#define	DA9030_EVENT_LDO18		(1 << 19)
+#define	DA9030_EVENT_LDO19		(1 << 20)
+#define	DA9030_EVENT_BUCK2		(1 << 21)
+
+/* bit definitions for DA9034 events */
+#define DA9034_EVENT_ONKEY		(1 << 0)
+#define DA9034_EVENT_EXTON		(1 << 2)
+#define DA9034_EVENT_CHDET		(1 << 3)
+#define DA9034_EVENT_TBAT		(1 << 4)
+#define DA9034_EVENT_VBATMON		(1 << 5)
+#define DA9034_EVENT_REV_IOVER		(1 << 6)
+#define DA9034_EVENT_CH_IOVER		(1 << 7)
+#define DA9034_EVENT_CH_TCTO		(1 << 8)
+#define DA9034_EVENT_CH_CCTO		(1 << 9)
+#define DA9034_EVENT_USB_DEV		(1 << 10)
+#define DA9034_EVENT_OTGCP_IOVER	(1 << 11)
+#define DA9034_EVENT_VBUS_4P55		(1 << 12)
+#define DA9034_EVENT_VBUS_3P8		(1 << 13)
+#define DA9034_EVENT_SESS_1P8		(1 << 14)
+#define DA9034_EVENT_SRP_READY		(1 << 15)
+#define DA9034_EVENT_ADC_MAN		(1 << 16)
+#define DA9034_EVENT_ADC_AUTO4		(1 << 17)
+#define DA9034_EVENT_ADC_AUTO5		(1 << 18)
+#define DA9034_EVENT_ADC_AUTO6		(1 << 19)
+#define DA9034_EVENT_PEN_DOWN		(1 << 20)
+#define DA9034_EVENT_TSI_READY		(1 << 21)
+#define DA9034_EVENT_UART_TX		(1 << 22)
+#define DA9034_EVENT_UART_RX		(1 << 23)
+#define DA9034_EVENT_HEADSET		(1 << 25)
+#define DA9034_EVENT_HOOKSWITCH		(1 << 26)
+#define DA9034_EVENT_WATCHDOG		(1 << 27)
+
+extern int da903x_register_notifier(struct device *dev,
+		struct notifier_block *nb, unsigned int events);
+extern int da903x_unregister_notifier(struct device *dev,
+		struct notifier_block *nb, unsigned int events);
+
+/* Status Query Interface */
+#define DA9030_STATUS_ONKEY		(1 << 0)
+#define DA9030_STATUS_PWREN1		(1 << 1)
+#define DA9030_STATUS_EXTON		(1 << 2)
+#define DA9030_STATUS_CHDET		(1 << 3)
+#define DA9030_STATUS_TBAT		(1 << 4)
+#define DA9030_STATUS_VBATMON		(1 << 5)
+#define DA9030_STATUS_VBATMON_TXON	(1 << 6)
+#define DA9030_STATUS_MCLKDET		(1 << 7)
+
+#define DA9034_STATUS_ONKEY		(1 << 0)
+#define DA9034_STATUS_EXTON		(1 << 2)
+#define DA9034_STATUS_CHDET		(1 << 3)
+#define DA9034_STATUS_TBAT		(1 << 4)
+#define DA9034_STATUS_VBATMON		(1 << 5)
+#define DA9034_STATUS_PEN_DOWN		(1 << 6)
+#define DA9034_STATUS_MCLKDET		(1 << 7)
+#define DA9034_STATUS_USB_DEV		(1 << 8)
+#define DA9034_STATUS_HEADSET		(1 << 9)
+#define DA9034_STATUS_HOOKSWITCH	(1 << 10)
+#define DA9034_STATUS_REMCON		(1 << 11)
+#define DA9034_STATUS_VBUS_VALID_4P55	(1 << 12)
+#define DA9034_STATUS_VBUS_VALID_3P8	(1 << 13)
+#define DA9034_STATUS_SESS_VALID_1P8	(1 << 14)
+#define DA9034_STATUS_SRP_READY		(1 << 15)
+
+extern int da903x_query_status(struct device *dev, unsigned int status);
+
+
+/* NOTE: the two functions below are not intended for use outside
+ * of the DA9034 sub-device drivers
+ */
+extern int da903x_write(struct device *dev, int reg, uint8_t val);
+extern int da903x_read(struct device *dev, int reg, uint8_t *val);
+extern int da903x_update(struct device *dev, int reg, uint8_t val, uint8_t mask);
+extern int da903x_set_bits(struct device *dev, int reg, uint8_t bit_mask);
+extern int da903x_clr_bits(struct device *dev, int reg, uint8_t bit_mask);
+#endif /* __LINUX_PMIC_DA903X_H */
-- 
cgit v1.2.3


From 7acb706ca97fce84bda4a902a33de2f3dae10260 Mon Sep 17 00:00:00 2001
From: Ian Molton <spyro@f2s.com>
Date: Thu, 9 Oct 2008 20:06:09 +0200
Subject: mfd: update TMIO drivers to use the clock API

This patch updates the remaining two TMIO drivers to use the clock API
rather than callback hooks into platform code.

Signed-off-by: Ian Molton <spyro@f2s.com>
Signed-off-by: Samuel Ortiz <sameo@openedhand.com>
---
 include/linux/mfd/t7l66xb.h  | 2 --
 include/linux/mfd/tc6387xb.h | 3 ---
 2 files changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/t7l66xb.h b/include/linux/mfd/t7l66xb.h
index e83c7f2036f..b4629818aea 100644
--- a/include/linux/mfd/t7l66xb.h
+++ b/include/linux/mfd/t7l66xb.h
@@ -15,8 +15,6 @@
 #include <linux/mfd/tmio.h>
 
 struct t7l66xb_platform_data {
-	int (*enable_clk32k)(struct platform_device *dev);
-	void (*disable_clk32k)(struct platform_device *dev);
 	int (*enable)(struct platform_device *dev);
 	int (*disable)(struct platform_device *dev);
 	int (*suspend)(struct platform_device *dev);
diff --git a/include/linux/mfd/tc6387xb.h b/include/linux/mfd/tc6387xb.h
index fa06e0610b8..b4888209494 100644
--- a/include/linux/mfd/tc6387xb.h
+++ b/include/linux/mfd/tc6387xb.h
@@ -11,9 +11,6 @@
 #define MFD_TC6387XB_H
 
 struct tc6387xb_platform_data {
-	int (*enable_clk32k)(struct platform_device *dev);
-	void (*disable_clk32k)(struct platform_device *dev);
-
 	int (*enable)(struct platform_device *dev);
 	int (*disable)(struct platform_device *dev);
 	int (*suspend)(struct platform_device *dev);
-- 
cgit v1.2.3


From 62695a84eb8f2e718bf4dfb21700afaa7a08e0ea Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Sat, 18 Oct 2008 20:26:09 -0700
Subject: vmscan: move isolate_lru_page() to vmscan.c

On large memory systems, the VM can spend way too much time scanning
through pages that it cannot (or should not) evict from memory.  Not only
does it use up CPU time, but it also provokes lock contention and can
leave large systems under memory pressure in a catatonic state.

This patch series improves VM scalability by:

1) putting filesystem backed, swap backed and unevictable pages
   onto their own LRUs, so the system only scans the pages that it
   can/should evict from memory

2) switching to two handed clock replacement for the anonymous LRUs,
   so the number of pages that need to be scanned when the system
   starts swapping is bound to a reasonable number

3) keeping unevictable pages off the LRU completely, so the
   VM does not waste CPU time scanning them. ramfs, ramdisk,
   SHM_LOCKED shared memory segments and mlock()ed VMA pages
   are kept on the unevictable list.

This patch:

isolate_lru_page logically belongs in vmscan.c rather than migrate.c.

It is tough, because we don't need that function without memory migration
so there is a valid argument to have it in migrate.c.  However a
subsequent patch needs to make use of it in the core mm, so we can happily
move it to vmscan.c.

Also, make the function a little more generic by not requiring that it
adds an isolated page to a given list.  Callers can do that.

	Note that we now have '__isolate_lru_page()', that does
	something quite different, visible outside of vmscan.c
	for use with memory controller.  Methinks we need to
	rationalize these names/purposes.	--lts

[akpm@linux-foundation.org: fix mm/memory_hotplug.c build]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/migrate.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 03aea612d28..3f34005068d 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -7,7 +7,6 @@
 typedef struct page *new_page_t(struct page *, unsigned long private, int **);
 
 #ifdef CONFIG_MIGRATION
-extern int isolate_lru_page(struct page *p, struct list_head *pagelist);
 extern int putback_lru_pages(struct list_head *l);
 extern int migrate_page(struct address_space *,
 			struct page *, struct page *);
@@ -21,8 +20,6 @@ extern int migrate_vmas(struct mm_struct *mm,
 		const nodemask_t *from, const nodemask_t *to,
 		unsigned long flags);
 #else
-static inline int isolate_lru_page(struct page *p, struct list_head *list)
-					{ return -ENOSYS; }
 static inline int putback_lru_pages(struct list_head *l) { return 0; }
 static inline int migrate_pages(struct list_head *l, new_page_t x,
 		unsigned long private) { return -ENOSYS; }
-- 
cgit v1.2.3


From b69408e88bd86b98feb7b9a38fd865e1ddb29827 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux-foundation.org>
Date: Sat, 18 Oct 2008 20:26:14 -0700
Subject: vmscan: Use an indexed array for LRU variables

Currently we are defining explicit variables for the inactive and active
list.  An indexed array can be more generic and avoid repeating similar
code in several places in the reclaim code.

We are saving a few bytes in terms of code size:

Before:

   text    data     bss     dec     hex filename
4097753  573120 4092484 8763357  85b7dd vmlinux

After:

   text    data     bss     dec     hex filename
4097729  573120 4092484 8763333  85b7c5 vmlinux

Having an easy way to add new lru lists may ease future work on the
reclaim code.

Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 17 +++++-----------
 include/linux/mm_inline.h  | 49 +++++++++++++++++++++++++++++++++++-----------
 include/linux/mmzone.h     | 26 ++++++++++++++++++------
 3 files changed, 63 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index fdf3967e139..a6ac0d491fe 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -69,10 +69,8 @@ extern void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem,
 extern void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem,
 							int priority);
 
-extern long mem_cgroup_calc_reclaim_active(struct mem_cgroup *mem,
-				struct zone *zone, int priority);
-extern long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem,
-				struct zone *zone, int priority);
+extern long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone,
+					int priority, enum lru_list lru);
 
 #else /* CONFIG_CGROUP_MEM_RES_CTLR */
 static inline void page_reset_bad_cgroup(struct page *page)
@@ -159,14 +157,9 @@ static inline void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem,
 {
 }
 
-static inline long mem_cgroup_calc_reclaim_active(struct mem_cgroup *mem,
-					struct zone *zone, int priority)
-{
-	return 0;
-}
-
-static inline long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem,
-					struct zone *zone, int priority)
+static inline long mem_cgroup_calc_reclaim(struct mem_cgroup *mem,
+					struct zone *zone, int priority,
+					enum lru_list lru)
 {
 	return 0;
 }
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 895bc4e9303..2704729777e 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -1,40 +1,67 @@
+static inline void
+add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l)
+{
+	list_add(&page->lru, &zone->lru[l].list);
+	__inc_zone_state(zone, NR_LRU_BASE + l);
+}
+
+static inline void
+del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l)
+{
+	list_del(&page->lru);
+	__dec_zone_state(zone, NR_LRU_BASE + l);
+}
+
 static inline void
 add_page_to_active_list(struct zone *zone, struct page *page)
 {
-	list_add(&page->lru, &zone->active_list);
-	__inc_zone_state(zone, NR_ACTIVE);
+	add_page_to_lru_list(zone, page, LRU_ACTIVE);
 }
 
 static inline void
 add_page_to_inactive_list(struct zone *zone, struct page *page)
 {
-	list_add(&page->lru, &zone->inactive_list);
-	__inc_zone_state(zone, NR_INACTIVE);
+	add_page_to_lru_list(zone, page, LRU_INACTIVE);
 }
 
 static inline void
 del_page_from_active_list(struct zone *zone, struct page *page)
 {
-	list_del(&page->lru);
-	__dec_zone_state(zone, NR_ACTIVE);
+	del_page_from_lru_list(zone, page, LRU_ACTIVE);
 }
 
 static inline void
 del_page_from_inactive_list(struct zone *zone, struct page *page)
 {
-	list_del(&page->lru);
-	__dec_zone_state(zone, NR_INACTIVE);
+	del_page_from_lru_list(zone, page, LRU_INACTIVE);
 }
 
 static inline void
 del_page_from_lru(struct zone *zone, struct page *page)
 {
+	enum lru_list l = LRU_INACTIVE;
+
 	list_del(&page->lru);
 	if (PageActive(page)) {
 		__ClearPageActive(page);
-		__dec_zone_state(zone, NR_ACTIVE);
-	} else {
-		__dec_zone_state(zone, NR_INACTIVE);
+		l = LRU_ACTIVE;
 	}
+	__dec_zone_state(zone, NR_LRU_BASE + l);
 }
 
+/**
+ * page_lru - which LRU list should a page be on?
+ * @page: the page to test
+ *
+ * Returns the LRU list a page should be on, as an index
+ * into the array of LRU lists.
+ */
+static inline enum lru_list page_lru(struct page *page)
+{
+	enum lru_list lru = LRU_BASE;
+
+	if (PageActive(page))
+		lru += LRU_ACTIVE;
+
+	return lru;
+}
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 428328a05fa..156e18f3919 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -81,8 +81,9 @@ struct zone_padding {
 enum zone_stat_item {
 	/* First 128 byte cacheline (assuming 64 bit words) */
 	NR_FREE_PAGES,
-	NR_INACTIVE,
-	NR_ACTIVE,
+	NR_LRU_BASE,
+	NR_INACTIVE = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */
+	NR_ACTIVE,	/*  "     "     "   "       "         */
 	NR_ANON_PAGES,	/* Mapped anonymous pages */
 	NR_FILE_MAPPED,	/* pagecache pages mapped into pagetables.
 			   only modified from process context */
@@ -107,6 +108,19 @@ enum zone_stat_item {
 #endif
 	NR_VM_ZONE_STAT_ITEMS };
 
+enum lru_list {
+	LRU_BASE,
+	LRU_INACTIVE=LRU_BASE,	/* must match order of NR_[IN]ACTIVE */
+	LRU_ACTIVE,		/*  "     "     "   "       "        */
+	NR_LRU_LISTS };
+
+#define for_each_lru(l) for (l = 0; l < NR_LRU_LISTS; l++)
+
+static inline int is_active_lru(enum lru_list l)
+{
+	return (l == LRU_ACTIVE);
+}
+
 struct per_cpu_pages {
 	int count;		/* number of pages in the list */
 	int high;		/* high watermark, emptying needed */
@@ -251,10 +265,10 @@ struct zone {
 
 	/* Fields commonly accessed by the page reclaim scanner */
 	spinlock_t		lru_lock;	
-	struct list_head	active_list;
-	struct list_head	inactive_list;
-	unsigned long		nr_scan_active;
-	unsigned long		nr_scan_inactive;
+	struct {
+		struct list_head list;
+		unsigned long nr_scan;
+	} lru[NR_LRU_LISTS];
 	unsigned long		pages_scanned;	   /* since last reclaim */
 	unsigned long		flags;		   /* zone flags, see below */
 
-- 
cgit v1.2.3


From f04e9ebbe4909f9a41efd55149bc353299f4e83b Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Sat, 18 Oct 2008 20:26:19 -0700
Subject: swap: use an array for the LRU pagevecs

Turn the pagevecs into an array just like the LRUs.  This significantly
cleans up the source code and reduces the size of the kernel by about 13kB
after all the LRU lists have been created further down in the split VM
patch series.

Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagevec.h | 13 +++++++++++--
 include/linux/swap.h    | 18 ++++++++++++++++--
 2 files changed, 27 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index 8eb7fa76c1d..6b8f11bcc94 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -23,8 +23,7 @@ struct pagevec {
 void __pagevec_release(struct pagevec *pvec);
 void __pagevec_release_nonlru(struct pagevec *pvec);
 void __pagevec_free(struct pagevec *pvec);
-void __pagevec_lru_add(struct pagevec *pvec);
-void __pagevec_lru_add_active(struct pagevec *pvec);
+void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru);
 void pagevec_strip(struct pagevec *pvec);
 unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
 		pgoff_t start, unsigned nr_pages);
@@ -81,6 +80,16 @@ static inline void pagevec_free(struct pagevec *pvec)
 		__pagevec_free(pvec);
 }
 
+static inline void __pagevec_lru_add(struct pagevec *pvec)
+{
+	____pagevec_lru_add(pvec, LRU_INACTIVE);
+}
+
+static inline void __pagevec_lru_add_active(struct pagevec *pvec)
+{
+	____pagevec_lru_add(pvec, LRU_ACTIVE);
+}
+
 static inline void pagevec_lru_add(struct pagevec *pvec)
 {
 	if (pagevec_count(pvec))
diff --git a/include/linux/swap.h b/include/linux/swap.h
index de40f169a4e..fcc169610d0 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -171,8 +171,8 @@ extern unsigned int nr_free_pagecache_pages(void);
 
 
 /* linux/mm/swap.c */
-extern void lru_cache_add(struct page *);
-extern void lru_cache_add_active(struct page *);
+extern void __lru_cache_add(struct page *, enum lru_list lru);
+extern void lru_cache_add_lru(struct page *, enum lru_list lru);
 extern void activate_page(struct page *);
 extern void mark_page_accessed(struct page *);
 extern void lru_add_drain(void);
@@ -180,6 +180,20 @@ extern int lru_add_drain_all(void);
 extern void rotate_reclaimable_page(struct page *page);
 extern void swap_setup(void);
 
+/**
+ * lru_cache_add: add a page to the page lists
+ * @page: the page to add
+ */
+static inline void lru_cache_add(struct page *page)
+{
+	__lru_cache_add(page, LRU_INACTIVE);
+}
+
+static inline void lru_cache_add_active(struct page *page)
+{
+	__lru_cache_add(page, LRU_ACTIVE);
+}
+
 /* linux/mm/vmscan.c */
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 					gfp_t gfp_mask);
-- 
cgit v1.2.3


From 68a22394c286a2daf06ee8d65d8835f738faefa5 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Sat, 18 Oct 2008 20:26:23 -0700
Subject: vmscan: free swap space on swap-in/activation

If vm_swap_full() (swap space more than 50% full), the system will free
swap space at swapin time.  With this patch, the system will also free the
swap space in the pageout code, when we decide that the page is not a
candidate for swapout (and just wasting swap space).

Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Signed-off-by: MinChan Kim <minchan.kim@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagevec.h | 1 +
 include/linux/swap.h    | 6 ++++++
 2 files changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index 6b8f11bcc94..fea3a982ee5 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -25,6 +25,7 @@ void __pagevec_release_nonlru(struct pagevec *pvec);
 void __pagevec_free(struct pagevec *pvec);
 void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru);
 void pagevec_strip(struct pagevec *pvec);
+void pagevec_swap_free(struct pagevec *pvec);
 unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
 		pgoff_t start, unsigned nr_pages);
 unsigned pagevec_lookup_tag(struct pagevec *pvec,
diff --git a/include/linux/swap.h b/include/linux/swap.h
index fcc169610d0..833be56ad83 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -265,6 +265,7 @@ extern sector_t swapdev_block(int, pgoff_t);
 extern struct swap_info_struct *get_swap_info_struct(unsigned);
 extern int can_share_swap_page(struct page *);
 extern int remove_exclusive_swap_page(struct page *);
+extern int remove_exclusive_swap_page_ref(struct page *);
 struct backing_dev_info;
 
 /* linux/mm/thrash.c */
@@ -353,6 +354,11 @@ static inline int remove_exclusive_swap_page(struct page *p)
 	return 0;
 }
 
+static inline int remove_exclusive_swap_page_ref(struct page *page)
+{
+	return 0;
+}
+
 static inline swp_entry_t get_swap_page(void)
 {
 	swp_entry_t entry;
-- 
cgit v1.2.3


From b2e185384f534781fd22f5ce170b2ad26f97df70 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Sat, 18 Oct 2008 20:26:30 -0700
Subject: define page_file_cache() function

Define page_file_cache() function to answer the question:
	is page backed by a file?

Originally part of Rik van Riel's split-lru patch.  Extracted to make
available for other, independent reclaim patches.

Moved inline function to linux/mm_inline.h where it will be needed by
subsequent "split LRU" and "noreclaim" patches.

Unfortunately this needs to use a page flag, since the PG_swapbacked state
needs to be preserved all the way to the point where the page is last
removed from the LRU.  Trying to derive the status from other info in the
page resulted in wrong VM statistics in earlier split VM patchsets.

The total number of page flags in use on a 32 bit machine after this patch
is 19.

[akpm@linux-foundation.org: fix up out-of-order merge fallout]
[hugh@veritas.com: splitlru: shmem_getpage SetPageSwapBacked sooner]
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: MinChan Kim <minchan.kim@gmail.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_inline.h  | 27 +++++++++++++++++++++++++++
 include/linux/page-flags.h |  8 ++++++--
 2 files changed, 33 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 2704729777e..96e970485b6 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -1,3 +1,28 @@
+#ifndef LINUX_MM_INLINE_H
+#define LINUX_MM_INLINE_H
+
+/**
+ * page_is_file_cache - should the page be on a file LRU or anon LRU?
+ * @page: the page to test
+ *
+ * Returns !0 if @page is page cache page backed by a regular filesystem,
+ * or 0 if @page is anonymous, tmpfs or otherwise ram or swap backed.
+ * Used by functions that manipulate the LRU lists, to sort a page
+ * onto the right LRU list.
+ *
+ * We would like to get this info without a page flag, but the state
+ * needs to survive until the page is last deleted from the LRU, which
+ * could be as far down as __page_cache_release.
+ */
+static inline int page_is_file_cache(struct page *page)
+{
+	if (PageSwapBacked(page))
+		return 0;
+
+	/* The page is page cache backed by a normal filesystem. */
+	return 1;
+}
+
 static inline void
 add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l)
 {
@@ -65,3 +90,5 @@ static inline enum lru_list page_lru(struct page *page)
 
 	return lru;
 }
+
+#endif
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index c74d3e87531..57b688cfb5e 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -93,6 +93,7 @@ enum pageflags {
 	PG_mappedtodisk,	/* Has blocks allocated on-disk */
 	PG_reclaim,		/* To be reclaimed asap */
 	PG_buddy,		/* Page is free, on buddy lists */
+	PG_swapbacked,		/* Page is backed by RAM/swap */
 #ifdef CONFIG_IA64_UNCACHED_ALLOCATOR
 	PG_uncached,		/* Page has been mapped as uncached */
 #endif
@@ -176,6 +177,7 @@ PAGEFLAG(SavePinned, savepinned);			/* Xen */
 PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved)
 PAGEFLAG(Private, private) __CLEARPAGEFLAG(Private, private)
 	__SETPAGEFLAG(Private, private)
+PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked)
 
 __PAGEFLAG(SlobPage, slob_page)
 __PAGEFLAG(SlobFree, slob_free)
@@ -334,7 +336,8 @@ static inline void __ClearPageTail(struct page *page)
  * Flags checked in bad_page().  Pages on the free list should not have
  * these flags set.  It they are, there is a problem.
  */
-#define PAGE_FLAGS_CLEAR_WHEN_BAD (PAGE_FLAGS | 1 << PG_reclaim | 1 << PG_dirty)
+#define PAGE_FLAGS_CLEAR_WHEN_BAD (PAGE_FLAGS | \
+		1 << PG_reclaim | 1 << PG_dirty | 1 << PG_swapbacked)
 
 /*
  * Flags checked when a page is freed.  Pages being freed should not have
@@ -347,7 +350,8 @@ static inline void __ClearPageTail(struct page *page)
  * Pages being prepped should not have these flags set.  It they are, there
  * is a problem.
  */
-#define PAGE_FLAGS_CHECK_AT_PREP (PAGE_FLAGS | 1 << PG_reserved | 1 << PG_dirty)
+#define PAGE_FLAGS_CHECK_AT_PREP (PAGE_FLAGS | \
+		1 << PG_reserved | 1 << PG_dirty | 1 << PG_swapbacked)
 
 #endif /* !__GENERATING_BOUNDS_H */
 #endif	/* PAGE_FLAGS_H */
-- 
cgit v1.2.3


From 4f98a2fee8acdb4ac84545df98cccecfd130f8db Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Sat, 18 Oct 2008 20:26:32 -0700
Subject: vmscan: split LRU lists into anon & file sets

Split the LRU lists in two, one set for pages that are backed by real file
systems ("file") and one for pages that are backed by memory and swap
("anon").  The latter includes tmpfs.

The advantage of doing this is that the VM will not have to scan over lots
of anonymous pages (which we generally do not want to swap out), just to
find the page cache pages that it should evict.

This patch has the infrastructure and a basic policy to balance how much
we scan the anon lists and how much we scan the file lists.  The big
policy changes are in separate patches.

[lee.schermerhorn@hp.com: collect lru meminfo statistics from correct offset]
[kosaki.motohiro@jp.fujitsu.com: prevent incorrect oom under split_lru]
[kosaki.motohiro@jp.fujitsu.com: fix pagevec_move_tail() doesn't treat unevictable page]
[hugh@veritas.com: memcg swapbacked pages active]
[hugh@veritas.com: splitlru: BDI_CAP_SWAP_BACKED]
[akpm@linux-foundation.org: fix /proc/vmstat units]
[nishimura@mxp.nes.nec.co.jp: memcg: fix handling of shmem migration]
[kosaki.motohiro@jp.fujitsu.com: adjust Quicklists field of /proc/meminfo]
[kosaki.motohiro@jp.fujitsu.com: fix style issue of get_scan_ratio()]
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/backing-dev.h | 13 ++++++++++++
 include/linux/memcontrol.h  |  2 +-
 include/linux/mm_inline.h   | 50 ++++++++++++++++++++++++++++++++++-----------
 include/linux/mmzone.h      | 47 +++++++++++++++++++++++++++++++++++-------
 include/linux/pagevec.h     | 29 ++++++++++++++++++++------
 include/linux/swap.h        | 20 +++++++++++++-----
 include/linux/vmstat.h      | 10 +++++++++
 7 files changed, 140 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 0a24d5550eb..bee52abb8a4 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -175,6 +175,8 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
  * BDI_CAP_READ_MAP:       Can be mapped for reading
  * BDI_CAP_WRITE_MAP:      Can be mapped for writing
  * BDI_CAP_EXEC_MAP:       Can be mapped for execution
+ *
+ * BDI_CAP_SWAP_BACKED:    Count shmem/tmpfs objects as swap-backed.
  */
 #define BDI_CAP_NO_ACCT_DIRTY	0x00000001
 #define BDI_CAP_NO_WRITEBACK	0x00000002
@@ -184,6 +186,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
 #define BDI_CAP_WRITE_MAP	0x00000020
 #define BDI_CAP_EXEC_MAP	0x00000040
 #define BDI_CAP_NO_ACCT_WB	0x00000080
+#define BDI_CAP_SWAP_BACKED	0x00000100
 
 #define BDI_CAP_VMFLAGS \
 	(BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP)
@@ -248,6 +251,11 @@ static inline bool bdi_cap_account_writeback(struct backing_dev_info *bdi)
 				      BDI_CAP_NO_WRITEBACK));
 }
 
+static inline bool bdi_cap_swap_backed(struct backing_dev_info *bdi)
+{
+	return bdi->capabilities & BDI_CAP_SWAP_BACKED;
+}
+
 static inline bool mapping_cap_writeback_dirty(struct address_space *mapping)
 {
 	return bdi_cap_writeback_dirty(mapping->backing_dev_info);
@@ -258,4 +266,9 @@ static inline bool mapping_cap_account_dirty(struct address_space *mapping)
 	return bdi_cap_account_dirty(mapping->backing_dev_info);
 }
 
+static inline bool mapping_cap_swap_backed(struct address_space *mapping)
+{
+	return bdi_cap_swap_backed(mapping->backing_dev_info);
+}
+
 #endif		/* _LINUX_BACKING_DEV_H */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index a6ac0d491fe..8d8f05c1515 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -44,7 +44,7 @@ extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 					unsigned long *scanned, int order,
 					int mode, struct zone *z,
 					struct mem_cgroup *mem_cont,
-					int active);
+					int active, int file);
 extern void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask);
 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem);
 
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 96e970485b6..2eb599465d5 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -5,7 +5,7 @@
  * page_is_file_cache - should the page be on a file LRU or anon LRU?
  * @page: the page to test
  *
- * Returns !0 if @page is page cache page backed by a regular filesystem,
+ * Returns LRU_FILE if @page is page cache page backed by a regular filesystem,
  * or 0 if @page is anonymous, tmpfs or otherwise ram or swap backed.
  * Used by functions that manipulate the LRU lists, to sort a page
  * onto the right LRU list.
@@ -20,7 +20,7 @@ static inline int page_is_file_cache(struct page *page)
 		return 0;
 
 	/* The page is page cache backed by a normal filesystem. */
-	return 1;
+	return LRU_FILE;
 }
 
 static inline void
@@ -38,39 +38,64 @@ del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l)
 }
 
 static inline void
-add_page_to_active_list(struct zone *zone, struct page *page)
+add_page_to_inactive_anon_list(struct zone *zone, struct page *page)
 {
-	add_page_to_lru_list(zone, page, LRU_ACTIVE);
+	add_page_to_lru_list(zone, page, LRU_INACTIVE_ANON);
 }
 
 static inline void
-add_page_to_inactive_list(struct zone *zone, struct page *page)
+add_page_to_active_anon_list(struct zone *zone, struct page *page)
 {
-	add_page_to_lru_list(zone, page, LRU_INACTIVE);
+	add_page_to_lru_list(zone, page, LRU_ACTIVE_ANON);
 }
 
 static inline void
-del_page_from_active_list(struct zone *zone, struct page *page)
+add_page_to_inactive_file_list(struct zone *zone, struct page *page)
 {
-	del_page_from_lru_list(zone, page, LRU_ACTIVE);
+	add_page_to_lru_list(zone, page, LRU_INACTIVE_FILE);
 }
 
 static inline void
-del_page_from_inactive_list(struct zone *zone, struct page *page)
+add_page_to_active_file_list(struct zone *zone, struct page *page)
 {
-	del_page_from_lru_list(zone, page, LRU_INACTIVE);
+	add_page_to_lru_list(zone, page, LRU_ACTIVE_FILE);
+}
+
+static inline void
+del_page_from_inactive_anon_list(struct zone *zone, struct page *page)
+{
+	del_page_from_lru_list(zone, page, LRU_INACTIVE_ANON);
+}
+
+static inline void
+del_page_from_active_anon_list(struct zone *zone, struct page *page)
+{
+	del_page_from_lru_list(zone, page, LRU_ACTIVE_ANON);
+}
+
+static inline void
+del_page_from_inactive_file_list(struct zone *zone, struct page *page)
+{
+	del_page_from_lru_list(zone, page, LRU_INACTIVE_FILE);
+}
+
+static inline void
+del_page_from_active_file_list(struct zone *zone, struct page *page)
+{
+	del_page_from_lru_list(zone, page, LRU_ACTIVE_FILE);
 }
 
 static inline void
 del_page_from_lru(struct zone *zone, struct page *page)
 {
-	enum lru_list l = LRU_INACTIVE;
+	enum lru_list l = LRU_BASE;
 
 	list_del(&page->lru);
 	if (PageActive(page)) {
 		__ClearPageActive(page);
-		l = LRU_ACTIVE;
+		l += LRU_ACTIVE;
 	}
+	l += page_is_file_cache(page);
 	__dec_zone_state(zone, NR_LRU_BASE + l);
 }
 
@@ -87,6 +112,7 @@ static inline enum lru_list page_lru(struct page *page)
 
 	if (PageActive(page))
 		lru += LRU_ACTIVE;
+	lru += page_is_file_cache(page);
 
 	return lru;
 }
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 156e18f3919..59a4c8fd6eb 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -82,21 +82,23 @@ enum zone_stat_item {
 	/* First 128 byte cacheline (assuming 64 bit words) */
 	NR_FREE_PAGES,
 	NR_LRU_BASE,
-	NR_INACTIVE = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */
-	NR_ACTIVE,	/*  "     "     "   "       "         */
+	NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */
+	NR_ACTIVE_ANON,		/*  "     "     "   "       "         */
+	NR_INACTIVE_FILE,	/*  "     "     "   "       "         */
+	NR_ACTIVE_FILE,		/*  "     "     "   "       "         */
 	NR_ANON_PAGES,	/* Mapped anonymous pages */
 	NR_FILE_MAPPED,	/* pagecache pages mapped into pagetables.
 			   only modified from process context */
 	NR_FILE_PAGES,
 	NR_FILE_DIRTY,
 	NR_WRITEBACK,
-	/* Second 128 byte cacheline */
 	NR_SLAB_RECLAIMABLE,
 	NR_SLAB_UNRECLAIMABLE,
 	NR_PAGETABLE,		/* used for pagetables */
 	NR_UNSTABLE_NFS,	/* NFS unstable pages */
 	NR_BOUNCE,
 	NR_VMSCAN_WRITE,
+	/* Second 128 byte cacheline */
 	NR_WRITEBACK_TEMP,	/* Writeback using temporary buffers */
 #ifdef CONFIG_NUMA
 	NUMA_HIT,		/* allocated in intended node */
@@ -108,17 +110,36 @@ enum zone_stat_item {
 #endif
 	NR_VM_ZONE_STAT_ITEMS };
 
+/*
+ * We do arithmetic on the LRU lists in various places in the code,
+ * so it is important to keep the active lists LRU_ACTIVE higher in
+ * the array than the corresponding inactive lists, and to keep
+ * the *_FILE lists LRU_FILE higher than the corresponding _ANON lists.
+ *
+ * This has to be kept in sync with the statistics in zone_stat_item
+ * above and the descriptions in vmstat_text in mm/vmstat.c
+ */
+#define LRU_BASE 0
+#define LRU_ACTIVE 1
+#define LRU_FILE 2
+
 enum lru_list {
-	LRU_BASE,
-	LRU_INACTIVE=LRU_BASE,	/* must match order of NR_[IN]ACTIVE */
-	LRU_ACTIVE,		/*  "     "     "   "       "        */
+	LRU_INACTIVE_ANON = LRU_BASE,
+	LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE,
+	LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
+	LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE,
 	NR_LRU_LISTS };
 
 #define for_each_lru(l) for (l = 0; l < NR_LRU_LISTS; l++)
 
+static inline int is_file_lru(enum lru_list l)
+{
+	return (l == LRU_INACTIVE_FILE || l == LRU_ACTIVE_FILE);
+}
+
 static inline int is_active_lru(enum lru_list l)
 {
-	return (l == LRU_ACTIVE);
+	return (l == LRU_ACTIVE_ANON || l == LRU_ACTIVE_FILE);
 }
 
 struct per_cpu_pages {
@@ -269,6 +290,18 @@ struct zone {
 		struct list_head list;
 		unsigned long nr_scan;
 	} lru[NR_LRU_LISTS];
+
+	/*
+	 * The pageout code in vmscan.c keeps track of how many of the
+	 * mem/swap backed and file backed pages are referenced.
+	 * The higher the rotated/scanned ratio, the more valuable
+	 * that cache is.
+	 *
+	 * The anon LRU stats live in [0], file LRU stats in [1]
+	 */
+	unsigned long		recent_rotated[2];
+	unsigned long		recent_scanned[2];
+
 	unsigned long		pages_scanned;	   /* since last reclaim */
 	unsigned long		flags;		   /* zone flags, see below */
 
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index fea3a982ee5..5fc96a4e760 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -81,20 +81,37 @@ static inline void pagevec_free(struct pagevec *pvec)
 		__pagevec_free(pvec);
 }
 
-static inline void __pagevec_lru_add(struct pagevec *pvec)
+static inline void __pagevec_lru_add_anon(struct pagevec *pvec)
 {
-	____pagevec_lru_add(pvec, LRU_INACTIVE);
+	____pagevec_lru_add(pvec, LRU_INACTIVE_ANON);
 }
 
-static inline void __pagevec_lru_add_active(struct pagevec *pvec)
+static inline void __pagevec_lru_add_active_anon(struct pagevec *pvec)
 {
-	____pagevec_lru_add(pvec, LRU_ACTIVE);
+	____pagevec_lru_add(pvec, LRU_ACTIVE_ANON);
 }
 
-static inline void pagevec_lru_add(struct pagevec *pvec)
+static inline void __pagevec_lru_add_file(struct pagevec *pvec)
+{
+	____pagevec_lru_add(pvec, LRU_INACTIVE_FILE);
+}
+
+static inline void __pagevec_lru_add_active_file(struct pagevec *pvec)
+{
+	____pagevec_lru_add(pvec, LRU_ACTIVE_FILE);
+}
+
+
+static inline void pagevec_lru_add_file(struct pagevec *pvec)
+{
+	if (pagevec_count(pvec))
+		__pagevec_lru_add_file(pvec);
+}
+
+static inline void pagevec_lru_add_anon(struct pagevec *pvec)
 {
 	if (pagevec_count(pvec))
-		__pagevec_lru_add(pvec);
+		__pagevec_lru_add_anon(pvec);
 }
 
 #endif /* _LINUX_PAGEVEC_H */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 833be56ad83..7d09d79997a 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -184,14 +184,24 @@ extern void swap_setup(void);
  * lru_cache_add: add a page to the page lists
  * @page: the page to add
  */
-static inline void lru_cache_add(struct page *page)
+static inline void lru_cache_add_anon(struct page *page)
 {
-	__lru_cache_add(page, LRU_INACTIVE);
+	__lru_cache_add(page, LRU_INACTIVE_ANON);
 }
 
-static inline void lru_cache_add_active(struct page *page)
+static inline void lru_cache_add_active_anon(struct page *page)
 {
-	__lru_cache_add(page, LRU_ACTIVE);
+	__lru_cache_add(page, LRU_ACTIVE_ANON);
+}
+
+static inline void lru_cache_add_file(struct page *page)
+{
+	__lru_cache_add(page, LRU_INACTIVE_FILE);
+}
+
+static inline void lru_cache_add_active_file(struct page *page)
+{
+	__lru_cache_add(page, LRU_ACTIVE_FILE);
 }
 
 /* linux/mm/vmscan.c */
@@ -199,7 +209,7 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 					gfp_t gfp_mask);
 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
 							gfp_t gfp_mask);
-extern int __isolate_lru_page(struct page *page, int mode);
+extern int __isolate_lru_page(struct page *page, int mode, int file);
 extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
 extern int remove_mapping(struct address_space *mapping, struct page *page);
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 58334d43951..ff5179f2b15 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -159,6 +159,16 @@ static inline unsigned long zone_page_state(struct zone *zone,
 	return x;
 }
 
+extern unsigned long global_lru_pages(void);
+
+static inline unsigned long zone_lru_pages(struct zone *zone)
+{
+	return (zone_page_state(zone, NR_ACTIVE_ANON)
+		+ zone_page_state(zone, NR_ACTIVE_FILE)
+		+ zone_page_state(zone, NR_INACTIVE_ANON)
+		+ zone_page_state(zone, NR_INACTIVE_FILE));
+}
+
 #ifdef CONFIG_NUMA
 /*
  * Determine the per node value of a stat item. This function
-- 
cgit v1.2.3


From 556adecba110bf5f1db6c6b56416cfab5bcab698 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Sat, 18 Oct 2008 20:26:34 -0700
Subject: vmscan: second chance replacement for anonymous pages

We avoid evicting and scanning anonymous pages for the most part, but
under some workloads we can end up with most of memory filled with
anonymous pages.  At that point, we suddenly need to clear the referenced
bits on all of memory, which can take ages on very large memory systems.

We can reduce the maximum number of pages that need to be scanned by not
taking the referenced state into account when deactivating an anonymous
page.  After all, every anonymous page starts out referenced, so why
check?

If an anonymous page gets referenced again before it reaches the end of
the inactive list, we move it back to the active list.

To keep the maximum amount of necessary work reasonable, we scale the
active to inactive ratio with the size of memory, using the formula
active:inactive ratio = sqrt(memory in GB * 10).

Kswapd CPU use now seems to scale by the amount of pageout bandwidth,
instead of by the amount of memory present in the system.

[kamezawa.hiroyu@jp.fujitsu.com: fix OOM with memcg]
[kamezawa.hiroyu@jp.fujitsu.com: memcg: lru scan fix]
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_inline.h | 19 +++++++++++++++++++
 include/linux/mmzone.h    |  6 ++++++
 2 files changed, 25 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 2eb599465d5..f451fedd1e7 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -117,4 +117,23 @@ static inline enum lru_list page_lru(struct page *page)
 	return lru;
 }
 
+/**
+ * inactive_anon_is_low - check if anonymous pages need to be deactivated
+ * @zone: zone to check
+ *
+ * Returns true if the zone does not have enough inactive anon pages,
+ * meaning some active anon pages need to be deactivated.
+ */
+static inline int inactive_anon_is_low(struct zone *zone)
+{
+	unsigned long active, inactive;
+
+	active = zone_page_state(zone, NR_ACTIVE_ANON);
+	inactive = zone_page_state(zone, NR_INACTIVE_ANON);
+
+	if (inactive * zone->inactive_ratio < active)
+		return 1;
+
+	return 0;
+}
 #endif
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 59a4c8fd6eb..9c5111f49a3 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -323,6 +323,12 @@ struct zone {
 	 */
 	int prev_priority;
 
+	/*
+	 * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
+	 * this zone's LRU.  Maintained by the pageout code.
+	 */
+	unsigned int inactive_ratio;
+
 
 	ZONE_PADDING(_pad2_)
 	/* Rarely used or read-mostly fields */
-- 
cgit v1.2.3


From 8a7a8544a4f6554ec2d8048ac9f9672f442db5a2 Mon Sep 17 00:00:00 2001
From: Lee Schermerhorn <lee.schermerhorn@hp.com>
Date: Sat, 18 Oct 2008 20:26:37 -0700
Subject: pageflag helpers for configed-out flags

Define proper false/noop inline functions for noreclaim page flags when
!defined(CONFIG_UNEVICTABLE_LRU)

Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page-flags.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 57b688cfb5e..3d31616dcd2 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -162,6 +162,18 @@ static inline int Page##uname(struct page *page) 			\
 #define TESTSCFLAG(uname, lname)					\
 	TESTSETFLAG(uname, lname) TESTCLEARFLAG(uname, lname)
 
+#define SETPAGEFLAG_NOOP(uname)						\
+static inline void SetPage##uname(struct page *page) {  }
+
+#define CLEARPAGEFLAG_NOOP(uname)					\
+static inline void ClearPage##uname(struct page *page) {  }
+
+#define __CLEARPAGEFLAG_NOOP(uname)					\
+static inline void __ClearPage##uname(struct page *page) {  }
+
+#define TESTCLEARFLAG_FALSE(uname)					\
+static inline int TestClearPage##uname(struct page *page) { return 0; }
+
 struct page;	/* forward declaration */
 
 TESTPAGEFLAG(Locked, locked)
-- 
cgit v1.2.3


From 894bc310419ac95f4fa4142dc364401a7e607f65 Mon Sep 17 00:00:00 2001
From: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Date: Sat, 18 Oct 2008 20:26:39 -0700
Subject: Unevictable LRU Infrastructure

When the system contains lots of mlocked or otherwise unevictable pages,
the pageout code (kswapd) can spend lots of time scanning over these
pages.  Worse still, the presence of lots of unevictable pages can confuse
kswapd into thinking that more aggressive pageout modes are required,
resulting in all kinds of bad behaviour.

Infrastructure to manage pages excluded from reclaim--i.e., hidden from
vmscan.  Based on a patch by Larry Woodman of Red Hat.  Reworked to
maintain "unevictable" pages on a separate per-zone LRU list, to "hide"
them from vmscan.

Kosaki Motohiro added the support for the memory controller unevictable
lru list.

Pages on the unevictable list have both PG_unevictable and PG_lru set.
Thus, PG_unevictable is analogous to and mutually exclusive with
PG_active--it specifies which LRU list the page is on.

The unevictable infrastructure is enabled by a new mm Kconfig option
[CONFIG_]UNEVICTABLE_LRU.

A new function 'page_evictable(page, vma)' in vmscan.c tests whether or
not a page may be evictable.  Subsequent patches will add the various
!evictable tests.  We'll want to keep these tests light-weight for use in
shrink_active_list() and, possibly, the fault path.

To avoid races between tasks putting pages [back] onto an LRU list and
tasks that might be moving the page from non-evictable to evictable state,
the new function 'putback_lru_page()' -- inverse to 'isolate_lru_page()'
-- tests the "evictability" of a page after placing it on the LRU, before
dropping the reference.  If the page has become unevictable,
putback_lru_page() will redo the 'putback', thus moving the page to the
unevictable list.  This way, we avoid "stranding" evictable pages on the
unevictable list.

[akpm@linux-foundation.org: fix fallout from out-of-order merge]
[riel@redhat.com: fix UNEVICTABLE_LRU and !PROC_PAGE_MONITOR build]
[nishimura@mxp.nes.nec.co.jp: remove redundant mapping check]
[kosaki.motohiro@jp.fujitsu.com: unevictable-lru-infrastructure: putback_lru_page()/unevictable page handling rework]
[kosaki.motohiro@jp.fujitsu.com: kill unnecessary lock_page() in vmscan.c]
[kosaki.motohiro@jp.fujitsu.com: revert migration change of unevictable lru infrastructure]
[kosaki.motohiro@jp.fujitsu.com: revert to unevictable-lru-infrastructure-kconfig-fix.patch]
[kosaki.motohiro@jp.fujitsu.com: restore patch failure of vmstat-unevictable-and-mlocked-pages-vm-events.patch]
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Debugged-by: Benjamin Kidwell <benjkidwell@yahoo.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h |  2 +-
 include/linux/mm_inline.h  | 23 ++++++++++++++++-------
 include/linux/mmzone.h     | 24 +++++++++++++++++++++++-
 include/linux/page-flags.h | 22 +++++++++++++++++++++-
 include/linux/pagevec.h    |  1 -
 include/linux/swap.h       | 12 ++++++++++++
 6 files changed, 73 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 8d8f05c1515..ee1b2fcb441 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -34,9 +34,9 @@ extern int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
 				gfp_t gfp_mask);
 extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 					gfp_t gfp_mask);
+extern void mem_cgroup_move_lists(struct page *page, enum lru_list lru);
 extern void mem_cgroup_uncharge_page(struct page *page);
 extern void mem_cgroup_uncharge_cache_page(struct page *page);
-extern void mem_cgroup_move_lists(struct page *page, bool active);
 extern int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask);
 
 extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index f451fedd1e7..67d7697fd01 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -91,11 +91,16 @@ del_page_from_lru(struct zone *zone, struct page *page)
 	enum lru_list l = LRU_BASE;
 
 	list_del(&page->lru);
-	if (PageActive(page)) {
-		__ClearPageActive(page);
-		l += LRU_ACTIVE;
+	if (PageUnevictable(page)) {
+		__ClearPageUnevictable(page);
+		l = LRU_UNEVICTABLE;
+	} else {
+		if (PageActive(page)) {
+			__ClearPageActive(page);
+			l += LRU_ACTIVE;
+		}
+		l += page_is_file_cache(page);
 	}
-	l += page_is_file_cache(page);
 	__dec_zone_state(zone, NR_LRU_BASE + l);
 }
 
@@ -110,9 +115,13 @@ static inline enum lru_list page_lru(struct page *page)
 {
 	enum lru_list lru = LRU_BASE;
 
-	if (PageActive(page))
-		lru += LRU_ACTIVE;
-	lru += page_is_file_cache(page);
+	if (PageUnevictable(page))
+		lru = LRU_UNEVICTABLE;
+	else {
+		if (PageActive(page))
+			lru += LRU_ACTIVE;
+		lru += page_is_file_cache(page);
+	}
 
 	return lru;
 }
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9c5111f49a3..d1f60d5fe2e 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -86,6 +86,11 @@ enum zone_stat_item {
 	NR_ACTIVE_ANON,		/*  "     "     "   "       "         */
 	NR_INACTIVE_FILE,	/*  "     "     "   "       "         */
 	NR_ACTIVE_FILE,		/*  "     "     "   "       "         */
+#ifdef CONFIG_UNEVICTABLE_LRU
+	NR_UNEVICTABLE,		/*  "     "     "   "       "         */
+#else
+	NR_UNEVICTABLE = NR_ACTIVE_FILE, /* avoid compiler errors in dead code */
+#endif
 	NR_ANON_PAGES,	/* Mapped anonymous pages */
 	NR_FILE_MAPPED,	/* pagecache pages mapped into pagetables.
 			   only modified from process context */
@@ -128,10 +133,18 @@ enum lru_list {
 	LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE,
 	LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
 	LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE,
-	NR_LRU_LISTS };
+#ifdef CONFIG_UNEVICTABLE_LRU
+	LRU_UNEVICTABLE,
+#else
+	LRU_UNEVICTABLE = LRU_ACTIVE_FILE, /* avoid compiler errors in dead code */
+#endif
+	NR_LRU_LISTS
+};
 
 #define for_each_lru(l) for (l = 0; l < NR_LRU_LISTS; l++)
 
+#define for_each_evictable_lru(l) for (l = 0; l <= LRU_ACTIVE_FILE; l++)
+
 static inline int is_file_lru(enum lru_list l)
 {
 	return (l == LRU_INACTIVE_FILE || l == LRU_ACTIVE_FILE);
@@ -142,6 +155,15 @@ static inline int is_active_lru(enum lru_list l)
 	return (l == LRU_ACTIVE_ANON || l == LRU_ACTIVE_FILE);
 }
 
+static inline int is_unevictable_lru(enum lru_list l)
+{
+#ifdef CONFIG_UNEVICTABLE_LRU
+	return (l == LRU_UNEVICTABLE);
+#else
+	return 0;
+#endif
+}
+
 struct per_cpu_pages {
 	int count;		/* number of pages in the list */
 	int high;		/* high watermark, emptying needed */
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 3d31616dcd2..ec1a1baad34 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -94,6 +94,9 @@ enum pageflags {
 	PG_reclaim,		/* To be reclaimed asap */
 	PG_buddy,		/* Page is free, on buddy lists */
 	PG_swapbacked,		/* Page is backed by RAM/swap */
+#ifdef CONFIG_UNEVICTABLE_LRU
+	PG_unevictable,		/* Page is "unevictable"  */
+#endif
 #ifdef CONFIG_IA64_UNCACHED_ALLOCATOR
 	PG_uncached,		/* Page has been mapped as uncached */
 #endif
@@ -182,6 +185,7 @@ PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced)
 PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty)
 PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru)
 PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active)
+	TESTCLEARFLAG(Active, active)
 __PAGEFLAG(Slab, slab)
 PAGEFLAG(Checked, checked)		/* Used by some filesystems */
 PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned)	/* Xen */
@@ -225,6 +229,15 @@ PAGEFLAG(SwapCache, swapcache)
 PAGEFLAG_FALSE(SwapCache)
 #endif
 
+#ifdef CONFIG_UNEVICTABLE_LRU
+PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable)
+	TESTCLEARFLAG(Unevictable, unevictable)
+#else
+PAGEFLAG_FALSE(Unevictable) TESTCLEARFLAG_FALSE(Unevictable)
+	SETPAGEFLAG_NOOP(Unevictable) CLEARPAGEFLAG_NOOP(Unevictable)
+	__CLEARPAGEFLAG_NOOP(Unevictable)
+#endif
+
 #ifdef CONFIG_IA64_UNCACHED_ALLOCATOR
 PAGEFLAG(Uncached, uncached)
 #else
@@ -340,9 +353,16 @@ static inline void __ClearPageTail(struct page *page)
 
 #endif /* !PAGEFLAGS_EXTENDED */
 
+#ifdef CONFIG_UNEVICTABLE_LRU
+#define __PG_UNEVICTABLE (1 << PG_unevictable)
+#else
+#define __PG_UNEVICTABLE 0
+#endif
+
 #define PAGE_FLAGS	(1 << PG_lru   | 1 << PG_private   | 1 << PG_locked | \
 			 1 << PG_buddy | 1 << PG_writeback | \
-			 1 << PG_slab  | 1 << PG_swapcache | 1 << PG_active)
+			 1 << PG_slab  | 1 << PG_swapcache | 1 << PG_active | \
+			 __PG_UNEVICTABLE)
 
 /*
  * Flags checked in bad_page().  Pages on the free list should not have
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index 5fc96a4e760..e90a2cb0291 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -101,7 +101,6 @@ static inline void __pagevec_lru_add_active_file(struct pagevec *pvec)
 	____pagevec_lru_add(pvec, LRU_ACTIVE_FILE);
 }
 
-
 static inline void pagevec_lru_add_file(struct pagevec *pvec)
 {
 	if (pagevec_count(pvec))
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 7d09d79997a..a2113044d20 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -180,6 +180,8 @@ extern int lru_add_drain_all(void);
 extern void rotate_reclaimable_page(struct page *page);
 extern void swap_setup(void);
 
+extern void add_page_to_unevictable_list(struct page *page);
+
 /**
  * lru_cache_add: add a page to the page lists
  * @page: the page to add
@@ -228,6 +230,16 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
 }
 #endif
 
+#ifdef CONFIG_UNEVICTABLE_LRU
+extern int page_evictable(struct page *page, struct vm_area_struct *vma);
+#else
+static inline int page_evictable(struct page *page,
+						struct vm_area_struct *vma)
+{
+	return 1;
+}
+#endif
+
 extern int kswapd_run(int nid);
 
 #ifdef CONFIG_MMU
-- 
cgit v1.2.3


From bbfd28eee9fbd73e780b19beb3dc562befbb94fa Mon Sep 17 00:00:00 2001
From: Lee Schermerhorn <lee.schermerhorn@hp.com>
Date: Sat, 18 Oct 2008 20:26:40 -0700
Subject: unevictable lru: add event counting with statistics

Fix to unevictable-lru-page-statistics.patch

Add unevictable lru infrastructure vm events to the statistics patch.
Rename the "NORECL_" and "noreclaim_" symbols and text strings to
"UNEVICTABLE_" and "unevictable_", respectively.

Currently, both the infrastructure and the mlocked pages event are
added by a single patch later in the series.  This makes it difficult
to add or rework the incremental patches.  The events actually "belong"
with the stats, so pull them up to here.

Also, restore the event counting to putback_lru_page().  This was removed
from previous patch in series where it was "misplaced".  The actual events
weren't defined that early.

Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Rik van Riel <riel@redhat.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/vmstat.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index ff5179f2b15..135840cd7fe 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -40,6 +40,11 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		PAGEOUTRUN, ALLOCSTALL, PGROTATED,
 #ifdef CONFIG_HUGETLB_PAGE
 		HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL,
+#endif
+#ifdef CONFIG_UNEVICTABLE_LRU
+		UNEVICTABLE_PGCULLED,	/* culled to unevictable list */
+		UNEVICTABLE_PGSCANNED,	/* scanned for reclaimability */
+		UNEVICTABLE_PGRESCUED,	/* rescued from unevictable list */
 #endif
 		NR_VM_EVENT_ITEMS
 };
-- 
cgit v1.2.3


From ba9ddf49391645e6bb93219131a40446538a5e76 Mon Sep 17 00:00:00 2001
From: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Date: Sat, 18 Oct 2008 20:26:42 -0700
Subject: Ramfs and Ram Disk pages are unevictable

Christoph Lameter pointed out that ram disk pages also clutter the LRU
lists.  When vmscan finds them dirty and tries to clean them, the ram disk
writeback function just redirties the page so that it goes back onto the
active list.  Round and round she goes...

With the ram disk driver [rd.c] replaced by the newer 'brd.c', this is no
longer the case, as ram disk pages are no longer maintained on the lru.
[This makes them unmigratable for defrag or memory hot remove, but that
can be addressed by a separate patch series.] However, the ramfs pages
behave like ram disk pages used to, so:

Define new address_space flag [shares address_space flags member with
mapping's gfp mask] to indicate that the address space contains all
unevictable pages.  This will provide for efficient testing of ramfs pages
in page_evictable().

Also provide wrapper functions to set/test the unevictable state to
minimize #ifdefs in ramfs driver and any other users of this facility.

Set the unevictable state on address_space structures for new ramfs
inodes.  Test the unevictable state in page_evictable() to cull
unevictable pages.

These changes depend on [CONFIG_]UNEVICTABLE_LRU.

[riel@redhat.com: undo the brd.c part]
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Debugged-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagemap.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 5da31c12101..09164d2c5c2 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -32,6 +32,28 @@ static inline void mapping_set_error(struct address_space *mapping, int error)
 	}
 }
 
+#ifdef CONFIG_UNEVICTABLE_LRU
+#define AS_UNEVICTABLE	(__GFP_BITS_SHIFT + 2)	/* e.g., ramdisk, SHM_LOCK */
+
+static inline void mapping_set_unevictable(struct address_space *mapping)
+{
+	set_bit(AS_UNEVICTABLE, &mapping->flags);
+}
+
+static inline int mapping_unevictable(struct address_space *mapping)
+{
+	if (mapping && (mapping->flags & AS_UNEVICTABLE))
+		return 1;
+	return 0;
+}
+#else
+static inline void mapping_set_unevictable(struct address_space *mapping) { }
+static inline int mapping_unevictable(struct address_space *mapping)
+{
+	return 0;
+}
+#endif
+
 static inline gfp_t mapping_gfp_mask(struct address_space * mapping)
 {
 	return (__force gfp_t)mapping->flags & __GFP_BITS_MASK;
-- 
cgit v1.2.3


From 89e004ea55abe201b29e2d6e35124101f1288ef7 Mon Sep 17 00:00:00 2001
From: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Date: Sat, 18 Oct 2008 20:26:43 -0700
Subject: SHM_LOCKED pages are unevictable

Shmem segments locked into memory via shmctl(SHM_LOCK) should not be
kept on the normal LRU, since scanning them is a waste of time and might
throw off kswapd's balancing algorithms.  Place them on the unevictable
LRU list instead.

Use the AS_UNEVICTABLE flag to mark address_space of SHM_LOCKed shared
memory regions as unevictable.  Then these pages will be culled off the
normal LRU lists during vmscan.

Add new wrapper function to clear the mapping's unevictable state when/if
shared memory segment is munlocked.

Add 'scan_mapping_unevictable_pages()' to mm/vmscan.c to scan all pages in
the shmem segment's mapping [struct address_space] for evictability now
that they're no longer locked.  If so, move them to the appropriate zone
lru list.

Changes depend on [CONFIG_]UNEVICTABLE_LRU.

[kosaki.motohiro@jp.fujitsu.com: revert shm change]
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Kosaki Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h      |  4 ++--
 include/linux/pagemap.h | 12 +++++++++---
 include/linux/swap.h    |  4 ++++
 3 files changed, 15 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c61ba10768e..40236290e2a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -700,10 +700,10 @@ static inline int page_mapped(struct page *page)
 extern void show_free_areas(void);
 
 #ifdef CONFIG_SHMEM
-int shmem_lock(struct file *file, int lock, struct user_struct *user);
+extern int shmem_lock(struct file *file, int lock, struct user_struct *user);
 #else
 static inline int shmem_lock(struct file *file, int lock,
-			     struct user_struct *user)
+			    struct user_struct *user)
 {
 	return 0;
 }
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 09164d2c5c2..4b6c4d8d26b 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -40,14 +40,20 @@ static inline void mapping_set_unevictable(struct address_space *mapping)
 	set_bit(AS_UNEVICTABLE, &mapping->flags);
 }
 
+static inline void mapping_clear_unevictable(struct address_space *mapping)
+{
+	clear_bit(AS_UNEVICTABLE, &mapping->flags);
+}
+
 static inline int mapping_unevictable(struct address_space *mapping)
 {
-	if (mapping && (mapping->flags & AS_UNEVICTABLE))
-		return 1;
-	return 0;
+	if (likely(mapping))
+		return test_bit(AS_UNEVICTABLE, &mapping->flags);
+	return !!mapping;
 }
 #else
 static inline void mapping_set_unevictable(struct address_space *mapping) { }
+static inline void mapping_clear_unevictable(struct address_space *mapping) { }
 static inline int mapping_unevictable(struct address_space *mapping)
 {
 	return 0;
diff --git a/include/linux/swap.h b/include/linux/swap.h
index a2113044d20..7edb4cbc29f 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -232,12 +232,16 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
 
 #ifdef CONFIG_UNEVICTABLE_LRU
 extern int page_evictable(struct page *page, struct vm_area_struct *vma);
+extern void scan_mapping_unevictable_pages(struct address_space *);
 #else
 static inline int page_evictable(struct page *page,
 						struct vm_area_struct *vma)
 {
 	return 1;
 }
+static inline void scan_mapping_unevictable_pages(struct address_space *mapping)
+{
+}
 #endif
 
 extern int kswapd_run(int nid);
-- 
cgit v1.2.3


From b291f000393f5a0b679012b39d79fbc85c018233 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Sat, 18 Oct 2008 20:26:44 -0700
Subject: mlock: mlocked pages are unevictable

Make sure that mlocked pages also live on the unevictable LRU, so kswapd
will not scan them over and over again.

This is achieved through various strategies:

1) add yet another page flag--PG_mlocked--to indicate that
   the page is locked for efficient testing in vmscan and,
   optionally, fault path.  This allows early culling of
   unevictable pages, preventing them from getting to
   page_referenced()/try_to_unmap().  Also allows separate
   accounting of mlock'd pages, as Nick's original patch
   did.

   Note:  Nick's original mlock patch used a PG_mlocked
   flag.  I had removed this in favor of the PG_unevictable
   flag + an mlock_count [new page struct member].  I
   restored the PG_mlocked flag to eliminate the new
   count field.

2) add the mlock/unevictable infrastructure to mm/mlock.c,
   with internal APIs in mm/internal.h.  This is a rework
   of Nick's original patch to these files, taking into
   account that mlocked pages are now kept on unevictable
   LRU list.

3) update vmscan.c:page_evictable() to check PageMlocked()
   and, if vma passed in, the vm_flags.  Note that the vma
   will only be passed in for new pages in the fault path;
   and then only if the "cull unevictable pages in fault
   path" patch is included.

4) add try_to_munlock() to rmap.c to walk a page's rmap and
   ClearPageMlocked() if no other vmas have it mlocked.
   Reuses as much of try_to_unmap() as possible.  This
   effectively replaces the use of one of the lru list links
   as an mlock count.  If this mechanism lets pages in mlocked
   vmas leak through w/o PG_mlocked set [I don't know that it
   does], we should catch them later in try_to_unmap().  One
   hopes this will be rare, as it will be relatively expensive.

Original mm/internal.h, mm/rmap.c and mm/mlock.c changes:
Signed-off-by: Nick Piggin <npiggin@suse.de>

splitlru: introduce __get_user_pages():

  New munlock processing needs GUP_FLAGS_IGNORE_VMA_PERMISSIONS,
  because the current get_user_pages() can't grab PROT_NONE pages and
  therefore PROT_NONE pages can't be munlocked.

[akpm@linux-foundation.org: fix this for pagemap-pass-mm-into-pagewalkers.patch]
[akpm@linux-foundation.org: untangle patch interdependencies]
[akpm@linux-foundation.org: fix things after out-of-order merging]
[hugh@veritas.com: fix page-flags mess]
[lee.schermerhorn@hp.com: fix munlock page table walk - now requires 'mm']
[kosaki.motohiro@jp.fujitsu.com: build fix]
[kosaki.motohiro@jp.fujitsu.com: fix truncate race and several comments]
[kosaki.motohiro@jp.fujitsu.com: splitlru: introduce __get_user_pages()]
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h         |  5 +++++
 include/linux/page-flags.h | 19 ++++++++++++++++---
 include/linux/rmap.h       | 14 ++++++++++++++
 3 files changed, 35 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 40236290e2a..ffee2f74341 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -131,6 +131,11 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_SequentialReadHint(v)	((v)->vm_flags & VM_SEQ_READ)
 #define VM_RandomReadHint(v)		((v)->vm_flags & VM_RAND_READ)
 
+/*
+ * special vmas that are non-mergable, non-mlock()able
+ */
+#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP)
+
 /*
  * mapping from the currently active vm_flags protection bits (the
  * low four bits) to a page protection mask..
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index ec1a1baad34..b12f93a3c34 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -96,6 +96,7 @@ enum pageflags {
 	PG_swapbacked,		/* Page is backed by RAM/swap */
 #ifdef CONFIG_UNEVICTABLE_LRU
 	PG_unevictable,		/* Page is "unevictable"  */
+	PG_mlocked,		/* Page is vma mlocked */
 #endif
 #ifdef CONFIG_IA64_UNCACHED_ALLOCATOR
 	PG_uncached,		/* Page has been mapped as uncached */
@@ -232,7 +233,17 @@ PAGEFLAG_FALSE(SwapCache)
 #ifdef CONFIG_UNEVICTABLE_LRU
 PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable)
 	TESTCLEARFLAG(Unevictable, unevictable)
+
+#define MLOCK_PAGES 1
+PAGEFLAG(Mlocked, mlocked) __CLEARPAGEFLAG(Mlocked, mlocked)
+	TESTSCFLAG(Mlocked, mlocked)
+
 #else
+
+#define MLOCK_PAGES 0
+PAGEFLAG_FALSE(Mlocked)
+	SETPAGEFLAG_NOOP(Mlocked) TESTCLEARFLAG_FALSE(Mlocked)
+
 PAGEFLAG_FALSE(Unevictable) TESTCLEARFLAG_FALSE(Unevictable)
 	SETPAGEFLAG_NOOP(Unevictable) CLEARPAGEFLAG_NOOP(Unevictable)
 	__CLEARPAGEFLAG_NOOP(Unevictable)
@@ -354,15 +365,17 @@ static inline void __ClearPageTail(struct page *page)
 #endif /* !PAGEFLAGS_EXTENDED */
 
 #ifdef CONFIG_UNEVICTABLE_LRU
-#define __PG_UNEVICTABLE (1 << PG_unevictable)
+#define __PG_UNEVICTABLE	(1 << PG_unevictable)
+#define __PG_MLOCKED		(1 << PG_mlocked)
 #else
-#define __PG_UNEVICTABLE 0
+#define __PG_UNEVICTABLE	0
+#define __PG_MLOCKED		0
 #endif
 
 #define PAGE_FLAGS	(1 << PG_lru   | 1 << PG_private   | 1 << PG_locked | \
 			 1 << PG_buddy | 1 << PG_writeback | \
 			 1 << PG_slab  | 1 << PG_swapcache | 1 << PG_active | \
-			 __PG_UNEVICTABLE)
+			 __PG_UNEVICTABLE | __PG_MLOCKED)
 
 /*
  * Flags checked in bad_page().  Pages on the free list should not have
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index fed6f5e0b41..955667e6a52 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -117,6 +117,19 @@ unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);
  */
 int page_mkclean(struct page *);
 
+#ifdef CONFIG_UNEVICTABLE_LRU
+/*
+ * called in munlock()/munmap() path to check for other vmas holding
+ * the page mlocked.
+ */
+int try_to_munlock(struct page *);
+#else
+static inline int try_to_munlock(struct page *page)
+{
+	return 0;	/* a.k.a. SWAP_SUCCESS */
+}
+#endif
+
 #else	/* !CONFIG_MMU */
 
 #define anon_vma_init()		do {} while (0)
@@ -140,5 +153,6 @@ static inline int page_mkclean(struct page *page)
 #define SWAP_SUCCESS	0
 #define SWAP_AGAIN	1
 #define SWAP_FAIL	2
+#define SWAP_MLOCK	3
 
 #endif	/* _LINUX_RMAP_H */
-- 
cgit v1.2.3


From 5344b7e648980cc2ca613ec03a56a8222ff48820 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Sat, 18 Oct 2008 20:26:51 -0700
Subject: vmstat: mlocked pages statistics

Add NR_MLOCK zone page state, which provides a (conservative) count of
mlocked pages (actually, the number of mlocked pages moved off the LRU).

Reworked by lts to fit in with the modified mlock page support in the
Reclaim Scalability series.

[kosaki.motohiro@jp.fujitsu.com: fix incorrect Mlocked field of /proc/meminfo]
[lee.schermerhorn@hp.com: mlocked-pages: add event counting with statistics]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 2 ++
 include/linux/vmstat.h | 4 ++++
 2 files changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index d1f60d5fe2e..da2d053a95f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -88,8 +88,10 @@ enum zone_stat_item {
 	NR_ACTIVE_FILE,		/*  "     "     "   "       "         */
 #ifdef CONFIG_UNEVICTABLE_LRU
 	NR_UNEVICTABLE,		/*  "     "     "   "       "         */
+	NR_MLOCK,		/* mlock()ed pages found and moved off LRU */
 #else
 	NR_UNEVICTABLE = NR_ACTIVE_FILE, /* avoid compiler errors in dead code */
+	NR_MLOCK = NR_ACTIVE_FILE,
 #endif
 	NR_ANON_PAGES,	/* Mapped anonymous pages */
 	NR_FILE_MAPPED,	/* pagecache pages mapped into pagetables.
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 135840cd7fe..05b805020be 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -45,6 +45,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		UNEVICTABLE_PGCULLED,	/* culled to noreclaim list */
 		UNEVICTABLE_PGSCANNED,	/* scanned for reclaimability */
 		UNEVICTABLE_PGRESCUED,	/* rescued from noreclaim list */
+		UNEVICTABLE_PGMLOCKED,
+		UNEVICTABLE_PGMUNLOCKED,
+		UNEVICTABLE_PGCLEARED,	/* on COW, page truncate */
+		UNEVICTABLE_PGSTRANDED,	/* unable to isolate on unlock */
 #endif
 		NR_VM_EVENT_ITEMS
 };
-- 
cgit v1.2.3


From 64d6519dda3905dfb94d3f93c07c5f263f41813f Mon Sep 17 00:00:00 2001
From: Lee Schermerhorn <lee.schermerhorn@hp.com>
Date: Sat, 18 Oct 2008 20:26:52 -0700
Subject: swap: cull unevictable pages in fault path

In the fault paths that install new anonymous pages, check whether the
page is evictable or not using lru_cache_add_active_or_unevictable().  If
the page is evictable, just add it to the active lru list [via the pagevec
cache], else add it to the unevictable list.

This "proactive" culling in the fault path mimics the handling of mlocked
pages in Nick Piggin's series to keep mlocked pages off the lru lists.

Notes:

1) This patch is optional--e.g., if one is concerned about the
   additional test in the fault path.  We can defer the moving of
   nonreclaimable pages until when vmscan [shrink_*_list()]
   encounters them.  Vmscan will only need to handle such pages
   once, but if there are a lot of them it could impact system
   performance.

2) The 'vma' argument to page_evictable() is required to notice that
   we're faulting a page into an mlock()ed vma w/o having to scan the
   page's rmap in the fault path.   Culling mlock()ed anon pages is
   currently the only reason for this patch.

3) We can't cull swap pages in read_swap_cache_async() because the
   vma argument doesn't necessarily correspond to the swap cache
   offset passed in by swapin_readahead().  This could [did!] result
   in mlocking pages in non-VM_LOCKED vmas if [when] we tried to
   cull in this path.

4) Move set_pte_at() to after where we add page to lru to keep it
   hidden from other tasks that might walk the page table.
   We already do it in this order in do_anonymous_page().  And,
   these are COW'd anon pages.  Is this safe?

[riel@redhat.com: undo an overzealous code cleanup]
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 7edb4cbc29f..07eda69412f 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -173,6 +173,8 @@ extern unsigned int nr_free_pagecache_pages(void);
 /* linux/mm/swap.c */
 extern void __lru_cache_add(struct page *, enum lru_list lru);
 extern void lru_cache_add_lru(struct page *, enum lru_list lru);
+extern void lru_cache_add_active_or_unevictable(struct page *,
+					struct vm_area_struct *);
 extern void activate_page(struct page *);
 extern void mark_page_accessed(struct page *);
 extern void lru_add_drain(void);
-- 
cgit v1.2.3


From af936a1606246a10c145feac3770f6287f483f02 Mon Sep 17 00:00:00 2001
From: Lee Schermerhorn <lee.schermerhorn@hp.com>
Date: Sat, 18 Oct 2008 20:26:53 -0700
Subject: vmscan: unevictable LRU scan sysctl

This patch adds a function to scan individual or all zones' unevictable
lists and move any pages that have become evictable onto the respective
zone's inactive list, where shrink_inactive_list() will deal with them.

Adds sysctl to scan all nodes, and per node attributes to individual
nodes' zones.

Kosaki: If an evictable page is found on the unevictable lru when
/proc/sys/vm/scan_unevictable_pages is written, print the filename and
file offset of that page.

[akpm@linux-foundation.org: fix one CONFIG_MMU=n build error]
[kosaki.motohiro@jp.fujitsu.com: adapt vmscan-unevictable-lru-scan-sysctl.patch to new sysfs API]
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/rmap.h |  3 +++
 include/linux/swap.h | 15 +++++++++++++++
 2 files changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 955667e6a52..1da48db8db0 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -75,6 +75,9 @@ void anon_vma_unlink(struct vm_area_struct *);
 void anon_vma_link(struct vm_area_struct *);
 void __anon_vma_link(struct vm_area_struct *);
 
+extern struct anon_vma *page_lock_anon_vma(struct page *page);
+extern void page_unlock_anon_vma(struct anon_vma *anon_vma);
+
 /*
  * rmap interfaces called when adding or removing pte of page
  */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 07eda69412f..a3af95b2cb6 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -7,6 +7,7 @@
 #include <linux/list.h>
 #include <linux/memcontrol.h>
 #include <linux/sched.h>
+#include <linux/node.h>
 
 #include <asm/atomic.h>
 #include <asm/page.h>
@@ -235,15 +236,29 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
 #ifdef CONFIG_UNEVICTABLE_LRU
 extern int page_evictable(struct page *page, struct vm_area_struct *vma);
 extern void scan_mapping_unevictable_pages(struct address_space *);
+
+extern unsigned long scan_unevictable_pages;
+extern int scan_unevictable_handler(struct ctl_table *, int, struct file *,
+					void __user *, size_t *, loff_t *);
+extern int scan_unevictable_register_node(struct node *node);
+extern void scan_unevictable_unregister_node(struct node *node);
 #else
 static inline int page_evictable(struct page *page,
 						struct vm_area_struct *vma)
 {
 	return 1;
 }
+
 static inline void scan_mapping_unevictable_pages(struct address_space *mapping)
 {
 }
+
+static inline int scan_unevictable_register_node(struct node *node)
+{
+	return 0;
+}
+
+static inline void scan_unevictable_unregister_node(struct node *node) { }
 #endif
 
 extern int kswapd_run(int nid);
-- 
cgit v1.2.3


From 985737cf2ea096ea946aed82c7484d40defc71a8 Mon Sep 17 00:00:00 2001
From: Lee Schermerhorn <lee.schermerhorn@hp.com>
Date: Sat, 18 Oct 2008 20:26:53 -0700
Subject: mlock: count attempts to free mlocked page

Allow free of mlock()ed pages.  This shouldn't happen, but during
development, it occasionally did.

This patch allows us to survive that condition, while keeping the
statistics and events correct for debug.

Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/vmstat.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 05b805020be..9cd3ab0f554 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -49,6 +49,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		UNEVICTABLE_PGMUNLOCKED,
 		UNEVICTABLE_PGCLEARED,	/* on COW, page truncate */
 		UNEVICTABLE_PGSTRANDED,	/* unable to isolate on unlock */
+		UNEVICTABLE_MLOCKFREED,
 #endif
 		NR_VM_EVENT_ITEMS
 };
-- 
cgit v1.2.3


From 902d2e8ae0de29f483840ba1134af27343b9564d Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Sat, 18 Oct 2008 20:26:54 -0700
Subject: vmscan: kill unused lru functions

Several LRU manipulation functions are not used now.  So they can be
removed.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_inline.h | 48 -----------------------------------------------
 1 file changed, 48 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 67d7697fd01..c948350c378 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -37,54 +37,6 @@ del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l)
 	__dec_zone_state(zone, NR_LRU_BASE + l);
 }
 
-static inline void
-add_page_to_inactive_anon_list(struct zone *zone, struct page *page)
-{
-	add_page_to_lru_list(zone, page, LRU_INACTIVE_ANON);
-}
-
-static inline void
-add_page_to_active_anon_list(struct zone *zone, struct page *page)
-{
-	add_page_to_lru_list(zone, page, LRU_ACTIVE_ANON);
-}
-
-static inline void
-add_page_to_inactive_file_list(struct zone *zone, struct page *page)
-{
-	add_page_to_lru_list(zone, page, LRU_INACTIVE_FILE);
-}
-
-static inline void
-add_page_to_active_file_list(struct zone *zone, struct page *page)
-{
-	add_page_to_lru_list(zone, page, LRU_ACTIVE_FILE);
-}
-
-static inline void
-del_page_from_inactive_anon_list(struct zone *zone, struct page *page)
-{
-	del_page_from_lru_list(zone, page, LRU_INACTIVE_ANON);
-}
-
-static inline void
-del_page_from_active_anon_list(struct zone *zone, struct page *page)
-{
-	del_page_from_lru_list(zone, page, LRU_ACTIVE_ANON);
-}
-
-static inline void
-del_page_from_inactive_file_list(struct zone *zone, struct page *page)
-{
-	del_page_from_lru_list(zone, page, LRU_INACTIVE_FILE);
-}
-
-static inline void
-del_page_from_active_file_list(struct zone *zone, struct page *page)
-{
-	del_page_from_lru_list(zone, page, LRU_INACTIVE_FILE);
-}
-
 static inline void
 del_page_from_lru(struct zone *zone, struct page *page)
 {
-- 
cgit v1.2.3


From f45840b5c128445da70e7ec33adc47b4a12bdaf4 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Sat, 18 Oct 2008 20:26:57 -0700
Subject: mm: pagecache insertion fewer atomics

Setting and clearing the page locked when inserting it into swapcache /
pagecache when it has no other references can use non-atomic page flags
operations because no other CPU may be operating on it at this time.

This saves one atomic operation when inserting a page into pagecache.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagemap.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 4b6c4d8d26b..7334b2b6c4c 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -299,14 +299,14 @@ extern int __lock_page_killable(struct page *page);
 extern void __lock_page_nosync(struct page *page);
 extern void unlock_page(struct page *page);
 
-static inline void set_page_locked(struct page *page)
+static inline void __set_page_locked(struct page *page)
 {
-	set_bit(PG_locked, &page->flags);
+	__set_bit(PG_locked, &page->flags);
 }
 
-static inline void clear_page_locked(struct page *page)
+static inline void __clear_page_locked(struct page *page)
 {
-	clear_bit(PG_locked, &page->flags);
+	__clear_bit(PG_locked, &page->flags);
 }
 
 static inline int trylock_page(struct page *page)
@@ -438,17 +438,17 @@ extern void __remove_from_page_cache(struct page *page);
 
 /*
  * Like add_to_page_cache_locked, but used to add newly allocated pages:
- * the page is new, so we can just run set_page_locked() against it.
+ * the page is new, so we can just run __set_page_locked() against it.
  */
 static inline int add_to_page_cache(struct page *page,
 		struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)
 {
 	int error;
 
-	set_page_locked(page);
+	__set_page_locked(page);
 	error = add_to_page_cache_locked(page, mapping, offset, gfp_mask);
 	if (unlikely(error))
-		clear_page_locked(page);
+		__clear_page_locked(page);
 	return error;
 }
 
-- 
cgit v1.2.3


From 8413ac9d8c9a1366a4f57880723126cd24e5a5c3 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Sat, 18 Oct 2008 20:26:59 -0700
Subject: mm: page lock use lock bitops

trylock_page, unlock_page open and close a critical section. Hence,
we can use the lock bitops to get the desired memory ordering.

Also, mark trylock as likely to succeed (and remove the annotation from
callers).

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagemap.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 7334b2b6c4c..709742be02f 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -311,7 +311,7 @@ static inline void __clear_page_locked(struct page *page)
 
 static inline int trylock_page(struct page *page)
 {
-	return !test_and_set_bit(PG_locked, &page->flags);
+	return (likely(!test_and_set_bit_lock(PG_locked, &page->flags)));
 }
 
 /*
-- 
cgit v1.2.3


From 51b07fc3c5c830bb49c80fc5eac041e1f66a72e7 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Sat, 18 Oct 2008 20:27:00 -0700
Subject: fs: buffer lock use lock bitops

trylock_buffer and unlock_buffer open and close a critical section.
Hence, we can use the lock bitops to get the desired memory ordering.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/buffer_head.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index eadaab44015..3ce64b90118 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -322,7 +322,7 @@ static inline void wait_on_buffer(struct buffer_head *bh)
 
 static inline int trylock_buffer(struct buffer_head *bh)
 {
-	return likely(!test_and_set_bit(BH_Lock, &bh->b_state));
+	return likely(!test_and_set_bit_lock(BH_Lock, &bh->b_state));
 }
 
 static inline void lock_buffer(struct buffer_head *bh)
-- 
cgit v1.2.3


From db64fe02258f1507e13fe5212a989922323685ce Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Sat, 18 Oct 2008 20:27:03 -0700
Subject: mm: rewrite vmap layer

Rewrite the vmap allocator to use rbtrees and lazy tlb flushing, and
provide a fast, scalable percpu frontend for small vmaps (requires a
slightly different API, though).

The biggest problem with vmap is actually vunmap.  Presently this requires
a global kernel TLB flush, which on most architectures is a broadcast IPI
to all CPUs to flush the cache.  This is all done under a global lock.  As
the number of CPUs increases, so will the number of vunmaps a scaled
workload will want to perform, and so will the cost of a global TLB flush.
 This gives terrible quadratic scalability characteristics.

Another problem is that the entire vmap subsystem works under a single
lock.  It is a rwlock, but it is actually taken for write in all the fast
paths, and the read locking would likely never be run concurrently anyway,
so it's just pointless.

This is a rewrite of vmap subsystem to solve those problems.  The existing
vmalloc API is implemented on top of the rewritten subsystem.

The TLB flushing problem is solved by using lazy TLB unmapping.  vmap
addresses do not have to be flushed immediately when they are vunmapped,
because the kernel will not reuse them again (would be a use-after-free)
until they are reallocated.  So the addresses aren't allocated again until
a subsequent TLB flush.  A single TLB flush then can flush multiple
vunmaps from each CPU.

XEN and PAT and such do not like deferred TLB flushing because they can't
always handle multiple aliasing virtual addresses to a physical address.
They now call vm_unmap_aliases() in order to flush any deferred mappings.
That call is very expensive (well, actually not a lot more expensive than
a single vunmap under the old scheme), however it should be OK if not
called too often.

The virtual memory extent information is stored in an rbtree rather than a
linked list to improve the algorithmic scalability.

There is a per-CPU allocator for small vmaps, which amortizes or avoids
global locking.

To use the per-CPU interface, the vm_map_ram / vm_unmap_ram interfaces
must be used in place of vmap and vunmap.  Vmalloc does not use these
interfaces at the moment, so it will not be quite so scalable (although it
will use lazy TLB flushing).

As a quick test of performance, I ran a test that loops in the kernel,
linearly mapping then touching then unmapping 4 pages.  Different numbers
of tests were run in parallel on a 4 core, 2 socket opteron.  Results are
in nanoseconds per map+touch+unmap.

threads           vanilla         vmap rewrite
1                 14700           2900
2                 33600           3000
4                 49500           2800
8                 70631           2900

So with 8 cores, the rewritten version is already 25x faster.

In a slightly more realistic test (although with an older and less
scalable version of the patch), I ripped the not-very-good vunmap batching
code out of XFS, and implemented the large buffer mapping with vm_map_ram
and vm_unmap_ram...  along with a couple of other tricks, I was able to
speed up a large directory workload by 20x on a 64 CPU system.  I believe
vmap/vunmap is actually sped up a lot more than 20x on such a system, but
I'm running into other locks now.  vmap is pretty well blown off the
profiles.

Before:
1352059 total                                      0.1401
798784 _write_lock                              8320.6667 <- vmlist_lock
529313 default_idle                             1181.5022
 15242 smp_call_function                         15.8771  <- vmap tlb flushing
  2472 __get_vm_area_node                         1.9312  <- vmap
  1762 remove_vm_area                             4.5885  <- vunmap
   316 map_vm_area                                0.2297  <- vmap
   312 kfree                                      0.1950
   300 _spin_lock                                 3.1250
   252 sn_send_IPI_phys                           0.4375  <- tlb flushing
   238 vmap                                       0.8264  <- vmap
   216 find_lock_page                             0.5192
   196 find_next_bit                              0.3603
   136 sn2_send_IPI                               0.2024
   130 pio_phys_write_mmr                         2.0312
   118 unmap_kernel_range                         0.1229

After:
 78406 total                                      0.0081
 40053 default_idle                              89.4040
 33576 ia64_spinlock_contention                 349.7500
  1650 _spin_lock                                17.1875
   319 __reg_op                                   0.5538
   281 _atomic_dec_and_lock                       1.0977
   153 mutex_unlock                               1.5938
   123 iget_locked                                0.1671
   117 xfs_dir_lookup                             0.1662
   117 dput                                       0.1406
   114 xfs_iget_core                              0.0268
    92 xfs_da_hashname                            0.1917
    75 d_alloc                                    0.0670
    68 vmap_page_range                            0.0462 <- vmap
    58 kmem_cache_alloc                           0.0604
    57 memset                                     0.0540
    52 rb_next                                    0.1625
    50 __copy_user                                0.0208
    49 bitmap_find_free_region                    0.2188 <- vmap
    46 ia64_sn_udelay                             0.1106
    45 find_inode_fast                            0.1406
    42 memcmp                                     0.2188
    42 finish_task_switch                         0.1094
    42 __d_lookup                                 0.0410
    40 radix_tree_lookup_slot                     0.1250
    37 _spin_unlock_irqrestore                    0.3854
    36 xfs_bmapi                                  0.0050
    36 kmem_cache_free                            0.0256
    35 xfs_vn_getattr                             0.0322
    34 radix_tree_lookup                          0.1062
    33 __link_path_walk                           0.0035
    31 xfs_da_do_buf                              0.0091
    30 _xfs_buf_find                              0.0204
    28 find_get_page                              0.0875
    27 xfs_iread                                  0.0241
    27 __strncpy_from_user                        0.2812
    26 _xfs_buf_initialize                        0.0406
    24 _xfs_buf_lookup_pages                      0.0179
    24 vunmap_page_range                          0.0250 <- vunmap
    23 find_lock_page                             0.0799
    22 vm_map_ram                                 0.0087 <- vmap
    20 kfree                                      0.0125
    19 put_page                                   0.0330
    18 __kmalloc                                  0.0176
    17 xfs_da_node_lookup_int                     0.0086
    17 _read_lock                                 0.0885
    17 page_waitqueue                             0.0664

vmap has gone from being the top 5 on the profiles and flushing the crap
out of all TLBs, to using less than 1% of kernel time.

[akpm@linux-foundation.org: cleanups, section fix]
[akpm@linux-foundation.org: fix build on alpha]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Krzysztof Helt <krzysztof.h1@poczta.fm>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/vmalloc.h | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 328eb402272..4c28c4d564e 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -2,6 +2,7 @@
 #define _LINUX_VMALLOC_H
 
 #include <linux/spinlock.h>
+#include <linux/init.h>
 #include <asm/page.h>		/* pgprot_t */
 
 struct vm_area_struct;		/* vma defining user mapping in mm_types.h */
@@ -23,7 +24,6 @@ struct vm_area_struct;		/* vma defining user mapping in mm_types.h */
 #endif
 
 struct vm_struct {
-	/* keep next,addr,size together to speedup lookups */
 	struct vm_struct	*next;
 	void			*addr;
 	unsigned long		size;
@@ -37,6 +37,19 @@ struct vm_struct {
 /*
  *	Highlevel APIs for driver use
  */
+extern void vm_unmap_ram(const void *mem, unsigned int count);
+extern void *vm_map_ram(struct page **pages, unsigned int count,
+				int node, pgprot_t prot);
+extern void vm_unmap_aliases(void);
+
+#ifdef CONFIG_MMU
+extern void __init vmalloc_init(void);
+#else
+static inline void vmalloc_init(void)
+{
+}
+#endif
+
 extern void *vmalloc(unsigned long size);
 extern void *vmalloc_user(unsigned long size);
 extern void *vmalloc_node(unsigned long size, int node);
-- 
cgit v1.2.3


From e575f111dc0f27044e170580e7de50985ab3e011 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Sat, 18 Oct 2008 20:27:08 -0700
Subject: coredump_filter: add hugepage dumping

Presently hugepage's vma has a VM_RESERVED flag in order not to be
swapped.  But a VM_RESERVED vma isn't core dumped because this flag is
often used for some kernel vmas (e.g.  vmalloc, sound related).

Thus hugepages are never dumped and it can't be debugged easily.  Many
developers want hugepages to be included into core-dump.

However, we can't read a generic VM_RESERVED area because such an area is
often an IO mapping area, and reading it may change device state.  That is
definitely an undesirable side effect.

So adding a hugepage specific bit to the coredump filter is better.  It
enables hugepage core dumping and doesn't cause any side effects on any
I/O devices.

In addition, libhugetlb uses hugetlb private mapping pages as anonymous
pages.  Therefore, hugepage private mapping pages should be core dumped by
default.

Then, /proc/[pid]/coredump_filter has two new bits.

 - bit 5 means hugetlb private mapping pages are dumped or not. (default: yes)
 - bit 6 means hugetlb shared mapping pages are dumped or not.  (default: no)

I tested by following method.

% ulimit -c unlimited
% ./crash_hugepage  50
% ./crash_hugepage  50  -p
% ls -lh
% gdb ./crash_hugepage core
%
% echo 0x43 > /proc/self/coredump_filter
% ./crash_hugepage  50
% ./crash_hugepage  50  -p
% ls -lh
% gdb ./crash_hugepage core

#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>
#include <string.h>

#include "hugetlbfs.h"

int main(int argc, char** argv){
	char* p;
	int ch;
	int mmap_flags = MAP_SHARED;
	int fd;
	int nr_pages;

	while((ch = getopt(argc, argv, "p")) != -1) {
		switch (ch) {
		case 'p':
			mmap_flags &= ~MAP_SHARED;
			mmap_flags |= MAP_PRIVATE;
			break;
		default:
			/* nothing*/
			break;
		}
	}
	argc -= optind;
	argv += optind;

	if (argc == 0){
		printf("need # of pages\n");
		exit(1);
	}

	nr_pages = atoi(argv[0]);
	if (nr_pages < 2) {
		printf("nr_pages must >2\n");
		exit(1);
	}

	fd = hugetlbfs_unlinked_fd();
	p = mmap(NULL, nr_pages * gethugepagesize(),
		 PROT_READ|PROT_WRITE, mmap_flags, fd, 0);

	sleep(2);

	*(p + gethugepagesize()) = 1; /* COW */
	sleep(2);

	/* crash! */
	*(int*)0 = 1;

	return 0;
}

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reviewed-by: Kawai Hidehiro <hidehiro.kawai.ez@hitachi.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: William Irwin <wli@holomorphy.com>
Cc: Adam Litke <agl@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index c226c7b8294..017cc914ef1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -403,12 +403,15 @@ extern int get_dumpable(struct mm_struct *mm);
 #define MMF_DUMP_MAPPED_PRIVATE	4
 #define MMF_DUMP_MAPPED_SHARED	5
 #define MMF_DUMP_ELF_HEADERS	6
+#define MMF_DUMP_HUGETLB_PRIVATE 7
+#define MMF_DUMP_HUGETLB_SHARED  8
 #define MMF_DUMP_FILTER_SHIFT	MMF_DUMPABLE_BITS
-#define MMF_DUMP_FILTER_BITS	5
+#define MMF_DUMP_FILTER_BITS	7
 #define MMF_DUMP_FILTER_MASK \
 	(((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT)
 #define MMF_DUMP_FILTER_DEFAULT \
-	((1 << MMF_DUMP_ANON_PRIVATE) |	(1 << MMF_DUMP_ANON_SHARED))
+	((1 << MMF_DUMP_ANON_PRIVATE) |	(1 << MMF_DUMP_ANON_SHARED) |\
+	 (1 << MMF_DUMP_HUGETLB_PRIVATE))
 
 struct sighand_struct {
 	atomic_t		count;
-- 
cgit v1.2.3


From 8174f1503f4bf7e9a14b3fbbfdb30c6be6e29f77 Mon Sep 17 00:00:00 2001
From: Matt Helsley <matthltc@us.ibm.com>
Date: Sat, 18 Oct 2008 20:27:19 -0700
Subject: container freezer: make refrigerator always available

Now that the TIF_FREEZE flag is available in all architectures, extract
the refrigerator() and freeze_task() from kernel/power/process.c and make
it available to all.

The refrigerator() can now be used in a control group subsystem
implementing a control group freezer.

Signed-off-by: Cedric Le Goater <clg@fr.ibm.com>
Signed-off-by: Matt Helsley <matthltc@us.ibm.com>
Acked-by: Serge E. Hallyn <serue@us.ibm.com>
Tested-by: Matt Helsley <matthltc@us.ibm.com>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/freezer.h | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index deddeedf325..17e3bb42dd3 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -6,7 +6,7 @@
 #include <linux/sched.h>
 #include <linux/wait.h>
 
-#ifdef CONFIG_PM_SLEEP
+#ifdef CONFIG_FREEZER
 /*
  * Check if a process has been frozen
  */
@@ -39,6 +39,11 @@ static inline void clear_freeze_flag(struct task_struct *p)
 	clear_tsk_thread_flag(p, TIF_FREEZE);
 }
 
+static inline bool should_send_signal(struct task_struct *p)
+{
+	return !(p->flags & PF_FREEZER_NOSIG);
+}
+
 /*
  * Wake up a frozen process
  *
@@ -75,6 +80,9 @@ static inline int try_to_freeze(void)
 		return 0;
 }
 
+extern bool freeze_task(struct task_struct *p, bool sig_only);
+extern void cancel_freezing(struct task_struct *p);
+
 /*
  * The PF_FREEZER_SKIP flag should be set by a vfork parent right before it
  * calls wait_for_completion(&vfork) and reset right after it returns from this
@@ -166,7 +174,7 @@ static inline void set_freezable_with_signal(void)
 	} while (try_to_freeze());					\
 	__retval;							\
 })
-#else /* !CONFIG_PM_SLEEP */
+#else /* !CONFIG_FREEZER */
 static inline int frozen(struct task_struct *p) { return 0; }
 static inline int freezing(struct task_struct *p) { return 0; }
 static inline void set_freeze_flag(struct task_struct *p) {}
@@ -191,6 +199,6 @@ static inline void set_freezable_with_signal(void) {}
 #define wait_event_freezable_timeout(wq, condition, timeout)		\
 		wait_event_interruptible_timeout(wq, condition, timeout)
 
-#endif /* !CONFIG_PM_SLEEP */
+#endif /* !CONFIG_FREEZER */
 
 #endif	/* FREEZER_H_INCLUDED */
-- 
cgit v1.2.3


From dc52ddc0e6f45b04780b26fc0813509f8e798c42 Mon Sep 17 00:00:00 2001
From: Matt Helsley <matthltc@us.ibm.com>
Date: Sat, 18 Oct 2008 20:27:21 -0700
Subject: container freezer: implement freezer cgroup subsystem

This patch implements a new freezer subsystem in the control groups
framework.  It provides a way to stop and resume execution of all tasks in
a cgroup by writing in the cgroup filesystem.

The freezer subsystem in the container filesystem defines a file named
freezer.state.  Writing "FROZEN" to the state file will freeze all tasks
in the cgroup.  Subsequently writing "RUNNING" will unfreeze the tasks in
the cgroup.  Reading will return the current state.

* Examples of usage :

   # mkdir /containers/freezer
   # mount -t cgroup -ofreezer freezer  /containers
   # mkdir /containers/0
   # echo $some_pid > /containers/0/tasks

to get status of the freezer subsystem :

   # cat /containers/0/freezer.state
   RUNNING

to freeze all tasks in the container :

   # echo FROZEN > /containers/0/freezer.state
   # cat /containers/0/freezer.state
   FREEZING
   # cat /containers/0/freezer.state
   FROZEN

to unfreeze all tasks in the container :

   # echo RUNNING > /containers/0/freezer.state
   # cat /containers/0/freezer.state
   RUNNING

This is the basic mechanism which should do the right thing for user space
task in a simple scenario.

It's important to note that freezing can be incomplete.  In that case we
return EBUSY.  This means that some tasks in the cgroup are busy doing
something that prevents us from completely freezing the cgroup at this
time.  After EBUSY, the cgroup will remain partially frozen -- reflected
by freezer.state reporting "FREEZING" when read.  The state will remain
"FREEZING" until one of these things happens:

	1) Userspace cancels the freezing operation by writing "RUNNING" to
		the freezer.state file
	2) Userspace retries the freezing operation by writing "FROZEN" to
		the freezer.state file (writing "FREEZING" is not legal
		and returns EIO)
	3) The tasks that blocked the cgroup from entering the "FROZEN"
		state disappear from the cgroup's set of tasks.

[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: export thaw_process]
Signed-off-by: Cedric Le Goater <clg@fr.ibm.com>
Signed-off-by: Matt Helsley <matthltc@us.ibm.com>
Acked-by: Serge E. Hallyn <serue@us.ibm.com>
Tested-by: Matt Helsley <matthltc@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cgroup_subsys.h |  6 ++++++
 include/linux/freezer.h       | 29 ++++++++++-------------------
 2 files changed, 16 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index e2877454ec8..9c22396e8b5 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -48,3 +48,9 @@ SUBSYS(devices)
 #endif
 
 /* */
+
+#ifdef CONFIG_CGROUP_FREEZER
+SUBSYS(freezer)
+#endif
+
+/* */
diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index 17e3bb42dd3..8f225339eee 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -46,26 +46,11 @@ static inline bool should_send_signal(struct task_struct *p)
 
 /*
  * Wake up a frozen process
- *
- * task_lock() is taken to prevent the race with refrigerator() which may
- * occur if the freezing of tasks fails.  Namely, without the lock, if the
- * freezing of tasks failed, thaw_tasks() might have run before a task in
- * refrigerator() could call frozen_process(), in which case the task would be
- * frozen and no one would thaw it.
  */
-static inline int thaw_process(struct task_struct *p)
-{
-	task_lock(p);
-	if (frozen(p)) {
-		p->flags &= ~PF_FROZEN;
-		task_unlock(p);
-		wake_up_process(p);
-		return 1;
-	}
-	clear_freeze_flag(p);
-	task_unlock(p);
-	return 0;
-}
+extern int __thaw_process(struct task_struct *p);
+
+/* Takes and releases task alloc lock using task_lock() */
+extern int thaw_process(struct task_struct *p);
 
 extern void refrigerator(void);
 extern int freeze_processes(void);
@@ -83,6 +68,12 @@ static inline int try_to_freeze(void)
 extern bool freeze_task(struct task_struct *p, bool sig_only);
 extern void cancel_freezing(struct task_struct *p);
 
+#ifdef CONFIG_CGROUP_FREEZER
+extern int cgroup_frozen(struct task_struct *task);
+#else /* !CONFIG_CGROUP_FREEZER */
+static inline int cgroup_frozen(struct task_struct *task) { return 0; }
+#endif /* !CONFIG_CGROUP_FREEZER */
+
 /*
  * The PF_FREEZER_SKIP flag should be set by a vfork parent right before it
  * calls wait_for_completion(&vfork) and reset right after it returns from this
-- 
cgit v1.2.3


From 3e680aae4e53ab54cdbb0c29257dae0cbb158e1c Mon Sep 17 00:00:00 2001
From: Krzysztof Helt <krzysztof.h1@wp.pl>
Date: Sat, 18 Oct 2008 20:27:51 -0700
Subject: fb: convert lock/unlock_kernel() into local fb mutex

Change lock_kernel()/unlock_kernel() to local fb mutex.  Each frame buffer
instance has its own mutex.

The one line try_to_load() function is unrolled to request_module() in two
places for readability.

[righi.andrea@gmail.com: fb: fix NULL pointer BUG dereference in fb_open()]
Signed-off-by: Krzysztof Helt <krzysztof.h1@wp.pl>
Signed-off-by: Andrea Righi <righi.andrea@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/fb.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/fb.h b/include/linux/fb.h
index 531ccd5f596..75a81eaf343 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -808,6 +808,7 @@ struct fb_tile_ops {
 struct fb_info {
 	int node;
 	int flags;
+	struct mutex lock;		/* Lock for open/release/ioctl funcs */
 	struct fb_var_screeninfo var;	/* Current var */
 	struct fb_fix_screeninfo fix;	/* Current fix */
 	struct fb_monspecs monspecs;	/* Current Monitor specs */
-- 
cgit v1.2.3


From 0e4fb5e283870757024294bc4567a7c59d936f0b Mon Sep 17 00:00:00 2001
From: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Date: Sat, 18 Oct 2008 20:27:57 -0700
Subject: ext3: add an option to control error handling on file data

If the journal doesn't abort when it gets an IO error in file data blocks,
the file data corruption will spread silently.  Because most of
applications and commands do buffered writes without fsync(), they don't
notice the IO error.  It's scary for mission critical systems.  On the
other hand, if the journal aborts whenever it gets an IO error in file
data blocks, the system will easily become inoperable.  So this patch
introduces a filesystem option to determine whether it aborts the journal
or just call printk() when it gets an IO error in file data.

If you mount a ext3 fs with data_err=abort option, it aborts on file data
write error.  If you mount it with data_err=ignore, it doesn't abort, just
call printk().  data_err=ignore is the default.

Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Cc: Jan Kara <jack@ucw.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/ext3_fs.h | 2 ++
 include/linux/jbd.h     | 3 +++
 2 files changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
index 159d9b476cd..d14f0291848 100644
--- a/include/linux/ext3_fs.h
+++ b/include/linux/ext3_fs.h
@@ -380,6 +380,8 @@ struct ext3_inode {
 #define EXT3_MOUNT_QUOTA		0x80000 /* Some quota option set */
 #define EXT3_MOUNT_USRQUOTA		0x100000 /* "old" user quota */
 #define EXT3_MOUNT_GRPQUOTA		0x200000 /* "old" group quota */
+#define EXT3_MOUNT_DATA_ERR_ABORT	0x400000 /* Abort on file data write
+						  * error in ordered mode */
 
 /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index 7ebbcb1c9ba..35d4f6342fa 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -816,6 +816,9 @@ struct journal_s
 #define JFS_FLUSHED	0x008	/* The journal superblock has been flushed */
 #define JFS_LOADED	0x010	/* The journal superblock has been loaded */
 #define JFS_BARRIER	0x020	/* Use IDE barriers */
+#define JFS_ABORT_ON_SYNCDATA_ERR	0x040  /* Abort the journal on file
+						* data write error in ordered
+						* mode */
 
 /*
  * Function declarations for the journaling transaction and buffer
-- 
cgit v1.2.3


From 146aa1bd0511f88ddb4e92fafa2b8aad4f2f65f3 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Sat, 18 Oct 2008 20:28:03 -0700
Subject: cgroups: fix probable race with put_css_set[_taskexit] and
 find_css_set

put_css_set_taskexit may be called while find_css_set is being called on
another cpu, and then the following race can occur:

put_css_set_taskexit side                    find_css_set side

                                        |
atomic_dec_and_test(&kref->refcount)    |
    /* kref->refcount = 0 */            |
....................................................................
                                        |  read_lock(&css_set_lock)
                                        |  find_existing_css_set
                                        |  get_css_set
                                        |  read_unlock(&css_set_lock);
....................................................................
__release_css_set                       |
....................................................................
                                        | /* use a released css_set */
                                        |

[put_css_set is the same. But in the current code, all put_css_set are
put into cgroup mutex critical region as the same as find_css_set.]

[akpm@linux-foundation.org: repair comments]
[menage@google.com: eliminate race in css_set refcounting]
Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cgroup.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 30934e4bfaa..7166023e07d 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -9,7 +9,6 @@
  */
 
 #include <linux/sched.h>
-#include <linux/kref.h>
 #include <linux/cpumask.h>
 #include <linux/nodemask.h>
 #include <linux/rcupdate.h>
@@ -149,7 +148,7 @@ struct cgroup {
 struct css_set {
 
 	/* Reference count */
-	struct kref ref;
+	atomic_t refcount;
 
 	/*
 	 * List running through all cgroup groups in the same hash
-- 
cgit v1.2.3


From cc31edceee04a7b87f2be48f9489ebb72d264844 Mon Sep 17 00:00:00 2001
From: Paul Menage <menage@google.com>
Date: Sat, 18 Oct 2008 20:28:04 -0700
Subject: cgroups: convert tasks file to use a seq_file with shared pid array

Rather than pre-generating the entire text for the "tasks" file each
time the file is opened, we instead just generate/update the array of
process ids and use a seq_file to report these to userspace.  All open
file handles on the same "tasks" file can share a pid array, which may
be updated any time that no thread is actively reading the array.  By
sharing the array, the potential for userspace to DoS the system by
opening many handles on the same "tasks" file is removed.

[Based on a patch by Lai Jiangshan, extended to use seq_file]

Signed-off-by: Paul Menage <menage@google.com>
Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Serge Hallyn <serue@us.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cgroup.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 7166023e07d..8ab91880a0a 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -14,6 +14,7 @@
 #include <linux/rcupdate.h>
 #include <linux/cgroupstats.h>
 #include <linux/prio_heap.h>
+#include <linux/rwsem.h>
 
 #ifdef CONFIG_CGROUPS
 
@@ -136,6 +137,15 @@ struct cgroup {
 	 * release_list_lock
 	 */
 	struct list_head release_list;
+
+	/* pids_mutex protects the fields below */
+	struct rw_semaphore pids_mutex;
+	/* Array of process ids in the cgroup */
+	pid_t *tasks_pids;
+	/* How many files are using the current tasks_pids array */
+	int pids_use_count;
+	/* Length of the current tasks_pids array */
+	int pids_length;
 };
 
 /* A css_set is a structure holding pointers to a set of
-- 
cgit v1.2.3


From 886465f407e57d6c3c81013c919ea670ce1ae0d0 Mon Sep 17 00:00:00 2001
From: Paul Menage <menage@google.com>
Date: Sat, 18 Oct 2008 20:28:05 -0700
Subject: cgroups: fix declaration of cgroup_mm_owner_callbacks

The choice of real/dummy declaration for cgroup_mm_owner_callbacks()
shouldn't be based on CONFIG_MM_OWNER, but on CONFIG_CGROUPS.  Otherwise
kernel/exit.c fails to compile when something other than a cgroups
controller selects CONFIG_MM_OWNER

Signed-off-by: Paul Menage <menage@google.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cgroup.h | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 8ab91880a0a..8b00f6643e9 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -403,6 +403,9 @@ void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it);
 int cgroup_scan_tasks(struct cgroup_scanner *scan);
 int cgroup_attach_task(struct cgroup *, struct task_struct *);
 
+void cgroup_mm_owner_callbacks(struct task_struct *old,
+			       struct task_struct *new);
+
 #else /* !CONFIG_CGROUPS */
 
 static inline int cgroup_init_early(void) { return 0; }
@@ -421,15 +424,9 @@ static inline int cgroupstats_build(struct cgroupstats *stats,
 	return -EINVAL;
 }
 
+static inline void cgroup_mm_owner_callbacks(struct task_struct *old,
+					     struct task_struct *new) {}
+
 #endif /* !CONFIG_CGROUPS */
 
-#ifdef CONFIG_MM_OWNER
-extern void
-cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new);
-#else /* !CONFIG_MM_OWNER */
-static inline void
-cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
-{
-}
-#endif /* CONFIG_MM_OWNER */
 #endif /* _LINUX_CGROUP_H */
-- 
cgit v1.2.3


From 52d4b9ac0b985168009c2a57098324e67bae171f Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Sat, 18 Oct 2008 20:28:16 -0700
Subject: memcg: allocate all page_cgroup at boot

Allocate all page_cgroup at boot and remove page_cgroup pointer from
struct page.  This patch adds an interface as

 struct page_cgroup *lookup_page_cgroup(struct page*)

All of FLATMEM/DISCONTIGMEM/SPARSEMEM and MEMORY_HOTPLUG are supported.

Removing the page_cgroup pointer reduces the amount of memory by
 - 4 bytes per PAGE_SIZE.
 - 8 bytes per PAGE_SIZE
if memory controller is disabled. (even if configured.)

On usual 8GB x86-32 server, this saves 8MB of NORMAL_ZONE memory.
On my x86-64 server with 48GB of memory, this saves 96MB of memory.
I think this reduction makes sense.

By pre-allocation, kmalloc/kfree in charge/uncharge are removed.
This means
  - we no longer need to be afraid of kmalloc failure.
    (this can happen because of gfp_mask type.)
  - we can avoid calling kmalloc/kfree.
  - we can avoid allocating tons of small objects which can be fragmented.
  - we can know what amount of memory will be used for this extra-lru handling.

I added printk message as

	"allocated %ld bytes of page_cgroup"
        "please try cgroup_disable=memory option if you don't want"

maybe informative enough for users.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h  |  13 +-----
 include/linux/mm_types.h    |   3 --
 include/linux/mmzone.h      |  14 +++++-
 include/linux/page_cgroup.h | 103 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 117 insertions(+), 16 deletions(-)
 create mode 100644 include/linux/page_cgroup.h

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index ee1b2fcb441..1fbe14d3952 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -27,9 +27,6 @@ struct mm_struct;
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 
-#define page_reset_bad_cgroup(page)	((page)->page_cgroup = 0)
-
-extern struct page_cgroup *page_get_page_cgroup(struct page *page);
 extern int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
 				gfp_t gfp_mask);
 extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
@@ -72,16 +69,8 @@ extern void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem,
 extern long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone,
 					int priority, enum lru_list lru);
 
-#else /* CONFIG_CGROUP_MEM_RES_CTLR */
-static inline void page_reset_bad_cgroup(struct page *page)
-{
-}
-
-static inline struct page_cgroup *page_get_page_cgroup(struct page *page)
-{
-	return NULL;
-}
 
+#else /* CONFIG_CGROUP_MEM_RES_CTLR */
 static inline int mem_cgroup_charge(struct page *page,
 					struct mm_struct *mm, gfp_t gfp_mask)
 {
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 9d49fa36bbe..fe825471d5a 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -94,9 +94,6 @@ struct page {
 	void *virtual;			/* Kernel virtual address (NULL if
 					   not kmapped, ie. highmem) */
 #endif /* WANT_PAGE_VIRTUAL */
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
-	unsigned long page_cgroup;
-#endif
 };
 
 /*
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index da2d053a95f..35a7b5e1946 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -601,8 +601,11 @@ typedef struct pglist_data {
 	struct zone node_zones[MAX_NR_ZONES];
 	struct zonelist node_zonelists[MAX_ZONELISTS];
 	int nr_zones;
-#ifdef CONFIG_FLAT_NODE_MEM_MAP
+#ifdef CONFIG_FLAT_NODE_MEM_MAP	/* means !SPARSEMEM */
 	struct page *node_mem_map;
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+	struct page_cgroup *node_page_cgroup;
+#endif
 #endif
 	struct bootmem_data *bdata;
 #ifdef CONFIG_MEMORY_HOTPLUG
@@ -931,6 +934,7 @@ static inline unsigned long early_pfn_to_nid(unsigned long pfn)
 #endif
 
 struct page;
+struct page_cgroup;
 struct mem_section {
 	/*
 	 * This is, logically, a pointer to an array of struct
@@ -948,6 +952,14 @@ struct mem_section {
 
 	/* See declaration of similar field in struct zone */
 	unsigned long *pageblock_flags;
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+	/*
+	 * If !SPARSEMEM, pgdat doesn't have page_cgroup pointer. We use
+	 * section. (see memcontrol.h/page_cgroup.h about this.)
+	 */
+	struct page_cgroup *page_cgroup;
+	unsigned long pad;
+#endif
 };
 
 #ifdef CONFIG_SPARSEMEM_EXTREME
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
new file mode 100644
index 00000000000..0fd39f2231e
--- /dev/null
+++ b/include/linux/page_cgroup.h
@@ -0,0 +1,103 @@
+#ifndef __LINUX_PAGE_CGROUP_H
+#define __LINUX_PAGE_CGROUP_H
+
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+#include <linux/bit_spinlock.h>
+/*
+ * Page Cgroup can be considered as an extended mem_map.
+ * A page_cgroup page is associated with every page descriptor. The
+ * page_cgroup helps us identify information about the cgroup
+ * All page cgroups are allocated at boot or memory hotplug event,
+ * then the page cgroup for pfn always exists.
+ */
+struct page_cgroup {
+	unsigned long flags;
+	struct mem_cgroup *mem_cgroup;
+	struct page *page;
+	struct list_head lru;		/* per cgroup LRU list */
+};
+
+void __init pgdat_page_cgroup_init(struct pglist_data *pgdat);
+void __init page_cgroup_init(void);
+struct page_cgroup *lookup_page_cgroup(struct page *page);
+
+enum {
+	/* flags for mem_cgroup */
+	PCG_LOCK,  /* page cgroup is locked */
+	PCG_CACHE, /* charged as cache */
+	PCG_USED, /* this object is in use. */
+	/* flags for LRU placement */
+	PCG_ACTIVE, /* page is active in this cgroup */
+	PCG_FILE, /* page is file system backed */
+	PCG_UNEVICTABLE, /* page is unevictableable */
+};
+
+#define TESTPCGFLAG(uname, lname)			\
+static inline int PageCgroup##uname(struct page_cgroup *pc)	\
+	{ return test_bit(PCG_##lname, &pc->flags); }
+
+#define SETPCGFLAG(uname, lname)			\
+static inline void SetPageCgroup##uname(struct page_cgroup *pc)\
+	{ set_bit(PCG_##lname, &pc->flags);  }
+
+#define CLEARPCGFLAG(uname, lname)			\
+static inline void ClearPageCgroup##uname(struct page_cgroup *pc)	\
+	{ clear_bit(PCG_##lname, &pc->flags);  }
+
+/* Cache flag is set only once (at allocation) */
+TESTPCGFLAG(Cache, CACHE)
+
+TESTPCGFLAG(Used, USED)
+CLEARPCGFLAG(Used, USED)
+
+/* LRU management flags (from global-lru definition) */
+TESTPCGFLAG(File, FILE)
+SETPCGFLAG(File, FILE)
+CLEARPCGFLAG(File, FILE)
+
+TESTPCGFLAG(Active, ACTIVE)
+SETPCGFLAG(Active, ACTIVE)
+CLEARPCGFLAG(Active, ACTIVE)
+
+TESTPCGFLAG(Unevictable, UNEVICTABLE)
+SETPCGFLAG(Unevictable, UNEVICTABLE)
+CLEARPCGFLAG(Unevictable, UNEVICTABLE)
+
+static inline int page_cgroup_nid(struct page_cgroup *pc)
+{
+	return page_to_nid(pc->page);
+}
+
+static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc)
+{
+	return page_zonenum(pc->page);
+}
+
+static inline void lock_page_cgroup(struct page_cgroup *pc)
+{
+	bit_spin_lock(PCG_LOCK, &pc->flags);
+}
+
+static inline int trylock_page_cgroup(struct page_cgroup *pc)
+{
+	return bit_spin_trylock(PCG_LOCK, &pc->flags);
+}
+
+static inline void unlock_page_cgroup(struct page_cgroup *pc)
+{
+	bit_spin_unlock(PCG_LOCK, &pc->flags);
+}
+
+#else /* CONFIG_CGROUP_MEM_RES_CTLR */
+struct page_cgroup;
+
+static inline void pgdat_page_cgroup_init(struct pglist_data *pgdat)
+{
+}
+
+static inline struct page_cgroup *lookup_page_cgroup(struct page *page)
+{
+	return NULL;
+}
+#endif
+#endif
-- 
cgit v1.2.3


From 3eda20118000941e7e8994fc5fac8706d8c10f00 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Sat, 18 Oct 2008 20:28:19 -0700
Subject: seq_file: add seq_cpumask_list(), seq_nodemask_list()

seq_cpumask_list(), seq_nodemask_list() are very similar to seq_cpumask(),
seq_nodemask(), but they print a human readable string.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Paul Menage <menage@google.com>
Cc: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/seq_file.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index a1783b229ef..dc50bcc282a 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -60,6 +60,19 @@ static inline int seq_nodemask(struct seq_file *m, nodemask_t *mask)
 	return seq_bitmap(m, mask->bits, MAX_NUMNODES);
 }
 
+int seq_bitmap_list(struct seq_file *m, unsigned long *bits,
+		unsigned int nr_bits);
+
+static inline int seq_cpumask_list(struct seq_file *m, cpumask_t *mask)
+{
+	return seq_bitmap_list(m, mask->bits, NR_CPUS);
+}
+
+static inline int seq_nodemask_list(struct seq_file *m, nodemask_t *mask)
+{
+	return seq_bitmap_list(m, mask->bits, MAX_NUMNODES);
+}
+
 int single_open(struct file *, int (*)(struct seq_file *, void *), void *);
 int single_release(struct inode *, struct file *);
 void *__seq_open_private(struct file *, const struct seq_operations *, int);
-- 
cgit v1.2.3


From c4596435404976b0ded9cdf18b456ca2e1408ddd Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Sat, 18 Oct 2008 20:28:21 -0700
Subject: bitmask: remove bitmap_scnprintf_len()

bitmap_scnprintf_len() is not used now, so we remove it.

Otherwise we have to maintain it and make its return
value always equal to bitmap_scnprintf()'s return value.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Paul Menage <menage@google.com>
Cc: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bitmap.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 89781fd4885..1abfe664c44 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -110,7 +110,6 @@ extern int __bitmap_weight(const unsigned long *bitmap, int bits);
 
 extern int bitmap_scnprintf(char *buf, unsigned int len,
 			const unsigned long *src, int nbits);
-extern int bitmap_scnprintf_len(unsigned int nr_bits);
 extern int __bitmap_parse(const char *buf, unsigned int buflen, int is_user,
 			unsigned long *dst, int nbits);
 extern int bitmap_parse_user(const char __user *ubuf, unsigned int ulen,
-- 
cgit v1.2.3


From b747c8c102cc0677a7a8056a093f58d7c9b500e7 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Sat, 18 Oct 2008 20:28:21 -0700
Subject: make ptrace_untrace() static

ptrace_untrace() can now become static.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/ptrace.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index ea7416c901d..22641d5d45d 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -94,7 +94,6 @@ extern void ptrace_notify(int exit_code);
 extern void __ptrace_link(struct task_struct *child,
 			  struct task_struct *new_parent);
 extern void __ptrace_unlink(struct task_struct *child);
-extern void ptrace_untrace(struct task_struct *child);
 #define PTRACE_MODE_READ   1
 #define PTRACE_MODE_ATTACH 2
 /* Returns 0 on success, -errno on denial. */
-- 
cgit v1.2.3


From 656eb2cd5da153762f2e8419ca117ce12ef522c3 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Sat, 18 Oct 2008 20:28:23 -0700
Subject: add CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS

This adds a kconfig option to change the /proc/PID/coredump_filter default.
Fedora has been carrying a trivial patch to change the hard-wired value for
this default, since Fedora 8.  The default default can't change safely
because there are old GDB versions out there (all before 6.7) that are
confused by the core dump files created by the MMF_DUMP_ELF_HEADERS setting.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Michael Kerrisk <mtk.manpages@googlemail.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Kawai Hidehiro <hidehiro.kawai.ez@hitachi.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Jones <davej@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 017cc914ef1..f52dbd3587a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -411,7 +411,13 @@ extern int get_dumpable(struct mm_struct *mm);
 	(((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT)
 #define MMF_DUMP_FILTER_DEFAULT \
 	((1 << MMF_DUMP_ANON_PRIVATE) |	(1 << MMF_DUMP_ANON_SHARED) |\
-	 (1 << MMF_DUMP_HUGETLB_PRIVATE))
+	 (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF)
+
+#ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS
+# define MMF_DUMP_MASK_DEFAULT_ELF	(1 << MMF_DUMP_ELF_HEADERS)
+#else
+# define MMF_DUMP_MASK_DEFAULT_ELF	0
+#endif
 
 struct sighand_struct {
 	atomic_t		count;
-- 
cgit v1.2.3


From 57cac4d1880527e0baf6c2fda529d2ad1d815aec Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Sat, 18 Oct 2008 20:28:25 -0700
Subject: kdump: make elfcorehdr_addr independent of CONFIG_PROC_VMCORE

o elfcorehdr_addr is used not only by the code under CONFIG_PROC_VMCORE
  but also by the code which is not inside CONFIG_PROC_VMCORE.  For
  example, is_kdump_kernel() is used by powerpc code to determine if
  kernel is booting after a panic then use previous kernel's TCE table.
  So even if CONFIG_PROC_VMCORE is not set in second kernel, one should be
  able to correctly determine that we are booting after a panic and setup
  calgary iommu accordingly.

o So remove the assumption that elfcorehdr_addr is under
  CONFIG_PROC_VMCORE.

o Move definition of elfcorehdr_addr to arch dependent crash files.
  (Unfortunately crash dump does not have an arch independent file
  otherwise that would have been the best place).

o kexec.c is not the right place as one can have CRASH_DUMP enabled in
  second kernel without KEXEC being enabled.

o I don't see sh setup code parsing the command line for
  elfcorehdr_addr.  I am wondering how the vmcore interface works on sh.
  Anyway, I am at least defining elfcorehdr_addr so that compilation is not
  broken on sh.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Acked-by: Simon Horman <horms@verge.net.au>
Acked-by: Paul Mundt <lethal@linux-sh.org>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/crash_dump.h | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h
index 025e4f57510..de027d1db74 100644
--- a/include/linux/crash_dump.h
+++ b/include/linux/crash_dump.h
@@ -9,11 +9,7 @@
 
 #define ELFCORE_ADDR_MAX	(-1ULL)
 
-#ifdef CONFIG_PROC_VMCORE
 extern unsigned long long elfcorehdr_addr;
-#else
-static const unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
-#endif
 
 extern ssize_t copy_oldmem_page(unsigned long, char *, size_t,
 						unsigned long, int);
@@ -28,6 +24,16 @@ extern struct proc_dir_entry *proc_vmcore;
 
 #define vmcore_elf_check_arch(x) (elf_check_arch(x) || vmcore_elf_check_arch_cross(x))
 
+/*
+ * is_kdump_kernel() checks whether this kernel is booting after a panic of
+ * previous kernel or not. This is determined by checking if previous kernel
+ * has passed the elf core header address on command line.
+ *
+ * This is not just a test if CONFIG_CRASH_DUMP is enabled or not. It will
+ * return 1 if CONFIG_CRASH_DUMP=y and if kernel is booting after a panic of
+ * previous kernel.
+ */
+
 static inline int is_kdump_kernel(void)
 {
 	return (elfcorehdr_addr != ELFCORE_ADDR_MAX) ? 1 : 0;
-- 
cgit v1.2.3


From 85a0ee342e0c06c19d78fdf48307211c6cf18fcb Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@verge.net.au>
Date: Sat, 18 Oct 2008 20:28:29 -0700
Subject: kdump: add is_vmcore_usable() and vmcore_unusable()

The usage of elfcorehdr_addr has changed recently such that being set to
ELFCORE_ADDR_MAX is used by is_kdump_kernel() to indicate if the code is
executing in a kernel executed as a crash kernel.

However, arch/ia64/kernel/setup.c:reserve_elfcorehdr will reset
elfcorehdr_addr to ELFCORE_ADDR_MAX on error, which means any subsequent
calls to is_kdump_kernel() will return 0, even though they should return
1.

Ok, at this point in time there are no subsequent calls, but I think it's
fair to say that there is ample scope for error or at the very least
confusion.

This patch adds an extra state, ELFCORE_ADDR_ERR, which indicates that
elfcorehdr_addr was passed on the command line, and thus execution is
taking place in a crashdump kernel, but vmcore can't be used for some
reason.  This is tested for using is_vmcore_usable() and set using
vmcore_unusable().  A subsequent patch makes use of this new code.

To summarise, the states that elfcorehdr_addr can now be in are as follows:

ELFCORE_ADDR_MAX: not a crashdump kernel
ELFCORE_ADDR_ERR: crashdump kernel but vmcore is unusable
any other value:  crash dump kernel and vmcore is usable

Signed-off-by: Simon Horman <horms@verge.net.au>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/crash_dump.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h
index de027d1db74..0acf3b737e2 100644
--- a/include/linux/crash_dump.h
+++ b/include/linux/crash_dump.h
@@ -8,6 +8,7 @@
 #include <linux/proc_fs.h>
 
 #define ELFCORE_ADDR_MAX	(-1ULL)
+#define ELFCORE_ADDR_ERR	(-2ULL)
 
 extern unsigned long long elfcorehdr_addr;
 
@@ -38,6 +39,29 @@ static inline int is_kdump_kernel(void)
 {
 	return (elfcorehdr_addr != ELFCORE_ADDR_MAX) ? 1 : 0;
 }
+
+/* is_vmcore_usable() checks if the kernel is booting after a panic and
+ * the vmcore region is usable.
+ *
+ * This makes use of the fact that due to alignment -2ULL is not
+ * a valid pointer, much in the vain of IS_ERR(), except
+ * dealing directly with an unsigned long long rather than a pointer.
+ */
+
+static inline int is_vmcore_usable(void)
+{
+	return is_kdump_kernel() && elfcorehdr_addr != ELFCORE_ADDR_ERR ? 1 : 0;
+}
+
+/* vmcore_unusable() marks the vmcore as unusable,
+ * without disturbing the logic of is_kdump_kernel()
+ */
+
+static inline void vmcore_unusable(void)
+{
+	if (is_kdump_kernel())
+		elfcorehdr_addr = ELFCORE_ADDR_ERR;
+}
 #else /* !CONFIG_CRASH_DUMP */
 static inline int is_kdump_kernel(void) { return 0; }
 #endif /* CONFIG_CRASH_DUMP */
-- 
cgit v1.2.3


From b8e465f4945bc0e9f324e3bbe15f5180a8e9a6fe Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Sat, 18 Oct 2008 20:28:35 -0700
Subject: byteorder: add new headers for make headers-install

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/Kbuild | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index bf9aca548f1..e531783e5d7 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -183,6 +183,7 @@ unifdef-y += auto_fs.h
 unifdef-y += auxvec.h
 unifdef-y += binfmts.h
 unifdef-y += blktrace_api.h
+unifdef-y += byteorder.h
 unifdef-y += capability.h
 unifdef-y += capi.h
 unifdef-y += cciss_ioctl.h
@@ -340,6 +341,7 @@ unifdef-y += soundcard.h
 unifdef-y += stat.h
 unifdef-y += stddef.h
 unifdef-y += string.h
+unifdef-y += swab.h
 unifdef-y += synclink.h
 unifdef-y += sysctl.h
 unifdef-y += tcp.h
-- 
cgit v1.2.3


From acf0108a84edae22b99655eb2f6f6c9f7ec4d449 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Sat, 18 Oct 2008 20:28:36 -0700
Subject: byteorder: use generic C version for value byteswapping

This makes the new implementation of the byteorder helpers match the old
in how it degraded when an arch-defined version was not available:

1) swab()
	- look for arch defined
	- if not, use generic c version

2) swabp()
	- look for arch-defined
	- if not, deref pointer and use swab()

3) swabs()
	- look for arch defined
	- if not, use swabp

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swab.h | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swab.h b/include/linux/swab.h
index 270d5c208a8..bbed279f3b3 100644
--- a/include/linux/swab.h
+++ b/include/linux/swab.h
@@ -47,8 +47,6 @@ static inline __attribute_const__ __u16 ___swab16(__u16 val)
 {
 #ifdef __arch_swab16
 	return __arch_swab16(val);
-#elif defined(__arch_swab16p)
-	return __arch_swab16p(&val);
 #else
 	return __const_swab16(val);
 #endif
@@ -58,8 +56,6 @@ static inline __attribute_const__ __u32 ___swab32(__u32 val)
 {
 #ifdef __arch_swab32
 	return __arch_swab32(val);
-#elif defined(__arch_swab32p)
-	return __arch_swab32p(&val);
 #else
 	return __const_swab32(val);
 #endif
@@ -69,8 +65,6 @@ static inline __attribute_const__ __u64 ___swab64(__u64 val)
 {
 #ifdef __arch_swab64
 	return __arch_swab64(val);
-#elif defined(__arch_swab64p)
-	return __arch_swab64p(&val);
 #elif defined(__SWAB_64_THRU_32__)
 	__u32 h = val >> 32;
 	__u32 l = val & ((1ULL << 32) - 1);
@@ -84,8 +78,6 @@ static inline __attribute_const__ __u32 ___swahw32(__u32 val)
 {
 #ifdef __arch_swahw32
 	return __arch_swahw32(val);
-#elif defined(__arch_swahw32p)
-	return __arch_swahw32p(&val);
 #else
 	return __const_swahw32(val);
 #endif
@@ -95,8 +87,6 @@ static inline __attribute_const__ __u32 ___swahb32(__u32 val)
 {
 #ifdef __arch_swahb32
 	return __arch_swahb32(val);
-#elif defined(__arch_swahb32p)
-	return __arch_swahb32p(&val);
 #else
 	return __const_swahb32(val);
 #endif
-- 
cgit v1.2.3


From 1d8cca44b6a244b7e378546d719041819049a0f9 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Sat, 18 Oct 2008 20:28:37 -0700
Subject: byteorder: provide swabb.h generically in asm/byteorder.h

This is needed during the transition to the new byteorder headers as the
swabb.h functionality will be provided from asm/byteorder.h in the new
version.  To avoid breakage on arches still using the old implementation,
provide swabb.h from asm/byteorder.h as well.

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/byteorder/Kbuild          | 1 +
 include/linux/byteorder/big_endian.h    | 1 +
 include/linux/byteorder/little_endian.h | 1 +
 3 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/byteorder/Kbuild b/include/linux/byteorder/Kbuild
index 1133d5f9d81..fbaa7f9cee3 100644
--- a/include/linux/byteorder/Kbuild
+++ b/include/linux/byteorder/Kbuild
@@ -1,3 +1,4 @@
 unifdef-y += big_endian.h
 unifdef-y += little_endian.h
 unifdef-y += swab.h
+unifdef-y += swabb.h
diff --git a/include/linux/byteorder/big_endian.h b/include/linux/byteorder/big_endian.h
index 44f95b92393..1cba3f3efe5 100644
--- a/include/linux/byteorder/big_endian.h
+++ b/include/linux/byteorder/big_endian.h
@@ -10,6 +10,7 @@
 
 #include <linux/types.h>
 #include <linux/byteorder/swab.h>
+#include <linux/byteorder/swabb.h>
 
 #define __constant_htonl(x) ((__force __be32)(__u32)(x))
 #define __constant_ntohl(x) ((__force __u32)(__be32)(x))
diff --git a/include/linux/byteorder/little_endian.h b/include/linux/byteorder/little_endian.h
index 4cc170a3176..cedc1b5a289 100644
--- a/include/linux/byteorder/little_endian.h
+++ b/include/linux/byteorder/little_endian.h
@@ -10,6 +10,7 @@
 
 #include <linux/types.h>
 #include <linux/byteorder/swab.h>
+#include <linux/byteorder/swabb.h>
 
 #define __constant_htonl(x) ((__force __be32)___constant_swab32((x)))
 #define __constant_ntohl(x) ___constant_swab32((__force __be32)(x))
-- 
cgit v1.2.3


From fdd2e5f88a259a537bb239e0c03c973cb6ea402a Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Sat, 18 Oct 2008 20:28:38 -0700
Subject: make mm/rmap.c:anon_vma_cachep static

This patch makes the needlessly global anon_vma_cachep static.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/rmap.h | 12 ------------
 1 file changed, 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 1da48db8db0..89f0564b10c 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -39,18 +39,6 @@ struct anon_vma {
 
 #ifdef CONFIG_MMU
 
-extern struct kmem_cache *anon_vma_cachep;
-
-static inline struct anon_vma *anon_vma_alloc(void)
-{
-	return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
-}
-
-static inline void anon_vma_free(struct anon_vma *anon_vma)
-{
-	kmem_cache_free(anon_vma_cachep, anon_vma);
-}
-
 static inline void anon_vma_lock(struct vm_area_struct *vma)
 {
 	struct anon_vma *anon_vma = vma->anon_vma;
-- 
cgit v1.2.3


From a0098efd6ee4e8c04d82d761aa1bb9ec7a0aa32d Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Sat, 18 Oct 2008 20:28:47 -0700
Subject: remove the obsolete BCD*BIN/BIN*BCD macros

Remove the following obsolete macros:

- BCD2BIN
- BIN2BCD
- BCD_TO_BIN
- BIN_TO_BCD

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bcd.h | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bcd.h b/include/linux/bcd.h
index 7ac518e3c15..75f6d6e699a 100644
--- a/include/linux/bcd.h
+++ b/include/linux/bcd.h
@@ -15,11 +15,4 @@
 unsigned bcd2bin(unsigned char val) __attribute_const__;
 unsigned char bin2bcd(unsigned val) __attribute_const__;
 
-#define BCD2BIN(val)	bcd2bin(val)
-#define BIN2BCD(val)	bin2bcd(val)
-
-/* backwards compat */
-#define BCD_TO_BIN(val) ((val)=BCD2BIN(val))
-#define BIN_TO_BCD(val) ((val)=BIN2BCD(val))
-
 #endif /* _BCD_H */
-- 
cgit v1.2.3


From 5a85a7dda15f88b7f9c96c67fe826b5d0486d601 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Sat, 18 Oct 2008 20:28:48 -0700
Subject: include/linux/bcd.h: remove comments

- the macros are gone
- there's no more code in this file,
  LGPL + GPL = GPL,
  and the code that was moved to lib/bcd.c is anyway trivial

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bcd.h | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bcd.h b/include/linux/bcd.h
index 75f6d6e699a..22ea563ba3e 100644
--- a/include/linux/bcd.h
+++ b/include/linux/bcd.h
@@ -1,12 +1,3 @@
-/* Permission is hereby granted to copy, modify and redistribute this code
- * in terms of the GNU Library General Public License, Version 2 or later,
- * at your option.
- */
-
-/* macros to translate to/from binary and binary-coded decimal (frequently
- * found in RTC chips).
- */
-
 #ifndef _BCD_H
 #define _BCD_H
 
-- 
cgit v1.2.3


From 01e8ef11bc1a74e65678ed55795f59266d4add01 Mon Sep 17 00:00:00 2001
From: Parag Warudkar <parag.lkml@gmail.com>
Date: Sat, 18 Oct 2008 20:28:50 -0700
Subject: x86: sysfs: kill owner field from attribute

Tejun's commit 7b595756ec1f49e0049a9e01a1298d53a7faaa15 made sysfs
attribute->owner unnecessary.  But the field was left in the structure to
ease the merge.  It's been over a year since that change and it is now
time to start killing attribute->owner along with its users - one arch at
a time!

This patch is attempt #1 to get rid of attribute->owner only for
CONFIG_X86_64 or CONFIG_X86_32.  We will deal with other arches later on
as and when possible - avr32 will be the next since that is something I
can test.  Compile (make allyesconfig / make allmodconfig / custom config)
and boot tested.

akpm: the idea is that we put the declaration of attribute.owner inside
`#ifndef CONFIG_X86'.  But that proved to be too ambitious for now because
new usages kept on turning up in subsystem trees.

[akpm: remove the ifdef for now]
Signed-off-by: Parag Warudkar <parag.lkml@gmail.com>
Cc: Greg KH <greg@kroah.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Tejun Heo <htejun@gmail.com>
Cc: Len Brown <lenb@kernel.org>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Jean Delvare <khali@linux-fr.org>
Cc: Roland Dreier <rolandd@cisco.com>
Cc: David Brownell <david-b@pacbell.net>
Cc: Alessandro Zummo <a.zummo@towertech.it>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sysfs.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index b330e289d71..9d68fed50f1 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -21,8 +21,9 @@ struct kobject;
 struct module;
 
 /* FIXME
- * The *owner field is no longer used, but leave around
- * until the tree gets cleaned up fully.
+ * The *owner field is no longer used.
+ * x86 tree has been cleaned up. The owner
+ * attribute is still left for other arches.
  */
 struct attribute {
 	const char		*name;
-- 
cgit v1.2.3


From edbc25caaa492a82e19baa915f1f6b0a0db6554d Mon Sep 17 00:00:00 2001
From: Milton Miller <miltonm@bga.com>
Date: Thu, 10 Jul 2008 16:29:37 -0500
Subject: PCI: remove dynids.use_driver_data

The driver flag dynids.use_driver_data is almost consistently not set,
and causes more problems than it solves.  It was initially intended as a
flag to indicate whether a driver's usage of driver_data had been
carefully inspected and was ready for values from userspace.  That audit
was never done, so most drivers just get a 0 for driver_data when new
IDs are added from userspace via sysfs.  So remove the flag, allowing
drivers to see the data directly (a followon patch validates the passed
driver_data value against what the drivers expect).

Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
Acked-by: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Milton Miller <miltonm@bga.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 include/linux/pci.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index acf8f24037c..c989f58d09b 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -347,7 +347,6 @@ struct pci_bus_region {
 struct pci_dynids {
 	spinlock_t lock;            /* protects list, index */
 	struct list_head list;      /* for IDs added at runtime */
-	unsigned int use_driver_data:1; /* pci_device_id->driver_data is used */
 };
 
 /* ---------------------------------------------------------------- */
-- 
cgit v1.2.3


From 0235c4fc7fc6f621dc0dd89eba102ad5aa373390 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 18 Aug 2008 21:38:00 +0200
Subject: PCI PM: Introduce function pci_wake_from_d3

Many device drivers use the following sequence of statements to enable
the device to wake up the system while being in the D3_hot or D3_cold
low power state:

        pci_enable_wake(pdev, PCI_D3hot, 1);
        pci_enable_wake(pdev, PCI_D3cold, 1);

However, the second call is not necessary if the first one succeeds (the
ordering of the statements above doesn't matter here) and it may even be
harmful, because we are not supposed to enable PME# after the wake-up
power has been enabled for the device.

To allow drivers to overcome this problem, introduce function
pci_wake_from_d3() that will enable the device to wake up the system
from any of D3_hot and D3_cold as long as the wake-up from at least one
of them is supported.

Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 include/linux/pci.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index c989f58d09b..f7e7dbc0919 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -644,6 +644,7 @@ pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state);
 bool pci_pme_capable(struct pci_dev *dev, pci_power_t state);
 void pci_pme_active(struct pci_dev *dev, bool enable);
 int pci_enable_wake(struct pci_dev *dev, pci_power_t state, int enable);
+int pci_wake_from_d3(struct pci_dev *dev, bool enable);
 pci_power_t pci_target_state(struct pci_dev *dev);
 int pci_prepare_to_sleep(struct pci_dev *dev);
 int pci_back_from_sleep(struct pci_dev *dev);
-- 
cgit v1.2.3


From 16dbef4a831782466b10d4ae56837c5ba17d1948 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Fri, 15 Aug 2008 19:36:45 -0700
Subject: PCI: change MSI-x vector to 32bit

We are using 28bit pci (bus/dev/fn + 12 bits) as irq number, so the
cache for irq number should be 32 bit too.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Cc: Andrew Vasquez <andrew.vasquez@qlogic.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 include/linux/pci.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index f7e7dbc0919..8a4d0bebc31 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -725,7 +725,7 @@ enum pci_dma_burst_strategy {
 };
 
 struct msix_entry {
-	u16 	vector;	/* kernel uses to write allocated vector */
+	u32	vector;	/* kernel uses to write allocated vector */
 	u16	entry;	/* driver uses to specify entry, OS writes */
 };
 
-- 
cgit v1.2.3


From 37a84ec668ba251ae02cf2c2c664baf6b247ae1f Mon Sep 17 00:00:00 2001
From: Seth Heasley <seth.heasley@intel.com>
Date: Thu, 28 Aug 2008 15:40:59 -0700
Subject: x86/PCI: irq and pci_ids patch for Intel Ibex Peak DeviceIDs

This patch updates the Intel Ibex Peak (PCH) LPC and SMBus Controller
DeviceIDs.

The LPC Controller ID is set by Firmware within the range of
0x3b00-3b1f.  This range is included in pci_ids.h using min and max
values, and irq.c now has code to handle the range (in lieu of 32
additions to a SWITCH statement).

The SMBus Controller ID is a fixed-value and will not change.

Signed-off-by: Seth Heasley <seth.heasley@intel.com>
Acked-by: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 include/linux/pci_ids.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 8edddc240e4..e5d344bfcb7 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2454,9 +2454,9 @@
 #define PCI_DEVICE_ID_INTEL_ICH10_3	0x3a1a
 #define PCI_DEVICE_ID_INTEL_ICH10_4	0x3a30
 #define PCI_DEVICE_ID_INTEL_ICH10_5	0x3a60
-#define PCI_DEVICE_ID_INTEL_PCH_0	0x3b10
-#define PCI_DEVICE_ID_INTEL_PCH_1	0x3b11
-#define PCI_DEVICE_ID_INTEL_PCH_2	0x3b30
+#define PCI_DEVICE_ID_INTEL_PCH_LPC_MIN	0x3b00
+#define PCI_DEVICE_ID_INTEL_PCH_LPC_MAX	0x3b1f
+#define PCI_DEVICE_ID_INTEL_PCH_SMBUS	0x3b30
 #define PCI_DEVICE_ID_INTEL_IOAT_SNB	0x402f
 #define PCI_DEVICE_ID_INTEL_5100_16	0x65f0
 #define PCI_DEVICE_ID_INTEL_5100_21	0x65f5
-- 
cgit v1.2.3


From c322b28a04c084a467a862766f74c40c917a721c Mon Sep 17 00:00:00 2001
From: "Zhao, Yu" <yu.zhao@intel.com>
Date: Mon, 13 Oct 2008 19:36:05 +0800
Subject: PCI: use same arg names in PCI_VDEVICE comment

This cleanup makes the argument names in PCI_VDEVICE comment consistent
with those used in its definition.

Signed-off-by: Yu Zhao <yu.zhao@intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 include/linux/pci.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 8a4d0bebc31..008005674b6 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -455,8 +455,8 @@ struct pci_driver {
 
 /**
  * PCI_VDEVICE - macro used to describe a specific pci device in short form
- * @vend: the vendor name
- * @dev: the 16 bit PCI Device ID
+ * @vendor: the vendor name
+ * @device: the 16 bit PCI Device ID
  *
  * This macro is used to create a struct pci_device_id that matches a
  * specific PCI device.  The subvendor, and subdevice fields will be set
-- 
cgit v1.2.3


From 58c3a727cb73b75a9104d295f096cca12959a5a5 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yu.zhao@intel.com>
Date: Tue, 14 Oct 2008 14:02:53 +0800
Subject: PCI: support PCIe ARI capability

This patch adds support for PCI Express Alternative Routing-ID
Interpretation (ARI) capability.

The ARI capability extends the Function Number field of the PCI Express
Endpoint by reusing the Device Number which is otherwise hardwired to 0.
With ARI, an Endpoint can have up to 256 functions.

Signed-off-by: Yu Zhao <yu.zhao@intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 include/linux/pci.h      |  1 +
 include/linux/pci_regs.h | 14 ++++++++++++++
 2 files changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 008005674b6..7e9a1f0715e 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -214,6 +214,7 @@ struct pci_dev {
 	unsigned int	broken_parity_status:1;	/* Device generates false positive parity */
 	unsigned int 	msi_enabled:1;
 	unsigned int	msix_enabled:1;
+	unsigned int	ari_enabled:1;	/* ARI forwarding */
 	unsigned int	is_managed:1;
 	unsigned int	is_pcie:1;
 	pci_dev_flags_t dev_flags;
diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h
index 450684f7eaa..eb6686b88f9 100644
--- a/include/linux/pci_regs.h
+++ b/include/linux/pci_regs.h
@@ -419,6 +419,10 @@
 #define  PCI_EXP_RTCTL_CRSSVE	0x10	/* CRS Software Visibility Enable */
 #define PCI_EXP_RTCAP		30	/* Root Capabilities */
 #define PCI_EXP_RTSTA		32	/* Root Status */
+#define PCI_EXP_DEVCAP2		36	/* Device Capabilities 2 */
+#define  PCI_EXP_DEVCAP2_ARI	0x20	/* Alternative Routing-ID */
+#define PCI_EXP_DEVCTL2		40	/* Device Control 2 */
+#define  PCI_EXP_DEVCTL2_ARI	0x20	/* Alternative Routing-ID */
 
 /* Extended Capabilities (PCI-X 2.0 and Express) */
 #define PCI_EXT_CAP_ID(header)		(header & 0x0000ffff)
@@ -429,6 +433,7 @@
 #define PCI_EXT_CAP_ID_VC	2
 #define PCI_EXT_CAP_ID_DSN	3
 #define PCI_EXT_CAP_ID_PWR	4
+#define PCI_EXT_CAP_ID_ARI	14
 
 /* Advanced Error Reporting */
 #define PCI_ERR_UNCOR_STATUS	4	/* Uncorrectable Error Status */
@@ -536,5 +541,14 @@
 #define HT_CAPTYPE_GEN3		0xD0	/* Generation 3 hypertransport configuration */
 #define HT_CAPTYPE_PM		0xE0	/* Hypertransport powermanagement configuration */
 
+/* Alternative Routing-ID Interpretation */
+#define PCI_ARI_CAP		0x04	/* ARI Capability Register */
+#define  PCI_ARI_CAP_MFVC	0x0001	/* MFVC Function Groups Capability */
+#define  PCI_ARI_CAP_ACS	0x0002	/* ACS Function Groups Capability */
+#define  PCI_ARI_CAP_NFN(x)	(((x) >> 8) & 0xff) /* Next Function Number */
+#define PCI_ARI_CTRL		0x06	/* ARI Control Register */
+#define  PCI_ARI_CTRL_MFVC	0x0001	/* MFVC Function Groups Enable */
+#define  PCI_ARI_CTRL_ACS	0x0002	/* ACS Function Groups Enable */
+#define  PCI_ARI_CTRL_FG(x)	(((x) >> 4) & 7) /* Function Group */
 
 #endif /* LINUX_PCI_REGS_H */
-- 
cgit v1.2.3


From aa42d7c6138afdc54f74e971456a0fbfec16b77b Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Sun, 28 Sep 2008 16:36:11 -0700
Subject: PCI: introduce an pci_ioremap(pdev, barnr) function

A common thing in many PCI drivers is to ioremap() an entire bar.  This
is a slightly fragile thing right now, needing both an address and a
size, and many driver writers do.. various things there.

This patch introduces a pci_ioremap() function taking just a PCI device
struct and the bar number as arguments, and figures this all out itself,
in one place.  In addition, we can add various sanity checks to this
function (the patch already checks to make sure that the bar in question
really is a MEM bar; few to no drivers do that sort of thing).

Hopefully with this type of API we get less chance of mistakes in
drivers with ioremap() operations.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 include/linux/pci.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 7e9a1f0715e..46ad282ffe4 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1119,5 +1119,18 @@ static inline void pci_mmcfg_early_init(void) { }
 static inline void pci_mmcfg_late_init(void) { }
 #endif
 
+static inline void * pci_ioremap_bar(struct pci_dev *pdev, int bar)
+{
+	/*
+	 * Make sure the BAR is actually a memory resource, not an IO resource
+	 */
+	if (!(pci_resource_flags(pdev, bar) & IORESOURCE_MEM)) {
+		WARN_ON(1);
+		return NULL;
+	}
+	return ioremap_nocache(pci_resource_start(pdev, bar),
+				     pci_resource_len(pdev, bar));
+}
+
 #endif /* __KERNEL__ */
 #endif /* LINUX_PCI_H */
-- 
cgit v1.2.3


From 0927678f55c9a50c296f7e6dae85e87b8236e155 Mon Sep 17 00:00:00 2001
From: Jesse Barnes <jbarnes@virtuousgeek.org>
Date: Sat, 18 Oct 2008 17:33:19 -0700
Subject: PCI: use pci_find_ext_capability everywhere

Remove some open coded (and buggy) versions of pci_find_ext_capability
in favor of the real routine in the PCI core.

Tested-by: Tomasz Czernecki <czernecki@gmail.com>
Acked-by: Andrew Vasquez <andrew.vasquez@qlogic.com>
Reviewed-by: Matthew Wilcox <willy@linux.intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 include/linux/aer.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/aer.h b/include/linux/aer.h
index f2518141de8..a2383a72356 100644
--- a/include/linux/aer.h
+++ b/include/linux/aer.h
@@ -18,10 +18,6 @@ static inline int pci_enable_pcie_error_reporting(struct pci_dev *dev)
 {
 	return -EINVAL;
 }
-static inline int pci_find_aer_capability(struct pci_dev *dev)
-{
-	return 0;
-}
 static inline int pci_disable_pcie_error_reporting(struct pci_dev *dev)
 {
 	return -EINVAL;
-- 
cgit v1.2.3


From 270c66be9b4a6f2be53ef3aec5dc8e7b07782ec9 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yu.zhao@intel.com>
Date: Sun, 19 Oct 2008 20:35:20 +0800
Subject: PCI: fix AER capability check

The 'use pci_find_ext_capability everywhere' cleanup introduced a new bug
that makes AER stop working.  Fix it by actually using find_ext_cap
instead of just find_cap.  Drop the unused config space size define while
we're at it.

Signed-off-by: Yu Zhao <yu.zhao@intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 include/linux/aer.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/aer.h b/include/linux/aer.h
index a2383a72356..f7df1eefc10 100644
--- a/include/linux/aer.h
+++ b/include/linux/aer.h
@@ -10,7 +10,6 @@
 #if defined(CONFIG_PCIEAER)
 /* pci-e port driver needs this function to enable aer */
 extern int pci_enable_pcie_error_reporting(struct pci_dev *dev);
-extern int pci_find_aer_capability(struct pci_dev *dev);
 extern int pci_disable_pcie_error_reporting(struct pci_dev *dev);
 extern int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev);
 #else
-- 
cgit v1.2.3


From 96499871f45b9126157b1a5c512d6e30f1635225 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Mon, 20 Oct 2008 19:45:43 +0200
Subject: PCI: fix pci_ioremap_bar() on s390

s390 doesn't have ioremap_*, so protect the definition of the new
pci_ioremap_bar function with CONFIG_HAS_IOMEM to avoid build breakage.

Acked-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 include/linux/pci.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 46ad282ffe4..085187be29c 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1119,6 +1119,7 @@ static inline void pci_mmcfg_early_init(void) { }
 static inline void pci_mmcfg_late_init(void) { }
 #endif
 
+#ifdef CONFIG_HAS_IOMEM
 static inline void * pci_ioremap_bar(struct pci_dev *pdev, int bar)
 {
 	/*
@@ -1131,6 +1132,7 @@ static inline void * pci_ioremap_bar(struct pci_dev *pdev, int bar)
 	return ioremap_nocache(pci_resource_start(pdev, bar),
 				     pci_resource_len(pdev, bar));
 }
+#endif
 
 #endif /* __KERNEL__ */
 #endif /* LINUX_PCI_H */
-- 
cgit v1.2.3