From 1cf24bdbbbd2eb5439796dc399ab1649d150ed1d Mon Sep 17 00:00:00 2001
From: Jack Steiner <steiner@sgi.com>
Date: Wed, 22 Nov 2006 09:55:08 -0600
Subject: [IA64] - Reduce overhead of FP exception logging messages

Improve the scalability of the fpswa code that rate-limits
logging of messages.

There are 2 distinctly different problems in this code.

1) If prctl is used to disable logging, last_time is never
   updated. The result is that fpu_swa_count is zeroed out on
   EVERY fp fault. This causes a very very hot cache line.
   The fix reduces the wallclock time of a 1024p FP exception test
   from 28734 sec to 19 sec!!!

2) On VERY large systems, excessive messages are logged because
   multiple cpus can each reset or increment fpu_swa_count at
   about the same time. The result is that hundreds of messages
   are logged each second. The fixes reduces the logging rate
   to ~1 per second.

Signed-off-by: Jack Steiner <steiner@sgi.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/kernel/traps.c | 50 ++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 40 insertions(+), 10 deletions(-)

(limited to 'arch/ia64')

diff --git a/arch/ia64/kernel/traps.c b/arch/ia64/kernel/traps.c
index fffa9e0826b..ab684747036 100644
--- a/arch/ia64/kernel/traps.c
+++ b/arch/ia64/kernel/traps.c
@@ -307,6 +307,15 @@ fp_emulate (int fp_fault, void *bundle, long *ipsr, long *fpsr, long *isr, long
 	return ret.status;
 }
 
+struct fpu_swa_msg {
+	unsigned long count;
+	unsigned long time;
+};
+static DEFINE_PER_CPU(struct fpu_swa_msg, cpulast);
+DECLARE_PER_CPU(struct fpu_swa_msg, cpulast);
+static struct fpu_swa_msg last __cacheline_aligned;
+
+
 /*
  * Handle floating-point assist faults and traps.
  */
@@ -316,8 +325,6 @@ handle_fpu_swa (int fp_fault, struct pt_regs *regs, unsigned long isr)
 	long exception, bundle[2];
 	unsigned long fault_ip;
 	struct siginfo siginfo;
-	static int fpu_swa_count = 0;
-	static unsigned long last_time;
 
 	fault_ip = regs->cr_iip;
 	if (!fp_fault && (ia64_psr(regs)->ri == 0))
@@ -325,14 +332,37 @@ handle_fpu_swa (int fp_fault, struct pt_regs *regs, unsigned long isr)
 	if (copy_from_user(bundle, (void __user *) fault_ip, sizeof(bundle)))
 		return -1;
 
-	if (jiffies - last_time > 5*HZ)
-		fpu_swa_count = 0;
-	if ((fpu_swa_count < 4) && !(current->thread.flags & IA64_THREAD_FPEMU_NOPRINT)) {
-		last_time = jiffies;
-		++fpu_swa_count;
-		printk(KERN_WARNING
-		       "%s(%d): floating-point assist fault at ip %016lx, isr %016lx\n",
-		       current->comm, current->pid, regs->cr_iip + ia64_psr(regs)->ri, isr);
+	if (!(current->thread.flags & IA64_THREAD_FPEMU_NOPRINT))  {
+		unsigned long count, current_jiffies = jiffies;
+		struct fpu_swa_msg *cp = &__get_cpu_var(cpulast);
+
+		if (unlikely(current_jiffies > cp->time))
+			cp->count = 0;
+		if (unlikely(cp->count < 5)) {
+			cp->count++;
+			cp->time = current_jiffies + 5 * HZ;
+
+			/* minimize races by grabbing a copy of count BEFORE checking last.time. */
+			count = last.count;
+			barrier();
+
+			/*
+			 * Lower 4 bits are used as a count. Upper bits are a sequence
+			 * number that is updated when count is reset. The cmpxchg will
+			 * fail is seqno has changed. This minimizes mutiple cpus
+			 * reseting the count.
+			 */
+			if (current_jiffies > last.time)
+				(void) cmpxchg_acq(&last.count, count, 16 + (count & ~15));
+
+			/* used fetchadd to atomically update the count */
+			if ((last.count & 15) < 5 && (ia64_fetchadd(1, &last.count, acq) & 15) < 5) {
+				last.time = current_jiffies + 5 * HZ;
+				printk(KERN_WARNING
+		       			"%s(%d): floating-point assist fault at ip %016lx, isr %016lx\n",
+		       			current->comm, current->pid, regs->cr_iip + ia64_psr(regs)->ri, isr);
+			}
+		}
 	}
 
 	exception = fp_emulate(fp_fault, bundle, &regs->cr_ipsr, &regs->ar_fpsr, &isr, &regs->pr,
-- 
cgit v1.2.3