diff options
Diffstat (limited to 'arch/powerpc/kernel')
37 files changed, 5632 insertions, 147 deletions
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 9a74b7ab03a..9ed551b6c17 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -12,12 +12,12 @@ CFLAGS_btext.o += -fPIC endif obj-y := semaphore.o cputable.o ptrace.o syscalls.o \ - irq.o signal_32.o pmc.o vdso.o + irq.o align.o signal_32.o pmc.o vdso.o obj-y += vdso32/ obj-$(CONFIG_PPC64) += setup_64.o binfmt_elf32.o sys_ppc32.o \ signal_64.o ptrace32.o systbl.o \ paca.o ioctl32.o cpu_setup_power4.o \ - firmware.o sysfs.o udbg.o + firmware.o sysfs.o udbg.o idle_64.o obj-$(CONFIG_PPC64) += vdso64/ obj-$(CONFIG_ALTIVEC) += vecemu.o vector.o obj-$(CONFIG_POWER4) += idle_power4.o @@ -25,7 +25,7 @@ obj-$(CONFIG_PPC_OF) += of_device.o procfs-$(CONFIG_PPC64) := proc_ppc64.o obj-$(CONFIG_PROC_FS) += $(procfs-y) rtaspci-$(CONFIG_PPC64) := rtas_pci.o -obj-$(CONFIG_PPC_RTAS) += rtas.o $(rtaspci-y) +obj-$(CONFIG_PPC_RTAS) += rtas.o rtas-rtc.o $(rtaspci-y) obj-$(CONFIG_RTAS_FLASH) += rtas_flash.o obj-$(CONFIG_RTAS_PROC) += rtas-proc.o obj-$(CONFIG_LPARCFG) += lparcfg.o @@ -35,6 +35,7 @@ obj-$(CONFIG_PPC_PSERIES) += udbg_16550.o obj-$(CONFIG_PPC_MAPLE) += udbg_16550.o udbgscc-$(CONFIG_PPC64) := udbg_scc.o obj-$(CONFIG_PPC_PMAC) += $(udbgscc-y) +obj64-$(CONFIG_PPC_MULTIPLATFORM) += nvram_64.o ifeq ($(CONFIG_PPC_MERGE),y) @@ -49,12 +50,23 @@ extra-y += vmlinux.lds obj-y += process.o init_task.o time.o \ prom.o traps.o setup-common.o obj-$(CONFIG_PPC32) += entry_32.o setup_32.o misc_32.o systbl.o -obj-$(CONFIG_PPC64) += misc_64.o +obj-$(CONFIG_PPC64) += misc_64.o dma_64.o iommu.o obj-$(CONFIG_PPC_OF) += prom_init.o obj-$(CONFIG_MODULES) += ppc_ksyms.o obj-$(CONFIG_BOOTX_TEXT) += btext.o obj-$(CONFIG_6xx) += idle_6xx.o obj-$(CONFIG_SMP) += smp.o +obj-$(CONFIG_KPROBES) += kprobes.o + +module-$(CONFIG_PPC64) += module_64.o +obj-$(CONFIG_MODULES) += $(module-y) + +pci64-$(CONFIG_PPC64) += pci_64.o pci_dn.o pci_iommu.o \ + pci_direct_iommu.o iomap.o +obj-$(CONFIG_PCI) += $(pci64-y) + +kexec64-$(CONFIG_PPC64) += machine_kexec_64.o +obj-$(CONFIG_KEXEC) += $(kexec64-y) ifeq ($(CONFIG_PPC_ISERIES),y) $(obj)/head_64.o: $(obj)/lparmap.s @@ -62,13 +74,12 @@ AFLAGS_head_64.o += -I$(obj) endif else -# stuff used from here for ARCH=ppc or ARCH=ppc64 +# stuff used from here for ARCH=ppc smpobj-$(CONFIG_SMP) += smp.o -obj-$(CONFIG_PPC64) += traps.o process.o init_task.o time.o \ - setup-common.o $(smpobj-y) - endif +obj-$(CONFIG_PPC64) += $(obj64-y) + extra-$(CONFIG_PPC_FPU) += fpu.o extra-$(CONFIG_PPC64) += entry_64.o diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c new file mode 100644 index 00000000000..faaec9c6f78 --- /dev/null +++ b/arch/powerpc/kernel/align.c @@ -0,0 +1,530 @@ +/* align.c - handle alignment exceptions for the Power PC. + * + * Copyright (c) 1996 Paul Mackerras <paulus@cs.anu.edu.au> + * Copyright (c) 1998-1999 TiVo, Inc. + * PowerPC 403GCX modifications. + * Copyright (c) 1999 Grant Erickson <grant@lcse.umn.edu> + * PowerPC 403GCX/405GP modifications. + * Copyright (c) 2001-2002 PPC64 team, IBM Corp + * 64-bit and Power4 support + * Copyright (c) 2005 Benjamin Herrenschmidt, IBM Corp + * <benh@kernel.crashing.org> + * Merge ppc32 and ppc64 implementations + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/mm.h> +#include <asm/processor.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/cache.h> +#include <asm/cputable.h> + +struct aligninfo { + unsigned char len; + unsigned char flags; +}; + +#define IS_XFORM(inst) (((inst) >> 26) == 31) +#define IS_DSFORM(inst) (((inst) >> 26) >= 56) + +#define INVALID { 0, 0 } + +#define LD 1 /* load */ +#define ST 2 /* store */ +#define SE 4 /* sign-extend value */ +#define F 8 /* to/from fp regs */ +#define U 0x10 /* update index register */ +#define M 0x20 /* multiple load/store */ +#define SW 0x40 /* byte swap int or ... */ +#define S 0x40 /* ... single-precision fp */ +#define SX 0x40 /* byte count in XER */ +#define HARD 0x80 /* string, stwcx. */ + +#define DCBZ 0x5f /* 8xx/82xx dcbz faults when cache not enabled */ + +#define SWAP(a, b) (t = (a), (a) = (b), (b) = t) + +/* + * The PowerPC stores certain bits of the instruction that caused the + * alignment exception in the DSISR register. This array maps those + * bits to information about the operand length and what the + * instruction would do. + */ +static struct aligninfo aligninfo[128] = { + { 4, LD }, /* 00 0 0000: lwz / lwarx */ + INVALID, /* 00 0 0001 */ + { 4, ST }, /* 00 0 0010: stw */ + INVALID, /* 00 0 0011 */ + { 2, LD }, /* 00 0 0100: lhz */ + { 2, LD+SE }, /* 00 0 0101: lha */ + { 2, ST }, /* 00 0 0110: sth */ + { 4, LD+M }, /* 00 0 0111: lmw */ + { 4, LD+F+S }, /* 00 0 1000: lfs */ + { 8, LD+F }, /* 00 0 1001: lfd */ + { 4, ST+F+S }, /* 00 0 1010: stfs */ + { 8, ST+F }, /* 00 0 1011: stfd */ + INVALID, /* 00 0 1100 */ + { 8, LD }, /* 00 0 1101: ld/ldu/lwa */ + INVALID, /* 00 0 1110 */ + { 8, ST }, /* 00 0 1111: std/stdu */ + { 4, LD+U }, /* 00 1 0000: lwzu */ + INVALID, /* 00 1 0001 */ + { 4, ST+U }, /* 00 1 0010: stwu */ + INVALID, /* 00 1 0011 */ + { 2, LD+U }, /* 00 1 0100: lhzu */ + { 2, LD+SE+U }, /* 00 1 0101: lhau */ + { 2, ST+U }, /* 00 1 0110: sthu */ + { 4, ST+M }, /* 00 1 0111: stmw */ + { 4, LD+F+S+U }, /* 00 1 1000: lfsu */ + { 8, LD+F+U }, /* 00 1 1001: lfdu */ + { 4, ST+F+S+U }, /* 00 1 1010: stfsu */ + { 8, ST+F+U }, /* 00 1 1011: stfdu */ + INVALID, /* 00 1 1100 */ + INVALID, /* 00 1 1101 */ + INVALID, /* 00 1 1110 */ + INVALID, /* 00 1 1111 */ + { 8, LD }, /* 01 0 0000: ldx */ + INVALID, /* 01 0 0001 */ + { 8, ST }, /* 01 0 0010: stdx */ + INVALID, /* 01 0 0011 */ + INVALID, /* 01 0 0100 */ + { 4, LD+SE }, /* 01 0 0101: lwax */ + INVALID, /* 01 0 0110 */ + INVALID, /* 01 0 0111 */ + { 4, LD+M+HARD+SX }, /* 01 0 1000: lswx */ + { 4, LD+M+HARD }, /* 01 0 1001: lswi */ + { 4, ST+M+HARD+SX }, /* 01 0 1010: stswx */ + { 4, ST+M+HARD }, /* 01 0 1011: stswi */ + INVALID, /* 01 0 1100 */ + { 8, LD+U }, /* 01 0 1101: ldu */ + INVALID, /* 01 0 1110 */ + { 8, ST+U }, /* 01 0 1111: stdu */ + { 8, LD+U }, /* 01 1 0000: ldux */ + INVALID, /* 01 1 0001 */ + { 8, ST+U }, /* 01 1 0010: stdux */ + INVALID, /* 01 1 0011 */ + INVALID, /* 01 1 0100 */ + { 4, LD+SE+U }, /* 01 1 0101: lwaux */ + INVALID, /* 01 1 0110 */ + INVALID, /* 01 1 0111 */ + INVALID, /* 01 1 1000 */ + INVALID, /* 01 1 1001 */ + INVALID, /* 01 1 1010 */ + INVALID, /* 01 1 1011 */ + INVALID, /* 01 1 1100 */ + INVALID, /* 01 1 1101 */ + INVALID, /* 01 1 1110 */ + INVALID, /* 01 1 1111 */ + INVALID, /* 10 0 0000 */ + INVALID, /* 10 0 0001 */ + INVALID, /* 10 0 0010: stwcx. */ + INVALID, /* 10 0 0011 */ + INVALID, /* 10 0 0100 */ + INVALID, /* 10 0 0101 */ + INVALID, /* 10 0 0110 */ + INVALID, /* 10 0 0111 */ + { 4, LD+SW }, /* 10 0 1000: lwbrx */ + INVALID, /* 10 0 1001 */ + { 4, ST+SW }, /* 10 0 1010: stwbrx */ + INVALID, /* 10 0 1011 */ + { 2, LD+SW }, /* 10 0 1100: lhbrx */ + { 4, LD+SE }, /* 10 0 1101 lwa */ + { 2, ST+SW }, /* 10 0 1110: sthbrx */ + INVALID, /* 10 0 1111 */ + INVALID, /* 10 1 0000 */ + INVALID, /* 10 1 0001 */ + INVALID, /* 10 1 0010 */ + INVALID, /* 10 1 0011 */ + INVALID, /* 10 1 0100 */ + INVALID, /* 10 1 0101 */ + INVALID, /* 10 1 0110 */ + INVALID, /* 10 1 0111 */ + INVALID, /* 10 1 1000 */ + INVALID, /* 10 1 1001 */ + INVALID, /* 10 1 1010 */ + INVALID, /* 10 1 1011 */ + INVALID, /* 10 1 1100 */ + INVALID, /* 10 1 1101 */ + INVALID, /* 10 1 1110 */ + { 0, ST+HARD }, /* 10 1 1111: dcbz */ + { 4, LD }, /* 11 0 0000: lwzx */ + INVALID, /* 11 0 0001 */ + { 4, ST }, /* 11 0 0010: stwx */ + INVALID, /* 11 0 0011 */ + { 2, LD }, /* 11 0 0100: lhzx */ + { 2, LD+SE }, /* 11 0 0101: lhax */ + { 2, ST }, /* 11 0 0110: sthx */ + INVALID, /* 11 0 0111 */ + { 4, LD+F+S }, /* 11 0 1000: lfsx */ + { 8, LD+F }, /* 11 0 1001: lfdx */ + { 4, ST+F+S }, /* 11 0 1010: stfsx */ + { 8, ST+F }, /* 11 0 1011: stfdx */ + INVALID, /* 11 0 1100 */ + { 8, LD+M }, /* 11 0 1101: lmd */ + INVALID, /* 11 0 1110 */ + { 8, ST+M }, /* 11 0 1111: stmd */ + { 4, LD+U }, /* 11 1 0000: lwzux */ + INVALID, /* 11 1 0001 */ + { 4, ST+U }, /* 11 1 0010: stwux */ + INVALID, /* 11 1 0011 */ + { 2, LD+U }, /* 11 1 0100: lhzux */ + { 2, LD+SE+U }, /* 11 1 0101: lhaux */ + { 2, ST+U }, /* 11 1 0110: sthux */ + INVALID, /* 11 1 0111 */ + { 4, LD+F+S+U }, /* 11 1 1000: lfsux */ + { 8, LD+F+U }, /* 11 1 1001: lfdux */ + { 4, ST+F+S+U }, /* 11 1 1010: stfsux */ + { 8, ST+F+U }, /* 11 1 1011: stfdux */ + INVALID, /* 11 1 1100 */ + INVALID, /* 11 1 1101 */ + INVALID, /* 11 1 1110 */ + INVALID, /* 11 1 1111 */ +}; + +/* + * Create a DSISR value from the instruction + */ +static inline unsigned make_dsisr(unsigned instr) +{ + unsigned dsisr; + + + /* bits 6:15 --> 22:31 */ + dsisr = (instr & 0x03ff0000) >> 16; + + if (IS_XFORM(instr)) { + /* bits 29:30 --> 15:16 */ + dsisr |= (instr & 0x00000006) << 14; + /* bit 25 --> 17 */ + dsisr |= (instr & 0x00000040) << 8; + /* bits 21:24 --> 18:21 */ + dsisr |= (instr & 0x00000780) << 3; + } else { + /* bit 5 --> 17 */ + dsisr |= (instr & 0x04000000) >> 12; + /* bits 1: 4 --> 18:21 */ + dsisr |= (instr & 0x78000000) >> 17; + /* bits 30:31 --> 12:13 */ + if (IS_DSFORM(instr)) + dsisr |= (instr & 0x00000003) << 18; + } + + return dsisr; +} + +/* + * The dcbz (data cache block zero) instruction + * gives an alignment fault if used on non-cacheable + * memory. We handle the fault mainly for the + * case when we are running with the cache disabled + * for debugging. + */ +static int emulate_dcbz(struct pt_regs *regs, unsigned char __user *addr) +{ + long __user *p; + int i, size; + +#ifdef __powerpc64__ + size = ppc64_caches.dline_size; +#else + size = L1_CACHE_BYTES; +#endif + p = (long __user *) (regs->dar & -size); + if (user_mode(regs) && !access_ok(VERIFY_WRITE, p, size)) + return -EFAULT; + for (i = 0; i < size / sizeof(long); ++i) + if (__put_user(0, p+i)) + return -EFAULT; + return 1; +} + +/* + * Emulate load & store multiple instructions + * On 64-bit machines, these instructions only affect/use the + * bottom 4 bytes of each register, and the loads clear the + * top 4 bytes of the affected register. + */ +#ifdef CONFIG_PPC64 +#define REG_BYTE(rp, i) *((u8 *)((rp) + ((i) >> 2)) + ((i) & 3) + 4) +#else +#define REG_BYTE(rp, i) *((u8 *)(rp) + (i)) +#endif + +static int emulate_multiple(struct pt_regs *regs, unsigned char __user *addr, + unsigned int reg, unsigned int nb, + unsigned int flags, unsigned int instr) +{ + unsigned long *rptr; + unsigned int nb0, i; + + /* + * We do not try to emulate 8 bytes multiple as they aren't really + * available in our operating environments and we don't try to + * emulate multiples operations in kernel land as they should never + * be used/generated there at least not on unaligned boundaries + */ + if (unlikely((nb > 4) || !user_mode(regs))) + return 0; + + /* lmw, stmw, lswi/x, stswi/x */ + nb0 = 0; + if (flags & HARD) { + if (flags & SX) { + nb = regs->xer & 127; + if (nb == 0) + return 1; + } else { + if (__get_user(instr, + (unsigned int __user *)regs->nip)) + return -EFAULT; + nb = (instr >> 11) & 0x1f; + if (nb == 0) + nb = 32; + } + if (nb + reg * 4 > 128) { + nb0 = nb + reg * 4 - 128; + nb = 128 - reg * 4; + } + } else { + /* lwm, stmw */ + nb = (32 - reg) * 4; + } + + if (!access_ok((flags & ST ? VERIFY_WRITE: VERIFY_READ), addr, nb+nb0)) + return -EFAULT; /* bad address */ + + rptr = ®s->gpr[reg]; + if (flags & LD) { + /* + * This zeroes the top 4 bytes of the affected registers + * in 64-bit mode, and also zeroes out any remaining + * bytes of the last register for lsw*. + */ + memset(rptr, 0, ((nb + 3) / 4) * sizeof(unsigned long)); + if (nb0 > 0) + memset(®s->gpr[0], 0, + ((nb0 + 3) / 4) * sizeof(unsigned long)); + + for (i = 0; i < nb; ++i) + if (__get_user(REG_BYTE(rptr, i), addr + i)) + return -EFAULT; + if (nb0 > 0) { + rptr = ®s->gpr[0]; + addr += nb; + for (i = 0; i < nb0; ++i) + if (__get_user(REG_BYTE(rptr, i), addr + i)) + return -EFAULT; + } + + } else { + for (i = 0; i < nb; ++i) + if (__put_user(REG_BYTE(rptr, i), addr + i)) + return -EFAULT; + if (nb0 > 0) { + rptr = ®s->gpr[0]; + addr += nb; + for (i = 0; i < nb0; ++i) + if (__put_user(REG_BYTE(rptr, i), addr + i)) + return -EFAULT; + } + } + return 1; +} + + +/* + * Called on alignment exception. Attempts to fixup + * + * Return 1 on success + * Return 0 if unable to handle the interrupt + * Return -EFAULT if data address is bad + */ + +int fix_alignment(struct pt_regs *regs) +{ + unsigned int instr, nb, flags; + unsigned int reg, areg; + unsigned int dsisr; + unsigned char __user *addr; + unsigned char __user *p; + int ret, t; + union { + u64 ll; + double dd; + unsigned char v[8]; + struct { + unsigned hi32; + int low32; + } x32; + struct { + unsigned char hi48[6]; + short low16; + } x16; + } data; + + /* + * We require a complete register set, if not, then our assembly + * is broken + */ + CHECK_FULL_REGS(regs); + + dsisr = regs->dsisr; + + /* Some processors don't provide us with a DSISR we can use here, + * let's make one up from the instruction + */ + if (cpu_has_feature(CPU_FTR_NODSISRALIGN)) { + unsigned int real_instr; + if (unlikely(__get_user(real_instr, + (unsigned int __user *)regs->nip))) + return -EFAULT; + dsisr = make_dsisr(real_instr); + } + + /* extract the operation and registers from the dsisr */ + reg = (dsisr >> 5) & 0x1f; /* source/dest register */ + areg = dsisr & 0x1f; /* register to update */ + instr = (dsisr >> 10) & 0x7f; + instr |= (dsisr >> 13) & 0x60; + + /* Lookup the operation in our table */ + nb = aligninfo[instr].len; + flags = aligninfo[instr].flags; + + /* DAR has the operand effective address */ + addr = (unsigned char __user *)regs->dar; + + /* A size of 0 indicates an instruction we don't support, with + * the exception of DCBZ which is handled as a special case here + */ + if (instr == DCBZ) + return emulate_dcbz(regs, addr); + if (unlikely(nb == 0)) + return 0; + + /* Load/Store Multiple instructions are handled in their own + * function + */ + if (flags & M) + return emulate_multiple(regs, addr, reg, nb, flags, instr); + + /* Verify the address of the operand */ + if (unlikely(user_mode(regs) && + !access_ok((flags & ST ? VERIFY_WRITE : VERIFY_READ), + addr, nb))) + return -EFAULT; + + /* Force the fprs into the save area so we can reference them */ + if (flags & F) { + /* userland only */ + if (unlikely(!user_mode(regs))) + return 0; + flush_fp_to_thread(current); + } + + /* If we are loading, get the data from user space, else + * get it from register values + */ + if (flags & LD) { + data.ll = 0; + ret = 0; + p = addr; + switch (nb) { + case 8: + ret |= __get_user(data.v[0], p++); + ret |= __get_user(data.v[1], p++); + ret |= __get_user(data.v[2], p++); + ret |= __get_user(data.v[3], p++); + case 4: + ret |= __get_user(data.v[4], p++); + ret |= __get_user(data.v[5], p++); + case 2: + ret |= __get_user(data.v[6], p++); + ret |= __get_user(data.v[7], p++); + if (unlikely(ret)) + return -EFAULT; + } + } else if (flags & F) + data.dd = current->thread.fpr[reg]; + else + data.ll = regs->gpr[reg]; + + /* Perform other misc operations like sign extension, byteswap, + * or floating point single precision conversion + */ + switch (flags & ~U) { + case LD+SE: /* sign extend */ + if ( nb == 2 ) + data.ll = data.x16.low16; + else /* nb must be 4 */ + data.ll = data.x32.low32; + break; + case LD+S: /* byte-swap */ + case ST+S: + if (nb == 2) { + SWAP(data.v[6], data.v[7]); + } else { + SWAP(data.v[4], data.v[7]); + SWAP(data.v[5], data.v[6]); + } + break; + + /* Single-precision FP load and store require conversions... */ + case LD+F+S: +#ifdef CONFIG_PPC_FPU + preempt_disable(); + enable_kernel_fp(); + cvt_fd((float *)&data.v[4], &data.dd, ¤t->thread); + preempt_enable(); +#else + return 0; +#endif + break; + case ST+F+S: +#ifdef CONFIG_PPC_FPU + preempt_disable(); + enable_kernel_fp(); + cvt_df(&data.dd, (float *)&data.v[4], ¤t->thread); + preempt_enable(); +#else + return 0; +#endif + break; + } + + /* Store result to memory or update registers */ + if (flags & ST) { + ret = 0; + p = addr; + switch (nb) { + case 8: + ret |= __put_user(data.v[0], p++); + ret |= __put_user(data.v[1], p++); + ret |= __put_user(data.v[2], p++); + ret |= __put_user(data.v[3], p++); + case 4: + ret |= __put_user(data.v[4], p++); + ret |= __put_user(data.v[5], p++); + case 2: + ret |= __put_user(data.v[6], p++); + ret |= __put_user(data.v[7], p++); + } + if (unlikely(ret)) + return -EFAULT; + } else if (flags & F) + current->thread.fpr[reg] = data.dd; + else + regs->gpr[reg] = data.ll; + + /* Update RA as needed */ + if (flags & U) + regs->gpr[areg] = regs->dar; + + return 1; +} diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 4550eb4f4fb..91538d2445b 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -270,13 +270,15 @@ int main(void) DEFINE(TVAL64_TV_USEC, offsetof(struct timeval, tv_usec)); DEFINE(TVAL32_TV_SEC, offsetof(struct compat_timeval, tv_sec)); DEFINE(TVAL32_TV_USEC, offsetof(struct compat_timeval, tv_usec)); + DEFINE(TSPC64_TV_SEC, offsetof(struct timespec, tv_sec)); + DEFINE(TSPC64_TV_NSEC, offsetof(struct timespec, tv_nsec)); DEFINE(TSPC32_TV_SEC, offsetof(struct compat_timespec, tv_sec)); DEFINE(TSPC32_TV_NSEC, offsetof(struct compat_timespec, tv_nsec)); #else DEFINE(TVAL32_TV_SEC, offsetof(struct timeval, tv_sec)); DEFINE(TVAL32_TV_USEC, offsetof(struct timeval, tv_usec)); - DEFINE(TSPEC32_TV_SEC, offsetof(struct timespec, tv_sec)); - DEFINE(TSPEC32_TV_NSEC, offsetof(struct timespec, tv_nsec)); + DEFINE(TSPC32_TV_SEC, offsetof(struct timespec, tv_sec)); + DEFINE(TSPC32_TV_NSEC, offsetof(struct timespec, tv_nsec)); #endif /* timeval/timezone offsets for use by vdso */ DEFINE(TZONE_TZ_MINWEST, offsetof(struct timezone, tz_minuteswest)); diff --git a/arch/powerpc/kernel/dma_64.c b/arch/powerpc/kernel/dma_64.c new file mode 100644 index 00000000000..7c3419656cc --- /dev/null +++ b/arch/powerpc/kernel/dma_64.c @@ -0,0 +1,151 @@ +/* + * Copyright (C) 2004 IBM Corporation + * + * Implements the generic device dma API for ppc64. Handles + * the pci and vio busses + */ + +#include <linux/device.h> +#include <linux/dma-mapping.h> +/* Include the busses we support */ +#include <linux/pci.h> +#include <asm/vio.h> +#include <asm/scatterlist.h> +#include <asm/bug.h> + +static struct dma_mapping_ops *get_dma_ops(struct device *dev) +{ +#ifdef CONFIG_PCI + if (dev->bus == &pci_bus_type) + return &pci_dma_ops; +#endif +#ifdef CONFIG_IBMVIO + if (dev->bus == &vio_bus_type) + return &vio_dma_ops; +#endif + return NULL; +} + +int dma_supported(struct device *dev, u64 mask) +{ + struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + + if (dma_ops) + return dma_ops->dma_supported(dev, mask); + BUG(); + return 0; +} +EXPORT_SYMBOL(dma_supported); + +int dma_set_mask(struct device *dev, u64 dma_mask) +{ +#ifdef CONFIG_PCI + if (dev->bus == &pci_bus_type) + return pci_set_dma_mask(to_pci_dev(dev), dma_mask); +#endif +#ifdef CONFIG_IBMVIO + if (dev->bus == &vio_bus_type) + return -EIO; +#endif /* CONFIG_IBMVIO */ + BUG(); + return 0; +} +EXPORT_SYMBOL(dma_set_mask); + +void *dma_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag) +{ + struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + + if (dma_ops) + return dma_ops->alloc_coherent(dev, size, dma_handle, flag); + BUG(); + return NULL; +} +EXPORT_SYMBOL(dma_alloc_coherent); + +void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, + dma_addr_t dma_handle) +{ + struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + + if (dma_ops) + dma_ops->free_coherent(dev, size, cpu_addr, dma_handle); + else + BUG(); +} +EXPORT_SYMBOL(dma_free_coherent); + +dma_addr_t dma_map_single(struct device *dev, void *cpu_addr, size_t size, + enum dma_data_direction direction) +{ + struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + + if (dma_ops) + return dma_ops->map_single(dev, cpu_addr, size, direction); + BUG(); + return (dma_addr_t)0; +} +EXPORT_SYMBOL(dma_map_single); + +void dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, + enum dma_data_direction direction) +{ + struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + + if (dma_ops) + dma_ops->unmap_single(dev, dma_addr, size, direction); + else + BUG(); +} +EXPORT_SYMBOL(dma_unmap_single); + +dma_addr_t dma_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction direction) +{ + struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + + if (dma_ops) + return dma_ops->map_single(dev, + (page_address(page) + offset), size, direction); + BUG(); + return (dma_addr_t)0; +} +EXPORT_SYMBOL(dma_map_page); + +void dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, + enum dma_data_direction direction) +{ + struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + + if (dma_ops) + dma_ops->unmap_single(dev, dma_address, size, direction); + else + BUG(); +} +EXPORT_SYMBOL(dma_unmap_page); + +int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, + enum dma_data_direction direction) +{ + struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + + if (dma_ops) + return dma_ops->map_sg(dev, sg, nents, direction); + BUG(); + return 0; +} +EXPORT_SYMBOL(dma_map_sg); + +void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nhwentries, + enum dma_data_direction direction) +{ + struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + + if (dma_ops) + dma_ops->unmap_sg(dev, sg, nhwentries, direction); + else + BUG(); +} +EXPORT_SYMBOL(dma_unmap_sg); diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index 5063c603fad..8d60fa99fc4 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -24,7 +24,7 @@ * Copyright 2002-2004 MontaVista Software, Inc. * PowerPC 44x support, Matt Porter <mporter@kernel.crashing.org> * Copyright 2004 Freescale Semiconductor, Inc - * PowerPC e500 modifications, Kumar Gala <kumar.gala@freescale.com> + * PowerPC e500 modifications, Kumar Gala <galak@kernel.crashing.org> * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the diff --git a/arch/powerpc/kernel/idle_64.c b/arch/powerpc/kernel/idle_64.c new file mode 100644 index 00000000000..b879d3057ef --- /dev/null +++ b/arch/powerpc/kernel/idle_64.c @@ -0,0 +1,121 @@ +/* + * Idle daemon for PowerPC. Idle daemon will handle any action + * that needs to be taken when the system becomes idle. + * + * Originally Written by Cort Dougan (cort@cs.nmt.edu) + * + * iSeries supported added by Mike Corrigan <mikejc@us.ibm.com> + * + * Additional shared processor, SMT, and firmware support + * Copyright (c) 2003 Dave Engebretsen <engebret@us.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/smp.h> +#include <linux/cpu.h> +#include <linux/sysctl.h> + +#include <asm/system.h> +#include <asm/processor.h> +#include <asm/cputable.h> +#include <asm/time.h> +#include <asm/machdep.h> +#include <asm/smp.h> + +extern void power4_idle(void); + +void default_idle(void) +{ + unsigned int cpu = smp_processor_id(); + set_thread_flag(TIF_POLLING_NRFLAG); + + while (1) { + if (!need_resched()) { + while (!need_resched() && !cpu_is_offline(cpu)) { + ppc64_runlatch_off(); + + /* + * Go into low thread priority and possibly + * low power mode. + */ + HMT_low(); + HMT_very_low(); + } + + HMT_medium(); + } + + ppc64_runlatch_on(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); + if (cpu_is_offline(cpu) && system_state == SYSTEM_RUNNING) + cpu_die(); + } +} + +void native_idle(void) +{ + while (1) { + ppc64_runlatch_off(); + + if (!need_resched()) + power4_idle(); + + if (need_resched()) { + ppc64_runlatch_on(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); + } + + if (cpu_is_offline(smp_processor_id()) && + system_state == SYSTEM_RUNNING) + cpu_die(); + } +} + +void cpu_idle(void) +{ + BUG_ON(NULL == ppc_md.idle_loop); + ppc_md.idle_loop(); +} + +int powersave_nap; + +#ifdef CONFIG_SYSCTL +/* + * Register the sysctl to set/clear powersave_nap. + */ +static ctl_table powersave_nap_ctl_table[]={ + { + .ctl_name = KERN_PPC_POWERSAVE_NAP, + .procname = "powersave-nap", + .data = &powersave_nap, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { 0, }, +}; +static ctl_table powersave_nap_sysctl_root[] = { + { 1, "kernel", NULL, 0, 0755, powersave_nap_ctl_table, }, + { 0,}, +}; + +static int __init +register_powersave_nap_sysctl(void) +{ + register_sysctl_table(powersave_nap_sysctl_root, 0); + + return 0; +} +__initcall(register_powersave_nap_sysctl); +#endif diff --git a/arch/powerpc/kernel/ioctl32.c b/arch/powerpc/kernel/ioctl32.c index 3fa6a93adbd..0fa3d27fef0 100644 --- a/arch/powerpc/kernel/ioctl32.c +++ b/arch/powerpc/kernel/ioctl32.c @@ -40,10 +40,6 @@ IOCTL_TABLE_START #define DECLARES #include "compat_ioctl.c" -/* Little p (/dev/rtc, /dev/envctrl, etc.) */ -COMPATIBLE_IOCTL(_IOR('p', 20, int[7])) /* RTCGET */ -COMPATIBLE_IOCTL(_IOW('p', 21, int[7])) /* RTCSET */ - IOCTL_TABLE_END int ioctl_table_size = ARRAY_SIZE(ioctl_start); diff --git a/arch/powerpc/kernel/iomap.c b/arch/powerpc/kernel/iomap.c new file mode 100644 index 00000000000..6160c8dbb7c --- /dev/null +++ b/arch/powerpc/kernel/iomap.c @@ -0,0 +1,146 @@ +/* + * arch/ppc64/kernel/iomap.c + * + * ppc64 "iomap" interface implementation. + * + * (C) Copyright 2004 Linus Torvalds + */ +#include <linux/init.h> +#include <linux/pci.h> +#include <linux/mm.h> +#include <asm/io.h> + +/* + * Here comes the ppc64 implementation of the IOMAP + * interfaces. + */ +unsigned int fastcall ioread8(void __iomem *addr) +{ + return readb(addr); +} +unsigned int fastcall ioread16(void __iomem *addr) +{ + return readw(addr); +} +unsigned int fastcall ioread16be(void __iomem *addr) +{ + return in_be16(addr); +} +unsigned int fastcall ioread32(void __iomem *addr) +{ + return readl(addr); +} +unsigned int fastcall ioread32be(void __iomem *addr) +{ + return in_be32(addr); +} +EXPORT_SYMBOL(ioread8); +EXPORT_SYMBOL(ioread16); +EXPORT_SYMBOL(ioread16be); +EXPORT_SYMBOL(ioread32); +EXPORT_SYMBOL(ioread32be); + +void fastcall iowrite8(u8 val, void __iomem *addr) +{ + writeb(val, addr); +} +void fastcall iowrite16(u16 val, void __iomem *addr) +{ + writew(val, addr); +} +void fastcall iowrite16be(u16 val, void __iomem *addr) +{ + out_be16(addr, val); +} +void fastcall iowrite32(u32 val, void __iomem *addr) +{ + writel(val, addr); +} +void fastcall iowrite32be(u32 val, void __iomem *addr) +{ + out_be32(addr, val); +} +EXPORT_SYMBOL(iowrite8); +EXPORT_SYMBOL(iowrite16); +EXPORT_SYMBOL(iowrite16be); +EXPORT_SYMBOL(iowrite32); +EXPORT_SYMBOL(iowrite32be); + +/* + * These are the "repeat read/write" functions. Note the + * non-CPU byte order. We do things in "IO byteorder" + * here. + * + * FIXME! We could make these do EEH handling if we really + * wanted. Not clear if we do. + */ +void ioread8_rep(void __iomem *addr, void *dst, unsigned long count) +{ + _insb((u8 __iomem *) addr, dst, count); +} +void ioread16_rep(void __iomem *addr, void *dst, unsigned long count) +{ + _insw_ns((u16 __iomem *) addr, dst, count); +} +void ioread32_rep(void __iomem *addr, void *dst, unsigned long count) +{ + _insl_ns((u32 __iomem *) addr, dst, count); +} +EXPORT_SYMBOL(ioread8_rep); +EXPORT_SYMBOL(ioread16_rep); +EXPORT_SYMBOL(ioread32_rep); + +void iowrite8_rep(void __iomem *addr, const void *src, unsigned long count) +{ + _outsb((u8 __iomem *) addr, src, count); +} +void iowrite16_rep(void __iomem *addr, const void *src, unsigned long count) +{ + _outsw_ns((u16 __iomem *) addr, src, count); +} +void iowrite32_rep(void __iomem *addr, const void *src, unsigned long count) +{ + _outsl_ns((u32 __iomem *) addr, src, count); +} +EXPORT_SYMBOL(iowrite8_rep); +EXPORT_SYMBOL(iowrite16_rep); +EXPORT_SYMBOL(iowrite32_rep); + +void __iomem *ioport_map(unsigned long port, unsigned int len) +{ + if (!_IO_IS_VALID(port)) + return NULL; + return (void __iomem *) (port+pci_io_base); +} + +void ioport_unmap(void __iomem *addr) +{ + /* Nothing to do */ +} +EXPORT_SYMBOL(ioport_map); +EXPORT_SYMBOL(ioport_unmap); + +void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long max) +{ + unsigned long start = pci_resource_start(dev, bar); + unsigned long len = pci_resource_len(dev, bar); + unsigned long flags = pci_resource_flags(dev, bar); + + if (!len) + return NULL; + if (max && len > max) + len = max; + if (flags & IORESOURCE_IO) + return ioport_map(start, len); + if (flags & IORESOURCE_MEM) + return ioremap(start, len); + /* What? */ + return NULL; +} + +void pci_iounmap(struct pci_dev *dev, void __iomem *addr) +{ + /* Nothing to do */ +} +EXPORT_SYMBOL(pci_iomap); +EXPORT_SYMBOL(pci_iounmap); diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c new file mode 100644 index 00000000000..4d9b4388918 --- /dev/null +++ b/arch/powerpc/kernel/iommu.c @@ -0,0 +1,572 @@ +/* + * arch/ppc64/kernel/iommu.c + * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation + * + * Rewrite, cleanup, new allocation schemes, virtual merging: + * Copyright (C) 2004 Olof Johansson, IBM Corporation + * and Ben. Herrenschmidt, IBM Corporation + * + * Dynamic DMA mapping support, bus-independent parts. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + + +#include <linux/config.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/spinlock.h> +#include <linux/string.h> +#include <linux/dma-mapping.h> +#include <linux/init.h> +#include <linux/bitops.h> +#include <asm/io.h> +#include <asm/prom.h> +#include <asm/iommu.h> +#include <asm/pci-bridge.h> +#include <asm/machdep.h> + +#define DBG(...) + +#ifdef CONFIG_IOMMU_VMERGE +static int novmerge = 0; +#else +static int novmerge = 1; +#endif + +static int __init setup_iommu(char *str) +{ + if (!strcmp(str, "novmerge")) + novmerge = 1; + else if (!strcmp(str, "vmerge")) + novmerge = 0; + return 1; +} + +__setup("iommu=", setup_iommu); + +static unsigned long iommu_range_alloc(struct iommu_table *tbl, + unsigned long npages, + unsigned long *handle, + unsigned int align_order) +{ + unsigned long n, end, i, start; + unsigned long limit; + int largealloc = npages > 15; + int pass = 0; + unsigned long align_mask; + + align_mask = 0xffffffffffffffffl >> (64 - align_order); + + /* This allocator was derived from x86_64's bit string search */ + + /* Sanity check */ + if (unlikely(npages) == 0) { + if (printk_ratelimit()) + WARN_ON(1); + return DMA_ERROR_CODE; + } + + if (handle && *handle) + start = *handle; + else + start = largealloc ? tbl->it_largehint : tbl->it_hint; + + /* Use only half of the table for small allocs (15 pages or less) */ + limit = largealloc ? tbl->it_size : tbl->it_halfpoint; + + if (largealloc && start < tbl->it_halfpoint) + start = tbl->it_halfpoint; + + /* The case below can happen if we have a small segment appended + * to a large, or when the previous alloc was at the very end of + * the available space. If so, go back to the initial start. + */ + if (start >= limit) + start = largealloc ? tbl->it_largehint : tbl->it_hint; + + again: + + n = find_next_zero_bit(tbl->it_map, limit, start); + + /* Align allocation */ + n = (n + align_mask) & ~align_mask; + + end = n + npages; + + if (unlikely(end >= limit)) { + if (likely(pass < 2)) { + /* First failure, just rescan the half of the table. + * Second failure, rescan the other half of the table. + */ + start = (largealloc ^ pass) ? tbl->it_halfpoint : 0; + limit = pass ? tbl->it_size : limit; + pass++; + goto again; + } else { + /* Third failure, give up */ + return DMA_ERROR_CODE; + } + } + + for (i = n; i < end; i++) + if (test_bit(i, tbl->it_map)) { + start = i+1; + goto again; + } + + for (i = n; i < end; i++) + __set_bit(i, tbl->it_map); + + /* Bump the hint to a new block for small allocs. */ + if (largealloc) { + /* Don't bump to new block to avoid fragmentation */ + tbl->it_largehint = end; + } else { + /* Overflow will be taken care of at the next allocation */ + tbl->it_hint = (end + tbl->it_blocksize - 1) & + ~(tbl->it_blocksize - 1); + } + + /* Update handle for SG allocations */ + if (handle) + *handle = end; + + return n; +} + +static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *page, + unsigned int npages, enum dma_data_direction direction, + unsigned int align_order) +{ + unsigned long entry, flags; + dma_addr_t ret = DMA_ERROR_CODE; + + spin_lock_irqsave(&(tbl->it_lock), flags); + + entry = iommu_range_alloc(tbl, npages, NULL, align_order); + + if (unlikely(entry == DMA_ERROR_CODE)) { + spin_unlock_irqrestore(&(tbl->it_lock), flags); + return DMA_ERROR_CODE; + } + + entry += tbl->it_offset; /* Offset into real TCE table */ + ret = entry << PAGE_SHIFT; /* Set the return dma address */ + + /* Put the TCEs in the HW table */ + ppc_md.tce_build(tbl, entry, npages, (unsigned long)page & PAGE_MASK, + direction); + + + /* Flush/invalidate TLB caches if necessary */ + if (ppc_md.tce_flush) + ppc_md.tce_flush(tbl); + + spin_unlock_irqrestore(&(tbl->it_lock), flags); + + /* Make sure updates are seen by hardware */ + mb(); + + return ret; +} + +static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, + unsigned int npages) +{ + unsigned long entry, free_entry; + unsigned long i; + + entry = dma_addr >> PAGE_SHIFT; + free_entry = entry - tbl->it_offset; + + if (((free_entry + npages) > tbl->it_size) || + (entry < tbl->it_offset)) { + if (printk_ratelimit()) { + printk(KERN_INFO "iommu_free: invalid entry\n"); + printk(KERN_INFO "\tentry = 0x%lx\n", entry); + printk(KERN_INFO "\tdma_addr = 0x%lx\n", (u64)dma_addr); + printk(KERN_INFO "\tTable = 0x%lx\n", (u64)tbl); + printk(KERN_INFO "\tbus# = 0x%lx\n", (u64)tbl->it_busno); + printk(KERN_INFO "\tsize = 0x%lx\n", (u64)tbl->it_size); + printk(KERN_INFO "\tstartOff = 0x%lx\n", (u64)tbl->it_offset); + printk(KERN_INFO "\tindex = 0x%lx\n", (u64)tbl->it_index); + WARN_ON(1); + } + return; + } + + ppc_md.tce_free(tbl, entry, npages); + + for (i = 0; i < npages; i++) + __clear_bit(free_entry+i, tbl->it_map); +} + +static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, + unsigned int npages) +{ + unsigned long flags; + + spin_lock_irqsave(&(tbl->it_lock), flags); + + __iommu_free(tbl, dma_addr, npages); + + /* Make sure TLB cache is flushed if the HW needs it. We do + * not do an mb() here on purpose, it is not needed on any of + * the current platforms. + */ + if (ppc_md.tce_flush) + ppc_md.tce_flush(tbl); + + spin_unlock_irqrestore(&(tbl->it_lock), flags); +} + +int iommu_map_sg(struct device *dev, struct iommu_table *tbl, + struct scatterlist *sglist, int nelems, + enum dma_data_direction direction) +{ + dma_addr_t dma_next = 0, dma_addr; + unsigned long flags; + struct scatterlist *s, *outs, *segstart; + int outcount, incount; + unsigned long handle; + + BUG_ON(direction == DMA_NONE); + + if ((nelems == 0) || !tbl) + return 0; + + outs = s = segstart = &sglist[0]; + outcount = 1; + incount = nelems; + handle = 0; + + /* Init first segment length for backout at failure */ + outs->dma_length = 0; + + DBG("mapping %d elements:\n", nelems); + + spin_lock_irqsave(&(tbl->it_lock), flags); + + for (s = outs; nelems; nelems--, s++) { + unsigned long vaddr, npages, entry, slen; + + slen = s->length; + /* Sanity check */ + if (slen == 0) { + dma_next = 0; + continue; + } + /* Allocate iommu entries for that segment */ + vaddr = (unsigned long)page_address(s->page) + s->offset; + npages = PAGE_ALIGN(vaddr + slen) - (vaddr & PAGE_MASK); + npages >>= PAGE_SHIFT; + entry = iommu_range_alloc(tbl, npages, &handle, 0); + + DBG(" - vaddr: %lx, size: %lx\n", vaddr, slen); + + /* Handle failure */ + if (unlikely(entry == DMA_ERROR_CODE)) { + if (printk_ratelimit()) + printk(KERN_INFO "iommu_alloc failed, tbl %p vaddr %lx" + " npages %lx\n", tbl, vaddr, npages); + goto failure; + } + + /* Convert entry to a dma_addr_t */ + entry += tbl->it_offset; + dma_addr = entry << PAGE_SHIFT; + dma_addr |= s->offset; + + DBG(" - %lx pages, entry: %lx, dma_addr: %lx\n", + npages, entry, dma_addr); + + /* Insert into HW table */ + ppc_md.tce_build(tbl, entry, npages, vaddr & PAGE_MASK, direction); + + /* If we are in an open segment, try merging */ + if (segstart != s) { + DBG(" - trying merge...\n"); + /* We cannot merge if: + * - allocated dma_addr isn't contiguous to previous allocation + */ + if (novmerge || (dma_addr != dma_next)) { + /* Can't merge: create a new segment */ + segstart = s; + outcount++; outs++; + DBG(" can't merge, new segment.\n"); + } else { + outs->dma_length += s->length; + DBG(" merged, new len: %lx\n", outs->dma_length); + } + } + + if (segstart == s) { + /* This is a new segment, fill entries */ + DBG(" - filling new segment.\n"); + outs->dma_address = dma_addr; + outs->dma_length = slen; + } + + /* Calculate next page pointer for contiguous check */ + dma_next = dma_addr + slen; + + DBG(" - dma next is: %lx\n", dma_next); + } + + /* Flush/invalidate TLB caches if necessary */ + if (ppc_md.tce_flush) + ppc_md.tce_flush(tbl); + + spin_unlock_irqrestore(&(tbl->it_lock), flags); + + /* Make sure updates are seen by hardware */ + mb(); + + DBG("mapped %d elements:\n", outcount); + + /* For the sake of iommu_unmap_sg, we clear out the length in the + * next entry of the sglist if we didn't fill the list completely + */ + if (outcount < incount) { + outs++; + outs->dma_address = DMA_ERROR_CODE; + outs->dma_length = 0; + } + return outcount; + + failure: + for (s = &sglist[0]; s <= outs; s++) { + if (s->dma_length != 0) { + unsigned long vaddr, npages; + + vaddr = s->dma_address & PAGE_MASK; + npages = (PAGE_ALIGN(s->dma_address + s->dma_length) - vaddr) + >> PAGE_SHIFT; + __iommu_free(tbl, vaddr, npages); + } + } + spin_unlock_irqrestore(&(tbl->it_lock), flags); + return 0; +} + + +void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist, + int nelems, enum dma_data_direction direction) +{ + unsigned long flags; + + BUG_ON(direction == DMA_NONE); + + if (!tbl) + return; + + spin_lock_irqsave(&(tbl->it_lock), flags); + + while (nelems--) { + unsigned int npages; + dma_addr_t dma_handle = sglist->dma_address; + + if (sglist->dma_length == 0) + break; + npages = (PAGE_ALIGN(dma_handle + sglist->dma_length) + - (dma_handle & PAGE_MASK)) >> PAGE_SHIFT; + __iommu_free(tbl, dma_handle, npages); + sglist++; + } + + /* Flush/invalidate TLBs if necessary. As for iommu_free(), we + * do not do an mb() here, the affected platforms do not need it + * when freeing. + */ + if (ppc_md.tce_flush) + ppc_md.tce_flush(tbl); + + spin_unlock_irqrestore(&(tbl->it_lock), flags); +} + +/* + * Build a iommu_table structure. This contains a bit map which + * is used to manage allocation of the tce space. + */ +struct iommu_table *iommu_init_table(struct iommu_table *tbl) +{ + unsigned long sz; + static int welcomed = 0; + + /* Set aside 1/4 of the table for large allocations. */ + tbl->it_halfpoint = tbl->it_size * 3 / 4; + + /* number of bytes needed for the bitmap */ + sz = (tbl->it_size + 7) >> 3; + + tbl->it_map = (unsigned long *)__get_free_pages(GFP_ATOMIC, get_order(sz)); + if (!tbl->it_map) + panic("iommu_init_table: Can't allocate %ld bytes\n", sz); + + memset(tbl->it_map, 0, sz); + + tbl->it_hint = 0; + tbl->it_largehint = tbl->it_halfpoint; + spin_lock_init(&tbl->it_lock); + + /* Clear the hardware table in case firmware left allocations in it */ + ppc_md.tce_free(tbl, tbl->it_offset, tbl->it_size); + + if (!welcomed) { + printk(KERN_INFO "IOMMU table initialized, virtual merging %s\n", + novmerge ? "disabled" : "enabled"); + welcomed = 1; + } + + return tbl; +} + +void iommu_free_table(struct device_node *dn) +{ + struct pci_dn *pdn = dn->data; + struct iommu_table *tbl = pdn->iommu_table; + unsigned long bitmap_sz, i; + unsigned int order; + + if (!tbl || !tbl->it_map) { + printk(KERN_ERR "%s: expected TCE map for %s\n", __FUNCTION__, + dn->full_name); + return; + } + + /* verify that table contains no entries */ + /* it_size is in entries, and we're examining 64 at a time */ + for (i = 0; i < (tbl->it_size/64); i++) { + if (tbl->it_map[i] != 0) { + printk(KERN_WARNING "%s: Unexpected TCEs for %s\n", + __FUNCTION__, dn->full_name); + break; + } + } + + /* calculate bitmap size in bytes */ + bitmap_sz = (tbl->it_size + 7) / 8; + + /* free bitmap */ + order = get_order(bitmap_sz); + free_pages((unsigned long) tbl->it_map, order); + + /* free table */ + kfree(tbl); +} + +/* Creates TCEs for a user provided buffer. The user buffer must be + * contiguous real kernel storage (not vmalloc). The address of the buffer + * passed here is the kernel (virtual) address of the buffer. The buffer + * need not be page aligned, the dma_addr_t returned will point to the same + * byte within the page as vaddr. + */ +dma_addr_t iommu_map_single(struct iommu_table *tbl, void *vaddr, + size_t size, enum dma_data_direction direction) +{ + dma_addr_t dma_handle = DMA_ERROR_CODE; + unsigned long uaddr; + unsigned int npages; + + BUG_ON(direction == DMA_NONE); + + uaddr = (unsigned long)vaddr; + npages = PAGE_ALIGN(uaddr + size) - (uaddr & PAGE_MASK); + npages >>= PAGE_SHIFT; + + if (tbl) { + dma_handle = iommu_alloc(tbl, vaddr, npages, direction, 0); + if (dma_handle == DMA_ERROR_CODE) { + if (printk_ratelimit()) { + printk(KERN_INFO "iommu_alloc failed, " + "tbl %p vaddr %p npages %d\n", + tbl, vaddr, npages); + } + } else + dma_handle |= (uaddr & ~PAGE_MASK); + } + + return dma_handle; +} + +void iommu_unmap_single(struct iommu_table *tbl, dma_addr_t dma_handle, + size_t size, enum dma_data_direction direction) +{ + BUG_ON(direction == DMA_NONE); + + if (tbl) + iommu_free(tbl, dma_handle, (PAGE_ALIGN(dma_handle + size) - + (dma_handle & PAGE_MASK)) >> PAGE_SHIFT); +} + +/* Allocates a contiguous real buffer and creates mappings over it. + * Returns the virtual address of the buffer and sets dma_handle + * to the dma address (mapping) of the first page. + */ +void *iommu_alloc_coherent(struct iommu_table *tbl, size_t size, + dma_addr_t *dma_handle, gfp_t flag) +{ + void *ret = NULL; + dma_addr_t mapping; + unsigned int npages, order; + + size = PAGE_ALIGN(size); + npages = size >> PAGE_SHIFT; + order = get_order(size); + + /* + * Client asked for way too much space. This is checked later + * anyway. It is easier to debug here for the drivers than in + * the tce tables. + */ + if (order >= IOMAP_MAX_ORDER) { + printk("iommu_alloc_consistent size too large: 0x%lx\n", size); + return NULL; + } + + if (!tbl) + return NULL; + + /* Alloc enough pages (and possibly more) */ + ret = (void *)__get_free_pages(flag, order); + if (!ret) + return NULL; + memset(ret, 0, size); + + /* Set up tces to cover the allocated range */ + mapping = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL, order); + if (mapping == DMA_ERROR_CODE) { + free_pages((unsigned long)ret, order); + ret = NULL; + } else + *dma_handle = mapping; + return ret; +} + +void iommu_free_coherent(struct iommu_table *tbl, size_t size, + void *vaddr, dma_addr_t dma_handle) +{ + unsigned int npages; + + if (tbl) { + size = PAGE_ALIGN(size); + npages = size >> PAGE_SHIFT; + iommu_free(tbl, dma_handle, npages); + free_pages((unsigned long)vaddr, get_order(size)); + } +} diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 4b7940693f3..5a71ed9612f 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -71,6 +71,11 @@ #include <asm/paca.h> #endif +int __irq_offset_value; +#ifdef CONFIG_PPC32 +EXPORT_SYMBOL(__irq_offset_value); +#endif + static int ppc_spurious_interrupts; #if defined(CONFIG_PPC_ISERIES) && defined(CONFIG_SMP) @@ -98,7 +103,6 @@ extern atomic_t ipi_sent; EXPORT_SYMBOL(irq_desc); int distribute_irqs = 1; -int __irq_offset_value; u64 ppc64_interrupt_controller; #endif /* CONFIG_PPC64 */ @@ -311,7 +315,6 @@ void __init init_IRQ(void) } #ifdef CONFIG_PPC64 -#ifndef CONFIG_PPC_ISERIES /* * Virtual IRQ mapping code, used on systems with XICS interrupt controllers. */ @@ -420,8 +423,6 @@ unsigned int real_irq_to_virt_slowpath(unsigned int real_irq) } -#endif /* CONFIG_PPC_ISERIES */ - #ifdef CONFIG_IRQSTACKS struct thread_info *softirq_ctx[NR_CPUS]; struct thread_info *hardirq_ctx[NR_CPUS]; diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c new file mode 100644 index 00000000000..511af54e623 --- /dev/null +++ b/arch/powerpc/kernel/kprobes.c @@ -0,0 +1,459 @@ +/* + * Kernel Probes (KProbes) + * arch/ppc64/kernel/kprobes.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2002, 2004 + * + * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel + * Probes initial implementation ( includes contributions from + * Rusty Russell). + * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes + * interface to access function arguments. + * 2004-Nov Ananth N Mavinakayanahalli <ananth@in.ibm.com> kprobes port + * for PPC64 + */ + +#include <linux/config.h> +#include <linux/kprobes.h> +#include <linux/ptrace.h> +#include <linux/preempt.h> +#include <asm/cacheflush.h> +#include <asm/kdebug.h> +#include <asm/sstep.h> + +static DECLARE_MUTEX(kprobe_mutex); +DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; +DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); + +int __kprobes arch_prepare_kprobe(struct kprobe *p) +{ + int ret = 0; + kprobe_opcode_t insn = *p->addr; + + if ((unsigned long)p->addr & 0x03) { + printk("Attempt to register kprobe at an unaligned address\n"); + ret = -EINVAL; + } else if (IS_MTMSRD(insn) || IS_RFID(insn)) { + printk("Cannot register a kprobe on rfid or mtmsrd\n"); + ret = -EINVAL; + } + + /* insn must be on a special executable page on ppc64 */ + if (!ret) { + down(&kprobe_mutex); + p->ainsn.insn = get_insn_slot(); + up(&kprobe_mutex); + if (!p->ainsn.insn) + ret = -ENOMEM; + } + return ret; +} + +void __kprobes arch_copy_kprobe(struct kprobe *p) +{ + memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + p->opcode = *p->addr; +} + +void __kprobes arch_arm_kprobe(struct kprobe *p) +{ + *p->addr = BREAKPOINT_INSTRUCTION; + flush_icache_range((unsigned long) p->addr, + (unsigned long) p->addr + sizeof(kprobe_opcode_t)); +} + +void __kprobes arch_disarm_kprobe(struct kprobe *p) +{ + *p->addr = p->opcode; + flush_icache_range((unsigned long) p->addr, + (unsigned long) p->addr + sizeof(kprobe_opcode_t)); +} + +void __kprobes arch_remove_kprobe(struct kprobe *p) +{ + down(&kprobe_mutex); + free_insn_slot(p->ainsn.insn); + up(&kprobe_mutex); +} + +static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs) +{ + kprobe_opcode_t insn = *p->ainsn.insn; + + regs->msr |= MSR_SE; + + /* single step inline if it is a trap variant */ + if (is_trap(insn)) + regs->nip = (unsigned long)p->addr; + else + regs->nip = (unsigned long)p->ainsn.insn; +} + +static inline void save_previous_kprobe(struct kprobe_ctlblk *kcb) +{ + kcb->prev_kprobe.kp = kprobe_running(); + kcb->prev_kprobe.status = kcb->kprobe_status; + kcb->prev_kprobe.saved_msr = kcb->kprobe_saved_msr; +} + +static inline void restore_previous_kprobe(struct kprobe_ctlblk *kcb) +{ + __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp; + kcb->kprobe_status = kcb->prev_kprobe.status; + kcb->kprobe_saved_msr = kcb->prev_kprobe.saved_msr; +} + +static inline void set_current_kprobe(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + __get_cpu_var(current_kprobe) = p; + kcb->kprobe_saved_msr = regs->msr; +} + +/* Called with kretprobe_lock held */ +void __kprobes arch_prepare_kretprobe(struct kretprobe *rp, + struct pt_regs *regs) +{ + struct kretprobe_instance *ri; + + if ((ri = get_free_rp_inst(rp)) != NULL) { + ri->rp = rp; + ri->task = current; + ri->ret_addr = (kprobe_opcode_t *)regs->link; + + /* Replace the return addr with trampoline addr */ + regs->link = (unsigned long)kretprobe_trampoline; + add_rp_inst(ri); + } else { + rp->nmissed++; + } +} + +static inline int kprobe_handler(struct pt_regs *regs) +{ + struct kprobe *p; + int ret = 0; + unsigned int *addr = (unsigned int *)regs->nip; + struct kprobe_ctlblk *kcb; + + /* + * We don't want to be preempted for the entire + * duration of kprobe processing + */ + preempt_disable(); + kcb = get_kprobe_ctlblk(); + + /* Check we're not actually recursing */ + if (kprobe_running()) { + p = get_kprobe(addr); + if (p) { + kprobe_opcode_t insn = *p->ainsn.insn; + if (kcb->kprobe_status == KPROBE_HIT_SS && + is_trap(insn)) { + regs->msr &= ~MSR_SE; + regs->msr |= kcb->kprobe_saved_msr; + goto no_kprobe; + } + /* We have reentered the kprobe_handler(), since + * another probe was hit while within the handler. + * We here save the original kprobes variables and + * just single step on the instruction of the new probe + * without calling any user handlers. + */ + save_previous_kprobe(kcb); + set_current_kprobe(p, regs, kcb); + kcb->kprobe_saved_msr = regs->msr; + p->nmissed++; + prepare_singlestep(p, regs); + kcb->kprobe_status = KPROBE_REENTER; + return 1; + } else { + p = __get_cpu_var(current_kprobe); + if (p->break_handler && p->break_handler(p, regs)) { + goto ss_probe; + } + } + goto no_kprobe; + } + + p = get_kprobe(addr); + if (!p) { + if (*addr != BREAKPOINT_INSTRUCTION) { + /* + * PowerPC has multiple variants of the "trap" + * instruction. If the current instruction is a + * trap variant, it could belong to someone else + */ + kprobe_opcode_t cur_insn = *addr; + if (is_trap(cur_insn)) + goto no_kprobe; + /* + * The breakpoint instruction was removed right + * after we hit it. Another cpu has removed + * either a probepoint or a debugger breakpoint + * at this address. In either case, no further + * handling of this interrupt is appropriate. + */ + ret = 1; + } + /* Not one of ours: let kernel handle it */ + goto no_kprobe; + } + + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + set_current_kprobe(p, regs, kcb); + if (p->pre_handler && p->pre_handler(p, regs)) + /* handler has already set things up, so skip ss setup */ + return 1; + +ss_probe: + prepare_singlestep(p, regs); + kcb->kprobe_status = KPROBE_HIT_SS; + return 1; + +no_kprobe: + preempt_enable_no_resched(); + return ret; +} + +/* + * Function return probe trampoline: + * - init_kprobes() establishes a probepoint here + * - When the probed function returns, this probe + * causes the handlers to fire + */ +void kretprobe_trampoline_holder(void) +{ + asm volatile(".global kretprobe_trampoline\n" + "kretprobe_trampoline:\n" + "nop\n"); +} + +/* + * Called when the probe at kretprobe trampoline is hit + */ +int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct kretprobe_instance *ri = NULL; + struct hlist_head *head; + struct hlist_node *node, *tmp; + unsigned long flags, orig_ret_address = 0; + unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline; + + spin_lock_irqsave(&kretprobe_lock, flags); + head = kretprobe_inst_table_head(current); + + /* + * It is possible to have multiple instances associated with a given + * task either because an multiple functions in the call path + * have a return probe installed on them, and/or more then one return + * return probe was registered for a target function. + * + * We can handle this because: + * - instances are always inserted at the head of the list + * - when multiple return probes are registered for the same + * function, the first instance's ret_addr will point to the + * real return address, and all the rest will point to + * kretprobe_trampoline + */ + hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + + if (ri->rp && ri->rp->handler) + ri->rp->handler(ri, regs); + + orig_ret_address = (unsigned long)ri->ret_addr; + recycle_rp_inst(ri); + + if (orig_ret_address != trampoline_address) + /* + * This is the real return address. Any other + * instances associated with this task are for + * other calls deeper on the call stack + */ + break; + } + + BUG_ON(!orig_ret_address || (orig_ret_address == trampoline_address)); + regs->nip = orig_ret_address; + + reset_current_kprobe(); + spin_unlock_irqrestore(&kretprobe_lock, flags); + preempt_enable_no_resched(); + + /* + * By returning a non-zero value, we are telling + * kprobe_handler() that we don't want the post_handler + * to run (and have re-enabled preemption) + */ + return 1; +} + +/* + * Called after single-stepping. p->addr is the address of the + * instruction whose first byte has been replaced by the "breakpoint" + * instruction. To avoid the SMP problems that can occur when we + * temporarily put back the original opcode to single-step, we + * single-stepped a copy of the instruction. The address of this + * copy is p->ainsn.insn. + */ +static void __kprobes resume_execution(struct kprobe *p, struct pt_regs *regs) +{ + int ret; + unsigned int insn = *p->ainsn.insn; + + regs->nip = (unsigned long)p->addr; + ret = emulate_step(regs, insn); + if (ret == 0) + regs->nip = (unsigned long)p->addr + 4; +} + +static inline int post_kprobe_handler(struct pt_regs *regs) +{ + struct kprobe *cur = kprobe_running(); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + if (!cur) + return 0; + + if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { + kcb->kprobe_status = KPROBE_HIT_SSDONE; + cur->post_handler(cur, regs, 0); + } + + resume_execution(cur, regs); + regs->msr |= kcb->kprobe_saved_msr; + + /*Restore back the original saved kprobes variables and continue. */ + if (kcb->kprobe_status == KPROBE_REENTER) { + restore_previous_kprobe(kcb); + goto out; + } + reset_current_kprobe(); +out: + preempt_enable_no_resched(); + + /* + * if somebody else is singlestepping across a probe point, msr + * will have SE set, in which case, continue the remaining processing + * of do_debug, as if this is not a probe hit. + */ + if (regs->msr & MSR_SE) + return 0; + + return 1; +} + +static inline int kprobe_fault_handler(struct pt_regs *regs, int trapnr) +{ + struct kprobe *cur = kprobe_running(); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr)) + return 1; + + if (kcb->kprobe_status & KPROBE_HIT_SS) { + resume_execution(cur, regs); + regs->msr &= ~MSR_SE; + regs->msr |= kcb->kprobe_saved_msr; + + reset_current_kprobe(); + preempt_enable_no_resched(); + } + return 0; +} + +/* + * Wrapper routine to for handling exceptions. + */ +int __kprobes kprobe_exceptions_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct die_args *args = (struct die_args *)data; + int ret = NOTIFY_DONE; + + switch (val) { + case DIE_BPT: + if (kprobe_handler(args->regs)) + ret = NOTIFY_STOP; + break; + case DIE_SSTEP: + if (post_kprobe_handler(args->regs)) + ret = NOTIFY_STOP; + break; + case DIE_PAGE_FAULT: + /* kprobe_running() needs smp_processor_id() */ + preempt_disable(); + if (kprobe_running() && + kprobe_fault_handler(args->regs, args->trapnr)) + ret = NOTIFY_STOP; + preempt_enable(); + break; + default: + break; + } + return ret; +} + +int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct jprobe *jp = container_of(p, struct jprobe, kp); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + memcpy(&kcb->jprobe_saved_regs, regs, sizeof(struct pt_regs)); + + /* setup return addr to the jprobe handler routine */ + regs->nip = (unsigned long)(((func_descr_t *)jp->entry)->entry); + regs->gpr[2] = (unsigned long)(((func_descr_t *)jp->entry)->toc); + + return 1; +} + +void __kprobes jprobe_return(void) +{ + asm volatile("trap" ::: "memory"); +} + +void __kprobes jprobe_return_end(void) +{ +}; + +int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + /* + * FIXME - we should ideally be validating that we got here 'cos + * of the "trap" in jprobe_return() above, before restoring the + * saved regs... + */ + memcpy(regs, &kcb->jprobe_saved_regs, sizeof(struct pt_regs)); + preempt_enable_no_resched(); + return 1; +} + +static struct kprobe trampoline_p = { + .addr = (kprobe_opcode_t *) &kretprobe_trampoline, + .pre_handler = trampoline_probe_handler +}; + +int __init arch_init_kprobes(void) +{ + return register_kprobe(&trampoline_p); +} diff --git a/arch/powerpc/kernel/lparcfg.c b/arch/powerpc/kernel/lparcfg.c index 1b3ba8a440a..9dda16ccde7 100644 --- a/arch/powerpc/kernel/lparcfg.c +++ b/arch/powerpc/kernel/lparcfg.c @@ -42,32 +42,6 @@ /* #define LPARCFG_DEBUG */ -/* find a better place for this function... */ -static void log_plpar_hcall_return(unsigned long rc, char *tag) -{ - if (rc == 0) /* success, return */ - return; -/* check for null tag ? */ - if (rc == H_Hardware) - printk(KERN_INFO - "plpar-hcall (%s) failed with hardware fault\n", tag); - else if (rc == H_Function) - printk(KERN_INFO - "plpar-hcall (%s) failed; function not allowed\n", tag); - else if (rc == H_Authority) - printk(KERN_INFO - "plpar-hcall (%s) failed; not authorized to this function\n", - tag); - else if (rc == H_Parameter) - printk(KERN_INFO "plpar-hcall (%s) failed; Bad parameter(s)\n", - tag); - else - printk(KERN_INFO - "plpar-hcall (%s) failed with unexpected rc(0x%lx)\n", - tag, rc); - -} - static struct proc_dir_entry *proc_ppc64_lparcfg; #define LPARCFG_BUFF_SIZE 4096 @@ -172,6 +146,31 @@ static int lparcfg_data(struct seq_file *m, void *v) /* * Methods used to fetch LPAR data when running on a pSeries platform. */ +/* find a better place for this function... */ +static void log_plpar_hcall_return(unsigned long rc, char *tag) +{ + if (rc == 0) /* success, return */ + return; +/* check for null tag ? */ + if (rc == H_Hardware) + printk(KERN_INFO + "plpar-hcall (%s) failed with hardware fault\n", tag); + else if (rc == H_Function) + printk(KERN_INFO + "plpar-hcall (%s) failed; function not allowed\n", tag); + else if (rc == H_Authority) + printk(KERN_INFO + "plpar-hcall (%s) failed; not authorized to this function\n", + tag); + else if (rc == H_Parameter) + printk(KERN_INFO "plpar-hcall (%s) failed; Bad parameter(s)\n", + tag); + else + printk(KERN_INFO + "plpar-hcall (%s) failed with unexpected rc(0x%lx)\n", + tag, rc); + +} /* * H_GET_PPP hcall returns info in 4 parms. diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c new file mode 100644 index 00000000000..97c51e452be --- /dev/null +++ b/arch/powerpc/kernel/machine_kexec_64.c @@ -0,0 +1,358 @@ +/* + * machine_kexec.c - handle transition of Linux booting another kernel + * + * Copyright (C) 2004-2005, IBM Corp. + * + * Created by: Milton D Miller II + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + + +#include <linux/cpumask.h> +#include <linux/kexec.h> +#include <linux/smp.h> +#include <linux/thread_info.h> +#include <linux/errno.h> + +#include <asm/page.h> +#include <asm/current.h> +#include <asm/machdep.h> +#include <asm/cacheflush.h> +#include <asm/paca.h> +#include <asm/mmu.h> +#include <asm/sections.h> /* _end */ +#include <asm/prom.h> +#include <asm/smp.h> + +#define HASH_GROUP_SIZE 0x80 /* size of each hash group, asm/mmu.h */ + +/* Have this around till we move it into crash specific file */ +note_buf_t crash_notes[NR_CPUS]; + +/* Dummy for now. Not sure if we need to have a crash shutdown in here + * and if what it will achieve. Letting it be now to compile the code + * in generic kexec environment + */ +void machine_crash_shutdown(struct pt_regs *regs) +{ + /* do nothing right now */ + /* smp_relase_cpus() if we want smp on panic kernel */ + /* cpu_irq_down to isolate us until we are ready */ +} + +int machine_kexec_prepare(struct kimage *image) +{ + int i; + unsigned long begin, end; /* limits of segment */ + unsigned long low, high; /* limits of blocked memory range */ + struct device_node *node; + unsigned long *basep; + unsigned int *sizep; + + if (!ppc_md.hpte_clear_all) + return -ENOENT; + + /* + * Since we use the kernel fault handlers and paging code to + * handle the virtual mode, we must make sure no destination + * overlaps kernel static data or bss. + */ + for (i = 0; i < image->nr_segments; i++) + if (image->segment[i].mem < __pa(_end)) + return -ETXTBSY; + + /* + * For non-LPAR, we absolutely can not overwrite the mmu hash + * table, since we are still using the bolted entries in it to + * do the copy. Check that here. + * + * It is safe if the end is below the start of the blocked + * region (end <= low), or if the beginning is after the + * end of the blocked region (begin >= high). Use the + * boolean identity !(a || b) === (!a && !b). + */ + if (htab_address) { + low = __pa(htab_address); + high = low + (htab_hash_mask + 1) * HASH_GROUP_SIZE; + + for (i = 0; i < image->nr_segments; i++) { + begin = image->segment[i].mem; + end = begin + image->segment[i].memsz; + + if ((begin < high) && (end > low)) + return -ETXTBSY; + } + } + + /* We also should not overwrite the tce tables */ + for (node = of_find_node_by_type(NULL, "pci"); node != NULL; + node = of_find_node_by_type(node, "pci")) { + basep = (unsigned long *)get_property(node, "linux,tce-base", + NULL); + sizep = (unsigned int *)get_property(node, "linux,tce-size", + NULL); + if (basep == NULL || sizep == NULL) + continue; + + low = *basep; + high = low + (*sizep); + + for (i = 0; i < image->nr_segments; i++) { + begin = image->segment[i].mem; + end = begin + image->segment[i].memsz; + + if ((begin < high) && (end > low)) + return -ETXTBSY; + } + } + + return 0; +} + +void machine_kexec_cleanup(struct kimage *image) +{ + /* we do nothing in prepare that needs to be undone */ +} + +#define IND_FLAGS (IND_DESTINATION | IND_INDIRECTION | IND_DONE | IND_SOURCE) + +static void copy_segments(unsigned long ind) +{ + unsigned long entry; + unsigned long *ptr; + void *dest; + void *addr; + + /* + * We rely on kexec_load to create a lists that properly + * initializes these pointers before they are used. + * We will still crash if the list is wrong, but at least + * the compiler will be quiet. + */ + ptr = NULL; + dest = NULL; + + for (entry = ind; !(entry & IND_DONE); entry = *ptr++) { + addr = __va(entry & PAGE_MASK); + + switch (entry & IND_FLAGS) { + case IND_DESTINATION: + dest = addr; + break; + case IND_INDIRECTION: + ptr = addr; + break; + case IND_SOURCE: + copy_page(dest, addr); + dest += PAGE_SIZE; + } + } +} + +void kexec_copy_flush(struct kimage *image) +{ + long i, nr_segments = image->nr_segments; + struct kexec_segment ranges[KEXEC_SEGMENT_MAX]; + + /* save the ranges on the stack to efficiently flush the icache */ + memcpy(ranges, image->segment, sizeof(ranges)); + + /* + * After this call we may not use anything allocated in dynamic + * memory, including *image. + * + * Only globals and the stack are allowed. + */ + copy_segments(image->head); + + /* + * we need to clear the icache for all dest pages sometime, + * including ones that were in place on the original copy + */ + for (i = 0; i < nr_segments; i++) + flush_icache_range(ranges[i].mem + KERNELBASE, + ranges[i].mem + KERNELBASE + + ranges[i].memsz); +} + +#ifdef CONFIG_SMP + +/* FIXME: we should schedule this function to be called on all cpus based + * on calling the interrupts, but we would like to call it off irq level + * so that the interrupt controller is clean. + */ +void kexec_smp_down(void *arg) +{ + if (ppc_md.kexec_cpu_down) + ppc_md.kexec_cpu_down(0, 1); + + local_irq_disable(); + kexec_smp_wait(); + /* NOTREACHED */ +} + +static void kexec_prepare_cpus(void) +{ + int my_cpu, i, notified=-1; + + smp_call_function(kexec_smp_down, NULL, 0, /* wait */0); + my_cpu = get_cpu(); + + /* check the others cpus are now down (via paca hw cpu id == -1) */ + for (i=0; i < NR_CPUS; i++) { + if (i == my_cpu) + continue; + + while (paca[i].hw_cpu_id != -1) { + barrier(); + if (!cpu_possible(i)) { + printk("kexec: cpu %d hw_cpu_id %d is not" + " possible, ignoring\n", + i, paca[i].hw_cpu_id); + break; + } + if (!cpu_online(i)) { + /* Fixme: this can be spinning in + * pSeries_secondary_wait with a paca + * waiting for it to go online. + */ + printk("kexec: cpu %d hw_cpu_id %d is not" + " online, ignoring\n", + i, paca[i].hw_cpu_id); + break; + } + if (i != notified) { + printk( "kexec: waiting for cpu %d (physical" + " %d) to go down\n", + i, paca[i].hw_cpu_id); + notified = i; + } + } + } + + /* after we tell the others to go down */ + if (ppc_md.kexec_cpu_down) + ppc_md.kexec_cpu_down(0, 0); + + put_cpu(); + + local_irq_disable(); +} + +#else /* ! SMP */ + +static void kexec_prepare_cpus(void) +{ + /* + * move the secondarys to us so that we can copy + * the new kernel 0-0x100 safely + * + * do this if kexec in setup.c ? + * + * We need to release the cpus if we are ever going from an + * UP to an SMP kernel. + */ + smp_release_cpus(); + if (ppc_md.kexec_cpu_down) + ppc_md.kexec_cpu_down(0, 0); + local_irq_disable(); +} + +#endif /* SMP */ + +/* + * kexec thread structure and stack. + * + * We need to make sure that this is 16384-byte aligned due to the + * way process stacks are handled. It also must be statically allocated + * or allocated as part of the kimage, because everything else may be + * overwritten when we copy the kexec image. We piggyback on the + * "init_task" linker section here to statically allocate a stack. + * + * We could use a smaller stack if we don't care about anything using + * current, but that audit has not been performed. + */ +union thread_union kexec_stack + __attribute__((__section__(".data.init_task"))) = { }; + +/* Our assembly helper, in kexec_stub.S */ +extern NORET_TYPE void kexec_sequence(void *newstack, unsigned long start, + void *image, void *control, + void (*clear_all)(void)) ATTRIB_NORET; + +/* too late to fail here */ +void machine_kexec(struct kimage *image) +{ + + /* prepare control code if any */ + + /* shutdown other cpus into our wait loop and quiesce interrupts */ + kexec_prepare_cpus(); + + /* switch to a staticly allocated stack. Based on irq stack code. + * XXX: the task struct will likely be invalid once we do the copy! + */ + kexec_stack.thread_info.task = current_thread_info()->task; + kexec_stack.thread_info.flags = 0; + + /* Some things are best done in assembly. Finding globals with + * a toc is easier in C, so pass in what we can. + */ + kexec_sequence(&kexec_stack, image->start, image, + page_address(image->control_code_page), + ppc_md.hpte_clear_all); + /* NOTREACHED */ +} + +/* Values we need to export to the second kernel via the device tree. */ +static unsigned long htab_base, htab_size, kernel_end; + +static struct property htab_base_prop = { + .name = "linux,htab-base", + .length = sizeof(unsigned long), + .value = (unsigned char *)&htab_base, +}; + +static struct property htab_size_prop = { + .name = "linux,htab-size", + .length = sizeof(unsigned long), + .value = (unsigned char *)&htab_size, +}; + +static struct property kernel_end_prop = { + .name = "linux,kernel-end", + .length = sizeof(unsigned long), + .value = (unsigned char *)&kernel_end, +}; + +static void __init export_htab_values(void) +{ + struct device_node *node; + + node = of_find_node_by_path("/chosen"); + if (!node) + return; + + kernel_end = __pa(_end); + prom_add_property(node, &kernel_end_prop); + + /* On machines with no htab htab_address is NULL */ + if (NULL == htab_address) + goto out; + + htab_base = __pa(htab_address); + prom_add_property(node, &htab_base_prop); + + htab_size = 1UL << ppc64_pft_size; + prom_add_property(node, &htab_size_prop); + + out: + of_node_put(node); +} + +void __init kexec_setup(void) +{ + export_htab_values(); +} diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index f6d84a75ed2..624a983a967 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -27,14 +27,6 @@ .text - .align 5 -_GLOBAL(__delay) - cmpwi 0,r3,0 - mtctr r3 - beqlr -1: bdnz 1b - blr - /* * This returns the high 64 bits of the product of two 64-bit numbers. */ diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c new file mode 100644 index 00000000000..928b8581fcb --- /dev/null +++ b/arch/powerpc/kernel/module_64.c @@ -0,0 +1,455 @@ +/* Kernel module help for PPC64. + Copyright (C) 2001, 2003 Rusty Russell IBM Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ +#include <linux/module.h> +#include <linux/elf.h> +#include <linux/moduleloader.h> +#include <linux/err.h> +#include <linux/vmalloc.h> +#include <asm/module.h> +#include <asm/uaccess.h> + +/* FIXME: We don't do .init separately. To do this, we'd need to have + a separate r2 value in the init and core section, and stub between + them, too. + + Using a magic allocator which places modules within 32MB solves + this, and makes other things simpler. Anton? + --RR. */ +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(fmt , ...) +#endif + +/* There's actually a third entry here, but it's unused */ +struct ppc64_opd_entry +{ + unsigned long funcaddr; + unsigned long r2; +}; + +/* Like PPC32, we need little trampolines to do > 24-bit jumps (into + the kernel itself). But on PPC64, these need to be used for every + jump, actually, to reset r2 (TOC+0x8000). */ +struct ppc64_stub_entry +{ + /* 28 byte jump instruction sequence (7 instructions) */ + unsigned char jump[28]; + unsigned char unused[4]; + /* Data for the above code */ + struct ppc64_opd_entry opd; +}; + +/* We use a stub to fix up r2 (TOC ptr) and to jump to the (external) + function which may be more than 24-bits away. We could simply + patch the new r2 value and function pointer into the stub, but it's + significantly shorter to put these values at the end of the stub + code, and patch the stub address (32-bits relative to the TOC ptr, + r2) into the stub. */ +static struct ppc64_stub_entry ppc64_stub = +{ .jump = { + 0x3d, 0x82, 0x00, 0x00, /* addis r12,r2, <high> */ + 0x39, 0x8c, 0x00, 0x00, /* addi r12,r12, <low> */ + /* Save current r2 value in magic place on the stack. */ + 0xf8, 0x41, 0x00, 0x28, /* std r2,40(r1) */ + 0xe9, 0x6c, 0x00, 0x20, /* ld r11,32(r12) */ + 0xe8, 0x4c, 0x00, 0x28, /* ld r2,40(r12) */ + 0x7d, 0x69, 0x03, 0xa6, /* mtctr r11 */ + 0x4e, 0x80, 0x04, 0x20 /* bctr */ +} }; + +/* Count how many different 24-bit relocations (different symbol, + different addend) */ +static unsigned int count_relocs(const Elf64_Rela *rela, unsigned int num) +{ + unsigned int i, j, ret = 0; + + /* FIXME: Only count external ones --RR */ + /* Sure, this is order(n^2), but it's usually short, and not + time critical */ + for (i = 0; i < num; i++) { + /* Only count 24-bit relocs, others don't need stubs */ + if (ELF64_R_TYPE(rela[i].r_info) != R_PPC_REL24) + continue; + for (j = 0; j < i; j++) { + /* If this addend appeared before, it's + already been counted */ + if (rela[i].r_info == rela[j].r_info + && rela[i].r_addend == rela[j].r_addend) + break; + } + if (j == i) ret++; + } + return ret; +} + +void *module_alloc(unsigned long size) +{ + if (size == 0) + return NULL; + + return vmalloc_exec(size); +} + +/* Free memory returned from module_alloc */ +void module_free(struct module *mod, void *module_region) +{ + vfree(module_region); + /* FIXME: If module_region == mod->init_region, trim exception + table entries. */ +} + +/* Get size of potential trampolines required. */ +static unsigned long get_stubs_size(const Elf64_Ehdr *hdr, + const Elf64_Shdr *sechdrs) +{ + /* One extra reloc so it's always 0-funcaddr terminated */ + unsigned long relocs = 1; + unsigned i; + + /* Every relocated section... */ + for (i = 1; i < hdr->e_shnum; i++) { + if (sechdrs[i].sh_type == SHT_RELA) { + DEBUGP("Found relocations in section %u\n", i); + DEBUGP("Ptr: %p. Number: %lu\n", + (void *)sechdrs[i].sh_addr, + sechdrs[i].sh_size / sizeof(Elf64_Rela)); + relocs += count_relocs((void *)sechdrs[i].sh_addr, + sechdrs[i].sh_size + / sizeof(Elf64_Rela)); + } + } + + DEBUGP("Looks like a total of %lu stubs, max\n", relocs); + return relocs * sizeof(struct ppc64_stub_entry); +} + +static void dedotify_versions(struct modversion_info *vers, + unsigned long size) +{ + struct modversion_info *end; + + for (end = (void *)vers + size; vers < end; vers++) + if (vers->name[0] == '.') + memmove(vers->name, vers->name+1, strlen(vers->name)); +} + +/* Undefined symbols which refer to .funcname, hack to funcname */ +static void dedotify(Elf64_Sym *syms, unsigned int numsyms, char *strtab) +{ + unsigned int i; + + for (i = 1; i < numsyms; i++) { + if (syms[i].st_shndx == SHN_UNDEF) { + char *name = strtab + syms[i].st_name; + if (name[0] == '.') + memmove(name, name+1, strlen(name)); + } + } +} + +int module_frob_arch_sections(Elf64_Ehdr *hdr, + Elf64_Shdr *sechdrs, + char *secstrings, + struct module *me) +{ + unsigned int i; + + /* Find .toc and .stubs sections, symtab and strtab */ + for (i = 1; i < hdr->e_shnum; i++) { + char *p; + if (strcmp(secstrings + sechdrs[i].sh_name, ".stubs") == 0) + me->arch.stubs_section = i; + else if (strcmp(secstrings + sechdrs[i].sh_name, ".toc") == 0) + me->arch.toc_section = i; + else if (strcmp(secstrings+sechdrs[i].sh_name,"__versions")==0) + dedotify_versions((void *)hdr + sechdrs[i].sh_offset, + sechdrs[i].sh_size); + + /* We don't handle .init for the moment: rename to _init */ + while ((p = strstr(secstrings + sechdrs[i].sh_name, ".init"))) + p[0] = '_'; + + if (sechdrs[i].sh_type == SHT_SYMTAB) + dedotify((void *)hdr + sechdrs[i].sh_offset, + sechdrs[i].sh_size / sizeof(Elf64_Sym), + (void *)hdr + + sechdrs[sechdrs[i].sh_link].sh_offset); + } + if (!me->arch.stubs_section || !me->arch.toc_section) { + printk("%s: doesn't contain .toc or .stubs.\n", me->name); + return -ENOEXEC; + } + + /* Override the stubs size */ + sechdrs[me->arch.stubs_section].sh_size = get_stubs_size(hdr, sechdrs); + return 0; +} + +int apply_relocate(Elf64_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + printk(KERN_ERR "%s: Non-ADD RELOCATION unsupported\n", me->name); + return -ENOEXEC; +} + +/* r2 is the TOC pointer: it actually points 0x8000 into the TOC (this + gives the value maximum span in an instruction which uses a signed + offset) */ +static inline unsigned long my_r2(Elf64_Shdr *sechdrs, struct module *me) +{ + return sechdrs[me->arch.toc_section].sh_addr + 0x8000; +} + +/* Both low and high 16 bits are added as SIGNED additions, so if low + 16 bits has high bit set, high 16 bits must be adjusted. These + macros do that (stolen from binutils). */ +#define PPC_LO(v) ((v) & 0xffff) +#define PPC_HI(v) (((v) >> 16) & 0xffff) +#define PPC_HA(v) PPC_HI ((v) + 0x8000) + +/* Patch stub to reference function and correct r2 value. */ +static inline int create_stub(Elf64_Shdr *sechdrs, + struct ppc64_stub_entry *entry, + struct ppc64_opd_entry *opd, + struct module *me) +{ + Elf64_Half *loc1, *loc2; + long reladdr; + + *entry = ppc64_stub; + + loc1 = (Elf64_Half *)&entry->jump[2]; + loc2 = (Elf64_Half *)&entry->jump[6]; + + /* Stub uses address relative to r2. */ + reladdr = (unsigned long)entry - my_r2(sechdrs, me); + if (reladdr > 0x7FFFFFFF || reladdr < -(0x80000000L)) { + printk("%s: Address %p of stub out of range of %p.\n", + me->name, (void *)reladdr, (void *)my_r2); + return 0; + } + DEBUGP("Stub %p get data from reladdr %li\n", entry, reladdr); + + *loc1 = PPC_HA(reladdr); + *loc2 = PPC_LO(reladdr); + entry->opd.funcaddr = opd->funcaddr; + entry->opd.r2 = opd->r2; + return 1; +} + +/* Create stub to jump to function described in this OPD: we need the + stub to set up the TOC ptr (r2) for the function. */ +static unsigned long stub_for_addr(Elf64_Shdr *sechdrs, + unsigned long opdaddr, + struct module *me) +{ + struct ppc64_stub_entry *stubs; + struct ppc64_opd_entry *opd = (void *)opdaddr; + unsigned int i, num_stubs; + + num_stubs = sechdrs[me->arch.stubs_section].sh_size / sizeof(*stubs); + + /* Find this stub, or if that fails, the next avail. entry */ + stubs = (void *)sechdrs[me->arch.stubs_section].sh_addr; + for (i = 0; stubs[i].opd.funcaddr; i++) { + BUG_ON(i >= num_stubs); + + if (stubs[i].opd.funcaddr == opd->funcaddr) + return (unsigned long)&stubs[i]; + } + + if (!create_stub(sechdrs, &stubs[i], opd, me)) + return 0; + + return (unsigned long)&stubs[i]; +} + +/* We expect a noop next: if it is, replace it with instruction to + restore r2. */ +static int restore_r2(u32 *instruction, struct module *me) +{ + if (*instruction != 0x60000000) { + printk("%s: Expect noop after relocate, got %08x\n", + me->name, *instruction); + return 0; + } + *instruction = 0xe8410028; /* ld r2,40(r1) */ + return 1; +} + +int apply_relocate_add(Elf64_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + unsigned int i; + Elf64_Rela *rela = (void *)sechdrs[relsec].sh_addr; + Elf64_Sym *sym; + unsigned long *location; + unsigned long value; + + DEBUGP("Applying ADD relocate section %u to %u\n", relsec, + sechdrs[relsec].sh_info); + for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rela); i++) { + /* This is where to make the change */ + location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr + + rela[i].r_offset; + /* This is the symbol it is referring to */ + sym = (Elf64_Sym *)sechdrs[symindex].sh_addr + + ELF64_R_SYM(rela[i].r_info); + + DEBUGP("RELOC at %p: %li-type as %s (%lu) + %li\n", + location, (long)ELF64_R_TYPE(rela[i].r_info), + strtab + sym->st_name, (unsigned long)sym->st_value, + (long)rela[i].r_addend); + + /* `Everything is relative'. */ + value = sym->st_value + rela[i].r_addend; + + switch (ELF64_R_TYPE(rela[i].r_info)) { + case R_PPC64_ADDR32: + /* Simply set it */ + *(u32 *)location = value; + break; + + case R_PPC64_ADDR64: + /* Simply set it */ + *(unsigned long *)location = value; + break; + + case R_PPC64_TOC: + *(unsigned long *)location = my_r2(sechdrs, me); + break; + + case R_PPC64_TOC16: + /* Subtact TOC pointer */ + value -= my_r2(sechdrs, me); + if (value + 0x8000 > 0xffff) { + printk("%s: bad TOC16 relocation (%lu)\n", + me->name, value); + return -ENOEXEC; + } + *((uint16_t *) location) + = (*((uint16_t *) location) & ~0xffff) + | (value & 0xffff); + break; + + case R_PPC64_TOC16_DS: + /* Subtact TOC pointer */ + value -= my_r2(sechdrs, me); + if ((value & 3) != 0 || value + 0x8000 > 0xffff) { + printk("%s: bad TOC16_DS relocation (%lu)\n", + me->name, value); + return -ENOEXEC; + } + *((uint16_t *) location) + = (*((uint16_t *) location) & ~0xfffc) + | (value & 0xfffc); + break; + + case R_PPC_REL24: + /* FIXME: Handle weak symbols here --RR */ + if (sym->st_shndx == SHN_UNDEF) { + /* External: go via stub */ + value = stub_for_addr(sechdrs, value, me); + if (!value) + return -ENOENT; + if (!restore_r2((u32 *)location + 1, me)) + return -ENOEXEC; + } + + /* Convert value to relative */ + value -= (unsigned long)location; + if (value + 0x2000000 > 0x3ffffff || (value & 3) != 0){ + printk("%s: REL24 %li out of range!\n", + me->name, (long int)value); + return -ENOEXEC; + } + + /* Only replace bits 2 through 26 */ + *(uint32_t *)location + = (*(uint32_t *)location & ~0x03fffffc) + | (value & 0x03fffffc); + break; + + default: + printk("%s: Unknown ADD relocation: %lu\n", + me->name, + (unsigned long)ELF64_R_TYPE(rela[i].r_info)); + return -ENOEXEC; + } + } + + return 0; +} + +LIST_HEAD(module_bug_list); + +int module_finalize(const Elf_Ehdr *hdr, + const Elf_Shdr *sechdrs, struct module *me) +{ + char *secstrings; + unsigned int i; + + me->arch.bug_table = NULL; + me->arch.num_bugs = 0; + + /* Find the __bug_table section, if present */ + secstrings = (char *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; + for (i = 1; i < hdr->e_shnum; i++) { + if (strcmp(secstrings+sechdrs[i].sh_name, "__bug_table")) + continue; + me->arch.bug_table = (void *) sechdrs[i].sh_addr; + me->arch.num_bugs = sechdrs[i].sh_size / sizeof(struct bug_entry); + break; + } + + /* + * Strictly speaking this should have a spinlock to protect against + * traversals, but since we only traverse on BUG()s, a spinlock + * could potentially lead to deadlock and thus be counter-productive. + */ + list_add(&me->arch.bug_list, &module_bug_list); + + return 0; +} + +void module_arch_cleanup(struct module *mod) +{ + list_del(&mod->arch.bug_list); +} + +struct bug_entry *module_find_bug(unsigned long bugaddr) +{ + struct mod_arch_specific *mod; + unsigned int i; + struct bug_entry *bug; + + list_for_each_entry(mod, &module_bug_list, bug_list) { + bug = mod->bug_table; + for (i = 0; i < mod->num_bugs; ++i, ++bug) + if (bugaddr == bug->bug_addr) + return bug; + } + return NULL; +} diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c new file mode 100644 index 00000000000..c0fcd29918c --- /dev/null +++ b/arch/powerpc/kernel/nvram_64.c @@ -0,0 +1,742 @@ +/* + * c 2001 PPC 64 Team, IBM Corp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * /dev/nvram driver for PPC64 + * + * This perhaps should live in drivers/char + * + * TODO: Split the /dev/nvram part (that one can use + * drivers/char/generic_nvram.c) from the arch & partition + * parsing code. + */ + +#include <linux/module.h> + +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/miscdevice.h> +#include <linux/fcntl.h> +#include <linux/nvram.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <asm/uaccess.h> +#include <asm/nvram.h> +#include <asm/rtas.h> +#include <asm/prom.h> +#include <asm/machdep.h> + +#undef DEBUG_NVRAM + +static int nvram_scan_partitions(void); +static int nvram_setup_partition(void); +static int nvram_create_os_partition(void); +static int nvram_remove_os_partition(void); + +static struct nvram_partition * nvram_part; +static long nvram_error_log_index = -1; +static long nvram_error_log_size = 0; + +int no_logging = 1; /* Until we initialize everything, + * make sure we don't try logging + * anything */ + +extern volatile int error_log_cnt; + +struct err_log_info { + int error_type; + unsigned int seq_num; +}; + +static loff_t dev_nvram_llseek(struct file *file, loff_t offset, int origin) +{ + int size; + + if (ppc_md.nvram_size == NULL) + return -ENODEV; + size = ppc_md.nvram_size(); + + switch (origin) { + case 1: + offset += file->f_pos; + break; + case 2: + offset += size; + break; + } + if (offset < 0) + return -EINVAL; + file->f_pos = offset; + return file->f_pos; +} + + +static ssize_t dev_nvram_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + ssize_t len; + char *tmp_buffer; + int size; + + if (ppc_md.nvram_size == NULL) + return -ENODEV; + size = ppc_md.nvram_size(); + + if (!access_ok(VERIFY_WRITE, buf, count)) + return -EFAULT; + if (*ppos >= size) + return 0; + if (count > size) + count = size; + + tmp_buffer = (char *) kmalloc(count, GFP_KERNEL); + if (!tmp_buffer) { + printk(KERN_ERR "dev_read_nvram: kmalloc failed\n"); + return -ENOMEM; + } + + len = ppc_md.nvram_read(tmp_buffer, count, ppos); + if ((long)len <= 0) { + kfree(tmp_buffer); + return len; + } + + if (copy_to_user(buf, tmp_buffer, len)) { + kfree(tmp_buffer); + return -EFAULT; + } + + kfree(tmp_buffer); + return len; + +} + +static ssize_t dev_nvram_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + ssize_t len; + char * tmp_buffer; + int size; + + if (ppc_md.nvram_size == NULL) + return -ENODEV; + size = ppc_md.nvram_size(); + + if (!access_ok(VERIFY_READ, buf, count)) + return -EFAULT; + if (*ppos >= size) + return 0; + if (count > size) + count = size; + + tmp_buffer = (char *) kmalloc(count, GFP_KERNEL); + if (!tmp_buffer) { + printk(KERN_ERR "dev_nvram_write: kmalloc failed\n"); + return -ENOMEM; + } + + if (copy_from_user(tmp_buffer, buf, count)) { + kfree(tmp_buffer); + return -EFAULT; + } + + len = ppc_md.nvram_write(tmp_buffer, count, ppos); + if ((long)len <= 0) { + kfree(tmp_buffer); + return len; + } + + kfree(tmp_buffer); + return len; +} + +static int dev_nvram_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + switch(cmd) { +#ifdef CONFIG_PPC_PMAC + case OBSOLETE_PMAC_NVRAM_GET_OFFSET: + printk(KERN_WARNING "nvram: Using obsolete PMAC_NVRAM_GET_OFFSET ioctl\n"); + case IOC_NVRAM_GET_OFFSET: { + int part, offset; + + if (_machine != PLATFORM_POWERMAC) + return -EINVAL; + if (copy_from_user(&part, (void __user*)arg, sizeof(part)) != 0) + return -EFAULT; + if (part < pmac_nvram_OF || part > pmac_nvram_NR) + return -EINVAL; + offset = pmac_get_partition(part); + if (offset < 0) + return offset; + if (copy_to_user((void __user*)arg, &offset, sizeof(offset)) != 0) + return -EFAULT; + return 0; + } +#endif /* CONFIG_PPC_PMAC */ + } + return -EINVAL; +} + +struct file_operations nvram_fops = { + .owner = THIS_MODULE, + .llseek = dev_nvram_llseek, + .read = dev_nvram_read, + .write = dev_nvram_write, + .ioctl = dev_nvram_ioctl, +}; + +static struct miscdevice nvram_dev = { + NVRAM_MINOR, + "nvram", + &nvram_fops +}; + + +#ifdef DEBUG_NVRAM +static void nvram_print_partitions(char * label) +{ + struct list_head * p; + struct nvram_partition * tmp_part; + + printk(KERN_WARNING "--------%s---------\n", label); + printk(KERN_WARNING "indx\t\tsig\tchks\tlen\tname\n"); + list_for_each(p, &nvram_part->partition) { + tmp_part = list_entry(p, struct nvram_partition, partition); + printk(KERN_WARNING "%d \t%02x\t%02x\t%d\t%s\n", + tmp_part->index, tmp_part->header.signature, + tmp_part->header.checksum, tmp_part->header.length, + tmp_part->header.name); + } +} +#endif + + +static int nvram_write_header(struct nvram_partition * part) +{ + loff_t tmp_index; + int rc; + + tmp_index = part->index; + rc = ppc_md.nvram_write((char *)&part->header, NVRAM_HEADER_LEN, &tmp_index); + + return rc; +} + + +static unsigned char nvram_checksum(struct nvram_header *p) +{ + unsigned int c_sum, c_sum2; + unsigned short *sp = (unsigned short *)p->name; /* assume 6 shorts */ + c_sum = p->signature + p->length + sp[0] + sp[1] + sp[2] + sp[3] + sp[4] + sp[5]; + + /* The sum may have spilled into the 3rd byte. Fold it back. */ + c_sum = ((c_sum & 0xffff) + (c_sum >> 16)) & 0xffff; + /* The sum cannot exceed 2 bytes. Fold it into a checksum */ + c_sum2 = (c_sum >> 8) + (c_sum << 8); + c_sum = ((c_sum + c_sum2) >> 8) & 0xff; + return c_sum; +} + + +/* + * Find an nvram partition, sig can be 0 for any + * partition or name can be NULL for any name, else + * tries to match both + */ +struct nvram_partition *nvram_find_partition(int sig, const char *name) +{ + struct nvram_partition * part; + struct list_head * p; + + list_for_each(p, &nvram_part->partition) { + part = list_entry(p, struct nvram_partition, partition); + + if (sig && part->header.signature != sig) + continue; + if (name && 0 != strncmp(name, part->header.name, 12)) + continue; + return part; + } + return NULL; +} +EXPORT_SYMBOL(nvram_find_partition); + + +static int nvram_remove_os_partition(void) +{ + struct list_head *i; + struct list_head *j; + struct nvram_partition * part; + struct nvram_partition * cur_part; + int rc; + + list_for_each(i, &nvram_part->partition) { + part = list_entry(i, struct nvram_partition, partition); + if (part->header.signature != NVRAM_SIG_OS) + continue; + + /* Make os partition a free partition */ + part->header.signature = NVRAM_SIG_FREE; + sprintf(part->header.name, "wwwwwwwwwwww"); + part->header.checksum = nvram_checksum(&part->header); + + /* Merge contiguous free partitions backwards */ + list_for_each_prev(j, &part->partition) { + cur_part = list_entry(j, struct nvram_partition, partition); + if (cur_part == nvram_part || cur_part->header.signature != NVRAM_SIG_FREE) { + break; + } + + part->header.length += cur_part->header.length; + part->header.checksum = nvram_checksum(&part->header); + part->index = cur_part->index; + + list_del(&cur_part->partition); + kfree(cur_part); + j = &part->partition; /* fixup our loop */ + } + + /* Merge contiguous free partitions forwards */ + list_for_each(j, &part->partition) { + cur_part = list_entry(j, struct nvram_partition, partition); + if (cur_part == nvram_part || cur_part->header.signature != NVRAM_SIG_FREE) { + break; + } + + part->header.length += cur_part->header.length; + part->header.checksum = nvram_checksum(&part->header); + + list_del(&cur_part->partition); + kfree(cur_part); + j = &part->partition; /* fixup our loop */ + } + + rc = nvram_write_header(part); + if (rc <= 0) { + printk(KERN_ERR "nvram_remove_os_partition: nvram_write failed (%d)\n", rc); + return rc; + } + + } + + return 0; +} + +/* nvram_create_os_partition + * + * Create a OS linux partition to buffer error logs. + * Will create a partition starting at the first free + * space found if space has enough room. + */ +static int nvram_create_os_partition(void) +{ + struct nvram_partition *part; + struct nvram_partition *new_part; + struct nvram_partition *free_part = NULL; + int seq_init[2] = { 0, 0 }; + loff_t tmp_index; + long size = 0; + int rc; + + /* Find a free partition that will give us the maximum needed size + If can't find one that will give us the minimum size needed */ + list_for_each_entry(part, &nvram_part->partition, partition) { + if (part->header.signature != NVRAM_SIG_FREE) + continue; + + if (part->header.length >= NVRAM_MAX_REQ) { + size = NVRAM_MAX_REQ; + free_part = part; + break; + } + if (!size && part->header.length >= NVRAM_MIN_REQ) { + size = NVRAM_MIN_REQ; + free_part = part; + } + } + if (!size) + return -ENOSPC; + + /* Create our OS partition */ + new_part = kmalloc(sizeof(*new_part), GFP_KERNEL); + if (!new_part) { + printk(KERN_ERR "nvram_create_os_partition: kmalloc failed\n"); + return -ENOMEM; + } + + new_part->index = free_part->index; + new_part->header.signature = NVRAM_SIG_OS; + new_part->header.length = size; + strcpy(new_part->header.name, "ppc64,linux"); + new_part->header.checksum = nvram_checksum(&new_part->header); + + rc = nvram_write_header(new_part); + if (rc <= 0) { + printk(KERN_ERR "nvram_create_os_partition: nvram_write_header \ + failed (%d)\n", rc); + return rc; + } + + /* make sure and initialize to zero the sequence number and the error + type logged */ + tmp_index = new_part->index + NVRAM_HEADER_LEN; + rc = ppc_md.nvram_write((char *)&seq_init, sizeof(seq_init), &tmp_index); + if (rc <= 0) { + printk(KERN_ERR "nvram_create_os_partition: nvram_write " + "failed (%d)\n", rc); + return rc; + } + + nvram_error_log_index = new_part->index + NVRAM_HEADER_LEN; + nvram_error_log_size = ((part->header.length - 1) * + NVRAM_BLOCK_LEN) - sizeof(struct err_log_info); + + list_add_tail(&new_part->partition, &free_part->partition); + + if (free_part->header.length <= size) { + list_del(&free_part->partition); + kfree(free_part); + return 0; + } + + /* Adjust the partition we stole the space from */ + free_part->index += size * NVRAM_BLOCK_LEN; + free_part->header.length -= size; + free_part->header.checksum = nvram_checksum(&free_part->header); + + rc = nvram_write_header(free_part); + if (rc <= 0) { + printk(KERN_ERR "nvram_create_os_partition: nvram_write_header " + "failed (%d)\n", rc); + return rc; + } + + return 0; +} + + +/* nvram_setup_partition + * + * This will setup the partition we need for buffering the + * error logs and cleanup partitions if needed. + * + * The general strategy is the following: + * 1.) If there is ppc64,linux partition large enough then use it. + * 2.) If there is not a ppc64,linux partition large enough, search + * for a free partition that is large enough. + * 3.) If there is not a free partition large enough remove + * _all_ OS partitions and consolidate the space. + * 4.) Will first try getting a chunk that will satisfy the maximum + * error log size (NVRAM_MAX_REQ). + * 5.) If the max chunk cannot be allocated then try finding a chunk + * that will satisfy the minum needed (NVRAM_MIN_REQ). + */ +static int nvram_setup_partition(void) +{ + struct list_head * p; + struct nvram_partition * part; + int rc; + + /* For now, we don't do any of this on pmac, until I + * have figured out if it's worth killing some unused stuffs + * in our nvram, as Apple defined partitions use pretty much + * all of the space + */ + if (_machine == PLATFORM_POWERMAC) + return -ENOSPC; + + /* see if we have an OS partition that meets our needs. + will try getting the max we need. If not we'll delete + partitions and try again. */ + list_for_each(p, &nvram_part->partition) { + part = list_entry(p, struct nvram_partition, partition); + if (part->header.signature != NVRAM_SIG_OS) + continue; + + if (strcmp(part->header.name, "ppc64,linux")) + continue; + + if (part->header.length >= NVRAM_MIN_REQ) { + /* found our partition */ + nvram_error_log_index = part->index + NVRAM_HEADER_LEN; + nvram_error_log_size = ((part->header.length - 1) * + NVRAM_BLOCK_LEN) - sizeof(struct err_log_info); + return 0; + } + } + + /* try creating a partition with the free space we have */ + rc = nvram_create_os_partition(); + if (!rc) { + return 0; + } + + /* need to free up some space */ + rc = nvram_remove_os_partition(); + if (rc) { + return rc; + } + + /* create a partition in this new space */ + rc = nvram_create_os_partition(); + if (rc) { + printk(KERN_ERR "nvram_create_os_partition: Could not find a " + "NVRAM partition large enough\n"); + return rc; + } + + return 0; +} + + +static int nvram_scan_partitions(void) +{ + loff_t cur_index = 0; + struct nvram_header phead; + struct nvram_partition * tmp_part; + unsigned char c_sum; + char * header; + int total_size; + int err; + + if (ppc_md.nvram_size == NULL) + return -ENODEV; + total_size = ppc_md.nvram_size(); + + header = (char *) kmalloc(NVRAM_HEADER_LEN, GFP_KERNEL); + if (!header) { + printk(KERN_ERR "nvram_scan_partitions: Failed kmalloc\n"); + return -ENOMEM; + } + + while (cur_index < total_size) { + + err = ppc_md.nvram_read(header, NVRAM_HEADER_LEN, &cur_index); + if (err != NVRAM_HEADER_LEN) { + printk(KERN_ERR "nvram_scan_partitions: Error parsing " + "nvram partitions\n"); + goto out; + } + + cur_index -= NVRAM_HEADER_LEN; /* nvram_read will advance us */ + + memcpy(&phead, header, NVRAM_HEADER_LEN); + + err = 0; + c_sum = nvram_checksum(&phead); + if (c_sum != phead.checksum) { + printk(KERN_WARNING "WARNING: nvram partition checksum" + " was %02x, should be %02x!\n", + phead.checksum, c_sum); + printk(KERN_WARNING "Terminating nvram partition scan\n"); + goto out; + } + if (!phead.length) { + printk(KERN_WARNING "WARNING: nvram corruption " + "detected: 0-length partition\n"); + goto out; + } + tmp_part = (struct nvram_partition *) + kmalloc(sizeof(struct nvram_partition), GFP_KERNEL); + err = -ENOMEM; + if (!tmp_part) { + printk(KERN_ERR "nvram_scan_partitions: kmalloc failed\n"); + goto out; + } + + memcpy(&tmp_part->header, &phead, NVRAM_HEADER_LEN); + tmp_part->index = cur_index; + list_add_tail(&tmp_part->partition, &nvram_part->partition); + + cur_index += phead.length * NVRAM_BLOCK_LEN; + } + err = 0; + + out: + kfree(header); + return err; +} + +static int __init nvram_init(void) +{ + int error; + int rc; + + if (ppc_md.nvram_size == NULL || ppc_md.nvram_size() <= 0) + return -ENODEV; + + rc = misc_register(&nvram_dev); + if (rc != 0) { + printk(KERN_ERR "nvram_init: failed to register device\n"); + return rc; + } + + /* initialize our anchor for the nvram partition list */ + nvram_part = (struct nvram_partition *) kmalloc(sizeof(struct nvram_partition), GFP_KERNEL); + if (!nvram_part) { + printk(KERN_ERR "nvram_init: Failed kmalloc\n"); + return -ENOMEM; + } + INIT_LIST_HEAD(&nvram_part->partition); + + /* Get all the NVRAM partitions */ + error = nvram_scan_partitions(); + if (error) { + printk(KERN_ERR "nvram_init: Failed nvram_scan_partitions\n"); + return error; + } + + if(nvram_setup_partition()) + printk(KERN_WARNING "nvram_init: Could not find nvram partition" + " for nvram buffered error logging.\n"); + +#ifdef DEBUG_NVRAM + nvram_print_partitions("NVRAM Partitions"); +#endif + + return rc; +} + +void __exit nvram_cleanup(void) +{ + misc_deregister( &nvram_dev ); +} + + +#ifdef CONFIG_PPC_PSERIES + +/* nvram_write_error_log + * + * We need to buffer the error logs into nvram to ensure that we have + * the failure information to decode. If we have a severe error there + * is no way to guarantee that the OS or the machine is in a state to + * get back to user land and write the error to disk. For example if + * the SCSI device driver causes a Machine Check by writing to a bad + * IO address, there is no way of guaranteeing that the device driver + * is in any state that is would also be able to write the error data + * captured to disk, thus we buffer it in NVRAM for analysis on the + * next boot. + * + * In NVRAM the partition containing the error log buffer will looks like: + * Header (in bytes): + * +-----------+----------+--------+------------+------------------+ + * | signature | checksum | length | name | data | + * |0 |1 |2 3|4 15|16 length-1| + * +-----------+----------+--------+------------+------------------+ + * + * The 'data' section would look like (in bytes): + * +--------------+------------+-----------------------------------+ + * | event_logged | sequence # | error log | + * |0 3|4 7|8 nvram_error_log_size-1| + * +--------------+------------+-----------------------------------+ + * + * event_logged: 0 if event has not been logged to syslog, 1 if it has + * sequence #: The unique sequence # for each event. (until it wraps) + * error log: The error log from event_scan + */ +int nvram_write_error_log(char * buff, int length, unsigned int err_type) +{ + int rc; + loff_t tmp_index; + struct err_log_info info; + + if (no_logging) { + return -EPERM; + } + + if (nvram_error_log_index == -1) { + return -ESPIPE; + } + + if (length > nvram_error_log_size) { + length = nvram_error_log_size; + } + + info.error_type = err_type; + info.seq_num = error_log_cnt; + + tmp_index = nvram_error_log_index; + + rc = ppc_md.nvram_write((char *)&info, sizeof(struct err_log_info), &tmp_index); + if (rc <= 0) { + printk(KERN_ERR "nvram_write_error_log: Failed nvram_write (%d)\n", rc); + return rc; + } + + rc = ppc_md.nvram_write(buff, length, &tmp_index); + if (rc <= 0) { + printk(KERN_ERR "nvram_write_error_log: Failed nvram_write (%d)\n", rc); + return rc; + } + + return 0; +} + +/* nvram_read_error_log + * + * Reads nvram for error log for at most 'length' + */ +int nvram_read_error_log(char * buff, int length, unsigned int * err_type) +{ + int rc; + loff_t tmp_index; + struct err_log_info info; + + if (nvram_error_log_index == -1) + return -1; + + if (length > nvram_error_log_size) + length = nvram_error_log_size; + + tmp_index = nvram_error_log_index; + + rc = ppc_md.nvram_read((char *)&info, sizeof(struct err_log_info), &tmp_index); + if (rc <= 0) { + printk(KERN_ERR "nvram_read_error_log: Failed nvram_read (%d)\n", rc); + return rc; + } + + rc = ppc_md.nvram_read(buff, length, &tmp_index); + if (rc <= 0) { + printk(KERN_ERR "nvram_read_error_log: Failed nvram_read (%d)\n", rc); + return rc; + } + + error_log_cnt = info.seq_num; + *err_type = info.error_type; + + return 0; +} + +/* This doesn't actually zero anything, but it sets the event_logged + * word to tell that this event is safely in syslog. + */ +int nvram_clear_error_log(void) +{ + loff_t tmp_index; + int clear_word = ERR_FLAG_ALREADY_LOGGED; + int rc; + + tmp_index = nvram_error_log_index; + + rc = ppc_md.nvram_write((char *)&clear_word, sizeof(int), &tmp_index); + if (rc <= 0) { + printk(KERN_ERR "nvram_clear_error_log: Failed nvram_write (%d)\n", rc); + return rc; + } + + return 0; +} + +#endif /* CONFIG_PPC_PSERIES */ + +module_init(nvram_init); +module_exit(nvram_cleanup); +MODULE_LICENSE("GPL"); diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c new file mode 100644 index 00000000000..5a5b2468508 --- /dev/null +++ b/arch/powerpc/kernel/pci_64.c @@ -0,0 +1,1381 @@ +/* + * Port for PPC64 David Engebretsen, IBM Corp. + * Contains common pci routines for ppc64 platform, pSeries and iSeries brands. + * + * Copyright (C) 2003 Anton Blanchard <anton@au.ibm.com>, IBM + * Rework, based on alpha PCI code. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#undef DEBUG + +#include <linux/config.h> +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/string.h> +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/mm.h> +#include <linux/list.h> +#include <linux/syscalls.h> + +#include <asm/processor.h> +#include <asm/io.h> +#include <asm/prom.h> +#include <asm/pci-bridge.h> +#include <asm/byteorder.h> +#include <asm/irq.h> +#include <asm/machdep.h> +#include <asm/ppc-pci.h> + +#ifdef DEBUG +#include <asm/udbg.h> +#define DBG(fmt...) udbg_printf(fmt) +#else +#define DBG(fmt...) +#endif + +unsigned long pci_probe_only = 1; +unsigned long pci_assign_all_buses = 0; + +/* + * legal IO pages under MAX_ISA_PORT. This is to ensure we don't touch + * devices we don't have access to. + */ +unsigned long io_page_mask; + +EXPORT_SYMBOL(io_page_mask); + +#ifdef CONFIG_PPC_MULTIPLATFORM +static void fixup_resource(struct resource *res, struct pci_dev *dev); +static void do_bus_setup(struct pci_bus *bus); +#endif + +unsigned int pcibios_assign_all_busses(void) +{ + return pci_assign_all_buses; +} + +/* pci_io_base -- the base address from which io bars are offsets. + * This is the lowest I/O base address (so bar values are always positive), + * and it *must* be the start of ISA space if an ISA bus exists because + * ISA drivers use hard coded offsets. If no ISA bus exists a dummy + * page is mapped and isa_io_limit prevents access to it. + */ +unsigned long isa_io_base; /* NULL if no ISA bus */ +EXPORT_SYMBOL(isa_io_base); +unsigned long pci_io_base; +EXPORT_SYMBOL(pci_io_base); + +void iSeries_pcibios_init(void); + +LIST_HEAD(hose_list); + +struct dma_mapping_ops pci_dma_ops; +EXPORT_SYMBOL(pci_dma_ops); + +int global_phb_number; /* Global phb counter */ + +/* Cached ISA bridge dev. */ +struct pci_dev *ppc64_isabridge_dev = NULL; + +static void fixup_broken_pcnet32(struct pci_dev* dev) +{ + if ((dev->class>>8 == PCI_CLASS_NETWORK_ETHERNET)) { + dev->vendor = PCI_VENDOR_ID_AMD; + pci_write_config_word(dev, PCI_VENDOR_ID, PCI_VENDOR_ID_AMD); + } +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_TRIDENT, PCI_ANY_ID, fixup_broken_pcnet32); + +void pcibios_resource_to_bus(struct pci_dev *dev, struct pci_bus_region *region, + struct resource *res) +{ + unsigned long offset = 0; + struct pci_controller *hose = pci_bus_to_host(dev->bus); + + if (!hose) + return; + + if (res->flags & IORESOURCE_IO) + offset = (unsigned long)hose->io_base_virt - pci_io_base; + + if (res->flags & IORESOURCE_MEM) + offset = hose->pci_mem_offset; + + region->start = res->start - offset; + region->end = res->end - offset; +} + +void pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res, + struct pci_bus_region *region) +{ + unsigned long offset = 0; + struct pci_controller *hose = pci_bus_to_host(dev->bus); + + if (!hose) + return; + + if (res->flags & IORESOURCE_IO) + offset = (unsigned long)hose->io_base_virt - pci_io_base; + + if (res->flags & IORESOURCE_MEM) + offset = hose->pci_mem_offset; + + res->start = region->start + offset; + res->end = region->end + offset; +} + +#ifdef CONFIG_HOTPLUG +EXPORT_SYMBOL(pcibios_resource_to_bus); +EXPORT_SYMBOL(pcibios_bus_to_resource); +#endif + +/* + * We need to avoid collisions with `mirrored' VGA ports + * and other strange ISA hardware, so we always want the + * addresses to be allocated in the 0x000-0x0ff region + * modulo 0x400. + * + * Why? Because some silly external IO cards only decode + * the low 10 bits of the IO address. The 0x00-0xff region + * is reserved for motherboard devices that decode all 16 + * bits, so it's ok to allocate at, say, 0x2800-0x28ff, + * but we want to try to avoid allocating at 0x2900-0x2bff + * which might have be mirrored at 0x0100-0x03ff.. + */ +void pcibios_align_resource(void *data, struct resource *res, + unsigned long size, unsigned long align) +{ + struct pci_dev *dev = data; + struct pci_controller *hose = pci_bus_to_host(dev->bus); + unsigned long start = res->start; + unsigned long alignto; + + if (res->flags & IORESOURCE_IO) { + unsigned long offset = (unsigned long)hose->io_base_virt - + pci_io_base; + /* Make sure we start at our min on all hoses */ + if (start - offset < PCIBIOS_MIN_IO) + start = PCIBIOS_MIN_IO + offset; + + /* + * Put everything into 0x00-0xff region modulo 0x400 + */ + if (start & 0x300) + start = (start + 0x3ff) & ~0x3ff; + + } else if (res->flags & IORESOURCE_MEM) { + /* Make sure we start at our min on all hoses */ + if (start - hose->pci_mem_offset < PCIBIOS_MIN_MEM) + start = PCIBIOS_MIN_MEM + hose->pci_mem_offset; + + /* Align to multiple of size of minimum base. */ + alignto = max(0x1000UL, align); + start = ALIGN(start, alignto); + } + + res->start = start; +} + +static DEFINE_SPINLOCK(hose_spinlock); + +/* + * pci_controller(phb) initialized common variables. + */ +static void __devinit pci_setup_pci_controller(struct pci_controller *hose) +{ + memset(hose, 0, sizeof(struct pci_controller)); + + spin_lock(&hose_spinlock); + hose->global_number = global_phb_number++; + list_add_tail(&hose->list_node, &hose_list); + spin_unlock(&hose_spinlock); +} + +static void add_linux_pci_domain(struct device_node *dev, + struct pci_controller *phb) +{ + struct property *of_prop; + unsigned int size; + + of_prop = (struct property *) + get_property(dev, "linux,pci-domain", &size); + if (of_prop != NULL) + return; + WARN_ON(of_prop && size < sizeof(int)); + if (of_prop && size < sizeof(int)) + of_prop = NULL; + size = sizeof(struct property) + sizeof(int); + if (of_prop == NULL) { + if (mem_init_done) + of_prop = kmalloc(size, GFP_KERNEL); + else + of_prop = alloc_bootmem(size); + } + memset(of_prop, 0, sizeof(struct property)); + of_prop->name = "linux,pci-domain"; + of_prop->length = sizeof(int); + of_prop->value = (unsigned char *)&of_prop[1]; + *((int *)of_prop->value) = phb->global_number; + prom_add_property(dev, of_prop); +} + +struct pci_controller * pcibios_alloc_controller(struct device_node *dev) +{ + struct pci_controller *phb; + + if (mem_init_done) + phb = kmalloc(sizeof(struct pci_controller), GFP_KERNEL); + else + phb = alloc_bootmem(sizeof (struct pci_controller)); + if (phb == NULL) + return NULL; + pci_setup_pci_controller(phb); + phb->arch_data = dev; + phb->is_dynamic = mem_init_done; + if (dev) + add_linux_pci_domain(dev, phb); + return phb; +} + +void pcibios_free_controller(struct pci_controller *phb) +{ + if (phb->arch_data) { + struct device_node *np = phb->arch_data; + int *domain = (int *)get_property(np, + "linux,pci-domain", NULL); + if (domain) + *domain = -1; + } + if (phb->is_dynamic) + kfree(phb); +} + +static void __init pcibios_claim_one_bus(struct pci_bus *b) +{ + struct pci_dev *dev; + struct pci_bus *child_bus; + + list_for_each_entry(dev, &b->devices, bus_list) { + int i; + + for (i = 0; i < PCI_NUM_RESOURCES; i++) { + struct resource *r = &dev->resource[i]; + + if (r->parent || !r->start || !r->flags) + continue; + pci_claim_resource(dev, i); + } + } + + list_for_each_entry(child_bus, &b->children, node) + pcibios_claim_one_bus(child_bus); +} + +#ifndef CONFIG_PPC_ISERIES +static void __init pcibios_claim_of_setup(void) +{ + struct pci_bus *b; + + list_for_each_entry(b, &pci_root_buses, node) + pcibios_claim_one_bus(b); +} +#endif + +#ifdef CONFIG_PPC_MULTIPLATFORM +static u32 get_int_prop(struct device_node *np, const char *name, u32 def) +{ + u32 *prop; + int len; + + prop = (u32 *) get_property(np, name, &len); + if (prop && len >= 4) + return *prop; + return def; +} + +static unsigned int pci_parse_of_flags(u32 addr0) +{ + unsigned int flags = 0; + + if (addr0 & 0x02000000) { + flags = IORESOURCE_MEM | PCI_BASE_ADDRESS_SPACE_MEMORY; + flags |= (addr0 >> 22) & PCI_BASE_ADDRESS_MEM_TYPE_64; + flags |= (addr0 >> 28) & PCI_BASE_ADDRESS_MEM_TYPE_1M; + if (addr0 & 0x40000000) + flags |= IORESOURCE_PREFETCH + | PCI_BASE_ADDRESS_MEM_PREFETCH; + } else if (addr0 & 0x01000000) + flags = IORESOURCE_IO | PCI_BASE_ADDRESS_SPACE_IO; + return flags; +} + +#define GET_64BIT(prop, i) ((((u64) (prop)[(i)]) << 32) | (prop)[(i)+1]) + +static void pci_parse_of_addrs(struct device_node *node, struct pci_dev *dev) +{ + u64 base, size; + unsigned int flags; + struct resource *res; + u32 *addrs, i; + int proplen; + + addrs = (u32 *) get_property(node, "assigned-addresses", &proplen); + if (!addrs) + return; + for (; proplen >= 20; proplen -= 20, addrs += 5) { + flags = pci_parse_of_flags(addrs[0]); + if (!flags) + continue; + base = GET_64BIT(addrs, 1); + size = GET_64BIT(addrs, 3); + if (!size) + continue; + i = addrs[0] & 0xff; + if (PCI_BASE_ADDRESS_0 <= i && i <= PCI_BASE_ADDRESS_5) { + res = &dev->resource[(i - PCI_BASE_ADDRESS_0) >> 2]; + } else if (i == dev->rom_base_reg) { + res = &dev->resource[PCI_ROM_RESOURCE]; + flags |= IORESOURCE_READONLY | IORESOURCE_CACHEABLE; + } else { + printk(KERN_ERR "PCI: bad cfg reg num 0x%x\n", i); + continue; + } + res->start = base; + res->end = base + size - 1; + res->flags = flags; + res->name = pci_name(dev); + fixup_resource(res, dev); + } +} + +struct pci_dev *of_create_pci_dev(struct device_node *node, + struct pci_bus *bus, int devfn) +{ + struct pci_dev *dev; + const char *type; + + dev = kmalloc(sizeof(struct pci_dev), GFP_KERNEL); + if (!dev) + return NULL; + type = get_property(node, "device_type", NULL); + if (type == NULL) + type = ""; + + memset(dev, 0, sizeof(struct pci_dev)); + dev->bus = bus; + dev->sysdata = node; + dev->dev.parent = bus->bridge; + dev->dev.bus = &pci_bus_type; + dev->devfn = devfn; + dev->multifunction = 0; /* maybe a lie? */ + + dev->vendor = get_int_prop(node, "vendor-id", 0xffff); + dev->device = get_int_prop(node, "device-id", 0xffff); + dev->subsystem_vendor = get_int_prop(node, "subsystem-vendor-id", 0); + dev->subsystem_device = get_int_prop(node, "subsystem-id", 0); + + dev->cfg_size = 256; /*pci_cfg_space_size(dev);*/ + + sprintf(pci_name(dev), "%04x:%02x:%02x.%d", pci_domain_nr(bus), + dev->bus->number, PCI_SLOT(devfn), PCI_FUNC(devfn)); + dev->class = get_int_prop(node, "class-code", 0); + + dev->current_state = 4; /* unknown power state */ + + if (!strcmp(type, "pci")) { + /* a PCI-PCI bridge */ + dev->hdr_type = PCI_HEADER_TYPE_BRIDGE; + dev->rom_base_reg = PCI_ROM_ADDRESS1; + } else if (!strcmp(type, "cardbus")) { + dev->hdr_type = PCI_HEADER_TYPE_CARDBUS; + } else { + dev->hdr_type = PCI_HEADER_TYPE_NORMAL; + dev->rom_base_reg = PCI_ROM_ADDRESS; + dev->irq = NO_IRQ; + if (node->n_intrs > 0) { + dev->irq = node->intrs[0].line; + pci_write_config_byte(dev, PCI_INTERRUPT_LINE, + dev->irq); + } + } + + pci_parse_of_addrs(node, dev); + + pci_device_add(dev, bus); + + /* XXX pci_scan_msi_device(dev); */ + + return dev; +} +EXPORT_SYMBOL(of_create_pci_dev); + +void __devinit of_scan_bus(struct device_node *node, + struct pci_bus *bus) +{ + struct device_node *child = NULL; + u32 *reg; + int reglen, devfn; + struct pci_dev *dev; + + while ((child = of_get_next_child(node, child)) != NULL) { + reg = (u32 *) get_property(child, "reg", ®len); + if (reg == NULL || reglen < 20) + continue; + devfn = (reg[0] >> 8) & 0xff; + /* create a new pci_dev for this device */ + dev = of_create_pci_dev(child, bus, devfn); + if (!dev) + continue; + if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE || + dev->hdr_type == PCI_HEADER_TYPE_CARDBUS) + of_scan_pci_bridge(child, dev); + } + + do_bus_setup(bus); +} +EXPORT_SYMBOL(of_scan_bus); + +void __devinit of_scan_pci_bridge(struct device_node *node, + struct pci_dev *dev) +{ + struct pci_bus *bus; + u32 *busrange, *ranges; + int len, i, mode; + struct resource *res; + unsigned int flags; + u64 size; + + /* parse bus-range property */ + busrange = (u32 *) get_property(node, "bus-range", &len); + if (busrange == NULL || len != 8) { + printk(KERN_ERR "Can't get bus-range for PCI-PCI bridge %s\n", + node->full_name); + return; + } + ranges = (u32 *) get_property(node, "ranges", &len); + if (ranges == NULL) { + printk(KERN_ERR "Can't get ranges for PCI-PCI bridge %s\n", + node->full_name); + return; + } + + bus = pci_add_new_bus(dev->bus, dev, busrange[0]); + if (!bus) { + printk(KERN_ERR "Failed to create pci bus for %s\n", + node->full_name); + return; + } + + bus->primary = dev->bus->number; + bus->subordinate = busrange[1]; + bus->bridge_ctl = 0; + bus->sysdata = node; + + /* parse ranges property */ + /* PCI #address-cells == 3 and #size-cells == 2 always */ + res = &dev->resource[PCI_BRIDGE_RESOURCES]; + for (i = 0; i < PCI_NUM_RESOURCES - PCI_BRIDGE_RESOURCES; ++i) { + res->flags = 0; + bus->resource[i] = res; + ++res; + } + i = 1; + for (; len >= 32; len -= 32, ranges += 8) { + flags = pci_parse_of_flags(ranges[0]); + size = GET_64BIT(ranges, 6); + if (flags == 0 || size == 0) + continue; + if (flags & IORESOURCE_IO) { + res = bus->resource[0]; + if (res->flags) { + printk(KERN_ERR "PCI: ignoring extra I/O range" + " for bridge %s\n", node->full_name); + continue; + } + } else { + if (i >= PCI_NUM_RESOURCES - PCI_BRIDGE_RESOURCES) { + printk(KERN_ERR "PCI: too many memory ranges" + " for bridge %s\n", node->full_name); + continue; + } + res = bus->resource[i]; + ++i; + } + res->start = GET_64BIT(ranges, 1); + res->end = res->start + size - 1; + res->flags = flags; + fixup_resource(res, dev); + } + sprintf(bus->name, "PCI Bus %04x:%02x", pci_domain_nr(bus), + bus->number); + + mode = PCI_PROBE_NORMAL; + if (ppc_md.pci_probe_mode) + mode = ppc_md.pci_probe_mode(bus); + if (mode == PCI_PROBE_DEVTREE) + of_scan_bus(node, bus); + else if (mode == PCI_PROBE_NORMAL) + pci_scan_child_bus(bus); +} +EXPORT_SYMBOL(of_scan_pci_bridge); +#endif /* CONFIG_PPC_MULTIPLATFORM */ + +void __devinit scan_phb(struct pci_controller *hose) +{ + struct pci_bus *bus; + struct device_node *node = hose->arch_data; + int i, mode; + struct resource *res; + + bus = pci_create_bus(NULL, hose->first_busno, hose->ops, node); + if (bus == NULL) { + printk(KERN_ERR "Failed to create bus for PCI domain %04x\n", + hose->global_number); + return; + } + bus->secondary = hose->first_busno; + hose->bus = bus; + + bus->resource[0] = res = &hose->io_resource; + if (res->flags && request_resource(&ioport_resource, res)) + printk(KERN_ERR "Failed to request PCI IO region " + "on PCI domain %04x\n", hose->global_number); + + for (i = 0; i < 3; ++i) { + res = &hose->mem_resources[i]; + bus->resource[i+1] = res; + if (res->flags && request_resource(&iomem_resource, res)) + printk(KERN_ERR "Failed to request PCI memory region " + "on PCI domain %04x\n", hose->global_number); + } + + mode = PCI_PROBE_NORMAL; +#ifdef CONFIG_PPC_MULTIPLATFORM + if (ppc_md.pci_probe_mode) + mode = ppc_md.pci_probe_mode(bus); + if (mode == PCI_PROBE_DEVTREE) { + bus->subordinate = hose->last_busno; + of_scan_bus(node, bus); + } +#endif /* CONFIG_PPC_MULTIPLATFORM */ + if (mode == PCI_PROBE_NORMAL) + hose->last_busno = bus->subordinate = pci_scan_child_bus(bus); + pci_bus_add_devices(bus); +} + +static int __init pcibios_init(void) +{ + struct pci_controller *hose, *tmp; + + /* For now, override phys_mem_access_prot. If we need it, + * later, we may move that initialization to each ppc_md + */ + ppc_md.phys_mem_access_prot = pci_phys_mem_access_prot; + +#ifdef CONFIG_PPC_ISERIES + iSeries_pcibios_init(); +#endif + + printk("PCI: Probing PCI hardware\n"); + + /* Scan all of the recorded PCI controllers. */ + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) + scan_phb(hose); + +#ifndef CONFIG_PPC_ISERIES + if (pci_probe_only) + pcibios_claim_of_setup(); + else + /* FIXME: `else' will be removed when + pci_assign_unassigned_resources() is able to work + correctly with [partially] allocated PCI tree. */ + pci_assign_unassigned_resources(); +#endif /* !CONFIG_PPC_ISERIES */ + + /* Call machine dependent final fixup */ + if (ppc_md.pcibios_fixup) + ppc_md.pcibios_fixup(); + + /* Cache the location of the ISA bridge (if we have one) */ + ppc64_isabridge_dev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL); + if (ppc64_isabridge_dev != NULL) + printk("ISA bridge at %s\n", pci_name(ppc64_isabridge_dev)); + +#ifdef CONFIG_PPC_MULTIPLATFORM + /* map in PCI I/O space */ + phbs_remap_io(); +#endif + + printk("PCI: Probing PCI hardware done\n"); + + return 0; +} + +subsys_initcall(pcibios_init); + +char __init *pcibios_setup(char *str) +{ + return str; +} + +int pcibios_enable_device(struct pci_dev *dev, int mask) +{ + u16 cmd, oldcmd; + int i; + + pci_read_config_word(dev, PCI_COMMAND, &cmd); + oldcmd = cmd; + + for (i = 0; i < PCI_NUM_RESOURCES; i++) { + struct resource *res = &dev->resource[i]; + + /* Only set up the requested stuff */ + if (!(mask & (1<<i))) + continue; + + if (res->flags & IORESOURCE_IO) + cmd |= PCI_COMMAND_IO; + if (res->flags & IORESOURCE_MEM) + cmd |= PCI_COMMAND_MEMORY; + } + + if (cmd != oldcmd) { + printk(KERN_DEBUG "PCI: Enabling device: (%s), cmd %x\n", + pci_name(dev), cmd); + /* Enable the appropriate bits in the PCI command register. */ + pci_write_config_word(dev, PCI_COMMAND, cmd); + } + return 0; +} + +/* + * Return the domain number for this bus. + */ +int pci_domain_nr(struct pci_bus *bus) +{ +#ifdef CONFIG_PPC_ISERIES + return 0; +#else + struct pci_controller *hose = pci_bus_to_host(bus); + + return hose->global_number; +#endif +} + +EXPORT_SYMBOL(pci_domain_nr); + +/* Decide whether to display the domain number in /proc */ +int pci_proc_domain(struct pci_bus *bus) +{ +#ifdef CONFIG_PPC_ISERIES + return 0; +#else + struct pci_controller *hose = pci_bus_to_host(bus); + return hose->buid; +#endif +} + +/* + * Platform support for /proc/bus/pci/X/Y mmap()s, + * modelled on the sparc64 implementation by Dave Miller. + * -- paulus. + */ + +/* + * Adjust vm_pgoff of VMA such that it is the physical page offset + * corresponding to the 32-bit pci bus offset for DEV requested by the user. + * + * Basically, the user finds the base address for his device which he wishes + * to mmap. They read the 32-bit value from the config space base register, + * add whatever PAGE_SIZE multiple offset they wish, and feed this into the + * offset parameter of mmap on /proc/bus/pci/XXX for that device. + * + * Returns negative error code on failure, zero on success. + */ +static struct resource *__pci_mmap_make_offset(struct pci_dev *dev, + unsigned long *offset, + enum pci_mmap_state mmap_state) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + unsigned long io_offset = 0; + int i, res_bit; + + if (hose == 0) + return NULL; /* should never happen */ + + /* If memory, add on the PCI bridge address offset */ + if (mmap_state == pci_mmap_mem) { + *offset += hose->pci_mem_offset; + res_bit = IORESOURCE_MEM; + } else { + io_offset = (unsigned long)hose->io_base_virt - pci_io_base; + *offset += io_offset; + res_bit = IORESOURCE_IO; + } + + /* + * Check that the offset requested corresponds to one of the + * resources of the device. + */ + for (i = 0; i <= PCI_ROM_RESOURCE; i++) { + struct resource *rp = &dev->resource[i]; + int flags = rp->flags; + + /* treat ROM as memory (should be already) */ + if (i == PCI_ROM_RESOURCE) + flags |= IORESOURCE_MEM; + + /* Active and same type? */ + if ((flags & res_bit) == 0) + continue; + + /* In the range of this resource? */ + if (*offset < (rp->start & PAGE_MASK) || *offset > rp->end) + continue; + + /* found it! construct the final physical address */ + if (mmap_state == pci_mmap_io) + *offset += hose->io_base_phys - io_offset; + return rp; + } + + return NULL; +} + +/* + * Set vm_page_prot of VMA, as appropriate for this architecture, for a pci + * device mapping. + */ +static pgprot_t __pci_mmap_set_pgprot(struct pci_dev *dev, struct resource *rp, + pgprot_t protection, + enum pci_mmap_state mmap_state, + int write_combine) +{ + unsigned long prot = pgprot_val(protection); + + /* Write combine is always 0 on non-memory space mappings. On + * memory space, if the user didn't pass 1, we check for a + * "prefetchable" resource. This is a bit hackish, but we use + * this to workaround the inability of /sysfs to provide a write + * combine bit + */ + if (mmap_state != pci_mmap_mem) + write_combine = 0; + else if (write_combine == 0) { + if (rp->flags & IORESOURCE_PREFETCH) + write_combine = 1; + } + + /* XXX would be nice to have a way to ask for write-through */ + prot |= _PAGE_NO_CACHE; + if (write_combine) + prot &= ~_PAGE_GUARDED; + else + prot |= _PAGE_GUARDED; + + printk("PCI map for %s:%lx, prot: %lx\n", pci_name(dev), rp->start, + prot); + + return __pgprot(prot); +} + +/* + * This one is used by /dev/mem and fbdev who have no clue about the + * PCI device, it tries to find the PCI device first and calls the + * above routine + */ +pgprot_t pci_phys_mem_access_prot(struct file *file, + unsigned long pfn, + unsigned long size, + pgprot_t protection) +{ + struct pci_dev *pdev = NULL; + struct resource *found = NULL; + unsigned long prot = pgprot_val(protection); + unsigned long offset = pfn << PAGE_SHIFT; + int i; + + if (page_is_ram(pfn)) + return __pgprot(prot); + + prot |= _PAGE_NO_CACHE | _PAGE_GUARDED; + + for_each_pci_dev(pdev) { + for (i = 0; i <= PCI_ROM_RESOURCE; i++) { + struct resource *rp = &pdev->resource[i]; + int flags = rp->flags; + + /* Active and same type? */ + if ((flags & IORESOURCE_MEM) == 0) + continue; + /* In the range of this resource? */ + if (offset < (rp->start & PAGE_MASK) || + offset > rp->end) + continue; + found = rp; + break; + } + if (found) + break; + } + if (found) { + if (found->flags & IORESOURCE_PREFETCH) + prot &= ~_PAGE_GUARDED; + pci_dev_put(pdev); + } + + DBG("non-PCI map for %lx, prot: %lx\n", offset, prot); + + return __pgprot(prot); +} + + +/* + * Perform the actual remap of the pages for a PCI device mapping, as + * appropriate for this architecture. The region in the process to map + * is described by vm_start and vm_end members of VMA, the base physical + * address is found in vm_pgoff. + * The pci device structure is provided so that architectures may make mapping + * decisions on a per-device or per-bus basis. + * + * Returns a negative error code on failure, zero on success. + */ +int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, + enum pci_mmap_state mmap_state, + int write_combine) +{ + unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; + struct resource *rp; + int ret; + + rp = __pci_mmap_make_offset(dev, &offset, mmap_state); + if (rp == NULL) + return -EINVAL; + + vma->vm_pgoff = offset >> PAGE_SHIFT; + vma->vm_flags |= VM_SHM | VM_LOCKED | VM_IO; + vma->vm_page_prot = __pci_mmap_set_pgprot(dev, rp, + vma->vm_page_prot, + mmap_state, write_combine); + + ret = remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, + vma->vm_end - vma->vm_start, vma->vm_page_prot); + + return ret; +} + +#ifdef CONFIG_PPC_MULTIPLATFORM +static ssize_t pci_show_devspec(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct pci_dev *pdev; + struct device_node *np; + + pdev = to_pci_dev (dev); + np = pci_device_to_OF_node(pdev); + if (np == NULL || np->full_name == NULL) + return 0; + return sprintf(buf, "%s", np->full_name); +} +static DEVICE_ATTR(devspec, S_IRUGO, pci_show_devspec, NULL); +#endif /* CONFIG_PPC_MULTIPLATFORM */ + +void pcibios_add_platform_entries(struct pci_dev *pdev) +{ +#ifdef CONFIG_PPC_MULTIPLATFORM + device_create_file(&pdev->dev, &dev_attr_devspec); +#endif /* CONFIG_PPC_MULTIPLATFORM */ +} + +#ifdef CONFIG_PPC_MULTIPLATFORM + +#define ISA_SPACE_MASK 0x1 +#define ISA_SPACE_IO 0x1 + +static void __devinit pci_process_ISA_OF_ranges(struct device_node *isa_node, + unsigned long phb_io_base_phys, + void __iomem * phb_io_base_virt) +{ + struct isa_range *range; + unsigned long pci_addr; + unsigned int isa_addr; + unsigned int size; + int rlen = 0; + + range = (struct isa_range *) get_property(isa_node, "ranges", &rlen); + if (range == NULL || (rlen < sizeof(struct isa_range))) { + printk(KERN_ERR "no ISA ranges or unexpected isa range size," + "mapping 64k\n"); + __ioremap_explicit(phb_io_base_phys, + (unsigned long)phb_io_base_virt, + 0x10000, _PAGE_NO_CACHE | _PAGE_GUARDED); + return; + } + + /* From "ISA Binding to 1275" + * The ranges property is laid out as an array of elements, + * each of which comprises: + * cells 0 - 1: an ISA address + * cells 2 - 4: a PCI address + * (size depending on dev->n_addr_cells) + * cell 5: the size of the range + */ + if ((range->isa_addr.a_hi && ISA_SPACE_MASK) == ISA_SPACE_IO) { + isa_addr = range->isa_addr.a_lo; + pci_addr = (unsigned long) range->pci_addr.a_mid << 32 | + range->pci_addr.a_lo; + + /* Assume these are both zero */ + if ((pci_addr != 0) || (isa_addr != 0)) { + printk(KERN_ERR "unexpected isa to pci mapping: %s\n", + __FUNCTION__); + return; + } + + size = PAGE_ALIGN(range->size); + + __ioremap_explicit(phb_io_base_phys, + (unsigned long) phb_io_base_virt, + size, _PAGE_NO_CACHE | _PAGE_GUARDED); + } +} + +void __devinit pci_process_bridge_OF_ranges(struct pci_controller *hose, + struct device_node *dev, int prim) +{ + unsigned int *ranges, pci_space; + unsigned long size; + int rlen = 0; + int memno = 0; + struct resource *res; + int np, na = prom_n_addr_cells(dev); + unsigned long pci_addr, cpu_phys_addr; + + np = na + 5; + + /* From "PCI Binding to 1275" + * The ranges property is laid out as an array of elements, + * each of which comprises: + * cells 0 - 2: a PCI address + * cells 3 or 3+4: a CPU physical address + * (size depending on dev->n_addr_cells) + * cells 4+5 or 5+6: the size of the range + */ + ranges = (unsigned int *) get_property(dev, "ranges", &rlen); + if (ranges == NULL) + return; + hose->io_base_phys = 0; + while ((rlen -= np * sizeof(unsigned int)) >= 0) { + res = NULL; + pci_space = ranges[0]; + pci_addr = ((unsigned long)ranges[1] << 32) | ranges[2]; + + cpu_phys_addr = ranges[3]; + if (na >= 2) + cpu_phys_addr = (cpu_phys_addr << 32) | ranges[4]; + + size = ((unsigned long)ranges[na+3] << 32) | ranges[na+4]; + ranges += np; + if (size == 0) + continue; + + /* Now consume following elements while they are contiguous */ + while (rlen >= np * sizeof(unsigned int)) { + unsigned long addr, phys; + + if (ranges[0] != pci_space) + break; + addr = ((unsigned long)ranges[1] << 32) | ranges[2]; + phys = ranges[3]; + if (na >= 2) + phys = (phys << 32) | ranges[4]; + if (addr != pci_addr + size || + phys != cpu_phys_addr + size) + break; + + size += ((unsigned long)ranges[na+3] << 32) + | ranges[na+4]; + ranges += np; + rlen -= np * sizeof(unsigned int); + } + + switch ((pci_space >> 24) & 0x3) { + case 1: /* I/O space */ + hose->io_base_phys = cpu_phys_addr; + hose->pci_io_size = size; + + res = &hose->io_resource; + res->flags = IORESOURCE_IO; + res->start = pci_addr; + DBG("phb%d: IO 0x%lx -> 0x%lx\n", hose->global_number, + res->start, res->start + size - 1); + break; + case 2: /* memory space */ + memno = 0; + while (memno < 3 && hose->mem_resources[memno].flags) + ++memno; + + if (memno == 0) + hose->pci_mem_offset = cpu_phys_addr - pci_addr; + if (memno < 3) { + res = &hose->mem_resources[memno]; + res->flags = IORESOURCE_MEM; + res->start = cpu_phys_addr; + DBG("phb%d: MEM 0x%lx -> 0x%lx\n", hose->global_number, + res->start, res->start + size - 1); + } + break; + } + if (res != NULL) { + res->name = dev->full_name; + res->end = res->start + size - 1; + res->parent = NULL; + res->sibling = NULL; + res->child = NULL; + } + } +} + +void __init pci_setup_phb_io(struct pci_controller *hose, int primary) +{ + unsigned long size = hose->pci_io_size; + unsigned long io_virt_offset; + struct resource *res; + struct device_node *isa_dn; + + hose->io_base_virt = reserve_phb_iospace(size); + DBG("phb%d io_base_phys 0x%lx io_base_virt 0x%lx\n", + hose->global_number, hose->io_base_phys, + (unsigned long) hose->io_base_virt); + + if (primary) { + pci_io_base = (unsigned long)hose->io_base_virt; + isa_dn = of_find_node_by_type(NULL, "isa"); + if (isa_dn) { + isa_io_base = pci_io_base; + pci_process_ISA_OF_ranges(isa_dn, hose->io_base_phys, + hose->io_base_virt); + of_node_put(isa_dn); + /* Allow all IO */ + io_page_mask = -1; + } + } + + io_virt_offset = (unsigned long)hose->io_base_virt - pci_io_base; + res = &hose->io_resource; + res->start += io_virt_offset; + res->end += io_virt_offset; +} + +void __devinit pci_setup_phb_io_dynamic(struct pci_controller *hose, + int primary) +{ + unsigned long size = hose->pci_io_size; + unsigned long io_virt_offset; + struct resource *res; + + hose->io_base_virt = __ioremap(hose->io_base_phys, size, + _PAGE_NO_CACHE | _PAGE_GUARDED); + DBG("phb%d io_base_phys 0x%lx io_base_virt 0x%lx\n", + hose->global_number, hose->io_base_phys, + (unsigned long) hose->io_base_virt); + + if (primary) + pci_io_base = (unsigned long)hose->io_base_virt; + + io_virt_offset = (unsigned long)hose->io_base_virt - pci_io_base; + res = &hose->io_resource; + res->start += io_virt_offset; + res->end += io_virt_offset; +} + + +static int get_bus_io_range(struct pci_bus *bus, unsigned long *start_phys, + unsigned long *start_virt, unsigned long *size) +{ + struct pci_controller *hose = pci_bus_to_host(bus); + struct pci_bus_region region; + struct resource *res; + + if (bus->self) { + res = bus->resource[0]; + pcibios_resource_to_bus(bus->self, ®ion, res); + *start_phys = hose->io_base_phys + region.start; + *start_virt = (unsigned long) hose->io_base_virt + + region.start; + if (region.end > region.start) + *size = region.end - region.start + 1; + else { + printk("%s(): unexpected region 0x%lx->0x%lx\n", + __FUNCTION__, region.start, region.end); + return 1; + } + + } else { + /* Root Bus */ + res = &hose->io_resource; + *start_phys = hose->io_base_phys; + *start_virt = (unsigned long) hose->io_base_virt; + if (res->end > res->start) + *size = res->end - res->start + 1; + else { + printk("%s(): unexpected region 0x%lx->0x%lx\n", + __FUNCTION__, res->start, res->end); + return 1; + } + } + + return 0; +} + +int unmap_bus_range(struct pci_bus *bus) +{ + unsigned long start_phys; + unsigned long start_virt; + unsigned long size; + + if (!bus) { + printk(KERN_ERR "%s() expected bus\n", __FUNCTION__); + return 1; + } + + if (get_bus_io_range(bus, &start_phys, &start_virt, &size)) + return 1; + if (iounmap_explicit((void __iomem *) start_virt, size)) + return 1; + + return 0; +} +EXPORT_SYMBOL(unmap_bus_range); + +int remap_bus_range(struct pci_bus *bus) +{ + unsigned long start_phys; + unsigned long start_virt; + unsigned long size; + + if (!bus) { + printk(KERN_ERR "%s() expected bus\n", __FUNCTION__); + return 1; + } + + + if (get_bus_io_range(bus, &start_phys, &start_virt, &size)) + return 1; + if (start_phys == 0) + return 1; + printk("mapping IO %lx -> %lx, size: %lx\n", start_phys, start_virt, size); + if (__ioremap_explicit(start_phys, start_virt, size, + _PAGE_NO_CACHE | _PAGE_GUARDED)) + return 1; + + return 0; +} +EXPORT_SYMBOL(remap_bus_range); + +void phbs_remap_io(void) +{ + struct pci_controller *hose, *tmp; + + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) + remap_bus_range(hose->bus); +} + +/* + * ppc64 can have multifunction devices that do not respond to function 0. + * In this case we must scan all functions. + * XXX this can go now, we use the OF device tree in all the + * cases that caused problems. -- paulus + */ +int pcibios_scan_all_fns(struct pci_bus *bus, int devfn) +{ + return 0; +} + +static void __devinit fixup_resource(struct resource *res, struct pci_dev *dev) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + unsigned long start, end, mask, offset; + + if (res->flags & IORESOURCE_IO) { + offset = (unsigned long)hose->io_base_virt - pci_io_base; + + start = res->start += offset; + end = res->end += offset; + + /* Need to allow IO access to pages that are in the + ISA range */ + if (start < MAX_ISA_PORT) { + if (end > MAX_ISA_PORT) + end = MAX_ISA_PORT; + + start >>= PAGE_SHIFT; + end >>= PAGE_SHIFT; + + /* get the range of pages for the map */ + mask = ((1 << (end+1)) - 1) ^ ((1 << start) - 1); + io_page_mask |= mask; + } + } else if (res->flags & IORESOURCE_MEM) { + res->start += hose->pci_mem_offset; + res->end += hose->pci_mem_offset; + } +} + +void __devinit pcibios_fixup_device_resources(struct pci_dev *dev, + struct pci_bus *bus) +{ + /* Update device resources. */ + int i; + + for (i = 0; i < PCI_NUM_RESOURCES; i++) + if (dev->resource[i].flags) + fixup_resource(&dev->resource[i], dev); +} +EXPORT_SYMBOL(pcibios_fixup_device_resources); + +static void __devinit do_bus_setup(struct pci_bus *bus) +{ + struct pci_dev *dev; + + ppc_md.iommu_bus_setup(bus); + + list_for_each_entry(dev, &bus->devices, bus_list) + ppc_md.iommu_dev_setup(dev); + + if (ppc_md.irq_bus_setup) + ppc_md.irq_bus_setup(bus); +} + +void __devinit pcibios_fixup_bus(struct pci_bus *bus) +{ + struct pci_dev *dev = bus->self; + + if (dev && pci_probe_only && + (dev->class >> 8) == PCI_CLASS_BRIDGE_PCI) { + /* This is a subordinate bridge */ + + pci_read_bridge_bases(bus); + pcibios_fixup_device_resources(dev, bus); + } + + do_bus_setup(bus); + + if (!pci_probe_only) + return; + + list_for_each_entry(dev, &bus->devices, bus_list) + if ((dev->class >> 8) != PCI_CLASS_BRIDGE_PCI) + pcibios_fixup_device_resources(dev, bus); +} +EXPORT_SYMBOL(pcibios_fixup_bus); + +/* + * Reads the interrupt pin to determine if interrupt is use by card. + * If the interrupt is used, then gets the interrupt line from the + * openfirmware and sets it in the pci_dev and pci_config line. + */ +int pci_read_irq_line(struct pci_dev *pci_dev) +{ + u8 intpin; + struct device_node *node; + + pci_read_config_byte(pci_dev, PCI_INTERRUPT_PIN, &intpin); + if (intpin == 0) + return 0; + + node = pci_device_to_OF_node(pci_dev); + if (node == NULL) + return -1; + + if (node->n_intrs == 0) + return -1; + + pci_dev->irq = node->intrs[0].line; + + pci_write_config_byte(pci_dev, PCI_INTERRUPT_LINE, pci_dev->irq); + + return 0; +} +EXPORT_SYMBOL(pci_read_irq_line); + +void pci_resource_to_user(const struct pci_dev *dev, int bar, + const struct resource *rsrc, + u64 *start, u64 *end) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + unsigned long offset = 0; + + if (hose == NULL) + return; + + if (rsrc->flags & IORESOURCE_IO) + offset = pci_io_base - (unsigned long)hose->io_base_virt + + hose->io_base_phys; + + *start = rsrc->start + offset; + *end = rsrc->end + offset; +} + +#endif /* CONFIG_PPC_MULTIPLATFORM */ + + +#define IOBASE_BRIDGE_NUMBER 0 +#define IOBASE_MEMORY 1 +#define IOBASE_IO 2 +#define IOBASE_ISA_IO 3 +#define IOBASE_ISA_MEM 4 + +long sys_pciconfig_iobase(long which, unsigned long in_bus, + unsigned long in_devfn) +{ + struct pci_controller* hose; + struct list_head *ln; + struct pci_bus *bus = NULL; + struct device_node *hose_node; + + /* Argh ! Please forgive me for that hack, but that's the + * simplest way to get existing XFree to not lockup on some + * G5 machines... So when something asks for bus 0 io base + * (bus 0 is HT root), we return the AGP one instead. + */ + if (machine_is_compatible("MacRISC4")) + if (in_bus == 0) + in_bus = 0xf0; + + /* That syscall isn't quite compatible with PCI domains, but it's + * used on pre-domains setup. We return the first match + */ + + for (ln = pci_root_buses.next; ln != &pci_root_buses; ln = ln->next) { + bus = pci_bus_b(ln); + if (in_bus >= bus->number && in_bus < (bus->number + bus->subordinate)) + break; + bus = NULL; + } + if (bus == NULL || bus->sysdata == NULL) + return -ENODEV; + + hose_node = (struct device_node *)bus->sysdata; + hose = PCI_DN(hose_node)->phb; + + switch (which) { + case IOBASE_BRIDGE_NUMBER: + return (long)hose->first_busno; + case IOBASE_MEMORY: + return (long)hose->pci_mem_offset; + case IOBASE_IO: + return (long)hose->io_base_phys; + case IOBASE_ISA_IO: + return (long)isa_io_base; + case IOBASE_ISA_MEM: + return -EINVAL; + } + + return -EOPNOTSUPP; +} diff --git a/arch/powerpc/kernel/pci_direct_iommu.c b/arch/powerpc/kernel/pci_direct_iommu.c new file mode 100644 index 00000000000..e1a32f802c0 --- /dev/null +++ b/arch/powerpc/kernel/pci_direct_iommu.c @@ -0,0 +1,94 @@ +/* + * Support for DMA from PCI devices to main memory on + * machines without an iommu or with directly addressable + * RAM (typically a pmac with 2Gb of RAM or less) + * + * Copyright (C) 2003 Benjamin Herrenschmidt (benh@kernel.crashing.org) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/delay.h> +#include <linux/string.h> +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/mm.h> +#include <linux/dma-mapping.h> + +#include <asm/sections.h> +#include <asm/io.h> +#include <asm/prom.h> +#include <asm/pci-bridge.h> +#include <asm/machdep.h> +#include <asm/pmac_feature.h> +#include <asm/abs_addr.h> +#include <asm/ppc-pci.h> + +static void *pci_direct_alloc_coherent(struct device *hwdev, size_t size, + dma_addr_t *dma_handle, gfp_t flag) +{ + void *ret; + + ret = (void *)__get_free_pages(flag, get_order(size)); + if (ret != NULL) { + memset(ret, 0, size); + *dma_handle = virt_to_abs(ret); + } + return ret; +} + +static void pci_direct_free_coherent(struct device *hwdev, size_t size, + void *vaddr, dma_addr_t dma_handle) +{ + free_pages((unsigned long)vaddr, get_order(size)); +} + +static dma_addr_t pci_direct_map_single(struct device *hwdev, void *ptr, + size_t size, enum dma_data_direction direction) +{ + return virt_to_abs(ptr); +} + +static void pci_direct_unmap_single(struct device *hwdev, dma_addr_t dma_addr, + size_t size, enum dma_data_direction direction) +{ +} + +static int pci_direct_map_sg(struct device *hwdev, struct scatterlist *sg, + int nents, enum dma_data_direction direction) +{ + int i; + + for (i = 0; i < nents; i++, sg++) { + sg->dma_address = page_to_phys(sg->page) + sg->offset; + sg->dma_length = sg->length; + } + + return nents; +} + +static void pci_direct_unmap_sg(struct device *hwdev, struct scatterlist *sg, + int nents, enum dma_data_direction direction) +{ +} + +static int pci_direct_dma_supported(struct device *dev, u64 mask) +{ + return mask < 0x100000000ull; +} + +void __init pci_direct_iommu_init(void) +{ + pci_dma_ops.alloc_coherent = pci_direct_alloc_coherent; + pci_dma_ops.free_coherent = pci_direct_free_coherent; + pci_dma_ops.map_single = pci_direct_map_single; + pci_dma_ops.unmap_single = pci_direct_unmap_single; + pci_dma_ops.map_sg = pci_direct_map_sg; + pci_dma_ops.unmap_sg = pci_direct_unmap_sg; + pci_dma_ops.dma_supported = pci_direct_dma_supported; +} diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c new file mode 100644 index 00000000000..12c4c9e9bbc --- /dev/null +++ b/arch/powerpc/kernel/pci_dn.c @@ -0,0 +1,230 @@ +/* + * pci_dn.c + * + * Copyright (C) 2001 Todd Inglett, IBM Corporation + * + * PCI manipulation via device_nodes. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/string.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/bootmem.h> + +#include <asm/io.h> +#include <asm/prom.h> +#include <asm/pci-bridge.h> +#include <asm/pSeries_reconfig.h> +#include <asm/ppc-pci.h> + +/* + * Traverse_func that inits the PCI fields of the device node. + * NOTE: this *must* be done before read/write config to the device. + */ +static void * __devinit update_dn_pci_info(struct device_node *dn, void *data) +{ + struct pci_controller *phb = data; + int *type = (int *)get_property(dn, "ibm,pci-config-space-type", NULL); + u32 *regs; + struct pci_dn *pdn; + + if (mem_init_done) + pdn = kmalloc(sizeof(*pdn), GFP_KERNEL); + else + pdn = alloc_bootmem(sizeof(*pdn)); + if (pdn == NULL) + return NULL; + memset(pdn, 0, sizeof(*pdn)); + dn->data = pdn; + pdn->node = dn; + pdn->phb = phb; + regs = (u32 *)get_property(dn, "reg", NULL); + if (regs) { + /* First register entry is addr (00BBSS00) */ + pdn->busno = (regs[0] >> 16) & 0xff; + pdn->devfn = (regs[0] >> 8) & 0xff; + } + + pdn->pci_ext_config_space = (type && *type == 1); + return NULL; +} + +/* + * Traverse a device tree stopping each PCI device in the tree. + * This is done depth first. As each node is processed, a "pre" + * function is called and the children are processed recursively. + * + * The "pre" func returns a value. If non-zero is returned from + * the "pre" func, the traversal stops and this value is returned. + * This return value is useful when using traverse as a method of + * finding a device. + * + * NOTE: we do not run the func for devices that do not appear to + * be PCI except for the start node which we assume (this is good + * because the start node is often a phb which may be missing PCI + * properties). + * We use the class-code as an indicator. If we run into + * one of these nodes we also assume its siblings are non-pci for + * performance. + */ +void *traverse_pci_devices(struct device_node *start, traverse_func pre, + void *data) +{ + struct device_node *dn, *nextdn; + void *ret; + + /* We started with a phb, iterate all childs */ + for (dn = start->child; dn; dn = nextdn) { + u32 *classp, class; + + nextdn = NULL; + classp = (u32 *)get_property(dn, "class-code", NULL); + class = classp ? *classp : 0; + + if (pre && ((ret = pre(dn, data)) != NULL)) + return ret; + + /* If we are a PCI bridge, go down */ + if (dn->child && ((class >> 8) == PCI_CLASS_BRIDGE_PCI || + (class >> 8) == PCI_CLASS_BRIDGE_CARDBUS)) + /* Depth first...do children */ + nextdn = dn->child; + else if (dn->sibling) + /* ok, try next sibling instead. */ + nextdn = dn->sibling; + if (!nextdn) { + /* Walk up to next valid sibling. */ + do { + dn = dn->parent; + if (dn == start) + return NULL; + } while (dn->sibling == NULL); + nextdn = dn->sibling; + } + } + return NULL; +} + +/** + * pci_devs_phb_init_dynamic - setup pci devices under this PHB + * phb: pci-to-host bridge (top-level bridge connecting to cpu) + * + * This routine is called both during boot, (before the memory + * subsystem is set up, before kmalloc is valid) and during the + * dynamic lpar operation of adding a PHB to a running system. + */ +void __devinit pci_devs_phb_init_dynamic(struct pci_controller *phb) +{ + struct device_node * dn = (struct device_node *) phb->arch_data; + struct pci_dn *pdn; + + /* PHB nodes themselves must not match */ + update_dn_pci_info(dn, phb); + pdn = dn->data; + if (pdn) { + pdn->devfn = pdn->busno = -1; + pdn->phb = phb; + } + + /* Update dn->phb ptrs for new phb and children devices */ + traverse_pci_devices(dn, update_dn_pci_info, phb); +} + +/* + * Traversal func that looks for a <busno,devfcn> value. + * If found, the pci_dn is returned (thus terminating the traversal). + */ +static void *is_devfn_node(struct device_node *dn, void *data) +{ + int busno = ((unsigned long)data >> 8) & 0xff; + int devfn = ((unsigned long)data) & 0xff; + struct pci_dn *pci = dn->data; + + if (pci && (devfn == pci->devfn) && (busno == pci->busno)) + return dn; + return NULL; +} + +/* + * This is the "slow" path for looking up a device_node from a + * pci_dev. It will hunt for the device under its parent's + * phb and then update sysdata for a future fastpath. + * + * It may also do fixups on the actual device since this happens + * on the first read/write. + * + * Note that it also must deal with devices that don't exist. + * In this case it may probe for real hardware ("just in case") + * and add a device_node to the device tree if necessary. + * + */ +struct device_node *fetch_dev_dn(struct pci_dev *dev) +{ + struct device_node *orig_dn = dev->sysdata; + struct device_node *dn; + unsigned long searchval = (dev->bus->number << 8) | dev->devfn; + + dn = traverse_pci_devices(orig_dn, is_devfn_node, (void *)searchval); + if (dn) + dev->sysdata = dn; + return dn; +} +EXPORT_SYMBOL(fetch_dev_dn); + +static int pci_dn_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *node) +{ + struct device_node *np = node; + struct pci_dn *pci = NULL; + int err = NOTIFY_OK; + + switch (action) { + case PSERIES_RECONFIG_ADD: + pci = np->parent->data; + if (pci) + update_dn_pci_info(np, pci->phb); + break; + default: + err = NOTIFY_DONE; + break; + } + return err; +} + +static struct notifier_block pci_dn_reconfig_nb = { + .notifier_call = pci_dn_reconfig_notifier, +}; + +/** + * pci_devs_phb_init - Initialize phbs and pci devs under them. + * + * This routine walks over all phb's (pci-host bridges) on the + * system, and sets up assorted pci-related structures + * (including pci info in the device node structs) for each + * pci device found underneath. This routine runs once, + * early in the boot sequence. + */ +void __init pci_devs_phb_init(void) +{ + struct pci_controller *phb, *tmp; + + /* This must be done first so the device nodes have valid pci info! */ + list_for_each_entry_safe(phb, tmp, &hose_list, list_node) + pci_devs_phb_init_dynamic(phb); + + pSeries_reconfig_notifier_register(&pci_dn_reconfig_nb); +} diff --git a/arch/powerpc/kernel/pci_iommu.c b/arch/powerpc/kernel/pci_iommu.c new file mode 100644 index 00000000000..bdf15dbbf4f --- /dev/null +++ b/arch/powerpc/kernel/pci_iommu.c @@ -0,0 +1,128 @@ +/* + * arch/ppc64/kernel/pci_iommu.c + * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation + * + * Rewrite, cleanup, new allocation schemes: + * Copyright (C) 2004 Olof Johansson, IBM Corporation + * + * Dynamic DMA mapping support, platform-independent parts. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + + +#include <linux/init.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/spinlock.h> +#include <linux/string.h> +#include <linux/pci.h> +#include <linux/dma-mapping.h> +#include <asm/io.h> +#include <asm/prom.h> +#include <asm/iommu.h> +#include <asm/pci-bridge.h> +#include <asm/machdep.h> +#include <asm/ppc-pci.h> + +/* + * We can use ->sysdata directly and avoid the extra work in + * pci_device_to_OF_node since ->sysdata will have been initialised + * in the iommu init code for all devices. + */ +#define PCI_GET_DN(dev) ((struct device_node *)((dev)->sysdata)) + +static inline struct iommu_table *devnode_table(struct device *dev) +{ + struct pci_dev *pdev; + + if (!dev) { + pdev = ppc64_isabridge_dev; + if (!pdev) + return NULL; + } else + pdev = to_pci_dev(dev); + + return PCI_DN(PCI_GET_DN(pdev))->iommu_table; +} + + +/* Allocates a contiguous real buffer and creates mappings over it. + * Returns the virtual address of the buffer and sets dma_handle + * to the dma address (mapping) of the first page. + */ +static void *pci_iommu_alloc_coherent(struct device *hwdev, size_t size, + dma_addr_t *dma_handle, gfp_t flag) +{ + return iommu_alloc_coherent(devnode_table(hwdev), size, dma_handle, + flag); +} + +static void pci_iommu_free_coherent(struct device *hwdev, size_t size, + void *vaddr, dma_addr_t dma_handle) +{ + iommu_free_coherent(devnode_table(hwdev), size, vaddr, dma_handle); +} + +/* Creates TCEs for a user provided buffer. The user buffer must be + * contiguous real kernel storage (not vmalloc). The address of the buffer + * passed here is the kernel (virtual) address of the buffer. The buffer + * need not be page aligned, the dma_addr_t returned will point to the same + * byte within the page as vaddr. + */ +static dma_addr_t pci_iommu_map_single(struct device *hwdev, void *vaddr, + size_t size, enum dma_data_direction direction) +{ + return iommu_map_single(devnode_table(hwdev), vaddr, size, direction); +} + + +static void pci_iommu_unmap_single(struct device *hwdev, dma_addr_t dma_handle, + size_t size, enum dma_data_direction direction) +{ + iommu_unmap_single(devnode_table(hwdev), dma_handle, size, direction); +} + + +static int pci_iommu_map_sg(struct device *pdev, struct scatterlist *sglist, + int nelems, enum dma_data_direction direction) +{ + return iommu_map_sg(pdev, devnode_table(pdev), sglist, + nelems, direction); +} + +static void pci_iommu_unmap_sg(struct device *pdev, struct scatterlist *sglist, + int nelems, enum dma_data_direction direction) +{ + iommu_unmap_sg(devnode_table(pdev), sglist, nelems, direction); +} + +/* We support DMA to/from any memory page via the iommu */ +static int pci_iommu_dma_supported(struct device *dev, u64 mask) +{ + return 1; +} + +void pci_iommu_init(void) +{ + pci_dma_ops.alloc_coherent = pci_iommu_alloc_coherent; + pci_dma_ops.free_coherent = pci_iommu_free_coherent; + pci_dma_ops.map_single = pci_iommu_map_single; + pci_dma_ops.unmap_single = pci_iommu_unmap_single; + pci_dma_ops.map_sg = pci_iommu_map_sg; + pci_dma_ops.unmap_sg = pci_iommu_unmap_sg; + pci_dma_ops.dma_supported = pci_iommu_dma_supported; +} diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c index 5dcf4ba05ee..59846b40d52 100644 --- a/arch/powerpc/kernel/ppc_ksyms.c +++ b/arch/powerpc/kernel/ppc_ksyms.c @@ -105,6 +105,13 @@ EXPORT_SYMBOL(__clear_user); EXPORT_SYMBOL(__strncpy_from_user); EXPORT_SYMBOL(__strnlen_user); +#ifndef __powerpc64__ +EXPORT_SYMBOL(__ide_mm_insl); +EXPORT_SYMBOL(__ide_mm_outsw); +EXPORT_SYMBOL(__ide_mm_insw); +EXPORT_SYMBOL(__ide_mm_outsl); +#endif + EXPORT_SYMBOL(_insb); EXPORT_SYMBOL(_outsb); EXPORT_SYMBOL(_insw); diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 6a5b468edb4..3bf968e7409 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -1368,6 +1368,7 @@ prom_n_addr_cells(struct device_node* np) /* No #address-cells property for the root node, default to 1 */ return 1; } +EXPORT_SYMBOL(prom_n_addr_cells); int prom_n_size_cells(struct device_node* np) @@ -1383,6 +1384,7 @@ prom_n_size_cells(struct device_node* np) /* No #size-cells property for the root node, default to 1 */ return 1; } +EXPORT_SYMBOL(prom_n_size_cells); /** * Work out the sense (active-low level / active-high edge) diff --git a/arch/powerpc/kernel/rtas-rtc.c b/arch/powerpc/kernel/rtas-rtc.c new file mode 100644 index 00000000000..635d3b9a881 --- /dev/null +++ b/arch/powerpc/kernel/rtas-rtc.c @@ -0,0 +1,105 @@ +#include <linux/kernel.h> +#include <linux/time.h> +#include <linux/timer.h> +#include <linux/init.h> +#include <linux/rtc.h> +#include <linux/delay.h> +#include <asm/prom.h> +#include <asm/rtas.h> +#include <asm/time.h> + + +#define MAX_RTC_WAIT 5000 /* 5 sec */ +#define RTAS_CLOCK_BUSY (-2) +unsigned long __init rtas_get_boot_time(void) +{ + int ret[8]; + int error, wait_time; + u64 max_wait_tb; + + max_wait_tb = get_tb() + tb_ticks_per_usec * 1000 * MAX_RTC_WAIT; + do { + error = rtas_call(rtas_token("get-time-of-day"), 0, 8, ret); + if (error == RTAS_CLOCK_BUSY || rtas_is_extended_busy(error)) { + wait_time = rtas_extended_busy_delay_time(error); + /* This is boot time so we spin. */ + udelay(wait_time*1000); + error = RTAS_CLOCK_BUSY; + } + } while (error == RTAS_CLOCK_BUSY && (get_tb() < max_wait_tb)); + + if (error != 0 && printk_ratelimit()) { + printk(KERN_WARNING "error: reading the clock failed (%d)\n", + error); + return 0; + } + + return mktime(ret[0], ret[1], ret[2], ret[3], ret[4], ret[5]); +} + +/* NOTE: get_rtc_time will get an error if executed in interrupt context + * and if a delay is needed to read the clock. In this case we just + * silently return without updating rtc_tm. + */ +void rtas_get_rtc_time(struct rtc_time *rtc_tm) +{ + int ret[8]; + int error, wait_time; + u64 max_wait_tb; + + max_wait_tb = get_tb() + tb_ticks_per_usec * 1000 * MAX_RTC_WAIT; + do { + error = rtas_call(rtas_token("get-time-of-day"), 0, 8, ret); + if (error == RTAS_CLOCK_BUSY || rtas_is_extended_busy(error)) { + if (in_interrupt() && printk_ratelimit()) { + memset(&rtc_tm, 0, sizeof(struct rtc_time)); + printk(KERN_WARNING "error: reading clock" + " would delay interrupt\n"); + return; /* delay not allowed */ + } + wait_time = rtas_extended_busy_delay_time(error); + msleep(wait_time); + error = RTAS_CLOCK_BUSY; + } + } while (error == RTAS_CLOCK_BUSY && (get_tb() < max_wait_tb)); + + if (error != 0 && printk_ratelimit()) { + printk(KERN_WARNING "error: reading the clock failed (%d)\n", + error); + return; + } + + rtc_tm->tm_sec = ret[5]; + rtc_tm->tm_min = ret[4]; + rtc_tm->tm_hour = ret[3]; + rtc_tm->tm_mday = ret[2]; + rtc_tm->tm_mon = ret[1] - 1; + rtc_tm->tm_year = ret[0] - 1900; +} + +int rtas_set_rtc_time(struct rtc_time *tm) +{ + int error, wait_time; + u64 max_wait_tb; + + max_wait_tb = get_tb() + tb_ticks_per_usec * 1000 * MAX_RTC_WAIT; + do { + error = rtas_call(rtas_token("set-time-of-day"), 7, 1, NULL, + tm->tm_year + 1900, tm->tm_mon + 1, + tm->tm_mday, tm->tm_hour, tm->tm_min, + tm->tm_sec, 0); + if (error == RTAS_CLOCK_BUSY || rtas_is_extended_busy(error)) { + if (in_interrupt()) + return 1; /* probably decrementer */ + wait_time = rtas_extended_busy_delay_time(error); + msleep(wait_time); + error = RTAS_CLOCK_BUSY; + } + } while (error == RTAS_CLOCK_BUSY && (get_tb() < max_wait_tb)); + + if (error != 0 && printk_ratelimit()) + printk(KERN_WARNING "error: setting the clock failed (%d)\n", + error); + + return 0; +} diff --git a/arch/powerpc/kernel/rtas_pci.c b/arch/powerpc/kernel/rtas_pci.c index 0e5a8e11665..60dec2401c2 100644 --- a/arch/powerpc/kernel/rtas_pci.c +++ b/arch/powerpc/kernel/rtas_pci.c @@ -304,75 +304,18 @@ static int __devinit setup_phb(struct device_node *dev, struct pci_controller *phb, unsigned int addr_size_words) { - pci_setup_pci_controller(phb); - if (is_python(dev)) python_countermeasures(dev, addr_size_words); if (phb_set_bus_ranges(dev, phb)) return 1; - phb->arch_data = dev; phb->ops = &rtas_pci_ops; phb->buid = get_phb_buid(dev); return 0; } -static void __devinit add_linux_pci_domain(struct device_node *dev, - struct pci_controller *phb, - struct property *of_prop) -{ - memset(of_prop, 0, sizeof(struct property)); - of_prop->name = "linux,pci-domain"; - of_prop->length = sizeof(phb->global_number); - of_prop->value = (unsigned char *)&of_prop[1]; - memcpy(of_prop->value, &phb->global_number, sizeof(phb->global_number)); - prom_add_property(dev, of_prop); -} - -static struct pci_controller * __init alloc_phb(struct device_node *dev, - unsigned int addr_size_words) -{ - struct pci_controller *phb; - struct property *of_prop; - - phb = alloc_bootmem(sizeof(struct pci_controller)); - if (phb == NULL) - return NULL; - - of_prop = alloc_bootmem(sizeof(struct property) + - sizeof(phb->global_number)); - if (!of_prop) - return NULL; - - if (setup_phb(dev, phb, addr_size_words)) - return NULL; - - add_linux_pci_domain(dev, phb, of_prop); - - return phb; -} - -static struct pci_controller * __devinit alloc_phb_dynamic(struct device_node *dev, unsigned int addr_size_words) -{ - struct pci_controller *phb; - - phb = (struct pci_controller *)kmalloc(sizeof(struct pci_controller), - GFP_KERNEL); - if (phb == NULL) - return NULL; - - if (setup_phb(dev, phb, addr_size_words)) - return NULL; - - phb->is_dynamic = 1; - - /* TODO: linux,pci-domain? */ - - return phb; -} - unsigned long __init find_and_init_phbs(void) { struct device_node *node; @@ -397,10 +340,10 @@ unsigned long __init find_and_init_phbs(void) if (node->type == NULL || strcmp(node->type, "pci") != 0) continue; - phb = alloc_phb(node, root_size_cells); + phb = pcibios_alloc_controller(node); if (!phb) continue; - + setup_phb(node, phb, root_size_cells); pci_process_bridge_OF_ranges(phb, node, 0); pci_setup_phb_io(phb, index == 0); #ifdef CONFIG_PPC_PSERIES @@ -446,10 +389,10 @@ struct pci_controller * __devinit init_phb_dynamic(struct device_node *dn) root_size_cells = prom_n_size_cells(root); primary = list_empty(&hose_list); - phb = alloc_phb_dynamic(dn, root_size_cells); + phb = pcibios_alloc_controller(dn); if (!phb) return NULL; - + setup_phb(dn, phb, root_size_cells); pci_process_bridge_OF_ranges(phb, dn, primary); pci_setup_phb_io_dynamic(phb, primary); @@ -505,8 +448,7 @@ int pcibios_remove_root_bus(struct pci_controller *phb) } list_del(&phb->list_node); - if (phb->is_dynamic) - kfree(phb); + pcibios_free_controller(phb); return 0; } diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 33e7f2c7f19..bd3eb4292b5 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -59,6 +59,7 @@ #undef DEBUG #ifdef DEBUG +#include <asm/udbg.h> #define DBG(fmt...) udbg_printf(fmt) #else #define DBG(fmt...) diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index c98cfcc9cd9..e5694335bf1 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -57,10 +57,6 @@ extern void power4_idle(void); boot_infos_t *boot_infos; struct ide_machdep_calls ppc_ide_md; -/* XXX should go elsewhere */ -int __irq_offset_value; -EXPORT_SYMBOL(__irq_offset_value); - int boot_cpuid; EXPORT_SYMBOL_GPL(boot_cpuid); int boot_cpuid_phys; diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index fdbd9f9122f..608fee7c7e2 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -59,6 +59,7 @@ #include <asm/firmware.h> #include <asm/xmon.h> #include <asm/udbg.h> +#include <asm/kexec.h> #include "setup.h" @@ -415,6 +416,10 @@ void __init setup_system(void) */ unflatten_device_tree(); +#ifdef CONFIG_KEXEC + kexec_setup(); /* requires unflattened device tree. */ +#endif + /* * Fill the ppc64_caches & systemcfg structures with informations * retreived from the device-tree. Need to be called before diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 8bdf95b7e42..5a2eba60dd3 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -403,8 +403,6 @@ static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame, ELF_NFPREG * sizeof(double))) return 1; - current->thread.fpscr.val = 0; /* turn off all fp exceptions */ - #ifdef CONFIG_ALTIVEC /* save altivec registers */ if (current->thread.used_vr) { @@ -818,6 +816,9 @@ static int handle_rt_signal(unsigned long sig, struct k_sigaction *ka, goto badframe; regs->link = (unsigned long) frame->tramp; } + + current->thread.fpscr.val = 0; /* turn off all fp exceptions */ + if (put_user(regs->gpr[1], (u32 __user *)newsp)) goto badframe; regs->gpr[1] = newsp; @@ -1097,6 +1098,8 @@ static int handle_signal(unsigned long sig, struct k_sigaction *ka, regs->link = (unsigned long) frame->mctx.tramp; } + current->thread.fpscr.val = 0; /* turn off all fp exceptions */ + if (put_user(regs->gpr[1], (u32 __user *)newsp)) goto badframe; regs->gpr[1] = newsp; diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index 58194e15071..1decf278553 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -131,9 +131,6 @@ static long setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, flush_fp_to_thread(current); - /* Make sure signal doesn't get spurrious FP exceptions */ - current->thread.fpscr.val = 0; - #ifdef CONFIG_ALTIVEC err |= __put_user(v_regs, &sc->v_regs); @@ -423,6 +420,9 @@ static int setup_rt_frame(int signr, struct k_sigaction *ka, siginfo_t *info, if (err) goto badframe; + /* Make sure signal handler doesn't get spurious FP exceptions */ + current->thread.fpscr.val = 0; + /* Set up to return from userspace. */ if (vdso64_rt_sigtramp && current->thread.vdso_base) { regs->link = current->thread.vdso_base + vdso64_rt_sigtramp; diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 62dfc5b8d76..30374d2f88e 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -49,15 +49,16 @@ #include <asm/paca.h> #endif -int smp_hw_index[NR_CPUS]; -struct thread_info *secondary_ti; - #ifdef DEBUG +#include <asm/udbg.h> #define DBG(fmt...) udbg_printf(fmt) #else #define DBG(fmt...) #endif +int smp_hw_index[NR_CPUS]; +struct thread_info *secondary_ti; + cpumask_t cpu_possible_map = CPU_MASK_NONE; cpumask_t cpu_online_map = CPU_MASK_NONE; cpumask_t cpu_sibling_map[NR_CPUS] = { [0 ... NR_CPUS-1] = CPU_MASK_NONE }; diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 070b4b458aa..de8479769bb 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -130,6 +130,34 @@ unsigned long tb_last_stamp; */ DEFINE_PER_CPU(unsigned long, last_jiffy); +void __delay(unsigned long loops) +{ + unsigned long start; + int diff; + + if (__USE_RTC()) { + start = get_rtcl(); + do { + /* the RTCL register wraps at 1000000000 */ + diff = get_rtcl() - start; + if (diff < 0) + diff += 1000000000; + } while (diff < loops); + } else { + start = get_tbl(); + while (get_tbl() - start < loops) + HMT_low(); + HMT_medium(); + } +} +EXPORT_SYMBOL(__delay); + +void udelay(unsigned long usecs) +{ + __delay(tb_ticks_per_usec * usecs); +} +EXPORT_SYMBOL(udelay); + static __inline__ void timer_check_rtc(void) { /* diff --git a/arch/powerpc/kernel/vdso32/cacheflush.S b/arch/powerpc/kernel/vdso32/cacheflush.S index c8db993574e..09629aea3e4 100644 --- a/arch/powerpc/kernel/vdso32/cacheflush.S +++ b/arch/powerpc/kernel/vdso32/cacheflush.S @@ -35,6 +35,7 @@ V_FUNCTION_BEGIN(__kernel_sync_dicache) subf r8,r6,r4 /* compute length */ add r8,r8,r5 /* ensure we get enough */ srwi. r8,r8,7 /* compute line count */ + crclr cr0*4+so beqlr /* nothing to do? */ mtctr r8 mr r3,r6 @@ -58,6 +59,7 @@ V_FUNCTION_END(__kernel_sync_dicache) */ V_FUNCTION_BEGIN(__kernel_sync_dicache_p5) .cfi_startproc + crclr cr0*4+so sync isync li r3,0 diff --git a/arch/powerpc/kernel/vdso32/datapage.S b/arch/powerpc/kernel/vdso32/datapage.S index a08c26e8783..4709f1d9542 100644 --- a/arch/powerpc/kernel/vdso32/datapage.S +++ b/arch/powerpc/kernel/vdso32/datapage.S @@ -54,7 +54,6 @@ V_FUNCTION_BEGIN(__kernel_get_syscall_map) .cfi_startproc mflr r12 .cfi_register lr,r12 - mr r4,r3 bl __get_datapage@local mtlr r12 @@ -63,6 +62,7 @@ V_FUNCTION_BEGIN(__kernel_get_syscall_map) beqlr li r0,__NR_syscalls stw r0,0(r4) + crclr cr0*4+so blr .cfi_endproc V_FUNCTION_END(__kernel_get_syscall_map) @@ -77,8 +77,10 @@ V_FUNCTION_BEGIN(__kernel_get_tbfreq) mflr r12 .cfi_register lr,r12 bl __get_datapage@local - lwz r3,CFG_TB_TICKS_PER_SEC(r3) lwz r4,(CFG_TB_TICKS_PER_SEC + 4)(r3) + lwz r3,CFG_TB_TICKS_PER_SEC(r3) mtlr r12 + crclr cr0*4+so + blr .cfi_endproc V_FUNCTION_END(__kernel_get_tbfreq) diff --git a/arch/powerpc/kernel/vdso32/gettimeofday.S b/arch/powerpc/kernel/vdso32/gettimeofday.S index aeb5fc9b87b..7eebff03a04 100644 --- a/arch/powerpc/kernel/vdso32/gettimeofday.S +++ b/arch/powerpc/kernel/vdso32/gettimeofday.S @@ -59,6 +59,7 @@ V_FUNCTION_BEGIN(__kernel_gettimeofday) stw r5,TZONE_TZ_DSTTIME(r11) 1: mtlr r12 + crclr cr0*4+so li r3,0 blr @@ -83,7 +84,7 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) /* Check for supported clock IDs */ cmpli cr0,r3,CLOCK_REALTIME cmpli cr1,r3,CLOCK_MONOTONIC - cror cr0,cr0,cr1 + cror cr0*4+eq,cr0*4+eq,cr1*4+eq bne cr0,99f mflr r12 /* r12 saves lr */ @@ -91,7 +92,7 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) mr r10,r3 /* r10 saves id */ mr r11,r4 /* r11 saves tp */ bl __get_datapage@local /* get data page */ - mr r9, r3 /* datapage ptr in r9 */ + mr r9,r3 /* datapage ptr in r9 */ beq cr1,50f /* if monotonic -> jump there */ /* @@ -117,6 +118,7 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) mulli r5,r5,1000 stw r5,TSPC32_TV_NSEC(r11) mtlr r12 + crclr cr0*4+so li r3,0 blr @@ -173,14 +175,19 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) add r4,r4,r7 lis r5,NSEC_PER_SEC@h ori r5,r5,NSEC_PER_SEC@l - cmpli cr0,r4,r5 + cmpl cr0,r4,r5 + cmpli cr1,r4,0 blt 1f subf r4,r5,r4 addi r3,r3,1 +1: bge cr1,1f + addi r3,r3,-1 + add r4,r4,r5 1: stw r3,TSPC32_TV_SEC(r11) stw r4,TSPC32_TV_NSEC(r11) mtlr r12 + crclr cr0*4+so li r3,0 blr @@ -210,11 +217,12 @@ V_FUNCTION_BEGIN(__kernel_clock_getres) /* Check for supported clock IDs */ cmpwi cr0,r3,CLOCK_REALTIME cmpwi cr1,r3,CLOCK_MONOTONIC - cror cr0,cr0,cr1 + cror cr0*4+eq,cr0*4+eq,cr1*4+eq bne cr0,99f li r3,0 cmpli cr0,r4,0 + crclr cr0*4+so beqlr lis r5,CLOCK_REALTIME_RES@h ori r5,r5,CLOCK_REALTIME_RES@l diff --git a/arch/powerpc/kernel/vdso64/cacheflush.S b/arch/powerpc/kernel/vdso64/cacheflush.S index d4a0ad28d53..cb4ae0a5edd 100644 --- a/arch/powerpc/kernel/vdso64/cacheflush.S +++ b/arch/powerpc/kernel/vdso64/cacheflush.S @@ -35,6 +35,7 @@ V_FUNCTION_BEGIN(__kernel_sync_dicache) subf r8,r6,r4 /* compute length */ add r8,r8,r5 /* ensure we get enough */ srwi. r8,r8,7 /* compute line count */ + crclr cr0*4+so beqlr /* nothing to do? */ mtctr r8 mr r3,r6 @@ -58,6 +59,7 @@ V_FUNCTION_END(__kernel_sync_dicache) */ V_FUNCTION_BEGIN(__kernel_sync_dicache_p5) .cfi_startproc + crclr cr0*4+so sync isync li r3,0 diff --git a/arch/powerpc/kernel/vdso64/datapage.S b/arch/powerpc/kernel/vdso64/datapage.S index e67eda0f8cd..3b2dd7d0c1e 100644 --- a/arch/powerpc/kernel/vdso64/datapage.S +++ b/arch/powerpc/kernel/vdso64/datapage.S @@ -54,12 +54,12 @@ V_FUNCTION_BEGIN(__kernel_get_syscall_map) .cfi_startproc mflr r12 .cfi_register lr,r12 - mr r4,r3 bl V_LOCAL_FUNC(__get_datapage) mtlr r12 addi r3,r3,CFG_SYSCALL_MAP64 cmpli cr0,r4,0 + crclr cr0*4+so beqlr li r0,__NR_syscalls stw r0,0(r4) @@ -80,5 +80,7 @@ V_FUNCTION_BEGIN(__kernel_get_tbfreq) bl V_LOCAL_FUNC(__get_datapage) ld r3,CFG_TB_TICKS_PER_SEC(r3) mtlr r12 + crclr cr0*4+so + blr .cfi_endproc V_FUNCTION_END(__kernel_get_tbfreq) diff --git a/arch/powerpc/kernel/vdso64/gettimeofday.S b/arch/powerpc/kernel/vdso64/gettimeofday.S index d371c02a8c0..ccaeda5136d 100644 --- a/arch/powerpc/kernel/vdso64/gettimeofday.S +++ b/arch/powerpc/kernel/vdso64/gettimeofday.S @@ -1,4 +1,5 @@ -/* + + /* * Userland implementation of gettimeofday() for 64 bits processes in a * ppc64 kernel for use in the vDSO * @@ -51,6 +52,7 @@ V_FUNCTION_BEGIN(__kernel_gettimeofday) stw r4,TZONE_TZ_MINWEST(r10) stw r5,TZONE_TZ_DSTTIME(r10) 1: mtlr r12 + crclr cr0*4+so li r3,0 /* always success */ blr .cfi_endproc @@ -68,7 +70,7 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) /* Check for supported clock IDs */ cmpwi cr0,r3,CLOCK_REALTIME cmpwi cr1,r3,CLOCK_MONOTONIC - cror cr0,cr0,cr1 + cror cr0*4+eq,cr0*4+eq,cr1*4+eq bne cr0,99f mflr r12 /* r12 saves lr */ @@ -84,19 +86,21 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) bl V_LOCAL_FUNC(__do_get_xsec) /* get xsec from tb & kernel */ - lis r7,0x3b9a /* r7 = 1000000000 = NSEC_PER_SEC */ - ori r7,r7,0xca00 + lis r7,15 /* r7 = 1000000 = USEC_PER_SEC */ + ori r7,r7,16960 rldicl r5,r4,44,20 /* r5 = sec = xsec / XSEC_PER_SEC */ rldicr r6,r5,20,43 /* r6 = sec * XSEC_PER_SEC */ std r5,TSPC64_TV_SEC(r11) /* store sec in tv */ subf r0,r6,r4 /* r0 = xsec = (xsec - r6) */ - mulld r0,r0,r7 /* nsec = (xsec * NSEC_PER_SEC) / + mulld r0,r0,r7 /* usec = (xsec * USEC_PER_SEC) / * XSEC_PER_SEC */ rldicl r0,r0,44,20 + mulli r0,r0,1000 /* nsec = usec * 1000 */ std r0,TSPC64_TV_NSEC(r11) /* store nsec in tp */ mtlr r12 + crclr cr0*4+so li r3,0 blr @@ -106,15 +110,16 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) 50: bl V_LOCAL_FUNC(__do_get_xsec) /* get xsec from tb & kernel */ - lis r7,0x3b9a /* r7 = 1000000000 = NSEC_PER_SEC */ - ori r7,r7,0xca00 + lis r7,15 /* r7 = 1000000 = USEC_PER_SEC */ + ori r7,r7,16960 rldicl r5,r4,44,20 /* r5 = sec = xsec / XSEC_PER_SEC */ rldicr r6,r5,20,43 /* r6 = sec * XSEC_PER_SEC */ subf r0,r6,r4 /* r0 = xsec = (xsec - r6) */ - mulld r0,r0,r7 /* nsec = (xsec * NSEC_PER_SEC) / + mulld r0,r0,r7 /* usec = (xsec * USEC_PER_SEC) / * XSEC_PER_SEC */ rldicl r6,r0,44,20 + mulli r6,r6,1000 /* nsec = usec * 1000 */ /* now we must fixup using wall to monotonic. We need to snapshot * that value and do the counter trick again. Fortunately, we still @@ -123,8 +128,8 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) * can be used */ - lwz r4,WTOM_CLOCK_SEC(r9) - lwz r7,WTOM_CLOCK_NSEC(r9) + lwa r4,WTOM_CLOCK_SEC(r3) + lwa r7,WTOM_CLOCK_NSEC(r3) /* We now have our result in r4,r7. We create a fake dependency * on that result and re-check the counter @@ -144,14 +149,19 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) add r7,r7,r6 lis r9,NSEC_PER_SEC@h ori r9,r9,NSEC_PER_SEC@l - cmpli cr0,r7,r9 + cmpl cr0,r7,r9 + cmpli cr1,r7,0 blt 1f subf r7,r9,r7 addi r4,r4,1 +1: bge cr1,1f + addi r4,r4,-1 + add r7,r7,r9 1: std r4,TSPC64_TV_SEC(r11) std r7,TSPC64_TV_NSEC(r11) mtlr r12 + crclr cr0*4+so li r3,0 blr @@ -181,11 +191,12 @@ V_FUNCTION_BEGIN(__kernel_clock_getres) /* Check for supported clock IDs */ cmpwi cr0,r3,CLOCK_REALTIME cmpwi cr1,r3,CLOCK_MONOTONIC - cror cr0,cr0,cr1 + cror cr0*4+eq,cr0*4+eq,cr1*4+eq bne cr0,99f li r3,0 cmpli cr0,r4,0 + crclr cr0*4+so beqlr lis r5,CLOCK_REALTIME_RES@h ori r5,r5,CLOCK_REALTIME_RES@l |