diff options
Diffstat (limited to 'arch/powerpc')
143 files changed, 15600 insertions, 2711 deletions
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index f4e25c648fb..94df74bcc0e 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -261,7 +261,7 @@ config PPC_ISERIES config EMBEDDED6xx bool "Embedded 6xx/7xx/7xxx-based board" - depends on PPC32 + depends on PPC32 && BROKEN config APUS bool "Amiga-APUS" @@ -305,7 +305,7 @@ config PPC_PMAC64 config PPC_PREP bool " PowerPC Reference Platform (PReP) based machines" - depends on PPC_MULTIPLATFORM && PPC32 + depends on PPC_MULTIPLATFORM && PPC32 && BROKEN select PPC_I8259 select PPC_INDIRECT_PCI default y @@ -404,6 +404,14 @@ config CPU_FREQ_PMAC this currently includes some models of iBook & Titanium PowerBook. +config CPU_FREQ_PMAC64 + bool "Support for some Apple G5s" + depends on CPU_FREQ && PMAC_SMU && PPC64 + select CPU_FREQ_TABLE + help + This adds support for frequency switching on Apple iMac G5, + and some of the more recent desktop G5 machines as well. + config PPC601_SYNC_FIX bool "Workarounds for PPC601 bugs" depends on 6xx && (PPC_PREP || PPC_PMAC) @@ -484,6 +492,7 @@ source "fs/Kconfig.binfmt" config FORCE_MAX_ZONEORDER int depends on PPC64 + default "9" if PPC_64K_PAGES default "13" config MATH_EMULATION @@ -572,17 +581,12 @@ config ARCH_FLATMEM_ENABLE def_bool y depends on PPC64 && !NUMA -config ARCH_DISCONTIGMEM_ENABLE - def_bool y - depends on SMP && PPC_PSERIES - -config ARCH_DISCONTIGMEM_DEFAULT +config ARCH_SPARSEMEM_ENABLE def_bool y - depends on ARCH_DISCONTIGMEM_ENABLE -config ARCH_SPARSEMEM_ENABLE +config ARCH_SPARSEMEM_DEFAULT def_bool y - depends on ARCH_DISCONTIGMEM_ENABLE + depends on SMP && PPC_PSERIES source "mm/Kconfig" @@ -590,6 +594,10 @@ config HAVE_ARCH_EARLY_PFN_TO_NID def_bool y depends on NEED_MULTIPLE_NODES +config ARCH_MEMORY_PROBE + def_bool y + depends on MEMORY_HOTPLUG + # Some NUMA nodes have memory ranges that span # other nodes. Even though a pfn is valid and # between a node's start and end pfns, it may not @@ -603,6 +611,16 @@ config NODES_SPAN_OTHER_NODES def_bool y depends on NEED_MULTIPLE_NODES +config PPC_64K_PAGES + bool "64k page size" + depends on PPC64 + help + This option changes the kernel logical page size to 64k. On machines + without processor support for 64k pages, the kernel will simulate + them by loading each individual 4k page on demand transparently, + while on hardware with such support, it will be used to map + normal application pages. + config SCHED_SMT bool "SMT (Hyperthreading) scheduler support" depends on PPC64 && SMP @@ -907,8 +925,22 @@ source "arch/powerpc/platforms/iseries/Kconfig" source "lib/Kconfig" +menu "Instrumentation Support" + depends on EXPERIMENTAL + source "arch/powerpc/oprofile/Kconfig" +config KPROBES + bool "Kprobes (EXPERIMENTAL)" + depends on PPC64 + help + Kprobes allows you to trap at almost any kernel address and + execute a callback function. register_kprobe() establishes + a probepoint and specifies the callback. Kprobes is useful + for kernel debugging, non-intrusive instrumentation and testing. + If in doubt, say "N". +endmenu + source "arch/powerpc/Kconfig.debug" source "security/Kconfig" diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index 0baf64ec80d..30a30bf559e 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -9,16 +9,6 @@ config DEBUG_STACKOVERFLOW This option will cause messages to be printed if free stack space drops below a certain limit. -config KPROBES - bool "Kprobes" - depends on DEBUG_KERNEL && PPC64 - help - Kprobes allows you to trap at almost any kernel address and - execute a callback function. register_kprobe() establishes - a probepoint and specifies the callback. Kprobes is useful - for kernel debugging, non-intrusive instrumentation and testing. - If in doubt, say "N". - config DEBUG_STACK_USAGE bool "Stack utilization instrumentation" depends on DEBUG_KERNEL && PPC64 diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 5bc11bd36c1..d41ad2e675d 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -187,7 +187,7 @@ archprepare: checkbin # Temporary hack until we have migrated to asm-powerpc include/asm: arch/$(ARCH)/include/asm -arch/$(ARCH)/include/asm: +arch/$(ARCH)/include/asm: FORCE $(Q)if [ ! -d arch/$(ARCH)/include ]; then mkdir -p arch/$(ARCH)/include; fi $(Q)ln -fsn $(srctree)/include/asm-$(OLDARCH) arch/$(ARCH)/include/asm diff --git a/arch/powerpc/configs/g5_defconfig b/arch/powerpc/configs/g5_defconfig index 6323065fbf2..e76854f8c12 100644 --- a/arch/powerpc/configs/g5_defconfig +++ b/arch/powerpc/configs/g5_defconfig @@ -1,18 +1,32 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.14-rc4 -# Thu Oct 20 08:30:23 2005 +# Linux kernel version: 2.6.14 +# Mon Nov 7 13:37:59 2005 # +CONFIG_PPC64=y CONFIG_64BIT=y +CONFIG_PPC_MERGE=y CONFIG_MMU=y +CONFIG_GENERIC_HARDIRQS=y CONFIG_RWSEM_XCHGADD_ALGORITHM=y CONFIG_GENERIC_CALIBRATE_DELAY=y -CONFIG_GENERIC_ISA_DMA=y +CONFIG_PPC=y CONFIG_EARLY_PRINTK=y CONFIG_COMPAT=y +CONFIG_SYSVIPC_COMPAT=y CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y CONFIG_ARCH_MAY_HAVE_PC_FDC=y -CONFIG_FORCE_MAX_ZONEORDER=13 + +# +# Processor support +# +CONFIG_POWER4_ONLY=y +CONFIG_POWER4=y +CONFIG_PPC_FPU=y +CONFIG_ALTIVEC=y +CONFIG_PPC_STD_MMU=y +CONFIG_SMP=y +CONFIG_NR_CPUS=2 # # Code maturity level options @@ -67,30 +81,60 @@ CONFIG_MODVERSIONS=y CONFIG_MODULE_SRCVERSION_ALL=y CONFIG_KMOD=y CONFIG_STOP_MACHINE=y -CONFIG_SYSVIPC_COMPAT=y # # Platform support # -# CONFIG_PPC_ISERIES is not set CONFIG_PPC_MULTIPLATFORM=y +# CONFIG_PPC_ISERIES is not set +# CONFIG_EMBEDDED6xx is not set +# CONFIG_APUS is not set # CONFIG_PPC_PSERIES is not set -# CONFIG_PPC_BPA is not set CONFIG_PPC_PMAC=y +CONFIG_PPC_PMAC64=y # CONFIG_PPC_MAPLE is not set -CONFIG_PPC=y -CONFIG_PPC64=y +# CONFIG_PPC_CELL is not set CONFIG_PPC_OF=y -CONFIG_MPIC=y -CONFIG_ALTIVEC=y -CONFIG_KEXEC=y CONFIG_U3_DART=y -CONFIG_PPC_PMAC64=y -CONFIG_BOOTX_TEXT=y -CONFIG_POWER4_ONLY=y +CONFIG_MPIC=y +# CONFIG_PPC_RTAS is not set +# CONFIG_MMIO_NVRAM is not set +# CONFIG_PPC_MPC106 is not set +CONFIG_GENERIC_TBSYNC=y +CONFIG_CPU_FREQ=y +CONFIG_CPU_FREQ_TABLE=y +# CONFIG_CPU_FREQ_DEBUG is not set +CONFIG_CPU_FREQ_STAT=y +# CONFIG_CPU_FREQ_STAT_DETAILS is not set +CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y +# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set +CONFIG_CPU_FREQ_GOV_PERFORMANCE=y +CONFIG_CPU_FREQ_GOV_POWERSAVE=y +CONFIG_CPU_FREQ_GOV_USERSPACE=y +# CONFIG_CPU_FREQ_GOV_ONDEMAND is not set +# CONFIG_CPU_FREQ_GOV_CONSERVATIVE is not set +CONFIG_CPU_FREQ_PMAC64=y +# CONFIG_WANT_EARLY_SERIAL is not set + +# +# Kernel options +# +# CONFIG_HZ_100 is not set +CONFIG_HZ_250=y +# CONFIG_HZ_1000 is not set +CONFIG_HZ=250 +CONFIG_PREEMPT_NONE=y +# CONFIG_PREEMPT_VOLUNTARY is not set +# CONFIG_PREEMPT is not set +# CONFIG_PREEMPT_BKL is not set +CONFIG_BINFMT_ELF=y +# CONFIG_BINFMT_MISC is not set +CONFIG_FORCE_MAX_ZONEORDER=13 CONFIG_IOMMU_VMERGE=y -CONFIG_SMP=y -CONFIG_NR_CPUS=2 +# CONFIG_HOTPLUG_CPU is not set +CONFIG_KEXEC=y +CONFIG_IRQ_ALL_CPUS=y +# CONFIG_NUMA is not set CONFIG_ARCH_SELECT_MEMORY_MODEL=y CONFIG_ARCH_FLATMEM_ENABLE=y CONFIG_SELECT_MEMORY_MODEL=y @@ -100,28 +144,21 @@ CONFIG_FLATMEM_MANUAL=y CONFIG_FLATMEM=y CONFIG_FLAT_NODE_MEM_MAP=y # CONFIG_SPARSEMEM_STATIC is not set -# CONFIG_NUMA is not set +CONFIG_SPLIT_PTLOCK_CPUS=4 +# CONFIG_PPC_64K_PAGES is not set # CONFIG_SCHED_SMT is not set -CONFIG_PREEMPT_NONE=y -# CONFIG_PREEMPT_VOLUNTARY is not set -# CONFIG_PREEMPT is not set -# CONFIG_PREEMPT_BKL is not set -# CONFIG_HZ_100 is not set -CONFIG_HZ_250=y -# CONFIG_HZ_1000 is not set -CONFIG_HZ=250 -CONFIG_GENERIC_HARDIRQS=y -CONFIG_SECCOMP=y -CONFIG_BINFMT_ELF=y -# CONFIG_BINFMT_MISC is not set -# CONFIG_HOTPLUG_CPU is not set CONFIG_PROC_DEVICETREE=y # CONFIG_CMDLINE_BOOL is not set +# CONFIG_PM is not set +CONFIG_SECCOMP=y CONFIG_ISA_DMA_API=y # -# Bus Options +# Bus options # +CONFIG_GENERIC_ISA_DMA=y +# CONFIG_PPC_I8259 is not set +# CONFIG_PPC_INDIRECT_PCI is not set CONFIG_PCI=y CONFIG_PCI_DOMAINS=y CONFIG_PCI_LEGACY_PROC=y @@ -136,6 +173,7 @@ CONFIG_PCI_LEGACY_PROC=y # PCI Hotplug Support # # CONFIG_HOTPLUG_PCI is not set +CONFIG_KERNEL_START=0xc000000000000000 # # Networking @@ -276,6 +314,10 @@ CONFIG_LLC=y # CONFIG_NET_DIVERT is not set # CONFIG_ECONET is not set # CONFIG_WAN_ROUTER is not set + +# +# QoS and/or fair queueing +# # CONFIG_NET_SCHED is not set CONFIG_NET_CLS_ROUTE=y @@ -348,6 +390,11 @@ CONFIG_IOSCHED_NOOP=y CONFIG_IOSCHED_AS=y CONFIG_IOSCHED_DEADLINE=y CONFIG_IOSCHED_CFQ=y +CONFIG_DEFAULT_AS=y +# CONFIG_DEFAULT_DEADLINE is not set +# CONFIG_DEFAULT_CFQ is not set +# CONFIG_DEFAULT_NOOP is not set +CONFIG_DEFAULT_IOSCHED="anticipatory" # CONFIG_ATA_OVER_ETH is not set # @@ -449,6 +496,7 @@ CONFIG_SCSI_SPI_ATTRS=y # # SCSI low-level drivers # +# CONFIG_ISCSI_TCP is not set # CONFIG_BLK_DEV_3W_XXXX_RAID is not set # CONFIG_SCSI_3W_9XXX is not set # CONFIG_SCSI_ACARD is not set @@ -465,10 +513,12 @@ CONFIG_SCSI_SATA_SVW=y # CONFIG_SCSI_ATA_PIIX is not set # CONFIG_SCSI_SATA_MV is not set # CONFIG_SCSI_SATA_NV is not set -# CONFIG_SCSI_SATA_PROMISE is not set +# CONFIG_SCSI_PDC_ADMA is not set # CONFIG_SCSI_SATA_QSTOR is not set +# CONFIG_SCSI_SATA_PROMISE is not set # CONFIG_SCSI_SATA_SX4 is not set # CONFIG_SCSI_SATA_SIL is not set +# CONFIG_SCSI_SATA_SIL24 is not set # CONFIG_SCSI_SATA_SIS is not set # CONFIG_SCSI_SATA_ULI is not set # CONFIG_SCSI_SATA_VIA is not set @@ -567,6 +617,9 @@ CONFIG_IEEE1394_RAWIO=y CONFIG_ADB_PMU=y CONFIG_PMAC_SMU=y CONFIG_THERM_PM72=y +CONFIG_WINDFARM=y +CONFIG_WINDFARM_PM81=y +CONFIG_WINDFARM_PM91=y # # Network device support @@ -603,6 +656,7 @@ CONFIG_SUNGEM=y # CONFIG_NET_TULIP is not set # CONFIG_HP100 is not set # CONFIG_NET_PCI is not set +# CONFIG_FEC_8XX is not set # # Ethernet (1000 Mbit) @@ -768,6 +822,7 @@ CONFIG_MAX_RAW_DEVS=256 # TPM devices # # CONFIG_TCG_TPM is not set +# CONFIG_TELCLOCK is not set # # I2C support @@ -820,6 +875,7 @@ CONFIG_I2C_PMAC_SMU=y # CONFIG_SENSORS_PCF8591 is not set # CONFIG_SENSORS_RTC8564 is not set # CONFIG_SENSORS_MAX6875 is not set +# CONFIG_RTC_X1205_I2C is not set # CONFIG_I2C_DEBUG_CORE is not set # CONFIG_I2C_DEBUG_ALGO is not set # CONFIG_I2C_DEBUG_BUS is not set @@ -876,10 +932,9 @@ CONFIG_FB_OF=y # CONFIG_FB_ASILIANT is not set # CONFIG_FB_IMSTT is not set # CONFIG_FB_VGA16 is not set -# CONFIG_FB_NVIDIA is not set -CONFIG_FB_RIVA=y -# CONFIG_FB_RIVA_I2C is not set -# CONFIG_FB_RIVA_DEBUG is not set +CONFIG_FB_NVIDIA=y +CONFIG_FB_NVIDIA_I2C=y +# CONFIG_FB_RIVA is not set # CONFIG_FB_MATROX is not set # CONFIG_FB_RADEON_OLD is not set CONFIG_FB_RADEON=y @@ -924,7 +979,96 @@ CONFIG_LCD_DEVICE=y # # Sound # -# CONFIG_SOUND is not set +CONFIG_SOUND=m + +# +# Advanced Linux Sound Architecture +# +CONFIG_SND=m +CONFIG_SND_TIMER=m +CONFIG_SND_PCM=m +CONFIG_SND_HWDEP=m +CONFIG_SND_RAWMIDI=m +CONFIG_SND_SEQUENCER=m +# CONFIG_SND_SEQ_DUMMY is not set +CONFIG_SND_OSSEMUL=y +CONFIG_SND_MIXER_OSS=m +CONFIG_SND_PCM_OSS=m +CONFIG_SND_SEQUENCER_OSS=y +# CONFIG_SND_VERBOSE_PRINTK is not set +# CONFIG_SND_DEBUG is not set +CONFIG_SND_GENERIC_DRIVER=y + +# +# Generic devices +# +# CONFIG_SND_DUMMY is not set +# CONFIG_SND_VIRMIDI is not set +# CONFIG_SND_MTPAV is not set +# CONFIG_SND_SERIAL_U16550 is not set +# CONFIG_SND_MPU401 is not set + +# +# PCI devices +# +# CONFIG_SND_ALI5451 is not set +# CONFIG_SND_ATIIXP is not set +# CONFIG_SND_ATIIXP_MODEM is not set +# CONFIG_SND_AU8810 is not set +# CONFIG_SND_AU8820 is not set +# CONFIG_SND_AU8830 is not set +# CONFIG_SND_AZT3328 is not set +# CONFIG_SND_BT87X is not set +# CONFIG_SND_CS46XX is not set +# CONFIG_SND_CS4281 is not set +# CONFIG_SND_EMU10K1 is not set +# CONFIG_SND_EMU10K1X is not set +# CONFIG_SND_CA0106 is not set +# CONFIG_SND_KORG1212 is not set +# CONFIG_SND_MIXART is not set +# CONFIG_SND_NM256 is not set +# CONFIG_SND_RME32 is not set +# CONFIG_SND_RME96 is not set +# CONFIG_SND_RME9652 is not set +# CONFIG_SND_HDSP is not set +# CONFIG_SND_HDSPM is not set +# CONFIG_SND_TRIDENT is not set +# CONFIG_SND_YMFPCI is not set +# CONFIG_SND_AD1889 is not set +# CONFIG_SND_ALS4000 is not set +# CONFIG_SND_CMIPCI is not set +# CONFIG_SND_ENS1370 is not set +# CONFIG_SND_ENS1371 is not set +# CONFIG_SND_ES1938 is not set +# CONFIG_SND_ES1968 is not set +# CONFIG_SND_MAESTRO3 is not set +# CONFIG_SND_FM801 is not set +# CONFIG_SND_ICE1712 is not set +# CONFIG_SND_ICE1724 is not set +# CONFIG_SND_INTEL8X0 is not set +# CONFIG_SND_INTEL8X0M is not set +# CONFIG_SND_SONICVIBES is not set +# CONFIG_SND_VIA82XX is not set +# CONFIG_SND_VIA82XX_MODEM is not set +# CONFIG_SND_VX222 is not set +# CONFIG_SND_HDA_INTEL is not set + +# +# ALSA PowerMac devices +# +CONFIG_SND_POWERMAC=m +CONFIG_SND_POWERMAC_AUTO_DRC=y + +# +# USB devices +# +CONFIG_SND_USB_AUDIO=m +# CONFIG_SND_USB_USX2Y is not set + +# +# Open Sound System +# +# CONFIG_SOUND_PRIME is not set # # USB support @@ -958,12 +1102,16 @@ CONFIG_USB_OHCI_LITTLE_ENDIAN=y # # USB Device Class drivers # -# CONFIG_USB_BLUETOOTH_TTY is not set +# CONFIG_OBSOLETE_OSS_USB_DRIVER is not set CONFIG_USB_ACM=m CONFIG_USB_PRINTER=y # -# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' may also be needed; see USB_STORAGE Help for more information +# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' +# + +# +# may also be needed; see USB_STORAGE Help for more information # CONFIG_USB_STORAGE=y # CONFIG_USB_STORAGE_DEBUG is not set @@ -1074,6 +1222,7 @@ CONFIG_USB_SERIAL_KEYSPAN_USA49WLC=y CONFIG_USB_SERIAL_KLSI=m CONFIG_USB_SERIAL_KOBIL_SCT=m CONFIG_USB_SERIAL_MCT_U232=m +# CONFIG_USB_SERIAL_NOKIA_DKU2 is not set CONFIG_USB_SERIAL_PL2303=m # CONFIG_USB_SERIAL_HP4X is not set CONFIG_USB_SERIAL_SAFE=m @@ -1311,6 +1460,20 @@ CONFIG_NLS_ISO8859_15=y CONFIG_NLS_UTF8=y # +# Library routines +# +CONFIG_CRC_CCITT=m +# CONFIG_CRC16 is not set +CONFIG_CRC32=y +CONFIG_LIBCRC32C=m +CONFIG_ZLIB_INFLATE=y +CONFIG_ZLIB_DEFLATE=m +CONFIG_TEXTSEARCH=y +CONFIG_TEXTSEARCH_KMP=m +CONFIG_TEXTSEARCH_BM=m +CONFIG_TEXTSEARCH_FSM=m + +# # Profiling support # CONFIG_PROFILING=y @@ -1331,12 +1494,14 @@ CONFIG_DETECT_SOFTLOCKUP=y # CONFIG_DEBUG_KOBJECT is not set # CONFIG_DEBUG_INFO is not set CONFIG_DEBUG_FS=y +# CONFIG_DEBUG_VM is not set +# CONFIG_RCU_TORTURE_TEST is not set # CONFIG_DEBUG_STACKOVERFLOW is not set # CONFIG_KPROBES is not set # CONFIG_DEBUG_STACK_USAGE is not set # CONFIG_DEBUGGER is not set -# CONFIG_PPCDBG is not set CONFIG_IRQSTACKS=y +CONFIG_BOOTX_TEXT=y # # Security options @@ -1376,17 +1541,3 @@ CONFIG_CRYPTO_TEST=m # # Hardware crypto devices # - -# -# Library routines -# -CONFIG_CRC_CCITT=m -# CONFIG_CRC16 is not set -CONFIG_CRC32=y -CONFIG_LIBCRC32C=m -CONFIG_ZLIB_INFLATE=y -CONFIG_ZLIB_DEFLATE=m -CONFIG_TEXTSEARCH=y -CONFIG_TEXTSEARCH_KMP=m -CONFIG_TEXTSEARCH_BM=m -CONFIG_TEXTSEARCH_FSM=m diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig index 9f09dff9e11..913962c1dae 100644 --- a/arch/powerpc/configs/pseries_defconfig +++ b/arch/powerpc/configs/pseries_defconfig @@ -1,18 +1,33 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.14-rc4 -# Thu Oct 20 08:32:17 2005 +# Linux kernel version: 2.6.15-rc1 +# Mon Nov 14 15:27:00 2005 # +CONFIG_PPC64=y CONFIG_64BIT=y +CONFIG_PPC_MERGE=y CONFIG_MMU=y +CONFIG_GENERIC_HARDIRQS=y CONFIG_RWSEM_XCHGADD_ALGORITHM=y CONFIG_GENERIC_CALIBRATE_DELAY=y -CONFIG_GENERIC_ISA_DMA=y +CONFIG_PPC=y CONFIG_EARLY_PRINTK=y CONFIG_COMPAT=y +CONFIG_SYSVIPC_COMPAT=y CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y CONFIG_ARCH_MAY_HAVE_PC_FDC=y -CONFIG_FORCE_MAX_ZONEORDER=13 + +# +# Processor support +# +# CONFIG_POWER4_ONLY is not set +CONFIG_POWER3=y +CONFIG_POWER4=y +CONFIG_PPC_FPU=y +CONFIG_ALTIVEC=y +CONFIG_PPC_STD_MMU=y +CONFIG_SMP=y +CONFIG_NR_CPUS=128 # # Code maturity level options @@ -68,75 +83,103 @@ CONFIG_MODVERSIONS=y CONFIG_MODULE_SRCVERSION_ALL=y CONFIG_KMOD=y CONFIG_STOP_MACHINE=y -CONFIG_SYSVIPC_COMPAT=y + +# +# Block layer +# + +# +# IO Schedulers +# +CONFIG_IOSCHED_NOOP=y +CONFIG_IOSCHED_AS=y +CONFIG_IOSCHED_DEADLINE=y +CONFIG_IOSCHED_CFQ=y +CONFIG_DEFAULT_AS=y +# CONFIG_DEFAULT_DEADLINE is not set +# CONFIG_DEFAULT_CFQ is not set +# CONFIG_DEFAULT_NOOP is not set +CONFIG_DEFAULT_IOSCHED="anticipatory" # # Platform support # -# CONFIG_PPC_ISERIES is not set CONFIG_PPC_MULTIPLATFORM=y +# CONFIG_PPC_ISERIES is not set +# CONFIG_EMBEDDED6xx is not set +# CONFIG_APUS is not set CONFIG_PPC_PSERIES=y -# CONFIG_PPC_BPA is not set # CONFIG_PPC_PMAC is not set # CONFIG_PPC_MAPLE is not set -CONFIG_PPC=y -CONFIG_PPC64=y +# CONFIG_PPC_CELL is not set CONFIG_PPC_OF=y CONFIG_XICS=y +# CONFIG_U3_DART is not set CONFIG_MPIC=y -CONFIG_ALTIVEC=y -CONFIG_PPC_SPLPAR=y -CONFIG_KEXEC=y +CONFIG_PPC_RTAS=y +CONFIG_RTAS_ERROR_LOGGING=y +CONFIG_RTAS_PROC=y +CONFIG_RTAS_FLASH=m +# CONFIG_MMIO_NVRAM is not set CONFIG_IBMVIO=y -# CONFIG_U3_DART is not set -# CONFIG_BOOTX_TEXT is not set -# CONFIG_POWER4_ONLY is not set +# CONFIG_PPC_MPC106 is not set +# CONFIG_GENERIC_TBSYNC is not set +# CONFIG_CPU_FREQ is not set +# CONFIG_WANT_EARLY_SERIAL is not set + +# +# Kernel options +# +# CONFIG_HZ_100 is not set +CONFIG_HZ_250=y +# CONFIG_HZ_1000 is not set +CONFIG_HZ=250 +CONFIG_PREEMPT_NONE=y +# CONFIG_PREEMPT_VOLUNTARY is not set +# CONFIG_PREEMPT is not set +# CONFIG_PREEMPT_BKL is not set +CONFIG_BINFMT_ELF=y +# CONFIG_BINFMT_MISC is not set +CONFIG_FORCE_MAX_ZONEORDER=13 CONFIG_IOMMU_VMERGE=y -CONFIG_SMP=y -CONFIG_NR_CPUS=128 +CONFIG_HOTPLUG_CPU=y +CONFIG_KEXEC=y +# CONFIG_IRQ_ALL_CPUS is not set +CONFIG_PPC_SPLPAR=y +CONFIG_EEH=y +CONFIG_SCANLOG=m +CONFIG_LPARCFG=y +CONFIG_NUMA=y CONFIG_ARCH_SELECT_MEMORY_MODEL=y -CONFIG_ARCH_FLATMEM_ENABLE=y -CONFIG_ARCH_DISCONTIGMEM_ENABLE=y -CONFIG_ARCH_DISCONTIGMEM_DEFAULT=y CONFIG_ARCH_SPARSEMEM_ENABLE=y +CONFIG_ARCH_SPARSEMEM_DEFAULT=y CONFIG_SELECT_MEMORY_MODEL=y # CONFIG_FLATMEM_MANUAL is not set -CONFIG_DISCONTIGMEM_MANUAL=y -# CONFIG_SPARSEMEM_MANUAL is not set -CONFIG_DISCONTIGMEM=y -CONFIG_FLAT_NODE_MEM_MAP=y +# CONFIG_DISCONTIGMEM_MANUAL is not set +CONFIG_SPARSEMEM_MANUAL=y +CONFIG_SPARSEMEM=y CONFIG_NEED_MULTIPLE_NODES=y +CONFIG_HAVE_MEMORY_PRESENT=y # CONFIG_SPARSEMEM_STATIC is not set +CONFIG_SPARSEMEM_EXTREME=y +# CONFIG_MEMORY_HOTPLUG is not set +CONFIG_SPLIT_PTLOCK_CPUS=4096 CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y CONFIG_NODES_SPAN_OTHER_NODES=y -CONFIG_NUMA=y +# CONFIG_PPC_64K_PAGES is not set CONFIG_SCHED_SMT=y -CONFIG_PREEMPT_NONE=y -# CONFIG_PREEMPT_VOLUNTARY is not set -# CONFIG_PREEMPT is not set -# CONFIG_PREEMPT_BKL is not set -# CONFIG_HZ_100 is not set -CONFIG_HZ_250=y -# CONFIG_HZ_1000 is not set -CONFIG_HZ=250 -CONFIG_EEH=y -CONFIG_GENERIC_HARDIRQS=y -CONFIG_PPC_RTAS=y -CONFIG_RTAS_PROC=y -CONFIG_RTAS_FLASH=m -CONFIG_SCANLOG=m -CONFIG_LPARCFG=y -CONFIG_SECCOMP=y -CONFIG_BINFMT_ELF=y -# CONFIG_BINFMT_MISC is not set -CONFIG_HOTPLUG_CPU=y CONFIG_PROC_DEVICETREE=y # CONFIG_CMDLINE_BOOL is not set +# CONFIG_PM is not set +CONFIG_SECCOMP=y CONFIG_ISA_DMA_API=y # -# Bus Options +# Bus options # +CONFIG_GENERIC_ISA_DMA=y +CONFIG_PPC_I8259=y +# CONFIG_PPC_INDIRECT_PCI is not set CONFIG_PCI=y CONFIG_PCI_DOMAINS=y CONFIG_PCI_LEGACY_PROC=y @@ -156,6 +199,7 @@ CONFIG_HOTPLUG_PCI=m # CONFIG_HOTPLUG_PCI_SHPC is not set CONFIG_HOTPLUG_PCI_RPA=m CONFIG_HOTPLUG_PCI_RPA_DLPAR=m +CONFIG_KERNEL_START=0xc000000000000000 # # Networking @@ -197,6 +241,10 @@ CONFIG_TCP_CONG_BIC=y # CONFIG_IPV6 is not set CONFIG_NETFILTER=y # CONFIG_NETFILTER_DEBUG is not set + +# +# Core Netfilter Configuration +# CONFIG_NETFILTER_NETLINK=y CONFIG_NETFILTER_NETLINK_QUEUE=m CONFIG_NETFILTER_NETLINK_LOG=m @@ -299,6 +347,10 @@ CONFIG_LLC=y # CONFIG_NET_DIVERT is not set # CONFIG_ECONET is not set # CONFIG_WAN_ROUTER is not set + +# +# QoS and/or fair queueing +# # CONFIG_NET_SCHED is not set CONFIG_NET_CLS_ROUTE=y @@ -368,14 +420,6 @@ CONFIG_BLK_DEV_RAM_COUNT=16 CONFIG_BLK_DEV_RAM_SIZE=65536 CONFIG_BLK_DEV_INITRD=y # CONFIG_CDROM_PKTCDVD is not set - -# -# IO Schedulers -# -CONFIG_IOSCHED_NOOP=y -CONFIG_IOSCHED_AS=y -CONFIG_IOSCHED_DEADLINE=y -CONFIG_IOSCHED_CFQ=y # CONFIG_ATA_OVER_ETH is not set # @@ -473,6 +517,7 @@ CONFIG_SCSI_ISCSI_ATTRS=m # # SCSI low-level drivers # +# CONFIG_ISCSI_TCP is not set # CONFIG_BLK_DEV_3W_XXXX_RAID is not set # CONFIG_SCSI_3W_9XXX is not set # CONFIG_SCSI_ACARD is not set @@ -559,6 +604,7 @@ CONFIG_DM_MULTIPATH_EMC=m # # Macintosh device drivers # +# CONFIG_WINDFARM is not set # # Network device support @@ -645,7 +691,6 @@ CONFIG_IXGB=m # CONFIG_IXGB_NAPI is not set CONFIG_S2IO=m # CONFIG_S2IO_NAPI is not set -# CONFIG_2BUFF_MODE is not set # # Token Ring devices @@ -674,6 +719,7 @@ CONFIG_PPP_ASYNC=m CONFIG_PPP_SYNC_TTY=m CONFIG_PPP_DEFLATE=m CONFIG_PPP_BSDCOMP=m +# CONFIG_PPP_MPPE is not set CONFIG_PPPOE=m # CONFIG_SLIP is not set # CONFIG_NET_FC is not set @@ -784,6 +830,8 @@ CONFIG_HVCS=m # # CONFIG_WATCHDOG is not set # CONFIG_RTC is not set +CONFIG_GEN_RTC=y +# CONFIG_GEN_RTC_X is not set # CONFIG_DTLK is not set # CONFIG_R3964 is not set # CONFIG_APPLICOM is not set @@ -801,6 +849,7 @@ CONFIG_MAX_RAW_DEVS=1024 # TPM devices # # CONFIG_TCG_TPM is not set +# CONFIG_TELCLOCK is not set # # I2C support @@ -852,6 +901,7 @@ CONFIG_I2C_ALGOBIT=y # CONFIG_SENSORS_PCF8591 is not set # CONFIG_SENSORS_RTC8564 is not set # CONFIG_SENSORS_MAX6875 is not set +# CONFIG_RTC_X1205_I2C is not set # CONFIG_I2C_DEBUG_CORE is not set # CONFIG_I2C_DEBUG_ALGO is not set # CONFIG_I2C_DEBUG_BUS is not set @@ -893,7 +943,6 @@ CONFIG_FB=y CONFIG_FB_CFB_FILLRECT=y CONFIG_FB_CFB_COPYAREA=y CONFIG_FB_CFB_IMAGEBLIT=y -CONFIG_FB_SOFT_CURSOR=y CONFIG_FB_MACMODES=y CONFIG_FB_MODE_HELPERS=y CONFIG_FB_TILEBLITTING=y @@ -905,6 +954,7 @@ CONFIG_FB_OF=y # CONFIG_FB_ASILIANT is not set # CONFIG_FB_IMSTT is not set # CONFIG_FB_VGA16 is not set +# CONFIG_FB_S1D13XXX is not set # CONFIG_FB_NVIDIA is not set # CONFIG_FB_RIVA is not set CONFIG_FB_MATROX=y @@ -927,7 +977,6 @@ CONFIG_FB_RADEON_I2C=y # CONFIG_FB_VOODOO1 is not set # CONFIG_FB_CYBLA is not set # CONFIG_FB_TRIDENT is not set -# CONFIG_FB_S1D13XXX is not set # CONFIG_FB_VIRTUAL is not set # @@ -936,6 +985,7 @@ CONFIG_FB_RADEON_I2C=y # CONFIG_VGA_CONSOLE is not set CONFIG_DUMMY_CONSOLE=y CONFIG_FRAMEBUFFER_CONSOLE=y +# CONFIG_FRAMEBUFFER_CONSOLE_ROTATION is not set # CONFIG_FONTS is not set CONFIG_FONT_8x8=y CONFIG_FONT_8x16=y @@ -990,12 +1040,15 @@ CONFIG_USB_OHCI_LITTLE_ENDIAN=y # # USB Device Class drivers # -# CONFIG_USB_BLUETOOTH_TTY is not set # CONFIG_USB_ACM is not set # CONFIG_USB_PRINTER is not set # -# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' may also be needed; see USB_STORAGE Help for more information +# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' +# + +# +# may also be needed; see USB_STORAGE Help for more information # CONFIG_USB_STORAGE=y # CONFIG_USB_STORAGE_DEBUG is not set @@ -1106,6 +1159,7 @@ CONFIG_INFINIBAND_MTHCA=m # CONFIG_INFINIBAND_MTHCA_DEBUG is not set CONFIG_INFINIBAND_IPOIB=m # CONFIG_INFINIBAND_IPOIB_DEBUG is not set +# CONFIG_INFINIBAND_SRP is not set # # SN Devices @@ -1288,10 +1342,25 @@ CONFIG_NLS_ISO8859_1=y # CONFIG_NLS_UTF8 is not set # -# Profiling support +# Library routines +# +CONFIG_CRC_CCITT=m +# CONFIG_CRC16 is not set +CONFIG_CRC32=y +CONFIG_LIBCRC32C=m +CONFIG_ZLIB_INFLATE=y +CONFIG_ZLIB_DEFLATE=m +CONFIG_TEXTSEARCH=y +CONFIG_TEXTSEARCH_KMP=m +CONFIG_TEXTSEARCH_BM=m +CONFIG_TEXTSEARCH_FSM=m + +# +# Instrumentation Support # CONFIG_PROFILING=y CONFIG_OPROFILE=y +# CONFIG_KPROBES is not set # # Kernel hacking @@ -1308,14 +1377,15 @@ CONFIG_DETECT_SOFTLOCKUP=y # CONFIG_DEBUG_KOBJECT is not set # CONFIG_DEBUG_INFO is not set CONFIG_DEBUG_FS=y +# CONFIG_DEBUG_VM is not set +# CONFIG_RCU_TORTURE_TEST is not set CONFIG_DEBUG_STACKOVERFLOW=y -# CONFIG_KPROBES is not set CONFIG_DEBUG_STACK_USAGE=y CONFIG_DEBUGGER=y CONFIG_XMON=y CONFIG_XMON_DEFAULT=y -# CONFIG_PPCDBG is not set CONFIG_IRQSTACKS=y +# CONFIG_BOOTX_TEXT is not set # # Security options @@ -1355,17 +1425,3 @@ CONFIG_CRYPTO_TEST=m # # Hardware crypto devices # - -# -# Library routines -# -CONFIG_CRC_CCITT=m -# CONFIG_CRC16 is not set -CONFIG_CRC32=y -CONFIG_LIBCRC32C=m -CONFIG_ZLIB_INFLATE=y -CONFIG_ZLIB_DEFLATE=m -CONFIG_TEXTSEARCH=y -CONFIG_TEXTSEARCH_KMP=m -CONFIG_TEXTSEARCH_BM=m -CONFIG_TEXTSEARCH_FSM=m diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index b3ae2993efb..4970e3721a8 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -4,6 +4,7 @@ ifeq ($(CONFIG_PPC64),y) EXTRA_CFLAGS += -mno-minimal-toc +CFLAGS_ioctl32.o += -Ifs/ endif ifeq ($(CONFIG_PPC32),y) CFLAGS_prom_init.o += -fPIC @@ -11,17 +12,29 @@ CFLAGS_btext.o += -fPIC endif obj-y := semaphore.o cputable.o ptrace.o syscalls.o \ - signal_32.o pmc.o + irq.o signal_32.o pmc.o vdso.o +obj-y += vdso32/ obj-$(CONFIG_PPC64) += setup_64.o binfmt_elf32.o sys_ppc32.o \ - signal_64.o ptrace32.o systbl.o + signal_64.o ptrace32.o systbl.o \ + paca.o ioctl32.o cpu_setup_power4.o \ + firmware.o sysfs.o udbg.o +obj-$(CONFIG_PPC64) += vdso64/ obj-$(CONFIG_ALTIVEC) += vecemu.o vector.o obj-$(CONFIG_POWER4) += idle_power4.o obj-$(CONFIG_PPC_OF) += of_device.o -obj-$(CONFIG_PPC_RTAS) += rtas.o +procfs-$(CONFIG_PPC64) := proc_ppc64.o +obj-$(CONFIG_PROC_FS) += $(procfs-y) +rtaspci-$(CONFIG_PPC64) := rtas_pci.o +obj-$(CONFIG_PPC_RTAS) += rtas.o rtas-rtc.o $(rtaspci-y) obj-$(CONFIG_RTAS_FLASH) += rtas_flash.o obj-$(CONFIG_RTAS_PROC) += rtas-proc.o +obj-$(CONFIG_LPARCFG) += lparcfg.o obj-$(CONFIG_IBMVIO) += vio.o obj-$(CONFIG_GENERIC_TBSYNC) += smp-tbsync.o +obj-$(CONFIG_PPC_PSERIES) += udbg_16550.o +obj-$(CONFIG_PPC_MAPLE) += udbg_16550.o +udbgscc-$(CONFIG_PPC64) := udbg_scc.o +obj-$(CONFIG_PPC_PMAC) += $(udbgscc-y) ifeq ($(CONFIG_PPC_MERGE),y) @@ -36,12 +49,23 @@ extra-y += vmlinux.lds obj-y += process.o init_task.o time.o \ prom.o traps.o setup-common.o obj-$(CONFIG_PPC32) += entry_32.o setup_32.o misc_32.o systbl.o -obj-$(CONFIG_PPC64) += misc_64.o +obj-$(CONFIG_PPC64) += misc_64.o dma_64.o iommu.o obj-$(CONFIG_PPC_OF) += prom_init.o obj-$(CONFIG_MODULES) += ppc_ksyms.o obj-$(CONFIG_BOOTX_TEXT) += btext.o obj-$(CONFIG_6xx) += idle_6xx.o obj-$(CONFIG_SMP) += smp.o +obj-$(CONFIG_KPROBES) += kprobes.o + +module-$(CONFIG_PPC64) += module_64.o +obj-$(CONFIG_MODULES) += $(module-y) + +pci64-$(CONFIG_PPC64) += pci_64.o pci_dn.o pci_iommu.o \ + pci_direct_iommu.o iomap.o +obj-$(CONFIG_PCI) += $(pci64-y) + +kexec64-$(CONFIG_PPC64) += machine_kexec_64.o +obj-$(CONFIG_KEXEC) += $(kexec64-y) ifeq ($(CONFIG_PPC_ISERIES),y) $(obj)/head_64.o: $(obj)/lparmap.s @@ -49,11 +73,8 @@ AFLAGS_head_64.o += -I$(obj) endif else -# stuff used from here for ARCH=ppc or ARCH=ppc64 +# stuff used from here for ARCH=ppc smpobj-$(CONFIG_SMP) += smp.o -obj-$(CONFIG_PPC64) += traps.o process.o init_task.o time.o \ - setup-common.o $(smpobj-y) - endif diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index bc5a3689cc0..91538d2445b 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -37,12 +37,12 @@ #include <asm/cputable.h> #include <asm/thread_info.h> #include <asm/rtas.h> +#include <asm/vdso_datapage.h> #ifdef CONFIG_PPC64 #include <asm/paca.h> #include <asm/lppaca.h> #include <asm/iseries/hv_lp_event.h> #include <asm/cache.h> -#include <asm/systemcfg.h> #include <asm/compat.h> #endif @@ -106,7 +106,6 @@ int main(void) DEFINE(ICACHEL1LINESIZE, offsetof(struct ppc64_caches, iline_size)); DEFINE(ICACHEL1LOGLINESIZE, offsetof(struct ppc64_caches, log_iline_size)); DEFINE(ICACHEL1LINESPERPAGE, offsetof(struct ppc64_caches, ilines_per_page)); - DEFINE(PLATFORM, offsetof(struct systemcfg, platform)); DEFINE(PLATFORM_LPAR, PLATFORM_LPAR); /* paca */ @@ -125,6 +124,9 @@ int main(void) DEFINE(PACASLBCACHE, offsetof(struct paca_struct, slb_cache)); DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr)); DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id)); +#ifdef CONFIG_PPC_64K_PAGES + DEFINE(PACAPGDIR, offsetof(struct paca_struct, pgdir)); +#endif #ifdef CONFIG_HUGETLB_PAGE DEFINE(PACALOWHTLBAREAS, offsetof(struct paca_struct, context.low_htlb_areas)); DEFINE(PACAHIGHHTLBAREAS, offsetof(struct paca_struct, context.high_htlb_areas)); @@ -249,25 +251,44 @@ int main(void) DEFINE(TASK_SIZE, TASK_SIZE); DEFINE(NUM_USER_SEGMENTS, TASK_SIZE>>28); -#else /* CONFIG_PPC64 */ - /* systemcfg offsets for use by vdso */ - DEFINE(CFG_TB_ORIG_STAMP, offsetof(struct systemcfg, tb_orig_stamp)); - DEFINE(CFG_TB_TICKS_PER_SEC, offsetof(struct systemcfg, tb_ticks_per_sec)); - DEFINE(CFG_TB_TO_XS, offsetof(struct systemcfg, tb_to_xs)); - DEFINE(CFG_STAMP_XSEC, offsetof(struct systemcfg, stamp_xsec)); - DEFINE(CFG_TB_UPDATE_COUNT, offsetof(struct systemcfg, tb_update_count)); - DEFINE(CFG_TZ_MINUTEWEST, offsetof(struct systemcfg, tz_minuteswest)); - DEFINE(CFG_TZ_DSTTIME, offsetof(struct systemcfg, tz_dsttime)); - DEFINE(CFG_SYSCALL_MAP32, offsetof(struct systemcfg, syscall_map_32)); - DEFINE(CFG_SYSCALL_MAP64, offsetof(struct systemcfg, syscall_map_64)); +#endif /* ! CONFIG_PPC64 */ - /* timeval/timezone offsets for use by vdso */ + /* datapage offsets for use by vdso */ + DEFINE(CFG_TB_ORIG_STAMP, offsetof(struct vdso_data, tb_orig_stamp)); + DEFINE(CFG_TB_TICKS_PER_SEC, offsetof(struct vdso_data, tb_ticks_per_sec)); + DEFINE(CFG_TB_TO_XS, offsetof(struct vdso_data, tb_to_xs)); + DEFINE(CFG_STAMP_XSEC, offsetof(struct vdso_data, stamp_xsec)); + DEFINE(CFG_TB_UPDATE_COUNT, offsetof(struct vdso_data, tb_update_count)); + DEFINE(CFG_TZ_MINUTEWEST, offsetof(struct vdso_data, tz_minuteswest)); + DEFINE(CFG_TZ_DSTTIME, offsetof(struct vdso_data, tz_dsttime)); + DEFINE(CFG_SYSCALL_MAP32, offsetof(struct vdso_data, syscall_map_32)); + DEFINE(WTOM_CLOCK_SEC, offsetof(struct vdso_data, wtom_clock_sec)); + DEFINE(WTOM_CLOCK_NSEC, offsetof(struct vdso_data, wtom_clock_nsec)); +#ifdef CONFIG_PPC64 + DEFINE(CFG_SYSCALL_MAP64, offsetof(struct vdso_data, syscall_map_64)); DEFINE(TVAL64_TV_SEC, offsetof(struct timeval, tv_sec)); DEFINE(TVAL64_TV_USEC, offsetof(struct timeval, tv_usec)); DEFINE(TVAL32_TV_SEC, offsetof(struct compat_timeval, tv_sec)); DEFINE(TVAL32_TV_USEC, offsetof(struct compat_timeval, tv_usec)); + DEFINE(TSPC64_TV_SEC, offsetof(struct timespec, tv_sec)); + DEFINE(TSPC64_TV_NSEC, offsetof(struct timespec, tv_nsec)); + DEFINE(TSPC32_TV_SEC, offsetof(struct compat_timespec, tv_sec)); + DEFINE(TSPC32_TV_NSEC, offsetof(struct compat_timespec, tv_nsec)); +#else + DEFINE(TVAL32_TV_SEC, offsetof(struct timeval, tv_sec)); + DEFINE(TVAL32_TV_USEC, offsetof(struct timeval, tv_usec)); + DEFINE(TSPC32_TV_SEC, offsetof(struct timespec, tv_sec)); + DEFINE(TSPC32_TV_NSEC, offsetof(struct timespec, tv_nsec)); +#endif + /* timeval/timezone offsets for use by vdso */ DEFINE(TZONE_TZ_MINWEST, offsetof(struct timezone, tz_minuteswest)); DEFINE(TZONE_TZ_DSTTIME, offsetof(struct timezone, tz_dsttime)); -#endif /* CONFIG_PPC64 */ + + /* Other bits used by the vdso */ + DEFINE(CLOCK_REALTIME, CLOCK_REALTIME); + DEFINE(CLOCK_MONOTONIC, CLOCK_MONOTONIC); + DEFINE(NSEC_PER_SEC, NSEC_PER_SEC); + DEFINE(CLOCK_REALTIME_RES, TICK_NSEC); + return 0; } diff --git a/arch/powerpc/kernel/cpu_setup_power4.S b/arch/powerpc/kernel/cpu_setup_power4.S new file mode 100644 index 00000000000..cca942fe611 --- /dev/null +++ b/arch/powerpc/kernel/cpu_setup_power4.S @@ -0,0 +1,233 @@ +/* + * This file contains low level CPU setup functions. + * Copyright (C) 2003 Benjamin Herrenschmidt (benh@kernel.crashing.org) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/config.h> +#include <asm/processor.h> +#include <asm/page.h> +#include <asm/cputable.h> +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> +#include <asm/cache.h> + +_GLOBAL(__970_cpu_preinit) + /* + * Do nothing if not running in HV mode + */ + mfmsr r0 + rldicl. r0,r0,4,63 + beqlr + + /* + * Deal only with PPC970 and PPC970FX. + */ + mfspr r0,SPRN_PVR + srwi r0,r0,16 + cmpwi r0,0x39 + beq 1f + cmpwi r0,0x3c + beq 1f + cmpwi r0,0x44 + bnelr +1: + + /* Make sure HID4:rm_ci is off before MMU is turned off, that large + * pages are enabled with HID4:61 and clear HID5:DCBZ_size and + * HID5:DCBZ32_ill + */ + li r0,0 + mfspr r3,SPRN_HID4 + rldimi r3,r0,40,23 /* clear bit 23 (rm_ci) */ + rldimi r3,r0,2,61 /* clear bit 61 (lg_pg_en) */ + sync + mtspr SPRN_HID4,r3 + isync + sync + mfspr r3,SPRN_HID5 + rldimi r3,r0,6,56 /* clear bits 56 & 57 (DCBZ*) */ + sync + mtspr SPRN_HID5,r3 + isync + sync + + /* Setup some basic HID1 features */ + mfspr r0,SPRN_HID1 + li r3,0x1200 /* enable i-fetch cacheability */ + sldi r3,r3,44 /* and prefetch */ + or r0,r0,r3 + mtspr SPRN_HID1,r0 + mtspr SPRN_HID1,r0 + isync + + /* Clear HIOR */ + li r0,0 + sync + mtspr SPRN_HIOR,0 /* Clear interrupt prefix */ + isync + blr + +_GLOBAL(__setup_cpu_power4) + blr + +_GLOBAL(__setup_cpu_be) + /* Set large page sizes LP=0: 16MB, LP=1: 64KB */ + addi r3, 0, 0 + ori r3, r3, HID6_LB + sldi r3, r3, 32 + nor r3, r3, r3 + mfspr r4, SPRN_HID6 + and r4, r4, r3 + addi r3, 0, 0x02000 + sldi r3, r3, 32 + or r4, r4, r3 + mtspr SPRN_HID6, r4 + blr + +_GLOBAL(__setup_cpu_ppc970) + mfspr r0,SPRN_HID0 + li r11,5 /* clear DOZE and SLEEP */ + rldimi r0,r11,52,8 /* set NAP and DPM */ + mtspr SPRN_HID0,r0 + mfspr r0,SPRN_HID0 + mfspr r0,SPRN_HID0 + mfspr r0,SPRN_HID0 + mfspr r0,SPRN_HID0 + mfspr r0,SPRN_HID0 + mfspr r0,SPRN_HID0 + sync + isync + blr + +/* Definitions for the table use to save CPU states */ +#define CS_HID0 0 +#define CS_HID1 8 +#define CS_HID4 16 +#define CS_HID5 24 +#define CS_SIZE 32 + + .data + .balign L1_CACHE_BYTES,0 +cpu_state_storage: + .space CS_SIZE + .balign L1_CACHE_BYTES,0 + .text + +/* Called in normal context to backup CPU 0 state. This + * does not include cache settings. This function is also + * called for machine sleep. This does not include the MMU + * setup, BATs, etc... but rather the "special" registers + * like HID0, HID1, HID4, etc... + */ +_GLOBAL(__save_cpu_setup) + /* Some CR fields are volatile, we back it up all */ + mfcr r7 + + /* Get storage ptr */ + LOADADDR(r5,cpu_state_storage) + + /* We only deal with 970 for now */ + mfspr r0,SPRN_PVR + srwi r0,r0,16 + cmpwi r0,0x39 + beq 1f + cmpwi r0,0x3c + beq 1f + cmpwi r0,0x44 + bne 2f + +1: /* Save HID0,1,4 and 5 */ + mfspr r3,SPRN_HID0 + std r3,CS_HID0(r5) + mfspr r3,SPRN_HID1 + std r3,CS_HID1(r5) + mfspr r3,SPRN_HID4 + std r3,CS_HID4(r5) + mfspr r3,SPRN_HID5 + std r3,CS_HID5(r5) + +2: + mtcr r7 + blr + +/* Called with no MMU context (typically MSR:IR/DR off) to + * restore CPU state as backed up by the previous + * function. This does not include cache setting + */ +_GLOBAL(__restore_cpu_setup) + /* Get storage ptr (FIXME when using anton reloc as we + * are running with translation disabled here + */ + LOADADDR(r5,cpu_state_storage) + + /* We only deal with 970 for now */ + mfspr r0,SPRN_PVR + srwi r0,r0,16 + cmpwi r0,0x39 + beq 1f + cmpwi r0,0x3c + beq 1f + cmpwi r0,0x44 + bnelr + +1: /* Before accessing memory, we make sure rm_ci is clear */ + li r0,0 + mfspr r3,SPRN_HID4 + rldimi r3,r0,40,23 /* clear bit 23 (rm_ci) */ + sync + mtspr SPRN_HID4,r3 + isync + sync + + /* Clear interrupt prefix */ + li r0,0 + sync + mtspr SPRN_HIOR,0 + isync + + /* Restore HID0 */ + ld r3,CS_HID0(r5) + sync + isync + mtspr SPRN_HID0,r3 + mfspr r3,SPRN_HID0 + mfspr r3,SPRN_HID0 + mfspr r3,SPRN_HID0 + mfspr r3,SPRN_HID0 + mfspr r3,SPRN_HID0 + mfspr r3,SPRN_HID0 + sync + isync + + /* Restore HID1 */ + ld r3,CS_HID1(r5) + sync + isync + mtspr SPRN_HID1,r3 + mtspr SPRN_HID1,r3 + sync + isync + + /* Restore HID4 */ + ld r3,CS_HID4(r5) + sync + isync + mtspr SPRN_HID4,r3 + sync + isync + + /* Restore HID5 */ + ld r3,CS_HID5(r5) + sync + isync + mtspr SPRN_HID5,r3 + sync + isync + blr + diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index b91345fa080..1d85cedbbb7 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -52,6 +52,9 @@ extern void __setup_cpu_ppc970(unsigned long offset, struct cpu_spec* spec); #define COMMON_USER (PPC_FEATURE_32 | PPC_FEATURE_HAS_FPU | \ PPC_FEATURE_HAS_MMU) #define COMMON_USER_PPC64 (COMMON_USER | PPC_FEATURE_64) +#define COMMON_USER_POWER4 (COMMON_USER_PPC64 | PPC_FEATURE_POWER4) +#define COMMON_USER_POWER5 (COMMON_USER_PPC64 | PPC_FEATURE_POWER5) +#define COMMON_USER_POWER5_PLUS (COMMON_USER_PPC64 | PPC_FEATURE_POWER5_PLUS) /* We only set the spe features if the kernel was compiled with @@ -160,7 +163,7 @@ struct cpu_spec cpu_specs[] = { .pvr_value = 0x00350000, .cpu_name = "POWER4 (gp)", .cpu_features = CPU_FTRS_POWER4, - .cpu_user_features = COMMON_USER_PPC64, + .cpu_user_features = COMMON_USER_POWER4, .icache_bsize = 128, .dcache_bsize = 128, .num_pmcs = 8, @@ -175,7 +178,7 @@ struct cpu_spec cpu_specs[] = { .pvr_value = 0x00380000, .cpu_name = "POWER4+ (gq)", .cpu_features = CPU_FTRS_POWER4, - .cpu_user_features = COMMON_USER_PPC64, + .cpu_user_features = COMMON_USER_POWER4, .icache_bsize = 128, .dcache_bsize = 128, .num_pmcs = 8, @@ -190,7 +193,7 @@ struct cpu_spec cpu_specs[] = { .pvr_value = 0x00390000, .cpu_name = "PPC970", .cpu_features = CPU_FTRS_PPC970, - .cpu_user_features = COMMON_USER_PPC64 | + .cpu_user_features = COMMON_USER_POWER4 | PPC_FEATURE_HAS_ALTIVEC_COMP, .icache_bsize = 128, .dcache_bsize = 128, @@ -212,7 +215,7 @@ struct cpu_spec cpu_specs[] = { #else .cpu_features = CPU_FTRS_PPC970, #endif - .cpu_user_features = COMMON_USER_PPC64 | + .cpu_user_features = COMMON_USER_POWER4 | PPC_FEATURE_HAS_ALTIVEC_COMP, .icache_bsize = 128, .dcache_bsize = 128, @@ -230,7 +233,7 @@ struct cpu_spec cpu_specs[] = { .pvr_value = 0x00440000, .cpu_name = "PPC970MP", .cpu_features = CPU_FTRS_PPC970, - .cpu_user_features = COMMON_USER_PPC64 | + .cpu_user_features = COMMON_USER_POWER4 | PPC_FEATURE_HAS_ALTIVEC_COMP, .icache_bsize = 128, .dcache_bsize = 128, @@ -240,12 +243,12 @@ struct cpu_spec cpu_specs[] = { .oprofile_model = &op_model_power4, #endif }, - { /* Power5 */ + { /* Power5 GR */ .pvr_mask = 0xffff0000, .pvr_value = 0x003a0000, .cpu_name = "POWER5 (gr)", .cpu_features = CPU_FTRS_POWER5, - .cpu_user_features = COMMON_USER_PPC64, + .cpu_user_features = COMMON_USER_POWER5, .icache_bsize = 128, .dcache_bsize = 128, .num_pmcs = 6, @@ -255,12 +258,12 @@ struct cpu_spec cpu_specs[] = { .oprofile_model = &op_model_power4, #endif }, - { /* Power5 */ + { /* Power5 GS */ .pvr_mask = 0xffff0000, .pvr_value = 0x003b0000, .cpu_name = "POWER5 (gs)", .cpu_features = CPU_FTRS_POWER5, - .cpu_user_features = COMMON_USER_PPC64, + .cpu_user_features = COMMON_USER_POWER5_PLUS, .icache_bsize = 128, .dcache_bsize = 128, .num_pmcs = 6, @@ -276,7 +279,7 @@ struct cpu_spec cpu_specs[] = { .cpu_name = "Cell Broadband Engine", .cpu_features = CPU_FTRS_CELL, .cpu_user_features = COMMON_USER_PPC64 | - PPC_FEATURE_HAS_ALTIVEC_COMP, + PPC_FEATURE_CELL | PPC_FEATURE_HAS_ALTIVEC_COMP, .icache_bsize = 128, .dcache_bsize = 128, .cpu_setup = __setup_cpu_be, @@ -929,6 +932,16 @@ struct cpu_spec cpu_specs[] = { .icache_bsize = 32, .dcache_bsize = 32, }, + { /* 440SPe Rev. A */ + .pvr_mask = 0xff000fff, + .pvr_value = 0x53000890, + .cpu_name = "440SPe Rev. A", + .cpu_features = CPU_FTR_SPLIT_ID_CACHE | + CPU_FTR_USE_TB, + .cpu_user_features = PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU, + .icache_bsize = 32, + .dcache_bsize = 32, + }, #endif /* CONFIG_44x */ #ifdef CONFIG_FSL_BOOKE { /* e200z5 */ diff --git a/arch/powerpc/kernel/dma_64.c b/arch/powerpc/kernel/dma_64.c new file mode 100644 index 00000000000..7c3419656cc --- /dev/null +++ b/arch/powerpc/kernel/dma_64.c @@ -0,0 +1,151 @@ +/* + * Copyright (C) 2004 IBM Corporation + * + * Implements the generic device dma API for ppc64. Handles + * the pci and vio busses + */ + +#include <linux/device.h> +#include <linux/dma-mapping.h> +/* Include the busses we support */ +#include <linux/pci.h> +#include <asm/vio.h> +#include <asm/scatterlist.h> +#include <asm/bug.h> + +static struct dma_mapping_ops *get_dma_ops(struct device *dev) +{ +#ifdef CONFIG_PCI + if (dev->bus == &pci_bus_type) + return &pci_dma_ops; +#endif +#ifdef CONFIG_IBMVIO + if (dev->bus == &vio_bus_type) + return &vio_dma_ops; +#endif + return NULL; +} + +int dma_supported(struct device *dev, u64 mask) +{ + struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + + if (dma_ops) + return dma_ops->dma_supported(dev, mask); + BUG(); + return 0; +} +EXPORT_SYMBOL(dma_supported); + +int dma_set_mask(struct device *dev, u64 dma_mask) +{ +#ifdef CONFIG_PCI + if (dev->bus == &pci_bus_type) + return pci_set_dma_mask(to_pci_dev(dev), dma_mask); +#endif +#ifdef CONFIG_IBMVIO + if (dev->bus == &vio_bus_type) + return -EIO; +#endif /* CONFIG_IBMVIO */ + BUG(); + return 0; +} +EXPORT_SYMBOL(dma_set_mask); + +void *dma_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag) +{ + struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + + if (dma_ops) + return dma_ops->alloc_coherent(dev, size, dma_handle, flag); + BUG(); + return NULL; +} +EXPORT_SYMBOL(dma_alloc_coherent); + +void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, + dma_addr_t dma_handle) +{ + struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + + if (dma_ops) + dma_ops->free_coherent(dev, size, cpu_addr, dma_handle); + else + BUG(); +} +EXPORT_SYMBOL(dma_free_coherent); + +dma_addr_t dma_map_single(struct device *dev, void *cpu_addr, size_t size, + enum dma_data_direction direction) +{ + struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + + if (dma_ops) + return dma_ops->map_single(dev, cpu_addr, size, direction); + BUG(); + return (dma_addr_t)0; +} +EXPORT_SYMBOL(dma_map_single); + +void dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, + enum dma_data_direction direction) +{ + struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + + if (dma_ops) + dma_ops->unmap_single(dev, dma_addr, size, direction); + else + BUG(); +} +EXPORT_SYMBOL(dma_unmap_single); + +dma_addr_t dma_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction direction) +{ + struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + + if (dma_ops) + return dma_ops->map_single(dev, + (page_address(page) + offset), size, direction); + BUG(); + return (dma_addr_t)0; +} +EXPORT_SYMBOL(dma_map_page); + +void dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, + enum dma_data_direction direction) +{ + struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + + if (dma_ops) + dma_ops->unmap_single(dev, dma_address, size, direction); + else + BUG(); +} +EXPORT_SYMBOL(dma_unmap_page); + +int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, + enum dma_data_direction direction) +{ + struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + + if (dma_ops) + return dma_ops->map_sg(dev, sg, nents, direction); + BUG(); + return 0; +} +EXPORT_SYMBOL(dma_map_sg); + +void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nhwentries, + enum dma_data_direction direction) +{ + struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + + if (dma_ops) + dma_ops->unmap_sg(dev, sg, nhwentries, direction); + else + BUG(); +} +EXPORT_SYMBOL(dma_unmap_sg); diff --git a/arch/powerpc/kernel/firmware.c b/arch/powerpc/kernel/firmware.c new file mode 100644 index 00000000000..65eae752a52 --- /dev/null +++ b/arch/powerpc/kernel/firmware.c @@ -0,0 +1,45 @@ +/* + * Extracted from cputable.c + * + * Copyright (C) 2001 Ben. Herrenschmidt (benh@kernel.crashing.org) + * + * Modifications for ppc64: + * Copyright (C) 2003 Dave Engebretsen <engebret@us.ibm.com> + * Copyright (C) 2005 Stephen Rothwell, IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> + +#include <asm/firmware.h> + +unsigned long ppc64_firmware_features; + +#ifdef CONFIG_PPC_PSERIES +firmware_feature_t firmware_features_table[FIRMWARE_MAX_FEATURES] = { + {FW_FEATURE_PFT, "hcall-pft"}, + {FW_FEATURE_TCE, "hcall-tce"}, + {FW_FEATURE_SPRG0, "hcall-sprg0"}, + {FW_FEATURE_DABR, "hcall-dabr"}, + {FW_FEATURE_COPY, "hcall-copy"}, + {FW_FEATURE_ASR, "hcall-asr"}, + {FW_FEATURE_DEBUG, "hcall-debug"}, + {FW_FEATURE_PERF, "hcall-perf"}, + {FW_FEATURE_DUMP, "hcall-dump"}, + {FW_FEATURE_INTERRUPT, "hcall-interrupt"}, + {FW_FEATURE_MIGRATE, "hcall-migrate"}, + {FW_FEATURE_PERFMON, "hcall-perfmon"}, + {FW_FEATURE_CRQ, "hcall-crq"}, + {FW_FEATURE_VIO, "hcall-vio"}, + {FW_FEATURE_RDMA, "hcall-rdma"}, + {FW_FEATURE_LLAN, "hcall-lLAN"}, + {FW_FEATURE_BULK, "hcall-bulk"}, + {FW_FEATURE_XDABR, "hcall-xdabr"}, + {FW_FEATURE_MULTITCE, "hcall-multi-tce"}, + {FW_FEATURE_SPLPAR, "hcall-splpar"}, +}; +#endif diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S index 4d6001fa1cf..b780b42c95f 100644 --- a/arch/powerpc/kernel/fpu.S +++ b/arch/powerpc/kernel/fpu.S @@ -41,20 +41,20 @@ _GLOBAL(load_up_fpu) #ifndef CONFIG_SMP LOADBASE(r3, last_task_used_math) toreal(r3) - LDL r4,OFF(last_task_used_math)(r3) - CMPI 0,r4,0 + PPC_LL r4,OFF(last_task_used_math)(r3) + PPC_LCMPI 0,r4,0 beq 1f toreal(r4) addi r4,r4,THREAD /* want last_task_used_math->thread */ SAVE_32FPRS(0, r4) mffs fr0 stfd fr0,THREAD_FPSCR(r4) - LDL r5,PT_REGS(r4) + PPC_LL r5,PT_REGS(r4) toreal(r5) - LDL r4,_MSR-STACK_FRAME_OVERHEAD(r5) + PPC_LL r4,_MSR-STACK_FRAME_OVERHEAD(r5) li r10,MSR_FP|MSR_FE0|MSR_FE1 andc r4,r4,r10 /* disable FP for previous task */ - STL r4,_MSR-STACK_FRAME_OVERHEAD(r5) + PPC_STL r4,_MSR-STACK_FRAME_OVERHEAD(r5) 1: #endif /* CONFIG_SMP */ /* enable use of FP after return */ @@ -77,7 +77,7 @@ _GLOBAL(load_up_fpu) #ifndef CONFIG_SMP subi r4,r5,THREAD fromreal(r4) - STL r4,OFF(last_task_used_math)(r3) + PPC_STL r4,OFF(last_task_used_math)(r3) #endif /* CONFIG_SMP */ /* restore registers and return */ /* we haven't used ctr or xer or lr */ @@ -97,24 +97,24 @@ _GLOBAL(giveup_fpu) MTMSRD(r5) /* enable use of fpu now */ SYNC_601 isync - CMPI 0,r3,0 + PPC_LCMPI 0,r3,0 beqlr- /* if no previous owner, done */ addi r3,r3,THREAD /* want THREAD of task */ - LDL r5,PT_REGS(r3) - CMPI 0,r5,0 + PPC_LL r5,PT_REGS(r3) + PPC_LCMPI 0,r5,0 SAVE_32FPRS(0, r3) mffs fr0 stfd fr0,THREAD_FPSCR(r3) beq 1f - LDL r4,_MSR-STACK_FRAME_OVERHEAD(r5) + PPC_LL r4,_MSR-STACK_FRAME_OVERHEAD(r5) li r3,MSR_FP|MSR_FE0|MSR_FE1 andc r4,r4,r3 /* disable FP for previous task */ - STL r4,_MSR-STACK_FRAME_OVERHEAD(r5) + PPC_STL r4,_MSR-STACK_FRAME_OVERHEAD(r5) 1: #ifndef CONFIG_SMP li r5,0 LOADBASE(r4,last_task_used_math) - STL r5,OFF(last_task_used_math)(r4) + PPC_STL r5,OFF(last_task_used_math)(r4) #endif /* CONFIG_SMP */ blr diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index b102e3a2415..ccdf94731e3 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -1100,6 +1100,7 @@ start_here: mr r3,r31 mr r4,r30 bl machine_init + bl __save_cpu_setup bl MMU_init #ifdef CONFIG_APUS diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index 45d81976987..8a8bf79ef04 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -28,7 +28,6 @@ #include <asm/reg.h> #include <asm/page.h> #include <asm/mmu.h> -#include <asm/systemcfg.h> #include <asm/ppc_asm.h> #include <asm/asm-offsets.h> #include <asm/bug.h> @@ -195,11 +194,11 @@ exception_marker: #define EX_R12 24 #define EX_R13 32 #define EX_SRR0 40 -#define EX_R3 40 /* SLB miss saves R3, but not SRR0 */ #define EX_DAR 48 -#define EX_LR 48 /* SLB miss saves LR, but not DAR */ #define EX_DSISR 56 #define EX_CCR 60 +#define EX_R3 64 +#define EX_LR 72 #define EXCEPTION_PROLOG_PSERIES(area, label) \ mfspr r13,SPRN_SPRG3; /* get paca address into r13 */ \ @@ -419,17 +418,22 @@ data_access_slb_pSeries: mtspr SPRN_SPRG1,r13 RUNLATCH_ON(r13) mfspr r13,SPRN_SPRG3 /* get paca address into r13 */ + std r3,PACA_EXSLB+EX_R3(r13) + mfspr r3,SPRN_DAR std r9,PACA_EXSLB+EX_R9(r13) /* save r9 - r12 */ + mfcr r9 +#ifdef __DISABLED__ + /* Keep that around for when we re-implement dynamic VSIDs */ + cmpdi r3,0 + bge slb_miss_user_pseries +#endif /* __DISABLED__ */ std r10,PACA_EXSLB+EX_R10(r13) std r11,PACA_EXSLB+EX_R11(r13) std r12,PACA_EXSLB+EX_R12(r13) - std r3,PACA_EXSLB+EX_R3(r13) - mfspr r9,SPRN_SPRG1 - std r9,PACA_EXSLB+EX_R13(r13) - mfcr r9 + mfspr r10,SPRN_SPRG1 + std r10,PACA_EXSLB+EX_R13(r13) mfspr r12,SPRN_SRR1 /* and SRR1 */ - mfspr r3,SPRN_DAR - b .do_slb_miss /* Rel. branch works in real mode */ + b .slb_miss_realmode /* Rel. branch works in real mode */ STD_EXCEPTION_PSERIES(0x400, instruction_access) @@ -440,17 +444,22 @@ instruction_access_slb_pSeries: mtspr SPRN_SPRG1,r13 RUNLATCH_ON(r13) mfspr r13,SPRN_SPRG3 /* get paca address into r13 */ + std r3,PACA_EXSLB+EX_R3(r13) + mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ std r9,PACA_EXSLB+EX_R9(r13) /* save r9 - r12 */ + mfcr r9 +#ifdef __DISABLED__ + /* Keep that around for when we re-implement dynamic VSIDs */ + cmpdi r3,0 + bge slb_miss_user_pseries +#endif /* __DISABLED__ */ std r10,PACA_EXSLB+EX_R10(r13) std r11,PACA_EXSLB+EX_R11(r13) std r12,PACA_EXSLB+EX_R12(r13) - std r3,PACA_EXSLB+EX_R3(r13) - mfspr r9,SPRN_SPRG1 - std r9,PACA_EXSLB+EX_R13(r13) - mfcr r9 + mfspr r10,SPRN_SPRG1 + std r10,PACA_EXSLB+EX_R13(r13) mfspr r12,SPRN_SRR1 /* and SRR1 */ - mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ - b .do_slb_miss /* Rel. branch works in real mode */ + b .slb_miss_realmode /* Rel. branch works in real mode */ STD_EXCEPTION_PSERIES(0x500, hardware_interrupt) STD_EXCEPTION_PSERIES(0x600, alignment) @@ -509,6 +518,38 @@ _GLOBAL(do_stab_bolted_pSeries) EXCEPTION_PROLOG_PSERIES(PACA_EXSLB, .do_stab_bolted) /* + * We have some room here we use that to put + * the peries slb miss user trampoline code so it's reasonably + * away from slb_miss_user_common to avoid problems with rfid + * + * This is used for when the SLB miss handler has to go virtual, + * which doesn't happen for now anymore but will once we re-implement + * dynamic VSIDs for shared page tables + */ +#ifdef __DISABLED__ +slb_miss_user_pseries: + std r10,PACA_EXGEN+EX_R10(r13) + std r11,PACA_EXGEN+EX_R11(r13) + std r12,PACA_EXGEN+EX_R12(r13) + mfspr r10,SPRG1 + ld r11,PACA_EXSLB+EX_R9(r13) + ld r12,PACA_EXSLB+EX_R3(r13) + std r10,PACA_EXGEN+EX_R13(r13) + std r11,PACA_EXGEN+EX_R9(r13) + std r12,PACA_EXGEN+EX_R3(r13) + clrrdi r12,r13,32 + mfmsr r10 + mfspr r11,SRR0 /* save SRR0 */ + ori r12,r12,slb_miss_user_common@l /* virt addr of handler */ + ori r10,r10,MSR_IR|MSR_DR|MSR_RI + mtspr SRR0,r12 + mfspr r12,SRR1 /* and SRR1 */ + mtspr SRR1,r10 + rfid + b . /* prevent spec. execution */ +#endif /* __DISABLED__ */ + +/* * Vectors for the FWNMI option. Share common code. */ .globl system_reset_fwnmi @@ -559,22 +600,59 @@ END_FTR_SECTION_IFCLR(CPU_FTR_SLB) .globl data_access_slb_iSeries data_access_slb_iSeries: mtspr SPRN_SPRG1,r13 /* save r13 */ - EXCEPTION_PROLOG_ISERIES_1(PACA_EXSLB) + mfspr r13,SPRN_SPRG3 /* get paca address into r13 */ std r3,PACA_EXSLB+EX_R3(r13) - ld r12,PACALPPACA+LPPACASRR1(r13) mfspr r3,SPRN_DAR - b .do_slb_miss + std r9,PACA_EXSLB+EX_R9(r13) + mfcr r9 +#ifdef __DISABLED__ + cmpdi r3,0 + bge slb_miss_user_iseries +#endif + std r10,PACA_EXSLB+EX_R10(r13) + std r11,PACA_EXSLB+EX_R11(r13) + std r12,PACA_EXSLB+EX_R12(r13) + mfspr r10,SPRN_SPRG1 + std r10,PACA_EXSLB+EX_R13(r13) + ld r12,PACALPPACA+LPPACASRR1(r13); + b .slb_miss_realmode STD_EXCEPTION_ISERIES(0x400, instruction_access, PACA_EXGEN) .globl instruction_access_slb_iSeries instruction_access_slb_iSeries: mtspr SPRN_SPRG1,r13 /* save r13 */ - EXCEPTION_PROLOG_ISERIES_1(PACA_EXSLB) + mfspr r13,SPRN_SPRG3 /* get paca address into r13 */ std r3,PACA_EXSLB+EX_R3(r13) - ld r12,PACALPPACA+LPPACASRR1(r13) - ld r3,PACALPPACA+LPPACASRR0(r13) - b .do_slb_miss + ld r3,PACALPPACA+LPPACASRR0(r13) /* get SRR0 value */ + std r9,PACA_EXSLB+EX_R9(r13) + mfcr r9 +#ifdef __DISABLED__ + cmpdi r3,0 + bge .slb_miss_user_iseries +#endif + std r10,PACA_EXSLB+EX_R10(r13) + std r11,PACA_EXSLB+EX_R11(r13) + std r12,PACA_EXSLB+EX_R12(r13) + mfspr r10,SPRN_SPRG1 + std r10,PACA_EXSLB+EX_R13(r13) + ld r12,PACALPPACA+LPPACASRR1(r13); + b .slb_miss_realmode + +#ifdef __DISABLED__ +slb_miss_user_iseries: + std r10,PACA_EXGEN+EX_R10(r13) + std r11,PACA_EXGEN+EX_R11(r13) + std r12,PACA_EXGEN+EX_R12(r13) + mfspr r10,SPRG1 + ld r11,PACA_EXSLB+EX_R9(r13) + ld r12,PACA_EXSLB+EX_R3(r13) + std r10,PACA_EXGEN+EX_R13(r13) + std r11,PACA_EXGEN+EX_R9(r13) + std r12,PACA_EXGEN+EX_R3(r13) + EXCEPTION_PROLOG_ISERIES_2 + b slb_miss_user_common +#endif MASKABLE_EXCEPTION_ISERIES(0x500, hardware_interrupt) STD_EXCEPTION_ISERIES(0x600, alignment, PACA_EXGEN) @@ -809,6 +887,126 @@ instruction_access_common: li r5,0x400 b .do_hash_page /* Try to handle as hpte fault */ +/* + * Here is the common SLB miss user that is used when going to virtual + * mode for SLB misses, that is currently not used + */ +#ifdef __DISABLED__ + .align 7 + .globl slb_miss_user_common +slb_miss_user_common: + mflr r10 + std r3,PACA_EXGEN+EX_DAR(r13) + stw r9,PACA_EXGEN+EX_CCR(r13) + std r10,PACA_EXGEN+EX_LR(r13) + std r11,PACA_EXGEN+EX_SRR0(r13) + bl .slb_allocate_user + + ld r10,PACA_EXGEN+EX_LR(r13) + ld r3,PACA_EXGEN+EX_R3(r13) + lwz r9,PACA_EXGEN+EX_CCR(r13) + ld r11,PACA_EXGEN+EX_SRR0(r13) + mtlr r10 + beq- slb_miss_fault + + andi. r10,r12,MSR_RI /* check for unrecoverable exception */ + beq- unrecov_user_slb + mfmsr r10 + +.machine push +.machine "power4" + mtcrf 0x80,r9 +.machine pop + + clrrdi r10,r10,2 /* clear RI before setting SRR0/1 */ + mtmsrd r10,1 + + mtspr SRR0,r11 + mtspr SRR1,r12 + + ld r9,PACA_EXGEN+EX_R9(r13) + ld r10,PACA_EXGEN+EX_R10(r13) + ld r11,PACA_EXGEN+EX_R11(r13) + ld r12,PACA_EXGEN+EX_R12(r13) + ld r13,PACA_EXGEN+EX_R13(r13) + rfid + b . + +slb_miss_fault: + EXCEPTION_PROLOG_COMMON(0x380, PACA_EXGEN) + ld r4,PACA_EXGEN+EX_DAR(r13) + li r5,0 + std r4,_DAR(r1) + std r5,_DSISR(r1) + b .handle_page_fault + +unrecov_user_slb: + EXCEPTION_PROLOG_COMMON(0x4200, PACA_EXGEN) + DISABLE_INTS + bl .save_nvgprs +1: addi r3,r1,STACK_FRAME_OVERHEAD + bl .unrecoverable_exception + b 1b + +#endif /* __DISABLED__ */ + + +/* + * r13 points to the PACA, r9 contains the saved CR, + * r12 contain the saved SRR1, SRR0 is still ready for return + * r3 has the faulting address + * r9 - r13 are saved in paca->exslb. + * r3 is saved in paca->slb_r3 + * We assume we aren't going to take any exceptions during this procedure. + */ +_GLOBAL(slb_miss_realmode) + mflr r10 + + stw r9,PACA_EXSLB+EX_CCR(r13) /* save CR in exc. frame */ + std r10,PACA_EXSLB+EX_LR(r13) /* save LR */ + + bl .slb_allocate_realmode + + /* All done -- return from exception. */ + + ld r10,PACA_EXSLB+EX_LR(r13) + ld r3,PACA_EXSLB+EX_R3(r13) + lwz r9,PACA_EXSLB+EX_CCR(r13) /* get saved CR */ +#ifdef CONFIG_PPC_ISERIES + ld r11,PACALPPACA+LPPACASRR0(r13) /* get SRR0 value */ +#endif /* CONFIG_PPC_ISERIES */ + + mtlr r10 + + andi. r10,r12,MSR_RI /* check for unrecoverable exception */ + beq- unrecov_slb + +.machine push +.machine "power4" + mtcrf 0x80,r9 + mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */ +.machine pop + +#ifdef CONFIG_PPC_ISERIES + mtspr SPRN_SRR0,r11 + mtspr SPRN_SRR1,r12 +#endif /* CONFIG_PPC_ISERIES */ + ld r9,PACA_EXSLB+EX_R9(r13) + ld r10,PACA_EXSLB+EX_R10(r13) + ld r11,PACA_EXSLB+EX_R11(r13) + ld r12,PACA_EXSLB+EX_R12(r13) + ld r13,PACA_EXSLB+EX_R13(r13) + rfid + b . /* prevent speculative execution */ + +unrecov_slb: + EXCEPTION_PROLOG_COMMON(0x4100, PACA_EXSLB) + DISABLE_INTS + bl .save_nvgprs +1: addi r3,r1,STACK_FRAME_OVERHEAD + bl .unrecoverable_exception + b 1b + .align 7 .globl hardware_interrupt_common .globl hardware_interrupt_entry @@ -1139,62 +1337,6 @@ _GLOBAL(do_stab_bolted) b . /* prevent speculative execution */ /* - * r13 points to the PACA, r9 contains the saved CR, - * r11 and r12 contain the saved SRR0 and SRR1. - * r3 has the faulting address - * r9 - r13 are saved in paca->exslb. - * r3 is saved in paca->slb_r3 - * We assume we aren't going to take any exceptions during this procedure. - */ -_GLOBAL(do_slb_miss) - mflr r10 - - stw r9,PACA_EXSLB+EX_CCR(r13) /* save CR in exc. frame */ - std r10,PACA_EXSLB+EX_LR(r13) /* save LR */ - - bl .slb_allocate /* handle it */ - - /* All done -- return from exception. */ - - ld r10,PACA_EXSLB+EX_LR(r13) - ld r3,PACA_EXSLB+EX_R3(r13) - lwz r9,PACA_EXSLB+EX_CCR(r13) /* get saved CR */ -#ifdef CONFIG_PPC_ISERIES - ld r11,PACALPPACA+LPPACASRR0(r13) /* get SRR0 value */ -#endif /* CONFIG_PPC_ISERIES */ - - mtlr r10 - - andi. r10,r12,MSR_RI /* check for unrecoverable exception */ - beq- unrecov_slb - -.machine push -.machine "power4" - mtcrf 0x80,r9 - mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */ -.machine pop - -#ifdef CONFIG_PPC_ISERIES - mtspr SPRN_SRR0,r11 - mtspr SPRN_SRR1,r12 -#endif /* CONFIG_PPC_ISERIES */ - ld r9,PACA_EXSLB+EX_R9(r13) - ld r10,PACA_EXSLB+EX_R10(r13) - ld r11,PACA_EXSLB+EX_R11(r13) - ld r12,PACA_EXSLB+EX_R12(r13) - ld r13,PACA_EXSLB+EX_R13(r13) - rfid - b . /* prevent speculative execution */ - -unrecov_slb: - EXCEPTION_PROLOG_COMMON(0x4100, PACA_EXSLB) - DISABLE_INTS - bl .save_nvgprs -1: addi r3,r1,STACK_FRAME_OVERHEAD - bl .unrecoverable_exception - b 1b - -/* * Space for CPU0's segment table. * * On iSeries, the hypervisor must fill in at least one entry before @@ -1554,22 +1696,14 @@ _GLOBAL(pmac_secondary_start) * SPRG3 = paca virtual address */ _GLOBAL(__secondary_start) + /* Set thread priority to MEDIUM */ + HMT_MEDIUM - HMT_MEDIUM /* Set thread priority to MEDIUM */ - + /* Load TOC */ ld r2,PACATOC(r13) - li r6,0 - stb r6,PACAPROCENABLED(r13) - -#ifndef CONFIG_PPC_ISERIES - /* Initialize the page table pointer register. */ - LOADADDR(r6,_SDR1) - ld r6,0(r6) /* get the value of _SDR1 */ - mtspr SPRN_SDR1,r6 /* set the htab location */ -#endif - /* Initialize the first segment table (or SLB) entry */ - ld r3,PACASTABVIRT(r13) /* get addr of segment table */ - bl .stab_initialize + + /* Do early setup for that CPU (stab, slb, hash table pointer) */ + bl .early_setup_secondary /* Initialize the kernel stack. Just a repeat for iSeries. */ LOADADDR(r3,current_set) @@ -1578,37 +1712,7 @@ _GLOBAL(__secondary_start) addi r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD std r1,PACAKSAVE(r13) - ld r3,PACASTABREAL(r13) /* get raddr of segment table */ - ori r4,r3,1 /* turn on valid bit */ - -#ifdef CONFIG_PPC_ISERIES - li r0,-1 /* hypervisor call */ - li r3,1 - sldi r3,r3,63 /* 0x8000000000000000 */ - ori r3,r3,4 /* 0x8000000000000004 */ - sc /* HvCall_setASR */ -#else - /* set the ASR */ - ld r3,systemcfg@got(r2) /* r3 = ptr to systemcfg */ - ld r3,0(r3) - lwz r3,PLATFORM(r3) /* r3 = platform flags */ - andi. r3,r3,PLATFORM_LPAR /* Test if bit 0 is set (LPAR bit) */ - beq 98f /* branch if result is 0 */ - mfspr r3,SPRN_PVR - srwi r3,r3,16 - cmpwi r3,0x37 /* SStar */ - beq 97f - cmpwi r3,0x36 /* IStar */ - beq 97f - cmpwi r3,0x34 /* Pulsar */ - bne 98f -97: li r3,H_SET_ASR /* hcall = H_SET_ASR */ - HVSC /* Invoking hcall */ - b 99f -98: /* !(rpa hypervisor) || !(star) */ - mtasr r4 /* set the stab location */ -99: -#endif + /* Clear backchain so we get nice backtraces */ li r7,0 mtlr r7 @@ -1631,6 +1735,7 @@ _GLOBAL(start_secondary_prolog) li r3,0 std r3,0(r1) /* Zero the stack frame pointer */ bl .start_secondary + b . #endif /* @@ -1750,40 +1855,6 @@ _STATIC(start_here_multiplatform) mr r3,r31 bl .early_setup - /* set the ASR */ - ld r3,PACASTABREAL(r13) - ori r4,r3,1 /* turn on valid bit */ - ld r3,systemcfg@got(r2) /* r3 = ptr to systemcfg */ - ld r3,0(r3) - lwz r3,PLATFORM(r3) /* r3 = platform flags */ - andi. r3,r3,PLATFORM_LPAR /* Test if bit 0 is set (LPAR bit) */ - beq 98f /* branch if result is 0 */ - mfspr r3,SPRN_PVR - srwi r3,r3,16 - cmpwi r3,0x37 /* SStar */ - beq 97f - cmpwi r3,0x36 /* IStar */ - beq 97f - cmpwi r3,0x34 /* Pulsar */ - bne 98f -97: li r3,H_SET_ASR /* hcall = H_SET_ASR */ - HVSC /* Invoking hcall */ - b 99f -98: /* !(rpa hypervisor) || !(star) */ - mtasr r4 /* set the stab location */ -99: - /* Set SDR1 (hash table pointer) */ - ld r3,systemcfg@got(r2) /* r3 = ptr to systemcfg */ - ld r3,0(r3) - lwz r3,PLATFORM(r3) /* r3 = platform flags */ - /* Test if bit 0 is set (LPAR bit) */ - andi. r3,r3,PLATFORM_LPAR - bne 98f /* branch if result is !0 */ - LOADADDR(r6,_SDR1) /* Only if NOT LPAR */ - add r6,r6,r26 - ld r6,0(r6) /* get the value of _SDR1 */ - mtspr SPRN_SDR1,r6 /* set the htab location */ -98: LOADADDR(r3,.start_here_common) SET_REG_TO_CONST(r4, MSR_KERNEL) mtspr SPRN_SRR0,r3 diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index 5063c603fad..8d60fa99fc4 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -24,7 +24,7 @@ * Copyright 2002-2004 MontaVista Software, Inc. * PowerPC 44x support, Matt Porter <mporter@kernel.crashing.org> * Copyright 2004 Freescale Semiconductor, Inc - * PowerPC e500 modifications, Kumar Gala <kumar.gala@freescale.com> + * PowerPC e500 modifications, Kumar Gala <galak@kernel.crashing.org> * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the diff --git a/arch/powerpc/kernel/ioctl32.c b/arch/powerpc/kernel/ioctl32.c new file mode 100644 index 00000000000..0fa3d27fef0 --- /dev/null +++ b/arch/powerpc/kernel/ioctl32.c @@ -0,0 +1,45 @@ +/* + * ioctl32.c: Conversion between 32bit and 64bit native ioctls. + * + * Based on sparc64 ioctl32.c by: + * + * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com) + * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be) + * + * ppc64 changes: + * + * Copyright (C) 2000 Ken Aaker (kdaaker@rchland.vnet.ibm.com) + * Copyright (C) 2001 Anton Blanchard (antonb@au.ibm.com) + * + * These routines maintain argument size conversion between 32bit and 64bit + * ioctls. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define INCLUDES +#include "compat_ioctl.c" +#include <linux/syscalls.h> + +#define CODE +#include "compat_ioctl.c" + +#define HANDLE_IOCTL(cmd,handler) { cmd, (ioctl_trans_handler_t)handler, NULL }, +#define COMPATIBLE_IOCTL(cmd) HANDLE_IOCTL(cmd,sys_ioctl) + +#define IOCTL_TABLE_START \ + struct ioctl_trans ioctl_start[] = { +#define IOCTL_TABLE_END \ + }; + +IOCTL_TABLE_START +#include <linux/compat_ioctl.h> +#define DECLARES +#include "compat_ioctl.c" + +IOCTL_TABLE_END + +int ioctl_table_size = ARRAY_SIZE(ioctl_start); diff --git a/arch/powerpc/kernel/iomap.c b/arch/powerpc/kernel/iomap.c new file mode 100644 index 00000000000..6160c8dbb7c --- /dev/null +++ b/arch/powerpc/kernel/iomap.c @@ -0,0 +1,146 @@ +/* + * arch/ppc64/kernel/iomap.c + * + * ppc64 "iomap" interface implementation. + * + * (C) Copyright 2004 Linus Torvalds + */ +#include <linux/init.h> +#include <linux/pci.h> +#include <linux/mm.h> +#include <asm/io.h> + +/* + * Here comes the ppc64 implementation of the IOMAP + * interfaces. + */ +unsigned int fastcall ioread8(void __iomem *addr) +{ + return readb(addr); +} +unsigned int fastcall ioread16(void __iomem *addr) +{ + return readw(addr); +} +unsigned int fastcall ioread16be(void __iomem *addr) +{ + return in_be16(addr); +} +unsigned int fastcall ioread32(void __iomem *addr) +{ + return readl(addr); +} +unsigned int fastcall ioread32be(void __iomem *addr) +{ + return in_be32(addr); +} +EXPORT_SYMBOL(ioread8); +EXPORT_SYMBOL(ioread16); +EXPORT_SYMBOL(ioread16be); +EXPORT_SYMBOL(ioread32); +EXPORT_SYMBOL(ioread32be); + +void fastcall iowrite8(u8 val, void __iomem *addr) +{ + writeb(val, addr); +} +void fastcall iowrite16(u16 val, void __iomem *addr) +{ + writew(val, addr); +} +void fastcall iowrite16be(u16 val, void __iomem *addr) +{ + out_be16(addr, val); +} +void fastcall iowrite32(u32 val, void __iomem *addr) +{ + writel(val, addr); +} +void fastcall iowrite32be(u32 val, void __iomem *addr) +{ + out_be32(addr, val); +} +EXPORT_SYMBOL(iowrite8); +EXPORT_SYMBOL(iowrite16); +EXPORT_SYMBOL(iowrite16be); +EXPORT_SYMBOL(iowrite32); +EXPORT_SYMBOL(iowrite32be); + +/* + * These are the "repeat read/write" functions. Note the + * non-CPU byte order. We do things in "IO byteorder" + * here. + * + * FIXME! We could make these do EEH handling if we really + * wanted. Not clear if we do. + */ +void ioread8_rep(void __iomem *addr, void *dst, unsigned long count) +{ + _insb((u8 __iomem *) addr, dst, count); +} +void ioread16_rep(void __iomem *addr, void *dst, unsigned long count) +{ + _insw_ns((u16 __iomem *) addr, dst, count); +} +void ioread32_rep(void __iomem *addr, void *dst, unsigned long count) +{ + _insl_ns((u32 __iomem *) addr, dst, count); +} +EXPORT_SYMBOL(ioread8_rep); +EXPORT_SYMBOL(ioread16_rep); +EXPORT_SYMBOL(ioread32_rep); + +void iowrite8_rep(void __iomem *addr, const void *src, unsigned long count) +{ + _outsb((u8 __iomem *) addr, src, count); +} +void iowrite16_rep(void __iomem *addr, const void *src, unsigned long count) +{ + _outsw_ns((u16 __iomem *) addr, src, count); +} +void iowrite32_rep(void __iomem *addr, const void *src, unsigned long count) +{ + _outsl_ns((u32 __iomem *) addr, src, count); +} +EXPORT_SYMBOL(iowrite8_rep); +EXPORT_SYMBOL(iowrite16_rep); +EXPORT_SYMBOL(iowrite32_rep); + +void __iomem *ioport_map(unsigned long port, unsigned int len) +{ + if (!_IO_IS_VALID(port)) + return NULL; + return (void __iomem *) (port+pci_io_base); +} + +void ioport_unmap(void __iomem *addr) +{ + /* Nothing to do */ +} +EXPORT_SYMBOL(ioport_map); +EXPORT_SYMBOL(ioport_unmap); + +void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long max) +{ + unsigned long start = pci_resource_start(dev, bar); + unsigned long len = pci_resource_len(dev, bar); + unsigned long flags = pci_resource_flags(dev, bar); + + if (!len) + return NULL; + if (max && len > max) + len = max; + if (flags & IORESOURCE_IO) + return ioport_map(start, len); + if (flags & IORESOURCE_MEM) + return ioremap(start, len); + /* What? */ + return NULL; +} + +void pci_iounmap(struct pci_dev *dev, void __iomem *addr) +{ + /* Nothing to do */ +} +EXPORT_SYMBOL(pci_iomap); +EXPORT_SYMBOL(pci_iounmap); diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c new file mode 100644 index 00000000000..4d9b4388918 --- /dev/null +++ b/arch/powerpc/kernel/iommu.c @@ -0,0 +1,572 @@ +/* + * arch/ppc64/kernel/iommu.c + * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation + * + * Rewrite, cleanup, new allocation schemes, virtual merging: + * Copyright (C) 2004 Olof Johansson, IBM Corporation + * and Ben. Herrenschmidt, IBM Corporation + * + * Dynamic DMA mapping support, bus-independent parts. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + + +#include <linux/config.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/spinlock.h> +#include <linux/string.h> +#include <linux/dma-mapping.h> +#include <linux/init.h> +#include <linux/bitops.h> +#include <asm/io.h> +#include <asm/prom.h> +#include <asm/iommu.h> +#include <asm/pci-bridge.h> +#include <asm/machdep.h> + +#define DBG(...) + +#ifdef CONFIG_IOMMU_VMERGE +static int novmerge = 0; +#else +static int novmerge = 1; +#endif + +static int __init setup_iommu(char *str) +{ + if (!strcmp(str, "novmerge")) + novmerge = 1; + else if (!strcmp(str, "vmerge")) + novmerge = 0; + return 1; +} + +__setup("iommu=", setup_iommu); + +static unsigned long iommu_range_alloc(struct iommu_table *tbl, + unsigned long npages, + unsigned long *handle, + unsigned int align_order) +{ + unsigned long n, end, i, start; + unsigned long limit; + int largealloc = npages > 15; + int pass = 0; + unsigned long align_mask; + + align_mask = 0xffffffffffffffffl >> (64 - align_order); + + /* This allocator was derived from x86_64's bit string search */ + + /* Sanity check */ + if (unlikely(npages) == 0) { + if (printk_ratelimit()) + WARN_ON(1); + return DMA_ERROR_CODE; + } + + if (handle && *handle) + start = *handle; + else + start = largealloc ? tbl->it_largehint : tbl->it_hint; + + /* Use only half of the table for small allocs (15 pages or less) */ + limit = largealloc ? tbl->it_size : tbl->it_halfpoint; + + if (largealloc && start < tbl->it_halfpoint) + start = tbl->it_halfpoint; + + /* The case below can happen if we have a small segment appended + * to a large, or when the previous alloc was at the very end of + * the available space. If so, go back to the initial start. + */ + if (start >= limit) + start = largealloc ? tbl->it_largehint : tbl->it_hint; + + again: + + n = find_next_zero_bit(tbl->it_map, limit, start); + + /* Align allocation */ + n = (n + align_mask) & ~align_mask; + + end = n + npages; + + if (unlikely(end >= limit)) { + if (likely(pass < 2)) { + /* First failure, just rescan the half of the table. + * Second failure, rescan the other half of the table. + */ + start = (largealloc ^ pass) ? tbl->it_halfpoint : 0; + limit = pass ? tbl->it_size : limit; + pass++; + goto again; + } else { + /* Third failure, give up */ + return DMA_ERROR_CODE; + } + } + + for (i = n; i < end; i++) + if (test_bit(i, tbl->it_map)) { + start = i+1; + goto again; + } + + for (i = n; i < end; i++) + __set_bit(i, tbl->it_map); + + /* Bump the hint to a new block for small allocs. */ + if (largealloc) { + /* Don't bump to new block to avoid fragmentation */ + tbl->it_largehint = end; + } else { + /* Overflow will be taken care of at the next allocation */ + tbl->it_hint = (end + tbl->it_blocksize - 1) & + ~(tbl->it_blocksize - 1); + } + + /* Update handle for SG allocations */ + if (handle) + *handle = end; + + return n; +} + +static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *page, + unsigned int npages, enum dma_data_direction direction, + unsigned int align_order) +{ + unsigned long entry, flags; + dma_addr_t ret = DMA_ERROR_CODE; + + spin_lock_irqsave(&(tbl->it_lock), flags); + + entry = iommu_range_alloc(tbl, npages, NULL, align_order); + + if (unlikely(entry == DMA_ERROR_CODE)) { + spin_unlock_irqrestore(&(tbl->it_lock), flags); + return DMA_ERROR_CODE; + } + + entry += tbl->it_offset; /* Offset into real TCE table */ + ret = entry << PAGE_SHIFT; /* Set the return dma address */ + + /* Put the TCEs in the HW table */ + ppc_md.tce_build(tbl, entry, npages, (unsigned long)page & PAGE_MASK, + direction); + + + /* Flush/invalidate TLB caches if necessary */ + if (ppc_md.tce_flush) + ppc_md.tce_flush(tbl); + + spin_unlock_irqrestore(&(tbl->it_lock), flags); + + /* Make sure updates are seen by hardware */ + mb(); + + return ret; +} + +static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, + unsigned int npages) +{ + unsigned long entry, free_entry; + unsigned long i; + + entry = dma_addr >> PAGE_SHIFT; + free_entry = entry - tbl->it_offset; + + if (((free_entry + npages) > tbl->it_size) || + (entry < tbl->it_offset)) { + if (printk_ratelimit()) { + printk(KERN_INFO "iommu_free: invalid entry\n"); + printk(KERN_INFO "\tentry = 0x%lx\n", entry); + printk(KERN_INFO "\tdma_addr = 0x%lx\n", (u64)dma_addr); + printk(KERN_INFO "\tTable = 0x%lx\n", (u64)tbl); + printk(KERN_INFO "\tbus# = 0x%lx\n", (u64)tbl->it_busno); + printk(KERN_INFO "\tsize = 0x%lx\n", (u64)tbl->it_size); + printk(KERN_INFO "\tstartOff = 0x%lx\n", (u64)tbl->it_offset); + printk(KERN_INFO "\tindex = 0x%lx\n", (u64)tbl->it_index); + WARN_ON(1); + } + return; + } + + ppc_md.tce_free(tbl, entry, npages); + + for (i = 0; i < npages; i++) + __clear_bit(free_entry+i, tbl->it_map); +} + +static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, + unsigned int npages) +{ + unsigned long flags; + + spin_lock_irqsave(&(tbl->it_lock), flags); + + __iommu_free(tbl, dma_addr, npages); + + /* Make sure TLB cache is flushed if the HW needs it. We do + * not do an mb() here on purpose, it is not needed on any of + * the current platforms. + */ + if (ppc_md.tce_flush) + ppc_md.tce_flush(tbl); + + spin_unlock_irqrestore(&(tbl->it_lock), flags); +} + +int iommu_map_sg(struct device *dev, struct iommu_table *tbl, + struct scatterlist *sglist, int nelems, + enum dma_data_direction direction) +{ + dma_addr_t dma_next = 0, dma_addr; + unsigned long flags; + struct scatterlist *s, *outs, *segstart; + int outcount, incount; + unsigned long handle; + + BUG_ON(direction == DMA_NONE); + + if ((nelems == 0) || !tbl) + return 0; + + outs = s = segstart = &sglist[0]; + outcount = 1; + incount = nelems; + handle = 0; + + /* Init first segment length for backout at failure */ + outs->dma_length = 0; + + DBG("mapping %d elements:\n", nelems); + + spin_lock_irqsave(&(tbl->it_lock), flags); + + for (s = outs; nelems; nelems--, s++) { + unsigned long vaddr, npages, entry, slen; + + slen = s->length; + /* Sanity check */ + if (slen == 0) { + dma_next = 0; + continue; + } + /* Allocate iommu entries for that segment */ + vaddr = (unsigned long)page_address(s->page) + s->offset; + npages = PAGE_ALIGN(vaddr + slen) - (vaddr & PAGE_MASK); + npages >>= PAGE_SHIFT; + entry = iommu_range_alloc(tbl, npages, &handle, 0); + + DBG(" - vaddr: %lx, size: %lx\n", vaddr, slen); + + /* Handle failure */ + if (unlikely(entry == DMA_ERROR_CODE)) { + if (printk_ratelimit()) + printk(KERN_INFO "iommu_alloc failed, tbl %p vaddr %lx" + " npages %lx\n", tbl, vaddr, npages); + goto failure; + } + + /* Convert entry to a dma_addr_t */ + entry += tbl->it_offset; + dma_addr = entry << PAGE_SHIFT; + dma_addr |= s->offset; + + DBG(" - %lx pages, entry: %lx, dma_addr: %lx\n", + npages, entry, dma_addr); + + /* Insert into HW table */ + ppc_md.tce_build(tbl, entry, npages, vaddr & PAGE_MASK, direction); + + /* If we are in an open segment, try merging */ + if (segstart != s) { + DBG(" - trying merge...\n"); + /* We cannot merge if: + * - allocated dma_addr isn't contiguous to previous allocation + */ + if (novmerge || (dma_addr != dma_next)) { + /* Can't merge: create a new segment */ + segstart = s; + outcount++; outs++; + DBG(" can't merge, new segment.\n"); + } else { + outs->dma_length += s->length; + DBG(" merged, new len: %lx\n", outs->dma_length); + } + } + + if (segstart == s) { + /* This is a new segment, fill entries */ + DBG(" - filling new segment.\n"); + outs->dma_address = dma_addr; + outs->dma_length = slen; + } + + /* Calculate next page pointer for contiguous check */ + dma_next = dma_addr + slen; + + DBG(" - dma next is: %lx\n", dma_next); + } + + /* Flush/invalidate TLB caches if necessary */ + if (ppc_md.tce_flush) + ppc_md.tce_flush(tbl); + + spin_unlock_irqrestore(&(tbl->it_lock), flags); + + /* Make sure updates are seen by hardware */ + mb(); + + DBG("mapped %d elements:\n", outcount); + + /* For the sake of iommu_unmap_sg, we clear out the length in the + * next entry of the sglist if we didn't fill the list completely + */ + if (outcount < incount) { + outs++; + outs->dma_address = DMA_ERROR_CODE; + outs->dma_length = 0; + } + return outcount; + + failure: + for (s = &sglist[0]; s <= outs; s++) { + if (s->dma_length != 0) { + unsigned long vaddr, npages; + + vaddr = s->dma_address & PAGE_MASK; + npages = (PAGE_ALIGN(s->dma_address + s->dma_length) - vaddr) + >> PAGE_SHIFT; + __iommu_free(tbl, vaddr, npages); + } + } + spin_unlock_irqrestore(&(tbl->it_lock), flags); + return 0; +} + + +void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist, + int nelems, enum dma_data_direction direction) +{ + unsigned long flags; + + BUG_ON(direction == DMA_NONE); + + if (!tbl) + return; + + spin_lock_irqsave(&(tbl->it_lock), flags); + + while (nelems--) { + unsigned int npages; + dma_addr_t dma_handle = sglist->dma_address; + + if (sglist->dma_length == 0) + break; + npages = (PAGE_ALIGN(dma_handle + sglist->dma_length) + - (dma_handle & PAGE_MASK)) >> PAGE_SHIFT; + __iommu_free(tbl, dma_handle, npages); + sglist++; + } + + /* Flush/invalidate TLBs if necessary. As for iommu_free(), we + * do not do an mb() here, the affected platforms do not need it + * when freeing. + */ + if (ppc_md.tce_flush) + ppc_md.tce_flush(tbl); + + spin_unlock_irqrestore(&(tbl->it_lock), flags); +} + +/* + * Build a iommu_table structure. This contains a bit map which + * is used to manage allocation of the tce space. + */ +struct iommu_table *iommu_init_table(struct iommu_table *tbl) +{ + unsigned long sz; + static int welcomed = 0; + + /* Set aside 1/4 of the table for large allocations. */ + tbl->it_halfpoint = tbl->it_size * 3 / 4; + + /* number of bytes needed for the bitmap */ + sz = (tbl->it_size + 7) >> 3; + + tbl->it_map = (unsigned long *)__get_free_pages(GFP_ATOMIC, get_order(sz)); + if (!tbl->it_map) + panic("iommu_init_table: Can't allocate %ld bytes\n", sz); + + memset(tbl->it_map, 0, sz); + + tbl->it_hint = 0; + tbl->it_largehint = tbl->it_halfpoint; + spin_lock_init(&tbl->it_lock); + + /* Clear the hardware table in case firmware left allocations in it */ + ppc_md.tce_free(tbl, tbl->it_offset, tbl->it_size); + + if (!welcomed) { + printk(KERN_INFO "IOMMU table initialized, virtual merging %s\n", + novmerge ? "disabled" : "enabled"); + welcomed = 1; + } + + return tbl; +} + +void iommu_free_table(struct device_node *dn) +{ + struct pci_dn *pdn = dn->data; + struct iommu_table *tbl = pdn->iommu_table; + unsigned long bitmap_sz, i; + unsigned int order; + + if (!tbl || !tbl->it_map) { + printk(KERN_ERR "%s: expected TCE map for %s\n", __FUNCTION__, + dn->full_name); + return; + } + + /* verify that table contains no entries */ + /* it_size is in entries, and we're examining 64 at a time */ + for (i = 0; i < (tbl->it_size/64); i++) { + if (tbl->it_map[i] != 0) { + printk(KERN_WARNING "%s: Unexpected TCEs for %s\n", + __FUNCTION__, dn->full_name); + break; + } + } + + /* calculate bitmap size in bytes */ + bitmap_sz = (tbl->it_size + 7) / 8; + + /* free bitmap */ + order = get_order(bitmap_sz); + free_pages((unsigned long) tbl->it_map, order); + + /* free table */ + kfree(tbl); +} + +/* Creates TCEs for a user provided buffer. The user buffer must be + * contiguous real kernel storage (not vmalloc). The address of the buffer + * passed here is the kernel (virtual) address of the buffer. The buffer + * need not be page aligned, the dma_addr_t returned will point to the same + * byte within the page as vaddr. + */ +dma_addr_t iommu_map_single(struct iommu_table *tbl, void *vaddr, + size_t size, enum dma_data_direction direction) +{ + dma_addr_t dma_handle = DMA_ERROR_CODE; + unsigned long uaddr; + unsigned int npages; + + BUG_ON(direction == DMA_NONE); + + uaddr = (unsigned long)vaddr; + npages = PAGE_ALIGN(uaddr + size) - (uaddr & PAGE_MASK); + npages >>= PAGE_SHIFT; + + if (tbl) { + dma_handle = iommu_alloc(tbl, vaddr, npages, direction, 0); + if (dma_handle == DMA_ERROR_CODE) { + if (printk_ratelimit()) { + printk(KERN_INFO "iommu_alloc failed, " + "tbl %p vaddr %p npages %d\n", + tbl, vaddr, npages); + } + } else + dma_handle |= (uaddr & ~PAGE_MASK); + } + + return dma_handle; +} + +void iommu_unmap_single(struct iommu_table *tbl, dma_addr_t dma_handle, + size_t size, enum dma_data_direction direction) +{ + BUG_ON(direction == DMA_NONE); + + if (tbl) + iommu_free(tbl, dma_handle, (PAGE_ALIGN(dma_handle + size) - + (dma_handle & PAGE_MASK)) >> PAGE_SHIFT); +} + +/* Allocates a contiguous real buffer and creates mappings over it. + * Returns the virtual address of the buffer and sets dma_handle + * to the dma address (mapping) of the first page. + */ +void *iommu_alloc_coherent(struct iommu_table *tbl, size_t size, + dma_addr_t *dma_handle, gfp_t flag) +{ + void *ret = NULL; + dma_addr_t mapping; + unsigned int npages, order; + + size = PAGE_ALIGN(size); + npages = size >> PAGE_SHIFT; + order = get_order(size); + + /* + * Client asked for way too much space. This is checked later + * anyway. It is easier to debug here for the drivers than in + * the tce tables. + */ + if (order >= IOMAP_MAX_ORDER) { + printk("iommu_alloc_consistent size too large: 0x%lx\n", size); + return NULL; + } + + if (!tbl) + return NULL; + + /* Alloc enough pages (and possibly more) */ + ret = (void *)__get_free_pages(flag, order); + if (!ret) + return NULL; + memset(ret, 0, size); + + /* Set up tces to cover the allocated range */ + mapping = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL, order); + if (mapping == DMA_ERROR_CODE) { + free_pages((unsigned long)ret, order); + ret = NULL; + } else + *dma_handle = mapping; + return ret; +} + +void iommu_free_coherent(struct iommu_table *tbl, size_t size, + void *vaddr, dma_addr_t dma_handle) +{ + unsigned int npages; + + if (tbl) { + size = PAGE_ALIGN(size); + npages = size >> PAGE_SHIFT; + iommu_free(tbl, dma_handle, npages); + free_pages((unsigned long)vaddr, get_order(size)); + } +} diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c new file mode 100644 index 00000000000..5a71ed9612f --- /dev/null +++ b/arch/powerpc/kernel/irq.c @@ -0,0 +1,479 @@ +/* + * arch/ppc/kernel/irq.c + * + * Derived from arch/i386/kernel/irq.c + * Copyright (C) 1992 Linus Torvalds + * Adapted from arch/i386 by Gary Thomas + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * Updated and modified by Cort Dougan <cort@fsmlabs.com> + * Copyright (C) 1996-2001 Cort Dougan + * Adapted for Power Macintosh by Paul Mackerras + * Copyright (C) 1996 Paul Mackerras (paulus@cs.anu.edu.au) + * Amiga/APUS changes by Jesper Skov (jskov@cygnus.co.uk). + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * This file contains the code used by various IRQ handling routines: + * asking for different IRQ's should be done through these routines + * instead of just grabbing them. Thus setups with different IRQ numbers + * shouldn't result in any weird surprises, and installing new handlers + * should be easier. + * + * The MPC8xx has an interrupt mask in the SIU. If a bit is set, the + * interrupt is _enabled_. As expected, IRQ0 is bit 0 in the 32-bit + * mask register (of which only 16 are defined), hence the weird shifting + * and complement of the cached_irq_mask. I want to be able to stuff + * this right into the SIU SMASK register. + * Many of the prep/chrp functions are conditional compiled on CONFIG_8xx + * to reduce code space and undefined function references. + */ + +#include <linux/errno.h> +#include <linux/module.h> +#include <linux/threads.h> +#include <linux/kernel_stat.h> +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/ptrace.h> +#include <linux/ioport.h> +#include <linux/interrupt.h> +#include <linux/timex.h> +#include <linux/config.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/pci.h> +#include <linux/delay.h> +#include <linux/irq.h> +#include <linux/proc_fs.h> +#include <linux/random.h> +#include <linux/seq_file.h> +#include <linux/cpumask.h> +#include <linux/profile.h> +#include <linux/bitops.h> +#ifdef CONFIG_PPC64 +#include <linux/kallsyms.h> +#endif + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/io.h> +#include <asm/pgtable.h> +#include <asm/irq.h> +#include <asm/cache.h> +#include <asm/prom.h> +#include <asm/ptrace.h> +#include <asm/machdep.h> +#ifdef CONFIG_PPC64 +#include <asm/iseries/it_lp_queue.h> +#include <asm/paca.h> +#endif + +int __irq_offset_value; +#ifdef CONFIG_PPC32 +EXPORT_SYMBOL(__irq_offset_value); +#endif + +static int ppc_spurious_interrupts; + +#if defined(CONFIG_PPC_ISERIES) && defined(CONFIG_SMP) +extern void iSeries_smp_message_recv(struct pt_regs *); +#endif + +#ifdef CONFIG_PPC32 +#define NR_MASK_WORDS ((NR_IRQS + 31) / 32) + +unsigned long ppc_cached_irq_mask[NR_MASK_WORDS]; +atomic_t ppc_n_lost_interrupts; + +#ifdef CONFIG_TAU_INT +extern int tau_initialized; +extern int tau_interrupts(int); +#endif + +#if defined(CONFIG_SMP) && !defined(CONFIG_PPC_MERGE) +extern atomic_t ipi_recv; +extern atomic_t ipi_sent; +#endif +#endif /* CONFIG_PPC32 */ + +#ifdef CONFIG_PPC64 +EXPORT_SYMBOL(irq_desc); + +int distribute_irqs = 1; +u64 ppc64_interrupt_controller; +#endif /* CONFIG_PPC64 */ + +int show_interrupts(struct seq_file *p, void *v) +{ + int i = *(loff_t *)v, j; + struct irqaction *action; + irq_desc_t *desc; + unsigned long flags; + + if (i == 0) { + seq_puts(p, " "); + for_each_online_cpu(j) + seq_printf(p, "CPU%d ", j); + seq_putc(p, '\n'); + } + + if (i < NR_IRQS) { + desc = get_irq_desc(i); + spin_lock_irqsave(&desc->lock, flags); + action = desc->action; + if (!action || !action->handler) + goto skip; + seq_printf(p, "%3d: ", i); +#ifdef CONFIG_SMP + for_each_online_cpu(j) + seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); +#else + seq_printf(p, "%10u ", kstat_irqs(i)); +#endif /* CONFIG_SMP */ + if (desc->handler) + seq_printf(p, " %s ", desc->handler->typename); + else + seq_puts(p, " None "); + seq_printf(p, "%s", (desc->status & IRQ_LEVEL) ? "Level " : "Edge "); + seq_printf(p, " %s", action->name); + for (action = action->next; action; action = action->next) + seq_printf(p, ", %s", action->name); + seq_putc(p, '\n'); +skip: + spin_unlock_irqrestore(&desc->lock, flags); + } else if (i == NR_IRQS) { +#ifdef CONFIG_PPC32 +#ifdef CONFIG_TAU_INT + if (tau_initialized){ + seq_puts(p, "TAU: "); + for (j = 0; j < NR_CPUS; j++) + if (cpu_online(j)) + seq_printf(p, "%10u ", tau_interrupts(j)); + seq_puts(p, " PowerPC Thermal Assist (cpu temp)\n"); + } +#endif +#if defined(CONFIG_SMP) && !defined(CONFIG_PPC_MERGE) + /* should this be per processor send/receive? */ + seq_printf(p, "IPI (recv/sent): %10u/%u\n", + atomic_read(&ipi_recv), atomic_read(&ipi_sent)); +#endif +#endif /* CONFIG_PPC32 */ + seq_printf(p, "BAD: %10u\n", ppc_spurious_interrupts); + } + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU +void fixup_irqs(cpumask_t map) +{ + unsigned int irq; + static int warned; + + for_each_irq(irq) { + cpumask_t mask; + + if (irq_desc[irq].status & IRQ_PER_CPU) + continue; + + cpus_and(mask, irq_affinity[irq], map); + if (any_online_cpu(mask) == NR_CPUS) { + printk("Breaking affinity for irq %i\n", irq); + mask = map; + } + if (irq_desc[irq].handler->set_affinity) + irq_desc[irq].handler->set_affinity(irq, mask); + else if (irq_desc[irq].action && !(warned++)) + printk("Cannot set affinity for irq %i\n", irq); + } + + local_irq_enable(); + mdelay(1); + local_irq_disable(); +} +#endif + +#ifdef CONFIG_PPC_ISERIES +void do_IRQ(struct pt_regs *regs) +{ + struct paca_struct *lpaca; + + irq_enter(); + +#ifdef CONFIG_DEBUG_STACKOVERFLOW + /* Debugging check for stack overflow: is there less than 2KB free? */ + { + long sp; + + sp = __get_SP() & (THREAD_SIZE-1); + + if (unlikely(sp < (sizeof(struct thread_info) + 2048))) { + printk("do_IRQ: stack overflow: %ld\n", + sp - sizeof(struct thread_info)); + dump_stack(); + } + } +#endif + + lpaca = get_paca(); +#ifdef CONFIG_SMP + if (lpaca->lppaca.int_dword.fields.ipi_cnt) { + lpaca->lppaca.int_dword.fields.ipi_cnt = 0; + iSeries_smp_message_recv(regs); + } +#endif /* CONFIG_SMP */ + if (hvlpevent_is_pending()) + process_hvlpevents(regs); + + irq_exit(); + + if (lpaca->lppaca.int_dword.fields.decr_int) { + lpaca->lppaca.int_dword.fields.decr_int = 0; + /* Signal a fake decrementer interrupt */ + timer_interrupt(regs); + } +} + +#else /* CONFIG_PPC_ISERIES */ + +void do_IRQ(struct pt_regs *regs) +{ + int irq; +#ifdef CONFIG_IRQSTACKS + struct thread_info *curtp, *irqtp; +#endif + + irq_enter(); + +#ifdef CONFIG_DEBUG_STACKOVERFLOW + /* Debugging check for stack overflow: is there less than 2KB free? */ + { + long sp; + + sp = __get_SP() & (THREAD_SIZE-1); + + if (unlikely(sp < (sizeof(struct thread_info) + 2048))) { + printk("do_IRQ: stack overflow: %ld\n", + sp - sizeof(struct thread_info)); + dump_stack(); + } + } +#endif + + /* + * Every platform is required to implement ppc_md.get_irq. + * This function will either return an irq number or -1 to + * indicate there are no more pending. + * The value -2 is for buggy hardware and means that this IRQ + * has already been handled. -- Tom + */ + irq = ppc_md.get_irq(regs); + + if (irq >= 0) { +#ifdef CONFIG_IRQSTACKS + /* Switch to the irq stack to handle this */ + curtp = current_thread_info(); + irqtp = hardirq_ctx[smp_processor_id()]; + if (curtp != irqtp) { + irqtp->task = curtp->task; + irqtp->flags = 0; + call___do_IRQ(irq, regs, irqtp); + irqtp->task = NULL; + if (irqtp->flags) + set_bits(irqtp->flags, &curtp->flags); + } else +#endif + __do_IRQ(irq, regs); + } else +#ifdef CONFIG_PPC32 + if (irq != -2) +#endif + /* That's not SMP safe ... but who cares ? */ + ppc_spurious_interrupts++; + irq_exit(); +} + +#endif /* CONFIG_PPC_ISERIES */ + +void __init init_IRQ(void) +{ +#ifdef CONFIG_PPC64 + static int once = 0; + + if (once) + return; + + once++; + +#endif + ppc_md.init_IRQ(); +#ifdef CONFIG_PPC64 + irq_ctx_init(); +#endif +} + +#ifdef CONFIG_PPC64 +/* + * Virtual IRQ mapping code, used on systems with XICS interrupt controllers. + */ + +#define UNDEFINED_IRQ 0xffffffff +unsigned int virt_irq_to_real_map[NR_IRQS]; + +/* + * Don't use virtual irqs 0, 1, 2 for devices. + * The pcnet32 driver considers interrupt numbers < 2 to be invalid, + * and 2 is the XICS IPI interrupt. + * We limit virtual irqs to 17 less than NR_IRQS so that when we + * offset them by 16 (to reserve the first 16 for ISA interrupts) + * we don't end up with an interrupt number >= NR_IRQS. + */ +#define MIN_VIRT_IRQ 3 +#define MAX_VIRT_IRQ (NR_IRQS - NUM_ISA_INTERRUPTS - 1) +#define NR_VIRT_IRQS (MAX_VIRT_IRQ - MIN_VIRT_IRQ + 1) + +void +virt_irq_init(void) +{ + int i; + for (i = 0; i < NR_IRQS; i++) + virt_irq_to_real_map[i] = UNDEFINED_IRQ; +} + +/* Create a mapping for a real_irq if it doesn't already exist. + * Return the virtual irq as a convenience. + */ +int virt_irq_create_mapping(unsigned int real_irq) +{ + unsigned int virq, first_virq; + static int warned; + + if (ppc64_interrupt_controller == IC_OPEN_PIC) + return real_irq; /* no mapping for openpic (for now) */ + + if (ppc64_interrupt_controller == IC_CELL_PIC) + return real_irq; /* no mapping for iic either */ + + /* don't map interrupts < MIN_VIRT_IRQ */ + if (real_irq < MIN_VIRT_IRQ) { + virt_irq_to_real_map[real_irq] = real_irq; + return real_irq; + } + + /* map to a number between MIN_VIRT_IRQ and MAX_VIRT_IRQ */ + virq = real_irq; + if (virq > MAX_VIRT_IRQ) + virq = (virq % NR_VIRT_IRQS) + MIN_VIRT_IRQ; + + /* search for this number or a free slot */ + first_virq = virq; + while (virt_irq_to_real_map[virq] != UNDEFINED_IRQ) { + if (virt_irq_to_real_map[virq] == real_irq) + return virq; + if (++virq > MAX_VIRT_IRQ) + virq = MIN_VIRT_IRQ; + if (virq == first_virq) + goto nospace; /* oops, no free slots */ + } + + virt_irq_to_real_map[virq] = real_irq; + return virq; + + nospace: + if (!warned) { + printk(KERN_CRIT "Interrupt table is full\n"); + printk(KERN_CRIT "Increase NR_IRQS (currently %d) " + "in your kernel sources and rebuild.\n", NR_IRQS); + warned = 1; + } + return NO_IRQ; +} + +/* + * In most cases will get a hit on the very first slot checked in the + * virt_irq_to_real_map. Only when there are a large number of + * IRQs will this be expensive. + */ +unsigned int real_irq_to_virt_slowpath(unsigned int real_irq) +{ + unsigned int virq; + unsigned int first_virq; + + virq = real_irq; + + if (virq > MAX_VIRT_IRQ) + virq = (virq % NR_VIRT_IRQS) + MIN_VIRT_IRQ; + + first_virq = virq; + + do { + if (virt_irq_to_real_map[virq] == real_irq) + return virq; + + virq++; + + if (virq >= MAX_VIRT_IRQ) + virq = 0; + + } while (first_virq != virq); + + return NO_IRQ; + +} + +#ifdef CONFIG_IRQSTACKS +struct thread_info *softirq_ctx[NR_CPUS]; +struct thread_info *hardirq_ctx[NR_CPUS]; + +void irq_ctx_init(void) +{ + struct thread_info *tp; + int i; + + for_each_cpu(i) { + memset((void *)softirq_ctx[i], 0, THREAD_SIZE); + tp = softirq_ctx[i]; + tp->cpu = i; + tp->preempt_count = SOFTIRQ_OFFSET; + + memset((void *)hardirq_ctx[i], 0, THREAD_SIZE); + tp = hardirq_ctx[i]; + tp->cpu = i; + tp->preempt_count = HARDIRQ_OFFSET; + } +} + +void do_softirq(void) +{ + unsigned long flags; + struct thread_info *curtp, *irqtp; + + if (in_interrupt()) + return; + + local_irq_save(flags); + + if (local_softirq_pending()) { + curtp = current_thread_info(); + irqtp = softirq_ctx[smp_processor_id()]; + irqtp->task = curtp->task; + call_do_softirq(irqtp); + irqtp->task = NULL; + } + + local_irq_restore(flags); +} +EXPORT_SYMBOL(do_softirq); + +#endif /* CONFIG_IRQSTACKS */ + +static int __init setup_noirqdistrib(char *str) +{ + distribute_irqs = 0; + return 1; +} + +__setup("noirqdistrib", setup_noirqdistrib); +#endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c new file mode 100644 index 00000000000..511af54e623 --- /dev/null +++ b/arch/powerpc/kernel/kprobes.c @@ -0,0 +1,459 @@ +/* + * Kernel Probes (KProbes) + * arch/ppc64/kernel/kprobes.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2002, 2004 + * + * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel + * Probes initial implementation ( includes contributions from + * Rusty Russell). + * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes + * interface to access function arguments. + * 2004-Nov Ananth N Mavinakayanahalli <ananth@in.ibm.com> kprobes port + * for PPC64 + */ + +#include <linux/config.h> +#include <linux/kprobes.h> +#include <linux/ptrace.h> +#include <linux/preempt.h> +#include <asm/cacheflush.h> +#include <asm/kdebug.h> +#include <asm/sstep.h> + +static DECLARE_MUTEX(kprobe_mutex); +DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; +DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); + +int __kprobes arch_prepare_kprobe(struct kprobe *p) +{ + int ret = 0; + kprobe_opcode_t insn = *p->addr; + + if ((unsigned long)p->addr & 0x03) { + printk("Attempt to register kprobe at an unaligned address\n"); + ret = -EINVAL; + } else if (IS_MTMSRD(insn) || IS_RFID(insn)) { + printk("Cannot register a kprobe on rfid or mtmsrd\n"); + ret = -EINVAL; + } + + /* insn must be on a special executable page on ppc64 */ + if (!ret) { + down(&kprobe_mutex); + p->ainsn.insn = get_insn_slot(); + up(&kprobe_mutex); + if (!p->ainsn.insn) + ret = -ENOMEM; + } + return ret; +} + +void __kprobes arch_copy_kprobe(struct kprobe *p) +{ + memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + p->opcode = *p->addr; +} + +void __kprobes arch_arm_kprobe(struct kprobe *p) +{ + *p->addr = BREAKPOINT_INSTRUCTION; + flush_icache_range((unsigned long) p->addr, + (unsigned long) p->addr + sizeof(kprobe_opcode_t)); +} + +void __kprobes arch_disarm_kprobe(struct kprobe *p) +{ + *p->addr = p->opcode; + flush_icache_range((unsigned long) p->addr, + (unsigned long) p->addr + sizeof(kprobe_opcode_t)); +} + +void __kprobes arch_remove_kprobe(struct kprobe *p) +{ + down(&kprobe_mutex); + free_insn_slot(p->ainsn.insn); + up(&kprobe_mutex); +} + +static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs) +{ + kprobe_opcode_t insn = *p->ainsn.insn; + + regs->msr |= MSR_SE; + + /* single step inline if it is a trap variant */ + if (is_trap(insn)) + regs->nip = (unsigned long)p->addr; + else + regs->nip = (unsigned long)p->ainsn.insn; +} + +static inline void save_previous_kprobe(struct kprobe_ctlblk *kcb) +{ + kcb->prev_kprobe.kp = kprobe_running(); + kcb->prev_kprobe.status = kcb->kprobe_status; + kcb->prev_kprobe.saved_msr = kcb->kprobe_saved_msr; +} + +static inline void restore_previous_kprobe(struct kprobe_ctlblk *kcb) +{ + __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp; + kcb->kprobe_status = kcb->prev_kprobe.status; + kcb->kprobe_saved_msr = kcb->prev_kprobe.saved_msr; +} + +static inline void set_current_kprobe(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + __get_cpu_var(current_kprobe) = p; + kcb->kprobe_saved_msr = regs->msr; +} + +/* Called with kretprobe_lock held */ +void __kprobes arch_prepare_kretprobe(struct kretprobe *rp, + struct pt_regs *regs) +{ + struct kretprobe_instance *ri; + + if ((ri = get_free_rp_inst(rp)) != NULL) { + ri->rp = rp; + ri->task = current; + ri->ret_addr = (kprobe_opcode_t *)regs->link; + + /* Replace the return addr with trampoline addr */ + regs->link = (unsigned long)kretprobe_trampoline; + add_rp_inst(ri); + } else { + rp->nmissed++; + } +} + +static inline int kprobe_handler(struct pt_regs *regs) +{ + struct kprobe *p; + int ret = 0; + unsigned int *addr = (unsigned int *)regs->nip; + struct kprobe_ctlblk *kcb; + + /* + * We don't want to be preempted for the entire + * duration of kprobe processing + */ + preempt_disable(); + kcb = get_kprobe_ctlblk(); + + /* Check we're not actually recursing */ + if (kprobe_running()) { + p = get_kprobe(addr); + if (p) { + kprobe_opcode_t insn = *p->ainsn.insn; + if (kcb->kprobe_status == KPROBE_HIT_SS && + is_trap(insn)) { + regs->msr &= ~MSR_SE; + regs->msr |= kcb->kprobe_saved_msr; + goto no_kprobe; + } + /* We have reentered the kprobe_handler(), since + * another probe was hit while within the handler. + * We here save the original kprobes variables and + * just single step on the instruction of the new probe + * without calling any user handlers. + */ + save_previous_kprobe(kcb); + set_current_kprobe(p, regs, kcb); + kcb->kprobe_saved_msr = regs->msr; + p->nmissed++; + prepare_singlestep(p, regs); + kcb->kprobe_status = KPROBE_REENTER; + return 1; + } else { + p = __get_cpu_var(current_kprobe); + if (p->break_handler && p->break_handler(p, regs)) { + goto ss_probe; + } + } + goto no_kprobe; + } + + p = get_kprobe(addr); + if (!p) { + if (*addr != BREAKPOINT_INSTRUCTION) { + /* + * PowerPC has multiple variants of the "trap" + * instruction. If the current instruction is a + * trap variant, it could belong to someone else + */ + kprobe_opcode_t cur_insn = *addr; + if (is_trap(cur_insn)) + goto no_kprobe; + /* + * The breakpoint instruction was removed right + * after we hit it. Another cpu has removed + * either a probepoint or a debugger breakpoint + * at this address. In either case, no further + * handling of this interrupt is appropriate. + */ + ret = 1; + } + /* Not one of ours: let kernel handle it */ + goto no_kprobe; + } + + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + set_current_kprobe(p, regs, kcb); + if (p->pre_handler && p->pre_handler(p, regs)) + /* handler has already set things up, so skip ss setup */ + return 1; + +ss_probe: + prepare_singlestep(p, regs); + kcb->kprobe_status = KPROBE_HIT_SS; + return 1; + +no_kprobe: + preempt_enable_no_resched(); + return ret; +} + +/* + * Function return probe trampoline: + * - init_kprobes() establishes a probepoint here + * - When the probed function returns, this probe + * causes the handlers to fire + */ +void kretprobe_trampoline_holder(void) +{ + asm volatile(".global kretprobe_trampoline\n" + "kretprobe_trampoline:\n" + "nop\n"); +} + +/* + * Called when the probe at kretprobe trampoline is hit + */ +int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct kretprobe_instance *ri = NULL; + struct hlist_head *head; + struct hlist_node *node, *tmp; + unsigned long flags, orig_ret_address = 0; + unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline; + + spin_lock_irqsave(&kretprobe_lock, flags); + head = kretprobe_inst_table_head(current); + + /* + * It is possible to have multiple instances associated with a given + * task either because an multiple functions in the call path + * have a return probe installed on them, and/or more then one return + * return probe was registered for a target function. + * + * We can handle this because: + * - instances are always inserted at the head of the list + * - when multiple return probes are registered for the same + * function, the first instance's ret_addr will point to the + * real return address, and all the rest will point to + * kretprobe_trampoline + */ + hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + + if (ri->rp && ri->rp->handler) + ri->rp->handler(ri, regs); + + orig_ret_address = (unsigned long)ri->ret_addr; + recycle_rp_inst(ri); + + if (orig_ret_address != trampoline_address) + /* + * This is the real return address. Any other + * instances associated with this task are for + * other calls deeper on the call stack + */ + break; + } + + BUG_ON(!orig_ret_address || (orig_ret_address == trampoline_address)); + regs->nip = orig_ret_address; + + reset_current_kprobe(); + spin_unlock_irqrestore(&kretprobe_lock, flags); + preempt_enable_no_resched(); + + /* + * By returning a non-zero value, we are telling + * kprobe_handler() that we don't want the post_handler + * to run (and have re-enabled preemption) + */ + return 1; +} + +/* + * Called after single-stepping. p->addr is the address of the + * instruction whose first byte has been replaced by the "breakpoint" + * instruction. To avoid the SMP problems that can occur when we + * temporarily put back the original opcode to single-step, we + * single-stepped a copy of the instruction. The address of this + * copy is p->ainsn.insn. + */ +static void __kprobes resume_execution(struct kprobe *p, struct pt_regs *regs) +{ + int ret; + unsigned int insn = *p->ainsn.insn; + + regs->nip = (unsigned long)p->addr; + ret = emulate_step(regs, insn); + if (ret == 0) + regs->nip = (unsigned long)p->addr + 4; +} + +static inline int post_kprobe_handler(struct pt_regs *regs) +{ + struct kprobe *cur = kprobe_running(); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + if (!cur) + return 0; + + if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { + kcb->kprobe_status = KPROBE_HIT_SSDONE; + cur->post_handler(cur, regs, 0); + } + + resume_execution(cur, regs); + regs->msr |= kcb->kprobe_saved_msr; + + /*Restore back the original saved kprobes variables and continue. */ + if (kcb->kprobe_status == KPROBE_REENTER) { + restore_previous_kprobe(kcb); + goto out; + } + reset_current_kprobe(); +out: + preempt_enable_no_resched(); + + /* + * if somebody else is singlestepping across a probe point, msr + * will have SE set, in which case, continue the remaining processing + * of do_debug, as if this is not a probe hit. + */ + if (regs->msr & MSR_SE) + return 0; + + return 1; +} + +static inline int kprobe_fault_handler(struct pt_regs *regs, int trapnr) +{ + struct kprobe *cur = kprobe_running(); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr)) + return 1; + + if (kcb->kprobe_status & KPROBE_HIT_SS) { + resume_execution(cur, regs); + regs->msr &= ~MSR_SE; + regs->msr |= kcb->kprobe_saved_msr; + + reset_current_kprobe(); + preempt_enable_no_resched(); + } + return 0; +} + +/* + * Wrapper routine to for handling exceptions. + */ +int __kprobes kprobe_exceptions_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct die_args *args = (struct die_args *)data; + int ret = NOTIFY_DONE; + + switch (val) { + case DIE_BPT: + if (kprobe_handler(args->regs)) + ret = NOTIFY_STOP; + break; + case DIE_SSTEP: + if (post_kprobe_handler(args->regs)) + ret = NOTIFY_STOP; + break; + case DIE_PAGE_FAULT: + /* kprobe_running() needs smp_processor_id() */ + preempt_disable(); + if (kprobe_running() && + kprobe_fault_handler(args->regs, args->trapnr)) + ret = NOTIFY_STOP; + preempt_enable(); + break; + default: + break; + } + return ret; +} + +int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct jprobe *jp = container_of(p, struct jprobe, kp); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + memcpy(&kcb->jprobe_saved_regs, regs, sizeof(struct pt_regs)); + + /* setup return addr to the jprobe handler routine */ + regs->nip = (unsigned long)(((func_descr_t *)jp->entry)->entry); + regs->gpr[2] = (unsigned long)(((func_descr_t *)jp->entry)->toc); + + return 1; +} + +void __kprobes jprobe_return(void) +{ + asm volatile("trap" ::: "memory"); +} + +void __kprobes jprobe_return_end(void) +{ +}; + +int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + /* + * FIXME - we should ideally be validating that we got here 'cos + * of the "trap" in jprobe_return() above, before restoring the + * saved regs... + */ + memcpy(regs, &kcb->jprobe_saved_regs, sizeof(struct pt_regs)); + preempt_enable_no_resched(); + return 1; +} + +static struct kprobe trampoline_p = { + .addr = (kprobe_opcode_t *) &kretprobe_trampoline, + .pre_handler = trampoline_probe_handler +}; + +int __init arch_init_kprobes(void) +{ + return register_kprobe(&trampoline_p); +} diff --git a/arch/powerpc/kernel/lparcfg.c b/arch/powerpc/kernel/lparcfg.c new file mode 100644 index 00000000000..9dda16ccde7 --- /dev/null +++ b/arch/powerpc/kernel/lparcfg.c @@ -0,0 +1,608 @@ +/* + * PowerPC64 LPAR Configuration Information Driver + * + * Dave Engebretsen engebret@us.ibm.com + * Copyright (c) 2003 Dave Engebretsen + * Will Schmidt willschm@us.ibm.com + * SPLPAR updates, Copyright (c) 2003 Will Schmidt IBM Corporation. + * seq_file updates, Copyright (c) 2004 Will Schmidt IBM Corporation. + * Nathan Lynch nathanl@austin.ibm.com + * Added lparcfg_write, Copyright (C) 2004 Nathan Lynch IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * This driver creates a proc file at /proc/ppc64/lparcfg which contains + * keyword - value pairs that specify the configuration of the partition. + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/proc_fs.h> +#include <linux/init.h> +#include <linux/seq_file.h> +#include <asm/uaccess.h> +#include <asm/iseries/hv_lp_config.h> +#include <asm/lppaca.h> +#include <asm/hvcall.h> +#include <asm/firmware.h> +#include <asm/rtas.h> +#include <asm/system.h> +#include <asm/time.h> +#include <asm/iseries/it_exp_vpd_panel.h> +#include <asm/prom.h> +#include <asm/vdso_datapage.h> + +#define MODULE_VERS "1.6" +#define MODULE_NAME "lparcfg" + +/* #define LPARCFG_DEBUG */ + +static struct proc_dir_entry *proc_ppc64_lparcfg; +#define LPARCFG_BUFF_SIZE 4096 + +#ifdef CONFIG_PPC_ISERIES + +/* + * For iSeries legacy systems, the PPA purr function is available from the + * emulated_time_base field in the paca. + */ +static unsigned long get_purr(void) +{ + unsigned long sum_purr = 0; + int cpu; + struct paca_struct *lpaca; + + for_each_cpu(cpu) { + lpaca = paca + cpu; + sum_purr += lpaca->lppaca.emulated_time_base; + +#ifdef PURR_DEBUG + printk(KERN_INFO "get_purr for cpu (%d) has value (%ld) \n", + cpu, lpaca->lppaca.emulated_time_base); +#endif + } + return sum_purr; +} + +#define lparcfg_write NULL + +/* + * Methods used to fetch LPAR data when running on an iSeries platform. + */ +static int lparcfg_data(struct seq_file *m, void *v) +{ + unsigned long pool_id, lp_index; + int shared, entitled_capacity, max_entitled_capacity; + int processors, max_processors; + struct paca_struct *lpaca = get_paca(); + unsigned long purr = get_purr(); + + seq_printf(m, "%s %s \n", MODULE_NAME, MODULE_VERS); + + shared = (int)(lpaca->lppaca_ptr->shared_proc); + seq_printf(m, "serial_number=%c%c%c%c%c%c%c\n", + e2a(xItExtVpdPanel.mfgID[2]), + e2a(xItExtVpdPanel.mfgID[3]), + e2a(xItExtVpdPanel.systemSerial[1]), + e2a(xItExtVpdPanel.systemSerial[2]), + e2a(xItExtVpdPanel.systemSerial[3]), + e2a(xItExtVpdPanel.systemSerial[4]), + e2a(xItExtVpdPanel.systemSerial[5])); + + seq_printf(m, "system_type=%c%c%c%c\n", + e2a(xItExtVpdPanel.machineType[0]), + e2a(xItExtVpdPanel.machineType[1]), + e2a(xItExtVpdPanel.machineType[2]), + e2a(xItExtVpdPanel.machineType[3])); + + lp_index = HvLpConfig_getLpIndex(); + seq_printf(m, "partition_id=%d\n", (int)lp_index); + + seq_printf(m, "system_active_processors=%d\n", + (int)HvLpConfig_getSystemPhysicalProcessors()); + + seq_printf(m, "system_potential_processors=%d\n", + (int)HvLpConfig_getSystemPhysicalProcessors()); + + processors = (int)HvLpConfig_getPhysicalProcessors(); + seq_printf(m, "partition_active_processors=%d\n", processors); + + max_processors = (int)HvLpConfig_getMaxPhysicalProcessors(); + seq_printf(m, "partition_potential_processors=%d\n", max_processors); + + if (shared) { + entitled_capacity = HvLpConfig_getSharedProcUnits(); + max_entitled_capacity = HvLpConfig_getMaxSharedProcUnits(); + } else { + entitled_capacity = processors * 100; + max_entitled_capacity = max_processors * 100; + } + seq_printf(m, "partition_entitled_capacity=%d\n", entitled_capacity); + + seq_printf(m, "partition_max_entitled_capacity=%d\n", + max_entitled_capacity); + + if (shared) { + pool_id = HvLpConfig_getSharedPoolIndex(); + seq_printf(m, "pool=%d\n", (int)pool_id); + seq_printf(m, "pool_capacity=%d\n", + (int)(HvLpConfig_getNumProcsInSharedPool(pool_id) * + 100)); + seq_printf(m, "purr=%ld\n", purr); + } + + seq_printf(m, "shared_processor_mode=%d\n", shared); + + return 0; +} +#endif /* CONFIG_PPC_ISERIES */ + +#ifdef CONFIG_PPC_PSERIES +/* + * Methods used to fetch LPAR data when running on a pSeries platform. + */ +/* find a better place for this function... */ +static void log_plpar_hcall_return(unsigned long rc, char *tag) +{ + if (rc == 0) /* success, return */ + return; +/* check for null tag ? */ + if (rc == H_Hardware) + printk(KERN_INFO + "plpar-hcall (%s) failed with hardware fault\n", tag); + else if (rc == H_Function) + printk(KERN_INFO + "plpar-hcall (%s) failed; function not allowed\n", tag); + else if (rc == H_Authority) + printk(KERN_INFO + "plpar-hcall (%s) failed; not authorized to this function\n", + tag); + else if (rc == H_Parameter) + printk(KERN_INFO "plpar-hcall (%s) failed; Bad parameter(s)\n", + tag); + else + printk(KERN_INFO + "plpar-hcall (%s) failed with unexpected rc(0x%lx)\n", + tag, rc); + +} + +/* + * H_GET_PPP hcall returns info in 4 parms. + * entitled_capacity,unallocated_capacity, + * aggregation, resource_capability). + * + * R4 = Entitled Processor Capacity Percentage. + * R5 = Unallocated Processor Capacity Percentage. + * R6 (AABBCCDDEEFFGGHH). + * XXXX - reserved (0) + * XXXX - reserved (0) + * XXXX - Group Number + * XXXX - Pool Number. + * R7 (IIJJKKLLMMNNOOPP). + * XX - reserved. (0) + * XX - bit 0-6 reserved (0). bit 7 is Capped indicator. + * XX - variable processor Capacity Weight + * XX - Unallocated Variable Processor Capacity Weight. + * XXXX - Active processors in Physical Processor Pool. + * XXXX - Processors active on platform. + */ +static unsigned int h_get_ppp(unsigned long *entitled, + unsigned long *unallocated, + unsigned long *aggregation, + unsigned long *resource) +{ + unsigned long rc; + rc = plpar_hcall_4out(H_GET_PPP, 0, 0, 0, 0, entitled, unallocated, + aggregation, resource); + + log_plpar_hcall_return(rc, "H_GET_PPP"); + + return rc; +} + +static void h_pic(unsigned long *pool_idle_time, unsigned long *num_procs) +{ + unsigned long rc; + unsigned long dummy; + rc = plpar_hcall(H_PIC, 0, 0, 0, 0, pool_idle_time, num_procs, &dummy); + + if (rc != H_Authority) + log_plpar_hcall_return(rc, "H_PIC"); +} + +/* Track sum of all purrs across all processors. This is used to further */ +/* calculate usage values by different applications */ + +static unsigned long get_purr(void) +{ + unsigned long sum_purr = 0; + int cpu; + struct cpu_usage *cu; + + for_each_cpu(cpu) { + cu = &per_cpu(cpu_usage_array, cpu); + sum_purr += cu->current_tb; + } + return sum_purr; +} + +#define SPLPAR_CHARACTERISTICS_TOKEN 20 +#define SPLPAR_MAXLENGTH 1026*(sizeof(char)) + +/* + * parse_system_parameter_string() + * Retrieve the potential_processors, max_entitled_capacity and friends + * through the get-system-parameter rtas call. Replace keyword strings as + * necessary. + */ +static void parse_system_parameter_string(struct seq_file *m) +{ + int call_status; + + char *local_buffer = kmalloc(SPLPAR_MAXLENGTH, GFP_KERNEL); + if (!local_buffer) { + printk(KERN_ERR "%s %s kmalloc failure at line %d \n", + __FILE__, __FUNCTION__, __LINE__); + return; + } + + spin_lock(&rtas_data_buf_lock); + memset(rtas_data_buf, 0, SPLPAR_MAXLENGTH); + call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1, + NULL, + SPLPAR_CHARACTERISTICS_TOKEN, + __pa(rtas_data_buf)); + memcpy(local_buffer, rtas_data_buf, SPLPAR_MAXLENGTH); + spin_unlock(&rtas_data_buf_lock); + + if (call_status != 0) { + printk(KERN_INFO + "%s %s Error calling get-system-parameter (0x%x)\n", + __FILE__, __FUNCTION__, call_status); + } else { + int splpar_strlen; + int idx, w_idx; + char *workbuffer = kmalloc(SPLPAR_MAXLENGTH, GFP_KERNEL); + if (!workbuffer) { + printk(KERN_ERR "%s %s kmalloc failure at line %d \n", + __FILE__, __FUNCTION__, __LINE__); + kfree(local_buffer); + return; + } +#ifdef LPARCFG_DEBUG + printk(KERN_INFO "success calling get-system-parameter \n"); +#endif + splpar_strlen = local_buffer[0] * 16 + local_buffer[1]; + local_buffer += 2; /* step over strlen value */ + + memset(workbuffer, 0, SPLPAR_MAXLENGTH); + w_idx = 0; + idx = 0; + while ((*local_buffer) && (idx < splpar_strlen)) { + workbuffer[w_idx++] = local_buffer[idx++]; + if ((local_buffer[idx] == ',') + || (local_buffer[idx] == '\0')) { + workbuffer[w_idx] = '\0'; + if (w_idx) { + /* avoid the empty string */ + seq_printf(m, "%s\n", workbuffer); + } + memset(workbuffer, 0, SPLPAR_MAXLENGTH); + idx++; /* skip the comma */ + w_idx = 0; + } else if (local_buffer[idx] == '=') { + /* code here to replace workbuffer contents + with different keyword strings */ + if (0 == strcmp(workbuffer, "MaxEntCap")) { + strcpy(workbuffer, + "partition_max_entitled_capacity"); + w_idx = strlen(workbuffer); + } + if (0 == strcmp(workbuffer, "MaxPlatProcs")) { + strcpy(workbuffer, + "system_potential_processors"); + w_idx = strlen(workbuffer); + } + } + } + kfree(workbuffer); + local_buffer -= 2; /* back up over strlen value */ + } + kfree(local_buffer); +} + +/* Return the number of processors in the system. + * This function reads through the device tree and counts + * the virtual processors, this does not include threads. + */ +static int lparcfg_count_active_processors(void) +{ + struct device_node *cpus_dn = NULL; + int count = 0; + + while ((cpus_dn = of_find_node_by_type(cpus_dn, "cpu"))) { +#ifdef LPARCFG_DEBUG + printk(KERN_ERR "cpus_dn %p \n", cpus_dn); +#endif + count++; + } + return count; +} + +static int lparcfg_data(struct seq_file *m, void *v) +{ + int partition_potential_processors; + int partition_active_processors; + struct device_node *rootdn; + const char *model = ""; + const char *system_id = ""; + unsigned int *lp_index_ptr, lp_index = 0; + struct device_node *rtas_node; + int *lrdrp; + + rootdn = find_path_device("/"); + if (rootdn) { + model = get_property(rootdn, "model", NULL); + system_id = get_property(rootdn, "system-id", NULL); + lp_index_ptr = (unsigned int *) + get_property(rootdn, "ibm,partition-no", NULL); + if (lp_index_ptr) + lp_index = *lp_index_ptr; + } + + seq_printf(m, "%s %s \n", MODULE_NAME, MODULE_VERS); + + seq_printf(m, "serial_number=%s\n", system_id); + + seq_printf(m, "system_type=%s\n", model); + + seq_printf(m, "partition_id=%d\n", (int)lp_index); + + rtas_node = find_path_device("/rtas"); + lrdrp = (int *)get_property(rtas_node, "ibm,lrdr-capacity", NULL); + + if (lrdrp == NULL) { + partition_potential_processors = vdso_data->processorCount; + } else { + partition_potential_processors = *(lrdrp + 4); + } + + partition_active_processors = lparcfg_count_active_processors(); + + if (firmware_has_feature(FW_FEATURE_SPLPAR)) { + unsigned long h_entitled, h_unallocated; + unsigned long h_aggregation, h_resource; + unsigned long pool_idle_time, pool_procs; + unsigned long purr; + + h_get_ppp(&h_entitled, &h_unallocated, &h_aggregation, + &h_resource); + + seq_printf(m, "R4=0x%lx\n", h_entitled); + seq_printf(m, "R5=0x%lx\n", h_unallocated); + seq_printf(m, "R6=0x%lx\n", h_aggregation); + seq_printf(m, "R7=0x%lx\n", h_resource); + + purr = get_purr(); + + /* this call handles the ibm,get-system-parameter contents */ + parse_system_parameter_string(m); + + seq_printf(m, "partition_entitled_capacity=%ld\n", h_entitled); + + seq_printf(m, "group=%ld\n", (h_aggregation >> 2 * 8) & 0xffff); + + seq_printf(m, "system_active_processors=%ld\n", + (h_resource >> 0 * 8) & 0xffff); + + /* pool related entries are apropriate for shared configs */ + if (paca[0].lppaca.shared_proc) { + + h_pic(&pool_idle_time, &pool_procs); + + seq_printf(m, "pool=%ld\n", + (h_aggregation >> 0 * 8) & 0xffff); + + /* report pool_capacity in percentage */ + seq_printf(m, "pool_capacity=%ld\n", + ((h_resource >> 2 * 8) & 0xffff) * 100); + + seq_printf(m, "pool_idle_time=%ld\n", pool_idle_time); + + seq_printf(m, "pool_num_procs=%ld\n", pool_procs); + } + + seq_printf(m, "unallocated_capacity_weight=%ld\n", + (h_resource >> 4 * 8) & 0xFF); + + seq_printf(m, "capacity_weight=%ld\n", + (h_resource >> 5 * 8) & 0xFF); + + seq_printf(m, "capped=%ld\n", (h_resource >> 6 * 8) & 0x01); + + seq_printf(m, "unallocated_capacity=%ld\n", h_unallocated); + + seq_printf(m, "purr=%ld\n", purr); + + } else { /* non SPLPAR case */ + + seq_printf(m, "system_active_processors=%d\n", + partition_potential_processors); + + seq_printf(m, "system_potential_processors=%d\n", + partition_potential_processors); + + seq_printf(m, "partition_max_entitled_capacity=%d\n", + partition_potential_processors * 100); + + seq_printf(m, "partition_entitled_capacity=%d\n", + partition_active_processors * 100); + } + + seq_printf(m, "partition_active_processors=%d\n", + partition_active_processors); + + seq_printf(m, "partition_potential_processors=%d\n", + partition_potential_processors); + + seq_printf(m, "shared_processor_mode=%d\n", paca[0].lppaca.shared_proc); + + return 0; +} + +/* + * Interface for changing system parameters (variable capacity weight + * and entitled capacity). Format of input is "param_name=value"; + * anything after value is ignored. Valid parameters at this time are + * "partition_entitled_capacity" and "capacity_weight". We use + * H_SET_PPP to alter parameters. + * + * This function should be invoked only on systems with + * FW_FEATURE_SPLPAR. + */ +static ssize_t lparcfg_write(struct file *file, const char __user * buf, + size_t count, loff_t * off) +{ + char *kbuf; + char *tmp; + u64 new_entitled, *new_entitled_ptr = &new_entitled; + u8 new_weight, *new_weight_ptr = &new_weight; + + unsigned long current_entitled; /* parameters for h_get_ppp */ + unsigned long dummy; + unsigned long resource; + u8 current_weight; + + ssize_t retval = -ENOMEM; + + kbuf = kmalloc(count, GFP_KERNEL); + if (!kbuf) + goto out; + + retval = -EFAULT; + if (copy_from_user(kbuf, buf, count)) + goto out; + + retval = -EINVAL; + kbuf[count - 1] = '\0'; + tmp = strchr(kbuf, '='); + if (!tmp) + goto out; + + *tmp++ = '\0'; + + if (!strcmp(kbuf, "partition_entitled_capacity")) { + char *endp; + *new_entitled_ptr = (u64) simple_strtoul(tmp, &endp, 10); + if (endp == tmp) + goto out; + new_weight_ptr = ¤t_weight; + } else if (!strcmp(kbuf, "capacity_weight")) { + char *endp; + *new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10); + if (endp == tmp) + goto out; + new_entitled_ptr = ¤t_entitled; + } else + goto out; + + /* Get our current parameters */ + retval = h_get_ppp(¤t_entitled, &dummy, &dummy, &resource); + if (retval) { + retval = -EIO; + goto out; + } + + current_weight = (resource >> 5 * 8) & 0xFF; + + pr_debug("%s: current_entitled = %lu, current_weight = %lu\n", + __FUNCTION__, current_entitled, current_weight); + + pr_debug("%s: new_entitled = %lu, new_weight = %lu\n", + __FUNCTION__, *new_entitled_ptr, *new_weight_ptr); + + retval = plpar_hcall_norets(H_SET_PPP, *new_entitled_ptr, + *new_weight_ptr); + + if (retval == H_Success || retval == H_Constrained) { + retval = count; + } else if (retval == H_Busy) { + retval = -EBUSY; + } else if (retval == H_Hardware) { + retval = -EIO; + } else if (retval == H_Parameter) { + retval = -EINVAL; + } else { + printk(KERN_WARNING "%s: received unknown hv return code %ld", + __FUNCTION__, retval); + retval = -EIO; + } + +out: + kfree(kbuf); + return retval; +} + +#endif /* CONFIG_PPC_PSERIES */ + +static int lparcfg_open(struct inode *inode, struct file *file) +{ + return single_open(file, lparcfg_data, NULL); +} + +struct file_operations lparcfg_fops = { + .owner = THIS_MODULE, + .read = seq_read, + .open = lparcfg_open, + .release = single_release, +}; + +int __init lparcfg_init(void) +{ + struct proc_dir_entry *ent; + mode_t mode = S_IRUSR | S_IRGRP | S_IROTH; + + /* Allow writing if we have FW_FEATURE_SPLPAR */ + if (firmware_has_feature(FW_FEATURE_SPLPAR)) { + lparcfg_fops.write = lparcfg_write; + mode |= S_IWUSR; + } + + ent = create_proc_entry("ppc64/lparcfg", mode, NULL); + if (ent) { + ent->proc_fops = &lparcfg_fops; + ent->data = kmalloc(LPARCFG_BUFF_SIZE, GFP_KERNEL); + if (!ent->data) { + printk(KERN_ERR + "Failed to allocate buffer for lparcfg\n"); + remove_proc_entry("lparcfg", ent->parent); + return -ENOMEM; + } + } else { + printk(KERN_ERR "Failed to create ppc64/lparcfg\n"); + return -EIO; + } + + proc_ppc64_lparcfg = ent; + return 0; +} + +void __exit lparcfg_cleanup(void) +{ + if (proc_ppc64_lparcfg) { + kfree(proc_ppc64_lparcfg->data); + remove_proc_entry("lparcfg", proc_ppc64_lparcfg->parent); + } +} + +module_init(lparcfg_init); +module_exit(lparcfg_cleanup); +MODULE_DESCRIPTION("Interface for LPAR configuration data"); +MODULE_AUTHOR("Dave Engebretsen"); +MODULE_LICENSE("GPL"); diff --git a/arch/powerpc/kernel/lparmap.c b/arch/powerpc/kernel/lparmap.c index eded971d1bf..5a05a797485 100644 --- a/arch/powerpc/kernel/lparmap.c +++ b/arch/powerpc/kernel/lparmap.c @@ -25,7 +25,7 @@ const struct LparMap __attribute__((__section__(".text"))) xLparMap = { .xRanges = { { .xPages = HvPagesToMap, .xOffset = 0, - .xVPN = KERNEL_VSID(KERNELBASE) << (SID_SHIFT - PAGE_SHIFT), + .xVPN = KERNEL_VSID(KERNELBASE) << (SID_SHIFT - HW_PAGE_SHIFT), }, }, }; diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c new file mode 100644 index 00000000000..97c51e452be --- /dev/null +++ b/arch/powerpc/kernel/machine_kexec_64.c @@ -0,0 +1,358 @@ +/* + * machine_kexec.c - handle transition of Linux booting another kernel + * + * Copyright (C) 2004-2005, IBM Corp. + * + * Created by: Milton D Miller II + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + + +#include <linux/cpumask.h> +#include <linux/kexec.h> +#include <linux/smp.h> +#include <linux/thread_info.h> +#include <linux/errno.h> + +#include <asm/page.h> +#include <asm/current.h> +#include <asm/machdep.h> +#include <asm/cacheflush.h> +#include <asm/paca.h> +#include <asm/mmu.h> +#include <asm/sections.h> /* _end */ +#include <asm/prom.h> +#include <asm/smp.h> + +#define HASH_GROUP_SIZE 0x80 /* size of each hash group, asm/mmu.h */ + +/* Have this around till we move it into crash specific file */ +note_buf_t crash_notes[NR_CPUS]; + +/* Dummy for now. Not sure if we need to have a crash shutdown in here + * and if what it will achieve. Letting it be now to compile the code + * in generic kexec environment + */ +void machine_crash_shutdown(struct pt_regs *regs) +{ + /* do nothing right now */ + /* smp_relase_cpus() if we want smp on panic kernel */ + /* cpu_irq_down to isolate us until we are ready */ +} + +int machine_kexec_prepare(struct kimage *image) +{ + int i; + unsigned long begin, end; /* limits of segment */ + unsigned long low, high; /* limits of blocked memory range */ + struct device_node *node; + unsigned long *basep; + unsigned int *sizep; + + if (!ppc_md.hpte_clear_all) + return -ENOENT; + + /* + * Since we use the kernel fault handlers and paging code to + * handle the virtual mode, we must make sure no destination + * overlaps kernel static data or bss. + */ + for (i = 0; i < image->nr_segments; i++) + if (image->segment[i].mem < __pa(_end)) + return -ETXTBSY; + + /* + * For non-LPAR, we absolutely can not overwrite the mmu hash + * table, since we are still using the bolted entries in it to + * do the copy. Check that here. + * + * It is safe if the end is below the start of the blocked + * region (end <= low), or if the beginning is after the + * end of the blocked region (begin >= high). Use the + * boolean identity !(a || b) === (!a && !b). + */ + if (htab_address) { + low = __pa(htab_address); + high = low + (htab_hash_mask + 1) * HASH_GROUP_SIZE; + + for (i = 0; i < image->nr_segments; i++) { + begin = image->segment[i].mem; + end = begin + image->segment[i].memsz; + + if ((begin < high) && (end > low)) + return -ETXTBSY; + } + } + + /* We also should not overwrite the tce tables */ + for (node = of_find_node_by_type(NULL, "pci"); node != NULL; + node = of_find_node_by_type(node, "pci")) { + basep = (unsigned long *)get_property(node, "linux,tce-base", + NULL); + sizep = (unsigned int *)get_property(node, "linux,tce-size", + NULL); + if (basep == NULL || sizep == NULL) + continue; + + low = *basep; + high = low + (*sizep); + + for (i = 0; i < image->nr_segments; i++) { + begin = image->segment[i].mem; + end = begin + image->segment[i].memsz; + + if ((begin < high) && (end > low)) + return -ETXTBSY; + } + } + + return 0; +} + +void machine_kexec_cleanup(struct kimage *image) +{ + /* we do nothing in prepare that needs to be undone */ +} + +#define IND_FLAGS (IND_DESTINATION | IND_INDIRECTION | IND_DONE | IND_SOURCE) + +static void copy_segments(unsigned long ind) +{ + unsigned long entry; + unsigned long *ptr; + void *dest; + void *addr; + + /* + * We rely on kexec_load to create a lists that properly + * initializes these pointers before they are used. + * We will still crash if the list is wrong, but at least + * the compiler will be quiet. + */ + ptr = NULL; + dest = NULL; + + for (entry = ind; !(entry & IND_DONE); entry = *ptr++) { + addr = __va(entry & PAGE_MASK); + + switch (entry & IND_FLAGS) { + case IND_DESTINATION: + dest = addr; + break; + case IND_INDIRECTION: + ptr = addr; + break; + case IND_SOURCE: + copy_page(dest, addr); + dest += PAGE_SIZE; + } + } +} + +void kexec_copy_flush(struct kimage *image) +{ + long i, nr_segments = image->nr_segments; + struct kexec_segment ranges[KEXEC_SEGMENT_MAX]; + + /* save the ranges on the stack to efficiently flush the icache */ + memcpy(ranges, image->segment, sizeof(ranges)); + + /* + * After this call we may not use anything allocated in dynamic + * memory, including *image. + * + * Only globals and the stack are allowed. + */ + copy_segments(image->head); + + /* + * we need to clear the icache for all dest pages sometime, + * including ones that were in place on the original copy + */ + for (i = 0; i < nr_segments; i++) + flush_icache_range(ranges[i].mem + KERNELBASE, + ranges[i].mem + KERNELBASE + + ranges[i].memsz); +} + +#ifdef CONFIG_SMP + +/* FIXME: we should schedule this function to be called on all cpus based + * on calling the interrupts, but we would like to call it off irq level + * so that the interrupt controller is clean. + */ +void kexec_smp_down(void *arg) +{ + if (ppc_md.kexec_cpu_down) + ppc_md.kexec_cpu_down(0, 1); + + local_irq_disable(); + kexec_smp_wait(); + /* NOTREACHED */ +} + +static void kexec_prepare_cpus(void) +{ + int my_cpu, i, notified=-1; + + smp_call_function(kexec_smp_down, NULL, 0, /* wait */0); + my_cpu = get_cpu(); + + /* check the others cpus are now down (via paca hw cpu id == -1) */ + for (i=0; i < NR_CPUS; i++) { + if (i == my_cpu) + continue; + + while (paca[i].hw_cpu_id != -1) { + barrier(); + if (!cpu_possible(i)) { + printk("kexec: cpu %d hw_cpu_id %d is not" + " possible, ignoring\n", + i, paca[i].hw_cpu_id); + break; + } + if (!cpu_online(i)) { + /* Fixme: this can be spinning in + * pSeries_secondary_wait with a paca + * waiting for it to go online. + */ + printk("kexec: cpu %d hw_cpu_id %d is not" + " online, ignoring\n", + i, paca[i].hw_cpu_id); + break; + } + if (i != notified) { + printk( "kexec: waiting for cpu %d (physical" + " %d) to go down\n", + i, paca[i].hw_cpu_id); + notified = i; + } + } + } + + /* after we tell the others to go down */ + if (ppc_md.kexec_cpu_down) + ppc_md.kexec_cpu_down(0, 0); + + put_cpu(); + + local_irq_disable(); +} + +#else /* ! SMP */ + +static void kexec_prepare_cpus(void) +{ + /* + * move the secondarys to us so that we can copy + * the new kernel 0-0x100 safely + * + * do this if kexec in setup.c ? + * + * We need to release the cpus if we are ever going from an + * UP to an SMP kernel. + */ + smp_release_cpus(); + if (ppc_md.kexec_cpu_down) + ppc_md.kexec_cpu_down(0, 0); + local_irq_disable(); +} + +#endif /* SMP */ + +/* + * kexec thread structure and stack. + * + * We need to make sure that this is 16384-byte aligned due to the + * way process stacks are handled. It also must be statically allocated + * or allocated as part of the kimage, because everything else may be + * overwritten when we copy the kexec image. We piggyback on the + * "init_task" linker section here to statically allocate a stack. + * + * We could use a smaller stack if we don't care about anything using + * current, but that audit has not been performed. + */ +union thread_union kexec_stack + __attribute__((__section__(".data.init_task"))) = { }; + +/* Our assembly helper, in kexec_stub.S */ +extern NORET_TYPE void kexec_sequence(void *newstack, unsigned long start, + void *image, void *control, + void (*clear_all)(void)) ATTRIB_NORET; + +/* too late to fail here */ +void machine_kexec(struct kimage *image) +{ + + /* prepare control code if any */ + + /* shutdown other cpus into our wait loop and quiesce interrupts */ + kexec_prepare_cpus(); + + /* switch to a staticly allocated stack. Based on irq stack code. + * XXX: the task struct will likely be invalid once we do the copy! + */ + kexec_stack.thread_info.task = current_thread_info()->task; + kexec_stack.thread_info.flags = 0; + + /* Some things are best done in assembly. Finding globals with + * a toc is easier in C, so pass in what we can. + */ + kexec_sequence(&kexec_stack, image->start, image, + page_address(image->control_code_page), + ppc_md.hpte_clear_all); + /* NOTREACHED */ +} + +/* Values we need to export to the second kernel via the device tree. */ +static unsigned long htab_base, htab_size, kernel_end; + +static struct property htab_base_prop = { + .name = "linux,htab-base", + .length = sizeof(unsigned long), + .value = (unsigned char *)&htab_base, +}; + +static struct property htab_size_prop = { + .name = "linux,htab-size", + .length = sizeof(unsigned long), + .value = (unsigned char *)&htab_size, +}; + +static struct property kernel_end_prop = { + .name = "linux,kernel-end", + .length = sizeof(unsigned long), + .value = (unsigned char *)&kernel_end, +}; + +static void __init export_htab_values(void) +{ + struct device_node *node; + + node = of_find_node_by_path("/chosen"); + if (!node) + return; + + kernel_end = __pa(_end); + prom_add_property(node, &kernel_end_prop); + + /* On machines with no htab htab_address is NULL */ + if (NULL == htab_address) + goto out; + + htab_base = __pa(htab_address); + prom_add_property(node, &htab_base_prop); + + htab_size = 1UL << ppc64_pft_size; + prom_add_property(node, &htab_size_prop); + + out: + of_node_put(node); +} + +void __init kexec_setup(void) +{ + export_htab_values(); +} diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index 3bedb532aed..f6d84a75ed2 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -519,7 +519,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_SPLIT_ID_CACHE) * * flush_icache_range(unsigned long start, unsigned long stop) */ -_GLOBAL(flush_icache_range) +_GLOBAL(__flush_icache_range) BEGIN_FTR_SECTION blr /* for 601, do nothing */ END_FTR_SECTION_IFCLR(CPU_FTR_SPLIT_ID_CACHE) @@ -607,27 +607,6 @@ _GLOBAL(invalidate_dcache_range) sync /* wait for dcbi's to get to ram */ blr -#ifdef CONFIG_NOT_COHERENT_CACHE -/* - * 40x cores have 8K or 16K dcache and 32 byte line size. - * 44x has a 32K dcache and 32 byte line size. - * 8xx has 1, 2, 4, 8K variants. - * For now, cover the worst case of the 44x. - * Must be called with external interrupts disabled. - */ -#define CACHE_NWAYS 64 -#define CACHE_NLINES 16 - -_GLOBAL(flush_dcache_all) - li r4, (2 * CACHE_NWAYS * CACHE_NLINES) - mtctr r4 - lis r5, KERNELBASE@h -1: lwz r3, 0(r5) /* Load one word from every line */ - addi r5, r5, L1_CACHE_BYTES - bdnz 1b - blr -#endif /* CONFIG_NOT_COHERENT_CACHE */ - /* * Flush a particular page from the data cache to RAM. * Note: this is necessary because the instruction cache does *not* diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S index b3e95ff0dba..ae48a002f81 100644 --- a/arch/powerpc/kernel/misc_64.S +++ b/arch/powerpc/kernel/misc_64.S @@ -89,12 +89,12 @@ _GLOBAL(call_do_softirq) mtlr r0 blr -_GLOBAL(call_handle_IRQ_event) +_GLOBAL(call___do_IRQ) mflr r0 std r0,16(r1) - stdu r1,THREAD_SIZE-112(r6) - mr r1,r6 - bl .handle_IRQ_event + stdu r1,THREAD_SIZE-112(r5) + mr r1,r5 + bl .__do_IRQ ld r1,0(r1) ld r0,16(r1) mtlr r0 @@ -604,6 +604,76 @@ _GLOBAL(real_writeb) #endif /* defined(CONFIG_PPC_PMAC) || defined(CONFIG_PPC_MAPLE) */ /* + * SCOM access functions for 970 (FX only for now) + * + * unsigned long scom970_read(unsigned int address); + * void scom970_write(unsigned int address, unsigned long value); + * + * The address passed in is the 24 bits register address. This code + * is 970 specific and will not check the status bits, so you should + * know what you are doing. + */ +_GLOBAL(scom970_read) + /* interrupts off */ + mfmsr r4 + ori r0,r4,MSR_EE + xori r0,r0,MSR_EE + mtmsrd r0,1 + + /* rotate 24 bits SCOM address 8 bits left and mask out it's low 8 bits + * (including parity). On current CPUs they must be 0'd, + * and finally or in RW bit + */ + rlwinm r3,r3,8,0,15 + ori r3,r3,0x8000 + + /* do the actual scom read */ + sync + mtspr SPRN_SCOMC,r3 + isync + mfspr r3,SPRN_SCOMD + isync + mfspr r0,SPRN_SCOMC + isync + + /* XXX: fixup result on some buggy 970's (ouch ! we lost a bit, bah + * that's the best we can do). Not implemented yet as we don't use + * the scom on any of the bogus CPUs yet, but may have to be done + * ultimately + */ + + /* restore interrupts */ + mtmsrd r4,1 + blr + + +_GLOBAL(scom970_write) + /* interrupts off */ + mfmsr r5 + ori r0,r5,MSR_EE + xori r0,r0,MSR_EE + mtmsrd r0,1 + + /* rotate 24 bits SCOM address 8 bits left and mask out it's low 8 bits + * (including parity). On current CPUs they must be 0'd. + */ + + rlwinm r3,r3,8,0,15 + + sync + mtspr SPRN_SCOMD,r4 /* write data */ + isync + mtspr SPRN_SCOMC,r3 /* write command */ + isync + mfspr 3,SPRN_SCOMC + isync + + /* restore interrupts */ + mtmsrd r5,1 + blr + + +/* * Create a kernel thread * kernel_thread(fn, arg, flags) */ diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c new file mode 100644 index 00000000000..928b8581fcb --- /dev/null +++ b/arch/powerpc/kernel/module_64.c @@ -0,0 +1,455 @@ +/* Kernel module help for PPC64. + Copyright (C) 2001, 2003 Rusty Russell IBM Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ +#include <linux/module.h> +#include <linux/elf.h> +#include <linux/moduleloader.h> +#include <linux/err.h> +#include <linux/vmalloc.h> +#include <asm/module.h> +#include <asm/uaccess.h> + +/* FIXME: We don't do .init separately. To do this, we'd need to have + a separate r2 value in the init and core section, and stub between + them, too. + + Using a magic allocator which places modules within 32MB solves + this, and makes other things simpler. Anton? + --RR. */ +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(fmt , ...) +#endif + +/* There's actually a third entry here, but it's unused */ +struct ppc64_opd_entry +{ + unsigned long funcaddr; + unsigned long r2; +}; + +/* Like PPC32, we need little trampolines to do > 24-bit jumps (into + the kernel itself). But on PPC64, these need to be used for every + jump, actually, to reset r2 (TOC+0x8000). */ +struct ppc64_stub_entry +{ + /* 28 byte jump instruction sequence (7 instructions) */ + unsigned char jump[28]; + unsigned char unused[4]; + /* Data for the above code */ + struct ppc64_opd_entry opd; +}; + +/* We use a stub to fix up r2 (TOC ptr) and to jump to the (external) + function which may be more than 24-bits away. We could simply + patch the new r2 value and function pointer into the stub, but it's + significantly shorter to put these values at the end of the stub + code, and patch the stub address (32-bits relative to the TOC ptr, + r2) into the stub. */ +static struct ppc64_stub_entry ppc64_stub = +{ .jump = { + 0x3d, 0x82, 0x00, 0x00, /* addis r12,r2, <high> */ + 0x39, 0x8c, 0x00, 0x00, /* addi r12,r12, <low> */ + /* Save current r2 value in magic place on the stack. */ + 0xf8, 0x41, 0x00, 0x28, /* std r2,40(r1) */ + 0xe9, 0x6c, 0x00, 0x20, /* ld r11,32(r12) */ + 0xe8, 0x4c, 0x00, 0x28, /* ld r2,40(r12) */ + 0x7d, 0x69, 0x03, 0xa6, /* mtctr r11 */ + 0x4e, 0x80, 0x04, 0x20 /* bctr */ +} }; + +/* Count how many different 24-bit relocations (different symbol, + different addend) */ +static unsigned int count_relocs(const Elf64_Rela *rela, unsigned int num) +{ + unsigned int i, j, ret = 0; + + /* FIXME: Only count external ones --RR */ + /* Sure, this is order(n^2), but it's usually short, and not + time critical */ + for (i = 0; i < num; i++) { + /* Only count 24-bit relocs, others don't need stubs */ + if (ELF64_R_TYPE(rela[i].r_info) != R_PPC_REL24) + continue; + for (j = 0; j < i; j++) { + /* If this addend appeared before, it's + already been counted */ + if (rela[i].r_info == rela[j].r_info + && rela[i].r_addend == rela[j].r_addend) + break; + } + if (j == i) ret++; + } + return ret; +} + +void *module_alloc(unsigned long size) +{ + if (size == 0) + return NULL; + + return vmalloc_exec(size); +} + +/* Free memory returned from module_alloc */ +void module_free(struct module *mod, void *module_region) +{ + vfree(module_region); + /* FIXME: If module_region == mod->init_region, trim exception + table entries. */ +} + +/* Get size of potential trampolines required. */ +static unsigned long get_stubs_size(const Elf64_Ehdr *hdr, + const Elf64_Shdr *sechdrs) +{ + /* One extra reloc so it's always 0-funcaddr terminated */ + unsigned long relocs = 1; + unsigned i; + + /* Every relocated section... */ + for (i = 1; i < hdr->e_shnum; i++) { + if (sechdrs[i].sh_type == SHT_RELA) { + DEBUGP("Found relocations in section %u\n", i); + DEBUGP("Ptr: %p. Number: %lu\n", + (void *)sechdrs[i].sh_addr, + sechdrs[i].sh_size / sizeof(Elf64_Rela)); + relocs += count_relocs((void *)sechdrs[i].sh_addr, + sechdrs[i].sh_size + / sizeof(Elf64_Rela)); + } + } + + DEBUGP("Looks like a total of %lu stubs, max\n", relocs); + return relocs * sizeof(struct ppc64_stub_entry); +} + +static void dedotify_versions(struct modversion_info *vers, + unsigned long size) +{ + struct modversion_info *end; + + for (end = (void *)vers + size; vers < end; vers++) + if (vers->name[0] == '.') + memmove(vers->name, vers->name+1, strlen(vers->name)); +} + +/* Undefined symbols which refer to .funcname, hack to funcname */ +static void dedotify(Elf64_Sym *syms, unsigned int numsyms, char *strtab) +{ + unsigned int i; + + for (i = 1; i < numsyms; i++) { + if (syms[i].st_shndx == SHN_UNDEF) { + char *name = strtab + syms[i].st_name; + if (name[0] == '.') + memmove(name, name+1, strlen(name)); + } + } +} + +int module_frob_arch_sections(Elf64_Ehdr *hdr, + Elf64_Shdr *sechdrs, + char *secstrings, + struct module *me) +{ + unsigned int i; + + /* Find .toc and .stubs sections, symtab and strtab */ + for (i = 1; i < hdr->e_shnum; i++) { + char *p; + if (strcmp(secstrings + sechdrs[i].sh_name, ".stubs") == 0) + me->arch.stubs_section = i; + else if (strcmp(secstrings + sechdrs[i].sh_name, ".toc") == 0) + me->arch.toc_section = i; + else if (strcmp(secstrings+sechdrs[i].sh_name,"__versions")==0) + dedotify_versions((void *)hdr + sechdrs[i].sh_offset, + sechdrs[i].sh_size); + + /* We don't handle .init for the moment: rename to _init */ + while ((p = strstr(secstrings + sechdrs[i].sh_name, ".init"))) + p[0] = '_'; + + if (sechdrs[i].sh_type == SHT_SYMTAB) + dedotify((void *)hdr + sechdrs[i].sh_offset, + sechdrs[i].sh_size / sizeof(Elf64_Sym), + (void *)hdr + + sechdrs[sechdrs[i].sh_link].sh_offset); + } + if (!me->arch.stubs_section || !me->arch.toc_section) { + printk("%s: doesn't contain .toc or .stubs.\n", me->name); + return -ENOEXEC; + } + + /* Override the stubs size */ + sechdrs[me->arch.stubs_section].sh_size = get_stubs_size(hdr, sechdrs); + return 0; +} + +int apply_relocate(Elf64_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + printk(KERN_ERR "%s: Non-ADD RELOCATION unsupported\n", me->name); + return -ENOEXEC; +} + +/* r2 is the TOC pointer: it actually points 0x8000 into the TOC (this + gives the value maximum span in an instruction which uses a signed + offset) */ +static inline unsigned long my_r2(Elf64_Shdr *sechdrs, struct module *me) +{ + return sechdrs[me->arch.toc_section].sh_addr + 0x8000; +} + +/* Both low and high 16 bits are added as SIGNED additions, so if low + 16 bits has high bit set, high 16 bits must be adjusted. These + macros do that (stolen from binutils). */ +#define PPC_LO(v) ((v) & 0xffff) +#define PPC_HI(v) (((v) >> 16) & 0xffff) +#define PPC_HA(v) PPC_HI ((v) + 0x8000) + +/* Patch stub to reference function and correct r2 value. */ +static inline int create_stub(Elf64_Shdr *sechdrs, + struct ppc64_stub_entry *entry, + struct ppc64_opd_entry *opd, + struct module *me) +{ + Elf64_Half *loc1, *loc2; + long reladdr; + + *entry = ppc64_stub; + + loc1 = (Elf64_Half *)&entry->jump[2]; + loc2 = (Elf64_Half *)&entry->jump[6]; + + /* Stub uses address relative to r2. */ + reladdr = (unsigned long)entry - my_r2(sechdrs, me); + if (reladdr > 0x7FFFFFFF || reladdr < -(0x80000000L)) { + printk("%s: Address %p of stub out of range of %p.\n", + me->name, (void *)reladdr, (void *)my_r2); + return 0; + } + DEBUGP("Stub %p get data from reladdr %li\n", entry, reladdr); + + *loc1 = PPC_HA(reladdr); + *loc2 = PPC_LO(reladdr); + entry->opd.funcaddr = opd->funcaddr; + entry->opd.r2 = opd->r2; + return 1; +} + +/* Create stub to jump to function described in this OPD: we need the + stub to set up the TOC ptr (r2) for the function. */ +static unsigned long stub_for_addr(Elf64_Shdr *sechdrs, + unsigned long opdaddr, + struct module *me) +{ + struct ppc64_stub_entry *stubs; + struct ppc64_opd_entry *opd = (void *)opdaddr; + unsigned int i, num_stubs; + + num_stubs = sechdrs[me->arch.stubs_section].sh_size / sizeof(*stubs); + + /* Find this stub, or if that fails, the next avail. entry */ + stubs = (void *)sechdrs[me->arch.stubs_section].sh_addr; + for (i = 0; stubs[i].opd.funcaddr; i++) { + BUG_ON(i >= num_stubs); + + if (stubs[i].opd.funcaddr == opd->funcaddr) + return (unsigned long)&stubs[i]; + } + + if (!create_stub(sechdrs, &stubs[i], opd, me)) + return 0; + + return (unsigned long)&stubs[i]; +} + +/* We expect a noop next: if it is, replace it with instruction to + restore r2. */ +static int restore_r2(u32 *instruction, struct module *me) +{ + if (*instruction != 0x60000000) { + printk("%s: Expect noop after relocate, got %08x\n", + me->name, *instruction); + return 0; + } + *instruction = 0xe8410028; /* ld r2,40(r1) */ + return 1; +} + +int apply_relocate_add(Elf64_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + unsigned int i; + Elf64_Rela *rela = (void *)sechdrs[relsec].sh_addr; + Elf64_Sym *sym; + unsigned long *location; + unsigned long value; + + DEBUGP("Applying ADD relocate section %u to %u\n", relsec, + sechdrs[relsec].sh_info); + for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rela); i++) { + /* This is where to make the change */ + location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr + + rela[i].r_offset; + /* This is the symbol it is referring to */ + sym = (Elf64_Sym *)sechdrs[symindex].sh_addr + + ELF64_R_SYM(rela[i].r_info); + + DEBUGP("RELOC at %p: %li-type as %s (%lu) + %li\n", + location, (long)ELF64_R_TYPE(rela[i].r_info), + strtab + sym->st_name, (unsigned long)sym->st_value, + (long)rela[i].r_addend); + + /* `Everything is relative'. */ + value = sym->st_value + rela[i].r_addend; + + switch (ELF64_R_TYPE(rela[i].r_info)) { + case R_PPC64_ADDR32: + /* Simply set it */ + *(u32 *)location = value; + break; + + case R_PPC64_ADDR64: + /* Simply set it */ + *(unsigned long *)location = value; + break; + + case R_PPC64_TOC: + *(unsigned long *)location = my_r2(sechdrs, me); + break; + + case R_PPC64_TOC16: + /* Subtact TOC pointer */ + value -= my_r2(sechdrs, me); + if (value + 0x8000 > 0xffff) { + printk("%s: bad TOC16 relocation (%lu)\n", + me->name, value); + return -ENOEXEC; + } + *((uint16_t *) location) + = (*((uint16_t *) location) & ~0xffff) + | (value & 0xffff); + break; + + case R_PPC64_TOC16_DS: + /* Subtact TOC pointer */ + value -= my_r2(sechdrs, me); + if ((value & 3) != 0 || value + 0x8000 > 0xffff) { + printk("%s: bad TOC16_DS relocation (%lu)\n", + me->name, value); + return -ENOEXEC; + } + *((uint16_t *) location) + = (*((uint16_t *) location) & ~0xfffc) + | (value & 0xfffc); + break; + + case R_PPC_REL24: + /* FIXME: Handle weak symbols here --RR */ + if (sym->st_shndx == SHN_UNDEF) { + /* External: go via stub */ + value = stub_for_addr(sechdrs, value, me); + if (!value) + return -ENOENT; + if (!restore_r2((u32 *)location + 1, me)) + return -ENOEXEC; + } + + /* Convert value to relative */ + value -= (unsigned long)location; + if (value + 0x2000000 > 0x3ffffff || (value & 3) != 0){ + printk("%s: REL24 %li out of range!\n", + me->name, (long int)value); + return -ENOEXEC; + } + + /* Only replace bits 2 through 26 */ + *(uint32_t *)location + = (*(uint32_t *)location & ~0x03fffffc) + | (value & 0x03fffffc); + break; + + default: + printk("%s: Unknown ADD relocation: %lu\n", + me->name, + (unsigned long)ELF64_R_TYPE(rela[i].r_info)); + return -ENOEXEC; + } + } + + return 0; +} + +LIST_HEAD(module_bug_list); + +int module_finalize(const Elf_Ehdr *hdr, + const Elf_Shdr *sechdrs, struct module *me) +{ + char *secstrings; + unsigned int i; + + me->arch.bug_table = NULL; + me->arch.num_bugs = 0; + + /* Find the __bug_table section, if present */ + secstrings = (char *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; + for (i = 1; i < hdr->e_shnum; i++) { + if (strcmp(secstrings+sechdrs[i].sh_name, "__bug_table")) + continue; + me->arch.bug_table = (void *) sechdrs[i].sh_addr; + me->arch.num_bugs = sechdrs[i].sh_size / sizeof(struct bug_entry); + break; + } + + /* + * Strictly speaking this should have a spinlock to protect against + * traversals, but since we only traverse on BUG()s, a spinlock + * could potentially lead to deadlock and thus be counter-productive. + */ + list_add(&me->arch.bug_list, &module_bug_list); + + return 0; +} + +void module_arch_cleanup(struct module *mod) +{ + list_del(&mod->arch.bug_list); +} + +struct bug_entry *module_find_bug(unsigned long bugaddr) +{ + struct mod_arch_specific *mod; + unsigned int i; + struct bug_entry *bug; + + list_for_each_entry(mod, &module_bug_list, bug_list) { + bug = mod->bug_table; + for (i = 0; i < mod->num_bugs; ++i, ++bug) + if (bugaddr == bug->bug_addr) + return bug; + } + return NULL; +} diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c new file mode 100644 index 00000000000..a7b68f911eb --- /dev/null +++ b/arch/powerpc/kernel/paca.c @@ -0,0 +1,135 @@ +/* + * c 2001 PPC 64 Team, IBM Corp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/threads.h> +#include <linux/module.h> + +#include <asm/processor.h> +#include <asm/ptrace.h> +#include <asm/page.h> +#include <asm/lppaca.h> +#include <asm/iseries/it_lp_queue.h> +#include <asm/paca.h> + + +/* This symbol is provided by the linker - let it fill in the paca + * field correctly */ +extern unsigned long __toc_start; + +/* The Paca is an array with one entry per processor. Each contains an + * lppaca, which contains the information shared between the + * hypervisor and Linux. Each also contains an ItLpRegSave area which + * is used by the hypervisor to save registers. + * On systems with hardware multi-threading, there are two threads + * per processor. The Paca array must contain an entry for each thread. + * The VPD Areas will give a max logical processors = 2 * max physical + * processors. The processor VPD array needs one entry per physical + * processor (not thread). + */ +#define PACA_INIT_COMMON(number, start, asrr, asrv) \ + .lock_token = 0x8000, \ + .paca_index = (number), /* Paca Index */ \ + .default_decr = 0x00ff0000, /* Initial Decr */ \ + .kernel_toc = (unsigned long)(&__toc_start) + 0x8000UL, \ + .stab_real = (asrr), /* Real pointer to segment table */ \ + .stab_addr = (asrv), /* Virt pointer to segment table */ \ + .cpu_start = (start), /* Processor start */ \ + .hw_cpu_id = 0xffff, \ + .lppaca = { \ + .desc = 0xd397d781, /* "LpPa" */ \ + .size = sizeof(struct lppaca), \ + .dyn_proc_status = 2, \ + .decr_val = 0x00ff0000, \ + .fpregs_in_use = 1, \ + .end_of_quantum = 0xfffffffffffffffful, \ + .slb_count = 64, \ + .vmxregs_in_use = 0, \ + }, \ + +#ifdef CONFIG_PPC_ISERIES +#define PACA_INIT_ISERIES(number) \ + .lppaca_ptr = &paca[number].lppaca, \ + .reg_save_ptr = &paca[number].reg_save, \ + .reg_save = { \ + .xDesc = 0xd397d9e2, /* "LpRS" */ \ + .xSize = sizeof(struct ItLpRegSave) \ + } + +#define PACA_INIT(number) \ +{ \ + PACA_INIT_COMMON(number, 0, 0, 0) \ + PACA_INIT_ISERIES(number) \ +} + +#define BOOTCPU_PACA_INIT(number) \ +{ \ + PACA_INIT_COMMON(number, 1, 0, (u64)&initial_stab) \ + PACA_INIT_ISERIES(number) \ +} + +#else +#define PACA_INIT(number) \ +{ \ + PACA_INIT_COMMON(number, 0, 0, 0) \ +} + +#define BOOTCPU_PACA_INIT(number) \ +{ \ + PACA_INIT_COMMON(number, 1, STAB0_PHYS_ADDR, (u64)&initial_stab) \ +} +#endif + +struct paca_struct paca[] = { + BOOTCPU_PACA_INIT(0), +#if NR_CPUS > 1 + PACA_INIT( 1), PACA_INIT( 2), PACA_INIT( 3), +#if NR_CPUS > 4 + PACA_INIT( 4), PACA_INIT( 5), PACA_INIT( 6), PACA_INIT( 7), +#if NR_CPUS > 8 + PACA_INIT( 8), PACA_INIT( 9), PACA_INIT( 10), PACA_INIT( 11), + PACA_INIT( 12), PACA_INIT( 13), PACA_INIT( 14), PACA_INIT( 15), + PACA_INIT( 16), PACA_INIT( 17), PACA_INIT( 18), PACA_INIT( 19), + PACA_INIT( 20), PACA_INIT( 21), PACA_INIT( 22), PACA_INIT( 23), + PACA_INIT( 24), PACA_INIT( 25), PACA_INIT( 26), PACA_INIT( 27), + PACA_INIT( 28), PACA_INIT( 29), PACA_INIT( 30), PACA_INIT( 31), +#if NR_CPUS > 32 + PACA_INIT( 32), PACA_INIT( 33), PACA_INIT( 34), PACA_INIT( 35), + PACA_INIT( 36), PACA_INIT( 37), PACA_INIT( 38), PACA_INIT( 39), + PACA_INIT( 40), PACA_INIT( 41), PACA_INIT( 42), PACA_INIT( 43), + PACA_INIT( 44), PACA_INIT( 45), PACA_INIT( 46), PACA_INIT( 47), + PACA_INIT( 48), PACA_INIT( 49), PACA_INIT( 50), PACA_INIT( 51), + PACA_INIT( 52), PACA_INIT( 53), PACA_INIT( 54), PACA_INIT( 55), + PACA_INIT( 56), PACA_INIT( 57), PACA_INIT( 58), PACA_INIT( 59), + PACA_INIT( 60), PACA_INIT( 61), PACA_INIT( 62), PACA_INIT( 63), +#if NR_CPUS > 64 + PACA_INIT( 64), PACA_INIT( 65), PACA_INIT( 66), PACA_INIT( 67), + PACA_INIT( 68), PACA_INIT( 69), PACA_INIT( 70), PACA_INIT( 71), + PACA_INIT( 72), PACA_INIT( 73), PACA_INIT( 74), PACA_INIT( 75), + PACA_INIT( 76), PACA_INIT( 77), PACA_INIT( 78), PACA_INIT( 79), + PACA_INIT( 80), PACA_INIT( 81), PACA_INIT( 82), PACA_INIT( 83), + PACA_INIT( 84), PACA_INIT( 85), PACA_INIT( 86), PACA_INIT( 87), + PACA_INIT( 88), PACA_INIT( 89), PACA_INIT( 90), PACA_INIT( 91), + PACA_INIT( 92), PACA_INIT( 93), PACA_INIT( 94), PACA_INIT( 95), + PACA_INIT( 96), PACA_INIT( 97), PACA_INIT( 98), PACA_INIT( 99), + PACA_INIT(100), PACA_INIT(101), PACA_INIT(102), PACA_INIT(103), + PACA_INIT(104), PACA_INIT(105), PACA_INIT(106), PACA_INIT(107), + PACA_INIT(108), PACA_INIT(109), PACA_INIT(110), PACA_INIT(111), + PACA_INIT(112), PACA_INIT(113), PACA_INIT(114), PACA_INIT(115), + PACA_INIT(116), PACA_INIT(117), PACA_INIT(118), PACA_INIT(119), + PACA_INIT(120), PACA_INIT(121), PACA_INIT(122), PACA_INIT(123), + PACA_INIT(124), PACA_INIT(125), PACA_INIT(126), PACA_INIT(127), +#endif +#endif +#endif +#endif +#endif +}; +EXPORT_SYMBOL(paca); diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c new file mode 100644 index 00000000000..3cef1b8f57f --- /dev/null +++ b/arch/powerpc/kernel/pci_64.c @@ -0,0 +1,1319 @@ +/* + * Port for PPC64 David Engebretsen, IBM Corp. + * Contains common pci routines for ppc64 platform, pSeries and iSeries brands. + * + * Copyright (C) 2003 Anton Blanchard <anton@au.ibm.com>, IBM + * Rework, based on alpha PCI code. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#undef DEBUG + +#include <linux/config.h> +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/string.h> +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/mm.h> +#include <linux/list.h> +#include <linux/syscalls.h> + +#include <asm/processor.h> +#include <asm/io.h> +#include <asm/prom.h> +#include <asm/pci-bridge.h> +#include <asm/byteorder.h> +#include <asm/irq.h> +#include <asm/machdep.h> +#include <asm/udbg.h> +#include <asm/ppc-pci.h> + +#ifdef DEBUG +#define DBG(fmt...) udbg_printf(fmt) +#else +#define DBG(fmt...) +#endif + +unsigned long pci_probe_only = 1; +unsigned long pci_assign_all_buses = 0; + +/* + * legal IO pages under MAX_ISA_PORT. This is to ensure we don't touch + * devices we don't have access to. + */ +unsigned long io_page_mask; + +EXPORT_SYMBOL(io_page_mask); + +#ifdef CONFIG_PPC_MULTIPLATFORM +static void fixup_resource(struct resource *res, struct pci_dev *dev); +static void do_bus_setup(struct pci_bus *bus); +#endif + +unsigned int pcibios_assign_all_busses(void) +{ + return pci_assign_all_buses; +} + +/* pci_io_base -- the base address from which io bars are offsets. + * This is the lowest I/O base address (so bar values are always positive), + * and it *must* be the start of ISA space if an ISA bus exists because + * ISA drivers use hard coded offsets. If no ISA bus exists a dummy + * page is mapped and isa_io_limit prevents access to it. + */ +unsigned long isa_io_base; /* NULL if no ISA bus */ +EXPORT_SYMBOL(isa_io_base); +unsigned long pci_io_base; +EXPORT_SYMBOL(pci_io_base); + +void iSeries_pcibios_init(void); + +LIST_HEAD(hose_list); + +struct dma_mapping_ops pci_dma_ops; +EXPORT_SYMBOL(pci_dma_ops); + +int global_phb_number; /* Global phb counter */ + +/* Cached ISA bridge dev. */ +struct pci_dev *ppc64_isabridge_dev = NULL; + +static void fixup_broken_pcnet32(struct pci_dev* dev) +{ + if ((dev->class>>8 == PCI_CLASS_NETWORK_ETHERNET)) { + dev->vendor = PCI_VENDOR_ID_AMD; + pci_write_config_word(dev, PCI_VENDOR_ID, PCI_VENDOR_ID_AMD); + } +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_TRIDENT, PCI_ANY_ID, fixup_broken_pcnet32); + +void pcibios_resource_to_bus(struct pci_dev *dev, struct pci_bus_region *region, + struct resource *res) +{ + unsigned long offset = 0; + struct pci_controller *hose = pci_bus_to_host(dev->bus); + + if (!hose) + return; + + if (res->flags & IORESOURCE_IO) + offset = (unsigned long)hose->io_base_virt - pci_io_base; + + if (res->flags & IORESOURCE_MEM) + offset = hose->pci_mem_offset; + + region->start = res->start - offset; + region->end = res->end - offset; +} + +void pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res, + struct pci_bus_region *region) +{ + unsigned long offset = 0; + struct pci_controller *hose = pci_bus_to_host(dev->bus); + + if (!hose) + return; + + if (res->flags & IORESOURCE_IO) + offset = (unsigned long)hose->io_base_virt - pci_io_base; + + if (res->flags & IORESOURCE_MEM) + offset = hose->pci_mem_offset; + + res->start = region->start + offset; + res->end = region->end + offset; +} + +#ifdef CONFIG_HOTPLUG +EXPORT_SYMBOL(pcibios_resource_to_bus); +EXPORT_SYMBOL(pcibios_bus_to_resource); +#endif + +/* + * We need to avoid collisions with `mirrored' VGA ports + * and other strange ISA hardware, so we always want the + * addresses to be allocated in the 0x000-0x0ff region + * modulo 0x400. + * + * Why? Because some silly external IO cards only decode + * the low 10 bits of the IO address. The 0x00-0xff region + * is reserved for motherboard devices that decode all 16 + * bits, so it's ok to allocate at, say, 0x2800-0x28ff, + * but we want to try to avoid allocating at 0x2900-0x2bff + * which might have be mirrored at 0x0100-0x03ff.. + */ +void pcibios_align_resource(void *data, struct resource *res, + unsigned long size, unsigned long align) +{ + struct pci_dev *dev = data; + struct pci_controller *hose = pci_bus_to_host(dev->bus); + unsigned long start = res->start; + unsigned long alignto; + + if (res->flags & IORESOURCE_IO) { + unsigned long offset = (unsigned long)hose->io_base_virt - + pci_io_base; + /* Make sure we start at our min on all hoses */ + if (start - offset < PCIBIOS_MIN_IO) + start = PCIBIOS_MIN_IO + offset; + + /* + * Put everything into 0x00-0xff region modulo 0x400 + */ + if (start & 0x300) + start = (start + 0x3ff) & ~0x3ff; + + } else if (res->flags & IORESOURCE_MEM) { + /* Make sure we start at our min on all hoses */ + if (start - hose->pci_mem_offset < PCIBIOS_MIN_MEM) + start = PCIBIOS_MIN_MEM + hose->pci_mem_offset; + + /* Align to multiple of size of minimum base. */ + alignto = max(0x1000UL, align); + start = ALIGN(start, alignto); + } + + res->start = start; +} + +static DEFINE_SPINLOCK(hose_spinlock); + +/* + * pci_controller(phb) initialized common variables. + */ +void __devinit pci_setup_pci_controller(struct pci_controller *hose) +{ + memset(hose, 0, sizeof(struct pci_controller)); + + spin_lock(&hose_spinlock); + hose->global_number = global_phb_number++; + list_add_tail(&hose->list_node, &hose_list); + spin_unlock(&hose_spinlock); +} + +static void __init pcibios_claim_one_bus(struct pci_bus *b) +{ + struct pci_dev *dev; + struct pci_bus *child_bus; + + list_for_each_entry(dev, &b->devices, bus_list) { + int i; + + for (i = 0; i < PCI_NUM_RESOURCES; i++) { + struct resource *r = &dev->resource[i]; + + if (r->parent || !r->start || !r->flags) + continue; + pci_claim_resource(dev, i); + } + } + + list_for_each_entry(child_bus, &b->children, node) + pcibios_claim_one_bus(child_bus); +} + +#ifndef CONFIG_PPC_ISERIES +static void __init pcibios_claim_of_setup(void) +{ + struct pci_bus *b; + + list_for_each_entry(b, &pci_root_buses, node) + pcibios_claim_one_bus(b); +} +#endif + +#ifdef CONFIG_PPC_MULTIPLATFORM +static u32 get_int_prop(struct device_node *np, const char *name, u32 def) +{ + u32 *prop; + int len; + + prop = (u32 *) get_property(np, name, &len); + if (prop && len >= 4) + return *prop; + return def; +} + +static unsigned int pci_parse_of_flags(u32 addr0) +{ + unsigned int flags = 0; + + if (addr0 & 0x02000000) { + flags = IORESOURCE_MEM | PCI_BASE_ADDRESS_SPACE_MEMORY; + flags |= (addr0 >> 22) & PCI_BASE_ADDRESS_MEM_TYPE_64; + flags |= (addr0 >> 28) & PCI_BASE_ADDRESS_MEM_TYPE_1M; + if (addr0 & 0x40000000) + flags |= IORESOURCE_PREFETCH + | PCI_BASE_ADDRESS_MEM_PREFETCH; + } else if (addr0 & 0x01000000) + flags = IORESOURCE_IO | PCI_BASE_ADDRESS_SPACE_IO; + return flags; +} + +#define GET_64BIT(prop, i) ((((u64) (prop)[(i)]) << 32) | (prop)[(i)+1]) + +static void pci_parse_of_addrs(struct device_node *node, struct pci_dev *dev) +{ + u64 base, size; + unsigned int flags; + struct resource *res; + u32 *addrs, i; + int proplen; + + addrs = (u32 *) get_property(node, "assigned-addresses", &proplen); + if (!addrs) + return; + for (; proplen >= 20; proplen -= 20, addrs += 5) { + flags = pci_parse_of_flags(addrs[0]); + if (!flags) + continue; + base = GET_64BIT(addrs, 1); + size = GET_64BIT(addrs, 3); + if (!size) + continue; + i = addrs[0] & 0xff; + if (PCI_BASE_ADDRESS_0 <= i && i <= PCI_BASE_ADDRESS_5) { + res = &dev->resource[(i - PCI_BASE_ADDRESS_0) >> 2]; + } else if (i == dev->rom_base_reg) { + res = &dev->resource[PCI_ROM_RESOURCE]; + flags |= IORESOURCE_READONLY | IORESOURCE_CACHEABLE; + } else { + printk(KERN_ERR "PCI: bad cfg reg num 0x%x\n", i); + continue; + } + res->start = base; + res->end = base + size - 1; + res->flags = flags; + res->name = pci_name(dev); + fixup_resource(res, dev); + } +} + +struct pci_dev *of_create_pci_dev(struct device_node *node, + struct pci_bus *bus, int devfn) +{ + struct pci_dev *dev; + const char *type; + + dev = kmalloc(sizeof(struct pci_dev), GFP_KERNEL); + if (!dev) + return NULL; + type = get_property(node, "device_type", NULL); + if (type == NULL) + type = ""; + + memset(dev, 0, sizeof(struct pci_dev)); + dev->bus = bus; + dev->sysdata = node; + dev->dev.parent = bus->bridge; + dev->dev.bus = &pci_bus_type; + dev->devfn = devfn; + dev->multifunction = 0; /* maybe a lie? */ + + dev->vendor = get_int_prop(node, "vendor-id", 0xffff); + dev->device = get_int_prop(node, "device-id", 0xffff); + dev->subsystem_vendor = get_int_prop(node, "subsystem-vendor-id", 0); + dev->subsystem_device = get_int_prop(node, "subsystem-id", 0); + + dev->cfg_size = 256; /*pci_cfg_space_size(dev);*/ + + sprintf(pci_name(dev), "%04x:%02x:%02x.%d", pci_domain_nr(bus), + dev->bus->number, PCI_SLOT(devfn), PCI_FUNC(devfn)); + dev->class = get_int_prop(node, "class-code", 0); + + dev->current_state = 4; /* unknown power state */ + + if (!strcmp(type, "pci")) { + /* a PCI-PCI bridge */ + dev->hdr_type = PCI_HEADER_TYPE_BRIDGE; + dev->rom_base_reg = PCI_ROM_ADDRESS1; + } else if (!strcmp(type, "cardbus")) { + dev->hdr_type = PCI_HEADER_TYPE_CARDBUS; + } else { + dev->hdr_type = PCI_HEADER_TYPE_NORMAL; + dev->rom_base_reg = PCI_ROM_ADDRESS; + dev->irq = NO_IRQ; + if (node->n_intrs > 0) { + dev->irq = node->intrs[0].line; + pci_write_config_byte(dev, PCI_INTERRUPT_LINE, + dev->irq); + } + } + + pci_parse_of_addrs(node, dev); + + pci_device_add(dev, bus); + + /* XXX pci_scan_msi_device(dev); */ + + return dev; +} +EXPORT_SYMBOL(of_create_pci_dev); + +void __devinit of_scan_bus(struct device_node *node, + struct pci_bus *bus) +{ + struct device_node *child = NULL; + u32 *reg; + int reglen, devfn; + struct pci_dev *dev; + + while ((child = of_get_next_child(node, child)) != NULL) { + reg = (u32 *) get_property(child, "reg", ®len); + if (reg == NULL || reglen < 20) + continue; + devfn = (reg[0] >> 8) & 0xff; + /* create a new pci_dev for this device */ + dev = of_create_pci_dev(child, bus, devfn); + if (!dev) + continue; + if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE || + dev->hdr_type == PCI_HEADER_TYPE_CARDBUS) + of_scan_pci_bridge(child, dev); + } + + do_bus_setup(bus); +} +EXPORT_SYMBOL(of_scan_bus); + +void __devinit of_scan_pci_bridge(struct device_node *node, + struct pci_dev *dev) +{ + struct pci_bus *bus; + u32 *busrange, *ranges; + int len, i, mode; + struct resource *res; + unsigned int flags; + u64 size; + + /* parse bus-range property */ + busrange = (u32 *) get_property(node, "bus-range", &len); + if (busrange == NULL || len != 8) { + printk(KERN_ERR "Can't get bus-range for PCI-PCI bridge %s\n", + node->full_name); + return; + } + ranges = (u32 *) get_property(node, "ranges", &len); + if (ranges == NULL) { + printk(KERN_ERR "Can't get ranges for PCI-PCI bridge %s\n", + node->full_name); + return; + } + + bus = pci_add_new_bus(dev->bus, dev, busrange[0]); + if (!bus) { + printk(KERN_ERR "Failed to create pci bus for %s\n", + node->full_name); + return; + } + + bus->primary = dev->bus->number; + bus->subordinate = busrange[1]; + bus->bridge_ctl = 0; + bus->sysdata = node; + + /* parse ranges property */ + /* PCI #address-cells == 3 and #size-cells == 2 always */ + res = &dev->resource[PCI_BRIDGE_RESOURCES]; + for (i = 0; i < PCI_NUM_RESOURCES - PCI_BRIDGE_RESOURCES; ++i) { + res->flags = 0; + bus->resource[i] = res; + ++res; + } + i = 1; + for (; len >= 32; len -= 32, ranges += 8) { + flags = pci_parse_of_flags(ranges[0]); + size = GET_64BIT(ranges, 6); + if (flags == 0 || size == 0) + continue; + if (flags & IORESOURCE_IO) { + res = bus->resource[0]; + if (res->flags) { + printk(KERN_ERR "PCI: ignoring extra I/O range" + " for bridge %s\n", node->full_name); + continue; + } + } else { + if (i >= PCI_NUM_RESOURCES - PCI_BRIDGE_RESOURCES) { + printk(KERN_ERR "PCI: too many memory ranges" + " for bridge %s\n", node->full_name); + continue; + } + res = bus->resource[i]; + ++i; + } + res->start = GET_64BIT(ranges, 1); + res->end = res->start + size - 1; + res->flags = flags; + fixup_resource(res, dev); + } + sprintf(bus->name, "PCI Bus %04x:%02x", pci_domain_nr(bus), + bus->number); + + mode = PCI_PROBE_NORMAL; + if (ppc_md.pci_probe_mode) + mode = ppc_md.pci_probe_mode(bus); + if (mode == PCI_PROBE_DEVTREE) + of_scan_bus(node, bus); + else if (mode == PCI_PROBE_NORMAL) + pci_scan_child_bus(bus); +} +EXPORT_SYMBOL(of_scan_pci_bridge); +#endif /* CONFIG_PPC_MULTIPLATFORM */ + +void __devinit scan_phb(struct pci_controller *hose) +{ + struct pci_bus *bus; + struct device_node *node = hose->arch_data; + int i, mode; + struct resource *res; + + bus = pci_create_bus(NULL, hose->first_busno, hose->ops, node); + if (bus == NULL) { + printk(KERN_ERR "Failed to create bus for PCI domain %04x\n", + hose->global_number); + return; + } + bus->secondary = hose->first_busno; + hose->bus = bus; + + bus->resource[0] = res = &hose->io_resource; + if (res->flags && request_resource(&ioport_resource, res)) + printk(KERN_ERR "Failed to request PCI IO region " + "on PCI domain %04x\n", hose->global_number); + + for (i = 0; i < 3; ++i) { + res = &hose->mem_resources[i]; + bus->resource[i+1] = res; + if (res->flags && request_resource(&iomem_resource, res)) + printk(KERN_ERR "Failed to request PCI memory region " + "on PCI domain %04x\n", hose->global_number); + } + + mode = PCI_PROBE_NORMAL; +#ifdef CONFIG_PPC_MULTIPLATFORM + if (ppc_md.pci_probe_mode) + mode = ppc_md.pci_probe_mode(bus); + if (mode == PCI_PROBE_DEVTREE) { + bus->subordinate = hose->last_busno; + of_scan_bus(node, bus); + } +#endif /* CONFIG_PPC_MULTIPLATFORM */ + if (mode == PCI_PROBE_NORMAL) + hose->last_busno = bus->subordinate = pci_scan_child_bus(bus); + pci_bus_add_devices(bus); +} + +static int __init pcibios_init(void) +{ + struct pci_controller *hose, *tmp; + + /* For now, override phys_mem_access_prot. If we need it, + * later, we may move that initialization to each ppc_md + */ + ppc_md.phys_mem_access_prot = pci_phys_mem_access_prot; + +#ifdef CONFIG_PPC_ISERIES + iSeries_pcibios_init(); +#endif + + printk("PCI: Probing PCI hardware\n"); + + /* Scan all of the recorded PCI controllers. */ + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) + scan_phb(hose); + +#ifndef CONFIG_PPC_ISERIES + if (pci_probe_only) + pcibios_claim_of_setup(); + else + /* FIXME: `else' will be removed when + pci_assign_unassigned_resources() is able to work + correctly with [partially] allocated PCI tree. */ + pci_assign_unassigned_resources(); +#endif /* !CONFIG_PPC_ISERIES */ + + /* Call machine dependent final fixup */ + if (ppc_md.pcibios_fixup) + ppc_md.pcibios_fixup(); + + /* Cache the location of the ISA bridge (if we have one) */ + ppc64_isabridge_dev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL); + if (ppc64_isabridge_dev != NULL) + printk("ISA bridge at %s\n", pci_name(ppc64_isabridge_dev)); + +#ifdef CONFIG_PPC_MULTIPLATFORM + /* map in PCI I/O space */ + phbs_remap_io(); +#endif + + printk("PCI: Probing PCI hardware done\n"); + + return 0; +} + +subsys_initcall(pcibios_init); + +char __init *pcibios_setup(char *str) +{ + return str; +} + +int pcibios_enable_device(struct pci_dev *dev, int mask) +{ + u16 cmd, oldcmd; + int i; + + pci_read_config_word(dev, PCI_COMMAND, &cmd); + oldcmd = cmd; + + for (i = 0; i < PCI_NUM_RESOURCES; i++) { + struct resource *res = &dev->resource[i]; + + /* Only set up the requested stuff */ + if (!(mask & (1<<i))) + continue; + + if (res->flags & IORESOURCE_IO) + cmd |= PCI_COMMAND_IO; + if (res->flags & IORESOURCE_MEM) + cmd |= PCI_COMMAND_MEMORY; + } + + if (cmd != oldcmd) { + printk(KERN_DEBUG "PCI: Enabling device: (%s), cmd %x\n", + pci_name(dev), cmd); + /* Enable the appropriate bits in the PCI command register. */ + pci_write_config_word(dev, PCI_COMMAND, cmd); + } + return 0; +} + +/* + * Return the domain number for this bus. + */ +int pci_domain_nr(struct pci_bus *bus) +{ +#ifdef CONFIG_PPC_ISERIES + return 0; +#else + struct pci_controller *hose = pci_bus_to_host(bus); + + return hose->global_number; +#endif +} + +EXPORT_SYMBOL(pci_domain_nr); + +/* Decide whether to display the domain number in /proc */ +int pci_proc_domain(struct pci_bus *bus) +{ +#ifdef CONFIG_PPC_ISERIES + return 0; +#else + struct pci_controller *hose = pci_bus_to_host(bus); + return hose->buid; +#endif +} + +/* + * Platform support for /proc/bus/pci/X/Y mmap()s, + * modelled on the sparc64 implementation by Dave Miller. + * -- paulus. + */ + +/* + * Adjust vm_pgoff of VMA such that it is the physical page offset + * corresponding to the 32-bit pci bus offset for DEV requested by the user. + * + * Basically, the user finds the base address for his device which he wishes + * to mmap. They read the 32-bit value from the config space base register, + * add whatever PAGE_SIZE multiple offset they wish, and feed this into the + * offset parameter of mmap on /proc/bus/pci/XXX for that device. + * + * Returns negative error code on failure, zero on success. + */ +static struct resource *__pci_mmap_make_offset(struct pci_dev *dev, + unsigned long *offset, + enum pci_mmap_state mmap_state) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + unsigned long io_offset = 0; + int i, res_bit; + + if (hose == 0) + return NULL; /* should never happen */ + + /* If memory, add on the PCI bridge address offset */ + if (mmap_state == pci_mmap_mem) { + *offset += hose->pci_mem_offset; + res_bit = IORESOURCE_MEM; + } else { + io_offset = (unsigned long)hose->io_base_virt - pci_io_base; + *offset += io_offset; + res_bit = IORESOURCE_IO; + } + + /* + * Check that the offset requested corresponds to one of the + * resources of the device. + */ + for (i = 0; i <= PCI_ROM_RESOURCE; i++) { + struct resource *rp = &dev->resource[i]; + int flags = rp->flags; + + /* treat ROM as memory (should be already) */ + if (i == PCI_ROM_RESOURCE) + flags |= IORESOURCE_MEM; + + /* Active and same type? */ + if ((flags & res_bit) == 0) + continue; + + /* In the range of this resource? */ + if (*offset < (rp->start & PAGE_MASK) || *offset > rp->end) + continue; + + /* found it! construct the final physical address */ + if (mmap_state == pci_mmap_io) + *offset += hose->io_base_phys - io_offset; + return rp; + } + + return NULL; +} + +/* + * Set vm_page_prot of VMA, as appropriate for this architecture, for a pci + * device mapping. + */ +static pgprot_t __pci_mmap_set_pgprot(struct pci_dev *dev, struct resource *rp, + pgprot_t protection, + enum pci_mmap_state mmap_state, + int write_combine) +{ + unsigned long prot = pgprot_val(protection); + + /* Write combine is always 0 on non-memory space mappings. On + * memory space, if the user didn't pass 1, we check for a + * "prefetchable" resource. This is a bit hackish, but we use + * this to workaround the inability of /sysfs to provide a write + * combine bit + */ + if (mmap_state != pci_mmap_mem) + write_combine = 0; + else if (write_combine == 0) { + if (rp->flags & IORESOURCE_PREFETCH) + write_combine = 1; + } + + /* XXX would be nice to have a way to ask for write-through */ + prot |= _PAGE_NO_CACHE; + if (write_combine) + prot &= ~_PAGE_GUARDED; + else + prot |= _PAGE_GUARDED; + + printk("PCI map for %s:%lx, prot: %lx\n", pci_name(dev), rp->start, + prot); + + return __pgprot(prot); +} + +/* + * This one is used by /dev/mem and fbdev who have no clue about the + * PCI device, it tries to find the PCI device first and calls the + * above routine + */ +pgprot_t pci_phys_mem_access_prot(struct file *file, + unsigned long pfn, + unsigned long size, + pgprot_t protection) +{ + struct pci_dev *pdev = NULL; + struct resource *found = NULL; + unsigned long prot = pgprot_val(protection); + unsigned long offset = pfn << PAGE_SHIFT; + int i; + + if (page_is_ram(pfn)) + return __pgprot(prot); + + prot |= _PAGE_NO_CACHE | _PAGE_GUARDED; + + for_each_pci_dev(pdev) { + for (i = 0; i <= PCI_ROM_RESOURCE; i++) { + struct resource *rp = &pdev->resource[i]; + int flags = rp->flags; + + /* Active and same type? */ + if ((flags & IORESOURCE_MEM) == 0) + continue; + /* In the range of this resource? */ + if (offset < (rp->start & PAGE_MASK) || + offset > rp->end) + continue; + found = rp; + break; + } + if (found) + break; + } + if (found) { + if (found->flags & IORESOURCE_PREFETCH) + prot &= ~_PAGE_GUARDED; + pci_dev_put(pdev); + } + + DBG("non-PCI map for %lx, prot: %lx\n", offset, prot); + + return __pgprot(prot); +} + + +/* + * Perform the actual remap of the pages for a PCI device mapping, as + * appropriate for this architecture. The region in the process to map + * is described by vm_start and vm_end members of VMA, the base physical + * address is found in vm_pgoff. + * The pci device structure is provided so that architectures may make mapping + * decisions on a per-device or per-bus basis. + * + * Returns a negative error code on failure, zero on success. + */ +int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, + enum pci_mmap_state mmap_state, + int write_combine) +{ + unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; + struct resource *rp; + int ret; + + rp = __pci_mmap_make_offset(dev, &offset, mmap_state); + if (rp == NULL) + return -EINVAL; + + vma->vm_pgoff = offset >> PAGE_SHIFT; + vma->vm_flags |= VM_SHM | VM_LOCKED | VM_IO; + vma->vm_page_prot = __pci_mmap_set_pgprot(dev, rp, + vma->vm_page_prot, + mmap_state, write_combine); + + ret = remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, + vma->vm_end - vma->vm_start, vma->vm_page_prot); + + return ret; +} + +#ifdef CONFIG_PPC_MULTIPLATFORM +static ssize_t pci_show_devspec(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct pci_dev *pdev; + struct device_node *np; + + pdev = to_pci_dev (dev); + np = pci_device_to_OF_node(pdev); + if (np == NULL || np->full_name == NULL) + return 0; + return sprintf(buf, "%s", np->full_name); +} +static DEVICE_ATTR(devspec, S_IRUGO, pci_show_devspec, NULL); +#endif /* CONFIG_PPC_MULTIPLATFORM */ + +void pcibios_add_platform_entries(struct pci_dev *pdev) +{ +#ifdef CONFIG_PPC_MULTIPLATFORM + device_create_file(&pdev->dev, &dev_attr_devspec); +#endif /* CONFIG_PPC_MULTIPLATFORM */ +} + +#ifdef CONFIG_PPC_MULTIPLATFORM + +#define ISA_SPACE_MASK 0x1 +#define ISA_SPACE_IO 0x1 + +static void __devinit pci_process_ISA_OF_ranges(struct device_node *isa_node, + unsigned long phb_io_base_phys, + void __iomem * phb_io_base_virt) +{ + struct isa_range *range; + unsigned long pci_addr; + unsigned int isa_addr; + unsigned int size; + int rlen = 0; + + range = (struct isa_range *) get_property(isa_node, "ranges", &rlen); + if (range == NULL || (rlen < sizeof(struct isa_range))) { + printk(KERN_ERR "no ISA ranges or unexpected isa range size," + "mapping 64k\n"); + __ioremap_explicit(phb_io_base_phys, + (unsigned long)phb_io_base_virt, + 0x10000, _PAGE_NO_CACHE | _PAGE_GUARDED); + return; + } + + /* From "ISA Binding to 1275" + * The ranges property is laid out as an array of elements, + * each of which comprises: + * cells 0 - 1: an ISA address + * cells 2 - 4: a PCI address + * (size depending on dev->n_addr_cells) + * cell 5: the size of the range + */ + if ((range->isa_addr.a_hi && ISA_SPACE_MASK) == ISA_SPACE_IO) { + isa_addr = range->isa_addr.a_lo; + pci_addr = (unsigned long) range->pci_addr.a_mid << 32 | + range->pci_addr.a_lo; + + /* Assume these are both zero */ + if ((pci_addr != 0) || (isa_addr != 0)) { + printk(KERN_ERR "unexpected isa to pci mapping: %s\n", + __FUNCTION__); + return; + } + + size = PAGE_ALIGN(range->size); + + __ioremap_explicit(phb_io_base_phys, + (unsigned long) phb_io_base_virt, + size, _PAGE_NO_CACHE | _PAGE_GUARDED); + } +} + +void __devinit pci_process_bridge_OF_ranges(struct pci_controller *hose, + struct device_node *dev, int prim) +{ + unsigned int *ranges, pci_space; + unsigned long size; + int rlen = 0; + int memno = 0; + struct resource *res; + int np, na = prom_n_addr_cells(dev); + unsigned long pci_addr, cpu_phys_addr; + + np = na + 5; + + /* From "PCI Binding to 1275" + * The ranges property is laid out as an array of elements, + * each of which comprises: + * cells 0 - 2: a PCI address + * cells 3 or 3+4: a CPU physical address + * (size depending on dev->n_addr_cells) + * cells 4+5 or 5+6: the size of the range + */ + rlen = 0; + hose->io_base_phys = 0; + ranges = (unsigned int *) get_property(dev, "ranges", &rlen); + while ((rlen -= np * sizeof(unsigned int)) >= 0) { + res = NULL; + pci_space = ranges[0]; + pci_addr = ((unsigned long)ranges[1] << 32) | ranges[2]; + + cpu_phys_addr = ranges[3]; + if (na >= 2) + cpu_phys_addr = (cpu_phys_addr << 32) | ranges[4]; + + size = ((unsigned long)ranges[na+3] << 32) | ranges[na+4]; + ranges += np; + if (size == 0) + continue; + + /* Now consume following elements while they are contiguous */ + while (rlen >= np * sizeof(unsigned int)) { + unsigned long addr, phys; + + if (ranges[0] != pci_space) + break; + addr = ((unsigned long)ranges[1] << 32) | ranges[2]; + phys = ranges[3]; + if (na >= 2) + phys = (phys << 32) | ranges[4]; + if (addr != pci_addr + size || + phys != cpu_phys_addr + size) + break; + + size += ((unsigned long)ranges[na+3] << 32) + | ranges[na+4]; + ranges += np; + rlen -= np * sizeof(unsigned int); + } + + switch ((pci_space >> 24) & 0x3) { + case 1: /* I/O space */ + hose->io_base_phys = cpu_phys_addr; + hose->pci_io_size = size; + + res = &hose->io_resource; + res->flags = IORESOURCE_IO; + res->start = pci_addr; + DBG("phb%d: IO 0x%lx -> 0x%lx\n", hose->global_number, + res->start, res->start + size - 1); + break; + case 2: /* memory space */ + memno = 0; + while (memno < 3 && hose->mem_resources[memno].flags) + ++memno; + + if (memno == 0) + hose->pci_mem_offset = cpu_phys_addr - pci_addr; + if (memno < 3) { + res = &hose->mem_resources[memno]; + res->flags = IORESOURCE_MEM; + res->start = cpu_phys_addr; + DBG("phb%d: MEM 0x%lx -> 0x%lx\n", hose->global_number, + res->start, res->start + size - 1); + } + break; + } + if (res != NULL) { + res->name = dev->full_name; + res->end = res->start + size - 1; + res->parent = NULL; + res->sibling = NULL; + res->child = NULL; + } + } +} + +void __init pci_setup_phb_io(struct pci_controller *hose, int primary) +{ + unsigned long size = hose->pci_io_size; + unsigned long io_virt_offset; + struct resource *res; + struct device_node *isa_dn; + + hose->io_base_virt = reserve_phb_iospace(size); + DBG("phb%d io_base_phys 0x%lx io_base_virt 0x%lx\n", + hose->global_number, hose->io_base_phys, + (unsigned long) hose->io_base_virt); + + if (primary) { + pci_io_base = (unsigned long)hose->io_base_virt; + isa_dn = of_find_node_by_type(NULL, "isa"); + if (isa_dn) { + isa_io_base = pci_io_base; + pci_process_ISA_OF_ranges(isa_dn, hose->io_base_phys, + hose->io_base_virt); + of_node_put(isa_dn); + /* Allow all IO */ + io_page_mask = -1; + } + } + + io_virt_offset = (unsigned long)hose->io_base_virt - pci_io_base; + res = &hose->io_resource; + res->start += io_virt_offset; + res->end += io_virt_offset; +} + +void __devinit pci_setup_phb_io_dynamic(struct pci_controller *hose, + int primary) +{ + unsigned long size = hose->pci_io_size; + unsigned long io_virt_offset; + struct resource *res; + + hose->io_base_virt = __ioremap(hose->io_base_phys, size, + _PAGE_NO_CACHE | _PAGE_GUARDED); + DBG("phb%d io_base_phys 0x%lx io_base_virt 0x%lx\n", + hose->global_number, hose->io_base_phys, + (unsigned long) hose->io_base_virt); + + if (primary) + pci_io_base = (unsigned long)hose->io_base_virt; + + io_virt_offset = (unsigned long)hose->io_base_virt - pci_io_base; + res = &hose->io_resource; + res->start += io_virt_offset; + res->end += io_virt_offset; +} + + +static int get_bus_io_range(struct pci_bus *bus, unsigned long *start_phys, + unsigned long *start_virt, unsigned long *size) +{ + struct pci_controller *hose = pci_bus_to_host(bus); + struct pci_bus_region region; + struct resource *res; + + if (bus->self) { + res = bus->resource[0]; + pcibios_resource_to_bus(bus->self, ®ion, res); + *start_phys = hose->io_base_phys + region.start; + *start_virt = (unsigned long) hose->io_base_virt + + region.start; + if (region.end > region.start) + *size = region.end - region.start + 1; + else { + printk("%s(): unexpected region 0x%lx->0x%lx\n", + __FUNCTION__, region.start, region.end); + return 1; + } + + } else { + /* Root Bus */ + res = &hose->io_resource; + *start_phys = hose->io_base_phys; + *start_virt = (unsigned long) hose->io_base_virt; + if (res->end > res->start) + *size = res->end - res->start + 1; + else { + printk("%s(): unexpected region 0x%lx->0x%lx\n", + __FUNCTION__, res->start, res->end); + return 1; + } + } + + return 0; +} + +int unmap_bus_range(struct pci_bus *bus) +{ + unsigned long start_phys; + unsigned long start_virt; + unsigned long size; + + if (!bus) { + printk(KERN_ERR "%s() expected bus\n", __FUNCTION__); + return 1; + } + + if (get_bus_io_range(bus, &start_phys, &start_virt, &size)) + return 1; + if (iounmap_explicit((void __iomem *) start_virt, size)) + return 1; + + return 0; +} +EXPORT_SYMBOL(unmap_bus_range); + +int remap_bus_range(struct pci_bus *bus) +{ + unsigned long start_phys; + unsigned long start_virt; + unsigned long size; + + if (!bus) { + printk(KERN_ERR "%s() expected bus\n", __FUNCTION__); + return 1; + } + + + if (get_bus_io_range(bus, &start_phys, &start_virt, &size)) + return 1; + printk("mapping IO %lx -> %lx, size: %lx\n", start_phys, start_virt, size); + if (__ioremap_explicit(start_phys, start_virt, size, + _PAGE_NO_CACHE | _PAGE_GUARDED)) + return 1; + + return 0; +} +EXPORT_SYMBOL(remap_bus_range); + +void phbs_remap_io(void) +{ + struct pci_controller *hose, *tmp; + + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) + remap_bus_range(hose->bus); +} + +/* + * ppc64 can have multifunction devices that do not respond to function 0. + * In this case we must scan all functions. + * XXX this can go now, we use the OF device tree in all the + * cases that caused problems. -- paulus + */ +int pcibios_scan_all_fns(struct pci_bus *bus, int devfn) +{ + return 0; +} + +static void __devinit fixup_resource(struct resource *res, struct pci_dev *dev) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + unsigned long start, end, mask, offset; + + if (res->flags & IORESOURCE_IO) { + offset = (unsigned long)hose->io_base_virt - pci_io_base; + + start = res->start += offset; + end = res->end += offset; + + /* Need to allow IO access to pages that are in the + ISA range */ + if (start < MAX_ISA_PORT) { + if (end > MAX_ISA_PORT) + end = MAX_ISA_PORT; + + start >>= PAGE_SHIFT; + end >>= PAGE_SHIFT; + + /* get the range of pages for the map */ + mask = ((1 << (end+1)) - 1) ^ ((1 << start) - 1); + io_page_mask |= mask; + } + } else if (res->flags & IORESOURCE_MEM) { + res->start += hose->pci_mem_offset; + res->end += hose->pci_mem_offset; + } +} + +void __devinit pcibios_fixup_device_resources(struct pci_dev *dev, + struct pci_bus *bus) +{ + /* Update device resources. */ + int i; + + for (i = 0; i < PCI_NUM_RESOURCES; i++) + if (dev->resource[i].flags) + fixup_resource(&dev->resource[i], dev); +} +EXPORT_SYMBOL(pcibios_fixup_device_resources); + +static void __devinit do_bus_setup(struct pci_bus *bus) +{ + struct pci_dev *dev; + + ppc_md.iommu_bus_setup(bus); + + list_for_each_entry(dev, &bus->devices, bus_list) + ppc_md.iommu_dev_setup(dev); + + if (ppc_md.irq_bus_setup) + ppc_md.irq_bus_setup(bus); +} + +void __devinit pcibios_fixup_bus(struct pci_bus *bus) +{ + struct pci_dev *dev = bus->self; + + if (dev && pci_probe_only && + (dev->class >> 8) == PCI_CLASS_BRIDGE_PCI) { + /* This is a subordinate bridge */ + + pci_read_bridge_bases(bus); + pcibios_fixup_device_resources(dev, bus); + } + + do_bus_setup(bus); + + if (!pci_probe_only) + return; + + list_for_each_entry(dev, &bus->devices, bus_list) + if ((dev->class >> 8) != PCI_CLASS_BRIDGE_PCI) + pcibios_fixup_device_resources(dev, bus); +} +EXPORT_SYMBOL(pcibios_fixup_bus); + +/* + * Reads the interrupt pin to determine if interrupt is use by card. + * If the interrupt is used, then gets the interrupt line from the + * openfirmware and sets it in the pci_dev and pci_config line. + */ +int pci_read_irq_line(struct pci_dev *pci_dev) +{ + u8 intpin; + struct device_node *node; + + pci_read_config_byte(pci_dev, PCI_INTERRUPT_PIN, &intpin); + if (intpin == 0) + return 0; + + node = pci_device_to_OF_node(pci_dev); + if (node == NULL) + return -1; + + if (node->n_intrs == 0) + return -1; + + pci_dev->irq = node->intrs[0].line; + + pci_write_config_byte(pci_dev, PCI_INTERRUPT_LINE, pci_dev->irq); + + return 0; +} +EXPORT_SYMBOL(pci_read_irq_line); + +void pci_resource_to_user(const struct pci_dev *dev, int bar, + const struct resource *rsrc, + u64 *start, u64 *end) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + unsigned long offset = 0; + + if (hose == NULL) + return; + + if (rsrc->flags & IORESOURCE_IO) + offset = pci_io_base - (unsigned long)hose->io_base_virt + + hose->io_base_phys; + + *start = rsrc->start + offset; + *end = rsrc->end + offset; +} + +#endif /* CONFIG_PPC_MULTIPLATFORM */ + + +#define IOBASE_BRIDGE_NUMBER 0 +#define IOBASE_MEMORY 1 +#define IOBASE_IO 2 +#define IOBASE_ISA_IO 3 +#define IOBASE_ISA_MEM 4 + +long sys_pciconfig_iobase(long which, unsigned long in_bus, + unsigned long in_devfn) +{ + struct pci_controller* hose; + struct list_head *ln; + struct pci_bus *bus = NULL; + struct device_node *hose_node; + + /* Argh ! Please forgive me for that hack, but that's the + * simplest way to get existing XFree to not lockup on some + * G5 machines... So when something asks for bus 0 io base + * (bus 0 is HT root), we return the AGP one instead. + */ + if (machine_is_compatible("MacRISC4")) + if (in_bus == 0) + in_bus = 0xf0; + + /* That syscall isn't quite compatible with PCI domains, but it's + * used on pre-domains setup. We return the first match + */ + + for (ln = pci_root_buses.next; ln != &pci_root_buses; ln = ln->next) { + bus = pci_bus_b(ln); + if (in_bus >= bus->number && in_bus < (bus->number + bus->subordinate)) + break; + bus = NULL; + } + if (bus == NULL || bus->sysdata == NULL) + return -ENODEV; + + hose_node = (struct device_node *)bus->sysdata; + hose = PCI_DN(hose_node)->phb; + + switch (which) { + case IOBASE_BRIDGE_NUMBER: + return (long)hose->first_busno; + case IOBASE_MEMORY: + return (long)hose->pci_mem_offset; + case IOBASE_IO: + return (long)hose->io_base_phys; + case IOBASE_ISA_IO: + return (long)isa_io_base; + case IOBASE_ISA_MEM: + return -EINVAL; + } + + return -EOPNOTSUPP; +} diff --git a/arch/powerpc/kernel/pci_direct_iommu.c b/arch/powerpc/kernel/pci_direct_iommu.c new file mode 100644 index 00000000000..e1a32f802c0 --- /dev/null +++ b/arch/powerpc/kernel/pci_direct_iommu.c @@ -0,0 +1,94 @@ +/* + * Support for DMA from PCI devices to main memory on + * machines without an iommu or with directly addressable + * RAM (typically a pmac with 2Gb of RAM or less) + * + * Copyright (C) 2003 Benjamin Herrenschmidt (benh@kernel.crashing.org) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/delay.h> +#include <linux/string.h> +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/mm.h> +#include <linux/dma-mapping.h> + +#include <asm/sections.h> +#include <asm/io.h> +#include <asm/prom.h> +#include <asm/pci-bridge.h> +#include <asm/machdep.h> +#include <asm/pmac_feature.h> +#include <asm/abs_addr.h> +#include <asm/ppc-pci.h> + +static void *pci_direct_alloc_coherent(struct device *hwdev, size_t size, + dma_addr_t *dma_handle, gfp_t flag) +{ + void *ret; + + ret = (void *)__get_free_pages(flag, get_order(size)); + if (ret != NULL) { + memset(ret, 0, size); + *dma_handle = virt_to_abs(ret); + } + return ret; +} + +static void pci_direct_free_coherent(struct device *hwdev, size_t size, + void *vaddr, dma_addr_t dma_handle) +{ + free_pages((unsigned long)vaddr, get_order(size)); +} + +static dma_addr_t pci_direct_map_single(struct device *hwdev, void *ptr, + size_t size, enum dma_data_direction direction) +{ + return virt_to_abs(ptr); +} + +static void pci_direct_unmap_single(struct device *hwdev, dma_addr_t dma_addr, + size_t size, enum dma_data_direction direction) +{ +} + +static int pci_direct_map_sg(struct device *hwdev, struct scatterlist *sg, + int nents, enum dma_data_direction direction) +{ + int i; + + for (i = 0; i < nents; i++, sg++) { + sg->dma_address = page_to_phys(sg->page) + sg->offset; + sg->dma_length = sg->length; + } + + return nents; +} + +static void pci_direct_unmap_sg(struct device *hwdev, struct scatterlist *sg, + int nents, enum dma_data_direction direction) +{ +} + +static int pci_direct_dma_supported(struct device *dev, u64 mask) +{ + return mask < 0x100000000ull; +} + +void __init pci_direct_iommu_init(void) +{ + pci_dma_ops.alloc_coherent = pci_direct_alloc_coherent; + pci_dma_ops.free_coherent = pci_direct_free_coherent; + pci_dma_ops.map_single = pci_direct_map_single; + pci_dma_ops.unmap_single = pci_direct_unmap_single; + pci_dma_ops.map_sg = pci_direct_map_sg; + pci_dma_ops.unmap_sg = pci_direct_unmap_sg; + pci_dma_ops.dma_supported = pci_direct_dma_supported; +} diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c new file mode 100644 index 00000000000..12c4c9e9bbc --- /dev/null +++ b/arch/powerpc/kernel/pci_dn.c @@ -0,0 +1,230 @@ +/* + * pci_dn.c + * + * Copyright (C) 2001 Todd Inglett, IBM Corporation + * + * PCI manipulation via device_nodes. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/string.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/bootmem.h> + +#include <asm/io.h> +#include <asm/prom.h> +#include <asm/pci-bridge.h> +#include <asm/pSeries_reconfig.h> +#include <asm/ppc-pci.h> + +/* + * Traverse_func that inits the PCI fields of the device node. + * NOTE: this *must* be done before read/write config to the device. + */ +static void * __devinit update_dn_pci_info(struct device_node *dn, void *data) +{ + struct pci_controller *phb = data; + int *type = (int *)get_property(dn, "ibm,pci-config-space-type", NULL); + u32 *regs; + struct pci_dn *pdn; + + if (mem_init_done) + pdn = kmalloc(sizeof(*pdn), GFP_KERNEL); + else + pdn = alloc_bootmem(sizeof(*pdn)); + if (pdn == NULL) + return NULL; + memset(pdn, 0, sizeof(*pdn)); + dn->data = pdn; + pdn->node = dn; + pdn->phb = phb; + regs = (u32 *)get_property(dn, "reg", NULL); + if (regs) { + /* First register entry is addr (00BBSS00) */ + pdn->busno = (regs[0] >> 16) & 0xff; + pdn->devfn = (regs[0] >> 8) & 0xff; + } + + pdn->pci_ext_config_space = (type && *type == 1); + return NULL; +} + +/* + * Traverse a device tree stopping each PCI device in the tree. + * This is done depth first. As each node is processed, a "pre" + * function is called and the children are processed recursively. + * + * The "pre" func returns a value. If non-zero is returned from + * the "pre" func, the traversal stops and this value is returned. + * This return value is useful when using traverse as a method of + * finding a device. + * + * NOTE: we do not run the func for devices that do not appear to + * be PCI except for the start node which we assume (this is good + * because the start node is often a phb which may be missing PCI + * properties). + * We use the class-code as an indicator. If we run into + * one of these nodes we also assume its siblings are non-pci for + * performance. + */ +void *traverse_pci_devices(struct device_node *start, traverse_func pre, + void *data) +{ + struct device_node *dn, *nextdn; + void *ret; + + /* We started with a phb, iterate all childs */ + for (dn = start->child; dn; dn = nextdn) { + u32 *classp, class; + + nextdn = NULL; + classp = (u32 *)get_property(dn, "class-code", NULL); + class = classp ? *classp : 0; + + if (pre && ((ret = pre(dn, data)) != NULL)) + return ret; + + /* If we are a PCI bridge, go down */ + if (dn->child && ((class >> 8) == PCI_CLASS_BRIDGE_PCI || + (class >> 8) == PCI_CLASS_BRIDGE_CARDBUS)) + /* Depth first...do children */ + nextdn = dn->child; + else if (dn->sibling) + /* ok, try next sibling instead. */ + nextdn = dn->sibling; + if (!nextdn) { + /* Walk up to next valid sibling. */ + do { + dn = dn->parent; + if (dn == start) + return NULL; + } while (dn->sibling == NULL); + nextdn = dn->sibling; + } + } + return NULL; +} + +/** + * pci_devs_phb_init_dynamic - setup pci devices under this PHB + * phb: pci-to-host bridge (top-level bridge connecting to cpu) + * + * This routine is called both during boot, (before the memory + * subsystem is set up, before kmalloc is valid) and during the + * dynamic lpar operation of adding a PHB to a running system. + */ +void __devinit pci_devs_phb_init_dynamic(struct pci_controller *phb) +{ + struct device_node * dn = (struct device_node *) phb->arch_data; + struct pci_dn *pdn; + + /* PHB nodes themselves must not match */ + update_dn_pci_info(dn, phb); + pdn = dn->data; + if (pdn) { + pdn->devfn = pdn->busno = -1; + pdn->phb = phb; + } + + /* Update dn->phb ptrs for new phb and children devices */ + traverse_pci_devices(dn, update_dn_pci_info, phb); +} + +/* + * Traversal func that looks for a <busno,devfcn> value. + * If found, the pci_dn is returned (thus terminating the traversal). + */ +static void *is_devfn_node(struct device_node *dn, void *data) +{ + int busno = ((unsigned long)data >> 8) & 0xff; + int devfn = ((unsigned long)data) & 0xff; + struct pci_dn *pci = dn->data; + + if (pci && (devfn == pci->devfn) && (busno == pci->busno)) + return dn; + return NULL; +} + +/* + * This is the "slow" path for looking up a device_node from a + * pci_dev. It will hunt for the device under its parent's + * phb and then update sysdata for a future fastpath. + * + * It may also do fixups on the actual device since this happens + * on the first read/write. + * + * Note that it also must deal with devices that don't exist. + * In this case it may probe for real hardware ("just in case") + * and add a device_node to the device tree if necessary. + * + */ +struct device_node *fetch_dev_dn(struct pci_dev *dev) +{ + struct device_node *orig_dn = dev->sysdata; + struct device_node *dn; + unsigned long searchval = (dev->bus->number << 8) | dev->devfn; + + dn = traverse_pci_devices(orig_dn, is_devfn_node, (void *)searchval); + if (dn) + dev->sysdata = dn; + return dn; +} +EXPORT_SYMBOL(fetch_dev_dn); + +static int pci_dn_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *node) +{ + struct device_node *np = node; + struct pci_dn *pci = NULL; + int err = NOTIFY_OK; + + switch (action) { + case PSERIES_RECONFIG_ADD: + pci = np->parent->data; + if (pci) + update_dn_pci_info(np, pci->phb); + break; + default: + err = NOTIFY_DONE; + break; + } + return err; +} + +static struct notifier_block pci_dn_reconfig_nb = { + .notifier_call = pci_dn_reconfig_notifier, +}; + +/** + * pci_devs_phb_init - Initialize phbs and pci devs under them. + * + * This routine walks over all phb's (pci-host bridges) on the + * system, and sets up assorted pci-related structures + * (including pci info in the device node structs) for each + * pci device found underneath. This routine runs once, + * early in the boot sequence. + */ +void __init pci_devs_phb_init(void) +{ + struct pci_controller *phb, *tmp; + + /* This must be done first so the device nodes have valid pci info! */ + list_for_each_entry_safe(phb, tmp, &hose_list, list_node) + pci_devs_phb_init_dynamic(phb); + + pSeries_reconfig_notifier_register(&pci_dn_reconfig_nb); +} diff --git a/arch/powerpc/kernel/pci_iommu.c b/arch/powerpc/kernel/pci_iommu.c new file mode 100644 index 00000000000..bdf15dbbf4f --- /dev/null +++ b/arch/powerpc/kernel/pci_iommu.c @@ -0,0 +1,128 @@ +/* + * arch/ppc64/kernel/pci_iommu.c + * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation + * + * Rewrite, cleanup, new allocation schemes: + * Copyright (C) 2004 Olof Johansson, IBM Corporation + * + * Dynamic DMA mapping support, platform-independent parts. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + + +#include <linux/init.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/spinlock.h> +#include <linux/string.h> +#include <linux/pci.h> +#include <linux/dma-mapping.h> +#include <asm/io.h> +#include <asm/prom.h> +#include <asm/iommu.h> +#include <asm/pci-bridge.h> +#include <asm/machdep.h> +#include <asm/ppc-pci.h> + +/* + * We can use ->sysdata directly and avoid the extra work in + * pci_device_to_OF_node since ->sysdata will have been initialised + * in the iommu init code for all devices. + */ +#define PCI_GET_DN(dev) ((struct device_node *)((dev)->sysdata)) + +static inline struct iommu_table *devnode_table(struct device *dev) +{ + struct pci_dev *pdev; + + if (!dev) { + pdev = ppc64_isabridge_dev; + if (!pdev) + return NULL; + } else + pdev = to_pci_dev(dev); + + return PCI_DN(PCI_GET_DN(pdev))->iommu_table; +} + + +/* Allocates a contiguous real buffer and creates mappings over it. + * Returns the virtual address of the buffer and sets dma_handle + * to the dma address (mapping) of the first page. + */ +static void *pci_iommu_alloc_coherent(struct device *hwdev, size_t size, + dma_addr_t *dma_handle, gfp_t flag) +{ + return iommu_alloc_coherent(devnode_table(hwdev), size, dma_handle, + flag); +} + +static void pci_iommu_free_coherent(struct device *hwdev, size_t size, + void *vaddr, dma_addr_t dma_handle) +{ + iommu_free_coherent(devnode_table(hwdev), size, vaddr, dma_handle); +} + +/* Creates TCEs for a user provided buffer. The user buffer must be + * contiguous real kernel storage (not vmalloc). The address of the buffer + * passed here is the kernel (virtual) address of the buffer. The buffer + * need not be page aligned, the dma_addr_t returned will point to the same + * byte within the page as vaddr. + */ +static dma_addr_t pci_iommu_map_single(struct device *hwdev, void *vaddr, + size_t size, enum dma_data_direction direction) +{ + return iommu_map_single(devnode_table(hwdev), vaddr, size, direction); +} + + +static void pci_iommu_unmap_single(struct device *hwdev, dma_addr_t dma_handle, + size_t size, enum dma_data_direction direction) +{ + iommu_unmap_single(devnode_table(hwdev), dma_handle, size, direction); +} + + +static int pci_iommu_map_sg(struct device *pdev, struct scatterlist *sglist, + int nelems, enum dma_data_direction direction) +{ + return iommu_map_sg(pdev, devnode_table(pdev), sglist, + nelems, direction); +} + +static void pci_iommu_unmap_sg(struct device *pdev, struct scatterlist *sglist, + int nelems, enum dma_data_direction direction) +{ + iommu_unmap_sg(devnode_table(pdev), sglist, nelems, direction); +} + +/* We support DMA to/from any memory page via the iommu */ +static int pci_iommu_dma_supported(struct device *dev, u64 mask) +{ + return 1; +} + +void pci_iommu_init(void) +{ + pci_dma_ops.alloc_coherent = pci_iommu_alloc_coherent; + pci_dma_ops.free_coherent = pci_iommu_free_coherent; + pci_dma_ops.map_single = pci_iommu_map_single; + pci_dma_ops.unmap_single = pci_iommu_unmap_single; + pci_dma_ops.map_sg = pci_iommu_map_sg; + pci_dma_ops.unmap_sg = pci_iommu_unmap_sg; + pci_dma_ops.dma_supported = pci_iommu_dma_supported; +} diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c index 47d6f7e2ea9..5dcf4ba05ee 100644 --- a/arch/powerpc/kernel/ppc_ksyms.c +++ b/arch/powerpc/kernel/ppc_ksyms.c @@ -44,6 +44,7 @@ #include <asm/cputable.h> #include <asm/btext.h> #include <asm/div64.h> +#include <asm/signal.h> #ifdef CONFIG_8xx #include <asm/commproc.h> @@ -56,7 +57,6 @@ extern void machine_check_exception(struct pt_regs *regs); extern void alignment_exception(struct pt_regs *regs); extern void program_check_exception(struct pt_regs *regs); extern void single_step_exception(struct pt_regs *regs); -extern int do_signal(sigset_t *, struct pt_regs *); extern int pmac_newworld; extern int sys_sigreturn(struct pt_regs *regs); @@ -188,9 +188,6 @@ EXPORT_SYMBOL(adb_try_handler_change); EXPORT_SYMBOL(cuda_request); EXPORT_SYMBOL(cuda_poll); #endif /* CONFIG_ADB_CUDA */ -#if defined(CONFIG_PPC_MULTIPLATFORM) && defined(CONFIG_PPC32) -EXPORT_SYMBOL(_machine); -#endif #ifdef CONFIG_PPC_PMAC EXPORT_SYMBOL(sys_ctrler); #endif diff --git a/arch/powerpc/kernel/proc_ppc64.c b/arch/powerpc/kernel/proc_ppc64.c new file mode 100644 index 00000000000..7ba42a405f4 --- /dev/null +++ b/arch/powerpc/kernel/proc_ppc64.c @@ -0,0 +1,126 @@ +/* + * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/config.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/proc_fs.h> +#include <linux/slab.h> +#include <linux/kernel.h> + +#include <asm/vdso_datapage.h> +#include <asm/rtas.h> +#include <asm/uaccess.h> +#include <asm/prom.h> + +static loff_t page_map_seek( struct file *file, loff_t off, int whence); +static ssize_t page_map_read( struct file *file, char __user *buf, size_t nbytes, + loff_t *ppos); +static int page_map_mmap( struct file *file, struct vm_area_struct *vma ); + +static struct file_operations page_map_fops = { + .llseek = page_map_seek, + .read = page_map_read, + .mmap = page_map_mmap +}; + +/* + * Create the ppc64 and ppc64/rtas directories early. This allows us to + * assume that they have been previously created in drivers. + */ +static int __init proc_ppc64_create(void) +{ + struct proc_dir_entry *root; + + root = proc_mkdir("ppc64", NULL); + if (!root) + return 1; + + if (!(platform_is_pseries() || _machine == PLATFORM_CELL)) + return 0; + + if (!proc_mkdir("rtas", root)) + return 1; + + if (!proc_symlink("rtas", NULL, "ppc64/rtas")) + return 1; + + return 0; +} +core_initcall(proc_ppc64_create); + +static int __init proc_ppc64_init(void) +{ + struct proc_dir_entry *pde; + + pde = create_proc_entry("ppc64/systemcfg", S_IFREG|S_IRUGO, NULL); + if (!pde) + return 1; + pde->nlink = 1; + pde->data = vdso_data; + pde->size = PAGE_SIZE; + pde->proc_fops = &page_map_fops; + + return 0; +} +__initcall(proc_ppc64_init); + +static loff_t page_map_seek( struct file *file, loff_t off, int whence) +{ + loff_t new; + struct proc_dir_entry *dp = PDE(file->f_dentry->d_inode); + + switch(whence) { + case 0: + new = off; + break; + case 1: + new = file->f_pos + off; + break; + case 2: + new = dp->size + off; + break; + default: + return -EINVAL; + } + if ( new < 0 || new > dp->size ) + return -EINVAL; + return (file->f_pos = new); +} + +static ssize_t page_map_read( struct file *file, char __user *buf, size_t nbytes, + loff_t *ppos) +{ + struct proc_dir_entry *dp = PDE(file->f_dentry->d_inode); + return simple_read_from_buffer(buf, nbytes, ppos, dp->data, dp->size); +} + +static int page_map_mmap( struct file *file, struct vm_area_struct *vma ) +{ + struct proc_dir_entry *dp = PDE(file->f_dentry->d_inode); + + vma->vm_flags |= VM_SHM | VM_LOCKED; + + if ((vma->vm_end - vma->vm_start) > dp->size) + return -EINVAL; + + remap_pfn_range(vma, vma->vm_start, __pa(dp->data) >> PAGE_SHIFT, + dp->size, vma->vm_page_prot); + return 0; +} + diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 96843211cc5..de69fb37c73 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -46,10 +46,10 @@ #include <asm/processor.h> #include <asm/mmu.h> #include <asm/prom.h> +#include <asm/machdep.h> #ifdef CONFIG_PPC64 #include <asm/firmware.h> #include <asm/time.h> -#include <asm/machdep.h> #endif extern unsigned long _get_SP(void); @@ -203,10 +203,8 @@ int dump_spe(struct pt_regs *regs, elf_vrregset_t *evrregs) int set_dabr(unsigned long dabr) { -#ifdef CONFIG_PPC64 if (ppc_md.set_dabr) return ppc_md.set_dabr(dabr); -#endif mtspr(SPRN_DABR, dabr); return 0; @@ -554,12 +552,10 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, #ifdef CONFIG_PPC64 if (cpu_has_feature(CPU_FTR_SLB)) { unsigned long sp_vsid = get_kernel_vsid(sp); + unsigned long llp = mmu_psize_defs[mmu_linear_psize].sllp; sp_vsid <<= SLB_VSID_SHIFT; - sp_vsid |= SLB_VSID_KERNEL; - if (cpu_has_feature(CPU_FTR_16M_PAGE)) - sp_vsid |= SLB_VSID_L; - + sp_vsid |= SLB_VSID_KERNEL | llp; p->thread.ksp_vsid = sp_vsid; } diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index eec2da69550..3bf968e7409 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -48,9 +48,6 @@ #include <asm/machdep.h> #include <asm/pSeries_reconfig.h> #include <asm/pci-bridge.h> -#ifdef CONFIG_PPC64 -#include <asm/systemcfg.h> -#endif #ifdef DEBUG #define DBG(fmt...) printk(KERN_ERR fmt) @@ -74,10 +71,6 @@ struct isa_reg_property { typedef int interpret_func(struct device_node *, unsigned long *, int, int, int); -extern struct rtas_t rtas; -extern struct lmb lmb; -extern unsigned long klimit; - static int __initdata dt_root_addr_cells; static int __initdata dt_root_size_cells; @@ -391,7 +384,7 @@ static int __devinit finish_node_interrupts(struct device_node *np, #ifdef CONFIG_PPC64 /* We offset irq numbers for the u3 MPIC by 128 in PowerMac */ - if (systemcfg->platform == PLATFORM_POWERMAC && ic && ic->parent) { + if (_machine == PLATFORM_POWERMAC && ic && ic->parent) { char *name = get_property(ic->parent, "name", NULL); if (name && !strcmp(name, "u3")) np->intrs[intrcount].line += 128; @@ -724,10 +717,10 @@ static inline char *find_flat_dt_string(u32 offset) * used to extract the memory informations at boot before we can * unflatten the tree */ -static int __init scan_flat_dt(int (*it)(unsigned long node, - const char *uname, int depth, - void *data), - void *data) +int __init of_scan_flat_dt(int (*it)(unsigned long node, + const char *uname, int depth, + void *data), + void *data) { unsigned long p = ((unsigned long)initial_boot_params) + initial_boot_params->off_dt_struct; @@ -784,8 +777,8 @@ static int __init scan_flat_dt(int (*it)(unsigned long node, * This function can be used within scan_flattened_dt callback to get * access to properties */ -static void* __init get_flat_dt_prop(unsigned long node, const char *name, - unsigned long *size) +void* __init of_get_flat_dt_prop(unsigned long node, const char *name, + unsigned long *size) { unsigned long p = node; @@ -1087,27 +1080,14 @@ void __init unflatten_device_tree(void) static int __init early_init_dt_scan_cpus(unsigned long node, const char *uname, int depth, void *data) { - char *type = get_flat_dt_prop(node, "device_type", NULL); u32 *prop; - unsigned long size = 0; + unsigned long size; + char *type = of_get_flat_dt_prop(node, "device_type", &size); /* We are scanning "cpu" nodes only */ if (type == NULL || strcmp(type, "cpu") != 0) return 0; -#ifdef CONFIG_PPC_PSERIES - /* On LPAR, look for the first ibm,pft-size property for the hash table size - */ - if (systemcfg->platform == PLATFORM_PSERIES_LPAR && ppc64_pft_size == 0) { - u32 *pft_size; - pft_size = get_flat_dt_prop(node, "ibm,pft-size", NULL); - if (pft_size != NULL) { - /* pft_size[0] is the NUMA CEC cookie */ - ppc64_pft_size = pft_size[1]; - } - } -#endif - boot_cpuid = 0; boot_cpuid_phys = 0; if (initial_boot_params && initial_boot_params->version >= 2) { @@ -1117,8 +1097,9 @@ static int __init early_init_dt_scan_cpus(unsigned long node, boot_cpuid_phys = initial_boot_params->boot_cpuid_phys; } else { /* Check if it's the boot-cpu, set it's hw index now */ - if (get_flat_dt_prop(node, "linux,boot-cpu", NULL) != NULL) { - prop = get_flat_dt_prop(node, "reg", NULL); + if (of_get_flat_dt_prop(node, + "linux,boot-cpu", NULL) != NULL) { + prop = of_get_flat_dt_prop(node, "reg", NULL); if (prop != NULL) boot_cpuid_phys = *prop; } @@ -1127,14 +1108,14 @@ static int __init early_init_dt_scan_cpus(unsigned long node, #ifdef CONFIG_ALTIVEC /* Check if we have a VMX and eventually update CPU features */ - prop = (u32 *)get_flat_dt_prop(node, "ibm,vmx", &size); + prop = (u32 *)of_get_flat_dt_prop(node, "ibm,vmx", NULL); if (prop && (*prop) > 0) { cur_cpu_spec->cpu_features |= CPU_FTR_ALTIVEC; cur_cpu_spec->cpu_user_features |= PPC_FEATURE_HAS_ALTIVEC; } /* Same goes for Apple's "altivec" property */ - prop = (u32 *)get_flat_dt_prop(node, "altivec", NULL); + prop = (u32 *)of_get_flat_dt_prop(node, "altivec", NULL); if (prop) { cur_cpu_spec->cpu_features |= CPU_FTR_ALTIVEC; cur_cpu_spec->cpu_user_features |= PPC_FEATURE_HAS_ALTIVEC; @@ -1147,7 +1128,7 @@ static int __init early_init_dt_scan_cpus(unsigned long node, * this by looking at the size of the ibm,ppc-interrupt-server#s * property */ - prop = (u32 *)get_flat_dt_prop(node, "ibm,ppc-interrupt-server#s", + prop = (u32 *)of_get_flat_dt_prop(node, "ibm,ppc-interrupt-server#s", &size); cur_cpu_spec->cpu_features &= ~CPU_FTR_SMT; if (prop && ((size / sizeof(u32)) > 1)) @@ -1170,34 +1151,30 @@ static int __init early_init_dt_scan_chosen(unsigned long node, return 0; /* get platform type */ - prop = (u32 *)get_flat_dt_prop(node, "linux,platform", NULL); + prop = (u32 *)of_get_flat_dt_prop(node, "linux,platform", NULL); if (prop == NULL) return 0; -#ifdef CONFIG_PPC64 - systemcfg->platform = *prop; -#else #ifdef CONFIG_PPC_MULTIPLATFORM _machine = *prop; #endif -#endif #ifdef CONFIG_PPC64 /* check if iommu is forced on or off */ - if (get_flat_dt_prop(node, "linux,iommu-off", NULL) != NULL) + if (of_get_flat_dt_prop(node, "linux,iommu-off", NULL) != NULL) iommu_is_off = 1; - if (get_flat_dt_prop(node, "linux,iommu-force-on", NULL) != NULL) + if (of_get_flat_dt_prop(node, "linux,iommu-force-on", NULL) != NULL) iommu_force_on = 1; #endif - lprop = get_flat_dt_prop(node, "linux,memory-limit", NULL); + lprop = of_get_flat_dt_prop(node, "linux,memory-limit", NULL); if (lprop) memory_limit = *lprop; #ifdef CONFIG_PPC64 - lprop = get_flat_dt_prop(node, "linux,tce-alloc-start", NULL); + lprop = of_get_flat_dt_prop(node, "linux,tce-alloc-start", NULL); if (lprop) tce_alloc_start = *lprop; - lprop = get_flat_dt_prop(node, "linux,tce-alloc-end", NULL); + lprop = of_get_flat_dt_prop(node, "linux,tce-alloc-end", NULL); if (lprop) tce_alloc_end = *lprop; #endif @@ -1209,9 +1186,9 @@ static int __init early_init_dt_scan_chosen(unsigned long node, { u64 *basep, *entryp; - basep = get_flat_dt_prop(node, "linux,rtas-base", NULL); - entryp = get_flat_dt_prop(node, "linux,rtas-entry", NULL); - prop = get_flat_dt_prop(node, "linux,rtas-size", NULL); + basep = of_get_flat_dt_prop(node, "linux,rtas-base", NULL); + entryp = of_get_flat_dt_prop(node, "linux,rtas-entry", NULL); + prop = of_get_flat_dt_prop(node, "linux,rtas-size", NULL); if (basep && entryp && prop) { rtas.base = *basep; rtas.entry = *entryp; @@ -1232,11 +1209,11 @@ static int __init early_init_dt_scan_root(unsigned long node, if (depth != 0) return 0; - prop = get_flat_dt_prop(node, "#size-cells", NULL); + prop = of_get_flat_dt_prop(node, "#size-cells", NULL); dt_root_size_cells = (prop == NULL) ? 1 : *prop; DBG("dt_root_size_cells = %x\n", dt_root_size_cells); - prop = get_flat_dt_prop(node, "#address-cells", NULL); + prop = of_get_flat_dt_prop(node, "#address-cells", NULL); dt_root_addr_cells = (prop == NULL) ? 2 : *prop; DBG("dt_root_addr_cells = %x\n", dt_root_addr_cells); @@ -1271,15 +1248,22 @@ static unsigned long __init dt_mem_next_cell(int s, cell_t **cellp) static int __init early_init_dt_scan_memory(unsigned long node, const char *uname, int depth, void *data) { - char *type = get_flat_dt_prop(node, "device_type", NULL); + char *type = of_get_flat_dt_prop(node, "device_type", NULL); cell_t *reg, *endp; unsigned long l; /* We are scanning "memory" nodes only */ - if (type == NULL || strcmp(type, "memory") != 0) + if (type == NULL) { + /* + * The longtrail doesn't have a device_type on the + * /memory node, so look for the node called /memory@0. + */ + if (depth != 1 || strcmp(uname, "memory@0") != 0) + return 0; + } else if (strcmp(type, "memory") != 0) return 0; - reg = (cell_t *)get_flat_dt_prop(node, "reg", &l); + reg = (cell_t *)of_get_flat_dt_prop(node, "reg", &l); if (reg == NULL) return 0; @@ -1343,17 +1327,14 @@ void __init early_init_devtree(void *params) * device-tree, including the platform type, initrd location and * size, TCE reserve, and more ... */ - scan_flat_dt(early_init_dt_scan_chosen, NULL); + of_scan_flat_dt(early_init_dt_scan_chosen, NULL); /* Scan memory nodes and rebuild LMBs */ lmb_init(); - scan_flat_dt(early_init_dt_scan_root, NULL); - scan_flat_dt(early_init_dt_scan_memory, NULL); + of_scan_flat_dt(early_init_dt_scan_root, NULL); + of_scan_flat_dt(early_init_dt_scan_memory, NULL); lmb_enforce_memory_limit(memory_limit); lmb_analyze(); -#ifdef CONFIG_PPC64 - systemcfg->physicalMemorySize = lmb_phys_mem_size(); -#endif lmb_reserve(0, __pa(klimit)); DBG("Phys. mem: %lx\n", lmb_phys_mem_size()); @@ -1363,10 +1344,10 @@ void __init early_init_devtree(void *params) DBG("Scanning CPUs ...\n"); - /* Retreive hash table size from flattened tree plus other - * CPU related informations (altivec support, boot CPU ID, ...) + /* Retreive CPU related informations from the flat tree + * (altivec support, boot CPU ID, ...) */ - scan_flat_dt(early_init_dt_scan_cpus, NULL); + of_scan_flat_dt(early_init_dt_scan_cpus, NULL); DBG(" <- early_init_devtree()\n"); } @@ -1387,6 +1368,7 @@ prom_n_addr_cells(struct device_node* np) /* No #address-cells property for the root node, default to 1 */ return 1; } +EXPORT_SYMBOL(prom_n_addr_cells); int prom_n_size_cells(struct device_node* np) @@ -1402,6 +1384,7 @@ prom_n_size_cells(struct device_node* np) /* No #size-cells property for the root node, default to 1 */ return 1; } +EXPORT_SYMBOL(prom_n_size_cells); /** * Work out the sense (active-low level / active-high edge) @@ -1920,7 +1903,7 @@ static int of_finish_dynamic_node(struct device_node *node, /* We don't support that function on PowerMac, at least * not yet */ - if (systemcfg->platform == PLATFORM_POWERMAC) + if (_machine == PLATFORM_POWERMAC) return -ENODEV; /* fix up new node's linux_phandle field */ @@ -1986,14 +1969,31 @@ EXPORT_SYMBOL(get_property); /* * Add a property to a node */ -void prom_add_property(struct device_node* np, struct property* prop) +int prom_add_property(struct device_node* np, struct property* prop) { - struct property **next = &np->properties; + struct property **next; prop->next = NULL; - while (*next) + write_lock(&devtree_lock); + next = &np->properties; + while (*next) { + if (strcmp(prop->name, (*next)->name) == 0) { + /* duplicate ! don't insert it */ + write_unlock(&devtree_lock); + return -1; + } next = &(*next)->next; + } *next = prop; + write_unlock(&devtree_lock); + +#ifdef CONFIG_PROC_DEVICETREE + /* try to add to proc as well if it was initialized */ + if (np->pde) + proc_device_tree_add_prop(np->pde, prop); +#endif /* CONFIG_PROC_DEVICETREE */ + + return 0; } /* I quickly hacked that one, check against spec ! */ diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index c758b6624d7..4ce0105c308 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -94,11 +94,17 @@ extern const struct linux_logo logo_linux_clut224; #ifdef CONFIG_PPC64 #define RELOC(x) (*PTRRELOC(&(x))) #define ADDR(x) (u32) add_reloc_offset((unsigned long)(x)) +#define OF_WORKAROUNDS 0 #else #define RELOC(x) (x) #define ADDR(x) (u32) (x) +#define OF_WORKAROUNDS of_workarounds +int of_workarounds; #endif +#define OF_WA_CLAIM 1 /* do phys/virt claim separately, then map */ +#define OF_WA_LONGTRAIL 2 /* work around longtrail bugs */ + #define PROM_BUG() do { \ prom_printf("kernel BUG at %s line 0x%x!\n", \ RELOC(__FILE__), __LINE__); \ @@ -111,11 +117,6 @@ extern const struct linux_logo logo_linux_clut224; #define prom_debug(x...) #endif -#ifdef CONFIG_PPC32 -#define PLATFORM_POWERMAC _MACH_Pmac -#define PLATFORM_CHRP _MACH_chrp -#endif - typedef u32 prom_arg_t; @@ -128,10 +129,11 @@ struct prom_args { struct prom_t { ihandle root; - ihandle chosen; + phandle chosen; int cpu; ihandle stdout; ihandle mmumap; + ihandle memory; }; struct mem_map_entry { @@ -360,16 +362,36 @@ static void __init prom_printf(const char *format, ...) static unsigned int __init prom_claim(unsigned long virt, unsigned long size, unsigned long align) { - int ret; struct prom_t *_prom = &RELOC(prom); - ret = call_prom("claim", 3, 1, (prom_arg_t)virt, (prom_arg_t)size, - (prom_arg_t)align); - if (ret != -1 && _prom->mmumap != 0) - /* old pmacs need us to map as well */ + if (align == 0 && (OF_WORKAROUNDS & OF_WA_CLAIM)) { + /* + * Old OF requires we claim physical and virtual separately + * and then map explicitly (assuming virtual mode) + */ + int ret; + prom_arg_t result; + + ret = call_prom_ret("call-method", 5, 2, &result, + ADDR("claim"), _prom->memory, + align, size, virt); + if (ret != 0 || result == -1) + return -1; + ret = call_prom_ret("call-method", 5, 2, &result, + ADDR("claim"), _prom->mmumap, + align, size, virt); + if (ret != 0) { + call_prom("call-method", 4, 1, ADDR("release"), + _prom->memory, size, virt); + return -1; + } + /* the 0x12 is M (coherence) + PP == read/write */ call_prom("call-method", 6, 1, - ADDR("map"), _prom->mmumap, 0, size, virt, virt); - return ret; + ADDR("map"), _prom->mmumap, 0x12, size, virt, virt); + return virt; + } + return call_prom("claim", 3, 1, (prom_arg_t)virt, (prom_arg_t)size, + (prom_arg_t)align); } static void __init __attribute__((noreturn)) prom_panic(const char *reason) @@ -403,23 +425,64 @@ static int __init prom_next_node(phandle *nodep) } } -static int __init prom_getprop(phandle node, const char *pname, +static int inline prom_getprop(phandle node, const char *pname, void *value, size_t valuelen) { return call_prom("getprop", 4, 1, node, ADDR(pname), (u32)(unsigned long) value, (u32) valuelen); } -static int __init prom_getproplen(phandle node, const char *pname) +static int inline prom_getproplen(phandle node, const char *pname) { return call_prom("getproplen", 2, 1, node, ADDR(pname)); } -static int __init prom_setprop(phandle node, const char *pname, - void *value, size_t valuelen) +static void add_string(char **str, const char *q) { - return call_prom("setprop", 4, 1, node, ADDR(pname), - (u32)(unsigned long) value, (u32) valuelen); + char *p = *str; + + while (*q) + *p++ = *q++; + *p++ = ' '; + *str = p; +} + +static char *tohex(unsigned int x) +{ + static char digits[] = "0123456789abcdef"; + static char result[9]; + int i; + + result[8] = 0; + i = 8; + do { + --i; + result[i] = digits[x & 0xf]; + x >>= 4; + } while (x != 0 && i > 0); + return &result[i]; +} + +static int __init prom_setprop(phandle node, const char *nodename, + const char *pname, void *value, size_t valuelen) +{ + char cmd[256], *p; + + if (!(OF_WORKAROUNDS & OF_WA_LONGTRAIL)) + return call_prom("setprop", 4, 1, node, ADDR(pname), + (u32)(unsigned long) value, (u32) valuelen); + + /* gah... setprop doesn't work on longtrail, have to use interpret */ + p = cmd; + add_string(&p, "dev"); + add_string(&p, nodename); + add_string(&p, tohex((u32)(unsigned long) value)); + add_string(&p, tohex(valuelen)); + add_string(&p, tohex(ADDR(pname))); + add_string(&p, tohex(strlen(RELOC(pname)))); + add_string(&p, "property"); + *p = 0; + return call_prom("interpret", 1, 1, (u32)(unsigned long) cmd); } /* We can't use the standard versions because of RELOC headaches. */ @@ -980,7 +1043,7 @@ static void __init prom_instantiate_rtas(void) rtas_inst = call_prom("open", 1, 1, ADDR("/rtas")); if (!IHANDLE_VALID(rtas_inst)) { - prom_printf("opening rtas package failed"); + prom_printf("opening rtas package failed (%x)\n", rtas_inst); return; } @@ -988,7 +1051,7 @@ static void __init prom_instantiate_rtas(void) if (call_prom_ret("call-method", 3, 2, &entry, ADDR("instantiate-rtas"), - rtas_inst, base) == PROM_ERROR + rtas_inst, base) != 0 || entry == 0) { prom_printf(" failed\n"); return; @@ -997,8 +1060,10 @@ static void __init prom_instantiate_rtas(void) reserve_mem(base, size); - prom_setprop(rtas_node, "linux,rtas-base", &base, sizeof(base)); - prom_setprop(rtas_node, "linux,rtas-entry", &entry, sizeof(entry)); + prom_setprop(rtas_node, "/rtas", "linux,rtas-base", + &base, sizeof(base)); + prom_setprop(rtas_node, "/rtas", "linux,rtas-entry", + &entry, sizeof(entry)); prom_debug("rtas base = 0x%x\n", base); prom_debug("rtas entry = 0x%x\n", entry); @@ -1089,10 +1154,6 @@ static void __init prom_initialize_tce_table(void) if (base < local_alloc_bottom) local_alloc_bottom = base; - /* Save away the TCE table attributes for later use. */ - prom_setprop(node, "linux,tce-base", &base, sizeof(base)); - prom_setprop(node, "linux,tce-size", &minsize, sizeof(minsize)); - /* It seems OF doesn't null-terminate the path :-( */ memset(path, 0, sizeof(path)); /* Call OF to setup the TCE hardware */ @@ -1101,6 +1162,10 @@ static void __init prom_initialize_tce_table(void) prom_printf("package-to-path failed\n"); } + /* Save away the TCE table attributes for later use. */ + prom_setprop(node, path, "linux,tce-base", &base, sizeof(base)); + prom_setprop(node, path, "linux,tce-size", &minsize, sizeof(minsize)); + prom_debug("TCE table: %s\n", path); prom_debug("\tnode = 0x%x\n", node); prom_debug("\tbase = 0x%x\n", base); @@ -1342,6 +1407,7 @@ static void __init prom_init_client_services(unsigned long pp) /* * For really old powermacs, we need to map things we claim. * For that, we need the ihandle of the mmu. + * Also, on the longtrail, we need to work around other bugs. */ static void __init prom_find_mmu(void) { @@ -1355,12 +1421,19 @@ static void __init prom_find_mmu(void) if (prom_getprop(oprom, "model", version, sizeof(version)) <= 0) return; version[sizeof(version) - 1] = 0; - prom_printf("OF version is '%s'\n", version); /* XXX might need to add other versions here */ - if (strcmp(version, "Open Firmware, 1.0.5") != 0) + if (strcmp(version, "Open Firmware, 1.0.5") == 0) + of_workarounds = OF_WA_CLAIM; + else if (strncmp(version, "FirmWorks,3.", 12) == 0) { + of_workarounds = OF_WA_CLAIM | OF_WA_LONGTRAIL; + call_prom("interpret", 1, 1, "dev /memory 0 to allow-reclaim"); + } else return; + _prom->memory = call_prom("open", 1, 1, ADDR("/memory")); prom_getprop(_prom->chosen, "mmu", &_prom->mmumap, sizeof(_prom->mmumap)); + if (!IHANDLE_VALID(_prom->memory) || !IHANDLE_VALID(_prom->mmumap)) + of_workarounds &= ~OF_WA_CLAIM; /* hmmm */ } #else #define prom_find_mmu() @@ -1382,16 +1455,17 @@ static void __init prom_init_stdout(void) memset(path, 0, 256); call_prom("instance-to-path", 3, 1, _prom->stdout, path, 255); val = call_prom("instance-to-package", 1, 1, _prom->stdout); - prom_setprop(_prom->chosen, "linux,stdout-package", &val, sizeof(val)); + prom_setprop(_prom->chosen, "/chosen", "linux,stdout-package", + &val, sizeof(val)); prom_printf("OF stdout device is: %s\n", RELOC(of_stdout_device)); - prom_setprop(_prom->chosen, "linux,stdout-path", - RELOC(of_stdout_device), strlen(RELOC(of_stdout_device))+1); + prom_setprop(_prom->chosen, "/chosen", "linux,stdout-path", + path, strlen(path) + 1); /* If it's a display, note it */ memset(type, 0, sizeof(type)); prom_getprop(val, "device_type", type, sizeof(type)); if (strcmp(type, RELOC("display")) == 0) - prom_setprop(val, "linux,boot-display", NULL, 0); + prom_setprop(val, path, "linux,boot-display", NULL, 0); } static void __init prom_close_stdin(void) @@ -1408,8 +1482,9 @@ static int __init prom_find_machine_type(void) struct prom_t *_prom = &RELOC(prom); char compat[256]; int len, i = 0; +#ifdef CONFIG_PPC64 phandle rtas; - +#endif len = prom_getprop(_prom->root, "compatible", compat, sizeof(compat)-1); if (len > 0) { @@ -1513,7 +1588,7 @@ static void __init prom_check_displays(void) /* Success */ prom_printf("done\n"); - prom_setprop(node, "linux,opened", NULL, 0); + prom_setprop(node, path, "linux,opened", NULL, 0); /* Setup a usable color table when the appropriate * method is available. Should update this to set-colors */ @@ -1872,7 +1947,7 @@ static void __init fixup_device_tree(void) if (prom_getprop(u3, "device-rev", &u3_rev, sizeof(u3_rev)) == PROM_ERROR) return; - if (u3_rev != 0x35 && u3_rev != 0x37) + if (u3_rev < 0x35 || u3_rev > 0x39) return; /* does it need fixup ? */ if (prom_getproplen(i2c, "interrupts") > 0) @@ -1883,9 +1958,11 @@ static void __init fixup_device_tree(void) /* interrupt on this revision of u3 is number 0 and level */ interrupts[0] = 0; interrupts[1] = 1; - prom_setprop(i2c, "interrupts", &interrupts, sizeof(interrupts)); + prom_setprop(i2c, "/u3@0,f8000000/i2c@f8001000", "interrupts", + &interrupts, sizeof(interrupts)); parent = (u32)mpic; - prom_setprop(i2c, "interrupt-parent", &parent, sizeof(parent)); + prom_setprop(i2c, "/u3@0,f8000000/i2c@f8001000", "interrupt-parent", + &parent, sizeof(parent)); #endif } @@ -1921,11 +1998,11 @@ static void __init prom_check_initrd(unsigned long r3, unsigned long r4) RELOC(prom_initrd_end) = RELOC(prom_initrd_start) + r4; val = RELOC(prom_initrd_start); - prom_setprop(_prom->chosen, "linux,initrd-start", &val, - sizeof(val)); + prom_setprop(_prom->chosen, "/chosen", "linux,initrd-start", + &val, sizeof(val)); val = RELOC(prom_initrd_end); - prom_setprop(_prom->chosen, "linux,initrd-end", &val, - sizeof(val)); + prom_setprop(_prom->chosen, "/chosen", "linux,initrd-end", + &val, sizeof(val)); reserve_mem(RELOC(prom_initrd_start), RELOC(prom_initrd_end) - RELOC(prom_initrd_start)); @@ -1968,14 +2045,15 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, prom_init_client_services(pp); /* - * Init prom stdout device + * See if this OF is old enough that we need to do explicit maps + * and other workarounds */ - prom_init_stdout(); + prom_find_mmu(); /* - * See if this OF is old enough that we need to do explicit maps + * Init prom stdout device */ - prom_find_mmu(); + prom_init_stdout(); /* * Check for an initrd @@ -1988,14 +2066,15 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, */ RELOC(of_platform) = prom_find_machine_type(); getprop_rval = RELOC(of_platform); - prom_setprop(_prom->chosen, "linux,platform", + prom_setprop(_prom->chosen, "/chosen", "linux,platform", &getprop_rval, sizeof(getprop_rval)); #ifdef CONFIG_PPC_PSERIES /* * On pSeries, inform the firmware about our capabilities */ - if (RELOC(of_platform) & PLATFORM_PSERIES) + if (RELOC(of_platform) == PLATFORM_PSERIES || + RELOC(of_platform) == PLATFORM_PSERIES_LPAR) prom_send_capabilities(); #endif @@ -2049,21 +2128,23 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, * Fill in some infos for use by the kernel later on */ if (RELOC(prom_memory_limit)) - prom_setprop(_prom->chosen, "linux,memory-limit", + prom_setprop(_prom->chosen, "/chosen", "linux,memory-limit", &RELOC(prom_memory_limit), sizeof(prom_memory_limit)); #ifdef CONFIG_PPC64 if (RELOC(ppc64_iommu_off)) - prom_setprop(_prom->chosen, "linux,iommu-off", NULL, 0); + prom_setprop(_prom->chosen, "/chosen", "linux,iommu-off", + NULL, 0); if (RELOC(iommu_force_on)) - prom_setprop(_prom->chosen, "linux,iommu-force-on", NULL, 0); + prom_setprop(_prom->chosen, "/chosen", "linux,iommu-force-on", + NULL, 0); if (RELOC(prom_tce_alloc_start)) { - prom_setprop(_prom->chosen, "linux,tce-alloc-start", + prom_setprop(_prom->chosen, "/chosen", "linux,tce-alloc-start", &RELOC(prom_tce_alloc_start), sizeof(prom_tce_alloc_start)); - prom_setprop(_prom->chosen, "linux,tce-alloc-end", + prom_setprop(_prom->chosen, "/chosen", "linux,tce-alloc-end", &RELOC(prom_tce_alloc_end), sizeof(prom_tce_alloc_end)); } @@ -2080,8 +2161,13 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, prom_printf("copying OF device tree ...\n"); flatten_device_tree(); - /* in case stdin is USB and still active on IBM machines... */ - prom_close_stdin(); + /* + * in case stdin is USB and still active on IBM machines... + * Unfortunately quiesce crashes on some powermacs if we have + * closed stdin already (in particular the powerbook 101). + */ + if (RELOC(of_platform) != PLATFORM_POWERMAC) + prom_close_stdin(); /* * Call OF "quiesce" method to shut down pending DMA's from diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 568ea335d61..3d2abd95c7a 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -248,46 +248,10 @@ void ptrace_disable(struct task_struct *child) clear_single_step(child); } -long sys_ptrace(long request, long pid, long addr, long data) +long arch_ptrace(struct task_struct *child, long request, long addr, long data) { - struct task_struct *child; int ret = -EPERM; - lock_kernel(); - if (request == PTRACE_TRACEME) { - /* are we already being traced? */ - if (current->ptrace & PT_PTRACED) - goto out; - ret = security_ptrace(current->parent, current); - if (ret) - goto out; - /* set the ptrace bit in the process flags. */ - current->ptrace |= PT_PTRACED; - ret = 0; - goto out; - } - ret = -ESRCH; - read_lock(&tasklist_lock); - child = find_task_by_pid(pid); - if (child) - get_task_struct(child); - read_unlock(&tasklist_lock); - if (!child) - goto out; - - ret = -EPERM; - if (pid == 1) /* you may not mess with init */ - goto out_tsk; - - if (request == PTRACE_ATTACH) { - ret = ptrace_attach(child); - goto out_tsk; - } - - ret = ptrace_check_attach(child, request == PTRACE_KILL); - if (ret < 0) - goto out_tsk; - switch (request) { /* when I and D space are separate, these will need to be fixed. */ case PTRACE_PEEKTEXT: /* read word at location addr. */ @@ -540,10 +504,7 @@ long sys_ptrace(long request, long pid, long addr, long data) ret = ptrace_request(child, request, addr, data); break; } -out_tsk: - put_task_struct(child); -out: - unlock_kernel(); + return ret; } diff --git a/arch/powerpc/kernel/rtas-proc.c b/arch/powerpc/kernel/rtas-proc.c index 5bdd5b079d9..7a95b8a2835 100644 --- a/arch/powerpc/kernel/rtas-proc.c +++ b/arch/powerpc/kernel/rtas-proc.c @@ -32,7 +32,6 @@ #include <asm/rtas.h> #include <asm/machdep.h> /* for ppc_md */ #include <asm/time.h> -#include <asm/systemcfg.h> /* Token for Sensors */ #define KEY_SWITCH 0x0001 @@ -259,7 +258,7 @@ static int __init proc_rtas_init(void) { struct proc_dir_entry *entry; - if (!(systemcfg->platform & PLATFORM_PSERIES)) + if (_machine != PLATFORM_PSERIES && _machine != PLATFORM_PSERIES_LPAR) return 1; rtas_node = of_find_node_by_name(NULL, "rtas"); diff --git a/arch/powerpc/kernel/rtas-rtc.c b/arch/powerpc/kernel/rtas-rtc.c new file mode 100644 index 00000000000..7b948662704 --- /dev/null +++ b/arch/powerpc/kernel/rtas-rtc.c @@ -0,0 +1,105 @@ +#include <linux/kernel.h> +#include <linux/time.h> +#include <linux/timer.h> +#include <linux/init.h> +#include <linux/rtc.h> +#include <linux/delay.h> +#include <asm/prom.h> +#include <asm/rtas.h> +#include <asm/time.h> + + +#define MAX_RTC_WAIT 5000 /* 5 sec */ +#define RTAS_CLOCK_BUSY (-2) +unsigned long __init rtas_get_boot_time(void) +{ + int ret[8]; + int error, wait_time; + unsigned long max_wait_tb; + + max_wait_tb = get_tb() + tb_ticks_per_usec * 1000 * MAX_RTC_WAIT; + do { + error = rtas_call(rtas_token("get-time-of-day"), 0, 8, ret); + if (error == RTAS_CLOCK_BUSY || rtas_is_extended_busy(error)) { + wait_time = rtas_extended_busy_delay_time(error); + /* This is boot time so we spin. */ + udelay(wait_time*1000); + error = RTAS_CLOCK_BUSY; + } + } while (error == RTAS_CLOCK_BUSY && (get_tb() < max_wait_tb)); + + if (error != 0 && printk_ratelimit()) { + printk(KERN_WARNING "error: reading the clock failed (%d)\n", + error); + return 0; + } + + return mktime(ret[0], ret[1], ret[2], ret[3], ret[4], ret[5]); +} + +/* NOTE: get_rtc_time will get an error if executed in interrupt context + * and if a delay is needed to read the clock. In this case we just + * silently return without updating rtc_tm. + */ +void rtas_get_rtc_time(struct rtc_time *rtc_tm) +{ + int ret[8]; + int error, wait_time; + unsigned long max_wait_tb; + + max_wait_tb = get_tb() + tb_ticks_per_usec * 1000 * MAX_RTC_WAIT; + do { + error = rtas_call(rtas_token("get-time-of-day"), 0, 8, ret); + if (error == RTAS_CLOCK_BUSY || rtas_is_extended_busy(error)) { + if (in_interrupt() && printk_ratelimit()) { + memset(&rtc_tm, 0, sizeof(struct rtc_time)); + printk(KERN_WARNING "error: reading clock" + " would delay interrupt\n"); + return; /* delay not allowed */ + } + wait_time = rtas_extended_busy_delay_time(error); + msleep(wait_time); + error = RTAS_CLOCK_BUSY; + } + } while (error == RTAS_CLOCK_BUSY && (get_tb() < max_wait_tb)); + + if (error != 0 && printk_ratelimit()) { + printk(KERN_WARNING "error: reading the clock failed (%d)\n", + error); + return; + } + + rtc_tm->tm_sec = ret[5]; + rtc_tm->tm_min = ret[4]; + rtc_tm->tm_hour = ret[3]; + rtc_tm->tm_mday = ret[2]; + rtc_tm->tm_mon = ret[1] - 1; + rtc_tm->tm_year = ret[0] - 1900; +} + +int rtas_set_rtc_time(struct rtc_time *tm) +{ + int error, wait_time; + unsigned long max_wait_tb; + + max_wait_tb = get_tb() + tb_ticks_per_usec * 1000 * MAX_RTC_WAIT; + do { + error = rtas_call(rtas_token("set-time-of-day"), 7, 1, NULL, + tm->tm_year + 1900, tm->tm_mon + 1, + tm->tm_mday, tm->tm_hour, tm->tm_min, + tm->tm_sec, 0); + if (error == RTAS_CLOCK_BUSY || rtas_is_extended_busy(error)) { + if (in_interrupt()) + return 1; /* probably decrementer */ + wait_time = rtas_extended_busy_delay_time(error); + msleep(wait_time); + error = RTAS_CLOCK_BUSY; + } + } while (error == RTAS_CLOCK_BUSY && (get_tb() < max_wait_tb)); + + if (error != 0 && printk_ratelimit()) + printk(KERN_WARNING "error: setting the clock failed (%d)\n", + error); + + return 0; +} diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index b7fc2d88495..4283fa33f78 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -17,6 +17,7 @@ #include <linux/spinlock.h> #include <linux/module.h> #include <linux/init.h> +#include <linux/delay.h> #include <asm/prom.h> #include <asm/rtas.h> @@ -28,9 +29,6 @@ #include <asm/delay.h> #include <asm/uaccess.h> #include <asm/lmb.h> -#ifdef CONFIG_PPC64 -#include <asm/systemcfg.h> -#endif struct rtas_t rtas = { .lock = SPIN_LOCK_UNLOCKED @@ -83,7 +81,7 @@ void call_rtas_display_status_delay(unsigned char c) while (width-- > 0) call_rtas_display_status(' '); width = 16; - udelay(500000); + mdelay(500); pending_newline = 1; } else { if (pending_newline) { @@ -608,7 +606,6 @@ asmlinkage int ppc_rtas(struct rtas_args __user *uargs) return 0; } -#ifdef CONFIG_SMP /* This version can't take the spinlock, because it never returns */ struct rtas_args rtas_stop_self_args = { @@ -633,7 +630,6 @@ void rtas_stop_self(void) panic("Alas, I survived.\n"); } -#endif /* * Call early during boot, before mem init or bootmem, to retreive the RTAS @@ -672,7 +668,7 @@ void __init rtas_initialize(void) * the stop-self token if any */ #ifdef CONFIG_PPC64 - if (systemcfg->platform == PLATFORM_PSERIES_LPAR) + if (_machine == PLATFORM_PSERIES_LPAR) rtas_region = min(lmb.rmo_size, RTAS_INSTANTIATE_MAX); #endif rtas_rmo_buf = lmb_alloc_base(RTAS_RMOBUF_MAX, PAGE_SIZE, rtas_region); diff --git a/arch/powerpc/kernel/rtas_pci.c b/arch/powerpc/kernel/rtas_pci.c new file mode 100644 index 00000000000..0e5a8e11665 --- /dev/null +++ b/arch/powerpc/kernel/rtas_pci.c @@ -0,0 +1,513 @@ +/* + * arch/ppc64/kernel/rtas_pci.c + * + * Copyright (C) 2001 Dave Engebretsen, IBM Corporation + * Copyright (C) 2003 Anton Blanchard <anton@au.ibm.com>, IBM + * + * RTAS specific routines for PCI. + * + * Based on code from pci.c, chrp_pci.c and pSeries_pci.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/kernel.h> +#include <linux/threads.h> +#include <linux/pci.h> +#include <linux/string.h> +#include <linux/init.h> +#include <linux/bootmem.h> + +#include <asm/io.h> +#include <asm/pgtable.h> +#include <asm/irq.h> +#include <asm/prom.h> +#include <asm/machdep.h> +#include <asm/pci-bridge.h> +#include <asm/iommu.h> +#include <asm/rtas.h> +#include <asm/mpic.h> +#include <asm/ppc-pci.h> + +/* RTAS tokens */ +static int read_pci_config; +static int write_pci_config; +static int ibm_read_pci_config; +static int ibm_write_pci_config; + +static inline int config_access_valid(struct pci_dn *dn, int where) +{ + if (where < 256) + return 1; + if (where < 4096 && dn->pci_ext_config_space) + return 1; + + return 0; +} + +static int of_device_available(struct device_node * dn) +{ + char * status; + + status = get_property(dn, "status", NULL); + + if (!status) + return 1; + + if (!strcmp(status, "okay")) + return 1; + + return 0; +} + +static int rtas_read_config(struct pci_dn *pdn, int where, int size, u32 *val) +{ + int returnval = -1; + unsigned long buid, addr; + int ret; + + if (!pdn) + return PCIBIOS_DEVICE_NOT_FOUND; + if (!config_access_valid(pdn, where)) + return PCIBIOS_BAD_REGISTER_NUMBER; + + addr = ((where & 0xf00) << 20) | (pdn->busno << 16) | + (pdn->devfn << 8) | (where & 0xff); + buid = pdn->phb->buid; + if (buid) { + ret = rtas_call(ibm_read_pci_config, 4, 2, &returnval, + addr, BUID_HI(buid), BUID_LO(buid), size); + } else { + ret = rtas_call(read_pci_config, 2, 2, &returnval, addr, size); + } + *val = returnval; + + if (ret) + return PCIBIOS_DEVICE_NOT_FOUND; + + if (returnval == EEH_IO_ERROR_VALUE(size) && + eeh_dn_check_failure (pdn->node, NULL)) + return PCIBIOS_DEVICE_NOT_FOUND; + + return PCIBIOS_SUCCESSFUL; +} + +static int rtas_pci_read_config(struct pci_bus *bus, + unsigned int devfn, + int where, int size, u32 *val) +{ + struct device_node *busdn, *dn; + + if (bus->self) + busdn = pci_device_to_OF_node(bus->self); + else + busdn = bus->sysdata; /* must be a phb */ + + /* Search only direct children of the bus */ + for (dn = busdn->child; dn; dn = dn->sibling) { + struct pci_dn *pdn = PCI_DN(dn); + if (pdn && pdn->devfn == devfn + && of_device_available(dn)) + return rtas_read_config(pdn, where, size, val); + } + + return PCIBIOS_DEVICE_NOT_FOUND; +} + +int rtas_write_config(struct pci_dn *pdn, int where, int size, u32 val) +{ + unsigned long buid, addr; + int ret; + + if (!pdn) + return PCIBIOS_DEVICE_NOT_FOUND; + if (!config_access_valid(pdn, where)) + return PCIBIOS_BAD_REGISTER_NUMBER; + + addr = ((where & 0xf00) << 20) | (pdn->busno << 16) | + (pdn->devfn << 8) | (where & 0xff); + buid = pdn->phb->buid; + if (buid) { + ret = rtas_call(ibm_write_pci_config, 5, 1, NULL, addr, + BUID_HI(buid), BUID_LO(buid), size, (ulong) val); + } else { + ret = rtas_call(write_pci_config, 3, 1, NULL, addr, size, (ulong)val); + } + + if (ret) + return PCIBIOS_DEVICE_NOT_FOUND; + + return PCIBIOS_SUCCESSFUL; +} + +static int rtas_pci_write_config(struct pci_bus *bus, + unsigned int devfn, + int where, int size, u32 val) +{ + struct device_node *busdn, *dn; + + if (bus->self) + busdn = pci_device_to_OF_node(bus->self); + else + busdn = bus->sysdata; /* must be a phb */ + + /* Search only direct children of the bus */ + for (dn = busdn->child; dn; dn = dn->sibling) { + struct pci_dn *pdn = PCI_DN(dn); + if (pdn && pdn->devfn == devfn + && of_device_available(dn)) + return rtas_write_config(pdn, where, size, val); + } + return PCIBIOS_DEVICE_NOT_FOUND; +} + +struct pci_ops rtas_pci_ops = { + rtas_pci_read_config, + rtas_pci_write_config +}; + +int is_python(struct device_node *dev) +{ + char *model = (char *)get_property(dev, "model", NULL); + + if (model && strstr(model, "Python")) + return 1; + + return 0; +} + +static int get_phb_reg_prop(struct device_node *dev, + unsigned int addr_size_words, + struct reg_property64 *reg) +{ + unsigned int *ui_ptr = NULL, len; + + /* Found a PHB, now figure out where his registers are mapped. */ + ui_ptr = (unsigned int *)get_property(dev, "reg", &len); + if (ui_ptr == NULL) + return 1; + + if (addr_size_words == 1) { + reg->address = ((struct reg_property32 *)ui_ptr)->address; + reg->size = ((struct reg_property32 *)ui_ptr)->size; + } else { + *reg = *((struct reg_property64 *)ui_ptr); + } + + return 0; +} + +static void python_countermeasures(struct device_node *dev, + unsigned int addr_size_words) +{ + struct reg_property64 reg_struct; + void __iomem *chip_regs; + volatile u32 val; + + if (get_phb_reg_prop(dev, addr_size_words, ®_struct)) + return; + + /* Python's register file is 1 MB in size. */ + chip_regs = ioremap(reg_struct.address & ~(0xfffffUL), 0x100000); + + /* + * Firmware doesn't always clear this bit which is critical + * for good performance - Anton + */ + +#define PRG_CL_RESET_VALID 0x00010000 + + val = in_be32(chip_regs + 0xf6030); + if (val & PRG_CL_RESET_VALID) { + printk(KERN_INFO "Python workaround: "); + val &= ~PRG_CL_RESET_VALID; + out_be32(chip_regs + 0xf6030, val); + /* + * We must read it back for changes to + * take effect + */ + val = in_be32(chip_regs + 0xf6030); + printk("reg0: %x\n", val); + } + + iounmap(chip_regs); +} + +void __init init_pci_config_tokens (void) +{ + read_pci_config = rtas_token("read-pci-config"); + write_pci_config = rtas_token("write-pci-config"); + ibm_read_pci_config = rtas_token("ibm,read-pci-config"); + ibm_write_pci_config = rtas_token("ibm,write-pci-config"); +} + +unsigned long __devinit get_phb_buid (struct device_node *phb) +{ + int addr_cells; + unsigned int *buid_vals; + unsigned int len; + unsigned long buid; + + if (ibm_read_pci_config == -1) return 0; + + /* PHB's will always be children of the root node, + * or so it is promised by the current firmware. */ + if (phb->parent == NULL) + return 0; + if (phb->parent->parent) + return 0; + + buid_vals = (unsigned int *) get_property(phb, "reg", &len); + if (buid_vals == NULL) + return 0; + + addr_cells = prom_n_addr_cells(phb); + if (addr_cells == 1) { + buid = (unsigned long) buid_vals[0]; + } else { + buid = (((unsigned long)buid_vals[0]) << 32UL) | + (((unsigned long)buid_vals[1]) & 0xffffffff); + } + return buid; +} + +static int phb_set_bus_ranges(struct device_node *dev, + struct pci_controller *phb) +{ + int *bus_range; + unsigned int len; + + bus_range = (int *) get_property(dev, "bus-range", &len); + if (bus_range == NULL || len < 2 * sizeof(int)) { + return 1; + } + + phb->first_busno = bus_range[0]; + phb->last_busno = bus_range[1]; + + return 0; +} + +static int __devinit setup_phb(struct device_node *dev, + struct pci_controller *phb, + unsigned int addr_size_words) +{ + pci_setup_pci_controller(phb); + + if (is_python(dev)) + python_countermeasures(dev, addr_size_words); + + if (phb_set_bus_ranges(dev, phb)) + return 1; + + phb->arch_data = dev; + phb->ops = &rtas_pci_ops; + phb->buid = get_phb_buid(dev); + + return 0; +} + +static void __devinit add_linux_pci_domain(struct device_node *dev, + struct pci_controller *phb, + struct property *of_prop) +{ + memset(of_prop, 0, sizeof(struct property)); + of_prop->name = "linux,pci-domain"; + of_prop->length = sizeof(phb->global_number); + of_prop->value = (unsigned char *)&of_prop[1]; + memcpy(of_prop->value, &phb->global_number, sizeof(phb->global_number)); + prom_add_property(dev, of_prop); +} + +static struct pci_controller * __init alloc_phb(struct device_node *dev, + unsigned int addr_size_words) +{ + struct pci_controller *phb; + struct property *of_prop; + + phb = alloc_bootmem(sizeof(struct pci_controller)); + if (phb == NULL) + return NULL; + + of_prop = alloc_bootmem(sizeof(struct property) + + sizeof(phb->global_number)); + if (!of_prop) + return NULL; + + if (setup_phb(dev, phb, addr_size_words)) + return NULL; + + add_linux_pci_domain(dev, phb, of_prop); + + return phb; +} + +static struct pci_controller * __devinit alloc_phb_dynamic(struct device_node *dev, unsigned int addr_size_words) +{ + struct pci_controller *phb; + + phb = (struct pci_controller *)kmalloc(sizeof(struct pci_controller), + GFP_KERNEL); + if (phb == NULL) + return NULL; + + if (setup_phb(dev, phb, addr_size_words)) + return NULL; + + phb->is_dynamic = 1; + + /* TODO: linux,pci-domain? */ + + return phb; +} + +unsigned long __init find_and_init_phbs(void) +{ + struct device_node *node; + struct pci_controller *phb; + unsigned int root_size_cells = 0; + unsigned int index; + unsigned int *opprop = NULL; + struct device_node *root = of_find_node_by_path("/"); + + if (ppc64_interrupt_controller == IC_OPEN_PIC) { + opprop = (unsigned int *)get_property(root, + "platform-open-pic", NULL); + } + + root_size_cells = prom_n_size_cells(root); + + index = 0; + + for (node = of_get_next_child(root, NULL); + node != NULL; + node = of_get_next_child(root, node)) { + if (node->type == NULL || strcmp(node->type, "pci") != 0) + continue; + + phb = alloc_phb(node, root_size_cells); + if (!phb) + continue; + + pci_process_bridge_OF_ranges(phb, node, 0); + pci_setup_phb_io(phb, index == 0); +#ifdef CONFIG_PPC_PSERIES + if (ppc64_interrupt_controller == IC_OPEN_PIC && pSeries_mpic) { + int addr = root_size_cells * (index + 2) - 1; + mpic_assign_isu(pSeries_mpic, index, opprop[addr]); + } +#endif + index++; + } + + of_node_put(root); + pci_devs_phb_init(); + + /* + * pci_probe_only and pci_assign_all_buses can be set via properties + * in chosen. + */ + if (of_chosen) { + int *prop; + + prop = (int *)get_property(of_chosen, "linux,pci-probe-only", + NULL); + if (prop) + pci_probe_only = *prop; + + prop = (int *)get_property(of_chosen, + "linux,pci-assign-all-buses", NULL); + if (prop) + pci_assign_all_buses = *prop; + } + + return 0; +} + +struct pci_controller * __devinit init_phb_dynamic(struct device_node *dn) +{ + struct device_node *root = of_find_node_by_path("/"); + unsigned int root_size_cells = 0; + struct pci_controller *phb; + int primary; + + root_size_cells = prom_n_size_cells(root); + + primary = list_empty(&hose_list); + phb = alloc_phb_dynamic(dn, root_size_cells); + if (!phb) + return NULL; + + pci_process_bridge_OF_ranges(phb, dn, primary); + + pci_setup_phb_io_dynamic(phb, primary); + of_node_put(root); + + pci_devs_phb_init_dynamic(phb); + scan_phb(phb); + + return phb; +} +EXPORT_SYMBOL(init_phb_dynamic); + +/* RPA-specific bits for removing PHBs */ +int pcibios_remove_root_bus(struct pci_controller *phb) +{ + struct pci_bus *b = phb->bus; + struct resource *res; + int rc, i; + + res = b->resource[0]; + if (!res->flags) { + printk(KERN_ERR "%s: no IO resource for PHB %s\n", __FUNCTION__, + b->name); + return 1; + } + + rc = unmap_bus_range(b); + if (rc) { + printk(KERN_ERR "%s: failed to unmap IO on bus %s\n", + __FUNCTION__, b->name); + return 1; + } + + if (release_resource(res)) { + printk(KERN_ERR "%s: failed to release IO on bus %s\n", + __FUNCTION__, b->name); + return 1; + } + + for (i = 1; i < 3; ++i) { + res = b->resource[i]; + if (!res->flags && i == 0) { + printk(KERN_ERR "%s: no MEM resource for PHB %s\n", + __FUNCTION__, b->name); + return 1; + } + if (res->flags && release_resource(res)) { + printk(KERN_ERR + "%s: failed to release IO %d on bus %s\n", + __FUNCTION__, i, b->name); + return 1; + } + } + + list_del(&phb->list_node); + if (phb->is_dynamic) + kfree(phb); + + return 0; +} +EXPORT_SYMBOL(pcibios_remove_root_bus); diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index d43fa8c0e5a..33e7f2c7f19 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -33,6 +33,7 @@ #include <asm/io.h> #include <asm/prom.h> #include <asm/processor.h> +#include <asm/vdso_datapage.h> #include <asm/pgtable.h> #include <asm/smp.h> #include <asm/elf.h> @@ -51,6 +52,9 @@ #include <asm/page.h> #include <asm/mmu.h> #include <asm/lmb.h> +#include <asm/xmon.h> + +#include "setup.h" #undef DEBUG @@ -60,6 +64,13 @@ #define DBG(fmt...) #endif +#ifdef CONFIG_PPC_MULTIPLATFORM +int _machine = 0; +EXPORT_SYMBOL(_machine); +#endif + +unsigned long klimit = (unsigned long) _end; + /* * This still seems to be needed... -- paulus */ @@ -405,6 +416,44 @@ static int __init set_preferred_console(void) console_initcall(set_preferred_console); #endif /* CONFIG_PPC_MULTIPLATFORM */ +void __init check_for_initrd(void) +{ +#ifdef CONFIG_BLK_DEV_INITRD + unsigned long *prop; + + DBG(" -> check_for_initrd()\n"); + + if (of_chosen) { + prop = (unsigned long *)get_property(of_chosen, + "linux,initrd-start", NULL); + if (prop != NULL) { + initrd_start = (unsigned long)__va(*prop); + prop = (unsigned long *)get_property(of_chosen, + "linux,initrd-end", NULL); + if (prop != NULL) { + initrd_end = (unsigned long)__va(*prop); + initrd_below_start_ok = 1; + } else + initrd_start = 0; + } + } + + /* If we were passed an initrd, set the ROOT_DEV properly if the values + * look sensible. If not, clear initrd reference. + */ + if (initrd_start >= KERNELBASE && initrd_end >= KERNELBASE && + initrd_end > initrd_start) + ROOT_DEV = Root_RAM0; + else + initrd_start = initrd_end = 0; + + if (initrd_start) + printk("Found initrd at 0x%lx:0x%lx\n", initrd_start, initrd_end); + + DBG(" <- check_for_initrd()\n"); +#endif /* CONFIG_BLK_DEV_INITRD */ +} + #ifdef CONFIG_SMP /** @@ -470,8 +519,8 @@ void __init smp_setup_cpu_maps(void) * On pSeries LPAR, we need to know how many cpus * could possibly be added to this partition. */ - if (systemcfg->platform == PLATFORM_PSERIES_LPAR && - (dn = of_find_node_by_path("/rtas"))) { + if (_machine == PLATFORM_PSERIES_LPAR && + (dn = of_find_node_by_path("/rtas"))) { int num_addr_cell, num_size_cell, maxcpus; unsigned int *ireg; @@ -515,7 +564,27 @@ void __init smp_setup_cpu_maps(void) cpu_set(cpu ^ 0x1, cpu_sibling_map[cpu]); } - systemcfg->processorCount = num_present_cpus(); + vdso_data->processorCount = num_present_cpus(); #endif /* CONFIG_PPC64 */ } #endif /* CONFIG_SMP */ + +#ifdef CONFIG_XMON +static int __init early_xmon(char *p) +{ + /* ensure xmon is enabled */ + if (p) { + if (strncmp(p, "on", 2) == 0) + xmon_init(1); + if (strncmp(p, "off", 3) == 0) + xmon_init(0); + if (strncmp(p, "early", 5) != 0) + return 0; + } + xmon_init(1); + debugger(NULL); + + return 0; +} +early_param("xmon", early_xmon); +#endif diff --git a/arch/powerpc/kernel/setup.h b/arch/powerpc/kernel/setup.h new file mode 100644 index 00000000000..2ebba755272 --- /dev/null +++ b/arch/powerpc/kernel/setup.h @@ -0,0 +1,6 @@ +#ifndef _POWERPC_KERNEL_SETUP_H +#define _POWERPC_KERNEL_SETUP_H + +void check_for_initrd(void); + +#endif /* _POWERPC_KERNEL_SETUP_H */ diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index b45eedbb4b3..e5694335bf1 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -40,6 +40,8 @@ #include <asm/xmon.h> #include <asm/time.h> +#include "setup.h" + #define DBG(fmt...) #if defined CONFIG_KGDB @@ -55,10 +57,6 @@ extern void power4_idle(void); boot_infos_t *boot_infos; struct ide_machdep_calls ppc_ide_md; -/* XXX should go elsewhere */ -int __irq_offset_value; -EXPORT_SYMBOL(__irq_offset_value); - int boot_cpuid; EXPORT_SYMBOL_GPL(boot_cpuid); int boot_cpuid_phys; @@ -70,8 +68,6 @@ unsigned int DMA_MODE_WRITE; int have_of = 1; #ifdef CONFIG_PPC_MULTIPLATFORM -int _machine = 0; - extern void prep_init(void); extern void pmac_init(void); extern void chrp_init(void); @@ -279,13 +275,13 @@ arch_initcall(ppc_init); /* Warning, IO base is not yet inited */ void __init setup_arch(char **cmdline_p) { - extern char *klimit; extern void do_init_bootmem(void); /* so udelay does something sensible, assume <= 1000 bogomips */ loops_per_jiffy = 500000000 / HZ; unflatten_device_tree(); + check_for_initrd(); finish_device_tree(); smp_setup_cpu_maps(); @@ -302,14 +298,9 @@ void __init setup_arch(char **cmdline_p) pmac_feature_init(); /* New cool way */ #endif -#ifdef CONFIG_XMON - xmon_map_scc(); - if (strstr(cmd_line, "xmon")) { - xmon_init(1); - debugger(NULL); - } -#endif /* CONFIG_XMON */ - if ( ppc_md.progress ) ppc_md.progress("setup_arch: enter", 0x3eab); +#ifdef CONFIG_XMON_DEFAULT + xmon_init(1); +#endif #if defined(CONFIG_KGDB) if (ppc_md.kgdb_map_scc) @@ -342,7 +333,7 @@ void __init setup_arch(char **cmdline_p) init_mm.start_code = PAGE_OFFSET; init_mm.end_code = (unsigned long) _etext; init_mm.end_data = (unsigned long) _edata; - init_mm.brk = (unsigned long) klimit; + init_mm.brk = klimit; /* Save unparsed command line copy for /proc/cmdline */ strlcpy(saved_command_line, cmd_line, COMMAND_LINE_SIZE); diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 6b52cce872b..608fee7c7e2 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -41,7 +41,6 @@ #include <asm/elf.h> #include <asm/machdep.h> #include <asm/paca.h> -#include <asm/ppcdebug.h> #include <asm/time.h> #include <asm/cputable.h> #include <asm/sections.h> @@ -58,8 +57,11 @@ #include <asm/lmb.h> #include <asm/iseries/it_lp_naca.h> #include <asm/firmware.h> -#include <asm/systemcfg.h> #include <asm/xmon.h> +#include <asm/udbg.h> +#include <asm/kexec.h> + +#include "setup.h" #ifdef DEBUG #define DBG(fmt...) udbg_printf(fmt) @@ -94,15 +96,6 @@ extern void udbg_init_maple_realmode(void); do { udbg_putc = call_rtas_display_status_delay; } while(0) #endif -/* extern void *stab; */ -extern unsigned long klimit; - -extern void mm_init_ppc64(void); -extern void stab_initialize(unsigned long stab); -extern void htab_initialize(void); -extern void early_init_devtree(void *flat_dt); -extern void unflatten_device_tree(void); - int have_of = 1; int boot_cpuid = 0; int boot_cpuid_phys = 0; @@ -244,12 +237,6 @@ void __init early_setup(unsigned long dt_ptr) DBG(" -> early_setup()\n"); /* - * Fill the default DBG level (do we want to keep - * that old mecanism around forever ?) - */ - ppcdbg_initialize(); - - /* * Do early initializations using the flattened device * tree, like retreiving the physical memory map or * calculating/retreiving the hash table size @@ -260,11 +247,10 @@ void __init early_setup(unsigned long dt_ptr) * Iterate all ppc_md structures until we find the proper * one for the current machine type */ - DBG("Probing machine type for platform %x...\n", - systemcfg->platform); + DBG("Probing machine type for platform %x...\n", _machine); for (mach = machines; *mach; mach++) { - if ((*mach)->probe(systemcfg->platform)) + if ((*mach)->probe(_machine)) break; } /* What can we do if we didn't find ? */ @@ -277,20 +263,47 @@ void __init early_setup(unsigned long dt_ptr) DBG("Found, Initializing memory management...\n"); /* - * Initialize stab / SLB management + * Initialize the MMU Hash table and create the linear mapping + * of memory. Has to be done before stab/slb initialization as + * this is currently where the page size encoding is obtained */ - if (!firmware_has_feature(FW_FEATURE_ISERIES)) - stab_initialize(lpaca->stab_real); + htab_initialize(); /* - * Initialize the MMU Hash table and create the linear mapping - * of memory + * Initialize stab / SLB management except on iSeries */ - htab_initialize(); + if (!firmware_has_feature(FW_FEATURE_ISERIES)) { + if (cpu_has_feature(CPU_FTR_SLB)) + slb_initialize(); + else + stab_initialize(lpaca->stab_real); + } DBG(" <- early_setup()\n"); } +#ifdef CONFIG_SMP +void early_setup_secondary(void) +{ + struct paca_struct *lpaca = get_paca(); + + /* Mark enabled in PACA */ + lpaca->proc_enabled = 0; + + /* Initialize hash table for that CPU */ + htab_initialize_secondary(); + + /* Initialize STAB/SLB. We use a virtual address as it works + * in real mode on pSeries and we want a virutal address on + * iSeries anyway + */ + if (cpu_has_feature(CPU_FTR_SLB)) + slb_initialize(); + else + stab_initialize(lpaca->stab_addr); +} + +#endif /* CONFIG_SMP */ #if defined(CONFIG_SMP) || defined(CONFIG_KEXEC) void smp_release_cpus(void) @@ -316,7 +329,8 @@ void smp_release_cpus(void) #endif /* CONFIG_SMP || CONFIG_KEXEC */ /* - * Initialize some remaining members of the ppc64_caches and systemcfg structures + * Initialize some remaining members of the ppc64_caches and systemcfg + * structures * (at least until we get rid of them completely). This is mostly some * cache informations about the CPU that will be used by cache flush * routines and/or provided to userland @@ -341,7 +355,7 @@ static void __init initialize_cache_info(void) const char *dc, *ic; /* Then read cache informations */ - if (systemcfg->platform == PLATFORM_POWERMAC) { + if (_machine == PLATFORM_POWERMAC) { dc = "d-cache-block-size"; ic = "i-cache-block-size"; } else { @@ -361,9 +375,8 @@ static void __init initialize_cache_info(void) DBG("Argh, can't find dcache properties ! " "sizep: %p, lsizep: %p\n", sizep, lsizep); - systemcfg->dcache_size = ppc64_caches.dsize = size; - systemcfg->dcache_line_size = - ppc64_caches.dline_size = lsize; + ppc64_caches.dsize = size; + ppc64_caches.dline_size = lsize; ppc64_caches.log_dline_size = __ilog2(lsize); ppc64_caches.dlines_per_page = PAGE_SIZE / lsize; @@ -379,60 +392,16 @@ static void __init initialize_cache_info(void) DBG("Argh, can't find icache properties ! " "sizep: %p, lsizep: %p\n", sizep, lsizep); - systemcfg->icache_size = ppc64_caches.isize = size; - systemcfg->icache_line_size = - ppc64_caches.iline_size = lsize; + ppc64_caches.isize = size; + ppc64_caches.iline_size = lsize; ppc64_caches.log_iline_size = __ilog2(lsize); ppc64_caches.ilines_per_page = PAGE_SIZE / lsize; } } - /* Add an eye catcher and the systemcfg layout version number */ - strcpy(systemcfg->eye_catcher, "SYSTEMCFG:PPC64"); - systemcfg->version.major = SYSTEMCFG_MAJOR; - systemcfg->version.minor = SYSTEMCFG_MINOR; - systemcfg->processor = mfspr(SPRN_PVR); - DBG(" <- initialize_cache_info()\n"); } -static void __init check_for_initrd(void) -{ -#ifdef CONFIG_BLK_DEV_INITRD - u64 *prop; - - DBG(" -> check_for_initrd()\n"); - - if (of_chosen) { - prop = (u64 *)get_property(of_chosen, - "linux,initrd-start", NULL); - if (prop != NULL) { - initrd_start = (unsigned long)__va(*prop); - prop = (u64 *)get_property(of_chosen, - "linux,initrd-end", NULL); - if (prop != NULL) { - initrd_end = (unsigned long)__va(*prop); - initrd_below_start_ok = 1; - } else - initrd_start = 0; - } - } - - /* If we were passed an initrd, set the ROOT_DEV properly if the values - * look sensible. If not, clear initrd reference. - */ - if (initrd_start >= KERNELBASE && initrd_end >= KERNELBASE && - initrd_end > initrd_start) - ROOT_DEV = Root_RAM0; - else - initrd_start = initrd_end = 0; - - if (initrd_start) - printk("Found initrd at 0x%lx:0x%lx\n", initrd_start, initrd_end); - - DBG(" <- check_for_initrd()\n"); -#endif /* CONFIG_BLK_DEV_INITRD */ -} /* * Do some initial setup of the system. The parameters are those which @@ -447,6 +416,10 @@ void __init setup_system(void) */ unflatten_device_tree(); +#ifdef CONFIG_KEXEC + kexec_setup(); /* requires unflattened device tree. */ +#endif + /* * Fill the ppc64_caches & systemcfg structures with informations * retreived from the device-tree. Need to be called before @@ -516,16 +489,14 @@ void __init setup_system(void) printk("-----------------------------------------------------\n"); printk("ppc64_pft_size = 0x%lx\n", ppc64_pft_size); - printk("ppc64_debug_switch = 0x%lx\n", ppc64_debug_switch); - printk("ppc64_interrupt_controller = 0x%ld\n", ppc64_interrupt_controller); - printk("systemcfg = 0x%p\n", systemcfg); - printk("systemcfg->platform = 0x%x\n", systemcfg->platform); - printk("systemcfg->processorCount = 0x%lx\n", systemcfg->processorCount); - printk("systemcfg->physicalMemorySize = 0x%lx\n", systemcfg->physicalMemorySize); + printk("ppc64_interrupt_controller = 0x%ld\n", + ppc64_interrupt_controller); + printk("platform = 0x%x\n", _machine); + printk("physicalMemorySize = 0x%lx\n", lmb_phys_mem_size()); printk("ppc64_caches.dcache_line_size = 0x%x\n", - ppc64_caches.dline_size); + ppc64_caches.dline_size); printk("ppc64_caches.icache_line_size = 0x%x\n", - ppc64_caches.iline_size); + ppc64_caches.iline_size); printk("htab_address = 0x%p\n", htab_address); printk("htab_hash_mask = 0x%lx\n", htab_hash_mask); printk("-----------------------------------------------------\n"); @@ -552,10 +523,12 @@ static void __init irqstack_early_init(void) * SLB misses on them. */ for_each_cpu(i) { - softirq_ctx[i] = (struct thread_info *)__va(lmb_alloc_base(THREAD_SIZE, - THREAD_SIZE, 0x10000000)); - hardirq_ctx[i] = (struct thread_info *)__va(lmb_alloc_base(THREAD_SIZE, - THREAD_SIZE, 0x10000000)); + softirq_ctx[i] = (struct thread_info *) + __va(lmb_alloc_base(THREAD_SIZE, + THREAD_SIZE, 0x10000000)); + hardirq_ctx[i] = (struct thread_info *) + __va(lmb_alloc_base(THREAD_SIZE, + THREAD_SIZE, 0x10000000)); } } #else @@ -583,35 +556,8 @@ static void __init emergency_stack_init(void) limit = min(0x10000000UL, lmb.rmo_size); for_each_cpu(i) - paca[i].emergency_sp = __va(lmb_alloc_base(PAGE_SIZE, 128, - limit)) + PAGE_SIZE; -} - -/* - * Called from setup_arch to initialize the bitmap of available - * syscalls in the systemcfg page - */ -void __init setup_syscall_map(void) -{ - unsigned int i, count64 = 0, count32 = 0; - extern unsigned long *sys_call_table; - extern unsigned long sys_ni_syscall; - - - for (i = 0; i < __NR_syscalls; i++) { - if (sys_call_table[i*2] != sys_ni_syscall) { - count64++; - systemcfg->syscall_map_64[i >> 5] |= - 0x80000000UL >> (i & 0x1f); - } - if (sys_call_table[i*2+1] != sys_ni_syscall) { - count32++; - systemcfg->syscall_map_32[i >> 5] |= - 0x80000000UL >> (i & 0x1f); - } - } - printk(KERN_INFO "Syscall map setup, %d 32-bit and %d 64-bit syscalls\n", - count32, count64); + paca[i].emergency_sp = + __va(lmb_alloc_base(HW_PAGE_SIZE, 128, limit)) + HW_PAGE_SIZE; } /* @@ -655,9 +601,6 @@ void __init setup_arch(char **cmdline_p) do_init_bootmem(); sparse_init(); - /* initialize the syscall map in systemcfg */ - setup_syscall_map(); - #ifdef CONFIG_DUMMY_CONSOLE conswitchp = &dummy_con; #endif @@ -895,26 +838,6 @@ int check_legacy_ioport(unsigned long base_port) } EXPORT_SYMBOL(check_legacy_ioport); -#ifdef CONFIG_XMON -static int __init early_xmon(char *p) -{ - /* ensure xmon is enabled */ - if (p) { - if (strncmp(p, "on", 2) == 0) - xmon_init(1); - if (strncmp(p, "off", 3) == 0) - xmon_init(0); - if (strncmp(p, "early", 5) != 0) - return 0; - } - xmon_init(1); - debugger(NULL); - - return 0; -} -early_param("xmon", early_xmon); -#endif - void cpu_die(void) { if (ppc_md.cpu_die) diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 876c57c1136..5a2eba60dd3 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -42,11 +42,11 @@ #include <asm/uaccess.h> #include <asm/cacheflush.h> +#include <asm/sigcontext.h> +#include <asm/vdso.h> #ifdef CONFIG_PPC64 #include "ppc32.h" -#include <asm/ppcdebug.h> #include <asm/unistd.h> -#include <asm/vdso.h> #else #include <asm/ucontext.h> #include <asm/pgtable.h> @@ -403,8 +403,6 @@ static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame, ELF_NFPREG * sizeof(double))) return 1; - current->thread.fpscr.val = 0; /* turn off all fp exceptions */ - #ifdef CONFIG_ALTIVEC /* save altivec registers */ if (current->thread.used_vr) { @@ -809,18 +807,18 @@ static int handle_rt_signal(unsigned long sig, struct k_sigaction *ka, /* Save user registers on the stack */ frame = &rt_sf->uc.uc_mcontext; -#ifdef CONFIG_PPC64 if (vdso32_rt_sigtramp && current->thread.vdso_base) { if (save_user_regs(regs, frame, 0)) goto badframe; regs->link = current->thread.vdso_base + vdso32_rt_sigtramp; - } else -#endif - { + } else { if (save_user_regs(regs, frame, __NR_rt_sigreturn)) goto badframe; regs->link = (unsigned long) frame->tramp; } + + current->thread.fpscr.val = 0; /* turn off all fp exceptions */ + if (put_user(regs->gpr[1], (u32 __user *)newsp)) goto badframe; regs->gpr[1] = newsp; @@ -1090,19 +1088,18 @@ static int handle_signal(unsigned long sig, struct k_sigaction *ka, || __put_user(sig, &sc->signal)) goto badframe; -#ifdef CONFIG_PPC64 if (vdso32_sigtramp && current->thread.vdso_base) { if (save_user_regs(regs, &frame->mctx, 0)) goto badframe; regs->link = current->thread.vdso_base + vdso32_sigtramp; - } else -#endif - { + } else { if (save_user_regs(regs, &frame->mctx, __NR_sigreturn)) goto badframe; regs->link = (unsigned long) frame->mctx.tramp; } + current->thread.fpscr.val = 0; /* turn off all fp exceptions */ + if (put_user(regs->gpr[1], (u32 __user *)newsp)) goto badframe; regs->gpr[1] = newsp; diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index ec9d0984b6a..1decf278553 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -33,7 +33,6 @@ #include <asm/ucontext.h> #include <asm/uaccess.h> #include <asm/pgtable.h> -#include <asm/ppcdebug.h> #include <asm/unistd.h> #include <asm/cacheflush.h> #include <asm/vdso.h> @@ -132,9 +131,6 @@ static long setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, flush_fp_to_thread(current); - /* Make sure signal doesn't get spurrious FP exceptions */ - current->thread.fpscr.val = 0; - #ifdef CONFIG_ALTIVEC err |= __put_user(v_regs, &sc->v_regs); @@ -424,6 +420,9 @@ static int setup_rt_frame(int signr, struct k_sigaction *ka, siginfo_t *info, if (err) goto badframe; + /* Make sure signal handler doesn't get spurious FP exceptions */ + current->thread.fpscr.val = 0; + /* Set up to return from userspace. */ if (vdso64_rt_sigtramp && current->thread.vdso_base) { regs->link = current->thread.vdso_base + vdso64_rt_sigtramp; diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 1794a694a92..62dfc5b8d76 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -40,11 +40,11 @@ #include <asm/prom.h> #include <asm/smp.h> #include <asm/time.h> -#include <asm/xmon.h> #include <asm/machdep.h> #include <asm/cputable.h> #include <asm/system.h> #include <asm/mpic.h> +#include <asm/vdso_datapage.h> #ifdef CONFIG_PPC64 #include <asm/paca.h> #endif @@ -369,9 +369,11 @@ int generic_cpu_disable(void) if (cpu == boot_cpuid) return -EBUSY; - systemcfg->processorCount--; cpu_clear(cpu, cpu_online_map); +#ifdef CONFIG_PPC64 + vdso_data->processorCount--; fixup_irqs(cpu_online_map); +#endif return 0; } @@ -389,9 +391,11 @@ int generic_cpu_enable(unsigned int cpu) while (!cpu_online(cpu)) cpu_relax(); +#ifdef CONFIG_PPC64 fixup_irqs(cpu_online_map); /* counter the irq disable in fixup_irqs */ local_irq_enable(); +#endif return 0; } @@ -420,7 +424,9 @@ void generic_mach_cpu_die(void) while (__get_cpu_var(cpu_state) != CPU_UP_PREPARE) cpu_relax(); +#ifdef CONFIG_PPC64 flush_tlb_pending(); +#endif cpu_set(cpu, cpu_online_map); local_irq_enable(); } @@ -511,6 +517,7 @@ int __devinit start_secondary(void *unused) smp_store_cpu_info(cpu); set_dec(tb_ticks_per_jiffy); + preempt_disable(); cpu_callin_map[cpu] = 1; smp_ops->setup_cpu(cpu); diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c index a8210ed5c68..9c921d1c408 100644 --- a/arch/powerpc/kernel/sys_ppc32.c +++ b/arch/powerpc/kernel/sys_ppc32.c @@ -52,7 +52,6 @@ #include <asm/semaphore.h> #include <asm/time.h> #include <asm/mmu_context.h> -#include <asm/systemcfg.h> #include <asm/ppc-pci.h> /* readdir & getdents */ diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c new file mode 100644 index 00000000000..0f0c3a9ae2e --- /dev/null +++ b/arch/powerpc/kernel/sysfs.c @@ -0,0 +1,383 @@ +#include <linux/config.h> +#include <linux/sysdev.h> +#include <linux/cpu.h> +#include <linux/smp.h> +#include <linux/percpu.h> +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/module.h> +#include <linux/nodemask.h> +#include <linux/cpumask.h> +#include <linux/notifier.h> + +#include <asm/current.h> +#include <asm/processor.h> +#include <asm/cputable.h> +#include <asm/firmware.h> +#include <asm/hvcall.h> +#include <asm/prom.h> +#include <asm/paca.h> +#include <asm/lppaca.h> +#include <asm/machdep.h> +#include <asm/smp.h> + +static DEFINE_PER_CPU(struct cpu, cpu_devices); + +/* SMT stuff */ + +#ifdef CONFIG_PPC_MULTIPLATFORM +/* default to snooze disabled */ +DEFINE_PER_CPU(unsigned long, smt_snooze_delay); + +static ssize_t store_smt_snooze_delay(struct sys_device *dev, const char *buf, + size_t count) +{ + struct cpu *cpu = container_of(dev, struct cpu, sysdev); + ssize_t ret; + unsigned long snooze; + + ret = sscanf(buf, "%lu", &snooze); + if (ret != 1) + return -EINVAL; + + per_cpu(smt_snooze_delay, cpu->sysdev.id) = snooze; + + return count; +} + +static ssize_t show_smt_snooze_delay(struct sys_device *dev, char *buf) +{ + struct cpu *cpu = container_of(dev, struct cpu, sysdev); + + return sprintf(buf, "%lu\n", per_cpu(smt_snooze_delay, cpu->sysdev.id)); +} + +static SYSDEV_ATTR(smt_snooze_delay, 0644, show_smt_snooze_delay, + store_smt_snooze_delay); + +/* Only parse OF options if the matching cmdline option was not specified */ +static int smt_snooze_cmdline; + +static int __init smt_setup(void) +{ + struct device_node *options; + unsigned int *val; + unsigned int cpu; + + if (!cpu_has_feature(CPU_FTR_SMT)) + return 1; + + options = find_path_device("/options"); + if (!options) + return 1; + + val = (unsigned int *)get_property(options, "ibm,smt-snooze-delay", + NULL); + if (!smt_snooze_cmdline && val) { + for_each_cpu(cpu) + per_cpu(smt_snooze_delay, cpu) = *val; + } + + return 1; +} +__initcall(smt_setup); + +static int __init setup_smt_snooze_delay(char *str) +{ + unsigned int cpu; + int snooze; + + if (!cpu_has_feature(CPU_FTR_SMT)) + return 1; + + smt_snooze_cmdline = 1; + + if (get_option(&str, &snooze)) { + for_each_cpu(cpu) + per_cpu(smt_snooze_delay, cpu) = snooze; + } + + return 1; +} +__setup("smt-snooze-delay=", setup_smt_snooze_delay); + +#endif /* CONFIG_PPC_MULTIPLATFORM */ + +/* + * Enabling PMCs will slow partition context switch times so we only do + * it the first time we write to the PMCs. + */ + +static DEFINE_PER_CPU(char, pmcs_enabled); + +void ppc64_enable_pmcs(void) +{ + /* Only need to enable them once */ + if (__get_cpu_var(pmcs_enabled)) + return; + + __get_cpu_var(pmcs_enabled) = 1; + + if (ppc_md.enable_pmcs) + ppc_md.enable_pmcs(); +} +EXPORT_SYMBOL(ppc64_enable_pmcs); + +/* XXX convert to rusty's on_one_cpu */ +static unsigned long run_on_cpu(unsigned long cpu, + unsigned long (*func)(unsigned long), + unsigned long arg) +{ + cpumask_t old_affinity = current->cpus_allowed; + unsigned long ret; + + /* should return -EINVAL to userspace */ + if (set_cpus_allowed(current, cpumask_of_cpu(cpu))) + return 0; + + ret = func(arg); + + set_cpus_allowed(current, old_affinity); + + return ret; +} + +#define SYSFS_PMCSETUP(NAME, ADDRESS) \ +static unsigned long read_##NAME(unsigned long junk) \ +{ \ + return mfspr(ADDRESS); \ +} \ +static unsigned long write_##NAME(unsigned long val) \ +{ \ + ppc64_enable_pmcs(); \ + mtspr(ADDRESS, val); \ + return 0; \ +} \ +static ssize_t show_##NAME(struct sys_device *dev, char *buf) \ +{ \ + struct cpu *cpu = container_of(dev, struct cpu, sysdev); \ + unsigned long val = run_on_cpu(cpu->sysdev.id, read_##NAME, 0); \ + return sprintf(buf, "%lx\n", val); \ +} \ +static ssize_t __attribute_used__ \ + store_##NAME(struct sys_device *dev, const char *buf, size_t count) \ +{ \ + struct cpu *cpu = container_of(dev, struct cpu, sysdev); \ + unsigned long val; \ + int ret = sscanf(buf, "%lx", &val); \ + if (ret != 1) \ + return -EINVAL; \ + run_on_cpu(cpu->sysdev.id, write_##NAME, val); \ + return count; \ +} + +SYSFS_PMCSETUP(mmcr0, SPRN_MMCR0); +SYSFS_PMCSETUP(mmcr1, SPRN_MMCR1); +SYSFS_PMCSETUP(mmcra, SPRN_MMCRA); +SYSFS_PMCSETUP(pmc1, SPRN_PMC1); +SYSFS_PMCSETUP(pmc2, SPRN_PMC2); +SYSFS_PMCSETUP(pmc3, SPRN_PMC3); +SYSFS_PMCSETUP(pmc4, SPRN_PMC4); +SYSFS_PMCSETUP(pmc5, SPRN_PMC5); +SYSFS_PMCSETUP(pmc6, SPRN_PMC6); +SYSFS_PMCSETUP(pmc7, SPRN_PMC7); +SYSFS_PMCSETUP(pmc8, SPRN_PMC8); +SYSFS_PMCSETUP(purr, SPRN_PURR); + +static SYSDEV_ATTR(mmcr0, 0600, show_mmcr0, store_mmcr0); +static SYSDEV_ATTR(mmcr1, 0600, show_mmcr1, store_mmcr1); +static SYSDEV_ATTR(mmcra, 0600, show_mmcra, store_mmcra); +static SYSDEV_ATTR(pmc1, 0600, show_pmc1, store_pmc1); +static SYSDEV_ATTR(pmc2, 0600, show_pmc2, store_pmc2); +static SYSDEV_ATTR(pmc3, 0600, show_pmc3, store_pmc3); +static SYSDEV_ATTR(pmc4, 0600, show_pmc4, store_pmc4); +static SYSDEV_ATTR(pmc5, 0600, show_pmc5, store_pmc5); +static SYSDEV_ATTR(pmc6, 0600, show_pmc6, store_pmc6); +static SYSDEV_ATTR(pmc7, 0600, show_pmc7, store_pmc7); +static SYSDEV_ATTR(pmc8, 0600, show_pmc8, store_pmc8); +static SYSDEV_ATTR(purr, 0600, show_purr, NULL); + +static void register_cpu_online(unsigned int cpu) +{ + struct cpu *c = &per_cpu(cpu_devices, cpu); + struct sys_device *s = &c->sysdev; + +#ifndef CONFIG_PPC_ISERIES + if (cpu_has_feature(CPU_FTR_SMT)) + sysdev_create_file(s, &attr_smt_snooze_delay); +#endif + + /* PMC stuff */ + + sysdev_create_file(s, &attr_mmcr0); + sysdev_create_file(s, &attr_mmcr1); + + if (cpu_has_feature(CPU_FTR_MMCRA)) + sysdev_create_file(s, &attr_mmcra); + + if (cur_cpu_spec->num_pmcs >= 1) + sysdev_create_file(s, &attr_pmc1); + if (cur_cpu_spec->num_pmcs >= 2) + sysdev_create_file(s, &attr_pmc2); + if (cur_cpu_spec->num_pmcs >= 3) + sysdev_create_file(s, &attr_pmc3); + if (cur_cpu_spec->num_pmcs >= 4) + sysdev_create_file(s, &attr_pmc4); + if (cur_cpu_spec->num_pmcs >= 5) + sysdev_create_file(s, &attr_pmc5); + if (cur_cpu_spec->num_pmcs >= 6) + sysdev_create_file(s, &attr_pmc6); + if (cur_cpu_spec->num_pmcs >= 7) + sysdev_create_file(s, &attr_pmc7); + if (cur_cpu_spec->num_pmcs >= 8) + sysdev_create_file(s, &attr_pmc8); + + if (cpu_has_feature(CPU_FTR_SMT)) + sysdev_create_file(s, &attr_purr); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void unregister_cpu_online(unsigned int cpu) +{ + struct cpu *c = &per_cpu(cpu_devices, cpu); + struct sys_device *s = &c->sysdev; + + BUG_ON(c->no_control); + +#ifndef CONFIG_PPC_ISERIES + if (cpu_has_feature(CPU_FTR_SMT)) + sysdev_remove_file(s, &attr_smt_snooze_delay); +#endif + + /* PMC stuff */ + + sysdev_remove_file(s, &attr_mmcr0); + sysdev_remove_file(s, &attr_mmcr1); + + if (cpu_has_feature(CPU_FTR_MMCRA)) + sysdev_remove_file(s, &attr_mmcra); + + if (cur_cpu_spec->num_pmcs >= 1) + sysdev_remove_file(s, &attr_pmc1); + if (cur_cpu_spec->num_pmcs >= 2) + sysdev_remove_file(s, &attr_pmc2); + if (cur_cpu_spec->num_pmcs >= 3) + sysdev_remove_file(s, &attr_pmc3); + if (cur_cpu_spec->num_pmcs >= 4) + sysdev_remove_file(s, &attr_pmc4); + if (cur_cpu_spec->num_pmcs >= 5) + sysdev_remove_file(s, &attr_pmc5); + if (cur_cpu_spec->num_pmcs >= 6) + sysdev_remove_file(s, &attr_pmc6); + if (cur_cpu_spec->num_pmcs >= 7) + sysdev_remove_file(s, &attr_pmc7); + if (cur_cpu_spec->num_pmcs >= 8) + sysdev_remove_file(s, &attr_pmc8); + + if (cpu_has_feature(CPU_FTR_SMT)) + sysdev_remove_file(s, &attr_purr); +} +#endif /* CONFIG_HOTPLUG_CPU */ + +static int __devinit sysfs_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned int)(long)hcpu; + + switch (action) { + case CPU_ONLINE: + register_cpu_online(cpu); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DEAD: + unregister_cpu_online(cpu); + break; +#endif + } + return NOTIFY_OK; +} + +static struct notifier_block __devinitdata sysfs_cpu_nb = { + .notifier_call = sysfs_cpu_notify, +}; + +/* NUMA stuff */ + +#ifdef CONFIG_NUMA +static struct node node_devices[MAX_NUMNODES]; + +static void register_nodes(void) +{ + int i; + + for (i = 0; i < MAX_NUMNODES; i++) { + if (node_online(i)) { + int p_node = parent_node(i); + struct node *parent = NULL; + + if (p_node != i) + parent = &node_devices[p_node]; + + register_node(&node_devices[i], i, parent); + } + } +} +#else +static void register_nodes(void) +{ + return; +} +#endif + +/* Only valid if CPU is present. */ +static ssize_t show_physical_id(struct sys_device *dev, char *buf) +{ + struct cpu *cpu = container_of(dev, struct cpu, sysdev); + + return sprintf(buf, "%d\n", get_hard_smp_processor_id(cpu->sysdev.id)); +} +static SYSDEV_ATTR(physical_id, 0444, show_physical_id, NULL); + +static int __init topology_init(void) +{ + int cpu; + struct node *parent = NULL; + + register_nodes(); + + register_cpu_notifier(&sysfs_cpu_nb); + + for_each_cpu(cpu) { + struct cpu *c = &per_cpu(cpu_devices, cpu); + +#ifdef CONFIG_NUMA + /* The node to which a cpu belongs can't be known + * until the cpu is made present. + */ + parent = NULL; + if (cpu_present(cpu)) + parent = &node_devices[cpu_to_node(cpu)]; +#endif + /* + * For now, we just see if the system supports making + * the RTAS calls for CPU hotplug. But, there may be a + * more comprehensive way to do this for an individual + * CPU. For instance, the boot cpu might never be valid + * for hotplugging. + */ + if (!ppc_md.cpu_die) + c->no_control = 1; + + if (cpu_online(cpu) || (c->no_control == 0)) { + register_cpu(c, cpu, parent); + + sysdev_create_file(&c->sysdev, &attr_physical_id); + } + + if (cpu_online(cpu)) + register_cpu_online(cpu); + } + + return 0; +} +__initcall(topology_init); diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 6996a593dcb..070b4b458aa 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -61,14 +61,16 @@ #include <asm/prom.h> #include <asm/irq.h> #include <asm/div64.h> +#include <asm/smp.h> +#include <asm/vdso_datapage.h> #ifdef CONFIG_PPC64 -#include <asm/systemcfg.h> #include <asm/firmware.h> #endif #ifdef CONFIG_PPC_ISERIES #include <asm/iseries/it_lp_queue.h> #include <asm/iseries/hv_call_xm.h> #endif +#include <asm/smp.h> /* keep track of when we need to update the rtc */ time_t last_rtc_update; @@ -118,10 +120,6 @@ static unsigned adjusting_time = 0; unsigned long ppc_proc_freq; unsigned long ppc_tb_freq; -#ifdef CONFIG_PPC32 /* XXX for now */ -#define boot_cpuid 0 -#endif - u64 tb_last_jiffy __cacheline_aligned_in_smp; unsigned long tb_last_stamp; @@ -263,7 +261,6 @@ static inline void update_gtod(u64 new_tb_stamp, u64 new_stamp_xsec, do_gtod.varp = temp_varp; do_gtod.var_idx = temp_idx; -#ifdef CONFIG_PPC64 /* * tb_update_count is used to allow the userspace gettimeofday code * to assure itself that it sees a consistent view of the tb_to_xs and @@ -273,14 +270,15 @@ static inline void update_gtod(u64 new_tb_stamp, u64 new_stamp_xsec, * tb_to_xs and stamp_xsec values are consistent. If not, then it * loops back and reads them again until this criteria is met. */ - ++(systemcfg->tb_update_count); + ++(vdso_data->tb_update_count); smp_wmb(); - systemcfg->tb_orig_stamp = new_tb_stamp; - systemcfg->stamp_xsec = new_stamp_xsec; - systemcfg->tb_to_xs = new_tb_to_xs; + vdso_data->tb_orig_stamp = new_tb_stamp; + vdso_data->stamp_xsec = new_stamp_xsec; + vdso_data->tb_to_xs = new_tb_to_xs; + vdso_data->wtom_clock_sec = wall_to_monotonic.tv_sec; + vdso_data->wtom_clock_nsec = wall_to_monotonic.tv_nsec; smp_wmb(); - ++(systemcfg->tb_update_count); -#endif + ++(vdso_data->tb_update_count); } /* @@ -359,8 +357,8 @@ static void iSeries_tb_recal(void) do_gtod.tb_ticks_per_sec = tb_ticks_per_sec; tb_to_xs = divres.result_low; do_gtod.varp->tb_to_xs = tb_to_xs; - systemcfg->tb_ticks_per_sec = tb_ticks_per_sec; - systemcfg->tb_to_xs = tb_to_xs; + vdso_data->tb_ticks_per_sec = tb_ticks_per_sec; + vdso_data->tb_to_xs = tb_to_xs; } else { printk( "Titan recalibrate: FAILED (difference > 4 percent)\n" @@ -485,6 +483,8 @@ void __init smp_space_timers(unsigned int max_cpus) unsigned long offset = tb_ticks_per_jiffy / max_cpus; unsigned long previous_tb = per_cpu(last_jiffy, boot_cpuid); + /* make sure tb > per_cpu(last_jiffy, cpu) for all cpus always */ + previous_tb -= tb_ticks_per_jiffy; for_each_cpu(i) { if (i != boot_cpuid) { previous_tb += offset; @@ -560,10 +560,8 @@ int do_settimeofday(struct timespec *tv) new_xsec += (u64)new_sec * XSEC_PER_SEC - tb_delta_xs; update_gtod(tb_last_jiffy, new_xsec, do_gtod.varp->tb_to_xs); -#ifdef CONFIG_PPC64 - systemcfg->tz_minuteswest = sys_tz.tz_minuteswest; - systemcfg->tz_dsttime = sys_tz.tz_dsttime; -#endif + vdso_data->tz_minuteswest = sys_tz.tz_minuteswest; + vdso_data->tz_dsttime = sys_tz.tz_dsttime; write_sequnlock_irqrestore(&xtime_lock, flags); clock_was_set(); @@ -712,13 +710,12 @@ void __init time_init(void) do_gtod.tb_ticks_per_sec = tb_ticks_per_sec; do_gtod.varp->tb_to_xs = tb_to_xs; do_gtod.tb_to_us = tb_to_us; -#ifdef CONFIG_PPC64 - systemcfg->tb_orig_stamp = tb_last_jiffy; - systemcfg->tb_update_count = 0; - systemcfg->tb_ticks_per_sec = tb_ticks_per_sec; - systemcfg->stamp_xsec = xtime.tv_sec * XSEC_PER_SEC; - systemcfg->tb_to_xs = tb_to_xs; -#endif + + vdso_data->tb_orig_stamp = tb_last_jiffy; + vdso_data->tb_update_count = 0; + vdso_data->tb_ticks_per_sec = tb_ticks_per_sec; + vdso_data->stamp_xsec = xtime.tv_sec * XSEC_PER_SEC; + vdso_data->tb_to_xs = tb_to_xs; time_freq = 0; diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 07e5ee40b87..1511454c469 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -39,7 +39,6 @@ #include <asm/io.h> #include <asm/machdep.h> #include <asm/rtas.h> -#include <asm/xmon.h> #include <asm/pmc.h> #ifdef CONFIG_PPC32 #include <asm/reg.h> @@ -50,7 +49,6 @@ #ifdef CONFIG_PPC64 #include <asm/firmware.h> #include <asm/processor.h> -#include <asm/systemcfg.h> #endif #ifdef CONFIG_PPC64 /* XXX */ @@ -130,7 +128,7 @@ int die(const char *str, struct pt_regs *regs, long err) nl = 1; #endif #ifdef CONFIG_PPC64 - switch (systemcfg->platform) { + switch (_machine) { case PLATFORM_PSERIES: printk("PSERIES "); nl = 1; @@ -748,22 +746,12 @@ static int check_bug_trap(struct pt_regs *regs) return 0; if (bug->line & BUG_WARNING_TRAP) { /* this is a WARN_ON rather than BUG/BUG_ON */ -#ifdef CONFIG_XMON - xmon_printf(KERN_ERR "Badness in %s at %s:%ld\n", - bug->function, bug->file, - bug->line & ~BUG_WARNING_TRAP); -#endif /* CONFIG_XMON */ printk(KERN_ERR "Badness in %s at %s:%ld\n", bug->function, bug->file, bug->line & ~BUG_WARNING_TRAP); dump_stack(); return 1; } -#ifdef CONFIG_XMON - xmon_printf(KERN_CRIT "kernel BUG in %s at %s:%ld!\n", - bug->function, bug->file, bug->line); - xmon(regs); -#endif /* CONFIG_XMON */ printk(KERN_CRIT "kernel BUG in %s at %s:%ld!\n", bug->function, bug->file, bug->line); @@ -898,10 +886,6 @@ void altivec_unavailable_exception(struct pt_regs *regs) die("Unrecoverable VMX/Altivec Unavailable Exception", regs, SIGABRT); } -#ifdef CONFIG_PPC64 -extern perf_irq_t perf_irq; -#endif - #if defined(CONFIG_PPC64) || defined(CONFIG_E500) void performance_monitor_exception(struct pt_regs *regs) { diff --git a/arch/powerpc/kernel/udbg.c b/arch/powerpc/kernel/udbg.c new file mode 100644 index 00000000000..0d878e72fc4 --- /dev/null +++ b/arch/powerpc/kernel/udbg.c @@ -0,0 +1,125 @@ +/* + * polling mode stateless debugging stuff, originally for NS16550 Serial Ports + * + * c 2001 PPC 64 Team, IBM Corp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <stdarg.h> +#include <linux/config.h> +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/console.h> +#include <asm/processor.h> + +void (*udbg_putc)(unsigned char c); +unsigned char (*udbg_getc)(void); +int (*udbg_getc_poll)(void); + +/* udbg library, used by xmon et al */ +void udbg_puts(const char *s) +{ + if (udbg_putc) { + char c; + + if (s && *s != '\0') { + while ((c = *s++) != '\0') + udbg_putc(c); + } + } +#if 0 + else { + printk("%s", s); + } +#endif +} + +int udbg_write(const char *s, int n) +{ + int remain = n; + char c; + + if (!udbg_putc) + return 0; + + if (s && *s != '\0') { + while (((c = *s++) != '\0') && (remain-- > 0)) { + udbg_putc(c); + } + } + + return n - remain; +} + +int udbg_read(char *buf, int buflen) +{ + char c, *p = buf; + int i; + + if (!udbg_getc) + return 0; + + for (i = 0; i < buflen; ++i) { + do { + c = udbg_getc(); + } while (c == 0x11 || c == 0x13); + if (c == 0) + break; + *p++ = c; + } + + return i; +} + +#define UDBG_BUFSIZE 256 +void udbg_printf(const char *fmt, ...) +{ + unsigned char buf[UDBG_BUFSIZE]; + va_list args; + + va_start(args, fmt); + vsnprintf(buf, UDBG_BUFSIZE, fmt, args); + udbg_puts(buf); + va_end(args); +} + +/* + * Early boot console based on udbg + */ +static void udbg_console_write(struct console *con, const char *s, + unsigned int n) +{ + udbg_write(s, n); +} + +static struct console udbg_console = { + .name = "udbg", + .write = udbg_console_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +static int early_console_initialized; + +void __init disable_early_printk(void) +{ + if (!early_console_initialized) + return; + unregister_console(&udbg_console); + early_console_initialized = 0; +} + +/* called by setup_system */ +void register_early_udbg_console(void) +{ + early_console_initialized = 1; + register_console(&udbg_console); +} + +#if 0 /* if you want to use this as a regular output console */ +console_initcall(register_udbg_console); +#endif diff --git a/arch/powerpc/kernel/udbg_16550.c b/arch/powerpc/kernel/udbg_16550.c new file mode 100644 index 00000000000..9313574ab93 --- /dev/null +++ b/arch/powerpc/kernel/udbg_16550.c @@ -0,0 +1,123 @@ +/* + * udbg for for NS16550 compatable serial ports + * + * Copyright (C) 2001-2005 PPC 64 Team, IBM Corp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/config.h> +#include <linux/types.h> +#include <asm/udbg.h> +#include <asm/io.h> + +extern u8 real_readb(volatile u8 __iomem *addr); +extern void real_writeb(u8 data, volatile u8 __iomem *addr); + +struct NS16550 { + /* this struct must be packed */ + unsigned char rbr; /* 0 */ + unsigned char ier; /* 1 */ + unsigned char fcr; /* 2 */ + unsigned char lcr; /* 3 */ + unsigned char mcr; /* 4 */ + unsigned char lsr; /* 5 */ + unsigned char msr; /* 6 */ + unsigned char scr; /* 7 */ +}; + +#define thr rbr +#define iir fcr +#define dll rbr +#define dlm ier +#define dlab lcr + +#define LSR_DR 0x01 /* Data ready */ +#define LSR_OE 0x02 /* Overrun */ +#define LSR_PE 0x04 /* Parity error */ +#define LSR_FE 0x08 /* Framing error */ +#define LSR_BI 0x10 /* Break */ +#define LSR_THRE 0x20 /* Xmit holding register empty */ +#define LSR_TEMT 0x40 /* Xmitter empty */ +#define LSR_ERR 0x80 /* Error */ + +static volatile struct NS16550 __iomem *udbg_comport; + +static void udbg_550_putc(unsigned char c) +{ + if (udbg_comport) { + while ((in_8(&udbg_comport->lsr) & LSR_THRE) == 0) + /* wait for idle */; + out_8(&udbg_comport->thr, c); + if (c == '\n') + udbg_550_putc('\r'); + } +} + +static int udbg_550_getc_poll(void) +{ + if (udbg_comport) { + if ((in_8(&udbg_comport->lsr) & LSR_DR) != 0) + return in_8(&udbg_comport->rbr); + else + return -1; + } + return -1; +} + +static unsigned char udbg_550_getc(void) +{ + if (udbg_comport) { + while ((in_8(&udbg_comport->lsr) & LSR_DR) == 0) + /* wait for char */; + return in_8(&udbg_comport->rbr); + } + return 0; +} + +void udbg_init_uart(void __iomem *comport, unsigned int speed) +{ + u16 dll = speed ? (115200 / speed) : 12; + + if (comport) { + udbg_comport = (struct NS16550 __iomem *)comport; + out_8(&udbg_comport->lcr, 0x00); + out_8(&udbg_comport->ier, 0xff); + out_8(&udbg_comport->ier, 0x00); + out_8(&udbg_comport->lcr, 0x80); /* Access baud rate */ + out_8(&udbg_comport->dll, dll & 0xff); /* 1 = 115200, 2 = 57600, + 3 = 38400, 12 = 9600 baud */ + out_8(&udbg_comport->dlm, dll >> 8); /* dll >> 8 which should be zero + for fast rates; */ + out_8(&udbg_comport->lcr, 0x03); /* 8 data, 1 stop, no parity */ + out_8(&udbg_comport->mcr, 0x03); /* RTS/DTR */ + out_8(&udbg_comport->fcr ,0x07); /* Clear & enable FIFOs */ + udbg_putc = udbg_550_putc; + udbg_getc = udbg_550_getc; + udbg_getc_poll = udbg_550_getc_poll; + } +} + +#ifdef CONFIG_PPC_MAPLE +void udbg_maple_real_putc(unsigned char c) +{ + if (udbg_comport) { + while ((real_readb(&udbg_comport->lsr) & LSR_THRE) == 0) + /* wait for idle */; + real_writeb(c, &udbg_comport->thr); eieio(); + if (c == '\n') + udbg_maple_real_putc('\r'); + } +} + +void udbg_init_maple_realmode(void) +{ + udbg_comport = (volatile struct NS16550 __iomem *)0xf40003f8; + + udbg_putc = udbg_maple_real_putc; + udbg_getc = NULL; + udbg_getc_poll = NULL; +} +#endif /* CONFIG_PPC_MAPLE */ diff --git a/arch/powerpc/kernel/udbg_scc.c b/arch/powerpc/kernel/udbg_scc.c new file mode 100644 index 00000000000..820c5355150 --- /dev/null +++ b/arch/powerpc/kernel/udbg_scc.c @@ -0,0 +1,135 @@ +/* + * udbg for for zilog scc ports as found on Apple PowerMacs + * + * Copyright (C) 2001-2005 PPC 64 Team, IBM Corp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/config.h> +#include <linux/types.h> +#include <asm/udbg.h> +#include <asm/processor.h> +#include <asm/io.h> +#include <asm/prom.h> +#include <asm/pmac_feature.h> + +extern u8 real_readb(volatile u8 __iomem *addr); +extern void real_writeb(u8 data, volatile u8 __iomem *addr); + +#define SCC_TXRDY 4 +#define SCC_RXRDY 1 + +static volatile u8 __iomem *sccc; +static volatile u8 __iomem *sccd; + +static void udbg_scc_putc(unsigned char c) +{ + if (sccc) { + while ((in_8(sccc) & SCC_TXRDY) == 0) + ; + out_8(sccd, c); + if (c == '\n') + udbg_scc_putc('\r'); + } +} + +static int udbg_scc_getc_poll(void) +{ + if (sccc) { + if ((in_8(sccc) & SCC_RXRDY) != 0) + return in_8(sccd); + else + return -1; + } + return -1; +} + +static unsigned char udbg_scc_getc(void) +{ + if (sccc) { + while ((in_8(sccc) & SCC_RXRDY) == 0) + ; + return in_8(sccd); + } + return 0; +} + +static unsigned char scc_inittab[] = { + 13, 0, /* set baud rate divisor */ + 12, 0, + 14, 1, /* baud rate gen enable, src=rtxc */ + 11, 0x50, /* clocks = br gen */ + 5, 0xea, /* tx 8 bits, assert DTR & RTS */ + 4, 0x46, /* x16 clock, 1 stop */ + 3, 0xc1, /* rx enable, 8 bits */ +}; + +void udbg_init_scc(struct device_node *np) +{ + u32 *reg; + unsigned long addr; + int i, x; + + if (np == NULL) + np = of_find_node_by_name(NULL, "escc"); + if (np == NULL || np->parent == NULL) + return; + + udbg_printf("found SCC...\n"); + /* Get address within mac-io ASIC */ + reg = (u32 *)get_property(np, "reg", NULL); + if (reg == NULL) + return; + addr = reg[0]; + udbg_printf("local addr: %lx\n", addr); + /* Get address of mac-io PCI itself */ + reg = (u32 *)get_property(np->parent, "assigned-addresses", NULL); + if (reg == NULL) + return; + addr += reg[2]; + udbg_printf("final addr: %lx\n", addr); + + /* Setup for 57600 8N1 */ + addr += 0x20; + sccc = (volatile u8 * __iomem) ioremap(addr & PAGE_MASK, PAGE_SIZE) ; + sccc += addr & ~PAGE_MASK; + sccd = sccc + 0x10; + + udbg_printf("ioremap result sccc: %p\n", sccc); + mb(); + + for (i = 20000; i != 0; --i) + x = in_8(sccc); + out_8(sccc, 0x09); /* reset A or B side */ + out_8(sccc, 0xc0); + for (i = 0; i < sizeof(scc_inittab); ++i) + out_8(sccc, scc_inittab[i]); + + udbg_putc = udbg_scc_putc; + udbg_getc = udbg_scc_getc; + udbg_getc_poll = udbg_scc_getc_poll; + + udbg_puts("Hello World !\n"); +} + +static void udbg_real_scc_putc(unsigned char c) +{ + while ((real_readb(sccc) & SCC_TXRDY) == 0) + ; + real_writeb(c, sccd); + if (c == '\n') + udbg_real_scc_putc('\r'); +} + +void udbg_init_pmac_realmode(void) +{ + sccc = (volatile u8 __iomem *)0x80013020ul; + sccd = (volatile u8 __iomem *)0x80013030ul; + + udbg_putc = udbg_real_scc_putc; + udbg_getc = NULL; + udbg_getc_poll = NULL; +} diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c new file mode 100644 index 00000000000..0d4d8bec0df --- /dev/null +++ b/arch/powerpc/kernel/vdso.c @@ -0,0 +1,746 @@ +/* + * linux/arch/ppc64/kernel/vdso.c + * + * Copyright (C) 2004 Benjamin Herrenschmidt, IBM Corp. + * <benh@kernel.crashing.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/smp_lock.h> +#include <linux/stddef.h> +#include <linux/unistd.h> +#include <linux/slab.h> +#include <linux/user.h> +#include <linux/elf.h> +#include <linux/security.h> +#include <linux/bootmem.h> + +#include <asm/pgtable.h> +#include <asm/system.h> +#include <asm/processor.h> +#include <asm/mmu.h> +#include <asm/mmu_context.h> +#include <asm/lmb.h> +#include <asm/machdep.h> +#include <asm/cputable.h> +#include <asm/sections.h> +#include <asm/vdso.h> +#include <asm/vdso_datapage.h> + +#undef DEBUG + +#ifdef DEBUG +#define DBG(fmt...) printk(fmt) +#else +#define DBG(fmt...) +#endif + +/* Max supported size for symbol names */ +#define MAX_SYMNAME 64 + +extern char vdso32_start, vdso32_end; +static void *vdso32_kbase = &vdso32_start; +unsigned int vdso32_pages; +unsigned long vdso32_sigtramp; +unsigned long vdso32_rt_sigtramp; + +#ifdef CONFIG_PPC64 +extern char vdso64_start, vdso64_end; +static void *vdso64_kbase = &vdso64_start; +unsigned int vdso64_pages; +unsigned long vdso64_rt_sigtramp; +#endif /* CONFIG_PPC64 */ + +/* + * The vdso data page (aka. systemcfg for old ppc64 fans) is here. + * Once the early boot kernel code no longer needs to muck around + * with it, it will become dynamically allocated + */ +static union { + struct vdso_data data; + u8 page[PAGE_SIZE]; +} vdso_data_store __attribute__((__section__(".data.page_aligned"))); +struct vdso_data *vdso_data = &vdso_data_store.data; + +/* Format of the patch table */ +struct vdso_patch_def +{ + unsigned long ftr_mask, ftr_value; + const char *gen_name; + const char *fix_name; +}; + +/* Table of functions to patch based on the CPU type/revision + * + * Currently, we only change sync_dicache to do nothing on processors + * with a coherent icache + */ +static struct vdso_patch_def vdso_patches[] = { + { + CPU_FTR_COHERENT_ICACHE, CPU_FTR_COHERENT_ICACHE, + "__kernel_sync_dicache", "__kernel_sync_dicache_p5" + }, + { + CPU_FTR_USE_TB, 0, + "__kernel_gettimeofday", NULL + }, +}; + +/* + * Some infos carried around for each of them during parsing at + * boot time. + */ +struct lib32_elfinfo +{ + Elf32_Ehdr *hdr; /* ptr to ELF */ + Elf32_Sym *dynsym; /* ptr to .dynsym section */ + unsigned long dynsymsize; /* size of .dynsym section */ + char *dynstr; /* ptr to .dynstr section */ + unsigned long text; /* offset of .text section in .so */ +}; + +struct lib64_elfinfo +{ + Elf64_Ehdr *hdr; + Elf64_Sym *dynsym; + unsigned long dynsymsize; + char *dynstr; + unsigned long text; +}; + + +#ifdef __DEBUG +static void dump_one_vdso_page(struct page *pg, struct page *upg) +{ + printk("kpg: %p (c:%d,f:%08lx)", __va(page_to_pfn(pg) << PAGE_SHIFT), + page_count(pg), + pg->flags); + if (upg/* && pg != upg*/) { + printk(" upg: %p (c:%d,f:%08lx)", __va(page_to_pfn(upg) + << PAGE_SHIFT), + page_count(upg), + upg->flags); + } + printk("\n"); +} + +static void dump_vdso_pages(struct vm_area_struct * vma) +{ + int i; + + if (!vma || test_thread_flag(TIF_32BIT)) { + printk("vDSO32 @ %016lx:\n", (unsigned long)vdso32_kbase); + for (i=0; i<vdso32_pages; i++) { + struct page *pg = virt_to_page(vdso32_kbase + + i*PAGE_SIZE); + struct page *upg = (vma && vma->vm_mm) ? + follow_page(vma->vm_mm, vma->vm_start + + i*PAGE_SIZE, 0) + : NULL; + dump_one_vdso_page(pg, upg); + } + } + if (!vma || !test_thread_flag(TIF_32BIT)) { + printk("vDSO64 @ %016lx:\n", (unsigned long)vdso64_kbase); + for (i=0; i<vdso64_pages; i++) { + struct page *pg = virt_to_page(vdso64_kbase + + i*PAGE_SIZE); + struct page *upg = (vma && vma->vm_mm) ? + follow_page(vma->vm_mm, vma->vm_start + + i*PAGE_SIZE, 0) + : NULL; + dump_one_vdso_page(pg, upg); + } + } +} +#endif /* DEBUG */ + +/* + * Keep a dummy vma_close for now, it will prevent VMA merging. + */ +static void vdso_vma_close(struct vm_area_struct * vma) +{ +} + +/* + * Our nopage() function, maps in the actual vDSO kernel pages, they will + * be mapped read-only by do_no_page(), and eventually COW'ed, either + * right away for an initial write access, or by do_wp_page(). + */ +static struct page * vdso_vma_nopage(struct vm_area_struct * vma, + unsigned long address, int *type) +{ + unsigned long offset = address - vma->vm_start; + struct page *pg; +#ifdef CONFIG_PPC64 + void *vbase = test_thread_flag(TIF_32BIT) ? + vdso32_kbase : vdso64_kbase; +#else + void *vbase = vdso32_kbase; +#endif + + DBG("vdso_vma_nopage(current: %s, address: %016lx, off: %lx)\n", + current->comm, address, offset); + + if (address < vma->vm_start || address > vma->vm_end) + return NOPAGE_SIGBUS; + + /* + * Last page is systemcfg. + */ + if ((vma->vm_end - address) <= PAGE_SIZE) + pg = virt_to_page(vdso_data); + else + pg = virt_to_page(vbase + offset); + + get_page(pg); + DBG(" ->page count: %d\n", page_count(pg)); + + return pg; +} + +static struct vm_operations_struct vdso_vmops = { + .close = vdso_vma_close, + .nopage = vdso_vma_nopage, +}; + +/* + * This is called from binfmt_elf, we create the special vma for the + * vDSO and insert it into the mm struct tree + */ +int arch_setup_additional_pages(struct linux_binprm *bprm, + int executable_stack) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long vdso_pages; + unsigned long vdso_base; + +#ifdef CONFIG_PPC64 + if (test_thread_flag(TIF_32BIT)) { + vdso_pages = vdso32_pages; + vdso_base = VDSO32_MBASE; + } else { + vdso_pages = vdso64_pages; + vdso_base = VDSO64_MBASE; + } +#else + vdso_pages = vdso32_pages; + vdso_base = VDSO32_MBASE; +#endif + + current->thread.vdso_base = 0; + + /* vDSO has a problem and was disabled, just don't "enable" it for the + * process + */ + if (vdso_pages == 0) + return 0; + + vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (vma == NULL) + return -ENOMEM; + + memset(vma, 0, sizeof(*vma)); + + /* Add a page to the vdso size for the data page */ + vdso_pages ++; + + /* + * pick a base address for the vDSO in process space. We try to put it + * at vdso_base which is the "natural" base for it, but we might fail + * and end up putting it elsewhere. + */ + vdso_base = get_unmapped_area(NULL, vdso_base, + vdso_pages << PAGE_SHIFT, 0, 0); + if (vdso_base & ~PAGE_MASK) { + kmem_cache_free(vm_area_cachep, vma); + return (int)vdso_base; + } + + current->thread.vdso_base = vdso_base; + + vma->vm_mm = mm; + vma->vm_start = current->thread.vdso_base; + vma->vm_end = vma->vm_start + (vdso_pages << PAGE_SHIFT); + + /* + * our vma flags don't have VM_WRITE so by default, the process isn't + * allowed to write those pages. + * gdb can break that with ptrace interface, and thus trigger COW on + * those pages but it's then your responsibility to never do that on + * the "data" page of the vDSO or you'll stop getting kernel updates + * and your nice userland gettimeofday will be totally dead. + * It's fine to use that for setting breakpoints in the vDSO code + * pages though + */ + vma->vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | + VM_MAYEXEC | VM_RESERVED; + vma->vm_flags |= mm->def_flags; + vma->vm_page_prot = protection_map[vma->vm_flags & 0x7]; + vma->vm_ops = &vdso_vmops; + + down_write(&mm->mmap_sem); + if (insert_vm_struct(mm, vma)) { + up_write(&mm->mmap_sem); + kmem_cache_free(vm_area_cachep, vma); + return -ENOMEM; + } + mm->total_vm += (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + up_write(&mm->mmap_sem); + + return 0; +} + +static void * __init find_section32(Elf32_Ehdr *ehdr, const char *secname, + unsigned long *size) +{ + Elf32_Shdr *sechdrs; + unsigned int i; + char *secnames; + + /* Grab section headers and strings so we can tell who is who */ + sechdrs = (void *)ehdr + ehdr->e_shoff; + secnames = (void *)ehdr + sechdrs[ehdr->e_shstrndx].sh_offset; + + /* Find the section they want */ + for (i = 1; i < ehdr->e_shnum; i++) { + if (strcmp(secnames+sechdrs[i].sh_name, secname) == 0) { + if (size) + *size = sechdrs[i].sh_size; + return (void *)ehdr + sechdrs[i].sh_offset; + } + } + *size = 0; + return NULL; +} + +static Elf32_Sym * __init find_symbol32(struct lib32_elfinfo *lib, + const char *symname) +{ + unsigned int i; + char name[MAX_SYMNAME], *c; + + for (i = 0; i < (lib->dynsymsize / sizeof(Elf32_Sym)); i++) { + if (lib->dynsym[i].st_name == 0) + continue; + strlcpy(name, lib->dynstr + lib->dynsym[i].st_name, + MAX_SYMNAME); + c = strchr(name, '@'); + if (c) + *c = 0; + if (strcmp(symname, name) == 0) + return &lib->dynsym[i]; + } + return NULL; +} + +/* Note that we assume the section is .text and the symbol is relative to + * the library base + */ +static unsigned long __init find_function32(struct lib32_elfinfo *lib, + const char *symname) +{ + Elf32_Sym *sym = find_symbol32(lib, symname); + + if (sym == NULL) { + printk(KERN_WARNING "vDSO32: function %s not found !\n", + symname); + return 0; + } + return sym->st_value - VDSO32_LBASE; +} + +static int vdso_do_func_patch32(struct lib32_elfinfo *v32, + struct lib64_elfinfo *v64, + const char *orig, const char *fix) +{ + Elf32_Sym *sym32_gen, *sym32_fix; + + sym32_gen = find_symbol32(v32, orig); + if (sym32_gen == NULL) { + printk(KERN_ERR "vDSO32: Can't find symbol %s !\n", orig); + return -1; + } + if (fix == NULL) { + sym32_gen->st_name = 0; + return 0; + } + sym32_fix = find_symbol32(v32, fix); + if (sym32_fix == NULL) { + printk(KERN_ERR "vDSO32: Can't find symbol %s !\n", fix); + return -1; + } + sym32_gen->st_value = sym32_fix->st_value; + sym32_gen->st_size = sym32_fix->st_size; + sym32_gen->st_info = sym32_fix->st_info; + sym32_gen->st_other = sym32_fix->st_other; + sym32_gen->st_shndx = sym32_fix->st_shndx; + + return 0; +} + + +#ifdef CONFIG_PPC64 + +static void * __init find_section64(Elf64_Ehdr *ehdr, const char *secname, + unsigned long *size) +{ + Elf64_Shdr *sechdrs; + unsigned int i; + char *secnames; + + /* Grab section headers and strings so we can tell who is who */ + sechdrs = (void *)ehdr + ehdr->e_shoff; + secnames = (void *)ehdr + sechdrs[ehdr->e_shstrndx].sh_offset; + + /* Find the section they want */ + for (i = 1; i < ehdr->e_shnum; i++) { + if (strcmp(secnames+sechdrs[i].sh_name, secname) == 0) { + if (size) + *size = sechdrs[i].sh_size; + return (void *)ehdr + sechdrs[i].sh_offset; + } + } + if (size) + *size = 0; + return NULL; +} + +static Elf64_Sym * __init find_symbol64(struct lib64_elfinfo *lib, + const char *symname) +{ + unsigned int i; + char name[MAX_SYMNAME], *c; + + for (i = 0; i < (lib->dynsymsize / sizeof(Elf64_Sym)); i++) { + if (lib->dynsym[i].st_name == 0) + continue; + strlcpy(name, lib->dynstr + lib->dynsym[i].st_name, + MAX_SYMNAME); + c = strchr(name, '@'); + if (c) + *c = 0; + if (strcmp(symname, name) == 0) + return &lib->dynsym[i]; + } + return NULL; +} + +/* Note that we assume the section is .text and the symbol is relative to + * the library base + */ +static unsigned long __init find_function64(struct lib64_elfinfo *lib, + const char *symname) +{ + Elf64_Sym *sym = find_symbol64(lib, symname); + + if (sym == NULL) { + printk(KERN_WARNING "vDSO64: function %s not found !\n", + symname); + return 0; + } +#ifdef VDS64_HAS_DESCRIPTORS + return *((u64 *)(vdso64_kbase + sym->st_value - VDSO64_LBASE)) - + VDSO64_LBASE; +#else + return sym->st_value - VDSO64_LBASE; +#endif +} + +static int vdso_do_func_patch64(struct lib32_elfinfo *v32, + struct lib64_elfinfo *v64, + const char *orig, const char *fix) +{ + Elf64_Sym *sym64_gen, *sym64_fix; + + sym64_gen = find_symbol64(v64, orig); + if (sym64_gen == NULL) { + printk(KERN_ERR "vDSO64: Can't find symbol %s !\n", orig); + return -1; + } + if (fix == NULL) { + sym64_gen->st_name = 0; + return 0; + } + sym64_fix = find_symbol64(v64, fix); + if (sym64_fix == NULL) { + printk(KERN_ERR "vDSO64: Can't find symbol %s !\n", fix); + return -1; + } + sym64_gen->st_value = sym64_fix->st_value; + sym64_gen->st_size = sym64_fix->st_size; + sym64_gen->st_info = sym64_fix->st_info; + sym64_gen->st_other = sym64_fix->st_other; + sym64_gen->st_shndx = sym64_fix->st_shndx; + + return 0; +} + +#endif /* CONFIG_PPC64 */ + + +static __init int vdso_do_find_sections(struct lib32_elfinfo *v32, + struct lib64_elfinfo *v64) +{ + void *sect; + + /* + * Locate symbol tables & text section + */ + + v32->dynsym = find_section32(v32->hdr, ".dynsym", &v32->dynsymsize); + v32->dynstr = find_section32(v32->hdr, ".dynstr", NULL); + if (v32->dynsym == NULL || v32->dynstr == NULL) { + printk(KERN_ERR "vDSO32: required symbol section not found\n"); + return -1; + } + sect = find_section32(v32->hdr, ".text", NULL); + if (sect == NULL) { + printk(KERN_ERR "vDSO32: the .text section was not found\n"); + return -1; + } + v32->text = sect - vdso32_kbase; + +#ifdef CONFIG_PPC64 + v64->dynsym = find_section64(v64->hdr, ".dynsym", &v64->dynsymsize); + v64->dynstr = find_section64(v64->hdr, ".dynstr", NULL); + if (v64->dynsym == NULL || v64->dynstr == NULL) { + printk(KERN_ERR "vDSO64: required symbol section not found\n"); + return -1; + } + sect = find_section64(v64->hdr, ".text", NULL); + if (sect == NULL) { + printk(KERN_ERR "vDSO64: the .text section was not found\n"); + return -1; + } + v64->text = sect - vdso64_kbase; +#endif /* CONFIG_PPC64 */ + + return 0; +} + +static __init void vdso_setup_trampolines(struct lib32_elfinfo *v32, + struct lib64_elfinfo *v64) +{ + /* + * Find signal trampolines + */ + +#ifdef CONFIG_PPC64 + vdso64_rt_sigtramp = find_function64(v64, "__kernel_sigtramp_rt64"); +#endif + vdso32_sigtramp = find_function32(v32, "__kernel_sigtramp32"); + vdso32_rt_sigtramp = find_function32(v32, "__kernel_sigtramp_rt32"); +} + +static __init int vdso_fixup_datapage(struct lib32_elfinfo *v32, + struct lib64_elfinfo *v64) +{ + Elf32_Sym *sym32; +#ifdef CONFIG_PPC64 + Elf64_Sym *sym64; + + sym64 = find_symbol64(v64, "__kernel_datapage_offset"); + if (sym64 == NULL) { + printk(KERN_ERR "vDSO64: Can't find symbol " + "__kernel_datapage_offset !\n"); + return -1; + } + *((int *)(vdso64_kbase + sym64->st_value - VDSO64_LBASE)) = + (vdso64_pages << PAGE_SHIFT) - + (sym64->st_value - VDSO64_LBASE); +#endif /* CONFIG_PPC64 */ + + sym32 = find_symbol32(v32, "__kernel_datapage_offset"); + if (sym32 == NULL) { + printk(KERN_ERR "vDSO32: Can't find symbol " + "__kernel_datapage_offset !\n"); + return -1; + } + *((int *)(vdso32_kbase + (sym32->st_value - VDSO32_LBASE))) = + (vdso32_pages << PAGE_SHIFT) - + (sym32->st_value - VDSO32_LBASE); + + return 0; +} + +static __init int vdso_fixup_alt_funcs(struct lib32_elfinfo *v32, + struct lib64_elfinfo *v64) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(vdso_patches); i++) { + struct vdso_patch_def *patch = &vdso_patches[i]; + int match = (cur_cpu_spec->cpu_features & patch->ftr_mask) + == patch->ftr_value; + if (!match) + continue; + + DBG("replacing %s with %s...\n", patch->gen_name, + patch->fix_name ? "NONE" : patch->fix_name); + + /* + * Patch the 32 bits and 64 bits symbols. Note that we do not + * patch the "." symbol on 64 bits. + * It would be easy to do, but doesn't seem to be necessary, + * patching the OPD symbol is enough. + */ + vdso_do_func_patch32(v32, v64, patch->gen_name, + patch->fix_name); +#ifdef CONFIG_PPC64 + vdso_do_func_patch64(v32, v64, patch->gen_name, + patch->fix_name); +#endif /* CONFIG_PPC64 */ + } + + return 0; +} + + +static __init int vdso_setup(void) +{ + struct lib32_elfinfo v32; + struct lib64_elfinfo v64; + + v32.hdr = vdso32_kbase; +#ifdef CONFIG_PPC64 + v64.hdr = vdso64_kbase; +#endif + if (vdso_do_find_sections(&v32, &v64)) + return -1; + + if (vdso_fixup_datapage(&v32, &v64)) + return -1; + + if (vdso_fixup_alt_funcs(&v32, &v64)) + return -1; + + vdso_setup_trampolines(&v32, &v64); + + return 0; +} + +/* + * Called from setup_arch to initialize the bitmap of available + * syscalls in the systemcfg page + */ +static void __init vdso_setup_syscall_map(void) +{ + unsigned int i; + extern unsigned long *sys_call_table; + extern unsigned long sys_ni_syscall; + + + for (i = 0; i < __NR_syscalls; i++) { +#ifdef CONFIG_PPC64 + if (sys_call_table[i*2] != sys_ni_syscall) + vdso_data->syscall_map_64[i >> 5] |= + 0x80000000UL >> (i & 0x1f); + if (sys_call_table[i*2+1] != sys_ni_syscall) + vdso_data->syscall_map_32[i >> 5] |= + 0x80000000UL >> (i & 0x1f); +#else /* CONFIG_PPC64 */ + if (sys_call_table[i] != sys_ni_syscall) + vdso_data->syscall_map_32[i >> 5] |= + 0x80000000UL >> (i & 0x1f); +#endif /* CONFIG_PPC64 */ + } +} + + +void __init vdso_init(void) +{ + int i; + +#ifdef CONFIG_PPC64 + /* + * Fill up the "systemcfg" stuff for backward compatiblity + */ + strcpy(vdso_data->eye_catcher, "SYSTEMCFG:PPC64"); + vdso_data->version.major = SYSTEMCFG_MAJOR; + vdso_data->version.minor = SYSTEMCFG_MINOR; + vdso_data->processor = mfspr(SPRN_PVR); + vdso_data->platform = _machine; + vdso_data->physicalMemorySize = lmb_phys_mem_size(); + vdso_data->dcache_size = ppc64_caches.dsize; + vdso_data->dcache_line_size = ppc64_caches.dline_size; + vdso_data->icache_size = ppc64_caches.isize; + vdso_data->icache_line_size = ppc64_caches.iline_size; + + /* + * Calculate the size of the 64 bits vDSO + */ + vdso64_pages = (&vdso64_end - &vdso64_start) >> PAGE_SHIFT; + DBG("vdso64_kbase: %p, 0x%x pages\n", vdso64_kbase, vdso64_pages); +#endif /* CONFIG_PPC64 */ + + + /* + * Calculate the size of the 32 bits vDSO + */ + vdso32_pages = (&vdso32_end - &vdso32_start) >> PAGE_SHIFT; + DBG("vdso32_kbase: %p, 0x%x pages\n", vdso32_kbase, vdso32_pages); + + + /* + * Setup the syscall map in the vDOS + */ + vdso_setup_syscall_map(); + /* + * Initialize the vDSO images in memory, that is do necessary + * fixups of vDSO symbols, locate trampolines, etc... + */ + if (vdso_setup()) { + printk(KERN_ERR "vDSO setup failure, not enabled !\n"); + vdso32_pages = 0; +#ifdef CONFIG_PPC64 + vdso64_pages = 0; +#endif + return; + } + + /* Make sure pages are in the correct state */ + for (i = 0; i < vdso32_pages; i++) { + struct page *pg = virt_to_page(vdso32_kbase + i*PAGE_SIZE); + ClearPageReserved(pg); + get_page(pg); + + } +#ifdef CONFIG_PPC64 + for (i = 0; i < vdso64_pages; i++) { + struct page *pg = virt_to_page(vdso64_kbase + i*PAGE_SIZE); + ClearPageReserved(pg); + get_page(pg); + } +#endif /* CONFIG_PPC64 */ + + get_page(virt_to_page(vdso_data)); +} + +int in_gate_area_no_task(unsigned long addr) +{ + return 0; +} + +int in_gate_area(struct task_struct *task, unsigned long addr) +{ + return 0; +} + +struct vm_area_struct *get_gate_vma(struct task_struct *tsk) +{ + return NULL; +} + diff --git a/arch/powerpc/kernel/vdso32/Makefile b/arch/powerpc/kernel/vdso32/Makefile new file mode 100644 index 00000000000..8a3bed5f143 --- /dev/null +++ b/arch/powerpc/kernel/vdso32/Makefile @@ -0,0 +1,40 @@ + +# List of files in the vdso, has to be asm only for now + +obj-vdso32 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o + +# Build rules + +ifeq ($(CONFIG_PPC32),y) +CROSS32CC := $(CC) +endif + +targets := $(obj-vdso32) vdso32.so +obj-vdso32 := $(addprefix $(obj)/, $(obj-vdso32)) + + +EXTRA_CFLAGS := -shared -s -fno-common -fno-builtin +EXTRA_CFLAGS += -nostdlib -Wl,-soname=linux-vdso32.so.1 +EXTRA_AFLAGS := -D__VDSO32__ -s + +obj-y += vdso32_wrapper.o +extra-y += vdso32.lds +CPPFLAGS_vdso32.lds += -P -C -Upowerpc + +# Force dependency (incbin is bad) +$(obj)/vdso32_wrapper.o : $(obj)/vdso32.so + +# link rule for the .so file, .lds has to be first +$(obj)/vdso32.so: $(src)/vdso32.lds $(obj-vdso32) + $(call if_changed,vdso32ld) + +# assembly rules for the .S files +$(obj-vdso32): %.o: %.S + $(call if_changed_dep,vdso32as) + +# actual build commands +quiet_cmd_vdso32ld = VDSO32L $@ + cmd_vdso32ld = $(CROSS32CC) $(c_flags) -Wl,-T $^ -o $@ +quiet_cmd_vdso32as = VDSO32A $@ + cmd_vdso32as = $(CROSS32CC) $(a_flags) -c -o $@ $< + diff --git a/arch/powerpc/kernel/vdso32/cacheflush.S b/arch/powerpc/kernel/vdso32/cacheflush.S new file mode 100644 index 00000000000..c8db993574e --- /dev/null +++ b/arch/powerpc/kernel/vdso32/cacheflush.S @@ -0,0 +1,67 @@ +/* + * vDSO provided cache flush routines + * + * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org), + * IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/config.h> +#include <asm/processor.h> +#include <asm/ppc_asm.h> +#include <asm/vdso.h> +#include <asm/asm-offsets.h> + + .text + +/* + * Default "generic" version of __kernel_sync_dicache. + * + * void __kernel_sync_dicache(unsigned long start, unsigned long end) + * + * Flushes the data cache & invalidate the instruction cache for the + * provided range [start, end[ + * + * Note: all CPUs supported by this kernel have a 128 bytes cache + * line size so we don't have to peek that info from the datapage + */ +V_FUNCTION_BEGIN(__kernel_sync_dicache) + .cfi_startproc + li r5,127 + andc r6,r3,r5 /* round low to line bdy */ + subf r8,r6,r4 /* compute length */ + add r8,r8,r5 /* ensure we get enough */ + srwi. r8,r8,7 /* compute line count */ + beqlr /* nothing to do? */ + mtctr r8 + mr r3,r6 +1: dcbst 0,r3 + addi r3,r3,128 + bdnz 1b + sync + mtctr r8 +1: icbi 0,r6 + addi r6,r6,128 + bdnz 1b + isync + li r3,0 + blr + .cfi_endproc +V_FUNCTION_END(__kernel_sync_dicache) + + +/* + * POWER5 version of __kernel_sync_dicache + */ +V_FUNCTION_BEGIN(__kernel_sync_dicache_p5) + .cfi_startproc + sync + isync + li r3,0 + blr + .cfi_endproc +V_FUNCTION_END(__kernel_sync_dicache_p5) + diff --git a/arch/powerpc/kernel/vdso32/datapage.S b/arch/powerpc/kernel/vdso32/datapage.S new file mode 100644 index 00000000000..f6b38472318 --- /dev/null +++ b/arch/powerpc/kernel/vdso32/datapage.S @@ -0,0 +1,85 @@ +/* + * Access to the shared data page by the vDSO & syscall map + * + * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org), IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <asm/processor.h> +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> +#include <asm/unistd.h> +#include <asm/vdso.h> + + .text +V_FUNCTION_BEGIN(__get_datapage) + .cfi_startproc + /* We don't want that exposed or overridable as we want other objects + * to be able to bl directly to here + */ + .protected __get_datapage + .hidden __get_datapage + + mflr r0 + .cfi_register lr,r0 + + bcl 20,31,1f + .global __kernel_datapage_offset; +__kernel_datapage_offset: + .long 0 +1: + mflr r3 + mtlr r0 + lwz r0,0(r3) + add r3,r0,r3 + blr + .cfi_endproc +V_FUNCTION_END(__get_datapage) + +/* + * void *__kernel_get_syscall_map(unsigned int *syscall_count) ; + * + * returns a pointer to the syscall map. the map is agnostic to the + * size of "long", unlike kernel bitops, it stores bits from top to + * bottom so that memory actually contains a linear bitmap + * check for syscall N by testing bit (0x80000000 >> (N & 0x1f)) of + * 32 bits int at N >> 5. + */ +V_FUNCTION_BEGIN(__kernel_get_syscall_map) + .cfi_startproc + mflr r12 + .cfi_register lr,r12 + + mr r4,r3 + bl __get_datapage@local + mtlr r12 + addi r3,r3,CFG_SYSCALL_MAP32 + cmpli cr0,r4,0 + beqlr + li r0,__NR_syscalls + stw r0,0(r4) + blr + .cfi_endproc +V_FUNCTION_END(__kernel_get_syscall_map) + +/* + * void unsigned long long __kernel_get_tbfreq(void); + * + * returns the timebase frequency in HZ + */ +V_FUNCTION_BEGIN(__kernel_get_tbfreq) + .cfi_startproc + mflr r12 + .cfi_register lr,r12 + bl __get_datapage@local + lwz r4,(CFG_TB_TICKS_PER_SEC + 4)(r3) + lwz r3,CFG_TB_TICKS_PER_SEC(r3) + mtlr r12 + blr + .cfi_endproc +V_FUNCTION_END(__kernel_get_tbfreq) diff --git a/arch/powerpc/kernel/vdso32/gettimeofday.S b/arch/powerpc/kernel/vdso32/gettimeofday.S new file mode 100644 index 00000000000..0a32a41d50b --- /dev/null +++ b/arch/powerpc/kernel/vdso32/gettimeofday.S @@ -0,0 +1,319 @@ +/* + * Userland implementation of gettimeofday() for 32 bits processes in a + * ppc64 kernel for use in the vDSO + * + * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org, + * IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/config.h> +#include <asm/processor.h> +#include <asm/ppc_asm.h> +#include <asm/vdso.h> +#include <asm/asm-offsets.h> +#include <asm/unistd.h> + + .text +/* + * Exact prototype of gettimeofday + * + * int __kernel_gettimeofday(struct timeval *tv, struct timezone *tz); + * + */ +V_FUNCTION_BEGIN(__kernel_gettimeofday) + .cfi_startproc + mflr r12 + .cfi_register lr,r12 + + mr r10,r3 /* r10 saves tv */ + mr r11,r4 /* r11 saves tz */ + bl __get_datapage@local /* get data page */ + mr r9, r3 /* datapage ptr in r9 */ + bl __do_get_xsec@local /* get xsec from tb & kernel */ + bne- 2f /* out of line -> do syscall */ + + /* seconds are xsec >> 20 */ + rlwinm r5,r4,12,20,31 + rlwimi r5,r3,12,0,19 + stw r5,TVAL32_TV_SEC(r10) + + /* get remaining xsec and convert to usec. we scale + * up remaining xsec by 12 bits and get the top 32 bits + * of the multiplication + */ + rlwinm r5,r4,12,0,19 + lis r6,1000000@h + ori r6,r6,1000000@l + mulhwu r5,r5,r6 + stw r5,TVAL32_TV_USEC(r10) + + cmpli cr0,r11,0 /* check if tz is NULL */ + beq 1f + lwz r4,CFG_TZ_MINUTEWEST(r9)/* fill tz */ + lwz r5,CFG_TZ_DSTTIME(r9) + stw r4,TZONE_TZ_MINWEST(r11) + stw r5,TZONE_TZ_DSTTIME(r11) + +1: mtlr r12 + li r3,0 + blr + +2: + mtlr r12 + mr r3,r10 + mr r4,r11 + li r0,__NR_gettimeofday + sc + blr + .cfi_endproc +V_FUNCTION_END(__kernel_gettimeofday) + +/* + * Exact prototype of clock_gettime() + * + * int __kernel_clock_gettime(clockid_t clock_id, struct timespec *tp); + * + */ +V_FUNCTION_BEGIN(__kernel_clock_gettime) + .cfi_startproc + /* Check for supported clock IDs */ + cmpli cr0,r3,CLOCK_REALTIME + cmpli cr1,r3,CLOCK_MONOTONIC + cror cr0*4+eq,cr0*4+eq,cr1*4+eq + bne cr0,99f + + mflr r12 /* r12 saves lr */ + .cfi_register lr,r12 + mr r10,r3 /* r10 saves id */ + mr r11,r4 /* r11 saves tp */ + bl __get_datapage@local /* get data page */ + mr r9,r3 /* datapage ptr in r9 */ + beq cr1,50f /* if monotonic -> jump there */ + + /* + * CLOCK_REALTIME + */ + + bl __do_get_xsec@local /* get xsec from tb & kernel */ + bne- 98f /* out of line -> do syscall */ + + /* seconds are xsec >> 20 */ + rlwinm r5,r4,12,20,31 + rlwimi r5,r3,12,0,19 + stw r5,TSPC32_TV_SEC(r11) + + /* get remaining xsec and convert to nsec. we scale + * up remaining xsec by 12 bits and get the top 32 bits + * of the multiplication, then we multiply by 1000 + */ + rlwinm r5,r4,12,0,19 + lis r6,1000000@h + ori r6,r6,1000000@l + mulhwu r5,r5,r6 + mulli r5,r5,1000 + stw r5,TSPC32_TV_NSEC(r11) + mtlr r12 + li r3,0 + blr + + /* + * CLOCK_MONOTONIC + */ + +50: bl __do_get_xsec@local /* get xsec from tb & kernel */ + bne- 98f /* out of line -> do syscall */ + + /* seconds are xsec >> 20 */ + rlwinm r6,r4,12,20,31 + rlwimi r6,r3,12,0,19 + + /* get remaining xsec and convert to nsec. we scale + * up remaining xsec by 12 bits and get the top 32 bits + * of the multiplication, then we multiply by 1000 + */ + rlwinm r7,r4,12,0,19 + lis r5,1000000@h + ori r5,r5,1000000@l + mulhwu r7,r7,r5 + mulli r7,r7,1000 + + /* now we must fixup using wall to monotonic. We need to snapshot + * that value and do the counter trick again. Fortunately, we still + * have the counter value in r8 that was returned by __do_get_xsec. + * At this point, r6,r7 contain our sec/nsec values, r3,r4 and r5 + * can be used + */ + + lwz r3,WTOM_CLOCK_SEC(r9) + lwz r4,WTOM_CLOCK_NSEC(r9) + + /* We now have our result in r3,r4. We create a fake dependency + * on that result and re-check the counter + */ + or r5,r4,r3 + xor r0,r5,r5 + add r9,r9,r0 +#ifdef CONFIG_PPC64 + lwz r0,(CFG_TB_UPDATE_COUNT+4)(r9) +#else + lwz r0,(CFG_TB_UPDATE_COUNT)(r9) +#endif + cmpl cr0,r8,r0 /* check if updated */ + bne- 50b + + /* Calculate and store result. Note that this mimmics the C code, + * which may cause funny results if nsec goes negative... is that + * possible at all ? + */ + add r3,r3,r6 + add r4,r4,r7 + lis r5,NSEC_PER_SEC@h + ori r5,r5,NSEC_PER_SEC@l + cmpl cr0,r4,r5 + cmpli cr1,r4,0 + blt 1f + subf r4,r5,r4 + addi r3,r3,1 +1: bge cr1,1f + addi r3,r3,-1 + add r4,r4,r5 +1: stw r3,TSPC32_TV_SEC(r11) + stw r4,TSPC32_TV_NSEC(r11) + + mtlr r12 + li r3,0 + blr + + /* + * syscall fallback + */ +98: + mtlr r12 + mr r3,r10 + mr r4,r11 +99: + li r0,__NR_clock_gettime + sc + blr + .cfi_endproc +V_FUNCTION_END(__kernel_clock_gettime) + + +/* + * Exact prototype of clock_getres() + * + * int __kernel_clock_getres(clockid_t clock_id, struct timespec *res); + * + */ +V_FUNCTION_BEGIN(__kernel_clock_getres) + .cfi_startproc + /* Check for supported clock IDs */ + cmpwi cr0,r3,CLOCK_REALTIME + cmpwi cr1,r3,CLOCK_MONOTONIC + cror cr0*4+eq,cr0*4+eq,cr1*4+eq + bne cr0,99f + + li r3,0 + cmpli cr0,r4,0 + beqlr + lis r5,CLOCK_REALTIME_RES@h + ori r5,r5,CLOCK_REALTIME_RES@l + stw r3,TSPC32_TV_SEC(r4) + stw r5,TSPC32_TV_NSEC(r4) + blr + + /* + * syscall fallback + */ +99: + li r0,__NR_clock_getres + sc + blr + .cfi_endproc +V_FUNCTION_END(__kernel_clock_getres) + + +/* + * This is the core of gettimeofday() & friends, it returns the xsec + * value in r3 & r4 and expects the datapage ptr (non clobbered) + * in r9. clobbers r0,r4,r5,r6,r7,r8. + * When returning, r8 contains the counter value that can be reused + * by the monotonic clock implementation + */ +__do_get_xsec: + .cfi_startproc + /* Check for update count & load values. We use the low + * order 32 bits of the update count + */ +#ifdef CONFIG_PPC64 +1: lwz r8,(CFG_TB_UPDATE_COUNT+4)(r9) +#else +1: lwz r8,(CFG_TB_UPDATE_COUNT)(r9) +#endif + andi. r0,r8,1 /* pending update ? loop */ + bne- 1b + xor r0,r8,r8 /* create dependency */ + add r9,r9,r0 + + /* Load orig stamp (offset to TB) */ + lwz r5,CFG_TB_ORIG_STAMP(r9) + lwz r6,(CFG_TB_ORIG_STAMP+4)(r9) + + /* Get a stable TB value */ +2: mftbu r3 + mftbl r4 + mftbu r0 + cmpl cr0,r3,r0 + bne- 2b + + /* Substract tb orig stamp. If the high part is non-zero, we jump to + * the slow path which call the syscall. + * If it's ok, then we have our 32 bits tb_ticks value in r7 + */ + subfc r7,r6,r4 + subfe. r0,r5,r3 + bne- 3f + + /* Load scale factor & do multiplication */ + lwz r5,CFG_TB_TO_XS(r9) /* load values */ + lwz r6,(CFG_TB_TO_XS+4)(r9) + mulhwu r4,r7,r5 + mulhwu r6,r7,r6 + mullw r0,r7,r5 + addc r6,r6,r0 + + /* At this point, we have the scaled xsec value in r4 + XER:CA + * we load & add the stamp since epoch + */ + lwz r5,CFG_STAMP_XSEC(r9) + lwz r6,(CFG_STAMP_XSEC+4)(r9) + adde r4,r4,r6 + addze r3,r5 + + /* We now have our result in r3,r4. We create a fake dependency + * on that result and re-check the counter + */ + or r6,r4,r3 + xor r0,r6,r6 + add r9,r9,r0 +#ifdef CONFIG_PPC64 + lwz r0,(CFG_TB_UPDATE_COUNT+4)(r9) +#else + lwz r0,(CFG_TB_UPDATE_COUNT)(r9) +#endif + cmpl cr0,r8,r0 /* check if updated */ + bne- 1b + + /* Warning ! The caller expects CR:EQ to be set to indicate a + * successful calculation (so it won't fallback to the syscall + * method). We have overriden that CR bit in the counter check, + * but fortunately, the loop exit condition _is_ CR:EQ set, so + * we can exit safely here. If you change this code, be careful + * of that side effect. + */ +3: blr + .cfi_endproc diff --git a/arch/powerpc/kernel/vdso32/note.S b/arch/powerpc/kernel/vdso32/note.S new file mode 100644 index 00000000000..d4b5be4f3d5 --- /dev/null +++ b/arch/powerpc/kernel/vdso32/note.S @@ -0,0 +1,25 @@ +/* + * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. + * Here we can supply some information useful to userland. + */ + +#include <linux/uts.h> +#include <linux/version.h> + +#define ASM_ELF_NOTE_BEGIN(name, flags, vendor, type) \ + .section name, flags; \ + .balign 4; \ + .long 1f - 0f; /* name length */ \ + .long 3f - 2f; /* data length */ \ + .long type; /* note type */ \ +0: .asciz vendor; /* vendor name */ \ +1: .balign 4; \ +2: + +#define ASM_ELF_NOTE_END \ +3: .balign 4; /* pad out section */ \ + .previous + + ASM_ELF_NOTE_BEGIN(".note.kernel-version", "a", UTS_SYSNAME, 0) + .long LINUX_VERSION_CODE + ASM_ELF_NOTE_END diff --git a/arch/powerpc/kernel/vdso32/sigtramp.S b/arch/powerpc/kernel/vdso32/sigtramp.S new file mode 100644 index 00000000000..e0464278191 --- /dev/null +++ b/arch/powerpc/kernel/vdso32/sigtramp.S @@ -0,0 +1,300 @@ +/* + * Signal trampolines for 32 bits processes in a ppc64 kernel for + * use in the vDSO + * + * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org), IBM Corp. + * Copyright (C) 2004 Alan Modra (amodra@au.ibm.com)), IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/config.h> +#include <asm/processor.h> +#include <asm/ppc_asm.h> +#include <asm/unistd.h> +#include <asm/vdso.h> + + .text + +/* The nop here is a hack. The dwarf2 unwind routines subtract 1 from + the return address to get an address in the middle of the presumed + call instruction. Since we don't have a call here, we artifically + extend the range covered by the unwind info by adding a nop before + the real start. */ + nop +V_FUNCTION_BEGIN(__kernel_sigtramp32) +.Lsig_start = . - 4 + li r0,__NR_sigreturn + sc +.Lsig_end: +V_FUNCTION_END(__kernel_sigtramp32) + +.Lsigrt_start: + nop +V_FUNCTION_BEGIN(__kernel_sigtramp_rt32) + li r0,__NR_rt_sigreturn + sc +.Lsigrt_end: +V_FUNCTION_END(__kernel_sigtramp_rt32) + + .section .eh_frame,"a",@progbits + +/* Register r1 can be found at offset 4 of a pt_regs structure. + A pointer to the pt_regs is stored in memory at the old sp plus PTREGS. */ +#define cfa_save \ + .byte 0x0f; /* DW_CFA_def_cfa_expression */ \ + .uleb128 9f - 1f; /* length */ \ +1: \ + .byte 0x71; .sleb128 PTREGS; /* DW_OP_breg1 */ \ + .byte 0x06; /* DW_OP_deref */ \ + .byte 0x23; .uleb128 RSIZE; /* DW_OP_plus_uconst */ \ + .byte 0x06; /* DW_OP_deref */ \ +9: + +/* Register REGNO can be found at offset OFS of a pt_regs structure. + A pointer to the pt_regs is stored in memory at the old sp plus PTREGS. */ +#define rsave(regno, ofs) \ + .byte 0x10; /* DW_CFA_expression */ \ + .uleb128 regno; /* regno */ \ + .uleb128 9f - 1f; /* length */ \ +1: \ + .byte 0x71; .sleb128 PTREGS; /* DW_OP_breg1 */ \ + .byte 0x06; /* DW_OP_deref */ \ + .ifne ofs; \ + .byte 0x23; .uleb128 ofs; /* DW_OP_plus_uconst */ \ + .endif; \ +9: + +/* If msr bit 1<<25 is set, then VMX register REGNO is at offset REGNO*16 + of the VMX reg struct. The VMX reg struct is at offset VREGS of + the pt_regs struct. This macro is for REGNO == 0, and contains + 'subroutines' that the other macros jump to. */ +#define vsave_msr0(regno) \ + .byte 0x10; /* DW_CFA_expression */ \ + .uleb128 regno + 77; /* regno */ \ + .uleb128 9f - 1f; /* length */ \ +1: \ + .byte 0x30 + regno; /* DW_OP_lit0 */ \ +2: \ + .byte 0x40; /* DW_OP_lit16 */ \ + .byte 0x1e; /* DW_OP_mul */ \ +3: \ + .byte 0x71; .sleb128 PTREGS; /* DW_OP_breg1 */ \ + .byte 0x06; /* DW_OP_deref */ \ + .byte 0x12; /* DW_OP_dup */ \ + .byte 0x23; /* DW_OP_plus_uconst */ \ + .uleb128 33*RSIZE; /* msr offset */ \ + .byte 0x06; /* DW_OP_deref */ \ + .byte 0x0c; .long 1 << 25; /* DW_OP_const4u */ \ + .byte 0x1a; /* DW_OP_and */ \ + .byte 0x12; /* DW_OP_dup, ret 0 if bra taken */ \ + .byte 0x30; /* DW_OP_lit0 */ \ + .byte 0x29; /* DW_OP_eq */ \ + .byte 0x28; .short 0x7fff; /* DW_OP_bra to end */ \ + .byte 0x13; /* DW_OP_drop, pop the 0 */ \ + .byte 0x23; .uleb128 VREGS; /* DW_OP_plus_uconst */ \ + .byte 0x22; /* DW_OP_plus */ \ + .byte 0x2f; .short 0x7fff; /* DW_OP_skip to end */ \ +9: + +/* If msr bit 1<<25 is set, then VMX register REGNO is at offset REGNO*16 + of the VMX reg struct. REGNO is 1 thru 31. */ +#define vsave_msr1(regno) \ + .byte 0x10; /* DW_CFA_expression */ \ + .uleb128 regno + 77; /* regno */ \ + .uleb128 9f - 1f; /* length */ \ +1: \ + .byte 0x30 + regno; /* DW_OP_lit n */ \ + .byte 0x2f; .short 2b - 9f; /* DW_OP_skip */ \ +9: + +/* If msr bit 1<<25 is set, then VMX register REGNO is at offset OFS of + the VMX save block. */ +#define vsave_msr2(regno, ofs) \ + .byte 0x10; /* DW_CFA_expression */ \ + .uleb128 regno + 77; /* regno */ \ + .uleb128 9f - 1f; /* length */ \ +1: \ + .byte 0x0a; .short ofs; /* DW_OP_const2u */ \ + .byte 0x2f; .short 3b - 9f; /* DW_OP_skip */ \ +9: + +/* VMX register REGNO is at offset OFS of the VMX save area. */ +#define vsave(regno, ofs) \ + .byte 0x10; /* DW_CFA_expression */ \ + .uleb128 regno + 77; /* regno */ \ + .uleb128 9f - 1f; /* length */ \ +1: \ + .byte 0x71; .sleb128 PTREGS; /* DW_OP_breg1 */ \ + .byte 0x06; /* DW_OP_deref */ \ + .byte 0x23; .uleb128 VREGS; /* DW_OP_plus_uconst */ \ + .byte 0x23; .uleb128 ofs; /* DW_OP_plus_uconst */ \ +9: + +/* This is where the pt_regs pointer can be found on the stack. */ +#define PTREGS 64+28 + +/* Size of regs. */ +#define RSIZE 4 + +/* This is the offset of the VMX regs. */ +#define VREGS 48*RSIZE+34*8 + +/* Describe where general purpose regs are saved. */ +#define EH_FRAME_GEN \ + cfa_save; \ + rsave ( 0, 0*RSIZE); \ + rsave ( 2, 2*RSIZE); \ + rsave ( 3, 3*RSIZE); \ + rsave ( 4, 4*RSIZE); \ + rsave ( 5, 5*RSIZE); \ + rsave ( 6, 6*RSIZE); \ + rsave ( 7, 7*RSIZE); \ + rsave ( 8, 8*RSIZE); \ + rsave ( 9, 9*RSIZE); \ + rsave (10, 10*RSIZE); \ + rsave (11, 11*RSIZE); \ + rsave (12, 12*RSIZE); \ + rsave (13, 13*RSIZE); \ + rsave (14, 14*RSIZE); \ + rsave (15, 15*RSIZE); \ + rsave (16, 16*RSIZE); \ + rsave (17, 17*RSIZE); \ + rsave (18, 18*RSIZE); \ + rsave (19, 19*RSIZE); \ + rsave (20, 20*RSIZE); \ + rsave (21, 21*RSIZE); \ + rsave (22, 22*RSIZE); \ + rsave (23, 23*RSIZE); \ + rsave (24, 24*RSIZE); \ + rsave (25, 25*RSIZE); \ + rsave (26, 26*RSIZE); \ + rsave (27, 27*RSIZE); \ + rsave (28, 28*RSIZE); \ + rsave (29, 29*RSIZE); \ + rsave (30, 30*RSIZE); \ + rsave (31, 31*RSIZE); \ + rsave (67, 32*RSIZE); /* ap, used as temp for nip */ \ + rsave (65, 36*RSIZE); /* lr */ \ + rsave (70, 38*RSIZE) /* cr */ + +/* Describe where the FP regs are saved. */ +#define EH_FRAME_FP \ + rsave (32, 48*RSIZE + 0*8); \ + rsave (33, 48*RSIZE + 1*8); \ + rsave (34, 48*RSIZE + 2*8); \ + rsave (35, 48*RSIZE + 3*8); \ + rsave (36, 48*RSIZE + 4*8); \ + rsave (37, 48*RSIZE + 5*8); \ + rsave (38, 48*RSIZE + 6*8); \ + rsave (39, 48*RSIZE + 7*8); \ + rsave (40, 48*RSIZE + 8*8); \ + rsave (41, 48*RSIZE + 9*8); \ + rsave (42, 48*RSIZE + 10*8); \ + rsave (43, 48*RSIZE + 11*8); \ + rsave (44, 48*RSIZE + 12*8); \ + rsave (45, 48*RSIZE + 13*8); \ + rsave (46, 48*RSIZE + 14*8); \ + rsave (47, 48*RSIZE + 15*8); \ + rsave (48, 48*RSIZE + 16*8); \ + rsave (49, 48*RSIZE + 17*8); \ + rsave (50, 48*RSIZE + 18*8); \ + rsave (51, 48*RSIZE + 19*8); \ + rsave (52, 48*RSIZE + 20*8); \ + rsave (53, 48*RSIZE + 21*8); \ + rsave (54, 48*RSIZE + 22*8); \ + rsave (55, 48*RSIZE + 23*8); \ + rsave (56, 48*RSIZE + 24*8); \ + rsave (57, 48*RSIZE + 25*8); \ + rsave (58, 48*RSIZE + 26*8); \ + rsave (59, 48*RSIZE + 27*8); \ + rsave (60, 48*RSIZE + 28*8); \ + rsave (61, 48*RSIZE + 29*8); \ + rsave (62, 48*RSIZE + 30*8); \ + rsave (63, 48*RSIZE + 31*8) + +/* Describe where the VMX regs are saved. */ +#ifdef CONFIG_ALTIVEC +#define EH_FRAME_VMX \ + vsave_msr0 ( 0); \ + vsave_msr1 ( 1); \ + vsave_msr1 ( 2); \ + vsave_msr1 ( 3); \ + vsave_msr1 ( 4); \ + vsave_msr1 ( 5); \ + vsave_msr1 ( 6); \ + vsave_msr1 ( 7); \ + vsave_msr1 ( 8); \ + vsave_msr1 ( 9); \ + vsave_msr1 (10); \ + vsave_msr1 (11); \ + vsave_msr1 (12); \ + vsave_msr1 (13); \ + vsave_msr1 (14); \ + vsave_msr1 (15); \ + vsave_msr1 (16); \ + vsave_msr1 (17); \ + vsave_msr1 (18); \ + vsave_msr1 (19); \ + vsave_msr1 (20); \ + vsave_msr1 (21); \ + vsave_msr1 (22); \ + vsave_msr1 (23); \ + vsave_msr1 (24); \ + vsave_msr1 (25); \ + vsave_msr1 (26); \ + vsave_msr1 (27); \ + vsave_msr1 (28); \ + vsave_msr1 (29); \ + vsave_msr1 (30); \ + vsave_msr1 (31); \ + vsave_msr2 (33, 32*16+12); \ + vsave (32, 32*16) +#else +#define EH_FRAME_VMX +#endif + +.Lcie: + .long .Lcie_end - .Lcie_start +.Lcie_start: + .long 0 /* CIE ID */ + .byte 1 /* Version number */ + .string "zR" /* NUL-terminated augmentation string */ + .uleb128 4 /* Code alignment factor */ + .sleb128 -4 /* Data alignment factor */ + .byte 67 /* Return address register column, ap */ + .uleb128 1 /* Augmentation value length */ + .byte 0x1b /* DW_EH_PE_pcrel | DW_EH_PE_sdata4. */ + .byte 0x0c,1,0 /* DW_CFA_def_cfa: r1 ofs 0 */ + .balign 4 +.Lcie_end: + + .long .Lfde0_end - .Lfde0_start +.Lfde0_start: + .long .Lfde0_start - .Lcie /* CIE pointer. */ + .long .Lsig_start - . /* PC start, length */ + .long .Lsig_end - .Lsig_start + .uleb128 0 /* Augmentation */ + EH_FRAME_GEN + EH_FRAME_FP + EH_FRAME_VMX + .balign 4 +.Lfde0_end: + +/* We have a different stack layout for rt_sigreturn. */ +#undef PTREGS +#define PTREGS 64+16+128+20+28 + + .long .Lfde1_end - .Lfde1_start +.Lfde1_start: + .long .Lfde1_start - .Lcie /* CIE pointer. */ + .long .Lsigrt_start - . /* PC start, length */ + .long .Lsigrt_end - .Lsigrt_start + .uleb128 0 /* Augmentation */ + EH_FRAME_GEN + EH_FRAME_FP + EH_FRAME_VMX + .balign 4 +.Lfde1_end: diff --git a/arch/powerpc/kernel/vdso32/vdso32.lds.S b/arch/powerpc/kernel/vdso32/vdso32.lds.S new file mode 100644 index 00000000000..f4bad720cb0 --- /dev/null +++ b/arch/powerpc/kernel/vdso32/vdso32.lds.S @@ -0,0 +1,117 @@ + +/* + * This is the infamous ld script for the 32 bits vdso + * library + */ +#include <asm/vdso.h> + +/* Default link addresses for the vDSOs */ +OUTPUT_FORMAT("elf32-powerpc", "elf32-powerpc", "elf32-powerpc") +OUTPUT_ARCH(powerpc:common) +ENTRY(_start) + +SECTIONS +{ + . = VDSO32_LBASE + SIZEOF_HEADERS; + .hash : { *(.hash) } :text + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .gnu.version : { *(.gnu.version) } + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + + .note : { *(.note.*) } :text :note + + . = ALIGN (16); + .text : + { + *(.text .stub .text.* .gnu.linkonce.t.*) + } + PROVIDE (__etext = .); + PROVIDE (_etext = .); + PROVIDE (etext = .); + + /* Other stuff is appended to the text segment: */ + .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } + .rodata1 : { *(.rodata1) } + + .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr + .eh_frame : { KEEP (*(.eh_frame)) } :text + .gcc_except_table : { *(.gcc_except_table) } + .fixup : { *(.fixup) } + + .dynamic : { *(.dynamic) } :text :dynamic + .got : { *(.got) } + .plt : { *(.plt) } + + _end = .; + __end = .; + PROVIDE (end = .); + + + /* Stabs debugging sections are here too + */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + + .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + + /DISCARD/ : { *(.note.GNU-stack) } + /DISCARD/ : { *(.data .data.* .gnu.linkonce.d.* .sdata*) } + /DISCARD/ : { *(.bss .sbss .dynbss .dynsbss) } +} + + +PHDRS +{ + text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */ + note PT_NOTE FLAGS(4); /* PF_R */ + dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ + eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */ +} + + +/* + * This controls what symbols we export from the DSO. + */ +VERSION +{ + VDSO_VERSION_STRING { + global: + __kernel_datapage_offset; /* Has to be there for the kernel to find */ + __kernel_get_syscall_map; + __kernel_gettimeofday; + __kernel_clock_gettime; + __kernel_clock_getres; + __kernel_get_tbfreq; + __kernel_sync_dicache; + __kernel_sync_dicache_p5; + __kernel_sigtramp32; + __kernel_sigtramp_rt32; + local: *; + }; +} diff --git a/arch/powerpc/kernel/vdso32/vdso32_wrapper.S b/arch/powerpc/kernel/vdso32/vdso32_wrapper.S new file mode 100644 index 00000000000..556f0caa5d8 --- /dev/null +++ b/arch/powerpc/kernel/vdso32/vdso32_wrapper.S @@ -0,0 +1,13 @@ +#include <linux/init.h> +#include <asm/page.h> + + .section ".data.page_aligned" + + .globl vdso32_start, vdso32_end + .balign PAGE_SIZE +vdso32_start: + .incbin "arch/powerpc/kernel/vdso32/vdso32.so" + .balign PAGE_SIZE +vdso32_end: + + .previous diff --git a/arch/powerpc/kernel/vdso64/Makefile b/arch/powerpc/kernel/vdso64/Makefile new file mode 100644 index 00000000000..ab39988452c --- /dev/null +++ b/arch/powerpc/kernel/vdso64/Makefile @@ -0,0 +1,35 @@ +# List of files in the vdso, has to be asm only for now + +obj-vdso64 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o + +# Build rules + +targets := $(obj-vdso64) vdso64.so +obj-vdso64 := $(addprefix $(obj)/, $(obj-vdso64)) + +EXTRA_CFLAGS := -shared -s -fno-common -fno-builtin +EXTRA_CFLAGS += -nostdlib -Wl,-soname=linux-vdso64.so.1 +EXTRA_AFLAGS := -D__VDSO64__ -s + +obj-y += vdso64_wrapper.o +extra-y += vdso64.lds +CPPFLAGS_vdso64.lds += -P -C -U$(ARCH) + +# Force dependency (incbin is bad) +$(obj)/vdso64_wrapper.o : $(obj)/vdso64.so + +# link rule for the .so file, .lds has to be first +$(obj)/vdso64.so: $(src)/vdso64.lds $(obj-vdso64) + $(call if_changed,vdso64ld) + +# assembly rules for the .S files +$(obj-vdso64): %.o: %.S + $(call if_changed_dep,vdso64as) + +# actual build commands +quiet_cmd_vdso64ld = VDSO64L $@ + cmd_vdso64ld = $(CC) $(c_flags) -Wl,-T $^ -o $@ +quiet_cmd_vdso64as = VDSO64A $@ + cmd_vdso64as = $(CC) $(a_flags) -c -o $@ $< + + diff --git a/arch/powerpc/kernel/vdso64/cacheflush.S b/arch/powerpc/kernel/vdso64/cacheflush.S new file mode 100644 index 00000000000..d4a0ad28d53 --- /dev/null +++ b/arch/powerpc/kernel/vdso64/cacheflush.S @@ -0,0 +1,66 @@ +/* + * vDSO provided cache flush routines + * + * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org), + * IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/config.h> +#include <asm/processor.h> +#include <asm/ppc_asm.h> +#include <asm/vdso.h> +#include <asm/asm-offsets.h> + + .text + +/* + * Default "generic" version of __kernel_sync_dicache. + * + * void __kernel_sync_dicache(unsigned long start, unsigned long end) + * + * Flushes the data cache & invalidate the instruction cache for the + * provided range [start, end[ + * + * Note: all CPUs supported by this kernel have a 128 bytes cache + * line size so we don't have to peek that info from the datapage + */ +V_FUNCTION_BEGIN(__kernel_sync_dicache) + .cfi_startproc + li r5,127 + andc r6,r3,r5 /* round low to line bdy */ + subf r8,r6,r4 /* compute length */ + add r8,r8,r5 /* ensure we get enough */ + srwi. r8,r8,7 /* compute line count */ + beqlr /* nothing to do? */ + mtctr r8 + mr r3,r6 +1: dcbst 0,r3 + addi r3,r3,128 + bdnz 1b + sync + mtctr r8 +1: icbi 0,r6 + addi r6,r6,128 + bdnz 1b + isync + li r3,0 + blr + .cfi_endproc +V_FUNCTION_END(__kernel_sync_dicache) + + +/* + * POWER5 version of __kernel_sync_dicache + */ +V_FUNCTION_BEGIN(__kernel_sync_dicache_p5) + .cfi_startproc + sync + isync + li r3,0 + blr + .cfi_endproc +V_FUNCTION_END(__kernel_sync_dicache_p5) diff --git a/arch/powerpc/kernel/vdso64/datapage.S b/arch/powerpc/kernel/vdso64/datapage.S new file mode 100644 index 00000000000..6393e4137bc --- /dev/null +++ b/arch/powerpc/kernel/vdso64/datapage.S @@ -0,0 +1,85 @@ +/* + * Access to the shared data page by the vDSO & syscall map + * + * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org), IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <asm/processor.h> +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> +#include <asm/unistd.h> +#include <asm/vdso.h> + + .text +V_FUNCTION_BEGIN(__get_datapage) + .cfi_startproc + /* We don't want that exposed or overridable as we want other objects + * to be able to bl directly to here + */ + .protected __get_datapage + .hidden __get_datapage + + mflr r0 + .cfi_register lr,r0 + + bcl 20,31,1f + .global __kernel_datapage_offset; +__kernel_datapage_offset: + .long 0 +1: + mflr r3 + mtlr r0 + lwz r0,0(r3) + add r3,r0,r3 + blr + .cfi_endproc +V_FUNCTION_END(__get_datapage) + +/* + * void *__kernel_get_syscall_map(unsigned int *syscall_count) ; + * + * returns a pointer to the syscall map. the map is agnostic to the + * size of "long", unlike kernel bitops, it stores bits from top to + * bottom so that memory actually contains a linear bitmap + * check for syscall N by testing bit (0x80000000 >> (N & 0x1f)) of + * 32 bits int at N >> 5. + */ +V_FUNCTION_BEGIN(__kernel_get_syscall_map) + .cfi_startproc + mflr r12 + .cfi_register lr,r12 + + mr r4,r3 + bl V_LOCAL_FUNC(__get_datapage) + mtlr r12 + addi r3,r3,CFG_SYSCALL_MAP64 + cmpli cr0,r4,0 + beqlr + li r0,__NR_syscalls + stw r0,0(r4) + blr + .cfi_endproc +V_FUNCTION_END(__kernel_get_syscall_map) + + +/* + * void unsigned long __kernel_get_tbfreq(void); + * + * returns the timebase frequency in HZ + */ +V_FUNCTION_BEGIN(__kernel_get_tbfreq) + .cfi_startproc + mflr r12 + .cfi_register lr,r12 + bl V_LOCAL_FUNC(__get_datapage) + ld r3,CFG_TB_TICKS_PER_SEC(r3) + mtlr r12 + blr + .cfi_endproc +V_FUNCTION_END(__kernel_get_tbfreq) diff --git a/arch/powerpc/kernel/vdso64/gettimeofday.S b/arch/powerpc/kernel/vdso64/gettimeofday.S new file mode 100644 index 00000000000..1a89094715c --- /dev/null +++ b/arch/powerpc/kernel/vdso64/gettimeofday.S @@ -0,0 +1,249 @@ + + /* + * Userland implementation of gettimeofday() for 64 bits processes in a + * ppc64 kernel for use in the vDSO + * + * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org), + * IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/config.h> +#include <asm/processor.h> +#include <asm/ppc_asm.h> +#include <asm/vdso.h> +#include <asm/asm-offsets.h> +#include <asm/unistd.h> + + .text +/* + * Exact prototype of gettimeofday + * + * int __kernel_gettimeofday(struct timeval *tv, struct timezone *tz); + * + */ +V_FUNCTION_BEGIN(__kernel_gettimeofday) + .cfi_startproc + mflr r12 + .cfi_register lr,r12 + + mr r11,r3 /* r11 holds tv */ + mr r10,r4 /* r10 holds tz */ + bl V_LOCAL_FUNC(__get_datapage) /* get data page */ + bl V_LOCAL_FUNC(__do_get_xsec) /* get xsec from tb & kernel */ + lis r7,15 /* r7 = 1000000 = USEC_PER_SEC */ + ori r7,r7,16960 + rldicl r5,r4,44,20 /* r5 = sec = xsec / XSEC_PER_SEC */ + rldicr r6,r5,20,43 /* r6 = sec * XSEC_PER_SEC */ + std r5,TVAL64_TV_SEC(r11) /* store sec in tv */ + subf r0,r6,r4 /* r0 = xsec = (xsec - r6) */ + mulld r0,r0,r7 /* usec = (xsec * USEC_PER_SEC) / + * XSEC_PER_SEC + */ + rldicl r0,r0,44,20 + cmpldi cr0,r10,0 /* check if tz is NULL */ + std r0,TVAL64_TV_USEC(r11) /* store usec in tv */ + beq 1f + lwz r4,CFG_TZ_MINUTEWEST(r3)/* fill tz */ + lwz r5,CFG_TZ_DSTTIME(r3) + stw r4,TZONE_TZ_MINWEST(r10) + stw r5,TZONE_TZ_DSTTIME(r10) +1: mtlr r12 + li r3,0 /* always success */ + blr + .cfi_endproc +V_FUNCTION_END(__kernel_gettimeofday) + + +/* + * Exact prototype of clock_gettime() + * + * int __kernel_clock_gettime(clockid_t clock_id, struct timespec *tp); + * + */ +V_FUNCTION_BEGIN(__kernel_clock_gettime) + .cfi_startproc + /* Check for supported clock IDs */ + cmpwi cr0,r3,CLOCK_REALTIME + cmpwi cr1,r3,CLOCK_MONOTONIC + cror cr0*4+eq,cr0*4+eq,cr1*4+eq + bne cr0,99f + + mflr r12 /* r12 saves lr */ + .cfi_register lr,r12 + mr r10,r3 /* r10 saves id */ + mr r11,r4 /* r11 saves tp */ + bl V_LOCAL_FUNC(__get_datapage) /* get data page */ + beq cr1,50f /* if monotonic -> jump there */ + + /* + * CLOCK_REALTIME + */ + + bl V_LOCAL_FUNC(__do_get_xsec) /* get xsec from tb & kernel */ + + lis r7,15 /* r7 = 1000000 = USEC_PER_SEC */ + ori r7,r7,16960 + rldicl r5,r4,44,20 /* r5 = sec = xsec / XSEC_PER_SEC */ + rldicr r6,r5,20,43 /* r6 = sec * XSEC_PER_SEC */ + std r5,TSPC64_TV_SEC(r11) /* store sec in tv */ + subf r0,r6,r4 /* r0 = xsec = (xsec - r6) */ + mulld r0,r0,r7 /* usec = (xsec * USEC_PER_SEC) / + * XSEC_PER_SEC + */ + rldicl r0,r0,44,20 + mulli r0,r0,1000 /* nsec = usec * 1000 */ + std r0,TSPC64_TV_NSEC(r11) /* store nsec in tp */ + + mtlr r12 + li r3,0 + blr + + /* + * CLOCK_MONOTONIC + */ + +50: bl V_LOCAL_FUNC(__do_get_xsec) /* get xsec from tb & kernel */ + + lis r7,15 /* r7 = 1000000 = USEC_PER_SEC */ + ori r7,r7,16960 + rldicl r5,r4,44,20 /* r5 = sec = xsec / XSEC_PER_SEC */ + rldicr r6,r5,20,43 /* r6 = sec * XSEC_PER_SEC */ + subf r0,r6,r4 /* r0 = xsec = (xsec - r6) */ + mulld r0,r0,r7 /* usec = (xsec * USEC_PER_SEC) / + * XSEC_PER_SEC + */ + rldicl r6,r0,44,20 + mulli r6,r6,1000 /* nsec = usec * 1000 */ + + /* now we must fixup using wall to monotonic. We need to snapshot + * that value and do the counter trick again. Fortunately, we still + * have the counter value in r8 that was returned by __do_get_xsec. + * At this point, r5,r6 contain our sec/nsec values. + * can be used + */ + + lwa r4,WTOM_CLOCK_SEC(r3) + lwa r7,WTOM_CLOCK_NSEC(r3) + + /* We now have our result in r4,r7. We create a fake dependency + * on that result and re-check the counter + */ + or r9,r4,r7 + xor r0,r9,r9 + add r3,r3,r0 + ld r0,CFG_TB_UPDATE_COUNT(r3) + cmpld cr0,r0,r8 /* check if updated */ + bne- 50b + + /* Calculate and store result. Note that this mimmics the C code, + * which may cause funny results if nsec goes negative... is that + * possible at all ? + */ + add r4,r4,r5 + add r7,r7,r6 + lis r9,NSEC_PER_SEC@h + ori r9,r9,NSEC_PER_SEC@l + cmpl cr0,r7,r9 + cmpli cr1,r7,0 + blt 1f + subf r7,r9,r7 + addi r4,r4,1 +1: bge cr1,1f + addi r4,r4,-1 + add r7,r7,r9 +1: std r4,TSPC64_TV_SEC(r11) + std r7,TSPC64_TV_NSEC(r11) + + mtlr r12 + li r3,0 + blr + + /* + * syscall fallback + */ +98: + mtlr r12 + mr r3,r10 + mr r4,r11 +99: + li r0,__NR_clock_gettime + sc + blr + .cfi_endproc +V_FUNCTION_END(__kernel_clock_gettime) + + +/* + * Exact prototype of clock_getres() + * + * int __kernel_clock_getres(clockid_t clock_id, struct timespec *res); + * + */ +V_FUNCTION_BEGIN(__kernel_clock_getres) + .cfi_startproc + /* Check for supported clock IDs */ + cmpwi cr0,r3,CLOCK_REALTIME + cmpwi cr1,r3,CLOCK_MONOTONIC + cror cr0*4+eq,cr0*4+eq,cr1*4+eq + bne cr0,99f + + li r3,0 + cmpli cr0,r4,0 + beqlr + lis r5,CLOCK_REALTIME_RES@h + ori r5,r5,CLOCK_REALTIME_RES@l + std r3,TSPC64_TV_SEC(r4) + std r5,TSPC64_TV_NSEC(r4) + blr + + /* + * syscall fallback + */ +99: + li r0,__NR_clock_getres + sc + blr + .cfi_endproc +V_FUNCTION_END(__kernel_clock_getres) + + +/* + * This is the core of gettimeofday(), it returns the xsec + * value in r4 and expects the datapage ptr (non clobbered) + * in r3. clobbers r0,r4,r5,r6,r7,r8 + * When returning, r8 contains the counter value that can be reused + */ +V_FUNCTION_BEGIN(__do_get_xsec) + .cfi_startproc + /* check for update count & load values */ +1: ld r8,CFG_TB_UPDATE_COUNT(r3) + andi. r0,r4,1 /* pending update ? loop */ + bne- 1b + xor r0,r4,r4 /* create dependency */ + add r3,r3,r0 + + /* Get TB & offset it */ + mftb r7 + ld r9,CFG_TB_ORIG_STAMP(r3) + subf r7,r9,r7 + + /* Scale result */ + ld r5,CFG_TB_TO_XS(r3) + mulhdu r7,r7,r5 + + /* Add stamp since epoch */ + ld r6,CFG_STAMP_XSEC(r3) + add r4,r6,r7 + + xor r0,r4,r4 + add r3,r3,r0 + ld r0,CFG_TB_UPDATE_COUNT(r3) + cmpld cr0,r0,r8 /* check if updated */ + bne- 1b + blr + .cfi_endproc +V_FUNCTION_END(__do_get_xsec) diff --git a/arch/powerpc/kernel/vdso64/note.S b/arch/powerpc/kernel/vdso64/note.S new file mode 100644 index 00000000000..dc2a509f7e8 --- /dev/null +++ b/arch/powerpc/kernel/vdso64/note.S @@ -0,0 +1 @@ +#include "../vdso32/note.S" diff --git a/arch/powerpc/kernel/vdso64/sigtramp.S b/arch/powerpc/kernel/vdso64/sigtramp.S new file mode 100644 index 00000000000..31b604ab56d --- /dev/null +++ b/arch/powerpc/kernel/vdso64/sigtramp.S @@ -0,0 +1,295 @@ +/* + * Signal trampoline for 64 bits processes in a ppc64 kernel for + * use in the vDSO + * + * Copyright (C) 2004 Benjamin Herrenschmuidt (benh@kernel.crashing.org), IBM Corp. + * Copyright (C) 2004 Alan Modra (amodra@au.ibm.com)), IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/config.h> +#include <asm/processor.h> +#include <asm/ppc_asm.h> +#include <asm/unistd.h> +#include <asm/vdso.h> +#include <asm/ptrace.h> /* XXX for __SIGNAL_FRAMESIZE */ + + .text + +/* The nop here is a hack. The dwarf2 unwind routines subtract 1 from + the return address to get an address in the middle of the presumed + call instruction. Since we don't have a call here, we artifically + extend the range covered by the unwind info by padding before the + real start. */ + nop + .balign 8 +V_FUNCTION_BEGIN(__kernel_sigtramp_rt64) +.Lsigrt_start = . - 4 + addi r1, r1, __SIGNAL_FRAMESIZE + li r0,__NR_rt_sigreturn + sc +.Lsigrt_end: +V_FUNCTION_END(__kernel_sigtramp_rt64) +/* The ".balign 8" above and the following zeros mimic the old stack + trampoline layout. The last magic value is the ucontext pointer, + chosen in such a way that older libgcc unwind code returns a zero + for a sigcontext pointer. */ + .long 0,0,0 + .quad 0,-21*8 + +/* Register r1 can be found at offset 8 of a pt_regs structure. + A pointer to the pt_regs is stored in memory at the old sp plus PTREGS. */ +#define cfa_save \ + .byte 0x0f; /* DW_CFA_def_cfa_expression */ \ + .uleb128 9f - 1f; /* length */ \ +1: \ + .byte 0x71; .sleb128 PTREGS; /* DW_OP_breg1 */ \ + .byte 0x06; /* DW_OP_deref */ \ + .byte 0x23; .uleb128 RSIZE; /* DW_OP_plus_uconst */ \ + .byte 0x06; /* DW_OP_deref */ \ +9: + +/* Register REGNO can be found at offset OFS of a pt_regs structure. + A pointer to the pt_regs is stored in memory at the old sp plus PTREGS. */ +#define rsave(regno, ofs) \ + .byte 0x10; /* DW_CFA_expression */ \ + .uleb128 regno; /* regno */ \ + .uleb128 9f - 1f; /* length */ \ +1: \ + .byte 0x71; .sleb128 PTREGS; /* DW_OP_breg1 */ \ + .byte 0x06; /* DW_OP_deref */ \ + .ifne ofs; \ + .byte 0x23; .uleb128 ofs; /* DW_OP_plus_uconst */ \ + .endif; \ +9: + +/* If msr bit 1<<25 is set, then VMX register REGNO is at offset REGNO*16 + of the VMX reg struct. A pointer to the VMX reg struct is at VREGS in + the pt_regs struct. This macro is for REGNO == 0, and contains + 'subroutines' that the other macros jump to. */ +#define vsave_msr0(regno) \ + .byte 0x10; /* DW_CFA_expression */ \ + .uleb128 regno + 77; /* regno */ \ + .uleb128 9f - 1f; /* length */ \ +1: \ + .byte 0x30 + regno; /* DW_OP_lit0 */ \ +2: \ + .byte 0x40; /* DW_OP_lit16 */ \ + .byte 0x1e; /* DW_OP_mul */ \ +3: \ + .byte 0x71; .sleb128 PTREGS; /* DW_OP_breg1 */ \ + .byte 0x06; /* DW_OP_deref */ \ + .byte 0x12; /* DW_OP_dup */ \ + .byte 0x23; /* DW_OP_plus_uconst */ \ + .uleb128 33*RSIZE; /* msr offset */ \ + .byte 0x06; /* DW_OP_deref */ \ + .byte 0x0c; .long 1 << 25; /* DW_OP_const4u */ \ + .byte 0x1a; /* DW_OP_and */ \ + .byte 0x12; /* DW_OP_dup, ret 0 if bra taken */ \ + .byte 0x30; /* DW_OP_lit0 */ \ + .byte 0x29; /* DW_OP_eq */ \ + .byte 0x28; .short 0x7fff; /* DW_OP_bra to end */ \ + .byte 0x13; /* DW_OP_drop, pop the 0 */ \ + .byte 0x23; .uleb128 VREGS; /* DW_OP_plus_uconst */ \ + .byte 0x06; /* DW_OP_deref */ \ + .byte 0x22; /* DW_OP_plus */ \ + .byte 0x2f; .short 0x7fff; /* DW_OP_skip to end */ \ +9: + +/* If msr bit 1<<25 is set, then VMX register REGNO is at offset REGNO*16 + of the VMX reg struct. REGNO is 1 thru 31. */ +#define vsave_msr1(regno) \ + .byte 0x10; /* DW_CFA_expression */ \ + .uleb128 regno + 77; /* regno */ \ + .uleb128 9f - 1f; /* length */ \ +1: \ + .byte 0x30 + regno; /* DW_OP_lit n */ \ + .byte 0x2f; .short 2b - 9f; /* DW_OP_skip */ \ +9: + +/* If msr bit 1<<25 is set, then VMX register REGNO is at offset OFS of + the VMX save block. */ +#define vsave_msr2(regno, ofs) \ + .byte 0x10; /* DW_CFA_expression */ \ + .uleb128 regno + 77; /* regno */ \ + .uleb128 9f - 1f; /* length */ \ +1: \ + .byte 0x0a; .short ofs; /* DW_OP_const2u */ \ + .byte 0x2f; .short 3b - 9f; /* DW_OP_skip */ \ +9: + +/* VMX register REGNO is at offset OFS of the VMX save area. */ +#define vsave(regno, ofs) \ + .byte 0x10; /* DW_CFA_expression */ \ + .uleb128 regno + 77; /* regno */ \ + .uleb128 9f - 1f; /* length */ \ +1: \ + .byte 0x71; .sleb128 PTREGS; /* DW_OP_breg1 */ \ + .byte 0x06; /* DW_OP_deref */ \ + .byte 0x23; .uleb128 VREGS; /* DW_OP_plus_uconst */ \ + .byte 0x06; /* DW_OP_deref */ \ + .byte 0x23; .uleb128 ofs; /* DW_OP_plus_uconst */ \ +9: + +/* This is where the pt_regs pointer can be found on the stack. */ +#define PTREGS 128+168+56 + +/* Size of regs. */ +#define RSIZE 8 + +/* This is the offset of the VMX reg pointer. */ +#define VREGS 48*RSIZE+33*8 + +/* Describe where general purpose regs are saved. */ +#define EH_FRAME_GEN \ + cfa_save; \ + rsave ( 0, 0*RSIZE); \ + rsave ( 2, 2*RSIZE); \ + rsave ( 3, 3*RSIZE); \ + rsave ( 4, 4*RSIZE); \ + rsave ( 5, 5*RSIZE); \ + rsave ( 6, 6*RSIZE); \ + rsave ( 7, 7*RSIZE); \ + rsave ( 8, 8*RSIZE); \ + rsave ( 9, 9*RSIZE); \ + rsave (10, 10*RSIZE); \ + rsave (11, 11*RSIZE); \ + rsave (12, 12*RSIZE); \ + rsave (13, 13*RSIZE); \ + rsave (14, 14*RSIZE); \ + rsave (15, 15*RSIZE); \ + rsave (16, 16*RSIZE); \ + rsave (17, 17*RSIZE); \ + rsave (18, 18*RSIZE); \ + rsave (19, 19*RSIZE); \ + rsave (20, 20*RSIZE); \ + rsave (21, 21*RSIZE); \ + rsave (22, 22*RSIZE); \ + rsave (23, 23*RSIZE); \ + rsave (24, 24*RSIZE); \ + rsave (25, 25*RSIZE); \ + rsave (26, 26*RSIZE); \ + rsave (27, 27*RSIZE); \ + rsave (28, 28*RSIZE); \ + rsave (29, 29*RSIZE); \ + rsave (30, 30*RSIZE); \ + rsave (31, 31*RSIZE); \ + rsave (67, 32*RSIZE); /* ap, used as temp for nip */ \ + rsave (65, 36*RSIZE); /* lr */ \ + rsave (70, 38*RSIZE) /* cr */ + +/* Describe where the FP regs are saved. */ +#define EH_FRAME_FP \ + rsave (32, 48*RSIZE + 0*8); \ + rsave (33, 48*RSIZE + 1*8); \ + rsave (34, 48*RSIZE + 2*8); \ + rsave (35, 48*RSIZE + 3*8); \ + rsave (36, 48*RSIZE + 4*8); \ + rsave (37, 48*RSIZE + 5*8); \ + rsave (38, 48*RSIZE + 6*8); \ + rsave (39, 48*RSIZE + 7*8); \ + rsave (40, 48*RSIZE + 8*8); \ + rsave (41, 48*RSIZE + 9*8); \ + rsave (42, 48*RSIZE + 10*8); \ + rsave (43, 48*RSIZE + 11*8); \ + rsave (44, 48*RSIZE + 12*8); \ + rsave (45, 48*RSIZE + 13*8); \ + rsave (46, 48*RSIZE + 14*8); \ + rsave (47, 48*RSIZE + 15*8); \ + rsave (48, 48*RSIZE + 16*8); \ + rsave (49, 48*RSIZE + 17*8); \ + rsave (50, 48*RSIZE + 18*8); \ + rsave (51, 48*RSIZE + 19*8); \ + rsave (52, 48*RSIZE + 20*8); \ + rsave (53, 48*RSIZE + 21*8); \ + rsave (54, 48*RSIZE + 22*8); \ + rsave (55, 48*RSIZE + 23*8); \ + rsave (56, 48*RSIZE + 24*8); \ + rsave (57, 48*RSIZE + 25*8); \ + rsave (58, 48*RSIZE + 26*8); \ + rsave (59, 48*RSIZE + 27*8); \ + rsave (60, 48*RSIZE + 28*8); \ + rsave (61, 48*RSIZE + 29*8); \ + rsave (62, 48*RSIZE + 30*8); \ + rsave (63, 48*RSIZE + 31*8) + +/* Describe where the VMX regs are saved. */ +#ifdef CONFIG_ALTIVEC +#define EH_FRAME_VMX \ + vsave_msr0 ( 0); \ + vsave_msr1 ( 1); \ + vsave_msr1 ( 2); \ + vsave_msr1 ( 3); \ + vsave_msr1 ( 4); \ + vsave_msr1 ( 5); \ + vsave_msr1 ( 6); \ + vsave_msr1 ( 7); \ + vsave_msr1 ( 8); \ + vsave_msr1 ( 9); \ + vsave_msr1 (10); \ + vsave_msr1 (11); \ + vsave_msr1 (12); \ + vsave_msr1 (13); \ + vsave_msr1 (14); \ + vsave_msr1 (15); \ + vsave_msr1 (16); \ + vsave_msr1 (17); \ + vsave_msr1 (18); \ + vsave_msr1 (19); \ + vsave_msr1 (20); \ + vsave_msr1 (21); \ + vsave_msr1 (22); \ + vsave_msr1 (23); \ + vsave_msr1 (24); \ + vsave_msr1 (25); \ + vsave_msr1 (26); \ + vsave_msr1 (27); \ + vsave_msr1 (28); \ + vsave_msr1 (29); \ + vsave_msr1 (30); \ + vsave_msr1 (31); \ + vsave_msr2 (33, 32*16+12); \ + vsave (32, 33*16) +#else +#define EH_FRAME_VMX +#endif + + .section .eh_frame,"a",@progbits +.Lcie: + .long .Lcie_end - .Lcie_start +.Lcie_start: + .long 0 /* CIE ID */ + .byte 1 /* Version number */ + .string "zR" /* NUL-terminated augmentation string */ + .uleb128 4 /* Code alignment factor */ + .sleb128 -8 /* Data alignment factor */ + .byte 67 /* Return address register column, ap */ + .uleb128 1 /* Augmentation value length */ + .byte 0x14 /* DW_EH_PE_pcrel | DW_EH_PE_udata8. */ + .byte 0x0c,1,0 /* DW_CFA_def_cfa: r1 ofs 0 */ + .balign 8 +.Lcie_end: + + .long .Lfde0_end - .Lfde0_start +.Lfde0_start: + .long .Lfde0_start - .Lcie /* CIE pointer. */ + .quad .Lsigrt_start - . /* PC start, length */ + .quad .Lsigrt_end - .Lsigrt_start + .uleb128 0 /* Augmentation */ + EH_FRAME_GEN + EH_FRAME_FP + EH_FRAME_VMX +# Do we really need to describe the frame at this point? ie. will +# we ever have some call chain that returns somewhere past the addi? +# I don't think so, since gcc doesn't support async signals. +# .byte 0x41 /* DW_CFA_advance_loc 1*4 */ +#undef PTREGS +#define PTREGS 168+56 +# EH_FRAME_GEN +# EH_FRAME_FP +# EH_FRAME_VMX + .balign 8 +.Lfde0_end: diff --git a/arch/powerpc/kernel/vdso64/vdso64.lds.S b/arch/powerpc/kernel/vdso64/vdso64.lds.S new file mode 100644 index 00000000000..4bdf224464a --- /dev/null +++ b/arch/powerpc/kernel/vdso64/vdso64.lds.S @@ -0,0 +1,116 @@ +/* + * This is the infamous ld script for the 64 bits vdso + * library + */ +#include <asm/vdso.h> + +OUTPUT_FORMAT("elf64-powerpc", "elf64-powerpc", "elf64-powerpc") +OUTPUT_ARCH(powerpc:common64) +ENTRY(_start) + +SECTIONS +{ + . = VDSO64_LBASE + SIZEOF_HEADERS; + .hash : { *(.hash) } :text + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .gnu.version : { *(.gnu.version) } + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + + .note : { *(.note.*) } :text :note + + . = ALIGN (16); + .text : + { + *(.text .stub .text.* .gnu.linkonce.t.*) + *(.sfpr .glink) + } :text + PROVIDE (__etext = .); + PROVIDE (_etext = .); + PROVIDE (etext = .); + + /* Other stuff is appended to the text segment: */ + .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } + .rodata1 : { *(.rodata1) } + .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr + .eh_frame : { KEEP (*(.eh_frame)) } :text + .gcc_except_table : { *(.gcc_except_table) } + + .opd ALIGN(8) : { KEEP (*(.opd)) } + .got ALIGN(8) : { *(.got .toc) } + .rela.dyn ALIGN(8) : { *(.rela.dyn) } + + .dynamic : { *(.dynamic) } :text :dynamic + + _end = .; + PROVIDE (end = .); + + /* Stabs debugging sections are here too + */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } + /* DWARF debug sectio/ns. + Symbols in the DWARF debugging sections are relative to the beginning + of the section so we begin them at 0. */ + /* DWARF 1 */ + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + /* GNU DWARF 1 extensions */ + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + /* DWARF 1.1 and DWARF 2 */ + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + /* DWARF 2 */ + .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + /* SGI/MIPS DWARF 2 extensions */ + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + + /DISCARD/ : { *(.note.GNU-stack) } + /DISCARD/ : { *(.branch_lt) } + /DISCARD/ : { *(.data .data.* .gnu.linkonce.d.*) } + /DISCARD/ : { *(.bss .sbss .dynbss .dynsbss) } +} + +PHDRS +{ + text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */ + note PT_NOTE FLAGS(4); /* PF_R */ + dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ + eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */ +} + +/* + * This controls what symbols we export from the DSO. + */ +VERSION +{ + VDSO_VERSION_STRING { + global: + __kernel_datapage_offset; /* Has to be there for the kernel to find */ + __kernel_get_syscall_map; + __kernel_gettimeofday; + __kernel_clock_gettime; + __kernel_clock_getres; + __kernel_get_tbfreq; + __kernel_sync_dicache; + __kernel_sync_dicache_p5; + __kernel_sigtramp_rt64; + local: *; + }; +} diff --git a/arch/powerpc/kernel/vdso64/vdso64_wrapper.S b/arch/powerpc/kernel/vdso64/vdso64_wrapper.S new file mode 100644 index 00000000000..0529cb9e3b9 --- /dev/null +++ b/arch/powerpc/kernel/vdso64/vdso64_wrapper.S @@ -0,0 +1,13 @@ +#include <linux/init.h> +#include <asm/page.h> + + .section ".data.page_aligned" + + .globl vdso64_start, vdso64_end + .balign PAGE_SIZE +vdso64_start: + .incbin "arch/powerpc/kernel/vdso64/vdso64.so" + .balign PAGE_SIZE +vdso64_end: + + .previous diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c index 97082a4203a..71a6addf9f7 100644 --- a/arch/powerpc/kernel/vio.c +++ b/arch/powerpc/kernel/vio.c @@ -21,6 +21,7 @@ #include <asm/iommu.h> #include <asm/dma.h> #include <asm/vio.h> +#include <asm/prom.h> static const struct vio_device_id *vio_match_device( const struct vio_device_id *, const struct vio_dev *); @@ -265,7 +266,33 @@ static int vio_bus_match(struct device *dev, struct device_driver *drv) return (ids != NULL) && (vio_match_device(ids, vio_dev) != NULL); } +static int vio_hotplug(struct device *dev, char **envp, int num_envp, + char *buffer, int buffer_size) +{ + const struct vio_dev *vio_dev = to_vio_dev(dev); + char *cp; + int length; + + if (!num_envp) + return -ENOMEM; + + if (!vio_dev->dev.platform_data) + return -ENODEV; + cp = (char *)get_property(vio_dev->dev.platform_data, "compatible", &length); + if (!cp) + return -ENODEV; + + envp[0] = buffer; + length = scnprintf(buffer, buffer_size, "MODALIAS=vio:T%sS%s", + vio_dev->type, cp); + if (buffer_size - length <= 0) + return -ENOMEM; + envp[1] = NULL; + return 0; +} + struct bus_type vio_bus_type = { .name = "vio", + .hotplug = vio_hotplug, .match = vio_bus_match, }; diff --git a/arch/powerpc/lib/bitops.c b/arch/powerpc/lib/bitops.c index b67ce3004eb..f68ad71a018 100644 --- a/arch/powerpc/lib/bitops.c +++ b/arch/powerpc/lib/bitops.c @@ -41,7 +41,7 @@ unsigned long find_next_bit(const unsigned long *addr, unsigned long size, tmp = *p; found_first: - tmp &= (~0UL >> (64 - size)); + tmp &= (~0UL >> (BITS_PER_LONG - size)); if (tmp == 0UL) /* Are any bits set? */ return result + size; /* Nope. */ found_middle: diff --git a/arch/powerpc/lib/copypage_64.S b/arch/powerpc/lib/copypage_64.S index 733d61618bb..40523b14010 100644 --- a/arch/powerpc/lib/copypage_64.S +++ b/arch/powerpc/lib/copypage_64.S @@ -11,7 +11,7 @@ #include <asm/processor.h> #include <asm/ppc_asm.h> -_GLOBAL(copy_page) +_GLOBAL(copy_4K_page) std r31,-8(1) std r30,-16(1) std r29,-24(1) diff --git a/arch/powerpc/lib/copyuser_64.S b/arch/powerpc/lib/copyuser_64.S index a0b3fbbd6fb..6d69ef39b7d 100644 --- a/arch/powerpc/lib/copyuser_64.S +++ b/arch/powerpc/lib/copyuser_64.S @@ -24,7 +24,7 @@ _GLOBAL(__copy_tofrom_user) std r4,-16(r1) std r5,-8(r1) dcbt 0,r4 - beq .Lcopy_page + beq .Lcopy_page_4K andi. r6,r6,7 mtcrf 0x01,r5 blt cr1,.Lshort_copy @@ -366,7 +366,7 @@ _GLOBAL(__copy_tofrom_user) * above (following the .Ldst_aligned label) but it runs slightly * slower on POWER3. */ -.Lcopy_page: +.Lcopy_page_4K: std r31,-32(1) std r30,-40(1) std r29,-48(1) diff --git a/arch/powerpc/lib/locks.c b/arch/powerpc/lib/locks.c index 2a912f411eb..35bd03c41dd 100644 --- a/arch/powerpc/lib/locks.c +++ b/arch/powerpc/lib/locks.c @@ -23,6 +23,7 @@ #if defined(CONFIG_PPC_SPLPAR) || defined(CONFIG_PPC_ISERIES) #include <asm/hvcall.h> #include <asm/iseries/hv_call.h> +#include <asm/smp.h> void __spin_yield(raw_spinlock_t *lock) { diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 841d8b6323a..93d4fbfdb72 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -389,5 +389,22 @@ void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig) } /* kernel has accessed a bad area */ + + printk(KERN_ALERT "Unable to handle kernel paging request for "); + switch (regs->trap) { + case 0x300: + case 0x380: + printk("data at address 0x%08lx\n", regs->dar); + break; + case 0x400: + case 0x480: + printk("instruction fetch\n"); + break; + default: + printk("unknown fault\n"); + } + printk(KERN_ALERT "Faulting instruction address: 0x%08lx\n", + regs->nip); + die("Kernel access of bad area", regs, sig); } diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c index af9ca0eb6d5..5d581bb3aa1 100644 --- a/arch/powerpc/mm/fsl_booke_mmu.c +++ b/arch/powerpc/mm/fsl_booke_mmu.c @@ -1,5 +1,5 @@ /* - * Modifications by Kumar Gala (kumar.gala@freescale.com) to support + * Modifications by Kumar Gala (galak@kernel.crashing.org) to support * E500 Book E processors. * * Copyright 2004 Freescale Semiconductor, Inc diff --git a/arch/powerpc/mm/hash_low_64.S b/arch/powerpc/mm/hash_low_64.S index d6ed9102eee..e0d02c4a261 100644 --- a/arch/powerpc/mm/hash_low_64.S +++ b/arch/powerpc/mm/hash_low_64.S @@ -1,7 +1,7 @@ /* * ppc64 MMU hashtable management routines * - * (c) Copyright IBM Corp. 2003 + * (c) Copyright IBM Corp. 2003, 2005 * * Maintained by: Benjamin Herrenschmidt * <benh@kernel.crashing.org> @@ -10,6 +10,7 @@ * described in the kernel's COPYING file. */ +#include <linux/config.h> #include <asm/reg.h> #include <asm/pgtable.h> #include <asm/mmu.h> @@ -42,14 +43,24 @@ /* Save non-volatile offsets */ #define STK_REG(i) (112 + ((i)-14)*8) + +#ifndef CONFIG_PPC_64K_PAGES + +/***************************************************************************** + * * + * 4K SW & 4K HW pages implementation * + * * + *****************************************************************************/ + + /* - * _hash_page(unsigned long ea, unsigned long access, unsigned long vsid, - * pte_t *ptep, unsigned long trap, int local) + * _hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, + * pte_t *ptep, unsigned long trap, int local) * - * Adds a page to the hash table. This is the non-LPAR version for now + * Adds a 4K page to the hash table in a segment of 4K pages only */ -_GLOBAL(__hash_page) +_GLOBAL(__hash_page_4K) mflr r0 std r0,16(r1) stdu r1,-STACKFRAMESIZE(r1) @@ -88,7 +99,8 @@ _GLOBAL(__hash_page) /* If so, just bail out and refault if needed. Someone else * is changing this PTE anyway and might hash it. */ - bne- bail_ok + bne- htab_bail_ok + /* Prepare new PTE value (turn access RW into DIRTY, then * add BUSY,HASHPTE and ACCESSED) */ @@ -118,10 +130,10 @@ _GLOBAL(__hash_page) /* Convert linux PTE bits into HW equivalents */ andi. r3,r30,0x1fe /* Get basic set of flags */ - xori r3,r3,HW_NO_EXEC /* _PAGE_EXEC -> NOEXEC */ + xori r3,r3,HPTE_R_N /* _PAGE_EXEC -> NOEXEC */ rlwinm r0,r30,32-9+1,30,30 /* _PAGE_RW -> _PAGE_USER (r0) */ rlwinm r4,r30,32-7+1,30,30 /* _PAGE_DIRTY -> _PAGE_USER (r4) */ - and r0,r0,r4 /* _PAGE_RW & _PAGE_DIRTY -> r0 bit 30 */ + and r0,r0,r4 /* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/ andc r0,r30,r0 /* r0 = pte & ~r0 */ rlwimi r3,r0,32-1,31,31 /* Insert result into PP lsb */ @@ -158,19 +170,21 @@ htab_insert_pte: andc r30,r30,r0 ori r30,r30,_PAGE_HASHPTE - /* page number in r5 */ - rldicl r5,r31,64-PTE_SHIFT,PTE_SHIFT + /* physical address r5 */ + rldicl r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT + sldi r5,r5,PAGE_SHIFT /* Calculate primary group hash */ and r0,r28,r27 - rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */ + rldicr r3,r0,3,63-3 /* r3 = (hash & mask) << 3 */ /* Call ppc_md.hpte_insert */ - ld r7,STK_PARM(r4)(r1) /* Retreive new pp bits */ + ld r6,STK_PARM(r4)(r1) /* Retreive new pp bits */ mr r4,r29 /* Retreive va */ - li r6,0 /* no vflags */ + li r7,0 /* !bolted, !secondary */ + li r8,MMU_PAGE_4K /* page size */ _GLOBAL(htab_call_hpte_insert1) - bl . /* Will be patched by htab_finish_init() */ + bl . /* Patched by htab_finish_init() */ cmpdi 0,r3,0 bge htab_pte_insert_ok /* Insertion successful */ cmpdi 0,r3,-2 /* Critical failure */ @@ -178,19 +192,21 @@ _GLOBAL(htab_call_hpte_insert1) /* Now try secondary slot */ - /* page number in r5 */ - rldicl r5,r31,64-PTE_SHIFT,PTE_SHIFT + /* physical address r5 */ + rldicl r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT + sldi r5,r5,PAGE_SHIFT /* Calculate secondary group hash */ andc r0,r27,r28 rldicr r3,r0,3,63-3 /* r0 = (~hash & mask) << 3 */ /* Call ppc_md.hpte_insert */ - ld r7,STK_PARM(r4)(r1) /* Retreive new pp bits */ + ld r6,STK_PARM(r4)(r1) /* Retreive new pp bits */ mr r4,r29 /* Retreive va */ - li r6,HPTE_V_SECONDARY@l /* secondary slot */ + li r7,HPTE_V_SECONDARY /* !bolted, secondary */ + li r8,MMU_PAGE_4K /* page size */ _GLOBAL(htab_call_hpte_insert2) - bl . /* Will be patched by htab_finish_init() */ + bl . /* Patched by htab_finish_init() */ cmpdi 0,r3,0 bge+ htab_pte_insert_ok /* Insertion successful */ cmpdi 0,r3,-2 /* Critical failure */ @@ -207,14 +223,14 @@ _GLOBAL(htab_call_hpte_insert2) rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */ /* Call ppc_md.hpte_remove */ _GLOBAL(htab_call_hpte_remove) - bl . /* Will be patched by htab_finish_init() */ + bl . /* Patched by htab_finish_init() */ /* Try all again */ b htab_insert_pte -bail_ok: +htab_bail_ok: li r3,0 - b bail + b htab_bail htab_pte_insert_ok: /* Insert slot number & secondary bit in PTE */ @@ -227,7 +243,7 @@ htab_write_out_pte: ld r6,STK_PARM(r6)(r1) std r30,0(r6) li r3, 0 -bail: +htab_bail: ld r27,STK_REG(r27)(r1) ld r28,STK_REG(r28)(r1) ld r29,STK_REG(r29)(r1) @@ -256,10 +272,10 @@ htab_modify_pte: /* Call ppc_md.hpte_updatepp */ mr r5,r29 /* va */ - li r6,0 /* large is 0 */ + li r6,MMU_PAGE_4K /* page size */ ld r7,STK_PARM(r8)(r1) /* get "local" param */ _GLOBAL(htab_call_hpte_updatepp) - bl . /* Will be patched by htab_finish_init() */ + bl . /* Patched by htab_finish_init() */ /* if we failed because typically the HPTE wasn't really here * we try an insertion. @@ -276,13 +292,556 @@ htab_wrong_access: /* Bail out clearing reservation */ stdcx. r31,0,r6 li r3,1 - b bail + b htab_bail + +htab_pte_insert_failure: + /* Bail out restoring old PTE */ + ld r6,STK_PARM(r6)(r1) + std r31,0(r6) + li r3,-1 + b htab_bail + + +#else /* CONFIG_PPC_64K_PAGES */ + + +/***************************************************************************** + * * + * 64K SW & 4K or 64K HW in a 4K segment pages implementation * + * * + *****************************************************************************/ + +/* _hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, + * pte_t *ptep, unsigned long trap, int local) + */ + +/* + * For now, we do NOT implement Admixed pages + */ +_GLOBAL(__hash_page_4K) + mflr r0 + std r0,16(r1) + stdu r1,-STACKFRAMESIZE(r1) + /* Save all params that we need after a function call */ + std r6,STK_PARM(r6)(r1) + std r8,STK_PARM(r8)(r1) + + /* Add _PAGE_PRESENT to access */ + ori r4,r4,_PAGE_PRESENT + + /* Save non-volatile registers. + * r31 will hold "old PTE" + * r30 is "new PTE" + * r29 is "va" + * r28 is a hash value + * r27 is hashtab mask (maybe dynamic patched instead ?) + * r26 is the hidx mask + * r25 is the index in combo page + */ + std r25,STK_REG(r25)(r1) + std r26,STK_REG(r26)(r1) + std r27,STK_REG(r27)(r1) + std r28,STK_REG(r28)(r1) + std r29,STK_REG(r29)(r1) + std r30,STK_REG(r30)(r1) + std r31,STK_REG(r31)(r1) + + /* Step 1: + * + * Check permissions, atomically mark the linux PTE busy + * and hashed. + */ +1: + ldarx r31,0,r6 + /* Check access rights (access & ~(pte_val(*ptep))) */ + andc. r0,r4,r31 + bne- htab_wrong_access + /* Check if PTE is busy */ + andi. r0,r31,_PAGE_BUSY + /* If so, just bail out and refault if needed. Someone else + * is changing this PTE anyway and might hash it. + */ + bne- htab_bail_ok + /* Prepare new PTE value (turn access RW into DIRTY, then + * add BUSY and ACCESSED) + */ + rlwinm r30,r4,32-9+7,31-7,31-7 /* _PAGE_RW -> _PAGE_DIRTY */ + or r30,r30,r31 + ori r30,r30,_PAGE_BUSY | _PAGE_ACCESSED | _PAGE_HASHPTE + /* Write the linux PTE atomically (setting busy) */ + stdcx. r30,0,r6 + bne- 1b + isync + + /* Step 2: + * + * Insert/Update the HPTE in the hash table. At this point, + * r4 (access) is re-useable, we use it for the new HPTE flags + */ + + /* Load the hidx index */ + rldicl r25,r3,64-12,60 + + /* Calc va and put it in r29 */ + rldicr r29,r5,28,63-28 /* r29 = (vsid << 28) */ + rldicl r3,r3,0,36 /* r3 = (ea & 0x0fffffff) */ + or r29,r3,r29 /* r29 = va + + /* Calculate hash value for primary slot and store it in r28 */ + rldicl r5,r5,0,25 /* vsid & 0x0000007fffffffff */ + rldicl r0,r3,64-12,48 /* (ea >> 12) & 0xffff */ + xor r28,r5,r0 + + /* Convert linux PTE bits into HW equivalents */ + andi. r3,r30,0x1fe /* Get basic set of flags */ + xori r3,r3,HPTE_R_N /* _PAGE_EXEC -> NOEXEC */ + rlwinm r0,r30,32-9+1,30,30 /* _PAGE_RW -> _PAGE_USER (r0) */ + rlwinm r4,r30,32-7+1,30,30 /* _PAGE_DIRTY -> _PAGE_USER (r4) */ + and r0,r0,r4 /* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/ + andc r0,r30,r0 /* r0 = pte & ~r0 */ + rlwimi r3,r0,32-1,31,31 /* Insert result into PP lsb */ + + /* We eventually do the icache sync here (maybe inline that + * code rather than call a C function...) + */ +BEGIN_FTR_SECTION + mr r4,r30 + mr r5,r7 + bl .hash_page_do_lazy_icache +END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE) + + /* At this point, r3 contains new PP bits, save them in + * place of "access" in the param area (sic) + */ + std r3,STK_PARM(r4)(r1) + + /* Get htab_hash_mask */ + ld r4,htab_hash_mask@got(2) + ld r27,0(r4) /* htab_hash_mask -> r27 */ + + /* Check if we may already be in the hashtable, in this case, we + * go to out-of-line code to try to modify the HPTE. We look for + * the bit at (1 >> (index + 32)) + */ + andi. r0,r31,_PAGE_HASHPTE + li r26,0 /* Default hidx */ + beq htab_insert_pte + ld r6,STK_PARM(r6)(r1) + ori r26,r6,0x8000 /* Load the hidx mask */ + ld r26,0(r26) + addi r5,r25,36 /* Check actual HPTE_SUB bit, this */ + rldcr. r0,r31,r5,0 /* must match pgtable.h definition */ + bne htab_modify_pte + +htab_insert_pte: + /* real page number in r5, PTE RPN value + index */ + rldicl r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT + sldi r5,r5,PAGE_SHIFT-HW_PAGE_SHIFT + add r5,r5,r25 + sldi r5,r5,HW_PAGE_SHIFT + + /* Calculate primary group hash */ + and r0,r28,r27 + rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */ + + /* Call ppc_md.hpte_insert */ + ld r6,STK_PARM(r4)(r1) /* Retreive new pp bits */ + mr r4,r29 /* Retreive va */ + li r7,0 /* !bolted, !secondary */ + li r8,MMU_PAGE_4K /* page size */ +_GLOBAL(htab_call_hpte_insert1) + bl . /* patched by htab_finish_init() */ + cmpdi 0,r3,0 + bge htab_pte_insert_ok /* Insertion successful */ + cmpdi 0,r3,-2 /* Critical failure */ + beq- htab_pte_insert_failure + + /* Now try secondary slot */ + + /* real page number in r5, PTE RPN value + index */ + rldicl r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT + sldi r5,r5,PAGE_SHIFT-HW_PAGE_SHIFT + add r5,r5,r25 + sldi r5,r5,HW_PAGE_SHIFT + + /* Calculate secondary group hash */ + andc r0,r27,r28 + rldicr r3,r0,3,63-3 /* r0 = (~hash & mask) << 3 */ + + /* Call ppc_md.hpte_insert */ + ld r6,STK_PARM(r4)(r1) /* Retreive new pp bits */ + mr r4,r29 /* Retreive va */ + li r7,HPTE_V_SECONDARY /* !bolted, secondary */ + li r8,MMU_PAGE_4K /* page size */ +_GLOBAL(htab_call_hpte_insert2) + bl . /* patched by htab_finish_init() */ + cmpdi 0,r3,0 + bge+ htab_pte_insert_ok /* Insertion successful */ + cmpdi 0,r3,-2 /* Critical failure */ + beq- htab_pte_insert_failure + + /* Both are full, we need to evict something */ + mftb r0 + /* Pick a random group based on TB */ + andi. r0,r0,1 + mr r5,r28 + bne 2f + not r5,r5 +2: and r0,r5,r27 + rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */ + /* Call ppc_md.hpte_remove */ +_GLOBAL(htab_call_hpte_remove) + bl . /* patched by htab_finish_init() */ + + /* Try all again */ + b htab_insert_pte + +htab_bail_ok: + li r3,0 + b htab_bail + +htab_pte_insert_ok: + /* Insert slot number & secondary bit in PTE second half, + * clear _PAGE_BUSY and set approriate HPTE slot bit + */ + ld r6,STK_PARM(r6)(r1) + li r0,_PAGE_BUSY + andc r30,r30,r0 + /* HPTE SUB bit */ + li r0,1 + subfic r5,r25,27 /* Must match bit position in */ + sld r0,r0,r5 /* pgtable.h */ + or r30,r30,r0 + /* hindx */ + sldi r5,r25,2 + sld r3,r3,r5 + li r4,0xf + sld r4,r4,r5 + andc r26,r26,r4 + or r26,r26,r3 + ori r5,r6,0x8000 + std r26,0(r5) + lwsync + std r30,0(r6) + li r3, 0 +htab_bail: + ld r25,STK_REG(r25)(r1) + ld r26,STK_REG(r26)(r1) + ld r27,STK_REG(r27)(r1) + ld r28,STK_REG(r28)(r1) + ld r29,STK_REG(r29)(r1) + ld r30,STK_REG(r30)(r1) + ld r31,STK_REG(r31)(r1) + addi r1,r1,STACKFRAMESIZE + ld r0,16(r1) + mtlr r0 + blr + +htab_modify_pte: + /* Keep PP bits in r4 and slot idx from the PTE around in r3 */ + mr r4,r3 + sldi r5,r25,2 + srd r3,r26,r5 + + /* Secondary group ? if yes, get a inverted hash value */ + mr r5,r28 + andi. r0,r3,0x8 /* page secondary ? */ + beq 1f + not r5,r5 +1: andi. r3,r3,0x7 /* extract idx alone */ + + /* Calculate proper slot value for ppc_md.hpte_updatepp */ + and r0,r5,r27 + rldicr r0,r0,3,63-3 /* r0 = (hash & mask) << 3 */ + add r3,r0,r3 /* add slot idx */ + + /* Call ppc_md.hpte_updatepp */ + mr r5,r29 /* va */ + li r6,MMU_PAGE_4K /* page size */ + ld r7,STK_PARM(r8)(r1) /* get "local" param */ +_GLOBAL(htab_call_hpte_updatepp) + bl . /* patched by htab_finish_init() */ + + /* if we failed because typically the HPTE wasn't really here + * we try an insertion. + */ + cmpdi 0,r3,-1 + beq- htab_insert_pte + + /* Clear the BUSY bit and Write out the PTE */ + li r0,_PAGE_BUSY + andc r30,r30,r0 + ld r6,STK_PARM(r6)(r1) + std r30,0(r6) + li r3,0 + b htab_bail + +htab_wrong_access: + /* Bail out clearing reservation */ + stdcx. r31,0,r6 + li r3,1 + b htab_bail htab_pte_insert_failure: /* Bail out restoring old PTE */ ld r6,STK_PARM(r6)(r1) std r31,0(r6) li r3,-1 - b bail + b htab_bail + + +/***************************************************************************** + * * + * 64K SW & 64K HW in a 64K segment pages implementation * + * * + *****************************************************************************/ + +_GLOBAL(__hash_page_64K) + mflr r0 + std r0,16(r1) + stdu r1,-STACKFRAMESIZE(r1) + /* Save all params that we need after a function call */ + std r6,STK_PARM(r6)(r1) + std r8,STK_PARM(r8)(r1) + + /* Add _PAGE_PRESENT to access */ + ori r4,r4,_PAGE_PRESENT + + /* Save non-volatile registers. + * r31 will hold "old PTE" + * r30 is "new PTE" + * r29 is "va" + * r28 is a hash value + * r27 is hashtab mask (maybe dynamic patched instead ?) + */ + std r27,STK_REG(r27)(r1) + std r28,STK_REG(r28)(r1) + std r29,STK_REG(r29)(r1) + std r30,STK_REG(r30)(r1) + std r31,STK_REG(r31)(r1) + + /* Step 1: + * + * Check permissions, atomically mark the linux PTE busy + * and hashed. + */ +1: + ldarx r31,0,r6 + /* Check access rights (access & ~(pte_val(*ptep))) */ + andc. r0,r4,r31 + bne- ht64_wrong_access + /* Check if PTE is busy */ + andi. r0,r31,_PAGE_BUSY + /* If so, just bail out and refault if needed. Someone else + * is changing this PTE anyway and might hash it. + */ + bne- ht64_bail_ok + /* Prepare new PTE value (turn access RW into DIRTY, then + * add BUSY,HASHPTE and ACCESSED) + */ + rlwinm r30,r4,32-9+7,31-7,31-7 /* _PAGE_RW -> _PAGE_DIRTY */ + or r30,r30,r31 + ori r30,r30,_PAGE_BUSY | _PAGE_ACCESSED | _PAGE_HASHPTE + /* Write the linux PTE atomically (setting busy) */ + stdcx. r30,0,r6 + bne- 1b + isync + + /* Step 2: + * + * Insert/Update the HPTE in the hash table. At this point, + * r4 (access) is re-useable, we use it for the new HPTE flags + */ + + /* Calc va and put it in r29 */ + rldicr r29,r5,28,63-28 + rldicl r3,r3,0,36 + or r29,r3,r29 + + /* Calculate hash value for primary slot and store it in r28 */ + rldicl r5,r5,0,25 /* vsid & 0x0000007fffffffff */ + rldicl r0,r3,64-16,52 /* (ea >> 16) & 0xfff */ + xor r28,r5,r0 + + /* Convert linux PTE bits into HW equivalents */ + andi. r3,r30,0x1fe /* Get basic set of flags */ + xori r3,r3,HPTE_R_N /* _PAGE_EXEC -> NOEXEC */ + rlwinm r0,r30,32-9+1,30,30 /* _PAGE_RW -> _PAGE_USER (r0) */ + rlwinm r4,r30,32-7+1,30,30 /* _PAGE_DIRTY -> _PAGE_USER (r4) */ + and r0,r0,r4 /* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/ + andc r0,r30,r0 /* r0 = pte & ~r0 */ + rlwimi r3,r0,32-1,31,31 /* Insert result into PP lsb */ + + /* We eventually do the icache sync here (maybe inline that + * code rather than call a C function...) + */ +BEGIN_FTR_SECTION + mr r4,r30 + mr r5,r7 + bl .hash_page_do_lazy_icache +END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE) + + /* At this point, r3 contains new PP bits, save them in + * place of "access" in the param area (sic) + */ + std r3,STK_PARM(r4)(r1) + + /* Get htab_hash_mask */ + ld r4,htab_hash_mask@got(2) + ld r27,0(r4) /* htab_hash_mask -> r27 */ + + /* Check if we may already be in the hashtable, in this case, we + * go to out-of-line code to try to modify the HPTE + */ + andi. r0,r31,_PAGE_HASHPTE + bne ht64_modify_pte + +ht64_insert_pte: + /* Clear hpte bits in new pte (we also clear BUSY btw) and + * add _PAGE_HASHPTE + */ + lis r0,_PAGE_HPTEFLAGS@h + ori r0,r0,_PAGE_HPTEFLAGS@l + andc r30,r30,r0 + ori r30,r30,_PAGE_HASHPTE + + /* Phyical address in r5 */ + rldicl r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT + sldi r5,r5,PAGE_SHIFT + + /* Calculate primary group hash */ + and r0,r28,r27 + rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */ + + /* Call ppc_md.hpte_insert */ + ld r6,STK_PARM(r4)(r1) /* Retreive new pp bits */ + mr r4,r29 /* Retreive va */ + li r7,0 /* !bolted, !secondary */ + li r8,MMU_PAGE_64K +_GLOBAL(ht64_call_hpte_insert1) + bl . /* patched by htab_finish_init() */ + cmpdi 0,r3,0 + bge ht64_pte_insert_ok /* Insertion successful */ + cmpdi 0,r3,-2 /* Critical failure */ + beq- ht64_pte_insert_failure + + /* Now try secondary slot */ + + /* Phyical address in r5 */ + rldicl r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT + sldi r5,r5,PAGE_SHIFT + + /* Calculate secondary group hash */ + andc r0,r27,r28 + rldicr r3,r0,3,63-3 /* r0 = (~hash & mask) << 3 */ + + /* Call ppc_md.hpte_insert */ + ld r6,STK_PARM(r4)(r1) /* Retreive new pp bits */ + mr r4,r29 /* Retreive va */ + li r7,HPTE_V_SECONDARY /* !bolted, secondary */ + li r8,MMU_PAGE_64K +_GLOBAL(ht64_call_hpte_insert2) + bl . /* patched by htab_finish_init() */ + cmpdi 0,r3,0 + bge+ ht64_pte_insert_ok /* Insertion successful */ + cmpdi 0,r3,-2 /* Critical failure */ + beq- ht64_pte_insert_failure + + /* Both are full, we need to evict something */ + mftb r0 + /* Pick a random group based on TB */ + andi. r0,r0,1 + mr r5,r28 + bne 2f + not r5,r5 +2: and r0,r5,r27 + rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */ + /* Call ppc_md.hpte_remove */ +_GLOBAL(ht64_call_hpte_remove) + bl . /* patched by htab_finish_init() */ + + /* Try all again */ + b ht64_insert_pte + +ht64_bail_ok: + li r3,0 + b ht64_bail + +ht64_pte_insert_ok: + /* Insert slot number & secondary bit in PTE */ + rldimi r30,r3,12,63-15 + + /* Write out the PTE with a normal write + * (maybe add eieio may be good still ?) + */ +ht64_write_out_pte: + ld r6,STK_PARM(r6)(r1) + std r30,0(r6) + li r3, 0 +ht64_bail: + ld r27,STK_REG(r27)(r1) + ld r28,STK_REG(r28)(r1) + ld r29,STK_REG(r29)(r1) + ld r30,STK_REG(r30)(r1) + ld r31,STK_REG(r31)(r1) + addi r1,r1,STACKFRAMESIZE + ld r0,16(r1) + mtlr r0 + blr + +ht64_modify_pte: + /* Keep PP bits in r4 and slot idx from the PTE around in r3 */ + mr r4,r3 + rlwinm r3,r31,32-12,29,31 + + /* Secondary group ? if yes, get a inverted hash value */ + mr r5,r28 + andi. r0,r31,_PAGE_F_SECOND + beq 1f + not r5,r5 +1: + /* Calculate proper slot value for ppc_md.hpte_updatepp */ + and r0,r5,r27 + rldicr r0,r0,3,63-3 /* r0 = (hash & mask) << 3 */ + add r3,r0,r3 /* add slot idx */ + + /* Call ppc_md.hpte_updatepp */ + mr r5,r29 /* va */ + li r6,MMU_PAGE_64K + ld r7,STK_PARM(r8)(r1) /* get "local" param */ +_GLOBAL(ht64_call_hpte_updatepp) + bl . /* patched by htab_finish_init() */ + + /* if we failed because typically the HPTE wasn't really here + * we try an insertion. + */ + cmpdi 0,r3,-1 + beq- ht64_insert_pte + + /* Clear the BUSY bit and Write out the PTE */ + li r0,_PAGE_BUSY + andc r30,r30,r0 + b ht64_write_out_pte + +ht64_wrong_access: + /* Bail out clearing reservation */ + stdcx. r31,0,r6 + li r3,1 + b ht64_bail + +ht64_pte_insert_failure: + /* Bail out restoring old PTE */ + ld r6,STK_PARM(r6)(r1) + std r31,0(r6) + li r3,-1 + b ht64_bail + + +#endif /* CONFIG_PPC_64K_PAGES */ +/***************************************************************************** + * * + * Huge pages implementation is in hugetlbpage.c * + * * + *****************************************************************************/ diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index 174d14576c2..d96bcfe4c6f 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c @@ -9,6 +9,9 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ + +#undef DEBUG_LOW + #include <linux/spinlock.h> #include <linux/bitops.h> #include <linux/threads.h> @@ -22,11 +25,84 @@ #include <asm/tlbflush.h> #include <asm/tlb.h> #include <asm/cputable.h> +#include <asm/udbg.h> + +#ifdef DEBUG_LOW +#define DBG_LOW(fmt...) udbg_printf(fmt) +#else +#define DBG_LOW(fmt...) +#endif #define HPTE_LOCK_BIT 3 static DEFINE_SPINLOCK(native_tlbie_lock); +static inline void __tlbie(unsigned long va, unsigned int psize) +{ + unsigned int penc; + + /* clear top 16 bits, non SLS segment */ + va &= ~(0xffffULL << 48); + + switch (psize) { + case MMU_PAGE_4K: + va &= ~0xffful; + asm volatile("tlbie %0,0" : : "r" (va) : "memory"); + break; + default: + penc = mmu_psize_defs[psize].penc; + va &= ~((1ul << mmu_psize_defs[psize].shift) - 1); + va |= (0x7f >> (8 - penc)) << 12; + asm volatile("tlbie %0,1" : : "r" (va) : "memory"); + break; + } +} + +static inline void __tlbiel(unsigned long va, unsigned int psize) +{ + unsigned int penc; + + /* clear top 16 bits, non SLS segment */ + va &= ~(0xffffULL << 48); + + switch (psize) { + case MMU_PAGE_4K: + va &= ~0xffful; + asm volatile(".long 0x7c000224 | (%0 << 11) | (0 << 21)" + : : "r"(va) : "memory"); + break; + default: + penc = mmu_psize_defs[psize].penc; + va &= ~((1ul << mmu_psize_defs[psize].shift) - 1); + va |= (0x7f >> (8 - penc)) << 12; + asm volatile(".long 0x7c000224 | (%0 << 11) | (1 << 21)" + : : "r"(va) : "memory"); + break; + } + +} + +static inline void tlbie(unsigned long va, int psize, int local) +{ + unsigned int use_local = local && cpu_has_feature(CPU_FTR_TLBIEL); + int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE); + + if (use_local) + use_local = mmu_psize_defs[psize].tlbiel; + if (lock_tlbie && !use_local) + spin_lock(&native_tlbie_lock); + asm volatile("ptesync": : :"memory"); + if (use_local) { + __tlbiel(va, psize); + asm volatile("ptesync": : :"memory"); + } else { + __tlbie(va, psize); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); + } + if (lock_tlbie && !use_local) + spin_unlock(&native_tlbie_lock); +} + static inline void native_lock_hpte(hpte_t *hptep) { unsigned long *word = &hptep->v; @@ -48,13 +124,19 @@ static inline void native_unlock_hpte(hpte_t *hptep) } long native_hpte_insert(unsigned long hpte_group, unsigned long va, - unsigned long prpn, unsigned long vflags, - unsigned long rflags) + unsigned long pa, unsigned long rflags, + unsigned long vflags, int psize) { hpte_t *hptep = htab_address + hpte_group; unsigned long hpte_v, hpte_r; int i; + if (!(vflags & HPTE_V_BOLTED)) { + DBG_LOW(" insert(group=%lx, va=%016lx, pa=%016lx," + " rflags=%lx, vflags=%lx, psize=%d)\n", + hpte_group, va, pa, rflags, vflags, psize); + } + for (i = 0; i < HPTES_PER_GROUP; i++) { if (! (hptep->v & HPTE_V_VALID)) { /* retry with lock held */ @@ -70,10 +152,13 @@ long native_hpte_insert(unsigned long hpte_group, unsigned long va, if (i == HPTES_PER_GROUP) return -1; - hpte_v = (va >> 23) << HPTE_V_AVPN_SHIFT | vflags | HPTE_V_VALID; - if (vflags & HPTE_V_LARGE) - va &= ~(1UL << HPTE_V_AVPN_SHIFT); - hpte_r = (prpn << HPTE_R_RPN_SHIFT) | rflags; + hpte_v = hpte_encode_v(va, psize) | vflags | HPTE_V_VALID; + hpte_r = hpte_encode_r(pa, psize) | rflags; + + if (!(vflags & HPTE_V_BOLTED)) { + DBG_LOW(" i=%x hpte_v=%016lx, hpte_r=%016lx\n", + i, hpte_v, hpte_r); + } hptep->r = hpte_r; /* Guarantee the second dword is visible before the valid bit */ @@ -96,6 +181,8 @@ static long native_hpte_remove(unsigned long hpte_group) int slot_offset; unsigned long hpte_v; + DBG_LOW(" remove(group=%lx)\n", hpte_group); + /* pick a random entry to start at */ slot_offset = mftb() & 0x7; @@ -126,34 +213,51 @@ static long native_hpte_remove(unsigned long hpte_group) return i; } -static inline void set_pp_bit(unsigned long pp, hpte_t *addr) +static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, + unsigned long va, int psize, int local) { - unsigned long old; - unsigned long *p = &addr->r; - - __asm__ __volatile__( - "1: ldarx %0,0,%3\n\ - rldimi %0,%2,0,61\n\ - stdcx. %0,0,%3\n\ - bne 1b" - : "=&r" (old), "=m" (*p) - : "r" (pp), "r" (p), "m" (*p) - : "cc"); + hpte_t *hptep = htab_address + slot; + unsigned long hpte_v, want_v; + int ret = 0; + + want_v = hpte_encode_v(va, psize); + + DBG_LOW(" update(va=%016lx, avpnv=%016lx, hash=%016lx, newpp=%x)", + va, want_v & HPTE_V_AVPN, slot, newpp); + + native_lock_hpte(hptep); + + hpte_v = hptep->v; + + /* Even if we miss, we need to invalidate the TLB */ + if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) { + DBG_LOW(" -> miss\n"); + native_unlock_hpte(hptep); + ret = -1; + } else { + DBG_LOW(" -> hit\n"); + /* Update the HPTE */ + hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) | + (newpp & (HPTE_R_PP | HPTE_R_N)); + native_unlock_hpte(hptep); + } + + /* Ensure it is out of the tlb too. */ + tlbie(va, psize, local); + + return ret; } -/* - * Only works on small pages. Yes its ugly to have to check each slot in - * the group but we only use this during bootup. - */ -static long native_hpte_find(unsigned long vpn) +static long native_hpte_find(unsigned long va, int psize) { hpte_t *hptep; unsigned long hash; unsigned long i, j; long slot; - unsigned long hpte_v; + unsigned long want_v, hpte_v; - hash = hpt_hash(vpn, 0); + hash = hpt_hash(va, mmu_psize_defs[psize].shift); + want_v = hpte_encode_v(va, psize); for (j = 0; j < 2; j++) { slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; @@ -161,7 +265,7 @@ static long native_hpte_find(unsigned long vpn) hptep = htab_address + slot; hpte_v = hptep->v; - if ((HPTE_V_AVPN_VAL(hpte_v) == (vpn >> 11)) + if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID) && ( !!(hpte_v & HPTE_V_SECONDARY) == j)) { /* HPTE matches */ @@ -177,127 +281,101 @@ static long native_hpte_find(unsigned long vpn) return -1; } -static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, - unsigned long va, int large, int local) -{ - hpte_t *hptep = htab_address + slot; - unsigned long hpte_v; - unsigned long avpn = va >> 23; - int ret = 0; - - if (large) - avpn &= ~1; - - native_lock_hpte(hptep); - - hpte_v = hptep->v; - - /* Even if we miss, we need to invalidate the TLB */ - if ((HPTE_V_AVPN_VAL(hpte_v) != avpn) - || !(hpte_v & HPTE_V_VALID)) { - native_unlock_hpte(hptep); - ret = -1; - } else { - set_pp_bit(newpp, hptep); - native_unlock_hpte(hptep); - } - - /* Ensure it is out of the tlb too */ - if (cpu_has_feature(CPU_FTR_TLBIEL) && !large && local) { - tlbiel(va); - } else { - int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE); - - if (lock_tlbie) - spin_lock(&native_tlbie_lock); - tlbie(va, large); - if (lock_tlbie) - spin_unlock(&native_tlbie_lock); - } - - return ret; -} - /* * Update the page protection bits. Intended to be used to create * guard pages for kernel data structures on pages which are bolted * in the HPT. Assumes pages being operated on will not be stolen. - * Does not work on large pages. * * No need to lock here because we should be the only user. */ -static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea) +static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, + int psize) { - unsigned long vsid, va, vpn, flags = 0; + unsigned long vsid, va; long slot; hpte_t *hptep; - int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE); vsid = get_kernel_vsid(ea); va = (vsid << 28) | (ea & 0x0fffffff); - vpn = va >> PAGE_SHIFT; - slot = native_hpte_find(vpn); + slot = native_hpte_find(va, psize); if (slot == -1) panic("could not find page to bolt\n"); hptep = htab_address + slot; - set_pp_bit(newpp, hptep); + /* Update the HPTE */ + hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) | + (newpp & (HPTE_R_PP | HPTE_R_N)); - /* Ensure it is out of the tlb too */ - if (lock_tlbie) - spin_lock_irqsave(&native_tlbie_lock, flags); - tlbie(va, 0); - if (lock_tlbie) - spin_unlock_irqrestore(&native_tlbie_lock, flags); + /* Ensure it is out of the tlb too. */ + tlbie(va, psize, 0); } static void native_hpte_invalidate(unsigned long slot, unsigned long va, - int large, int local) + int psize, int local) { hpte_t *hptep = htab_address + slot; unsigned long hpte_v; - unsigned long avpn = va >> 23; + unsigned long want_v; unsigned long flags; - int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE); - - if (large) - avpn &= ~1; local_irq_save(flags); - native_lock_hpte(hptep); + DBG_LOW(" invalidate(va=%016lx, hash: %x)\n", va, slot); + + want_v = hpte_encode_v(va, psize); + native_lock_hpte(hptep); hpte_v = hptep->v; /* Even if we miss, we need to invalidate the TLB */ - if ((HPTE_V_AVPN_VAL(hpte_v) != avpn) - || !(hpte_v & HPTE_V_VALID)) { + if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) native_unlock_hpte(hptep); - } else { + else /* Invalidate the hpte. NOTE: this also unlocks it */ hptep->v = 0; - } - /* Invalidate the tlb */ - if (cpu_has_feature(CPU_FTR_TLBIEL) && !large && local) { - tlbiel(va); - } else { - if (lock_tlbie) - spin_lock(&native_tlbie_lock); - tlbie(va, large); - if (lock_tlbie) - spin_unlock(&native_tlbie_lock); - } + /* Invalidate the TLB */ + tlbie(va, psize, local); + local_irq_restore(flags); } /* + * XXX This need fixing based on page size. It's only used by + * native_hpte_clear() for now which needs fixing too so they + * make a good pair... + */ +static unsigned long slot2va(unsigned long hpte_v, unsigned long slot) +{ + unsigned long avpn = HPTE_V_AVPN_VAL(hpte_v); + unsigned long va; + + va = avpn << 23; + + if (! (hpte_v & HPTE_V_LARGE)) { + unsigned long vpi, pteg; + + pteg = slot / HPTES_PER_GROUP; + if (hpte_v & HPTE_V_SECONDARY) + pteg = ~pteg; + + vpi = ((va >> 28) ^ pteg) & htab_hash_mask; + + va |= vpi << PAGE_SHIFT; + } + + return va; +} + +/* * clear all mappings on kexec. All cpus are in real mode (or they will * be when they isi), and we are the only one left. We rely on our kernel * mapping being 0xC0's and the hardware ignoring those two real bits. * * TODO: add batching support when enabled. remember, no dynamic memory here, * athough there is the control page available... + * + * XXX FIXME: 4k only for now ! */ static void native_hpte_clear(void) { @@ -327,7 +405,7 @@ static void native_hpte_clear(void) if (hpte_v & HPTE_V_VALID) { hptep->v = 0; - tlbie(slot2va(hpte_v, slot), hpte_v & HPTE_V_LARGE); + tlbie(slot2va(hpte_v, slot), MMU_PAGE_4K, 0); } } @@ -335,59 +413,59 @@ static void native_hpte_clear(void) local_irq_restore(flags); } +/* + * Batched hash table flush, we batch the tlbie's to avoid taking/releasing + * the lock all the time + */ static void native_flush_hash_range(unsigned long number, int local) { - unsigned long va, vpn, hash, secondary, slot, flags, avpn; - int i, j; + unsigned long va, hash, index, hidx, shift, slot; hpte_t *hptep; unsigned long hpte_v; + unsigned long want_v; + unsigned long flags; + real_pte_t pte; struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch); - unsigned long large = batch->large; + unsigned long psize = batch->psize; + int i; local_irq_save(flags); - j = 0; for (i = 0; i < number; i++) { - va = batch->vaddr[j]; - if (large) - vpn = va >> HPAGE_SHIFT; - else - vpn = va >> PAGE_SHIFT; - hash = hpt_hash(vpn, large); - secondary = (pte_val(batch->pte[i]) & _PAGE_SECONDARY) >> 15; - if (secondary) - hash = ~hash; - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += (pte_val(batch->pte[i]) & _PAGE_GROUP_IX) >> 12; - - hptep = htab_address + slot; - - avpn = va >> 23; - if (large) - avpn &= ~0x1UL; - - native_lock_hpte(hptep); - - hpte_v = hptep->v; - - /* Even if we miss, we need to invalidate the TLB */ - if ((HPTE_V_AVPN_VAL(hpte_v) != avpn) - || !(hpte_v & HPTE_V_VALID)) { - native_unlock_hpte(hptep); - } else { - /* Invalidate the hpte. NOTE: this also unlocks it */ - hptep->v = 0; - } - - j++; + va = batch->vaddr[i]; + pte = batch->pte[i]; + + pte_iterate_hashed_subpages(pte, psize, va, index, shift) { + hash = hpt_hash(va, shift); + hidx = __rpte_to_hidx(pte, index); + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += hidx & _PTEIDX_GROUP_IX; + hptep = htab_address + slot; + want_v = hpte_encode_v(va, psize); + native_lock_hpte(hptep); + hpte_v = hptep->v; + if (!HPTE_V_COMPARE(hpte_v, want_v) || + !(hpte_v & HPTE_V_VALID)) + native_unlock_hpte(hptep); + else + hptep->v = 0; + } pte_iterate_hashed_end(); } - if (cpu_has_feature(CPU_FTR_TLBIEL) && !large && local) { + if (cpu_has_feature(CPU_FTR_TLBIEL) && + mmu_psize_defs[psize].tlbiel && local) { asm volatile("ptesync":::"memory"); - - for (i = 0; i < j; i++) - __tlbiel(batch->vaddr[i]); - + for (i = 0; i < number; i++) { + va = batch->vaddr[i]; + pte = batch->pte[i]; + + pte_iterate_hashed_subpages(pte, psize, va, index, + shift) { + __tlbiel(va, psize); + } pte_iterate_hashed_end(); + } asm volatile("ptesync":::"memory"); } else { int lock_tlbie = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE); @@ -396,10 +474,15 @@ static void native_flush_hash_range(unsigned long number, int local) spin_lock(&native_tlbie_lock); asm volatile("ptesync":::"memory"); - - for (i = 0; i < j; i++) - __tlbie(batch->vaddr[i], large); - + for (i = 0; i < number; i++) { + va = batch->vaddr[i]; + pte = batch->pte[i]; + + pte_iterate_hashed_subpages(pte, psize, va, index, + shift) { + __tlbie(va, psize); + } pte_iterate_hashed_end(); + } asm volatile("eieio; tlbsync; ptesync":::"memory"); if (lock_tlbie) diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 6e9e05cce02..706e8a63ced 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -19,6 +19,7 @@ */ #undef DEBUG +#undef DEBUG_LOW #include <linux/config.h> #include <linux/spinlock.h> @@ -32,7 +33,6 @@ #include <linux/init.h> #include <linux/signal.h> -#include <asm/ppcdebug.h> #include <asm/processor.h> #include <asm/pgtable.h> #include <asm/mmu.h> @@ -59,6 +59,15 @@ #define DBG(fmt...) #endif +#ifdef DEBUG_LOW +#define DBG_LOW(fmt...) udbg_printf(fmt) +#else +#define DBG_LOW(fmt...) +#endif + +#define KB (1024) +#define MB (1024*KB) + /* * Note: pte --> Linux PTE * HPTE --> PowerPC Hashed Page Table Entry @@ -75,99 +84,302 @@ extern unsigned long dart_tablebase; #endif /* CONFIG_U3_DART */ +static unsigned long _SDR1; +struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; + hpte_t *htab_address; unsigned long htab_hash_mask; +int mmu_linear_psize = MMU_PAGE_4K; +int mmu_virtual_psize = MMU_PAGE_4K; +#ifdef CONFIG_HUGETLB_PAGE +int mmu_huge_psize = MMU_PAGE_16M; +unsigned int HPAGE_SHIFT; +#endif -unsigned long _SDR1; - -#define KB (1024) -#define MB (1024*KB) - -static inline void loop_forever(void) -{ - volatile unsigned long x = 1; - for(;x;x|=1) - ; -} +/* There are definitions of page sizes arrays to be used when none + * is provided by the firmware. + */ -static inline void create_pte_mapping(unsigned long start, unsigned long end, - unsigned long mode, int large) +/* Pre-POWER4 CPUs (4k pages only) + */ +struct mmu_psize_def mmu_psize_defaults_old[] = { + [MMU_PAGE_4K] = { + .shift = 12, + .sllp = 0, + .penc = 0, + .avpnm = 0, + .tlbiel = 0, + }, +}; + +/* POWER4, GPUL, POWER5 + * + * Support for 16Mb large pages + */ +struct mmu_psize_def mmu_psize_defaults_gp[] = { + [MMU_PAGE_4K] = { + .shift = 12, + .sllp = 0, + .penc = 0, + .avpnm = 0, + .tlbiel = 1, + }, + [MMU_PAGE_16M] = { + .shift = 24, + .sllp = SLB_VSID_L, + .penc = 0, + .avpnm = 0x1UL, + .tlbiel = 0, + }, +}; + + +int htab_bolt_mapping(unsigned long vstart, unsigned long vend, + unsigned long pstart, unsigned long mode, int psize) { - unsigned long addr; - unsigned int step; + unsigned long vaddr, paddr; + unsigned int step, shift; unsigned long tmp_mode; - unsigned long vflags; + int ret = 0; - if (large) { - step = 16*MB; - vflags = HPTE_V_BOLTED | HPTE_V_LARGE; - } else { - step = 4*KB; - vflags = HPTE_V_BOLTED; - } + shift = mmu_psize_defs[psize].shift; + step = 1 << shift; - for (addr = start; addr < end; addr += step) { + for (vaddr = vstart, paddr = pstart; vaddr < vend; + vaddr += step, paddr += step) { unsigned long vpn, hash, hpteg; - unsigned long vsid = get_kernel_vsid(addr); - unsigned long va = (vsid << 28) | (addr & 0xfffffff); - int ret = -1; - - if (large) - vpn = va >> HPAGE_SHIFT; - else - vpn = va >> PAGE_SHIFT; - + unsigned long vsid = get_kernel_vsid(vaddr); + unsigned long va = (vsid << 28) | (vaddr & 0x0fffffff); + vpn = va >> shift; tmp_mode = mode; /* Make non-kernel text non-executable */ - if (!in_kernel_text(addr)) - tmp_mode = mode | HW_NO_EXEC; - - hash = hpt_hash(vpn, large); + if (!in_kernel_text(vaddr)) + tmp_mode = mode | HPTE_R_N; + hash = hpt_hash(va, shift); hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); + /* The crap below can be cleaned once ppd_md.probe() can + * set up the hash callbacks, thus we can just used the + * normal insert callback here. + */ #ifdef CONFIG_PPC_ISERIES - if (systemcfg->platform & PLATFORM_ISERIES_LPAR) - ret = iSeries_hpte_bolt_or_insert(hpteg, va, - virt_to_abs(addr) >> PAGE_SHIFT, - vflags, tmp_mode); + if (_machine == PLATFORM_ISERIES_LPAR) + ret = iSeries_hpte_insert(hpteg, va, + virt_to_abs(paddr), + tmp_mode, + HPTE_V_BOLTED, + psize); else #endif #ifdef CONFIG_PPC_PSERIES - if (systemcfg->platform & PLATFORM_LPAR) + if (_machine & PLATFORM_LPAR) ret = pSeries_lpar_hpte_insert(hpteg, va, - virt_to_abs(addr) >> PAGE_SHIFT, - vflags, tmp_mode); + virt_to_abs(paddr), + tmp_mode, + HPTE_V_BOLTED, + psize); else #endif #ifdef CONFIG_PPC_MULTIPLATFORM ret = native_hpte_insert(hpteg, va, - virt_to_abs(addr) >> PAGE_SHIFT, - vflags, tmp_mode); + virt_to_abs(paddr), + tmp_mode, HPTE_V_BOLTED, + psize); #endif + if (ret < 0) + break; + } + return ret < 0 ? ret : 0; +} - if (ret == -1) { - ppc64_terminate_msg(0x20, "create_pte_mapping"); - loop_forever(); +static int __init htab_dt_scan_page_sizes(unsigned long node, + const char *uname, int depth, + void *data) +{ + char *type = of_get_flat_dt_prop(node, "device_type", NULL); + u32 *prop; + unsigned long size = 0; + + /* We are scanning "cpu" nodes only */ + if (type == NULL || strcmp(type, "cpu") != 0) + return 0; + + prop = (u32 *)of_get_flat_dt_prop(node, + "ibm,segment-page-sizes", &size); + if (prop != NULL) { + DBG("Page sizes from device-tree:\n"); + size /= 4; + cur_cpu_spec->cpu_features &= ~(CPU_FTR_16M_PAGE); + while(size > 0) { + unsigned int shift = prop[0]; + unsigned int slbenc = prop[1]; + unsigned int lpnum = prop[2]; + unsigned int lpenc = 0; + struct mmu_psize_def *def; + int idx = -1; + + size -= 3; prop += 3; + while(size > 0 && lpnum) { + if (prop[0] == shift) + lpenc = prop[1]; + prop += 2; size -= 2; + lpnum--; + } + switch(shift) { + case 0xc: + idx = MMU_PAGE_4K; + break; + case 0x10: + idx = MMU_PAGE_64K; + break; + case 0x14: + idx = MMU_PAGE_1M; + break; + case 0x18: + idx = MMU_PAGE_16M; + cur_cpu_spec->cpu_features |= CPU_FTR_16M_PAGE; + break; + case 0x22: + idx = MMU_PAGE_16G; + break; + } + if (idx < 0) + continue; + def = &mmu_psize_defs[idx]; + def->shift = shift; + if (shift <= 23) + def->avpnm = 0; + else + def->avpnm = (1 << (shift - 23)) - 1; + def->sllp = slbenc; + def->penc = lpenc; + /* We don't know for sure what's up with tlbiel, so + * for now we only set it for 4K and 64K pages + */ + if (idx == MMU_PAGE_4K || idx == MMU_PAGE_64K) + def->tlbiel = 1; + else + def->tlbiel = 0; + + DBG(" %d: shift=%02x, sllp=%04x, avpnm=%08x, " + "tlbiel=%d, penc=%d\n", + idx, shift, def->sllp, def->avpnm, def->tlbiel, + def->penc); } + return 1; + } + return 0; +} + + +static void __init htab_init_page_sizes(void) +{ + int rc; + + /* Default to 4K pages only */ + memcpy(mmu_psize_defs, mmu_psize_defaults_old, + sizeof(mmu_psize_defaults_old)); + + /* + * Try to find the available page sizes in the device-tree + */ + rc = of_scan_flat_dt(htab_dt_scan_page_sizes, NULL); + if (rc != 0) /* Found */ + goto found; + + /* + * Not in the device-tree, let's fallback on known size + * list for 16M capable GP & GR + */ + if ((_machine != PLATFORM_ISERIES_LPAR) && + cpu_has_feature(CPU_FTR_16M_PAGE)) + memcpy(mmu_psize_defs, mmu_psize_defaults_gp, + sizeof(mmu_psize_defaults_gp)); + found: + /* + * Pick a size for the linear mapping. Currently, we only support + * 16M, 1M and 4K which is the default + */ + if (mmu_psize_defs[MMU_PAGE_16M].shift) + mmu_linear_psize = MMU_PAGE_16M; + else if (mmu_psize_defs[MMU_PAGE_1M].shift) + mmu_linear_psize = MMU_PAGE_1M; + + /* + * Pick a size for the ordinary pages. Default is 4K, we support + * 64K if cache inhibited large pages are supported by the + * processor + */ +#ifdef CONFIG_PPC_64K_PAGES + if (mmu_psize_defs[MMU_PAGE_64K].shift && + cpu_has_feature(CPU_FTR_CI_LARGE_PAGE)) + mmu_virtual_psize = MMU_PAGE_64K; +#endif + + printk(KERN_INFO "Page orders: linear mapping = %d, others = %d\n", + mmu_psize_defs[mmu_linear_psize].shift, + mmu_psize_defs[mmu_virtual_psize].shift); + +#ifdef CONFIG_HUGETLB_PAGE + /* Init large page size. Currently, we pick 16M or 1M depending + * on what is available + */ + if (mmu_psize_defs[MMU_PAGE_16M].shift) + mmu_huge_psize = MMU_PAGE_16M; + /* With 4k/4level pagetables, we can't (for now) cope with a + * huge page size < PMD_SIZE */ + else if (mmu_psize_defs[MMU_PAGE_1M].shift) + mmu_huge_psize = MMU_PAGE_1M; + + /* Calculate HPAGE_SHIFT and sanity check it */ + if (mmu_psize_defs[mmu_huge_psize].shift > MIN_HUGEPTE_SHIFT && + mmu_psize_defs[mmu_huge_psize].shift < SID_SHIFT) + HPAGE_SHIFT = mmu_psize_defs[mmu_huge_psize].shift; + else + HPAGE_SHIFT = 0; /* No huge pages dude ! */ +#endif /* CONFIG_HUGETLB_PAGE */ +} + +static int __init htab_dt_scan_pftsize(unsigned long node, + const char *uname, int depth, + void *data) +{ + char *type = of_get_flat_dt_prop(node, "device_type", NULL); + u32 *prop; + + /* We are scanning "cpu" nodes only */ + if (type == NULL || strcmp(type, "cpu") != 0) + return 0; + + prop = (u32 *)of_get_flat_dt_prop(node, "ibm,pft-size", NULL); + if (prop != NULL) { + /* pft_size[0] is the NUMA CEC cookie */ + ppc64_pft_size = prop[1]; + return 1; } + return 0; } -static unsigned long get_hashtable_size(void) +static unsigned long __init htab_get_table_size(void) { - unsigned long rnd_mem_size, pteg_count; + unsigned long mem_size, rnd_mem_size, pteg_count; - /* If hash size wasn't obtained in prom.c, we calculate it now based on - * the total RAM size + /* If hash size isn't already provided by the platform, we try to + * retreive it from the device-tree. If it's not there neither, we + * calculate it now based on the total RAM size */ + if (ppc64_pft_size == 0) + of_scan_flat_dt(htab_dt_scan_pftsize, NULL); if (ppc64_pft_size) return 1UL << ppc64_pft_size; /* round mem_size up to next power of 2 */ - rnd_mem_size = 1UL << __ilog2(systemcfg->physicalMemorySize); - if (rnd_mem_size < systemcfg->physicalMemorySize) + mem_size = lmb_phys_mem_size(); + rnd_mem_size = 1UL << __ilog2(mem_size); + if (rnd_mem_size < mem_size) rnd_mem_size <<= 1; /* # pages / 2 */ @@ -176,33 +388,40 @@ static unsigned long get_hashtable_size(void) return pteg_count << 7; } +#ifdef CONFIG_MEMORY_HOTPLUG +void create_section_mapping(unsigned long start, unsigned long end) +{ + BUG_ON(htab_bolt_mapping(start, end, start, + _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_COHERENT | PP_RWXX, + mmu_linear_psize)); +} +#endif /* CONFIG_MEMORY_HOTPLUG */ + void __init htab_initialize(void) { unsigned long table, htab_size_bytes; unsigned long pteg_count; unsigned long mode_rw; - int i, use_largepages = 0; unsigned long base = 0, size = 0; + int i; + extern unsigned long tce_alloc_start, tce_alloc_end; DBG(" -> htab_initialize()\n"); + /* Initialize page sizes */ + htab_init_page_sizes(); + /* * Calculate the required size of the htab. We want the number of * PTEGs to equal one half the number of real pages. */ - htab_size_bytes = get_hashtable_size(); + htab_size_bytes = htab_get_table_size(); pteg_count = htab_size_bytes >> 7; - /* For debug, make the HTAB 1/8 as big as it normally would be. */ - ifppcdebug(PPCDBG_HTABSIZE) { - pteg_count >>= 3; - htab_size_bytes = pteg_count << 7; - } - htab_hash_mask = pteg_count - 1; - if (systemcfg->platform & PLATFORM_LPAR) { + if (platform_is_lpar()) { /* Using a hypervisor which owns the htab */ htab_address = NULL; _SDR1 = 0; @@ -211,14 +430,11 @@ void __init htab_initialize(void) * the absolute address space. */ table = lmb_alloc(htab_size_bytes, htab_size_bytes); + BUG_ON(table == 0); DBG("Hash table allocated at %lx, size: %lx\n", table, htab_size_bytes); - if ( !table ) { - ppc64_terminate_msg(0x20, "hpt space"); - loop_forever(); - } htab_address = abs_to_virt(table); /* htab absolute addr + encoded htabsize */ @@ -226,6 +442,9 @@ void __init htab_initialize(void) /* Initialize the HPT with no entries */ memset((void *)table, 0, htab_size_bytes); + + /* Set SDR1 */ + mtspr(SPRN_SDR1, _SDR1); } mode_rw = _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_COHERENT | PP_RWXX; @@ -234,8 +453,6 @@ void __init htab_initialize(void) * _NOT_ map it to avoid cache paradoxes as it's remapped non * cacheable later on */ - if (cpu_has_feature(CPU_FTR_16M_PAGE)) - use_largepages = 1; /* create bolted the linear mapping in the hash table */ for (i=0; i < lmb.memory.cnt; i++) { @@ -246,27 +463,32 @@ void __init htab_initialize(void) #ifdef CONFIG_U3_DART /* Do not map the DART space. Fortunately, it will be aligned - * in such a way that it will not cross two lmb regions and will - * fit within a single 16Mb page. - * The DART space is assumed to be a full 16Mb region even if we - * only use 2Mb of that space. We will use more of it later for - * AGP GART. We have to use a full 16Mb large page. + * in such a way that it will not cross two lmb regions and + * will fit within a single 16Mb page. + * The DART space is assumed to be a full 16Mb region even if + * we only use 2Mb of that space. We will use more of it later + * for AGP GART. We have to use a full 16Mb large page. */ DBG("DART base: %lx\n", dart_tablebase); if (dart_tablebase != 0 && dart_tablebase >= base && dart_tablebase < (base + size)) { if (base != dart_tablebase) - create_pte_mapping(base, dart_tablebase, mode_rw, - use_largepages); + BUG_ON(htab_bolt_mapping(base, dart_tablebase, + base, mode_rw, + mmu_linear_psize)); if ((base + size) > (dart_tablebase + 16*MB)) - create_pte_mapping(dart_tablebase + 16*MB, base + size, - mode_rw, use_largepages); + BUG_ON(htab_bolt_mapping(dart_tablebase+16*MB, + base + size, + dart_tablebase+16*MB, + mode_rw, + mmu_linear_psize)); continue; } #endif /* CONFIG_U3_DART */ - create_pte_mapping(base, base + size, mode_rw, use_largepages); - } + BUG_ON(htab_bolt_mapping(base, base + size, base, + mode_rw, mmu_linear_psize)); + } /* * If we have a memory_limit and we've allocated TCEs then we need to @@ -282,8 +504,9 @@ void __init htab_initialize(void) if (base + size >= tce_alloc_start) tce_alloc_start = base + size + 1; - create_pte_mapping(tce_alloc_start, tce_alloc_end, - mode_rw, use_largepages); + BUG_ON(htab_bolt_mapping(tce_alloc_start, tce_alloc_end, + tce_alloc_start, mode_rw, + mmu_linear_psize)); } DBG(" <- htab_initialize()\n"); @@ -291,6 +514,12 @@ void __init htab_initialize(void) #undef KB #undef MB +void __init htab_initialize_secondary(void) +{ + if (!platform_is_lpar()) + mtspr(SPRN_SDR1, _SDR1); +} + /* * Called by asm hashtable.S for doing lazy icache flush */ @@ -309,7 +538,7 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap) __flush_dcache_icache(page_address(page)); set_bit(PG_arch_1, &page->flags); } else - pp |= HW_NO_EXEC; + pp |= HPTE_R_N; } return pp; } @@ -325,94 +554,169 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap) unsigned long vsid; struct mm_struct *mm; pte_t *ptep; - int ret; - int user_region = 0; - int local = 0; cpumask_t tmp; + int rc, user_region = 0, local = 0; - if ((ea & ~REGION_MASK) >= PGTABLE_RANGE) - return 1; + DBG_LOW("hash_page(ea=%016lx, access=%lx, trap=%lx\n", + ea, access, trap); + if ((ea & ~REGION_MASK) >= PGTABLE_RANGE) { + DBG_LOW(" out of pgtable range !\n"); + return 1; + } + + /* Get region & vsid */ switch (REGION_ID(ea)) { case USER_REGION_ID: user_region = 1; mm = current->mm; - if (! mm) + if (! mm) { + DBG_LOW(" user region with no mm !\n"); return 1; - + } vsid = get_vsid(mm->context.id, ea); break; case VMALLOC_REGION_ID: mm = &init_mm; vsid = get_kernel_vsid(ea); break; -#if 0 - case KERNEL_REGION_ID: - /* - * Should never get here - entire 0xC0... region is bolted. - * Send the problem up to do_page_fault - */ -#endif default: /* Not a valid range * Send the problem up to do_page_fault */ return 1; - break; } + DBG_LOW(" mm=%p, mm->pgdir=%p, vsid=%016lx\n", mm, mm->pgd, vsid); + /* Get pgdir */ pgdir = mm->pgd; - if (pgdir == NULL) return 1; + /* Check CPU locality */ tmp = cpumask_of_cpu(smp_processor_id()); if (user_region && cpus_equal(mm->cpu_vm_mask, tmp)) local = 1; - /* Is this a huge page ? */ - if (unlikely(in_hugepage_area(mm->context, ea))) - ret = hash_huge_page(mm, access, ea, vsid, local); - else { - ptep = find_linux_pte(pgdir, ea); - if (ptep == NULL) - return 1; - ret = __hash_page(ea, access, vsid, ptep, trap, local); + /* Handle hugepage regions */ + if (unlikely(in_hugepage_area(mm->context, ea))) { + DBG_LOW(" -> huge page !\n"); + return hash_huge_page(mm, access, ea, vsid, local); + } + + /* Get PTE and page size from page tables */ + ptep = find_linux_pte(pgdir, ea); + if (ptep == NULL || !pte_present(*ptep)) { + DBG_LOW(" no PTE !\n"); + return 1; + } + +#ifndef CONFIG_PPC_64K_PAGES + DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep)); +#else + DBG_LOW(" i-pte: %016lx %016lx\n", pte_val(*ptep), + pte_val(*(ptep + PTRS_PER_PTE))); +#endif + /* Pre-check access permissions (will be re-checked atomically + * in __hash_page_XX but this pre-check is a fast path + */ + if (access & ~pte_val(*ptep)) { + DBG_LOW(" no access !\n"); + return 1; } - return ret; + /* Do actual hashing */ +#ifndef CONFIG_PPC_64K_PAGES + rc = __hash_page_4K(ea, access, vsid, ptep, trap, local); +#else + if (mmu_virtual_psize == MMU_PAGE_64K) + rc = __hash_page_64K(ea, access, vsid, ptep, trap, local); + else + rc = __hash_page_4K(ea, access, vsid, ptep, trap, local); +#endif /* CONFIG_PPC_64K_PAGES */ + +#ifndef CONFIG_PPC_64K_PAGES + DBG_LOW(" o-pte: %016lx\n", pte_val(*ptep)); +#else + DBG_LOW(" o-pte: %016lx %016lx\n", pte_val(*ptep), + pte_val(*(ptep + PTRS_PER_PTE))); +#endif + DBG_LOW(" -> rc=%d\n", rc); + return rc; } -void flush_hash_page(unsigned long va, pte_t pte, int local) +void hash_preload(struct mm_struct *mm, unsigned long ea, + unsigned long access, unsigned long trap) { - unsigned long vpn, hash, secondary, slot; - unsigned long huge = pte_huge(pte); + unsigned long vsid; + void *pgdir; + pte_t *ptep; + cpumask_t mask; + unsigned long flags; + int local = 0; + + /* We don't want huge pages prefaulted for now + */ + if (unlikely(in_hugepage_area(mm->context, ea))) + return; + + DBG_LOW("hash_preload(mm=%p, mm->pgdir=%p, ea=%016lx, access=%lx," + " trap=%lx\n", mm, mm->pgd, ea, access, trap); + + /* Get PTE, VSID, access mask */ + pgdir = mm->pgd; + if (pgdir == NULL) + return; + ptep = find_linux_pte(pgdir, ea); + if (!ptep) + return; + vsid = get_vsid(mm->context.id, ea); - if (huge) - vpn = va >> HPAGE_SHIFT; + /* Hash it in */ + local_irq_save(flags); + mask = cpumask_of_cpu(smp_processor_id()); + if (cpus_equal(mm->cpu_vm_mask, mask)) + local = 1; +#ifndef CONFIG_PPC_64K_PAGES + __hash_page_4K(ea, access, vsid, ptep, trap, local); +#else + if (mmu_virtual_psize == MMU_PAGE_64K) + __hash_page_64K(ea, access, vsid, ptep, trap, local); else - vpn = va >> PAGE_SHIFT; - hash = hpt_hash(vpn, huge); - secondary = (pte_val(pte) & _PAGE_SECONDARY) >> 15; - if (secondary) - hash = ~hash; - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += (pte_val(pte) & _PAGE_GROUP_IX) >> 12; - - ppc_md.hpte_invalidate(slot, va, huge, local); + __hash_page_4K(ea, access, vsid, ptep, trap, local); +#endif /* CONFIG_PPC_64K_PAGES */ + local_irq_restore(flags); +} + +void flush_hash_page(unsigned long va, real_pte_t pte, int psize, int local) +{ + unsigned long hash, index, shift, hidx, slot; + + DBG_LOW("flush_hash_page(va=%016x)\n", va); + pte_iterate_hashed_subpages(pte, psize, va, index, shift) { + hash = hpt_hash(va, shift); + hidx = __rpte_to_hidx(pte, index); + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += hidx & _PTEIDX_GROUP_IX; + DBG_LOW(" sub %d: hash=%x, hidx=%x\n", index, slot, hidx); + ppc_md.hpte_invalidate(slot, va, psize, local); + } pte_iterate_hashed_end(); } void flush_hash_range(unsigned long number, int local) { - if (ppc_md.flush_hash_range) { + if (ppc_md.flush_hash_range) ppc_md.flush_hash_range(number, local); - } else { + else { int i; struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch); for (i = 0; i < number; i++) - flush_hash_page(batch->vaddr[i], batch->pte[i], local); + flush_hash_page(batch->vaddr[i], batch->pte[i], + batch->psize, local); } } @@ -452,6 +756,18 @@ void __init htab_finish_init(void) extern unsigned int *htab_call_hpte_remove; extern unsigned int *htab_call_hpte_updatepp; +#ifdef CONFIG_PPC_64K_PAGES + extern unsigned int *ht64_call_hpte_insert1; + extern unsigned int *ht64_call_hpte_insert2; + extern unsigned int *ht64_call_hpte_remove; + extern unsigned int *ht64_call_hpte_updatepp; + + make_bl(ht64_call_hpte_insert1, ppc_md.hpte_insert); + make_bl(ht64_call_hpte_insert2, ppc_md.hpte_insert); + make_bl(ht64_call_hpte_remove, ppc_md.hpte_remove); + make_bl(ht64_call_hpte_updatepp, ppc_md.hpte_updatepp); +#endif /* CONFIG_PPC_64K_PAGES */ + make_bl(htab_call_hpte_insert1, ppc_md.hpte_insert); make_bl(htab_call_hpte_insert2, ppc_md.hpte_insert); make_bl(htab_call_hpte_remove, ppc_md.hpte_remove); diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 0ea0994ed97..426c269e552 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -47,10 +47,25 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) pu = pud_offset(pg, addr); if (!pud_none(*pu)) { pm = pmd_offset(pu, addr); +#ifdef CONFIG_PPC_64K_PAGES + /* Currently, we use the normal PTE offset within full + * size PTE pages, thus our huge PTEs are scattered in + * the PTE page and we do waste some. We may change + * that in the future, but the current mecanism keeps + * things much simpler + */ + if (!pmd_none(*pm)) { + /* Note: pte_offset_* are all equivalent on + * ppc64 as we don't have HIGHMEM + */ + pt = pte_offset_kernel(pm, addr); + return pt; + } +#else /* CONFIG_PPC_64K_PAGES */ + /* On 4k pages, we put huge PTEs in the PMD page */ pt = (pte_t *)pm; - BUG_ON(!pmd_none(*pm) - && !(pte_present(*pt) && pte_huge(*pt))); return pt; +#endif /* CONFIG_PPC_64K_PAGES */ } } @@ -74,9 +89,16 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) if (pu) { pm = pmd_alloc(mm, pu, addr); if (pm) { +#ifdef CONFIG_PPC_64K_PAGES + /* See comment in huge_pte_offset. Note that if we ever + * want to put the page size in the PMD, we would have + * to open code our own pte_alloc* function in order + * to populate and set the size atomically + */ + pt = pte_alloc_map(mm, pm, addr); +#else /* CONFIG_PPC_64K_PAGES */ pt = (pte_t *)pm; - BUG_ON(!pmd_none(*pm) - && !(pte_present(*pt) && pte_huge(*pt))); +#endif /* CONFIG_PPC_64K_PAGES */ return pt; } } @@ -84,35 +106,29 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) return NULL; } -#define HUGEPTE_BATCH_SIZE (HPAGE_SIZE / PMD_SIZE) - void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { - int i; - if (pte_present(*ptep)) { - pte_clear(mm, addr, ptep); + /* We open-code pte_clear because we need to pass the right + * argument to hpte_update (huge / !huge) + */ + unsigned long old = pte_update(ptep, ~0UL); + if (old & _PAGE_HASHPTE) + hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1); flush_tlb_pending(); } - - for (i = 0; i < HUGEPTE_BATCH_SIZE; i++) { - *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS); - ptep++; - } + *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS); } pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { unsigned long old = pte_update(ptep, ~0UL); - int i; if (old & _PAGE_HASHPTE) - hpte_update(mm, addr, old, 0); - - for (i = 1; i < HUGEPTE_BATCH_SIZE; i++) - ptep[i] = __pte(0); + hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1); + *ptep = __pte(0); return __pte(old); } @@ -196,6 +212,12 @@ static int prepare_high_area_for_htlb(struct mm_struct *mm, unsigned long area) BUG_ON(area >= NUM_HIGH_AREAS); + /* Hack, so that each addresses is controlled by exactly one + * of the high or low area bitmaps, the first high area starts + * at 4GB, not 0 */ + if (start == 0) + start = 0x100000000UL; + /* Check no VMAs are in the region */ vma = find_vma(mm, start); if (vma && (vma->vm_start < end)) @@ -563,6 +585,8 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, int lastshift; u16 areamask, curareas; + if (HPAGE_SHIFT == 0) + return -EINVAL; if (len & ~HPAGE_MASK) return -EINVAL; @@ -619,19 +643,15 @@ int hash_huge_page(struct mm_struct *mm, unsigned long access, unsigned long ea, unsigned long vsid, int local) { pte_t *ptep; - unsigned long va, vpn; - pte_t old_pte, new_pte; - unsigned long rflags, prpn; + unsigned long old_pte, new_pte; + unsigned long va, rflags, pa; long slot; int err = 1; - spin_lock(&mm->page_table_lock); - ptep = huge_pte_offset(mm, ea); /* Search the Linux page table for a match with va */ va = (vsid << 28) | (ea & 0x0fffffff); - vpn = va >> HPAGE_SHIFT; /* * If no pte found or not present, send the problem up to @@ -640,8 +660,6 @@ int hash_huge_page(struct mm_struct *mm, unsigned long access, if (unlikely(!ptep || pte_none(*ptep))) goto out; -/* BUG_ON(pte_bad(*ptep)); */ - /* * Check the user's access rights to the page. If access should be * prevented then send the problem up to do_page_fault. @@ -661,58 +679,64 @@ int hash_huge_page(struct mm_struct *mm, unsigned long access, */ - old_pte = *ptep; - new_pte = old_pte; - - rflags = 0x2 | (! (pte_val(new_pte) & _PAGE_RW)); + do { + old_pte = pte_val(*ptep); + if (old_pte & _PAGE_BUSY) + goto out; + new_pte = old_pte | _PAGE_BUSY | + _PAGE_ACCESSED | _PAGE_HASHPTE; + } while(old_pte != __cmpxchg_u64((unsigned long *)ptep, + old_pte, new_pte)); + + rflags = 0x2 | (!(new_pte & _PAGE_RW)); /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */ - rflags |= ((pte_val(new_pte) & _PAGE_EXEC) ? 0 : HW_NO_EXEC); + rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N); /* Check if pte already has an hpte (case 2) */ - if (unlikely(pte_val(old_pte) & _PAGE_HASHPTE)) { + if (unlikely(old_pte & _PAGE_HASHPTE)) { /* There MIGHT be an HPTE for this pte */ unsigned long hash, slot; - hash = hpt_hash(vpn, 1); - if (pte_val(old_pte) & _PAGE_SECONDARY) + hash = hpt_hash(va, HPAGE_SHIFT); + if (old_pte & _PAGE_F_SECOND) hash = ~hash; slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += (pte_val(old_pte) & _PAGE_GROUP_IX) >> 12; + slot += (old_pte & _PAGE_F_GIX) >> 12; if (ppc_md.hpte_updatepp(slot, rflags, va, 1, local) == -1) - pte_val(old_pte) &= ~_PAGE_HPTEFLAGS; + old_pte &= ~_PAGE_HPTEFLAGS; } - if (likely(!(pte_val(old_pte) & _PAGE_HASHPTE))) { - unsigned long hash = hpt_hash(vpn, 1); + if (likely(!(old_pte & _PAGE_HASHPTE))) { + unsigned long hash = hpt_hash(va, HPAGE_SHIFT); unsigned long hpte_group; - prpn = pte_pfn(old_pte); + pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; repeat: hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL; - /* Update the linux pte with the HPTE slot */ - pte_val(new_pte) &= ~_PAGE_HPTEFLAGS; - pte_val(new_pte) |= _PAGE_HASHPTE; + /* clear HPTE slot informations in new PTE */ + new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE; /* Add in WIMG bits */ /* XXX We should store these in the pte */ + /* --BenH: I think they are ... */ rflags |= _PAGE_COHERENT; - slot = ppc_md.hpte_insert(hpte_group, va, prpn, - HPTE_V_LARGE, rflags); + /* Insert into the hash table, primary slot */ + slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0, + mmu_huge_psize); /* Primary is full, try the secondary */ if (unlikely(slot == -1)) { - pte_val(new_pte) |= _PAGE_SECONDARY; + new_pte |= _PAGE_F_SECOND; hpte_group = ((~hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL; - slot = ppc_md.hpte_insert(hpte_group, va, prpn, - HPTE_V_LARGE | + slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, HPTE_V_SECONDARY, - rflags); + mmu_huge_psize); if (slot == -1) { if (mftb() & 0x1) hpte_group = ((hash & htab_hash_mask) * @@ -726,20 +750,18 @@ repeat: if (unlikely(slot == -2)) panic("hash_huge_page: pte_insert failed\n"); - pte_val(new_pte) |= (slot<<12) & _PAGE_GROUP_IX; - - /* - * No need to use ldarx/stdcx here because all who - * might be updating the pte will hold the - * page_table_lock - */ - *ptep = new_pte; + new_pte |= (slot << 12) & _PAGE_F_GIX; } + /* + * No need to use ldarx/stdcx here because all who + * might be updating the pte will hold the + * page_table_lock + */ + *ptep = __pte(new_pte & ~_PAGE_BUSY); + err = 0; out: - spin_unlock(&mm->page_table_lock); - return err; } diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index 4612a79dfb6..7d4b8b5f060 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -84,9 +84,6 @@ void MMU_init(void); /* XXX should be in current.h -- paulus */ extern struct task_struct *current_set[NR_CPUS]; -char *klimit = _end; -struct device_node *memory_node; - extern int init_bootmem_done; /* diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index b0fc822ec29..1134f70f231 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -20,6 +20,8 @@ * */ +#undef DEBUG + #include <linux/config.h> #include <linux/signal.h> #include <linux/sched.h> @@ -57,7 +59,6 @@ #include <asm/processor.h> #include <asm/mmzone.h> #include <asm/cputable.h> -#include <asm/ppcdebug.h> #include <asm/sections.h> #include <asm/system.h> #include <asm/iommu.h> @@ -65,6 +66,12 @@ #include <asm/vdso.h> #include <asm/imalloc.h> +#ifdef DEBUG +#define DBG(fmt...) printk(fmt) +#else +#define DBG(fmt...) +#endif + #if PGTABLE_RANGE > USER_VSID_RANGE #warning Limited user VSID range means pagetable space is wasted #endif @@ -73,8 +80,6 @@ #warning TASK_SIZE is smaller than it needs to be. #endif -unsigned long klimit = (unsigned long)_end; - /* max amount of RAM to use */ unsigned long __max_memory; @@ -188,12 +193,21 @@ static void zero_ctor(void *addr, kmem_cache_t *cache, unsigned long flags) memset(addr, 0, kmem_cache_size(cache)); } -static const int pgtable_cache_size[2] = { +#ifdef CONFIG_PPC_64K_PAGES +static const unsigned int pgtable_cache_size[3] = { + PTE_TABLE_SIZE, PMD_TABLE_SIZE, PGD_TABLE_SIZE +}; +static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = { + "pte_pmd_cache", "pmd_cache", "pgd_cache", +}; +#else +static const unsigned int pgtable_cache_size[2] = { PTE_TABLE_SIZE, PMD_TABLE_SIZE }; static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = { "pgd_pte_cache", "pud_pmd_cache", }; +#endif /* CONFIG_PPC_64K_PAGES */ kmem_cache_t *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)]; @@ -201,19 +215,16 @@ void pgtable_cache_init(void) { int i; - BUILD_BUG_ON(PTE_TABLE_SIZE != pgtable_cache_size[PTE_CACHE_NUM]); - BUILD_BUG_ON(PMD_TABLE_SIZE != pgtable_cache_size[PMD_CACHE_NUM]); - BUILD_BUG_ON(PUD_TABLE_SIZE != pgtable_cache_size[PUD_CACHE_NUM]); - BUILD_BUG_ON(PGD_TABLE_SIZE != pgtable_cache_size[PGD_CACHE_NUM]); - for (i = 0; i < ARRAY_SIZE(pgtable_cache_size); i++) { int size = pgtable_cache_size[i]; const char *name = pgtable_cache_name[i]; + DBG("Allocating page table cache %s (#%d) " + "for size: %08x...\n", name, i, size); pgtable_cache[i] = kmem_cache_create(name, size, size, - SLAB_HWCACHE_ALIGN - | SLAB_MUST_HWCACHE_ALIGN, + SLAB_HWCACHE_ALIGN | + SLAB_MUST_HWCACHE_ALIGN, zero_ctor, NULL); if (! pgtable_cache[i]) diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 117b00012e1..e2c95fcb805 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -46,9 +46,7 @@ #include <asm/prom.h> #include <asm/lmb.h> #include <asm/sections.h> -#ifdef CONFIG_PPC64 #include <asm/vdso.h> -#endif #include "mmu_decl.h" @@ -61,6 +59,9 @@ int init_bootmem_done; int mem_init_done; unsigned long memory_limit; +extern void hash_preload(struct mm_struct *mm, unsigned long ea, + unsigned long access, unsigned long trap); + /* * This is called by /dev/mem to know if a given address has to * be mapped non-cacheable or not @@ -107,6 +108,7 @@ EXPORT_SYMBOL(phys_mem_access_prot); void online_page(struct page *page) { ClearPageReserved(page); + set_page_count(page, 0); free_cold_page(page); totalram_pages++; num_physpages++; @@ -124,6 +126,9 @@ int __devinit add_memory(u64 start, u64 size) unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; + start += KERNELBASE; + create_section_mapping(start, start + size); + /* this should work for most non-highmem platforms */ zone = pgdata->node_zones; @@ -355,7 +360,7 @@ void __init mem_init(void) } codesize = (unsigned long)&_sdata - (unsigned long)&_stext; - datasize = (unsigned long)&__init_begin - (unsigned long)&_sdata; + datasize = (unsigned long)&_edata - (unsigned long)&_sdata; initsize = (unsigned long)&__init_end - (unsigned long)&__init_begin; bsssize = (unsigned long)&__bss_stop - (unsigned long)&__bss_start; @@ -390,10 +395,8 @@ void __init mem_init(void) mem_init_done = 1; -#ifdef CONFIG_PPC64 /* Initialize the vDSO */ vdso_init(); -#endif } /* @@ -493,18 +496,10 @@ EXPORT_SYMBOL(flush_icache_user_range); void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t pte) { - /* handle i-cache coherency */ - unsigned long pfn = pte_pfn(pte); -#ifdef CONFIG_PPC32 - pmd_t *pmd; -#else - unsigned long vsid; - void *pgdir; - pte_t *ptep; - int local = 0; - cpumask_t tmp; - unsigned long flags; +#ifdef CONFIG_PPC_STD_MMU + unsigned long access = 0, trap; #endif + unsigned long pfn = pte_pfn(pte); /* handle i-cache coherency */ if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE) && @@ -535,30 +530,21 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ if (!pte_young(pte) || address >= TASK_SIZE) return; -#ifdef CONFIG_PPC32 - if (Hash == 0) - return; - pmd = pmd_offset(pgd_offset(vma->vm_mm, address), address); - if (!pmd_none(*pmd)) - add_hash_page(vma->vm_mm->context, address, pmd_val(*pmd)); -#else - pgdir = vma->vm_mm->pgd; - if (pgdir == NULL) - return; - ptep = find_linux_pte(pgdir, address); - if (!ptep) + /* We try to figure out if we are coming from an instruction + * access fault and pass that down to __hash_page so we avoid + * double-faulting on execution of fresh text. We have to test + * for regs NULL since init will get here first thing at boot + * + * We also avoid filling the hash if not coming from a fault + */ + if (current->thread.regs == NULL) return; - - vsid = get_vsid(vma->vm_mm->context.id, address); - - local_irq_save(flags); - tmp = cpumask_of_cpu(smp_processor_id()); - if (cpus_equal(vma->vm_mm->cpu_vm_mask, tmp)) - local = 1; - - __hash_page(address, 0, vsid, ptep, 0x300, local); - local_irq_restore(flags); -#endif -#endif + trap = TRAP(current->thread.regs); + if (trap == 0x400) + access |= _PAGE_EXEC; + else if (trap != 0x300) + return; + hash_preload(vma->vm_mm, address, access, trap); +#endif /* CONFIG_PPC_STD_MMU */ } diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 4035cad8d7f..bd2cf133688 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -17,55 +17,123 @@ #include <linux/nodemask.h> #include <linux/cpu.h> #include <linux/notifier.h> +#include <asm/sparsemem.h> #include <asm/lmb.h> -#include <asm/machdep.h> -#include <asm/abs_addr.h> #include <asm/system.h> +#include <asm/smp.h> static int numa_enabled = 1; static int numa_debug; #define dbg(args...) if (numa_debug) { printk(KERN_INFO args); } -#ifdef DEBUG_NUMA -#define ARRAY_INITIALISER -1 -#else -#define ARRAY_INITIALISER 0 -#endif - -int numa_cpu_lookup_table[NR_CPUS] = { [ 0 ... (NR_CPUS - 1)] = - ARRAY_INITIALISER}; -char *numa_memory_lookup_table; +int numa_cpu_lookup_table[NR_CPUS]; cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES]; -int nr_cpus_in_node[MAX_NUMNODES] = { [0 ... (MAX_NUMNODES -1)] = 0}; - struct pglist_data *node_data[MAX_NUMNODES]; -bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES]; + +EXPORT_SYMBOL(numa_cpu_lookup_table); +EXPORT_SYMBOL(numa_cpumask_lookup_table); +EXPORT_SYMBOL(node_data); + +static bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES]; static int min_common_depth; /* - * We need somewhere to store start/span for each node until we have + * We need somewhere to store start/end/node for each region until we have * allocated the real node_data structures. */ +#define MAX_REGIONS (MAX_LMB_REGIONS*2) static struct { - unsigned long node_start_pfn; - unsigned long node_end_pfn; - unsigned long node_present_pages; -} init_node_data[MAX_NUMNODES] __initdata; + unsigned long start_pfn; + unsigned long end_pfn; + int nid; +} init_node_data[MAX_REGIONS] __initdata; -EXPORT_SYMBOL(node_data); -EXPORT_SYMBOL(numa_cpu_lookup_table); -EXPORT_SYMBOL(numa_memory_lookup_table); -EXPORT_SYMBOL(numa_cpumask_lookup_table); -EXPORT_SYMBOL(nr_cpus_in_node); +int __init early_pfn_to_nid(unsigned long pfn) +{ + unsigned int i; + + for (i = 0; init_node_data[i].end_pfn; i++) { + unsigned long start_pfn = init_node_data[i].start_pfn; + unsigned long end_pfn = init_node_data[i].end_pfn; + + if ((start_pfn <= pfn) && (pfn < end_pfn)) + return init_node_data[i].nid; + } + + return -1; +} + +void __init add_region(unsigned int nid, unsigned long start_pfn, + unsigned long pages) +{ + unsigned int i; + + dbg("add_region nid %d start_pfn 0x%lx pages 0x%lx\n", + nid, start_pfn, pages); + + for (i = 0; init_node_data[i].end_pfn; i++) { + if (init_node_data[i].nid != nid) + continue; + if (init_node_data[i].end_pfn == start_pfn) { + init_node_data[i].end_pfn += pages; + return; + } + if (init_node_data[i].start_pfn == (start_pfn + pages)) { + init_node_data[i].start_pfn -= pages; + return; + } + } + + /* + * Leave last entry NULL so we dont iterate off the end (we use + * entry.end_pfn to terminate the walk). + */ + if (i >= (MAX_REGIONS - 1)) { + printk(KERN_ERR "WARNING: too many memory regions in " + "numa code, truncating\n"); + return; + } + + init_node_data[i].start_pfn = start_pfn; + init_node_data[i].end_pfn = start_pfn + pages; + init_node_data[i].nid = nid; +} + +/* We assume init_node_data has no overlapping regions */ +void __init get_region(unsigned int nid, unsigned long *start_pfn, + unsigned long *end_pfn, unsigned long *pages_present) +{ + unsigned int i; + + *start_pfn = -1UL; + *end_pfn = *pages_present = 0; + + for (i = 0; init_node_data[i].end_pfn; i++) { + if (init_node_data[i].nid != nid) + continue; + + *pages_present += init_node_data[i].end_pfn - + init_node_data[i].start_pfn; + + if (init_node_data[i].start_pfn < *start_pfn) + *start_pfn = init_node_data[i].start_pfn; + + if (init_node_data[i].end_pfn > *end_pfn) + *end_pfn = init_node_data[i].end_pfn; + } + + /* We didnt find a matching region, return start/end as 0 */ + if (*start_pfn == -1UL) + start_pfn = 0; +} static inline void map_cpu_to_node(int cpu, int node) { numa_cpu_lookup_table[cpu] = node; - if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node]))) { + + if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node]))) cpu_set(cpu, numa_cpumask_lookup_table[node]); - nr_cpus_in_node[node]++; - } } #ifdef CONFIG_HOTPLUG_CPU @@ -77,7 +145,6 @@ static void unmap_cpu_from_node(unsigned long cpu) if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) { cpu_clear(cpu, numa_cpumask_lookup_table[node]); - nr_cpus_in_node[node]--; } else { printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n", cpu, node); @@ -85,7 +152,7 @@ static void unmap_cpu_from_node(unsigned long cpu) } #endif /* CONFIG_HOTPLUG_CPU */ -static struct device_node * __devinit find_cpu_node(unsigned int cpu) +static struct device_node *find_cpu_node(unsigned int cpu) { unsigned int hw_cpuid = get_hard_smp_processor_id(cpu); struct device_node *cpu_node = NULL; @@ -212,7 +279,7 @@ static int __init get_mem_size_cells(void) return rc; } -static unsigned long read_n_cells(int n, unsigned int **buf) +static unsigned long __init read_n_cells(int n, unsigned int **buf) { unsigned long result = 0; @@ -294,7 +361,8 @@ static int cpu_numa_callback(struct notifier_block *nfb, * or zero. If the returned value of size is 0 the region should be * discarded as it lies wholy above the memory limit. */ -static unsigned long __init numa_enforce_memory_limit(unsigned long start, unsigned long size) +static unsigned long __init numa_enforce_memory_limit(unsigned long start, + unsigned long size) { /* * We use lmb_end_of_DRAM() in here instead of memory_limit because @@ -319,8 +387,7 @@ static int __init parse_numa_properties(void) struct device_node *cpu = NULL; struct device_node *memory = NULL; int addr_cells, size_cells; - int max_domain = 0; - long entries = lmb_end_of_DRAM() >> MEMORY_INCREMENT_SHIFT; + int max_domain; unsigned long i; if (numa_enabled == 0) { @@ -328,13 +395,6 @@ static int __init parse_numa_properties(void) return -1; } - numa_memory_lookup_table = - (char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1)); - memset(numa_memory_lookup_table, 0, entries * sizeof(char)); - - for (i = 0; i < entries ; i++) - numa_memory_lookup_table[i] = ARRAY_INITIALISER; - min_common_depth = find_min_common_depth(); dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth); @@ -386,9 +446,6 @@ new_range: start = read_n_cells(addr_cells, &memcell_buf); size = read_n_cells(size_cells, &memcell_buf); - start = _ALIGN_DOWN(start, MEMORY_INCREMENT); - size = _ALIGN_UP(size, MEMORY_INCREMENT); - numa_domain = of_node_numa_domain(memory); if (numa_domain >= MAX_NUMNODES) { @@ -402,44 +459,15 @@ new_range: if (max_domain < numa_domain) max_domain = numa_domain; - if (! (size = numa_enforce_memory_limit(start, size))) { + if (!(size = numa_enforce_memory_limit(start, size))) { if (--ranges) goto new_range; else continue; } - /* - * Initialize new node struct, or add to an existing one. - */ - if (init_node_data[numa_domain].node_end_pfn) { - if ((start / PAGE_SIZE) < - init_node_data[numa_domain].node_start_pfn) - init_node_data[numa_domain].node_start_pfn = - start / PAGE_SIZE; - if (((start / PAGE_SIZE) + (size / PAGE_SIZE)) > - init_node_data[numa_domain].node_end_pfn) - init_node_data[numa_domain].node_end_pfn = - (start / PAGE_SIZE) + - (size / PAGE_SIZE); - - init_node_data[numa_domain].node_present_pages += - size / PAGE_SIZE; - } else { - node_set_online(numa_domain); - - init_node_data[numa_domain].node_start_pfn = - start / PAGE_SIZE; - init_node_data[numa_domain].node_end_pfn = - init_node_data[numa_domain].node_start_pfn + - size / PAGE_SIZE; - init_node_data[numa_domain].node_present_pages = - size / PAGE_SIZE; - } - - for (i = start ; i < (start+size); i += MEMORY_INCREMENT) - numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = - numa_domain; + add_region(numa_domain, start >> PAGE_SHIFT, + size >> PAGE_SHIFT); if (--ranges) goto new_range; @@ -455,32 +483,15 @@ static void __init setup_nonnuma(void) { unsigned long top_of_ram = lmb_end_of_DRAM(); unsigned long total_ram = lmb_phys_mem_size(); - unsigned long i; printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n", top_of_ram, total_ram); printk(KERN_INFO "Memory hole size: %ldMB\n", (top_of_ram - total_ram) >> 20); - if (!numa_memory_lookup_table) { - long entries = top_of_ram >> MEMORY_INCREMENT_SHIFT; - numa_memory_lookup_table = - (char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1)); - memset(numa_memory_lookup_table, 0, entries * sizeof(char)); - for (i = 0; i < entries ; i++) - numa_memory_lookup_table[i] = ARRAY_INITIALISER; - } - map_cpu_to_node(boot_cpuid, 0); - + add_region(0, 0, lmb_end_of_DRAM() >> PAGE_SHIFT); node_set_online(0); - - init_node_data[0].node_start_pfn = 0; - init_node_data[0].node_end_pfn = lmb_end_of_DRAM() / PAGE_SIZE; - init_node_data[0].node_present_pages = total_ram / PAGE_SIZE; - - for (i = 0 ; i < top_of_ram; i += MEMORY_INCREMENT) - numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = 0; } static void __init dump_numa_topology(void) @@ -498,8 +509,9 @@ static void __init dump_numa_topology(void) count = 0; - for (i = 0; i < lmb_end_of_DRAM(); i += MEMORY_INCREMENT) { - if (numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] == node) { + for (i = 0; i < lmb_end_of_DRAM(); + i += (1 << SECTION_SIZE_BITS)) { + if (early_pfn_to_nid(i >> PAGE_SHIFT) == node) { if (count == 0) printk(" 0x%lx", i); ++count; @@ -524,10 +536,12 @@ static void __init dump_numa_topology(void) * * Returns the physical address of the memory. */ -static unsigned long careful_allocation(int nid, unsigned long size, - unsigned long align, unsigned long end) +static void __init *careful_allocation(int nid, unsigned long size, + unsigned long align, + unsigned long end_pfn) { - unsigned long ret = lmb_alloc_base(size, align, end); + int new_nid; + unsigned long ret = lmb_alloc_base(size, align, end_pfn << PAGE_SHIFT); /* retry over all memory */ if (!ret) @@ -541,28 +555,27 @@ static unsigned long careful_allocation(int nid, unsigned long size, * If the memory came from a previously allocated node, we must * retry with the bootmem allocator. */ - if (pa_to_nid(ret) < nid) { - nid = pa_to_nid(ret); - ret = (unsigned long)__alloc_bootmem_node(NODE_DATA(nid), + new_nid = early_pfn_to_nid(ret >> PAGE_SHIFT); + if (new_nid < nid) { + ret = (unsigned long)__alloc_bootmem_node(NODE_DATA(new_nid), size, align, 0); if (!ret) panic("numa.c: cannot allocate %lu bytes on node %d", - size, nid); + size, new_nid); - ret = virt_to_abs(ret); + ret = __pa(ret); dbg("alloc_bootmem %lx %lx\n", ret, size); } - return ret; + return (void *)ret; } void __init do_init_bootmem(void) { int nid; - int addr_cells, size_cells; - struct device_node *memory = NULL; + unsigned int i; static struct notifier_block ppc64_numa_nb = { .notifier_call = cpu_numa_callback, .priority = 1 /* Must run before sched domains notifier. */ @@ -580,99 +593,66 @@ void __init do_init_bootmem(void) register_cpu_notifier(&ppc64_numa_nb); for_each_online_node(nid) { - unsigned long start_paddr, end_paddr; - int i; + unsigned long start_pfn, end_pfn, pages_present; unsigned long bootmem_paddr; unsigned long bootmap_pages; - start_paddr = init_node_data[nid].node_start_pfn * PAGE_SIZE; - end_paddr = init_node_data[nid].node_end_pfn * PAGE_SIZE; + get_region(nid, &start_pfn, &end_pfn, &pages_present); /* Allocate the node structure node local if possible */ - NODE_DATA(nid) = (struct pglist_data *)careful_allocation(nid, + NODE_DATA(nid) = careful_allocation(nid, sizeof(struct pglist_data), - SMP_CACHE_BYTES, end_paddr); - NODE_DATA(nid) = abs_to_virt(NODE_DATA(nid)); + SMP_CACHE_BYTES, end_pfn); + NODE_DATA(nid) = __va(NODE_DATA(nid)); memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); dbg("node %d\n", nid); dbg("NODE_DATA() = %p\n", NODE_DATA(nid)); NODE_DATA(nid)->bdata = &plat_node_bdata[nid]; - NODE_DATA(nid)->node_start_pfn = - init_node_data[nid].node_start_pfn; - NODE_DATA(nid)->node_spanned_pages = - end_paddr - start_paddr; + NODE_DATA(nid)->node_start_pfn = start_pfn; + NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn; if (NODE_DATA(nid)->node_spanned_pages == 0) continue; - dbg("start_paddr = %lx\n", start_paddr); - dbg("end_paddr = %lx\n", end_paddr); + dbg("start_paddr = %lx\n", start_pfn << PAGE_SHIFT); + dbg("end_paddr = %lx\n", end_pfn << PAGE_SHIFT); - bootmap_pages = bootmem_bootmap_pages((end_paddr - start_paddr) >> PAGE_SHIFT); + bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); + bootmem_paddr = (unsigned long)careful_allocation(nid, + bootmap_pages << PAGE_SHIFT, + PAGE_SIZE, end_pfn); + memset(__va(bootmem_paddr), 0, bootmap_pages << PAGE_SHIFT); - bootmem_paddr = careful_allocation(nid, - bootmap_pages << PAGE_SHIFT, - PAGE_SIZE, end_paddr); - memset(abs_to_virt(bootmem_paddr), 0, - bootmap_pages << PAGE_SHIFT); dbg("bootmap_paddr = %lx\n", bootmem_paddr); init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT, - start_paddr >> PAGE_SHIFT, - end_paddr >> PAGE_SHIFT); + start_pfn, end_pfn); - /* - * We need to do another scan of all memory sections to - * associate memory with the correct node. - */ - addr_cells = get_mem_addr_cells(); - size_cells = get_mem_size_cells(); - memory = NULL; - while ((memory = of_find_node_by_type(memory, "memory")) != NULL) { - unsigned long mem_start, mem_size; - int numa_domain, ranges; - unsigned int *memcell_buf; - unsigned int len; - - memcell_buf = (unsigned int *)get_property(memory, "reg", &len); - if (!memcell_buf || len <= 0) - continue; + /* Add free regions on this node */ + for (i = 0; init_node_data[i].end_pfn; i++) { + unsigned long start, end; - ranges = memory->n_addrs; /* ranges in cell */ -new_range: - mem_start = read_n_cells(addr_cells, &memcell_buf); - mem_size = read_n_cells(size_cells, &memcell_buf); - if (numa_enabled) { - numa_domain = of_node_numa_domain(memory); - if (numa_domain >= MAX_NUMNODES) - numa_domain = 0; - } else - numa_domain = 0; - - if (numa_domain != nid) + if (init_node_data[i].nid != nid) continue; - mem_size = numa_enforce_memory_limit(mem_start, mem_size); - if (mem_size) { - dbg("free_bootmem %lx %lx\n", mem_start, mem_size); - free_bootmem_node(NODE_DATA(nid), mem_start, mem_size); - } + start = init_node_data[i].start_pfn << PAGE_SHIFT; + end = init_node_data[i].end_pfn << PAGE_SHIFT; - if (--ranges) /* process all ranges in cell */ - goto new_range; + dbg("free_bootmem %lx %lx\n", start, end - start); + free_bootmem_node(NODE_DATA(nid), start, end - start); } - /* - * Mark reserved regions on this node - */ + /* Mark reserved regions on this node */ for (i = 0; i < lmb.reserved.cnt; i++) { unsigned long physbase = lmb.reserved.region[i].base; unsigned long size = lmb.reserved.region[i].size; + unsigned long start_paddr = start_pfn << PAGE_SHIFT; + unsigned long end_paddr = end_pfn << PAGE_SHIFT; - if (pa_to_nid(physbase) != nid && - pa_to_nid(physbase+size-1) != nid) + if (early_pfn_to_nid(physbase >> PAGE_SHIFT) != nid && + early_pfn_to_nid((physbase+size-1) >> PAGE_SHIFT) != nid) continue; if (physbase < end_paddr && @@ -692,46 +672,19 @@ new_range: size); } } - /* - * This loop may look famaliar, but we have to do it again - * after marking our reserved memory to mark memory present - * for sparsemem. - */ - addr_cells = get_mem_addr_cells(); - size_cells = get_mem_size_cells(); - memory = NULL; - while ((memory = of_find_node_by_type(memory, "memory")) != NULL) { - unsigned long mem_start, mem_size; - int numa_domain, ranges; - unsigned int *memcell_buf; - unsigned int len; - - memcell_buf = (unsigned int *)get_property(memory, "reg", &len); - if (!memcell_buf || len <= 0) - continue; - ranges = memory->n_addrs; /* ranges in cell */ -new_range2: - mem_start = read_n_cells(addr_cells, &memcell_buf); - mem_size = read_n_cells(size_cells, &memcell_buf); - if (numa_enabled) { - numa_domain = of_node_numa_domain(memory); - if (numa_domain >= MAX_NUMNODES) - numa_domain = 0; - } else - numa_domain = 0; - - if (numa_domain != nid) + /* Add regions into sparsemem */ + for (i = 0; init_node_data[i].end_pfn; i++) { + unsigned long start, end; + + if (init_node_data[i].nid != nid) continue; - mem_size = numa_enforce_memory_limit(mem_start, mem_size); - memory_present(numa_domain, mem_start >> PAGE_SHIFT, - (mem_start + mem_size) >> PAGE_SHIFT); + start = init_node_data[i].start_pfn; + end = init_node_data[i].end_pfn; - if (--ranges) /* process all ranges in cell */ - goto new_range2; + memory_present(nid, start, end); } - } } @@ -745,21 +698,18 @@ void __init paging_init(void) memset(zholes_size, 0, sizeof(zholes_size)); for_each_online_node(nid) { - unsigned long start_pfn; - unsigned long end_pfn; + unsigned long start_pfn, end_pfn, pages_present; - start_pfn = init_node_data[nid].node_start_pfn; - end_pfn = init_node_data[nid].node_end_pfn; + get_region(nid, &start_pfn, &end_pfn, &pages_present); zones_size[ZONE_DMA] = end_pfn - start_pfn; - zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] - - init_node_data[nid].node_present_pages; + zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] - pages_present; dbg("free_area_init node %d %lx %lx (hole: %lx)\n", nid, zones_size[ZONE_DMA], start_pfn, zholes_size[ZONE_DMA]); - free_area_init_node(nid, NODE_DATA(nid), zones_size, - start_pfn, zholes_size); + free_area_init_node(nid, NODE_DATA(nid), zones_size, start_pfn, + zholes_size); } } diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index b79a7820613..c7f7bb6f30b 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -59,7 +59,6 @@ #include <asm/processor.h> #include <asm/mmzone.h> #include <asm/cputable.h> -#include <asm/ppcdebug.h> #include <asm/sections.h> #include <asm/system.h> #include <asm/iommu.h> @@ -101,7 +100,6 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags) pud_t *pudp; pmd_t *pmdp; pte_t *ptep; - unsigned long vsid; if (mem_init_done) { pgdp = pgd_offset_k(ea); @@ -117,27 +115,17 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags) set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, __pgprot(flags))); } else { - unsigned long va, vpn, hash, hpteg; - /* * If the mm subsystem is not fully up, we cannot create a * linux page table entry for this mapping. Simply bolt an * entry in the hardware page table. + * */ - vsid = get_kernel_vsid(ea); - va = (vsid << 28) | (ea & 0xFFFFFFF); - vpn = va >> PAGE_SHIFT; - - hash = hpt_hash(vpn, 0); - - hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); - - /* Panic if a pte grpup is full */ - if (ppc_md.hpte_insert(hpteg, va, pa >> PAGE_SHIFT, - HPTE_V_BOLTED, - _PAGE_NO_CACHE|_PAGE_GUARDED|PP_RWXX) - == -1) { - panic("map_io_page: could not insert mapping"); + if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags, + mmu_virtual_psize)) { + printk(KERN_ERR "Failed to do bolted mapping IO " + "memory at %016lx !\n", pa); + return -ENOMEM; } } return 0; diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c index cef9e83cc7e..ed7fcfe5fd3 100644 --- a/arch/powerpc/mm/ppc_mmu_32.c +++ b/arch/powerpc/mm/ppc_mmu_32.c @@ -179,6 +179,21 @@ void __init setbat(int index, unsigned long virt, unsigned long phys, } /* + * Preload a translation in the hash table + */ +void hash_preload(struct mm_struct *mm, unsigned long ea, + unsigned long access, unsigned long trap) +{ + pmd_t *pmd; + + if (Hash == 0) + return; + pmd = pmd_offset(pgd_offset(mm, ea), ea); + if (!pmd_none(*pmd)) + add_hash_page(mm->context, ea, pmd_val(*pmd)); +} + +/* * Initialize the hash table and patch the instructions in hashtable.S. */ void __init MMU_init_hw(void) diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 0473953f6a3..60e852f2f8e 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -14,14 +14,32 @@ * 2 of the License, or (at your option) any later version. */ +#undef DEBUG + #include <linux/config.h> #include <asm/pgtable.h> #include <asm/mmu.h> #include <asm/mmu_context.h> #include <asm/paca.h> #include <asm/cputable.h> +#include <asm/cacheflush.h> + +#ifdef DEBUG +#define DBG(fmt...) udbg_printf(fmt) +#else +#define DBG(fmt...) +#endif -extern void slb_allocate(unsigned long ea); +extern void slb_allocate_realmode(unsigned long ea); +extern void slb_allocate_user(unsigned long ea); + +static void slb_allocate(unsigned long ea) +{ + /* Currently, we do real mode for all SLBs including user, but + * that will change if we bring back dynamic VSIDs + */ + slb_allocate_realmode(ea); +} static inline unsigned long mk_esid_data(unsigned long ea, unsigned long slot) { @@ -46,13 +64,15 @@ static void slb_flush_and_rebolt(void) { /* If you change this make sure you change SLB_NUM_BOLTED * appropriately too. */ - unsigned long ksp_flags = SLB_VSID_KERNEL; + unsigned long linear_llp, virtual_llp, lflags, vflags; unsigned long ksp_esid_data; WARN_ON(!irqs_disabled()); - if (cpu_has_feature(CPU_FTR_16M_PAGE)) - ksp_flags |= SLB_VSID_L; + linear_llp = mmu_psize_defs[mmu_linear_psize].sllp; + virtual_llp = mmu_psize_defs[mmu_virtual_psize].sllp; + lflags = SLB_VSID_KERNEL | linear_llp; + vflags = SLB_VSID_KERNEL | virtual_llp; ksp_esid_data = mk_esid_data(get_paca()->kstack, 2); if ((ksp_esid_data & ESID_MASK) == KERNELBASE) @@ -67,9 +87,9 @@ static void slb_flush_and_rebolt(void) /* Slot 2 - kernel stack */ "slbmte %2,%3\n" "isync" - :: "r"(mk_vsid_data(VMALLOCBASE, SLB_VSID_KERNEL)), + :: "r"(mk_vsid_data(VMALLOCBASE, vflags)), "r"(mk_esid_data(VMALLOCBASE, 1)), - "r"(mk_vsid_data(ksp_esid_data, ksp_flags)), + "r"(mk_vsid_data(ksp_esid_data, lflags)), "r"(ksp_esid_data) : "memory"); } @@ -102,6 +122,9 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) get_paca()->slb_cache_ptr = 0; get_paca()->context = mm->context; +#ifdef CONFIG_PPC_64K_PAGES + get_paca()->pgdir = mm->pgd; +#endif /* CONFIG_PPC_64K_PAGES */ /* * preload some userspace segments into the SLB. @@ -131,28 +154,77 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) slb_allocate(unmapped_base); } +static inline void patch_slb_encoding(unsigned int *insn_addr, + unsigned int immed) +{ + /* Assume the instruction had a "0" immediate value, just + * "or" in the new value + */ + *insn_addr |= immed; + flush_icache_range((unsigned long)insn_addr, 4+ + (unsigned long)insn_addr); +} + void slb_initialize(void) { + unsigned long linear_llp, virtual_llp; + static int slb_encoding_inited; + extern unsigned int *slb_miss_kernel_load_linear; + extern unsigned int *slb_miss_kernel_load_virtual; + extern unsigned int *slb_miss_user_load_normal; +#ifdef CONFIG_HUGETLB_PAGE + extern unsigned int *slb_miss_user_load_huge; + unsigned long huge_llp; + + huge_llp = mmu_psize_defs[mmu_huge_psize].sllp; +#endif + + /* Prepare our SLB miss handler based on our page size */ + linear_llp = mmu_psize_defs[mmu_linear_psize].sllp; + virtual_llp = mmu_psize_defs[mmu_virtual_psize].sllp; + if (!slb_encoding_inited) { + slb_encoding_inited = 1; + patch_slb_encoding(slb_miss_kernel_load_linear, + SLB_VSID_KERNEL | linear_llp); + patch_slb_encoding(slb_miss_kernel_load_virtual, + SLB_VSID_KERNEL | virtual_llp); + patch_slb_encoding(slb_miss_user_load_normal, + SLB_VSID_USER | virtual_llp); + + DBG("SLB: linear LLP = %04x\n", linear_llp); + DBG("SLB: virtual LLP = %04x\n", virtual_llp); +#ifdef CONFIG_HUGETLB_PAGE + patch_slb_encoding(slb_miss_user_load_huge, + SLB_VSID_USER | huge_llp); + DBG("SLB: huge LLP = %04x\n", huge_llp); +#endif + } + /* On iSeries the bolted entries have already been set up by * the hypervisor from the lparMap data in head.S */ #ifndef CONFIG_PPC_ISERIES - unsigned long flags = SLB_VSID_KERNEL; + { + unsigned long lflags, vflags; - /* Invalidate the entire SLB (even slot 0) & all the ERATS */ - if (cpu_has_feature(CPU_FTR_16M_PAGE)) - flags |= SLB_VSID_L; + lflags = SLB_VSID_KERNEL | linear_llp; + vflags = SLB_VSID_KERNEL | virtual_llp; - asm volatile("isync":::"memory"); - asm volatile("slbmte %0,%0"::"r" (0) : "memory"); + /* Invalidate the entire SLB (even slot 0) & all the ERATS */ + asm volatile("isync":::"memory"); + asm volatile("slbmte %0,%0"::"r" (0) : "memory"); asm volatile("isync; slbia; isync":::"memory"); - create_slbe(KERNELBASE, flags, 0); - create_slbe(VMALLOCBASE, SLB_VSID_KERNEL, 1); + create_slbe(KERNELBASE, lflags, 0); + + /* VMALLOC space has 4K pages always for now */ + create_slbe(VMALLOCBASE, vflags, 1); + /* We don't bolt the stack for the time being - we're in boot, * so the stack is in the bolted segment. By the time it goes * elsewhere, we'll call _switch() which will bolt in the new * one. */ asm volatile("isync":::"memory"); -#endif + } +#endif /* CONFIG_PPC_ISERIES */ get_paca()->stab_rr = SLB_NUM_BOLTED; } diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S index a3a03da503b..950ffc5848c 100644 --- a/arch/powerpc/mm/slb_low.S +++ b/arch/powerpc/mm/slb_low.S @@ -18,61 +18,28 @@ #include <linux/config.h> #include <asm/processor.h> -#include <asm/page.h> -#include <asm/mmu.h> #include <asm/ppc_asm.h> #include <asm/asm-offsets.h> #include <asm/cputable.h> +#include <asm/page.h> +#include <asm/mmu.h> +#include <asm/pgtable.h> -/* void slb_allocate(unsigned long ea); +/* void slb_allocate_realmode(unsigned long ea); * * Create an SLB entry for the given EA (user or kernel). * r3 = faulting address, r13 = PACA * r9, r10, r11 are clobbered by this function * No other registers are examined or changed. */ -_GLOBAL(slb_allocate) - /* - * First find a slot, round robin. Previously we tried to find - * a free slot first but that took too long. Unfortunately we - * dont have any LRU information to help us choose a slot. - */ -#ifdef CONFIG_PPC_ISERIES - /* - * On iSeries, the "bolted" stack segment can be cast out on - * shared processor switch so we need to check for a miss on - * it and restore it to the right slot. - */ - ld r9,PACAKSAVE(r13) - clrrdi r9,r9,28 - clrrdi r11,r3,28 - li r10,SLB_NUM_BOLTED-1 /* Stack goes in last bolted slot */ - cmpld r9,r11 - beq 3f -#endif /* CONFIG_PPC_ISERIES */ - - ld r10,PACASTABRR(r13) - addi r10,r10,1 - /* use a cpu feature mask if we ever change our slb size */ - cmpldi r10,SLB_NUM_ENTRIES - - blt+ 4f - li r10,SLB_NUM_BOLTED - -4: - std r10,PACASTABRR(r13) -3: - /* r3 = faulting address, r10 = entry */ +_GLOBAL(slb_allocate_realmode) + /* r3 = faulting address */ srdi r9,r3,60 /* get region */ - srdi r3,r3,28 /* get esid */ + srdi r10,r3,28 /* get esid */ cmpldi cr7,r9,0xc /* cmp KERNELBASE for later use */ - rldimi r10,r3,28,0 /* r10= ESID<<28 | entry */ - oris r10,r10,SLB_ESID_V@h /* r10 |= SLB_ESID_V */ - - /* r3 = esid, r10 = esid_data, cr7 = <>KERNELBASE */ - + /* r3 = address, r10 = esid, cr7 = <>KERNELBASE */ blt cr7,0f /* user or kernel? */ /* kernel address: proto-VSID = ESID */ @@ -81,43 +48,166 @@ _GLOBAL(slb_allocate) * top segment. That's ok, the scramble below will translate * it to VSID 0, which is reserved as a bad VSID - one which * will never have any pages in it. */ - li r11,SLB_VSID_KERNEL -BEGIN_FTR_SECTION - bne cr7,9f - li r11,(SLB_VSID_KERNEL|SLB_VSID_L) -END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE) - b 9f -0: /* user address: proto-VSID = context<<15 | ESID */ - srdi. r9,r3,USER_ESID_BITS + /* Check if hitting the linear mapping of the vmalloc/ioremap + * kernel space + */ + bne cr7,1f + + /* Linear mapping encoding bits, the "li" instruction below will + * be patched by the kernel at boot + */ +_GLOBAL(slb_miss_kernel_load_linear) + li r11,0 + b slb_finish_load + +1: /* vmalloc/ioremap mapping encoding bits, the "li" instruction below + * will be patched by the kernel at boot + */ +_GLOBAL(slb_miss_kernel_load_virtual) + li r11,0 + b slb_finish_load + + +0: /* user address: proto-VSID = context << 15 | ESID. First check + * if the address is within the boundaries of the user region + */ + srdi. r9,r10,USER_ESID_BITS bne- 8f /* invalid ea bits set */ + /* Figure out if the segment contains huge pages */ #ifdef CONFIG_HUGETLB_PAGE BEGIN_FTR_SECTION + b 1f +END_FTR_SECTION_IFCLR(CPU_FTR_16M_PAGE) + cmpldi r10,16 + + lhz r9,PACALOWHTLBAREAS(r13) + mr r11,r10 + blt 5f + lhz r9,PACAHIGHHTLBAREAS(r13) - srdi r11,r3,(HTLB_AREA_SHIFT-SID_SHIFT) - srd r9,r9,r11 - lhz r11,PACALOWHTLBAREAS(r13) - srd r11,r11,r3 - or r9,r9,r11 -END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE) + srdi r11,r10,(HTLB_AREA_SHIFT-SID_SHIFT) + +5: srd r9,r9,r11 + andi. r9,r9,1 + beq 1f +_GLOBAL(slb_miss_user_load_huge) + li r11,0 + b 2f +1: #endif /* CONFIG_HUGETLB_PAGE */ - li r11,SLB_VSID_USER +_GLOBAL(slb_miss_user_load_normal) + li r11,0 -#ifdef CONFIG_HUGETLB_PAGE -BEGIN_FTR_SECTION - rldimi r11,r9,8,55 /* shift masked bit into SLB_VSID_L */ -END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE) -#endif /* CONFIG_HUGETLB_PAGE */ +2: + ld r9,PACACONTEXTID(r13) + rldimi r10,r9,USER_ESID_BITS,0 + b slb_finish_load +8: /* invalid EA */ + li r10,0 /* BAD_VSID */ + li r11,SLB_VSID_USER /* flags don't much matter */ + b slb_finish_load + +#ifdef __DISABLED__ + +/* void slb_allocate_user(unsigned long ea); + * + * Create an SLB entry for the given EA (user or kernel). + * r3 = faulting address, r13 = PACA + * r9, r10, r11 are clobbered by this function + * No other registers are examined or changed. + * + * It is called with translation enabled in order to be able to walk the + * page tables. This is not currently used. + */ +_GLOBAL(slb_allocate_user) + /* r3 = faulting address */ + srdi r10,r3,28 /* get esid */ + + crset 4*cr7+lt /* set "user" flag for later */ + + /* check if we fit in the range covered by the pagetables*/ + srdi. r9,r3,PGTABLE_EADDR_SIZE + crnot 4*cr0+eq,4*cr0+eq + beqlr + + /* now we need to get to the page tables in order to get the page + * size encoding from the PMD. In the future, we'll be able to deal + * with 1T segments too by getting the encoding from the PGD instead + */ + ld r9,PACAPGDIR(r13) + cmpldi cr0,r9,0 + beqlr + rlwinm r11,r10,8,25,28 + ldx r9,r9,r11 /* get pgd_t */ + cmpldi cr0,r9,0 + beqlr + rlwinm r11,r10,3,17,28 + ldx r9,r9,r11 /* get pmd_t */ + cmpldi cr0,r9,0 + beqlr + + /* build vsid flags */ + andi. r11,r9,SLB_VSID_LLP + ori r11,r11,SLB_VSID_USER + + /* get context to calculate proto-VSID */ ld r9,PACACONTEXTID(r13) - rldimi r3,r9,USER_ESID_BITS,0 + rldimi r10,r9,USER_ESID_BITS,0 + + /* fall through slb_finish_load */ + +#endif /* __DISABLED__ */ + + +/* + * Finish loading of an SLB entry and return + * + * r3 = EA, r10 = proto-VSID, r11 = flags, clobbers r9, cr7 = <>KERNELBASE + */ +slb_finish_load: + ASM_VSID_SCRAMBLE(r10,r9) + rldimi r11,r10,SLB_VSID_SHIFT,16 /* combine VSID and flags */ + + /* r3 = EA, r11 = VSID data */ + /* + * Find a slot, round robin. Previously we tried to find a + * free slot first but that took too long. Unfortunately we + * dont have any LRU information to help us choose a slot. + */ +#ifdef CONFIG_PPC_ISERIES + /* + * On iSeries, the "bolted" stack segment can be cast out on + * shared processor switch so we need to check for a miss on + * it and restore it to the right slot. + */ + ld r9,PACAKSAVE(r13) + clrrdi r9,r9,28 + clrrdi r3,r3,28 + li r10,SLB_NUM_BOLTED-1 /* Stack goes in last bolted slot */ + cmpld r9,r3 + beq 3f +#endif /* CONFIG_PPC_ISERIES */ + + ld r10,PACASTABRR(r13) + addi r10,r10,1 + /* use a cpu feature mask if we ever change our slb size */ + cmpldi r10,SLB_NUM_ENTRIES + + blt+ 4f + li r10,SLB_NUM_BOLTED -9: /* r3 = protovsid, r11 = flags, r10 = esid_data, cr7 = <>KERNELBASE */ - ASM_VSID_SCRAMBLE(r3,r9) +4: + std r10,PACASTABRR(r13) + +3: + rldimi r3,r10,0,36 /* r3= EA[0:35] | entry */ + oris r10,r3,SLB_ESID_V@h /* r3 |= SLB_ESID_V */ - rldimi r11,r3,SLB_VSID_SHIFT,16 /* combine VSID and flags */ + /* r3 = ESID data, r11 = VSID data */ /* * No need for an isync before or after this slbmte. The exception @@ -125,7 +215,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE) */ slbmte r11,r10 - bgelr cr7 /* we're done for kernel addresses */ + /* we're done for kernel addresses */ + crclr 4*cr0+eq /* set result to "success" */ + bgelr cr7 /* Update the slb cache */ lhz r3,PACASLBCACHEPTR(r13) /* offset = paca->slb_cache_ptr */ @@ -143,9 +235,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE) li r3,SLB_CACHE_ENTRIES+1 2: sth r3,PACASLBCACHEPTR(r13) /* paca->slb_cache_ptr = offset */ + crclr 4*cr0+eq /* set result to "success" */ blr -8: /* invalid EA */ - li r3,0 /* BAD_VSID */ - li r11,SLB_VSID_USER /* flags don't much matter */ - b 9b diff --git a/arch/powerpc/mm/stab.c b/arch/powerpc/mm/stab.c index 1b83f002bf2..cfbb4e1f966 100644 --- a/arch/powerpc/mm/stab.c +++ b/arch/powerpc/mm/stab.c @@ -20,13 +20,13 @@ #include <asm/cputable.h> #include <asm/lmb.h> #include <asm/abs_addr.h> +#include <asm/firmware.h> struct stab_entry { unsigned long esid_data; unsigned long vsid_data; }; -/* Both the segment table and SLB code uses the following cache */ #define NR_STAB_CACHE_ENTRIES 8 DEFINE_PER_CPU(long, stab_cache_ptr); DEFINE_PER_CPU(long, stab_cache[NR_STAB_CACHE_ENTRIES]); @@ -186,7 +186,7 @@ void switch_stab(struct task_struct *tsk, struct mm_struct *mm) /* Never flush the first entry. */ ste += 1; for (entry = 1; - entry < (PAGE_SIZE / sizeof(struct stab_entry)); + entry < (HW_PAGE_SIZE / sizeof(struct stab_entry)); entry++, ste++) { unsigned long ea; ea = ste->esid_data & ESID_MASK; @@ -200,6 +200,10 @@ void switch_stab(struct task_struct *tsk, struct mm_struct *mm) __get_cpu_var(stab_cache_ptr) = 0; +#ifdef CONFIG_PPC_64K_PAGES + get_paca()->pgdir = mm->pgd; +#endif /* CONFIG_PPC_64K_PAGES */ + /* Now preload some entries for the new task */ if (test_tsk_thread_flag(tsk, TIF_32BIT)) unmapped_base = TASK_UNMAPPED_BASE_USER32; @@ -223,8 +227,6 @@ void switch_stab(struct task_struct *tsk, struct mm_struct *mm) asm volatile("sync" : : : "memory"); } -extern void slb_initialize(void); - /* * Allocate segment tables for secondary CPUs. These must all go in * the first (bolted) segment, so that do_stab_bolted won't get a @@ -243,18 +245,21 @@ void stabs_alloc(void) if (cpu == 0) continue; /* stab for CPU 0 is statically allocated */ - newstab = lmb_alloc_base(PAGE_SIZE, PAGE_SIZE, 1<<SID_SHIFT); + newstab = lmb_alloc_base(HW_PAGE_SIZE, HW_PAGE_SIZE, + 1<<SID_SHIFT); if (! newstab) panic("Unable to allocate segment table for CPU %d.\n", cpu); newstab += KERNELBASE; - memset((void *)newstab, 0, PAGE_SIZE); + memset((void *)newstab, 0, HW_PAGE_SIZE); paca[cpu].stab_addr = newstab; paca[cpu].stab_real = virt_to_abs(newstab); - printk(KERN_DEBUG "Segment table for CPU %d at 0x%lx virtual, 0x%lx absolute\n", cpu, paca[cpu].stab_addr, paca[cpu].stab_real); + printk(KERN_INFO "Segment table for CPU %d at 0x%lx " + "virtual, 0x%lx absolute\n", + cpu, paca[cpu].stab_addr, paca[cpu].stab_real); } } @@ -266,14 +271,28 @@ void stabs_alloc(void) void stab_initialize(unsigned long stab) { unsigned long vsid = get_kernel_vsid(KERNELBASE); + unsigned long stabreal; - if (cpu_has_feature(CPU_FTR_SLB)) { - slb_initialize(); - } else { - asm volatile("isync; slbia; isync":::"memory"); - make_ste(stab, GET_ESID(KERNELBASE), vsid); + asm volatile("isync; slbia; isync":::"memory"); + make_ste(stab, GET_ESID(KERNELBASE), vsid); - /* Order update */ - asm volatile("sync":::"memory"); + /* Order update */ + asm volatile("sync":::"memory"); + + /* Set ASR */ + stabreal = get_paca()->stab_real | 0x1ul; + +#ifdef CONFIG_PPC_ISERIES + if (firmware_has_feature(FW_FEATURE_ISERIES)) { + HvCall1(HvCallBaseSetASR, stabreal); + return; + } +#endif /* CONFIG_PPC_ISERIES */ +#ifdef CONFIG_PPC_PSERIES + if (platform_is_lpar()) { + plpar_hcall_norets(H_SET_ASR, stabreal); + return; } +#endif + mtspr(SPRN_ASR, stabreal); } diff --git a/arch/powerpc/mm/tlb_64.c b/arch/powerpc/mm/tlb_64.c index 09ab81a10f4..53e31b834ac 100644 --- a/arch/powerpc/mm/tlb_64.c +++ b/arch/powerpc/mm/tlb_64.c @@ -21,6 +21,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ + #include <linux/config.h> #include <linux/kernel.h> #include <linux/mm.h> @@ -30,7 +31,7 @@ #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include <asm/tlb.h> -#include <linux/highmem.h> +#include <asm/bug.h> DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch); @@ -126,28 +127,46 @@ void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf) * (if we remove it we should clear the _PTE_HPTEFLAGS bits). */ void hpte_update(struct mm_struct *mm, unsigned long addr, - unsigned long pte, int wrprot) + pte_t *ptep, unsigned long pte, int huge) { struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch); unsigned long vsid; + unsigned int psize = mmu_virtual_psize; int i; i = batch->index; + /* We mask the address for the base page size. Huge pages will + * have applied their own masking already + */ + addr &= PAGE_MASK; + + /* Get page size (maybe move back to caller) */ + if (huge) { +#ifdef CONFIG_HUGETLB_PAGE + psize = mmu_huge_psize; +#else + BUG(); +#endif + } + /* * This can happen when we are in the middle of a TLB batch and * we encounter memory pressure (eg copy_page_range when it tries * to allocate a new pte). If we have to reclaim memory and end * up scanning and resetting referenced bits then our batch context * will change mid stream. + * + * We also need to ensure only one page size is present in a given + * batch */ - if (i != 0 && (mm != batch->mm || batch->large != pte_huge(pte))) { + if (i != 0 && (mm != batch->mm || batch->psize != psize)) { flush_tlb_pending(); i = 0; } if (i == 0) { batch->mm = mm; - batch->large = pte_huge(pte); + batch->psize = psize; } if (addr < KERNELBASE) { vsid = get_vsid(mm->context.id, addr); @@ -155,7 +174,7 @@ void hpte_update(struct mm_struct *mm, unsigned long addr, } else vsid = get_kernel_vsid(addr); batch->vaddr[i] = (vsid << 28 ) | (addr & 0x0fffffff); - batch->pte[i] = __pte(pte); + batch->pte[i] = __real_pte(__pte(pte), ptep); batch->index = ++i; if (i >= PPC64_TLB_BATCH_NR) flush_tlb_pending(); @@ -177,7 +196,8 @@ void __flush_tlb_pending(struct ppc64_tlb_batch *batch) local = 1; if (i == 1) - flush_hash_page(batch->vaddr[0], batch->pte[0], local); + flush_hash_page(batch->vaddr[0], batch->pte[0], + batch->psize, local); else flush_hash_range(i, local); batch->index = 0; diff --git a/arch/powerpc/oprofile/Kconfig b/arch/powerpc/oprofile/Kconfig index 19d37730b66..eb2dece76a5 100644 --- a/arch/powerpc/oprofile/Kconfig +++ b/arch/powerpc/oprofile/Kconfig @@ -1,7 +1,3 @@ - -menu "Profiling support" - depends on EXPERIMENTAL - config PROFILING bool "Profiling support (EXPERIMENTAL)" help @@ -19,5 +15,3 @@ config OPROFILE If unsure, say N. -endmenu - diff --git a/arch/powerpc/oprofile/op_model_fsl_booke.c b/arch/powerpc/oprofile/op_model_fsl_booke.c index 86124a94c9a..26539cda602 100644 --- a/arch/powerpc/oprofile/op_model_fsl_booke.c +++ b/arch/powerpc/oprofile/op_model_fsl_booke.c @@ -7,7 +7,7 @@ * Copyright (c) 2004 Freescale Semiconductor, Inc * * Author: Andy Fleming - * Maintainer: Kumar Gala <Kumar.Gala@freescale.com> + * Maintainer: Kumar Gala <galak@kernel.crashing.org> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/arch/powerpc/oprofile/op_model_power4.c b/arch/powerpc/oprofile/op_model_power4.c index 88644931584..a3401b46f3b 100644 --- a/arch/powerpc/oprofile/op_model_power4.c +++ b/arch/powerpc/oprofile/op_model_power4.c @@ -14,9 +14,9 @@ #include <asm/system.h> #include <asm/processor.h> #include <asm/cputable.h> -#include <asm/systemcfg.h> #include <asm/rtas.h> #include <asm/oprofile_impl.h> +#include <asm/reg.h> #define dbg(args...) @@ -81,6 +81,26 @@ static void power4_reg_setup(struct op_counter_config *ctr, extern void ppc64_enable_pmcs(void); +/* + * Older CPUs require the MMCRA sample bit to be always set, but newer + * CPUs only want it set for some groups. Eventually we will remove all + * knowledge of this bit in the kernel, oprofile userspace should be + * setting it when required. + * + * In order to keep current installations working we force the bit for + * those older CPUs. Once everyone has updated their oprofile userspace we + * can remove this hack. + */ +static inline int mmcra_must_set_sample(void) +{ + if (__is_processor(PV_POWER4) || __is_processor(PV_POWER4p) || + __is_processor(PV_970) || __is_processor(PV_970FX) || + __is_processor(PV_970MP)) + return 1; + + return 0; +} + static void power4_cpu_setup(void *unused) { unsigned int mmcr0 = mmcr0_val; @@ -98,7 +118,8 @@ static void power4_cpu_setup(void *unused) mtspr(SPRN_MMCR1, mmcr1_val); - mmcra |= MMCRA_SAMPLE_ENABLE; + if (mmcra_must_set_sample()) + mmcra |= MMCRA_SAMPLE_ENABLE; mtspr(SPRN_MMCRA, mmcra); dbg("setup on cpu %d, mmcr0 %lx\n", smp_processor_id(), @@ -211,8 +232,7 @@ static unsigned long get_pc(struct pt_regs *regs) mmcra = mfspr(SPRN_MMCRA); /* Were we in the hypervisor? */ - if ((systemcfg->platform == PLATFORM_PSERIES_LPAR) && - (mmcra & MMCRA_SIHV)) + if (platform_is_lpar() && (mmcra & MMCRA_SIHV)) /* function descriptor madness */ return *((unsigned long *)hypervisor_bucket); diff --git a/arch/powerpc/platforms/chrp/setup.c b/arch/powerpc/platforms/chrp/setup.c index ecd32d5d85f..4099ddab920 100644 --- a/arch/powerpc/platforms/chrp/setup.c +++ b/arch/powerpc/platforms/chrp/setup.c @@ -361,7 +361,9 @@ static void __init chrp_find_openpic(void) printk(KERN_INFO "OpenPIC at %lx\n", opaddr); irq_count = NR_IRQS - NUM_ISA_INTERRUPTS - 4; /* leave room for IPIs */ - prom_get_irq_senses(init_senses, NUM_8259_INTERRUPTS, NR_IRQS - 4); + prom_get_irq_senses(init_senses, NUM_ISA_INTERRUPTS, NR_IRQS - 4); + /* i8259 cascade is always positive level */ + init_senses[0] = IRQ_SENSE_LEVEL | IRQ_POLARITY_POSITIVE; iranges = (unsigned int *) get_property(np, "interrupt-ranges", &len); if (iranges == NULL) diff --git a/arch/powerpc/platforms/iseries/htab.c b/arch/powerpc/platforms/iseries/htab.c index b3c6c3374ca..30bdcf3925d 100644 --- a/arch/powerpc/platforms/iseries/htab.c +++ b/arch/powerpc/platforms/iseries/htab.c @@ -39,15 +39,16 @@ static inline void iSeries_hunlock(unsigned long slot) spin_unlock(&iSeries_hlocks[(slot >> 4) & 0x3f]); } -static long iSeries_hpte_insert(unsigned long hpte_group, unsigned long va, - unsigned long prpn, unsigned long vflags, - unsigned long rflags) +long iSeries_hpte_insert(unsigned long hpte_group, unsigned long va, + unsigned long pa, unsigned long rflags, + unsigned long vflags, int psize) { - unsigned long arpn; long slot; hpte_t lhpte; int secondary = 0; + BUG_ON(psize != MMU_PAGE_4K); + /* * The hypervisor tries both primary and secondary. * If we are being called to insert in the secondary, @@ -59,8 +60,19 @@ static long iSeries_hpte_insert(unsigned long hpte_group, unsigned long va, iSeries_hlock(hpte_group); - slot = HvCallHpt_findValid(&lhpte, va >> PAGE_SHIFT); - BUG_ON(lhpte.v & HPTE_V_VALID); + slot = HvCallHpt_findValid(&lhpte, va >> HW_PAGE_SHIFT); + if (unlikely(lhpte.v & HPTE_V_VALID)) { + if (vflags & HPTE_V_BOLTED) { + HvCallHpt_setSwBits(slot, 0x10, 0); + HvCallHpt_setPp(slot, PP_RWXX); + iSeries_hunlock(hpte_group); + if (slot < 0) + return 0x8 | (slot & 7); + else + return slot & 7; + } + BUG(); + } if (slot == -1) { /* No available entry found in either group */ iSeries_hunlock(hpte_group); @@ -73,10 +85,9 @@ static long iSeries_hpte_insert(unsigned long hpte_group, unsigned long va, slot &= 0x7fffffffffffffff; } - arpn = phys_to_abs(prpn << PAGE_SHIFT) >> PAGE_SHIFT; - lhpte.v = (va >> 23) << HPTE_V_AVPN_SHIFT | vflags | HPTE_V_VALID; - lhpte.r = (arpn << HPTE_R_RPN_SHIFT) | rflags; + lhpte.v = hpte_encode_v(va, MMU_PAGE_4K) | vflags | HPTE_V_VALID; + lhpte.r = hpte_encode_r(phys_to_abs(pa), MMU_PAGE_4K) | rflags; /* Now fill in the actual HPTE */ HvCallHpt_addValidate(slot, secondary, &lhpte); @@ -86,25 +97,6 @@ static long iSeries_hpte_insert(unsigned long hpte_group, unsigned long va, return (secondary << 3) | (slot & 7); } -long iSeries_hpte_bolt_or_insert(unsigned long hpte_group, - unsigned long va, unsigned long prpn, unsigned long vflags, - unsigned long rflags) -{ - long slot; - hpte_t lhpte; - - slot = HvCallHpt_findValid(&lhpte, va >> PAGE_SHIFT); - - if (lhpte.v & HPTE_V_VALID) { - /* Bolt the existing HPTE */ - HvCallHpt_setSwBits(slot, 0x10, 0); - HvCallHpt_setPp(slot, PP_RWXX); - return 0; - } - - return iSeries_hpte_insert(hpte_group, va, prpn, vflags, rflags); -} - static unsigned long iSeries_hpte_getword0(unsigned long slot) { hpte_t hpte; @@ -150,15 +142,17 @@ static long iSeries_hpte_remove(unsigned long hpte_group) * bits 61..63 : PP2,PP1,PP0 */ static long iSeries_hpte_updatepp(unsigned long slot, unsigned long newpp, - unsigned long va, int large, int local) + unsigned long va, int psize, int local) { hpte_t hpte; - unsigned long avpn = va >> 23; + unsigned long want_v; iSeries_hlock(slot); HvCallHpt_get(&hpte, slot); - if ((HPTE_V_AVPN_VAL(hpte.v) == avpn) && (hpte.v & HPTE_V_VALID)) { + want_v = hpte_encode_v(va, MMU_PAGE_4K); + + if (HPTE_V_COMPARE(hpte.v, want_v) && (hpte.v & HPTE_V_VALID)) { /* * Hypervisor expects bits as NPPP, which is * different from how they are mapped in our PP. @@ -210,14 +204,17 @@ static long iSeries_hpte_find(unsigned long vpn) * * No need to lock here because we should be the only user. */ -static void iSeries_hpte_updateboltedpp(unsigned long newpp, unsigned long ea) +static void iSeries_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, + int psize) { unsigned long vsid,va,vpn; long slot; + BUG_ON(psize != MMU_PAGE_4K); + vsid = get_kernel_vsid(ea); va = (vsid << 28) | (ea & 0x0fffffff); - vpn = va >> PAGE_SHIFT; + vpn = va >> HW_PAGE_SHIFT; slot = iSeries_hpte_find(vpn); if (slot == -1) panic("updateboltedpp: Could not find page to bolt\n"); @@ -225,7 +222,7 @@ static void iSeries_hpte_updateboltedpp(unsigned long newpp, unsigned long ea) } static void iSeries_hpte_invalidate(unsigned long slot, unsigned long va, - int large, int local) + int psize, int local) { unsigned long hpte_v; unsigned long avpn = va >> 23; diff --git a/arch/powerpc/platforms/iseries/hvlog.c b/arch/powerpc/platforms/iseries/hvlog.c index 62ec7347968..f476d71194f 100644 --- a/arch/powerpc/platforms/iseries/hvlog.c +++ b/arch/powerpc/platforms/iseries/hvlog.c @@ -22,7 +22,7 @@ void HvCall_writeLogBuffer(const void *buffer, u64 len) while (len) { hv_buf.addr = cur; - left_this_page = ((cur & PAGE_MASK) + PAGE_SIZE) - cur; + left_this_page = ((cur & HW_PAGE_MASK) + HW_PAGE_SIZE) - cur; if (left_this_page > len) left_this_page = len; hv_buf.len = left_this_page; @@ -30,6 +30,6 @@ void HvCall_writeLogBuffer(const void *buffer, u64 len) HvCall2(HvCallBaseWriteLogBuffer, virt_to_abs(&hv_buf), left_this_page); - cur = (cur & PAGE_MASK) + PAGE_SIZE; + cur = (cur & HW_PAGE_MASK) + HW_PAGE_SIZE; } } diff --git a/arch/powerpc/platforms/iseries/iommu.c b/arch/powerpc/platforms/iseries/iommu.c index 1a6845b5c5a..bf081b34582 100644 --- a/arch/powerpc/platforms/iseries/iommu.c +++ b/arch/powerpc/platforms/iseries/iommu.c @@ -43,9 +43,12 @@ static void tce_build_iSeries(struct iommu_table *tbl, long index, long npages, u64 rc; union tce_entry tce; + index <<= TCE_PAGE_FACTOR; + npages <<= TCE_PAGE_FACTOR; + while (npages--) { tce.te_word = 0; - tce.te_bits.tb_rpn = virt_to_abs(uaddr) >> PAGE_SHIFT; + tce.te_bits.tb_rpn = virt_to_abs(uaddr) >> TCE_SHIFT; if (tbl->it_type == TCE_VB) { /* Virtual Bus */ @@ -66,7 +69,7 @@ static void tce_build_iSeries(struct iommu_table *tbl, long index, long npages, panic("PCI_DMA: HvCallXm_setTce failed, Rc: 0x%lx\n", rc); index++; - uaddr += PAGE_SIZE; + uaddr += TCE_PAGE_SIZE; } } @@ -74,6 +77,9 @@ static void tce_free_iSeries(struct iommu_table *tbl, long index, long npages) { u64 rc; + npages <<= TCE_PAGE_FACTOR; + index <<= TCE_PAGE_FACTOR; + while (npages--) { rc = HvCallXm_setTce((u64)tbl->it_index, (u64)index, 0); if (rc) @@ -83,27 +89,6 @@ static void tce_free_iSeries(struct iommu_table *tbl, long index, long npages) } } -#ifdef CONFIG_PCI -/* - * This function compares the known tables to find an iommu_table - * that has already been built for hardware TCEs. - */ -static struct iommu_table *iommu_table_find(struct iommu_table * tbl) -{ - struct pci_dn *pdn; - - list_for_each_entry(pdn, &iSeries_Global_Device_List, Device_List) { - struct iommu_table *it = pdn->iommu_table; - if ((it != NULL) && - (it->it_type == TCE_PCI) && - (it->it_offset == tbl->it_offset) && - (it->it_index == tbl->it_index) && - (it->it_size == tbl->it_size)) - return it; - } - return NULL; -} - /* * Call Hv with the architected data structure to get TCE table info. * info. Put the returned data into the Linux representation of the @@ -113,8 +98,10 @@ static struct iommu_table *iommu_table_find(struct iommu_table * tbl) * 2. TCE table per Bus. * 3. TCE Table per IOA. */ -static void iommu_table_getparms(struct pci_dn *pdn, - struct iommu_table* tbl) +void iommu_table_getparms_iSeries(unsigned long busno, + unsigned char slotno, + unsigned char virtbus, + struct iommu_table* tbl) { struct iommu_table_cb *parms; @@ -124,9 +111,9 @@ static void iommu_table_getparms(struct pci_dn *pdn, memset(parms, 0, sizeof(*parms)); - parms->itc_busno = pdn->busno; - parms->itc_slotno = pdn->LogicalSlot; - parms->itc_virtbus = 0; + parms->itc_busno = busno; + parms->itc_slotno = slotno; + parms->itc_virtbus = virtbus; HvCallXm_getTceTableParms(iseries_hv_addr(parms)); @@ -134,17 +121,40 @@ static void iommu_table_getparms(struct pci_dn *pdn, panic("PCI_DMA: parms->size is zero, parms is 0x%p", parms); /* itc_size is in pages worth of table, it_size is in # of entries */ - tbl->it_size = (parms->itc_size * PAGE_SIZE) / sizeof(union tce_entry); + tbl->it_size = ((parms->itc_size * TCE_PAGE_SIZE) / + sizeof(union tce_entry)) >> TCE_PAGE_FACTOR; tbl->it_busno = parms->itc_busno; - tbl->it_offset = parms->itc_offset; + tbl->it_offset = parms->itc_offset >> TCE_PAGE_FACTOR; tbl->it_index = parms->itc_index; tbl->it_blocksize = 1; - tbl->it_type = TCE_PCI; + tbl->it_type = virtbus ? TCE_VB : TCE_PCI; kfree(parms); } +#ifdef CONFIG_PCI +/* + * This function compares the known tables to find an iommu_table + * that has already been built for hardware TCEs. + */ +static struct iommu_table *iommu_table_find(struct iommu_table * tbl) +{ + struct pci_dn *pdn; + + list_for_each_entry(pdn, &iSeries_Global_Device_List, Device_List) { + struct iommu_table *it = pdn->iommu_table; + if ((it != NULL) && + (it->it_type == TCE_PCI) && + (it->it_offset == tbl->it_offset) && + (it->it_index == tbl->it_index) && + (it->it_size == tbl->it_size)) + return it; + } + return NULL; +} + + void iommu_devnode_init_iSeries(struct device_node *dn) { struct iommu_table *tbl; @@ -152,7 +162,7 @@ void iommu_devnode_init_iSeries(struct device_node *dn) tbl = kmalloc(sizeof(struct iommu_table), GFP_KERNEL); - iommu_table_getparms(pdn, tbl); + iommu_table_getparms_iSeries(pdn->busno, pdn->LogicalSlot, 0, tbl); /* Look for existing tce table */ pdn->iommu_table = iommu_table_find(tbl); diff --git a/arch/powerpc/platforms/iseries/irq.c b/arch/powerpc/platforms/iseries/irq.c index c1135912cc0..a58daa15368 100644 --- a/arch/powerpc/platforms/iseries/irq.c +++ b/arch/powerpc/platforms/iseries/irq.c @@ -35,7 +35,6 @@ #include <linux/irq.h> #include <linux/spinlock.h> -#include <asm/ppcdebug.h> #include <asm/iseries/hv_types.h> #include <asm/iseries/hv_lp_event.h> #include <asm/iseries/hv_call_xm.h> @@ -43,13 +42,6 @@ #include "irq.h" #include "call_pci.h" -/* This maps virtual irq numbers to real irqs */ -unsigned int virt_irq_to_real_map[NR_IRQS]; - -/* The next available virtual irq number */ -/* Note: the pcnet32 driver assumes irq numbers < 2 aren't valid. :( */ -static int next_virtual_irq = 2; - static long Pci_Interrupt_Count; static long Pci_Event_Count; @@ -104,6 +96,9 @@ static void intReceived(struct XmPciLpEvent *eventParm, struct pt_regs *regsParm) { int irq; +#ifdef CONFIG_IRQSTACKS + struct thread_info *curtp, *irqtp; +#endif ++Pci_Interrupt_Count; @@ -111,7 +106,20 @@ static void intReceived(struct XmPciLpEvent *eventParm, case XmPciLpEvent_SlotInterrupt: irq = eventParm->hvLpEvent.xCorrelationToken; /* Dispatch the interrupt handlers for this irq */ - ppc_irq_dispatch_handler(regsParm, irq); +#ifdef CONFIG_IRQSTACKS + /* Switch to the irq stack to handle this */ + curtp = current_thread_info(); + irqtp = hardirq_ctx[smp_processor_id()]; + if (curtp != irqtp) { + irqtp->task = curtp->task; + irqtp->flags = 0; + call___do_IRQ(irq, regsParm, irqtp); + irqtp->task = NULL; + if (irqtp->flags) + set_bits(irqtp->flags, &curtp->flags); + } else +#endif + __do_IRQ(irq, regsParm); HvCallPci_eoi(eventParm->eventData.slotInterrupt.busNumber, eventParm->eventData.slotInterrupt.subBusNumber, eventParm->eventData.slotInterrupt.deviceId); @@ -227,8 +235,6 @@ static void iSeries_enable_IRQ(unsigned int irq) /* Unmask secondary INTA */ mask = 0x80000000; HvCallPci_unmaskInterrupts(bus, subBus, deviceId, mask); - PPCDBG(PPCDBG_BUSWALK, "iSeries_enable_IRQ 0x%02X.%02X.%02X 0x%04X\n", - bus, subBus, deviceId, irq); } /* This is called by iSeries_activate_IRQs */ @@ -310,15 +316,11 @@ static void iSeries_disable_IRQ(unsigned int irq) /* Mask secondary INTA */ mask = 0x80000000; HvCallPci_maskInterrupts(bus, subBus, deviceId, mask); - PPCDBG(PPCDBG_BUSWALK, "iSeries_disable_IRQ 0x%02X.%02X.%02X 0x%04X\n", - bus, subBus, deviceId, irq); } /* - * Need to define this so ppc_irq_dispatch_handler will NOT call - * enable_IRQ at the end of interrupt handling. However, this does - * nothing because there is not enough information provided to do - * the EOI HvCall. This is done by XmPciLpEvent.c + * This does nothing because there is not enough information + * provided to do the EOI HvCall. This is done by XmPciLpEvent.c */ static void iSeries_end_IRQ(unsigned int irq) { @@ -341,26 +343,14 @@ static hw_irq_controller iSeries_IRQ_handler = { int __init iSeries_allocate_IRQ(HvBusNumber busNumber, HvSubBusNumber subBusNumber, HvAgentId deviceId) { - unsigned int realirq, virtirq; + int virtirq; + unsigned int realirq; u8 idsel = (deviceId >> 4); u8 function = deviceId & 7; - virtirq = next_virtual_irq++; realirq = ((busNumber - 1) << 6) + ((idsel - 1) << 3) + function; - virt_irq_to_real_map[virtirq] = realirq; + virtirq = virt_irq_create_mapping(realirq); irq_desc[virtirq].handler = &iSeries_IRQ_handler; return virtirq; } - -int virt_irq_create_mapping(unsigned int real_irq) -{ - BUG(); /* Don't call this on iSeries, yet */ - - return 0; -} - -void virt_irq_init(void) -{ - return; -} diff --git a/arch/powerpc/platforms/iseries/misc.S b/arch/powerpc/platforms/iseries/misc.S index 09f14522e17..dfe7aa1ba09 100644 --- a/arch/powerpc/platforms/iseries/misc.S +++ b/arch/powerpc/platforms/iseries/misc.S @@ -15,6 +15,7 @@ #include <asm/processor.h> #include <asm/asm-offsets.h> +#include <asm/ppc_asm.h> .text diff --git a/arch/powerpc/platforms/iseries/pci.c b/arch/powerpc/platforms/iseries/pci.c index 7d7d5884343..4b75131773a 100644 --- a/arch/powerpc/platforms/iseries/pci.c +++ b/arch/powerpc/platforms/iseries/pci.c @@ -32,7 +32,6 @@ #include <asm/prom.h> #include <asm/machdep.h> #include <asm/pci-bridge.h> -#include <asm/ppcdebug.h> #include <asm/iommu.h> #include <asm/abs_addr.h> @@ -207,10 +206,6 @@ static struct device_node *build_device_node(HvBusNumber Bus, struct device_node *node; struct pci_dn *pdn; - PPCDBG(PPCDBG_BUSWALK, - "-build_device_node 0x%02X.%02X.%02X Function: %02X\n", - Bus, SubBus, AgentId, Function); - node = kmalloc(sizeof(struct device_node), GFP_KERNEL); if (node == NULL) return NULL; @@ -243,8 +238,6 @@ unsigned long __init find_and_init_phbs(void) struct pci_controller *phb; HvBusNumber bus; - PPCDBG(PPCDBG_BUSWALK, "find_and_init_phbs Entry\n"); - /* Check all possible buses. */ for (bus = 0; bus < 256; bus++) { int ret = HvCallXm_testBus(bus); @@ -261,9 +254,6 @@ unsigned long __init find_and_init_phbs(void) phb->last_busno = bus; phb->ops = &iSeries_pci_ops; - PPCDBG(PPCDBG_BUSWALK, "PCI:Create iSeries pci_controller(%p), Bus: %04X\n", - phb, bus); - /* Find and connect the devices. */ scan_PHB_slots(phb); } @@ -285,11 +275,9 @@ unsigned long __init find_and_init_phbs(void) */ void iSeries_pcibios_init(void) { - PPCDBG(PPCDBG_BUSWALK, "iSeries_pcibios_init Entry.\n"); iomm_table_initialize(); find_and_init_phbs(); io_page_mask = -1; - PPCDBG(PPCDBG_BUSWALK, "iSeries_pcibios_init Exit.\n"); } /* @@ -301,8 +289,6 @@ void __init iSeries_pci_final_fixup(void) struct device_node *node; int DeviceCount = 0; - PPCDBG(PPCDBG_BUSWALK, "iSeries_pcibios_fixup Entry.\n"); - /* Fix up at the device node and pci_dev relationship */ mf_display_src(0xC9000100); @@ -316,9 +302,6 @@ void __init iSeries_pci_final_fixup(void) ++DeviceCount; pdev->sysdata = (void *)node; PCI_DN(node)->pcidev = pdev; - PPCDBG(PPCDBG_BUSWALK, - "pdev 0x%p <==> DevNode 0x%p\n", - pdev, node); allocate_device_bars(pdev); iSeries_Device_Information(pdev, DeviceCount); iommu_devnode_init_iSeries(node); @@ -333,13 +316,10 @@ void __init iSeries_pci_final_fixup(void) void pcibios_fixup_bus(struct pci_bus *PciBus) { - PPCDBG(PPCDBG_BUSWALK, "iSeries_pcibios_fixup_bus(0x%04X) Entry.\n", - PciBus->number); } void pcibios_fixup_resources(struct pci_dev *pdev) { - PPCDBG(PPCDBG_BUSWALK, "fixup_resources pdev %p\n", pdev); } /* @@ -401,9 +381,6 @@ static void scan_EADS_bridge(HvBusNumber bus, HvSubBusNumber SubBus, printk("found device at bus %d idsel %d func %d (AgentId %x)\n", bus, IdSel, Function, AgentId); /* Connect EADs: 0x18.00.12 = 0x00 */ - PPCDBG(PPCDBG_BUSWALK, - "PCI:Connect EADs: 0x%02X.%02X.%02X\n", - bus, SubBus, AgentId); HvRc = HvCallPci_getBusUnitInfo(bus, SubBus, AgentId, iseries_hv_addr(BridgeInfo), sizeof(struct HvCallPci_BridgeInfo)); @@ -414,14 +391,6 @@ static void scan_EADS_bridge(HvBusNumber bus, HvSubBusNumber SubBus, BridgeInfo->maxAgents, BridgeInfo->maxSubBusNumber, BridgeInfo->logicalSlotNumber); - PPCDBG(PPCDBG_BUSWALK, - "PCI: BridgeInfo, Type:0x%02X, SubBus:0x%02X, MaxAgents:0x%02X, MaxSubBus: 0x%02X, LSlot: 0x%02X\n", - BridgeInfo->busUnitInfo.deviceType, - BridgeInfo->subBusNumber, - BridgeInfo->maxAgents, - BridgeInfo->maxSubBusNumber, - BridgeInfo->logicalSlotNumber); - if (BridgeInfo->busUnitInfo.deviceType == HvCallPci_BridgeDevice) { /* Scan_Bridge_Slot...: 0x18.00.12 */ @@ -454,9 +423,6 @@ static int scan_bridge_slot(HvBusNumber Bus, /* iSeries_allocate_IRQ.: 0x18.00.12(0xA3) */ Irq = iSeries_allocate_IRQ(Bus, 0, EADsIdSel); - PPCDBG(PPCDBG_BUSWALK, - "PCI:- allocate and assign IRQ 0x%02X.%02X.%02X = 0x%02X\n", - Bus, 0, EADsIdSel, Irq); /* * Connect all functions of any device found. @@ -482,9 +448,6 @@ static int scan_bridge_slot(HvBusNumber Bus, printk("read vendor ID: %x\n", VendorId); /* FoundDevice: 0x18.28.10 = 0x12AE */ - PPCDBG(PPCDBG_BUSWALK, - "PCI:- FoundDevice: 0x%02X.%02X.%02X = 0x%04X, irq %d\n", - Bus, SubBus, AgentId, VendorId, Irq); HvRc = HvCallPci_configStore8(Bus, SubBus, AgentId, PCI_INTERRUPT_LINE, Irq); if (HvRc != 0) diff --git a/arch/powerpc/platforms/iseries/setup.c b/arch/powerpc/platforms/iseries/setup.c index fda712b4216..da26639190d 100644 --- a/arch/powerpc/platforms/iseries/setup.c +++ b/arch/powerpc/platforms/iseries/setup.c @@ -39,7 +39,7 @@ #include <asm/sections.h> #include <asm/iommu.h> #include <asm/firmware.h> - +#include <asm/system.h> #include <asm/time.h> #include <asm/paca.h> #include <asm/cache.h> @@ -71,9 +71,7 @@ extern void hvlog(char *fmt, ...); #endif /* Function Prototypes */ -extern void ppcdbg_initialize(void); - -static void build_iSeries_Memory_Map(void); +static unsigned long build_iSeries_Memory_Map(void); static void iseries_shared_idle(void); static void iseries_dedicated_idle(void); #ifdef CONFIG_PCI @@ -86,7 +84,6 @@ static void iSeries_pci_final_fixup(void) { } int piranha_simulator; extern int rd_size; /* Defined in drivers/block/rd.c */ -extern unsigned long klimit; extern unsigned long embedded_sysmap_start; extern unsigned long embedded_sysmap_end; @@ -309,8 +306,6 @@ static void __init iSeries_init_early(void) ppc64_firmware_features = FW_FEATURE_ISERIES; - ppcdbg_initialize(); - ppc64_interrupt_controller = IC_ISERIES; #if defined(CONFIG_BLK_DEV_INITRD) @@ -320,11 +315,11 @@ static void __init iSeries_init_early(void) */ if (naca.xRamDisk) { initrd_start = (unsigned long)__va(naca.xRamDisk); - initrd_end = initrd_start + naca.xRamDiskSize * PAGE_SIZE; + initrd_end = initrd_start + naca.xRamDiskSize * HW_PAGE_SIZE; initrd_below_start_ok = 1; // ramdisk in kernel space ROOT_DEV = Root_RAM0; - if (((rd_size * 1024) / PAGE_SIZE) < naca.xRamDiskSize) - rd_size = (naca.xRamDiskSize * PAGE_SIZE) / 1024; + if (((rd_size * 1024) / HW_PAGE_SIZE) < naca.xRamDiskSize) + rd_size = (naca.xRamDiskSize * HW_PAGE_SIZE) / 1024; } else #endif /* CONFIG_BLK_DEV_INITRD */ { @@ -407,9 +402,11 @@ void mschunks_alloc(unsigned long num_chunks) * a table used to translate Linux's physical addresses to these * absolute addresses. Absolute addresses are needed when * communicating with the hypervisor (e.g. to build HPT entries) + * + * Returns the physical memory size */ -static void __init build_iSeries_Memory_Map(void) +static unsigned long __init build_iSeries_Memory_Map(void) { u32 loadAreaFirstChunk, loadAreaLastChunk, loadAreaSize; u32 nextPhysChunk; @@ -470,13 +467,14 @@ static void __init build_iSeries_Memory_Map(void) */ hptFirstChunk = (u32)addr_to_chunk(HvCallHpt_getHptAddress()); hptSizePages = (u32)HvCallHpt_getHptPages(); - hptSizeChunks = hptSizePages >> (MSCHUNKS_CHUNK_SHIFT - PAGE_SHIFT); + hptSizeChunks = hptSizePages >> + (MSCHUNKS_CHUNK_SHIFT - HW_PAGE_SHIFT); hptLastChunk = hptFirstChunk + hptSizeChunks - 1; printk("HPT absolute addr = %016lx, size = %dK\n", chunk_to_addr(hptFirstChunk), hptSizeChunks * 256); - ppc64_pft_size = __ilog2(hptSizePages * PAGE_SIZE); + ppc64_pft_size = __ilog2(hptSizePages * HW_PAGE_SIZE); /* * The actual hashed page table is in the hypervisor, @@ -541,7 +539,7 @@ static void __init build_iSeries_Memory_Map(void) * which should be equal to * nextPhysChunk */ - systemcfg->physicalMemorySize = chunk_to_addr(nextPhysChunk); + return chunk_to_addr(nextPhysChunk); } /* @@ -549,8 +547,6 @@ static void __init build_iSeries_Memory_Map(void) */ static void __init iSeries_setup_arch(void) { - unsigned procIx = get_paca()->lppaca.dyn_hv_phys_proc_index; - if (get_paca()->lppaca.shared_proc) { ppc_md.idle_loop = iseries_shared_idle; printk(KERN_INFO "Using shared processor idle loop\n"); @@ -566,9 +562,6 @@ static void __init iSeries_setup_arch(void) itVpdAreas.xSlicMaxLogicalProcs); printk("Max physical processors = %d\n", itVpdAreas.xSlicMaxPhysicalProcs); - - systemcfg->processor = xIoHriProcessorVpd[procIx].xPVR; - printk("Processor version = %x\n", systemcfg->processor); } static void iSeries_show_cpuinfo(struct seq_file *m) @@ -629,7 +622,7 @@ static void __init iSeries_fixup_klimit(void) */ if (naca.xRamDisk) klimit = KERNELBASE + (u64)naca.xRamDisk + - (naca.xRamDiskSize * PAGE_SIZE); + (naca.xRamDiskSize * HW_PAGE_SIZE); else { /* * No ram disk was included - check and see if there @@ -697,20 +690,18 @@ static void iseries_shared_idle(void) if (hvlpevent_is_pending()) process_iSeries_events(); + preempt_enable_no_resched(); schedule(); + preempt_disable(); } } static void iseries_dedicated_idle(void) { - long oldval; + set_thread_flag(TIF_POLLING_NRFLAG); while (1) { - oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED); - - if (!oldval) { - set_thread_flag(TIF_POLLING_NRFLAG); - + if (!need_resched()) { while (!need_resched()) { ppc64_runlatch_off(); HMT_low(); @@ -723,13 +714,12 @@ static void iseries_dedicated_idle(void) } HMT_medium(); - clear_thread_flag(TIF_POLLING_NRFLAG); - } else { - set_need_resched(); } ppc64_runlatch_on(); + preempt_enable_no_resched(); schedule(); + preempt_disable(); } } @@ -934,7 +924,7 @@ void dt_cpus(struct iseries_flat_dt *dt) dt_end_node(dt); } -void build_flat_dt(struct iseries_flat_dt *dt) +void build_flat_dt(struct iseries_flat_dt *dt, unsigned long phys_mem_size) { u64 tmp[2]; @@ -950,7 +940,7 @@ void build_flat_dt(struct iseries_flat_dt *dt) dt_prop_str(dt, "name", "memory"); dt_prop_str(dt, "device_type", "memory"); tmp[0] = 0; - tmp[1] = systemcfg->physicalMemorySize; + tmp[1] = phys_mem_size; dt_prop_u64_list(dt, "reg", tmp, 2); dt_end_node(dt); @@ -970,13 +960,15 @@ void build_flat_dt(struct iseries_flat_dt *dt) void * __init iSeries_early_setup(void) { + unsigned long phys_mem_size; + iSeries_fixup_klimit(); /* * Initialize the table which translate Linux physical addresses to * AS/400 absolute addresses */ - build_iSeries_Memory_Map(); + phys_mem_size = build_iSeries_Memory_Map(); iSeries_get_cmdline(); @@ -986,7 +978,7 @@ void * __init iSeries_early_setup(void) /* Parse early parameters, in particular mem=x */ parse_early_param(); - build_flat_dt(&iseries_dt); + build_flat_dt(&iseries_dt, phys_mem_size); return (void *) __pa(&iseries_dt); } diff --git a/arch/powerpc/platforms/iseries/smp.c b/arch/powerpc/platforms/iseries/smp.c index 3336bad6772..fcb094ec6ae 100644 --- a/arch/powerpc/platforms/iseries/smp.c +++ b/arch/powerpc/platforms/iseries/smp.c @@ -40,7 +40,6 @@ #include <asm/paca.h> #include <asm/iseries/hv_call.h> #include <asm/time.h> -#include <asm/ppcdebug.h> #include <asm/machdep.h> #include <asm/cputable.h> #include <asm/system.h> diff --git a/arch/powerpc/platforms/iseries/vio.c b/arch/powerpc/platforms/iseries/vio.c index c27a66876c2..384360ee06e 100644 --- a/arch/powerpc/platforms/iseries/vio.c +++ b/arch/powerpc/platforms/iseries/vio.c @@ -30,41 +30,14 @@ static struct iommu_table vio_iommu_table; static void __init iommu_vio_init(void) { - struct iommu_table *t; - struct iommu_table_cb cb; - unsigned long cbp; - unsigned long itc_entries; + iommu_table_getparms_iSeries(255, 0, 0xff, &veth_iommu_table); + veth_iommu_table.it_size /= 2; + vio_iommu_table = veth_iommu_table; + vio_iommu_table.it_offset += veth_iommu_table.it_size; - cb.itc_busno = 255; /* Bus 255 is the virtual bus */ - cb.itc_virtbus = 0xff; /* Ask for virtual bus */ - - cbp = virt_to_abs(&cb); - HvCallXm_getTceTableParms(cbp); - - itc_entries = cb.itc_size * PAGE_SIZE / sizeof(union tce_entry); - veth_iommu_table.it_size = itc_entries / 2; - veth_iommu_table.it_busno = cb.itc_busno; - veth_iommu_table.it_offset = cb.itc_offset; - veth_iommu_table.it_index = cb.itc_index; - veth_iommu_table.it_type = TCE_VB; - veth_iommu_table.it_blocksize = 1; - - t = iommu_init_table(&veth_iommu_table); - - if (!t) + if (!iommu_init_table(&veth_iommu_table)) printk("Virtual Bus VETH TCE table failed.\n"); - - vio_iommu_table.it_size = itc_entries - veth_iommu_table.it_size; - vio_iommu_table.it_busno = cb.itc_busno; - vio_iommu_table.it_offset = cb.itc_offset + - veth_iommu_table.it_size; - vio_iommu_table.it_index = cb.itc_index; - vio_iommu_table.it_type = TCE_VB; - vio_iommu_table.it_blocksize = 1; - - t = iommu_init_table(&vio_iommu_table); - - if (!t) + if (!iommu_init_table(&vio_iommu_table)) printk("Virtual Bus VIO TCE table failed.\n"); } diff --git a/arch/powerpc/platforms/iseries/viopath.c b/arch/powerpc/platforms/iseries/viopath.c index fe97bfbf746..84267269559 100644 --- a/arch/powerpc/platforms/iseries/viopath.c +++ b/arch/powerpc/platforms/iseries/viopath.c @@ -68,7 +68,8 @@ static DEFINE_SPINLOCK(statuslock); * For each kind of event we allocate a buffer that is * guaranteed not to cross a page boundary */ -static unsigned char event_buffer[VIO_MAX_SUBTYPES * 256] __page_aligned; +static unsigned char event_buffer[VIO_MAX_SUBTYPES * 256] + __attribute__((__aligned__(4096))); static atomic_t event_buffer_available[VIO_MAX_SUBTYPES]; static int event_buffer_initialised; @@ -116,12 +117,12 @@ static int proc_viopath_show(struct seq_file *m, void *v) HvLpEvent_Rc hvrc; DECLARE_MUTEX_LOCKED(Semaphore); - buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + buf = kmalloc(HW_PAGE_SIZE, GFP_KERNEL); if (!buf) return 0; - memset(buf, 0, PAGE_SIZE); + memset(buf, 0, HW_PAGE_SIZE); - handle = dma_map_single(iSeries_vio_dev, buf, PAGE_SIZE, + handle = dma_map_single(iSeries_vio_dev, buf, HW_PAGE_SIZE, DMA_FROM_DEVICE); hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp, @@ -131,7 +132,7 @@ static int proc_viopath_show(struct seq_file *m, void *v) viopath_sourceinst(viopath_hostLp), viopath_targetinst(viopath_hostLp), (u64)(unsigned long)&Semaphore, VIOVERSION << 16, - ((u64)handle) << 32, PAGE_SIZE, 0, 0); + ((u64)handle) << 32, HW_PAGE_SIZE, 0, 0); if (hvrc != HvLpEvent_Rc_Good) printk(VIOPATH_KERN_WARN "hv error on op %d\n", (int)hvrc); @@ -140,7 +141,7 @@ static int proc_viopath_show(struct seq_file *m, void *v) vlanMap = HvLpConfig_getVirtualLanIndexMap(); - buf[PAGE_SIZE-1] = '\0'; + buf[HW_PAGE_SIZE-1] = '\0'; seq_printf(m, "%s", buf); seq_printf(m, "AVAILABLE_VETH=%x\n", vlanMap); seq_printf(m, "SRLNBR=%c%c%c%c%c%c%c\n", @@ -152,7 +153,8 @@ static int proc_viopath_show(struct seq_file *m, void *v) e2a(xItExtVpdPanel.systemSerial[4]), e2a(xItExtVpdPanel.systemSerial[5])); - dma_unmap_single(iSeries_vio_dev, handle, PAGE_SIZE, DMA_FROM_DEVICE); + dma_unmap_single(iSeries_vio_dev, handle, HW_PAGE_SIZE, + DMA_FROM_DEVICE); kfree(buf); return 0; diff --git a/arch/powerpc/platforms/maple/pci.c b/arch/powerpc/platforms/maple/pci.c index 340c21caeae..895aeb3f75d 100644 --- a/arch/powerpc/platforms/maple/pci.c +++ b/arch/powerpc/platforms/maple/pci.c @@ -380,9 +380,6 @@ void __init maple_pcibios_fixup(void) for_each_pci_dev(dev) pci_read_irq_line(dev); - /* Do the mapping of the IO space */ - phbs_remap_io(); - DBG(" <- maple_pcibios_fixup\n"); } diff --git a/arch/powerpc/platforms/powermac/Makefile b/arch/powerpc/platforms/powermac/Makefile index 4369676f1d5..c9df44fcf57 100644 --- a/arch/powerpc/platforms/powermac/Makefile +++ b/arch/powerpc/platforms/powermac/Makefile @@ -1,7 +1,8 @@ obj-y += pic.o setup.o time.o feature.o pci.o \ sleep.o low_i2c.o cache.o obj-$(CONFIG_PMAC_BACKLIGHT) += backlight.o -obj-$(CONFIG_CPU_FREQ_PMAC) += cpufreq.o +obj-$(CONFIG_CPU_FREQ_PMAC) += cpufreq_32.o +obj-$(CONFIG_CPU_FREQ_PMAC64) += cpufreq_64.o obj-$(CONFIG_NVRAM) += nvram.o # ppc64 pmac doesn't define CONFIG_NVRAM but needs nvram stuff obj-$(CONFIG_PPC64) += nvram.o diff --git a/arch/powerpc/platforms/powermac/cpufreq.c b/arch/powerpc/platforms/powermac/cpufreq_32.c index c47f8b69725..56fd4e05fed 100644 --- a/arch/powerpc/platforms/powermac/cpufreq.c +++ b/arch/powerpc/platforms/powermac/cpufreq_32.c @@ -397,18 +397,16 @@ static int pmac_cpufreq_target( struct cpufreq_policy *policy, unsigned int relation) { unsigned int newstate = 0; + int rc; if (cpufreq_frequency_table_target(policy, pmac_cpu_freqs, target_freq, relation, &newstate)) return -EINVAL; - return do_set_cpu_speed(newstate, 1); -} + rc = do_set_cpu_speed(newstate, 1); -unsigned int pmac_get_one_cpufreq(int i) -{ - /* Supports only one CPU for now */ - return (i == 0) ? cur_freq : 0; + ppc_proc_freq = cur_freq * 1000ul; + return rc; } static int pmac_cpufreq_cpu_init(struct cpufreq_policy *policy) @@ -474,6 +472,8 @@ static int pmac_cpufreq_resume(struct cpufreq_policy *policy) do_set_cpu_speed(sleep_freq == low_freq ? CPUFREQ_LOW : CPUFREQ_HIGH, 0); + ppc_proc_freq = cur_freq * 1000ul; + no_schedule = 0; return 0; } @@ -547,7 +547,7 @@ static int pmac_cpufreq_init_MacRISC3(struct device_node *cpunode) */ if (low_freq < 98000000) low_freq = 101000000; - + /* Convert those to CPU core clocks */ low_freq = (low_freq * (*ratio)) / 2000; hi_freq = (hi_freq * (*ratio)) / 2000; @@ -714,6 +714,7 @@ out: pmac_cpu_freqs[CPUFREQ_LOW].frequency = low_freq; pmac_cpu_freqs[CPUFREQ_HIGH].frequency = hi_freq; + ppc_proc_freq = cur_freq * 1000ul; printk(KERN_INFO "Registering PowerMac CPU frequency driver\n"); printk(KERN_INFO "Low: %d Mhz, High: %d Mhz, Boot: %d Mhz\n", diff --git a/arch/powerpc/platforms/powermac/cpufreq_64.c b/arch/powerpc/platforms/powermac/cpufreq_64.c new file mode 100644 index 00000000000..39150342c6f --- /dev/null +++ b/arch/powerpc/platforms/powermac/cpufreq_64.c @@ -0,0 +1,323 @@ +/* + * Copyright (C) 2002 - 2005 Benjamin Herrenschmidt <benh@kernel.crashing.org> + * and Markus Demleitner <msdemlei@cl.uni-heidelberg.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This driver adds basic cpufreq support for SMU & 970FX based G5 Macs, + * that is iMac G5 and latest single CPU desktop. + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/delay.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/cpufreq.h> +#include <linux/init.h> +#include <linux/completion.h> +#include <asm/prom.h> +#include <asm/machdep.h> +#include <asm/irq.h> +#include <asm/sections.h> +#include <asm/cputable.h> +#include <asm/time.h> +#include <asm/smu.h> + +#undef DEBUG + +#ifdef DEBUG +#define DBG(fmt...) printk(fmt) +#else +#define DBG(fmt...) +#endif + +/* see 970FX user manual */ + +#define SCOM_PCR 0x0aa001 /* PCR scom addr */ + +#define PCR_HILO_SELECT 0x80000000U /* 1 = PCR, 0 = PCRH */ +#define PCR_SPEED_FULL 0x00000000U /* 1:1 speed value */ +#define PCR_SPEED_HALF 0x00020000U /* 1:2 speed value */ +#define PCR_SPEED_QUARTER 0x00040000U /* 1:4 speed value */ +#define PCR_SPEED_MASK 0x000e0000U /* speed mask */ +#define PCR_SPEED_SHIFT 17 +#define PCR_FREQ_REQ_VALID 0x00010000U /* freq request valid */ +#define PCR_VOLT_REQ_VALID 0x00008000U /* volt request valid */ +#define PCR_TARGET_TIME_MASK 0x00006000U /* target time */ +#define PCR_STATLAT_MASK 0x00001f00U /* STATLAT value */ +#define PCR_SNOOPLAT_MASK 0x000000f0U /* SNOOPLAT value */ +#define PCR_SNOOPACC_MASK 0x0000000fU /* SNOOPACC value */ + +#define SCOM_PSR 0x408001 /* PSR scom addr */ +/* warning: PSR is a 64 bits register */ +#define PSR_CMD_RECEIVED 0x2000000000000000U /* command received */ +#define PSR_CMD_COMPLETED 0x1000000000000000U /* command completed */ +#define PSR_CUR_SPEED_MASK 0x0300000000000000U /* current speed */ +#define PSR_CUR_SPEED_SHIFT (56) + +/* + * The G5 only supports two frequencies (Quarter speed is not supported) + */ +#define CPUFREQ_HIGH 0 +#define CPUFREQ_LOW 1 + +static struct cpufreq_frequency_table g5_cpu_freqs[] = { + {CPUFREQ_HIGH, 0}, + {CPUFREQ_LOW, 0}, + {0, CPUFREQ_TABLE_END}, +}; + +static struct freq_attr* g5_cpu_freqs_attr[] = { + &cpufreq_freq_attr_scaling_available_freqs, + NULL, +}; + +/* Power mode data is an array of the 32 bits PCR values to use for + * the various frequencies, retreived from the device-tree + */ +static u32 *g5_pmode_data; +static int g5_pmode_max; +static int g5_pmode_cur; + +static DECLARE_MUTEX(g5_switch_mutex); + + +static struct smu_sdbp_fvt *g5_fvt_table; /* table of op. points */ +static int g5_fvt_count; /* number of op. points */ +static int g5_fvt_cur; /* current op. point */ + +/* ----------------- real hardware interface */ + +static void g5_switch_volt(int speed_mode) +{ + struct smu_simple_cmd cmd; + + DECLARE_COMPLETION(comp); + smu_queue_simple(&cmd, SMU_CMD_POWER_COMMAND, 8, smu_done_complete, + &comp, 'V', 'S', 'L', 'E', 'W', + 0xff, g5_fvt_cur+1, speed_mode); + wait_for_completion(&comp); +} + +static int g5_switch_freq(int speed_mode) +{ + struct cpufreq_freqs freqs; + int to; + + if (g5_pmode_cur == speed_mode) + return 0; + + down(&g5_switch_mutex); + + freqs.old = g5_cpu_freqs[g5_pmode_cur].frequency; + freqs.new = g5_cpu_freqs[speed_mode].frequency; + freqs.cpu = 0; + + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + + /* If frequency is going up, first ramp up the voltage */ + if (speed_mode < g5_pmode_cur) + g5_switch_volt(speed_mode); + + /* Clear PCR high */ + scom970_write(SCOM_PCR, 0); + /* Clear PCR low */ + scom970_write(SCOM_PCR, PCR_HILO_SELECT | 0); + /* Set PCR low */ + scom970_write(SCOM_PCR, PCR_HILO_SELECT | + g5_pmode_data[speed_mode]); + + /* Wait for completion */ + for (to = 0; to < 10; to++) { + unsigned long psr = scom970_read(SCOM_PSR); + + if ((psr & PSR_CMD_RECEIVED) == 0 && + (((psr >> PSR_CUR_SPEED_SHIFT) ^ + (g5_pmode_data[speed_mode] >> PCR_SPEED_SHIFT)) & 0x3) + == 0) + break; + if (psr & PSR_CMD_COMPLETED) + break; + udelay(100); + } + + /* If frequency is going down, last ramp the voltage */ + if (speed_mode > g5_pmode_cur) + g5_switch_volt(speed_mode); + + g5_pmode_cur = speed_mode; + ppc_proc_freq = g5_cpu_freqs[speed_mode].frequency * 1000ul; + + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + + up(&g5_switch_mutex); + + return 0; +} + +static int g5_query_freq(void) +{ + unsigned long psr = scom970_read(SCOM_PSR); + int i; + + for (i = 0; i <= g5_pmode_max; i++) + if ((((psr >> PSR_CUR_SPEED_SHIFT) ^ + (g5_pmode_data[i] >> PCR_SPEED_SHIFT)) & 0x3) == 0) + break; + return i; +} + +/* ----------------- cpufreq bookkeeping */ + +static int g5_cpufreq_verify(struct cpufreq_policy *policy) +{ + return cpufreq_frequency_table_verify(policy, g5_cpu_freqs); +} + +static int g5_cpufreq_target(struct cpufreq_policy *policy, + unsigned int target_freq, unsigned int relation) +{ + unsigned int newstate = 0; + + if (cpufreq_frequency_table_target(policy, g5_cpu_freqs, + target_freq, relation, &newstate)) + return -EINVAL; + + return g5_switch_freq(newstate); +} + +static unsigned int g5_cpufreq_get_speed(unsigned int cpu) +{ + return g5_cpu_freqs[g5_pmode_cur].frequency; +} + +static int g5_cpufreq_cpu_init(struct cpufreq_policy *policy) +{ + if (policy->cpu != 0) + return -ENODEV; + + policy->governor = CPUFREQ_DEFAULT_GOVERNOR; + policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; + policy->cur = g5_cpu_freqs[g5_query_freq()].frequency; + cpufreq_frequency_table_get_attr(g5_cpu_freqs, policy->cpu); + + return cpufreq_frequency_table_cpuinfo(policy, + g5_cpu_freqs); +} + + +static struct cpufreq_driver g5_cpufreq_driver = { + .name = "powermac", + .owner = THIS_MODULE, + .flags = CPUFREQ_CONST_LOOPS, + .init = g5_cpufreq_cpu_init, + .verify = g5_cpufreq_verify, + .target = g5_cpufreq_target, + .get = g5_cpufreq_get_speed, + .attr = g5_cpu_freqs_attr, +}; + + +static int __init g5_cpufreq_init(void) +{ + struct device_node *cpunode; + unsigned int psize, ssize; + struct smu_sdbp_header *shdr; + unsigned long max_freq; + u32 *valp; + int rc = -ENODEV; + + /* Look for CPU and SMU nodes */ + cpunode = of_find_node_by_type(NULL, "cpu"); + if (!cpunode) { + DBG("No CPU node !\n"); + return -ENODEV; + } + + /* Check 970FX for now */ + valp = (u32 *)get_property(cpunode, "cpu-version", NULL); + if (!valp) { + DBG("No cpu-version property !\n"); + goto bail_noprops; + } + if (((*valp) >> 16) != 0x3c) { + DBG("Wrong CPU version: %08x\n", *valp); + goto bail_noprops; + } + + /* Look for the powertune data in the device-tree */ + g5_pmode_data = (u32 *)get_property(cpunode, "power-mode-data",&psize); + if (!g5_pmode_data) { + DBG("No power-mode-data !\n"); + goto bail_noprops; + } + g5_pmode_max = psize / sizeof(u32) - 1; + + /* Look for the FVT table */ + shdr = smu_get_sdb_partition(SMU_SDB_FVT_ID, NULL); + if (!shdr) + goto bail_noprops; + g5_fvt_table = (struct smu_sdbp_fvt *)&shdr[1]; + ssize = (shdr->len * sizeof(u32)) - sizeof(struct smu_sdbp_header); + g5_fvt_count = ssize / sizeof(struct smu_sdbp_fvt); + g5_fvt_cur = 0; + + /* Sanity checking */ + if (g5_fvt_count < 1 || g5_pmode_max < 1) + goto bail_noprops; + + /* + * From what I see, clock-frequency is always the maximal frequency. + * The current driver can not slew sysclk yet, so we really only deal + * with powertune steps for now. We also only implement full freq and + * half freq in this version. So far, I haven't yet seen a machine + * supporting anything else. + */ + valp = (u32 *)get_property(cpunode, "clock-frequency", NULL); + if (!valp) + return -ENODEV; + max_freq = (*valp)/1000; + g5_cpu_freqs[0].frequency = max_freq; + g5_cpu_freqs[1].frequency = max_freq/2; + + /* Check current frequency */ + g5_pmode_cur = g5_query_freq(); + if (g5_pmode_cur > 1) + /* We don't support anything but 1:1 and 1:2, fixup ... */ + g5_pmode_cur = 1; + + /* Force apply current frequency to make sure everything is in + * sync (voltage is right for example). Firmware may leave us with + * a strange setting ... + */ + g5_switch_freq(g5_pmode_cur); + + printk(KERN_INFO "Registering G5 CPU frequency driver\n"); + printk(KERN_INFO "Low: %d Mhz, High: %d Mhz, Cur: %d MHz\n", + g5_cpu_freqs[1].frequency/1000, + g5_cpu_freqs[0].frequency/1000, + g5_cpu_freqs[g5_pmode_cur].frequency/1000); + + rc = cpufreq_register_driver(&g5_cpufreq_driver); + + /* We keep the CPU node on hold... hopefully, Apple G5 don't have + * hotplug CPU with a dynamic device-tree ... + */ + return rc; + + bail_noprops: + of_node_put(cpunode); + + return rc; +} + +module_init(g5_cpufreq_init); + + +MODULE_LICENSE("GPL"); diff --git a/arch/powerpc/platforms/powermac/pci.c b/arch/powerpc/platforms/powermac/pci.c index 8f818d092e2..dfd41b9781a 100644 --- a/arch/powerpc/platforms/powermac/pci.c +++ b/arch/powerpc/platforms/powermac/pci.c @@ -918,9 +918,6 @@ void __init pmac_pci_init(void) PCI_DN(np)->busno = 0xf0; } - /* map in PCI I/O space */ - phbs_remap_io(); - /* pmac_check_ht_link(); */ /* Tell pci.c to not use the common resource allocation mechanism */ diff --git a/arch/powerpc/platforms/powermac/pic.c b/arch/powerpc/platforms/powermac/pic.c index 83a49e80ac2..90040c49494 100644 --- a/arch/powerpc/platforms/powermac/pic.c +++ b/arch/powerpc/platforms/powermac/pic.c @@ -74,6 +74,9 @@ static DEFINE_SPINLOCK(pmac_pic_lock); #define GATWICK_IRQ_POOL_SIZE 10 static struct interrupt_info gatwick_int_pool[GATWICK_IRQ_POOL_SIZE]; +#define NR_MASK_WORDS ((NR_IRQS + 31) / 32) +static unsigned long ppc_lost_interrupts[NR_MASK_WORDS]; + /* * Mark an irq as "lost". This is only used on the pmac * since it can lose interrupts (see pmac_set_irq_mask). diff --git a/arch/powerpc/platforms/powermac/setup.c b/arch/powerpc/platforms/powermac/setup.c index 80b58c1ec41..7acb0546671 100644 --- a/arch/powerpc/platforms/powermac/setup.c +++ b/arch/powerpc/platforms/powermac/setup.c @@ -193,18 +193,6 @@ static void pmac_show_cpuinfo(struct seq_file *m) pmac_newworld ? "NewWorld" : "OldWorld"); } -static void pmac_show_percpuinfo(struct seq_file *m, int i) -{ -#ifdef CONFIG_CPU_FREQ_PMAC - extern unsigned int pmac_get_one_cpufreq(int i); - unsigned int freq = pmac_get_one_cpufreq(i); - if (freq != 0) { - seq_printf(m, "clock\t\t: %dMHz\n", freq/1000); - return; - } -#endif /* CONFIG_CPU_FREQ_PMAC */ -} - #ifndef CONFIG_ADB_CUDA int find_via_cuda(void) { @@ -767,7 +755,6 @@ struct machdep_calls __initdata pmac_md = { .setup_arch = pmac_setup_arch, .init_early = pmac_init_early, .show_cpuinfo = pmac_show_cpuinfo, - .show_percpuinfo = pmac_show_percpuinfo, .init_IRQ = pmac_pic_init, .get_irq = mpic_get_irq, /* changed later */ .pcibios_fixup = pmac_pcibios_fixup, diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c index e1f9443cc87..957b0910342 100644 --- a/arch/powerpc/platforms/powermac/smp.c +++ b/arch/powerpc/platforms/powermac/smp.c @@ -305,9 +305,19 @@ static int __init smp_psurge_probe(void) psurge_start = ioremap(PSURGE_START, 4); psurge_pri_intr = ioremap(PSURGE_PRI_INTR, 4); - /* this is not actually strictly necessary -- paulus. */ - for (i = 1; i < ncpus; ++i) - smp_hw_index[i] = i; + /* + * This is necessary because OF doesn't know about the + * secondary cpu(s), and thus there aren't nodes in the + * device tree for them, and smp_setup_cpu_maps hasn't + * set their bits in cpu_possible_map and cpu_present_map. + */ + if (ncpus > NR_CPUS) + ncpus = NR_CPUS; + for (i = 1; i < ncpus ; ++i) { + cpu_set(i, cpu_present_map); + cpu_set(i, cpu_possible_map); + set_hard_smp_processor_id(i, i); + } if (ppc_md.progress) ppc_md.progress("smp_psurge_probe - done", 0x352); @@ -348,6 +358,7 @@ static void __init psurge_dual_sync_tb(int cpu_nr) int t; set_dec(tb_ticks_per_jiffy); + /* XXX fixme */ set_tb(0, 0); last_jiffy_stamp(cpu_nr) = 0; @@ -363,8 +374,6 @@ static void __init psurge_dual_sync_tb(int cpu_nr) /* now interrupt the secondary, starting both TBs */ psurge_set_ipi(1); - - smp_tb_synchronized = 1; } static struct irqaction psurge_irqaction = { @@ -625,9 +634,8 @@ void smp_core99_give_timebase(void) for (t = 100000; t > 0 && sec_tb_reset; --t) udelay(10); if (sec_tb_reset) + /* XXX BUG_ON here? */ printk(KERN_WARNING "Timeout waiting sync(2) on second CPU\n"); - else - smp_tb_synchronized = 1; /* Now, restart the timebase by leaving the GPIO to an open collector */ pmac_call_feature(PMAC_FTR_WRITE_GPIO, NULL, core99_tb_gpio, 0); @@ -810,19 +818,9 @@ static void __devinit smp_core99_setup_cpu(int cpu_nr) } -/* Core99 Macs (dual G4s and G5s) */ -struct smp_ops_t core99_smp_ops = { - .message_pass = smp_mpic_message_pass, - .probe = smp_core99_probe, - .kick_cpu = smp_core99_kick_cpu, - .setup_cpu = smp_core99_setup_cpu, - .give_timebase = smp_core99_give_timebase, - .take_timebase = smp_core99_take_timebase, -}; - #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PPC32) -int __cpu_disable(void) +int smp_core99_cpu_disable(void) { cpu_clear(smp_processor_id(), cpu_online_map); @@ -846,7 +844,7 @@ void cpu_die(void) low_cpu_die(); } -void __cpu_die(unsigned int cpu) +void smp_core99_cpu_die(unsigned int cpu) { int timeout; @@ -858,8 +856,21 @@ void __cpu_die(unsigned int cpu) } msleep(1); } - cpu_callin_map[cpu] = 0; cpu_dead[cpu] = 0; } #endif + +/* Core99 Macs (dual G4s and G5s) */ +struct smp_ops_t core99_smp_ops = { + .message_pass = smp_mpic_message_pass, + .probe = smp_core99_probe, + .kick_cpu = smp_core99_kick_cpu, + .setup_cpu = smp_core99_setup_cpu, + .give_timebase = smp_core99_give_timebase, + .take_timebase = smp_core99_take_timebase, +#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PPC32) + .cpu_disable = smp_core99_cpu_disable, + .cpu_die = smp_core99_cpu_die, +#endif +}; diff --git a/arch/powerpc/platforms/powermac/time.c b/arch/powerpc/platforms/powermac/time.c index 5947b21a858..feb0a94e781 100644 --- a/arch/powerpc/platforms/powermac/time.c +++ b/arch/powerpc/platforms/powermac/time.c @@ -102,7 +102,7 @@ static unsigned long from_rtc_time(struct rtc_time *tm) static unsigned long cuda_get_time(void) { struct adb_request req; - unsigned long now; + unsigned int now; if (cuda_request(&req, NULL, 2, CUDA_PACKET, CUDA_GET_TIME) < 0) return 0; @@ -113,7 +113,7 @@ static unsigned long cuda_get_time(void) req.reply_len); now = (req.reply[3] << 24) + (req.reply[4] << 16) + (req.reply[5] << 8) + req.reply[6]; - return now - RTC_OFFSET; + return ((unsigned long)now) - RTC_OFFSET; } #define cuda_get_rtc_time(tm) to_rtc_time(cuda_get_time(), (tm)) @@ -146,7 +146,7 @@ static int cuda_set_rtc_time(struct rtc_time *tm) static unsigned long pmu_get_time(void) { struct adb_request req; - unsigned long now; + unsigned int now; if (pmu_request(&req, NULL, 1, PMU_READ_RTC) < 0) return 0; @@ -156,7 +156,7 @@ static unsigned long pmu_get_time(void) req.reply_len); now = (req.reply[0] << 24) + (req.reply[1] << 16) + (req.reply[2] << 8) + req.reply[3]; - return now - RTC_OFFSET; + return ((unsigned long)now) - RTC_OFFSET; } #define pmu_get_rtc_time(tm) to_rtc_time(pmu_get_time(), (tm)) @@ -199,6 +199,7 @@ static unsigned long smu_get_time(void) #define smu_set_rtc_time(tm, spin) 0 #endif +/* Can't be __init, it's called when suspending and resuming */ unsigned long pmac_get_boot_time(void) { /* Get the time from the RTC, used only at boot time */ diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile index b9938fece78..06d5ef50121 100644 --- a/arch/powerpc/platforms/pseries/Makefile +++ b/arch/powerpc/platforms/pseries/Makefile @@ -3,3 +3,8 @@ obj-y := pci.o lpar.o hvCall.o nvram.o reconfig.o \ obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_IBMVIO) += vio.o obj-$(CONFIG_XICS) += xics.o +obj-$(CONFIG_SCANLOG) += scanlog.o +obj-$(CONFIG_EEH) += eeh.o eeh_event.o + +obj-$(CONFIG_HVC_CONSOLE) += hvconsole.o +obj-$(CONFIG_HVCS) += hvcserver.o diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c new file mode 100644 index 00000000000..79de2310e70 --- /dev/null +++ b/arch/powerpc/platforms/pseries/eeh.c @@ -0,0 +1,1212 @@ +/* + * eeh.c + * Copyright (C) 2001 Dave Engebretsen & Todd Inglett IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/delay.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/pci.h> +#include <linux/proc_fs.h> +#include <linux/rbtree.h> +#include <linux/seq_file.h> +#include <linux/spinlock.h> +#include <asm/atomic.h> +#include <asm/eeh.h> +#include <asm/eeh_event.h> +#include <asm/io.h> +#include <asm/machdep.h> +#include <asm/ppc-pci.h> +#include <asm/rtas.h> + +#undef DEBUG + +/** Overview: + * EEH, or "Extended Error Handling" is a PCI bridge technology for + * dealing with PCI bus errors that can't be dealt with within the + * usual PCI framework, except by check-stopping the CPU. Systems + * that are designed for high-availability/reliability cannot afford + * to crash due to a "mere" PCI error, thus the need for EEH. + * An EEH-capable bridge operates by converting a detected error + * into a "slot freeze", taking the PCI adapter off-line, making + * the slot behave, from the OS'es point of view, as if the slot + * were "empty": all reads return 0xff's and all writes are silently + * ignored. EEH slot isolation events can be triggered by parity + * errors on the address or data busses (e.g. during posted writes), + * which in turn might be caused by low voltage on the bus, dust, + * vibration, humidity, radioactivity or plain-old failed hardware. + * + * Note, however, that one of the leading causes of EEH slot + * freeze events are buggy device drivers, buggy device microcode, + * or buggy device hardware. This is because any attempt by the + * device to bus-master data to a memory address that is not + * assigned to the device will trigger a slot freeze. (The idea + * is to prevent devices-gone-wild from corrupting system memory). + * Buggy hardware/drivers will have a miserable time co-existing + * with EEH. + * + * Ideally, a PCI device driver, when suspecting that an isolation + * event has occured (e.g. by reading 0xff's), will then ask EEH + * whether this is the case, and then take appropriate steps to + * reset the PCI slot, the PCI device, and then resume operations. + * However, until that day, the checking is done here, with the + * eeh_check_failure() routine embedded in the MMIO macros. If + * the slot is found to be isolated, an "EEH Event" is synthesized + * and sent out for processing. + */ + +/* If a device driver keeps reading an MMIO register in an interrupt + * handler after a slot isolation event has occurred, we assume it + * is broken and panic. This sets the threshold for how many read + * attempts we allow before panicking. + */ +#define EEH_MAX_FAILS 100000 + +/* Misc forward declaraions */ +static void eeh_save_bars(struct pci_dev * pdev, struct pci_dn *pdn); + +/* RTAS tokens */ +static int ibm_set_eeh_option; +static int ibm_set_slot_reset; +static int ibm_read_slot_reset_state; +static int ibm_read_slot_reset_state2; +static int ibm_slot_error_detail; + +static int eeh_subsystem_enabled; + +/* Lock to avoid races due to multiple reports of an error */ +static DEFINE_SPINLOCK(confirm_error_lock); + +/* Buffer for reporting slot-error-detail rtas calls */ +static unsigned char slot_errbuf[RTAS_ERROR_LOG_MAX]; +static DEFINE_SPINLOCK(slot_errbuf_lock); +static int eeh_error_buf_size; + +/* System monitoring statistics */ +static DEFINE_PER_CPU(unsigned long, no_device); +static DEFINE_PER_CPU(unsigned long, no_dn); +static DEFINE_PER_CPU(unsigned long, no_cfg_addr); +static DEFINE_PER_CPU(unsigned long, ignored_check); +static DEFINE_PER_CPU(unsigned long, total_mmio_ffs); +static DEFINE_PER_CPU(unsigned long, false_positives); +static DEFINE_PER_CPU(unsigned long, ignored_failures); +static DEFINE_PER_CPU(unsigned long, slot_resets); + +/** + * The pci address cache subsystem. This subsystem places + * PCI device address resources into a red-black tree, sorted + * according to the address range, so that given only an i/o + * address, the corresponding PCI device can be **quickly** + * found. It is safe to perform an address lookup in an interrupt + * context; this ability is an important feature. + * + * Currently, the only customer of this code is the EEH subsystem; + * thus, this code has been somewhat tailored to suit EEH better. + * In particular, the cache does *not* hold the addresses of devices + * for which EEH is not enabled. + * + * (Implementation Note: The RB tree seems to be better/faster + * than any hash algo I could think of for this problem, even + * with the penalty of slow pointer chases for d-cache misses). + */ +struct pci_io_addr_range +{ + struct rb_node rb_node; + unsigned long addr_lo; + unsigned long addr_hi; + struct pci_dev *pcidev; + unsigned int flags; +}; + +static struct pci_io_addr_cache +{ + struct rb_root rb_root; + spinlock_t piar_lock; +} pci_io_addr_cache_root; + +static inline struct pci_dev *__pci_get_device_by_addr(unsigned long addr) +{ + struct rb_node *n = pci_io_addr_cache_root.rb_root.rb_node; + + while (n) { + struct pci_io_addr_range *piar; + piar = rb_entry(n, struct pci_io_addr_range, rb_node); + + if (addr < piar->addr_lo) { + n = n->rb_left; + } else { + if (addr > piar->addr_hi) { + n = n->rb_right; + } else { + pci_dev_get(piar->pcidev); + return piar->pcidev; + } + } + } + + return NULL; +} + +/** + * pci_get_device_by_addr - Get device, given only address + * @addr: mmio (PIO) phys address or i/o port number + * + * Given an mmio phys address, or a port number, find a pci device + * that implements this address. Be sure to pci_dev_put the device + * when finished. I/O port numbers are assumed to be offset + * from zero (that is, they do *not* have pci_io_addr added in). + * It is safe to call this function within an interrupt. + */ +static struct pci_dev *pci_get_device_by_addr(unsigned long addr) +{ + struct pci_dev *dev; + unsigned long flags; + + spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags); + dev = __pci_get_device_by_addr(addr); + spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags); + return dev; +} + +#ifdef DEBUG +/* + * Handy-dandy debug print routine, does nothing more + * than print out the contents of our addr cache. + */ +static void pci_addr_cache_print(struct pci_io_addr_cache *cache) +{ + struct rb_node *n; + int cnt = 0; + + n = rb_first(&cache->rb_root); + while (n) { + struct pci_io_addr_range *piar; + piar = rb_entry(n, struct pci_io_addr_range, rb_node); + printk(KERN_DEBUG "PCI: %s addr range %d [%lx-%lx]: %s\n", + (piar->flags & IORESOURCE_IO) ? "i/o" : "mem", cnt, + piar->addr_lo, piar->addr_hi, pci_name(piar->pcidev)); + cnt++; + n = rb_next(n); + } +} +#endif + +/* Insert address range into the rb tree. */ +static struct pci_io_addr_range * +pci_addr_cache_insert(struct pci_dev *dev, unsigned long alo, + unsigned long ahi, unsigned int flags) +{ + struct rb_node **p = &pci_io_addr_cache_root.rb_root.rb_node; + struct rb_node *parent = NULL; + struct pci_io_addr_range *piar; + + /* Walk tree, find a place to insert into tree */ + while (*p) { + parent = *p; + piar = rb_entry(parent, struct pci_io_addr_range, rb_node); + if (ahi < piar->addr_lo) { + p = &parent->rb_left; + } else if (alo > piar->addr_hi) { + p = &parent->rb_right; + } else { + if (dev != piar->pcidev || + alo != piar->addr_lo || ahi != piar->addr_hi) { + printk(KERN_WARNING "PIAR: overlapping address range\n"); + } + return piar; + } + } + piar = (struct pci_io_addr_range *)kmalloc(sizeof(struct pci_io_addr_range), GFP_ATOMIC); + if (!piar) + return NULL; + + piar->addr_lo = alo; + piar->addr_hi = ahi; + piar->pcidev = dev; + piar->flags = flags; + +#ifdef DEBUG + printk(KERN_DEBUG "PIAR: insert range=[%lx:%lx] dev=%s\n", + alo, ahi, pci_name (dev)); +#endif + + rb_link_node(&piar->rb_node, parent, p); + rb_insert_color(&piar->rb_node, &pci_io_addr_cache_root.rb_root); + + return piar; +} + +static void __pci_addr_cache_insert_device(struct pci_dev *dev) +{ + struct device_node *dn; + struct pci_dn *pdn; + int i; + int inserted = 0; + + dn = pci_device_to_OF_node(dev); + if (!dn) { + printk(KERN_WARNING "PCI: no pci dn found for dev=%s\n", pci_name(dev)); + return; + } + + /* Skip any devices for which EEH is not enabled. */ + pdn = PCI_DN(dn); + if (!(pdn->eeh_mode & EEH_MODE_SUPPORTED) || + pdn->eeh_mode & EEH_MODE_NOCHECK) { +#ifdef DEBUG + printk(KERN_INFO "PCI: skip building address cache for=%s - %s\n", + pci_name(dev), pdn->node->full_name); +#endif + return; + } + + /* The cache holds a reference to the device... */ + pci_dev_get(dev); + + /* Walk resources on this device, poke them into the tree */ + for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { + unsigned long start = pci_resource_start(dev,i); + unsigned long end = pci_resource_end(dev,i); + unsigned int flags = pci_resource_flags(dev,i); + + /* We are interested only bus addresses, not dma or other stuff */ + if (0 == (flags & (IORESOURCE_IO | IORESOURCE_MEM))) + continue; + if (start == 0 || ~start == 0 || end == 0 || ~end == 0) + continue; + pci_addr_cache_insert(dev, start, end, flags); + inserted = 1; + } + + /* If there was nothing to add, the cache has no reference... */ + if (!inserted) + pci_dev_put(dev); +} + +/** + * pci_addr_cache_insert_device - Add a device to the address cache + * @dev: PCI device whose I/O addresses we are interested in. + * + * In order to support the fast lookup of devices based on addresses, + * we maintain a cache of devices that can be quickly searched. + * This routine adds a device to that cache. + */ +static void pci_addr_cache_insert_device(struct pci_dev *dev) +{ + unsigned long flags; + + spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags); + __pci_addr_cache_insert_device(dev); + spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags); +} + +static inline void __pci_addr_cache_remove_device(struct pci_dev *dev) +{ + struct rb_node *n; + int removed = 0; + +restart: + n = rb_first(&pci_io_addr_cache_root.rb_root); + while (n) { + struct pci_io_addr_range *piar; + piar = rb_entry(n, struct pci_io_addr_range, rb_node); + + if (piar->pcidev == dev) { + rb_erase(n, &pci_io_addr_cache_root.rb_root); + removed = 1; + kfree(piar); + goto restart; + } + n = rb_next(n); + } + + /* The cache no longer holds its reference to this device... */ + if (removed) + pci_dev_put(dev); +} + +/** + * pci_addr_cache_remove_device - remove pci device from addr cache + * @dev: device to remove + * + * Remove a device from the addr-cache tree. + * This is potentially expensive, since it will walk + * the tree multiple times (once per resource). + * But so what; device removal doesn't need to be that fast. + */ +static void pci_addr_cache_remove_device(struct pci_dev *dev) +{ + unsigned long flags; + + spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags); + __pci_addr_cache_remove_device(dev); + spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags); +} + +/** + * pci_addr_cache_build - Build a cache of I/O addresses + * + * Build a cache of pci i/o addresses. This cache will be used to + * find the pci device that corresponds to a given address. + * This routine scans all pci busses to build the cache. + * Must be run late in boot process, after the pci controllers + * have been scaned for devices (after all device resources are known). + */ +void __init pci_addr_cache_build(void) +{ + struct device_node *dn; + struct pci_dev *dev = NULL; + + if (!eeh_subsystem_enabled) + return; + + spin_lock_init(&pci_io_addr_cache_root.piar_lock); + + while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { + /* Ignore PCI bridges ( XXX why ??) */ + if ((dev->class >> 16) == PCI_BASE_CLASS_BRIDGE) { + continue; + } + pci_addr_cache_insert_device(dev); + + /* Save the BAR's; firmware doesn't restore these after EEH reset */ + dn = pci_device_to_OF_node(dev); + eeh_save_bars(dev, PCI_DN(dn)); + } + +#ifdef DEBUG + /* Verify tree built up above, echo back the list of addrs. */ + pci_addr_cache_print(&pci_io_addr_cache_root); +#endif +} + +/* --------------------------------------------------------------- */ +/* Above lies the PCI Address Cache. Below lies the EEH event infrastructure */ + +void eeh_slot_error_detail (struct pci_dn *pdn, int severity) +{ + unsigned long flags; + int rc; + + /* Log the error with the rtas logger */ + spin_lock_irqsave(&slot_errbuf_lock, flags); + memset(slot_errbuf, 0, eeh_error_buf_size); + + rc = rtas_call(ibm_slot_error_detail, + 8, 1, NULL, pdn->eeh_config_addr, + BUID_HI(pdn->phb->buid), + BUID_LO(pdn->phb->buid), NULL, 0, + virt_to_phys(slot_errbuf), + eeh_error_buf_size, + severity); + + if (rc == 0) + log_error(slot_errbuf, ERR_TYPE_RTAS_LOG, 0); + spin_unlock_irqrestore(&slot_errbuf_lock, flags); +} + +/** + * read_slot_reset_state - Read the reset state of a device node's slot + * @dn: device node to read + * @rets: array to return results in + */ +static int read_slot_reset_state(struct pci_dn *pdn, int rets[]) +{ + int token, outputs; + + if (ibm_read_slot_reset_state2 != RTAS_UNKNOWN_SERVICE) { + token = ibm_read_slot_reset_state2; + outputs = 4; + } else { + token = ibm_read_slot_reset_state; + rets[2] = 0; /* fake PE Unavailable info */ + outputs = 3; + } + + return rtas_call(token, 3, outputs, rets, pdn->eeh_config_addr, + BUID_HI(pdn->phb->buid), BUID_LO(pdn->phb->buid)); +} + +/** + * eeh_token_to_phys - convert EEH address token to phys address + * @token i/o token, should be address in the form 0xA.... + */ +static inline unsigned long eeh_token_to_phys(unsigned long token) +{ + pte_t *ptep; + unsigned long pa; + + ptep = find_linux_pte(init_mm.pgd, token); + if (!ptep) + return token; + pa = pte_pfn(*ptep) << PAGE_SHIFT; + + return pa | (token & (PAGE_SIZE-1)); +} + +/** + * Return the "partitionable endpoint" (pe) under which this device lies + */ +static struct device_node * find_device_pe(struct device_node *dn) +{ + while ((dn->parent) && PCI_DN(dn->parent) && + (PCI_DN(dn->parent)->eeh_mode & EEH_MODE_SUPPORTED)) { + dn = dn->parent; + } + return dn; +} + +/** Mark all devices that are peers of this device as failed. + * Mark the device driver too, so that it can see the failure + * immediately; this is critical, since some drivers poll + * status registers in interrupts ... If a driver is polling, + * and the slot is frozen, then the driver can deadlock in + * an interrupt context, which is bad. + */ + +static void __eeh_mark_slot (struct device_node *dn, int mode_flag) +{ + while (dn) { + if (PCI_DN(dn)) { + PCI_DN(dn)->eeh_mode |= mode_flag; + + if (dn->child) + __eeh_mark_slot (dn->child, mode_flag); + } + dn = dn->sibling; + } +} + +void eeh_mark_slot (struct device_node *dn, int mode_flag) +{ + dn = find_device_pe (dn); + PCI_DN(dn)->eeh_mode |= mode_flag; + __eeh_mark_slot (dn->child, mode_flag); +} + +static void __eeh_clear_slot (struct device_node *dn, int mode_flag) +{ + while (dn) { + if (PCI_DN(dn)) { + PCI_DN(dn)->eeh_mode &= ~mode_flag; + PCI_DN(dn)->eeh_check_count = 0; + if (dn->child) + __eeh_clear_slot (dn->child, mode_flag); + } + dn = dn->sibling; + } +} + +void eeh_clear_slot (struct device_node *dn, int mode_flag) +{ + unsigned long flags; + spin_lock_irqsave(&confirm_error_lock, flags); + dn = find_device_pe (dn); + PCI_DN(dn)->eeh_mode &= ~mode_flag; + PCI_DN(dn)->eeh_check_count = 0; + __eeh_clear_slot (dn->child, mode_flag); + spin_unlock_irqrestore(&confirm_error_lock, flags); +} + +/** + * eeh_dn_check_failure - check if all 1's data is due to EEH slot freeze + * @dn device node + * @dev pci device, if known + * + * Check for an EEH failure for the given device node. Call this + * routine if the result of a read was all 0xff's and you want to + * find out if this is due to an EEH slot freeze. This routine + * will query firmware for the EEH status. + * + * Returns 0 if there has not been an EEH error; otherwise returns + * a non-zero value and queues up a slot isolation event notification. + * + * It is safe to call this routine in an interrupt context. + */ +int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev) +{ + int ret; + int rets[3]; + unsigned long flags; + struct pci_dn *pdn; + int rc = 0; + + __get_cpu_var(total_mmio_ffs)++; + + if (!eeh_subsystem_enabled) + return 0; + + if (!dn) { + __get_cpu_var(no_dn)++; + return 0; + } + pdn = PCI_DN(dn); + + /* Access to IO BARs might get this far and still not want checking. */ + if (!(pdn->eeh_mode & EEH_MODE_SUPPORTED) || + pdn->eeh_mode & EEH_MODE_NOCHECK) { + __get_cpu_var(ignored_check)++; +#ifdef DEBUG + printk ("EEH:ignored check (%x) for %s %s\n", + pdn->eeh_mode, pci_name (dev), dn->full_name); +#endif + return 0; + } + + if (!pdn->eeh_config_addr) { + __get_cpu_var(no_cfg_addr)++; + return 0; + } + + /* If we already have a pending isolation event for this + * slot, we know it's bad already, we don't need to check. + * Do this checking under a lock; as multiple PCI devices + * in one slot might report errors simultaneously, and we + * only want one error recovery routine running. + */ + spin_lock_irqsave(&confirm_error_lock, flags); + rc = 1; + if (pdn->eeh_mode & EEH_MODE_ISOLATED) { + pdn->eeh_check_count ++; + if (pdn->eeh_check_count >= EEH_MAX_FAILS) { + printk (KERN_ERR "EEH: Device driver ignored %d bad reads, panicing\n", + pdn->eeh_check_count); + dump_stack(); + + /* re-read the slot reset state */ + if (read_slot_reset_state(pdn, rets) != 0) + rets[0] = -1; /* reset state unknown */ + + /* If we are here, then we hit an infinite loop. Stop. */ + panic("EEH: MMIO halt (%d) on device:%s\n", rets[0], pci_name(dev)); + } + goto dn_unlock; + } + + /* + * Now test for an EEH failure. This is VERY expensive. + * Note that the eeh_config_addr may be a parent device + * in the case of a device behind a bridge, or it may be + * function zero of a multi-function device. + * In any case they must share a common PHB. + */ + ret = read_slot_reset_state(pdn, rets); + + /* If the call to firmware failed, punt */ + if (ret != 0) { + printk(KERN_WARNING "EEH: read_slot_reset_state() failed; rc=%d dn=%s\n", + ret, dn->full_name); + __get_cpu_var(false_positives)++; + rc = 0; + goto dn_unlock; + } + + /* If EEH is not supported on this device, punt. */ + if (rets[1] != 1) { + printk(KERN_WARNING "EEH: event on unsupported device, rc=%d dn=%s\n", + ret, dn->full_name); + __get_cpu_var(false_positives)++; + rc = 0; + goto dn_unlock; + } + + /* If not the kind of error we know about, punt. */ + if (rets[0] != 2 && rets[0] != 4 && rets[0] != 5) { + __get_cpu_var(false_positives)++; + rc = 0; + goto dn_unlock; + } + + /* Note that config-io to empty slots may fail; + * we recognize empty because they don't have children. */ + if ((rets[0] == 5) && (dn->child == NULL)) { + __get_cpu_var(false_positives)++; + rc = 0; + goto dn_unlock; + } + + __get_cpu_var(slot_resets)++; + + /* Avoid repeated reports of this failure, including problems + * with other functions on this device, and functions under + * bridges. */ + eeh_mark_slot (dn, EEH_MODE_ISOLATED); + spin_unlock_irqrestore(&confirm_error_lock, flags); + + eeh_send_failure_event (dn, dev, rets[0], rets[2]); + + /* Most EEH events are due to device driver bugs. Having + * a stack trace will help the device-driver authors figure + * out what happened. So print that out. */ + if (rets[0] != 5) dump_stack(); + return 1; + +dn_unlock: + spin_unlock_irqrestore(&confirm_error_lock, flags); + return rc; +} + +EXPORT_SYMBOL_GPL(eeh_dn_check_failure); + +/** + * eeh_check_failure - check if all 1's data is due to EEH slot freeze + * @token i/o token, should be address in the form 0xA.... + * @val value, should be all 1's (XXX why do we need this arg??) + * + * Check for an EEH failure at the given token address. Call this + * routine if the result of a read was all 0xff's and you want to + * find out if this is due to an EEH slot freeze event. This routine + * will query firmware for the EEH status. + * + * Note this routine is safe to call in an interrupt context. + */ +unsigned long eeh_check_failure(const volatile void __iomem *token, unsigned long val) +{ + unsigned long addr; + struct pci_dev *dev; + struct device_node *dn; + + /* Finding the phys addr + pci device; this is pretty quick. */ + addr = eeh_token_to_phys((unsigned long __force) token); + dev = pci_get_device_by_addr(addr); + if (!dev) { + __get_cpu_var(no_device)++; + return val; + } + + dn = pci_device_to_OF_node(dev); + eeh_dn_check_failure (dn, dev); + + pci_dev_put(dev); + return val; +} + +EXPORT_SYMBOL(eeh_check_failure); + +/* ------------------------------------------------------------- */ +/* The code below deals with error recovery */ + +/** Return negative value if a permanent error, else return + * a number of milliseconds to wait until the PCI slot is + * ready to be used. + */ +static int +eeh_slot_availability(struct pci_dn *pdn) +{ + int rc; + int rets[3]; + + rc = read_slot_reset_state(pdn, rets); + + if (rc) return rc; + + if (rets[1] == 0) return -1; /* EEH is not supported */ + if (rets[0] == 0) return 0; /* Oll Korrect */ + if (rets[0] == 5) { + if (rets[2] == 0) return -1; /* permanently unavailable */ + return rets[2]; /* number of millisecs to wait */ + } + return -1; +} + +/** rtas_pci_slot_reset raises/lowers the pci #RST line + * state: 1/0 to raise/lower the #RST + * + * Clear the EEH-frozen condition on a slot. This routine + * asserts the PCI #RST line if the 'state' argument is '1', + * and drops the #RST line if 'state is '0'. This routine is + * safe to call in an interrupt context. + * + */ + +static void +rtas_pci_slot_reset(struct pci_dn *pdn, int state) +{ + int rc; + + BUG_ON (pdn==NULL); + + if (!pdn->phb) { + printk (KERN_WARNING "EEH: in slot reset, device node %s has no phb\n", + pdn->node->full_name); + return; + } + + rc = rtas_call(ibm_set_slot_reset,4,1, NULL, + pdn->eeh_config_addr, + BUID_HI(pdn->phb->buid), + BUID_LO(pdn->phb->buid), + state); + if (rc) { + printk (KERN_WARNING "EEH: Unable to reset the failed slot, (%d) #RST=%d dn=%s\n", + rc, state, pdn->node->full_name); + return; + } +} + +/** rtas_set_slot_reset -- assert the pci #RST line for 1/4 second + * dn -- device node to be reset. + */ + +void +rtas_set_slot_reset(struct pci_dn *pdn) +{ + int i, rc; + + rtas_pci_slot_reset (pdn, 1); + + /* The PCI bus requires that the reset be held high for at least + * a 100 milliseconds. We wait a bit longer 'just in case'. */ + +#define PCI_BUS_RST_HOLD_TIME_MSEC 250 + msleep (PCI_BUS_RST_HOLD_TIME_MSEC); + + /* We might get hit with another EEH freeze as soon as the + * pci slot reset line is dropped. Make sure we don't miss + * these, and clear the flag now. */ + eeh_clear_slot (pdn->node, EEH_MODE_ISOLATED); + + rtas_pci_slot_reset (pdn, 0); + + /* After a PCI slot has been reset, the PCI Express spec requires + * a 1.5 second idle time for the bus to stabilize, before starting + * up traffic. */ +#define PCI_BUS_SETTLE_TIME_MSEC 1800 + msleep (PCI_BUS_SETTLE_TIME_MSEC); + + /* Now double check with the firmware to make sure the device is + * ready to be used; if not, wait for recovery. */ + for (i=0; i<10; i++) { + rc = eeh_slot_availability (pdn); + if (rc <= 0) break; + + msleep (rc+100); + } +} + +/* ------------------------------------------------------- */ +/** Save and restore of PCI BARs + * + * Although firmware will set up BARs during boot, it doesn't + * set up device BAR's after a device reset, although it will, + * if requested, set up bridge configuration. Thus, we need to + * configure the PCI devices ourselves. + */ + +/** + * __restore_bars - Restore the Base Address Registers + * Loads the PCI configuration space base address registers, + * the expansion ROM base address, the latency timer, and etc. + * from the saved values in the device node. + */ +static inline void __restore_bars (struct pci_dn *pdn) +{ + int i; + + if (NULL==pdn->phb) return; + for (i=4; i<10; i++) { + rtas_write_config(pdn, i*4, 4, pdn->config_space[i]); + } + + /* 12 == Expansion ROM Address */ + rtas_write_config(pdn, 12*4, 4, pdn->config_space[12]); + +#define BYTE_SWAP(OFF) (8*((OFF)/4)+3-(OFF)) +#define SAVED_BYTE(OFF) (((u8 *)(pdn->config_space))[BYTE_SWAP(OFF)]) + + rtas_write_config (pdn, PCI_CACHE_LINE_SIZE, 1, + SAVED_BYTE(PCI_CACHE_LINE_SIZE)); + + rtas_write_config (pdn, PCI_LATENCY_TIMER, 1, + SAVED_BYTE(PCI_LATENCY_TIMER)); + + /* max latency, min grant, interrupt pin and line */ + rtas_write_config(pdn, 15*4, 4, pdn->config_space[15]); +} + +/** + * eeh_restore_bars - restore the PCI config space info + * + * This routine performs a recursive walk to the children + * of this device as well. + */ +void eeh_restore_bars(struct pci_dn *pdn) +{ + struct device_node *dn; + if (!pdn) + return; + + if (! pdn->eeh_is_bridge) + __restore_bars (pdn); + + dn = pdn->node->child; + while (dn) { + eeh_restore_bars (PCI_DN(dn)); + dn = dn->sibling; + } +} + +/** + * eeh_save_bars - save device bars + * + * Save the values of the device bars. Unlike the restore + * routine, this routine is *not* recursive. This is because + * PCI devices are added individuallly; but, for the restore, + * an entire slot is reset at a time. + */ +static void eeh_save_bars(struct pci_dev * pdev, struct pci_dn *pdn) +{ + int i; + + if (!pdev || !pdn ) + return; + + for (i = 0; i < 16; i++) + pci_read_config_dword(pdev, i * 4, &pdn->config_space[i]); + + if (pdev->hdr_type == PCI_HEADER_TYPE_BRIDGE) + pdn->eeh_is_bridge = 1; +} + +void +rtas_configure_bridge(struct pci_dn *pdn) +{ + int token = rtas_token ("ibm,configure-bridge"); + int rc; + + if (token == RTAS_UNKNOWN_SERVICE) + return; + rc = rtas_call(token,3,1, NULL, + pdn->eeh_config_addr, + BUID_HI(pdn->phb->buid), + BUID_LO(pdn->phb->buid)); + if (rc) { + printk (KERN_WARNING "EEH: Unable to configure device bridge (%d) for %s\n", + rc, pdn->node->full_name); + } +} + +/* ------------------------------------------------------------- */ +/* The code below deals with enabling EEH for devices during the + * early boot sequence. EEH must be enabled before any PCI probing + * can be done. + */ + +#define EEH_ENABLE 1 + +struct eeh_early_enable_info { + unsigned int buid_hi; + unsigned int buid_lo; +}; + +/* Enable eeh for the given device node. */ +static void *early_enable_eeh(struct device_node *dn, void *data) +{ + struct eeh_early_enable_info *info = data; + int ret; + char *status = get_property(dn, "status", NULL); + u32 *class_code = (u32 *)get_property(dn, "class-code", NULL); + u32 *vendor_id = (u32 *)get_property(dn, "vendor-id", NULL); + u32 *device_id = (u32 *)get_property(dn, "device-id", NULL); + u32 *regs; + int enable; + struct pci_dn *pdn = PCI_DN(dn); + + pdn->eeh_mode = 0; + pdn->eeh_check_count = 0; + pdn->eeh_freeze_count = 0; + + if (status && strcmp(status, "ok") != 0) + return NULL; /* ignore devices with bad status */ + + /* Ignore bad nodes. */ + if (!class_code || !vendor_id || !device_id) + return NULL; + + /* There is nothing to check on PCI to ISA bridges */ + if (dn->type && !strcmp(dn->type, "isa")) { + pdn->eeh_mode |= EEH_MODE_NOCHECK; + return NULL; + } + + /* + * Now decide if we are going to "Disable" EEH checking + * for this device. We still run with the EEH hardware active, + * but we won't be checking for ff's. This means a driver + * could return bad data (very bad!), an interrupt handler could + * hang waiting on status bits that won't change, etc. + * But there are a few cases like display devices that make sense. + */ + enable = 1; /* i.e. we will do checking */ + if ((*class_code >> 16) == PCI_BASE_CLASS_DISPLAY) + enable = 0; + + if (!enable) + pdn->eeh_mode |= EEH_MODE_NOCHECK; + + /* Ok... see if this device supports EEH. Some do, some don't, + * and the only way to find out is to check each and every one. */ + regs = (u32 *)get_property(dn, "reg", NULL); + if (regs) { + /* First register entry is addr (00BBSS00) */ + /* Try to enable eeh */ + ret = rtas_call(ibm_set_eeh_option, 4, 1, NULL, + regs[0], info->buid_hi, info->buid_lo, + EEH_ENABLE); + + if (ret == 0) { + eeh_subsystem_enabled = 1; + pdn->eeh_mode |= EEH_MODE_SUPPORTED; + pdn->eeh_config_addr = regs[0]; +#ifdef DEBUG + printk(KERN_DEBUG "EEH: %s: eeh enabled\n", dn->full_name); +#endif + } else { + + /* This device doesn't support EEH, but it may have an + * EEH parent, in which case we mark it as supported. */ + if (dn->parent && PCI_DN(dn->parent) + && (PCI_DN(dn->parent)->eeh_mode & EEH_MODE_SUPPORTED)) { + /* Parent supports EEH. */ + pdn->eeh_mode |= EEH_MODE_SUPPORTED; + pdn->eeh_config_addr = PCI_DN(dn->parent)->eeh_config_addr; + return NULL; + } + } + } else { + printk(KERN_WARNING "EEH: %s: unable to get reg property.\n", + dn->full_name); + } + + return NULL; +} + +/* + * Initialize EEH by trying to enable it for all of the adapters in the system. + * As a side effect we can determine here if eeh is supported at all. + * Note that we leave EEH on so failed config cycles won't cause a machine + * check. If a user turns off EEH for a particular adapter they are really + * telling Linux to ignore errors. Some hardware (e.g. POWER5) won't + * grant access to a slot if EEH isn't enabled, and so we always enable + * EEH for all slots/all devices. + * + * The eeh-force-off option disables EEH checking globally, for all slots. + * Even if force-off is set, the EEH hardware is still enabled, so that + * newer systems can boot. + */ +void __init eeh_init(void) +{ + struct device_node *phb, *np; + struct eeh_early_enable_info info; + + spin_lock_init(&confirm_error_lock); + spin_lock_init(&slot_errbuf_lock); + + np = of_find_node_by_path("/rtas"); + if (np == NULL) + return; + + ibm_set_eeh_option = rtas_token("ibm,set-eeh-option"); + ibm_set_slot_reset = rtas_token("ibm,set-slot-reset"); + ibm_read_slot_reset_state2 = rtas_token("ibm,read-slot-reset-state2"); + ibm_read_slot_reset_state = rtas_token("ibm,read-slot-reset-state"); + ibm_slot_error_detail = rtas_token("ibm,slot-error-detail"); + + if (ibm_set_eeh_option == RTAS_UNKNOWN_SERVICE) + return; + + eeh_error_buf_size = rtas_token("rtas-error-log-max"); + if (eeh_error_buf_size == RTAS_UNKNOWN_SERVICE) { + eeh_error_buf_size = 1024; + } + if (eeh_error_buf_size > RTAS_ERROR_LOG_MAX) { + printk(KERN_WARNING "EEH: rtas-error-log-max is bigger than allocated " + "buffer ! (%d vs %d)", eeh_error_buf_size, RTAS_ERROR_LOG_MAX); + eeh_error_buf_size = RTAS_ERROR_LOG_MAX; + } + + /* Enable EEH for all adapters. Note that eeh requires buid's */ + for (phb = of_find_node_by_name(NULL, "pci"); phb; + phb = of_find_node_by_name(phb, "pci")) { + unsigned long buid; + + buid = get_phb_buid(phb); + if (buid == 0 || PCI_DN(phb) == NULL) + continue; + + info.buid_lo = BUID_LO(buid); + info.buid_hi = BUID_HI(buid); + traverse_pci_devices(phb, early_enable_eeh, &info); + } + + if (eeh_subsystem_enabled) + printk(KERN_INFO "EEH: PCI Enhanced I/O Error Handling Enabled\n"); + else + printk(KERN_WARNING "EEH: No capable adapters found\n"); +} + +/** + * eeh_add_device_early - enable EEH for the indicated device_node + * @dn: device node for which to set up EEH + * + * This routine must be used to perform EEH initialization for PCI + * devices that were added after system boot (e.g. hotplug, dlpar). + * This routine must be called before any i/o is performed to the + * adapter (inluding any config-space i/o). + * Whether this actually enables EEH or not for this device depends + * on the CEC architecture, type of the device, on earlier boot + * command-line arguments & etc. + */ +void eeh_add_device_early(struct device_node *dn) +{ + struct pci_controller *phb; + struct eeh_early_enable_info info; + + if (!dn || !PCI_DN(dn)) + return; + phb = PCI_DN(dn)->phb; + if (NULL == phb || 0 == phb->buid) { + printk(KERN_WARNING "EEH: Expected buid but found none for %s\n", + dn->full_name); + dump_stack(); + return; + } + + info.buid_hi = BUID_HI(phb->buid); + info.buid_lo = BUID_LO(phb->buid); + early_enable_eeh(dn, &info); +} +EXPORT_SYMBOL_GPL(eeh_add_device_early); + +/** + * eeh_add_device_late - perform EEH initialization for the indicated pci device + * @dev: pci device for which to set up EEH + * + * This routine must be used to complete EEH initialization for PCI + * devices that were added after system boot (e.g. hotplug, dlpar). + */ +void eeh_add_device_late(struct pci_dev *dev) +{ + struct device_node *dn; + struct pci_dn *pdn; + + if (!dev || !eeh_subsystem_enabled) + return; + +#ifdef DEBUG + printk(KERN_DEBUG "EEH: adding device %s\n", pci_name(dev)); +#endif + + pci_dev_get (dev); + dn = pci_device_to_OF_node(dev); + pdn = PCI_DN(dn); + pdn->pcidev = dev; + + pci_addr_cache_insert_device (dev); + eeh_save_bars(dev, pdn); +} +EXPORT_SYMBOL_GPL(eeh_add_device_late); + +/** + * eeh_remove_device - undo EEH setup for the indicated pci device + * @dev: pci device to be removed + * + * This routine should be when a device is removed from a running + * system (e.g. by hotplug or dlpar). + */ +void eeh_remove_device(struct pci_dev *dev) +{ + struct device_node *dn; + if (!dev || !eeh_subsystem_enabled) + return; + + /* Unregister the device with the EEH/PCI address search system */ +#ifdef DEBUG + printk(KERN_DEBUG "EEH: remove device %s\n", pci_name(dev)); +#endif + pci_addr_cache_remove_device(dev); + + dn = pci_device_to_OF_node(dev); + PCI_DN(dn)->pcidev = NULL; + pci_dev_put (dev); +} +EXPORT_SYMBOL_GPL(eeh_remove_device); + +static int proc_eeh_show(struct seq_file *m, void *v) +{ + unsigned int cpu; + unsigned long ffs = 0, positives = 0, failures = 0; + unsigned long resets = 0; + unsigned long no_dev = 0, no_dn = 0, no_cfg = 0, no_check = 0; + + for_each_cpu(cpu) { + ffs += per_cpu(total_mmio_ffs, cpu); + positives += per_cpu(false_positives, cpu); + failures += per_cpu(ignored_failures, cpu); + resets += per_cpu(slot_resets, cpu); + no_dev += per_cpu(no_device, cpu); + no_dn += per_cpu(no_dn, cpu); + no_cfg += per_cpu(no_cfg_addr, cpu); + no_check += per_cpu(ignored_check, cpu); + } + + if (0 == eeh_subsystem_enabled) { + seq_printf(m, "EEH Subsystem is globally disabled\n"); + seq_printf(m, "eeh_total_mmio_ffs=%ld\n", ffs); + } else { + seq_printf(m, "EEH Subsystem is enabled\n"); + seq_printf(m, + "no device=%ld\n" + "no device node=%ld\n" + "no config address=%ld\n" + "check not wanted=%ld\n" + "eeh_total_mmio_ffs=%ld\n" + "eeh_false_positives=%ld\n" + "eeh_ignored_failures=%ld\n" + "eeh_slot_resets=%ld\n", + no_dev, no_dn, no_cfg, no_check, + ffs, positives, failures, resets); + } + + return 0; +} + +static int proc_eeh_open(struct inode *inode, struct file *file) +{ + return single_open(file, proc_eeh_show, NULL); +} + +static struct file_operations proc_eeh_operations = { + .open = proc_eeh_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init eeh_init_proc(void) +{ + struct proc_dir_entry *e; + + if (platform_is_pseries()) { + e = create_proc_entry("ppc64/eeh", 0, NULL); + if (e) + e->proc_fops = &proc_eeh_operations; + } + + return 0; +} +__initcall(eeh_init_proc); diff --git a/arch/powerpc/platforms/pseries/eeh_event.c b/arch/powerpc/platforms/pseries/eeh_event.c new file mode 100644 index 00000000000..92497333c2b --- /dev/null +++ b/arch/powerpc/platforms/pseries/eeh_event.c @@ -0,0 +1,155 @@ +/* + * eeh_event.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Copyright (c) 2005 Linas Vepstas <linas@linas.org> + */ + +#include <linux/list.h> +#include <linux/pci.h> +#include <asm/eeh_event.h> + +/** Overview: + * EEH error states may be detected within exception handlers; + * however, the recovery processing needs to occur asynchronously + * in a normal kernel context and not an interrupt context. + * This pair of routines creates an event and queues it onto a + * work-queue, where a worker thread can drive recovery. + */ + +/* EEH event workqueue setup. */ +static spinlock_t eeh_eventlist_lock = SPIN_LOCK_UNLOCKED; +LIST_HEAD(eeh_eventlist); +static void eeh_thread_launcher(void *); +DECLARE_WORK(eeh_event_wq, eeh_thread_launcher, NULL); + +/** + * eeh_panic - call panic() for an eeh event that cannot be handled. + * The philosophy of this routine is that it is better to panic and + * halt the OS than it is to risk possible data corruption by + * oblivious device drivers that don't know better. + * + * @dev pci device that had an eeh event + * @reset_state current reset state of the device slot + */ +static void eeh_panic(struct pci_dev *dev, int reset_state) +{ + /* + * Since the panic_on_oops sysctl is used to halt the system + * in light of potential corruption, we can use it here. + */ + if (panic_on_oops) { + panic("EEH: MMIO failure (%d) on device:%s\n", reset_state, + pci_name(dev)); + } + else { + printk(KERN_INFO "EEH: Ignored MMIO failure (%d) on device:%s\n", + reset_state, pci_name(dev)); + } +} + +/** + * eeh_event_handler - dispatch EEH events. The detection of a frozen + * slot can occur inside an interrupt, where it can be hard to do + * anything about it. The goal of this routine is to pull these + * detection events out of the context of the interrupt handler, and + * re-dispatch them for processing at a later time in a normal context. + * + * @dummy - unused + */ +static int eeh_event_handler(void * dummy) +{ + unsigned long flags; + struct eeh_event *event; + + daemonize ("eehd"); + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + + spin_lock_irqsave(&eeh_eventlist_lock, flags); + event = NULL; + if (!list_empty(&eeh_eventlist)) { + event = list_entry(eeh_eventlist.next, struct eeh_event, list); + list_del(&event->list); + } + spin_unlock_irqrestore(&eeh_eventlist_lock, flags); + if (event == NULL) + break; + + printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n", + pci_name(event->dev)); + + eeh_panic (event->dev, event->state); + + kfree(event); + } + + return 0; +} + +/** + * eeh_thread_launcher + * + * @dummy - unused + */ +static void eeh_thread_launcher(void *dummy) +{ + if (kernel_thread(eeh_event_handler, NULL, CLONE_KERNEL) < 0) + printk(KERN_ERR "Failed to start EEH daemon\n"); +} + +/** + * eeh_send_failure_event - generate a PCI error event + * @dev pci device + * + * This routine can be called within an interrupt context; + * the actual event will be delivered in a normal context + * (from a workqueue). + */ +int eeh_send_failure_event (struct device_node *dn, + struct pci_dev *dev, + int state, + int time_unavail) +{ + unsigned long flags; + struct eeh_event *event; + + event = kmalloc(sizeof(*event), GFP_ATOMIC); + if (event == NULL) { + printk (KERN_ERR "EEH: out of memory, event not handled\n"); + return 1; + } + + if (dev) + pci_dev_get(dev); + + event->dn = dn; + event->dev = dev; + event->state = state; + event->time_unavail = time_unavail; + + /* We may or may not be called in an interrupt context */ + spin_lock_irqsave(&eeh_eventlist_lock, flags); + list_add(&event->list, &eeh_eventlist); + spin_unlock_irqrestore(&eeh_eventlist_lock, flags); + + schedule_work(&eeh_event_wq); + + return 0; +} + +/********************** END OF FILE ******************************/ diff --git a/arch/powerpc/platforms/pseries/hvconsole.c b/arch/powerpc/platforms/pseries/hvconsole.c new file mode 100644 index 00000000000..138e128a388 --- /dev/null +++ b/arch/powerpc/platforms/pseries/hvconsole.c @@ -0,0 +1,74 @@ +/* + * hvconsole.c + * Copyright (C) 2004 Hollis Blanchard, IBM Corporation + * Copyright (C) 2004 IBM Corporation + * + * Additional Author(s): + * Ryan S. Arnold <rsa@us.ibm.com> + * + * LPAR console support. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <asm/hvcall.h> +#include <asm/hvconsole.h> + +/** + * hvc_get_chars - retrieve characters from firmware for denoted vterm adatper + * @vtermno: The vtermno or unit_address of the adapter from which to fetch the + * data. + * @buf: The character buffer into which to put the character data fetched from + * firmware. + * @count: not used? + */ +int hvc_get_chars(uint32_t vtermno, char *buf, int count) +{ + unsigned long got; + + if (plpar_hcall(H_GET_TERM_CHAR, vtermno, 0, 0, 0, &got, + (unsigned long *)buf, (unsigned long *)buf+1) == H_Success) + return got; + return 0; +} + +EXPORT_SYMBOL(hvc_get_chars); + + +/** + * hvc_put_chars: send characters to firmware for denoted vterm adapter + * @vtermno: The vtermno or unit_address of the adapter from which the data + * originated. + * @buf: The character buffer that contains the character data to send to + * firmware. + * @count: Send this number of characters. + */ +int hvc_put_chars(uint32_t vtermno, const char *buf, int count) +{ + unsigned long *lbuf = (unsigned long *) buf; + long ret; + + ret = plpar_hcall_norets(H_PUT_TERM_CHAR, vtermno, count, lbuf[0], + lbuf[1]); + if (ret == H_Success) + return count; + if (ret == H_Busy) + return 0; + return -EIO; +} + +EXPORT_SYMBOL(hvc_put_chars); diff --git a/arch/powerpc/platforms/pseries/hvcserver.c b/arch/powerpc/platforms/pseries/hvcserver.c new file mode 100644 index 00000000000..4d584172055 --- /dev/null +++ b/arch/powerpc/platforms/pseries/hvcserver.c @@ -0,0 +1,251 @@ +/* + * hvcserver.c + * Copyright (C) 2004 Ryan S Arnold, IBM Corporation + * + * PPC64 virtual I/O console server support. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/module.h> +#include <linux/slab.h> + +#include <asm/hvcall.h> +#include <asm/hvcserver.h> +#include <asm/io.h> + +#define HVCS_ARCH_VERSION "1.0.0" + +MODULE_AUTHOR("Ryan S. Arnold <rsa@us.ibm.com>"); +MODULE_DESCRIPTION("IBM hvcs ppc64 API"); +MODULE_LICENSE("GPL"); +MODULE_VERSION(HVCS_ARCH_VERSION); + +/* + * Convert arch specific return codes into relevant errnos. The hvcs + * functions aren't performance sensitive, so this conversion isn't an + * issue. + */ +int hvcs_convert(long to_convert) +{ + switch (to_convert) { + case H_Success: + return 0; + case H_Parameter: + return -EINVAL; + case H_Hardware: + return -EIO; + case H_Busy: + case H_LongBusyOrder1msec: + case H_LongBusyOrder10msec: + case H_LongBusyOrder100msec: + case H_LongBusyOrder1sec: + case H_LongBusyOrder10sec: + case H_LongBusyOrder100sec: + return -EBUSY; + case H_Function: /* fall through */ + default: + return -EPERM; + } +} + +/** + * hvcs_free_partner_info - free pi allocated by hvcs_get_partner_info + * @head: list_head pointer for an allocated list of partner info structs to + * free. + * + * This function is used to free the partner info list that was returned by + * calling hvcs_get_partner_info(). + */ +int hvcs_free_partner_info(struct list_head *head) +{ + struct hvcs_partner_info *pi; + struct list_head *element; + + if (!head) + return -EINVAL; + + while (!list_empty(head)) { + element = head->next; + pi = list_entry(element, struct hvcs_partner_info, node); + list_del(element); + kfree(pi); + } + + return 0; +} +EXPORT_SYMBOL(hvcs_free_partner_info); + +/* Helper function for hvcs_get_partner_info */ +int hvcs_next_partner(uint32_t unit_address, + unsigned long last_p_partition_ID, + unsigned long last_p_unit_address, unsigned long *pi_buff) + +{ + long retval; + retval = plpar_hcall_norets(H_VTERM_PARTNER_INFO, unit_address, + last_p_partition_ID, + last_p_unit_address, virt_to_phys(pi_buff)); + return hvcs_convert(retval); +} + +/** + * hvcs_get_partner_info - Get all of the partner info for a vty-server adapter + * @unit_address: The unit_address of the vty-server adapter for which this + * function is fetching partner info. + * @head: An initialized list_head pointer to an empty list to use to return the + * list of partner info fetched from the hypervisor to the caller. + * @pi_buff: A page sized buffer pre-allocated prior to calling this function + * that is to be used to be used by firmware as an iterator to keep track + * of the partner info retrieval. + * + * This function returns non-zero on success, or if there is no partner info. + * + * The pi_buff is pre-allocated prior to calling this function because this + * function may be called with a spin_lock held and kmalloc of a page is not + * recommended as GFP_ATOMIC. + * + * The first long of this buffer is used to store a partner unit address. The + * second long is used to store a partner partition ID and starting at + * pi_buff[2] is the 79 character Converged Location Code (diff size than the + * unsigned longs, hence the casting mumbo jumbo you see later). + * + * Invocation of this function should always be followed by an invocation of + * hvcs_free_partner_info() using a pointer to the SAME list head instance + * that was passed as a parameter to this function. + */ +int hvcs_get_partner_info(uint32_t unit_address, struct list_head *head, + unsigned long *pi_buff) +{ + /* + * Dealt with as longs because of the hcall interface even though the + * values are uint32_t. + */ + unsigned long last_p_partition_ID; + unsigned long last_p_unit_address; + struct hvcs_partner_info *next_partner_info = NULL; + int more = 1; + int retval; + + memset(pi_buff, 0x00, PAGE_SIZE); + /* invalid parameters */ + if (!head || !pi_buff) + return -EINVAL; + + last_p_partition_ID = last_p_unit_address = ~0UL; + INIT_LIST_HEAD(head); + + do { + retval = hvcs_next_partner(unit_address, last_p_partition_ID, + last_p_unit_address, pi_buff); + if (retval) { + /* + * Don't indicate that we've failed if we have + * any list elements. + */ + if (!list_empty(head)) + return 0; + return retval; + } + + last_p_partition_ID = pi_buff[0]; + last_p_unit_address = pi_buff[1]; + + /* This indicates that there are no further partners */ + if (last_p_partition_ID == ~0UL + && last_p_unit_address == ~0UL) + break; + + /* This is a very small struct and will be freed soon in + * hvcs_free_partner_info(). */ + next_partner_info = kmalloc(sizeof(struct hvcs_partner_info), + GFP_ATOMIC); + + if (!next_partner_info) { + printk(KERN_WARNING "HVCONSOLE: kmalloc() failed to" + " allocate partner info struct.\n"); + hvcs_free_partner_info(head); + return -ENOMEM; + } + + next_partner_info->unit_address + = (unsigned int)last_p_unit_address; + next_partner_info->partition_ID + = (unsigned int)last_p_partition_ID; + + /* copy the Null-term char too */ + strncpy(&next_partner_info->location_code[0], + (char *)&pi_buff[2], + strlen((char *)&pi_buff[2]) + 1); + + list_add_tail(&(next_partner_info->node), head); + next_partner_info = NULL; + + } while (more); + + return 0; +} +EXPORT_SYMBOL(hvcs_get_partner_info); + +/** + * hvcs_register_connection - establish a connection between this vty-server and + * a vty. + * @unit_address: The unit address of the vty-server adapter that is to be + * establish a connection. + * @p_partition_ID: The partition ID of the vty adapter that is to be connected. + * @p_unit_address: The unit address of the vty adapter to which the vty-server + * is to be connected. + * + * If this function is called once and -EINVAL is returned it may + * indicate that the partner info needs to be refreshed for the + * target unit address at which point the caller must invoke + * hvcs_get_partner_info() and then call this function again. If, + * for a second time, -EINVAL is returned then it indicates that + * there is probably already a partner connection registered to a + * different vty-server adapter. It is also possible that a second + * -EINVAL may indicate that one of the parms is not valid, for + * instance if the link was removed between the vty-server adapter + * and the vty adapter that you are trying to open. Don't shoot the + * messenger. Firmware implemented it this way. + */ +int hvcs_register_connection( uint32_t unit_address, + uint32_t p_partition_ID, uint32_t p_unit_address) +{ + long retval; + retval = plpar_hcall_norets(H_REGISTER_VTERM, unit_address, + p_partition_ID, p_unit_address); + return hvcs_convert(retval); +} +EXPORT_SYMBOL(hvcs_register_connection); + +/** + * hvcs_free_connection - free the connection between a vty-server and vty + * @unit_address: The unit address of the vty-server that is to have its + * connection severed. + * + * This function is used to free the partner connection between a vty-server + * adapter and a vty adapter. + * + * If -EBUSY is returned continue to call this function until 0 is returned. + */ +int hvcs_free_connection(uint32_t unit_address) +{ + long retval; + retval = plpar_hcall_norets(H_FREE_VTERM, unit_address); + return hvcs_convert(retval); +} +EXPORT_SYMBOL(hvcs_free_connection); diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 513e2723149..97ba5214417 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -37,16 +37,15 @@ #include <asm/io.h> #include <asm/prom.h> #include <asm/rtas.h> -#include <asm/ppcdebug.h> #include <asm/iommu.h> #include <asm/pci-bridge.h> #include <asm/machdep.h> #include <asm/abs_addr.h> #include <asm/pSeries_reconfig.h> -#include <asm/systemcfg.h> #include <asm/firmware.h> #include <asm/tce.h> #include <asm/ppc-pci.h> +#include <asm/udbg.h> #include "plpar_wrappers.h" @@ -582,7 +581,7 @@ void iommu_init_early_pSeries(void) return; } - if (systemcfg->platform & PLATFORM_LPAR) { + if (platform_is_lpar()) { if (firmware_has_feature(FW_FEATURE_MULTITCE)) { ppc_md.tce_build = tce_buildmulti_pSeriesLP; ppc_md.tce_free = tce_freemulti_pSeriesLP; diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index e384a5a9179..a50e5f3f396 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -19,7 +19,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#define DEBUG +#undef DEBUG_LOW #include <linux/config.h> #include <linux/kernel.h> @@ -31,20 +31,21 @@ #include <asm/machdep.h> #include <asm/abs_addr.h> #include <asm/mmu_context.h> -#include <asm/ppcdebug.h> #include <asm/iommu.h> #include <asm/tlbflush.h> #include <asm/tlb.h> #include <asm/prom.h> #include <asm/abs_addr.h> #include <asm/cputable.h> +#include <asm/udbg.h> +#include <asm/smp.h> #include "plpar_wrappers.h" -#ifdef DEBUG -#define DBG(fmt...) udbg_printf(fmt) +#ifdef DEBUG_LOW +#define DBG_LOW(fmt...) do { udbg_printf(fmt); } while(0) #else -#define DBG(fmt...) +#define DBG_LOW(fmt...) do { } while(0) #endif /* in pSeries_hvCall.S */ @@ -276,8 +277,9 @@ void vpa_init(int cpu) } long pSeries_lpar_hpte_insert(unsigned long hpte_group, - unsigned long va, unsigned long prpn, - unsigned long vflags, unsigned long rflags) + unsigned long va, unsigned long pa, + unsigned long rflags, unsigned long vflags, + int psize) { unsigned long lpar_rc; unsigned long flags; @@ -285,11 +287,28 @@ long pSeries_lpar_hpte_insert(unsigned long hpte_group, unsigned long hpte_v, hpte_r; unsigned long dummy0, dummy1; - hpte_v = ((va >> 23) << HPTE_V_AVPN_SHIFT) | vflags | HPTE_V_VALID; - if (vflags & HPTE_V_LARGE) - hpte_v &= ~(1UL << HPTE_V_AVPN_SHIFT); - - hpte_r = (prpn << HPTE_R_RPN_SHIFT) | rflags; + if (!(vflags & HPTE_V_BOLTED)) + DBG_LOW("hpte_insert(group=%lx, va=%016lx, pa=%016lx, " + "rflags=%lx, vflags=%lx, psize=%d)\n", + hpte_group, va, pa, rflags, vflags, psize); + + hpte_v = hpte_encode_v(va, psize) | vflags | HPTE_V_VALID; + hpte_r = hpte_encode_r(pa, psize) | rflags; + + if (!(vflags & HPTE_V_BOLTED)) + DBG_LOW(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r); + +#if 1 + { + int i; + for (i=0;i<8;i++) { + unsigned long w0, w1; + plpar_pte_read(0, hpte_group, &w0, &w1); + BUG_ON (HPTE_V_COMPARE(hpte_v, w0) + && (w0 & HPTE_V_VALID)); + } + } +#endif /* Now fill in the actual HPTE */ /* Set CEC cookie to 0 */ @@ -299,23 +318,30 @@ long pSeries_lpar_hpte_insert(unsigned long hpte_group, /* Exact = 0 */ flags = 0; - /* XXX why is this here? - Anton */ + /* Make pHyp happy */ if (rflags & (_PAGE_GUARDED|_PAGE_NO_CACHE)) hpte_r &= ~_PAGE_COHERENT; lpar_rc = plpar_hcall(H_ENTER, flags, hpte_group, hpte_v, hpte_r, &slot, &dummy0, &dummy1); - - if (unlikely(lpar_rc == H_PTEG_Full)) + if (unlikely(lpar_rc == H_PTEG_Full)) { + if (!(vflags & HPTE_V_BOLTED)) + DBG_LOW(" full\n"); return -1; + } /* * Since we try and ioremap PHBs we don't own, the pte insert * will fail. However we must catch the failure in hash_page * or we will loop forever, so return -2 in this case. */ - if (unlikely(lpar_rc != H_Success)) + if (unlikely(lpar_rc != H_Success)) { + if (!(vflags & HPTE_V_BOLTED)) + DBG_LOW(" lpar err %d\n", lpar_rc); return -2; + } + if (!(vflags & HPTE_V_BOLTED)) + DBG_LOW(" -> slot: %d\n", slot & 7); /* Because of iSeries, we have to pass down the secondary * bucket bit here as well @@ -340,10 +366,8 @@ static long pSeries_lpar_hpte_remove(unsigned long hpte_group) /* don't remove a bolted entry */ lpar_rc = plpar_pte_remove(H_ANDCOND, hpte_group + slot_offset, (0x1UL << 4), &dummy1, &dummy2); - if (lpar_rc == H_Success) return i; - BUG_ON(lpar_rc != H_Not_Found); slot_offset++; @@ -371,20 +395,28 @@ static void pSeries_lpar_hptab_clear(void) * We can probably optimize here and assume the high bits of newpp are * already zero. For now I am paranoid. */ -static long pSeries_lpar_hpte_updatepp(unsigned long slot, unsigned long newpp, - unsigned long va, int large, int local) +static long pSeries_lpar_hpte_updatepp(unsigned long slot, + unsigned long newpp, + unsigned long va, + int psize, int local) { unsigned long lpar_rc; unsigned long flags = (newpp & 7) | H_AVPN; - unsigned long avpn = va >> 23; + unsigned long want_v; - if (large) - avpn &= ~0x1UL; + want_v = hpte_encode_v(va, psize); - lpar_rc = plpar_pte_protect(flags, slot, (avpn << 7)); + DBG_LOW(" update: avpnv=%016lx, hash=%016lx, f=%x, psize: %d ... ", + want_v & HPTE_V_AVPN, slot, flags, psize); - if (lpar_rc == H_Not_Found) + lpar_rc = plpar_pte_protect(flags, slot, want_v & HPTE_V_AVPN); + + if (lpar_rc == H_Not_Found) { + DBG_LOW("not found !\n"); return -1; + } + + DBG_LOW("ok\n"); BUG_ON(lpar_rc != H_Success); @@ -410,21 +442,22 @@ static unsigned long pSeries_lpar_hpte_getword0(unsigned long slot) return dword0; } -static long pSeries_lpar_hpte_find(unsigned long vpn) +static long pSeries_lpar_hpte_find(unsigned long va, int psize) { unsigned long hash; unsigned long i, j; long slot; - unsigned long hpte_v; + unsigned long want_v, hpte_v; - hash = hpt_hash(vpn, 0); + hash = hpt_hash(va, mmu_psize_defs[psize].shift); + want_v = hpte_encode_v(va, psize); for (j = 0; j < 2; j++) { slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; for (i = 0; i < HPTES_PER_GROUP; i++) { hpte_v = pSeries_lpar_hpte_getword0(slot); - if ((HPTE_V_AVPN_VAL(hpte_v) == (vpn >> 11)) + if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID) && (!!(hpte_v & HPTE_V_SECONDARY) == j)) { /* HPTE matches */ @@ -441,17 +474,15 @@ static long pSeries_lpar_hpte_find(unsigned long vpn) } static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp, - unsigned long ea) + unsigned long ea, + int psize) { - unsigned long lpar_rc; - unsigned long vsid, va, vpn, flags; - long slot; + unsigned long lpar_rc, slot, vsid, va, flags; vsid = get_kernel_vsid(ea); va = (vsid << 28) | (ea & 0x0fffffff); - vpn = va >> PAGE_SHIFT; - slot = pSeries_lpar_hpte_find(vpn); + slot = pSeries_lpar_hpte_find(va, psize); BUG_ON(slot == -1); flags = newpp & 7; @@ -461,18 +492,18 @@ static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp, } static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long va, - int large, int local) + int psize, int local) { - unsigned long avpn = va >> 23; + unsigned long want_v; unsigned long lpar_rc; unsigned long dummy1, dummy2; - if (large) - avpn &= ~0x1UL; - - lpar_rc = plpar_pte_remove(H_AVPN, slot, (avpn << 7), &dummy1, - &dummy2); + DBG_LOW(" inval : slot=%lx, va=%016lx, psize: %d, local: %d", + slot, va, psize, local); + want_v = hpte_encode_v(va, psize); + lpar_rc = plpar_pte_remove(H_AVPN, slot, want_v & HPTE_V_AVPN, + &dummy1, &dummy2); if (lpar_rc == H_Not_Found) return; @@ -494,7 +525,8 @@ void pSeries_lpar_flush_hash_range(unsigned long number, int local) spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags); for (i = 0; i < number; i++) - flush_hash_page(batch->vaddr[i], batch->pte[i], local); + flush_hash_page(batch->vaddr[i], batch->pte[i], + batch->psize, local); if (lock_tlbie) spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags); diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c index c198656a3bb..999a9620b5c 100644 --- a/arch/powerpc/platforms/pseries/pci.c +++ b/arch/powerpc/platforms/pseries/pci.c @@ -107,7 +107,6 @@ static void __init pSeries_request_regions(void) void __init pSeries_final_fixup(void) { - phbs_remap_io(); pSeries_request_regions(); pci_addr_cache_build(); @@ -123,7 +122,7 @@ static void fixup_winbond_82c105(struct pci_dev* dev) int i; unsigned int reg; - if (!(systemcfg->platform & PLATFORM_PSERIES)) + if (!platform_is_pseries()) return; printk("Using INTC for W82c105 IDE controller.\n"); diff --git a/arch/powerpc/platforms/pseries/plpar_wrappers.h b/arch/powerpc/platforms/pseries/plpar_wrappers.h index 382f8c5b0e7..3bd1b3e0600 100644 --- a/arch/powerpc/platforms/pseries/plpar_wrappers.h +++ b/arch/powerpc/platforms/pseries/plpar_wrappers.h @@ -107,14 +107,4 @@ static inline long plpar_put_term_char(unsigned long termno, unsigned long len, lbuf[1]); } -static inline long plpar_set_xdabr(unsigned long address, unsigned long flags) -{ - return plpar_hcall_norets(H_SET_XDABR, address, flags); -} - -static inline long plpar_set_dabr(unsigned long val) -{ - return plpar_hcall_norets(H_SET_DABR, val); -} - #endif /* _PSERIES_PLPAR_WRAPPERS_H */ diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 6562ff4b0a8..fbd214d68b0 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -48,7 +48,7 @@ #include <asm/ptrace.h> #include <asm/machdep.h> #include <asm/rtas.h> -#include <asm/ppcdebug.h> +#include <asm/udbg.h> static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX]; static DEFINE_SPINLOCK(ras_log_buf_lock); diff --git a/arch/powerpc/platforms/pseries/reconfig.c b/arch/powerpc/platforms/pseries/reconfig.c index 58c61219d08..d8864164dbe 100644 --- a/arch/powerpc/platforms/pseries/reconfig.c +++ b/arch/powerpc/platforms/pseries/reconfig.c @@ -286,10 +286,8 @@ static struct property *new_property(const char *name, const int length, return new; cleanup: - if (new->name) - kfree(new->name); - if (new->value) - kfree(new->value); + kfree(new->name); + kfree(new->value); kfree(new); return NULL; } @@ -410,7 +408,7 @@ static int proc_ppc64_create_ofdt(void) { struct proc_dir_entry *ent; - if (!(systemcfg->platform & PLATFORM_PSERIES)) + if (!platform_is_pseries()) return 0; ent = create_proc_entry("ppc64/ofdt", S_IWUSR, NULL); diff --git a/arch/powerpc/platforms/pseries/rtasd.c b/arch/powerpc/platforms/pseries/rtasd.c index e26b0420b6d..a6f628d4c9d 100644 --- a/arch/powerpc/platforms/pseries/rtasd.c +++ b/arch/powerpc/platforms/pseries/rtasd.c @@ -27,7 +27,6 @@ #include <asm/prom.h> #include <asm/nvram.h> #include <asm/atomic.h> -#include <asm/systemcfg.h> #if 0 #define DEBUG(A...) printk(KERN_ERR A) @@ -482,10 +481,12 @@ static int __init rtas_init(void) { struct proc_dir_entry *entry; - /* No RTAS, only warn if we are on a pSeries box */ + if (!platform_is_pseries()) + return 0; + + /* No RTAS */ if (rtas_token("event-scan") == RTAS_UNKNOWN_SERVICE) { - if (systemcfg->platform & PLATFORM_PSERIES) - printk(KERN_INFO "rtasd: no event-scan on system\n"); + printk(KERN_INFO "rtasd: no event-scan on system\n"); return 1; } diff --git a/arch/powerpc/platforms/pseries/scanlog.c b/arch/powerpc/platforms/pseries/scanlog.c new file mode 100644 index 00000000000..2edc947f7c4 --- /dev/null +++ b/arch/powerpc/platforms/pseries/scanlog.c @@ -0,0 +1,235 @@ +/* + * c 2001 PPC 64 Team, IBM Corp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * scan-log-data driver for PPC64 Todd Inglett <tinglett@vnet.ibm.com> + * + * When ppc64 hardware fails the service processor dumps internal state + * of the system. After a reboot the operating system can access a dump + * of this data using this driver. A dump exists if the device-tree + * /chosen/ibm,scan-log-data property exists. + * + * This driver exports /proc/ppc64/scan-log-dump which can be read. + * The driver supports only sequential reads. + * + * The driver looks at a write to the driver for the single word "reset". + * If given, the driver will reset the scanlog so the platform can free it. + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/proc_fs.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <asm/uaccess.h> +#include <asm/rtas.h> +#include <asm/prom.h> + +#define MODULE_VERS "1.0" +#define MODULE_NAME "scanlog" + +/* Status returns from ibm,scan-log-dump */ +#define SCANLOG_COMPLETE 0 +#define SCANLOG_HWERROR -1 +#define SCANLOG_CONTINUE 1 + +#define DEBUG(A...) do { if (scanlog_debug) printk(KERN_ERR "scanlog: " A); } while (0) + +static int scanlog_debug; +static unsigned int ibm_scan_log_dump; /* RTAS token */ +static struct proc_dir_entry *proc_ppc64_scan_log_dump; /* The proc file */ + +static ssize_t scanlog_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct inode * inode = file->f_dentry->d_inode; + struct proc_dir_entry *dp; + unsigned int *data; + int status; + unsigned long len, off; + unsigned int wait_time; + + dp = PDE(inode); + data = (unsigned int *)dp->data; + + if (!data) { + printk(KERN_ERR "scanlog: read failed no data\n"); + return -EIO; + } + + if (count > RTAS_DATA_BUF_SIZE) + count = RTAS_DATA_BUF_SIZE; + + if (count < 1024) { + /* This is the min supported by this RTAS call. Rather + * than do all the buffering we insist the user code handle + * larger reads. As long as cp works... :) + */ + printk(KERN_ERR "scanlog: cannot perform a small read (%ld)\n", count); + return -EINVAL; + } + + if (!access_ok(VERIFY_WRITE, buf, count)) + return -EFAULT; + + for (;;) { + wait_time = 500; /* default wait if no data */ + spin_lock(&rtas_data_buf_lock); + memcpy(rtas_data_buf, data, RTAS_DATA_BUF_SIZE); + status = rtas_call(ibm_scan_log_dump, 2, 1, NULL, + (u32) __pa(rtas_data_buf), (u32) count); + memcpy(data, rtas_data_buf, RTAS_DATA_BUF_SIZE); + spin_unlock(&rtas_data_buf_lock); + + DEBUG("status=%d, data[0]=%x, data[1]=%x, data[2]=%x\n", + status, data[0], data[1], data[2]); + switch (status) { + case SCANLOG_COMPLETE: + DEBUG("hit eof\n"); + return 0; + case SCANLOG_HWERROR: + DEBUG("hardware error reading scan log data\n"); + return -EIO; + case SCANLOG_CONTINUE: + /* We may or may not have data yet */ + len = data[1]; + off = data[2]; + if (len > 0) { + if (copy_to_user(buf, ((char *)data)+off, len)) + return -EFAULT; + return len; + } + /* Break to sleep default time */ + break; + default: + if (status > 9900 && status <= 9905) { + wait_time = rtas_extended_busy_delay_time(status); + } else { + printk(KERN_ERR "scanlog: unknown error from rtas: %d\n", status); + return -EIO; + } + } + /* Apparently no data yet. Wait and try again. */ + msleep_interruptible(wait_time); + } + /*NOTREACHED*/ +} + +static ssize_t scanlog_write(struct file * file, const char __user * buf, + size_t count, loff_t *ppos) +{ + char stkbuf[20]; + int status; + + if (count > 19) count = 19; + if (copy_from_user (stkbuf, buf, count)) { + return -EFAULT; + } + stkbuf[count] = 0; + + if (buf) { + if (strncmp(stkbuf, "reset", 5) == 0) { + DEBUG("reset scanlog\n"); + status = rtas_call(ibm_scan_log_dump, 2, 1, NULL, 0, 0); + DEBUG("rtas returns %d\n", status); + } else if (strncmp(stkbuf, "debugon", 7) == 0) { + printk(KERN_ERR "scanlog: debug on\n"); + scanlog_debug = 1; + } else if (strncmp(stkbuf, "debugoff", 8) == 0) { + printk(KERN_ERR "scanlog: debug off\n"); + scanlog_debug = 0; + } + } + return count; +} + +static int scanlog_open(struct inode * inode, struct file * file) +{ + struct proc_dir_entry *dp = PDE(inode); + unsigned int *data = (unsigned int *)dp->data; + + if (!data) { + printk(KERN_ERR "scanlog: open failed no data\n"); + return -EIO; + } + + if (data[0] != 0) { + /* This imperfect test stops a second copy of the + * data (or a reset while data is being copied) + */ + return -EBUSY; + } + + data[0] = 0; /* re-init so we restart the scan */ + + return 0; +} + +static int scanlog_release(struct inode * inode, struct file * file) +{ + struct proc_dir_entry *dp = PDE(inode); + unsigned int *data = (unsigned int *)dp->data; + + if (!data) { + printk(KERN_ERR "scanlog: release failed no data\n"); + return -EIO; + } + data[0] = 0; + + return 0; +} + +struct file_operations scanlog_fops = { + .owner = THIS_MODULE, + .read = scanlog_read, + .write = scanlog_write, + .open = scanlog_open, + .release = scanlog_release, +}; + +int __init scanlog_init(void) +{ + struct proc_dir_entry *ent; + + ibm_scan_log_dump = rtas_token("ibm,scan-log-dump"); + if (ibm_scan_log_dump == RTAS_UNKNOWN_SERVICE) { + printk(KERN_ERR "scan-log-dump not implemented on this system\n"); + return -EIO; + } + + ent = create_proc_entry("ppc64/rtas/scan-log-dump", S_IRUSR, NULL); + if (ent) { + ent->proc_fops = &scanlog_fops; + /* Ideally we could allocate a buffer < 4G */ + ent->data = kmalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL); + if (!ent->data) { + printk(KERN_ERR "Failed to allocate a buffer\n"); + remove_proc_entry("scan-log-dump", ent->parent); + return -ENOMEM; + } + ((unsigned int *)ent->data)[0] = 0; + } else { + printk(KERN_ERR "Failed to create ppc64/scan-log-dump proc entry\n"); + return -EIO; + } + proc_ppc64_scan_log_dump = ent; + + return 0; +} + +void __exit scanlog_cleanup(void) +{ + if (proc_ppc64_scan_log_dump) { + kfree(proc_ppc64_scan_log_dump->data); + remove_proc_entry("scan-log-dump", proc_ppc64_scan_log_dump->parent); + } +} + +module_init(scanlog_init); +module_exit(scanlog_cleanup); +MODULE_LICENSE("GPL"); diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 65bee939eec..b9d9732b2e0 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -65,6 +65,7 @@ #include <asm/ppc-pci.h> #include <asm/i8259.h> #include <asm/udbg.h> +#include <asm/smp.h> #include "plpar_wrappers.h" @@ -199,14 +200,12 @@ static void __init pSeries_setup_arch(void) if (ppc64_interrupt_controller == IC_OPEN_PIC) { ppc_md.init_IRQ = pSeries_init_mpic; ppc_md.get_irq = mpic_get_irq; - ppc_md.cpu_irq_down = mpic_teardown_this_cpu; /* Allocate the mpic now, so that find_and_init_phbs() can * fill the ISUs */ pSeries_setup_mpic(); } else { ppc_md.init_IRQ = xics_init_IRQ; ppc_md.get_irq = xics_get_irq; - ppc_md.cpu_irq_down = xics_teardown_cpu; } #ifdef CONFIG_SMP @@ -248,7 +247,7 @@ static void __init pSeries_setup_arch(void) ppc_md.idle_loop = default_idle; } - if (systemcfg->platform & PLATFORM_LPAR) + if (platform_is_lpar()) ppc_md.enable_pmcs = pseries_lpar_enable_pmcs; else ppc_md.enable_pmcs = power4_enable_pmcs; @@ -305,9 +304,7 @@ static void __init fw_feature_init(void) } of_node_put(dn); - no_rtas: - printk(KERN_INFO "firmware_features = 0x%lx\n", - ppc64_firmware_features); +no_rtas: DBG(" <- fw_feature_init()\n"); } @@ -353,14 +350,15 @@ static void pSeries_mach_cpu_die(void) static int pseries_set_dabr(unsigned long dabr) { - if (firmware_has_feature(FW_FEATURE_XDABR)) { - /* We want to catch accesses from kernel and userspace */ - return plpar_set_xdabr(dabr, H_DABRX_KERNEL | H_DABRX_USER); - } - - return plpar_set_dabr(dabr); + return plpar_hcall_norets(H_SET_DABR, dabr); } +static int pseries_set_xdabr(unsigned long dabr) +{ + /* We want to catch accesses from kernel and userspace */ + return plpar_hcall_norets(H_SET_XDABR, dabr, + H_DABRX_KERNEL | H_DABRX_USER); +} /* * Early initialization. Relocation is on but do not reference unbolted pages @@ -376,7 +374,7 @@ static void __init pSeries_init_early(void) fw_feature_init(); - if (systemcfg->platform & PLATFORM_LPAR) + if (platform_is_lpar()) hpte_init_lpar(); else { hpte_init_native(); @@ -386,7 +384,7 @@ static void __init pSeries_init_early(void) generic_find_legacy_serial_ports(&physport, &default_speed); - if (systemcfg->platform & PLATFORM_LPAR) + if (platform_is_lpar()) find_udbg_vterm(); else if (physport) { /* Map the uart for udbg. */ @@ -396,8 +394,10 @@ static void __init pSeries_init_early(void) DBG("Hello World !\n"); } - if (firmware_has_feature(FW_FEATURE_XDABR | FW_FEATURE_DABR)) + if (firmware_has_feature(FW_FEATURE_DABR)) ppc_md.set_dabr = pseries_set_dabr; + else if (firmware_has_feature(FW_FEATURE_XDABR)) + ppc_md.set_dabr = pseries_set_xdabr; iommu_init_early_pSeries(); @@ -465,6 +465,7 @@ static inline void dedicated_idle_sleep(unsigned int cpu) * more. */ clear_thread_flag(TIF_POLLING_NRFLAG); + smp_mb__after_clear_bit(); /* * SMT dynamic mode. Cede will result in this thread going @@ -477,6 +478,7 @@ static inline void dedicated_idle_sleep(unsigned int cpu) cede_processor(); else local_irq_enable(); + set_thread_flag(TIF_POLLING_NRFLAG); } else { /* * Give the HV an opportunity at the processor, since we are @@ -488,11 +490,11 @@ static inline void dedicated_idle_sleep(unsigned int cpu) static void pseries_dedicated_idle(void) { - long oldval; struct paca_struct *lpaca = get_paca(); unsigned int cpu = smp_processor_id(); unsigned long start_snooze; unsigned long *smt_snooze_delay = &__get_cpu_var(smt_snooze_delay); + set_thread_flag(TIF_POLLING_NRFLAG); while (1) { /* @@ -501,10 +503,7 @@ static void pseries_dedicated_idle(void) */ lpaca->lppaca.idle = 1; - oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED); - if (!oldval) { - set_thread_flag(TIF_POLLING_NRFLAG); - + if (!need_resched()) { start_snooze = __get_tb() + *smt_snooze_delay * tb_ticks_per_usec; @@ -527,15 +526,14 @@ static void pseries_dedicated_idle(void) } HMT_medium(); - clear_thread_flag(TIF_POLLING_NRFLAG); - } else { - set_need_resched(); } lpaca->lppaca.idle = 0; ppc64_runlatch_on(); + preempt_enable_no_resched(); schedule(); + preempt_disable(); if (cpu_is_offline(cpu) && system_state == SYSTEM_RUNNING) cpu_die(); @@ -579,7 +577,9 @@ static void pseries_shared_idle(void) lpaca->lppaca.idle = 0; ppc64_runlatch_on(); + preempt_enable_no_resched(); schedule(); + preempt_disable(); if (cpu_is_offline(cpu) && system_state == SYSTEM_RUNNING) cpu_die(); @@ -588,11 +588,32 @@ static void pseries_shared_idle(void) static int pSeries_pci_probe_mode(struct pci_bus *bus) { - if (systemcfg->platform & PLATFORM_LPAR) + if (platform_is_lpar()) return PCI_PROBE_DEVTREE; return PCI_PROBE_NORMAL; } +#ifdef CONFIG_KEXEC +static void pseries_kexec_cpu_down(int crash_shutdown, int secondary) +{ + /* Don't risk a hypervisor call if we're crashing */ + if (!crash_shutdown) { + unsigned long vpa = __pa(&get_paca()->lppaca); + + if (unregister_vpa(hard_smp_processor_id(), vpa)) { + printk("VPA deregistration of cpu %u (hw_cpu_id %d) " + "failed\n", smp_processor_id(), + hard_smp_processor_id()); + } + } + + if (ppc64_interrupt_controller == IC_OPEN_PIC) + mpic_teardown_this_cpu(secondary); + else + xics_teardown_cpu(secondary); +} +#endif + struct machdep_calls __initdata pSeries_md = { .probe = pSeries_probe, .setup_arch = pSeries_setup_arch, @@ -615,4 +636,7 @@ struct machdep_calls __initdata pSeries_md = { .check_legacy_ioport = pSeries_check_legacy_ioport, .system_reset_exception = pSeries_system_reset_exception, .machine_check_exception = pSeries_machine_check_exception, +#ifdef CONFIG_KEXEC + .kexec_cpu_down = pseries_kexec_cpu_down, +#endif }; diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c index 7a243e8ccd7..5800cde7d5a 100644 --- a/arch/powerpc/platforms/pseries/smp.c +++ b/arch/powerpc/platforms/pseries/smp.c @@ -46,6 +46,7 @@ #include <asm/rtas.h> #include <asm/pSeries_reconfig.h> #include <asm/mpic.h> +#include <asm/vdso_datapage.h> #include "plpar_wrappers.h" @@ -96,7 +97,7 @@ int pSeries_cpu_disable(void) int cpu = smp_processor_id(); cpu_clear(cpu, cpu_online_map); - systemcfg->processorCount--; + vdso_data->processorCount--; /*fix boot_cpuid here*/ if (cpu == boot_cpuid) @@ -441,7 +442,7 @@ void __init smp_init_pSeries(void) smp_ops->cpu_die = pSeries_cpu_die; /* Processors can be added/removed only on LPAR */ - if (systemcfg->platform == PLATFORM_PSERIES_LPAR) + if (platform_is_lpar()) pSeries_reconfig_notifier_register(&pSeries_smp_nb); #endif diff --git a/arch/powerpc/platforms/pseries/xics.c b/arch/powerpc/platforms/pseries/xics.c index c72c86f05cb..72ac18067ec 100644 --- a/arch/powerpc/platforms/pseries/xics.c +++ b/arch/powerpc/platforms/pseries/xics.c @@ -545,7 +545,9 @@ nextnode: of_node_put(np); } - if (systemcfg->platform == PLATFORM_PSERIES) { + if (platform_is_lpar()) + ops = &pSeriesLP_ops; + else { #ifdef CONFIG_SMP for_each_cpu(i) { int hard_id; @@ -561,12 +563,11 @@ nextnode: #else xics_per_cpu[0] = ioremap(intr_base, intr_size); #endif /* CONFIG_SMP */ - } else if (systemcfg->platform == PLATFORM_PSERIES_LPAR) { - ops = &pSeriesLP_ops; } xics_8259_pic.enable = i8259_pic.enable; xics_8259_pic.disable = i8259_pic.disable; + xics_8259_pic.end = i8259_pic.end; for (i = 0; i < 16; ++i) get_irq_desc(i)->handler = &xics_8259_pic; for (; i < NR_IRQS; ++i) diff --git a/arch/powerpc/sysdev/i8259.c b/arch/powerpc/sysdev/i8259.c index 90bce6e0c19..b7ac32fdd77 100644 --- a/arch/powerpc/sysdev/i8259.c +++ b/arch/powerpc/sysdev/i8259.c @@ -207,6 +207,9 @@ void __init i8259_init(unsigned long intack_addr, int offset) spin_unlock_irqrestore(&i8259_lock, flags); + for (i = 0; i < NUM_ISA_INTERRUPTS; ++i) + irq_desc[offset + i].handler = &i8259_pic; + /* reserve our resources */ setup_irq(offset + 2, &i8259_irqaction); request_resource(&ioport_resource, &pic1_iores); @@ -216,6 +219,4 @@ void __init i8259_init(unsigned long intack_addr, int offset) if (intack_addr != 0) pci_intack = ioremap(intack_addr, 1); - for (i = 0; i < NUM_ISA_INTERRUPTS; ++i) - irq_desc[offset + i].handler = &i8259_pic; } diff --git a/arch/powerpc/sysdev/u3_iommu.c b/arch/powerpc/sysdev/u3_iommu.c index 607722178c1..f32baf7f469 100644 --- a/arch/powerpc/sysdev/u3_iommu.c +++ b/arch/powerpc/sysdev/u3_iommu.c @@ -37,7 +37,6 @@ #include <linux/vmalloc.h> #include <asm/io.h> #include <asm/prom.h> -#include <asm/ppcdebug.h> #include <asm/iommu.h> #include <asm/pci-bridge.h> #include <asm/machdep.h> @@ -227,7 +226,7 @@ static void iommu_table_u3_setup(void) iommu_table_u3.it_busno = 0; iommu_table_u3.it_offset = 0; /* it_size is in number of entries */ - iommu_table_u3.it_size = dart_tablesize / sizeof(u32); + iommu_table_u3.it_size = (dart_tablesize / sizeof(u32)) >> DART_PAGE_FACTOR; /* Initialize the common IOMMU code */ iommu_table_u3.it_base = (unsigned long)dart_vbase; diff --git a/arch/powerpc/xmon/Makefile b/arch/powerpc/xmon/Makefile index 79a784f0e7a..b20312e5ed2 100644 --- a/arch/powerpc/xmon/Makefile +++ b/arch/powerpc/xmon/Makefile @@ -8,4 +8,4 @@ obj-$(CONFIG_8xx) += start_8xx.o obj-$(CONFIG_6xx) += start_32.o obj-$(CONFIG_4xx) += start_32.o obj-$(CONFIG_PPC64) += start_64.o -obj-y += xmon.o ppc-dis.o ppc-opc.o subr_prf.o setjmp.o +obj-y += xmon.o ppc-dis.o ppc-opc.o setjmp.o nonstdio.o diff --git a/arch/powerpc/xmon/nonstdio.c b/arch/powerpc/xmon/nonstdio.c new file mode 100644 index 00000000000..78765833f4c --- /dev/null +++ b/arch/powerpc/xmon/nonstdio.c @@ -0,0 +1,134 @@ +/* + * Copyright (C) 1996-2005 Paul Mackerras. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/string.h> +#include <asm/time.h> +#include "nonstdio.h" + +int xmon_putchar(int c) +{ + char ch = c; + + if (c == '\n') + xmon_putchar('\r'); + return xmon_write(&ch, 1) == 1? c: -1; +} + +static char line[256]; +static char *lineptr; +static int lineleft; + +int xmon_expect(const char *str, unsigned long timeout) +{ + int c; + unsigned long t0; + + /* assume 25MHz default timebase if tb_ticks_per_sec not set yet */ + timeout *= tb_ticks_per_sec? tb_ticks_per_sec: 25000000; + t0 = get_tbl(); + do { + lineptr = line; + for (;;) { + c = xmon_read_poll(); + if (c == -1) { + if (get_tbl() - t0 > timeout) + return 0; + continue; + } + if (c == '\n') + break; + if (c != '\r' && lineptr < &line[sizeof(line) - 1]) + *lineptr++ = c; + } + *lineptr = 0; + } while (strstr(line, str) == NULL); + return 1; +} + +int xmon_getchar(void) +{ + int c; + + if (lineleft == 0) { + lineptr = line; + for (;;) { + c = xmon_readchar(); + if (c == -1 || c == 4) + break; + if (c == '\r' || c == '\n') { + *lineptr++ = '\n'; + xmon_putchar('\n'); + break; + } + switch (c) { + case 0177: + case '\b': + if (lineptr > line) { + xmon_putchar('\b'); + xmon_putchar(' '); + xmon_putchar('\b'); + --lineptr; + } + break; + case 'U' & 0x1F: + while (lineptr > line) { + xmon_putchar('\b'); + xmon_putchar(' '); + xmon_putchar('\b'); + --lineptr; + } + break; + default: + if (lineptr >= &line[sizeof(line) - 1]) + xmon_putchar('\a'); + else { + xmon_putchar(c); + *lineptr++ = c; + } + } + } + lineleft = lineptr - line; + lineptr = line; + } + if (lineleft == 0) + return -1; + --lineleft; + return *lineptr++; +} + +char *xmon_gets(char *str, int nb) +{ + char *p; + int c; + + for (p = str; p < str + nb - 1; ) { + c = xmon_getchar(); + if (c == -1) { + if (p == str) + return NULL; + break; + } + *p++ = c; + if (c == '\n') + break; + } + *p = 0; + return str; +} + +void xmon_printf(const char *format, ...) +{ + va_list args; + int n; + static char xmon_outbuf[1024]; + + va_start(args, format); + n = vsnprintf(xmon_outbuf, sizeof(xmon_outbuf), format, args); + va_end(args); + xmon_write(xmon_outbuf, n); +} diff --git a/arch/powerpc/xmon/nonstdio.h b/arch/powerpc/xmon/nonstdio.h index 84211a21c6f..47cebbd2b1b 100644 --- a/arch/powerpc/xmon/nonstdio.h +++ b/arch/powerpc/xmon/nonstdio.h @@ -1,22 +1,14 @@ -typedef int FILE; -extern FILE *xmon_stdin, *xmon_stdout; #define EOF (-1) -#define stdin xmon_stdin -#define stdout xmon_stdout + #define printf xmon_printf -#define fprintf xmon_fprintf -#define fputs xmon_fputs -#define fgets xmon_fgets #define putchar xmon_putchar -#define getchar xmon_getchar -#define putc xmon_putc -#define getc xmon_getc -#define fopen(n, m) NULL -#define fflush(f) do {} while (0) -#define fclose(f) do {} while (0) -extern char *fgets(char *, int, void *); -extern void xmon_printf(const char *, ...); -extern void xmon_fprintf(void *, const char *, ...); -extern void xmon_sprintf(char *, const char *, ...); -#define perror(s) printf("%s: no files!\n", (s)) +extern int xmon_putchar(int c); +extern int xmon_getchar(void); +extern char *xmon_gets(char *, int); +extern void xmon_printf(const char *, ...); +extern void xmon_map_scc(void); +extern int xmon_expect(const char *str, unsigned long timeout); +extern int xmon_write(void *ptr, int nb); +extern int xmon_readchar(void); +extern int xmon_read_poll(void); diff --git a/arch/powerpc/xmon/setjmp.S b/arch/powerpc/xmon/setjmp.S index f8e40dfd2bf..96a91f10e2e 100644 --- a/arch/powerpc/xmon/setjmp.S +++ b/arch/powerpc/xmon/setjmp.S @@ -14,61 +14,61 @@ _GLOBAL(xmon_setjmp) mflr r0 - STL r0,0(r3) - STL r1,SZL(r3) - STL r2,2*SZL(r3) + PPC_STL r0,0(r3) + PPC_STL r1,SZL(r3) + PPC_STL r2,2*SZL(r3) mfcr r0 - STL r0,3*SZL(r3) - STL r13,4*SZL(r3) - STL r14,5*SZL(r3) - STL r15,6*SZL(r3) - STL r16,7*SZL(r3) - STL r17,8*SZL(r3) - STL r18,9*SZL(r3) - STL r19,10*SZL(r3) - STL r20,11*SZL(r3) - STL r21,12*SZL(r3) - STL r22,13*SZL(r3) - STL r23,14*SZL(r3) - STL r24,15*SZL(r3) - STL r25,16*SZL(r3) - STL r26,17*SZL(r3) - STL r27,18*SZL(r3) - STL r28,19*SZL(r3) - STL r29,20*SZL(r3) - STL r30,21*SZL(r3) - STL r31,22*SZL(r3) + PPC_STL r0,3*SZL(r3) + PPC_STL r13,4*SZL(r3) + PPC_STL r14,5*SZL(r3) + PPC_STL r15,6*SZL(r3) + PPC_STL r16,7*SZL(r3) + PPC_STL r17,8*SZL(r3) + PPC_STL r18,9*SZL(r3) + PPC_STL r19,10*SZL(r3) + PPC_STL r20,11*SZL(r3) + PPC_STL r21,12*SZL(r3) + PPC_STL r22,13*SZL(r3) + PPC_STL r23,14*SZL(r3) + PPC_STL r24,15*SZL(r3) + PPC_STL r25,16*SZL(r3) + PPC_STL r26,17*SZL(r3) + PPC_STL r27,18*SZL(r3) + PPC_STL r28,19*SZL(r3) + PPC_STL r29,20*SZL(r3) + PPC_STL r30,21*SZL(r3) + PPC_STL r31,22*SZL(r3) li r3,0 blr _GLOBAL(xmon_longjmp) - CMPI r4,0 + PPC_LCMPI r4,0 bne 1f li r4,1 -1: LDL r13,4*SZL(r3) - LDL r14,5*SZL(r3) - LDL r15,6*SZL(r3) - LDL r16,7*SZL(r3) - LDL r17,8*SZL(r3) - LDL r18,9*SZL(r3) - LDL r19,10*SZL(r3) - LDL r20,11*SZL(r3) - LDL r21,12*SZL(r3) - LDL r22,13*SZL(r3) - LDL r23,14*SZL(r3) - LDL r24,15*SZL(r3) - LDL r25,16*SZL(r3) - LDL r26,17*SZL(r3) - LDL r27,18*SZL(r3) - LDL r28,19*SZL(r3) - LDL r29,20*SZL(r3) - LDL r30,21*SZL(r3) - LDL r31,22*SZL(r3) - LDL r0,3*SZL(r3) +1: PPC_LL r13,4*SZL(r3) + PPC_LL r14,5*SZL(r3) + PPC_LL r15,6*SZL(r3) + PPC_LL r16,7*SZL(r3) + PPC_LL r17,8*SZL(r3) + PPC_LL r18,9*SZL(r3) + PPC_LL r19,10*SZL(r3) + PPC_LL r20,11*SZL(r3) + PPC_LL r21,12*SZL(r3) + PPC_LL r22,13*SZL(r3) + PPC_LL r23,14*SZL(r3) + PPC_LL r24,15*SZL(r3) + PPC_LL r25,16*SZL(r3) + PPC_LL r26,17*SZL(r3) + PPC_LL r27,18*SZL(r3) + PPC_LL r28,19*SZL(r3) + PPC_LL r29,20*SZL(r3) + PPC_LL r30,21*SZL(r3) + PPC_LL r31,22*SZL(r3) + PPC_LL r0,3*SZL(r3) mtcrf 0x38,r0 - LDL r0,0(r3) - LDL r1,SZL(r3) - LDL r2,2*SZL(r3) + PPC_LL r0,0(r3) + PPC_LL r1,SZL(r3) + PPC_LL r2,2*SZL(r3) mtlr r0 mr r3,r4 blr @@ -84,52 +84,52 @@ _GLOBAL(xmon_longjmp) * different ABIs, though). */ _GLOBAL(xmon_save_regs) - STL r0,0*SZL(r3) - STL r2,2*SZL(r3) - STL r3,3*SZL(r3) - STL r4,4*SZL(r3) - STL r5,5*SZL(r3) - STL r6,6*SZL(r3) - STL r7,7*SZL(r3) - STL r8,8*SZL(r3) - STL r9,9*SZL(r3) - STL r10,10*SZL(r3) - STL r11,11*SZL(r3) - STL r12,12*SZL(r3) - STL r13,13*SZL(r3) - STL r14,14*SZL(r3) - STL r15,15*SZL(r3) - STL r16,16*SZL(r3) - STL r17,17*SZL(r3) - STL r18,18*SZL(r3) - STL r19,19*SZL(r3) - STL r20,20*SZL(r3) - STL r21,21*SZL(r3) - STL r22,22*SZL(r3) - STL r23,23*SZL(r3) - STL r24,24*SZL(r3) - STL r25,25*SZL(r3) - STL r26,26*SZL(r3) - STL r27,27*SZL(r3) - STL r28,28*SZL(r3) - STL r29,29*SZL(r3) - STL r30,30*SZL(r3) - STL r31,31*SZL(r3) + PPC_STL r0,0*SZL(r3) + PPC_STL r2,2*SZL(r3) + PPC_STL r3,3*SZL(r3) + PPC_STL r4,4*SZL(r3) + PPC_STL r5,5*SZL(r3) + PPC_STL r6,6*SZL(r3) + PPC_STL r7,7*SZL(r3) + PPC_STL r8,8*SZL(r3) + PPC_STL r9,9*SZL(r3) + PPC_STL r10,10*SZL(r3) + PPC_STL r11,11*SZL(r3) + PPC_STL r12,12*SZL(r3) + PPC_STL r13,13*SZL(r3) + PPC_STL r14,14*SZL(r3) + PPC_STL r15,15*SZL(r3) + PPC_STL r16,16*SZL(r3) + PPC_STL r17,17*SZL(r3) + PPC_STL r18,18*SZL(r3) + PPC_STL r19,19*SZL(r3) + PPC_STL r20,20*SZL(r3) + PPC_STL r21,21*SZL(r3) + PPC_STL r22,22*SZL(r3) + PPC_STL r23,23*SZL(r3) + PPC_STL r24,24*SZL(r3) + PPC_STL r25,25*SZL(r3) + PPC_STL r26,26*SZL(r3) + PPC_STL r27,27*SZL(r3) + PPC_STL r28,28*SZL(r3) + PPC_STL r29,29*SZL(r3) + PPC_STL r30,30*SZL(r3) + PPC_STL r31,31*SZL(r3) /* go up one stack frame for SP */ - LDL r4,0(r1) - STL r4,1*SZL(r3) + PPC_LL r4,0(r1) + PPC_STL r4,1*SZL(r3) /* get caller's LR */ - LDL r0,LRSAVE(r4) - STL r0,_NIP-STACK_FRAME_OVERHEAD(r3) - STL r0,_LINK-STACK_FRAME_OVERHEAD(r3) + PPC_LL r0,LRSAVE(r4) + PPC_STL r0,_NIP-STACK_FRAME_OVERHEAD(r3) + PPC_STL r0,_LINK-STACK_FRAME_OVERHEAD(r3) mfmsr r0 - STL r0,_MSR-STACK_FRAME_OVERHEAD(r3) + PPC_STL r0,_MSR-STACK_FRAME_OVERHEAD(r3) mfctr r0 - STL r0,_CTR-STACK_FRAME_OVERHEAD(r3) + PPC_STL r0,_CTR-STACK_FRAME_OVERHEAD(r3) mfxer r0 - STL r0,_XER-STACK_FRAME_OVERHEAD(r3) + PPC_STL r0,_XER-STACK_FRAME_OVERHEAD(r3) mfcr r0 - STL r0,_CCR-STACK_FRAME_OVERHEAD(r3) + PPC_STL r0,_CCR-STACK_FRAME_OVERHEAD(r3) li r0,0 - STL r0,_TRAP-STACK_FRAME_OVERHEAD(r3) + PPC_STL r0,_TRAP-STACK_FRAME_OVERHEAD(r3) blr diff --git a/arch/powerpc/xmon/start_32.c b/arch/powerpc/xmon/start_32.c index 69b658c0f76..c2464df4217 100644 --- a/arch/powerpc/xmon/start_32.c +++ b/arch/powerpc/xmon/start_32.c @@ -11,7 +11,6 @@ #include <linux/cuda.h> #include <linux/kernel.h> #include <linux/errno.h> -#include <linux/sysrq.h> #include <linux/bitops.h> #include <asm/xmon.h> #include <asm/prom.h> @@ -22,10 +21,11 @@ #include <asm/processor.h> #include <asm/delay.h> #include <asm/btext.h> +#include <asm/time.h> +#include "nonstdio.h" static volatile unsigned char __iomem *sccc, *sccd; unsigned int TXRDY, RXRDY, DLAB; -static int xmon_expect(const char *str, unsigned int timeout); static int use_serial; static int use_screen; @@ -33,16 +33,6 @@ static int via_modem; static int xmon_use_sccb; static struct device_node *channel_node; -#define TB_SPEED 25000000 - -static inline unsigned int readtb(void) -{ - unsigned int ret; - - asm volatile("mftb %0" : "=r" (ret) :); - return ret; -} - void buf_access(void) { if (DLAB) @@ -91,23 +81,7 @@ static unsigned long chrp_find_phys_io_base(void) } #endif /* CONFIG_PPC_CHRP */ -#ifdef CONFIG_MAGIC_SYSRQ -static void sysrq_handle_xmon(int key, struct pt_regs *regs, - struct tty_struct *tty) -{ - xmon(regs); -} - -static struct sysrq_key_op sysrq_xmon_op = -{ - .handler = sysrq_handle_xmon, - .help_msg = "Xmon", - .action_msg = "Entering xmon", -}; -#endif - -void -xmon_map_scc(void) +void xmon_map_scc(void) { #ifdef CONFIG_PPC_MULTIPLATFORM volatile unsigned char __iomem *base; @@ -217,8 +191,6 @@ xmon_map_scc(void) RXRDY = 1; DLAB = 0x80; #endif /* platform */ - - register_sysrq_key('x', &sysrq_xmon_op); } static int scc_initialized = 0; @@ -238,8 +210,7 @@ static inline void do_poll_adb(void) #endif /* CONFIG_ADB_CUDA */ } -int -xmon_write(void *handle, void *ptr, int nb) +int xmon_write(void *ptr, int nb) { char *p = ptr; int i, c, ct; @@ -311,8 +282,7 @@ static unsigned char xmon_shift_keytab[128] = "\0.\0*\0+\0\0\0\0\0/\r\0-\0" /* 0x40 - 0x4f */ "\0\0000123456789\0\0\0"; /* 0x50 - 0x5f */ -static int -xmon_get_adb_key(void) +static int xmon_get_adb_key(void) { int k, t, on; @@ -350,32 +320,21 @@ xmon_get_adb_key(void) } #endif /* CONFIG_BOOTX_TEXT */ -int -xmon_read(void *handle, void *ptr, int nb) +int xmon_readchar(void) { - char *p = ptr; - int i; - #ifdef CONFIG_BOOTX_TEXT - if (use_screen) { - for (i = 0; i < nb; ++i) - *p++ = xmon_get_adb_key(); - return i; - } + if (use_screen) + return xmon_get_adb_key(); #endif - if (!scc_initialized) - xmon_init_scc(); - for (i = 0; i < nb; ++i) { + if (!scc_initialized) + xmon_init_scc(); while ((*sccc & RXRDY) == 0) - do_poll_adb(); + do_poll_adb(); buf_access(); - *p++ = *sccd; - } - return i; + return *sccd; } -int -xmon_read_poll(void) +int xmon_read_poll(void) { if ((*sccc & RXRDY) == 0) { do_poll_adb(); @@ -395,8 +354,7 @@ static unsigned char scc_inittab[] = { 3, 0xc1, /* rx enable, 8 bits */ }; -void -xmon_init_scc(void) +void xmon_init_scc(void) { if ( _machine == _MACH_chrp ) { @@ -410,6 +368,7 @@ xmon_init_scc(void) else if ( _machine == _MACH_Pmac ) { int i, x; + unsigned long timeout; if (channel_node != 0) pmac_call_feature( @@ -424,8 +383,12 @@ xmon_init_scc(void) PMAC_FTR_MODEM_ENABLE, channel_node, 0, 1); printk(KERN_INFO "Modem powered up by debugger !\n"); - t0 = readtb(); - while (readtb() - t0 < 3*TB_SPEED) + t0 = get_tbl(); + timeout = 3 * tb_ticks_per_sec; + if (timeout == 0) + /* assume 25MHz if tb_ticks_per_sec not set */ + timeout = 75000000; + while (get_tbl() - t0 < timeout) eieio(); } /* use the B channel if requested */ @@ -447,164 +410,19 @@ xmon_init_scc(void) scc_initialized = 1; if (via_modem) { for (;;) { - xmon_write(NULL, "ATE1V1\r", 7); + xmon_write("ATE1V1\r", 7); if (xmon_expect("OK", 5)) { - xmon_write(NULL, "ATA\r", 4); + xmon_write("ATA\r", 4); if (xmon_expect("CONNECT", 40)) break; } - xmon_write(NULL, "+++", 3); + xmon_write("+++", 3); xmon_expect("OK", 3); } } } -void *xmon_stdin; -void *xmon_stdout; -void *xmon_stderr; - -int xmon_putc(int c, void *f) -{ - char ch = c; - - if (c == '\n') - xmon_putc('\r', f); - return xmon_write(f, &ch, 1) == 1? c: -1; -} - -int xmon_putchar(int c) -{ - return xmon_putc(c, xmon_stdout); -} - -int xmon_fputs(char *str, void *f) -{ - int n = strlen(str); - - return xmon_write(f, str, n) == n? 0: -1; -} - -int -xmon_readchar(void) -{ - char ch; - - for (;;) { - switch (xmon_read(xmon_stdin, &ch, 1)) { - case 1: - return ch; - case -1: - xmon_printf("read(stdin) returned -1\r\n", 0, 0); - return -1; - } - } -} - -static char line[256]; -static char *lineptr; -static int lineleft; - -int xmon_expect(const char *str, unsigned int timeout) -{ - int c; - unsigned int t0; - - timeout *= TB_SPEED; - t0 = readtb(); - do { - lineptr = line; - for (;;) { - c = xmon_read_poll(); - if (c == -1) { - if (readtb() - t0 > timeout) - return 0; - continue; - } - if (c == '\n') - break; - if (c != '\r' && lineptr < &line[sizeof(line) - 1]) - *lineptr++ = c; - } - *lineptr = 0; - } while (strstr(line, str) == NULL); - return 1; -} - -int -xmon_getchar(void) -{ - int c; - - if (lineleft == 0) { - lineptr = line; - for (;;) { - c = xmon_readchar(); - if (c == -1 || c == 4) - break; - if (c == '\r' || c == '\n') { - *lineptr++ = '\n'; - xmon_putchar('\n'); - break; - } - switch (c) { - case 0177: - case '\b': - if (lineptr > line) { - xmon_putchar('\b'); - xmon_putchar(' '); - xmon_putchar('\b'); - --lineptr; - } - break; - case 'U' & 0x1F: - while (lineptr > line) { - xmon_putchar('\b'); - xmon_putchar(' '); - xmon_putchar('\b'); - --lineptr; - } - break; - default: - if (lineptr >= &line[sizeof(line) - 1]) - xmon_putchar('\a'); - else { - xmon_putchar(c); - *lineptr++ = c; - } - } - } - lineleft = lineptr - line; - lineptr = line; - } - if (lineleft == 0) - return -1; - --lineleft; - return *lineptr++; -} - -char * -xmon_fgets(char *str, int nb, void *f) -{ - char *p; - int c; - - for (p = str; p < str + nb - 1; ) { - c = xmon_getchar(); - if (c == -1) { - if (p == str) - return NULL; - break; - } - *p++ = c; - if (c == '\n') - break; - } - *p = 0; - return str; -} - -void -xmon_enter(void) +void xmon_enter(void) { #ifdef CONFIG_ADB_PMU if (_machine == _MACH_Pmac) { @@ -613,8 +431,7 @@ xmon_enter(void) #endif } -void -xmon_leave(void) +void xmon_leave(void) { #ifdef CONFIG_ADB_PMU if (_machine == _MACH_Pmac) { diff --git a/arch/powerpc/xmon/start_64.c b/arch/powerpc/xmon/start_64.c index e50c158191e..712552c4f24 100644 --- a/arch/powerpc/xmon/start_64.c +++ b/arch/powerpc/xmon/start_64.c @@ -6,182 +6,29 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ -#include <linux/config.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/sysrq.h> -#include <linux/init.h> #include <asm/machdep.h> -#include <asm/io.h> -#include <asm/page.h> -#include <asm/prom.h> -#include <asm/processor.h> #include <asm/udbg.h> -#include <asm/system.h> #include "nonstdio.h" -#ifdef CONFIG_MAGIC_SYSRQ - -static void sysrq_handle_xmon(int key, struct pt_regs *pt_regs, - struct tty_struct *tty) -{ - /* ensure xmon is enabled */ - xmon_init(1); - debugger(pt_regs); -} - -static struct sysrq_key_op sysrq_xmon_op = +void xmon_map_scc(void) { - .handler = sysrq_handle_xmon, - .help_msg = "Xmon", - .action_msg = "Entering xmon", -}; - -static int __init setup_xmon_sysrq(void) -{ - register_sysrq_key('x', &sysrq_xmon_op); - return 0; } -__initcall(setup_xmon_sysrq); -#endif /* CONFIG_MAGIC_SYSRQ */ -int -xmon_write(void *handle, void *ptr, int nb) +int xmon_write(void *ptr, int nb) { return udbg_write(ptr, nb); } -int -xmon_read(void *handle, void *ptr, int nb) +int xmon_readchar(void) { - return udbg_read(ptr, nb); + if (udbg_getc) + return udbg_getc(); + return -1; } -int -xmon_read_poll(void) +int xmon_read_poll(void) { if (udbg_getc_poll) return udbg_getc_poll(); return -1; } - -FILE *xmon_stdin; -FILE *xmon_stdout; - -int -xmon_putc(int c, void *f) -{ - char ch = c; - - if (c == '\n') - xmon_putc('\r', f); - return xmon_write(f, &ch, 1) == 1? c: -1; -} - -int -xmon_putchar(int c) -{ - return xmon_putc(c, xmon_stdout); -} - -int -xmon_fputs(char *str, void *f) -{ - int n = strlen(str); - - return xmon_write(f, str, n) == n? 0: -1; -} - -int -xmon_readchar(void) -{ - char ch; - - for (;;) { - switch (xmon_read(xmon_stdin, &ch, 1)) { - case 1: - return ch; - case -1: - xmon_printf("read(stdin) returned -1\r\n", 0, 0); - return -1; - } - } -} - -static char line[256]; -static char *lineptr; -static int lineleft; - -int -xmon_getchar(void) -{ - int c; - - if (lineleft == 0) { - lineptr = line; - for (;;) { - c = xmon_readchar(); - if (c == -1 || c == 4) - break; - if (c == '\r' || c == '\n') { - *lineptr++ = '\n'; - xmon_putchar('\n'); - break; - } - switch (c) { - case 0177: - case '\b': - if (lineptr > line) { - xmon_putchar('\b'); - xmon_putchar(' '); - xmon_putchar('\b'); - --lineptr; - } - break; - case 'U' & 0x1F: - while (lineptr > line) { - xmon_putchar('\b'); - xmon_putchar(' '); - xmon_putchar('\b'); - --lineptr; - } - break; - default: - if (lineptr >= &line[sizeof(line) - 1]) - xmon_putchar('\a'); - else { - xmon_putchar(c); - *lineptr++ = c; - } - } - } - lineleft = lineptr - line; - lineptr = line; - } - if (lineleft == 0) - return -1; - --lineleft; - return *lineptr++; -} - -char * -xmon_fgets(char *str, int nb, void *f) -{ - char *p; - int c; - - for (p = str; p < str + nb - 1; ) { - c = xmon_getchar(); - if (c == -1) { - if (p == str) - return NULL; - break; - } - *p++ = c; - if (c == '\n') - break; - } - *p = 0; - return str; -} diff --git a/arch/powerpc/xmon/start_8xx.c b/arch/powerpc/xmon/start_8xx.c index a48bd594cf6..4c17b0486ad 100644 --- a/arch/powerpc/xmon/start_8xx.c +++ b/arch/powerpc/xmon/start_8xx.c @@ -15,273 +15,30 @@ #include <asm/8xx_immap.h> #include <asm/mpc8xx.h> #include <asm/commproc.h> +#include "nonstdio.h" -extern void xmon_printf(const char *fmt, ...); extern int xmon_8xx_write(char *str, int nb); extern int xmon_8xx_read_poll(void); extern int xmon_8xx_read_char(void); -void prom_drawhex(uint); -void prom_drawstring(const char *str); -static int use_screen = 1; /* default */ - -#define TB_SPEED 25000000 - -static inline unsigned int readtb(void) -{ - unsigned int ret; - - asm volatile("mftb %0" : "=r" (ret) :); - return ret; -} - -void buf_access(void) -{ -} - -void -xmon_map_scc(void) +void xmon_map_scc(void) { - cpmp = (cpm8xx_t *)&(((immap_t *)IMAP_ADDR)->im_cpm); - use_screen = 0; - - prom_drawstring("xmon uses serial port\n"); } -static int scc_initialized = 0; - void xmon_init_scc(void); -int -xmon_write(void *handle, void *ptr, int nb) +int xmon_write(void *ptr, int nb) { - char *p = ptr; - int i, c, ct; - - if (!scc_initialized) - xmon_init_scc(); - return(xmon_8xx_write(ptr, nb)); } -int xmon_wants_key; - -int -xmon_read(void *handle, void *ptr, int nb) +int xmon_readchar(void) { - char *p = ptr; - int i; - - if (!scc_initialized) - xmon_init_scc(); - - for (i = 0; i < nb; ++i) { - *p++ = xmon_8xx_read_char(); - } - return i; + return xmon_8xx_read_char(); } -int -xmon_read_poll(void) +int xmon_read_poll(void) { return(xmon_8xx_read_poll()); } - -void -xmon_init_scc() -{ - scc_initialized = 1; -} - -#if 0 -extern int (*prom_entry)(void *); - -int -xmon_exit(void) -{ - struct prom_args { - char *service; - } args; - - for (;;) { - args.service = "exit"; - (*prom_entry)(&args); - } -} -#endif - -void *xmon_stdin; -void *xmon_stdout; -void *xmon_stderr; - -void -xmon_init(void) -{ -} - -int -xmon_putc(int c, void *f) -{ - char ch = c; - - if (c == '\n') - xmon_putc('\r', f); - return xmon_write(f, &ch, 1) == 1? c: -1; -} - -int -xmon_putchar(int c) -{ - return xmon_putc(c, xmon_stdout); -} - -int -xmon_fputs(char *str, void *f) -{ - int n = strlen(str); - - return xmon_write(f, str, n) == n? 0: -1; -} - -int -xmon_readchar(void) -{ - char ch; - - for (;;) { - switch (xmon_read(xmon_stdin, &ch, 1)) { - case 1: - return ch; - case -1: - xmon_printf("read(stdin) returned -1\r\n", 0, 0); - return -1; - } - } -} - -static char line[256]; -static char *lineptr; -static int lineleft; - -#if 0 -int xmon_expect(const char *str, unsigned int timeout) -{ - int c; - unsigned int t0; - - timeout *= TB_SPEED; - t0 = readtb(); - do { - lineptr = line; - for (;;) { - c = xmon_read_poll(); - if (c == -1) { - if (readtb() - t0 > timeout) - return 0; - continue; - } - if (c == '\n') - break; - if (c != '\r' && lineptr < &line[sizeof(line) - 1]) - *lineptr++ = c; - } - *lineptr = 0; - } while (strstr(line, str) == NULL); - return 1; -} -#endif - -int -xmon_getchar(void) -{ - int c; - - if (lineleft == 0) { - lineptr = line; - for (;;) { - c = xmon_readchar(); - if (c == -1 || c == 4) - break; - if (c == '\r' || c == '\n') { - *lineptr++ = '\n'; - xmon_putchar('\n'); - break; - } - switch (c) { - case 0177: - case '\b': - if (lineptr > line) { - xmon_putchar('\b'); - xmon_putchar(' '); - xmon_putchar('\b'); - --lineptr; - } - break; - case 'U' & 0x1F: - while (lineptr > line) { - xmon_putchar('\b'); - xmon_putchar(' '); - xmon_putchar('\b'); - --lineptr; - } - break; - default: - if (lineptr >= &line[sizeof(line) - 1]) - xmon_putchar('\a'); - else { - xmon_putchar(c); - *lineptr++ = c; - } - } - } - lineleft = lineptr - line; - lineptr = line; - } - if (lineleft == 0) - return -1; - --lineleft; - return *lineptr++; -} - -char * -xmon_fgets(char *str, int nb, void *f) -{ - char *p; - int c; - - for (p = str; p < str + nb - 1; ) { - c = xmon_getchar(); - if (c == -1) { - if (p == str) - return 0; - break; - } - *p++ = c; - if (c == '\n') - break; - } - *p = 0; - return str; -} - -void -prom_drawhex(uint val) -{ - unsigned char buf[10]; - - int i; - for (i = 7; i >= 0; i--) - { - buf[i] = "0123456789abcdef"[val & 0x0f]; - val >>= 4; - } - buf[8] = '\0'; - xmon_fputs(buf, xmon_stdout); -} - -void -prom_drawstring(const char *str) -{ - xmon_fputs(str, xmon_stdout); -} diff --git a/arch/powerpc/xmon/subr_prf.c b/arch/powerpc/xmon/subr_prf.c deleted file mode 100644 index b48738c6dd3..00000000000 --- a/arch/powerpc/xmon/subr_prf.c +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Written by Cort Dougan to replace the version originally used - * by Paul Mackerras, which came from NetBSD and thus had copyright - * conflicts with Linux. - * - * This file makes liberal use of the standard linux utility - * routines to reduce the size of the binary. We assume we can - * trust some parts of Linux inside the debugger. - * -- Cort (cort@cs.nmt.edu) - * - * Copyright (C) 1999 Cort Dougan. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/module.h> -#include <stdarg.h> -#include "nonstdio.h" - -extern int xmon_write(void *, void *, int); - -void xmon_vfprintf(void *f, const char *fmt, va_list ap) -{ - static char xmon_buf[2048]; - int n; - - n = vsprintf(xmon_buf, fmt, ap); - xmon_write(f, xmon_buf, n); -} - -void xmon_printf(const char *fmt, ...) -{ - va_list ap; - - va_start(ap, fmt); - xmon_vfprintf(stdout, fmt, ap); - va_end(ap); -} -EXPORT_SYMBOL(xmon_printf); - -void xmon_fprintf(void *f, const char *fmt, ...) -{ - va_list ap; - - va_start(ap, fmt); - xmon_vfprintf(f, fmt, ap); - va_end(ap); -} - diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 1124f114620..c45a6ad5f3b 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -1,7 +1,7 @@ /* * Routines providing a simple monitor for use on the PowerMac. * - * Copyright (C) 1996 Paul Mackerras. + * Copyright (C) 1996-2005 Paul Mackerras. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -18,6 +18,8 @@ #include <linux/kallsyms.h> #include <linux/cpumask.h> #include <linux/module.h> +#include <linux/sysrq.h> +#include <linux/interrupt.h> #include <asm/ptrace.h> #include <asm/string.h> @@ -144,15 +146,10 @@ static void xmon_print_symbol(unsigned long address, const char *mid, static const char *getvecname(unsigned long vec); extern int print_insn_powerpc(unsigned long, unsigned long, int); -extern void printf(const char *fmt, ...); -extern void xmon_vfprintf(void *f, const char *fmt, va_list ap); -extern int xmon_putc(int c, void *f); -extern int putchar(int ch); extern void xmon_enter(void); extern void xmon_leave(void); -extern int xmon_read_poll(void); extern long setjmp(long *); extern void longjmp(long *, long); extern void xmon_save_regs(struct pt_regs *); @@ -748,7 +745,6 @@ cmds(struct pt_regs *excp) printf("%x:", smp_processor_id()); #endif /* CONFIG_SMP */ printf("mon> "); - fflush(stdout); flush_input(); termch = 0; cmd = skipbl(); @@ -1472,17 +1468,23 @@ read_spr(int n) { unsigned int instrs[2]; unsigned long (*code)(void); - unsigned long opd[3]; unsigned long ret = -1UL; +#ifdef CONFIG_PPC64 + unsigned long opd[3]; - instrs[0] = 0x7c6002a6 + ((n & 0x1F) << 16) + ((n & 0x3e0) << 6); - instrs[1] = 0x4e800020; opd[0] = (unsigned long)instrs; opd[1] = 0; opd[2] = 0; + code = (unsigned long (*)(void)) opd; +#else + code = (unsigned long (*)(void)) instrs; +#endif + + /* mfspr r3,n; blr */ + instrs[0] = 0x7c6002a6 + ((n & 0x1F) << 16) + ((n & 0x3e0) << 6); + instrs[1] = 0x4e800020; store_inst(instrs); store_inst(instrs+1); - code = (unsigned long (*)(void)) opd; if (setjmp(bus_error_jmp) == 0) { catch_memory_errors = 1; @@ -1504,16 +1506,21 @@ write_spr(int n, unsigned long val) { unsigned int instrs[2]; unsigned long (*code)(unsigned long); +#ifdef CONFIG_PPC64 unsigned long opd[3]; - instrs[0] = 0x7c6003a6 + ((n & 0x1F) << 16) + ((n & 0x3e0) << 6); - instrs[1] = 0x4e800020; opd[0] = (unsigned long)instrs; opd[1] = 0; opd[2] = 0; + code = (unsigned long (*)(unsigned long)) opd; +#else + code = (unsigned long (*)(unsigned long)) instrs; +#endif + + instrs[0] = 0x7c6003a6 + ((n & 0x1F) << 16) + ((n & 0x3e0) << 6); + instrs[1] = 0x4e800020; store_inst(instrs); store_inst(instrs+1); - code = (unsigned long (*)(unsigned long)) opd; if (setjmp(bus_error_jmp) == 0) { catch_memory_errors = 1; @@ -1797,7 +1804,7 @@ memex(void) for(;;){ if (!mnoread) n = mread(adrs, val, size); - printf("%.16x%c", adrs, brev? 'r': ' '); + printf(REG"%c", adrs, brev? 'r': ' '); if (!mnoread) { if (brev) byterev(val, size); @@ -1976,17 +1983,18 @@ prdump(unsigned long adrs, long ndump) nr = mread(adrs, temp, r); adrs += nr; for (m = 0; m < r; ++m) { - if ((m & 7) == 0 && m > 0) - putchar(' '); + if ((m & (sizeof(long) - 1)) == 0 && m > 0) + putchar(' '); if (m < nr) printf("%.2x", temp[m]); else printf("%s", fault_chars[fault_type]); } - if (m <= 8) - printf(" "); - for (; m < 16; ++m) + for (; m < 16; ++m) { + if ((m & (sizeof(long) - 1)) == 0) + putchar(' '); printf(" "); + } printf(" |"); for (m = 0; m < r; ++m) { if (m < nr) { @@ -2151,7 +2159,6 @@ memzcan(void) ok = mread(a, &v, 1); if (ok && !ook) { printf("%.8x .. ", a); - fflush(stdout); } else if (!ok && ook) printf("%.8x\n", a - mskip); ook = ok; @@ -2372,7 +2379,7 @@ int inchar(void) { if (lineptr == NULL || *lineptr == 0) { - if (fgets(line, sizeof(line), stdin) == NULL) { + if (xmon_gets(line, sizeof(line)) == NULL) { lineptr = NULL; return EOF; } @@ -2526,4 +2533,29 @@ void xmon_init(int enable) __debugger_dabr_match = NULL; __debugger_fault_handler = NULL; } + xmon_map_scc(); +} + +#ifdef CONFIG_MAGIC_SYSRQ +static void sysrq_handle_xmon(int key, struct pt_regs *pt_regs, + struct tty_struct *tty) +{ + /* ensure xmon is enabled */ + xmon_init(1); + debugger(pt_regs); +} + +static struct sysrq_key_op sysrq_xmon_op = +{ + .handler = sysrq_handle_xmon, + .help_msg = "Xmon", + .action_msg = "Entering xmon", +}; + +static int __init setup_xmon_sysrq(void) +{ + register_sysrq_key('x', &sysrq_xmon_op); + return 0; } +__initcall(setup_xmon_sysrq); +#endif /* CONFIG_MAGIC_SYSRQ */ |