diff options
289 files changed, 8317 insertions, 2917 deletions
@@ -32,6 +32,7 @@ Christoph Hellwig <hch@lst.de> Corey Minyard <minyard@acm.org> David Brownell <david-b@pacbell.net> David Woodhouse <dwmw2@shinybook.infradead.org> +Dmitry Eremin-Solenikov <dbaryshkov@gmail.com> Domen Puncer <domen@coderock.org> Douglas Gilbert <dougg@torque.net> Ed L. Cashin <ecashin@coraid.com> diff --git a/Documentation/ABI/testing/sysfs-devices-memory b/Documentation/ABI/testing/sysfs-devices-memory index 7a16fe1e227..9fe91c02ee4 100644 --- a/Documentation/ABI/testing/sysfs-devices-memory +++ b/Documentation/ABI/testing/sysfs-devices-memory @@ -6,7 +6,6 @@ Description: internal state of the kernel memory blocks. Files could be added or removed dynamically to represent hot-add/remove operations. - Users: hotplug memory add/remove tools https://w3.opensource.ibm.com/projects/powerpc-utils/ @@ -19,6 +18,56 @@ Description: This is useful for a user-level agent to determine identify removable sections of the memory before attempting potentially expensive hot-remove memory operation +Users: hotplug memory remove tools + https://w3.opensource.ibm.com/projects/powerpc-utils/ + +What: /sys/devices/system/memory/memoryX/phys_device +Date: September 2008 +Contact: Badari Pulavarty <pbadari@us.ibm.com> +Description: + The file /sys/devices/system/memory/memoryX/phys_device + is read-only and is designed to show the name of physical + memory device. Implementation is currently incomplete. +What: /sys/devices/system/memory/memoryX/phys_index +Date: September 2008 +Contact: Badari Pulavarty <pbadari@us.ibm.com> +Description: + The file /sys/devices/system/memory/memoryX/phys_index + is read-only and contains the section ID in hexadecimal + which is equivalent to decimal X contained in the + memory section directory name. + +What: /sys/devices/system/memory/memoryX/state +Date: September 2008 +Contact: Badari Pulavarty <pbadari@us.ibm.com> +Description: + The file /sys/devices/system/memory/memoryX/state + is read-write. 
When read, its contents show the + online/offline state of the memory section. When written, + root can toggle the online/offline state of a removable + memory section (see removable file description above) + using the following commands. + # echo online > /sys/devices/system/memory/memoryX/state + # echo offline > /sys/devices/system/memory/memoryX/state + + For example, if /sys/devices/system/memory/memory22/removable + contains a value of 1 and + /sys/devices/system/memory/memory22/state contains the + string "online" the following command can be executed + by root to offline that section. + # echo offline > /sys/devices/system/memory/memory22/state Users: hotplug memory remove tools https://w3.opensource.ibm.com/projects/powerpc-utils/ + +What: /sys/devices/system/node/nodeX/memoryY +Date: September 2008 +Contact: Gary Hade <garyhade@us.ibm.com> +Description: + When CONFIG_NUMA is enabled + /sys/devices/system/node/nodeX/memoryY is a symbolic link that + points to the corresponding /sys/devices/system/memory/memoryY + memory section directory. For example, the following symbolic + link is created for memory section 9 on node0. + /sys/devices/system/node/node0/memory9 -> ../../memory/memory9 + diff --git a/Documentation/DMA-mapping.txt b/Documentation/DMA-mapping.txt index c74fec8c235..b2a4d6d244d 100644 --- a/Documentation/DMA-mapping.txt +++ b/Documentation/DMA-mapping.txt @@ -26,7 +26,7 @@ mapped only for the time they are actually used and unmapped after the DMA transfer. The following API will work of course even on platforms where no such -hardware exists, see e.g. include/asm-i386/pci.h for how it is implemented on +hardware exists, see e.g. arch/x86/include/asm/pci.h for how it is implemented on top of the virt_to_bus interface. 
First of all, you should make sure diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index ccec5539438..cfbfa15a46b 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -397,7 +397,7 @@ prototypes: }; locking rules: - All except ->poll() may block. + All may block. BKL llseek: no (see below) read: no diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 71df353e367..32e94635484 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1385,6 +1385,15 @@ swapcache reclaim. Decreasing vfs_cache_pressure causes the kernel to prefer to retain dentry and inode caches. Increasing vfs_cache_pressure beyond 100 causes the kernel to prefer to reclaim dentries and inodes. +dirty_background_bytes +---------------------- + +Contains the amount of dirty memory at which the pdflush background writeback +daemon will start writeback. + +If dirty_background_bytes is written, dirty_background_ratio becomes a function +of its value (dirty_background_bytes / the amount of dirtyable system memory). + dirty_background_ratio ---------------------- @@ -1393,14 +1402,29 @@ pages + file cache, not including locked pages and HugePages), the number of pages at which the pdflush background writeback daemon will start writing out dirty data. +If dirty_background_ratio is written, dirty_background_bytes becomes a function +of its value (dirty_background_ratio * the amount of dirtyable system memory). + +dirty_bytes +----------- + +Contains the amount of dirty memory at which a process generating disk writes +will itself start writeback. + +If dirty_bytes is written, dirty_ratio becomes a function of its value +(dirty_bytes / the amount of dirtyable system memory). 
+ dirty_ratio ------------------ +----------- Contains, as a percentage of the dirtyable system memory (free pages + mapped pages + file cache, not including locked pages and HugePages), the number of pages at which a process which is generating disk writes will itself start writing out dirty data. +If dirty_ratio is written, dirty_bytes becomes a function of its value +(dirty_ratio * the amount of dirtyable system memory). + dirty_writeback_centisecs ------------------------- diff --git a/Documentation/hwmon/adt7470 b/Documentation/hwmon/adt7470 index 75d13ca147c..8ce4aa0a0f5 100644 --- a/Documentation/hwmon/adt7470 +++ b/Documentation/hwmon/adt7470 @@ -31,15 +31,11 @@ Each of the measured inputs (temperature, fan speed) has corresponding high/low limit values. The ADT7470 will signal an ALARM if any measured value exceeds either limit. -The ADT7470 DOES NOT sample all inputs continuously. A single pin on the -ADT7470 is connected to a multitude of thermal diodes, but the chip must be -instructed explicitly to read the multitude of diodes. If you want to use -automatic fan control mode, you must manually read any of the temperature -sensors or the fan control algorithm will not run. The chip WILL NOT DO THIS -AUTOMATICALLY; this must be done from userspace. This may be a bug in the chip -design, given that many other AD chips take care of this. The driver will not -read the registers more often than once every 5 seconds. Further, -configuration data is only read once per minute. +The ADT7470 samples all inputs continuously. A kernel thread is started up for +the purpose of periodically querying the temperature sensors, thus allowing the +automatic fan pwm control to set the fan speed. The driver will not read the +registers more often than once every 5 seconds. Further, configuration data is +only read once per minute. Special Features ---------------- @@ -72,5 +68,6 @@ pwm#_auto_point2_temp. 
Notes ----- -As stated above, the temperature inputs must be read periodically from -userspace in order for the automatic pwm algorithm to run. +The temperature inputs no longer need to be read periodically from userspace in +order for the automatic pwm algorithm to run. This was the case for earlier +versions of the driver. diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index 82469917443..f1d63990332 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt @@ -84,7 +84,7 @@ Code Seq# Include File Comments 'B' C0-FF advanced bbus <mailto:maassen@uni-freiburg.de> 'C' all linux/soundcard.h -'D' all asm-s390/dasd.h +'D' all arch/s390/include/asm/dasd.h 'E' all linux/input.h 'F' all linux/fb.h 'H' all linux/hiddev.h @@ -105,7 +105,7 @@ Code Seq# Include File Comments 'S' 80-81 scsi/scsi_ioctl.h conflict! 'S' 82-FF scsi/scsi.h conflict! 'T' all linux/soundcard.h conflict! -'T' all asm-i386/ioctls.h conflict! +'T' all arch/x86/include/asm/ioctls.h conflict! 'U' 00-EF linux/drivers/usb/usb.h 'V' all linux/vt.h 'W' 00-1F linux/watchdog.h conflict! @@ -120,7 +120,7 @@ Code Seq# Include File Comments <mailto:natalia@nikhefk.nikhef.nl> 'c' 00-7F linux/comstats.h conflict! 'c' 00-7F linux/coda.h conflict! -'c' 80-9F asm-s390/chsc.h +'c' 80-9F arch/s390/include/asm/chsc.h 'd' 00-FF linux/char/drm/drm/h conflict! 'd' 00-DF linux/video_decoder.h conflict! 
'd' F0-FF linux/digi1.h @@ -170,7 +170,7 @@ Code Seq# Include File Comments <mailto:oe@port.de> 0x80 00-1F linux/fb.h 0x81 00-1F linux/videotext.h -0x89 00-06 asm-i386/sockios.h +0x89 00-06 arch/x86/include/asm/sockios.h 0x89 0B-DF linux/sockios.h 0x89 E0-EF linux/sockios.h SIOCPROTOPRIVATE range 0x89 F0-FF linux/sockios.h SIOCDEVPRIVATE range diff --git a/Documentation/kernel-doc-nano-HOWTO.txt b/Documentation/kernel-doc-nano-HOWTO.txt index c6841eee959..d73fbd2b2b4 100644 --- a/Documentation/kernel-doc-nano-HOWTO.txt +++ b/Documentation/kernel-doc-nano-HOWTO.txt @@ -71,6 +71,11 @@ The @argument descriptions must begin on the very next line following this opening short function description line, with no intervening empty comment lines. +If a function parameter is "..." (varargs), it should be listed in +kernel-doc notation as: + * @...: description + + Example kernel-doc data structure comment. /** @@ -282,6 +287,32 @@ struct my_struct { }; +Including documentation blocks in source files +---------------------------------------------- + +To facilitate having source code and comments close together, you can +include kernel-doc documentation blocks that are free-form comments +instead of being kernel-doc for functions, structures, unions, +enums, or typedefs. This could be used for something like a +theory of operation for a driver or library code, for example. + +This is done by using a DOC: section keyword with a section title. E.g.: + +/** + * DOC: Theory of Operation + * + * The whizbang foobar is a dilly of a gizmo. It can do whatever you + * want it to do, at any time. It reads your mind. Here's how it works. + * + * foo bar splat + * + * The only drawback to this gizmo is that it can sometimes damage + * hardware, software, or its subject(s). + */ + +DOC: sections are used in SGML template files as indicated below. + + How to make new SGML template files ----------------------------------- @@ -302,6 +333,9 @@ exported using EXPORT_SYMBOL. 
!F<filename> <function [functions...]> is replaced by the documentation, in <filename>, for the functions listed. +!P<filename> <section title> is replaced by the contents of the DOC: +section titled <section title> from <filename>. +Spaces are allowed in <section title>; do not quote the <section title>. Tim. */ <twaugh@redhat.com> diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index a2d8805c03d..0b3f6711d2f 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -469,8 +469,8 @@ and is between 256 and 4096 characters. It is defined in the file clearcpuid=BITNUM [X86] Disable CPUID feature X for the kernel. See - include/asm-x86/cpufeature.h for the valid bit numbers. - Note the Linux specific bits are not necessarily + arch/x86/include/asm/cpufeature.h for the valid bit + numbers. Note the Linux specific bits are not necessarily stable over kernel options, but the vendor specific ones should be. Also note that user programs calling CPUID directly @@ -551,6 +551,11 @@ and is between 256 and 4096 characters. It is defined in the file not work reliably with all consoles, but is known to work with serial and VGA consoles. + coredump_filter= + [KNL] Change the default value for + /proc/<pid>/coredump_filter. + See also Documentation/filesystems/proc.txt. + cpcihp_generic= [HW,PCI] Generic port I/O CompactPCI driver Format: <first_slot>,<last_slot>,<port>,<enum_bit>[,<debug>] @@ -1117,6 +1122,8 @@ and is between 256 and 4096 characters. It is defined in the file If there are multiple matching configurations changing the same attribute, the last one is used. + lmb=debug [KNL] Enable lmb debug messages. + load_ramdisk= [RAM] List of ramdisks to load from floppy See Documentation/blockdev/ramdisk.txt. @@ -1569,6 +1576,10 @@ and is between 256 and 4096 characters. It is defined in the file nr_uarts= [SERIAL] maximum number of UARTs to be registered. 
+ ohci1394_dma=early [HW] enable debugging via the ohci1394 driver. + See Documentation/debugging-via-ohci1394.txt for more + info. + olpc_ec_timeout= [OLPC] ms delay when issuing EC commands Rather than timing out after 20 ms if an EC command is not properly ACKed, override the length @@ -1793,10 +1804,10 @@ and is between 256 and 4096 characters. It is defined in the file autoconfiguration. Ranges are in pairs (memory base and size). - dynamic_printk - Enables pr_debug()/dev_dbg() calls if - CONFIG_DYNAMIC_PRINTK_DEBUG has been enabled. These can also - be switched on/off via <debugfs>/dynamic_printk/modules + dynamic_printk Enables pr_debug()/dev_dbg() calls if + CONFIG_DYNAMIC_PRINTK_DEBUG has been enabled. + These can also be switched on/off via + <debugfs>/dynamic_printk/modules print-fatal-signals= [KNL] debug: print fatal signals @@ -1884,7 +1895,7 @@ and is between 256 and 4096 characters. It is defined in the file reboot= [BUGS=X86-32,BUGS=ARM,BUGS=IA-64] Rebooting mode Format: <reboot_mode>[,<reboot_mode2>[,...]] - See arch/*/kernel/reboot.c or arch/*/kernel/process.c + See arch/*/kernel/reboot.c or arch/*/kernel/process.c relax_domain_level= [KNL, SMP] Set scheduler's default relax_domain_level. @@ -2432,8 +2443,8 @@ and is between 256 and 4096 characters. It is defined in the file Format: <irq>,<irq_mask>,<io>,<full_duplex>,<do_sound>,<lockup_hack>[,<irq2>[,<irq3>[,<irq4>]]] - norandmaps Don't use address space randomization - Equivalent to echo 0 > /proc/sys/kernel/randomize_va_space + norandmaps Don't use address space randomization. Equivalent to + echo 0 > /proc/sys/kernel/randomize_va_space ______________________________________________________________________ diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt index a79633d702b..48b3de90eb1 100644 --- a/Documentation/kprobes.txt +++ b/Documentation/kprobes.txt @@ -497,7 +497,10 @@ The first column provides the kernel address where the probe is inserted. 
The second column identifies the type of probe (k - kprobe, r - kretprobe and j - jprobe), while the third column specifies the symbol+offset of the probe. If the probed function belongs to a module, the module name -is also specified. +is also specified. Following columns show probe status. If the probe is on +a virtual address that is no longer valid (module init sections, module +virtual addresses that correspond to modules that've been unloaded), +such probes are marked with [GONE]. /debug/kprobes/enabled: Turn kprobes ON/OFF diff --git a/Documentation/magic-number.txt b/Documentation/magic-number.txt index 95070028d15..505f1960754 100644 --- a/Documentation/magic-number.txt +++ b/Documentation/magic-number.txt @@ -125,14 +125,14 @@ TRIDENT_CARD_MAGIC 0x5072696E trident_card sound/oss/trident.c ROUTER_MAGIC 0x524d4157 wan_device include/linux/wanrouter.h SCC_MAGIC 0x52696368 gs_port drivers/char/scc.h SAVEKMSG_MAGIC1 0x53415645 savekmsg arch/*/amiga/config.c -GDA_MAGIC 0x58464552 gda include/asm-mips64/sn/gda.h +GDA_MAGIC 0x58464552 gda arch/mips/include/asm/sn/gda.h RED_MAGIC1 0x5a2cf071 (any) mm/slab.c STL_PORTMAGIC 0x5a7182c9 stlport include/linux/stallion.h EEPROM_MAGIC_VALUE 0x5ab478d2 lanai_dev drivers/atm/lanai.c HDLCDRV_MAGIC 0x5ac6e778 hdlcdrv_state include/linux/hdlcdrv.h EPCA_MAGIC 0x5c6df104 channel include/linux/epca.h PCXX_MAGIC 0x5c6df104 channel drivers/char/pcxx.h -KV_MAGIC 0x5f4b565f kernel_vars_s include/asm-mips64/sn/klkernvars.h +KV_MAGIC 0x5f4b565f kernel_vars_s arch/mips/include/asm/sn/klkernvars.h I810_STATE_MAGIC 0x63657373 i810_state sound/oss/i810_audio.c TRIDENT_STATE_MAGIC 0x63657373 trient_state sound/oss/trident.c M3_CARD_MAGIC 0x646e6f50 m3_card sound/oss/maestro3.c @@ -158,7 +158,7 @@ CCB_MAGIC 0xf2691ad2 ccb drivers/scsi/ncr53c8xx.c QUEUE_MAGIC_FREE 0xf7e1c9a3 queue_entry drivers/scsi/arm/queue.c QUEUE_MAGIC_USED 0xf7e1cc33 queue_entry drivers/scsi/arm/queue.c HTB_CMAGIC 0xFEFAFEF1 htb_class net/sched/sch_htb.c -NMI_MAGIC 
0x48414d4d455201 nmi_s include/asm-mips64/sn/nmi.h +NMI_MAGIC 0x48414d4d455201 nmi_s arch/mips/include/asm/sn/nmi.h Note that there are also defined special per-driver magic numbers in sound memory management. See include/sound/sndmagic.h for complete list of them. Many diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt index 168117bd6ee..4c2ecf537a4 100644 --- a/Documentation/memory-hotplug.txt +++ b/Documentation/memory-hotplug.txt @@ -124,7 +124,7 @@ config options. This option can be kernel module too. -------------------------------- -3 sysfs files for memory hotplug +4 sysfs files for memory hotplug -------------------------------- All sections have their device information under /sys/devices/system/memory as @@ -138,11 +138,12 @@ For example, assume 1GiB section size. A device for a memory starting at (0x100000000 / 1Gib = 4) This device covers address range [0x100000000 ... 0x140000000) -Under each section, you can see 3 files. +Under each section, you can see 4 files. /sys/devices/system/memory/memoryXXX/phys_index /sys/devices/system/memory/memoryXXX/phys_device /sys/devices/system/memory/memoryXXX/state +/sys/devices/system/memory/memoryXXX/removable 'phys_index' : read-only and contains section id, same as XXX. 'state' : read-write @@ -150,10 +151,20 @@ Under each section, you can see 3 files. at write: user can specify "online", "offline" command 'phys_device': read-only: designed to show the name of physical memory device. This is not well implemented now. +'removable' : read-only: contains an integer value indicating + whether the memory section is removable or not + removable. A value of 1 indicates that the memory + section is removable and a value of 0 indicates that + it is not removable. NOTE: These directories/files appear after physical memory hotplug phase. 
+If CONFIG_NUMA is enabled the +/sys/devices/system/memory/memoryXXX memory section +directories can also be accessed via symbolic links located in +the /sys/devices/system/node/node* directories. For example: +/sys/devices/system/node/node0/memory9 -> ../../memory/memory9 -------------------------------- 4. Physical memory hot-add phase @@ -365,7 +376,6 @@ node if necessary. - allowing memory hot-add to ZONE_MOVABLE. maybe we need some switch like sysctl or new control file. - showing memory section and physical device relationship. - - showing memory section and node relationship (maybe good for NUMA) - showing memory section is under ZONE_MOVABLE or not - test and make it better memory offlining. - support HugeTLB page migration and offlining. diff --git a/Documentation/mips/AU1xxx_IDE.README b/Documentation/mips/AU1xxx_IDE.README index 25a6ed1aaa5..f54962aea84 100644 --- a/Documentation/mips/AU1xxx_IDE.README +++ b/Documentation/mips/AU1xxx_IDE.README @@ -44,7 +44,7 @@ FILES, CONFIGS AND COMPATABILITY Two files are introduced: - a) 'include/asm-mips/mach-au1x00/au1xxx_ide.h' + a) 'arch/mips/include/asm/mach-au1x00/au1xxx_ide.h' containes : struct _auide_hwif timing parameters for PIO mode 0/1/2/3/4 timing parameters for MWDMA 0/1/2 diff --git a/Documentation/powerpc/cpu_features.txt b/Documentation/powerpc/cpu_features.txt index 472739880e8..ffa4183fdb8 100644 --- a/Documentation/powerpc/cpu_features.txt +++ b/Documentation/powerpc/cpu_features.txt @@ -31,7 +31,7 @@ anyways). After detecting the processor type, the kernel patches out sections of code that shouldn't be used by writing nop's over it. 
Using cpufeatures requires -just 2 macros (found in include/asm-ppc/cputable.h), as seen in head.S +just 2 macros (found in arch/powerpc/include/asm/cputable.h), as seen in head.S transfer_to_handler: #ifdef CONFIG_ALTIVEC diff --git a/Documentation/s390/Debugging390.txt b/Documentation/s390/Debugging390.txt index d30a281c570..10711d9f078 100644 --- a/Documentation/s390/Debugging390.txt +++ b/Documentation/s390/Debugging390.txt @@ -1402,7 +1402,7 @@ Syscalls are implemented on Linux for S390 by the Supervisor call instruction (S possibilities of these as the instruction is made up of a 0xA opcode & the second byte being the syscall number. They are traced using the simple command. TR SVC <Optional value or range> -the syscalls are defined in linux/include/asm-s390/unistd.h +the syscalls are defined in linux/arch/s390/include/asm/unistd.h e.g. to trace all file opens just do TR SVC 5 ( as this is the syscall number of open ) diff --git a/Documentation/s390/cds.txt b/Documentation/s390/cds.txt index c4b7b2bd369..480a78ef5a1 100644 --- a/Documentation/s390/cds.txt +++ b/Documentation/s390/cds.txt @@ -98,7 +98,7 @@ platform. Some of the interface routines are specific to Linux/390 and some of them can be found on other Linux platforms implementations too. Miscellaneous function prototypes, data declarations, and macro definitions can be found in the architecture specific C header file -linux/include/asm-s390/irq.h. +linux/arch/s390/include/asm/irq.h. 
Overview of CDS interface concepts diff --git a/Documentation/s390/s390dbf.txt b/Documentation/s390/s390dbf.txt index e0542097369..2d10053dd97 100644 --- a/Documentation/s390/s390dbf.txt +++ b/Documentation/s390/s390dbf.txt @@ -2,7 +2,7 @@ S390 Debug Feature ================== files: arch/s390/kernel/debug.c - include/asm-s390/debug.h + arch/s390/include/asm/debug.h Description: ------------ diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index d79eeda7a69..cd05994a49e 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -41,7 +41,8 @@ Currently, these files are in /proc/sys/vm: ============================================================== -dirty_ratio, dirty_background_ratio, dirty_expire_centisecs, +dirty_bytes, dirty_ratio, dirty_background_bytes, +dirty_background_ratio, dirty_expire_centisecs, dirty_writeback_centisecs, highmem_is_dirtyable, vfs_cache_pressure, laptop_mode, block_dump, swap_token_timeout, drop-caches, hugepages_treat_as_movable: diff --git a/Documentation/vm/unevictable-lru.txt b/Documentation/vm/unevictable-lru.txt index 125eed560e5..0706a7282a8 100644 --- a/Documentation/vm/unevictable-lru.txt +++ b/Documentation/vm/unevictable-lru.txt @@ -137,13 +137,6 @@ shrink_page_list() where they will be detected when vmscan walks the reverse map in try_to_unmap(). If try_to_unmap() returns SWAP_MLOCK, shrink_page_list() will cull the page at that point. -Note that for anonymous pages, shrink_page_list() attempts to add the page to -the swap cache before it tries to unmap the page. To avoid this unnecessary -consumption of swap space, shrink_page_list() calls try_to_munlock() to check -whether any VM_LOCKED vmas map the page without attempting to unmap the page. -If try_to_munlock() returns SWAP_MLOCK, shrink_page_list() will cull the page -without consuming swap space. try_to_munlock() will be described below. 
- To "cull" an unevictable page, vmscan simply puts the page back on the lru list using putback_lru_page()--the inverse operation to isolate_lru_page()-- after dropping the page lock. Because the condition which makes the page @@ -190,8 +183,8 @@ several places: in the VM_LOCKED flag being set for the vma. 3) in the fault path, if mlocked pages are "culled" in the fault path, and when a VM_LOCKED stack segment is expanded. -4) as mentioned above, in vmscan:shrink_page_list() with attempting to - reclaim a page in a VM_LOCKED vma--via try_to_unmap() or try_to_munlock(). +4) as mentioned above, in vmscan:shrink_page_list() when attempting to + reclaim a page in a VM_LOCKED vma via try_to_unmap(). Mlocked pages become unlocked and rescued from the unevictable list when: @@ -260,9 +253,9 @@ mlock_fixup() filters several classes of "special" vmas: 2) vmas mapping hugetlbfs page are already effectively pinned into memory. We don't need nor want to mlock() these pages. However, to preserve the - prior behavior of mlock()--before the unevictable/mlock changes--mlock_fixup() - will call make_pages_present() in the hugetlbfs vma range to allocate the - huge pages and populate the ptes. + prior behavior of mlock()--before the unevictable/mlock changes-- + mlock_fixup() will call make_pages_present() in the hugetlbfs vma range + to allocate the huge pages and populate the ptes. 3) vmas with VM_DONTEXPAND|VM_RESERVED are generally user space mappings of kernel pages, such as the vdso page, relay channel pages, etc. These pages @@ -322,7 +315,7 @@ __mlock_vma_pages_range()--the same function used to mlock a vma range-- passing a flag to indicate that munlock() is being performed. Because the vma access protections could have been changed to PROT_NONE after -faulting in and mlocking some pages, get_user_pages() was unreliable for visiting +faulting in and mlocking pages, get_user_pages() was unreliable for visiting these pages for munlocking. 
Because we don't want to leave pages mlocked(), get_user_pages() was enhanced to accept a flag to ignore the permissions when fetching the pages--all of which should be resident as a result of previous @@ -416,8 +409,8 @@ Mlocked Pages: munmap()/exit()/exec() System Call Handling When unmapping an mlocked region of memory, whether by an explicit call to munmap() or via an internal unmap from exit() or exec() processing, we must munlock the pages if we're removing the last VM_LOCKED vma that maps the pages. -Before the unevictable/mlock changes, mlocking did not mark the pages in any way, -so unmapping them required no processing. +Before the unevictable/mlock changes, mlocking did not mark the pages in any +way, so unmapping them required no processing. To munlock a range of memory under the unevictable/mlock infrastructure, the munmap() hander and task address space tear down function call @@ -517,12 +510,10 @@ couldn't be mlocked. Mlocked pages: try_to_munlock() Reverse Map Scan TODO/FIXME: a better name might be page_mlocked()--analogous to the -page_referenced() reverse map walker--especially if we continue to call this -from shrink_page_list(). See related TODO/FIXME below. +page_referenced() reverse map walker. -When munlock_vma_page()--see "Mlocked Pages: munlock()/munlockall() System -Call Handling" above--tries to munlock a page, or when shrink_page_list() -encounters an anonymous page that is not yet in the swap cache, they need to +When munlock_vma_page()--see "Mlocked Pages: munlock()/munlockall() +System Call Handling" above--tries to munlock a page, it needs to determine whether or not the page is mapped by any VM_LOCKED vma, without actually attempting to unmap all ptes from the page. For this purpose, the unevictable/mlock infrastructure introduced a variant of try_to_unmap() called @@ -535,10 +526,7 @@ for VM_LOCKED vmas. 
When such a vma is found for anonymous pages and file pages mapped in linear VMAs, as in the try_to_unmap() case, the functions attempt to acquire the associated mmap semphore, mlock the page via mlock_vma_page() and return SWAP_MLOCK. This effectively undoes the -pre-clearing of the page's PG_mlocked done by munlock_vma_page() and informs -shrink_page_list() that the anonymous page should be culled rather than added -to the swap cache in preparation for a try_to_unmap() that will almost -certainly fail. +pre-clearing of the page's PG_mlocked done by munlock_vma_page. If try_to_unmap() is unable to acquire a VM_LOCKED vma's associated mmap semaphore, it will return SWAP_AGAIN. This will allow shrink_page_list() @@ -557,10 +545,7 @@ However, the scan can terminate when it encounters a VM_LOCKED vma and can successfully acquire the vma's mmap semphore for read and mlock the page. Although try_to_munlock() can be called many [very many!] times when munlock()ing a large region or tearing down a large address space that has been -mlocked via mlockall(), overall this is a fairly rare event. In addition, -although shrink_page_list() calls try_to_munlock() for every anonymous page that -it handles that is not yet in the swap cache, on average anonymous pages will -have very short reverse map lists. +mlocked via mlockall(), overall this is a fairly rare event. Mlocked Page: Page Reclaim in shrink_*_list() @@ -588,8 +573,8 @@ Some examples of these unevictable pages on the LRU lists are: munlock_vma_page() was forced to let the page back on to the normal LRU list for vmscan to handle. -shrink_inactive_list() also culls any unevictable pages that it finds -on the inactive lists, again diverting them to the appropriate zone's unevictable +shrink_inactive_list() also culls any unevictable pages that it finds on +the inactive lists, again diverting them to the appropriate zone's unevictable lru list. 
shrink_inactive_list() should only see SHM_LOCKed pages that became SHM_LOCKed after shrink_active_list() had moved them to the inactive list, or pages mapped into VM_LOCKED vmas that munlock_vma_page() couldn't isolate from @@ -597,19 +582,7 @@ the lru to recheck via try_to_munlock(). shrink_inactive_list() won't notice the latter, but will pass on to shrink_page_list(). shrink_page_list() again culls obviously unevictable pages that it could -encounter for similar reason to shrink_inactive_list(). As already discussed, -shrink_page_list() proactively looks for anonymous pages that should have -PG_mlocked set but don't--these would not be detected by page_evictable()--to -avoid adding them to the swap cache unnecessarily. File pages mapped into +encounter for similar reason to shrink_inactive_list(). Pages mapped into VM_LOCKED vmas but without PG_mlocked set will make it all the way to -try_to_unmap(). shrink_page_list() will divert them to the unevictable list when -try_to_unmap() returns SWAP_MLOCK, as discussed above. - -TODO/FIXME: If we can enhance the swap cache to reliably remove entries -with page_count(page) > 2, as long as all ptes are mapped to the page and -not the swap entry, we can probably remove the call to try_to_munlock() in -shrink_page_list() and just remove the page from the swap cache when -try_to_unmap() returns SWAP_MLOCK. Currently, remove_exclusive_swap_page() -doesn't seem to allow that. - - +try_to_unmap(). shrink_page_list() will divert them to the unevictable list +when try_to_unmap() returns SWAP_MLOCK, as discussed above. diff --git a/Documentation/x86/zero-page.txt b/Documentation/x86/zero-page.txt index 169ad423a3d..4f913857b8a 100644 --- a/Documentation/x86/zero-page.txt +++ b/Documentation/x86/zero-page.txt @@ -3,7 +3,7 @@ protocol of kernel. These should be filled by bootloader or 16-bit real-mode setup code of the kernel. 
References/settings to it mainly are in: - include/asm-x86/bootparam.h + arch/x86/include/asm/bootparam.h Offset Proto Name Meaning diff --git a/MAINTAINERS b/MAINTAINERS index 141aff67bd6..094dd52d730 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -616,7 +616,7 @@ M: mkpetch@internode.on.net S: Maintained ARM/TOSA MACHINE SUPPORT -P: Dmitry Baryshkov +P: Dmitry Eremin-Solenikov M: dbaryshkov@gmail.com P: Dirk Opfer M: dirk@opfer-online.de @@ -1092,11 +1092,8 @@ S: Maintained CHECKPATCH P: Andy Whitcroft -M: apw@shadowen.org -P: Randy Dunlap -M: rdunlap@xenotime.net -P: Joel Schopp -M: jschopp@austin.ibm.com +M: apw@canonical.com +L: linux-kernel@vger.kernel.org S: Supported CISCO 10G ETHERNET DRIVER diff --git a/arch/alpha/include/asm/atomic.h b/arch/alpha/include/asm/atomic.h index ca88e54dec9..62b363584b2 100644 --- a/arch/alpha/include/asm/atomic.h +++ b/arch/alpha/include/asm/atomic.h @@ -1,6 +1,7 @@ #ifndef _ALPHA_ATOMIC_H #define _ALPHA_ATOMIC_H +#include <linux/types.h> #include <asm/barrier.h> #include <asm/system.h> @@ -13,14 +14,6 @@ */ -/* - * Counter is volatile to make sure gcc doesn't try to be clever - * and move things around on us. We need to use _exactly_ the address - * the user gave us, not some alias that contains the same information. 
- */ -typedef struct { volatile int counter; } atomic_t; -typedef struct { volatile long counter; } atomic64_t; - #define ATOMIC_INIT(i) ( (atomic_t) { (i) } ) #define ATOMIC64_INIT(i) ( (atomic64_t) { (i) } ) diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h index 325f881ccb5..ee99723b3a6 100644 --- a/arch/arm/include/asm/atomic.h +++ b/arch/arm/include/asm/atomic.h @@ -12,10 +12,9 @@ #define __ASM_ARM_ATOMIC_H #include <linux/compiler.h> +#include <linux/types.h> #include <asm/system.h> -typedef struct { volatile int counter; } atomic_t; - #define ATOMIC_INIT(i) { (i) } #ifdef __KERNEL__ diff --git a/arch/arm/kernel/kprobes.c b/arch/arm/kernel/kprobes.c index 3f9abe0e9af..f692efddd44 100644 --- a/arch/arm/kernel/kprobes.c +++ b/arch/arm/kernel/kprobes.c @@ -92,9 +92,7 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p) void __kprobes arch_remove_kprobe(struct kprobe *p) { if (p->ainsn.insn) { - mutex_lock(&kprobe_mutex); free_insn_slot(p->ainsn.insn, 0); - mutex_unlock(&kprobe_mutex); p->ainsn.insn = NULL; } } diff --git a/arch/arm/mach-s3c2410/include/mach/spi.h b/arch/arm/mach-s3c2410/include/mach/spi.h index 774f3adfe8a..1d300fb112b 100644 --- a/arch/arm/mach-s3c2410/include/mach/spi.h +++ b/arch/arm/mach-s3c2410/include/mach/spi.h @@ -14,7 +14,7 @@ #define __ASM_ARCH_SPI_H __FILE__ struct s3c2410_spi_info { - unsigned long pin_cs; /* simple gpio cs */ + int pin_cs; /* simple gpio cs */ unsigned int num_cs; /* total chipselects */ int bus_num; /* bus number to use. 
*/ diff --git a/arch/avr32/include/asm/atomic.h b/arch/avr32/include/asm/atomic.h index 7ef3862a73d..31881510774 100644 --- a/arch/avr32/include/asm/atomic.h +++ b/arch/avr32/include/asm/atomic.h @@ -14,9 +14,9 @@ #ifndef __ASM_AVR32_ATOMIC_H #define __ASM_AVR32_ATOMIC_H +#include <linux/types.h> #include <asm/system.h> -typedef struct { volatile int counter; } atomic_t; #define ATOMIC_INIT(i) { (i) } #define atomic_read(v) ((v)->counter) diff --git a/arch/avr32/kernel/traps.c b/arch/avr32/kernel/traps.c index 0d987373bc0..d547c8df157 100644 --- a/arch/avr32/kernel/traps.c +++ b/arch/avr32/kernel/traps.c @@ -7,6 +7,7 @@ */ #include <linux/bug.h> +#include <linux/hardirq.h> #include <linux/init.h> #include <linux/kallsyms.h> #include <linux/kdebug.h> diff --git a/arch/blackfin/include/asm/atomic.h b/arch/blackfin/include/asm/atomic.h index 7cf50871860..25776c19064 100644 --- a/arch/blackfin/include/asm/atomic.h +++ b/arch/blackfin/include/asm/atomic.h @@ -1,6 +1,7 @@ #ifndef __ARCH_BLACKFIN_ATOMIC__ #define __ARCH_BLACKFIN_ATOMIC__ +#include <linux/types.h> #include <asm/system.h> /* local_irq_XXX() */ /* @@ -13,9 +14,6 @@ * Tony Kou (tonyko@lineo.ca) Lineo Inc. 2001 */ -typedef struct { - int counter; -} atomic_t; #define ATOMIC_INIT(i) { (i) } #define atomic_read(v) ((v)->counter) diff --git a/arch/cris/include/asm/atomic.h b/arch/cris/include/asm/atomic.h index f71ea686a2e..5718dd8902a 100644 --- a/arch/cris/include/asm/atomic.h +++ b/arch/cris/include/asm/atomic.h @@ -4,7 +4,7 @@ #define __ASM_CRIS_ATOMIC__ #include <linux/compiler.h> - +#include <linux/types.h> #include <asm/system.h> #include <arch/atomic.h> @@ -13,8 +13,6 @@ * resource counting etc.. 
*/ -typedef struct { volatile int counter; } atomic_t; - #define ATOMIC_INIT(i) { (i) } #define atomic_read(v) ((v)->counter) diff --git a/arch/h8300/include/asm/atomic.h b/arch/h8300/include/asm/atomic.h index b4cf0ea97ed..833186c8dc3 100644 --- a/arch/h8300/include/asm/atomic.h +++ b/arch/h8300/include/asm/atomic.h @@ -1,12 +1,13 @@ #ifndef __ARCH_H8300_ATOMIC__ #define __ARCH_H8300_ATOMIC__ +#include <linux/types.h> + /* * Atomic operations that C can't guarantee us. Useful for * resource counting etc.. */ -typedef struct { int counter; } atomic_t; #define ATOMIC_INIT(i) { (i) } #define atomic_read(v) ((v)->counter) diff --git a/arch/ia64/include/asm/atomic.h b/arch/ia64/include/asm/atomic.h index 50c2b83fd5a..d37292bd987 100644 --- a/arch/ia64/include/asm/atomic.h +++ b/arch/ia64/include/asm/atomic.h @@ -17,12 +17,6 @@ #include <asm/intrinsics.h> #include <asm/system.h> -/* - * On IA-64, counter must always be volatile to ensure that that the - * memory accesses are ordered. - */ -typedef struct { volatile __s32 counter; } atomic_t; -typedef struct { volatile __s64 counter; } atomic64_t; #define ATOMIC_INIT(i) ((atomic_t) { (i) }) #define ATOMIC64_INIT(i) ((atomic64_t) { (i) }) diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c index f07688da947..097b84d54e7 100644 --- a/arch/ia64/kernel/kprobes.c +++ b/arch/ia64/kernel/kprobes.c @@ -670,9 +670,11 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p) void __kprobes arch_remove_kprobe(struct kprobe *p) { - mutex_lock(&kprobe_mutex); - free_insn_slot(p->ainsn.insn, p->ainsn.inst_flag & INST_FLAG_BOOSTABLE); - mutex_unlock(&kprobe_mutex); + if (p->ainsn.insn) { + free_insn_slot(p->ainsn.insn, + p->ainsn.inst_flag & INST_FLAG_BOOSTABLE); + p->ainsn.insn = NULL; + } } /* * We are resuming execution after a single step fault, so the pt_regs diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 054bcd9439a..56e12903973 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -692,7 
+692,7 @@ int arch_add_memory(int nid, u64 start, u64 size) pgdat = NODE_DATA(nid); zone = pgdat->node_zones + ZONE_NORMAL; - ret = __add_pages(zone, start_pfn, nr_pages); + ret = __add_pages(nid, zone, start_pfn, nr_pages); if (ret) printk("%s: Problem encountered in __add_pages() as ret=%d\n", diff --git a/arch/m68knommu/include/asm/atomic.h b/arch/m68knommu/include/asm/atomic.h index d5632a305da..6bb674855a3 100644 --- a/arch/m68knommu/include/asm/atomic.h +++ b/arch/m68knommu/include/asm/atomic.h @@ -1,6 +1,7 @@ #ifndef __ARCH_M68KNOMMU_ATOMIC__ #define __ARCH_M68KNOMMU_ATOMIC__ +#include <linux/types.h> #include <asm/system.h> /* @@ -12,7 +13,6 @@ * We do not have SMP m68k systems, so we don't have to deal with that. */ -typedef struct { int counter; } atomic_t; #define ATOMIC_INIT(i) { (i) } #define atomic_read(v) ((v)->counter) diff --git a/arch/mips/include/asm/atomic.h b/arch/mips/include/asm/atomic.h index 1232be3885b..c996c3b4d07 100644 --- a/arch/mips/include/asm/atomic.h +++ b/arch/mips/include/asm/atomic.h @@ -15,13 +15,12 @@ #define _ASM_ATOMIC_H #include <linux/irqflags.h> +#include <linux/types.h> #include <asm/barrier.h> #include <asm/cpu-features.h> #include <asm/war.h> #include <asm/system.h> -typedef struct { volatile int counter; } atomic_t; - #define ATOMIC_INIT(i) { (i) } /* @@ -404,8 +403,6 @@ static __inline__ int atomic_add_unless(atomic_t *v, int a, int u) #ifdef CONFIG_64BIT -typedef struct { volatile long counter; } atomic64_t; - #define ATOMIC64_INIT(i) { (i) } /* diff --git a/arch/parisc/include/asm/atomic.h b/arch/parisc/include/asm/atomic.h index 57fcc4a5ebb..edbfe25c5fc 100644 --- a/arch/parisc/include/asm/atomic.h +++ b/arch/parisc/include/asm/atomic.h @@ -155,14 +155,11 @@ static inline unsigned long __cmpxchg_local(volatile void *ptr, #define cmpxchg64_local(ptr, o, n) __cmpxchg64_local_generic((ptr), (o), (n)) #endif -/* Note that we need not lock read accesses - aligned word writes/reads - * are atomic, so a reader never sees 
unconsistent values. - * - * Cache-line alignment would conflict with, for example, linux/module.h +/* + * Note that we need not lock read accesses - aligned word writes/reads + * are atomic, so a reader never sees inconsistent values. */ -typedef struct { volatile int counter; } atomic_t; - /* It's possible to reduce all atomic operations to either * __atomic_add_return, atomic_set and atomic_read (the latter * is there only for consistency). @@ -260,8 +257,6 @@ static __inline__ int atomic_add_unless(atomic_t *v, int a, int u) #ifdef CONFIG_64BIT -typedef struct { volatile s64 counter; } atomic64_t; - #define ATOMIC64_INIT(i) ((atomic64_t) { (i) }) static __inline__ int diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h index 499be5bdd6f..b401950f525 100644 --- a/arch/powerpc/include/asm/atomic.h +++ b/arch/powerpc/include/asm/atomic.h @@ -5,7 +5,7 @@ * PowerPC atomic operations */ -typedef struct { int counter; } atomic_t; +#include <linux/types.h> #ifdef __KERNEL__ #include <linux/compiler.h> @@ -251,8 +251,6 @@ static __inline__ int atomic_dec_if_positive(atomic_t *v) #ifdef __powerpc64__ -typedef struct { long counter; } atomic64_t; - #define ATOMIC64_INIT(i) { (i) } static __inline__ long atomic64_read(const atomic64_t *v) diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index 26f0d0ab27a..b1dafb6a974 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -18,6 +18,12 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); /* + * The version of vma_mmu_pagesize() in arch/powerpc/mm/hugetlbpage.c needs + * to override the version in mm/hugetlb.c + */ +#define vma_mmu_pagesize vma_mmu_pagesize + +/* * If the arch doesn't supply something else, assume that hugepage * size aligned regions are ok without further preparation. 
*/ diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index de79915452c..989edcdf029 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -96,9 +96,10 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p) void __kprobes arch_remove_kprobe(struct kprobe *p) { - mutex_lock(&kprobe_mutex); - free_insn_slot(p->ainsn.insn, 0); - mutex_unlock(&kprobe_mutex); + if (p->ainsn.insn) { + free_insn_slot(p->ainsn.insn, 0); + p->ainsn.insn = NULL; + } } static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 201c7a5486c..9920d6a7cf2 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -512,6 +512,13 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0); } +unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) +{ + unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start); + + return 1UL << mmu_psize_to_shift(psize); +} + /* * Called by asm hashtable.S for doing lazy icache flush */ diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 53b06ebb3f2..f00f09a77f1 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -132,7 +132,7 @@ int arch_add_memory(int nid, u64 start, u64 size) /* this should work for most non-highmem platforms */ zone = pgdata->node_zones; - return __add_pages(zone, start_pfn, nr_pages); + return __add_pages(nid, zone, start_pfn, nr_pages); } #endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/arch/s390/include/asm/atomic.h b/arch/s390/include/asm/atomic.h index 2d184655bc5..de432f2de2d 100644 --- a/arch/s390/include/asm/atomic.h +++ b/arch/s390/include/asm/atomic.h @@ -2,6 +2,7 @@ #define __ARCH_S390_ATOMIC__ #include <linux/compiler.h> +#include <linux/types.h> /* * include/asm-s390/atomic.h @@ -23,9 +24,6 @@ * S390 uses 'Compare And Swap' for 
atomicity in SMP enviroment */ -typedef struct { - int counter; -} __attribute__ ((aligned (4))) atomic_t; #define ATOMIC_INIT(i) { (i) } #ifdef __KERNEL__ @@ -149,9 +147,6 @@ static __inline__ int atomic_add_unless(atomic_t *v, int a, int u) #undef __CS_LOOP #ifdef __s390x__ -typedef struct { - long long counter; -} __attribute__ ((aligned (8))) atomic64_t; #define ATOMIC64_INIT(i) { (i) } #if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 2) diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index 569079ec4ff..9b92856632c 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -218,9 +218,10 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p) void __kprobes arch_remove_kprobe(struct kprobe *p) { - mutex_lock(&kprobe_mutex); - free_insn_slot(p->ainsn.insn, 0); - mutex_unlock(&kprobe_mutex); + if (p->ainsn.insn) { + free_insn_slot(p->ainsn.insn, 0); + p->ainsn.insn = NULL; + } } static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 158b0d6d704..f0258ca3b17 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -183,7 +183,7 @@ int arch_add_memory(int nid, u64 start, u64 size) rc = vmem_add_mapping(start, size); if (rc) return rc; - rc = __add_pages(zone, PFN_DOWN(start), PFN_DOWN(size)); + rc = __add_pages(nid, zone, PFN_DOWN(start), PFN_DOWN(size)); if (rc) vmem_remove_mapping(start, size); return rc; diff --git a/arch/sh/include/asm/atomic.h b/arch/sh/include/asm/atomic.h index c043ef00302..6327ffbb199 100644 --- a/arch/sh/include/asm/atomic.h +++ b/arch/sh/include/asm/atomic.h @@ -7,16 +7,15 @@ * */ -typedef struct { volatile int counter; } atomic_t; +#include <linux/compiler.h> +#include <linux/types.h> +#include <asm/system.h> #define ATOMIC_INIT(i) ( (atomic_t) { (i) } ) #define atomic_read(v) ((v)->counter) #define atomic_set(v,i) ((v)->counter = (i)) -#include <linux/compiler.h> -#include <asm/system.h> - #if 
defined(CONFIG_GUSA_RB) #include <asm/atomic-grb.h> #elif defined(CONFIG_CPU_SH4A) diff --git a/arch/sh/kernel/traps_32.c b/arch/sh/kernel/traps_32.c index 88807a2aacc..c0aa3d83ec0 100644 --- a/arch/sh/kernel/traps_32.c +++ b/arch/sh/kernel/traps_32.c @@ -13,6 +13,7 @@ */ #include <linux/kernel.h> #include <linux/ptrace.h> +#include <linux/hardirq.h> #include <linux/init.h> #include <linux/spinlock.h> #include <linux/module.h> diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 6cbef8caeb5..3edf297c829 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -311,7 +311,8 @@ int arch_add_memory(int nid, u64 start, u64 size) pgdat = NODE_DATA(nid); /* We only have ZONE_NORMAL, so this is easy.. */ - ret = __add_pages(pgdat->node_zones + ZONE_NORMAL, start_pfn, nr_pages); + ret = __add_pages(nid, pgdat->node_zones + ZONE_NORMAL, + start_pfn, nr_pages); if (unlikely(ret)) printk("%s: Failed, __add_pages() == %d\n", __func__, ret); diff --git a/arch/sparc/include/asm/atomic_32.h b/arch/sparc/include/asm/atomic_32.h index 5c944b5a804..ce465975a6a 100644 --- a/arch/sparc/include/asm/atomic_32.h +++ b/arch/sparc/include/asm/atomic_32.h @@ -13,8 +13,6 @@ #include <linux/types.h> -typedef struct { volatile int counter; } atomic_t; - #ifdef __KERNEL__ #define ATOMIC_INIT(i) { (i) } diff --git a/arch/sparc/include/asm/atomic_64.h b/arch/sparc/include/asm/atomic_64.h index 5982c5ae7f0..a0a70649269 100644 --- a/arch/sparc/include/asm/atomic_64.h +++ b/arch/sparc/include/asm/atomic_64.h @@ -10,9 +10,6 @@ #include <linux/types.h> #include <asm/system.h> -typedef struct { volatile int counter; } atomic_t; -typedef struct { volatile __s64 counter; } atomic64_t; - #define ATOMIC_INIT(i) { (i) } #define ATOMIC64_INIT(i) { (i) } diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index 44e49041949..7384d8accfe 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -64,11 +64,10 @@ good_area: do { int fault; -survive: + fault = handle_mm_fault(mm, vma, address, 
is_write); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) { - err = -ENOMEM; goto out_of_memory; } else if (fault & VM_FAULT_SIGBUS) { err = -EACCES; @@ -104,18 +103,14 @@ out: out_nosemaphore: return err; -/* - * We ran out of memory, or some other thing happened to us that made - * us unable to handle the page fault gracefully. - */ out_of_memory: - if (is_global_init(current)) { - up_read(&mm->mmap_sem); - yield(); - down_read(&mm->mmap_sem); - goto survive; - } - goto out; + /* + * We ran out of memory, call the OOM killer, and return the userspace + * (which will retry the fault, or kill us if we got oom-killed). + */ + up_read(&mm->mmap_sem); + pagefault_out_of_memory(); + return 0; } static void bad_segv(struct faultinfo fi, unsigned long ip) @@ -214,9 +209,6 @@ unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user, si.si_addr = (void __user *)address; current->thread.arch.faultinfo = fi; force_sig_info(SIGBUS, &si, current); - } else if (err == -ENOMEM) { - printk(KERN_INFO "VM: killing process %s\n", current->comm); - do_exit(SIGKILL); } else { BUG_ON(err != -EFAULT); si.si_signo = SIGSEGV; diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h index ad5b9f6ecdd..85b46fba422 100644 --- a/arch/x86/include/asm/atomic_32.h +++ b/arch/x86/include/asm/atomic_32.h @@ -2,6 +2,7 @@ #define _ASM_X86_ATOMIC_32_H #include <linux/compiler.h> +#include <linux/types.h> #include <asm/processor.h> #include <asm/cmpxchg.h> @@ -10,15 +11,6 @@ * resource counting etc.. */ -/* - * Make sure gcc doesn't try to be clever and move things around - * on us. We need to use _exactly_ the address the user gave us, - * not some alias that contains the same information. 
- */ -typedef struct { - int counter; -} atomic_t; - #define ATOMIC_INIT(i) { (i) } /** diff --git a/arch/x86/include/asm/atomic_64.h b/arch/x86/include/asm/atomic_64.h index 279d2a731f3..8c21731984d 100644 --- a/arch/x86/include/asm/atomic_64.h +++ b/arch/x86/include/asm/atomic_64.h @@ -1,25 +1,15 @@ #ifndef _ASM_X86_ATOMIC_64_H #define _ASM_X86_ATOMIC_64_H +#include <linux/types.h> #include <asm/alternative.h> #include <asm/cmpxchg.h> -/* atomic_t should be 32 bit signed type */ - /* * Atomic operations that C can't guarantee us. Useful for * resource counting etc.. */ -/* - * Make sure gcc doesn't try to be clever and move things around - * on us. We need to use _exactly_ the address the user gave us, - * not some alias that contains the same information. - */ -typedef struct { - int counter; -} atomic_t; - #define ATOMIC_INIT(i) { (i) } /** @@ -191,11 +181,7 @@ static inline int atomic_sub_return(int i, atomic_t *v) #define atomic_inc_return(v) (atomic_add_return(1, v)) #define atomic_dec_return(v) (atomic_sub_return(1, v)) -/* An 64bit atomic type */ - -typedef struct { - long counter; -} atomic64_t; +/* The 64-bit atomic type */ #define ATOMIC64_INIT(i) { (i) } diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h deleted file mode 100644 index 8b064bd9c55..00000000000 --- a/arch/x86/include/asm/unwind.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef _ASM_X86_UNWIND_H -#define _ASM_X86_UNWIND_H - -#define UNW_PC(frame) ((void)(frame), 0UL) -#define UNW_SP(frame) ((void)(frame), 0UL) -#define UNW_FP(frame) ((void)(frame), 0UL) - -static inline int arch_unw_user_mode(const void *info) -{ - return 0; -} - -#endif /* _ASM_X86_UNWIND_H */ diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 6c27679ec6a..eead6f8f921 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -376,9 +376,10 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p) void __kprobes arch_remove_kprobe(struct kprobe *p) { - 
mutex_lock(&kprobe_mutex); - free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1)); - mutex_unlock(&kprobe_mutex); + if (p->ainsn.insn) { + free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1)); + p->ainsn.insn = NULL; + } } static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ce6650eb64e..c9a666cdd3d 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -20,7 +20,6 @@ #include <linux/module.h> #include <linux/ptrace.h> #include <linux/string.h> -#include <linux/unwind.h> #include <linux/delay.h> #include <linux/errno.h> #include <linux/kexec.h> @@ -51,7 +50,6 @@ #include <asm/debugreg.h> #include <asm/atomic.h> #include <asm/system.h> -#include <asm/unwind.h> #include <asm/traps.h> #include <asm/desc.h> #include <asm/i387.h> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 57ec8c86a87..9e268b6b204 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -667,7 +667,6 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) if (unlikely(in_atomic() || !mm)) goto bad_area_nosemaphore; -again: /* * When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the @@ -859,25 +858,14 @@ no_context: oops_end(flags, regs, sig); #endif -/* - * We ran out of memory, or some other thing happened to us that made - * us unable to handle the page fault gracefully. - */ out_of_memory: + /* + * We ran out of memory, call the OOM killer, and return the userspace + * (which will retry the fault, or kill us if we got oom-killed). 
+ */ up_read(&mm->mmap_sem); - if (is_global_init(tsk)) { - yield(); - /* - * Re-lookup the vma - in theory the vma tree might - * have changed: - */ - goto again; - } - - printk("VM: killing process %s\n", tsk->comm); - if (error_code & PF_USER) - do_group_exit(SIGKILL); - goto no_context; + pagefault_out_of_memory(); + return; do_sigbus: up_read(&mm->mmap_sem); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index f99a6c6c432..544d724caee 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -1079,7 +1079,7 @@ int arch_add_memory(int nid, u64 start, u64 size) unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - return __add_pages(zone, start_pfn, nr_pages); + return __add_pages(nid, zone, start_pfn, nr_pages); } #endif diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 9f7a0d24d42..54c437e9654 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -857,7 +857,7 @@ int arch_add_memory(int nid, u64 start, u64 size) if (last_mapped_pfn > max_pfn_mapped) max_pfn_mapped = last_mapped_pfn; - ret = __add_pages(zone, start_pfn, nr_pages); + ret = __add_pages(nid, zone, start_pfn, nr_pages); WARN_ON_ONCE(ret); return ret; diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 38aca048e95..66a9d814556 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -41,6 +41,7 @@ #include <linux/pm_qos_params.h> #include <linux/clockchips.h> #include <linux/cpuidle.h> +#include <linux/irqflags.h> /* * Include the apic definitions for x86 to have the APIC timer related defines diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 5260e9e0df4..989429cfed8 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -347,8 +347,9 @@ static inline int memory_probe_init(void) * section belongs to... 
*/ -static int add_memory_block(unsigned long node_id, struct mem_section *section, - unsigned long state, int phys_device) +static int add_memory_block(int nid, struct mem_section *section, + unsigned long state, int phys_device, + enum mem_add_context context) { struct memory_block *mem = kzalloc(sizeof(*mem), GFP_KERNEL); int ret = 0; @@ -370,6 +371,10 @@ static int add_memory_block(unsigned long node_id, struct mem_section *section, ret = mem_create_simple_file(mem, phys_device); if (!ret) ret = mem_create_simple_file(mem, removable); + if (!ret) { + if (context == HOTPLUG) + ret = register_mem_sect_under_node(mem, nid); + } return ret; } @@ -382,7 +387,7 @@ static int add_memory_block(unsigned long node_id, struct mem_section *section, * * This could be made generic for all sysdev classes. */ -static struct memory_block *find_memory_block(struct mem_section *section) +struct memory_block *find_memory_block(struct mem_section *section) { struct kobject *kobj; struct sys_device *sysdev; @@ -411,6 +416,7 @@ int remove_memory_block(unsigned long node_id, struct mem_section *section, struct memory_block *mem; mem = find_memory_block(section); + unregister_mem_sect_under_nodes(mem); mem_remove_simple_file(mem, phys_index); mem_remove_simple_file(mem, state); mem_remove_simple_file(mem, phys_device); @@ -424,9 +430,9 @@ int remove_memory_block(unsigned long node_id, struct mem_section *section, * need an interface for the VM to add new memory regions, * but without onlining it. 
*/ -int register_new_memory(struct mem_section *section) +int register_new_memory(int nid, struct mem_section *section) { - return add_memory_block(0, section, MEM_OFFLINE, 0); + return add_memory_block(nid, section, MEM_OFFLINE, 0, HOTPLUG); } int unregister_memory_section(struct mem_section *section) @@ -458,7 +464,8 @@ int __init memory_dev_init(void) for (i = 0; i < NR_MEM_SECTIONS; i++) { if (!present_section_nr(i)) continue; - err = add_memory_block(0, __nr_to_section(i), MEM_ONLINE, 0); + err = add_memory_block(0, __nr_to_section(i), MEM_ONLINE, + 0, BOOT); if (!ret) ret = err; } diff --git a/drivers/base/node.c b/drivers/base/node.c index 91636cd8b6c..43fa90b837e 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -6,6 +6,7 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/mm.h> +#include <linux/memory.h> #include <linux/node.h> #include <linux/hugetlb.h> #include <linux/cpumask.h> @@ -248,6 +249,105 @@ int unregister_cpu_under_node(unsigned int cpu, unsigned int nid) return 0; } +#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE +#define page_initialized(page) (page->lru.next) + +static int get_nid_for_pfn(unsigned long pfn) +{ + struct page *page; + + if (!pfn_valid_within(pfn)) + return -1; + page = pfn_to_page(pfn); + if (!page_initialized(page)) + return -1; + return pfn_to_nid(pfn); +} + +/* register memory section under specified node if it spans that node */ +int register_mem_sect_under_node(struct memory_block *mem_blk, int nid) +{ + unsigned long pfn, sect_start_pfn, sect_end_pfn; + + if (!mem_blk) + return -EFAULT; + if (!node_online(nid)) + return 0; + sect_start_pfn = section_nr_to_pfn(mem_blk->phys_index); + sect_end_pfn = sect_start_pfn + PAGES_PER_SECTION - 1; + for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) { + int page_nid; + + page_nid = get_nid_for_pfn(pfn); + if (page_nid < 0) + continue; + if (page_nid != nid) + continue; + return sysfs_create_link_nowarn(&node_devices[nid].sysdev.kobj, + &mem_blk->sysdev.kobj, + 
kobject_name(&mem_blk->sysdev.kobj)); + } + /* mem section does not span the specified node */ + return 0; +} + +/* unregister memory section under all nodes that it spans */ +int unregister_mem_sect_under_nodes(struct memory_block *mem_blk) +{ + nodemask_t unlinked_nodes; + unsigned long pfn, sect_start_pfn, sect_end_pfn; + + if (!mem_blk) + return -EFAULT; + nodes_clear(unlinked_nodes); + sect_start_pfn = section_nr_to_pfn(mem_blk->phys_index); + sect_end_pfn = sect_start_pfn + PAGES_PER_SECTION - 1; + for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) { + unsigned int nid; + + nid = get_nid_for_pfn(pfn); + if (nid < 0) + continue; + if (!node_online(nid)) + continue; + if (node_test_and_set(nid, unlinked_nodes)) + continue; + sysfs_remove_link(&node_devices[nid].sysdev.kobj, + kobject_name(&mem_blk->sysdev.kobj)); + } + return 0; +} + +static int link_mem_sections(int nid) +{ + unsigned long start_pfn = NODE_DATA(nid)->node_start_pfn; + unsigned long end_pfn = start_pfn + NODE_DATA(nid)->node_spanned_pages; + unsigned long pfn; + int err = 0; + + for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + unsigned long section_nr = pfn_to_section_nr(pfn); + struct mem_section *mem_sect; + struct memory_block *mem_blk; + int ret; + + if (!present_section_nr(section_nr)) + continue; + mem_sect = __nr_to_section(section_nr); + mem_blk = find_memory_block(mem_sect); + ret = register_mem_sect_under_node(mem_blk, nid); + if (!err) + err = ret; + + /* discard ref obtained in find_memory_block() */ + kobject_put(&mem_blk->sysdev.kobj); + } + return err; +} +#else +static int link_mem_sections(int nid) { return 0; } +#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ + int register_one_node(int nid) { int error = 0; @@ -267,6 +367,9 @@ int register_one_node(int nid) if (cpu_to_node(cpu) == nid) register_cpu_under_node(cpu, nid); } + + /* link memory sections under this node */ + error = link_mem_sections(nid); } return error; diff --git a/drivers/char/Kconfig 
b/drivers/char/Kconfig index 1697043119b..35914b6e1d2 100644 --- a/drivers/char/Kconfig +++ b/drivers/char/Kconfig @@ -841,7 +841,7 @@ config JS_RTC config GEN_RTC tristate "Generic /dev/rtc emulation" - depends on RTC!=y && !IA64 && !ARM && !M32R && !MIPS && !SPARC && !FRV && !S390 && !SUPERH && !AVR32 + depends on RTC!=y && !IA64 && !ARM && !M32R && !MIPS && !SPARC && !FRV && !S390 && !SUPERH && !AVR32 && !BLACKFIN ---help--- If you say Y here and create a character special file /dev/rtc with major number 10 and minor number 135 using mknod ("man mknod"), you diff --git a/drivers/char/consolemap.c b/drivers/char/consolemap.c index 4246b8e36cb..45d3e80156d 100644 --- a/drivers/char/consolemap.c +++ b/drivers/char/consolemap.c @@ -554,7 +554,7 @@ int con_set_unimap(struct vc_data *vc, ushort ct, struct unipair __user *list) __get_user(fontpos, &list->fontpos); if ((err1 = con_insert_unipair(p, unicode,fontpos)) != 0) err = err1; - list++; + list++; } if (con_unify_unimap(vc, p)) diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 6431f6921a6..3586b3b3df3 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -425,9 +425,6 @@ static ssize_t read_oldmem(struct file *file, char __user *buf, } #endif -extern long vread(char *buf, char *addr, unsigned long count); -extern long vwrite(char *buf, char *addr, unsigned long count); - #ifdef CONFIG_DEVKMEM /* * This function reads the *virtual* memory as seen by the kernel. diff --git a/drivers/char/random.c b/drivers/char/random.c index c7afc068c28..7c13581ca9c 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -407,7 +407,7 @@ struct entropy_store { /* read-write data: */ spinlock_t lock; unsigned add_ptr; - int entropy_count; /* Must at no time exceed ->POOLBITS! 
*/ + int entropy_count; int input_rotate; }; @@ -767,11 +767,10 @@ static size_t account(struct entropy_store *r, size_t nbytes, int min, { unsigned long flags; - BUG_ON(r->entropy_count > r->poolinfo->POOLBITS); - /* Hold lock while accounting */ spin_lock_irqsave(&r->lock, flags); + BUG_ON(r->entropy_count > r->poolinfo->POOLBITS); DEBUG_ENT("trying to extract %d bits from %s\n", nbytes * 8, r->name); diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index 94966edfb44..d41b9f6f790 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -82,7 +82,7 @@ static void sysrq_handle_loglevel(int key, struct tty_struct *tty) } static struct sysrq_key_op sysrq_loglevel_op = { .handler = sysrq_handle_loglevel, - .help_msg = "loglevel0-8", + .help_msg = "loglevel(0-9)", .action_msg = "Changing Loglevel", .enable_mask = SYSRQ_ENABLE_LOG, }; @@ -233,7 +233,7 @@ static void sysrq_handle_showallcpus(int key, struct tty_struct *tty) static struct sysrq_key_op sysrq_showallcpus_op = { .handler = sysrq_handle_showallcpus, - .help_msg = "aLlcpus", + .help_msg = "show-backtrace-all-active-cpus(L)", .action_msg = "Show backtrace of all active CPUs", .enable_mask = SYSRQ_ENABLE_DUMP, }; @@ -247,7 +247,7 @@ static void sysrq_handle_showregs(int key, struct tty_struct *tty) } static struct sysrq_key_op sysrq_showregs_op = { .handler = sysrq_handle_showregs, - .help_msg = "showPc", + .help_msg = "show-registers(P)", .action_msg = "Show Regs", .enable_mask = SYSRQ_ENABLE_DUMP, }; @@ -258,7 +258,7 @@ static void sysrq_handle_showstate(int key, struct tty_struct *tty) } static struct sysrq_key_op sysrq_showstate_op = { .handler = sysrq_handle_showstate, - .help_msg = "showTasks", + .help_msg = "show-task-states(T)", .action_msg = "Show State", .enable_mask = SYSRQ_ENABLE_DUMP, }; @@ -269,7 +269,7 @@ static void sysrq_handle_showstate_blocked(int key, struct tty_struct *tty) } static struct sysrq_key_op sysrq_showstate_blocked_op = { .handler = sysrq_handle_showstate_blocked, - 
.help_msg = "shoW-blocked-tasks", + .help_msg = "show-blocked-tasks(W)", .action_msg = "Show Blocked State", .enable_mask = SYSRQ_ENABLE_DUMP, }; @@ -297,7 +297,7 @@ static void sysrq_handle_showmem(int key, struct tty_struct *tty) } static struct sysrq_key_op sysrq_showmem_op = { .handler = sysrq_handle_showmem, - .help_msg = "showMem", + .help_msg = "show-memory-usage(M)", .action_msg = "Show Memory", .enable_mask = SYSRQ_ENABLE_DUMP, }; @@ -323,7 +323,7 @@ static void sysrq_handle_term(int key, struct tty_struct *tty) } static struct sysrq_key_op sysrq_term_op = { .handler = sysrq_handle_term, - .help_msg = "tErm", + .help_msg = "terminate-all-tasks(E)", .action_msg = "Terminate All Tasks", .enable_mask = SYSRQ_ENABLE_SIGNAL, }; @@ -341,7 +341,7 @@ static void sysrq_handle_moom(int key, struct tty_struct *tty) } static struct sysrq_key_op sysrq_moom_op = { .handler = sysrq_handle_moom, - .help_msg = "Full", + .help_msg = "memory-full-oom-kill(F)", .action_msg = "Manual OOM execution", .enable_mask = SYSRQ_ENABLE_SIGNAL, }; @@ -353,7 +353,7 @@ static void sysrq_handle_kill(int key, struct tty_struct *tty) } static struct sysrq_key_op sysrq_kill_op = { .handler = sysrq_handle_kill, - .help_msg = "kIll", + .help_msg = "kill-all-tasks(I)", .action_msg = "Kill All Tasks", .enable_mask = SYSRQ_ENABLE_SIGNAL, }; @@ -364,7 +364,7 @@ static void sysrq_handle_unrt(int key, struct tty_struct *tty) } static struct sysrq_key_op sysrq_unrt_op = { .handler = sysrq_handle_unrt, - .help_msg = "Nice", + .help_msg = "nice-all-RT-tasks(N)", .action_msg = "Nice All RT Tasks", .enable_mask = SYSRQ_ENABLE_RTNICE, }; diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index e2667a8c299..eee47fd16d7 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -109,6 +109,13 @@ config EDAC_X38 Support for error detection and correction on the Intel X38 server chipsets. 
+config EDAC_I5400 + tristate "Intel 5400 (Seaburg) chipsets" + depends on EDAC_MM_EDAC && PCI && X86 + help + Support for error detection and correction the Intel + i5400 MCH chipset (Seaburg). + config EDAC_I82860 tristate "Intel 82860" depends on EDAC_MM_EDAC && PCI && X86_32 diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile index 62c2d9bad8d..b75196927de 100644 --- a/drivers/edac/Makefile +++ b/drivers/edac/Makefile @@ -20,6 +20,7 @@ endif obj-$(CONFIG_EDAC_AMD76X) += amd76x_edac.o obj-$(CONFIG_EDAC_I5000) += i5000_edac.o obj-$(CONFIG_EDAC_I5100) += i5100_edac.o +obj-$(CONFIG_EDAC_I5400) += i5400_edac.o obj-$(CONFIG_EDAC_E7XXX) += e7xxx_edac.o obj-$(CONFIG_EDAC_E752X) += e752x_edac.o obj-$(CONFIG_EDAC_I82443BXGX) += i82443bxgx_edac.o diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c index 4041e914328..ca9113e1c10 100644 --- a/drivers/edac/edac_device.c +++ b/drivers/edac/edac_device.c @@ -333,7 +333,7 @@ static int add_edac_dev_to_global_list(struct edac_device_ctl_info *edac_dev) fail0: edac_printk(KERN_WARNING, EDAC_MC, "%s (%s) %s %s already assigned %d\n", - rover->dev->bus_id, edac_dev_name(rover), + dev_name(rover->dev), edac_dev_name(rover), rover->mod_name, rover->ctl_name, rover->dev_idx); return 1; diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index d110392d48f..25d66940b4f 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -401,7 +401,7 @@ static int add_mc_to_global_list(struct mem_ctl_info *mci) fail0: edac_printk(KERN_WARNING, EDAC_MC, - "%s (%s) %s %s already assigned %d\n", p->dev->bus_id, + "%s (%s) %s %s already assigned %d\n", dev_name(p->dev), edac_dev_name(mci), p->mod_name, p->ctl_name, p->mc_idx); return 1; diff --git a/drivers/edac/edac_pci.c b/drivers/edac/edac_pci.c index 22ec9d5d431..5d3c8083a40 100644 --- a/drivers/edac/edac_pci.c +++ b/drivers/edac/edac_pci.c @@ -150,7 +150,7 @@ static int add_edac_pci_to_global_list(struct edac_pci_ctl_info *pci) fail0: 
edac_printk(KERN_WARNING, EDAC_PCI, "%s (%s) %s %s already assigned %d\n", - rover->dev->bus_id, edac_dev_name(rover), + dev_name(rover->dev), edac_dev_name(rover), rover->mod_name, rover->ctl_name, rover->pci_idx); return 1; diff --git a/drivers/edac/edac_pci_sysfs.c b/drivers/edac/edac_pci_sysfs.c index 5c153dccc95..422728cfe99 100644 --- a/drivers/edac/edac_pci_sysfs.c +++ b/drivers/edac/edac_pci_sysfs.c @@ -569,7 +569,7 @@ static void edac_pci_dev_parity_test(struct pci_dev *dev) local_irq_restore(flags); - debugf4("PCI STATUS= 0x%04x %s\n", status, dev->dev.bus_id); + debugf4("PCI STATUS= 0x%04x %s\n", status, dev_name(&dev->dev)); /* check the status reg for errors on boards NOT marked as broken * if broken, we cannot trust any of the status bits @@ -600,13 +600,13 @@ static void edac_pci_dev_parity_test(struct pci_dev *dev) } - debugf4("PCI HEADER TYPE= 0x%02x %s\n", header_type, dev->dev.bus_id); + debugf4("PCI HEADER TYPE= 0x%02x %s\n", header_type, dev_name(&dev->dev)); if ((header_type & 0x7F) == PCI_HEADER_TYPE_BRIDGE) { /* On bridges, need to examine secondary status register */ status = get_pci_parity_status(dev, 1); - debugf4("PCI SEC_STATUS= 0x%04x %s\n", status, dev->dev.bus_id); + debugf4("PCI SEC_STATUS= 0x%04x %s\n", status, dev_name(&dev->dev)); /* check the secondary status reg for errors, * on NOT broken boards diff --git a/drivers/edac/i5400_edac.c b/drivers/edac/i5400_edac.c new file mode 100644 index 00000000000..b08b6d8e2dc --- /dev/null +++ b/drivers/edac/i5400_edac.c @@ -0,0 +1,1476 @@ +/* + * Intel 5400 class Memory Controllers kernel module (Seaburg) + * + * This file may be distributed under the terms of the + * GNU General Public License. + * + * Copyright (c) 2008 by: + * Ben Woodard <woodard@redhat.com> + * Mauro Carvalho Chehab <mchehab@redhat.com> + * + * Red Hat Inc. 
http://www.redhat.com + * + * Forked and adapted from the i5000_edac driver which was + * written by Douglas Thompson Linux Networx <norsk5@xmission.com> + * + * This module is based on the following document: + * + * Intel 5400 Chipset Memory Controller Hub (MCH) - Datasheet + * http://developer.intel.com/design/chipsets/datashts/313070.htm + * + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/pci.h> +#include <linux/pci_ids.h> +#include <linux/slab.h> +#include <linux/edac.h> +#include <linux/mmzone.h> + +#include "edac_core.h" + +/* + * Alter this version for the I5400 module when modifications are made + */ +#define I5400_REVISION " Ver: 1.0.0 " __DATE__ + +#define EDAC_MOD_STR "i5400_edac" + +#define i5400_printk(level, fmt, arg...) \ + edac_printk(level, "i5400", fmt, ##arg) + +#define i5400_mc_printk(mci, level, fmt, arg...) \ + edac_mc_chipset_printk(mci, level, "i5400", fmt, ##arg) + +/* Limits for i5400 */ +#define NUM_MTRS_PER_BRANCH 4 +#define CHANNELS_PER_BRANCH 2 +#define MAX_CHANNELS 4 +#define MAX_DIMMS (MAX_CHANNELS * 4) /* Up to 4 DIMM's per channel */ +#define MAX_CSROWS (MAX_DIMMS * 2) /* max possible csrows per channel */ + +/* Device 16, + * Function 0: System Address + * Function 1: Memory Branch Map, Control, Errors Register + * Function 2: FSB Error Registers + * + * All 3 functions of Device 16 (0,1,2) share the SAME DID and + * uses PCI_DEVICE_ID_INTEL_5400_ERR for device 16 (0,1,2), + * PCI_DEVICE_ID_INTEL_5400_FBD0 and PCI_DEVICE_ID_INTEL_5400_FBD1 + * for device 21 (0,1). 
+ */ + + /* OFFSETS for Function 0 */ +#define AMBASE 0x48 /* AMB Mem Mapped Reg Region Base */ +#define MAXCH 0x56 /* Max Channel Number */ +#define MAXDIMMPERCH 0x57 /* Max DIMM PER Channel Number */ + + /* OFFSETS for Function 1 */ +#define TOLM 0x6C +#define REDMEMB 0x7C +#define REC_ECC_LOCATOR_ODD(x) ((x) & 0x3fe00) /* bits [17:9] indicate ODD, [8:0] indicate EVEN */ +#define MIR0 0x80 +#define MIR1 0x84 +#define AMIR0 0x8c +#define AMIR1 0x90 + + /* Fatal error registers */ +#define FERR_FAT_FBD 0x98 /* also called as FERR_FAT_FB_DIMM at datasheet */ +#define FERR_FAT_FBDCHAN (3<<28) /* channel index where the highest-order error occurred */ + +#define NERR_FAT_FBD 0x9c +#define FERR_NF_FBD 0xa0 /* also called as FERR_NFAT_FB_DIMM at datasheet */ + + /* Non-fatal error register */ +#define NERR_NF_FBD 0xa4 + + /* Enable error mask */ +#define EMASK_FBD 0xa8 + +#define ERR0_FBD 0xac +#define ERR1_FBD 0xb0 +#define ERR2_FBD 0xb4 +#define MCERR_FBD 0xb8 + + /* No OFFSETS for Device 16 Function 2 */ + +/* + * Device 21, + * Function 0: Memory Map Branch 0 + * + * Device 22, + * Function 0: Memory Map Branch 1 + */ + + /* OFFSETS for Function 0 */ +#define AMBPRESENT_0 0x64 +#define AMBPRESENT_1 0x66 +#define MTR0 0x80 +#define MTR1 0x82 +#define MTR2 0x84 +#define MTR3 0x86 + + /* OFFSETS for Function 1 */ +#define NRECFGLOG 0x74 +#define RECFGLOG 0x78 +#define NRECMEMA 0xbe +#define NRECMEMB 0xc0 +#define NRECFB_DIMMA 0xc4 +#define NRECFB_DIMMB 0xc8 +#define NRECFB_DIMMC 0xcc +#define NRECFB_DIMMD 0xd0 +#define NRECFB_DIMME 0xd4 +#define NRECFB_DIMMF 0xd8 +#define REDMEMA 0xdC +#define RECMEMA 0xf0 +#define RECMEMB 0xf4 +#define RECFB_DIMMA 0xf8 +#define RECFB_DIMMB 0xec +#define RECFB_DIMMC 0xf0 +#define RECFB_DIMMD 0xf4 +#define RECFB_DIMME 0xf8 +#define RECFB_DIMMF 0xfC + +/* + * Error indicator bits and masks + * Error masks are according with Table 5-17 of i5400 datasheet + */ + +enum error_mask { + EMASK_M1 = 1<<0, /* Memory Write error on non-redundant 
retry */ + EMASK_M2 = 1<<1, /* Memory or FB-DIMM configuration CRC read error */ + EMASK_M3 = 1<<2, /* Reserved */ + EMASK_M4 = 1<<3, /* Uncorrectable Data ECC on Replay */ + EMASK_M5 = 1<<4, /* Aliased Uncorrectable Non-Mirrored Demand Data ECC */ + EMASK_M6 = 1<<5, /* Unsupported on i5400 */ + EMASK_M7 = 1<<6, /* Aliased Uncorrectable Resilver- or Spare-Copy Data ECC */ + EMASK_M8 = 1<<7, /* Aliased Uncorrectable Patrol Data ECC */ + EMASK_M9 = 1<<8, /* Non-Aliased Uncorrectable Non-Mirrored Demand Data ECC */ + EMASK_M10 = 1<<9, /* Unsupported on i5400 */ + EMASK_M11 = 1<<10, /* Non-Aliased Uncorrectable Resilver- or Spare-Copy Data ECC */ + EMASK_M12 = 1<<11, /* Non-Aliased Uncorrectable Patrol Data ECC */ + EMASK_M13 = 1<<12, /* Memory Write error on first attempt */ + EMASK_M14 = 1<<13, /* FB-DIMM Configuration Write error on first attempt */ + EMASK_M15 = 1<<14, /* Memory or FB-DIMM configuration CRC read error */ + EMASK_M16 = 1<<15, /* Channel Failed-Over Occurred */ + EMASK_M17 = 1<<16, /* Correctable Non-Mirrored Demand Data ECC */ + EMASK_M18 = 1<<17, /* Unsupported on i5400 */ + EMASK_M19 = 1<<18, /* Correctable Resilver- or Spare-Copy Data ECC */ + EMASK_M20 = 1<<19, /* Correctable Patrol Data ECC */ + EMASK_M21 = 1<<20, /* FB-DIMM Northbound parity error on FB-DIMM Sync Status */ + EMASK_M22 = 1<<21, /* SPD protocol Error */ + EMASK_M23 = 1<<22, /* Non-Redundant Fast Reset Timeout */ + EMASK_M24 = 1<<23, /* Refresh error */ + EMASK_M25 = 1<<24, /* Memory Write error on redundant retry */ + EMASK_M26 = 1<<25, /* Redundant Fast Reset Timeout */ + EMASK_M27 = 1<<26, /* Correctable Counter Threshold Exceeded */ + EMASK_M28 = 1<<27, /* DIMM-Spare Copy Completed */ + EMASK_M29 = 1<<28, /* DIMM-Isolation Completed */ +}; + +/* + * Names to translate bit error into something useful + */ +static const char *error_name[] = { + [0] = "Memory Write error on non-redundant retry", + [1] = "Memory or FB-DIMM configuration CRC read error", + /* Reserved */ + [3] = 
"Uncorrectable Data ECC on Replay", + [4] = "Aliased Uncorrectable Non-Mirrored Demand Data ECC", + /* M6 Unsupported on i5400 */ + [6] = "Aliased Uncorrectable Resilver- or Spare-Copy Data ECC", + [7] = "Aliased Uncorrectable Patrol Data ECC", + [8] = "Non-Aliased Uncorrectable Non-Mirrored Demand Data ECC", + /* M10 Unsupported on i5400 */ + [10] = "Non-Aliased Uncorrectable Resilver- or Spare-Copy Data ECC", + [11] = "Non-Aliased Uncorrectable Patrol Data ECC", + [12] = "Memory Write error on first attempt", + [13] = "FB-DIMM Configuration Write error on first attempt", + [14] = "Memory or FB-DIMM configuration CRC read error", + [15] = "Channel Failed-Over Occurred", + [16] = "Correctable Non-Mirrored Demand Data ECC", + /* M18 Unsupported on i5400 */ + [18] = "Correctable Resilver- or Spare-Copy Data ECC", + [19] = "Correctable Patrol Data ECC", + [20] = "FB-DIMM Northbound parity error on FB-DIMM Sync Status", + [21] = "SPD protocol Error", + [22] = "Non-Redundant Fast Reset Timeout", + [23] = "Refresh error", + [24] = "Memory Write error on redundant retry", + [25] = "Redundant Fast Reset Timeout", + [26] = "Correctable Counter Threshold Exceeded", + [27] = "DIMM-Spare Copy Completed", + [28] = "DIMM-Isolation Completed", +}; + +/* Fatal errors */ +#define ERROR_FAT_MASK (EMASK_M1 | \ + EMASK_M2 | \ + EMASK_M23) + +/* Correctable errors */ +#define ERROR_NF_CORRECTABLE (EMASK_M27 | \ + EMASK_M20 | \ + EMASK_M19 | \ + EMASK_M18 | \ + EMASK_M17 | \ + EMASK_M16) +#define ERROR_NF_DIMM_SPARE (EMASK_M29 | \ + EMASK_M28) +#define ERROR_NF_SPD_PROTOCOL (EMASK_M22) +#define ERROR_NF_NORTH_CRC (EMASK_M21) + +/* Recoverable errors */ +#define ERROR_NF_RECOVERABLE (EMASK_M26 | \ + EMASK_M25 | \ + EMASK_M24 | \ + EMASK_M15 | \ + EMASK_M14 | \ + EMASK_M13 | \ + EMASK_M12 | \ + EMASK_M11 | \ + EMASK_M9 | \ + EMASK_M8 | \ + EMASK_M7 | \ + EMASK_M5) + +/* uncorrectable errors */ +#define ERROR_NF_UNCORRECTABLE (EMASK_M4) + +/* mask to all non-fatal errors */ +#define 
ERROR_NF_MASK (ERROR_NF_CORRECTABLE | \ + ERROR_NF_UNCORRECTABLE | \ + ERROR_NF_RECOVERABLE | \ + ERROR_NF_DIMM_SPARE | \ + ERROR_NF_SPD_PROTOCOL | \ + ERROR_NF_NORTH_CRC) + +/* + * Define error masks for the several registers + */ + +/* Enable all fatal and non fatal errors */ +#define ENABLE_EMASK_ALL (ERROR_FAT_MASK | ERROR_NF_MASK) + +/* mask for fatal error registers */ +#define FERR_FAT_MASK ERROR_FAT_MASK + +/* masks for non-fatal error register */ +static inline int to_nf_mask(unsigned int mask) +{ + return (mask & EMASK_M29) | (mask >> 3); +}; + +static inline int from_nf_ferr(unsigned int mask) +{ + return (mask & EMASK_M29) | /* Bit 28 */ + ((mask & ((1 << 28) - 1)) << 3); /* Bits 0 to 27, shifted back up by 3 (inverse of to_nf_mask); parenthesised because << binds tighter than & */ +}; + +#define FERR_NF_MASK to_nf_mask(ERROR_NF_MASK) +#define FERR_NF_CORRECTABLE to_nf_mask(ERROR_NF_CORRECTABLE) +#define FERR_NF_DIMM_SPARE to_nf_mask(ERROR_NF_DIMM_SPARE) +#define FERR_NF_SPD_PROTOCOL to_nf_mask(ERROR_NF_SPD_PROTOCOL) +#define FERR_NF_NORTH_CRC to_nf_mask(ERROR_NF_NORTH_CRC) +#define FERR_NF_RECOVERABLE to_nf_mask(ERROR_NF_RECOVERABLE) +#define FERR_NF_UNCORRECTABLE to_nf_mask(ERROR_NF_UNCORRECTABLE) + +/* Defines to extract the various fields from the + * MTRx - Memory Technology Registers + */ +#define MTR_DIMMS_PRESENT(mtr) ((mtr) & (1 << 10)) +#define MTR_DIMMS_ETHROTTLE(mtr) ((mtr) & (1 << 9)) +#define MTR_DRAM_WIDTH(mtr) (((mtr) & (1 << 8)) ? 8 : 4) +#define MTR_DRAM_BANKS(mtr) (((mtr) & (1 << 6)) ? 8 : 4) +#define MTR_DRAM_BANKS_ADDR_BITS(mtr) ((MTR_DRAM_BANKS(mtr) == 8) ? 3 : 2) +#define MTR_DIMM_RANK(mtr) (((mtr) >> 5) & 0x1) +#define MTR_DIMM_RANK_ADDR_BITS(mtr) (MTR_DIMM_RANK(mtr) ?
2 : 1) +#define MTR_DIMM_ROWS(mtr) (((mtr) >> 2) & 0x3) +#define MTR_DIMM_ROWS_ADDR_BITS(mtr) (MTR_DIMM_ROWS(mtr) + 13) +#define MTR_DIMM_COLS(mtr) ((mtr) & 0x3) +#define MTR_DIMM_COLS_ADDR_BITS(mtr) (MTR_DIMM_COLS(mtr) + 10) + +/* This applies to FERR_NF_FB-DIMM as well as FERR_FAT_FB-DIMM */ +static inline int extract_fbdchan_indx(u32 x) +{ + return (x>>28) & 0x3; +} + +#ifdef CONFIG_EDAC_DEBUG +/* MTR NUMROW */ +static const char *numrow_toString[] = { + "8,192 - 13 rows", + "16,384 - 14 rows", + "32,768 - 15 rows", + "65,536 - 16 rows" +}; + +/* MTR NUMCOL */ +static const char *numcol_toString[] = { + "1,024 - 10 columns", + "2,048 - 11 columns", + "4,096 - 12 columns", + "reserved" +}; +#endif + +/* Device name and register DID (Device ID) */ +struct i5400_dev_info { + const char *ctl_name; /* name for this device */ + u16 fsb_mapping_errors; /* DID for the branchmap,control */ +}; + +/* Table of devices attributes supported by this driver */ +static const struct i5400_dev_info i5400_devs[] = { + { + .ctl_name = "I5400", + .fsb_mapping_errors = PCI_DEVICE_ID_INTEL_5400_ERR, + }, +}; + +struct i5400_dimm_info { + int megabytes; /* size, 0 means not present */ + int dual_rank; +}; + +/* driver private data structure */ +struct i5400_pvt { + struct pci_dev *system_address; /* 16.0 */ + struct pci_dev *branchmap_werrors; /* 16.1 */ + struct pci_dev *fsb_error_regs; /* 16.2 */ + struct pci_dev *branch_0; /* 21.0 */ + struct pci_dev *branch_1; /* 22.0 */ + + u16 tolm; /* top of low memory */ + u64 ambase; /* AMB BAR */ + + u16 mir0, mir1; + + u16 b0_mtr[NUM_MTRS_PER_BRANCH]; /* Memory Technlogy Reg */ + u16 b0_ambpresent0; /* Branch 0, Channel 0 */ + u16 b0_ambpresent1; /* Brnach 0, Channel 1 */ + + u16 b1_mtr[NUM_MTRS_PER_BRANCH]; /* Memory Technlogy Reg */ + u16 b1_ambpresent0; /* Branch 1, Channel 8 */ + u16 b1_ambpresent1; /* Branch 1, Channel 1 */ + + /* DIMM information matrix, allocating architecture maximums */ + struct i5400_dimm_info 
dimm_info[MAX_CSROWS][MAX_CHANNELS]; + + /* Actual values for this controller */ + int maxch; /* Max channels */ + int maxdimmperch; /* Max DIMMs per channel */ +}; + +/* I5400 MCH error information retrieved from Hardware */ +struct i5400_error_info { + /* These registers are always read from the MC */ + u32 ferr_fat_fbd; /* First Errors Fatal */ + u32 nerr_fat_fbd; /* Next Errors Fatal */ + u32 ferr_nf_fbd; /* First Errors Non-Fatal */ + u32 nerr_nf_fbd; /* Next Errors Non-Fatal */ + + /* These registers are input ONLY if there was a Recoverable Error */ + u32 redmemb; /* Recoverable Mem Data Error log B */ + u16 recmema; /* Recoverable Mem Error log A */ + u32 recmemb; /* Recoverable Mem Error log B */ + + /* These registers are input ONLY if there was a Non-Rec Error */ + u16 nrecmema; /* Non-Recoverable Mem log A */ + u32 nrecmemb; /* Non-Recoverable Mem log B; 32 bits, decoded up to bit 31 by nrec_rdwr() */ + +}; + +/* note that nrec_rdwr changed from NRECMEMA to NRECMEMB between the 5000 and + 5400 better to use an inline function than a macro in this case */ +static inline int nrec_bank(struct i5400_error_info *info) +{ + return ((info->nrecmema) >> 12) & 0x7; +} +static inline int nrec_rank(struct i5400_error_info *info) +{ + return ((info->nrecmema) >> 8) & 0xf; +} +static inline int nrec_buf_id(struct i5400_error_info *info) +{ + return ((info->nrecmema)) & 0xff; +} +static inline int nrec_rdwr(struct i5400_error_info *info) +{ + return (info->nrecmemb) >> 31; +} +/* This applies to both NREC and REC string so it can be used with nrec_rdwr + and rec_rdwr */ +static inline const char *rdwr_str(int rdwr) +{ + return rdwr ?
"Write" : "Read"; +} +static inline int nrec_cas(struct i5400_error_info *info) +{ + return ((info->nrecmemb) >> 16) & 0x1fff; +} +static inline int nrec_ras(struct i5400_error_info *info) +{ + return (info->nrecmemb) & 0xffff; +} +static inline int rec_bank(struct i5400_error_info *info) +{ + return ((info->recmema) >> 12) & 0x7; +} +static inline int rec_rank(struct i5400_error_info *info) +{ + return ((info->recmema) >> 8) & 0xf; +} +static inline int rec_rdwr(struct i5400_error_info *info) +{ + return (info->recmemb) >> 31; +} +static inline int rec_cas(struct i5400_error_info *info) +{ + return ((info->recmemb) >> 16) & 0x1fff; +} +static inline int rec_ras(struct i5400_error_info *info) +{ + return (info->recmemb) & 0xffff; +} + +static struct edac_pci_ctl_info *i5400_pci; + +/* + * i5400_get_error_info Retrieve the hardware error information from + * the hardware and cache it in the 'info' + * structure + */ +static void i5400_get_error_info(struct mem_ctl_info *mci, + struct i5400_error_info *info) +{ + struct i5400_pvt *pvt; + u32 value; + + pvt = mci->pvt_info; + + /* read in the 1st FATAL error register */ + pci_read_config_dword(pvt->branchmap_werrors, FERR_FAT_FBD, &value); + + /* Mask only the bits that the doc says are valid + */ + value &= (FERR_FAT_FBDCHAN | FERR_FAT_MASK); + + /* If there is an error, then read in the + NEXT FATAL error register and the Memory Error Log Register A + */ + if (value & FERR_FAT_MASK) { + info->ferr_fat_fbd = value; + + /* harvest the various error data we need */ + pci_read_config_dword(pvt->branchmap_werrors, + NERR_FAT_FBD, &info->nerr_fat_fbd); + pci_read_config_word(pvt->branchmap_werrors, + NRECMEMA, &info->nrecmema); + pci_read_config_dword(pvt->branchmap_werrors, + NRECMEMB, &info->nrecmemb); /* dword read: rdwr is bit 31, cas is bits 28:16 */ + + /* Clear the error bits, by writing them back */ + pci_write_config_dword(pvt->branchmap_werrors, + FERR_FAT_FBD, value); + } else { + info->ferr_fat_fbd = 0; + info->nerr_fat_fbd = 0; + info->nrecmema = 0; +
info->nrecmemb = 0; + } + + /* read in the 1st NON-FATAL error register */ + pci_read_config_dword(pvt->branchmap_werrors, FERR_NF_FBD, &value); + + /* If there is an error, then read in the 1st NON-FATAL error + * register as well */ + if (value & FERR_NF_MASK) { + info->ferr_nf_fbd = value; + + /* harvest the various error data we need */ + pci_read_config_dword(pvt->branchmap_werrors, + NERR_NF_FBD, &info->nerr_nf_fbd); + pci_read_config_word(pvt->branchmap_werrors, + RECMEMA, &info->recmema); + pci_read_config_dword(pvt->branchmap_werrors, + RECMEMB, &info->recmemb); + pci_read_config_dword(pvt->branchmap_werrors, + REDMEMB, &info->redmemb); + + /* Clear the error bits, by writing them back */ + pci_write_config_dword(pvt->branchmap_werrors, + FERR_NF_FBD, value); + } else { + info->ferr_nf_fbd = 0; + info->nerr_nf_fbd = 0; + info->recmema = 0; + info->recmemb = 0; + info->redmemb = 0; + } +} + +/* + * i5400_proccess_non_recoverable_info(struct mem_ctl_info *mci, + * struct i5400_error_info *info, + * int handle_errors); + * + * handle the Intel FATAL and unrecoverable errors, if any + */ +static void i5400_proccess_non_recoverable_info(struct mem_ctl_info *mci, + struct i5400_error_info *info, + unsigned long allErrors) +{ + char msg[EDAC_MC_LABEL_LEN + 1 + 90 + 80]; + int branch; + int channel; + int bank; + int buf_id; + int rank; + int rdwr; + int ras, cas; + int errnum; + char *type = NULL; + + if (!allErrors) + return; /* if no error, return now */ + + if (allErrors & ERROR_FAT_MASK) + type = "FATAL"; + else if (allErrors & FERR_NF_UNCORRECTABLE) + type = "NON-FATAL uncorrected"; + else + type = "NON-FATAL recoverable"; + + /* ONLY ONE of the possible error bits will be set, as per the docs */ + + branch = extract_fbdchan_indx(info->ferr_fat_fbd); + channel = branch; + + /* Use the NON-Recoverable macros to extract data */ + bank = nrec_bank(info); + rank = nrec_rank(info); + buf_id = nrec_buf_id(info); + rdwr = nrec_rdwr(info); + ras = nrec_ras(info); + 
cas = nrec_cas(info); + + debugf0("\t\tCSROW= %d Channels= %d,%d (Branch= %d " + "DRAM Bank= %d Buffer ID = %d rdwr= %s ras= %d cas= %d)\n", + rank, channel, channel + 1, branch >> 1, bank, + buf_id, rdwr_str(rdwr), ras, cas); + + /* Only 1 bit will be on */ + errnum = find_first_bit(&allErrors, ARRAY_SIZE(error_name)); + + /* Form out message */ + snprintf(msg, sizeof(msg), + "%s (Branch=%d DRAM-Bank=%d Buffer ID = %d RDWR=%s " + "RAS=%d CAS=%d %s Err=0x%lx (%s))", + type, branch >> 1, bank, buf_id, rdwr_str(rdwr), ras, cas, + type, allErrors, error_name[errnum]); + + /* Call the helper to output message */ + edac_mc_handle_fbd_ue(mci, rank, channel, channel + 1, msg); +} + +/* + * i5400_process_fatal_error_info(struct mem_ctl_info *mci, + * struct i5400_error_info *info, + * int handle_errors); + * + * handle the Intel NON-FATAL errors, if any + */ +static void i5400_process_nonfatal_error_info(struct mem_ctl_info *mci, + struct i5400_error_info *info) +{ + char msg[EDAC_MC_LABEL_LEN + 1 + 90 + 80]; + unsigned long allErrors; + int branch; + int channel; + int bank; + int rank; + int rdwr; + int ras, cas; + int errnum; + + /* mask off the Error bits that are possible */ + allErrors = from_nf_ferr(info->ferr_nf_fbd & FERR_NF_MASK); + if (!allErrors) + return; /* if no error, return now */ + + /* ONLY ONE of the possible error bits will be set, as per the docs */ + + if (allErrors & (ERROR_NF_UNCORRECTABLE | ERROR_NF_RECOVERABLE)) { + i5400_proccess_non_recoverable_info(mci, info, allErrors); + return; + } + + /* Correctable errors */ + if (allErrors & ERROR_NF_CORRECTABLE) { + debugf0("\tCorrected bits= 0x%lx\n", allErrors); + + branch = extract_fbdchan_indx(info->ferr_nf_fbd); + + channel = 0; + if (REC_ECC_LOCATOR_ODD(info->redmemb)) + channel = 1; + + /* Convert channel to be based from zero, instead of + * from branch base of 0 */ + channel += branch; + + bank = rec_bank(info); + rank = rec_rank(info); + rdwr = rec_rdwr(info); + ras = rec_ras(info); + cas = 
rec_cas(info); + + /* Only 1 bit will be on */ + errnum = find_first_bit(&allErrors, ARRAY_SIZE(error_name)); + + debugf0("\t\tCSROW= %d Channel= %d (Branch %d " + "DRAM Bank= %d rdwr= %s ras= %d cas= %d)\n", + rank, channel, branch >> 1, bank, + rdwr_str(rdwr), ras, cas); + + /* Form out message */ + snprintf(msg, sizeof(msg), + "Corrected error (Branch=%d DRAM-Bank=%d RDWR=%s " + "RAS=%d CAS=%d, CE Err=0x%lx (%s))", + branch >> 1, bank, rdwr_str(rdwr), ras, cas, + allErrors, error_name[errnum]); + + /* Call the helper to output message */ + edac_mc_handle_fbd_ce(mci, rank, channel, msg); + + return; + } + + /* Miscelaneous errors */ + errnum = find_first_bit(&allErrors, ARRAY_SIZE(error_name)); + + branch = extract_fbdchan_indx(info->ferr_nf_fbd); + + i5400_mc_printk(mci, KERN_EMERG, + "Non-Fatal misc error (Branch=%d Err=%#lx (%s))", + branch >> 1, allErrors, error_name[errnum]); +} + +/* + * i5400_process_error_info Process the error info that is + * in the 'info' structure, previously retrieved from hardware + */ +static void i5400_process_error_info(struct mem_ctl_info *mci, + struct i5400_error_info *info) +{ u32 allErrors; + + /* First handle any fatal errors that occurred */ + allErrors = (info->ferr_fat_fbd & FERR_FAT_MASK); + i5400_proccess_non_recoverable_info(mci, info, allErrors); + + /* now handle any non-fatal errors that occurred */ + i5400_process_nonfatal_error_info(mci, info); +} + +/* + * i5400_clear_error Retrieve any error from the hardware + * but do NOT process that error. + * Used for 'clearing' out of previous errors + * Called by the Core module. + */ +static void i5400_clear_error(struct mem_ctl_info *mci) +{ + struct i5400_error_info info; + + i5400_get_error_info(mci, &info); +} + +/* + * i5400_check_error Retrieve and process errors reported by the + * hardware. Called by the Core module. 
+ */ +static void i5400_check_error(struct mem_ctl_info *mci) +{ + struct i5400_error_info info; + debugf4("MC%d: " __FILE__ ": %s()\n", mci->mc_idx, __func__); + i5400_get_error_info(mci, &info); + i5400_process_error_info(mci, &info); +} + +/* + * i5400_put_devices 'put' all the devices that we have + * reserved via 'get' + */ +static void i5400_put_devices(struct mem_ctl_info *mci) +{ + struct i5400_pvt *pvt; + + pvt = mci->pvt_info; + + /* Decrement usage count for devices */ + pci_dev_put(pvt->branch_1); + pci_dev_put(pvt->branch_0); + pci_dev_put(pvt->fsb_error_regs); + pci_dev_put(pvt->branchmap_werrors); +} + +/* + * i5400_get_devices Find and perform 'get' operation on the MCH's + * device/functions we want to reference for this driver + * + * Need to 'get' device 16 func 1 and func 2 + */ +static int i5400_get_devices(struct mem_ctl_info *mci, int dev_idx) +{ + struct i5400_pvt *pvt; + struct pci_dev *pdev; + + pvt = mci->pvt_info; + pvt->branchmap_werrors = NULL; + pvt->fsb_error_regs = NULL; + pvt->branch_0 = NULL; + pvt->branch_1 = NULL; + + /* Attempt to 'get' the MCH register we want */ + pdev = NULL; + while (!pvt->branchmap_werrors || !pvt->fsb_error_regs) { + pdev = pci_get_device(PCI_VENDOR_ID_INTEL, + PCI_DEVICE_ID_INTEL_5400_ERR, pdev); + if (!pdev) { + /* End of list, leave */ + i5400_printk(KERN_ERR, + "'system address,Process Bus' " + "device not found:" + "vendor 0x%x device 0x%x ERR funcs " + "(broken BIOS?)\n", + PCI_VENDOR_ID_INTEL, + PCI_DEVICE_ID_INTEL_5400_ERR); + goto error; + } + + /* Store device 16 funcs 1 and 2 */ + switch (PCI_FUNC(pdev->devfn)) { + case 1: + pvt->branchmap_werrors = pdev; + break; + case 2: + pvt->fsb_error_regs = pdev; + break; + } + } + + debugf1("System Address, processor bus- PCI Bus ID: %s %x:%x\n", + pci_name(pvt->system_address), + pvt->system_address->vendor, pvt->system_address->device); + debugf1("Branchmap, control and errors - PCI Bus ID: %s %x:%x\n", + pci_name(pvt->branchmap_werrors), + 
pvt->branchmap_werrors->vendor, pvt->branchmap_werrors->device); + debugf1("FSB Error Regs - PCI Bus ID: %s %x:%x\n", + pci_name(pvt->fsb_error_regs), + pvt->fsb_error_regs->vendor, pvt->fsb_error_regs->device); + + pvt->branch_0 = pci_get_device(PCI_VENDOR_ID_INTEL, + PCI_DEVICE_ID_INTEL_5400_FBD0, NULL); + if (!pvt->branch_0) { + i5400_printk(KERN_ERR, + "MC: 'BRANCH 0' device not found:" + "vendor 0x%x device 0x%x Func 0 (broken BIOS?)\n", + PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_5400_FBD0); + goto error; + } + + /* If this device claims to have more than 2 channels then + * fetch Branch 1's information + */ + if (pvt->maxch < CHANNELS_PER_BRANCH) + return 0; + + pvt->branch_1 = pci_get_device(PCI_VENDOR_ID_INTEL, + PCI_DEVICE_ID_INTEL_5400_FBD1, NULL); + if (!pvt->branch_1) { + i5400_printk(KERN_ERR, + "MC: 'BRANCH 1' device not found:" + "vendor 0x%x device 0x%x Func 0 " + "(broken BIOS?)\n", + PCI_VENDOR_ID_INTEL, + PCI_DEVICE_ID_INTEL_5400_FBD1); + goto error; + } + + return 0; + +error: + i5400_put_devices(mci); + return -ENODEV; +} + +/* + * determine_amb_present + * + * the information is contained in NUM_MTRS_PER_BRANCH different + * registers determining which of the NUM_MTRS_PER_BRANCH requires + * knowing which channel is in question + * + * 2 branches, each with 2 channels + * b0_ambpresent0 for channel '0' + * b0_ambpresent1 for channel '1' + * b1_ambpresent0 for channel '2' + * b1_ambpresent1 for channel '3' + */ +static int determine_amb_present_reg(struct i5400_pvt *pvt, int channel) +{ + int amb_present; + + if (channel < CHANNELS_PER_BRANCH) { + if (channel & 0x1) + amb_present = pvt->b0_ambpresent1; + else + amb_present = pvt->b0_ambpresent0; + } else { + if (channel & 0x1) + amb_present = pvt->b1_ambpresent1; + else + amb_present = pvt->b1_ambpresent0; + } + + return amb_present; +} + +/* + * determine_mtr(pvt, csrow, channel) + * + * return the proper MTR register as determine by the csrow and desired channel + */ +static int 
determine_mtr(struct i5400_pvt *pvt, int csrow, int channel) +{ + int mtr; + int n; + + /* There is one MTR for each slot pair of FB-DIMMs, + Each slot may have one or two ranks (2 csrows), + Each slot pair may be at branch 0 or branch 1. + So, csrow should be divided by eight + */ + n = csrow >> 3; + + if (n >= NUM_MTRS_PER_BRANCH) { + debugf0("ERROR: trying to access an invalid csrow: %d\n", + csrow); + return 0; + } + + if (channel < CHANNELS_PER_BRANCH) + mtr = pvt->b0_mtr[n]; + else + mtr = pvt->b1_mtr[n]; + + return mtr; +} + +/* + */ +static void decode_mtr(int slot_row, u16 mtr) +{ + int ans; + + ans = MTR_DIMMS_PRESENT(mtr); + + debugf2("\tMTR%d=0x%x: DIMMs are %s\n", slot_row, mtr, + ans ? "Present" : "NOT Present"); + if (!ans) + return; + + debugf2("\t\tWIDTH: x%d\n", MTR_DRAM_WIDTH(mtr)); + + debugf2("\t\tELECTRICAL THROTTLING is %s\n", + MTR_DIMMS_ETHROTTLE(mtr) ? "enabled" : "disabled"); + + debugf2("\t\tNUMBANK: %d bank(s)\n", MTR_DRAM_BANKS(mtr)); + debugf2("\t\tNUMRANK: %s\n", MTR_DIMM_RANK(mtr) ? 
"double" : "single"); + debugf2("\t\tNUMROW: %s\n", numrow_toString[MTR_DIMM_ROWS(mtr)]); + debugf2("\t\tNUMCOL: %s\n", numcol_toString[MTR_DIMM_COLS(mtr)]); +} + +static void handle_channel(struct i5400_pvt *pvt, int csrow, int channel, + struct i5400_dimm_info *dinfo) +{ + int mtr; + int amb_present_reg; + int addrBits; + + mtr = determine_mtr(pvt, csrow, channel); + if (MTR_DIMMS_PRESENT(mtr)) { + amb_present_reg = determine_amb_present_reg(pvt, channel); + + /* Determine if there is a DIMM present in this DIMM slot */ + if (amb_present_reg & (1 << (csrow >> 1))) { + dinfo->dual_rank = MTR_DIMM_RANK(mtr); + + if (!((dinfo->dual_rank == 0) && + ((csrow & 0x1) == 0x1))) { + /* Start with the number of bits for a Bank + * on the DRAM */ + addrBits = MTR_DRAM_BANKS_ADDR_BITS(mtr); + /* Add thenumber of ROW bits */ + addrBits += MTR_DIMM_ROWS_ADDR_BITS(mtr); + /* add the number of COLUMN bits */ + addrBits += MTR_DIMM_COLS_ADDR_BITS(mtr); + + addrBits += 6; /* add 64 bits per DIMM */ + addrBits -= 20; /* divide by 2^^20 */ + addrBits -= 3; /* 8 bits per bytes */ + + dinfo->megabytes = 1 << addrBits; + } + } + } +} + +/* + * calculate_dimm_size + * + * also will output a DIMM matrix map, if debug is enabled, for viewing + * how the DIMMs are populated + */ +static void calculate_dimm_size(struct i5400_pvt *pvt) +{ + struct i5400_dimm_info *dinfo; + int csrow, max_csrows; + char *p, *mem_buffer; + int space, n; + int channel; + + /* ================= Generate some debug output ================= */ + space = PAGE_SIZE; + mem_buffer = p = kmalloc(space, GFP_KERNEL); + if (p == NULL) { + i5400_printk(KERN_ERR, "MC: %s:%s() kmalloc() failed\n", + __FILE__, __func__); + return; + } + + /* Scan all the actual CSROWS (which is # of DIMMS * 2) + * and calculate the information for each DIMM + * Start with the highest csrow first, to display it first + * and work toward the 0th csrow + */ + max_csrows = pvt->maxdimmperch * 2; + for (csrow = max_csrows - 1; csrow >= 0; csrow--) 
{ + + /* on an odd csrow, first output a 'boundary' marker, + * then reset the message buffer */ + if (csrow & 0x1) { + n = snprintf(p, space, "---------------------------" + "--------------------------------"); + p += n; + space -= n; + debugf2("%s\n", mem_buffer); + p = mem_buffer; + space = PAGE_SIZE; + } + n = snprintf(p, space, "csrow %2d ", csrow); + p += n; + space -= n; + + for (channel = 0; channel < pvt->maxch; channel++) { + dinfo = &pvt->dimm_info[csrow][channel]; + handle_channel(pvt, csrow, channel, dinfo); + n = snprintf(p, space, "%4d MB | ", dinfo->megabytes); + p += n; + space -= n; + } + debugf2("%s\n", mem_buffer); + p = mem_buffer; + space = PAGE_SIZE; + } + + /* Output the last bottom 'boundary' marker */ + n = snprintf(p, space, "---------------------------" + "--------------------------------"); + p += n; + space -= n; + debugf2("%s\n", mem_buffer); + p = mem_buffer; + space = PAGE_SIZE; + + /* now output the 'channel' labels */ + n = snprintf(p, space, " "); + p += n; + space -= n; + for (channel = 0; channel < pvt->maxch; channel++) { + n = snprintf(p, space, "channel %d | ", channel); + p += n; + space -= n; + } + + /* output the last message and free buffer */ + debugf2("%s\n", mem_buffer); + kfree(mem_buffer); +} + +/* + * i5400_get_mc_regs read in the necessary registers and + * cache locally + * + * Fills in the private data members + */ +static void i5400_get_mc_regs(struct mem_ctl_info *mci) +{ + struct i5400_pvt *pvt; + u32 actual_tolm; + u16 limit; + int slot_row; + int maxch; + int maxdimmperch; + int way0, way1; + + pvt = mci->pvt_info; + + pci_read_config_dword(pvt->system_address, AMBASE, + (u32 *) &pvt->ambase); + pci_read_config_dword(pvt->system_address, AMBASE + sizeof(u32), + ((u32 *) &pvt->ambase) + 1); /* + 1 u32 element = second half of the u64; + sizeof(u32) would step 16 bytes, past ambase */ + + maxdimmperch = pvt->maxdimmperch; + maxch = pvt->maxch; + + debugf2("AMBASE= 0x%lx MAXCH= %d MAX-DIMM-Per-CH= %d\n", + (long unsigned int)pvt->ambase, pvt->maxch, pvt->maxdimmperch); + + /* Get the Branch
Map regs */ + pci_read_config_word(pvt->branchmap_werrors, TOLM, &pvt->tolm); + pvt->tolm >>= 12; + debugf2("\nTOLM (number of 256M regions) =%u (0x%x)\n", pvt->tolm, + pvt->tolm); + + actual_tolm = (u32) ((1000l * pvt->tolm) >> (30 - 28)); + debugf2("Actual TOLM byte addr=%u.%03u GB (0x%x)\n", + actual_tolm/1000, actual_tolm % 1000, pvt->tolm << 28); + + pci_read_config_word(pvt->branchmap_werrors, MIR0, &pvt->mir0); + pci_read_config_word(pvt->branchmap_werrors, MIR1, &pvt->mir1); + + /* Get the MIR[0-1] regs */ + limit = (pvt->mir0 >> 4) & 0x0fff; + way0 = pvt->mir0 & 0x1; + way1 = pvt->mir0 & 0x2; + debugf2("MIR0: limit= 0x%x WAY1= %u WAY0= %x\n", limit, way1, way0); + limit = (pvt->mir1 >> 4) & 0xfff; + way0 = pvt->mir1 & 0x1; + way1 = pvt->mir1 & 0x2; + debugf2("MIR1: limit= 0x%x WAY1= %u WAY0= %x\n", limit, way1, way0); + + /* Get the set of MTR[0-3] regs by each branch */ + for (slot_row = 0; slot_row < NUM_MTRS_PER_BRANCH; slot_row++) { + int where = MTR0 + (slot_row * sizeof(u32)); + + /* Branch 0 set of MTR registers */ + pci_read_config_word(pvt->branch_0, where, + &pvt->b0_mtr[slot_row]); + + debugf2("MTR%d where=0x%x B0 value=0x%x\n", slot_row, where, + pvt->b0_mtr[slot_row]); + + if (pvt->maxch < CHANNELS_PER_BRANCH) { + pvt->b1_mtr[slot_row] = 0; + continue; + } + + /* Branch 1 set of MTR registers */ + pci_read_config_word(pvt->branch_1, where, + &pvt->b1_mtr[slot_row]); + debugf2("MTR%d where=0x%x B1 value=0x%x\n", slot_row, where, + pvt->b1_mtr[slot_row]); + } + + /* Read and dump branch 0's MTRs */ + debugf2("\nMemory Technology Registers:\n"); + debugf2(" Branch 0:\n"); + for (slot_row = 0; slot_row < NUM_MTRS_PER_BRANCH; slot_row++) + decode_mtr(slot_row, pvt->b0_mtr[slot_row]); + + pci_read_config_word(pvt->branch_0, AMBPRESENT_0, + &pvt->b0_ambpresent0); + debugf2("\t\tAMB-Branch 0-present0 0x%x:\n", pvt->b0_ambpresent0); + pci_read_config_word(pvt->branch_0, AMBPRESENT_1, + &pvt->b0_ambpresent1); + debugf2("\t\tAMB-Branch 0-present1 
0x%x:\n", pvt->b0_ambpresent1); + + /* Only if we have 2 branchs (4 channels) */ + if (pvt->maxch < CHANNELS_PER_BRANCH) { + pvt->b1_ambpresent0 = 0; + pvt->b1_ambpresent1 = 0; + } else { + /* Read and dump branch 1's MTRs */ + debugf2(" Branch 1:\n"); + for (slot_row = 0; slot_row < NUM_MTRS_PER_BRANCH; slot_row++) + decode_mtr(slot_row, pvt->b1_mtr[slot_row]); + + pci_read_config_word(pvt->branch_1, AMBPRESENT_0, + &pvt->b1_ambpresent0); + debugf2("\t\tAMB-Branch 1-present0 0x%x:\n", + pvt->b1_ambpresent0); + pci_read_config_word(pvt->branch_1, AMBPRESENT_1, + &pvt->b1_ambpresent1); + debugf2("\t\tAMB-Branch 1-present1 0x%x:\n", + pvt->b1_ambpresent1); + } + + /* Go and determine the size of each DIMM and place in an + * orderly matrix */ + calculate_dimm_size(pvt); +} + +/* + * i5400_init_csrows Initialize the 'csrows' table within + * the mci control structure with the + * addressing of memory. + * + * return: + * 0 success + * 1 no actual memory found on this MC + */ +static int i5400_init_csrows(struct mem_ctl_info *mci) +{ + struct i5400_pvt *pvt; + struct csrow_info *p_csrow; + int empty, channel_count; + int max_csrows; + int mtr; + int csrow_megs; + int channel; + int csrow; + + pvt = mci->pvt_info; + + channel_count = pvt->maxch; + max_csrows = pvt->maxdimmperch * 2; + + empty = 1; /* Assume NO memory */ + + for (csrow = 0; csrow < max_csrows; csrow++) { + p_csrow = &mci->csrows[csrow]; + + p_csrow->csrow_idx = csrow; + + /* use branch 0 for the basis */ + mtr = determine_mtr(pvt, csrow, 0); + + /* if no DIMMS on this row, continue */ + if (!MTR_DIMMS_PRESENT(mtr)) + continue; + + /* FAKE OUT VALUES, FIXME */ + p_csrow->first_page = 0 + csrow * 20; + p_csrow->last_page = 9 + csrow * 20; + p_csrow->page_mask = 0xFFF; + + p_csrow->grain = 8; + + csrow_megs = 0; + for (channel = 0; channel < pvt->maxch; channel++) + csrow_megs += pvt->dimm_info[csrow][channel].megabytes; + + p_csrow->nr_pages = csrow_megs << 8; + + /* Assume DDR2 for now */ + p_csrow->mtype 
= MEM_FB_DDR2; + + /* ask what device type on this row */ + if (MTR_DRAM_WIDTH(mtr)) + p_csrow->dtype = DEV_X8; + else + p_csrow->dtype = DEV_X4; + + p_csrow->edac_mode = EDAC_S8ECD8ED; + + empty = 0; + } + + return empty; +} + +/* + * i5400_enable_error_reporting + * Turn on the memory reporting features of the hardware + */ +static void i5400_enable_error_reporting(struct mem_ctl_info *mci) +{ + struct i5400_pvt *pvt; + u32 fbd_error_mask; + + pvt = mci->pvt_info; + + /* Read the FBD Error Mask Register */ + pci_read_config_dword(pvt->branchmap_werrors, EMASK_FBD, + &fbd_error_mask); + + /* Enable with a '0' */ + fbd_error_mask &= ~(ENABLE_EMASK_ALL); + + pci_write_config_dword(pvt->branchmap_werrors, EMASK_FBD, + fbd_error_mask); +} + +/* + * i5400_get_dimm_and_channel_counts(pdev, &num_csrows, &num_channels) + * + * ask the device how many channels are present and how many CSROWS + * as well + */ +static void i5400_get_dimm_and_channel_counts(struct pci_dev *pdev, + int *num_dimms_per_channel, + int *num_channels) +{ + u8 value; + + /* Need to retrieve just how many channels and dimms per channel are + * supported on this memory controller + */ + pci_read_config_byte(pdev, MAXDIMMPERCH, &value); + *num_dimms_per_channel = (int)value * 2; + + pci_read_config_byte(pdev, MAXCH, &value); + *num_channels = (int)value; +} + +/* + * i5400_probe1 Probe for ONE instance of device to see if it is + * present. 
+ * return: + * 0 for FOUND a device + * < 0 for error code + */ +static int i5400_probe1(struct pci_dev *pdev, int dev_idx) +{ + struct mem_ctl_info *mci; + struct i5400_pvt *pvt; + int num_channels; + int num_dimms_per_channel; + int num_csrows; + + if (dev_idx >= ARRAY_SIZE(i5400_devs)) + return -EINVAL; + + debugf0("MC: " __FILE__ ": %s(), pdev bus %u dev=0x%x fn=0x%x\n", + __func__, + pdev->bus->number, + PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn)); + + /* We only are looking for func 0 of the set */ + if (PCI_FUNC(pdev->devfn) != 0) + return -ENODEV; + + /* Ask the devices for the number of CSROWS and CHANNELS so + * that we can calculate the memory resources, etc + * + * The Chipset will report what it can handle which will be greater + * or equal to what the motherboard manufacturer will implement. + * + * As we don't have a motherboard identification routine to determine + * actual number of slots/dimms per channel, we thus utilize the + * resource as specified by the chipset. Thus, we might have + * have more DIMMs per channel than actually on the mobo, but this + * allows the driver to support upto the chipset max, without + * some fancy mobo determination. 
+ */ + i5400_get_dimm_and_channel_counts(pdev, &num_dimms_per_channel, + &num_channels); + num_csrows = num_dimms_per_channel * 2; + + debugf0("MC: %s(): Number of - Channels= %d DIMMS= %d CSROWS= %d\n", + __func__, num_channels, num_dimms_per_channel, num_csrows); + + /* allocate a new MC control structure */ + mci = edac_mc_alloc(sizeof(*pvt), num_csrows, num_channels, 0); + + if (mci == NULL) + return -ENOMEM; + + debugf0("MC: " __FILE__ ": %s(): mci = %p\n", __func__, mci); + + mci->dev = &pdev->dev; /* record ptr to the generic device */ + + pvt = mci->pvt_info; + pvt->system_address = pdev; /* Record this device in our private */ + pvt->maxch = num_channels; + pvt->maxdimmperch = num_dimms_per_channel; + + /* 'get' the pci devices we want to reserve for our use */ + if (i5400_get_devices(mci, dev_idx)) + goto fail0; + + /* Time to get serious */ + i5400_get_mc_regs(mci); /* retrieve the hardware registers */ + + mci->mc_idx = 0; + mci->mtype_cap = MEM_FLAG_FB_DDR2; + mci->edac_ctl_cap = EDAC_FLAG_NONE; + mci->edac_cap = EDAC_FLAG_NONE; + mci->mod_name = "i5400_edac.c"; + mci->mod_ver = I5400_REVISION; + mci->ctl_name = i5400_devs[dev_idx].ctl_name; + mci->dev_name = pci_name(pdev); + mci->ctl_page_to_phys = NULL; + + /* Set the function pointer to an actual operation function */ + mci->edac_check = i5400_check_error; + + /* initialize the MC control structure 'csrows' table + * with the mapping and control information */ + if (i5400_init_csrows(mci)) { + debugf0("MC: Setting mci->edac_cap to EDAC_FLAG_NONE\n" + " because i5400_init_csrows() returned nonzero " + "value\n"); + mci->edac_cap = EDAC_FLAG_NONE; /* no csrows found */ + } else { + debugf1("MC: Enable error reporting now\n"); + i5400_enable_error_reporting(mci); + } + + /* add this new MC control structure to EDAC's list of MCs */ + if (edac_mc_add_mc(mci)) { + debugf0("MC: " __FILE__ + ": %s(): failed edac_mc_add_mc()\n", __func__); + /* FIXME: perhaps some code should go here that disables error + 
* reporting if we just enabled it + */ + goto fail1; + } + + i5400_clear_error(mci); + + /* allocating generic PCI control info */ + i5400_pci = edac_pci_create_generic_ctl(&pdev->dev, EDAC_MOD_STR); + if (!i5400_pci) { + printk(KERN_WARNING + "%s(): Unable to create PCI control\n", + __func__); + printk(KERN_WARNING + "%s(): PCI error report via EDAC not setup\n", + __func__); + } + + return 0; + + /* Error exit unwinding stack */ +fail1: + + i5400_put_devices(mci); + +fail0: + edac_mc_free(mci); + return -ENODEV; +} + +/* + * i5400_init_one constructor for one instance of device + * + * returns: + * negative on error + * count (>= 0) + */ +static int __devinit i5400_init_one(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + int rc; + + debugf0("MC: " __FILE__ ": %s()\n", __func__); + + /* wake up device */ + rc = pci_enable_device(pdev); + if (rc == -EIO) + return rc; + + /* now probe and enable the device */ + return i5400_probe1(pdev, id->driver_data); +} + +/* + * i5400_remove_one destructor for one instance of device + * + */ +static void __devexit i5400_remove_one(struct pci_dev *pdev) +{ + struct mem_ctl_info *mci; + + debugf0(__FILE__ ": %s()\n", __func__); + + if (i5400_pci) + edac_pci_release_generic_ctl(i5400_pci); + + mci = edac_mc_del_mc(&pdev->dev); + if (!mci) + return; + + /* retrieve references to resources, and free those resources */ + i5400_put_devices(mci); + + edac_mc_free(mci); +} + +/* + * pci_device_id table for which devices we are looking for + * + * The "E500P" device is the first device supported. + */ +static const struct pci_device_id i5400_pci_tbl[] __devinitdata = { + {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_5400_ERR)}, + {0,} /* 0 terminated list. 
*/ +}; + +MODULE_DEVICE_TABLE(pci, i5400_pci_tbl); + +/* + * i5400_driver pci_driver structure for this module + * + */ +static struct pci_driver i5400_driver = { + .name = "i5400_edac", + .probe = i5400_init_one, + .remove = __devexit_p(i5400_remove_one), + .id_table = i5400_pci_tbl, +}; + +/* + * i5400_init Module entry function + * Try to initialize this module for its devices + */ +static int __init i5400_init(void) +{ + int pci_rc; + + debugf2("MC: " __FILE__ ": %s()\n", __func__); + + /* Ensure that the OPSTATE is set correctly for POLL or NMI */ + opstate_init(); + + pci_rc = pci_register_driver(&i5400_driver); + + return (pci_rc < 0) ? pci_rc : 0; +} + +/* + * i5400_exit() Module exit function + * Unregister the driver + */ +static void __exit i5400_exit(void) +{ + debugf2("MC: " __FILE__ ": %s()\n", __func__); + pci_unregister_driver(&i5400_driver); +} + +module_init(i5400_init); +module_exit(i5400_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Ben Woodard <woodard@redhat.com>"); +MODULE_AUTHOR("Mauro Carvalho Chehab <mchehab@redhat.com>"); +MODULE_AUTHOR("Red Hat Inc. 
(http://www.redhat.com)"); +MODULE_DESCRIPTION("MC Driver for Intel I5400 memory controllers - " + I5400_REVISION); + +module_param(edac_op_state, int, 0444); +MODULE_PARM_DESC(edac_op_state, "EDAC Error Reporting state: 0=Poll,1=NMI"); diff --git a/drivers/edac/i82875p_edac.c b/drivers/edac/i82875p_edac.c index ebb037b7875..b2d83b95033 100644 --- a/drivers/edac/i82875p_edac.c +++ b/drivers/edac/i82875p_edac.c @@ -311,9 +311,7 @@ static int i82875p_setup_overfl_dev(struct pci_dev *pdev, } /* cache is irrelevant for PCI bus reads/writes */ - window = ioremap_nocache(pci_resource_start(dev, 0), - pci_resource_len(dev, 0)); - + window = pci_ioremap_bar(dev, 0); if (window == NULL) { i82875p_printk(KERN_ERR, "%s(): Failed to ioremap bar6\n", __func__); diff --git a/drivers/edac/mpc85xx_edac.c b/drivers/edac/mpc85xx_edac.c index 0cfcb2d075a..853ef37ec00 100644 --- a/drivers/edac/mpc85xx_edac.c +++ b/drivers/edac/mpc85xx_edac.c @@ -630,27 +630,22 @@ static int mpc85xx_l2_err_remove(struct of_device *op) } static struct of_device_id mpc85xx_l2_err_of_match[] = { - { - .compatible = "fsl,8540-l2-cache-controller", - }, - { - .compatible = "fsl,8541-l2-cache-controller", - }, - { - .compatible = "fsl,8544-l2-cache-controller", - }, - { - .compatible = "fsl,8548-l2-cache-controller", - }, - { - .compatible = "fsl,8555-l2-cache-controller", - }, - { - .compatible = "fsl,8568-l2-cache-controller", - }, - { - .compatible = "fsl,mpc8572-l2-cache-controller", - }, +/* deprecate the fsl,85.. forms in the future, 2.6.30? 
*/ + { .compatible = "fsl,8540-l2-cache-controller", }, + { .compatible = "fsl,8541-l2-cache-controller", }, + { .compatible = "fsl,8544-l2-cache-controller", }, + { .compatible = "fsl,8548-l2-cache-controller", }, + { .compatible = "fsl,8555-l2-cache-controller", }, + { .compatible = "fsl,8568-l2-cache-controller", }, + { .compatible = "fsl,mpc8536-l2-cache-controller", }, + { .compatible = "fsl,mpc8540-l2-cache-controller", }, + { .compatible = "fsl,mpc8541-l2-cache-controller", }, + { .compatible = "fsl,mpc8544-l2-cache-controller", }, + { .compatible = "fsl,mpc8548-l2-cache-controller", }, + { .compatible = "fsl,mpc8555-l2-cache-controller", }, + { .compatible = "fsl,mpc8560-l2-cache-controller", }, + { .compatible = "fsl,mpc8568-l2-cache-controller", }, + { .compatible = "fsl,mpc8572-l2-cache-controller", }, {}, }; @@ -967,27 +962,22 @@ static int mpc85xx_mc_err_remove(struct of_device *op) } static struct of_device_id mpc85xx_mc_err_of_match[] = { - { - .compatible = "fsl,8540-memory-controller", - }, - { - .compatible = "fsl,8541-memory-controller", - }, - { - .compatible = "fsl,8544-memory-controller", - }, - { - .compatible = "fsl,8548-memory-controller", - }, - { - .compatible = "fsl,8555-memory-controller", - }, - { - .compatible = "fsl,8568-memory-controller", - }, - { - .compatible = "fsl,mpc8572-memory-controller", - }, +/* deprecate the fsl,85.. forms in the future, 2.6.30? 
*/ + { .compatible = "fsl,8540-memory-controller", }, + { .compatible = "fsl,8541-memory-controller", }, + { .compatible = "fsl,8544-memory-controller", }, + { .compatible = "fsl,8548-memory-controller", }, + { .compatible = "fsl,8555-memory-controller", }, + { .compatible = "fsl,8568-memory-controller", }, + { .compatible = "fsl,mpc8536-memory-controller", }, + { .compatible = "fsl,mpc8540-memory-controller", }, + { .compatible = "fsl,mpc8541-memory-controller", }, + { .compatible = "fsl,mpc8544-memory-controller", }, + { .compatible = "fsl,mpc8548-memory-controller", }, + { .compatible = "fsl,mpc8555-memory-controller", }, + { .compatible = "fsl,mpc8560-memory-controller", }, + { .compatible = "fsl,mpc8568-memory-controller", }, + { .compatible = "fsl,mpc8572-memory-controller", }, {}, }; diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c index 78b989d202a..d76adfea5df 100644 --- a/drivers/firmware/dmi_scan.c +++ b/drivers/firmware/dmi_scan.c @@ -468,8 +468,8 @@ const char *dmi_get_system_info(int field) EXPORT_SYMBOL(dmi_get_system_info); /** - * dmi_name_in_serial - Check if string is in the DMI product serial - * information. + * dmi_name_in_serial - Check if string is in the DMI product serial information + * @str: string to check for */ int dmi_name_in_serial(const char *str) { @@ -585,6 +585,8 @@ EXPORT_SYMBOL_GPL(dmi_walk); /** * dmi_match - compare a string to the dmi field (if exists) + * @f: DMI field identifier + * @str: string to compare the DMI field to * * Returns true if the requested field equals to the str (including NULL). */ diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index 48f49d93d24..3d2565441b3 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -95,7 +95,7 @@ config GPIO_MAX732X number for these GPIOs. 
config GPIO_PCA953X - tristate "PCA953x, PCA955x, and MAX7310 I/O ports" + tristate "PCA953x, PCA955x, TCA64xx, and MAX7310 I/O ports" depends on I2C help Say yes here to provide access to several register-oriented @@ -104,9 +104,10 @@ config GPIO_PCA953X 4 bits: pca9536, pca9537 - 8 bits: max7310, pca9534, pca9538, pca9554, pca9557 + 8 bits: max7310, pca9534, pca9538, pca9554, pca9557, + tca6408 - 16 bits: pca9535, pca9539, pca9555 + 16 bits: pca9535, pca9539, pca9555, tca6416 This driver can also be built as a module. If so, the module will be called pca953x. diff --git a/drivers/gpio/pca953x.c b/drivers/gpio/pca953x.c index 9ceeb89f132..37f35388a2a 100644 --- a/drivers/gpio/pca953x.c +++ b/drivers/gpio/pca953x.c @@ -33,7 +33,12 @@ static const struct i2c_device_id pca953x_id[] = { { "pca9554", 8, }, { "pca9555", 16, }, { "pca9557", 8, }, + { "max7310", 8, }, + { "pca6107", 8, }, + { "tca6408", 8, }, + { "tca6416", 16, }, + /* NYET: { "tca6424", 24, }, */ { } }; MODULE_DEVICE_TABLE(i2c, pca953x_id); @@ -47,9 +52,6 @@ struct pca953x_chip { struct gpio_chip gpio_chip; }; -/* NOTE: we can't currently rely on fault codes to come from SMBus - * calls, so we map all errors to EIO here and return zero otherwise. 
- */ static int pca953x_write_reg(struct pca953x_chip *chip, int reg, uint16_t val) { int ret; @@ -61,7 +63,7 @@ static int pca953x_write_reg(struct pca953x_chip *chip, int reg, uint16_t val) if (ret < 0) { dev_err(&chip->client->dev, "failed writing register\n"); - return -EIO; + return ret; } return 0; @@ -78,7 +80,7 @@ static int pca953x_read_reg(struct pca953x_chip *chip, int reg, uint16_t *val) if (ret < 0) { dev_err(&chip->client->dev, "failed reading register\n"); - return -EIO; + return ret; } *val = (uint16_t)ret; diff --git a/drivers/gpio/twl4030-gpio.c b/drivers/gpio/twl4030-gpio.c index 37d3eec8730..afad1479214 100644 --- a/drivers/gpio/twl4030-gpio.c +++ b/drivers/gpio/twl4030-gpio.c @@ -202,37 +202,6 @@ static int twl4030_get_gpio_datain(int gpio) return ret; } -/* - * Configure debounce timing value for a GPIO pin on TWL4030 - */ -int twl4030_set_gpio_debounce(int gpio, int enable) -{ - u8 d_bnk = gpio >> 3; - u8 d_msk = BIT(gpio & 0x7); - u8 reg = 0; - u8 base = 0; - int ret = 0; - - if (unlikely((gpio >= TWL4030_GPIO_MAX) - || !(gpio_usage_count & BIT(gpio)))) - return -EPERM; - - base = REG_GPIO_DEBEN1 + d_bnk; - mutex_lock(&gpio_lock); - ret = gpio_twl4030_read(base); - if (ret >= 0) { - if (enable) - reg = ret | d_msk; - else - reg = ret & ~d_msk; - - ret = gpio_twl4030_write(base, reg); - } - mutex_unlock(&gpio_lock); - return ret; -} -EXPORT_SYMBOL(twl4030_set_gpio_debounce); - /*----------------------------------------------------------------------*/ static int twl_request(struct gpio_chip *chip, unsigned offset) @@ -405,6 +374,23 @@ static int __devinit gpio_twl4030_pulls(u32 ups, u32 downs) REG_GPIOPUPDCTR1, 5); } +static int __devinit gpio_twl4030_debounce(u32 debounce, u8 mmc_cd) +{ + u8 message[4]; + + /* 30 msec of debouncing is always used for MMC card detect, + * and is optional for everything else. 
+ */ + message[1] = (debounce & 0xff) | (mmc_cd & 0x03); + debounce >>= 8; + message[2] = (debounce & 0xff); + debounce >>= 8; + message[3] = (debounce & 0x03); + + return twl4030_i2c_write(TWL4030_MODULE_GPIO, message, + REG_GPIO_DEBEN1, 3); +} + static int gpio_twl4030_remove(struct platform_device *pdev); static int __devinit gpio_twl4030_probe(struct platform_device *pdev) @@ -439,6 +425,12 @@ no_irqs: pdata->pullups, pdata->pulldowns, ret); + ret = gpio_twl4030_debounce(pdata->debounce, pdata->mmc_cd); + if (ret) + dev_dbg(&pdev->dev, "debounce %.03x %.01x --> %d\n", + pdata->debounce, pdata->mmc_cd, + ret); + twl_gpiochip.base = pdata->gpio_base; twl_gpiochip.ngpio = TWL4030_GPIO_MAX; twl_gpiochip.dev = &pdev->dev; diff --git a/drivers/gpu/drm/drm_fops.c b/drivers/gpu/drm/drm_fops.c index 3733e36d135..b06a5371585 100644 --- a/drivers/gpu/drm/drm_fops.c +++ b/drivers/gpu/drm/drm_fops.c @@ -183,6 +183,10 @@ int drm_stub_open(struct inode *inode, struct file *filp) old_fops = filp->f_op; filp->f_op = fops_get(&dev->driver->fops); + if (filp->f_op == NULL) { + filp->f_op = old_fops; + goto out; + } if (filp->f_op->open && (err = filp->f_op->open(inode, filp))) { fops_put(filp->f_op); filp->f_op = fops_get(old_fops); diff --git a/drivers/hwmon/adt7462.c b/drivers/hwmon/adt7462.c index 66107b4dc12..1852f27bac5 100644 --- a/drivers/hwmon/adt7462.c +++ b/drivers/hwmon/adt7462.c @@ -204,8 +204,6 @@ I2C_CLIENT_INSMOD_1(adt7462); #define MASK_AND_SHIFT(value, prefix) \ (((value) & prefix##_MASK) >> prefix##_SHIFT) -#define ROUND_DIV(x, divisor) (((x) + ((divisor) / 2)) / (divisor)) - struct adt7462_data { struct device *hwmon_dev; struct attribute_group attrs; @@ -840,7 +838,7 @@ static ssize_t set_temp_min(struct device *dev, if (strict_strtol(buf, 10, &temp) || !temp_enabled(data, attr->index)) return -EINVAL; - temp = ROUND_DIV(temp, 1000) + 64; + temp = DIV_ROUND_CLOSEST(temp, 1000) + 64; temp = SENSORS_LIMIT(temp, 0, 255); mutex_lock(&data->lock); @@ -878,7 +876,7 
@@ static ssize_t set_temp_max(struct device *dev, if (strict_strtol(buf, 10, &temp) || !temp_enabled(data, attr->index)) return -EINVAL; - temp = ROUND_DIV(temp, 1000) + 64; + temp = DIV_ROUND_CLOSEST(temp, 1000) + 64; temp = SENSORS_LIMIT(temp, 0, 255); mutex_lock(&data->lock); @@ -943,7 +941,7 @@ static ssize_t set_volt_max(struct device *dev, return -EINVAL; temp *= 1000; /* convert mV to uV */ - temp = ROUND_DIV(temp, x); + temp = DIV_ROUND_CLOSEST(temp, x); temp = SENSORS_LIMIT(temp, 0, 255); mutex_lock(&data->lock); @@ -985,7 +983,7 @@ static ssize_t set_volt_min(struct device *dev, return -EINVAL; temp *= 1000; /* convert mV to uV */ - temp = ROUND_DIV(temp, x); + temp = DIV_ROUND_CLOSEST(temp, x); temp = SENSORS_LIMIT(temp, 0, 255); mutex_lock(&data->lock); @@ -1250,7 +1248,7 @@ static ssize_t set_pwm_hyst(struct device *dev, if (strict_strtol(buf, 10, &temp)) return -EINVAL; - temp = ROUND_DIV(temp, 1000); + temp = DIV_ROUND_CLOSEST(temp, 1000); temp = SENSORS_LIMIT(temp, 0, 15); /* package things up */ @@ -1337,7 +1335,7 @@ static ssize_t set_pwm_tmin(struct device *dev, if (strict_strtol(buf, 10, &temp)) return -EINVAL; - temp = ROUND_DIV(temp, 1000) + 64; + temp = DIV_ROUND_CLOSEST(temp, 1000) + 64; temp = SENSORS_LIMIT(temp, 0, 255); mutex_lock(&data->lock); diff --git a/drivers/hwmon/adt7470.c b/drivers/hwmon/adt7470.c index 1311a595147..633e1a1e9d7 100644 --- a/drivers/hwmon/adt7470.c +++ b/drivers/hwmon/adt7470.c @@ -28,6 +28,7 @@ #include <linux/mutex.h> #include <linux/delay.h> #include <linux/log2.h> +#include <linux/kthread.h> /* Addresses to scan */ static const unsigned short normal_i2c[] = { 0x2C, 0x2E, 0x2F, I2C_CLIENT_END }; @@ -74,6 +75,7 @@ I2C_CLIENT_INSMOD_1(adt7470); #define ADT7470_REG_PWM12_CFG 0x68 #define ADT7470_PWM2_AUTO_MASK 0x40 #define ADT7470_PWM1_AUTO_MASK 0x80 +#define ADT7470_PWM_AUTO_MASK 0xC0 #define ADT7470_REG_PWM34_CFG 0x69 #define ADT7470_PWM3_AUTO_MASK 0x40 #define ADT7470_PWM4_AUTO_MASK 0x80 @@ -128,8 +130,11 @@ 
I2C_CLIENT_INSMOD_1(adt7470); /* How often do we reread sensor limit values? (In jiffies) */ #define LIMIT_REFRESH_INTERVAL (60 * HZ) -/* sleep 1s while gathering temperature data */ -#define TEMP_COLLECTION_TIME 1000 +/* Wait at least 200ms per sensor for 10 sensors */ +#define TEMP_COLLECTION_TIME 2000 + +/* auto update thing won't fire more than every 2s */ +#define AUTO_UPDATE_INTERVAL 2000 /* datasheet says to divide this number by the fan reading to get fan rpm */ #define FAN_PERIOD_TO_RPM(x) ((90000 * 60) / (x)) @@ -137,8 +142,6 @@ I2C_CLIENT_INSMOD_1(adt7470); #define FAN_PERIOD_INVALID 65535 #define FAN_DATA_VALID(x) ((x) && (x) != FAN_PERIOD_INVALID) -#define ROUND_DIV(x, divisor) (((x) + ((divisor) / 2)) / (divisor)) - struct adt7470_data { struct device *hwmon_dev; struct attribute_group attrs; @@ -148,6 +151,9 @@ struct adt7470_data { unsigned long sensors_last_updated; /* In jiffies */ unsigned long limits_last_updated; /* In jiffies */ + int num_temp_sensors; /* -1 = probe */ + int temperatures_probed; + s8 temp[ADT7470_TEMP_COUNT]; s8 temp_min[ADT7470_TEMP_COUNT]; s8 temp_max[ADT7470_TEMP_COUNT]; @@ -163,6 +169,10 @@ struct adt7470_data { u8 pwm_min[ADT7470_PWM_COUNT]; s8 pwm_tmin[ADT7470_PWM_COUNT]; u8 pwm_auto_temp[ADT7470_PWM_COUNT]; + + struct task_struct *auto_update; + struct completion auto_update_stop; + unsigned int auto_update_interval; }; static int adt7470_probe(struct i2c_client *client, @@ -220,40 +230,126 @@ static void adt7470_init_client(struct i2c_client *client) } } -static struct adt7470_data *adt7470_update_device(struct device *dev) +/* Probe for temperature sensors. 
Assumes lock is held */ +static int adt7470_read_temperatures(struct i2c_client *client, + struct adt7470_data *data) { - struct i2c_client *client = to_i2c_client(dev); - struct adt7470_data *data = i2c_get_clientdata(client); - unsigned long local_jiffies = jiffies; - u8 cfg; + unsigned long res; int i; + u8 cfg, pwm[4], pwm_cfg[2]; - mutex_lock(&data->lock); - if (time_before(local_jiffies, data->sensors_last_updated + - SENSOR_REFRESH_INTERVAL) - && data->sensors_valid) - goto no_sensor_update; + /* save pwm[1-4] config register */ + pwm_cfg[0] = i2c_smbus_read_byte_data(client, ADT7470_REG_PWM_CFG(0)); + pwm_cfg[1] = i2c_smbus_read_byte_data(client, ADT7470_REG_PWM_CFG(2)); + + /* set manual pwm to whatever it is set to now */ + for (i = 0; i < ADT7470_FAN_COUNT; i++) + pwm[i] = i2c_smbus_read_byte_data(client, ADT7470_REG_PWM(i)); + + /* put pwm in manual mode */ + i2c_smbus_write_byte_data(client, ADT7470_REG_PWM_CFG(0), + pwm_cfg[0] & ~(ADT7470_PWM_AUTO_MASK)); + i2c_smbus_write_byte_data(client, ADT7470_REG_PWM_CFG(2), + pwm_cfg[1] & ~(ADT7470_PWM_AUTO_MASK)); + + /* write pwm control to whatever it was */ + for (i = 0; i < ADT7470_FAN_COUNT; i++) + i2c_smbus_write_byte_data(client, ADT7470_REG_PWM(i), pwm[i]); /* start reading temperature sensors */ cfg = i2c_smbus_read_byte_data(client, ADT7470_REG_CFG); cfg |= 0x80; i2c_smbus_write_byte_data(client, ADT7470_REG_CFG, cfg); - /* - * Delay is 200ms * number of tmp05 sensors. Too bad - * there's no way to figure out how many are connected. - * For now, assume 1s will work. - */ - msleep(TEMP_COLLECTION_TIME); + /* Delay is 200ms * number of temp sensors. */ + res = msleep_interruptible((data->num_temp_sensors >= 0 ? 
+ data->num_temp_sensors * 200 : + TEMP_COLLECTION_TIME)); /* done reading temperature sensors */ cfg = i2c_smbus_read_byte_data(client, ADT7470_REG_CFG); cfg &= ~0x80; i2c_smbus_write_byte_data(client, ADT7470_REG_CFG, cfg); - for (i = 0; i < ADT7470_TEMP_COUNT; i++) + /* restore pwm[1-4] config registers */ + i2c_smbus_write_byte_data(client, ADT7470_REG_PWM_CFG(0), pwm_cfg[0]); + i2c_smbus_write_byte_data(client, ADT7470_REG_PWM_CFG(2), pwm_cfg[1]); + + if (res) { + printk(KERN_ERR "ha ha, interrupted"); + return -EAGAIN; + } + + /* Only count fans if we have to */ + if (data->num_temp_sensors >= 0) + return 0; + + for (i = 0; i < ADT7470_TEMP_COUNT; i++) { data->temp[i] = i2c_smbus_read_byte_data(client, ADT7470_TEMP_REG(i)); + if (data->temp[i]) + data->num_temp_sensors = i + 1; + } + data->temperatures_probed = 1; + return 0; +} + +static int adt7470_update_thread(void *p) +{ + struct i2c_client *client = p; + struct adt7470_data *data = i2c_get_clientdata(client); + + while (!kthread_should_stop()) { + mutex_lock(&data->lock); + adt7470_read_temperatures(client, data); + mutex_unlock(&data->lock); + if (kthread_should_stop()) + break; + msleep_interruptible(data->auto_update_interval); + } + + complete_all(&data->auto_update_stop); + return 0; +} + +static struct adt7470_data *adt7470_update_device(struct device *dev) +{ + struct i2c_client *client = to_i2c_client(dev); + struct adt7470_data *data = i2c_get_clientdata(client); + unsigned long local_jiffies = jiffies; + u8 cfg; + int i; + int need_sensors = 1; + int need_limits = 1; + + /* + * Figure out if we need to update the shadow registers. + * Lockless means that we may occasionally report out of + * date data. 
+ */ + if (time_before(local_jiffies, data->sensors_last_updated + + SENSOR_REFRESH_INTERVAL) && + data->sensors_valid) + need_sensors = 0; + + if (time_before(local_jiffies, data->limits_last_updated + + LIMIT_REFRESH_INTERVAL) && + data->limits_valid) + need_limits = 0; + + if (!need_sensors && !need_limits) + return data; + + mutex_lock(&data->lock); + if (!need_sensors) + goto no_sensor_update; + + if (!data->temperatures_probed) + adt7470_read_temperatures(client, data); + else + for (i = 0; i < ADT7470_TEMP_COUNT; i++) + data->temp[i] = i2c_smbus_read_byte_data(client, + ADT7470_TEMP_REG(i)); for (i = 0; i < ADT7470_FAN_COUNT; i++) data->fan[i] = adt7470_read_word_data(client, @@ -302,9 +398,7 @@ static struct adt7470_data *adt7470_update_device(struct device *dev) data->sensors_valid = 1; no_sensor_update: - if (time_before(local_jiffies, data->limits_last_updated + - LIMIT_REFRESH_INTERVAL) - && data->limits_valid) + if (!need_limits) goto out; for (i = 0; i < ADT7470_TEMP_COUNT; i++) { @@ -338,6 +432,66 @@ out: return data; } +static ssize_t show_auto_update_interval(struct device *dev, + struct device_attribute *devattr, + char *buf) +{ + struct adt7470_data *data = adt7470_update_device(dev); + return sprintf(buf, "%d\n", data->auto_update_interval); +} + +static ssize_t set_auto_update_interval(struct device *dev, + struct device_attribute *devattr, + const char *buf, + size_t count) +{ + struct i2c_client *client = to_i2c_client(dev); + struct adt7470_data *data = i2c_get_clientdata(client); + long temp; + + if (strict_strtol(buf, 10, &temp)) + return -EINVAL; + + temp = SENSORS_LIMIT(temp, 0, 60000); + + mutex_lock(&data->lock); + data->auto_update_interval = temp; + mutex_unlock(&data->lock); + + return count; +} + +static ssize_t show_num_temp_sensors(struct device *dev, + struct device_attribute *devattr, + char *buf) +{ + struct adt7470_data *data = adt7470_update_device(dev); + return sprintf(buf, "%d\n", data->num_temp_sensors); +} + +static 
ssize_t set_num_temp_sensors(struct device *dev, + struct device_attribute *devattr, + const char *buf, + size_t count) +{ + struct i2c_client *client = to_i2c_client(dev); + struct adt7470_data *data = i2c_get_clientdata(client); + long temp; + + if (strict_strtol(buf, 10, &temp)) + return -EINVAL; + + temp = SENSORS_LIMIT(temp, -1, 10); + + mutex_lock(&data->lock); + data->num_temp_sensors = temp; + if (temp < 0) + data->temperatures_probed = 0; + mutex_unlock(&data->lock); + + return count; +} + static ssize_t show_temp_min(struct device *dev, struct device_attribute *devattr, char *buf) @@ -360,7 +514,7 @@ static ssize_t set_temp_min(struct device *dev, if (strict_strtol(buf, 10, &temp)) return -EINVAL; - temp = ROUND_DIV(temp, 1000); + temp = DIV_ROUND_CLOSEST(temp, 1000); temp = SENSORS_LIMIT(temp, 0, 255); mutex_lock(&data->lock); @@ -394,7 +548,7 @@ static ssize_t set_temp_max(struct device *dev, if (strict_strtol(buf, 10, &temp)) return -EINVAL; - temp = ROUND_DIV(temp, 1000); + temp = DIV_ROUND_CLOSEST(temp, 1000); temp = SENSORS_LIMIT(temp, 0, 255); mutex_lock(&data->lock); @@ -671,7 +825,7 @@ static ssize_t set_pwm_tmin(struct device *dev, if (strict_strtol(buf, 10, &temp)) return -EINVAL; - temp = ROUND_DIV(temp, 1000); + temp = DIV_ROUND_CLOSEST(temp, 1000); temp = SENSORS_LIMIT(temp, 0, 255); mutex_lock(&data->lock); @@ -804,6 +958,10 @@ static ssize_t show_alarm(struct device *dev, } static DEVICE_ATTR(alarm_mask, S_IRUGO, show_alarm_mask, NULL); +static DEVICE_ATTR(num_temp_sensors, S_IWUSR | S_IRUGO, show_num_temp_sensors, + set_num_temp_sensors); +static DEVICE_ATTR(auto_update_interval, S_IWUSR | S_IRUGO, + show_auto_update_interval, set_auto_update_interval); static SENSOR_DEVICE_ATTR(temp1_max, S_IWUSR | S_IRUGO, show_temp_max, set_temp_max, 0); @@ -976,6 +1134,8 @@ static SENSOR_DEVICE_ATTR(pwm4_auto_channels_temp, S_IWUSR | S_IRUGO, static struct attribute *adt7470_attr[] = { &dev_attr_alarm_mask.attr, + &dev_attr_num_temp_sensors.attr, + 
&dev_attr_auto_update_interval.attr, &sensor_dev_attr_temp1_max.dev_attr.attr, &sensor_dev_attr_temp2_max.dev_attr.attr, &sensor_dev_attr_temp3_max.dev_attr.attr, @@ -1108,6 +1268,9 @@ static int adt7470_probe(struct i2c_client *client, goto exit; } + data->num_temp_sensors = -1; + data->auto_update_interval = AUTO_UPDATE_INTERVAL; + i2c_set_clientdata(client, data); mutex_init(&data->lock); @@ -1127,8 +1290,16 @@ static int adt7470_probe(struct i2c_client *client, goto exit_remove; } + init_completion(&data->auto_update_stop); + data->auto_update = kthread_run(adt7470_update_thread, client, + dev_name(data->hwmon_dev)); + if (IS_ERR(data->auto_update)) + goto exit_unregister; + return 0; +exit_unregister: + hwmon_device_unregister(data->hwmon_dev); exit_remove: sysfs_remove_group(&client->dev.kobj, &data->attrs); exit_free: @@ -1141,6 +1312,8 @@ static int adt7470_remove(struct i2c_client *client) { struct adt7470_data *data = i2c_get_clientdata(client); + kthread_stop(data->auto_update); + wait_for_completion(&data->auto_update_stop); hwmon_device_unregister(data->hwmon_dev); sysfs_remove_group(&client->dev.kobj, &data->attrs); kfree(data); diff --git a/drivers/hwmon/adt7473.c b/drivers/hwmon/adt7473.c index 18aa30866a6..0a6ce2367b4 100644 --- a/drivers/hwmon/adt7473.c +++ b/drivers/hwmon/adt7473.c @@ -129,8 +129,6 @@ I2C_CLIENT_INSMOD_1(adt7473); #define FAN_PERIOD_INVALID 65535 #define FAN_DATA_VALID(x) ((x) && (x) != FAN_PERIOD_INVALID) -#define ROUND_DIV(x, divisor) (((x) + ((divisor) / 2)) / (divisor)) - struct adt7473_data { struct device *hwmon_dev; struct attribute_group attrs; @@ -459,7 +457,7 @@ static ssize_t set_temp_min(struct device *dev, if (strict_strtol(buf, 10, &temp)) return -EINVAL; - temp = ROUND_DIV(temp, 1000); + temp = DIV_ROUND_CLOSEST(temp, 1000); temp = encode_temp(data->temp_twos_complement, temp); mutex_lock(&data->lock); @@ -495,7 +493,7 @@ static ssize_t set_temp_max(struct device *dev, if (strict_strtol(buf, 10, &temp)) return 
-EINVAL; - temp = ROUND_DIV(temp, 1000); + temp = DIV_ROUND_CLOSEST(temp, 1000); temp = encode_temp(data->temp_twos_complement, temp); mutex_lock(&data->lock); @@ -720,7 +718,7 @@ static ssize_t set_temp_tmax(struct device *dev, if (strict_strtol(buf, 10, &temp)) return -EINVAL; - temp = ROUND_DIV(temp, 1000); + temp = DIV_ROUND_CLOSEST(temp, 1000); temp = encode_temp(data->temp_twos_complement, temp); mutex_lock(&data->lock); @@ -756,7 +754,7 @@ static ssize_t set_temp_tmin(struct device *dev, if (strict_strtol(buf, 10, &temp)) return -EINVAL; - temp = ROUND_DIV(temp, 1000); + temp = DIV_ROUND_CLOSEST(temp, 1000); temp = encode_temp(data->temp_twos_complement, temp); mutex_lock(&data->lock); diff --git a/drivers/hwmon/applesmc.c b/drivers/hwmon/applesmc.c index 086c2a5cef0..dca47a591ba 100644 --- a/drivers/hwmon/applesmc.c +++ b/drivers/hwmon/applesmc.c @@ -131,6 +131,10 @@ static const char* temperature_sensors_sets[][36] = { /* Set 14: iMac 6,1 */ { "TA0P", "TC0D", "TC0H", "TC0P", "TG0D", "TG0H", "TG0P", "TH0P", "TO0P", "Tp0P", NULL }, +/* Set 15: MacBook Air 2,1 */ + { "TB0T", "TB1S", "TB1T", "TB2S", "TB2T", "TC0D", "TN0D", "TTF0", + "TV0P", "TVFP", "TW0P", "Th0P", "Tp0P", "Tp1P", "TpFP", "Ts0P", + "Ts0S", NULL }, }; /* List of keys used to read/write fan speeds */ @@ -1301,11 +1305,17 @@ static __initdata struct dmi_match_data applesmc_dmi_data[] = { { .accelerometer = 0, .light = 0, .temperature_set = 13 }, /* iMac 6: light sensor only, temperature set 14 */ { .accelerometer = 0, .light = 0, .temperature_set = 14 }, +/* MacBook Air 2,1: accelerometer, backlight and temperature set 15 */ + { .accelerometer = 1, .light = 1, .temperature_set = 15 }, }; /* Note that DMI_MATCH(...,"MacBook") will match "MacBookPro1,1". * So we need to put "Apple MacBook Pro" before "Apple MacBook". 
*/ static __initdata struct dmi_system_id applesmc_whitelist[] = { + { applesmc_dmi_match, "Apple MacBook Air 2", { + DMI_MATCH(DMI_BOARD_VENDOR, "Apple"), + DMI_MATCH(DMI_PRODUCT_NAME, "MacBookAir2") }, + &applesmc_dmi_data[15]}, { applesmc_dmi_match, "Apple MacBook Air", { DMI_MATCH(DMI_BOARD_VENDOR, "Apple"), DMI_MATCH(DMI_PRODUCT_NAME, "MacBookAir") }, diff --git a/drivers/hwmon/ibmpex.c b/drivers/hwmon/ibmpex.c index 537d9fb2ff8..a36363312f2 100644 --- a/drivers/hwmon/ibmpex.c +++ b/drivers/hwmon/ibmpex.c @@ -40,7 +40,7 @@ static inline u16 extract_value(const char *data, int offset) { - return be16_to_cpup((u16 *)&data[offset]); + return be16_to_cpup((__be16 *)&data[offset]); } #define TEMP_SENSOR 1 diff --git a/drivers/macintosh/therm_adt746x.c b/drivers/macintosh/therm_adt746x.c index 22bf981d393..82607add69a 100644 --- a/drivers/macintosh/therm_adt746x.c +++ b/drivers/macintosh/therm_adt746x.c @@ -554,7 +554,7 @@ thermostat_init(void) const u32 *prop; int i = 0, offset = 0; int err; - + np = of_find_node_by_name(NULL, "fan"); if (!np) return -ENODEV; @@ -613,13 +613,13 @@ thermostat_init(void) } of_dev = of_platform_device_create(np, "temperatures", NULL); - + of_node_put(np); + if (of_dev == NULL) { printk(KERN_ERR "Can't register temperatures device !\n"); - of_node_put(np); return -ENODEV; } - + err = device_create_file(&of_dev->dev, &dev_attr_sensor1_temperature); err |= device_create_file(&of_dev->dev, &dev_attr_sensor2_temperature); err |= device_create_file(&of_dev->dev, &dev_attr_sensor1_limit); diff --git a/drivers/media/dvb/dvb-core/dvbdev.c b/drivers/media/dvb/dvb-core/dvbdev.c index 65d69665f1f..6a32680dbb1 100644 --- a/drivers/media/dvb/dvb-core/dvbdev.c +++ b/drivers/media/dvb/dvb-core/dvbdev.c @@ -79,6 +79,10 @@ static int dvb_device_open(struct inode *inode, struct file *file) file->private_data = dvbdev; old_fops = file->f_op; file->f_op = fops_get(dvbdev->fops); + if (file->f_op == NULL) { + file->f_op = old_fops; + goto fail; + } 
if(file->f_op->open) err = file->f_op->open(inode,file); if (err) { @@ -90,6 +94,7 @@ static int dvb_device_open(struct inode *inode, struct file *file) unlock_kernel(); return err; } +fail: up_read(&minor_rwsem); unlock_kernel(); return -ENODEV; diff --git a/drivers/media/video/v4l1-compat.c b/drivers/media/video/v4l1-compat.c index d450cab20be..b617bf05e2d 100644 --- a/drivers/media/video/v4l1-compat.c +++ b/drivers/media/video/v4l1-compat.c @@ -203,7 +203,6 @@ static int poll_one(struct file *file, struct poll_wqueues *pwq) table = &pwq->pt; for (;;) { int mask; - set_current_state(TASK_INTERRUPTIBLE); mask = file->f_op->poll(file, table); if (mask & POLLIN) break; @@ -212,9 +211,8 @@ static int poll_one(struct file *file, struct poll_wqueues *pwq) retval = -ERESTARTSYS; break; } - schedule(); + poll_schedule(pwq, TASK_INTERRUPTIBLE); } - set_current_state(TASK_RUNNING); poll_freewait(pwq); return retval; } diff --git a/drivers/message/i2o/device.c b/drivers/message/i2o/device.c index 54c2e9ae23e..c455da4ff41 100644 --- a/drivers/message/i2o/device.c +++ b/drivers/message/i2o/device.c @@ -52,7 +52,6 @@ static inline int i2o_device_issue_claim(struct i2o_device *dev, u32 cmd, /** * i2o_device_claim - claim a device for use by an OSM * @dev: I2O device to claim - * @drv: I2O driver which wants to claim the device * * Do the leg work to assign a device to a given OSM. If the claim succeeds, * the owner is the primary. If the attempt fails a negative errno code @@ -80,7 +79,6 @@ int i2o_device_claim(struct i2o_device *dev) /** * i2o_device_claim_release - release a device that the OSM is using * @dev: device to release - * @drv: driver which claimed the device * * Drop a claim by an OSM on a given I2O device. 
* diff --git a/drivers/message/i2o/driver.c b/drivers/message/i2o/driver.c index e0d474b1743..a0421efe04c 100644 --- a/drivers/message/i2o/driver.c +++ b/drivers/message/i2o/driver.c @@ -173,7 +173,6 @@ void i2o_driver_unregister(struct i2o_driver *drv) * i2o_driver_dispatch - dispatch an I2O reply message * @c: I2O controller of the message * @m: I2O message number - * @msg: I2O message to be delivered * * The reply is delivered to the driver from which the original message * was. This function is only called from interrupt context. diff --git a/drivers/misc/ibmasm/module.c b/drivers/misc/ibmasm/module.c index b5f6add34b0..dc14b0b9cbf 100644 --- a/drivers/misc/ibmasm/module.c +++ b/drivers/misc/ibmasm/module.c @@ -104,8 +104,7 @@ static int __devinit ibmasm_init_one(struct pci_dev *pdev, const struct pci_devi } sp->irq = pdev->irq; - sp->base_address = ioremap(pci_resource_start(pdev, 0), - pci_resource_len(pdev, 0)); + sp->base_address = pci_ioremap_bar(pdev, 0); if (!sp->base_address) { dev_err(sp->dev, "Failed to ioremap pci memory\n"); result = -ENODEV; diff --git a/drivers/misc/ioc4.c b/drivers/misc/ioc4.c index 6f76573e7c8..60b0b1a4fb3 100644 --- a/drivers/misc/ioc4.c +++ b/drivers/misc/ioc4.c @@ -269,6 +269,16 @@ ioc4_variant(struct ioc4_driver_data *idd) return IOC4_VARIANT_PCI_RT; } +static void +ioc4_load_modules(struct work_struct *work) +{ + /* arg just has to be freed */ + + request_module("sgiioc4"); + + kfree(work); +} + /* Adds a new instance of an IOC4 card */ static int ioc4_probe(struct pci_dev *pdev, const struct pci_device_id *pci_id) @@ -378,6 +388,30 @@ ioc4_probe(struct pci_dev *pdev, const struct pci_device_id *pci_id) } mutex_unlock(&ioc4_mutex); + /* Request sgiioc4 IDE driver on boards that bring that functionality + * off of IOC4. 
The root filesystem may be hosted on a drive connected + * to IOC4, so we need to make sure the sgiioc4 driver is loaded as it + * won't be picked up by modprobes due to the ioc4 module owning the + * PCI device. + */ + if (idd->idd_variant != IOC4_VARIANT_PCI_RT) { + struct work_struct *work; + work = kzalloc(sizeof(struct work_struct), GFP_KERNEL); + if (!work) { + printk(KERN_WARNING + "%s: IOC4 unable to allocate memory for " + "load of sub-modules.\n", __func__); + } else { + /* Request the module from a work procedure as the + * modprobe goes out to a userland helper and that + * will hang if done directly from ioc4_probe(). + */ + printk(KERN_INFO "IOC4 loading sgiioc4 submodule\n"); + INIT_WORK(work, ioc4_load_modules); + schedule_work(work); + } + } + return 0; out_misc_region: @@ -462,6 +496,8 @@ ioc4_init(void) static void __devexit ioc4_exit(void) { + /* Ensure ioc4_load_modules() has completed before exiting */ + flush_scheduled_work(); pci_unregister_driver(&ioc4_driver); } diff --git a/drivers/misc/tifm_7xx1.c b/drivers/misc/tifm_7xx1.c index 67503ea71d2..af6173319e0 100644 --- a/drivers/misc/tifm_7xx1.c +++ b/drivers/misc/tifm_7xx1.c @@ -354,8 +354,7 @@ static int tifm_7xx1_probe(struct pci_dev *dev, fm->has_ms_pif = tifm_7xx1_has_ms_pif; pci_set_drvdata(dev, fm); - fm->addr = ioremap(pci_resource_start(dev, 0), - pci_resource_len(dev, 0)); + fm->addr = pci_ioremap_bar(dev, 0); if (!fm->addr) goto err_out_free; diff --git a/drivers/parport/ieee1284.c b/drivers/parport/ieee1284.c index ac2a805ac7e..8901ecf6e03 100644 --- a/drivers/parport/ieee1284.c +++ b/drivers/parport/ieee1284.c @@ -84,7 +84,7 @@ int parport_wait_event (struct parport *port, signed long timeout) add_timer (&timer); ret = down_interruptible (&port->physport->ieee1284.irq); - if (!del_timer (&timer) && !ret) + if (!del_timer_sync(&timer) && !ret) /* Timed out. 
*/ ret = 1; diff --git a/drivers/rapidio/rio-driver.c b/drivers/rapidio/rio-driver.c index 956d3e79f6a..addb87cf44d 100644 --- a/drivers/rapidio/rio-driver.c +++ b/drivers/rapidio/rio-driver.c @@ -79,7 +79,6 @@ void rio_dev_put(struct rio_dev *rdev) /** * rio_device_probe - Tell if a RIO device structure has a matching RIO device id structure - * @id: the RIO device id structure to match against * @dev: the RIO device structure to match against * * return 0 and set rio_dev->driver when drv claims rio_dev, else error diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index 165a8184335..4ad831de41a 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -35,8 +35,8 @@ config RTC_HCTOSYS_DEVICE default "rtc0" help The RTC device that will be used to (re)initialize the system - clock, usually rtc0. Initialization is done when the system - starts up, and when it resumes from a low power state. This + clock, usually rtc0. Initialization is done when the system + starts up, and when it resumes from a low power state. This device should record time in UTC, since the kernel won't do timezone correction. @@ -44,7 +44,7 @@ config RTC_HCTOSYS_DEVICE functions run, so it must usually be statically linked. This clock should be battery-backed, so that it reads the correct - time when the system boots from a power-off state. Otherwise, your + time when the system boots from a power-off state. Otherwise, your system will need an external clock source (like an NTP server). If the clock you specify here is not battery backed, it may still @@ -69,8 +69,7 @@ config RTC_INTF_SYSFS Say yes here if you want to use your RTCs using sysfs interfaces, /sys/class/rtc/rtc0 through /sys/.../rtcN. - This driver can also be built as a module. If so, the module - will be called rtc-sysfs. + If unsure, say Y. 
config RTC_INTF_PROC boolean "/proc/driver/rtc (procfs for rtc0)" @@ -78,11 +77,10 @@ config RTC_INTF_PROC default RTC_CLASS help Say yes here if you want to use your first RTC through the proc - interface, /proc/driver/rtc. Other RTCs will not be available + interface, /proc/driver/rtc. Other RTCs will not be available through that API. - This driver can also be built as a module. If so, the module - will be called rtc-proc. + If unsure, say Y. config RTC_INTF_DEV boolean "/dev/rtcN (character devices)" @@ -90,12 +88,14 @@ config RTC_INTF_DEV help Say yes here if you want to use your RTCs using the /dev interfaces, which "udev" sets up as /dev/rtc0 through - /dev/rtcN. You may want to set up a symbolic link so one - of these can be accessed as /dev/rtc, which is a name - expected by "hwclock" and some other programs. + /dev/rtcN. - This driver can also be built as a module. If so, the module - will be called rtc-dev. + You may want to set up a symbolic link so one of these + can be accessed as /dev/rtc, which is a name + expected by "hwclock" and some other programs. Recent + versions of "udev" are known to set up the symlink for you. + + If unsure, say Y. config RTC_INTF_DEV_UIE_EMUL bool "RTC UIE emulation on dev interface" @@ -132,14 +132,14 @@ config RTC_DRV_DS1307 tristate "Dallas/Maxim DS1307/37/38/39/40, ST M41T00" help If you say yes here you get support for various compatible RTC - chips (often with battery backup) connected with I2C. This driver + chips (often with battery backup) connected with I2C. This driver should handle DS1307, DS1337, DS1338, DS1339, DS1340, ST M41T00, - and probably other chips. In some cases the RTC must already + and probably other chips. In some cases the RTC must already have been initialized (by manufacturing or a bootloader). The first seven registers on these chips hold an RTC, and other registers may add features such as NVRAM, a trickle charger for - the RTC/NVRAM backup power, and alarms. 
NVRAM is visible in + the RTC/NVRAM backup power, and alarms. NVRAM is visible in sysfs, but other chip features may not be available. This driver can also be built as a module. If so, the module @@ -150,10 +150,10 @@ config RTC_DRV_DS1374 depends on RTC_CLASS && I2C help If you say yes here you get support for Dallas Semiconductor - DS1374 real-time clock chips. If an interrupt is associated + DS1374 real-time clock chips. If an interrupt is associated with the device, the alarm functionality is supported. - This driver can also be built as a module. If so, the module + This driver can also be built as a module. If so, the module will be called rtc-ds1374. config RTC_DRV_DS1672 @@ -247,7 +247,7 @@ config RTC_DRV_TWL92330 help If you say yes here you get support for the RTC on the TWL92330 "Menelaus" power management chip, used with OMAP2 - platforms. The support is integrated with the rest of + platforms. The support is integrated with the rest of the Menelaus driver; it's not separate module. config RTC_DRV_TWL4030 @@ -308,7 +308,7 @@ config RTC_DRV_DS1305 tristate "Dallas/Maxim DS1305/DS1306" help Select this driver to get support for the Dallas/Maxim DS1305 - and DS1306 real time clock chips. These support a trickle + and DS1306 real time clock chips. These support a trickle charger, alarms, and NVRAM in addition to the clock. This driver can also be built as a module. If so, the module @@ -317,7 +317,8 @@ config RTC_DRV_DS1305 config RTC_DRV_DS1390 tristate "Dallas/Maxim DS1390/93/94" help - If you say yes here you get support for the DS1390/93/94 chips. + If you say yes here you get support for the + Dallas/Maxim DS1390/93/94 chips. This driver only supports the RTC feature, and not other chip features such as alarms and trickle charging. @@ -381,7 +382,7 @@ config RTC_DRV_CMOS or LPC bus chips, and so on. Your system will need to define the platform device used by - this driver, otherwise it won't be accessible. 
This means + this driver, otherwise it won't be accessible. This means you can safely enable this driver if you don't know whether or not your board has this kind of hardware. @@ -598,7 +599,7 @@ config RTC_DRV_AT91RM9200 depends on ARCH_AT91RM9200 || ARCH_AT91SAM9RL help Driver for the internal RTC (Realtime Clock) module found on - Atmel AT91RM9200's and AT91SAM9RL chips. On SAM9RL chips + Atmel AT91RM9200's and AT91SAM9RL chips. On SAM9RL chips this is powered by the backup power supply. config RTC_DRV_AT91SAM9 @@ -620,8 +621,8 @@ config RTC_DRV_AT91SAM9_RTT prompt "RTT module Number" if ARCH_AT91SAM9263 depends on RTC_DRV_AT91SAM9 help - More than one RTT module is available. You can choose which - one will be used as an RTC. The default of zero is normally + More than one RTT module is available. You can choose which + one will be used as an RTC. The default of zero is normally OK to use, though some systems use that for non-RTC purposes. config RTC_DRV_AT91SAM9_GPBR @@ -633,10 +634,20 @@ config RTC_DRV_AT91SAM9_GPBR depends on RTC_DRV_AT91SAM9 help The RTC driver needs to use one of the General Purpose Backup - Registers (GPBRs) as well as the RTT. You can choose which one - will be used. The default of zero is normally OK to use, but + Registers (GPBRs) as well as the RTT. You can choose which one + will be used. The default of zero is normally OK to use, but on some systems other software needs to use that register. +config RTC_DRV_AU1XXX + tristate "Au1xxx Counter0 RTC support" + depends on SOC_AU1X00 + help + This is a driver for the Au1xxx on-chip Counter0 (Time-Of-Year + counter) to be used as a RTC. + + This driver can also be built as a module. If so, the module + will be called rtc-au1xxx. + config RTC_DRV_BFIN tristate "Blackfin On-Chip RTC" depends on BLACKFIN && !BF561 @@ -669,6 +680,17 @@ config RTC_DRV_PPC the RTC. This exposes that functionality through the generic RTC class. 
+config RTC_DRV_PXA + tristate "PXA27x/PXA3xx" + depends on ARCH_PXA + help + If you say Y here you will get access to the real time clock + built into your PXA27x or PXA3xx CPU. + + This RTC driver uses PXA RTC registers available since pxa27x + series (RDxR, RYxR) instead of legacy RCNR, RTAR. + + config RTC_DRV_SUN4V bool "SUN4V Hypervisor RTC" depends on SPARC64 @@ -683,4 +705,22 @@ config RTC_DRV_STARFIRE If you say Y here you will get support for the RTC found on Starfire systems. +config RTC_DRV_TX4939 + tristate "TX4939 SoC" + depends on SOC_TX4939 + help + Driver for the internal RTC (Realtime Clock) module found on + Toshiba TX4939 SoC. + +config RTC_DRV_MV + tristate "Marvell SoC RTC" + depends on ARCH_KIRKWOOD + help + If you say yes here you will get support for the in-chip RTC + that can be found in some of Marvell's SoC devices, such as + the Kirkwood 88F6281 and 88F6192. + + This driver can also be built as a module. If so, the module + will be called rtc-mv. + endif # RTC_CLASS diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile index 6e79c912bf9..9a4340d48f2 100644 --- a/drivers/rtc/Makefile +++ b/drivers/rtc/Makefile @@ -20,6 +20,7 @@ rtc-core-$(CONFIG_RTC_INTF_SYSFS) += rtc-sysfs.o obj-$(CONFIG_RTC_DRV_AT32AP700X)+= rtc-at32ap700x.o obj-$(CONFIG_RTC_DRV_AT91RM9200)+= rtc-at91rm9200.o obj-$(CONFIG_RTC_DRV_AT91SAM9) += rtc-at91sam9.o +obj-$(CONFIG_RTC_DRV_AU1XXX) += rtc-au1xxx.o obj-$(CONFIG_RTC_DRV_BFIN) += rtc-bfin.o obj-$(CONFIG_RTC_DRV_CMOS) += rtc-cmos.o obj-$(CONFIG_RTC_DRV_DS1216) += rtc-ds1216.o @@ -47,6 +48,7 @@ obj-$(CONFIG_RTC_DRV_SUN4V) += rtc-sun4v.o obj-$(CONFIG_RTC_DRV_STARFIRE) += rtc-starfire.o obj-$(CONFIG_RTC_DRV_MAX6900) += rtc-max6900.o obj-$(CONFIG_RTC_DRV_MAX6902) += rtc-max6902.o +obj-$(CONFIG_RTC_DRV_MV) += rtc-mv.o obj-$(CONFIG_RTC_DRV_OMAP) += rtc-omap.o obj-$(CONFIG_RTC_DRV_PCF8563) += rtc-pcf8563.o obj-$(CONFIG_RTC_DRV_PCF8583) += rtc-pcf8583.o @@ -54,6 +56,7 @@ obj-$(CONFIG_RTC_DRV_PL030) += rtc-pl030.o 
obj-$(CONFIG_RTC_DRV_PL031) += rtc-pl031.o obj-$(CONFIG_RTC_DRV_PARISC) += rtc-parisc.o obj-$(CONFIG_RTC_DRV_PPC) += rtc-ppc.o +obj-$(CONFIG_RTC_DRV_PXA) += rtc-pxa.o obj-$(CONFIG_RTC_DRV_R9701) += rtc-r9701.o obj-$(CONFIG_RTC_DRV_RS5C313) += rtc-rs5c313.o obj-$(CONFIG_RTC_DRV_RS5C348) += rtc-rs5c348.o @@ -66,6 +69,7 @@ obj-$(CONFIG_RTC_DRV_SH) += rtc-sh.o obj-$(CONFIG_RTC_DRV_STK17TA8) += rtc-stk17ta8.o obj-$(CONFIG_RTC_DRV_TEST) += rtc-test.o obj-$(CONFIG_RTC_DRV_TWL4030) += rtc-twl4030.o +obj-$(CONFIG_RTC_DRV_TX4939) += rtc-tx4939.o obj-$(CONFIG_RTC_DRV_V3020) += rtc-v3020.o obj-$(CONFIG_RTC_DRV_VR41XX) += rtc-vr41xx.o obj-$(CONFIG_RTC_DRV_WM8350) += rtc-wm8350.o diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c index 4dfdf019fcc..be5a6b73e60 100644 --- a/drivers/rtc/class.c +++ b/drivers/rtc/class.c @@ -48,9 +48,7 @@ static int rtc_suspend(struct device *dev, pm_message_t mesg) struct rtc_time tm; struct timespec ts = current_kernel_time(); - if (strncmp(rtc->dev.bus_id, - CONFIG_RTC_HCTOSYS_DEVICE, - BUS_ID_SIZE) != 0) + if (strcmp(dev_name(&rtc->dev), CONFIG_RTC_HCTOSYS_DEVICE) != 0) return 0; rtc_read_time(rtc, &tm); @@ -71,20 +69,18 @@ static int rtc_resume(struct device *dev) time_t newtime; struct timespec time; - if (strncmp(rtc->dev.bus_id, - CONFIG_RTC_HCTOSYS_DEVICE, - BUS_ID_SIZE) != 0) + if (strcmp(dev_name(&rtc->dev), CONFIG_RTC_HCTOSYS_DEVICE) != 0) return 0; rtc_read_time(rtc, &tm); if (rtc_valid_tm(&tm) != 0) { - pr_debug("%s: bogus resume time\n", rtc->dev.bus_id); + pr_debug("%s: bogus resume time\n", dev_name(&rtc->dev)); return 0; } rtc_tm_to_time(&tm, &newtime); if (newtime <= oldtime) { if (newtime < oldtime) - pr_debug("%s: time travel!\n", rtc->dev.bus_id); + pr_debug("%s: time travel!\n", dev_name(&rtc->dev)); return 0; } @@ -156,7 +152,7 @@ struct rtc_device *rtc_device_register(const char *name, struct device *dev, init_waitqueue_head(&rtc->irq_queue); strlcpy(rtc->name, name, RTC_DEVICE_NAME_SIZE); - snprintf(rtc->dev.bus_id, 
BUS_ID_SIZE, "rtc%d", id); + dev_set_name(&rtc->dev, "rtc%d", id); rtc_dev_prepare(rtc); @@ -169,7 +165,7 @@ struct rtc_device *rtc_device_register(const char *name, struct device *dev, rtc_proc_add_device(rtc); dev_info(dev, "rtc core: registered %s as %s\n", - rtc->name, rtc->dev.bus_id); + rtc->name, dev_name(&rtc->dev)); return rtc; diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c index fd2c652504f..4348c4b0d45 100644 --- a/drivers/rtc/interface.c +++ b/drivers/rtc/interface.c @@ -50,10 +50,15 @@ int rtc_set_time(struct rtc_device *rtc, struct rtc_time *tm) if (!rtc->ops) err = -ENODEV; - else if (!rtc->ops->set_time) - err = -EINVAL; - else + else if (rtc->ops->set_time) err = rtc->ops->set_time(rtc->dev.parent, tm); + else if (rtc->ops->set_mmss) { + unsigned long secs; + err = rtc_tm_to_time(tm, &secs); + if (err == 0) + err = rtc->ops->set_mmss(rtc->dev.parent, secs); + } else + err = -EINVAL; mutex_unlock(&rtc->ops_lock); return err; @@ -389,7 +394,7 @@ static int __rtc_match(struct device *dev, void *data) { char *name = (char *)data; - if (strncmp(dev->bus_id, name, BUS_ID_SIZE) == 0) + if (strcmp(dev_name(dev), name) == 0) return 1; return 0; } @@ -504,9 +509,6 @@ int rtc_irq_set_freq(struct rtc_device *rtc, struct rtc_task *task, int freq) if (rtc->ops->irq_set_freq == NULL) return -ENXIO; - if (!is_power_of_2(freq)) - return -EINVAL; - spin_lock_irqsave(&rtc->irq_task_lock, flags); if (rtc->irq_task != NULL && task == NULL) err = -EBUSY; diff --git a/drivers/rtc/rtc-at32ap700x.c b/drivers/rtc/rtc-at32ap700x.c index 90b9a6503e1..e1ec33e40e3 100644 --- a/drivers/rtc/rtc-at32ap700x.c +++ b/drivers/rtc/rtc-at32ap700x.c @@ -205,7 +205,7 @@ static int __init at32_rtc_probe(struct platform_device *pdev) { struct resource *regs; struct rtc_at32ap700x *rtc; - int irq = -1; + int irq; int ret; rtc = kzalloc(sizeof(struct rtc_at32ap700x), GFP_KERNEL); @@ -222,7 +222,7 @@ static int __init at32_rtc_probe(struct platform_device *pdev) } irq = 
platform_get_irq(pdev, 0); - if (irq < 0) { + if (irq <= 0) { dev_dbg(&pdev->dev, "could not get irq\n"); ret = -ENXIO; goto out; diff --git a/drivers/rtc/rtc-au1xxx.c b/drivers/rtc/rtc-au1xxx.c new file mode 100644 index 00000000000..8906a688e6a --- /dev/null +++ b/drivers/rtc/rtc-au1xxx.c @@ -0,0 +1,153 @@ +/* + * Au1xxx counter0 (aka Time-Of-Year counter) RTC interface driver. + * + * Copyright (C) 2008 Manuel Lauss <mano@roarinelk.homelinux.net> + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + */ + +/* All current Au1xxx SoCs have 2 counters fed by an external 32.768 kHz + * crystal. Counter 0, which keeps counting during sleep/powerdown, is + * used to count seconds since the beginning of the unix epoch. + * + * The counters must be configured and enabled by bootloader/board code; + * no checks as to whether they really get a proper 32.768kHz clock are + * made as this would take far too long. + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/rtc.h> +#include <linux/init.h> +#include <linux/platform_device.h> +#include <linux/io.h> +#include <asm/mach-au1x00/au1000.h> + +/* 32kHz clock enabled and detected */ +#define CNTR_OK (SYS_CNTRL_E0 | SYS_CNTRL_32S) + +static int au1xtoy_rtc_read_time(struct device *dev, struct rtc_time *tm) +{ + unsigned long t; + + t = au_readl(SYS_TOYREAD); + + rtc_time_to_tm(t, tm); + + return rtc_valid_tm(tm); +} + +static int au1xtoy_rtc_set_time(struct device *dev, struct rtc_time *tm) +{ + unsigned long t; + + rtc_tm_to_time(tm, &t); + + au_writel(t, SYS_TOYWRITE); + au_sync(); + + /* wait for the pending register write to succeed. This can + * take up to 6 seconds... 
+ */ + while (au_readl(SYS_COUNTER_CNTRL) & SYS_CNTRL_C0S) + msleep(1); + + return 0; +} + +static struct rtc_class_ops au1xtoy_rtc_ops = { + .read_time = au1xtoy_rtc_read_time, + .set_time = au1xtoy_rtc_set_time, +}; + +static int __devinit au1xtoy_rtc_probe(struct platform_device *pdev) +{ + struct rtc_device *rtcdev; + unsigned long t; + int ret; + + t = au_readl(SYS_COUNTER_CNTRL); + if (!(t & CNTR_OK)) { + dev_err(&pdev->dev, "counters not working; aborting.\n"); + ret = -ENODEV; + goto out_err; + } + + ret = -ETIMEDOUT; + + /* set counter0 tickrate to 1Hz if necessary */ + if (au_readl(SYS_TOYTRIM) != 32767) { + /* wait until hardware gives access to TRIM register */ + t = 0x00100000; + while ((au_readl(SYS_COUNTER_CNTRL) & SYS_CNTRL_T0S) && t--) + msleep(1); + + if (!t) { + /* timed out waiting for register access; assume + * counters are unusable. + */ + dev_err(&pdev->dev, "timeout waiting for access\n"); + goto out_err; + } + + /* set 1Hz TOY tick rate */ + au_writel(32767, SYS_TOYTRIM); + au_sync(); + } + + /* wait until the hardware allows writes to the counter reg */ + while (au_readl(SYS_COUNTER_CNTRL) & SYS_CNTRL_C0S) + msleep(1); + + rtcdev = rtc_device_register("rtc-au1xxx", &pdev->dev, + &au1xtoy_rtc_ops, THIS_MODULE); + if (IS_ERR(rtcdev)) { + ret = PTR_ERR(rtcdev); + goto out_err; + } + + platform_set_drvdata(pdev, rtcdev); + + return 0; + +out_err: + return ret; +} + +static int __devexit au1xtoy_rtc_remove(struct platform_device *pdev) +{ + struct rtc_device *rtcdev = platform_get_drvdata(pdev); + + rtc_device_unregister(rtcdev); + platform_set_drvdata(pdev, NULL); + + return 0; +} + +static struct platform_driver au1xrtc_driver = { + .driver = { + .name = "rtc-au1xxx", + .owner = THIS_MODULE, + }, + .remove = __devexit_p(au1xtoy_rtc_remove), +}; + +static int __init au1xtoy_rtc_init(void) +{ + return platform_driver_probe(&au1xrtc_driver, au1xtoy_rtc_probe); +} + +static void __exit au1xtoy_rtc_exit(void) +{ + 
platform_driver_unregister(&au1xrtc_driver); +} + +module_init(au1xtoy_rtc_init); +module_exit(au1xtoy_rtc_exit); + +MODULE_DESCRIPTION("Au1xxx TOY-counter-based RTC driver"); +MODULE_AUTHOR("Manuel Lauss <manuel.lauss@gmail.com>"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:rtc-au1xxx"); diff --git a/drivers/rtc/rtc-bfin.c b/drivers/rtc/rtc-bfin.c index 34439ce3967..aafd3e6ebb0 100644 --- a/drivers/rtc/rtc-bfin.c +++ b/drivers/rtc/rtc-bfin.c @@ -390,7 +390,7 @@ static int __devinit bfin_rtc_probe(struct platform_device *pdev) /* Register our RTC with the RTC framework */ rtc->rtc_dev = rtc_device_register(pdev->name, dev, &bfin_rtc_ops, THIS_MODULE); - if (unlikely(IS_ERR(rtc))) { + if (unlikely(IS_ERR(rtc->rtc_dev))) { ret = PTR_ERR(rtc->rtc_dev); goto err_irq; } diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index 6cf8e282338..b6d35f50e40 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -35,6 +35,7 @@ #include <linux/spinlock.h> #include <linux/platform_device.h> #include <linux/mod_devicetable.h> +#include <linux/log2.h> /* this is for "generic access to PC-style RTC" using CMOS_READ/CMOS_WRITE */ #include <asm-generic/rtc.h> @@ -58,7 +59,7 @@ struct cmos_rtc { }; /* both platform and pnp busses use negative numbers for invalid irqs */ -#define is_valid_irq(n) ((n) >= 0) +#define is_valid_irq(n) ((n) > 0) static const char driver_name[] = "rtc_cmos"; @@ -384,6 +385,8 @@ static int cmos_irq_set_freq(struct device *dev, int freq) if (!is_valid_irq(cmos->irq)) return -ENXIO; + if (!is_power_of_2(freq)) + return -EINVAL; /* 0 = no irqs; 1 = 2^15 Hz ... 
15 = 2^0 Hz */ f = ffs(freq); if (f-- > 16) @@ -729,7 +732,7 @@ cmos_do_probe(struct device *dev, struct resource *ports, int rtc_irq) cmos_rtc.dev = dev; dev_set_drvdata(dev, &cmos_rtc); - rename_region(ports, cmos_rtc.rtc->dev.bus_id); + rename_region(ports, dev_name(&cmos_rtc.rtc->dev)); spin_lock_irq(&rtc_lock); @@ -777,7 +780,7 @@ cmos_do_probe(struct device *dev, struct resource *ports, int rtc_irq) rtc_cmos_int_handler = cmos_interrupt; retval = request_irq(rtc_irq, rtc_cmos_int_handler, - IRQF_DISABLED, cmos_rtc.rtc->dev.bus_id, + IRQF_DISABLED, dev_name(&cmos_rtc.rtc->dev), cmos_rtc.rtc); if (retval < 0) { dev_dbg(dev, "IRQ %d is already in use\n", rtc_irq); @@ -795,7 +798,7 @@ cmos_do_probe(struct device *dev, struct resource *ports, int rtc_irq) } pr_info("%s: alarms up to one %s%s, %zd bytes nvram%s\n", - cmos_rtc.rtc->dev.bus_id, + dev_name(&cmos_rtc.rtc->dev), is_valid_irq(rtc_irq) ? (cmos_rtc.mon_alrm ? "year" @@ -885,7 +888,7 @@ static int cmos_suspend(struct device *dev, pm_message_t mesg) } pr_debug("%s: suspend%s, ctrl %02x\n", - cmos_rtc.rtc->dev.bus_id, + dev_name(&cmos_rtc.rtc->dev), (tmp & RTC_AIE) ? 
", alarm may wake" : "", tmp); @@ -941,7 +944,7 @@ static int cmos_resume(struct device *dev) } pr_debug("%s: resume, ctrl %02x\n", - cmos_rtc.rtc->dev.bus_id, + dev_name(&cmos_rtc.rtc->dev), tmp); return 0; diff --git a/drivers/rtc/rtc-ds1216.c b/drivers/rtc/rtc-ds1216.c index 9a234a4ec06..4aedc705518 100644 --- a/drivers/rtc/rtc-ds1216.c +++ b/drivers/rtc/rtc-ds1216.c @@ -10,7 +10,7 @@ #include <linux/platform_device.h> #include <linux/bcd.h> -#define DRV_VERSION "0.1" +#define DRV_VERSION "0.2" struct ds1216_regs { u8 tsec; @@ -101,7 +101,8 @@ static int ds1216_rtc_read_time(struct device *dev, struct rtc_time *tm) tm->tm_year = bcd2bin(regs.year); if (tm->tm_year < 70) tm->tm_year += 100; - return 0; + + return rtc_valid_tm(tm); } static int ds1216_rtc_set_time(struct device *dev, struct rtc_time *tm) @@ -138,9 +139,8 @@ static const struct rtc_class_ops ds1216_rtc_ops = { .set_time = ds1216_rtc_set_time, }; -static int __devinit ds1216_rtc_probe(struct platform_device *pdev) +static int __init ds1216_rtc_probe(struct platform_device *pdev) { - struct rtc_device *rtc; struct resource *res; struct ds1216_priv *priv; int ret = 0; @@ -152,7 +152,10 @@ static int __devinit ds1216_rtc_probe(struct platform_device *pdev) priv = kzalloc(sizeof *priv, GFP_KERNEL); if (!priv) return -ENOMEM; - priv->size = res->end - res->start + 1; + + platform_set_drvdata(pdev, priv); + + priv->size = resource_size(res); if (!request_mem_region(res->start, priv->size, pdev->name)) { ret = -EBUSY; goto out; @@ -163,22 +166,18 @@ static int __devinit ds1216_rtc_probe(struct platform_device *pdev) ret = -ENOMEM; goto out; } - rtc = rtc_device_register("ds1216", &pdev->dev, + priv->rtc = rtc_device_register("ds1216", &pdev->dev, &ds1216_rtc_ops, THIS_MODULE); - if (IS_ERR(rtc)) { - ret = PTR_ERR(rtc); + if (IS_ERR(priv->rtc)) { + ret = PTR_ERR(priv->rtc); goto out; } - priv->rtc = rtc; - platform_set_drvdata(pdev, priv); /* dummy read to get clock into a known state */ 
ds1216_read(priv->ioaddr, dummy); return 0; out: - if (priv->rtc) - rtc_device_unregister(priv->rtc); if (priv->ioaddr) iounmap(priv->ioaddr); if (priv->baseaddr) @@ -187,7 +186,7 @@ out: return ret; } -static int __devexit ds1216_rtc_remove(struct platform_device *pdev) +static int __exit ds1216_rtc_remove(struct platform_device *pdev) { struct ds1216_priv *priv = platform_get_drvdata(pdev); @@ -203,13 +202,12 @@ static struct platform_driver ds1216_rtc_platform_driver = { .name = "rtc-ds1216", .owner = THIS_MODULE, }, - .probe = ds1216_rtc_probe, - .remove = __devexit_p(ds1216_rtc_remove), + .remove = __exit_p(ds1216_rtc_remove), }; static int __init ds1216_rtc_init(void) { - return platform_driver_register(&ds1216_rtc_platform_driver); + return platform_driver_probe(&ds1216_rtc_platform_driver, ds1216_rtc_probe); } static void __exit ds1216_rtc_exit(void) diff --git a/drivers/rtc/rtc-ds1390.c b/drivers/rtc/rtc-ds1390.c index 599e976bf01..e54b5c619bd 100644 --- a/drivers/rtc/rtc-ds1390.c +++ b/drivers/rtc/rtc-ds1390.c @@ -1,5 +1,5 @@ /* - * rtc-ds1390.c -- driver for DS1390/93/94 + * rtc-ds1390.c -- driver for the Dallas/Maxim DS1390/93/94 SPI RTC * * Copyright (C) 2008 Mercury IMC Ltd * Written by Mark Jackson <mpfj@mimc.co.uk> @@ -8,11 +8,13 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. * - * NOTE : Currently this driver only supports the bare minimum for read - * and write the RTC. The extra features provided by the chip family + * NOTE: Currently this driver only supports the bare minimum for read + * and write the RTC. The extra features provided by the chip family * (alarms, trickle charger, different control registers) are unavailable. 
*/ +#include <linux/init.h> +#include <linux/module.h> #include <linux/platform_device.h> #include <linux/rtc.h> #include <linux/spi/spi.h> @@ -42,20 +44,6 @@ struct ds1390 { u8 txrx_buf[9]; /* cmd + 8 registers */ }; -static void ds1390_set_reg(struct device *dev, unsigned char address, - unsigned char data) -{ - struct spi_device *spi = to_spi_device(dev); - struct ds1390 *chip = dev_get_drvdata(dev); - - /* Set MSB to indicate write */ - chip->txrx_buf[0] = address | 0x80; - chip->txrx_buf[1] = data; - - /* do the i/o */ - spi_write_then_read(spi, chip->txrx_buf, 2, NULL, 0); -} - static int ds1390_get_reg(struct device *dev, unsigned char address, unsigned char *data) { @@ -78,7 +66,7 @@ static int ds1390_get_reg(struct device *dev, unsigned char address, return 0; } -static int ds1390_get_datetime(struct device *dev, struct rtc_time *dt) +static int ds1390_read_time(struct device *dev, struct rtc_time *dt) { struct spi_device *spi = to_spi_device(dev); struct ds1390 *chip = dev_get_drvdata(dev); @@ -107,7 +95,7 @@ static int ds1390_get_datetime(struct device *dev, struct rtc_time *dt) return rtc_valid_tm(dt); } -static int ds1390_set_datetime(struct device *dev, struct rtc_time *dt) +static int ds1390_set_time(struct device *dev, struct rtc_time *dt) { struct spi_device *spi = to_spi_device(dev); struct ds1390 *chip = dev_get_drvdata(dev); @@ -127,16 +115,6 @@ static int ds1390_set_datetime(struct device *dev, struct rtc_time *dt) return spi_write_then_read(spi, chip->txrx_buf, 8, NULL, 0); } -static int ds1390_read_time(struct device *dev, struct rtc_time *tm) -{ - return ds1390_get_datetime(dev, tm); -} - -static int ds1390_set_time(struct device *dev, struct rtc_time *tm) -{ - return ds1390_set_datetime(dev, tm); -} - static const struct rtc_class_ops ds1390_rtc_ops = { .read_time = ds1390_read_time, .set_time = ds1390_set_time, @@ -149,46 +127,40 @@ static int __devinit ds1390_probe(struct spi_device *spi) struct ds1390 *chip; int res; - printk(KERN_DEBUG 
"DS1390 SPI RTC driver\n"); - - rtc = rtc_device_register("ds1390", - &spi->dev, &ds1390_rtc_ops, THIS_MODULE); - if (IS_ERR(rtc)) { - printk(KERN_ALERT "RTC : unable to register device\n"); - return PTR_ERR(rtc); - } - spi->mode = SPI_MODE_3; spi->bits_per_word = 8; spi_setup(spi); chip = kzalloc(sizeof *chip, GFP_KERNEL); if (!chip) { - printk(KERN_ALERT "RTC : unable to allocate device memory\n"); - rtc_device_unregister(rtc); + dev_err(&spi->dev, "unable to allocate device memory\n"); return -ENOMEM; } - chip->rtc = rtc; dev_set_drvdata(&spi->dev, chip); res = ds1390_get_reg(&spi->dev, DS1390_REG_SECONDS, &tmp); - if (res) { - printk(KERN_ALERT "RTC : unable to read device\n"); - rtc_device_unregister(rtc); + if (res != 0) { + dev_err(&spi->dev, "unable to read device\n"); + kfree(chip); return res; } - return 0; + chip->rtc = rtc_device_register("ds1390", + &spi->dev, &ds1390_rtc_ops, THIS_MODULE); + if (IS_ERR(chip->rtc)) { + dev_err(&spi->dev, "unable to register device\n"); + res = PTR_ERR(chip->rtc); + kfree(chip); + } + + return res; } static int __devexit ds1390_remove(struct spi_device *spi) { struct ds1390 *chip = platform_get_drvdata(spi); - struct rtc_device *rtc = chip->rtc; - - if (rtc) - rtc_device_unregister(rtc); + rtc_device_unregister(chip->rtc); kfree(chip); return 0; @@ -215,6 +187,6 @@ static __exit void ds1390_exit(void) } module_exit(ds1390_exit); -MODULE_DESCRIPTION("DS1390/93/94 SPI RTC driver"); +MODULE_DESCRIPTION("Dallas/Maxim DS1390/93/94 SPI RTC driver"); MODULE_AUTHOR("Mark Jackson <mpfj@mimc.co.uk>"); MODULE_LICENSE("GPL"); diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c index 25caada7839..23a07fe15a2 100644 --- a/drivers/rtc/rtc-ds1511.c +++ b/drivers/rtc/rtc-ds1511.c @@ -326,9 +326,9 @@ ds1511_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) struct platform_device *pdev = to_platform_device(dev); struct rtc_plat_data *pdata = platform_get_drvdata(pdev); - if (pdata->irq < 0) { + if (pdata->irq <= 0) 
return -EINVAL; - } + pdata->alrm_mday = alrm->time.tm_mday; pdata->alrm_hour = alrm->time.tm_hour; pdata->alrm_min = alrm->time.tm_min; @@ -346,9 +346,9 @@ ds1511_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) struct platform_device *pdev = to_platform_device(dev); struct rtc_plat_data *pdata = platform_get_drvdata(pdev); - if (pdata->irq < 0) { + if (pdata->irq <= 0) return -EINVAL; - } + alrm->time.tm_mday = pdata->alrm_mday < 0 ? 0 : pdata->alrm_mday; alrm->time.tm_hour = pdata->alrm_hour < 0 ? 0 : pdata->alrm_hour; alrm->time.tm_min = pdata->alrm_min < 0 ? 0 : pdata->alrm_min; @@ -385,7 +385,7 @@ ds1511_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg) struct platform_device *pdev = to_platform_device(dev); struct rtc_plat_data *pdata = platform_get_drvdata(pdev); - if (pdata->irq < 0) { + if (pdata->irq <= 0) { return -ENOIOCTLCMD; /* fall back into rtc-dev's emulation */ } switch (cmd) { @@ -503,7 +503,6 @@ ds1511_rtc_probe(struct platform_device *pdev) if (!pdata) { return -ENOMEM; } - pdata->irq = -1; pdata->size = res->end - res->start + 1; if (!request_mem_region(res->start, pdata->size, pdev->name)) { ret = -EBUSY; @@ -545,13 +544,13 @@ ds1511_rtc_probe(struct platform_device *pdev) * if the platform has an interrupt in mind for this device, * then by all means, set it */ - if (pdata->irq >= 0) { + if (pdata->irq > 0) { rtc_read(RTC_CMD1); if (request_irq(pdata->irq, ds1511_interrupt, IRQF_DISABLED | IRQF_SHARED, pdev->name, pdev) < 0) { dev_warn(&pdev->dev, "interrupt not available.\n"); - pdata->irq = -1; + pdata->irq = 0; } } @@ -572,7 +571,7 @@ ds1511_rtc_probe(struct platform_device *pdev) if (pdata->rtc) { rtc_device_unregister(pdata->rtc); } - if (pdata->irq >= 0) { + if (pdata->irq > 0) { free_irq(pdata->irq, pdev); } if (ds1511_base) { @@ -595,7 +594,7 @@ ds1511_rtc_remove(struct platform_device *pdev) sysfs_remove_bin_file(&pdev->dev.kobj, &ds1511_nvram_attr); rtc_device_unregister(pdata->rtc); pdata->rtc = NULL; 
- if (pdata->irq >= 0) { + if (pdata->irq > 0) { /* * disable the alarm interrupt */ diff --git a/drivers/rtc/rtc-ds1553.c b/drivers/rtc/rtc-ds1553.c index b9475cd2021..38d472b6340 100644 --- a/drivers/rtc/rtc-ds1553.c +++ b/drivers/rtc/rtc-ds1553.c @@ -162,7 +162,7 @@ static int ds1553_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) struct platform_device *pdev = to_platform_device(dev); struct rtc_plat_data *pdata = platform_get_drvdata(pdev); - if (pdata->irq < 0) + if (pdata->irq <= 0) return -EINVAL; pdata->alrm_mday = alrm->time.tm_mday; pdata->alrm_hour = alrm->time.tm_hour; @@ -179,7 +179,7 @@ static int ds1553_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) struct platform_device *pdev = to_platform_device(dev); struct rtc_plat_data *pdata = platform_get_drvdata(pdev); - if (pdata->irq < 0) + if (pdata->irq <= 0) return -EINVAL; alrm->time.tm_mday = pdata->alrm_mday < 0 ? 0 : pdata->alrm_mday; alrm->time.tm_hour = pdata->alrm_hour < 0 ? 0 : pdata->alrm_hour; @@ -213,7 +213,7 @@ static int ds1553_rtc_ioctl(struct device *dev, unsigned int cmd, struct platform_device *pdev = to_platform_device(dev); struct rtc_plat_data *pdata = platform_get_drvdata(pdev); - if (pdata->irq < 0) + if (pdata->irq <= 0) return -ENOIOCTLCMD; /* fall back into rtc-dev's emulation */ switch (cmd) { case RTC_AIE_OFF: @@ -301,7 +301,6 @@ static int __devinit ds1553_rtc_probe(struct platform_device *pdev) pdata = kzalloc(sizeof(*pdata), GFP_KERNEL); if (!pdata) return -ENOMEM; - pdata->irq = -1; if (!request_mem_region(res->start, RTC_REG_SIZE, pdev->name)) { ret = -EBUSY; goto out; @@ -327,13 +326,13 @@ static int __devinit ds1553_rtc_probe(struct platform_device *pdev) if (readb(ioaddr + RTC_FLAGS) & RTC_FLAGS_BLF) dev_warn(&pdev->dev, "voltage-low detected.\n"); - if (pdata->irq >= 0) { + if (pdata->irq > 0) { writeb(0, ioaddr + RTC_INTERRUPTS); if (request_irq(pdata->irq, ds1553_rtc_interrupt, IRQF_DISABLED | IRQF_SHARED, pdev->name, pdev) < 0) { 
dev_warn(&pdev->dev, "interrupt not available.\n"); - pdata->irq = -1; + pdata->irq = 0; } } @@ -353,7 +352,7 @@ static int __devinit ds1553_rtc_probe(struct platform_device *pdev) out: if (pdata->rtc) rtc_device_unregister(pdata->rtc); - if (pdata->irq >= 0) + if (pdata->irq > 0) free_irq(pdata->irq, pdev); if (ioaddr) iounmap(ioaddr); @@ -369,7 +368,7 @@ static int __devexit ds1553_rtc_remove(struct platform_device *pdev) sysfs_remove_bin_file(&pdev->dev.kobj, &ds1553_nvram_attr); rtc_device_unregister(pdata->rtc); - if (pdata->irq >= 0) { + if (pdata->irq > 0) { writeb(0, pdata->ioaddr + RTC_INTERRUPTS); free_irq(pdata->irq, pdev); } diff --git a/drivers/rtc/rtc-ds1672.c b/drivers/rtc/rtc-ds1672.c index 4e91419e891..06dfb54f99b 100644 --- a/drivers/rtc/rtc-ds1672.c +++ b/drivers/rtc/rtc-ds1672.c @@ -83,32 +83,11 @@ static int ds1672_set_mmss(struct i2c_client *client, unsigned long secs) return 0; } -static int ds1672_set_datetime(struct i2c_client *client, struct rtc_time *tm) -{ - unsigned long secs; - - dev_dbg(&client->dev, - "%s: secs=%d, mins=%d, hours=%d, " - "mday=%d, mon=%d, year=%d, wday=%d\n", - __func__, - tm->tm_sec, tm->tm_min, tm->tm_hour, - tm->tm_mday, tm->tm_mon, tm->tm_year, tm->tm_wday); - - rtc_tm_to_time(tm, &secs); - - return ds1672_set_mmss(client, secs); -} - static int ds1672_rtc_read_time(struct device *dev, struct rtc_time *tm) { return ds1672_get_datetime(to_i2c_client(dev), tm); } -static int ds1672_rtc_set_time(struct device *dev, struct rtc_time *tm) -{ - return ds1672_set_datetime(to_i2c_client(dev), tm); -} - static int ds1672_rtc_set_mmss(struct device *dev, unsigned long secs) { return ds1672_set_mmss(to_i2c_client(dev), secs); @@ -152,7 +131,6 @@ static DEVICE_ATTR(control, S_IRUGO, show_control, NULL); static const struct rtc_class_ops ds1672_rtc_ops = { .read_time = ds1672_rtc_read_time, - .set_time = ds1672_rtc_set_time, .set_mmss = ds1672_rtc_set_mmss, }; diff --git a/drivers/rtc/rtc-ds3234.c b/drivers/rtc/rtc-ds3234.c 
index 45e5b106af7..c51589ede5b 100644 --- a/drivers/rtc/rtc-ds3234.c +++ b/drivers/rtc/rtc-ds3234.c @@ -1,4 +1,4 @@ -/* drivers/rtc/rtc-ds3234.c +/* rtc-ds3234.c * * Driver for Dallas Semiconductor (DS3234) SPI RTC with Integrated Crystal * and SRAM. @@ -9,13 +9,10 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. * - * Changelog: - * - * 07-May-2008: Dennis Aberilla <denzzzhome@yahoo.com> - * - Created based on the max6902 code. Only implements the - * date/time keeping functions; no SRAM yet. */ +#include <linux/init.h> +#include <linux/module.h> #include <linux/device.h> #include <linux/platform_device.h> #include <linux/rtc.h> @@ -34,16 +31,7 @@ #define DS3234_REG_CONTROL 0x0E #define DS3234_REG_CONT_STAT 0x0F -#undef DS3234_DEBUG - -struct ds3234 { - struct rtc_device *rtc; - u8 buf[8]; /* Burst read: addr + 7 regs */ - u8 tx_buf[2]; - u8 rx_buf[2]; -}; - -static void ds3234_set_reg(struct device *dev, unsigned char address, +static int ds3234_set_reg(struct device *dev, unsigned char address, unsigned char data) { struct spi_device *spi = to_spi_device(dev); @@ -53,107 +41,45 @@ static void ds3234_set_reg(struct device *dev, unsigned char address, buf[0] = address | 0x80; buf[1] = data; - spi_write(spi, buf, 2); + return spi_write_then_read(spi, buf, 2, NULL, 0); } static int ds3234_get_reg(struct device *dev, unsigned char address, unsigned char *data) { struct spi_device *spi = to_spi_device(dev); - struct ds3234 *chip = dev_get_drvdata(dev); - struct spi_message message; - struct spi_transfer xfer; - int status; - - if (!data) - return -EINVAL; - - /* Build our spi message */ - spi_message_init(&message); - memset(&xfer, 0, sizeof(xfer)); - - /* Address + dummy tx byte */ - xfer.len = 2; - xfer.tx_buf = chip->tx_buf; - xfer.rx_buf = chip->rx_buf; - - chip->tx_buf[0] = address; - chip->tx_buf[1] = 0xff; - spi_message_add_tail(&xfer, &message); + *data = address & 0x7f; - /* do the i/o */ - 
status = spi_sync(spi, &message); - if (status == 0) - status = message.status; - else - return status; - - *data = chip->rx_buf[1]; - - return status; + return spi_write_then_read(spi, data, 1, data, 1); } -static int ds3234_get_datetime(struct device *dev, struct rtc_time *dt) +static int ds3234_read_time(struct device *dev, struct rtc_time *dt) { + int err; + unsigned char buf[8]; struct spi_device *spi = to_spi_device(dev); - struct ds3234 *chip = dev_get_drvdata(dev); - struct spi_message message; - struct spi_transfer xfer; - int status; - - /* build the message */ - spi_message_init(&message); - memset(&xfer, 0, sizeof(xfer)); - xfer.len = 1 + 7; /* Addr + 7 registers */ - xfer.tx_buf = chip->buf; - xfer.rx_buf = chip->buf; - chip->buf[0] = 0x00; /* Start address */ - spi_message_add_tail(&xfer, &message); - - /* do the i/o */ - status = spi_sync(spi, &message); - if (status == 0) - status = message.status; - else - return status; - /* Seconds, Minutes, Hours, Day, Date, Month, Year */ - dt->tm_sec = bcd2bin(chip->buf[1]); - dt->tm_min = bcd2bin(chip->buf[2]); - dt->tm_hour = bcd2bin(chip->buf[3] & 0x3f); - dt->tm_wday = bcd2bin(chip->buf[4]) - 1; /* 0 = Sun */ - dt->tm_mday = bcd2bin(chip->buf[5]); - dt->tm_mon = bcd2bin(chip->buf[6] & 0x1f) - 1; /* 0 = Jan */ - dt->tm_year = bcd2bin(chip->buf[7] & 0xff) + 100; /* Assume 20YY */ - -#ifdef DS3234_DEBUG - dev_dbg(dev, "\n%s : Read RTC values\n", __func__); - dev_dbg(dev, "tm_hour: %i\n", dt->tm_hour); - dev_dbg(dev, "tm_min : %i\n", dt->tm_min); - dev_dbg(dev, "tm_sec : %i\n", dt->tm_sec); - dev_dbg(dev, "tm_wday: %i\n", dt->tm_wday); - dev_dbg(dev, "tm_mday: %i\n", dt->tm_mday); - dev_dbg(dev, "tm_mon : %i\n", dt->tm_mon); - dev_dbg(dev, "tm_year: %i\n", dt->tm_year); -#endif + buf[0] = 0x00; /* Start address */ - return 0; + err = spi_write_then_read(spi, buf, 1, buf, 8); + if (err != 0) + return err; + + /* Seconds, Minutes, Hours, Day, Date, Month, Year */ + dt->tm_sec = bcd2bin(buf[0]); + dt->tm_min = 
bcd2bin(buf[1]); + dt->tm_hour = bcd2bin(buf[2] & 0x3f); + dt->tm_wday = bcd2bin(buf[3]) - 1; /* 0 = Sun */ + dt->tm_mday = bcd2bin(buf[4]); + dt->tm_mon = bcd2bin(buf[5] & 0x1f) - 1; /* 0 = Jan */ + dt->tm_year = bcd2bin(buf[6] & 0xff) + 100; /* Assume 20YY */ + + return rtc_valid_tm(dt); } -static int ds3234_set_datetime(struct device *dev, struct rtc_time *dt) +static int ds3234_set_time(struct device *dev, struct rtc_time *dt) { -#ifdef DS3234_DEBUG - dev_dbg(dev, "\n%s : Setting RTC values\n", __func__); - dev_dbg(dev, "tm_sec : %i\n", dt->tm_sec); - dev_dbg(dev, "tm_min : %i\n", dt->tm_min); - dev_dbg(dev, "tm_hour: %i\n", dt->tm_hour); - dev_dbg(dev, "tm_wday: %i\n", dt->tm_wday); - dev_dbg(dev, "tm_mday: %i\n", dt->tm_mday); - dev_dbg(dev, "tm_mon : %i\n", dt->tm_mon); - dev_dbg(dev, "tm_year: %i\n", dt->tm_year); -#endif - ds3234_set_reg(dev, DS3234_REG_SECONDS, bin2bcd(dt->tm_sec)); ds3234_set_reg(dev, DS3234_REG_MINUTES, bin2bcd(dt->tm_min)); ds3234_set_reg(dev, DS3234_REG_HOURS, bin2bcd(dt->tm_hour) & 0x3f); @@ -174,16 +100,6 @@ static int ds3234_set_datetime(struct device *dev, struct rtc_time *dt) return 0; } -static int ds3234_read_time(struct device *dev, struct rtc_time *tm) -{ - return ds3234_get_datetime(dev, tm); -} - -static int ds3234_set_time(struct device *dev, struct rtc_time *tm) -{ - return ds3234_set_datetime(dev, tm); -} - static const struct rtc_class_ops ds3234_rtc_ops = { .read_time = ds3234_read_time, .set_time = ds3234_set_time, @@ -193,31 +109,15 @@ static int __devinit ds3234_probe(struct spi_device *spi) { struct rtc_device *rtc; unsigned char tmp; - struct ds3234 *chip; int res; - rtc = rtc_device_register("ds3234", - &spi->dev, &ds3234_rtc_ops, THIS_MODULE); - if (IS_ERR(rtc)) - return PTR_ERR(rtc); - spi->mode = SPI_MODE_3; spi->bits_per_word = 8; spi_setup(spi); - chip = kzalloc(sizeof(struct ds3234), GFP_KERNEL); - if (!chip) { - rtc_device_unregister(rtc); - return -ENOMEM; - } - chip->rtc = rtc; - 
dev_set_drvdata(&spi->dev, chip); - res = ds3234_get_reg(&spi->dev, DS3234_REG_SECONDS, &tmp); - if (res) { - rtc_device_unregister(rtc); + if (res != 0) return res; - } /* Control settings * @@ -246,26 +146,27 @@ static int __devinit ds3234_probe(struct spi_device *spi) ds3234_get_reg(&spi->dev, DS3234_REG_CONT_STAT, &tmp); dev_info(&spi->dev, "Ctrl/Stat Reg: 0x%02x\n", tmp); + rtc = rtc_device_register("ds3234", + &spi->dev, &ds3234_rtc_ops, THIS_MODULE); + if (IS_ERR(rtc)) + return PTR_ERR(rtc); + + dev_set_drvdata(&spi->dev, rtc); + return 0; } static int __devexit ds3234_remove(struct spi_device *spi) { - struct ds3234 *chip = platform_get_drvdata(spi); - struct rtc_device *rtc = chip->rtc; - - if (rtc) - rtc_device_unregister(rtc); - - kfree(chip); + struct rtc_device *rtc = platform_get_drvdata(spi); + rtc_device_unregister(rtc); return 0; } static struct spi_driver ds3234_driver = { .driver = { .name = "ds3234", - .bus = &spi_bus_type, .owner = THIS_MODULE, }, .probe = ds3234_probe, @@ -274,7 +175,6 @@ static struct spi_driver ds3234_driver = { static __init int ds3234_init(void) { - printk(KERN_INFO "DS3234 SPI RTC Driver\n"); return spi_register_driver(&ds3234_driver); } module_init(ds3234_init); diff --git a/drivers/rtc/rtc-ep93xx.c b/drivers/rtc/rtc-ep93xx.c index 36e4ac0bd69..f7a3283dd02 100644 --- a/drivers/rtc/rtc-ep93xx.c +++ b/drivers/rtc/rtc-ep93xx.c @@ -49,18 +49,6 @@ static int ep93xx_rtc_set_mmss(struct device *dev, unsigned long secs) return 0; } -static int ep93xx_rtc_set_time(struct device *dev, struct rtc_time *tm) -{ - int err; - unsigned long secs; - - err = rtc_tm_to_time(tm, &secs); - if (err != 0) - return err; - - return ep93xx_rtc_set_mmss(dev, secs); -} - static int ep93xx_rtc_proc(struct device *dev, struct seq_file *seq) { unsigned short preload, delete; @@ -75,7 +63,6 @@ static int ep93xx_rtc_proc(struct device *dev, struct seq_file *seq) static const struct rtc_class_ops ep93xx_rtc_ops = { .read_time = ep93xx_rtc_read_time, - 
.set_time = ep93xx_rtc_set_time, .set_mmss = ep93xx_rtc_set_mmss, .proc = ep93xx_rtc_proc, }; diff --git a/drivers/rtc/rtc-m48t59.c b/drivers/rtc/rtc-m48t59.c index 43afb7ab528..33921a6b170 100644 --- a/drivers/rtc/rtc-m48t59.c +++ b/drivers/rtc/rtc-m48t59.c @@ -450,7 +450,7 @@ static int __devinit m48t59_rtc_probe(struct platform_device *pdev) * the mode without IRQ. */ m48t59->irq = platform_get_irq(pdev, 0); - if (m48t59->irq < 0) + if (m48t59->irq <= 0) m48t59->irq = NO_IRQ; if (m48t59->irq != NO_IRQ) { diff --git a/drivers/rtc/rtc-max6902.c b/drivers/rtc/rtc-max6902.c index 2f6507df7b4..36a8ea9ed8b 100644 --- a/drivers/rtc/rtc-max6902.c +++ b/drivers/rtc/rtc-max6902.c @@ -9,14 +9,6 @@ * * Driver for MAX6902 spi RTC * - * Changelog: - * - * 24-May-2006: Raphael Assenat <raph@8d.com> - * - Major rework - * Converted to rtc_device and uses the SPI layer. - * - * ??-???-2005: Someone at Compulab - * - Initial driver creation. */ #include <linux/module.h> @@ -26,7 +18,6 @@ #include <linux/rtc.h> #include <linux/spi/spi.h> #include <linux/bcd.h> -#include <linux/delay.h> #define MAX6902_REG_SECONDS 0x01 #define MAX6902_REG_MINUTES 0x03 @@ -38,16 +29,7 @@ #define MAX6902_REG_CONTROL 0x0F #define MAX6902_REG_CENTURY 0x13 -#undef MAX6902_DEBUG - -struct max6902 { - struct rtc_device *rtc; - u8 buf[9]; /* Burst read cmd + 8 registers */ - u8 tx_buf[2]; - u8 rx_buf[2]; -}; - -static void max6902_set_reg(struct device *dev, unsigned char address, +static int max6902_set_reg(struct device *dev, unsigned char address, unsigned char data) { struct spi_device *spi = to_spi_device(dev); @@ -57,113 +39,58 @@ static void max6902_set_reg(struct device *dev, unsigned char address, buf[0] = address & 0x7f; buf[1] = data; - spi_write(spi, buf, 2); + return spi_write_then_read(spi, buf, 2, NULL, 0); } static int max6902_get_reg(struct device *dev, unsigned char address, unsigned char *data) { struct spi_device *spi = to_spi_device(dev); - struct max6902 *chip = dev_get_drvdata(dev); 
- struct spi_message message; - struct spi_transfer xfer; - int status; - - if (!data) - return -EINVAL; - - /* Build our spi message */ - spi_message_init(&message); - memset(&xfer, 0, sizeof(xfer)); - xfer.len = 2; - /* Can tx_buf and rx_buf be equal? The doc in spi.h is not sure... */ - xfer.tx_buf = chip->tx_buf; - xfer.rx_buf = chip->rx_buf; /* Set MSB to indicate read */ - chip->tx_buf[0] = address | 0x80; - - spi_message_add_tail(&xfer, &message); + *data = address | 0x80; - /* do the i/o */ - status = spi_sync(spi, &message); - - if (status == 0) - *data = chip->rx_buf[1]; - return status; + return spi_write_then_read(spi, data, 1, data, 1); } -static int max6902_get_datetime(struct device *dev, struct rtc_time *dt) +static int max6902_read_time(struct device *dev, struct rtc_time *dt) { - unsigned char tmp; - int century; - int err; + int err, century; struct spi_device *spi = to_spi_device(dev); - struct max6902 *chip = dev_get_drvdata(dev); - struct spi_message message; - struct spi_transfer xfer; - int status; + unsigned char buf[8]; - err = max6902_get_reg(dev, MAX6902_REG_CENTURY, &tmp); - if (err) - return err; - - /* build the message */ - spi_message_init(&message); - memset(&xfer, 0, sizeof(xfer)); - xfer.len = 1 + 7; /* Burst read command + 7 registers */ - xfer.tx_buf = chip->buf; - xfer.rx_buf = chip->buf; - chip->buf[0] = 0xbf; /* Burst read */ - spi_message_add_tail(&xfer, &message); + buf[0] = 0xbf; /* Burst read */ - /* do the i/o */ - status = spi_sync(spi, &message); - if (status) - return status; + err = spi_write_then_read(spi, buf, 1, buf, 8); + if (err != 0) + return err; /* The chip sends data in this order: * Seconds, Minutes, Hours, Date, Month, Day, Year */ - dt->tm_sec = bcd2bin(chip->buf[1]); - dt->tm_min = bcd2bin(chip->buf[2]); - dt->tm_hour = bcd2bin(chip->buf[3]); - dt->tm_mday = bcd2bin(chip->buf[4]); - dt->tm_mon = bcd2bin(chip->buf[5]) - 1; - dt->tm_wday = bcd2bin(chip->buf[6]); - dt->tm_year = bcd2bin(chip->buf[7]); + 
dt->tm_sec = bcd2bin(buf[0]); + dt->tm_min = bcd2bin(buf[1]); + dt->tm_hour = bcd2bin(buf[2]); + dt->tm_mday = bcd2bin(buf[3]); + dt->tm_mon = bcd2bin(buf[4]) - 1; + dt->tm_wday = bcd2bin(buf[5]); + dt->tm_year = bcd2bin(buf[6]); + + /* Read century */ + err = max6902_get_reg(dev, MAX6902_REG_CENTURY, &buf[0]); + if (err != 0) + return err; - century = bcd2bin(tmp) * 100; + century = bcd2bin(buf[0]) * 100; dt->tm_year += century; dt->tm_year -= 1900; -#ifdef MAX6902_DEBUG - printk("\n%s : Read RTC values\n",__func__); - printk("tm_hour: %i\n",dt->tm_hour); - printk("tm_min : %i\n",dt->tm_min); - printk("tm_sec : %i\n",dt->tm_sec); - printk("tm_year: %i\n",dt->tm_year); - printk("tm_mon : %i\n",dt->tm_mon); - printk("tm_mday: %i\n",dt->tm_mday); - printk("tm_wday: %i\n",dt->tm_wday); -#endif - - return 0; + return rtc_valid_tm(dt); } -static int max6902_set_datetime(struct device *dev, struct rtc_time *dt) +static int max6902_set_time(struct device *dev, struct rtc_time *dt) { - dt->tm_year = dt->tm_year+1900; - -#ifdef MAX6902_DEBUG - printk("\n%s : Setting RTC values\n",__func__); - printk("tm_sec : %i\n",dt->tm_sec); - printk("tm_min : %i\n",dt->tm_min); - printk("tm_hour: %i\n",dt->tm_hour); - printk("tm_mday: %i\n",dt->tm_mday); - printk("tm_wday: %i\n",dt->tm_wday); - printk("tm_year: %i\n",dt->tm_year); -#endif + dt->tm_year = dt->tm_year + 1900; /* Remove write protection */ max6902_set_reg(dev, 0xF, 0); @@ -173,10 +100,10 @@ static int max6902_set_datetime(struct device *dev, struct rtc_time *dt) max6902_set_reg(dev, 0x05, bin2bcd(dt->tm_hour)); max6902_set_reg(dev, 0x07, bin2bcd(dt->tm_mday)); - max6902_set_reg(dev, 0x09, bin2bcd(dt->tm_mon+1)); + max6902_set_reg(dev, 0x09, bin2bcd(dt->tm_mon + 1)); max6902_set_reg(dev, 0x0B, bin2bcd(dt->tm_wday)); - max6902_set_reg(dev, 0x0D, bin2bcd(dt->tm_year%100)); - max6902_set_reg(dev, 0x13, bin2bcd(dt->tm_year/100)); + max6902_set_reg(dev, 0x0D, bin2bcd(dt->tm_year % 100)); + max6902_set_reg(dev, 0x13, 
bin2bcd(dt->tm_year / 100)); /* Compulab used a delay here. However, the datasheet * does not mention a delay being required anywhere... */ @@ -188,16 +115,6 @@ static int max6902_set_datetime(struct device *dev, struct rtc_time *dt) return 0; } -static int max6902_read_time(struct device *dev, struct rtc_time *tm) -{ - return max6902_get_datetime(dev, tm); -} - -static int max6902_set_time(struct device *dev, struct rtc_time *tm) -{ - return max6902_set_datetime(dev, tm); -} - static const struct rtc_class_ops max6902_rtc_ops = { .read_time = max6902_read_time, .set_time = max6902_set_time, @@ -207,45 +124,29 @@ static int __devinit max6902_probe(struct spi_device *spi) { struct rtc_device *rtc; unsigned char tmp; - struct max6902 *chip; int res; - rtc = rtc_device_register("max6902", - &spi->dev, &max6902_rtc_ops, THIS_MODULE); - if (IS_ERR(rtc)) - return PTR_ERR(rtc); - spi->mode = SPI_MODE_3; spi->bits_per_word = 8; spi_setup(spi); - chip = kzalloc(sizeof *chip, GFP_KERNEL); - if (!chip) { - rtc_device_unregister(rtc); - return -ENOMEM; - } - chip->rtc = rtc; - dev_set_drvdata(&spi->dev, chip); - res = max6902_get_reg(&spi->dev, MAX6902_REG_SECONDS, &tmp); - if (res) { - rtc_device_unregister(rtc); + if (res != 0) return res; - } + + rtc = rtc_device_register("max6902", + &spi->dev, &max6902_rtc_ops, THIS_MODULE); + if (IS_ERR(rtc)) + return PTR_ERR(rtc); return 0; } static int __devexit max6902_remove(struct spi_device *spi) { - struct max6902 *chip = platform_get_drvdata(spi); - struct rtc_device *rtc = chip->rtc; - - if (rtc) - rtc_device_unregister(rtc); - - kfree(chip); + struct rtc_device *rtc = platform_get_drvdata(spi); + rtc_device_unregister(rtc); return 0; } @@ -261,7 +162,6 @@ static struct spi_driver max6902_driver = { static __init int max6902_init(void) { - printk("max6902 spi driver\n"); return spi_register_driver(&max6902_driver); } module_init(max6902_init); diff --git a/drivers/rtc/rtc-mv.c b/drivers/rtc/rtc-mv.c new file mode 100644 index 
00000000000..45f12dcd371 --- /dev/null +++ b/drivers/rtc/rtc-mv.c @@ -0,0 +1,163 @@ +/* + * Driver for the RTC in Marvell SoCs. + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. + */ + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/rtc.h> +#include <linux/bcd.h> +#include <linux/io.h> +#include <linux/platform_device.h> + + +#define RTC_TIME_REG_OFFS 0 +#define RTC_SECONDS_OFFS 0 +#define RTC_MINUTES_OFFS 8 +#define RTC_HOURS_OFFS 16 +#define RTC_WDAY_OFFS 24 +#define RTC_HOURS_12H_MODE (1 << 22) /* 12 hours mode */ + +#define RTC_DATE_REG_OFFS 4 +#define RTC_MDAY_OFFS 0 +#define RTC_MONTH_OFFS 8 +#define RTC_YEAR_OFFS 16 + + +struct rtc_plat_data { + struct rtc_device *rtc; + void __iomem *ioaddr; +}; + +static int mv_rtc_set_time(struct device *dev, struct rtc_time *tm) +{ + struct rtc_plat_data *pdata = dev_get_drvdata(dev); + void __iomem *ioaddr = pdata->ioaddr; + u32 rtc_reg; + + rtc_reg = (bin2bcd(tm->tm_sec) << RTC_SECONDS_OFFS) | + (bin2bcd(tm->tm_min) << RTC_MINUTES_OFFS) | + (bin2bcd(tm->tm_hour) << RTC_HOURS_OFFS) | + (bin2bcd(tm->tm_wday) << RTC_WDAY_OFFS); + writel(rtc_reg, ioaddr + RTC_TIME_REG_OFFS); + + rtc_reg = (bin2bcd(tm->tm_mday) << RTC_MDAY_OFFS) | + (bin2bcd(tm->tm_mon + 1) << RTC_MONTH_OFFS) | + (bin2bcd(tm->tm_year % 100) << RTC_YEAR_OFFS); + writel(rtc_reg, ioaddr + RTC_DATE_REG_OFFS); + + return 0; +} + +static int mv_rtc_read_time(struct device *dev, struct rtc_time *tm) +{ + struct rtc_plat_data *pdata = dev_get_drvdata(dev); + void __iomem *ioaddr = pdata->ioaddr; + u32 rtc_time, rtc_date; + unsigned int year, month, day, hour, minute, second, wday; + + rtc_time = readl(ioaddr + RTC_TIME_REG_OFFS); + rtc_date = readl(ioaddr + RTC_DATE_REG_OFFS); + + second = rtc_time & 0x7f; + minute = (rtc_time >> RTC_MINUTES_OFFS) & 0x7f; + hour = (rtc_time >> RTC_HOURS_OFFS) & 0x3f; /* 
assume 24 hours mode */ + wday = (rtc_time >> RTC_WDAY_OFFS) & 0x7; + + day = rtc_date & 0x3f; + month = (rtc_date >> RTC_MONTH_OFFS) & 0x3f; + year = (rtc_date >> RTC_YEAR_OFFS) & 0xff; + + tm->tm_sec = bcd2bin(second); + tm->tm_min = bcd2bin(minute); + tm->tm_hour = bcd2bin(hour); + tm->tm_mday = bcd2bin(day); + tm->tm_wday = bcd2bin(wday); + tm->tm_mon = bcd2bin(month) - 1; + /* hw counts from year 2000, but tm_year is relative to 1900 */ + tm->tm_year = bcd2bin(year) + 100; + + return rtc_valid_tm(tm); +} + +static const struct rtc_class_ops mv_rtc_ops = { + .read_time = mv_rtc_read_time, + .set_time = mv_rtc_set_time, +}; + +static int __init mv_rtc_probe(struct platform_device *pdev) +{ + struct resource *res; + struct rtc_plat_data *pdata; + resource_size_t size; + u32 rtc_time; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) + return -ENODEV; + + pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); + if (!pdata) + return -ENOMEM; + + size = resource_size(res); + if (!devm_request_mem_region(&pdev->dev, res->start, size, + pdev->name)) + return -EBUSY; + + pdata->ioaddr = devm_ioremap(&pdev->dev, res->start, size); + if (!pdata->ioaddr) + return -ENOMEM; + + /* make sure the 24 hours mode is enabled */ + rtc_time = readl(pdata->ioaddr + RTC_TIME_REG_OFFS); + if (rtc_time & RTC_HOURS_12H_MODE) { + dev_err(&pdev->dev, "24 Hours mode not supported.\n"); + return -EINVAL; + } + + platform_set_drvdata(pdev, pdata); + pdata->rtc = rtc_device_register(pdev->name, &pdev->dev, + &mv_rtc_ops, THIS_MODULE); + if (IS_ERR(pdata->rtc)) + return PTR_ERR(pdata->rtc); + + return 0; +} + +static int __exit mv_rtc_remove(struct platform_device *pdev) +{ + struct rtc_plat_data *pdata = platform_get_drvdata(pdev); + + rtc_device_unregister(pdata->rtc); + return 0; +} + +static struct platform_driver mv_rtc_driver = { + .remove = __exit_p(mv_rtc_remove), + .driver = { + .name = "rtc-mv", + .owner = THIS_MODULE, + }, +}; + +static __init int 
mv_init(void) +{ + return platform_driver_probe(&mv_rtc_driver, mv_rtc_probe); +} + +static __exit void mv_exit(void) +{ + platform_driver_unregister(&mv_rtc_driver); +} + +module_init(mv_init); +module_exit(mv_exit); + +MODULE_AUTHOR("Saeed Bishara <saeed@marvell.com>"); +MODULE_DESCRIPTION("Marvell RTC driver"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:rtc-mv"); diff --git a/drivers/rtc/rtc-pxa.c b/drivers/rtc/rtc-pxa.c new file mode 100644 index 00000000000..cc7eb8767b8 --- /dev/null +++ b/drivers/rtc/rtc-pxa.c @@ -0,0 +1,489 @@ +/* + * Real Time Clock interface for XScale PXA27x and PXA3xx + * + * Copyright (C) 2008 Robert Jarzmik + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#include <linux/init.h> +#include <linux/platform_device.h> +#include <linux/module.h> +#include <linux/rtc.h> +#include <linux/seq_file.h> +#include <linux/interrupt.h> +#include <linux/io.h> + +#define TIMER_FREQ CLOCK_TICK_RATE +#define RTC_DEF_DIVIDER (32768 - 1) +#define RTC_DEF_TRIM 0 +#define MAXFREQ_PERIODIC 1000 + +/* + * PXA Registers and bits definitions + */ +#define RTSR_PICE (1 << 15) /* Periodic interrupt count enable */ +#define RTSR_PIALE (1 << 14) /* Periodic interrupt Alarm enable */ +#define RTSR_PIAL (1 << 13) /* Periodic interrupt detected */ +#define RTSR_SWALE2 (1 << 11) /* RTC stopwatch alarm2 enable */ +#define RTSR_SWAL2 (1 << 10) /* RTC stopwatch alarm2 detected */ +#define RTSR_SWALE1 (1 << 9) /* RTC stopwatch alarm1 enable */ +#define RTSR_SWAL1 (1 << 8) /* RTC stopwatch alarm1 detected */ +#define RTSR_RDALE2 (1 << 7) /* RTC alarm2 enable */ +#define RTSR_RDAL2 (1 << 6) /* RTC alarm2 detected */ +#define RTSR_RDALE1 (1 << 5) /* RTC alarm1 enable */ +#define RTSR_RDAL1 (1 << 4) /* RTC alarm1 detected */ +#define RTSR_HZE (1 << 3) /* HZ interrupt enable */ +#define RTSR_ALE (1 << 2) /* RTC alarm interrupt enable */ +#define RTSR_HZ (1 << 1) /* HZ rising-edge detected */ +#define RTSR_AL (1 << 0) /* RTC alarm detected */ +#define RTSR_TRIG_MASK (RTSR_AL | RTSR_HZ | RTSR_RDAL1 | RTSR_RDAL2\ + | RTSR_SWAL1 | RTSR_SWAL2) +#define RYxR_YEAR_S 9 +#define RYxR_YEAR_MASK (0xfff << RYxR_YEAR_S) +#define RYxR_MONTH_S 5 +#define RYxR_MONTH_MASK (0xf << RYxR_MONTH_S) +#define RYxR_DAY_MASK 0x1f +#define RDxR_HOUR_S 12 +#define RDxR_HOUR_MASK (0x1f << RDxR_HOUR_S) +#define RDxR_MIN_S 6 +#define RDxR_MIN_MASK (0x3f << RDxR_MIN_S) +#define RDxR_SEC_MASK 0x3f + +#define RTSR 0x08 +#define RTTR 0x0c +#define RDCR 0x10 +#define RYCR 
0x14 +#define RDAR1 0x18 +#define RYAR1 0x1c +#define RTCPICR 0x34 +#define PIAR 0x38 + +#define rtc_readl(pxa_rtc, reg) \ + __raw_readl((pxa_rtc)->base + (reg)) +#define rtc_writel(pxa_rtc, reg, value) \ + __raw_writel((value), (pxa_rtc)->base + (reg)) + +struct pxa_rtc { + struct resource *ress; + void __iomem *base; + int irq_1Hz; + int irq_Alrm; + struct rtc_device *rtc; + spinlock_t lock; /* Protects this structure */ + struct rtc_time rtc_alarm; +}; + +static u32 ryxr_calc(struct rtc_time *tm) +{ + return ((tm->tm_year + 1900) << RYxR_YEAR_S) + | ((tm->tm_mon + 1) << RYxR_MONTH_S) + | tm->tm_mday; +} + +static u32 rdxr_calc(struct rtc_time *tm) +{ + return (tm->tm_hour << RDxR_HOUR_S) | (tm->tm_min << RDxR_MIN_S) + | tm->tm_sec; +} + +static void tm_calc(u32 rycr, u32 rdcr, struct rtc_time *tm) +{ + tm->tm_year = ((rycr & RYxR_YEAR_MASK) >> RYxR_YEAR_S) - 1900; + tm->tm_mon = (((rycr & RYxR_MONTH_MASK) >> RYxR_MONTH_S)) - 1; + tm->tm_mday = (rycr & RYxR_DAY_MASK); + tm->tm_hour = (rdcr & RDxR_HOUR_MASK) >> RDxR_HOUR_S; + tm->tm_min = (rdcr & RDxR_MIN_MASK) >> RDxR_MIN_S; + tm->tm_sec = rdcr & RDxR_SEC_MASK; +} + +static void rtsr_clear_bits(struct pxa_rtc *pxa_rtc, u32 mask) +{ + u32 rtsr; + + rtsr = rtc_readl(pxa_rtc, RTSR); + rtsr &= ~RTSR_TRIG_MASK; + rtsr &= ~mask; + rtc_writel(pxa_rtc, RTSR, rtsr); +} + +static void rtsr_set_bits(struct pxa_rtc *pxa_rtc, u32 mask) +{ + u32 rtsr; + + rtsr = rtc_readl(pxa_rtc, RTSR); + rtsr &= ~RTSR_TRIG_MASK; + rtsr |= mask; + rtc_writel(pxa_rtc, RTSR, rtsr); +} + +static irqreturn_t pxa_rtc_irq(int irq, void *dev_id) +{ + struct platform_device *pdev = to_platform_device(dev_id); + struct pxa_rtc *pxa_rtc = platform_get_drvdata(pdev); + u32 rtsr; + unsigned long events = 0; + + spin_lock(&pxa_rtc->lock); + + /* clear interrupt sources */ + rtsr = rtc_readl(pxa_rtc, RTSR); + rtc_writel(pxa_rtc, RTSR, rtsr); + + /* temporary disable rtc interrupts */ + rtsr_clear_bits(pxa_rtc, RTSR_RDALE1 | RTSR_PIALE | RTSR_HZE); + + /* 
clear alarm interrupt if it has occurred */ + if (rtsr & RTSR_RDAL1) + rtsr &= ~RTSR_RDALE1; + + /* update irq data & counter */ + if (rtsr & RTSR_RDAL1) + events |= RTC_AF | RTC_IRQF; + if (rtsr & RTSR_HZ) + events |= RTC_UF | RTC_IRQF; + if (rtsr & RTSR_PIAL) + events |= RTC_PF | RTC_IRQF; + + rtc_update_irq(pxa_rtc->rtc, 1, events); + + /* enable back rtc interrupts */ + rtc_writel(pxa_rtc, RTSR, rtsr & ~RTSR_TRIG_MASK); + + spin_unlock(&pxa_rtc->lock); + return IRQ_HANDLED; +} + +static int pxa_rtc_open(struct device *dev) +{ + struct pxa_rtc *pxa_rtc = dev_get_drvdata(dev); + int ret; + + ret = request_irq(pxa_rtc->irq_1Hz, pxa_rtc_irq, IRQF_DISABLED, + "rtc 1Hz", dev); + if (ret < 0) { + dev_err(dev, "can't get irq %i, err %d\n", pxa_rtc->irq_1Hz, + ret); + goto err_irq_1Hz; + } + ret = request_irq(pxa_rtc->irq_Alrm, pxa_rtc_irq, IRQF_DISABLED, + "rtc Alrm", dev); + if (ret < 0) { + dev_err(dev, "can't get irq %i, err %d\n", pxa_rtc->irq_Alrm, + ret); + goto err_irq_Alrm; + } + + return 0; + +err_irq_Alrm: + free_irq(pxa_rtc->irq_1Hz, dev); +err_irq_1Hz: + return ret; +} + +static void pxa_rtc_release(struct device *dev) +{ + struct pxa_rtc *pxa_rtc = dev_get_drvdata(dev); + + spin_lock_irq(&pxa_rtc->lock); + rtsr_clear_bits(pxa_rtc, RTSR_PIALE | RTSR_RDALE1 | RTSR_HZE); + spin_unlock_irq(&pxa_rtc->lock); + + free_irq(pxa_rtc->irq_Alrm, dev); + free_irq(pxa_rtc->irq_1Hz, dev); +} + +static int pxa_periodic_irq_set_freq(struct device *dev, int freq) +{ + struct pxa_rtc *pxa_rtc = dev_get_drvdata(dev); + int period_ms; + + if (freq < 1 || freq > MAXFREQ_PERIODIC) + return -EINVAL; + + period_ms = 1000 / freq; + rtc_writel(pxa_rtc, PIAR, period_ms); + + return 0; +} + +static int pxa_periodic_irq_set_state(struct device *dev, int enabled) +{ + struct pxa_rtc *pxa_rtc = dev_get_drvdata(dev); + + if (enabled) + rtsr_set_bits(pxa_rtc, RTSR_PIALE | RTSR_PICE); + else + rtsr_clear_bits(pxa_rtc, RTSR_PIALE | RTSR_PICE); + + return 0; +} + +static int 
pxa_rtc_ioctl(struct device *dev, unsigned int cmd, + unsigned long arg) +{ + struct pxa_rtc *pxa_rtc = dev_get_drvdata(dev); + int ret = 0; + + spin_lock_irq(&pxa_rtc->lock); + switch (cmd) { + case RTC_AIE_OFF: + rtsr_clear_bits(pxa_rtc, RTSR_RDALE1); + break; + case RTC_AIE_ON: + rtsr_set_bits(pxa_rtc, RTSR_RDALE1); + break; + case RTC_UIE_OFF: + rtsr_clear_bits(pxa_rtc, RTSR_HZE); + break; + case RTC_UIE_ON: + rtsr_set_bits(pxa_rtc, RTSR_HZE); + break; + default: + ret = -ENOIOCTLCMD; + } + + spin_unlock_irq(&pxa_rtc->lock); + return ret; +} + +static int pxa_rtc_read_time(struct device *dev, struct rtc_time *tm) +{ + struct pxa_rtc *pxa_rtc = dev_get_drvdata(dev); + u32 rycr, rdcr; + + rycr = rtc_readl(pxa_rtc, RYCR); + rdcr = rtc_readl(pxa_rtc, RDCR); + + tm_calc(rycr, rdcr, tm); + return 0; +} + +static int pxa_rtc_set_time(struct device *dev, struct rtc_time *tm) +{ + struct pxa_rtc *pxa_rtc = dev_get_drvdata(dev); + + rtc_writel(pxa_rtc, RYCR, ryxr_calc(tm)); + rtc_writel(pxa_rtc, RDCR, rdxr_calc(tm)); + + return 0; +} + +static int pxa_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) +{ + struct pxa_rtc *pxa_rtc = dev_get_drvdata(dev); + u32 rtsr, ryar, rdar; + + ryar = rtc_readl(pxa_rtc, RYAR1); + rdar = rtc_readl(pxa_rtc, RDAR1); + tm_calc(ryar, rdar, &alrm->time); + + rtsr = rtc_readl(pxa_rtc, RTSR); + alrm->enabled = (rtsr & RTSR_RDALE1) ? 1 : 0; + alrm->pending = (rtsr & RTSR_RDAL1) ? 
1 : 0; + return 0; +} + +static int pxa_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) +{ + struct pxa_rtc *pxa_rtc = dev_get_drvdata(dev); + u32 rtsr; + + spin_lock_irq(&pxa_rtc->lock); + + rtc_writel(pxa_rtc, RYAR1, ryxr_calc(&alrm->time)); + rtc_writel(pxa_rtc, RDAR1, rdxr_calc(&alrm->time)); + + rtsr = rtc_readl(pxa_rtc, RTSR); + if (alrm->enabled) + rtsr |= RTSR_RDALE1; + else + rtsr &= ~RTSR_RDALE1; + rtc_writel(pxa_rtc, RTSR, rtsr); + + spin_unlock_irq(&pxa_rtc->lock); + + return 0; +} + +static int pxa_rtc_proc(struct device *dev, struct seq_file *seq) +{ + struct pxa_rtc *pxa_rtc = dev_get_drvdata(dev); + + seq_printf(seq, "trim/divider\t: 0x%08x\n", rtc_readl(pxa_rtc, RTTR)); + seq_printf(seq, "update_IRQ\t: %s\n", + (rtc_readl(pxa_rtc, RTSR) & RTSR_HZE) ? "yes" : "no"); + seq_printf(seq, "periodic_IRQ\t: %s\n", + (rtc_readl(pxa_rtc, RTSR) & RTSR_PIALE) ? "yes" : "no"); + seq_printf(seq, "periodic_freq\t: %u\n", rtc_readl(pxa_rtc, PIAR)); + + return 0; +} + +static const struct rtc_class_ops pxa_rtc_ops = { + .open = pxa_rtc_open, + .release = pxa_rtc_release, + .ioctl = pxa_rtc_ioctl, + .read_time = pxa_rtc_read_time, + .set_time = pxa_rtc_set_time, + .read_alarm = pxa_rtc_read_alarm, + .set_alarm = pxa_rtc_set_alarm, + .proc = pxa_rtc_proc, + .irq_set_state = pxa_periodic_irq_set_state, + .irq_set_freq = pxa_periodic_irq_set_freq, +}; + +static int __init pxa_rtc_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct pxa_rtc *pxa_rtc; + int ret; + u32 rttr; + + pxa_rtc = kzalloc(sizeof(struct pxa_rtc), GFP_KERNEL); + if (!pxa_rtc) + return -ENOMEM; + + spin_lock_init(&pxa_rtc->lock); + platform_set_drvdata(pdev, pxa_rtc); + + ret = -ENXIO; + pxa_rtc->ress = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!pxa_rtc->ress) { + dev_err(dev, "No I/O memory resource defined\n"); + goto err_ress; + } + + pxa_rtc->irq_1Hz = platform_get_irq(pdev, 0); + if (pxa_rtc->irq_1Hz < 0) { + dev_err(dev, "No 1Hz IRQ resource 
defined\n"); + goto err_ress; + } + pxa_rtc->irq_Alrm = platform_get_irq(pdev, 1); + if (pxa_rtc->irq_Alrm < 0) { + dev_err(dev, "No alarm IRQ resource defined\n"); + goto err_ress; + } + + ret = -ENOMEM; + pxa_rtc->base = ioremap(pxa_rtc->ress->start, + resource_size(pxa_rtc->ress)); + if (!pxa_rtc->base) { + dev_err(&pdev->dev, "Unable to map pxa RTC I/O memory\n"); + goto err_map; + } + + /* + * If the clock divider is uninitialized then reset it to the + * default value to get the 1Hz clock. + */ + if (rtc_readl(pxa_rtc, RTTR) == 0) { + rttr = RTC_DEF_DIVIDER + (RTC_DEF_TRIM << 16); + rtc_writel(pxa_rtc, RTTR, rttr); + dev_warn(dev, "warning: initializing default clock" + " divider/trim value\n"); + } + + rtsr_clear_bits(pxa_rtc, RTSR_PIALE | RTSR_RDALE1 | RTSR_HZE); + + pxa_rtc->rtc = rtc_device_register("pxa-rtc", &pdev->dev, &pxa_rtc_ops, + THIS_MODULE); + ret = PTR_ERR(pxa_rtc->rtc); + if (IS_ERR(pxa_rtc->rtc)) { + dev_err(dev, "Failed to register RTC device -> %d\n", ret); + goto err_rtc_reg; + } + + device_init_wakeup(dev, 1); + + return 0; + +err_rtc_reg: + iounmap(pxa_rtc->base); +err_ress: +err_map: + kfree(pxa_rtc); + return ret; +} + +static int __exit pxa_rtc_remove(struct platform_device *pdev) +{ + struct pxa_rtc *pxa_rtc = platform_get_drvdata(pdev); + + rtc_device_unregister(pxa_rtc->rtc); + + spin_lock_irq(&pxa_rtc->lock); + iounmap(pxa_rtc->base); + spin_unlock_irq(&pxa_rtc->lock); + + kfree(pxa_rtc); + + return 0; +} + +#ifdef CONFIG_PM +static int pxa_rtc_suspend(struct platform_device *pdev, pm_message_t state) +{ + struct pxa_rtc *pxa_rtc = platform_get_drvdata(pdev); + + if (device_may_wakeup(&pdev->dev)) + enable_irq_wake(pxa_rtc->irq_Alrm); + return 0; +} + +static int pxa_rtc_resume(struct platform_device *pdev) +{ + struct pxa_rtc *pxa_rtc = platform_get_drvdata(pdev); + + if (device_may_wakeup(&pdev->dev)) + disable_irq_wake(pxa_rtc->irq_Alrm); + return 0; +} +#else +#define pxa_rtc_suspend NULL +#define pxa_rtc_resume NULL +#endif + 
+static struct platform_driver pxa_rtc_driver = { + .remove = __exit_p(pxa_rtc_remove), + .suspend = pxa_rtc_suspend, + .resume = pxa_rtc_resume, + .driver = { + .name = "pxa-rtc", + }, +}; + +static int __init pxa_rtc_init(void) +{ + if (cpu_is_pxa27x() || cpu_is_pxa3xx()) + return platform_driver_probe(&pxa_rtc_driver, pxa_rtc_probe); + + return -ENODEV; +} + +static void __exit pxa_rtc_exit(void) +{ + platform_driver_unregister(&pxa_rtc_driver); +} + +module_init(pxa_rtc_init); +module_exit(pxa_rtc_exit); + +MODULE_AUTHOR("Robert Jarzmik"); +MODULE_DESCRIPTION("PXA27x/PXA3xx Realtime Clock Driver (RTC)"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:pxa-rtc"); diff --git a/drivers/rtc/rtc-s3c.c b/drivers/rtc/rtc-s3c.c index 7a568beba3f..e0d7b999150 100644 --- a/drivers/rtc/rtc-s3c.c +++ b/drivers/rtc/rtc-s3c.c @@ -94,6 +94,9 @@ static int s3c_rtc_setfreq(struct device *dev, int freq) { unsigned int tmp; + if (!is_power_of_2(freq)) + return -EINVAL; + spin_lock_irq(&s3c_rtc_pie_lock); tmp = readb(s3c_rtc_base + S3C2410_TICNT) & S3C2410_TICNT_ENABLE; diff --git a/drivers/rtc/rtc-sh.c b/drivers/rtc/rtc-sh.c index aaf9d6a337c..1c3fc6b428e 100644 --- a/drivers/rtc/rtc-sh.c +++ b/drivers/rtc/rtc-sh.c @@ -24,6 +24,7 @@ #include <linux/interrupt.h> #include <linux/spinlock.h> #include <linux/io.h> +#include <linux/log2.h> #include <asm/rtc.h> #define DRV_NAME "sh-rtc" @@ -89,7 +90,9 @@ struct sh_rtc { void __iomem *regbase; unsigned long regsize; struct resource *res; - unsigned int alarm_irq, periodic_irq, carry_irq; + int alarm_irq; + int periodic_irq; + int carry_irq; struct rtc_device *rtc_dev; spinlock_t lock; unsigned long capabilities; /* See asm-sh/rtc.h for cap bits */ @@ -549,6 +552,8 @@ static int sh_rtc_irq_set_state(struct device *dev, int enabled) static int sh_rtc_irq_set_freq(struct device *dev, int freq) { + if (!is_power_of_2(freq)) + return -EINVAL; return sh_rtc_ioctl(dev, RTC_IRQP_SET, freq); } @@ -578,7 +583,7 @@ static int __devinit 
sh_rtc_probe(struct platform_device *pdev) /* get periodic/carry/alarm irqs */ ret = platform_get_irq(pdev, 0); - if (unlikely(ret < 0)) { + if (unlikely(ret <= 0)) { ret = -ENOENT; dev_err(&pdev->dev, "No IRQ for period\n"); goto err_badres; @@ -586,7 +591,7 @@ static int __devinit sh_rtc_probe(struct platform_device *pdev) rtc->periodic_irq = ret; ret = platform_get_irq(pdev, 1); - if (unlikely(ret < 0)) { + if (unlikely(ret <= 0)) { ret = -ENOENT; dev_err(&pdev->dev, "No IRQ for carry\n"); goto err_badres; @@ -594,7 +599,7 @@ static int __devinit sh_rtc_probe(struct platform_device *pdev) rtc->carry_irq = ret; ret = platform_get_irq(pdev, 2); - if (unlikely(ret < 0)) { + if (unlikely(ret <= 0)) { ret = -ENOENT; dev_err(&pdev->dev, "No IRQ for alarm\n"); goto err_badres; diff --git a/drivers/rtc/rtc-stk17ta8.c b/drivers/rtc/rtc-stk17ta8.c index f4cd46e15af..dc0b6224ad9 100644 --- a/drivers/rtc/rtc-stk17ta8.c +++ b/drivers/rtc/rtc-stk17ta8.c @@ -170,7 +170,7 @@ static int stk17ta8_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) struct platform_device *pdev = to_platform_device(dev); struct rtc_plat_data *pdata = platform_get_drvdata(pdev); - if (pdata->irq < 0) + if (pdata->irq <= 0) return -EINVAL; pdata->alrm_mday = alrm->time.tm_mday; pdata->alrm_hour = alrm->time.tm_hour; @@ -187,7 +187,7 @@ static int stk17ta8_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) struct platform_device *pdev = to_platform_device(dev); struct rtc_plat_data *pdata = platform_get_drvdata(pdev); - if (pdata->irq < 0) + if (pdata->irq <= 0) return -EINVAL; alrm->time.tm_mday = pdata->alrm_mday < 0 ? 0 : pdata->alrm_mday; alrm->time.tm_hour = pdata->alrm_hour < 0 ? 
0 : pdata->alrm_hour; @@ -221,7 +221,7 @@ static int stk17ta8_rtc_ioctl(struct device *dev, unsigned int cmd, struct platform_device *pdev = to_platform_device(dev); struct rtc_plat_data *pdata = platform_get_drvdata(pdev); - if (pdata->irq < 0) + if (pdata->irq <= 0) return -ENOIOCTLCMD; /* fall back into rtc-dev's emulation */ switch (cmd) { case RTC_AIE_OFF: @@ -303,7 +303,6 @@ static int __init stk17ta8_rtc_probe(struct platform_device *pdev) pdata = kzalloc(sizeof(*pdata), GFP_KERNEL); if (!pdata) return -ENOMEM; - pdata->irq = -1; if (!request_mem_region(res->start, RTC_REG_SIZE, pdev->name)) { ret = -EBUSY; goto out; @@ -329,13 +328,13 @@ static int __init stk17ta8_rtc_probe(struct platform_device *pdev) if (readb(ioaddr + RTC_FLAGS) & RTC_FLAGS_PF) dev_warn(&pdev->dev, "voltage-low detected.\n"); - if (pdata->irq >= 0) { + if (pdata->irq > 0) { writeb(0, ioaddr + RTC_INTERRUPTS); if (request_irq(pdata->irq, stk17ta8_rtc_interrupt, IRQF_DISABLED | IRQF_SHARED, pdev->name, pdev) < 0) { dev_warn(&pdev->dev, "interrupt not available.\n"); - pdata->irq = -1; + pdata->irq = 0; } } @@ -355,7 +354,7 @@ static int __init stk17ta8_rtc_probe(struct platform_device *pdev) out: if (pdata->rtc) rtc_device_unregister(pdata->rtc); - if (pdata->irq >= 0) + if (pdata->irq > 0) free_irq(pdata->irq, pdev); if (ioaddr) iounmap(ioaddr); @@ -371,7 +370,7 @@ static int __devexit stk17ta8_rtc_remove(struct platform_device *pdev) sysfs_remove_bin_file(&pdev->dev.kobj, &stk17ta8_nvram_attr); rtc_device_unregister(pdata->rtc); - if (pdata->irq >= 0) { + if (pdata->irq > 0) { writeb(0, pdata->ioaddr + RTC_INTERRUPTS); free_irq(pdata->irq, pdev); } diff --git a/drivers/rtc/rtc-test.c b/drivers/rtc/rtc-test.c index bc930022004..e478280ff62 100644 --- a/drivers/rtc/rtc-test.c +++ b/drivers/rtc/rtc-test.c @@ -34,14 +34,9 @@ static int test_rtc_read_time(struct device *dev, return 0; } -static int test_rtc_set_time(struct device *dev, - struct rtc_time *tm) -{ - return 0; -} - static int 
test_rtc_set_mmss(struct device *dev, unsigned long secs) { + dev_info(dev, "%s, secs = %lu\n", __func__, secs); return 0; } @@ -78,7 +73,6 @@ static int test_rtc_ioctl(struct device *dev, unsigned int cmd, static const struct rtc_class_ops test_rtc_ops = { .proc = test_rtc_proc, .read_time = test_rtc_read_time, - .set_time = test_rtc_set_time, .read_alarm = test_rtc_read_alarm, .set_alarm = test_rtc_set_alarm, .set_mmss = test_rtc_set_mmss, diff --git a/drivers/rtc/rtc-twl4030.c b/drivers/rtc/rtc-twl4030.c index 01d8da9afdc..8ce5f74ee45 100644 --- a/drivers/rtc/rtc-twl4030.c +++ b/drivers/rtc/rtc-twl4030.c @@ -19,6 +19,7 @@ */ #include <linux/kernel.h> +#include <linux/errno.h> #include <linux/init.h> #include <linux/module.h> #include <linux/types.h> @@ -415,8 +416,8 @@ static int __devinit twl4030_rtc_probe(struct platform_device *pdev) int irq = platform_get_irq(pdev, 0); u8 rd_reg; - if (irq < 0) - return irq; + if (irq <= 0) + return -EINVAL; rtc = rtc_device_register(pdev->name, &pdev->dev, &twl4030_rtc_ops, THIS_MODULE); diff --git a/drivers/rtc/rtc-tx4939.c b/drivers/rtc/rtc-tx4939.c new file mode 100644 index 00000000000..4ee4857ff20 --- /dev/null +++ b/drivers/rtc/rtc-tx4939.c @@ -0,0 +1,317 @@ +/* + * TX4939 internal RTC driver + * Based on RBTX49xx patch from CELF patch archive. + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. 
+ * + * (C) Copyright TOSHIBA CORPORATION 2005-2007 + */ +#include <linux/rtc.h> +#include <linux/platform_device.h> +#include <linux/interrupt.h> +#include <linux/io.h> +#include <asm/txx9/tx4939.h> + +struct tx4939rtc_plat_data { + struct rtc_device *rtc; + struct tx4939_rtc_reg __iomem *rtcreg; +}; + +static struct tx4939rtc_plat_data *get_tx4939rtc_plat_data(struct device *dev) +{ + return platform_get_drvdata(to_platform_device(dev)); +} + +static int tx4939_rtc_cmd(struct tx4939_rtc_reg __iomem *rtcreg, int cmd) +{ + int i = 0; + + __raw_writel(cmd, &rtcreg->ctl); + /* This might take 30us (next 32.768KHz clock) */ + while (__raw_readl(&rtcreg->ctl) & TX4939_RTCCTL_BUSY) { + /* timeout on approx. 100us (@ GBUS200MHz) */ + if (i++ > 200 * 100) + return -EBUSY; + cpu_relax(); + } + return 0; +} + +static int tx4939_rtc_set_mmss(struct device *dev, unsigned long secs) +{ + struct tx4939rtc_plat_data *pdata = get_tx4939rtc_plat_data(dev); + struct tx4939_rtc_reg __iomem *rtcreg = pdata->rtcreg; + int i, ret; + unsigned char buf[6]; + + buf[0] = 0; + buf[1] = 0; + buf[2] = secs; + buf[3] = secs >> 8; + buf[4] = secs >> 16; + buf[5] = secs >> 24; + spin_lock_irq(&pdata->rtc->irq_lock); + __raw_writel(0, &rtcreg->adr); + for (i = 0; i < 6; i++) + __raw_writel(buf[i], &rtcreg->dat); + ret = tx4939_rtc_cmd(rtcreg, + TX4939_RTCCTL_COMMAND_SETTIME | + (__raw_readl(&rtcreg->ctl) & TX4939_RTCCTL_ALME)); + spin_unlock_irq(&pdata->rtc->irq_lock); + return ret; +} + +static int tx4939_rtc_read_time(struct device *dev, struct rtc_time *tm) +{ + struct tx4939rtc_plat_data *pdata = get_tx4939rtc_plat_data(dev); + struct tx4939_rtc_reg __iomem *rtcreg = pdata->rtcreg; + int i, ret; + unsigned long sec; + unsigned char buf[6]; + + spin_lock_irq(&pdata->rtc->irq_lock); + ret = tx4939_rtc_cmd(rtcreg, + TX4939_RTCCTL_COMMAND_GETTIME | + (__raw_readl(&rtcreg->ctl) & TX4939_RTCCTL_ALME)); + if (ret) { + spin_unlock_irq(&pdata->rtc->irq_lock); + return ret; + } + __raw_writel(2, 
&rtcreg->adr); + for (i = 2; i < 6; i++) + buf[i] = __raw_readl(&rtcreg->dat); + spin_unlock_irq(&pdata->rtc->irq_lock); + sec = (buf[5] << 24) | (buf[4] << 16) | (buf[3] << 8) | buf[2]; + rtc_time_to_tm(sec, tm); + return rtc_valid_tm(tm); +} + +static int tx4939_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) +{ + struct tx4939rtc_plat_data *pdata = get_tx4939rtc_plat_data(dev); + struct tx4939_rtc_reg __iomem *rtcreg = pdata->rtcreg; + int i, ret; + unsigned long sec; + unsigned char buf[6]; + + if (alrm->time.tm_sec < 0 || + alrm->time.tm_min < 0 || + alrm->time.tm_hour < 0 || + alrm->time.tm_mday < 0 || + alrm->time.tm_mon < 0 || + alrm->time.tm_year < 0) + return -EINVAL; + rtc_tm_to_time(&alrm->time, &sec); + buf[0] = 0; + buf[1] = 0; + buf[2] = sec; + buf[3] = sec >> 8; + buf[4] = sec >> 16; + buf[5] = sec >> 24; + spin_lock_irq(&pdata->rtc->irq_lock); + __raw_writel(0, &rtcreg->adr); + for (i = 0; i < 6; i++) + __raw_writel(buf[i], &rtcreg->dat); + ret = tx4939_rtc_cmd(rtcreg, TX4939_RTCCTL_COMMAND_SETALARM | + (alrm->enabled ? TX4939_RTCCTL_ALME : 0)); + spin_unlock_irq(&pdata->rtc->irq_lock); + return ret; +} + +static int tx4939_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) +{ + struct tx4939rtc_plat_data *pdata = get_tx4939rtc_plat_data(dev); + struct tx4939_rtc_reg __iomem *rtcreg = pdata->rtcreg; + int i, ret; + unsigned long sec; + unsigned char buf[6]; + u32 ctl; + + spin_lock_irq(&pdata->rtc->irq_lock); + ret = tx4939_rtc_cmd(rtcreg, + TX4939_RTCCTL_COMMAND_GETALARM | + (__raw_readl(&rtcreg->ctl) & TX4939_RTCCTL_ALME)); + if (ret) { + spin_unlock_irq(&pdata->rtc->irq_lock); + return ret; + } + __raw_writel(2, &rtcreg->adr); + for (i = 2; i < 6; i++) + buf[i] = __raw_readl(&rtcreg->dat); + ctl = __raw_readl(&rtcreg->ctl); + alrm->enabled = (ctl & TX4939_RTCCTL_ALME) ? 1 : 0; + alrm->pending = (ctl & TX4939_RTCCTL_ALMD) ? 
1 : 0; + spin_unlock_irq(&pdata->rtc->irq_lock); + sec = (buf[5] << 24) | (buf[4] << 16) | (buf[3] << 8) | buf[2]; + rtc_time_to_tm(sec, &alrm->time); + return rtc_valid_tm(&alrm->time); +} + +static int tx4939_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) +{ + struct tx4939rtc_plat_data *pdata = get_tx4939rtc_plat_data(dev); + + spin_lock_irq(&pdata->rtc->irq_lock); + tx4939_rtc_cmd(pdata->rtcreg, + TX4939_RTCCTL_COMMAND_NOP | + (enabled ? TX4939_RTCCTL_ALME : 0)); + spin_unlock_irq(&pdata->rtc->irq_lock); + return 0; +} + +static irqreturn_t tx4939_rtc_interrupt(int irq, void *dev_id) +{ + struct tx4939rtc_plat_data *pdata = get_tx4939rtc_plat_data(dev_id); + struct tx4939_rtc_reg __iomem *rtcreg = pdata->rtcreg; + unsigned long events = RTC_IRQF; + + spin_lock(&pdata->rtc->irq_lock); + if (__raw_readl(&rtcreg->ctl) & TX4939_RTCCTL_ALMD) { + events |= RTC_AF; + tx4939_rtc_cmd(rtcreg, TX4939_RTCCTL_COMMAND_NOP); + } + spin_unlock(&pdata->rtc->irq_lock); + rtc_update_irq(pdata->rtc, 1, events); + return IRQ_HANDLED; +} + +static const struct rtc_class_ops tx4939_rtc_ops = { + .read_time = tx4939_rtc_read_time, + .read_alarm = tx4939_rtc_read_alarm, + .set_alarm = tx4939_rtc_set_alarm, + .set_mmss = tx4939_rtc_set_mmss, + .alarm_irq_enable = tx4939_rtc_alarm_irq_enable, +}; + +static ssize_t tx4939_rtc_nvram_read(struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t pos, size_t size) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct tx4939rtc_plat_data *pdata = get_tx4939rtc_plat_data(dev); + struct tx4939_rtc_reg __iomem *rtcreg = pdata->rtcreg; + ssize_t count; + + spin_lock_irq(&pdata->rtc->irq_lock); + for (count = 0; size > 0 && pos < TX4939_RTC_REG_RAMSIZE; + count++, size--) { + __raw_writel(pos++, &rtcreg->adr); + *buf++ = __raw_readl(&rtcreg->dat); + } + spin_unlock_irq(&pdata->rtc->irq_lock); + return count; +} + +static ssize_t tx4939_rtc_nvram_write(struct kobject *kobj, + struct 
bin_attribute *bin_attr, + char *buf, loff_t pos, size_t size) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct tx4939rtc_plat_data *pdata = get_tx4939rtc_plat_data(dev); + struct tx4939_rtc_reg __iomem *rtcreg = pdata->rtcreg; + ssize_t count; + + spin_lock_irq(&pdata->rtc->irq_lock); + for (count = 0; size > 0 && pos < TX4939_RTC_REG_RAMSIZE; + count++, size--) { + __raw_writel(pos++, &rtcreg->adr); + __raw_writel(*buf++, &rtcreg->dat); + } + spin_unlock_irq(&pdata->rtc->irq_lock); + return count; +} + +static struct bin_attribute tx4939_rtc_nvram_attr = { + .attr = { + .name = "nvram", + .mode = S_IRUGO | S_IWUSR, + }, + .size = TX4939_RTC_REG_RAMSIZE, + .read = tx4939_rtc_nvram_read, + .write = tx4939_rtc_nvram_write, +}; + +static int __init tx4939_rtc_probe(struct platform_device *pdev) +{ + struct rtc_device *rtc; + struct tx4939rtc_plat_data *pdata; + struct resource *res; + int irq, ret; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) + return -ENODEV; + irq = platform_get_irq(pdev, 0); + if (irq < 0) + return -ENODEV; + pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); + if (!pdata) + return -ENOMEM; + platform_set_drvdata(pdev, pdata); + + if (!devm_request_mem_region(&pdev->dev, res->start, + resource_size(res), pdev->name)) + return -EBUSY; + pdata->rtcreg = devm_ioremap(&pdev->dev, res->start, + resource_size(res)); + if (!pdata->rtcreg) + return -EBUSY; + + tx4939_rtc_cmd(pdata->rtcreg, TX4939_RTCCTL_COMMAND_NOP); + if (devm_request_irq(&pdev->dev, irq, tx4939_rtc_interrupt, + IRQF_DISABLED | IRQF_SHARED, + pdev->name, &pdev->dev) < 0) { + return -EBUSY; + } + rtc = rtc_device_register(pdev->name, &pdev->dev, + &tx4939_rtc_ops, THIS_MODULE); + if (IS_ERR(rtc)) + return PTR_ERR(rtc); + pdata->rtc = rtc; + ret = sysfs_create_bin_file(&pdev->dev.kobj, &tx4939_rtc_nvram_attr); + if (ret) + rtc_device_unregister(rtc); + return ret; +} + +static int __exit tx4939_rtc_remove(struct platform_device 
*pdev) +{ + struct tx4939rtc_plat_data *pdata = platform_get_drvdata(pdev); + struct rtc_device *rtc = pdata->rtc; + + spin_lock_irq(&rtc->irq_lock); + tx4939_rtc_cmd(pdata->rtcreg, TX4939_RTCCTL_COMMAND_NOP); + spin_unlock_irq(&rtc->irq_lock); + sysfs_remove_bin_file(&pdev->dev.kobj, &tx4939_rtc_nvram_attr); + rtc_device_unregister(rtc); + platform_set_drvdata(pdev, NULL); + return 0; +} + +static struct platform_driver tx4939_rtc_driver = { + .remove = __exit_p(tx4939_rtc_remove), + .driver = { + .name = "tx4939rtc", + .owner = THIS_MODULE, + }, +}; + +static int __init tx4939rtc_init(void) +{ + return platform_driver_probe(&tx4939_rtc_driver, tx4939_rtc_probe); +} + +static void __exit tx4939rtc_exit(void) +{ + platform_driver_unregister(&tx4939_rtc_driver); +} + +module_init(tx4939rtc_init); +module_exit(tx4939rtc_exit); + +MODULE_AUTHOR("Atsushi Nemoto <anemo@mba.ocn.ne.jp>"); +MODULE_DESCRIPTION("TX4939 internal RTC driver"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:tx4939rtc"); diff --git a/drivers/rtc/rtc-vr41xx.c b/drivers/rtc/rtc-vr41xx.c index 834dcc6d785..f11297aff85 100644 --- a/drivers/rtc/rtc-vr41xx.c +++ b/drivers/rtc/rtc-vr41xx.c @@ -27,6 +27,7 @@ #include <linux/rtc.h> #include <linux/spinlock.h> #include <linux/types.h> +#include <linux/log2.h> #include <asm/div64.h> #include <asm/io.h> @@ -84,8 +85,8 @@ static DEFINE_SPINLOCK(rtc_lock); static char rtc_name[] = "RTC"; static unsigned long periodic_count; static unsigned int alarm_enabled; -static int aie_irq = -1; -static int pie_irq = -1; +static int aie_irq; +static int pie_irq; static inline unsigned long read_elapsed_second(void) { @@ -210,6 +211,8 @@ static int vr41xx_rtc_irq_set_freq(struct device *dev, int freq) { unsigned long count; + if (!is_power_of_2(freq)) + return -EINVAL; count = RTC_FREQUENCY; do_div(count, freq); @@ -360,7 +363,7 @@ static int __devinit rtc_probe(struct platform_device *pdev) spin_unlock_irq(&rtc_lock); aie_irq = platform_get_irq(pdev, 0); - if (aie_irq < 
0 || aie_irq >= nr_irqs) { + if (aie_irq <= 0) { retval = -EBUSY; goto err_device_unregister; } @@ -371,7 +374,7 @@ static int __devinit rtc_probe(struct platform_device *pdev) goto err_device_unregister; pie_irq = platform_get_irq(pdev, 1); - if (pie_irq < 0 || pie_irq >= nr_irqs) + if (pie_irq <= 0) goto err_free_irq; retval = request_irq(pie_irq, rtclong1_interrupt, IRQF_DISABLED, diff --git a/drivers/spi/Kconfig b/drivers/spi/Kconfig index b9d0efb6803..4a6fe01831a 100644 --- a/drivers/spi/Kconfig +++ b/drivers/spi/Kconfig @@ -78,7 +78,7 @@ config SPI_AU1550 will be called au1550_spi. config SPI_BITBANG - tristate "Bitbanging SPI master" + tristate "Utilities for Bitbanging SPI masters" help With a few GPIO pins, your system can bitbang the SPI protocol. Select this to get SPI support through I/O pins (GPIO, parallel @@ -100,6 +100,22 @@ config SPI_BUTTERFLY inexpensive battery powered microcontroller evaluation board. This same cable can be used to flash new firmware. +config SPI_GPIO + tristate "GPIO-based bitbanging SPI Master" + depends on GENERIC_GPIO + select SPI_BITBANG + help + This simple GPIO bitbanging SPI master uses the arch-neutral GPIO + interface to manage MOSI, MISO, SCK, and chipselect signals. SPI + slaves connected to a bus using this driver are configured as usual, + except that the spi_board_info.controller_data holds the GPIO number + for the chipselect used by this controller driver. + + Note that this driver often won't achieve even 1 Mbit/sec speeds, + making it unusually slow for SPI. If your platform can inline + GPIO operations, you should be able to leverage that for better + speed with a custom version of this driver; see the source code. 
+ config SPI_IMX tristate "Freescale iMX SPI controller" depends on ARCH_IMX && EXPERIMENTAL diff --git a/drivers/spi/Makefile b/drivers/spi/Makefile index ccf18de34e1..5e9f521b884 100644 --- a/drivers/spi/Makefile +++ b/drivers/spi/Makefile @@ -16,6 +16,7 @@ obj-$(CONFIG_SPI_BFIN) += spi_bfin5xx.o obj-$(CONFIG_SPI_BITBANG) += spi_bitbang.o obj-$(CONFIG_SPI_AU1550) += au1550_spi.o obj-$(CONFIG_SPI_BUTTERFLY) += spi_butterfly.o +obj-$(CONFIG_SPI_GPIO) += spi_gpio.o obj-$(CONFIG_SPI_IMX) += spi_imx.o obj-$(CONFIG_SPI_LM70_LLP) += spi_lm70llp.o obj-$(CONFIG_SPI_PXA2XX) += pxa2xx_spi.o diff --git a/drivers/spi/atmel_spi.c b/drivers/spi/atmel_spi.c index 8abae4ad0fa..5e39bac9c51 100644 --- a/drivers/spi/atmel_spi.c +++ b/drivers/spi/atmel_spi.c @@ -30,13 +30,6 @@ * The core SPI transfer engine just talks to a register bank to set up * DMA transfers; transfer queue progress is driven by IRQs. The clock * framework provides the base clock, subdivided for each spi_device. - * - * Newer controllers, marked with "new_1" flag, have: - * - CR.LASTXFER - * - SPI_MR.DIV32 may become FDIV or must-be-zero (here: always zero) - * - SPI_SR.TXEMPTY, SPI_SR.NSSR (and corresponding irqs) - * - SPI_CSRx.CSAAT - * - SPI_CSRx.SBCR allows faster clocking */ struct atmel_spi { spinlock_t lock; @@ -45,7 +38,6 @@ struct atmel_spi { int irq; struct clk *clk; struct platform_device *pdev; - unsigned new_1:1; struct spi_device *stay; u8 stopping; @@ -59,10 +51,33 @@ struct atmel_spi { dma_addr_t buffer_dma; }; +/* Controller-specific per-slave state */ +struct atmel_spi_device { + unsigned int npcs_pin; + u32 csr; +}; + #define BUFFER_SIZE PAGE_SIZE #define INVALID_DMA_ADDRESS 0xffffffff /* + * Version 2 of the SPI controller has + * - CR.LASTXFER + * - SPI_MR.DIV32 may become FDIV or must-be-zero (here: always zero) + * - SPI_SR.TXEMPTY, SPI_SR.NSSR (and corresponding irqs) + * - SPI_CSRx.CSAAT + * - SPI_CSRx.SBCR allows faster clocking + * + * We can determine the controller version by reading 
the VERSION + * register, but I haven't checked that it exists on all chips, and + * this is cheaper anyway. + */ +static bool atmel_spi_is_v2(void) +{ + return !cpu_is_at91rm9200(); +} + +/* * Earlier SPI controllers (e.g. on at91rm9200) have a design bug whereby * they assume that spi slave device state will not change on deselect, so * that automagic deselection is OK. ("NPCSx rises if no data is to be @@ -80,39 +95,58 @@ struct atmel_spi { * Master on Chip Select 0.") No workaround exists for that ... so for * nCS0 on that chip, we (a) don't use the GPIO, (b) can't support CS_HIGH, * and (c) will trigger that first erratum in some cases. + * + * TODO: Test if the atmel_spi_is_v2() branch below works on + * AT91RM9200 if we use some other register than CSR0. However, don't + * do this unconditionally since AP7000 has an errata where the BITS + * field in CSR0 overrides all other CSRs. */ static void cs_activate(struct atmel_spi *as, struct spi_device *spi) { - unsigned gpio = (unsigned) spi->controller_data; + struct atmel_spi_device *asd = spi->controller_state; unsigned active = spi->mode & SPI_CS_HIGH; u32 mr; - int i; - u32 csr; - u32 cpol = (spi->mode & SPI_CPOL) ? SPI_BIT(CPOL) : 0; - - /* Make sure clock polarity is correct */ - for (i = 0; i < spi->master->num_chipselect; i++) { - csr = spi_readl(as, CSR0 + 4 * i); - if ((csr ^ cpol) & SPI_BIT(CPOL)) - spi_writel(as, CSR0 + 4 * i, csr ^ SPI_BIT(CPOL)); - } - mr = spi_readl(as, MR); - mr = SPI_BFINS(PCS, ~(1 << spi->chip_select), mr); + if (atmel_spi_is_v2()) { + /* + * Always use CSR0. This ensures that the clock + * switches to the correct idle polarity before we + * toggle the CS. + */ + spi_writel(as, CSR0, asd->csr); + spi_writel(as, MR, SPI_BF(PCS, 0x0e) | SPI_BIT(MODFDIS) + | SPI_BIT(MSTR)); + mr = spi_readl(as, MR); + gpio_set_value(asd->npcs_pin, active); + } else { + u32 cpol = (spi->mode & SPI_CPOL) ? 
SPI_BIT(CPOL) : 0; + int i; + u32 csr; + + /* Make sure clock polarity is correct */ + for (i = 0; i < spi->master->num_chipselect; i++) { + csr = spi_readl(as, CSR0 + 4 * i); + if ((csr ^ cpol) & SPI_BIT(CPOL)) + spi_writel(as, CSR0 + 4 * i, + csr ^ SPI_BIT(CPOL)); + } + + mr = spi_readl(as, MR); + mr = SPI_BFINS(PCS, ~(1 << spi->chip_select), mr); + if (spi->chip_select != 0) + gpio_set_value(asd->npcs_pin, active); + spi_writel(as, MR, mr); + } dev_dbg(&spi->dev, "activate %u%s, mr %08x\n", - gpio, active ? " (high)" : "", + asd->npcs_pin, active ? " (high)" : "", mr); - - if (!(cpu_is_at91rm9200() && spi->chip_select == 0)) - gpio_set_value(gpio, active); - spi_writel(as, MR, mr); } static void cs_deactivate(struct atmel_spi *as, struct spi_device *spi) { - unsigned gpio = (unsigned) spi->controller_data; + struct atmel_spi_device *asd = spi->controller_state; unsigned active = spi->mode & SPI_CS_HIGH; u32 mr; @@ -126,11 +160,11 @@ static void cs_deactivate(struct atmel_spi *as, struct spi_device *spi) } dev_dbg(&spi->dev, "DEactivate %u%s, mr %08x\n", - gpio, active ? " (low)" : "", + asd->npcs_pin, active ? 
" (low)" : "", mr); - if (!(cpu_is_at91rm9200() && spi->chip_select == 0)) - gpio_set_value(gpio, !active); + if (atmel_spi_is_v2() || spi->chip_select != 0) + gpio_set_value(asd->npcs_pin, !active); } static inline int atmel_spi_xfer_is_last(struct spi_message *msg, @@ -502,6 +536,7 @@ atmel_spi_interrupt(int irq, void *dev_id) static int atmel_spi_setup(struct spi_device *spi) { struct atmel_spi *as; + struct atmel_spi_device *asd; u32 scbr, csr; unsigned int bits = spi->bits_per_word; unsigned long bus_hz; @@ -536,19 +571,16 @@ static int atmel_spi_setup(struct spi_device *spi) } /* see notes above re chipselect */ - if (cpu_is_at91rm9200() + if (!atmel_spi_is_v2() && spi->chip_select == 0 && (spi->mode & SPI_CS_HIGH)) { dev_dbg(&spi->dev, "setup: can't be active-high\n"); return -EINVAL; } - /* - * Pre-new_1 chips start out at half the peripheral - * bus speed. - */ + /* v1 chips start out at half the peripheral bus speed. */ bus_hz = clk_get_rate(as->clk); - if (!as->new_1) + if (!atmel_spi_is_v2()) bus_hz /= 2; if (spi->max_speed_hz) { @@ -589,11 +621,20 @@ static int atmel_spi_setup(struct spi_device *spi) /* chipselect must have been muxed as GPIO (e.g. 
in board setup) */ npcs_pin = (unsigned int)spi->controller_data; - if (!spi->controller_state) { + asd = spi->controller_state; + if (!asd) { + asd = kzalloc(sizeof(struct atmel_spi_device), GFP_KERNEL); + if (!asd) + return -ENOMEM; + ret = gpio_request(npcs_pin, spi->dev.bus_id); - if (ret) + if (ret) { + kfree(asd); return ret; - spi->controller_state = (void *)npcs_pin; + } + + asd->npcs_pin = npcs_pin; + spi->controller_state = asd; gpio_direction_output(npcs_pin, !(spi->mode & SPI_CS_HIGH)); } else { unsigned long flags; @@ -605,11 +646,14 @@ static int atmel_spi_setup(struct spi_device *spi) spin_unlock_irqrestore(&as->lock, flags); } + asd->csr = csr; + dev_dbg(&spi->dev, "setup: %lu Hz bpw %u mode 0x%x -> csr%d %08x\n", bus_hz / scbr, bits, spi->mode, spi->chip_select, csr); - spi_writel(as, CSR0 + 4 * spi->chip_select, csr); + if (!atmel_spi_is_v2()) + spi_writel(as, CSR0 + 4 * spi->chip_select, csr); return 0; } @@ -684,10 +728,11 @@ static int atmel_spi_transfer(struct spi_device *spi, struct spi_message *msg) static void atmel_spi_cleanup(struct spi_device *spi) { struct atmel_spi *as = spi_master_get_devdata(spi->master); + struct atmel_spi_device *asd = spi->controller_state; unsigned gpio = (unsigned) spi->controller_data; unsigned long flags; - if (!spi->controller_state) + if (!asd) return; spin_lock_irqsave(&as->lock, flags); @@ -697,7 +742,9 @@ static void atmel_spi_cleanup(struct spi_device *spi) } spin_unlock_irqrestore(&as->lock, flags); + spi->controller_state = NULL; gpio_free(gpio); + kfree(asd); } /*-------------------------------------------------------------------------*/ @@ -755,8 +802,6 @@ static int __init atmel_spi_probe(struct platform_device *pdev) goto out_free_buffer; as->irq = irq; as->clk = clk; - if (!cpu_is_at91rm9200()) - as->new_1 = 1; ret = request_irq(irq, atmel_spi_interrupt, 0, pdev->dev.bus_id, master); diff --git a/drivers/spi/pxa2xx_spi.c b/drivers/spi/pxa2xx_spi.c index 6104f461a3c..d0fc4ca2f65 100644 --- 
a/drivers/spi/pxa2xx_spi.c +++ b/drivers/spi/pxa2xx_spi.c @@ -1561,11 +1561,12 @@ out_error_master_alloc: static int pxa2xx_spi_remove(struct platform_device *pdev) { struct driver_data *drv_data = platform_get_drvdata(pdev); - struct ssp_device *ssp = drv_data->ssp; + struct ssp_device *ssp; int status = 0; if (!drv_data) return 0; + ssp = drv_data->ssp; /* Remove the queue */ status = destroy_queue(drv_data); diff --git a/drivers/spi/spi_gpio.c b/drivers/spi/spi_gpio.c new file mode 100644 index 00000000000..49698cabc30 --- /dev/null +++ b/drivers/spi/spi_gpio.c @@ -0,0 +1,360 @@ +/* + * spi_gpio.c - SPI master driver using generic bitbanged GPIO + * + * Copyright (C) 2006,2008 David Brownell + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/platform_device.h> +#include <linux/gpio.h> + +#include <linux/spi/spi.h> +#include <linux/spi/spi_bitbang.h> +#include <linux/spi/spi_gpio.h> + + +/* + * This bitbanging SPI master driver should help make systems usable + * when a native hardware SPI engine is not available, perhaps because + * its driver isn't yet working or because the I/O pins it requires + * are used for other purposes. + * + * platform_device->driver_data ... points to spi_gpio + * + * spi->controller_state ... 
reserved for bitbang framework code + * spi->controller_data ... holds chipselect GPIO + * + * spi->master->dev.driver_data ... points to spi_gpio->bitbang + */ + +struct spi_gpio { + struct spi_bitbang bitbang; + struct spi_gpio_platform_data pdata; + struct platform_device *pdev; +}; + +/*----------------------------------------------------------------------*/ + +/* + * Because the overhead of going through four GPIO procedure calls + * per transferred bit can make performance a problem, this code + * is set up so that you can use it in either of two ways: + * + * - The slow generic way: set up platform_data to hold the GPIO + * numbers used for MISO/MOSI/SCK, and issue procedure calls for + * each of them. This driver can handle several such busses. + * + * - The quicker inlined way: only helps with platform GPIO code + * that inlines operations for constant GPIOs. This can give + * you tight (fast!) inner loops, but each such bus needs a + * new driver. You'll define a new C file, with Makefile and + * Kconfig support; the C code can be a total of six lines: + * + * #define DRIVER_NAME "myboard_spi2" + * #define SPI_MISO_GPIO 119 + * #define SPI_MOSI_GPIO 120 + * #define SPI_SCK_GPIO 121 + * #define SPI_N_CHIPSEL 4 + * #include "spi_gpio.c" + */ + +#ifndef DRIVER_NAME +#define DRIVER_NAME "spi_gpio" + +#define GENERIC_BITBANG /* vs tight inlines */ + +/* all functions referencing these symbols must define pdata */ +#define SPI_MISO_GPIO ((pdata)->miso) +#define SPI_MOSI_GPIO ((pdata)->mosi) +#define SPI_SCK_GPIO ((pdata)->sck) + +#define SPI_N_CHIPSEL ((pdata)->num_chipselect) + +#endif + +/*----------------------------------------------------------------------*/ + +static inline const struct spi_gpio_platform_data * __pure +spi_to_pdata(const struct spi_device *spi) +{ + const struct spi_bitbang *bang; + const struct spi_gpio *spi_gpio; + + bang = spi_master_get_devdata(spi->master); + spi_gpio = container_of(bang, struct spi_gpio, bitbang); + return 
&spi_gpio->pdata; +} + +/* this is #defined to avoid unused-variable warnings when inlining */ +#define pdata spi_to_pdata(spi) + +static inline void setsck(const struct spi_device *spi, int is_on) +{ + gpio_set_value(SPI_SCK_GPIO, is_on); +} + +static inline void setmosi(const struct spi_device *spi, int is_on) +{ + gpio_set_value(SPI_MOSI_GPIO, is_on); +} + +static inline int getmiso(const struct spi_device *spi) +{ + return gpio_get_value(SPI_MISO_GPIO); +} + +#undef pdata + +/* + * NOTE: this clocks "as fast as we can". It "should" be a function of the + * requested device clock. Software overhead means we usually have trouble + * reaching even one Mbit/sec (except when we can inline bitops), so for now + * we'll just assume we never need additional per-bit slowdowns. + */ +#define spidelay(nsecs) do {} while (0) + +#define EXPAND_BITBANG_TXRX +#include <linux/spi/spi_bitbang.h> + +/* + * These functions can leverage inline expansion of GPIO calls to shrink + * costs for a txrx bit, often by factors of around ten (by instruction + * count). That is particularly visible for larger word sizes, but helps + * even with default 8-bit words. + * + * REVISIT overheads calling these functions for each word also have + * significant performance costs. Having txrx_bufs() calls that inline + * the txrx_word() logic would help performance, e.g. on larger blocks + * used with flash storage or MMC/SD. There should also be ways to make + * GCC be less stupid about reloading registers inside the I/O loops, + * even without inlined GPIO calls; __attribute__((hot)) on GCC 4.3? 
+ */ + +static u32 spi_gpio_txrx_word_mode0(struct spi_device *spi, + unsigned nsecs, u32 word, u8 bits) +{ + return bitbang_txrx_be_cpha0(spi, nsecs, 0, word, bits); +} + +static u32 spi_gpio_txrx_word_mode1(struct spi_device *spi, + unsigned nsecs, u32 word, u8 bits) +{ + return bitbang_txrx_be_cpha1(spi, nsecs, 0, word, bits); +} + +static u32 spi_gpio_txrx_word_mode2(struct spi_device *spi, + unsigned nsecs, u32 word, u8 bits) +{ + return bitbang_txrx_be_cpha0(spi, nsecs, 1, word, bits); +} + +static u32 spi_gpio_txrx_word_mode3(struct spi_device *spi, + unsigned nsecs, u32 word, u8 bits) +{ + return bitbang_txrx_be_cpha1(spi, nsecs, 1, word, bits); +} + +/*----------------------------------------------------------------------*/ + +static void spi_gpio_chipselect(struct spi_device *spi, int is_active) +{ + unsigned long cs = (unsigned long) spi->controller_data; + + /* set initial clock polarity */ + if (is_active) + setsck(spi, spi->mode & SPI_CPOL); + + /* SPI is normally active-low */ + gpio_set_value(cs, (spi->mode & SPI_CS_HIGH) ? 
is_active : !is_active); +} + +static int spi_gpio_setup(struct spi_device *spi) +{ + unsigned long cs = (unsigned long) spi->controller_data; + int status = 0; + + if (spi->bits_per_word > 32) + return -EINVAL; + + if (!spi->controller_state) { + status = gpio_request(cs, spi->dev.bus_id); + if (status) + return status; + status = gpio_direction_output(cs, spi->mode & SPI_CS_HIGH); + } + if (!status) + status = spi_bitbang_setup(spi); + if (status) { + if (!spi->controller_state) + gpio_free(cs); + } + return status; +} + +static void spi_gpio_cleanup(struct spi_device *spi) +{ + unsigned long cs = (unsigned long) spi->controller_data; + + gpio_free(cs); + spi_bitbang_cleanup(spi); +} + +static int __init spi_gpio_alloc(unsigned pin, const char *label, bool is_in) +{ + int value; + + value = gpio_request(pin, label); + if (value == 0) { + if (is_in) + value = gpio_direction_input(pin); + else + value = gpio_direction_output(pin, 0); + } + return value; +} + +static int __init +spi_gpio_request(struct spi_gpio_platform_data *pdata, const char *label) +{ + int value; + + /* NOTE: SPI_*_GPIO symbols may reference "pdata" */ + + value = spi_gpio_alloc(SPI_MOSI_GPIO, label, false); + if (value) + goto done; + + value = spi_gpio_alloc(SPI_MISO_GPIO, label, true); + if (value) + goto free_mosi; + + value = spi_gpio_alloc(SPI_SCK_GPIO, label, false); + if (value) + goto free_miso; + + goto done; + +free_miso: + gpio_free(SPI_MISO_GPIO); +free_mosi: + gpio_free(SPI_MOSI_GPIO); +done: + return value; +} + +static int __init spi_gpio_probe(struct platform_device *pdev) +{ + int status; + struct spi_master *master; + struct spi_gpio *spi_gpio; + struct spi_gpio_platform_data *pdata; + + pdata = pdev->dev.platform_data; +#ifdef GENERIC_BITBANG + if (!pdata || !pdata->num_chipselect) + return -ENODEV; +#endif + + status = spi_gpio_request(pdata, dev_name(&pdev->dev)); + if (status < 0) + return status; + + master = spi_alloc_master(&pdev->dev, sizeof *spi_gpio); + if (!master) 
{ + status = -ENOMEM; + goto gpio_free; + } + spi_gpio = spi_master_get_devdata(master); + platform_set_drvdata(pdev, spi_gpio); + + spi_gpio->pdev = pdev; + if (pdata) + spi_gpio->pdata = *pdata; + + master->bus_num = pdev->id; + master->num_chipselect = SPI_N_CHIPSEL; + master->setup = spi_gpio_setup; + master->cleanup = spi_gpio_cleanup; + + spi_gpio->bitbang.master = spi_master_get(master); + spi_gpio->bitbang.chipselect = spi_gpio_chipselect; + spi_gpio->bitbang.txrx_word[SPI_MODE_0] = spi_gpio_txrx_word_mode0; + spi_gpio->bitbang.txrx_word[SPI_MODE_1] = spi_gpio_txrx_word_mode1; + spi_gpio->bitbang.txrx_word[SPI_MODE_2] = spi_gpio_txrx_word_mode2; + spi_gpio->bitbang.txrx_word[SPI_MODE_3] = spi_gpio_txrx_word_mode3; + spi_gpio->bitbang.setup_transfer = spi_bitbang_setup_transfer; + spi_gpio->bitbang.flags = SPI_CS_HIGH; + + status = spi_bitbang_start(&spi_gpio->bitbang); + if (status < 0) { + spi_master_put(spi_gpio->bitbang.master); +gpio_free: + gpio_free(SPI_MISO_GPIO); + gpio_free(SPI_MOSI_GPIO); + gpio_free(SPI_SCK_GPIO); + spi_master_put(master); + } + + return status; +} + +static int __exit spi_gpio_remove(struct platform_device *pdev) +{ + struct spi_gpio *spi_gpio; + struct spi_gpio_platform_data *pdata; + int status; + + spi_gpio = platform_get_drvdata(pdev); + pdata = pdev->dev.platform_data; + + /* stop() unregisters child devices too */ + status = spi_bitbang_stop(&spi_gpio->bitbang); + spi_master_put(spi_gpio->bitbang.master); + + platform_set_drvdata(pdev, NULL); + + gpio_free(SPI_MISO_GPIO); + gpio_free(SPI_MOSI_GPIO); + gpio_free(SPI_SCK_GPIO); + + return status; +} + +MODULE_ALIAS("platform:" DRIVER_NAME); + +static struct platform_driver spi_gpio_driver = { + .driver.name = DRIVER_NAME, + .driver.owner = THIS_MODULE, + .remove = __exit_p(spi_gpio_remove), +}; + +static int __init spi_gpio_init(void) +{ + return platform_driver_probe(&spi_gpio_driver, spi_gpio_probe); +} +module_init(spi_gpio_init); + +static void __exit spi_gpio_exit(void) 
+{ + platform_driver_unregister(&spi_gpio_driver); +} +module_exit(spi_gpio_exit); + + +MODULE_DESCRIPTION("SPI master driver using generic bitbanged GPIO "); +MODULE_AUTHOR("David Brownell"); +MODULE_LICENSE("GPL"); diff --git a/drivers/spi/spi_s3c24xx.c b/drivers/spi/spi_s3c24xx.c index 256d18395a2..b3ebc1d0f85 100644 --- a/drivers/spi/spi_s3c24xx.c +++ b/drivers/spi/spi_s3c24xx.c @@ -19,6 +19,7 @@ #include <linux/err.h> #include <linux/clk.h> #include <linux/platform_device.h> +#include <linux/gpio.h> #include <linux/spi/spi.h> #include <linux/spi/spi_bitbang.h> @@ -27,7 +28,6 @@ #include <asm/dma.h> #include <mach/hardware.h> -#include <mach/regs-gpio.h> #include <plat/regs-spi.h> #include <mach/spi.h> @@ -66,7 +66,7 @@ static inline struct s3c24xx_spi *to_hw(struct spi_device *sdev) static void s3c24xx_spi_gpiocs(struct s3c2410_spi_info *spi, int cs, int pol) { - s3c2410_gpio_setpin(spi->pin_cs, pol); + gpio_set_value(spi->pin_cs, pol); } static void s3c24xx_spi_chipsel(struct spi_device *spi, int value) @@ -248,8 +248,13 @@ static void s3c24xx_spi_initialsetup(struct s3c24xx_spi *hw) writeb(SPPIN_DEFAULT, hw->regs + S3C2410_SPPIN); writeb(SPCON_DEFAULT, hw->regs + S3C2410_SPCON); - if (hw->pdata && hw->pdata->gpio_setup) - hw->pdata->gpio_setup(hw->pdata, 1); + if (hw->pdata) { + if (hw->set_cs == s3c24xx_spi_gpiocs) + gpio_direction_output(hw->pdata->pin_cs, 1); + + if (hw->pdata->gpio_setup) + hw->pdata->gpio_setup(hw->pdata, 1); + } } static int __init s3c24xx_spi_probe(struct platform_device *pdev) @@ -343,18 +348,27 @@ static int __init s3c24xx_spi_probe(struct platform_device *pdev) goto err_no_clk; } - s3c24xx_spi_initialsetup(hw); - /* setup any gpio we can */ if (!pdata->set_cs) { - hw->set_cs = s3c24xx_spi_gpiocs; + if (pdata->pin_cs < 0) { + dev_err(&pdev->dev, "No chipselect pin\n"); + goto err_register; + } - s3c2410_gpio_setpin(pdata->pin_cs, 1); - s3c2410_gpio_cfgpin(pdata->pin_cs, S3C2410_GPIO_OUTPUT); + err = gpio_request(pdata->pin_cs, 
dev_name(&pdev->dev)); + if (err) { + dev_err(&pdev->dev, "Failed to get gpio for cs\n"); + goto err_register; + } + + hw->set_cs = s3c24xx_spi_gpiocs; + gpio_direction_output(pdata->pin_cs, 1); } else hw->set_cs = pdata->set_cs; + s3c24xx_spi_initialsetup(hw); + /* register our spi controller */ err = spi_bitbang_start(&hw->bitbang); @@ -366,6 +380,9 @@ static int __init s3c24xx_spi_probe(struct platform_device *pdev) return 0; err_register: + if (hw->set_cs == s3c24xx_spi_gpiocs) + gpio_free(pdata->pin_cs); + clk_disable(hw->clk); clk_put(hw->clk); @@ -401,6 +418,9 @@ static int __exit s3c24xx_spi_remove(struct platform_device *dev) free_irq(hw->irq, hw); iounmap(hw->regs); + if (hw->set_cs == s3c24xx_spi_gpiocs) + gpio_free(hw->pdata->pin_cs); + release_resource(hw->ioarea); kfree(hw->ioarea); diff --git a/drivers/video/aty/aty128fb.c b/drivers/video/aty/aty128fb.c index 243ea4ab20c..db16112cf19 100644 --- a/drivers/video/aty/aty128fb.c +++ b/drivers/video/aty/aty128fb.c @@ -2051,7 +2051,7 @@ static int __devinit aty128_probe(struct pci_dev *pdev, const struct pci_device_ /* Virtualize mmio region */ info->fix.mmio_start = reg_addr; - par->regbase = ioremap(reg_addr, pci_resource_len(pdev, 2)); + par->regbase = pci_ioremap_bar(pdev, 2); if (!par->regbase) goto err_free_info; diff --git a/drivers/video/bfin-t350mcqb-fb.c b/drivers/video/bfin-t350mcqb-fb.c index 7d1b819e501..a9b3ada05d9 100644 --- a/drivers/video/bfin-t350mcqb-fb.c +++ b/drivers/video/bfin-t350mcqb-fb.c @@ -255,7 +255,7 @@ static int bfin_t350mcqb_fb_check_var(struct fb_var_screeninfo *var, { if (var->bits_per_pixel != LCD_BPP) { - pr_debug("%s: depth not supported: %u BPP\n", __FUNCTION__, + pr_debug("%s: depth not supported: %u BPP\n", __func__, var->bits_per_pixel); return -EINVAL; } @@ -264,7 +264,7 @@ static int bfin_t350mcqb_fb_check_var(struct fb_var_screeninfo *var, info->var.xres_virtual != var->xres_virtual || info->var.yres_virtual != var->yres_virtual) { pr_debug("%s: Resolution not 
supported: X%u x Y%u \n", - __FUNCTION__, var->xres, var->yres); + __func__, var->xres, var->yres); return -EINVAL; } @@ -274,7 +274,7 @@ static int bfin_t350mcqb_fb_check_var(struct fb_var_screeninfo *var, if ((info->fix.line_length * var->yres_virtual) > info->fix.smem_len) { pr_debug("%s: Memory Limit requested yres_virtual = %u\n", - __FUNCTION__, var->yres_virtual); + __func__, var->yres_virtual); return -ENOMEM; } diff --git a/drivers/video/carminefb.c b/drivers/video/carminefb.c index c9b191319a9..c7ff3c1a266 100644 --- a/drivers/video/carminefb.c +++ b/drivers/video/carminefb.c @@ -168,7 +168,7 @@ static int carmine_setcolreg(unsigned regno, unsigned red, unsigned green, blue >>= 8; transp >>= 8; - ((u32 *)info->pseudo_palette)[regno] = be32_to_cpu(transp << 24 | + ((__be32 *)info->pseudo_palette)[regno] = cpu_to_be32(transp << 24 | red << 0 | green << 8 | blue << 16); return 0; } diff --git a/drivers/video/cyber2000fb.c b/drivers/video/cyber2000fb.c index 39d5d643a50..7a9e42e3a9a 100644 --- a/drivers/video/cyber2000fb.c +++ b/drivers/video/cyber2000fb.c @@ -1583,8 +1583,7 @@ cyberpro_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) goto failed_release; cfb->dev = dev; - cfb->region = ioremap(pci_resource_start(dev, 0), - pci_resource_len(dev, 0)); + cfb->region = pci_ioremap_bar(dev, 0); if (!cfb->region) goto failed_ioremap; diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c index 3c65b0d6761..756efeb91ab 100644 --- a/drivers/video/fbmem.c +++ b/drivers/video/fbmem.c @@ -510,6 +510,10 @@ static int fb_prepare_extra_logos(struct fb_info *info, unsigned int height, fb_logo_ex_num = 0; for (i = 0; i < fb_logo_ex_num; i++) { + if (fb_logo_ex[i].logo->type != fb_logo.logo->type) { + fb_logo_ex[i].logo = NULL; + continue; + } height += fb_logo_ex[i].logo->height; if (height > yres) { height -= fb_logo_ex[i].logo->height; diff --git a/drivers/video/gbefb.c b/drivers/video/gbefb.c index f89c3cce1e0..fe5b519860b 100644 --- 
a/drivers/video/gbefb.c +++ b/drivers/video/gbefb.c @@ -912,6 +912,7 @@ static int gbefb_check_var(struct fb_var_screeninfo *var, struct fb_info *info) { unsigned int line_length; struct gbe_timing_info timing; + int ret; /* Limit bpp to 8, 16, and 32 */ if (var->bits_per_pixel <= 8) @@ -930,8 +931,10 @@ static int gbefb_check_var(struct fb_var_screeninfo *var, struct fb_info *info) var->grayscale = 0; /* No grayscale for now */ - if ((var->pixclock = compute_gbe_timing(var, &timing)) < 0) - return(-EINVAL); + ret = compute_gbe_timing(var, &timing); + var->pixclock = ret; + if (ret < 0) + return -EINVAL; /* Adjust virtual resolution, if necessary */ if (var->xres > var->xres_virtual || (!ywrap && !ypan)) diff --git a/drivers/video/geode/gx1fb_core.c b/drivers/video/geode/gx1fb_core.c index bb20a228976..751e491ca8c 100644 --- a/drivers/video/geode/gx1fb_core.c +++ b/drivers/video/geode/gx1fb_core.c @@ -217,8 +217,7 @@ static int __init gx1fb_map_video_memory(struct fb_info *info, struct pci_dev *d ret = pci_request_region(dev, 0, "gx1fb (video)"); if (ret < 0) return ret; - par->vid_regs = ioremap(pci_resource_start(dev, 0), - pci_resource_len(dev, 0)); + par->vid_regs = pci_ioremap_bar(dev, 0); if (!par->vid_regs) return -ENOMEM; diff --git a/drivers/video/geode/gxfb_core.c b/drivers/video/geode/gxfb_core.c index de2b8f9876a..48411892631 100644 --- a/drivers/video/geode/gxfb_core.c +++ b/drivers/video/geode/gxfb_core.c @@ -242,23 +242,21 @@ static int __init gxfb_map_video_memory(struct fb_info *info, struct pci_dev *de ret = pci_request_region(dev, 3, "gxfb (video processor)"); if (ret < 0) return ret; - par->vid_regs = ioremap(pci_resource_start(dev, 3), - pci_resource_len(dev, 3)); + par->vid_regs = pci_ioremap_bar(dev, 3); if (!par->vid_regs) return -ENOMEM; ret = pci_request_region(dev, 2, "gxfb (display controller)"); if (ret < 0) return ret; - par->dc_regs = ioremap(pci_resource_start(dev, 2), pci_resource_len(dev, 2)); + par->dc_regs = pci_ioremap_bar(dev, 
2); if (!par->dc_regs) return -ENOMEM; ret = pci_request_region(dev, 1, "gxfb (graphics processor)"); if (ret < 0) return ret; - par->gp_regs = ioremap(pci_resource_start(dev, 1), - pci_resource_len(dev, 1)); + par->gp_regs = pci_ioremap_bar(dev, 1); if (!par->gp_regs) return -ENOMEM; diff --git a/drivers/video/geode/lxfb_core.c b/drivers/video/geode/lxfb_core.c index 2cd9b74d222..b965ecdbc60 100644 --- a/drivers/video/geode/lxfb_core.c +++ b/drivers/video/geode/lxfb_core.c @@ -379,20 +379,17 @@ static int __init lxfb_map_video_memory(struct fb_info *info, if (info->screen_base == NULL) return ret; - par->gp_regs = ioremap(pci_resource_start(dev, 1), - pci_resource_len(dev, 1)); + par->gp_regs = pci_ioremap_bar(dev, 1); if (par->gp_regs == NULL) return ret; - par->dc_regs = ioremap(pci_resource_start(dev, 2), - pci_resource_len(dev, 2)); + par->dc_regs = pci_ioremap_bar(dev, 2); if (par->dc_regs == NULL) return ret; - par->vp_regs = ioremap(pci_resource_start(dev, 3), - pci_resource_len(dev, 3)); + par->vp_regs = pci_ioremap_bar(dev, 3); if (par->vp_regs == NULL) return ret; diff --git a/drivers/video/gxt4500.c b/drivers/video/gxt4500.c index 564557792be..896e53dea90 100644 --- a/drivers/video/gxt4500.c +++ b/drivers/video/gxt4500.c @@ -648,7 +648,7 @@ static int __devinit gxt4500_probe(struct pci_dev *pdev, info->pseudo_palette = par->pseudo_palette; info->fix.mmio_start = reg_phys; - par->regs = ioremap(reg_phys, pci_resource_len(pdev, 0)); + par->regs = pci_ioremap_bar(pdev, 0); if (!par->regs) { dev_err(&pdev->dev, "gxt4500: cannot map registers\n"); goto err_free_all; @@ -656,7 +656,7 @@ static int __devinit gxt4500_probe(struct pci_dev *pdev, info->fix.smem_start = fb_phys; info->fix.smem_len = pci_resource_len(pdev, 1); - info->screen_base = ioremap(fb_phys, pci_resource_len(pdev, 1)); + info->screen_base = pci_ioremap_bar(pdev, 1); if (!info->screen_base) { dev_err(&pdev->dev, "gxt4500: cannot map framebuffer\n"); goto err_unmap_regs; diff --git 
a/drivers/video/i810/i810_accel.c b/drivers/video/i810/i810_accel.c index 76764ea3486..f5bedee4310 100644 --- a/drivers/video/i810/i810_accel.c +++ b/drivers/video/i810/i810_accel.c @@ -301,8 +301,10 @@ void i810fb_fillrect(struct fb_info *info, const struct fb_fillrect *rect) u32 dx, dy, width, height, dest, rop = 0, color = 0; if (!info->var.accel_flags || par->dev_flags & LOCKUP || - par->depth == 4) - return cfb_fillrect(info, rect); + par->depth == 4) { + cfb_fillrect(info, rect); + return; + } if (par->depth == 1) color = rect->color; @@ -327,8 +329,10 @@ void i810fb_copyarea(struct fb_info *info, const struct fb_copyarea *region) u32 sx, sy, dx, dy, pitch, width, height, src, dest, xdir; if (!info->var.accel_flags || par->dev_flags & LOCKUP || - par->depth == 4) - return cfb_copyarea(info, region); + par->depth == 4) { + cfb_copyarea(info, region); + return; + } dx = region->dx * par->depth; sx = region->sx * par->depth; @@ -366,8 +370,10 @@ void i810fb_imageblit(struct fb_info *info, const struct fb_image *image) u32 fg = 0, bg = 0, size, dst; if (!info->var.accel_flags || par->dev_flags & LOCKUP || - par->depth == 4 || image->depth != 1) - return cfb_imageblit(info, image); + par->depth == 4 || image->depth != 1) { + cfb_imageblit(info, image); + return; + } switch (info->var.bits_per_pixel) { case 8: diff --git a/drivers/video/intelfb/intelfbdrv.c b/drivers/video/intelfb/intelfbdrv.c index a09e2364935..6d8e5415c80 100644 --- a/drivers/video/intelfb/intelfbdrv.c +++ b/drivers/video/intelfb/intelfbdrv.c @@ -1493,8 +1493,10 @@ static void intelfb_fillrect (struct fb_info *info, DBG_MSG("intelfb_fillrect\n"); #endif - if (!ACCEL(dinfo, info) || dinfo->depth == 4) - return cfb_fillrect(info, rect); + if (!ACCEL(dinfo, info) || dinfo->depth == 4) { + cfb_fillrect(info, rect); + return; + } if (rect->rop == ROP_COPY) rop = PAT_ROP_GXCOPY; @@ -1521,8 +1523,10 @@ static void intelfb_copyarea(struct fb_info *info, DBG_MSG("intelfb_copyarea\n"); #endif - if 
(!ACCEL(dinfo, info) || dinfo->depth == 4) - return cfb_copyarea(info, region); + if (!ACCEL(dinfo, info) || dinfo->depth == 4) { + cfb_copyarea(info, region); + return; + } intelfbhw_do_bitblt(dinfo, region->sx, region->sy, region->dx, region->dy, region->width, region->height, @@ -1540,8 +1544,10 @@ static void intelfb_imageblit(struct fb_info *info, #endif if (!ACCEL(dinfo, info) || dinfo->depth == 4 - || image->depth != 1) - return cfb_imageblit(info, image); + || image->depth != 1) { + cfb_imageblit(info, image); + return; + } if (dinfo->depth != 8) { fgcolor = dinfo->pseudo_palette[image->fg_color]; @@ -1554,8 +1560,10 @@ static void intelfb_imageblit(struct fb_info *info, if (!intelfbhw_do_drawglyph(dinfo, fgcolor, bgcolor, image->width, image->height, image->data, image->dx, image->dy, - dinfo->pitch, info->var.bits_per_pixel)) - return cfb_imageblit(info, image); + dinfo->pitch, info->var.bits_per_pixel)) { + cfb_imageblit(info, image); + return; + } } static int intelfb_cursor(struct fb_info *info, struct fb_cursor *cursor) diff --git a/drivers/video/modedb.c b/drivers/video/modedb.c index d3c3af53a29..16186240c5f 100644 --- a/drivers/video/modedb.c +++ b/drivers/video/modedb.c @@ -329,7 +329,7 @@ const struct fb_videomode vesa_modes[] = { FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 17 1152x864-75 VESA */ - { NULL, 75, 1153, 864, 9259, 256, 64, 32, 1, 128, 3, + { NULL, 75, 1152, 864, 9259, 256, 64, 32, 1, 128, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 18 1280x960-60 VESA */ diff --git a/drivers/video/neofb.c b/drivers/video/neofb.c index bfb802d26d5..588527a254c 100644 --- a/drivers/video/neofb.c +++ b/drivers/video/neofb.c @@ -1453,7 +1453,8 @@ neo2200_imageblit(struct fb_info *info, const struct fb_image *image) * is less than 16 bits wide. This is due to insufficient * padding when writing the image. We need to adjust * struct fb_pixmap. Not yet done. 
*/ - return cfb_imageblit(info, image); + cfb_imageblit(info, image); + return; } bltCntl_flags = NEO_BC0_SRC_MONO; } else if (image->depth == info->var.bits_per_pixel) { @@ -1461,7 +1462,8 @@ neo2200_imageblit(struct fb_info *info, const struct fb_image *image) } else { /* We don't currently support hardware acceleration if image * depth is different from display */ - return cfb_imageblit(info, image); + cfb_imageblit(info, image); + return; } switch (info->var.bits_per_pixel) { diff --git a/drivers/video/nvidia/nv_accel.c b/drivers/video/nvidia/nv_accel.c index fa4821c5572..ad6472a894e 100644 --- a/drivers/video/nvidia/nv_accel.c +++ b/drivers/video/nvidia/nv_accel.c @@ -300,8 +300,10 @@ void nvidiafb_copyarea(struct fb_info *info, const struct fb_copyarea *region) if (info->state != FBINFO_STATE_RUNNING) return; - if (par->lockup) - return cfb_copyarea(info, region); + if (par->lockup) { + cfb_copyarea(info, region); + return; + } NVDmaStart(info, par, BLIT_POINT_SRC, 3); NVDmaNext(par, (region->sy << 16) | region->sx); @@ -319,8 +321,10 @@ void nvidiafb_fillrect(struct fb_info *info, const struct fb_fillrect *rect) if (info->state != FBINFO_STATE_RUNNING) return; - if (par->lockup) - return cfb_fillrect(info, rect); + if (par->lockup) { + cfb_fillrect(info, rect); + return; + } if (info->var.bits_per_pixel == 8) color = rect->color; diff --git a/drivers/video/pm3fb.c b/drivers/video/pm3fb.c index 68089d1456c..6666f45a2f8 100644 --- a/drivers/video/pm3fb.c +++ b/drivers/video/pm3fb.c @@ -539,8 +539,10 @@ static void pm3fb_imageblit(struct fb_info *info, const struct fb_image *image) bgx = par->palette[image->bg_color]; break; } - if (image->depth != 1) - return cfb_imageblit(info, image); + if (image->depth != 1) { + cfb_imageblit(info, image); + return; + } if (info->var.bits_per_pixel == 8) { fgx |= fgx << 8; diff --git a/drivers/video/sm501fb.c b/drivers/video/sm501fb.c index f94ae84a58c..dcd98793d56 100644 --- a/drivers/video/sm501fb.c +++ 
b/drivers/video/sm501fb.c @@ -159,6 +159,9 @@ static int sm501_alloc_mem(struct sm501fb_info *inf, struct sm501_mem *mem, break; case SM501_MEMF_PANEL: + if (size > inf->fbmem_len) + return -ENOMEM; + ptr = inf->fbmem_len - size; fbi = inf->fb[HEAD_CRT]; @@ -172,9 +175,6 @@ static int sm501_alloc_mem(struct sm501fb_info *inf, struct sm501_mem *mem, if (fbi && ptr < fbi->fix.smem_len) return -ENOMEM; - if (ptr < 0) - return -ENOMEM; - break; case SM501_MEMF_CRT: diff --git a/drivers/video/via/viafbdev.c b/drivers/video/via/viafbdev.c index e21fe5b6f9f..37b433a08ce 100644 --- a/drivers/video/via/viafbdev.c +++ b/drivers/video/via/viafbdev.c @@ -870,8 +870,10 @@ static void viafb_fillrect(struct fb_info *info, u32 col = 0, rop = 0; int pitch; - if (!viafb_accel) - return cfb_fillrect(info, rect); + if (!viafb_accel) { + cfb_fillrect(info, rect); + return; + } if (!rect->width || !rect->height) return; @@ -937,8 +939,10 @@ static void viafb_copyarea(struct fb_info *info, DEBUG_MSG(KERN_INFO "viafb_copyarea!!\n"); - if (!viafb_accel) - return cfb_copyarea(info, area); + if (!viafb_accel) { + cfb_copyarea(info, area); + return; + } if (!area->width || !area->height) return; @@ -994,8 +998,10 @@ static void viafb_imageblit(struct fb_info *info, int i; int pitch; - if (!viafb_accel) - return cfb_imageblit(info, image); + if (!viafb_accel) { + cfb_imageblit(info, image); + return; + } udata = (u32 *) image->data; diff --git a/firmware/dsp56k/bootstrap.asm b/firmware/dsp56k/bootstrap.asm index 10d891929cd..a411047e6db 100644 --- a/firmware/dsp56k/bootstrap.asm +++ b/firmware/dsp56k/bootstrap.asm @@ -51,19 +51,19 @@ start jmp <$40 ; Copy DSP program control move #real,r0 move #upload,r1 - do #upload_end-upload,<_copy - move P:(r0)+,x0 - move x0,P:(r1)+ -_copy movep #>4,X:<<M_HCR - movep #>$c00,X:<<M_IPR + do #upload_end-upload,_copy + movem P:(r0)+,x0 + movem x0,P:(r1)+ +_copy movep #4,X:<<M_HCR + movep #$c00,X:<<M_IPR and #<$fe,mr jmp upload real org P:$7ea9 upload - movep 
#>1,X:<<M_PBC - movep #>0,X:<<M_BCR + movep #1,X:<<M_PBC + movep #0,X:<<M_BCR next jclr #0,X:<<M_HSR,* movep X:<<M_HRX,A @@ -81,18 +81,18 @@ _get_length cmp x0,A jeq load_Y -load_P do y0,_load +load_P do y0,_load_P jclr #0,X:<<M_HSR,* movep X:<<M_HRX,P:(r0)+ -_load jmp next -load_X do y0,_load +_load_P jmp next +load_X do y0,_load_X jclr #0,X:<<M_HSR,* movep X:<<M_HRX,X:(r0)+ -_load jmp next -load_Y do y0,_load +_load_X jmp next +load_Y do y0,_load_Y jclr #0,X:<<M_HSR,* movep X:<<M_HRX,Y:(r0)+ -_load jmp next +_load_Y jmp next upload_end end diff --git a/fs/Kconfig b/fs/Kconfig index f9b6e2979aa..32883589ee5 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -721,7 +721,20 @@ config CONFIGFS_FS endmenu -menu "Miscellaneous filesystems" +menuconfig MISC_FILESYSTEMS + bool "Miscellaneous filesystems" + default y + ---help--- + Say Y here to get to see options for various miscellaneous + filesystems, such as filesystems that came from other + operating systems. + + This option alone does not add any kernel code. + + If you say N, all options in this submenu will be skipped and + disabled; if unsure, say Y here. + +if MISC_FILESYSTEMS config ADFS_FS tristate "ADFS file system support (EXPERIMENTAL)" @@ -1091,7 +1104,7 @@ config UFS_DEBUG Y here. This will result in _many_ additional debugging messages to be written to the system log. 
-endmenu +endif # MISC_FILESYSTEMS menuconfig NETWORK_FILESYSTEMS bool "Network File Systems" diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index e0f16da00e5..a76803108d0 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -25,8 +25,6 @@ #define AUTOFS_DEV_IOCTL_IOC_FIRST (AUTOFS_DEV_IOCTL_VERSION) #define AUTOFS_DEV_IOCTL_IOC_COUNT (AUTOFS_IOC_COUNT - 11) -#define AUTOFS_TYPE_TRIGGER (AUTOFS_TYPE_DIRECT|AUTOFS_TYPE_OFFSET) - #include <linux/kernel.h> #include <linux/slab.h> #include <linux/time.h> diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 63b7c7afe8d..025e105bffe 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -124,7 +124,7 @@ static inline void free_dev_ioctl(struct autofs_dev_ioctl *param) /* * Check sanity of parameter control fields and if a path is present - * check that it has a "/" and is terminated. + * check that it is terminated and contains at least one "/". */ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) { @@ -138,15 +138,16 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) } if (param->size > sizeof(*param)) { - err = check_name(param->path); + err = invalid_str(param->path, + (void *) ((size_t) param + param->size)); if (err) { - AUTOFS_WARN("invalid path supplied for cmd(0x%08x)", - cmd); + AUTOFS_WARN( + "path string terminator missing for cmd(0x%08x)", + cmd); goto out; } - err = invalid_str(param->path, - (void *) ((size_t) param + param->size)); + err = check_name(param->path); if (err) { AUTOFS_WARN("invalid path supplied for cmd(0x%08x)", cmd); @@ -180,7 +181,7 @@ static int autofs_dev_ioctl_protover(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { - param->arg1 = sbi->version; + param->protover.version = sbi->version; return 0; } @@ -189,7 +190,7 @@ static int autofs_dev_ioctl_protosubver(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { - param->arg1 = 
sbi->sub_version; + param->protosubver.sub_version = sbi->sub_version; return 0; } @@ -335,13 +336,13 @@ static int autofs_dev_ioctl_openmount(struct file *fp, int err, fd; /* param->path has already been checked */ - if (!param->arg1) + if (!param->openmount.devid) return -EINVAL; param->ioctlfd = -1; path = param->path; - devid = param->arg1; + devid = param->openmount.devid; err = 0; fd = autofs_dev_ioctl_open_mountpoint(path, devid); @@ -373,7 +374,7 @@ static int autofs_dev_ioctl_ready(struct file *fp, { autofs_wqt_t token; - token = (autofs_wqt_t) param->arg1; + token = (autofs_wqt_t) param->ready.token; return autofs4_wait_release(sbi, token, 0); } @@ -388,8 +389,8 @@ static int autofs_dev_ioctl_fail(struct file *fp, autofs_wqt_t token; int status; - token = (autofs_wqt_t) param->arg1; - status = param->arg2 ? param->arg2 : -ENOENT; + token = (autofs_wqt_t) param->fail.token; + status = param->fail.status ? param->fail.status : -ENOENT; return autofs4_wait_release(sbi, token, status); } @@ -412,10 +413,10 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp, int pipefd; int err = 0; - if (param->arg1 == -1) + if (param->setpipefd.pipefd == -1) return -EINVAL; - pipefd = param->arg1; + pipefd = param->setpipefd.pipefd; mutex_lock(&sbi->wq_mutex); if (!sbi->catatonic) { @@ -457,8 +458,8 @@ static int autofs_dev_ioctl_timeout(struct file *fp, { unsigned long timeout; - timeout = param->arg1; - param->arg1 = sbi->exp_timeout / HZ; + timeout = param->timeout.timeout; + param->timeout.timeout = sbi->exp_timeout / HZ; sbi->exp_timeout = timeout * HZ; return 0; } @@ -489,7 +490,7 @@ static int autofs_dev_ioctl_requester(struct file *fp, path = param->path; devid = sbi->sb->s_dev; - param->arg1 = param->arg2 = -1; + param->requester.uid = param->requester.gid = -1; /* Get nameidata of the parent directory */ err = path_lookup(path, LOOKUP_PARENT, &nd); @@ -505,8 +506,8 @@ static int autofs_dev_ioctl_requester(struct file *fp, err = 0; 
autofs4_expire_wait(nd.path.dentry); spin_lock(&sbi->fs_lock); - param->arg1 = ino->uid; - param->arg2 = ino->gid; + param->requester.uid = ino->uid; + param->requester.gid = ino->gid; spin_unlock(&sbi->fs_lock); } @@ -529,10 +530,10 @@ static int autofs_dev_ioctl_expire(struct file *fp, int err = -EAGAIN; int how; - how = param->arg1; + how = param->expire.how; mnt = fp->f_path.mnt; - if (sbi->type & AUTOFS_TYPE_TRIGGER) + if (autofs_type_trigger(sbi->type)) dentry = autofs4_expire_direct(sbi->sb, mnt, sbi, how); else dentry = autofs4_expire_indirect(sbi->sb, mnt, sbi, how); @@ -565,9 +566,9 @@ static int autofs_dev_ioctl_askumount(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { - param->arg1 = 0; + param->askumount.may_umount = 0; if (may_umount(fp->f_path.mnt)) - param->arg1 = 1; + param->askumount.may_umount = 1; return 0; } @@ -600,6 +601,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, struct nameidata nd; const char *path; unsigned int type; + unsigned int devid, magic; int err = -ENOENT; if (param->size <= sizeof(*param)) { @@ -608,13 +610,13 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, } path = param->path; - type = param->arg1; + type = param->ismountpoint.in.type; - param->arg1 = 0; - param->arg2 = 0; + param->ismountpoint.out.devid = devid = 0; + param->ismountpoint.out.magic = magic = 0; if (!fp || param->ioctlfd == -1) { - if (type == AUTOFS_TYPE_ANY) { + if (autofs_type_any(type)) { struct super_block *sb; err = path_lookup(path, LOOKUP_FOLLOW, &nd); @@ -622,7 +624,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, goto out; sb = nd.path.dentry->d_sb; - param->arg1 = new_encode_dev(sb->s_dev); + devid = new_encode_dev(sb->s_dev); } else { struct autofs_info *ino; @@ -635,38 +637,41 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, goto out_release; ino = autofs4_dentry_ino(nd.path.dentry); - param->arg1 = autofs4_get_dev(ino->sbi); + devid = autofs4_get_dev(ino->sbi); 
} err = 0; if (nd.path.dentry->d_inode && nd.path.mnt->mnt_root == nd.path.dentry) { err = 1; - param->arg2 = nd.path.dentry->d_inode->i_sb->s_magic; + magic = nd.path.dentry->d_inode->i_sb->s_magic; } } else { - dev_t devid = new_encode_dev(sbi->sb->s_dev); + dev_t dev = autofs4_get_dev(sbi); err = path_lookup(path, LOOKUP_PARENT, &nd); if (err) goto out; - err = autofs_dev_ioctl_find_super(&nd, devid); + err = autofs_dev_ioctl_find_super(&nd, dev); if (err) goto out_release; - param->arg1 = autofs4_get_dev(sbi); + devid = dev; err = have_submounts(nd.path.dentry); if (nd.path.mnt->mnt_mountpoint != nd.path.mnt->mnt_root) { if (follow_down(&nd.path.mnt, &nd.path.dentry)) { struct inode *inode = nd.path.dentry->d_inode; - param->arg2 = inode->i_sb->s_magic; + magic = inode->i_sb->s_magic; } } } + param->ismountpoint.out.devid = devid; + param->ismountpoint.out.magic = magic; + out_release: path_put(&nd.path); out: diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 4b6fb3f628c..e3bd50776f9 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -63,7 +63,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); /* This is an autofs submount, we can't expire it */ - if (sbi->type == AUTOFS_TYPE_INDIRECT) + if (autofs_type_indirect(sbi->type)) goto done; /* @@ -490,7 +490,7 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt, if (arg && get_user(do_now, arg)) return -EFAULT; - if (sbi->type & AUTOFS_TYPE_TRIGGER) + if (autofs_type_trigger(sbi->type)) dentry = autofs4_expire_direct(sb, mnt, sbi, do_now); else dentry = autofs4_expire_indirect(sb, mnt, sbi, do_now); diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index cfc23e53b6f..716e12b627b 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -197,9 +197,9 @@ static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt) seq_printf(m, ",minproto=%d", sbi->min_proto); seq_printf(m, 
",maxproto=%d", sbi->max_proto); - if (sbi->type & AUTOFS_TYPE_OFFSET) + if (autofs_type_offset(sbi->type)) seq_printf(m, ",offset"); - else if (sbi->type & AUTOFS_TYPE_DIRECT) + else if (autofs_type_direct(sbi->type)) seq_printf(m, ",direct"); else seq_printf(m, ",indirect"); @@ -284,13 +284,13 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid, *maxproto = option; break; case Opt_indirect: - *type = AUTOFS_TYPE_INDIRECT; + set_autofs_type_indirect(type); break; case Opt_direct: - *type = AUTOFS_TYPE_DIRECT; + set_autofs_type_direct(type); break; case Opt_offset: - *type = AUTOFS_TYPE_OFFSET; + set_autofs_type_offset(type); break; default: return 1; @@ -338,7 +338,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) sbi->sb = s; sbi->version = 0; sbi->sub_version = 0; - sbi->type = AUTOFS_TYPE_INDIRECT; + set_autofs_type_indirect(&sbi->type); sbi->min_proto = 0; sbi->max_proto = 0; mutex_init(&sbi->wq_mutex); @@ -380,7 +380,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) } root_inode->i_fop = &autofs4_root_operations; - root_inode->i_op = sbi->type & AUTOFS_TYPE_TRIGGER ? + root_inode->i_op = autofs_type_trigger(sbi->type) ? &autofs4_direct_root_inode_operations : &autofs4_indirect_root_inode_operations; diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index e02cc8ae5eb..eeb24684590 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -337,7 +337,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, * is very similar for indirect mounts except only dentrys * in the root of the autofs file system may be negative. 
*/ - if (sbi->type & AUTOFS_TYPE_TRIGGER) + if (autofs_type_trigger(sbi->type)) return -ENOENT; else if (!IS_ROOT(dentry->d_parent)) return -ENOENT; @@ -348,7 +348,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, return -ENOMEM; /* If this is a direct mount request create a dummy name */ - if (IS_ROOT(dentry) && sbi->type & AUTOFS_TYPE_TRIGGER) + if (IS_ROOT(dentry) && autofs_type_trigger(sbi->type)) qstr.len = sprintf(name, "%p", dentry); else { qstr.len = autofs4_getpath(sbi, dentry, &name); @@ -406,11 +406,11 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, type = autofs_ptype_expire_multi; } else { if (notify == NFY_MOUNT) - type = (sbi->type & AUTOFS_TYPE_TRIGGER) ? + type = autofs_type_trigger(sbi->type) ? autofs_ptype_missing_direct : autofs_ptype_missing_indirect; else - type = (sbi->type & AUTOFS_TYPE_TRIGGER) ? + type = autofs_type_trigger(sbi->type) ? autofs_ptype_expire_direct : autofs_ptype_expire_indirect; } diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 0ed57b5ee01..cc4062d12ca 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -213,6 +213,9 @@ static void bfs_put_super(struct super_block *s) { struct bfs_sb_info *info = BFS_SB(s); + if (!info) + return; + brelse(info->si_sbh); mutex_destroy(&info->bfs_lock); kfree(info->si_imap); @@ -327,6 +330,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) unsigned i, imap_len; struct bfs_sb_info *info; long ret = -EINVAL; + unsigned long i_sblock, i_eblock, i_eoff, s_size; info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) @@ -350,6 +354,12 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) s->s_magic = BFS_MAGIC; info->si_sbh = bh; + + if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end)) { + printf("Superblock is corrupted\n"); + goto out; + } + info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) / sizeof(struct bfs_inode) + BFS_ROOT_INO - 1; @@ -380,6 +390,18 @@ static int 
bfs_fill_super(struct super_block *s, void *data, int silent) - le32_to_cpu(bfs_sb->s_start)) >> BFS_BSIZE_BITS; info->si_freei = 0; info->si_lf_eblk = 0; + + /* can we read the last block? */ + bh = sb_bread(s, info->si_blocks - 1); + if (!bh) { + printf("Last block not available: %lu\n", info->si_blocks - 1); + iput(inode); + ret = -EIO; + kfree(info->si_imap); + goto out; + } + brelse(bh); + bh = NULL; for (i = BFS_ROOT_INO; i <= info->si_lasti; i++) { struct bfs_inode *di; @@ -397,6 +419,29 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) di = (struct bfs_inode *)bh->b_data + off; + /* test if filesystem is not corrupted */ + + i_eoff = le32_to_cpu(di->i_eoffset); + i_sblock = le32_to_cpu(di->i_sblock); + i_eblock = le32_to_cpu(di->i_eblock); + s_size = le32_to_cpu(bfs_sb->s_end); + + if (i_sblock > info->si_blocks || + i_eblock > info->si_blocks || + i_sblock > i_eblock || + i_eoff > s_size || + i_sblock * BFS_BSIZE > i_eoff) { + + printf("Inode 0x%08x corrupted\n", i); + + brelse(bh); + s->s_root = NULL; + kfree(info->si_imap); + kfree(info); + s->s_fs_info = NULL; + return -EIO; + } + if (!di->i_ino) { info->si_freei++; continue; diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index e1158cb4fbd..c4e83537ead 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -649,7 +649,7 @@ static const struct file_operations bm_register_operations = { static ssize_t bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { - char *s = enabled ? "enabled" : "disabled"; + char *s = enabled ? 
"enabled\n" : "disabled\n"; return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s)); } diff --git a/fs/block_dev.c b/fs/block_dev.c index 349a26c1000..b957717e25a 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1262,7 +1262,7 @@ EXPORT_SYMBOL(ioctl_by_bdev); /** * lookup_bdev - lookup a struct block_device by name - * @path: special file representing the block device + * @pathname: special file representing the block device * * Get a reference to the blockdevice at @pathname in the current * namespace if possible and return it. Return ERR_PTR(error) diff --git a/fs/buffer.c b/fs/buffer.c index a13f09b696f..c26da785938 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2022,7 +2022,6 @@ int block_write_begin(struct file *file, struct address_space *mapping, if (pos + len > inode->i_size) vmtruncate(inode, inode->i_size); } - goto out; } out: diff --git a/fs/char_dev.c b/fs/char_dev.c index 700697a7261..38f71222a55 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -120,7 +120,7 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor, cd->major = major; cd->baseminor = baseminor; cd->minorct = minorct; - strncpy(cd->name,name, 64); + strlcpy(cd->name, name, sizeof(cd->name)); i = major_to_index(major); diff --git a/fs/compat.c b/fs/compat.c index d1ece79b641..30f2faa22f5 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1187,6 +1187,9 @@ compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, unsign ret = compat_do_readv_writev(READ, file, vec, vlen, &file->f_pos); out: + if (ret > 0) + add_rchar(current, ret); + inc_syscr(current); fput(file); return ret; } @@ -1210,6 +1213,9 @@ compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsig ret = compat_do_readv_writev(WRITE, file, vec, vlen, &file->f_pos); out: + if (ret > 0) + add_wchar(current, ret); + inc_syscw(current); fput(file); return ret; } diff --git a/fs/direct-io.c b/fs/direct-io.c index af0558dbe8b..b6d43908ff7 100644 --- a/fs/direct-io.c +++ 
b/fs/direct-io.c @@ -1209,6 +1209,19 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, retval = direct_io_worker(rw, iocb, inode, iov, offset, nr_segs, blkbits, get_block, end_io, dio); + /* + * In case of error extending write may have instantiated a few + * blocks outside i_size. Trim these off again for DIO_LOCKING. + * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this by + * it's own meaner. + */ + if (unlikely(retval < 0 && (rw & WRITE))) { + loff_t isize = i_size_read(inode); + + if (end > isize && dio_lock_type == DIO_LOCKING) + vmtruncate(inode, isize); + } + if (rw == READ && dio_lock_type == DIO_LOCKING) release_i_mutex = 0; diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 6046239465a..c01e043670e 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -175,8 +175,8 @@ out: * * Returns zero on success; non-zero on error. */ -static int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, - loff_t offset) +int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, + loff_t offset) { int rc = 0; char dst[MD5_DIGEST_SIZE]; @@ -924,6 +924,15 @@ static void ecryptfs_copy_mount_wide_flags_to_inode_flags( crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) crypt_stat->flags |= ECRYPTFS_VIEW_AS_ENCRYPTED; + if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) { + crypt_stat->flags |= ECRYPTFS_ENCRYPT_FILENAMES; + if (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK) + crypt_stat->flags |= ECRYPTFS_ENCFN_USE_MOUNT_FNEK; + else if (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_ENCFN_USE_FEK) + crypt_stat->flags |= ECRYPTFS_ENCFN_USE_FEK; + } } static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs( @@ -1060,7 +1069,8 @@ struct ecryptfs_flag_map_elem { static struct ecryptfs_flag_map_elem ecryptfs_flag_map[] = { {0x00000001, ECRYPTFS_ENABLE_HMAC}, {0x00000002, ECRYPTFS_ENCRYPTED}, - {0x00000004, 
ECRYPTFS_METADATA_IN_XATTR} + {0x00000004, ECRYPTFS_METADATA_IN_XATTR}, + {0x00000008, ECRYPTFS_ENCRYPT_FILENAMES} }; /** @@ -1149,19 +1159,20 @@ ecryptfs_cipher_code_str_map[] = { /** * ecryptfs_code_for_cipher_string - * @crypt_stat: The cryptographic context + * @cipher_name: The string alias for the cipher + * @key_bytes: Length of key in bytes; used for AES code selection * * Returns zero on no match, or the cipher code on match */ -u8 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat) +u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes) { int i; u8 code = 0; struct ecryptfs_cipher_code_str_map_elem *map = ecryptfs_cipher_code_str_map; - if (strcmp(crypt_stat->cipher, "aes") == 0) { - switch (crypt_stat->key_size) { + if (strcmp(cipher_name, "aes") == 0) { + switch (key_bytes) { case 16: code = RFC2440_CIPHER_AES_128; break; @@ -1173,7 +1184,7 @@ u8 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat) } } else { for (i = 0; i < ARRAY_SIZE(ecryptfs_cipher_code_str_map); i++) - if (strcmp(crypt_stat->cipher, map[i].cipher_str) == 0){ + if (strcmp(cipher_name, map[i].cipher_str) == 0) { code = map[i].cipher_code; break; } @@ -1212,6 +1223,8 @@ int ecryptfs_read_and_validate_header_region(char *data, &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat); int rc; + if (crypt_stat->extent_size == 0) + crypt_stat->extent_size = ECRYPTFS_DEFAULT_EXTENT_SIZE; rc = ecryptfs_read_lower(data, 0, crypt_stat->extent_size, ecryptfs_inode); if (rc) { @@ -1221,7 +1234,6 @@ int ecryptfs_read_and_validate_header_region(char *data, } if (!contains_ecryptfs_marker(data + ECRYPTFS_FILE_SIZE_BYTES)) { rc = -EINVAL; - ecryptfs_printk(KERN_DEBUG, "Valid marker not found\n"); } out: return rc; @@ -1628,95 +1640,95 @@ out: } /** - * ecryptfs_encode_filename - converts a plaintext file name to cipher text - * @crypt_stat: The crypt_stat struct associated with the file anem to encode - * @name: The plaintext name - * @length: 
The length of the plaintext - * @encoded_name: The encypted name + * ecryptfs_encrypt_filename - encrypt filename * - * Encrypts and encodes a filename into something that constitutes a - * valid filename for a filesystem, with printable characters. + * CBC-encrypts the filename. We do not want to encrypt the same + * filename with the same key and IV, which may happen with hard + * links, so we prepend random bits to each filename. * - * We assume that we have a properly initialized crypto context, - * pointed to by crypt_stat->tfm. - * - * TODO: Implement filename decoding and decryption here, in place of - * memcpy. We are keeping the framework around for now to (1) - * facilitate testing of the components needed to implement filename - * encryption and (2) to provide a code base from which other - * developers in the community can easily implement this feature. - * - * Returns the length of encoded filename; negative if error + * Returns zero on success; non-zero otherwise */ -int -ecryptfs_encode_filename(struct ecryptfs_crypt_stat *crypt_stat, - const char *name, int length, char **encoded_name) +static int +ecryptfs_encrypt_filename(struct ecryptfs_filename *filename, + struct ecryptfs_crypt_stat *crypt_stat, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat) { - int error = 0; + int rc = 0; - (*encoded_name) = kmalloc(length + 2, GFP_KERNEL); - if (!(*encoded_name)) { - error = -ENOMEM; + filename->encrypted_filename = NULL; + filename->encrypted_filename_size = 0; + if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCFN_USE_MOUNT_FNEK)) + || (mount_crypt_stat && (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) { + size_t packet_size; + size_t remaining_bytes; + + rc = ecryptfs_write_tag_70_packet( + NULL, NULL, + &filename->encrypted_filename_size, + mount_crypt_stat, NULL, + filename->filename_size); + if (rc) { + printk(KERN_ERR "%s: Error attempting to get packet " + "size for tag 72; rc = [%d]\n", __func__, + rc); + 
filename->encrypted_filename_size = 0; + goto out; + } + filename->encrypted_filename = + kmalloc(filename->encrypted_filename_size, GFP_KERNEL); + if (!filename->encrypted_filename) { + printk(KERN_ERR "%s: Out of memory whilst attempting " + "to kmalloc [%zd] bytes\n", __func__, + filename->encrypted_filename_size); + rc = -ENOMEM; + goto out; + } + remaining_bytes = filename->encrypted_filename_size; + rc = ecryptfs_write_tag_70_packet(filename->encrypted_filename, + &remaining_bytes, + &packet_size, + mount_crypt_stat, + filename->filename, + filename->filename_size); + if (rc) { + printk(KERN_ERR "%s: Error attempting to generate " + "tag 70 packet; rc = [%d]\n", __func__, + rc); + kfree(filename->encrypted_filename); + filename->encrypted_filename = NULL; + filename->encrypted_filename_size = 0; + goto out; + } + filename->encrypted_filename_size = packet_size; + } else { + printk(KERN_ERR "%s: No support for requested filename " + "encryption method in this release\n", __func__); + rc = -ENOTSUPP; goto out; } - /* TODO: Filename encryption is a scheduled feature for a - * future version of eCryptfs. This function is here only for - * the purpose of providing a framework for other developers - * to easily implement filename encryption. Hint: Replace this - * memcpy() with a call to encrypt and encode the - * filename, the set the length accordingly. */ - memcpy((void *)(*encoded_name), (void *)name, length); - (*encoded_name)[length] = '\0'; - error = length + 1; out: - return error; + return rc; } -/** - * ecryptfs_decode_filename - converts the cipher text name to plaintext - * @crypt_stat: The crypt_stat struct associated with the file - * @name: The filename in cipher text - * @length: The length of the cipher text name - * @decrypted_name: The plaintext name - * - * Decodes and decrypts the filename. - * - * We assume that we have a properly initialized crypto context, - * pointed to by crypt_stat->tfm. 
- * - * TODO: Implement filename decoding and decryption here, in place of - * memcpy. We are keeping the framework around for now to (1) - * facilitate testing of the components needed to implement filename - * encryption and (2) to provide a code base from which other - * developers in the community can easily implement this feature. - * - * Returns the length of decoded filename; negative if error - */ -int -ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat, - const char *name, int length, char **decrypted_name) +static int ecryptfs_copy_filename(char **copied_name, size_t *copied_name_size, + const char *name, size_t name_size) { - int error = 0; + int rc = 0; - (*decrypted_name) = kmalloc(length + 2, GFP_KERNEL); - if (!(*decrypted_name)) { - error = -ENOMEM; + (*copied_name) = kmalloc((name_size + 2), GFP_KERNEL); + if (!(*copied_name)) { + rc = -ENOMEM; goto out; } - /* TODO: Filename encryption is a scheduled feature for a - * future version of eCryptfs. This function is here only for - * the purpose of providing a framework for other developers - * to easily implement filename encryption. Hint: Replace this - * memcpy() with a call to decode and decrypt the - * filename, the set the length accordingly. 
*/ - memcpy((void *)(*decrypted_name), (void *)name, length); - (*decrypted_name)[length + 1] = '\0'; /* Only for convenience + memcpy((void *)(*copied_name), (void *)name, name_size); + (*copied_name)[(name_size)] = '\0'; /* Only for convenience * in printing out the * string in debug * messages */ - error = length; + (*copied_name_size) = (name_size + 1); out: - return error; + return rc; } /** @@ -1740,7 +1752,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm, *key_tfm = NULL; if (*key_size > ECRYPTFS_MAX_KEY_BYTES) { rc = -EINVAL; - printk(KERN_ERR "Requested key size is [%Zd] bytes; maximum " + printk(KERN_ERR "Requested key size is [%zd] bytes; maximum " "allowable is [%d]\n", *key_size, ECRYPTFS_MAX_KEY_BYTES); goto out; } @@ -1765,7 +1777,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm, get_random_bytes(dummy_key, *key_size); rc = crypto_blkcipher_setkey(*key_tfm, dummy_key, *key_size); if (rc) { - printk(KERN_ERR "Error attempting to set key of size [%Zd] for " + printk(KERN_ERR "Error attempting to set key of size [%zd] for " "cipher [%s]; rc = [%d]\n", *key_size, cipher_name, rc); rc = -EINVAL; goto out; @@ -1910,3 +1922,341 @@ out: mutex_unlock(&key_tfm_list_mutex); return rc; } + +/* 64 characters forming a 6-bit target field */ +static unsigned char *portable_filename_chars = ("-.0123456789ABCD" + "EFGHIJKLMNOPQRST" + "UVWXYZabcdefghij" + "klmnopqrstuvwxyz"); + +/* We could either offset on every reverse map or just pad some 0x00's + * at the front here */ +static const unsigned char filename_rev_map[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 15 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 31 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 39 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, /* 47 */ + 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, /* 55 */ + 
0x0A, 0x0B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 63 */
+	0x00, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, /* 71 */
+	0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, /* 79 */
+	0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, /* 87 */
+	0x23, 0x24, 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, /* 95 */
+	0x00, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, /* 103 */
+	0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, /* 111 */
+	0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, /* 119 */
+	0x3D, 0x3E, 0x3F
+};
+
+/**
+ * ecryptfs_encode_for_filename
+ * @dst: Destination location for encoded filename; if NULL, only
+ *       @dst_size is set and nothing is written
+ * @dst_size: Set to the size of the encoded filename in bytes (four
+ *            output characters per block of up to three source bytes)
+ * @src: Source location for the filename to encode
+ * @src_size: Size of the source in bytes
+ *
+ * Every block of three source bytes is split into four 6-bit values,
+ * each of which selects one character from portable_filename_chars.
+ * A trailing partial block is padded with zero bytes before encoding.
+ * No NUL terminator is written to @dst.
+ */
+void ecryptfs_encode_for_filename(unsigned char *dst, size_t *dst_size,
+				  unsigned char *src, size_t src_size)
+{
+	size_t num_blocks;
+	size_t block_num = 0;
+	size_t dst_offset = 0;
+	unsigned char last_block[3];
+
+	if (src_size == 0) {
+		(*dst_size) = 0;
+		goto out;
+	}
+	num_blocks = (src_size / 3);
+	if ((src_size % 3) == 0) {
+		memcpy(last_block, (&src[src_size - 3]), 3);
+	} else {
+		/* Partial final block: copy what is there, zero the rest */
+		num_blocks++;
+		last_block[2] = 0x00;
+		switch (src_size % 3) {
+		case 1:
+			last_block[0] = src[src_size - 1];
+			last_block[1] = 0x00;
+			break;
+		case 2:
+			last_block[0] = src[src_size - 2];
+			last_block[1] = src[src_size - 1];
+		}
+	}
+	(*dst_size) = (num_blocks * 4);
+	if (!dst)
+		goto out;
+	while (block_num < num_blocks) {
+		unsigned char *src_block;
+		unsigned char dst_block[4];
+
+		/* The final block always comes from the padded copy */
+		if (block_num == (num_blocks - 1))
+			src_block = last_block;
+		else
+			src_block = &src[block_num * 3];
+		dst_block[0] = ((src_block[0] >> 2) & 0x3F);
+		dst_block[1] = (((src_block[0] << 4) & 0x30)
+				| ((src_block[1] >> 4) & 0x0F));
+		dst_block[2] = (((src_block[1] << 2) & 0x3C)
+				| ((src_block[2] >> 6) & 0x03));
+		dst_block[3] = (src_block[2] & 0x3F);
+		dst[dst_offset++] = portable_filename_chars[dst_block[0]];
+		dst[dst_offset++] = portable_filename_chars[dst_block[1]];
+		dst[dst_offset++] = portable_filename_chars[dst_block[2]];
+		dst[dst_offset++] = portable_filename_chars[dst_block[3]];
+		block_num++;
+	}
+out:
+	return;
+}
+
+/**
+ * ecryptfs_decode_from_filename
+ * @dst: If NULL, this function only sets @dst_size and returns. If
+ *       non-NULL, this function decodes the encoded octets in @src
+ *       into the memory that @dst points to.
+ * @dst_size: Set to the size of the decoded string.
+ * @src: The encoded set of octets to decode.
+ * @src_size: The size of the encoded set of octets to decode.
+ *
+ * The decoder never writes beyond the size reported by the
+ * @dst == NULL sizing call for the same @src_size.
+ */
+static void
+ecryptfs_decode_from_filename(unsigned char *dst, size_t *dst_size,
+			      const unsigned char *src, size_t src_size)
+{
+	u8 current_bit_offset = 0;
+	size_t src_byte_offset = 0;
+	size_t dst_byte_offset = 0;
+
+	if (dst == NULL) {
+		/* Not exact; conservatively long. Every block of 4
+		 * encoded characters decodes into a block of 3
+		 * decoded characters. This segment of code provides
+		 * the caller with the maximum amount of allocated
+		 * space that @dst will need to point to in a
+		 * subsequent call. */
+		(*dst_size) = (((src_size + 1) * 3) / 4);
+		goto out;
+	}
+	while (src_byte_offset < src_size) {
+		/* NOTE(review): the source byte indexes filename_rev_map
+		 * directly; the table rows visible here end at index
+		 * 122 - confirm the table covers every byte value that
+		 * can reach this function */
+		unsigned char src_byte =
+				filename_rev_map[(int)src[src_byte_offset]];
+
+		switch (current_bit_offset) {
+		case 0:
+			dst[dst_byte_offset] = (src_byte << 2);
+			current_bit_offset = 6;
+			break;
+		case 6:
+			dst[dst_byte_offset++] |= (src_byte >> 4);
+			dst[dst_byte_offset] = ((src_byte & 0xF)
+						 << 4);
+			current_bit_offset = 4;
+			break;
+		case 4:
+			dst[dst_byte_offset++] |= (src_byte >> 2);
+			dst[dst_byte_offset] = (src_byte << 6);
+			current_bit_offset = 2;
+			break;
+		case 2:
+			dst[dst_byte_offset++] |= (src_byte);
+			/* Do not pre-clear the next byte: when src_size
+			 * is a multiple of 4 it lies one past the size
+			 * reported above, and case 0 assigns (not ORs)
+			 * the byte anyway */
+			current_bit_offset = 0;
+			break;
+		}
+		src_byte_offset++;
+	}
+	(*dst_size) = dst_byte_offset;
+out:
+	return;
+}
+
+/**
+ * ecryptfs_encrypt_and_encode_filename - converts a plaintext file name to cipher text
+ * @encoded_name: Set to a newly allocated buffer holding the encrypted
+ *                and encoded name; NULL on error
+ * @encoded_name_size: Set to the size of @encoded_name
+ * @crypt_stat: The crypt_stat struct associated with the file name to
+ *              encode; may be NULL
+ * @mount_crypt_stat: The mount-wide crypt_stat; may be NULL
+ * @name: The plaintext name
+ * @name_size: The length of the plaintext name
+ *
+ * Encrypts and encodes a filename into something that constitutes a
+ * valid filename for a filesystem, with printable characters.
+ *
+ * We assume that we have a properly initialized crypto context,
+ * pointed to by crypt_stat->tfm.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+int ecryptfs_encrypt_and_encode_filename(
+	char **encoded_name,
+	size_t *encoded_name_size,
+	struct ecryptfs_crypt_stat *crypt_stat,
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
+	const char *name, size_t name_size)
+{
+	size_t encoded_name_no_prefix_size;
+	int rc = 0;
+
+	(*encoded_name) = NULL;
+	(*encoded_name_size) = 0;
+	if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES))
+	    || (mount_crypt_stat && (mount_crypt_stat->flags
+				     & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) {
+		struct ecryptfs_filename *filename;
+
+		filename = kzalloc(sizeof(*filename), GFP_KERNEL);
+		if (!filename) {
+			printk(KERN_ERR "%s: Out of memory whilst attempting "
+			       "to kzalloc [%zd] bytes\n", __func__,
+			       sizeof(*filename));
+			rc = -ENOMEM;
+			goto out;
+		}
+		filename->filename = (char *)name;
+		filename->filename_size = name_size;
+		rc = ecryptfs_encrypt_filename(filename, crypt_stat,
+					       mount_crypt_stat);
+		if (rc) {
+			printk(KERN_ERR "%s: Error attempting to encrypt "
+			       "filename; rc = [%d]\n", __func__, rc);
+			kfree(filename);
+			goto out;
+		}
+		/* First pass with a NULL destination only sizes the
+		 * encoded form of the encrypted name */
+		ecryptfs_encode_for_filename(
+			NULL, &encoded_name_no_prefix_size,
+			filename->encrypted_filename,
+			filename->encrypted_filename_size);
+		if ((crypt_stat && (crypt_stat->flags
+				    & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
+		    || (mount_crypt_stat
+			&& (mount_crypt_stat->flags
+			    & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK)))
+			(*encoded_name_size) =
+				(ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE
+				 + encoded_name_no_prefix_size);
+		else
+			(*encoded_name_size) =
+				(ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE
+				 + encoded_name_no_prefix_size);
+		/* One extra byte for the terminating NUL written below */
+		(*encoded_name) = kmalloc((*encoded_name_size) + 1, GFP_KERNEL);
+		if (!(*encoded_name)) {
+			printk(KERN_ERR "%s: Out of memory whilst attempting "
+			       "to kmalloc [%zd] bytes\n", __func__,
+			       ((*encoded_name_size) + 1));
+			rc = -ENOMEM;
+			kfree(filename->encrypted_filename);
+			kfree(filename);
+			goto out;
+		}
+		if ((crypt_stat && (crypt_stat->flags
+				    & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
+		    || (mount_crypt_stat
+			&& (mount_crypt_stat->flags
+			    & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) {
+			memcpy((*encoded_name),
+			       ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX,
+			       ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE);
+			ecryptfs_encode_for_filename(
+			    ((*encoded_name)
+			     + ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE),
+			    &encoded_name_no_prefix_size,
+			    filename->encrypted_filename,
+			    filename->encrypted_filename_size);
+			(*encoded_name_size) =
+				(ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE
+				 + encoded_name_no_prefix_size);
+			/* The reported size includes the trailing NUL */
+			(*encoded_name)[(*encoded_name_size)] = '\0';
+			(*encoded_name_size)++;
+		} else {
+			/* Only the mount-wide FNEK mode is implemented */
+			rc = -ENOTSUPP;
+		}
+		if (rc) {
+			printk(KERN_ERR "%s: Error attempting to encode "
+			       "encrypted filename; rc = [%d]\n", __func__,
+			       rc);
+			kfree((*encoded_name));
+			(*encoded_name) = NULL;
+			(*encoded_name_size) = 0;
+		}
+		kfree(filename->encrypted_filename);
+		kfree(filename);
+	} else {
+		/* Filename encryption is off: pass the name through */
+		rc = ecryptfs_copy_filename(encoded_name,
+					    encoded_name_size,
+					    name, name_size);
+	}
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_decode_and_decrypt_filename - converts the encoded cipher text name to decoded plaintext
+ * @plaintext_name: The plaintext name
+ * @plaintext_name_size: The plaintext name size
+ * @ecryptfs_dir_dentry: eCryptfs directory dentry
+ * @name: The filename in cipher text
+ * @name_size: The cipher text name size
+ *
+ * Decrypts and decodes the filename. A name that does not carry the
+ * FNEK prefix, or whose tag 70 packet cannot be parsed, is copied
+ * through unchanged.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+int ecryptfs_decode_and_decrypt_filename(char **plaintext_name,
+					 size_t *plaintext_name_size,
+					 struct dentry *ecryptfs_dir_dentry,
+					 const char *name, size_t name_size)
+{
+	char *decoded_name;
+	size_t decoded_name_size;
+	size_t packet_size;
+	int rc = 0;
+
+	if ((name_size > ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE)
+	    && (strncmp(name, ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX,
+			ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) == 0)) {
+		struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
+			&ecryptfs_superblock_to_private(
+				ecryptfs_dir_dentry->d_sb)->mount_crypt_stat;
+		const char *orig_name = name;
+		size_t orig_name_size = name_size;
+
+		/* Skip the prefix; only the remainder is encoded data */
+		name += ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE;
+		name_size -= ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE;
+		ecryptfs_decode_from_filename(NULL, &decoded_name_size,
+					      name, name_size);
+		decoded_name = kmalloc(decoded_name_size, GFP_KERNEL);
+		if (!decoded_name) {
+			printk(KERN_ERR "%s: Out of memory whilst attempting "
+			       "to kmalloc [%zd] bytes\n", __func__,
+			       decoded_name_size);
+			rc = -ENOMEM;
+			goto out;
+		}
+		ecryptfs_decode_from_filename(decoded_name, &decoded_name_size,
+					      name, name_size);
+		rc = ecryptfs_parse_tag_70_packet(plaintext_name,
+						  plaintext_name_size,
+						  &packet_size,
+						  mount_crypt_stat,
+						  decoded_name,
+						  decoded_name_size);
+		if (rc) {
+			printk(KERN_INFO "%s: Could not parse tag 70 packet "
+			       "from filename; copying through filename "
+			       "as-is\n", __func__);
+			rc = ecryptfs_copy_filename(plaintext_name,
+						    plaintext_name_size,
+						    orig_name, orig_name_size);
+			goto out_free;
+		}
+	} else {
+		/* decoded_name was never allocated on this path, so
+		 * skip the kfree() at out_free */
+		rc = ecryptfs_copy_filename(plaintext_name,
+					    plaintext_name_size,
+					    name, name_size);
+		goto out;
+	}
+out_free:
+	kfree(decoded_name);
+out:
+	return rc;
+}
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index a75026d35d1..c11fc95714a 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -51,12
+51,16 @@ #define ECRYPTFS_VERSIONING_XATTR 0x00000010 #define ECRYPTFS_VERSIONING_MULTKEY 0x00000020 #define ECRYPTFS_VERSIONING_DEVMISC 0x00000040 +#define ECRYPTFS_VERSIONING_HMAC 0x00000080 +#define ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION 0x00000100 +#define ECRYPTFS_VERSIONING_GCM 0x00000200 #define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \ | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH \ | ECRYPTFS_VERSIONING_PUBKEY \ | ECRYPTFS_VERSIONING_XATTR \ | ECRYPTFS_VERSIONING_MULTKEY \ - | ECRYPTFS_VERSIONING_DEVMISC) + | ECRYPTFS_VERSIONING_DEVMISC \ + | ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION) #define ECRYPTFS_MAX_PASSWORD_LENGTH 64 #define ECRYPTFS_MAX_PASSPHRASE_BYTES ECRYPTFS_MAX_PASSWORD_LENGTH #define ECRYPTFS_SALT_SIZE 8 @@ -199,6 +203,7 @@ ecryptfs_get_key_payload_data(struct key *key) #define ECRYPTFS_DEFAULT_CIPHER "aes" #define ECRYPTFS_DEFAULT_KEY_BYTES 16 #define ECRYPTFS_DEFAULT_HASH "md5" +#define ECRYPTFS_TAG_70_DIGEST ECRYPTFS_DEFAULT_HASH #define ECRYPTFS_TAG_1_PACKET_TYPE 0x01 #define ECRYPTFS_TAG_3_PACKET_TYPE 0x8C #define ECRYPTFS_TAG_11_PACKET_TYPE 0xED @@ -206,30 +211,64 @@ ecryptfs_get_key_payload_data(struct key *key) #define ECRYPTFS_TAG_65_PACKET_TYPE 0x41 #define ECRYPTFS_TAG_66_PACKET_TYPE 0x42 #define ECRYPTFS_TAG_67_PACKET_TYPE 0x43 +#define ECRYPTFS_TAG_70_PACKET_TYPE 0x46 /* FNEK-encrypted filename + * as dentry name */ +#define ECRYPTFS_TAG_71_PACKET_TYPE 0x47 /* FNEK-encrypted filename in + * metadata */ +#define ECRYPTFS_TAG_72_PACKET_TYPE 0x48 /* FEK-encrypted filename as + * dentry name */ +#define ECRYPTFS_TAG_73_PACKET_TYPE 0x49 /* FEK-encrypted filename as + * metadata */ +/* Constraint: ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES >= + * ECRYPTFS_MAX_IV_BYTES */ +#define ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES 16 +#define ECRYPTFS_NON_NULL 0x42 /* A reasonable substitute for NULL */ #define MD5_DIGEST_SIZE 16 +#define ECRYPTFS_TAG_70_DIGEST_SIZE MD5_DIGEST_SIZE +#define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX 
"ECRYPTFS_FEK_ENCRYPTED." +#define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE 23 +#define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FNEK_ENCRYPTED." +#define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE 24 +#define ECRYPTFS_ENCRYPTED_DENTRY_NAME_LEN (18 + 1 + 4 + 1 + 32) struct ecryptfs_key_sig { struct list_head crypt_stat_list; char keysig[ECRYPTFS_SIG_SIZE_HEX]; }; +struct ecryptfs_filename { + struct list_head crypt_stat_list; +#define ECRYPTFS_FILENAME_CONTAINS_DECRYPTED 0x00000001 + u32 flags; + u32 seq_no; + char *filename; + char *encrypted_filename; + size_t filename_size; + size_t encrypted_filename_size; + char fnek_sig[ECRYPTFS_SIG_SIZE_HEX]; + char dentry_name[ECRYPTFS_ENCRYPTED_DENTRY_NAME_LEN + 1]; +}; + /** * This is the primary struct associated with each encrypted file. * * TODO: cache align/pack? */ struct ecryptfs_crypt_stat { -#define ECRYPTFS_STRUCT_INITIALIZED 0x00000001 -#define ECRYPTFS_POLICY_APPLIED 0x00000002 -#define ECRYPTFS_NEW_FILE 0x00000004 -#define ECRYPTFS_ENCRYPTED 0x00000008 -#define ECRYPTFS_SECURITY_WARNING 0x00000010 -#define ECRYPTFS_ENABLE_HMAC 0x00000020 -#define ECRYPTFS_ENCRYPT_IV_PAGES 0x00000040 -#define ECRYPTFS_KEY_VALID 0x00000080 -#define ECRYPTFS_METADATA_IN_XATTR 0x00000100 -#define ECRYPTFS_VIEW_AS_ENCRYPTED 0x00000200 -#define ECRYPTFS_KEY_SET 0x00000400 +#define ECRYPTFS_STRUCT_INITIALIZED 0x00000001 +#define ECRYPTFS_POLICY_APPLIED 0x00000002 +#define ECRYPTFS_NEW_FILE 0x00000004 +#define ECRYPTFS_ENCRYPTED 0x00000008 +#define ECRYPTFS_SECURITY_WARNING 0x00000010 +#define ECRYPTFS_ENABLE_HMAC 0x00000020 +#define ECRYPTFS_ENCRYPT_IV_PAGES 0x00000040 +#define ECRYPTFS_KEY_VALID 0x00000080 +#define ECRYPTFS_METADATA_IN_XATTR 0x00000100 +#define ECRYPTFS_VIEW_AS_ENCRYPTED 0x00000200 +#define ECRYPTFS_KEY_SET 0x00000400 +#define ECRYPTFS_ENCRYPT_FILENAMES 0x00000800 +#define ECRYPTFS_ENCFN_USE_MOUNT_FNEK 0x00001000 +#define ECRYPTFS_ENCFN_USE_FEK 0x00002000 u32 flags; unsigned int file_version; size_t 
iv_bytes; @@ -332,13 +371,20 @@ struct ecryptfs_mount_crypt_stat { #define ECRYPTFS_XATTR_METADATA_ENABLED 0x00000002 #define ECRYPTFS_ENCRYPTED_VIEW_ENABLED 0x00000004 #define ECRYPTFS_MOUNT_CRYPT_STAT_INITIALIZED 0x00000008 +#define ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES 0x00000010 +#define ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK 0x00000020 +#define ECRYPTFS_GLOBAL_ENCFN_USE_FEK 0x00000040 u32 flags; struct list_head global_auth_tok_list; struct mutex global_auth_tok_list_mutex; size_t num_global_auth_toks; size_t global_default_cipher_key_size; + size_t global_default_fn_cipher_key_bytes; unsigned char global_default_cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1]; + unsigned char global_default_fn_cipher_name[ + ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1]; + char global_default_fnek_sig[ECRYPTFS_SIG_SIZE_HEX + 1]; }; /* superblock private data. */ @@ -571,13 +617,22 @@ struct ecryptfs_open_req { int ecryptfs_interpose(struct dentry *hidden_dentry, struct dentry *this_dentry, struct super_block *sb, u32 flags); +int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, + struct dentry *lower_dentry, + struct ecryptfs_crypt_stat *crypt_stat, + struct inode *ecryptfs_dir_inode, + struct nameidata *ecryptfs_nd); +int ecryptfs_decode_and_decrypt_filename(char **decrypted_name, + size_t *decrypted_name_size, + struct dentry *ecryptfs_dentry, + const char *name, size_t name_size); int ecryptfs_fill_zeros(struct file *file, loff_t new_length); -int ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat, - const char *name, int length, - char **decrypted_name); -int ecryptfs_encode_filename(struct ecryptfs_crypt_stat *crypt_stat, - const char *name, int length, - char **encoded_name); +int ecryptfs_encrypt_and_encode_filename( + char **encoded_name, + size_t *encoded_name_size, + struct ecryptfs_crypt_stat *crypt_stat, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + const char *name, size_t name_size); struct dentry *ecryptfs_lower_dentry(struct dentry 
*this_dentry); void ecryptfs_dump_hex(char *data, int bytes); int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg, @@ -599,7 +654,7 @@ int ecryptfs_read_and_validate_header_region(char *data, struct inode *ecryptfs_inode); int ecryptfs_read_and_validate_xattr_region(char *page_virt, struct dentry *ecryptfs_dentry); -u8 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat); +u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes); int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code); void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat); int ecryptfs_generate_key_packet_set(char *dest_base, @@ -694,5 +749,17 @@ int ecryptfs_privileged_open(struct file **lower_file, struct vfsmount *lower_mnt, const struct cred *cred); int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry); +int +ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, + size_t *packet_size, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + char *filename, size_t filename_size); +int +ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, + size_t *packet_size, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + char *data, size_t max_packet_size); +int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, + loff_t offset); #endif /* #ifndef ECRYPTFS_KERNEL_H */ diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 71383437122..9e944057001 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -77,27 +77,27 @@ struct ecryptfs_getdents_callback { /* Inspired by generic filldir in fs/readdir.c */ static int -ecryptfs_filldir(void *dirent, const char *name, int namelen, loff_t offset, - u64 ino, unsigned int d_type) +ecryptfs_filldir(void *dirent, const char *lower_name, int lower_namelen, + loff_t offset, u64 ino, unsigned int d_type) { - struct ecryptfs_crypt_stat *crypt_stat; struct ecryptfs_getdents_callback *buf = (struct ecryptfs_getdents_callback 
*)dirent; + size_t name_size; + char *name; int rc; - int decoded_length; - char *decoded_name; - crypt_stat = ecryptfs_dentry_to_private(buf->dentry)->crypt_stat; buf->filldir_called++; - decoded_length = ecryptfs_decode_filename(crypt_stat, name, namelen, - &decoded_name); - if (decoded_length < 0) { - rc = decoded_length; + rc = ecryptfs_decode_and_decrypt_filename(&name, &name_size, + buf->dentry, lower_name, + lower_namelen); + if (rc) { + printk(KERN_ERR "%s: Error attempting to decode and decrypt " + "filename [%s]; rc = [%d]\n", __func__, lower_name, + rc); goto out; } - rc = buf->filldir(buf->dirent, decoded_name, decoded_length, offset, - ino, d_type); - kfree(decoded_name); + rc = buf->filldir(buf->dirent, name, name_size, offset, ino, d_type); + kfree(name); if (rc >= 0) buf->entries_written++; out: @@ -106,8 +106,8 @@ out: /** * ecryptfs_readdir - * @file: The ecryptfs file struct - * @dirent: Directory entry + * @file: The eCryptfs directory file + * @dirent: Directory entry handle * @filldir: The filldir callback function */ static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir) diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 0111906a887..5697899a168 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -52,8 +52,7 @@ static void unlock_dir(struct dentry *dir) /** * ecryptfs_create_underlying_file * @lower_dir_inode: inode of the parent in the lower fs of the new file - * @lower_dentry: New file's dentry in the lower fs - * @ecryptfs_dentry: New file's dentry in ecryptfs + * @dentry: New file's dentry * @mode: The mode of the new file * @nd: nameidata of ecryptfs' parent's dentry & vfsmount * @@ -228,8 +227,7 @@ ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry, { int rc; - /* ecryptfs_do_create() calls ecryptfs_interpose(), which opens - * the crypt_stat->lower_file (persistent file) */ + /* ecryptfs_do_create() calls ecryptfs_interpose() */ rc = 
ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode, nd); if (unlikely(rc)) { ecryptfs_printk(KERN_WARNING, "Failed to create file in" @@ -244,141 +242,91 @@ out: } /** - * ecryptfs_lookup - * @dir: inode - * @dentry: The dentry - * @nd: nameidata, may be NULL - * - * Find a file on disk. If the file does not exist, then we'll add it to the - * dentry cache and continue on to read it from the disk. + * ecryptfs_lookup_and_interpose_lower - Perform a lookup */ -static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) +int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, + struct dentry *lower_dentry, + struct ecryptfs_crypt_stat *crypt_stat, + struct inode *ecryptfs_dir_inode, + struct nameidata *ecryptfs_nd) { - int rc = 0; struct dentry *lower_dir_dentry; - struct dentry *lower_dentry; struct vfsmount *lower_mnt; - char *encoded_name; - int encoded_namelen; - struct ecryptfs_crypt_stat *crypt_stat = NULL; + struct inode *lower_inode; struct ecryptfs_mount_crypt_stat *mount_crypt_stat; char *page_virt = NULL; - struct inode *lower_inode; u64 file_size; + int rc = 0; - lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent); - dentry->d_op = &ecryptfs_dops; - if ((dentry->d_name.len == 1 && !strcmp(dentry->d_name.name, ".")) - || (dentry->d_name.len == 2 - && !strcmp(dentry->d_name.name, ".."))) { - d_drop(dentry); - goto out; - } - encoded_namelen = ecryptfs_encode_filename(crypt_stat, - dentry->d_name.name, - dentry->d_name.len, - &encoded_name); - if (encoded_namelen < 0) { - rc = encoded_namelen; - d_drop(dentry); - goto out; - } - ecryptfs_printk(KERN_DEBUG, "encoded_name = [%s]; encoded_namelen " - "= [%d]\n", encoded_name, encoded_namelen); - lower_dentry = lookup_one_len(encoded_name, lower_dir_dentry, - encoded_namelen - 1); - kfree(encoded_name); - if (IS_ERR(lower_dentry)) { - ecryptfs_printk(KERN_ERR, "ERR from lower_dentry\n"); - rc = PTR_ERR(lower_dentry); - d_drop(dentry); - 
goto out; - } - lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent)); - ecryptfs_printk(KERN_DEBUG, "lower_dentry = [%p]; lower_dentry->" - "d_name.name = [%s]\n", lower_dentry, - lower_dentry->d_name.name); + lower_dir_dentry = lower_dentry->d_parent; + lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt( + ecryptfs_dentry->d_parent)); lower_inode = lower_dentry->d_inode; - fsstack_copy_attr_atime(dir, lower_dir_dentry->d_inode); + fsstack_copy_attr_atime(ecryptfs_dir_inode, lower_dir_dentry->d_inode); BUG_ON(!atomic_read(&lower_dentry->d_count)); - ecryptfs_set_dentry_private(dentry, + ecryptfs_set_dentry_private(ecryptfs_dentry, kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL)); - if (!ecryptfs_dentry_to_private(dentry)) { + if (!ecryptfs_dentry_to_private(ecryptfs_dentry)) { rc = -ENOMEM; - ecryptfs_printk(KERN_ERR, "Out of memory whilst attempting " - "to allocate ecryptfs_dentry_info struct\n"); + printk(KERN_ERR "%s: Out of memory whilst attempting " + "to allocate ecryptfs_dentry_info struct\n", + __func__); goto out_dput; } - ecryptfs_set_dentry_lower(dentry, lower_dentry); - ecryptfs_set_dentry_lower_mnt(dentry, lower_mnt); + ecryptfs_set_dentry_lower(ecryptfs_dentry, lower_dentry); + ecryptfs_set_dentry_lower_mnt(ecryptfs_dentry, lower_mnt); if (!lower_dentry->d_inode) { /* We want to add because we couldn't find in lower */ - d_add(dentry, NULL); + d_add(ecryptfs_dentry, NULL); goto out; } - rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, - ECRYPTFS_INTERPOSE_FLAG_D_ADD); + rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry, + ecryptfs_dir_inode->i_sb, 1); if (rc) { - ecryptfs_printk(KERN_ERR, "Error interposing\n"); + printk(KERN_ERR "%s: Error interposing; rc = [%d]\n", + __func__, rc); goto out; } - if (S_ISDIR(lower_inode->i_mode)) { - ecryptfs_printk(KERN_DEBUG, "Is a directory; returning\n"); + if (S_ISDIR(lower_inode->i_mode)) goto out; - } - if (S_ISLNK(lower_inode->i_mode)) { - ecryptfs_printk(KERN_DEBUG, "Is a 
symlink; returning\n"); + if (S_ISLNK(lower_inode->i_mode)) goto out; - } - if (special_file(lower_inode->i_mode)) { - ecryptfs_printk(KERN_DEBUG, "Is a special file; returning\n"); + if (special_file(lower_inode->i_mode)) goto out; - } - if (!nd) { - ecryptfs_printk(KERN_DEBUG, "We have a NULL nd, just leave" - "as we *think* we are about to unlink\n"); + if (!ecryptfs_nd) goto out; - } /* Released in this function */ - page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, - GFP_USER); + page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, GFP_USER); if (!page_virt) { + printk(KERN_ERR "%s: Cannot kmem_cache_zalloc() a page\n", + __func__); rc = -ENOMEM; - ecryptfs_printk(KERN_ERR, - "Cannot ecryptfs_kmalloc a page\n"); goto out; } - crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; - if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) - ecryptfs_set_default_sizes(crypt_stat); - if (!ecryptfs_inode_to_private(dentry->d_inode)->lower_file) { - rc = ecryptfs_init_persistent_file(dentry); + if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) { + rc = ecryptfs_init_persistent_file(ecryptfs_dentry); if (rc) { printk(KERN_ERR "%s: Error attempting to initialize " "the persistent file for the dentry with name " "[%s]; rc = [%d]\n", __func__, - dentry->d_name.name, rc); - goto out; + ecryptfs_dentry->d_name.name, rc); + goto out_free_kmem; } } rc = ecryptfs_read_and_validate_header_region(page_virt, - dentry->d_inode); + ecryptfs_dentry->d_inode); if (rc) { - rc = ecryptfs_read_and_validate_xattr_region(page_virt, dentry); + rc = ecryptfs_read_and_validate_xattr_region(page_virt, + ecryptfs_dentry); if (rc) { - printk(KERN_DEBUG "Valid metadata not found in header " - "region or xattr region; treating file as " - "unencrypted\n"); rc = 0; - kmem_cache_free(ecryptfs_header_cache_2, page_virt); - goto out; + goto out_free_kmem; } crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; } mount_crypt_stat = &ecryptfs_superblock_to_private( - 
dentry->d_sb)->mount_crypt_stat; + ecryptfs_dentry->d_sb)->mount_crypt_stat; if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) { if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) file_size = (crypt_stat->num_header_bytes_at_front @@ -388,14 +336,103 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry, } else { file_size = get_unaligned_be64(page_virt); } - i_size_write(dentry->d_inode, (loff_t)file_size); + i_size_write(ecryptfs_dentry->d_inode, (loff_t)file_size); +out_free_kmem: kmem_cache_free(ecryptfs_header_cache_2, page_virt); goto out; - out_dput: dput(lower_dentry); - d_drop(dentry); + d_drop(ecryptfs_dentry); +out: + return rc; +} + +/** + * ecryptfs_lookup + * @ecryptfs_dir_inode: The eCryptfs directory inode + * @ecryptfs_dentry: The eCryptfs dentry that we are looking up + * @ecryptfs_nd: nameidata; may be NULL + * + * Find a file on disk. If the file does not exist, then we'll add it to the + * dentry cache and continue on to read it from the disk. 
+ */ +static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, + struct dentry *ecryptfs_dentry, + struct nameidata *ecryptfs_nd) +{ + char *encrypted_and_encoded_name = NULL; + size_t encrypted_and_encoded_name_size; + struct ecryptfs_crypt_stat *crypt_stat = NULL; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL; + struct ecryptfs_inode_info *inode_info; + struct dentry *lower_dir_dentry, *lower_dentry; + int rc = 0; + + ecryptfs_dentry->d_op = &ecryptfs_dops; + if ((ecryptfs_dentry->d_name.len == 1 + && !strcmp(ecryptfs_dentry->d_name.name, ".")) + || (ecryptfs_dentry->d_name.len == 2 + && !strcmp(ecryptfs_dentry->d_name.name, ".."))) { + goto out_d_drop; + } + lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); + lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name, + lower_dir_dentry, + ecryptfs_dentry->d_name.len); + if (IS_ERR(lower_dentry)) { + rc = PTR_ERR(lower_dentry); + printk(KERN_ERR "%s: lookup_one_len() returned [%d] on " + "lower_dentry = [%s]\n", __func__, rc, + ecryptfs_dentry->d_name.name); + goto out_d_drop; + } + if (lower_dentry->d_inode) + goto lookup_and_interpose; + inode_info = ecryptfs_inode_to_private(ecryptfs_dentry->d_inode); + if (inode_info) { + crypt_stat = &inode_info->crypt_stat; + /* TODO: lock for crypt_stat comparison */ + if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) + ecryptfs_set_default_sizes(crypt_stat); + } + if (crypt_stat) + mount_crypt_stat = crypt_stat->mount_crypt_stat; + else + mount_crypt_stat = &ecryptfs_superblock_to_private( + ecryptfs_dentry->d_sb)->mount_crypt_stat; + if (!(crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES)) + && !(mount_crypt_stat && (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) + goto lookup_and_interpose; + dput(lower_dentry); + rc = ecryptfs_encrypt_and_encode_filename( + &encrypted_and_encoded_name, &encrypted_and_encoded_name_size, + crypt_stat, mount_crypt_stat, ecryptfs_dentry->d_name.name, + 
ecryptfs_dentry->d_name.len); + if (rc) { + printk(KERN_ERR "%s: Error attempting to encrypt and encode " + "filename; rc = [%d]\n", __func__, rc); + goto out_d_drop; + } + lower_dentry = lookup_one_len(encrypted_and_encoded_name, + lower_dir_dentry, + encrypted_and_encoded_name_size - 1); + if (IS_ERR(lower_dentry)) { + rc = PTR_ERR(lower_dentry); + printk(KERN_ERR "%s: lookup_one_len() returned [%d] on " + "lower_dentry = [%s]\n", __func__, rc, + encrypted_and_encoded_name); + goto out_d_drop; + } +lookup_and_interpose: + rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry, + crypt_stat, ecryptfs_dir_inode, + ecryptfs_nd); + goto out; +out_d_drop: + d_drop(ecryptfs_dentry); out: + kfree(encrypted_and_encoded_name); return ERR_PTR(rc); } @@ -466,19 +503,21 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry, struct dentry *lower_dentry; struct dentry *lower_dir_dentry; char *encoded_symname; - int encoded_symlen; - struct ecryptfs_crypt_stat *crypt_stat = NULL; + size_t encoded_symlen; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL; lower_dentry = ecryptfs_dentry_to_lower(dentry); dget(lower_dentry); lower_dir_dentry = lock_parent(lower_dentry); - encoded_symlen = ecryptfs_encode_filename(crypt_stat, symname, - strlen(symname), - &encoded_symname); - if (encoded_symlen < 0) { - rc = encoded_symlen; + mount_crypt_stat = &ecryptfs_superblock_to_private( + dir->i_sb)->mount_crypt_stat; + rc = ecryptfs_encrypt_and_encode_filename(&encoded_symname, + &encoded_symlen, + NULL, + mount_crypt_stat, symname, + strlen(symname)); + if (rc) goto out_lock; - } rc = vfs_symlink(lower_dir_dentry->d_inode, lower_dentry, encoded_symname); kfree(encoded_symname); @@ -602,52 +641,54 @@ out_lock: } static int -ecryptfs_readlink(struct dentry *dentry, char __user * buf, int bufsiz) +ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) { - int rc; - struct dentry *lower_dentry; - char *decoded_name; char *lower_buf; - 
mm_segment_t old_fs; + struct dentry *lower_dentry; struct ecryptfs_crypt_stat *crypt_stat; + char *plaintext_name; + size_t plaintext_name_size; + mm_segment_t old_fs; + int rc; lower_dentry = ecryptfs_dentry_to_lower(dentry); if (!lower_dentry->d_inode->i_op->readlink) { rc = -EINVAL; goto out; } + crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; /* Released in this function */ lower_buf = kmalloc(bufsiz, GFP_KERNEL); if (lower_buf == NULL) { - ecryptfs_printk(KERN_ERR, "Out of memory\n"); + printk(KERN_ERR "%s: Out of memory whilst attempting to " + "kmalloc [%d] bytes\n", __func__, bufsiz); rc = -ENOMEM; goto out; } old_fs = get_fs(); set_fs(get_ds()); - ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ " - "lower_dentry->d_name.name = [%s]\n", - lower_dentry->d_name.name); rc = lower_dentry->d_inode->i_op->readlink(lower_dentry, (char __user *)lower_buf, bufsiz); set_fs(old_fs); if (rc >= 0) { - crypt_stat = NULL; - rc = ecryptfs_decode_filename(crypt_stat, lower_buf, rc, - &decoded_name); - if (rc == -ENOMEM) + rc = ecryptfs_decode_and_decrypt_filename(&plaintext_name, + &plaintext_name_size, + dentry, lower_buf, + rc); + if (rc) { + printk(KERN_ERR "%s: Error attempting to decode and " + "decrypt filename; rc = [%d]\n", __func__, + rc); goto out_free_lower_buf; - if (rc > 0) { - ecryptfs_printk(KERN_DEBUG, "Copying [%d] bytes " - "to userspace: [%*s]\n", rc, - decoded_name); - if (copy_to_user(buf, decoded_name, rc)) - rc = -EFAULT; } - kfree(decoded_name); - fsstack_copy_attr_atime(dentry->d_inode, - lower_dentry->d_inode); + rc = copy_to_user(buf, plaintext_name, plaintext_name_size); + if (rc) + rc = -EFAULT; + else + rc = plaintext_name_size; + kfree(plaintext_name); + fsstack_copy_attr_atime(dentry->d_inode, lower_dentry->d_inode); } out_free_lower_buf: kfree(lower_buf); @@ -669,8 +710,6 @@ static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd) } old_fs = get_fs(); set_fs(get_ds()); - 
ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ " - "dentry->d_name.name = [%s]\n", dentry->d_name.name); rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len); set_fs(old_fs); if (rc < 0) diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 0d713b69194..ff539420cc6 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -358,7 +358,7 @@ parse_tag_67_packet(struct ecryptfs_key_record *key_rec, /* verify that everything through the encrypted FEK size is present */ if (message_len < 4) { rc = -EIO; - printk(KERN_ERR "%s: message_len is [%Zd]; minimum acceptable " + printk(KERN_ERR "%s: message_len is [%zd]; minimum acceptable " "message length is [%d]\n", __func__, message_len, 4); goto out; } @@ -385,13 +385,13 @@ parse_tag_67_packet(struct ecryptfs_key_record *key_rec, i += data_len; if (message_len < (i + key_rec->enc_key_size)) { rc = -EIO; - printk(KERN_ERR "%s: message_len [%Zd]; max len is [%Zd]\n", + printk(KERN_ERR "%s: message_len [%zd]; max len is [%zd]\n", __func__, message_len, (i + key_rec->enc_key_size)); goto out; } if (key_rec->enc_key_size > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) { rc = -EIO; - printk(KERN_ERR "%s: Encrypted key_size [%Zd] larger than " + printk(KERN_ERR "%s: Encrypted key_size [%zd] larger than " "the maximum key size [%d]\n", __func__, key_rec->enc_key_size, ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES); @@ -403,6 +403,580 @@ out: } static int +ecryptfs_find_global_auth_tok_for_sig( + struct ecryptfs_global_auth_tok **global_auth_tok, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig) +{ + struct ecryptfs_global_auth_tok *walker; + int rc = 0; + + (*global_auth_tok) = NULL; + mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex); + list_for_each_entry(walker, + &mount_crypt_stat->global_auth_tok_list, + mount_crypt_stat_list) { + if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX) == 0) { + (*global_auth_tok) = walker; + goto out; + } + } + rc = -EINVAL; +out: + 
mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); + return rc; +} + +/** + * ecryptfs_find_auth_tok_for_sig + * @auth_tok: Set to the matching auth_tok; NULL if not found + * @mount_crypt_stat: Mount-wide crypto context whose registered + * auth_tok list is searched first + * @sig: Sig of auth_tok to find + * + * This function first looks at the registered auth_tok's linked off + * the mount_crypt_stat. If none of them matches @sig, it falls back + * to ecryptfs_keyring_auth_tok_for_sig(). This function could + * potentially try a lot harder to find auth_tok's (e.g., by calling + * out to ecryptfsd to dynamically retrieve an auth_tok object) so + * that static registration of auth_tok's will no longer be necessary. + * + * Returns zero on success; non-zero on error + */ +static int +ecryptfs_find_auth_tok_for_sig( + struct ecryptfs_auth_tok **auth_tok, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + char *sig) +{ + struct ecryptfs_global_auth_tok *global_auth_tok; + int rc = 0; + + (*auth_tok) = NULL; + if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok, + mount_crypt_stat, sig)) { + /* NOTE(review): auth_tok_key is neither used nor released + * in this function; confirm whether the callee leaves a + * key reference held */ + struct key *auth_tok_key; + + rc = ecryptfs_keyring_auth_tok_for_sig(&auth_tok_key, auth_tok, + sig); + } else + (*auth_tok) = global_auth_tok->global_auth_tok; + return rc; +} + +/** + * write_tag_70_packet can gobble a lot of stack space. We stuff most + * of the function's local state in a kmalloc'd struct to help reduce + * eCryptfs' overall stack usage. 
+ */ +struct ecryptfs_write_tag_70_packet_silly_stack { + u8 cipher_code; + size_t max_packet_size; + size_t packet_size_len; + size_t block_aligned_filename_size; + size_t block_size; + size_t i; + size_t j; + size_t num_rand_bytes; + struct mutex *tfm_mutex; + char *block_aligned_filename; + struct ecryptfs_auth_tok *auth_tok; + struct scatterlist src_sg; + struct scatterlist dst_sg; + struct blkcipher_desc desc; + char iv[ECRYPTFS_MAX_IV_BYTES]; + char hash[ECRYPTFS_TAG_70_DIGEST_SIZE]; + char tmp_hash[ECRYPTFS_TAG_70_DIGEST_SIZE]; + struct hash_desc hash_desc; + struct scatterlist hash_sg; +}; + +/** + * write_tag_70_packet - Write encrypted filename (EFN) packet against FNEK + * @filename: NULL-terminated filename string + * + * This is the simplest mechanism for achieving filename encryption in + * eCryptfs. It encrypts the given filename with the mount-wide + * filename encryption key (FNEK) and stores it in a packet to @dest, + * which the callee will encode and write directly into the dentry + * name. 
+ */ +int +ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, + size_t *packet_size, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + char *filename, size_t filename_size) +{ + struct ecryptfs_write_tag_70_packet_silly_stack *s; + int rc = 0; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) { + printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " + "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); + goto out; + } + s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + (*packet_size) = 0; + rc = ecryptfs_get_tfm_and_mutex_for_cipher_name( + &s->desc.tfm, + &s->tfm_mutex, mount_crypt_stat->global_default_fn_cipher_name); + if (unlikely(rc)) { + printk(KERN_ERR "Internal error whilst attempting to get " + "tfm and mutex for cipher name [%s]; rc = [%d]\n", + mount_crypt_stat->global_default_fn_cipher_name, rc); + goto out; + } + mutex_lock(s->tfm_mutex); + s->block_size = crypto_blkcipher_blocksize(s->desc.tfm); + /* Plus one for the \0 separator between the random prefix + * and the plaintext filename */ + s->num_rand_bytes = (ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES + 1); + s->block_aligned_filename_size = (s->num_rand_bytes + filename_size); + if ((s->block_aligned_filename_size % s->block_size) != 0) { + s->num_rand_bytes += (s->block_size + - (s->block_aligned_filename_size + % s->block_size)); + s->block_aligned_filename_size = (s->num_rand_bytes + + filename_size); + } + /* Octet 0: Tag 70 identifier + * Octets 1-N1: Tag 70 packet size (includes cipher identifier + * and block-aligned encrypted filename size) + * Octets N1-N2: FNEK sig (ECRYPTFS_SIG_SIZE) + * Octet N2-N3: Cipher identifier (1 octet) + * Octets N3-N4: Block-aligned encrypted filename + * - Consists of a minimum number of random characters, a \0 + * separator, and then the filename */ + s->max_packet_size = (1 /* Tag 70 identifier */ + + 3 /* Max Tag 70 packet size */ + + ECRYPTFS_SIG_SIZE /* FNEK sig */ + + 1 /* Cipher identifier */ + + 
s->block_aligned_filename_size); + if (dest == NULL) { + (*packet_size) = s->max_packet_size; + goto out_unlock; + } + if (s->max_packet_size > (*remaining_bytes)) { + printk(KERN_WARNING "%s: Require [%zd] bytes to write; only " + "[%zd] available\n", __func__, s->max_packet_size, + (*remaining_bytes)); + rc = -EINVAL; + goto out_unlock; + } + s->block_aligned_filename = kzalloc(s->block_aligned_filename_size, + GFP_KERNEL); + if (!s->block_aligned_filename) { + printk(KERN_ERR "%s: Out of kernel memory whilst attempting to " + "kzalloc [%zd] bytes\n", __func__, + s->block_aligned_filename_size); + rc = -ENOMEM; + goto out_unlock; + } + s->i = 0; + dest[s->i++] = ECRYPTFS_TAG_70_PACKET_TYPE; + rc = ecryptfs_write_packet_length(&dest[s->i], + (ECRYPTFS_SIG_SIZE + + 1 /* Cipher code */ + + s->block_aligned_filename_size), + &s->packet_size_len); + if (rc) { + printk(KERN_ERR "%s: Error generating tag 70 packet " + "header; cannot generate packet length; rc = [%d]\n", + __func__, rc); + goto out_free_unlock; + } + s->i += s->packet_size_len; + ecryptfs_from_hex(&dest[s->i], + mount_crypt_stat->global_default_fnek_sig, + ECRYPTFS_SIG_SIZE); + s->i += ECRYPTFS_SIG_SIZE; + s->cipher_code = ecryptfs_code_for_cipher_string( + mount_crypt_stat->global_default_fn_cipher_name, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + if (s->cipher_code == 0) { + printk(KERN_WARNING "%s: Unable to generate code for " + "cipher [%s] with key bytes [%zd]\n", __func__, + mount_crypt_stat->global_default_fn_cipher_name, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + rc = -EINVAL; + goto out_free_unlock; + } + dest[s->i++] = s->cipher_code; + rc = ecryptfs_find_auth_tok_for_sig( + &s->auth_tok, mount_crypt_stat, + mount_crypt_stat->global_default_fnek_sig); + if (rc) { + printk(KERN_ERR "%s: Error attempting to find auth tok for " + "fnek sig [%s]; rc = [%d]\n", __func__, + mount_crypt_stat->global_default_fnek_sig, rc); + goto out_free_unlock; + } + /* TODO: Support 
other key modules than passphrase for + * filename encryption */ + BUG_ON(s->auth_tok->token_type != ECRYPTFS_PASSWORD); + sg_init_one( + &s->hash_sg, + (u8 *)s->auth_tok->token.password.session_key_encryption_key, + s->auth_tok->token.password.session_key_encryption_key_bytes); + s->hash_desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + s->hash_desc.tfm = crypto_alloc_hash(ECRYPTFS_TAG_70_DIGEST, 0, + CRYPTO_ALG_ASYNC); + if (IS_ERR(s->hash_desc.tfm)) { + rc = PTR_ERR(s->hash_desc.tfm); + printk(KERN_ERR "%s: Error attempting to " + "allocate hash crypto context; rc = [%d]\n", + __func__, rc); + goto out_free_unlock; + } + rc = crypto_hash_init(&s->hash_desc); + if (rc) { + printk(KERN_ERR + "%s: Error initializing crypto hash; rc = [%d]\n", + __func__, rc); + goto out_release_free_unlock; + } + rc = crypto_hash_update( + &s->hash_desc, &s->hash_sg, + s->auth_tok->token.password.session_key_encryption_key_bytes); + if (rc) { + printk(KERN_ERR + "%s: Error updating crypto hash; rc = [%d]\n", + __func__, rc); + goto out_release_free_unlock; + } + rc = crypto_hash_final(&s->hash_desc, s->hash); + if (rc) { + printk(KERN_ERR + "%s: Error finalizing crypto hash; rc = [%d]\n", + __func__, rc); + goto out_release_free_unlock; + } + for (s->j = 0; s->j < (s->num_rand_bytes - 1); s->j++) { + s->block_aligned_filename[s->j] = + s->hash[(s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)]; + if ((s->j % ECRYPTFS_TAG_70_DIGEST_SIZE) + == (ECRYPTFS_TAG_70_DIGEST_SIZE - 1)) { + sg_init_one(&s->hash_sg, (u8 *)s->hash, + ECRYPTFS_TAG_70_DIGEST_SIZE); + rc = crypto_hash_init(&s->hash_desc); + if (rc) { + printk(KERN_ERR + "%s: Error initializing crypto hash; " + "rc = [%d]\n", __func__, rc); + goto out_release_free_unlock; + } + rc = crypto_hash_update(&s->hash_desc, &s->hash_sg, + ECRYPTFS_TAG_70_DIGEST_SIZE); + if (rc) { + printk(KERN_ERR + "%s: Error updating crypto hash; " + "rc = [%d]\n", __func__, rc); + goto out_release_free_unlock; + } + rc = crypto_hash_final(&s->hash_desc, s->tmp_hash); + if 
(rc) { + printk(KERN_ERR + "%s: Error finalizing crypto hash; " + "rc = [%d]\n", __func__, rc); + goto out_release_free_unlock; + } + memcpy(s->hash, s->tmp_hash, + ECRYPTFS_TAG_70_DIGEST_SIZE); + } + if (s->block_aligned_filename[s->j] == '\0') + s->block_aligned_filename[s->j] = ECRYPTFS_NON_NULL; + } + memcpy(&s->block_aligned_filename[s->num_rand_bytes], filename, + filename_size); + rc = virt_to_scatterlist(s->block_aligned_filename, + s->block_aligned_filename_size, &s->src_sg, 1); + if (rc != 1) { + printk(KERN_ERR "%s: Internal error whilst attempting to " + "convert filename memory to scatterlist; " + "expected rc = 1; got rc = [%d]. " + "block_aligned_filename_size = [%zd]\n", __func__, rc, + s->block_aligned_filename_size); + goto out_release_free_unlock; + } + rc = virt_to_scatterlist(&dest[s->i], s->block_aligned_filename_size, + &s->dst_sg, 1); + if (rc != 1) { + printk(KERN_ERR "%s: Internal error whilst attempting to " + "convert encrypted filename memory to scatterlist; " + "expected rc = 1; got rc = [%d]. " + "block_aligned_filename_size = [%zd]\n", __func__, rc, + s->block_aligned_filename_size); + goto out_release_free_unlock; + } + /* The characters in the first block effectively do the job + * of the IV here, so we just use 0's for the IV. Note the + * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES + * >= ECRYPTFS_MAX_IV_BYTES. */ + memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES); + s->desc.info = s->iv; + rc = crypto_blkcipher_setkey( + s->desc.tfm, + s->auth_tok->token.password.session_key_encryption_key, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + if (rc < 0) { + printk(KERN_ERR "%s: Error setting key for crypto context; " + "rc = [%d]. 
s->auth_tok->token.password.session_key_" + "encryption_key = [0x%p]; mount_crypt_stat->" + "global_default_fn_cipher_key_bytes = [%zd]\n", __func__, + rc, + s->auth_tok->token.password.session_key_encryption_key, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + goto out_release_free_unlock; + } + rc = crypto_blkcipher_encrypt_iv(&s->desc, &s->dst_sg, &s->src_sg, + s->block_aligned_filename_size); + if (rc) { + printk(KERN_ERR "%s: Error attempting to encrypt filename; " + "rc = [%d]\n", __func__, rc); + goto out_release_free_unlock; + } + s->i += s->block_aligned_filename_size; + (*packet_size) = s->i; + (*remaining_bytes) -= (*packet_size); +out_release_free_unlock: + crypto_free_hash(s->hash_desc.tfm); +out_free_unlock: + memset(s->block_aligned_filename, 0, s->block_aligned_filename_size); + kfree(s->block_aligned_filename); +out_unlock: + mutex_unlock(s->tfm_mutex); +out: + kfree(s); + return rc; +} + +struct ecryptfs_parse_tag_70_packet_silly_stack { + u8 cipher_code; + size_t max_packet_size; + size_t packet_size_len; + size_t parsed_tag_70_packet_size; + size_t block_aligned_filename_size; + size_t block_size; + size_t i; + struct mutex *tfm_mutex; + char *decrypted_filename; + struct ecryptfs_auth_tok *auth_tok; + struct scatterlist src_sg; + struct scatterlist dst_sg; + struct blkcipher_desc desc; + char fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX + 1]; + char iv[ECRYPTFS_MAX_IV_BYTES]; + char cipher_string[ECRYPTFS_MAX_CIPHER_NAME_SIZE]; +}; + +/** + * parse_tag_70_packet - Parse and process FNEK-encrypted passphrase packet + * @filename: This function kmalloc's the memory for the filename + * @filename_size: This function sets this to the amount of memory + * kmalloc'd for the filename + * @packet_size: This function sets this to the the number of octets + * in the packet parsed + * @mount_crypt_stat: The mount-wide cryptographic context + * @data: The memory location containing the start of the tag 70 + * packet + * @max_packet_size: The maximum legal 
size of the packet to be parsed + * from @data + * + * Returns zero on success; non-zero otherwise + */ +int +ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, + size_t *packet_size, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + char *data, size_t max_packet_size) +{ + struct ecryptfs_parse_tag_70_packet_silly_stack *s; + int rc = 0; + + (*packet_size) = 0; + (*filename_size) = 0; + (*filename) = NULL; + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) { + printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " + "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); + goto out; + } + s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + if (max_packet_size < (1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1)) { + printk(KERN_WARNING "%s: max_packet_size is [%zd]; it must be " + "at least [%d]\n", __func__, max_packet_size, + (1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1)); + rc = -EINVAL; + goto out; + } + /* Octet 0: Tag 70 identifier + * Octets 1-N1: Tag 70 packet size (includes cipher identifier + * and block-aligned encrypted filename size) + * Octets N1-N2: FNEK sig (ECRYPTFS_SIG_SIZE) + * Octet N2-N3: Cipher identifier (1 octet) + * Octets N3-N4: Block-aligned encrypted filename + * - Consists of a minimum number of random numbers, a \0 + * separator, and then the filename */ + if (data[(*packet_size)++] != ECRYPTFS_TAG_70_PACKET_TYPE) { + printk(KERN_WARNING "%s: Invalid packet tag [0x%.2x]; must be " + "tag [0x%.2x]\n", __func__, + data[((*packet_size) - 1)], ECRYPTFS_TAG_70_PACKET_TYPE); + rc = -EINVAL; + goto out; + } + rc = ecryptfs_parse_packet_length(&data[(*packet_size)], + &s->parsed_tag_70_packet_size, + &s->packet_size_len); + if (rc) { + printk(KERN_WARNING "%s: Error parsing packet length; " + "rc = [%d]\n", __func__, rc); + goto out; + } + s->block_aligned_filename_size = (s->parsed_tag_70_packet_size + - ECRYPTFS_SIG_SIZE - 1); + if ((1 + s->packet_size_len + s->parsed_tag_70_packet_size) + > max_packet_size) { + printk(KERN_WARNING "%s: 
max_packet_size is [%zd]; real packet " + "size is [%zd]\n", __func__, max_packet_size, + (1 + s->packet_size_len + 1 + + s->block_aligned_filename_size)); + rc = -EINVAL; + goto out; + } + (*packet_size) += s->packet_size_len; + ecryptfs_to_hex(s->fnek_sig_hex, &data[(*packet_size)], + ECRYPTFS_SIG_SIZE); + s->fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX] = '\0'; + (*packet_size) += ECRYPTFS_SIG_SIZE; + s->cipher_code = data[(*packet_size)++]; + rc = ecryptfs_cipher_code_to_string(s->cipher_string, s->cipher_code); + if (rc) { + printk(KERN_WARNING "%s: Cipher code [%d] is invalid\n", + __func__, s->cipher_code); + goto out; + } + rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&s->desc.tfm, + &s->tfm_mutex, + s->cipher_string); + if (unlikely(rc)) { + printk(KERN_ERR "Internal error whilst attempting to get " + "tfm and mutex for cipher name [%s]; rc = [%d]\n", + s->cipher_string, rc); + goto out; + } + mutex_lock(s->tfm_mutex); + rc = virt_to_scatterlist(&data[(*packet_size)], + s->block_aligned_filename_size, &s->src_sg, 1); + if (rc != 1) { + printk(KERN_ERR "%s: Internal error whilst attempting to " + "convert encrypted filename memory to scatterlist; " + "expected rc = 1; got rc = [%d]. " + "block_aligned_filename_size = [%zd]\n", __func__, rc, + s->block_aligned_filename_size); + goto out_unlock; + } + (*packet_size) += s->block_aligned_filename_size; + s->decrypted_filename = kmalloc(s->block_aligned_filename_size, + GFP_KERNEL); + if (!s->decrypted_filename) { + printk(KERN_ERR "%s: Out of memory whilst attempting to " + "kmalloc [%zd] bytes\n", __func__, + s->block_aligned_filename_size); + rc = -ENOMEM; + goto out_unlock; + } + rc = virt_to_scatterlist(s->decrypted_filename, + s->block_aligned_filename_size, &s->dst_sg, 1); + if (rc != 1) { + printk(KERN_ERR "%s: Internal error whilst attempting to " + "convert decrypted filename memory to scatterlist; " + "expected rc = 1; got rc = [%d]. 
" + "block_aligned_filename_size = [%zd]\n", __func__, rc, + s->block_aligned_filename_size); + goto out_free_unlock; + } + /* The characters in the first block effectively do the job of + * the IV here, so we just use 0's for the IV. Note the + * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES + * >= ECRYPTFS_MAX_IV_BYTES. */ + memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES); + s->desc.info = s->iv; + rc = ecryptfs_find_auth_tok_for_sig(&s->auth_tok, mount_crypt_stat, + s->fnek_sig_hex); + if (rc) { + printk(KERN_ERR "%s: Error attempting to find auth tok for " + "fnek sig [%s]; rc = [%d]\n", __func__, s->fnek_sig_hex, + rc); + goto out_free_unlock; + } + /* TODO: Support other key modules than passphrase for + * filename encryption */ + BUG_ON(s->auth_tok->token_type != ECRYPTFS_PASSWORD); + rc = crypto_blkcipher_setkey( + s->desc.tfm, + s->auth_tok->token.password.session_key_encryption_key, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + if (rc < 0) { + printk(KERN_ERR "%s: Error setting key for crypto context; " + "rc = [%d]. 
s->auth_tok->token.password.session_key_" + "encryption_key = [0x%p]; mount_crypt_stat->" + "global_default_fn_cipher_key_bytes = [%zd]\n", __func__, + rc, + s->auth_tok->token.password.session_key_encryption_key, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + goto out_free_unlock; + } + rc = crypto_blkcipher_decrypt_iv(&s->desc, &s->dst_sg, &s->src_sg, + s->block_aligned_filename_size); + if (rc) { + printk(KERN_ERR "%s: Error attempting to decrypt filename; " + "rc = [%d]\n", __func__, rc); + goto out_free_unlock; + } + s->i = 0; + while (s->decrypted_filename[s->i] != '\0' + && s->i < s->block_aligned_filename_size) + s->i++; + if (s->i == s->block_aligned_filename_size) { + printk(KERN_WARNING "%s: Invalid tag 70 packet; could not " + "find valid separator between random characters and " + "the filename\n", __func__); + rc = -EINVAL; + goto out_free_unlock; + } + s->i++; + (*filename_size) = (s->block_aligned_filename_size - s->i); + if (!((*filename_size) > 0 && (*filename_size < PATH_MAX))) { + printk(KERN_WARNING "%s: Filename size is [%zd], which is " + "invalid\n", __func__, (*filename_size)); + rc = -EINVAL; + goto out_free_unlock; + } + (*filename) = kmalloc(((*filename_size) + 1), GFP_KERNEL); + if (!(*filename)) { + printk(KERN_ERR "%s: Out of memory whilst attempting to " + "kmalloc [%zd] bytes\n", __func__, + ((*filename_size) + 1)); + rc = -ENOMEM; + goto out_free_unlock; + } + memcpy((*filename), &s->decrypted_filename[s->i], (*filename_size)); + (*filename)[(*filename_size)] = '\0'; +out_free_unlock: + kfree(s->decrypted_filename); +out_unlock: + mutex_unlock(s->tfm_mutex); +out: + if (rc) { + (*packet_size) = 0; + (*filename_size) = 0; + (*filename) = NULL; + } + kfree(s); + return rc; +} + +static int ecryptfs_get_auth_tok_sig(char **sig, struct ecryptfs_auth_tok *auth_tok) { int rc = 0; @@ -897,30 +1471,6 @@ out: return rc; } -static int -ecryptfs_find_global_auth_tok_for_sig( - struct ecryptfs_global_auth_tok **global_auth_tok, - 
struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig) -{ - struct ecryptfs_global_auth_tok *walker; - int rc = 0; - - (*global_auth_tok) = NULL; - mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex); - list_for_each_entry(walker, - &mount_crypt_stat->global_auth_tok_list, - mount_crypt_stat_list) { - if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX) == 0) { - (*global_auth_tok) = walker; - goto out; - } - } - rc = -EINVAL; -out: - mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); - return rc; -} - /** * ecryptfs_verify_version * @version: The version number to confirm @@ -990,43 +1540,6 @@ out: } /** - * ecryptfs_find_auth_tok_for_sig - * @auth_tok: Set to the matching auth_tok; NULL if not found - * @crypt_stat: inode crypt_stat crypto context - * @sig: Sig of auth_tok to find - * - * For now, this function simply looks at the registered auth_tok's - * linked off the mount_crypt_stat, so all the auth_toks that can be - * used must be registered at mount time. This function could - * potentially try a lot harder to find auth_tok's (e.g., by calling - * out to ecryptfsd to dynamically retrieve an auth_tok object) so - * that static registration of auth_tok's will no longer be necessary. - * - * Returns zero on no error; non-zero on error - */ -static int -ecryptfs_find_auth_tok_for_sig( - struct ecryptfs_auth_tok **auth_tok, - struct ecryptfs_crypt_stat *crypt_stat, char *sig) -{ - struct ecryptfs_mount_crypt_stat *mount_crypt_stat = - crypt_stat->mount_crypt_stat; - struct ecryptfs_global_auth_tok *global_auth_tok; - int rc = 0; - - (*auth_tok) = NULL; - if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok, - mount_crypt_stat, sig)) { - struct key *auth_tok_key; - - rc = ecryptfs_keyring_auth_tok_for_sig(&auth_tok_key, auth_tok, - sig); - } else - (*auth_tok) = global_auth_tok->global_auth_tok; - return rc; -} - -/** * decrypt_passphrase_encrypted_session_key - Decrypt the session key with the given auth_tok. 
* @auth_tok: The passphrase authentication token to use to encrypt the FEK * @crypt_stat: The cryptographic context @@ -1256,7 +1769,8 @@ find_next_matching_auth_tok: rc = -EINVAL; goto out_wipe_list; } - ecryptfs_find_auth_tok_for_sig(&matching_auth_tok, crypt_stat, + ecryptfs_find_auth_tok_for_sig(&matching_auth_tok, + crypt_stat->mount_crypt_stat, candidate_auth_tok_sig); if (matching_auth_tok) { found_auth_tok = 1; @@ -1336,7 +1850,9 @@ pki_encrypt_session_key(struct ecryptfs_auth_tok *auth_tok, int rc; rc = write_tag_66_packet(auth_tok->token.private_key.signature, - ecryptfs_code_for_cipher_string(crypt_stat), + ecryptfs_code_for_cipher_string( + crypt_stat->cipher, + crypt_stat->key_size), crypt_stat, &payload, &payload_len); if (rc) { ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet\n"); @@ -1696,7 +2212,8 @@ encrypted_session_key_set: dest[(*packet_size)++] = 0x04; /* version 4 */ /* TODO: Break from RFC2440 so that arbitrary ciphers can be * specified with strings */ - cipher_code = ecryptfs_code_for_cipher_string(crypt_stat); + cipher_code = ecryptfs_code_for_cipher_string(crypt_stat->cipher, + crypt_stat->key_size); if (cipher_code == 0) { ecryptfs_printk(KERN_WARNING, "Unable to generate code for " "cipher [%s]\n", crypt_stat->cipher); diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index fd630713c5c..789cf2e1be1 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -206,7 +206,9 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig, ecryptfs_opt_cipher, ecryptfs_opt_ecryptfs_cipher, ecryptfs_opt_ecryptfs_key_bytes, ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata, - ecryptfs_opt_encrypted_view, ecryptfs_opt_err }; + ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig, + ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes, + ecryptfs_opt_err }; static const match_table_t tokens = { {ecryptfs_opt_sig, "sig=%s"}, @@ -217,6 +219,9 @@ static const match_table_t tokens = { {ecryptfs_opt_passthrough, "ecryptfs_passthrough"}, 
{ecryptfs_opt_xattr_metadata, "ecryptfs_xattr_metadata"}, {ecryptfs_opt_encrypted_view, "ecryptfs_encrypted_view"}, + {ecryptfs_opt_fnek_sig, "ecryptfs_fnek_sig=%s"}, + {ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"}, + {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"}, {ecryptfs_opt_err, NULL} }; @@ -281,8 +286,11 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) int rc = 0; int sig_set = 0; int cipher_name_set = 0; + int fn_cipher_name_set = 0; int cipher_key_bytes; int cipher_key_bytes_set = 0; + int fn_cipher_key_bytes; + int fn_cipher_key_bytes_set = 0; struct ecryptfs_mount_crypt_stat *mount_crypt_stat = &ecryptfs_superblock_to_private(sb)->mount_crypt_stat; substring_t args[MAX_OPT_ARGS]; @@ -290,7 +298,12 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) char *sig_src; char *cipher_name_dst; char *cipher_name_src; + char *fn_cipher_name_dst; + char *fn_cipher_name_src; + char *fnek_dst; + char *fnek_src; char *cipher_key_bytes_src; + char *fn_cipher_key_bytes_src; if (!options) { rc = -EINVAL; @@ -322,10 +335,7 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) global_default_cipher_name; strncpy(cipher_name_dst, cipher_name_src, ECRYPTFS_MAX_CIPHER_NAME_SIZE); - ecryptfs_printk(KERN_DEBUG, - "The mount_crypt_stat " - "global_default_cipher_name set to: " - "[%s]\n", cipher_name_dst); + cipher_name_dst[ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0'; cipher_name_set = 1; break; case ecryptfs_opt_ecryptfs_key_bytes: @@ -335,11 +345,6 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) &cipher_key_bytes_src, 0); mount_crypt_stat->global_default_cipher_key_size = cipher_key_bytes; - ecryptfs_printk(KERN_DEBUG, - "The mount_crypt_stat " - "global_default_cipher_key_size " - "set to: [%d]\n", mount_crypt_stat-> - global_default_cipher_key_size); cipher_key_bytes_set = 1; break; case ecryptfs_opt_passthrough: @@ -356,11 +361,51 @@ static int 
ecryptfs_parse_options(struct super_block *sb, char *options) mount_crypt_stat->flags |= ECRYPTFS_ENCRYPTED_VIEW_ENABLED; break; + case ecryptfs_opt_fnek_sig: + fnek_src = args[0].from; + fnek_dst = + mount_crypt_stat->global_default_fnek_sig; + strncpy(fnek_dst, fnek_src, ECRYPTFS_SIG_SIZE_HEX); + mount_crypt_stat->global_default_fnek_sig[ + ECRYPTFS_SIG_SIZE_HEX] = '\0'; + rc = ecryptfs_add_global_auth_tok( + mount_crypt_stat, + mount_crypt_stat->global_default_fnek_sig); + if (rc) { + printk(KERN_ERR "Error attempting to register " + "global fnek sig [%s]; rc = [%d]\n", + mount_crypt_stat->global_default_fnek_sig, + rc); + goto out; + } + mount_crypt_stat->flags |= + (ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES + | ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK); + break; + case ecryptfs_opt_fn_cipher: + fn_cipher_name_src = args[0].from; + fn_cipher_name_dst = + mount_crypt_stat->global_default_fn_cipher_name; + strncpy(fn_cipher_name_dst, fn_cipher_name_src, + ECRYPTFS_MAX_CIPHER_NAME_SIZE); + mount_crypt_stat->global_default_fn_cipher_name[ + ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0'; + fn_cipher_name_set = 1; + break; + case ecryptfs_opt_fn_cipher_key_bytes: + fn_cipher_key_bytes_src = args[0].from; + fn_cipher_key_bytes = + (int)simple_strtol(fn_cipher_key_bytes_src, + &fn_cipher_key_bytes_src, 0); + mount_crypt_stat->global_default_fn_cipher_key_bytes = + fn_cipher_key_bytes; + fn_cipher_key_bytes_set = 1; + break; case ecryptfs_opt_err: default: - ecryptfs_printk(KERN_WARNING, - "eCryptfs: unrecognized option '%s'\n", - p); + printk(KERN_WARNING + "%s: eCryptfs: unrecognized option [%s]\n", + __func__, p); } } if (!sig_set) { @@ -374,33 +419,60 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) int cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER); BUG_ON(cipher_name_len >= ECRYPTFS_MAX_CIPHER_NAME_SIZE); - strcpy(mount_crypt_stat->global_default_cipher_name, ECRYPTFS_DEFAULT_CIPHER); } - if (!cipher_key_bytes_set) { + if ((mount_crypt_stat->flags & 
ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) + && !fn_cipher_name_set) + strcpy(mount_crypt_stat->global_default_fn_cipher_name, + mount_crypt_stat->global_default_cipher_name); + if (!cipher_key_bytes_set) mount_crypt_stat->global_default_cipher_key_size = 0; - } + if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) + && !fn_cipher_key_bytes_set) + mount_crypt_stat->global_default_fn_cipher_key_bytes = + mount_crypt_stat->global_default_cipher_key_size; mutex_lock(&key_tfm_list_mutex); if (!ecryptfs_tfm_exists(mount_crypt_stat->global_default_cipher_name, - NULL)) + NULL)) { rc = ecryptfs_add_new_key_tfm( NULL, mount_crypt_stat->global_default_cipher_name, mount_crypt_stat->global_default_cipher_key_size); - mutex_unlock(&key_tfm_list_mutex); - if (rc) { - printk(KERN_ERR "Error attempting to initialize cipher with " - "name = [%s] and key size = [%td]; rc = [%d]\n", - mount_crypt_stat->global_default_cipher_name, - mount_crypt_stat->global_default_cipher_key_size, rc); - rc = -EINVAL; - goto out; + if (rc) { + printk(KERN_ERR "Error attempting to initialize " + "cipher with name = [%s] and key size = [%td]; " + "rc = [%d]\n", + mount_crypt_stat->global_default_cipher_name, + mount_crypt_stat->global_default_cipher_key_size, + rc); + rc = -EINVAL; + mutex_unlock(&key_tfm_list_mutex); + goto out; + } } + if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) + && !ecryptfs_tfm_exists( + mount_crypt_stat->global_default_fn_cipher_name, NULL)) { + rc = ecryptfs_add_new_key_tfm( + NULL, mount_crypt_stat->global_default_fn_cipher_name, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + if (rc) { + printk(KERN_ERR "Error attempting to initialize " + "cipher with name = [%s] and key size = [%td]; " + "rc = [%d]\n", + mount_crypt_stat->global_default_fn_cipher_name, + mount_crypt_stat->global_default_fn_cipher_key_bytes, + rc); + rc = -EINVAL; + mutex_unlock(&key_tfm_list_mutex); + goto out; + } + } + mutex_unlock(&key_tfm_list_mutex); rc = 
ecryptfs_init_global_auth_toks(mount_crypt_stat); - if (rc) { + if (rc) printk(KERN_WARNING "One or more global auth toks could not " "properly register; rc = [%d]\n", rc); - } out: return rc; } diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index 6913f727624..96ef51489e0 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -193,7 +193,7 @@ ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid, (*daemon) = kzalloc(sizeof(**daemon), GFP_KERNEL); if (!(*daemon)) { rc = -ENOMEM; - printk(KERN_ERR "%s: Failed to allocate [%Zd] bytes of " + printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of " "GFP_KERNEL memory\n", __func__, sizeof(**daemon)); goto out; } @@ -435,7 +435,7 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid, msg_ctx->msg = kmalloc(msg_size, GFP_KERNEL); if (!msg_ctx->msg) { rc = -ENOMEM; - printk(KERN_ERR "%s: Failed to allocate [%Zd] bytes of " + printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of " "GFP_KERNEL memory\n", __func__, msg_size); goto unlock; } diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c index efd95a0ed1e..a67fea655f4 100644 --- a/fs/ecryptfs/miscdev.c +++ b/fs/ecryptfs/miscdev.c @@ -199,7 +199,7 @@ int ecryptfs_send_miscdev(char *data, size_t data_size, if (!msg_ctx->msg) { rc = -ENOMEM; printk(KERN_ERR "%s: Out of memory whilst attempting " - "to kmalloc(%Zd, GFP_KERNEL)\n", __func__, + "to kmalloc(%zd, GFP_KERNEL)\n", __func__, (sizeof(*msg_ctx->msg) + data_size)); goto out_unlock; } @@ -322,7 +322,7 @@ check_list: if (count < total_length) { rc = 0; printk(KERN_WARNING "%s: Only given user buffer of " - "size [%Zd], but we need [%Zd] to read the " + "size [%zd], but we need [%zd] to read the " "pending message\n", __func__, count, total_length); goto out_unlock_msg_ctx; } @@ -376,7 +376,7 @@ static int ecryptfs_miscdev_response(char *data, size_t data_size, if ((sizeof(*msg) + msg->data_len) != data_size) { printk(KERN_WARNING "%s: (sizeof(*msg) + 
msg->data_len) = " - "[%Zd]; data_size = [%Zd]. Invalid packet.\n", __func__, + "[%zd]; data_size = [%zd]. Invalid packet.\n", __func__, (sizeof(*msg) + msg->data_len), data_size); rc = -EINVAL; goto out; @@ -421,7 +421,7 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf, data = kmalloc(count, GFP_KERNEL); if (!data) { printk(KERN_ERR "%s: Out of memory whilst attempting to " - "kmalloc([%Zd], GFP_KERNEL)\n", __func__, count); + "kmalloc([%zd], GFP_KERNEL)\n", __func__, count); goto out; } rc = copy_from_user(data, buf, count); @@ -436,8 +436,8 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf, case ECRYPTFS_MSG_RESPONSE: if (count < (1 + 4 + 1 + sizeof(struct ecryptfs_message))) { printk(KERN_WARNING "%s: Minimum acceptable packet " - "size is [%Zd], but amount of data written is " - "only [%Zd]. Discarding response packet.\n", + "size is [%zd], but amount of data written is " + "only [%zd]. Discarding response packet.\n", __func__, (1 + 4 + 1 + sizeof(struct ecryptfs_message)), count); @@ -455,9 +455,9 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf, } i += packet_size_length; if ((1 + 4 + packet_size_length + packet_size) != count) { - printk(KERN_WARNING "%s: (1 + packet_size_length([%Zd])" - " + packet_size([%Zd]))([%Zd]) != " - "count([%Zd]). Invalid packet format.\n", + printk(KERN_WARNING "%s: (1 + packet_size_length([%zd])" + " + packet_size([%zd]))([%zd]) != " + "count([%zd]). 
Invalid packet format.\n", __func__, packet_size_length, packet_size, (1 + packet_size_length + packet_size), count); goto out_free; diff --git a/fs/exec.c b/fs/exec.c index 9c33f542dc7..71a6efe5d8b 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -232,13 +232,13 @@ static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, static int __bprm_mm_init(struct linux_binprm *bprm) { - int err = -ENOMEM; + int err; struct vm_area_struct *vma = NULL; struct mm_struct *mm = bprm->mm; bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); if (!vma) - goto err; + return -ENOMEM; down_write(&mm->mmap_sem); vma->vm_mm = mm; @@ -251,28 +251,20 @@ static int __bprm_mm_init(struct linux_binprm *bprm) */ vma->vm_end = STACK_TOP_MAX; vma->vm_start = vma->vm_end - PAGE_SIZE; - vma->vm_flags = VM_STACK_FLAGS; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); err = insert_vm_struct(mm, vma); - if (err) { - up_write(&mm->mmap_sem); + if (err) goto err; - } mm->stack_vm = mm->total_vm = 1; up_write(&mm->mmap_sem); - bprm->p = vma->vm_end - sizeof(void *); - return 0; - err: - if (vma) { - bprm->vma = NULL; - kmem_cache_free(vm_area_cachep, vma); - } - + up_write(&mm->mmap_sem); + bprm->vma = NULL; + kmem_cache_free(vm_area_cachep, vma); return err; } @@ -1694,7 +1686,7 @@ int get_dumpable(struct mm_struct *mm) return (ret >= 2) ? 
2 : ret; } -int do_coredump(long signr, int exit_code, struct pt_regs * regs) +void do_coredump(long signr, int exit_code, struct pt_regs *regs) { struct core_state core_state; char corename[CORENAME_MAX_SIZE + 1]; @@ -1778,6 +1770,11 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) if (ispipe) { helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc); + if (!helper_argv) { + printk(KERN_WARNING "%s failed to allocate memory\n", + __func__); + goto fail_unlock; + } /* Terminate the string before the first option */ delimit = strchr(corename, ' '); if (delimit) @@ -1845,5 +1842,5 @@ fail_unlock: put_cred(cred); coredump_finish(mm); fail: - return retval; + return; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b0537c82702..6c46c648430 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1225,11 +1225,11 @@ do { \ } while (0) #ifdef CONFIG_SMP -/* Each CPU can accumulate FBC_BATCH blocks in their local +/* Each CPU can accumulate percpu_counter_batch blocks in their local * counters. So we need to make sure we have free blocks more - * than FBC_BATCH * nr_cpu_ids. Also add a window of 4 times. + * than percpu_counter_batch * nr_cpu_ids. Also add a window of 4 times. */ -#define EXT4_FREEBLOCKS_WATERMARK (4 * (FBC_BATCH * nr_cpu_ids)) +#define EXT4_FREEBLOCKS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids)) #else #define EXT4_FREEBLOCKS_WATERMARK 0 #endif diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 6702a49992a..98d3fe7057e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2498,7 +2498,7 @@ static int ext4_nonda_switch(struct super_block *sb) /* * switch to non delalloc mode if we are running low * on free block. The free block accounting via percpu - * counters can get slightly wrong with FBC_BATCH getting + * counters can get slightly wrong with percpu_counter_batch getting * accumulated on each CPU without updating global counters * Delalloc need an accurate free block accounting. 
So switch * to non delalloc when we are near to error range. diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index d0ff0b8cf30..e5eaa62fd17 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -421,9 +421,6 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * If we're a pdlfush thread, then implement pdflush collision avoidance * against the entire list. * - * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so - * that it can be located for waiting on in __writeback_single_inode(). - * * If `bdi' is non-zero then we're being asked to writeback a specific queue. * This function assumes that the blockdev superblock's inodes are backed by * a variety of queues, so all inodes are searched. For other superblocks, @@ -443,6 +440,7 @@ void generic_sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) { const unsigned long start = jiffies; /* livelock avoidance */ + int sync = wbc->sync_mode == WB_SYNC_ALL; spin_lock(&inode_lock); if (!wbc->for_kupdate || list_empty(&sb->s_io)) @@ -499,10 +497,6 @@ void generic_sync_sb_inodes(struct super_block *sb, __iget(inode); pages_skipped = wbc->pages_skipped; __writeback_single_inode(inode, wbc); - if (wbc->sync_mode == WB_SYNC_HOLD) { - inode->dirtied_when = jiffies; - list_move(&inode->i_list, &sb->s_dirty); - } if (current_is_pdflush()) writeback_release(bdi); if (wbc->pages_skipped != pages_skipped) { @@ -523,7 +517,49 @@ void generic_sync_sb_inodes(struct super_block *sb, if (!list_empty(&sb->s_more_io)) wbc->more_io = 1; } - spin_unlock(&inode_lock); + + if (sync) { + struct inode *inode, *old_inode = NULL; + + /* + * Data integrity sync. Must wait for all pages under writeback, + * because there may have been pages dirtied before our sync + * call, but which had writeout started before we write it out. + * In which case, the inode may not be on the dirty list, but + * we still have to wait for that writeout. 
+ */ + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + struct address_space *mapping; + + if (inode->i_state & (I_FREEING|I_WILL_FREE)) + continue; + mapping = inode->i_mapping; + if (mapping->nrpages == 0) + continue; + __iget(inode); + spin_unlock(&inode_lock); + /* + * We hold a reference to 'inode' so it couldn't have + * been removed from s_inodes list while we dropped the + * inode_lock. We cannot iput the inode now as we can + * be holding the last reference and we cannot iput it + * under inode_lock. So we keep the reference and iput + * it later. + */ + iput(old_inode); + old_inode = inode; + + filemap_fdatawait(mapping); + + cond_resched(); + + spin_lock(&inode_lock); + } + spin_unlock(&inode_lock); + iput(old_inode); + } else + spin_unlock(&inode_lock); + return; /* Leave any unwritten inodes on s_io */ } EXPORT_SYMBOL_GPL(generic_sync_sb_inodes); @@ -588,8 +624,7 @@ restart: /* * writeback and wait upon the filesystem's dirty inodes. The caller will - * do this in two passes - one to write, and one to wait. WB_SYNC_HOLD is - * used to park the written inodes on sb->s_dirty for the wait pass. + * do this in two passes - one to write, and one to wait. * * A finite limit is set on the number of pages which will be written. * To prevent infinite livelock of sys_sync(). @@ -600,30 +635,21 @@ restart: void sync_inodes_sb(struct super_block *sb, int wait) { struct writeback_control wbc = { - .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_HOLD, + .sync_mode = wait ? 
WB_SYNC_ALL : WB_SYNC_NONE, .range_start = 0, .range_end = LLONG_MAX, }; - unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); - unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); - wbc.nr_to_write = nr_dirty + nr_unstable + - (inodes_stat.nr_inodes - inodes_stat.nr_unused) + - nr_dirty + nr_unstable; - wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */ - sync_sb_inodes(sb, &wbc); -} + if (!wait) { + unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); + unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); -/* - * Rather lame livelock avoidance. - */ -static void set_sb_syncing(int val) -{ - struct super_block *sb; - spin_lock(&sb_lock); - list_for_each_entry_reverse(sb, &super_blocks, s_list) - sb->s_syncing = val; - spin_unlock(&sb_lock); + wbc.nr_to_write = nr_dirty + nr_unstable + + (inodes_stat.nr_inodes - inodes_stat.nr_unused); + } else + wbc.nr_to_write = LONG_MAX; /* doesn't actually matter */ + + sync_sb_inodes(sb, &wbc); } /** @@ -652,9 +678,6 @@ static void __sync_inodes(int wait) spin_lock(&sb_lock); restart: list_for_each_entry(sb, &super_blocks, s_list) { - if (sb->s_syncing) - continue; - sb->s_syncing = 1; sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); @@ -672,13 +695,10 @@ restart: void sync_inodes(int wait) { - set_sb_syncing(0); __sync_inodes(0); - if (wait) { - set_sb_syncing(0); + if (wait) __sync_inodes(1); - } } /** diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 0ab0c6f5f43..6903d37af03 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -252,6 +252,7 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, for (;;) { struct page *page; unsigned long nr, ret; + int ra; /* nr is the maximum number of bytes to copy from this page */ nr = huge_page_size(h); @@ -274,16 +275,19 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, */ ret = len < nr ? 
len : nr; if (clear_user(buf, ret)) - ret = -EFAULT; + ra = -EFAULT; + else + ra = 0; } else { /* * We have the page, copy it to user space buffer. */ - ret = hugetlbfs_read_actor(page, offset, buf, len, nr); + ra = hugetlbfs_read_actor(page, offset, buf, len, nr); + ret = ra; } - if (ret < 0) { + if (ra < 0) { if (retval == 0) - retval = ret; + retval = ra; if (page) page_cache_release(page); goto out; diff --git a/fs/inode.c b/fs/inode.c index bd48e5e6d3e..7a6e8c2ff7b 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -110,8 +110,8 @@ static void wake_up_inode(struct inode *inode) /** * inode_init_always - perform inode structure intialisation - * @sb - superblock inode belongs to. - * @inode - inode to initialise + * @sb: superblock inode belongs to + * @inode: inode to initialise * * These are initializations that need to be done on every inode * allocation as the fields are not initialised by slab allocation. @@ -166,7 +166,7 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode) mapping->a_ops = &empty_aops; mapping->host = inode; mapping->flags = 0; - mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE); + mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); mapping->assoc_mapping = NULL; mapping->backing_dev_info = &default_backing_dev_info; mapping->writeback_index = 0; @@ -576,8 +576,8 @@ __inode_add_to_lists(struct super_block *sb, struct hlist_head *head, /** * inode_add_to_lists - add a new inode to relevant lists - * @sb - superblock inode belongs to. - * @inode - inode to mark in use + * @sb: superblock inode belongs to + * @inode: inode to mark in use * * When an inode is allocated it needs to be accounted for, added to the in use * list, the owning superblock and the inode hash. This needs to be done under @@ -601,7 +601,7 @@ EXPORT_SYMBOL_GPL(inode_add_to_lists); * @sb: superblock * * Allocates a new inode for given superblock. The default gfp_mask - * for allocations related to inode->i_mapping is GFP_HIGHUSER_PAGECACHE. 
+ * for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE. * If HIGHMEM pages are unsuitable or it is known that pages allocated * for the page cache are not reclaimable or migratable, * mapping_set_gfp_mask() must be called with suitable flags on the diff --git a/fs/minix/dir.c b/fs/minix/dir.c index f70433816a3..d4946c4c90e 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -280,7 +280,7 @@ int minix_add_link(struct dentry *dentry, struct inode *inode) return -EINVAL; got_it: - pos = (page->index >> PAGE_CACHE_SHIFT) + p - (char*)page_address(page); + pos = page_offset(page) + p - (char *)page_address(page); err = __minix_write_begin(NULL, page->mapping, pos, sbi->s_dirsize, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); if (err) diff --git a/fs/mpage.c b/fs/mpage.c index 552b80b3fac..16c3ef37eae 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -241,7 +241,6 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, first_hole = page_block; page_block++; block_in_file++; - clear_buffer_mapped(map_bh); continue; } @@ -308,7 +307,10 @@ alloc_new: goto alloc_new; } - if (buffer_boundary(map_bh) || (first_hole != blocks_per_page)) + relative_block = block_in_file - *first_logical_block; + nblocks = map_bh->b_size >> blkbits; + if ((buffer_boundary(map_bh) && relative_block == nblocks) || + (first_hole != blocks_per_page)) bio = mpage_bio_submit(READ, bio); else *last_block_in_bio = blocks[blocks_per_page - 1]; diff --git a/fs/ncpfs/getopt.c b/fs/ncpfs/getopt.c index 335b003dddf..0af3349de85 100644 --- a/fs/ncpfs/getopt.c +++ b/fs/ncpfs/getopt.c @@ -16,7 +16,6 @@ * @opts: an array of &struct option entries controlling parser operations * @optopt: output; will contain the current option * @optarg: output; will contain the value (if one exists) - * @flag: output; may be NULL; should point to a long for or'ing flags * @value: output; may be NULL; will be overwritten with the integer value * of the current argument. 
* diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 3a8bdd7f575..94063840832 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -396,7 +396,9 @@ static int show_smap(struct seq_file *m, void *v) "Private_Clean: %8lu kB\n" "Private_Dirty: %8lu kB\n" "Referenced: %8lu kB\n" - "Swap: %8lu kB\n", + "Swap: %8lu kB\n" + "KernelPageSize: %8lu kB\n" + "MMUPageSize: %8lu kB\n", (vma->vm_end - vma->vm_start) >> 10, mss.resident >> 10, (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), @@ -405,7 +407,9 @@ static int show_smap(struct seq_file *m, void *v) mss.private_clean >> 10, mss.private_dirty >> 10, mss.referenced >> 10, - mss.swap >> 10); + mss.swap >> 10, + vma_kernel_pagesize(vma) >> 10, + vma_mmu_pagesize(vma) >> 10); if (m->count < m->size) /* vma is copied successfully */ m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; diff --git a/fs/select.c b/fs/select.c index 87df51eadcf..08b91beed80 100644 --- a/fs/select.c +++ b/fs/select.c @@ -109,11 +109,11 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, void poll_initwait(struct poll_wqueues *pwq) { init_poll_funcptr(&pwq->pt, __pollwait); + pwq->polling_task = current; pwq->error = 0; pwq->table = NULL; pwq->inline_index = 0; } - EXPORT_SYMBOL(poll_initwait); static void free_poll_entry(struct poll_table_entry *entry) @@ -142,12 +142,10 @@ void poll_freewait(struct poll_wqueues *pwq) free_page((unsigned long) old); } } - EXPORT_SYMBOL(poll_freewait); -static struct poll_table_entry *poll_get_entry(poll_table *_p) +static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p) { - struct poll_wqueues *p = container_of(_p, struct poll_wqueues, pt); struct poll_table_page *table = p->table; if (p->inline_index < N_INLINE_POLL_ENTRIES) @@ -159,7 +157,6 @@ static struct poll_table_entry *poll_get_entry(poll_table *_p) new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL); if (!new_table) { p->error = -ENOMEM; - __set_current_state(TASK_RUNNING); 
return NULL; } new_table->entry = new_table->entries; @@ -171,20 +168,75 @@ static struct poll_table_entry *poll_get_entry(poll_table *_p) return table->entry++; } +static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + struct poll_wqueues *pwq = wait->private; + DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task); + + /* + * Although this function is called under waitqueue lock, LOCK + * doesn't imply write barrier and the users expect write + * barrier semantics on wakeup functions. The following + * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up() + * and is paired with set_mb() in poll_schedule_timeout. + */ + smp_wmb(); + pwq->triggered = 1; + + /* + * Perform the default wake up operation using a dummy + * waitqueue. + * + * TODO: This is hacky but there currently is no interface to + * pass in @sync. @sync is scheduled to be removed and once + * that happens, wake_up_process() can be used directly. + */ + return default_wake_function(&dummy_wait, mode, sync, key); +} + /* Add a new entry */ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p) { - struct poll_table_entry *entry = poll_get_entry(p); + struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt); + struct poll_table_entry *entry = poll_get_entry(pwq); if (!entry) return; get_file(filp); entry->filp = filp; entry->wait_address = wait_address; - init_waitqueue_entry(&entry->wait, current); + init_waitqueue_func_entry(&entry->wait, pollwake); + entry->wait.private = pwq; add_wait_queue(wait_address, &entry->wait); } +int poll_schedule_timeout(struct poll_wqueues *pwq, int state, + ktime_t *expires, unsigned long slack) +{ + int rc = -EINTR; + + set_current_state(state); + if (!pwq->triggered) + rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS); + __set_current_state(TASK_RUNNING); + + /* + * Prepare for the next iteration. + * + * The following set_mb() serves two purposes. 
First, it's + * the counterpart rmb of the wmb in pollwake() such that data + * written before wake up is always visible after wake up. + * Second, the full barrier guarantees that triggered clearing + * doesn't pass event check of the next iteration. Note that + * this problem doesn't exist for the first iteration as + * add_wait_queue() has full barrier semantics. + */ + set_mb(pwq->triggered, 0); + + return rc; +} +EXPORT_SYMBOL(poll_schedule_timeout); + /** * poll_select_set_timeout - helper function to setup the timeout value * @to: pointer to timespec variable for the final timeout @@ -340,8 +392,6 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) for (;;) { unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp; - set_current_state(TASK_INTERRUPTIBLE); - inp = fds->in; outp = fds->out; exp = fds->ex; rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex; @@ -411,10 +461,10 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) to = &expire; } - if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) + if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE, + to, slack)) timed_out = 1; } - __set_current_state(TASK_RUNNING); poll_freewait(&table); @@ -666,7 +716,6 @@ static int do_poll(unsigned int nfds, struct poll_list *list, for (;;) { struct poll_list *walk; - set_current_state(TASK_INTERRUPTIBLE); for (walk = list; walk != NULL; walk = walk->next) { struct pollfd * pfd, * pfd_end; @@ -709,10 +758,9 @@ static int do_poll(unsigned int nfds, struct poll_list *list, to = &expire; } - if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) + if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack)) timed_out = 1; } - __set_current_state(TASK_RUNNING); return count; } diff --git a/fs/sync.c b/fs/sync.c index 0921d6d4b5e..ac02b56548b 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -295,7 +295,7 @@ int do_sync_mapping_range(struct address_space *mapping, loff_t offset, if (flags & SYNC_FILE_RANGE_WRITE) { ret = 
__filemap_fdatawrite_range(mapping, offset, endbyte, - WB_SYNC_NONE); + WB_SYNC_ALL); if (ret < 0) goto out; } diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 0d7564b95f8..89556ee7251 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -432,12 +432,19 @@ static int ubifs_sync_fs(struct super_block *sb, int wait) int i, err; struct ubifs_info *c = sb->s_fs_info; struct writeback_control wbc = { - .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_HOLD, + .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, .range_start = 0, .range_end = LLONG_MAX, .nr_to_write = LONG_MAX, }; + /* + * Note by akpm about WB_SYNC_NONE used above: zero @wait is just an + * advisory thing to help the file system shove lots of data into the + * queues. If some gets missed then it'll be picked up on the second + * '->sync_fs()' call, with non-zero @wait. + */ + if (sb->s_flags & MS_RDONLY) return 0; diff --git a/include/asm-frv/atomic.h b/include/asm-frv/atomic.h index 46d696b331e..296c35cfb20 100644 --- a/include/asm-frv/atomic.h +++ b/include/asm-frv/atomic.h @@ -35,10 +35,6 @@ #define smp_mb__before_atomic_inc() barrier() #define smp_mb__after_atomic_inc() barrier() -typedef struct { - int counter; -} atomic_t; - #define ATOMIC_INIT(i) { (i) } #define atomic_read(v) ((v)->counter) #define atomic_set(v, i) (((v)->counter) = (i)) diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 8af276361bf..37b82cb96c8 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -28,6 +28,17 @@ struct bug_entry { #define BUGFLAG_WARNING (1<<0) #endif /* CONFIG_GENERIC_BUG */ +/* + * Don't use BUG() or BUG_ON() unless there's really no way out; one + * example might be detecting data structure corruption in the middle + * of an operation that can't be backed out of. If the (sub)system + * can somehow continue operating, perhaps with reduced functionality, + * it's probably not BUG-worthy. 
+ * + * If you're tempted to BUG(), think again: is completely giving up + * really the *only* solution? There are usually better options, where + * users don't need to reboot ASAP and can mostly shut down cleanly. + */ #ifndef HAVE_ARCH_BUG #define BUG() do { \ printk("BUG: failure at %s:%d/%s()!\n", __FILE__, __LINE__, __func__); \ @@ -39,6 +50,12 @@ struct bug_entry { #define BUG_ON(condition) do { if (unlikely(condition)) BUG(); } while(0) #endif +/* + * WARN(), WARN_ON(), WARN_ON_ONCE, and so on can be used to report + * significant issues that need prompt attention if they should ever + * appear at runtime. Use the versions with printk format strings + * to provide better diagnostics. + */ #ifndef __WARN #ifndef __ASSEMBLY__ extern void warn_slowpath(const char *file, const int line, diff --git a/include/asm-generic/local.h b/include/asm-generic/local.h index 33d7d04e411..dbd6150763e 100644 --- a/include/asm-generic/local.h +++ b/include/asm-generic/local.h @@ -2,7 +2,6 @@ #define _ASM_GENERIC_LOCAL_H #include <linux/percpu.h> -#include <linux/hardirq.h> #include <asm/atomic.h> #include <asm/types.h> diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h index 36fa286adad..4c8d0afae71 100644 --- a/include/asm-generic/memory_model.h +++ b/include/asm-generic/memory_model.h @@ -69,15 +69,8 @@ }) #endif /* CONFIG_FLATMEM/DISCONTIGMEM/SPARSEMEM */ -#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE -struct page; -/* this is useful when inlined pfn_to_page is too big */ -extern struct page *pfn_to_page(unsigned long pfn); -extern unsigned long page_to_pfn(struct page *page); -#else #define page_to_pfn __page_to_pfn #define pfn_to_page __pfn_to_page -#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ #endif /* __ASSEMBLY__ */ diff --git a/include/asm-m32r/atomic.h b/include/asm-m32r/atomic.h index 3a38ffe4a4f..2eed30f8408 100644 --- a/include/asm-m32r/atomic.h +++ b/include/asm-m32r/atomic.h @@ -9,6 +9,7 @@ * Copyright (C) 2004 Hirokazu Takata <takata at 
linux-m32r.org> */ +#include <linux/types.h> #include <asm/assembler.h> #include <asm/system.h> @@ -17,13 +18,6 @@ * resource counting etc.. */ -/* - * Make sure gcc doesn't try to be clever and move things around - * on us. We need to use _exactly_ the address the user gave us, - * not some alias that contains the same information. - */ -typedef struct { volatile int counter; } atomic_t; - #define ATOMIC_INIT(i) { (i) } /** diff --git a/include/asm-m68k/atomic.h b/include/asm-m68k/atomic.h index 4915294fea6..eb0ab9d4ee7 100644 --- a/include/asm-m68k/atomic.h +++ b/include/asm-m68k/atomic.h @@ -1,7 +1,7 @@ #ifndef __ARCH_M68K_ATOMIC__ #define __ARCH_M68K_ATOMIC__ - +#include <linux/types.h> #include <asm/system.h> /* @@ -13,7 +13,6 @@ * We do not have SMP m68k systems, so we don't have to deal with that. */ -typedef struct { int counter; } atomic_t; #define ATOMIC_INIT(i) { (i) } #define atomic_read(v) ((v)->counter) diff --git a/include/asm-mn10300/atomic.h b/include/asm-mn10300/atomic.h index 27c9690b957..bc064825f9b 100644 --- a/include/asm-mn10300/atomic.h +++ b/include/asm-mn10300/atomic.h @@ -20,15 +20,6 @@ * resource counting etc.. */ -/* - * Make sure gcc doesn't try to be clever and move things around - * on us. We need to use _exactly_ the address the user gave us, - * not some alias that contains the same information. 
- */ -typedef struct { - int counter; -} atomic_t; - #define ATOMIC_INIT(i) { (i) } #ifdef __KERNEL__ diff --git a/include/asm-xtensa/atomic.h b/include/asm-xtensa/atomic.h index b3b23540f14..67ad67bed8c 100644 --- a/include/asm-xtensa/atomic.h +++ b/include/asm-xtensa/atomic.h @@ -14,8 +14,7 @@ #define _XTENSA_ATOMIC_H #include <linux/stringify.h> - -typedef struct { volatile int counter; } atomic_t; +#include <linux/types.h> #ifdef __KERNEL__ #include <asm/processor.h> diff --git a/include/linux/auto_dev-ioctl.h b/include/linux/auto_dev-ioctl.h index f4d05ccd731..91a773993a5 100644 --- a/include/linux/auto_dev-ioctl.h +++ b/include/linux/auto_dev-ioctl.h @@ -10,6 +10,7 @@ #ifndef _LINUX_AUTO_DEV_IOCTL_H #define _LINUX_AUTO_DEV_IOCTL_H +#include <linux/string.h> #include <linux/types.h> #define AUTOFS_DEVICE_NAME "autofs" @@ -25,6 +26,60 @@ * An ioctl interface for autofs mount point control. */ +struct args_protover { + __u32 version; +}; + +struct args_protosubver { + __u32 sub_version; +}; + +struct args_openmount { + __u32 devid; +}; + +struct args_ready { + __u32 token; +}; + +struct args_fail { + __u32 token; + __s32 status; +}; + +struct args_setpipefd { + __s32 pipefd; +}; + +struct args_timeout { + __u64 timeout; +}; + +struct args_requester { + __u32 uid; + __u32 gid; +}; + +struct args_expire { + __u32 how; +}; + +struct args_askumount { + __u32 may_umount; +}; + +struct args_ismountpoint { + union { + struct args_in { + __u32 type; + } in; + struct args_out { + __u32 devid; + __u32 magic; + } out; + }; +}; + /* * All the ioctls use this structure. 
* When sending a path size must account for the total length @@ -39,20 +94,32 @@ struct autofs_dev_ioctl { * including this struct */ __s32 ioctlfd; /* automount command fd */ - __u32 arg1; /* Command parameters */ - __u32 arg2; + /* Command parameters */ + + union { + struct args_protover protover; + struct args_protosubver protosubver; + struct args_openmount openmount; + struct args_ready ready; + struct args_fail fail; + struct args_setpipefd setpipefd; + struct args_timeout timeout; + struct args_requester requester; + struct args_expire expire; + struct args_askumount askumount; + struct args_ismountpoint ismountpoint; + }; char path[0]; }; static inline void init_autofs_dev_ioctl(struct autofs_dev_ioctl *in) { + memset(in, 0, sizeof(struct autofs_dev_ioctl)); in->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR; in->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR; in->size = sizeof(struct autofs_dev_ioctl); in->ioctlfd = -1; - in->arg1 = 0; - in->arg2 = 0; return; } diff --git a/include/linux/auto_fs4.h b/include/linux/auto_fs4.h index 2253716d4b9..55fa478bd63 100644 --- a/include/linux/auto_fs4.h +++ b/include/linux/auto_fs4.h @@ -29,10 +29,64 @@ #define AUTOFS_EXP_IMMEDIATE 1 #define AUTOFS_EXP_LEAVES 2 -#define AUTOFS_TYPE_ANY 0x0000 -#define AUTOFS_TYPE_INDIRECT 0x0001 -#define AUTOFS_TYPE_DIRECT 0x0002 -#define AUTOFS_TYPE_OFFSET 0x0004 +#define AUTOFS_TYPE_ANY 0U +#define AUTOFS_TYPE_INDIRECT 1U +#define AUTOFS_TYPE_DIRECT 2U +#define AUTOFS_TYPE_OFFSET 4U + +static inline void set_autofs_type_indirect(unsigned int *type) +{ + *type = AUTOFS_TYPE_INDIRECT; + return; +} + +static inline unsigned int autofs_type_indirect(unsigned int type) +{ + return (type == AUTOFS_TYPE_INDIRECT); +} + +static inline void set_autofs_type_direct(unsigned int *type) +{ + *type = AUTOFS_TYPE_DIRECT; + return; +} + +static inline unsigned int autofs_type_direct(unsigned int type) +{ + return (type == AUTOFS_TYPE_DIRECT); +} + +static inline void set_autofs_type_offset(unsigned int 
*type) +{ + *type = AUTOFS_TYPE_OFFSET; + return; +} + +static inline unsigned int autofs_type_offset(unsigned int type) +{ + return (type == AUTOFS_TYPE_OFFSET); +} + +static inline unsigned int autofs_type_trigger(unsigned int type) +{ + return (type == AUTOFS_TYPE_DIRECT || type == AUTOFS_TYPE_OFFSET); +} + +/* + * This isn't really a type as we use it to say "no type set" to + * indicate we want to search for "any" mount in the + * autofs_dev_ioctl_ismountpoint() device ioctl function. + */ +static inline void set_autofs_type_any(unsigned int *type) +{ + *type = AUTOFS_TYPE_ANY; + return; +} + +static inline unsigned int autofs_type_any(unsigned int type) +{ + return (type == AUTOFS_TYPE_ANY); +} /* Daemon notification packet types */ enum autofs_notify { diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 6cbfbe29718..77b4a9e4600 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -18,6 +18,7 @@ struct pt_regs; #define BINPRM_BUF_SIZE 128 #ifdef __KERNEL__ +#include <linux/list.h> #define CORENAME_MAX_SIZE 128 @@ -106,7 +107,7 @@ extern int setup_arg_pages(struct linux_binprm * bprm, extern int bprm_mm_init(struct linux_binprm *bprm); extern int copy_strings_kernel(int argc,char ** argv,struct linux_binprm *bprm); extern void install_exec_creds(struct linux_binprm *bprm); -extern int do_coredump(long signr, int exit_code, struct pt_regs * regs); +extern void do_coredump(long signr, int exit_code, struct pt_regs *regs); extern int set_binfmt(struct linux_binfmt *new); extern void free_bprm(struct linux_binprm *); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 1164963c3a8..08b78c09b09 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -329,13 +329,7 @@ struct cgroup_subsys { struct cgroup *cgrp); void (*post_clone)(struct cgroup_subsys *ss, struct cgroup *cgrp); void (*bind)(struct cgroup_subsys *ss, struct cgroup *root); - /* - * This routine is called with the task_lock of mm->owner held - 
*/ - void (*mm_owner_changed)(struct cgroup_subsys *ss, - struct cgroup *old, - struct cgroup *new, - struct task_struct *p); + int subsys_id; int active; int disabled; @@ -400,9 +394,6 @@ void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it); int cgroup_scan_tasks(struct cgroup_scanner *scan); int cgroup_attach_task(struct cgroup *, struct task_struct *); -void cgroup_mm_owner_callbacks(struct task_struct *old, - struct task_struct *new); - #else /* !CONFIG_CGROUPS */ static inline int cgroup_init_early(void) { return 0; } @@ -420,9 +411,6 @@ static inline int cgroupstats_build(struct cgroupstats *stats, return -EINVAL; } -static inline void cgroup_mm_owner_callbacks(struct task_struct *old, - struct task_struct *new) {} - #endif /* !CONFIG_CGROUPS */ #endif /* _LINUX_CGROUP_H */ diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 8e540d32c9f..51ea2bdea0f 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -78,6 +78,8 @@ extern int current_cpuset_is_being_rebound(void); extern void rebuild_sched_domains(void); +extern void cpuset_print_task_mems_allowed(struct task_struct *p); + #else /* !CONFIG_CPUSETS */ static inline int cpuset_init_early(void) { return 0; } @@ -159,6 +161,10 @@ static inline void rebuild_sched_domains(void) partition_sched_domains(1, NULL, NULL); } +static inline void cpuset_print_task_mems_allowed(struct task_struct *p) +{ +} + #endif /* !CONFIG_CPUSETS */ #endif /* _LINUX_CPUSET_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index fb59673c60b..d7eba77f666 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1133,7 +1133,6 @@ struct super_block { struct rw_semaphore s_umount; struct mutex s_lock; int s_count; - int s_syncing; int s_need_sync_fs; atomic_t s_active; #ifdef CONFIG_SECURITY diff --git a/include/linux/gfp.h b/include/linux/gfp.h index e8003afeffb..dd20cd78faa 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -69,12 +69,6 @@ struct vm_area_struct; #define 
GFP_HIGHUSER_MOVABLE (__GFP_WAIT | __GFP_IO | __GFP_FS | \ __GFP_HARDWALL | __GFP_HIGHMEM | \ __GFP_MOVABLE) -#define GFP_NOFS_PAGECACHE (__GFP_WAIT | __GFP_IO | __GFP_MOVABLE) -#define GFP_USER_PAGECACHE (__GFP_WAIT | __GFP_IO | __GFP_FS | \ - __GFP_HARDWALL | __GFP_MOVABLE) -#define GFP_HIGHUSER_PAGECACHE (__GFP_WAIT | __GFP_IO | __GFP_FS | \ - __GFP_HARDWALL | __GFP_HIGHMEM | \ - __GFP_MOVABLE) #ifdef CONFIG_NUMA #define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index e1c8afc002c..f1d2fba19ea 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -233,6 +233,10 @@ static inline unsigned long huge_page_size(struct hstate *h) return (unsigned long)PAGE_SIZE << h->order; } +extern unsigned long vma_kernel_pagesize(struct vm_area_struct *vma); + +extern unsigned long vma_mmu_pagesize(struct vm_area_struct *vma); + static inline unsigned long huge_page_mask(struct hstate *h) { return h->mask; @@ -273,6 +277,8 @@ struct hstate {}; #define hstate_inode(i) NULL #define huge_page_size(h) PAGE_SIZE #define huge_page_mask(h) PAGE_MASK +#define vma_kernel_pagesize(v) PAGE_SIZE +#define vma_mmu_pagesize(v) PAGE_SIZE #define huge_page_order(h) 0 #define huge_page_shift(h) PAGE_SHIFT static inline unsigned int pages_per_huge_page(struct hstate *h) diff --git a/include/linux/i2c/twl4030.h b/include/linux/i2c/twl4030.h index a8f84c01f82..8137f660a5c 100644 --- a/include/linux/i2c/twl4030.h +++ b/include/linux/i2c/twl4030.h @@ -234,6 +234,9 @@ struct twl4030_gpio_platform_data { /* gpio-n should control VMMC(n+1) if BIT(n) in mmc_cd is set */ u8 mmc_cd; + /* if BIT(N) is set, or VMMC(n+1) is linked, debounce GPIO-N */ + u32 debounce; + /* For gpio-N, bit (1 << N) in "pullups" is set if that pullup * should be enabled. Else, if that bit is set in "pulldowns", * that pulldown is enabled. 
Don't waste power by letting any @@ -307,12 +310,6 @@ int twl4030_sih_setup(int module); #define TWL4030_VAUX3_DEV_GRP 0x1F #define TWL4030_VAUX3_DEDICATED 0x22 -/* - * Exported TWL4030 GPIO APIs - * - * WARNING -- use standard GPIO and IRQ calls instead; these will vanish. - */ -int twl4030_set_gpio_debounce(int gpio, int enable); #if defined(CONFIG_TWL4030_BCI_BATTERY) || \ defined(CONFIG_TWL4030_BCI_BATTERY_MODULE) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 0702c4d7bdf..af886b26c9d 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -14,7 +14,6 @@ #include <linux/irqflags.h> #include <linux/smp.h> #include <linux/percpu.h> -#include <linux/irqnr.h> #include <asm/atomic.h> #include <asm/ptrace.h> diff --git a/include/linux/kernel.h b/include/linux/kernel.h index ca9ff6411df..721984844c9 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -48,6 +48,12 @@ extern const char linux_proc_banner[]; #define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f)) #define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) #define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y)) +#define DIV_ROUND_CLOSEST(x, divisor)( \ +{ \ + typeof(divisor) __divisor = divisor; \ + (((x) + ((__divisor) / 2)) / (__divisor)); \ +} \ +) #define _RET_IP_ (unsigned long)__builtin_return_address(0) #define _THIS_IP_ ({ __label__ __here; __here: (unsigned long)&&__here; }) diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 497b1d1f7a0..d6ea19e314b 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -69,9 +69,6 @@ struct kprobe { /* list of kprobes for multi-handler support */ struct list_head list; - /* Indicates that the corresponding module has been ref counted */ - unsigned int mod_refcounted; - /*count the number of times this probe was temporarily disarmed */ unsigned long nmissed; @@ -103,8 +100,19 @@ struct kprobe { /* copy of the original instruction */ struct arch_specific_insn ainsn; + + /* Indicates various 
status flags. Protected by kprobe_mutex. */ + u32 flags; }; +/* Kprobe status flags */ +#define KPROBE_FLAG_GONE 1 /* breakpoint has already gone */ + +static inline int kprobe_gone(struct kprobe *p) +{ + return p->flags & KPROBE_FLAG_GONE; +} + /* * Special probe type that uses setjmp-longjmp type tricks to resume * execution at a specified entry with a matching prototype corresponding @@ -201,7 +209,6 @@ static inline int init_test_probes(void) } #endif /* CONFIG_KPROBES_SANITY_TEST */ -extern struct mutex kprobe_mutex; extern int arch_prepare_kprobe(struct kprobe *p); extern void arch_arm_kprobe(struct kprobe *p); extern void arch_disarm_kprobe(struct kprobe *p); diff --git a/include/linux/memory.h b/include/linux/memory.h index 36c82c9e6ea..3fdc10806d3 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -79,14 +79,14 @@ static inline int memory_notify(unsigned long val, void *v) #else extern int register_memory_notifier(struct notifier_block *nb); extern void unregister_memory_notifier(struct notifier_block *nb); -extern int register_new_memory(struct mem_section *); +extern int register_new_memory(int, struct mem_section *); extern int unregister_memory_section(struct mem_section *); extern int memory_dev_init(void); extern int remove_memory_block(unsigned long, struct mem_section *, int); extern int memory_notify(unsigned long val, void *v); +extern struct memory_block *find_memory_block(struct mem_section *); #define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION<<PAGE_SHIFT) - - +enum mem_add_context { BOOT, HOTPLUG }; #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ #ifdef CONFIG_MEMORY_HOTPLUG diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 763ba81fc0f..d95f72e79b8 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -72,7 +72,7 @@ extern void __offline_isolated_pages(unsigned long, unsigned long); extern int offline_pages(unsigned long, unsigned long, unsigned long); /* reasonably 
generic interface to expand the physical pages in a zone */ -extern int __add_pages(struct zone *zone, unsigned long start_pfn, +extern int __add_pages(int nid, struct zone *zone, unsigned long start_pfn, unsigned long nr_pages); extern int __remove_pages(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages); diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 3f34005068d..527602cdea1 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -7,6 +7,8 @@ typedef struct page *new_page_t(struct page *, unsigned long private, int **); #ifdef CONFIG_MIGRATION +#define PAGE_MIGRATION 1 + extern int putback_lru_pages(struct list_head *l); extern int migrate_page(struct address_space *, struct page *, struct page *); @@ -20,6 +22,8 @@ extern int migrate_vmas(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, unsigned long flags); #else +#define PAGE_MIGRATION 0 + static inline int putback_lru_pages(struct list_head *l) { return 0; } static inline int migrate_pages(struct list_head *l, new_page_t x, unsigned long private) { return -ENOSYS; } diff --git a/include/linux/mm.h b/include/linux/mm.h index aaa8b843be2..4a3d28c8644 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -717,6 +717,11 @@ static inline int page_mapped(struct page *page) #define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS) +/* + * Can be called by the pagefault handler when it gets a VM_FAULT_OOM. + */ +extern void pagefault_out_of_memory(void); + #define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) extern void show_free_areas(void); diff --git a/include/linux/module.h b/include/linux/module.h index 3bfed013350..4f7ea12463d 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -294,9 +294,6 @@ struct module /* The size of the executable code in each section. */ unsigned int init_text_size, core_text_size; - /* The handle returned from unwind_add_table. 
*/ - void *unwind_info; - /* Arch-specific module values */ struct mod_arch_specific arch; @@ -368,6 +365,18 @@ struct module *module_text_address(unsigned long addr); struct module *__module_text_address(unsigned long addr); int is_module_address(unsigned long addr); +static inline int within_module_core(unsigned long addr, struct module *mod) +{ + return (unsigned long)mod->module_core <= addr && + addr < (unsigned long)mod->module_core + mod->core_size; +} + +static inline int within_module_init(unsigned long addr, struct module *mod) +{ + return (unsigned long)mod->module_init <= addr && + addr < (unsigned long)mod->module_init + mod->init_size; +} + /* Returns 0 and fills in value, defined and namebuf, or -ERANGE if symnum out of range. */ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, diff --git a/include/linux/node.h b/include/linux/node.h index bc001bc225c..681a697b9a8 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -26,6 +26,7 @@ struct node { struct sys_device sysdev; }; +struct memory_block; extern struct node node_devices[]; extern int register_node(struct node *, int, struct node *); @@ -35,6 +36,9 @@ extern int register_one_node(int nid); extern void unregister_one_node(int nid); extern int register_cpu_under_node(unsigned int cpu, unsigned int nid); extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid); +extern int register_mem_sect_under_node(struct memory_block *mem_blk, + int nid); +extern int unregister_mem_sect_under_nodes(struct memory_block *mem_blk); #else static inline int register_one_node(int nid) { @@ -52,6 +56,15 @@ static inline int unregister_cpu_under_node(unsigned int cpu, unsigned int nid) { return 0; } +static inline int register_mem_sect_under_node(struct memory_block *mem_blk, + int nid) +{ + return 0; +} +static inline int unregister_mem_sect_under_nodes(struct memory_block *mem_blk) +{ + return 0; +} #endif #define to_node(sys_device) container_of(sys_device, 
struct node, sysdev) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index b12f93a3c34..219a523ecdb 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -228,6 +228,7 @@ PAGEFLAG_FALSE(HighMem) PAGEFLAG(SwapCache, swapcache) #else PAGEFLAG_FALSE(SwapCache) + SETPAGEFLAG_NOOP(SwapCache) CLEARPAGEFLAG_NOOP(SwapCache) #endif #ifdef CONFIG_UNEVICTABLE_LRU @@ -372,31 +373,22 @@ static inline void __ClearPageTail(struct page *page) #define __PG_MLOCKED 0 #endif -#define PAGE_FLAGS (1 << PG_lru | 1 << PG_private | 1 << PG_locked | \ - 1 << PG_buddy | 1 << PG_writeback | \ - 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ - __PG_UNEVICTABLE | __PG_MLOCKED) - -/* - * Flags checked in bad_page(). Pages on the free list should not have - * these flags set. It they are, there is a problem. - */ -#define PAGE_FLAGS_CLEAR_WHEN_BAD (PAGE_FLAGS | \ - 1 << PG_reclaim | 1 << PG_dirty | 1 << PG_swapbacked) - /* * Flags checked when a page is freed. Pages being freed should not have * these flags set. It they are, there is a problem. */ -#define PAGE_FLAGS_CHECK_AT_FREE (PAGE_FLAGS | 1 << PG_reserved) +#define PAGE_FLAGS_CHECK_AT_FREE \ + (1 << PG_lru | 1 << PG_private | 1 << PG_locked | \ + 1 << PG_buddy | 1 << PG_writeback | 1 << PG_reserved | \ + 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ + __PG_UNEVICTABLE | __PG_MLOCKED) /* * Flags checked when a page is prepped for return by the page allocator. - * Pages being prepped should not have these flags set. It they are, there - * is a problem. + * Pages being prepped should not have any flags set. It they are set, + * there has been a kernel bug or struct page corruption. 
*/ -#define PAGE_FLAGS_CHECK_AT_PREP (PAGE_FLAGS | \ - 1 << PG_reserved | 1 << PG_dirty | 1 << PG_swapbacked) +#define PAGE_FLAGS_CHECK_AT_PREP ((1 << NR_PAGEFLAGS) - 1) #endif /* !__GENERATING_BOUNDS_H */ #endif /* PAGE_FLAGS_H */ diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index e90a2cb0291..7b2886fa7fd 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -21,7 +21,6 @@ struct pagevec { }; void __pagevec_release(struct pagevec *pvec); -void __pagevec_release_nonlru(struct pagevec *pvec); void __pagevec_free(struct pagevec *pvec); void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru); void pagevec_strip(struct pagevec *pvec); @@ -69,12 +68,6 @@ static inline void pagevec_release(struct pagevec *pvec) __pagevec_release(pvec); } -static inline void pagevec_release_nonlru(struct pagevec *pvec) -{ - if (pagevec_count(pvec)) - __pagevec_release_nonlru(pvec); -} - static inline void pagevec_free(struct pagevec *pvec) { if (pagevec_count(pvec)) diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index 9007ccdfc11..99de7a31bab 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -24,11 +24,7 @@ struct percpu_counter { s32 *counters; }; -#if NR_CPUS >= 16 -#define FBC_BATCH (NR_CPUS*2) -#else -#define FBC_BATCH (NR_CPUS*4) -#endif +extern int percpu_counter_batch; int percpu_counter_init(struct percpu_counter *fbc, s64 amount); int percpu_counter_init_irq(struct percpu_counter *fbc, s64 amount); @@ -39,7 +35,7 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc); static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount) { - __percpu_counter_add(fbc, amount, FBC_BATCH); + __percpu_counter_add(fbc, amount, percpu_counter_batch); } static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc) diff --git a/include/linux/poll.h b/include/linux/poll.h index badd98ab06f..8c24ef8d997 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h 
@@ -46,9 +46,9 @@ static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc) } struct poll_table_entry { - struct file * filp; + struct file *filp; wait_queue_t wait; - wait_queue_head_t * wait_address; + wait_queue_head_t *wait_address; }; /* @@ -56,7 +56,9 @@ struct poll_table_entry { */ struct poll_wqueues { poll_table pt; - struct poll_table_page * table; + struct poll_table_page *table; + struct task_struct *polling_task; + int triggered; int error; int inline_index; struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES]; @@ -64,6 +66,13 @@ struct poll_wqueues { extern void poll_initwait(struct poll_wqueues *pwq); extern void poll_freewait(struct poll_wqueues *pwq); +extern int poll_schedule_timeout(struct poll_wqueues *pwq, int state, + ktime_t *expires, unsigned long slack); + +static inline int poll_schedule(struct poll_wqueues *pwq, int state) +{ + return poll_schedule_timeout(pwq, state, NULL, 0); +} /* * Scaleable version of the fd_set. diff --git a/include/linux/rio_drv.h b/include/linux/rio_drv.h index 32c0547ffaf..c93a58a4003 100644 --- a/include/linux/rio_drv.h +++ b/include/linux/rio_drv.h @@ -391,7 +391,6 @@ static inline int rio_add_inb_buffer(struct rio_mport *mport, int mbox, * rio_get_inb_message - Get A RIO message from an inbound mailbox queue * @mport: Master port containing the inbound mailbox * @mbox: The inbound mailbox number - * @buffer: Pointer to the message buffer * * Get a RIO message from an inbound mailbox queue. Returns 0 on success. 
*/ diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 89f0564b10c..b35bc0e19cd 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -63,16 +63,13 @@ void anon_vma_unlink(struct vm_area_struct *); void anon_vma_link(struct vm_area_struct *); void __anon_vma_link(struct vm_area_struct *); -extern struct anon_vma *page_lock_anon_vma(struct page *page); -extern void page_unlock_anon_vma(struct anon_vma *anon_vma); - /* * rmap interfaces called when adding or removing pte of page */ void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); void page_add_file_rmap(struct page *); -void page_remove_rmap(struct page *, struct vm_area_struct *); +void page_remove_rmap(struct page *); #ifdef CONFIG_DEBUG_VM void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address); diff --git a/include/linux/sched.h b/include/linux/sched.h index 38a3f4b1539..ea415136ac9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -386,6 +386,9 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); (mm)->hiwater_vm = (mm)->total_vm; \ } while (0) +#define get_mm_hiwater_rss(mm) max((mm)->hiwater_rss, get_mm_rss(mm)) +#define get_mm_hiwater_vm(mm) max((mm)->hiwater_vm, (mm)->total_vm) + extern void set_dumpable(struct mm_struct *mm, int value); extern int get_dumpable(struct mm_struct *mm); diff --git a/include/linux/spi/spi_gpio.h b/include/linux/spi/spi_gpio.h new file mode 100644 index 00000000000..0f01a0f1f40 --- /dev/null +++ b/include/linux/spi/spi_gpio.h @@ -0,0 +1,60 @@ +#ifndef __LINUX_SPI_GPIO_H +#define __LINUX_SPI_GPIO_H + +/* + * For each bitbanged SPI bus, set up a platform_device node with: + * - name "spi_gpio" + * - id the same as the SPI bus number it implements + * - dev.platform data pointing to a struct spi_gpio_platform_data + * + * Or, see the driver code for information about speedups 
that are + * possible on platforms that support inlined access for GPIOs (no + * spi_gpio_platform_data is used). + * + * Use spi_board_info with these busses in the usual way, being sure + * that the controller_data being the GPIO used for each device's + * chipselect: + * + * static struct spi_board_info ... [] = { + * ... + * // this slave uses GPIO 42 for its chipselect + * .controller_data = (void *) 42, + * ... + * // this one uses GPIO 86 for its chipselect + * .controller_data = (void *) 86, + * ... + * }; + * + * If the bitbanged bus is later switched to a "native" controller, + * that platform_device and controller_data should be removed. + */ + +/** + * struct spi_gpio_platform_data - parameter for bitbanged SPI master + * @sck: number of the GPIO used for clock output + * @mosi: number of the GPIO used for Master Output, Slave In (MOSI) data + * @miso: number of the GPIO used for Master Input, Slave Output (MISO) data + * @num_chipselect: how many slaves to allow + * + * All GPIO signals used with the SPI bus managed through this driver + * (chipselects, MOSI, MISO, SCK) must be configured as GPIOs, instead + * of some alternate function. + * + * It can be convenient to use this driver with pins that have alternate + * functions associated with a "native" SPI controller if a driver for that + * controller is not available, or is missing important functionality. + * + * On platforms which can do so, configure MISO with a weak pullup unless + * there's an external pullup on that signal. That saves power by avoiding + * floating signals. (A weak pulldown would save power too, but many + * drivers expect to see all-ones data as the no slave "response".) 
+ */ +struct spi_gpio_platform_data { + unsigned sck; + unsigned mosi; + unsigned miso; + + u16 num_chipselect; +}; + +#endif /* __LINUX_SPI_GPIO_H */ diff --git a/include/linux/swap.h b/include/linux/swap.h index a3af95b2cb6..91dee50fe26 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -120,7 +120,9 @@ struct swap_extent { enum { SWP_USED = (1 << 0), /* is slot in swap_info[] used? */ SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */ - SWP_ACTIVE = (SWP_USED | SWP_WRITEOK), + SWP_DISCARDABLE = (1 << 2), /* blkdev supports discard */ + SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */ + SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */ /* add others here before... */ SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */ }; @@ -134,22 +136,24 @@ enum { * The in-memory structure used to track swap areas. */ struct swap_info_struct { - unsigned int flags; + unsigned long flags; int prio; /* swap priority */ + int next; /* next entry on swap list */ struct file *swap_file; struct block_device *bdev; struct list_head extent_list; struct swap_extent *curr_swap_extent; - unsigned old_block_size; - unsigned short * swap_map; + unsigned short *swap_map; unsigned int lowest_bit; unsigned int highest_bit; + unsigned int lowest_alloc; /* while preparing discard cluster */ + unsigned int highest_alloc; /* while preparing discard cluster */ unsigned int cluster_next; unsigned int cluster_nr; unsigned int pages; unsigned int max; unsigned int inuse_pages; - int next; /* next entry on swap list */ + unsigned int old_block_size; }; struct swap_list_t { @@ -163,7 +167,6 @@ struct swap_list_t { /* linux/mm/page_alloc.c */ extern unsigned long totalram_pages; extern unsigned long totalreserve_pages; -extern long nr_swap_pages; extern unsigned int nr_free_buffer_pages(void); extern unsigned int nr_free_pagecache_pages(void); @@ -174,8 +177,6 @@ extern unsigned int nr_free_pagecache_pages(void); /* linux/mm/swap.c */ extern void 
__lru_cache_add(struct page *, enum lru_list lru); extern void lru_cache_add_lru(struct page *, enum lru_list lru); -extern void lru_cache_add_active_or_unevictable(struct page *, - struct vm_area_struct *); extern void activate_page(struct page *); extern void mark_page_accessed(struct page *); extern void lru_add_drain(void); @@ -280,7 +281,7 @@ extern void end_swap_bio_read(struct bio *bio, int err); extern struct address_space swapper_space; #define total_swapcache_pages swapper_space.nrpages extern void show_swap_cache_info(void); -extern int add_to_swap(struct page *, gfp_t); +extern int add_to_swap(struct page *); extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t); extern void __delete_from_swap_cache(struct page *); extern void delete_from_swap_cache(struct page *); @@ -293,6 +294,7 @@ extern struct page *swapin_readahead(swp_entry_t, gfp_t, struct vm_area_struct *vma, unsigned long addr); /* linux/mm/swapfile.c */ +extern long nr_swap_pages; extern long total_swap_pages; extern void si_swapinfo(struct sysinfo *); extern swp_entry_t get_swap_page(void); @@ -300,15 +302,14 @@ extern swp_entry_t get_swap_page_of_type(int); extern int swap_duplicate(swp_entry_t); extern int valid_swaphandles(swp_entry_t, unsigned long *); extern void swap_free(swp_entry_t); -extern void free_swap_and_cache(swp_entry_t); +extern int free_swap_and_cache(swp_entry_t); extern int swap_type_of(dev_t, sector_t, struct block_device **); extern unsigned int count_swap_pages(int, int); extern sector_t map_swap_page(struct swap_info_struct *, pgoff_t); extern sector_t swapdev_block(int, pgoff_t); extern struct swap_info_struct *get_swap_info_struct(unsigned); -extern int can_share_swap_page(struct page *); -extern int remove_exclusive_swap_page(struct page *); -extern int remove_exclusive_swap_page_ref(struct page *); +extern int reuse_swap_page(struct page *); +extern int try_to_free_swap(struct page *); struct backing_dev_info; /* linux/mm/thrash.c */ @@ -334,7 +335,8 @@ 
static inline void disable_swap_token(void) #else /* CONFIG_SWAP */ -#define total_swap_pages 0 +#define nr_swap_pages 0L +#define total_swap_pages 0L #define total_swapcache_pages 0UL #define si_swapinfo(val) \ @@ -350,14 +352,8 @@ static inline void show_swap_cache_info(void) { } -static inline void free_swap_and_cache(swp_entry_t swp) -{ -} - -static inline int swap_duplicate(swp_entry_t swp) -{ - return 0; -} +#define free_swap_and_cache(swp) is_migration_entry(swp) +#define swap_duplicate(swp) is_migration_entry(swp) static inline void swap_free(swp_entry_t swp) { @@ -374,7 +370,10 @@ static inline struct page *lookup_swap_cache(swp_entry_t swp) return NULL; } -#define can_share_swap_page(p) (page_mapcount(p) == 1) +static inline int add_to_swap(struct page *page) +{ + return 0; +} static inline int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) @@ -390,14 +389,9 @@ static inline void delete_from_swap_cache(struct page *page) { } -#define swap_token_default_timeout 0 - -static inline int remove_exclusive_swap_page(struct page *p) -{ - return 0; -} +#define reuse_swap_page(page) (page_mapcount(page) == 1) -static inline int remove_exclusive_swap_page_ref(struct page *page) +static inline int try_to_free_swap(struct page *page) { return 0; } diff --git a/include/linux/types.h b/include/linux/types.h index 121f349cb7e..3b864f2d956 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -195,6 +195,16 @@ typedef u32 phys_addr_t; typedef phys_addr_t resource_size_t; +typedef struct { + volatile int counter; +} atomic_t; + +#ifdef CONFIG_64BIT +typedef struct { + volatile long counter; +} atomic64_t; +#endif + struct ustat { __kernel_daddr_t f_tfree; __kernel_ino_t f_tinode; diff --git a/include/linux/unwind.h b/include/linux/unwind.h deleted file mode 100644 index 7760860fa17..00000000000 --- a/include/linux/unwind.h +++ /dev/null @@ -1,68 +0,0 @@ -#ifndef _LINUX_UNWIND_H -#define _LINUX_UNWIND_H - -/* - * Copyright (C) 2002-2006 
Novell, Inc. - * Jan Beulich <jbeulich@novell.com> - * This code is released under version 2 of the GNU GPL. - * - * A simple API for unwinding kernel stacks. This is used for - * debugging and error reporting purposes. The kernel doesn't need - * full-blown stack unwinding with all the bells and whistles, so there - * is not much point in implementing the full Dwarf2 unwind API. - */ - -struct module; - -struct unwind_frame_info {}; - -static inline void unwind_init(void) {} -static inline void unwind_setup(void) {} - -#ifdef CONFIG_MODULES - -static inline void *unwind_add_table(struct module *mod, - const void *table_start, - unsigned long table_size) -{ - return NULL; -} - -static inline void unwind_remove_table(void *handle, int init_only) -{ -} - -#endif - -static inline int unwind_init_frame_info(struct unwind_frame_info *info, - struct task_struct *tsk, - const struct pt_regs *regs) -{ - return -ENOSYS; -} - -static inline int unwind_init_blocked(struct unwind_frame_info *info, - struct task_struct *tsk) -{ - return -ENOSYS; -} - -static inline int unwind_init_running(struct unwind_frame_info *info, - asmlinkage int (*cb)(struct unwind_frame_info *, - void *arg), - void *arg) -{ - return -ENOSYS; -} - -static inline int unwind(struct unwind_frame_info *info) -{ - return -ENOSYS; -} - -static inline int unwind_to_user(struct unwind_frame_info *info) -{ - return -ENOSYS; -} - -#endif /* _LINUX_UNWIND_H */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 307b88577ea..506e7620a98 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -97,6 +97,10 @@ extern void unmap_kernel_range(unsigned long addr, unsigned long size); extern struct vm_struct *alloc_vm_area(size_t size); extern void free_vm_area(struct vm_struct *area); +/* for /dev/kmem */ +extern long vread(char *buf, char *addr, unsigned long count); +extern long vwrite(char *buf, char *addr, unsigned long count); + /* * Internals. Dont't use.. 
*/ diff --git a/include/linux/writeback.h b/include/linux/writeback.h index e585657e983..7300ecdc480 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -30,7 +30,6 @@ static inline int task_is_pdflush(struct task_struct *task) enum writeback_sync_modes { WB_SYNC_NONE, /* Don't wait on anything */ WB_SYNC_ALL, /* Wait on every mapping */ - WB_SYNC_HOLD, /* Hold the inode on sb_dirty for sys_sync() */ }; /* @@ -107,7 +106,9 @@ void throttle_vm_writeout(gfp_t gfp_mask); /* These are exported to sysctl. */ extern int dirty_background_ratio; +extern unsigned long dirty_background_bytes; extern int vm_dirty_ratio; +extern unsigned long vm_dirty_bytes; extern int dirty_writeback_interval; extern int dirty_expire_interval; extern int vm_highmem_is_dirtyable; @@ -116,17 +117,26 @@ extern int laptop_mode; extern unsigned long determine_dirtyable_memory(void); +extern int dirty_background_ratio_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos); +extern int dirty_background_bytes_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos); extern int dirty_ratio_handler(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); +extern int dirty_bytes_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos); struct ctl_table; struct file; int dirty_writeback_centisecs_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); -void get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty, - struct backing_dev_info *bdi); +void get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, + unsigned long *pbdi_dirty, struct backing_dev_info *bdi); void page_writeback_init(void); void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, diff --git a/init/Kconfig b/init/Kconfig 
index 52847eec739..315a6114bf8 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -838,10 +838,6 @@ config RT_MUTEXES boolean select PLIST -config TINY_SHMEM - default !SHMEM - bool - config BASE_SMALL int default 0 if BASE_FULL diff --git a/init/do_mounts.c b/init/do_mounts.c index d055b1914c3..5efca73b39f 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -220,10 +220,10 @@ static int __init do_mount_root(char *name, char *fs, int flags, void *data) sys_chdir("/root"); ROOT_DEV = current->fs->pwd.mnt->mnt_sb->s_dev; - printk("VFS: Mounted root (%s filesystem)%s.\n", + printk("VFS: Mounted root (%s filesystem)%s on device %u:%u.\n", current->fs->pwd.mnt->mnt_sb->s_type->name, current->fs->pwd.mnt->mnt_sb->s_flags & MS_RDONLY ? - " readonly" : ""); + " readonly" : "", MAJOR(ROOT_DEV), MINOR(ROOT_DEV)); return 0; } diff --git a/init/do_mounts_md.c b/init/do_mounts_md.c index d6da5cdd3c3..ff95e319288 100644 --- a/init/do_mounts_md.c +++ b/init/do_mounts_md.c @@ -271,7 +271,7 @@ static int __init raid_setup(char *str) __setup("raid=", raid_setup); __setup("md=", md_setup); -static void autodetect_raid(void) +static void __init autodetect_raid(void) { int fd; diff --git a/init/main.c b/init/main.c index cd168ebc592..b5a892c6837 100644 --- a/init/main.c +++ b/init/main.c @@ -50,7 +50,6 @@ #include <linux/rmap.h> #include <linux/mempolicy.h> #include <linux/key.h> -#include <linux/unwind.h> #include <linux/buffer_head.h> #include <linux/page_cgroup.h> #include <linux/debug_locks.h> @@ -108,7 +107,7 @@ EXPORT_SYMBOL(system_state); extern void time_init(void); /* Default late time init is NULL. archs can override this later. */ -void (*late_time_init)(void); +void (*__initdata late_time_init)(void); extern void softirq_init(void); /* Untouched command line saved by arch-specific code. */ @@ -447,7 +446,7 @@ static void __init setup_command_line(char *command_line) * gcc-3.4 accidentally inlines this function, so use noinline. 
*/ -static void noinline __init_refok rest_init(void) +static noinline void __init_refok rest_init(void) __releases(kernel_lock) { int pid; @@ -537,7 +536,6 @@ asmlinkage void __init start_kernel(void) * Need to run as early as possible, to initialize the * lockdep hash: */ - unwind_init(); lockdep_init(); debug_objects_early_init(); cgroup_init_early(); @@ -559,7 +557,6 @@ asmlinkage void __init start_kernel(void) setup_arch(&command_line); mm_init_owner(&init_mm, &init_task); setup_command_line(command_line); - unwind_setup(); setup_per_cpu_areas(); setup_nr_cpu_ids(); smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ @@ -786,7 +783,7 @@ static void run_init_process(char *init_filename) /* This is a non __init function. Force it to be noinline otherwise gcc * makes it inline to init() and it becomes part of init.text section */ -static int noinline init_post(void) +static noinline int init_post(void) { free_initmem(); unlock_kernel(); diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index 0dfebc50942..4a7a12c95ab 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -26,29 +26,6 @@ static void *get_ipc(ctl_table *table) return which; } -/* - * Routine that is called when the file "auto_msgmni" has successfully been - * written. - * Two values are allowed: - * 0: unregister msgmni's callback routine from the ipc namespace notifier - * chain. This means that msgmni won't be recomputed anymore upon memory - * add/remove or ipc namespace creation/removal. - * 1: register back the callback routine. - */ -static void ipc_auto_callback(int val) -{ - if (!val) - unregister_ipcns_notifier(current->nsproxy->ipc_ns); - else { - /* - * Re-enable automatic recomputing only if not already - * enabled. 
- */ - recompute_msgmni(current->nsproxy->ipc_ns); - cond_register_ipcns_notifier(current->nsproxy->ipc_ns); - } -} - #ifdef CONFIG_PROC_FS static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -94,6 +71,29 @@ static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, lenp, ppos); } +/* + * Routine that is called when the file "auto_msgmni" has successfully been + * written. + * Two values are allowed: + * 0: unregister msgmni's callback routine from the ipc namespace notifier + * chain. This means that msgmni won't be recomputed anymore upon memory + * add/remove or ipc namespace creation/removal. + * 1: register back the callback routine. + */ +static void ipc_auto_callback(int val) +{ + if (!val) + unregister_ipcns_notifier(current->nsproxy->ipc_ns); + else { + /* + * Re-enable automatic recomputing only if not already + * enabled. + */ + recompute_msgmni(current->nsproxy->ipc_ns); + cond_register_ipcns_notifier(current->nsproxy->ipc_ns); + } +} + static int proc_ipcauto_dointvec_minmax(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { diff --git a/ipc/sem.c b/ipc/sem.c index fea0ad3aed7..c68cd3f8f0c 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -1216,7 +1216,6 @@ asmlinkage long sys_semtimedop(int semid, struct sembuf __user *tsops, if (timeout && jiffies_left == 0) error = -EAGAIN; list_del(&queue.list); - goto out_unlock_free; out_unlock_free: sem_unlock(sma); diff --git a/ipc/shm.c b/ipc/shm.c index 57dd50046ce..b125b560240 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -75,7 +75,7 @@ void shm_init_ns(struct ipc_namespace *ns) ns->shm_ctlall = SHMALL; ns->shm_ctlmni = SHMMNI; ns->shm_tot = 0; - ipc_init_ids(&ns->ids[IPC_SHM_IDS]); + ipc_init_ids(&shm_ids(ns)); } /* @@ -644,7 +644,7 @@ asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf) if (err) return err; - memset(&shminfo,0,sizeof(shminfo)); + memset(&shminfo, 
0, sizeof(shminfo)); shminfo.shmmni = shminfo.shmseg = ns->shm_ctlmni; shminfo.shmmax = ns->shm_ctlmax; shminfo.shmall = ns->shm_ctlall; @@ -669,7 +669,7 @@ asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf) if (err) return err; - memset(&shm_info,0,sizeof(shm_info)); + memset(&shm_info, 0, sizeof(shm_info)); down_read(&shm_ids(ns).rw_mutex); shm_info.used_ids = shm_ids(ns).in_use; shm_get_stat (ns, &shm_info.shm_rss, &shm_info.shm_swp); @@ -678,7 +678,7 @@ asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf) shm_info.swap_successes = 0; err = ipc_get_maxid(&shm_ids(ns)); up_read(&shm_ids(ns).rw_mutex); - if(copy_to_user (buf, &shm_info, sizeof(shm_info))) { + if (copy_to_user(buf, &shm_info, sizeof(shm_info))) { err = -EFAULT; goto out; } @@ -692,11 +692,6 @@ asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf) struct shmid64_ds tbuf; int result; - if (!buf) { - err = -EFAULT; - goto out; - } - if (cmd == SHM_STAT) { shp = shm_lock(ns, shmid); if (IS_ERR(shp)) { @@ -712,7 +707,7 @@ asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf) } result = 0; } - err=-EACCES; + err = -EACCES; if (ipcperms (&shp->shm_perm, S_IRUGO)) goto out_unlock; err = security_shm_shmctl(shp, cmd); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 87bb0258fd2..f221446aa02 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -116,7 +116,6 @@ static int root_count; * be called. 
*/ static int need_forkexit_callback __read_mostly; -static int need_mm_owner_callback __read_mostly; /* convenient tests for these bits */ inline int cgroup_is_removed(const struct cgroup *cgrp) @@ -2539,7 +2538,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; need_forkexit_callback |= ss->fork || ss->exit; - need_mm_owner_callback |= !!ss->mm_owner_changed; /* At system boot, before all subsystems have been * registered, no tasks have been forked, so we don't @@ -2789,37 +2787,6 @@ void cgroup_fork_callbacks(struct task_struct *child) } } -#ifdef CONFIG_MM_OWNER -/** - * cgroup_mm_owner_callbacks - run callbacks when the mm->owner changes - * @p: the new owner - * - * Called on every change to mm->owner. mm_init_owner() does not - * invoke this routine, since it assigns the mm->owner the first time - * and does not change it. - * - * The callbacks are invoked with mmap_sem held in read mode. - */ -void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new) -{ - struct cgroup *oldcgrp, *newcgrp = NULL; - - if (need_mm_owner_callback) { - int i; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - oldcgrp = task_cgroup(old, ss->subsys_id); - if (new) - newcgrp = task_cgroup(new, ss->subsys_id); - if (oldcgrp == newcgrp) - continue; - if (ss->mm_owner_changed) - ss->mm_owner_changed(ss, oldcgrp, newcgrp, new); - } - } -} -#endif /* CONFIG_MM_OWNER */ - /** * cgroup_post_fork - called on a new task after adding it to the task list * @child: the task in question diff --git a/kernel/compat.c b/kernel/compat.c index d52e2ec1deb..42d56544460 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -24,6 +24,7 @@ #include <linux/migrate.h> #include <linux/posix-timers.h> #include <linux/times.h> +#include <linux/ptrace.h> #include <asm/uaccess.h> @@ -229,6 +230,7 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf) 
if (copy_to_user(tbuf, &tmp, sizeof(tmp))) return -EFAULT; } + force_successful_syscall_return(); return compat_jiffies_to_clock_t(jiffies); } @@ -894,8 +896,9 @@ asmlinkage long compat_sys_time(compat_time_t __user * tloc) if (tloc) { if (put_user(i,tloc)) - i = -EFAULT; + return -EFAULT; } + force_successful_syscall_return(); return i; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 39c1a4c1c5a..345ace5117d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -240,6 +240,17 @@ static struct cpuset top_cpuset = { static DEFINE_MUTEX(callback_mutex); /* + * cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist + * buffers. They are statically allocated to prevent using excess stack + * when calling cpuset_print_task_mems_allowed(). + */ +#define CPUSET_NAME_LEN (128) +#define CPUSET_NODELIST_LEN (256) +static char cpuset_name[CPUSET_NAME_LEN]; +static char cpuset_nodelist[CPUSET_NODELIST_LEN]; +static DEFINE_SPINLOCK(cpuset_buffer_lock); + +/* * This is ugly, but preserves the userspace API for existing cpuset * users. If someone tries to mount the "cpuset" filesystem, we * silently switch it to mount "cgroup" instead @@ -2356,6 +2367,29 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed); } +/** + * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed + * @task: pointer to task_struct of some task. + * + * Description: Prints @task's name, cpuset name, and cached copy of its + * mems_allowed to the kernel log. Must hold task_lock(task) to allow + * dereferencing task_cs(task). + */ +void cpuset_print_task_mems_allowed(struct task_struct *tsk) +{ + struct dentry *dentry; + + dentry = task_cs(tsk)->css.cgroup->dentry; + spin_lock(&cpuset_buffer_lock); + snprintf(cpuset_name, CPUSET_NAME_LEN, + dentry ? 
(const char *)dentry->d_name.name : "/"); + nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, + tsk->mems_allowed); + printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", + tsk->comm, cpuset_name, cpuset_nodelist); + spin_unlock(&cpuset_buffer_lock); +} + /* * Collection of memory_pressure is suppressed unless * this flag is enabled by writing "1" to the special diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c index f013a0c2e11..038707404b7 100644 --- a/kernel/dma-coherent.c +++ b/kernel/dma-coherent.c @@ -109,20 +109,40 @@ EXPORT_SYMBOL(dma_mark_declared_memory_occupied); int dma_alloc_from_coherent(struct device *dev, ssize_t size, dma_addr_t *dma_handle, void **ret) { - struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; + struct dma_coherent_mem *mem; int order = get_order(size); + int pageno; - if (mem) { - int page = bitmap_find_free_region(mem->bitmap, mem->size, - order); - if (page >= 0) { - *dma_handle = mem->device_base + (page << PAGE_SHIFT); - *ret = mem->virt_base + (page << PAGE_SHIFT); - memset(*ret, 0, size); - } else if (mem->flags & DMA_MEMORY_EXCLUSIVE) - *ret = NULL; + if (!dev) + return 0; + mem = dev->dma_mem; + if (!mem) + return 0; + if (unlikely(size > mem->size)) + return 0; + + pageno = bitmap_find_free_region(mem->bitmap, mem->size, order); + if (pageno >= 0) { + /* + * Memory was found in the per-device arena. + */ + *dma_handle = mem->device_base + (pageno << PAGE_SHIFT); + *ret = mem->virt_base + (pageno << PAGE_SHIFT); + memset(*ret, 0, size); + } else if (mem->flags & DMA_MEMORY_EXCLUSIVE) { + /* + * The per-device arena is exhausted and we are not + * permitted to fall back to generic memory. + */ + *ret = NULL; + } else { + /* + * The per-device arena is exhausted and we are + * permitted to fall back to generic memory. 
+ */ + return 0; } - return (mem != NULL); + return 1; } EXPORT_SYMBOL(dma_alloc_from_coherent); diff --git a/kernel/exit.c b/kernel/exit.c index c9e5a1c14e0..c7740fa3252 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -642,35 +642,31 @@ retry: /* * We found no owner yet mm_users > 1: this implies that we are * most likely racing with swapoff (try_to_unuse()) or /proc or - * ptrace or page migration (get_task_mm()). Mark owner as NULL, - * so that subsystems can understand the callback and take action. + * ptrace or page migration (get_task_mm()). Mark owner as NULL. */ - down_write(&mm->mmap_sem); - cgroup_mm_owner_callbacks(mm->owner, NULL); mm->owner = NULL; - up_write(&mm->mmap_sem); return; assign_new_owner: BUG_ON(c == p); get_task_struct(c); - read_unlock(&tasklist_lock); - down_write(&mm->mmap_sem); /* * The task_lock protects c->mm from changing. * We always want mm->owner->mm == mm */ task_lock(c); + /* + * Delay read_unlock() till we have the task_lock() + * to ensure that c does not slip away underneath us + */ + read_unlock(&tasklist_lock); if (c->mm != mm) { task_unlock(c); - up_write(&mm->mmap_sem); put_task_struct(c); goto retry; } - cgroup_mm_owner_callbacks(mm->owner, c); mm->owner = c; task_unlock(c); - up_write(&mm->mmap_sem); put_task_struct(c); } #endif /* CONFIG_MM_OWNER */ @@ -1055,10 +1051,7 @@ NORET_TYPE void do_exit(long code) preempt_count()); acct_update_integrals(tsk); - if (tsk->mm) { - update_hiwater_rss(tsk->mm); - update_hiwater_vm(tsk->mm); - } + group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); diff --git a/kernel/fork.c b/kernel/fork.c index 43cbf30669e..7b8f2a78be3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -400,6 +400,18 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) #define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) +static unsigned long default_dump_filter = 
MMF_DUMP_FILTER_DEFAULT; + +static int __init coredump_filter_setup(char *s) +{ + default_dump_filter = + (simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) & + MMF_DUMP_FILTER_MASK; + return 1; +} + +__setup("coredump_filter=", coredump_filter_setup); + #include <linux/init_task.h> static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) @@ -408,8 +420,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); - mm->flags = (current->mm) ? current->mm->flags - : MMF_DUMP_FILTER_DEFAULT; + mm->flags = (current->mm) ? current->mm->flags : default_dump_filter; mm->core_state = NULL; mm->nr_ptes = 0; set_mm_counter(mm, file_rss, 0); @@ -758,7 +769,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) { struct sighand_struct *sig; - if (clone_flags & (CLONE_SIGHAND | CLONE_THREAD)) { + if (clone_flags & CLONE_SIGHAND) { atomic_inc(¤t->sighand->count); return 0; } diff --git a/kernel/kmod.c b/kernel/kmod.c index b46dbb90866..a27a5f64443 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -51,8 +51,8 @@ char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; /** * request_module - try to load a kernel module - * @fmt: printf style format string for the name of the module - * @varargs: arguements as specified in the format string + * @fmt: printf style format string for the name of the module + * @...: arguments as specified in the format string * * Load a module using the user mode module loader. The function returns * zero on success or a negative errno code on failure. 
Note that a diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9f8a3f25259..1b9cbdc0127 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -69,7 +69,7 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; /* NOTE: change this value only with kprobe_mutex held */ static bool kprobe_enabled; -DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ +static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; static struct { spinlock_t lock ____cacheline_aligned_in_smp; @@ -115,6 +115,7 @@ enum kprobe_slot_state { SLOT_USED = 2, }; +static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */ static struct hlist_head kprobe_insn_pages; static int kprobe_garbage_slots; static int collect_garbage_slots(void); @@ -144,10 +145,10 @@ loop_end: } /** - * get_insn_slot() - Find a slot on an executable page for an instruction. + * __get_insn_slot() - Find a slot on an executable page for an instruction. * We allocate an executable page if there's no room on existing ones. */ -kprobe_opcode_t __kprobes *get_insn_slot(void) +static kprobe_opcode_t __kprobes *__get_insn_slot(void) { struct kprobe_insn_page *kip; struct hlist_node *pos; @@ -196,6 +197,15 @@ kprobe_opcode_t __kprobes *get_insn_slot(void) return kip->insns; } +kprobe_opcode_t __kprobes *get_insn_slot(void) +{ + kprobe_opcode_t *ret; + mutex_lock(&kprobe_insn_mutex); + ret = __get_insn_slot(); + mutex_unlock(&kprobe_insn_mutex); + return ret; +} + /* Return 1 if all garbages are collected, otherwise 0. 
*/ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) { @@ -226,9 +236,13 @@ static int __kprobes collect_garbage_slots(void) { struct kprobe_insn_page *kip; struct hlist_node *pos, *next; + int safety; /* Ensure no-one is preepmted on the garbages */ - if (check_safety() != 0) + mutex_unlock(&kprobe_insn_mutex); + safety = check_safety(); + mutex_lock(&kprobe_insn_mutex); + if (safety != 0) return -EAGAIN; hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) { @@ -251,6 +265,7 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) struct kprobe_insn_page *kip; struct hlist_node *pos; + mutex_lock(&kprobe_insn_mutex); hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { if (kip->insns <= slot && slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { @@ -267,6 +282,8 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE) collect_garbage_slots(); + + mutex_unlock(&kprobe_insn_mutex); } #endif @@ -310,7 +327,7 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) struct kprobe *kp; list_for_each_entry_rcu(kp, &p->list, list) { - if (kp->pre_handler) { + if (kp->pre_handler && !kprobe_gone(kp)) { set_kprobe_instance(kp); if (kp->pre_handler(kp, regs)) return 1; @@ -326,7 +343,7 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, struct kprobe *kp; list_for_each_entry_rcu(kp, &p->list, list) { - if (kp->post_handler) { + if (kp->post_handler && !kprobe_gone(kp)) { set_kprobe_instance(kp); kp->post_handler(kp, regs, flags); reset_kprobe_instance(); @@ -393,7 +410,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, hlist_add_head(&ri->hlist, head); } -void kretprobe_hash_lock(struct task_struct *tsk, +void __kprobes kretprobe_hash_lock(struct task_struct *tsk, struct hlist_head **head, unsigned long *flags) { unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); @@ -404,13 
+421,15 @@ void kretprobe_hash_lock(struct task_struct *tsk, spin_lock_irqsave(hlist_lock, *flags); } -static void kretprobe_table_lock(unsigned long hash, unsigned long *flags) +static void __kprobes kretprobe_table_lock(unsigned long hash, + unsigned long *flags) { spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); spin_lock_irqsave(hlist_lock, *flags); } -void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags) +void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, + unsigned long *flags) { unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); spinlock_t *hlist_lock; @@ -419,7 +438,7 @@ void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags) spin_unlock_irqrestore(hlist_lock, *flags); } -void kretprobe_table_unlock(unsigned long hash, unsigned long *flags) +void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags) { spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); spin_unlock_irqrestore(hlist_lock, *flags); @@ -526,9 +545,10 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) ap->addr = p->addr; ap->pre_handler = aggr_pre_handler; ap->fault_handler = aggr_fault_handler; - if (p->post_handler) + /* We don't care the kprobe which has gone. */ + if (p->post_handler && !kprobe_gone(p)) ap->post_handler = aggr_post_handler; - if (p->break_handler) + if (p->break_handler && !kprobe_gone(p)) ap->break_handler = aggr_break_handler; INIT_LIST_HEAD(&ap->list); @@ -547,17 +567,41 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, int ret = 0; struct kprobe *ap; + if (kprobe_gone(old_p)) { + /* + * Attempting to insert new probe at the same location that + * had a probe in the module vaddr area which already + * freed. So, the instruction slot has already been + * released. We need a new slot for the new probe. 
+ */ + ret = arch_prepare_kprobe(old_p); + if (ret) + return ret; + } if (old_p->pre_handler == aggr_pre_handler) { copy_kprobe(old_p, p); ret = add_new_kprobe(old_p, p); + ap = old_p; } else { ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); - if (!ap) + if (!ap) { + if (kprobe_gone(old_p)) + arch_remove_kprobe(old_p); return -ENOMEM; + } add_aggr_kprobe(ap, old_p); copy_kprobe(ap, p); ret = add_new_kprobe(ap, p); } + if (kprobe_gone(old_p)) { + /* + * If the old_p has gone, its breakpoint has been disarmed. + * We have to arm it again after preparing real kprobes. + */ + ap->flags &= ~KPROBE_FLAG_GONE; + if (kprobe_enabled) + arch_arm_kprobe(ap); + } return ret; } @@ -600,8 +644,7 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) return (kprobe_opcode_t *)(((char *)addr) + p->offset); } -static int __kprobes __register_kprobe(struct kprobe *p, - unsigned long called_from) +int __kprobes register_kprobe(struct kprobe *p) { int ret = 0; struct kprobe *old_p; @@ -620,28 +663,30 @@ static int __kprobes __register_kprobe(struct kprobe *p, return -EINVAL; } - p->mod_refcounted = 0; - + p->flags = 0; /* * Check if are we probing a module. */ probed_mod = __module_text_address((unsigned long) p->addr); if (probed_mod) { - struct module *calling_mod; - calling_mod = __module_text_address(called_from); /* - * We must allow modules to probe themself and in this case - * avoid incrementing the module refcount, so as to allow - * unloading of self probing modules. + * We must hold a refcount of the probed module while updating + * its code to prohibit unexpected unloading. */ - if (calling_mod && calling_mod != probed_mod) { - if (unlikely(!try_module_get(probed_mod))) { - preempt_enable(); - return -EINVAL; - } - p->mod_refcounted = 1; - } else - probed_mod = NULL; + if (unlikely(!try_module_get(probed_mod))) { + preempt_enable(); + return -EINVAL; + } + /* + * If the module freed .init.text, we couldn't insert + * kprobes in there. 
+ */ + if (within_module_init((unsigned long)p->addr, probed_mod) && + probed_mod->state != MODULE_STATE_COMING) { + module_put(probed_mod); + preempt_enable(); + return -EINVAL; + } } preempt_enable(); @@ -668,8 +713,9 @@ static int __kprobes __register_kprobe(struct kprobe *p, out: mutex_unlock(&kprobe_mutex); - if (ret && probed_mod) + if (probed_mod) module_put(probed_mod); + return ret; } @@ -697,16 +743,16 @@ valid_p: list_is_singular(&old_p->list))) { /* * Only probe on the hash list. Disarm only if kprobes are - * enabled - otherwise, the breakpoint would already have - * been removed. We save on flushing icache. + * enabled and not gone - otherwise, the breakpoint would + * already have been removed. We save on flushing icache. */ - if (kprobe_enabled) + if (kprobe_enabled && !kprobe_gone(old_p)) arch_disarm_kprobe(p); hlist_del_rcu(&old_p->hlist); } else { - if (p->break_handler) + if (p->break_handler && !kprobe_gone(p)) old_p->break_handler = NULL; - if (p->post_handler) { + if (p->post_handler && !kprobe_gone(p)) { list_for_each_entry_rcu(list_p, &old_p->list, list) { if ((list_p != p) && (list_p->post_handler)) goto noclean; @@ -721,39 +767,27 @@ noclean: static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) { - struct module *mod; struct kprobe *old_p; - if (p->mod_refcounted) { - /* - * Since we've already incremented refcount, - * we don't need to disable preemption. 
- */ - mod = module_text_address((unsigned long)p->addr); - if (mod) - module_put(mod); - } - - if (list_empty(&p->list) || list_is_singular(&p->list)) { - if (!list_empty(&p->list)) { - /* "p" is the last child of an aggr_kprobe */ - old_p = list_entry(p->list.next, struct kprobe, list); - list_del(&p->list); - kfree(old_p); - } + if (list_empty(&p->list)) arch_remove_kprobe(p); + else if (list_is_singular(&p->list)) { + /* "p" is the last child of an aggr_kprobe */ + old_p = list_entry(p->list.next, struct kprobe, list); + list_del(&p->list); + arch_remove_kprobe(old_p); + kfree(old_p); } } -static int __register_kprobes(struct kprobe **kps, int num, - unsigned long called_from) +int __kprobes register_kprobes(struct kprobe **kps, int num) { int i, ret = 0; if (num <= 0) return -EINVAL; for (i = 0; i < num; i++) { - ret = __register_kprobe(kps[i], called_from); + ret = register_kprobe(kps[i]); if (ret < 0) { if (i > 0) unregister_kprobes(kps, i); @@ -763,26 +797,11 @@ static int __register_kprobes(struct kprobe **kps, int num, return ret; } -/* - * Registration and unregistration functions for kprobe. 
- */ -int __kprobes register_kprobe(struct kprobe *p) -{ - return __register_kprobes(&p, 1, - (unsigned long)__builtin_return_address(0)); -} - void __kprobes unregister_kprobe(struct kprobe *p) { unregister_kprobes(&p, 1); } -int __kprobes register_kprobes(struct kprobe **kps, int num) -{ - return __register_kprobes(kps, num, - (unsigned long)__builtin_return_address(0)); -} - void __kprobes unregister_kprobes(struct kprobe **kps, int num) { int i; @@ -811,8 +830,7 @@ unsigned long __weak arch_deref_entry_point(void *entry) return (unsigned long)entry; } -static int __register_jprobes(struct jprobe **jps, int num, - unsigned long called_from) +int __kprobes register_jprobes(struct jprobe **jps, int num) { struct jprobe *jp; int ret = 0, i; @@ -830,7 +848,7 @@ static int __register_jprobes(struct jprobe **jps, int num, /* Todo: Verify probepoint is a function entry point */ jp->kp.pre_handler = setjmp_pre_handler; jp->kp.break_handler = longjmp_break_handler; - ret = __register_kprobe(&jp->kp, called_from); + ret = register_kprobe(&jp->kp); } if (ret < 0) { if (i > 0) @@ -843,8 +861,7 @@ static int __register_jprobes(struct jprobe **jps, int num, int __kprobes register_jprobe(struct jprobe *jp) { - return __register_jprobes(&jp, 1, - (unsigned long)__builtin_return_address(0)); + return register_jprobes(&jp, 1); } void __kprobes unregister_jprobe(struct jprobe *jp) @@ -852,12 +869,6 @@ void __kprobes unregister_jprobe(struct jprobe *jp) unregister_jprobes(&jp, 1); } -int __kprobes register_jprobes(struct jprobe **jps, int num) -{ - return __register_jprobes(jps, num, - (unsigned long)__builtin_return_address(0)); -} - void __kprobes unregister_jprobes(struct jprobe **jps, int num) { int i; @@ -920,8 +931,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, return 0; } -static int __kprobes __register_kretprobe(struct kretprobe *rp, - unsigned long called_from) +int __kprobes register_kretprobe(struct kretprobe *rp) { int ret = 0; struct 
kretprobe_instance *inst; @@ -967,21 +977,20 @@ static int __kprobes __register_kretprobe(struct kretprobe *rp, rp->nmissed = 0; /* Establish function entry probe point */ - ret = __register_kprobe(&rp->kp, called_from); + ret = register_kprobe(&rp->kp); if (ret != 0) free_rp_inst(rp); return ret; } -static int __register_kretprobes(struct kretprobe **rps, int num, - unsigned long called_from) +int __kprobes register_kretprobes(struct kretprobe **rps, int num) { int ret = 0, i; if (num <= 0) return -EINVAL; for (i = 0; i < num; i++) { - ret = __register_kretprobe(rps[i], called_from); + ret = register_kretprobe(rps[i]); if (ret < 0) { if (i > 0) unregister_kretprobes(rps, i); @@ -991,23 +1000,11 @@ static int __register_kretprobes(struct kretprobe **rps, int num, return ret; } -int __kprobes register_kretprobe(struct kretprobe *rp) -{ - return __register_kretprobes(&rp, 1, - (unsigned long)__builtin_return_address(0)); -} - void __kprobes unregister_kretprobe(struct kretprobe *rp) { unregister_kretprobes(&rp, 1); } -int __kprobes register_kretprobes(struct kretprobe **rps, int num) -{ - return __register_kretprobes(rps, num, - (unsigned long)__builtin_return_address(0)); -} - void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) { int i; @@ -1055,6 +1052,72 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, #endif /* CONFIG_KRETPROBES */ +/* Set the kprobe gone and remove its instruction buffer. */ +static void __kprobes kill_kprobe(struct kprobe *p) +{ + struct kprobe *kp; + p->flags |= KPROBE_FLAG_GONE; + if (p->pre_handler == aggr_pre_handler) { + /* + * If this is an aggr_kprobe, we have to list all the + * chained probes and mark them GONE. + */ + list_for_each_entry_rcu(kp, &p->list, list) + kp->flags |= KPROBE_FLAG_GONE; + p->post_handler = NULL; + p->break_handler = NULL; + } + /* + * Here, we can remove insn_slot safely, because no thread calls + * the original probed function (which will be freed soon) any more. 
+ */ + arch_remove_kprobe(p); +} + +/* Module notifier call back, checking kprobes on the module */ +static int __kprobes kprobes_module_callback(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct module *mod = data; + struct hlist_head *head; + struct hlist_node *node; + struct kprobe *p; + unsigned int i; + int checkcore = (val == MODULE_STATE_GOING); + + if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE) + return NOTIFY_DONE; + + /* + * When MODULE_STATE_GOING was notified, both of module .text and + * .init.text sections would be freed. When MODULE_STATE_LIVE was + * notified, only .init.text section would be freed. We need to + * disable kprobes which have been inserted in the sections. + */ + mutex_lock(&kprobe_mutex); + for (i = 0; i < KPROBE_TABLE_SIZE; i++) { + head = &kprobe_table[i]; + hlist_for_each_entry_rcu(p, node, head, hlist) + if (within_module_init((unsigned long)p->addr, mod) || + (checkcore && + within_module_core((unsigned long)p->addr, mod))) { + /* + * The vaddr this probe is installed will soon + * be vfreed buy not synced to disk. Hence, + * disarming the breakpoint isn't needed. + */ + kill_kprobe(p); + } + } + mutex_unlock(&kprobe_mutex); + return NOTIFY_DONE; +} + +static struct notifier_block kprobe_module_nb = { + .notifier_call = kprobes_module_callback, + .priority = 0 +}; + static int __init init_kprobes(void) { int i, err = 0; @@ -1111,6 +1174,9 @@ static int __init init_kprobes(void) err = arch_init_kprobes(); if (!err) err = register_die_notifier(&kprobe_exceptions_nb); + if (!err) + err = register_module_notifier(&kprobe_module_nb); + kprobes_initialized = (err == 0); if (!err) @@ -1131,10 +1197,12 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, else kprobe_type = "k"; if (sym) - seq_printf(pi, "%p %s %s+0x%x %s\n", p->addr, kprobe_type, - sym, offset, (modname ? modname : " ")); + seq_printf(pi, "%p %s %s+0x%x %s %s\n", p->addr, kprobe_type, + sym, offset, (modname ? 
modname : " "), + (kprobe_gone(p) ? "[GONE]" : "")); else - seq_printf(pi, "%p %s %p\n", p->addr, kprobe_type, p->addr); + seq_printf(pi, "%p %s %p %s\n", p->addr, kprobe_type, p->addr, + (kprobe_gone(p) ? "[GONE]" : "")); } static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) @@ -1215,7 +1283,8 @@ static void __kprobes enable_all_kprobes(void) for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) - arch_arm_kprobe(p); + if (!kprobe_gone(p)) + arch_arm_kprobe(p); } kprobe_enabled = true; @@ -1244,7 +1313,7 @@ static void __kprobes disable_all_kprobes(void) for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) { - if (!arch_trampoline_kprobe(p)) + if (!arch_trampoline_kprobe(p) && !kprobe_gone(p)) arch_disarm_kprobe(p); } } diff --git a/kernel/module.c b/kernel/module.c index f47cce910f2..496dcb57b60 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -43,7 +43,6 @@ #include <linux/device.h> #include <linux/string.h> #include <linux/mutex.h> -#include <linux/unwind.h> #include <linux/rculist.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> @@ -1449,8 +1448,6 @@ static void free_module(struct module *mod) remove_sect_attrs(mod); mod_kobject_remove(mod); - unwind_remove_table(mod->unwind_info, 0); - /* Arch-specific cleanup. 
*/ module_arch_cleanup(mod); @@ -1867,7 +1864,6 @@ static noinline struct module *load_module(void __user *umod, unsigned int symindex = 0; unsigned int strindex = 0; unsigned int modindex, versindex, infoindex, pcpuindex; - unsigned int unwindex = 0; unsigned int num_kp, num_mcount; struct kernel_param *kp; struct module *mod; @@ -1957,9 +1953,6 @@ static noinline struct module *load_module(void __user *umod, versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); -#ifdef ARCH_UNWIND_SECTION_NAME - unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME); -#endif /* Don't keep modinfo and version sections. */ sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; @@ -1969,8 +1962,6 @@ static noinline struct module *load_module(void __user *umod, sechdrs[symindex].sh_flags |= SHF_ALLOC; sechdrs[strindex].sh_flags |= SHF_ALLOC; #endif - if (unwindex) - sechdrs[unwindex].sh_flags |= SHF_ALLOC; /* Check module struct version now, before we try to use module. */ if (!check_modstruct_version(sechdrs, versindex, mod)) { @@ -2267,11 +2258,6 @@ static noinline struct module *load_module(void __user *umod, add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); - /* Size of section 0 is 0, so this works well if no unwind info. */ - mod->unwind_info = unwind_add_table(mod, - (void *)sechdrs[unwindex].sh_addr, - sechdrs[unwindex].sh_size); - /* Get rid of temporary copy */ vfree(hdr); @@ -2366,11 +2352,12 @@ sys_init_module(void __user *umod, /* Now it's a first class citizen! Wake up anyone waiting for it. */ mod->state = MODULE_STATE_LIVE; wake_up(&module_wq); + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_LIVE, mod); mutex_lock(&module_mutex); /* Drop initial reference. 
*/ module_put(mod); - unwind_remove_table(mod->unwind_info, 1); module_free(mod, mod->module_init); mod->module_init = NULL; mod->init_size = 0; @@ -2405,7 +2392,7 @@ static const char *get_ksymbol(struct module *mod, unsigned long nextval; /* At worse, next value is at end of module */ - if (within(addr, mod->module_init, mod->init_size)) + if (within_module_init(addr, mod)) nextval = (unsigned long)mod->module_init+mod->init_text_size; else nextval = (unsigned long)mod->module_core+mod->core_text_size; @@ -2453,8 +2440,8 @@ const char *module_address_lookup(unsigned long addr, preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { - if (within(addr, mod->module_init, mod->init_size) - || within(addr, mod->module_core, mod->core_size)) { + if (within_module_init(addr, mod) || + within_module_core(addr, mod)) { if (modname) *modname = mod->name; ret = get_ksymbol(mod, addr, size, offset); @@ -2476,8 +2463,8 @@ int lookup_module_symbol_name(unsigned long addr, char *symname) preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { - if (within(addr, mod->module_init, mod->init_size) || - within(addr, mod->module_core, mod->core_size)) { + if (within_module_init(addr, mod) || + within_module_core(addr, mod)) { const char *sym; sym = get_ksymbol(mod, addr, NULL, NULL); @@ -2500,8 +2487,8 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { - if (within(addr, mod->module_init, mod->init_size) || - within(addr, mod->module_core, mod->core_size)) { + if (within_module_init(addr, mod) || + within_module_core(addr, mod)) { const char *sym; sym = get_ksymbol(mod, addr, size, offset); @@ -2720,7 +2707,7 @@ int is_module_address(unsigned long addr) preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { - if (within(addr, mod->module_core, mod->core_size)) { + if (within_module_core(addr, mod)) { preempt_enable(); return 1; } diff --git a/kernel/panic.c 
b/kernel/panic.c index 13f06349a78..2a2ff36ff44 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -299,6 +299,8 @@ static int init_oops_id(void) { if (!oops_id) get_random_bytes(&oops_id, sizeof(oops_id)); + else + oops_id++; return 0; } diff --git a/kernel/profile.c b/kernel/profile.c index d18e2d2654f..784933acf5b 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -445,7 +445,6 @@ void profile_tick(int type) #ifdef CONFIG_PROC_FS #include <linux/proc_fs.h> #include <asm/uaccess.h> -#include <asm/ptrace.h> static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) diff --git a/kernel/signal.c b/kernel/signal.c index 8e95855ff3c..3152ac3b62e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -858,7 +858,8 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, q->info.si_signo = sig; q->info.si_errno = 0; q->info.si_code = SI_USER; - q->info.si_pid = task_pid_vnr(current); + q->info.si_pid = task_tgid_nr_ns(current, + task_active_pid_ns(t)); q->info.si_uid = current_uid(); break; case (unsigned long) SEND_SIG_PRIV: diff --git a/kernel/sys.c b/kernel/sys.c index d356d79e84a..4a43617cd56 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -33,6 +33,7 @@ #include <linux/task_io_accounting_ops.h> #include <linux/seccomp.h> #include <linux/cpu.h> +#include <linux/ptrace.h> #include <linux/compat.h> #include <linux/syscalls.h> @@ -927,6 +928,7 @@ asmlinkage long sys_times(struct tms __user * tbuf) if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) return -EFAULT; } + force_successful_syscall_return(); return (long) jiffies_64_to_clock_t(get_jiffies_64()); } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ff6d45c7626..92f6e5bc3c2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -87,10 +87,6 @@ extern int rcutorture_runnable; #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ /* Constants used for minimum and maximum */ -#if defined(CONFIG_HIGHMEM) || defined(CONFIG_DETECT_SOFTLOCKUP) -static int 
one = 1; -#endif - #ifdef CONFIG_DETECT_SOFTLOCKUP static int sixty = 60; static int neg_one = -1; @@ -101,6 +97,7 @@ static int two = 2; #endif static int zero; +static int one = 1; static int one_hundred = 100; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ @@ -952,12 +949,22 @@ static struct ctl_table vm_table[] = { .data = &dirty_background_ratio, .maxlen = sizeof(dirty_background_ratio), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, + .proc_handler = &dirty_background_ratio_handler, .strategy = &sysctl_intvec, .extra1 = &zero, .extra2 = &one_hundred, }, { + .ctl_name = CTL_UNNUMBERED, + .procname = "dirty_background_bytes", + .data = &dirty_background_bytes, + .maxlen = sizeof(dirty_background_bytes), + .mode = 0644, + .proc_handler = &dirty_background_bytes_handler, + .strategy = &sysctl_intvec, + .extra1 = &one, + }, + { .ctl_name = VM_DIRTY_RATIO, .procname = "dirty_ratio", .data = &vm_dirty_ratio, @@ -969,6 +976,16 @@ static struct ctl_table vm_table[] = { .extra2 = &one_hundred, }, { + .ctl_name = CTL_UNNUMBERED, + .procname = "dirty_bytes", + .data = &vm_dirty_bytes, + .maxlen = sizeof(vm_dirty_bytes), + .mode = 0644, + .proc_handler = &dirty_bytes_handler, + .strategy = &sysctl_intvec, + .extra1 = &one, + }, + { .procname = "dirty_writeback_centisecs", .data = &dirty_writeback_interval, .maxlen = sizeof(dirty_writeback_interval), diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 06b6395b45b..4f104515a19 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -22,21 +22,11 @@ static u32 rand1, preh_val, posth_val, jph_val; static int errors, handler_errors, num_tests; +static u32 (*target)(u32 value); +static u32 (*target2)(u32 value); static noinline u32 kprobe_target(u32 value) { - /* - * gcc ignores noinline on some architectures unless we stuff - * sufficient lard into the function. The get_kprobe() here is - * just for that. 
- * - * NOTE: We aren't concerned about the correctness of get_kprobe() - * here; hence, this call is neither under !preempt nor with the - * kprobe_mutex held. This is fine(tm) - */ - if (get_kprobe((void *)0xdeadbeef)) - printk(KERN_INFO "Kprobe smoke test: probe on 0xdeadbeef!\n"); - return (value / div_factor); } @@ -74,7 +64,7 @@ static int test_kprobe(void) return ret; } - ret = kprobe_target(rand1); + ret = target(rand1); unregister_kprobe(&kp); if (preh_val == 0) { @@ -92,6 +82,84 @@ static int test_kprobe(void) return 0; } +static noinline u32 kprobe_target2(u32 value) +{ + return (value / div_factor) + 1; +} + +static int kp_pre_handler2(struct kprobe *p, struct pt_regs *regs) +{ + preh_val = (rand1 / div_factor) + 1; + return 0; +} + +static void kp_post_handler2(struct kprobe *p, struct pt_regs *regs, + unsigned long flags) +{ + if (preh_val != (rand1 / div_factor) + 1) { + handler_errors++; + printk(KERN_ERR "Kprobe smoke test failed: " + "incorrect value in post_handler2\n"); + } + posth_val = preh_val + div_factor; +} + +static struct kprobe kp2 = { + .symbol_name = "kprobe_target2", + .pre_handler = kp_pre_handler2, + .post_handler = kp_post_handler2 +}; + +static int test_kprobes(void) +{ + int ret; + struct kprobe *kps[2] = {&kp, &kp2}; + + kp.addr = 0; /* addr should be cleard for reusing kprobe. 
*/ + ret = register_kprobes(kps, 2); + if (ret < 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "register_kprobes returned %d\n", ret); + return ret; + } + + preh_val = 0; + posth_val = 0; + ret = target(rand1); + + if (preh_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kprobe pre_handler not called\n"); + handler_errors++; + } + + if (posth_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kprobe post_handler not called\n"); + handler_errors++; + } + + preh_val = 0; + posth_val = 0; + ret = target2(rand1); + + if (preh_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kprobe pre_handler2 not called\n"); + handler_errors++; + } + + if (posth_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kprobe post_handler2 not called\n"); + handler_errors++; + } + + unregister_kprobes(kps, 2); + return 0; + +} + static u32 j_kprobe_target(u32 value) { if (value != rand1) { @@ -121,7 +189,7 @@ static int test_jprobe(void) return ret; } - ret = kprobe_target(rand1); + ret = target(rand1); unregister_jprobe(&jp); if (jph_val == 0) { printk(KERN_ERR "Kprobe smoke test failed: " @@ -132,6 +200,43 @@ static int test_jprobe(void) return 0; } +static struct jprobe jp2 = { + .entry = j_kprobe_target, + .kp.symbol_name = "kprobe_target2" +}; + +static int test_jprobes(void) +{ + int ret; + struct jprobe *jps[2] = {&jp, &jp2}; + + jp.kp.addr = 0; /* addr should be cleard for reusing kprobe. 
*/ + ret = register_jprobes(jps, 2); + if (ret < 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "register_jprobes returned %d\n", ret); + return ret; + } + + jph_val = 0; + ret = target(rand1); + if (jph_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "jprobe handler not called\n"); + handler_errors++; + } + + jph_val = 0; + ret = target2(rand1); + if (jph_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "jprobe handler2 not called\n"); + handler_errors++; + } + unregister_jprobes(jps, 2); + + return 0; +} #ifdef CONFIG_KRETPROBES static u32 krph_val; @@ -177,7 +282,7 @@ static int test_kretprobe(void) return ret; } - ret = kprobe_target(rand1); + ret = target(rand1); unregister_kretprobe(&rp); if (krph_val != rand1) { printk(KERN_ERR "Kprobe smoke test failed: " @@ -187,12 +292,72 @@ static int test_kretprobe(void) return 0; } + +static int return_handler2(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + unsigned long ret = regs_return_value(regs); + + if (ret != (rand1 / div_factor) + 1) { + handler_errors++; + printk(KERN_ERR "Kprobe smoke test failed: " + "incorrect value in kretprobe handler2\n"); + } + if (krph_val == 0) { + handler_errors++; + printk(KERN_ERR "Kprobe smoke test failed: " + "call to kretprobe entry handler failed\n"); + } + + krph_val = rand1; + return 0; +} + +static struct kretprobe rp2 = { + .handler = return_handler2, + .entry_handler = entry_handler, + .kp.symbol_name = "kprobe_target2" +}; + +static int test_kretprobes(void) +{ + int ret; + struct kretprobe *rps[2] = {&rp, &rp2}; + + rp.kp.addr = 0; /* addr should be cleard for reusing kprobe. 
*/ + ret = register_kretprobes(rps, 2); + if (ret < 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "register_kretprobe returned %d\n", ret); + return ret; + } + + krph_val = 0; + ret = target(rand1); + if (krph_val != rand1) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kretprobe handler not called\n"); + handler_errors++; + } + + krph_val = 0; + ret = target2(rand1); + if (krph_val != rand1) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kretprobe handler2 not called\n"); + handler_errors++; + } + unregister_kretprobes(rps, 2); + return 0; +} #endif /* CONFIG_KRETPROBES */ int init_test_probes(void) { int ret; + target = kprobe_target; + target2 = kprobe_target2; + do { rand1 = random32(); } while (rand1 <= div_factor); @@ -204,15 +369,30 @@ int init_test_probes(void) errors++; num_tests++; + ret = test_kprobes(); + if (ret < 0) + errors++; + + num_tests++; ret = test_jprobe(); if (ret < 0) errors++; + num_tests++; + ret = test_jprobes(); + if (ret < 0) + errors++; + #ifdef CONFIG_KRETPROBES num_tests++; ret = test_kretprobe(); if (ret < 0) errors++; + + num_tests++; + ret = test_kretprobes(); + if (ret < 0) + errors++; #endif /* CONFIG_KRETPROBES */ if (errors) diff --git a/kernel/time.c b/kernel/time.c index d63a4336fad..4886e3ce83a 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -37,6 +37,7 @@ #include <linux/fs.h> #include <linux/slab.h> #include <linux/math64.h> +#include <linux/ptrace.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -65,8 +66,9 @@ asmlinkage long sys_time(time_t __user * tloc) if (tloc) { if (put_user(i,tloc)) - i = -EFAULT; + return -EFAULT; } + force_successful_syscall_return(); return i; } diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 2dc06ab3571..43f891b05a4 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -92,8 +92,8 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) mm = get_task_mm(p); if (mm) { /* adjust to KB unit */ - stats->hiwater_rss = mm->hiwater_rss * PAGE_SIZE / KB; - 
stats->hiwater_vm = mm->hiwater_vm * PAGE_SIZE / KB; + stats->hiwater_rss = get_mm_hiwater_rss(mm) * PAGE_SIZE / KB; + stats->hiwater_vm = get_mm_hiwater_vm(mm) * PAGE_SIZE / KB; mmput(mm); } stats->read_char = p->ioac.rchar; diff --git a/lib/bust_spinlocks.c b/lib/bust_spinlocks.c index 486da62b2b0..9681d54b95d 100644 --- a/lib/bust_spinlocks.c +++ b/lib/bust_spinlocks.c @@ -12,6 +12,7 @@ #include <linux/tty.h> #include <linux/wait.h> #include <linux/vt_kern.h> +#include <linux/console.h> void __attribute__((weak)) bust_spinlocks(int yes) @@ -22,6 +23,7 @@ void __attribute__((weak)) bust_spinlocks(int yes) #ifdef CONFIG_VT unblank_screen(); #endif + console_unblank(); if (--oops_in_progress == 0) wake_up_klogd(); } diff --git a/lib/fault-inject.c b/lib/fault-inject.c index a50a311554c..f97af55bdd9 100644 --- a/lib/fault-inject.c +++ b/lib/fault-inject.c @@ -6,7 +6,6 @@ #include <linux/fs.h> #include <linux/module.h> #include <linux/interrupt.h> -#include <linux/unwind.h> #include <linux/stacktrace.h> #include <linux/kallsyms.h> #include <linux/fault-inject.h> diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index b255b939bc1..a60bd804609 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -9,10 +9,8 @@ #include <linux/cpu.h> #include <linux/module.h> -#ifdef CONFIG_HOTPLUG_CPU static LIST_HEAD(percpu_counters); static DEFINE_MUTEX(percpu_counters_lock); -#endif void percpu_counter_set(struct percpu_counter *fbc, s64 amount) { @@ -111,13 +109,24 @@ void percpu_counter_destroy(struct percpu_counter *fbc) } EXPORT_SYMBOL(percpu_counter_destroy); -#ifdef CONFIG_HOTPLUG_CPU +int percpu_counter_batch __read_mostly = 32; +EXPORT_SYMBOL(percpu_counter_batch); + +static void compute_batch_value(void) +{ + int nr = num_online_cpus(); + + percpu_counter_batch = max(32, nr*2); +} + static int __cpuinit percpu_counter_hotcpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) { +#ifdef CONFIG_HOTPLUG_CPU unsigned int cpu; struct 
percpu_counter *fbc; + compute_batch_value(); if (action != CPU_DEAD) return NOTIFY_OK; @@ -134,13 +143,14 @@ static int __cpuinit percpu_counter_hotcpu_callback(struct notifier_block *nb, spin_unlock_irqrestore(&fbc->lock, flags); } mutex_unlock(&percpu_counters_lock); +#endif return NOTIFY_OK; } static int __init percpu_counter_startup(void) { + compute_batch_value(); hotcpu_notifier(percpu_counter_hotcpu_callback, 0); return 0; } module_init(percpu_counter_startup); -#endif diff --git a/lib/prio_heap.c b/lib/prio_heap.c index 471944a54e2..a7af6f85eca 100644 --- a/lib/prio_heap.c +++ b/lib/prio_heap.c @@ -31,7 +31,7 @@ void *heap_insert(struct ptr_heap *heap, void *p) if (heap->size < heap->max) { /* Heap insertion */ - int pos = heap->size++; + pos = heap->size++; while (pos > 0 && heap->gt(p, ptrs[(pos-1)/2])) { ptrs[pos] = ptrs[(pos-1)/2]; pos = (pos-1)/2; diff --git a/lib/proportions.c b/lib/proportions.c index 4f387a643d7..3fda810faf0 100644 --- a/lib/proportions.c +++ b/lib/proportions.c @@ -147,6 +147,7 @@ out: * this is used to track the active references. 
*/ static struct prop_global *prop_get_global(struct prop_descriptor *pd) +__acquires(RCU) { int index; @@ -160,6 +161,7 @@ static struct prop_global *prop_get_global(struct prop_descriptor *pd) } static void prop_put_global(struct prop_descriptor *pd, struct prop_global *pg) +__releases(RCU) { rcu_read_unlock(); } diff --git a/lib/radix-tree.c b/lib/radix-tree.c index be86b32bc87..8d3fb0bd128 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -81,7 +81,7 @@ struct radix_tree_preload { int nr; struct radix_tree_node *nodes[RADIX_TREE_MAX_PATH]; }; -DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; +static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; static inline gfp_t root_gfp_mask(struct radix_tree_root *root) { diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 98d632277ca..0fbd0121d91 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -170,6 +170,8 @@ int strict_strtoul(const char *cp, unsigned int base, unsigned long *res) return -EINVAL; val = simple_strtoul(cp, &tail, base); + if (tail == cp) + return -EINVAL; if ((*tail == '\0') || ((len == (size_t)(tail - cp) + 1) && (*tail == '\n'))) { *res = val; @@ -241,6 +243,8 @@ int strict_strtoull(const char *cp, unsigned int base, unsigned long long *res) return -EINVAL; val = simple_strtoull(cp, &tail, base); + if (tail == cp) + return -EINVAL; if ((*tail == '\0') || ((len == (size_t)(tail - cp) + 1) && (*tail == '\n'))) { *res = val; diff --git a/mm/Kconfig b/mm/Kconfig index 5b5790f8a81..a5b77811fdf 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -181,12 +181,6 @@ config MIGRATION example on NUMA systems to put pages nearer to the processors accessing the page. -config RESOURCES_64BIT - bool "64 bit Memory and IO resources (EXPERIMENTAL)" if (!64BIT && EXPERIMENTAL) - default 64BIT - help - This option allows memory and IO resources to be 64 bit. 
- config PHYS_ADDR_T_64BIT def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT diff --git a/mm/Makefile b/mm/Makefile index 51c27709cc7..72255be57f8 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -9,7 +9,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o pdflush.o \ - readahead.o swap.o truncate.o vmscan.o \ + readahead.o swap.o truncate.o vmscan.o shmem.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ page_isolation.o mm_init.o $(mmu-y) @@ -21,9 +21,7 @@ obj-$(CONFIG_HUGETLBFS) += hugetlb.o obj-$(CONFIG_NUMA) += mempolicy.o obj-$(CONFIG_SPARSEMEM) += sparse.o obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o -obj-$(CONFIG_SHMEM) += shmem.o obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o -obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o obj-$(CONFIG_SLOB) += slob.o obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o obj-$(CONFIG_SLAB) += slab.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 801c08b046e..6f80beddd8a 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -24,9 +24,9 @@ static void bdi_debug_init(void) static int bdi_debug_stats_show(struct seq_file *m, void *v) { struct backing_dev_info *bdi = m->private; - long background_thresh; - long dirty_thresh; - long bdi_thresh; + unsigned long background_thresh; + unsigned long dirty_thresh; + unsigned long bdi_thresh; get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); diff --git a/mm/bootmem.c b/mm/bootmem.c index ac5a891f142..51a0ccf61e0 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -435,6 +435,10 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata, unsigned long fallback = 0; unsigned long min, max, start, sidx, midx, step; + bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n", + bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT, + align, goal, limit); + BUG_ON(!size); BUG_ON(align & (align - 1)); BUG_ON(limit && goal + 
size > limit); @@ -442,10 +446,6 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata, if (!bdata->node_bootmem_map) return NULL; - bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n", - bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT, - align, goal, limit); - min = bdata->node_min_pfn; max = bdata->node_low_pfn; diff --git a/mm/filemap.c b/mm/filemap.c index f5769b4dc07..2f55a1e2baf 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -210,7 +210,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, int ret; struct writeback_control wbc = { .sync_mode = sync_mode, - .nr_to_write = mapping->nrpages * 2, + .nr_to_write = LONG_MAX, .range_start = start, .range_end = end, }; @@ -741,7 +741,14 @@ repeat: page = __page_cache_alloc(gfp_mask); if (!page) return NULL; - err = add_to_page_cache_lru(page, mapping, index, gfp_mask); + /* + * We want a regular kernel memory (not highmem or DMA etc) + * allocation for the radix tree nodes, but we need to honour + * the context-specific requirements the caller has asked for. + * GFP_RECLAIM_MASK collects those requirements. 
+ */ + err = add_to_page_cache_lru(page, mapping, index, + (gfp_mask & GFP_RECLAIM_MASK)); if (unlikely(err)) { page_cache_release(page); page = NULL; @@ -950,7 +957,7 @@ grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) return NULL; } page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS); - if (page && add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) { + if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) { page_cache_release(page); page = NULL; } @@ -1317,7 +1324,8 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, goto out; /* skip atime */ size = i_size_read(inode); if (pos < size) { - retval = filemap_write_and_wait(mapping); + retval = filemap_write_and_wait_range(mapping, pos, + pos + iov_length(iov, nr_segs) - 1); if (!retval) { retval = mapping->a_ops->direct_IO(READ, iocb, iov, pos, nr_segs); @@ -1530,7 +1538,6 @@ retry_find: /* * Found the page and have a reference on it. */ - mark_page_accessed(page); ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT; vmf->page = page; return ret | VM_FAULT_LOCKED; @@ -2060,18 +2067,10 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, if (count != ocount) *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); - /* - * Unmap all mmappings of the file up-front. - * - * This will cause any pte dirty bits to be propagated into the - * pageframes for the subsequent filemap_write_and_wait(). - */ write_len = iov_length(iov, *nr_segs); end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT; - if (mapping_mapped(mapping)) - unmap_mapping_range(mapping, pos, write_len, 0); - written = filemap_write_and_wait(mapping); + written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1); if (written) goto out; @@ -2291,7 +2290,8 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, * the file data here, to try to honour O_DIRECT expectations. 
*/ if (unlikely(file->f_flags & O_DIRECT) && written) - status = filemap_write_and_wait(mapping); + status = filemap_write_and_wait_range(mapping, + pos, pos + written - 1); return written ? written : status; } diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index b5167dfb2f2..0c04615651b 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -193,7 +193,7 @@ retry: /* Nuke the page table entry. */ flush_cache_page(vma, address, pte_pfn(*pte)); pteval = ptep_clear_flush_notify(vma, address, pte); - page_remove_rmap(page, vma); + page_remove_rmap(page); dec_mm_counter(mm, file_rss); BUG_ON(pte_dirty(pteval)); pte_unmap_unlock(pte, ptl); diff --git a/mm/fremap.c b/mm/fremap.c index 7d12ca70ef7..62d5bbda921 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -37,7 +37,7 @@ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, if (page) { if (pte_dirty(pte)) set_page_dirty(page); - page_remove_rmap(page, vma); + page_remove_rmap(page); page_cache_release(page); update_hiwater_rss(mm); dec_mm_counter(mm, file_rss); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6058b53dcb8..618e9830408 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -220,6 +220,35 @@ static pgoff_t vma_hugecache_offset(struct hstate *h, } /* + * Return the size of the pages allocated when backing a VMA. In the majority + * cases this will be same size as used by the page table entries. + */ +unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) +{ + struct hstate *hstate; + + if (!is_vm_hugetlb_page(vma)) + return PAGE_SIZE; + + hstate = hstate_vma(vma); + + return 1UL << (hstate->order + PAGE_SHIFT); +} + +/* + * Return the page size being used by the MMU to back a VMA. In the majority + * of cases, the page size used by the kernel matches the MMU size. On + * architectures where it differs, an architecture-specific version of this + * function is required. 
+ */ +#ifndef vma_mmu_pagesize +unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) +{ + return vma_kernel_pagesize(vma); +} +#endif + +/* * Flags for MAP_PRIVATE reservations. These are stored in the bottom * bits of the reservation map pointer, which are always clear due to * alignment. @@ -371,8 +400,10 @@ static void clear_huge_page(struct page *page, { int i; - if (unlikely(sz > MAX_ORDER_NR_PAGES)) - return clear_gigantic_page(page, addr, sz); + if (unlikely(sz > MAX_ORDER_NR_PAGES)) { + clear_gigantic_page(page, addr, sz); + return; + } might_sleep(); for (i = 0; i < sz/PAGE_SIZE; i++) { @@ -404,8 +435,10 @@ static void copy_huge_page(struct page *dst, struct page *src, int i; struct hstate *h = hstate_vma(vma); - if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) - return copy_gigantic_page(dst, src, addr, vma); + if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { + copy_gigantic_page(dst, src, addr, vma); + return; + } might_sleep(); for (i = 0; i < pages_per_huge_page(h); i++) { @@ -972,7 +1005,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, return page; } -__attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h) +int __weak alloc_bootmem_huge_page(struct hstate *h) { struct huge_bootmem_page *m; int nr_nodes = nodes_weight(node_online_map); @@ -991,8 +1024,7 @@ __attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h) * puts them into the mem_map). 
*/ m = addr; - if (m) - goto found; + goto found; } hstate_next_node(h); nr_nodes--; diff --git a/mm/internal.h b/mm/internal.h index 13333bc2eb6..478223b73a2 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -49,6 +49,7 @@ extern void putback_lru_page(struct page *page); /* * in mm/page_alloc.c */ +extern unsigned long highest_memmap_pfn; extern void __free_pages_bootmem(struct page *page, unsigned int order); /* @@ -275,6 +276,7 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, #define GUP_FLAGS_WRITE 0x1 #define GUP_FLAGS_FORCE 0x2 #define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4 +#define GUP_FLAGS_IGNORE_SIGKILL 0x8 int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int flags, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 866dcc7eeb0..51ee9654557 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -779,7 +779,8 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) return 0; } -int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val) +static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, + unsigned long long val) { int retry_count = MEM_CGROUP_RECLAIM_RETRIES; diff --git a/mm/memory.c b/mm/memory.c index 7b9db658aca..3f8fa06b963 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -52,6 +52,9 @@ #include <linux/writeback.h> #include <linux/memcontrol.h> #include <linux/mmu_notifier.h> +#include <linux/kallsyms.h> +#include <linux/swapops.h> +#include <linux/elf.h> #include <asm/pgalloc.h> #include <asm/uaccess.h> @@ -59,9 +62,6 @@ #include <asm/tlbflush.h> #include <asm/pgtable.h> -#include <linux/swapops.h> -#include <linux/elf.h> - #include "internal.h" #ifndef CONFIG_NEED_MULTIPLE_NODES @@ -375,15 +375,65 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) * * The calling function must still handle the error. 
*/ -static void print_bad_pte(struct vm_area_struct *vma, pte_t pte, - unsigned long vaddr) -{ - printk(KERN_ERR "Bad pte = %08llx, process = %s, " - "vm_flags = %lx, vaddr = %lx\n", - (long long)pte_val(pte), - (vma->vm_mm == current->mm ? current->comm : "???"), - vma->vm_flags, vaddr); +static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, + pte_t pte, struct page *page) +{ + pgd_t *pgd = pgd_offset(vma->vm_mm, addr); + pud_t *pud = pud_offset(pgd, addr); + pmd_t *pmd = pmd_offset(pud, addr); + struct address_space *mapping; + pgoff_t index; + static unsigned long resume; + static unsigned long nr_shown; + static unsigned long nr_unshown; + + /* + * Allow a burst of 60 reports, then keep quiet for that minute; + * or allow a steady drip of one report per second. + */ + if (nr_shown == 60) { + if (time_before(jiffies, resume)) { + nr_unshown++; + return; + } + if (nr_unshown) { + printk(KERN_ALERT + "BUG: Bad page map: %lu messages suppressed\n", + nr_unshown); + nr_unshown = 0; + } + nr_shown = 0; + } + if (nr_shown++ == 0) + resume = jiffies + 60 * HZ; + + mapping = vma->vm_file ? 
vma->vm_file->f_mapping : NULL; + index = linear_page_index(vma, addr); + + printk(KERN_ALERT + "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", + current->comm, + (long long)pte_val(pte), (long long)pmd_val(*pmd)); + if (page) { + printk(KERN_ALERT + "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n", + page, (void *)page->flags, page_count(page), + page_mapcount(page), page->mapping, page->index); + } + printk(KERN_ALERT + "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", + (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); + /* + * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y + */ + if (vma->vm_ops) + print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n", + (unsigned long)vma->vm_ops->fault); + if (vma->vm_file && vma->vm_file->f_op) + print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n", + (unsigned long)vma->vm_file->f_op->mmap); dump_stack(); + add_taint(TAINT_BAD_PAGE); } static inline int is_cow_mapping(unsigned int flags) @@ -441,21 +491,18 @@ static inline int is_cow_mapping(unsigned int flags) struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { - unsigned long pfn; + unsigned long pfn = pte_pfn(pte); if (HAVE_PTE_SPECIAL) { - if (likely(!pte_special(pte))) { - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - return pte_page(pte); - } - VM_BUG_ON(!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))); + if (likely(!pte_special(pte))) + goto check_pfn; + if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))) + print_bad_pte(vma, addr, pte, NULL); return NULL; } /* !HAVE_PTE_SPECIAL case follows: */ - pfn = pte_pfn(pte); - if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { if (vma->vm_flags & VM_MIXEDMAP) { if (!pfn_valid(pfn)) @@ -471,11 +518,14 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, } } - VM_BUG_ON(!pfn_valid(pfn)); +check_pfn: + if (unlikely(pfn > highest_memmap_pfn)) { + print_bad_pte(vma, addr, pte, NULL); + return NULL; + } /* * 
NOTE! We still have PageReserved() pages in the page tables. - * * eg. VDSO mappings can cause them to exist. */ out: @@ -767,11 +817,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, else { if (pte_dirty(ptent)) set_page_dirty(page); - if (pte_young(ptent)) - SetPageReferenced(page); + if (pte_young(ptent) && + likely(!VM_SequentialReadHint(vma))) + mark_page_accessed(page); file_rss--; } - page_remove_rmap(page, vma); + page_remove_rmap(page); + if (unlikely(page_mapcount(page) < 0)) + print_bad_pte(vma, addr, ptent, page); tlb_remove_page(tlb, page); continue; } @@ -781,8 +834,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, */ if (unlikely(details)) continue; - if (!pte_file(ptent)) - free_swap_and_cache(pte_to_swp_entry(ptent)); + if (pte_file(ptent)) { + if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) + print_bad_pte(vma, addr, ptent, NULL); + } else if + (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent)))) + print_bad_pte(vma, addr, ptent, NULL); pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); @@ -1153,6 +1210,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, int write = !!(flags & GUP_FLAGS_WRITE); int force = !!(flags & GUP_FLAGS_FORCE); int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); + int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL); if (len <= 0) return 0; @@ -1231,12 +1289,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, struct page *page; /* - * If tsk is ooming, cut off its access to large memory - * allocations. It has a pending SIGKILL, but it can't - * be processed until returning to user space. + * If we have a pending SIGKILL, don't keep faulting + * pages and potentially allocating memory, unless + * current is handling munlock--e.g., on exit. In + * that case, we are not allocating memory. Rather, + * we're only unlocking already resident/mapped pages. 
*/ - if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE))) - return i ? i : -ENOMEM; + if (unlikely(!ignore_sigkill && + fatal_signal_pending(current))) + return i ? i : -ERESTARTSYS; if (write) foll_flags |= FOLL_WRITE; @@ -1263,9 +1324,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, * do_wp_page has broken COW when necessary, * even if maybe_mkwrite decided not to set * pte_write. We can thus safely do subsequent - * page lookups as if they were reads. + * page lookups as if they were reads. But only + * do so when looping for pte_write is futile: + * in some cases userspace may also be wanting + * to write to the gotten user page, which a + * read fault here might prevent (a readonly + * page might get reCOWed by userspace write). */ - if (ret & VM_FAULT_WRITE) + if ((ret & VM_FAULT_WRITE) && + !(vma->vm_flags & VM_WRITE)) foll_flags &= ~FOLL_WRITE; cond_resched(); @@ -1644,6 +1711,8 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, BUG_ON(pmd_huge(*pmd)); + arch_enter_lazy_mmu_mode(); + token = pmd_pgtable(*pmd); do { @@ -1652,6 +1721,8 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, break; } while (pte++, addr += PAGE_SIZE, addr != end); + arch_leave_lazy_mmu_mode(); + if (mm != &init_mm) pte_unmap_unlock(pte-1, ptl); return err; @@ -1837,10 +1908,21 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * not dirty accountable. 
*/ if (PageAnon(old_page)) { - if (trylock_page(old_page)) { - reuse = can_share_swap_page(old_page); - unlock_page(old_page); + if (!trylock_page(old_page)) { + page_cache_get(old_page); + pte_unmap_unlock(page_table, ptl); + lock_page(old_page); + page_table = pte_offset_map_lock(mm, pmd, address, + &ptl); + if (!pte_same(*page_table, orig_pte)) { + unlock_page(old_page); + page_cache_release(old_page); + goto unlock; + } + page_cache_release(old_page); } + reuse = reuse_swap_page(old_page); + unlock_page(old_page); } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { /* @@ -1943,11 +2025,7 @@ gotten: * thread doing COW. */ ptep_clear_flush_notify(vma, address, page_table); - SetPageSwapBacked(new_page); - lru_cache_add_active_or_unevictable(new_page, vma); page_add_new_anon_rmap(new_page, vma, address); - -//TODO: is this safe? do_anonymous_page() does it this way. set_pte_at(mm, address, page_table, entry); update_mmu_cache(vma, address, entry); if (old_page) { @@ -1973,7 +2051,7 @@ gotten: * mapcount is visible. So transitively, TLBs to * old page will be flushed before it can be reused. */ - page_remove_rmap(old_page, vma); + page_remove_rmap(old_page); } /* Free the old page.. 
*/ @@ -2374,7 +2452,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, inc_mm_counter(mm, anon_rss); pte = mk_pte(page, vma->vm_page_prot); - if (write_access && can_share_swap_page(page)) { + if (write_access && reuse_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); write_access = 0; } @@ -2385,7 +2463,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, swap_free(entry); if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) - remove_exclusive_swap_page(page); + try_to_free_swap(page); unlock_page(page); if (write_access) { @@ -2442,8 +2520,6 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte_none(*page_table)) goto release; inc_mm_counter(mm, anon_rss); - SetPageSwapBacked(page); - lru_cache_add_active_or_unevictable(page, vma); page_add_new_anon_rmap(page, vma, address); set_pte_at(mm, address, page_table, entry); @@ -2591,8 +2667,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (anon) { inc_mm_counter(mm, anon_rss); - SetPageSwapBacked(page); - lru_cache_add_active_or_unevictable(page, vma); page_add_new_anon_rmap(page, vma, address); } else { inc_mm_counter(mm, file_rss); @@ -2602,7 +2676,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, get_page(dirty_page); } } -//TODO: is this safe? do_anonymous_page() does it this way. set_pte_at(mm, address, page_table, entry); /* no need to invalidate: a not-present page won't be cached */ @@ -2666,12 +2739,11 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) return 0; - if (unlikely(!(vma->vm_flags & VM_NONLINEAR) || - !(vma->vm_flags & VM_CAN_NONLINEAR))) { + if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) { /* * Page table corrupted: show pte and kill process. 
*/ - print_bad_pte(vma, orig_pte, address); + print_bad_pte(vma, address, orig_pte, NULL); return VM_FAULT_OOM; } @@ -2953,7 +3025,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, { resource_size_t phys_addr; unsigned long prot = 0; - void *maddr; + void __iomem *maddr; int offset = addr & (PAGE_SIZE-1); if (follow_phys(vma, addr, write, &prot, &phys_addr)) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b1737118546..c083cf5fd6d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -216,7 +216,8 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) return 0; } -static int __meminit __add_section(struct zone *zone, unsigned long phys_start_pfn) +static int __meminit __add_section(int nid, struct zone *zone, + unsigned long phys_start_pfn) { int nr_pages = PAGES_PER_SECTION; int ret; @@ -234,7 +235,7 @@ static int __meminit __add_section(struct zone *zone, unsigned long phys_start_p if (ret < 0) return ret; - return register_new_memory(__pfn_to_section(phys_start_pfn)); + return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); } #ifdef CONFIG_SPARSEMEM_VMEMMAP @@ -273,8 +274,8 @@ static int __remove_section(struct zone *zone, struct mem_section *ms) * call this function after deciding the zone to which to * add the new pages. 
*/ -int __ref __add_pages(struct zone *zone, unsigned long phys_start_pfn, - unsigned long nr_pages) +int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, + unsigned long nr_pages) { unsigned long i; int err = 0; @@ -284,7 +285,7 @@ int __ref __add_pages(struct zone *zone, unsigned long phys_start_pfn, end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); for (i = start_sec; i <= end_sec; i++) { - err = __add_section(zone, i << PFN_SECTION_SHIFT); + err = __add_section(nid, zone, i << PFN_SECTION_SHIFT); /* * EEXIST is finally dealt with by ioresource collision @@ -626,15 +627,12 @@ int scan_lru_pages(unsigned long start, unsigned long end) } static struct page * -hotremove_migrate_alloc(struct page *page, - unsigned long private, - int **x) +hotremove_migrate_alloc(struct page *page, unsigned long private, int **x) { - /* This should be improoooooved!! */ - return alloc_page(GFP_HIGHUSER_PAGECACHE); + /* This should be improooooved!! */ + return alloc_page(GFP_HIGHUSER_MOVABLE); } - #define NR_OFFLINE_AT_ONCE_PAGES (256) static int do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) diff --git a/mm/migrate.c b/mm/migrate.c index 21631ab8c08..55373983c9c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -300,12 +300,10 @@ static int migrate_page_move_mapping(struct address_space *mapping, * Now we know that no one else is looking at the page. 
*/ get_page(newpage); /* add cache reference */ -#ifdef CONFIG_SWAP if (PageSwapCache(page)) { SetPageSwapCache(newpage); set_page_private(newpage, page_private(page)); } -#endif radix_tree_replace_slot(pslot, newpage); @@ -373,9 +371,7 @@ static void migrate_page_copy(struct page *newpage, struct page *page) mlock_migrate_page(newpage, page); -#ifdef CONFIG_SWAP ClearPageSwapCache(page); -#endif ClearPagePrivate(page); set_page_private(page, 0); /* page->mapping contains a flag for PageAnon() */ @@ -848,12 +844,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm, struct vm_area_struct *vma; struct page *page; - /* - * A valid page pointer that will not match any of the - * pages that will be moved. - */ - pp->page = ZERO_PAGE(0); - err = -EFAULT; vma = find_vma(mm, pp->addr); if (!vma || !vma_migratable(vma)) @@ -919,41 +909,43 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task, const int __user *nodes, int __user *status, int flags) { - struct page_to_node *pm = NULL; + struct page_to_node *pm; nodemask_t task_nodes; - int err = 0; - int i; + unsigned long chunk_nr_pages; + unsigned long chunk_start; + int err; task_nodes = cpuset_mems_allowed(task); - /* Limit nr_pages so that the multiplication may not overflow */ - if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) { - err = -E2BIG; - goto out; - } - - pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node)); - if (!pm) { - err = -ENOMEM; + err = -ENOMEM; + pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); + if (!pm) goto out; - } - /* - * Get parameters from user space and initialize the pm - * array. Return various errors if the user did something wrong. 
+ * Store a chunk of page_to_node array in a page, + * but keep the last one as a marker */ - for (i = 0; i < nr_pages; i++) { - const void __user *p; + chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1; - err = -EFAULT; - if (get_user(p, pages + i)) - goto out_pm; + for (chunk_start = 0; + chunk_start < nr_pages; + chunk_start += chunk_nr_pages) { + int j; - pm[i].addr = (unsigned long)p; - if (nodes) { + if (chunk_start + chunk_nr_pages > nr_pages) + chunk_nr_pages = nr_pages - chunk_start; + + /* fill the chunk pm with addrs and nodes from user-space */ + for (j = 0; j < chunk_nr_pages; j++) { + const void __user *p; int node; - if (get_user(node, nodes + i)) + err = -EFAULT; + if (get_user(p, pages + j + chunk_start)) + goto out_pm; + pm[j].addr = (unsigned long) p; + + if (get_user(node, nodes + j + chunk_start)) goto out_pm; err = -ENODEV; @@ -964,22 +956,29 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task, if (!node_isset(node, task_nodes)) goto out_pm; - pm[i].node = node; - } else - pm[i].node = 0; /* anything to not match MAX_NUMNODES */ - } - /* End marker */ - pm[nr_pages].node = MAX_NUMNODES; + pm[j].node = node; + } + + /* End marker for this chunk */ + pm[chunk_nr_pages].node = MAX_NUMNODES; + + /* Migrate this chunk */ + err = do_move_page_to_node_array(mm, pm, + flags & MPOL_MF_MOVE_ALL); + if (err < 0) + goto out_pm; - err = do_move_page_to_node_array(mm, pm, flags & MPOL_MF_MOVE_ALL); - if (err >= 0) /* Return status information */ - for (i = 0; i < nr_pages; i++) - if (put_user(pm[i].status, status + i)) + for (j = 0; j < chunk_nr_pages; j++) + if (put_user(pm[j].status, status + j + chunk_start)) { err = -EFAULT; + goto out_pm; + } + } + err = 0; out_pm: - vfree(pm); + free_page((unsigned long)pm); out: return err; } diff --git a/mm/mlock.c b/mm/mlock.c index 3035a56e761..e125156c664 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -173,12 +173,13 @@ static long __mlock_vma_pages_range(struct vm_area_struct 
*vma, (atomic_read(&mm->mm_users) != 0)); /* - * mlock: don't page populate if page has PROT_NONE permission. - * munlock: the pages always do munlock althrough - * its has PROT_NONE permission. + * mlock: don't page populate if vma has PROT_NONE permission. + * munlock: always do munlock although the vma has PROT_NONE + * permission, or SIGKILL is pending. */ if (!mlock) - gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS; + gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS | + GUP_FLAGS_IGNORE_SIGKILL; if (vma->vm_flags & VM_WRITE) gup_flags |= GUP_FLAGS_WRITE; diff --git a/mm/mmap.c b/mm/mmap.c index 2c778fcfd9b..a910c045cfd 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -413,7 +413,7 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, static void __vma_link_file(struct vm_area_struct *vma) { - struct file * file; + struct file *file; file = vma->vm_file; if (file) { @@ -474,11 +474,10 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, * insert vm structure into list and rbtree and anon_vma, * but it has already been inserted into prio_tree earlier. */ -static void -__insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) +static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { - struct vm_area_struct * __vma, * prev; - struct rb_node ** rb_link, * rb_parent; + struct vm_area_struct *__vma, *prev; + struct rb_node **rb_link, *rb_parent; __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent); BUG_ON(__vma && __vma->vm_start < vma->vm_end); @@ -908,7 +907,7 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags, * The caller must hold down_write(current->mm->mmap_sem). 
*/ -unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, +unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long pgoff) { @@ -1464,7 +1463,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, EXPORT_SYMBOL(get_unmapped_area); /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ -struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr) +struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma = NULL; @@ -1507,7 +1506,7 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr, struct vm_area_struct **pprev) { struct vm_area_struct *vma = NULL, *prev = NULL; - struct rb_node * rb_node; + struct rb_node *rb_node; if (!mm) goto out; @@ -1541,7 +1540,7 @@ out: * update accounting. This is shared with both the * grow-up and grow-down cases. */ -static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, unsigned long grow) +static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow) { struct mm_struct *mm = vma->vm_mm; struct rlimit *rlim = current->signal->rlim; @@ -2091,6 +2090,9 @@ void exit_mmap(struct mm_struct *mm) arch_exit_mmap(mm); mmu_notifier_release(mm); + if (!mm->mmap) /* Can happen if dup_mmap() received an OOM */ + return; + if (mm->locked_vm) { vma = mm->mmap; while (vma) { @@ -2103,7 +2105,7 @@ void exit_mmap(struct mm_struct *mm) lru_add_drain(); flush_cache_mm(mm); tlb = tlb_gather_mmu(mm, 1); - /* Don't update_hiwater_rss(mm) here, do_exit already did */ + /* update_hiwater_rss(mm) here? 
but nobody should be looking */ /* Use -1 here to ensure all VMAs in the mm are unmapped */ end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); diff --git a/mm/mprotect.c b/mm/mprotect.c index cfb4c485206..d0f6e7ce09f 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -22,6 +22,7 @@ #include <linux/swap.h> #include <linux/swapops.h> #include <linux/mmu_notifier.h> +#include <linux/migrate.h> #include <asm/uaccess.h> #include <asm/pgtable.h> #include <asm/cacheflush.h> @@ -59,8 +60,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, ptent = pte_mkwrite(ptent); ptep_modify_prot_commit(mm, addr, pte, ptent); -#ifdef CONFIG_MIGRATION - } else if (!pte_file(oldpte)) { + } else if (PAGE_MIGRATION && !pte_file(oldpte)) { swp_entry_t entry = pte_to_swp_entry(oldpte); if (is_write_migration_entry(entry)) { @@ -72,9 +72,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, set_pte_at(mm, addr, pte, swp_entry_to_pte(entry)); } -#endif } - } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 558f9afe6e4..6b9e758c98a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -31,7 +31,7 @@ int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; int sysctl_oom_dump_tasks; -static DEFINE_SPINLOCK(zone_scan_mutex); +static DEFINE_SPINLOCK(zone_scan_lock); /* #define DEBUG */ /** @@ -392,6 +392,9 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, printk(KERN_WARNING "%s invoked oom-killer: " "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", current->comm, gfp_mask, order, current->oomkilladj); + task_lock(current); + cpuset_print_task_mems_allowed(current); + task_unlock(current); dump_stack(); show_mem(); if (sysctl_oom_dump_tasks) @@ -470,7 +473,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask) struct zone *zone; int ret = 1; - spin_lock(&zone_scan_mutex); + 
spin_lock(&zone_scan_lock); for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { if (zone_is_oom_locked(zone)) { ret = 0; @@ -480,7 +483,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask) for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { /* - * Lock each zone in the zonelist under zone_scan_mutex so a + * Lock each zone in the zonelist under zone_scan_lock so a * parallel invocation of try_set_zone_oom() doesn't succeed * when it shouldn't. */ @@ -488,7 +491,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask) } out: - spin_unlock(&zone_scan_mutex); + spin_unlock(&zone_scan_lock); return ret; } @@ -502,11 +505,74 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) struct zoneref *z; struct zone *zone; - spin_lock(&zone_scan_mutex); + spin_lock(&zone_scan_lock); for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { zone_clear_flag(zone, ZONE_OOM_LOCKED); } - spin_unlock(&zone_scan_mutex); + spin_unlock(&zone_scan_lock); +} + +/* + * Must be called with tasklist_lock held for read. + */ +static void __out_of_memory(gfp_t gfp_mask, int order) +{ + if (sysctl_oom_kill_allocating_task) { + oom_kill_process(current, gfp_mask, order, 0, NULL, + "Out of memory (oom_kill_allocating_task)"); + + } else { + unsigned long points; + struct task_struct *p; + +retry: + /* + * Rambo mode: Shoot down a process and hope it solves whatever + * issues we may have. + */ + p = select_bad_process(&points, NULL); + + if (PTR_ERR(p) == -1UL) + return; + + /* Found nothing?!?! Either we hang forever, or we panic. */ + if (!p) { + read_unlock(&tasklist_lock); + panic("Out of memory and no killable processes...\n"); + } + + if (oom_kill_process(p, gfp_mask, order, points, NULL, + "Out of memory")) + goto retry; + } +} + +/* + * pagefault handler calls into here because it is out of memory but + * doesn't know exactly how or why. 
+ */ +void pagefault_out_of_memory(void) +{ + unsigned long freed = 0; + + blocking_notifier_call_chain(&oom_notify_list, 0, &freed); + if (freed > 0) + /* Got some memory back in the last second. */ + return; + + if (sysctl_panic_on_oom) + panic("out of memory from page fault. panic_on_oom is selected.\n"); + + read_lock(&tasklist_lock); + __out_of_memory(0, 0); /* unknown gfp_mask and order */ + read_unlock(&tasklist_lock); + + /* + * Give "p" a good chance of killing itself before we + * retry to allocate memory. + */ + if (!test_thread_flag(TIF_MEMDIE)) + schedule_timeout_uninterruptible(1); } /** @@ -522,8 +588,6 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) */ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) { - struct task_struct *p; - unsigned long points = 0; unsigned long freed = 0; enum oom_constraint constraint; @@ -544,7 +608,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) switch (constraint) { case CONSTRAINT_MEMORY_POLICY: - oom_kill_process(current, gfp_mask, order, points, NULL, + oom_kill_process(current, gfp_mask, order, 0, NULL, "No available memory (MPOL_BIND)"); break; @@ -553,35 +617,10 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) panic("out of memory. panic_on_oom is selected\n"); /* Fall-through */ case CONSTRAINT_CPUSET: - if (sysctl_oom_kill_allocating_task) { - oom_kill_process(current, gfp_mask, order, points, NULL, - "Out of memory (oom_kill_allocating_task)"); - break; - } -retry: - /* - * Rambo mode: Shoot down a process and hope it solves whatever - * issues we may have. - */ - p = select_bad_process(&points, NULL); - - if (PTR_ERR(p) == -1UL) - goto out; - - /* Found nothing?!?! Either we hang forever, or we panic. 
*/ - if (!p) { - read_unlock(&tasklist_lock); - panic("Out of memory and no killable processes...\n"); - } - - if (oom_kill_process(p, gfp_mask, order, points, NULL, - "Out of memory")) - goto retry; - + __out_of_memory(gfp_mask, order); break; } -out: read_unlock(&tasklist_lock); /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 2970e35fd03..b493db7841d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -69,6 +69,12 @@ static inline long sync_writeback_pages(void) int dirty_background_ratio = 5; /* + * dirty_background_bytes starts at 0 (disabled) so that it is a function of + * dirty_background_ratio * the amount of dirtyable memory + */ +unsigned long dirty_background_bytes; + +/* * free highmem will not be subtracted from the total free memory * for calculating free ratios if vm_highmem_is_dirtyable is true */ @@ -80,6 +86,12 @@ int vm_highmem_is_dirtyable; int vm_dirty_ratio = 10; /* + * vm_dirty_bytes starts at 0 (disabled) so that it is a function of + * vm_dirty_ratio * the amount of dirtyable memory + */ +unsigned long vm_dirty_bytes; + +/* * The interval between `kupdate'-style writebacks, in jiffies */ int dirty_writeback_interval = 5 * HZ; @@ -135,23 +147,75 @@ static int calc_period_shift(void) { unsigned long dirty_total; - dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100; + if (vm_dirty_bytes) + dirty_total = vm_dirty_bytes / PAGE_SIZE; + else + dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / + 100; return 2 + ilog2(dirty_total - 1); } /* - * update the period when the dirty ratio changes. + * update the period when the dirty threshold changes. 
*/ +static void update_completion_period(void) +{ + int shift = calc_period_shift(); + prop_change_shift(&vm_completions, shift); + prop_change_shift(&vm_dirties, shift); +} + +int dirty_background_ratio_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + if (ret == 0 && write) + dirty_background_bytes = 0; + return ret; +} + +int dirty_background_bytes_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); + if (ret == 0 && write) + dirty_background_ratio = 0; + return ret; +} + int dirty_ratio_handler(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { int old_ratio = vm_dirty_ratio; - int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + int ret; + + ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_ratio != old_ratio) { - int shift = calc_period_shift(); - prop_change_shift(&vm_completions, shift); - prop_change_shift(&vm_dirties, shift); + update_completion_period(); + vm_dirty_bytes = 0; + } + return ret; +} + + +int dirty_bytes_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int old_bytes = vm_dirty_bytes; + int ret; + + ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); + if (ret == 0 && write && vm_dirty_bytes != old_bytes) { + update_completion_period(); + vm_dirty_ratio = 0; } return ret; } @@ -362,26 +426,32 @@ unsigned long determine_dirtyable_memory(void) } void -get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty, - struct backing_dev_info *bdi) +get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, + unsigned 
long *pbdi_dirty, struct backing_dev_info *bdi) { - int background_ratio; /* Percentages */ - int dirty_ratio; - long background; - long dirty; + unsigned long background; + unsigned long dirty; unsigned long available_memory = determine_dirtyable_memory(); struct task_struct *tsk; - dirty_ratio = vm_dirty_ratio; - if (dirty_ratio < 5) - dirty_ratio = 5; + if (vm_dirty_bytes) + dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); + else { + int dirty_ratio; - background_ratio = dirty_background_ratio; - if (background_ratio >= dirty_ratio) - background_ratio = dirty_ratio / 2; + dirty_ratio = vm_dirty_ratio; + if (dirty_ratio < 5) + dirty_ratio = 5; + dirty = (dirty_ratio * available_memory) / 100; + } + + if (dirty_background_bytes) + background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); + else + background = (dirty_background_ratio * available_memory) / 100; - background = (background_ratio * available_memory) / 100; - dirty = (dirty_ratio * available_memory) / 100; + if (background >= dirty) + background = dirty / 2; tsk = current; if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { background += background / 4; @@ -423,9 +493,9 @@ static void balance_dirty_pages(struct address_space *mapping) { long nr_reclaimable, bdi_nr_reclaimable; long nr_writeback, bdi_nr_writeback; - long background_thresh; - long dirty_thresh; - long bdi_thresh; + unsigned long background_thresh; + unsigned long dirty_thresh; + unsigned long bdi_thresh; unsigned long pages_written = 0; unsigned long write_chunk = sync_writeback_pages(); @@ -580,8 +650,8 @@ EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); void throttle_vm_writeout(gfp_t gfp_mask) { - long background_thresh; - long dirty_thresh; + unsigned long background_thresh; + unsigned long dirty_thresh; for ( ; ; ) { get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); @@ -624,8 +694,8 @@ static void background_writeout(unsigned long _min_pages) }; for ( ; ; ) { - long background_thresh; - long dirty_thresh; + 
unsigned long background_thresh; + unsigned long dirty_thresh; get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); if (global_page_state(NR_FILE_DIRTY) + @@ -868,9 +938,11 @@ int write_cache_pages(struct address_space *mapping, int done = 0; struct pagevec pvec; int nr_pages; + pgoff_t uninitialized_var(writeback_index); pgoff_t index; pgoff_t end; /* Inclusive */ - int scanned = 0; + pgoff_t done_index; + int cycled; int range_whole = 0; long nr_to_write = wbc->nr_to_write; @@ -881,83 +953,134 @@ int write_cache_pages(struct address_space *mapping, pagevec_init(&pvec, 0); if (wbc->range_cyclic) { - index = mapping->writeback_index; /* Start from prev offset */ + writeback_index = mapping->writeback_index; /* prev offset */ + index = writeback_index; + if (index == 0) + cycled = 1; + else + cycled = 0; end = -1; } else { index = wbc->range_start >> PAGE_CACHE_SHIFT; end = wbc->range_end >> PAGE_CACHE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; - scanned = 1; + cycled = 1; /* ignore range_cyclic tests */ } retry: - while (!done && (index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { - unsigned i; + done_index = index; + while (!done && (index <= end)) { + int i; + + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + break; - scanned = 1; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; /* - * At this point we hold neither mapping->tree_lock nor - * lock on the page itself: the page may be truncated or - * invalidated (changing page->mapping to NULL), or even - * swizzled back from swapper_space to tmpfs file - * mapping + * At this point, the page may be truncated or + * invalidated (changing page->mapping to NULL), or + * even swizzled back from swapper_space to tmpfs file + * mapping. 
However, page->index will not change + * because we have a reference on the page. */ + if (page->index > end) { + /* + * can't be range_cyclic (1st pass) because + * end == -1 in that case. + */ + done = 1; + break; + } + + done_index = page->index + 1; + lock_page(page); + /* + * Page truncated or invalidated. We can freely skip it + * then, even for data integrity operations: the page + * has disappeared concurrently, so there could be no + * real expectation of this data interity operation + * even if there is now a new, dirty page at the same + * pagecache address. + */ if (unlikely(page->mapping != mapping)) { +continue_unlock: unlock_page(page); continue; } - if (!wbc->range_cyclic && page->index > end) { - done = 1; - unlock_page(page); - continue; + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; } - if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); - - if (PageWriteback(page) || - !clear_page_dirty_for_io(page)) { - unlock_page(page); - continue; + if (PageWriteback(page)) { + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + else + goto continue_unlock; } - ret = (*writepage)(page, wbc, data); + BUG_ON(PageWriteback(page)); + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; - if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { - unlock_page(page); - ret = 0; + ret = (*writepage)(page, wbc, data); + if (unlikely(ret)) { + if (ret == AOP_WRITEPAGE_ACTIVATE) { + unlock_page(page); + ret = 0; + } else { + /* + * done_index is set past this page, + * so media errors will not choke + * background writeout for the entire + * file. This has consequences for + * range_cyclic semantics (ie. it may + * not be suitable for data integrity + * writeout). 
+ */ + done = 1; + break; + } + } + + if (wbc->sync_mode == WB_SYNC_NONE) { + wbc->nr_to_write--; + if (wbc->nr_to_write <= 0) { + done = 1; + break; + } } - if (ret || (--nr_to_write <= 0)) - done = 1; if (wbc->nonblocking && bdi_write_congested(bdi)) { wbc->encountered_congestion = 1; done = 1; + break; } } pagevec_release(&pvec); cond_resched(); } - if (!scanned && !done) { + if (!cycled) { /* + * range_cyclic: * We hit the last page and there is more work to be done: wrap * back to the start of the file */ - scanned = 1; + cycled = 1; index = 0; + end = writeback_index - 1; goto retry; } if (!wbc->no_nrwrite_index_update) { if (wbc->range_cyclic || (range_whole && nr_to_write > 0)) - mapping->writeback_index = index; + mapping->writeback_index = done_index; wbc->nr_to_write = nr_to_write; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d8ac0147456..7bf22e04531 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -69,7 +69,7 @@ EXPORT_SYMBOL(node_states); unsigned long totalram_pages __read_mostly; unsigned long totalreserve_pages __read_mostly; -long nr_swap_pages; +unsigned long highest_memmap_pfn __read_mostly; int percpu_pagelist_fraction; #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE @@ -223,19 +223,41 @@ static inline int bad_range(struct zone *zone, struct page *page) static void bad_page(struct page *page) { - printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG - "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", - current->comm, page, (int)(2*sizeof(unsigned long)), - (unsigned long)page->flags, page->mapping, - page_mapcount(page), page_count(page)); + static unsigned long resume; + static unsigned long nr_shown; + static unsigned long nr_unshown; + + /* + * Allow a burst of 60 reports, then keep quiet for that minute; + * or allow a steady drip of one report per second. 
+ */ + if (nr_shown == 60) { + if (time_before(jiffies, resume)) { + nr_unshown++; + goto out; + } + if (nr_unshown) { + printk(KERN_ALERT + "BUG: Bad page state: %lu messages suppressed\n", + nr_unshown); + nr_unshown = 0; + } + nr_shown = 0; + } + if (nr_shown++ == 0) + resume = jiffies + 60 * HZ; + + printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", + current->comm, page_to_pfn(page)); + printk(KERN_ALERT + "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n", + page, (void *)page->flags, page_count(page), + page_mapcount(page), page->mapping, page->index); - printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n" - KERN_EMERG "Backtrace:\n"); dump_stack(); - page->flags &= ~PAGE_FLAGS_CLEAR_WHEN_BAD; - set_page_count(page, 0); - reset_page_mapcount(page); - page->mapping = NULL; +out: + /* Leave bad fields for debug, except PageBuddy could make trouble */ + __ClearPageBuddy(page); add_taint(TAINT_BAD_PAGE); } @@ -292,25 +314,31 @@ void prep_compound_gigantic_page(struct page *page, unsigned long order) } #endif -static void destroy_compound_page(struct page *page, unsigned long order) +static int destroy_compound_page(struct page *page, unsigned long order) { int i; int nr_pages = 1 << order; + int bad = 0; - if (unlikely(compound_order(page) != order)) + if (unlikely(compound_order(page) != order) || + unlikely(!PageHead(page))) { bad_page(page); + bad++; + } - if (unlikely(!PageHead(page))) - bad_page(page); __ClearPageHead(page); + for (i = 1; i < nr_pages; i++) { struct page *p = page + i; - if (unlikely(!PageTail(p) | - (p->first_page != page))) + if (unlikely(!PageTail(p) | (p->first_page != page))) { bad_page(page); + bad++; + } __ClearPageTail(p); } + + return bad; } static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) @@ -430,7 +458,8 @@ static inline void __free_one_page(struct page *page, int migratetype = get_pageblock_migratetype(page); if (unlikely(PageCompound(page))) - 
destroy_compound_page(page, order); + if (unlikely(destroy_compound_page(page, order))) + return; page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); @@ -467,18 +496,13 @@ static inline int free_pages_check(struct page *page) if (unlikely(page_mapcount(page) | (page->mapping != NULL) | (page_count(page) != 0) | - (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) + (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { bad_page(page); - if (PageDirty(page)) - __ClearPageDirty(page); - if (PageSwapBacked(page)) - __ClearPageSwapBacked(page); - /* - * For now, we report if PG_reserved was found set, but do not - * clear it, and do not free the page. But we shall soon need - * to do more, for when the ZERO_PAGE count wraps negative. - */ - return PageReserved(page); + return 1; + } + if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) + page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + return 0; } /* @@ -523,11 +547,11 @@ static void __free_pages_ok(struct page *page, unsigned int order) { unsigned long flags; int i; - int reserved = 0; + int bad = 0; for (i = 0 ; i < (1 << order) ; ++i) - reserved += free_pages_check(page + i); - if (reserved) + bad += free_pages_check(page + i); + if (bad) return; if (!PageHighMem(page)) { @@ -612,23 +636,11 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) if (unlikely(page_mapcount(page) | (page->mapping != NULL) | (page_count(page) != 0) | - (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) + (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { bad_page(page); - - /* - * For now, we report if PG_reserved was found set, but do not - * clear it, and do not allocate the page: as a safety net. 
- */ - if (PageReserved(page)) return 1; + } - page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim | - 1 << PG_referenced | 1 << PG_arch_1 | - 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk -#ifdef CONFIG_UNEVICTABLE_LRU - | 1 << PG_mlocked -#endif - ); set_page_private(page, 0); set_page_refcounted(page); @@ -2609,6 +2621,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long pfn; struct zone *z; + if (highest_memmap_pfn < end_pfn - 1) + highest_memmap_pfn = end_pfn - 1; + z = &NODE_DATA(nid)->node_zones[zone]; for (pfn = start_pfn; pfn < end_pfn; pfn++) { /* @@ -3381,10 +3396,8 @@ static void __init setup_usemap(struct pglist_data *pgdat, { unsigned long usemapsize = usemap_size(zonesize); zone->pageblock_flags = NULL; - if (usemapsize) { + if (usemapsize) zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); - memset(zone->pageblock_flags, 0, usemapsize); - } } #else static void inline setup_usemap(struct pglist_data *pgdat, @@ -3469,9 +3482,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; if (realsize >= memmap_pages) { realsize -= memmap_pages; - printk(KERN_DEBUG - " %s zone: %lu pages used for memmap\n", - zone_names[j], memmap_pages); + if (memmap_pages) + printk(KERN_DEBUG + " %s zone: %lu pages used for memmap\n", + zone_names[j], memmap_pages); } else printk(KERN_WARNING " %s zone: %lu pages exceeds realsize %lu\n", @@ -4316,7 +4330,7 @@ void setup_per_zone_pages_min(void) * 1TB 101 10GB * 10TB 320 32GB */ -void setup_per_zone_inactive_ratio(void) +static void setup_per_zone_inactive_ratio(void) { struct zone *zone; @@ -4573,19 +4587,6 @@ void *__init alloc_large_system_hash(const char *tablename, return table; } -#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE -struct page *pfn_to_page(unsigned long pfn) -{ - return __pfn_to_page(pfn); -} -unsigned long page_to_pfn(struct page *page) -{ - return __page_to_pfn(page); 
-} -EXPORT_SYMBOL(pfn_to_page); -EXPORT_SYMBOL(page_to_pfn); -#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ - /* Return a pointer to the bitmap storing bits affecting a block of pages */ static inline unsigned long *get_pageblock_bitmap(struct zone *zone, unsigned long pfn) diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index ab27ff75051..d6507a660ed 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -101,7 +101,7 @@ struct page_cgroup *lookup_page_cgroup(struct page *page) } /* __alloc_bootmem...() is protected by !slab_available() */ -int __init_refok init_section_page_cgroup(unsigned long pfn) +static int __init_refok init_section_page_cgroup(unsigned long pfn) { struct mem_section *section; struct page_cgroup *base, *pc; diff --git a/mm/page_io.c b/mm/page_io.c index 065c4480eaf..dc6ce0afbde 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -98,7 +98,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) struct bio *bio; int ret = 0, rw = WRITE; - if (remove_exclusive_swap_page(page)) { + if (try_to_free_swap(page)) { unlock_page(page); goto out; } @@ -125,8 +125,8 @@ int swap_readpage(struct file *file, struct page *page) struct bio *bio; int ret = 0; - BUG_ON(!PageLocked(page)); - BUG_ON(PageUptodate(page)); + VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON(PageUptodate(page)); bio = get_swap_bio(GFP_KERNEL, page_private(page), page, end_swap_bio_read); if (bio == NULL) { diff --git a/mm/rmap.c b/mm/rmap.c index 10993942d6c..ac4af8cffbf 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -47,9 +47,9 @@ #include <linux/rmap.h> #include <linux/rcupdate.h> #include <linux/module.h> -#include <linux/kallsyms.h> #include <linux/memcontrol.h> #include <linux/mmu_notifier.h> +#include <linux/migrate.h> #include <asm/tlbflush.h> @@ -191,7 +191,7 @@ void __init anon_vma_init(void) * Getting a lock on a stable anon_vma from a page off the LRU is * tricky: page_lock_anon_vma rely on RCU to guard against the races. 
*/ -struct anon_vma *page_lock_anon_vma(struct page *page) +static struct anon_vma *page_lock_anon_vma(struct page *page) { struct anon_vma *anon_vma; unsigned long anon_mapping; @@ -211,7 +211,7 @@ out: return NULL; } -void page_unlock_anon_vma(struct anon_vma *anon_vma) +static void page_unlock_anon_vma(struct anon_vma *anon_vma) { spin_unlock(&anon_vma->lock); rcu_read_unlock(); @@ -359,8 +359,17 @@ static int page_referenced_one(struct page *page, goto out_unmap; } - if (ptep_clear_flush_young_notify(vma, address, pte)) - referenced++; + if (ptep_clear_flush_young_notify(vma, address, pte)) { + /* + * Don't treat a reference through a sequentially read + * mapping as such. If the page has been used in + * another mapping, we will catch it; if this other + * mapping is already gone, the unmap path will have + * set PG_referenced or activated the page. + */ + if (likely(!VM_SequentialReadHint(vma))) + referenced++; + } /* Pretend the page is referenced if the task has the swap token and is in the middle of a page fault. 
*/ @@ -661,9 +670,14 @@ void page_add_anon_rmap(struct page *page, void page_add_new_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { - BUG_ON(address < vma->vm_start || address >= vma->vm_end); - atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */ + VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); + SetPageSwapBacked(page); + atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ __page_set_anon_rmap(page, vma, address); + if (page_evictable(page, vma)) + lru_cache_add_lru(page, LRU_ACTIVE_ANON); + else + add_page_to_unevictable_list(page); } /** @@ -693,7 +707,6 @@ void page_add_file_rmap(struct page *page) */ void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { - BUG_ON(page_mapcount(page) == 0); if (PageAnon(page)) __page_check_anon_rmap(page, vma, address); atomic_inc(&page->_mapcount); @@ -703,28 +716,12 @@ void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long /** * page_remove_rmap - take down pte mapping from a page * @page: page to remove mapping from - * @vma: the vm area in which the mapping is removed * * The caller needs to hold the pte lock. */ -void page_remove_rmap(struct page *page, struct vm_area_struct *vma) +void page_remove_rmap(struct page *page) { if (atomic_add_negative(-1, &page->_mapcount)) { - if (unlikely(page_mapcount(page) < 0)) { - printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! 
(%d)\n", page_mapcount(page)); - printk (KERN_EMERG " page pfn = %lx\n", page_to_pfn(page)); - printk (KERN_EMERG " page->flags = %lx\n", page->flags); - printk (KERN_EMERG " page->count = %x\n", page_count(page)); - printk (KERN_EMERG " page->mapping = %p\n", page->mapping); - print_symbol (KERN_EMERG " vma->vm_ops = %s\n", (unsigned long)vma->vm_ops); - if (vma->vm_ops) { - print_symbol (KERN_EMERG " vma->vm_ops->fault = %s\n", (unsigned long)vma->vm_ops->fault); - } - if (vma->vm_file && vma->vm_file->f_op) - print_symbol (KERN_EMERG " vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap); - BUG(); - } - /* * Now that the last pte has gone, s390 must transfer dirty * flag from storage key to struct page. We can usually skip @@ -818,8 +815,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, spin_unlock(&mmlist_lock); } dec_mm_counter(mm, anon_rss); -#ifdef CONFIG_MIGRATION - } else { + } else if (PAGE_MIGRATION) { /* * Store the pfn of the page in a special migration * pte. 
do_swap_page() will wait until the migration @@ -827,23 +823,19 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, */ BUG_ON(!migration); entry = make_migration_entry(page, pte_write(pteval)); -#endif } set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); BUG_ON(pte_file(*pte)); - } else -#ifdef CONFIG_MIGRATION - if (migration) { + } else if (PAGE_MIGRATION && migration) { /* Establish migration entry for a file page */ swp_entry_t entry; entry = make_migration_entry(page, pte_write(pteval)); set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); } else -#endif dec_mm_counter(mm, file_rss); - page_remove_rmap(page, vma); + page_remove_rmap(page); page_cache_release(page); out_unmap: @@ -958,7 +950,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, if (pte_dirty(pteval)) set_page_dirty(page); - page_remove_rmap(page, vma); + page_remove_rmap(page); page_cache_release(page); dec_mm_counter(mm, file_rss); (*mapcount)--; diff --git a/mm/shmem.c b/mm/shmem.c index f1b0d4871f3..5941f980136 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -14,31 +14,39 @@ * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net> * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> * + * tiny-shmem: + * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com> + * * This file is released under the GPL. */ +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/vfs.h> +#include <linux/mount.h> +#include <linux/file.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/swap.h> + +static struct vfsmount *shm_mnt; + +#ifdef CONFIG_SHMEM /* * This virtual memory filesystem is heavily based on the ramfs. It * extends ramfs by the ability to use swap and honor resource limits * which makes it a completely usable filesystem. 
*/ -#include <linux/module.h> -#include <linux/init.h> -#include <linux/fs.h> #include <linux/xattr.h> #include <linux/exportfs.h> #include <linux/generic_acl.h> -#include <linux/mm.h> #include <linux/mman.h> -#include <linux/file.h> -#include <linux/swap.h> #include <linux/pagemap.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/backing-dev.h> #include <linux/shmem_fs.h> -#include <linux/mount.h> #include <linux/writeback.h> #include <linux/vfs.h> #include <linux/blkdev.h> @@ -1444,7 +1452,6 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (error) return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); - mark_page_accessed(vmf->page); return ret | VM_FAULT_LOCKED; } @@ -2486,7 +2493,6 @@ static struct file_system_type tmpfs_fs_type = { .get_sb = shmem_get_sb, .kill_sb = kill_litter_super, }; -static struct vfsmount *shm_mnt; static int __init init_tmpfs(void) { @@ -2525,7 +2531,51 @@ out4: shm_mnt = ERR_PTR(error); return error; } -module_init(init_tmpfs) + +#else /* !CONFIG_SHMEM */ + +/* + * tiny-shmem: simple shmemfs and tmpfs using ramfs code + * + * This is intended for small system where the benefits of the full + * shmem code (swap-backed and resource-limited) are outweighed by + * their complexity. On systems without swap this code should be + * effectively equivalent, but much lighter weight. 
+ */ + +#include <linux/ramfs.h> + +static struct file_system_type tmpfs_fs_type = { + .name = "tmpfs", + .get_sb = ramfs_get_sb, + .kill_sb = kill_litter_super, +}; + +static int __init init_tmpfs(void) +{ + BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); + + shm_mnt = kern_mount(&tmpfs_fs_type); + BUG_ON(IS_ERR(shm_mnt)); + + return 0; +} + +int shmem_unuse(swp_entry_t entry, struct page *page) +{ + return 0; +} + +#define shmem_file_operations ramfs_file_operations +#define shmem_vm_ops generic_file_vm_ops +#define shmem_get_inode ramfs_get_inode +#define shmem_acct_size(a, b) 0 +#define shmem_unacct_size(a, b) do {} while (0) +#define SHMEM_MAX_BYTES LLONG_MAX + +#endif /* CONFIG_SHMEM */ + +/* common code */ /** * shmem_file_setup - get an unlinked file living in tmpfs @@ -2569,12 +2619,20 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) if (!inode) goto close_file; +#ifdef CONFIG_SHMEM SHMEM_I(inode)->flags = flags & VM_ACCOUNT; +#endif d_instantiate(dentry, inode); inode->i_size = size; inode->i_nlink = 0; /* It is unlinked */ init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, - &shmem_file_operations); + &shmem_file_operations); + +#ifndef CONFIG_MMU + error = ramfs_nommu_expand_for_mapping(inode, size); + if (error) + goto close_file; +#endif return file; close_file: @@ -2606,3 +2664,5 @@ int shmem_zero_setup(struct vm_area_struct *vma) vma->vm_ops = &shmem_vm_ops; return 0; } + +module_init(init_tmpfs) diff --git a/mm/swap.c b/mm/swap.c index b135ec90cde..ba2c0e8b8b5 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -246,25 +246,6 @@ void add_page_to_unevictable_list(struct page *page) spin_unlock_irq(&zone->lru_lock); } -/** - * lru_cache_add_active_or_unevictable - * @page: the page to be added to LRU - * @vma: vma in which page is mapped for determining reclaimability - * - * place @page on active or unevictable LRU list, depending on - * page_evictable(). 
Note that if the page is not evictable, - * it goes directly back onto it's zone's unevictable list. It does - * NOT use a per cpu pagevec. - */ -void lru_cache_add_active_or_unevictable(struct page *page, - struct vm_area_struct *vma) -{ - if (page_evictable(page, vma)) - lru_cache_add_lru(page, LRU_ACTIVE + page_is_file_cache(page)); - else - add_page_to_unevictable_list(page); -} - /* * Drain pages out of the cpu's pagevecs. * Either "cpu" is the current CPU, and preemption has already been @@ -398,28 +379,6 @@ void __pagevec_release(struct pagevec *pvec) EXPORT_SYMBOL(__pagevec_release); /* - * pagevec_release() for pages which are known to not be on the LRU - * - * This function reinitialises the caller's pagevec. - */ -void __pagevec_release_nonlru(struct pagevec *pvec) -{ - int i; - struct pagevec pages_to_free; - - pagevec_init(&pages_to_free, pvec->cold); - for (i = 0; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; - - VM_BUG_ON(PageLRU(page)); - if (put_page_testzero(page)) - pagevec_add(&pages_to_free, page); - } - pagevec_free(&pages_to_free); - pagevec_reinit(pvec); -} - -/* * Add the passed pages to the LRU, then drop the caller's refcount * on them. Reinitialises the caller's pagevec. 
*/ @@ -495,8 +454,7 @@ void pagevec_swap_free(struct pagevec *pvec) struct page *page = pvec->pages[i]; if (PageSwapCache(page) && trylock_page(page)) { - if (PageSwapCache(page)) - remove_exclusive_swap_page_ref(page); + try_to_free_swap(page); unlock_page(page); } } diff --git a/mm/swap_state.c b/mm/swap_state.c index 3353c9029ce..81c825f67a7 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -72,10 +72,10 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) { int error; - BUG_ON(!PageLocked(page)); - BUG_ON(PageSwapCache(page)); - BUG_ON(PagePrivate(page)); - BUG_ON(!PageSwapBacked(page)); + VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON(PageSwapCache(page)); + VM_BUG_ON(!PageSwapBacked(page)); + error = radix_tree_preload(gfp_mask); if (!error) { page_cache_get(page); @@ -108,10 +108,9 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) */ void __delete_from_swap_cache(struct page *page) { - BUG_ON(!PageLocked(page)); - BUG_ON(!PageSwapCache(page)); - BUG_ON(PageWriteback(page)); - BUG_ON(PagePrivate(page)); + VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON(!PageSwapCache(page)); + VM_BUG_ON(PageWriteback(page)); radix_tree_delete(&swapper_space.page_tree, page_private(page)); set_page_private(page, 0); @@ -129,13 +128,13 @@ void __delete_from_swap_cache(struct page *page) * Allocate swap space for the page and add the page to the * swap cache. Caller needs to hold the page lock. 
*/ -int add_to_swap(struct page * page, gfp_t gfp_mask) +int add_to_swap(struct page *page) { swp_entry_t entry; int err; - BUG_ON(!PageLocked(page)); - BUG_ON(!PageUptodate(page)); + VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON(!PageUptodate(page)); for (;;) { entry = get_swap_page(); @@ -154,7 +153,7 @@ int add_to_swap(struct page * page, gfp_t gfp_mask) * Add it to the swap cache and mark it dirty */ err = add_to_swap_cache(page, entry, - gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN); + __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); switch (err) { case 0: /* Success */ @@ -196,14 +195,14 @@ void delete_from_swap_cache(struct page *page) * If we are the only user, then try to free up the swap cache. * * Its ok to check for PageSwapCache without the page lock - * here because we are going to recheck again inside - * exclusive_swap_page() _with_ the lock. + * here because we are going to recheck again inside + * try_to_free_swap() _with_ the lock. * - Marcelo */ static inline void free_swap_cache(struct page *page) { - if (PageSwapCache(page) && trylock_page(page)) { - remove_exclusive_swap_page(page); + if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) { + try_to_free_swap(page); unlock_page(page); } } diff --git a/mm/swapfile.c b/mm/swapfile.c index 54a9f87e516..eec5ca758a2 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -16,6 +16,7 @@ #include <linux/namei.h> #include <linux/shm.h> #include <linux/blkdev.h> +#include <linux/random.h> #include <linux/writeback.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> @@ -35,6 +36,7 @@ static DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; +long nr_swap_pages; long total_swap_pages; static int swap_overflow; static int least_priority; @@ -83,15 +85,96 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) up_read(&swap_unplug_sem); } +/* + * swapon tell device that all the old swap contents can be discarded, + * to allow the swap device to optimize its 
wear-levelling. + */ +static int discard_swap(struct swap_info_struct *si) +{ + struct swap_extent *se; + int err = 0; + + list_for_each_entry(se, &si->extent_list, list) { + sector_t start_block = se->start_block << (PAGE_SHIFT - 9); + sector_t nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); + + if (se->start_page == 0) { + /* Do not discard the swap header page! */ + start_block += 1 << (PAGE_SHIFT - 9); + nr_blocks -= 1 << (PAGE_SHIFT - 9); + if (!nr_blocks) + continue; + } + + err = blkdev_issue_discard(si->bdev, start_block, + nr_blocks, GFP_KERNEL); + if (err) + break; + + cond_resched(); + } + return err; /* That will often be -EOPNOTSUPP */ +} + +/* + * swap allocation tell device that a cluster of swap can now be discarded, + * to allow the swap device to optimize its wear-levelling. + */ +static void discard_swap_cluster(struct swap_info_struct *si, + pgoff_t start_page, pgoff_t nr_pages) +{ + struct swap_extent *se = si->curr_swap_extent; + int found_extent = 0; + + while (nr_pages) { + struct list_head *lh; + + if (se->start_page <= start_page && + start_page < se->start_page + se->nr_pages) { + pgoff_t offset = start_page - se->start_page; + sector_t start_block = se->start_block + offset; + sector_t nr_blocks = se->nr_pages - offset; + + if (nr_blocks > nr_pages) + nr_blocks = nr_pages; + start_page += nr_blocks; + nr_pages -= nr_blocks; + + if (!found_extent++) + si->curr_swap_extent = se; + + start_block <<= PAGE_SHIFT - 9; + nr_blocks <<= PAGE_SHIFT - 9; + if (blkdev_issue_discard(si->bdev, start_block, + nr_blocks, GFP_NOIO)) + break; + } + + lh = se->list.next; + if (lh == &si->extent_list) + lh = lh->next; + se = list_entry(lh, struct swap_extent, list); + } +} + +static int wait_for_discard(void *word) +{ + schedule(); + return 0; +} + #define SWAPFILE_CLUSTER 256 #define LATENCY_LIMIT 256 static inline unsigned long scan_swap_map(struct swap_info_struct *si) { - unsigned long offset, last_in_cluster; + unsigned long offset; + unsigned 
long scan_base; + unsigned long last_in_cluster = 0; int latency_ration = LATENCY_LIMIT; + int found_free_cluster = 0; - /* + /* * We try to cluster swap pages by allocating them sequentially * in swap. Once we've allocated SWAPFILE_CLUSTER pages this * way, however, we resort to first-free allocation, starting @@ -99,16 +182,42 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) * all over the entire swap partition, so that we reduce * overall disk seek times between swap pages. -- sct * But we do now try to find an empty cluster. -Andrea + * And we let swap pages go all over an SSD partition. Hugh */ si->flags += SWP_SCANNING; - if (unlikely(!si->cluster_nr)) { - si->cluster_nr = SWAPFILE_CLUSTER - 1; - if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) - goto lowest; + scan_base = offset = si->cluster_next; + + if (unlikely(!si->cluster_nr--)) { + if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { + si->cluster_nr = SWAPFILE_CLUSTER - 1; + goto checks; + } + if (si->flags & SWP_DISCARDABLE) { + /* + * Start range check on racing allocations, in case + * they overlap the cluster we eventually decide on + * (we scan without swap_lock to allow preemption). + * It's hardly conceivable that cluster_nr could be + * wrapped during our scan, but don't depend on it. + */ + if (si->lowest_alloc) + goto checks; + si->lowest_alloc = si->max; + si->highest_alloc = 0; + } spin_unlock(&swap_lock); - offset = si->lowest_bit; + /* + * If seek is expensive, start searching for new cluster from + * start of partition, to minimize the span of allocated swap. + * But if seek is cheap, search from our current position, so + * that swap is allocated from all over the partition: if the + * Flash Translation Layer only remaps within limited zones, + * we don't want to wear out the first zone too quickly. 
+ */ + if (!(si->flags & SWP_SOLIDSTATE)) + scan_base = offset = si->lowest_bit; last_in_cluster = offset + SWAPFILE_CLUSTER - 1; /* Locate the first empty (unaligned) cluster */ @@ -117,43 +226,124 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) last_in_cluster = offset + SWAPFILE_CLUSTER; else if (offset == last_in_cluster) { spin_lock(&swap_lock); - si->cluster_next = offset-SWAPFILE_CLUSTER+1; - goto cluster; + offset -= SWAPFILE_CLUSTER - 1; + si->cluster_next = offset; + si->cluster_nr = SWAPFILE_CLUSTER - 1; + found_free_cluster = 1; + goto checks; } if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; } } + + offset = si->lowest_bit; + last_in_cluster = offset + SWAPFILE_CLUSTER - 1; + + /* Locate the first empty (unaligned) cluster */ + for (; last_in_cluster < scan_base; offset++) { + if (si->swap_map[offset]) + last_in_cluster = offset + SWAPFILE_CLUSTER; + else if (offset == last_in_cluster) { + spin_lock(&swap_lock); + offset -= SWAPFILE_CLUSTER - 1; + si->cluster_next = offset; + si->cluster_nr = SWAPFILE_CLUSTER - 1; + found_free_cluster = 1; + goto checks; + } + if (unlikely(--latency_ration < 0)) { + cond_resched(); + latency_ration = LATENCY_LIMIT; + } + } + + offset = scan_base; spin_lock(&swap_lock); - goto lowest; + si->cluster_nr = SWAPFILE_CLUSTER - 1; + si->lowest_alloc = 0; } - si->cluster_nr--; -cluster: - offset = si->cluster_next; - if (offset > si->highest_bit) -lowest: offset = si->lowest_bit; -checks: if (!(si->flags & SWP_WRITEOK)) +checks: + if (!(si->flags & SWP_WRITEOK)) goto no_page; if (!si->highest_bit) goto no_page; - if (!si->swap_map[offset]) { - if (offset == si->lowest_bit) - si->lowest_bit++; - if (offset == si->highest_bit) - si->highest_bit--; - si->inuse_pages++; - if (si->inuse_pages == si->pages) { - si->lowest_bit = si->max; - si->highest_bit = 0; + if (offset > si->highest_bit) + scan_base = offset = si->lowest_bit; + if (si->swap_map[offset]) + goto scan; 
+ + if (offset == si->lowest_bit) + si->lowest_bit++; + if (offset == si->highest_bit) + si->highest_bit--; + si->inuse_pages++; + if (si->inuse_pages == si->pages) { + si->lowest_bit = si->max; + si->highest_bit = 0; + } + si->swap_map[offset] = 1; + si->cluster_next = offset + 1; + si->flags -= SWP_SCANNING; + + if (si->lowest_alloc) { + /* + * Only set when SWP_DISCARDABLE, and there's a scan + * for a free cluster in progress or just completed. + */ + if (found_free_cluster) { + /* + * To optimize wear-levelling, discard the + * old data of the cluster, taking care not to + * discard any of its pages that have already + * been allocated by racing tasks (offset has + * already stepped over any at the beginning). + */ + if (offset < si->highest_alloc && + si->lowest_alloc <= last_in_cluster) + last_in_cluster = si->lowest_alloc - 1; + si->flags |= SWP_DISCARDING; + spin_unlock(&swap_lock); + + if (offset < last_in_cluster) + discard_swap_cluster(si, offset, + last_in_cluster - offset + 1); + + spin_lock(&swap_lock); + si->lowest_alloc = 0; + si->flags &= ~SWP_DISCARDING; + + smp_mb(); /* wake_up_bit advises this */ + wake_up_bit(&si->flags, ilog2(SWP_DISCARDING)); + + } else if (si->flags & SWP_DISCARDING) { + /* + * Delay using pages allocated by racing tasks + * until the whole discard has been issued. We + * could defer that delay until swap_writepage, + * but it's easier to keep this self-contained. + */ + spin_unlock(&swap_lock); + wait_on_bit(&si->flags, ilog2(SWP_DISCARDING), + wait_for_discard, TASK_UNINTERRUPTIBLE); + spin_lock(&swap_lock); + } else { + /* + * Note pages allocated by racing tasks while + * scan for a free cluster is in progress, so + * that its final discard can exclude them. 
+ */ + if (offset < si->lowest_alloc) + si->lowest_alloc = offset; + if (offset > si->highest_alloc) + si->highest_alloc = offset; } - si->swap_map[offset] = 1; - si->cluster_next = offset + 1; - si->flags -= SWP_SCANNING; - return offset; } + return offset; +scan: spin_unlock(&swap_lock); while (++offset <= si->highest_bit) { if (!si->swap_map[offset]) { @@ -165,8 +355,18 @@ checks: if (!(si->flags & SWP_WRITEOK)) latency_ration = LATENCY_LIMIT; } } + offset = si->lowest_bit; + while (++offset < scan_base) { + if (!si->swap_map[offset]) { + spin_lock(&swap_lock); + goto checks; + } + if (unlikely(--latency_ration < 0)) { + cond_resched(); + latency_ration = LATENCY_LIMIT; + } + } spin_lock(&swap_lock); - goto lowest; no_page: si->flags -= SWP_SCANNING; @@ -268,7 +468,7 @@ bad_nofile: printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); out: return NULL; -} +} static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) { @@ -326,97 +526,58 @@ static inline int page_swapcount(struct page *page) } /* - * We can use this swap cache entry directly - * if there are no other references to it. + * We can write to an anon page without COW if there are no other references + * to it. And as a side-effect, free up its swap: because the old content + * on disk will never be read, and seeking back there to write new content + * later would only waste time away from clustering. */ -int can_share_swap_page(struct page *page) +int reuse_swap_page(struct page *page) { int count; - BUG_ON(!PageLocked(page)); + VM_BUG_ON(!PageLocked(page)); count = page_mapcount(page); - if (count <= 1 && PageSwapCache(page)) + if (count <= 1 && PageSwapCache(page)) { count += page_swapcount(page); + if (count == 1 && !PageWriteback(page)) { + delete_from_swap_cache(page); + SetPageDirty(page); + } + } return count == 1; } /* - * Work out if there are any other processes sharing this - * swap cache page. Free it if you can. Return success. 
+ * If swap is getting full, or if there are no more mappings of this page, + * then try_to_free_swap is called to free its swap space. */ -static int remove_exclusive_swap_page_count(struct page *page, int count) +int try_to_free_swap(struct page *page) { - int retval; - struct swap_info_struct * p; - swp_entry_t entry; - - BUG_ON(PagePrivate(page)); - BUG_ON(!PageLocked(page)); + VM_BUG_ON(!PageLocked(page)); if (!PageSwapCache(page)) return 0; if (PageWriteback(page)) return 0; - if (page_count(page) != count) /* us + cache + ptes */ - return 0; - - entry.val = page_private(page); - p = swap_info_get(entry); - if (!p) + if (page_swapcount(page)) return 0; - /* Is the only swap cache user the cache itself? */ - retval = 0; - if (p->swap_map[swp_offset(entry)] == 1) { - /* Recheck the page count with the swapcache lock held.. */ - spin_lock_irq(&swapper_space.tree_lock); - if ((page_count(page) == count) && !PageWriteback(page)) { - __delete_from_swap_cache(page); - SetPageDirty(page); - retval = 1; - } - spin_unlock_irq(&swapper_space.tree_lock); - } - spin_unlock(&swap_lock); - - if (retval) { - swap_free(entry); - page_cache_release(page); - } - - return retval; -} - -/* - * Most of the time the page should have two references: one for the - * process and one for the swap cache. - */ -int remove_exclusive_swap_page(struct page *page) -{ - return remove_exclusive_swap_page_count(page, 2); -} - -/* - * The pageout code holds an extra reference to the page. That raises - * the reference count to test for to 2 for a page that is only in the - * swap cache plus 1 for each process that maps the page. - */ -int remove_exclusive_swap_page_ref(struct page *page) -{ - return remove_exclusive_swap_page_count(page, 2 + page_mapcount(page)); + delete_from_swap_cache(page); + SetPageDirty(page); + return 1; } /* * Free the swap entry like above, but also try to * free the page cache entry if it is the last user. 
*/ -void free_swap_and_cache(swp_entry_t entry) +int free_swap_and_cache(swp_entry_t entry) { - struct swap_info_struct * p; + struct swap_info_struct *p; struct page *page = NULL; if (is_migration_entry(entry)) - return; + return 1; p = swap_info_get(entry); if (p) { @@ -430,20 +591,19 @@ void free_swap_and_cache(swp_entry_t entry) spin_unlock(&swap_lock); } if (page) { - int one_user; - - BUG_ON(PagePrivate(page)); - one_user = (page_count(page) == 2); - /* Only cache user (+us), or swap space full? Free it! */ - /* Also recheck PageSwapCache after page is locked (above) */ + /* + * Not mapped elsewhere, or swap space full? Free it! + * Also recheck PageSwapCache now page is locked (above). + */ if (PageSwapCache(page) && !PageWriteback(page) && - (one_user || vm_swap_full())) { + (!page_mapped(page) || vm_swap_full())) { delete_from_swap_cache(page); SetPageDirty(page); } unlock_page(page); page_cache_release(page); } + return p != NULL; } #ifdef CONFIG_HIBERNATION @@ -776,10 +936,10 @@ static int try_to_unuse(unsigned int type) break; } - /* + /* * Get a page for the entry, using the existing swap * cache page if there is one. Otherwise, get a clean - * page and read the swap into it. + * page and read the swap into it. */ swap_map = &si->swap_map[i]; entry = swp_entry(type, i); @@ -930,7 +1090,16 @@ static int try_to_unuse(unsigned int type) lock_page(page); wait_on_page_writeback(page); } - if (PageSwapCache(page)) + + /* + * It is conceivable that a racing task removed this page from + * swap cache just before we acquired the page lock at the top, + * or while we dropped it in unuse_mm(). The page might even + * be back in swap cache on another swap area: that we must not + * delete, since it may not have been written out to swap yet. 
+ */ + if (PageSwapCache(page) && + likely(page_private(page) == entry.val)) delete_from_swap_cache(page); /* @@ -1203,26 +1372,6 @@ out: return ret; } -#if 0 /* We don't need this yet */ -#include <linux/backing-dev.h> -int page_queue_congested(struct page *page) -{ - struct backing_dev_info *bdi; - - BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */ - - if (PageSwapCache(page)) { - swp_entry_t entry = { .val = page_private(page) }; - struct swap_info_struct *sis; - - sis = get_swap_info_struct(swp_type(entry)); - bdi = sis->bdev->bd_inode->i_mapping->backing_dev_info; - } else - bdi = page->mapping->backing_dev_info; - return bdi_write_congested(bdi); -} -#endif - asmlinkage long sys_swapoff(const char __user * specialfile) { struct swap_info_struct * p = NULL; @@ -1233,7 +1382,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile) char * pathname; int i, type, prev; int err; - + if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1253,7 +1402,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile) spin_lock(&swap_lock); for (type = swap_list.head; type >= 0; type = swap_info[type].next) { p = swap_info + type; - if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) { + if (p->flags & SWP_WRITEOK) { if (p->swap_file->f_mapping == mapping) break; } @@ -1426,12 +1575,12 @@ static int swap_show(struct seq_file *swap, void *v) file = ptr->swap_file; len = seq_path(swap, &file->f_path, " \t\n\\"); seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", - len < 40 ? 40 - len : 1, " ", - S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? + len < 40 ? 40 - len : 1, " ", + S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? 
"partition" : "file\t", - ptr->pages << (PAGE_SHIFT - 10), - ptr->inuse_pages << (PAGE_SHIFT - 10), - ptr->prio); + ptr->pages << (PAGE_SHIFT - 10), + ptr->inuse_pages << (PAGE_SHIFT - 10), + ptr->prio); return 0; } @@ -1487,12 +1636,11 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) int i, prev; int error; union swap_header *swap_header = NULL; - int swap_header_version; unsigned int nr_good_pages = 0; int nr_extents = 0; sector_t span; unsigned long maxpages = 1; - int swapfilesize; + unsigned long swapfilepages; unsigned short *swap_map = NULL; struct page *page = NULL; struct inode *inode = NULL; @@ -1570,7 +1718,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) goto bad_swap; } - swapfilesize = i_size_read(inode) >> PAGE_SHIFT; + swapfilepages = i_size_read(inode) >> PAGE_SHIFT; /* * Read the swap header. @@ -1584,101 +1732,86 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) error = PTR_ERR(page); goto bad_swap; } - kmap(page); - swap_header = page_address(page); + swap_header = kmap(page); - if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10)) - swap_header_version = 1; - else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10)) - swap_header_version = 2; - else { + if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { printk(KERN_ERR "Unable to find swap-space signature\n"); error = -EINVAL; goto bad_swap; } - - switch (swap_header_version) { - case 1: - printk(KERN_ERR "version 0 swap is no longer supported. " - "Use mkswap -v1 %s\n", name); + + /* swap partition endianess hack... 
*/ + if (swab32(swap_header->info.version) == 1) { + swab32s(&swap_header->info.version); + swab32s(&swap_header->info.last_page); + swab32s(&swap_header->info.nr_badpages); + for (i = 0; i < swap_header->info.nr_badpages; i++) + swab32s(&swap_header->info.badpages[i]); + } + /* Check the swap header's sub-version */ + if (swap_header->info.version != 1) { + printk(KERN_WARNING + "Unable to handle swap header version %d\n", + swap_header->info.version); error = -EINVAL; goto bad_swap; - case 2: - /* swap partition endianess hack... */ - if (swab32(swap_header->info.version) == 1) { - swab32s(&swap_header->info.version); - swab32s(&swap_header->info.last_page); - swab32s(&swap_header->info.nr_badpages); - for (i = 0; i < swap_header->info.nr_badpages; i++) - swab32s(&swap_header->info.badpages[i]); - } - /* Check the swap header's sub-version and the size of - the swap file and bad block lists */ - if (swap_header->info.version != 1) { - printk(KERN_WARNING - "Unable to handle swap header version %d\n", - swap_header->info.version); - error = -EINVAL; - goto bad_swap; - } + } - p->lowest_bit = 1; - p->cluster_next = 1; + p->lowest_bit = 1; + p->cluster_next = 1; - /* - * Find out how many pages are allowed for a single swap - * device. There are two limiting factors: 1) the number of - * bits for the swap offset in the swp_entry_t type and - * 2) the number of bits in the a swap pte as defined by - * the different architectures. In order to find the - * largest possible bit mask a swap entry with swap type 0 - * and swap offset ~0UL is created, encoded to a swap pte, - * decoded to a swp_entry_t again and finally the swap - * offset is extracted. This will mask all the bits from - * the initial ~0UL mask that can't be encoded in either - * the swp_entry_t or the architecture definition of a - * swap pte. 
- */ - maxpages = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0,~0UL)))) - 1; - if (maxpages > swap_header->info.last_page) - maxpages = swap_header->info.last_page; - p->highest_bit = maxpages - 1; + /* + * Find out how many pages are allowed for a single swap + * device. There are two limiting factors: 1) the number of + * bits for the swap offset in the swp_entry_t type and + * 2) the number of bits in the a swap pte as defined by + * the different architectures. In order to find the + * largest possible bit mask a swap entry with swap type 0 + * and swap offset ~0UL is created, encoded to a swap pte, + * decoded to a swp_entry_t again and finally the swap + * offset is extracted. This will mask all the bits from + * the initial ~0UL mask that can't be encoded in either + * the swp_entry_t or the architecture definition of a + * swap pte. + */ + maxpages = swp_offset(pte_to_swp_entry( + swp_entry_to_pte(swp_entry(0, ~0UL)))) - 1; + if (maxpages > swap_header->info.last_page) + maxpages = swap_header->info.last_page; + p->highest_bit = maxpages - 1; - error = -EINVAL; - if (!maxpages) - goto bad_swap; - if (swapfilesize && maxpages > swapfilesize) { - printk(KERN_WARNING - "Swap area shorter than signature indicates\n"); - goto bad_swap; - } - if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) - goto bad_swap; - if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) - goto bad_swap; + error = -EINVAL; + if (!maxpages) + goto bad_swap; + if (swapfilepages && maxpages > swapfilepages) { + printk(KERN_WARNING + "Swap area shorter than signature indicates\n"); + goto bad_swap; + } + if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) + goto bad_swap; + if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) + goto bad_swap; - /* OK, set up the swap map and apply the bad block list */ - swap_map = vmalloc(maxpages * sizeof(short)); - if (!swap_map) { - error = -ENOMEM; - goto bad_swap; - } + /* OK, set up the swap map and apply the bad 
block list */ + swap_map = vmalloc(maxpages * sizeof(short)); + if (!swap_map) { + error = -ENOMEM; + goto bad_swap; + } - error = 0; - memset(swap_map, 0, maxpages * sizeof(short)); - for (i = 0; i < swap_header->info.nr_badpages; i++) { - int page_nr = swap_header->info.badpages[i]; - if (page_nr <= 0 || page_nr >= swap_header->info.last_page) - error = -EINVAL; - else - swap_map[page_nr] = SWAP_MAP_BAD; - } - nr_good_pages = swap_header->info.last_page - - swap_header->info.nr_badpages - - 1 /* header page */; - if (error) + memset(swap_map, 0, maxpages * sizeof(short)); + for (i = 0; i < swap_header->info.nr_badpages; i++) { + int page_nr = swap_header->info.badpages[i]; + if (page_nr <= 0 || page_nr >= swap_header->info.last_page) { + error = -EINVAL; goto bad_swap; + } + swap_map[page_nr] = SWAP_MAP_BAD; } + nr_good_pages = swap_header->info.last_page - + swap_header->info.nr_badpages - + 1 /* header page */; if (nr_good_pages) { swap_map[0] = SWAP_MAP_BAD; @@ -1697,6 +1830,13 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) goto bad_swap; } + if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { + p->flags |= SWP_SOLIDSTATE; + p->cluster_next = 1 + (random32() % p->highest_bit); + } + if (discard_swap(p) == 0) + p->flags |= SWP_DISCARDABLE; + mutex_lock(&swapon_mutex); spin_lock(&swap_lock); if (swap_flags & SWAP_FLAG_PREFER) @@ -1705,14 +1845,16 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) else p->prio = --least_priority; p->swap_map = swap_map; - p->flags = SWP_ACTIVE; + p->flags |= SWP_WRITEOK; nr_swap_pages += nr_good_pages; total_swap_pages += nr_good_pages; printk(KERN_INFO "Adding %uk swap on %s. " - "Priority:%d extents:%d across:%lluk\n", + "Priority:%d extents:%d across:%lluk %s%s\n", nr_good_pages<<(PAGE_SHIFT-10), name, p->prio, - nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10)); + nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), + (p->flags & SWP_SOLIDSTATE) ? 
"SS" : "", + (p->flags & SWP_DISCARDABLE) ? "D" : ""); /* insert swap space into swap_list: */ prev = -1; diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c deleted file mode 100644 index 3e67d575ee6..00000000000 --- a/mm/tiny-shmem.c +++ /dev/null @@ -1,134 +0,0 @@ -/* - * tiny-shmem.c: simple shmemfs and tmpfs using ramfs code - * - * Matt Mackall <mpm@selenic.com> January, 2004 - * derived from mm/shmem.c and fs/ramfs/inode.c - * - * This is intended for small system where the benefits of the full - * shmem code (swap-backed and resource-limited) are outweighed by - * their complexity. On systems without swap this code should be - * effectively equivalent, but much lighter weight. - */ - -#include <linux/fs.h> -#include <linux/init.h> -#include <linux/vfs.h> -#include <linux/mount.h> -#include <linux/file.h> -#include <linux/mm.h> -#include <linux/module.h> -#include <linux/swap.h> -#include <linux/ramfs.h> - -static struct file_system_type tmpfs_fs_type = { - .name = "tmpfs", - .get_sb = ramfs_get_sb, - .kill_sb = kill_litter_super, -}; - -static struct vfsmount *shm_mnt; - -static int __init init_tmpfs(void) -{ - BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); - - shm_mnt = kern_mount(&tmpfs_fs_type); - BUG_ON(IS_ERR(shm_mnt)); - - return 0; -} -module_init(init_tmpfs) - -/** - * shmem_file_setup - get an unlinked file living in tmpfs - * @name: name for dentry (to be seen in /proc/<pid>/maps - * @size: size to be set for the file - * @flags: vm_flags - */ -struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) -{ - int error; - struct file *file; - struct inode *inode; - struct dentry *dentry, *root; - struct qstr this; - - if (IS_ERR(shm_mnt)) - return (void *)shm_mnt; - - error = -ENOMEM; - this.name = name; - this.len = strlen(name); - this.hash = 0; /* will go */ - root = shm_mnt->mnt_root; - dentry = d_alloc(root, &this); - if (!dentry) - goto put_memory; - - error = -ENFILE; - file = get_empty_filp(); - if (!file) - goto put_dentry; - 
- error = -ENOSPC; - inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0); - if (!inode) - goto close_file; - - d_instantiate(dentry, inode); - inode->i_size = size; - inode->i_nlink = 0; /* It is unlinked */ - init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, - &ramfs_file_operations); - -#ifndef CONFIG_MMU - error = ramfs_nommu_expand_for_mapping(inode, size); - if (error) - goto close_file; -#endif - return file; - -close_file: - put_filp(file); -put_dentry: - dput(dentry); -put_memory: - return ERR_PTR(error); -} -EXPORT_SYMBOL_GPL(shmem_file_setup); - -/** - * shmem_zero_setup - setup a shared anonymous mapping - * @vma: the vma to be mmapped is prepared by do_mmap_pgoff - */ -int shmem_zero_setup(struct vm_area_struct *vma) -{ - struct file *file; - loff_t size = vma->vm_end - vma->vm_start; - - file = shmem_file_setup("dev/zero", size, vma->vm_flags); - if (IS_ERR(file)) - return PTR_ERR(file); - - if (vma->vm_file) - fput(vma->vm_file); - vma->vm_file = file; - vma->vm_ops = &generic_file_vm_ops; - return 0; -} - -int shmem_unuse(swp_entry_t entry, struct page *page) -{ - return 0; -} - -#ifndef CONFIG_MMU -unsigned long shmem_get_unmapped_area(struct file *file, - unsigned long addr, - unsigned long len, - unsigned long pgoff, - unsigned long flags) -{ - return ramfs_nommu_get_unmapped_area(file, addr, len, pgoff, flags); -} -#endif diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 7465f22fec0..c5db9a7264d 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -14,6 +14,7 @@ #include <linux/highmem.h> #include <linux/slab.h> #include <linux/spinlock.h> +#include <linux/mutex.h> #include <linux/interrupt.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> @@ -381,8 +382,9 @@ found: goto retry; } if (printk_ratelimit()) - printk(KERN_WARNING "vmap allocation failed: " - "use vmalloc=<size> to increase size.\n"); + printk(KERN_WARNING + "vmap allocation for size %lu failed: " + "use vmalloc=<size> to increase size.\n", size); return 
ERR_PTR(-EBUSY); } @@ -432,6 +434,27 @@ static void unmap_vmap_area(struct vmap_area *va) vunmap_page_range(va->va_start, va->va_end); } +static void vmap_debug_free_range(unsigned long start, unsigned long end) +{ + /* + * Unmap page tables and force a TLB flush immediately if + * CONFIG_DEBUG_PAGEALLOC is set. This catches use after free + * bugs similarly to those in linear kernel virtual address + * space after a page has been freed. + * + * All the lazy freeing logic is still retained, in order to + * minimise intrusiveness of this debugging feature. + * + * This is going to be *slow* (linear kernel virtual address + * debugging doesn't do a broadcast TLB flush so it is a lot + * faster). + */ +#ifdef CONFIG_DEBUG_PAGEALLOC + vunmap_page_range(start, end); + flush_tlb_kernel_range(start, end); +#endif +} + /* * lazy_max_pages is the maximum amount of virtual address space we gather up * before attempting to purge with a TLB flush. @@ -472,7 +495,7 @@ static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, int sync, int force_flush) { - static DEFINE_SPINLOCK(purge_lock); + static DEFINE_MUTEX(purge_lock); LIST_HEAD(valist); struct vmap_area *va; int nr = 0; @@ -483,10 +506,10 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, * the case that isn't actually used at the moment anyway. 
*/ if (!sync && !force_flush) { - if (!spin_trylock(&purge_lock)) + if (!mutex_trylock(&purge_lock)) return; } else - spin_lock(&purge_lock); + mutex_lock(&purge_lock); rcu_read_lock(); list_for_each_entry_rcu(va, &vmap_area_list, list) { @@ -518,7 +541,7 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, __free_vmap_area(va); spin_unlock(&vmap_area_lock); } - spin_unlock(&purge_lock); + mutex_unlock(&purge_lock); } /* @@ -912,6 +935,7 @@ void vm_unmap_ram(const void *mem, unsigned int count) BUG_ON(addr & (PAGE_SIZE-1)); debug_check_no_locks_freed(mem, size); + vmap_debug_free_range(addr, addr+size); if (likely(count <= VMAP_MAX_ALLOC)) vb_free(mem, size); @@ -1128,6 +1152,8 @@ struct vm_struct *remove_vm_area(const void *addr) if (va && va->flags & VM_VM_AREA) { struct vm_struct *vm = va->private; struct vm_struct *tmp, **p; + + vmap_debug_free_range(va->va_start, va->va_end); free_unmap_vmap_area(va); vm->size -= PAGE_SIZE; @@ -1375,7 +1401,8 @@ void *vmalloc_user(unsigned long size) struct vm_struct *area; void *ret; - ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); + ret = __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, + PAGE_KERNEL, -1, __builtin_return_address(0)); if (ret) { area = find_vm_area(ret); area->flags |= VM_USERMAP; @@ -1420,7 +1447,8 @@ EXPORT_SYMBOL(vmalloc_node); void *vmalloc_exec(unsigned long size) { - return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC); + return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, + -1, __builtin_return_address(0)); } #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) @@ -1440,7 +1468,8 @@ void *vmalloc_exec(unsigned long size) */ void *vmalloc_32(unsigned long size) { - return __vmalloc(size, GFP_VMALLOC32, PAGE_KERNEL); + return __vmalloc_node(size, GFP_VMALLOC32, PAGE_KERNEL, + -1, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_32); @@ -1456,7 +1485,8 @@ void *vmalloc_32_user(unsigned long 
size) struct vm_struct *area; void *ret; - ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL); + ret = __vmalloc_node(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, + -1, __builtin_return_address(0)); if (ret) { area = find_vm_area(ret); area->flags |= VM_USERMAP; diff --git a/mm/vmscan.c b/mm/vmscan.c index d196f46c880..b07c48b09a9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -52,6 +52,9 @@ struct scan_control { /* Incremented by the number of inactive pages that were scanned */ unsigned long nr_scanned; + /* Number of pages freed so far during a call to shrink_zones() */ + unsigned long nr_reclaimed; + /* This context's GFP mask */ gfp_t gfp_mask; @@ -617,7 +620,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, referenced && page_mapping_inuse(page)) goto activate_locked; -#ifdef CONFIG_SWAP /* * Anonymous process memory has backing store? * Try to allocate it some swap space here. @@ -625,20 +627,10 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (PageAnon(page) && !PageSwapCache(page)) { if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; - switch (try_to_munlock(page)) { - case SWAP_FAIL: /* shouldn't happen */ - case SWAP_AGAIN: - goto keep_locked; - case SWAP_MLOCK: - goto cull_mlocked; - case SWAP_SUCCESS: - ; /* fall thru'; add to swap cache */ - } - if (!add_to_swap(page, GFP_ATOMIC)) + if (!add_to_swap(page)) goto activate_locked; may_enter_fs = 1; } -#endif /* CONFIG_SWAP */ mapping = page_mapping(page); @@ -752,6 +744,8 @@ free_it: continue; cull_mlocked: + if (PageSwapCache(page)) + try_to_free_swap(page); unlock_page(page); putback_lru_page(page); continue; @@ -759,7 +753,7 @@ cull_mlocked: activate_locked: /* Not a candidate for swapping, so reclaim swap space. 
*/ if (PageSwapCache(page) && vm_swap_full()) - remove_exclusive_swap_page_ref(page); + try_to_free_swap(page); VM_BUG_ON(PageActive(page)); SetPageActive(page); pgactivate++; @@ -1173,11 +1167,6 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority) zone->prev_priority = priority; } -static inline int zone_is_near_oom(struct zone *zone) -{ - return zone->pages_scanned >= (zone_lru_pages(zone) * 3); -} - /* * This moves pages from the active list to the inactive list. * @@ -1248,6 +1237,13 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, list_add(&page->lru, &l_inactive); } + /* + * Move the pages to the [file or anon] inactive list. + */ + pagevec_init(&pvec, 1); + pgmoved = 0; + lru = LRU_BASE + file * LRU_FILE; + spin_lock_irq(&zone->lru_lock); /* * Count referenced pages from currently used mappings as @@ -1255,15 +1251,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, * This helps balance scan pressure between file and anonymous * pages in get_scan_ratio. */ - zone->recent_rotated[!!file] += pgmoved; - - /* - * Move the pages to the [file or anon] inactive list. - */ - pagevec_init(&pvec, 1); + if (scan_global_lru(sc)) + zone->recent_rotated[!!file] += pgmoved; - pgmoved = 0; - lru = LRU_BASE + file * LRU_FILE; while (!list_empty(&l_inactive)) { page = lru_to_page(&l_inactive); prefetchw_prev_lru_page(page, &l_inactive, flags); @@ -1336,12 +1326,6 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, unsigned long anon_prio, file_prio; unsigned long ap, fp; - anon = zone_page_state(zone, NR_ACTIVE_ANON) + - zone_page_state(zone, NR_INACTIVE_ANON); - file = zone_page_state(zone, NR_ACTIVE_FILE) + - zone_page_state(zone, NR_INACTIVE_FILE); - free = zone_page_state(zone, NR_FREE_PAGES); - /* If we have no swap space, do not bother scanning anon pages. 
*/ if (nr_swap_pages <= 0) { percent[0] = 0; @@ -1349,6 +1333,12 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, return; } + anon = zone_page_state(zone, NR_ACTIVE_ANON) + + zone_page_state(zone, NR_INACTIVE_ANON); + file = zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_FILE); + free = zone_page_state(zone, NR_FREE_PAGES); + /* If we have very few page cache pages, force-scan anon pages. */ if (unlikely(file + free <= zone->pages_high)) { percent[0] = 100; @@ -1408,14 +1398,15 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ -static unsigned long shrink_zone(int priority, struct zone *zone, +static void shrink_zone(int priority, struct zone *zone, struct scan_control *sc) { unsigned long nr[NR_LRU_LISTS]; unsigned long nr_to_scan; - unsigned long nr_reclaimed = 0; unsigned long percent[2]; /* anon @ 0; file @ 1 */ enum lru_list l; + unsigned long nr_reclaimed = sc->nr_reclaimed; + unsigned long swap_cluster_max = sc->swap_cluster_max; get_scan_ratio(zone, sc, percent); @@ -1431,7 +1422,7 @@ static unsigned long shrink_zone(int priority, struct zone *zone, } zone->lru[l].nr_scan += scan; nr[l] = zone->lru[l].nr_scan; - if (nr[l] >= sc->swap_cluster_max) + if (nr[l] >= swap_cluster_max) zone->lru[l].nr_scan = 0; else nr[l] = 0; @@ -1450,16 +1441,28 @@ static unsigned long shrink_zone(int priority, struct zone *zone, nr[LRU_INACTIVE_FILE]) { for_each_evictable_lru(l) { if (nr[l]) { - nr_to_scan = min(nr[l], - (unsigned long)sc->swap_cluster_max); + nr_to_scan = min(nr[l], swap_cluster_max); nr[l] -= nr_to_scan; nr_reclaimed += shrink_list(l, nr_to_scan, - zone, sc, priority); + zone, sc, priority); } } + /* + * On large memory systems, scan >> priority can become + * really large. This is fine for the starting priority; + * we want to put equal scanning pressure on each zone. 
+ * However, if the VM has a harder time of freeing pages, + * with multiple processes reclaiming pages, the total + * freeing target can get unreasonably large. + */ + if (nr_reclaimed > swap_cluster_max && + priority < DEF_PRIORITY && !current_is_kswapd()) + break; } + sc->nr_reclaimed = nr_reclaimed; + /* * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. @@ -1470,7 +1473,6 @@ static unsigned long shrink_zone(int priority, struct zone *zone, shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); throttle_vm_writeout(sc->gfp_mask); - return nr_reclaimed; } /* @@ -1484,16 +1486,13 @@ static unsigned long shrink_zone(int priority, struct zone *zone, * b) The zones may be over pages_high but they must go *over* pages_high to * satisfy the `incremental min' zone defense algorithm. * - * Returns the number of reclaimed pages. - * * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. 
*/ -static unsigned long shrink_zones(int priority, struct zonelist *zonelist, +static void shrink_zones(int priority, struct zonelist *zonelist, struct scan_control *sc) { enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); - unsigned long nr_reclaimed = 0; struct zoneref *z; struct zone *zone; @@ -1524,10 +1523,8 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist, priority); } - nr_reclaimed += shrink_zone(priority, zone, sc); + shrink_zone(priority, zone, sc); } - - return nr_reclaimed; } /* @@ -1552,7 +1549,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, int priority; unsigned long ret = 0; unsigned long total_scanned = 0; - unsigned long nr_reclaimed = 0; struct reclaim_state *reclaim_state = current->reclaim_state; unsigned long lru_pages = 0; struct zoneref *z; @@ -1580,7 +1576,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, sc->nr_scanned = 0; if (!priority) disable_swap_token(); - nr_reclaimed += shrink_zones(priority, zonelist, sc); + shrink_zones(priority, zonelist, sc); /* * Don't shrink slabs when reclaiming memory from * over limit cgroups @@ -1588,13 +1584,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, if (scan_global_lru(sc)) { shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); if (reclaim_state) { - nr_reclaimed += reclaim_state->reclaimed_slab; + sc->nr_reclaimed += reclaim_state->reclaimed_slab; reclaim_state->reclaimed_slab = 0; } } total_scanned += sc->nr_scanned; - if (nr_reclaimed >= sc->swap_cluster_max) { - ret = nr_reclaimed; + if (sc->nr_reclaimed >= sc->swap_cluster_max) { + ret = sc->nr_reclaimed; goto out; } @@ -1617,7 +1613,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, } /* top priority shrink_zones still had more to do? 
don't OOM, then */ if (!sc->all_unreclaimable && scan_global_lru(sc)) - ret = nr_reclaimed; + ret = sc->nr_reclaimed; out: /* * Now that we've scanned all the zones at this priority level, note @@ -1712,7 +1708,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) int priority; int i; unsigned long total_scanned; - unsigned long nr_reclaimed; struct reclaim_state *reclaim_state = current->reclaim_state; struct scan_control sc = { .gfp_mask = GFP_KERNEL, @@ -1731,7 +1726,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) loop_again: total_scanned = 0; - nr_reclaimed = 0; + sc.nr_reclaimed = 0; sc.may_writepage = !laptop_mode; count_vm_event(PAGEOUTRUN); @@ -1817,11 +1812,11 @@ loop_again: */ if (!zone_watermark_ok(zone, order, 8*zone->pages_high, end_zone, 0)) - nr_reclaimed += shrink_zone(priority, zone, &sc); + shrink_zone(priority, zone, &sc); reclaim_state->reclaimed_slab = 0; nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages); - nr_reclaimed += reclaim_state->reclaimed_slab; + sc.nr_reclaimed += reclaim_state->reclaimed_slab; total_scanned += sc.nr_scanned; if (zone_is_all_unreclaimable(zone)) continue; @@ -1835,7 +1830,7 @@ loop_again: * even in laptop mode */ if (total_scanned > SWAP_CLUSTER_MAX * 2 && - total_scanned > nr_reclaimed + nr_reclaimed / 2) + total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) sc.may_writepage = 1; } if (all_zones_ok) @@ -1853,7 +1848,7 @@ loop_again: * matches the direct reclaim path behaviour in terms of impact * on zone->*_priority. */ - if (nr_reclaimed >= SWAP_CLUSTER_MAX) + if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) break; } out: @@ -1872,10 +1867,27 @@ out: try_to_freeze(); + /* + * Fragmentation may mean that the system cannot be + * rebalanced for high-order allocations in all zones. + * At this point, if nr_reclaimed < SWAP_CLUSTER_MAX, + * it means the zones have been fully scanned and are still + * not balanced. 
For high-order allocations, there is + * little point trying all over again as kswapd may + * infinite loop. + * + * Instead, recheck all watermarks at order-0 as they + * are the most important. If watermarks are ok, kswapd will go + * back to sleep. High-order users can still perform direct + * reclaim if they wish. + */ + if (sc.nr_reclaimed < SWAP_CLUSTER_MAX) + order = sc.order = 0; + goto loop_again; } - return nr_reclaimed; + return sc.nr_reclaimed; } /* @@ -2227,7 +2239,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) struct task_struct *p = current; struct reclaim_state reclaim_state; int priority; - unsigned long nr_reclaimed = 0; struct scan_control sc = { .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), @@ -2260,9 +2271,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) priority = ZONE_RECLAIM_PRIORITY; do { note_zone_scanning_priority(zone, priority); - nr_reclaimed += shrink_zone(priority, zone, &sc); + shrink_zone(priority, zone, &sc); priority--; - } while (priority >= 0 && nr_reclaimed < nr_pages); + } while (priority >= 0 && sc.nr_reclaimed < nr_pages); } slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE); @@ -2286,13 +2297,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) * Update nr_reclaimed by the number of slab pages we * reclaimed from this zone. */ - nr_reclaimed += slab_reclaimable - + sc.nr_reclaimed += slab_reclaimable - zone_page_state(zone, NR_SLAB_RECLAIMABLE); } p->reclaim_state = NULL; current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); - return nr_reclaimed >= nr_pages; + return sc.nr_reclaimed >= nr_pages; } int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) @@ -2472,7 +2483,7 @@ void scan_mapping_unevictable_pages(struct address_space *mapping) * back onto @zone's unevictable list. 
*/ #define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */ -void scan_zone_unevictable_pages(struct zone *zone) +static void scan_zone_unevictable_pages(struct zone *zone) { struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list; unsigned long scan; @@ -2514,7 +2525,7 @@ void scan_zone_unevictable_pages(struct zone *zone) * that has possibly/probably made some previously unevictable pages * evictable. */ -void scan_all_zones_unevictable_pages(void) +static void scan_all_zones_unevictable_pages(void) { struct zone *zone; diff --git a/samples/firmware_class/firmware_sample_driver.c b/samples/firmware_class/firmware_sample_driver.c index 11114f389c4..219a2989660 100644 --- a/samples/firmware_class/firmware_sample_driver.c +++ b/samples/firmware_class/firmware_sample_driver.c @@ -100,7 +100,7 @@ static void sample_probe_async(void) " request_firmware_nowait failed\n"); } -static int sample_init(void) +static int __init sample_init(void) { device_initialize(&ghost_device); /* since there is no real hardware insertion I just call the diff --git a/samples/kobject/kobject-example.c b/samples/kobject/kobject-example.c index 08d0d3ff326..8d9b55a1202 100644 --- a/samples/kobject/kobject-example.c +++ b/samples/kobject/kobject-example.c @@ -101,7 +101,7 @@ static struct attribute_group attr_group = { static struct kobject *example_kobj; -static int example_init(void) +static int __init example_init(void) { int retval; @@ -126,7 +126,7 @@ static int example_init(void) return retval; } -static void example_exit(void) +static void __exit example_exit(void) { kobject_put(example_kobj); } diff --git a/samples/kobject/kset-example.c b/samples/kobject/kset-example.c index 7395c0bbae1..45b7d56fb54 100644 --- a/samples/kobject/kset-example.c +++ b/samples/kobject/kset-example.c @@ -229,7 +229,7 @@ static void destroy_foo_obj(struct foo_obj *foo) kobject_put(&foo->kobj); } -static int example_init(void) +static int __init example_init(void) { /* * 
Create a kset with the name of "kset_example", @@ -264,7 +264,7 @@ foo_error: return -EINVAL; } -static void example_exit(void) +static void __exit example_exit(void) { destroy_foo_obj(baz_obj); destroy_foo_obj(bar_obj); diff --git a/samples/markers/marker-example.c b/samples/markers/marker-example.c index e90dc5d0439..e9cd9c0bc84 100644 --- a/samples/markers/marker-example.c +++ b/samples/markers/marker-example.c @@ -30,7 +30,7 @@ static struct file_operations mark_ops = { .open = my_open, }; -static int example_init(void) +static int __init example_init(void) { printk(KERN_ALERT "example init\n"); pentry_example = proc_create("marker-example", 0444, NULL, &mark_ops); @@ -39,7 +39,7 @@ static int example_init(void) return 0; } -static void example_exit(void) +static void __exit example_exit(void) { printk(KERN_ALERT "example exit\n"); remove_proc_entry("marker-example", NULL); diff --git a/samples/tracepoints/tracepoint-probe-sample.c b/samples/tracepoints/tracepoint-probe-sample.c index e3a964889dc..9e60eb6ca2d 100644 --- a/samples/tracepoints/tracepoint-probe-sample.c +++ b/samples/tracepoints/tracepoint-probe-sample.c @@ -28,7 +28,7 @@ static void probe_subsys_eventb(void) printk(KERN_INFO "Event B is encountered\n"); } -int __init tp_sample_trace_init(void) +static int __init tp_sample_trace_init(void) { int ret; @@ -42,7 +42,7 @@ int __init tp_sample_trace_init(void) module_init(tp_sample_trace_init); -void __exit tp_sample_trace_exit(void) +static void __exit tp_sample_trace_exit(void) { unregister_trace_subsys_eventb(probe_subsys_eventb); unregister_trace_subsys_event(probe_subsys_event); diff --git a/samples/tracepoints/tracepoint-probe-sample2.c b/samples/tracepoints/tracepoint-probe-sample2.c index 685a5acb456..be2a960573f 100644 --- a/samples/tracepoints/tracepoint-probe-sample2.c +++ b/samples/tracepoints/tracepoint-probe-sample2.c @@ -18,7 +18,7 @@ static void probe_subsys_event(struct inode *inode, struct file *file) inode->i_ino); } -int __init 
tp_sample_trace_init(void) +static int __init tp_sample_trace_init(void) { int ret; @@ -30,7 +30,7 @@ int __init tp_sample_trace_init(void) module_init(tp_sample_trace_init); -void __exit tp_sample_trace_exit(void) +static void __exit tp_sample_trace_exit(void) { unregister_trace_subsys_event(probe_subsys_event); tracepoint_synchronize_unregister(); diff --git a/samples/tracepoints/tracepoint-sample.c b/samples/tracepoints/tracepoint-sample.c index 00d169792a3..68d5dc0310e 100644 --- a/samples/tracepoints/tracepoint-sample.c +++ b/samples/tracepoints/tracepoint-sample.c @@ -32,7 +32,7 @@ static struct file_operations mark_ops = { .open = my_open, }; -static int example_init(void) +static int __init example_init(void) { printk(KERN_ALERT "example init\n"); pentry_example = proc_create("tracepoint-example", 0444, NULL, @@ -42,7 +42,7 @@ static int example_init(void) return 0; } -static void example_exit(void) +static void __exit example_exit(void) { printk(KERN_ALERT "example exit\n"); remove_proc_entry("tracepoint-example", NULL); diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index f88bb3e21cd..7bed4ed2c51 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1,7 +1,8 @@ #!/usr/bin/perl -w # (c) 2001, Dave Jones. 
<davej@redhat.com> (the file handling bit) # (c) 2005, Joel Schopp <jschopp@austin.ibm.com> (the ugly bit) -# (c) 2007, Andy Whitcroft <apw@uk.ibm.com> (new conditions, test suite, etc) +# (c) 2007,2008, Andy Whitcroft <apw@uk.ibm.com> (new conditions, test suite) +# (c) 2008, Andy Whitcroft <apw@canonical.com> # Licensed under the terms of the GNU GPL License version 2 use strict; @@ -9,7 +10,7 @@ use strict; my $P = $0; $P =~ s@.*/@@g; -my $V = '0.24'; +my $V = '0.26'; use Getopt::Long qw(:config no_auto_abbrev); @@ -68,7 +69,9 @@ my $dbg_possible = 0; my $dbg_type = 0; my $dbg_attr = 0; for my $key (keys %debug) { - eval "\${dbg_$key} = '$debug{$key}';" + ## no critic + eval "\${dbg_$key} = '$debug{$key}';"; + die "$@" if ($@); } if ($terse) { @@ -116,7 +119,8 @@ our $Attribute = qr{ __(?:mem|cpu|dev|)(?:initdata|init)| ____cacheline_aligned| ____cacheline_aligned_in_smp| - ____cacheline_internodealigned_in_smp + ____cacheline_internodealigned_in_smp| + __weak }x; our $Modifier; our $Inline = qr{inline|__always_inline|noinline}; @@ -125,6 +129,7 @@ our $Lval = qr{$Ident(?:$Member)*}; our $Constant = qr{(?:[0-9]+|0x[0-9a-fA-F]+)[UL]*}; our $Assignment = qr{(?:\*\=|/=|%=|\+=|-=|<<=|>>=|&=|\^=|\|=|=)}; +our $Compare = qr{<=|>=|==|!=|<|>}; our $Operators = qr{ <=|>=|==|!=| =>|->|<<|>>|<|>|!|~| @@ -190,7 +195,7 @@ sub build_types { }x; $Type = qr{ $NonptrType - (?:\s*\*+\s*const|\s*\*+|(?:\s*\[\s*\])+)? + (?:[\s\*]+\s*const|[\s\*]+|(?:\s*\[\s*\])+)? 
(?:\s+$Inline|\s+$Modifier)* }x; $Declare = qr{(?:$Storage\s+)?$Type}; @@ -203,9 +208,9 @@ my @dep_includes = (); my @dep_functions = (); my $removal = "Documentation/feature-removal-schedule.txt"; if ($tree && -f "$root/$removal") { - open(REMOVE, "<$root/$removal") || + open(my $REMOVE, '<', "$root/$removal") || die "$P: $removal: open failed - $!\n"; - while (<REMOVE>) { + while (<$REMOVE>) { if (/^Check:\s+(.*\S)/) { for my $entry (split(/[, ]+/, $1)) { if ($entry =~ m@include/(.*)@) { @@ -217,17 +222,21 @@ if ($tree && -f "$root/$removal") { } } } + close($REMOVE); } my @rawlines = (); my @lines = (); my $vname; for my $filename (@ARGV) { + my $FILE; if ($file) { - open(FILE, "diff -u /dev/null $filename|") || + open($FILE, '-|', "diff -u /dev/null $filename") || die "$P: $filename: diff failed - $!\n"; + } elsif ($filename eq '-') { + open($FILE, '<&STDIN'); } else { - open(FILE, "<$filename") || + open($FILE, '<', "$filename") || die "$P: $filename: open failed - $!\n"; } if ($filename eq '-') { @@ -235,11 +244,11 @@ for my $filename (@ARGV) { } else { $vname = $filename; } - while (<FILE>) { + while (<$FILE>) { chomp; push(@rawlines, $_); } - close(FILE); + close($FILE); if (!process($filename)) { $exit = 1; } @@ -366,7 +375,7 @@ sub sanitise_line { } } - #print "SQ:$sanitise_quote\n"; + #print "c<$c> SQ<$sanitise_quote>\n"; if ($off != 0 && $sanitise_quote eq '*/' && $c ne "\t") { substr($res, $off, 1, $;); } elsif ($off != 0 && $sanitise_quote && $c ne "\t") { @@ -402,6 +411,7 @@ sub ctx_statement_block { my $type = ''; my $level = 0; + my @stack = ([$type, $level]); my $p; my $c; my $len = 0; @@ -433,6 +443,16 @@ sub ctx_statement_block { $remainder = substr($blk, $off); #warn "CSB: c<$c> type<$type> level<$level> remainder<$remainder> coff_set<$coff_set>\n"; + + # Handle nested #if/#else. 
+ if ($remainder =~ /^#\s*(?:ifndef|ifdef|if)\s/) { + push(@stack, [ $type, $level ]); + } elsif ($remainder =~ /^#\s*(?:else|elif)\b/) { + ($type, $level) = @{$stack[$#stack - 1]}; + } elsif ($remainder =~ /^#\s*endif\b/) { + ($type, $level) = @{pop(@stack)}; + } + # Statement ends at the ';' or a close '}' at the # outermost level. if ($level == 0 && $c eq ';') { @@ -579,11 +599,22 @@ sub ctx_block_get { my @res = (); my $level = 0; + my @stack = ($level); for ($line = $start; $remain > 0; $line++) { next if ($rawlines[$line] =~ /^-/); $remain--; $blk .= $rawlines[$line]; + + # Handle nested #if/#else. + if ($rawlines[$line] =~ /^.\s*#\s*(?:ifndef|ifdef|if)\s/) { + push(@stack, $level); + } elsif ($rawlines[$line] =~ /^.\s*#\s*(?:else|elif)\b/) { + $level = $stack[$#stack - 1]; + } elsif ($rawlines[$line] =~ /^.\s*#\s*endif\b/) { + $level = pop(@stack); + } + foreach my $c (split(//, $rawlines[$line])) { ##print "C<$c>L<$level><$open$close>O<$off>\n"; if ($off > 0) { @@ -843,11 +874,11 @@ sub annotate_values { $type = 'V'; $av_pending = 'V'; - } elsif ($cur =~ /^($Ident\s*):/) { - if ($type eq 'E') { - $av_pend_colon = 'L'; - } elsif ($type eq 'T') { + } elsif ($cur =~ /^($Ident\s*):(?:\s*\d+\s*(,|=|;))?/) { + if (defined $2 && $type eq 'C' || $type eq 'T') { $av_pend_colon = 'B'; + } elsif ($type eq 'E') { + $av_pend_colon = 'L'; } print "IDENT_COLON($1,$type>$av_pend_colon)\n" if ($dbg_values > 1); $type = 'V'; @@ -865,6 +896,10 @@ sub annotate_values { $type = 'E'; $av_pend_colon = 'O'; + } elsif ($cur =~/^(,)/) { + print "COMMA($1)\n" if ($dbg_values > 1); + $type = 'C'; + } elsif ($cur =~ /^(\?)/o) { print "QUESTION($1)\n" if ($dbg_values > 1); $type = 'N'; @@ -880,7 +915,7 @@ sub annotate_values { } $av_pend_colon = 'O'; - } elsif ($cur =~ /^(;|\[)/o) { + } elsif ($cur =~ /^(\[)/o) { print "CLOSE($1)\n" if ($dbg_values > 1); $type = 'N'; @@ -1051,6 +1086,7 @@ sub process { my $in_comment = 0; my $comment_edge = 0; my $first_line = 0; + my $p1_prefix = ''; 
my $prev_values = 'E'; @@ -1097,9 +1133,12 @@ sub process { $rawlines[$ln - 1] =~ /^-/); $cnt--; #print "RAW<$rawlines[$ln - 1]>\n"; - ($edge) = (defined $rawlines[$ln - 1] && - $rawlines[$ln - 1] =~ m@(/\*|\*/)@); - last if (defined $edge); + last if (!defined $rawlines[$ln - 1]); + if ($rawlines[$ln - 1] =~ m@(/\*|\*/)@ && + $rawlines[$ln - 1] !~ m@"[^"]*(?:/\*|\*/)[^"]*"@) { + ($edge) = $1; + last; + } } if (defined $edge && $edge eq '*/') { $in_comment = 1; @@ -1109,7 +1148,7 @@ sub process { # is the start of a diff block and this line starts # ' *' then it is very likely a comment. if (!defined $edge && - $rawlines[$linenr] =~ m@^.\s* \*(?:\s|$)@) + $rawlines[$linenr] =~ m@^.\s*(?:\*\*+| \*)(?:\s|$)@) { $in_comment = 1; } @@ -1196,7 +1235,12 @@ sub process { # extract the filename as it passes if ($line=~/^\+\+\+\s+(\S+)/) { $realfile = $1; - $realfile =~ s@^[^/]*/@@; + $realfile =~ s@^([^/]*)/@@; + + $p1_prefix = $1; + if ($tree && $p1_prefix ne '' && -e "$root/$p1_prefix") { + WARN("patch prefix '$p1_prefix' exists, appears to be a -p0 patch\n"); + } if ($realfile =~ m@^include/asm/@) { ERROR("do not modify files in include/asm, change architecture specific files in include/asm-<architecture>\n" . "$here$rawline\n"); @@ -1336,7 +1380,7 @@ sub process { } # any (foo ... *) is a pointer cast, and foo is a type - while ($s =~ /\(($Ident)(?:\s+$Sparse)*\s*\*+\s*\)/sg) { + while ($s =~ /\(($Ident)(?:\s+$Sparse)*[\s\*]+\s*\)/sg) { possible($1, "C:" . $s); } @@ -1594,7 +1638,7 @@ sub process { $herecurr); } # check for static initialisers. - if ($line =~ /\s*static\s.*=\s*(0|NULL|false)\s*;/) { + if ($line =~ /\bstatic\s.*=\s*(0|NULL|false)\s*;/) { ERROR("do not initialise statics to 0 or NULL\n" . $herecurr); } @@ -1602,7 +1646,7 @@ sub process { # check for new typedefs, only function parameters and sparse annotations # make sense. 
if ($line =~ /\btypedef\s/ && - $line !~ /\btypedef\s+$Type\s+\(\s*\*?$Ident\s*\)\s*\(/ && + $line !~ /\btypedef\s+$Type\s*\(\s*\*?$Ident\s*\)\s*\(/ && $line !~ /\btypedef\s+$Type\s+$Ident\s*\(/ && $line !~ /\b$typeTypedefs\b/ && $line !~ /\b__bitwise(?:__|)\b/) { @@ -1610,21 +1654,39 @@ sub process { } # * goes on variable not on type - if ($line =~ m{\($NonptrType(\*+)(?:\s+const)?\)}) { - ERROR("\"(foo$1)\" should be \"(foo $1)\"\n" . - $herecurr); + # (char*[ const]) + if ($line =~ m{\($NonptrType(\s*\*[\s\*]*(?:$Modifier\s*)*)\)}) { + my ($from, $to) = ($1, $1); - } elsif ($line =~ m{\($NonptrType\s+(\*+)(?!\s+const)\s+\)}) { - ERROR("\"(foo $1 )\" should be \"(foo $1)\"\n" . - $herecurr); + # Should start with a space. + $to =~ s/^(\S)/ $1/; + # Should not end with a space. + $to =~ s/\s+$//; + # '*'s should not have spaces between. + while ($to =~ s/(.)\s\*/$1\*/) { + } - } elsif ($line =~ m{\b$NonptrType(\*+)(?:\s+(?:$Attribute|$Sparse))?\s+[A-Za-z\d_]+}) { - ERROR("\"foo$1 bar\" should be \"foo $1bar\"\n" . - $herecurr); + #print "from<$from> to<$to>\n"; + if ($from ne $to) { + ERROR("\"(foo$from)\" should be \"(foo$to)\"\n" . $herecurr); + } + } elsif ($line =~ m{\b$NonptrType(\s*\*[\s\*]*(?:$Modifier\s*)?)($Ident)}) { + my ($from, $to, $ident) = ($1, $1, $2); - } elsif ($line =~ m{\b$NonptrType\s+(\*+)(?!\s+(?:$Attribute|$Sparse))\s+[A-Za-z\d_]+}) { - ERROR("\"foo $1 bar\" should be \"foo $1bar\"\n" . - $herecurr); + # Should start with a space. + $to =~ s/^(\S)/ $1/; + # Should not end with a space. + $to =~ s/\s+$//; + # '*'s should not have spaces between. + while ($to =~ s/(.)\s\*/$1\*/) { + } + # Modifiers should have spaces. + $to =~ s/(\b$Modifier$)/$1 /; + + #print "from<$from> to<$to>\n"; + if ($from ne $to) { + ERROR("\"foo${from}bar\" should be \"foo${to}bar\"\n" . 
$herecurr); + } } # # no BUG() or BUG_ON() @@ -1759,7 +1821,7 @@ sub process { $c = 'C' if ($elements[$n + 2] =~ /^$;/); $c = 'B' if ($elements[$n + 2] =~ /^(\)|\]|;)/); $c = 'O' if ($elements[$n + 2] eq ''); - $c = 'E' if ($elements[$n + 2] =~ /\s*\\$/); + $c = 'E' if ($elements[$n + 2] =~ /^\s*\\$/); } else { $c = 'E'; } @@ -1950,9 +2012,9 @@ sub process { my $spacing = $1; my $value = $2; - # Flatten any parentheses and braces + # Flatten any parentheses $value =~ s/\)\(/\) \(/g; - while ($value =~ s/\([^\(\)]*\)/1/) { + while ($value !~ /(?:$Ident|-?$Constant)\s*$Compare\s*(?:$Ident|-?$Constant)/ && $value =~ s/\([^\(\)]*\)/1/) { } if ($value =~ /^(?:$Ident|-?$Constant)$/) { @@ -1992,7 +2054,7 @@ sub process { $line =~ /\b(?:if|while|for)\s*\(/ && $line !~ /^.\s*#/) { my ($s, $c) = ($stat, $cond); - if ($c =~ /\bif\s*\(.*[^<>!=]=[^=].*/) { + if ($c =~ /\bif\s*\(.*[^<>!=]=[^=].*/s) { ERROR("do not use assignment in if condition\n" . $herecurr); } @@ -2167,9 +2229,10 @@ sub process { MODULE_PARAM_DESC| DECLARE_PER_CPU| DEFINE_PER_CPU| - __typeof__\( + __typeof__\(| + \.$Ident\s*=\s* }x; - #print "REST<$rest>\n"; + #print "REST<$rest> dstat<$dstat>\n"; if ($rest ne '') { if ($rest !~ /while\s*\(/ && $dstat !~ /$exceptions/) @@ -2189,6 +2252,15 @@ sub process { } } +# make sure symbols are always wrapped with VMLINUX_SYMBOL() ... +# all assignments may have only one of the following with an assignment: +# . +# ALIGN(...) +# VMLINUX_SYMBOL(...) + if ($realfile eq 'vmlinux.lds.h' && $line =~ /(?:(?:^|\s)$Ident\s*=|=\s*$Ident(?:\s|$))/) { + WARN("vmlinux.lds.h needs VMLINUX_SYMBOL() around C-visible symbols\n" . $herecurr); + } + # check for redundant bracing round if etc if ($line =~ /(^.*)\bif\b/ && $1 !~ /else\s*$/) { my ($level, $endln, @chunks) = @@ -2443,6 +2515,11 @@ sub process { if ($line =~ /^.\s*__initcall\s*\(/) { WARN("please use device_initcall() instead of __initcall()\n" . $herecurr); } +# check for struct file_operations, ensure they are const. 
+ if ($line =~ /\bstruct\s+file_operations\b/ && + $line !~ /\bconst\b/) { + WARN("struct file_operations should normally be const\n" . $herecurr); + } # use of NR_CPUS is usually wrong # ignore definitions of NR_CPUS and usage to define arrays as likely right @@ -2466,6 +2543,15 @@ sub process { last; } } + +# whine mightly about in_atomic + if ($line =~ /\bin_atomic\s*\(/) { + if ($realfile =~ m@^drivers/@) { + ERROR("do not use in_atomic in drivers\n" . $herecurr); + } else { + WARN("use of in_atomic() is incorrect outside core kernel code\n" . $herecurr); + } + } } # If we have no input at all, then there is nothing to report on diff --git a/scripts/markup_oops.pl b/scripts/markup_oops.pl new file mode 100644 index 00000000000..700a7a654a3 --- /dev/null +++ b/scripts/markup_oops.pl @@ -0,0 +1,162 @@ +#!/usr/bin/perl -w + +# Copyright 2008, Intel Corporation +# +# This file is part of the Linux kernel +# +# This program file is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; version 2 of the License. +# +# Authors: +# Arjan van de Ven <arjan@linux.intel.com> + + +my $vmlinux_name = $ARGV[0]; + +# +# Step 1: Parse the oops to find the EIP value +# + +my $target = "0"; +while (<STDIN>) { + if ($_ =~ /EIP: 0060:\[\<([a-z0-9]+)\>\]/) { + $target = $1; + } +} + +if ($target =~ /^f8/) { + print "This script does not work on modules ... 
\n"; + exit; +} + +if ($target eq "0") { + print "No oops found!\n"; + print "Usage: \n"; + print " dmesg | perl scripts/markup_oops.pl vmlinux\n"; + exit; +} + +my $counter = 0; +my $state = 0; +my $center = 0; +my @lines; + +sub InRange { + my ($address, $target) = @_; + my $ad = "0x".$address; + my $ta = "0x".$target; + my $delta = hex($ad) - hex($ta); + + if (($delta > -4096) && ($delta < 4096)) { + return 1; + } + return 0; +} + + + +# first, parse the input into the lines array, but to keep size down, +# we only do this for 4Kb around the sweet spot + +my $filename; + +open(FILE, "objdump -dS $vmlinux_name |") || die "Cannot start objdump"; + +while (<FILE>) { + my $line = $_; + chomp($line); + if ($state == 0) { + if ($line =~ /^([a-f0-9]+)\:/) { + if (InRange($1, $target)) { + $state = 1; + } + } + } else { + if ($line =~ /^([a-f0-9][a-f0-9][a-f0-9][a-f0-9][a-f0-9][a-f0-9]+)\:/) { + my $val = $1; + if (!InRange($val, $target)) { + last; + } + if ($val eq $target) { + $center = $counter; + } + } + $lines[$counter] = $line; + + $counter = $counter + 1; + } +} + +close(FILE); + +if ($counter == 0) { + print "No matching code found \n"; + exit; +} + +if ($center == 0) { + print "No matching code found \n"; + exit; +} + +my $start; +my $finish; +my $codelines = 0; +my $binarylines = 0; +# now we go up and down in the array to find how much we want to print + +$start = $center; + +while ($start > 1) { + $start = $start - 1; + my $line = $lines[$start]; + if ($line =~ /^([a-f0-9]+)\:/) { + $binarylines = $binarylines + 1; + } else { + $codelines = $codelines + 1; + } + if ($codelines > 10) { + last; + } + if ($binarylines > 20) { + last; + } +} + + +$finish = $center; +$codelines = 0; +$binarylines = 0; +while ($finish < $counter) { + $finish = $finish + 1; + my $line = $lines[$finish]; + if ($line =~ /^([a-f0-9]+)\:/) { + $binarylines = $binarylines + 1; + } else { + $codelines = $codelines + 1; + } + if ($codelines > 10) { + last; + } + if ($binarylines > 20) { 
+ last; + } +} + + +my $i; + +my $fulltext = ""; +$i = $start; +while ($i < $finish) { + if ($i == $center) { + $fulltext = $fulltext . "*$lines[$i] <----- faulting instruction\n"; + } else { + $fulltext = $fulltext . " $lines[$i]\n"; + } + $i = $i +1; +} + +print $fulltext; + diff --git a/sound/core/sound.c b/sound/core/sound.c index 44a69bb8d4f..7872a02f6ca 100644 --- a/sound/core/sound.c +++ b/sound/core/sound.c @@ -152,6 +152,10 @@ static int __snd_open(struct inode *inode, struct file *file) } old_fops = file->f_op; file->f_op = fops_get(mptr->f_ops); + if (file->f_op == NULL) { + file->f_op = old_fops; + return -ENODEV; + } if (file->f_op->open) err = file->f_op->open(inode, file); if (err) { |