From aa6f5ffbdba45aa8e19e5048648fc6c7b25376d3 Mon Sep 17 00:00:00 2001 From: merge Date: Thu, 22 Jan 2009 13:55:32 +0000 Subject: MERGE-via-pending-tracking-hist-MERGE-via-stable-tracking-MERGE-via-mokopatches-tracking-fix-stray-endmenu-patch-1232632040-1232632141 pending-tracking-hist top was MERGE-via-stable-tracking-MERGE-via-mokopatches-tracking-fix-stray-endmenu-patch-1232632040-1232632141 / fdf777a63bcb59e0dfd78bfe2c6242e01f6d4eb9 ... parent commitmessage: From: merge MERGE-via-stable-tracking-hist-MERGE-via-mokopatches-tracking-fix-stray-endmenu-patch-1232632040 stable-tracking-hist top was MERGE-via-mokopatches-tracking-fix-stray-endmenu-patch-1232632040 / 90463bfd2d5a3c8b52f6e6d71024a00e052b0ced ... parent commitmessage: From: merge MERGE-via-mokopatches-tracking-hist-fix-stray-endmenu-patch mokopatches-tracking-hist top was fix-stray-endmenu-patch / 3630e0be570de8057e7f8d2fe501ed353cdf34e6 ... parent commitmessage: From: Andy Green fix-stray-endmenu.patch Signed-off-by: Andy Green --- Documentation/00-INDEX | 44 +- Documentation/ABI/testing/sysfs-c2port | 88 +++ Documentation/ABI/testing/sysfs-class-regulator | 136 ++-- Documentation/ABI/testing/sysfs-class-uwb_rc | 14 +- Documentation/ABI/testing/sysfs-devices-memory | 51 +- Documentation/ABI/testing/sysfs-firmware-acpi | 16 +- Documentation/DMA-API.txt | 17 +- Documentation/DMA-mapping.txt | 2 +- Documentation/DocBook/Makefile | 4 +- Documentation/DocBook/networking.tmpl | 11 +- Documentation/DocBook/regulator.tmpl | 304 ++++++++ Documentation/DocBook/uio-howto.tmpl | 101 ++- Documentation/DocBook/wanbook.tmpl | 99 --- Documentation/MSI-HOWTO.txt | 509 ------------- Documentation/PCI/00-INDEX | 2 + Documentation/PCI/MSI-HOWTO.txt | 509 +++++++++++++ Documentation/PCI/pci.txt | 3 +- Documentation/RCU/00-INDEX | 4 + Documentation/RCU/rcubarrier.txt | 304 ++++++++ Documentation/RCU/rculist_nulls.txt | 167 +++++ Documentation/RCU/trace.txt | 413 +++++++++++ Documentation/README.DAC960 | 756 ------------------- Documentation/README.cycladesZ | 8 - Documentation/accounting/getdelays.c | 4 + Documentation/acpi/debug.txt | 148 ++++ Documentation/arm/mem_alignment | 2 +- Documentation/arm/pxa/mfp.txt | 286 ++++++++ Documentation/bad_memory.txt | 45 ++ Documentation/blackfin/00-INDEX | 3 + Documentation/blackfin/bfin-gpio-notes.txt | 71 ++ Documentation/block/biodoc.txt | 6 +- Documentation/blockdev/00-INDEX | 16 + Documentation/blockdev/README.DAC960 | 756 +++++++++++++++++++ Documentation/blockdev/cciss.txt | 171 +++++ Documentation/blockdev/cpqarray.txt | 93 +++ Documentation/blockdev/floppy.txt | 245 +++++++ Documentation/blockdev/nbd.txt | 47 ++ Documentation/blockdev/paride.txt | 417 +++++++++++ Documentation/blockdev/ramdisk.txt | 165 +++++ Documentation/c2port.txt | 90 +++ Documentation/cciss.txt | 171 ----- Documentation/cgroups/cgroups.txt | 14 +- Documentation/cgroups/cpuacct.txt | 32 + Documentation/cgroups/cpusets.txt | 808 +++++++++++++++++++++ Documentation/cgroups/devices.txt | 52 ++ Documentation/cgroups/freezer-subsystem.txt | 21 +- Documentation/cgroups/memcg_test.txt | 342 +++++++++ Documentation/cgroups/memory.txt | 399 ++++++++++ Documentation/cgroups/resource_counter.txt | 181 +++++ Documentation/computone.txt | 522 ------------- Documentation/controllers/devices.txt | 52 -- Documentation/controllers/memory.txt | 284 -------- Documentation/controllers/resource_counter.txt | 181 ----- Documentation/cpqarray.txt | 93 --- Documentation/cpu-freq/user-guide.txt | 16 +- Documentation/cpu-hotplug.txt | 17 +- Documentation/cpusets.txt | 808 --------------------- Documentation/cputopology.txt | 48 ++ Documentation/credentials.txt | 582 +++++++++++++++ Documentation/crypto/async-tx-api.txt | 96 ++- Documentation/dell_rbu.txt | 4 +- Documentation/development-process/4.Coding | 6 +- Documentation/digiepca.txt | 98 --- Documentation/dmaengine.txt | 1 + Documentation/dvb/technisat.txt | 69 ++ Documentation/fb/pxafb.txt | 92 ++- Documentation/feature-removal-schedule.txt | 72 +- Documentation/filesystems/Locking | 12 +- Documentation/filesystems/btrfs.txt | 91 +++ Documentation/filesystems/devpts.txt | 132 ++++ Documentation/filesystems/ext4.txt | 85 ++- Documentation/filesystems/files.txt | 6 +- Documentation/filesystems/ocfs2.txt | 6 +- Documentation/filesystems/proc.txt | 301 +------- .../filesystems/ramfs-rootfs-initramfs.txt | 12 +- Documentation/filesystems/squashfs.txt | 225 ++++++ Documentation/filesystems/ubifs.txt | 3 + Documentation/filesystems/vfs.txt | 13 +- Documentation/filesystems/xfs.txt | 4 - Documentation/filesystems/xip.txt | 9 +- Documentation/floppy.txt | 245 ------- Documentation/ftrace.txt | 314 ++++---- Documentation/hayes-esp.txt | 154 ---- Documentation/hwmon/abituguru-datasheet | 10 +- Documentation/hwmon/adt7462 | 67 ++ Documentation/hwmon/adt7470 | 19 +- Documentation/hwmon/adt7475 | 87 +++ Documentation/hwmon/f71882fg | 89 +++ Documentation/hwmon/it87 | 20 +- Documentation/hwmon/lis3lv02d | 53 ++ Documentation/hwmon/lm70 | 12 +- Documentation/hwmon/lm85 | 2 +- Documentation/hwmon/ltc4245 | 81 +++ Documentation/ics932s401 | 31 + Documentation/ide/warm-plug-howto.txt | 5 + Documentation/input/input-programming.txt | 3 +- Documentation/input/walkera0701.txt | 109 +++ Documentation/ioctl-number.txt | 201 ----- Documentation/ioctl/00-INDEX | 10 + Documentation/ioctl/ioctl-number.txt | 205 ++++++ Documentation/kbuild/00-INDEX | 6 +- Documentation/kbuild/kbuild.txt | 133 ++++ Documentation/kbuild/kconfig.txt | 188 +++++ Documentation/kbuild/makefiles.txt | 14 + Documentation/kbuild/modules.txt | 4 +- Documentation/kernel-doc-nano-HOWTO.txt | 34 + Documentation/kernel-parameters.txt | 303 +++++--- Documentation/kobject.txt | 4 +- Documentation/kprobes.txt | 5 +- Documentation/laptops/thinkpad-acpi.txt | 27 +- Documentation/lguest/lguest.c | 66 +- Documentation/local_ops.txt | 2 +- Documentation/lockstat.txt | 51 +- Documentation/magic-number.txt | 6 +- Documentation/markers.txt | 29 +- Documentation/memory-hotplug.txt | 16 +- Documentation/mips/AU1xxx_IDE.README | 8 +- Documentation/moxa-smartio | 523 ------------- Documentation/nbd.txt | 47 -- Documentation/networking/README.ipw2200 | 2 +- Documentation/networking/bonding.txt | 68 +- Documentation/networking/dccp.txt | 32 +- Documentation/networking/driver.txt | 2 +- Documentation/networking/generic-hdlc.txt | 8 +- Documentation/networking/ip-sysctl.txt | 6 + Documentation/networking/mac80211_hwsim/README | 9 +- Documentation/networking/netdevices.txt | 2 +- Documentation/networking/phy.txt | 2 +- Documentation/networking/regulatory.txt | 22 +- Documentation/networking/rxrpc.txt | 2 +- Documentation/networking/tuntap.txt | 2 +- Documentation/nmi_watchdog.txt | 5 + Documentation/nommu-mmap.txt | 31 +- Documentation/paride.txt | 417 ----------- Documentation/powerpc/booting-without-of.txt | 65 +- Documentation/powerpc/cpu_features.txt | 2 +- Documentation/powerpc/dts-bindings/4xx/ndfc.txt | 39 + Documentation/powerpc/dts-bindings/fsl/board.txt | 32 +- Documentation/powerpc/dts-bindings/fsl/tsec.txt | 12 +- Documentation/printk-formats.txt | 35 + Documentation/ramdisk.txt | 165 ----- Documentation/rfkill.txt | 20 +- Documentation/riscom8.txt | 36 - Documentation/rocket.txt | 189 ----- Documentation/s390/Debugging390.txt | 2 +- Documentation/s390/cds.txt | 2 +- Documentation/s390/s390dbf.txt | 2 +- Documentation/scheduler/sched-arch.txt | 4 +- Documentation/scheduler/sched-design-CFS.txt | 23 +- Documentation/scsi/ChangeLog.lpfc | 2 +- Documentation/scsi/ChangeLog.ncr53c8xx | 2 +- Documentation/scsi/ChangeLog.sym53c8xx | 2 +- Documentation/scsi/cxgb3i.txt | 85 +++ Documentation/scsi/scsi_fc_transport.txt | 4 +- Documentation/serial/00-INDEX | 24 + Documentation/serial/README.cycladesZ | 8 + Documentation/serial/computone.txt | 522 +++++++++++++ Documentation/serial/digiepca.txt | 98 +++ Documentation/serial/hayes-esp.txt | 154 ++++ Documentation/serial/moxa-smartio | 523 +++++++++++++ Documentation/serial/riscom8.txt | 36 + Documentation/serial/rocket.txt | 189 +++++ Documentation/serial/specialix.txt | 383 ++++++++++ Documentation/serial/stallion.txt | 392 ++++++++++ Documentation/serial/sx.txt | 294 ++++++++ Documentation/serial/tty.txt | 292 ++++++++ Documentation/sh/kgdb.txt | 179 ----- Documentation/sound/alsa/ALSA-Configuration.txt | 325 +-------- Documentation/sound/alsa/HD-Audio-Models.txt | 356 +++++++++ Documentation/sound/alsa/HD-Audio.txt | 577 +++++++++++++++ Documentation/sound/alsa/Procfile.txt | 10 + Documentation/sound/alsa/soc/machine.txt | 8 +- Documentation/specialix.txt | 383 ---------- Documentation/spi/spi-lm70llp | 10 + Documentation/spi/spi-summary | 2 +- Documentation/stallion.txt | 392 ---------- Documentation/sx.txt | 294 -------- Documentation/sysctl/vm.txt | 616 +++++++++++----- Documentation/sysrq.txt | 19 +- Documentation/tracepoints.txt | 94 ++- Documentation/tracers/mmiotrace.txt | 6 +- Documentation/tty.txt | 292 -------- Documentation/usb/gadget_serial.txt | 4 +- Documentation/usb/power-management.txt | 22 +- Documentation/usb/proc_usb_info.txt | 6 +- Documentation/usb/usbmon.txt | 12 +- Documentation/usb/wusb-cbaf | 9 - Documentation/video4linux/API.html | 43 +- Documentation/video4linux/CARDLIST.bttv | 7 +- Documentation/video4linux/CARDLIST.cx23885 | 1 + Documentation/video4linux/CARDLIST.cx88 | 5 +- Documentation/video4linux/CARDLIST.em28xx | 9 +- Documentation/video4linux/CARDLIST.saa7134 | 4 +- Documentation/video4linux/README.cx88 | 12 +- Documentation/video4linux/gspca.txt | 19 +- Documentation/video4linux/si470x.txt | 119 +++ Documentation/video4linux/v4l2-framework.txt | 521 +++++++++++++ Documentation/vm/unevictable-lru.txt | 63 +- Documentation/w1/masters/00-INDEX | 2 + Documentation/w1/masters/mxc-w1 | 11 + Documentation/w1/masters/omap-hdq | 46 ++ Documentation/w1/w1.netlink | 164 ++++- Documentation/wimax/README.i2400m | 260 +++++++ Documentation/wimax/README.wimax | 81 +++ Documentation/x86/boot.txt | 8 +- Documentation/x86/pat.txt | 24 + Documentation/x86/x86_64/boot-options.txt | 11 - Documentation/x86/x86_64/mm.txt | 2 +- Documentation/x86/zero-page.txt | 2 +- 209 files changed, 15675 insertions(+), 8873 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-c2port create mode 100644 Documentation/DocBook/regulator.tmpl delete mode 100644 Documentation/DocBook/wanbook.tmpl delete mode 100644 Documentation/MSI-HOWTO.txt create mode 100644 Documentation/PCI/MSI-HOWTO.txt create mode 100644 Documentation/RCU/rcubarrier.txt create mode 100644 Documentation/RCU/rculist_nulls.txt create mode 100644 Documentation/RCU/trace.txt delete mode 100644 Documentation/README.DAC960 delete mode 100644 Documentation/README.cycladesZ create mode 100644 Documentation/acpi/debug.txt create mode 100644 Documentation/arm/pxa/mfp.txt create mode 100644 Documentation/bad_memory.txt create mode 100644 Documentation/blackfin/bfin-gpio-notes.txt create mode 100644 Documentation/blockdev/00-INDEX create mode 100644 Documentation/blockdev/README.DAC960 create mode 100644 Documentation/blockdev/cciss.txt create mode 100644 Documentation/blockdev/cpqarray.txt create mode 100644 Documentation/blockdev/floppy.txt create mode 100644 Documentation/blockdev/nbd.txt create mode 100644 Documentation/blockdev/paride.txt create mode 100644 Documentation/blockdev/ramdisk.txt create mode 100644 Documentation/c2port.txt delete mode 100644 Documentation/cciss.txt create mode 100644 Documentation/cgroups/cpuacct.txt create mode 100644 Documentation/cgroups/cpusets.txt create mode 100644 Documentation/cgroups/devices.txt create mode 100644 Documentation/cgroups/memcg_test.txt create mode 100644 Documentation/cgroups/memory.txt create mode 100644 Documentation/cgroups/resource_counter.txt delete mode 100644 Documentation/computone.txt delete mode 100644 Documentation/controllers/devices.txt delete mode 100644 Documentation/controllers/memory.txt delete mode 100644 Documentation/controllers/resource_counter.txt delete mode 100644 Documentation/cpqarray.txt delete mode 100644 Documentation/cpusets.txt create mode 100644 Documentation/credentials.txt delete mode 100644 Documentation/digiepca.txt create mode 100644 Documentation/dmaengine.txt create mode 100644 Documentation/dvb/technisat.txt create mode 100644 Documentation/filesystems/btrfs.txt create mode 100644 Documentation/filesystems/devpts.txt create mode 100644 Documentation/filesystems/squashfs.txt delete mode 100644 Documentation/floppy.txt delete mode 100644 Documentation/hayes-esp.txt create mode 100644 Documentation/hwmon/adt7462 create mode 100644 Documentation/hwmon/adt7475 create mode 100644 Documentation/hwmon/f71882fg create mode 100644 Documentation/hwmon/lis3lv02d create mode 100644 Documentation/hwmon/ltc4245 create mode 100644 Documentation/ics932s401 create mode 100644 Documentation/input/walkera0701.txt delete mode 100644 Documentation/ioctl-number.txt create mode 100644 Documentation/ioctl/00-INDEX create mode 100644 Documentation/ioctl/ioctl-number.txt create mode 100644 Documentation/kbuild/kbuild.txt create mode 100644 Documentation/kbuild/kconfig.txt delete mode 100644 Documentation/moxa-smartio delete mode 100644 Documentation/nbd.txt delete mode 100644 Documentation/paride.txt create mode 100644 Documentation/powerpc/dts-bindings/4xx/ndfc.txt create mode 100644 Documentation/printk-formats.txt delete mode 100644 Documentation/ramdisk.txt delete mode 100644 Documentation/riscom8.txt delete mode 100644 Documentation/rocket.txt create mode 100644 Documentation/scsi/cxgb3i.txt create mode 100644 Documentation/serial/00-INDEX create mode 100644 Documentation/serial/README.cycladesZ create mode 100644 Documentation/serial/computone.txt create mode 100644 Documentation/serial/digiepca.txt create mode 100644 Documentation/serial/hayes-esp.txt create mode 100644 Documentation/serial/moxa-smartio create mode 100644 Documentation/serial/riscom8.txt create mode 100644 Documentation/serial/rocket.txt create mode 100644 Documentation/serial/specialix.txt create mode 100644 Documentation/serial/stallion.txt create mode 100644 Documentation/serial/sx.txt create mode 100644 Documentation/serial/tty.txt delete mode 100644 Documentation/sh/kgdb.txt create mode 100644 Documentation/sound/alsa/HD-Audio-Models.txt create mode 100644 Documentation/sound/alsa/HD-Audio.txt delete mode 100644 Documentation/specialix.txt delete mode 100644 Documentation/stallion.txt delete mode 100644 Documentation/sx.txt delete mode 100644 Documentation/tty.txt create mode 100644 Documentation/video4linux/si470x.txt create mode 100644 Documentation/video4linux/v4l2-framework.txt create mode 100644 Documentation/w1/masters/mxc-w1 create mode 100644 Documentation/w1/masters/omap-hdq create mode 100644 Documentation/wimax/README.i2400m create mode 100644 Documentation/wimax/README.wimax (limited to 'Documentation') diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX index edef85ce119..2a39aeba146 100644 --- a/Documentation/00-INDEX +++ b/Documentation/00-INDEX @@ -42,14 +42,8 @@ IRQ.txt - description of what an IRQ is. ManagementStyle - how to (attempt to) manage kernel hackers. -MSI-HOWTO.txt - - the Message Signaled Interrupts (MSI) Driver Guide HOWTO and FAQ. RCU/ - directory with info on RCU (read-copy update). -README.DAC960 - - info on Mylex DAC960/DAC1100 PCI RAID Controller Driver for Linux. -README.cycladesZ - - info on Cyclades-Z firmware loading. SAK.txt - info on Secure Attention Keys. SM501.txt @@ -86,20 +80,16 @@ blackfin/ - directory with documentation for the Blackfin arch. block/ - info on the Block I/O (BIO) layer. +blockdev/ + - info on block devices & drivers cachetlb.txt - describes the cache/TLB flushing interfaces Linux uses. -cciss.txt - - info, major/minor #'s for Compaq's SMART Array Controllers. cdrom/ - directory with information on the CD-ROM drivers that Linux has. -computone.txt - - info on Computone Intelliport II/Plus Multiport Serial Driver. connector/ - docs on the netlink based userspace<->kernel space communication mod. console/ - documentation on Linux console drivers. -cpqarray.txt - - info on using Compaq's SMART2 Intelligent Disk Array Controllers. cpu-freq/ - info on CPU frequency and voltage scaling. cpu-hotplug.txt @@ -126,8 +116,6 @@ device-mapper/ - directory with info on Device Mapper. devices.txt - plain ASCII listing of all the nodes in /dev/ with major minor #'s. -digiepca.txt - - info on Digi Intl. {PC,PCI,EISA}Xx and Xem series cards. dontdiff - file containing a list of files that should never be diff'ed. driver-model/ @@ -152,14 +140,10 @@ filesystems/ - info on the vfs and the various filesystems that Linux supports. firmware_class/ - request_firmware() hotplug interface info. -floppy.txt - - notes and driver options for the floppy disk driver. frv/ - Fujitsu FR-V Linux documentation. gpio.txt - overview of GPIO (General Purpose Input/Output) access conventions. -hayes-esp.txt - - info on using the Hayes ESP serial driver. highuid.txt - notes on the change from 16 bit to 32 bit user/group IDs. timers/ @@ -186,8 +170,6 @@ io_ordering.txt - info on ordering I/O writes to memory-mapped addresses. ioctl/ - directory with documents describing various IOCTL calls. -ioctl-number.txt - - how to implement and register device/driver ioctl calls. iostats.txt - info on I/O statistics Linux kernel provides. irqflags-tracing.txt @@ -250,14 +232,10 @@ mips/ - directory with info about Linux on MIPS architecture. mono.txt - how to execute Mono-based .NET binaries with the help of BINFMT_MISC. -moxa-smartio - - file with info on installing/using Moxa multiport serial driver. mutex-design.txt - info on the generic mutex subsystem. namespaces/ - directory with various information about namespaces -nbd.txt - - info on a TCP implementation of a network block device. netlabel/ - directory with information on the NetLabel subsystem. networking/ @@ -270,8 +248,6 @@ numastat.txt - info on how to read Numa policy hit/miss statistics in sysfs. oops-tracing.txt - how to decode those nasty internal kernel error dump messages. -paride.txt - - information about the parallel port IDE subsystem. parisc/ - directory with info on using Linux on PA-RISC architecture. parport.txt @@ -290,20 +266,16 @@ powerpc/ - directory with info on using Linux with the PowerPC. preempt-locking.txt - info on locking under a preemptive kernel. +printk-formats.txt + - how to get printk format specifiers right prio_tree.txt - info on radix-priority-search-tree use for indexing vmas. -ramdisk.txt - - short guide on how to set up and use the RAM disk. rbtree.txt - info on what red-black trees are and what they are for. -riscom8.txt - - notes on using the RISCom/8 multi-port serial driver. robust-futex-ABI.txt - documentation of the robust futex ABI. robust-futexes.txt - a description of what robust futexes are. -rocket.txt - - info on the Comtrol RocketPort multiport serial driver. rt-mutex-design.txt - description of the RealTime mutex implementation design. rt-mutex.txt @@ -332,8 +304,6 @@ sparc/ - directory with info on using Linux on Sparc architecture. sparse.txt - info on how to obtain and use the sparse tool for typechecking. -specialix.txt - - info on hardware/driver for specialix IO8+ multiport serial card. spi/ - overview of Linux kernel Serial Peripheral Interface (SPI) support. spinlocks.txt @@ -342,14 +312,10 @@ stable_api_nonsense.txt - info on why the kernel does not have a stable in-kernel api or abi. stable_kernel_rules.txt - rules and procedures for the -stable kernel releases. -stallion.txt - - info on using the Stallion multiport serial driver. svga.txt - short guide on selecting video modes at boot via VGA BIOS. sysfs-rules.txt - How not to use sysfs. -sx.txt - - info on the Specialix SX/SI multiport serial driver. sysctl/ - directory with info on the /proc/sys/* files. sysrq.txt @@ -358,8 +324,6 @@ telephony/ - directory with info on telephony (e.g. voice over IP) support. time_interpolators.txt - info on time interpolators. -tty.txt - - guide to the locking policies of the tty layer. uml/ - directory with information about User Mode Linux. unicode.txt diff --git a/Documentation/ABI/testing/sysfs-c2port b/Documentation/ABI/testing/sysfs-c2port new file mode 100644 index 00000000000..716cffc457e --- /dev/null +++ b/Documentation/ABI/testing/sysfs-c2port @@ -0,0 +1,88 @@ +What: /sys/class/c2port/ +Date: October 2008 +Contact: Rodolfo Giometti +Description: + The /sys/class/c2port/ directory will contain files and + directories that will provide a unified interface to + the C2 port interface. + +What: /sys/class/c2port/c2portX +Date: October 2008 +Contact: Rodolfo Giometti +Description: + The /sys/class/c2port/c2portX/ directory is related to X-th + C2 port into the system. Each directory will contain files to + manage and control its C2 port. + +What: /sys/class/c2port/c2portX/access +Date: October 2008 +Contact: Rodolfo Giometti +Description: + The /sys/class/c2port/c2portX/access file enable the access + to the C2 port from the system. No commands can be sent + till this entry is set to 0. + +What: /sys/class/c2port/c2portX/dev_id +Date: October 2008 +Contact: Rodolfo Giometti +Description: + The /sys/class/c2port/c2portX/dev_id file show the device ID + of the connected micro. + +What: /sys/class/c2port/c2portX/flash_access +Date: October 2008 +Contact: Rodolfo Giometti +Description: + The /sys/class/c2port/c2portX/flash_access file enable the + access to the on-board flash of the connected micro. + No commands can be sent till this entry is set to 0. + +What: /sys/class/c2port/c2portX/flash_block_size +Date: October 2008 +Contact: Rodolfo Giometti +Description: + The /sys/class/c2port/c2portX/flash_block_size file show + the on-board flash block size of the connected micro. + +What: /sys/class/c2port/c2portX/flash_blocks_num +Date: October 2008 +Contact: Rodolfo Giometti +Description: + The /sys/class/c2port/c2portX/flash_blocks_num file show + the on-board flash blocks number of the connected micro. + +What: /sys/class/c2port/c2portX/flash_data +Date: October 2008 +Contact: Rodolfo Giometti +Description: + The /sys/class/c2port/c2portX/flash_data file export + the content of the on-board flash of the connected micro. + +What: /sys/class/c2port/c2portX/flash_erase +Date: October 2008 +Contact: Rodolfo Giometti +Description: + The /sys/class/c2port/c2portX/flash_erase file execute + the "erase" command on the on-board flash of the connected + micro. + +What: /sys/class/c2port/c2portX/flash_erase +Date: October 2008 +Contact: Rodolfo Giometti +Description: + The /sys/class/c2port/c2portX/flash_erase file show the + on-board flash size of the connected micro. + +What: /sys/class/c2port/c2portX/reset +Date: October 2008 +Contact: Rodolfo Giometti +Description: + The /sys/class/c2port/c2portX/reset file execute a "reset" + command on the connected micro. + +What: /sys/class/c2port/c2portX/rev_id +Date: October 2008 +Contact: Rodolfo Giometti +Description: + The /sys/class/c2port/c2portX/rev_id file show the revision ID + of the connected micro. diff --git a/Documentation/ABI/testing/sysfs-class-regulator b/Documentation/ABI/testing/sysfs-class-regulator index 3731f6f29bc..873ef1fc156 100644 --- a/Documentation/ABI/testing/sysfs-class-regulator +++ b/Documentation/ABI/testing/sysfs-class-regulator @@ -3,8 +3,9 @@ Date: April 2008 KernelVersion: 2.6.26 Contact: Liam Girdwood Description: - Each regulator directory will contain a field called - state. This holds the regulator output state. + Some regulator directories will contain a field called + state. This reports the regulator enable status, for + regulators which can report that value. This will be one of the following strings: @@ -18,7 +19,8 @@ Description: 'disabled' means the regulator output is OFF and is not supplying power to the system.. - 'unknown' means software cannot determine the state. + 'unknown' means software cannot determine the state, or + the reported state is invalid. NOTE: this field can be used in conjunction with microvolts and microamps to determine regulator output levels. @@ -53,9 +55,10 @@ Date: April 2008 KernelVersion: 2.6.26 Contact: Liam Girdwood Description: - Each regulator directory will contain a field called + Some regulator directories will contain a field called microvolts. This holds the regulator output voltage setting - measured in microvolts (i.e. E-6 Volts). + measured in microvolts (i.e. E-6 Volts), for regulators + which can report that voltage. NOTE: This value should not be used to determine the regulator output voltage level as this value is the same regardless of @@ -67,9 +70,10 @@ Date: April 2008 KernelVersion: 2.6.26 Contact: Liam Girdwood Description: - Each regulator directory will contain a field called + Some regulator directories will contain a field called microamps. This holds the regulator output current limit - setting measured in microamps (i.e. E-6 Amps). + setting measured in microamps (i.e. E-6 Amps), for regulators + which can report that current. NOTE: This value should not be used to determine the regulator output current level as this value is the same regardless of @@ -81,8 +85,9 @@ Date: April 2008 KernelVersion: 2.6.26 Contact: Liam Girdwood Description: - Each regulator directory will contain a field called - opmode. This holds the regulator operating mode setting. + Some regulator directories will contain a field called + opmode. This holds the current regulator operating mode, + for regulators which can report it. The opmode value can be one of the following strings: @@ -92,7 +97,7 @@ Description: 'standby' 'unknown' - The modes are described in include/linux/regulator/regulator.h + The modes are described in include/linux/regulator/consumer.h NOTE: This value should not be used to determine the regulator output operating mode as this value is the same regardless of @@ -104,9 +109,10 @@ Date: April 2008 KernelVersion: 2.6.26 Contact: Liam Girdwood Description: - Each regulator directory will contain a field called + Some regulator directories will contain a field called min_microvolts. This holds the minimum safe working regulator - output voltage setting for this domain measured in microvolts. + output voltage setting for this domain measured in microvolts, + for regulators which support voltage constraints. NOTE: this will return the string 'constraint not defined' if the power domain has no min microvolts constraint defined by @@ -118,9 +124,10 @@ Date: April 2008 KernelVersion: 2.6.26 Contact: Liam Girdwood Description: - Each regulator directory will contain a field called + Some regulator directories will contain a field called max_microvolts. This holds the maximum safe working regulator - output voltage setting for this domain measured in microvolts. + output voltage setting for this domain measured in microvolts, + for regulators which support voltage constraints. NOTE: this will return the string 'constraint not defined' if the power domain has no max microvolts constraint defined by @@ -132,10 +139,10 @@ Date: April 2008 KernelVersion: 2.6.26 Contact: Liam Girdwood Description: - Each regulator directory will contain a field called + Some regulator directories will contain a field called min_microamps. This holds the minimum safe working regulator output current limit setting for this domain measured in - microamps. + microamps, for regulators which support current constraints. NOTE: this will return the string 'constraint not defined' if the power domain has no min microamps constraint defined by @@ -147,10 +154,10 @@ Date: April 2008 KernelVersion: 2.6.26 Contact: Liam Girdwood Description: - Each regulator directory will contain a field called + Some regulator directories will contain a field called max_microamps. This holds the maximum safe working regulator output current limit setting for this domain measured in - microamps. + microamps, for regulators which support current constraints. NOTE: this will return the string 'constraint not defined' if the power domain has no max microamps constraint defined by @@ -185,7 +192,7 @@ Date: April 2008 KernelVersion: 2.6.26 Contact: Liam Girdwood Description: - Each regulator directory will contain a field called + Some regulator directories will contain a field called requested_microamps. This holds the total requested load current in microamps for this regulator from all its consumer devices. @@ -204,125 +211,102 @@ Date: May 2008 KernelVersion: 2.6.26 Contact: Liam Girdwood Description: - Each regulator directory will contain a field called + Some regulator directories will contain a field called suspend_mem_microvolts. This holds the regulator output voltage setting for this domain measured in microvolts when - the system is suspended to memory. - - NOTE: this will return the string 'not defined' if - the power domain has no suspend to memory voltage defined by - platform code. + the system is suspended to memory, for voltage regulators + implementing suspend voltage configuration constraints. What: /sys/class/regulator/.../suspend_disk_microvolts Date: May 2008 KernelVersion: 2.6.26 Contact: Liam Girdwood Description: - Each regulator directory will contain a field called + Some regulator directories will contain a field called suspend_disk_microvolts. This holds the regulator output voltage setting for this domain measured in microvolts when - the system is suspended to disk. - - NOTE: this will return the string 'not defined' if - the power domain has no suspend to disk voltage defined by - platform code. + the system is suspended to disk, for voltage regulators + implementing suspend voltage configuration constraints. What: /sys/class/regulator/.../suspend_standby_microvolts Date: May 2008 KernelVersion: 2.6.26 Contact: Liam Girdwood Description: - Each regulator directory will contain a field called + Some regulator directories will contain a field called suspend_standby_microvolts. This holds the regulator output voltage setting for this domain measured in microvolts when - the system is suspended to standby. - - NOTE: this will return the string 'not defined' if - the power domain has no suspend to standby voltage defined by - platform code. + the system is suspended to standby, for voltage regulators + implementing suspend voltage configuration constraints. What: /sys/class/regulator/.../suspend_mem_mode Date: May 2008 KernelVersion: 2.6.26 Contact: Liam Girdwood Description: - Each regulator directory will contain a field called + Some regulator directories will contain a field called suspend_mem_mode. This holds the regulator operating mode setting for this domain when the system is suspended to - memory. - - NOTE: this will return the string 'not defined' if - the power domain has no suspend to memory mode defined by - platform code. + memory, for regulators implementing suspend mode + configuration constraints. What: /sys/class/regulator/.../suspend_disk_mode Date: May 2008 KernelVersion: 2.6.26 Contact: Liam Girdwood Description: - Each regulator directory will contain a field called + Some regulator directories will contain a field called suspend_disk_mode. This holds the regulator operating mode - setting for this domain when the system is suspended to disk. - - NOTE: this will return the string 'not defined' if - the power domain has no suspend to disk mode defined by - platform code. + setting for this domain when the system is suspended to disk, + for regulators implementing suspend mode configuration + constraints. What: /sys/class/regulator/.../suspend_standby_mode Date: May 2008 KernelVersion: 2.6.26 Contact: Liam Girdwood Description: - Each regulator directory will contain a field called + Some regulator directories will contain a field called suspend_standby_mode. This holds the regulator operating mode setting for this domain when the system is suspended to - standby. - - NOTE: this will return the string 'not defined' if - the power domain has no suspend to standby mode defined by - platform code. + standby, for regulators implementing suspend mode + configuration constraints. What: /sys/class/regulator/.../suspend_mem_state Date: May 2008 KernelVersion: 2.6.26 Contact: Liam Girdwood Description: - Each regulator directory will contain a field called + Some regulator directories will contain a field called suspend_mem_state. This holds the regulator operating state - when suspended to memory. - - This will be one of the following strings: + when suspended to memory, for regulators implementing suspend + configuration constraints. - 'enabled' - 'disabled' - 'not defined' + This will be one of the same strings reported by + the "state" attribute. What: /sys/class/regulator/.../suspend_disk_state Date: May 2008 KernelVersion: 2.6.26 Contact: Liam Girdwood Description: - Each regulator directory will contain a field called + Some regulator directories will contain a field called suspend_disk_state. This holds the regulator operating state - when suspended to disk. - - This will be one of the following strings: + when suspended to disk, for regulators implementing + suspend configuration constraints. - 'enabled' - 'disabled' - 'not defined' + This will be one of the same strings reported by + the "state" attribute. What: /sys/class/regulator/.../suspend_standby_state Date: May 2008 KernelVersion: 2.6.26 Contact: Liam Girdwood Description: - Each regulator directory will contain a field called + Some regulator directories will contain a field called suspend_standby_state. This holds the regulator operating - state when suspended to standby. - - This will be one of the following strings: + state when suspended to standby, for regulators implementing + suspend configuration constraints. - 'enabled' - 'disabled' - 'not defined' + This will be one of the same strings reported by + the "state" attribute. diff --git a/Documentation/ABI/testing/sysfs-class-uwb_rc b/Documentation/ABI/testing/sysfs-class-uwb_rc index a0d18dbeb7a..6a5fd072849 100644 --- a/Documentation/ABI/testing/sysfs-class-uwb_rc +++ b/Documentation/ABI/testing/sysfs-class-uwb_rc @@ -32,14 +32,16 @@ Contact: linux-usb@vger.kernel.org Description: Write: - [] + - to start beaconing on a specific channel, or stop - beaconing if is -1. Valid channels depends - on the radio controller's supported band groups. + to force a specific channel to be used when beaconing, + or, if is -1, to prohibit beaconing. If + is 0, then the default channel selection + algorithm will be used. Valid channels depends on the + radio controller's supported band groups. - may be used to try and join a specific - beacon group if more than one was found during a scan. + Reading returns the currently active channel, or -1 if + the radio controller is not beaconing. What: /sys/class/uwb_rc/uwbN/scan Date: July 2008 diff --git a/Documentation/ABI/testing/sysfs-devices-memory b/Documentation/ABI/testing/sysfs-devices-memory index 7a16fe1e227..9fe91c02ee4 100644 --- a/Documentation/ABI/testing/sysfs-devices-memory +++ b/Documentation/ABI/testing/sysfs-devices-memory @@ -6,7 +6,6 @@ Description: internal state of the kernel memory blocks. Files could be added or removed dynamically to represent hot-add/remove operations. - Users: hotplug memory add/remove tools https://w3.opensource.ibm.com/projects/powerpc-utils/ @@ -19,6 +18,56 @@ Description: This is useful for a user-level agent to determine identify removable sections of the memory before attempting potentially expensive hot-remove memory operation +Users: hotplug memory remove tools + https://w3.opensource.ibm.com/projects/powerpc-utils/ + +What: /sys/devices/system/memory/memoryX/phys_device +Date: September 2008 +Contact: Badari Pulavarty +Description: + The file /sys/devices/system/memory/memoryX/phys_device + is read-only and is designed to show the name of physical + memory device. Implementation is currently incomplete. +What: /sys/devices/system/memory/memoryX/phys_index +Date: September 2008 +Contact: Badari Pulavarty +Description: + The file /sys/devices/system/memory/memoryX/phys_index + is read-only and contains the section ID in hexadecimal + which is equivalent to decimal X contained in the + memory section directory name. + +What: /sys/devices/system/memory/memoryX/state +Date: September 2008 +Contact: Badari Pulavarty +Description: + The file /sys/devices/system/memory/memoryX/state + is read-write. When read, it's contents show the + online/offline state of the memory section. When written, + root can toggle the the online/offline state of a removable + memory section (see removable file description above) + using the following commands. + # echo online > /sys/devices/system/memory/memoryX/state + # echo offline > /sys/devices/system/memory/memoryX/state + + For example, if /sys/devices/system/memory/memory22/removable + contains a value of 1 and + /sys/devices/system/memory/memory22/state contains the + string "online" the following command can be executed by + by root to offline that section. + # echo offline > /sys/devices/system/memory/memory22/state Users: hotplug memory remove tools https://w3.opensource.ibm.com/projects/powerpc-utils/ + +What: /sys/devices/system/node/nodeX/memoryY +Date: September 2008 +Contact: Gary Hade +Description: + When CONFIG_NUMA is enabled + /sys/devices/system/node/nodeX/memoryY is a symbolic link that + points to the corresponding /sys/devices/system/memory/memoryY + memory section directory. For example, the following symbolic + link is created for memory section 9 on node0. + /sys/devices/system/node/node0/memory9 -> ../../memory/memory9 + diff --git a/Documentation/ABI/testing/sysfs-firmware-acpi b/Documentation/ABI/testing/sysfs-firmware-acpi index f27be7d1a49..e8ffc70ffe1 100644 --- a/Documentation/ABI/testing/sysfs-firmware-acpi +++ b/Documentation/ABI/testing/sysfs-firmware-acpi @@ -89,7 +89,7 @@ Description: error - an interrupt that can't be accounted for above. - invalid: it's either a wakeup GPE or a GPE/Fixed Event that + invalid: it's either a GPE or a Fixed Event that doesn't have an event handler. disable: the GPE/Fixed Event is valid but disabled. @@ -117,30 +117,30 @@ Description: and other user space applications so that the machine won't shutdown when pressing the power button. # cat ff_pwr_btn - 0 + 0 enabled # press the power button for 3 times; # cat ff_pwr_btn - 3 + 3 enabled # echo disable > ff_pwr_btn # cat ff_pwr_btn - disable + 3 disabled # press the power button for 3 times; # cat ff_pwr_btn - disable + 3 disabled # echo enable > ff_pwr_btn # cat ff_pwr_btn - 4 + 4 enabled /* * this is because the status bit is set even if the enable bit is cleared, * and it triggers an ACPI fixed event when the enable bit is set again */ # press the power button for 3 times; # cat ff_pwr_btn - 7 + 7 enabled # echo disable > ff_pwr_btn # press the power button for 3 times; # echo clear > ff_pwr_btn /* clear the status bit */ # echo disable > ff_pwr_btn # cat ff_pwr_btn - 7 + 7 enabled diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt index b8e86460046..52441694fe0 100644 --- a/Documentation/DMA-API.txt +++ b/Documentation/DMA-API.txt @@ -170,16 +170,15 @@ Returns: 0 if successful and a negative error if not. u64 dma_get_required_mask(struct device *dev) -After setting the mask with dma_set_mask(), this API returns the -actual mask (within that already set) that the platform actually -requires to operate efficiently. Usually this means the returned mask +This API returns the mask that the platform requires to +operate efficiently. Usually this means the returned mask is the minimum required to cover all of memory. Examining the required mask gives drivers with variable descriptor sizes the opportunity to use smaller descriptors as necessary. Requesting the required mask does not alter the current mask. If you -wish to take advantage of it, you should issue another dma_set_mask() -call to lower the mask again. +wish to take advantage of it, you should issue a dma_set_mask() +call to set the mask to the value returned. Part Id - Streaming DMA mappings @@ -316,12 +315,10 @@ reduce current DMA mapping usage or delay and try again later). pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sg, int nents, int direction) -Maps a scatter gather list from the block layer. - Returns: the number of physical segments mapped (this may be shorter -than passed in if the block layer determines that some -elements of the scatter/gather list are physically adjacent and thus -may be mapped with a single entry). +than passed in if some elements of the scatter/gather list are +physically or virtually adjacent and an IOMMU maps them with a single +entry). Please note that the sg cannot be mapped again if it has been mapped once. The mapping process is allowed to destroy information in the sg. diff --git a/Documentation/DMA-mapping.txt b/Documentation/DMA-mapping.txt index c74fec8c235..b2a4d6d244d 100644 --- a/Documentation/DMA-mapping.txt +++ b/Documentation/DMA-mapping.txt @@ -26,7 +26,7 @@ mapped only for the time they are actually used and unmapped after the DMA transfer. The following API will work of course even on platforms where no such -hardware exists, see e.g. include/asm-i386/pci.h for how it is implemented on +hardware exists, see e.g. arch/x86/include/asm/pci.h for how it is implemented on top of the virt_to_bus interface. First of all, you should make sure diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile index 9b1f6ca100d..dc3154e4927 100644 --- a/Documentation/DocBook/Makefile +++ b/Documentation/DocBook/Makefile @@ -6,13 +6,13 @@ # To add a new book the only step required is to add the book to the # list of DOCBOOKS. -DOCBOOKS := wanbook.xml z8530book.xml mcabook.xml \ +DOCBOOKS := z8530book.xml mcabook.xml \ kernel-hacking.xml kernel-locking.xml deviceiobook.xml \ procfs-guide.xml writing_usb_driver.xml networking.xml \ kernel-api.xml filesystems.xml lsm.xml usb.xml kgdb.xml \ gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml \ genericirq.xml s390-drivers.xml uio-howto.xml scsi.xml \ - mac80211.xml debugobjects.xml sh.xml + mac80211.xml debugobjects.xml sh.xml regulator.xml ### # The build process is as follows (targets): diff --git a/Documentation/DocBook/networking.tmpl b/Documentation/DocBook/networking.tmpl index f24f9e85e4a..59ad69a9d77 100644 --- a/Documentation/DocBook/networking.tmpl +++ b/Documentation/DocBook/networking.tmpl @@ -74,6 +74,14 @@ !Enet/sunrpc/rpcb_clnt.c !Enet/sunrpc/clnt.c + WiMAX +!Enet/wimax/op-msg.c +!Enet/wimax/op-reset.c +!Enet/wimax/op-rfkill.c +!Enet/wimax/stack.c +!Iinclude/net/wimax.h +!Iinclude/linux/wimax.h + @@ -98,9 +106,6 @@ X!Enet/core/wireless.c --> - Synchronous PPP -!Edrivers/net/wan/syncppp.c - diff --git a/Documentation/DocBook/regulator.tmpl b/Documentation/DocBook/regulator.tmpl new file mode 100644 index 00000000000..53f4f8d3b81 --- /dev/null +++ b/Documentation/DocBook/regulator.tmpl @@ -0,0 +1,304 @@ + + + + + + Voltage and current regulator API + + + + Liam + Girdwood + +
+ lrg@slimlogic.co.uk +
+
+
+ + Mark + Brown + + Wolfson Microelectronics +
+ broonie@opensource.wolfsonmicro.com +
+
+
+
+ + + 2007-2008 + Wolfson Microelectronics + + + 2008 + Liam Girdwood + + + + + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License version 2 as published by the Free Software Foundation. + + + + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + + + + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + + + + For more details see the file COPYING in the source + distribution of Linux. + + +
+ + + + + Introduction + + This framework is designed to provide a standard kernel + interface to control voltage and current regulators. + + + The intention is to allow systems to dynamically control + regulator power output in order to save power and prolong + battery life. This applies to both voltage regulators (where + voltage output is controllable) and current sinks (where current + limit is controllable). + + + Note that additional (and currently more complete) documentation + is available in the Linux kernel source under + Documentation/power/regulator. + + + + Glossary + + The regulator API uses a number of terms which may not be + familiar: + + + + + Regulator + + + Electronic device that supplies power to other devices. Most + regulators can enable and disable their output and some can also + control their output voltage or current. + + + + + + Consumer + + + Electronic device which consumes power provided by a regulator. + These may either be static, requiring only a fixed supply, or + dynamic, requiring active management of the regulator at + runtime. + + + + + + Power Domain + + + The electronic circuit supplied by a given regulator, including + the regulator and all consumer devices. The configuration of + the regulator is shared between all the components in the + circuit. + + + + + + Power Management Integrated Circuit + PMIC + + + An IC which contains numerous regulators and often also other + subsystems. In an embedded system the primary PMIC is often + equivalent to a combination of the PSU and southbridge in a + desktop system. + + + + + + + + + Consumer driver interface + + This offers a similar API to the kernel clock framework. + Consumer drivers use get and put operations to acquire and + release regulators. Functions are + provided to enable + and disable the + reguator and to get and set the runtime parameters of the + regulator. + + + When requesting regulators consumers use symbolic names for their + supplies, such as "Vcc", which are mapped into actual regulator + devices by the machine interface. + + + A stub version of this API is provided when the regulator + framework is not in use in order to minimise the need to use + ifdefs. + + + + Enabling and disabling + + The regulator API provides reference counted enabling and + disabling of regulators. Consumer devices use the regulator_enable + and regulator_disable + functions to enable and disable regulators. Calls + to the two functions must be balanced. + + + Note that since multiple consumers may be using a regulator and + machine constraints may not allow the regulator to be disabled + there is no guarantee that calling + regulator_disable will actually cause the + supply provided by the regulator to be disabled. Consumer + drivers should assume that the regulator may be enabled at all + times. + + + + + Configuration + + Some consumer devices may need to be able to dynamically + configure their supplies. For example, MMC drivers may need to + select the correct operating voltage for their cards. This may + be done while the regulator is enabled or disabled. + + + The regulator_set_voltage + and regulator_set_current_limit + functions provide the primary interface for this. + Both take ranges of voltages and currents, supporting drivers + that do not require a specific value (eg, CPU frequency scaling + normally permits the CPU to use a wider range of supply + voltages at lower frequencies but does not require that the + supply voltage be lowered). Where an exact value is required + both minimum and maximum values should be identical. + + + + + Callbacks + + Callbacks may also be registered + for events such as regulation failures. + + + + + + Regulator driver interface + + Drivers for regulator chips register the regulators + with the regulator core, providing operations structures to the + core. A notifier interface + allows error conditions to be reported to the core. + + + Registration should be triggered by explicit setup done by the + platform, supplying a struct + regulator_init_data for the regulator containing + constraint and + supply information. + + + + + Machine interface + + This interface provides a way to define how regulators are + connected to consumers on a given system and what the valid + operating parameters are for the system. + + + + Supplies + + Regulator supplies are specified using struct + regulator_consumer_supply. This is done at + driver registration + time as part of the machine constraints. + + + + + Constraints + + As well as definining the connections the machine interface + also provides constraints definining the operations that + clients are allowed to perform and the parameters that may be + set. This is required since generally regulator devices will + offer more flexibility than it is safe to use on a given + system, for example supporting higher supply voltages than the + consumers are rated for. + + + This is done at driver + registration time by providing a struct + regulation_constraints. + + + The constraints may also specify an initial configuration for the + regulator in the constraints, which is particularly useful for + use with static consumers. + + + + + + API reference + + Due to limitations of the kernel documentation framework and the + existing layout of the source code the entire regulator API is + documented here. + +!Iinclude/linux/regulator/consumer.h +!Iinclude/linux/regulator/machine.h +!Iinclude/linux/regulator/driver.h +!Edrivers/regulator/core.c + +
diff --git a/Documentation/DocBook/uio-howto.tmpl b/Documentation/DocBook/uio-howto.tmpl index df87d1b9360..b787e4721c9 100644 --- a/Documentation/DocBook/uio-howto.tmpl +++ b/Documentation/DocBook/uio-howto.tmpl @@ -41,6 +41,12 @@ GPL version 2. + + 0.6 + 2008-12-05 + hjk + Added description of portio sysfs attributes. + 0.5 2008-05-22 @@ -318,6 +324,54 @@ interested in translating it, please email me offset = N * getpagesize(); + + Sometimes there is hardware with memory-like regions that can not be + mapped with the technique described here, but there are still ways to + access them from userspace. The most common example are x86 ioports. + On x86 systems, userspace can access these ioports using + ioperm(), iopl(), + inb(), outb(), and similar + functions. + + + Since these ioport regions can not be mapped, they will not appear under + /sys/class/uio/uioX/maps/ like the normal memory + described above. Without information about the port regions a hardware + has to offer, it becomes difficult for the userspace part of the + driver to find out which ports belong to which UIO device. + + + To address this situation, the new directory + /sys/class/uio/uioX/portio/ was added. It only + exists if the driver wants to pass information about one or more port + regions to userspace. If that is the case, subdirectories named + port0, port1, and so on, + will appear underneath + /sys/class/uio/uioX/portio/. + + + Each portX/ directory contains three read-only + files that show start, size, and type of the port region: + + + + + start: The first port of this region. + + + + + size: The number of ports in this region. + + + + + porttype: A string describing the type of port. + + + + + @@ -339,12 +393,12 @@ offset = N * getpagesize(); -char *name: Required. The name of your driver as +const char *name: Required. The name of your driver as it will appear in sysfs. I recommend using the name of your module for this. -char *version: Required. This string appears in +const char *version: Required. This string appears in /sys/class/uio/uioX/version. @@ -355,6 +409,13 @@ mapping you need to fill one of the uio_mem structures. See the description below for details. + +struct uio_port port[ MAX_UIO_PORTS_REGIONS ]: Required +if you want to pass information about ioports to userspace. For each port +region you need to fill one of the uio_port structures. +See the description below for details. + + long irq: Required. If your hardware generates an interrupt, it's your modules task to determine the irq number during @@ -448,6 +509,42 @@ Please do not touch the kobj element of struct uio_mem! It is used by the UIO framework to set up sysfs files for this mapping. Simply leave it alone. + + +Sometimes, your device can have one or more port regions which can not be +mapped to userspace. But if there are other possibilities for userspace to +access these ports, it makes sense to make information about the ports +available in sysfs. For each region, you have to set up a +struct uio_port in the port[] array. +Here's a description of the fields of struct uio_port: + + + + +char *porttype: Required. Set this to one of the predefined +constants. Use UIO_PORT_X86 for the ioports found in x86 +architectures. + + + +unsigned long start: Required if the port region is used. +Fill in the number of the first port of this region. + + + +unsigned long size: Fill in the number of ports in this +region. If size is zero, the region is considered unused. +Note that you must initialize size +with zero for all unused regions. + + + + +Please do not touch the portio element of +struct uio_port! It is used internally by the UIO +framework to set up sysfs files for this region. Simply leave it alone. + + diff --git a/Documentation/DocBook/wanbook.tmpl b/Documentation/DocBook/wanbook.tmpl deleted file mode 100644 index 8c93db122f0..00000000000 --- a/Documentation/DocBook/wanbook.tmpl +++ /dev/null @@ -1,99 +0,0 @@ - - - - - - Synchronous PPP and Cisco HDLC Programming Guide - - - - Alan - Cox - -
- alan@lxorguk.ukuu.org.uk -
-
-
-
- - - 2000 - Alan Cox - - - - - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later - version. - - - - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details. - - - - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - MA 02111-1307 USA - - - - For more details see the file COPYING in the source - distribution of Linux. - - -
- - - - - Introduction - - The syncppp drivers in Linux provide a fairly complete - implementation of Cisco HDLC and a minimal implementation of - PPP. The longer term goal is to switch the PPP layer to the - generic PPP interface that is new in Linux 2.3.x. The API should - remain unchanged when this is done, but support will then be - available for IPX, compression and other PPP features - - - - Known Bugs And Assumptions - - - PPP is minimal - - - The current PPP implementation is very basic, although sufficient - for most wan usages. - - - - Cisco HDLC Quirks - - - Currently we do not end all packets with the correct Cisco multicast - or unicast flags. Nothing appears to mind too much but this should - be corrected. - - - - - - - - - Public Functions Provided -!Edrivers/net/wan/syncppp.c - - -
diff --git a/Documentation/MSI-HOWTO.txt b/Documentation/MSI-HOWTO.txt deleted file mode 100644 index 256defd7e17..00000000000 --- a/Documentation/MSI-HOWTO.txt +++ /dev/null @@ -1,509 +0,0 @@ - The MSI Driver Guide HOWTO - Tom L Nguyen tom.l.nguyen@intel.com - 10/03/2003 - Revised Feb 12, 2004 by Martine Silbermann - email: Martine.Silbermann@hp.com - Revised Jun 25, 2004 by Tom L Nguyen - -1. About this guide - -This guide describes the basics of Message Signaled Interrupts (MSI), -the advantages of using MSI over traditional interrupt mechanisms, -and how to enable your driver to use MSI or MSI-X. Also included is -a Frequently Asked Questions (FAQ) section. - -1.1 Terminology - -PCI devices can be single-function or multi-function. In either case, -when this text talks about enabling or disabling MSI on a "device -function," it is referring to one specific PCI device and function and -not to all functions on a PCI device (unless the PCI device has only -one function). - -2. Copyright 2003 Intel Corporation - -3. What is MSI/MSI-X? - -Message Signaled Interrupt (MSI), as described in the PCI Local Bus -Specification Revision 2.3 or later, is an optional feature, and a -required feature for PCI Express devices. MSI enables a device function -to request service by sending an Inbound Memory Write on its PCI bus to -the FSB as a Message Signal Interrupt transaction. Because MSI is -generated in the form of a Memory Write, all transaction conditions, -such as a Retry, Master-Abort, Target-Abort or normal completion, are -supported. - -A PCI device that supports MSI must also support pin IRQ assertion -interrupt mechanism to provide backward compatibility for systems that -do not support MSI. In systems which support MSI, the bus driver is -responsible for initializing the message address and message data of -the device function's MSI/MSI-X capability structure during device -initial configuration. - -An MSI capable device function indicates MSI support by implementing -the MSI/MSI-X capability structure in its PCI capability list. The -device function may implement both the MSI capability structure and -the MSI-X capability structure; however, the bus driver should not -enable both. - -The MSI capability structure contains Message Control register, -Message Address register and Message Data register. These registers -provide the bus driver control over MSI. The Message Control register -indicates the MSI capability supported by the device. The Message -Address register specifies the target address and the Message Data -register specifies the characteristics of the message. To request -service, the device function writes the content of the Message Data -register to the target address. The device and its software driver -are prohibited from writing to these registers. - -The MSI-X capability structure is an optional extension to MSI. It -uses an independent and separate capability structure. There are -some key advantages to implementing the MSI-X capability structure -over the MSI capability structure as described below. - - - Support a larger maximum number of vectors per function. - - - Provide the ability for system software to configure - each vector with an independent message address and message - data, specified by a table that resides in Memory Space. - - - MSI and MSI-X both support per-vector masking. Per-vector - masking is an optional extension of MSI but a required - feature for MSI-X. Per-vector masking provides the kernel the - ability to mask/unmask a single MSI while running its - interrupt service routine. If per-vector masking is - not supported, then the device driver should provide the - hardware/software synchronization to ensure that the device - generates MSI when the driver wants it to do so. - -4. Why use MSI? - -As a benefit to the simplification of board design, MSI allows board -designers to remove out-of-band interrupt routing. MSI is another -step towards a legacy-free environment. - -Due to increasing pressure on chipset and processor packages to -reduce pin count, the need for interrupt pins is expected to -diminish over time. Devices, due to pin constraints, may implement -messages to increase performance. - -PCI Express endpoints uses INTx emulation (in-band messages) instead -of IRQ pin assertion. Using INTx emulation requires interrupt -sharing among devices connected to the same node (PCI bridge) while -MSI is unique (non-shared) and does not require BIOS configuration -support. As a result, the PCI Express technology requires MSI -support for better interrupt performance. - -Using MSI enables the device functions to support two or more -vectors, which can be configured to target different CPUs to -increase scalability. - -5. Configuring a driver to use MSI/MSI-X - -By default, the kernel will not enable MSI/MSI-X on all devices that -support this capability. The CONFIG_PCI_MSI kernel option -must be selected to enable MSI/MSI-X support. - -5.1 Including MSI/MSI-X support into the kernel - -To allow MSI/MSI-X capable device drivers to selectively enable -MSI/MSI-X (using pci_enable_msi()/pci_enable_msix() as described -below), the VECTOR based scheme needs to be enabled by setting -CONFIG_PCI_MSI during kernel config. - -Since the target of the inbound message is the local APIC, providing -CONFIG_X86_LOCAL_APIC must be enabled as well as CONFIG_PCI_MSI. - -5.2 Configuring for MSI support - -Due to the non-contiguous fashion in vector assignment of the -existing Linux kernel, this version does not support multiple -messages regardless of a device function is capable of supporting -more than one vector. To enable MSI on a device function's MSI -capability structure requires a device driver to call the function -pci_enable_msi() explicitly. - -5.2.1 API pci_enable_msi - -int pci_enable_msi(struct pci_dev *dev) - -With this new API, a device driver that wants to have MSI -enabled on its device function must call this API to enable MSI. -A successful call will initialize the MSI capability structure -with ONE vector, regardless of whether a device function is -capable of supporting multiple messages. This vector replaces the -pre-assigned dev->irq with a new MSI vector. To avoid a conflict -of the new assigned vector with existing pre-assigned vector requires -a device driver to call this API before calling request_irq(). - -5.2.2 API pci_disable_msi - -void pci_disable_msi(struct pci_dev *dev) - -This API should always be used to undo the effect of pci_enable_msi() -when a device driver is unloading. This API restores dev->irq with -the pre-assigned IOAPIC vector and switches a device's interrupt -mode to PCI pin-irq assertion/INTx emulation mode. - -Note that a device driver should always call free_irq() on the MSI vector -that it has done request_irq() on before calling this API. Failure to do -so results in a BUG_ON() and a device will be left with MSI enabled and -leaks its vector. - -5.2.3 MSI mode vs. legacy mode diagram - -The below diagram shows the events which switch the interrupt -mode on the MSI-capable device function between MSI mode and -PIN-IRQ assertion mode. - - ------------ pci_enable_msi ------------------------ - | | <=============== | | - | MSI MODE | | PIN-IRQ ASSERTION MODE | - | | ===============> | | - ------------ pci_disable_msi ------------------------ - - -Figure 1. MSI Mode vs. Legacy Mode - -In Figure 1, a device operates by default in legacy mode. Legacy -in this context means PCI pin-irq assertion or PCI-Express INTx -emulation. A successful MSI request (using pci_enable_msi()) switches -a device's interrupt mode to MSI mode. A pre-assigned IOAPIC vector -stored in dev->irq will be saved by the PCI subsystem and a new -assigned MSI vector will replace dev->irq. - -To return back to its default mode, a device driver should always call -pci_disable_msi() to undo the effect of pci_enable_msi(). Note that a -device driver should always call free_irq() on the MSI vector it has -done request_irq() on before calling pci_disable_msi(). Failure to do -so results in a BUG_ON() and a device will be left with MSI enabled and -leaks its vector. Otherwise, the PCI subsystem restores a device's -dev->irq with a pre-assigned IOAPIC vector and marks the released -MSI vector as unused. - -Once being marked as unused, there is no guarantee that the PCI -subsystem will reserve this MSI vector for a device. Depending on -the availability of current PCI vector resources and the number of -MSI/MSI-X requests from other drivers, this MSI may be re-assigned. - -For the case where the PCI subsystem re-assigns this MSI vector to -another driver, a request to switch back to MSI mode may result -in being assigned a different MSI vector or a failure if no more -vectors are available. - -5.3 Configuring for MSI-X support - -Due to the ability of the system software to configure each vector of -the MSI-X capability structure with an independent message address -and message data, the non-contiguous fashion in vector assignment of -the existing Linux kernel has no impact on supporting multiple -messages on an MSI-X capable device functions. To enable MSI-X on -a device function's MSI-X capability structure requires its device -driver to call the function pci_enable_msix() explicitly. - -The function pci_enable_msix(), once invoked, enables either -all or nothing, depending on the current availability of PCI vector -resources. If the PCI vector resources are available for the number -of vectors requested by a device driver, this function will configure -the MSI-X table of the MSI-X capability structure of a device with -requested messages. To emphasize this reason, for example, a device -may be capable for supporting the maximum of 32 vectors while its -software driver usually may request 4 vectors. It is recommended -that the device driver should call this function once during the -initialization phase of the device driver. - -Unlike the function pci_enable_msi(), the function pci_enable_msix() -does not replace the pre-assigned IOAPIC dev->irq with a new MSI -vector because the PCI subsystem writes the 1:1 vector-to-entry mapping -into the field vector of each element contained in a second argument. -Note that the pre-assigned IOAPIC dev->irq is valid only if the device -operates in PIN-IRQ assertion mode. In MSI-X mode, any attempt at -using dev->irq by the device driver to request for interrupt service -may result in unpredictable behavior. - -For each MSI-X vector granted, a device driver is responsible for calling -other functions like request_irq(), enable_irq(), etc. to enable -this vector with its corresponding interrupt service handler. It is -a device driver's choice to assign all vectors with the same -interrupt service handler or each vector with a unique interrupt -service handler. - -5.3.1 Handling MMIO address space of MSI-X Table - -The PCI 3.0 specification has implementation notes that MMIO address -space for a device's MSI-X structure should be isolated so that the -software system can set different pages for controlling accesses to the -MSI-X structure. The implementation of MSI support requires the PCI -subsystem, not a device driver, to maintain full control of the MSI-X -table/MSI-X PBA (Pending Bit Array) and MMIO address space of the MSI-X -table/MSI-X PBA. A device driver should not access the MMIO address -space of the MSI-X table/MSI-X PBA. - -5.3.2 API pci_enable_msix - -int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec) - -This API enables a device driver to request the PCI subsystem -to enable MSI-X messages on its hardware device. Depending on -the availability of PCI vectors resources, the PCI subsystem enables -either all or none of the requested vectors. - -Argument 'dev' points to the device (pci_dev) structure. - -Argument 'entries' is a pointer to an array of msix_entry structs. -The number of entries is indicated in argument 'nvec'. -struct msix_entry is defined in /driver/pci/msi.h: - -struct msix_entry { - u16 vector; /* kernel uses to write alloc vector */ - u16 entry; /* driver uses to specify entry */ -}; - -A device driver is responsible for initializing the field 'entry' of -each element with a unique entry supported by MSI-X table. Otherwise, --EINVAL will be returned as a result. A successful return of zero -indicates the PCI subsystem completed initializing each of the requested -entries of the MSI-X table with message address and message data. -Last but not least, the PCI subsystem will write the 1:1 -vector-to-entry mapping into the field 'vector' of each element. A -device driver is responsible for keeping track of allocated MSI-X -vectors in its internal data structure. - -A return of zero indicates that the number of MSI-X vectors was -successfully allocated. A return of greater than zero indicates -MSI-X vector shortage. Or a return of less than zero indicates -a failure. This failure may be a result of duplicate entries -specified in second argument, or a result of no available vector, -or a result of failing to initialize MSI-X table entries. - -5.3.3 API pci_disable_msix - -void pci_disable_msix(struct pci_dev *dev) - -This API should always be used to undo the effect of pci_enable_msix() -when a device driver is unloading. Note that a device driver should -always call free_irq() on all MSI-X vectors it has done request_irq() -on before calling this API. Failure to do so results in a BUG_ON() and -a device will be left with MSI-X enabled and leaks its vectors. - -5.3.4 MSI-X mode vs. legacy mode diagram - -The below diagram shows the events which switch the interrupt -mode on the MSI-X capable device function between MSI-X mode and -PIN-IRQ assertion mode (legacy). - - ------------ pci_enable_msix(,,n) ------------------------ - | | <=============== | | - | MSI-X MODE | | PIN-IRQ ASSERTION MODE | - | | ===============> | | - ------------ pci_disable_msix ------------------------ - -Figure 2. MSI-X Mode vs. Legacy Mode - -In Figure 2, a device operates by default in legacy mode. A -successful MSI-X request (using pci_enable_msix()) switches a -device's interrupt mode to MSI-X mode. A pre-assigned IOAPIC vector -stored in dev->irq will be saved by the PCI subsystem; however, -unlike MSI mode, the PCI subsystem will not replace dev->irq with -assigned MSI-X vector because the PCI subsystem already writes the 1:1 -vector-to-entry mapping into the field 'vector' of each element -specified in second argument. - -To return back to its default mode, a device driver should always call -pci_disable_msix() to undo the effect of pci_enable_msix(). Note that -a device driver should always call free_irq() on all MSI-X vectors it -has done request_irq() on before calling pci_disable_msix(). Failure -to do so results in a BUG_ON() and a device will be left with MSI-X -enabled and leaks its vectors. Otherwise, the PCI subsystem switches a -device function's interrupt mode from MSI-X mode to legacy mode and -marks all allocated MSI-X vectors as unused. - -Once being marked as unused, there is no guarantee that the PCI -subsystem will reserve these MSI-X vectors for a device. Depending on -the availability of current PCI vector resources and the number of -MSI/MSI-X requests from other drivers, these MSI-X vectors may be -re-assigned. - -For the case where the PCI subsystem re-assigned these MSI-X vectors -to other drivers, a request to switch back to MSI-X mode may result -being assigned with another set of MSI-X vectors or a failure if no -more vectors are available. - -5.4 Handling function implementing both MSI and MSI-X capabilities - -For the case where a function implements both MSI and MSI-X -capabilities, the PCI subsystem enables a device to run either in MSI -mode or MSI-X mode but not both. A device driver determines whether it -wants MSI or MSI-X enabled on its hardware device. Once a device -driver requests for MSI, for example, it is prohibited from requesting -MSI-X; in other words, a device driver is not permitted to ping-pong -between MSI mod MSI-X mode during a run-time. - -5.5 Hardware requirements for MSI/MSI-X support - -MSI/MSI-X support requires support from both system hardware and -individual hardware device functions. - -5.5.1 Required x86 hardware support - -Since the target of MSI address is the local APIC CPU, enabling -MSI/MSI-X support in the Linux kernel is dependent on whether existing -system hardware supports local APIC. Users should verify that their -system supports local APIC operation by testing that it runs when -CONFIG_X86_LOCAL_APIC=y. - -In SMP environment, CONFIG_X86_LOCAL_APIC is automatically set; -however, in UP environment, users must manually set -CONFIG_X86_LOCAL_APIC. Once CONFIG_X86_LOCAL_APIC=y, setting -CONFIG_PCI_MSI enables the VECTOR based scheme and the option for -MSI-capable device drivers to selectively enable MSI/MSI-X. - -Note that CONFIG_X86_IO_APIC setting is irrelevant because MSI/MSI-X -vector is allocated new during runtime and MSI/MSI-X support does not -depend on BIOS support. This key independency enables MSI/MSI-X -support on future IOxAPIC free platforms. - -5.5.2 Device hardware support - -The hardware device function supports MSI by indicating the -MSI/MSI-X capability structure on its PCI capability list. By -default, this capability structure will not be initialized by -the kernel to enable MSI during the system boot. In other words, -the device function is running on its default pin assertion mode. -Note that in many cases the hardware supporting MSI have bugs, -which may result in system hangs. The software driver of specific -MSI-capable hardware is responsible for deciding whether to call -pci_enable_msi or not. A return of zero indicates the kernel -successfully initialized the MSI/MSI-X capability structure of the -device function. The device function is now running on MSI/MSI-X mode. - -5.6 How to tell whether MSI/MSI-X is enabled on device function - -At the driver level, a return of zero from the function call of -pci_enable_msi()/pci_enable_msix() indicates to a device driver that -its device function is initialized successfully and ready to run in -MSI/MSI-X mode. - -At the user level, users can use the command 'cat /proc/interrupts' -to display the vectors allocated for devices and their interrupt -MSI/MSI-X modes ("PCI-MSI"/"PCI-MSI-X"). Below shows MSI mode is -enabled on a SCSI Adaptec 39320D Ultra320 controller. - - CPU0 CPU1 - 0: 324639 0 IO-APIC-edge timer - 1: 1186 0 IO-APIC-edge i8042 - 2: 0 0 XT-PIC cascade - 12: 2797 0 IO-APIC-edge i8042 - 14: 6543 0 IO-APIC-edge ide0 - 15: 1 0 IO-APIC-edge ide1 -169: 0 0 IO-APIC-level uhci-hcd -185: 0 0 IO-APIC-level uhci-hcd -193: 138 10 PCI-MSI aic79xx -201: 30 0 PCI-MSI aic79xx -225: 30 0 IO-APIC-level aic7xxx -233: 30 0 IO-APIC-level aic7xxx -NMI: 0 0 -LOC: 324553 325068 -ERR: 0 -MIS: 0 - -6. MSI quirks - -Several PCI chipsets or devices are known to not support MSI. -The PCI stack provides 3 possible levels of MSI disabling: -* on a single device -* on all devices behind a specific bridge -* globally - -6.1. Disabling MSI on a single device - -Under some circumstances it might be required to disable MSI on a -single device. This may be achieved by either not calling pci_enable_msi() -or all, or setting the pci_dev->no_msi flag before (most of the time -in a quirk). - -6.2. Disabling MSI below a bridge - -The vast majority of MSI quirks are required by PCI bridges not -being able to route MSI between busses. In this case, MSI have to be -disabled on all devices behind this bridge. It is achieves by setting -the PCI_BUS_FLAGS_NO_MSI flag in the pci_bus->bus_flags of the bridge -subordinate bus. There is no need to set the same flag on bridges that -are below the broken bridge. When pci_enable_msi() is called to enable -MSI on a device, pci_msi_supported() takes care of checking the NO_MSI -flag in all parent busses of the device. - -Some bridges actually support dynamic MSI support enabling/disabling -by changing some bits in their PCI configuration space (especially -the Hypertransport chipsets such as the nVidia nForce and Serverworks -HT2000). It may then be required to update the NO_MSI flag on the -corresponding devices in the sysfs hierarchy. To enable MSI support -on device "0000:00:0e", do: - - echo 1 > /sys/bus/pci/devices/0000:00:0e/msi_bus - -To disable MSI support, echo 0 instead of 1. Note that it should be -used with caution since changing this value might break interrupts. - -6.3. Disabling MSI globally - -Some extreme cases may require to disable MSI globally on the system. -For now, the only known case is a Serverworks PCI-X chipsets (MSI are -not supported on several busses that are not all connected to the -chipset in the Linux PCI hierarchy). In the vast majority of other -cases, disabling only behind a specific bridge is enough. - -For debugging purpose, the user may also pass pci=nomsi on the kernel -command-line to explicitly disable MSI globally. But, once the appro- -priate quirks are added to the kernel, this option should not be -required anymore. - -6.4. Finding why MSI cannot be enabled on a device - -Assuming that MSI are not enabled on a device, you should look at -dmesg to find messages that quirks may output when disabling MSI -on some devices, some bridges or even globally. -Then, lspci -t gives the list of bridges above a device. Reading -/sys/bus/pci/devices/0000:00:0e/msi_bus will tell you whether MSI -are enabled (1) or disabled (0). In 0 is found in a single bridge -msi_bus file above the device, MSI cannot be enabled. - -7. FAQ - -Q1. Are there any limitations on using the MSI? - -A1. If the PCI device supports MSI and conforms to the -specification and the platform supports the APIC local bus, -then using MSI should work. - -Q2. Will it work on all the Pentium processors (P3, P4, Xeon, -AMD processors)? In P3 IPI's are transmitted on the APIC local -bus and in P4 and Xeon they are transmitted on the system -bus. Are there any implications with this? - -A2. MSI support enables a PCI device sending an inbound -memory write (0xfeexxxxx as target address) on its PCI bus -directly to the FSB. Since the message address has a -redirection hint bit cleared, it should work. - -Q3. The target address 0xfeexxxxx will be translated by the -Host Bridge into an interrupt message. Are there any -limitations on the chipsets such as Intel 8xx, Intel e7xxx, -or VIA? - -A3. If these chipsets support an inbound memory write with -target address set as 0xfeexxxxx, as conformed to PCI -specification 2.3 or latest, then it should work. - -Q4. From the driver point of view, if the MSI is lost because -of errors occurring during inbound memory write, then it may -wait forever. Is there a mechanism for it to recover? - -A4. Since the target of the transaction is an inbound memory -write, all transaction termination conditions (Retry, -Master-Abort, Target-Abort, or normal completion) are -supported. A device sending an MSI must abide by all the PCI -rules and conditions regarding that inbound memory write. So, -if a retry is signaled it must retry, etc... We believe that -the recommendation for Abort is also a retry (refer to PCI -specification 2.3 or latest). diff --git a/Documentation/PCI/00-INDEX b/Documentation/PCI/00-INDEX index 49f43946c6b..812b17fe3ed 100644 --- a/Documentation/PCI/00-INDEX +++ b/Documentation/PCI/00-INDEX @@ -1,5 +1,7 @@ 00-INDEX - this file +MSI-HOWTO.txt + - the Message Signaled Interrupts (MSI) Driver Guide HOWTO and FAQ. PCI-DMA-mapping.txt - info for PCI drivers using DMA portably across all platforms PCIEBUS-HOWTO.txt diff --git a/Documentation/PCI/MSI-HOWTO.txt b/Documentation/PCI/MSI-HOWTO.txt new file mode 100644 index 00000000000..256defd7e17 --- /dev/null +++ b/Documentation/PCI/MSI-HOWTO.txt @@ -0,0 +1,509 @@ + The MSI Driver Guide HOWTO + Tom L Nguyen tom.l.nguyen@intel.com + 10/03/2003 + Revised Feb 12, 2004 by Martine Silbermann + email: Martine.Silbermann@hp.com + Revised Jun 25, 2004 by Tom L Nguyen + +1. About this guide + +This guide describes the basics of Message Signaled Interrupts (MSI), +the advantages of using MSI over traditional interrupt mechanisms, +and how to enable your driver to use MSI or MSI-X. Also included is +a Frequently Asked Questions (FAQ) section. + +1.1 Terminology + +PCI devices can be single-function or multi-function. In either case, +when this text talks about enabling or disabling MSI on a "device +function," it is referring to one specific PCI device and function and +not to all functions on a PCI device (unless the PCI device has only +one function). + +2. Copyright 2003 Intel Corporation + +3. What is MSI/MSI-X? + +Message Signaled Interrupt (MSI), as described in the PCI Local Bus +Specification Revision 2.3 or later, is an optional feature, and a +required feature for PCI Express devices. MSI enables a device function +to request service by sending an Inbound Memory Write on its PCI bus to +the FSB as a Message Signal Interrupt transaction. Because MSI is +generated in the form of a Memory Write, all transaction conditions, +such as a Retry, Master-Abort, Target-Abort or normal completion, are +supported. + +A PCI device that supports MSI must also support pin IRQ assertion +interrupt mechanism to provide backward compatibility for systems that +do not support MSI. In systems which support MSI, the bus driver is +responsible for initializing the message address and message data of +the device function's MSI/MSI-X capability structure during device +initial configuration. + +An MSI capable device function indicates MSI support by implementing +the MSI/MSI-X capability structure in its PCI capability list. The +device function may implement both the MSI capability structure and +the MSI-X capability structure; however, the bus driver should not +enable both. + +The MSI capability structure contains Message Control register, +Message Address register and Message Data register. These registers +provide the bus driver control over MSI. The Message Control register +indicates the MSI capability supported by the device. The Message +Address register specifies the target address and the Message Data +register specifies the characteristics of the message. To request +service, the device function writes the content of the Message Data +register to the target address. The device and its software driver +are prohibited from writing to these registers. + +The MSI-X capability structure is an optional extension to MSI. It +uses an independent and separate capability structure. There are +some key advantages to implementing the MSI-X capability structure +over the MSI capability structure as described below. + + - Support a larger maximum number of vectors per function. + + - Provide the ability for system software to configure + each vector with an independent message address and message + data, specified by a table that resides in Memory Space. + + - MSI and MSI-X both support per-vector masking. Per-vector + masking is an optional extension of MSI but a required + feature for MSI-X. Per-vector masking provides the kernel the + ability to mask/unmask a single MSI while running its + interrupt service routine. If per-vector masking is + not supported, then the device driver should provide the + hardware/software synchronization to ensure that the device + generates MSI when the driver wants it to do so. + +4. Why use MSI? + +As a benefit to the simplification of board design, MSI allows board +designers to remove out-of-band interrupt routing. MSI is another +step towards a legacy-free environment. + +Due to increasing pressure on chipset and processor packages to +reduce pin count, the need for interrupt pins is expected to +diminish over time. Devices, due to pin constraints, may implement +messages to increase performance. + +PCI Express endpoints uses INTx emulation (in-band messages) instead +of IRQ pin assertion. Using INTx emulation requires interrupt +sharing among devices connected to the same node (PCI bridge) while +MSI is unique (non-shared) and does not require BIOS configuration +support. As a result, the PCI Express technology requires MSI +support for better interrupt performance. + +Using MSI enables the device functions to support two or more +vectors, which can be configured to target different CPUs to +increase scalability. + +5. Configuring a driver to use MSI/MSI-X + +By default, the kernel will not enable MSI/MSI-X on all devices that +support this capability. The CONFIG_PCI_MSI kernel option +must be selected to enable MSI/MSI-X support. + +5.1 Including MSI/MSI-X support into the kernel + +To allow MSI/MSI-X capable device drivers to selectively enable +MSI/MSI-X (using pci_enable_msi()/pci_enable_msix() as described +below), the VECTOR based scheme needs to be enabled by setting +CONFIG_PCI_MSI during kernel config. + +Since the target of the inbound message is the local APIC, providing +CONFIG_X86_LOCAL_APIC must be enabled as well as CONFIG_PCI_MSI. + +5.2 Configuring for MSI support + +Due to the non-contiguous fashion in vector assignment of the +existing Linux kernel, this version does not support multiple +messages regardless of a device function is capable of supporting +more than one vector. To enable MSI on a device function's MSI +capability structure requires a device driver to call the function +pci_enable_msi() explicitly. + +5.2.1 API pci_enable_msi + +int pci_enable_msi(struct pci_dev *dev) + +With this new API, a device driver that wants to have MSI +enabled on its device function must call this API to enable MSI. +A successful call will initialize the MSI capability structure +with ONE vector, regardless of whether a device function is +capable of supporting multiple messages. This vector replaces the +pre-assigned dev->irq with a new MSI vector. To avoid a conflict +of the new assigned vector with existing pre-assigned vector requires +a device driver to call this API before calling request_irq(). + +5.2.2 API pci_disable_msi + +void pci_disable_msi(struct pci_dev *dev) + +This API should always be used to undo the effect of pci_enable_msi() +when a device driver is unloading. This API restores dev->irq with +the pre-assigned IOAPIC vector and switches a device's interrupt +mode to PCI pin-irq assertion/INTx emulation mode. + +Note that a device driver should always call free_irq() on the MSI vector +that it has done request_irq() on before calling this API. Failure to do +so results in a BUG_ON() and a device will be left with MSI enabled and +leaks its vector. + +5.2.3 MSI mode vs. legacy mode diagram + +The below diagram shows the events which switch the interrupt +mode on the MSI-capable device function between MSI mode and +PIN-IRQ assertion mode. + + ------------ pci_enable_msi ------------------------ + | | <=============== | | + | MSI MODE | | PIN-IRQ ASSERTION MODE | + | | ===============> | | + ------------ pci_disable_msi ------------------------ + + +Figure 1. MSI Mode vs. Legacy Mode + +In Figure 1, a device operates by default in legacy mode. Legacy +in this context means PCI pin-irq assertion or PCI-Express INTx +emulation. A successful MSI request (using pci_enable_msi()) switches +a device's interrupt mode to MSI mode. A pre-assigned IOAPIC vector +stored in dev->irq will be saved by the PCI subsystem and a new +assigned MSI vector will replace dev->irq. + +To return back to its default mode, a device driver should always call +pci_disable_msi() to undo the effect of pci_enable_msi(). Note that a +device driver should always call free_irq() on the MSI vector it has +done request_irq() on before calling pci_disable_msi(). Failure to do +so results in a BUG_ON() and a device will be left with MSI enabled and +leaks its vector. Otherwise, the PCI subsystem restores a device's +dev->irq with a pre-assigned IOAPIC vector and marks the released +MSI vector as unused. + +Once being marked as unused, there is no guarantee that the PCI +subsystem will reserve this MSI vector for a device. Depending on +the availability of current PCI vector resources and the number of +MSI/MSI-X requests from other drivers, this MSI may be re-assigned. + +For the case where the PCI subsystem re-assigns this MSI vector to +another driver, a request to switch back to MSI mode may result +in being assigned a different MSI vector or a failure if no more +vectors are available. + +5.3 Configuring for MSI-X support + +Due to the ability of the system software to configure each vector of +the MSI-X capability structure with an independent message address +and message data, the non-contiguous fashion in vector assignment of +the existing Linux kernel has no impact on supporting multiple +messages on an MSI-X capable device functions. To enable MSI-X on +a device function's MSI-X capability structure requires its device +driver to call the function pci_enable_msix() explicitly. + +The function pci_enable_msix(), once invoked, enables either +all or nothing, depending on the current availability of PCI vector +resources. If the PCI vector resources are available for the number +of vectors requested by a device driver, this function will configure +the MSI-X table of the MSI-X capability structure of a device with +requested messages. To emphasize this reason, for example, a device +may be capable for supporting the maximum of 32 vectors while its +software driver usually may request 4 vectors. It is recommended +that the device driver should call this function once during the +initialization phase of the device driver. + +Unlike the function pci_enable_msi(), the function pci_enable_msix() +does not replace the pre-assigned IOAPIC dev->irq with a new MSI +vector because the PCI subsystem writes the 1:1 vector-to-entry mapping +into the field vector of each element contained in a second argument. +Note that the pre-assigned IOAPIC dev->irq is valid only if the device +operates in PIN-IRQ assertion mode. In MSI-X mode, any attempt at +using dev->irq by the device driver to request for interrupt service +may result in unpredictable behavior. + +For each MSI-X vector granted, a device driver is responsible for calling +other functions like request_irq(), enable_irq(), etc. to enable +this vector with its corresponding interrupt service handler. It is +a device driver's choice to assign all vectors with the same +interrupt service handler or each vector with a unique interrupt +service handler. + +5.3.1 Handling MMIO address space of MSI-X Table + +The PCI 3.0 specification has implementation notes that MMIO address +space for a device's MSI-X structure should be isolated so that the +software system can set different pages for controlling accesses to the +MSI-X structure. The implementation of MSI support requires the PCI +subsystem, not a device driver, to maintain full control of the MSI-X +table/MSI-X PBA (Pending Bit Array) and MMIO address space of the MSI-X +table/MSI-X PBA. A device driver should not access the MMIO address +space of the MSI-X table/MSI-X PBA. + +5.3.2 API pci_enable_msix + +int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec) + +This API enables a device driver to request the PCI subsystem +to enable MSI-X messages on its hardware device. Depending on +the availability of PCI vectors resources, the PCI subsystem enables +either all or none of the requested vectors. + +Argument 'dev' points to the device (pci_dev) structure. + +Argument 'entries' is a pointer to an array of msix_entry structs. +The number of entries is indicated in argument 'nvec'. +struct msix_entry is defined in /driver/pci/msi.h: + +struct msix_entry { + u16 vector; /* kernel uses to write alloc vector */ + u16 entry; /* driver uses to specify entry */ +}; + +A device driver is responsible for initializing the field 'entry' of +each element with a unique entry supported by MSI-X table. Otherwise, +-EINVAL will be returned as a result. A successful return of zero +indicates the PCI subsystem completed initializing each of the requested +entries of the MSI-X table with message address and message data. +Last but not least, the PCI subsystem will write the 1:1 +vector-to-entry mapping into the field 'vector' of each element. A +device driver is responsible for keeping track of allocated MSI-X +vectors in its internal data structure. + +A return of zero indicates that the number of MSI-X vectors was +successfully allocated. A return of greater than zero indicates +MSI-X vector shortage. Or a return of less than zero indicates +a failure. This failure may be a result of duplicate entries +specified in second argument, or a result of no available vector, +or a result of failing to initialize MSI-X table entries. + +5.3.3 API pci_disable_msix + +void pci_disable_msix(struct pci_dev *dev) + +This API should always be used to undo the effect of pci_enable_msix() +when a device driver is unloading. Note that a device driver should +always call free_irq() on all MSI-X vectors it has done request_irq() +on before calling this API. Failure to do so results in a BUG_ON() and +a device will be left with MSI-X enabled and leaks its vectors. + +5.3.4 MSI-X mode vs. legacy mode diagram + +The below diagram shows the events which switch the interrupt +mode on the MSI-X capable device function between MSI-X mode and +PIN-IRQ assertion mode (legacy). + + ------------ pci_enable_msix(,,n) ------------------------ + | | <=============== | | + | MSI-X MODE | | PIN-IRQ ASSERTION MODE | + | | ===============> | | + ------------ pci_disable_msix ------------------------ + +Figure 2. MSI-X Mode vs. Legacy Mode + +In Figure 2, a device operates by default in legacy mode. A +successful MSI-X request (using pci_enable_msix()) switches a +device's interrupt mode to MSI-X mode. A pre-assigned IOAPIC vector +stored in dev->irq will be saved by the PCI subsystem; however, +unlike MSI mode, the PCI subsystem will not replace dev->irq with +assigned MSI-X vector because the PCI subsystem already writes the 1:1 +vector-to-entry mapping into the field 'vector' of each element +specified in second argument. + +To return back to its default mode, a device driver should always call +pci_disable_msix() to undo the effect of pci_enable_msix(). Note that +a device driver should always call free_irq() on all MSI-X vectors it +has done request_irq() on before calling pci_disable_msix(). Failure +to do so results in a BUG_ON() and a device will be left with MSI-X +enabled and leaks its vectors. Otherwise, the PCI subsystem switches a +device function's interrupt mode from MSI-X mode to legacy mode and +marks all allocated MSI-X vectors as unused. + +Once being marked as unused, there is no guarantee that the PCI +subsystem will reserve these MSI-X vectors for a device. Depending on +the availability of current PCI vector resources and the number of +MSI/MSI-X requests from other drivers, these MSI-X vectors may be +re-assigned. + +For the case where the PCI subsystem re-assigned these MSI-X vectors +to other drivers, a request to switch back to MSI-X mode may result +being assigned with another set of MSI-X vectors or a failure if no +more vectors are available. + +5.4 Handling function implementing both MSI and MSI-X capabilities + +For the case where a function implements both MSI and MSI-X +capabilities, the PCI subsystem enables a device to run either in MSI +mode or MSI-X mode but not both. A device driver determines whether it +wants MSI or MSI-X enabled on its hardware device. Once a device +driver requests for MSI, for example, it is prohibited from requesting +MSI-X; in other words, a device driver is not permitted to ping-pong +between MSI mod MSI-X mode during a run-time. + +5.5 Hardware requirements for MSI/MSI-X support + +MSI/MSI-X support requires support from both system hardware and +individual hardware device functions. + +5.5.1 Required x86 hardware support + +Since the target of MSI address is the local APIC CPU, enabling +MSI/MSI-X support in the Linux kernel is dependent on whether existing +system hardware supports local APIC. Users should verify that their +system supports local APIC operation by testing that it runs when +CONFIG_X86_LOCAL_APIC=y. + +In SMP environment, CONFIG_X86_LOCAL_APIC is automatically set; +however, in UP environment, users must manually set +CONFIG_X86_LOCAL_APIC. Once CONFIG_X86_LOCAL_APIC=y, setting +CONFIG_PCI_MSI enables the VECTOR based scheme and the option for +MSI-capable device drivers to selectively enable MSI/MSI-X. + +Note that CONFIG_X86_IO_APIC setting is irrelevant because MSI/MSI-X +vector is allocated new during runtime and MSI/MSI-X support does not +depend on BIOS support. This key independency enables MSI/MSI-X +support on future IOxAPIC free platforms. + +5.5.2 Device hardware support + +The hardware device function supports MSI by indicating the +MSI/MSI-X capability structure on its PCI capability list. By +default, this capability structure will not be initialized by +the kernel to enable MSI during the system boot. In other words, +the device function is running on its default pin assertion mode. +Note that in many cases the hardware supporting MSI have bugs, +which may result in system hangs. The software driver of specific +MSI-capable hardware is responsible for deciding whether to call +pci_enable_msi or not. A return of zero indicates the kernel +successfully initialized the MSI/MSI-X capability structure of the +device function. The device function is now running on MSI/MSI-X mode. + +5.6 How to tell whether MSI/MSI-X is enabled on device function + +At the driver level, a return of zero from the function call of +pci_enable_msi()/pci_enable_msix() indicates to a device driver that +its device function is initialized successfully and ready to run in +MSI/MSI-X mode. + +At the user level, users can use the command 'cat /proc/interrupts' +to display the vectors allocated for devices and their interrupt +MSI/MSI-X modes ("PCI-MSI"/"PCI-MSI-X"). Below shows MSI mode is +enabled on a SCSI Adaptec 39320D Ultra320 controller. + + CPU0 CPU1 + 0: 324639 0 IO-APIC-edge timer + 1: 1186 0 IO-APIC-edge i8042 + 2: 0 0 XT-PIC cascade + 12: 2797 0 IO-APIC-edge i8042 + 14: 6543 0 IO-APIC-edge ide0 + 15: 1 0 IO-APIC-edge ide1 +169: 0 0 IO-APIC-level uhci-hcd +185: 0 0 IO-APIC-level uhci-hcd +193: 138 10 PCI-MSI aic79xx +201: 30 0 PCI-MSI aic79xx +225: 30 0 IO-APIC-level aic7xxx +233: 30 0 IO-APIC-level aic7xxx +NMI: 0 0 +LOC: 324553 325068 +ERR: 0 +MIS: 0 + +6. MSI quirks + +Several PCI chipsets or devices are known to not support MSI. +The PCI stack provides 3 possible levels of MSI disabling: +* on a single device +* on all devices behind a specific bridge +* globally + +6.1. Disabling MSI on a single device + +Under some circumstances it might be required to disable MSI on a +single device. This may be achieved by either not calling pci_enable_msi() +or all, or setting the pci_dev->no_msi flag before (most of the time +in a quirk). + +6.2. Disabling MSI below a bridge + +The vast majority of MSI quirks are required by PCI bridges not +being able to route MSI between busses. In this case, MSI have to be +disabled on all devices behind this bridge. It is achieves by setting +the PCI_BUS_FLAGS_NO_MSI flag in the pci_bus->bus_flags of the bridge +subordinate bus. There is no need to set the same flag on bridges that +are below the broken bridge. When pci_enable_msi() is called to enable +MSI on a device, pci_msi_supported() takes care of checking the NO_MSI +flag in all parent busses of the device. + +Some bridges actually support dynamic MSI support enabling/disabling +by changing some bits in their PCI configuration space (especially +the Hypertransport chipsets such as the nVidia nForce and Serverworks +HT2000). It may then be required to update the NO_MSI flag on the +corresponding devices in the sysfs hierarchy. To enable MSI support +on device "0000:00:0e", do: + + echo 1 > /sys/bus/pci/devices/0000:00:0e/msi_bus + +To disable MSI support, echo 0 instead of 1. Note that it should be +used with caution since changing this value might break interrupts. + +6.3. Disabling MSI globally + +Some extreme cases may require to disable MSI globally on the system. +For now, the only known case is a Serverworks PCI-X chipsets (MSI are +not supported on several busses that are not all connected to the +chipset in the Linux PCI hierarchy). In the vast majority of other +cases, disabling only behind a specific bridge is enough. + +For debugging purpose, the user may also pass pci=nomsi on the kernel +command-line to explicitly disable MSI globally. But, once the appro- +priate quirks are added to the kernel, this option should not be +required anymore. + +6.4. Finding why MSI cannot be enabled on a device + +Assuming that MSI are not enabled on a device, you should look at +dmesg to find messages that quirks may output when disabling MSI +on some devices, some bridges or even globally. +Then, lspci -t gives the list of bridges above a device. Reading +/sys/bus/pci/devices/0000:00:0e/msi_bus will tell you whether MSI +are enabled (1) or disabled (0). In 0 is found in a single bridge +msi_bus file above the device, MSI cannot be enabled. + +7. FAQ + +Q1. Are there any limitations on using the MSI? + +A1. If the PCI device supports MSI and conforms to the +specification and the platform supports the APIC local bus, +then using MSI should work. + +Q2. Will it work on all the Pentium processors (P3, P4, Xeon, +AMD processors)? In P3 IPI's are transmitted on the APIC local +bus and in P4 and Xeon they are transmitted on the system +bus. Are there any implications with this? + +A2. MSI support enables a PCI device sending an inbound +memory write (0xfeexxxxx as target address) on its PCI bus +directly to the FSB. Since the message address has a +redirection hint bit cleared, it should work. + +Q3. The target address 0xfeexxxxx will be translated by the +Host Bridge into an interrupt message. Are there any +limitations on the chipsets such as Intel 8xx, Intel e7xxx, +or VIA? + +A3. If these chipsets support an inbound memory write with +target address set as 0xfeexxxxx, as conformed to PCI +specification 2.3 or latest, then it should work. + +Q4. From the driver point of view, if the MSI is lost because +of errors occurring during inbound memory write, then it may +wait forever. Is there a mechanism for it to recover? + +A4. Since the target of the transaction is an inbound memory +write, all transaction termination conditions (Retry, +Master-Abort, Target-Abort, or normal completion) are +supported. A device sending an MSI must abide by all the PCI +rules and conditions regarding that inbound memory write. So, +if a retry is signaled it must retry, etc... We believe that +the recommendation for Abort is also a retry (refer to PCI +specification 2.3 or latest). diff --git a/Documentation/PCI/pci.txt b/Documentation/PCI/pci.txt index fd4907a2968..7f6de6ea5b4 100644 --- a/Documentation/PCI/pci.txt +++ b/Documentation/PCI/pci.txt @@ -294,7 +294,8 @@ NOTE: pci_enable_device() can fail! Check the return value. pci_set_master() will enable DMA by setting the bus master bit in the PCI_COMMAND register. It also fixes the latency timer value if -it's set to something bogus by the BIOS. +it's set to something bogus by the BIOS. pci_clear_master() will +disable DMA by clearing the bus master bit. If the PCI device can use the PCI Memory-Write-Invalidate transaction, call pci_set_mwi(). This enables the PCI_COMMAND bit for Mem-Wr-Inval diff --git a/Documentation/RCU/00-INDEX b/Documentation/RCU/00-INDEX index 461481dfb7c..9bb62f7b89c 100644 --- a/Documentation/RCU/00-INDEX +++ b/Documentation/RCU/00-INDEX @@ -12,10 +12,14 @@ rcuref.txt - Reference-count design for elements of lists/arrays protected by RCU rcu.txt - RCU Concepts +rcubarrier.txt + - Unloading modules that use RCU callbacks RTFP.txt - List of RCU papers (bibliography) going back to 1980. torture.txt - RCU Torture Test Operation (CONFIG_RCU_TORTURE_TEST) +trace.txt + - CONFIG_RCU_TRACE debugfs files and formats UP.txt - RCU on Uniprocessor Systems whatisRCU.txt diff --git a/Documentation/RCU/rcubarrier.txt b/Documentation/RCU/rcubarrier.txt new file mode 100644 index 00000000000..909602d409b --- /dev/null +++ b/Documentation/RCU/rcubarrier.txt @@ -0,0 +1,304 @@ +RCU and Unloadable Modules + +[Originally published in LWN Jan. 14, 2007: http://lwn.net/Articles/217484/] + +RCU (read-copy update) is a synchronization mechanism that can be thought +of as a replacement for read-writer locking (among other things), but with +very low-overhead readers that are immune to deadlock, priority inversion, +and unbounded latency. RCU read-side critical sections are delimited +by rcu_read_lock() and rcu_read_unlock(), which, in non-CONFIG_PREEMPT +kernels, generate no code whatsoever. + +This means that RCU writers are unaware of the presence of concurrent +readers, so that RCU updates to shared data must be undertaken quite +carefully, leaving an old version of the data structure in place until all +pre-existing readers have finished. These old versions are needed because +such readers might hold a reference to them. RCU updates can therefore be +rather expensive, and RCU is thus best suited for read-mostly situations. + +How can an RCU writer possibly determine when all readers are finished, +given that readers might well leave absolutely no trace of their +presence? There is a synchronize_rcu() primitive that blocks until all +pre-existing readers have completed. An updater wishing to delete an +element p from a linked list might do the following, while holding an +appropriate lock, of course: + + list_del_rcu(p); + synchronize_rcu(); + kfree(p); + +But the above code cannot be used in IRQ context -- the call_rcu() +primitive must be used instead. This primitive takes a pointer to an +rcu_head struct placed within the RCU-protected data structure and +another pointer to a function that may be invoked later to free that +structure. Code to delete an element p from the linked list from IRQ +context might then be as follows: + + list_del_rcu(p); + call_rcu(&p->rcu, p_callback); + +Since call_rcu() never blocks, this code can safely be used from within +IRQ context. The function p_callback() might be defined as follows: + + static void p_callback(struct rcu_head *rp) + { + struct pstruct *p = container_of(rp, struct pstruct, rcu); + + kfree(p); + } + + +Unloading Modules That Use call_rcu() + +But what if p_callback is defined in an unloadable module? + +If we unload the module while some RCU callbacks are pending, +the CPUs executing these callbacks are going to be severely +disappointed when they are later invoked, as fancifully depicted at +http://lwn.net/images/ns/kernel/rcu-drop.jpg. + +We could try placing a synchronize_rcu() in the module-exit code path, +but this is not sufficient. Although synchronize_rcu() does wait for a +grace period to elapse, it does not wait for the callbacks to complete. + +One might be tempted to try several back-to-back synchronize_rcu() +calls, but this is still not guaranteed to work. If there is a very +heavy RCU-callback load, then some of the callbacks might be deferred +in order to allow other processing to proceed. Such deferral is required +in realtime kernels in order to avoid excessive scheduling latencies. + + +rcu_barrier() + +We instead need the rcu_barrier() primitive. This primitive is similar +to synchronize_rcu(), but instead of waiting solely for a grace +period to elapse, it also waits for all outstanding RCU callbacks to +complete. Pseudo-code using rcu_barrier() is as follows: + + 1. Prevent any new RCU callbacks from being posted. + 2. Execute rcu_barrier(). + 3. Allow the module to be unloaded. + +Quick Quiz #1: Why is there no srcu_barrier()? + +The rcutorture module makes use of rcu_barrier in its exit function +as follows: + + 1 static void + 2 rcu_torture_cleanup(void) + 3 { + 4 int i; + 5 + 6 fullstop = 1; + 7 if (shuffler_task != NULL) { + 8 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); + 9 kthread_stop(shuffler_task); +10 } +11 shuffler_task = NULL; +12 +13 if (writer_task != NULL) { +14 VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); +15 kthread_stop(writer_task); +16 } +17 writer_task = NULL; +18 +19 if (reader_tasks != NULL) { +20 for (i = 0; i < nrealreaders; i++) { +21 if (reader_tasks[i] != NULL) { +22 VERBOSE_PRINTK_STRING( +23 "Stopping rcu_torture_reader task"); +24 kthread_stop(reader_tasks[i]); +25 } +26 reader_tasks[i] = NULL; +27 } +28 kfree(reader_tasks); +29 reader_tasks = NULL; +30 } +31 rcu_torture_current = NULL; +32 +33 if (fakewriter_tasks != NULL) { +34 for (i = 0; i < nfakewriters; i++) { +35 if (fakewriter_tasks[i] != NULL) { +36 VERBOSE_PRINTK_STRING( +37 "Stopping rcu_torture_fakewriter task"); +38 kthread_stop(fakewriter_tasks[i]); +39 } +40 fakewriter_tasks[i] = NULL; +41 } +42 kfree(fakewriter_tasks); +43 fakewriter_tasks = NULL; +44 } +45 +46 if (stats_task != NULL) { +47 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); +48 kthread_stop(stats_task); +49 } +50 stats_task = NULL; +51 +52 /* Wait for all RCU callbacks to fire. */ +53 rcu_barrier(); +54 +55 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ +56 +57 if (cur_ops->cleanup != NULL) +58 cur_ops->cleanup(); +59 if (atomic_read(&n_rcu_torture_error)) +60 rcu_torture_print_module_parms("End of test: FAILURE"); +61 else +62 rcu_torture_print_module_parms("End of test: SUCCESS"); +63 } + +Line 6 sets a global variable that prevents any RCU callbacks from +re-posting themselves. This will not be necessary in most cases, since +RCU callbacks rarely include calls to call_rcu(). However, the rcutorture +module is an exception to this rule, and therefore needs to set this +global variable. + +Lines 7-50 stop all the kernel tasks associated with the rcutorture +module. Therefore, once execution reaches line 53, no more rcutorture +RCU callbacks will be posted. The rcu_barrier() call on line 53 waits +for any pre-existing callbacks to complete. + +Then lines 55-62 print status and do operation-specific cleanup, and +then return, permitting the module-unload operation to be completed. + +Quick Quiz #2: Is there any other situation where rcu_barrier() might + be required? + +Your module might have additional complications. For example, if your +module invokes call_rcu() from timers, you will need to first cancel all +the timers, and only then invoke rcu_barrier() to wait for any remaining +RCU callbacks to complete. + + +Implementing rcu_barrier() + +Dipankar Sarma's implementation of rcu_barrier() makes use of the fact +that RCU callbacks are never reordered once queued on one of the per-CPU +queues. His implementation queues an RCU callback on each of the per-CPU +callback queues, and then waits until they have all started executing, at +which point, all earlier RCU callbacks are guaranteed to have completed. + +The original code for rcu_barrier() was as follows: + + 1 void rcu_barrier(void) + 2 { + 3 BUG_ON(in_interrupt()); + 4 /* Take cpucontrol mutex to protect against CPU hotplug */ + 5 mutex_lock(&rcu_barrier_mutex); + 6 init_completion(&rcu_barrier_completion); + 7 atomic_set(&rcu_barrier_cpu_count, 0); + 8 on_each_cpu(rcu_barrier_func, NULL, 0, 1); + 9 wait_for_completion(&rcu_barrier_completion); +10 mutex_unlock(&rcu_barrier_mutex); +11 } + +Line 3 verifies that the caller is in process context, and lines 5 and 10 +use rcu_barrier_mutex to ensure that only one rcu_barrier() is using the +global completion and counters at a time, which are initialized on lines +6 and 7. Line 8 causes each CPU to invoke rcu_barrier_func(), which is +shown below. Note that the final "1" in on_each_cpu()'s argument list +ensures that all the calls to rcu_barrier_func() will have completed +before on_each_cpu() returns. Line 9 then waits for the completion. + +This code was rewritten in 2008 to support rcu_barrier_bh() and +rcu_barrier_sched() in addition to the original rcu_barrier(). + +The rcu_barrier_func() runs on each CPU, where it invokes call_rcu() +to post an RCU callback, as follows: + + 1 static void rcu_barrier_func(void *notused) + 2 { + 3 int cpu = smp_processor_id(); + 4 struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + 5 struct rcu_head *head; + 6 + 7 head = &rdp->barrier; + 8 atomic_inc(&rcu_barrier_cpu_count); + 9 call_rcu(head, rcu_barrier_callback); +10 } + +Lines 3 and 4 locate RCU's internal per-CPU rcu_data structure, +which contains the struct rcu_head that needed for the later call to +call_rcu(). Line 7 picks up a pointer to this struct rcu_head, and line +8 increments a global counter. This counter will later be decremented +by the callback. Line 9 then registers the rcu_barrier_callback() on +the current CPU's queue. + +The rcu_barrier_callback() function simply atomically decrements the +rcu_barrier_cpu_count variable and finalizes the completion when it +reaches zero, as follows: + + 1 static void rcu_barrier_callback(struct rcu_head *notused) + 2 { + 3 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) + 4 complete(&rcu_barrier_completion); + 5 } + +Quick Quiz #3: What happens if CPU 0's rcu_barrier_func() executes + immediately (thus incrementing rcu_barrier_cpu_count to the + value one), but the other CPU's rcu_barrier_func() invocations + are delayed for a full grace period? Couldn't this result in + rcu_barrier() returning prematurely? + + +rcu_barrier() Summary + +The rcu_barrier() primitive has seen relatively little use, since most +code using RCU is in the core kernel rather than in modules. However, if +you are using RCU from an unloadable module, you need to use rcu_barrier() +so that your module may be safely unloaded. + + +Answers to Quick Quizzes + +Quick Quiz #1: Why is there no srcu_barrier()? + +Answer: Since there is no call_srcu(), there can be no outstanding SRCU + callbacks. Therefore, there is no need to wait for them. + +Quick Quiz #2: Is there any other situation where rcu_barrier() might + be required? + +Answer: Interestingly enough, rcu_barrier() was not originally + implemented for module unloading. Nikita Danilov was using + RCU in a filesystem, which resulted in a similar situation at + filesystem-unmount time. Dipankar Sarma coded up rcu_barrier() + in response, so that Nikita could invoke it during the + filesystem-unmount process. + + Much later, yours truly hit the RCU module-unload problem when + implementing rcutorture, and found that rcu_barrier() solves + this problem as well. + +Quick Quiz #3: What happens if CPU 0's rcu_barrier_func() executes + immediately (thus incrementing rcu_barrier_cpu_count to the + value one), but the other CPU's rcu_barrier_func() invocations + are delayed for a full grace period? Couldn't this result in + rcu_barrier() returning prematurely? + +Answer: This cannot happen. The reason is that on_each_cpu() has its last + argument, the wait flag, set to "1". This flag is passed through + to smp_call_function() and further to smp_call_function_on_cpu(), + causing this latter to spin until the cross-CPU invocation of + rcu_barrier_func() has completed. This by itself would prevent + a grace period from completing on non-CONFIG_PREEMPT kernels, + since each CPU must undergo a context switch (or other quiescent + state) before the grace period can complete. However, this is + of no use in CONFIG_PREEMPT kernels. + + Therefore, on_each_cpu() disables preemption across its call + to smp_call_function() and also across the local call to + rcu_barrier_func(). This prevents the local CPU from context + switching, again preventing grace periods from completing. This + means that all CPUs have executed rcu_barrier_func() before + the first rcu_barrier_callback() can possibly execute, in turn + preventing rcu_barrier_cpu_count from prematurely reaching zero. + + Currently, -rt implementations of RCU keep but a single global + queue for RCU callbacks, and thus do not suffer from this + problem. However, when the -rt RCU eventually does have per-CPU + callback queues, things will have to change. One simple change + is to add an rcu_read_lock() before line 8 of rcu_barrier() + and an rcu_read_unlock() after line 8 of this same function. If + you can think of a better change, please let me know! diff --git a/Documentation/RCU/rculist_nulls.txt b/Documentation/RCU/rculist_nulls.txt new file mode 100644 index 00000000000..239f542d48b --- /dev/null +++ b/Documentation/RCU/rculist_nulls.txt @@ -0,0 +1,167 @@ +Using hlist_nulls to protect read-mostly linked lists and +objects using SLAB_DESTROY_BY_RCU allocations. + +Please read the basics in Documentation/RCU/listRCU.txt + +Using special makers (called 'nulls') is a convenient way +to solve following problem : + +A typical RCU linked list managing objects which are +allocated with SLAB_DESTROY_BY_RCU kmem_cache can +use following algos : + +1) Lookup algo +-------------- +rcu_read_lock() +begin: +obj = lockless_lookup(key); +if (obj) { + if (!try_get_ref(obj)) // might fail for free objects + goto begin; + /* + * Because a writer could delete object, and a writer could + * reuse these object before the RCU grace period, we + * must check key after geting the reference on object + */ + if (obj->key != key) { // not the object we expected + put_ref(obj); + goto begin; + } +} +rcu_read_unlock(); + +Beware that lockless_lookup(key) cannot use traditional hlist_for_each_entry_rcu() +but a version with an additional memory barrier (smp_rmb()) + +lockless_lookup(key) +{ + struct hlist_node *node, *next; + for (pos = rcu_dereference((head)->first); + pos && ({ next = pos->next; smp_rmb(); prefetch(next); 1; }) && + ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); + pos = rcu_dereference(next)) + if (obj->key == key) + return obj; + return NULL; + +And note the traditional hlist_for_each_entry_rcu() misses this smp_rmb() : + + struct hlist_node *node; + for (pos = rcu_dereference((head)->first); + pos && ({ prefetch(pos->next); 1; }) && + ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); + pos = rcu_dereference(pos->next)) + if (obj->key == key) + return obj; + return NULL; +} + +Quoting Corey Minyard : + +"If the object is moved from one list to another list in-between the + time the hash is calculated and the next field is accessed, and the + object has moved to the end of a new list, the traversal will not + complete properly on the list it should have, since the object will + be on the end of the new list and there's not a way to tell it's on a + new list and restart the list traversal. I think that this can be + solved by pre-fetching the "next" field (with proper barriers) before + checking the key." + +2) Insert algo : +---------------- + +We need to make sure a reader cannot read the new 'obj->obj_next' value +and previous value of 'obj->key'. Or else, an item could be deleted +from a chain, and inserted into another chain. If new chain was empty +before the move, 'next' pointer is NULL, and lockless reader can +not detect it missed following items in original chain. + +/* + * Please note that new inserts are done at the head of list, + * not in the middle or end. + */ +obj = kmem_cache_alloc(...); +lock_chain(); // typically a spin_lock() +obj->key = key; +atomic_inc(&obj->refcnt); +/* + * we need to make sure obj->key is updated before obj->next + */ +smp_wmb(); +hlist_add_head_rcu(&obj->obj_node, list); +unlock_chain(); // typically a spin_unlock() + + +3) Remove algo +-------------- +Nothing special here, we can use a standard RCU hlist deletion. +But thanks to SLAB_DESTROY_BY_RCU, beware a deleted object can be reused +very very fast (before the end of RCU grace period) + +if (put_last_reference_on(obj) { + lock_chain(); // typically a spin_lock() + hlist_del_init_rcu(&obj->obj_node); + unlock_chain(); // typically a spin_unlock() + kmem_cache_free(cachep, obj); +} + + + +-------------------------------------------------------------------------- +With hlist_nulls we can avoid extra smp_rmb() in lockless_lookup() +and extra smp_wmb() in insert function. + +For example, if we choose to store the slot number as the 'nulls' +end-of-list marker for each slot of the hash table, we can detect +a race (some writer did a delete and/or a move of an object +to another chain) checking the final 'nulls' value if +the lookup met the end of chain. If final 'nulls' value +is not the slot number, then we must restart the lookup at +the begining. If the object was moved to same chain, +then the reader doesnt care : It might eventually +scan the list again without harm. + + +1) lookup algo + + head = &table[slot]; + rcu_read_lock(); +begin: + hlist_nulls_for_each_entry_rcu(obj, node, head, member) { + if (obj->key == key) { + if (!try_get_ref(obj)) // might fail for free objects + goto begin; + if (obj->key != key) { // not the object we expected + put_ref(obj); + goto begin; + } + goto out; + } +/* + * if the nulls value we got at the end of this lookup is + * not the expected one, we must restart lookup. + * We probably met an item that was moved to another chain. + */ + if (get_nulls_value(node) != slot) + goto begin; + obj = NULL; + +out: + rcu_read_unlock(); + +2) Insert function : +-------------------- + +/* + * Please note that new inserts are done at the head of list, + * not in the middle or end. + */ +obj = kmem_cache_alloc(cachep); +lock_chain(); // typically a spin_lock() +obj->key = key; +atomic_set(&obj->refcnt, 1); +/* + * insert obj in RCU way (readers might be traversing chain) + */ +hlist_nulls_add_head_rcu(&obj->obj_node, list); +unlock_chain(); // typically a spin_unlock() diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt new file mode 100644 index 00000000000..068848240a8 --- /dev/null +++ b/Documentation/RCU/trace.txt @@ -0,0 +1,413 @@ +CONFIG_RCU_TRACE debugfs Files and Formats + + +The rcupreempt and rcutree implementations of RCU provide debugfs trace +output that summarizes counters and state. This information is useful for +debugging RCU itself, and can sometimes also help to debug abuses of RCU. +Note that the rcuclassic implementation of RCU does not provide debugfs +trace output. + +The following sections describe the debugfs files and formats for +preemptable RCU (rcupreempt) and hierarchical RCU (rcutree). + + +Preemptable RCU debugfs Files and Formats + +This implementation of RCU provides three debugfs files under the +top-level directory RCU: rcu/rcuctrs (which displays the per-CPU +counters used by preemptable RCU) rcu/rcugp (which displays grace-period +counters), and rcu/rcustats (which internal counters for debugging RCU). + +The output of "cat rcu/rcuctrs" looks as follows: + +CPU last cur F M + 0 5 -5 0 0 + 1 -1 0 0 0 + 2 0 1 0 0 + 3 0 1 0 0 + 4 0 1 0 0 + 5 0 1 0 0 + 6 0 2 0 0 + 7 0 -1 0 0 + 8 0 1 0 0 +ggp = 26226, state = waitzero + +The per-CPU fields are as follows: + +o "CPU" gives the CPU number. Offline CPUs are not displayed. + +o "last" gives the value of the counter that is being decremented + for the current grace period phase. In the example above, + the counters sum to 4, indicating that there are still four + RCU read-side critical sections still running that started + before the last counter flip. + +o "cur" gives the value of the counter that is currently being + both incremented (by rcu_read_lock()) and decremented (by + rcu_read_unlock()). In the example above, the counters sum to + 1, indicating that there is only one RCU read-side critical section + still running that started after the last counter flip. + +o "F" indicates whether RCU is waiting for this CPU to acknowledge + a counter flip. In the above example, RCU is not waiting on any, + which is consistent with the state being "waitzero" rather than + "waitack". + +o "M" indicates whether RCU is waiting for this CPU to execute a + memory barrier. In the above example, RCU is not waiting on any, + which is consistent with the state being "waitzero" rather than + "waitmb". + +o "ggp" is the global grace-period counter. + +o "state" is the RCU state, which can be one of the following: + + o "idle": there is no grace period in progress. + + o "waitack": RCU just incremented the global grace-period + counter, which has the effect of reversing the roles of + the "last" and "cur" counters above, and is waiting for + all the CPUs to acknowledge the flip. Once the flip has + been acknowledged, CPUs will no longer be incrementing + what are now the "last" counters, so that their sum will + decrease monotonically down to zero. + + o "waitzero": RCU is waiting for the sum of the "last" counters + to decrease to zero. + + o "waitmb": RCU is waiting for each CPU to execute a memory + barrier, which ensures that instructions from a given CPU's + last RCU read-side critical section cannot be reordered + with instructions following the memory-barrier instruction. + +The output of "cat rcu/rcugp" looks as follows: + +oldggp=48870 newggp=48873 + +Note that reading from this file provokes a synchronize_rcu(). The +"oldggp" value is that of "ggp" from rcu/rcuctrs above, taken before +executing the synchronize_rcu(), and the "newggp" value is also the +"ggp" value, but taken after the synchronize_rcu() command returns. + + +The output of "cat rcu/rcugp" looks as follows: + +na=1337955 nl=40 wa=1337915 wl=44 da=1337871 dl=0 dr=1337871 di=1337871 +1=50989 e1=6138 i1=49722 ie1=82 g1=49640 a1=315203 ae1=265563 a2=49640 +z1=1401244 ze1=1351605 z2=49639 m1=5661253 me1=5611614 m2=49639 + +These are counters tracking internal preemptable-RCU events, however, +some of them may be useful for debugging algorithms using RCU. In +particular, the "nl", "wl", and "dl" values track the number of RCU +callbacks in various states. The fields are as follows: + +o "na" is the total number of RCU callbacks that have been enqueued + since boot. + +o "nl" is the number of RCU callbacks waiting for the previous + grace period to end so that they can start waiting on the next + grace period. + +o "wa" is the total number of RCU callbacks that have started waiting + for a grace period since boot. "na" should be roughly equal to + "nl" plus "wa". + +o "wl" is the number of RCU callbacks currently waiting for their + grace period to end. + +o "da" is the total number of RCU callbacks whose grace periods + have completed since boot. "wa" should be roughly equal to + "wl" plus "da". + +o "dr" is the total number of RCU callbacks that have been removed + from the list of callbacks ready to invoke. "dr" should be roughly + equal to "da". + +o "di" is the total number of RCU callbacks that have been invoked + since boot. "di" should be roughly equal to "da", though some + early versions of preemptable RCU had a bug so that only the + last CPU's count of invocations was displayed, rather than the + sum of all CPU's counts. + +o "1" is the number of calls to rcu_try_flip(). This should be + roughly equal to the sum of "e1", "i1", "a1", "z1", and "m1" + described below. In other words, the number of times that + the state machine is visited should be equal to the sum of the + number of times that each state is visited plus the number of + times that the state-machine lock acquisition failed. + +o "e1" is the number of times that rcu_try_flip() was unable to + acquire the fliplock. + +o "i1" is the number of calls to rcu_try_flip_idle(). + +o "ie1" is the number of times rcu_try_flip_idle() exited early + due to the calling CPU having no work for RCU. + +o "g1" is the number of times that rcu_try_flip_idle() decided + to start a new grace period. "i1" should be roughly equal to + "ie1" plus "g1". + +o "a1" is the number of calls to rcu_try_flip_waitack(). + +o "ae1" is the number of times that rcu_try_flip_waitack() found + that at least one CPU had not yet acknowledge the new grace period + (AKA "counter flip"). + +o "a2" is the number of time rcu_try_flip_waitack() found that + all CPUs had acknowledged. "a1" should be roughly equal to + "ae1" plus "a2". (This particular output was collected on + a 128-CPU machine, hence the smaller-than-usual fraction of + calls to rcu_try_flip_waitack() finding all CPUs having already + acknowledged.) + +o "z1" is the number of calls to rcu_try_flip_waitzero(). + +o "ze1" is the number of times that rcu_try_flip_waitzero() found + that not all of the old RCU read-side critical sections had + completed. + +o "z2" is the number of times that rcu_try_flip_waitzero() finds + the sum of the counters equal to zero, in other words, that + all of the old RCU read-side critical sections had completed. + The value of "z1" should be roughly equal to "ze1" plus + "z2". + +o "m1" is the number of calls to rcu_try_flip_waitmb(). + +o "me1" is the number of times that rcu_try_flip_waitmb() finds + that at least one CPU has not yet executed a memory barrier. + +o "m2" is the number of times that rcu_try_flip_waitmb() finds that + all CPUs have executed a memory barrier. + + +Hierarchical RCU debugfs Files and Formats + +This implementation of RCU provides three debugfs files under the +top-level directory RCU: rcu/rcudata (which displays fields in struct +rcu_data), rcu/rcugp (which displays grace-period counters), and +rcu/rcuhier (which displays the struct rcu_node hierarchy). + +The output of "cat rcu/rcudata" looks as follows: + +rcu: + 0 c=4011 g=4012 pq=1 pqc=4011 qp=0 rpfq=1 rp=3c2a dt=23301/73 dn=2 df=1882 of=0 ri=2126 ql=2 b=10 + 1 c=4011 g=4012 pq=1 pqc=4011 qp=0 rpfq=3 rp=39a6 dt=78073/1 dn=2 df=1402 of=0 ri=1875 ql=46 b=10 + 2 c=4010 g=4010 pq=1 pqc=4010 qp=0 rpfq=-5 rp=1d12 dt=16646/0 dn=2 df=3140 of=0 ri=2080 ql=0 b=10 + 3 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=2b50 dt=21159/1 dn=2 df=2230 of=0 ri=1923 ql=72 b=10 + 4 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=1644 dt=5783/1 dn=2 df=3348 of=0 ri=2805 ql=7 b=10 + 5 c=4012 g=4013 pq=0 pqc=4011 qp=1 rpfq=3 rp=1aac dt=5879/1 dn=2 df=3140 of=0 ri=2066 ql=10 b=10 + 6 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=ed8 dt=5847/1 dn=2 df=3797 of=0 ri=1266 ql=10 b=10 + 7 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=1fa2 dt=6199/1 dn=2 df=2795 of=0 ri=2162 ql=28 b=10 +rcu_bh: + 0 c=-268 g=-268 pq=1 pqc=-268 qp=0 rpfq=-145 rp=21d6 dt=23301/73 dn=2 df=0 of=0 ri=0 ql=0 b=10 + 1 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-170 rp=20ce dt=78073/1 dn=2 df=26 of=0 ri=5 ql=0 b=10 + 2 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-83 rp=fbd dt=16646/0 dn=2 df=28 of=0 ri=4 ql=0 b=10 + 3 c=-268 g=-268 pq=1 pqc=-268 qp=0 rpfq=-105 rp=178c dt=21159/1 dn=2 df=28 of=0 ri=2 ql=0 b=10 + 4 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-30 rp=b54 dt=5783/1 dn=2 df=32 of=0 ri=0 ql=0 b=10 + 5 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-29 rp=df5 dt=5879/1 dn=2 df=30 of=0 ri=3 ql=0 b=10 + 6 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-28 rp=788 dt=5847/1 dn=2 df=32 of=0 ri=0 ql=0 b=10 + 7 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-53 rp=1098 dt=6199/1 dn=2 df=30 of=0 ri=3 ql=0 b=10 + +The first section lists the rcu_data structures for rcu, the second for +rcu_bh. Each section has one line per CPU, or eight for this 8-CPU system. +The fields are as follows: + +o The number at the beginning of each line is the CPU number. + CPUs numbers followed by an exclamation mark are offline, + but have been online at least once since boot. There will be + no output for CPUs that have never been online, which can be + a good thing in the surprisingly common case where NR_CPUS is + substantially larger than the number of actual CPUs. + +o "c" is the count of grace periods that this CPU believes have + completed. CPUs in dynticks idle mode may lag quite a ways + behind, for example, CPU 4 under "rcu" above, which has slept + through the past 25 RCU grace periods. It is not unusual to + see CPUs lagging by thousands of grace periods. + +o "g" is the count of grace periods that this CPU believes have + started. Again, CPUs in dynticks idle mode may lag behind. + If the "c" and "g" values are equal, this CPU has already + reported a quiescent state for the last RCU grace period that + it is aware of, otherwise, the CPU believes that it owes RCU a + quiescent state. + +o "pq" indicates that this CPU has passed through a quiescent state + for the current grace period. It is possible for "pq" to be + "1" and "c" different than "g", which indicates that although + the CPU has passed through a quiescent state, either (1) this + CPU has not yet reported that fact, (2) some other CPU has not + yet reported for this grace period, or (3) both. + +o "pqc" indicates which grace period the last-observed quiescent + state for this CPU corresponds to. This is important for handling + the race between CPU 0 reporting an extended dynticks-idle + quiescent state for CPU 1 and CPU 1 suddenly waking up and + reporting its own quiescent state. If CPU 1 was the last CPU + for the current grace period, then the CPU that loses this race + will attempt to incorrectly mark CPU 1 as having checked in for + the next grace period! + +o "qp" indicates that RCU still expects a quiescent state from + this CPU. + +o "rpfq" is the number of rcu_pending() calls on this CPU required + to induce this CPU to invoke force_quiescent_state(). + +o "rp" is low-order four hex digits of the count of how many times + rcu_pending() has been invoked on this CPU. + +o "dt" is the current value of the dyntick counter that is incremented + when entering or leaving dynticks idle state, either by the + scheduler or by irq. The number after the "/" is the interrupt + nesting depth when in dyntick-idle state, or one greater than + the interrupt-nesting depth otherwise. + + This field is displayed only for CONFIG_NO_HZ kernels. + +o "dn" is the current value of the dyntick counter that is incremented + when entering or leaving dynticks idle state via NMI. If both + the "dt" and "dn" values are even, then this CPU is in dynticks + idle mode and may be ignored by RCU. If either of these two + counters is odd, then RCU must be alert to the possibility of + an RCU read-side critical section running on this CPU. + + This field is displayed only for CONFIG_NO_HZ kernels. + +o "df" is the number of times that some other CPU has forced a + quiescent state on behalf of this CPU due to this CPU being in + dynticks-idle state. + + This field is displayed only for CONFIG_NO_HZ kernels. + +o "of" is the number of times that some other CPU has forced a + quiescent state on behalf of this CPU due to this CPU being + offline. In a perfect world, this might neve happen, but it + turns out that offlining and onlining a CPU can take several grace + periods, and so there is likely to be an extended period of time + when RCU believes that the CPU is online when it really is not. + Please note that erring in the other direction (RCU believing a + CPU is offline when it is really alive and kicking) is a fatal + error, so it makes sense to err conservatively. + +o "ri" is the number of times that RCU has seen fit to send a + reschedule IPI to this CPU in order to get it to report a + quiescent state. + +o "ql" is the number of RCU callbacks currently residing on + this CPU. This is the total number of callbacks, regardless + of what state they are in (new, waiting for grace period to + start, waiting for grace period to end, ready to invoke). + +o "b" is the batch limit for this CPU. If more than this number + of RCU callbacks is ready to invoke, then the remainder will + be deferred. + + +The output of "cat rcu/rcugp" looks as follows: + +rcu: completed=33062 gpnum=33063 +rcu_bh: completed=464 gpnum=464 + +Again, this output is for both "rcu" and "rcu_bh". The fields are +taken from the rcu_state structure, and are as follows: + +o "completed" is the number of grace periods that have completed. + It is comparable to the "c" field from rcu/rcudata in that a + CPU whose "c" field matches the value of "completed" is aware + that the corresponding RCU grace period has completed. + +o "gpnum" is the number of grace periods that have started. It is + comparable to the "g" field from rcu/rcudata in that a CPU + whose "g" field matches the value of "gpnum" is aware that the + corresponding RCU grace period has started. + + If these two fields are equal (as they are for "rcu_bh" above), + then there is no grace period in progress, in other words, RCU + is idle. On the other hand, if the two fields differ (as they + do for "rcu" above), then an RCU grace period is in progress. + + +The output of "cat rcu/rcuhier" looks as follows, with very long lines: + +c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6 +1/1 0:127 ^0 +3/3 0:35 ^0 0/0 36:71 ^1 0/0 72:107 ^2 0/0 108:127 ^3 +3/3f 0:5 ^0 2/3 6:11 ^1 0/0 12:17 ^2 0/0 18:23 ^3 0/0 24:29 ^4 0/0 30:35 ^5 0/0 36:41 ^0 0/0 42:47 ^1 0/0 48:53 ^2 0/0 54:59 ^3 0/0 60:65 ^4 0/0 66:71 ^5 0/0 72:77 ^0 0/0 78:83 ^1 0/0 84:89 ^2 0/0 90:95 ^3 0/0 96:101 ^4 0/0 102:107 ^5 0/0 108:113 ^0 0/0 114:119 ^1 0/0 120:125 ^2 0/0 126:127 ^3 +rcu_bh: +c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0 +0/1 0:127 ^0 +0/3 0:35 ^0 0/0 36:71 ^1 0/0 72:107 ^2 0/0 108:127 ^3 +0/3f 0:5 ^0 0/3 6:11 ^1 0/0 12:17 ^2 0/0 18:23 ^3 0/0 24:29 ^4 0/0 30:35 ^5 0/0 36:41 ^0 0/0 42:47 ^1 0/0 48:53 ^2 0/0 54:59 ^3 0/0 60:65 ^4 0/0 66:71 ^5 0/0 72:77 ^0 0/0 78:83 ^1 0/0 84:89 ^2 0/0 90:95 ^3 0/0 96:101 ^4 0/0 102:107 ^5 0/0 108:113 ^0 0/0 114:119 ^1 0/0 120:125 ^2 0/0 126:127 ^3 + +This is once again split into "rcu" and "rcu_bh" portions. The fields are +as follows: + +o "c" is exactly the same as "completed" under rcu/rcugp. + +o "g" is exactly the same as "gpnum" under rcu/rcugp. + +o "s" is the "signaled" state that drives force_quiescent_state()'s + state machine. + +o "jfq" is the number of jiffies remaining for this grace period + before force_quiescent_state() is invoked to help push things + along. Note that CPUs in dyntick-idle mode thoughout the grace + period will not report on their own, but rather must be check by + some other CPU via force_quiescent_state(). + +o "j" is the low-order four hex digits of the jiffies counter. + Yes, Paul did run into a number of problems that turned out to + be due to the jiffies counter no longer counting. Why do you ask? + +o "nfqs" is the number of calls to force_quiescent_state() since + boot. + +o "nfqsng" is the number of useless calls to force_quiescent_state(), + where there wasn't actually a grace period active. This can + happen due to races. The number in parentheses is the difference + between "nfqs" and "nfqsng", or the number of times that + force_quiescent_state() actually did some real work. + +o "fqlh" is the number of calls to force_quiescent_state() that + exited immediately (without even being counted in nfqs above) + due to contention on ->fqslock. + +o Each element of the form "1/1 0:127 ^0" represents one struct + rcu_node. Each line represents one level of the hierarchy, from + root to leaves. It is best to think of the rcu_data structures + as forming yet another level after the leaves. Note that there + might be either one, two, or three levels of rcu_node structures, + depending on the relationship between CONFIG_RCU_FANOUT and + CONFIG_NR_CPUS. + + o The numbers separated by the "/" are the qsmask followed + by the qsmaskinit. The qsmask will have one bit + set for each entity in the next lower level that + has not yet checked in for the current grace period. + The qsmaskinit will have one bit for each entity that is + currently expected to check in during each grace period. + The value of qsmaskinit is assigned to that of qsmask + at the beginning of each grace period. + + For example, for "rcu", the qsmask of the first entry + of the lowest level is 0x14, meaning that we are still + waiting for CPUs 2 and 4 to check in for the current + grace period. + + o The numbers separated by the ":" are the range of CPUs + served by this struct rcu_node. This can be helpful + in working out how the hierarchy is wired together. + + For example, the first entry at the lowest level shows + "0:5", indicating that it covers CPUs 0 through 5. + + o The number after the "^" indicates the bit in the + next higher level rcu_node structure that this + rcu_node structure corresponds to. + + For example, the first entry at the lowest level shows + "^0", indicating that it corresponds to bit zero in + the first entry at the middle level. diff --git a/Documentation/README.DAC960 b/Documentation/README.DAC960 deleted file mode 100644 index 0e8f618ab53..00000000000 --- a/Documentation/README.DAC960 +++ /dev/null @@ -1,756 +0,0 @@ - Linux Driver for Mylex DAC960/AcceleRAID/eXtremeRAID PCI RAID Controllers - - Version 2.2.11 for Linux 2.2.19 - Version 2.4.11 for Linux 2.4.12 - - PRODUCTION RELEASE - - 11 October 2001 - - Leonard N. Zubkoff - Dandelion Digital - lnz@dandelion.com - - Copyright 1998-2001 by Leonard N. Zubkoff - - - INTRODUCTION - -Mylex, Inc. designs and manufactures a variety of high performance PCI RAID -controllers. Mylex Corporation is located at 34551 Ardenwood Blvd., Fremont, -California 94555, USA and can be reached at 510.796.6100 or on the World Wide -Web at http://www.mylex.com. Mylex Technical Support can be reached by -electronic mail at mylexsup@us.ibm.com, by voice at 510.608.2400, or by FAX at -510.745.7715. Contact information for offices in Europe and Japan is available -on their Web site. - -The latest information on Linux support for DAC960 PCI RAID Controllers, as -well as the most recent release of this driver, will always be available from -my Linux Home Page at URL "http://www.dandelion.com/Linux/". The Linux DAC960 -driver supports all current Mylex PCI RAID controllers including the new -eXtremeRAID 2000/3000 and AcceleRAID 352/170/160 models which have an entirely -new firmware interface from the older eXtremeRAID 1100, AcceleRAID 150/200/250, -and DAC960PJ/PG/PU/PD/PL. See below for a complete controller list as well as -minimum firmware version requirements. For simplicity, in most places this -documentation refers to DAC960 generically rather than explicitly listing all -the supported models. - -Driver bug reports should be sent via electronic mail to "lnz@dandelion.com". -Please include with the bug report the complete configuration messages reported -by the driver at startup, along with any subsequent system messages relevant to -the controller's operation, and a detailed description of your system's -hardware configuration. Driver bugs are actually quite rare; if you encounter -problems with disks being marked offline, for example, please contact Mylex -Technical Support as the problem is related to the hardware configuration -rather than the Linux driver. - -Please consult the RAID controller documentation for detailed information -regarding installation and configuration of the controllers. This document -primarily provides information specific to the Linux support. - - - DRIVER FEATURES - -The DAC960 RAID controllers are supported solely as high performance RAID -controllers, not as interfaces to arbitrary SCSI devices. The Linux DAC960 -driver operates at the block device level, the same level as the SCSI and IDE -drivers. Unlike other RAID controllers currently supported on Linux, the -DAC960 driver is not dependent on the SCSI subsystem, and hence avoids all the -complexity and unnecessary code that would be associated with an implementation -as a SCSI driver. The DAC960 driver is designed for as high a performance as -possible with no compromises or extra code for compatibility with lower -performance devices. The DAC960 driver includes extensive error logging and -online configuration management capabilities. Except for initial configuration -of the controller and adding new disk drives, most everything can be handled -from Linux while the system is operational. - -The DAC960 driver is architected to support up to 8 controllers per system. -Each DAC960 parallel SCSI controller can support up to 15 disk drives per -channel, for a maximum of 60 drives on a four channel controller; the fibre -channel eXtremeRAID 3000 controller supports up to 125 disk drives per loop for -a total of 250 drives. The drives installed on a controller are divided into -one or more "Drive Groups", and then each Drive Group is subdivided further -into 1 to 32 "Logical Drives". Each Logical Drive has a specific RAID Level -and caching policy associated with it, and it appears to Linux as a single -block device. Logical Drives are further subdivided into up to 7 partitions -through the normal Linux and PC disk partitioning schemes. Logical Drives are -also known as "System Drives", and Drive Groups are also called "Packs". Both -terms are in use in the Mylex documentation; I have chosen to standardize on -the more generic "Logical Drive" and "Drive Group". - -DAC960 RAID disk devices are named in the style of the obsolete Device File -System (DEVFS). The device corresponding to Logical Drive D on Controller C -is referred to as /dev/rd/cCdD, and the partitions are called /dev/rd/cCdDp1 -through /dev/rd/cCdDp7. For example, partition 3 of Logical Drive 5 on -Controller 2 is referred to as /dev/rd/c2d5p3. Note that unlike with SCSI -disks the device names will not change in the event of a disk drive failure. -The DAC960 driver is assigned major numbers 48 - 55 with one major number per -controller. The 8 bits of minor number are divided into 5 bits for the Logical -Drive and 3 bits for the partition. - - - SUPPORTED DAC960/AcceleRAID/eXtremeRAID PCI RAID CONTROLLERS - -The following list comprises the supported DAC960, AcceleRAID, and eXtremeRAID -PCI RAID Controllers as of the date of this document. It is recommended that -anyone purchasing a Mylex PCI RAID Controller not in the following table -contact the author beforehand to verify that it is or will be supported. - -eXtremeRAID 3000 - 1 Wide Ultra-2/LVD SCSI channel - 2 External Fibre FC-AL channels - 233MHz StrongARM SA 110 Processor - 64 Bit 33MHz PCI (backward compatible with 32 Bit PCI slots) - 32MB/64MB ECC SDRAM Memory - -eXtremeRAID 2000 - 4 Wide Ultra-160 LVD SCSI channels - 233MHz StrongARM SA 110 Processor - 64 Bit 33MHz PCI (backward compatible with 32 Bit PCI slots) - 32MB/64MB ECC SDRAM Memory - -AcceleRAID 352 - 2 Wide Ultra-160 LVD SCSI channels - 100MHz Intel i960RN RISC Processor - 64 Bit 33MHz PCI (backward compatible with 32 Bit PCI slots) - 32MB/64MB ECC SDRAM Memory - -AcceleRAID 170 - 1 Wide Ultra-160 LVD SCSI channel - 100MHz Intel i960RM RISC Processor - 16MB/32MB/64MB ECC SDRAM Memory - -AcceleRAID 160 (AcceleRAID 170LP) - 1 Wide Ultra-160 LVD SCSI channel - 100MHz Intel i960RS RISC Processor - Built in 16M ECC SDRAM Memory - PCI Low Profile Form Factor - fit for 2U height - -eXtremeRAID 1100 (DAC1164P) - 3 Wide Ultra-2/LVD SCSI channels - 233MHz StrongARM SA 110 Processor - 64 Bit 33MHz PCI (backward compatible with 32 Bit PCI slots) - 16MB/32MB/64MB Parity SDRAM Memory with Battery Backup - -AcceleRAID 250 (DAC960PTL1) - Uses onboard Symbios SCSI chips on certain motherboards - Also includes one onboard Wide Ultra-2/LVD SCSI Channel - 66MHz Intel i960RD RISC Processor - 4MB/8MB/16MB/32MB/64MB/128MB ECC EDO Memory - -AcceleRAID 200 (DAC960PTL0) - Uses onboard Symbios SCSI chips on certain motherboards - Includes no onboard SCSI Channels - 66MHz Intel i960RD RISC Processor - 4MB/8MB/16MB/32MB/64MB/128MB ECC EDO Memory - -AcceleRAID 150 (DAC960PRL) - Uses onboard Symbios SCSI chips on certain motherboards - Also includes one onboard Wide Ultra-2/LVD SCSI Channel - 33MHz Intel i960RP RISC Processor - 4MB Parity EDO Memory - -DAC960PJ 1/2/3 Wide Ultra SCSI-3 Channels - 66MHz Intel i960RD RISC Processor - 4MB/8MB/16MB/32MB/64MB/128MB ECC EDO Memory - -DAC960PG 1/2/3 Wide Ultra SCSI-3 Channels - 33MHz Intel i960RP RISC Processor - 4MB/8MB ECC EDO Memory - -DAC960PU 1/2/3 Wide Ultra SCSI-3 Channels - Intel i960CF RISC Processor - 4MB/8MB EDRAM or 2MB/4MB/8MB/16MB/32MB DRAM Memory - -DAC960PD 1/2/3 Wide Fast SCSI-2 Channels - Intel i960CF RISC Processor - 4MB/8MB EDRAM or 2MB/4MB/8MB/16MB/32MB DRAM Memory - -DAC960PL 1/2/3 Wide Fast SCSI-2 Channels - Intel i960 RISC Processor - 2MB/4MB/8MB/16MB/32MB DRAM Memory - -DAC960P 1/2/3 Wide Fast SCSI-2 Channels - Intel i960 RISC Processor - 2MB/4MB/8MB/16MB/32MB DRAM Memory - -For the eXtremeRAID 2000/3000 and AcceleRAID 352/170/160, firmware version -6.00-01 or above is required. - -For the eXtremeRAID 1100, firmware version 5.06-0-52 or above is required. - -For the AcceleRAID 250, 200, and 150, firmware version 4.06-0-57 or above is -required. - -For the DAC960PJ and DAC960PG, firmware version 4.06-0-00 or above is required. - -For the DAC960PU, DAC960PD, DAC960PL, and DAC960P, either firmware version -3.51-0-04 or above is required (for dual Flash ROM controllers), or firmware -version 2.73-0-00 or above is required (for single Flash ROM controllers) - -Please note that not all SCSI disk drives are suitable for use with DAC960 -controllers, and only particular firmware versions of any given model may -actually function correctly. Similarly, not all motherboards have a BIOS that -properly initializes the AcceleRAID 250, AcceleRAID 200, AcceleRAID 150, -DAC960PJ, and DAC960PG because the Intel i960RD/RP is a multi-function device. -If in doubt, contact Mylex RAID Technical Support (mylexsup@us.ibm.com) to -verify compatibility. Mylex makes available a hard disk compatibility list at -http://www.mylex.com/support/hdcomp/hd-lists.html. - - - DRIVER INSTALLATION - -This distribution was prepared for Linux kernel version 2.2.19 or 2.4.12. - -To install the DAC960 RAID driver, you may use the following commands, -replacing "/usr/src" with wherever you keep your Linux kernel source tree: - - cd /usr/src - tar -xvzf DAC960-2.2.11.tar.gz (or DAC960-2.4.11.tar.gz) - mv README.DAC960 linux/Documentation - mv DAC960.[ch] linux/drivers/block - patch -p0 < DAC960.patch (if DAC960.patch is included) - cd linux - make config - make bzImage (or zImage) - -Then install "arch/i386/boot/bzImage" or "arch/i386/boot/zImage" as your -standard kernel, run lilo if appropriate, and reboot. - -To create the necessary devices in /dev, the "make_rd" script included in -"DAC960-Utilities.tar.gz" from http://www.dandelion.com/Linux/ may be used. -LILO 21 and FDISK v2.9 include DAC960 support; also included in this archive -are patches to LILO 20 and FDISK v2.8 that add DAC960 support, along with -statically linked executables of LILO and FDISK. This modified version of LILO -will allow booting from a DAC960 controller and/or mounting the root file -system from a DAC960. - -Red Hat Linux 6.0 and SuSE Linux 6.1 include support for Mylex PCI RAID -controllers. Installing directly onto a DAC960 may be problematic from other -Linux distributions until their installation utilities are updated. - - - INSTALLATION NOTES - -Before installing Linux or adding DAC960 logical drives to an existing Linux -system, the controller must first be configured to provide one or more logical -drives using the BIOS Configuration Utility or DACCF. Please note that since -there are only at most 6 usable partitions on each logical drive, systems -requiring more partitions should subdivide a drive group into multiple logical -drives, each of which can have up to 6 usable partitions. Also, note that with -large disk arrays it is advisable to enable the 8GB BIOS Geometry (255/63) -rather than accepting the default 2GB BIOS Geometry (128/32); failing to so do -will cause the logical drive geometry to have more than 65535 cylinders which -will make it impossible for FDISK to be used properly. The 8GB BIOS Geometry -can be enabled by configuring the DAC960 BIOS, which is accessible via Alt-M -during the BIOS initialization sequence. - -For maximum performance and the most efficient E2FSCK performance, it is -recommended that EXT2 file systems be built with a 4KB block size and 16 block -stride to match the DAC960 controller's 64KB default stripe size. The command -"mke2fs -b 4096 -R stride=16 " is appropriate. Unless there will be a -large number of small files on the file systems, it is also beneficial to add -the "-i 16384" option to increase the bytes per inode parameter thereby -reducing the file system metadata. Finally, on systems that will only be run -with Linux 2.2 or later kernels it is beneficial to enable sparse superblocks -with the "-s 1" option. - - - DAC960 ANNOUNCEMENTS MAILING LIST - -The DAC960 Announcements Mailing List provides a forum for informing Linux -users of new driver releases and other announcements regarding Linux support -for DAC960 PCI RAID Controllers. To join the mailing list, send a message to -"dac960-announce-request@dandelion.com" with the line "subscribe" in the -message body. - - - CONTROLLER CONFIGURATION AND STATUS MONITORING - -The DAC960 RAID controllers running firmware 4.06 or above include a Background -Initialization facility so that system downtime is minimized both for initial -installation and subsequent configuration of additional storage. The BIOS -Configuration Utility (accessible via Alt-R during the BIOS initialization -sequence) is used to quickly configure the controller, and then the logical -drives that have been created are available for immediate use even while they -are still being initialized by the controller. The primary need for online -configuration and status monitoring is then to avoid system downtime when disk -drives fail and must be replaced. Mylex's online monitoring and configuration -utilities are being ported to Linux and will become available at some point in -the future. Note that with a SAF-TE (SCSI Accessed Fault-Tolerant Enclosure) -enclosure, the controller is able to rebuild failed drives automatically as -soon as a drive replacement is made available. - -The primary interfaces for controller configuration and status monitoring are -special files created in the /proc/rd/... hierarchy along with the normal -system console logging mechanism. Whenever the system is operating, the DAC960 -driver queries each controller for status information every 10 seconds, and -checks for additional conditions every 60 seconds. The initial status of each -controller is always available for controller N in /proc/rd/cN/initial_status, -and the current status as of the last status monitoring query is available in -/proc/rd/cN/current_status. In addition, status changes are also logged by the -driver to the system console and will appear in the log files maintained by -syslog. The progress of asynchronous rebuild or consistency check operations -is also available in /proc/rd/cN/current_status, and progress messages are -logged to the system console at most every 60 seconds. - -Starting with the 2.2.3/2.0.3 versions of the driver, the status information -available in /proc/rd/cN/initial_status and /proc/rd/cN/current_status has been -augmented to include the vendor, model, revision, and serial number (if -available) for each physical device found connected to the controller: - -***** DAC960 RAID Driver Version 2.2.3 of 19 August 1999 ***** -Copyright 1998-1999 by Leonard N. Zubkoff -Configuring Mylex DAC960PRL PCI RAID Controller - Firmware Version: 4.07-0-07, Channels: 1, Memory Size: 16MB - PCI Bus: 1, Device: 4, Function: 1, I/O Address: Unassigned - PCI Address: 0xFE300000 mapped at 0xA0800000, IRQ Channel: 21 - Controller Queue Depth: 128, Maximum Blocks per Command: 128 - Driver Queue Depth: 127, Maximum Scatter/Gather Segments: 33 - Stripe Size: 64KB, Segment Size: 8KB, BIOS Geometry: 255/63 - SAF-TE Enclosure Management Enabled - Physical Devices: - 0:0 Vendor: IBM Model: DRVS09D Revision: 0270 - Serial Number: 68016775HA - Disk Status: Online, 17928192 blocks - 0:1 Vendor: IBM Model: DRVS09D Revision: 0270 - Serial Number: 68004E53HA - Disk Status: Online, 17928192 blocks - 0:2 Vendor: IBM Model: DRVS09D Revision: 0270 - Serial Number: 13013935HA - Disk Status: Online, 17928192 blocks - 0:3 Vendor: IBM Model: DRVS09D Revision: 0270 - Serial Number: 13016897HA - Disk Status: Online, 17928192 blocks - 0:4 Vendor: IBM Model: DRVS09D Revision: 0270 - Serial Number: 68019905HA - Disk Status: Online, 17928192 blocks - 0:5 Vendor: IBM Model: DRVS09D Revision: 0270 - Serial Number: 68012753HA - Disk Status: Online, 17928192 blocks - 0:6 Vendor: ESG-SHV Model: SCA HSBP M6 Revision: 0.61 - Logical Drives: - /dev/rd/c0d0: RAID-5, Online, 89640960 blocks, Write Thru - No Rebuild or Consistency Check in Progress - -To simplify the monitoring process for custom software, the special file -/proc/rd/status returns "OK" when all DAC960 controllers in the system are -operating normally and no failures have occurred, or "ALERT" if any logical -drives are offline or critical or any non-standby physical drives are dead. - -Configuration commands for controller N are available via the special file -/proc/rd/cN/user_command. A human readable command can be written to this -special file to initiate a configuration operation, and the results of the -operation can then be read back from the special file in addition to being -logged to the system console. The shell command sequence - - echo "" > /proc/rd/c0/user_command - cat /proc/rd/c0/user_command - -is typically used to execute configuration commands. The configuration -commands are: - - flush-cache - - The "flush-cache" command flushes the controller's cache. The system - automatically flushes the cache at shutdown or if the driver module is - unloaded, so this command is only needed to be certain a write back cache - is flushed to disk before the system is powered off by a command to a UPS. - Note that the flush-cache command also stops an asynchronous rebuild or - consistency check, so it should not be used except when the system is being - halted. - - kill : - - The "kill" command marks the physical drive : as DEAD. - This command is provided primarily for testing, and should not be used - during normal system operation. - - make-online : - - The "make-online" command changes the physical drive : - from status DEAD to status ONLINE. In cases where multiple physical drives - have been killed simultaneously, this command may be used to bring all but - one of them back online, after which a rebuild to the final drive is - necessary. - - Warning: make-online should only be used on a dead physical drive that is - an active part of a drive group, never on a standby drive. The command - should never be used on a dead drive that is part of a critical logical - drive; rebuild should be used if only a single drive is dead. - - make-standby : - - The "make-standby" command changes physical drive : - from status DEAD to status STANDBY. It should only be used in cases where - a dead drive was replaced after an automatic rebuild was performed onto a - standby drive. It cannot be used to add a standby drive to the controller - configuration if one was not created initially; the BIOS Configuration - Utility must be used for that currently. - - rebuild : - - The "rebuild" command initiates an asynchronous rebuild onto physical drive - :. It should only be used when a dead drive has been - replaced. - - check-consistency - - The "check-consistency" command initiates an asynchronous consistency check - of with automatic restoration. It can be used - whenever it is desired to verify the consistency of the redundancy - information. - - cancel-rebuild - cancel-consistency-check - - The "cancel-rebuild" and "cancel-consistency-check" commands cancel any - rebuild or consistency check operations previously initiated. - - - EXAMPLE I - DRIVE FAILURE WITHOUT A STANDBY DRIVE - -The following annotated logs demonstrate the controller configuration and and -online status monitoring capabilities of the Linux DAC960 Driver. The test -configuration comprises 6 1GB Quantum Atlas I disk drives on two channels of a -DAC960PJ controller. The physical drives are configured into a single drive -group without a standby drive, and the drive group has been configured into two -logical drives, one RAID-5 and one RAID-6. Note that these logs are from an -earlier version of the driver and the messages have changed somewhat with newer -releases, but the functionality remains similar. First, here is the current -status of the RAID configuration: - -gwynedd:/u/lnz# cat /proc/rd/c0/current_status -***** DAC960 RAID Driver Version 2.0.0 of 23 March 1999 ***** -Copyright 1998-1999 by Leonard N. Zubkoff -Configuring Mylex DAC960PJ PCI RAID Controller - Firmware Version: 4.06-0-08, Channels: 3, Memory Size: 8MB - PCI Bus: 0, Device: 19, Function: 1, I/O Address: Unassigned - PCI Address: 0xFD4FC000 mapped at 0x8807000, IRQ Channel: 9 - Controller Queue Depth: 128, Maximum Blocks per Command: 128 - Driver Queue Depth: 127, Maximum Scatter/Gather Segments: 33 - Stripe Size: 64KB, Segment Size: 8KB, BIOS Geometry: 255/63 - Physical Devices: - 0:1 - Disk: Online, 2201600 blocks - 0:2 - Disk: Online, 2201600 blocks - 0:3 - Disk: Online, 2201600 blocks - 1:1 - Disk: Online, 2201600 blocks - 1:2 - Disk: Online, 2201600 blocks - 1:3 - Disk: Online, 2201600 blocks - Logical Drives: - /dev/rd/c0d0: RAID-5, Online, 5498880 blocks, Write Thru - /dev/rd/c0d1: RAID-6, Online, 3305472 blocks, Write Thru - No Rebuild or Consistency Check in Progress - -gwynedd:/u/lnz# cat /proc/rd/status -OK - -The above messages indicate that everything is healthy, and /proc/rd/status -returns "OK" indicating that there are no problems with any DAC960 controller -in the system. For demonstration purposes, while I/O is active Physical Drive -1:1 is now disconnected, simulating a drive failure. The failure is noted by -the driver within 10 seconds of the controller's having detected it, and the -driver logs the following console status messages indicating that Logical -Drives 0 and 1 are now CRITICAL as a result of Physical Drive 1:1 being DEAD: - -DAC960#0: Physical Drive 1:2 Error Log: Sense Key = 6, ASC = 29, ASCQ = 02 -DAC960#0: Physical Drive 1:3 Error Log: Sense Key = 6, ASC = 29, ASCQ = 02 -DAC960#0: Physical Drive 1:1 killed because of timeout on SCSI command -DAC960#0: Physical Drive 1:1 is now DEAD -DAC960#0: Logical Drive 0 (/dev/rd/c0d0) is now CRITICAL -DAC960#0: Logical Drive 1 (/dev/rd/c0d1) is now CRITICAL - -The Sense Keys logged here are just Check Condition / Unit Attention conditions -arising from a SCSI bus reset that is forced by the controller during its error -recovery procedures. Concurrently with the above, the driver status available -from /proc/rd also reflects the drive failure. The status message in -/proc/rd/status has changed from "OK" to "ALERT": - -gwynedd:/u/lnz# cat /proc/rd/status -ALERT - -and /proc/rd/c0/current_status has been updated: - -gwynedd:/u/lnz# cat /proc/rd/c0/current_status - ... - Physical Devices: - 0:1 - Disk: Online, 2201600 blocks - 0:2 - Disk: Online, 2201600 blocks - 0:3 - Disk: Online, 2201600 blocks - 1:1 - Disk: Dead, 2201600 blocks - 1:2 - Disk: Online, 2201600 blocks - 1:3 - Disk: Online, 2201600 blocks - Logical Drives: - /dev/rd/c0d0: RAID-5, Critical, 5498880 blocks, Write Thru - /dev/rd/c0d1: RAID-6, Critical, 3305472 blocks, Write Thru - No Rebuild or Consistency Check in Progress - -Since there are no standby drives configured, the system can continue to access -the logical drives in a performance degraded mode until the failed drive is -replaced and a rebuild operation completed to restore the redundancy of the -logical drives. Once Physical Drive 1:1 is replaced with a properly -functioning drive, or if the physical drive was killed without having failed -(e.g., due to electrical problems on the SCSI bus), the user can instruct the -controller to initiate a rebuild operation onto the newly replaced drive: - -gwynedd:/u/lnz# echo "rebuild 1:1" > /proc/rd/c0/user_command -gwynedd:/u/lnz# cat /proc/rd/c0/user_command -Rebuild of Physical Drive 1:1 Initiated - -The echo command instructs the controller to initiate an asynchronous rebuild -operation onto Physical Drive 1:1, and the status message that results from the -operation is then available for reading from /proc/rd/c0/user_command, as well -as being logged to the console by the driver. - -Within 10 seconds of this command the driver logs the initiation of the -asynchronous rebuild operation: - -DAC960#0: Rebuild of Physical Drive 1:1 Initiated -DAC960#0: Physical Drive 1:1 Error Log: Sense Key = 6, ASC = 29, ASCQ = 01 -DAC960#0: Physical Drive 1:1 is now WRITE-ONLY -DAC960#0: Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 1% completed - -and /proc/rd/c0/current_status is updated: - -gwynedd:/u/lnz# cat /proc/rd/c0/current_status - ... - Physical Devices: - 0:1 - Disk: Online, 2201600 blocks - 0:2 - Disk: Online, 2201600 blocks - 0:3 - Disk: Online, 2201600 blocks - 1:1 - Disk: Write-Only, 2201600 blocks - 1:2 - Disk: Online, 2201600 blocks - 1:3 - Disk: Online, 2201600 blocks - Logical Drives: - /dev/rd/c0d0: RAID-5, Critical, 5498880 blocks, Write Thru - /dev/rd/c0d1: RAID-6, Critical, 3305472 blocks, Write Thru - Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 6% completed - -As the rebuild progresses, the current status in /proc/rd/c0/current_status is -updated every 10 seconds: - -gwynedd:/u/lnz# cat /proc/rd/c0/current_status - ... - Physical Devices: - 0:1 - Disk: Online, 2201600 blocks - 0:2 - Disk: Online, 2201600 blocks - 0:3 - Disk: Online, 2201600 blocks - 1:1 - Disk: Write-Only, 2201600 blocks - 1:2 - Disk: Online, 2201600 blocks - 1:3 - Disk: Online, 2201600 blocks - Logical Drives: - /dev/rd/c0d0: RAID-5, Critical, 5498880 blocks, Write Thru - /dev/rd/c0d1: RAID-6, Critical, 3305472 blocks, Write Thru - Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 15% completed - -and every minute a progress message is logged to the console by the driver: - -DAC960#0: Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 32% completed -DAC960#0: Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 63% completed -DAC960#0: Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 94% completed -DAC960#0: Rebuild in Progress: Logical Drive 1 (/dev/rd/c0d1) 94% completed - -Finally, the rebuild completes successfully. The driver logs the status of the -logical and physical drives and the rebuild completion: - -DAC960#0: Rebuild Completed Successfully -DAC960#0: Physical Drive 1:1 is now ONLINE -DAC960#0: Logical Drive 0 (/dev/rd/c0d0) is now ONLINE -DAC960#0: Logical Drive 1 (/dev/rd/c0d1) is now ONLINE - -/proc/rd/c0/current_status is updated: - -gwynedd:/u/lnz# cat /proc/rd/c0/current_status - ... - Physical Devices: - 0:1 - Disk: Online, 2201600 blocks - 0:2 - Disk: Online, 2201600 blocks - 0:3 - Disk: Online, 2201600 blocks - 1:1 - Disk: Online, 2201600 blocks - 1:2 - Disk: Online, 2201600 blocks - 1:3 - Disk: Online, 2201600 blocks - Logical Drives: - /dev/rd/c0d0: RAID-5, Online, 5498880 blocks, Write Thru - /dev/rd/c0d1: RAID-6, Online, 3305472 blocks, Write Thru - Rebuild Completed Successfully - -and /proc/rd/status indicates that everything is healthy once again: - -gwynedd:/u/lnz# cat /proc/rd/status -OK - - - EXAMPLE II - DRIVE FAILURE WITH A STANDBY DRIVE - -The following annotated logs demonstrate the controller configuration and and -online status monitoring capabilities of the Linux DAC960 Driver. The test -configuration comprises 6 1GB Quantum Atlas I disk drives on two channels of a -DAC960PJ controller. The physical drives are configured into a single drive -group with a standby drive, and the drive group has been configured into two -logical drives, one RAID-5 and one RAID-6. Note that these logs are from an -earlier version of the driver and the messages have changed somewhat with newer -releases, but the functionality remains similar. First, here is the current -status of the RAID configuration: - -gwynedd:/u/lnz# cat /proc/rd/c0/current_status -***** DAC960 RAID Driver Version 2.0.0 of 23 March 1999 ***** -Copyright 1998-1999 by Leonard N. Zubkoff -Configuring Mylex DAC960PJ PCI RAID Controller - Firmware Version: 4.06-0-08, Channels: 3, Memory Size: 8MB - PCI Bus: 0, Device: 19, Function: 1, I/O Address: Unassigned - PCI Address: 0xFD4FC000 mapped at 0x8807000, IRQ Channel: 9 - Controller Queue Depth: 128, Maximum Blocks per Command: 128 - Driver Queue Depth: 127, Maximum Scatter/Gather Segments: 33 - Stripe Size: 64KB, Segment Size: 8KB, BIOS Geometry: 255/63 - Physical Devices: - 0:1 - Disk: Online, 2201600 blocks - 0:2 - Disk: Online, 2201600 blocks - 0:3 - Disk: Online, 2201600 blocks - 1:1 - Disk: Online, 2201600 blocks - 1:2 - Disk: Online, 2201600 blocks - 1:3 - Disk: Standby, 2201600 blocks - Logical Drives: - /dev/rd/c0d0: RAID-5, Online, 4399104 blocks, Write Thru - /dev/rd/c0d1: RAID-6, Online, 2754560 blocks, Write Thru - No Rebuild or Consistency Check in Progress - -gwynedd:/u/lnz# cat /proc/rd/status -OK - -The above messages indicate that everything is healthy, and /proc/rd/status -returns "OK" indicating that there are no problems with any DAC960 controller -in the system. For demonstration purposes, while I/O is active Physical Drive -1:2 is now disconnected, simulating a drive failure. The failure is noted by -the driver within 10 seconds of the controller's having detected it, and the -driver logs the following console status messages: - -DAC960#0: Physical Drive 1:1 Error Log: Sense Key = 6, ASC = 29, ASCQ = 02 -DAC960#0: Physical Drive 1:3 Error Log: Sense Key = 6, ASC = 29, ASCQ = 02 -DAC960#0: Physical Drive 1:2 killed because of timeout on SCSI command -DAC960#0: Physical Drive 1:2 is now DEAD -DAC960#0: Physical Drive 1:2 killed because it was removed -DAC960#0: Logical Drive 0 (/dev/rd/c0d0) is now CRITICAL -DAC960#0: Logical Drive 1 (/dev/rd/c0d1) is now CRITICAL - -Since a standby drive is configured, the controller automatically begins -rebuilding onto the standby drive: - -DAC960#0: Physical Drive 1:3 is now WRITE-ONLY -DAC960#0: Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 4% completed - -Concurrently with the above, the driver status available from /proc/rd also -reflects the drive failure and automatic rebuild. The status message in -/proc/rd/status has changed from "OK" to "ALERT": - -gwynedd:/u/lnz# cat /proc/rd/status -ALERT - -and /proc/rd/c0/current_status has been updated: - -gwynedd:/u/lnz# cat /proc/rd/c0/current_status - ... - Physical Devices: - 0:1 - Disk: Online, 2201600 blocks - 0:2 - Disk: Online, 2201600 blocks - 0:3 - Disk: Online, 2201600 blocks - 1:1 - Disk: Online, 2201600 blocks - 1:2 - Disk: Dead, 2201600 blocks - 1:3 - Disk: Write-Only, 2201600 blocks - Logical Drives: - /dev/rd/c0d0: RAID-5, Critical, 4399104 blocks, Write Thru - /dev/rd/c0d1: RAID-6, Critical, 2754560 blocks, Write Thru - Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 4% completed - -As the rebuild progresses, the current status in /proc/rd/c0/current_status is -updated every 10 seconds: - -gwynedd:/u/lnz# cat /proc/rd/c0/current_status - ... - Physical Devices: - 0:1 - Disk: Online, 2201600 blocks - 0:2 - Disk: Online, 2201600 blocks - 0:3 - Disk: Online, 2201600 blocks - 1:1 - Disk: Online, 2201600 blocks - 1:2 - Disk: Dead, 2201600 blocks - 1:3 - Disk: Write-Only, 2201600 blocks - Logical Drives: - /dev/rd/c0d0: RAID-5, Critical, 4399104 blocks, Write Thru - /dev/rd/c0d1: RAID-6, Critical, 2754560 blocks, Write Thru - Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 40% completed - -and every minute a progress message is logged on the console by the driver: - -DAC960#0: Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 40% completed -DAC960#0: Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 76% completed -DAC960#0: Rebuild in Progress: Logical Drive 1 (/dev/rd/c0d1) 66% completed -DAC960#0: Rebuild in Progress: Logical Drive 1 (/dev/rd/c0d1) 84% completed - -Finally, the rebuild completes successfully. The driver logs the status of the -logical and physical drives and the rebuild completion: - -DAC960#0: Rebuild Completed Successfully -DAC960#0: Physical Drive 1:3 is now ONLINE -DAC960#0: Logical Drive 0 (/dev/rd/c0d0) is now ONLINE -DAC960#0: Logical Drive 1 (/dev/rd/c0d1) is now ONLINE - -/proc/rd/c0/current_status is updated: - -***** DAC960 RAID Driver Version 2.0.0 of 23 March 1999 ***** -Copyright 1998-1999 by Leonard N. Zubkoff -Configuring Mylex DAC960PJ PCI RAID Controller - Firmware Version: 4.06-0-08, Channels: 3, Memory Size: 8MB - PCI Bus: 0, Device: 19, Function: 1, I/O Address: Unassigned - PCI Address: 0xFD4FC000 mapped at 0x8807000, IRQ Channel: 9 - Controller Queue Depth: 128, Maximum Blocks per Command: 128 - Driver Queue Depth: 127, Maximum Scatter/Gather Segments: 33 - Stripe Size: 64KB, Segment Size: 8KB, BIOS Geometry: 255/63 - Physical Devices: - 0:1 - Disk: Online, 2201600 blocks - 0:2 - Disk: Online, 2201600 blocks - 0:3 - Disk: Online, 2201600 blocks - 1:1 - Disk: Online, 2201600 blocks - 1:2 - Disk: Dead, 2201600 blocks - 1:3 - Disk: Online, 2201600 blocks - Logical Drives: - /dev/rd/c0d0: RAID-5, Online, 4399104 blocks, Write Thru - /dev/rd/c0d1: RAID-6, Online, 2754560 blocks, Write Thru - Rebuild Completed Successfully - -and /proc/rd/status indicates that everything is healthy once again: - -gwynedd:/u/lnz# cat /proc/rd/status -OK - -Note that the absence of a viable standby drive does not create an "ALERT" -status. Once dead Physical Drive 1:2 has been replaced, the controller must be -told that this has occurred and that the newly replaced drive should become the -new standby drive: - -gwynedd:/u/lnz# echo "make-standby 1:2" > /proc/rd/c0/user_command -gwynedd:/u/lnz# cat /proc/rd/c0/user_command -Make Standby of Physical Drive 1:2 Succeeded - -The echo command instructs the controller to make Physical Drive 1:2 into a -standby drive, and the status message that results from the operation is then -available for reading from /proc/rd/c0/user_command, as well as being logged to -the console by the driver. Within 60 seconds of this command the driver logs: - -DAC960#0: Physical Drive 1:2 Error Log: Sense Key = 6, ASC = 29, ASCQ = 01 -DAC960#0: Physical Drive 1:2 is now STANDBY -DAC960#0: Make Standby of Physical Drive 1:2 Succeeded - -and /proc/rd/c0/current_status is updated: - -gwynedd:/u/lnz# cat /proc/rd/c0/current_status - ... - Physical Devices: - 0:1 - Disk: Online, 2201600 blocks - 0:2 - Disk: Online, 2201600 blocks - 0:3 - Disk: Online, 2201600 blocks - 1:1 - Disk: Online, 2201600 blocks - 1:2 - Disk: Standby, 2201600 blocks - 1:3 - Disk: Online, 2201600 blocks - Logical Drives: - /dev/rd/c0d0: RAID-5, Online, 4399104 blocks, Write Thru - /dev/rd/c0d1: RAID-6, Online, 2754560 blocks, Write Thru - Rebuild Completed Successfully diff --git a/Documentation/README.cycladesZ b/Documentation/README.cycladesZ deleted file mode 100644 index 024a69443cc..00000000000 --- a/Documentation/README.cycladesZ +++ /dev/null @@ -1,8 +0,0 @@ - -The Cyclades-Z must have firmware loaded onto the card before it will -operate. This operation should be performed during system startup, - -The firmware, loader program and the latest device driver code are -available from Cyclades at - ftp://ftp.cyclades.com/pub/cyclades/cyclades-z/linux/ - diff --git a/Documentation/accounting/getdelays.c b/Documentation/accounting/getdelays.c index cc49400b4af..7ea231172c8 100644 --- a/Documentation/accounting/getdelays.c +++ b/Documentation/accounting/getdelays.c @@ -392,6 +392,10 @@ int main(int argc, char *argv[]) goto err; } } + if (!maskset && !tid && !containerset) { + usage(); + goto err; + } do { int i; diff --git a/Documentation/acpi/debug.txt b/Documentation/acpi/debug.txt new file mode 100644 index 00000000000..65bf47c46b6 --- /dev/null +++ b/Documentation/acpi/debug.txt @@ -0,0 +1,148 @@ + ACPI Debug Output + + +The ACPI CA, the Linux ACPI core, and some ACPI drivers can generate debug +output. This document describes how to use this facility. + +Compile-time configuration +-------------------------- + +ACPI debug output is globally enabled by CONFIG_ACPI_DEBUG. If this config +option is turned off, the debug messages are not even built into the +kernel. + +Boot- and run-time configuration +-------------------------------- + +When CONFIG_ACPI_DEBUG=y, you can select the component and level of messages +you're interested in. At boot-time, use the acpi.debug_layer and +acpi.debug_level kernel command line options. After boot, you can use the +debug_layer and debug_level files in /sys/module/acpi/parameters/ to control +the debug messages. + +debug_layer (component) +----------------------- + +The "debug_layer" is a mask that selects components of interest, e.g., a +specific driver or part of the ACPI interpreter. To build the debug_layer +bitmask, look for the "#define _COMPONENT" in an ACPI source file. + +You can set the debug_layer mask at boot-time using the acpi.debug_layer +command line argument, and you can change it after boot by writing values +to /sys/module/acpi/parameters/debug_layer. + +The possible components are defined in include/acpi/acoutput.h and +include/acpi/acpi_drivers.h. Reading /sys/module/acpi/parameters/debug_layer +shows the supported mask values, currently these: + + ACPI_UTILITIES 0x00000001 + ACPI_HARDWARE 0x00000002 + ACPI_EVENTS 0x00000004 + ACPI_TABLES 0x00000008 + ACPI_NAMESPACE 0x00000010 + ACPI_PARSER 0x00000020 + ACPI_DISPATCHER 0x00000040 + ACPI_EXECUTER 0x00000080 + ACPI_RESOURCES 0x00000100 + ACPI_CA_DEBUGGER 0x00000200 + ACPI_OS_SERVICES 0x00000400 + ACPI_CA_DISASSEMBLER 0x00000800 + ACPI_COMPILER 0x00001000 + ACPI_TOOLS 0x00002000 + ACPI_BUS_COMPONENT 0x00010000 + ACPI_AC_COMPONENT 0x00020000 + ACPI_BATTERY_COMPONENT 0x00040000 + ACPI_BUTTON_COMPONENT 0x00080000 + ACPI_SBS_COMPONENT 0x00100000 + ACPI_FAN_COMPONENT 0x00200000 + ACPI_PCI_COMPONENT 0x00400000 + ACPI_POWER_COMPONENT 0x00800000 + ACPI_CONTAINER_COMPONENT 0x01000000 + ACPI_SYSTEM_COMPONENT 0x02000000 + ACPI_THERMAL_COMPONENT 0x04000000 + ACPI_MEMORY_DEVICE_COMPONENT 0x08000000 + ACPI_VIDEO_COMPONENT 0x10000000 + ACPI_PROCESSOR_COMPONENT 0x20000000 + +debug_level +----------- + +The "debug_level" is a mask that selects different types of messages, e.g., +those related to initialization, method execution, informational messages, etc. +To build debug_level, look at the level specified in an ACPI_DEBUG_PRINT() +statement. + +The ACPI interpreter uses several different levels, but the Linux +ACPI core and ACPI drivers generally only use ACPI_LV_INFO. + +You can set the debug_level mask at boot-time using the acpi.debug_level +command line argument, and you can change it after boot by writing values +to /sys/module/acpi/parameters/debug_level. + +The possible levels are defined in include/acpi/acoutput.h. Reading +/sys/module/acpi/parameters/debug_level shows the supported mask values, +currently these: + + ACPI_LV_INIT 0x00000001 + ACPI_LV_DEBUG_OBJECT 0x00000002 + ACPI_LV_INFO 0x00000004 + ACPI_LV_INIT_NAMES 0x00000020 + ACPI_LV_PARSE 0x00000040 + ACPI_LV_LOAD 0x00000080 + ACPI_LV_DISPATCH 0x00000100 + ACPI_LV_EXEC 0x00000200 + ACPI_LV_NAMES 0x00000400 + ACPI_LV_OPREGION 0x00000800 + ACPI_LV_BFIELD 0x00001000 + ACPI_LV_TABLES 0x00002000 + ACPI_LV_VALUES 0x00004000 + ACPI_LV_OBJECTS 0x00008000 + ACPI_LV_RESOURCES 0x00010000 + ACPI_LV_USER_REQUESTS 0x00020000 + ACPI_LV_PACKAGE 0x00040000 + ACPI_LV_ALLOCATIONS 0x00100000 + ACPI_LV_FUNCTIONS 0x00200000 + ACPI_LV_OPTIMIZATIONS 0x00400000 + ACPI_LV_MUTEX 0x01000000 + ACPI_LV_THREADS 0x02000000 + ACPI_LV_IO 0x04000000 + ACPI_LV_INTERRUPTS 0x08000000 + ACPI_LV_AML_DISASSEMBLE 0x10000000 + ACPI_LV_VERBOSE_INFO 0x20000000 + ACPI_LV_FULL_TABLES 0x40000000 + ACPI_LV_EVENTS 0x80000000 + +Examples +-------- + +For example, drivers/acpi/bus.c contains this: + + #define _COMPONENT ACPI_BUS_COMPONENT + ... + ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Device insertion detected\n")); + +To turn on this message, set the ACPI_BUS_COMPONENT bit in acpi.debug_layer +and the ACPI_LV_INFO bit in acpi.debug_level. (The ACPI_DEBUG_PRINT +statement uses ACPI_DB_INFO, which is macro based on the ACPI_LV_INFO +definition.) + +Enable all AML "Debug" output (stores to the Debug object while interpreting +AML) during boot: + + acpi.debug_layer=0xffffffff acpi.debug_level=0x2 + +Enable PCI and PCI interrupt routing debug messages: + + acpi.debug_layer=0x400000 acpi.debug_level=0x4 + +Enable all ACPI hardware-related messages: + + acpi.debug_layer=0x2 acpi.debug_level=0xffffffff + +Enable all ACPI_DB_INFO messages after boot: + + # echo 0x4 > /sys/module/acpi/parameters/debug_level + +Show all valid component values: + + # cat /sys/module/acpi/parameters/debug_layer diff --git a/Documentation/arm/mem_alignment b/Documentation/arm/mem_alignment index d145ccca169..c7c7a114c78 100644 --- a/Documentation/arm/mem_alignment +++ b/Documentation/arm/mem_alignment @@ -24,7 +24,7 @@ real bad - it changes the behaviour of all unaligned instructions in user space, and might cause programs to fail unexpectedly. To change the alignment trap behavior, simply echo a number into -/proc/sys/debug/alignment. The number is made up from various bits: +/proc/cpu/alignment. The number is made up from various bits: bit behavior when set --- ----------------- diff --git a/Documentation/arm/pxa/mfp.txt b/Documentation/arm/pxa/mfp.txt new file mode 100644 index 00000000000..a179e5bc02c --- /dev/null +++ b/Documentation/arm/pxa/mfp.txt @@ -0,0 +1,286 @@ + MFP Configuration for PXA2xx/PXA3xx Processors + + Eric Miao + +MFP stands for Multi-Function Pin, which is the pin-mux logic on PXA3xx and +later PXA series processors. This document describes the existing MFP API, +and how board/platform driver authors could make use of it. + + Basic Concept +=============== + +Unlike the GPIO alternate function settings on PXA25x and PXA27x, a new MFP +mechanism is introduced from PXA3xx to completely move the pin-mux functions +out of the GPIO controller. In addition to pin-mux configurations, the MFP +also controls the low power state, driving strength, pull-up/down and event +detection of each pin. Below is a diagram of internal connections between +the MFP logic and the remaining SoC peripherals: + + +--------+ + | |--(GPIO19)--+ + | GPIO | | + | |--(GPIO...) | + +--------+ | + | +---------+ + +--------+ +------>| | + | PWM2 |--(PWM_OUT)-------->| MFP | + +--------+ +------>| |-------> to external PAD + | +---->| | + +--------+ | | +-->| | + | SSP2 |---(TXD)----+ | | +---------+ + +--------+ | | + | | + +--------+ | | + | Keypad |--(MKOUT4)----+ | + +--------+ | + | + +--------+ | + | UART2 |---(TXD)--------+ + +--------+ + +NOTE: the external pad is named as MFP_PIN_GPIO19, it doesn't necessarily +mean it's dedicated for GPIO19, only as a hint that internally this pin +can be routed from GPIO19 of the GPIO controller. + +To better understand the change from PXA25x/PXA27x GPIO alternate function +to this new MFP mechanism, here are several key points: + + 1. GPIO controller on PXA3xx is now a dedicated controller, same as other + internal controllers like PWM, SSP and UART, with 128 internal signals + which can be routed to external through one or more MFPs (e.g. GPIO<0> + can be routed through either MFP_PIN_GPIO0 as well as MFP_PIN_GPIO0_2, + see arch/arm/mach-pxa/mach/include/mfp-pxa300.h) + + 2. Alternate function configuration is removed from this GPIO controller, + the remaining functions are pure GPIO-specific, i.e. + + - GPIO signal level control + - GPIO direction control + - GPIO level change detection + + 3. Low power state for each pin is now controlled by MFP, this means the + PGSRx registers on PXA2xx are now useless on PXA3xx + + 4. Wakeup detection is now controlled by MFP, PWER does not control the + wakeup from GPIO(s) any more, depending on the sleeping state, ADxER + (as defined in pxa3xx-regs.h) controls the wakeup from MFP + +NOTE: with such a clear separation of MFP and GPIO, by GPIO we normally +mean it is a GPIO signal, and by MFP or pin xxx, we mean a physical +pad (or ball). + + MFP API Usage +=============== + +For board code writers, here are some guidelines: + +1. include ONE of the following header files in your .c: + + - #include + - #include + - #include + - #include + - #include + + NOTE: only one file in your .c, depending on the processors used, + because pin configuration definitions may conflict in these file (i.e. + same name, different meaning and settings on different processors). E.g. + for zylonite platform, which support both PXA300/PXA310 and PXA320, two + separate files are introduced: zylonite_pxa300.c and zylonite_pxa320.c + (in addition to handle MFP configuration differences, they also handle + the other differences between the two combinations). + + NOTE: PXA300 and PXA310 are almost identical in pin configurations (with + PXA310 supporting some additional ones), thus the difference is actually + covered in a single mfp-pxa300.h. + +2. prepare an array for the initial pin configurations, e.g.: + + static unsigned long mainstone_pin_config[] __initdata = { + /* Chip Select */ + GPIO15_nCS_1, + + /* LCD - 16bpp Active TFT */ + GPIOxx_TFT_LCD_16BPP, + GPIO16_PWM0_OUT, /* Backlight */ + + /* MMC */ + GPIO32_MMC_CLK, + GPIO112_MMC_CMD, + GPIO92_MMC_DAT_0, + GPIO109_MMC_DAT_1, + GPIO110_MMC_DAT_2, + GPIO111_MMC_DAT_3, + + ... + + /* GPIO */ + GPIO1_GPIO | WAKEUP_ON_EDGE_BOTH, + }; + + a) once the pin configurations are passed to pxa{2xx,3xx}_mfp_config(), + and written to the actual registers, they are useless and may discard, + adding '__initdata' will help save some additional bytes here. + + b) when there is only one possible pin configurations for a component, + some simplified definitions can be used, e.g. GPIOxx_TFT_LCD_16BPP on + PXA25x and PXA27x processors + + c) if by board design, a pin can be configured to wake up the system + from low power state, it can be 'OR'ed with any of: + + WAKEUP_ON_EDGE_BOTH + WAKEUP_ON_EDGE_RISE + WAKEUP_ON_EDGE_FALL + WAKEUP_ON_LEVEL_HIGH - specifically for enabling of keypad GPIOs, + + to indicate that this pin has the capability of wake-up the system, + and on which edge(s). This, however, doesn't necessarily mean the + pin _will_ wakeup the system, it will only when set_irq_wake() is + invoked with the corresponding GPIO IRQ (GPIO_IRQ(xx) or gpio_to_irq()) + and eventually calls gpio_set_wake() for the actual register setting. + + d) although PXA3xx MFP supports edge detection on each pin, the + internal logic will only wakeup the system when those specific bits + in ADxER registers are set, which can be well mapped to the + corresponding peripheral, thus set_irq_wake() can be called with + the peripheral IRQ to enable the wakeup. + + + MFP on PXA3xx +=============== + +Every external I/O pad on PXA3xx (excluding those for special purpose) has +one MFP logic associated, and is controlled by one MFP register (MFPR). + +The MFPR has the following bit definitions (for PXA300/PXA310/PXA320): + + 31 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 + +-------------------------+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + | RESERVED |PS|PU|PD| DRIVE |SS|SD|SO|EC|EF|ER|--| AF_SEL | + +-------------------------+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + + Bit 3: RESERVED + Bit 4: EDGE_RISE_EN - enable detection of rising edge on this pin + Bit 5: EDGE_FALL_EN - enable detection of falling edge on this pin + Bit 6: EDGE_CLEAR - disable edge detection on this pin + Bit 7: SLEEP_OE_N - enable outputs during low power modes + Bit 8: SLEEP_DATA - output data on the pin during low power modes + Bit 9: SLEEP_SEL - selection control for low power modes signals + Bit 13: PULLDOWN_EN - enable the internal pull-down resistor on this pin + Bit 14: PULLUP_EN - enable the internal pull-up resistor on this pin + Bit 15: PULL_SEL - pull state controlled by selected alternate function + (0) or by PULL{UP,DOWN}_EN bits (1) + + Bit 0 - 2: AF_SEL - alternate function selection, 8 possibilities, from 0-7 + Bit 10-12: DRIVE - drive strength and slew rate + 0b000 - fast 1mA + 0b001 - fast 2mA + 0b002 - fast 3mA + 0b003 - fast 4mA + 0b004 - slow 6mA + 0b005 - fast 6mA + 0b006 - slow 10mA + 0b007 - fast 10mA + + MFP Design for PXA2xx/PXA3xx +============================== + +Due to the difference of pin-mux handling between PXA2xx and PXA3xx, a unified +MFP API is introduced to cover both series of processors. + +The basic idea of this design is to introduce definitions for all possible pin +configurations, these definitions are processor and platform independent, and +the actual API invoked to convert these definitions into register settings and +make them effective there-after. + + Files Involved + -------------- + + - arch/arm/mach-pxa/include/mach/mfp.h + + for + 1. Unified pin definitions - enum constants for all configurable pins + 2. processor-neutral bit definitions for a possible MFP configuration + + - arch/arm/mach-pxa/include/mach/mfp-pxa3xx.h + + for PXA3xx specific MFPR register bit definitions and PXA3xx common pin + configurations + + - arch/arm/mach-pxa/include/mach/mfp-pxa2xx.h + + for PXA2xx specific definitions and PXA25x/PXA27x common pin configurations + + - arch/arm/mach-pxa/include/mach/mfp-pxa25x.h + arch/arm/mach-pxa/include/mach/mfp-pxa27x.h + arch/arm/mach-pxa/include/mach/mfp-pxa300.h + arch/arm/mach-pxa/include/mach/mfp-pxa320.h + arch/arm/mach-pxa/include/mach/mfp-pxa930.h + + for processor specific definitions + + - arch/arm/mach-pxa/mfp-pxa3xx.c + - arch/arm/mach-pxa/mfp-pxa2xx.c + + for implementation of the pin configuration to take effect for the actual + processor. + + Pin Configuration + ----------------- + + The following comments are copied from mfp.h (see the actual source code + for most updated info) + + /* + * a possible MFP configuration is represented by a 32-bit integer + * + * bit 0.. 9 - MFP Pin Number (1024 Pins Maximum) + * bit 10..12 - Alternate Function Selection + * bit 13..15 - Drive Strength + * bit 16..18 - Low Power Mode State + * bit 19..20 - Low Power Mode Edge Detection + * bit 21..22 - Run Mode Pull State + * + * to facilitate the definition, the following macros are provided + * + * MFP_CFG_DEFAULT - default MFP configuration value, with + * alternate function = 0, + * drive strength = fast 3mA (MFP_DS03X) + * low power mode = default + * edge detection = none + * + * MFP_CFG - default MFPR value with alternate function + * MFP_CFG_DRV - default MFPR value with alternate function and + * pin drive strength + * MFP_CFG_LPM - default MFPR value with alternate function and + * low power mode + * MFP_CFG_X - default MFPR value with alternate function, + * pin drive strength and low power mode + */ + + Examples of pin configurations are: + + #define GPIO94_SSP3_RXD MFP_CFG_X(GPIO94, AF1, DS08X, FLOAT) + + which reads GPIO94 can be configured as SSP3_RXD, with alternate function + selection of 1, driving strength of 0b101, and a float state in low power + modes. + + NOTE: this is the default setting of this pin being configured as SSP3_RXD + which can be modified a bit in board code, though it is not recommended to + do so, simply because this default setting is usually carefully encoded, + and is supposed to work in most cases. + + Register Settings + ----------------- + + Register settings on PXA3xx for a pin configuration is actually very + straight-forward, most bits can be converted directly into MFPR value + in a easier way. Two sets of MFPR values are calculated: the run-time + ones and the low power mode ones, to allow different settings. + + The conversion from a generic pin configuration to the actual register + settings on PXA2xx is a bit complicated: many registers are involved, + including GAFRx, GPDRx, PGSRx, PWER, PKWR, PFER and PRER. Please see + mfp-pxa2xx.c for how the conversion is made. diff --git a/Documentation/bad_memory.txt b/Documentation/bad_memory.txt new file mode 100644 index 00000000000..df841621320 --- /dev/null +++ b/Documentation/bad_memory.txt @@ -0,0 +1,45 @@ +March 2008 +Jan-Simon Moeller, dl9pf@gmx.de + + +How to deal with bad memory e.g. reported by memtest86+ ? +######################################################### + +There are three possibilities I know of: + +1) Reinsert/swap the memory modules + +2) Buy new modules (best!) or try to exchange the memory + if you have spare-parts + +3) Use BadRAM or memmap + +This Howto is about number 3) . + + +BadRAM +###### +BadRAM is the actively developed and available as kernel-patch +here: http://rick.vanrein.org/linux/badram/ + +For more details see the BadRAM documentation. + +memmap +###### + +memmap is already in the kernel and usable as kernel-parameter at +boot-time. Its syntax is slightly strange and you may need to +calculate the values by yourself! + +Syntax to exclude a memory area (see kernel-parameters.txt for details): +memmap=$
+ +Example: memtest86+ reported here errors at address 0x18691458, 0x18698424 and + some others. All had 0x1869xxxx in common, so I chose a pattern of + 0x18690000,0xffff0000. + +With the numbers of the example above: +memmap=64K$0x18690000 + or +memmap=0x10000$0x18690000 + diff --git a/Documentation/blackfin/00-INDEX b/Documentation/blackfin/00-INDEX index 7cb3b356b24..d6840a91e1e 100644 --- a/Documentation/blackfin/00-INDEX +++ b/Documentation/blackfin/00-INDEX @@ -9,3 +9,6 @@ cachefeatures.txt Filesystems - Requirements for mounting the root file system. + +bfin-gpio-note.txt + - Notes in developing/using bfin-gpio driver. diff --git a/Documentation/blackfin/bfin-gpio-notes.txt b/Documentation/blackfin/bfin-gpio-notes.txt new file mode 100644 index 00000000000..9898c7ded7d --- /dev/null +++ b/Documentation/blackfin/bfin-gpio-notes.txt @@ -0,0 +1,71 @@ +/* + * File: Documentation/blackfin/bfin-gpio-note.txt + * Based on: + * Author: + * + * Created: $Id: bfin-gpio-note.txt 2008-11-24 16:42 grafyang $ + * Description: This file contains the notes in developing/using bfin-gpio. + * + * + * Rev: + * + * Modified: + * Copyright 2004-2008 Analog Devices Inc. + * + * Bugs: Enter bugs at http://blackfin.uclinux.org/ + * + */ + + +1. Blackfin GPIO introduction + + There are many GPIO pins on Blackfin. Most of these pins are muxed to + multi-functions. They can be configured as peripheral, or just as GPIO, + configured to input with interrupt enabled, or output. + + For detailed information, please see "arch/blackfin/kernel/bfin_gpio.c", + or the relevant HRM. + + +2. Avoiding resource conflict + + Followed function groups are used to avoiding resource conflict, + - Use the pin as peripheral, + int peripheral_request(unsigned short per, const char *label); + int peripheral_request_list(const unsigned short per[], const char *label); + void peripheral_free(unsigned short per); + void peripheral_free_list(const unsigned short per[]); + - Use the pin as GPIO, + int bfin_gpio_request(unsigned gpio, const char *label); + void bfin_gpio_free(unsigned gpio); + - Use the pin as GPIO interrupt, + int bfin_gpio_irq_request(unsigned gpio, const char *label); + void bfin_gpio_irq_free(unsigned gpio); + + The request functions will record the function state for a certain pin, + the free functions will clear it's function state. + Once a pin is requested, it can't be requested again before it is freed by + previous caller, otherwise kernel will dump stacks, and the request + function fail. + These functions are wrapped by other functions, most of the users need not + care. + + +3. But there are some exceptions + - Kernel permit the identical GPIO be requested both as GPIO and GPIO + interrut. + Some drivers, like gpio-keys, need this behavior. Kernel only print out + warning messages like, + bfin-gpio: GPIO 24 is already reserved by gpio-keys: BTN0, and you are +configuring it as IRQ! + + Note: Consider the case that, if there are two drivers need the + identical GPIO, one of them use it as GPIO, the other use it as + GPIO interrupt. This will really cause resource conflict. So if + there is any abnormal driver behavior, please check the bfin-gpio + warning messages. + + - Kernel permit the identical GPIO be requested from the same driver twice. + + + diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt index 4dbb8be1c99..3c5434c83da 100644 --- a/Documentation/block/biodoc.txt +++ b/Documentation/block/biodoc.txt @@ -914,7 +914,7 @@ I/O scheduler, a.k.a. elevator, is implemented in two layers. Generic dispatch queue and specific I/O schedulers. Unless stated otherwise, elevator is used to refer to both parts and I/O scheduler to specific I/O schedulers. -Block layer implements generic dispatch queue in ll_rw_blk.c and elevator.c. +Block layer implements generic dispatch queue in block/*.c. The generic dispatch queue is responsible for properly ordering barrier requests, requeueing, handling non-fs requests and all other subtleties. @@ -926,8 +926,8 @@ be built inside the kernel. Each queue can choose different one and can also change to another one dynamically. A block layer call to the i/o scheduler follows the convention elv_xxx(). This -calls elevator_xxx_fn in the elevator switch (drivers/block/elevator.c). Oh, -xxx and xxx might not match exactly, but use your imagination. If an elevator +calls elevator_xxx_fn in the elevator switch (block/elevator.c). Oh, xxx +and xxx might not match exactly, but use your imagination. If an elevator doesn't implement a function, the switch does nothing or some minimal house keeping work. diff --git a/Documentation/blockdev/00-INDEX b/Documentation/blockdev/00-INDEX new file mode 100644 index 00000000000..86f054c4701 --- /dev/null +++ b/Documentation/blockdev/00-INDEX @@ -0,0 +1,16 @@ +00-INDEX + - this file +README.DAC960 + - info on Mylex DAC960/DAC1100 PCI RAID Controller Driver for Linux. +cciss.txt + - info, major/minor #'s for Compaq's SMART Array Controllers. +cpqarray.txt + - info on using Compaq's SMART2 Intelligent Disk Array Controllers. +floppy.txt + - notes and driver options for the floppy disk driver. +nbd.txt + - info on a TCP implementation of a network block device. +paride.txt + - information about the parallel port IDE subsystem. +ramdisk.txt + - short guide on how to set up and use the RAM disk. diff --git a/Documentation/blockdev/README.DAC960 b/Documentation/blockdev/README.DAC960 new file mode 100644 index 00000000000..0e8f618ab53 --- /dev/null +++ b/Documentation/blockdev/README.DAC960 @@ -0,0 +1,756 @@ + Linux Driver for Mylex DAC960/AcceleRAID/eXtremeRAID PCI RAID Controllers + + Version 2.2.11 for Linux 2.2.19 + Version 2.4.11 for Linux 2.4.12 + + PRODUCTION RELEASE + + 11 October 2001 + + Leonard N. Zubkoff + Dandelion Digital + lnz@dandelion.com + + Copyright 1998-2001 by Leonard N. Zubkoff + + + INTRODUCTION + +Mylex, Inc. designs and manufactures a variety of high performance PCI RAID +controllers. Mylex Corporation is located at 34551 Ardenwood Blvd., Fremont, +California 94555, USA and can be reached at 510.796.6100 or on the World Wide +Web at http://www.mylex.com. Mylex Technical Support can be reached by +electronic mail at mylexsup@us.ibm.com, by voice at 510.608.2400, or by FAX at +510.745.7715. Contact information for offices in Europe and Japan is available +on their Web site. + +The latest information on Linux support for DAC960 PCI RAID Controllers, as +well as the most recent release of this driver, will always be available from +my Linux Home Page at URL "http://www.dandelion.com/Linux/". The Linux DAC960 +driver supports all current Mylex PCI RAID controllers including the new +eXtremeRAID 2000/3000 and AcceleRAID 352/170/160 models which have an entirely +new firmware interface from the older eXtremeRAID 1100, AcceleRAID 150/200/250, +and DAC960PJ/PG/PU/PD/PL. See below for a complete controller list as well as +minimum firmware version requirements. For simplicity, in most places this +documentation refers to DAC960 generically rather than explicitly listing all +the supported models. + +Driver bug reports should be sent via electronic mail to "lnz@dandelion.com". +Please include with the bug report the complete configuration messages reported +by the driver at startup, along with any subsequent system messages relevant to +the controller's operation, and a detailed description of your system's +hardware configuration. Driver bugs are actually quite rare; if you encounter +problems with disks being marked offline, for example, please contact Mylex +Technical Support as the problem is related to the hardware configuration +rather than the Linux driver. + +Please consult the RAID controller documentation for detailed information +regarding installation and configuration of the controllers. This document +primarily provides information specific to the Linux support. + + + DRIVER FEATURES + +The DAC960 RAID controllers are supported solely as high performance RAID +controllers, not as interfaces to arbitrary SCSI devices. The Linux DAC960 +driver operates at the block device level, the same level as the SCSI and IDE +drivers. Unlike other RAID controllers currently supported on Linux, the +DAC960 driver is not dependent on the SCSI subsystem, and hence avoids all the +complexity and unnecessary code that would be associated with an implementation +as a SCSI driver. The DAC960 driver is designed for as high a performance as +possible with no compromises or extra code for compatibility with lower +performance devices. The DAC960 driver includes extensive error logging and +online configuration management capabilities. Except for initial configuration +of the controller and adding new disk drives, most everything can be handled +from Linux while the system is operational. + +The DAC960 driver is architected to support up to 8 controllers per system. +Each DAC960 parallel SCSI controller can support up to 15 disk drives per +channel, for a maximum of 60 drives on a four channel controller; the fibre +channel eXtremeRAID 3000 controller supports up to 125 disk drives per loop for +a total of 250 drives. The drives installed on a controller are divided into +one or more "Drive Groups", and then each Drive Group is subdivided further +into 1 to 32 "Logical Drives". Each Logical Drive has a specific RAID Level +and caching policy associated with it, and it appears to Linux as a single +block device. Logical Drives are further subdivided into up to 7 partitions +through the normal Linux and PC disk partitioning schemes. Logical Drives are +also known as "System Drives", and Drive Groups are also called "Packs". Both +terms are in use in the Mylex documentation; I have chosen to standardize on +the more generic "Logical Drive" and "Drive Group". + +DAC960 RAID disk devices are named in the style of the obsolete Device File +System (DEVFS). The device corresponding to Logical Drive D on Controller C +is referred to as /dev/rd/cCdD, and the partitions are called /dev/rd/cCdDp1 +through /dev/rd/cCdDp7. For example, partition 3 of Logical Drive 5 on +Controller 2 is referred to as /dev/rd/c2d5p3. Note that unlike with SCSI +disks the device names will not change in the event of a disk drive failure. +The DAC960 driver is assigned major numbers 48 - 55 with one major number per +controller. The 8 bits of minor number are divided into 5 bits for the Logical +Drive and 3 bits for the partition. + + + SUPPORTED DAC960/AcceleRAID/eXtremeRAID PCI RAID CONTROLLERS + +The following list comprises the supported DAC960, AcceleRAID, and eXtremeRAID +PCI RAID Controllers as of the date of this document. It is recommended that +anyone purchasing a Mylex PCI RAID Controller not in the following table +contact the author beforehand to verify that it is or will be supported. + +eXtremeRAID 3000 + 1 Wide Ultra-2/LVD SCSI channel + 2 External Fibre FC-AL channels + 233MHz StrongARM SA 110 Processor + 64 Bit 33MHz PCI (backward compatible with 32 Bit PCI slots) + 32MB/64MB ECC SDRAM Memory + +eXtremeRAID 2000 + 4 Wide Ultra-160 LVD SCSI channels + 233MHz StrongARM SA 110 Processor + 64 Bit 33MHz PCI (backward compatible with 32 Bit PCI slots) + 32MB/64MB ECC SDRAM Memory + +AcceleRAID 352 + 2 Wide Ultra-160 LVD SCSI channels + 100MHz Intel i960RN RISC Processor + 64 Bit 33MHz PCI (backward compatible with 32 Bit PCI slots) + 32MB/64MB ECC SDRAM Memory + +AcceleRAID 170 + 1 Wide Ultra-160 LVD SCSI channel + 100MHz Intel i960RM RISC Processor + 16MB/32MB/64MB ECC SDRAM Memory + +AcceleRAID 160 (AcceleRAID 170LP) + 1 Wide Ultra-160 LVD SCSI channel + 100MHz Intel i960RS RISC Processor + Built in 16M ECC SDRAM Memory + PCI Low Profile Form Factor - fit for 2U height + +eXtremeRAID 1100 (DAC1164P) + 3 Wide Ultra-2/LVD SCSI channels + 233MHz StrongARM SA 110 Processor + 64 Bit 33MHz PCI (backward compatible with 32 Bit PCI slots) + 16MB/32MB/64MB Parity SDRAM Memory with Battery Backup + +AcceleRAID 250 (DAC960PTL1) + Uses onboard Symbios SCSI chips on certain motherboards + Also includes one onboard Wide Ultra-2/LVD SCSI Channel + 66MHz Intel i960RD RISC Processor + 4MB/8MB/16MB/32MB/64MB/128MB ECC EDO Memory + +AcceleRAID 200 (DAC960PTL0) + Uses onboard Symbios SCSI chips on certain motherboards + Includes no onboard SCSI Channels + 66MHz Intel i960RD RISC Processor + 4MB/8MB/16MB/32MB/64MB/128MB ECC EDO Memory + +AcceleRAID 150 (DAC960PRL) + Uses onboard Symbios SCSI chips on certain motherboards + Also includes one onboard Wide Ultra-2/LVD SCSI Channel + 33MHz Intel i960RP RISC Processor + 4MB Parity EDO Memory + +DAC960PJ 1/2/3 Wide Ultra SCSI-3 Channels + 66MHz Intel i960RD RISC Processor + 4MB/8MB/16MB/32MB/64MB/128MB ECC EDO Memory + +DAC960PG 1/2/3 Wide Ultra SCSI-3 Channels + 33MHz Intel i960RP RISC Processor + 4MB/8MB ECC EDO Memory + +DAC960PU 1/2/3 Wide Ultra SCSI-3 Channels + Intel i960CF RISC Processor + 4MB/8MB EDRAM or 2MB/4MB/8MB/16MB/32MB DRAM Memory + +DAC960PD 1/2/3 Wide Fast SCSI-2 Channels + Intel i960CF RISC Processor + 4MB/8MB EDRAM or 2MB/4MB/8MB/16MB/32MB DRAM Memory + +DAC960PL 1/2/3 Wide Fast SCSI-2 Channels + Intel i960 RISC Processor + 2MB/4MB/8MB/16MB/32MB DRAM Memory + +DAC960P 1/2/3 Wide Fast SCSI-2 Channels + Intel i960 RISC Processor + 2MB/4MB/8MB/16MB/32MB DRAM Memory + +For the eXtremeRAID 2000/3000 and AcceleRAID 352/170/160, firmware version +6.00-01 or above is required. + +For the eXtremeRAID 1100, firmware version 5.06-0-52 or above is required. + +For the AcceleRAID 250, 200, and 150, firmware version 4.06-0-57 or above is +required. + +For the DAC960PJ and DAC960PG, firmware version 4.06-0-00 or above is required. + +For the DAC960PU, DAC960PD, DAC960PL, and DAC960P, either firmware version +3.51-0-04 or above is required (for dual Flash ROM controllers), or firmware +version 2.73-0-00 or above is required (for single Flash ROM controllers) + +Please note that not all SCSI disk drives are suitable for use with DAC960 +controllers, and only particular firmware versions of any given model may +actually function correctly. Similarly, not all motherboards have a BIOS that +properly initializes the AcceleRAID 250, AcceleRAID 200, AcceleRAID 150, +DAC960PJ, and DAC960PG because the Intel i960RD/RP is a multi-function device. +If in doubt, contact Mylex RAID Technical Support (mylexsup@us.ibm.com) to +verify compatibility. Mylex makes available a hard disk compatibility list at +http://www.mylex.com/support/hdcomp/hd-lists.html. + + + DRIVER INSTALLATION + +This distribution was prepared for Linux kernel version 2.2.19 or 2.4.12. + +To install the DAC960 RAID driver, you may use the following commands, +replacing "/usr/src" with wherever you keep your Linux kernel source tree: + + cd /usr/src + tar -xvzf DAC960-2.2.11.tar.gz (or DAC960-2.4.11.tar.gz) + mv README.DAC960 linux/Documentation + mv DAC960.[ch] linux/drivers/block + patch -p0 < DAC960.patch (if DAC960.patch is included) + cd linux + make config + make bzImage (or zImage) + +Then install "arch/i386/boot/bzImage" or "arch/i386/boot/zImage" as your +standard kernel, run lilo if appropriate, and reboot. + +To create the necessary devices in /dev, the "make_rd" script included in +"DAC960-Utilities.tar.gz" from http://www.dandelion.com/Linux/ may be used. +LILO 21 and FDISK v2.9 include DAC960 support; also included in this archive +are patches to LILO 20 and FDISK v2.8 that add DAC960 support, along with +statically linked executables of LILO and FDISK. This modified version of LILO +will allow booting from a DAC960 controller and/or mounting the root file +system from a DAC960. + +Red Hat Linux 6.0 and SuSE Linux 6.1 include support for Mylex PCI RAID +controllers. Installing directly onto a DAC960 may be problematic from other +Linux distributions until their installation utilities are updated. + + + INSTALLATION NOTES + +Before installing Linux or adding DAC960 logical drives to an existing Linux +system, the controller must first be configured to provide one or more logical +drives using the BIOS Configuration Utility or DACCF. Please note that since +there are only at most 6 usable partitions on each logical drive, systems +requiring more partitions should subdivide a drive group into multiple logical +drives, each of which can have up to 6 usable partitions. Also, note that with +large disk arrays it is advisable to enable the 8GB BIOS Geometry (255/63) +rather than accepting the default 2GB BIOS Geometry (128/32); failing to so do +will cause the logical drive geometry to have more than 65535 cylinders which +will make it impossible for FDISK to be used properly. The 8GB BIOS Geometry +can be enabled by configuring the DAC960 BIOS, which is accessible via Alt-M +during the BIOS initialization sequence. + +For maximum performance and the most efficient E2FSCK performance, it is +recommended that EXT2 file systems be built with a 4KB block size and 16 block +stride to match the DAC960 controller's 64KB default stripe size. The command +"mke2fs -b 4096 -R stride=16 " is appropriate. Unless there will be a +large number of small files on the file systems, it is also beneficial to add +the "-i 16384" option to increase the bytes per inode parameter thereby +reducing the file system metadata. Finally, on systems that will only be run +with Linux 2.2 or later kernels it is beneficial to enable sparse superblocks +with the "-s 1" option. + + + DAC960 ANNOUNCEMENTS MAILING LIST + +The DAC960 Announcements Mailing List provides a forum for informing Linux +users of new driver releases and other announcements regarding Linux support +for DAC960 PCI RAID Controllers. To join the mailing list, send a message to +"dac960-announce-request@dandelion.com" with the line "subscribe" in the +message body. + + + CONTROLLER CONFIGURATION AND STATUS MONITORING + +The DAC960 RAID controllers running firmware 4.06 or above include a Background +Initialization facility so that system downtime is minimized both for initial +installation and subsequent configuration of additional storage. The BIOS +Configuration Utility (accessible via Alt-R during the BIOS initialization +sequence) is used to quickly configure the controller, and then the logical +drives that have been created are available for immediate use even while they +are still being initialized by the controller. The primary need for online +configuration and status monitoring is then to avoid system downtime when disk +drives fail and must be replaced. Mylex's online monitoring and configuration +utilities are being ported to Linux and will become available at some point in +the future. Note that with a SAF-TE (SCSI Accessed Fault-Tolerant Enclosure) +enclosure, the controller is able to rebuild failed drives automatically as +soon as a drive replacement is made available. + +The primary interfaces for controller configuration and status monitoring are +special files created in the /proc/rd/... hierarchy along with the normal +system console logging mechanism. Whenever the system is operating, the DAC960 +driver queries each controller for status information every 10 seconds, and +checks for additional conditions every 60 seconds. The initial status of each +controller is always available for controller N in /proc/rd/cN/initial_status, +and the current status as of the last status monitoring query is available in +/proc/rd/cN/current_status. In addition, status changes are also logged by the +driver to the system console and will appear in the log files maintained by +syslog. The progress of asynchronous rebuild or consistency check operations +is also available in /proc/rd/cN/current_status, and progress messages are +logged to the system console at most every 60 seconds. + +Starting with the 2.2.3/2.0.3 versions of the driver, the status information +available in /proc/rd/cN/initial_status and /proc/rd/cN/current_status has been +augmented to include the vendor, model, revision, and serial number (if +available) for each physical device found connected to the controller: + +***** DAC960 RAID Driver Version 2.2.3 of 19 August 1999 ***** +Copyright 1998-1999 by Leonard N. Zubkoff +Configuring Mylex DAC960PRL PCI RAID Controller + Firmware Version: 4.07-0-07, Channels: 1, Memory Size: 16MB + PCI Bus: 1, Device: 4, Function: 1, I/O Address: Unassigned + PCI Address: 0xFE300000 mapped at 0xA0800000, IRQ Channel: 21 + Controller Queue Depth: 128, Maximum Blocks per Command: 128 + Driver Queue Depth: 127, Maximum Scatter/Gather Segments: 33 + Stripe Size: 64KB, Segment Size: 8KB, BIOS Geometry: 255/63 + SAF-TE Enclosure Management Enabled + Physical Devices: + 0:0 Vendor: IBM Model: DRVS09D Revision: 0270 + Serial Number: 68016775HA + Disk Status: Online, 17928192 blocks + 0:1 Vendor: IBM Model: DRVS09D Revision: 0270 + Serial Number: 68004E53HA + Disk Status: Online, 17928192 blocks + 0:2 Vendor: IBM Model: DRVS09D Revision: 0270 + Serial Number: 13013935HA + Disk Status: Online, 17928192 blocks + 0:3 Vendor: IBM Model: DRVS09D Revision: 0270 + Serial Number: 13016897HA + Disk Status: Online, 17928192 blocks + 0:4 Vendor: IBM Model: DRVS09D Revision: 0270 + Serial Number: 68019905HA + Disk Status: Online, 17928192 blocks + 0:5 Vendor: IBM Model: DRVS09D Revision: 0270 + Serial Number: 68012753HA + Disk Status: Online, 17928192 blocks + 0:6 Vendor: ESG-SHV Model: SCA HSBP M6 Revision: 0.61 + Logical Drives: + /dev/rd/c0d0: RAID-5, Online, 89640960 blocks, Write Thru + No Rebuild or Consistency Check in Progress + +To simplify the monitoring process for custom software, the special file +/proc/rd/status returns "OK" when all DAC960 controllers in the system are +operating normally and no failures have occurred, or "ALERT" if any logical +drives are offline or critical or any non-standby physical drives are dead. + +Configuration commands for controller N are available via the special file +/proc/rd/cN/user_command. A human readable command can be written to this +special file to initiate a configuration operation, and the results of the +operation can then be read back from the special file in addition to being +logged to the system console. The shell command sequence + + echo "" > /proc/rd/c0/user_command + cat /proc/rd/c0/user_command + +is typically used to execute configuration commands. The configuration +commands are: + + flush-cache + + The "flush-cache" command flushes the controller's cache. The system + automatically flushes the cache at shutdown or if the driver module is + unloaded, so this command is only needed to be certain a write back cache + is flushed to disk before the system is powered off by a command to a UPS. + Note that the flush-cache command also stops an asynchronous rebuild or + consistency check, so it should not be used except when the system is being + halted. + + kill : + + The "kill" command marks the physical drive : as DEAD. + This command is provided primarily for testing, and should not be used + during normal system operation. + + make-online : + + The "make-online" command changes the physical drive : + from status DEAD to status ONLINE. In cases where multiple physical drives + have been killed simultaneously, this command may be used to bring all but + one of them back online, after which a rebuild to the final drive is + necessary. + + Warning: make-online should only be used on a dead physical drive that is + an active part of a drive group, never on a standby drive. The command + should never be used on a dead drive that is part of a critical logical + drive; rebuild should be used if only a single drive is dead. + + make-standby : + + The "make-standby" command changes physical drive : + from status DEAD to status STANDBY. It should only be used in cases where + a dead drive was replaced after an automatic rebuild was performed onto a + standby drive. It cannot be used to add a standby drive to the controller + configuration if one was not created initially; the BIOS Configuration + Utility must be used for that currently. + + rebuild : + + The "rebuild" command initiates an asynchronous rebuild onto physical drive + :. It should only be used when a dead drive has been + replaced. + + check-consistency + + The "check-consistency" command initiates an asynchronous consistency check + of with automatic restoration. It can be used + whenever it is desired to verify the consistency of the redundancy + information. + + cancel-rebuild + cancel-consistency-check + + The "cancel-rebuild" and "cancel-consistency-check" commands cancel any + rebuild or consistency check operations previously initiated. + + + EXAMPLE I - DRIVE FAILURE WITHOUT A STANDBY DRIVE + +The following annotated logs demonstrate the controller configuration and and +online status monitoring capabilities of the Linux DAC960 Driver. The test +configuration comprises 6 1GB Quantum Atlas I disk drives on two channels of a +DAC960PJ controller. The physical drives are configured into a single drive +group without a standby drive, and the drive group has been configured into two +logical drives, one RAID-5 and one RAID-6. Note that these logs are from an +earlier version of the driver and the messages have changed somewhat with newer +releases, but the functionality remains similar. First, here is the current +status of the RAID configuration: + +gwynedd:/u/lnz# cat /proc/rd/c0/current_status +***** DAC960 RAID Driver Version 2.0.0 of 23 March 1999 ***** +Copyright 1998-1999 by Leonard N. Zubkoff +Configuring Mylex DAC960PJ PCI RAID Controller + Firmware Version: 4.06-0-08, Channels: 3, Memory Size: 8MB + PCI Bus: 0, Device: 19, Function: 1, I/O Address: Unassigned + PCI Address: 0xFD4FC000 mapped at 0x8807000, IRQ Channel: 9 + Controller Queue Depth: 128, Maximum Blocks per Command: 128 + Driver Queue Depth: 127, Maximum Scatter/Gather Segments: 33 + Stripe Size: 64KB, Segment Size: 8KB, BIOS Geometry: 255/63 + Physical Devices: + 0:1 - Disk: Online, 2201600 blocks + 0:2 - Disk: Online, 2201600 blocks + 0:3 - Disk: Online, 2201600 blocks + 1:1 - Disk: Online, 2201600 blocks + 1:2 - Disk: Online, 2201600 blocks + 1:3 - Disk: Online, 2201600 blocks + Logical Drives: + /dev/rd/c0d0: RAID-5, Online, 5498880 blocks, Write Thru + /dev/rd/c0d1: RAID-6, Online, 3305472 blocks, Write Thru + No Rebuild or Consistency Check in Progress + +gwynedd:/u/lnz# cat /proc/rd/status +OK + +The above messages indicate that everything is healthy, and /proc/rd/status +returns "OK" indicating that there are no problems with any DAC960 controller +in the system. For demonstration purposes, while I/O is active Physical Drive +1:1 is now disconnected, simulating a drive failure. The failure is noted by +the driver within 10 seconds of the controller's having detected it, and the +driver logs the following console status messages indicating that Logical +Drives 0 and 1 are now CRITICAL as a result of Physical Drive 1:1 being DEAD: + +DAC960#0: Physical Drive 1:2 Error Log: Sense Key = 6, ASC = 29, ASCQ = 02 +DAC960#0: Physical Drive 1:3 Error Log: Sense Key = 6, ASC = 29, ASCQ = 02 +DAC960#0: Physical Drive 1:1 killed because of timeout on SCSI command +DAC960#0: Physical Drive 1:1 is now DEAD +DAC960#0: Logical Drive 0 (/dev/rd/c0d0) is now CRITICAL +DAC960#0: Logical Drive 1 (/dev/rd/c0d1) is now CRITICAL + +The Sense Keys logged here are just Check Condition / Unit Attention conditions +arising from a SCSI bus reset that is forced by the controller during its error +recovery procedures. Concurrently with the above, the driver status available +from /proc/rd also reflects the drive failure. The status message in +/proc/rd/status has changed from "OK" to "ALERT": + +gwynedd:/u/lnz# cat /proc/rd/status +ALERT + +and /proc/rd/c0/current_status has been updated: + +gwynedd:/u/lnz# cat /proc/rd/c0/current_status + ... + Physical Devices: + 0:1 - Disk: Online, 2201600 blocks + 0:2 - Disk: Online, 2201600 blocks + 0:3 - Disk: Online, 2201600 blocks + 1:1 - Disk: Dead, 2201600 blocks + 1:2 - Disk: Online, 2201600 blocks + 1:3 - Disk: Online, 2201600 blocks + Logical Drives: + /dev/rd/c0d0: RAID-5, Critical, 5498880 blocks, Write Thru + /dev/rd/c0d1: RAID-6, Critical, 3305472 blocks, Write Thru + No Rebuild or Consistency Check in Progress + +Since there are no standby drives configured, the system can continue to access +the logical drives in a performance degraded mode until the failed drive is +replaced and a rebuild operation completed to restore the redundancy of the +logical drives. Once Physical Drive 1:1 is replaced with a properly +functioning drive, or if the physical drive was killed without having failed +(e.g., due to electrical problems on the SCSI bus), the user can instruct the +controller to initiate a rebuild operation onto the newly replaced drive: + +gwynedd:/u/lnz# echo "rebuild 1:1" > /proc/rd/c0/user_command +gwynedd:/u/lnz# cat /proc/rd/c0/user_command +Rebuild of Physical Drive 1:1 Initiated + +The echo command instructs the controller to initiate an asynchronous rebuild +operation onto Physical Drive 1:1, and the status message that results from the +operation is then available for reading from /proc/rd/c0/user_command, as well +as being logged to the console by the driver. + +Within 10 seconds of this command the driver logs the initiation of the +asynchronous rebuild operation: + +DAC960#0: Rebuild of Physical Drive 1:1 Initiated +DAC960#0: Physical Drive 1:1 Error Log: Sense Key = 6, ASC = 29, ASCQ = 01 +DAC960#0: Physical Drive 1:1 is now WRITE-ONLY +DAC960#0: Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 1% completed + +and /proc/rd/c0/current_status is updated: + +gwynedd:/u/lnz# cat /proc/rd/c0/current_status + ... + Physical Devices: + 0:1 - Disk: Online, 2201600 blocks + 0:2 - Disk: Online, 2201600 blocks + 0:3 - Disk: Online, 2201600 blocks + 1:1 - Disk: Write-Only, 2201600 blocks + 1:2 - Disk: Online, 2201600 blocks + 1:3 - Disk: Online, 2201600 blocks + Logical Drives: + /dev/rd/c0d0: RAID-5, Critical, 5498880 blocks, Write Thru + /dev/rd/c0d1: RAID-6, Critical, 3305472 blocks, Write Thru + Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 6% completed + +As the rebuild progresses, the current status in /proc/rd/c0/current_status is +updated every 10 seconds: + +gwynedd:/u/lnz# cat /proc/rd/c0/current_status + ... + Physical Devices: + 0:1 - Disk: Online, 2201600 blocks + 0:2 - Disk: Online, 2201600 blocks + 0:3 - Disk: Online, 2201600 blocks + 1:1 - Disk: Write-Only, 2201600 blocks + 1:2 - Disk: Online, 2201600 blocks + 1:3 - Disk: Online, 2201600 blocks + Logical Drives: + /dev/rd/c0d0: RAID-5, Critical, 5498880 blocks, Write Thru + /dev/rd/c0d1: RAID-6, Critical, 3305472 blocks, Write Thru + Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 15% completed + +and every minute a progress message is logged to the console by the driver: + +DAC960#0: Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 32% completed +DAC960#0: Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 63% completed +DAC960#0: Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 94% completed +DAC960#0: Rebuild in Progress: Logical Drive 1 (/dev/rd/c0d1) 94% completed + +Finally, the rebuild completes successfully. The driver logs the status of the +logical and physical drives and the rebuild completion: + +DAC960#0: Rebuild Completed Successfully +DAC960#0: Physical Drive 1:1 is now ONLINE +DAC960#0: Logical Drive 0 (/dev/rd/c0d0) is now ONLINE +DAC960#0: Logical Drive 1 (/dev/rd/c0d1) is now ONLINE + +/proc/rd/c0/current_status is updated: + +gwynedd:/u/lnz# cat /proc/rd/c0/current_status + ... + Physical Devices: + 0:1 - Disk: Online, 2201600 blocks + 0:2 - Disk: Online, 2201600 blocks + 0:3 - Disk: Online, 2201600 blocks + 1:1 - Disk: Online, 2201600 blocks + 1:2 - Disk: Online, 2201600 blocks + 1:3 - Disk: Online, 2201600 blocks + Logical Drives: + /dev/rd/c0d0: RAID-5, Online, 5498880 blocks, Write Thru + /dev/rd/c0d1: RAID-6, Online, 3305472 blocks, Write Thru + Rebuild Completed Successfully + +and /proc/rd/status indicates that everything is healthy once again: + +gwynedd:/u/lnz# cat /proc/rd/status +OK + + + EXAMPLE II - DRIVE FAILURE WITH A STANDBY DRIVE + +The following annotated logs demonstrate the controller configuration and and +online status monitoring capabilities of the Linux DAC960 Driver. The test +configuration comprises 6 1GB Quantum Atlas I disk drives on two channels of a +DAC960PJ controller. The physical drives are configured into a single drive +group with a standby drive, and the drive group has been configured into two +logical drives, one RAID-5 and one RAID-6. Note that these logs are from an +earlier version of the driver and the messages have changed somewhat with newer +releases, but the functionality remains similar. First, here is the current +status of the RAID configuration: + +gwynedd:/u/lnz# cat /proc/rd/c0/current_status +***** DAC960 RAID Driver Version 2.0.0 of 23 March 1999 ***** +Copyright 1998-1999 by Leonard N. Zubkoff +Configuring Mylex DAC960PJ PCI RAID Controller + Firmware Version: 4.06-0-08, Channels: 3, Memory Size: 8MB + PCI Bus: 0, Device: 19, Function: 1, I/O Address: Unassigned + PCI Address: 0xFD4FC000 mapped at 0x8807000, IRQ Channel: 9 + Controller Queue Depth: 128, Maximum Blocks per Command: 128 + Driver Queue Depth: 127, Maximum Scatter/Gather Segments: 33 + Stripe Size: 64KB, Segment Size: 8KB, BIOS Geometry: 255/63 + Physical Devices: + 0:1 - Disk: Online, 2201600 blocks + 0:2 - Disk: Online, 2201600 blocks + 0:3 - Disk: Online, 2201600 blocks + 1:1 - Disk: Online, 2201600 blocks + 1:2 - Disk: Online, 2201600 blocks + 1:3 - Disk: Standby, 2201600 blocks + Logical Drives: + /dev/rd/c0d0: RAID-5, Online, 4399104 blocks, Write Thru + /dev/rd/c0d1: RAID-6, Online, 2754560 blocks, Write Thru + No Rebuild or Consistency Check in Progress + +gwynedd:/u/lnz# cat /proc/rd/status +OK + +The above messages indicate that everything is healthy, and /proc/rd/status +returns "OK" indicating that there are no problems with any DAC960 controller +in the system. For demonstration purposes, while I/O is active Physical Drive +1:2 is now disconnected, simulating a drive failure. The failure is noted by +the driver within 10 seconds of the controller's having detected it, and the +driver logs the following console status messages: + +DAC960#0: Physical Drive 1:1 Error Log: Sense Key = 6, ASC = 29, ASCQ = 02 +DAC960#0: Physical Drive 1:3 Error Log: Sense Key = 6, ASC = 29, ASCQ = 02 +DAC960#0: Physical Drive 1:2 killed because of timeout on SCSI command +DAC960#0: Physical Drive 1:2 is now DEAD +DAC960#0: Physical Drive 1:2 killed because it was removed +DAC960#0: Logical Drive 0 (/dev/rd/c0d0) is now CRITICAL +DAC960#0: Logical Drive 1 (/dev/rd/c0d1) is now CRITICAL + +Since a standby drive is configured, the controller automatically begins +rebuilding onto the standby drive: + +DAC960#0: Physical Drive 1:3 is now WRITE-ONLY +DAC960#0: Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 4% completed + +Concurrently with the above, the driver status available from /proc/rd also +reflects the drive failure and automatic rebuild. The status message in +/proc/rd/status has changed from "OK" to "ALERT": + +gwynedd:/u/lnz# cat /proc/rd/status +ALERT + +and /proc/rd/c0/current_status has been updated: + +gwynedd:/u/lnz# cat /proc/rd/c0/current_status + ... + Physical Devices: + 0:1 - Disk: Online, 2201600 blocks + 0:2 - Disk: Online, 2201600 blocks + 0:3 - Disk: Online, 2201600 blocks + 1:1 - Disk: Online, 2201600 blocks + 1:2 - Disk: Dead, 2201600 blocks + 1:3 - Disk: Write-Only, 2201600 blocks + Logical Drives: + /dev/rd/c0d0: RAID-5, Critical, 4399104 blocks, Write Thru + /dev/rd/c0d1: RAID-6, Critical, 2754560 blocks, Write Thru + Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 4% completed + +As the rebuild progresses, the current status in /proc/rd/c0/current_status is +updated every 10 seconds: + +gwynedd:/u/lnz# cat /proc/rd/c0/current_status + ... + Physical Devices: + 0:1 - Disk: Online, 2201600 blocks + 0:2 - Disk: Online, 2201600 blocks + 0:3 - Disk: Online, 2201600 blocks + 1:1 - Disk: Online, 2201600 blocks + 1:2 - Disk: Dead, 2201600 blocks + 1:3 - Disk: Write-Only, 2201600 blocks + Logical Drives: + /dev/rd/c0d0: RAID-5, Critical, 4399104 blocks, Write Thru + /dev/rd/c0d1: RAID-6, Critical, 2754560 blocks, Write Thru + Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 40% completed + +and every minute a progress message is logged on the console by the driver: + +DAC960#0: Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 40% completed +DAC960#0: Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 76% completed +DAC960#0: Rebuild in Progress: Logical Drive 1 (/dev/rd/c0d1) 66% completed +DAC960#0: Rebuild in Progress: Logical Drive 1 (/dev/rd/c0d1) 84% completed + +Finally, the rebuild completes successfully. The driver logs the status of the +logical and physical drives and the rebuild completion: + +DAC960#0: Rebuild Completed Successfully +DAC960#0: Physical Drive 1:3 is now ONLINE +DAC960#0: Logical Drive 0 (/dev/rd/c0d0) is now ONLINE +DAC960#0: Logical Drive 1 (/dev/rd/c0d1) is now ONLINE + +/proc/rd/c0/current_status is updated: + +***** DAC960 RAID Driver Version 2.0.0 of 23 March 1999 ***** +Copyright 1998-1999 by Leonard N. Zubkoff +Configuring Mylex DAC960PJ PCI RAID Controller + Firmware Version: 4.06-0-08, Channels: 3, Memory Size: 8MB + PCI Bus: 0, Device: 19, Function: 1, I/O Address: Unassigned + PCI Address: 0xFD4FC000 mapped at 0x8807000, IRQ Channel: 9 + Controller Queue Depth: 128, Maximum Blocks per Command: 128 + Driver Queue Depth: 127, Maximum Scatter/Gather Segments: 33 + Stripe Size: 64KB, Segment Size: 8KB, BIOS Geometry: 255/63 + Physical Devices: + 0:1 - Disk: Online, 2201600 blocks + 0:2 - Disk: Online, 2201600 blocks + 0:3 - Disk: Online, 2201600 blocks + 1:1 - Disk: Online, 2201600 blocks + 1:2 - Disk: Dead, 2201600 blocks + 1:3 - Disk: Online, 2201600 blocks + Logical Drives: + /dev/rd/c0d0: RAID-5, Online, 4399104 blocks, Write Thru + /dev/rd/c0d1: RAID-6, Online, 2754560 blocks, Write Thru + Rebuild Completed Successfully + +and /proc/rd/status indicates that everything is healthy once again: + +gwynedd:/u/lnz# cat /proc/rd/status +OK + +Note that the absence of a viable standby drive does not create an "ALERT" +status. Once dead Physical Drive 1:2 has been replaced, the controller must be +told that this has occurred and that the newly replaced drive should become the +new standby drive: + +gwynedd:/u/lnz# echo "make-standby 1:2" > /proc/rd/c0/user_command +gwynedd:/u/lnz# cat /proc/rd/c0/user_command +Make Standby of Physical Drive 1:2 Succeeded + +The echo command instructs the controller to make Physical Drive 1:2 into a +standby drive, and the status message that results from the operation is then +available for reading from /proc/rd/c0/user_command, as well as being logged to +the console by the driver. Within 60 seconds of this command the driver logs: + +DAC960#0: Physical Drive 1:2 Error Log: Sense Key = 6, ASC = 29, ASCQ = 01 +DAC960#0: Physical Drive 1:2 is now STANDBY +DAC960#0: Make Standby of Physical Drive 1:2 Succeeded + +and /proc/rd/c0/current_status is updated: + +gwynedd:/u/lnz# cat /proc/rd/c0/current_status + ... + Physical Devices: + 0:1 - Disk: Online, 2201600 blocks + 0:2 - Disk: Online, 2201600 blocks + 0:3 - Disk: Online, 2201600 blocks + 1:1 - Disk: Online, 2201600 blocks + 1:2 - Disk: Standby, 2201600 blocks + 1:3 - Disk: Online, 2201600 blocks + Logical Drives: + /dev/rd/c0d0: RAID-5, Online, 4399104 blocks, Write Thru + /dev/rd/c0d1: RAID-6, Online, 2754560 blocks, Write Thru + Rebuild Completed Successfully diff --git a/Documentation/blockdev/cciss.txt b/Documentation/blockdev/cciss.txt new file mode 100644 index 00000000000..89698e8df7d --- /dev/null +++ b/Documentation/blockdev/cciss.txt @@ -0,0 +1,171 @@ +This driver is for Compaq's SMART Array Controllers. + +Supported Cards: +---------------- + +This driver is known to work with the following cards: + + * SA 5300 + * SA 5i + * SA 532 + * SA 5312 + * SA 641 + * SA 642 + * SA 6400 + * SA 6400 U320 Expansion Module + * SA 6i + * SA P600 + * SA P800 + * SA E400 + * SA P400i + * SA E200 + * SA E200i + * SA E500 + * SA P700m + * SA P212 + * SA P410 + * SA P410i + * SA P411 + * SA P812 + * SA P712m + * SA P711m + +Detecting drive failures: +------------------------- + +To get the status of logical volumes and to detect physical drive +failures, you can use the cciss_vol_status program found here: +http://cciss.sourceforge.net/#cciss_utils + +Device Naming: +-------------- + +If nodes are not already created in the /dev/cciss directory, run as root: + +# cd /dev +# ./MAKEDEV cciss + +You need some entries in /dev for the cciss device. The MAKEDEV script +can make device nodes for you automatically. Currently the device setup +is as follows: + +Major numbers: + 104 cciss0 + 105 cciss1 + 106 cciss2 + 105 cciss3 + 108 cciss4 + 109 cciss5 + 110 cciss6 + 111 cciss7 + +Minor numbers: + b7 b6 b5 b4 b3 b2 b1 b0 + |----+----| |----+----| + | | + | +-------- Partition ID (0=wholedev, 1-15 partition) + | + +-------------------- Logical Volume number + +The device naming scheme is: +/dev/cciss/c0d0 Controller 0, disk 0, whole device +/dev/cciss/c0d0p1 Controller 0, disk 0, partition 1 +/dev/cciss/c0d0p2 Controller 0, disk 0, partition 2 +/dev/cciss/c0d0p3 Controller 0, disk 0, partition 3 + +/dev/cciss/c1d1 Controller 1, disk 1, whole device +/dev/cciss/c1d1p1 Controller 1, disk 1, partition 1 +/dev/cciss/c1d1p2 Controller 1, disk 1, partition 2 +/dev/cciss/c1d1p3 Controller 1, disk 1, partition 3 + +SCSI tape drive and medium changer support +------------------------------------------ + +SCSI sequential access devices and medium changer devices are supported and +appropriate device nodes are automatically created. (e.g. +/dev/st0, /dev/st1, etc. See the "st" man page for more details.) +You must enable "SCSI tape drive support for Smart Array 5xxx" and +"SCSI support" in your kernel configuration to be able to use SCSI +tape drives with your Smart Array 5xxx controller. + +Additionally, note that the driver will not engage the SCSI core at init +time. The driver must be directed to dynamically engage the SCSI core via +the /proc filesystem entry which the "block" side of the driver creates as +/proc/driver/cciss/cciss* at runtime. This is because at driver init time, +the SCSI core may not yet be initialized (because the driver is a block +driver) and attempting to register it with the SCSI core in such a case +would cause a hang. This is best done via an initialization script +(typically in /etc/init.d, but could vary depending on distribution). +For example: + + for x in /proc/driver/cciss/cciss[0-9]* + do + echo "engage scsi" > $x + done + +Once the SCSI core is engaged by the driver, it cannot be disengaged +(except by unloading the driver, if it happens to be linked as a module.) + +Note also that if no sequential access devices or medium changers are +detected, the SCSI core will not be engaged by the action of the above +script. + +Hot plug support for SCSI tape drives +------------------------------------- + +Hot plugging of SCSI tape drives is supported, with some caveats. +The cciss driver must be informed that changes to the SCSI bus +have been made. This may be done via the /proc filesystem. +For example: + + echo "rescan" > /proc/scsi/cciss0/1 + +This causes the driver to query the adapter about changes to the +physical SCSI buses and/or fibre channel arbitrated loop and the +driver to make note of any new or removed sequential access devices +or medium changers. The driver will output messages indicating what +devices have been added or removed and the controller, bus, target and +lun used to address the device. It then notifies the SCSI mid layer +of these changes. + +Note that the naming convention of the /proc filesystem entries +contains a number in addition to the driver name. (E.g. "cciss0" +instead of just "cciss" which you might expect.) + +Note: ONLY sequential access devices and medium changers are presented +as SCSI devices to the SCSI mid layer by the cciss driver. Specifically, +physical SCSI disk drives are NOT presented to the SCSI mid layer. The +physical SCSI disk drives are controlled directly by the array controller +hardware and it is important to prevent the kernel from attempting to directly +access these devices too, as if the array controller were merely a SCSI +controller in the same way that we are allowing it to access SCSI tape drives. + +SCSI error handling for tape drives and medium changers +------------------------------------------------------- + +The linux SCSI mid layer provides an error handling protocol which +kicks into gear whenever a SCSI command fails to complete within a +certain amount of time (which can vary depending on the command). +The cciss driver participates in this protocol to some extent. The +normal protocol is a four step process. First the device is told +to abort the command. If that doesn't work, the device is reset. +If that doesn't work, the SCSI bus is reset. If that doesn't work +the host bus adapter is reset. Because the cciss driver is a block +driver as well as a SCSI driver and only the tape drives and medium +changers are presented to the SCSI mid layer, and unlike more +straightforward SCSI drivers, disk i/o continues through the block +side during the SCSI error recovery process, the cciss driver only +implements the first two of these actions, aborting the command, and +resetting the device. Additionally, most tape drives will not oblige +in aborting commands, and sometimes it appears they will not even +obey a reset command, though in most circumstances they will. In +the case that the command cannot be aborted and the device cannot be +reset, the device will be set offline. + +In the event the error handling code is triggered and a tape drive is +successfully reset or the tardy command is successfully aborted, the +tape drive may still not allow i/o to continue until some command +is issued which positions the tape to a known position. Typically you +must rewind the tape (by issuing "mt -f /dev/st0 rewind" for example) +before i/o can proceed again to a tape drive which was reset. + diff --git a/Documentation/blockdev/cpqarray.txt b/Documentation/blockdev/cpqarray.txt new file mode 100644 index 00000000000..c7154e20ef5 --- /dev/null +++ b/Documentation/blockdev/cpqarray.txt @@ -0,0 +1,93 @@ +This driver is for Compaq's SMART2 Intelligent Disk Array Controllers. + +Supported Cards: +---------------- + +This driver is known to work with the following cards: + + * SMART (EISA) + * SMART-2/E (EISA) + * SMART-2/P + * SMART-2DH + * SMART-2SL + * SMART-221 + * SMART-3100ES + * SMART-3200 + * Integrated Smart Array Controller + * SA 4200 + * SA 4250ES + * SA 431 + * RAID LC2 Controller + +It should also work with some really old Disk array adapters, but I am +unable to test against these cards: + + * IDA + * IDA-2 + * IAES + + +EISA Controllers: +----------------- + +If you want to use an EISA controller you'll have to supply some +modprobe/lilo parameters. If the driver is compiled into the kernel, must +give it the controller's IO port address at boot time (it is not +necessary to specify the IRQ). For example, if you had two SMART-2/E +controllers, in EISA slots 1 and 2 you'd give it a boot argument like +this: + + smart2=0x1000,0x2000 + +If you were loading the driver as a module, you'd give load it like this: + + modprobe cpqarray eisa=0x1000,0x2000 + +You can use EISA and PCI adapters at the same time. + + +Device Naming: +-------------- + +You need some entries in /dev for the ida device. MAKEDEV in the /dev +directory can make device nodes for you automatically. The device setup is +as follows: + +Major numbers: + 72 ida0 + 73 ida1 + 74 ida2 + 75 ida3 + 76 ida4 + 77 ida5 + 78 ida6 + 79 ida7 + +Minor numbers: + b7 b6 b5 b4 b3 b2 b1 b0 + |----+----| |----+----| + | | + | +-------- Partition ID (0=wholedev, 1-15 partition) + | + +-------------------- Logical Volume number + +The device naming scheme is: +/dev/ida/c0d0 Controller 0, disk 0, whole device +/dev/ida/c0d0p1 Controller 0, disk 0, partition 1 +/dev/ida/c0d0p2 Controller 0, disk 0, partition 2 +/dev/ida/c0d0p3 Controller 0, disk 0, partition 3 + +/dev/ida/c1d1 Controller 1, disk 1, whole device +/dev/ida/c1d1p1 Controller 1, disk 1, partition 1 +/dev/ida/c1d1p2 Controller 1, disk 1, partition 2 +/dev/ida/c1d1p3 Controller 1, disk 1, partition 3 + + +Changelog: +========== + +10-28-2004 : General cleanup, syntax fixes for in-kernel driver version. + James Nelson + + +1999 : Original Document diff --git a/Documentation/blockdev/floppy.txt b/Documentation/blockdev/floppy.txt new file mode 100644 index 00000000000..6ccab88705c --- /dev/null +++ b/Documentation/blockdev/floppy.txt @@ -0,0 +1,245 @@ +This file describes the floppy driver. + +FAQ list: +========= + + A FAQ list may be found in the fdutils package (see below), and also +at . + + +LILO configuration options (Thinkpad users, read this) +====================================================== + + The floppy driver is configured using the 'floppy=' option in +lilo. This option can be typed at the boot prompt, or entered in the +lilo configuration file. + + Example: If your kernel is called linux-2.6.9, type the following line +at the lilo boot prompt (if you have a thinkpad): + + linux-2.6.9 floppy=thinkpad + +You may also enter the following line in /etc/lilo.conf, in the description +of linux-2.6.9: + + append = "floppy=thinkpad" + + Several floppy related options may be given, example: + + linux-2.6.9 floppy=daring floppy=two_fdc + append = "floppy=daring floppy=two_fdc" + + If you give options both in the lilo config file and on the boot +prompt, the option strings of both places are concatenated, the boot +prompt options coming last. That's why there are also options to +restore the default behavior. + + +Module configuration options +============================ + + If you use the floppy driver as a module, use the following syntax: +modprobe floppy + +Example: + modprobe floppy omnibook messages + + If you need certain options enabled every time you load the floppy driver, +you can put: + + options floppy omnibook messages + +in /etc/modprobe.conf. + + + The floppy driver related options are: + + floppy=asus_pci + Sets the bit mask to allow only units 0 and 1. (default) + + floppy=daring + Tells the floppy driver that you have a well behaved floppy controller. + This allows more efficient and smoother operation, but may fail on + certain controllers. This may speed up certain operations. + + floppy=0,daring + Tells the floppy driver that your floppy controller should be used + with caution. + + floppy=one_fdc + Tells the floppy driver that you have only one floppy controller. + (default) + + floppy=two_fdc + floppy=
,two_fdc + Tells the floppy driver that you have two floppy controllers. + The second floppy controller is assumed to be at
. + This option is not needed if the second controller is at address + 0x370, and if you use the 'cmos' option. + + floppy=thinkpad + Tells the floppy driver that you have a Thinkpad. Thinkpads use an + inverted convention for the disk change line. + + floppy=0,thinkpad + Tells the floppy driver that you don't have a Thinkpad. + + floppy=omnibook + floppy=nodma + Tells the floppy driver not to use Dma for data transfers. + This is needed on HP Omnibooks, which don't have a workable + DMA channel for the floppy driver. This option is also useful + if you frequently get "Unable to allocate DMA memory" messages. + Indeed, dma memory needs to be continuous in physical memory, + and is thus harder to find, whereas non-dma buffers may be + allocated in virtual memory. However, I advise against this if + you have an FDC without a FIFO (8272A or 82072). 82072A and + later are OK. You also need at least a 486 to use nodma. + If you use nodma mode, I suggest you also set the FIFO + threshold to 10 or lower, in order to limit the number of data + transfer interrupts. + + If you have a FIFO-able FDC, the floppy driver automatically + falls back on non DMA mode if no DMA-able memory can be found. + If you want to avoid this, explicitly ask for 'yesdma'. + + floppy=yesdma + Tells the floppy driver that a workable DMA channel is available. + (default) + + floppy=nofifo + Disables the FIFO entirely. This is needed if you get "Bus + master arbitration error" messages from your Ethernet card (or + from other devices) while accessing the floppy. + + floppy=usefifo + Enables the FIFO. (default) + + floppy=,fifo_depth + Sets the FIFO threshold. This is mostly relevant in DMA + mode. If this is higher, the floppy driver tolerates more + interrupt latency, but it triggers more interrupts (i.e. it + imposes more load on the rest of the system). If this is + lower, the interrupt latency should be lower too (faster + processor). The benefit of a lower threshold is less + interrupts. + + To tune the fifo threshold, switch on over/underrun messages + using 'floppycontrol --messages'. Then access a floppy + disk. If you get a huge amount of "Over/Underrun - retrying" + messages, then the fifo threshold is too low. Try with a + higher value, until you only get an occasional Over/Underrun. + It is a good idea to compile the floppy driver as a module + when doing this tuning. Indeed, it allows to try different + fifo values without rebooting the machine for each test. Note + that you need to do 'floppycontrol --messages' every time you + re-insert the module. + + Usually, tuning the fifo threshold should not be needed, as + the default (0xa) is reasonable. + + floppy=,,cmos + Sets the CMOS type of to . This is mandatory if + you have more than two floppy drives (only two can be + described in the physical CMOS), or if your BIOS uses + non-standard CMOS types. The CMOS types are: + + 0 - Use the value of the physical CMOS + 1 - 5 1/4 DD + 2 - 5 1/4 HD + 3 - 3 1/2 DD + 4 - 3 1/2 HD + 5 - 3 1/2 ED + 6 - 3 1/2 ED + 16 - unknown or not installed + + (Note: there are two valid types for ED drives. This is because 5 was + initially chosen to represent floppy *tapes*, and 6 for ED drives. + AMI ignored this, and used 5 for ED drives. That's why the floppy + driver handles both.) + + floppy=unexpected_interrupts + Print a warning message when an unexpected interrupt is received. + (default) + + floppy=no_unexpected_interrupts + floppy=L40SX + Don't print a message when an unexpected interrupt is received. This + is needed on IBM L40SX laptops in certain video modes. (There seems + to be an interaction between video and floppy. The unexpected + interrupts affect only performance, and can be safely ignored.) + + floppy=broken_dcl + Don't use the disk change line, but assume that the disk was + changed whenever the device node is reopened. Needed on some + boxes where the disk change line is broken or unsupported. + This should be regarded as a stopgap measure, indeed it makes + floppy operation less efficient due to unneeded cache + flushings, and slightly more unreliable. Please verify your + cable, connection and jumper settings if you have any DCL + problems. However, some older drives, and also some laptops + are known not to have a DCL. + + floppy=debug + Print debugging messages. + + floppy=messages + Print informational messages for some operations (disk change + notifications, warnings about over and underruns, and about + autodetection). + + floppy=silent_dcl_clear + Uses a less noisy way to clear the disk change line (which + doesn't involve seeks). Implied by 'daring' option. + + floppy=,irq + Sets the floppy IRQ to instead of 6. + + floppy=,dma + Sets the floppy DMA channel to instead of 2. + + floppy=slow + Use PS/2 stepping rate: + " PS/2 floppies have much slower step rates than regular floppies. + It's been recommended that take about 1/4 of the default speed + in some more extreme cases." + + +Supporting utilities and additional documentation: +================================================== + + Additional parameters of the floppy driver can be configured at +runtime. Utilities which do this can be found in the fdutils package. +This package also contains a new version of mtools which allows to +access high capacity disks (up to 1992K on a high density 3 1/2 disk!). +It also contains additional documentation about the floppy driver. + +The latest version can be found at fdutils homepage: + http://fdutils.linux.lu + +The fdutils releases can be found at: + http://fdutils.linux.lu/download.html + http://www.tux.org/pub/knaff/fdutils/ + ftp://metalab.unc.edu/pub/Linux/utils/disk-management/ + +Reporting problems about the floppy driver +========================================== + + If you have a question or a bug report about the floppy driver, mail +me at Alain.Knaff@poboxes.com . If you post to Usenet, preferably use +comp.os.linux.hardware. As the volume in these groups is rather high, +be sure to include the word "floppy" (or "FLOPPY") in the subject +line. If the reported problem happens when mounting floppy disks, be +sure to mention also the type of the filesystem in the subject line. + + Be sure to read the FAQ before mailing/posting any bug reports! + + Alain + +Changelog +========= + +10-30-2004 : Cleanup, updating, add reference to module configuration. + James Nelson + +6-3-2000 : Original Document diff --git a/Documentation/blockdev/nbd.txt b/Documentation/blockdev/nbd.txt new file mode 100644 index 00000000000..aeb93ffe641 --- /dev/null +++ b/Documentation/blockdev/nbd.txt @@ -0,0 +1,47 @@ + Network Block Device (TCP version) + + What is it: With this compiled in the kernel (or as a module), Linux + can use a remote server as one of its block devices. So every time + the client computer wants to read, e.g., /dev/nb0, it sends a + request over TCP to the server, which will reply with the data read. + This can be used for stations with low disk space (or even diskless - + if you boot from floppy) to borrow disk space from another computer. + Unlike NFS, it is possible to put any filesystem on it, etc. It should + even be possible to use NBD as a root filesystem (I've never tried), + but it requires a user-level program to be in the initrd to start. + It also allows you to run block-device in user land (making server + and client physically the same computer, communicating using loopback). + + Current state: It currently works. Network block device is stable. + I originally thought that it was impossible to swap over TCP. It + turned out not to be true - swapping over TCP now works and seems + to be deadlock-free, but it requires heavy patches into Linux's + network layer. + + For more information, or to download the nbd-client and nbd-server + tools, go to http://nbd.sf.net/. + + Howto: To setup nbd, you can simply do the following: + + First, serve a device or file from a remote server: + + nbd-server + + e.g., + root@server1 # nbd-server 1234 /dev/sdb1 + + (serves sdb1 partition on TCP port 1234) + + Then, on the local (client) system: + + nbd-client /dev/nb[0-n] + + e.g., + root@client1 # nbd-client server1 1234 /dev/nb0 + + (creates the nb0 device on client1) + + The nbd kernel module need only be installed on the client + system, as the nbd-server is completely in userspace. In fact, + the nbd-server has been successfully ported to other operating + systems, including Windows. diff --git a/Documentation/blockdev/paride.txt b/Documentation/blockdev/paride.txt new file mode 100644 index 00000000000..e4312676bdd --- /dev/null +++ b/Documentation/blockdev/paride.txt @@ -0,0 +1,417 @@ + + Linux and parallel port IDE devices + +PARIDE v1.03 (c) 1997-8 Grant Guenther + +1. Introduction + +Owing to the simplicity and near universality of the parallel port interface +to personal computers, many external devices such as portable hard-disk, +CD-ROM, LS-120 and tape drives use the parallel port to connect to their +host computer. While some devices (notably scanners) use ad-hoc methods +to pass commands and data through the parallel port interface, most +external devices are actually identical to an internal model, but with +a parallel-port adapter chip added in. Some of the original parallel port +adapters were little more than mechanisms for multiplexing a SCSI bus. +(The Iomega PPA-3 adapter used in the ZIP drives is an example of this +approach). Most current designs, however, take a different approach. +The adapter chip reproduces a small ISA or IDE bus in the external device +and the communication protocol provides operations for reading and writing +device registers, as well as data block transfer functions. Sometimes, +the device being addressed via the parallel cable is a standard SCSI +controller like an NCR 5380. The "ditto" family of external tape +drives use the ISA replicator to interface a floppy disk controller, +which is then connected to a floppy-tape mechanism. The vast majority +of external parallel port devices, however, are now based on standard +IDE type devices, which require no intermediate controller. If one +were to open up a parallel port CD-ROM drive, for instance, one would +find a standard ATAPI CD-ROM drive, a power supply, and a single adapter +that interconnected a standard PC parallel port cable and a standard +IDE cable. It is usually possible to exchange the CD-ROM device with +any other device using the IDE interface. + +The document describes the support in Linux for parallel port IDE +devices. It does not cover parallel port SCSI devices, "ditto" tape +drives or scanners. Many different devices are supported by the +parallel port IDE subsystem, including: + + MicroSolutions backpack CD-ROM + MicroSolutions backpack PD/CD + MicroSolutions backpack hard-drives + MicroSolutions backpack 8000t tape drive + SyQuest EZ-135, EZ-230 & SparQ drives + Avatar Shark + Imation Superdisk LS-120 + Maxell Superdisk LS-120 + FreeCom Power CD + Hewlett-Packard 5GB and 8GB tape drives + Hewlett-Packard 7100 and 7200 CD-RW drives + +as well as most of the clone and no-name products on the market. + +To support such a wide range of devices, PARIDE, the parallel port IDE +subsystem, is actually structured in three parts. There is a base +paride module which provides a registry and some common methods for +accessing the parallel ports. The second component is a set of +high-level drivers for each of the different types of supported devices: + + pd IDE disk + pcd ATAPI CD-ROM + pf ATAPI disk + pt ATAPI tape + pg ATAPI generic + +(Currently, the pg driver is only used with CD-R drives). + +The high-level drivers function according to the relevant standards. +The third component of PARIDE is a set of low-level protocol drivers +for each of the parallel port IDE adapter chips. Thanks to the interest +and encouragement of Linux users from many parts of the world, +support is available for almost all known adapter protocols: + + aten ATEN EH-100 (HK) + bpck Microsolutions backpack (US) + comm DataStor (old-type) "commuter" adapter (TW) + dstr DataStor EP-2000 (TW) + epat Shuttle EPAT (UK) + epia Shuttle EPIA (UK) + fit2 FIT TD-2000 (US) + fit3 FIT TD-3000 (US) + friq Freecom IQ cable (DE) + frpw Freecom Power (DE) + kbic KingByte KBIC-951A and KBIC-971A (TW) + ktti KT Technology PHd adapter (SG) + on20 OnSpec 90c20 (US) + on26 OnSpec 90c26 (US) + + +2. Using the PARIDE subsystem + +While configuring the Linux kernel, you may choose either to build +the PARIDE drivers into your kernel, or to build them as modules. + +In either case, you will need to select "Parallel port IDE device support" +as well as at least one of the high-level drivers and at least one +of the parallel port communication protocols. If you do not know +what kind of parallel port adapter is used in your drive, you could +begin by checking the file names and any text files on your DOS +installation floppy. Alternatively, you can look at the markings on +the adapter chip itself. That's usually sufficient to identify the +correct device. + +You can actually select all the protocol modules, and allow the PARIDE +subsystem to try them all for you. + +For the "brand-name" products listed above, here are the protocol +and high-level drivers that you would use: + + Manufacturer Model Driver Protocol + + MicroSolutions CD-ROM pcd bpck + MicroSolutions PD drive pf bpck + MicroSolutions hard-drive pd bpck + MicroSolutions 8000t tape pt bpck + SyQuest EZ, SparQ pd epat + Imation Superdisk pf epat + Maxell Superdisk pf friq + Avatar Shark pd epat + FreeCom CD-ROM pcd frpw + Hewlett-Packard 5GB Tape pt epat + Hewlett-Packard 7200e (CD) pcd epat + Hewlett-Packard 7200e (CD-R) pg epat + +2.1 Configuring built-in drivers + +We recommend that you get to know how the drivers work and how to +configure them as loadable modules, before attempting to compile a +kernel with the drivers built-in. + +If you built all of your PARIDE support directly into your kernel, +and you have just a single parallel port IDE device, your kernel should +locate it automatically for you. If you have more than one device, +you may need to give some command line options to your bootloader +(eg: LILO), how to do that is beyond the scope of this document. + +The high-level drivers accept a number of command line parameters, all +of which are documented in the source files in linux/drivers/block/paride. +By default, each driver will automatically try all parallel ports it +can find, and all protocol types that have been installed, until it finds +a parallel port IDE adapter. Once it finds one, the probe stops. So, +if you have more than one device, you will need to tell the drivers +how to identify them. This requires specifying the port address, the +protocol identification number and, for some devices, the drive's +chain ID. While your system is booting, a number of messages are +displayed on the console. Like all such messages, they can be +reviewed with the 'dmesg' command. Among those messages will be +some lines like: + + paride: bpck registered as protocol 0 + paride: epat registered as protocol 1 + +The numbers will always be the same until you build a new kernel with +different protocol selections. You should note these numbers as you +will need them to identify the devices. + +If you happen to be using a MicroSolutions backpack device, you will +also need to know the unit ID number for each drive. This is usually +the last two digits of the drive's serial number (but read MicroSolutions' +documentation about this). + +As an example, let's assume that you have a MicroSolutions PD/CD drive +with unit ID number 36 connected to the parallel port at 0x378, a SyQuest +EZ-135 connected to the chained port on the PD/CD drive and also an +Imation Superdisk connected to port 0x278. You could give the following +options on your boot command: + + pd.drive0=0x378,1 pf.drive0=0x278,1 pf.drive1=0x378,0,36 + +In the last option, pf.drive1 configures device /dev/pf1, the 0x378 +is the parallel port base address, the 0 is the protocol registration +number and 36 is the chain ID. + +Please note: while PARIDE will work both with and without the +PARPORT parallel port sharing system that is included by the +"Parallel port support" option, PARPORT must be included and enabled +if you want to use chains of devices on the same parallel port. + +2.2 Loading and configuring PARIDE as modules + +It is much faster and simpler to get to understand the PARIDE drivers +if you use them as loadable kernel modules. + +Note 1: using these drivers with the "kerneld" automatic module loading +system is not recommended for beginners, and is not documented here. + +Note 2: if you build PARPORT support as a loadable module, PARIDE must +also be built as loadable modules, and PARPORT must be loaded before the +PARIDE modules. + +To use PARIDE, you must begin by + + insmod paride + +this loads a base module which provides a registry for the protocols, +among other tasks. + +Then, load as many of the protocol modules as you think you might need. +As you load each module, it will register the protocols that it supports, +and print a log message to your kernel log file and your console. For +example: + + # insmod epat + paride: epat registered as protocol 0 + # insmod kbic + paride: k951 registered as protocol 1 + paride: k971 registered as protocol 2 + +Finally, you can load high-level drivers for each kind of device that +you have connected. By default, each driver will autoprobe for a single +device, but you can support up to four similar devices by giving their +individual co-ordinates when you load the driver. + +For example, if you had two no-name CD-ROM drives both using the +KingByte KBIC-951A adapter, one on port 0x378 and the other on 0x3bc +you could give the following command: + + # insmod pcd drive0=0x378,1 drive1=0x3bc,1 + +For most adapters, giving a port address and protocol number is sufficient, +but check the source files in linux/drivers/block/paride for more +information. (Hopefully someone will write some man pages one day !). + +As another example, here's what happens when PARPORT is installed, and +a SyQuest EZ-135 is attached to port 0x378: + + # insmod paride + paride: version 1.0 installed + # insmod epat + paride: epat registered as protocol 0 + # insmod pd + pd: pd version 1.0, major 45, cluster 64, nice 0 + pda: Sharing parport1 at 0x378 + pda: epat 1.0, Shuttle EPAT chip c3 at 0x378, mode 5 (EPP-32), delay 1 + pda: SyQuest EZ135A, 262144 blocks [128M], (512/16/32), removable media + pda: pda1 + +Note that the last line is the output from the generic partition table +scanner - in this case it reports that it has found a disk with one partition. + +2.3 Using a PARIDE device + +Once the drivers have been loaded, you can access PARIDE devices in the +same way as their traditional counterparts. You will probably need to +create the device "special files". Here is a simple script that you can +cut to a file and execute: + +#!/bin/bash +# +# mkd -- a script to create the device special files for the PARIDE subsystem +# +function mkdev { + mknod $1 $2 $3 $4 ; chmod 0660 $1 ; chown root:disk $1 +} +# +function pd { + D=$( printf \\$( printf "x%03x" $[ $1 + 97 ] ) ) + mkdev pd$D b 45 $[ $1 * 16 ] + for P in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + do mkdev pd$D$P b 45 $[ $1 * 16 + $P ] + done +} +# +cd /dev +# +for u in 0 1 2 3 ; do pd $u ; done +for u in 0 1 2 3 ; do mkdev pcd$u b 46 $u ; done +for u in 0 1 2 3 ; do mkdev pf$u b 47 $u ; done +for u in 0 1 2 3 ; do mkdev pt$u c 96 $u ; done +for u in 0 1 2 3 ; do mkdev npt$u c 96 $[ $u + 128 ] ; done +for u in 0 1 2 3 ; do mkdev pg$u c 97 $u ; done +# +# end of mkd + +With the device files and drivers in place, you can access PARIDE devices +like any other Linux device. For example, to mount a CD-ROM in pcd0, use: + + mount /dev/pcd0 /cdrom + +If you have a fresh Avatar Shark cartridge, and the drive is pda, you +might do something like: + + fdisk /dev/pda -- make a new partition table with + partition 1 of type 83 + + mke2fs /dev/pda1 -- to build the file system + + mkdir /shark -- make a place to mount the disk + + mount /dev/pda1 /shark + +Devices like the Imation superdisk work in the same way, except that +they do not have a partition table. For example to make a 120MB +floppy that you could share with a DOS system: + + mkdosfs /dev/pf0 + mount /dev/pf0 /mnt + + +2.4 The pf driver + +The pf driver is intended for use with parallel port ATAPI disk +devices. The most common devices in this category are PD drives +and LS-120 drives. Traditionally, media for these devices are not +partitioned. Consequently, the pf driver does not support partitioned +media. This may be changed in a future version of the driver. + +2.5 Using the pt driver + +The pt driver for parallel port ATAPI tape drives is a minimal driver. +It does not yet support many of the standard tape ioctl operations. +For best performance, a block size of 32KB should be used. You will +probably want to set the parallel port delay to 0, if you can. + +2.6 Using the pg driver + +The pg driver can be used in conjunction with the cdrecord program +to create CD-ROMs. Please get cdrecord version 1.6.1 or later +from ftp://ftp.fokus.gmd.de/pub/unix/cdrecord/ . To record CD-R media +your parallel port should ideally be set to EPP mode, and the "port delay" +should be set to 0. With those settings it is possible to record at 2x +speed without any buffer underruns. If you cannot get the driver to work +in EPP mode, try to use "bidirectional" or "PS/2" mode and 1x speeds only. + + +3. Troubleshooting + +3.1 Use EPP mode if you can + +The most common problems that people report with the PARIDE drivers +concern the parallel port CMOS settings. At this time, none of the +PARIDE protocol modules support ECP mode, or any ECP combination modes. +If you are able to do so, please set your parallel port into EPP mode +using your CMOS setup procedure. + +3.2 Check the port delay + +Some parallel ports cannot reliably transfer data at full speed. To +offset the errors, the PARIDE protocol modules introduce a "port +delay" between each access to the i/o ports. Each protocol sets +a default value for this delay. In most cases, the user can override +the default and set it to 0 - resulting in somewhat higher transfer +rates. In some rare cases (especially with older 486 systems) the +default delays are not long enough. if you experience corrupt data +transfers, or unexpected failures, you may wish to increase the +port delay. The delay can be programmed using the "driveN" parameters +to each of the high-level drivers. Please see the notes above, or +read the comments at the beginning of the driver source files in +linux/drivers/block/paride. + +3.3 Some drives need a printer reset + +There appear to be a number of "noname" external drives on the market +that do not always power up correctly. We have noticed this with some +drives based on OnSpec and older Freecom adapters. In these rare cases, +the adapter can often be reinitialised by issuing a "printer reset" on +the parallel port. As the reset operation is potentially disruptive in +multiple device environments, the PARIDE drivers will not do it +automatically. You can however, force a printer reset by doing: + + insmod lp reset=1 + rmmod lp + +If you have one of these marginal cases, you should probably build +your paride drivers as modules, and arrange to do the printer reset +before loading the PARIDE drivers. + +3.4 Use the verbose option and dmesg if you need help + +While a lot of testing has gone into these drivers to make them work +as smoothly as possible, problems will arise. If you do have problems, +please check all the obvious things first: does the drive work in +DOS with the manufacturer's drivers ? If that doesn't yield any useful +clues, then please make sure that only one drive is hooked to your system, +and that either (a) PARPORT is enabled or (b) no other device driver +is using your parallel port (check in /proc/ioports). Then, load the +appropriate drivers (you can load several protocol modules if you want) +as in: + + # insmod paride + # insmod epat + # insmod bpck + # insmod kbic + ... + # insmod pd verbose=1 + +(using the correct driver for the type of device you have, of course). +The verbose=1 parameter will cause the drivers to log a trace of their +activity as they attempt to locate your drive. + +Use 'dmesg' to capture a log of all the PARIDE messages (any messages +beginning with paride:, a protocol module's name or a driver's name) and +include that with your bug report. You can submit a bug report in one +of two ways. Either send it directly to the author of the PARIDE suite, +by e-mail to grant@torque.net, or join the linux-parport mailing list +and post your report there. + +3.5 For more information or help + +You can join the linux-parport mailing list by sending a mail message +to + linux-parport-request@torque.net + +with the single word + + subscribe + +in the body of the mail message (not in the subject line). Please be +sure that your mail program is correctly set up when you do this, as +the list manager is a robot that will subscribe you using the reply +address in your mail headers. REMOVE any anti-spam gimmicks you may +have in your mail headers, when sending mail to the list server. + +You might also find some useful information on the linux-parport +web pages (although they are not always up to date) at + + http://www.torque.net/parport/ + + diff --git a/Documentation/blockdev/ramdisk.txt b/Documentation/blockdev/ramdisk.txt new file mode 100644 index 00000000000..6c820baa19a --- /dev/null +++ b/Documentation/blockdev/ramdisk.txt @@ -0,0 +1,165 @@ +Using the RAM disk block device with Linux +------------------------------------------ + +Contents: + + 1) Overview + 2) Kernel Command Line Parameters + 3) Using "rdev -r" + 4) An Example of Creating a Compressed RAM Disk + + +1) Overview +----------- + +The RAM disk driver is a way to use main system memory as a block device. It +is required for initrd, an initial filesystem used if you need to load modules +in order to access the root filesystem (see Documentation/initrd.txt). It can +also be used for a temporary filesystem for crypto work, since the contents +are erased on reboot. + +The RAM disk dynamically grows as more space is required. It does this by using +RAM from the buffer cache. The driver marks the buffers it is using as dirty +so that the VM subsystem does not try to reclaim them later. + +The RAM disk supports up to 16 RAM disks by default, and can be reconfigured +to support an unlimited number of RAM disks (at your own risk). Just change +the configuration symbol BLK_DEV_RAM_COUNT in the Block drivers config menu +and (re)build the kernel. + +To use RAM disk support with your system, run './MAKEDEV ram' from the /dev +directory. RAM disks are all major number 1, and start with minor number 0 +for /dev/ram0, etc. If used, modern kernels use /dev/ram0 for an initrd. + +The new RAM disk also has the ability to load compressed RAM disk images, +allowing one to squeeze more programs onto an average installation or +rescue floppy disk. + + +2) Kernel Command Line Parameters +--------------------------------- + + ramdisk_size=N + ============== + +This parameter tells the RAM disk driver to set up RAM disks of N k size. The +default is 4096 (4 MB) (8192 (8 MB) on S390). + + ramdisk_blocksize=N + =================== + +This parameter tells the RAM disk driver how many bytes to use per block. The +default is 1024 (BLOCK_SIZE). + + +3) Using "rdev -r" +------------------ + +The usage of the word (two bytes) that "rdev -r" sets in the kernel image is +as follows. The low 11 bits (0 -> 10) specify an offset (in 1 k blocks) of up +to 2 MB (2^11) of where to find the RAM disk (this used to be the size). Bit +14 indicates that a RAM disk is to be loaded, and bit 15 indicates whether a +prompt/wait sequence is to be given before trying to read the RAM disk. Since +the RAM disk dynamically grows as data is being written into it, a size field +is not required. Bits 11 to 13 are not currently used and may as well be zero. +These numbers are no magical secrets, as seen below: + +./arch/i386/kernel/setup.c:#define RAMDISK_IMAGE_START_MASK 0x07FF +./arch/i386/kernel/setup.c:#define RAMDISK_PROMPT_FLAG 0x8000 +./arch/i386/kernel/setup.c:#define RAMDISK_LOAD_FLAG 0x4000 + +Consider a typical two floppy disk setup, where you will have the +kernel on disk one, and have already put a RAM disk image onto disk #2. + +Hence you want to set bits 0 to 13 as 0, meaning that your RAM disk +starts at an offset of 0 kB from the beginning of the floppy. +The command line equivalent is: "ramdisk_start=0" + +You want bit 14 as one, indicating that a RAM disk is to be loaded. +The command line equivalent is: "load_ramdisk=1" + +You want bit 15 as one, indicating that you want a prompt/keypress +sequence so that you have a chance to switch floppy disks. +The command line equivalent is: "prompt_ramdisk=1" + +Putting that together gives 2^15 + 2^14 + 0 = 49152 for an rdev word. +So to create disk one of the set, you would do: + + /usr/src/linux# cat arch/i386/boot/zImage > /dev/fd0 + /usr/src/linux# rdev /dev/fd0 /dev/fd0 + /usr/src/linux# rdev -r /dev/fd0 49152 + +If you make a boot disk that has LILO, then for the above, you would use: + append = "ramdisk_start=0 load_ramdisk=1 prompt_ramdisk=1" +Since the default start = 0 and the default prompt = 1, you could use: + append = "load_ramdisk=1" + + +4) An Example of Creating a Compressed RAM Disk +---------------------------------------------- + +To create a RAM disk image, you will need a spare block device to +construct it on. This can be the RAM disk device itself, or an +unused disk partition (such as an unmounted swap partition). For this +example, we will use the RAM disk device, "/dev/ram0". + +Note: This technique should not be done on a machine with less than 8 MB +of RAM. If using a spare disk partition instead of /dev/ram0, then this +restriction does not apply. + +a) Decide on the RAM disk size that you want. Say 2 MB for this example. + Create it by writing to the RAM disk device. (This step is not currently + required, but may be in the future.) It is wise to zero out the + area (esp. for disks) so that maximal compression is achieved for + the unused blocks of the image that you are about to create. + + dd if=/dev/zero of=/dev/ram0 bs=1k count=2048 + +b) Make a filesystem on it. Say ext2fs for this example. + + mke2fs -vm0 /dev/ram0 2048 + +c) Mount it, copy the files you want to it (eg: /etc/* /dev/* ...) + and unmount it again. + +d) Compress the contents of the RAM disk. The level of compression + will be approximately 50% of the space used by the files. Unused + space on the RAM disk will compress to almost nothing. + + dd if=/dev/ram0 bs=1k count=2048 | gzip -v9 > /tmp/ram_image.gz + +e) Put the kernel onto the floppy + + dd if=zImage of=/dev/fd0 bs=1k + +f) Put the RAM disk image onto the floppy, after the kernel. Use an offset + that is slightly larger than the kernel, so that you can put another + (possibly larger) kernel onto the same floppy later without overlapping + the RAM disk image. An offset of 400 kB for kernels about 350 kB in + size would be reasonable. Make sure offset+size of ram_image.gz is + not larger than the total space on your floppy (usually 1440 kB). + + dd if=/tmp/ram_image.gz of=/dev/fd0 bs=1k seek=400 + +g) Use "rdev" to set the boot device, RAM disk offset, prompt flag, etc. + For prompt_ramdisk=1, load_ramdisk=1, ramdisk_start=400, one would + have 2^15 + 2^14 + 400 = 49552. + + rdev /dev/fd0 /dev/fd0 + rdev -r /dev/fd0 49552 + +That is it. You now have your boot/root compressed RAM disk floppy. Some +users may wish to combine steps (d) and (f) by using a pipe. + +-------------------------------------------------------------------------- + Paul Gortmaker 12/95 + +Changelog: +---------- + +10-22-04 : Updated to reflect changes in command line options, remove + obsolete references, general cleanup. + James Nelson (james4765@gmail.com) + + +12-95 : Original Document diff --git a/Documentation/c2port.txt b/Documentation/c2port.txt new file mode 100644 index 00000000000..d9bf93ea439 --- /dev/null +++ b/Documentation/c2port.txt @@ -0,0 +1,90 @@ + C2 port support + --------------- + +(C) Copyright 2007 Rodolfo Giometti + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + + + +Overview +-------- + +This driver implements the support for Linux of Silicon Labs (Silabs) +C2 Interface used for in-system programming of micro controllers. + +By using this driver you can reprogram the in-system flash without EC2 +or EC3 debug adapter. This solution is also useful in those systems +where the micro controller is connected via special GPIOs pins. + +References +---------- + +The C2 Interface main references are at (http://www.silabs.com) +Silicon Laboratories site], see: + +- AN127: FLASH Programming via the C2 Interface at +http://www.silabs.com/public/documents/tpub_doc/anote/Microcontrollers/Small_Form_Factor/en/an127.pdf, and + +- C2 Specification at +http://www.silabs.com/public/documents/tpub_doc/spec/Microcontrollers/en/C2spec.pdf, + +however it implements a two wire serial communication protocol (bit +banging) designed to enable in-system programming, debugging, and +boundary-scan testing on low pin-count Silicon Labs devices. Currently +this code supports only flash programming but extensions are easy to +add. + +Using the driver +---------------- + +Once the driver is loaded you can use sysfs support to get C2port's +info or read/write in-system flash. + +# ls /sys/class/c2port/c2port0/ +access flash_block_size flash_erase rev_id +dev_id flash_blocks_num flash_size subsystem/ +flash_access flash_data reset uevent + +Initially the C2port access is disabled since you hardware may have +such lines multiplexed with other devices so, to get access to the +C2port, you need the command: + +# echo 1 > /sys/class/c2port/c2port0/access + +after that you should read the device ID and revision ID of the +connected micro controller: + +# cat /sys/class/c2port/c2port0/dev_id +8 +# cat /sys/class/c2port/c2port0/rev_id +1 + +However, for security reasons, the in-system flash access in not +enabled yet, to do so you need the command: + +# echo 1 > /sys/class/c2port/c2port0/flash_access + +After that you can read the whole flash: + +# cat /sys/class/c2port/c2port0/flash_data > image + +erase it: + +# echo 1 > /sys/class/c2port/c2port0/flash_erase + +and write it: + +# cat image > /sys/class/c2port/c2port0/flash_data + +after writing you have to reset the device to execute the new code: + +# echo 1 > /sys/class/c2port/c2port0/reset diff --git a/Documentation/cciss.txt b/Documentation/cciss.txt deleted file mode 100644 index 89698e8df7d..00000000000 --- a/Documentation/cciss.txt +++ /dev/null @@ -1,171 +0,0 @@ -This driver is for Compaq's SMART Array Controllers. - -Supported Cards: ----------------- - -This driver is known to work with the following cards: - - * SA 5300 - * SA 5i - * SA 532 - * SA 5312 - * SA 641 - * SA 642 - * SA 6400 - * SA 6400 U320 Expansion Module - * SA 6i - * SA P600 - * SA P800 - * SA E400 - * SA P400i - * SA E200 - * SA E200i - * SA E500 - * SA P700m - * SA P212 - * SA P410 - * SA P410i - * SA P411 - * SA P812 - * SA P712m - * SA P711m - -Detecting drive failures: -------------------------- - -To get the status of logical volumes and to detect physical drive -failures, you can use the cciss_vol_status program found here: -http://cciss.sourceforge.net/#cciss_utils - -Device Naming: --------------- - -If nodes are not already created in the /dev/cciss directory, run as root: - -# cd /dev -# ./MAKEDEV cciss - -You need some entries in /dev for the cciss device. The MAKEDEV script -can make device nodes for you automatically. Currently the device setup -is as follows: - -Major numbers: - 104 cciss0 - 105 cciss1 - 106 cciss2 - 105 cciss3 - 108 cciss4 - 109 cciss5 - 110 cciss6 - 111 cciss7 - -Minor numbers: - b7 b6 b5 b4 b3 b2 b1 b0 - |----+----| |----+----| - | | - | +-------- Partition ID (0=wholedev, 1-15 partition) - | - +-------------------- Logical Volume number - -The device naming scheme is: -/dev/cciss/c0d0 Controller 0, disk 0, whole device -/dev/cciss/c0d0p1 Controller 0, disk 0, partition 1 -/dev/cciss/c0d0p2 Controller 0, disk 0, partition 2 -/dev/cciss/c0d0p3 Controller 0, disk 0, partition 3 - -/dev/cciss/c1d1 Controller 1, disk 1, whole device -/dev/cciss/c1d1p1 Controller 1, disk 1, partition 1 -/dev/cciss/c1d1p2 Controller 1, disk 1, partition 2 -/dev/cciss/c1d1p3 Controller 1, disk 1, partition 3 - -SCSI tape drive and medium changer support ------------------------------------------- - -SCSI sequential access devices and medium changer devices are supported and -appropriate device nodes are automatically created. (e.g. -/dev/st0, /dev/st1, etc. See the "st" man page for more details.) -You must enable "SCSI tape drive support for Smart Array 5xxx" and -"SCSI support" in your kernel configuration to be able to use SCSI -tape drives with your Smart Array 5xxx controller. - -Additionally, note that the driver will not engage the SCSI core at init -time. The driver must be directed to dynamically engage the SCSI core via -the /proc filesystem entry which the "block" side of the driver creates as -/proc/driver/cciss/cciss* at runtime. This is because at driver init time, -the SCSI core may not yet be initialized (because the driver is a block -driver) and attempting to register it with the SCSI core in such a case -would cause a hang. This is best done via an initialization script -(typically in /etc/init.d, but could vary depending on distribution). -For example: - - for x in /proc/driver/cciss/cciss[0-9]* - do - echo "engage scsi" > $x - done - -Once the SCSI core is engaged by the driver, it cannot be disengaged -(except by unloading the driver, if it happens to be linked as a module.) - -Note also that if no sequential access devices or medium changers are -detected, the SCSI core will not be engaged by the action of the above -script. - -Hot plug support for SCSI tape drives -------------------------------------- - -Hot plugging of SCSI tape drives is supported, with some caveats. -The cciss driver must be informed that changes to the SCSI bus -have been made. This may be done via the /proc filesystem. -For example: - - echo "rescan" > /proc/scsi/cciss0/1 - -This causes the driver to query the adapter about changes to the -physical SCSI buses and/or fibre channel arbitrated loop and the -driver to make note of any new or removed sequential access devices -or medium changers. The driver will output messages indicating what -devices have been added or removed and the controller, bus, target and -lun used to address the device. It then notifies the SCSI mid layer -of these changes. - -Note that the naming convention of the /proc filesystem entries -contains a number in addition to the driver name. (E.g. "cciss0" -instead of just "cciss" which you might expect.) - -Note: ONLY sequential access devices and medium changers are presented -as SCSI devices to the SCSI mid layer by the cciss driver. Specifically, -physical SCSI disk drives are NOT presented to the SCSI mid layer. The -physical SCSI disk drives are controlled directly by the array controller -hardware and it is important to prevent the kernel from attempting to directly -access these devices too, as if the array controller were merely a SCSI -controller in the same way that we are allowing it to access SCSI tape drives. - -SCSI error handling for tape drives and medium changers -------------------------------------------------------- - -The linux SCSI mid layer provides an error handling protocol which -kicks into gear whenever a SCSI command fails to complete within a -certain amount of time (which can vary depending on the command). -The cciss driver participates in this protocol to some extent. The -normal protocol is a four step process. First the device is told -to abort the command. If that doesn't work, the device is reset. -If that doesn't work, the SCSI bus is reset. If that doesn't work -the host bus adapter is reset. Because the cciss driver is a block -driver as well as a SCSI driver and only the tape drives and medium -changers are presented to the SCSI mid layer, and unlike more -straightforward SCSI drivers, disk i/o continues through the block -side during the SCSI error recovery process, the cciss driver only -implements the first two of these actions, aborting the command, and -resetting the device. Additionally, most tape drives will not oblige -in aborting commands, and sometimes it appears they will not even -obey a reset command, though in most circumstances they will. In -the case that the command cannot be aborted and the device cannot be -reset, the device will be set offline. - -In the event the error handling code is triggered and a tape drive is -successfully reset or the tardy command is successfully aborted, the -tape drive may still not allow i/o to continue until some command -is issued which positions the tape to a known position. Typically you -must rewind the tape (by issuing "mt -f /dev/st0 rewind" for example) -before i/o can proceed again to a tape drive which was reset. - diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index d9014aa0eb6..d9e5d6f41b9 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -1,7 +1,8 @@ CGROUPS ------- -Written by Paul Menage based on Documentation/cpusets.txt +Written by Paul Menage based on +Documentation/cgroups/cpusets.txt Original copyright statements from cpusets.txt: Portions Copyright (C) 2004 BULL SA. @@ -68,7 +69,7 @@ On their own, the only use for cgroups is for simple job tracking. The intention is that other subsystems hook into the generic cgroup support to provide new attributes for cgroups, such as accounting/limiting the resources which processes in a cgroup can -access. For example, cpusets (see Documentation/cpusets.txt) allows +access. For example, cpusets (see Documentation/cgroups/cpusets.txt) allows you to associate a set of CPUs and a set of memory nodes with the tasks in each cgroup. @@ -227,7 +228,6 @@ Each cgroup is represented by a directory in the cgroup file system containing the following files describing that cgroup: - tasks: list of tasks (by pid) attached to that cgroup - - releasable flag: cgroup currently removeable? - notify_on_release flag: run the release agent on exit? - release_agent: the path to use for release notifications (this file exists in the top cgroup only) @@ -360,7 +360,7 @@ Now you want to do something with this cgroup. In this directory you can find several files: # ls -notify_on_release releasable tasks +notify_on_release tasks (plus whatever files added by the attached subsystems) Now attach your shell to this cgroup: @@ -479,7 +479,6 @@ newly-created cgroup if an error occurs after this subsystem's create() method has been called for the new cgroup). void pre_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp); -(cgroup_mutex held by caller) Called before checking the reference count on each subsystem. This may be useful for subsystems which have some extra references even if @@ -498,6 +497,7 @@ remain valid while the caller holds cgroup_mutex. void attach(struct cgroup_subsys *ss, struct cgroup *cgrp, struct cgroup *old_cgrp, struct task_struct *task) +(cgroup_mutex held by caller) Called after the task has been attached to the cgroup, to allow any post-attachment activity that requires memory allocations or blocking. @@ -511,6 +511,7 @@ void exit(struct cgroup_subsys *ss, struct task_struct *task) Called during task exit. int populate(struct cgroup_subsys *ss, struct cgroup *cgrp) +(cgroup_mutex held by caller) Called after creation of a cgroup to allow a subsystem to populate the cgroup directory with file entries. The subsystem should make @@ -520,6 +521,7 @@ method can return an error code, the error code is currently not always handled well. void post_clone(struct cgroup_subsys *ss, struct cgroup *cgrp) +(cgroup_mutex held by caller) Called at the end of cgroup_clone() to do any paramater initialization which might be required before a task could attach. For @@ -527,7 +529,7 @@ example in cpusets, no task may attach before 'cpus' and 'mems' are set up. void bind(struct cgroup_subsys *ss, struct cgroup *root) -(cgroup_mutex held by caller) +(cgroup_mutex and ss->hierarchy_mutex held by caller) Called when a cgroup subsystem is rebound to a different hierarchy and root cgroup. Currently this will only involve movement between diff --git a/Documentation/cgroups/cpuacct.txt b/Documentation/cgroups/cpuacct.txt new file mode 100644 index 00000000000..bb775fbe43d --- /dev/null +++ b/Documentation/cgroups/cpuacct.txt @@ -0,0 +1,32 @@ +CPU Accounting Controller +------------------------- + +The CPU accounting controller is used to group tasks using cgroups and +account the CPU usage of these groups of tasks. + +The CPU accounting controller supports multi-hierarchy groups. An accounting +group accumulates the CPU usage of all of its child groups and the tasks +directly present in its group. + +Accounting groups can be created by first mounting the cgroup filesystem. + +# mkdir /cgroups +# mount -t cgroup -ocpuacct none /cgroups + +With the above step, the initial or the parent accounting group +becomes visible at /cgroups. At bootup, this group includes all the +tasks in the system. /cgroups/tasks lists the tasks in this cgroup. +/cgroups/cpuacct.usage gives the CPU time (in nanoseconds) obtained by +this group which is essentially the CPU time obtained by all the tasks +in the system. + +New accounting groups can be created under the parent group /cgroups. + +# cd /cgroups +# mkdir g1 +# echo $$ > g1 + +The above steps create a new group g1 and move the current shell +process (bash) into it. CPU time consumed by this bash and its children +can be obtained from g1/cpuacct.usage and the same is accumulated in +/cgroups/cpuacct.usage also. diff --git a/Documentation/cgroups/cpusets.txt b/Documentation/cgroups/cpusets.txt new file mode 100644 index 00000000000..5c86c258c79 --- /dev/null +++ b/Documentation/cgroups/cpusets.txt @@ -0,0 +1,808 @@ + CPUSETS + ------- + +Copyright (C) 2004 BULL SA. +Written by Simon.Derr@bull.net + +Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. +Modified by Paul Jackson +Modified by Christoph Lameter +Modified by Paul Menage +Modified by Hidetoshi Seto + +CONTENTS: +========= + +1. Cpusets + 1.1 What are cpusets ? + 1.2 Why are cpusets needed ? + 1.3 How are cpusets implemented ? + 1.4 What are exclusive cpusets ? + 1.5 What is memory_pressure ? + 1.6 What is memory spread ? + 1.7 What is sched_load_balance ? + 1.8 What is sched_relax_domain_level ? + 1.9 How do I use cpusets ? +2. Usage Examples and Syntax + 2.1 Basic Usage + 2.2 Adding/removing cpus + 2.3 Setting flags + 2.4 Attaching processes +3. Questions +4. Contact + +1. Cpusets +========== + +1.1 What are cpusets ? +---------------------- + +Cpusets provide a mechanism for assigning a set of CPUs and Memory +Nodes to a set of tasks. In this document "Memory Node" refers to +an on-line node that contains memory. + +Cpusets constrain the CPU and Memory placement of tasks to only +the resources within a tasks current cpuset. They form a nested +hierarchy visible in a virtual file system. These are the essential +hooks, beyond what is already present, required to manage dynamic +job placement on large systems. + +Cpusets use the generic cgroup subsystem described in +Documentation/cgroups/cgroups.txt. + +Requests by a task, using the sched_setaffinity(2) system call to +include CPUs in its CPU affinity mask, and using the mbind(2) and +set_mempolicy(2) system calls to include Memory Nodes in its memory +policy, are both filtered through that tasks cpuset, filtering out any +CPUs or Memory Nodes not in that cpuset. The scheduler will not +schedule a task on a CPU that is not allowed in its cpus_allowed +vector, and the kernel page allocator will not allocate a page on a +node that is not allowed in the requesting tasks mems_allowed vector. + +User level code may create and destroy cpusets by name in the cgroup +virtual file system, manage the attributes and permissions of these +cpusets and which CPUs and Memory Nodes are assigned to each cpuset, +specify and query to which cpuset a task is assigned, and list the +task pids assigned to a cpuset. + + +1.2 Why are cpusets needed ? +---------------------------- + +The management of large computer systems, with many processors (CPUs), +complex memory cache hierarchies and multiple Memory Nodes having +non-uniform access times (NUMA) presents additional challenges for +the efficient scheduling and memory placement of processes. + +Frequently more modest sized systems can be operated with adequate +efficiency just by letting the operating system automatically share +the available CPU and Memory resources amongst the requesting tasks. + +But larger systems, which benefit more from careful processor and +memory placement to reduce memory access times and contention, +and which typically represent a larger investment for the customer, +can benefit from explicitly placing jobs on properly sized subsets of +the system. + +This can be especially valuable on: + + * Web Servers running multiple instances of the same web application, + * Servers running different applications (for instance, a web server + and a database), or + * NUMA systems running large HPC applications with demanding + performance characteristics. + +These subsets, or "soft partitions" must be able to be dynamically +adjusted, as the job mix changes, without impacting other concurrently +executing jobs. The location of the running jobs pages may also be moved +when the memory locations are changed. + +The kernel cpuset patch provides the minimum essential kernel +mechanisms required to efficiently implement such subsets. It +leverages existing CPU and Memory Placement facilities in the Linux +kernel to avoid any additional impact on the critical scheduler or +memory allocator code. + + +1.3 How are cpusets implemented ? +--------------------------------- + +Cpusets provide a Linux kernel mechanism to constrain which CPUs and +Memory Nodes are used by a process or set of processes. + +The Linux kernel already has a pair of mechanisms to specify on which +CPUs a task may be scheduled (sched_setaffinity) and on which Memory +Nodes it may obtain memory (mbind, set_mempolicy). + +Cpusets extends these two mechanisms as follows: + + - Cpusets are sets of allowed CPUs and Memory Nodes, known to the + kernel. + - Each task in the system is attached to a cpuset, via a pointer + in the task structure to a reference counted cgroup structure. + - Calls to sched_setaffinity are filtered to just those CPUs + allowed in that tasks cpuset. + - Calls to mbind and set_mempolicy are filtered to just + those Memory Nodes allowed in that tasks cpuset. + - The root cpuset contains all the systems CPUs and Memory + Nodes. + - For any cpuset, one can define child cpusets containing a subset + of the parents CPU and Memory Node resources. + - The hierarchy of cpusets can be mounted at /dev/cpuset, for + browsing and manipulation from user space. + - A cpuset may be marked exclusive, which ensures that no other + cpuset (except direct ancestors and descendents) may contain + any overlapping CPUs or Memory Nodes. + - You can list all the tasks (by pid) attached to any cpuset. + +The implementation of cpusets requires a few, simple hooks +into the rest of the kernel, none in performance critical paths: + + - in init/main.c, to initialize the root cpuset at system boot. + - in fork and exit, to attach and detach a task from its cpuset. + - in sched_setaffinity, to mask the requested CPUs by what's + allowed in that tasks cpuset. + - in sched.c migrate_all_tasks(), to keep migrating tasks within + the CPUs allowed by their cpuset, if possible. + - in the mbind and set_mempolicy system calls, to mask the requested + Memory Nodes by what's allowed in that tasks cpuset. + - in page_alloc.c, to restrict memory to allowed nodes. + - in vmscan.c, to restrict page recovery to the current cpuset. + +You should mount the "cgroup" filesystem type in order to enable +browsing and modifying the cpusets presently known to the kernel. No +new system calls are added for cpusets - all support for querying and +modifying cpusets is via this cpuset file system. + +The /proc//status file for each task has four added lines, +displaying the tasks cpus_allowed (on which CPUs it may be scheduled) +and mems_allowed (on which Memory Nodes it may obtain memory), +in the two formats seen in the following example: + + Cpus_allowed: ffffffff,ffffffff,ffffffff,ffffffff + Cpus_allowed_list: 0-127 + Mems_allowed: ffffffff,ffffffff + Mems_allowed_list: 0-63 + +Each cpuset is represented by a directory in the cgroup file system +containing (on top of the standard cgroup files) the following +files describing that cpuset: + + - cpus: list of CPUs in that cpuset + - mems: list of Memory Nodes in that cpuset + - memory_migrate flag: if set, move pages to cpusets nodes + - cpu_exclusive flag: is cpu placement exclusive? + - mem_exclusive flag: is memory placement exclusive? + - mem_hardwall flag: is memory allocation hardwalled + - memory_pressure: measure of how much paging pressure in cpuset + +In addition, the root cpuset only has the following file: + - memory_pressure_enabled flag: compute memory_pressure? + +New cpusets are created using the mkdir system call or shell +command. The properties of a cpuset, such as its flags, allowed +CPUs and Memory Nodes, and attached tasks, are modified by writing +to the appropriate file in that cpusets directory, as listed above. + +The named hierarchical structure of nested cpusets allows partitioning +a large system into nested, dynamically changeable, "soft-partitions". + +The attachment of each task, automatically inherited at fork by any +children of that task, to a cpuset allows organizing the work load +on a system into related sets of tasks such that each set is constrained +to using the CPUs and Memory Nodes of a particular cpuset. A task +may be re-attached to any other cpuset, if allowed by the permissions +on the necessary cpuset file system directories. + +Such management of a system "in the large" integrates smoothly with +the detailed placement done on individual tasks and memory regions +using the sched_setaffinity, mbind and set_mempolicy system calls. + +The following rules apply to each cpuset: + + - Its CPUs and Memory Nodes must be a subset of its parents. + - It can't be marked exclusive unless its parent is. + - If its cpu or memory is exclusive, they may not overlap any sibling. + +These rules, and the natural hierarchy of cpusets, enable efficient +enforcement of the exclusive guarantee, without having to scan all +cpusets every time any of them change to ensure nothing overlaps a +exclusive cpuset. Also, the use of a Linux virtual file system (vfs) +to represent the cpuset hierarchy provides for a familiar permission +and name space for cpusets, with a minimum of additional kernel code. + +The cpus and mems files in the root (top_cpuset) cpuset are +read-only. The cpus file automatically tracks the value of +cpu_online_map using a CPU hotplug notifier, and the mems file +automatically tracks the value of node_states[N_HIGH_MEMORY]--i.e., +nodes with memory--using the cpuset_track_online_nodes() hook. + + +1.4 What are exclusive cpusets ? +-------------------------------- + +If a cpuset is cpu or mem exclusive, no other cpuset, other than +a direct ancestor or descendent, may share any of the same CPUs or +Memory Nodes. + +A cpuset that is mem_exclusive *or* mem_hardwall is "hardwalled", +i.e. it restricts kernel allocations for page, buffer and other data +commonly shared by the kernel across multiple users. All cpusets, +whether hardwalled or not, restrict allocations of memory for user +space. This enables configuring a system so that several independent +jobs can share common kernel data, such as file system pages, while +isolating each job's user allocation in its own cpuset. To do this, +construct a large mem_exclusive cpuset to hold all the jobs, and +construct child, non-mem_exclusive cpusets for each individual job. +Only a small amount of typical kernel memory, such as requests from +interrupt handlers, is allowed to be taken outside even a +mem_exclusive cpuset. + + +1.5 What is memory_pressure ? +----------------------------- +The memory_pressure of a cpuset provides a simple per-cpuset metric +of the rate that the tasks in a cpuset are attempting to free up in +use memory on the nodes of the cpuset to satisfy additional memory +requests. + +This enables batch managers monitoring jobs running in dedicated +cpusets to efficiently detect what level of memory pressure that job +is causing. + +This is useful both on tightly managed systems running a wide mix of +submitted jobs, which may choose to terminate or re-prioritize jobs that +are trying to use more memory than allowed on the nodes assigned them, +and with tightly coupled, long running, massively parallel scientific +computing jobs that will dramatically fail to meet required performance +goals if they start to use more memory than allowed to them. + +This mechanism provides a very economical way for the batch manager +to monitor a cpuset for signs of memory pressure. It's up to the +batch manager or other user code to decide what to do about it and +take action. + +==> Unless this feature is enabled by writing "1" to the special file + /dev/cpuset/memory_pressure_enabled, the hook in the rebalance + code of __alloc_pages() for this metric reduces to simply noticing + that the cpuset_memory_pressure_enabled flag is zero. So only + systems that enable this feature will compute the metric. + +Why a per-cpuset, running average: + + Because this meter is per-cpuset, rather than per-task or mm, + the system load imposed by a batch scheduler monitoring this + metric is sharply reduced on large systems, because a scan of + the tasklist can be avoided on each set of queries. + + Because this meter is a running average, instead of an accumulating + counter, a batch scheduler can detect memory pressure with a + single read, instead of having to read and accumulate results + for a period of time. + + Because this meter is per-cpuset rather than per-task or mm, + the batch scheduler can obtain the key information, memory + pressure in a cpuset, with a single read, rather than having to + query and accumulate results over all the (dynamically changing) + set of tasks in the cpuset. + +A per-cpuset simple digital filter (requires a spinlock and 3 words +of data per-cpuset) is kept, and updated by any task attached to that +cpuset, if it enters the synchronous (direct) page reclaim code. + +A per-cpuset file provides an integer number representing the recent +(half-life of 10 seconds) rate of direct page reclaims caused by +the tasks in the cpuset, in units of reclaims attempted per second, +times 1000. + + +1.6 What is memory spread ? +--------------------------- +There are two boolean flag files per cpuset that control where the +kernel allocates pages for the file system buffers and related in +kernel data structures. They are called 'memory_spread_page' and +'memory_spread_slab'. + +If the per-cpuset boolean flag file 'memory_spread_page' is set, then +the kernel will spread the file system buffers (page cache) evenly +over all the nodes that the faulting task is allowed to use, instead +of preferring to put those pages on the node where the task is running. + +If the per-cpuset boolean flag file 'memory_spread_slab' is set, +then the kernel will spread some file system related slab caches, +such as for inodes and dentries evenly over all the nodes that the +faulting task is allowed to use, instead of preferring to put those +pages on the node where the task is running. + +The setting of these flags does not affect anonymous data segment or +stack segment pages of a task. + +By default, both kinds of memory spreading are off, and memory +pages are allocated on the node local to where the task is running, +except perhaps as modified by the tasks NUMA mempolicy or cpuset +configuration, so long as sufficient free memory pages are available. + +When new cpusets are created, they inherit the memory spread settings +of their parent. + +Setting memory spreading causes allocations for the affected page +or slab caches to ignore the tasks NUMA mempolicy and be spread +instead. Tasks using mbind() or set_mempolicy() calls to set NUMA +mempolicies will not notice any change in these calls as a result of +their containing tasks memory spread settings. If memory spreading +is turned off, then the currently specified NUMA mempolicy once again +applies to memory page allocations. + +Both 'memory_spread_page' and 'memory_spread_slab' are boolean flag +files. By default they contain "0", meaning that the feature is off +for that cpuset. If a "1" is written to that file, then that turns +the named feature on. + +The implementation is simple. + +Setting the flag 'memory_spread_page' turns on a per-process flag +PF_SPREAD_PAGE for each task that is in that cpuset or subsequently +joins that cpuset. The page allocation calls for the page cache +is modified to perform an inline check for this PF_SPREAD_PAGE task +flag, and if set, a call to a new routine cpuset_mem_spread_node() +returns the node to prefer for the allocation. + +Similarly, setting 'memory_spread_slab' turns on the flag +PF_SPREAD_SLAB, and appropriately marked slab caches will allocate +pages from the node returned by cpuset_mem_spread_node(). + +The cpuset_mem_spread_node() routine is also simple. It uses the +value of a per-task rotor cpuset_mem_spread_rotor to select the next +node in the current tasks mems_allowed to prefer for the allocation. + +This memory placement policy is also known (in other contexts) as +round-robin or interleave. + +This policy can provide substantial improvements for jobs that need +to place thread local data on the corresponding node, but that need +to access large file system data sets that need to be spread across +the several nodes in the jobs cpuset in order to fit. Without this +policy, especially for jobs that might have one thread reading in the +data set, the memory allocation across the nodes in the jobs cpuset +can become very uneven. + +1.7 What is sched_load_balance ? +-------------------------------- + +The kernel scheduler (kernel/sched.c) automatically load balances +tasks. If one CPU is underutilized, kernel code running on that +CPU will look for tasks on other more overloaded CPUs and move those +tasks to itself, within the constraints of such placement mechanisms +as cpusets and sched_setaffinity. + +The algorithmic cost of load balancing and its impact on key shared +kernel data structures such as the task list increases more than +linearly with the number of CPUs being balanced. So the scheduler +has support to partition the systems CPUs into a number of sched +domains such that it only load balances within each sched domain. +Each sched domain covers some subset of the CPUs in the system; +no two sched domains overlap; some CPUs might not be in any sched +domain and hence won't be load balanced. + +Put simply, it costs less to balance between two smaller sched domains +than one big one, but doing so means that overloads in one of the +two domains won't be load balanced to the other one. + +By default, there is one sched domain covering all CPUs, except those +marked isolated using the kernel boot time "isolcpus=" argument. + +This default load balancing across all CPUs is not well suited for +the following two situations: + 1) On large systems, load balancing across many CPUs is expensive. + If the system is managed using cpusets to place independent jobs + on separate sets of CPUs, full load balancing is unnecessary. + 2) Systems supporting realtime on some CPUs need to minimize + system overhead on those CPUs, including avoiding task load + balancing if that is not needed. + +When the per-cpuset flag "sched_load_balance" is enabled (the default +setting), it requests that all the CPUs in that cpusets allowed 'cpus' +be contained in a single sched domain, ensuring that load balancing +can move a task (not otherwised pinned, as by sched_setaffinity) +from any CPU in that cpuset to any other. + +When the per-cpuset flag "sched_load_balance" is disabled, then the +scheduler will avoid load balancing across the CPUs in that cpuset, +--except-- in so far as is necessary because some overlapping cpuset +has "sched_load_balance" enabled. + +So, for example, if the top cpuset has the flag "sched_load_balance" +enabled, then the scheduler will have one sched domain covering all +CPUs, and the setting of the "sched_load_balance" flag in any other +cpusets won't matter, as we're already fully load balancing. + +Therefore in the above two situations, the top cpuset flag +"sched_load_balance" should be disabled, and only some of the smaller, +child cpusets have this flag enabled. + +When doing this, you don't usually want to leave any unpinned tasks in +the top cpuset that might use non-trivial amounts of CPU, as such tasks +may be artificially constrained to some subset of CPUs, depending on +the particulars of this flag setting in descendent cpusets. Even if +such a task could use spare CPU cycles in some other CPUs, the kernel +scheduler might not consider the possibility of load balancing that +task to that underused CPU. + +Of course, tasks pinned to a particular CPU can be left in a cpuset +that disables "sched_load_balance" as those tasks aren't going anywhere +else anyway. + +There is an impedance mismatch here, between cpusets and sched domains. +Cpusets are hierarchical and nest. Sched domains are flat; they don't +overlap and each CPU is in at most one sched domain. + +It is necessary for sched domains to be flat because load balancing +across partially overlapping sets of CPUs would risk unstable dynamics +that would be beyond our understanding. So if each of two partially +overlapping cpusets enables the flag 'sched_load_balance', then we +form a single sched domain that is a superset of both. We won't move +a task to a CPU outside it cpuset, but the scheduler load balancing +code might waste some compute cycles considering that possibility. + +This mismatch is why there is not a simple one-to-one relation +between which cpusets have the flag "sched_load_balance" enabled, +and the sched domain configuration. If a cpuset enables the flag, it +will get balancing across all its CPUs, but if it disables the flag, +it will only be assured of no load balancing if no other overlapping +cpuset enables the flag. + +If two cpusets have partially overlapping 'cpus' allowed, and only +one of them has this flag enabled, then the other may find its +tasks only partially load balanced, just on the overlapping CPUs. +This is just the general case of the top_cpuset example given a few +paragraphs above. In the general case, as in the top cpuset case, +don't leave tasks that might use non-trivial amounts of CPU in +such partially load balanced cpusets, as they may be artificially +constrained to some subset of the CPUs allowed to them, for lack of +load balancing to the other CPUs. + +1.7.1 sched_load_balance implementation details. +------------------------------------------------ + +The per-cpuset flag 'sched_load_balance' defaults to enabled (contrary +to most cpuset flags.) When enabled for a cpuset, the kernel will +ensure that it can load balance across all the CPUs in that cpuset +(makes sure that all the CPUs in the cpus_allowed of that cpuset are +in the same sched domain.) + +If two overlapping cpusets both have 'sched_load_balance' enabled, +then they will be (must be) both in the same sched domain. + +If, as is the default, the top cpuset has 'sched_load_balance' enabled, +then by the above that means there is a single sched domain covering +the whole system, regardless of any other cpuset settings. + +The kernel commits to user space that it will avoid load balancing +where it can. It will pick as fine a granularity partition of sched +domains as it can while still providing load balancing for any set +of CPUs allowed to a cpuset having 'sched_load_balance' enabled. + +The internal kernel cpuset to scheduler interface passes from the +cpuset code to the scheduler code a partition of the load balanced +CPUs in the system. This partition is a set of subsets (represented +as an array of cpumask_t) of CPUs, pairwise disjoint, that cover all +the CPUs that must be load balanced. + +Whenever the 'sched_load_balance' flag changes, or CPUs come or go +from a cpuset with this flag enabled, or a cpuset with this flag +enabled is removed, the cpuset code builds a new such partition and +passes it to the scheduler sched domain setup code, to have the sched +domains rebuilt as necessary. + +This partition exactly defines what sched domains the scheduler should +setup - one sched domain for each element (cpumask_t) in the partition. + +The scheduler remembers the currently active sched domain partitions. +When the scheduler routine partition_sched_domains() is invoked from +the cpuset code to update these sched domains, it compares the new +partition requested with the current, and updates its sched domains, +removing the old and adding the new, for each change. + + +1.8 What is sched_relax_domain_level ? +-------------------------------------- + +In sched domain, the scheduler migrates tasks in 2 ways; periodic load +balance on tick, and at time of some schedule events. + +When a task is woken up, scheduler try to move the task on idle CPU. +For example, if a task A running on CPU X activates another task B +on the same CPU X, and if CPU Y is X's sibling and performing idle, +then scheduler migrate task B to CPU Y so that task B can start on +CPU Y without waiting task A on CPU X. + +And if a CPU run out of tasks in its runqueue, the CPU try to pull +extra tasks from other busy CPUs to help them before it is going to +be idle. + +Of course it takes some searching cost to find movable tasks and/or +idle CPUs, the scheduler might not search all CPUs in the domain +everytime. In fact, in some architectures, the searching ranges on +events are limited in the same socket or node where the CPU locates, +while the load balance on tick searchs all. + +For example, assume CPU Z is relatively far from CPU X. Even if CPU Z +is idle while CPU X and the siblings are busy, scheduler can't migrate +woken task B from X to Z since it is out of its searching range. +As the result, task B on CPU X need to wait task A or wait load balance +on the next tick. For some applications in special situation, waiting +1 tick may be too long. + +The 'sched_relax_domain_level' file allows you to request changing +this searching range as you like. This file takes int value which +indicates size of searching range in levels ideally as follows, +otherwise initial value -1 that indicates the cpuset has no request. + + -1 : no request. use system default or follow request of others. + 0 : no search. + 1 : search siblings (hyperthreads in a core). + 2 : search cores in a package. + 3 : search cpus in a node [= system wide on non-NUMA system] + ( 4 : search nodes in a chunk of node [on NUMA system] ) + ( 5 : search system wide [on NUMA system] ) + +The system default is architecture dependent. The system default +can be changed using the relax_domain_level= boot parameter. + +This file is per-cpuset and affect the sched domain where the cpuset +belongs to. Therefore if the flag 'sched_load_balance' of a cpuset +is disabled, then 'sched_relax_domain_level' have no effect since +there is no sched domain belonging the cpuset. + +If multiple cpusets are overlapping and hence they form a single sched +domain, the largest value among those is used. Be careful, if one +requests 0 and others are -1 then 0 is used. + +Note that modifying this file will have both good and bad effects, +and whether it is acceptable or not will be depend on your situation. +Don't modify this file if you are not sure. + +If your situation is: + - The migration costs between each cpu can be assumed considerably + small(for you) due to your special application's behavior or + special hardware support for CPU cache etc. + - The searching cost doesn't have impact(for you) or you can make + the searching cost enough small by managing cpuset to compact etc. + - The latency is required even it sacrifices cache hit rate etc. +then increasing 'sched_relax_domain_level' would benefit you. + + +1.9 How do I use cpusets ? +-------------------------- + +In order to minimize the impact of cpusets on critical kernel +code, such as the scheduler, and due to the fact that the kernel +does not support one task updating the memory placement of another +task directly, the impact on a task of changing its cpuset CPU +or Memory Node placement, or of changing to which cpuset a task +is attached, is subtle. + +If a cpuset has its Memory Nodes modified, then for each task attached +to that cpuset, the next time that the kernel attempts to allocate +a page of memory for that task, the kernel will notice the change +in the tasks cpuset, and update its per-task memory placement to +remain within the new cpusets memory placement. If the task was using +mempolicy MPOL_BIND, and the nodes to which it was bound overlap with +its new cpuset, then the task will continue to use whatever subset +of MPOL_BIND nodes are still allowed in the new cpuset. If the task +was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed +in the new cpuset, then the task will be essentially treated as if it +was MPOL_BIND bound to the new cpuset (even though its numa placement, +as queried by get_mempolicy(), doesn't change). If a task is moved +from one cpuset to another, then the kernel will adjust the tasks +memory placement, as above, the next time that the kernel attempts +to allocate a page of memory for that task. + +If a cpuset has its 'cpus' modified, then each task in that cpuset +will have its allowed CPU placement changed immediately. Similarly, +if a tasks pid is written to a cpusets 'tasks' file, in either its +current cpuset or another cpuset, then its allowed CPU placement is +changed immediately. If such a task had been bound to some subset +of its cpuset using the sched_setaffinity() call, the task will be +allowed to run on any CPU allowed in its new cpuset, negating the +affect of the prior sched_setaffinity() call. + +In summary, the memory placement of a task whose cpuset is changed is +updated by the kernel, on the next allocation of a page for that task, +but the processor placement is not updated, until that tasks pid is +rewritten to the 'tasks' file of its cpuset. This is done to avoid +impacting the scheduler code in the kernel with a check for changes +in a tasks processor placement. + +Normally, once a page is allocated (given a physical page +of main memory) then that page stays on whatever node it +was allocated, so long as it remains allocated, even if the +cpusets memory placement policy 'mems' subsequently changes. +If the cpuset flag file 'memory_migrate' is set true, then when +tasks are attached to that cpuset, any pages that task had +allocated to it on nodes in its previous cpuset are migrated +to the tasks new cpuset. The relative placement of the page within +the cpuset is preserved during these migration operations if possible. +For example if the page was on the second valid node of the prior cpuset +then the page will be placed on the second valid node of the new cpuset. + +Also if 'memory_migrate' is set true, then if that cpusets +'mems' file is modified, pages allocated to tasks in that +cpuset, that were on nodes in the previous setting of 'mems', +will be moved to nodes in the new setting of 'mems.' +Pages that were not in the tasks prior cpuset, or in the cpusets +prior 'mems' setting, will not be moved. + +There is an exception to the above. If hotplug functionality is used +to remove all the CPUs that are currently assigned to a cpuset, +then all the tasks in that cpuset will be moved to the nearest ancestor +with non-empty cpus. But the moving of some (or all) tasks might fail if +cpuset is bound with another cgroup subsystem which has some restrictions +on task attaching. In this failing case, those tasks will stay +in the original cpuset, and the kernel will automatically update +their cpus_allowed to allow all online CPUs. When memory hotplug +functionality for removing Memory Nodes is available, a similar exception +is expected to apply there as well. In general, the kernel prefers to +violate cpuset placement, over starving a task that has had all +its allowed CPUs or Memory Nodes taken offline. + +There is a second exception to the above. GFP_ATOMIC requests are +kernel internal allocations that must be satisfied, immediately. +The kernel may drop some request, in rare cases even panic, if a +GFP_ATOMIC alloc fails. If the request cannot be satisfied within +the current tasks cpuset, then we relax the cpuset, and look for +memory anywhere we can find it. It's better to violate the cpuset +than stress the kernel. + +To start a new job that is to be contained within a cpuset, the steps are: + + 1) mkdir /dev/cpuset + 2) mount -t cgroup -ocpuset cpuset /dev/cpuset + 3) Create the new cpuset by doing mkdir's and write's (or echo's) in + the /dev/cpuset virtual file system. + 4) Start a task that will be the "founding father" of the new job. + 5) Attach that task to the new cpuset by writing its pid to the + /dev/cpuset tasks file for that cpuset. + 6) fork, exec or clone the job tasks from this founding father task. + +For example, the following sequence of commands will setup a cpuset +named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, +and then start a subshell 'sh' in that cpuset: + + mount -t cgroup -ocpuset cpuset /dev/cpuset + cd /dev/cpuset + mkdir Charlie + cd Charlie + /bin/echo 2-3 > cpus + /bin/echo 1 > mems + /bin/echo $$ > tasks + sh + # The subshell 'sh' is now running in cpuset Charlie + # The next line should display '/Charlie' + cat /proc/self/cpuset + +In the future, a C library interface to cpusets will likely be +available. For now, the only way to query or modify cpusets is +via the cpuset file system, using the various cd, mkdir, echo, cat, +rmdir commands from the shell, or their equivalent from C. + +The sched_setaffinity calls can also be done at the shell prompt using +SGI's runon or Robert Love's taskset. The mbind and set_mempolicy +calls can be done at the shell prompt using the numactl command +(part of Andi Kleen's numa package). + +2. Usage Examples and Syntax +============================ + +2.1 Basic Usage +--------------- + +Creating, modifying, using the cpusets can be done through the cpuset +virtual filesystem. + +To mount it, type: +# mount -t cgroup -o cpuset cpuset /dev/cpuset + +Then under /dev/cpuset you can find a tree that corresponds to the +tree of the cpusets in the system. For instance, /dev/cpuset +is the cpuset that holds the whole system. + +If you want to create a new cpuset under /dev/cpuset: +# cd /dev/cpuset +# mkdir my_cpuset + +Now you want to do something with this cpuset. +# cd my_cpuset + +In this directory you can find several files: +# ls +cpu_exclusive memory_migrate mems tasks +cpus memory_pressure notify_on_release +mem_exclusive memory_spread_page sched_load_balance +mem_hardwall memory_spread_slab sched_relax_domain_level + +Reading them will give you information about the state of this cpuset: +the CPUs and Memory Nodes it can use, the processes that are using +it, its properties. By writing to these files you can manipulate +the cpuset. + +Set some flags: +# /bin/echo 1 > cpu_exclusive + +Add some cpus: +# /bin/echo 0-7 > cpus + +Add some mems: +# /bin/echo 0-7 > mems + +Now attach your shell to this cpuset: +# /bin/echo $$ > tasks + +You can also create cpusets inside your cpuset by using mkdir in this +directory. +# mkdir my_sub_cs + +To remove a cpuset, just use rmdir: +# rmdir my_sub_cs +This will fail if the cpuset is in use (has cpusets inside, or has +processes attached). + +Note that for legacy reasons, the "cpuset" filesystem exists as a +wrapper around the cgroup filesystem. + +The command + +mount -t cpuset X /dev/cpuset + +is equivalent to + +mount -t cgroup -ocpuset X /dev/cpuset +echo "/sbin/cpuset_release_agent" > /dev/cpuset/release_agent + +2.2 Adding/removing cpus +------------------------ + +This is the syntax to use when writing in the cpus or mems files +in cpuset directories: + +# /bin/echo 1-4 > cpus -> set cpus list to cpus 1,2,3,4 +# /bin/echo 1,2,3,4 > cpus -> set cpus list to cpus 1,2,3,4 + +2.3 Setting flags +----------------- + +The syntax is very simple: + +# /bin/echo 1 > cpu_exclusive -> set flag 'cpu_exclusive' +# /bin/echo 0 > cpu_exclusive -> unset flag 'cpu_exclusive' + +2.4 Attaching processes +----------------------- + +# /bin/echo PID > tasks + +Note that it is PID, not PIDs. You can only attach ONE task at a time. +If you have several tasks to attach, you have to do it one after another: + +# /bin/echo PID1 > tasks +# /bin/echo PID2 > tasks + ... +# /bin/echo PIDn > tasks + + +3. Questions +============ + +Q: what's up with this '/bin/echo' ? +A: bash's builtin 'echo' command does not check calls to write() against + errors. If you use it in the cpuset file system, you won't be + able to tell whether a command succeeded or failed. + +Q: When I attach processes, only the first of the line gets really attached ! +A: We can only return one error code per call to write(). So you should also + put only ONE pid. + +4. Contact +========== + +Web: http://www.bullopensource.org/cpuset diff --git a/Documentation/cgroups/devices.txt b/Documentation/cgroups/devices.txt new file mode 100644 index 00000000000..7cc6e6a6067 --- /dev/null +++ b/Documentation/cgroups/devices.txt @@ -0,0 +1,52 @@ +Device Whitelist Controller + +1. Description: + +Implement a cgroup to track and enforce open and mknod restrictions +on device files. A device cgroup associates a device access +whitelist with each cgroup. A whitelist entry has 4 fields. +'type' is a (all), c (char), or b (block). 'all' means it applies +to all types and all major and minor numbers. Major and minor are +either an integer or * for all. Access is a composition of r +(read), w (write), and m (mknod). + +The root device cgroup starts with rwm to 'all'. A child device +cgroup gets a copy of the parent. Administrators can then remove +devices from the whitelist or add new entries. A child cgroup can +never receive a device access which is denied by its parent. However +when a device access is removed from a parent it will not also be +removed from the child(ren). + +2. User Interface + +An entry is added using devices.allow, and removed using +devices.deny. For instance + + echo 'c 1:3 mr' > /cgroups/1/devices.allow + +allows cgroup 1 to read and mknod the device usually known as +/dev/null. Doing + + echo a > /cgroups/1/devices.deny + +will remove the default 'a *:* rwm' entry. Doing + + echo a > /cgroups/1/devices.allow + +will add the 'a *:* rwm' entry to the whitelist. + +3. Security + +Any task can move itself between cgroups. This clearly won't +suffice, but we can decide the best way to adequately restrict +movement as people get some experience with this. We may just want +to require CAP_SYS_ADMIN, which at least is a separate bit from +CAP_MKNOD. We may want to just refuse moving to a cgroup which +isn't a descendent of the current one. Or we may want to use +CAP_MAC_ADMIN, since we really are trying to lock down root. + +CAP_SYS_ADMIN is needed to modify the whitelist or move another +task to a new cgroup. (Again we'll probably want to change that). + +A cgroup may not be granted more permissions than the cgroup's +parent has. diff --git a/Documentation/cgroups/freezer-subsystem.txt b/Documentation/cgroups/freezer-subsystem.txt index c50ab58b72e..41f37fea127 100644 --- a/Documentation/cgroups/freezer-subsystem.txt +++ b/Documentation/cgroups/freezer-subsystem.txt @@ -1,4 +1,4 @@ - The cgroup freezer is useful to batch job management system which start +The cgroup freezer is useful to batch job management system which start and stop sets of tasks in order to schedule the resources of a machine according to the desires of a system administrator. This sort of program is often used on HPC clusters to schedule access to the cluster as a @@ -6,7 +6,7 @@ whole. The cgroup freezer uses cgroups to describe the set of tasks to be started/stopped by the batch job management system. It also provides a means to start and stop the tasks composing the job. - The cgroup freezer will also be useful for checkpointing running groups +The cgroup freezer will also be useful for checkpointing running groups of tasks. The freezer allows the checkpoint code to obtain a consistent image of the tasks by attempting to force the tasks in a cgroup into a quiescent state. Once the tasks are quiescent another task can @@ -16,7 +16,7 @@ recoverable error occur. This also allows the checkpointed tasks to be migrated between nodes in a cluster by copying the gathered information to another node and restarting the tasks there. - Sequences of SIGSTOP and SIGCONT are not always sufficient for stopping +Sequences of SIGSTOP and SIGCONT are not always sufficient for stopping and resuming tasks in userspace. Both of these signals are observable from within the tasks we wish to freeze. While SIGSTOP cannot be caught, blocked, or ignored it can be seen by waiting or ptracing parent tasks. @@ -37,26 +37,29 @@ demonstrate this problem using nested bash shells: - This happens because bash can observe both signals and choose how it +This happens because bash can observe both signals and choose how it responds to them. - Another example of a program which catches and responds to these +Another example of a program which catches and responds to these signals is gdb. In fact any program designed to use ptrace is likely to have a problem with this method of stopping and resuming tasks. - In contrast, the cgroup freezer uses the kernel freezer code to +In contrast, the cgroup freezer uses the kernel freezer code to prevent the freeze/unfreeze cycle from becoming visible to the tasks being frozen. This allows the bash example above and gdb to run as expected. - The freezer subsystem in the container filesystem defines a file named +The freezer subsystem in the container filesystem defines a file named freezer.state. Writing "FROZEN" to the state file will freeze all tasks in the cgroup. Subsequently writing "THAWED" will unfreeze the tasks in the cgroup. Reading will return the current state. +Note freezer.state doesn't exist in root cgroup, which means root cgroup +is non-freezable. + * Examples of usage : - # mkdir /containers/freezer + # mkdir /containers # mount -t cgroup -ofreezer freezer /containers # mkdir /containers/0 # echo $some_pid > /containers/0/tasks @@ -94,6 +97,6 @@ things happens: the freezer.state file 2) Userspace retries the freezing operation by writing "FROZEN" to the freezer.state file (writing "FREEZING" is not legal - and returns EIO) + and returns EINVAL) 3) The tasks that blocked the cgroup from entering the "FROZEN" state disappear from the cgroup's set of tasks. diff --git a/Documentation/cgroups/memcg_test.txt b/Documentation/cgroups/memcg_test.txt new file mode 100644 index 00000000000..19533f93b7a --- /dev/null +++ b/Documentation/cgroups/memcg_test.txt @@ -0,0 +1,342 @@ +Memory Resource Controller(Memcg) Implementation Memo. +Last Updated: 2008/12/15 +Base Kernel Version: based on 2.6.28-rc8-mm. + +Because VM is getting complex (one of reasons is memcg...), memcg's behavior +is complex. This is a document for memcg's internal behavior. +Please note that implementation details can be changed. + +(*) Topics on API should be in Documentation/cgroups/memory.txt) + +0. How to record usage ? + 2 objects are used. + + page_cgroup ....an object per page. + Allocated at boot or memory hotplug. Freed at memory hot removal. + + swap_cgroup ... an entry per swp_entry. + Allocated at swapon(). Freed at swapoff(). + + The page_cgroup has USED bit and double count against a page_cgroup never + occurs. swap_cgroup is used only when a charged page is swapped-out. + +1. Charge + + a page/swp_entry may be charged (usage += PAGE_SIZE) at + + mem_cgroup_newpage_charge() + Called at new page fault and Copy-On-Write. + + mem_cgroup_try_charge_swapin() + Called at do_swap_page() (page fault on swap entry) and swapoff. + Followed by charge-commit-cancel protocol. (With swap accounting) + At commit, a charge recorded in swap_cgroup is removed. + + mem_cgroup_cache_charge() + Called at add_to_page_cache() + + mem_cgroup_cache_charge_swapin() + Called at shmem's swapin. + + mem_cgroup_prepare_migration() + Called before migration. "extra" charge is done and followed by + charge-commit-cancel protocol. + At commit, charge against oldpage or newpage will be committed. + +2. Uncharge + a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by + + mem_cgroup_uncharge_page() + Called when an anonymous page is fully unmapped. I.e., mapcount goes + to 0. If the page is SwapCache, uncharge is delayed until + mem_cgroup_uncharge_swapcache(). + + mem_cgroup_uncharge_cache_page() + Called when a page-cache is deleted from radix-tree. If the page is + SwapCache, uncharge is delayed until mem_cgroup_uncharge_swapcache(). + + mem_cgroup_uncharge_swapcache() + Called when SwapCache is removed from radix-tree. The charge itself + is moved to swap_cgroup. (If mem+swap controller is disabled, no + charge to swap occurs.) + + mem_cgroup_uncharge_swap() + Called when swp_entry's refcnt goes down to 0. A charge against swap + disappears. + + mem_cgroup_end_migration(old, new) + At success of migration old is uncharged (if necessary), a charge + to new page is committed. At failure, charge to old page is committed. + +3. charge-commit-cancel + In some case, we can't know this "charge" is valid or not at charging + (because of races). + To handle such case, there are charge-commit-cancel functions. + mem_cgroup_try_charge_XXX + mem_cgroup_commit_charge_XXX + mem_cgroup_cancel_charge_XXX + these are used in swap-in and migration. + + At try_charge(), there are no flags to say "this page is charged". + at this point, usage += PAGE_SIZE. + + At commit(), the function checks the page should be charged or not + and set flags or avoid charging.(usage -= PAGE_SIZE) + + At cancel(), simply usage -= PAGE_SIZE. + +Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. + +4. Anonymous + Anonymous page is newly allocated at + - page fault into MAP_ANONYMOUS mapping. + - Copy-On-Write. + It is charged right after it's allocated before doing any page table + related operations. Of course, it's uncharged when another page is used + for the fault address. + + At freeing anonymous page (by exit() or munmap()), zap_pte() is called + and pages for ptes are freed one by one.(see mm/memory.c). Uncharges + are done at page_remove_rmap() when page_mapcount() goes down to 0. + + Another page freeing is by page-reclaim (vmscan.c) and anonymous + pages are swapped out. In this case, the page is marked as + PageSwapCache(). uncharge() routine doesn't uncharge the page marked + as SwapCache(). It's delayed until __delete_from_swap_cache(). + + 4.1 Swap-in. + At swap-in, the page is taken from swap-cache. There are 2 cases. + + (a) If the SwapCache is newly allocated and read, it has no charges. + (b) If the SwapCache has been mapped by processes, it has been + charged already. + + This swap-in is one of the most complicated work. In do_swap_page(), + following events occur when pte is unchanged. + + (1) the page (SwapCache) is looked up. + (2) lock_page() + (3) try_charge_swapin() + (4) reuse_swap_page() (may call delete_swap_cache()) + (5) commit_charge_swapin() + (6) swap_free(). + + Considering following situation for example. + + (A) The page has not been charged before (2) and reuse_swap_page() + doesn't call delete_from_swap_cache(). + (B) The page has not been charged before (2) and reuse_swap_page() + calls delete_from_swap_cache(). + (C) The page has been charged before (2) and reuse_swap_page() doesn't + call delete_from_swap_cache(). + (D) The page has been charged before (2) and reuse_swap_page() calls + delete_from_swap_cache(). + + memory.usage/memsw.usage changes to this page/swp_entry will be + Case (A) (B) (C) (D) + Event + Before (2) 0/ 1 0/ 1 1/ 1 1/ 1 + =========================================== + (3) +1/+1 +1/+1 +1/+1 +1/+1 + (4) - 0/ 0 - -1/ 0 + (5) 0/-1 0/ 0 -1/-1 0/ 0 + (6) - 0/-1 - 0/-1 + =========================================== + Result 1/ 1 1/ 1 1/ 1 1/ 1 + + In any cases, charges to this page should be 1/ 1. + + 4.2 Swap-out. + At swap-out, typical state transition is below. + + (a) add to swap cache. (marked as SwapCache) + swp_entry's refcnt += 1. + (b) fully unmapped. + swp_entry's refcnt += # of ptes. + (c) write back to swap. + (d) delete from swap cache. (remove from SwapCache) + swp_entry's refcnt -= 1. + + + At (b), the page is marked as SwapCache and not uncharged. + At (d), the page is removed from SwapCache and a charge in page_cgroup + is moved to swap_cgroup. + + Finally, at task exit, + (e) zap_pte() is called and swp_entry's refcnt -=1 -> 0. + Here, a charge in swap_cgroup disappears. + +5. Page Cache + Page Cache is charged at + - add_to_page_cache_locked(). + + uncharged at + - __remove_from_page_cache(). + + The logic is very clear. (About migration, see below) + Note: __remove_from_page_cache() is called by remove_from_page_cache() + and __remove_mapping(). + +6. Shmem(tmpfs) Page Cache + Memcg's charge/uncharge have special handlers of shmem. The best way + to understand shmem's page state transition is to read mm/shmem.c. + But brief explanation of the behavior of memcg around shmem will be + helpful to understand the logic. + + Shmem's page (just leaf page, not direct/indirect block) can be on + - radix-tree of shmem's inode. + - SwapCache. + - Both on radix-tree and SwapCache. This happens at swap-in + and swap-out, + + It's charged when... + - A new page is added to shmem's radix-tree. + - A swp page is read. (move a charge from swap_cgroup to page_cgroup) + It's uncharged when + - A page is removed from radix-tree and not SwapCache. + - When SwapCache is removed, a charge is moved to swap_cgroup. + - When swp_entry's refcnt goes down to 0, a charge in swap_cgroup + disappears. + +7. Page Migration + One of the most complicated functions is page-migration-handler. + Memcg has 2 routines. Assume that we are migrating a page's contents + from OLDPAGE to NEWPAGE. + + Usual migration logic is.. + (a) remove the page from LRU. + (b) allocate NEWPAGE (migration target) + (c) lock by lock_page(). + (d) unmap all mappings. + (e-1) If necessary, replace entry in radix-tree. + (e-2) move contents of a page. + (f) map all mappings again. + (g) pushback the page to LRU. + (-) OLDPAGE will be freed. + + Before (g), memcg should complete all necessary charge/uncharge to + NEWPAGE/OLDPAGE. + + The point is.... + - If OLDPAGE is anonymous, all charges will be dropped at (d) because + try_to_unmap() drops all mapcount and the page will not be + SwapCache. + + - If OLDPAGE is SwapCache, charges will be kept at (g) because + __delete_from_swap_cache() isn't called at (e-1) + + - If OLDPAGE is page-cache, charges will be kept at (g) because + remove_from_swap_cache() isn't called at (e-1) + + memcg provides following hooks. + + - mem_cgroup_prepare_migration(OLDPAGE) + Called after (b) to account a charge (usage += PAGE_SIZE) against + memcg which OLDPAGE belongs to. + + - mem_cgroup_end_migration(OLDPAGE, NEWPAGE) + Called after (f) before (g). + If OLDPAGE is used, commit OLDPAGE again. If OLDPAGE is already + charged, a charge by prepare_migration() is automatically canceled. + If NEWPAGE is used, commit NEWPAGE and uncharge OLDPAGE. + + But zap_pte() (by exit or munmap) can be called while migration, + we have to check if OLDPAGE/NEWPAGE is a valid page after commit(). + +8. LRU + Each memcg has its own private LRU. Now, it's handling is under global + VM's control (means that it's handled under global zone->lru_lock). + Almost all routines around memcg's LRU is called by global LRU's + list management functions under zone->lru_lock(). + + A special function is mem_cgroup_isolate_pages(). This scans + memcg's private LRU and call __isolate_lru_page() to extract a page + from LRU. + (By __isolate_lru_page(), the page is removed from both of global and + private LRU.) + + +9. Typical Tests. + + Tests for racy cases. + + 9.1 Small limit to memcg. + When you do test to do racy case, it's good test to set memcg's limit + to be very small rather than GB. Many races found in the test under + xKB or xxMB limits. + (Memory behavior under GB and Memory behavior under MB shows very + different situation.) + + 9.2 Shmem + Historically, memcg's shmem handling was poor and we saw some amount + of troubles here. This is because shmem is page-cache but can be + SwapCache. Test with shmem/tmpfs is always good test. + + 9.3 Migration + For NUMA, migration is an another special case. To do easy test, cpuset + is useful. Following is a sample script to do migration. + + mount -t cgroup -o cpuset none /opt/cpuset + + mkdir /opt/cpuset/01 + echo 1 > /opt/cpuset/01/cpuset.cpus + echo 0 > /opt/cpuset/01/cpuset.mems + echo 1 > /opt/cpuset/01/cpuset.memory_migrate + mkdir /opt/cpuset/02 + echo 1 > /opt/cpuset/02/cpuset.cpus + echo 1 > /opt/cpuset/02/cpuset.mems + echo 1 > /opt/cpuset/02/cpuset.memory_migrate + + In above set, when you moves a task from 01 to 02, page migration to + node 0 to node 1 will occur. Following is a script to migrate all + under cpuset. + -- + move_task() + { + for pid in $1 + do + /bin/echo $pid >$2/tasks 2>/dev/null + echo -n $pid + echo -n " " + done + echo END + } + + G1_TASK=`cat ${G1}/tasks` + G2_TASK=`cat ${G2}/tasks` + move_task "${G1_TASK}" ${G2} & + -- + 9.4 Memory hotplug. + memory hotplug test is one of good test. + to offline memory, do following. + # echo offline > /sys/devices/system/memory/memoryXXX/state + (XXX is the place of memory) + This is an easy way to test page migration, too. + + 9.5 mkdir/rmdir + When using hierarchy, mkdir/rmdir test should be done. + Use tests like the following. + + echo 1 >/opt/cgroup/01/memory/use_hierarchy + mkdir /opt/cgroup/01/child_a + mkdir /opt/cgroup/01/child_b + + set limit to 01. + add limit to 01/child_b + run jobs under child_a and child_b + + create/delete following groups at random while jobs are running. + /opt/cgroup/01/child_a/child_aa + /opt/cgroup/01/child_b/child_bb + /opt/cgroup/01/child_c + + running new jobs in new group is also good. + + 9.6 Mount with other subsystems. + Mounting with other subsystems is a good test because there is a + race and lock dependency with other cgroup subsystems. + + example) + # mount -t cgroup none /cgroup -t cpuset,memory,cpu,devices + + and do task move, mkdir, rmdir etc...under this. diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt new file mode 100644 index 00000000000..e1501964df1 --- /dev/null +++ b/Documentation/cgroups/memory.txt @@ -0,0 +1,399 @@ +Memory Resource Controller + +NOTE: The Memory Resource Controller has been generically been referred +to as the memory controller in this document. Do not confuse memory controller +used here with the memory controller that is used in hardware. + +Salient features + +a. Enable control of both RSS (mapped) and Page Cache (unmapped) pages +b. The infrastructure allows easy addition of other types of memory to control +c. Provides *zero overhead* for non memory controller users +d. Provides a double LRU: global memory pressure causes reclaim from the + global LRU; a cgroup on hitting a limit, reclaims from the per + cgroup LRU + +NOTE: Swap Cache (unmapped) is not accounted now. + +Benefits and Purpose of the memory controller + +The memory controller isolates the memory behaviour of a group of tasks +from the rest of the system. The article on LWN [12] mentions some probable +uses of the memory controller. The memory controller can be used to + +a. Isolate an application or a group of applications + Memory hungry applications can be isolated and limited to a smaller + amount of memory. +b. Create a cgroup with limited amount of memory, this can be used + as a good alternative to booting with mem=XXXX. +c. Virtualization solutions can control the amount of memory they want + to assign to a virtual machine instance. +d. A CD/DVD burner could control the amount of memory used by the + rest of the system to ensure that burning does not fail due to lack + of available memory. +e. There are several other use cases, find one or use the controller just + for fun (to learn and hack on the VM subsystem). + +1. History + +The memory controller has a long history. A request for comments for the memory +controller was posted by Balbir Singh [1]. At the time the RFC was posted +there were several implementations for memory control. The goal of the +RFC was to build consensus and agreement for the minimal features required +for memory control. The first RSS controller was posted by Balbir Singh[2] +in Feb 2007. Pavel Emelianov [3][4][5] has since posted three versions of the +RSS controller. At OLS, at the resource management BoF, everyone suggested +that we handle both page cache and RSS together. Another request was raised +to allow user space handling of OOM. The current memory controller is +at version 6; it combines both mapped (RSS) and unmapped Page +Cache Control [11]. + +2. Memory Control + +Memory is a unique resource in the sense that it is present in a limited +amount. If a task requires a lot of CPU processing, the task can spread +its processing over a period of hours, days, months or years, but with +memory, the same physical memory needs to be reused to accomplish the task. + +The memory controller implementation has been divided into phases. These +are: + +1. Memory controller +2. mlock(2) controller +3. Kernel user memory accounting and slab control +4. user mappings length controller + +The memory controller is the first controller developed. + +2.1. Design + +The core of the design is a counter called the res_counter. The res_counter +tracks the current memory usage and limit of the group of processes associated +with the controller. Each cgroup has a memory controller specific data +structure (mem_cgroup) associated with it. + +2.2. Accounting + + +--------------------+ + | mem_cgroup | + | (res_counter) | + +--------------------+ + / ^ \ + / | \ + +---------------+ | +---------------+ + | mm_struct | |.... | mm_struct | + | | | | | + +---------------+ | +---------------+ + | + + --------------+ + | + +---------------+ +------+--------+ + | page +----------> page_cgroup| + | | | | + +---------------+ +---------------+ + + (Figure 1: Hierarchy of Accounting) + + +Figure 1 shows the important aspects of the controller + +1. Accounting happens per cgroup +2. Each mm_struct knows about which cgroup it belongs to +3. Each page has a pointer to the page_cgroup, which in turn knows the + cgroup it belongs to + +The accounting is done as follows: mem_cgroup_charge() is invoked to setup +the necessary data structures and check if the cgroup that is being charged +is over its limit. If it is then reclaim is invoked on the cgroup. +More details can be found in the reclaim section of this document. +If everything goes well, a page meta-data-structure called page_cgroup is +allocated and associated with the page. This routine also adds the page to +the per cgroup LRU. + +2.2.1 Accounting details + +All mapped anon pages (RSS) and cache pages (Page Cache) are accounted. +(some pages which never be reclaimable and will not be on global LRU + are not accounted. we just accounts pages under usual vm management.) + +RSS pages are accounted at page_fault unless they've already been accounted +for earlier. A file page will be accounted for as Page Cache when it's +inserted into inode (radix-tree). While it's mapped into the page tables of +processes, duplicate accounting is carefully avoided. + +A RSS page is unaccounted when it's fully unmapped. A PageCache page is +unaccounted when it's removed from radix-tree. + +At page migration, accounting information is kept. + +Note: we just account pages-on-lru because our purpose is to control amount +of used pages. not-on-lru pages are tend to be out-of-control from vm view. + +2.3 Shared Page Accounting + +Shared pages are accounted on the basis of the first touch approach. The +cgroup that first touches a page is accounted for the page. The principle +behind this approach is that a cgroup that aggressively uses a shared +page will eventually get charged for it (once it is uncharged from +the cgroup that brought it in -- this will happen on memory pressure). + +Exception: If CONFIG_CGROUP_CGROUP_MEM_RES_CTLR_SWAP is not used.. +When you do swapoff and make swapped-out pages of shmem(tmpfs) to +be backed into memory in force, charges for pages are accounted against the +caller of swapoff rather than the users of shmem. + + +2.4 Swap Extension (CONFIG_CGROUP_MEM_RES_CTLR_SWAP) +Swap Extension allows you to record charge for swap. A swapped-in page is +charged back to original page allocator if possible. + +When swap is accounted, following files are added. + - memory.memsw.usage_in_bytes. + - memory.memsw.limit_in_bytes. + +usage of mem+swap is limited by memsw.limit_in_bytes. + +Note: why 'mem+swap' rather than swap. +The global LRU(kswapd) can swap out arbitrary pages. Swap-out means +to move account from memory to swap...there is no change in usage of +mem+swap. + +In other words, when we want to limit the usage of swap without affecting +global LRU, mem+swap limit is better than just limiting swap from OS point +of view. + +2.5 Reclaim + +Each cgroup maintains a per cgroup LRU that consists of an active +and inactive list. When a cgroup goes over its limit, we first try +to reclaim memory from the cgroup so as to make space for the new +pages that the cgroup has touched. If the reclaim is unsuccessful, +an OOM routine is invoked to select and kill the bulkiest task in the +cgroup. + +The reclaim algorithm has not been modified for cgroups, except that +pages that are selected for reclaiming come from the per cgroup LRU +list. + +2. Locking + +The memory controller uses the following hierarchy + +1. zone->lru_lock is used for selecting pages to be isolated +2. mem->per_zone->lru_lock protects the per cgroup LRU (per zone) +3. lock_page_cgroup() is used to protect page->page_cgroup + +3. User Interface + +0. Configuration + +a. Enable CONFIG_CGROUPS +b. Enable CONFIG_RESOURCE_COUNTERS +c. Enable CONFIG_CGROUP_MEM_RES_CTLR + +1. Prepare the cgroups +# mkdir -p /cgroups +# mount -t cgroup none /cgroups -o memory + +2. Make the new group and move bash into it +# mkdir /cgroups/0 +# echo $$ > /cgroups/0/tasks + +Since now we're in the 0 cgroup, +We can alter the memory limit: +# echo 4M > /cgroups/0/memory.limit_in_bytes + +NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, +mega or gigabytes. + +# cat /cgroups/0/memory.limit_in_bytes +4194304 + +NOTE: The interface has now changed to display the usage in bytes +instead of pages + +We can check the usage: +# cat /cgroups/0/memory.usage_in_bytes +1216512 + +A successful write to this file does not guarantee a successful set of +this limit to the value written into the file. This can be due to a +number of factors, such as rounding up to page boundaries or the total +availability of memory on the system. The user is required to re-read +this file after a write to guarantee the value committed by the kernel. + +# echo 1 > memory.limit_in_bytes +# cat memory.limit_in_bytes +4096 + +The memory.failcnt field gives the number of times that the cgroup limit was +exceeded. + +The memory.stat file gives accounting information. Now, the number of +caches, RSS and Active pages/Inactive pages are shown. + +4. Testing + +Balbir posted lmbench, AIM9, LTP and vmmstress results [10] and [11]. +Apart from that v6 has been tested with several applications and regular +daily use. The controller has also been tested on the PPC64, x86_64 and +UML platforms. + +4.1 Troubleshooting + +Sometimes a user might find that the application under a cgroup is +terminated. There are several causes for this: + +1. The cgroup limit is too low (just too low to do anything useful) +2. The user is using anonymous memory and swap is turned off or too low + +A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of +some of the pages cached in the cgroup (page cache pages). + +4.2 Task migration + +When a task migrates from one cgroup to another, it's charge is not +carried forward. The pages allocated from the original cgroup still +remain charged to it, the charge is dropped when the page is freed or +reclaimed. + +4.3 Removing a cgroup + +A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a +cgroup might have some charge associated with it, even though all +tasks have migrated away from it. +Such charges are freed(at default) or moved to its parent. When moved, +both of RSS and CACHES are moved to parent. +If both of them are busy, rmdir() returns -EBUSY. See 5.1 Also. + +Charges recorded in swap information is not updated at removal of cgroup. +Recorded information is discarded and a cgroup which uses swap (swapcache) +will be charged as a new owner of it. + + +5. Misc. interfaces. + +5.1 force_empty + memory.force_empty interface is provided to make cgroup's memory usage empty. + You can use this interface only when the cgroup has no tasks. + When writing anything to this + + # echo 0 > memory.force_empty + + Almost all pages tracked by this memcg will be unmapped and freed. Some of + pages cannot be freed because it's locked or in-use. Such pages are moved + to parent and this cgroup will be empty. But this may return -EBUSY in + some too busy case. + + Typical use case of this interface is that calling this before rmdir(). + Because rmdir() moves all pages to parent, some out-of-use page caches can be + moved to the parent. If you want to avoid that, force_empty will be useful. + +5.2 stat file + memory.stat file includes following statistics (now) + cache - # of pages from page-cache and shmem. + rss - # of pages from anonymous memory. + pgpgin - # of event of charging + pgpgout - # of event of uncharging + active_anon - # of pages on active lru of anon, shmem. + inactive_anon - # of pages on active lru of anon, shmem + active_file - # of pages on active lru of file-cache + inactive_file - # of pages on inactive lru of file cache + unevictable - # of pages cannot be reclaimed.(mlocked etc) + + Below is depend on CONFIG_DEBUG_VM. + inactive_ratio - VM inernal parameter. (see mm/page_alloc.c) + recent_rotated_anon - VM internal parameter. (see mm/vmscan.c) + recent_rotated_file - VM internal parameter. (see mm/vmscan.c) + recent_scanned_anon - VM internal parameter. (see mm/vmscan.c) + recent_scanned_file - VM internal parameter. (see mm/vmscan.c) + + Memo: + recent_rotated means recent frequency of lru rotation. + recent_scanned means recent # of scans to lru. + showing for better debug please see the code for meanings. + + +5.3 swappiness + Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only. + + Following cgroup's swapiness can't be changed. + - root cgroup (uses /proc/sys/vm/swappiness). + - a cgroup which uses hierarchy and it has child cgroup. + - a cgroup which uses hierarchy and not the root of hierarchy. + + +6. Hierarchy support + +The memory controller supports a deep hierarchy and hierarchical accounting. +The hierarchy is created by creating the appropriate cgroups in the +cgroup filesystem. Consider for example, the following cgroup filesystem +hierarchy + + root + / | \ + / | \ + a b c + | \ + | \ + d e + +In the diagram above, with hierarchical accounting enabled, all memory +usage of e, is accounted to its ancestors up until the root (i.e, c and root), +that has memory.use_hierarchy enabled. If one of the ancestors goes over its +limit, the reclaim algorithm reclaims from the tasks in the ancestor and the +children of the ancestor. + +6.1 Enabling hierarchical accounting and reclaim + +The memory controller by default disables the hierarchy feature. Support +can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup + +# echo 1 > memory.use_hierarchy + +The feature can be disabled by + +# echo 0 > memory.use_hierarchy + +NOTE1: Enabling/disabling will fail if the cgroup already has other +cgroups created below it. + +NOTE2: This feature can be enabled/disabled per subtree. + +7. TODO + +1. Add support for accounting huge pages (as a separate controller) +2. Make per-cgroup scanner reclaim not-shared pages first +3. Teach controller to account for shared-pages +4. Start reclamation in the background when the limit is + not yet hit but the usage is getting closer + +Summary + +Overall, the memory controller has been a stable controller and has been +commented and discussed quite extensively in the community. + +References + +1. Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/ +2. Singh, Balbir. Memory Controller (RSS Control), + http://lwn.net/Articles/222762/ +3. Emelianov, Pavel. Resource controllers based on process cgroups + http://lkml.org/lkml/2007/3/6/198 +4. Emelianov, Pavel. RSS controller based on process cgroups (v2) + http://lkml.org/lkml/2007/4/9/78 +5. Emelianov, Pavel. RSS controller based on process cgroups (v3) + http://lkml.org/lkml/2007/5/30/244 +6. Menage, Paul. Control Groups v10, http://lwn.net/Articles/236032/ +7. Vaidyanathan, Srinivasan, Control Groups: Pagecache accounting and control + subsystem (v3), http://lwn.net/Articles/235534/ +8. Singh, Balbir. RSS controller v2 test results (lmbench), + http://lkml.org/lkml/2007/5/17/232 +9. Singh, Balbir. RSS controller v2 AIM9 results + http://lkml.org/lkml/2007/5/18/1 +10. Singh, Balbir. Memory controller v6 test results, + http://lkml.org/lkml/2007/8/19/36 +11. Singh, Balbir. Memory controller introduction (v6), + http://lkml.org/lkml/2007/8/17/69 +12. Corbet, Jonathan, Controlling memory use in cgroups, + http://lwn.net/Articles/243795/ diff --git a/Documentation/cgroups/resource_counter.txt b/Documentation/cgroups/resource_counter.txt new file mode 100644 index 00000000000..f196ac1d7d2 --- /dev/null +++ b/Documentation/cgroups/resource_counter.txt @@ -0,0 +1,181 @@ + + The Resource Counter + +The resource counter, declared at include/linux/res_counter.h, +is supposed to facilitate the resource management by controllers +by providing common stuff for accounting. + +This "stuff" includes the res_counter structure and routines +to work with it. + + + +1. Crucial parts of the res_counter structure + + a. unsigned long long usage + + The usage value shows the amount of a resource that is consumed + by a group at a given time. The units of measurement should be + determined by the controller that uses this counter. E.g. it can + be bytes, items or any other unit the controller operates on. + + b. unsigned long long max_usage + + The maximal value of the usage over time. + + This value is useful when gathering statistical information about + the particular group, as it shows the actual resource requirements + for a particular group, not just some usage snapshot. + + c. unsigned long long limit + + The maximal allowed amount of resource to consume by the group. In + case the group requests for more resources, so that the usage value + would exceed the limit, the resource allocation is rejected (see + the next section). + + d. unsigned long long failcnt + + The failcnt stands for "failures counter". This is the number of + resource allocation attempts that failed. + + c. spinlock_t lock + + Protects changes of the above values. + + + +2. Basic accounting routines + + a. void res_counter_init(struct res_counter *rc) + + Initializes the resource counter. As usual, should be the first + routine called for a new counter. + + b. int res_counter_charge[_locked] + (struct res_counter *rc, unsigned long val) + + When a resource is about to be allocated it has to be accounted + with the appropriate resource counter (controller should determine + which one to use on its own). This operation is called "charging". + + This is not very important which operation - resource allocation + or charging - is performed first, but + * if the allocation is performed first, this may create a + temporary resource over-usage by the time resource counter is + charged; + * if the charging is performed first, then it should be uncharged + on error path (if the one is called). + + c. void res_counter_uncharge[_locked] + (struct res_counter *rc, unsigned long val) + + When a resource is released (freed) it should be de-accounted + from the resource counter it was accounted to. This is called + "uncharging". + + The _locked routines imply that the res_counter->lock is taken. + + + 2.1 Other accounting routines + + There are more routines that may help you with common needs, like + checking whether the limit is reached or resetting the max_usage + value. They are all declared in include/linux/res_counter.h. + + + +3. Analyzing the resource counter registrations + + a. If the failcnt value constantly grows, this means that the counter's + limit is too tight. Either the group is misbehaving and consumes too + many resources, or the configuration is not suitable for the group + and the limit should be increased. + + b. The max_usage value can be used to quickly tune the group. One may + set the limits to maximal values and either load the container with + a common pattern or leave one for a while. After this the max_usage + value shows the amount of memory the container would require during + its common activity. + + Setting the limit a bit above this value gives a pretty good + configuration that works in most of the cases. + + c. If the max_usage is much less than the limit, but the failcnt value + is growing, then the group tries to allocate a big chunk of resource + at once. + + d. If the max_usage is much less than the limit, but the failcnt value + is 0, then this group is given too high limit, that it does not + require. It is better to lower the limit a bit leaving more resource + for other groups. + + + +4. Communication with the control groups subsystem (cgroups) + +All the resource controllers that are using cgroups and resource counters +should provide files (in the cgroup filesystem) to work with the resource +counter fields. They are recommended to adhere to the following rules: + + a. File names + + Field name File name + --------------------------------------------------- + usage usage_in_ + max_usage max_usage_in_ + limit limit_in_ + failcnt failcnt + lock no file :) + + b. Reading from file should show the corresponding field value in the + appropriate format. + + c. Writing to file + + Field Expected behavior + ---------------------------------- + usage prohibited + max_usage reset to usage + limit set the limit + failcnt reset to zero + + + +5. Usage example + + a. Declare a task group (take a look at cgroups subsystem for this) and + fold a res_counter into it + + struct my_group { + struct res_counter res; + + + } + + b. Put hooks in resource allocation/release paths + + int alloc_something(...) + { + if (res_counter_charge(res_counter_ptr, amount) < 0) + return -ENOMEM; + + + } + + void release_something(...) + { + res_counter_uncharge(res_counter_ptr, amount); + + + } + + In order to keep the usage value self-consistent, both the + "res_counter_ptr" and the "amount" in release_something() should be + the same as they were in the alloc_something() when the releasing + resource was allocated. + + c. Provide the way to read res_counter values and set them (the cgroups + still can help with it). + + c. Compile and run :) diff --git a/Documentation/computone.txt b/Documentation/computone.txt deleted file mode 100644 index 5e2a0c76bfa..00000000000 --- a/Documentation/computone.txt +++ /dev/null @@ -1,522 +0,0 @@ -NOTE: This is an unmaintained driver. It is not guaranteed to work due to -changes made in the tty layer in 2.6. If you wish to take over maintenance of -this driver, contact Michael Warfield . - -Changelog: ----------- -11-01-2001: Original Document - -10-29-2004: Minor misspelling & format fix, update status of driver. - James Nelson - -Computone Intelliport II/Plus Multiport Serial Driver ------------------------------------------------------ - -Release Notes For Linux Kernel 2.2 and higher. -These notes are for the drivers which have already been integrated into the -kernel and have been tested on Linux kernels 2.0, 2.2, 2.3, and 2.4. - -Version: 1.2.14 -Date: 11/01/2001 -Historical Author: Andrew Manison -Primary Author: Doug McNash -Support: support@computone.com -Fixes and Updates: Mike Warfield - -This file assumes that you are using the Computone drivers which are -integrated into the kernel sources. For updating the drivers or installing -drivers into kernels which do not already have Computone drivers, please -refer to the instructions in the README.computone file in the driver patch. - - -1. INTRODUCTION - -This driver supports the entire family of Intelliport II/Plus controllers -with the exception of the MicroChannel controllers. It does not support -products previous to the Intelliport II. - -This driver was developed on the v2.0.x Linux tree and has been tested up -to v2.4.14; it will probably not work with earlier v1.X kernels,. - - -2. QUICK INSTALLATION - -Hardware - If you have an ISA card, find a free interrupt and io port. - List those in use with `cat /proc/interrupts` and - `cat /proc/ioports`. Set the card dip switches to a free - address. You may need to configure your BIOS to reserve an - irq for an ISA card. PCI and EISA parameters are set - automagically. Insert card into computer with the power off - before or after drivers installation. - - Note the hardware address from the Computone ISA cards installed into - the system. These are required for editing ip2.c or editing - /etc/modprobe.conf, or for specification on the modprobe - command line. - - Note that the /etc/modules.conf should be used for older (pre-2.6) - kernels. - -Software - - -Module installation: - -a) Determine free irq/address to use if any (configure BIOS if need be) -b) Run "make config" or "make menuconfig" or "make xconfig" - Select (m) module for CONFIG_COMPUTONE under character - devices. CONFIG_PCI and CONFIG_MODULES also may need to be set. -c) Set address on ISA cards then: - edit /usr/src/linux/drivers/char/ip2.c if needed - or - edit /etc/modprobe.conf if needed (module). - or both to match this setting. -d) Run "make modules" -e) Run "make modules_install" -f) Run "/sbin/depmod -a" -g) install driver using `modprobe ip2 ` (options listed below) -h) run ip2mkdev (either the script below or the binary version) - - -Kernel installation: - -a) Determine free irq/address to use if any (configure BIOS if need be) -b) Run "make config" or "make menuconfig" or "make xconfig" - Select (y) kernel for CONFIG_COMPUTONE under character - devices. CONFIG_PCI may need to be set if you have PCI bus. -c) Set address on ISA cards then: - edit /usr/src/linux/drivers/char/ip2.c - (Optional - may be specified on kernel command line now) -d) Run "make zImage" or whatever target you prefer. -e) mv /usr/src/linux/arch/i386/boot/zImage to /boot. -f) Add new config for this kernel into /etc/lilo.conf, run "lilo" - or copy to a floppy disk and boot from that floppy disk. -g) Reboot using this kernel -h) run ip2mkdev (either the script below or the binary version) - -Kernel command line options: - -When compiling the driver into the kernel, io and irq may be -compiled into the driver by editing ip2.c and setting the values for -io and irq in the appropriate array. An alternative is to specify -a command line parameter to the kernel at boot up. - - ip2=io0,irq0,io1,irq1,io2,irq2,io3,irq3 - -Note that this order is very different from the specifications for the -modload parameters which have separate IRQ and IO specifiers. - -The io port also selects PCI (1) and EISA (2) boards. - - io=0 No board - io=1 PCI board - io=2 EISA board - else ISA board io address - -You only need to specify the boards which are present. - - Examples: - - 2 PCI boards: - - ip2=1,0,1,0 - - 1 ISA board at 0x310 irq 5: - - ip2=0x310,5 - -This can be added to and "append" option in lilo.conf similar to this: - - append="ip2=1,0,1,0" - - -3. INSTALLATION - -Previously, the driver sources were packaged with a set of patch files -to update the character drivers' makefile and configuration file, and other -kernel source files. A build script (ip2build) was included which applies -the patches if needed, and build any utilities needed. -What you receive may be a single patch file in conventional kernel -patch format build script. That form can also be applied by -running patch -p1 < ThePatchFile. Otherwise run ip2build. - -The driver can be installed as a module (recommended) or built into the -kernel. This is selected as for other drivers through the `make config` -command from the root of the Linux source tree. If the driver is built -into the kernel you will need to edit the file ip2.c to match the boards -you are installing. See that file for instructions. If the driver is -installed as a module the configuration can also be specified on the -modprobe command line as follows: - - modprobe ip2 irq=irq1,irq2,irq3,irq4 io=addr1,addr2,addr3,addr4 - -where irqnum is one of the valid Intelliport II interrupts (3,4,5,7,10,11, -12,15) and addr1-4 are the base addresses for up to four controllers. If -the irqs are not specified the driver uses the default in ip2.c (which -selects polled mode). If no base addresses are specified the defaults in -ip2.c are used. If you are autoloading the driver module with kerneld or -kmod the base addresses and interrupt number must also be set in ip2.c -and recompile or just insert and options line in /etc/modprobe.conf or both. -The options line is equivalent to the command line and takes precedence over -what is in ip2.c. - -/etc/modprobe.conf sample: - options ip2 io=1,0x328 irq=1,10 - alias char-major-71 ip2 - alias char-major-72 ip2 - alias char-major-73 ip2 - -The equivalent in ip2.c: - -static int io[IP2_MAX_BOARDS]= { 1, 0x328, 0, 0 }; -static int irq[IP2_MAX_BOARDS] = { 1, 10, -1, -1 }; - -The equivalent for the kernel command line (in lilo.conf): - - append="ip2=1,1,0x328,10" - - -Note: Both io and irq should be updated to reflect YOUR system. An "io" - address of 1 or 2 indicates a PCI or EISA card in the board table. - The PCI or EISA irq will be assigned automatically. - -Specifying an invalid or in-use irq will default the driver into -running in polled mode for that card. If all irq entries are 0 then -all cards will operate in polled mode. - -If you select the driver as part of the kernel run : - - make zlilo (or whatever you do to create a bootable kernel) - -If you selected a module run : - - make modules && make modules_install - -The utility ip2mkdev (see 5 and 7 below) creates all the device nodes -required by the driver. For a device to be created it must be configured -in the driver and the board must be installed. Only devices corresponding -to real IntelliPort II ports are created. With multiple boards and expansion -boxes this will leave gaps in the sequence of device names. ip2mkdev uses -Linux tty naming conventions: ttyF0 - ttyF255 for normal devices, and -cuf0 - cuf255 for callout devices. - - -4. USING THE DRIVERS - -As noted above, the driver implements the ports in accordance with Linux -conventions, and the devices should be interchangeable with the standard -serial devices. (This is a key point for problem reporting: please make -sure that what you are trying do works on the ttySx/cuax ports first; then -tell us what went wrong with the ip2 ports!) - -Higher speeds can be obtained using the setserial utility which remaps -38,400 bps (extb) to 57,600 bps, 115,200 bps, or a custom speed. -Intelliport II installations using the PowerPort expansion module can -use the custom speed setting to select the highest speeds: 153,600 bps, -230,400 bps, 307,200 bps, 460,800bps and 921,600 bps. The base for -custom baud rate configuration is fixed at 921,600 for cards/expansion -modules with ST654's and 115200 for those with Cirrus CD1400's. This -corresponds to the maximum bit rates those chips are capable. -For example if the baud base is 921600 and the baud divisor is 18 then -the custom rate is 921600/18 = 51200 bps. See the setserial man page for -complete details. Of course if stty accepts the higher rates now you can -use that as well as the standard ioctls(). - - -5. ip2mkdev and assorted utilities... - -Several utilities, including the source for a binary ip2mkdev utility are -available under .../drivers/char/ip2. These can be build by changing to -that directory and typing "make" after the kernel has be built. If you do -not wish to compile the binary utilities, the shell script below can be -cut out and run as "ip2mkdev" to create the necessary device files. To -use the ip2mkdev script, you must have procfs enabled and the proc file -system mounted on /proc. - - -6. NOTES - -This is a release version of the driver, but it is impossible to test it -in all configurations of Linux. If there is any anomalous behaviour that -does not match the standard serial port's behaviour please let us know. - - -7. ip2mkdev shell script - -Previously, this script was simply attached here. It is now attached as a -shar archive to make it easier to extract the script from the documentation. -To create the ip2mkdev shell script change to a convenient directory (/tmp -works just fine) and run the following command: - - unshar Documentation/computone.txt - (This file) - -You should now have a file ip2mkdev in your current working directory with -permissions set to execute. Running that script with then create the -necessary devices for the Computone boards, interfaces, and ports which -are present on you system at the time it is run. - - -#!/bin/sh -# This is a shell archive (produced by GNU sharutils 4.2.1). -# To extract the files from this archive, save it to some FILE, remove -# everything before the `!/bin/sh' line above, then type `sh FILE'. -# -# Made on 2001-10-29 10:32 EST by . -# Source directory was `/home2/src/tmp'. -# -# Existing files will *not* be overwritten unless `-c' is specified. -# -# This shar contains: -# length mode name -# ------ ---------- ------------------------------------------ -# 4251 -rwxr-xr-x ip2mkdev -# -save_IFS="${IFS}" -IFS="${IFS}:" -gettext_dir=FAILED -locale_dir=FAILED -first_param="$1" -for dir in $PATH -do - if test "$gettext_dir" = FAILED && test -f $dir/gettext \ - && ($dir/gettext --version >/dev/null 2>&1) - then - set `$dir/gettext --version 2>&1` - if test "$3" = GNU - then - gettext_dir=$dir - fi - fi - if test "$locale_dir" = FAILED && test -f $dir/shar \ - && ($dir/shar --print-text-domain-dir >/dev/null 2>&1) - then - locale_dir=`$dir/shar --print-text-domain-dir` - fi -done -IFS="$save_IFS" -if test "$locale_dir" = FAILED || test "$gettext_dir" = FAILED -then - echo=echo -else - TEXTDOMAINDIR=$locale_dir - export TEXTDOMAINDIR - TEXTDOMAIN=sharutils - export TEXTDOMAIN - echo="$gettext_dir/gettext -s" -fi -if touch -am -t 200112312359.59 $$.touch >/dev/null 2>&1 && test ! -f 200112312359.59 -a -f $$.touch; then - shar_touch='touch -am -t $1$2$3$4$5$6.$7 "$8"' -elif touch -am 123123592001.59 $$.touch >/dev/null 2>&1 && test ! -f 123123592001.59 -a ! -f 123123592001.5 -a -f $$.touch; then - shar_touch='touch -am $3$4$5$6$1$2.$7 "$8"' -elif touch -am 1231235901 $$.touch >/dev/null 2>&1 && test ! -f 1231235901 -a -f $$.touch; then - shar_touch='touch -am $3$4$5$6$2 "$8"' -else - shar_touch=: - echo - $echo 'WARNING: not restoring timestamps. Consider getting and' - $echo "installing GNU \`touch', distributed in GNU File Utilities..." - echo -fi -rm -f 200112312359.59 123123592001.59 123123592001.5 1231235901 $$.touch -# -if mkdir _sh17581; then - $echo 'x -' 'creating lock directory' -else - $echo 'failed to create lock directory' - exit 1 -fi -# ============= ip2mkdev ============== -if test -f 'ip2mkdev' && test "$first_param" != -c; then - $echo 'x -' SKIPPING 'ip2mkdev' '(file already exists)' -else - $echo 'x -' extracting 'ip2mkdev' '(text)' - sed 's/^X//' << 'SHAR_EOF' > 'ip2mkdev' && -#!/bin/sh - -# -# ip2mkdev -# -# Make or remove devices as needed for Computone Intelliport drivers -# -# First rule! If the dev file exists and you need it, don't mess -# with it. That prevents us from screwing up open ttys, ownership -# and permissions on a running system! -# -# This script will NOT remove devices that no longer exist if their -# board or interface box has been removed. If you want to get rid -# of them, you can manually do an "rm -f /dev/ttyF* /dev/cuaf*" -# before running this script. Running this script will then recreate -# all the valid devices. -# -# Michael H. Warfield -# /\/\|=mhw=|\/\/ -# mhw@wittsend.com -# -# Updated 10/29/2000 for version 1.2.13 naming convention -# under devfs. /\/\|=mhw=|\/\/ -# -# Updated 03/09/2000 for devfs support in ip2 drivers. /\/\|=mhw=|\/\/ -# -X -if test -d /dev/ip2 ; then -# This is devfs mode... We don't do anything except create symlinks -# from the real devices to the old names! -X cd /dev -X echo "Creating symbolic links to devfs devices" -X for i in `ls ip2` ; do -X if test ! -L ip2$i ; then -X # Remove it incase it wasn't a symlink (old device) -X rm -f ip2$i -X ln -s ip2/$i ip2$i -X fi -X done -X for i in `( cd tts ; ls F* )` ; do -X if test ! -L tty$i ; then -X # Remove it incase it wasn't a symlink (old device) -X rm -f tty$i -X ln -s tts/$i tty$i -X fi -X done -X for i in `( cd cua ; ls F* )` ; do -X DEVNUMBER=`expr $i : 'F\(.*\)'` -X if test ! -L cuf$DEVNUMBER ; then -X # Remove it incase it wasn't a symlink (old device) -X rm -f cuf$DEVNUMBER -X ln -s cua/$i cuf$DEVNUMBER -X fi -X done -X exit 0 -fi -X -if test ! -f /proc/tty/drivers -then -X echo "\ -Unable to check driver status. -Make sure proc file system is mounted." -X -X exit 255 -fi -X -if test ! -f /proc/tty/driver/ip2 -then -X echo "\ -Unable to locate ip2 proc file. -Attempting to load driver" -X -X if /sbin/insmod ip2 -X then -X if test ! -f /proc/tty/driver/ip2 -X then -X echo "\ -Unable to locate ip2 proc file after loading driver. -Driver initialization failure or driver version error. -" -X exit 255 -X fi -X else -X echo "Unable to load ip2 driver." -X exit 255 -X fi -fi -X -# Ok... So we got the driver loaded and we can locate the procfs files. -# Next we need our major numbers. -X -TTYMAJOR=`sed -e '/^ip2/!d' -e '/\/dev\/tt/!d' -e 's/.*tt[^ ]*[ ]*\([0-9]*\)[ ]*.*/\1/' < /proc/tty/drivers` -CUAMAJOR=`sed -e '/^ip2/!d' -e '/\/dev\/cu/!d' -e 's/.*cu[^ ]*[ ]*\([0-9]*\)[ ]*.*/\1/' < /proc/tty/drivers` -BRDMAJOR=`sed -e '/^Driver: /!d' -e 's/.*IMajor=\([0-9]*\)[ ]*.*/\1/' < /proc/tty/driver/ip2` -X -echo "\ -TTYMAJOR = $TTYMAJOR -CUAMAJOR = $CUAMAJOR -BRDMAJOR = $BRDMAJOR -" -X -# Ok... Now we should know our major numbers, if appropriate... -# Now we need our boards and start the device loops. -X -grep '^Board [0-9]:' /proc/tty/driver/ip2 | while read token number type alltherest -do -X # The test for blank "type" will catch the stats lead-in lines -X # if they exist in the file -X if test "$type" = "vacant" -o "$type" = "Vacant" -o "$type" = "" -X then -X continue -X fi -X -X BOARDNO=`expr "$number" : '\([0-9]\):'` -X PORTS=`expr "$alltherest" : '.*ports=\([0-9]*\)' | tr ',' ' '` -X MINORS=`expr "$alltherest" : '.*minors=\([0-9,]*\)' | tr ',' ' '` -X -X if test "$BOARDNO" = "" -o "$PORTS" = "" -X then -# This may be a bug. We should at least get this much information -X echo "Unable to process board line" -X continue -X fi -X -X if test "$MINORS" = "" -X then -# Silently skip this one. This board seems to have no boxes -X continue -X fi -X -X echo "board $BOARDNO: $type ports = $PORTS; port numbers = $MINORS" -X -X if test "$BRDMAJOR" != "" -X then -X BRDMINOR=`expr $BOARDNO \* 4` -X STSMINOR=`expr $BRDMINOR + 1` -X if test ! -c /dev/ip2ipl$BOARDNO ; then -X mknod /dev/ip2ipl$BOARDNO c $BRDMAJOR $BRDMINOR -X fi -X if test ! -c /dev/ip2stat$BOARDNO ; then -X mknod /dev/ip2stat$BOARDNO c $BRDMAJOR $STSMINOR -X fi -X fi -X -X if test "$TTYMAJOR" != "" -X then -X PORTNO=$BOARDBASE -X -X for PORTNO in $MINORS -X do -X if test ! -c /dev/ttyF$PORTNO ; then -X # We got the hardware but no device - make it -X mknod /dev/ttyF$PORTNO c $TTYMAJOR $PORTNO -X fi -X done -X fi -X -X if test "$CUAMAJOR" != "" -X then -X PORTNO=$BOARDBASE -X -X for PORTNO in $MINORS -X do -X if test ! -c /dev/cuf$PORTNO ; then -X # We got the hardware but no device - make it -X mknod /dev/cuf$PORTNO c $CUAMAJOR $PORTNO -X fi -X done -X fi -done -X -Xexit 0 -SHAR_EOF - (set 20 01 10 29 10 32 01 'ip2mkdev'; eval "$shar_touch") && - chmod 0755 'ip2mkdev' || - $echo 'restore of' 'ip2mkdev' 'failed' - if ( md5sum --help 2>&1 | grep 'sage: md5sum \[' ) >/dev/null 2>&1 \ - && ( md5sum --version 2>&1 | grep -v 'textutils 1.12' ) >/dev/null; then - md5sum -c << SHAR_EOF >/dev/null 2>&1 \ - || $echo 'ip2mkdev:' 'MD5 check failed' -cb5717134509f38bad9fde6b1f79b4a4 ip2mkdev -SHAR_EOF - else - shar_count="`LC_ALL= LC_CTYPE= LANG= wc -c < 'ip2mkdev'`" - test 4251 -eq "$shar_count" || - $echo 'ip2mkdev:' 'original size' '4251,' 'current size' "$shar_count!" - fi -fi -rm -fr _sh17581 -exit 0 diff --git a/Documentation/controllers/devices.txt b/Documentation/controllers/devices.txt deleted file mode 100644 index 7cc6e6a6067..00000000000 --- a/Documentation/controllers/devices.txt +++ /dev/null @@ -1,52 +0,0 @@ -Device Whitelist Controller - -1. Description: - -Implement a cgroup to track and enforce open and mknod restrictions -on device files. A device cgroup associates a device access -whitelist with each cgroup. A whitelist entry has 4 fields. -'type' is a (all), c (char), or b (block). 'all' means it applies -to all types and all major and minor numbers. Major and minor are -either an integer or * for all. Access is a composition of r -(read), w (write), and m (mknod). - -The root device cgroup starts with rwm to 'all'. A child device -cgroup gets a copy of the parent. Administrators can then remove -devices from the whitelist or add new entries. A child cgroup can -never receive a device access which is denied by its parent. However -when a device access is removed from a parent it will not also be -removed from the child(ren). - -2. User Interface - -An entry is added using devices.allow, and removed using -devices.deny. For instance - - echo 'c 1:3 mr' > /cgroups/1/devices.allow - -allows cgroup 1 to read and mknod the device usually known as -/dev/null. Doing - - echo a > /cgroups/1/devices.deny - -will remove the default 'a *:* rwm' entry. Doing - - echo a > /cgroups/1/devices.allow - -will add the 'a *:* rwm' entry to the whitelist. - -3. Security - -Any task can move itself between cgroups. This clearly won't -suffice, but we can decide the best way to adequately restrict -movement as people get some experience with this. We may just want -to require CAP_SYS_ADMIN, which at least is a separate bit from -CAP_MKNOD. We may want to just refuse moving to a cgroup which -isn't a descendent of the current one. Or we may want to use -CAP_MAC_ADMIN, since we really are trying to lock down root. - -CAP_SYS_ADMIN is needed to modify the whitelist or move another -task to a new cgroup. (Again we'll probably want to change that). - -A cgroup may not be granted more permissions than the cgroup's -parent has. diff --git a/Documentation/controllers/memory.txt b/Documentation/controllers/memory.txt deleted file mode 100644 index 1c07547d3f8..00000000000 --- a/Documentation/controllers/memory.txt +++ /dev/null @@ -1,284 +0,0 @@ -Memory Resource Controller - -NOTE: The Memory Resource Controller has been generically been referred -to as the memory controller in this document. Do not confuse memory controller -used here with the memory controller that is used in hardware. - -Salient features - -a. Enable control of both RSS (mapped) and Page Cache (unmapped) pages -b. The infrastructure allows easy addition of other types of memory to control -c. Provides *zero overhead* for non memory controller users -d. Provides a double LRU: global memory pressure causes reclaim from the - global LRU; a cgroup on hitting a limit, reclaims from the per - cgroup LRU - -NOTE: Swap Cache (unmapped) is not accounted now. - -Benefits and Purpose of the memory controller - -The memory controller isolates the memory behaviour of a group of tasks -from the rest of the system. The article on LWN [12] mentions some probable -uses of the memory controller. The memory controller can be used to - -a. Isolate an application or a group of applications - Memory hungry applications can be isolated and limited to a smaller - amount of memory. -b. Create a cgroup with limited amount of memory, this can be used - as a good alternative to booting with mem=XXXX. -c. Virtualization solutions can control the amount of memory they want - to assign to a virtual machine instance. -d. A CD/DVD burner could control the amount of memory used by the - rest of the system to ensure that burning does not fail due to lack - of available memory. -e. There are several other use cases, find one or use the controller just - for fun (to learn and hack on the VM subsystem). - -1. History - -The memory controller has a long history. A request for comments for the memory -controller was posted by Balbir Singh [1]. At the time the RFC was posted -there were several implementations for memory control. The goal of the -RFC was to build consensus and agreement for the minimal features required -for memory control. The first RSS controller was posted by Balbir Singh[2] -in Feb 2007. Pavel Emelianov [3][4][5] has since posted three versions of the -RSS controller. At OLS, at the resource management BoF, everyone suggested -that we handle both page cache and RSS together. Another request was raised -to allow user space handling of OOM. The current memory controller is -at version 6; it combines both mapped (RSS) and unmapped Page -Cache Control [11]. - -2. Memory Control - -Memory is a unique resource in the sense that it is present in a limited -amount. If a task requires a lot of CPU processing, the task can spread -its processing over a period of hours, days, months or years, but with -memory, the same physical memory needs to be reused to accomplish the task. - -The memory controller implementation has been divided into phases. These -are: - -1. Memory controller -2. mlock(2) controller -3. Kernel user memory accounting and slab control -4. user mappings length controller - -The memory controller is the first controller developed. - -2.1. Design - -The core of the design is a counter called the res_counter. The res_counter -tracks the current memory usage and limit of the group of processes associated -with the controller. Each cgroup has a memory controller specific data -structure (mem_cgroup) associated with it. - -2.2. Accounting - - +--------------------+ - | mem_cgroup | - | (res_counter) | - +--------------------+ - / ^ \ - / | \ - +---------------+ | +---------------+ - | mm_struct | |.... | mm_struct | - | | | | | - +---------------+ | +---------------+ - | - + --------------+ - | - +---------------+ +------+--------+ - | page +----------> page_cgroup| - | | | | - +---------------+ +---------------+ - - (Figure 1: Hierarchy of Accounting) - - -Figure 1 shows the important aspects of the controller - -1. Accounting happens per cgroup -2. Each mm_struct knows about which cgroup it belongs to -3. Each page has a pointer to the page_cgroup, which in turn knows the - cgroup it belongs to - -The accounting is done as follows: mem_cgroup_charge() is invoked to setup -the necessary data structures and check if the cgroup that is being charged -is over its limit. If it is then reclaim is invoked on the cgroup. -More details can be found in the reclaim section of this document. -If everything goes well, a page meta-data-structure called page_cgroup is -allocated and associated with the page. This routine also adds the page to -the per cgroup LRU. - -2.2.1 Accounting details - -All mapped anon pages (RSS) and cache pages (Page Cache) are accounted. -(some pages which never be reclaimable and will not be on global LRU - are not accounted. we just accounts pages under usual vm management.) - -RSS pages are accounted at page_fault unless they've already been accounted -for earlier. A file page will be accounted for as Page Cache when it's -inserted into inode (radix-tree). While it's mapped into the page tables of -processes, duplicate accounting is carefully avoided. - -A RSS page is unaccounted when it's fully unmapped. A PageCache page is -unaccounted when it's removed from radix-tree. - -At page migration, accounting information is kept. - -Note: we just account pages-on-lru because our purpose is to control amount -of used pages. not-on-lru pages are tend to be out-of-control from vm view. - -2.3 Shared Page Accounting - -Shared pages are accounted on the basis of the first touch approach. The -cgroup that first touches a page is accounted for the page. The principle -behind this approach is that a cgroup that aggressively uses a shared -page will eventually get charged for it (once it is uncharged from -the cgroup that brought it in -- this will happen on memory pressure). - -2.4 Reclaim - -Each cgroup maintains a per cgroup LRU that consists of an active -and inactive list. When a cgroup goes over its limit, we first try -to reclaim memory from the cgroup so as to make space for the new -pages that the cgroup has touched. If the reclaim is unsuccessful, -an OOM routine is invoked to select and kill the bulkiest task in the -cgroup. - -The reclaim algorithm has not been modified for cgroups, except that -pages that are selected for reclaiming come from the per cgroup LRU -list. - -2. Locking - -The memory controller uses the following hierarchy - -1. zone->lru_lock is used for selecting pages to be isolated -2. mem->per_zone->lru_lock protects the per cgroup LRU (per zone) -3. lock_page_cgroup() is used to protect page->page_cgroup - -3. User Interface - -0. Configuration - -a. Enable CONFIG_CGROUPS -b. Enable CONFIG_RESOURCE_COUNTERS -c. Enable CONFIG_CGROUP_MEM_RES_CTLR - -1. Prepare the cgroups -# mkdir -p /cgroups -# mount -t cgroup none /cgroups -o memory - -2. Make the new group and move bash into it -# mkdir /cgroups/0 -# echo $$ > /cgroups/0/tasks - -Since now we're in the 0 cgroup, -We can alter the memory limit: -# echo 4M > /cgroups/0/memory.limit_in_bytes - -NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, -mega or gigabytes. - -# cat /cgroups/0/memory.limit_in_bytes -4194304 - -NOTE: The interface has now changed to display the usage in bytes -instead of pages - -We can check the usage: -# cat /cgroups/0/memory.usage_in_bytes -1216512 - -A successful write to this file does not guarantee a successful set of -this limit to the value written into the file. This can be due to a -number of factors, such as rounding up to page boundaries or the total -availability of memory on the system. The user is required to re-read -this file after a write to guarantee the value committed by the kernel. - -# echo 1 > memory.limit_in_bytes -# cat memory.limit_in_bytes -4096 - -The memory.failcnt field gives the number of times that the cgroup limit was -exceeded. - -The memory.stat file gives accounting information. Now, the number of -caches, RSS and Active pages/Inactive pages are shown. - -The memory.force_empty gives an interface to drop *all* charges by force. - -# echo 1 > memory.force_empty - -will drop all charges in cgroup. Currently, this is maintained for test. - -4. Testing - -Balbir posted lmbench, AIM9, LTP and vmmstress results [10] and [11]. -Apart from that v6 has been tested with several applications and regular -daily use. The controller has also been tested on the PPC64, x86_64 and -UML platforms. - -4.1 Troubleshooting - -Sometimes a user might find that the application under a cgroup is -terminated. There are several causes for this: - -1. The cgroup limit is too low (just too low to do anything useful) -2. The user is using anonymous memory and swap is turned off or too low - -A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of -some of the pages cached in the cgroup (page cache pages). - -4.2 Task migration - -When a task migrates from one cgroup to another, it's charge is not -carried forward. The pages allocated from the original cgroup still -remain charged to it, the charge is dropped when the page is freed or -reclaimed. - -4.3 Removing a cgroup - -A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a -cgroup might have some charge associated with it, even though all -tasks have migrated away from it. Such charges are automatically dropped at -rmdir() if there are no tasks. - -5. TODO - -1. Add support for accounting huge pages (as a separate controller) -2. Make per-cgroup scanner reclaim not-shared pages first -3. Teach controller to account for shared-pages -4. Start reclamation in the background when the limit is - not yet hit but the usage is getting closer - -Summary - -Overall, the memory controller has been a stable controller and has been -commented and discussed quite extensively in the community. - -References - -1. Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/ -2. Singh, Balbir. Memory Controller (RSS Control), - http://lwn.net/Articles/222762/ -3. Emelianov, Pavel. Resource controllers based on process cgroups - http://lkml.org/lkml/2007/3/6/198 -4. Emelianov, Pavel. RSS controller based on process cgroups (v2) - http://lkml.org/lkml/2007/4/9/78 -5. Emelianov, Pavel. RSS controller based on process cgroups (v3) - http://lkml.org/lkml/2007/5/30/244 -6. Menage, Paul. Control Groups v10, http://lwn.net/Articles/236032/ -7. Vaidyanathan, Srinivasan, Control Groups: Pagecache accounting and control - subsystem (v3), http://lwn.net/Articles/235534/ -8. Singh, Balbir. RSS controller v2 test results (lmbench), - http://lkml.org/lkml/2007/5/17/232 -9. Singh, Balbir. RSS controller v2 AIM9 results - http://lkml.org/lkml/2007/5/18/1 -10. Singh, Balbir. Memory controller v6 test results, - http://lkml.org/lkml/2007/8/19/36 -11. Singh, Balbir. Memory controller introduction (v6), - http://lkml.org/lkml/2007/8/17/69 -12. Corbet, Jonathan, Controlling memory use in cgroups, - http://lwn.net/Articles/243795/ diff --git a/Documentation/controllers/resource_counter.txt b/Documentation/controllers/resource_counter.txt deleted file mode 100644 index f196ac1d7d2..00000000000 --- a/Documentation/controllers/resource_counter.txt +++ /dev/null @@ -1,181 +0,0 @@ - - The Resource Counter - -The resource counter, declared at include/linux/res_counter.h, -is supposed to facilitate the resource management by controllers -by providing common stuff for accounting. - -This "stuff" includes the res_counter structure and routines -to work with it. - - - -1. Crucial parts of the res_counter structure - - a. unsigned long long usage - - The usage value shows the amount of a resource that is consumed - by a group at a given time. The units of measurement should be - determined by the controller that uses this counter. E.g. it can - be bytes, items or any other unit the controller operates on. - - b. unsigned long long max_usage - - The maximal value of the usage over time. - - This value is useful when gathering statistical information about - the particular group, as it shows the actual resource requirements - for a particular group, not just some usage snapshot. - - c. unsigned long long limit - - The maximal allowed amount of resource to consume by the group. In - case the group requests for more resources, so that the usage value - would exceed the limit, the resource allocation is rejected (see - the next section). - - d. unsigned long long failcnt - - The failcnt stands for "failures counter". This is the number of - resource allocation attempts that failed. - - c. spinlock_t lock - - Protects changes of the above values. - - - -2. Basic accounting routines - - a. void res_counter_init(struct res_counter *rc) - - Initializes the resource counter. As usual, should be the first - routine called for a new counter. - - b. int res_counter_charge[_locked] - (struct res_counter *rc, unsigned long val) - - When a resource is about to be allocated it has to be accounted - with the appropriate resource counter (controller should determine - which one to use on its own). This operation is called "charging". - - This is not very important which operation - resource allocation - or charging - is performed first, but - * if the allocation is performed first, this may create a - temporary resource over-usage by the time resource counter is - charged; - * if the charging is performed first, then it should be uncharged - on error path (if the one is called). - - c. void res_counter_uncharge[_locked] - (struct res_counter *rc, unsigned long val) - - When a resource is released (freed) it should be de-accounted - from the resource counter it was accounted to. This is called - "uncharging". - - The _locked routines imply that the res_counter->lock is taken. - - - 2.1 Other accounting routines - - There are more routines that may help you with common needs, like - checking whether the limit is reached or resetting the max_usage - value. They are all declared in include/linux/res_counter.h. - - - -3. Analyzing the resource counter registrations - - a. If the failcnt value constantly grows, this means that the counter's - limit is too tight. Either the group is misbehaving and consumes too - many resources, or the configuration is not suitable for the group - and the limit should be increased. - - b. The max_usage value can be used to quickly tune the group. One may - set the limits to maximal values and either load the container with - a common pattern or leave one for a while. After this the max_usage - value shows the amount of memory the container would require during - its common activity. - - Setting the limit a bit above this value gives a pretty good - configuration that works in most of the cases. - - c. If the max_usage is much less than the limit, but the failcnt value - is growing, then the group tries to allocate a big chunk of resource - at once. - - d. If the max_usage is much less than the limit, but the failcnt value - is 0, then this group is given too high limit, that it does not - require. It is better to lower the limit a bit leaving more resource - for other groups. - - - -4. Communication with the control groups subsystem (cgroups) - -All the resource controllers that are using cgroups and resource counters -should provide files (in the cgroup filesystem) to work with the resource -counter fields. They are recommended to adhere to the following rules: - - a. File names - - Field name File name - --------------------------------------------------- - usage usage_in_ - max_usage max_usage_in_ - limit limit_in_ - failcnt failcnt - lock no file :) - - b. Reading from file should show the corresponding field value in the - appropriate format. - - c. Writing to file - - Field Expected behavior - ---------------------------------- - usage prohibited - max_usage reset to usage - limit set the limit - failcnt reset to zero - - - -5. Usage example - - a. Declare a task group (take a look at cgroups subsystem for this) and - fold a res_counter into it - - struct my_group { - struct res_counter res; - - - } - - b. Put hooks in resource allocation/release paths - - int alloc_something(...) - { - if (res_counter_charge(res_counter_ptr, amount) < 0) - return -ENOMEM; - - - } - - void release_something(...) - { - res_counter_uncharge(res_counter_ptr, amount); - - - } - - In order to keep the usage value self-consistent, both the - "res_counter_ptr" and the "amount" in release_something() should be - the same as they were in the alloc_something() when the releasing - resource was allocated. - - c. Provide the way to read res_counter values and set them (the cgroups - still can help with it). - - c. Compile and run :) diff --git a/Documentation/cpqarray.txt b/Documentation/cpqarray.txt deleted file mode 100644 index c7154e20ef5..00000000000 --- a/Documentation/cpqarray.txt +++ /dev/null @@ -1,93 +0,0 @@ -This driver is for Compaq's SMART2 Intelligent Disk Array Controllers. - -Supported Cards: ----------------- - -This driver is known to work with the following cards: - - * SMART (EISA) - * SMART-2/E (EISA) - * SMART-2/P - * SMART-2DH - * SMART-2SL - * SMART-221 - * SMART-3100ES - * SMART-3200 - * Integrated Smart Array Controller - * SA 4200 - * SA 4250ES - * SA 431 - * RAID LC2 Controller - -It should also work with some really old Disk array adapters, but I am -unable to test against these cards: - - * IDA - * IDA-2 - * IAES - - -EISA Controllers: ------------------ - -If you want to use an EISA controller you'll have to supply some -modprobe/lilo parameters. If the driver is compiled into the kernel, must -give it the controller's IO port address at boot time (it is not -necessary to specify the IRQ). For example, if you had two SMART-2/E -controllers, in EISA slots 1 and 2 you'd give it a boot argument like -this: - - smart2=0x1000,0x2000 - -If you were loading the driver as a module, you'd give load it like this: - - modprobe cpqarray eisa=0x1000,0x2000 - -You can use EISA and PCI adapters at the same time. - - -Device Naming: --------------- - -You need some entries in /dev for the ida device. MAKEDEV in the /dev -directory can make device nodes for you automatically. The device setup is -as follows: - -Major numbers: - 72 ida0 - 73 ida1 - 74 ida2 - 75 ida3 - 76 ida4 - 77 ida5 - 78 ida6 - 79 ida7 - -Minor numbers: - b7 b6 b5 b4 b3 b2 b1 b0 - |----+----| |----+----| - | | - | +-------- Partition ID (0=wholedev, 1-15 partition) - | - +-------------------- Logical Volume number - -The device naming scheme is: -/dev/ida/c0d0 Controller 0, disk 0, whole device -/dev/ida/c0d0p1 Controller 0, disk 0, partition 1 -/dev/ida/c0d0p2 Controller 0, disk 0, partition 2 -/dev/ida/c0d0p3 Controller 0, disk 0, partition 3 - -/dev/ida/c1d1 Controller 1, disk 1, whole device -/dev/ida/c1d1p1 Controller 1, disk 1, partition 1 -/dev/ida/c1d1p2 Controller 1, disk 1, partition 2 -/dev/ida/c1d1p3 Controller 1, disk 1, partition 3 - - -Changelog: -========== - -10-28-2004 : General cleanup, syntax fixes for in-kernel driver version. - James Nelson - - -1999 : Original Document diff --git a/Documentation/cpu-freq/user-guide.txt b/Documentation/cpu-freq/user-guide.txt index 6c442d8426b..e3443ddcfb8 100644 --- a/Documentation/cpu-freq/user-guide.txt +++ b/Documentation/cpu-freq/user-guide.txt @@ -23,6 +23,7 @@ Contents: 1.3 sparc64 1.4 ppc 1.5 SuperH +1.6 Blackfin 2. "Policy" / "Governor"? 2.1 Policy @@ -92,10 +93,19 @@ Several "PowerBook" and "iBook2" notebooks are supported. 1.5 SuperH ---------- -The following SuperH processors are supported by cpufreq: +All SuperH processors supporting rate rounding through the clock +framework are supported by cpufreq. -SH-3 -SH-4 +1.6 Blackfin +------------ + +The following Blackfin processors are supported by cpufreq: + +BF522, BF523, BF524, BF525, BF526, BF527, Rev 0.1 or higher +BF531, BF532, BF533, Rev 0.3 or higher +BF534, BF536, BF537, Rev 0.2 or higher +BF561, Rev 0.3 or higher +BF542, BF544, BF547, BF548, BF549, Rev 0.1 or higher 2. "Policy" / "Governor" ? diff --git a/Documentation/cpu-hotplug.txt b/Documentation/cpu-hotplug.txt index 94bbc27ddd4..9d620c153b0 100644 --- a/Documentation/cpu-hotplug.txt +++ b/Documentation/cpu-hotplug.txt @@ -50,16 +50,17 @@ additional_cpus=n (*) Use this to limit hotpluggable cpus. This option sets cpu_possible_map = cpu_present_map + additional_cpus (*) Option valid only for following architectures -- x86_64, ia64 +- ia64 -ia64 and x86_64 use the number of disabled local apics in ACPI tables MADT -to determine the number of potentially hot-pluggable cpus. The implementation -should only rely on this to count the # of cpus, but *MUST* not rely on the -apicid values in those tables for disabled apics. In the event BIOS doesn't -mark such hot-pluggable cpus as disabled entries, one could use this -parameter "additional_cpus=x" to represent those cpus in the cpu_possible_map. +ia64 uses the number of disabled local apics in ACPI tables MADT to +determine the number of potentially hot-pluggable cpus. The implementation +should only rely on this to count the # of cpus, but *MUST* not rely +on the apicid values in those tables for disabled apics. In the event +BIOS doesn't mark such hot-pluggable cpus as disabled entries, one could +use this parameter "additional_cpus=x" to represent those cpus in the +cpu_possible_map. -possible_cpus=n [s390 only] use this to set hotpluggable cpus. +possible_cpus=n [s390,x86_64] use this to set hotpluggable cpus. This option sets possible_cpus bits in cpu_possible_map. Thus keeping the numbers of bits set constant even if the machine gets rebooted. diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt deleted file mode 100644 index 5c86c258c79..00000000000 --- a/Documentation/cpusets.txt +++ /dev/null @@ -1,808 +0,0 @@ - CPUSETS - ------- - -Copyright (C) 2004 BULL SA. -Written by Simon.Derr@bull.net - -Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. -Modified by Paul Jackson -Modified by Christoph Lameter -Modified by Paul Menage -Modified by Hidetoshi Seto - -CONTENTS: -========= - -1. Cpusets - 1.1 What are cpusets ? - 1.2 Why are cpusets needed ? - 1.3 How are cpusets implemented ? - 1.4 What are exclusive cpusets ? - 1.5 What is memory_pressure ? - 1.6 What is memory spread ? - 1.7 What is sched_load_balance ? - 1.8 What is sched_relax_domain_level ? - 1.9 How do I use cpusets ? -2. Usage Examples and Syntax - 2.1 Basic Usage - 2.2 Adding/removing cpus - 2.3 Setting flags - 2.4 Attaching processes -3. Questions -4. Contact - -1. Cpusets -========== - -1.1 What are cpusets ? ----------------------- - -Cpusets provide a mechanism for assigning a set of CPUs and Memory -Nodes to a set of tasks. In this document "Memory Node" refers to -an on-line node that contains memory. - -Cpusets constrain the CPU and Memory placement of tasks to only -the resources within a tasks current cpuset. They form a nested -hierarchy visible in a virtual file system. These are the essential -hooks, beyond what is already present, required to manage dynamic -job placement on large systems. - -Cpusets use the generic cgroup subsystem described in -Documentation/cgroups/cgroups.txt. - -Requests by a task, using the sched_setaffinity(2) system call to -include CPUs in its CPU affinity mask, and using the mbind(2) and -set_mempolicy(2) system calls to include Memory Nodes in its memory -policy, are both filtered through that tasks cpuset, filtering out any -CPUs or Memory Nodes not in that cpuset. The scheduler will not -schedule a task on a CPU that is not allowed in its cpus_allowed -vector, and the kernel page allocator will not allocate a page on a -node that is not allowed in the requesting tasks mems_allowed vector. - -User level code may create and destroy cpusets by name in the cgroup -virtual file system, manage the attributes and permissions of these -cpusets and which CPUs and Memory Nodes are assigned to each cpuset, -specify and query to which cpuset a task is assigned, and list the -task pids assigned to a cpuset. - - -1.2 Why are cpusets needed ? ----------------------------- - -The management of large computer systems, with many processors (CPUs), -complex memory cache hierarchies and multiple Memory Nodes having -non-uniform access times (NUMA) presents additional challenges for -the efficient scheduling and memory placement of processes. - -Frequently more modest sized systems can be operated with adequate -efficiency just by letting the operating system automatically share -the available CPU and Memory resources amongst the requesting tasks. - -But larger systems, which benefit more from careful processor and -memory placement to reduce memory access times and contention, -and which typically represent a larger investment for the customer, -can benefit from explicitly placing jobs on properly sized subsets of -the system. - -This can be especially valuable on: - - * Web Servers running multiple instances of the same web application, - * Servers running different applications (for instance, a web server - and a database), or - * NUMA systems running large HPC applications with demanding - performance characteristics. - -These subsets, or "soft partitions" must be able to be dynamically -adjusted, as the job mix changes, without impacting other concurrently -executing jobs. The location of the running jobs pages may also be moved -when the memory locations are changed. - -The kernel cpuset patch provides the minimum essential kernel -mechanisms required to efficiently implement such subsets. It -leverages existing CPU and Memory Placement facilities in the Linux -kernel to avoid any additional impact on the critical scheduler or -memory allocator code. - - -1.3 How are cpusets implemented ? ---------------------------------- - -Cpusets provide a Linux kernel mechanism to constrain which CPUs and -Memory Nodes are used by a process or set of processes. - -The Linux kernel already has a pair of mechanisms to specify on which -CPUs a task may be scheduled (sched_setaffinity) and on which Memory -Nodes it may obtain memory (mbind, set_mempolicy). - -Cpusets extends these two mechanisms as follows: - - - Cpusets are sets of allowed CPUs and Memory Nodes, known to the - kernel. - - Each task in the system is attached to a cpuset, via a pointer - in the task structure to a reference counted cgroup structure. - - Calls to sched_setaffinity are filtered to just those CPUs - allowed in that tasks cpuset. - - Calls to mbind and set_mempolicy are filtered to just - those Memory Nodes allowed in that tasks cpuset. - - The root cpuset contains all the systems CPUs and Memory - Nodes. - - For any cpuset, one can define child cpusets containing a subset - of the parents CPU and Memory Node resources. - - The hierarchy of cpusets can be mounted at /dev/cpuset, for - browsing and manipulation from user space. - - A cpuset may be marked exclusive, which ensures that no other - cpuset (except direct ancestors and descendents) may contain - any overlapping CPUs or Memory Nodes. - - You can list all the tasks (by pid) attached to any cpuset. - -The implementation of cpusets requires a few, simple hooks -into the rest of the kernel, none in performance critical paths: - - - in init/main.c, to initialize the root cpuset at system boot. - - in fork and exit, to attach and detach a task from its cpuset. - - in sched_setaffinity, to mask the requested CPUs by what's - allowed in that tasks cpuset. - - in sched.c migrate_all_tasks(), to keep migrating tasks within - the CPUs allowed by their cpuset, if possible. - - in the mbind and set_mempolicy system calls, to mask the requested - Memory Nodes by what's allowed in that tasks cpuset. - - in page_alloc.c, to restrict memory to allowed nodes. - - in vmscan.c, to restrict page recovery to the current cpuset. - -You should mount the "cgroup" filesystem type in order to enable -browsing and modifying the cpusets presently known to the kernel. No -new system calls are added for cpusets - all support for querying and -modifying cpusets is via this cpuset file system. - -The /proc//status file for each task has four added lines, -displaying the tasks cpus_allowed (on which CPUs it may be scheduled) -and mems_allowed (on which Memory Nodes it may obtain memory), -in the two formats seen in the following example: - - Cpus_allowed: ffffffff,ffffffff,ffffffff,ffffffff - Cpus_allowed_list: 0-127 - Mems_allowed: ffffffff,ffffffff - Mems_allowed_list: 0-63 - -Each cpuset is represented by a directory in the cgroup file system -containing (on top of the standard cgroup files) the following -files describing that cpuset: - - - cpus: list of CPUs in that cpuset - - mems: list of Memory Nodes in that cpuset - - memory_migrate flag: if set, move pages to cpusets nodes - - cpu_exclusive flag: is cpu placement exclusive? - - mem_exclusive flag: is memory placement exclusive? - - mem_hardwall flag: is memory allocation hardwalled - - memory_pressure: measure of how much paging pressure in cpuset - -In addition, the root cpuset only has the following file: - - memory_pressure_enabled flag: compute memory_pressure? - -New cpusets are created using the mkdir system call or shell -command. The properties of a cpuset, such as its flags, allowed -CPUs and Memory Nodes, and attached tasks, are modified by writing -to the appropriate file in that cpusets directory, as listed above. - -The named hierarchical structure of nested cpusets allows partitioning -a large system into nested, dynamically changeable, "soft-partitions". - -The attachment of each task, automatically inherited at fork by any -children of that task, to a cpuset allows organizing the work load -on a system into related sets of tasks such that each set is constrained -to using the CPUs and Memory Nodes of a particular cpuset. A task -may be re-attached to any other cpuset, if allowed by the permissions -on the necessary cpuset file system directories. - -Such management of a system "in the large" integrates smoothly with -the detailed placement done on individual tasks and memory regions -using the sched_setaffinity, mbind and set_mempolicy system calls. - -The following rules apply to each cpuset: - - - Its CPUs and Memory Nodes must be a subset of its parents. - - It can't be marked exclusive unless its parent is. - - If its cpu or memory is exclusive, they may not overlap any sibling. - -These rules, and the natural hierarchy of cpusets, enable efficient -enforcement of the exclusive guarantee, without having to scan all -cpusets every time any of them change to ensure nothing overlaps a -exclusive cpuset. Also, the use of a Linux virtual file system (vfs) -to represent the cpuset hierarchy provides for a familiar permission -and name space for cpusets, with a minimum of additional kernel code. - -The cpus and mems files in the root (top_cpuset) cpuset are -read-only. The cpus file automatically tracks the value of -cpu_online_map using a CPU hotplug notifier, and the mems file -automatically tracks the value of node_states[N_HIGH_MEMORY]--i.e., -nodes with memory--using the cpuset_track_online_nodes() hook. - - -1.4 What are exclusive cpusets ? --------------------------------- - -If a cpuset is cpu or mem exclusive, no other cpuset, other than -a direct ancestor or descendent, may share any of the same CPUs or -Memory Nodes. - -A cpuset that is mem_exclusive *or* mem_hardwall is "hardwalled", -i.e. it restricts kernel allocations for page, buffer and other data -commonly shared by the kernel across multiple users. All cpusets, -whether hardwalled or not, restrict allocations of memory for user -space. This enables configuring a system so that several independent -jobs can share common kernel data, such as file system pages, while -isolating each job's user allocation in its own cpuset. To do this, -construct a large mem_exclusive cpuset to hold all the jobs, and -construct child, non-mem_exclusive cpusets for each individual job. -Only a small amount of typical kernel memory, such as requests from -interrupt handlers, is allowed to be taken outside even a -mem_exclusive cpuset. - - -1.5 What is memory_pressure ? ------------------------------ -The memory_pressure of a cpuset provides a simple per-cpuset metric -of the rate that the tasks in a cpuset are attempting to free up in -use memory on the nodes of the cpuset to satisfy additional memory -requests. - -This enables batch managers monitoring jobs running in dedicated -cpusets to efficiently detect what level of memory pressure that job -is causing. - -This is useful both on tightly managed systems running a wide mix of -submitted jobs, which may choose to terminate or re-prioritize jobs that -are trying to use more memory than allowed on the nodes assigned them, -and with tightly coupled, long running, massively parallel scientific -computing jobs that will dramatically fail to meet required performance -goals if they start to use more memory than allowed to them. - -This mechanism provides a very economical way for the batch manager -to monitor a cpuset for signs of memory pressure. It's up to the -batch manager or other user code to decide what to do about it and -take action. - -==> Unless this feature is enabled by writing "1" to the special file - /dev/cpuset/memory_pressure_enabled, the hook in the rebalance - code of __alloc_pages() for this metric reduces to simply noticing - that the cpuset_memory_pressure_enabled flag is zero. So only - systems that enable this feature will compute the metric. - -Why a per-cpuset, running average: - - Because this meter is per-cpuset, rather than per-task or mm, - the system load imposed by a batch scheduler monitoring this - metric is sharply reduced on large systems, because a scan of - the tasklist can be avoided on each set of queries. - - Because this meter is a running average, instead of an accumulating - counter, a batch scheduler can detect memory pressure with a - single read, instead of having to read and accumulate results - for a period of time. - - Because this meter is per-cpuset rather than per-task or mm, - the batch scheduler can obtain the key information, memory - pressure in a cpuset, with a single read, rather than having to - query and accumulate results over all the (dynamically changing) - set of tasks in the cpuset. - -A per-cpuset simple digital filter (requires a spinlock and 3 words -of data per-cpuset) is kept, and updated by any task attached to that -cpuset, if it enters the synchronous (direct) page reclaim code. - -A per-cpuset file provides an integer number representing the recent -(half-life of 10 seconds) rate of direct page reclaims caused by -the tasks in the cpuset, in units of reclaims attempted per second, -times 1000. - - -1.6 What is memory spread ? ---------------------------- -There are two boolean flag files per cpuset that control where the -kernel allocates pages for the file system buffers and related in -kernel data structures. They are called 'memory_spread_page' and -'memory_spread_slab'. - -If the per-cpuset boolean flag file 'memory_spread_page' is set, then -the kernel will spread the file system buffers (page cache) evenly -over all the nodes that the faulting task is allowed to use, instead -of preferring to put those pages on the node where the task is running. - -If the per-cpuset boolean flag file 'memory_spread_slab' is set, -then the kernel will spread some file system related slab caches, -such as for inodes and dentries evenly over all the nodes that the -faulting task is allowed to use, instead of preferring to put those -pages on the node where the task is running. - -The setting of these flags does not affect anonymous data segment or -stack segment pages of a task. - -By default, both kinds of memory spreading are off, and memory -pages are allocated on the node local to where the task is running, -except perhaps as modified by the tasks NUMA mempolicy or cpuset -configuration, so long as sufficient free memory pages are available. - -When new cpusets are created, they inherit the memory spread settings -of their parent. - -Setting memory spreading causes allocations for the affected page -or slab caches to ignore the tasks NUMA mempolicy and be spread -instead. Tasks using mbind() or set_mempolicy() calls to set NUMA -mempolicies will not notice any change in these calls as a result of -their containing tasks memory spread settings. If memory spreading -is turned off, then the currently specified NUMA mempolicy once again -applies to memory page allocations. - -Both 'memory_spread_page' and 'memory_spread_slab' are boolean flag -files. By default they contain "0", meaning that the feature is off -for that cpuset. If a "1" is written to that file, then that turns -the named feature on. - -The implementation is simple. - -Setting the flag 'memory_spread_page' turns on a per-process flag -PF_SPREAD_PAGE for each task that is in that cpuset or subsequently -joins that cpuset. The page allocation calls for the page cache -is modified to perform an inline check for this PF_SPREAD_PAGE task -flag, and if set, a call to a new routine cpuset_mem_spread_node() -returns the node to prefer for the allocation. - -Similarly, setting 'memory_spread_slab' turns on the flag -PF_SPREAD_SLAB, and appropriately marked slab caches will allocate -pages from the node returned by cpuset_mem_spread_node(). - -The cpuset_mem_spread_node() routine is also simple. It uses the -value of a per-task rotor cpuset_mem_spread_rotor to select the next -node in the current tasks mems_allowed to prefer for the allocation. - -This memory placement policy is also known (in other contexts) as -round-robin or interleave. - -This policy can provide substantial improvements for jobs that need -to place thread local data on the corresponding node, but that need -to access large file system data sets that need to be spread across -the several nodes in the jobs cpuset in order to fit. Without this -policy, especially for jobs that might have one thread reading in the -data set, the memory allocation across the nodes in the jobs cpuset -can become very uneven. - -1.7 What is sched_load_balance ? --------------------------------- - -The kernel scheduler (kernel/sched.c) automatically load balances -tasks. If one CPU is underutilized, kernel code running on that -CPU will look for tasks on other more overloaded CPUs and move those -tasks to itself, within the constraints of such placement mechanisms -as cpusets and sched_setaffinity. - -The algorithmic cost of load balancing and its impact on key shared -kernel data structures such as the task list increases more than -linearly with the number of CPUs being balanced. So the scheduler -has support to partition the systems CPUs into a number of sched -domains such that it only load balances within each sched domain. -Each sched domain covers some subset of the CPUs in the system; -no two sched domains overlap; some CPUs might not be in any sched -domain and hence won't be load balanced. - -Put simply, it costs less to balance between two smaller sched domains -than one big one, but doing so means that overloads in one of the -two domains won't be load balanced to the other one. - -By default, there is one sched domain covering all CPUs, except those -marked isolated using the kernel boot time "isolcpus=" argument. - -This default load balancing across all CPUs is not well suited for -the following two situations: - 1) On large systems, load balancing across many CPUs is expensive. - If the system is managed using cpusets to place independent jobs - on separate sets of CPUs, full load balancing is unnecessary. - 2) Systems supporting realtime on some CPUs need to minimize - system overhead on those CPUs, including avoiding task load - balancing if that is not needed. - -When the per-cpuset flag "sched_load_balance" is enabled (the default -setting), it requests that all the CPUs in that cpusets allowed 'cpus' -be contained in a single sched domain, ensuring that load balancing -can move a task (not otherwised pinned, as by sched_setaffinity) -from any CPU in that cpuset to any other. - -When the per-cpuset flag "sched_load_balance" is disabled, then the -scheduler will avoid load balancing across the CPUs in that cpuset, ---except-- in so far as is necessary because some overlapping cpuset -has "sched_load_balance" enabled. - -So, for example, if the top cpuset has the flag "sched_load_balance" -enabled, then the scheduler will have one sched domain covering all -CPUs, and the setting of the "sched_load_balance" flag in any other -cpusets won't matter, as we're already fully load balancing. - -Therefore in the above two situations, the top cpuset flag -"sched_load_balance" should be disabled, and only some of the smaller, -child cpusets have this flag enabled. - -When doing this, you don't usually want to leave any unpinned tasks in -the top cpuset that might use non-trivial amounts of CPU, as such tasks -may be artificially constrained to some subset of CPUs, depending on -the particulars of this flag setting in descendent cpusets. Even if -such a task could use spare CPU cycles in some other CPUs, the kernel -scheduler might not consider the possibility of load balancing that -task to that underused CPU. - -Of course, tasks pinned to a particular CPU can be left in a cpuset -that disables "sched_load_balance" as those tasks aren't going anywhere -else anyway. - -There is an impedance mismatch here, between cpusets and sched domains. -Cpusets are hierarchical and nest. Sched domains are flat; they don't -overlap and each CPU is in at most one sched domain. - -It is necessary for sched domains to be flat because load balancing -across partially overlapping sets of CPUs would risk unstable dynamics -that would be beyond our understanding. So if each of two partially -overlapping cpusets enables the flag 'sched_load_balance', then we -form a single sched domain that is a superset of both. We won't move -a task to a CPU outside it cpuset, but the scheduler load balancing -code might waste some compute cycles considering that possibility. - -This mismatch is why there is not a simple one-to-one relation -between which cpusets have the flag "sched_load_balance" enabled, -and the sched domain configuration. If a cpuset enables the flag, it -will get balancing across all its CPUs, but if it disables the flag, -it will only be assured of no load balancing if no other overlapping -cpuset enables the flag. - -If two cpusets have partially overlapping 'cpus' allowed, and only -one of them has this flag enabled, then the other may find its -tasks only partially load balanced, just on the overlapping CPUs. -This is just the general case of the top_cpuset example given a few -paragraphs above. In the general case, as in the top cpuset case, -don't leave tasks that might use non-trivial amounts of CPU in -such partially load balanced cpusets, as they may be artificially -constrained to some subset of the CPUs allowed to them, for lack of -load balancing to the other CPUs. - -1.7.1 sched_load_balance implementation details. ------------------------------------------------- - -The per-cpuset flag 'sched_load_balance' defaults to enabled (contrary -to most cpuset flags.) When enabled for a cpuset, the kernel will -ensure that it can load balance across all the CPUs in that cpuset -(makes sure that all the CPUs in the cpus_allowed of that cpuset are -in the same sched domain.) - -If two overlapping cpusets both have 'sched_load_balance' enabled, -then they will be (must be) both in the same sched domain. - -If, as is the default, the top cpuset has 'sched_load_balance' enabled, -then by the above that means there is a single sched domain covering -the whole system, regardless of any other cpuset settings. - -The kernel commits to user space that it will avoid load balancing -where it can. It will pick as fine a granularity partition of sched -domains as it can while still providing load balancing for any set -of CPUs allowed to a cpuset having 'sched_load_balance' enabled. - -The internal kernel cpuset to scheduler interface passes from the -cpuset code to the scheduler code a partition of the load balanced -CPUs in the system. This partition is a set of subsets (represented -as an array of cpumask_t) of CPUs, pairwise disjoint, that cover all -the CPUs that must be load balanced. - -Whenever the 'sched_load_balance' flag changes, or CPUs come or go -from a cpuset with this flag enabled, or a cpuset with this flag -enabled is removed, the cpuset code builds a new such partition and -passes it to the scheduler sched domain setup code, to have the sched -domains rebuilt as necessary. - -This partition exactly defines what sched domains the scheduler should -setup - one sched domain for each element (cpumask_t) in the partition. - -The scheduler remembers the currently active sched domain partitions. -When the scheduler routine partition_sched_domains() is invoked from -the cpuset code to update these sched domains, it compares the new -partition requested with the current, and updates its sched domains, -removing the old and adding the new, for each change. - - -1.8 What is sched_relax_domain_level ? --------------------------------------- - -In sched domain, the scheduler migrates tasks in 2 ways; periodic load -balance on tick, and at time of some schedule events. - -When a task is woken up, scheduler try to move the task on idle CPU. -For example, if a task A running on CPU X activates another task B -on the same CPU X, and if CPU Y is X's sibling and performing idle, -then scheduler migrate task B to CPU Y so that task B can start on -CPU Y without waiting task A on CPU X. - -And if a CPU run out of tasks in its runqueue, the CPU try to pull -extra tasks from other busy CPUs to help them before it is going to -be idle. - -Of course it takes some searching cost to find movable tasks and/or -idle CPUs, the scheduler might not search all CPUs in the domain -everytime. In fact, in some architectures, the searching ranges on -events are limited in the same socket or node where the CPU locates, -while the load balance on tick searchs all. - -For example, assume CPU Z is relatively far from CPU X. Even if CPU Z -is idle while CPU X and the siblings are busy, scheduler can't migrate -woken task B from X to Z since it is out of its searching range. -As the result, task B on CPU X need to wait task A or wait load balance -on the next tick. For some applications in special situation, waiting -1 tick may be too long. - -The 'sched_relax_domain_level' file allows you to request changing -this searching range as you like. This file takes int value which -indicates size of searching range in levels ideally as follows, -otherwise initial value -1 that indicates the cpuset has no request. - - -1 : no request. use system default or follow request of others. - 0 : no search. - 1 : search siblings (hyperthreads in a core). - 2 : search cores in a package. - 3 : search cpus in a node [= system wide on non-NUMA system] - ( 4 : search nodes in a chunk of node [on NUMA system] ) - ( 5 : search system wide [on NUMA system] ) - -The system default is architecture dependent. The system default -can be changed using the relax_domain_level= boot parameter. - -This file is per-cpuset and affect the sched domain where the cpuset -belongs to. Therefore if the flag 'sched_load_balance' of a cpuset -is disabled, then 'sched_relax_domain_level' have no effect since -there is no sched domain belonging the cpuset. - -If multiple cpusets are overlapping and hence they form a single sched -domain, the largest value among those is used. Be careful, if one -requests 0 and others are -1 then 0 is used. - -Note that modifying this file will have both good and bad effects, -and whether it is acceptable or not will be depend on your situation. -Don't modify this file if you are not sure. - -If your situation is: - - The migration costs between each cpu can be assumed considerably - small(for you) due to your special application's behavior or - special hardware support for CPU cache etc. - - The searching cost doesn't have impact(for you) or you can make - the searching cost enough small by managing cpuset to compact etc. - - The latency is required even it sacrifices cache hit rate etc. -then increasing 'sched_relax_domain_level' would benefit you. - - -1.9 How do I use cpusets ? --------------------------- - -In order to minimize the impact of cpusets on critical kernel -code, such as the scheduler, and due to the fact that the kernel -does not support one task updating the memory placement of another -task directly, the impact on a task of changing its cpuset CPU -or Memory Node placement, or of changing to which cpuset a task -is attached, is subtle. - -If a cpuset has its Memory Nodes modified, then for each task attached -to that cpuset, the next time that the kernel attempts to allocate -a page of memory for that task, the kernel will notice the change -in the tasks cpuset, and update its per-task memory placement to -remain within the new cpusets memory placement. If the task was using -mempolicy MPOL_BIND, and the nodes to which it was bound overlap with -its new cpuset, then the task will continue to use whatever subset -of MPOL_BIND nodes are still allowed in the new cpuset. If the task -was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed -in the new cpuset, then the task will be essentially treated as if it -was MPOL_BIND bound to the new cpuset (even though its numa placement, -as queried by get_mempolicy(), doesn't change). If a task is moved -from one cpuset to another, then the kernel will adjust the tasks -memory placement, as above, the next time that the kernel attempts -to allocate a page of memory for that task. - -If a cpuset has its 'cpus' modified, then each task in that cpuset -will have its allowed CPU placement changed immediately. Similarly, -if a tasks pid is written to a cpusets 'tasks' file, in either its -current cpuset or another cpuset, then its allowed CPU placement is -changed immediately. If such a task had been bound to some subset -of its cpuset using the sched_setaffinity() call, the task will be -allowed to run on any CPU allowed in its new cpuset, negating the -affect of the prior sched_setaffinity() call. - -In summary, the memory placement of a task whose cpuset is changed is -updated by the kernel, on the next allocation of a page for that task, -but the processor placement is not updated, until that tasks pid is -rewritten to the 'tasks' file of its cpuset. This is done to avoid -impacting the scheduler code in the kernel with a check for changes -in a tasks processor placement. - -Normally, once a page is allocated (given a physical page -of main memory) then that page stays on whatever node it -was allocated, so long as it remains allocated, even if the -cpusets memory placement policy 'mems' subsequently changes. -If the cpuset flag file 'memory_migrate' is set true, then when -tasks are attached to that cpuset, any pages that task had -allocated to it on nodes in its previous cpuset are migrated -to the tasks new cpuset. The relative placement of the page within -the cpuset is preserved during these migration operations if possible. -For example if the page was on the second valid node of the prior cpuset -then the page will be placed on the second valid node of the new cpuset. - -Also if 'memory_migrate' is set true, then if that cpusets -'mems' file is modified, pages allocated to tasks in that -cpuset, that were on nodes in the previous setting of 'mems', -will be moved to nodes in the new setting of 'mems.' -Pages that were not in the tasks prior cpuset, or in the cpusets -prior 'mems' setting, will not be moved. - -There is an exception to the above. If hotplug functionality is used -to remove all the CPUs that are currently assigned to a cpuset, -then all the tasks in that cpuset will be moved to the nearest ancestor -with non-empty cpus. But the moving of some (or all) tasks might fail if -cpuset is bound with another cgroup subsystem which has some restrictions -on task attaching. In this failing case, those tasks will stay -in the original cpuset, and the kernel will automatically update -their cpus_allowed to allow all online CPUs. When memory hotplug -functionality for removing Memory Nodes is available, a similar exception -is expected to apply there as well. In general, the kernel prefers to -violate cpuset placement, over starving a task that has had all -its allowed CPUs or Memory Nodes taken offline. - -There is a second exception to the above. GFP_ATOMIC requests are -kernel internal allocations that must be satisfied, immediately. -The kernel may drop some request, in rare cases even panic, if a -GFP_ATOMIC alloc fails. If the request cannot be satisfied within -the current tasks cpuset, then we relax the cpuset, and look for -memory anywhere we can find it. It's better to violate the cpuset -than stress the kernel. - -To start a new job that is to be contained within a cpuset, the steps are: - - 1) mkdir /dev/cpuset - 2) mount -t cgroup -ocpuset cpuset /dev/cpuset - 3) Create the new cpuset by doing mkdir's and write's (or echo's) in - the /dev/cpuset virtual file system. - 4) Start a task that will be the "founding father" of the new job. - 5) Attach that task to the new cpuset by writing its pid to the - /dev/cpuset tasks file for that cpuset. - 6) fork, exec or clone the job tasks from this founding father task. - -For example, the following sequence of commands will setup a cpuset -named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, -and then start a subshell 'sh' in that cpuset: - - mount -t cgroup -ocpuset cpuset /dev/cpuset - cd /dev/cpuset - mkdir Charlie - cd Charlie - /bin/echo 2-3 > cpus - /bin/echo 1 > mems - /bin/echo $$ > tasks - sh - # The subshell 'sh' is now running in cpuset Charlie - # The next line should display '/Charlie' - cat /proc/self/cpuset - -In the future, a C library interface to cpusets will likely be -available. For now, the only way to query or modify cpusets is -via the cpuset file system, using the various cd, mkdir, echo, cat, -rmdir commands from the shell, or their equivalent from C. - -The sched_setaffinity calls can also be done at the shell prompt using -SGI's runon or Robert Love's taskset. The mbind and set_mempolicy -calls can be done at the shell prompt using the numactl command -(part of Andi Kleen's numa package). - -2. Usage Examples and Syntax -============================ - -2.1 Basic Usage ---------------- - -Creating, modifying, using the cpusets can be done through the cpuset -virtual filesystem. - -To mount it, type: -# mount -t cgroup -o cpuset cpuset /dev/cpuset - -Then under /dev/cpuset you can find a tree that corresponds to the -tree of the cpusets in the system. For instance, /dev/cpuset -is the cpuset that holds the whole system. - -If you want to create a new cpuset under /dev/cpuset: -# cd /dev/cpuset -# mkdir my_cpuset - -Now you want to do something with this cpuset. -# cd my_cpuset - -In this directory you can find several files: -# ls -cpu_exclusive memory_migrate mems tasks -cpus memory_pressure notify_on_release -mem_exclusive memory_spread_page sched_load_balance -mem_hardwall memory_spread_slab sched_relax_domain_level - -Reading them will give you information about the state of this cpuset: -the CPUs and Memory Nodes it can use, the processes that are using -it, its properties. By writing to these files you can manipulate -the cpuset. - -Set some flags: -# /bin/echo 1 > cpu_exclusive - -Add some cpus: -# /bin/echo 0-7 > cpus - -Add some mems: -# /bin/echo 0-7 > mems - -Now attach your shell to this cpuset: -# /bin/echo $$ > tasks - -You can also create cpusets inside your cpuset by using mkdir in this -directory. -# mkdir my_sub_cs - -To remove a cpuset, just use rmdir: -# rmdir my_sub_cs -This will fail if the cpuset is in use (has cpusets inside, or has -processes attached). - -Note that for legacy reasons, the "cpuset" filesystem exists as a -wrapper around the cgroup filesystem. - -The command - -mount -t cpuset X /dev/cpuset - -is equivalent to - -mount -t cgroup -ocpuset X /dev/cpuset -echo "/sbin/cpuset_release_agent" > /dev/cpuset/release_agent - -2.2 Adding/removing cpus ------------------------- - -This is the syntax to use when writing in the cpus or mems files -in cpuset directories: - -# /bin/echo 1-4 > cpus -> set cpus list to cpus 1,2,3,4 -# /bin/echo 1,2,3,4 > cpus -> set cpus list to cpus 1,2,3,4 - -2.3 Setting flags ------------------ - -The syntax is very simple: - -# /bin/echo 1 > cpu_exclusive -> set flag 'cpu_exclusive' -# /bin/echo 0 > cpu_exclusive -> unset flag 'cpu_exclusive' - -2.4 Attaching processes ------------------------ - -# /bin/echo PID > tasks - -Note that it is PID, not PIDs. You can only attach ONE task at a time. -If you have several tasks to attach, you have to do it one after another: - -# /bin/echo PID1 > tasks -# /bin/echo PID2 > tasks - ... -# /bin/echo PIDn > tasks - - -3. Questions -============ - -Q: what's up with this '/bin/echo' ? -A: bash's builtin 'echo' command does not check calls to write() against - errors. If you use it in the cpuset file system, you won't be - able to tell whether a command succeeded or failed. - -Q: When I attach processes, only the first of the line gets really attached ! -A: We can only return one error code per call to write(). So you should also - put only ONE pid. - -4. Contact -========== - -Web: http://www.bullopensource.org/cpuset diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt index bd699da2466..45932ec21ce 100644 --- a/Documentation/cputopology.txt +++ b/Documentation/cputopology.txt @@ -31,3 +31,51 @@ not defined by include/asm-XXX/topology.h: 2) core_id: 0 3) thread_siblings: just the given CPU 4) core_siblings: just the given CPU + +Additionally, cpu topology information is provided under +/sys/devices/system/cpu and includes these files. The internal +source for the output is in brackets ("[]"). + + kernel_max: the maximum cpu index allowed by the kernel configuration. + [NR_CPUS-1] + + offline: cpus that are not online because they have been + HOTPLUGGED off (see cpu-hotplug.txt) or exceed the limit + of cpus allowed by the kernel configuration (kernel_max + above). [~cpu_online_mask + cpus >= NR_CPUS] + + online: cpus that are online and being scheduled [cpu_online_mask] + + possible: cpus that have been allocated resources and can be + brought online if they are present. [cpu_possible_mask] + + present: cpus that have been identified as being present in the + system. [cpu_present_mask] + +The format for the above output is compatible with cpulist_parse() +[see ]. Some examples follow. + +In this example, there are 64 cpus in the system but cpus 32-63 exceed +the kernel max which is limited to 0..31 by the NR_CPUS config option +being 32. Note also that cpus 2 and 4-31 are not online but could be +brought online as they are both present and possible. + + kernel_max: 31 + offline: 2,4-31,32-63 + online: 0-1,3 + possible: 0-31 + present: 0-31 + +In this example, the NR_CPUS config option is 128, but the kernel was +started with possible_cpus=144. There are 4 cpus in the system and cpu2 +was manually taken offline (and is the only cpu that can be brought +online.) + + kernel_max: 127 + offline: 2,4-127,128-143 + online: 0-1,3 + possible: 0-127 + present: 0-3 + +See cpu-hotplug.txt for the possible_cpus=NUM kernel start parameter +as well as more information on the various cpumask's. diff --git a/Documentation/credentials.txt b/Documentation/credentials.txt new file mode 100644 index 00000000000..df03169782e --- /dev/null +++ b/Documentation/credentials.txt @@ -0,0 +1,582 @@ + ==================== + CREDENTIALS IN LINUX + ==================== + +By: David Howells + +Contents: + + (*) Overview. + + (*) Types of credentials. + + (*) File markings. + + (*) Task credentials. + + - Immutable credentials. + - Accessing task credentials. + - Accessing another task's credentials. + - Altering credentials. + - Managing credentials. + + (*) Open file credentials. + + (*) Overriding the VFS's use of credentials. + + +======== +OVERVIEW +======== + +There are several parts to the security check performed by Linux when one +object acts upon another: + + (1) Objects. + + Objects are things in the system that may be acted upon directly by + userspace programs. Linux has a variety of actionable objects, including: + + - Tasks + - Files/inodes + - Sockets + - Message queues + - Shared memory segments + - Semaphores + - Keys + + As a part of the description of all these objects there is a set of + credentials. What's in the set depends on the type of object. + + (2) Object ownership. + + Amongst the credentials of most objects, there will be a subset that + indicates the ownership of that object. This is used for resource + accounting and limitation (disk quotas and task rlimits for example). + + In a standard UNIX filesystem, for instance, this will be defined by the + UID marked on the inode. + + (3) The objective context. + + Also amongst the credentials of those objects, there will be a subset that + indicates the 'objective context' of that object. This may or may not be + the same set as in (2) - in standard UNIX files, for instance, this is the + defined by the UID and the GID marked on the inode. + + The objective context is used as part of the security calculation that is + carried out when an object is acted upon. + + (4) Subjects. + + A subject is an object that is acting upon another object. + + Most of the objects in the system are inactive: they don't act on other + objects within the system. Processes/tasks are the obvious exception: + they do stuff; they access and manipulate things. + + Objects other than tasks may under some circumstances also be subjects. + For instance an open file may send SIGIO to a task using the UID and EUID + given to it by a task that called fcntl(F_SETOWN) upon it. In this case, + the file struct will have a subjective context too. + + (5) The subjective context. + + A subject has an additional interpretation of its credentials. A subset + of its credentials forms the 'subjective context'. The subjective context + is used as part of the security calculation that is carried out when a + subject acts. + + A Linux task, for example, has the FSUID, FSGID and the supplementary + group list for when it is acting upon a file - which are quite separate + from the real UID and GID that normally form the objective context of the + task. + + (6) Actions. + + Linux has a number of actions available that a subject may perform upon an + object. The set of actions available depends on the nature of the subject + and the object. + + Actions include reading, writing, creating and deleting files; forking or + signalling and tracing tasks. + + (7) Rules, access control lists and security calculations. + + When a subject acts upon an object, a security calculation is made. This + involves taking the subjective context, the objective context and the + action, and searching one or more sets of rules to see whether the subject + is granted or denied permission to act in the desired manner on the + object, given those contexts. + + There are two main sources of rules: + + (a) Discretionary access control (DAC): + + Sometimes the object will include sets of rules as part of its + description. This is an 'Access Control List' or 'ACL'. A Linux + file may supply more than one ACL. + + A traditional UNIX file, for example, includes a permissions mask that + is an abbreviated ACL with three fixed classes of subject ('user', + 'group' and 'other'), each of which may be granted certain privileges + ('read', 'write' and 'execute' - whatever those map to for the object + in question). UNIX file permissions do not allow the arbitrary + specification of subjects, however, and so are of limited use. + + A Linux file might also sport a POSIX ACL. This is a list of rules + that grants various permissions to arbitrary subjects. + + (b) Mandatory access control (MAC): + + The system as a whole may have one or more sets of rules that get + applied to all subjects and objects, regardless of their source. + SELinux and Smack are examples of this. + + In the case of SELinux and Smack, each object is given a label as part + of its credentials. When an action is requested, they take the + subject label, the object label and the action and look for a rule + that says that this action is either granted or denied. + + +==================== +TYPES OF CREDENTIALS +==================== + +The Linux kernel supports the following types of credentials: + + (1) Traditional UNIX credentials. + + Real User ID + Real Group ID + + The UID and GID are carried by most, if not all, Linux objects, even if in + some cases it has to be invented (FAT or CIFS files for example, which are + derived from Windows). These (mostly) define the objective context of + that object, with tasks being slightly different in some cases. + + Effective, Saved and FS User ID + Effective, Saved and FS Group ID + Supplementary groups + + These are additional credentials used by tasks only. Usually, an + EUID/EGID/GROUPS will be used as the subjective context, and real UID/GID + will be used as the objective. For tasks, it should be noted that this is + not always true. + + (2) Capabilities. + + Set of permitted capabilities + Set of inheritable capabilities + Set of effective capabilities + Capability bounding set + + These are only carried by tasks. They indicate superior capabilities + granted piecemeal to a task that an ordinary task wouldn't otherwise have. + These are manipulated implicitly by changes to the traditional UNIX + credentials, but can also be manipulated directly by the capset() system + call. + + The permitted capabilities are those caps that the process might grant + itself to its effective or permitted sets through capset(). This + inheritable set might also be so constrained. + + The effective capabilities are the ones that a task is actually allowed to + make use of itself. + + The inheritable capabilities are the ones that may get passed across + execve(). + + The bounding set limits the capabilities that may be inherited across + execve(), especially when a binary is executed that will execute as UID 0. + + (3) Secure management flags (securebits). + + These are only carried by tasks. These govern the way the above + credentials are manipulated and inherited over certain operations such as + execve(). They aren't used directly as objective or subjective + credentials. + + (4) Keys and keyrings. + + These are only carried by tasks. They carry and cache security tokens + that don't fit into the other standard UNIX credentials. They are for + making such things as network filesystem keys available to the file + accesses performed by processes, without the necessity of ordinary + programs having to know about security details involved. + + Keyrings are a special type of key. They carry sets of other keys and can + be searched for the desired key. Each process may subscribe to a number + of keyrings: + + Per-thread keying + Per-process keyring + Per-session keyring + + When a process accesses a key, if not already present, it will normally be + cached on one of these keyrings for future accesses to find. + + For more information on using keys, see Documentation/keys.txt. + + (5) LSM + + The Linux Security Module allows extra controls to be placed over the + operations that a task may do. Currently Linux supports two main + alternate LSM options: SELinux and Smack. + + Both work by labelling the objects in a system and then applying sets of + rules (policies) that say what operations a task with one label may do to + an object with another label. + + (6) AF_KEY + + This is a socket-based approach to credential management for networking + stacks [RFC 2367]. It isn't discussed by this document as it doesn't + interact directly with task and file credentials; rather it keeps system + level credentials. + + +When a file is opened, part of the opening task's subjective context is +recorded in the file struct created. This allows operations using that file +struct to use those credentials instead of the subjective context of the task +that issued the operation. An example of this would be a file opened on a +network filesystem where the credentials of the opened file should be presented +to the server, regardless of who is actually doing a read or a write upon it. + + +============= +FILE MARKINGS +============= + +Files on disk or obtained over the network may have annotations that form the +objective security context of that file. Depending on the type of filesystem, +this may include one or more of the following: + + (*) UNIX UID, GID, mode; + + (*) Windows user ID; + + (*) Access control list; + + (*) LSM security label; + + (*) UNIX exec privilege escalation bits (SUID/SGID); + + (*) File capabilities exec privilege escalation bits. + +These are compared to the task's subjective security context, and certain +operations allowed or disallowed as a result. In the case of execve(), the +privilege escalation bits come into play, and may allow the resulting process +extra privileges, based on the annotations on the executable file. + + +================ +TASK CREDENTIALS +================ + +In Linux, all of a task's credentials are held in (uid, gid) or through +(groups, keys, LSM security) a refcounted structure of type 'struct cred'. +Each task points to its credentials by a pointer called 'cred' in its +task_struct. + +Once a set of credentials has been prepared and committed, it may not be +changed, barring the following exceptions: + + (1) its reference count may be changed; + + (2) the reference count on the group_info struct it points to may be changed; + + (3) the reference count on the security data it points to may be changed; + + (4) the reference count on any keyrings it points to may be changed; + + (5) any keyrings it points to may be revoked, expired or have their security + attributes changed; and + + (6) the contents of any keyrings to which it points may be changed (the whole + point of keyrings being a shared set of credentials, modifiable by anyone + with appropriate access). + +To alter anything in the cred struct, the copy-and-replace principle must be +adhered to. First take a copy, then alter the copy and then use RCU to change +the task pointer to make it point to the new copy. There are wrappers to aid +with this (see below). + +A task may only alter its _own_ credentials; it is no longer permitted for a +task to alter another's credentials. This means the capset() system call is no +longer permitted to take any PID other than the one of the current process. +Also keyctl_instantiate() and keyctl_negate() functions no longer permit +attachment to process-specific keyrings in the requesting process as the +instantiating process may need to create them. + + +IMMUTABLE CREDENTIALS +--------------------- + +Once a set of credentials has been made public (by calling commit_creds() for +example), it must be considered immutable, barring two exceptions: + + (1) The reference count may be altered. + + (2) Whilst the keyring subscriptions of a set of credentials may not be + changed, the keyrings subscribed to may have their contents altered. + +To catch accidental credential alteration at compile time, struct task_struct +has _const_ pointers to its credential sets, as does struct file. Furthermore, +certain functions such as get_cred() and put_cred() operate on const pointers, +thus rendering casts unnecessary, but require to temporarily ditch the const +qualification to be able to alter the reference count. + + +ACCESSING TASK CREDENTIALS +-------------------------- + +A task being able to alter only its own credentials permits the current process +to read or replace its own credentials without the need for any form of locking +- which simplifies things greatly. It can just call: + + const struct cred *current_cred() + +to get a pointer to its credentials structure, and it doesn't have to release +it afterwards. + +There are convenience wrappers for retrieving specific aspects of a task's +credentials (the value is simply returned in each case): + + uid_t current_uid(void) Current's real UID + gid_t current_gid(void) Current's real GID + uid_t current_euid(void) Current's effective UID + gid_t current_egid(void) Current's effective GID + uid_t current_fsuid(void) Current's file access UID + gid_t current_fsgid(void) Current's file access GID + kernel_cap_t current_cap(void) Current's effective capabilities + void *current_security(void) Current's LSM security pointer + struct user_struct *current_user(void) Current's user account + +There are also convenience wrappers for retrieving specific associated pairs of +a task's credentials: + + void current_uid_gid(uid_t *, gid_t *); + void current_euid_egid(uid_t *, gid_t *); + void current_fsuid_fsgid(uid_t *, gid_t *); + +which return these pairs of values through their arguments after retrieving +them from the current task's credentials. + + +In addition, there is a function for obtaining a reference on the current +process's current set of credentials: + + const struct cred *get_current_cred(void); + +and functions for getting references to one of the credentials that don't +actually live in struct cred: + + struct user_struct *get_current_user(void); + struct group_info *get_current_groups(void); + +which get references to the current process's user accounting structure and +supplementary groups list respectively. + +Once a reference has been obtained, it must be released with put_cred(), +free_uid() or put_group_info() as appropriate. + + +ACCESSING ANOTHER TASK'S CREDENTIALS +------------------------------------ + +Whilst a task may access its own credentials without the need for locking, the +same is not true of a task wanting to access another task's credentials. It +must use the RCU read lock and rcu_dereference(). + +The rcu_dereference() is wrapped by: + + const struct cred *__task_cred(struct task_struct *task); + +This should be used inside the RCU read lock, as in the following example: + + void foo(struct task_struct *t, struct foo_data *f) + { + const struct cred *tcred; + ... + rcu_read_lock(); + tcred = __task_cred(t); + f->uid = tcred->uid; + f->gid = tcred->gid; + f->groups = get_group_info(tcred->groups); + rcu_read_unlock(); + ... + } + +A function need not get RCU read lock to use __task_cred() if it is holding a +spinlock at the time as this implicitly holds the RCU read lock. + +Should it be necessary to hold another task's credentials for a long period of +time, and possibly to sleep whilst doing so, then the caller should get a +reference on them using: + + const struct cred *get_task_cred(struct task_struct *task); + +This does all the RCU magic inside of it. The caller must call put_cred() on +the credentials so obtained when they're finished with. + +There are a couple of convenience functions to access bits of another task's +credentials, hiding the RCU magic from the caller: + + uid_t task_uid(task) Task's real UID + uid_t task_euid(task) Task's effective UID + +If the caller is holding a spinlock or the RCU read lock at the time anyway, +then: + + __task_cred(task)->uid + __task_cred(task)->euid + +should be used instead. Similarly, if multiple aspects of a task's credentials +need to be accessed, RCU read lock or a spinlock should be used, __task_cred() +called, the result stored in a temporary pointer and then the credential +aspects called from that before dropping the lock. This prevents the +potentially expensive RCU magic from being invoked multiple times. + +Should some other single aspect of another task's credentials need to be +accessed, then this can be used: + + task_cred_xxx(task, member) + +where 'member' is a non-pointer member of the cred struct. For instance: + + uid_t task_cred_xxx(task, suid); + +will retrieve 'struct cred::suid' from the task, doing the appropriate RCU +magic. This may not be used for pointer members as what they point to may +disappear the moment the RCU read lock is dropped. + + +ALTERING CREDENTIALS +-------------------- + +As previously mentioned, a task may only alter its own credentials, and may not +alter those of another task. This means that it doesn't need to use any +locking to alter its own credentials. + +To alter the current process's credentials, a function should first prepare a +new set of credentials by calling: + + struct cred *prepare_creds(void); + +this locks current->cred_replace_mutex and then allocates and constructs a +duplicate of the current process's credentials, returning with the mutex still +held if successful. It returns NULL if not successful (out of memory). + +The mutex prevents ptrace() from altering the ptrace state of a process whilst +security checks on credentials construction and changing is taking place as +the ptrace state may alter the outcome, particularly in the case of execve(). + +The new credentials set should be altered appropriately, and any security +checks and hooks done. Both the current and the proposed sets of credentials +are available for this purpose as current_cred() will return the current set +still at this point. + + +When the credential set is ready, it should be committed to the current process +by calling: + + int commit_creds(struct cred *new); + +This will alter various aspects of the credentials and the process, giving the +LSM a chance to do likewise, then it will use rcu_assign_pointer() to actually +commit the new credentials to current->cred, it will release +current->cred_replace_mutex to allow ptrace() to take place, and it will notify +the scheduler and others of the changes. + +This function is guaranteed to return 0, so that it can be tail-called at the +end of such functions as sys_setresuid(). + +Note that this function consumes the caller's reference to the new credentials. +The caller should _not_ call put_cred() on the new credentials afterwards. + +Furthermore, once this function has been called on a new set of credentials, +those credentials may _not_ be changed further. + + +Should the security checks fail or some other error occur after prepare_creds() +has been called, then the following function should be invoked: + + void abort_creds(struct cred *new); + +This releases the lock on current->cred_replace_mutex that prepare_creds() got +and then releases the new credentials. + + +A typical credentials alteration function would look something like this: + + int alter_suid(uid_t suid) + { + struct cred *new; + int ret; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + + new->suid = suid; + ret = security_alter_suid(new); + if (ret < 0) { + abort_creds(new); + return ret; + } + + return commit_creds(new); + } + + +MANAGING CREDENTIALS +-------------------- + +There are some functions to help manage credentials: + + (*) void put_cred(const struct cred *cred); + + This releases a reference to the given set of credentials. If the + reference count reaches zero, the credentials will be scheduled for + destruction by the RCU system. + + (*) const struct cred *get_cred(const struct cred *cred); + + This gets a reference on a live set of credentials, returning a pointer to + that set of credentials. + + (*) struct cred *get_new_cred(struct cred *cred); + + This gets a reference on a set of credentials that is under construction + and is thus still mutable, returning a pointer to that set of credentials. + + +===================== +OPEN FILE CREDENTIALS +===================== + +When a new file is opened, a reference is obtained on the opening task's +credentials and this is attached to the file struct as 'f_cred' in place of +'f_uid' and 'f_gid'. Code that used to access file->f_uid and file->f_gid +should now access file->f_cred->fsuid and file->f_cred->fsgid. + +It is safe to access f_cred without the use of RCU or locking because the +pointer will not change over the lifetime of the file struct, and nor will the +contents of the cred struct pointed to, barring the exceptions listed above +(see the Task Credentials section). + + +======================================= +OVERRIDING THE VFS'S USE OF CREDENTIALS +======================================= + +Under some circumstances it is desirable to override the credentials used by +the VFS, and that can be done by calling into such as vfs_mkdir() with a +different set of credentials. This is done in the following places: + + (*) sys_faccessat(). + + (*) do_coredump(). + + (*) nfs4recover.c. diff --git a/Documentation/crypto/async-tx-api.txt b/Documentation/crypto/async-tx-api.txt index c1e9545c59b..9f59fcbf5d8 100644 --- a/Documentation/crypto/async-tx-api.txt +++ b/Documentation/crypto/async-tx-api.txt @@ -13,9 +13,9 @@ 3.6 Constraints 3.7 Example -4 DRIVER DEVELOPER NOTES +4 DMAENGINE DRIVER DEVELOPER NOTES 4.1 Conformance points -4.2 "My application needs finer control of hardware channels" +4.2 "My application needs exclusive control of hardware channels" 5 SOURCE @@ -150,6 +150,7 @@ ops_run_* and ops_complete_* routines in drivers/md/raid5.c for more implementation examples. 4 DRIVER DEVELOPMENT NOTES + 4.1 Conformance points: There are a few conformance points required in dmaengine drivers to accommodate assumptions made by applications using the async_tx API: @@ -158,58 +159,49 @@ accommodate assumptions made by applications using the async_tx API: 3/ Use async_tx_run_dependencies() in the descriptor clean up path to handle submission of dependent operations -4.2 "My application needs finer control of hardware channels" -This requirement seems to arise from cases where a DMA engine driver is -trying to support device-to-memory DMA. The dmaengine and async_tx -implementations were designed for offloading memory-to-memory -operations; however, there are some capabilities of the dmaengine layer -that can be used for platform-specific channel management. -Platform-specific constraints can be handled by registering the -application as a 'dma_client' and implementing a 'dma_event_callback' to -apply a filter to the available channels in the system. Before showing -how to implement a custom dma_event callback some background of -dmaengine's client support is required. - -The following routines in dmaengine support multiple clients requesting -use of a channel: -- dma_async_client_register(struct dma_client *client) -- dma_async_client_chan_request(struct dma_client *client) - -dma_async_client_register takes a pointer to an initialized dma_client -structure. It expects that the 'event_callback' and 'cap_mask' fields -are already initialized. - -dma_async_client_chan_request triggers dmaengine to notify the client of -all channels that satisfy the capability mask. It is up to the client's -event_callback routine to track how many channels the client needs and -how many it is currently using. The dma_event_callback routine returns a -dma_state_client code to let dmaengine know the status of the -allocation. - -Below is the example of how to extend this functionality for -platform-specific filtering of the available channels beyond the -standard capability mask: - -static enum dma_state_client -my_dma_client_callback(struct dma_client *client, - struct dma_chan *chan, enum dma_state state) -{ - struct dma_device *dma_dev; - struct my_platform_specific_dma *plat_dma_dev; - - dma_dev = chan->device; - plat_dma_dev = container_of(dma_dev, - struct my_platform_specific_dma, - dma_dev); - - if (!plat_dma_dev->platform_specific_capability) - return DMA_DUP; - - . . . -} +4.2 "My application needs exclusive control of hardware channels" +Primarily this requirement arises from cases where a DMA engine driver +is being used to support device-to-memory operations. A channel that is +performing these operations cannot, for many platform specific reasons, +be shared. For these cases the dma_request_channel() interface is +provided. + +The interface is: +struct dma_chan *dma_request_channel(dma_cap_mask_t mask, + dma_filter_fn filter_fn, + void *filter_param); + +Where dma_filter_fn is defined as: +typedef bool (*dma_filter_fn)(struct dma_chan *chan, void *filter_param); + +When the optional 'filter_fn' parameter is set to NULL +dma_request_channel simply returns the first channel that satisfies the +capability mask. Otherwise, when the mask parameter is insufficient for +specifying the necessary channel, the filter_fn routine can be used to +disposition the available channels in the system. The filter_fn routine +is called once for each free channel in the system. Upon seeing a +suitable channel filter_fn returns DMA_ACK which flags that channel to +be the return value from dma_request_channel. A channel allocated via +this interface is exclusive to the caller, until dma_release_channel() +is called. + +The DMA_PRIVATE capability flag is used to tag dma devices that should +not be used by the general-purpose allocator. It can be set at +initialization time if it is known that a channel will always be +private. Alternatively, it is set when dma_request_channel() finds an +unused "public" channel. + +A couple caveats to note when implementing a driver and consumer: +1/ Once a channel has been privately allocated it will no longer be + considered by the general-purpose allocator even after a call to + dma_release_channel(). +2/ Since capabilities are specified at the device level a dma_device + with multiple channels will either have all channels public, or all + channels private. 5 SOURCE -include/linux/dmaengine.h: core header file for DMA drivers and clients + +include/linux/dmaengine.h: core header file for DMA drivers and api users drivers/dma/dmaengine.c: offload engine channel management routines drivers/dma/: location for offload engine drivers include/linux/async_tx.h: core header file for the async_tx api diff --git a/Documentation/dell_rbu.txt b/Documentation/dell_rbu.txt index 2c0d631de0c..c11b931f8f9 100644 --- a/Documentation/dell_rbu.txt +++ b/Documentation/dell_rbu.txt @@ -81,8 +81,8 @@ Until this step is completed the driver cannot be unloaded. Also echoing either mono ,packet or init in to image_type will free up the memory allocated by the driver. -If an user by accident executes steps 1 and 3 above without executing step 2; -it will make the /sys/class/firmware/dell_rbu/ entries to disappear. +If a user by accident executes steps 1 and 3 above without executing step 2; +it will make the /sys/class/firmware/dell_rbu/ entries disappear. The entries can be recreated by doing the following echo init > /sys/devices/platform/dell_rbu/image_type NOTE: echoing init in image_type does not change it original value. diff --git a/Documentation/development-process/4.Coding b/Documentation/development-process/4.Coding index 014aca8f14e..a5a3450faaa 100644 --- a/Documentation/development-process/4.Coding +++ b/Documentation/development-process/4.Coding @@ -375,10 +375,10 @@ say, this can be a large job, so it is best to be sure that the justification is solid. When making an incompatible API change, one should, whenever possible, -ensure that code which has not been updated is caught by the compiler. +ensure that code which has not been updated is caught by the compiler. This will help you to be sure that you have found all in-tree uses of that interface. It will also alert developers of out-of-tree code that there is a change that they need to respond to. Supporting out-of-tree code is not something that kernel developers need to be worried about, but we also do -not have to make life harder for out-of-tree developers than it it needs to -be. +not have to make life harder for out-of-tree developers than it needs to +be. diff --git a/Documentation/digiepca.txt b/Documentation/digiepca.txt deleted file mode 100644 index f2560e22f2c..00000000000 --- a/Documentation/digiepca.txt +++ /dev/null @@ -1,98 +0,0 @@ -NOTE: This driver is obsolete. Digi provides a 2.6 driver (dgdm) at -http://www.digi.com for PCI cards. They no longer maintain this driver, -and have no 2.6 driver for ISA cards. - -This driver requires a number of user-space tools. They can be acquired from -http://www.digi.com, but only works with 2.4 kernels. - - -The Digi Intl. epca driver. ----------------------------- -The Digi Intl. epca driver for Linux supports the following boards: - -Digi PC/Xem, PC/Xr, PC/Xe, PC/Xi, PC/Xeve -Digi EISA/Xem, PCI/Xem, PCI/Xr - -Limitations: ------------- -Currently the driver only autoprobes for supported PCI boards. - -The Linux MAKEDEV command does not support generating the Digiboard -Devices. Users executing digiConfig to setup EISA and PC series cards -will have their device nodes automatically constructed (cud?? for ~CLOCAL, -and ttyD?? for CLOCAL). Users wishing to boot their board from the LILO -prompt, or those users booting PCI cards may use buildDIGI to construct -the necessary nodes. - -Notes: ------- -This driver may be configured via LILO. For users who have already configured -their driver using digiConfig, configuring from LILO will override previous -settings. Multiple boards may be configured by issuing multiple LILO command -lines. For examples see the bottom of this document. - -Device names start at 0 and continue up. Beware of this as previous Digi -drivers started device names with 1. - -PCI boards are auto-detected and configured by the driver. PCI boards will -be allocated device numbers (internally) beginning with the lowest PCI slot -first. In other words a PCI card in slot 3 will always have higher device -nodes than a PCI card in slot 1. - -LILO config examples: ---------------------- -Using LILO's APPEND command, a string of comma separated identifiers or -integers can be used to configure supported boards. The six values in order -are: - - Enable/Disable this card or Override, - Type of card: PC/Xe (AccelePort) (0), PC/Xeve (1), PC/Xem or PC/Xr (2), - EISA/Xem (3), PC/64Xe (4), PC/Xi (5), - Enable/Disable alternate pin arrangement, - Number of ports on this card, - I/O Port where card is configured (in HEX if using string identifiers), - Base of memory window (in HEX if using string identifiers), - -NOTE : PCI boards are auto-detected and configured. Do not attempt to -configure PCI boards with the LILO append command. If you wish to override -previous configuration data (As set by digiConfig), but you do not wish to -configure any specific card (Example if there are PCI cards in the system) -the following override command will accomplish this: --> append="digi=2" - -Samples: - append="digiepca=E,PC/Xe,D,16,200,D0000" - or - append="digi=1,0,0,16,512,851968" - -Supporting Tools: ------------------ -Supporting tools include digiDload, digiConfig, buildPCI, and ditty. See -drivers/char/README.epca for more details. Note, -this driver REQUIRES that digiDload be executed prior to it being used. -Failure to do this will result in an ENODEV error. - -Documentation: --------------- -Complete documentation for this product may be found in the tool package. - -Sources of information and support: ------------------------------------ -Digi Intl. support site for this product: - --> http://www.digi.com - -Acknowledgments: ----------------- -Much of this work (And even text) was derived from a similar document -supporting the original public domain DigiBoard driver Copyright (C) -1994,1995 Troy De Jongh. Many thanks to Christoph Lameter -(christoph@lameter.com) and Mike McLagan (mike.mclagan@linux.org) who authored -and contributed to the original document. - -Changelog: ----------- -10-29-04: Update status of driver, remove dead links in document - James Nelson - -2000 (?) Original Document diff --git a/Documentation/dmaengine.txt b/Documentation/dmaengine.txt new file mode 100644 index 00000000000..0c1c2f63c0a --- /dev/null +++ b/Documentation/dmaengine.txt @@ -0,0 +1 @@ +See Documentation/crypto/async-tx-api.txt diff --git a/Documentation/dvb/technisat.txt b/Documentation/dvb/technisat.txt new file mode 100644 index 00000000000..cdf6ee4b2da --- /dev/null +++ b/Documentation/dvb/technisat.txt @@ -0,0 +1,69 @@ +How to set up the Technisat devices +=================================== + +1) Find out what device you have +================================ + +First start your linux box with a shipped kernel: +lspci -vvv for a PCI device (lsusb -vvv for an USB device) will show you for example: +02:0b.0 Network controller: Techsan Electronics Co Ltd B2C2 FlexCopII DVB chip / Technisat SkyStar2 DVB card (rev 02) + +dmesg | grep frontend may show you for example: +DVB: registering frontend 0 (Conexant CX24123/CX24109)... + +2) Kernel compilation: +====================== + +If the Technisat is the only TV device in your box get rid of unnecessary modules and check this one: +"Multimedia devices" => "Customise analog and hybrid tuner modules to build" +In this directory uncheck every driver which is activated there. + +Then please activate: +2a) Main module part: + +a.)"Multimedia devices" => "DVB/ATSC adapters" => "Technisat/B2C2 FlexcopII(b) and FlexCopIII adapters" +b.)"Multimedia devices" => "DVB/ATSC adapters" => "Technisat/B2C2 FlexcopII(b) and FlexCopIII adapters" => "Technisat/B2C2 Air/Sky/Cable2PC PCI" in case of a PCI card OR +c.)"Multimedia devices" => "DVB/ATSC adapters" => "Technisat/B2C2 FlexcopII(b) and FlexCopIII adapters" => "Technisat/B2C2 Air/Sky/Cable2PC USB" in case of an USB 1.1 adapter +d.)"Multimedia devices" => "DVB/ATSC adapters" => "Technisat/B2C2 FlexcopII(b) and FlexCopIII adapters" => "Enable debug for the B2C2 FlexCop drivers" +Notice: d.) is helpful for troubleshooting + +2b) Frontend module part: + +1.) Revision 2.3: +a.)"Multimedia devices" => "Customise DVB frontends" => "Customise the frontend modules to build" +b.)"Multimedia devices" => "Customise DVB frontends" => "Zarlink VP310/MT312/ZL10313 based" + +2.) Revision 2.6: +a.)"Multimedia devices" => "Customise DVB frontends" => "Customise the frontend modules to build" +b.)"Multimedia devices" => "Customise DVB frontends" => "ST STV0299 based" + +3.) Revision 2.7: +a.)"Multimedia devices" => "Customise DVB frontends" => "Customise the frontend modules to build" +b.)"Multimedia devices" => "Customise DVB frontends" => "Samsung S5H1420 based" +c.)"Multimedia devices" => "Customise DVB frontends" => "Integrant ITD1000 Zero IF tuner for DVB-S/DSS" +d.)"Multimedia devices" => "Customise DVB frontends" => "ISL6421 SEC controller" + +4.) Revision 2.8: +a.)"Multimedia devices" => "Customise DVB frontends" => "Customise the frontend modules to build" +b.)"Multimedia devices" => "Customise DVB frontends" => "Conexant CX24113/CX24128 tuner for DVB-S/DSS" +c.)"Multimedia devices" => "Customise DVB frontends" => "Conexant CX24123 based" +d.)"Multimedia devices" => "Customise DVB frontends" => "ISL6421 SEC controller" + +5.) DVB-T card: +a.)"Multimedia devices" => "Customise DVB frontends" => "Customise the frontend modules to build" +b.)"Multimedia devices" => "Customise DVB frontends" => "Zarlink MT352 based" + +6.) DVB-C card: +a.)"Multimedia devices" => "Customise DVB frontends" => "Customise the frontend modules to build" +b.)"Multimedia devices" => "Customise DVB frontends" => "ST STV0297 based" + +7.) ATSC card 1st generation: +a.)"Multimedia devices" => "Customise DVB frontends" => "Customise the frontend modules to build" +b.)"Multimedia devices" => "Customise DVB frontends" => "Broadcom BCM3510" + +8.) ATSC card 2nd generation: +a.)"Multimedia devices" => "Customise DVB frontends" => "Customise the frontend modules to build" +b.)"Multimedia devices" => "Customise DVB frontends" => "NxtWave Communications NXT2002/NXT2004 based" +c.)"Multimedia devices" => "Customise DVB frontends" => "LG Electronics LGDT3302/LGDT3303 based" + +Author: Uwe Bugla December 2008 diff --git a/Documentation/fb/pxafb.txt b/Documentation/fb/pxafb.txt index db9b8500b43..d143a0a749f 100644 --- a/Documentation/fb/pxafb.txt +++ b/Documentation/fb/pxafb.txt @@ -5,9 +5,13 @@ The driver supports the following options, either via options= when modular or video=pxafb: when built in. For example: - modprobe pxafb options=mode:640x480-8,passive + modprobe pxafb options=vmem:2M,mode:640x480-8,passive or on the kernel command line - video=pxafb:mode:640x480-8,passive + video=pxafb:vmem:2M,mode:640x480-8,passive + +vmem: VIDEO_MEM_SIZE + Amount of video memory to allocate (can be suffixed with K or M + for kilobytes or megabytes) mode:XRESxYRES[-BPP] XRES == LCCR1_PPL + 1 @@ -52,3 +56,87 @@ outputen:POLARITY pixclockpol:POLARITY pixel clock polarity 0 => falling edge, 1 => rising edge + + +Overlay Support for PXA27x and later LCD controllers +==================================================== + + PXA27x and later processors support overlay1 and overlay2 on-top of the + base framebuffer (although under-neath the base is also possible). They + support palette and no-palette RGB formats, as well as YUV formats (only + available on overlay2). These overlays have dedicated DMA channels and + behave in a similar way as a framebuffer. + + However, there are some differences between these overlay framebuffers + and normal framebuffers, as listed below: + + 1. overlay can start at a 32-bit word aligned position within the base + framebuffer, which means they have a start (x, y). This information + is encoded into var->nonstd (no, var->xoffset and var->yoffset are + not for such purpose). + + 2. overlay framebuffer is allocated dynamically according to specified + 'struct fb_var_screeninfo', the amount is decided by: + + var->xres_virtual * var->yres_virtual * bpp + + bpp = 16 -- for RGB565 or RGBT555 + = 24 -- for YUV444 packed + = 24 -- for YUV444 planar + = 16 -- for YUV422 planar (1 pixel = 1 Y + 1/2 Cb + 1/2 Cr) + = 12 -- for YUV420 planar (1 pixel = 1 Y + 1/4 Cb + 1/4 Cr) + + NOTE: + + a. overlay does not support panning in x-direction, thus + var->xres_virtual will always be equal to var->xres + + b. line length of overlay(s) must be on a 32-bit word boundary, + for YUV planar modes, it is a requirement for the component + with minimum bits per pixel, e.g. for YUV420, Cr component + for one pixel is actually 2-bits, it means the line length + should be a multiple of 16-pixels + + c. starting horizontal position (XPOS) should start on a 32-bit + word boundary, otherwise the fb_check_var() will just fail. + + d. the rectangle of the overlay should be within the base plane, + otherwise fail + + Applications should follow the sequence below to operate an overlay + framebuffer: + + a. open("/dev/fb[1-2]", ...) + b. ioctl(fd, FBIOGET_VSCREENINFO, ...) + c. modify 'var' with desired parameters: + 1) var->xres and var->yres + 2) larger var->yres_virtual if more memory is required, + usually for double-buffering + 3) var->nonstd for starting (x, y) and color format + 4) var->{red, green, blue, transp} if RGB mode is to be used + d. ioctl(fd, FBIOPUT_VSCREENINFO, ...) + e. ioctl(fd, FBIOGET_FSCREENINFO, ...) + f. mmap + g. ... + + 3. for YUV planar formats, these are actually not supported within the + framebuffer framework, application has to take care of the offsets + and lengths of each component within the framebuffer. + + 4. var->nonstd is used to pass starting (x, y) position and color format, + the detailed bit fields are shown below: + + 31 23 20 10 0 + +-----------------+---+----------+----------+ + | ... unused ... |FOR| XPOS | YPOS | + +-----------------+---+----------+----------+ + + FOR - color format, as defined by OVERLAY_FORMAT_* in pxafb.h + 0 - RGB + 1 - YUV444 PACKED + 2 - YUV444 PLANAR + 3 - YUV422 PLANAR + 4 - YUR420 PLANAR + + XPOS - starting horizontal position + YPOS - starting vertical position diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 05d71b4b943..5ddbe350487 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -56,30 +56,6 @@ Who: Mauro Carvalho Chehab --------------------------- -What: old tuner-3036 i2c driver -When: 2.6.28 -Why: This driver is for VERY old i2c-over-parallel port teletext receiver - boxes. Rather then spending effort on converting this driver to V4L2, - and since it is extremely unlikely that anyone still uses one of these - devices, it was decided to drop it. -Who: Hans Verkuil - Mauro Carvalho Chehab - - --------------------------- - -What: V4L2 dpc7146 driver -When: 2.6.28 -Why: Old driver for the dpc7146 demonstration board that is no longer - relevant. The last time this was tested on actual hardware was - probably around 2002. Since this is a driver for a demonstration - board the decision was made to remove it rather than spending a - lot of effort continually updating this driver to stay in sync - with the latest internal V4L2 or I2C API. -Who: Hans Verkuil - Mauro Carvalho Chehab - ---------------------------- - What: PCMCIA control ioctl (needed for pcmcia-cs [cardmgr, cardctl]) When: November 2005 Files: drivers/pcmcia/: pcmcia_ioctl.c @@ -144,13 +120,6 @@ Who: Christoph Hellwig --------------------------- -What: eepro100 network driver -When: January 2007 -Why: replaced by the e100 driver -Who: Adrian Bunk - ---------------------------- - What: Unused EXPORT_SYMBOL/EXPORT_SYMBOL_GPL exports (temporary transition config option provided until then) The transition config option will also be removed at the same time. @@ -268,18 +237,6 @@ Who: Michael Buesch --------------------------- -What: init_mm export -When: 2.6.26 -Why: Not used in-tree. The current out-of-tree users used it to - work around problems in the CPA code which should be resolved - by now. One usecase was described to provide verification code - of the CPA operation. That's a good idea in general, but such - code / infrastructure should be in the kernel and not in some - out-of-tree driver. -Who: Thomas Gleixner - ----------------------------- - What: usedac i386 kernel parameter When: 2.6.27 Why: replaced by allowdac and no dac combination @@ -353,17 +310,28 @@ Who: Krzysztof Piotr Oledzki --------------------------- -What: ide-scsi (BLK_DEV_IDESCSI) -When: 2.6.29 -Why: The 2.6 kernel supports direct writing to ide CD drives, which - eliminates the need for ide-scsi. The new method is more - efficient in every way. -Who: FUJITA Tomonori - ---------------------------- - What: i2c_attach_client(), i2c_detach_client(), i2c_driver->detach_client() When: 2.6.29 (ideally) or 2.6.30 (more likely) Why: Deprecated by the new (standard) device driver binding model. Use i2c_driver->probe() and ->remove() instead. Who: Jean Delvare + +--------------------------- + +What: fscher and fscpos drivers +When: June 2009 +Why: Deprecated by the new fschmd driver. +Who: Hans de Goede + Jean Delvare + +--------------------------- + +What: SELinux "compat_net" functionality +When: 2.6.30 at the earliest +Why: In 2.6.18 the Secmark concept was introduced to replace the "compat_net" + network access control functionality of SELinux. Secmark offers both + better performance and greater flexibility than the "compat_net" + mechanism. Now that the major Linux distributions have moved to + Secmark, it is time to deprecate the older mechanism and start the + process of removing the old code. +Who: Paul Moore diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 23d2f4460de..ec6a9392a17 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -97,8 +97,8 @@ prototypes: void (*put_super) (struct super_block *); void (*write_super) (struct super_block *); int (*sync_fs)(struct super_block *sb, int wait); - void (*write_super_lockfs) (struct super_block *); - void (*unlockfs) (struct super_block *); + int (*freeze_fs) (struct super_block *); + int (*unfreeze_fs) (struct super_block *); int (*statfs) (struct dentry *, struct kstatfs *); int (*remount_fs) (struct super_block *, int *, char *); void (*clear_inode) (struct inode *); @@ -119,8 +119,8 @@ delete_inode: no put_super: yes yes no write_super: no yes read sync_fs: no no read -write_super_lockfs: ? -unlockfs: ? +freeze_fs: ? +unfreeze_fs: ? statfs: no no no remount_fs: yes yes maybe (see below) clear_inode: no @@ -394,11 +394,10 @@ prototypes: unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); int (*check_flags)(int); - int (*dir_notify)(struct file *, unsigned long); }; locking rules: - All except ->poll() may block. + All may block. BKL llseek: no (see below) read: no @@ -424,7 +423,6 @@ sendfile: no sendpage: no get_unmapped_area: no check_flags: no -dir_notify: no ->llseek() locking has moved from llseek to the individual llseek implementations. If your fs is not using generic_file_llseek, you diff --git a/Documentation/filesystems/btrfs.txt b/Documentation/filesystems/btrfs.txt new file mode 100644 index 00000000000..64087c34327 --- /dev/null +++ b/Documentation/filesystems/btrfs.txt @@ -0,0 +1,91 @@ + + BTRFS + ===== + +Btrfs is a new copy on write filesystem for Linux aimed at +implementing advanced features while focusing on fault tolerance, +repair and easy administration. Initially developed by Oracle, Btrfs +is licensed under the GPL and open for contribution from anyone. + +Linux has a wealth of filesystems to choose from, but we are facing a +number of challenges with scaling to the large storage subsystems that +are becoming common in today's data centers. Filesystems need to scale +in their ability to address and manage large storage, and also in +their ability to detect, repair and tolerate errors in the data stored +on disk. Btrfs is under heavy development, and is not suitable for +any uses other than benchmarking and review. The Btrfs disk format is +not yet finalized. + +The main Btrfs features include: + + * Extent based file storage (2^64 max file size) + * Space efficient packing of small files + * Space efficient indexed directories + * Dynamic inode allocation + * Writable snapshots + * Subvolumes (separate internal filesystem roots) + * Object level mirroring and striping + * Checksums on data and metadata (multiple algorithms available) + * Compression + * Integrated multiple device support, with several raid algorithms + * Online filesystem check (not yet implemented) + * Very fast offline filesystem check + * Efficient incremental backup and FS mirroring (not yet implemented) + * Online filesystem defragmentation + + + + MAILING LIST + ============ + +There is a Btrfs mailing list hosted on vger.kernel.org. You can +find details on how to subscribe here: + +http://vger.kernel.org/vger-lists.html#linux-btrfs + +Mailing list archives are available from gmane: + +http://dir.gmane.org/gmane.comp.file-systems.btrfs + + + + IRC + === + +Discussion of Btrfs also occurs on the #btrfs channel of the Freenode +IRC network. + + + + UTILITIES + ========= + +Userspace tools for creating and manipulating Btrfs file systems are +available from the git repository at the following location: + + http://git.kernel.org/?p=linux/kernel/git/mason/btrfs-progs-unstable.git + git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-progs-unstable.git + +These include the following tools: + +mkfs.btrfs: create a filesystem + +btrfsctl: control program to create snapshots and subvolumes: + + mount /dev/sda2 /mnt + btrfsctl -s new_subvol_name /mnt + btrfsctl -s snapshot_of_default /mnt/default + btrfsctl -s snapshot_of_new_subvol /mnt/new_subvol_name + btrfsctl -s snapshot_of_a_snapshot /mnt/snapshot_of_new_subvol + ls /mnt + default snapshot_of_a_snapshot snapshot_of_new_subvol + new_subvol_name snapshot_of_default + + Snapshots and subvolumes cannot be deleted right now, but you can + rm -rf all the files and directories inside them. + +btrfsck: do a limited check of the FS extent trees. + +btrfs-debug-tree: print all of the FS metadata in text form. Example: + + btrfs-debug-tree /dev/sda2 >& big_output_file diff --git a/Documentation/filesystems/devpts.txt b/Documentation/filesystems/devpts.txt new file mode 100644 index 00000000000..68dffd87f9b --- /dev/null +++ b/Documentation/filesystems/devpts.txt @@ -0,0 +1,132 @@ + +To support containers, we now allow multiple instances of devpts filesystem, +such that indices of ptys allocated in one instance are independent of indices +allocated in other instances of devpts. + +To preserve backward compatibility, this support for multiple instances is +enabled only if: + + - CONFIG_DEVPTS_MULTIPLE_INSTANCES=y, and + - '-o newinstance' mount option is specified while mounting devpts + +IOW, devpts now supports both single-instance and multi-instance semantics. + +If CONFIG_DEVPTS_MULTIPLE_INSTANCES=n, there is no change in behavior and +this referred to as the "legacy" mode. In this mode, the new mount options +(-o newinstance and -o ptmxmode) will be ignored with a 'bogus option' message +on console. + +If CONFIG_DEVPTS_MULTIPLE_INSTANCES=y and devpts is mounted without the +'newinstance' option (as in current start-up scripts) the new mount binds +to the initial kernel mount of devpts. This mode is referred to as the +'single-instance' mode and the current, single-instance semantics are +preserved, i.e PTYs are common across the system. + +The only difference between this single-instance mode and the legacy mode +is the presence of new, '/dev/pts/ptmx' node with permissions 0000, which +can safely be ignored. + +If CONFIG_DEVPTS_MULTIPLE_INSTANCES=y and 'newinstance' option is specified, +the mount is considered to be in the multi-instance mode and a new instance +of the devpts fs is created. Any ptys created in this instance are independent +of ptys in other instances of devpts. Like in the single-instance mode, the +/dev/pts/ptmx node is present. To effectively use the multi-instance mode, +open of /dev/ptmx must be a redirected to '/dev/pts/ptmx' using a symlink or +bind-mount. + +Eg: A container startup script could do the following: + + $ chmod 0666 /dev/pts/ptmx + $ rm /dev/ptmx + $ ln -s pts/ptmx /dev/ptmx + $ ns_exec -cm /bin/bash + + # We are now in new container + + $ umount /dev/pts + $ mount -t devpts -o newinstance lxcpts /dev/pts + $ sshd -p 1234 + +where 'ns_exec -cm /bin/bash' calls clone() with CLONE_NEWNS flag and execs +/bin/bash in the child process. A pty created by the sshd is not visible in +the original mount of /dev/pts. + +User-space changes +------------------ + +In multi-instance mode (i.e '-o newinstance' mount option is specified at least +once), following user-space issues should be noted. + +1. If -o newinstance mount option is never used, /dev/pts/ptmx can be ignored + and no change is needed to system-startup scripts. + +2. To effectively use multi-instance mode (i.e -o newinstance is specified) + administrators or startup scripts should "redirect" open of /dev/ptmx to + /dev/pts/ptmx using either a bind mount or symlink. + + $ mount -t devpts -o newinstance devpts /dev/pts + + followed by either + + $ rm /dev/ptmx + $ ln -s pts/ptmx /dev/ptmx + $ chmod 666 /dev/pts/ptmx + or + $ mount -o bind /dev/pts/ptmx /dev/ptmx + +3. The '/dev/ptmx -> pts/ptmx' symlink is the preferred method since it + enables better error-reporting and treats both single-instance and + multi-instance mounts similarly. + + But this method requires that system-startup scripts set the mode of + /dev/pts/ptmx correctly (default mode is 0000). The scripts can set the + mode by, either + + - adding ptmxmode mount option to devpts entry in /etc/fstab, or + - using 'chmod 0666 /dev/pts/ptmx' + +4. If multi-instance mode mount is needed for containers, but the system + startup scripts have not yet been updated, container-startup scripts + should bind mount /dev/ptmx to /dev/pts/ptmx to avoid breaking single- + instance mounts. + + Or, in general, container-startup scripts should use: + + mount -t devpts -o newinstance -o ptmxmode=0666 devpts /dev/pts + if [ ! -L /dev/ptmx ]; then + mount -o bind /dev/pts/ptmx /dev/ptmx + fi + + When all devpts mounts are multi-instance, /dev/ptmx can permanently be + a symlink to pts/ptmx and the bind mount can be ignored. + +5. A multi-instance mount that is not accompanied by the /dev/ptmx to + /dev/pts/ptmx redirection would result in an unusable/unreachable pty. + + mount -t devpts -o newinstance lxcpts /dev/pts + + immediately followed by: + + open("/dev/ptmx") + + would create a pty, say /dev/pts/7, in the initial kernel mount. + But /dev/pts/7 would be invisible in the new mount. + +6. The permissions for /dev/pts/ptmx node should be specified when mounting + /dev/pts, using the '-o ptmxmode=%o' mount option (default is 0000). + + mount -t devpts -o newinstance -o ptmxmode=0644 devpts /dev/pts + + The permissions can be later be changed as usual with 'chmod'. + + chmod 666 /dev/pts/ptmx + +7. A mount of devpts without the 'newinstance' option results in binding to + initial kernel mount. This behavior while preserving legacy semantics, + does not provide strict isolation in a container environment. i.e by + mounting devpts without the 'newinstance' option, a container could + get visibility into the 'host' or root container's devpts. + + To workaround this and have strict isolation, all mounts of devpts, + including the mount in the root container, should use the newinstance + option. diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 174eaff7ded..cec829bc729 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -58,13 +58,22 @@ Note: More extensive information for getting started with ext4 can be # mount -t ext4 /dev/hda1 /wherever - - When comparing performance with other filesystems, remember that - ext3/4 by default offers higher data integrity guarantees than most. - So when comparing with a metadata-only journalling filesystem, such - as ext3, use `mount -o data=writeback'. And you might as well use - `mount -o nobh' too along with it. Making the journal larger than - the mke2fs default often helps performance with metadata-intensive - workloads. + - When comparing performance with other filesystems, it's always + important to try multiple workloads; very often a subtle change in a + workload parameter can completely change the ranking of which + filesystems do well compared to others. When comparing versus ext3, + note that ext4 enables write barriers by default, while ext3 does + not enable write barriers by default. So it is useful to use + explicitly specify whether barriers are enabled or not when via the + '-o barriers=[0|1]' mount option for both ext3 and ext4 filesystems + for a fair comparison. When tuning ext3 for best benchmark numbers, + it is often worthwhile to try changing the data journaling mode; '-o + data=writeback,nobh' can be faster for some workloads. (Note + however that running mounted with data=writeback can potentially + leave stale data exposed in recently written files in case of an + unclean shutdown, which could be a security exposure in some + situations.) Configuring the filesystem with a large journal can + also be helpful for metadata-intensive workloads. 2. Features =========== @@ -74,7 +83,7 @@ Note: More extensive information for getting started with ext4 can be * ability to use filesystems > 16TB (e2fsprogs support not available yet) * extent format reduces metadata overhead (RAM, IO for access, transactions) * extent format more robust in face of on-disk corruption due to magics, -* internal redunancy in tree +* internal redundancy in tree * improved file allocation (multi-block alloc) * fix 32000 subdirectory limit * nsec timestamps for mtime, atime, ctime, create time @@ -116,10 +125,11 @@ grouping of bitmaps and inode tables. Some test results available here: When mounting an ext4 filesystem, the following option are accepted: (*) == default -extents (*) ext4 will use extents to address file data. The - file system will no longer be mountable by ext3. - -noextents ext4 will not use extents for newly created files +ro Mount filesystem read only. Note that ext4 will + replay the journal (and thus write to the + partition) even when mounted "read only". The + mount options "ro,noload" can be used to prevent + writes to the filesystem. journal_checksum Enable checksumming of the journal transactions. This will allow the recovery code in e2fsck and the @@ -134,17 +144,17 @@ journal_async_commit Commit block can be written to disk without waiting journal=update Update the ext4 file system's journal to the current format. -journal=inum When a journal already exists, this option is ignored. - Otherwise, it specifies the number of the inode which - will represent the ext4 file system's journal file. - journal_dev=devnum When the external journal device's major/minor numbers have changed, this option allows the user to specify the new journal location. The journal device is identified through its new major/minor numbers encoded in devnum. -noload Don't load the journal on mounting. +noload Don't load the journal on mounting. Note that + if the filesystem was not unmounted cleanly, + skipping the journal replay will lead to the + filesystem containing inconsistencies that can + lead to any number of problems. data=journal All data are committed into the journal prior to being written into the main file system. @@ -219,9 +229,12 @@ minixdf Make 'df' act like Minix. debug Extra debugging information is sent to syslog. -errors=remount-ro(*) Remount the filesystem read-only on an error. +errors=remount-ro Remount the filesystem read-only on an error. errors=continue Keep going on a filesystem error. errors=panic Panic and halt the machine if an error occurs. + (These mount options override the errors behavior + specified in the superblock, which can be configured + using tune2fs) data_err=ignore(*) Just print an error message if an error occurs in a file data buffer in ordered mode. @@ -261,6 +274,42 @@ delalloc (*) Deferring block allocation until write-out time. nodelalloc Disable delayed allocation. Blocks are allocation when data is copied from user to page cache. +max_batch_time=usec Maximum amount of time ext4 should wait for + additional filesystem operations to be batch + together with a synchronous write operation. + Since a synchronous write operation is going to + force a commit and then a wait for the I/O + complete, it doesn't cost much, and can be a + huge throughput win, we wait for a small amount + of time to see if any other transactions can + piggyback on the synchronous write. The + algorithm used is designed to automatically tune + for the speed of the disk, by measuring the + amount of time (on average) that it takes to + finish committing a transaction. Call this time + the "commit time". If the time that the + transactoin has been running is less than the + commit time, ext4 will try sleeping for the + commit time to see if other operations will join + the transaction. The commit time is capped by + the max_batch_time, which defaults to 15000us + (15ms). This optimization can be turned off + entirely by setting max_batch_time to 0. + +min_batch_time=usec This parameter sets the commit time (as + described above) to be at least min_batch_time. + It defaults to zero microseconds. Increasing + this parameter may improve the throughput of + multi-threaded, synchronous workloads on very + fast disks, at the cost of increasing latency. + +journal_ioprio=prio The I/O priority (from 0 to 7, where 0 is the + highest priorty) which should be used for I/O + operations submitted by kjournald2 during a + commit operation. This defaults to 3, which is + a slightly higher priority than the default I/O + priority. + Data Mode ========= There are 3 different data modes: diff --git a/Documentation/filesystems/files.txt b/Documentation/filesystems/files.txt index bb0142f6108..ac2facc50d2 100644 --- a/Documentation/filesystems/files.txt +++ b/Documentation/filesystems/files.txt @@ -76,13 +76,13 @@ the fdtable structure - 5. Handling of the file structures is special. Since the look-up of the fd (fget()/fget_light()) are lock-free, it is possible that look-up may race with the last put() operation on the - file structure. This is avoided using atomic_inc_not_zero() + file structure. This is avoided using atomic_long_inc_not_zero() on ->f_count : rcu_read_lock(); file = fcheck_files(files, fd); if (file) { - if (atomic_inc_not_zero(&file->f_count)) + if (atomic_long_inc_not_zero(&file->f_count)) *fput_needed = 1; else /* Didn't get the reference, someone's freed */ @@ -92,7 +92,7 @@ the fdtable structure - .... return file; - atomic_inc_not_zero() detects if refcounts is already zero or + atomic_long_inc_not_zero() detects if refcounts is already zero or goes to zero during increment. If it does, we fail fget()/fget_light(). diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt index 4340cc82579..c2a0871280a 100644 --- a/Documentation/filesystems/ocfs2.txt +++ b/Documentation/filesystems/ocfs2.txt @@ -28,13 +28,9 @@ Manish Singh Caveats ======= Features which OCFS2 does not support yet: - - extended attributes - quotas - - cluster aware flock - - cluster aware lockf - Directory change notification (F_NOTIFY) - Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease) - - POSIX ACLs Mount options ============= @@ -82,3 +78,5 @@ inode64 Indicates that Ocfs2 is allowed to create inodes at bits of significance. user_xattr (*) Enables Extended User Attributes. nouser_xattr Disables Extended User Attributes. +acl Enables POSIX Access Control Lists support. +noacl (*) Disables POSIX Access Control Lists support. diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index bcceb99b81d..bbebc3a43ac 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -44,6 +44,7 @@ Table of Contents 2.14 /proc//io - Display the IO accounting fields 2.15 /proc//coredump_filter - Core dump filtering settings 2.16 /proc//mountinfo - Information about mounts + 2.17 /proc/sys/fs/epoll - Configuration options for the epoll interface ------------------------------------------------------------------------------ Preface @@ -139,6 +140,7 @@ Table 1-1: Process specific entries in /proc statm Process memory status information status Process status in human readable form wchan If CONFIG_KALLSYMS is set, a pre-decoded wchan + stack Report full stack trace, enable via CONFIG_STACKTRACE smaps Extension based on maps, the rss size for each mapped file .............................................................................. @@ -1338,10 +1340,13 @@ nmi_watchdog Enables/Disables the NMI watchdog on x86 systems. When the value is non-zero the NMI watchdog is enabled and will continuously test all online cpus to -determine whether or not they are still functioning properly. +determine whether or not they are still functioning properly. Currently, +passing "nmi_watchdog=" parameter at boot time is required for this function +to work. -Because the NMI watchdog shares registers with oprofile, by disabling the NMI -watchdog, oprofile may have more registers to utilize. +If LAPIC NMI watchdog method is in use (nmi_watchdog=2 kernel parameter), the +NMI watchdog shares registers with oprofile. By disabling the NMI watchdog, +oprofile may have more registers to utilize. msgmni ------ @@ -1366,268 +1371,8 @@ auto_msgmni default value is 1. 2.4 /proc/sys/vm - The virtual memory subsystem ----------------------------------------------- -The files in this directory can be used to tune the operation of the virtual -memory (VM) subsystem of the Linux kernel. - -vfs_cache_pressure ------------------- - -Controls the tendency of the kernel to reclaim the memory which is used for -caching of directory and inode objects. - -At the default value of vfs_cache_pressure=100 the kernel will attempt to -reclaim dentries and inodes at a "fair" rate with respect to pagecache and -swapcache reclaim. Decreasing vfs_cache_pressure causes the kernel to prefer -to retain dentry and inode caches. Increasing vfs_cache_pressure beyond 100 -causes the kernel to prefer to reclaim dentries and inodes. - -dirty_background_ratio ----------------------- - -Contains, as a percentage of the dirtyable system memory (free pages + mapped -pages + file cache, not including locked pages and HugePages), the number of -pages at which the pdflush background writeback daemon will start writing out -dirty data. - -dirty_ratio ------------------ - -Contains, as a percentage of the dirtyable system memory (free pages + mapped -pages + file cache, not including locked pages and HugePages), the number of -pages at which a process which is generating disk writes will itself start -writing out dirty data. - -dirty_writeback_centisecs -------------------------- - -The pdflush writeback daemons will periodically wake up and write `old' data -out to disk. This tunable expresses the interval between those wakeups, in -100'ths of a second. - -Setting this to zero disables periodic writeback altogether. - -dirty_expire_centisecs ----------------------- - -This tunable is used to define when dirty data is old enough to be eligible -for writeout by the pdflush daemons. It is expressed in 100'ths of a second. -Data which has been dirty in-memory for longer than this interval will be -written out next time a pdflush daemon wakes up. - -highmem_is_dirtyable --------------------- - -Only present if CONFIG_HIGHMEM is set. - -This defaults to 0 (false), meaning that the ratios set above are calculated -as a percentage of lowmem only. This protects against excessive scanning -in page reclaim, swapping and general VM distress. - -Setting this to 1 can be useful on 32 bit machines where you want to make -random changes within an MMAPed file that is larger than your available -lowmem without causing large quantities of random IO. Is is safe if the -behavior of all programs running on the machine is known and memory will -not be otherwise stressed. - -legacy_va_layout ----------------- - -If non-zero, this sysctl disables the new 32-bit mmap mmap layout - the kernel -will use the legacy (2.4) layout for all processes. - -lowmem_reserve_ratio ---------------------- - -For some specialised workloads on highmem machines it is dangerous for -the kernel to allow process memory to be allocated from the "lowmem" -zone. This is because that memory could then be pinned via the mlock() -system call, or by unavailability of swapspace. - -And on large highmem machines this lack of reclaimable lowmem memory -can be fatal. - -So the Linux page allocator has a mechanism which prevents allocations -which _could_ use highmem from using too much lowmem. This means that -a certain amount of lowmem is defended from the possibility of being -captured into pinned user memory. - -(The same argument applies to the old 16 megabyte ISA DMA region. This -mechanism will also defend that region from allocations which could use -highmem or lowmem). - -The `lowmem_reserve_ratio' tunable determines how aggressive the kernel is -in defending these lower zones. - -If you have a machine which uses highmem or ISA DMA and your -applications are using mlock(), or if you are running with no swap then -you probably should change the lowmem_reserve_ratio setting. - -The lowmem_reserve_ratio is an array. You can see them by reading this file. -- -% cat /proc/sys/vm/lowmem_reserve_ratio -256 256 32 -- -Note: # of this elements is one fewer than number of zones. Because the highest - zone's value is not necessary for following calculation. - -But, these values are not used directly. The kernel calculates # of protection -pages for each zones from them. These are shown as array of protection pages -in /proc/zoneinfo like followings. (This is an example of x86-64 box). -Each zone has an array of protection pages like this. - -- -Node 0, zone DMA - pages free 1355 - min 3 - low 3 - high 4 - : - : - numa_other 0 - protection: (0, 2004, 2004, 2004) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - pagesets - cpu: 0 pcp: 0 - : -- -These protections are added to score to judge whether this zone should be used -for page allocation or should be reclaimed. - -In this example, if normal pages (index=2) are required to this DMA zone and -pages_high is used for watermark, the kernel judges this zone should not be -used because pages_free(1355) is smaller than watermark + protection[2] -(4 + 2004 = 2008). If this protection value is 0, this zone would be used for -normal page requirement. If requirement is DMA zone(index=0), protection[0] -(=0) is used. - -zone[i]'s protection[j] is calculated by following expression. - -(i < j): - zone[i]->protection[j] - = (total sums of present_pages from zone[i+1] to zone[j] on the node) - / lowmem_reserve_ratio[i]; -(i = j): - (should not be protected. = 0; -(i > j): - (not necessary, but looks 0) - -The default values of lowmem_reserve_ratio[i] are - 256 (if zone[i] means DMA or DMA32 zone) - 32 (others). -As above expression, they are reciprocal number of ratio. -256 means 1/256. # of protection pages becomes about "0.39%" of total present -pages of higher zones on the node. - -If you would like to protect more pages, smaller values are effective. -The minimum value is 1 (1/1 -> 100%). - -page-cluster ------------- - -page-cluster controls the number of pages which are written to swap in -a single attempt. The swap I/O size. - -It is a logarithmic value - setting it to zero means "1 page", setting -it to 1 means "2 pages", setting it to 2 means "4 pages", etc. - -The default value is three (eight pages at a time). There may be some -small benefits in tuning this to a different value if your workload is -swap-intensive. - -overcommit_memory ------------------ - -Controls overcommit of system memory, possibly allowing processes -to allocate (but not use) more memory than is actually available. - - -0 - Heuristic overcommit handling. Obvious overcommits of - address space are refused. Used for a typical system. It - ensures a seriously wild allocation fails while allowing - overcommit to reduce swap usage. root is allowed to - allocate slightly more memory in this mode. This is the - default. - -1 - Always overcommit. Appropriate for some scientific - applications. - -2 - Don't overcommit. The total address space commit - for the system is not permitted to exceed swap plus a - configurable percentage (default is 50) of physical RAM. - Depending on the percentage you use, in most situations - this means a process will not be killed while attempting - to use already-allocated memory but will receive errors - on memory allocation as appropriate. - -overcommit_ratio ----------------- - -Percentage of physical memory size to include in overcommit calculations -(see above.) - -Memory allocation limit = swapspace + physmem * (overcommit_ratio / 100) - - swapspace = total size of all swap areas - physmem = size of physical memory in system - -nr_hugepages and hugetlb_shm_group ----------------------------------- - -nr_hugepages configures number of hugetlb page reserved for the system. - -hugetlb_shm_group contains group id that is allowed to create SysV shared -memory segment using hugetlb page. - -hugepages_treat_as_movable --------------------------- - -This parameter is only useful when kernelcore= is specified at boot time to -create ZONE_MOVABLE for pages that may be reclaimed or migrated. Huge pages -are not movable so are not normally allocated from ZONE_MOVABLE. A non-zero -value written to hugepages_treat_as_movable allows huge pages to be allocated -from ZONE_MOVABLE. - -Once enabled, the ZONE_MOVABLE is treated as an area of memory the huge -pages pool can easily grow or shrink within. Assuming that applications are -not running that mlock() a lot of memory, it is likely the huge pages pool -can grow to the size of ZONE_MOVABLE by repeatedly entering the desired value -into nr_hugepages and triggering page reclaim. - -laptop_mode ------------ - -laptop_mode is a knob that controls "laptop mode". All the things that are -controlled by this knob are discussed in Documentation/laptops/laptop-mode.txt. - -block_dump ----------- - -block_dump enables block I/O debugging when set to a nonzero value. More -information on block I/O debugging is in Documentation/laptops/laptop-mode.txt. - -swap_token_timeout ------------------- - -This file contains valid hold time of swap out protection token. The Linux -VM has token based thrashing control mechanism and uses the token to prevent -unnecessary page faults in thrashing situation. The unit of the value is -second. The value would be useful to tune thrashing behavior. - -drop_caches ------------ - -Writing to this will cause the kernel to drop clean caches, dentries and -inodes from memory, causing that memory to become free. - -To free pagecache: - echo 1 > /proc/sys/vm/drop_caches -To free dentries and inodes: - echo 2 > /proc/sys/vm/drop_caches -To free pagecache, dentries and inodes: - echo 3 > /proc/sys/vm/drop_caches - -As this is a non-destructive operation and dirty objects are not freeable, the -user should run `sync' first. +Please see: Documentation/sysctls/vm.txt for a description of these +entries. 2.5 /proc/sys/dev - Device specific parameters @@ -2483,4 +2228,30 @@ For more information on mount propagation see: Documentation/filesystems/sharedsubtree.txt +2.17 /proc/sys/fs/epoll - Configuration options for the epoll interface +-------------------------------------------------------- + +This directory contains configuration options for the epoll(7) interface. + +max_user_instances +------------------ + +This is the maximum number of epoll file descriptors that a single user can +have open at a given time. The default value is 128, and should be enough +for normal users. + +max_user_watches +---------------- + +Every epoll file descriptor can store a number of files to be monitored +for event readiness. Each one of these monitored files constitutes a "watch". +This configuration option sets the maximum number of "watches" that are +allowed for each user. +Each "watch" costs roughly 90 bytes on a 32bit kernel, and roughly 160 bytes +on a 64bit one. +The current default value for max_user_watches is the 1/32 of the available +low memory, divided for the "watch" cost in bytes. + + ------------------------------------------------------------------------------ + diff --git a/Documentation/filesystems/ramfs-rootfs-initramfs.txt b/Documentation/filesystems/ramfs-rootfs-initramfs.txt index 62fe9b1e089..a8273d5fad2 100644 --- a/Documentation/filesystems/ramfs-rootfs-initramfs.txt +++ b/Documentation/filesystems/ramfs-rootfs-initramfs.txt @@ -130,12 +130,12 @@ The 2.6 kernel build process always creates a gzipped cpio format initramfs archive and links it into the resulting kernel binary. By default, this archive is empty (consuming 134 bytes on x86). -The config option CONFIG_INITRAMFS_SOURCE (for some reason buried under -devices->block devices in menuconfig, and living in usr/Kconfig) can be used -to specify a source for the initramfs archive, which will automatically be -incorporated into the resulting binary. This option can point to an existing -gzipped cpio archive, a directory containing files to be archived, or a text -file specification such as the following example: +The config option CONFIG_INITRAMFS_SOURCE (in General Setup in menuconfig, +and living in usr/Kconfig) can be used to specify a source for the +initramfs archive, which will automatically be incorporated into the +resulting binary. This option can point to an existing gzipped cpio +archive, a directory containing files to be archived, or a text file +specification such as the following example: dir /dev 755 0 0 nod /dev/console 644 0 0 c 5 1 diff --git a/Documentation/filesystems/squashfs.txt b/Documentation/filesystems/squashfs.txt new file mode 100644 index 00000000000..3e79e4a7a39 --- /dev/null +++ b/Documentation/filesystems/squashfs.txt @@ -0,0 +1,225 @@ +SQUASHFS 4.0 FILESYSTEM +======================= + +Squashfs is a compressed read-only filesystem for Linux. +It uses zlib compression to compress files, inodes and directories. +Inodes in the system are very small and all blocks are packed to minimise +data overhead. Block sizes greater than 4K are supported up to a maximum +of 1Mbytes (default block size 128K). + +Squashfs is intended for general read-only filesystem use, for archival +use (i.e. in cases where a .tar.gz file may be used), and in constrained +block device/memory systems (e.g. embedded systems) where low overhead is +needed. + +Mailing list: squashfs-devel@lists.sourceforge.net +Web site: www.squashfs.org + +1. FILESYSTEM FEATURES +---------------------- + +Squashfs filesystem features versus Cramfs: + + Squashfs Cramfs + +Max filesystem size: 2^64 16 MiB +Max file size: ~ 2 TiB 16 MiB +Max files: unlimited unlimited +Max directories: unlimited unlimited +Max entries per directory: unlimited unlimited +Max block size: 1 MiB 4 KiB +Metadata compression: yes no +Directory indexes: yes no +Sparse file support: yes no +Tail-end packing (fragments): yes no +Exportable (NFS etc.): yes no +Hard link support: yes no +"." and ".." in readdir: yes no +Real inode numbers: yes no +32-bit uids/gids: yes no +File creation time: yes no +Xattr and ACL support: no no + +Squashfs compresses data, inodes and directories. In addition, inode and +directory data are highly compacted, and packed on byte boundaries. Each +compressed inode is on average 8 bytes in length (the exact length varies on +file type, i.e. regular file, directory, symbolic link, and block/char device +inodes have different sizes). + +2. USING SQUASHFS +----------------- + +As squashfs is a read-only filesystem, the mksquashfs program must be used to +create populated squashfs filesystems. This and other squashfs utilities +can be obtained from http://www.squashfs.org. Usage instructions can be +obtained from this site also. + + +3. SQUASHFS FILESYSTEM DESIGN +----------------------------- + +A squashfs filesystem consists of seven parts, packed together on a byte +alignment: + + --------------- + | superblock | + |---------------| + | datablocks | + | & fragments | + |---------------| + | inode table | + |---------------| + | directory | + | table | + |---------------| + | fragment | + | table | + |---------------| + | export | + | table | + |---------------| + | uid/gid | + | lookup table | + --------------- + +Compressed data blocks are written to the filesystem as files are read from +the source directory, and checked for duplicates. Once all file data has been +written the completed inode, directory, fragment, export and uid/gid lookup +tables are written. + +3.1 Inodes +---------- + +Metadata (inodes and directories) are compressed in 8Kbyte blocks. Each +compressed block is prefixed by a two byte length, the top bit is set if the +block is uncompressed. A block will be uncompressed if the -noI option is set, +or if the compressed block was larger than the uncompressed block. + +Inodes are packed into the metadata blocks, and are not aligned to block +boundaries, therefore inodes overlap compressed blocks. Inodes are identified +by a 48-bit number which encodes the location of the compressed metadata block +containing the inode, and the byte offset into that block where the inode is +placed (). + +To maximise compression there are different inodes for each file type +(regular file, directory, device, etc.), the inode contents and length +varying with the type. + +To further maximise compression, two types of regular file inode and +directory inode are defined: inodes optimised for frequently occurring +regular files and directories, and extended types where extra +information has to be stored. + +3.2 Directories +--------------- + +Like inodes, directories are packed into compressed metadata blocks, stored +in a directory table. Directories are accessed using the start address of +the metablock containing the directory and the offset into the +decompressed block (). + +Directories are organised in a slightly complex way, and are not simply +a list of file names. The organisation takes advantage of the +fact that (in most cases) the inodes of the files will be in the same +compressed metadata block, and therefore, can share the start block. +Directories are therefore organised in a two level list, a directory +header containing the shared start block value, and a sequence of directory +entries, each of which share the shared start block. A new directory header +is written once/if the inode start block changes. The directory +header/directory entry list is repeated as many times as necessary. + +Directories are sorted, and can contain a directory index to speed up +file lookup. Directory indexes store one entry per metablock, each entry +storing the index/filename mapping to the first directory header +in each metadata block. Directories are sorted in alphabetical order, +and at lookup the index is scanned linearly looking for the first filename +alphabetically larger than the filename being looked up. At this point the +location of the metadata block the filename is in has been found. +The general idea of the index is ensure only one metadata block needs to be +decompressed to do a lookup irrespective of the length of the directory. +This scheme has the advantage that it doesn't require extra memory overhead +and doesn't require much extra storage on disk. + +3.3 File data +------------- + +Regular files consist of a sequence of contiguous compressed blocks, and/or a +compressed fragment block (tail-end packed block). The compressed size +of each datablock is stored in a block list contained within the +file inode. + +To speed up access to datablocks when reading 'large' files (256 Mbytes or +larger), the code implements an index cache that caches the mapping from +block index to datablock location on disk. + +The index cache allows Squashfs to handle large files (up to 1.75 TiB) while +retaining a simple and space-efficient block list on disk. The cache +is split into slots, caching up to eight 224 GiB files (128 KiB blocks). +Larger files use multiple slots, with 1.75 TiB files using all 8 slots. +The index cache is designed to be memory efficient, and by default uses +16 KiB. + +3.4 Fragment lookup table +------------------------- + +Regular files can contain a fragment index which is mapped to a fragment +location on disk and compressed size using a fragment lookup table. This +fragment lookup table is itself stored compressed into metadata blocks. +A second index table is used to locate these. This second index table for +speed of access (and because it is small) is read at mount time and cached +in memory. + +3.5 Uid/gid lookup table +------------------------ + +For space efficiency regular files store uid and gid indexes, which are +converted to 32-bit uids/gids using an id look up table. This table is +stored compressed into metadata blocks. A second index table is used to +locate these. This second index table for speed of access (and because it +is small) is read at mount time and cached in memory. + +3.6 Export table +---------------- + +To enable Squashfs filesystems to be exportable (via NFS etc.) filesystems +can optionally (disabled with the -no-exports Mksquashfs option) contain +an inode number to inode disk location lookup table. This is required to +enable Squashfs to map inode numbers passed in filehandles to the inode +location on disk, which is necessary when the export code reinstantiates +expired/flushed inodes. + +This table is stored compressed into metadata blocks. A second index table is +used to locate these. This second index table for speed of access (and because +it is small) is read at mount time and cached in memory. + + +4. TODOS AND OUTSTANDING ISSUES +------------------------------- + +4.1 Todo list +------------- + +Implement Xattr and ACL support. The Squashfs 4.0 filesystem layout has hooks +for these but the code has not been written. Once the code has been written +the existing layout should not require modification. + +4.2 Squashfs internal cache +--------------------------- + +Blocks in Squashfs are compressed. To avoid repeatedly decompressing +recently accessed data Squashfs uses two small metadata and fragment caches. + +The cache is not used for file datablocks, these are decompressed and cached in +the page-cache in the normal way. The cache is used to temporarily cache +fragment and metadata blocks which have been read as a result of a metadata +(i.e. inode or directory) or fragment access. Because metadata and fragments +are packed together into blocks (to gain greater compression) the read of a +particular piece of metadata or fragment will retrieve other metadata/fragments +which have been packed with it, these because of locality-of-reference may be +read in the near future. Temporarily caching them ensures they are available +for near future access without requiring an additional read and decompress. + +In the future this internal cache may be replaced with an implementation which +uses the kernel page cache. Because the page cache operates on page sized +units this may introduce additional complexity in terms of locking and +associated race conditions. diff --git a/Documentation/filesystems/ubifs.txt b/Documentation/filesystems/ubifs.txt index dd84ea3c10d..84da2a4ba25 100644 --- a/Documentation/filesystems/ubifs.txt +++ b/Documentation/filesystems/ubifs.txt @@ -95,6 +95,9 @@ no_chk_data_crc skip checking of CRCs on data nodes in order to of this option is that corruption of the contents of a file can go unnoticed. chk_data_crc (*) do not skip checking CRCs on data nodes +compr=none override default compressor and set it to "none" +compr=lzo override default compressor and set it to "lzo" +compr=zlib override default compressor and set it to "zlib" Quick usage instructions diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 5579bda58a6..deeeed0faa8 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -210,8 +210,8 @@ struct super_operations { void (*put_super) (struct super_block *); void (*write_super) (struct super_block *); int (*sync_fs)(struct super_block *sb, int wait); - void (*write_super_lockfs) (struct super_block *); - void (*unlockfs) (struct super_block *); + int (*freeze_fs) (struct super_block *); + int (*unfreeze_fs) (struct super_block *); int (*statfs) (struct dentry *, struct kstatfs *); int (*remount_fs) (struct super_block *, int *, char *); void (*clear_inode) (struct inode *); @@ -270,11 +270,11 @@ or bottom half). a superblock. The second parameter indicates whether the method should wait until the write out has been completed. Optional. - write_super_lockfs: called when VFS is locking a filesystem and + freeze_fs: called when VFS is locking a filesystem and forcing it into a consistent state. This method is currently used by the Logical Volume Manager (LVM). - unlockfs: called when VFS is unlocking a filesystem and making it writable + unfreeze_fs: called when VFS is unlocking a filesystem and making it writable again. statfs: called when the VFS needs to get filesystem statistics. This @@ -733,7 +733,6 @@ struct file_operations { ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); int (*check_flags)(int); - int (*dir_notify)(struct file *filp, unsigned long arg); int (*flock) (struct file *, int, struct file_lock *); ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, size_t, unsigned int); ssize_t (*splice_read)(struct file *, struct pipe_inode_info *, size_t, unsigned int); @@ -800,8 +799,6 @@ otherwise noted. check_flags: called by the fcntl(2) system call for F_SETFL command - dir_notify: called by the fcntl(2) system call for F_NOTIFY command - flock: called by the flock(2) system call splice_write: called by the VFS to splice data from a pipe to a file. This @@ -931,7 +928,7 @@ manipulate dentries: d_lookup: look up a dentry given its parent and path name component It looks up the child of that given name from the dcache hash table. If it is found, the reference count is incremented - and the dentry is returned. The caller must use d_put() + and the dentry is returned. The caller must use dput() to free the dentry when it finishes using it. For further information on dentry locking, please refer to the document diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt index 0a1668ba260..9878f50d6ed 100644 --- a/Documentation/filesystems/xfs.txt +++ b/Documentation/filesystems/xfs.txt @@ -229,10 +229,6 @@ The following sysctls are available for the XFS filesystem: ISGID bit is cleared if the irix_sgid_inherit compatibility sysctl is set. - fs.xfs.restrict_chown (Min: 0 Default: 1 Max: 1) - Controls whether unprivileged users can use chown to "give away" - a file to another user. - fs.xfs.inherit_sync (Min: 0 Default: 1 Max: 1) Setting this to "1" will cause the "sync" flag set by the xfs_io(8) chattr command on a directory to be diff --git a/Documentation/filesystems/xip.txt b/Documentation/filesystems/xip.txt index 3cc4010521a..0466ee56927 100644 --- a/Documentation/filesystems/xip.txt +++ b/Documentation/filesystems/xip.txt @@ -39,10 +39,11 @@ The block device operation is optional, these block devices support it as of today: - dcssblk: s390 dcss block device driver -An address space operation named get_xip_page is used to retrieve reference -to a struct page. To address the target page, a reference to an address_space, -and a sector number is provided. A 3rd argument indicates whether the -function should allocate blocks if needed. +An address space operation named get_xip_mem is used to retrieve references +to a page frame number and a kernel address. To obtain these values a reference +to an address_space is provided. This function assigns values to the kmem and +pfn parameters. The third argument indicates whether the function should allocate +blocks if needed. This address space operation is mutually exclusive with readpage&writepage that do page cache read/write operations. diff --git a/Documentation/floppy.txt b/Documentation/floppy.txt deleted file mode 100644 index 6ccab88705c..00000000000 --- a/Documentation/floppy.txt +++ /dev/null @@ -1,245 +0,0 @@ -This file describes the floppy driver. - -FAQ list: -========= - - A FAQ list may be found in the fdutils package (see below), and also -at . - - -LILO configuration options (Thinkpad users, read this) -====================================================== - - The floppy driver is configured using the 'floppy=' option in -lilo. This option can be typed at the boot prompt, or entered in the -lilo configuration file. - - Example: If your kernel is called linux-2.6.9, type the following line -at the lilo boot prompt (if you have a thinkpad): - - linux-2.6.9 floppy=thinkpad - -You may also enter the following line in /etc/lilo.conf, in the description -of linux-2.6.9: - - append = "floppy=thinkpad" - - Several floppy related options may be given, example: - - linux-2.6.9 floppy=daring floppy=two_fdc - append = "floppy=daring floppy=two_fdc" - - If you give options both in the lilo config file and on the boot -prompt, the option strings of both places are concatenated, the boot -prompt options coming last. That's why there are also options to -restore the default behavior. - - -Module configuration options -============================ - - If you use the floppy driver as a module, use the following syntax: -modprobe floppy - -Example: - modprobe floppy omnibook messages - - If you need certain options enabled every time you load the floppy driver, -you can put: - - options floppy omnibook messages - -in /etc/modprobe.conf. - - - The floppy driver related options are: - - floppy=asus_pci - Sets the bit mask to allow only units 0 and 1. (default) - - floppy=daring - Tells the floppy driver that you have a well behaved floppy controller. - This allows more efficient and smoother operation, but may fail on - certain controllers. This may speed up certain operations. - - floppy=0,daring - Tells the floppy driver that your floppy controller should be used - with caution. - - floppy=one_fdc - Tells the floppy driver that you have only one floppy controller. - (default) - - floppy=two_fdc - floppy=
,two_fdc - Tells the floppy driver that you have two floppy controllers. - The second floppy controller is assumed to be at
. - This option is not needed if the second controller is at address - 0x370, and if you use the 'cmos' option. - - floppy=thinkpad - Tells the floppy driver that you have a Thinkpad. Thinkpads use an - inverted convention for the disk change line. - - floppy=0,thinkpad - Tells the floppy driver that you don't have a Thinkpad. - - floppy=omnibook - floppy=nodma - Tells the floppy driver not to use Dma for data transfers. - This is needed on HP Omnibooks, which don't have a workable - DMA channel for the floppy driver. This option is also useful - if you frequently get "Unable to allocate DMA memory" messages. - Indeed, dma memory needs to be continuous in physical memory, - and is thus harder to find, whereas non-dma buffers may be - allocated in virtual memory. However, I advise against this if - you have an FDC without a FIFO (8272A or 82072). 82072A and - later are OK. You also need at least a 486 to use nodma. - If you use nodma mode, I suggest you also set the FIFO - threshold to 10 or lower, in order to limit the number of data - transfer interrupts. - - If you have a FIFO-able FDC, the floppy driver automatically - falls back on non DMA mode if no DMA-able memory can be found. - If you want to avoid this, explicitly ask for 'yesdma'. - - floppy=yesdma - Tells the floppy driver that a workable DMA channel is available. - (default) - - floppy=nofifo - Disables the FIFO entirely. This is needed if you get "Bus - master arbitration error" messages from your Ethernet card (or - from other devices) while accessing the floppy. - - floppy=usefifo - Enables the FIFO. (default) - - floppy=,fifo_depth - Sets the FIFO threshold. This is mostly relevant in DMA - mode. If this is higher, the floppy driver tolerates more - interrupt latency, but it triggers more interrupts (i.e. it - imposes more load on the rest of the system). If this is - lower, the interrupt latency should be lower too (faster - processor). The benefit of a lower threshold is less - interrupts. - - To tune the fifo threshold, switch on over/underrun messages - using 'floppycontrol --messages'. Then access a floppy - disk. If you get a huge amount of "Over/Underrun - retrying" - messages, then the fifo threshold is too low. Try with a - higher value, until you only get an occasional Over/Underrun. - It is a good idea to compile the floppy driver as a module - when doing this tuning. Indeed, it allows to try different - fifo values without rebooting the machine for each test. Note - that you need to do 'floppycontrol --messages' every time you - re-insert the module. - - Usually, tuning the fifo threshold should not be needed, as - the default (0xa) is reasonable. - - floppy=,,cmos - Sets the CMOS type of to . This is mandatory if - you have more than two floppy drives (only two can be - described in the physical CMOS), or if your BIOS uses - non-standard CMOS types. The CMOS types are: - - 0 - Use the value of the physical CMOS - 1 - 5 1/4 DD - 2 - 5 1/4 HD - 3 - 3 1/2 DD - 4 - 3 1/2 HD - 5 - 3 1/2 ED - 6 - 3 1/2 ED - 16 - unknown or not installed - - (Note: there are two valid types for ED drives. This is because 5 was - initially chosen to represent floppy *tapes*, and 6 for ED drives. - AMI ignored this, and used 5 for ED drives. That's why the floppy - driver handles both.) - - floppy=unexpected_interrupts - Print a warning message when an unexpected interrupt is received. - (default) - - floppy=no_unexpected_interrupts - floppy=L40SX - Don't print a message when an unexpected interrupt is received. This - is needed on IBM L40SX laptops in certain video modes. (There seems - to be an interaction between video and floppy. The unexpected - interrupts affect only performance, and can be safely ignored.) - - floppy=broken_dcl - Don't use the disk change line, but assume that the disk was - changed whenever the device node is reopened. Needed on some - boxes where the disk change line is broken or unsupported. - This should be regarded as a stopgap measure, indeed it makes - floppy operation less efficient due to unneeded cache - flushings, and slightly more unreliable. Please verify your - cable, connection and jumper settings if you have any DCL - problems. However, some older drives, and also some laptops - are known not to have a DCL. - - floppy=debug - Print debugging messages. - - floppy=messages - Print informational messages for some operations (disk change - notifications, warnings about over and underruns, and about - autodetection). - - floppy=silent_dcl_clear - Uses a less noisy way to clear the disk change line (which - doesn't involve seeks). Implied by 'daring' option. - - floppy=,irq - Sets the floppy IRQ to instead of 6. - - floppy=,dma - Sets the floppy DMA channel to instead of 2. - - floppy=slow - Use PS/2 stepping rate: - " PS/2 floppies have much slower step rates than regular floppies. - It's been recommended that take about 1/4 of the default speed - in some more extreme cases." - - -Supporting utilities and additional documentation: -================================================== - - Additional parameters of the floppy driver can be configured at -runtime. Utilities which do this can be found in the fdutils package. -This package also contains a new version of mtools which allows to -access high capacity disks (up to 1992K on a high density 3 1/2 disk!). -It also contains additional documentation about the floppy driver. - -The latest version can be found at fdutils homepage: - http://fdutils.linux.lu - -The fdutils releases can be found at: - http://fdutils.linux.lu/download.html - http://www.tux.org/pub/knaff/fdutils/ - ftp://metalab.unc.edu/pub/Linux/utils/disk-management/ - -Reporting problems about the floppy driver -========================================== - - If you have a question or a bug report about the floppy driver, mail -me at Alain.Knaff@poboxes.com . If you post to Usenet, preferably use -comp.os.linux.hardware. As the volume in these groups is rather high, -be sure to include the word "floppy" (or "FLOPPY") in the subject -line. If the reported problem happens when mounting floppy disks, be -sure to mention also the type of the filesystem in the subject line. - - Be sure to read the FAQ before mailing/posting any bug reports! - - Alain - -Changelog -========= - -10-30-2004 : Cleanup, updating, add reference to module configuration. - James Nelson - -6-3-2000 : Original Document diff --git a/Documentation/ftrace.txt b/Documentation/ftrace.txt index ea5a827395d..803b1318b13 100644 --- a/Documentation/ftrace.txt +++ b/Documentation/ftrace.txt @@ -8,7 +8,7 @@ Copyright 2008 Red Hat Inc. Reviewers: Elias Oltmanns, Randy Dunlap, Andrew Morton, John Kacur, and David Teigland. -Written for: 2.6.27-rc1 +Written for: 2.6.28-rc2 Introduction ------------ @@ -50,26 +50,26 @@ of ftrace. Here is a list of some of the key files: Note: all time values are in microseconds. - current_tracer : This is used to set or display the current tracer + current_tracer: This is used to set or display the current tracer that is configured. - available_tracers : This holds the different types of tracers that + available_tracers: This holds the different types of tracers that have been compiled into the kernel. The tracers listed here can be configured by echoing their name into current_tracer. - tracing_enabled : This sets or displays whether the current_tracer + tracing_enabled: This sets or displays whether the current_tracer is activated and tracing or not. Echo 0 into this file to disable the tracer or 1 to enable it. - trace : This file holds the output of the trace in a human readable + trace: This file holds the output of the trace in a human readable format (described below). - latency_trace : This file shows the same trace but the information + latency_trace: This file shows the same trace but the information is organized more to display possible latencies in the system (described below). - trace_pipe : The output is the same as the "trace" file but this + trace_pipe: The output is the same as the "trace" file but this file is meant to be streamed with live tracing. Reads from this file will block until new data is retrieved. Unlike the "trace" and "latency_trace" @@ -82,11 +82,11 @@ of ftrace. Here is a list of some of the key files: tracer is not adding more data, they will display the same information every time they are read. - iter_ctrl : This file lets the user control the amount of data + trace_options: This file lets the user control the amount of data that is displayed in one of the above output files. - trace_max_latency : Some of the tracers record the max latency. + trace_max_latency: Some of the tracers record the max latency. For example, the time interrupts are disabled. This time is saved in this file. The max trace will also be stored, and displayed by either @@ -94,29 +94,26 @@ of ftrace. Here is a list of some of the key files: only be recorded if the latency is greater than the value in this file. (in microseconds) - trace_entries : This sets or displays the number of trace - entries each CPU buffer can hold. The tracer buffers - are the same size for each CPU. The displayed number - is the size of the CPU buffer and not total size. The + buffer_size_kb: This sets or displays the number of kilobytes each CPU + buffer can hold. The tracer buffers are the same size + for each CPU. The displayed number is the size of the + CPU buffer and not total size of all buffers. The trace buffers are allocated in pages (blocks of memory that the kernel uses for allocation, usually 4 KB in size). - Since each entry is smaller than a page, if the last - allocated page has room for more entries than were - requested, the rest of the page is used to allocate - entries. + If the last page allocated has room for more bytes + than requested, the rest of the page will be used, + making the actual allocation bigger than requested. + (Note, the size may not be a multiple of the page size due + to buffer managment overhead.) This can only be updated when the current_tracer - is set to "none". + is set to "nop". - NOTE: It is planned on changing the allocated buffers - from being the number of possible CPUS to - the number of online CPUS. - - tracing_cpumask : This is a mask that lets the user only trace + tracing_cpumask: This is a mask that lets the user only trace on specified CPUS. The format is a hex string representing the CPUS. - set_ftrace_filter : When dynamic ftrace is configured in (see the + set_ftrace_filter: When dynamic ftrace is configured in (see the section below "dynamic ftrace"), the code is dynamically modified (code text rewrite) to disable calling of the function profiler (mcount). This lets tracing be configured @@ -130,14 +127,13 @@ of ftrace. Here is a list of some of the key files: be traced. If a function exists in both set_ftrace_filter and set_ftrace_notrace, the function will _not_ be traced. - available_filter_functions : When a function is encountered the first - time by the dynamic tracer, it is recorded and - later the call is converted into a nop. This file - lists the functions that have been recorded - by the dynamic tracer and these functions can - be used to set the ftrace filter by the above - "set_ftrace_filter" file. (See the section "dynamic ftrace" - below for more details). + set_ftrace_pid: Have the function tracer only trace a single thread. + + available_filter_functions: This lists the functions that ftrace + has processed and can trace. These are the function + names that you can pass to "set_ftrace_filter" or + "set_ftrace_notrace". (See the section "dynamic ftrace" + below for more details.) The Tracers @@ -145,7 +141,7 @@ The Tracers Here is the list of current tracers that may be configured. - ftrace - function tracer that uses mcount to trace all functions. + function - function tracer that uses mcount to trace all functions. sched_switch - traces the context switches between tasks. @@ -166,8 +162,8 @@ Here is the list of current tracers that may be configured. the highest priority task to get scheduled after it has been woken up. - none - This is not a tracer. To remove all tracers from tracing - simply echo "none" into current_tracer. + nop - This is not a tracer. To remove all tracers from tracing + simply echo "nop" into current_tracer. Examples of using the tracer @@ -182,7 +178,7 @@ Output format: Here is an example of the output format of the file "trace" -------- -# tracer: ftrace +# tracer: function # # TASK-PID CPU# TIMESTAMP FUNCTION # | | | | | @@ -192,7 +188,7 @@ Here is an example of the output format of the file "trace" -------- A header is printed with the tracer name that is represented by the trace. -In this case the tracer is "ftrace". Then a header showing the format. Task +In this case the tracer is "function". Then a header showing the format. Task name "bash", the task PID "4251", the CPU that it was running on "01", the timestamp in . format, the function name that was traced "path_put" and the parent function that called this function @@ -322,23 +318,23 @@ The above is mostly meaningful for kernel developers. The rest is the same as the 'trace' file. -iter_ctrl ---------- +trace_options +------------- -The iter_ctrl file is used to control what gets printed in the trace +The trace_options file is used to control what gets printed in the trace output. To see what is available, simply cat the file: - cat /debug/tracing/iter_ctrl + cat /debug/tracing/trace_options print-parent nosym-offset nosym-addr noverbose noraw nohex nobin \ - noblock nostacktrace nosched-tree + noblock nostacktrace nosched-tree nouserstacktrace nosym-userobj To disable one of the options, echo in the option prepended with "no". - echo noprint-parent > /debug/tracing/iter_ctrl + echo noprint-parent > /debug/tracing/trace_options To enable an option, leave off the "no". - echo sym-offset > /debug/tracing/iter_ctrl + echo sym-offset > /debug/tracing/trace_options Here are the available options: @@ -384,6 +380,20 @@ Here are the available options: When a trace is recorded, so is the stack of functions. This allows for back traces of trace sites. + userstacktrace - This option changes the trace. + It records a stacktrace of the current userspace thread. + + sym-userobj - when user stacktrace are enabled, look up which object the + address belongs to, and print a relative address + This is especially useful when ASLR is on, otherwise you don't + get a chance to resolve the address to object/file/line after the app is no + longer running + + The lookup is performed when you read trace,trace_pipe,latency_trace. Example: + + a.out-1623 [000] 40874.465068: /root/a.out[+0x480] <-/root/a.out[+0 +x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6] + sched-tree - TBD (any users??) @@ -1003,22 +1013,20 @@ is the stack for the hard interrupt. This hides the fact that NEED_RESCHED has been set. We do not see the 'N' until we switch back to the task's assigned stack. -ftrace ------- +function +-------- -ftrace is not only the name of the tracing infrastructure, but it -is also a name of one of the tracers. The tracer is the function -tracer. Enabling the function tracer can be done from the -debug file system. Make sure the ftrace_enabled is set otherwise -this tracer is a nop. +This tracer is the function tracer. Enabling the function tracer +can be done from the debug file system. Make sure the ftrace_enabled is +set; otherwise this tracer is a nop. # sysctl kernel.ftrace_enabled=1 - # echo ftrace > /debug/tracing/current_tracer + # echo function > /debug/tracing/current_tracer # echo 1 > /debug/tracing/tracing_enabled # usleep 1 # echo 0 > /debug/tracing/tracing_enabled # cat /debug/tracing/trace -# tracer: ftrace +# tracer: function # # TASK-PID CPU# TIMESTAMP FUNCTION # | | | | | @@ -1040,10 +1048,10 @@ this tracer is a nop. [...] -Note: ftrace uses ring buffers to store the above entries. The newest data -may overwrite the oldest data. Sometimes using echo to stop the trace -is not sufficient because the tracing could have overwritten the data -that you wanted to record. For this reason, it is sometimes better to +Note: function tracer uses ring buffers to store the above entries. +The newest data may overwrite the oldest data. Sometimes using echo to +stop the trace is not sufficient because the tracing could have overwritten +the data that you wanted to record. For this reason, it is sometimes better to disable tracing directly from a program. This allows you to stop the tracing at the point that you hit the part that you are interested in. To disable the tracing directly from a C program, something like following @@ -1067,6 +1075,83 @@ For simple one time traces, the above is sufficent. For anything else, a search through /proc/mounts may be needed to find where the debugfs file-system is mounted. + +Single thread tracing +--------------------- + +By writing into /debug/tracing/set_ftrace_pid you can trace a +single thread. For example: + +# cat /debug/tracing/set_ftrace_pid +no pid +# echo 3111 > /debug/tracing/set_ftrace_pid +# cat /debug/tracing/set_ftrace_pid +3111 +# echo function > /debug/tracing/current_tracer +# cat /debug/tracing/trace | head + # tracer: function + # + # TASK-PID CPU# TIMESTAMP FUNCTION + # | | | | | + yum-updatesd-3111 [003] 1637.254676: finish_task_switch <-thread_return + yum-updatesd-3111 [003] 1637.254681: hrtimer_cancel <-schedule_hrtimeout_range + yum-updatesd-3111 [003] 1637.254682: hrtimer_try_to_cancel <-hrtimer_cancel + yum-updatesd-3111 [003] 1637.254683: lock_hrtimer_base <-hrtimer_try_to_cancel + yum-updatesd-3111 [003] 1637.254685: fget_light <-do_sys_poll + yum-updatesd-3111 [003] 1637.254686: pipe_poll <-do_sys_poll +# echo -1 > /debug/tracing/set_ftrace_pid +# cat /debug/tracing/trace |head + # tracer: function + # + # TASK-PID CPU# TIMESTAMP FUNCTION + # | | | | | + ##### CPU 3 buffer started #### + yum-updatesd-3111 [003] 1701.957688: free_poll_entry <-poll_freewait + yum-updatesd-3111 [003] 1701.957689: remove_wait_queue <-free_poll_entry + yum-updatesd-3111 [003] 1701.957691: fput <-free_poll_entry + yum-updatesd-3111 [003] 1701.957692: audit_syscall_exit <-sysret_audit + yum-updatesd-3111 [003] 1701.957693: path_put <-audit_syscall_exit + +If you want to trace a function when executing, you could use +something like this simple program: + +#include +#include +#include +#include +#include +#include + +int main (int argc, char **argv) +{ + if (argc < 1) + exit(-1); + + if (fork() > 0) { + int fd, ffd; + char line[64]; + int s; + + ffd = open("/debug/tracing/current_tracer", O_WRONLY); + if (ffd < 0) + exit(-1); + write(ffd, "nop", 3); + + fd = open("/debug/tracing/set_ftrace_pid", O_WRONLY); + s = sprintf(line, "%d\n", getpid()); + write(fd, line, s); + + write(ffd, "function", 8); + + close(fd); + close(ffd); + + execvp(argv[1], argv+1); + } + + return 0; +} + dynamic ftrace -------------- @@ -1077,18 +1162,31 @@ every kernel function, produced by the -pg switch in gcc), starts of pointing to a simple return. (Enabling FTRACE will include the -pg switch in the compiling of the kernel.) -When dynamic ftrace is initialized, it calls kstop_machine to make -the machine act like a uniprocessor so that it can freely modify code -without worrying about other processors executing that same code. At -initialization, the mcount calls are changed to call a "record_ip" -function. After this, the first time a kernel function is called, -it has the calling address saved in a hash table. - -Later on the ftraced kernel thread is awoken and will again call -kstop_machine if new functions have been recorded. The ftraced thread -will change all calls to mcount to "nop". Just calling mcount -and having mcount return has shown a 10% overhead. By converting -it to a nop, there is no measurable overhead to the system. +At compile time every C file object is run through the +recordmcount.pl script (located in the scripts directory). This +script will process the C object using objdump to find all the +locations in the .text section that call mcount. (Note, only +the .text section is processed, since processing other sections +like .init.text may cause races due to those sections being freed). + +A new section called "__mcount_loc" is created that holds references +to all the mcount call sites in the .text section. This section is +compiled back into the original object. The final linker will add +all these references into a single table. + +On boot up, before SMP is initialized, the dynamic ftrace code +scans this table and updates all the locations into nops. It also +records the locations, which are added to the available_filter_functions +list. Modules are processed as they are loaded and before they are +executed. When a module is unloaded, it also removes its functions from +the ftrace function list. This is automatic in the module unload +code, and the module author does not need to worry about it. + +When tracing is enabled, kstop_machine is called to prevent races +with the CPUS executing code being modified (which can cause the +CPU to do undesireable things), and the nops are patched back +to calls. But this time, they do not call mcount (which is just +a function stub). They now call into the ftrace infrastructure. One special side-effect to the recording of the functions being traced is that we can now selectively choose which functions we @@ -1153,7 +1251,11 @@ These are the only wild cards which are supported. * will not work. - # echo hrtimer_* > /debug/tracing/set_ftrace_filter +Note: It is better to use quotes to enclose the wild cards, otherwise + the shell may expand the parameters into names of files in the local + directory. + + # echo 'hrtimer_*' > /debug/tracing/set_ftrace_filter Produces: @@ -1208,7 +1310,7 @@ Again, now we want to append. # echo sys_nanosleep > /debug/tracing/set_ftrace_filter # cat /debug/tracing/set_ftrace_filter sys_nanosleep - # echo hrtimer_* >> /debug/tracing/set_ftrace_filter + # echo 'hrtimer_*' >> /debug/tracing/set_ftrace_filter # cat /debug/tracing/set_ftrace_filter hrtimer_run_queues hrtimer_run_pending @@ -1251,36 +1353,6 @@ Produces: We can see that there's no more lock or preempt tracing. -ftraced -------- - -As mentioned above, when dynamic ftrace is configured in, a kernel -thread wakes up once a second and checks to see if there are mcount -calls that need to be converted into nops. If there are not any, then -it simply goes back to sleep. But if there are some, it will call -kstop_machine to convert the calls to nops. - -There may be a case in which you do not want this added latency. -Perhaps you are doing some audio recording and this activity might -cause skips in the playback. There is an interface to disable -and enable the "ftraced" kernel thread. - - # echo 0 > /debug/tracing/ftraced_enabled - -This will disable the calling of kstop_machine to update the -mcount calls to nops. Remember that there is a large overhead -to calling mcount. Without this kernel thread, that overhead will -exist. - -If there are recorded calls to mcount, any write to the ftraced_enabled -file will cause the kstop_machine to run. This means that a -user can manually perform the updates when they want to by simply -echoing a '0' into the ftraced_enabled file. - -The updates are also done at the beginning of enabling a tracer -that uses ftrace function recording. - - trace_pipe ---------- @@ -1289,14 +1361,14 @@ on the tracing is different. Every read from trace_pipe is consumed. This means that subsequent reads will be different. The trace is live. - # echo ftrace > /debug/tracing/current_tracer + # echo function > /debug/tracing/current_tracer # cat /debug/tracing/trace_pipe > /tmp/trace.out & [1] 4153 # echo 1 > /debug/tracing/tracing_enabled # usleep 1 # echo 0 > /debug/tracing/tracing_enabled # cat /debug/tracing/trace -# tracer: ftrace +# tracer: function # # TASK-PID CPU# TIMESTAMP FUNCTION # | | | | | @@ -1317,48 +1389,36 @@ is live. Note, reading the trace_pipe file will block until more input is added. By changing the tracer, trace_pipe will issue an EOF. We needed -to set the ftrace tracer _before_ cating the trace_pipe file. +to set the function tracer _before_ we "cat" the trace_pipe file. trace entries ------------- Having too much or not enough data can be troublesome in diagnosing -an issue in the kernel. The file trace_entries is used to modify +an issue in the kernel. The file buffer_size_kb is used to modify the size of the internal trace buffers. The number listed is the number of entries that can be recorded per CPU. To know the full size, multiply the number of possible CPUS with the number of entries. - # cat /debug/tracing/trace_entries -65620 + # cat /debug/tracing/buffer_size_kb +1408 (units kilobytes) Note, to modify this, you must have tracing completely disabled. To do that, -echo "none" into the current_tracer. If the current_tracer is not set -to "none", an EINVAL error will be returned. - - # echo none > /debug/tracing/current_tracer - # echo 100000 > /debug/tracing/trace_entries - # cat /debug/tracing/trace_entries -100045 - - -Notice that we echoed in 100,000 but the size is 100,045. The entries -are held in individual pages. It allocates the number of pages it takes -to fulfill the request. If more entries may fit on the last page -then they will be added. - - # echo 1 > /debug/tracing/trace_entries - # cat /debug/tracing/trace_entries -85 +echo "nop" into the current_tracer. If the current_tracer is not set +to "nop", an EINVAL error will be returned. -This shows us that 85 entries can fit in a single page. + # echo nop > /debug/tracing/current_tracer + # echo 10000 > /debug/tracing/buffer_size_kb + # cat /debug/tracing/buffer_size_kb +10000 (units kilobytes) The number of pages which will be allocated is limited to a percentage of available memory. Allocating too much will produce an error. - # echo 1000000000000 > /debug/tracing/trace_entries + # echo 1000000000000 > /debug/tracing/buffer_size_kb -bash: echo: write error: Cannot allocate memory - # cat /debug/tracing/trace_entries + # cat /debug/tracing/buffer_size_kb 85 diff --git a/Documentation/hayes-esp.txt b/Documentation/hayes-esp.txt deleted file mode 100644 index 09b5d585675..00000000000 --- a/Documentation/hayes-esp.txt +++ /dev/null @@ -1,154 +0,0 @@ -HAYES ESP DRIVER VERSION 2.1 - -A big thanks to the people at Hayes, especially Alan Adamson. Their support -has enabled me to provide enhancements to the driver. - -Please report your experiences with this driver to me (arobinso@nyx.net). I -am looking for both positive and negative feedback. - -*** IMPORTANT CHANGES FOR 2.1 *** -Support for PIO mode. Five situations will cause PIO mode to be used: -1) A multiport card is detected. PIO mode will always be used. (8 port cards -do not support DMA). -2) The DMA channel is set to an invalid value (anything other than 1 or 3). -3) The DMA buffer/channel could not be allocated. The port will revert to PIO -mode until it is reopened. -4) Less than a specified number of bytes need to be transferred to/from the -FIFOs. PIO mode will be used for that transfer only. -5) A port needs to do a DMA transfer and another port is already using the -DMA channel. PIO mode will be used for that transfer only. - -Since the Hayes ESP seems to conflict with other cards (notably sound cards) -when using DMA, DMA is turned off by default. To use DMA, it must be turned -on explicitly, either with the "dma=" option described below or with -setserial. A multiport card can be forced into DMA mode by using setserial; -however, most multiport cards don't support DMA. - -The latest version of setserial allows the enhanced configuration of the ESP -card to be viewed and modified. -*** - -This package contains the files needed to compile a module to support the Hayes -ESP card. The drivers are basically a modified version of the serial drivers. - -Features: - -- Uses the enhanced mode of the ESP card, allowing a wider range of - interrupts and features than compatibility mode -- Uses DMA and 16 bit PIO mode to transfer data to and from the ESP's FIFOs, - reducing CPU load -- Supports primary and secondary ports - - -If the driver is compiled as a module, the IRQs to use can be specified by -using the irq= option. The format is: - -irq=[0x100],[0x140],[0x180],[0x200],[0x240],[0x280],[0x300],[0x380] - -The address in brackets is the base address of the card. The IRQ of -nonexistent cards can be set to 0. If an IRQ of a card that does exist is set -to 0, the driver will attempt to guess at the correct IRQ. For example, to set -the IRQ of the card at address 0x300 to 12, the insmod command would be: - -insmod esp irq=0,0,0,0,0,0,12,0 - -The custom divisor can be set by using the divisor= option. The format is the -same as for the irq= option. Each divisor value is a series of hex digits, -with each digit representing the divisor to use for a corresponding port. The -divisor value is constructed RIGHT TO LEFT. Specifying a nonzero divisor value -will automatically set the spd_cust flag. To calculate the divisor to use for -a certain baud rate, divide the port's base baud (generally 921600) by the -desired rate. For example, to set the divisor of the primary port at 0x300 to -4 and the divisor of the secondary port at 0x308 to 8, the insmod command would -be: - -insmod esp divisor=0,0,0,0,0,0,0x84,0 - -The dma= option can be used to set the DMA channel. The channel can be either -1 or 3. Specifying any other value will force the driver to use PIO mode. -For example, to set the DMA channel to 3, the insmod command would be: - -insmod esp dma=3 - -The rx_trigger= and tx_trigger= options can be used to set the FIFO trigger -levels. They specify when the ESP card should send an interrupt. Larger -values will decrease the number of interrupts; however, a value too high may -result in data loss. Valid values are 1 through 1023, with 768 being the -default. For example, to set the receive trigger level to 512 bytes and the -transmit trigger level to 700 bytes, the insmod command would be: - -insmod esp rx_trigger=512 tx_trigger=700 - -The flow_off= and flow_on= options can be used to set the hardware flow off/ -flow on levels. The flow on level must be lower than the flow off level, and -the flow off level should be higher than rx_trigger. Valid values are 1 -through 1023, with 1016 being the default flow off level and 944 being the -default flow on level. For example, to set the flow off level to 1000 bytes -and the flow on level to 935 bytes, the insmod command would be: - -insmod esp flow_off=1000 flow_on=935 - -The rx_timeout= option can be used to set the receive timeout value. This -value indicates how long after receiving the last character that the ESP card -should wait before signalling an interrupt. Valid values are 0 though 255, -with 128 being the default. A value too high will increase latency, and a -value too low will cause unnecessary interrupts. For example, to set the -receive timeout to 255, the insmod command would be: - -insmod esp rx_timeout=255 - -The pio_threshold= option sets the threshold (in number of characters) for -using PIO mode instead of DMA mode. For example, if this value is 32, -transfers of 32 bytes or less will always use PIO mode. - -insmod esp pio_threshold=32 - -Multiple options can be listed on the insmod command line by separating each -option with a space. For example: - -insmod esp dma=3 trigger=512 - -The esp module can be automatically loaded when needed. To cause this to -happen, add the following lines to /etc/modprobe.conf (replacing the last line -with options for your configuration): - -alias char-major-57 esp -alias char-major-58 esp -options esp irq=0,0,0,0,0,0,3,0 divisor=0,0,0,0,0,0,0x4,0 - -You may also need to run 'depmod -a'. - -Devices must be created manually. To create the devices, note the output from -the module after it is inserted. The output will appear in the location where -kernel messages usually appear (usually /var/adm/messages). Create two devices -for each 'tty' mentioned, one with major of 57 and the other with major of 58. -The minor number should be the same as the tty number reported. The commands -would be (replace ? with the tty number): - -mknod /dev/ttyP? c 57 ? -mknod /dev/cup? c 58 ? - -For example, if the following line appears: - -Oct 24 18:17:23 techno kernel: ttyP8 at 0x0140 (irq = 3) is an ESP primary port - -...two devices should be created: - -mknod /dev/ttyP8 c 57 8 -mknod /dev/cup8 c 58 8 - -You may need to set the permissions on the devices: - -chmod 666 /dev/ttyP* -chmod 666 /dev/cup* - -The ESP module and the serial module should not conflict (they can be used at -the same time). After the ESP module has been loaded the ports on the ESP card -will no longer be accessible by the serial driver. - -If I/O errors are experienced when accessing the port, check for IRQ and DMA -conflicts ('cat /proc/interrupts' and 'cat /proc/dma' for a list of IRQs and -DMAs currently in use). - -Enjoy! -Andrew J. Robinson diff --git a/Documentation/hwmon/abituguru-datasheet b/Documentation/hwmon/abituguru-datasheet index aef5a9b3684..d9251efdcec 100644 --- a/Documentation/hwmon/abituguru-datasheet +++ b/Documentation/hwmon/abituguru-datasheet @@ -74,7 +74,7 @@ a sensor. Notice that some banks have both a read and a write address this is how the uGuru determines if a read from or a write to the bank is taking place, thus when reading you should always use the read address and when writing the -write address. The write address is always one (1) more then the read address. +write address. The write address is always one (1) more than the read address. uGuru ready @@ -121,7 +121,7 @@ Once all bytes have been read data will hold 0x09, but there is no reason to test for this. Notice that the number of bytes is bank address dependent see above and below. -After completing a successfull read it is advised to put the uGuru back in +After completing a successful read it is advised to put the uGuru back in ready mode, so that it is ready for the next read / write cycle. This way if your program / driver is unloaded and later loaded again the detection algorithm described above will still work. @@ -141,7 +141,7 @@ don't ask why this is the way it is. Once DATA holds 0x01 read CMD it should hold 0xAC now. -After completing a successfull write it is advised to put the uGuru back in +After completing a successful write it is advised to put the uGuru back in ready mode, so that it is ready for the next read / write cycle. This way if your program / driver is unloaded and later loaded again the detection algorithm described above will still work. @@ -224,7 +224,7 @@ Bit 3: Beep if alarm (RW) Bit 4: 1 if alarm cause measured temp is over the warning threshold (R) Bit 5: 1 if alarm cause measured volt is over the max threshold (R) Bit 6: 1 if alarm cause measured volt is under the min threshold (R) -Bit 7: Volt sensor: Shutdown if alarm persist for more then 4 seconds (RW) +Bit 7: Volt sensor: Shutdown if alarm persist for more than 4 seconds (RW) Temp sensor: Shutdown if temp is over the shutdown threshold (RW) * This bit is only honored/used by the uGuru if a temp sensor is connected @@ -293,7 +293,7 @@ Byte 0: Alarm behaviour for the selected sensor. A 1 enables the described behaviour. Bit 0: Give an alarm if measured rpm is under the min threshold (RW) Bit 3: Beep if alarm (RW) -Bit 7: Shutdown if alarm persist for more then 4 seconds (RW) +Bit 7: Shutdown if alarm persist for more than 4 seconds (RW) Byte 1: min threshold (scale as bank 0x26) diff --git a/Documentation/hwmon/adt7462 b/Documentation/hwmon/adt7462 new file mode 100644 index 00000000000..ec660b32827 --- /dev/null +++ b/Documentation/hwmon/adt7462 @@ -0,0 +1,67 @@ +Kernel driver adt7462 +====================== + +Supported chips: + * Analog Devices ADT7462 + Prefix: 'adt7462' + Addresses scanned: I2C 0x58, 0x5C + Datasheet: Publicly available at the Analog Devices website + +Author: Darrick J. Wong + +Description +----------- + +This driver implements support for the Analog Devices ADT7462 chip family. + +This chip is a bit of a beast. It has 8 counters for measuring fan speed. It +can also measure 13 voltages or 4 temperatures, or various combinations of the +two. See the chip documentation for more details about the exact set of +configurations. This driver does not allow one to configure the chip; that is +left to the system designer. + +A sophisticated control system for the PWM outputs is designed into the ADT7462 +that allows fan speed to be adjusted automatically based on any of the three +temperature sensors. Each PWM output is individually adjustable and +programmable. Once configured, the ADT7462 will adjust the PWM outputs in +response to the measured temperatures without further host intervention. This +feature can also be disabled for manual control of the PWM's. + +Each of the measured inputs (voltage, temperature, fan speed) has +corresponding high/low limit values. The ADT7462 will signal an ALARM if +any measured value exceeds either limit. + +The ADT7462 samples all inputs continuously. The driver will not read +the registers more often than once every other second. Further, +configuration data is only read once per minute. + +Special Features +---------------- + +The ADT7462 have a 10-bit ADC and can therefore measure temperatures +with 0.25 degC resolution. + +The Analog Devices datasheet is very detailed and describes a procedure for +determining an optimal configuration for the automatic PWM control. + +The driver will report sensor labels when it is able to determine that +information from the configuration registers. + +Configuration Notes +------------------- + +Besides standard interfaces driver adds the following: + +* PWM Control + +* pwm#_auto_point1_pwm and temp#_auto_point1_temp and +* pwm#_auto_point2_pwm and temp#_auto_point2_temp - + +point1: Set the pwm speed at a lower temperature bound. +point2: Set the pwm speed at a higher temperature bound. + +The ADT7462 will scale the pwm between the lower and higher pwm speed when +the temperature is between the two temperature boundaries. PWM values range +from 0 (off) to 255 (full speed). Fan speed will be set to maximum when the +temperature sensor associated with the PWM control exceeds temp#_max. + diff --git a/Documentation/hwmon/adt7470 b/Documentation/hwmon/adt7470 index 75d13ca147c..8ce4aa0a0f5 100644 --- a/Documentation/hwmon/adt7470 +++ b/Documentation/hwmon/adt7470 @@ -31,15 +31,11 @@ Each of the measured inputs (temperature, fan speed) has corresponding high/low limit values. The ADT7470 will signal an ALARM if any measured value exceeds either limit. -The ADT7470 DOES NOT sample all inputs continuously. A single pin on the -ADT7470 is connected to a multitude of thermal diodes, but the chip must be -instructed explicitly to read the multitude of diodes. If you want to use -automatic fan control mode, you must manually read any of the temperature -sensors or the fan control algorithm will not run. The chip WILL NOT DO THIS -AUTOMATICALLY; this must be done from userspace. This may be a bug in the chip -design, given that many other AD chips take care of this. The driver will not -read the registers more often than once every 5 seconds. Further, -configuration data is only read once per minute. +The ADT7470 samples all inputs continuously. A kernel thread is started up for +the purpose of periodically querying the temperature sensors, thus allowing the +automatic fan pwm control to set the fan speed. The driver will not read the +registers more often than once every 5 seconds. Further, configuration data is +only read once per minute. Special Features ---------------- @@ -72,5 +68,6 @@ pwm#_auto_point2_temp. Notes ----- -As stated above, the temperature inputs must be read periodically from -userspace in order for the automatic pwm algorithm to run. +The temperature inputs no longer need to be read periodically from userspace in +order for the automatic pwm algorithm to run. This was the case for earlier +versions of the driver. diff --git a/Documentation/hwmon/adt7475 b/Documentation/hwmon/adt7475 new file mode 100644 index 00000000000..a2b1abec850 --- /dev/null +++ b/Documentation/hwmon/adt7475 @@ -0,0 +1,87 @@ +This describes the interface for the ADT7475 driver: + +(there are 4 fans, numbered fan1 to fan4): + +fanX_input Read the current speed of the fan (in RPMs) +fanX_min Read/write the minimum speed of the fan. Dropping + below this sets an alarm. + +(there are three PWMs, numbered pwm1 to pwm3): + +pwmX Read/write the current duty cycle of the PWM. Writes + only have effect when auto mode is turned off (see + below). Range is 0 - 255. + +pwmX_enable Fan speed control method: + + 0 - No control (fan at full speed) + 1 - Manual fan speed control (using pwm[1-*]) + 2 - Automatic fan speed control + +pwmX_auto_channels_temp Select which channels affect this PWM + + 1 - TEMP1 controls PWM + 2 - TEMP2 controls PWM + 4 - TEMP3 controls PWM + 6 - TEMP2 and TEMP3 control PWM + 7 - All three inputs control PWM + +pwmX_freq Read/write the PWM frequency in Hz. The number + should be one of the following: + + 11 Hz + 14 Hz + 22 Hz + 29 Hz + 35 Hz + 44 Hz + 58 Hz + 88 Hz + +pwmX_auto_point1_pwm Read/write the minimum PWM duty cycle in automatic mode + +pwmX_auto_point2_pwm Read/write the maximum PWM duty cycle in automatic mode + +(there are three temperature settings numbered temp1 to temp3): + +tempX_input Read the current temperature. The value is in milli + degrees of Celsius. + +tempX_max Read/write the upper temperature limit - exceeding this + will cause an alarm. + +tempX_min Read/write the lower temperature limit - exceeding this + will cause an alarm. + +tempX_offset Read/write the temperature adjustment offset + +tempX_crit Read/write the THERM limit for remote1. + +tempX_crit_hyst Set the temperature value below crit where the + fans will stay on - this helps drive the temperature + low enough so it doesn't stay near the edge and + cause THERM to keep tripping. + +tempX_auto_point1_temp Read/write the minimum temperature where the fans will + turn on in automatic mode. + +tempX_auto_point2_temp Read/write the maximum temperature over which the fans + will run in automatic mode. tempX_auto_point1_temp + and tempX_auto_point2_temp together define the + range of automatic control. + +tempX_alarm Read a 1 if the max/min alarm is set +tempX_fault Read a 1 if either temp1 or temp3 diode has a fault + +(There are two voltage settings, in1 and in2): + +inX_input Read the current voltage on VCC. Value is in + millivolts. + +inX_min read/write the minimum voltage limit. + Dropping below this causes an alarm. + +inX_max read/write the maximum voltage limit. + Exceeding this causes an alarm. + +inX_alarm Read a 1 if the max/min alarm is set. diff --git a/Documentation/hwmon/f71882fg b/Documentation/hwmon/f71882fg new file mode 100644 index 00000000000..a8321267b5b --- /dev/null +++ b/Documentation/hwmon/f71882fg @@ -0,0 +1,89 @@ +Kernel driver f71882fg +====================== + +Supported chips: + * Fintek F71882FG and F71883FG + Prefix: 'f71882fg' + Addresses scanned: none, address read from Super I/O config space + Datasheet: Available from the Fintek website + * Fintek F71862FG and F71863FG + Prefix: 'f71862fg' + Addresses scanned: none, address read from Super I/O config space + Datasheet: Available from the Fintek website + * Fintek F8000 + Prefix: 'f8000' + Addresses scanned: none, address read from Super I/O config space + Datasheet: Not public + +Author: Hans de Goede + + +Description +----------- + +Fintek F718xxFG/F8000 Super I/O chips include complete hardware monitoring +capabilities. They can monitor up to 9 voltages (3 for the F8000), 4 fans and +3 temperature sensors. + +These chips also have fan controlling features, using either DC or PWM, in +three different modes (one manual, two automatic). + +The driver assumes that no more than one chip is present, which seems +reasonable. + + +Monitoring +---------- + +The Voltage, Fan and Temperature Monitoring uses the standard sysfs +interface as documented in sysfs-interface, without any exceptions. + + +Fan Control +----------- + +Both PWM (pulse-width modulation) and DC fan speed control methods are +supported. The right one to use depends on external circuitry on the +motherboard, so the driver assumes that the BIOS set the method +properly. + +There are 2 modes to specify the speed of the fan, PWM duty cycle (or DC +voltage) mode, where 0-100% duty cycle (0-100% of 12V) is specified. And RPM +mode where the actual RPM of the fan (as measured) is controlled and the speed +gets specified as 0-100% of the fan#_full_speed file. + +Since both modes work in a 0-100% (mapped to 0-255) scale, there isn't a +whole lot of a difference when modifying fan control settings. The only +important difference is that in RPM mode the 0-100% controls the fan speed +between 0-100% of fan#_full_speed. It is assumed that if the BIOS programs +RPM mode, it will also set fan#_full_speed properly, if it does not then +fan control will not work properly, unless you set a sane fan#_full_speed +value yourself. + +Switching between these modes requires re-initializing a whole bunch of +registers, so the mode which the BIOS has set is kept. The mode is +printed when loading the driver. + +Three different fan control modes are supported; the mode number is written +to the pwm#_enable file. Note that not all modes are supported on all +chips, and some modes may only be available in RPM / PWM mode on the F8000. +Writing an unsupported mode will result in an invalid parameter error. + +* 1: Manual mode + You ask for a specific PWM duty cycle / DC voltage or a specific % of + fan#_full_speed by writing to the pwm# file. This mode is only + available on the F8000 if the fan channel is in RPM mode. + +* 2: Normal auto mode + You can define a number of temperature/fan speed trip points, which % the + fan should run at at this temp and which temp a fan should follow using the + standard sysfs interface. The number and type of trip points is chip + depended, see which files are available in sysfs. + Fan/PWM channel 3 of the F8000 is always in this mode! + +* 3: Thermostat mode (Only available on the F8000 when in duty cycle mode) + The fan speed is regulated to keep the temp the fan is mapped to between + temp#_auto_point2_temp and temp#_auto_point3_temp. + +Both of the automatic modes require that pwm1 corresponds to fan1, pwm2 to +fan2 and pwm3 to fan3. diff --git a/Documentation/hwmon/it87 b/Documentation/hwmon/it87 index 042c0415140..659315d98e0 100644 --- a/Documentation/hwmon/it87 +++ b/Documentation/hwmon/it87 @@ -26,6 +26,10 @@ Supported chips: Datasheet: Publicly available at the ITE website http://www.ite.com.tw/product_info/file/pc/IT8718F_V0.2.zip http://www.ite.com.tw/product_info/file/pc/IT8718F_V0%203_(for%20C%20version).zip + * IT8720F + Prefix: 'it8720' + Addresses scanned: from Super I/O config space (8 I/O ports) + Datasheet: Not yet publicly available. * SiS950 [clone of IT8705F] Prefix: 'it87' Addresses scanned: from Super I/O config space (8 I/O ports) @@ -71,7 +75,7 @@ Description ----------- This driver implements support for the IT8705F, IT8712F, IT8716F, -IT8718F, IT8726F and SiS950 chips. +IT8718F, IT8720F, IT8726F and SiS950 chips. These chips are 'Super I/O chips', supporting floppy disks, infrared ports, joysticks and other miscellaneous stuff. For hardware monitoring, they @@ -84,19 +88,19 @@ the IT8716F and late IT8712F have 6. They are shared with other functions though, so the functionality may not be available on a given system. The driver dumbly assume it is there. -The IT8718F also features VID inputs (up to 8 pins) but the value is -stored in the Super-I/O configuration space. Due to technical limitations, +The IT8718F and IT8720F also features VID inputs (up to 8 pins) but the value +is stored in the Super-I/O configuration space. Due to technical limitations, this value can currently only be read once at initialization time, so the driver won't notice and report changes in the VID value. The two upper VID bits share their pins with voltage inputs (in5 and in6) so you can't have both on a given board. -The IT8716F, IT8718F and later IT8712F revisions have support for +The IT8716F, IT8718F, IT8720F and later IT8712F revisions have support for 2 additional fans. The additional fans are supported by the driver. -The IT8716F and IT8718F, and late IT8712F and IT8705F also have optional -16-bit tachometer counters for fans 1 to 3. This is better (no more fan -clock divider mess) but not compatible with the older chips and +The IT8716F, IT8718F and IT8720F, and late IT8712F and IT8705F also have +optional 16-bit tachometer counters for fans 1 to 3. This is better (no more +fan clock divider mess) but not compatible with the older chips and revisions. The 16-bit tachometer mode is enabled by the driver when one of the above chips is detected. @@ -122,7 +126,7 @@ zero'; this is important for negative voltage measurements. All voltage inputs can measure voltages between 0 and 4.08 volts, with a resolution of 0.016 volt. The battery voltage in8 does not have limit registers. -The VID lines (IT8712F/IT8716F/IT8718F) encode the core voltage value: +The VID lines (IT8712F/IT8716F/IT8718F/IT8720F) encode the core voltage value: the voltage level your processor should work with. This is hardcoded by the mainboard and/or processor itself. It is a value in volts. diff --git a/Documentation/hwmon/lis3lv02d b/Documentation/hwmon/lis3lv02d new file mode 100644 index 00000000000..0fcfc4a7ccd --- /dev/null +++ b/Documentation/hwmon/lis3lv02d @@ -0,0 +1,53 @@ +Kernel driver lis3lv02d +================== + +Supported chips: + + * STMicroelectronics LIS3LV02DL and LIS3LV02DQ + +Author: + Yan Burman + Eric Piel + + +Description +----------- + +This driver provides support for the accelerometer found in various HP +laptops sporting the feature officially called "HP Mobile Data +Protection System 3D" or "HP 3D DriveGuard". It detect automatically +laptops with this sensor. Known models (for now the HP 2133, nc6420, +nc2510, nc8510, nc84x0, nw9440 and nx9420) will have their axis +automatically oriented on standard way (eg: you can directly play +neverball). The accelerometer data is readable via +/sys/devices/platform/lis3lv02d. + +Sysfs attributes under /sys/devices/platform/lis3lv02d/: +position - 3D position that the accelerometer reports. Format: "(x,y,z)" +calibrate - read: values (x, y, z) that are used as the base for input + class device operation. + write: forces the base to be recalibrated with the current + position. +rate - reports the sampling rate of the accelerometer device in HZ + +This driver also provides an absolute input class device, allowing +the laptop to act as a pinball machine-esque joystick. + +Axes orientation +---------------- + +For better compatibility between the various laptops. The values reported by +the accelerometer are converted into a "standard" organisation of the axes +(aka "can play neverball out of the box"): + * When the laptop is horizontal the position reported is about 0 for X and Y +and a positive value for Z + * If the left side is elevated, X increases (becomes positive) + * If the front side (where the touchpad is) is elevated, Y decreases + (becomes negative) + * If the laptop is put upside-down, Z becomes negative + +If your laptop model is not recognized (cf "dmesg"), you can send an +email to the authors to add it to the database. When reporting a new +laptop, please include the output of "dmidecode" plus the value of +/sys/devices/platform/lis3lv02d/position in these four cases. + diff --git a/Documentation/hwmon/lm70 b/Documentation/hwmon/lm70 index 2bdd3feebf5..0d240291e3c 100644 --- a/Documentation/hwmon/lm70 +++ b/Documentation/hwmon/lm70 @@ -1,9 +1,11 @@ Kernel driver lm70 ================== -Supported chip: +Supported chips: * National Semiconductor LM70 Datasheet: http://www.national.com/pf/LM/LM70.html + * Texas Instruments TMP121/TMP123 + Information: http://focus.ti.com/docs/prod/folders/print/tmp121.html Author: Kaiwan N Billimoria @@ -25,6 +27,14 @@ complement digital temperature (sent via the SIO line), is available in the driver for interpretation. This driver makes use of the kernel's in-core SPI support. +As a real (in-tree) example of this "SPI protocol driver" interfacing +with a "SPI master controller driver", see drivers/spi/spi_lm70llp.c +and its associated documentation. + +The TMP121/TMP123 are very similar; main differences are 4 wire SPI inter- +face (read only) and 13-bit temperature data (0.0625 degrees celsius reso- +lution). + Thanks to --------- Jean Delvare for mentoring the hwmon-side driver diff --git a/Documentation/hwmon/lm85 b/Documentation/hwmon/lm85 index 40062074129..a13680871bc 100644 --- a/Documentation/hwmon/lm85 +++ b/Documentation/hwmon/lm85 @@ -164,7 +164,7 @@ configured individually according to the following options. temperature. (PWM value from 0 to 255) * pwm#_auto_pwm_minctl - this flags selects for temp#_auto_temp_off temperature - the bahaviour of fans. Write 1 to let fans spinning at + the behaviour of fans. Write 1 to let fans spinning at pwm#_auto_pwm_min or write 0 to let them off. NOTE: It has been reported that there is a bug in the LM85 that causes the flag diff --git a/Documentation/hwmon/ltc4245 b/Documentation/hwmon/ltc4245 new file mode 100644 index 00000000000..bae7a3adc5d --- /dev/null +++ b/Documentation/hwmon/ltc4245 @@ -0,0 +1,81 @@ +Kernel driver ltc4245 +===================== + +Supported chips: + * Linear Technology LTC4245 + Prefix: 'ltc4245' + Addresses scanned: 0x20-0x3f + Datasheet: + http://www.linear.com/pc/downloadDocument.do?navId=H0,C1,C1003,C1006,C1140,P19392,D13517 + +Author: Ira W. Snyder + + +Description +----------- + +The LTC4245 controller allows a board to be safely inserted and removed +from a live backplane in multiple supply systems such as CompactPCI and +PCI Express. + + +Usage Notes +----------- + +This driver does not probe for LTC4245 devices, due to the fact that some +of the possible addresses are unfriendly to probing. You will need to use +the "force" parameter to tell the driver where to find the device. + +Example: the following will load the driver for an LTC4245 at address 0x23 +on I2C bus #1: +$ modprobe ltc4245 force=1,0x23 + + +Sysfs entries +------------- + +The LTC4245 has built-in limits for over and under current warnings. This +makes it very likely that the reference circuit will be used. + +This driver uses the values in the datasheet to change the register values +into the values specified in the sysfs-interface document. The current readings +rely on the sense resistors listed in Table 2: "Sense Resistor Values". + +in1_input 12v input voltage (mV) +in2_input 5v input voltage (mV) +in3_input 3v input voltage (mV) +in4_input Vee (-12v) input voltage (mV) + +in1_min_alarm 12v input undervoltage alarm +in2_min_alarm 5v input undervoltage alarm +in3_min_alarm 3v input undervoltage alarm +in4_min_alarm Vee (-12v) input undervoltage alarm + +curr1_input 12v current (mA) +curr2_input 5v current (mA) +curr3_input 3v current (mA) +curr4_input Vee (-12v) current (mA) + +curr1_max_alarm 12v overcurrent alarm +curr2_max_alarm 5v overcurrent alarm +curr3_max_alarm 3v overcurrent alarm +curr4_max_alarm Vee (-12v) overcurrent alarm + +in5_input 12v output voltage (mV) +in6_input 5v output voltage (mV) +in7_input 3v output voltage (mV) +in8_input Vee (-12v) output voltage (mV) + +in5_min_alarm 12v output undervoltage alarm +in6_min_alarm 5v output undervoltage alarm +in7_min_alarm 3v output undervoltage alarm +in8_min_alarm Vee (-12v) output undervoltage alarm + +in9_input GPIO #1 voltage data +in10_input GPIO #2 voltage data +in11_input GPIO #3 voltage data + +power1_input 12v power usage (mW) +power2_input 5v power usage (mW) +power3_input 3v power usage (mW) +power4_input Vee (-12v) power usage (mW) diff --git a/Documentation/ics932s401 b/Documentation/ics932s401 new file mode 100644 index 00000000000..07a739f406d --- /dev/null +++ b/Documentation/ics932s401 @@ -0,0 +1,31 @@ +Kernel driver ics932s401 +====================== + +Supported chips: + * IDT ICS932S401 + Prefix: 'ics932s401' + Addresses scanned: I2C 0x69 + Datasheet: Publically available at the IDT website + +Author: Darrick J. Wong + +Description +----------- + +This driver implements support for the IDT ICS932S401 chip family. + +This chip has 4 clock outputs--a base clock for the CPU (which is likely +multiplied to get the real CPU clock), a system clock, a PCI clock, a USB +clock, and a reference clock. The driver reports selected and actual +frequency. If spread spectrum mode is enabled, the driver also reports by what +percent the clock signal is being spread, which should be between 0 and -0.5%. +All frequencies are reported in KHz. + +The ICS932S401 monitors all inputs continuously. The driver will not read +the registers more often than once every other second. + +Special Features +---------------- + +The clocks could be reprogrammed to increase system speed. I will not help you +do this, as you risk damaging your system! diff --git a/Documentation/ide/warm-plug-howto.txt b/Documentation/ide/warm-plug-howto.txt index d5885468b07..98152bcd515 100644 --- a/Documentation/ide/warm-plug-howto.txt +++ b/Documentation/ide/warm-plug-howto.txt @@ -11,3 +11,8 @@ unplug old device(s) and plug new device(s) # echo -n "1" > /sys/class/ide_port/idex/scan done + +NOTE: please make sure that partitions are unmounted and that there are +no other active references to devices before doing "delete_devices" step, +also do not attempt "scan" step on devices currently in use -- otherwise +results may be unpredictable and lead to data loss if you're unlucky diff --git a/Documentation/input/input-programming.txt b/Documentation/input/input-programming.txt index 81905e81585..7f8b9d97bc4 100644 --- a/Documentation/input/input-programming.txt +++ b/Documentation/input/input-programming.txt @@ -20,10 +20,11 @@ pressed or released a BUTTON_IRQ happens. The driver could look like: static struct input_dev *button_dev; -static void button_interrupt(int irq, void *dummy, struct pt_regs *fp) +static irqreturn_t button_interrupt(int irq, void *dummy) { input_report_key(button_dev, BTN_0, inb(BUTTON_PORT) & 1); input_sync(button_dev); + return IRQ_HANDLED; } static int __init button_init(void) diff --git a/Documentation/input/walkera0701.txt b/Documentation/input/walkera0701.txt new file mode 100644 index 00000000000..8f4289efc5c --- /dev/null +++ b/Documentation/input/walkera0701.txt @@ -0,0 +1,109 @@ + +Walkera WK-0701 transmitter is supplied with a ready to fly Walkera +helicopters such as HM36, HM37, HM60. The walkera0701 module enables to use +this transmitter as joystick + +Devel homepage and download: +http://zub.fei.tuke.sk/walkera-wk0701/ + +or use cogito: +cg-clone http://zub.fei.tuke.sk/GIT/walkera0701-joystick + + +Connecting to PC: + +At back side of transmitter S-video connector can be found. Modulation +pulses from processor to HF part can be found at pin 2 of this connector, +pin 3 is GND. Between pin 3 and CPU 5k6 resistor can be found. To get +modulation pulses to PC, signal pulses must be amplified. + +Cable: (walkera TX to parport) + +Walkera WK-0701 TX S-VIDEO connector: + (back side of TX) + __ __ S-video: canon25 + / |_| \ pin 2 (signal) NPN parport + / O 4 3 O \ pin 3 (GND) LED ________________ 10 ACK + ( O 2 1 O ) | C + \ ___ / 2 ________________________|\|_____|/ + | [___] | |/| B |\ + ------- 3 __________________________________|________________ 25 GND + E + + +I use green LED and BC109 NPN transistor. + +Software: + +Build kernel with walkera0701 module. Module walkera0701 need exclusive +access to parport, modules like lp must be unloaded before loading +walkera0701 module, check dmesg for error messages. Connect TX to PC by +cable and run jstest /dev/input/js0 to see values from TX. If no value can +be changed by TX "joystick", check output from /proc/interrupts. Value for +(usually irq7) parport must increase if TX is on. + + + +Technical details: + +Driver use interrupt from parport ACK input bit to measure pulse length +using hrtimers. + +Frame format: +Based on walkera WK-0701 PCM Format description by Shaul Eizikovich. +(downloaded from http://www.smartpropoplus.com/Docs/Walkera_Wk-0701_PCM.pdf) + +Signal pulses: + (ANALOG) + SYNC BIN OCT + +---------+ +------+ + | | | | +--+ +------+ +--- + +Frame: + SYNC , BIN1, OCT1, BIN2, OCT2 ... BIN24, OCT24, BIN25, next frame SYNC .. + +pulse length: + Binary values: Analog octal values: + + 288 uS Binary 0 318 uS 000 + 438 uS Binary 1 398 uS 001 + 478 uS 010 + 558 uS 011 + 638 uS 100 + 1306 uS SYNC 718 uS 101 + 798 uS 110 + 878 uS 111 + +24 bin+oct values + 1 bin value = 24*4+1 bits = 97 bits + +(Warning, pulses on ACK ar inverted by transistor, irq is rised up on sync +to bin change or octal value to bin change). + +Binary data representations: + +One binary and octal value can be grouped to nibble. 24 nibbles + one binary +values can be sampled between sync pulses. + +Values for first four channels (analog joystick values) can be found in +first 10 nibbles. Analog value is represented by one sign bit and 9 bit +absolute binary value. (10 bits per channel). Next nibble is checksum for +first ten nibbles. + +Next nibbles 12 .. 21 represents four channels (not all channels can be +directly controlled from TX). Binary representations ar the same as in first +four channels. In nibbles 22 and 23 is a special magic number. Nibble 24 is +checksum for nibbles 12..23. + +After last octal value for nibble 24 and next sync pulse one additional +binary value can be sampled. This bit and magic number is not used in +software driver. Some details about this magic numbers can be found in +Walkera_Wk-0701_PCM.pdf. + +Checksum calculation: + +Summary of octal values in nibbles must be same as octal value in checksum +nibble (only first 3 bits are used). Binary value for checksum nibble is +calculated by sum of binary values in checked nibbles + sum of octal values +in checked nibbles divided by 8. Only bit 0 of this sum is used. + diff --git a/Documentation/ioctl-number.txt b/Documentation/ioctl-number.txt deleted file mode 100644 index b880ce5dbd3..00000000000 --- a/Documentation/ioctl-number.txt +++ /dev/null @@ -1,201 +0,0 @@ -Ioctl Numbers -19 October 1999 -Michael Elizabeth Chastain - - -If you are adding new ioctl's to the kernel, you should use the _IO -macros defined in : - - _IO an ioctl with no parameters - _IOW an ioctl with write parameters (copy_from_user) - _IOR an ioctl with read parameters (copy_to_user) - _IOWR an ioctl with both write and read parameters. - -'Write' and 'read' are from the user's point of view, just like the -system calls 'write' and 'read'. For example, a SET_FOO ioctl would -be _IOW, although the kernel would actually read data from user space; -a GET_FOO ioctl would be _IOR, although the kernel would actually write -data to user space. - -The first argument to _IO, _IOW, _IOR, or _IOWR is an identifying letter -or number from the table below. Because of the large number of drivers, -many drivers share a partial letter with other drivers. - -If you are writing a driver for a new device and need a letter, pick an -unused block with enough room for expansion: 32 to 256 ioctl commands. -You can register the block by patching this file and submitting the -patch to Linus Torvalds. Or you can e-mail me at and -I'll register one for you. - -The second argument to _IO, _IOW, _IOR, or _IOWR is a sequence number -to distinguish ioctls from each other. The third argument to _IOW, -_IOR, or _IOWR is the type of the data going into the kernel or coming -out of the kernel (e.g. 'int' or 'struct foo'). NOTE! Do NOT use -sizeof(arg) as the third argument as this results in your ioctl thinking -it passes an argument of type size_t. - -Some devices use their major number as the identifier; this is OK, as -long as it is unique. Some devices are irregular and don't follow any -convention at all. - -Following this convention is good because: - -(1) Keeping the ioctl's globally unique helps error checking: - if a program calls an ioctl on the wrong device, it will get an - error rather than some unexpected behaviour. - -(2) The 'strace' build procedure automatically finds ioctl numbers - defined with _IO, _IOW, _IOR, or _IOWR. - -(3) 'strace' can decode numbers back into useful names when the - numbers are unique. - -(4) People looking for ioctls can grep for them more easily when - this convention is used to define the ioctl numbers. - -(5) When following the convention, the driver code can use generic - code to copy the parameters between user and kernel space. - -This table lists ioctls visible from user land for Linux/i386. It contains -most drivers up to 2.3.14, but I know I am missing some. - -Code Seq# Include File Comments -======================================================== -0x00 00-1F linux/fs.h conflict! -0x00 00-1F scsi/scsi_ioctl.h conflict! -0x00 00-1F linux/fb.h conflict! -0x00 00-1F linux/wavefront.h conflict! -0x02 all linux/fd.h -0x03 all linux/hdreg.h -0x04 D2-DC linux/umsdos_fs.h Dead since 2.6.11, but don't reuse these. -0x06 all linux/lp.h -0x09 all linux/md.h -0x12 all linux/fs.h - linux/blkpg.h -0x1b all InfiniBand Subsystem -0x20 all drivers/cdrom/cm206.h -0x22 all scsi/sg.h -'#' 00-3F IEEE 1394 Subsystem Block for the entire subsystem -'1' 00-1F PPS kit from Ulrich Windl - -'8' all SNP8023 advanced NIC card - -'A' 00-1F linux/apm_bios.h -'B' C0-FF advanced bbus - -'C' all linux/soundcard.h -'D' all asm-s390/dasd.h -'E' all linux/input.h -'F' all linux/fb.h -'H' all linux/hiddev.h -'I' all linux/isdn.h -'J' 00-1F drivers/scsi/gdth_ioctl.h -'K' all linux/kd.h -'L' 00-1F linux/loop.h -'L' 20-2F driver/usb/misc/vstusb.h -'L' E0-FF linux/ppdd.h encrypted disk device driver - -'M' all linux/soundcard.h -'N' 00-1F drivers/usb/scanner.h -'P' all linux/soundcard.h -'Q' all linux/soundcard.h -'R' 00-1F linux/random.h -'S' all linux/cdrom.h conflict! -'S' 80-81 scsi/scsi_ioctl.h conflict! -'S' 82-FF scsi/scsi.h conflict! -'T' all linux/soundcard.h conflict! -'T' all asm-i386/ioctls.h conflict! -'U' 00-EF linux/drivers/usb/usb.h -'V' all linux/vt.h -'W' 00-1F linux/watchdog.h conflict! -'W' 00-1F linux/wanrouter.h conflict! -'X' all linux/xfs_fs.h -'Y' all linux/cyclades.h -'[' 00-07 linux/usb/usbtmc.h USB Test and Measurement Devices - -'a' all ATM on linux - -'b' 00-FF bit3 vme host bridge - -'c' 00-7F linux/comstats.h conflict! -'c' 00-7F linux/coda.h conflict! -'c' 80-9F asm-s390/chsc.h -'d' 00-FF linux/char/drm/drm/h conflict! -'d' 00-DF linux/video_decoder.h conflict! -'d' F0-FF linux/digi1.h -'e' all linux/digi1.h conflict! -'e' 00-1F linux/video_encoder.h conflict! -'e' 00-1F net/irda/irtty.h conflict! -'f' 00-1F linux/ext2_fs.h -'h' 00-7F Charon filesystem - -'i' 00-3F linux/i2o.h -'j' 00-3F linux/joystick.h -'l' 00-3F linux/tcfs_fs.h transparent cryptographic file system - -'l' 40-7F linux/udf_fs_i.h in development: - -'m' all linux/mtio.h conflict! -'m' all linux/soundcard.h conflict! -'m' all linux/synclink.h conflict! -'m' 00-1F net/irda/irmod.h conflict! -'n' 00-7F linux/ncp_fs.h -'n' E0-FF video/matrox.h matroxfb -'o' 00-1F fs/ocfs2/ocfs2_fs.h OCFS2 -'p' 00-0F linux/phantom.h conflict! (OpenHaptics needs this) -'p' 00-3F linux/mc146818rtc.h conflict! -'p' 40-7F linux/nvram.h -'p' 80-9F user-space parport - -'q' 00-1F linux/serio.h -'q' 80-FF Internet PhoneJACK, Internet LineJACK - -'r' 00-1F linux/msdos_fs.h -'s' all linux/cdk.h -'t' 00-7F linux/if_ppp.h -'t' 80-8F linux/isdn_ppp.h -'u' 00-1F linux/smb_fs.h -'v' 00-1F linux/ext2_fs.h conflict! -'v' all linux/videodev.h conflict! -'w' all CERN SCI driver -'y' 00-1F packet based user level communications - -'z' 00-3F CAN bus card - -'z' 40-7F CAN bus card - -0x80 00-1F linux/fb.h -0x81 00-1F linux/videotext.h -0x89 00-06 asm-i386/sockios.h -0x89 0B-DF linux/sockios.h -0x89 E0-EF linux/sockios.h SIOCPROTOPRIVATE range -0x89 F0-FF linux/sockios.h SIOCDEVPRIVATE range -0x8B all linux/wireless.h -0x8C 00-3F WiNRADiO driver - -0x90 00 drivers/cdrom/sbpcd.h -0x93 60-7F linux/auto_fs.h -0x99 00-0F 537-Addinboard driver - -0xA0 all linux/sdp/sdp.h Industrial Device Project - -0xA3 80-8F Port ACL in development: - -0xA3 90-9F linux/dtlk.h -0xAB 00-1F linux/nbd.h -0xAC 00-1F linux/raw.h -0xAD 00 Netfilter device in development: - -0xAE all linux/kvm.h Kernel-based Virtual Machine - -0xB0 all RATIO devices in development: - -0xB1 00-1F PPPoX -0xCB 00-1F CBM serial IEC bus in development: - -0xDD 00-3F ZFCP device driver see drivers/s390/scsi/ - -0xF3 00-3F video/sisfb.h sisfb (in development) - -0xF4 00-1F video/mbxfb.h mbxfb - diff --git a/Documentation/ioctl/00-INDEX b/Documentation/ioctl/00-INDEX new file mode 100644 index 00000000000..d2fe4d4729e --- /dev/null +++ b/Documentation/ioctl/00-INDEX @@ -0,0 +1,10 @@ +00-INDEX + - this file +cdrom.txt + - summary of CDROM ioctl calls +hdio.txt + - summary of HDIO_ ioctl calls +ioctl-decoding.txt + - how to decode the bits of an IOCTL code +ioctl-number.txt + - how to implement and register device/driver ioctl calls diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt new file mode 100644 index 00000000000..f1d63990332 --- /dev/null +++ b/Documentation/ioctl/ioctl-number.txt @@ -0,0 +1,205 @@ +Ioctl Numbers +19 October 1999 +Michael Elizabeth Chastain + + +If you are adding new ioctl's to the kernel, you should use the _IO +macros defined in : + + _IO an ioctl with no parameters + _IOW an ioctl with write parameters (copy_from_user) + _IOR an ioctl with read parameters (copy_to_user) + _IOWR an ioctl with both write and read parameters. + +'Write' and 'read' are from the user's point of view, just like the +system calls 'write' and 'read'. For example, a SET_FOO ioctl would +be _IOW, although the kernel would actually read data from user space; +a GET_FOO ioctl would be _IOR, although the kernel would actually write +data to user space. + +The first argument to _IO, _IOW, _IOR, or _IOWR is an identifying letter +or number from the table below. Because of the large number of drivers, +many drivers share a partial letter with other drivers. + +If you are writing a driver for a new device and need a letter, pick an +unused block with enough room for expansion: 32 to 256 ioctl commands. +You can register the block by patching this file and submitting the +patch to Linus Torvalds. Or you can e-mail me at and +I'll register one for you. + +The second argument to _IO, _IOW, _IOR, or _IOWR is a sequence number +to distinguish ioctls from each other. The third argument to _IOW, +_IOR, or _IOWR is the type of the data going into the kernel or coming +out of the kernel (e.g. 'int' or 'struct foo'). NOTE! Do NOT use +sizeof(arg) as the third argument as this results in your ioctl thinking +it passes an argument of type size_t. + +Some devices use their major number as the identifier; this is OK, as +long as it is unique. Some devices are irregular and don't follow any +convention at all. + +Following this convention is good because: + +(1) Keeping the ioctl's globally unique helps error checking: + if a program calls an ioctl on the wrong device, it will get an + error rather than some unexpected behaviour. + +(2) The 'strace' build procedure automatically finds ioctl numbers + defined with _IO, _IOW, _IOR, or _IOWR. + +(3) 'strace' can decode numbers back into useful names when the + numbers are unique. + +(4) People looking for ioctls can grep for them more easily when + this convention is used to define the ioctl numbers. + +(5) When following the convention, the driver code can use generic + code to copy the parameters between user and kernel space. + +This table lists ioctls visible from user land for Linux/i386. It contains +most drivers up to 2.3.14, but I know I am missing some. + +Code Seq# Include File Comments +======================================================== +0x00 00-1F linux/fs.h conflict! +0x00 00-1F scsi/scsi_ioctl.h conflict! +0x00 00-1F linux/fb.h conflict! +0x00 00-1F linux/wavefront.h conflict! +0x02 all linux/fd.h +0x03 all linux/hdreg.h +0x04 D2-DC linux/umsdos_fs.h Dead since 2.6.11, but don't reuse these. +0x06 all linux/lp.h +0x09 all linux/md.h +0x12 all linux/fs.h + linux/blkpg.h +0x1b all InfiniBand Subsystem +0x20 all drivers/cdrom/cm206.h +0x22 all scsi/sg.h +'#' 00-3F IEEE 1394 Subsystem Block for the entire subsystem +'1' 00-1F PPS kit from Ulrich Windl + +'8' all SNP8023 advanced NIC card + +'A' 00-1F linux/apm_bios.h +'B' C0-FF advanced bbus + +'C' all linux/soundcard.h +'D' all arch/s390/include/asm/dasd.h +'E' all linux/input.h +'F' all linux/fb.h +'H' all linux/hiddev.h +'I' all linux/isdn.h +'J' 00-1F drivers/scsi/gdth_ioctl.h +'K' all linux/kd.h +'L' 00-1F linux/loop.h +'L' 20-2F driver/usb/misc/vstusb.h +'L' E0-FF linux/ppdd.h encrypted disk device driver + +'M' all linux/soundcard.h +'N' 00-1F drivers/usb/scanner.h +'O' 00-02 include/mtd/ubi-user.h UBI +'P' all linux/soundcard.h +'Q' all linux/soundcard.h +'R' 00-1F linux/random.h +'S' all linux/cdrom.h conflict! +'S' 80-81 scsi/scsi_ioctl.h conflict! +'S' 82-FF scsi/scsi.h conflict! +'T' all linux/soundcard.h conflict! +'T' all arch/x86/include/asm/ioctls.h conflict! +'U' 00-EF linux/drivers/usb/usb.h +'V' all linux/vt.h +'W' 00-1F linux/watchdog.h conflict! +'W' 00-1F linux/wanrouter.h conflict! +'X' all linux/xfs_fs.h +'Y' all linux/cyclades.h +'[' 00-07 linux/usb/usbtmc.h USB Test and Measurement Devices + +'a' all ATM on linux + +'b' 00-FF bit3 vme host bridge + +'c' 00-7F linux/comstats.h conflict! +'c' 00-7F linux/coda.h conflict! +'c' 80-9F arch/s390/include/asm/chsc.h +'d' 00-FF linux/char/drm/drm/h conflict! +'d' 00-DF linux/video_decoder.h conflict! +'d' F0-FF linux/digi1.h +'e' all linux/digi1.h conflict! +'e' 00-1F linux/video_encoder.h conflict! +'e' 00-1F net/irda/irtty.h conflict! +'f' 00-1F linux/ext2_fs.h +'h' 00-7F Charon filesystem + +'i' 00-3F linux/i2o.h +'j' 00-3F linux/joystick.h +'l' 00-3F linux/tcfs_fs.h transparent cryptographic file system + +'l' 40-7F linux/udf_fs_i.h in development: + +'m' all linux/mtio.h conflict! +'m' all linux/soundcard.h conflict! +'m' all linux/synclink.h conflict! +'m' 00-1F net/irda/irmod.h conflict! +'n' 00-7F linux/ncp_fs.h +'n' E0-FF video/matrox.h matroxfb +'o' 00-1F fs/ocfs2/ocfs2_fs.h OCFS2 +'o' 00-03 include/mtd/ubi-user.h conflict! (OCFS2 and UBI overlaps) +'o' 40-41 include/mtd/ubi-user.h UBI +'o' 01-A1 include/linux/dvb/*.h DVB +'p' 00-0F linux/phantom.h conflict! (OpenHaptics needs this) +'p' 00-3F linux/mc146818rtc.h conflict! +'p' 40-7F linux/nvram.h +'p' 80-9F user-space parport + +'q' 00-1F linux/serio.h +'q' 80-FF Internet PhoneJACK, Internet LineJACK + +'r' 00-1F linux/msdos_fs.h +'s' all linux/cdk.h +'t' 00-7F linux/if_ppp.h +'t' 80-8F linux/isdn_ppp.h +'u' 00-1F linux/smb_fs.h +'v' 00-1F linux/ext2_fs.h conflict! +'v' all linux/videodev.h conflict! +'w' all CERN SCI driver +'y' 00-1F packet based user level communications + +'z' 00-3F CAN bus card + +'z' 40-7F CAN bus card + +0x80 00-1F linux/fb.h +0x81 00-1F linux/videotext.h +0x89 00-06 arch/x86/include/asm/sockios.h +0x89 0B-DF linux/sockios.h +0x89 E0-EF linux/sockios.h SIOCPROTOPRIVATE range +0x89 F0-FF linux/sockios.h SIOCDEVPRIVATE range +0x8B all linux/wireless.h +0x8C 00-3F WiNRADiO driver + +0x90 00 drivers/cdrom/sbpcd.h +0x93 60-7F linux/auto_fs.h +0x99 00-0F 537-Addinboard driver + +0xA0 all linux/sdp/sdp.h Industrial Device Project + +0xA3 80-8F Port ACL in development: + +0xA3 90-9F linux/dtlk.h +0xAB 00-1F linux/nbd.h +0xAC 00-1F linux/raw.h +0xAD 00 Netfilter device in development: + +0xAE all linux/kvm.h Kernel-based Virtual Machine + +0xB0 all RATIO devices in development: + +0xB1 00-1F PPPoX +0xCB 00-1F CBM serial IEC bus in development: + +0xDD 00-3F ZFCP device driver see drivers/s390/scsi/ + +0xF3 00-3F video/sisfb.h sisfb (in development) + +0xF4 00-1F video/mbxfb.h mbxfb + diff --git a/Documentation/kbuild/00-INDEX b/Documentation/kbuild/00-INDEX index 11464428545..e8d2b6d83a3 100644 --- a/Documentation/kbuild/00-INDEX +++ b/Documentation/kbuild/00-INDEX @@ -1,5 +1,9 @@ 00-INDEX - - this file: info on the kernel build process + - this file: info on the kernel build process +kbuild.txt + - developer information on kbuild +kconfig.txt + - usage help for make *config kconfig-language.txt - specification of Config Language, the language in Kconfig files makefiles.txt diff --git a/Documentation/kbuild/kbuild.txt b/Documentation/kbuild/kbuild.txt new file mode 100644 index 00000000000..923f9ddee8f --- /dev/null +++ b/Documentation/kbuild/kbuild.txt @@ -0,0 +1,133 @@ +Environment variables + +KCPPFLAGS +-------------------------------------------------- +Additional options to pass when preprocessing. The preprocessing options +will be used in all cases where kbuild do preprocessing including +building C files and assembler files. + +KAFLAGS +-------------------------------------------------- +Additional options to the assembler. + +KCFLAGS +-------------------------------------------------- +Additional options to the C compiler. + +KBUILD_VERBOSE +-------------------------------------------------- +Set the kbuild verbosity. Can be assinged same values as "V=...". +See make help for the full list. +Setting "V=..." takes precedence over KBUILD_VERBOSE. + +KBUILD_EXTMOD +-------------------------------------------------- +Set the directory to look for the kernel source when building external +modules. +The directory can be specified in several ways: +1) Use "M=..." on the command line +2) Environmnet variable KBUILD_EXTMOD +3) Environmnet variable SUBDIRS +The possibilities are listed in the order they take precedence. +Using "M=..." will always override the others. + +KBUILD_OUTPUT +-------------------------------------------------- +Specify the output directory when building the kernel. +The output directory can also be specificed using "O=...". +Setting "O=..." takes precedence over KBUILD_OUTPUT + +ARCH +-------------------------------------------------- +Set ARCH to the architecture to be built. +In most cases the name of the architecture is the same as the +directory name found in the arch/ directory. +But some architectures suach as x86 and sparc has aliases. +x86: i386 for 32 bit, x86_64 for 64 bit +sparc: sparc for 32 bit, sparc64 for 64 bit + +CROSS_COMPILE +-------------------------------------------------- +Specify an optional fixed part of the binutils filename. +CROSS_COMPILE can be a part of the filename or the full path. + +CROSS_COMPILE is also used for ccache is some setups. + +CF +-------------------------------------------------- +Additional options for sparse. +CF is often used on the command-line like this: + + make CF=-Wbitwise C=2 + +INSTALL_PATH +-------------------------------------------------- +INSTALL_PATH specifies where to place the updated kernel and system map +images. Default is /boot, but you can set it to other values + + +MODLIB +-------------------------------------------------- +Specify where to install modules. +The default value is: + + $(INSTALL_MOD_PATH)/lib/modules/$(KERNELRELEASE) + +The value can be overridden in which case the default value is ignored. + +INSTALL_MOD_PATH +-------------------------------------------------- +INSTALL_MOD_PATH specifies a prefix to MODLIB for module directory +relocations required by build roots. This is not defined in the +makefile but the argument can be passed to make if needed. + +INSTALL_MOD_STRIP +-------------------------------------------------- +INSTALL_MOD_STRIP, if defined, will cause modules to be +stripped after they are installed. If INSTALL_MOD_STRIP is '1', then +the default option --strip-debug will be used. Otherwise, +INSTALL_MOD_STRIP will used as the options to the strip command. + +INSTALL_FW_PATH +-------------------------------------------------- +INSTALL_FW_PATH specify where to install the firmware blobs. +The default value is: + + $(INSTALL_MOD_PATH)/lib/firmware + +The value can be overridden in which case the default value is ignored. + +INSTALL_HDR_PATH +-------------------------------------------------- +INSTALL_HDR_PATH specify where to install user space headers when +executing "make headers_*". +The default value is: + + $(objtree)/usr + +$(objtree) is the directory where output files are saved. +The output directory is often set using "O=..." on the commandline. + +The value can be overridden in which case the default value is ignored. + +KBUILD_MODPOST_WARN +-------------------------------------------------- +KBUILD_MODPOST_WARN can be set to avoid error out in case of undefined +symbols in the final module linking stage. + +KBUILD_MODPOST_FINAL +-------------------------------------------------- +KBUILD_MODPOST_NOFINAL can be set to skip the final link of modules. +This is solely usefull to speed up test compiles. + +KBUILD_EXTRA_SYMBOLS +-------------------------------------------------- +For modules use symbols from another modules. +See more details in modules.txt. + +ALLSOURCE_ARCHS +-------------------------------------------------- +For tags/TAGS/cscope targets, you can specify more than one archs +to be included in the databases, separated by blankspace. e.g. + + $ make ALLSOURCE_ARCHS="x86 mips arm" tags diff --git a/Documentation/kbuild/kconfig.txt b/Documentation/kbuild/kconfig.txt new file mode 100644 index 00000000000..26a7c0a9319 --- /dev/null +++ b/Documentation/kbuild/kconfig.txt @@ -0,0 +1,188 @@ +This file contains some assistance for using "make *config". + +Use "make help" to list all of the possible configuration targets. + +The xconfig ('qconf') and menuconfig ('mconf') programs also +have embedded help text. Be sure to check it for navigation, +search, and other general help text. + +====================================================================== +General +-------------------------------------------------- + +New kernel releases often introduce new config symbols. Often more +important, new kernel releases may rename config symbols. When +this happens, using a previously working .config file and running +"make oldconfig" won't necessarily produce a working new kernel +for you, so you may find that you need to see what NEW kernel +symbols have been introduced. + +To see a list of new config symbols when using "make oldconfig", use + + cp user/some/old.config .config + yes "" | make oldconfig >conf.new + +and the config program will list as (NEW) any new symbols that have +unknown values. Of course, the .config file is also updated with +new (default) values, so you can use: + + grep "(NEW)" conf.new + +to see the new config symbols or you can 'diff' the previous and +new .config files to see the differences: + + diff .config.old .config | less + +(Yes, we need something better here.) + + +====================================================================== +menuconfig +-------------------------------------------------- + +SEARCHING for CONFIG symbols + +Searching in menuconfig: + + The Search function searches for kernel configuration symbol + names, so you have to know something close to what you are + looking for. + + Example: + /hotplug + This lists all config symbols that contain "hotplug", + e.g., HOTPLUG, HOTPLUG_CPU, MEMORY_HOTPLUG. + + For search help, enter / followed TAB-TAB-TAB (to highlight + ) and Enter. This will tell you that you can also use + regular expressions (regexes) in the search string, so if you + are not interested in MEMORY_HOTPLUG, you could try + + /^hotplug + + +______________________________________________________________________ +Color Themes for 'menuconfig' + +It is possible to select different color themes using the variable +MENUCONFIG_COLOR. To select a theme use: + + make MENUCONFIG_COLOR= menuconfig + +Available themes are: + mono => selects colors suitable for monochrome displays + blackbg => selects a color scheme with black background + classic => theme with blue background. The classic look + bluetitle => a LCD friendly version of classic. (default) + +______________________________________________________________________ +Environment variables in 'menuconfig' + +KCONFIG_ALLCONFIG +-------------------------------------------------- +(partially based on lkml email from/by Rob Landley, re: miniconfig) +-------------------------------------------------- +The allyesconfig/allmodconfig/allnoconfig/randconfig variants can +also use the environment variable KCONFIG_ALLCONFIG as a flag or a +filename that contains config symbols that the user requires to be +set to a specific value. If KCONFIG_ALLCONFIG is used without a +filename, "make *config" checks for a file named +"all{yes/mod/no/random}.config" (corresponding to the *config command +that was used) for symbol values that are to be forced. If this file +is not found, it checks for a file named "all.config" to contain forced +values. + +This enables you to create "miniature" config (miniconfig) or custom +config files containing just the config symbols that you are interested +in. Then the kernel config system generates the full .config file, +including dependencies of your miniconfig file, based on the miniconfig +file. + +This 'KCONFIG_ALLCONFIG' file is a config file which contains +(usually a subset of all) preset config symbols. These variable +settings are still subject to normal dependency checks. + +Examples: + KCONFIG_ALLCONFIG=custom-notebook.config make allnoconfig +or + KCONFIG_ALLCONFIG=mini.config make allnoconfig +or + make KCONFIG_ALLCONFIG=mini.config allnoconfig + +These examples will disable most options (allnoconfig) but enable or +disable the options that are explicitly listed in the specified +mini-config files. + +KCONFIG_NOSILENTUPDATE +-------------------------------------------------- +If this variable has a non-blank value, it prevents silent kernel +config udpates (requires explicit updates). + +KCONFIG_CONFIG +-------------------------------------------------- +This environment variable can be used to specify a default kernel config +file name to override the default name of ".config". + +KCONFIG_OVERWRITECONFIG +-------------------------------------------------- +If you set KCONFIG_OVERWRITECONFIG in the environment, Kconfig will not +break symlinks when .config is a symlink to somewhere else. + +KCONFIG_NOTIMESTAMP +-------------------------------------------------- +If this environment variable exists and is non-null, the timestamp line +in generated .config files is omitted. + +KCONFIG_AUTOCONFIG +-------------------------------------------------- +This environment variable can be set to specify the path & name of the +"auto.conf" file. Its default value is "include/config/auto.conf". + +KCONFIG_AUTOHEADER +-------------------------------------------------- +This environment variable can be set to specify the path & name of the +"autoconf.h" (header) file. Its default value is "include/linux/autoconf.h". + +______________________________________________________________________ +menuconfig User Interface Options +---------------------------------------------------------------------- +MENUCONFIG_MODE +-------------------------------------------------- +This mode shows all sub-menus in one large tree. + +Example: + MENUCONFIG_MODE=single_menu make menuconfig + +====================================================================== +xconfig +-------------------------------------------------- + +Searching in xconfig: + + The Search function searches for kernel configuration symbol + names, so you have to know something close to what you are + looking for. + + Example: + Ctrl-F hotplug + or + Menu: File, Search, hotplug + + lists all config symbol entries that contain "hotplug" in + the symbol name. In this Search dialog, you may change the + config setting for any of the entries that are not grayed out. + You can also enter a different search string without having + to return to the main menu. + + +====================================================================== +gconfig +-------------------------------------------------- + +Searching in gconfig: + + None (gconfig isn't maintained as well as xconfig or menuconfig); + however, gconfig does have a few more viewing choices than + xconfig does. + +### diff --git a/Documentation/kbuild/makefiles.txt b/Documentation/kbuild/makefiles.txt index 7a7753321a2..51104f9194a 100644 --- a/Documentation/kbuild/makefiles.txt +++ b/Documentation/kbuild/makefiles.txt @@ -383,6 +383,20 @@ more details, with real examples. to prerequisites are referenced with $(src) (because they are not generated files). + $(kecho) + echoing information to user in a rule is often a good practice + but when execution "make -s" one does not expect to see any output + except for warnings/errors. + To support this kbuild define $(kecho) which will echo out the + text following $(kecho) to stdout except if "make -s" is used. + + Example: + #arch/blackfin/boot/Makefile + $(obj)/vmImage: $(obj)/vmlinux.gz + $(call if_changed,uimage) + @$(kecho) 'Kernel: $@ is ready' + + --- 3.11 $(CC) support functions The kernel may be built with several different versions of diff --git a/Documentation/kbuild/modules.txt b/Documentation/kbuild/modules.txt index 1821c077b43..b1096da953c 100644 --- a/Documentation/kbuild/modules.txt +++ b/Documentation/kbuild/modules.txt @@ -253,7 +253,7 @@ following files: # Module specific targets genbin: - echo "X" > 8123_bin_shipped + echo "X" > 8123_bin.o_shipped In example 2, we are down to two fairly simple files and for simple @@ -279,7 +279,7 @@ following files: # Module specific targets genbin: - echo "X" > 8123_bin_shipped + echo "X" > 8123_bin.o_shipped endif diff --git a/Documentation/kernel-doc-nano-HOWTO.txt b/Documentation/kernel-doc-nano-HOWTO.txt index c6841eee959..d73fbd2b2b4 100644 --- a/Documentation/kernel-doc-nano-HOWTO.txt +++ b/Documentation/kernel-doc-nano-HOWTO.txt @@ -71,6 +71,11 @@ The @argument descriptions must begin on the very next line following this opening short function description line, with no intervening empty comment lines. +If a function parameter is "..." (varargs), it should be listed in +kernel-doc notation as: + * @...: description + + Example kernel-doc data structure comment. /** @@ -282,6 +287,32 @@ struct my_struct { }; +Including documentation blocks in source files +---------------------------------------------- + +To facilitate having source code and comments close together, you can +include kernel-doc documentation blocks that are free-form comments +instead of being kernel-doc for functions, structures, unions, +enums, or typedefs. This could be used for something like a +theory of operation for a driver or library code, for example. + +This is done by using a DOC: section keyword with a section title. E.g.: + +/** + * DOC: Theory of Operation + * + * The whizbang foobar is a dilly of a gizmo. It can do whatever you + * want it to do, at any time. It reads your mind. Here's how it works. + * + * foo bar splat + * + * The only drawback to this gizmo is that is can sometimes damage + * hardware, software, or its subject(s). + */ + +DOC: sections are used in SGML templates files as indicated below. + + How to make new SGML template files ----------------------------------- @@ -302,6 +333,9 @@ exported using EXPORT_SYMBOL. !F is replaced by the documentation, in , for the functions listed. +!P
is replaced by the contents of the DOC: +section titled
from . +Spaces are allowed in
; do not quote the
. Tim. */ diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index c86c0745971..8511d3532c2 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -89,7 +89,9 @@ parameter is applicable: SPARC Sparc architecture is enabled. SWSUSP Software suspend (hibernation) is enabled. SUSPEND System suspend states are enabled. + FTRACE Function tracing enabled. TS Appropriate touchscreen support is enabled. + UMS USB Mass Storage support is enabled. USB USB support is enabled. USBHID USB Human Interface Device support is enabled. V4L Video For Linux support is enabled. @@ -139,6 +141,7 @@ and is between 256 and 4096 characters. It is defined in the file ht -- run only enough ACPI to enable Hyper Threading strict -- Be less tolerant of platforms that are not strictly ACPI specification compliant. + rsdt -- prefer RSDT over (default) XSDT See also Documentation/power/pm.txt, pci=noacpi @@ -149,16 +152,20 @@ and is between 256 and 4096 characters. It is defined in the file default: 0 acpi_sleep= [HW,ACPI] Sleep options - Format: { s3_bios, s3_mode, s3_beep, s4_nohwsig, old_ordering } - See Documentation/power/video.txt for s3_bios and s3_mode. + Format: { s3_bios, s3_mode, s3_beep, s4_nohwsig, + old_ordering, s4_nonvs } + See Documentation/power/video.txt for information on + s3_bios and s3_mode. s3_beep is for debugging; it makes the PC's speaker beep as soon as the kernel's real-mode entry point is called. s4_nohwsig prevents ACPI hardware signature from being used during resume from hibernation. old_ordering causes the ACPI 1.0 ordering of the _PTS - control method, wrt putting devices into low power - states, to be enforced (the ACPI 2.0 ordering of _PTS is - used by default). + control method, with respect to putting devices into + low power states, to be enforced (the ACPI 2.0 ordering + of _PTS is used by default). + s4_nonvs prevents the kernel from saving/restoring the + ACPI NVS memory during hibernation. acpi_sci= [HW,ACPI] ACPI System Control Interrupt trigger mode Format: { level | edge | high | low } @@ -193,64 +200,50 @@ and is between 256 and 4096 characters. It is defined in the file acpi_skip_timer_override [HW,ACPI] Recognize and ignore IRQ0/pin2 Interrupt Override. For broken nForce2 BIOS resulting in XT-PIC timer. - acpi_use_timer_override [HW,ACPI} + acpi_use_timer_override [HW,ACPI] Use timer override. For some broken Nvidia NF5 boards that require a timer override, but don't have HPET - acpi.debug_layer= [HW,ACPI] - Format: - Each bit of the indicates an ACPI debug layer, - 1: enable, 0: disable. It is useful for boot time - debugging. After system has booted up, it can be set - via /sys/module/acpi/parameters/debug_layer. - CONFIG_ACPI_DEBUG must be enabled for this to produce any output. - Available bits (add the numbers together) to enable debug output - for specific parts of the ACPI subsystem: - 0x01 utilities 0x02 hardware 0x04 events 0x08 tables - 0x10 namespace 0x20 parser 0x40 dispatcher - 0x80 executer 0x100 resources 0x200 acpica debugger - 0x400 os services 0x800 acpica disassembler. - The number can be in decimal or prefixed with 0x in hex. - Warning: Many of these options can produce a lot of - output and make your system unusable. Be very careful. - - acpi.debug_level= [HW,ACPI] + acpi_backlight= [HW,ACPI] + acpi_backlight=vendor + acpi_backlight=video + If set to vendor, prefer vendor specific driver + (e.g. thinkpad_acpi, sony_acpi, etc.) instead + of the ACPI video.ko driver. + + acpi_display_output= [HW,ACPI] + acpi_display_output=vendor + acpi_display_output=video + See above. + + acpi.debug_layer= [HW,ACPI,ACPI_DEBUG] + acpi.debug_level= [HW,ACPI,ACPI_DEBUG] Format: - Each bit of the indicates an ACPI debug level, - which corresponds to the level in an ACPI_DEBUG_PRINT - statement. After system has booted up, this mask - can be set via /sys/module/acpi/parameters/debug_level. - - CONFIG_ACPI_DEBUG must be enabled for this to produce - any output. The number can be in decimal or prefixed - with 0x in hex. Some of these options produce so much - output that the system is unusable. - - The following global components are defined by the - ACPI CA: - 0x01 error - 0x02 warn - 0x04 init - 0x08 debug object - 0x10 info - 0x20 init names - 0x40 parse - 0x80 load - 0x100 dispatch - 0x200 execute - 0x400 names - 0x800 operation region - 0x1000 bfield - 0x2000 tables - 0x4000 values - 0x8000 objects - 0x10000 resources - 0x20000 user requests - 0x40000 package - The number can be in decimal or prefixed with 0x in hex. - Warning: Many of these options can produce a lot of - output and make your system unusable. Be very careful. + CONFIG_ACPI_DEBUG must be enabled to produce any ACPI + debug output. Bits in debug_layer correspond to a + _COMPONENT in an ACPI source file, e.g., + #define _COMPONENT ACPI_PCI_COMPONENT + Bits in debug_level correspond to a level in + ACPI_DEBUG_PRINT statements, e.g., + ACPI_DEBUG_PRINT((ACPI_DB_INFO, ... + The debug_level mask defaults to "info". See + Documentation/acpi/debug.txt for more information about + debug layers and levels. + + Enable processor driver info messages: + acpi.debug_layer=0x20000000 + Enable PCI/PCI interrupt routing info messages: + acpi.debug_layer=0x400000 + Enable AML "Debug" output, i.e., stores to the Debug + object while interpreting AML: + acpi.debug_layer=0xffffffff acpi.debug_level=0x2 + Enable all messages related to ACPI hardware: + acpi.debug_layer=0x2 acpi.debug_level=0xffffffff + + Some values produce so much output that the system is + unusable. The "log_buf_len" parameter may be useful + if you need to capture more output. acpi.power_nocheck= [HW,ACPI] Format: 1/0 enable/disable the check of power state. @@ -311,7 +304,9 @@ and is between 256 and 4096 characters. It is defined in the file Possible values are: isolate - enable device isolation (each device, as far as possible, will get its own protection - domain) + domain) [default] + share - put every device behind one IOMMU into the + same protection domain fullflush - enable flushing of IO/TLB entries when they are unmapped. Otherwise they are flushed before they will be reused, which @@ -480,8 +475,8 @@ and is between 256 and 4096 characters. It is defined in the file clearcpuid=BITNUM [X86] Disable CPUID feature X for the kernel. See - include/asm-x86/cpufeature.h for the valid bit numbers. - Note the Linux specific bits are not necessarily + arch/x86/include/asm/cpufeature.h for the valid bit + numbers. Note the Linux specific bits are not necessarily stable over kernel options, but the vendor specific ones should be. Also note that user programs calling CPUID directly @@ -562,6 +557,11 @@ and is between 256 and 4096 characters. It is defined in the file not work reliably with all consoles, but is known to work with serial and VGA consoles. + coredump_filter= + [KNL] Change the default value for + /proc//coredump_filter. + See also Documentation/filesystems/proc.txt. + cpcihp_generic= [HW,PCI] Generic port I/O CompactPCI driver Format: ,,,[,] @@ -646,7 +646,7 @@ and is between 256 and 4096 characters. It is defined in the file digiepca= [HW,SERIAL] See drivers/char/README.epca and - Documentation/digiepca.txt. + Documentation/serial/digiepca.txt. disable_mtrr_cleanup [X86] enable_mtrr_cleanup [X86] @@ -757,7 +757,7 @@ and is between 256 and 4096 characters. It is defined in the file See header of drivers/scsi/fdomain.c. floppy= [HW] - See Documentation/floppy.txt. + See Documentation/blockdev/floppy.txt. force_pal_cache_flush [IA-64] Avoid check_sal_cache_flush which may hang on @@ -765,6 +765,14 @@ and is between 256 and 4096 characters. It is defined in the file parameter will force ia64_sal_cache_flush to call ia64_pal_cache_flush instead of SAL_CACHE_FLUSH. + ftrace=[tracer] + [ftrace] will set and start the specified tracer + as early as possible in order to facilitate early + boot debugging. + + ftrace_dump_on_oops + [ftrace] will dump the trace buffers on oops. + gamecon.map[2|3]= [HW,JOY] Multisystem joystick and NES/SNES/PSX pad support via parallel port (up to 5 devices per port) @@ -826,6 +834,9 @@ and is between 256 and 4096 characters. It is defined in the file hlt [BUGS=ARM,SH] + hvc_iucv= [S390] Number of z/VM IUCV hypervisor console (HVC) + terminal devices. Valid values: 0..8 + i8042.debug [HW] Toggle i8042 debug mode i8042.direct [HW] Put keyboard port into non-translated mode i8042.dumbkbd [HW] Pretend that controller can only read data from @@ -872,17 +883,19 @@ and is between 256 and 4096 characters. It is defined in the file See Documentation/ide/ide.txt. idle= [X86] - Format: idle=poll or idle=mwait, idle=halt, idle=nomwait - Poll forces a polling idle loop that can slightly improves the performance - of waking up a idle CPU, but will use a lot of power and make the system - run hot. Not recommended. - idle=mwait. On systems which support MONITOR/MWAIT but the kernel chose - to not use it because it doesn't save as much power as a normal idle - loop use the MONITOR/MWAIT idle loop anyways. Performance should be the same - as idle=poll. - idle=halt. Halt is forced to be used for CPU idle. + Format: idle=poll, idle=mwait, idle=halt, idle=nomwait + Poll forces a polling idle loop that can slightly + improve the performance of waking up a idle CPU, but + will use a lot of power and make the system run hot. + Not recommended. + idle=mwait: On systems which support MONITOR/MWAIT but + the kernel chose to not use it because it doesn't save + as much power as a normal idle loop, use the + MONITOR/MWAIT idle loop anyways. Performance should be + the same as idle=poll. + idle=halt: Halt is forced to be used for CPU idle. In such case C2/C3 won't be used again. - idle=nomwait. Disable mwait for CPU C-states + idle=nomwait: Disable mwait for CPU C-states ide-pci-generic.all-generic-ide [HW] (E)IDE subsystem Claim all unknown PCI IDE storage controllers. @@ -913,6 +926,10 @@ and is between 256 and 4096 characters. It is defined in the file inttest= [IA64] + iomem= Disable strict checking of access to MMIO memory + strict regions from userspace. + relaxed + iommu= [x86] off force @@ -1064,8 +1081,8 @@ and is between 256 and 4096 characters. It is defined in the file lapic [X86-32,APIC] Enable the local APIC even if BIOS disabled it. - lapic_timer_c2_ok [X86-32,x86-64,APIC] trust the local apic timer in - C2 power state. + lapic_timer_c2_ok [X86-32,x86-64,APIC] trust the local apic timer + in C2 power state. libata.dma= [LIBATA] DMA control libata.dma=0 Disable all PATA and SATA DMA @@ -1117,8 +1134,10 @@ and is between 256 and 4096 characters. It is defined in the file If there are multiple matching configurations changing the same attribute, the last one is used. + lmb=debug [KNL] Enable lmb debug messages. + load_ramdisk= [RAM] List of ramdisks to load from floppy - See Documentation/ramdisk.txt. + See Documentation/blockdev/ramdisk.txt. lockd.nlm_grace_period=P [NFS] Assign grace period. Format: @@ -1210,8 +1229,8 @@ and is between 256 and 4096 characters. It is defined in the file it is equivalent to "nosmp", which also disables the IO APIC. - max_addr=[KMG] [KNL,BOOT,ia64] All physical memory greater than or - equal to this physical address is ignored. + max_addr=nn[KMG] [KNL,BOOT,ia64] All physical memory greater than + or equal to this physical address is ignored. max_luns= [SCSI] Maximum number of LUNs to probe. Should be between 1 and 2^32-1. @@ -1311,6 +1330,9 @@ and is between 256 and 4096 characters. It is defined in the file mga= [HW,DRM] + min_addr=nn[KMG] [KNL,BOOT,ia64] All physical memory below this + physical address is ignored. + mminit_loglevel= [KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this parameter allows control of the logging verbosity for @@ -1405,7 +1427,20 @@ and is between 256 and 4096 characters. It is defined in the file when a NMI is triggered. Format: [state][,regs][,debounce][,die] - nmi_watchdog= [KNL,BUGS=X86-32] Debugging features for SMP kernels + nmi_watchdog= [KNL,BUGS=X86-32,X86-64] Debugging features for SMP kernels + Format: [panic,][num] + Valid num: 0,1,2 + 0 - turn nmi_watchdog off + 1 - use the IO-APIC timer for the NMI watchdog + 2 - use the local APIC for the NMI watchdog using + a performance counter. Note: This will use one performance + counter and the local APIC's performance vector. + When panic is specified panic when an NMI watchdog timeout occurs. + This is useful when you use a panic=... timeout and need the box + quickly up again. + Instead of 1 and 2 it is possible to use the following + symbolic names: lapic and ioapic + Example: nmi_watchdog=2 or nmi_watchdog=panic,lapic no387 [BUGS=X86-32] Tells the kernel to use the 387 maths emulation library even if a 387 maths coprocessor @@ -1461,6 +1496,10 @@ and is between 256 and 4096 characters. It is defined in the file instruction doesn't work correctly and not to use it. + no_file_caps Tells the kernel not to honor file capabilities. The + only way then for a file to be executed with privilege + is to be setuid root or executed by root. + nohalt [IA-64] Tells the kernel not to use the power saving function PAL_HALT_LIGHT when idle. This increases power-consumption. On the positive side, it reduces @@ -1530,6 +1569,9 @@ and is between 256 and 4096 characters. It is defined in the file nosoftlockup [KNL] Disable the soft-lockup detector. + noswapaccount [KNL] Disable accounting of swap in memory resource + controller. (See Documentation/controllers/memory.txt) + nosync [HW,M68K] Disables sync negotiation for all devices. notsc [BUGS=X86-32] Disable Time Stamp Counter @@ -1549,6 +1591,10 @@ and is between 256 and 4096 characters. It is defined in the file nr_uarts= [SERIAL] maximum number of UARTs to be registered. + ohci1394_dma=early [HW] enable debugging via the ohci1394 driver. + See Documentation/debugging-via-ohci1394.txt for more + info. + olpc_ec_timeout= [OLPC] ms delay when issuing EC commands Rather than timing out after 20 ms if an EC command is not properly ACKed, override the length @@ -1613,7 +1659,7 @@ and is between 256 and 4096 characters. It is defined in the file pcd. [PARIDE] See header of drivers/block/paride/pcd.c. - See also Documentation/paride.txt. + See also Documentation/blockdev/paride.txt. pci=option[,option...] [PCI] various PCI subsystem options: off [X86] don't probe for the PCI bus @@ -1638,6 +1684,17 @@ and is between 256 and 4096 characters. It is defined in the file nomsi [MSI] If the PCI_MSI kernel config parameter is enabled, this kernel boot option can be used to disable the use of MSI interrupts system-wide. + noioapicquirk [APIC] Disable all boot interrupt quirks. + Safety option to keep boot IRQs enabled. This + should never be necessary. + ioapicreroute [APIC] Enable rerouting of boot IRQs to the + primary IO-APIC for bridges that cannot disable + boot IRQs. This fixes a source of spurious IRQs + when the system masks IRQs. + noioapicreroute [APIC] Disable workaround that uses the + boot IRQ equivalent of an IRQ that connects to + a chipset where boot IRQs cannot be disabled. + The opposite of ioapicreroute. biosirq [X86-32] Use PCI BIOS calls to get the interrupt routing table. These calls are known to be buggy on several machines and they hang the machine @@ -1714,7 +1771,7 @@ and is between 256 and 4096 characters. It is defined in the file pcmv= [HW,PCMCIA] BadgePAD 4 pd. [PARIDE] - See Documentation/paride.txt. + See Documentation/blockdev/paride.txt. pdcchassis= [PARISC,HW] Disable/Enable PDC Chassis Status codes at boot time. @@ -1722,10 +1779,10 @@ and is between 256 and 4096 characters. It is defined in the file See arch/parisc/kernel/pdc_chassis.c pf. [PARIDE] - See Documentation/paride.txt. + See Documentation/blockdev/paride.txt. pg. [PARIDE] - See Documentation/paride.txt. + See Documentation/blockdev/paride.txt. pirq= [SMP,APIC] Manual mp-table setup See Documentation/x86/i386/IO-APIC.txt. @@ -1762,10 +1819,10 @@ and is between 256 and 4096 characters. It is defined in the file autoconfiguration. Ranges are in pairs (memory base and size). - dynamic_printk - Enables pr_debug()/dev_dbg() calls if - CONFIG_DYNAMIC_PRINTK_DEBUG has been enabled. These can also - be switched on/off via /dynamic_printk/modules + dynamic_printk Enables pr_debug()/dev_dbg() calls if + CONFIG_DYNAMIC_PRINTK_DEBUG has been enabled. + These can also be switched on/off via + /dynamic_printk/modules print-fatal-signals= [KNL] debug: print fatal signals @@ -1795,7 +1852,7 @@ and is between 256 and 4096 characters. It is defined in the file prompt_ramdisk= [RAM] List of RAM disks to prompt for floppy disk before loading. - See Documentation/ramdisk.txt. + See Documentation/blockdev/ramdisk.txt. psmouse.proto= [HW,MOUSE] Highest PS2 mouse protocol extension to probe for; one of (bare|imps|exps|lifebook|any). @@ -1815,7 +1872,7 @@ and is between 256 and 4096 characters. It is defined in the file ,,,,, pt. [PARIDE] - See Documentation/paride.txt. + See Documentation/blockdev/paride.txt. pty.legacy_count= [KNL] Number of legacy pty's. Overwrites compiled-in @@ -1829,10 +1886,10 @@ and is between 256 and 4096 characters. It is defined in the file See Documentation/md.txt. ramdisk_blocksize= [RAM] - See Documentation/ramdisk.txt. + See Documentation/blockdev/ramdisk.txt. ramdisk_size= [RAM] Sizes of RAM disks in kilobytes - See Documentation/ramdisk.txt. + See Documentation/blockdev/ramdisk.txt. rcupdate.blimit= [KNL,BOOT] Set maximum number of finished RCU callbacks to process @@ -1853,7 +1910,7 @@ and is between 256 and 4096 characters. It is defined in the file reboot= [BUGS=X86-32,BUGS=ARM,BUGS=IA-64] Rebooting mode Format: [,[,...]] - See arch/*/kernel/reboot.c or arch/*/kernel/process.c + See arch/*/kernel/reboot.c or arch/*/kernel/process.c relax_domain_level= [KNL, SMP] Set scheduler's default relax_domain_level. @@ -2164,7 +2221,7 @@ and is between 256 and 4096 characters. It is defined in the file See Documentation/sonypi.txt specialix= [HW,SERIAL] Specialix multi-serial port adapter - See Documentation/specialix.txt. + See Documentation/serial/specialix.txt. spia_io_base= [HW,MTD] spia_fio_base= @@ -2177,6 +2234,9 @@ and is between 256 and 4096 characters. It is defined in the file st= [HW,SCSI] SCSI tape parameters (buffers, etc.) See Documentation/scsi/st.txt. + stacktrace [FTRACE] + Enabled the stack tracer on boot up. + sti= [PARISC,HW] Format: Set the STI (builtin display/keyboard on the HP-PARISC @@ -2250,7 +2310,8 @@ and is between 256 and 4096 characters. It is defined in the file thermal.psv= [HW,ACPI] -1: disable all passive trip points - : override all passive trip points to this value + : override all passive trip points to this + value thermal.tzp= [HW,ACPI] Specify global default ACPI thermal zone polling rate @@ -2261,12 +2322,27 @@ and is between 256 and 4096 characters. It is defined in the file See comment before function dc390_setup() in drivers/scsi/tmscsim.c. + topology= [S390] + Format: {off | on} + Specify if the kernel should make use of the cpu + topology informations if the hardware supports these. + The scheduler will make use of these informations and + e.g. base its process migration decisions on it. + Default is off. + tp720= [HW,PS2] trix= [HW,OSS] MediaTrix AudioTrix Pro Format: ,,,,,,,, + tsc= Disable clocksource-must-verify flag for TSC. + Format: + [x86] reliable: mark tsc clocksource as reliable, this + disables clocksource verification at runtime. + Used to enable high-resolution timer mode on older + hardware, and in virtualized environment. + turbografx.map[2|3]= [HW,JOY] TurboGraFX parallel port interface Format: @@ -2323,6 +2399,41 @@ and is between 256 and 4096 characters. It is defined in the file usbhid.mousepoll= [USBHID] The interval which mice are to be polled at. + usb-storage.delay_use= + [UMS] The delay in seconds before a new device is + scanned for Logical Units (default 5). + + usb-storage.quirks= + [UMS] A list of quirks entries to supplement or + override the built-in unusual_devs list. List + entries are separated by commas. Each entry has + the form VID:PID:Flags where VID and PID are Vendor + and Product ID values (4-digit hex numbers) and + Flags is a set of characters, each corresponding + to a common usb-storage quirk flag as follows: + a = SANE_SENSE (collect more than 18 bytes + of sense data); + c = FIX_CAPACITY (decrease the reported + device capacity by one sector); + h = CAPACITY_HEURISTICS (decrease the + reported device capacity by one + sector if the number is odd); + i = IGNORE_DEVICE (don't bind to this + device); + l = NOT_LOCKABLE (don't try to lock and + unlock ejectable media); + m = MAX_SECTORS_64 (don't transfer more + than 64 sectors = 32 KB at a time); + o = CAPACITY_OK (accept the capacity + reported by the device); + r = IGNORE_RESIDUE (the device reports + bogus residue values); + s = SINGLE_LUN (the device has only one + Logical Unit); + w = NO_WP_DETECT (don't test whether the + medium is write-protected). + Example: quirks=0419:aaf5:rl,0421:0433:rc + add_efi_memmap [EFI; x86-32,X86-64] Include EFI memory map in kernel's map of available physical RAM. @@ -2383,8 +2494,8 @@ and is between 256 and 4096 characters. It is defined in the file Format: ,,,,,[,[,[,]]] - norandmaps Don't use address space randomization - Equivalent to echo 0 > /proc/sys/kernel/randomize_va_space + norandmaps Don't use address space randomization. Equivalent to + echo 0 > /proc/sys/kernel/randomize_va_space ______________________________________________________________________ diff --git a/Documentation/kobject.txt b/Documentation/kobject.txt index f5d2aad65a6..b2e374586bd 100644 --- a/Documentation/kobject.txt +++ b/Documentation/kobject.txt @@ -118,8 +118,8 @@ the name of the kobject, call kobject_rename(): int kobject_rename(struct kobject *kobj, const char *new_name); -Note kobject_rename does perform any locking or have a solid notion of -what names are valid so the provide must provide their own sanity checking +kobject_rename does not perform any locking or have a solid notion of +what names are valid so the caller must provide their own sanity checking and serialization. There is a function called kobject_set_name() but that is legacy cruft and diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt index a79633d702b..48b3de90eb1 100644 --- a/Documentation/kprobes.txt +++ b/Documentation/kprobes.txt @@ -497,7 +497,10 @@ The first column provides the kernel address where the probe is inserted. The second column identifies the type of probe (k - kprobe, r - kretprobe and j - jprobe), while the third column specifies the symbol+offset of the probe. If the probed function belongs to a module, the module name -is also specified. +is also specified. Following columns show probe status. If the probe is on +a virtual address that is no longer valid (module init sections, module +virtual addresses that correspond to modules that've been unloaded), +such probes are marked with [GONE]. /debug/kprobes/enabled: Turn kprobes ON/OFF diff --git a/Documentation/laptops/thinkpad-acpi.txt b/Documentation/laptops/thinkpad-acpi.txt index 71f0fe1fc1b..41bc99fa188 100644 --- a/Documentation/laptops/thinkpad-acpi.txt +++ b/Documentation/laptops/thinkpad-acpi.txt @@ -1,7 +1,7 @@ ThinkPad ACPI Extras Driver - Version 0.21 - May 29th, 2008 + Version 0.22 + November 23rd, 2008 Borislav Deianov Henrique de Moraes Holschuh @@ -16,7 +16,8 @@ supported by the generic Linux ACPI drivers. This driver used to be named ibm-acpi until kernel 2.6.21 and release 0.13-20070314. It used to be in the drivers/acpi tree, but it was moved to the drivers/misc tree and renamed to thinkpad-acpi for kernel -2.6.22, and release 0.14. +2.6.22, and release 0.14. It was moved to drivers/platform/x86 for +kernel 2.6.29 and release 0.22. The driver is named "thinkpad-acpi". In some places, like module names, "thinkpad_acpi" is used because of userspace issues. @@ -1412,6 +1413,24 @@ Sysfs notes: rfkill controller switch "tpacpi_wwan_sw": refer to Documentation/rfkill.txt for details. +EXPERIMENTAL: UWB +----------------- + +This feature is marked EXPERIMENTAL because it has not been extensively +tested and validated in various ThinkPad models yet. The feature may not +work as expected. USE WITH CAUTION! To use this feature, you need to supply +the experimental=1 parameter when loading the module. + +sysfs rfkill class: switch "tpacpi_uwb_sw" + +This feature exports an rfkill controller for the UWB device, if one is +present and enabled in the BIOS. + +Sysfs notes: + + rfkill controller switch "tpacpi_uwb_sw": refer to + Documentation/rfkill.txt for details. + Multiple Commands, Module Parameters ------------------------------------ @@ -1475,7 +1494,7 @@ Sysfs interface changelog: 0x020100: Marker for thinkpad-acpi with hot key NVRAM polling support. If you must, use it to know you should not - start an userspace NVRAM poller (allows to detect when + start a userspace NVRAM poller (allows to detect when NVRAM is compiled out by the user because it is unneeded/undesired in the first place). 0x020101: Marker for thinkpad-acpi with hot key NVRAM polling diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index 804520633fc..f2dbbf3bdea 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -481,51 +481,6 @@ static unsigned long load_initrd(const char *name, unsigned long mem) /* We return the initrd size. */ return len; } - -/* Once we know how much memory we have we can construct simple linear page - * tables which set virtual == physical which will get the Guest far enough - * into the boot to create its own. - * - * We lay them out of the way, just below the initrd (which is why we need to - * know its size here). */ -static unsigned long setup_pagetables(unsigned long mem, - unsigned long initrd_size) -{ - unsigned long *pgdir, *linear; - unsigned int mapped_pages, i, linear_pages; - unsigned int ptes_per_page = getpagesize()/sizeof(void *); - - mapped_pages = mem/getpagesize(); - - /* Each PTE page can map ptes_per_page pages: how many do we need? */ - linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page; - - /* We put the toplevel page directory page at the top of memory. */ - pgdir = from_guest_phys(mem) - initrd_size - getpagesize(); - - /* Now we use the next linear_pages pages as pte pages */ - linear = (void *)pgdir - linear_pages*getpagesize(); - - /* Linear mapping is easy: put every page's address into the mapping in - * order. PAGE_PRESENT contains the flags Present, Writable and - * Executable. */ - for (i = 0; i < mapped_pages; i++) - linear[i] = ((i * getpagesize()) | PAGE_PRESENT); - - /* The top level points to the linear page table pages above. */ - for (i = 0; i < mapped_pages; i += ptes_per_page) { - pgdir[i/ptes_per_page] - = ((to_guest_phys(linear) + i*sizeof(void *)) - | PAGE_PRESENT); - } - - verbose("Linear mapping of %u pages in %u pte pages at %#lx\n", - mapped_pages, linear_pages, to_guest_phys(linear)); - - /* We return the top level (guest-physical) address: the kernel needs - * to know where it is. */ - return to_guest_phys(pgdir); -} /*:*/ /* Simple routine to roll all the commandline arguments together with spaces @@ -548,13 +503,13 @@ static void concat(char *dst, char *args[]) /*L:185 This is where we actually tell the kernel to initialize the Guest. We * saw the arguments it expects when we looked at initialize() in lguest_user.c: - * the base of Guest "physical" memory, the top physical page to allow, the - * top level pagetable and the entry point for the Guest. */ -static int tell_kernel(unsigned long pgdir, unsigned long start) + * the base of Guest "physical" memory, the top physical page to allow and the + * entry point for the Guest. */ +static int tell_kernel(unsigned long start) { unsigned long args[] = { LHREQ_INITIALIZE, (unsigned long)guest_base, - guest_limit / getpagesize(), pgdir, start }; + guest_limit / getpagesize(), start }; int fd; verbose("Guest: %p - %p (%#lx)\n", @@ -1030,7 +985,7 @@ static void update_device_status(struct device *dev) /* Zero out the virtqueues. */ for (vq = dev->vq; vq; vq = vq->next) { memset(vq->vring.desc, 0, - vring_size(vq->config.num, getpagesize())); + vring_size(vq->config.num, LGUEST_VRING_ALIGN)); lg_last_avail(vq) = 0; } } else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) { @@ -1211,7 +1166,7 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, void *p; /* First we need some memory for this virtqueue. */ - pages = (vring_size(num_descs, getpagesize()) + getpagesize() - 1) + pages = (vring_size(num_descs, LGUEST_VRING_ALIGN) + getpagesize() - 1) / getpagesize(); p = get_pages(pages); @@ -1228,7 +1183,7 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, vq->config.pfn = to_guest_phys(p) / getpagesize(); /* Initialize the vring. */ - vring_init(&vq->vring, num_descs, p, getpagesize()); + vring_init(&vq->vring, num_descs, p, LGUEST_VRING_ALIGN); /* Append virtqueue to this device's descriptor. We use * device_config() to get the end of the device's current virtqueues; @@ -1941,7 +1896,7 @@ int main(int argc, char *argv[]) { /* Memory, top-level pagetable, code startpoint and size of the * (optional) initrd. */ - unsigned long mem = 0, pgdir, start, initrd_size = 0; + unsigned long mem = 0, start, initrd_size = 0; /* Two temporaries and the /dev/lguest file descriptor. */ int i, c, lguest_fd; /* The boot information for the Guest. */ @@ -2040,9 +1995,6 @@ int main(int argc, char *argv[]) boot->hdr.type_of_loader = 0xFF; } - /* Set up the initial linear pagetables, starting below the initrd. */ - pgdir = setup_pagetables(mem, initrd_size); - /* The Linux boot header contains an "E820" memory map: ours is a * simple, single region. */ boot->e820_entries = 1; @@ -2064,7 +2016,7 @@ int main(int argc, char *argv[]) /* We tell the kernel to initialize the Guest: this returns the open * /dev/lguest file descriptor. */ - lguest_fd = tell_kernel(pgdir, start); + lguest_fd = tell_kernel(start); /* We clone off a thread, which wakes the Launcher whenever one of the * input file descriptors needs attention. We call this the Waker, and diff --git a/Documentation/local_ops.txt b/Documentation/local_ops.txt index f4f8b1c6c8b..23045b8b50f 100644 --- a/Documentation/local_ops.txt +++ b/Documentation/local_ops.txt @@ -149,7 +149,7 @@ static void do_test_timer(unsigned long data) int cpu; /* Increment the counters */ - on_each_cpu(test_each, NULL, 0, 1); + on_each_cpu(test_each, NULL, 1); /* Read all the counters */ printk("Counters read from CPU %d\n", smp_processor_id()); for_each_online_cpu(cpu) { diff --git a/Documentation/lockstat.txt b/Documentation/lockstat.txt index 4ba4664ce5c..9cb9138f7a7 100644 --- a/Documentation/lockstat.txt +++ b/Documentation/lockstat.txt @@ -71,35 +71,50 @@ Look at the current lock statistics: # less /proc/lock_stat -01 lock_stat version 0.2 +01 lock_stat version 0.3 02 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 03 class name con-bounces contentions waittime-min waittime-max waittime-total acq-bounces acquisitions holdtime-min holdtime-max holdtime-total 04 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 05 -06 &inode->i_data.tree_lock-W: 15 21657 0.18 1093295.30 11547131054.85 58 10415 0.16 87.51 6387.60 -07 &inode->i_data.tree_lock-R: 0 0 0.00 0.00 0.00 23302 231198 0.25 8.45 98023.38 -08 -------------------------- -09 &inode->i_data.tree_lock 0 [] add_to_page_cache+0x5f/0x190 -10 -11 ............................................................................................................................................................................................... -12 -13 dcache_lock: 1037 1161 0.38 45.32 774.51 6611 243371 0.15 306.48 77387.24 -14 ----------- -15 dcache_lock 180 [] sys_getcwd+0x11e/0x230 -16 dcache_lock 165 [] d_alloc+0x15a/0x210 -17 dcache_lock 33 [] _atomic_dec_and_lock+0x4d/0x70 -18 dcache_lock 1 [] shrink_dcache_parent+0x18/0x130 +06 &mm->mmap_sem-W: 233 538 18446744073708 22924.27 607243.51 1342 45806 1.71 8595.89 1180582.34 +07 &mm->mmap_sem-R: 205 587 18446744073708 28403.36 731975.00 1940 412426 0.58 187825.45 6307502.88 +08 --------------- +09 &mm->mmap_sem 487 [] do_page_fault+0x466/0x928 +10 &mm->mmap_sem 179 [] sys_mprotect+0xcd/0x21d +11 &mm->mmap_sem 279 [] sys_mmap+0x75/0xce +12 &mm->mmap_sem 76 [] sys_munmap+0x32/0x59 +13 --------------- +14 &mm->mmap_sem 270 [] sys_mmap+0x75/0xce +15 &mm->mmap_sem 431 [] do_page_fault+0x466/0x928 +16 &mm->mmap_sem 138 [] sys_munmap+0x32/0x59 +17 &mm->mmap_sem 145 [] sys_mprotect+0xcd/0x21d +18 +19 ............................................................................................................................................................................................... +20 +21 dcache_lock: 621 623 0.52 118.26 1053.02 6745 91930 0.29 316.29 118423.41 +22 ----------- +23 dcache_lock 179 [] _atomic_dec_and_lock+0x34/0x54 +24 dcache_lock 113 [] d_alloc+0x19a/0x1eb +25 dcache_lock 99 [] d_rehash+0x1b/0x44 +26 dcache_lock 104 [] d_instantiate+0x36/0x8a +27 ----------- +28 dcache_lock 192 [] _atomic_dec_and_lock+0x34/0x54 +29 dcache_lock 98 [] d_rehash+0x1b/0x44 +30 dcache_lock 72 [] d_alloc+0x19a/0x1eb +31 dcache_lock 112 [] d_instantiate+0x36/0x8a This excerpt shows the first two lock class statistics. Line 01 shows the output version - each time the format changes this will be updated. Line 02-04 -show the header with column descriptions. Lines 05-10 and 13-18 show the actual +show the header with column descriptions. Lines 05-18 and 20-31 show the actual statistics. These statistics come in two parts; the actual stats separated by a -short separator (line 08, 14) from the contention points. +short separator (line 08, 13) from the contention points. -The first lock (05-10) is a read/write lock, and shows two lines above the +The first lock (05-18) is a read/write lock, and shows two lines above the short separator. The contention points don't match the column descriptors, -they have two: contentions and [] symbol. +they have two: contentions and [] symbol. The second set of contention +points are the points we're contending with. +The integer part of the time values is in us. View the top contending locks: diff --git a/Documentation/magic-number.txt b/Documentation/magic-number.txt index 95070028d15..505f1960754 100644 --- a/Documentation/magic-number.txt +++ b/Documentation/magic-number.txt @@ -125,14 +125,14 @@ TRIDENT_CARD_MAGIC 0x5072696E trident_card sound/oss/trident.c ROUTER_MAGIC 0x524d4157 wan_device include/linux/wanrouter.h SCC_MAGIC 0x52696368 gs_port drivers/char/scc.h SAVEKMSG_MAGIC1 0x53415645 savekmsg arch/*/amiga/config.c -GDA_MAGIC 0x58464552 gda include/asm-mips64/sn/gda.h +GDA_MAGIC 0x58464552 gda arch/mips/include/asm/sn/gda.h RED_MAGIC1 0x5a2cf071 (any) mm/slab.c STL_PORTMAGIC 0x5a7182c9 stlport include/linux/stallion.h EEPROM_MAGIC_VALUE 0x5ab478d2 lanai_dev drivers/atm/lanai.c HDLCDRV_MAGIC 0x5ac6e778 hdlcdrv_state include/linux/hdlcdrv.h EPCA_MAGIC 0x5c6df104 channel include/linux/epca.h PCXX_MAGIC 0x5c6df104 channel drivers/char/pcxx.h -KV_MAGIC 0x5f4b565f kernel_vars_s include/asm-mips64/sn/klkernvars.h +KV_MAGIC 0x5f4b565f kernel_vars_s arch/mips/include/asm/sn/klkernvars.h I810_STATE_MAGIC 0x63657373 i810_state sound/oss/i810_audio.c TRIDENT_STATE_MAGIC 0x63657373 trient_state sound/oss/trident.c M3_CARD_MAGIC 0x646e6f50 m3_card sound/oss/maestro3.c @@ -158,7 +158,7 @@ CCB_MAGIC 0xf2691ad2 ccb drivers/scsi/ncr53c8xx.c QUEUE_MAGIC_FREE 0xf7e1c9a3 queue_entry drivers/scsi/arm/queue.c QUEUE_MAGIC_USED 0xf7e1cc33 queue_entry drivers/scsi/arm/queue.c HTB_CMAGIC 0xFEFAFEF1 htb_class net/sched/sch_htb.c -NMI_MAGIC 0x48414d4d455201 nmi_s include/asm-mips64/sn/nmi.h +NMI_MAGIC 0x48414d4d455201 nmi_s arch/mips/include/asm/sn/nmi.h Note that there are also defined special per-driver magic numbers in sound memory management. See include/sound/sndmagic.h for complete list of them. Many diff --git a/Documentation/markers.txt b/Documentation/markers.txt index 089f6138fcd..d2b3d0e91b2 100644 --- a/Documentation/markers.txt +++ b/Documentation/markers.txt @@ -51,11 +51,16 @@ to call) for the specific marker through marker_probe_register() and can be activated by calling marker_arm(). Marker deactivation can be done by calling marker_disarm() as many times as marker_arm() has been called. Removing a probe is done through marker_probe_unregister(); it will disarm the probe. -marker_synchronize_unregister() must be called before the end of the module exit -function to make sure there is no caller left using the probe. This, and the -fact that preemption is disabled around the probe call, make sure that probe -removal and module unload are safe. See the "Probe example" section below for a -sample probe module. + +marker_synchronize_unregister() must be called between probe unregistration and +the first occurrence of +- the end of module exit function, + to make sure there is no caller left using the probe; +- the free of any resource used by the probes, + to make sure the probes wont be accessing invalid data. +This, and the fact that preemption is disabled around the probe call, make sure +that probe removal and module unload are safe. See the "Probe example" section +below for a sample probe module. The marker mechanism supports inserting multiple instances of the same marker. Markers can be put in inline functions, inlined static functions, and @@ -70,6 +75,20 @@ a printk warning which identifies the inconsistency: "Format mismatch for probe probe_name (format), marker (format)" +Another way to use markers is to simply define the marker without generating any +function call to actually call into the marker. This is useful in combination +with tracepoint probes in a scheme like this : + +void probe_tracepoint_name(unsigned int arg1, struct task_struct *tsk); + +DEFINE_MARKER_TP(marker_eventname, tracepoint_name, probe_tracepoint_name, + "arg1 %u pid %d"); + +notrace void probe_tracepoint_name(unsigned int arg1, struct task_struct *tsk) +{ + struct marker *marker = &GET_MARKER(kernel_irq_entry); + /* write data to trace buffers ... */ +} * Probe / marker example diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt index 168117bd6ee..4c2ecf537a4 100644 --- a/Documentation/memory-hotplug.txt +++ b/Documentation/memory-hotplug.txt @@ -124,7 +124,7 @@ config options. This option can be kernel module too. -------------------------------- -3 sysfs files for memory hotplug +4 sysfs files for memory hotplug -------------------------------- All sections have their device information under /sys/devices/system/memory as @@ -138,11 +138,12 @@ For example, assume 1GiB section size. A device for a memory starting at (0x100000000 / 1Gib = 4) This device covers address range [0x100000000 ... 0x140000000) -Under each section, you can see 3 files. +Under each section, you can see 4 files. /sys/devices/system/memory/memoryXXX/phys_index /sys/devices/system/memory/memoryXXX/phys_device /sys/devices/system/memory/memoryXXX/state +/sys/devices/system/memory/memoryXXX/removable 'phys_index' : read-only and contains section id, same as XXX. 'state' : read-write @@ -150,10 +151,20 @@ Under each section, you can see 3 files. at write: user can specify "online", "offline" command 'phys_device': read-only: designed to show the name of physical memory device. This is not well implemented now. +'removable' : read-only: contains an integer value indicating + whether the memory section is removable or not + removable. A value of 1 indicates that the memory + section is removable and a value of 0 indicates that + it is not removable. NOTE: These directories/files appear after physical memory hotplug phase. +If CONFIG_NUMA is enabled the +/sys/devices/system/memory/memoryXXX memory section +directories can also be accessed via symbolic links located in +the /sys/devices/system/node/node* directories. For example: +/sys/devices/system/node/node0/memory9 -> ../../memory/memory9 -------------------------------- 4. Physical memory hot-add phase @@ -365,7 +376,6 @@ node if necessary. - allowing memory hot-add to ZONE_MOVABLE. maybe we need some switch like sysctl or new control file. - showing memory section and physical device relationship. - - showing memory section and node relationship (maybe good for NUMA) - showing memory section is under ZONE_MOVABLE or not - test and make it better memory offlining. - support HugeTLB page migration and offlining. diff --git a/Documentation/mips/AU1xxx_IDE.README b/Documentation/mips/AU1xxx_IDE.README index 25a6ed1aaa5..8ace35ebdcd 100644 --- a/Documentation/mips/AU1xxx_IDE.README +++ b/Documentation/mips/AU1xxx_IDE.README @@ -44,7 +44,7 @@ FILES, CONFIGS AND COMPATABILITY Two files are introduced: - a) 'include/asm-mips/mach-au1x00/au1xxx_ide.h' + a) 'arch/mips/include/asm/mach-au1x00/au1xxx_ide.h' containes : struct _auide_hwif timing parameters for PIO mode 0/1/2/3/4 timing parameters for MWDMA 0/1/2 @@ -52,14 +52,12 @@ Two files are introduced: b) 'drivers/ide/mips/au1xxx-ide.c' contains the functionality of the AU1XXX IDE driver -Four configs variables are introduced: +Following extra configs variables are introduced: CONFIG_BLK_DEV_IDE_AU1XXX_PIO_DBDMA - enable the PIO+DBDMA mode CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA - enable the MWDMA mode CONFIG_BLK_DEV_IDE_AU1XXX_BURSTABLE_ON - set Burstable FIFO in DBDMA controller - CONFIG_BLK_DEV_IDE_AU1XXX_SEQTS_PER_RQ - maximum transfer size - per descriptor SUPPORTED IDE MODES @@ -87,7 +85,6 @@ CONFIG_BLK_DEV_IDEDMA_PCI=y CONFIG_IDEDMA_PCI_AUTO=y CONFIG_BLK_DEV_IDE_AU1XXX=y CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA=y -CONFIG_BLK_DEV_IDE_AU1XXX_SEQTS_PER_RQ=128 CONFIG_BLK_DEV_IDEDMA=y CONFIG_IDEDMA_AUTO=y @@ -105,7 +102,6 @@ CONFIG_BLK_DEV_IDEDMA_PCI=y CONFIG_IDEDMA_PCI_AUTO=y CONFIG_BLK_DEV_IDE_AU1XXX=y CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA=y -CONFIG_BLK_DEV_IDE_AU1XXX_SEQTS_PER_RQ=128 CONFIG_BLK_DEV_IDEDMA=y CONFIG_IDEDMA_AUTO=y diff --git a/Documentation/moxa-smartio b/Documentation/moxa-smartio deleted file mode 100644 index 5337e80a5b9..00000000000 --- a/Documentation/moxa-smartio +++ /dev/null @@ -1,523 +0,0 @@ -============================================================================= - MOXA Smartio/Industio Family Device Driver Installation Guide - for Linux Kernel 2.4.x, 2.6.x - Copyright (C) 2008, Moxa Inc. -============================================================================= -Date: 01/21/2008 - -Content - -1. Introduction -2. System Requirement -3. Installation - 3.1 Hardware installation - 3.2 Driver files - 3.3 Device naming convention - 3.4 Module driver configuration - 3.5 Static driver configuration for Linux kernel 2.4.x and 2.6.x. - 3.6 Custom configuration - 3.7 Verify driver installation -4. Utilities -5. Setserial -6. Troubleshooting - ------------------------------------------------------------------------------ -1. Introduction - - The Smartio/Industio/UPCI family Linux driver supports following multiport - boards. - - - 2 ports multiport board - CP-102U, CP-102UL, CP-102UF - CP-132U-I, CP-132UL, - CP-132, CP-132I, CP132S, CP-132IS, - CI-132, CI-132I, CI-132IS, - (C102H, C102HI, C102HIS, C102P, CP-102, CP-102S) - - - 4 ports multiport board - CP-104EL, - CP-104UL, CP-104JU, - CP-134U, CP-134U-I, - C104H/PCI, C104HS/PCI, - CP-114, CP-114I, CP-114S, CP-114IS, CP-114UL, - C104H, C104HS, - CI-104J, CI-104JS, - CI-134, CI-134I, CI-134IS, - (C114HI, CT-114I, C104P) - POS-104UL, - CB-114, - CB-134I - - - 8 ports multiport board - CP-118EL, CP-168EL, - CP-118U, CP-168U, - C168H/PCI, - C168H, C168HS, - (C168P), - CB-108 - - This driver and installation procedure have been developed upon Linux Kernel - 2.4.x and 2.6.x. This driver supports Intel x86 hardware platform. In order - to maintain compatibility, this version has also been properly tested with - RedHat, Mandrake, Fedora and S.u.S.E Linux. However, if compatibility problem - occurs, please contact Moxa at support@moxa.com.tw. - - In addition to device driver, useful utilities are also provided in this - version. They are - - msdiag Diagnostic program for displaying installed Moxa - Smartio/Industio boards. - - msmon Monitor program to observe data count and line status signals. - - msterm A simple terminal program which is useful in testing serial - ports. - - io-irq.exe Configuration program to setup ISA boards. Please note that - this program can only be executed under DOS. - - All the drivers and utilities are published in form of source code under - GNU General Public License in this version. Please refer to GNU General - Public License announcement in each source code file for more detail. - - In Moxa's Web sites, you may always find latest driver at http://web.moxa.com. - - This version of driver can be installed as Loadable Module (Module driver) - or built-in into kernel (Static driver). You may refer to following - installation procedure for suitable one. Before you install the driver, - please refer to hardware installation procedure in the User's Manual. - - We assume the user should be familiar with following documents. - - Serial-HOWTO - - Kernel-HOWTO - ------------------------------------------------------------------------------ -2. System Requirement - - Hardware platform: Intel x86 machine - - Kernel version: 2.4.x or 2.6.x - - gcc version 2.72 or later - - Maximum 4 boards can be installed in combination - ------------------------------------------------------------------------------ -3. Installation - - 3.1 Hardware installation - 3.2 Driver files - 3.3 Device naming convention - 3.4 Module driver configuration - 3.5 Static driver configuration for Linux kernel 2.4.x, 2.6.x. - 3.6 Custom configuration - 3.7 Verify driver installation - - - 3.1 Hardware installation - - There are two types of buses, ISA and PCI, for Smartio/Industio - family multiport board. - - ISA board - --------- - You'll have to configure CAP address, I/O address, Interrupt Vector - as well as IRQ before installing this driver. Please refer to hardware - installation procedure in User's Manual before proceed any further. - Please make sure the JP1 is open after the ISA board is set properly. - - PCI/UPCI board - -------------- - You may need to adjust IRQ usage in BIOS to avoid from IRQ conflict - with other ISA devices. Please refer to hardware installation - procedure in User's Manual in advance. - - PCI IRQ Sharing - ----------- - Each port within the same multiport board shares the same IRQ. Up to - 4 Moxa Smartio/Industio PCI Family multiport boards can be installed - together on one system and they can share the same IRQ. - - - 3.2 Driver files - - The driver file may be obtained from ftp, CD-ROM or floppy disk. The - first step, anyway, is to copy driver file "mxser.tgz" into specified - directory. e.g. /moxa. The execute commands as below. - - # cd / - # mkdir moxa - # cd /moxa - # tar xvf /dev/fd0 - - or - - # cd / - # mkdir moxa - # cd /moxa - # cp /mnt/cdrom//mxser.tgz . - # tar xvfz mxser.tgz - - - 3.3 Device naming convention - - You may find all the driver and utilities files in /moxa/mxser. - Following installation procedure depends on the model you'd like to - run the driver. If you prefer module driver, please refer to 3.4. - If static driver is required, please refer to 3.5. - - Dialin and callout port - ----------------------- - This driver remains traditional serial device properties. There are - two special file name for each serial port. One is dial-in port - which is named "ttyMxx". For callout port, the naming convention - is "cumxx". - - Device naming when more than 2 boards installed - ----------------------------------------------- - Naming convention for each Smartio/Industio multiport board is - pre-defined as below. - - Board Num. Dial-in Port Callout port - 1st board ttyM0 - ttyM7 cum0 - cum7 - 2nd board ttyM8 - ttyM15 cum8 - cum15 - 3rd board ttyM16 - ttyM23 cum16 - cum23 - 4th board ttyM24 - ttym31 cum24 - cum31 - - - !!!!!!!!!!!!!!!!!!!! NOTE !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - Under Kernel 2.6 the cum Device is Obsolete. So use ttyM* - device instead. - !!!!!!!!!!!!!!!!!!!! NOTE !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - - Board sequence - -------------- - This driver will activate ISA boards according to the parameter set - in the driver. After all specified ISA board activated, PCI board - will be installed in the system automatically driven. - Therefore the board number is sorted by the CAP address of ISA boards. - For PCI boards, their sequence will be after ISA boards and C168H/PCI - has higher priority than C104H/PCI boards. - - 3.4 Module driver configuration - Module driver is easiest way to install. If you prefer static driver - installation, please skip this paragraph. - - - ------------- Prepare to use the MOXA driver-------------------- - 3.4.1 Create tty device with correct major number - Before using MOXA driver, your system must have the tty devices - which are created with driver's major number. We offer one shell - script "msmknod" to simplify the procedure. - This step is only needed to be executed once. But you still - need to do this procedure when: - a. You change the driver's major number. Please refer the "3.7" - section. - b. Your total installed MOXA boards number is changed. Maybe you - add/delete one MOXA board. - c. You want to change the tty name. This needs to modify the - shell script "msmknod" - - The procedure is: - # cd /moxa/mxser/driver - # ./msmknod - - This shell script will require the major number for dial-in - device and callout device to create tty device. You also need - to specify the total installed MOXA board number. Default major - numbers for dial-in device and callout device are 30, 35. If - you need to change to other number, please refer section "3.7" - for more detailed procedure. - Msmknod will delete any special files occupying the same device - naming. - - 3.4.2 Build the MOXA driver and utilities - Before using the MOXA driver and utilities, you need compile the - all the source code. This step is only need to be executed once. - But you still re-compile the source code if you modify the source - code. For example, if you change the driver's major number (see - "3.7" section), then you need to do this step again. - - Find "Makefile" in /moxa/mxser, then run - - # make clean; make install - - !!!!!!!!!! NOTE !!!!!!!!!!!!!!!!! - For Red Hat 9, Red Hat Enterprise Linux AS3/ES3/WS3 & Fedora Core1: - # make clean; make installsp1 - - For Red Hat Enterprise Linux AS4/ES4/WS4: - # make clean; make installsp2 - !!!!!!!!!! NOTE !!!!!!!!!!!!!!!!! - - The driver files "mxser.o" and utilities will be properly compiled - and copied to system directories respectively. - - ------------- Load MOXA driver-------------------- - 3.4.3 Load the MOXA driver - - # modprobe mxser - - will activate the module driver. You may run "lsmod" to check - if "mxser" is activated. If the MOXA board is ISA board, the - is needed. Please refer to section "3.4.5" for more - information. - - - ------------- Load MOXA driver on boot -------------------- - 3.4.4 For the above description, you may manually execute - "modprobe mxser" to activate this driver and run - "rmmod mxser" to remove it. - However, it's better to have a boot time configuration to - eliminate manual operation. Boot time configuration can be - achieved by rc file. We offer one "rc.mxser" file to simplify - the procedure under "moxa/mxser/driver". - - But if you use ISA board, please modify the "modprobe ..." command - to add the argument (see "3.4.5" section). After modifying the - rc.mxser, please try to execute "/moxa/mxser/driver/rc.mxser" - manually to make sure the modification is ok. If any error - encountered, please try to modify again. If the modification is - completed, follow the below step. - - Run following command for setting rc files. - - # cd /moxa/mxser/driver - # cp ./rc.mxser /etc/rc.d - # cd /etc/rc.d - - Check "rc.serial" is existed or not. If "rc.serial" doesn't exist, - create it by vi, run "chmod 755 rc.serial" to change the permission. - Add "/etc/rc.d/rc.mxser" in last line, - - Reboot and check if moxa.o activated by "lsmod" command. - - 3.4.5. If you'd like to drive Smartio/Industio ISA boards in the system, - you'll have to add parameter to specify CAP address of given - board while activating "mxser.o". The format for parameters are - as follows. - - modprobe mxser ioaddr=0x???,0x???,0x???,0x??? - | | | | - | | | +- 4th ISA board - | | +------ 3rd ISA board - | +------------ 2nd ISA board - +------------------- 1st ISA board - - 3.5 Static driver configuration for Linux kernel 2.4.x and 2.6.x - - Note: To use static driver, you must install the linux kernel - source package. - - 3.5.1 Backup the built-in driver in the kernel. - # cd /usr/src/linux/drivers/char - # mv mxser.c mxser.c.old - - For Red Hat 7.x user, you need to create link: - # cd /usr/src - # ln -s linux-2.4 linux - - 3.5.2 Create link - # cd /usr/src/linux/drivers/char - # ln -s /moxa/mxser/driver/mxser.c mxser.c - - 3.5.3 Add CAP address list for ISA boards. For PCI boards user, - please skip this step. - - In module mode, the CAP address for ISA board is given by - parameter. In static driver configuration, you'll have to - assign it within driver's source code. If you will not - install any ISA boards, you may skip to next portion. - The instructions to modify driver source code are as - below. - a. # cd /moxa/mxser/driver - # vi mxser.c - b. Find the array mxserBoardCAP[] as below. - - static int mxserBoardCAP[] - = {0x00, 0x00, 0x00, 0x00}; - - c. Change the address within this array using vi. For - example, to driver 2 ISA boards with CAP address - 0x280 and 0x180 as 1st and 2nd board. Just to change - the source code as follows. - - static int mxserBoardCAP[] - = {0x280, 0x180, 0x00, 0x00}; - - 3.5.4 Setup kernel configuration - - Configure the kernel: - - # cd /usr/src/linux - # make menuconfig - - You will go into a menu-driven system. Please select [Character - devices][Non-standard serial port support], enable the [Moxa - SmartIO support] driver with "[*]" for built-in (not "[M]"), then - select [Exit] to exit this program. - - 3.5.5 Rebuild kernel - The following are for Linux kernel rebuilding, for your - reference only. - For appropriate details, please refer to the Linux document. - - a. cd /usr/src/linux - b. make clean /* take a few minutes */ - c. make dep /* take a few minutes */ - d. make bzImage /* take probably 10-20 minutes */ - e. make install /* copy boot image to correct position */ - f. Please make sure the boot kernel (vmlinuz) is in the - correct position. - g. If you use 'lilo' utility, you should check /etc/lilo.conf - 'image' item specified the path which is the 'vmlinuz' path, - or you will load wrong (or old) boot kernel image (vmlinuz). - After checking /etc/lilo.conf, please run "lilo". - - Note that if the result of "make bzImage" is ERROR, then you have to - go back to Linux configuration Setup. Type "make menuconfig" in - directory /usr/src/linux. - - - 3.5.6 Make tty device and special file - # cd /moxa/mxser/driver - # ./msmknod - - 3.5.7 Make utility - # cd /moxa/mxser/utility - # make clean; make install - - 3.5.8 Reboot - - - - 3.6 Custom configuration - Although this driver already provides you default configuration, you - still can change the device name and major number. The instruction to - change these parameters are shown as below. - - Change Device name - ------------------ - If you'd like to use other device names instead of default naming - convention, all you have to do is to modify the internal code - within the shell script "msmknod". First, you have to open "msmknod" - by vi. Locate each line contains "ttyM" and "cum" and change them - to the device name you desired. "msmknod" creates the device names - you need next time executed. - - Change Major number - ------------------- - If major number 30 and 35 had been occupied, you may have to select - 2 free major numbers for this driver. There are 3 steps to change - major numbers. - - 3.6.1 Find free major numbers - In /proc/devices, you may find all the major numbers occupied - in the system. Please select 2 major numbers that are available. - e.g. 40, 45. - 3.6.2 Create special files - Run /moxa/mxser/driver/msmknod to create special files with - specified major numbers. - 3.6.3 Modify driver with new major number - Run vi to open /moxa/mxser/driver/mxser.c. Locate the line - contains "MXSERMAJOR". Change the content as below. - #define MXSERMAJOR 40 - #define MXSERCUMAJOR 45 - 3.6.4 Run "make clean; make install" in /moxa/mxser/driver. - - 3.7 Verify driver installation - You may refer to /var/log/messages to check the latest status - log reported by this driver whenever it's activated. - ------------------------------------------------------------------------------ -4. Utilities - There are 3 utilities contained in this driver. They are msdiag, msmon and - msterm. These 3 utilities are released in form of source code. They should - be compiled into executable file and copied into /usr/bin. - - Before using these utilities, please load driver (refer 3.4 & 3.5) and - make sure you had run the "msmknod" utility. - - msdiag - Diagnostic - -------------------- - This utility provides the function to display what Moxa Smartio/Industio - board found by driver in the system. - - msmon - Port Monitoring - ----------------------- - This utility gives the user a quick view about all the MOXA ports' - activities. One can easily learn each port's total received/transmitted - (Rx/Tx) character count since the time when the monitoring is started. - Rx/Tx throughputs per second are also reported in interval basis (e.g. - the last 5 seconds) and in average basis (since the time the monitoring - is started). You can reset all ports' count by key. <+> <-> - (plus/minus) keys to change the displaying time interval. Press - on the port, that cursor stay, to view the port's communication - parameters, signal status, and input/output queue. - - msterm - Terminal Emulation - --------------------------- - This utility provides data sending and receiving ability of all tty ports, - especially for MOXA ports. It is quite useful for testing simple - application, for example, sending AT command to a modem connected to the - port or used as a terminal for login purpose. Note that this is only a - dumb terminal emulation without handling full screen operation. - ------------------------------------------------------------------------------ -5. Setserial - - Supported Setserial parameters are listed as below. - - uart set UART type(16450-->disable FIFO, 16550A-->enable FIFO) - close_delay set the amount of time(in 1/100 of a second) that DTR - should be kept low while being closed. - closing_wait set the amount of time(in 1/100 of a second) that the - serial port should wait for data to be drained while - being closed, before the receiver is disable. - spd_hi Use 57.6kb when the application requests 38.4kb. - spd_vhi Use 115.2kb when the application requests 38.4kb. - spd_shi Use 230.4kb when the application requests 38.4kb. - spd_warp Use 460.8kb when the application requests 38.4kb. - spd_normal Use 38.4kb when the application requests 38.4kb. - spd_cust Use the custom divisor to set the speed when the - application requests 38.4kb. - divisor This option set the custom divison. - baud_base This option set the base baud rate. - ------------------------------------------------------------------------------ -6. Troubleshooting - - The boot time error messages and solutions are stated as clearly as - possible. If all the possible solutions fail, please contact our technical - support team to get more help. - - - Error msg: More than 4 Moxa Smartio/Industio family boards found. Fifth board - and after are ignored. - Solution: - To avoid this problem, please unplug fifth and after board, because Moxa - driver supports up to 4 boards. - - Error msg: Request_irq fail, IRQ(?) may be conflict with another device. - Solution: - Other PCI or ISA devices occupy the assigned IRQ. If you are not sure - which device causes the situation, please check /proc/interrupts to find - free IRQ and simply change another free IRQ for Moxa board. - - Error msg: Board #: C1xx Series(CAP=xxx) interrupt number invalid. - Solution: - Each port within the same multiport board shares the same IRQ. Please set - one IRQ (IRQ doesn't equal to zero) for one Moxa board. - - Error msg: No interrupt vector be set for Moxa ISA board(CAP=xxx). - Solution: - Moxa ISA board needs an interrupt vector.Please refer to user's manual - "Hardware Installation" chapter to set interrupt vector. - - Error msg: Couldn't install MOXA Smartio/Industio family driver! - Solution: - Load Moxa driver fail, the major number may conflict with other devices. - Please refer to previous section 3.7 to change a free major number for - Moxa driver. - - Error msg: Couldn't install MOXA Smartio/Industio family callout driver! - Solution: - Load Moxa callout driver fail, the callout device major number may - conflict with other devices. Please refer to previous section 3.7 to - change a free callout device major number for Moxa driver. - - ------------------------------------------------------------------------------ - diff --git a/Documentation/nbd.txt b/Documentation/nbd.txt deleted file mode 100644 index aeb93ffe641..00000000000 --- a/Documentation/nbd.txt +++ /dev/null @@ -1,47 +0,0 @@ - Network Block Device (TCP version) - - What is it: With this compiled in the kernel (or as a module), Linux - can use a remote server as one of its block devices. So every time - the client computer wants to read, e.g., /dev/nb0, it sends a - request over TCP to the server, which will reply with the data read. - This can be used for stations with low disk space (or even diskless - - if you boot from floppy) to borrow disk space from another computer. - Unlike NFS, it is possible to put any filesystem on it, etc. It should - even be possible to use NBD as a root filesystem (I've never tried), - but it requires a user-level program to be in the initrd to start. - It also allows you to run block-device in user land (making server - and client physically the same computer, communicating using loopback). - - Current state: It currently works. Network block device is stable. - I originally thought that it was impossible to swap over TCP. It - turned out not to be true - swapping over TCP now works and seems - to be deadlock-free, but it requires heavy patches into Linux's - network layer. - - For more information, or to download the nbd-client and nbd-server - tools, go to http://nbd.sf.net/. - - Howto: To setup nbd, you can simply do the following: - - First, serve a device or file from a remote server: - - nbd-server - - e.g., - root@server1 # nbd-server 1234 /dev/sdb1 - - (serves sdb1 partition on TCP port 1234) - - Then, on the local (client) system: - - nbd-client /dev/nb[0-n] - - e.g., - root@client1 # nbd-client server1 1234 /dev/nb0 - - (creates the nb0 device on client1) - - The nbd kernel module need only be installed on the client - system, as the nbd-server is completely in userspace. In fact, - the nbd-server has been successfully ported to other operating - systems, including Windows. diff --git a/Documentation/networking/README.ipw2200 b/Documentation/networking/README.ipw2200 index 4f2a40f1dbc..80c728522c4 100644 --- a/Documentation/networking/README.ipw2200 +++ b/Documentation/networking/README.ipw2200 @@ -147,7 +147,7 @@ Where the supported parameter are: driver. If disabled, the driver will not attempt to scan for and associate to a network until it has been configured with one or more properties for the target network, for example configuring - the network SSID. Default is 1 (auto-associate) + the network SSID. Default is 0 (do not auto-associate) Example: % modprobe ipw2200 associate=0 diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt index 688dfe1e6b7..5ede7473b42 100644 --- a/Documentation/networking/bonding.txt +++ b/Documentation/networking/bonding.txt @@ -194,6 +194,48 @@ or, for backwards compatibility, the option value. E.g., The parameters are as follows: +ad_select + + Specifies the 802.3ad aggregation selection logic to use. The + possible values and their effects are: + + stable or 0 + + The active aggregator is chosen by largest aggregate + bandwidth. + + Reselection of the active aggregator occurs only when all + slaves of the active aggregator are down or the active + aggregator has no slaves. + + This is the default value. + + bandwidth or 1 + + The active aggregator is chosen by largest aggregate + bandwidth. Reselection occurs if: + + - A slave is added to or removed from the bond + + - Any slave's link state changes + + - Any slave's 802.3ad association state changes + + - The bond's adminstrative state changes to up + + count or 2 + + The active aggregator is chosen by the largest number of + ports (slaves). Reselection occurs as described under the + "bandwidth" setting, above. + + The bandwidth and count selection policies permit failover of + 802.3ad aggregations when partial failure of the active aggregator + occurs. This keeps the aggregator with the highest availability + (either in bandwidth or in number of ports) active at all times. + + This option was added in bonding version 3.4.0. + arp_interval Specifies the ARP link monitoring frequency in milliseconds. @@ -551,6 +593,16 @@ num_grat_arp affects only the active-backup mode. This option was added for bonding version 3.3.0. +num_unsol_na + + Specifies the number of unsolicited IPv6 Neighbor Advertisements + to be issued after a failover event. One unsolicited NA is issued + immediately after the failover. + + The valid range is 0 - 255; the default value is 1. This option + affects only the active-backup mode. This option was added for + bonding version 3.4.0. + primary A string (eth0, eth2, etc) specifying which slave is the @@ -922,17 +974,19 @@ USERCTL=no NETMASK, NETWORK and BROADCAST) to match your network configuration. For later versions of initscripts, such as that found with Fedora -7 and Red Hat Enterprise Linux version 5 (or later), it is possible, and, -indeed, preferable, to specify the bonding options in the ifcfg-bond0 +7 (or later) and Red Hat Enterprise Linux version 5 (or later), it is possible, +and, indeed, preferable, to specify the bonding options in the ifcfg-bond0 file, e.g. a line of the format: -BONDING_OPTS="mode=active-backup arp_interval=60 arp_ip_target=+192.168.1.254" +BONDING_OPTS="mode=active-backup arp_interval=60 arp_ip_target=192.168.1.254" will configure the bond with the specified options. The options specified in BONDING_OPTS are identical to the bonding module parameters -except for the arp_ip_target field. Each target should be included as a -separate option and should be preceded by a '+' to indicate it should be -added to the list of queried targets, e.g., +except for the arp_ip_target field when using versions of initscripts older +than and 8.57 (Fedora 8) and 8.45.19 (Red Hat Enterprise Linux 5.2). When +using older versions each target should be included as a separate option and +should be preceded by a '+' to indicate it should be added to the list of +queried targets, e.g., arp_ip_target=+192.168.1.1 arp_ip_target=+192.168.1.2 @@ -940,7 +994,7 @@ added to the list of queried targets, e.g., options via BONDING_OPTS, it is not necessary to edit /etc/modules.conf or /etc/modprobe.conf. - For older versions of initscripts that do not support + For even older versions of initscripts that do not support BONDING_OPTS, it is necessary to edit /etc/modules.conf (or /etc/modprobe.conf, depending upon your distro) to load the bonding module with your desired options when the bond0 interface is brought up. The diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index 39131a3c78f..7a3bb1abb83 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt @@ -57,6 +57,24 @@ can be set before calling bind(). DCCP_SOCKOPT_GET_CUR_MPS is read-only and retrieves the current maximum packet size (application payload size) in bytes, see RFC 4340, section 14. +DCCP_SOCKOPT_AVAILABLE_CCIDS is also read-only and returns the list of CCIDs +supported by the endpoint (see include/linux/dccp.h for symbolic constants). +The caller needs to provide a sufficiently large (> 2) array of type uint8_t. + +DCCP_SOCKOPT_CCID is write-only and sets both the TX and RX CCIDs at the same +time, combining the operation of the next two socket options. This option is +preferrable over the latter two, since often applications will use the same +type of CCID for both directions; and mixed use of CCIDs is not currently well +understood. This socket option takes as argument at least one uint8_t value, or +an array of uint8_t values, which must match available CCIDS (see above). CCIDs +must be registered on the socket before calling connect() or listen(). + +DCCP_SOCKOPT_TX_CCID is read/write. It returns the current CCID (if set) or sets +the preference list for the TX CCID, using the same format as DCCP_SOCKOPT_CCID. +Please note that the getsockopt argument type here is `int', not uint8_t. + +DCCP_SOCKOPT_RX_CCID is analogous to DCCP_SOCKOPT_TX_CCID, but for the RX CCID. + DCCP_SOCKOPT_SERVER_TIMEWAIT enables the server (listening socket) to hold timewait state when closing the connection (RFC 4340, 8.3). The usual case is that the closing server sends a CloseReq, whereupon the client holds timewait @@ -115,20 +133,12 @@ retries2 importance for retransmitted acknowledgments and feature negotiation, data packets are never retransmitted. Analogue of tcp_retries2. -send_ndp = 1 - Whether or not to send NDP count options (sec. 7.7.2). - -send_ackvec = 1 - Whether or not to send Ack Vector options (sec. 11.5). - -ack_ratio = 2 - The default Ack Ratio (sec. 11.3) to use. - tx_ccid = 2 - Default CCID for the sender-receiver half-connection. + Default CCID for the sender-receiver half-connection. Depending on the + choice of CCID, the Send Ack Vector feature is enabled automatically. rx_ccid = 2 - Default CCID for the receiver-sender half-connection. + Default CCID for the receiver-sender half-connection; see tx_ccid. seq_window = 100 The initial sequence window (sec. 7.5.2). diff --git a/Documentation/networking/driver.txt b/Documentation/networking/driver.txt index ea72d2e66ca..03283daa64f 100644 --- a/Documentation/networking/driver.txt +++ b/Documentation/networking/driver.txt @@ -13,7 +13,7 @@ Transmit path guidelines: static int drv_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) { - struct drv *dp = dev->priv; + struct drv *dp = netdev_priv(dev); lock_tx(dp); ... diff --git a/Documentation/networking/generic-hdlc.txt b/Documentation/networking/generic-hdlc.txt index 31bc8b759b7..4eb3cc40b70 100644 --- a/Documentation/networking/generic-hdlc.txt +++ b/Documentation/networking/generic-hdlc.txt @@ -3,15 +3,15 @@ Krzysztof Halasa Generic HDLC layer currently supports: -1. Frame Relay (ANSI, CCITT, Cisco and no LMI). +1. Frame Relay (ANSI, CCITT, Cisco and no LMI) - Normal (routed) and Ethernet-bridged (Ethernet device emulation) interfaces can share a single PVC. - ARP support (no InARP support in the kernel - there is an experimental InARP user-space daemon available on: http://www.kernel.org/pub/linux/utils/net/hdlc/). -2. raw HDLC - either IP (IPv4) interface or Ethernet device emulation. -3. Cisco HDLC. -4. PPP (uses syncppp.c). +2. raw HDLC - either IP (IPv4) interface or Ethernet device emulation +3. Cisco HDLC +4. PPP 5. X.25 (uses X.25 routines). Generic HDLC is a protocol driver only - it needs a low-level driver diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index d84932650fd..c7712787933 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -27,6 +27,12 @@ min_adv_mss - INTEGER The advertised MSS depends on the first hop route MTU, but will never be lower than this setting. +rt_cache_rebuild_count - INTEGER + The per net-namespace route cache emergency rebuild threshold. + Any net-namespace having its route cache rebuilt due to + a hash bucket chain being too long more than this many times + will have its route caching disabled + IP Fragmentation: ipfrag_high_thresh - INTEGER diff --git a/Documentation/networking/mac80211_hwsim/README b/Documentation/networking/mac80211_hwsim/README index 2ff8ccb8dc3..24ac91d5669 100644 --- a/Documentation/networking/mac80211_hwsim/README +++ b/Documentation/networking/mac80211_hwsim/README @@ -50,10 +50,6 @@ associates with the AP. hostapd and wpa_supplicant are used to take care of WPA2-PSK authentication. In addition, hostapd is also processing access point side of association. -Please note that the current Linux kernel does not enable AP mode, so a -simple patch is needed to enable AP mode selection: -http://johannes.sipsolutions.net/patches/kernel/all/LATEST/006-allow-ap-vlan-modes.patch - # Build mac80211_hwsim as part of kernel configuration @@ -65,3 +61,8 @@ hostapd hostapd.conf # Run wpa_supplicant (station) for wlan1 wpa_supplicant -Dwext -iwlan1 -c wpa_supplicant.conf + + +More test cases are available in hostap.git: +git://w1.fi/srv/git/hostap.git and mac80211_hwsim/tests subdirectory +(http://w1.fi/gitweb/gitweb.cgi?p=hostap.git;a=tree;f=mac80211_hwsim/tests) diff --git a/Documentation/networking/netdevices.txt b/Documentation/networking/netdevices.txt index d0f71fc7f78..a2ab6a0b116 100644 --- a/Documentation/networking/netdevices.txt +++ b/Documentation/networking/netdevices.txt @@ -18,7 +18,7 @@ There are routines in net_init.c to handle the common cases of alloc_etherdev, alloc_netdev. These reserve extra space for driver private data which gets freed when the network device is freed. If separately allocated data is attached to the network device -(dev->priv) then it is up to the module exit handler to free that. +(netdev_priv(dev)) then it is up to the module exit handler to free that. MTU === diff --git a/Documentation/networking/phy.txt b/Documentation/networking/phy.txt index 8df6a7b0e66..88bb71b46da 100644 --- a/Documentation/networking/phy.txt +++ b/Documentation/networking/phy.txt @@ -96,7 +96,7 @@ Letting the PHY Abstraction Layer do Everything static void adjust_link(struct net_device *dev); Next, you need to know the device name of the PHY connected to this device. - The name will look something like, "phy0:0", where the first number is the + The name will look something like, "0:00", where the first number is the bus id, and the second is the PHY's address on that bus. Typically, the bus is responsible for making its ID unique. diff --git a/Documentation/networking/regulatory.txt b/Documentation/networking/regulatory.txt index a96989a8ff3..dcf31648414 100644 --- a/Documentation/networking/regulatory.txt +++ b/Documentation/networking/regulatory.txt @@ -131,11 +131,13 @@ are expected to do this during initialization. r = zd_reg2alpha2(mac->regdomain, alpha2); if (!r) - regulatory_hint(hw->wiphy, alpha2, NULL); + regulatory_hint(hw->wiphy, alpha2); Example code - drivers providing a built in regulatory domain: -------------------------------------------------------------- +[NOTE: This API is not currently available, it can be added when required] + If you have regulatory information you can obtain from your driver and you *need* to use this we let you build a regulatory domain structure and pass it to the wireless core. To do this you should @@ -167,7 +169,6 @@ struct ieee80211_regdomain mydriver_jp_regdom = { Then in some part of your code after your wiphy has been registered: - int r; struct ieee80211_regdomain *rd; int size_of_regd; int num_rules = mydriver_jp_regdom.n_reg_rules; @@ -178,17 +179,12 @@ Then in some part of your code after your wiphy has been registered: rd = kzalloc(size_of_regd, GFP_KERNEL); if (!rd) - return -ENOMEM; + return -ENOMEM; memcpy(rd, &mydriver_jp_regdom, sizeof(struct ieee80211_regdomain)); - for (i=0; i < num_rules; i++) { - memcpy(&rd->reg_rules[i], &mydriver_jp_regdom.reg_rules[i], - sizeof(struct ieee80211_reg_rule)); - } - r = regulatory_hint(hw->wiphy, NULL, rd); - if (r) { - kfree(rd); - return r; - } - + for (i=0; i < num_rules; i++) + memcpy(&rd->reg_rules[i], + &mydriver_jp_regdom.reg_rules[i], + sizeof(struct ieee80211_reg_rule)); + regulatory_struct_hint(rd); diff --git a/Documentation/networking/rxrpc.txt b/Documentation/networking/rxrpc.txt index c3669a3fb4a..60d05eb77c6 100644 --- a/Documentation/networking/rxrpc.txt +++ b/Documentation/networking/rxrpc.txt @@ -540,7 +540,7 @@ A client would issue an operation by: MSG_MORE should be set in msghdr::msg_flags on all but the last part of the request. Multiple requests may be made simultaneously. - If a call is intended to go to a destination other then the default + If a call is intended to go to a destination other than the default specified through connect(), then msghdr::msg_name should be set on the first request message of that call. diff --git a/Documentation/networking/tuntap.txt b/Documentation/networking/tuntap.txt index 839cbb71388..c0aab985bad 100644 --- a/Documentation/networking/tuntap.txt +++ b/Documentation/networking/tuntap.txt @@ -118,7 +118,7 @@ As mentioned above, main purpose of TUN/TAP driver is tunneling. It is used by VTun (http://vtun.sourceforge.net). Another interesting application using TUN/TAP is pipsecd -(http://perso.enst.fr/~beyssac/pipsec/), an userspace IPSec +(http://perso.enst.fr/~beyssac/pipsec/), a userspace IPSec implementation that can use complete kernel routing (unlike FreeS/WAN). 3. How does Virtual network device actually work ? diff --git a/Documentation/nmi_watchdog.txt b/Documentation/nmi_watchdog.txt index 90aa4531cb6..bf9f80a9828 100644 --- a/Documentation/nmi_watchdog.txt +++ b/Documentation/nmi_watchdog.txt @@ -69,6 +69,11 @@ to the overall system performance. On x86 nmi_watchdog is disabled by default so you have to enable it with a boot time parameter. +It's possible to disable the NMI watchdog in run-time by writing "0" to +/proc/sys/kernel/nmi_watchdog. Writing "1" to the same file will re-enable +the NMI watchdog. Notice that you still need to use "nmi_watchdog=" parameter +at boot time. + NOTE: In kernels prior to 2.4.2-ac18 the NMI-oopser is enabled unconditionally on x86 SMP boxes. diff --git a/Documentation/nommu-mmap.txt b/Documentation/nommu-mmap.txt index 7714f57caad..b565e8279d1 100644 --- a/Documentation/nommu-mmap.txt +++ b/Documentation/nommu-mmap.txt @@ -109,12 +109,18 @@ and it's also much more restricted in the latter case: FURTHER NOTES ON NO-MMU MMAP ============================ - (*) A request for a private mapping of less than a page in size may not return - a page-aligned buffer. This is because the kernel calls kmalloc() to - allocate the buffer, not get_free_page(). + (*) A request for a private mapping of a file may return a buffer that is not + page-aligned. This is because XIP may take place, and the data may not be + paged aligned in the backing store. - (*) A list of all the mappings on the system is visible through /proc/maps in - no-MMU mode. + (*) A request for an anonymous mapping will always be page aligned. If + possible the size of the request should be a power of two otherwise some + of the space may be wasted as the kernel must allocate a power-of-2 + granule but will only discard the excess if appropriately configured as + this has an effect on fragmentation. + + (*) A list of all the private copy and anonymous mappings on the system is + visible through /proc/maps in no-MMU mode. (*) A list of all the mappings in use by a process is visible through /proc//maps in no-MMU mode. @@ -242,3 +248,18 @@ PROVIDING SHAREABLE BLOCK DEVICE SUPPORT Provision of shared mappings on block device files is exactly the same as for character devices. If there isn't a real device underneath, then the driver should allocate sufficient contiguous memory to honour any supported mapping. + + +================================= +ADJUSTING PAGE TRIMMING BEHAVIOUR +================================= + +NOMMU mmap automatically rounds up to the nearest power-of-2 number of pages +when performing an allocation. This can have adverse effects on memory +fragmentation, and as such, is left configurable. The default behaviour is to +aggressively trim allocations and discard any excess pages back in to the page +allocator. In order to retain finer-grained control over fragmentation, this +behaviour can either be disabled completely, or bumped up to a higher page +watermark where trimming begins. + +Page trimming behaviour is configurable via the sysctl `vm.nr_trim_pages'. diff --git a/Documentation/paride.txt b/Documentation/paride.txt deleted file mode 100644 index e4312676bdd..00000000000 --- a/Documentation/paride.txt +++ /dev/null @@ -1,417 +0,0 @@ - - Linux and parallel port IDE devices - -PARIDE v1.03 (c) 1997-8 Grant Guenther - -1. Introduction - -Owing to the simplicity and near universality of the parallel port interface -to personal computers, many external devices such as portable hard-disk, -CD-ROM, LS-120 and tape drives use the parallel port to connect to their -host computer. While some devices (notably scanners) use ad-hoc methods -to pass commands and data through the parallel port interface, most -external devices are actually identical to an internal model, but with -a parallel-port adapter chip added in. Some of the original parallel port -adapters were little more than mechanisms for multiplexing a SCSI bus. -(The Iomega PPA-3 adapter used in the ZIP drives is an example of this -approach). Most current designs, however, take a different approach. -The adapter chip reproduces a small ISA or IDE bus in the external device -and the communication protocol provides operations for reading and writing -device registers, as well as data block transfer functions. Sometimes, -the device being addressed via the parallel cable is a standard SCSI -controller like an NCR 5380. The "ditto" family of external tape -drives use the ISA replicator to interface a floppy disk controller, -which is then connected to a floppy-tape mechanism. The vast majority -of external parallel port devices, however, are now based on standard -IDE type devices, which require no intermediate controller. If one -were to open up a parallel port CD-ROM drive, for instance, one would -find a standard ATAPI CD-ROM drive, a power supply, and a single adapter -that interconnected a standard PC parallel port cable and a standard -IDE cable. It is usually possible to exchange the CD-ROM device with -any other device using the IDE interface. - -The document describes the support in Linux for parallel port IDE -devices. It does not cover parallel port SCSI devices, "ditto" tape -drives or scanners. Many different devices are supported by the -parallel port IDE subsystem, including: - - MicroSolutions backpack CD-ROM - MicroSolutions backpack PD/CD - MicroSolutions backpack hard-drives - MicroSolutions backpack 8000t tape drive - SyQuest EZ-135, EZ-230 & SparQ drives - Avatar Shark - Imation Superdisk LS-120 - Maxell Superdisk LS-120 - FreeCom Power CD - Hewlett-Packard 5GB and 8GB tape drives - Hewlett-Packard 7100 and 7200 CD-RW drives - -as well as most of the clone and no-name products on the market. - -To support such a wide range of devices, PARIDE, the parallel port IDE -subsystem, is actually structured in three parts. There is a base -paride module which provides a registry and some common methods for -accessing the parallel ports. The second component is a set of -high-level drivers for each of the different types of supported devices: - - pd IDE disk - pcd ATAPI CD-ROM - pf ATAPI disk - pt ATAPI tape - pg ATAPI generic - -(Currently, the pg driver is only used with CD-R drives). - -The high-level drivers function according to the relevant standards. -The third component of PARIDE is a set of low-level protocol drivers -for each of the parallel port IDE adapter chips. Thanks to the interest -and encouragement of Linux users from many parts of the world, -support is available for almost all known adapter protocols: - - aten ATEN EH-100 (HK) - bpck Microsolutions backpack (US) - comm DataStor (old-type) "commuter" adapter (TW) - dstr DataStor EP-2000 (TW) - epat Shuttle EPAT (UK) - epia Shuttle EPIA (UK) - fit2 FIT TD-2000 (US) - fit3 FIT TD-3000 (US) - friq Freecom IQ cable (DE) - frpw Freecom Power (DE) - kbic KingByte KBIC-951A and KBIC-971A (TW) - ktti KT Technology PHd adapter (SG) - on20 OnSpec 90c20 (US) - on26 OnSpec 90c26 (US) - - -2. Using the PARIDE subsystem - -While configuring the Linux kernel, you may choose either to build -the PARIDE drivers into your kernel, or to build them as modules. - -In either case, you will need to select "Parallel port IDE device support" -as well as at least one of the high-level drivers and at least one -of the parallel port communication protocols. If you do not know -what kind of parallel port adapter is used in your drive, you could -begin by checking the file names and any text files on your DOS -installation floppy. Alternatively, you can look at the markings on -the adapter chip itself. That's usually sufficient to identify the -correct device. - -You can actually select all the protocol modules, and allow the PARIDE -subsystem to try them all for you. - -For the "brand-name" products listed above, here are the protocol -and high-level drivers that you would use: - - Manufacturer Model Driver Protocol - - MicroSolutions CD-ROM pcd bpck - MicroSolutions PD drive pf bpck - MicroSolutions hard-drive pd bpck - MicroSolutions 8000t tape pt bpck - SyQuest EZ, SparQ pd epat - Imation Superdisk pf epat - Maxell Superdisk pf friq - Avatar Shark pd epat - FreeCom CD-ROM pcd frpw - Hewlett-Packard 5GB Tape pt epat - Hewlett-Packard 7200e (CD) pcd epat - Hewlett-Packard 7200e (CD-R) pg epat - -2.1 Configuring built-in drivers - -We recommend that you get to know how the drivers work and how to -configure them as loadable modules, before attempting to compile a -kernel with the drivers built-in. - -If you built all of your PARIDE support directly into your kernel, -and you have just a single parallel port IDE device, your kernel should -locate it automatically for you. If you have more than one device, -you may need to give some command line options to your bootloader -(eg: LILO), how to do that is beyond the scope of this document. - -The high-level drivers accept a number of command line parameters, all -of which are documented in the source files in linux/drivers/block/paride. -By default, each driver will automatically try all parallel ports it -can find, and all protocol types that have been installed, until it finds -a parallel port IDE adapter. Once it finds one, the probe stops. So, -if you have more than one device, you will need to tell the drivers -how to identify them. This requires specifying the port address, the -protocol identification number and, for some devices, the drive's -chain ID. While your system is booting, a number of messages are -displayed on the console. Like all such messages, they can be -reviewed with the 'dmesg' command. Among those messages will be -some lines like: - - paride: bpck registered as protocol 0 - paride: epat registered as protocol 1 - -The numbers will always be the same until you build a new kernel with -different protocol selections. You should note these numbers as you -will need them to identify the devices. - -If you happen to be using a MicroSolutions backpack device, you will -also need to know the unit ID number for each drive. This is usually -the last two digits of the drive's serial number (but read MicroSolutions' -documentation about this). - -As an example, let's assume that you have a MicroSolutions PD/CD drive -with unit ID number 36 connected to the parallel port at 0x378, a SyQuest -EZ-135 connected to the chained port on the PD/CD drive and also an -Imation Superdisk connected to port 0x278. You could give the following -options on your boot command: - - pd.drive0=0x378,1 pf.drive0=0x278,1 pf.drive1=0x378,0,36 - -In the last option, pf.drive1 configures device /dev/pf1, the 0x378 -is the parallel port base address, the 0 is the protocol registration -number and 36 is the chain ID. - -Please note: while PARIDE will work both with and without the -PARPORT parallel port sharing system that is included by the -"Parallel port support" option, PARPORT must be included and enabled -if you want to use chains of devices on the same parallel port. - -2.2 Loading and configuring PARIDE as modules - -It is much faster and simpler to get to understand the PARIDE drivers -if you use them as loadable kernel modules. - -Note 1: using these drivers with the "kerneld" automatic module loading -system is not recommended for beginners, and is not documented here. - -Note 2: if you build PARPORT support as a loadable module, PARIDE must -also be built as loadable modules, and PARPORT must be loaded before the -PARIDE modules. - -To use PARIDE, you must begin by - - insmod paride - -this loads a base module which provides a registry for the protocols, -among other tasks. - -Then, load as many of the protocol modules as you think you might need. -As you load each module, it will register the protocols that it supports, -and print a log message to your kernel log file and your console. For -example: - - # insmod epat - paride: epat registered as protocol 0 - # insmod kbic - paride: k951 registered as protocol 1 - paride: k971 registered as protocol 2 - -Finally, you can load high-level drivers for each kind of device that -you have connected. By default, each driver will autoprobe for a single -device, but you can support up to four similar devices by giving their -individual co-ordinates when you load the driver. - -For example, if you had two no-name CD-ROM drives both using the -KingByte KBIC-951A adapter, one on port 0x378 and the other on 0x3bc -you could give the following command: - - # insmod pcd drive0=0x378,1 drive1=0x3bc,1 - -For most adapters, giving a port address and protocol number is sufficient, -but check the source files in linux/drivers/block/paride for more -information. (Hopefully someone will write some man pages one day !). - -As another example, here's what happens when PARPORT is installed, and -a SyQuest EZ-135 is attached to port 0x378: - - # insmod paride - paride: version 1.0 installed - # insmod epat - paride: epat registered as protocol 0 - # insmod pd - pd: pd version 1.0, major 45, cluster 64, nice 0 - pda: Sharing parport1 at 0x378 - pda: epat 1.0, Shuttle EPAT chip c3 at 0x378, mode 5 (EPP-32), delay 1 - pda: SyQuest EZ135A, 262144 blocks [128M], (512/16/32), removable media - pda: pda1 - -Note that the last line is the output from the generic partition table -scanner - in this case it reports that it has found a disk with one partition. - -2.3 Using a PARIDE device - -Once the drivers have been loaded, you can access PARIDE devices in the -same way as their traditional counterparts. You will probably need to -create the device "special files". Here is a simple script that you can -cut to a file and execute: - -#!/bin/bash -# -# mkd -- a script to create the device special files for the PARIDE subsystem -# -function mkdev { - mknod $1 $2 $3 $4 ; chmod 0660 $1 ; chown root:disk $1 -} -# -function pd { - D=$( printf \\$( printf "x%03x" $[ $1 + 97 ] ) ) - mkdev pd$D b 45 $[ $1 * 16 ] - for P in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 - do mkdev pd$D$P b 45 $[ $1 * 16 + $P ] - done -} -# -cd /dev -# -for u in 0 1 2 3 ; do pd $u ; done -for u in 0 1 2 3 ; do mkdev pcd$u b 46 $u ; done -for u in 0 1 2 3 ; do mkdev pf$u b 47 $u ; done -for u in 0 1 2 3 ; do mkdev pt$u c 96 $u ; done -for u in 0 1 2 3 ; do mkdev npt$u c 96 $[ $u + 128 ] ; done -for u in 0 1 2 3 ; do mkdev pg$u c 97 $u ; done -# -# end of mkd - -With the device files and drivers in place, you can access PARIDE devices -like any other Linux device. For example, to mount a CD-ROM in pcd0, use: - - mount /dev/pcd0 /cdrom - -If you have a fresh Avatar Shark cartridge, and the drive is pda, you -might do something like: - - fdisk /dev/pda -- make a new partition table with - partition 1 of type 83 - - mke2fs /dev/pda1 -- to build the file system - - mkdir /shark -- make a place to mount the disk - - mount /dev/pda1 /shark - -Devices like the Imation superdisk work in the same way, except that -they do not have a partition table. For example to make a 120MB -floppy that you could share with a DOS system: - - mkdosfs /dev/pf0 - mount /dev/pf0 /mnt - - -2.4 The pf driver - -The pf driver is intended for use with parallel port ATAPI disk -devices. The most common devices in this category are PD drives -and LS-120 drives. Traditionally, media for these devices are not -partitioned. Consequently, the pf driver does not support partitioned -media. This may be changed in a future version of the driver. - -2.5 Using the pt driver - -The pt driver for parallel port ATAPI tape drives is a minimal driver. -It does not yet support many of the standard tape ioctl operations. -For best performance, a block size of 32KB should be used. You will -probably want to set the parallel port delay to 0, if you can. - -2.6 Using the pg driver - -The pg driver can be used in conjunction with the cdrecord program -to create CD-ROMs. Please get cdrecord version 1.6.1 or later -from ftp://ftp.fokus.gmd.de/pub/unix/cdrecord/ . To record CD-R media -your parallel port should ideally be set to EPP mode, and the "port delay" -should be set to 0. With those settings it is possible to record at 2x -speed without any buffer underruns. If you cannot get the driver to work -in EPP mode, try to use "bidirectional" or "PS/2" mode and 1x speeds only. - - -3. Troubleshooting - -3.1 Use EPP mode if you can - -The most common problems that people report with the PARIDE drivers -concern the parallel port CMOS settings. At this time, none of the -PARIDE protocol modules support ECP mode, or any ECP combination modes. -If you are able to do so, please set your parallel port into EPP mode -using your CMOS setup procedure. - -3.2 Check the port delay - -Some parallel ports cannot reliably transfer data at full speed. To -offset the errors, the PARIDE protocol modules introduce a "port -delay" between each access to the i/o ports. Each protocol sets -a default value for this delay. In most cases, the user can override -the default and set it to 0 - resulting in somewhat higher transfer -rates. In some rare cases (especially with older 486 systems) the -default delays are not long enough. if you experience corrupt data -transfers, or unexpected failures, you may wish to increase the -port delay. The delay can be programmed using the "driveN" parameters -to each of the high-level drivers. Please see the notes above, or -read the comments at the beginning of the driver source files in -linux/drivers/block/paride. - -3.3 Some drives need a printer reset - -There appear to be a number of "noname" external drives on the market -that do not always power up correctly. We have noticed this with some -drives based on OnSpec and older Freecom adapters. In these rare cases, -the adapter can often be reinitialised by issuing a "printer reset" on -the parallel port. As the reset operation is potentially disruptive in -multiple device environments, the PARIDE drivers will not do it -automatically. You can however, force a printer reset by doing: - - insmod lp reset=1 - rmmod lp - -If you have one of these marginal cases, you should probably build -your paride drivers as modules, and arrange to do the printer reset -before loading the PARIDE drivers. - -3.4 Use the verbose option and dmesg if you need help - -While a lot of testing has gone into these drivers to make them work -as smoothly as possible, problems will arise. If you do have problems, -please check all the obvious things first: does the drive work in -DOS with the manufacturer's drivers ? If that doesn't yield any useful -clues, then please make sure that only one drive is hooked to your system, -and that either (a) PARPORT is enabled or (b) no other device driver -is using your parallel port (check in /proc/ioports). Then, load the -appropriate drivers (you can load several protocol modules if you want) -as in: - - # insmod paride - # insmod epat - # insmod bpck - # insmod kbic - ... - # insmod pd verbose=1 - -(using the correct driver for the type of device you have, of course). -The verbose=1 parameter will cause the drivers to log a trace of their -activity as they attempt to locate your drive. - -Use 'dmesg' to capture a log of all the PARIDE messages (any messages -beginning with paride:, a protocol module's name or a driver's name) and -include that with your bug report. You can submit a bug report in one -of two ways. Either send it directly to the author of the PARIDE suite, -by e-mail to grant@torque.net, or join the linux-parport mailing list -and post your report there. - -3.5 For more information or help - -You can join the linux-parport mailing list by sending a mail message -to - linux-parport-request@torque.net - -with the single word - - subscribe - -in the body of the mail message (not in the subject line). Please be -sure that your mail program is correctly set up when you do this, as -the list manager is a robot that will subscribe you using the reply -address in your mail headers. REMOVE any anti-spam gimmicks you may -have in your mail headers, when sending mail to the list server. - -You might also find some useful information on the linux-parport -web pages (although they are not always up to date) at - - http://www.torque.net/parport/ - - diff --git a/Documentation/powerpc/booting-without-of.txt b/Documentation/powerpc/booting-without-of.txt index 02ea9a971b8..0ab0230cbcb 100644 --- a/Documentation/powerpc/booting-without-of.txt +++ b/Documentation/powerpc/booting-without-of.txt @@ -41,25 +41,14 @@ Table of Contents VI - System-on-a-chip devices and nodes 1) Defining child nodes of an SOC 2) Representing devices without a current OF specification - a) MDIO IO device - b) Gianfar-compatible ethernet nodes - c) PHY nodes - d) Interrupt controllers - e) I2C - f) Freescale SOC USB controllers - g) Freescale SOC SEC Security Engines - h) Board Control and Status (BCSR) - i) Freescale QUICC Engine module (QE) - j) CFI or JEDEC memory-mapped NOR flash - k) Global Utilities Block - l) Freescale Communications Processor Module - m) Chipselect/Local Bus - n) 4xx/Axon EMAC ethernet nodes - o) Xilinx IP cores - p) Freescale Synchronous Serial Interface - q) USB EHCI controllers - r) MDIO on GPIOs - s) SPI busses + a) PHY nodes + b) Interrupt controllers + c) CFI or JEDEC memory-mapped NOR flash + d) 4xx/Axon EMAC ethernet nodes + e) Xilinx IP cores + f) USB EHCI controllers + g) MDIO on GPIOs + h) SPI busses VII - Marvell Discovery mv64[345]6x System Controller chips 1) The /system-controller node @@ -1830,41 +1819,7 @@ platforms are moved over to use the flattened-device-tree model. big-endian; }; - r) Freescale Display Interface Unit - - The Freescale DIU is a LCD controller, with proper hardware, it can also - drive DVI monitors. - - Required properties: - - compatible : should be "fsl-diu". - - reg : should contain at least address and length of the DIU register - set. - - Interrupts : one DIU interrupt should be describe here. - - Example (MPC8610HPCD) - display@2c000 { - compatible = "fsl,diu"; - reg = <0x2c000 100>; - interrupts = <72 2>; - interrupt-parent = <&mpic>; - }; - - s) Freescale on board FPGA - - This is the memory-mapped registers for on board FPGA. - - Required properities: - - compatible : should be "fsl,fpga-pixis". - - reg : should contain the address and the lenght of the FPPGA register - set. - - Example (MPC8610HPCD) - board-control@e8000000 { - compatible = "fsl,fpga-pixis"; - reg = <0xe8000000 32>; - }; - - r) MDIO on GPIOs + g) MDIO on GPIOs Currently defined compatibles: - virtual,gpio-mdio @@ -1884,7 +1839,7 @@ platforms are moved over to use the flattened-device-tree model. &qe_pio_c 6>; }; - s) SPI (Serial Peripheral Interface) busses + h) SPI (Serial Peripheral Interface) busses SPI busses can be described with a node for the SPI master device and a set of child nodes for each SPI slave on the bus. For this diff --git a/Documentation/powerpc/cpu_features.txt b/Documentation/powerpc/cpu_features.txt index 472739880e8..ffa4183fdb8 100644 --- a/Documentation/powerpc/cpu_features.txt +++ b/Documentation/powerpc/cpu_features.txt @@ -31,7 +31,7 @@ anyways). After detecting the processor type, the kernel patches out sections of code that shouldn't be used by writing nop's over it. Using cpufeatures requires -just 2 macros (found in include/asm-ppc/cputable.h), as seen in head.S +just 2 macros (found in arch/powerpc/include/asm/cputable.h), as seen in head.S transfer_to_handler: #ifdef CONFIG_ALTIVEC diff --git a/Documentation/powerpc/dts-bindings/4xx/ndfc.txt b/Documentation/powerpc/dts-bindings/4xx/ndfc.txt new file mode 100644 index 00000000000..869f0b5f16e --- /dev/null +++ b/Documentation/powerpc/dts-bindings/4xx/ndfc.txt @@ -0,0 +1,39 @@ +AMCC NDFC (NanD Flash Controller) + +Required properties: +- compatible : "ibm,ndfc". +- reg : should specify chip select and size used for the chip (0x2000). + +Optional properties: +- ccr : NDFC config and control register value (default 0). +- bank-settings : NDFC bank configuration register value (default 0). + +Notes: +- partition(s) - follows the OF MTD standard for partitions + +Example: + +ndfc@1,0 { + compatible = "ibm,ndfc"; + reg = <0x00000001 0x00000000 0x00002000>; + ccr = <0x00001000>; + bank-settings = <0x80002222>; + #address-cells = <1>; + #size-cells = <1>; + + nand { + #address-cells = <1>; + #size-cells = <1>; + + partition@0 { + label = "kernel"; + reg = <0x00000000 0x00200000>; + }; + partition@200000 { + label = "root"; + reg = <0x00200000 0x03E00000>; + }; + }; +}; + + diff --git a/Documentation/powerpc/dts-bindings/fsl/board.txt b/Documentation/powerpc/dts-bindings/fsl/board.txt index 81a917ef96e..6c974d28eeb 100644 --- a/Documentation/powerpc/dts-bindings/fsl/board.txt +++ b/Documentation/powerpc/dts-bindings/fsl/board.txt @@ -18,7 +18,7 @@ This is the memory-mapped registers for on board FPGA. Required properities: - compatible : should be "fsl,fpga-pixis". -- reg : should contain the address and the lenght of the FPPGA register +- reg : should contain the address and the length of the FPPGA register set. Example (MPC8610HPCD): @@ -27,3 +27,33 @@ Example (MPC8610HPCD): compatible = "fsl,fpga-pixis"; reg = <0xe8000000 32>; }; + +* Freescale BCSR GPIO banks + +Some BCSR registers act as simple GPIO controllers, each such +register can be represented by the gpio-controller node. + +Required properities: +- compatible : Should be "fsl,-bcsr-gpio". +- reg : Should contain the address and the length of the GPIO bank + register. +- #gpio-cells : Should be two. The first cell is the pin number and the + second cell is used to specify optional paramters (currently unused). +- gpio-controller : Marks the port as GPIO controller. + +Example: + + bcsr@1,0 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "fsl,mpc8360mds-bcsr"; + reg = <1 0 0x8000>; + ranges = <0 1 0 0x8000>; + + bcsr13: gpio-controller@d { + #gpio-cells = <2>; + compatible = "fsl,mpc8360mds-bcsr-gpio"; + reg = <0xd 1>; + gpio-controller; + }; + }; diff --git a/Documentation/powerpc/dts-bindings/fsl/tsec.txt b/Documentation/powerpc/dts-bindings/fsl/tsec.txt index cf55fa4112d..7fa4b27574b 100644 --- a/Documentation/powerpc/dts-bindings/fsl/tsec.txt +++ b/Documentation/powerpc/dts-bindings/fsl/tsec.txt @@ -2,8 +2,8 @@ The MDIO is a bus to which the PHY devices are connected. For each device that exists on this bus, a child node should be created. See -the definition of the PHY node below for an example of how to define -a PHY. +the definition of the PHY node in booting-without-of.txt for an example +of how to define a PHY. Required properties: - reg : Offset and length of the register set for the device @@ -21,6 +21,14 @@ Example: }; }; +* TBI Internal MDIO bus + +As of this writing, every tsec is associated with an internal TBI PHY. +This PHY is accessed through the local MDIO bus. These buses are defined +similarly to the mdio buses, except they are compatible with "fsl,gianfar-tbi". +The TBI PHYs underneath them are similar to normal PHYs, but the reg property +is considered instructive, rather than descriptive. The reg property should +be chosen so it doesn't interfere with other PHYs on the bus. * Gianfar-compatible ethernet nodes diff --git a/Documentation/printk-formats.txt b/Documentation/printk-formats.txt new file mode 100644 index 00000000000..1b5a5ddbc3e --- /dev/null +++ b/Documentation/printk-formats.txt @@ -0,0 +1,35 @@ +If variable is of Type, use printk format specifier: +--------------------------------------------------------- + int %d or %x + unsigned int %u or %x + long %ld or %lx + unsigned long %lu or %lx + long long %lld or %llx + unsigned long long %llu or %llx + size_t %zu or %zx + ssize_t %zd or %zx + +Raw pointer value SHOULD be printed with %p. + +u64 SHOULD be printed with %llu/%llx, (unsigned long long): + + printk("%llu", (unsigned long long)u64_var); + +s64 SHOULD be printed with %lld/%llx, (long long): + + printk("%lld", (long long)s64_var); + +If is dependent on a config option for its size (e.g., sector_t, +blkcnt_t, phys_addr_t, resource_size_t) or is architecture-dependent +for its size (e.g., tcflag_t), use a format specifier of its largest +possible type and explicitly cast to it. Example: + + printk("test: sector number/total blocks: %llu/%llu\n", + (unsigned long long)sector, (unsigned long long)blockcount); + +Reminder: sizeof() result is of type size_t. + +Thank you for your cooperation and attention. + + +By Randy Dunlap diff --git a/Documentation/ramdisk.txt b/Documentation/ramdisk.txt deleted file mode 100644 index 6c820baa19a..00000000000 --- a/Documentation/ramdisk.txt +++ /dev/null @@ -1,165 +0,0 @@ -Using the RAM disk block device with Linux ------------------------------------------- - -Contents: - - 1) Overview - 2) Kernel Command Line Parameters - 3) Using "rdev -r" - 4) An Example of Creating a Compressed RAM Disk - - -1) Overview ------------ - -The RAM disk driver is a way to use main system memory as a block device. It -is required for initrd, an initial filesystem used if you need to load modules -in order to access the root filesystem (see Documentation/initrd.txt). It can -also be used for a temporary filesystem for crypto work, since the contents -are erased on reboot. - -The RAM disk dynamically grows as more space is required. It does this by using -RAM from the buffer cache. The driver marks the buffers it is using as dirty -so that the VM subsystem does not try to reclaim them later. - -The RAM disk supports up to 16 RAM disks by default, and can be reconfigured -to support an unlimited number of RAM disks (at your own risk). Just change -the configuration symbol BLK_DEV_RAM_COUNT in the Block drivers config menu -and (re)build the kernel. - -To use RAM disk support with your system, run './MAKEDEV ram' from the /dev -directory. RAM disks are all major number 1, and start with minor number 0 -for /dev/ram0, etc. If used, modern kernels use /dev/ram0 for an initrd. - -The new RAM disk also has the ability to load compressed RAM disk images, -allowing one to squeeze more programs onto an average installation or -rescue floppy disk. - - -2) Kernel Command Line Parameters ---------------------------------- - - ramdisk_size=N - ============== - -This parameter tells the RAM disk driver to set up RAM disks of N k size. The -default is 4096 (4 MB) (8192 (8 MB) on S390). - - ramdisk_blocksize=N - =================== - -This parameter tells the RAM disk driver how many bytes to use per block. The -default is 1024 (BLOCK_SIZE). - - -3) Using "rdev -r" ------------------- - -The usage of the word (two bytes) that "rdev -r" sets in the kernel image is -as follows. The low 11 bits (0 -> 10) specify an offset (in 1 k blocks) of up -to 2 MB (2^11) of where to find the RAM disk (this used to be the size). Bit -14 indicates that a RAM disk is to be loaded, and bit 15 indicates whether a -prompt/wait sequence is to be given before trying to read the RAM disk. Since -the RAM disk dynamically grows as data is being written into it, a size field -is not required. Bits 11 to 13 are not currently used and may as well be zero. -These numbers are no magical secrets, as seen below: - -./arch/i386/kernel/setup.c:#define RAMDISK_IMAGE_START_MASK 0x07FF -./arch/i386/kernel/setup.c:#define RAMDISK_PROMPT_FLAG 0x8000 -./arch/i386/kernel/setup.c:#define RAMDISK_LOAD_FLAG 0x4000 - -Consider a typical two floppy disk setup, where you will have the -kernel on disk one, and have already put a RAM disk image onto disk #2. - -Hence you want to set bits 0 to 13 as 0, meaning that your RAM disk -starts at an offset of 0 kB from the beginning of the floppy. -The command line equivalent is: "ramdisk_start=0" - -You want bit 14 as one, indicating that a RAM disk is to be loaded. -The command line equivalent is: "load_ramdisk=1" - -You want bit 15 as one, indicating that you want a prompt/keypress -sequence so that you have a chance to switch floppy disks. -The command line equivalent is: "prompt_ramdisk=1" - -Putting that together gives 2^15 + 2^14 + 0 = 49152 for an rdev word. -So to create disk one of the set, you would do: - - /usr/src/linux# cat arch/i386/boot/zImage > /dev/fd0 - /usr/src/linux# rdev /dev/fd0 /dev/fd0 - /usr/src/linux# rdev -r /dev/fd0 49152 - -If you make a boot disk that has LILO, then for the above, you would use: - append = "ramdisk_start=0 load_ramdisk=1 prompt_ramdisk=1" -Since the default start = 0 and the default prompt = 1, you could use: - append = "load_ramdisk=1" - - -4) An Example of Creating a Compressed RAM Disk ----------------------------------------------- - -To create a RAM disk image, you will need a spare block device to -construct it on. This can be the RAM disk device itself, or an -unused disk partition (such as an unmounted swap partition). For this -example, we will use the RAM disk device, "/dev/ram0". - -Note: This technique should not be done on a machine with less than 8 MB -of RAM. If using a spare disk partition instead of /dev/ram0, then this -restriction does not apply. - -a) Decide on the RAM disk size that you want. Say 2 MB for this example. - Create it by writing to the RAM disk device. (This step is not currently - required, but may be in the future.) It is wise to zero out the - area (esp. for disks) so that maximal compression is achieved for - the unused blocks of the image that you are about to create. - - dd if=/dev/zero of=/dev/ram0 bs=1k count=2048 - -b) Make a filesystem on it. Say ext2fs for this example. - - mke2fs -vm0 /dev/ram0 2048 - -c) Mount it, copy the files you want to it (eg: /etc/* /dev/* ...) - and unmount it again. - -d) Compress the contents of the RAM disk. The level of compression - will be approximately 50% of the space used by the files. Unused - space on the RAM disk will compress to almost nothing. - - dd if=/dev/ram0 bs=1k count=2048 | gzip -v9 > /tmp/ram_image.gz - -e) Put the kernel onto the floppy - - dd if=zImage of=/dev/fd0 bs=1k - -f) Put the RAM disk image onto the floppy, after the kernel. Use an offset - that is slightly larger than the kernel, so that you can put another - (possibly larger) kernel onto the same floppy later without overlapping - the RAM disk image. An offset of 400 kB for kernels about 350 kB in - size would be reasonable. Make sure offset+size of ram_image.gz is - not larger than the total space on your floppy (usually 1440 kB). - - dd if=/tmp/ram_image.gz of=/dev/fd0 bs=1k seek=400 - -g) Use "rdev" to set the boot device, RAM disk offset, prompt flag, etc. - For prompt_ramdisk=1, load_ramdisk=1, ramdisk_start=400, one would - have 2^15 + 2^14 + 400 = 49552. - - rdev /dev/fd0 /dev/fd0 - rdev -r /dev/fd0 49552 - -That is it. You now have your boot/root compressed RAM disk floppy. Some -users may wish to combine steps (d) and (f) by using a pipe. - --------------------------------------------------------------------------- - Paul Gortmaker 12/95 - -Changelog: ----------- - -10-22-04 : Updated to reflect changes in command line options, remove - obsolete references, general cleanup. - James Nelson (james4765@gmail.com) - - -12-95 : Original Document diff --git a/Documentation/rfkill.txt b/Documentation/rfkill.txt index b65f0799df4..4d3ee317a4a 100644 --- a/Documentation/rfkill.txt +++ b/Documentation/rfkill.txt @@ -191,12 +191,20 @@ Userspace input handlers (uevents) or kernel input handlers (rfkill-input): to tell the devices registered with the rfkill class to change their state (i.e. translates the input layer event into real action). + * rfkill-input implements EPO by handling EV_SW SW_RFKILL_ALL 0 (power off all transmitters) in a special way: it ignores any overrides and local state cache and forces all transmitters to the RFKILL_STATE_SOFT_BLOCKED state (including those which are already - supposed to be BLOCKED). Note that the opposite event (power on all - transmitters) is handled normally. + supposed to be BLOCKED). + * rfkill EPO will remain active until rfkill-input receives an + EV_SW SW_RFKILL_ALL 1 event. While the EPO is active, transmitters + are locked in the blocked state (rfkill will refuse to unblock them). + * rfkill-input implements different policies that the user can + select for handling EV_SW SW_RFKILL_ALL 1. It will unlock rfkill, + and either do nothing (leave transmitters blocked, but now unlocked), + restore the transmitters to their state before the EPO, or unblock + them all. Userspace uevent handler or kernel platform-specific drivers hooked to the rfkill notifier chain: @@ -331,11 +339,9 @@ class to get a sysfs interface :-) correct event for your switch/button. These events are emergency power-off events when they are trying to turn the transmitters off. An example of an input device which SHOULD generate *_RFKILL_ALL events is the wireless-kill -switch in a laptop which is NOT a hotkey, but a real switch that kills radios -in hardware, even if the O.S. has gone to lunch. An example of an input device -which SHOULD NOT generate *_RFKILL_ALL events by default, is any sort of hot -key that does nothing by itself, as well as any hot key that is type-specific -(e.g. the one for WLAN). +switch in a laptop which is NOT a hotkey, but a real sliding/rocker switch. +An example of an input device which SHOULD NOT generate *_RFKILL_ALL events by +default, is any sort of hot key that is type-specific (e.g. the one for WLAN). 3.1 Guidelines for wireless device drivers diff --git a/Documentation/riscom8.txt b/Documentation/riscom8.txt deleted file mode 100644 index 14f61fdad7c..00000000000 --- a/Documentation/riscom8.txt +++ /dev/null @@ -1,36 +0,0 @@ -* NOTE - this is an unmaintained driver. The original author cannot be located. - -SDL Communications is now SBS Technologies, and does not have any -information on these ancient ISA cards on their website. - -James Nelson - 12-12-2004 - - This is the README for RISCom/8 multi-port serial driver - (C) 1994-1996 D.Gorodchanin - See file LICENSE for terms and conditions. - -NOTE: English is not my native language. - I'm sorry for any mistakes in this text. - -Misc. notes for RISCom/8 serial driver, in no particular order :) - -1) This driver can support up to 4 boards at time. - Use string "riscom8=0xXXX,0xXXX,0xXXX,0xXXX" at LILO prompt, for - setting I/O base addresses for boards. If you compile driver - as module use modprobe options "iobase=0xXXX iobase1=0xXXX iobase2=..." - -2) The driver partially supports famous 'setserial' program, you can use almost - any of its options, excluding port & irq settings. - -3) There are some misc. defines at the beginning of riscom8.c, please read the - comments and try to change some of them in case of problems. - -4) I consider the current state of the driver as BETA. - -5) SDL Communications WWW page is http://www.sdlcomm.com. - -6) You can use the MAKEDEV program to create RISCom/8 /dev/ttyL* entries. - -7) Minor numbers for first board are 0-7, for second 8-15, etc. - -22 Apr 1996. diff --git a/Documentation/rocket.txt b/Documentation/rocket.txt deleted file mode 100644 index 1d858299043..00000000000 --- a/Documentation/rocket.txt +++ /dev/null @@ -1,189 +0,0 @@ -Comtrol(tm) RocketPort(R)/RocketModem(TM) Series -Device Driver for the Linux Operating System - -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- - -PRODUCT OVERVIEW ----------------- - -This driver provides a loadable kernel driver for the Comtrol RocketPort -and RocketModem PCI boards. These boards provide, 2, 4, 8, 16, or 32 -high-speed serial ports or modems. This driver supports up to a combination -of four RocketPort or RocketModems boards in one machine simultaneously. -This file assumes that you are using the RocketPort driver which is -integrated into the kernel sources. - -The driver can also be installed as an external module using the usual -"make;make install" routine. This external module driver, obtainable -from the Comtrol website listed below, is useful for updating the driver -or installing it into kernels which do not have the driver configured -into them. Installations instructions for the external module -are in the included README and HW_INSTALL files. - -RocketPort ISA and RocketModem II PCI boards currently are only supported by -this driver in module form. - -The RocketPort ISA board requires I/O ports to be configured by the DIP -switches on the board. See the section "ISA Rocketport Boards" below for -information on how to set the DIP switches. - -You pass the I/O port to the driver using the following module parameters: - -board1 : I/O port for the first ISA board -board2 : I/O port for the second ISA board -board3 : I/O port for the third ISA board -board4 : I/O port for the fourth ISA board - -There is a set of utilities and scripts provided with the external driver -( downloadable from http://www.comtrol.com ) that ease the configuration and -setup of the ISA cards. - -The RocketModem II PCI boards require firmware to be loaded into the card -before it will function. The driver has only been tested as a module for this -board. - -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- - -INSTALLATION PROCEDURES ------------------------ - -RocketPort/RocketModem PCI cards require no driver configuration, they are -automatically detected and configured. - -The RocketPort driver can be installed as a module (recommended) or built -into the kernel. This is selected, as for other drivers, through the `make config` -command from the root of the Linux source tree during the kernel build process. - -The RocketPort/RocketModem serial ports installed by this driver are assigned -device major number 46, and will be named /dev/ttyRx, where x is the port number -starting at zero (ex. /dev/ttyR0, /devttyR1, ...). If you have multiple cards -installed in the system, the mapping of port names to serial ports is displayed -in the system log at /var/log/messages. - -If installed as a module, the module must be loaded. This can be done -manually by entering "modprobe rocket". To have the module loaded automatically -upon system boot, edit the /etc/modprobe.conf file and add the line -"alias char-major-46 rocket". - -In order to use the ports, their device names (nodes) must be created with mknod. -This is only required once, the system will retain the names once created. To -create the RocketPort/RocketModem device names, use the command -"mknod /dev/ttyRx c 46 x" where x is the port number starting at zero. For example: - ->mknod /dev/ttyR0 c 46 0 ->mknod /dev/ttyR1 c 46 1 ->mknod /dev/ttyR2 c 46 2 - -The Linux script MAKEDEV will create the first 16 ttyRx device names (nodes) -for you: - ->/dev/MAKEDEV ttyR - -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- - -ISA Rocketport Boards ---------------------- - -You must assign and configure the I/O addresses used by the ISA Rocketport -card before installing and using it. This is done by setting a set of DIP -switches on the Rocketport board. - - -SETTING THE I/O ADDRESS ------------------------ - -Before installing RocketPort(R) or RocketPort RA boards, you must find -a range of I/O addresses for it to use. The first RocketPort card -requires a 68-byte contiguous block of I/O addresses, starting at one -of the following: 0x100h, 0x140h, 0x180h, 0x200h, 0x240h, 0x280h, -0x300h, 0x340h, 0x380h. This I/O address must be reflected in the DIP -switches of *all* of the Rocketport cards. - -The second, third, and fourth RocketPort cards require a 64-byte -contiguous block of I/O addresses, starting at one of the following -I/O addresses: 0x100h, 0x140h, 0x180h, 0x1C0h, 0x200h, 0x240h, 0x280h, -0x2C0h, 0x300h, 0x340h, 0x380h, 0x3C0h. The I/O address used by the -second, third, and fourth Rocketport cards (if present) are set via -software control. The DIP switch settings for the I/O address must be -set to the value of the first Rocketport cards. - -In order to distinguish each of the card from the others, each card -must have a unique board ID set on the dip switches. The first -Rocketport board must be set with the DIP switches corresponding to -the first board, the second board must be set with the DIP switches -corresponding to the second board, etc. IMPORTANT: The board ID is -the only place where the DIP switch settings should differ between the -various Rocketport boards in a system. - -The I/O address range used by any of the RocketPort cards must not -conflict with any other cards in the system, including other -RocketPort cards. Below, you will find a list of commonly used I/O -address ranges which may be in use by other devices in your system. -On a Linux system, "cat /proc/ioports" will also be helpful in -identifying what I/O addresses are being used by devices on your -system. - -Remember, the FIRST RocketPort uses 68 I/O addresses. So, if you set it -for 0x100, it will occupy 0x100 to 0x143. This would mean that you -CAN NOT set the second, third or fourth board for address 0x140 since -the first 4 bytes of that range are used by the first board. You would -need to set the second, third, or fourth board to one of the next available -blocks such as 0x180. - -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- - -RocketPort and RocketPort RA SW1 Settings: - - +-------------------------------+ - | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | - +-------+-------+---------------+ - | Unused| Card | I/O Port Block| - +-------------------------------+ - -DIP Switches DIP Switches -7 8 6 5 -=================== =================== -On On UNUSED, MUST BE ON. On On First Card <==== Default - On Off Second Card - Off On Third Card - Off Off Fourth Card - -DIP Switches I/O Address Range -4 3 2 1 Used by the First Card -===================================== -On Off On Off 100-143 -On Off Off On 140-183 -On Off Off Off 180-1C3 <==== Default -Off On On Off 200-243 -Off On Off On 240-283 -Off On Off Off 280-2C3 -Off Off On Off 300-343 -Off Off Off On 340-383 -Off Off Off Off 380-3C3 - -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- - -REPORTING BUGS --------------- - -For technical support, please provide the following -information: Driver version, kernel release, distribution of -kernel, and type of board you are using. Error messages and log -printouts port configuration details are especially helpful. - -USA - Phone: (612) 494-4100 - FAX: (612) 494-4199 - email: support@comtrol.com - -Comtrol Europe - Phone: +44 (0) 1 869 323-220 - FAX: +44 (0) 1 869 323-211 - email: support@comtrol.co.uk - -Web: http://www.comtrol.com -FTP: ftp.comtrol.com - -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- - - diff --git a/Documentation/s390/Debugging390.txt b/Documentation/s390/Debugging390.txt index d30a281c570..10711d9f078 100644 --- a/Documentation/s390/Debugging390.txt +++ b/Documentation/s390/Debugging390.txt @@ -1402,7 +1402,7 @@ Syscalls are implemented on Linux for S390 by the Supervisor call instruction (S possibilities of these as the instruction is made up of a 0xA opcode & the second byte being the syscall number. They are traced using the simple command. TR SVC -the syscalls are defined in linux/include/asm-s390/unistd.h +the syscalls are defined in linux/arch/s390/include/asm/unistd.h e.g. to trace all file opens just do TR SVC 5 ( as this is the syscall number of open ) diff --git a/Documentation/s390/cds.txt b/Documentation/s390/cds.txt index c4b7b2bd369..480a78ef5a1 100644 --- a/Documentation/s390/cds.txt +++ b/Documentation/s390/cds.txt @@ -98,7 +98,7 @@ platform. Some of the interface routines are specific to Linux/390 and some of them can be found on other Linux platforms implementations too. Miscellaneous function prototypes, data declarations, and macro definitions can be found in the architecture specific C header file -linux/include/asm-s390/irq.h. +linux/arch/s390/include/asm/irq.h. Overview of CDS interface concepts diff --git a/Documentation/s390/s390dbf.txt b/Documentation/s390/s390dbf.txt index e0542097369..2d10053dd97 100644 --- a/Documentation/s390/s390dbf.txt +++ b/Documentation/s390/s390dbf.txt @@ -2,7 +2,7 @@ S390 Debug Feature ================== files: arch/s390/kernel/debug.c - include/asm-s390/debug.h + arch/s390/include/asm/debug.h Description: ------------ diff --git a/Documentation/scheduler/sched-arch.txt b/Documentation/scheduler/sched-arch.txt index 941615a9769..d43dbcbd163 100644 --- a/Documentation/scheduler/sched-arch.txt +++ b/Documentation/scheduler/sched-arch.txt @@ -8,7 +8,7 @@ Context switch By default, the switch_to arch function is called with the runqueue locked. This is usually not a problem unless switch_to may need to take the runqueue lock. This is usually due to a wake up operation in -the context switch. See include/asm-ia64/system.h for an example. +the context switch. See arch/ia64/include/asm/system.h for an example. To request the scheduler call switch_to with the runqueue unlocked, you must `#define __ARCH_WANT_UNLOCKED_CTXSW` in a header file @@ -23,7 +23,7 @@ disabled. Interrupts may be enabled over the call if it is likely to introduce a significant interrupt latency by adding the line `#define __ARCH_WANT_INTERRUPTS_ON_CTXSW` in the same place as for unlocked context switches. This define also implies -`__ARCH_WANT_UNLOCKED_CTXSW`. See include/asm-arm/system.h for an +`__ARCH_WANT_UNLOCKED_CTXSW`. See arch/arm/include/asm/system.h for an example. diff --git a/Documentation/scheduler/sched-design-CFS.txt b/Documentation/scheduler/sched-design-CFS.txt index eb471c7a905..6f33593e59e 100644 --- a/Documentation/scheduler/sched-design-CFS.txt +++ b/Documentation/scheduler/sched-design-CFS.txt @@ -231,7 +231,7 @@ CPU bandwidth control purposes: This options needs CONFIG_CGROUPS to be defined, and lets the administrator create arbitrary groups of tasks, using the "cgroup" pseudo filesystem. See - Documentation/cgroups.txt for more information about this filesystem. + Documentation/cgroups/cgroups.txt for more information about this filesystem. Only one of these options to group tasks can be chosen and not both. @@ -273,3 +273,24 @@ task groups and modify their CPU share using the "cgroups" pseudo filesystem. # #Launch gmplayer (or your favourite movie player) # echo > multimedia/tasks + +8. Implementation note: user namespaces + +User namespaces are intended to be hierarchical. But they are currently +only partially implemented. Each of those has ramifications for CFS. + +First, since user namespaces are hierarchical, the /sys/kernel/uids +presentation is inadequate. Eventually we will likely want to use sysfs +tagging to provide private views of /sys/kernel/uids within each user +namespace. + +Second, the hierarchical nature is intended to support completely +unprivileged use of user namespaces. So if using user groups, then +we want the users in a user namespace to be children of the user +who created it. + +That is currently unimplemented. So instead, every user in a new +user namespace will receive 1024 shares just like any user in the +initial user namespace. Note that at the moment creation of a new +user namespace requires each of CAP_SYS_ADMIN, CAP_SETUID, and +CAP_SETGID. diff --git a/Documentation/scsi/ChangeLog.lpfc b/Documentation/scsi/ChangeLog.lpfc index ae3f962a7cf..ff19a52fe00 100644 --- a/Documentation/scsi/ChangeLog.lpfc +++ b/Documentation/scsi/ChangeLog.lpfc @@ -733,7 +733,7 @@ Changes from 20040920 to 20041018 I/O completion path a little more, especially taking care of fast-pathing the non-error case. Also removes tons of dead members and defines from lpfc_scsi.h - e.g. lpfc_target is down - to nothing more then the lpfc_nodelist pointer. + to nothing more than the lpfc_nodelist pointer. * Added binary sysfs file to issue mbox commands * Replaced #if __BIG_ENDIAN with #if __BIG_ENDIAN_BITFIELD for compatibility with the user space applications. diff --git a/Documentation/scsi/ChangeLog.ncr53c8xx b/Documentation/scsi/ChangeLog.ncr53c8xx index a9f721aeb11..8b278c10edf 100644 --- a/Documentation/scsi/ChangeLog.ncr53c8xx +++ b/Documentation/scsi/ChangeLog.ncr53c8xx @@ -19,7 +19,7 @@ Sun Sep 24 21:30 2000 Gerard Roudier (groudier@club-internet.fr) Wed Jul 26 23:30 2000 Gerard Roudier (groudier@club-internet.fr) * version ncr53c8xx-3.4.1 - - Provide OpenFirmare path through the proc FS on PPC. + - Provide OpenFirmware path through the proc FS on PPC. - Remove trailing argument #2 from a couple of #undefs. Sun Jul 09 16:30 2000 Gerard Roudier (groudier@club-internet.fr) diff --git a/Documentation/scsi/ChangeLog.sym53c8xx b/Documentation/scsi/ChangeLog.sym53c8xx index ef985ec348e..02ffbc1e8a8 100644 --- a/Documentation/scsi/ChangeLog.sym53c8xx +++ b/Documentation/scsi/ChangeLog.sym53c8xx @@ -81,7 +81,7 @@ Sun Sep 24 21:30 2000 Gerard Roudier (groudier@club-internet.fr) Wed Jul 26 23:30 2000 Gerard Roudier (groudier@club-internet.fr) * version sym53c8xx-1.7.1 - - Provide OpenFirmare path through the proc FS on PPC. + - Provide OpenFirmware path through the proc FS on PPC. - Download of on-chip SRAM using memcpy_toio() doesn't work on PPC. Restore previous method (MEMORY MOVE from SCRIPTS). - Remove trailing argument #2 from a couple of #undefs. diff --git a/Documentation/scsi/cxgb3i.txt b/Documentation/scsi/cxgb3i.txt new file mode 100644 index 00000000000..8141fa01978 --- /dev/null +++ b/Documentation/scsi/cxgb3i.txt @@ -0,0 +1,85 @@ +Chelsio S3 iSCSI Driver for Linux + +Introduction +============ + +The Chelsio T3 ASIC based Adapters (S310, S320, S302, S304, Mezz cards, etc. +series of products) supports iSCSI acceleration and iSCSI Direct Data Placement +(DDP) where the hardware handles the expensive byte touching operations, such +as CRC computation and verification, and direct DMA to the final host memory +destination: + + - iSCSI PDU digest generation and verification + + On transmitting, Chelsio S3 h/w computes and inserts the Header and + Data digest into the PDUs. + On receiving, Chelsio S3 h/w computes and verifies the Header and + Data digest of the PDUs. + + - Direct Data Placement (DDP) + + S3 h/w can directly place the iSCSI Data-In or Data-Out PDU's + payload into pre-posted final destination host-memory buffers based + on the Initiator Task Tag (ITT) in Data-In or Target Task Tag (TTT) + in Data-Out PDUs. + + - PDU Transmit and Recovery + + On transmitting, S3 h/w accepts the complete PDU (header + data) + from the host driver, computes and inserts the digests, decomposes + the PDU into multiple TCP segments if necessary, and transmit all + the TCP segments onto the wire. It handles TCP retransmission if + needed. + + On receving, S3 h/w recovers the iSCSI PDU by reassembling TCP + segments, separating the header and data, calculating and verifying + the digests, then forwards the header to the host. The payload data, + if possible, will be directly placed into the pre-posted host DDP + buffer. Otherwise, the payload data will be sent to the host too. + +The cxgb3i driver interfaces with open-iscsi initiator and provides the iSCSI +acceleration through Chelsio hardware wherever applicable. + +Using the cxgb3i Driver +======================= + +The following steps need to be taken to accelerates the open-iscsi initiator: + +1. Load the cxgb3i driver: "modprobe cxgb3i" + + The cxgb3i module registers a new transport class "cxgb3i" with open-iscsi. + + * in the case of recompiling the kernel, the cxgb3i selection is located at + Device Drivers + SCSI device support ---> + [*] SCSI low-level drivers ---> + Chelsio S3xx iSCSI support + +2. Create an interface file located under /etc/iscsi/ifaces/ for the new + transport class "cxgb3i". + + The content of the file should be in the following format: + iface.transport_name = cxgb3i + iface.net_ifacename = + iface.ipaddress = + + * if iface.ipaddress is specified, needs to be either the + same as the ethX's ip address or an address on the same subnet. Make + sure the ip address is unique in the network. + +3. edit /etc/iscsi/iscsid.conf + The default setting for MaxRecvDataSegmentLength (131072) is too big, + replace "node.conn[0].iscsi.MaxRecvDataSegmentLength" to be a value no + bigger than 15360 (for example 8192): + + node.conn[0].iscsi.MaxRecvDataSegmentLength = 8192 + + * The login would fail for a normal session if MaxRecvDataSegmentLength is + too big. A error message in the format of + "cxgb3i: ERR! MaxRecvSegmentLength too big. Need to be <= ." + would be logged to dmesg. + +4. To direct open-iscsi traffic to go through cxgb3i's accelerated path, + "-I " option needs to be specified with most of the + iscsiadm command. is the transport interface file created + in step 2. diff --git a/Documentation/scsi/scsi_fc_transport.txt b/Documentation/scsi/scsi_fc_transport.txt index 38d324d62b2..e5b071d4661 100644 --- a/Documentation/scsi/scsi_fc_transport.txt +++ b/Documentation/scsi/scsi_fc_transport.txt @@ -191,7 +191,7 @@ Vport States: This is equivalent to a driver "attach" on an adapter, which is independent of the adapter's link state. - Instantiation of the vport on the FC link via ELS traffic, etc. - This is equivalent to a "link up" and successfull link initialization. + This is equivalent to a "link up" and successful link initialization. Further information can be found in the interfaces section below for Vport Creation. @@ -320,7 +320,7 @@ Vport Creation: This is equivalent to a driver "attach" on an adapter, which is independent of the adapter's link state. - Instantiation of the vport on the FC link via ELS traffic, etc. - This is equivalent to a "link up" and successfull link initialization. + This is equivalent to a "link up" and successful link initialization. The LLDD's vport_create() function will not synchronously wait for both parts to be fully completed before returning. It must validate that the diff --git a/Documentation/serial/00-INDEX b/Documentation/serial/00-INDEX new file mode 100644 index 00000000000..07dcdb0d2a3 --- /dev/null +++ b/Documentation/serial/00-INDEX @@ -0,0 +1,24 @@ +00-INDEX + - this file. +README.cycladesZ + - info on Cyclades-Z firmware loading. +computone.txt + - info on Computone Intelliport II/Plus Multiport Serial Driver. +digiepca.txt + - info on Digi Intl. {PC,PCI,EISA}Xx and Xem series cards. +hayes-esp.txt + - info on using the Hayes ESP serial driver. +moxa-smartio + - file with info on installing/using Moxa multiport serial driver. +riscom8.txt + - notes on using the RISCom/8 multi-port serial driver. +rocket.txt + - info on the Comtrol RocketPort multiport serial driver. +specialix.txt + - info on hardware/driver for specialix IO8+ multiport serial card. +stallion.txt + - info on using the Stallion multiport serial driver. +sx.txt + - info on the Specialix SX/SI multiport serial driver. +tty.txt + - guide to the locking policies of the tty layer. diff --git a/Documentation/serial/README.cycladesZ b/Documentation/serial/README.cycladesZ new file mode 100644 index 00000000000..024a69443cc --- /dev/null +++ b/Documentation/serial/README.cycladesZ @@ -0,0 +1,8 @@ + +The Cyclades-Z must have firmware loaded onto the card before it will +operate. This operation should be performed during system startup, + +The firmware, loader program and the latest device driver code are +available from Cyclades at + ftp://ftp.cyclades.com/pub/cyclades/cyclades-z/linux/ + diff --git a/Documentation/serial/computone.txt b/Documentation/serial/computone.txt new file mode 100644 index 00000000000..c57ea4781e5 --- /dev/null +++ b/Documentation/serial/computone.txt @@ -0,0 +1,522 @@ +NOTE: This is an unmaintained driver. It is not guaranteed to work due to +changes made in the tty layer in 2.6. If you wish to take over maintenance of +this driver, contact Michael Warfield . + +Changelog: +---------- +11-01-2001: Original Document + +10-29-2004: Minor misspelling & format fix, update status of driver. + James Nelson + +Computone Intelliport II/Plus Multiport Serial Driver +----------------------------------------------------- + +Release Notes For Linux Kernel 2.2 and higher. +These notes are for the drivers which have already been integrated into the +kernel and have been tested on Linux kernels 2.0, 2.2, 2.3, and 2.4. + +Version: 1.2.14 +Date: 11/01/2001 +Historical Author: Andrew Manison +Primary Author: Doug McNash +Support: support@computone.com +Fixes and Updates: Mike Warfield + +This file assumes that you are using the Computone drivers which are +integrated into the kernel sources. For updating the drivers or installing +drivers into kernels which do not already have Computone drivers, please +refer to the instructions in the README.computone file in the driver patch. + + +1. INTRODUCTION + +This driver supports the entire family of Intelliport II/Plus controllers +with the exception of the MicroChannel controllers. It does not support +products previous to the Intelliport II. + +This driver was developed on the v2.0.x Linux tree and has been tested up +to v2.4.14; it will probably not work with earlier v1.X kernels,. + + +2. QUICK INSTALLATION + +Hardware - If you have an ISA card, find a free interrupt and io port. + List those in use with `cat /proc/interrupts` and + `cat /proc/ioports`. Set the card dip switches to a free + address. You may need to configure your BIOS to reserve an + irq for an ISA card. PCI and EISA parameters are set + automagically. Insert card into computer with the power off + before or after drivers installation. + + Note the hardware address from the Computone ISA cards installed into + the system. These are required for editing ip2.c or editing + /etc/modprobe.conf, or for specification on the modprobe + command line. + + Note that the /etc/modules.conf should be used for older (pre-2.6) + kernels. + +Software - + +Module installation: + +a) Determine free irq/address to use if any (configure BIOS if need be) +b) Run "make config" or "make menuconfig" or "make xconfig" + Select (m) module for CONFIG_COMPUTONE under character + devices. CONFIG_PCI and CONFIG_MODULES also may need to be set. +c) Set address on ISA cards then: + edit /usr/src/linux/drivers/char/ip2.c if needed + or + edit /etc/modprobe.conf if needed (module). + or both to match this setting. +d) Run "make modules" +e) Run "make modules_install" +f) Run "/sbin/depmod -a" +g) install driver using `modprobe ip2 ` (options listed below) +h) run ip2mkdev (either the script below or the binary version) + + +Kernel installation: + +a) Determine free irq/address to use if any (configure BIOS if need be) +b) Run "make config" or "make menuconfig" or "make xconfig" + Select (y) kernel for CONFIG_COMPUTONE under character + devices. CONFIG_PCI may need to be set if you have PCI bus. +c) Set address on ISA cards then: + edit /usr/src/linux/drivers/char/ip2.c + (Optional - may be specified on kernel command line now) +d) Run "make zImage" or whatever target you prefer. +e) mv /usr/src/linux/arch/i386/boot/zImage to /boot. +f) Add new config for this kernel into /etc/lilo.conf, run "lilo" + or copy to a floppy disk and boot from that floppy disk. +g) Reboot using this kernel +h) run ip2mkdev (either the script below or the binary version) + +Kernel command line options: + +When compiling the driver into the kernel, io and irq may be +compiled into the driver by editing ip2.c and setting the values for +io and irq in the appropriate array. An alternative is to specify +a command line parameter to the kernel at boot up. + + ip2=io0,irq0,io1,irq1,io2,irq2,io3,irq3 + +Note that this order is very different from the specifications for the +modload parameters which have separate IRQ and IO specifiers. + +The io port also selects PCI (1) and EISA (2) boards. + + io=0 No board + io=1 PCI board + io=2 EISA board + else ISA board io address + +You only need to specify the boards which are present. + + Examples: + + 2 PCI boards: + + ip2=1,0,1,0 + + 1 ISA board at 0x310 irq 5: + + ip2=0x310,5 + +This can be added to and "append" option in lilo.conf similar to this: + + append="ip2=1,0,1,0" + + +3. INSTALLATION + +Previously, the driver sources were packaged with a set of patch files +to update the character drivers' makefile and configuration file, and other +kernel source files. A build script (ip2build) was included which applies +the patches if needed, and build any utilities needed. +What you receive may be a single patch file in conventional kernel +patch format build script. That form can also be applied by +running patch -p1 < ThePatchFile. Otherwise run ip2build. + +The driver can be installed as a module (recommended) or built into the +kernel. This is selected as for other drivers through the `make config` +command from the root of the Linux source tree. If the driver is built +into the kernel you will need to edit the file ip2.c to match the boards +you are installing. See that file for instructions. If the driver is +installed as a module the configuration can also be specified on the +modprobe command line as follows: + + modprobe ip2 irq=irq1,irq2,irq3,irq4 io=addr1,addr2,addr3,addr4 + +where irqnum is one of the valid Intelliport II interrupts (3,4,5,7,10,11, +12,15) and addr1-4 are the base addresses for up to four controllers. If +the irqs are not specified the driver uses the default in ip2.c (which +selects polled mode). If no base addresses are specified the defaults in +ip2.c are used. If you are autoloading the driver module with kerneld or +kmod the base addresses and interrupt number must also be set in ip2.c +and recompile or just insert and options line in /etc/modprobe.conf or both. +The options line is equivalent to the command line and takes precedence over +what is in ip2.c. + +/etc/modprobe.conf sample: + options ip2 io=1,0x328 irq=1,10 + alias char-major-71 ip2 + alias char-major-72 ip2 + alias char-major-73 ip2 + +The equivalent in ip2.c: + +static int io[IP2_MAX_BOARDS]= { 1, 0x328, 0, 0 }; +static int irq[IP2_MAX_BOARDS] = { 1, 10, -1, -1 }; + +The equivalent for the kernel command line (in lilo.conf): + + append="ip2=1,1,0x328,10" + + +Note: Both io and irq should be updated to reflect YOUR system. An "io" + address of 1 or 2 indicates a PCI or EISA card in the board table. + The PCI or EISA irq will be assigned automatically. + +Specifying an invalid or in-use irq will default the driver into +running in polled mode for that card. If all irq entries are 0 then +all cards will operate in polled mode. + +If you select the driver as part of the kernel run : + + make zlilo (or whatever you do to create a bootable kernel) + +If you selected a module run : + + make modules && make modules_install + +The utility ip2mkdev (see 5 and 7 below) creates all the device nodes +required by the driver. For a device to be created it must be configured +in the driver and the board must be installed. Only devices corresponding +to real IntelliPort II ports are created. With multiple boards and expansion +boxes this will leave gaps in the sequence of device names. ip2mkdev uses +Linux tty naming conventions: ttyF0 - ttyF255 for normal devices, and +cuf0 - cuf255 for callout devices. + + +4. USING THE DRIVERS + +As noted above, the driver implements the ports in accordance with Linux +conventions, and the devices should be interchangeable with the standard +serial devices. (This is a key point for problem reporting: please make +sure that what you are trying do works on the ttySx/cuax ports first; then +tell us what went wrong with the ip2 ports!) + +Higher speeds can be obtained using the setserial utility which remaps +38,400 bps (extb) to 57,600 bps, 115,200 bps, or a custom speed. +Intelliport II installations using the PowerPort expansion module can +use the custom speed setting to select the highest speeds: 153,600 bps, +230,400 bps, 307,200 bps, 460,800bps and 921,600 bps. The base for +custom baud rate configuration is fixed at 921,600 for cards/expansion +modules with ST654's and 115200 for those with Cirrus CD1400's. This +corresponds to the maximum bit rates those chips are capable. +For example if the baud base is 921600 and the baud divisor is 18 then +the custom rate is 921600/18 = 51200 bps. See the setserial man page for +complete details. Of course if stty accepts the higher rates now you can +use that as well as the standard ioctls(). + + +5. ip2mkdev and assorted utilities... + +Several utilities, including the source for a binary ip2mkdev utility are +available under .../drivers/char/ip2. These can be build by changing to +that directory and typing "make" after the kernel has be built. If you do +not wish to compile the binary utilities, the shell script below can be +cut out and run as "ip2mkdev" to create the necessary device files. To +use the ip2mkdev script, you must have procfs enabled and the proc file +system mounted on /proc. + + +6. NOTES + +This is a release version of the driver, but it is impossible to test it +in all configurations of Linux. If there is any anomalous behaviour that +does not match the standard serial port's behaviour please let us know. + + +7. ip2mkdev shell script + +Previously, this script was simply attached here. It is now attached as a +shar archive to make it easier to extract the script from the documentation. +To create the ip2mkdev shell script change to a convenient directory (/tmp +works just fine) and run the following command: + + unshar Documentation/serial/computone.txt + (This file) + +You should now have a file ip2mkdev in your current working directory with +permissions set to execute. Running that script with then create the +necessary devices for the Computone boards, interfaces, and ports which +are present on you system at the time it is run. + + +#!/bin/sh +# This is a shell archive (produced by GNU sharutils 4.2.1). +# To extract the files from this archive, save it to some FILE, remove +# everything before the `!/bin/sh' line above, then type `sh FILE'. +# +# Made on 2001-10-29 10:32 EST by . +# Source directory was `/home2/src/tmp'. +# +# Existing files will *not* be overwritten unless `-c' is specified. +# +# This shar contains: +# length mode name +# ------ ---------- ------------------------------------------ +# 4251 -rwxr-xr-x ip2mkdev +# +save_IFS="${IFS}" +IFS="${IFS}:" +gettext_dir=FAILED +locale_dir=FAILED +first_param="$1" +for dir in $PATH +do + if test "$gettext_dir" = FAILED && test -f $dir/gettext \ + && ($dir/gettext --version >/dev/null 2>&1) + then + set `$dir/gettext --version 2>&1` + if test "$3" = GNU + then + gettext_dir=$dir + fi + fi + if test "$locale_dir" = FAILED && test -f $dir/shar \ + && ($dir/shar --print-text-domain-dir >/dev/null 2>&1) + then + locale_dir=`$dir/shar --print-text-domain-dir` + fi +done +IFS="$save_IFS" +if test "$locale_dir" = FAILED || test "$gettext_dir" = FAILED +then + echo=echo +else + TEXTDOMAINDIR=$locale_dir + export TEXTDOMAINDIR + TEXTDOMAIN=sharutils + export TEXTDOMAIN + echo="$gettext_dir/gettext -s" +fi +if touch -am -t 200112312359.59 $$.touch >/dev/null 2>&1 && test ! -f 200112312359.59 -a -f $$.touch; then + shar_touch='touch -am -t $1$2$3$4$5$6.$7 "$8"' +elif touch -am 123123592001.59 $$.touch >/dev/null 2>&1 && test ! -f 123123592001.59 -a ! -f 123123592001.5 -a -f $$.touch; then + shar_touch='touch -am $3$4$5$6$1$2.$7 "$8"' +elif touch -am 1231235901 $$.touch >/dev/null 2>&1 && test ! -f 1231235901 -a -f $$.touch; then + shar_touch='touch -am $3$4$5$6$2 "$8"' +else + shar_touch=: + echo + $echo 'WARNING: not restoring timestamps. Consider getting and' + $echo "installing GNU \`touch', distributed in GNU File Utilities..." + echo +fi +rm -f 200112312359.59 123123592001.59 123123592001.5 1231235901 $$.touch +# +if mkdir _sh17581; then + $echo 'x -' 'creating lock directory' +else + $echo 'failed to create lock directory' + exit 1 +fi +# ============= ip2mkdev ============== +if test -f 'ip2mkdev' && test "$first_param" != -c; then + $echo 'x -' SKIPPING 'ip2mkdev' '(file already exists)' +else + $echo 'x -' extracting 'ip2mkdev' '(text)' + sed 's/^X//' << 'SHAR_EOF' > 'ip2mkdev' && +#!/bin/sh - +# +# ip2mkdev +# +# Make or remove devices as needed for Computone Intelliport drivers +# +# First rule! If the dev file exists and you need it, don't mess +# with it. That prevents us from screwing up open ttys, ownership +# and permissions on a running system! +# +# This script will NOT remove devices that no longer exist if their +# board or interface box has been removed. If you want to get rid +# of them, you can manually do an "rm -f /dev/ttyF* /dev/cuaf*" +# before running this script. Running this script will then recreate +# all the valid devices. +# +# Michael H. Warfield +# /\/\|=mhw=|\/\/ +# mhw@wittsend.com +# +# Updated 10/29/2000 for version 1.2.13 naming convention +# under devfs. /\/\|=mhw=|\/\/ +# +# Updated 03/09/2000 for devfs support in ip2 drivers. /\/\|=mhw=|\/\/ +# +X +if test -d /dev/ip2 ; then +# This is devfs mode... We don't do anything except create symlinks +# from the real devices to the old names! +X cd /dev +X echo "Creating symbolic links to devfs devices" +X for i in `ls ip2` ; do +X if test ! -L ip2$i ; then +X # Remove it incase it wasn't a symlink (old device) +X rm -f ip2$i +X ln -s ip2/$i ip2$i +X fi +X done +X for i in `( cd tts ; ls F* )` ; do +X if test ! -L tty$i ; then +X # Remove it incase it wasn't a symlink (old device) +X rm -f tty$i +X ln -s tts/$i tty$i +X fi +X done +X for i in `( cd cua ; ls F* )` ; do +X DEVNUMBER=`expr $i : 'F\(.*\)'` +X if test ! -L cuf$DEVNUMBER ; then +X # Remove it incase it wasn't a symlink (old device) +X rm -f cuf$DEVNUMBER +X ln -s cua/$i cuf$DEVNUMBER +X fi +X done +X exit 0 +fi +X +if test ! -f /proc/tty/drivers +then +X echo "\ +Unable to check driver status. +Make sure proc file system is mounted." +X +X exit 255 +fi +X +if test ! -f /proc/tty/driver/ip2 +then +X echo "\ +Unable to locate ip2 proc file. +Attempting to load driver" +X +X if /sbin/insmod ip2 +X then +X if test ! -f /proc/tty/driver/ip2 +X then +X echo "\ +Unable to locate ip2 proc file after loading driver. +Driver initialization failure or driver version error. +" +X exit 255 +X fi +X else +X echo "Unable to load ip2 driver." +X exit 255 +X fi +fi +X +# Ok... So we got the driver loaded and we can locate the procfs files. +# Next we need our major numbers. +X +TTYMAJOR=`sed -e '/^ip2/!d' -e '/\/dev\/tt/!d' -e 's/.*tt[^ ]*[ ]*\([0-9]*\)[ ]*.*/\1/' < /proc/tty/drivers` +CUAMAJOR=`sed -e '/^ip2/!d' -e '/\/dev\/cu/!d' -e 's/.*cu[^ ]*[ ]*\([0-9]*\)[ ]*.*/\1/' < /proc/tty/drivers` +BRDMAJOR=`sed -e '/^Driver: /!d' -e 's/.*IMajor=\([0-9]*\)[ ]*.*/\1/' < /proc/tty/driver/ip2` +X +echo "\ +TTYMAJOR = $TTYMAJOR +CUAMAJOR = $CUAMAJOR +BRDMAJOR = $BRDMAJOR +" +X +# Ok... Now we should know our major numbers, if appropriate... +# Now we need our boards and start the device loops. +X +grep '^Board [0-9]:' /proc/tty/driver/ip2 | while read token number type alltherest +do +X # The test for blank "type" will catch the stats lead-in lines +X # if they exist in the file +X if test "$type" = "vacant" -o "$type" = "Vacant" -o "$type" = "" +X then +X continue +X fi +X +X BOARDNO=`expr "$number" : '\([0-9]\):'` +X PORTS=`expr "$alltherest" : '.*ports=\([0-9]*\)' | tr ',' ' '` +X MINORS=`expr "$alltherest" : '.*minors=\([0-9,]*\)' | tr ',' ' '` +X +X if test "$BOARDNO" = "" -o "$PORTS" = "" +X then +# This may be a bug. We should at least get this much information +X echo "Unable to process board line" +X continue +X fi +X +X if test "$MINORS" = "" +X then +# Silently skip this one. This board seems to have no boxes +X continue +X fi +X +X echo "board $BOARDNO: $type ports = $PORTS; port numbers = $MINORS" +X +X if test "$BRDMAJOR" != "" +X then +X BRDMINOR=`expr $BOARDNO \* 4` +X STSMINOR=`expr $BRDMINOR + 1` +X if test ! -c /dev/ip2ipl$BOARDNO ; then +X mknod /dev/ip2ipl$BOARDNO c $BRDMAJOR $BRDMINOR +X fi +X if test ! -c /dev/ip2stat$BOARDNO ; then +X mknod /dev/ip2stat$BOARDNO c $BRDMAJOR $STSMINOR +X fi +X fi +X +X if test "$TTYMAJOR" != "" +X then +X PORTNO=$BOARDBASE +X +X for PORTNO in $MINORS +X do +X if test ! -c /dev/ttyF$PORTNO ; then +X # We got the hardware but no device - make it +X mknod /dev/ttyF$PORTNO c $TTYMAJOR $PORTNO +X fi +X done +X fi +X +X if test "$CUAMAJOR" != "" +X then +X PORTNO=$BOARDBASE +X +X for PORTNO in $MINORS +X do +X if test ! -c /dev/cuf$PORTNO ; then +X # We got the hardware but no device - make it +X mknod /dev/cuf$PORTNO c $CUAMAJOR $PORTNO +X fi +X done +X fi +done +X +Xexit 0 +SHAR_EOF + (set 20 01 10 29 10 32 01 'ip2mkdev'; eval "$shar_touch") && + chmod 0755 'ip2mkdev' || + $echo 'restore of' 'ip2mkdev' 'failed' + if ( md5sum --help 2>&1 | grep 'sage: md5sum \[' ) >/dev/null 2>&1 \ + && ( md5sum --version 2>&1 | grep -v 'textutils 1.12' ) >/dev/null; then + md5sum -c << SHAR_EOF >/dev/null 2>&1 \ + || $echo 'ip2mkdev:' 'MD5 check failed' +cb5717134509f38bad9fde6b1f79b4a4 ip2mkdev +SHAR_EOF + else + shar_count="`LC_ALL= LC_CTYPE= LANG= wc -c < 'ip2mkdev'`" + test 4251 -eq "$shar_count" || + $echo 'ip2mkdev:' 'original size' '4251,' 'current size' "$shar_count!" + fi +fi +rm -fr _sh17581 +exit 0 diff --git a/Documentation/serial/digiepca.txt b/Documentation/serial/digiepca.txt new file mode 100644 index 00000000000..f2560e22f2c --- /dev/null +++ b/Documentation/serial/digiepca.txt @@ -0,0 +1,98 @@ +NOTE: This driver is obsolete. Digi provides a 2.6 driver (dgdm) at +http://www.digi.com for PCI cards. They no longer maintain this driver, +and have no 2.6 driver for ISA cards. + +This driver requires a number of user-space tools. They can be acquired from +http://www.digi.com, but only works with 2.4 kernels. + + +The Digi Intl. epca driver. +---------------------------- +The Digi Intl. epca driver for Linux supports the following boards: + +Digi PC/Xem, PC/Xr, PC/Xe, PC/Xi, PC/Xeve +Digi EISA/Xem, PCI/Xem, PCI/Xr + +Limitations: +------------ +Currently the driver only autoprobes for supported PCI boards. + +The Linux MAKEDEV command does not support generating the Digiboard +Devices. Users executing digiConfig to setup EISA and PC series cards +will have their device nodes automatically constructed (cud?? for ~CLOCAL, +and ttyD?? for CLOCAL). Users wishing to boot their board from the LILO +prompt, or those users booting PCI cards may use buildDIGI to construct +the necessary nodes. + +Notes: +------ +This driver may be configured via LILO. For users who have already configured +their driver using digiConfig, configuring from LILO will override previous +settings. Multiple boards may be configured by issuing multiple LILO command +lines. For examples see the bottom of this document. + +Device names start at 0 and continue up. Beware of this as previous Digi +drivers started device names with 1. + +PCI boards are auto-detected and configured by the driver. PCI boards will +be allocated device numbers (internally) beginning with the lowest PCI slot +first. In other words a PCI card in slot 3 will always have higher device +nodes than a PCI card in slot 1. + +LILO config examples: +--------------------- +Using LILO's APPEND command, a string of comma separated identifiers or +integers can be used to configure supported boards. The six values in order +are: + + Enable/Disable this card or Override, + Type of card: PC/Xe (AccelePort) (0), PC/Xeve (1), PC/Xem or PC/Xr (2), + EISA/Xem (3), PC/64Xe (4), PC/Xi (5), + Enable/Disable alternate pin arrangement, + Number of ports on this card, + I/O Port where card is configured (in HEX if using string identifiers), + Base of memory window (in HEX if using string identifiers), + +NOTE : PCI boards are auto-detected and configured. Do not attempt to +configure PCI boards with the LILO append command. If you wish to override +previous configuration data (As set by digiConfig), but you do not wish to +configure any specific card (Example if there are PCI cards in the system) +the following override command will accomplish this: +-> append="digi=2" + +Samples: + append="digiepca=E,PC/Xe,D,16,200,D0000" + or + append="digi=1,0,0,16,512,851968" + +Supporting Tools: +----------------- +Supporting tools include digiDload, digiConfig, buildPCI, and ditty. See +drivers/char/README.epca for more details. Note, +this driver REQUIRES that digiDload be executed prior to it being used. +Failure to do this will result in an ENODEV error. + +Documentation: +-------------- +Complete documentation for this product may be found in the tool package. + +Sources of information and support: +----------------------------------- +Digi Intl. support site for this product: + +-> http://www.digi.com + +Acknowledgments: +---------------- +Much of this work (And even text) was derived from a similar document +supporting the original public domain DigiBoard driver Copyright (C) +1994,1995 Troy De Jongh. Many thanks to Christoph Lameter +(christoph@lameter.com) and Mike McLagan (mike.mclagan@linux.org) who authored +and contributed to the original document. + +Changelog: +---------- +10-29-04: Update status of driver, remove dead links in document + James Nelson + +2000 (?) Original Document diff --git a/Documentation/serial/hayes-esp.txt b/Documentation/serial/hayes-esp.txt new file mode 100644 index 00000000000..09b5d585675 --- /dev/null +++ b/Documentation/serial/hayes-esp.txt @@ -0,0 +1,154 @@ +HAYES ESP DRIVER VERSION 2.1 + +A big thanks to the people at Hayes, especially Alan Adamson. Their support +has enabled me to provide enhancements to the driver. + +Please report your experiences with this driver to me (arobinso@nyx.net). I +am looking for both positive and negative feedback. + +*** IMPORTANT CHANGES FOR 2.1 *** +Support for PIO mode. Five situations will cause PIO mode to be used: +1) A multiport card is detected. PIO mode will always be used. (8 port cards +do not support DMA). +2) The DMA channel is set to an invalid value (anything other than 1 or 3). +3) The DMA buffer/channel could not be allocated. The port will revert to PIO +mode until it is reopened. +4) Less than a specified number of bytes need to be transferred to/from the +FIFOs. PIO mode will be used for that transfer only. +5) A port needs to do a DMA transfer and another port is already using the +DMA channel. PIO mode will be used for that transfer only. + +Since the Hayes ESP seems to conflict with other cards (notably sound cards) +when using DMA, DMA is turned off by default. To use DMA, it must be turned +on explicitly, either with the "dma=" option described below or with +setserial. A multiport card can be forced into DMA mode by using setserial; +however, most multiport cards don't support DMA. + +The latest version of setserial allows the enhanced configuration of the ESP +card to be viewed and modified. +*** + +This package contains the files needed to compile a module to support the Hayes +ESP card. The drivers are basically a modified version of the serial drivers. + +Features: + +- Uses the enhanced mode of the ESP card, allowing a wider range of + interrupts and features than compatibility mode +- Uses DMA and 16 bit PIO mode to transfer data to and from the ESP's FIFOs, + reducing CPU load +- Supports primary and secondary ports + + +If the driver is compiled as a module, the IRQs to use can be specified by +using the irq= option. The format is: + +irq=[0x100],[0x140],[0x180],[0x200],[0x240],[0x280],[0x300],[0x380] + +The address in brackets is the base address of the card. The IRQ of +nonexistent cards can be set to 0. If an IRQ of a card that does exist is set +to 0, the driver will attempt to guess at the correct IRQ. For example, to set +the IRQ of the card at address 0x300 to 12, the insmod command would be: + +insmod esp irq=0,0,0,0,0,0,12,0 + +The custom divisor can be set by using the divisor= option. The format is the +same as for the irq= option. Each divisor value is a series of hex digits, +with each digit representing the divisor to use for a corresponding port. The +divisor value is constructed RIGHT TO LEFT. Specifying a nonzero divisor value +will automatically set the spd_cust flag. To calculate the divisor to use for +a certain baud rate, divide the port's base baud (generally 921600) by the +desired rate. For example, to set the divisor of the primary port at 0x300 to +4 and the divisor of the secondary port at 0x308 to 8, the insmod command would +be: + +insmod esp divisor=0,0,0,0,0,0,0x84,0 + +The dma= option can be used to set the DMA channel. The channel can be either +1 or 3. Specifying any other value will force the driver to use PIO mode. +For example, to set the DMA channel to 3, the insmod command would be: + +insmod esp dma=3 + +The rx_trigger= and tx_trigger= options can be used to set the FIFO trigger +levels. They specify when the ESP card should send an interrupt. Larger +values will decrease the number of interrupts; however, a value too high may +result in data loss. Valid values are 1 through 1023, with 768 being the +default. For example, to set the receive trigger level to 512 bytes and the +transmit trigger level to 700 bytes, the insmod command would be: + +insmod esp rx_trigger=512 tx_trigger=700 + +The flow_off= and flow_on= options can be used to set the hardware flow off/ +flow on levels. The flow on level must be lower than the flow off level, and +the flow off level should be higher than rx_trigger. Valid values are 1 +through 1023, with 1016 being the default flow off level and 944 being the +default flow on level. For example, to set the flow off level to 1000 bytes +and the flow on level to 935 bytes, the insmod command would be: + +insmod esp flow_off=1000 flow_on=935 + +The rx_timeout= option can be used to set the receive timeout value. This +value indicates how long after receiving the last character that the ESP card +should wait before signalling an interrupt. Valid values are 0 though 255, +with 128 being the default. A value too high will increase latency, and a +value too low will cause unnecessary interrupts. For example, to set the +receive timeout to 255, the insmod command would be: + +insmod esp rx_timeout=255 + +The pio_threshold= option sets the threshold (in number of characters) for +using PIO mode instead of DMA mode. For example, if this value is 32, +transfers of 32 bytes or less will always use PIO mode. + +insmod esp pio_threshold=32 + +Multiple options can be listed on the insmod command line by separating each +option with a space. For example: + +insmod esp dma=3 trigger=512 + +The esp module can be automatically loaded when needed. To cause this to +happen, add the following lines to /etc/modprobe.conf (replacing the last line +with options for your configuration): + +alias char-major-57 esp +alias char-major-58 esp +options esp irq=0,0,0,0,0,0,3,0 divisor=0,0,0,0,0,0,0x4,0 + +You may also need to run 'depmod -a'. + +Devices must be created manually. To create the devices, note the output from +the module after it is inserted. The output will appear in the location where +kernel messages usually appear (usually /var/adm/messages). Create two devices +for each 'tty' mentioned, one with major of 57 and the other with major of 58. +The minor number should be the same as the tty number reported. The commands +would be (replace ? with the tty number): + +mknod /dev/ttyP? c 57 ? +mknod /dev/cup? c 58 ? + +For example, if the following line appears: + +Oct 24 18:17:23 techno kernel: ttyP8 at 0x0140 (irq = 3) is an ESP primary port + +...two devices should be created: + +mknod /dev/ttyP8 c 57 8 +mknod /dev/cup8 c 58 8 + +You may need to set the permissions on the devices: + +chmod 666 /dev/ttyP* +chmod 666 /dev/cup* + +The ESP module and the serial module should not conflict (they can be used at +the same time). After the ESP module has been loaded the ports on the ESP card +will no longer be accessible by the serial driver. + +If I/O errors are experienced when accessing the port, check for IRQ and DMA +conflicts ('cat /proc/interrupts' and 'cat /proc/dma' for a list of IRQs and +DMAs currently in use). + +Enjoy! +Andrew J. Robinson diff --git a/Documentation/serial/moxa-smartio b/Documentation/serial/moxa-smartio new file mode 100644 index 00000000000..5337e80a5b9 --- /dev/null +++ b/Documentation/serial/moxa-smartio @@ -0,0 +1,523 @@ +============================================================================= + MOXA Smartio/Industio Family Device Driver Installation Guide + for Linux Kernel 2.4.x, 2.6.x + Copyright (C) 2008, Moxa Inc. +============================================================================= +Date: 01/21/2008 + +Content + +1. Introduction +2. System Requirement +3. Installation + 3.1 Hardware installation + 3.2 Driver files + 3.3 Device naming convention + 3.4 Module driver configuration + 3.5 Static driver configuration for Linux kernel 2.4.x and 2.6.x. + 3.6 Custom configuration + 3.7 Verify driver installation +4. Utilities +5. Setserial +6. Troubleshooting + +----------------------------------------------------------------------------- +1. Introduction + + The Smartio/Industio/UPCI family Linux driver supports following multiport + boards. + + - 2 ports multiport board + CP-102U, CP-102UL, CP-102UF + CP-132U-I, CP-132UL, + CP-132, CP-132I, CP132S, CP-132IS, + CI-132, CI-132I, CI-132IS, + (C102H, C102HI, C102HIS, C102P, CP-102, CP-102S) + + - 4 ports multiport board + CP-104EL, + CP-104UL, CP-104JU, + CP-134U, CP-134U-I, + C104H/PCI, C104HS/PCI, + CP-114, CP-114I, CP-114S, CP-114IS, CP-114UL, + C104H, C104HS, + CI-104J, CI-104JS, + CI-134, CI-134I, CI-134IS, + (C114HI, CT-114I, C104P) + POS-104UL, + CB-114, + CB-134I + + - 8 ports multiport board + CP-118EL, CP-168EL, + CP-118U, CP-168U, + C168H/PCI, + C168H, C168HS, + (C168P), + CB-108 + + This driver and installation procedure have been developed upon Linux Kernel + 2.4.x and 2.6.x. This driver supports Intel x86 hardware platform. In order + to maintain compatibility, this version has also been properly tested with + RedHat, Mandrake, Fedora and S.u.S.E Linux. However, if compatibility problem + occurs, please contact Moxa at support@moxa.com.tw. + + In addition to device driver, useful utilities are also provided in this + version. They are + - msdiag Diagnostic program for displaying installed Moxa + Smartio/Industio boards. + - msmon Monitor program to observe data count and line status signals. + - msterm A simple terminal program which is useful in testing serial + ports. + - io-irq.exe Configuration program to setup ISA boards. Please note that + this program can only be executed under DOS. + + All the drivers and utilities are published in form of source code under + GNU General Public License in this version. Please refer to GNU General + Public License announcement in each source code file for more detail. + + In Moxa's Web sites, you may always find latest driver at http://web.moxa.com. + + This version of driver can be installed as Loadable Module (Module driver) + or built-in into kernel (Static driver). You may refer to following + installation procedure for suitable one. Before you install the driver, + please refer to hardware installation procedure in the User's Manual. + + We assume the user should be familiar with following documents. + - Serial-HOWTO + - Kernel-HOWTO + +----------------------------------------------------------------------------- +2. System Requirement + - Hardware platform: Intel x86 machine + - Kernel version: 2.4.x or 2.6.x + - gcc version 2.72 or later + - Maximum 4 boards can be installed in combination + +----------------------------------------------------------------------------- +3. Installation + + 3.1 Hardware installation + 3.2 Driver files + 3.3 Device naming convention + 3.4 Module driver configuration + 3.5 Static driver configuration for Linux kernel 2.4.x, 2.6.x. + 3.6 Custom configuration + 3.7 Verify driver installation + + + 3.1 Hardware installation + + There are two types of buses, ISA and PCI, for Smartio/Industio + family multiport board. + + ISA board + --------- + You'll have to configure CAP address, I/O address, Interrupt Vector + as well as IRQ before installing this driver. Please refer to hardware + installation procedure in User's Manual before proceed any further. + Please make sure the JP1 is open after the ISA board is set properly. + + PCI/UPCI board + -------------- + You may need to adjust IRQ usage in BIOS to avoid from IRQ conflict + with other ISA devices. Please refer to hardware installation + procedure in User's Manual in advance. + + PCI IRQ Sharing + ----------- + Each port within the same multiport board shares the same IRQ. Up to + 4 Moxa Smartio/Industio PCI Family multiport boards can be installed + together on one system and they can share the same IRQ. + + + 3.2 Driver files + + The driver file may be obtained from ftp, CD-ROM or floppy disk. The + first step, anyway, is to copy driver file "mxser.tgz" into specified + directory. e.g. /moxa. The execute commands as below. + + # cd / + # mkdir moxa + # cd /moxa + # tar xvf /dev/fd0 + + or + + # cd / + # mkdir moxa + # cd /moxa + # cp /mnt/cdrom//mxser.tgz . + # tar xvfz mxser.tgz + + + 3.3 Device naming convention + + You may find all the driver and utilities files in /moxa/mxser. + Following installation procedure depends on the model you'd like to + run the driver. If you prefer module driver, please refer to 3.4. + If static driver is required, please refer to 3.5. + + Dialin and callout port + ----------------------- + This driver remains traditional serial device properties. There are + two special file name for each serial port. One is dial-in port + which is named "ttyMxx". For callout port, the naming convention + is "cumxx". + + Device naming when more than 2 boards installed + ----------------------------------------------- + Naming convention for each Smartio/Industio multiport board is + pre-defined as below. + + Board Num. Dial-in Port Callout port + 1st board ttyM0 - ttyM7 cum0 - cum7 + 2nd board ttyM8 - ttyM15 cum8 - cum15 + 3rd board ttyM16 - ttyM23 cum16 - cum23 + 4th board ttyM24 - ttym31 cum24 - cum31 + + + !!!!!!!!!!!!!!!!!!!! NOTE !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + Under Kernel 2.6 the cum Device is Obsolete. So use ttyM* + device instead. + !!!!!!!!!!!!!!!!!!!! NOTE !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + Board sequence + -------------- + This driver will activate ISA boards according to the parameter set + in the driver. After all specified ISA board activated, PCI board + will be installed in the system automatically driven. + Therefore the board number is sorted by the CAP address of ISA boards. + For PCI boards, their sequence will be after ISA boards and C168H/PCI + has higher priority than C104H/PCI boards. + + 3.4 Module driver configuration + Module driver is easiest way to install. If you prefer static driver + installation, please skip this paragraph. + + + ------------- Prepare to use the MOXA driver-------------------- + 3.4.1 Create tty device with correct major number + Before using MOXA driver, your system must have the tty devices + which are created with driver's major number. We offer one shell + script "msmknod" to simplify the procedure. + This step is only needed to be executed once. But you still + need to do this procedure when: + a. You change the driver's major number. Please refer the "3.7" + section. + b. Your total installed MOXA boards number is changed. Maybe you + add/delete one MOXA board. + c. You want to change the tty name. This needs to modify the + shell script "msmknod" + + The procedure is: + # cd /moxa/mxser/driver + # ./msmknod + + This shell script will require the major number for dial-in + device and callout device to create tty device. You also need + to specify the total installed MOXA board number. Default major + numbers for dial-in device and callout device are 30, 35. If + you need to change to other number, please refer section "3.7" + for more detailed procedure. + Msmknod will delete any special files occupying the same device + naming. + + 3.4.2 Build the MOXA driver and utilities + Before using the MOXA driver and utilities, you need compile the + all the source code. This step is only need to be executed once. + But you still re-compile the source code if you modify the source + code. For example, if you change the driver's major number (see + "3.7" section), then you need to do this step again. + + Find "Makefile" in /moxa/mxser, then run + + # make clean; make install + + !!!!!!!!!! NOTE !!!!!!!!!!!!!!!!! + For Red Hat 9, Red Hat Enterprise Linux AS3/ES3/WS3 & Fedora Core1: + # make clean; make installsp1 + + For Red Hat Enterprise Linux AS4/ES4/WS4: + # make clean; make installsp2 + !!!!!!!!!! NOTE !!!!!!!!!!!!!!!!! + + The driver files "mxser.o" and utilities will be properly compiled + and copied to system directories respectively. + + ------------- Load MOXA driver-------------------- + 3.4.3 Load the MOXA driver + + # modprobe mxser + + will activate the module driver. You may run "lsmod" to check + if "mxser" is activated. If the MOXA board is ISA board, the + is needed. Please refer to section "3.4.5" for more + information. + + + ------------- Load MOXA driver on boot -------------------- + 3.4.4 For the above description, you may manually execute + "modprobe mxser" to activate this driver and run + "rmmod mxser" to remove it. + However, it's better to have a boot time configuration to + eliminate manual operation. Boot time configuration can be + achieved by rc file. We offer one "rc.mxser" file to simplify + the procedure under "moxa/mxser/driver". + + But if you use ISA board, please modify the "modprobe ..." command + to add the argument (see "3.4.5" section). After modifying the + rc.mxser, please try to execute "/moxa/mxser/driver/rc.mxser" + manually to make sure the modification is ok. If any error + encountered, please try to modify again. If the modification is + completed, follow the below step. + + Run following command for setting rc files. + + # cd /moxa/mxser/driver + # cp ./rc.mxser /etc/rc.d + # cd /etc/rc.d + + Check "rc.serial" is existed or not. If "rc.serial" doesn't exist, + create it by vi, run "chmod 755 rc.serial" to change the permission. + Add "/etc/rc.d/rc.mxser" in last line, + + Reboot and check if moxa.o activated by "lsmod" command. + + 3.4.5. If you'd like to drive Smartio/Industio ISA boards in the system, + you'll have to add parameter to specify CAP address of given + board while activating "mxser.o". The format for parameters are + as follows. + + modprobe mxser ioaddr=0x???,0x???,0x???,0x??? + | | | | + | | | +- 4th ISA board + | | +------ 3rd ISA board + | +------------ 2nd ISA board + +------------------- 1st ISA board + + 3.5 Static driver configuration for Linux kernel 2.4.x and 2.6.x + + Note: To use static driver, you must install the linux kernel + source package. + + 3.5.1 Backup the built-in driver in the kernel. + # cd /usr/src/linux/drivers/char + # mv mxser.c mxser.c.old + + For Red Hat 7.x user, you need to create link: + # cd /usr/src + # ln -s linux-2.4 linux + + 3.5.2 Create link + # cd /usr/src/linux/drivers/char + # ln -s /moxa/mxser/driver/mxser.c mxser.c + + 3.5.3 Add CAP address list for ISA boards. For PCI boards user, + please skip this step. + + In module mode, the CAP address for ISA board is given by + parameter. In static driver configuration, you'll have to + assign it within driver's source code. If you will not + install any ISA boards, you may skip to next portion. + The instructions to modify driver source code are as + below. + a. # cd /moxa/mxser/driver + # vi mxser.c + b. Find the array mxserBoardCAP[] as below. + + static int mxserBoardCAP[] + = {0x00, 0x00, 0x00, 0x00}; + + c. Change the address within this array using vi. For + example, to driver 2 ISA boards with CAP address + 0x280 and 0x180 as 1st and 2nd board. Just to change + the source code as follows. + + static int mxserBoardCAP[] + = {0x280, 0x180, 0x00, 0x00}; + + 3.5.4 Setup kernel configuration + + Configure the kernel: + + # cd /usr/src/linux + # make menuconfig + + You will go into a menu-driven system. Please select [Character + devices][Non-standard serial port support], enable the [Moxa + SmartIO support] driver with "[*]" for built-in (not "[M]"), then + select [Exit] to exit this program. + + 3.5.5 Rebuild kernel + The following are for Linux kernel rebuilding, for your + reference only. + For appropriate details, please refer to the Linux document. + + a. cd /usr/src/linux + b. make clean /* take a few minutes */ + c. make dep /* take a few minutes */ + d. make bzImage /* take probably 10-20 minutes */ + e. make install /* copy boot image to correct position */ + f. Please make sure the boot kernel (vmlinuz) is in the + correct position. + g. If you use 'lilo' utility, you should check /etc/lilo.conf + 'image' item specified the path which is the 'vmlinuz' path, + or you will load wrong (or old) boot kernel image (vmlinuz). + After checking /etc/lilo.conf, please run "lilo". + + Note that if the result of "make bzImage" is ERROR, then you have to + go back to Linux configuration Setup. Type "make menuconfig" in + directory /usr/src/linux. + + + 3.5.6 Make tty device and special file + # cd /moxa/mxser/driver + # ./msmknod + + 3.5.7 Make utility + # cd /moxa/mxser/utility + # make clean; make install + + 3.5.8 Reboot + + + + 3.6 Custom configuration + Although this driver already provides you default configuration, you + still can change the device name and major number. The instruction to + change these parameters are shown as below. + + Change Device name + ------------------ + If you'd like to use other device names instead of default naming + convention, all you have to do is to modify the internal code + within the shell script "msmknod". First, you have to open "msmknod" + by vi. Locate each line contains "ttyM" and "cum" and change them + to the device name you desired. "msmknod" creates the device names + you need next time executed. + + Change Major number + ------------------- + If major number 30 and 35 had been occupied, you may have to select + 2 free major numbers for this driver. There are 3 steps to change + major numbers. + + 3.6.1 Find free major numbers + In /proc/devices, you may find all the major numbers occupied + in the system. Please select 2 major numbers that are available. + e.g. 40, 45. + 3.6.2 Create special files + Run /moxa/mxser/driver/msmknod to create special files with + specified major numbers. + 3.6.3 Modify driver with new major number + Run vi to open /moxa/mxser/driver/mxser.c. Locate the line + contains "MXSERMAJOR". Change the content as below. + #define MXSERMAJOR 40 + #define MXSERCUMAJOR 45 + 3.6.4 Run "make clean; make install" in /moxa/mxser/driver. + + 3.7 Verify driver installation + You may refer to /var/log/messages to check the latest status + log reported by this driver whenever it's activated. + +----------------------------------------------------------------------------- +4. Utilities + There are 3 utilities contained in this driver. They are msdiag, msmon and + msterm. These 3 utilities are released in form of source code. They should + be compiled into executable file and copied into /usr/bin. + + Before using these utilities, please load driver (refer 3.4 & 3.5) and + make sure you had run the "msmknod" utility. + + msdiag - Diagnostic + -------------------- + This utility provides the function to display what Moxa Smartio/Industio + board found by driver in the system. + + msmon - Port Monitoring + ----------------------- + This utility gives the user a quick view about all the MOXA ports' + activities. One can easily learn each port's total received/transmitted + (Rx/Tx) character count since the time when the monitoring is started. + Rx/Tx throughputs per second are also reported in interval basis (e.g. + the last 5 seconds) and in average basis (since the time the monitoring + is started). You can reset all ports' count by key. <+> <-> + (plus/minus) keys to change the displaying time interval. Press + on the port, that cursor stay, to view the port's communication + parameters, signal status, and input/output queue. + + msterm - Terminal Emulation + --------------------------- + This utility provides data sending and receiving ability of all tty ports, + especially for MOXA ports. It is quite useful for testing simple + application, for example, sending AT command to a modem connected to the + port or used as a terminal for login purpose. Note that this is only a + dumb terminal emulation without handling full screen operation. + +----------------------------------------------------------------------------- +5. Setserial + + Supported Setserial parameters are listed as below. + + uart set UART type(16450-->disable FIFO, 16550A-->enable FIFO) + close_delay set the amount of time(in 1/100 of a second) that DTR + should be kept low while being closed. + closing_wait set the amount of time(in 1/100 of a second) that the + serial port should wait for data to be drained while + being closed, before the receiver is disable. + spd_hi Use 57.6kb when the application requests 38.4kb. + spd_vhi Use 115.2kb when the application requests 38.4kb. + spd_shi Use 230.4kb when the application requests 38.4kb. + spd_warp Use 460.8kb when the application requests 38.4kb. + spd_normal Use 38.4kb when the application requests 38.4kb. + spd_cust Use the custom divisor to set the speed when the + application requests 38.4kb. + divisor This option set the custom divison. + baud_base This option set the base baud rate. + +----------------------------------------------------------------------------- +6. Troubleshooting + + The boot time error messages and solutions are stated as clearly as + possible. If all the possible solutions fail, please contact our technical + support team to get more help. + + + Error msg: More than 4 Moxa Smartio/Industio family boards found. Fifth board + and after are ignored. + Solution: + To avoid this problem, please unplug fifth and after board, because Moxa + driver supports up to 4 boards. + + Error msg: Request_irq fail, IRQ(?) may be conflict with another device. + Solution: + Other PCI or ISA devices occupy the assigned IRQ. If you are not sure + which device causes the situation, please check /proc/interrupts to find + free IRQ and simply change another free IRQ for Moxa board. + + Error msg: Board #: C1xx Series(CAP=xxx) interrupt number invalid. + Solution: + Each port within the same multiport board shares the same IRQ. Please set + one IRQ (IRQ doesn't equal to zero) for one Moxa board. + + Error msg: No interrupt vector be set for Moxa ISA board(CAP=xxx). + Solution: + Moxa ISA board needs an interrupt vector.Please refer to user's manual + "Hardware Installation" chapter to set interrupt vector. + + Error msg: Couldn't install MOXA Smartio/Industio family driver! + Solution: + Load Moxa driver fail, the major number may conflict with other devices. + Please refer to previous section 3.7 to change a free major number for + Moxa driver. + + Error msg: Couldn't install MOXA Smartio/Industio family callout driver! + Solution: + Load Moxa callout driver fail, the callout device major number may + conflict with other devices. Please refer to previous section 3.7 to + change a free callout device major number for Moxa driver. + + +----------------------------------------------------------------------------- + diff --git a/Documentation/serial/riscom8.txt b/Documentation/serial/riscom8.txt new file mode 100644 index 00000000000..14f61fdad7c --- /dev/null +++ b/Documentation/serial/riscom8.txt @@ -0,0 +1,36 @@ +* NOTE - this is an unmaintained driver. The original author cannot be located. + +SDL Communications is now SBS Technologies, and does not have any +information on these ancient ISA cards on their website. + +James Nelson - 12-12-2004 + + This is the README for RISCom/8 multi-port serial driver + (C) 1994-1996 D.Gorodchanin + See file LICENSE for terms and conditions. + +NOTE: English is not my native language. + I'm sorry for any mistakes in this text. + +Misc. notes for RISCom/8 serial driver, in no particular order :) + +1) This driver can support up to 4 boards at time. + Use string "riscom8=0xXXX,0xXXX,0xXXX,0xXXX" at LILO prompt, for + setting I/O base addresses for boards. If you compile driver + as module use modprobe options "iobase=0xXXX iobase1=0xXXX iobase2=..." + +2) The driver partially supports famous 'setserial' program, you can use almost + any of its options, excluding port & irq settings. + +3) There are some misc. defines at the beginning of riscom8.c, please read the + comments and try to change some of them in case of problems. + +4) I consider the current state of the driver as BETA. + +5) SDL Communications WWW page is http://www.sdlcomm.com. + +6) You can use the MAKEDEV program to create RISCom/8 /dev/ttyL* entries. + +7) Minor numbers for first board are 0-7, for second 8-15, etc. + +22 Apr 1996. diff --git a/Documentation/serial/rocket.txt b/Documentation/serial/rocket.txt new file mode 100644 index 00000000000..1d858299043 --- /dev/null +++ b/Documentation/serial/rocket.txt @@ -0,0 +1,189 @@ +Comtrol(tm) RocketPort(R)/RocketModem(TM) Series +Device Driver for the Linux Operating System + +=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- + +PRODUCT OVERVIEW +---------------- + +This driver provides a loadable kernel driver for the Comtrol RocketPort +and RocketModem PCI boards. These boards provide, 2, 4, 8, 16, or 32 +high-speed serial ports or modems. This driver supports up to a combination +of four RocketPort or RocketModems boards in one machine simultaneously. +This file assumes that you are using the RocketPort driver which is +integrated into the kernel sources. + +The driver can also be installed as an external module using the usual +"make;make install" routine. This external module driver, obtainable +from the Comtrol website listed below, is useful for updating the driver +or installing it into kernels which do not have the driver configured +into them. Installations instructions for the external module +are in the included README and HW_INSTALL files. + +RocketPort ISA and RocketModem II PCI boards currently are only supported by +this driver in module form. + +The RocketPort ISA board requires I/O ports to be configured by the DIP +switches on the board. See the section "ISA Rocketport Boards" below for +information on how to set the DIP switches. + +You pass the I/O port to the driver using the following module parameters: + +board1 : I/O port for the first ISA board +board2 : I/O port for the second ISA board +board3 : I/O port for the third ISA board +board4 : I/O port for the fourth ISA board + +There is a set of utilities and scripts provided with the external driver +( downloadable from http://www.comtrol.com ) that ease the configuration and +setup of the ISA cards. + +The RocketModem II PCI boards require firmware to be loaded into the card +before it will function. The driver has only been tested as a module for this +board. + +=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- + +INSTALLATION PROCEDURES +----------------------- + +RocketPort/RocketModem PCI cards require no driver configuration, they are +automatically detected and configured. + +The RocketPort driver can be installed as a module (recommended) or built +into the kernel. This is selected, as for other drivers, through the `make config` +command from the root of the Linux source tree during the kernel build process. + +The RocketPort/RocketModem serial ports installed by this driver are assigned +device major number 46, and will be named /dev/ttyRx, where x is the port number +starting at zero (ex. /dev/ttyR0, /devttyR1, ...). If you have multiple cards +installed in the system, the mapping of port names to serial ports is displayed +in the system log at /var/log/messages. + +If installed as a module, the module must be loaded. This can be done +manually by entering "modprobe rocket". To have the module loaded automatically +upon system boot, edit the /etc/modprobe.conf file and add the line +"alias char-major-46 rocket". + +In order to use the ports, their device names (nodes) must be created with mknod. +This is only required once, the system will retain the names once created. To +create the RocketPort/RocketModem device names, use the command +"mknod /dev/ttyRx c 46 x" where x is the port number starting at zero. For example: + +>mknod /dev/ttyR0 c 46 0 +>mknod /dev/ttyR1 c 46 1 +>mknod /dev/ttyR2 c 46 2 + +The Linux script MAKEDEV will create the first 16 ttyRx device names (nodes) +for you: + +>/dev/MAKEDEV ttyR + +=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- + +ISA Rocketport Boards +--------------------- + +You must assign and configure the I/O addresses used by the ISA Rocketport +card before installing and using it. This is done by setting a set of DIP +switches on the Rocketport board. + + +SETTING THE I/O ADDRESS +----------------------- + +Before installing RocketPort(R) or RocketPort RA boards, you must find +a range of I/O addresses for it to use. The first RocketPort card +requires a 68-byte contiguous block of I/O addresses, starting at one +of the following: 0x100h, 0x140h, 0x180h, 0x200h, 0x240h, 0x280h, +0x300h, 0x340h, 0x380h. This I/O address must be reflected in the DIP +switches of *all* of the Rocketport cards. + +The second, third, and fourth RocketPort cards require a 64-byte +contiguous block of I/O addresses, starting at one of the following +I/O addresses: 0x100h, 0x140h, 0x180h, 0x1C0h, 0x200h, 0x240h, 0x280h, +0x2C0h, 0x300h, 0x340h, 0x380h, 0x3C0h. The I/O address used by the +second, third, and fourth Rocketport cards (if present) are set via +software control. The DIP switch settings for the I/O address must be +set to the value of the first Rocketport cards. + +In order to distinguish each of the card from the others, each card +must have a unique board ID set on the dip switches. The first +Rocketport board must be set with the DIP switches corresponding to +the first board, the second board must be set with the DIP switches +corresponding to the second board, etc. IMPORTANT: The board ID is +the only place where the DIP switch settings should differ between the +various Rocketport boards in a system. + +The I/O address range used by any of the RocketPort cards must not +conflict with any other cards in the system, including other +RocketPort cards. Below, you will find a list of commonly used I/O +address ranges which may be in use by other devices in your system. +On a Linux system, "cat /proc/ioports" will also be helpful in +identifying what I/O addresses are being used by devices on your +system. + +Remember, the FIRST RocketPort uses 68 I/O addresses. So, if you set it +for 0x100, it will occupy 0x100 to 0x143. This would mean that you +CAN NOT set the second, third or fourth board for address 0x140 since +the first 4 bytes of that range are used by the first board. You would +need to set the second, third, or fourth board to one of the next available +blocks such as 0x180. + +=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- + +RocketPort and RocketPort RA SW1 Settings: + + +-------------------------------+ + | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | + +-------+-------+---------------+ + | Unused| Card | I/O Port Block| + +-------------------------------+ + +DIP Switches DIP Switches +7 8 6 5 +=================== =================== +On On UNUSED, MUST BE ON. On On First Card <==== Default + On Off Second Card + Off On Third Card + Off Off Fourth Card + +DIP Switches I/O Address Range +4 3 2 1 Used by the First Card +===================================== +On Off On Off 100-143 +On Off Off On 140-183 +On Off Off Off 180-1C3 <==== Default +Off On On Off 200-243 +Off On Off On 240-283 +Off On Off Off 280-2C3 +Off Off On Off 300-343 +Off Off Off On 340-383 +Off Off Off Off 380-3C3 + +=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- + +REPORTING BUGS +-------------- + +For technical support, please provide the following +information: Driver version, kernel release, distribution of +kernel, and type of board you are using. Error messages and log +printouts port configuration details are especially helpful. + +USA + Phone: (612) 494-4100 + FAX: (612) 494-4199 + email: support@comtrol.com + +Comtrol Europe + Phone: +44 (0) 1 869 323-220 + FAX: +44 (0) 1 869 323-211 + email: support@comtrol.co.uk + +Web: http://www.comtrol.com +FTP: ftp.comtrol.com + +=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- + + diff --git a/Documentation/serial/specialix.txt b/Documentation/serial/specialix.txt new file mode 100644 index 00000000000..6eb6f3a3331 --- /dev/null +++ b/Documentation/serial/specialix.txt @@ -0,0 +1,383 @@ + + specialix.txt -- specialix IO8+ multiport serial driver readme. + + + + Copyright (C) 1997 Roger Wolff (R.E.Wolff@BitWizard.nl) + + Specialix pays for the development and support of this driver. + Please DO contact io8-linux@specialix.co.uk if you require + support. + + This driver was developed in the BitWizard linux device + driver service. If you require a linux device driver for your + product, please contact devices@BitWizard.nl for a quote. + + This code is firmly based on the riscom/8 serial driver, + written by Dmitry Gorodchanin. The specialix IO8+ card + programming information was obtained from the CL-CD1865 Data + Book, and Specialix document number 6200059: IO8+ Hardware + Functional Specification, augmented by document number 6200088: + Merak Hardware Functional Specification. (IO8+/PCI is also + called Merak) + + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR + PURPOSE. See the GNU General Public License for more details. + + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, + USA. + + +Intro +===== + + +This file contains some random information, that I like to have online +instead of in a manual that can get lost. Ever misplace your Linux +kernel sources? And the manual of one of the boards in your computer? + + +Addresses and interrupts +======================== + +Address dip switch settings: +The dip switch sets bits 2-9 of the IO address. + + 9 8 7 6 5 4 3 2 + +-----------------+ + 0 | X X X X X X X | + | | = IoBase = 0x100 + 1 | X | + +-----------------+ ------ RS232 connectors ----> + + | | | + edge connector + | | | + V V V + +Base address 0x100 caused a conflict in one of my computers once. I +haven't the foggiest why. My Specialix card is now at 0x180. My +other computer runs just fine with the Specialix card at 0x100.... +The card occupies 4 addresses, but actually only two are really used. + +The PCI version doesn't have any dip switches. The BIOS assigns +an IO address. + +The driver now still autoprobes at 0x100, 0x180, 0x250 and 0x260. If +that causes trouble for you, please report that. I'll remove +autoprobing then. + +The driver will tell the card what IRQ to use, so you don't have to +change any jumpers to change the IRQ. Just use a command line +argument (irq=xx) to the insmod program to set the interrupt. + +The BIOS assigns the IRQ on the PCI version. You have no say in what +IRQ to use in that case. + +If your specialix cards are not at the default locations, you can use +the kernel command line argument "specialix=io0,irq0,io1,irq1...". +Here "io0" is the io address for the first card, and "irq0" is the +irq line that the first card should use. And so on. + +Examples. + +You use the driver as a module and have three cards at 0x100, 0x250 +and 0x180. And some way or another you want them detected in that +order. Moreover irq 12 is taken (e.g. by your PS/2 mouse). + + insmod specialix.o iobase=0x100,0x250,0x180 irq=9,11,15 + +The same three cards, but now in the kernel would require you to +add + + specialix=0x100,9,0x250,11,0x180,15 + +to the command line. This would become + + append="specialix=0x100,9,0x250,11,0x180,15" + +in your /etc/lilo.conf file if you use lilo. + +The Specialix driver is slightly odd: It allows you to have the second +or third card detected without having a first card. This has +advantages and disadvantages. A slot that isn't filled by an ISA card, +might be filled if a PCI card is detected. Thus if you have an ISA +card at 0x250 and a PCI card, you would get: + +sx0: specialix IO8+ Board at 0x100 not found. +sx1: specialix IO8+ Board at 0x180 not found. +sx2: specialix IO8+ board detected at 0x250, IRQ 12, CD1865 Rev. B. +sx3: specialix IO8+ Board at 0x260 not found. +sx0: specialix IO8+ board detected at 0xd800, IRQ 9, CD1865 Rev. B. + +This would happen if you don't give any probe hints to the driver. +If you would specify: + + specialix=0x250,11 + +you'd get the following messages: + +sx0: specialix IO8+ board detected at 0x250, IRQ 11, CD1865 Rev. B. +sx1: specialix IO8+ board detected at 0xd800, IRQ 9, CD1865 Rev. B. + +ISA probing is aborted after the IO address you gave is exhausted, and +the PCI card is now detected as the second card. The ISA card is now +also forced to IRQ11.... + + +Baud rates +========== + +The rev 1.2 and below boards use a CL-CD1864. These chips can only +do 64kbit. The rev 1.3 and newer boards use a CL-CD1865. These chips +are officially capable of 115k2. + +The Specialix card uses a 25MHz crystal (in times two mode, which in +fact is a divided by two mode). This is not enough to reach the rated +115k2 on all ports at the same time. With this clock rate you can only +do 37% of this rate. This means that at 115k2 on all ports you are +going to lose characters (The chip cannot handle that many incoming +bits at this clock rate.) (Yes, you read that correctly: there is a +limit to the number of -=bits=- per second that the chip can handle.) + +If you near the "limit" you will first start to see a graceful +degradation in that the chip cannot keep the transmitter busy at all +times. However with a central clock this slow, you can also get it to +miss incoming characters. The driver will print a warning message when +you are outside the official specs. The messages usually show up in +the file /var/log/messages . + +The specialix card cannot reliably do 115k2. If you use it, you have +to do "extensive testing" (*) to verify if it actually works. + +When "mgetty" communicates with my modem at 115k2 it reports: +got: +++[0d]ATQ0V1H0[0d][0d][8a]O[cb][0d][8a] + ^^^^ ^^^^ ^^^^ + +The three characters that have the "^^^" under them have suffered a +bit error in the highest bit. In conclusion: I've tested it, and found +that it simply DOESN'T work for me. I also suspect that this is also +caused by the baud rate being just a little bit out of tune. + +I upgraded the crystal to 66Mhz on one of my Specialix cards. Works +great! Contact me for details. (Voids warranty, requires a steady hand +and more such restrictions....) + + +(*) Cirrus logic CD1864 databook, page 40. + + +Cables for the Specialix IO8+ +============================= + +The pinout of the connectors on the IO8+ is: + + pin short direction long name + name + Pin 1 DCD input Data Carrier Detect + Pin 2 RXD input Receive + Pin 3 DTR/RTS output Data Terminal Ready/Ready To Send + Pin 4 GND - Ground + Pin 5 TXD output Transmit + Pin 6 CTS input Clear To Send + + + -- 6 5 4 3 2 1 -- + | | + | | + | | + | | + +----- -----+ + |__________| + clip + + Front view of an RJ12 connector. Cable moves "into" the paper. + (the plug is ready to plug into your mouth this way...) + + + NULL cable. I don't know who is going to use these except for + testing purposes, but I tested the cards with this cable. (It + took quite a while to figure out, so I'm not going to delete + it. So there! :-) + + + This end goes This end needs + straight into the some twists in + RJ12 plug. the wiring. + IO8+ RJ12 IO8+ RJ12 + 1 DCD white - + - - 1 DCD + 2 RXD black 5 TXD + 3 DTR/RTS red 6 CTS + 4 GND green 4 GND + 5 TXD yellow 2 RXD + 6 CTS blue 3 DTR/RTS + + + Same NULL cable, but now sorted on the second column. + + 1 DCD white - + - - 1 DCD + 5 TXD yellow 2 RXD + 6 CTS blue 3 DTR/RTS + 4 GND green 4 GND + 2 RXD black 5 TXD + 3 DTR/RTS red 6 CTS + + + + This is a modem cable usable for hardware handshaking: + RJ12 DB25 DB9 + 1 DCD white 8 DCD 1 DCD + 2 RXD black 3 RXD 2 RXD + 3 DTR/RTS red 4 RTS 7 RTS + 4 GND green 7 GND 5 GND + 5 TXD yellow 2 TXD 3 TXD + 6 CTS blue 5 CTS 8 CTS + +---- 6 DSR 6 DSR + +---- 20 DTR 4 DTR + + This is a modem cable usable for software handshaking: + It allows you to reset the modem using the DTR ioctls. + I (REW) have never tested this, "but xxxxxxxxxxxxx + says that it works." If you test this, please + tell me and I'll fill in your name on the xxx's. + + RJ12 DB25 DB9 + 1 DCD white 8 DCD 1 DCD + 2 RXD black 3 RXD 2 RXD + 3 DTR/RTS red 20 DTR 4 DTR + 4 GND green 7 GND 5 GND + 5 TXD yellow 2 TXD 3 TXD + 6 CTS blue 5 CTS 8 CTS + +---- 6 DSR 6 DSR + +---- 4 RTS 7 RTS + + I bought a 6 wire flat cable. It was colored as indicated. + Check that yours is the same before you trust me on this. + + +Hardware handshaking issues. +============================ + +The driver can be told to operate in two different ways. The default +behaviour is specialix.sx_rtscts = 0 where the pin behaves as DTR when +hardware handshaking is off. It behaves as the RTS hardware +handshaking signal when hardware handshaking is selected. + +When you use this, you have to use the appropriate cable. The +cable will either be compatible with hardware handshaking or with +software handshaking. So switching on the fly is not really an +option. + +I actually prefer to use the "specialix.sx_rtscts=1" option. +This makes the DTR/RTS pin always an RTS pin, and ioctls to +change DTR are always ignored. I have a cable that is configured +for this. + + +Ports and devices +================= + +Port 0 is the one furthest from the card-edge connector. + +Devices: + +You should make the devices as follows: + +bash +cd /dev +for i in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 \ + 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 +do + echo -n "$i " + mknod /dev/ttyW$i c 75 $i + mknod /dev/cuw$i c 76 $i +done +echo "" + +If your system doesn't come with these devices preinstalled, bug your +linux-vendor about this. They have had ample time to get this +implemented by now. + +You cannot have more than 4 boards in one computer. The card only +supports 4 different interrupts. If you really want this, contact me +about this and I'll give you a few tips (requires soldering iron).... + +If you have enough PCI slots, you can probably use more than 4 PCI +versions of the card though.... + +The PCI version of the card cannot adhere to the mechanical part of +the PCI spec because the 8 serial connectors are simply too large. If +it doesn't fit in your computer, bring back the card. + + +------------------------------------------------------------------------ + + + Fixed bugs and restrictions: + - During initialization, interrupts are blindly turned on. + Having a shadow variable would cause an extra memory + access on every IO instruction. + - The interrupt (on the card) should be disabled when we + don't allocate the Linux end of the interrupt. This allows + a different driver/card to use it while all ports are not in + use..... (a la standard serial port) + == An extra _off variant of the sx_in and sx_out macros are + now available. They don't set the interrupt enable bit. + These are used during initialization. Normal operation uses + the old variant which enables the interrupt line. + - RTS/DTR issue needs to be implemented according to + specialix' spec. + I kind of like the "determinism" of the current + implementation. Compile time flag? + == Ok. Compile time flag! Default is how Specialix likes it. + == Now a config time flag! Gets saved in your config file. Neat! + - Can you set the IO address from the lilo command line? + If you need this, bug me about it, I'll make it. + == Hah! No bugging needed. Fixed! :-) + - Cirrus logic hasn't gotten back to me yet why the CD1865 can + and the CD1864 can't do 115k2. I suspect that this is + because the CD1864 is not rated for 33MHz operation. + Therefore the CD1864 versions of the card can't do 115k2 on + all ports just like the CD1865 versions. The driver does + not block 115k2 on CD1864 cards. + == I called the Cirrus Logic representative here in Holland. + The CD1864 databook is identical to the CD1865 databook, + except for an extra warning at the end. Similar Bit errors + have been observed in testing at 115k2 on both an 1865 and + a 1864 chip. I see no reason why I would prohibit 115k2 on + 1864 chips and not do it on 1865 chips. Actually there is + reason to prohibit it on BOTH chips. I print a warning. + If you use 115k2, you're on your own. + - A spiky CD may send spurious HUPs. Also in CLOCAL??? + -- A fix for this turned out to be counter productive. + Different fix? Current behaviour is acceptable? + -- Maybe the current implementation is correct. If anybody + gets bitten by this, please report, and it will get fixed. + + -- Testing revealed that when in CLOCAL, the problem doesn't + occur. As warned for in the CD1865 manual, the chip may + send modem intr's on a spike. We could filter those out, + but that would be a cludge anyway (You'd still risk getting + a spurious HUP when two spikes occur.)..... + + + + Bugs & restrictions: + - This is a difficult card to autoprobe. + You have to WRITE to the address register to even + read-probe a CD186x register. Disable autodetection? + -- Specialix: any suggestions? + + diff --git a/Documentation/serial/stallion.txt b/Documentation/serial/stallion.txt new file mode 100644 index 00000000000..5c4902d9a5b --- /dev/null +++ b/Documentation/serial/stallion.txt @@ -0,0 +1,392 @@ +* NOTE - This is an unmaintained driver. Lantronix, which bought Stallion +technologies, is not active in driver maintenance, and they have no information +on when or if they will have a 2.6 driver. + +James Nelson - 12-12-2004 + +Stallion Multiport Serial Driver Readme +--------------------------------------- + +Copyright (C) 1994-1999, Stallion Technologies. + +Version: 5.5.1 +Date: 28MAR99 + + + +1. INTRODUCTION + +There are two drivers that work with the different families of Stallion +multiport serial boards. One is for the Stallion smart boards - that is +EasyIO, EasyConnection 8/32 and EasyConnection 8/64-PCI, the other for +the true Stallion intelligent multiport boards - EasyConnection 8/64 +(ISA, EISA, MCA), EasyConnection/RA-PCI, ONboard and Brumby. + +If you are using any of the Stallion intelligent multiport boards (Brumby, +ONboard, EasyConnection 8/64 (ISA, EISA, MCA), EasyConnection/RA-PCI) with +Linux you will need to get the driver utility package. This contains a +firmware loader and the firmware images necessary to make the devices operate. + +The Stallion Technologies ftp site, ftp.stallion.com, will always have +the latest version of the driver utility package. + +ftp://ftp.stallion.com/drivers/ata5/Linux/ata-linux-550.tar.gz + +As of the printing of this document the latest version of the driver +utility package is 5.5.0. If a later version is now available then you +should use the latest version. + +If you are using the EasyIO, EasyConnection 8/32 or EasyConnection 8/64-PCI +boards then you don't need this package, although it does have a serial stats +display program. + +If you require DIP switch settings, EISA or MCA configuration files, or any +other information related to Stallion boards then have a look at Stallion's +web pages at http://www.stallion.com. + + + +2. INSTALLATION + +The drivers can be used as loadable modules or compiled into the kernel. +You can choose which when doing a "config" on the kernel. + +All ISA, EISA and MCA boards that you want to use need to be configured into +the driver(s). All PCI boards will be automatically detected when you load +the driver - so they do not need to be entered into the driver(s) +configuration structure. Note that kernel PCI support is required to use PCI +boards. + +There are two methods of configuring ISA, EISA and MCA boards into the drivers. +If using the driver as a loadable module then the simplest method is to pass +the driver configuration as module arguments. The other method is to modify +the driver source to add configuration lines for each board in use. + +If you have pre-built Stallion driver modules then the module argument +configuration method should be used. A lot of Linux distributions come with +pre-built driver modules in /lib/modules/X.Y.Z/misc for the kernel in use. +That makes things pretty simple to get going. + + +2.1 MODULE DRIVER CONFIGURATION: + +The simplest configuration for modules is to use the module load arguments +to configure any ISA, EISA or MCA boards. PCI boards are automatically +detected, so do not need any additional configuration at all. + +If using EasyIO, EasyConnection 8/32 ISA or MCA, or EasyConnection 8/63-PCI +boards then use the "stallion" driver module, Otherwise if you are using +an EasyConnection 8/64 ISA, EISA or MCA, EasyConnection/RA-PCI, ONboard, +Brumby or original Stallion board then use the "istallion" driver module. + +Typically to load up the smart board driver use: + + modprobe stallion + +This will load the EasyIO and EasyConnection 8/32 driver. It will output a +message to say that it loaded and print the driver version number. It will +also print out whether it found the configured boards or not. These messages +may not appear on the console, but typically are always logged to +/var/adm/messages or /var/log/syslog files - depending on how the klogd and +syslogd daemons are setup on your system. + +To load the intelligent board driver use: + + modprobe istallion + +It will output similar messages to the smart board driver. + +If not using an auto-detectable board type (that is a PCI board) then you +will also need to supply command line arguments to the modprobe command +when loading the driver. The general form of the configuration argument is + + board?=[,[,][,]] + +where: + + board? -- specifies the arbitrary board number of this board, + can be in the range 0 to 3. + + name -- textual name of this board. The board name is the common + board name, or any "shortened" version of that. The board + type number may also be used here. + + ioaddr -- specifies the I/O address of this board. This argument is + optional, but should generally be specified. + + addr -- optional second address argument. Some board types require + a second I/O address, some require a memory address. The + exact meaning of this argument depends on the board type. + + irq -- optional IRQ line used by this board. + +Up to 4 board configuration arguments can be specified on the load line. +Here is some examples: + + modprobe stallion board0=easyio,0x2a0,5 + +This configures an EasyIO board as board 0 at I/O address 0x2a0 and IRQ 5. + + modprobe istallion board3=ec8/64,0x2c0,0xcc000 + +This configures an EasyConnection 8/64 ISA as board 3 at I/O address 0x2c0 at +memory address 0xcc000. + + modprobe stallion board1=ec8/32-at,0x2a0,0x280,10 + +This configures an EasyConnection 8/32 ISA board at primary I/O address 0x2a0, +secondary address 0x280 and IRQ 10. + +You will probably want to enter this module load and configuration information +into your system startup scripts so that the drivers are loaded and configured +on each system boot. Typically the start up script would be something like +/etc/modprobe.conf. + + +2.2 STATIC DRIVER CONFIGURATION: + +For static driver configuration you need to modify the driver source code. +Entering ISA, EISA and MCA boards into the driver(s) configuration structure +involves editing the driver(s) source file. It's pretty easy if you follow +the instructions below. Both drivers can support up to 4 boards. The smart +card driver (the stallion.c driver) supports any combination of EasyIO and +EasyConnection 8/32 boards (up to a total of 4). The intelligent driver +supports any combination of ONboards, Brumbys, Stallions and EasyConnection +8/64 (ISA and EISA) boards (up to a total of 4). + +To set up the driver(s) for the boards that you want to use you need to +edit the appropriate driver file and add configuration entries. + +If using EasyIO or EasyConnection 8/32 ISA or MCA boards, + In drivers/char/stallion.c: + - find the definition of the stl_brdconf array (of structures) + near the top of the file + - modify this to match the boards you are going to install + (the comments before this structure should help) + - save and exit + +If using ONboard, Brumby, Stallion or EasyConnection 8/64 (ISA or EISA) +boards, + In drivers/char/istallion.c: + - find the definition of the stli_brdconf array (of structures) + near the top of the file + - modify this to match the boards you are going to install + (the comments before this structure should help) + - save and exit + +Once you have set up the board configurations then you are ready to build +the kernel or modules. + +When the new kernel is booted, or the loadable module loaded then the +driver will emit some kernel trace messages about whether the configured +boards were detected or not. Depending on how your system logger is set +up these may come out on the console, or just be logged to +/var/adm/messages or /var/log/syslog. You should check the messages to +confirm that all is well. + + +2.3 SHARING INTERRUPTS + +It is possible to share interrupts between multiple EasyIO and +EasyConnection 8/32 boards in an EISA system. To do this you must be using +static driver configuration, modifying the driver source code to add driver +configuration. Then a couple of extra things are required: + +1. When entering the board resources into the stallion.c file you need to + mark the boards as using level triggered interrupts. Do this by replacing + the "0" entry at field position 6 (the last field) in the board + configuration structure with a "1". (This is the structure that defines + the board type, I/O locations, etc. for each board). All boards that are + sharing an interrupt must be set this way, and each board should have the + same interrupt number specified here as well. Now build the module or + kernel as you would normally. + +2. When physically installing the boards into the system you must enter + the system EISA configuration utility. You will need to install the EISA + configuration files for *all* the EasyIO and EasyConnection 8/32 boards + that are sharing interrupts. The Stallion EasyIO and EasyConnection 8/32 + EISA configuration files required are supplied by Stallion Technologies + on the EASY Utilities floppy diskette (usually supplied in the box with + the board when purchased. If not, you can pick it up from Stallion's FTP + site, ftp.stallion.com). You will need to edit the board resources to + choose level triggered interrupts, and make sure to set each board's + interrupt to the same IRQ number. + +You must complete both the above steps for this to work. When you reboot +or load the driver your EasyIO and EasyConnection 8/32 boards will be +sharing interrupts. + + +2.4 USING HIGH SHARED MEMORY + +The EasyConnection 8/64-EI, ONboard and Stallion boards are capable of +using shared memory addresses above the usual 640K - 1Mb range. The ONboard +ISA and the Stallion boards can be programmed to use memory addresses up to +16Mb (the ISA bus addressing limit), and the EasyConnection 8/64-EI and +ONboard/E can be programmed for memory addresses up to 4Gb (the EISA bus +addressing limit). + +The higher than 1Mb memory addresses are fully supported by this driver. +Just enter the address as you normally would for a lower than 1Mb address +(in the driver's board configuration structure). + + + +2.5 TROUBLE SHOOTING + +If a board is not found by the driver but is actually in the system then the +most likely problem is that the I/O address is wrong. Change the module load +argument for the loadable module form. Or change it in the driver stallion.c +or istallion.c configuration structure and rebuild the kernel or modules, or +change it on the board. + +On EasyIO and EasyConnection 8/32 boards the IRQ is software programmable, so +if there is a conflict you may need to change the IRQ used for a board. There +are no interrupts to worry about for ONboard, Brumby or EasyConnection 8/64 +(ISA, EISA and MCA) boards. The memory region on EasyConnection 8/64 and +ONboard boards is software programmable, but not on the Brumby boards. + + + +3. USING THE DRIVERS + +3.1 INTELLIGENT DRIVER OPERATION + +The intelligent boards also need to have their "firmware" code downloaded +to them. This is done via a user level application supplied in the driver +utility package called "stlload". Compile this program wherever you dropped +the package files, by typing "make". In its simplest form you can then type + + ./stlload -i cdk.sys + +in this directory and that will download board 0 (assuming board 0 is an +EasyConnection 8/64 or EasyConnection/RA board). To download to an +ONboard, Brumby or Stallion do: + + ./stlload -i 2681.sys + +Normally you would want all boards to be downloaded as part of the standard +system startup. To achieve this, add one of the lines above into the +/etc/rc.d/rc.S or /etc/rc.d/rc.serial file. To download each board just add +the "-b " option to the line. You will need to download code for +every board. You should probably move the stlload program into a system +directory, such as /usr/sbin. Also, the default location of the cdk.sys image +file in the stlload down-loader is /usr/lib/stallion. Create that directory +and put the cdk.sys and 2681.sys files in it. (It's a convenient place to put +them anyway). As an example your /etc/rc.d/rc.S file might have the +following lines added to it (if you had 3 boards): + + /usr/sbin/stlload -b 0 -i /usr/lib/stallion/cdk.sys + /usr/sbin/stlload -b 1 -i /usr/lib/stallion/2681.sys + /usr/sbin/stlload -b 2 -i /usr/lib/stallion/2681.sys + +The image files cdk.sys and 2681.sys are specific to the board types. The +cdk.sys will only function correctly on an EasyConnection 8/64 board. Similarly +the 2681.sys image fill only operate on ONboard, Brumby and Stallion boards. +If you load the wrong image file into a board it will fail to start up, and +of course the ports will not be operational! + +If you are using the modularized version of the driver you might want to put +the modprobe calls in the startup script as well (before the download lines +obviously). + + +3.2 USING THE SERIAL PORTS + +Once the driver is installed you will need to setup some device nodes to +access the serial ports. The simplest method is to use the /dev/MAKEDEV program. +It will automatically create device entries for Stallion boards. This will +create the normal serial port devices as /dev/ttyE# where# is the port number +starting from 0. A bank of 64 minor device numbers is allocated to each board, +so the first port on the second board is port 64,etc. A set of callout type +devices may also be created. They are created as the devices /dev/cue# where # +is the same as for the ttyE devices. + +For the most part the Stallion driver tries to emulate the standard PC system +COM ports and the standard Linux serial driver. The idea is that you should +be able to use Stallion board ports and COM ports interchangeably without +modifying anything but the device name. Anything that doesn't work like that +should be considered a bug in this driver! + +If you look at the driver code you will notice that it is fairly closely +based on the Linux serial driver (linux/drivers/char/serial.c). This is +intentional, obviously this is the easiest way to emulate its behavior! + +Since this driver tries to emulate the standard serial ports as much as +possible, most system utilities should work as they do for the standard +COM ports. Most importantly "stty" works as expected and "setserial" can +also be used (excepting the ability to auto-configure the I/O and IRQ +addresses of boards). Higher baud rates are supported in the usual fashion +through setserial or using the CBAUDEX extensions. Note that the EasyIO and +EasyConnection (all types) support at least 57600 and 115200 baud. The newer +EasyConnection XP modules and new EasyIO boards support 230400 and 460800 +baud as well. The older boards including ONboard and Brumby support a +maximum baud rate of 38400. + +If you are unfamiliar with how to use serial ports, then get the Serial-HOWTO +by Greg Hankins. It will explain everything you need to know! + + + +4. NOTES + +You can use both drivers at once if you have a mix of board types installed +in a system. However to do this you will need to change the major numbers +used by one of the drivers. Currently both drivers use major numbers 24, 25 +and 28 for their devices. Change one driver to use some other major numbers, +and then modify the mkdevnods script to make device nodes based on those new +major numbers. For example, you could change the istallion.c driver to use +major numbers 60, 61 and 62. You will also need to create device nodes with +different names for the ports, for example ttyF# and cuf#. + +The original Stallion board is no longer supported by Stallion Technologies. +Although it is known to work with the istallion driver. + +Finding a free physical memory address range can be a problem. The older +boards like the Stallion and ONboard need large areas (64K or even 128K), so +they can be very difficult to get into a system. If you have 16 Mb of RAM +then you have no choice but to put them somewhere in the 640K -> 1Mb range. +ONboards require 64K, so typically 0xd0000 is good, or 0xe0000 on some +systems. If you have an original Stallion board, "V4.0" or Rev.O, then you +need a 64K memory address space, so again 0xd0000 and 0xe0000 are good. +Older Stallion boards are a much bigger problem. They need 128K of address +space and must be on a 128K boundary. If you don't have a VGA card then +0xc0000 might be usable - there is really no other place you can put them +below 1Mb. + +Both the ONboard and old Stallion boards can use higher memory addresses as +well, but you must have less than 16Mb of RAM to be able to use them. Usual +high memory addresses used include 0xec0000 and 0xf00000. + +The Brumby boards only require 16Kb of address space, so you can usually +squeeze them in somewhere. Common addresses are 0xc8000, 0xcc000, or in +the 0xd0000 range. EasyConnection 8/64 boards are even better, they only +require 4Kb of address space, again usually 0xc8000, 0xcc000 or 0xd0000 +are good. + +If you are using an EasyConnection 8/64-EI or ONboard/E then usually the +0xd0000 or 0xe0000 ranges are the best options below 1Mb. If neither of +them can be used then the high memory support to use the really high address +ranges is the best option. Typically the 2Gb range is convenient for them, +and gets them well out of the way. + +The ports of the EasyIO-8M board do not have DCD or DTR signals. So these +ports cannot be used as real modem devices. Generally, when using these +ports you should only use the cueX devices. + +The driver utility package contains a couple of very useful programs. One +is a serial port statistics collection and display program - very handy +for solving serial port problems. The other is an extended option setting +program that works with the intelligent boards. + + + +5. DISCLAIMER + +The information contained in this document is believed to be accurate and +reliable. However, no responsibility is assumed by Stallion Technologies +Pty. Ltd. for its use, nor any infringements of patents or other rights +of third parties resulting from its use. Stallion Technologies reserves +the right to modify the design of its products and will endeavour to change +the information in manuals and accompanying documentation accordingly. + diff --git a/Documentation/serial/sx.txt b/Documentation/serial/sx.txt new file mode 100644 index 00000000000..cb4efa0fb5c --- /dev/null +++ b/Documentation/serial/sx.txt @@ -0,0 +1,294 @@ + + sx.txt -- specialix SX/SI multiport serial driver readme. + + + + Copyright (C) 1997 Roger Wolff (R.E.Wolff@BitWizard.nl) + + Specialix pays for the development and support of this driver. + Please DO contact support@specialix.co.uk if you require + support. + + This driver was developed in the BitWizard linux device + driver service. If you require a linux device driver for your + product, please contact devices@BitWizard.nl for a quote. + + (History) + There used to be an SI driver by Simon Allan. This is a complete + rewrite from scratch. Just a few lines-of-code have been snatched. + + (Sources) + Specialix document number 6210028: SX Host Card and Download Code + Software Functional Specification. + + (Copying) + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR + PURPOSE. See the GNU General Public License for more details. + + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, + USA. + + (Addendum) + I'd appreciate it that if you have fixes, that you send them + to me first. + + +Introduction +============ + +This file contains some random information, that I like to have online +instead of in a manual that can get lost. Ever misplace your Linux +kernel sources? And the manual of one of the boards in your computer? + + +Theory of operation +=================== + +An important thing to know is that the driver itself doesn't have the +firmware for the card. This means that you need the separate package +"sx_firmware". For now you can get the source at + + ftp://ftp.bitwizard.nl/specialix/sx_firmware_.tgz + +The firmware load needs a "misc" device, so you'll need to enable the +"Support for user misc device modules" in your kernel configuration. +The misc device needs to be called "/dev/specialix_sxctl". It needs +misc major 10, and minor number 167 (assigned by HPA). The section +on creating device files below also creates this device. + +After loading the sx.o module into your kernel, the driver will report +the number of cards detected, but because it doesn't have any +firmware, it will not be able to determine the number of ports. Only +when you then run "sx_firmware" will the firmware be downloaded and +the rest of the driver initialized. At that time the sx_firmware +program will report the number of ports installed. + +In contrast with many other multi port serial cards, some of the data +structures are only allocated when the card knows the number of ports +that are connected. This means we won't waste memory for 120 port +descriptor structures when you only have 8 ports. If you experience +problems due to this, please report them: I haven't seen any. + + +Interrupts +========== + +A multi port serial card, would generate a horrendous amount of +interrupts if it would interrupt the CPU for every received +character. Even more than 10 years ago, the trick not to use +interrupts but to poll the serial cards was invented. + +The SX card allow us to do this two ways. First the card limits its +own interrupt rate to a rate that won't overwhelm the CPU. Secondly, +we could forget about the cards interrupt completely and use the +internal timer for this purpose. + +Polling the card can take up to a few percent of your CPU. Using the +interrupts would be better if you have most of the ports idle. Using +timer-based polling is better if your card almost always has work to +do. You save the separate interrupt in that case. + +In any case, it doesn't really matter all that much. + +The most common problem with interrupts is that for ISA cards in a PCI +system the BIOS has to be told to configure that interrupt as "legacy +ISA". Otherwise the card can pull on the interrupt line all it wants +but the CPU won't see this. + +If you can't get the interrupt to work, remember that polling mode is +more efficient (provided you actually use the card intensively). + + +Allowed Configurations +====================== + +Some configurations are disallowed. Even though at a glance they might +seem to work, they are known to lockup the bus between the host card +and the device concentrators. You should respect the drivers decision +not to support certain configurations. It's there for a reason. + +Warning: Seriously technical stuff ahead. Executive summary: Don't use +SX cards except configured at a 64k boundary. Skip the next paragraph. + +The SX cards can theoretically be placed at a 32k boundary. So for +instance you can put an SX card at 0xc8000-0xd7fff. This is not a +"recommended configuration". ISA cards have to tell the bus controller +how they like their timing. Due to timing issues they have to do this +based on which 64k window the address falls into. This means that the +32k window below and above the SX card have to use exactly the same +timing as the SX card. That reportedly works for other SX cards. But +you're still left with two useless 32k windows that should not be used +by anybody else. + + +Configuring the driver +====================== + +PCI cards are always detected. The driver auto-probes for ISA cards at +some sensible addresses. Please report if the auto-probe causes trouble +in your system, or when a card isn't detected. + +I'm afraid I haven't implemented "kernel command line parameters" yet. +This means that if the default doesn't work for you, you shouldn't use +the compiled-into-the-kernel version of the driver. Use a module +instead. If you convince me that you need this, I'll make it for +you. Deal? + +I'm afraid that the module parameters are a bit clumsy. If you have a +better idea, please tell me. + +You can specify several parameters: + + sx_poll: number of jiffies between timer-based polls. + + Set this to "0" to disable timer based polls. + Initialization of cards without a working interrupt + will fail. + + Set this to "1" if you want a polling driver. + (on Intel: 100 polls per second). If you don't use + fast baud rates, you might consider a value like "5". + (If you don't know how to do the math, use 1). + + sx_slowpoll: Number of jiffies between timer-based polls. + Set this to "100" to poll once a second. + This should get the card out of a stall if the driver + ever misses an interrupt. I've never seen this happen, + and if it does, that's a bug. Tell me. + + sx_maxints: Number of interrupts to request from the card. + The card normally limits interrupts to about 100 per + second to offload the host CPU. You can increase this + number to reduce latency on the card a little. + Note that if you give a very high number you can overload + your CPU as well as the CPU on the host card. This setting + is inaccurate and not recommended for SI cards (But it + works). + + sx_irqmask: The mask of allowable IRQs to use. I suggest you set + this to 0 (disable IRQs all together) and use polling if + the assignment of IRQs becomes problematic. This is defined + as the sum of (1 << irq) 's that you want to allow. So + sx_irqmask of 8 (1 << 3) specifies that only irq 3 may + be used by the SX driver. If you want to specify to the + driver: "Either irq 11 or 12 is ok for you to use", then + specify (1 << 11) | (1 << 12) = 0x1800 . + + sx_debug: You can enable different sorts of debug traces with this. + At "-1" all debugging traces are active. You'll get several + times more debugging output than you'll get characters + transmitted. + + +Baud rates +========== + +Theoretically new SXDCs should be capable of more than 460k +baud. However the line drivers usually give up before that. Also the +CPU on the card may not be able to handle 8 channels going at full +blast at that speed. Moreover, the buffers are not large enough to +allow operation with 100 interrupts per second. You'll have to realize +that the card has a 256 byte buffer, so you'll have to increase the +number of interrupts per second if you have more than 256*100 bytes +per second to transmit. If you do any performance testing in this +area, I'd be glad to hear from you... + +(Psst Linux users..... I think the Linux driver is more efficient than +the driver for other OSes. If you can and want to benchmark them +against each other, be my guest, and report your findings...... :-) + + +Ports and devices +================= + +Port 0 is the top connector on the module closest to the host +card. Oh, the ports on the SXDCs and TAs are labelled from 1 to 8 +instead of from 0 to 7, as they are numbered by linux. I'm stubborn in +this: I know for sure that I wouldn't be able to calculate which port +is which anymore if I would change that.... + + +Devices: + +You should make the device files as follows: + +#!/bin/sh +# (I recommend that you cut-and-paste this into a file and run that) +cd /dev +t=0 +mknod specialix_sxctl c 10 167 +while [ $t -lt 64 ] + do + echo -n "$t " + mknod ttyX$t c 32 $t + mknod cux$t c 33 $t + t=`expr $t + 1` +done +echo "" +rm /etc/psdevtab +ps > /dev/null + + +This creates 64 devices. If you have more, increase the constant on +the line with "while". The devices start at 0, as is customary on +Linux. Specialix seems to like starting the numbering at 1. + +If your system doesn't come with these devices pre-installed, bug your +linux-vendor about this. They should have these devices +"pre-installed" before the new millennium. The "ps" stuff at the end +is to "tell" ps that the new devices exist. + +Officially the maximum number of cards per computer is 4. This driver +however supports as many cards in one machine as you want. You'll run +out of interrupts after a few, but you can switch to polled operation +then. At about 256 ports (More than 8 cards), we run out of minor +device numbers. Sorry. I suggest you buy a second computer.... (Or +switch to RIO). + +------------------------------------------------------------------------ + + + Fixed bugs and restrictions: + - Hangup processing. + -- Done. + + - the write path in generic_serial (lockup / oops). + -- Done (Ugly: not the way I want it. Copied from serial.c). + + - write buffer isn't flushed at close. + -- Done. I still seem to lose a few chars at close. + Sorry. I think that this is a firmware issue. (-> Specialix) + + - drain hardware before changing termios + - Change debug on the fly. + - ISA free irq -1. (no firmware loaded). + - adding c8000 as a probe address. Added warning. + - Add a RAMtest for the RAM on the card.c + - Crash when opening a port "way" of the number of allowed ports. + (for example opening port 60 when there are only 24 ports attached) + - Sometimes the use-count strays a bit. After a few hours of + testing the use count is sometimes "3". If you are not like + me and can remember what you did to get it that way, I'd + appreciate an Email. Possibly fixed. Tell me if anyone still + sees this. + - TAs don't work right if you don't connect all the modem control + signals. SXDCs do. T225 firmware problem -> Specialix. + (Mostly fixed now, I think. Tell me if you encounter this!) + + Bugs & restrictions: + + - Arbitrary baud rates. Requires firmware update. (-> Specialix) + + - Low latency (mostly firmware, -> Specialix) + + + diff --git a/Documentation/serial/tty.txt b/Documentation/serial/tty.txt new file mode 100644 index 00000000000..8e65c4498c5 --- /dev/null +++ b/Documentation/serial/tty.txt @@ -0,0 +1,292 @@ + + The Lockronomicon + +Your guide to the ancient and twisted locking policies of the tty layer and +the warped logic behind them. Beware all ye who read on. + +FIXME: still need to work out the full set of BKL assumptions and document +them so they can eventually be killed off. + + +Line Discipline +--------------- + +Line disciplines are registered with tty_register_ldisc() passing the +discipline number and the ldisc structure. At the point of registration the +discipline must be ready to use and it is possible it will get used before +the call returns success. If the call returns an error then it won't get +called. Do not re-use ldisc numbers as they are part of the userspace ABI +and writing over an existing ldisc will cause demons to eat your computer. +After the return the ldisc data has been copied so you may free your own +copy of the structure. You must not re-register over the top of the line +discipline even with the same data or your computer again will be eaten by +demons. + +In order to remove a line discipline call tty_unregister_ldisc(). +In ancient times this always worked. In modern times the function will +return -EBUSY if the ldisc is currently in use. Since the ldisc referencing +code manages the module counts this should not usually be a concern. + +Heed this warning: the reference count field of the registered copies of the +tty_ldisc structure in the ldisc table counts the number of lines using this +discipline. The reference count of the tty_ldisc structure within a tty +counts the number of active users of the ldisc at this instant. In effect it +counts the number of threads of execution within an ldisc method (plus those +about to enter and exit although this detail matters not). + +Line Discipline Methods +----------------------- + +TTY side interfaces: + +open() - Called when the line discipline is attached to + the terminal. No other call into the line + discipline for this tty will occur until it + completes successfully. Can sleep. + +close() - This is called on a terminal when the line + discipline is being unplugged. At the point of + execution no further users will enter the + ldisc code for this tty. Can sleep. + +hangup() - Called when the tty line is hung up. + The line discipline should cease I/O to the tty. + No further calls into the ldisc code will occur. + Can sleep. + +write() - A process is writing data through the line + discipline. Multiple write calls are serialized + by the tty layer for the ldisc. May sleep. + +flush_buffer() - (optional) May be called at any point between + open and close, and instructs the line discipline + to empty its input buffer. + +chars_in_buffer() - (optional) Report the number of bytes in the input + buffer. + +set_termios() - (optional) Called on termios structure changes. + The caller passes the old termios data and the + current data is in the tty. Called under the + termios semaphore so allowed to sleep. Serialized + against itself only. + +read() - Move data from the line discipline to the user. + Multiple read calls may occur in parallel and the + ldisc must deal with serialization issues. May + sleep. + +poll() - Check the status for the poll/select calls. Multiple + poll calls may occur in parallel. May sleep. + +ioctl() - Called when an ioctl is handed to the tty layer + that might be for the ldisc. Multiple ioctl calls + may occur in parallel. May sleep. + +Driver Side Interfaces: + +receive_buf() - Hand buffers of bytes from the driver to the ldisc + for processing. Semantics currently rather + mysterious 8( + +write_wakeup() - May be called at any point between open and close. + The TTY_DO_WRITE_WAKEUP flag indicates if a call + is needed but always races versus calls. Thus the + ldisc must be careful about setting order and to + handle unexpected calls. Must not sleep. + + The driver is forbidden from calling this directly + from the ->write call from the ldisc as the ldisc + is permitted to call the driver write method from + this function. In such a situation defer it. + + +Driver Access + +Line discipline methods can call the following methods of the underlying +hardware driver through the function pointers within the tty->driver +structure: + +write() Write a block of characters to the tty device. + Returns the number of characters accepted. The + character buffer passed to this method is already + in kernel space. + +put_char() Queues a character for writing to the tty device. + If there is no room in the queue, the character is + ignored. + +flush_chars() (Optional) If defined, must be called after + queueing characters with put_char() in order to + start transmission. + +write_room() Returns the numbers of characters the tty driver + will accept for queueing to be written. + +ioctl() Invoke device specific ioctl. + Expects data pointers to refer to userspace. + Returns ENOIOCTLCMD for unrecognized ioctl numbers. + +set_termios() Notify the tty driver that the device's termios + settings have changed. New settings are in + tty->termios. Previous settings should be passed in + the "old" argument. + + The API is defined such that the driver should return + the actual modes selected. This means that the + driver function is responsible for modifying any + bits in the request it cannot fulfill to indicate + the actual modes being used. A device with no + hardware capability for change (eg a USB dongle or + virtual port) can provide NULL for this method. + +throttle() Notify the tty driver that input buffers for the + line discipline are close to full, and it should + somehow signal that no more characters should be + sent to the tty. + +unthrottle() Notify the tty driver that characters can now be + sent to the tty without fear of overrunning the + input buffers of the line disciplines. + +stop() Ask the tty driver to stop outputting characters + to the tty device. + +start() Ask the tty driver to resume sending characters + to the tty device. + +hangup() Ask the tty driver to hang up the tty device. + +break_ctl() (Optional) Ask the tty driver to turn on or off + BREAK status on the RS-232 port. If state is -1, + then the BREAK status should be turned on; if + state is 0, then BREAK should be turned off. + If this routine is not implemented, use ioctls + TIOCSBRK / TIOCCBRK instead. + +wait_until_sent() Waits until the device has written out all of the + characters in its transmitter FIFO. + +send_xchar() Send a high-priority XON/XOFF character to the device. + + +Flags + +Line discipline methods have access to tty->flags field containing the +following interesting flags: + +TTY_THROTTLED Driver input is throttled. The ldisc should call + tty->driver->unthrottle() in order to resume + reception when it is ready to process more data. + +TTY_DO_WRITE_WAKEUP If set, causes the driver to call the ldisc's + write_wakeup() method in order to resume + transmission when it can accept more data + to transmit. + +TTY_IO_ERROR If set, causes all subsequent userspace read/write + calls on the tty to fail, returning -EIO. + +TTY_OTHER_CLOSED Device is a pty and the other side has closed. + +TTY_NO_WRITE_SPLIT Prevent driver from splitting up writes into + smaller chunks. + + +Locking + +Callers to the line discipline functions from the tty layer are required to +take line discipline locks. The same is true of calls from the driver side +but not yet enforced. + +Three calls are now provided + + ldisc = tty_ldisc_ref(tty); + +takes a handle to the line discipline in the tty and returns it. If no ldisc +is currently attached or the ldisc is being closed and re-opened at this +point then NULL is returned. While this handle is held the ldisc will not +change or go away. + + tty_ldisc_deref(ldisc) + +Returns the ldisc reference and allows the ldisc to be closed. Returning the +reference takes away your right to call the ldisc functions until you take +a new reference. + + ldisc = tty_ldisc_ref_wait(tty); + +Performs the same function as tty_ldisc_ref except that it will wait for an +ldisc change to complete and then return a reference to the new ldisc. + +While these functions are slightly slower than the old code they should have +minimal impact as most receive logic uses the flip buffers and they only +need to take a reference when they push bits up through the driver. + +A caution: The ldisc->open(), ldisc->close() and driver->set_ldisc +functions are called with the ldisc unavailable. Thus tty_ldisc_ref will +fail in this situation if used within these functions. Ldisc and driver +code calling its own functions must be careful in this case. + + +Driver Interface +---------------- + +open() - Called when a device is opened. May sleep + +close() - Called when a device is closed. At the point of + return from this call the driver must make no + further ldisc calls of any kind. May sleep + +write() - Called to write bytes to the device. May not + sleep. May occur in parallel in special cases. + Because this includes panic paths drivers generally + shouldn't try and do clever locking here. + +put_char() - Stuff a single character onto the queue. The + driver is guaranteed following up calls to + flush_chars. + +flush_chars() - Ask the kernel to write put_char queue + +write_room() - Return the number of characters tht can be stuffed + into the port buffers without overflow (or less). + The ldisc is responsible for being intelligent + about multi-threading of write_room/write calls + +ioctl() - Called when an ioctl may be for the driver + +set_termios() - Called on termios change, serialized against + itself by a semaphore. May sleep. + +set_ldisc() - Notifier for discipline change. At the point this + is done the discipline is not yet usable. Can now + sleep (I think) + +throttle() - Called by the ldisc to ask the driver to do flow + control. Serialization including with unthrottle + is the job of the ldisc layer. + +unthrottle() - Called by the ldisc to ask the driver to stop flow + control. + +stop() - Ldisc notifier to the driver to stop output. As with + throttle the serializations with start() are down + to the ldisc layer. + +start() - Ldisc notifier to the driver to start output. + +hangup() - Ask the tty driver to cause a hangup initiated + from the host side. [Can sleep ??] + +break_ctl() - Send RS232 break. Can sleep. Can get called in + parallel, driver must serialize (for now), and + with write calls. + +wait_until_sent() - Wait for characters to exit the hardware queue + of the driver. Can sleep + +send_xchar() - Send XON/XOFF and if possible jump the queue with + it in order to get fast flow control responses. + Cannot sleep ?? + diff --git a/Documentation/sh/kgdb.txt b/Documentation/sh/kgdb.txt deleted file mode 100644 index 05b4ba89d28..00000000000 --- a/Documentation/sh/kgdb.txt +++ /dev/null @@ -1,179 +0,0 @@ - -This file describes the configuration and behavior of KGDB for the SH -kernel. Based on a description from Henry Bell , it -has been modified to account for quirks in the current implementation. - -Version -======= - -This version of KGDB was written for 2.4.xx kernels for the SH architecture. -Further documentation is available from the linux-sh project website. - - -Debugging Setup: Host -====================== - -The two machines will be connected together via a serial line - this -should be a null modem cable i.e. with a twist. - -On your DEVELOPMENT machine, go to your kernel source directory and -build the kernel, enabling KGDB support in the "kernel hacking" section. -This includes the KGDB code, and also makes the kernel be compiled with -the "-g" option set -- necessary for debugging. - -To install this new kernel, use the following installation procedure. - -Decide on which tty port you want the machines to communicate, then -cable them up back-to-back using the null modem. On the DEVELOPMENT -machine, you may wish to create an initialization file called .gdbinit -(in the kernel source directory or in your home directory) to execute -commonly-used commands at startup. - -A minimal .gdbinit might look like this: - - file vmlinux - set remotebaud 115200 - target remote /dev/ttyS0 - -Change the "target" definition so that it specifies the tty port that -you intend to use. Change the "remotebaud" definition to match the -data rate that you are going to use for the com line (115200 is the -default). - -Debugging Setup: Target -======================== - -By default, the KGDB stub will communicate with the host GDB using -ttySC1 at 115200 baud, 8 databits, no parity; these defaults can be -changed in the kernel configuration. As the kernel starts up, KGDB will -initialize so that breakpoints, kernel segfaults, and so forth will -generally enter the debugger. - -This behavior can be modified by including the "kgdb" option in the -kernel command line; this option has the general form: - - kgdb=, - -The indicates the port to use, and can optionally specify -baud, parity and databits -- e.g. "ttySC0,9600N8" or "ttySC1,19200". - -The can be "halt" or "disabled". The "halt" action enters the -debugger via a breakpoint as soon as kgdb is initialized; the "disabled" -action causes kgdb to ignore kernel segfaults and such until explicitly -entered by a breakpoint in the code or by external action (sysrq or NMI). - -(Both and can appear alone, w/o the separating comma.) - -For example, if you wish to debug early in kernel startup code, you -might specify the halt option: - - kgdb=halt - -Boot the TARGET machine, which will appear to hang. - -On your DEVELOPMENT machine, cd to the source directory and run the gdb -program. (This is likely to be a cross GDB which runs on your host but -is built for an SH target.) If everything is working correctly you -should see gdb print out a few lines indicating that a breakpoint has -been taken. It will actually show a line of code in the target kernel -inside the gdbstub activation code. - -NOTE: BE SURE TO TERMINATE OR SUSPEND any other host application which -may be using the same serial port (for example, a terminal emulator you -have been using to connect to the target boot code.) Otherwise, data -from the target may not all get to GDB! - -You can now use whatever gdb commands you like to set breakpoints. -Enter "continue" to start your target machine executing again. At this -point the target system will run at full speed until it encounters -your breakpoint or gets a segment violation in the kernel, or whatever. - -Serial Ports: KGDB, Console -============================ - -This version of KGDB may not gracefully handle conflict with other -drivers in the kernel using the same port. If KGDB is configured on the -same port (and with the same parameters) as the kernel console, or if -CONFIG_SH_KGDB_CONSOLE is configured, things should be fine (though in -some cases console messages may appear twice through GDB). But if the -KGDB port is not the kernel console and used by another serial driver -which assumes different serial parameters (e.g. baud rate) KGDB may not -recover. - -Also, when KGDB is entered via sysrq-g (requires CONFIG_KGDB_SYSRQ) and -the kgdb port uses the same port as the console, detaching GDB will not -restore the console to working order without the port being re-opened. - -Another serious consequence of this is that GDB currently CANNOT break -into KGDB externally (e.g. via ^C or ); unless a breakpoint or -error is encountered, the only way to enter KGDB after the initial halt -(see above) is via NMI (CONFIG_KGDB_NMI) or sysrq-g (CONFIG_KGDB_SYSRQ). - -Code is included for the basic Hitachi Solution Engine boards to allow -the use of ttyS0 for KGDB if desired; this is less robust, but may be -useful in some cases. (This cannot be selected using the config file, -but only through the kernel command line, e.g. "kgdb=ttyS0", though the -configured defaults for baud rate etc. still apply if not overridden.) - -If gdbstub Does Not Work -======================== - -If it doesn't work, you will have to troubleshoot it. Do the easy -things first like double checking your cabling and data rates. You -might try some non-kernel based programs to see if the back-to-back -connection works properly. Just something simple like cat /etc/hosts -/dev/ttyS0 on one machine and cat /dev/ttyS0 on the other will tell you -if you can send data from one machine to the other. There is no point -in tearing out your hair in the kernel if the line doesn't work. - -If you need to debug the GDB/KGDB communication itself, the gdb commands -"set debug remote 1" and "set debug serial 1" may be useful, but be -warned: they produce a lot of output. - -Threads -======= - -Each process in a target machine is seen as a gdb thread. gdb thread related -commands (info threads, thread n) can be used. CONFIG_KGDB_THREAD must -be defined for this to work. - -In this version, kgdb reports PID_MAX (32768) as the process ID for the -idle process (pid 0), since GDB does not accept 0 as an ID. - -Detaching (exiting KGDB) -========================= - -There are two ways to resume full-speed target execution: "continue" and -"detach". With "continue", GDB inserts any specified breakpoints in the -target code and resumes execution; the target is still in "gdb mode". -If a breakpoint or other debug event (e.g. NMI) happens, the target -halts and communicates with GDB again, which is waiting for it. - -With "detach", GDB does *not* insert any breakpoints; target execution -is resumed and GDB stops communicating (does not wait for the target). -In this case, the target is no longer in "gdb mode" -- for example, -console messages no longer get sent separately to the KGDB port, or -encapsulated for GDB. If a debug event (e.g. NMI) occurs, the target -will re-enter "gdb mode" and will display this fact on the console; you -must give a new "target remote" command to gdb. - -NOTE: TO AVOID LOSSING CONSOLE MESSAGES IN CASE THE KERNEL CONSOLE AND -KGDB USING THE SAME PORT, THE TARGET WAITS FOR ANY INPUT CHARACTER ON -THE KGDB PORT AFTER A DETACH COMMAND. For example, after the detach you -could start a terminal emulator on the same host port and enter a ; -however, this program must then be terminated or suspended in order to -use GBD again if KGDB is re-entered. - - -Acknowledgements -================ - -This code was mostly generated by Henry Bell ; -largely from KGDB by Amit S. Kale - extracts from -code by Glenn Engel, Jim Kingdon, David Grothe , Tigran -Aivazian , William Gatliff , Ben -Lee, Steve Chamberlain and Benoit Miller are also -included. - -Jeremy Siegel - diff --git a/Documentation/sound/alsa/ALSA-Configuration.txt b/Documentation/sound/alsa/ALSA-Configuration.txt index e0e54a27fc1..841a9365d5f 100644 --- a/Documentation/sound/alsa/ALSA-Configuration.txt +++ b/Documentation/sound/alsa/ALSA-Configuration.txt @@ -757,6 +757,8 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed. model - force the model name position_fix - Fix DMA pointer (0 = auto, 1 = use LPIB, 2 = POSBUF) probe_mask - Bitmask to probe codecs (default = -1, meaning all slots) + probe_only - Only probing and no codec initialization (default=off); + Useful to check the initial codec status for debugging bdl_pos_adj - Specifies the DMA IRQ timing delay in samples. Passing -1 will make the driver to choose the appropriate value based on the controller chip. @@ -772,322 +774,23 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed. This module supports multiple cards and autoprobe. + See Documentation/sound/alsa/HD-Audio.txt for more details about + HD-audio driver. + Each codec may have a model table for different configurations. If your machine isn't listed there, the default (usually minimal) configuration is set up. You can pass "model=" option to specify a certain model in such a case. There are different - models depending on the codec chip. - - Model name Description - ---------- ----------- - ALC880 - 3stack 3-jack in back and a headphone out - 3stack-digout 3-jack in back, a HP out and a SPDIF out - 5stack 5-jack in back, 2-jack in front - 5stack-digout 5-jack in back, 2-jack in front, a SPDIF out - 6stack 6-jack in back, 2-jack in front - 6stack-digout 6-jack with a SPDIF out - w810 3-jack - z71v 3-jack (HP shared SPDIF) - asus 3-jack (ASUS Mobo) - asus-w1v ASUS W1V - asus-dig ASUS with SPDIF out - asus-dig2 ASUS with SPDIF out (using GPIO2) - uniwill 3-jack - fujitsu Fujitsu Laptops (Pi1536) - F1734 2-jack - lg LG laptop (m1 express dual) - lg-lw LG LW20/LW25 laptop - tcl TCL S700 - clevo Clevo laptops (m520G, m665n) - medion Medion Rim 2150 - test for testing/debugging purpose, almost all controls can be - adjusted. Appearing only when compiled with - $CONFIG_SND_DEBUG=y - auto auto-config reading BIOS (default) - - ALC260 - hp HP machines - hp-3013 HP machines (3013-variant) - hp-dc7600 HP DC7600 - fujitsu Fujitsu S7020 - acer Acer TravelMate - will Will laptops (PB V7900) - replacer Replacer 672V - basic fixed pin assignment (old default model) - test for testing/debugging purpose, almost all controls can - adjusted. Appearing only when compiled with - $CONFIG_SND_DEBUG=y - auto auto-config reading BIOS (default) - - ALC262 - fujitsu Fujitsu Laptop - hp-bpc HP xw4400/6400/8400/9400 laptops - hp-bpc-d7000 HP BPC D7000 - hp-tc-t5735 HP Thin Client T5735 - hp-rp5700 HP RP5700 - benq Benq ED8 - benq-t31 Benq T31 - hippo Hippo (ATI) with jack detection, Sony UX-90s - hippo_1 Hippo (Benq) with jack detection - sony-assamd Sony ASSAMD - toshiba-s06 Toshiba S06 - toshiba-rx1 Toshiba RX1 - ultra Samsung Q1 Ultra Vista model - lenovo-3000 Lenovo 3000 y410 - nec NEC Versa S9100 - basic fixed pin assignment w/o SPDIF - auto auto-config reading BIOS (default) - - ALC267/268 - quanta-il1 Quanta IL1 mini-notebook - 3stack 3-stack model - toshiba Toshiba A205 - acer Acer laptops - acer-aspire Acer Aspire One - dell Dell OEM laptops (Vostro 1200) - zepto Zepto laptops - test for testing/debugging purpose, almost all controls can - adjusted. Appearing only when compiled with - $CONFIG_SND_DEBUG=y - auto auto-config reading BIOS (default) - - ALC269 - basic Basic preset - quanta Quanta FL1 - eeepc-p703 ASUS Eeepc P703 P900A - eeepc-p901 ASUS Eeepc P901 S101 - - ALC662/663 - 3stack-dig 3-stack (2-channel) with SPDIF - 3stack-6ch 3-stack (6-channel) - 3stack-6ch-dig 3-stack (6-channel) with SPDIF - 6stack-dig 6-stack with SPDIF - lenovo-101e Lenovo laptop - eeepc-p701 ASUS Eeepc P701 - eeepc-ep20 ASUS Eeepc EP20 - ecs ECS/Foxconn mobo - m51va ASUS M51VA - g71v ASUS G71V - h13 ASUS H13 - g50v ASUS G50V - asus-mode1 ASUS - asus-mode2 ASUS - asus-mode3 ASUS - asus-mode4 ASUS - asus-mode5 ASUS - asus-mode6 ASUS - auto auto-config reading BIOS (default) - - ALC882/885 - 3stack-dig 3-jack with SPDIF I/O - 6stack-dig 6-jack digital with SPDIF I/O - arima Arima W820Di1 - targa Targa T8, MSI-1049 T8 - asus-a7j ASUS A7J - asus-a7m ASUS A7M - macpro MacPro support - mbp3 Macbook Pro rev3 - imac24 iMac 24'' with jack detection - w2jc ASUS W2JC - auto auto-config reading BIOS (default) - - ALC883/888 - 3stack-dig 3-jack with SPDIF I/O - 6stack-dig 6-jack digital with SPDIF I/O - 3stack-6ch 3-jack 6-channel - 3stack-6ch-dig 3-jack 6-channel with SPDIF I/O - 6stack-dig-demo 6-jack digital for Intel demo board - acer Acer laptops (Travelmate 3012WTMi, Aspire 5600, etc) - acer-aspire Acer Aspire 9810 - medion Medion Laptops - medion-md2 Medion MD2 - targa-dig Targa/MSI - targa-2ch-dig Targs/MSI with 2-channel - laptop-eapd 3-jack with SPDIF I/O and EAPD (Clevo M540JE, M550JE) - lenovo-101e Lenovo 101E - lenovo-nb0763 Lenovo NB0763 - lenovo-ms7195-dig Lenovo MS7195 - lenovo-sky Lenovo Sky - haier-w66 Haier W66 - 3stack-hp HP machines with 3stack (Lucknow, Samba boards) - 6stack-dell Dell machines with 6stack (Inspiron 530) - mitac Mitac 8252D - clevo-m720 Clevo M720 laptop series - fujitsu-pi2515 Fujitsu AMILO Pi2515 - 3stack-6ch-intel Intel DG33* boards - auto auto-config reading BIOS (default) - - ALC861/660 - 3stack 3-jack - 3stack-dig 3-jack with SPDIF I/O - 6stack-dig 6-jack with SPDIF I/O - 3stack-660 3-jack (for ALC660) - uniwill-m31 Uniwill M31 laptop - toshiba Toshiba laptop support - asus Asus laptop support - asus-laptop ASUS F2/F3 laptops - auto auto-config reading BIOS (default) - - ALC861VD/660VD - 3stack 3-jack - 3stack-dig 3-jack with SPDIF OUT - 6stack-dig 6-jack with SPDIF OUT - 3stack-660 3-jack (for ALC660VD) - 3stack-660-digout 3-jack with SPDIF OUT (for ALC660VD) - lenovo Lenovo 3000 C200 - dallas Dallas laptops - hp HP TX1000 - auto auto-config reading BIOS (default) - - CMI9880 - minimal 3-jack in back - min_fp 3-jack in back, 2-jack in front - full 6-jack in back, 2-jack in front - full_dig 6-jack in back, 2-jack in front, SPDIF I/O - allout 5-jack in back, 2-jack in front, SPDIF out - auto auto-config reading BIOS (default) - - AD1882 / AD1882A - 3stack 3-stack mode (default) - 6stack 6-stack mode - - AD1884A / AD1883 / AD1984A / AD1984B - desktop 3-stack desktop (default) - laptop laptop with HP jack sensing - mobile mobile devices with HP jack sensing - thinkpad Lenovo Thinkpad X300 - - AD1884 - N/A - - AD1981 - basic 3-jack (default) - hp HP nx6320 - thinkpad Lenovo Thinkpad T60/X60/Z60 - toshiba Toshiba U205 - - AD1983 - N/A - - AD1984 - basic default configuration - thinkpad Lenovo Thinkpad T61/X61 - dell Dell T3400 - - AD1986A - 6stack 6-jack, separate surrounds (default) - 3stack 3-stack, shared surrounds - laptop 2-channel only (FSC V2060, Samsung M50) - laptop-eapd 2-channel with EAPD (Samsung R65, ASUS A6J) - laptop-automute 2-channel with EAPD and HP-automute (Lenovo N100) - ultra 2-channel with EAPD (Samsung Ultra tablet PC) - - AD1988/AD1988B/AD1989A/AD1989B - 6stack 6-jack - 6stack-dig ditto with SPDIF - 3stack 3-jack - 3stack-dig ditto with SPDIF - laptop 3-jack with hp-jack automute - laptop-dig ditto with SPDIF - auto auto-config reading BIOS (default) - - Conexant 5045 - laptop-hpsense Laptop with HP sense (old model laptop) - laptop-micsense Laptop with Mic sense (old model fujitsu) - laptop-hpmicsense Laptop with HP and Mic senses - benq Benq R55E - test for testing/debugging purpose, almost all controls - can be adjusted. Appearing only when compiled with - $CONFIG_SND_DEBUG=y - - Conexant 5047 - laptop Basic Laptop config - laptop-hp Laptop config for some HP models (subdevice 30A5) - laptop-eapd Laptop config with EAPD support - test for testing/debugging purpose, almost all controls - can be adjusted. Appearing only when compiled with - $CONFIG_SND_DEBUG=y - - Conexant 5051 - laptop Basic Laptop config (default) - hp HP Spartan laptop - - STAC9200 - ref Reference board - dell-d21 Dell (unknown) - dell-d22 Dell (unknown) - dell-d23 Dell (unknown) - dell-m21 Dell Inspiron 630m, Dell Inspiron 640m - dell-m22 Dell Latitude D620, Dell Latitude D820 - dell-m23 Dell XPS M1710, Dell Precision M90 - dell-m24 Dell Latitude 120L - dell-m25 Dell Inspiron E1505n - dell-m26 Dell Inspiron 1501 - dell-m27 Dell Inspiron E1705/9400 - gateway Gateway laptops with EAPD control - panasonic Panasonic CF-74 - - STAC9205/9254 - ref Reference board - dell-m42 Dell (unknown) - dell-m43 Dell Precision - dell-m44 Dell Inspiron - - STAC9220/9221 - ref Reference board - 3stack D945 3stack - 5stack D945 5stack + SPDIF - intel-mac-v1 Intel Mac Type 1 - intel-mac-v2 Intel Mac Type 2 - intel-mac-v3 Intel Mac Type 3 - intel-mac-v4 Intel Mac Type 4 - intel-mac-v5 Intel Mac Type 5 - intel-mac-auto Intel Mac (detect type according to subsystem id) - macmini Intel Mac Mini (equivalent with type 3) - macbook Intel Mac Book (eq. type 5) - macbook-pro-v1 Intel Mac Book Pro 1st generation (eq. type 3) - macbook-pro Intel Mac Book Pro 2nd generation (eq. type 3) - imac-intel Intel iMac (eq. type 2) - imac-intel-20 Intel iMac (newer version) (eq. type 3) - dell-d81 Dell (unknown) - dell-d82 Dell (unknown) - dell-m81 Dell (unknown) - dell-m82 Dell XPS M1210 - - STAC9202/9250/9251 - ref Reference board, base config - m2-2 Some Gateway MX series laptops - m6 Some Gateway NX series laptops - pa6 Gateway NX860 series - - STAC9227/9228/9229/927x - ref Reference board - 3stack D965 3stack - 5stack D965 5stack + SPDIF - dell-3stack Dell Dimension E520 - dell-bios Fixes with Dell BIOS setup - - STAC92HD71B* - ref Reference board - dell-m4-1 Dell desktops - dell-m4-2 Dell desktops - - STAC92HD73* - ref Reference board - dell-m6 Dell desktops - - STAC9872 - vaio Setup for VAIO FE550G/SZ110 - vaio-ar Setup for VAIO AR + models depending on the codec chip. The list of available models + is found in HD-Audio-Models.txt The model name "genric" is treated as a special case. When this model is given, the driver uses the generic codec parser without "codec-patch". It's sometimes good for testing and debugging. If the default configuration doesn't work and one of the above - matches with your device, report it together with the PCI - subsystem ID (output of "lspci -nv") to ALSA BTS or alsa-devel + matches with your device, report it together with alsa-info.sh + output (with --no-upload option) to kernel bugzilla or alsa-devel ML (see the section "Links and Addresses"). power_save and power_save_controller options are for power-saving @@ -1647,7 +1350,8 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed. * AuzenTech X-Meridian * Bgears b-Enspirer * Club3D Theatron DTS - * HT-Omega Claro + * HT-Omega Claro (plus) + * HT-Omega Claro halo (XT) * Razer Barracuda AC-1 * Sondigo Inferno @@ -2404,8 +2108,11 @@ Links and Addresses ALSA project homepage http://www.alsa-project.org - ALSA Bug Tracking System - https://bugtrack.alsa-project.org/bugs/ + Kernel Bugzilla + http://bugzilla.kernel.org/ ALSA Developers ML mailto:alsa-devel@alsa-project.org + + alsa-info.sh script + http://www.alsa-project.org/alsa-info.sh diff --git a/Documentation/sound/alsa/HD-Audio-Models.txt b/Documentation/sound/alsa/HD-Audio-Models.txt new file mode 100644 index 00000000000..64eb1100eec --- /dev/null +++ b/Documentation/sound/alsa/HD-Audio-Models.txt @@ -0,0 +1,356 @@ + Model name Description + ---------- ----------- +ALC880 +====== + 3stack 3-jack in back and a headphone out + 3stack-digout 3-jack in back, a HP out and a SPDIF out + 5stack 5-jack in back, 2-jack in front + 5stack-digout 5-jack in back, 2-jack in front, a SPDIF out + 6stack 6-jack in back, 2-jack in front + 6stack-digout 6-jack with a SPDIF out + w810 3-jack + z71v 3-jack (HP shared SPDIF) + asus 3-jack (ASUS Mobo) + asus-w1v ASUS W1V + asus-dig ASUS with SPDIF out + asus-dig2 ASUS with SPDIF out (using GPIO2) + uniwill 3-jack + fujitsu Fujitsu Laptops (Pi1536) + F1734 2-jack + lg LG laptop (m1 express dual) + lg-lw LG LW20/LW25 laptop + tcl TCL S700 + clevo Clevo laptops (m520G, m665n) + medion Medion Rim 2150 + test for testing/debugging purpose, almost all controls can be + adjusted. Appearing only when compiled with + $CONFIG_SND_DEBUG=y + auto auto-config reading BIOS (default) + +ALC260 +====== + hp HP machines + hp-3013 HP machines (3013-variant) + hp-dc7600 HP DC7600 + fujitsu Fujitsu S7020 + acer Acer TravelMate + will Will laptops (PB V7900) + replacer Replacer 672V + basic fixed pin assignment (old default model) + test for testing/debugging purpose, almost all controls can + adjusted. Appearing only when compiled with + $CONFIG_SND_DEBUG=y + auto auto-config reading BIOS (default) + +ALC262 +====== + fujitsu Fujitsu Laptop + hp-bpc HP xw4400/6400/8400/9400 laptops + hp-bpc-d7000 HP BPC D7000 + hp-tc-t5735 HP Thin Client T5735 + hp-rp5700 HP RP5700 + benq Benq ED8 + benq-t31 Benq T31 + hippo Hippo (ATI) with jack detection, Sony UX-90s + hippo_1 Hippo (Benq) with jack detection + sony-assamd Sony ASSAMD + toshiba-s06 Toshiba S06 + toshiba-rx1 Toshiba RX1 + ultra Samsung Q1 Ultra Vista model + lenovo-3000 Lenovo 3000 y410 + nec NEC Versa S9100 + basic fixed pin assignment w/o SPDIF + auto auto-config reading BIOS (default) + +ALC267/268 +========== + quanta-il1 Quanta IL1 mini-notebook + 3stack 3-stack model + toshiba Toshiba A205 + acer Acer laptops + acer-dmic Acer laptops with digital-mic + acer-aspire Acer Aspire One + dell Dell OEM laptops (Vostro 1200) + zepto Zepto laptops + test for testing/debugging purpose, almost all controls can + adjusted. Appearing only when compiled with + $CONFIG_SND_DEBUG=y + auto auto-config reading BIOS (default) + +ALC269 +====== + basic Basic preset + quanta Quanta FL1 + eeepc-p703 ASUS Eeepc P703 P900A + eeepc-p901 ASUS Eeepc P901 S101 + fujitsu FSC Amilo + auto auto-config reading BIOS (default) + +ALC662/663 +========== + 3stack-dig 3-stack (2-channel) with SPDIF + 3stack-6ch 3-stack (6-channel) + 3stack-6ch-dig 3-stack (6-channel) with SPDIF + 6stack-dig 6-stack with SPDIF + lenovo-101e Lenovo laptop + eeepc-p701 ASUS Eeepc P701 + eeepc-ep20 ASUS Eeepc EP20 + ecs ECS/Foxconn mobo + m51va ASUS M51VA + g71v ASUS G71V + h13 ASUS H13 + g50v ASUS G50V + asus-mode1 ASUS + asus-mode2 ASUS + asus-mode3 ASUS + asus-mode4 ASUS + asus-mode5 ASUS + asus-mode6 ASUS + auto auto-config reading BIOS (default) + +ALC882/885 +========== + 3stack-dig 3-jack with SPDIF I/O + 6stack-dig 6-jack digital with SPDIF I/O + arima Arima W820Di1 + targa Targa T8, MSI-1049 T8 + asus-a7j ASUS A7J + asus-a7m ASUS A7M + macpro MacPro support + mbp3 Macbook Pro rev3 + imac24 iMac 24'' with jack detection + w2jc ASUS W2JC + auto auto-config reading BIOS (default) + +ALC883/888 +========== + 3stack-dig 3-jack with SPDIF I/O + 6stack-dig 6-jack digital with SPDIF I/O + 3stack-6ch 3-jack 6-channel + 3stack-6ch-dig 3-jack 6-channel with SPDIF I/O + 6stack-dig-demo 6-jack digital for Intel demo board + acer Acer laptops (Travelmate 3012WTMi, Aspire 5600, etc) + acer-aspire Acer Aspire 9810 + acer-aspire-4930g Acer Aspire 4930G + medion Medion Laptops + medion-md2 Medion MD2 + targa-dig Targa/MSI + targa-2ch-dig Targs/MSI with 2-channel + laptop-eapd 3-jack with SPDIF I/O and EAPD (Clevo M540JE, M550JE) + lenovo-101e Lenovo 101E + lenovo-nb0763 Lenovo NB0763 + lenovo-ms7195-dig Lenovo MS7195 + lenovo-sky Lenovo Sky + haier-w66 Haier W66 + 3stack-hp HP machines with 3stack (Lucknow, Samba boards) + 6stack-dell Dell machines with 6stack (Inspiron 530) + mitac Mitac 8252D + clevo-m720 Clevo M720 laptop series + fujitsu-pi2515 Fujitsu AMILO Pi2515 + fujitsu-xa3530 Fujitsu AMILO XA3530 + 3stack-6ch-intel Intel DG33* boards + auto auto-config reading BIOS (default) + +ALC861/660 +========== + 3stack 3-jack + 3stack-dig 3-jack with SPDIF I/O + 6stack-dig 6-jack with SPDIF I/O + 3stack-660 3-jack (for ALC660) + uniwill-m31 Uniwill M31 laptop + toshiba Toshiba laptop support + asus Asus laptop support + asus-laptop ASUS F2/F3 laptops + auto auto-config reading BIOS (default) + +ALC861VD/660VD +============== + 3stack 3-jack + 3stack-dig 3-jack with SPDIF OUT + 6stack-dig 6-jack with SPDIF OUT + 3stack-660 3-jack (for ALC660VD) + 3stack-660-digout 3-jack with SPDIF OUT (for ALC660VD) + lenovo Lenovo 3000 C200 + dallas Dallas laptops + hp HP TX1000 + asus-v1s ASUS V1Sn + auto auto-config reading BIOS (default) + +CMI9880 +======= + minimal 3-jack in back + min_fp 3-jack in back, 2-jack in front + full 6-jack in back, 2-jack in front + full_dig 6-jack in back, 2-jack in front, SPDIF I/O + allout 5-jack in back, 2-jack in front, SPDIF out + auto auto-config reading BIOS (default) + +AD1882 / AD1882A +================ + 3stack 3-stack mode (default) + 6stack 6-stack mode + +AD1884A / AD1883 / AD1984A / AD1984B +==================================== + desktop 3-stack desktop (default) + laptop laptop with HP jack sensing + mobile mobile devices with HP jack sensing + thinkpad Lenovo Thinkpad X300 + +AD1884 +====== + N/A + +AD1981 +====== + basic 3-jack (default) + hp HP nx6320 + thinkpad Lenovo Thinkpad T60/X60/Z60 + toshiba Toshiba U205 + +AD1983 +====== + N/A + +AD1984 +====== + basic default configuration + thinkpad Lenovo Thinkpad T61/X61 + dell Dell T3400 + +AD1986A +======= + 6stack 6-jack, separate surrounds (default) + 3stack 3-stack, shared surrounds + laptop 2-channel only (FSC V2060, Samsung M50) + laptop-eapd 2-channel with EAPD (ASUS A6J) + laptop-automute 2-channel with EAPD and HP-automute (Lenovo N100) + ultra 2-channel with EAPD (Samsung Ultra tablet PC) + samsung 2-channel with EAPD (Samsung R65) + +AD1988/AD1988B/AD1989A/AD1989B +============================== + 6stack 6-jack + 6stack-dig ditto with SPDIF + 3stack 3-jack + 3stack-dig ditto with SPDIF + laptop 3-jack with hp-jack automute + laptop-dig ditto with SPDIF + auto auto-config reading BIOS (default) + +Conexant 5045 +============= + laptop-hpsense Laptop with HP sense (old model laptop) + laptop-micsense Laptop with Mic sense (old model fujitsu) + laptop-hpmicsense Laptop with HP and Mic senses + benq Benq R55E + test for testing/debugging purpose, almost all controls + can be adjusted. Appearing only when compiled with + $CONFIG_SND_DEBUG=y + +Conexant 5047 +============= + laptop Basic Laptop config + laptop-hp Laptop config for some HP models (subdevice 30A5) + laptop-eapd Laptop config with EAPD support + test for testing/debugging purpose, almost all controls + can be adjusted. Appearing only when compiled with + $CONFIG_SND_DEBUG=y + +Conexant 5051 +============= + laptop Basic Laptop config (default) + hp HP Spartan laptop + +STAC9200 +======== + ref Reference board + dell-d21 Dell (unknown) + dell-d22 Dell (unknown) + dell-d23 Dell (unknown) + dell-m21 Dell Inspiron 630m, Dell Inspiron 640m + dell-m22 Dell Latitude D620, Dell Latitude D820 + dell-m23 Dell XPS M1710, Dell Precision M90 + dell-m24 Dell Latitude 120L + dell-m25 Dell Inspiron E1505n + dell-m26 Dell Inspiron 1501 + dell-m27 Dell Inspiron E1705/9400 + gateway-m4 Gateway laptops with EAPD control + gateway-m4-2 Gateway laptops with EAPD control + panasonic Panasonic CF-74 + +STAC9205/9254 +============= + ref Reference board + dell-m42 Dell (unknown) + dell-m43 Dell Precision + dell-m44 Dell Inspiron + +STAC9220/9221 +============= + ref Reference board + 3stack D945 3stack + 5stack D945 5stack + SPDIF + intel-mac-v1 Intel Mac Type 1 + intel-mac-v2 Intel Mac Type 2 + intel-mac-v3 Intel Mac Type 3 + intel-mac-v4 Intel Mac Type 4 + intel-mac-v5 Intel Mac Type 5 + intel-mac-auto Intel Mac (detect type according to subsystem id) + macmini Intel Mac Mini (equivalent with type 3) + macbook Intel Mac Book (eq. type 5) + macbook-pro-v1 Intel Mac Book Pro 1st generation (eq. type 3) + macbook-pro Intel Mac Book Pro 2nd generation (eq. type 3) + imac-intel Intel iMac (eq. type 2) + imac-intel-20 Intel iMac (newer version) (eq. type 3) + ecs202 ECS/PC chips + dell-d81 Dell (unknown) + dell-d82 Dell (unknown) + dell-m81 Dell (unknown) + dell-m82 Dell XPS M1210 + +STAC9202/9250/9251 +================== + ref Reference board, base config + m1 Some Gateway MX series laptops (NX560XL) + m1-2 Some Gateway MX series laptops (MX6453) + m2 Some Gateway MX series laptops (M255) + m2-2 Some Gateway MX series laptops + m3 Some Gateway MX series laptops + m5 Some Gateway MX series laptops (MP6954) + m6 Some Gateway NX series laptops + +STAC9227/9228/9229/927x +======================= + ref Reference board + ref-no-jd Reference board without HP/Mic jack detection + 3stack D965 3stack + 5stack D965 5stack + SPDIF + dell-3stack Dell Dimension E520 + dell-bios Fixes with Dell BIOS setup + +STAC92HD71B* +============ + ref Reference board + dell-m4-1 Dell desktops + dell-m4-2 Dell desktops + dell-m4-3 Dell desktops + hp-m4 HP dv laptops + +STAC92HD73* +=========== + ref Reference board + no-jd BIOS setup but without jack-detection + dell-m6-amic Dell desktops/laptops with analog mics + dell-m6-dmic Dell desktops/laptops with digital mics + dell-m6 Dell desktops/laptops with both type of mics + dell-eq Dell desktops/laptops + +STAC92HD83* +=========== + ref Reference board + +STAC9872 +======== + vaio Setup for VAIO FE550G/SZ110 + vaio-ar Setup for VAIO AR diff --git a/Documentation/sound/alsa/HD-Audio.txt b/Documentation/sound/alsa/HD-Audio.txt new file mode 100644 index 00000000000..8d68fff7183 --- /dev/null +++ b/Documentation/sound/alsa/HD-Audio.txt @@ -0,0 +1,577 @@ +MORE NOTES ON HD-AUDIO DRIVER +============================= + Takashi Iwai + + +GENERAL +------- + +HD-audio is the new standard on-board audio component on modern PCs +after AC97. Although Linux has been supporting HD-audio since long +time ago, there are often problems with new machines. A part of the +problem is broken BIOS, and the rest is the driver implementation. +This document explains the brief trouble-shooting and debugging +methods for the HD-audio hardware. + +The HD-audio component consists of two parts: the controller chip and +the codec chips on the HD-audio bus. Linux provides a single driver +for all controllers, snd-hda-intel. Although the driver name contains +a word of a well-known harware vendor, it's not specific to it but for +all controller chips by other companies. Since the HD-audio +controllers are supposed to be compatible, the single snd-hda-driver +should work in most cases. But, not surprisingly, there are known +bugs and issues specific to each controller type. The snd-hda-intel +driver has a bunch of workarounds for these as described below. + +A controller may have multiple codecs. Usually you have one audio +codec and optionally one modem codec. In theory, there might be +multiple audio codecs, e.g. for analog and digital outputs, and the +driver might not work properly because of conflict of mixer elements. +This should be fixed in future if such hardware really exists. + +The snd-hda-intel driver has several different codec parsers depending +on the codec. It has a generic parser as a fallback, but this +functionality is fairly limited until now. Instead of the generic +parser, usually the codec-specific parser (coded in patch_*.c) is used +for the codec-specific implementations. The details about the +codec-specific problems are explained in the later sections. + +If you are interested in the deep debugging of HD-audio, read the +HD-audio specification at first. The specification is found on +Intel's web page, for example: + +- http://www.intel.com/standards/hdaudio/ + + +HD-AUDIO CONTROLLER +------------------- + +DMA-Position Problem +~~~~~~~~~~~~~~~~~~~~ +The most common problem of the controller is the inaccurate DMA +pointer reporting. The DMA pointer for playback and capture can be +read in two ways, either via a LPIB register or via a position-buffer +map. As default the driver tries to read from the io-mapped +position-buffer, and falls back to LPIB if the position-buffer appears +dead. However, this detection isn't perfect on some devices. In such +a case, you can change the default method via `position_fix` option. + +`position_fix=1` means to use LPIB method explicitly. +`position_fix=2` means to use the position-buffer. 0 is the default +value, the automatic check and fallback to LPIB as described in the +above. If you get a problem of repeated sounds, this option might +help. + +In addition to that, every controller is known to be broken regarding +the wake-up timing. It wakes up a few samples before actually +processing the data on the buffer. This caused a lot of problems, for +example, with ALSA dmix or JACK. Since 2.6.27 kernel, the driver puts +an artificial delay to the wake up timing. This delay is controlled +via `bdl_pos_adj` option. + +When `bdl_pos_adj` is a negative value (as default), it's assigned to +an appropriate value depending on the controller chip. For Intel +chips, it'd be 1 while it'd be 32 for others. Usually this works. +Only in case it doesn't work and you get warning messages, you should +change this parameter to other values. + + +Codec-Probing Problem +~~~~~~~~~~~~~~~~~~~~~ +A less often but a more severe problem is the codec probing. When +BIOS reports the available codec slots wrongly, the driver gets +confused and tries to access the non-existing codec slot. This often +results in the total screw-up, and destructs the further communication +with the codec chips. The symptom appears usually as error messages +like: +------------------------------------------------------------------------ + hda_intel: azx_get_response timeout, switching to polling mode: + last cmd=0x12345678 + hda_intel: azx_get_response timeout, switching to single_cmd mode: + last cmd=0x12345678 +------------------------------------------------------------------------ + +The first line is a warning, and this is usually relatively harmless. +It means that the codec response isn't notified via an IRQ. The +driver uses explicit polling method to read the response. It gives +very slight CPU overhead, but you'd unlikely notice it. + +The second line is, however, a fatal error. If this happens, usually +it means that something is really wrong. Most likely you are +accessing a non-existing codec slot. + +Thus, if the second error message appears, try to narrow the probed +codec slots via `probe_mask` option. It's a bitmask, and each bit +corresponds to the codec slot. For example, to probe only the first +slot, pass `probe_mask=1`. For the first and the third slots, pass +`probe_mask=5` (where 5 = 1 | 4), and so on. + +Since 2.6.29 kernel, the driver has a more robust probing method, so +this error might happen rarely, though. + + +Interrupt Handling +~~~~~~~~~~~~~~~~~~ +In rare but some cases, the interrupt isn't properly handled as +default. You would notice this by the DMA transfer error reported by +ALSA PCM core, for example. Using MSI might help in such a case. +Pass `enable_msi=1` option for enabling MSI. + + +HD-AUDIO CODEC +-------------- + +Model Option +~~~~~~~~~~~~ +The most common problem regarding the HD-audio driver is the +unsupported codec features or the mismatched device configuration. +Most of codec-specific code has several preset models, either to +override the BIOS setup or to provide more comprehensive features. + +The driver checks PCI SSID and looks through the static configuration +table until any matching entry is found. If you have a new machine, +you may see a message like below: +------------------------------------------------------------------------ + hda_codec: Unknown model for ALC880, trying auto-probe from BIOS... +------------------------------------------------------------------------ +Even if you see such a message, DON'T PANIC. Take a deep breath and +keep your towel. First of all, it's an informational message, no +warning, no error. This means that the PCI SSID of your device isn't +listed in the known preset model (white-)list. But, this doesn't mean +that the driver is broken. Many codec-drivers provide the automatic +configuration mechanism based on the BIOS setup. + +The HD-audio codec has usually "pin" widgets, and BIOS sets the default +configuration of each pin, which indicates the location, the +connection type, the jack color, etc. The HD-audio driver can guess +the right connection judging from these default configuration values. +However -- some codec-support codes, such as patch_analog.c, don't +support the automatic probing (yet as of 2.6.28). And, BIOS is often, +yes, pretty often broken. It sets up wrong values and screws up the +driver. + +The preset model is provided basically to overcome such a situation. +When the matching preset model is found in the white-list, the driver +assumes the static configuration of that preset and builds the mixer +elements and PCM streams based on the static information. Thus, if +you have a newer machine with a slightly different PCI SSID from the +existing one, you may have a good chance to re-use the same model. +You can pass the `model` option to specify the preset model instead of +PCI SSID look-up. + +What `model` option values are available depends on the codec chip. +Check your codec chip from the codec proc file (see "Codec Proc-File" +section below). It will show the vendor/product name of your codec +chip. Then, see Documentation/sound/alsa/HD-Audio-Modelstxt file, +the section of HD-audio driver. You can find a list of codecs +and `model` options belonging to each codec. For example, for Realtek +ALC262 codec chip, pass `model=ultra` for devices that are compatible +with Samsung Q1 Ultra. + +Thus, the first thing you can do for any brand-new, unsupported and +non-working HD-audio hardware is to check HD-audio codec and several +different `model` option values. If you have a luck, some of them +might suit with your device well. + +Some codecs such as ALC880 have a special model option `model=test`. +This configures the driver to provide as many mixer controls as +possible for every single pin feature except for the unsolicited +events (and maybe some other specials). Adjust each mixer element and +try the I/O in the way of trial-and-error until figuring out the whole +I/O pin mappings. + +Note that `model=generic` has a special meaning. It means to use the +generic parser regardless of the codec. Usually the codec-specific +parser is much better than the generic parser (as now). Thus this +option is more about the debugging purpose. + + +Speaker and Headphone Output +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +One of the most frequent (and obvious) bugs with HD-audio is the +silent output from either or both of a built-in speaker and a +headphone jack. In general, you should try a headphone output at +first. A speaker output often requires more additional controls like +the external amplifier bits. Thus a headphone output has a slightly +better chance. + +Before making a bug report, double-check whether the mixer is set up +correctly. The recent version of snd-hda-intel driver provides mostly +"Master" volume control as well as "Front" volume (where Front +indicates the front-channels). In addition, there can be individual +"Headphone" and "Speaker" controls. + +Ditto for the speaker output. There can be "External Amplifier" +switch on some codecs. Turn on this if present. + +Another related problem is the automatic mute of speaker output by +headphone plugging. This feature is implemented in most cases, but +not on every preset model or codec-support code. + +In anyway, try a different model option if you have such a problem. +Some other models may match better and give you more matching +functionality. If none of the available models works, send a bug +report. See the bug report section for details. + +If you are masochistic enough to debug the driver problem, note the +following: + +- The speaker (and the headphone, too) output often requires the + external amplifier. This can be set usually via EAPD verb or a + certain GPIO. If the codec pin supports EAPD, you have a better + chance via SET_EAPD_BTL verb (0x70c). On others, GPIO pin (mostly + it's either GPIO0 or GPIO1) may turn on/off EAPD. +- Some Realtek codecs require special vendor-specific coefficients to + turn on the amplifier. See patch_realtek.c. +- IDT codecs may have extra power-enable/disable controls on each + analog pin. See patch_sigmatel.c. +- Very rare but some devices don't accept the pin-detection verb until + triggered. Issuing GET_PIN_SENSE verb (0xf09) may result in the + codec-communication stall. Some examples are found in + patch_realtek.c. + + +Capture Problems +~~~~~~~~~~~~~~~~ +The capture problems are often because of missing setups of mixers. +Thus, before submitting a bug report, make sure that you set up the +mixer correctly. For example, both "Capture Volume" and "Capture +Switch" have to be set properly in addition to the right "Capture +Source" or "Input Source" selection. Some devices have "Mic Boost" +volume or switch. + +When the PCM device is opened via "default" PCM (without pulse-audio +plugin), you'll likely have "Digital Capture Volume" control as well. +This is provided for the extra gain/attenuation of the signal in +software, especially for the inputs without the hardware volume +control such as digital microphones. Unless really needed, this +should be set to exactly 50%, corresponding to 0dB -- neither extra +gain nor attenuation. When you use "hw" PCM, i.e., a raw access PCM, +this control will have no influence, though. + +It's known that some codecs / devices have fairly bad analog circuits, +and the recorded sound contains a certain DC-offset. This is no bug +of the driver. + +Most of modern laptops have no analog CD-input connection. Thus, the +recording from CD input won't work in many cases although the driver +provides it as the capture source. Use CDDA instead. + +The automatic switching of the built-in and external mic per plugging +is implemented on some codec models but not on every model. Partly +because of my laziness but mostly lack of testers. Feel free to +submit the improvement patch to the author. + + +Direct Debugging +~~~~~~~~~~~~~~~~ +If no model option gives you a better result, and you are a tough guy +to fight against evil, try debugging via hitting the raw HD-audio +codec verbs to the device. Some tools are available: hda-emu and +hda-analyzer. The detailed description is found in the sections +below. You'd need to enable hwdep for using these tools. See "Kernel +Configuration" section. + + +OTHER ISSUES +------------ + +Kernel Configuration +~~~~~~~~~~~~~~~~~~~~ +In general, I recommend you to enable the sound debug option, +`CONFIG_SND_DEBUG=y`, no matter whether you are debugging or not. +This enables snd_printd() macro and others, and you'll get additional +kernel messages at probing. + +In addition, you can enable `CONFIG_SND_DEBUG_VERBOSE=y`. But this +will give you far more messages. Thus turn this on only when you are +sure to want it. + +Don't forget to turn on the appropriate `CONFIG_SND_HDA_CODEC_*` +options. Note that each of them corresponds to the codec chip, not +the controller chip. Thus, even if lspci shows the Nvidia controller, +you may need to choose the option for other vendors. If you are +unsure, just select all yes. + +`CONFIG_SND_HDA_HWDEP` is a useful option for debugging the driver. +When this is enabled, the driver creates hardware-dependent devices +(one per each codec), and you have a raw access to the device via +these device files. For example, `hwC0D2` will be created for the +codec slot #2 of the first card (#0). For debug-tools such as +hda-verb and hda-analyzer, the hwdep device has to be enabled. +Thus, it'd be better to turn this on always. + +`CONFIG_SND_HDA_RECONFIG` is a new option, and this depends on the +hwdep option above. When enabled, you'll have some sysfs files under +the corresponding hwdep directory. See "HD-audio reconfiguration" +section below. + +`CONFIG_SND_HDA_POWER_SAVE` option enables the power-saving feature. +See "Power-saving" section below. + + +Codec Proc-File +~~~~~~~~~~~~~~~ +The codec proc-file is a treasure-chest for debugging HD-audio. +It shows most of useful information of each codec widget. + +The proc file is located in /proc/asound/card*/codec#*, one file per +each codec slot. You can know the codec vendor, product id and +names, the type of each widget, capabilities and so on. +This file, however, doesn't show the jack sensing state, so far. This +is because the jack-sensing might be depending on the trigger state. + +This file will be picked up by the debug tools, and also it can be fed +to the emulator as the primary codec information. See the debug tools +section below. + +This proc file can be also used to check whether the generic parser is +used. When the generic parser is used, the vendor/product ID name +will appear as "Realtek ID 0262", instead of "Realtek ALC262". + + +HD-Audio Reconfiguration +~~~~~~~~~~~~~~~~~~~~~~~~ +This is an experimental feature to allow you re-configure the HD-audio +codec dynamically without reloading the driver. The following sysfs +files are available under each codec-hwdep device directory (e.g. +/sys/class/sound/hwC0D0): + +vendor_id:: + Shows the 32bit codec vendor-id hex number. You can change the + vendor-id value by writing to this file. +subsystem_id:: + Shows the 32bit codec subsystem-id hex number. You can change the + subsystem-id value by writing to this file. +revision_id:: + Shows the 32bit codec revision-id hex number. You can change the + revision-id value by writing to this file. +afg:: + Shows the AFG ID. This is read-only. +mfg:: + Shows the MFG ID. This is read-only. +name:: + Shows the codec name string. Can be changed by writing to this + file. +modelname:: + Shows the currently set `model` option. Can be changed by writing + to this file. +init_verbs:: + The extra verbs to execute at initialization. You can add a verb by + writing to this file. Pass tree numbers, nid, verb and parameter. +hints:: + Shows hint strings for codec parsers for any use. Right now it's + not used. +reconfig:: + Triggers the codec re-configuration. When any value is written to + this file, the driver re-initialize and parses the codec tree + again. All the changes done by the sysfs entries above are taken + into account. +clear:: + Resets the codec, removes the mixer elements and PCM stuff of the + specified codec, and clear all init verbs and hints. + + +Power-Saving +~~~~~~~~~~~~ +The power-saving is a kind of auto-suspend of the device. When the +device is inactive for a certain time, the device is automatically +turned off to save the power. The time to go down is specified via +`power_save` module option, and this option can be changed dynamically +via sysfs. + +The power-saving won't work when the analog loopback is enabled on +some codecs. Make sure that you mute all unneeded signal routes when +you want the power-saving. + +The power-saving feature might cause audible click noises at each +power-down/up depending on the device. Some of them might be +solvable, but some are hard, I'm afraid. Some distros such as +openSUSE enables the power-saving feature automatically when the power +cable is unplugged. Thus, if you hear noises, suspect first the +power-saving. See /sys/module/snd_hda_intel/parameters/power_save to +check the current value. If it's non-zero, the feature is turned on. + + +Development Tree +~~~~~~~~~~~~~~~~ +The latest development codes for HD-audio are found on sound git tree: + +- git://git.kernel.org/pub/scm/linux/kernel/git/tiwai/sound-2.6.git + +The master branch or for-next branches can be used as the main +development branches in general while the HD-audio specific patches +are committed in topic/hda branch. + +If you are using the latest Linus tree, it'd be better to pull the +above GIT tree onto it. If you are using the older kernels, an easy +way to try the latest ALSA code is to build from the snapshot +tarball. There are daily tarballs and the latest snapshot tarball. +All can be built just like normal alsa-driver release packages, that +is, installed via the usual spells: configure, make and make +install(-modules). See INSTALL in the package. The snapshot tarballs +are found at: + +- ftp://ftp.kernel.org/pub/linux/kernel/people/tiwai/snapshot/ + + +Sending a Bug Report +~~~~~~~~~~~~~~~~~~~~ +If any model or module options don't work for your device, it's time +to send a bug report to the developers. Give the following in your +bug report: + +- Hardware vendor, product and model names +- Kernel version (and ALSA-driver version if you built externally) +- `alsa-info.sh` output; run with `--no-upload` option. See the + section below about alsa-info + +If it's a regression, at best, send alsa-info outputs of both working +and non-working kernels. This is really helpful because we can +compare the codec registers directly. + +Send a bug report either the followings: + +kernel-bugzilla:: + http://bugme.linux-foundation.org/ +alsa-devel ML:: + alsa-devel@alsa-project.org + + +DEBUG TOOLS +----------- + +This section describes some tools available for debugging HD-audio +problems. + +alsa-info +~~~~~~~~~ +The script `alsa-info.sh` is a very useful tool to gather the audio +device information. You can fetch the latest version from: + +- http://www.alsa-project.org/alsa-info.sh + +Run this script as root, and it will gather the important information +such as the module lists, module parameters, proc file contents +including the codec proc files, mixer outputs and the control +elements. As default, it will store the information onto a web server +on alsa-project.org. But, if you send a bug report, it'd be better to +run with `--no-upload` option, and attach the generated file. + +There are some other useful options. See `--help` option output for +details. + + +hda-verb +~~~~~~~~ +hda-verb is a tiny program that allows you to access the HD-audio +codec directly. You can execute a raw HD-audio codec verb with this. +This program accesses the hwdep device, thus you need to enable the +kernel config `CONFIG_SND_HDA_HWDEP=y` beforehand. + +The hda-verb program takes four arguments: the hwdep device file, the +widget NID, the verb and the parameter. When you access to the codec +on the slot 2 of the card 0, pass /dev/snd/hwC0D2 to the first +argument, typically. (However, the real path name depends on the +system.) + +The second parameter is the widget number-id to access. The third +parameter can be either a hex/digit number or a string corresponding +to a verb. Similarly, the last parameter is the value to write, or +can be a string for the parameter type. + +------------------------------------------------------------------------ + % hda-verb /dev/snd/hwC0D0 0x12 0x701 2 + nid = 0x12, verb = 0x701, param = 0x2 + value = 0x0 + + % hda-verb /dev/snd/hwC0D0 0x0 PARAMETERS VENDOR_ID + nid = 0x0, verb = 0xf00, param = 0x0 + value = 0x10ec0262 + + % hda-verb /dev/snd/hwC0D0 2 set_a 0xb080 + nid = 0x2, verb = 0x300, param = 0xb080 + value = 0x0 +------------------------------------------------------------------------ + +Although you can issue any verbs with this program, the driver state +won't be always updated. For example, the volume values are usually +cached in the driver, and thus changing the widget amp value directly +via hda-verb won't change the mixer value. + +The hda-verb program is found in the ftp directory: + +- ftp://ftp.kernel.org/pub/linux/kernel/people/tiwai/misc/ + +Also a git repository is available: + +- git://git.kernel.org/pub/scm/linux/kernel/git/tiwai/hda-verb.git + +See README file in the tarball for more details about hda-verb +program. + + +hda-analyzer +~~~~~~~~~~~~ +hda-analyzer provides a graphical interface to access the raw HD-audio +control, based on pyGTK2 binding. It's a more powerful version of +hda-verb. The program gives you an easy-to-use GUI stuff for showing +the widget information and adjusting the amp values, as well as the +proc-compatible output. + +The hda-analyzer is a part of alsa.git repository in +alsa-project.org: + +- http://git.alsa-project.org/?p=alsa.git;a=tree;f=hda-analyzer + + +Codecgraph +~~~~~~~~~~ +Codecgraph is a utility program to generate a graph and visualizes the +codec-node connection of a codec chip. It's especially useful when +you analyze or debug a codec without a proper datasheet. The program +parses the given codec proc file and converts to SVG via graphiz +program. + +The tarball and GIT trees are found in the web page at: + +- http://helllabs.org/codecgraph/ + + +hda-emu +~~~~~~~ +hda-emu is an HD-audio emulator. The main purpose of this program is +to debug an HD-audio codec without the real hardware. Thus, it +doesn't emulate the behavior with the real audio I/O, but it just +dumps the codec register changes and the ALSA-driver internal changes +at probing and operating the HD-audio driver. + +The program requires a codec proc-file to simulate. Get a proc file +for the target codec beforehand, or pick up an example codec from the +codec proc collections in the tarball. Then, run the program with the +proc file, and the hda-emu program will start parsing the codec file +and simulates the HD-audio driver: + +------------------------------------------------------------------------ + % hda-emu codecs/stac9200-dell-d820-laptop + # Parsing.. + hda_codec: Unknown model for STAC9200, using BIOS defaults + hda_codec: pin nid 08 bios pin config 40c003fa + .... +------------------------------------------------------------------------ + +The program gives you only a very dumb command-line interface. You +can get a proc-file dump at the current state, get a list of control +(mixer) elements, set/get the control element value, simulate the PCM +operation, the jack plugging simulation, etc. + +The package is found in: + +- ftp://ftp.kernel.org/pub/linux/kernel/people/tiwai/misc/ + +A git repository is available: + +- git://git.kernel.org/pub/scm/linux/kernel/git/tiwai/hda-emu.git + +See README file in the tarball for more details about hda-emu +program. diff --git a/Documentation/sound/alsa/Procfile.txt b/Documentation/sound/alsa/Procfile.txt index f738b296440..bba2dbb79d8 100644 --- a/Documentation/sound/alsa/Procfile.txt +++ b/Documentation/sound/alsa/Procfile.txt @@ -153,6 +153,16 @@ card*/codec#* Shows the general codec information and the attribute of each widget node. +card*/eld#* + Available for HDMI or DisplayPort interfaces. + Shows ELD(EDID Like Data) info retrieved from the attached HDMI sink, + and describes its audio capabilities and configurations. + + Some ELD fields may be modified by doing `echo name hex_value > eld#*`. + Only do this if you are sure the HDMI sink provided value is wrong. + And if that makes your HDMI audio work, please report to us so that we + can fix it in future kernel releases. + Sequencer Information --------------------- diff --git a/Documentation/sound/alsa/soc/machine.txt b/Documentation/sound/alsa/soc/machine.txt index f370e7db86a..bab7711ce96 100644 --- a/Documentation/sound/alsa/soc/machine.txt +++ b/Documentation/sound/alsa/soc/machine.txt @@ -9,7 +9,7 @@ the audio subsystem with the kernel as a platform device and is represented by the following struct:- /* SoC machine */ -struct snd_soc_machine { +struct snd_soc_card { char *name; int (*probe)(struct platform_device *pdev); @@ -67,10 +67,10 @@ static struct snd_soc_dai_link corgi_dai = { .ops = &corgi_ops, }; -struct snd_soc_machine then sets up the machine with it's DAIs. e.g. +struct snd_soc_card then sets up the machine with it's DAIs. e.g. /* corgi audio machine driver */ -static struct snd_soc_machine snd_soc_machine_corgi = { +static struct snd_soc_card snd_soc_corgi = { .name = "Corgi", .dai_link = &corgi_dai, .num_links = 1, @@ -90,7 +90,7 @@ static struct wm8731_setup_data corgi_wm8731_setup = { /* corgi audio subsystem */ static struct snd_soc_device corgi_snd_devdata = { - .machine = &snd_soc_machine_corgi, + .machine = &snd_soc_corgi, .platform = &pxa2xx_soc_platform, .codec_dev = &soc_codec_dev_wm8731, .codec_data = &corgi_wm8731_setup, diff --git a/Documentation/specialix.txt b/Documentation/specialix.txt deleted file mode 100644 index 6eb6f3a3331..00000000000 --- a/Documentation/specialix.txt +++ /dev/null @@ -1,383 +0,0 @@ - - specialix.txt -- specialix IO8+ multiport serial driver readme. - - - - Copyright (C) 1997 Roger Wolff (R.E.Wolff@BitWizard.nl) - - Specialix pays for the development and support of this driver. - Please DO contact io8-linux@specialix.co.uk if you require - support. - - This driver was developed in the BitWizard linux device - driver service. If you require a linux device driver for your - product, please contact devices@BitWizard.nl for a quote. - - This code is firmly based on the riscom/8 serial driver, - written by Dmitry Gorodchanin. The specialix IO8+ card - programming information was obtained from the CL-CD1865 Data - Book, and Specialix document number 6200059: IO8+ Hardware - Functional Specification, augmented by document number 6200088: - Merak Hardware Functional Specification. (IO8+/PCI is also - called Merak) - - - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of - the License, or (at your option) any later version. - - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR - PURPOSE. See the GNU General Public License for more details. - - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, - USA. - - -Intro -===== - - -This file contains some random information, that I like to have online -instead of in a manual that can get lost. Ever misplace your Linux -kernel sources? And the manual of one of the boards in your computer? - - -Addresses and interrupts -======================== - -Address dip switch settings: -The dip switch sets bits 2-9 of the IO address. - - 9 8 7 6 5 4 3 2 - +-----------------+ - 0 | X X X X X X X | - | | = IoBase = 0x100 - 1 | X | - +-----------------+ ------ RS232 connectors ----> - - | | | - edge connector - | | | - V V V - -Base address 0x100 caused a conflict in one of my computers once. I -haven't the foggiest why. My Specialix card is now at 0x180. My -other computer runs just fine with the Specialix card at 0x100.... -The card occupies 4 addresses, but actually only two are really used. - -The PCI version doesn't have any dip switches. The BIOS assigns -an IO address. - -The driver now still autoprobes at 0x100, 0x180, 0x250 and 0x260. If -that causes trouble for you, please report that. I'll remove -autoprobing then. - -The driver will tell the card what IRQ to use, so you don't have to -change any jumpers to change the IRQ. Just use a command line -argument (irq=xx) to the insmod program to set the interrupt. - -The BIOS assigns the IRQ on the PCI version. You have no say in what -IRQ to use in that case. - -If your specialix cards are not at the default locations, you can use -the kernel command line argument "specialix=io0,irq0,io1,irq1...". -Here "io0" is the io address for the first card, and "irq0" is the -irq line that the first card should use. And so on. - -Examples. - -You use the driver as a module and have three cards at 0x100, 0x250 -and 0x180. And some way or another you want them detected in that -order. Moreover irq 12 is taken (e.g. by your PS/2 mouse). - - insmod specialix.o iobase=0x100,0x250,0x180 irq=9,11,15 - -The same three cards, but now in the kernel would require you to -add - - specialix=0x100,9,0x250,11,0x180,15 - -to the command line. This would become - - append="specialix=0x100,9,0x250,11,0x180,15" - -in your /etc/lilo.conf file if you use lilo. - -The Specialix driver is slightly odd: It allows you to have the second -or third card detected without having a first card. This has -advantages and disadvantages. A slot that isn't filled by an ISA card, -might be filled if a PCI card is detected. Thus if you have an ISA -card at 0x250 and a PCI card, you would get: - -sx0: specialix IO8+ Board at 0x100 not found. -sx1: specialix IO8+ Board at 0x180 not found. -sx2: specialix IO8+ board detected at 0x250, IRQ 12, CD1865 Rev. B. -sx3: specialix IO8+ Board at 0x260 not found. -sx0: specialix IO8+ board detected at 0xd800, IRQ 9, CD1865 Rev. B. - -This would happen if you don't give any probe hints to the driver. -If you would specify: - - specialix=0x250,11 - -you'd get the following messages: - -sx0: specialix IO8+ board detected at 0x250, IRQ 11, CD1865 Rev. B. -sx1: specialix IO8+ board detected at 0xd800, IRQ 9, CD1865 Rev. B. - -ISA probing is aborted after the IO address you gave is exhausted, and -the PCI card is now detected as the second card. The ISA card is now -also forced to IRQ11.... - - -Baud rates -========== - -The rev 1.2 and below boards use a CL-CD1864. These chips can only -do 64kbit. The rev 1.3 and newer boards use a CL-CD1865. These chips -are officially capable of 115k2. - -The Specialix card uses a 25MHz crystal (in times two mode, which in -fact is a divided by two mode). This is not enough to reach the rated -115k2 on all ports at the same time. With this clock rate you can only -do 37% of this rate. This means that at 115k2 on all ports you are -going to lose characters (The chip cannot handle that many incoming -bits at this clock rate.) (Yes, you read that correctly: there is a -limit to the number of -=bits=- per second that the chip can handle.) - -If you near the "limit" you will first start to see a graceful -degradation in that the chip cannot keep the transmitter busy at all -times. However with a central clock this slow, you can also get it to -miss incoming characters. The driver will print a warning message when -you are outside the official specs. The messages usually show up in -the file /var/log/messages . - -The specialix card cannot reliably do 115k2. If you use it, you have -to do "extensive testing" (*) to verify if it actually works. - -When "mgetty" communicates with my modem at 115k2 it reports: -got: +++[0d]ATQ0V1H0[0d][0d][8a]O[cb][0d][8a] - ^^^^ ^^^^ ^^^^ - -The three characters that have the "^^^" under them have suffered a -bit error in the highest bit. In conclusion: I've tested it, and found -that it simply DOESN'T work for me. I also suspect that this is also -caused by the baud rate being just a little bit out of tune. - -I upgraded the crystal to 66Mhz on one of my Specialix cards. Works -great! Contact me for details. (Voids warranty, requires a steady hand -and more such restrictions....) - - -(*) Cirrus logic CD1864 databook, page 40. - - -Cables for the Specialix IO8+ -============================= - -The pinout of the connectors on the IO8+ is: - - pin short direction long name - name - Pin 1 DCD input Data Carrier Detect - Pin 2 RXD input Receive - Pin 3 DTR/RTS output Data Terminal Ready/Ready To Send - Pin 4 GND - Ground - Pin 5 TXD output Transmit - Pin 6 CTS input Clear To Send - - - -- 6 5 4 3 2 1 -- - | | - | | - | | - | | - +----- -----+ - |__________| - clip - - Front view of an RJ12 connector. Cable moves "into" the paper. - (the plug is ready to plug into your mouth this way...) - - - NULL cable. I don't know who is going to use these except for - testing purposes, but I tested the cards with this cable. (It - took quite a while to figure out, so I'm not going to delete - it. So there! :-) - - - This end goes This end needs - straight into the some twists in - RJ12 plug. the wiring. - IO8+ RJ12 IO8+ RJ12 - 1 DCD white - - - - 1 DCD - 2 RXD black 5 TXD - 3 DTR/RTS red 6 CTS - 4 GND green 4 GND - 5 TXD yellow 2 RXD - 6 CTS blue 3 DTR/RTS - - - Same NULL cable, but now sorted on the second column. - - 1 DCD white - - - - 1 DCD - 5 TXD yellow 2 RXD - 6 CTS blue 3 DTR/RTS - 4 GND green 4 GND - 2 RXD black 5 TXD - 3 DTR/RTS red 6 CTS - - - - This is a modem cable usable for hardware handshaking: - RJ12 DB25 DB9 - 1 DCD white 8 DCD 1 DCD - 2 RXD black 3 RXD 2 RXD - 3 DTR/RTS red 4 RTS 7 RTS - 4 GND green 7 GND 5 GND - 5 TXD yellow 2 TXD 3 TXD - 6 CTS blue 5 CTS 8 CTS - +---- 6 DSR 6 DSR - +---- 20 DTR 4 DTR - - This is a modem cable usable for software handshaking: - It allows you to reset the modem using the DTR ioctls. - I (REW) have never tested this, "but xxxxxxxxxxxxx - says that it works." If you test this, please - tell me and I'll fill in your name on the xxx's. - - RJ12 DB25 DB9 - 1 DCD white 8 DCD 1 DCD - 2 RXD black 3 RXD 2 RXD - 3 DTR/RTS red 20 DTR 4 DTR - 4 GND green 7 GND 5 GND - 5 TXD yellow 2 TXD 3 TXD - 6 CTS blue 5 CTS 8 CTS - +---- 6 DSR 6 DSR - +---- 4 RTS 7 RTS - - I bought a 6 wire flat cable. It was colored as indicated. - Check that yours is the same before you trust me on this. - - -Hardware handshaking issues. -============================ - -The driver can be told to operate in two different ways. The default -behaviour is specialix.sx_rtscts = 0 where the pin behaves as DTR when -hardware handshaking is off. It behaves as the RTS hardware -handshaking signal when hardware handshaking is selected. - -When you use this, you have to use the appropriate cable. The -cable will either be compatible with hardware handshaking or with -software handshaking. So switching on the fly is not really an -option. - -I actually prefer to use the "specialix.sx_rtscts=1" option. -This makes the DTR/RTS pin always an RTS pin, and ioctls to -change DTR are always ignored. I have a cable that is configured -for this. - - -Ports and devices -================= - -Port 0 is the one furthest from the card-edge connector. - -Devices: - -You should make the devices as follows: - -bash -cd /dev -for i in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 \ - 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 -do - echo -n "$i " - mknod /dev/ttyW$i c 75 $i - mknod /dev/cuw$i c 76 $i -done -echo "" - -If your system doesn't come with these devices preinstalled, bug your -linux-vendor about this. They have had ample time to get this -implemented by now. - -You cannot have more than 4 boards in one computer. The card only -supports 4 different interrupts. If you really want this, contact me -about this and I'll give you a few tips (requires soldering iron).... - -If you have enough PCI slots, you can probably use more than 4 PCI -versions of the card though.... - -The PCI version of the card cannot adhere to the mechanical part of -the PCI spec because the 8 serial connectors are simply too large. If -it doesn't fit in your computer, bring back the card. - - ------------------------------------------------------------------------- - - - Fixed bugs and restrictions: - - During initialization, interrupts are blindly turned on. - Having a shadow variable would cause an extra memory - access on every IO instruction. - - The interrupt (on the card) should be disabled when we - don't allocate the Linux end of the interrupt. This allows - a different driver/card to use it while all ports are not in - use..... (a la standard serial port) - == An extra _off variant of the sx_in and sx_out macros are - now available. They don't set the interrupt enable bit. - These are used during initialization. Normal operation uses - the old variant which enables the interrupt line. - - RTS/DTR issue needs to be implemented according to - specialix' spec. - I kind of like the "determinism" of the current - implementation. Compile time flag? - == Ok. Compile time flag! Default is how Specialix likes it. - == Now a config time flag! Gets saved in your config file. Neat! - - Can you set the IO address from the lilo command line? - If you need this, bug me about it, I'll make it. - == Hah! No bugging needed. Fixed! :-) - - Cirrus logic hasn't gotten back to me yet why the CD1865 can - and the CD1864 can't do 115k2. I suspect that this is - because the CD1864 is not rated for 33MHz operation. - Therefore the CD1864 versions of the card can't do 115k2 on - all ports just like the CD1865 versions. The driver does - not block 115k2 on CD1864 cards. - == I called the Cirrus Logic representative here in Holland. - The CD1864 databook is identical to the CD1865 databook, - except for an extra warning at the end. Similar Bit errors - have been observed in testing at 115k2 on both an 1865 and - a 1864 chip. I see no reason why I would prohibit 115k2 on - 1864 chips and not do it on 1865 chips. Actually there is - reason to prohibit it on BOTH chips. I print a warning. - If you use 115k2, you're on your own. - - A spiky CD may send spurious HUPs. Also in CLOCAL??? - -- A fix for this turned out to be counter productive. - Different fix? Current behaviour is acceptable? - -- Maybe the current implementation is correct. If anybody - gets bitten by this, please report, and it will get fixed. - - -- Testing revealed that when in CLOCAL, the problem doesn't - occur. As warned for in the CD1865 manual, the chip may - send modem intr's on a spike. We could filter those out, - but that would be a cludge anyway (You'd still risk getting - a spurious HUP when two spikes occur.)..... - - - - Bugs & restrictions: - - This is a difficult card to autoprobe. - You have to WRITE to the address register to even - read-probe a CD186x register. Disable autodetection? - -- Specialix: any suggestions? - - diff --git a/Documentation/spi/spi-lm70llp b/Documentation/spi/spi-lm70llp index 154bd02220b..34a9cfd746b 100644 --- a/Documentation/spi/spi-lm70llp +++ b/Documentation/spi/spi-lm70llp @@ -13,10 +13,20 @@ Description This driver provides glue code connecting a National Semiconductor LM70 LLP temperature sensor evaluation board to the kernel's SPI core subsystem. +This is a SPI master controller driver. It can be used in conjunction with +(layered under) the LM70 logical driver (a "SPI protocol driver"). In effect, this driver turns the parallel port interface on the eval board into a SPI bus with a single device, which will be driven by the generic LM70 driver (drivers/hwmon/lm70.c). + +Hardware Interfacing +-------------------- +The schematic for this particular board (the LM70EVAL-LLP) is +available (on page 4) here: + + http://www.national.com/appinfo/tempsensors/files/LM70LLPEVALmanual.pdf + The hardware interfacing on the LM70 LLP eval board is as follows: Parallel LM70 LLP diff --git a/Documentation/spi/spi-summary b/Documentation/spi/spi-summary index 8bae2f018d3..0f5122eb282 100644 --- a/Documentation/spi/spi-summary +++ b/Documentation/spi/spi-summary @@ -215,7 +215,7 @@ So for example arch/.../mach-*/board-*.c files might have code like: /* if your mach-* infrastructure doesn't support kernels that can * run on multiple boards, pdata wouldn't benefit from "__init". */ - static struct mysoc_spi_data __init pdata = { ... }; + static struct mysoc_spi_data __initdata pdata = { ... }; static __init board_init(void) { diff --git a/Documentation/stallion.txt b/Documentation/stallion.txt deleted file mode 100644 index 5c4902d9a5b..00000000000 --- a/Documentation/stallion.txt +++ /dev/null @@ -1,392 +0,0 @@ -* NOTE - This is an unmaintained driver. Lantronix, which bought Stallion -technologies, is not active in driver maintenance, and they have no information -on when or if they will have a 2.6 driver. - -James Nelson - 12-12-2004 - -Stallion Multiport Serial Driver Readme ---------------------------------------- - -Copyright (C) 1994-1999, Stallion Technologies. - -Version: 5.5.1 -Date: 28MAR99 - - - -1. INTRODUCTION - -There are two drivers that work with the different families of Stallion -multiport serial boards. One is for the Stallion smart boards - that is -EasyIO, EasyConnection 8/32 and EasyConnection 8/64-PCI, the other for -the true Stallion intelligent multiport boards - EasyConnection 8/64 -(ISA, EISA, MCA), EasyConnection/RA-PCI, ONboard and Brumby. - -If you are using any of the Stallion intelligent multiport boards (Brumby, -ONboard, EasyConnection 8/64 (ISA, EISA, MCA), EasyConnection/RA-PCI) with -Linux you will need to get the driver utility package. This contains a -firmware loader and the firmware images necessary to make the devices operate. - -The Stallion Technologies ftp site, ftp.stallion.com, will always have -the latest version of the driver utility package. - -ftp://ftp.stallion.com/drivers/ata5/Linux/ata-linux-550.tar.gz - -As of the printing of this document the latest version of the driver -utility package is 5.5.0. If a later version is now available then you -should use the latest version. - -If you are using the EasyIO, EasyConnection 8/32 or EasyConnection 8/64-PCI -boards then you don't need this package, although it does have a serial stats -display program. - -If you require DIP switch settings, EISA or MCA configuration files, or any -other information related to Stallion boards then have a look at Stallion's -web pages at http://www.stallion.com. - - - -2. INSTALLATION - -The drivers can be used as loadable modules or compiled into the kernel. -You can choose which when doing a "config" on the kernel. - -All ISA, EISA and MCA boards that you want to use need to be configured into -the driver(s). All PCI boards will be automatically detected when you load -the driver - so they do not need to be entered into the driver(s) -configuration structure. Note that kernel PCI support is required to use PCI -boards. - -There are two methods of configuring ISA, EISA and MCA boards into the drivers. -If using the driver as a loadable module then the simplest method is to pass -the driver configuration as module arguments. The other method is to modify -the driver source to add configuration lines for each board in use. - -If you have pre-built Stallion driver modules then the module argument -configuration method should be used. A lot of Linux distributions come with -pre-built driver modules in /lib/modules/X.Y.Z/misc for the kernel in use. -That makes things pretty simple to get going. - - -2.1 MODULE DRIVER CONFIGURATION: - -The simplest configuration for modules is to use the module load arguments -to configure any ISA, EISA or MCA boards. PCI boards are automatically -detected, so do not need any additional configuration at all. - -If using EasyIO, EasyConnection 8/32 ISA or MCA, or EasyConnection 8/63-PCI -boards then use the "stallion" driver module, Otherwise if you are using -an EasyConnection 8/64 ISA, EISA or MCA, EasyConnection/RA-PCI, ONboard, -Brumby or original Stallion board then use the "istallion" driver module. - -Typically to load up the smart board driver use: - - modprobe stallion - -This will load the EasyIO and EasyConnection 8/32 driver. It will output a -message to say that it loaded and print the driver version number. It will -also print out whether it found the configured boards or not. These messages -may not appear on the console, but typically are always logged to -/var/adm/messages or /var/log/syslog files - depending on how the klogd and -syslogd daemons are setup on your system. - -To load the intelligent board driver use: - - modprobe istallion - -It will output similar messages to the smart board driver. - -If not using an auto-detectable board type (that is a PCI board) then you -will also need to supply command line arguments to the modprobe command -when loading the driver. The general form of the configuration argument is - - board?=[,[,][,]] - -where: - - board? -- specifies the arbitrary board number of this board, - can be in the range 0 to 3. - - name -- textual name of this board. The board name is the common - board name, or any "shortened" version of that. The board - type number may also be used here. - - ioaddr -- specifies the I/O address of this board. This argument is - optional, but should generally be specified. - - addr -- optional second address argument. Some board types require - a second I/O address, some require a memory address. The - exact meaning of this argument depends on the board type. - - irq -- optional IRQ line used by this board. - -Up to 4 board configuration arguments can be specified on the load line. -Here is some examples: - - modprobe stallion board0=easyio,0x2a0,5 - -This configures an EasyIO board as board 0 at I/O address 0x2a0 and IRQ 5. - - modprobe istallion board3=ec8/64,0x2c0,0xcc000 - -This configures an EasyConnection 8/64 ISA as board 3 at I/O address 0x2c0 at -memory address 0xcc000. - - modprobe stallion board1=ec8/32-at,0x2a0,0x280,10 - -This configures an EasyConnection 8/32 ISA board at primary I/O address 0x2a0, -secondary address 0x280 and IRQ 10. - -You will probably want to enter this module load and configuration information -into your system startup scripts so that the drivers are loaded and configured -on each system boot. Typically the start up script would be something like -/etc/modprobe.conf. - - -2.2 STATIC DRIVER CONFIGURATION: - -For static driver configuration you need to modify the driver source code. -Entering ISA, EISA and MCA boards into the driver(s) configuration structure -involves editing the driver(s) source file. It's pretty easy if you follow -the instructions below. Both drivers can support up to 4 boards. The smart -card driver (the stallion.c driver) supports any combination of EasyIO and -EasyConnection 8/32 boards (up to a total of 4). The intelligent driver -supports any combination of ONboards, Brumbys, Stallions and EasyConnection -8/64 (ISA and EISA) boards (up to a total of 4). - -To set up the driver(s) for the boards that you want to use you need to -edit the appropriate driver file and add configuration entries. - -If using EasyIO or EasyConnection 8/32 ISA or MCA boards, - In drivers/char/stallion.c: - - find the definition of the stl_brdconf array (of structures) - near the top of the file - - modify this to match the boards you are going to install - (the comments before this structure should help) - - save and exit - -If using ONboard, Brumby, Stallion or EasyConnection 8/64 (ISA or EISA) -boards, - In drivers/char/istallion.c: - - find the definition of the stli_brdconf array (of structures) - near the top of the file - - modify this to match the boards you are going to install - (the comments before this structure should help) - - save and exit - -Once you have set up the board configurations then you are ready to build -the kernel or modules. - -When the new kernel is booted, or the loadable module loaded then the -driver will emit some kernel trace messages about whether the configured -boards were detected or not. Depending on how your system logger is set -up these may come out on the console, or just be logged to -/var/adm/messages or /var/log/syslog. You should check the messages to -confirm that all is well. - - -2.3 SHARING INTERRUPTS - -It is possible to share interrupts between multiple EasyIO and -EasyConnection 8/32 boards in an EISA system. To do this you must be using -static driver configuration, modifying the driver source code to add driver -configuration. Then a couple of extra things are required: - -1. When entering the board resources into the stallion.c file you need to - mark the boards as using level triggered interrupts. Do this by replacing - the "0" entry at field position 6 (the last field) in the board - configuration structure with a "1". (This is the structure that defines - the board type, I/O locations, etc. for each board). All boards that are - sharing an interrupt must be set this way, and each board should have the - same interrupt number specified here as well. Now build the module or - kernel as you would normally. - -2. When physically installing the boards into the system you must enter - the system EISA configuration utility. You will need to install the EISA - configuration files for *all* the EasyIO and EasyConnection 8/32 boards - that are sharing interrupts. The Stallion EasyIO and EasyConnection 8/32 - EISA configuration files required are supplied by Stallion Technologies - on the EASY Utilities floppy diskette (usually supplied in the box with - the board when purchased. If not, you can pick it up from Stallion's FTP - site, ftp.stallion.com). You will need to edit the board resources to - choose level triggered interrupts, and make sure to set each board's - interrupt to the same IRQ number. - -You must complete both the above steps for this to work. When you reboot -or load the driver your EasyIO and EasyConnection 8/32 boards will be -sharing interrupts. - - -2.4 USING HIGH SHARED MEMORY - -The EasyConnection 8/64-EI, ONboard and Stallion boards are capable of -using shared memory addresses above the usual 640K - 1Mb range. The ONboard -ISA and the Stallion boards can be programmed to use memory addresses up to -16Mb (the ISA bus addressing limit), and the EasyConnection 8/64-EI and -ONboard/E can be programmed for memory addresses up to 4Gb (the EISA bus -addressing limit). - -The higher than 1Mb memory addresses are fully supported by this driver. -Just enter the address as you normally would for a lower than 1Mb address -(in the driver's board configuration structure). - - - -2.5 TROUBLE SHOOTING - -If a board is not found by the driver but is actually in the system then the -most likely problem is that the I/O address is wrong. Change the module load -argument for the loadable module form. Or change it in the driver stallion.c -or istallion.c configuration structure and rebuild the kernel or modules, or -change it on the board. - -On EasyIO and EasyConnection 8/32 boards the IRQ is software programmable, so -if there is a conflict you may need to change the IRQ used for a board. There -are no interrupts to worry about for ONboard, Brumby or EasyConnection 8/64 -(ISA, EISA and MCA) boards. The memory region on EasyConnection 8/64 and -ONboard boards is software programmable, but not on the Brumby boards. - - - -3. USING THE DRIVERS - -3.1 INTELLIGENT DRIVER OPERATION - -The intelligent boards also need to have their "firmware" code downloaded -to them. This is done via a user level application supplied in the driver -utility package called "stlload". Compile this program wherever you dropped -the package files, by typing "make". In its simplest form you can then type - - ./stlload -i cdk.sys - -in this directory and that will download board 0 (assuming board 0 is an -EasyConnection 8/64 or EasyConnection/RA board). To download to an -ONboard, Brumby or Stallion do: - - ./stlload -i 2681.sys - -Normally you would want all boards to be downloaded as part of the standard -system startup. To achieve this, add one of the lines above into the -/etc/rc.d/rc.S or /etc/rc.d/rc.serial file. To download each board just add -the "-b " option to the line. You will need to download code for -every board. You should probably move the stlload program into a system -directory, such as /usr/sbin. Also, the default location of the cdk.sys image -file in the stlload down-loader is /usr/lib/stallion. Create that directory -and put the cdk.sys and 2681.sys files in it. (It's a convenient place to put -them anyway). As an example your /etc/rc.d/rc.S file might have the -following lines added to it (if you had 3 boards): - - /usr/sbin/stlload -b 0 -i /usr/lib/stallion/cdk.sys - /usr/sbin/stlload -b 1 -i /usr/lib/stallion/2681.sys - /usr/sbin/stlload -b 2 -i /usr/lib/stallion/2681.sys - -The image files cdk.sys and 2681.sys are specific to the board types. The -cdk.sys will only function correctly on an EasyConnection 8/64 board. Similarly -the 2681.sys image fill only operate on ONboard, Brumby and Stallion boards. -If you load the wrong image file into a board it will fail to start up, and -of course the ports will not be operational! - -If you are using the modularized version of the driver you might want to put -the modprobe calls in the startup script as well (before the download lines -obviously). - - -3.2 USING THE SERIAL PORTS - -Once the driver is installed you will need to setup some device nodes to -access the serial ports. The simplest method is to use the /dev/MAKEDEV program. -It will automatically create device entries for Stallion boards. This will -create the normal serial port devices as /dev/ttyE# where# is the port number -starting from 0. A bank of 64 minor device numbers is allocated to each board, -so the first port on the second board is port 64,etc. A set of callout type -devices may also be created. They are created as the devices /dev/cue# where # -is the same as for the ttyE devices. - -For the most part the Stallion driver tries to emulate the standard PC system -COM ports and the standard Linux serial driver. The idea is that you should -be able to use Stallion board ports and COM ports interchangeably without -modifying anything but the device name. Anything that doesn't work like that -should be considered a bug in this driver! - -If you look at the driver code you will notice that it is fairly closely -based on the Linux serial driver (linux/drivers/char/serial.c). This is -intentional, obviously this is the easiest way to emulate its behavior! - -Since this driver tries to emulate the standard serial ports as much as -possible, most system utilities should work as they do for the standard -COM ports. Most importantly "stty" works as expected and "setserial" can -also be used (excepting the ability to auto-configure the I/O and IRQ -addresses of boards). Higher baud rates are supported in the usual fashion -through setserial or using the CBAUDEX extensions. Note that the EasyIO and -EasyConnection (all types) support at least 57600 and 115200 baud. The newer -EasyConnection XP modules and new EasyIO boards support 230400 and 460800 -baud as well. The older boards including ONboard and Brumby support a -maximum baud rate of 38400. - -If you are unfamiliar with how to use serial ports, then get the Serial-HOWTO -by Greg Hankins. It will explain everything you need to know! - - - -4. NOTES - -You can use both drivers at once if you have a mix of board types installed -in a system. However to do this you will need to change the major numbers -used by one of the drivers. Currently both drivers use major numbers 24, 25 -and 28 for their devices. Change one driver to use some other major numbers, -and then modify the mkdevnods script to make device nodes based on those new -major numbers. For example, you could change the istallion.c driver to use -major numbers 60, 61 and 62. You will also need to create device nodes with -different names for the ports, for example ttyF# and cuf#. - -The original Stallion board is no longer supported by Stallion Technologies. -Although it is known to work with the istallion driver. - -Finding a free physical memory address range can be a problem. The older -boards like the Stallion and ONboard need large areas (64K or even 128K), so -they can be very difficult to get into a system. If you have 16 Mb of RAM -then you have no choice but to put them somewhere in the 640K -> 1Mb range. -ONboards require 64K, so typically 0xd0000 is good, or 0xe0000 on some -systems. If you have an original Stallion board, "V4.0" or Rev.O, then you -need a 64K memory address space, so again 0xd0000 and 0xe0000 are good. -Older Stallion boards are a much bigger problem. They need 128K of address -space and must be on a 128K boundary. If you don't have a VGA card then -0xc0000 might be usable - there is really no other place you can put them -below 1Mb. - -Both the ONboard and old Stallion boards can use higher memory addresses as -well, but you must have less than 16Mb of RAM to be able to use them. Usual -high memory addresses used include 0xec0000 and 0xf00000. - -The Brumby boards only require 16Kb of address space, so you can usually -squeeze them in somewhere. Common addresses are 0xc8000, 0xcc000, or in -the 0xd0000 range. EasyConnection 8/64 boards are even better, they only -require 4Kb of address space, again usually 0xc8000, 0xcc000 or 0xd0000 -are good. - -If you are using an EasyConnection 8/64-EI or ONboard/E then usually the -0xd0000 or 0xe0000 ranges are the best options below 1Mb. If neither of -them can be used then the high memory support to use the really high address -ranges is the best option. Typically the 2Gb range is convenient for them, -and gets them well out of the way. - -The ports of the EasyIO-8M board do not have DCD or DTR signals. So these -ports cannot be used as real modem devices. Generally, when using these -ports you should only use the cueX devices. - -The driver utility package contains a couple of very useful programs. One -is a serial port statistics collection and display program - very handy -for solving serial port problems. The other is an extended option setting -program that works with the intelligent boards. - - - -5. DISCLAIMER - -The information contained in this document is believed to be accurate and -reliable. However, no responsibility is assumed by Stallion Technologies -Pty. Ltd. for its use, nor any infringements of patents or other rights -of third parties resulting from its use. Stallion Technologies reserves -the right to modify the design of its products and will endeavour to change -the information in manuals and accompanying documentation accordingly. - diff --git a/Documentation/sx.txt b/Documentation/sx.txt deleted file mode 100644 index cb4efa0fb5c..00000000000 --- a/Documentation/sx.txt +++ /dev/null @@ -1,294 +0,0 @@ - - sx.txt -- specialix SX/SI multiport serial driver readme. - - - - Copyright (C) 1997 Roger Wolff (R.E.Wolff@BitWizard.nl) - - Specialix pays for the development and support of this driver. - Please DO contact support@specialix.co.uk if you require - support. - - This driver was developed in the BitWizard linux device - driver service. If you require a linux device driver for your - product, please contact devices@BitWizard.nl for a quote. - - (History) - There used to be an SI driver by Simon Allan. This is a complete - rewrite from scratch. Just a few lines-of-code have been snatched. - - (Sources) - Specialix document number 6210028: SX Host Card and Download Code - Software Functional Specification. - - (Copying) - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of - the License, or (at your option) any later version. - - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR - PURPOSE. See the GNU General Public License for more details. - - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, - USA. - - (Addendum) - I'd appreciate it that if you have fixes, that you send them - to me first. - - -Introduction -============ - -This file contains some random information, that I like to have online -instead of in a manual that can get lost. Ever misplace your Linux -kernel sources? And the manual of one of the boards in your computer? - - -Theory of operation -=================== - -An important thing to know is that the driver itself doesn't have the -firmware for the card. This means that you need the separate package -"sx_firmware". For now you can get the source at - - ftp://ftp.bitwizard.nl/specialix/sx_firmware_.tgz - -The firmware load needs a "misc" device, so you'll need to enable the -"Support for user misc device modules" in your kernel configuration. -The misc device needs to be called "/dev/specialix_sxctl". It needs -misc major 10, and minor number 167 (assigned by HPA). The section -on creating device files below also creates this device. - -After loading the sx.o module into your kernel, the driver will report -the number of cards detected, but because it doesn't have any -firmware, it will not be able to determine the number of ports. Only -when you then run "sx_firmware" will the firmware be downloaded and -the rest of the driver initialized. At that time the sx_firmware -program will report the number of ports installed. - -In contrast with many other multi port serial cards, some of the data -structures are only allocated when the card knows the number of ports -that are connected. This means we won't waste memory for 120 port -descriptor structures when you only have 8 ports. If you experience -problems due to this, please report them: I haven't seen any. - - -Interrupts -========== - -A multi port serial card, would generate a horrendous amount of -interrupts if it would interrupt the CPU for every received -character. Even more than 10 years ago, the trick not to use -interrupts but to poll the serial cards was invented. - -The SX card allow us to do this two ways. First the card limits its -own interrupt rate to a rate that won't overwhelm the CPU. Secondly, -we could forget about the cards interrupt completely and use the -internal timer for this purpose. - -Polling the card can take up to a few percent of your CPU. Using the -interrupts would be better if you have most of the ports idle. Using -timer-based polling is better if your card almost always has work to -do. You save the separate interrupt in that case. - -In any case, it doesn't really matter all that much. - -The most common problem with interrupts is that for ISA cards in a PCI -system the BIOS has to be told to configure that interrupt as "legacy -ISA". Otherwise the card can pull on the interrupt line all it wants -but the CPU won't see this. - -If you can't get the interrupt to work, remember that polling mode is -more efficient (provided you actually use the card intensively). - - -Allowed Configurations -====================== - -Some configurations are disallowed. Even though at a glance they might -seem to work, they are known to lockup the bus between the host card -and the device concentrators. You should respect the drivers decision -not to support certain configurations. It's there for a reason. - -Warning: Seriously technical stuff ahead. Executive summary: Don't use -SX cards except configured at a 64k boundary. Skip the next paragraph. - -The SX cards can theoretically be placed at a 32k boundary. So for -instance you can put an SX card at 0xc8000-0xd7fff. This is not a -"recommended configuration". ISA cards have to tell the bus controller -how they like their timing. Due to timing issues they have to do this -based on which 64k window the address falls into. This means that the -32k window below and above the SX card have to use exactly the same -timing as the SX card. That reportedly works for other SX cards. But -you're still left with two useless 32k windows that should not be used -by anybody else. - - -Configuring the driver -====================== - -PCI cards are always detected. The driver auto-probes for ISA cards at -some sensible addresses. Please report if the auto-probe causes trouble -in your system, or when a card isn't detected. - -I'm afraid I haven't implemented "kernel command line parameters" yet. -This means that if the default doesn't work for you, you shouldn't use -the compiled-into-the-kernel version of the driver. Use a module -instead. If you convince me that you need this, I'll make it for -you. Deal? - -I'm afraid that the module parameters are a bit clumsy. If you have a -better idea, please tell me. - -You can specify several parameters: - - sx_poll: number of jiffies between timer-based polls. - - Set this to "0" to disable timer based polls. - Initialization of cards without a working interrupt - will fail. - - Set this to "1" if you want a polling driver. - (on Intel: 100 polls per second). If you don't use - fast baud rates, you might consider a value like "5". - (If you don't know how to do the math, use 1). - - sx_slowpoll: Number of jiffies between timer-based polls. - Set this to "100" to poll once a second. - This should get the card out of a stall if the driver - ever misses an interrupt. I've never seen this happen, - and if it does, that's a bug. Tell me. - - sx_maxints: Number of interrupts to request from the card. - The card normally limits interrupts to about 100 per - second to offload the host CPU. You can increase this - number to reduce latency on the card a little. - Note that if you give a very high number you can overload - your CPU as well as the CPU on the host card. This setting - is inaccurate and not recommended for SI cards (But it - works). - - sx_irqmask: The mask of allowable IRQs to use. I suggest you set - this to 0 (disable IRQs all together) and use polling if - the assignment of IRQs becomes problematic. This is defined - as the sum of (1 << irq) 's that you want to allow. So - sx_irqmask of 8 (1 << 3) specifies that only irq 3 may - be used by the SX driver. If you want to specify to the - driver: "Either irq 11 or 12 is ok for you to use", then - specify (1 << 11) | (1 << 12) = 0x1800 . - - sx_debug: You can enable different sorts of debug traces with this. - At "-1" all debugging traces are active. You'll get several - times more debugging output than you'll get characters - transmitted. - - -Baud rates -========== - -Theoretically new SXDCs should be capable of more than 460k -baud. However the line drivers usually give up before that. Also the -CPU on the card may not be able to handle 8 channels going at full -blast at that speed. Moreover, the buffers are not large enough to -allow operation with 100 interrupts per second. You'll have to realize -that the card has a 256 byte buffer, so you'll have to increase the -number of interrupts per second if you have more than 256*100 bytes -per second to transmit. If you do any performance testing in this -area, I'd be glad to hear from you... - -(Psst Linux users..... I think the Linux driver is more efficient than -the driver for other OSes. If you can and want to benchmark them -against each other, be my guest, and report your findings...... :-) - - -Ports and devices -================= - -Port 0 is the top connector on the module closest to the host -card. Oh, the ports on the SXDCs and TAs are labelled from 1 to 8 -instead of from 0 to 7, as they are numbered by linux. I'm stubborn in -this: I know for sure that I wouldn't be able to calculate which port -is which anymore if I would change that.... - - -Devices: - -You should make the device files as follows: - -#!/bin/sh -# (I recommend that you cut-and-paste this into a file and run that) -cd /dev -t=0 -mknod specialix_sxctl c 10 167 -while [ $t -lt 64 ] - do - echo -n "$t " - mknod ttyX$t c 32 $t - mknod cux$t c 33 $t - t=`expr $t + 1` -done -echo "" -rm /etc/psdevtab -ps > /dev/null - - -This creates 64 devices. If you have more, increase the constant on -the line with "while". The devices start at 0, as is customary on -Linux. Specialix seems to like starting the numbering at 1. - -If your system doesn't come with these devices pre-installed, bug your -linux-vendor about this. They should have these devices -"pre-installed" before the new millennium. The "ps" stuff at the end -is to "tell" ps that the new devices exist. - -Officially the maximum number of cards per computer is 4. This driver -however supports as many cards in one machine as you want. You'll run -out of interrupts after a few, but you can switch to polled operation -then. At about 256 ports (More than 8 cards), we run out of minor -device numbers. Sorry. I suggest you buy a second computer.... (Or -switch to RIO). - ------------------------------------------------------------------------- - - - Fixed bugs and restrictions: - - Hangup processing. - -- Done. - - - the write path in generic_serial (lockup / oops). - -- Done (Ugly: not the way I want it. Copied from serial.c). - - - write buffer isn't flushed at close. - -- Done. I still seem to lose a few chars at close. - Sorry. I think that this is a firmware issue. (-> Specialix) - - - drain hardware before changing termios - - Change debug on the fly. - - ISA free irq -1. (no firmware loaded). - - adding c8000 as a probe address. Added warning. - - Add a RAMtest for the RAM on the card.c - - Crash when opening a port "way" of the number of allowed ports. - (for example opening port 60 when there are only 24 ports attached) - - Sometimes the use-count strays a bit. After a few hours of - testing the use count is sometimes "3". If you are not like - me and can remember what you did to get it that way, I'd - appreciate an Email. Possibly fixed. Tell me if anyone still - sees this. - - TAs don't work right if you don't connect all the modem control - signals. SXDCs do. T225 firmware problem -> Specialix. - (Mostly fixed now, I think. Tell me if you encounter this!) - - Bugs & restrictions: - - - Arbitrary baud rates. Requires firmware update. (-> Specialix) - - - Low latency (mostly firmware, -> Specialix) - - - diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index d79eeda7a69..3197fc83bc5 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -1,12 +1,13 @@ -Documentation for /proc/sys/vm/* kernel version 2.2.10 +Documentation for /proc/sys/vm/* kernel version 2.6.29 (c) 1998, 1999, Rik van Riel + (c) 2008 Peter W. Morreale For general info and legal blurb, please look in README. ============================================================== This file contains the documentation for the sysctl files in -/proc/sys/vm and is valid for Linux kernel version 2.2. +/proc/sys/vm and is valid for Linux kernel version 2.6.29. The files in this directory can be used to tune the operation of the virtual memory (VM) subsystem of the Linux kernel and @@ -16,178 +17,274 @@ Default values and initialization routines for most of these files can be found in mm/swap.c. Currently, these files are in /proc/sys/vm: -- overcommit_memory -- page-cluster -- dirty_ratio + +- block_dump +- dirty_background_bytes - dirty_background_ratio +- dirty_bytes - dirty_expire_centisecs +- dirty_ratio - dirty_writeback_centisecs -- highmem_is_dirtyable (only if CONFIG_HIGHMEM set) +- drop_caches +- hugepages_treat_as_movable +- hugetlb_shm_group +- laptop_mode +- legacy_va_layout +- lowmem_reserve_ratio - max_map_count - min_free_kbytes -- laptop_mode -- block_dump -- drop-caches -- zone_reclaim_mode -- min_unmapped_ratio - min_slab_ratio -- panic_on_oom -- oom_dump_tasks -- oom_kill_allocating_task -- mmap_min_address -- numa_zonelist_order +- min_unmapped_ratio +- mmap_min_addr - nr_hugepages - nr_overcommit_hugepages +- nr_pdflush_threads +- nr_trim_pages (only if CONFIG_MMU=n) +- numa_zonelist_order +- oom_dump_tasks +- oom_kill_allocating_task +- overcommit_memory +- overcommit_ratio +- page-cluster +- panic_on_oom +- percpu_pagelist_fraction +- stat_interval +- swappiness +- vfs_cache_pressure +- zone_reclaim_mode + ============================================================== -dirty_ratio, dirty_background_ratio, dirty_expire_centisecs, -dirty_writeback_centisecs, highmem_is_dirtyable, -vfs_cache_pressure, laptop_mode, block_dump, swap_token_timeout, -drop-caches, hugepages_treat_as_movable: +block_dump -See Documentation/filesystems/proc.txt +block_dump enables block I/O debugging when set to a nonzero value. More +information on block I/O debugging is in Documentation/laptops/laptop-mode.txt. ============================================================== -overcommit_memory: +dirty_background_bytes -This value contains a flag that enables memory overcommitment. +Contains the amount of dirty memory at which the pdflush background writeback +daemon will start writeback. -When this flag is 0, the kernel attempts to estimate the amount -of free memory left when userspace requests more memory. +If dirty_background_bytes is written, dirty_background_ratio becomes a function +of its value (dirty_background_bytes / the amount of dirtyable system memory). -When this flag is 1, the kernel pretends there is always enough -memory until it actually runs out. +============================================================== -When this flag is 2, the kernel uses a "never overcommit" -policy that attempts to prevent any overcommit of memory. +dirty_background_ratio -This feature can be very useful because there are a lot of -programs that malloc() huge amounts of memory "just-in-case" -and don't use much of it. +Contains, as a percentage of total system memory, the number of pages at which +the pdflush background writeback daemon will start writing out dirty data. -The default value is 0. +============================================================== -See Documentation/vm/overcommit-accounting and -security/commoncap.c::cap_vm_enough_memory() for more information. +dirty_bytes + +Contains the amount of dirty memory at which a process generating disk writes +will itself start writeback. + +If dirty_bytes is written, dirty_ratio becomes a function of its value +(dirty_bytes / the amount of dirtyable system memory). ============================================================== -overcommit_ratio: +dirty_expire_centisecs -When overcommit_memory is set to 2, the committed address -space is not permitted to exceed swap plus this percentage -of physical RAM. See above. +This tunable is used to define when dirty data is old enough to be eligible +for writeout by the pdflush daemons. It is expressed in 100'ths of a second. +Data which has been dirty in-memory for longer than this interval will be +written out next time a pdflush daemon wakes up. + +============================================================== + +dirty_ratio + +Contains, as a percentage of total system memory, the number of pages at which +a process which is generating disk writes will itself start writing out dirty +data. ============================================================== -page-cluster: +dirty_writeback_centisecs -The Linux VM subsystem avoids excessive disk seeks by reading -multiple pages on a page fault. The number of pages it reads -is dependent on the amount of memory in your machine. +The pdflush writeback daemons will periodically wake up and write `old' data +out to disk. This tunable expresses the interval between those wakeups, in +100'ths of a second. -The number of pages the kernel reads in at once is equal to -2 ^ page-cluster. Values above 2 ^ 5 don't make much sense -for swap because we only cluster swap data in 32-page groups. +Setting this to zero disables periodic writeback altogether. ============================================================== -max_map_count: +drop_caches -This file contains the maximum number of memory map areas a process -may have. Memory map areas are used as a side-effect of calling -malloc, directly by mmap and mprotect, and also when loading shared -libraries. +Writing to this will cause the kernel to drop clean caches, dentries and +inodes from memory, causing that memory to become free. -While most applications need less than a thousand maps, certain -programs, particularly malloc debuggers, may consume lots of them, -e.g., up to one or two maps per allocation. +To free pagecache: + echo 1 > /proc/sys/vm/drop_caches +To free dentries and inodes: + echo 2 > /proc/sys/vm/drop_caches +To free pagecache, dentries and inodes: + echo 3 > /proc/sys/vm/drop_caches -The default value is 65536. +As this is a non-destructive operation and dirty objects are not freeable, the +user should run `sync' first. ============================================================== -min_free_kbytes: +hugepages_treat_as_movable -This is used to force the Linux VM to keep a minimum number -of kilobytes free. The VM uses this number to compute a pages_min -value for each lowmem zone in the system. Each lowmem zone gets -a number of reserved free pages based proportionally on its size. +This parameter is only useful when kernelcore= is specified at boot time to +create ZONE_MOVABLE for pages that may be reclaimed or migrated. Huge pages +are not movable so are not normally allocated from ZONE_MOVABLE. A non-zero +value written to hugepages_treat_as_movable allows huge pages to be allocated +from ZONE_MOVABLE. -Some minimal amount of memory is needed to satisfy PF_MEMALLOC -allocations; if you set this to lower than 1024KB, your system will -become subtly broken, and prone to deadlock under high loads. - -Setting this too high will OOM your machine instantly. +Once enabled, the ZONE_MOVABLE is treated as an area of memory the huge +pages pool can easily grow or shrink within. Assuming that applications are +not running that mlock() a lot of memory, it is likely the huge pages pool +can grow to the size of ZONE_MOVABLE by repeatedly entering the desired value +into nr_hugepages and triggering page reclaim. ============================================================== -percpu_pagelist_fraction +hugetlb_shm_group -This is the fraction of pages at most (high mark pcp->high) in each zone that -are allocated for each per cpu page list. The min value for this is 8. It -means that we don't allow more than 1/8th of pages in each zone to be -allocated in any single per_cpu_pagelist. This entry only changes the value -of hot per cpu pagelists. User can specify a number like 100 to allocate -1/100th of each zone to each per cpu page list. +hugetlb_shm_group contains group id that is allowed to create SysV +shared memory segment using hugetlb page. -The batch value of each per cpu pagelist is also updated as a result. It is -set to pcp->high/4. The upper limit of batch is (PAGE_SHIFT * 8) +============================================================== -The initial value is zero. Kernel does not use this value at boot time to set -the high water marks for each per cpu page list. +laptop_mode -=============================================================== +laptop_mode is a knob that controls "laptop mode". All the things that are +controlled by this knob are discussed in Documentation/laptops/laptop-mode.txt. -zone_reclaim_mode: +============================================================== -Zone_reclaim_mode allows someone to set more or less aggressive approaches to -reclaim memory when a zone runs out of memory. If it is set to zero then no -zone reclaim occurs. Allocations will be satisfied from other zones / nodes -in the system. +legacy_va_layout -This is value ORed together of +If non-zero, this sysctl disables the new 32-bit mmap mmap layout - the kernel +will use the legacy (2.4) layout for all processes. -1 = Zone reclaim on -2 = Zone reclaim writes dirty pages out -4 = Zone reclaim swaps pages +============================================================== -zone_reclaim_mode is set during bootup to 1 if it is determined that pages -from remote zones will cause a measurable performance reduction. The -page allocator will then reclaim easily reusable pages (those page -cache pages that are currently not used) before allocating off node pages. +lowmem_reserve_ratio + +For some specialised workloads on highmem machines it is dangerous for +the kernel to allow process memory to be allocated from the "lowmem" +zone. This is because that memory could then be pinned via the mlock() +system call, or by unavailability of swapspace. + +And on large highmem machines this lack of reclaimable lowmem memory +can be fatal. + +So the Linux page allocator has a mechanism which prevents allocations +which _could_ use highmem from using too much lowmem. This means that +a certain amount of lowmem is defended from the possibility of being +captured into pinned user memory. + +(The same argument applies to the old 16 megabyte ISA DMA region. This +mechanism will also defend that region from allocations which could use +highmem or lowmem). + +The `lowmem_reserve_ratio' tunable determines how aggressive the kernel is +in defending these lower zones. + +If you have a machine which uses highmem or ISA DMA and your +applications are using mlock(), or if you are running with no swap then +you probably should change the lowmem_reserve_ratio setting. + +The lowmem_reserve_ratio is an array. You can see them by reading this file. +- +% cat /proc/sys/vm/lowmem_reserve_ratio +256 256 32 +- +Note: # of this elements is one fewer than number of zones. Because the highest + zone's value is not necessary for following calculation. + +But, these values are not used directly. The kernel calculates # of protection +pages for each zones from them. These are shown as array of protection pages +in /proc/zoneinfo like followings. (This is an example of x86-64 box). +Each zone has an array of protection pages like this. + +- +Node 0, zone DMA + pages free 1355 + min 3 + low 3 + high 4 + : + : + numa_other 0 + protection: (0, 2004, 2004, 2004) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + pagesets + cpu: 0 pcp: 0 + : +- +These protections are added to score to judge whether this zone should be used +for page allocation or should be reclaimed. + +In this example, if normal pages (index=2) are required to this DMA zone and +pages_high is used for watermark, the kernel judges this zone should not be +used because pages_free(1355) is smaller than watermark + protection[2] +(4 + 2004 = 2008). If this protection value is 0, this zone would be used for +normal page requirement. If requirement is DMA zone(index=0), protection[0] +(=0) is used. + +zone[i]'s protection[j] is calculated by following expression. + +(i < j): + zone[i]->protection[j] + = (total sums of present_pages from zone[i+1] to zone[j] on the node) + / lowmem_reserve_ratio[i]; +(i = j): + (should not be protected. = 0; +(i > j): + (not necessary, but looks 0) + +The default values of lowmem_reserve_ratio[i] are + 256 (if zone[i] means DMA or DMA32 zone) + 32 (others). +As above expression, they are reciprocal number of ratio. +256 means 1/256. # of protection pages becomes about "0.39%" of total present +pages of higher zones on the node. + +If you would like to protect more pages, smaller values are effective. +The minimum value is 1 (1/1 -> 100%). -It may be beneficial to switch off zone reclaim if the system is -used for a file server and all of memory should be used for caching files -from disk. In that case the caching effect is more important than -data locality. +============================================================== -Allowing zone reclaim to write out pages stops processes that are -writing large amounts of data from dirtying pages on other nodes. Zone -reclaim will write out dirty pages if a zone fills up and so effectively -throttle the process. This may decrease the performance of a single process -since it cannot use all of system memory to buffer the outgoing writes -anymore but it preserve the memory on other nodes so that the performance -of other processes running on other nodes will not be affected. +max_map_count: -Allowing regular swap effectively restricts allocations to the local -node unless explicitly overridden by memory policies or cpuset -configurations. +This file contains the maximum number of memory map areas a process +may have. Memory map areas are used as a side-effect of calling +malloc, directly by mmap and mprotect, and also when loading shared +libraries. -============================================================= +While most applications need less than a thousand maps, certain +programs, particularly malloc debuggers, may consume lots of them, +e.g., up to one or two maps per allocation. -min_unmapped_ratio: +The default value is 65536. -This is available only on NUMA kernels. +============================================================== -A percentage of the total pages in each zone. Zone reclaim will only -occur if more than this percentage of pages are file backed and unmapped. -This is to insure that a minimal amount of local pages is still available for -file I/O even if the node is overallocated. +min_free_kbytes: -The default is 1 percent. +This is used to force the Linux VM to keep a minimum number +of kilobytes free. The VM uses this number to compute a pages_min +value for each lowmem zone in the system. Each lowmem zone gets +a number of reserved free pages based proportionally on its size. + +Some minimal amount of memory is needed to satisfy PF_MEMALLOC +allocations; if you set this to lower than 1024KB, your system will +become subtly broken, and prone to deadlock under high loads. + +Setting this too high will OOM your machine instantly. ============================================================= @@ -209,82 +306,73 @@ and may not be fast. ============================================================= -panic_on_oom +min_unmapped_ratio: -This enables or disables panic on out-of-memory feature. +This is available only on NUMA kernels. -If this is set to 0, the kernel will kill some rogue process, -called oom_killer. Usually, oom_killer can kill rogue processes and -system will survive. +A percentage of the total pages in each zone. Zone reclaim will only +occur if more than this percentage of pages are file backed and unmapped. +This is to insure that a minimal amount of local pages is still available for +file I/O even if the node is overallocated. -If this is set to 1, the kernel panics when out-of-memory happens. -However, if a process limits using nodes by mempolicy/cpusets, -and those nodes become memory exhaustion status, one process -may be killed by oom-killer. No panic occurs in this case. -Because other nodes' memory may be free. This means system total status -may be not fatal yet. +The default is 1 percent. -If this is set to 2, the kernel panics compulsorily even on the -above-mentioned. +============================================================== -The default value is 0. -1 and 2 are for failover of clustering. Please select either -according to your policy of failover. +mmap_min_addr -============================================================= +This file indicates the amount of address space which a user process will +be restricted from mmaping. Since kernel null dereference bugs could +accidentally operate based on the information in the first couple of pages +of memory userspace processes should not be allowed to write to them. By +default this value is set to 0 and no protections will be enforced by the +security module. Setting this value to something like 64k will allow the +vast majority of applications to work correctly and provide defense in depth +against future potential kernel bugs. -oom_dump_tasks +============================================================== -Enables a system-wide task dump (excluding kernel threads) to be -produced when the kernel performs an OOM-killing and includes such -information as pid, uid, tgid, vm size, rss, cpu, oom_adj score, and -name. This is helpful to determine why the OOM killer was invoked -and to identify the rogue task that caused it. +nr_hugepages -If this is set to zero, this information is suppressed. On very -large systems with thousands of tasks it may not be feasible to dump -the memory state information for each one. Such systems should not -be forced to incur a performance penalty in OOM conditions when the -information may not be desired. +Change the minimum size of the hugepage pool. -If this is set to non-zero, this information is shown whenever the -OOM killer actually kills a memory-hogging task. +See Documentation/vm/hugetlbpage.txt -The default value is 0. +============================================================== -============================================================= +nr_overcommit_hugepages -oom_kill_allocating_task +Change the maximum size of the hugepage pool. The maximum is +nr_hugepages + nr_overcommit_hugepages. -This enables or disables killing the OOM-triggering task in -out-of-memory situations. +See Documentation/vm/hugetlbpage.txt -If this is set to zero, the OOM killer will scan through the entire -tasklist and select a task based on heuristics to kill. This normally -selects a rogue memory-hogging task that frees up a large amount of -memory when killed. +============================================================== -If this is set to non-zero, the OOM killer simply kills the task that -triggered the out-of-memory condition. This avoids the expensive -tasklist scan. +nr_pdflush_threads -If panic_on_oom is selected, it takes precedence over whatever value -is used in oom_kill_allocating_task. +The current number of pdflush threads. This value is read-only. +The value changes according to the number of dirty pages in the system. -The default value is 0. +When neccessary, additional pdflush threads are created, one per second, up to +nr_pdflush_threads_max. ============================================================== -mmap_min_addr +nr_trim_pages -This file indicates the amount of address space which a user process will -be restricted from mmaping. Since kernel null dereference bugs could -accidentally operate based on the information in the first couple of pages -of memory userspace processes should not be allowed to write to them. By -default this value is set to 0 and no protections will be enforced by the -security module. Setting this value to something like 64k will allow the -vast majority of applications to work correctly and provide defense in depth -against future potential kernel bugs. +This is available only on NOMMU kernels. + +This value adjusts the excess page trimming behaviour of power-of-2 aligned +NOMMU mmap allocations. + +A value of 0 disables trimming of allocations entirely, while a value of 1 +trims excess pages aggressively. Any value >= 1 acts as the watermark where +trimming of allocations is initiated. + +The default value is 1. + +See Documentation/nommu-mmap.txt for more information. ============================================================== @@ -333,17 +421,199 @@ this is causing problems for your system/application. ============================================================== -nr_hugepages +oom_dump_tasks -Change the minimum size of the hugepage pool. +Enables a system-wide task dump (excluding kernel threads) to be +produced when the kernel performs an OOM-killing and includes such +information as pid, uid, tgid, vm size, rss, cpu, oom_adj score, and +name. This is helpful to determine why the OOM killer was invoked +and to identify the rogue task that caused it. -See Documentation/vm/hugetlbpage.txt +If this is set to zero, this information is suppressed. On very +large systems with thousands of tasks it may not be feasible to dump +the memory state information for each one. Such systems should not +be forced to incur a performance penalty in OOM conditions when the +information may not be desired. + +If this is set to non-zero, this information is shown whenever the +OOM killer actually kills a memory-hogging task. + +The default value is 0. ============================================================== -nr_overcommit_hugepages +oom_kill_allocating_task -Change the maximum size of the hugepage pool. The maximum is -nr_hugepages + nr_overcommit_hugepages. +This enables or disables killing the OOM-triggering task in +out-of-memory situations. -See Documentation/vm/hugetlbpage.txt +If this is set to zero, the OOM killer will scan through the entire +tasklist and select a task based on heuristics to kill. This normally +selects a rogue memory-hogging task that frees up a large amount of +memory when killed. + +If this is set to non-zero, the OOM killer simply kills the task that +triggered the out-of-memory condition. This avoids the expensive +tasklist scan. + +If panic_on_oom is selected, it takes precedence over whatever value +is used in oom_kill_allocating_task. + +The default value is 0. + +============================================================== + +overcommit_memory: + +This value contains a flag that enables memory overcommitment. + +When this flag is 0, the kernel attempts to estimate the amount +of free memory left when userspace requests more memory. + +When this flag is 1, the kernel pretends there is always enough +memory until it actually runs out. + +When this flag is 2, the kernel uses a "never overcommit" +policy that attempts to prevent any overcommit of memory. + +This feature can be very useful because there are a lot of +programs that malloc() huge amounts of memory "just-in-case" +and don't use much of it. + +The default value is 0. + +See Documentation/vm/overcommit-accounting and +security/commoncap.c::cap_vm_enough_memory() for more information. + +============================================================== + +overcommit_ratio: + +When overcommit_memory is set to 2, the committed address +space is not permitted to exceed swap plus this percentage +of physical RAM. See above. + +============================================================== + +page-cluster + +page-cluster controls the number of pages which are written to swap in +a single attempt. The swap I/O size. + +It is a logarithmic value - setting it to zero means "1 page", setting +it to 1 means "2 pages", setting it to 2 means "4 pages", etc. + +The default value is three (eight pages at a time). There may be some +small benefits in tuning this to a different value if your workload is +swap-intensive. + +============================================================= + +panic_on_oom + +This enables or disables panic on out-of-memory feature. + +If this is set to 0, the kernel will kill some rogue process, +called oom_killer. Usually, oom_killer can kill rogue processes and +system will survive. + +If this is set to 1, the kernel panics when out-of-memory happens. +However, if a process limits using nodes by mempolicy/cpusets, +and those nodes become memory exhaustion status, one process +may be killed by oom-killer. No panic occurs in this case. +Because other nodes' memory may be free. This means system total status +may be not fatal yet. + +If this is set to 2, the kernel panics compulsorily even on the +above-mentioned. + +The default value is 0. +1 and 2 are for failover of clustering. Please select either +according to your policy of failover. + +============================================================= + +percpu_pagelist_fraction + +This is the fraction of pages at most (high mark pcp->high) in each zone that +are allocated for each per cpu page list. The min value for this is 8. It +means that we don't allow more than 1/8th of pages in each zone to be +allocated in any single per_cpu_pagelist. This entry only changes the value +of hot per cpu pagelists. User can specify a number like 100 to allocate +1/100th of each zone to each per cpu page list. + +The batch value of each per cpu pagelist is also updated as a result. It is +set to pcp->high/4. The upper limit of batch is (PAGE_SHIFT * 8) + +The initial value is zero. Kernel does not use this value at boot time to set +the high water marks for each per cpu page list. + +============================================================== + +stat_interval + +The time interval between which vm statistics are updated. The default +is 1 second. + +============================================================== + +swappiness + +This control is used to define how aggressive the kernel will swap +memory pages. Higher values will increase agressiveness, lower values +descrease the amount of swap. + +The default value is 60. + +============================================================== + +vfs_cache_pressure +------------------ + +Controls the tendency of the kernel to reclaim the memory which is used for +caching of directory and inode objects. + +At the default value of vfs_cache_pressure=100 the kernel will attempt to +reclaim dentries and inodes at a "fair" rate with respect to pagecache and +swapcache reclaim. Decreasing vfs_cache_pressure causes the kernel to prefer +to retain dentry and inode caches. Increasing vfs_cache_pressure beyond 100 +causes the kernel to prefer to reclaim dentries and inodes. + +============================================================== + +zone_reclaim_mode: + +Zone_reclaim_mode allows someone to set more or less aggressive approaches to +reclaim memory when a zone runs out of memory. If it is set to zero then no +zone reclaim occurs. Allocations will be satisfied from other zones / nodes +in the system. + +This is value ORed together of + +1 = Zone reclaim on +2 = Zone reclaim writes dirty pages out +4 = Zone reclaim swaps pages + +zone_reclaim_mode is set during bootup to 1 if it is determined that pages +from remote zones will cause a measurable performance reduction. The +page allocator will then reclaim easily reusable pages (those page +cache pages that are currently not used) before allocating off node pages. + +It may be beneficial to switch off zone reclaim if the system is +used for a file server and all of memory should be used for caching files +from disk. In that case the caching effect is more important than +data locality. + +Allowing zone reclaim to write out pages stops processes that are +writing large amounts of data from dirtying pages on other nodes. Zone +reclaim will write out dirty pages if a zone fills up and so effectively +throttle the process. This may decrease the performance of a single process +since it cannot use all of system memory to buffer the outgoing writes +anymore but it preserve the memory on other nodes so that the performance +of other processes running on other nodes will not be affected. + +Allowing regular swap effectively restricts allocations to the local +node unless explicitly overridden by memory policies or cpuset +configurations. + +============ End of Document ================================= diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt index 10a0263ebb3..9e592c718af 100644 --- a/Documentation/sysrq.txt +++ b/Documentation/sysrq.txt @@ -1,6 +1,5 @@ Linux Magic System Request Key Hacks Documentation for sysrq.c -Last update: 2007-AUG-04 * What is the magic SysRq key? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -211,6 +210,24 @@ within a function called by handle_sysrq, you must be aware that you are in a lock (you are also in an interrupt handler, which means don't sleep!), so you must call __handle_sysrq_nolock instead. +* When I hit a SysRq key combination only the header appears on the console? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Sysrq output is subject to the same console loglevel control as all +other console output. This means that if the kernel was booted 'quiet' +as is common on distro kernels the output may not appear on the actual +console, even though it will appear in the dmesg buffer, and be accessible +via the dmesg command and to the consumers of /proc/kmsg. As a specific +exception the header line from the sysrq command is passed to all console +consumers as if the current loglevel was maximum. If only the header +is emitted it is almost certain that the kernel loglevel is too low. +Should you require the output on the console channel then you will need +to temporarily up the console loglevel using alt-sysrq-8 or: + + echo 8 > /proc/sysrq-trigger + +Remember to return the loglevel to normal after triggering the sysrq +command you are interested in. + * I have more questions, who can I ask? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ And I'll answer any questions about the registration system you got, also diff --git a/Documentation/tracepoints.txt b/Documentation/tracepoints.txt index 5d354e16749..6f0a044f5b5 100644 --- a/Documentation/tracepoints.txt +++ b/Documentation/tracepoints.txt @@ -3,28 +3,30 @@ Mathieu Desnoyers -This document introduces Linux Kernel Tracepoints and their use. It provides -examples of how to insert tracepoints in the kernel and connect probe functions -to them and provides some examples of probe functions. +This document introduces Linux Kernel Tracepoints and their use. It +provides examples of how to insert tracepoints in the kernel and +connect probe functions to them and provides some examples of probe +functions. * Purpose of tracepoints -A tracepoint placed in code provides a hook to call a function (probe) that you -can provide at runtime. A tracepoint can be "on" (a probe is connected to it) or -"off" (no probe is attached). When a tracepoint is "off" it has no effect, -except for adding a tiny time penalty (checking a condition for a branch) and -space penalty (adding a few bytes for the function call at the end of the -instrumented function and adds a data structure in a separate section). When a -tracepoint is "on", the function you provide is called each time the tracepoint -is executed, in the execution context of the caller. When the function provided -ends its execution, it returns to the caller (continuing from the tracepoint -site). +A tracepoint placed in code provides a hook to call a function (probe) +that you can provide at runtime. A tracepoint can be "on" (a probe is +connected to it) or "off" (no probe is attached). When a tracepoint is +"off" it has no effect, except for adding a tiny time penalty +(checking a condition for a branch) and space penalty (adding a few +bytes for the function call at the end of the instrumented function +and adds a data structure in a separate section). When a tracepoint +is "on", the function you provide is called each time the tracepoint +is executed, in the execution context of the caller. When the function +provided ends its execution, it returns to the caller (continuing from +the tracepoint site). You can put tracepoints at important locations in the code. They are lightweight hooks that can pass an arbitrary number of parameters, -which prototypes are described in a tracepoint declaration placed in a header -file. +which prototypes are described in a tracepoint declaration placed in a +header file. They can be used for tracing and performance accounting. @@ -42,14 +44,16 @@ In include/trace/subsys.h : #include -DEFINE_TRACE(subsys_eventname, - TPPTOTO(int firstarg, struct task_struct *p), +DECLARE_TRACE(subsys_eventname, + TPPROTO(int firstarg, struct task_struct *p), TPARGS(firstarg, p)); In subsys/file.c (where the tracing statement must be added) : #include +DEFINE_TRACE(subsys_eventname); + void somefct(void) { ... @@ -61,31 +65,41 @@ Where : - subsys_eventname is an identifier unique to your event - subsys is the name of your subsystem. - eventname is the name of the event to trace. -- TPPTOTO(int firstarg, struct task_struct *p) is the prototype of the function - called by this tracepoint. -- TPARGS(firstarg, p) are the parameters names, same as found in the prototype. -Connecting a function (probe) to a tracepoint is done by providing a probe -(function to call) for the specific tracepoint through -register_trace_subsys_eventname(). Removing a probe is done through -unregister_trace_subsys_eventname(); it will remove the probe sure there is no -caller left using the probe when it returns. Probe removal is preempt-safe -because preemption is disabled around the probe call. See the "Probe example" -section below for a sample probe module. - -The tracepoint mechanism supports inserting multiple instances of the same -tracepoint, but a single definition must be made of a given tracepoint name over -all the kernel to make sure no type conflict will occur. Name mangling of the -tracepoints is done using the prototypes to make sure typing is correct. -Verification of probe type correctness is done at the registration site by the -compiler. Tracepoints can be put in inline functions, inlined static functions, -and unrolled loops as well as regular functions. - -The naming scheme "subsys_event" is suggested here as a convention intended -to limit collisions. Tracepoint names are global to the kernel: they are -considered as being the same whether they are in the core kernel image or in -modules. +- TPPROTO(int firstarg, struct task_struct *p) is the prototype of the + function called by this tracepoint. +- TPARGS(firstarg, p) are the parameters names, same as found in the + prototype. + +Connecting a function (probe) to a tracepoint is done by providing a +probe (function to call) for the specific tracepoint through +register_trace_subsys_eventname(). Removing a probe is done through +unregister_trace_subsys_eventname(); it will remove the probe. + +tracepoint_synchronize_unregister() must be called before the end of +the module exit function to make sure there is no caller left using +the probe. This, and the fact that preemption is disabled around the +probe call, make sure that probe removal and module unload are safe. +See the "Probe example" section below for a sample probe module. + +The tracepoint mechanism supports inserting multiple instances of the +same tracepoint, but a single definition must be made of a given +tracepoint name over all the kernel to make sure no type conflict will +occur. Name mangling of the tracepoints is done using the prototypes +to make sure typing is correct. Verification of probe type correctness +is done at the registration site by the compiler. Tracepoints can be +put in inline functions, inlined static functions, and unrolled loops +as well as regular functions. + +The naming scheme "subsys_event" is suggested here as a convention +intended to limit collisions. Tracepoint names are global to the +kernel: they are considered as being the same whether they are in the +core kernel image or in modules. + +If the tracepoint has to be used in kernel modules, an +EXPORT_TRACEPOINT_SYMBOL_GPL() or EXPORT_TRACEPOINT_SYMBOL() can be +used to export the defined tracepoints. * Probe / tracepoint example diff --git a/Documentation/tracers/mmiotrace.txt b/Documentation/tracers/mmiotrace.txt index 5bbbe209622..cde23b4a12a 100644 --- a/Documentation/tracers/mmiotrace.txt +++ b/Documentation/tracers/mmiotrace.txt @@ -37,7 +37,7 @@ $ echo mmiotrace > /debug/tracing/current_tracer $ cat /debug/tracing/trace_pipe > mydump.txt & Start X or whatever. $ echo "X is up" > /debug/tracing/trace_marker -$ echo none > /debug/tracing/current_tracer +$ echo nop > /debug/tracing/current_tracer Check for lost events. @@ -66,7 +66,7 @@ which action. It is recommended to place descriptive markers about what you do. Shut down mmiotrace (requires root privileges): -$ echo none > /debug/tracing/current_tracer +$ echo nop > /debug/tracing/current_tracer The 'cat' process exits. If it does not, kill it by issuing 'fg' command and pressing ctrl+c. @@ -81,7 +81,9 @@ are: $ cat /debug/tracing/trace_entries gives you a number. Approximately double this number and write it back, for instance: +$ echo 0 > /debug/tracing/tracing_enabled $ echo 128000 > /debug/tracing/trace_entries +$ echo 1 > /debug/tracing/tracing_enabled Then start again from the top. If you are doing a trace for a driver project, e.g. Nouveau, you should also diff --git a/Documentation/tty.txt b/Documentation/tty.txt deleted file mode 100644 index 8e65c4498c5..00000000000 --- a/Documentation/tty.txt +++ /dev/null @@ -1,292 +0,0 @@ - - The Lockronomicon - -Your guide to the ancient and twisted locking policies of the tty layer and -the warped logic behind them. Beware all ye who read on. - -FIXME: still need to work out the full set of BKL assumptions and document -them so they can eventually be killed off. - - -Line Discipline ---------------- - -Line disciplines are registered with tty_register_ldisc() passing the -discipline number and the ldisc structure. At the point of registration the -discipline must be ready to use and it is possible it will get used before -the call returns success. If the call returns an error then it won't get -called. Do not re-use ldisc numbers as they are part of the userspace ABI -and writing over an existing ldisc will cause demons to eat your computer. -After the return the ldisc data has been copied so you may free your own -copy of the structure. You must not re-register over the top of the line -discipline even with the same data or your computer again will be eaten by -demons. - -In order to remove a line discipline call tty_unregister_ldisc(). -In ancient times this always worked. In modern times the function will -return -EBUSY if the ldisc is currently in use. Since the ldisc referencing -code manages the module counts this should not usually be a concern. - -Heed this warning: the reference count field of the registered copies of the -tty_ldisc structure in the ldisc table counts the number of lines using this -discipline. The reference count of the tty_ldisc structure within a tty -counts the number of active users of the ldisc at this instant. In effect it -counts the number of threads of execution within an ldisc method (plus those -about to enter and exit although this detail matters not). - -Line Discipline Methods ------------------------ - -TTY side interfaces: - -open() - Called when the line discipline is attached to - the terminal. No other call into the line - discipline for this tty will occur until it - completes successfully. Can sleep. - -close() - This is called on a terminal when the line - discipline is being unplugged. At the point of - execution no further users will enter the - ldisc code for this tty. Can sleep. - -hangup() - Called when the tty line is hung up. - The line discipline should cease I/O to the tty. - No further calls into the ldisc code will occur. - Can sleep. - -write() - A process is writing data through the line - discipline. Multiple write calls are serialized - by the tty layer for the ldisc. May sleep. - -flush_buffer() - (optional) May be called at any point between - open and close, and instructs the line discipline - to empty its input buffer. - -chars_in_buffer() - (optional) Report the number of bytes in the input - buffer. - -set_termios() - (optional) Called on termios structure changes. - The caller passes the old termios data and the - current data is in the tty. Called under the - termios semaphore so allowed to sleep. Serialized - against itself only. - -read() - Move data from the line discipline to the user. - Multiple read calls may occur in parallel and the - ldisc must deal with serialization issues. May - sleep. - -poll() - Check the status for the poll/select calls. Multiple - poll calls may occur in parallel. May sleep. - -ioctl() - Called when an ioctl is handed to the tty layer - that might be for the ldisc. Multiple ioctl calls - may occur in parallel. May sleep. - -Driver Side Interfaces: - -receive_buf() - Hand buffers of bytes from the driver to the ldisc - for processing. Semantics currently rather - mysterious 8( - -write_wakeup() - May be called at any point between open and close. - The TTY_DO_WRITE_WAKEUP flag indicates if a call - is needed but always races versus calls. Thus the - ldisc must be careful about setting order and to - handle unexpected calls. Must not sleep. - - The driver is forbidden from calling this directly - from the ->write call from the ldisc as the ldisc - is permitted to call the driver write method from - this function. In such a situation defer it. - - -Driver Access - -Line discipline methods can call the following methods of the underlying -hardware driver through the function pointers within the tty->driver -structure: - -write() Write a block of characters to the tty device. - Returns the number of characters accepted. The - character buffer passed to this method is already - in kernel space. - -put_char() Queues a character for writing to the tty device. - If there is no room in the queue, the character is - ignored. - -flush_chars() (Optional) If defined, must be called after - queueing characters with put_char() in order to - start transmission. - -write_room() Returns the numbers of characters the tty driver - will accept for queueing to be written. - -ioctl() Invoke device specific ioctl. - Expects data pointers to refer to userspace. - Returns ENOIOCTLCMD for unrecognized ioctl numbers. - -set_termios() Notify the tty driver that the device's termios - settings have changed. New settings are in - tty->termios. Previous settings should be passed in - the "old" argument. - - The API is defined such that the driver should return - the actual modes selected. This means that the - driver function is responsible for modifying any - bits in the request it cannot fulfill to indicate - the actual modes being used. A device with no - hardware capability for change (eg a USB dongle or - virtual port) can provide NULL for this method. - -throttle() Notify the tty driver that input buffers for the - line discipline are close to full, and it should - somehow signal that no more characters should be - sent to the tty. - -unthrottle() Notify the tty driver that characters can now be - sent to the tty without fear of overrunning the - input buffers of the line disciplines. - -stop() Ask the tty driver to stop outputting characters - to the tty device. - -start() Ask the tty driver to resume sending characters - to the tty device. - -hangup() Ask the tty driver to hang up the tty device. - -break_ctl() (Optional) Ask the tty driver to turn on or off - BREAK status on the RS-232 port. If state is -1, - then the BREAK status should be turned on; if - state is 0, then BREAK should be turned off. - If this routine is not implemented, use ioctls - TIOCSBRK / TIOCCBRK instead. - -wait_until_sent() Waits until the device has written out all of the - characters in its transmitter FIFO. - -send_xchar() Send a high-priority XON/XOFF character to the device. - - -Flags - -Line discipline methods have access to tty->flags field containing the -following interesting flags: - -TTY_THROTTLED Driver input is throttled. The ldisc should call - tty->driver->unthrottle() in order to resume - reception when it is ready to process more data. - -TTY_DO_WRITE_WAKEUP If set, causes the driver to call the ldisc's - write_wakeup() method in order to resume - transmission when it can accept more data - to transmit. - -TTY_IO_ERROR If set, causes all subsequent userspace read/write - calls on the tty to fail, returning -EIO. - -TTY_OTHER_CLOSED Device is a pty and the other side has closed. - -TTY_NO_WRITE_SPLIT Prevent driver from splitting up writes into - smaller chunks. - - -Locking - -Callers to the line discipline functions from the tty layer are required to -take line discipline locks. The same is true of calls from the driver side -but not yet enforced. - -Three calls are now provided - - ldisc = tty_ldisc_ref(tty); - -takes a handle to the line discipline in the tty and returns it. If no ldisc -is currently attached or the ldisc is being closed and re-opened at this -point then NULL is returned. While this handle is held the ldisc will not -change or go away. - - tty_ldisc_deref(ldisc) - -Returns the ldisc reference and allows the ldisc to be closed. Returning the -reference takes away your right to call the ldisc functions until you take -a new reference. - - ldisc = tty_ldisc_ref_wait(tty); - -Performs the same function as tty_ldisc_ref except that it will wait for an -ldisc change to complete and then return a reference to the new ldisc. - -While these functions are slightly slower than the old code they should have -minimal impact as most receive logic uses the flip buffers and they only -need to take a reference when they push bits up through the driver. - -A caution: The ldisc->open(), ldisc->close() and driver->set_ldisc -functions are called with the ldisc unavailable. Thus tty_ldisc_ref will -fail in this situation if used within these functions. Ldisc and driver -code calling its own functions must be careful in this case. - - -Driver Interface ----------------- - -open() - Called when a device is opened. May sleep - -close() - Called when a device is closed. At the point of - return from this call the driver must make no - further ldisc calls of any kind. May sleep - -write() - Called to write bytes to the device. May not - sleep. May occur in parallel in special cases. - Because this includes panic paths drivers generally - shouldn't try and do clever locking here. - -put_char() - Stuff a single character onto the queue. The - driver is guaranteed following up calls to - flush_chars. - -flush_chars() - Ask the kernel to write put_char queue - -write_room() - Return the number of characters tht can be stuffed - into the port buffers without overflow (or less). - The ldisc is responsible for being intelligent - about multi-threading of write_room/write calls - -ioctl() - Called when an ioctl may be for the driver - -set_termios() - Called on termios change, serialized against - itself by a semaphore. May sleep. - -set_ldisc() - Notifier for discipline change. At the point this - is done the discipline is not yet usable. Can now - sleep (I think) - -throttle() - Called by the ldisc to ask the driver to do flow - control. Serialization including with unthrottle - is the job of the ldisc layer. - -unthrottle() - Called by the ldisc to ask the driver to stop flow - control. - -stop() - Ldisc notifier to the driver to stop output. As with - throttle the serializations with start() are down - to the ldisc layer. - -start() - Ldisc notifier to the driver to start output. - -hangup() - Ask the tty driver to cause a hangup initiated - from the host side. [Can sleep ??] - -break_ctl() - Send RS232 break. Can sleep. Can get called in - parallel, driver must serialize (for now), and - with write calls. - -wait_until_sent() - Wait for characters to exit the hardware queue - of the driver. Can sleep - -send_xchar() - Send XON/XOFF and if possible jump the queue with - it in order to get fast flow control responses. - Cannot sleep ?? - diff --git a/Documentation/usb/gadget_serial.txt b/Documentation/usb/gadget_serial.txt index 9b22bd14c34..eac7df94d8e 100644 --- a/Documentation/usb/gadget_serial.txt +++ b/Documentation/usb/gadget_serial.txt @@ -114,11 +114,11 @@ modules. Then you must load the gadget serial driver. To load it as an ACM device (recommended for interoperability), do this: - modprobe g_serial use_acm=1 + modprobe g_serial To load it as a vendor specific bulk in/out device, do this: - modprobe g_serial + modprobe g_serial use_acm=0 This will also automatically load the underlying gadget peripheral controller driver. This must be done each time you reboot the gadget diff --git a/Documentation/usb/power-management.txt b/Documentation/usb/power-management.txt index e48ea1d5101..ad642615ad4 100644 --- a/Documentation/usb/power-management.txt +++ b/Documentation/usb/power-management.txt @@ -313,11 +313,13 @@ three of the methods listed above. In addition, a driver indicates that it supports autosuspend by setting the .supports_autosuspend flag in its usb_driver structure. It is then responsible for informing the USB core whenever one of its interfaces becomes busy or idle. The -driver does so by calling these three functions: +driver does so by calling these five functions: int usb_autopm_get_interface(struct usb_interface *intf); void usb_autopm_put_interface(struct usb_interface *intf); int usb_autopm_set_interface(struct usb_interface *intf); + int usb_autopm_get_interface_async(struct usb_interface *intf); + void usb_autopm_put_interface_async(struct usb_interface *intf); The functions work by maintaining a counter in the usb_interface structure. When intf->pm_usage_count is > 0 then the interface is @@ -330,10 +332,12 @@ associated with the device itself rather than any of its interfaces. This field is used only by the USB core.) The driver owns intf->pm_usage_count; it can modify the value however -and whenever it likes. A nice aspect of the usb_autopm_* routines is -that the changes they make are protected by the usb_device structure's -PM mutex (udev->pm_mutex); however drivers may change pm_usage_count -without holding the mutex. +and whenever it likes. A nice aspect of the non-async usb_autopm_* +routines is that the changes they make are protected by the usb_device +structure's PM mutex (udev->pm_mutex); however drivers may change +pm_usage_count without holding the mutex. Drivers using the async +routines are responsible for their own synchronization and mutual +exclusion. usb_autopm_get_interface() increments pm_usage_count and attempts an autoresume if the new value is > 0 and the @@ -348,6 +352,14 @@ without holding the mutex. is suspended, and it attempts an autosuspend if the value is <= 0 and the device isn't suspended. + usb_autopm_get_interface_async() and + usb_autopm_put_interface_async() do almost the same things as + their non-async counterparts. The differences are: they do + not acquire the PM mutex, and they use a workqueue to do their + jobs. As a result they can be called in an atomic context, + such as an URB's completion handler, but when they return the + device will not generally not yet be in the desired state. + There also are a couple of utility routines drivers can use: usb_autopm_enable() sets pm_usage_cnt to 0 and then calls diff --git a/Documentation/usb/proc_usb_info.txt b/Documentation/usb/proc_usb_info.txt index 077e9032d0c..fafcd472326 100644 --- a/Documentation/usb/proc_usb_info.txt +++ b/Documentation/usb/proc_usb_info.txt @@ -49,8 +49,10 @@ it and 002/048 sometime later. These files can be read as binary data. The binary data consists of first the device descriptor, then the descriptors for each -configuration of the device. That information is also shown in -text form by the /proc/bus/usb/devices file, described later. +configuration of the device. Multi-byte fields in the device and +configuration descriptors, but not other descriptors, are converted +to host endianness by the kernel. This information is also shown +in text form by the /proc/bus/usb/devices file, described later. These files may also be used to write user-level drivers for the USB devices. You would open the /proc/bus/usb/BBB/DDD file read/write, diff --git a/Documentation/usb/usbmon.txt b/Documentation/usb/usbmon.txt index 2917ce4ffdc..270481906dc 100644 --- a/Documentation/usb/usbmon.txt +++ b/Documentation/usb/usbmon.txt @@ -34,11 +34,12 @@ if usbmon is built into the kernel. Verify that bus sockets are present. # ls /sys/kernel/debug/usbmon -0s 0t 0u 1s 1t 1u 2s 2t 2u 3s 3t 3u 4s 4t 4u +0s 0u 1s 1t 1u 2s 2t 2u 3s 3t 3u 4s 4t 4u # -Now you can choose to either use the sockets numbered '0' (to capture packets on -all buses), and skip to step #3, or find the bus used by your device with step #2. +Now you can choose to either use the socket '0u' (to capture packets on all +buses), and skip to step #3, or find the bus used by your device with step #2. +This allows to filter away annoying devices that talk continuously. 2. Find which bus connects to the desired device @@ -99,8 +100,9 @@ on the event type, but there is a set of words, common for all types. Here is the list of words, from left to right: -- URB Tag. This is used to identify URBs is normally a kernel mode address - of the URB structure in hexadecimal. +- URB Tag. This is used to identify URBs, and is normally an in-kernel address + of the URB structure in hexadecimal, but can be a sequence number or any + other unique string, within reason. - Timestamp in microseconds, a decimal number. The timestamp's resolution depends on available clock, and so it can be much worse than a microsecond diff --git a/Documentation/usb/wusb-cbaf b/Documentation/usb/wusb-cbaf index 2e78b70f3ad..426ddaaef96 100644 --- a/Documentation/usb/wusb-cbaf +++ b/Documentation/usb/wusb-cbaf @@ -80,12 +80,6 @@ case $1 in start) for dev in ${2:-$hdevs} do - uwb_rc=$(readlink -f $dev/uwb_rc) - if cat $uwb_rc/beacon | grep -q -- "-1" - then - echo 13 0 > $uwb_rc/beacon - echo I: started beaconing on ch 13 on $(basename $uwb_rc) >&2 - fi echo $host_CHID > $dev/wusb_chid echo I: started host $(basename $dev) >&2 done @@ -95,9 +89,6 @@ case $1 in do echo 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 > $dev/wusb_chid echo I: stopped host $(basename $dev) >&2 - uwb_rc=$(readlink -f $dev/uwb_rc) - echo -1 | cat > $uwb_rc/beacon - echo I: stopped beaconing on $(basename $uwb_rc) >&2 done ;; set-chid) diff --git a/Documentation/video4linux/API.html b/Documentation/video4linux/API.html index afbe9ae7ee9..d749d41f647 100644 --- a/Documentation/video4linux/API.html +++ b/Documentation/video4linux/API.html @@ -1,16 +1,27 @@ -V4L API -

Video For Linux APIs

- - - -
- -V4L original API - -Obsoleted by V4L2 API -
- -V4L2 API - -Should be used for new projects -
+ + + + + V4L API + + +

Video For Linux APIs

+ + + + + + + + + +
+ V4L original API + + Obsoleted by V4L2 API +
+ V4L2 API + Should be used for new projects +
+ + diff --git a/Documentation/video4linux/CARDLIST.bttv b/Documentation/video4linux/CARDLIST.bttv index 60ba6683603..0d93fa1ac25 100644 --- a/Documentation/video4linux/CARDLIST.bttv +++ b/Documentation/video4linux/CARDLIST.bttv @@ -104,8 +104,8 @@ 103 -> Grand X-Guard / Trust 814PCI [0304:0102] 104 -> Nebula Electronics DigiTV [0071:0101] 105 -> ProVideo PV143 [aa00:1430,aa00:1431,aa00:1432,aa00:1433,aa03:1433] -106 -> PHYTEC VD-009-X1 MiniDIN (bt878) -107 -> PHYTEC VD-009-X1 Combi (bt878) +106 -> PHYTEC VD-009-X1 VD-011 MiniDIN (bt878) +107 -> PHYTEC VD-009-X1 VD-011 Combi (bt878) 108 -> PHYTEC VD-009 MiniDIN (bt878) 109 -> PHYTEC VD-009 Combi (bt878) 110 -> IVC-100 [ff00:a132] @@ -151,3 +151,6 @@ 150 -> Geovision GV-600 [008a:763c] 151 -> Kozumi KTV-01C 152 -> Encore ENL TV-FM-2 [1000:1801] +153 -> PHYTEC VD-012 (bt878) +154 -> PHYTEC VD-012-X1 (bt878) +155 -> PHYTEC VD-012-X2 (bt878) diff --git a/Documentation/video4linux/CARDLIST.cx23885 b/Documentation/video4linux/CARDLIST.cx23885 index 64823ccacd6..35ea130e989 100644 --- a/Documentation/video4linux/CARDLIST.cx23885 +++ b/Documentation/video4linux/CARDLIST.cx23885 @@ -11,3 +11,4 @@ 10 -> DViCO FusionHDTV7 Dual Express [18ac:d618] 11 -> DViCO FusionHDTV DVB-T Dual Express [18ac:db78] 12 -> Leadtek Winfast PxDVR3200 H [107d:6681] + 13 -> Compro VideoMate E650F [185b:e800] diff --git a/Documentation/video4linux/CARDLIST.cx88 b/Documentation/video4linux/CARDLIST.cx88 index a5227e308f4..0d08f1edcf6 100644 --- a/Documentation/video4linux/CARDLIST.cx88 +++ b/Documentation/video4linux/CARDLIST.cx88 @@ -2,7 +2,7 @@ 1 -> Hauppauge WinTV 34xxx models [0070:3400,0070:3401] 2 -> GDI Black Gold [14c7:0106,14c7:0107] 3 -> PixelView [1554:4811] - 4 -> ATI TV Wonder Pro [1002:00f8] + 4 -> ATI TV Wonder Pro [1002:00f8,1002:00f9] 5 -> Leadtek Winfast 2000XP Expert [107d:6611,107d:6613] 6 -> AverTV Studio 303 (M126) [1461:000b] 7 -> MSI TV-@nywhere Master [1462:8606] @@ -74,3 +74,6 @@ 73 -> TeVii S420 DVB-S [d420:9022] 74 -> Prolink Pixelview Global Extreme [1554:4976] 75 -> PROF 7300 DVB-S/S2 [B033:3033] + 76 -> SATTRADE ST4200 DVB-S/S2 [b200:4200] + 77 -> TBS 8910 DVB-S [8910:8888] + 78 -> Prof 6200 DVB-S [b022:3022] diff --git a/Documentation/video4linux/CARDLIST.em28xx b/Documentation/video4linux/CARDLIST.em28xx index 187cc48d092..75bded8a4aa 100644 --- a/Documentation/video4linux/CARDLIST.em28xx +++ b/Documentation/video4linux/CARDLIST.em28xx @@ -1,5 +1,5 @@ 0 -> Unknown EM2800 video grabber (em2800) [eb1a:2800] - 1 -> Unknown EM2750/28xx video grabber (em2820/em2840) [eb1a:2820,eb1a:2860,eb1a:2861,eb1a:2870,eb1a:2881,eb1a:2883] + 1 -> Unknown EM2750/28xx video grabber (em2820/em2840) [eb1a:2820,eb1a:2821,eb1a:2860,eb1a:2861,eb1a:2870,eb1a:2881,eb1a:2883] 2 -> Terratec Cinergy 250 USB (em2820/em2840) [0ccd:0036] 3 -> Pinnacle PCTV USB 2 (em2820/em2840) [2304:0208] 4 -> Hauppauge WinTV USB 2 (em2820/em2840) [2040:4200,2040:4201] @@ -12,9 +12,9 @@ 11 -> Terratec Hybrid XS (em2880) [0ccd:0042] 12 -> Kworld PVR TV 2800 RF (em2820/em2840) 13 -> Terratec Prodigy XS (em2880) [0ccd:0047] - 14 -> Pixelview Prolink PlayTV USB 2.0 (em2820/em2840) [eb1a:2821] + 14 -> Pixelview Prolink PlayTV USB 2.0 (em2820/em2840) 15 -> V-Gear PocketTV (em2800) - 16 -> Hauppauge WinTV HVR 950 (em2883) [2040:6513,2040:6517,2040:651b,2040:651f] + 16 -> Hauppauge WinTV HVR 950 (em2883) [2040:6513,2040:6517,2040:651b] 17 -> Pinnacle PCTV HD Pro Stick (em2880) [2304:0227] 18 -> Hauppauge WinTV HVR 900 (R2) (em2880) [2040:6502] 19 -> PointNix Intra-Oral Camera (em2860) @@ -27,7 +27,6 @@ 26 -> Hercules Smart TV USB 2.0 (em2820/em2840) 27 -> Pinnacle PCTV USB 2 (Philips FM1216ME) (em2820/em2840) 28 -> Leadtek Winfast USB II Deluxe (em2820/em2840) - 29 -> Pinnacle Dazzle DVC 100 (em2820/em2840) 30 -> Videology 20K14XUSB USB2.0 (em2820/em2840) 31 -> Usbgear VD204v9 (em2821) 32 -> Supercomp USB 2.0 TV (em2821) @@ -57,3 +56,5 @@ 56 -> Pinnacle Hybrid Pro (2) (em2882) [2304:0226] 57 -> Kworld PlusTV HD Hybrid 330 (em2883) [eb1a:a316] 58 -> Compro VideoMate ForYou/Stereo (em2820/em2840) [185b:2041] + 60 -> Hauppauge WinTV HVR 850 (em2883) [2040:651f] + 61 -> Pixelview PlayTV Box 4 USB 2.0 (em2820/em2840) diff --git a/Documentation/video4linux/CARDLIST.saa7134 b/Documentation/video4linux/CARDLIST.saa7134 index dc67eef38ff..b8d470596b0 100644 --- a/Documentation/video4linux/CARDLIST.saa7134 +++ b/Documentation/video4linux/CARDLIST.saa7134 @@ -10,7 +10,7 @@ 9 -> Medion 5044 10 -> Kworld/KuroutoShikou SAA7130-TVPCI 11 -> Terratec Cinergy 600 TV [153b:1143] - 12 -> Medion 7134 [16be:0003] + 12 -> Medion 7134 [16be:0003,16be:5000] 13 -> Typhoon TV+Radio 90031 14 -> ELSA EX-VISION 300TV [1048:226b] 15 -> ELSA EX-VISION 500TV [1048:226a] @@ -151,3 +151,5 @@ 150 -> Zogis Real Angel 220 151 -> ADS Tech Instant HDTV [1421:0380] 152 -> Asus Tiger Rev:1.00 [1043:4857] +153 -> Kworld Plus TV Analog Lite PCI [17de:7128] +154 -> Avermedia AVerTV GO 007 FM Plus [1461:f31d] diff --git a/Documentation/video4linux/README.cx88 b/Documentation/video4linux/README.cx88 index 06a33a4f52f..35fae23f883 100644 --- a/Documentation/video4linux/README.cx88 +++ b/Documentation/video4linux/README.cx88 @@ -1,4 +1,3 @@ - cx8800 release notes ==================== @@ -10,25 +9,24 @@ current status video - Basically works. - - Some minor image quality glitches. - - For now only capture, overlay support isn't completed yet. + - For now, only capture and read(). Overlay isn't supported. audio - The chip specs for the on-chip TV sound decoder are next to useless :-/ - Neverless the builtin TV sound decoder starts working now, - at least for PAL-BG. Other TV norms need other code ... + at least for some standards. FOR ANY REPORTS ON THIS PLEASE MENTION THE TV NORM YOU ARE USING. - Most tuner chips do provide mono sound, which may or may not be useable depending on the board design. With the Hauppauge cards it works, so there is mono sound available as fallback. - audio data dma (i.e. recording without loopback cable to the - sound card) should be possible, but there is no code yet ... + sound card) is supported via cx88-alsa. vbi - - some code present. Doesn't crash any more, but also doesn't - work yet ... + - Code present. Works for NTSC closed caption. PAL and other + TV norms may or may not work. how to add support for new cards diff --git a/Documentation/video4linux/gspca.txt b/Documentation/video4linux/gspca.txt index 004818fab04..1c58a763014 100644 --- a/Documentation/video4linux/gspca.txt +++ b/Documentation/video4linux/gspca.txt @@ -50,9 +50,14 @@ ov519 045e:028c Micro$oft xbox cam spca508 0461:0815 Micro Innovation IC200 sunplus 0461:0821 Fujifilm MV-1 zc3xx 0461:0a00 MicroInnovation WebCam320 +stv06xx 046d:0840 QuickCam Express +stv06xx 046d:0850 LEGO cam / QuickCam Web +stv06xx 046d:0870 Dexxa WebCam USB spca500 046d:0890 Logitech QuickCam traveler vc032x 046d:0892 Logitech Orbicam vc032x 046d:0896 Logitech Orbicam +vc032x 046d:0897 Logitech QuickCam for Dell notebooks +zc3xx 046d:089d Logitech QuickCam E2500 zc3xx 046d:08a0 Logitech QC IM zc3xx 046d:08a1 Logitech QC IM 0x08A1 +sound zc3xx 046d:08a2 Labtec Webcam Pro @@ -169,6 +174,9 @@ spca500 06bd:0404 Agfa CL20 spca500 06be:0800 Optimedia sunplus 06d6:0031 Trust 610 LCD PowerC@m Zoom spca506 06e1:a190 ADS Instant VCD +ov534 06f8:3002 Hercules Blog Webcam +ov534 06f8:3003 Hercules Dualpix HD Weblog +sonixj 06f8:3004 Hercules Classic Silver spca508 0733:0110 ViewQuest VQ110 spca508 0130:0130 Clone Digital Webcam 11043 spca501 0733:0401 Intel Create and Share @@ -199,7 +207,8 @@ sunplus 08ca:2050 Medion MD 41437 sunplus 08ca:2060 Aiptek PocketDV5300 tv8532 0923:010f ICM532 cams mars 093a:050f Mars-Semi Pc-Camera -pac207 093a:2460 PAC207 Qtec Webcam 100 +pac207 093a:2460 Qtec Webcam 100 +pac207 093a:2461 HP Webcam pac207 093a:2463 Philips SPC 220 NC pac207 093a:2464 Labtec Webcam 1200 pac207 093a:2468 PAC207 @@ -213,10 +222,13 @@ pac7311 093a:2603 PAC7312 pac7311 093a:2608 Trust WB-3300p pac7311 093a:260e Gigaware VGA PC Camera, Trust WB-3350p, SIGMA cam 2350 pac7311 093a:260f SnakeCam +pac7311 093a:2620 Apollo AC-905 pac7311 093a:2621 PAC731x +pac7311 093a:2622 Genius Eye 312 pac7311 093a:2624 PAC7302 pac7311 093a:2626 Labtec 2200 pac7311 093a:262a Webcam 300k +pac7311 093a:262c Philips SPC 230 NC zc3xx 0ac8:0302 Z-star Vimicro zc0302 vc032x 0ac8:0321 Vimicro generic vc0321 vc032x 0ac8:0323 Vimicro Vc0323 @@ -249,11 +261,13 @@ sonixj 0c45:60c0 Sangha Sn535 sonixj 0c45:60ec SN9C105+MO4000 sonixj 0c45:60fb Surfer NoName sonixj 0c45:60fc LG-LIC300 +sonixj 0c45:60fe Microdia Audio sonixj 0c45:6128 Microdia/Sonix SNP325 sonixj 0c45:612a Avant Camera sonixj 0c45:612c Typhoon Rasy Cam 1.3MPix sonixj 0c45:6130 Sonix Pccam sonixj 0c45:6138 Sn9c120 Mo4000 +sonixj 0c45:613a Microdia Sonix PC Camera sonixj 0c45:613b Surfer SN-206 sonixj 0c45:613c Sonix Pccam168 sonixj 0c45:6143 Sonix Pccam168 @@ -263,6 +277,9 @@ etoms 102c:6251 Qcam xxxxxx VGA zc3xx 10fd:0128 Typhoon Webshot II USB 300k 0x0128 spca561 10fd:7e50 FlyCam Usb 100 zc3xx 10fd:8050 Typhoon Webshot II USB 300k +ov534 1415:2000 Sony HD Eye for PS3 (SLEH 00201) +pac207 145f:013a Trust WB-1300N +vc032x 15b8:6002 HP 2.0 Megapixel rz406aa spca501 1776:501c Arowana 300K CMOS Camera t613 17a1:0128 TASCORP JPEG Webcam, NGS Cyclops vc032x 17ef:4802 Lenovo Vc0323+MI1310_SOC diff --git a/Documentation/video4linux/si470x.txt b/Documentation/video4linux/si470x.txt new file mode 100644 index 00000000000..49679e6aaa7 --- /dev/null +++ b/Documentation/video4linux/si470x.txt @@ -0,0 +1,119 @@ +Driver for USB radios for the Silicon Labs Si470x FM Radio Receivers + +Copyright (c) 2008 Tobias Lorenz + + +Information from Silicon Labs +============================= +Silicon Laboratories is the manufacturer of the radio ICs, that nowadays are the +most often used radio receivers in cell phones. Usually they are connected with +I2C. But SiLabs also provides a reference design, which integrates this IC, +together with a small microcontroller C8051F321, to form a USB radio. +Part of this reference design is also a radio application in binary and source +code. The software also contains an automatic firmware upgrade to the most +current version. Information on these can be downloaded here: +http://www.silabs.com/usbradio + + +Supported ICs +============= +The following ICs have a very similar register set, so that they are or will be +supported somewhen by the driver: +- Si4700: FM radio receiver +- Si4701: FM radio receiver, RDS Support +- Si4702: FM radio receiver +- Si4703: FM radio receiver, RDS Support +- Si4704: FM radio receiver, no external antenna required +- Si4705: FM radio receiver, no external antenna required, RDS support, Dig I/O +- Si4706: Enhanced FM RDS/TMC radio receiver, no external antenna required, RDS + Support +- Si4707: Dedicated weather band radio receiver with SAME decoder, RDS Support +- Si4708: Smallest FM receivers +- Si4709: Smallest FM receivers, RDS Support +More information on these can be downloaded here: +http://www.silabs.com/products/mcu/Pages/USBFMRadioRD.aspx + + +Supported USB devices +===================== +Currently the following USB radios (vendor:product) with the Silicon Labs si470x +chips are known to work: +- 10c4:818a: Silicon Labs USB FM Radio Reference Design +- 06e1:a155: ADS/Tech FM Radio Receiver (formerly Instant FM Music) (RDX-155-EF) +- 1b80:d700: KWorld USB FM Radio SnapMusic Mobile 700 (FM700) +- 10c5:819a: DealExtreme USB Radio + + +Software +======== +Testing is usually done with most application under Debian/testing: +- fmtools - Utility for managing FM tuner cards +- gnomeradio - FM-radio tuner for the GNOME desktop +- gradio - GTK FM radio tuner +- kradio - Comfortable Radio Application for KDE +- radio - ncurses-based radio application + +There is also a library libv4l, which can be used. It's going to have a function +for frequency seeking, either by using hardware functionality as in radio-si470x +or by implementing a function as we currently have in every of the mentioned +programs. Somewhen the radio programs should make use of libv4l. + +For processing RDS information, there is a project ongoing at: +http://rdsd.berlios.de/ + +There is currently no project for making TMC sentences human readable. + + +Audio Listing +============= +USB Audio is provided by the ALSA snd_usb_audio module. It is recommended to +also select SND_USB_AUDIO, as this is required to get sound from the radio. For +listing you have to redirect the sound, for example using one of the following +commands. + +If you just want to test audio (very poor quality): +cat /dev/dsp1 > /dev/dsp + +If you use OSS try: +sox -2 --endian little -r 96000 -t oss /dev/dsp1 -t oss /dev/dsp + +If you use arts try: +arecord -D hw:1,0 -r96000 -c2 -f S16_LE | artsdsp aplay -B - + + +Module Parameters +================= +After loading the module, you still have access to some of them in the sysfs +mount under /sys/module/radio_si470x/parameters. The contents of read-only files +(0444) are not updated, even if space, band and de are changed using private +video controls. The others are runtime changeable. + + +Errors +====== +Increase tune_timeout, if you often get -EIO errors. + +When timed out or band limit is reached, hw_freq_seek returns -EAGAIN. + +If you get any errors from snd_usb_audio, please report them to the ALSA people. + + +Open Issues +=========== +V4L minor device allocation and parameter setting is not perfect. A solution is +currently under discussion. + +There is an USB interface for downloading/uploading new firmware images. Support +for it can be implemented using the request_firmware interface. + +There is a RDS interrupt mode. The driver is already using the same interface +for polling RDS information, but is currently not using the interrupt mode. + +There is a LED interface, which can be used to override the LED control +programmed in the firmware. This can be made available using the LED support +functions in the kernel. + + +Other useful information and links +================================== +http://www.silabs.com/usbradio diff --git a/Documentation/video4linux/v4l2-framework.txt b/Documentation/video4linux/v4l2-framework.txt new file mode 100644 index 00000000000..ff124374e9b --- /dev/null +++ b/Documentation/video4linux/v4l2-framework.txt @@ -0,0 +1,521 @@ +Overview of the V4L2 driver framework +===================================== + +This text documents the various structures provided by the V4L2 framework and +their relationships. + + +Introduction +------------ + +The V4L2 drivers tend to be very complex due to the complexity of the +hardware: most devices have multiple ICs, export multiple device nodes in +/dev, and create also non-V4L2 devices such as DVB, ALSA, FB, I2C and input +(IR) devices. + +Especially the fact that V4L2 drivers have to setup supporting ICs to +do audio/video muxing/encoding/decoding makes it more complex than most. +Usually these ICs are connected to the main bridge driver through one or +more I2C busses, but other busses can also be used. Such devices are +called 'sub-devices'. + +For a long time the framework was limited to the video_device struct for +creating V4L device nodes and video_buf for handling the video buffers +(note that this document does not discuss the video_buf framework). + +This meant that all drivers had to do the setup of device instances and +connecting to sub-devices themselves. Some of this is quite complicated +to do right and many drivers never did do it correctly. + +There is also a lot of common code that could never be refactored due to +the lack of a framework. + +So this framework sets up the basic building blocks that all drivers +need and this same framework should make it much easier to refactor +common code into utility functions shared by all drivers. + + +Structure of a driver +--------------------- + +All drivers have the following structure: + +1) A struct for each device instance containing the device state. + +2) A way of initializing and commanding sub-devices (if any). + +3) Creating V4L2 device nodes (/dev/videoX, /dev/vbiX, /dev/radioX and + /dev/vtxX) and keeping track of device-node specific data. + +4) Filehandle-specific structs containing per-filehandle data. + +This is a rough schematic of how it all relates: + + device instances + | + +-sub-device instances + | + \-V4L2 device nodes + | + \-filehandle instances + + +Structure of the framework +-------------------------- + +The framework closely resembles the driver structure: it has a v4l2_device +struct for the device instance data, a v4l2_subdev struct to refer to +sub-device instances, the video_device struct stores V4L2 device node data +and in the future a v4l2_fh struct will keep track of filehandle instances +(this is not yet implemented). + + +struct v4l2_device +------------------ + +Each device instance is represented by a struct v4l2_device (v4l2-device.h). +Very simple devices can just allocate this struct, but most of the time you +would embed this struct inside a larger struct. + +You must register the device instance: + + v4l2_device_register(struct device *dev, struct v4l2_device *v4l2_dev); + +Registration will initialize the v4l2_device struct and link dev->driver_data +to v4l2_dev. Registration will also set v4l2_dev->name to a value derived from +dev (driver name followed by the bus_id, to be precise). You may change the +name after registration if you want. + +The first 'dev' argument is normally the struct device pointer of a pci_dev, +usb_device or platform_device. + +You unregister with: + + v4l2_device_unregister(struct v4l2_device *v4l2_dev); + +Unregistering will also automatically unregister all subdevs from the device. + +Sometimes you need to iterate over all devices registered by a specific +driver. This is usually the case if multiple device drivers use the same +hardware. E.g. the ivtvfb driver is a framebuffer driver that uses the ivtv +hardware. The same is true for alsa drivers for example. + +You can iterate over all registered devices as follows: + +static int callback(struct device *dev, void *p) +{ + struct v4l2_device *v4l2_dev = dev_get_drvdata(dev); + + /* test if this device was inited */ + if (v4l2_dev == NULL) + return 0; + ... + return 0; +} + +int iterate(void *p) +{ + struct device_driver *drv; + int err; + + /* Find driver 'ivtv' on the PCI bus. + pci_bus_type is a global. For USB busses use usb_bus_type. */ + drv = driver_find("ivtv", &pci_bus_type); + /* iterate over all ivtv device instances */ + err = driver_for_each_device(drv, NULL, p, callback); + put_driver(drv); + return err; +} + +Sometimes you need to keep a running counter of the device instance. This is +commonly used to map a device instance to an index of a module option array. + +The recommended approach is as follows: + +static atomic_t drv_instance = ATOMIC_INIT(0); + +static int __devinit drv_probe(struct pci_dev *dev, + const struct pci_device_id *pci_id) +{ + ... + state->instance = atomic_inc_return(&drv_instance) - 1; +} + + +struct v4l2_subdev +------------------ + +Many drivers need to communicate with sub-devices. These devices can do all +sort of tasks, but most commonly they handle audio and/or video muxing, +encoding or decoding. For webcams common sub-devices are sensors and camera +controllers. + +Usually these are I2C devices, but not necessarily. In order to provide the +driver with a consistent interface to these sub-devices the v4l2_subdev struct +(v4l2-subdev.h) was created. + +Each sub-device driver must have a v4l2_subdev struct. This struct can be +stand-alone for simple sub-devices or it might be embedded in a larger struct +if more state information needs to be stored. Usually there is a low-level +device struct (e.g. i2c_client) that contains the device data as setup +by the kernel. It is recommended to store that pointer in the private +data of v4l2_subdev using v4l2_set_subdevdata(). That makes it easy to go +from a v4l2_subdev to the actual low-level bus-specific device data. + +You also need a way to go from the low-level struct to v4l2_subdev. For the +common i2c_client struct the i2c_set_clientdata() call is used to store a +v4l2_subdev pointer, for other busses you may have to use other methods. + +From the bridge driver perspective you load the sub-device module and somehow +obtain the v4l2_subdev pointer. For i2c devices this is easy: you call +i2c_get_clientdata(). For other busses something similar needs to be done. +Helper functions exists for sub-devices on an I2C bus that do most of this +tricky work for you. + +Each v4l2_subdev contains function pointers that sub-device drivers can +implement (or leave NULL if it is not applicable). Since sub-devices can do +so many different things and you do not want to end up with a huge ops struct +of which only a handful of ops are commonly implemented, the function pointers +are sorted according to category and each category has its own ops struct. + +The top-level ops struct contains pointers to the category ops structs, which +may be NULL if the subdev driver does not support anything from that category. + +It looks like this: + +struct v4l2_subdev_core_ops { + int (*g_chip_ident)(struct v4l2_subdev *sd, struct v4l2_dbg_chip_ident *chip); + int (*log_status)(struct v4l2_subdev *sd); + int (*init)(struct v4l2_subdev *sd, u32 val); + ... +}; + +struct v4l2_subdev_tuner_ops { + ... +}; + +struct v4l2_subdev_audio_ops { + ... +}; + +struct v4l2_subdev_video_ops { + ... +}; + +struct v4l2_subdev_ops { + const struct v4l2_subdev_core_ops *core; + const struct v4l2_subdev_tuner_ops *tuner; + const struct v4l2_subdev_audio_ops *audio; + const struct v4l2_subdev_video_ops *video; +}; + +The core ops are common to all subdevs, the other categories are implemented +depending on the sub-device. E.g. a video device is unlikely to support the +audio ops and vice versa. + +This setup limits the number of function pointers while still making it easy +to add new ops and categories. + +A sub-device driver initializes the v4l2_subdev struct using: + + v4l2_subdev_init(subdev, &ops); + +Afterwards you need to initialize subdev->name with a unique name and set the +module owner. This is done for you if you use the i2c helper functions. + +A device (bridge) driver needs to register the v4l2_subdev with the +v4l2_device: + + int err = v4l2_device_register_subdev(device, subdev); + +This can fail if the subdev module disappeared before it could be registered. +After this function was called successfully the subdev->dev field points to +the v4l2_device. + +You can unregister a sub-device using: + + v4l2_device_unregister_subdev(subdev); + +Afterwards the subdev module can be unloaded and subdev->dev == NULL. + +You can call an ops function either directly: + + err = subdev->ops->core->g_chip_ident(subdev, &chip); + +but it is better and easier to use this macro: + + err = v4l2_subdev_call(subdev, core, g_chip_ident, &chip); + +The macro will to the right NULL pointer checks and returns -ENODEV if subdev +is NULL, -ENOIOCTLCMD if either subdev->core or subdev->core->g_chip_ident is +NULL, or the actual result of the subdev->ops->core->g_chip_ident ops. + +It is also possible to call all or a subset of the sub-devices: + + v4l2_device_call_all(dev, 0, core, g_chip_ident, &chip); + +Any subdev that does not support this ops is skipped and error results are +ignored. If you want to check for errors use this: + + err = v4l2_device_call_until_err(dev, 0, core, g_chip_ident, &chip); + +Any error except -ENOIOCTLCMD will exit the loop with that error. If no +errors (except -ENOIOCTLCMD) occured, then 0 is returned. + +The second argument to both calls is a group ID. If 0, then all subdevs are +called. If non-zero, then only those whose group ID match that value will +be called. Before a bridge driver registers a subdev it can set subdev->grp_id +to whatever value it wants (it's 0 by default). This value is owned by the +bridge driver and the sub-device driver will never modify or use it. + +The group ID gives the bridge driver more control how callbacks are called. +For example, there may be multiple audio chips on a board, each capable of +changing the volume. But usually only one will actually be used when the +user want to change the volume. You can set the group ID for that subdev to +e.g. AUDIO_CONTROLLER and specify that as the group ID value when calling +v4l2_device_call_all(). That ensures that it will only go to the subdev +that needs it. + +The advantage of using v4l2_subdev is that it is a generic struct and does +not contain any knowledge about the underlying hardware. So a driver might +contain several subdevs that use an I2C bus, but also a subdev that is +controlled through GPIO pins. This distinction is only relevant when setting +up the device, but once the subdev is registered it is completely transparent. + + +I2C sub-device drivers +---------------------- + +Since these drivers are so common, special helper functions are available to +ease the use of these drivers (v4l2-common.h). + +The recommended method of adding v4l2_subdev support to an I2C driver is to +embed the v4l2_subdev struct into the state struct that is created for each +I2C device instance. Very simple devices have no state struct and in that case +you can just create a v4l2_subdev directly. + +A typical state struct would look like this (where 'chipname' is replaced by +the name of the chip): + +struct chipname_state { + struct v4l2_subdev sd; + ... /* additional state fields */ +}; + +Initialize the v4l2_subdev struct as follows: + + v4l2_i2c_subdev_init(&state->sd, client, subdev_ops); + +This function will fill in all the fields of v4l2_subdev and ensure that the +v4l2_subdev and i2c_client both point to one another. + +You should also add a helper inline function to go from a v4l2_subdev pointer +to a chipname_state struct: + +static inline struct chipname_state *to_state(struct v4l2_subdev *sd) +{ + return container_of(sd, struct chipname_state, sd); +} + +Use this to go from the v4l2_subdev struct to the i2c_client struct: + + struct i2c_client *client = v4l2_get_subdevdata(sd); + +And this to go from an i2c_client to a v4l2_subdev struct: + + struct v4l2_subdev *sd = i2c_get_clientdata(client); + +Finally you need to make a command function to make driver->command() +call the right subdev_ops functions: + +static int subdev_command(struct i2c_client *client, unsigned cmd, void *arg) +{ + return v4l2_subdev_command(i2c_get_clientdata(client), cmd, arg); +} + +If driver->command is never used then you can leave this out. Eventually the +driver->command usage should be removed from v4l. + +Make sure to call v4l2_device_unregister_subdev(sd) when the remove() callback +is called. This will unregister the sub-device from the bridge driver. It is +safe to call this even if the sub-device was never registered. + + +The bridge driver also has some helper functions it can use: + +struct v4l2_subdev *sd = v4l2_i2c_new_subdev(adapter, "module_foo", "chipid", 0x36); + +This loads the given module (can be NULL if no module needs to be loaded) and +calls i2c_new_device() with the given i2c_adapter and chip/address arguments. +If all goes well, then it registers the subdev with the v4l2_device. It gets +the v4l2_device by calling i2c_get_adapdata(adapter), so you should make sure +that adapdata is set to v4l2_device when you setup the i2c_adapter in your +driver. + +You can also use v4l2_i2c_new_probed_subdev() which is very similar to +v4l2_i2c_new_subdev(), except that it has an array of possible I2C addresses +that it should probe. Internally it calls i2c_new_probed_device(). + +Both functions return NULL if something went wrong. + + +struct video_device +------------------- + +The actual device nodes in the /dev directory are created using the +video_device struct (v4l2-dev.h). This struct can either be allocated +dynamically or embedded in a larger struct. + +To allocate it dynamically use: + + struct video_device *vdev = video_device_alloc(); + + if (vdev == NULL) + return -ENOMEM; + + vdev->release = video_device_release; + +If you embed it in a larger struct, then you must set the release() +callback to your own function: + + struct video_device *vdev = &my_vdev->vdev; + + vdev->release = my_vdev_release; + +The release callback must be set and it is called when the last user +of the video device exits. + +The default video_device_release() callback just calls kfree to free the +allocated memory. + +You should also set these fields: + +- v4l2_dev: set to the v4l2_device parent device. +- name: set to something descriptive and unique. +- fops: set to the v4l2_file_operations struct. +- ioctl_ops: if you use the v4l2_ioctl_ops to simplify ioctl maintenance + (highly recommended to use this and it might become compulsory in the + future!), then set this to your v4l2_ioctl_ops struct. + +If you use v4l2_ioctl_ops, then you should set either .unlocked_ioctl or +.ioctl to video_ioctl2 in your v4l2_file_operations struct. + +The v4l2_file_operations struct is a subset of file_operations. The main +difference is that the inode argument is omitted since it is never used. + + +video_device registration +------------------------- + +Next you register the video device: this will create the character device +for you. + + err = video_register_device(vdev, VFL_TYPE_GRABBER, -1); + if (err) { + video_device_release(vdev); /* or kfree(my_vdev); */ + return err; + } + +Which device is registered depends on the type argument. The following +types exist: + +VFL_TYPE_GRABBER: videoX for video input/output devices +VFL_TYPE_VBI: vbiX for vertical blank data (i.e. closed captions, teletext) +VFL_TYPE_RADIO: radioX for radio tuners +VFL_TYPE_VTX: vtxX for teletext devices (deprecated, don't use) + +The last argument gives you a certain amount of control over the device +kernel number used (i.e. the X in videoX). Normally you will pass -1 to +let the v4l2 framework pick the first free number. But if a driver creates +many devices, then it can be useful to have different video devices in +separate ranges. For example, video capture devices start at 0, video +output devices start at 16. + +So you can use the last argument to specify a minimum kernel number and +the v4l2 framework will try to pick the first free number that is equal +or higher to what you passed. If that fails, then it will just pick the +first free number. + +Whenever a device node is created some attributes are also created for you. +If you look in /sys/class/video4linux you see the devices. Go into e.g. +video0 and you will see 'name' and 'index' attributes. The 'name' attribute +is the 'name' field of the video_device struct. The 'index' attribute is +a device node index that can be assigned by the driver, or that is calculated +for you. + +If you call video_register_device(), then the index is just increased by +1 for each device node you register. The first video device node you register +always starts off with 0. + +Alternatively you can call video_register_device_index() which is identical +to video_register_device(), but with an extra index argument. Here you can +pass a specific index value (between 0 and 31) that should be used. + +Users can setup udev rules that utilize the index attribute to make fancy +device names (e.g. 'mpegX' for MPEG video capture device nodes). + +After the device was successfully registered, then you can use these fields: + +- vfl_type: the device type passed to video_register_device. +- minor: the assigned device minor number. +- num: the device kernel number (i.e. the X in videoX). +- index: the device index number (calculated or set explicitly using + video_register_device_index). + +If the registration failed, then you need to call video_device_release() +to free the allocated video_device struct, or free your own struct if the +video_device was embedded in it. The vdev->release() callback will never +be called if the registration failed, nor should you ever attempt to +unregister the device if the registration failed. + + +video_device cleanup +-------------------- + +When the video device nodes have to be removed, either during the unload +of the driver or because the USB device was disconnected, then you should +unregister them: + + video_unregister_device(vdev); + +This will remove the device nodes from sysfs (causing udev to remove them +from /dev). + +After video_unregister_device() returns no new opens can be done. + +However, in the case of USB devices some application might still have one +of these device nodes open. You should block all new accesses to read, +write, poll, etc. except possibly for certain ioctl operations like +queueing buffers. + +When the last user of the video device node exits, then the vdev->release() +callback is called and you can do the final cleanup there. + + +video_device helper functions +----------------------------- + +There are a few useful helper functions: + +You can set/get driver private data in the video_device struct using: + +void *video_get_drvdata(struct video_device *dev); +void video_set_drvdata(struct video_device *dev, void *data); + +Note that you can safely call video_set_drvdata() before calling +video_register_device(). + +And this function: + +struct video_device *video_devdata(struct file *file); + +returns the video_device belonging to the file struct. + +The final helper function combines video_get_drvdata with +video_devdata: + +void *video_drvdata(struct file *file); + +You can go from a video_device struct to the v4l2_device struct using: + +struct v4l2_device *v4l2_dev = vdev->v4l2_dev; diff --git a/Documentation/vm/unevictable-lru.txt b/Documentation/vm/unevictable-lru.txt index 125eed560e5..0706a7282a8 100644 --- a/Documentation/vm/unevictable-lru.txt +++ b/Documentation/vm/unevictable-lru.txt @@ -137,13 +137,6 @@ shrink_page_list() where they will be detected when vmscan walks the reverse map in try_to_unmap(). If try_to_unmap() returns SWAP_MLOCK, shrink_page_list() will cull the page at that point. -Note that for anonymous pages, shrink_page_list() attempts to add the page to -the swap cache before it tries to unmap the page. To avoid this unnecessary -consumption of swap space, shrink_page_list() calls try_to_munlock() to check -whether any VM_LOCKED vmas map the page without attempting to unmap the page. -If try_to_munlock() returns SWAP_MLOCK, shrink_page_list() will cull the page -without consuming swap space. try_to_munlock() will be described below. - To "cull" an unevictable page, vmscan simply puts the page back on the lru list using putback_lru_page()--the inverse operation to isolate_lru_page()-- after dropping the page lock. Because the condition which makes the page @@ -190,8 +183,8 @@ several places: in the VM_LOCKED flag being set for the vma. 3) in the fault path, if mlocked pages are "culled" in the fault path, and when a VM_LOCKED stack segment is expanded. -4) as mentioned above, in vmscan:shrink_page_list() with attempting to - reclaim a page in a VM_LOCKED vma--via try_to_unmap() or try_to_munlock(). +4) as mentioned above, in vmscan:shrink_page_list() when attempting to + reclaim a page in a VM_LOCKED vma via try_to_unmap(). Mlocked pages become unlocked and rescued from the unevictable list when: @@ -260,9 +253,9 @@ mlock_fixup() filters several classes of "special" vmas: 2) vmas mapping hugetlbfs page are already effectively pinned into memory. We don't need nor want to mlock() these pages. However, to preserve the - prior behavior of mlock()--before the unevictable/mlock changes--mlock_fixup() - will call make_pages_present() in the hugetlbfs vma range to allocate the - huge pages and populate the ptes. + prior behavior of mlock()--before the unevictable/mlock changes-- + mlock_fixup() will call make_pages_present() in the hugetlbfs vma range + to allocate the huge pages and populate the ptes. 3) vmas with VM_DONTEXPAND|VM_RESERVED are generally user space mappings of kernel pages, such as the vdso page, relay channel pages, etc. These pages @@ -322,7 +315,7 @@ __mlock_vma_pages_range()--the same function used to mlock a vma range-- passing a flag to indicate that munlock() is being performed. Because the vma access protections could have been changed to PROT_NONE after -faulting in and mlocking some pages, get_user_pages() was unreliable for visiting +faulting in and mlocking pages, get_user_pages() was unreliable for visiting these pages for munlocking. Because we don't want to leave pages mlocked(), get_user_pages() was enhanced to accept a flag to ignore the permissions when fetching the pages--all of which should be resident as a result of previous @@ -416,8 +409,8 @@ Mlocked Pages: munmap()/exit()/exec() System Call Handling When unmapping an mlocked region of memory, whether by an explicit call to munmap() or via an internal unmap from exit() or exec() processing, we must munlock the pages if we're removing the last VM_LOCKED vma that maps the pages. -Before the unevictable/mlock changes, mlocking did not mark the pages in any way, -so unmapping them required no processing. +Before the unevictable/mlock changes, mlocking did not mark the pages in any +way, so unmapping them required no processing. To munlock a range of memory under the unevictable/mlock infrastructure, the munmap() hander and task address space tear down function call @@ -517,12 +510,10 @@ couldn't be mlocked. Mlocked pages: try_to_munlock() Reverse Map Scan TODO/FIXME: a better name might be page_mlocked()--analogous to the -page_referenced() reverse map walker--especially if we continue to call this -from shrink_page_list(). See related TODO/FIXME below. +page_referenced() reverse map walker. -When munlock_vma_page()--see "Mlocked Pages: munlock()/munlockall() System -Call Handling" above--tries to munlock a page, or when shrink_page_list() -encounters an anonymous page that is not yet in the swap cache, they need to +When munlock_vma_page()--see "Mlocked Pages: munlock()/munlockall() +System Call Handling" above--tries to munlock a page, it needs to determine whether or not the page is mapped by any VM_LOCKED vma, without actually attempting to unmap all ptes from the page. For this purpose, the unevictable/mlock infrastructure introduced a variant of try_to_unmap() called @@ -535,10 +526,7 @@ for VM_LOCKED vmas. When such a vma is found for anonymous pages and file pages mapped in linear VMAs, as in the try_to_unmap() case, the functions attempt to acquire the associated mmap semphore, mlock the page via mlock_vma_page() and return SWAP_MLOCK. This effectively undoes the -pre-clearing of the page's PG_mlocked done by munlock_vma_page() and informs -shrink_page_list() that the anonymous page should be culled rather than added -to the swap cache in preparation for a try_to_unmap() that will almost -certainly fail. +pre-clearing of the page's PG_mlocked done by munlock_vma_page. If try_to_unmap() is unable to acquire a VM_LOCKED vma's associated mmap semaphore, it will return SWAP_AGAIN. This will allow shrink_page_list() @@ -557,10 +545,7 @@ However, the scan can terminate when it encounters a VM_LOCKED vma and can successfully acquire the vma's mmap semphore for read and mlock the page. Although try_to_munlock() can be called many [very many!] times when munlock()ing a large region or tearing down a large address space that has been -mlocked via mlockall(), overall this is a fairly rare event. In addition, -although shrink_page_list() calls try_to_munlock() for every anonymous page that -it handles that is not yet in the swap cache, on average anonymous pages will -have very short reverse map lists. +mlocked via mlockall(), overall this is a fairly rare event. Mlocked Page: Page Reclaim in shrink_*_list() @@ -588,8 +573,8 @@ Some examples of these unevictable pages on the LRU lists are: munlock_vma_page() was forced to let the page back on to the normal LRU list for vmscan to handle. -shrink_inactive_list() also culls any unevictable pages that it finds -on the inactive lists, again diverting them to the appropriate zone's unevictable +shrink_inactive_list() also culls any unevictable pages that it finds on +the inactive lists, again diverting them to the appropriate zone's unevictable lru list. shrink_inactive_list() should only see SHM_LOCKed pages that became SHM_LOCKed after shrink_active_list() had moved them to the inactive list, or pages mapped into VM_LOCKED vmas that munlock_vma_page() couldn't isolate from @@ -597,19 +582,7 @@ the lru to recheck via try_to_munlock(). shrink_inactive_list() won't notice the latter, but will pass on to shrink_page_list(). shrink_page_list() again culls obviously unevictable pages that it could -encounter for similar reason to shrink_inactive_list(). As already discussed, -shrink_page_list() proactively looks for anonymous pages that should have -PG_mlocked set but don't--these would not be detected by page_evictable()--to -avoid adding them to the swap cache unnecessarily. File pages mapped into +encounter for similar reason to shrink_inactive_list(). Pages mapped into VM_LOCKED vmas but without PG_mlocked set will make it all the way to -try_to_unmap(). shrink_page_list() will divert them to the unevictable list when -try_to_unmap() returns SWAP_MLOCK, as discussed above. - -TODO/FIXME: If we can enhance the swap cache to reliably remove entries -with page_count(page) > 2, as long as all ptes are mapped to the page and -not the swap entry, we can probably remove the call to try_to_munlock() in -shrink_page_list() and just remove the page from the swap cache when -try_to_unmap() returns SWAP_MLOCK. Currently, remove_exclusive_swap_page() -doesn't seem to allow that. - - +try_to_unmap(). shrink_page_list() will divert them to the unevictable list +when try_to_unmap() returns SWAP_MLOCK, as discussed above. diff --git a/Documentation/w1/masters/00-INDEX b/Documentation/w1/masters/00-INDEX index 7b0ceaaad7a..d63fa024ac0 100644 --- a/Documentation/w1/masters/00-INDEX +++ b/Documentation/w1/masters/00-INDEX @@ -4,5 +4,7 @@ ds2482 - The Maxim/Dallas Semiconductor DS2482 provides 1-wire busses. ds2490 - The Maxim/Dallas Semiconductor DS2490 builds USB <-> W1 bridges. +mxc_w1 + - W1 master controller driver found on Freescale MX2/MX3 SoCs w1-gpio - GPIO 1-wire bus master driver. diff --git a/Documentation/w1/masters/mxc-w1 b/Documentation/w1/masters/mxc-w1 new file mode 100644 index 00000000000..97f6199a7f3 --- /dev/null +++ b/Documentation/w1/masters/mxc-w1 @@ -0,0 +1,11 @@ +Kernel driver mxc_w1 +==================== + +Supported chips: + * Freescale MX27, MX31 and probably other i.MX SoCs + Datasheets: + http://www.freescale.com/files/32bit/doc/data_sheet/MCIMX31.pdf?fpsp=1 + http://www.freescale.com/files/dsp/MCIMX27.pdf?fpsp=1 + +Author: Originally based on Freescale code, prepared for mainline by + Sascha Hauer diff --git a/Documentation/w1/masters/omap-hdq b/Documentation/w1/masters/omap-hdq new file mode 100644 index 00000000000..ca722e09b6a --- /dev/null +++ b/Documentation/w1/masters/omap-hdq @@ -0,0 +1,46 @@ +Kernel driver for omap HDQ/1-wire module. +======================================== + +Supported chips: +================ + HDQ/1-wire controller on the TI OMAP 2430/3430 platforms. + +A useful link about HDQ basics: +=============================== +http://focus.ti.com/lit/an/slua408/slua408.pdf + +Description: +============ +The HDQ/1-Wire module of TI OMAP2430/3430 platforms implement the hardware +protocol of the master functions of the Benchmark HDQ and the Dallas +Semiconductor 1-Wire protocols. These protocols use a single wire for +communication between the master (HDQ/1-Wire controller) and the slave +(HDQ/1-Wire external compliant device). + +A typical application of the HDQ/1-Wire module is the communication with battery +monitor (gas gauge) integrated circuits. + +The controller supports operation in both HDQ and 1-wire mode. The essential +difference between the HDQ and 1-wire mode is how the slave device responds to +initialization pulse.In HDQ mode, the firmware does not require the host to +create an initialization pulse to the slave.However, the slave can be reset by +using an initialization pulse (also referred to as a break pulse).The slave +does not respond with a presence pulse as it does in the 1-Wire protocol. + +Remarks: +======== +The driver (drivers/w1/masters/omap_hdq.c) supports the HDQ mode of the +controller. In this mode, as we can not read the ID which obeys the W1 +spec(family:id:crc), a module parameter can be passed to the driver which will +be used to calculate the CRC and pass back an appropriate slave ID to the W1 +core. + +By default the master driver and the BQ slave i/f +driver(drivers/w1/slaves/w1_bq27000.c) sets the ID to 1. +Please note to load both the modules with a different ID if required, but note +that the ID used should be same for both master and slave driver loading. + +e.g: +insmod omap_hdq.ko W1_ID=2 +inamod w1_bq27000.ko F_ID=2 + diff --git a/Documentation/w1/w1.netlink b/Documentation/w1/w1.netlink index 3640c7c87d4..804445f745e 100644 --- a/Documentation/w1/w1.netlink +++ b/Documentation/w1/w1.netlink @@ -5,69 +5,157 @@ Message types. ============= There are three types of messages between w1 core and userspace: -1. Events. They are generated each time new master or slave device found - either due to automatic or requested search. -2. Userspace commands. Includes read/write and search/alarm search comamnds. +1. Events. They are generated each time new master or slave device + found either due to automatic or requested search. +2. Userspace commands. 3. Replies to userspace commands. Protocol. ======== -[struct cn_msg] - connector header. It's length field is equal to size of the attached data. +[struct cn_msg] - connector header. + Its length field is equal to size of the attached data [struct w1_netlink_msg] - w1 netlink header. __u8 type - message type. - W1_SLAVE_ADD/W1_SLAVE_REMOVE - slave add/remove events. - W1_MASTER_ADD/W1_MASTER_REMOVE - master add/remove events. - W1_MASTER_CMD - userspace command for bus master device (search/alarm search). - W1_SLAVE_CMD - userspace command for slave device (read/write/ search/alarm search - for bus master device where given slave device found). + W1_LIST_MASTERS + list current bus masters + W1_SLAVE_ADD/W1_SLAVE_REMOVE + slave add/remove events + W1_MASTER_ADD/W1_MASTER_REMOVE + master add/remove events + W1_MASTER_CMD + userspace command for bus master + device (search/alarm search) + W1_SLAVE_CMD + userspace command for slave device + (read/write/touch) __u8 res - reserved - __u16 len - size of attached to this header data. + __u16 len - size of data attached to this header data union { - __u8 id; - slave unique device id + __u8 id[8]; - slave unique device id struct w1_mst { - __u32 id; - master's id. + __u32 id; - master's id __u32 res; - reserved } mst; } id; -[strucrt w1_netlink_cmd] - command for gived master or slave device. +[struct w1_netlink_cmd] - command for given master or slave device. __u8 cmd - command opcode. - W1_CMD_READ - read command. - W1_CMD_WRITE - write command. - W1_CMD_SEARCH - search command. - W1_CMD_ALARM_SEARCH - alarm search command. + W1_CMD_READ - read command + W1_CMD_WRITE - write command + W1_CMD_TOUCH - touch command + (write and sample data back to userspace) + W1_CMD_SEARCH - search command + W1_CMD_ALARM_SEARCH - alarm search command __u8 res - reserved - __u16 len - length of data for this command. - For read command data must be allocated like for write command. - __u8 data[0] - data for this command. + __u16 len - length of data for this command + For read command data must be allocated like for write command + __u8 data[0] - data for this command -Each connector message can include one or more w1_netlink_msg with zero of more attached w1_netlink_cmd messages. +Each connector message can include one or more w1_netlink_msg with +zero or more attached w1_netlink_cmd messages. -For event messages there are no w1_netlink_cmd embedded structures, only connector header -and w1_netlink_msg strucutre with "len" field being zero and filled type (one of event types) -and id - either 8 bytes of slave unique id in host order, or master's id, which is assigned -to bus master device when it is added to w1 core. +For event messages there are no w1_netlink_cmd embedded structures, +only connector header and w1_netlink_msg strucutre with "len" field +being zero and filled type (one of event types) and id: +either 8 bytes of slave unique id in host order, +or master's id, which is assigned to bus master device +when it is added to w1 core. + +Currently replies to userspace commands are only generated for read +command request. One reply is generated exactly for one w1_netlink_cmd +read request. Replies are not combined when sent - i.e. typical reply +messages looks like the following: -Currently replies to userspace commands are only generated for read command request. -One reply is generated exactly for one w1_netlink_cmd read request. -Replies are not combined when sent - i.e. typical reply messages looks like the following: [cn_msg][w1_netlink_msg][w1_netlink_cmd] -cn_msg.len = sizeof(struct w1_netlink_msg) + sizeof(struct w1_netlink_cmd) + cmd->len; +cn_msg.len = sizeof(struct w1_netlink_msg) + + sizeof(struct w1_netlink_cmd) + + cmd->len; w1_netlink_msg.len = sizeof(struct w1_netlink_cmd) + cmd->len; w1_netlink_cmd.len = cmd->len; +Replies to W1_LIST_MASTERS should send a message back to the userspace +which will contain list of all registered master ids in the following +format: + + cn_msg (CN_W1_IDX.CN_W1_VAL as id, len is equal to sizeof(struct + w1_netlink_msg) plus number of masters multipled by 4) + w1_netlink_msg (type: W1_LIST_MASTERS, len is equal to + number of masters multiplied by 4 (u32 size)) + id0 ... idN + + Each message is at most 4k in size, so if number of master devices + exceeds this, it will be split into several messages, + cn.seq will be increased for each one. + +W1 search and alarm search commands. +request: +[cn_msg] + [w1_netlink_msg type = W1_MASTER_CMD + id is equal to the bus master id to use for searching] + [w1_netlink_cmd cmd = W1_CMD_SEARCH or W1_CMD_ALARM_SEARCH] + +reply: + [cn_msg, ack = 1 and increasing, 0 means the last message, + seq is equal to the request seq] + [w1_netlink_msg type = W1_MASTER_CMD] + [w1_netlink_cmd cmd = W1_CMD_SEARCH or W1_CMD_ALARM_SEARCH + len is equal to number of IDs multiplied by 8] + [64bit-id0 ... 64bit-idN] +Length in each header corresponds to the size of the data behind it, so +w1_netlink_cmd->len = N * 8; where N is number of IDs in this message. + Can be zero. +w1_netlink_msg->len = sizeof(struct w1_netlink_cmd) + N * 8; +cn_msg->len = sizeof(struct w1_netlink_msg) + + sizeof(struct w1_netlink_cmd) + + N*8; + +W1 reset command. +[cn_msg] + [w1_netlink_msg type = W1_MASTER_CMD + id is equal to the bus master id to use for searching] + [w1_netlink_cmd cmd = W1_CMD_RESET] + + +Command status replies. +====================== + +Each command (either root, master or slave with or without w1_netlink_cmd +structure) will be 'acked' by the w1 core. Format of the reply is the same +as request message except that length parameters do not account for data +requested by the user, i.e. read/write/touch IO requests will not contain +data, so w1_netlink_cmd.len will be 0, w1_netlink_msg.len will be size +of the w1_netlink_cmd structure and cn_msg.len will be equal to the sum +of the sizeof(struct w1_netlink_msg) and sizeof(struct w1_netlink_cmd). +If reply is generated for master or root command (which do not have +w1_netlink_cmd attached), reply will contain only cn_msg and w1_netlink_msg +structires. + +w1_netlink_msg.status field will carry positive error value +(EINVAL for example) or zero in case of success. + +All other fields in every structure will mirror the same parameters in the +request message (except lengths as described above). + +Status reply is generated for every w1_netlink_cmd embedded in the +w1_netlink_msg, if there are no w1_netlink_cmd structures, +reply will be generated for the w1_netlink_msg. + +All w1_netlink_cmd command structures are handled in every w1_netlink_msg, +even if there were errors, only length mismatch interrupts message processing. + Operation steps in w1 core when new command is received. ======================================================= -When new message (w1_netlink_msg) is received w1 core detects if it is master of slave request, -according to w1_netlink_msg.type field. +When new message (w1_netlink_msg) is received w1 core detects if it is +master or slave request, according to w1_netlink_msg.type field. Then master or slave device is searched for. -When found, master device (requested or those one on where slave device is found) is locked. -If slave command is requested, then reset/select procedure is started to select given device. +When found, master device (requested or those one on where slave device +is found) is locked. If slave command is requested, then reset/select +procedure is started to select given device. Then all requested in w1_netlink_msg operations are performed one by one. If command requires reply (like read command) it is sent on command completion. @@ -82,8 +170,8 @@ Connector [1] specific documentation. Each connector message includes two u32 fields as "address". w1 uses CN_W1_IDX and CN_W1_VAL defined in include/linux/connector.h header. Each message also includes sequence and acknowledge numbers. -Sequence number for event messages is appropriate bus master sequence number increased with -each event message sent "through" this master. +Sequence number for event messages is appropriate bus master sequence number +increased with each event message sent "through" this master. Sequence number for userspace requests is set by userspace application. Sequence number for reply is the same as was in request, and acknowledge number is set to seq+1. @@ -93,6 +181,6 @@ Additional documantion, source code examples. ============================================ 1. Documentation/connector -2. http://tservice.net.ru/~s0mbre/archive/w1 -This archive includes userspace application w1d.c which -uses read/write/search commands for all master/slave devices found on the bus. +2. http://www.ioremap.net/archive/w1 +This archive includes userspace application w1d.c which uses +read/write/search commands for all master/slave devices found on the bus. diff --git a/Documentation/wimax/README.i2400m b/Documentation/wimax/README.i2400m new file mode 100644 index 00000000000..7dffd8919cb --- /dev/null +++ b/Documentation/wimax/README.i2400m @@ -0,0 +1,260 @@ + + Driver for the Intel Wireless Wimax Connection 2400m + + (C) 2008 Intel Corporation < linux-wimax@intel.com > + + This provides a driver for the Intel Wireless WiMAX Connection 2400m + and a basic Linux kernel WiMAX stack. + +1. Requirements + + * Linux installation with Linux kernel 2.6.22 or newer (if building + from a separate tree) + * Intel i2400m Echo Peak or Baxter Peak; this includes the Intel + Wireless WiMAX/WiFi Link 5x50 series. + * build tools: + + Linux kernel development package for the target kernel; to + build against your currently running kernel, you need to have + the kernel development package corresponding to the running + image installed (usually if your kernel is named + linux-VERSION, the development package is called + linux-dev-VERSION or linux-headers-VERSION). + + GNU C Compiler, make + +2. Compilation and installation + +2.1. Compilation of the drivers included in the kernel + + Configure the kernel; to enable the WiMAX drivers select Drivers > + Networking Drivers > WiMAX device support. Enable all of them as + modules (easier). + + If USB or SDIO are not enabled in the kernel configuration, the options + to build the i2400m USB or SDIO drivers will not show. Enable said + subsystems and go back to the WiMAX menu to enable the drivers. + + Compile and install your kernel as usual. + +2.2. Compilation of the drivers distributed as an standalone module + + To compile + +$ cd source/directory +$ make + + Once built you can load and unload using the provided load.sh script; + load.sh will load the modules, load.sh u will unload them. + + To install in the default kernel directories (and enable auto loading + when the device is plugged): + +$ make install +$ depmod -a + + If your kernel development files are located in a non standard + directory or if you want to build for a kernel that is not the + currently running one, set KDIR to the right location: + +$ make KDIR=/path/to/kernel/dev/tree + + For more information, please contact linux-wimax@intel.com. + +3. Installing the firmware + + The firmware can be obtained from http://linuxwimax.org or might have + been supplied with your hardware. + + It has to be installed in the target system: + * +$ cp FIRMWAREFILE.sbcf /lib/firmware/i2400m-fw-BUSTYPE-1.3.sbcf + + * NOTE: if your firmware came in an .rpm or .deb file, just install + it as normal, with the rpm (rpm -i FIRMWARE.rpm) or dpkg + (dpkg -i FIRMWARE.deb) commands. No further action is needed. + * BUSTYPE will be usb or sdio, depending on the hardware you have. + Each hardware type comes with its own firmware and will not work + with other types. + +4. Design + + This package contains two major parts: a WiMAX kernel stack and a + driver for the Intel i2400m. + + The WiMAX stack is designed to provide for common WiMAX control + services to current and future WiMAX devices from any vendor; please + see README.wimax for details. + + The i2400m kernel driver is broken up in two main parts: the bus + generic driver and the bus-specific drivers. The bus generic driver + forms the drivercore and contain no knowledge of the actual method we + use to connect to the device. The bus specific drivers are just the + glue to connect the bus-generic driver and the device. Currently only + USB and SDIO are supported. See drivers/net/wimax/i2400m/i2400m.h for + more information. + + The bus generic driver is logically broken up in two parts: OS-glue and + hardware-glue. The OS-glue interfaces with Linux. The hardware-glue + interfaces with the device on using an interface provided by the + bus-specific driver. The reason for this breakup is to be able to + easily reuse the hardware-glue to write drivers for other OSes; note + the hardware glue part is written as a native Linux driver; no + abstraction layers are used, so to port to another OS, the Linux kernel + API calls should be replaced with the target OS's. + +5. Usage + + To load the driver, follow the instructions in the install section; + once the driver is loaded, plug in the device (unless it is permanently + plugged in). The driver will enumerate the device, upload the firmware + and output messages in the kernel log (dmesg, /var/log/messages or + /var/log/kern.log) such as: + +... +i2400m_usb 5-4:1.0: firmware interface version 8.0.0 +i2400m_usb 5-4:1.0: WiMAX interface wmx0 (00:1d:e1:01:94:2c) ready + + At this point the device is ready to work. + + Current versions require the Intel WiMAX Network Service in userspace + to make things work. See the network service's README for instructions + on how to scan, connect and disconnect. + +5.1. Module parameters + + Module parameters can be set at kernel or module load time or by + echoing values: + +$ echo VALUE > /sys/module/MODULENAME/parameters/PARAMETERNAME + + To make changes permanent, for example, for the i2400m module, you can + also create a file named /etc/modprobe.d/i2400m containing: + +options i2400m idle_mode_disabled=1 + + To find which parameters are supported by a module, run: + +$ modinfo path/to/module.ko + + During kernel bootup (if the driver is linked in the kernel), specify + the following to the kernel command line: + +i2400m.PARAMETER=VALUE + +5.1.1. i2400m: idle_mode_disabled + + The i2400m module supports a parameter to disable idle mode. This + parameter, once set, will take effect only when the device is + reinitialized by the driver (eg: following a reset or a reconnect). + +5.2. Debug operations: debugfs entries + + The driver will register debugfs entries that allow the user to tweak + debug settings. There are three main container directories where + entries are placed, which correspond to the three blocks a i2400m WiMAX + driver has: + * /sys/kernel/debug/wimax:DEVNAME/ for the generic WiMAX stack + controls + * /sys/kernel/debug/wimax:DEVNAME/i2400m for the i2400m generic + driver controls + * /sys/kernel/debug/wimax:DEVNAME/i2400m-usb (or -sdio) for the + bus-specific i2400m-usb or i2400m-sdio controls). + + Of course, if debugfs is mounted in a directory other than + /sys/kernel/debug, those paths will change. + +5.2.1. Increasing debug output + + The files named *dl_* indicate knobs for controlling the debug output + of different submodules: + * +# find /sys/kernel/debug/wimax\:wmx0 -name \*dl_\* +/sys/kernel/debug/wimax:wmx0/i2400m-usb/dl_tx +/sys/kernel/debug/wimax:wmx0/i2400m-usb/dl_rx +/sys/kernel/debug/wimax:wmx0/i2400m-usb/dl_notif +/sys/kernel/debug/wimax:wmx0/i2400m-usb/dl_fw +/sys/kernel/debug/wimax:wmx0/i2400m-usb/dl_usb +/sys/kernel/debug/wimax:wmx0/i2400m/dl_tx +/sys/kernel/debug/wimax:wmx0/i2400m/dl_rx +/sys/kernel/debug/wimax:wmx0/i2400m/dl_rfkill +/sys/kernel/debug/wimax:wmx0/i2400m/dl_netdev +/sys/kernel/debug/wimax:wmx0/i2400m/dl_fw +/sys/kernel/debug/wimax:wmx0/i2400m/dl_debugfs +/sys/kernel/debug/wimax:wmx0/i2400m/dl_driver +/sys/kernel/debug/wimax:wmx0/i2400m/dl_control +/sys/kernel/debug/wimax:wmx0/wimax_dl_stack +/sys/kernel/debug/wimax:wmx0/wimax_dl_op_rfkill +/sys/kernel/debug/wimax:wmx0/wimax_dl_op_reset +/sys/kernel/debug/wimax:wmx0/wimax_dl_op_msg +/sys/kernel/debug/wimax:wmx0/wimax_dl_id_table +/sys/kernel/debug/wimax:wmx0/wimax_dl_debugfs + + By reading the file you can obtain the current value of said debug + level; by writing to it, you can set it. + + To increase the debug level of, for example, the i2400m's generic TX + engine, just write: + +$ echo 3 > /sys/kernel/debug/wimax:wmx0/i2400m/dl_tx + + Increasing numbers yield increasing debug information; for details of + what is printed and the available levels, check the source. The code + uses 0 for disabled and increasing values until 8. + +5.2.2. RX and TX statistics + + The i2400m/rx_stats and i2400m/tx_stats provide statistics about the + data reception/delivery from the device: + +$ cat /sys/kernel/debug/wimax:wmx0/i2400m/rx_stats +45 1 3 34 3104 48 480 + + The numbers reported are + * packets/RX-buffer: total, min, max + * RX-buffers: total RX buffers received, accumulated RX buffer size + in bytes, min size received, max size received + + Thus, to find the average buffer size received, divide accumulated + RX-buffer / total RX-buffers. + + To clear the statistics back to 0, write anything to the rx_stats file: + +$ echo 1 > /sys/kernel/debug/wimax:wmx0/i2400m_rx_stats + + Likewise for TX. + + Note the packets this debug file refers to are not network packet, but + packets in the sense of the device-specific protocol for communication + to the host. See drivers/net/wimax/i2400m/tx.c. + +5.2.3. Tracing messages received from user space + + To echo messages received from user space into the trace pipe that the + i2400m driver creates, set the debug file i2400m/trace_msg_from_user to + 1: + * +$ echo 1 > /sys/kernel/debug/wimax:wmx0/i2400m/trace_msg_from_user + +5.2.4. Performing a device reset + + By writing a 0, a 1 or a 2 to the file + /sys/kernel/debug/wimax:wmx0/reset, the driver performs a warm (without + disconnecting from the bus), cold (disconnecting from the bus) or bus + (bus specific) reset on the device. + +5.2.5. Asking the device to enter power saving mode + + By writing any value to the /sys/kernel/debug/wimax:wmx0 file, the + device will attempt to enter power saving mode. + +6. Troubleshooting + +6.1. Driver complains about 'i2400m-fw-usb-1.2.sbcf: request failed' + + If upon connecting the device, the following is output in the kernel + log: + +i2400m_usb 5-4:1.0: fw i2400m-fw-usb-1.3.sbcf: request failed: -2 + + This means that the driver cannot locate the firmware file named + /lib/firmware/i2400m-fw-usb-1.2.sbcf. Check that the file is present in + the right location. diff --git a/Documentation/wimax/README.wimax b/Documentation/wimax/README.wimax new file mode 100644 index 00000000000..b78c4378084 --- /dev/null +++ b/Documentation/wimax/README.wimax @@ -0,0 +1,81 @@ + + Linux kernel WiMAX stack + + (C) 2008 Intel Corporation < linux-wimax@intel.com > + + This provides a basic Linux kernel WiMAX stack to provide a common + control API for WiMAX devices, usable from kernel and user space. + +1. Design + + The WiMAX stack is designed to provide for common WiMAX control + services to current and future WiMAX devices from any vendor. + + Because currently there is only one and we don't know what would be the + common services, the APIs it currently provides are very minimal. + However, it is done in such a way that it is easily extensible to + accommodate future requirements. + + The stack works by embedding a struct wimax_dev in your device's + control structures. This provides a set of callbacks that the WiMAX + stack will call in order to implement control operations requested by + the user. As well, the stack provides API functions that the driver + calls to notify about changes of state in the device. + + The stack exports the API calls needed to control the device to user + space using generic netlink as a marshalling mechanism. You can access + them using your own code or use the wrappers provided for your + convenience in libwimax (in the wimax-tools package). + + For detailed information on the stack, please see + include/linux/wimax.h. + +2. Usage + + For usage in a driver (registration, API, etc) please refer to the + instructions in the header file include/linux/wimax.h. + + When a device is registered with the WiMAX stack, a set of debugfs + files will appear in /sys/kernel/debug/wimax:wmxX can tweak for + control. + +2.1. Obtaining debug information: debugfs entries + + The WiMAX stack is compiled, by default, with debug messages that can + be used to diagnose issues. By default, said messages are disabled. + + The drivers will register debugfs entries that allow the user to tweak + debug settings. + + Each driver, when registering with the stack, will cause a debugfs + directory named wimax:DEVICENAME to be created; optionally, it might + create more subentries below it. + +2.1.1. Increasing debug output + + The files named *dl_* indicate knobs for controlling the debug output + of different submodules of the WiMAX stack: + * +# find /sys/kernel/debug/wimax\:wmx0 -name \*dl_\* +/sys/kernel/debug/wimax:wmx0/wimax_dl_stack +/sys/kernel/debug/wimax:wmx0/wimax_dl_op_rfkill +/sys/kernel/debug/wimax:wmx0/wimax_dl_op_reset +/sys/kernel/debug/wimax:wmx0/wimax_dl_op_msg +/sys/kernel/debug/wimax:wmx0/wimax_dl_id_table +/sys/kernel/debug/wimax:wmx0/wimax_dl_debugfs +/sys/kernel/debug/wimax:wmx0/.... # other driver specific files + + NOTE: Of course, if debugfs is mounted in a directory other than + /sys/kernel/debug, those paths will change. + + By reading the file you can obtain the current value of said debug + level; by writing to it, you can set it. + + To increase the debug level of, for example, the id-table submodule, + just write: + +$ echo 3 > /sys/kernel/debug/wimax:wmx0/wimax_dl_id_table + + Increasing numbers yield increasing debug information; for details of + what is printed and the available levels, check the source. The code + uses 0 for disabled and increasing values until 8. diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt index 83c0033ee9e..7b4596ac412 100644 --- a/Documentation/x86/boot.txt +++ b/Documentation/x86/boot.txt @@ -44,7 +44,7 @@ Protocol 2.07: (Kernel 2.6.24) Added paravirtualised boot protocol. and KEEP_SEGMENTS flag in load_flags. Protocol 2.08: (Kernel 2.6.26) Added crc32 checksum and ELF format - payload. Introduced payload_offset and payload length + payload. Introduced payload_offset and payload_length fields to aid in locating the payload. Protocol 2.09: (Kernel 2.6.26) Added a field of 64-bit physical @@ -349,7 +349,7 @@ Protocol: 2.00+ 3 SYSLINUX 4 EtherBoot 5 ELILO - 7 GRuB + 7 GRUB 8 U-BOOT 9 Xen A Gujin @@ -537,8 +537,8 @@ Type: read Offset/size: 0x248/4 Protocol: 2.08+ - If non-zero then this field contains the offset from the end of the - real-mode code to the payload. + If non-zero then this field contains the offset from the beginning + of the protected-mode code to the payload. The payload may be compressed. The format of both the compressed and uncompressed data should be determined using the standard magic diff --git a/Documentation/x86/pat.txt b/Documentation/x86/pat.txt index c93ff5f4c0d..cf08c9fff3c 100644 --- a/Documentation/x86/pat.txt +++ b/Documentation/x86/pat.txt @@ -80,6 +80,30 @@ pci proc | -- | -- | WC | | | | | ------------------------------------------------------------------- +Advanced APIs for drivers +------------------------- +A. Exporting pages to users with remap_pfn_range, io_remap_pfn_range, +vm_insert_pfn + +Drivers wanting to export some pages to userspace do it by using mmap +interface and a combination of +1) pgprot_noncached() +2) io_remap_pfn_range() or remap_pfn_range() or vm_insert_pfn() + +With PAT support, a new API pgprot_writecombine is being added. So, drivers can +continue to use the above sequence, with either pgprot_noncached() or +pgprot_writecombine() in step 1, followed by step 2. + +In addition, step 2 internally tracks the region as UC or WC in memtype +list in order to ensure no conflicting mapping. + +Note that this set of APIs only works with IO (non RAM) regions. If driver +wants to export a RAM region, it has to do set_memory_uc() or set_memory_wc() +as step 0 above and also track the usage of those pages and use set_memory_wb() +before the page is freed to free pool. + + + Notes: -- in the above table mean "Not suggested usage for the API". Some of the --'s diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt index f6d561a1a9b..34c13040a71 100644 --- a/Documentation/x86/x86_64/boot-options.txt +++ b/Documentation/x86/x86_64/boot-options.txt @@ -79,17 +79,6 @@ Timing Report when timer interrupts are lost because some code turned off interrupts for too long. - nmi_watchdog=NUMBER[,panic] - NUMBER can be: - 0 don't use an NMI watchdog - 1 use the IO-APIC timer for the NMI watchdog - 2 use the local APIC for the NMI watchdog using a performance counter. Note - This will use one performance counter and the local APIC's performance - vector. - When panic is specified panic when an NMI watchdog timeout occurs. - This is useful when you use a panic=... timeout and need the box - quickly up again. - nohpet Don't use the HPET timer. diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt index efce7509736..29b52b14d0b 100644 --- a/Documentation/x86/x86_64/mm.txt +++ b/Documentation/x86/x86_64/mm.txt @@ -6,7 +6,7 @@ Virtual memory map with 4 level page tables: 0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm hole caused by [48:63] sign extension ffff800000000000 - ffff80ffffffffff (=40 bits) guard hole -ffff810000000000 - ffffc0ffffffffff (=46 bits) direct mapping of all phys. memory +ffff880000000000 - ffffc0ffffffffff (=57 TB) direct mapping of all phys. memory ffffc10000000000 - ffffc1ffffffffff (=40 bits) hole ffffc20000000000 - ffffe1ffffffffff (=45 bits) vmalloc/ioremap space ffffe20000000000 - ffffe2ffffffffff (=40 bits) virtual memory map (1TB) diff --git a/Documentation/x86/zero-page.txt b/Documentation/x86/zero-page.txt index 169ad423a3d..4f913857b8a 100644 --- a/Documentation/x86/zero-page.txt +++ b/Documentation/x86/zero-page.txt @@ -3,7 +3,7 @@ protocol of kernel. These should be filled by bootloader or 16-bit real-mode setup code of the kernel. References/settings to it mainly are in: - include/asm-x86/bootparam.h + arch/x86/include/asm/bootparam.h Offset Proto Name Meaning -- cgit v1.2.3