aboutsummaryrefslogtreecommitdiff
path: root/Documentation
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation')
-rw-r--r--Documentation/00-INDEX24
-rw-r--r--Documentation/CodingStyle7
-rw-r--r--Documentation/DMA-mapping.txt24
-rw-r--r--Documentation/DocBook/Makefile2
-rw-r--r--Documentation/DocBook/deviceiobook.tmpl10
-rw-r--r--Documentation/DocBook/filesystems.tmpl36
-rw-r--r--Documentation/DocBook/gadget.tmpl6
-rw-r--r--Documentation/DocBook/kernel-api.tmpl7
-rw-r--r--Documentation/DocBook/mtdnand.tmpl6
-rw-r--r--Documentation/HOWTO4
-rw-r--r--Documentation/IPMI.txt25
-rw-r--r--Documentation/Intel-IOMMU.txt115
-rw-r--r--Documentation/RCU/00-INDEX22
-rw-r--r--Documentation/SM501.txt5
-rw-r--r--Documentation/SubmitChecklist2
-rw-r--r--Documentation/SubmittingDrivers3
-rw-r--r--Documentation/accounting/cgroupstats.txt27
-rw-r--r--Documentation/accounting/getdelays.c1
-rw-r--r--Documentation/arm/00-INDEX22
-rw-r--r--Documentation/arm/Samsung-S3C24XX/DMA.txt18
-rw-r--r--Documentation/atomic_ops.txt73
-rw-r--r--Documentation/block/00-INDEX20
-rw-r--r--Documentation/block/as-iosched.txt21
-rw-r--r--Documentation/block/biodoc.txt4
-rw-r--r--Documentation/block/deadline-iosched.txt25
-rw-r--r--Documentation/block/ioprio.txt2
-rw-r--r--Documentation/block/request.txt2
-rw-r--r--Documentation/block/switching-sched.txt21
-rw-r--r--Documentation/cachetlb.txt33
-rw-r--r--Documentation/cdrom/cdrom-standard.tex2
-rw-r--r--Documentation/cgroups.txt545
-rw-r--r--Documentation/cpu-hotplug.txt4
-rw-r--r--Documentation/cpusets.txt250
-rw-r--r--Documentation/device-mapper/dm-uevent.txt97
-rw-r--r--Documentation/devices.txt2
-rw-r--r--Documentation/dontdiff8
-rw-r--r--Documentation/driver-model/devres.txt4
-rw-r--r--Documentation/early-userspace/README6
-rw-r--r--Documentation/email-clients.txt217
-rw-r--r--Documentation/fb/00-INDEX46
-rw-r--r--Documentation/fb/deferred_io.txt4
-rw-r--r--Documentation/fb/uvesafb.txt188
-rw-r--r--Documentation/feature-removal-schedule.txt91
-rw-r--r--Documentation/filesystems/00-INDEX10
-rw-r--r--Documentation/filesystems/9p.txt24
-rw-r--r--Documentation/filesystems/Exporting115
-rw-r--r--Documentation/filesystems/Locking11
-rw-r--r--Documentation/filesystems/ext3.txt14
-rw-r--r--Documentation/filesystems/files.txt6
-rw-r--r--Documentation/filesystems/locks.txt (renamed from Documentation/locks.txt)10
-rw-r--r--Documentation/filesystems/mandatory-locking.txt (renamed from Documentation/mandatory.txt)21
-rw-r--r--Documentation/filesystems/proc.txt37
-rw-r--r--Documentation/filesystems/quota.txt59
-rw-r--r--Documentation/filesystems/ramfs-rootfs-initramfs.txt14
-rw-r--r--Documentation/filesystems/sysfs.txt2
-rw-r--r--Documentation/filesystems/vfs.txt53
-rw-r--r--Documentation/firmware_class/firmware_sample_firmware_class.c10
-rw-r--r--Documentation/i2c/i2c-protocol2
-rw-r--r--Documentation/i386/boot.txt34
-rw-r--r--Documentation/ia64/err_inject.txt6
-rw-r--r--Documentation/ide.txt6
-rw-r--r--Documentation/initrd.txt12
-rw-r--r--Documentation/input/atarikbd.txt8
-rw-r--r--Documentation/input/ff.txt2
-rw-r--r--Documentation/input/iforce-protocol.txt20
-rw-r--r--Documentation/input/input-programming.txt17
-rw-r--r--Documentation/isdn/CREDITS2
-rw-r--r--Documentation/isdn/README.concap2
-rw-r--r--Documentation/java.txt2
-rw-r--r--Documentation/kbuild/kconfig-language.txt14
-rw-r--r--Documentation/kbuild/makefiles.txt84
-rw-r--r--Documentation/kdump/kdump.txt89
-rw-r--r--Documentation/kernel-docs.txt4
-rw-r--r--Documentation/kernel-parameters.txt122
-rw-r--r--Documentation/keys-request-key.txt25
-rw-r--r--Documentation/keys.txt93
-rw-r--r--Documentation/lguest/Makefile26
-rw-r--r--Documentation/lguest/lguest.c1629
-rw-r--r--Documentation/lguest/lguest.txt72
-rw-r--r--Documentation/local_ops.txt25
-rw-r--r--Documentation/m68k/kernel-options.txt9
-rw-r--r--Documentation/make/headers_install.txt46
-rw-r--r--Documentation/markers.txt81
-rw-r--r--Documentation/memory-barriers.txt14
-rw-r--r--Documentation/memory-hotplug.txt58
-rw-r--r--Documentation/mips/00-INDEX6
-rw-r--r--Documentation/mips/AU1xxx_IDE.README2
-rw-r--r--Documentation/mips/time.README173
-rw-r--r--Documentation/mutex-design.txt7
-rw-r--r--Documentation/networking/bcm43xx.txt2
-rw-r--r--Documentation/networking/bonding.txt33
-rw-r--r--Documentation/networking/ip-sysctl.txt2
-rw-r--r--Documentation/networking/proc_net_tcp.txt5
-rw-r--r--Documentation/networking/rxrpc.txt9
-rw-r--r--Documentation/networking/udplite.txt6
-rw-r--r--Documentation/parport-lowlevel.txt29
-rw-r--r--Documentation/power/00-INDEX34
-rw-r--r--Documentation/power/basic-pm-debugging.txt4
-rw-r--r--Documentation/power/drivers-testing.txt4
-rw-r--r--Documentation/power/freezing-of-tasks.txt44
-rw-r--r--Documentation/power/interface.txt2
-rw-r--r--Documentation/power/swsusp-and-swap-files.txt2
-rw-r--r--Documentation/powerpc/00-INDEX4
-rw-r--r--Documentation/powerpc/eeh-pci-error-recovery.txt4
-rw-r--r--Documentation/powerpc/mpc52xx-device-tree-bindings.txt16
-rw-r--r--Documentation/ramdisk.txt18
-rw-r--r--Documentation/sched-design-CFS.txt67
-rw-r--r--Documentation/scsi/00-INDEX34
-rw-r--r--Documentation/scsi/ChangeLog.arcmsr17
-rw-r--r--Documentation/scsi/ChangeLog.ncr53c8xx6
-rw-r--r--Documentation/scsi/aacraid.txt8
-rw-r--r--Documentation/scsi/advansys.txt243
-rw-r--r--Documentation/scsi/aic79xx.txt2
-rw-r--r--Documentation/scsi/aic7xxx.txt6
-rw-r--r--Documentation/scsi/arcmsr_spec.txt2
-rw-r--r--Documentation/scsi/ibmmca.txt60
-rw-r--r--Documentation/scsi/ncr53c8xx.txt14
-rw-r--r--Documentation/sharedsubtree.txt1
-rw-r--r--Documentation/sound/alsa/ALSA-Configuration.txt115
-rw-r--r--Documentation/sound/alsa/CMIPCI.txt17
-rw-r--r--Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl184
-rw-r--r--Documentation/sound/alsa/OSS-Emulation.txt7
-rw-r--r--Documentation/sound/alsa/hda_codec.txt49
-rw-r--r--Documentation/sound/alsa/powersave.txt41
-rw-r--r--Documentation/sound/alsa/soc/DAI.txt8
-rw-r--r--Documentation/sound/alsa/soc/clocking.txt10
-rw-r--r--Documentation/sound/alsa/soc/codec.txt6
-rw-r--r--Documentation/sound/alsa/soc/dapm.txt4
-rw-r--r--Documentation/sound/alsa/soc/overview.txt17
-rw-r--r--Documentation/sound/alsa/soc/platform.txt2
-rw-r--r--Documentation/sound/alsa/soc/pops_clicks.txt6
-rw-r--r--Documentation/sound/oss/es137164
-rw-r--r--Documentation/spi/pxa2xx2
-rw-r--r--Documentation/spi/spi-summary25
-rw-r--r--Documentation/spi/spidev_test.c6
-rw-r--r--Documentation/sysctl/00-INDEX16
-rw-r--r--Documentation/sysctl/kernel.txt8
-rw-r--r--Documentation/sysctl/vm.txt28
-rw-r--r--Documentation/telephony/00-INDEX4
-rw-r--r--Documentation/thinkpad-acpi.txt29
-rw-r--r--Documentation/usb/usb-serial.txt2
-rw-r--r--Documentation/vm/00-INDEX20
-rw-r--r--Documentation/vm/numa_memory_policy.txt33
-rw-r--r--Documentation/vm/slabinfo.c27
-rw-r--r--Documentation/w1/00-INDEX8
-rw-r--r--Documentation/w1/masters/00-INDEX6
-rw-r--r--Documentation/w1/masters/ds24822
-rw-r--r--Documentation/w1/masters/ds24902
-rw-r--r--Documentation/x86_64/mm.txt1
-rw-r--r--Documentation/xterm-linux.xpm61
150 files changed, 4594 insertions, 2003 deletions
diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX
index 43e89b1537d..299615d821a 100644
--- a/Documentation/00-INDEX
+++ b/Documentation/00-INDEX
@@ -22,6 +22,8 @@ CodingStyle
- how the boss likes the C code in the kernel to look.
DMA-API.txt
- DMA API, pci_ API & extensions for non-consistent memory machines.
+DMA-ISA-LPC.txt
+ - How to do DMA with ISA (and LPC) devices.
DMA-mapping.txt
- info for PCI drivers using DMA portably across all platforms.
DocBook/
@@ -50,6 +52,8 @@ README.cycladesZ
- info on Cyclades-Z firmware loading.
SAK.txt
- info on Secure Attention Keys.
+SM501.txt
+ - Silicon Motion SM501 multimedia companion chip
SecurityBugs
- procedure for reporting security bugs found in the kernel.
SubmitChecklist
@@ -145,7 +149,7 @@ fb/
feature-removal-schedule.txt
- list of files and features that are going to be removed.
filesystems/
- - directory with info on the various filesystems that Linux supports.
+ - info on the vfs and the various filesystems that Linux supports.
firmware_class/
- request_firmware() hotplug interface info.
floppy.txt
@@ -230,8 +234,6 @@ local_ops.txt
- semantics and behavior of local atomic operations.
lockdep-design.txt
- documentation on the runtime locking correctness validator.
-locks.txt
- - info on file locking implementations, flock() vs. fcntl(), etc.
logo.gif
- full colour GIF image of Linux logo (penguin - Tux).
logo.txt
@@ -240,14 +242,14 @@ m68k/
- directory with info about Linux on Motorola 68k architecture.
magic-number.txt
- list of magic numbers used to mark/protect kernel data structures.
-mandatory.txt
- - info on the Linux implementation of Sys V mandatory file locking.
mca.txt
- info on supporting Micro Channel Architecture (e.g. PS/2) systems.
md.txt
- info on boot arguments for the multiple devices driver.
memory-barriers.txt
- info on Linux kernel memory barriers.
+memory-hotplug.txt
+ - Hotpluggable memory support, how to use and current status.
memory.txt
- info on typical Linux memory problems.
mips/
@@ -298,6 +300,8 @@ pm.txt
- info on Linux power management support.
pnp.txt
- Linux Plug and Play documentation.
+power_supply_class.txt
+ - Tells userspace about battery, UPS, AC or DC power supply properties
power/
- directory with info on Linux PCI power management.
powerpc/
@@ -334,8 +338,12 @@ sched-coding.txt
- reference for various scheduler-related methods in the O(1) scheduler.
sched-design.txt
- goals, design and implementation of the Linux O(1) scheduler.
+sched-design-CFS.txt
+ - goals, design and implementation of the Complete Fair Scheduler.
sched-domains.txt
- information on scheduling domains.
+sched-nice-design.txt
+ - How and why the scheduler's nice levels are implemented.
sched-stats.txt
- information on schedstats (Linux Scheduler Statistics).
scsi/
@@ -380,6 +388,8 @@ stallion.txt
- info on using the Stallion multiport serial driver.
svga.txt
- short guide on selecting video modes at boot via VGA BIOS.
+sysfs-rules.txt
+ - How not to use sysfs.
sx.txt
- info on the Specialix SX/SI multiport serial driver.
sysctl/
@@ -410,6 +420,8 @@ video4linux/
- directory with info regarding video/TV/radio cards and linux.
vm/
- directory with info on the Linux vm code.
+volatile-considered-harmful.txt
+ - Why the "volatile" type class should not be used
voyager.txt
- guide to running Linux on the Voyager architecture.
w1/
@@ -418,7 +430,5 @@ watchdog/
- how to auto-reboot Linux if it has "fallen and can't get up". ;-)
x86_64/
- directory with info on Linux support for AMD x86-64 (Hammer) machines.
-xterm-linux.xpm
- - XPM image of penguin logo (see logo.txt) sitting on an xterm.
zorro.txt
- info on writing drivers for Zorro bus devices found on Amigas.
diff --git a/Documentation/CodingStyle b/Documentation/CodingStyle
index 7f1730f1a1a..6caa1461557 100644
--- a/Documentation/CodingStyle
+++ b/Documentation/CodingStyle
@@ -77,12 +77,15 @@ Get a decent editor and don't leave whitespace at the end of lines.
Coding style is all about readability and maintainability using commonly
available tools.
-The limit on the length of lines is 80 columns and this is a hard limit.
+The limit on the length of lines is 80 columns and this is a strongly
+preferred limit.
Statements longer than 80 columns will be broken into sensible chunks.
Descendants are always substantially shorter than the parent and are placed
substantially to the right. The same applies to function headers with a long
-argument list. Long strings are as well broken into shorter strings.
+argument list. Long strings are as well broken into shorter strings. The
+only exception to this is where exceeding 80 columns significantly increases
+readability and does not hide information.
void fun(int a, int b, int c)
{
diff --git a/Documentation/DMA-mapping.txt b/Documentation/DMA-mapping.txt
index e07f2530326..d84f89dbf92 100644
--- a/Documentation/DMA-mapping.txt
+++ b/Documentation/DMA-mapping.txt
@@ -189,12 +189,6 @@ smaller mask as pci_set_dma_mask(). However for the rare case that a
device driver only uses consistent allocations, one would have to
check the return value from pci_set_consistent_dma_mask().
-If your 64-bit device is going to be an enormous consumer of DMA
-mappings, this can be problematic since the DMA mappings are a
-finite resource on many platforms. Please see the "DAC Addressing
-for Address Space Hungry Devices" section near the end of this
-document for how to handle this case.
-
Finally, if your device can only drive the low 24-bits of
address during PCI bus mastering you might do something like:
@@ -203,8 +197,6 @@ address during PCI bus mastering you might do something like:
"mydev: 24-bit DMA addressing not available.\n");
goto ignore_this_device;
}
-[Better use DMA_24BIT_MASK instead of 0x00ffffff.
-See linux/include/dma-mapping.h for reference.]
When pci_set_dma_mask() is successful, and returns zero, the PCI layer
saves away this mask you have provided. The PCI layer will use this
@@ -514,7 +506,7 @@ With scatterlists, you map a region gathered from several regions by:
int i, count = pci_map_sg(dev, sglist, nents, direction);
struct scatterlist *sg;
- for (i = 0, sg = sglist; i < count; i++, sg++) {
+ for_each_sg(sglist, sg, count, i) {
hw_address[i] = sg_dma_address(sg);
hw_len[i] = sg_dma_len(sg);
}
@@ -652,18 +644,6 @@ It is planned to completely remove virt_to_bus() and bus_to_virt() as
they are entirely deprecated. Some ports already do not provide these
as it is impossible to correctly support them.
- 64-bit DMA and DAC cycle support
-
-Do you understand all of the text above? Great, then you already
-know how to use 64-bit DMA addressing under Linux. Simply make
-the appropriate pci_set_dma_mask() calls based upon your cards
-capabilities, then use the mapping APIs above.
-
-It is that simple.
-
-Well, not for some odd devices. See the next section for information
-about that.
-
Optimizing Unmap State Space Consumption
On many platforms, pci_unmap_{single,page}() is simply a nop.
@@ -782,5 +762,5 @@ following people:
Jay Estabrook <Jay.Estabrook@compaq.com>
Thomas Sailer <sailer@ife.ee.ethz.ch>
Andrea Arcangeli <andrea@suse.de>
- Jens Axboe <axboe@suse.de>
+ Jens Axboe <jens.axboe@oracle.com>
David Mosberger-Tang <davidm@hpl.hp.com>
diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile
index 1a7f53068ec..054a7ecf64c 100644
--- a/Documentation/DocBook/Makefile
+++ b/Documentation/DocBook/Makefile
@@ -165,7 +165,7 @@ quiet_cmd_db2man = MAN $@
@touch $@
###
-# Rules to generate postscripts and PNG imgages from .fig format files
+# Rules to generate postscripts and PNG images from .fig format files
quiet_cmd_fig2eps = FIG2EPS $@
cmd_fig2eps = fig2dev -Leps $< $@
diff --git a/Documentation/DocBook/deviceiobook.tmpl b/Documentation/DocBook/deviceiobook.tmpl
index 361c884d860..9ee6f3cbb41 100644
--- a/Documentation/DocBook/deviceiobook.tmpl
+++ b/Documentation/DocBook/deviceiobook.tmpl
@@ -85,7 +85,7 @@
<chapter id="mmio">
<title>Memory Mapped IO</title>
- <sect1>
+ <sect1 id="getting_access_to_the_device">
<title>Getting Access to the Device</title>
<para>
The most widely supported form of IO is memory mapped IO.
@@ -114,7 +114,7 @@
</para>
</sect1>
- <sect1>
+ <sect1 id="accessing_the_device">
<title>Accessing the device</title>
<para>
The part of the interface most used by drivers is reading and
@@ -272,9 +272,9 @@ CPU B: spin_unlock_irqrestore(&amp;dev_lock, flags)
</chapter>
- <chapter>
+ <chapter id="port_space_accesses">
<title>Port Space Accesses</title>
- <sect1>
+ <sect1 id="port_space_explained">
<title>Port Space Explained</title>
<para>
@@ -291,7 +291,7 @@ CPU B: spin_unlock_irqrestore(&amp;dev_lock, flags)
</para>
</sect1>
- <sect1>
+ <sect1 id="accessing_port_space">
<title>Accessing Port Space</title>
<para>
Accesses to this space are provided through a set of functions
diff --git a/Documentation/DocBook/filesystems.tmpl b/Documentation/DocBook/filesystems.tmpl
index 39fa2aba7f9..5eaef87e8f1 100644
--- a/Documentation/DocBook/filesystems.tmpl
+++ b/Documentation/DocBook/filesystems.tmpl
@@ -40,25 +40,25 @@
<chapter id="vfs">
<title>The Linux VFS</title>
- <sect1><title>The Filesystem types</title>
+ <sect1 id="the_filesystem_types"><title>The Filesystem types</title>
!Iinclude/linux/fs.h
</sect1>
- <sect1><title>The Directory Cache</title>
+ <sect1 id="the_directory_cache"><title>The Directory Cache</title>
!Efs/dcache.c
!Iinclude/linux/dcache.h
</sect1>
- <sect1><title>Inode Handling</title>
+ <sect1 id="inode_handling"><title>Inode Handling</title>
!Efs/inode.c
!Efs/bad_inode.c
</sect1>
- <sect1><title>Registration and Superblocks</title>
+ <sect1 id="registration_and_superblocks"><title>Registration and Superblocks</title>
!Efs/super.c
</sect1>
- <sect1><title>File Locks</title>
+ <sect1 id="file_locks"><title>File Locks</title>
!Efs/locks.c
!Ifs/locks.c
</sect1>
- <sect1><title>Other Functions</title>
+ <sect1 id="other_functions"><title>Other Functions</title>
!Efs/mpage.c
!Efs/namei.c
!Efs/buffer.c
@@ -73,11 +73,11 @@
<chapter id="proc">
<title>The proc filesystem</title>
- <sect1><title>sysctl interface</title>
+ <sect1 id="sysctl_interface"><title>sysctl interface</title>
!Ekernel/sysctl.c
</sect1>
- <sect1><title>proc filesystem interface</title>
+ <sect1 id="proc_filesystem_interface"><title>proc filesystem interface</title>
!Ifs/proc/base.c
</sect1>
</chapter>
@@ -92,7 +92,7 @@
<chapter id="debugfs">
<title>The debugfs filesystem</title>
- <sect1><title>debugfs interface</title>
+ <sect1 id="debugfs_interface"><title>debugfs interface</title>
!Efs/debugfs/inode.c
!Efs/debugfs/file.c
</sect1>
@@ -134,9 +134,9 @@
<title>The Linux Journalling API</title>
- <sect1>
+ <sect1 id="journaling_overview">
<title>Overview</title>
- <sect2>
+ <sect2 id="journaling_details">
<title>Details</title>
<para>
The journalling layer is easy to use. You need to
@@ -307,7 +307,7 @@ particular inode.
</sect2>
- <sect2>
+ <sect2 id="jbd_summary">
<title>Summary</title>
<para>
Using the journal is a matter of wrapping the different context changes,
@@ -349,7 +349,7 @@ an example.
</sect1>
- <sect1>
+ <sect1 id="data_types">
<title>Data Types</title>
<para>
The journalling layer uses typedefs to 'hide' the concrete definitions
@@ -358,27 +358,27 @@ an example.
Obviously the hiding is not enforced as this is 'C'.
</para>
- <sect2><title>Structures</title>
+ <sect2 id="structures"><title>Structures</title>
!Iinclude/linux/jbd.h
</sect2>
</sect1>
- <sect1>
+ <sect1 id="functions">
<title>Functions</title>
<para>
The functions here are split into two groups those that
affect a journal as a whole, and those which are used to
manage transactions
</para>
- <sect2><title>Journal Level</title>
+ <sect2 id="journal_level"><title>Journal Level</title>
!Efs/jbd/journal.c
!Ifs/jbd/recovery.c
</sect2>
- <sect2><title>Transasction Level</title>
+ <sect2 id="transaction_level"><title>Transaction Level</title>
!Efs/jbd/transaction.c
</sect2>
</sect1>
- <sect1>
+ <sect1 id="see_also">
<title>See also</title>
<para>
<citation>
diff --git a/Documentation/DocBook/gadget.tmpl b/Documentation/DocBook/gadget.tmpl
index 6996d977bf8..5a8ffa761e0 100644
--- a/Documentation/DocBook/gadget.tmpl
+++ b/Documentation/DocBook/gadget.tmpl
@@ -144,7 +144,7 @@ with the lowest level (which directly handles hardware).
<para>This is the lowest software level.
It is the only layer that talks to hardware,
through registers, fifos, dma, irqs, and the like.
- The <filename>&lt;linux/usb_gadget.h&gt;</filename> API abstracts
+ The <filename>&lt;linux/usb/gadget.h&gt;</filename> API abstracts
the peripheral controller endpoint hardware.
That hardware is exposed through endpoint objects, which accept
streams of IN/OUT buffers, and through callbacks that interact
@@ -494,7 +494,7 @@ side drivers (and usbcore).
<sect1 id="core"><title>Core Objects and Methods</title>
<para>These are declared in
-<filename>&lt;linux/usb_gadget.h&gt;</filename>,
+<filename>&lt;linux/usb/gadget.h&gt;</filename>,
and are used by gadget drivers to interact with
USB peripheral controller drivers.
</para>
@@ -509,7 +509,7 @@ USB peripheral controller drivers.
unless the explanations are trivial.
-->
-!Iinclude/linux/usb_gadget.h
+!Iinclude/linux/usb/gadget.h
</sect1>
<sect1 id="utils"><title>Optional Utilities</title>
diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl
index 230cbf75378..aa38cc5692a 100644
--- a/Documentation/DocBook/kernel-api.tmpl
+++ b/Documentation/DocBook/kernel-api.tmpl
@@ -46,7 +46,7 @@
<sect1><title>Atomic and pointer manipulation</title>
!Iinclude/asm-x86/atomic_32.h
-!Iinclude/asm-x86/unaligned_32.h
+!Iinclude/asm-x86/unaligned.h
</sect1>
<sect1><title>Delaying, scheduling, and timer routines</title>
@@ -340,7 +340,7 @@ X!Earch/x86/kernel/mca_32.c
<chapter id="security">
<title>Security Framework</title>
-!Esecurity/security.c
+!Isecurity/security.c
</chapter>
<chapter id="audit">
@@ -386,8 +386,7 @@ X!Edrivers/base/interface.c
!Edrivers/base/bus.c
</sect1>
<sect1><title>Device Drivers Power Management</title>
-!Edrivers/base/power/resume.c
-!Edrivers/base/power/suspend.c
+!Edrivers/base/power/main.c
</sect1>
<sect1><title>Device Drivers ACPI Support</title>
<!-- Internal functions only
diff --git a/Documentation/DocBook/mtdnand.tmpl b/Documentation/DocBook/mtdnand.tmpl
index 6fbc41d98c1..957cf5c2683 100644
--- a/Documentation/DocBook/mtdnand.tmpl
+++ b/Documentation/DocBook/mtdnand.tmpl
@@ -282,7 +282,7 @@ int __init board_init (void)
goto out;
}
- /* map physical adress */
+ /* map physical address */
baseaddr = (unsigned long)ioremap(CHIP_PHYSICAL_ADDRESS, 1024);
if(!baseaddr){
printk("Ioremap to access NAND chip failed\n");
@@ -306,7 +306,7 @@ int __init board_init (void)
this->dev_ready = board_dev_ready;
this->eccmode = NAND_ECC_SOFT;
- /* Scan to find existance of the device */
+ /* Scan to find existence of the device */
if (nand_scan (board_mtd, 1)) {
err = -ENXIO;
goto out_ior;
@@ -340,7 +340,7 @@ static void __exit board_cleanup (void)
/* Release resources, unregister device */
nand_release (board_mtd);
- /* unmap physical adress */
+ /* unmap physical address */
iounmap((void *)baseaddr);
/* Free the MTD device structure */
diff --git a/Documentation/HOWTO b/Documentation/HOWTO
index c64e969dc33..54835610b3d 100644
--- a/Documentation/HOWTO
+++ b/Documentation/HOWTO
@@ -77,7 +77,7 @@ documentation files are also added which explain how to use the feature.
When a kernel change causes the interface that the kernel exposes to
userspace to change, it is recommended that you send the information or
a patch to the manual pages explaining the change to the manual pages
-maintainer at mtk-manpages@gmx.net.
+maintainer at mtk.manpages@gmail.com.
Here is a list of files that are in the kernel source tree that are
required reading:
@@ -330,7 +330,7 @@ Here is a list of some of the different kernel trees available:
- ACPI development tree, Len Brown <len.brown@intel.com>
git.kernel.org:/pub/scm/linux/kernel/git/lenb/linux-acpi-2.6.git
- - Block development tree, Jens Axboe <axboe@suse.de>
+ - Block development tree, Jens Axboe <jens.axboe@oracle.com>
git.kernel.org:/pub/scm/linux/kernel/git/axboe/linux-2.6-block.git
- DRM development tree, Dave Airlie <airlied@linux.ie>
diff --git a/Documentation/IPMI.txt b/Documentation/IPMI.txt
index 24dc3fcf159..bc38283379f 100644
--- a/Documentation/IPMI.txt
+++ b/Documentation/IPMI.txt
@@ -441,17 +441,20 @@ ACPI, and if none of those then a KCS device at the spec-specified
0xca2. If you want to turn this off, set the "trydefaults" option to
false.
-If you have high-res timers compiled into the kernel, the driver will
-use them to provide much better performance. Note that if you do not
-have high-res timers enabled in the kernel and you don't have
-interrupts enabled, the driver will run VERY slowly. Don't blame me,
+If your IPMI interface does not support interrupts and is a KCS or
+SMIC interface, the IPMI driver will start a kernel thread for the
+interface to help speed things up. This is a low-priority kernel
+thread that constantly polls the IPMI driver while an IPMI operation
+is in progress. The force_kipmid module parameter will allow the user to
+force this thread on or off. If you force it off and don't have
+interrupts, the driver will run VERY slowly. Don't blame me,
these interfaces suck.
The driver supports a hot add and remove of interfaces. This way,
interfaces can be added or removed after the kernel is up and running.
-This is done using /sys/modules/ipmi_si/hotmod, which is a write-only
-parameter. You write a string to this interface. The string has the
-format:
+This is done using /sys/modules/ipmi_si/parameters/hotmod, which is a
+write-only parameter. You write a string to this interface. The string
+has the format:
<op1>[:op2[:op3...]]
The "op"s are:
add|remove,kcs|bt|smic,mem|i/o,<address>[,<opt1>[,<opt2>[,...]]]
@@ -581,9 +584,11 @@ The watchdog will panic and start a 120 second reset timeout if it
gets a pre-action. During a panic or a reboot, the watchdog will
start a 120 timer if it is running to make sure the reboot occurs.
-Note that if you use the NMI preaction for the watchdog, you MUST
-NOT use nmi watchdog mode 1. If you use the NMI watchdog, you
-must use mode 2.
+Note that if you use the NMI preaction for the watchdog, you MUST NOT
+use the nmi watchdog. There is no reasonable way to tell if an NMI
+comes from the IPMI controller, so it must assume that if it gets an
+otherwise unhandled NMI, it must be from IPMI and it will panic
+immediately.
Once you open the watchdog timer, you must write a 'V' character to the
device to close it, or the timer will not stop. This is a new semantic
diff --git a/Documentation/Intel-IOMMU.txt b/Documentation/Intel-IOMMU.txt
new file mode 100644
index 00000000000..c2321903aa0
--- /dev/null
+++ b/Documentation/Intel-IOMMU.txt
@@ -0,0 +1,115 @@
+Linux IOMMU Support
+===================
+
+The architecture spec can be obtained from the below location.
+
+http://www.intel.com/technology/virtualization/
+
+This guide gives a quick cheat sheet for some basic understanding.
+
+Some Keywords
+
+DMAR - DMA remapping
+DRHD - DMA Engine Reporting Structure
+RMRR - Reserved memory Region Reporting Structure
+ZLR - Zero length reads from PCI devices
+IOVA - IO Virtual address.
+
+Basic stuff
+-----------
+
+ACPI enumerates and lists the different DMA engines in the platform, and
+device scope relationships between PCI devices and which DMA engine controls
+them.
+
+What is RMRR?
+-------------
+
+There are some devices the BIOS controls, e.g. USB devices to perform
+PS2 emulation. The regions of memory used for these devices are marked
+reserved in the e820 map. When we turn on DMA translation, DMA to those
+regions will fail. Hence BIOS uses RMRR to specify these regions along with
+devices that need to access these regions. OS is expected to setup
+unity mappings for these regions for these devices to access these regions.
+
+How is IOVA generated?
+---------------------
+
+Well-behaved drivers call pci_map_*() before sending a command to a device
+that needs to perform DMA. Once DMA is completed and the mapping is no longer
+required, the driver calls pci_unmap_*() to unmap the region.
+
+The Intel IOMMU driver allocates a virtual address per domain. Each PCIE
+device has its own domain (hence protection). Devices under p2p bridges
+share the virtual address with all devices under the p2p bridge due to
+transaction id aliasing for p2p bridges.
+
+IOVA generation is pretty generic. We used the same technique as vmalloc()
+but these are not global address spaces, but separate for each domain.
+Different DMA engines may support different number of domains.
+
+We also allocate guard pages with each mapping, so we can attempt to catch
+any overflow that might happen.
+
+
+Graphics Problems?
+------------------
+If you encounter issues with graphics devices, you can try adding
+option intel_iommu=igfx_off to turn off the integrated graphics engine.
+
+If it happens to be a PCI device included in the INCLUDE_ALL Engine,
+then try enabling CONFIG_DMAR_GFX_WA to setup a 1-1 map. We hear
+graphics drivers may be in process of using DMA APIs in the near
+future and at that time this option can be yanked out.
+
+Some exceptions to IOVA
+-----------------------
+Interrupt ranges are not address translated, (0xfee00000 - 0xfeefffff).
+The same is true for peer to peer transactions. Hence we reserve the
+address from PCI MMIO ranges so they are not allocated for IOVA addresses.
+
+
+Fault reporting
+---------------
+When errors are reported, the DMA engine signals via an interrupt. The fault
+reason and the device that caused the fault are printed on the console.
+
+See below for sample.
+
+
+Boot Message Sample
+-------------------
+
+Something like this gets printed indicating presence of DMAR tables
+in ACPI.
+
+ACPI: DMAR (v001 A M I OEMDMAR 0x00000001 MSFT 0x00000097) @ 0x000000007f5b5ef0
+
+When DMAR is being processed and initialized by ACPI, prints DMAR locations
+and any RMRR's processed.
+
+ACPI DMAR:Host address width 36
+ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed90000
+ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed91000
+ACPI DMAR:DRHD (flags: 0x00000001)base: 0x00000000fed93000
+ACPI DMAR:RMRR base: 0x00000000000ed000 end: 0x00000000000effff
+ACPI DMAR:RMRR base: 0x000000007f600000 end: 0x000000007fffffff
+
+When DMAR is enabled for use, you will notice..
+
+PCI-DMA: Using DMAR IOMMU
+
+Fault reporting
+---------------
+
+DMAR:[DMA Write] Request device [00:02.0] fault addr 6df084000
+DMAR:[fault reason 05] PTE Write access is not set
+DMAR:[DMA Write] Request device [00:02.0] fault addr 6df084000
+DMAR:[fault reason 05] PTE Write access is not set
+
+TBD
+----
+
+- For compatibility testing, could use unity map domain for all devices, just
+ provide a 1-1 for all useful memory under a single domain for all devices.
+- API for paravirt ops for abstracting functionality for VMM folks.
diff --git a/Documentation/RCU/00-INDEX b/Documentation/RCU/00-INDEX
new file mode 100644
index 00000000000..461481dfb7c
--- /dev/null
+++ b/Documentation/RCU/00-INDEX
@@ -0,0 +1,22 @@
+00-INDEX
+ - This file
+arrayRCU.txt
+ - Using RCU to Protect Read-Mostly Arrays
+checklist.txt
+ - Review Checklist for RCU Patches
+listRCU.txt
+ - Using RCU to Protect Read-Mostly Linked Lists
+NMI-RCU.txt
+ - Using RCU to Protect Dynamic NMI Handlers
+rcuref.txt
+ - Reference-count design for elements of lists/arrays protected by RCU
+rcu.txt
+ - RCU Concepts
+RTFP.txt
+ - List of RCU papers (bibliography) going back to 1980.
+torture.txt
+ - RCU Torture Test Operation (CONFIG_RCU_TORTURE_TEST)
+UP.txt
+ - RCU on Uniprocessor Systems
+whatisRCU.txt
+ - What is RCU?
diff --git a/Documentation/SM501.txt b/Documentation/SM501.txt
index 3a1bd95d376..6fc65603592 100644
--- a/Documentation/SM501.txt
+++ b/Documentation/SM501.txt
@@ -3,6 +3,11 @@
Copyright 2006, 2007 Simtec Electronics
+The Silicon Motion SM501 multimedia companion chip is a multifunction device
+which may provide numerous interfaces including USB host controller, USB gadget,
+Asynchronous Serial ports, Audio functions and a dual display video interface.
+The device may be connected by PCI or local bus with varying functions enabled.
+
Core
----
diff --git a/Documentation/SubmitChecklist b/Documentation/SubmitChecklist
index 19e7f65c269..34e06d2f194 100644
--- a/Documentation/SubmitChecklist
+++ b/Documentation/SubmitChecklist
@@ -67,7 +67,7 @@ kernel patches.
20: Check that it all passes `make headers_check'.
21: Has been checked with injection of at least slab and page-allocation
- fauilures. See Documentation/fault-injection/.
+ failures. See Documentation/fault-injection/.
If the new code is substantial, addition of subsystem-specific fault
injection might be appropriate.
diff --git a/Documentation/SubmittingDrivers b/Documentation/SubmittingDrivers
index d7e26427e42..24f2eb40cae 100644
--- a/Documentation/SubmittingDrivers
+++ b/Documentation/SubmittingDrivers
@@ -36,8 +36,7 @@ Linux 2.4:
If the code area has a general maintainer then please submit it to
the maintainer listed in MAINTAINERS in the kernel file. If the
maintainer does not respond or you cannot find the appropriate
- maintainer then please contact Marcelo Tosatti
- <marcelo.tosatti@cyclades.com>.
+ maintainer then please contact Willy Tarreau <w@1wt.eu>.
Linux 2.6:
The same rules apply as 2.4 except that you should follow linux-kernel
diff --git a/Documentation/accounting/cgroupstats.txt b/Documentation/accounting/cgroupstats.txt
new file mode 100644
index 00000000000..eda40fd39ca
--- /dev/null
+++ b/Documentation/accounting/cgroupstats.txt
@@ -0,0 +1,27 @@
+Control Groupstats is inspired by the discussion at
+http://lkml.org/lkml/2007/4/11/187 and implements per cgroup statistics as
+suggested by Andrew Morton in http://lkml.org/lkml/2007/4/11/263.
+
+Per cgroup statistics infrastructure re-uses code from the taskstats
+interface. A new set of cgroup operations are registered with commands
+and attributes specific to cgroups. It should be very easy to
+extend per cgroup statistics, by adding members to the cgroupstats
+structure.
+
+The current model for cgroupstats is a pull model; a push model (to post
+statistics on interesting events) should be very easy to add. Currently
+user space requests statistics by passing the cgroup path.
+Statistics about the state of all the tasks in the cgroup are returned to
+user space.
+
+NOTE: We currently rely on delay accounting for extracting information
+about tasks blocked on I/O. If CONFIG_TASK_DELAY_ACCT is disabled, this
+information will not be available.
+
+To extract cgroup statistics a utility very similar to getdelays.c
+has been developed; sample output of the utility is shown below:
+
+~/balbir/cgroupstats # ./getdelays -C "/cgroup/a"
+sleeping 1, blocked 0, running 1, stopped 0, uninterruptible 0
+~/balbir/cgroupstats # ./getdelays -C "/cgroup"
+sleeping 155, blocked 0, running 1, stopped 0, uninterruptible 2
diff --git a/Documentation/accounting/getdelays.c b/Documentation/accounting/getdelays.c
index cbee3a27f76..ab82b7f5331 100644
--- a/Documentation/accounting/getdelays.c
+++ b/Documentation/accounting/getdelays.c
@@ -21,7 +21,6 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/socket.h>
-#include <sys/types.h>
#include <signal.h>
#include <linux/genetlink.h>
diff --git a/Documentation/arm/00-INDEX b/Documentation/arm/00-INDEX
index 2c6a3b38967..82e418d648d 100644
--- a/Documentation/arm/00-INDEX
+++ b/Documentation/arm/00-INDEX
@@ -4,19 +4,29 @@ Booting
- requirements for booting
Interrupts
- ARM Interrupt subsystem documentation
+IXP2000
+ - Release Notes for Linux on Intel's IXP2000 Network Processor
Netwinder
- Netwinder specific documentation
+Porting
+ - Symbol definitions for porting Linux to a new ARM machine.
+Setup
+ - Kernel initialization parameters on ARM Linux
README
- General ARM documentation
-SA1100
+SA1100/
- SA1100 documentation
-XScale
- - XScale documentation
-empeg
- - Empeg documentation
+Samsung-S3C24XX
+ - S3C24XX ARM Linux Overview
+Sharp-LH
+ - Linux on Sharp LH79524 and LH7A40X System On a Chip (SOC)
+VFP/
+ - Release notes for Linux Kernel Vector Floating Point support code
+empeg/
+ - Empeg, Ltd's Empeg MP3 Car Audio Player
mem_alignment
- alignment abort handler documentation
memory.txt
- description of the virtual memory layout
-nwfpe
+nwfpe/
- NWFPE floating point emulator documentation
diff --git a/Documentation/arm/Samsung-S3C24XX/DMA.txt b/Documentation/arm/Samsung-S3C24XX/DMA.txt
index 37f4edcc5d8..3ed82383efe 100644
--- a/Documentation/arm/Samsung-S3C24XX/DMA.txt
+++ b/Documentation/arm/Samsung-S3C24XX/DMA.txt
@@ -5,7 +5,7 @@ Introduction
------------
The kernel provides an interface to manage DMA transfers
- using the DMA channels in the cpu, so that the central
+ using the DMA channels in the CPU, so that the central
duty of managing channel mappings, and programming the
channel generators is in one place.
@@ -17,24 +17,24 @@ DMA Channel Ordering
channels to all sources, which means that some devices
have a restricted number of channels that can be used.
- To allow flexibilty for each cpu type and board, the
- dma code can be given an dma ordering structure which
+ To allow flexibility for each CPU type and board, the
+ DMA code can be given a DMA ordering structure which
allows the order of channel search to be specified, as
well as allowing the prohibition of certain claims.
struct s3c24xx_dma_order has a list of channels, and
- each channel within has a slot for a list of dma
- channel numbers. The slots are searched in order, for
- the presence of a dma channel number with DMA_CH_VALID
- orred in.
+ each channel within has a slot for a list of DMA
+ channel numbers. The slots are searched in order for
+ the presence of a DMA channel number with DMA_CH_VALID
+ or-ed in.
If the order has the flag DMA_CH_NEVER set, then after
checking the channel list, the system will return no
found channel, thus denying the request.
A board support file can call s3c24xx_dma_order_set()
- to register an complete ordering set. The routine will
- copy the data, so the original can be discared with
+ to register a complete ordering set. The routine will
+ copy the data, so the original can be discarded with
__initdata.
diff --git a/Documentation/atomic_ops.txt b/Documentation/atomic_ops.txt
index 05851e9982e..f20c10c2858 100644
--- a/Documentation/atomic_ops.txt
+++ b/Documentation/atomic_ops.txt
@@ -14,8 +14,15 @@ suffice:
typedef struct { volatile int counter; } atomic_t;
- The first operations to implement for atomic_t's are the
-initializers and plain reads.
+Historically, counter has been declared volatile. This is now discouraged.
+See Documentation/volatile-considered-harmful.txt for the complete rationale.
+
+local_t is very similar to atomic_t. If the counter is per CPU and only
+updated by one CPU, local_t is probably more appropriate. Please see
+Documentation/local_ops.txt for the semantics of local_t.
+
+The first operations to implement for atomic_t's are the initializers and
+plain reads.
#define ATOMIC_INIT(i) { (i) }
#define atomic_set(v, i) ((v)->counter = (i))
@@ -24,6 +31,12 @@ The first macro is used in definitions, such as:
static atomic_t my_counter = ATOMIC_INIT(1);
+The initializer is atomic in that the return values of the atomic operations
+are guaranteed to be correct reflecting the initialized value if the
+initializer is used before runtime. If the initializer is used at runtime, a
+proper implicit or explicit read memory barrier is needed before reading the
+value with atomic_read from another thread.
+
The second interface can be used at runtime, as in:
struct foo { atomic_t counter; };
@@ -36,13 +49,43 @@ The second interface can be used at runtime, as in:
return -ENOMEM;
atomic_set(&k->counter, 0);
+The setting is atomic in that the return values of the atomic operations by
+all threads are guaranteed to be correct reflecting either the value that has
+been set with this operation or set with another operation. A proper implicit
+or explicit memory barrier is needed before the value set with the operation
+is guaranteed to be readable with atomic_read from another thread.
+
Next, we have:
#define atomic_read(v) ((v)->counter)
-which simply reads the current value of the counter.
-
-Now, we move onto the actual atomic operation interfaces.
+which simply reads the counter value currently visible to the calling thread.
+The read is atomic in that the return value is guaranteed to be one of the
+values initialized or modified with the interface operations if a proper
+implicit or explicit memory barrier is used after possible runtime
+initialization by any other thread and the value is modified only with the
+interface operations. atomic_read does not guarantee that the runtime
+initialization by any other thread is visible yet, so the user of the
+interface must take care of that with a proper implicit or explicit memory
+barrier.
+
+*** WARNING: atomic_read() and atomic_set() DO NOT IMPLY BARRIERS! ***
+
+Some architectures may choose to use the volatile keyword, barriers, or inline
+assembly to guarantee some degree of immediacy for atomic_read() and
+atomic_set(). This is not uniformly guaranteed, and may change in the future,
+so all users of atomic_t should treat atomic_read() and atomic_set() as simple
+C statements that may be reordered or optimized away entirely by the compiler
+or processor, and explicitly invoke the appropriate compiler and/or memory
+barrier for each use case. Failure to do so will result in code that may
+suddenly break when used with different architectures or compiler
+optimizations, or even changes in unrelated code which changes how the
+compiler optimizes the section accessing atomic_t variables.
+
+*** YOU HAVE BEEN WARNED! ***
+
+Now, we move on to the atomic operation interfaces typically implemented with
+the help of assembly code.
void atomic_add(int i, atomic_t *v);
void atomic_sub(int i, atomic_t *v);
@@ -117,6 +160,12 @@ operation.
Then:
+ int atomic_xchg(atomic_t *v, int new);
+
+This performs an atomic exchange operation on the atomic variable v, setting
+the given new value. It returns the old value that the atomic variable v had
+just before the operation.
+
int atomic_cmpxchg(atomic_t *v, int old, int new);
This performs an atomic compare exchange operation on the atomic value v,
@@ -369,6 +418,20 @@ brothers:
*/
smp_mb__after_clear_bit();
+There are two special bitops with lock barrier semantics (acquire/release,
+same as spinlocks). These operate in the same way as their non-_lock/unlock
+postfixed variants, except that they provide acquire and release semantics,
+respectively. This means they can be used for bit_spin_trylock and
+bit_spin_unlock type operations without specifying any more barriers.
+
+ int test_and_set_bit_lock(unsigned long nr, unsigned long *addr);
+ void clear_bit_unlock(unsigned long nr, unsigned long *addr);
+ void __clear_bit_unlock(unsigned long nr, unsigned long *addr);
+
+The __clear_bit_unlock version is non-atomic, however it still implements
+unlock barrier semantics. This can be useful if the lock itself is protecting
+the other bits in the word.
+
Finally, there are non-atomic versions of the bitmask operations
provided. They are used in contexts where some other higher-level SMP
locking scheme is being used to protect the bitmask, and thus less
diff --git a/Documentation/block/00-INDEX b/Documentation/block/00-INDEX
new file mode 100644
index 00000000000..961a0513f8c
--- /dev/null
+++ b/Documentation/block/00-INDEX
@@ -0,0 +1,20 @@
+00-INDEX
+ - This file
+as-iosched.txt
+ - Anticipatory IO scheduler
+barrier.txt
+ - I/O Barriers
+biodoc.txt
+ - Notes on the Generic Block Layer Rewrite in Linux 2.5
+capability.txt
+ - Generic Block Device Capability (/sys/block/<disk>/capability)
+deadline-iosched.txt
+ - Deadline IO scheduler tunables
+ioprio.txt
+ - Block io priorities (in CFQ scheduler)
+request.txt
+ - The members of struct request (in include/linux/blkdev.h)
+stat.txt
+ - Block layer statistics in /sys/block/<dev>/stat
+switching-sched.txt
+ - Switching I/O schedulers at runtime
diff --git a/Documentation/block/as-iosched.txt b/Documentation/block/as-iosched.txt
index a598fe10a29..738b72be128 100644
--- a/Documentation/block/as-iosched.txt
+++ b/Documentation/block/as-iosched.txt
@@ -20,15 +20,10 @@ actually has a head for each physical device in the logical RAID device.
However, setting the antic_expire (see tunable parameters below) produces
very similar behavior to the deadline IO scheduler.
-
Selecting IO schedulers
-----------------------
-To choose IO schedulers at boot time, use the argument 'elevator=deadline'.
-'noop', 'as' and 'cfq' (the default) are also available. IO schedulers are
-assigned globally at boot time only presently. It's also possible to change
-the IO scheduler for a determined device on the fly, as described in
-Documentation/block/switching-sched.txt.
-
+Refer to Documentation/block/switching-sched.txt for information on
+selecting an io scheduler on a per-device basis.
Anticipatory IO scheduler Policies
----------------------------------
@@ -115,7 +110,7 @@ statistics (average think time, average seek distance) on the process
that submitted the just completed request are examined. If it seems
likely that that process will submit another request soon, and that
request is likely to be near the just completed request, then the IO
-scheduler will stop dispatching more read requests for up time (antic_expire)
+scheduler will stop dispatching more read requests for up to (antic_expire)
milliseconds, hoping that process will submit a new request near the one
that just completed. If such a request is made, then it is dispatched
immediately. If the antic_expire wait time expires, then the IO scheduler
@@ -165,3 +160,13 @@ The parameters are:
for big seek time devices though not a linear correspondence - most
processes have only a few ms thinktime.
+In addition to the tunables above there is a read-only file named est_time
+which, when read, will show:
+
+ - The probability of a task exiting without a cooperating task
+ submitting an anticipated IO.
+
+ - The current mean think time.
+
+ - The seek distance used to determine if an incoming IO is better.
+
diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index dc3f49e3e53..93f223b9723 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -2,7 +2,7 @@
=====================================================
Notes Written on Jan 15, 2002:
- Jens Axboe <axboe@suse.de>
+ Jens Axboe <jens.axboe@oracle.com>
Suparna Bhattacharya <suparna@in.ibm.com>
Last Updated May 2, 2002
@@ -21,7 +21,7 @@ Credits:
---------
2.5 bio rewrite:
- Jens Axboe <axboe@suse.de>
+ Jens Axboe <jens.axboe@oracle.com>
Many aspects of the generic block layer redesign were driven by and evolved
over discussions, prior patches and the collective experience of several
diff --git a/Documentation/block/deadline-iosched.txt b/Documentation/block/deadline-iosched.txt
index be08ffd1e9b..c23cab13c3d 100644
--- a/Documentation/block/deadline-iosched.txt
+++ b/Documentation/block/deadline-iosched.txt
@@ -5,16 +5,10 @@ This little file attempts to document how the deadline io scheduler works.
In particular, it will clarify the meaning of the exposed tunables that may be
of interest to power users.
-Each io queue has a set of io scheduler tunables associated with it. These
-tunables control how the io scheduler works. You can find these entries
-in:
-
-/sys/block/<device>/queue/iosched
-
-assuming that you have sysfs mounted on /sys. If you don't have sysfs mounted,
-you can do so by typing:
-
-# mount none /sys -t sysfs
+Selecting IO schedulers
+-----------------------
+Refer to Documentation/block/switching-sched.txt for information on
+selecting an io scheduler on a per-device basis.
********************************************************************************
@@ -41,14 +35,11 @@ fifo_batch
When a read request expires its deadline, we must move some requests from
the sorted io scheduler list to the block device dispatch queue. fifo_batch
-controls how many requests we move, based on the cost of each request. A
-request is either qualified as a seek or a stream. The io scheduler knows
-the last request that was serviced by the drive (or will be serviced right
-before this one). See seek_cost and stream_unit.
+controls how many requests we move.
-write_starved (number of dispatches)
--------------
+writes_starved (number of dispatches)
+--------------
When we have to move requests from the io scheduler queue to the block
device dispatch queue, we always give a preference to reads. However, we
@@ -73,6 +64,6 @@ that comes at basically 0 cost we leave that on. We simply disable the
rbtree front sector lookup when the io scheduler merge function is called.
-Nov 11 2002, Jens Axboe <axboe@suse.de>
+Nov 11 2002, Jens Axboe <jens.axboe@oracle.com>
diff --git a/Documentation/block/ioprio.txt b/Documentation/block/ioprio.txt
index 35e516b0b8a..8ed8c59380b 100644
--- a/Documentation/block/ioprio.txt
+++ b/Documentation/block/ioprio.txt
@@ -180,4 +180,4 @@ int main(int argc, char *argv[])
---> snip ionice.c tool <---
-March 11 2005, Jens Axboe <axboe@suse.de>
+March 11 2005, Jens Axboe <jens.axboe@oracle.com>
diff --git a/Documentation/block/request.txt b/Documentation/block/request.txt
index fff58acb40a..754e104ed36 100644
--- a/Documentation/block/request.txt
+++ b/Documentation/block/request.txt
@@ -1,7 +1,7 @@
struct request documentation
-Jens Axboe <axboe@suse.de> 27/05/02
+Jens Axboe <jens.axboe@oracle.com> 27/05/02
1.0
Index
diff --git a/Documentation/block/switching-sched.txt b/Documentation/block/switching-sched.txt
index 5fa130a6753..634c952e196 100644
--- a/Documentation/block/switching-sched.txt
+++ b/Documentation/block/switching-sched.txt
@@ -1,3 +1,18 @@
+To choose IO schedulers at boot time, use the argument 'elevator=deadline'.
+'noop', 'as' and 'cfq' (the default) are also available. IO schedulers are
+assigned globally at boot time only presently.
+
+Each io queue has a set of io scheduler tunables associated with it. These
+tunables control how the io scheduler works. You can find these entries
+in:
+
+/sys/block/<device>/queue/iosched
+
+assuming that you have sysfs mounted on /sys. If you don't have sysfs mounted,
+you can do so by typing:
+
+# mount none /sys -t sysfs
+
As of the Linux 2.6.10 kernel, it is now possible to change the
IO scheduler for a given block device on the fly (thus making it possible,
for instance, to set the CFQ scheduler for the system default, but
@@ -20,3 +35,9 @@ noop anticipatory deadline [cfq]
# echo anticipatory > /sys/block/hda/queue/scheduler
# cat /sys/block/hda/queue/scheduler
noop [anticipatory] deadline cfq
+
+Each io queue has a set of io scheduler tunables associated with it. These
+tunables control how the io scheduler works. You can find these entries
+in:
+
+/sys/block/<device>/queue/iosched
diff --git a/Documentation/cachetlb.txt b/Documentation/cachetlb.txt
index 866b7613942..da42ab414c4 100644
--- a/Documentation/cachetlb.txt
+++ b/Documentation/cachetlb.txt
@@ -87,30 +87,7 @@ changes occur:
This is used primarily during fault processing.
-5) void flush_tlb_pgtables(struct mm_struct *mm,
- unsigned long start, unsigned long end)
-
- The software page tables for address space 'mm' for virtual
- addresses in the range 'start' to 'end-1' are being torn down.
-
- Some platforms cache the lowest level of the software page tables
- in a linear virtually mapped array, to make TLB miss processing
- more efficient. On such platforms, since the TLB is caching the
- software page table structure, it needs to be flushed when parts
- of the software page table tree are unlinked/freed.
-
- Sparc64 is one example of a platform which does this.
-
- Usually, when munmap()'ing an area of user virtual address
- space, the kernel leaves the page table parts around and just
- marks the individual pte's as invalid. However, if very large
- portions of the address space are unmapped, the kernel frees up
- those portions of the software page tables to prevent potential
- excessive kernel memory usage caused by erratic mmap/mmunmap
- sequences. It is at these times that flush_tlb_pgtables will
- be invoked.
-
-6) void update_mmu_cache(struct vm_area_struct *vma,
+5) void update_mmu_cache(struct vm_area_struct *vma,
unsigned long address, pte_t pte)
At the end of every page fault, this routine is invoked to
@@ -123,7 +100,7 @@ changes occur:
translations for software managed TLB configurations.
The sparc64 port currently does this.
-7) void tlb_migrate_finish(struct mm_struct *mm)
+6) void tlb_migrate_finish(struct mm_struct *mm)
This interface is called at the end of an explicit
process migration. This interface provides a hook
@@ -133,12 +110,6 @@ changes occur:
The ia64 sn2 platform is one example of a platform
that uses this interface.
-8) void lazy_mmu_prot_update(pte_t pte)
- This interface is called whenever the protection on
- any user PTEs change. This interface provides a notification
- to architecture specific code to take appropriate action.
-
-
Next, we have the cache flushing interfaces. In general, when Linux
is changing an existing virtual-->physical mapping to a new value,
the sequence will be in one of the following forms:
diff --git a/Documentation/cdrom/cdrom-standard.tex b/Documentation/cdrom/cdrom-standard.tex
index 92f94e59758..c713aeb020c 100644
--- a/Documentation/cdrom/cdrom-standard.tex
+++ b/Documentation/cdrom/cdrom-standard.tex
@@ -1009,7 +1009,7 @@ taken over the torch in maintaining \cdromc\ and integrating much
\cdrom-related code in the 2.1-kernel. Thanks to Scott Snyder and
Gerd Knorr, who were the first to implement this interface for SCSI
and IDE-CD drivers and added many ideas for extension of the data
-structures relative to kernel~2.0. Further thanks to Heiko Eissfeldt,
+structures relative to kernel~2.0. Further thanks to Heiko Ei{\sz}feldt,
Thomas Quinot, Jon Tombs, Ken Pizzini, Eberhard M\"onkeberg and Andrew
Kroll, the \linux\ \cdrom\ device driver developers who were kind
enough to give suggestions and criticisms during the writing. Finally
diff --git a/Documentation/cgroups.txt b/Documentation/cgroups.txt
new file mode 100644
index 00000000000..98a26f81fa7
--- /dev/null
+++ b/Documentation/cgroups.txt
@@ -0,0 +1,545 @@
+ CGROUPS
+ -------
+
+Written by Paul Menage <menage@google.com> based on Documentation/cpusets.txt
+
+Original copyright statements from cpusets.txt:
+Portions Copyright (C) 2004 BULL SA.
+Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
+Modified by Paul Jackson <pj@sgi.com>
+Modified by Christoph Lameter <clameter@sgi.com>
+
+CONTENTS:
+=========
+
+1. Control Groups
+ 1.1 What are cgroups ?
+ 1.2 Why are cgroups needed ?
+ 1.3 How are cgroups implemented ?
+ 1.4 What does notify_on_release do ?
+ 1.5 How do I use cgroups ?
+2. Usage Examples and Syntax
+ 2.1 Basic Usage
+ 2.2 Attaching processes
+3. Kernel API
+ 3.1 Overview
+ 3.2 Synchronization
+ 3.3 Subsystem API
+4. Questions
+
+1. Control Groups
+=================
+
+1.1 What are cgroups ?
+----------------------
+
+Control Groups provide a mechanism for aggregating/partitioning sets of
+tasks, and all their future children, into hierarchical groups with
+specialized behaviour.
+
+Definitions:
+
+A *cgroup* associates a set of tasks with a set of parameters for one
+or more subsystems.
+
+A *subsystem* is a module that makes use of the task grouping
+facilities provided by cgroups to treat groups of tasks in
+particular ways. A subsystem is typically a "resource controller" that
+schedules a resource or applies per-cgroup limits, but it may be
+anything that wants to act on a group of processes, e.g. a
+virtualization subsystem.
+
+A *hierarchy* is a set of cgroups arranged in a tree, such that
+every task in the system is in exactly one of the cgroups in the
+hierarchy, and a set of subsystems; each subsystem has system-specific
+state attached to each cgroup in the hierarchy. Each hierarchy has
+an instance of the cgroup virtual filesystem associated with it.
+
+At any one time there may be multiple active hierarchies of task
+cgroups. Each hierarchy is a partition of all tasks in the system.
+
+User level code may create and destroy cgroups by name in an
+instance of the cgroup virtual file system, specify and query to
+which cgroup a task is assigned, and list the task pids assigned to
+a cgroup. Those creations and assignments only affect the hierarchy
+associated with that instance of the cgroup file system.
+
+On their own, the only use for cgroups is for simple job
+tracking. The intention is that other subsystems hook into the generic
+cgroup support to provide new attributes for cgroups, such as
+accounting/limiting the resources which processes in a cgroup can
+access. For example, cpusets (see Documentation/cpusets.txt) allows
+you to associate a set of CPUs and a set of memory nodes with the
+tasks in each cgroup.
+
+1.2 Why are cgroups needed ?
+----------------------------
+
+There are multiple efforts to provide process aggregations in the
+Linux kernel, mainly for resource tracking purposes. Such efforts
+include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server
+namespaces. These all require the basic notion of a
+grouping/partitioning of processes, with newly forked processes ending
+in the same group (cgroup) as their parent process.
+
+The kernel cgroup patch provides the minimum essential kernel
+mechanisms required to efficiently implement such groups. It has
+minimal impact on the system fast paths, and provides hooks for
+specific subsystems such as cpusets to provide additional behaviour as
+desired.
+
+Multiple hierarchy support is provided to allow for situations where
+the division of tasks into cgroups is distinctly different for
+different subsystems - having parallel hierarchies allows each
+hierarchy to be a natural division of tasks, without having to handle
+complex combinations of tasks that would be present if several
+unrelated subsystems needed to be forced into the same tree of
+cgroups.
+
+At one extreme, each resource controller or subsystem could be in a
+separate hierarchy; at the other extreme, all subsystems
+would be attached to the same hierarchy.
+
+As an example of a scenario (originally proposed by vatsa@in.ibm.com)
+that can benefit from multiple hierarchies, consider a large
+university server with various users - students, professors, system
+tasks etc. The resource planning for this server could be along the
+following lines:
+
+ CPU : Top cpuset
+ / \
+ CPUSet1 CPUSet2
+ | |
+ (Profs) (Students)
+
+ In addition (system tasks) are attached to topcpuset (so
+ that they can run anywhere) with a limit of 20%
+
+ Memory : Professors (50%), students (30%), system (20%)
+
+ Disk : Prof (50%), students (30%), system (20%)
+
+ Network : WWW browsing (20%), Network File System (60%), others (20%)
+ / \
+ Prof (15%) students (5%)
+
+Browsers like firefox/lynx go into the WWW network class, while (k)nfsd go
+into NFS network class.
+
+At the same time firefox/lynx will share an appropriate CPU/Memory class
+depending on who launched it (prof/student).
+
+With the ability to classify tasks differently for different resources
+(by putting those resource subsystems in different hierarchies) then
+the admin can easily set up a script which receives exec notifications
+and depending on who is launching the browser he can
+
+ # echo browser_pid > /mnt/<restype>/<userclass>/tasks
+
+With only a single hierarchy, he now would potentially have to create
+a separate cgroup for every browser launched and associate it with
+the appropriate network and other resource classes. This may lead to
+proliferation of such cgroups.
+
+Also let's say that the administrator would like to give enhanced network
+access temporarily to a student's browser (since it is night and the user
+wants to do online gaming :) OR give one of the students' simulation
+apps enhanced CPU power.
+
+With the ability to write pids directly to resource classes, it's just a
+matter of:
+
+ # echo pid > /mnt/network/<new_class>/tasks
+ (after some time)
+ # echo pid > /mnt/network/<orig_class>/tasks
+
+Without this ability, he would have to split the cgroup into
+multiple separate ones and then associate the new cgroups with the
+new resource classes.
+
+
+
+1.3 How are cgroups implemented ?
+---------------------------------
+
+Control Groups extends the kernel as follows:
+
+ - Each task in the system has a reference-counted pointer to a
+ css_set.
+
+ - A css_set contains a set of reference-counted pointers to
+ cgroup_subsys_state objects, one for each cgroup subsystem
+ registered in the system. There is no direct link from a task to
+ the cgroup of which it's a member in each hierarchy, but this
+ can be determined by following pointers through the
+ cgroup_subsys_state objects. This is because accessing the
+ subsystem state is something that's expected to happen frequently
+ and in performance-critical code, whereas operations that require a
+ task's actual cgroup assignments (in particular, moving between
+ cgroups) are less common. A linked list runs through the cg_list
+ field of each task_struct using the css_set, anchored at
+ css_set->tasks.
+
+ - A cgroup hierarchy filesystem can be mounted for browsing and
+ manipulation from user space.
+
+ - You can list all the tasks (by pid) attached to any cgroup.
+
+The implementation of cgroups requires a few, simple hooks
+into the rest of the kernel, none in performance critical paths:
+
+ - in init/main.c, to initialize the root cgroups and initial
+ css_set at system boot.
+
+ - in fork and exit, to attach and detach a task from its css_set.
+
+In addition a new file system, of type "cgroup" may be mounted, to
+enable browsing and modifying the cgroups presently known to the
+kernel. When mounting a cgroup hierarchy, you may specify a
+comma-separated list of subsystems to mount as the filesystem mount
+options. By default, mounting the cgroup filesystem attempts to
+mount a hierarchy containing all registered subsystems.
+
+If an active hierarchy with exactly the same set of subsystems already
+exists, it will be reused for the new mount. If no existing hierarchy
+matches, and any of the requested subsystems are in use in an existing
+hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy
+is activated, associated with the requested subsystems.
+
+It's not currently possible to bind a new subsystem to an active
+cgroup hierarchy, or to unbind a subsystem from an active cgroup
+hierarchy. This may be possible in future, but is fraught with nasty
+error-recovery issues.
+
+When a cgroup filesystem is unmounted, if there are any
+child cgroups created below the top-level cgroup, that hierarchy
+will remain active even though unmounted; if there are no
+child cgroups then the hierarchy will be deactivated.
+
+No new system calls are added for cgroups - all support for
+querying and modifying cgroups is via this cgroup file system.
+
+Each task under /proc has an added file named 'cgroup' displaying,
+for each active hierarchy, the subsystem names and the cgroup name
+as the path relative to the root of the cgroup file system.
+
+Each cgroup is represented by a directory in the cgroup file system
+containing the following files describing that cgroup:
+
+ - tasks: list of tasks (by pid) attached to that cgroup
+ - notify_on_release flag: run /sbin/cgroup_release_agent on exit?
+
+Other subsystems such as cpusets may add additional files in each
+cgroup directory.
+
+New cgroups are created using the mkdir system call or shell
+command. The properties of a cgroup, such as its flags, are
+modified by writing to the appropriate file in that cgroup's
+directory, as listed above.
+
+The named hierarchical structure of nested cgroups allows partitioning
+a large system into nested, dynamically changeable, "soft-partitions".
+
+The attachment of each task, automatically inherited at fork by any
+children of that task, to a cgroup allows organizing the work load
+on a system into related sets of tasks. A task may be re-attached to
+any other cgroup, if allowed by the permissions on the necessary
+cgroup file system directories.
+
+When a task is moved from one cgroup to another, it gets a new
+css_set pointer - if there's an already existing css_set with the
+desired collection of cgroups then that css_set is reused, else a new
+css_set is allocated. Note that the current implementation uses a
+linear search to locate an appropriate existing css_set, so isn't
+very efficient. A future version will use a hash table for better
+performance.
+
+To allow access from a cgroup to the css_sets (and hence tasks)
+that comprise it, a set of cg_cgroup_link objects form a lattice;
+each cg_cgroup_link is linked into a list of cg_cgroup_links for
+a single cgroup on its cont_link_list field, and a list of
+cg_cgroup_links for a single css_set on its cg_link_list.
+
+Thus the set of tasks in a cgroup can be listed by iterating over
+each css_set that references the cgroup, and sub-iterating over
+each css_set's task set.
+
+The use of a Linux virtual file system (vfs) to represent the
+cgroup hierarchy provides for a familiar permission and name space
+for cgroups, with a minimum of additional kernel code.
+
+1.4 What does notify_on_release do ?
+------------------------------------
+
+*** notify_on_release is disabled in the current patch set. It will be
+*** reactivated in a future patch in a less-intrusive manner
+
+If the notify_on_release flag is enabled (1) in a cgroup, then
+whenever the last task in the cgroup leaves (exits or attaches to
+some other cgroup) and the last child cgroup of that cgroup
+is removed, then the kernel runs the command specified by the contents
+of the "release_agent" file in that hierarchy's root directory,
+supplying the pathname (relative to the mount point of the cgroup
+file system) of the abandoned cgroup. This enables automatic
+removal of abandoned cgroups. The default value of
+notify_on_release in the root cgroup at system boot is disabled
+(0). The default value of other cgroups at creation is the current
+value of their parent's notify_on_release setting. The default value of
+a cgroup hierarchy's release_agent path is empty.
+
+1.5 How do I use cgroups ?
+--------------------------
+
+To start a new job that is to be contained within a cgroup, using
+the "cpuset" cgroup subsystem, the steps are something like:
+
+ 1) mkdir /dev/cgroup
+ 2) mount -t cgroup -ocpuset cpuset /dev/cgroup
+ 3) Create the new cgroup by doing mkdir's and write's (or echo's) in
+ the /dev/cgroup virtual file system.
+ 4) Start a task that will be the "founding father" of the new job.
+ 5) Attach that task to the new cgroup by writing its pid to the
+ /dev/cgroup tasks file for that cgroup.
+ 6) fork, exec or clone the job tasks from this founding father task.
+
+For example, the following sequence of commands will setup a cgroup
+named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
+and then start a subshell 'sh' in that cgroup:
+
+ mount -t cgroup cpuset -ocpuset /dev/cgroup
+ cd /dev/cgroup
+ mkdir Charlie
+ cd Charlie
+ /bin/echo 2-3 > cpus
+ /bin/echo 1 > mems
+ /bin/echo $$ > tasks
+ sh
+ # The subshell 'sh' is now running in cgroup Charlie
+ # The next line should display '/Charlie'
+ cat /proc/self/cgroup
+
+2. Usage Examples and Syntax
+============================
+
+2.1 Basic Usage
+---------------
+
+Creating, modifying, using the cgroups can be done through the cgroup
+virtual filesystem.
+
+To mount a cgroup hierarchy with all available subsystems, type:
+# mount -t cgroup xxx /dev/cgroup
+
+The "xxx" is not interpreted by the cgroup code, but will appear in
+/proc/mounts so may be any useful identifying string that you like.
+
+To mount a cgroup hierarchy with just the cpuset and numtasks
+subsystems, type:
+# mount -t cgroup -o cpuset,numtasks hier1 /dev/cgroup
+
+To change the set of subsystems bound to a mounted hierarchy, just
+remount with different options:
+
+# mount -o remount,cpuset,ns /dev/cgroup
+
+Note that changing the set of subsystems is currently only supported
+when the hierarchy consists of a single (root) cgroup. Supporting
+the ability to arbitrarily bind/unbind subsystems from an existing
+cgroup hierarchy is intended to be implemented in the future.
+
+Then under /dev/cgroup you can find a tree that corresponds to the
+tree of the cgroups in the system. For instance, /dev/cgroup
+is the cgroup that holds the whole system.
+
+If you want to create a new cgroup under /dev/cgroup:
+# cd /dev/cgroup
+# mkdir my_cgroup
+
+Now you want to do something with this cgroup.
+# cd my_cgroup
+
+In this directory you can find several files:
+# ls
+notify_on_release release_agent tasks
+(plus whatever files are added by the attached subsystems)
+
+Now attach your shell to this cgroup:
+# /bin/echo $$ > tasks
+
+You can also create cgroups inside your cgroup by using mkdir in this
+directory.
+# mkdir my_sub_cs
+
+To remove a cgroup, just use rmdir:
+# rmdir my_sub_cs
+
+This will fail if the cgroup is in use (has cgroups inside, or
+has processes attached, or is held alive by other subsystem-specific
+reference).
+
+2.2 Attaching processes
+-----------------------
+
+# /bin/echo PID > tasks
+
+Note that it is PID, not PIDs. You can only attach ONE task at a time.
+If you have several tasks to attach, you have to do it one after another:
+
+# /bin/echo PID1 > tasks
+# /bin/echo PID2 > tasks
+ ...
+# /bin/echo PIDn > tasks
+
+3. Kernel API
+=============
+
+3.1 Overview
+------------
+
+Each kernel subsystem that wants to hook into the generic cgroup
+system needs to create a cgroup_subsys object. This contains
+various methods, which are callbacks from the cgroup system, along
+with a subsystem id which will be assigned by the cgroup system.
+
+Other fields in the cgroup_subsys object include:
+
+- subsys_id: a unique array index for the subsystem, indicating which
+ entry in cgroup->subsys[] this subsystem should be
+ managing. Initialized by cgroup_register_subsys(); prior to this
+ it should be initialized to -1
+
+- hierarchy: an index indicating which hierarchy, if any, this
+ subsystem is currently attached to. If this is -1, then the
+ subsystem is not attached to any hierarchy, and all tasks should be
+ considered to be members of the subsystem's top_cgroup. It should
+ be initialized to -1.
+
+- name: should be initialized to a unique subsystem name prior to
+ calling cgroup_register_subsystem. Should be no longer than
+ MAX_CGROUP_TYPE_NAMELEN
+
+Each cgroup object created by the system has an array of pointers,
+indexed by subsystem id; this pointer is entirely managed by the
+subsystem; the generic cgroup code will never touch this pointer.
+
+3.2 Synchronization
+-------------------
+
+There is a global mutex, cgroup_mutex, used by the cgroup
+system. This should be taken by anything that wants to modify a
+cgroup. It may also be taken to prevent cgroups from being
+modified, but more specific locks may be more appropriate in that
+situation.
+
+See kernel/cgroup.c for more details.
+
+Subsystems can take/release the cgroup_mutex via the functions
+cgroup_lock()/cgroup_unlock(), and can
+take/release the callback_mutex via the functions
+cgroup_lock()/cgroup_unlock().
+
+Accessing a task's cgroup pointer may be done in the following ways:
+- while holding cgroup_mutex
+- while holding the task's alloc_lock (via task_lock())
+- inside an rcu_read_lock() section via rcu_dereference()
+
+3.3 Subsystem API
+-----------------
+
+Each subsystem should:
+
+- add an entry in linux/cgroup_subsys.h
+- define a cgroup_subsys object called <name>_subsys
+
+Each subsystem may export the following methods. The only mandatory
+methods are create/destroy. Any others that are null are presumed to
+be successful no-ops.
+
+struct cgroup_subsys_state *create(struct cgroup *cont)
+LL=cgroup_mutex
+
+Called to create a subsystem state object for a cgroup. The
+subsystem should allocate its subsystem state object for the passed
+cgroup, returning a pointer to the new object on success or a
+negative error code. On success, the subsystem pointer should point to
+a structure of type cgroup_subsys_state (typically embedded in a
+larger subsystem-specific object), which will be initialized by the
+cgroup system. Note that this will be called at initialization to
+create the root subsystem state for this subsystem; this case can be
+identified by the passed cgroup object having a NULL parent (since
+it's the root of the hierarchy) and may be an appropriate place for
+initialization code.
+
+void destroy(struct cgroup *cont)
+LL=cgroup_mutex
+
+The cgroup system is about to destroy the passed cgroup; the
+subsystem should do any necessary cleanup.
+
+int can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
+ struct task_struct *task)
+LL=cgroup_mutex
+
+Called prior to moving a task into a cgroup; if the subsystem
+returns an error, this will abort the attach operation. If a NULL
+task is passed, then a successful result indicates that *any*
+unspecified task can be moved into the cgroup. Note that this isn't
+called on a fork. If this method returns 0 (success) then this should
+remain valid while the caller holds cgroup_mutex.
+
+void attach(struct cgroup_subsys *ss, struct cgroup *cont,
+ struct cgroup *old_cont, struct task_struct *task)
+LL=cgroup_mutex
+
+
+Called after the task has been attached to the cgroup, to allow any
+post-attachment activity that requires memory allocations or blocking.
+
+void fork(struct cgroup_subsys *ss, struct task_struct *task)
+LL=callback_mutex, maybe read_lock(tasklist_lock)
+
+Called when a task is forked into a cgroup. Also called during
+registration for all existing tasks.
+
+void exit(struct cgroup_subsys *ss, struct task_struct *task)
+LL=callback_mutex
+
+Called during task exit.
+
+int populate(struct cgroup_subsys *ss, struct cgroup *cont)
+LL=none
+
+Called after creation of a cgroup to allow a subsystem to populate
+the cgroup directory with file entries. The subsystem should make
+calls to cgroup_add_file() with objects of type cftype (see
+include/linux/cgroup.h for details). Note that although this
+method can return an error code, the error code is currently not
+always handled well.
+
+void post_clone(struct cgroup_subsys *ss, struct cgroup *cont)
+
+Called at the end of cgroup_clone() to do any parameter
+initialization which might be required before a task could attach. For
+example in cpusets, no task may attach before 'cpus' and 'mems' are set
+up.
+
+void bind(struct cgroup_subsys *ss, struct cgroup *root)
+LL=callback_mutex
+
+Called when a cgroup subsystem is rebound to a different hierarchy
+and root cgroup. Currently this will only involve movement between
+the default hierarchy (which never has sub-cgroups) and a hierarchy
+that is being created/destroyed (and hence has no sub-cgroups).
+
+4. Questions
+============
+
+Q: what's up with this '/bin/echo' ?
+A: bash's builtin 'echo' command does not check calls to write() against
+ errors. If you use it in the cgroup file system, you won't be
+ able to tell whether a command succeeded or failed.
+
+Q: When I attach processes, only the first of the line gets really attached !
+A: We can only return one error code per call to write(). So you should also
+ put only ONE pid.
+
diff --git a/Documentation/cpu-hotplug.txt b/Documentation/cpu-hotplug.txt
index b6d24c22274..a741f658a3c 100644
--- a/Documentation/cpu-hotplug.txt
+++ b/Documentation/cpu-hotplug.txt
@@ -220,7 +220,9 @@ A: The following happen, listed in no particular order :-)
CPU_DOWN_PREPARE or CPU_DOWN_PREPARE_FROZEN, depending on whether or not the
CPU is being offlined while tasks are frozen due to a suspend operation in
progress
-- All process is migrated away from this outgoing CPU to a new CPU
+- All processes are migrated away from this outgoing CPU to new CPUs.
+ The new CPU is chosen from each process' current cpuset, which may be
+ a subset of all online CPUs.
- All interrupts targeted to this CPU is migrated to a new CPU
- timers/bottom half/task lets are also migrated to a new CPU
- Once all services are migrated, kernel calls an arch specific routine
diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt
index f2c0a684293..141bef1c859 100644
--- a/Documentation/cpusets.txt
+++ b/Documentation/cpusets.txt
@@ -7,6 +7,7 @@ Written by Simon.Derr@bull.net
Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
Modified by Paul Jackson <pj@sgi.com>
Modified by Christoph Lameter <clameter@sgi.com>
+Modified by Paul Menage <menage@google.com>
CONTENTS:
=========
@@ -16,9 +17,9 @@ CONTENTS:
1.2 Why are cpusets needed ?
1.3 How are cpusets implemented ?
1.4 What are exclusive cpusets ?
- 1.5 What does notify_on_release do ?
- 1.6 What is memory_pressure ?
- 1.7 What is memory spread ?
+ 1.5 What is memory_pressure ?
+ 1.6 What is memory spread ?
+ 1.7 What is sched_load_balance ?
1.8 How do I use cpusets ?
2. Usage Examples and Syntax
2.1 Basic Usage
@@ -35,7 +36,8 @@ CONTENTS:
----------------------
Cpusets provide a mechanism for assigning a set of CPUs and Memory
-Nodes to a set of tasks.
+Nodes to a set of tasks. In this document "Memory Node" refers to
+an on-line node that contains memory.
Cpusets constrain the CPU and Memory placement of tasks to only
the resources within a tasks current cpuset. They form a nested
@@ -43,18 +45,19 @@ hierarchy visible in a virtual file system. These are the essential
hooks, beyond what is already present, required to manage dynamic
job placement on large systems.
-Each task has a pointer to a cpuset. Multiple tasks may reference
-the same cpuset. Requests by a task, using the sched_setaffinity(2)
-system call to include CPUs in its CPU affinity mask, and using the
-mbind(2) and set_mempolicy(2) system calls to include Memory Nodes
-in its memory policy, are both filtered through that tasks cpuset,
-filtering out any CPUs or Memory Nodes not in that cpuset. The
-scheduler will not schedule a task on a CPU that is not allowed in
-its cpus_allowed vector, and the kernel page allocator will not
-allocate a page on a node that is not allowed in the requesting tasks
-mems_allowed vector.
-
-User level code may create and destroy cpusets by name in the cpuset
+Cpusets use the generic cgroup subsystem described in
+Documentation/cgroups.txt.
+
+Requests by a task, using the sched_setaffinity(2) system call to
+include CPUs in its CPU affinity mask, and using the mbind(2) and
+set_mempolicy(2) system calls to include Memory Nodes in its memory
+policy, are both filtered through that tasks cpuset, filtering out any
+CPUs or Memory Nodes not in that cpuset. The scheduler will not
+schedule a task on a CPU that is not allowed in its cpus_allowed
+vector, and the kernel page allocator will not allocate a page on a
+node that is not allowed in the requesting tasks mems_allowed vector.
+
+User level code may create and destroy cpusets by name in the cgroup
virtual file system, manage the attributes and permissions of these
cpusets and which CPUs and Memory Nodes are assigned to each cpuset,
specify and query to which cpuset a task is assigned, and list the
@@ -86,9 +89,6 @@ This can be especially valuable on:
and a database), or
* NUMA systems running large HPC applications with demanding
performance characteristics.
- * Also cpu_exclusive cpusets are useful for servers running orthogonal
- workloads such as RT applications requiring low latency and HPC
- applications that are throughput sensitive
These subsets, or "soft partitions" must be able to be dynamically
adjusted, as the job mix changes, without impacting other concurrently
@@ -117,7 +117,7 @@ Cpusets extends these two mechanisms as follows:
- Cpusets are sets of allowed CPUs and Memory Nodes, known to the
kernel.
- Each task in the system is attached to a cpuset, via a pointer
- in the task structure to a reference counted cpuset structure.
+ in the task structure to a reference counted cgroup structure.
- Calls to sched_setaffinity are filtered to just those CPUs
allowed in that tasks cpuset.
- Calls to mbind and set_mempolicy are filtered to just
@@ -131,8 +131,6 @@ Cpusets extends these two mechanisms as follows:
- A cpuset may be marked exclusive, which ensures that no other
cpuset (except direct ancestors and descendents) may contain
any overlapping CPUs or Memory Nodes.
- Also a cpu_exclusive cpuset would be associated with a sched
- domain.
- You can list all the tasks (by pid) attached to any cpuset.
The implementation of cpusets requires a few, simple hooks
@@ -144,23 +142,15 @@ into the rest of the kernel, none in performance critical paths:
allowed in that tasks cpuset.
- in sched.c migrate_all_tasks(), to keep migrating tasks within
the CPUs allowed by their cpuset, if possible.
- - in sched.c, a new API partition_sched_domains for handling
- sched domain changes associated with cpu_exclusive cpusets
- and related changes in both sched.c and arch/ia64/kernel/domain.c
- in the mbind and set_mempolicy system calls, to mask the requested
Memory Nodes by what's allowed in that tasks cpuset.
- in page_alloc.c, to restrict memory to allowed nodes.
- in vmscan.c, to restrict page recovery to the current cpuset.
-In addition a new file system, of type "cpuset" may be mounted,
-typically at /dev/cpuset, to enable browsing and modifying the cpusets
-presently known to the kernel. No new system calls are added for
-cpusets - all support for querying and modifying cpusets is via
-this cpuset file system.
-
-Each task under /proc has an added file named 'cpuset', displaying
-the cpuset name, as the path relative to the root of the cpuset file
-system.
+You should mount the "cgroup" filesystem type in order to enable
+browsing and modifying the cpusets presently known to the kernel. No
+new system calls are added for cpusets - all support for querying and
+modifying cpusets is via this cpuset file system.
The /proc/<pid>/status file for each task has two added lines,
displaying the tasks cpus_allowed (on which CPUs it may be scheduled)
@@ -170,16 +160,15 @@ in the format seen in the following example:
Cpus_allowed: ffffffff,ffffffff,ffffffff,ffffffff
Mems_allowed: ffffffff,ffffffff
-Each cpuset is represented by a directory in the cpuset file system
-containing the following files describing that cpuset:
+Each cpuset is represented by a directory in the cgroup file system
+containing (on top of the standard cgroup files) the following
+files describing that cpuset:
- cpus: list of CPUs in that cpuset
- mems: list of Memory Nodes in that cpuset
- memory_migrate flag: if set, move pages to cpusets nodes
- cpu_exclusive flag: is cpu placement exclusive?
- mem_exclusive flag: is memory placement exclusive?
- - tasks: list of tasks (by pid) attached to that cpuset
- - notify_on_release flag: run /sbin/cpuset_release_agent on exit?
- memory_pressure: measure of how much paging pressure in cpuset
In addition, the root cpuset only has the following file:
@@ -220,8 +209,8 @@ and name space for cpusets, with a minimum of additional kernel code.
The cpus and mems files in the root (top_cpuset) cpuset are
read-only. The cpus file automatically tracks the value of
cpu_online_map using a CPU hotplug notifier, and the mems file
-automatically tracks the value of node_online_map using the
-cpuset_track_online_nodes() hook.
+automatically tracks the value of node_states[N_MEMORY]--i.e.,
+nodes with memory--using the cpuset_track_online_nodes() hook.
1.4 What are exclusive cpusets ?
@@ -231,15 +220,6 @@ If a cpuset is cpu or mem exclusive, no other cpuset, other than
a direct ancestor or descendent, may share any of the same CPUs or
Memory Nodes.
-A cpuset that is cpu_exclusive has a scheduler (sched) domain
-associated with it. The sched domain consists of all CPUs in the
-current cpuset that are not part of any exclusive child cpusets.
-This ensures that the scheduler load balancing code only balances
-against the CPUs that are in the sched domain as defined above and
-not all of the CPUs in the system. This removes any overhead due to
-load balancing code trying to pull tasks outside of the cpu_exclusive
-cpuset only to be prevented by the tasks' cpus_allowed mask.
-
A cpuset that is mem_exclusive restricts kernel allocations for
page, buffer and other data commonly shared by the kernel across
multiple users. All cpusets, whether mem_exclusive or not, restrict
@@ -253,21 +233,7 @@ such as requests from interrupt handlers, is allowed to be taken
outside even a mem_exclusive cpuset.
-1.5 What does notify_on_release do ?
-------------------------------------
-
-If the notify_on_release flag is enabled (1) in a cpuset, then whenever
-the last task in the cpuset leaves (exits or attaches to some other
-cpuset) and the last child cpuset of that cpuset is removed, then
-the kernel runs the command /sbin/cpuset_release_agent, supplying the
-pathname (relative to the mount point of the cpuset file system) of the
-abandoned cpuset. This enables automatic removal of abandoned cpusets.
-The default value of notify_on_release in the root cpuset at system
-boot is disabled (0). The default value of other cpusets at creation
-is the current value of their parents notify_on_release setting.
-
-
-1.6 What is memory_pressure ?
+1.5 What is memory_pressure ?
-----------------------------
The memory_pressure of a cpuset provides a simple per-cpuset metric
of the rate that the tasks in a cpuset are attempting to free up in
@@ -324,7 +290,7 @@ the tasks in the cpuset, in units of reclaims attempted per second,
times 1000.
-1.7 What is memory spread ?
+1.6 What is memory spread ?
---------------------------
There are two boolean flag files per cpuset that control where the
kernel allocates pages for the file system buffers and related in
@@ -394,6 +360,142 @@ policy, especially for jobs that might have one thread reading in the
data set, the memory allocation across the nodes in the jobs cpuset
can become very uneven.
+1.7 What is sched_load_balance ?
+--------------------------------
+
+The kernel scheduler (kernel/sched.c) automatically load balances
+tasks. If one CPU is underutilized, kernel code running on that
+CPU will look for tasks on other more overloaded CPUs and move those
+tasks to itself, within the constraints of such placement mechanisms
+as cpusets and sched_setaffinity.
+
+The algorithmic cost of load balancing and its impact on key shared
+kernel data structures such as the task list increases more than
+linearly with the number of CPUs being balanced. So the scheduler
+has support to partition the systems CPUs into a number of sched
+domains such that it only load balances within each sched domain.
+Each sched domain covers some subset of the CPUs in the system;
+no two sched domains overlap; some CPUs might not be in any sched
+domain and hence won't be load balanced.
+
+Put simply, it costs less to balance between two smaller sched domains
+than one big one, but doing so means that overloads in one of the
+two domains won't be load balanced to the other one.
+
+By default, there is one sched domain covering all CPUs, except those
+marked isolated using the kernel boot time "isolcpus=" argument.
+
+This default load balancing across all CPUs is not well suited for
+the following two situations:
+ 1) On large systems, load balancing across many CPUs is expensive.
+ If the system is managed using cpusets to place independent jobs
+ on separate sets of CPUs, full load balancing is unnecessary.
+ 2) Systems supporting realtime on some CPUs need to minimize
+ system overhead on those CPUs, including avoiding task load
+ balancing if that is not needed.
+
+When the per-cpuset flag "sched_load_balance" is enabled (the default
+setting), it requests that all the CPUs in that cpusets allowed 'cpus'
+be contained in a single sched domain, ensuring that load balancing
+can move a task (not otherwise pinned, as by sched_setaffinity)
+from any CPU in that cpuset to any other.
+
+When the per-cpuset flag "sched_load_balance" is disabled, then the
+scheduler will avoid load balancing across the CPUs in that cpuset,
+--except-- in so far as is necessary because some overlapping cpuset
+has "sched_load_balance" enabled.
+
+So, for example, if the top cpuset has the flag "sched_load_balance"
+enabled, then the scheduler will have one sched domain covering all
+CPUs, and the setting of the "sched_load_balance" flag in any other
+cpusets won't matter, as we're already fully load balancing.
+
+Therefore in the above two situations, the top cpuset flag
+"sched_load_balance" should be disabled, and only some of the smaller,
+child cpusets have this flag enabled.
+
+When doing this, you don't usually want to leave any unpinned tasks in
+the top cpuset that might use non-trivial amounts of CPU, as such tasks
+may be artificially constrained to some subset of CPUs, depending on
+the particulars of this flag setting in descendent cpusets. Even if
+such a task could use spare CPU cycles in some other CPUs, the kernel
+scheduler might not consider the possibility of load balancing that
+task to that underused CPU.
+
+Of course, tasks pinned to a particular CPU can be left in a cpuset
+that disables "sched_load_balance" as those tasks aren't going anywhere
+else anyway.
+
+There is an impedance mismatch here, between cpusets and sched domains.
+Cpusets are hierarchical and nest. Sched domains are flat; they don't
+overlap and each CPU is in at most one sched domain.
+
+It is necessary for sched domains to be flat because load balancing
+across partially overlapping sets of CPUs would risk unstable dynamics
+that would be beyond our understanding. So if each of two partially
+overlapping cpusets enables the flag 'sched_load_balance', then we
+form a single sched domain that is a superset of both. We won't move
+a task to a CPU outside its cpuset, but the scheduler load balancing
+code might waste some compute cycles considering that possibility.
+
+This mismatch is why there is not a simple one-to-one relation
+between which cpusets have the flag "sched_load_balance" enabled,
+and the sched domain configuration. If a cpuset enables the flag, it
+will get balancing across all its CPUs, but if it disables the flag,
+it will only be assured of no load balancing if no other overlapping
+cpuset enables the flag.
+
+If two cpusets have partially overlapping 'cpus' allowed, and only
+one of them has this flag enabled, then the other may find its
+tasks only partially load balanced, just on the overlapping CPUs.
+This is just the general case of the top_cpuset example given a few
+paragraphs above. In the general case, as in the top cpuset case,
+don't leave tasks that might use non-trivial amounts of CPU in
+such partially load balanced cpusets, as they may be artificially
+constrained to some subset of the CPUs allowed to them, for lack of
+load balancing to the other CPUs.
+
+1.7.1 sched_load_balance implementation details.
+------------------------------------------------
+
+The per-cpuset flag 'sched_load_balance' defaults to enabled (contrary
+to most cpuset flags.) When enabled for a cpuset, the kernel will
+ensure that it can load balance across all the CPUs in that cpuset
+(makes sure that all the CPUs in the cpus_allowed of that cpuset are
+in the same sched domain.)
+
+If two overlapping cpusets both have 'sched_load_balance' enabled,
+then they will be (must be) both in the same sched domain.
+
+If, as is the default, the top cpuset has 'sched_load_balance' enabled,
+then by the above that means there is a single sched domain covering
+the whole system, regardless of any other cpuset settings.
+
+The kernel commits to user space that it will avoid load balancing
+where it can. It will pick as fine a granularity partition of sched
+domains as it can while still providing load balancing for any set
+of CPUs allowed to a cpuset having 'sched_load_balance' enabled.
+
+The internal kernel cpuset to scheduler interface passes from the
+cpuset code to the scheduler code a partition of the load balanced
+CPUs in the system. This partition is a set of subsets (represented
+as an array of cpumask_t) of CPUs, pairwise disjoint, that cover all
+the CPUs that must be load balanced.
+
+Whenever the 'sched_load_balance' flag changes, or CPUs come or go
+from a cpuset with this flag enabled, or a cpuset with this flag
+enabled is removed, the cpuset code builds a new such partition and
+passes it to the scheduler sched domain setup code, to have the sched
+domains rebuilt as necessary.
+
+This partition exactly defines what sched domains the scheduler should
+setup - one sched domain for each element (cpumask_t) in the partition.
+
+The scheduler remembers the currently active sched domain partitions.
+When the scheduler routine partition_sched_domains() is invoked from
+the cpuset code to update these sched domains, it compares the new
+partition requested with the current, and updates its sched domains,
+removing the old and adding the new, for each change.
1.8 How do I use cpusets ?
--------------------------
@@ -485,7 +587,7 @@ than stress the kernel.
To start a new job that is to be contained within a cpuset, the steps are:
1) mkdir /dev/cpuset
- 2) mount -t cpuset none /dev/cpuset
+ 2) mount -t cgroup -ocpuset cpuset /dev/cpuset
3) Create the new cpuset by doing mkdir's and write's (or echo's) in
the /dev/cpuset virtual file system.
4) Start a task that will be the "founding father" of the new job.
@@ -497,7 +599,7 @@ For example, the following sequence of commands will setup a cpuset
named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
and then start a subshell 'sh' in that cpuset:
- mount -t cpuset none /dev/cpuset
+ mount -t cgroup -ocpuset cpuset /dev/cpuset
cd /dev/cpuset
mkdir Charlie
cd Charlie
@@ -529,7 +631,7 @@ Creating, modifying, using the cpusets can be done through the cpuset
virtual filesystem.
To mount it, type:
-# mount -t cpuset none /dev/cpuset
+# mount -t cgroup -o cpuset cpuset /dev/cpuset
Then under /dev/cpuset you can find a tree that corresponds to the
tree of the cpusets in the system. For instance, /dev/cpuset
@@ -572,6 +674,18 @@ To remove a cpuset, just use rmdir:
This will fail if the cpuset is in use (has cpusets inside, or has
processes attached).
+Note that for legacy reasons, the "cpuset" filesystem exists as a
+wrapper around the cgroup filesystem.
+
+The command
+
+mount -t cpuset X /dev/cpuset
+
+is equivalent to
+
+mount -t cgroup -ocpuset X /dev/cpuset
+echo "/sbin/cpuset_release_agent" > /dev/cpuset/release_agent
+
2.2 Adding/removing cpus
------------------------
diff --git a/Documentation/device-mapper/dm-uevent.txt b/Documentation/device-mapper/dm-uevent.txt
new file mode 100644
index 00000000000..07edbd85c71
--- /dev/null
+++ b/Documentation/device-mapper/dm-uevent.txt
@@ -0,0 +1,97 @@
+The device-mapper uevent code adds the capability to device-mapper to create
+and send kobject uevents (uevents). Previously device-mapper events were only
+available through the ioctl interface. The advantage of the uevents interface
+is the event contains environment attributes providing increased context for
+the event avoiding the need to query the state of the device-mapper device after
+the event is received.
+
+There are two functions currently for device-mapper events. The first function
+listed creates the event and the second function sends the event(s).
+
+void dm_path_uevent(enum dm_uevent_type event_type, struct dm_target *ti,
+ const char *path, unsigned nr_valid_paths)
+
+void dm_send_uevents(struct list_head *events, struct kobject *kobj)
+
+
+The variables added to the uevent environment are:
+
+Variable Name: DM_TARGET
+Uevent Action(s): KOBJ_CHANGE
+Type: string
+Description:
+Value: Name of device-mapper target that generated the event.
+
+Variable Name: DM_ACTION
+Uevent Action(s): KOBJ_CHANGE
+Type: string
+Description:
+Value: Device-mapper specific action that caused the uevent action.
+ PATH_FAILED - A path has failed.
+ PATH_REINSTATED - A path has been reinstated.
+
+Variable Name: DM_SEQNUM
+Uevent Action(s): KOBJ_CHANGE
+Type: unsigned integer
+Description: A sequence number for this specific device-mapper device.
+Value: Valid unsigned integer range.
+
+Variable Name: DM_PATH
+Uevent Action(s): KOBJ_CHANGE
+Type: string
+Description: Major and minor number of the path device pertaining to this
+event.
+Value: Path name in the form of "Major:Minor"
+
+Variable Name: DM_NR_VALID_PATHS
+Uevent Action(s): KOBJ_CHANGE
+Type: unsigned integer
+Description:
+Value: Valid unsigned integer range.
+
+Variable Name: DM_NAME
+Uevent Action(s): KOBJ_CHANGE
+Type: string
+Description: Name of the device-mapper device.
+Value: Name
+
+Variable Name: DM_UUID
+Uevent Action(s): KOBJ_CHANGE
+Type: string
+Description: UUID of the device-mapper device.
+Value: UUID. (Empty string if there isn't one.)
+
+An example of the uevents generated as captured by udevmonitor is shown
+below.
+
+1.) Path failure.
+UEVENT[1192521009.711215] change@/block/dm-3
+ACTION=change
+DEVPATH=/block/dm-3
+SUBSYSTEM=block
+DM_TARGET=multipath
+DM_ACTION=PATH_FAILED
+DM_SEQNUM=1
+DM_PATH=8:32
+DM_NR_VALID_PATHS=0
+DM_NAME=mpath2
+DM_UUID=mpath-35333333000002328
+MINOR=3
+MAJOR=253
+SEQNUM=1130
+
+2.) Path reinstate.
+UEVENT[1192521132.989927] change@/block/dm-3
+ACTION=change
+DEVPATH=/block/dm-3
+SUBSYSTEM=block
+DM_TARGET=multipath
+DM_ACTION=PATH_REINSTATED
+DM_SEQNUM=2
+DM_PATH=8:32
+DM_NR_VALID_PATHS=1
+DM_NAME=mpath2
+DM_UUID=mpath-35333333000002328
+MINOR=3
+MAJOR=253
+SEQNUM=1131
diff --git a/Documentation/devices.txt b/Documentation/devices.txt
index 6c46730c631..e6244cde26e 100644
--- a/Documentation/devices.txt
+++ b/Documentation/devices.txt
@@ -2188,7 +2188,7 @@ Your cooperation is appreciated.
136-143 char Unix98 PTY slaves
0 = /dev/pts/0 First Unix98 pseudo-TTY
- 1 = /dev/pts/1 Second Unix98 pesudo-TTY
+ 1 = /dev/pts/1 Second Unix98 pseudo-TTY
...
These device nodes are automatically generated with
diff --git a/Documentation/dontdiff b/Documentation/dontdiff
index 7b9551fc6fe..f2d658a6a94 100644
--- a/Documentation/dontdiff
+++ b/Documentation/dontdiff
@@ -42,6 +42,9 @@
*.9.gz
.*
.cscope
+.gitignore
+.mailmap
+.mm
53c700_d.h
53c7xx_d.h
53c7xx_u.h
@@ -121,7 +124,6 @@ kxgettext
lkc_defs.h
lex.c*
lex.*.c
-lk201-map.c
logo_*.c
logo_*_clut224.c
logo_*_mono.c
@@ -176,11 +178,13 @@ times.h*
tkparse
trix_boot.h
utsrelease.h*
+vdso.lds
version.h*
vmlinux
vmlinux-*
vmlinux.aout
-vmlinux.lds
+vmlinux*.lds*
+vmlinux*.scr
vsyscall.lds
wanxlfw.inc
uImage
diff --git a/Documentation/driver-model/devres.txt b/Documentation/driver-model/devres.txt
index 8569072fa38..387b8a720f4 100644
--- a/Documentation/driver-model/devres.txt
+++ b/Documentation/driver-model/devres.txt
@@ -32,7 +32,7 @@ braindamaged document, if it's finally working, well, it's working.
For one reason or another, low level drivers don't receive as much
attention or testing as core code, and bugs on driver detach or
-initilaization failure doesn't happen often enough to be noticeable.
+initialization failure don't happen often enough to be noticeable.
Init failure path is worse because it's much less travelled while
needs to handle multiple entry points.
@@ -160,7 +160,7 @@ resources on failure. For example,
devres_release_group(dev, NULL);
return err_code;
-As resource acquision failure usually means probe failure, constructs
+As resource acquisition failure usually means probe failure, constructs
like above are usually useful in midlayer driver (e.g. libata core
layer) where interface function shouldn't have side effect on failure.
For LLDs, just returning error code suffices in most cases.
diff --git a/Documentation/early-userspace/README b/Documentation/early-userspace/README
index cddbac456c2..766d320c8eb 100644
--- a/Documentation/early-userspace/README
+++ b/Documentation/early-userspace/README
@@ -19,7 +19,7 @@ It consists of several major infrastructure components:
- klibc, a userspace C library, currently packaged separately, that is
optimized for correctness and small size.
-The cpio file format used by initramfs is the "newc" (aka "cpio -c")
+The cpio file format used by initramfs is the "newc" (aka "cpio -H newc")
format, and is documented in the file "buffer-format.txt". There are
two ways to add an early userspace image: specify an existing cpio
archive to be used as the image or have the kernel build process build
@@ -44,7 +44,7 @@ The image is specified as one or more sources in
CONFIG_INITRAMFS_SOURCE. Sources can be either directories or files -
cpio archives are *not* allowed when building from sources.
-A source directory will have it and all of it's contents packaged. The
+A source directory will have it and all of its contents packaged. The
specified directory name will be mapped to '/'. When packaging a
directory, limited user and group ID translation can be performed.
INITRAMFS_ROOT_UID can be set to a user ID that needs to be mapped to
@@ -144,7 +144,7 @@ c) using initramfs. The call to prepare_namespace() must be skipped.
initrd format, an cpio archive. It must be called "/init". This binary
is responsible to do all the things prepare_namespace() would do.
- To remain backwards compatibility, the /init binary will only run if it
+ To maintain backwards compatibility, the /init binary will only run if it
comes via an initramfs cpio archive. If this is not the case,
init/main.c:init() will run prepare_namespace() to mount the final root
and exec one of the predefined init binaries.
diff --git a/Documentation/email-clients.txt b/Documentation/email-clients.txt
new file mode 100644
index 00000000000..113165b4830
--- /dev/null
+++ b/Documentation/email-clients.txt
@@ -0,0 +1,217 @@
+Email clients info for Linux
+======================================================================
+
+General Preferences
+----------------------------------------------------------------------
+Patches for the Linux kernel are submitted via email, preferably as
+inline text in the body of the email. Some maintainers accept
+attachments, but then the attachments should have content-type
+"text/plain". However, attachments are generally frowned upon because
+they make quoting portions of the patch more difficult in the patch
+review process.
+
+Email clients that are used for Linux kernel patches should send the
+patch text untouched. For example, they should not modify or delete tabs
+or spaces, even at the beginning or end of lines.
+
+Don't send patches with "format=flowed". This can cause unexpected
+and unwanted line breaks.
+
+Don't let your email client do automatic word wrapping for you.
+This can also corrupt your patch.
+
+Email clients should not modify the character set encoding of the text.
+Emailed patches should be in ASCII or UTF-8 encoding only.
+If you configure your email client to send emails with UTF-8 encoding,
+you avoid some possible charset problems.
+
+Email clients should generate and maintain References: or In-Reply-To:
+headers so that mail threading is not broken.
+
+Copy-and-paste (or cut-and-paste) usually does not work for patches
+because tabs are converted to spaces. Using xclipboard, xclip, and/or
+xcutsel may work, but it's best to test this for yourself or just avoid
+copy-and-paste.
+
+Don't use PGP/GPG signatures in mail that contains patches.
+This breaks many scripts that read and apply the patches.
+(This should be fixable.)
+
+It's a good idea to send a patch to yourself, save the received message,
+and successfully apply it with 'patch' before sending patches to Linux
+mailing lists.
+
+
+Some email client (MUA) hints
+----------------------------------------------------------------------
+Here are some specific MUA configuration hints for editing and sending
+patches for the Linux kernel. These are not meant to be complete
+software package configuration summaries.
+
+Legend:
+TUI = text-based user interface
+GUI = graphical user interface
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Alpine (TUI)
+
+Config options:
+In the "Sending Preferences" section:
+
+- "Do Not Send Flowed Text" must be enabled
+- "Strip Whitespace Before Sending" must be disabled
+
+When composing the message, the cursor should be placed where the patch
+should appear, and then pressing CTRL-R lets you specify the patch file
+to insert into the message.
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Evolution (GUI)
+
+Some people use this successfully for patches.
+
+When composing mail select: Preformat
+ from Format->Heading->Preformatted (Ctrl-7)
+ or the toolbar
+
+Then use:
+ Insert->Text File... (Alt-n x)
+to insert the patch.
+
+You can also "diff -Nru old.c new.c | xclip", select Preformat, then
+paste with the middle button.
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Kmail (GUI)
+
+Some people use Kmail successfully for patches.
+
+The default setting of not composing in HTML is appropriate; do not
+enable it.
+
+When composing an email, under options, uncheck "word wrap". The only
+disadvantage is any text you type in the email will not be word-wrapped
+so you will have to manually word wrap text before the patch. The easiest
+way around this is to compose your email with word wrap enabled, then save
+it as a draft. Once you pull it up again from your drafts it is now hard
+word-wrapped and you can uncheck "word wrap" without losing the existing
+wrapping.
+
+At the bottom of your email, put the commonly-used patch delimiter before
+inserting your patch: three hyphens (---).
+
+Then from the "Message" menu item, select insert file and choose your patch.
+As an added bonus you can customise the message creation toolbar menu
+and put the "insert file" icon there.
+
+You can safely GPG sign attachments, but inlined text is preferred for
+patches so do not GPG sign them. Signing patches that have been inserted
+as inlined text will make them tricky to extract from their 7-bit encoding.
+
+If you absolutely must send patches as attachments instead of inlining
+them as text, right click on the attachment and select properties, and
+highlight "Suggest automatic display" to make the attachment inlined to
+make it more viewable.
+
+When saving patches that are sent as inlined text, select the email that
+contains the patch from the message list pane, right click and select
+"save as". You can use the whole email unmodified as a patch if it was
+properly composed. There is no option currently to save the email when you
+are actually viewing it in its own window -- there has been a request filed
+at kmail's bugzilla and hopefully this will be addressed. Emails are saved
+as read-write for user only so you will have to chmod them to make them
+group and world readable if you copy them elsewhere.
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Lotus Notes (GUI)
+
+Run away from it.
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Mutt (TUI)
+
+Plenty of Linux developers use mutt, so it must work pretty well.
+
+Mutt doesn't come with an editor, so whatever editor you use should be
+used in a way that there are no automatic linebreaks. Most editors have
+an "insert file" option that inserts the contents of a file unaltered.
+
+To use 'vim' with mutt:
+ set editor="vi"
+
+ If using xclip, type the command
+ :set paste
+ before middle button or shift-insert or use
+ :r filename
+
+if you want to include the patch inline.
+(a)ttach works fine without "set paste".
+
+Config options:
+It should work with default settings.
+However, it's a good idea to set the "send_charset" to:
+ set send_charset="us-ascii:utf-8"
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Pine (TUI)
+
+Pine has had some whitespace truncation issues in the past, but these
+should all be fixed now.
+
+Use alpine (pine's successor) if you can.
+
+Config options:
+- quell-flowed-text is needed for recent versions
+- the "no-strip-whitespace-before-send" option is needed
+
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Sylpheed (GUI)
+
+- Works well for inlining text (or using attachments).
+- Allows use of an external editor.
+- Not good for IMAP.
+- Is slow on large folders.
+- Won't do TLS SMTP auth over a non-SSL connection.
+- Has a helpful ruler bar in the compose window.
+- Adding addresses to address book doesn't understand the display name
+ properly.
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Thunderbird (GUI)
+
+By default, thunderbird likes to mangle text, but there are ways to
+coerce it into being nice.
+
+- Under account settings, composition and addressing, uncheck "Compose
+ messages in HTML format".
+
+- Edit your Thunderbird config settings to tell it not to wrap lines:
+ user_pref("mailnews.wraplength", 0);
+
+- Edit your Thunderbird config settings so that it won't use format=flowed:
+ user_pref("mailnews.send_plaintext_flowed", false);
+
+- You need to get Thunderbird into preformat mode:
+. If you compose HTML messages by default, it's not too hard. Just select
+ "Preformat" from the drop-down box just under the subject line.
+. If you compose in text by default, you have to tell it to compose a new
+ message in HTML (just as a one-off), and then force it from there back to
+ text, else it will wrap lines. To do this, use shift-click on the Write
+ icon to compose to get HTML compose mode, then select "Preformat" from
+ the drop-down box just under the subject line.
+
+- Allows use of an external editor:
+ The easiest thing to do with Thunderbird and patches is to use an
+ "external editor" extension and then just use your favorite $EDITOR
+ for reading/merging patches into the body text. To do this, download
+ and install the extension, then add a button for it using
+ View->Toolbars->Customize... and finally just click on it when in the
+ Compose dialog.
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+TkRat (GUI)
+
+Works. Use "Insert file..." or external editor.
+
+ ###
diff --git a/Documentation/fb/00-INDEX b/Documentation/fb/00-INDEX
index 92e89aeef52..caabbd395e6 100644
--- a/Documentation/fb/00-INDEX
+++ b/Documentation/fb/00-INDEX
@@ -5,21 +5,49 @@ please mail me.
00-INDEX
- this file
+arkfb.txt
+ - info on the fbdev driver for ARK Logic chips.
+aty128fb.txt
+ - info on the ATI Rage128 frame buffer driver.
+cirrusfb.txt
+ - info on the driver for Cirrus Logic chipsets.
+cyblafb/
+ - directory with documentation files related to the cyblafb driver.
+deferred_io.txt
+ - an introduction to deferred IO.
+fbcon.txt
+ - intro to and usage guide for the framebuffer console (fbcon).
framebuffer.txt
- - introduction to frame buffer devices
+ - introduction to frame buffer devices.
+imacfb.txt
+ - info on the generic EFI platform driver for Intel based Macs.
+intel810.txt
+ - documentation for the Intel 810/815 framebuffer driver.
+intelfb.txt
+ - docs for Intel 830M/845G/852GM/855GM/865G/915G/945G fb driver.
internals.txt
- - quick overview of frame buffer device internals
+ - quick overview of frame buffer device internals.
+matroxfb.txt
+ - info on the Matrox framebuffer driver for Alpha, Intel and PPC.
modedb.txt
- - info on the video mode database
-aty128fb.txt
- - info on the ATI Rage128 frame buffer driver
-clgenfb.txt
- - info on the Cirrus Logic frame buffer driver
+ - info on the video mode database.
matroxfb.txt
- - info on the Matrox frame buffer driver
+ - info on the Matrox frame buffer driver.
pvr2fb.txt
- - info on the PowerVR 2 frame buffer driver
+ - info on the PowerVR 2 frame buffer driver.
+pxafb.txt
+ - info on the driver for the PXA25x LCD controller.
+s3fb.txt
+ - info on the fbdev driver for S3 Trio/Virge chips.
+sa1100fb.txt
+ - information about the driver for the SA-1100 LCD controller.
+sisfb.txt
+ - info on the framebuffer device driver for various SiS chips.
+sstfb.txt
+ - info on the frame buffer driver for 3dfx' Voodoo Graphics boards.
tgafb.txt
- info on the TGA (DECChip 21030) frame buffer driver
vesafb.txt
- info on the VESA frame buffer device
+vt8623fb.txt
+ - info on the fb driver for the graphics core in VIA VT8623 chipsets.
diff --git a/Documentation/fb/deferred_io.txt b/Documentation/fb/deferred_io.txt
index 73cf9fb7cf6..63883a89212 100644
--- a/Documentation/fb/deferred_io.txt
+++ b/Documentation/fb/deferred_io.txt
@@ -3,7 +3,7 @@ Deferred IO
Deferred IO is a way to delay and repurpose IO. It uses host memory as a
buffer and the MMU pagefault as a pretrigger for when to perform the device
-IO. The following example may be a useful explaination of how one such setup
+IO. The following example may be a useful explanation of how one such setup
works:
- userspace app like Xfbdev mmaps framebuffer
@@ -28,7 +28,7 @@ a relatively more expensive operation.
For some types of nonvolatile high latency displays, the desired image is
the final image rather than the intermediate stages which is why it's okay
-to not update for each write that is occuring.
+to not update for each write that is occurring.
It may be the case that this is useful in other scenarios as well. Paul Mundt
has mentioned a case where it is beneficial to use the page count to decide
diff --git a/Documentation/fb/uvesafb.txt b/Documentation/fb/uvesafb.txt
new file mode 100644
index 00000000000..bcfc233a008
--- /dev/null
+++ b/Documentation/fb/uvesafb.txt
@@ -0,0 +1,188 @@
+
+uvesafb - A Generic Driver for VBE2+ compliant video cards
+==========================================================
+
+1. Requirements
+---------------
+
+uvesafb should work with any video card that has a Video BIOS compliant
+with the VBE 2.0 standard.
+
+Unlike other drivers, uvesafb makes use of a userspace helper called
+v86d. v86d is used to run the x86 Video BIOS code in a simulated and
+controlled environment. This allows uvesafb to function on arches other
+than x86. Check the v86d documentation for a list of currently supported
+arches.
+
+v86d source code can be downloaded from the following website:
+ http://dev.gentoo.org/~spock/projects/uvesafb
+
+Please refer to the v86d documentation for detailed configuration and
+installation instructions.
+
+Note that the v86d userspace helper has to be available at all times in
+order for uvesafb to work properly. If you want to use uvesafb during
+early boot, you will have to include v86d into an initramfs image, and
+either compile it into the kernel or use it as an initrd.
+
+2. Caveats and limitations
+--------------------------
+
+uvesafb is a _generic_ driver which supports a wide variety of video
+cards, but which is ultimately limited by the Video BIOS interface.
+The most important limitations are:
+
+- Lack of any type of acceleration.
+- A strict and limited set of supported video modes. Often the native
+ or most optimal resolution/refresh rate for your setup will not work
+ with uvesafb, simply because the Video BIOS doesn't support the
+ video mode you want to use. This can be especially painful with
+ widescreen panels, where native video modes don't have the 4:3 aspect
+ ratio, which is what most BIOS-es are limited to.
+- Adjusting the refresh rate is only possible with a VBE 3.0 compliant
+ Video BIOS. Note that many nVidia Video BIOS-es claim to be VBE 3.0
+ compliant, while they simply ignore any refresh rate settings.
+
+3. Configuration
+----------------
+
+uvesafb can be compiled either as a module, or directly into the kernel.
+In both cases it supports the same set of configuration options, which
+are either given on the kernel command line or as module parameters, e.g.:
+
+ video=uvesafb:1024x768-32,mtrr:3,ywrap (compiled into the kernel)
+
+ # modprobe uvesafb mode=1024x768-32 mtrr=3 scroll=ywrap (module)
+
+Accepted options:
+
+ypan Enable display panning using the VESA protected mode
+ interface. The visible screen is just a window of the
+ video memory, console scrolling is done by changing the
+ start of the window. Available on x86 only.
+
+ywrap Same as ypan, but assumes your gfx board can wrap-around
+ the video memory (i.e. starts reading from top if it
+ reaches the end of video memory). Faster than ypan.
+ Available on x86 only.
+
+redraw Scroll by redrawing the affected part of the screen, this
+ is the safe (and slow) default.
+
+(If you're using uvesafb as a module, the above three options are
+ used as parameters of the scroll option, e.g. scroll=ypan.)
+
+vgapal Use the standard VGA registers for palette changes.
+
+pmipal Use the protected mode interface for palette changes.
+ This is the default if the protected mode interface is
+ available. Available on x86 only.
+
+mtrr:n Setup memory type range registers for the framebuffer
+ where n:
+ 0 - disabled (equivalent to nomtrr) (default)
+ 1 - uncachable
+ 2 - write-back
+ 3 - write-combining
+ 4 - write-through
+
+ If you see the following in dmesg, choose the type that matches
+ the old one. In this example, use "mtrr:2".
+...
+mtrr: type mismatch for e0000000,8000000 old: write-back new: write-combining
+...
+
+nomtrr Do not use memory type range registers.
+
+vremap:n
+ Remap 'n' MiB of video RAM. If 0 or not specified, remap memory
+ according to video mode.
+
+vtotal:n
+ If the video BIOS of your card incorrectly determines the total
+ amount of video RAM, use this option to override the BIOS (in MiB).
+
+<mode> The mode you want to set, in the standard modedb format. Refer to
+ modedb.txt for a detailed description. When uvesafb is compiled as
+ a module, the mode string should be provided as a value of the
+ 'mode' option.
+
+vbemode:x
+ Force the use of VBE mode x. The mode will only be set if it's
+ found in the VBE-provided list of supported modes.
+ NOTE: The mode number 'x' should be specified in VESA mode number
+ notation, not the Linux kernel one (eg. 257 instead of 769).
+ HINT: If you use this option because normal <mode> parameter does
+ not work for you and you use an X server, you'll probably want to
+ set the 'nocrtc' option to ensure that the video mode is properly
+ restored after console <-> X switches.
+
+nocrtc Do not use CRTC timings while setting the video mode. This option
+ has an effect only if the Video BIOS is VBE 3.0 compliant. Use it
+ if you have problems with modes set the standard way. Note that
+ using this option implies that any refresh rate adjustments will
+ be ignored and the refresh rate will stay at your BIOS default (60 Hz).
+
+noedid Do not try to fetch and use EDID-provided modes.
+
+noblank Disable hardware blanking.
+
+v86d:path
+ Set path to the v86d executable. This option is only available as
+ a module parameter, and not as a part of the video= string. If you
+ need to use it and have uvesafb built into the kernel, use
+ uvesafb.v86d="path".
+
+Additionally, the following parameters may be provided. They all override the
+EDID-provided values and BIOS defaults. Refer to your monitor's specs to get
+the correct values for maxhf, maxvf and maxclk for your hardware.
+
+maxhf:n Maximum horizontal frequency (in kHz).
+maxvf:n Maximum vertical frequency (in Hz).
+maxclk:n Maximum pixel clock (in MHz).
+
+4. The sysfs interface
+----------------------
+
+uvesafb provides several sysfs nodes for configurable parameters and
+additional information.
+
+Driver attributes:
+
+/sys/bus/platform/drivers/uvesafb
+ - v86d (default: /sbin/v86d)
+ Path to the v86d executable. v86d is started by uvesafb
+ if an instance of the daemon isn't already running.
+
+Device attributes:
+
+/sys/bus/platform/drivers/uvesafb/uvesafb.0
+ - nocrtc
+ Use the default refresh rate (60 Hz) if set to 1.
+
+ - oem_product_name
+ - oem_product_rev
+ - oem_string
+ - oem_vendor
+ Information about the card and its maker.
+
+ - vbe_modes
+ A list of video modes supported by the Video BIOS along with their
+ VBE mode numbers in hex.
+
+ - vbe_version
+ A BCD value indicating the implemented VBE standard.
+
+5. Miscellaneous
+----------------
+
+Uvesafb will set a video mode with the default refresh rate and timings
+from the Video BIOS if you set pixclock to 0 in fb_var_screeninfo.
+
+
+--
+ Michal Januszewski <spock@gentoo.org>
+ Last updated: 2007-06-16
+
+ Documentation of the uvesafb options is loosely based on vesafb.txt.
+
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 63df2262d41..6bb9be54ab7 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -14,18 +14,6 @@ Who: Jiri Slaby <jirislaby@gmail.com>
---------------------------
-What: V4L2 VIDIOC_G_MPEGCOMP and VIDIOC_S_MPEGCOMP
-When: October 2007
-Why: Broken attempt to set MPEG compression parameters. These ioctls are
- not able to implement the wide variety of parameters that can be set
- by hardware MPEG encoders. A new MPEG control mechanism was created
- in kernel 2.6.18 that replaces these ioctls. See the V4L2 specification
- (section 1.9: Extended controls) for more information on this topic.
-Who: Hans Verkuil <hverkuil@xs4all.nl> and
- Mauro Carvalho Chehab <mchehab@infradead.org>
-
----------------------------
-
What: dev->power.power_state
When: July 2007
Why: Broken design for runtime control over driver power states, confusing
@@ -49,10 +37,10 @@ Who: David Miller <davem@davemloft.net>
---------------------------
What: Video4Linux API 1 ioctls and video_decoder.h from Video devices.
-When: December 2006
-Files: include/linux/video_decoder.h
-Check: include/linux/video_decoder.h
-Why: V4L1 AP1 was replaced by V4L2 API. during migration from 2.4 to 2.6
+When: December 2008
+Files: include/linux/video_decoder.h include/linux/videodev.h
+Check: include/linux/video_decoder.h include/linux/videodev.h
+Why: V4L1 API was replaced by V4L2 API during migration from 2.4 to 2.6
series. The old API have lots of drawbacks and don't provide enough
means to work with all video and audio standards. The newer API is
already available on the main drivers and should be used instead.
@@ -61,7 +49,9 @@ Why: V4L1 AP1 was replaced by V4L2 API. during migration from 2.4 to 2.6
Decoder iocts are using internally to allow video drivers to
communicate with video decoders. This should also be improved to allow
V4L2 calls being translated into compatible internal ioctls.
-Who: Mauro Carvalho Chehab <mchehab@brturbo.com.br>
+ Compatibility ioctls will be provided, for a while, via
+ v4l1-compat module.
+Who: Mauro Carvalho Chehab <mchehab@infradead.org>
---------------------------
@@ -82,6 +72,52 @@ Who: Dominik Brodowski <linux@brodo.de>
---------------------------
+What: sys_sysctl
+When: September 2010
+Option: CONFIG_SYSCTL_SYSCALL
+Why: The same information is available in a more convenient form from
+ /proc/sys, and none of the sysctl variables appear to be
+ important performance wise.
+
+ Binary sysctls are a long standing source of subtle kernel
+ bugs and security issues.
+
+ When I looked several months ago all I could find after
+ searching several distributions were 5 user space programs and
+ glibc (which falls back to /proc/sys) using this syscall.
+
+ The man page for sysctl(2) documents it as unusable for user
+ space programs.
+
+ sysctl(2) is not generally ABI compatible to a 32bit user
+ space application on a 64bit and a 32bit kernel.
+
+ For the last several months the policy has been no new binary
+ sysctls and no one has put forward an argument to use them.
+
+ Binary sysctl issues seem to keep appearing, so
+ properly deprecating them (with a warning to user space) and a
+ 2 year grace warning period will mean eventually we can kill
+ them and end the pain.
+
+ In the mean time individual binary sysctls can be dealt with
+ in a piecewise fashion.
+
+Who: Eric Biederman <ebiederm@xmission.com>
+
+---------------------------
+
+What: a.out interpreter support for ELF executables
+When: 2.6.25
+Files: fs/binfmt_elf.c
+Why: Using a.out interpreters for ELF executables was a feature for
+ transition from a.out to ELF. But now it is unlikely to be
+ needed anymore, and removing it would simplify the hairy ELF
+ loader code.
+Who: Andi Kleen <ak@suse.de>
+
+---------------------------
+
What: remove EXPORT_SYMBOL(kernel_thread)
When: August 2006
Files: arch/*/kernel/*_ksyms.c
@@ -173,13 +209,6 @@ Who: Jean Delvare <khali@linux-fr.org>,
---------------------------
-What: drivers depending on OBSOLETE_OSS
-When: options in 2.6.22, code in 2.6.24
-Why: OSS drivers with ALSA replacements
-Who: Adrian Bunk <bunk@stusta.de>
-
----------------------------
-
What: ACPI procfs interface
When: July 2008
Why: ACPI sysfs conversion should be finished by January 2008.
@@ -205,20 +234,6 @@ Who: Len Brown <len.brown@intel.com>
---------------------------
-What: Compaq touchscreen device emulation
-When: Oct 2007
-Files: drivers/input/tsdev.c
-Why: The code says it was obsolete when it was written in 2001.
- tslib is a userspace library which does anything tsdev can do and
- much more besides in userspace where this code belongs. There is no
- longer any need for tsdev and applications should have converted to
- use tslib by now.
- The name "tsdev" is also extremely confusing and lots of people have
- it loaded when they don't need/use it.
-Who: Richard Purdie <rpurdie@rpsys.net>
-
----------------------------
-
What: i2c-ixp2000, i2c-ixp4xx and scx200_i2c drivers
When: September 2007
Why: Obsolete. The new i2c-gpio driver replaces all hardware-specific
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
index 59db1bca702..1de155e2dc3 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -44,14 +44,24 @@ files.txt
- info on file management in the Linux kernel.
fuse.txt
- info on the Filesystem in User SpacE including mount options.
+gfs2.txt
+ - info on the Global File System 2.
hfs.txt
- info on the Macintosh HFS Filesystem for Linux.
+hfsplus.txt
+ - info on the Macintosh HFSPlus Filesystem for Linux.
hpfs.txt
- info and mount options for the OS/2 HPFS.
+inotify.txt
+ - info on the powerful yet simple file change notification system.
isofs.txt
- info and mount options for the ISO 9660 (CDROM) filesystem.
jfs.txt
- info and mount options for the JFS filesystem.
+locks.txt
+ - info on file locking implementations, flock() vs. fcntl(), etc.
+mandatory-locking.txt
+ - info on the Linux implementation of Sys V mandatory file locking.
ncpfs.txt
- info on Novell Netware(tm) filesystem using NCP protocol.
ntfs.txt
diff --git a/Documentation/filesystems/9p.txt b/Documentation/filesystems/9p.txt
index cda6905cbe4..b90f537af35 100644
--- a/Documentation/filesystems/9p.txt
+++ b/Documentation/filesystems/9p.txt
@@ -35,12 +35,12 @@ For remote file server:
For Plan 9 From User Space applications (http://swtch.com/plan9)
- mount -t 9p `namespace`/acme /mnt/9 -o proto=unix,uname=$USER
+ mount -t 9p `namespace`/acme /mnt/9 -o trans=unix,uname=$USER
OPTIONS
=======
- proto=name select an alternative transport. Valid options are
+ trans=name select an alternative transport. Valid options are
currently:
unix - specifying a named pipe mount point
tcp - specifying a normal TCP/IP connection
@@ -54,7 +54,7 @@ OPTIONS
aname=name aname specifies the file tree to access when the server is
offering several exported file systems.
- cache=mode specifies a cacheing policy. By default, no caches are used.
+ cache=mode specifies a caching policy. By default, no caches are used.
loose = no attempts are made at consistency,
intended for exclusive, read-only mounts
@@ -68,9 +68,9 @@ OPTIONS
0x40 = display transport debug
0x80 = display allocation debug
- rfdno=n the file descriptor for reading with proto=fd
+ rfdno=n the file descriptor for reading with trans=fd
- wfdno=n the file descriptor for writing with proto=fd
+ wfdno=n the file descriptor for writing with trans=fd
maxdata=n the number of bytes to use for 9p packet payload (msize)
@@ -78,9 +78,9 @@ OPTIONS
noextend force legacy mode (no 9p2000.u semantics)
- uid attempt to mount as a particular uid
+ dfltuid attempt to mount as a particular uid
- gid attempt to mount with a particular gid
+ dfltgid attempt to mount with a particular gid
afid security channel - used by Plan 9 authentication protocols
@@ -88,6 +88,16 @@ OPTIONS
This can be used to share devices/named pipes/sockets between
hosts. This functionality will be expanded in later versions.
+ access there are three access modes.
+ user = if a user tries to access a file on v9fs
+ filesystem for the first time, v9fs sends an
+ attach command (Tattach) for that user.
+ This is the default mode.
+ <uid> = allows only user with uid=<uid> to access
+ the files on the mounted filesystem
+ any = v9fs does single attach and performs all
+ operations as one user
+
RESOURCES
=========
diff --git a/Documentation/filesystems/Exporting b/Documentation/filesystems/Exporting
index 31047e0fe14..87019d2b598 100644
--- a/Documentation/filesystems/Exporting
+++ b/Documentation/filesystems/Exporting
@@ -2,9 +2,12 @@
Making Filesystems Exportable
=============================
-Most filesystem operations require a dentry (or two) as a starting
+Overview
+--------
+
+All filesystem operations require a dentry (or two) as a starting
point. Local applications have a reference-counted hold on suitable
-dentrys via open file descriptors or cwd/root. However remote
+dentries via open file descriptors or cwd/root. However remote
applications that access a filesystem via a remote filesystem protocol
such as NFS may not be able to hold such a reference, and so need a
different way to refer to a particular dentry. As the alternative
@@ -13,14 +16,14 @@ server-reboot (among other things, though these tend to be the most
problematic), there is no simple answer like 'filename'.
The mechanism discussed here allows each filesystem implementation to
-specify how to generate an opaque (out side of the filesystem) byte
+specify how to generate an opaque (outside of the filesystem) byte
string for any dentry, and how to find an appropriate dentry for any
given opaque byte string.
This byte string will be called a "filehandle fragment" as it
corresponds to part of an NFS filehandle.
A filesystem which supports the mapping between filehandle fragments
-and dentrys will be termed "exportable".
+and dentries will be termed "exportable".
@@ -89,11 +92,9 @@ For a filesystem to be exportable it must:
1/ provide the filehandle fragment routines described below.
2/ make sure that d_splice_alias is used rather than d_add
when ->lookup finds an inode for a given parent and name.
- Typically the ->lookup routine will end:
- if (inode)
- return d_splice(inode, dentry);
- d_add(dentry, inode);
- return NULL;
+ Typically the ->lookup routine will end with a:
+
+ return d_splice_alias(inode, dentry);
}
@@ -101,67 +102,39 @@ For a filesystem to be exportable it must:
A file system implementation declares that instances of the filesystem
are exportable by setting the s_export_op field in the struct
super_block. This field must point to a "struct export_operations"
-struct which could potentially be full of NULLs, though normally at
-least get_parent will be set.
-
- The primary operations are decode_fh and encode_fh.
-decode_fh takes a filehandle fragment and tries to find or create a
-dentry for the object referred to by the filehandle.
-encode_fh takes a dentry and creates a filehandle fragment which can
-later be used to find/create a dentry for the same object.
-
-decode_fh will probably make use of "find_exported_dentry".
-This function lives in the "exportfs" module which a filesystem does
-not need unless it is being exported. So rather that calling
-find_exported_dentry directly, each filesystem should call it through
-the find_exported_dentry pointer in it's export_operations table.
-This field is set correctly by the exporting agent (e.g. nfsd) when a
-filesystem is exported, and before any export operations are called.
-
-find_exported_dentry needs three support functions from the
-filesystem:
- get_name. When given a parent dentry and a child dentry, this
- should find a name in the directory identified by the parent
- dentry, which leads to the object identified by the child dentry.
- If no get_name function is supplied, a default implementation is
- provided which uses vfs_readdir to find potential names, and
- matches inode numbers to find the correct match.
-
- get_parent. When given a dentry for a directory, this should return
- a dentry for the parent. Quite possibly the parent dentry will
- have been allocated by d_alloc_anon.
- The default get_parent function just returns an error so any
- filehandle lookup that requires finding a parent will fail.
- ->lookup("..") is *not* used as a default as it can leave ".."
- entries in the dcache which are too messy to work with.
-
- get_dentry. When given an opaque datum, this should find the
- implied object and create a dentry for it (possibly with
- d_alloc_anon).
- The opaque datum is whatever is passed down by the decode_fh
- function, and is often simply a fragment of the filehandle
- fragment.
- decode_fh passes two datums through find_exported_dentry. One that
- should be used to identify the target object, and one that can be
- used to identify the object's parent, should that be necessary.
- The default get_dentry function assumes that the datum contains an
- inode number and a generation number, and it attempts to get the
- inode using "iget" and check it's validity by matching the
- generation number. A filesystem should only depend on the default
- if iget can safely be used this way.
-
-If decode_fh and/or encode_fh are left as NULL, then default
-implementations are used. These defaults are suitable for ext2 and
-extremely similar filesystems (like ext3).
-
-The default encode_fh creates a filehandle fragment from the inode
-number and generation number of the target together with the inode
-number and generation number of the parent (if the parent is
-required).
-
-The default decode_fh extract the target and parent datums from the
-filehandle assuming the format used by the default encode_fh and
-passed them to find_exported_dentry.
+struct which has the following members:
+
+ encode_fh (optional)
+ Takes a dentry and creates a filehandle fragment which can later be used
+ to find or create a dentry for the same object. The default
+ implementation creates a filehandle fragment that encodes a 32bit inode
+ and generation number for the inode encoded, and if necessary the
+ same information for the parent.
+
+ fh_to_dentry (mandatory)
+ Given a filehandle fragment, this should find the implied object and
+ create a dentry for it (possibly with d_alloc_anon).
+
+ fh_to_parent (optional but strongly recommended)
+ Given a filehandle fragment, this should find the parent of the
+ implied object and create a dentry for it (possibly with d_alloc_anon).
+ May fail if the filehandle fragment is too small.
+
+ get_parent (optional but strongly recommended)
+ When given a dentry for a directory, this should return a dentry for
+ the parent. Quite possibly the parent dentry will have been allocated
+ by d_alloc_anon. The default get_parent function just returns an error
+ so any filehandle lookup that requires finding a parent will fail.
+ ->lookup("..") is *not* used as a default as it can leave ".." entries
+ in the dcache which are too messy to work with.
+
+ get_name (optional)
+ When given a parent dentry and a child dentry, this should find a name
+ in the directory identified by the parent dentry, which leads to the
+ object identified by the child dentry. If no get_name function is
+ supplied, a default implementation is provided which uses vfs_readdir
+ to find potential names, and matches inode numbers to find the correct
+ match.
A filehandle fragment consists of an array of 1 or more 4byte words,
@@ -172,5 +145,3 @@ generated by encode_fh, in which case it will have been padded with
nuls. Rather, the encode_fh routine should choose a "type" which
indicates the decode_fh how much of the filehandle is valid, and how
it should be interpreted.
-
-
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index f0f825808ca..37c10cba717 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -178,15 +178,18 @@ prototypes:
locking rules:
All except set_page_dirty may block
- BKL PageLocked(page)
+ BKL PageLocked(page) i_sem
writepage: no yes, unlocks (see below)
readpage: no yes, unlocks
sync_page: no maybe
writepages: no
set_page_dirty no no
readpages: no
-prepare_write: no yes
-commit_write: no yes
+prepare_write: no yes yes
+commit_write: no yes yes
+write_begin: no locks the page yes
+write_end: no yes, unlocks yes
+perform_write: no n/a yes
bmap: yes
invalidatepage: no yes
releasepage: no yes
@@ -221,7 +224,7 @@ against the page the filesystem should redirty the page with
redirty_page_for_writepage(), then unlock the page and return zero.
This may also be done to avoid internal deadlocks, but rarely.
-If the filesytem is called for sync then it must wait on any
+If the filesystem is called for sync then it must wait on any
in-progress I/O and then start new I/O.
The filesystem should unlock the page synchronously, before returning to the
diff --git a/Documentation/filesystems/ext3.txt b/Documentation/filesystems/ext3.txt
index 4aecc9bdb27..b45f3c1b8b4 100644
--- a/Documentation/filesystems/ext3.txt
+++ b/Documentation/filesystems/ext3.txt
@@ -130,12 +130,12 @@ Device layer.
Journaling Block Device layer
-----------------------------
-The Journaling Block Device layer (JBD) isn't ext3 specific. It was design to
-add journaling capabilities on a block device. The ext3 filesystem code will
-inform the JBD of modifications it is performing (called a transaction). The
-journal supports the transactions start and stop, and in case of crash, the
-journal can replayed the transactions to put the partition back in a
-consistent state fast.
+The Journaling Block Device layer (JBD) isn't ext3 specific. It was designed
+to add journaling capabilities to a block device. The ext3 filesystem code
+will inform the JBD of modifications it is performing (called a transaction).
+The journal supports the transactions start and stop, and in case of a crash,
+the journal can replay the transactions to quickly put the partition back into
+a consistent state.
Handles represent a single atomic update to a filesystem. JBD can handle an
external journal on a block device.
@@ -164,7 +164,7 @@ written to the journal first, and then to its final location.
In the event of a crash, the journal can be replayed, bringing both data and
metadata into a consistent state. This mode is the slowest except when data
needs to be read from and written to disk at the same time where it
-outperforms all others modes.
+outperforms all other modes.
Compatibility
-------------
diff --git a/Documentation/filesystems/files.txt b/Documentation/filesystems/files.txt
index 133e213ebb7..bb0142f6108 100644
--- a/Documentation/filesystems/files.txt
+++ b/Documentation/filesystems/files.txt
@@ -76,13 +76,13 @@ the fdtable structure -
5. Handling of the file structures is special. Since the look-up
of the fd (fget()/fget_light()) are lock-free, it is possible
that look-up may race with the last put() operation on the
- file structure. This is avoided using the rcuref APIs
+ file structure. This is avoided using atomic_inc_not_zero()
on ->f_count :
rcu_read_lock();
file = fcheck_files(files, fd);
if (file) {
- if (rcuref_inc_lf(&file->f_count))
+ if (atomic_inc_not_zero(&file->f_count))
*fput_needed = 1;
else
/* Didn't get the reference, someone's freed */
@@ -92,7 +92,7 @@ the fdtable structure -
....
return file;
- rcuref_inc_lf() detects if refcounts is already zero or
+ atomic_inc_not_zero() detects if the refcount is already zero or
goes to zero during increment. If it does, we fail
fget()/fget_light().
diff --git a/Documentation/locks.txt b/Documentation/filesystems/locks.txt
index e3b402ef33b..fab857accbd 100644
--- a/Documentation/locks.txt
+++ b/Documentation/filesystems/locks.txt
@@ -53,11 +53,11 @@ fcntl(), with all the problems that implies.
1.3 Mandatory Locking As A Mount Option
---------------------------------------
-Mandatory locking, as described in 'Documentation/mandatory.txt' was prior
-to this release a general configuration option that was valid for all
-mounted filesystems. This had a number of inherent dangers, not the least
-of which was the ability to freeze an NFS server by asking it to read a
-file for which a mandatory lock existed.
+Mandatory locking, as described in 'Documentation/filesystems/mandatory-locking.txt'
+was prior to this release a general configuration option that was valid for
+all mounted filesystems. This had a number of inherent dangers, not the
+least of which was the ability to freeze an NFS server by asking it to read
+a file for which a mandatory lock existed.
From this release of the kernel, mandatory locking can be turned on and off
on a per-filesystem basis, using the mount options 'mand' and 'nomand'.
diff --git a/Documentation/mandatory.txt b/Documentation/filesystems/mandatory-locking.txt
index bc449d49eee..0979d1d2ca8 100644
--- a/Documentation/mandatory.txt
+++ b/Documentation/filesystems/mandatory-locking.txt
@@ -3,7 +3,26 @@
Andy Walker <andy@lysaker.kvaerner.no>
15 April 1996
-
+ (Updated September 2007)
+
+0. Why you should avoid mandatory locking
+-----------------------------------------
+
+The Linux implementation is prey to a number of difficult-to-fix race
+conditions which in practice make it not dependable:
+
+ - The write system call checks for a mandatory lock only once
+ at its start. It is therefore possible for a lock request to
+ be granted after this check but before the data is modified.
+ A process may then see file data change even while a mandatory
+ lock was held.
+ - Similarly, an exclusive lock may be granted on a file after
+ the kernel has decided to proceed with a read, but before the
+ read has actually completed, and the reading process may see
+ the file data in a state which should not have been visible
+ to it.
+ - Similar races make the claimed mutual exclusion between lock
+ and mmap similarly unreliable.
1. What is mandatory locking?
------------------------------
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 4a37e25e694..dec99455321 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -347,7 +347,35 @@ connects the CPUs in a SMP system. This means that an error has been detected,
the IO-APIC automatically retry the transmission, so it should not be a big
problem, but you should read the SMP-FAQ.
-In this context it could be interesting to note the new irq directory in 2.4.
+In 2.6.2* /proc/interrupts was expanded again. This time the goal was for
+/proc/interrupts to display every IRQ vector in use by the system, not
+just those considered 'most important'. The new vectors are:
+
+ THR -- interrupt raised when a machine check threshold counter
+ (typically counting ECC corrected errors of memory or cache) exceeds
+ a configurable threshold. Only available on some systems.
+
+ TRM -- a thermal event interrupt occurs when a temperature threshold
+ has been exceeded for the CPU. This interrupt may also be generated
+ when the temperature drops back to normal.
+
+ SPU -- a spurious interrupt is some interrupt that was raised then lowered
+ by some IO device before it could be fully processed by the APIC. Hence
+ the APIC sees the interrupt but does not know what device it came from.
+ For this case the APIC will generate the interrupt with an IRQ vector
+ of 0xff. This might also be generated by chipset bugs.
+
+ RES, CAL, TLB -- rescheduling, call and TLB flush interrupts are
+ sent from one CPU to another per the needs of the OS. Typically,
+ their statistics are used by kernel developers and interested users to
+ determine the occurrence of interrupts of the given type.
+
+The above IRQ vectors are displayed only when relevant. For example,
+the threshold vector does not exist on x86_64 platforms. Others are
+suppressed when the system is a uniprocessor. As of this writing, only
+i386 and x86_64 platforms support the new IRQ vector displays.
+
+Of some interest is the introduction of the /proc/irq directory to 2.4.
It could be used to set IRQ to CPU affinity, this means that you can "hook" an
IRQ to only one CPU, or to exclude a CPU from handling IRQs. The contents of the
irq subdir is one subdir for each IRQ, and one file; prof_cpu_mask
@@ -785,9 +813,9 @@ Various pieces of information about kernel activity are available in the
since the system first booted. For a quick look, simply cat the file:
> cat /proc/stat
- cpu 2255 34 2290 22625563 6290 127 456
- cpu0 1132 34 1441 11311718 3675 127 438
- cpu1 1123 0 849 11313845 2614 0 18
+ cpu 2255 34 2290 22625563 6290 127 456 0
+ cpu0 1132 34 1441 11311718 3675 127 438 0
+ cpu1 1123 0 849 11313845 2614 0 18 0
intr 114930548 113199788 3 0 5 263 0 4 [... lots more numbers ...]
ctxt 1990473
btime 1062191376
@@ -807,6 +835,7 @@ second). The meanings of the columns are as follows, from left to right:
- iowait: waiting for I/O to complete
- irq: servicing interrupts
- softirq: servicing softirqs
+- steal: involuntary wait
The "intr" line gives counts of interrupts serviced since boot time, for each
of the possible system interrupts. The first column is the total of all
diff --git a/Documentation/filesystems/quota.txt b/Documentation/filesystems/quota.txt
new file mode 100644
index 00000000000..a590c4093ef
--- /dev/null
+++ b/Documentation/filesystems/quota.txt
@@ -0,0 +1,59 @@
+
+Quota subsystem
+===============
+
+Quota subsystem allows system administrator to set limits on used space and
+number of used inodes (inode is a filesystem structure which is associated
+with each file or directory) for users and/or groups. For both used space and
+number of used inodes there are actually two limits. The first one is called
+softlimit and the second one hardlimit. A user can never exceed a hardlimit
+for any resource. User is allowed to exceed softlimit but only for limited
+period of time. This period is called "grace period" or "grace time". When
+grace time is over, user is not able to allocate more space/inodes until he
+frees enough of them to get below softlimit.
+
+Quota limits (and amount of grace time) are set independently for each
+filesystem.
+
+For more details about quota design, see the documentation in quota-tools package
+(http://sourceforge.net/projects/linuxquota).
+
+Quota netlink interface
+=======================
+When user exceeds a softlimit, runs out of grace time or reaches hardlimit,
+quota subsystem traditionally printed a message to the controlling terminal of
+the process which caused the excess. This method has the disadvantage that
+when user is using a graphical desktop he usually cannot see the message.
+Thus quota netlink interface has been designed to pass information about
+the above events to userspace. There they can be captured by an application
+and processed accordingly.
+
+The interface uses generic netlink framework (see
+http://lwn.net/Articles/208755/ and http://people.suug.ch/~tgr/libnl/ for more
+details about this layer). The name of the quota generic netlink interface
+is "VFS_DQUOT". Definitions of constants below are in <linux/quota.h>.
+ Currently, the interface supports only one message type QUOTA_NL_C_WARNING.
+This command is used to send a notification about any of the above mentioned
+events. Each message has six attributes. These are (type of the argument is
+in parentheses):
+ QUOTA_NL_A_QTYPE (u32)
+ - type of quota being exceeded (one of USRQUOTA, GRPQUOTA)
+ QUOTA_NL_A_EXCESS_ID (u64)
+ - UID/GID (depends on quota type) of user / group whose limit
+ is being exceeded.
+ QUOTA_NL_A_CAUSED_ID (u64)
+ - UID of a user who caused the event
+ QUOTA_NL_A_WARNING (u32)
+ - what kind of limit is exceeded:
+ QUOTA_NL_IHARDWARN - inode hardlimit
+ QUOTA_NL_ISOFTLONGWARN - inode softlimit is exceeded longer
+ than given grace period
+ QUOTA_NL_ISOFTWARN - inode softlimit
+ QUOTA_NL_BHARDWARN - space (block) hardlimit
+ QUOTA_NL_BSOFTLONGWARN - space (block) softlimit is exceeded
+ longer than given grace period.
+ QUOTA_NL_BSOFTWARN - space (block) softlimit
+ QUOTA_NL_A_DEV_MAJOR (u32)
+ - major number of a device with the affected filesystem
+ QUOTA_NL_A_DEV_MINOR (u32)
+ - minor number of a device with the affected filesystem
diff --git a/Documentation/filesystems/ramfs-rootfs-initramfs.txt b/Documentation/filesystems/ramfs-rootfs-initramfs.txt
index 25981e2e51b..339c6a4f220 100644
--- a/Documentation/filesystems/ramfs-rootfs-initramfs.txt
+++ b/Documentation/filesystems/ramfs-rootfs-initramfs.txt
@@ -8,7 +8,7 @@ What is ramfs?
Ramfs is a very simple filesystem that exports Linux's disk caching
mechanisms (the page cache and dentry cache) as a dynamically resizable
-ram-based filesystem.
+RAM-based filesystem.
Normally all files are cached in memory by Linux. Pages of data read from
backing store (usually the block device the filesystem is mounted on) are kept
@@ -34,7 +34,7 @@ ramfs and ramdisk:
------------------
The older "ram disk" mechanism created a synthetic block device out of
-an area of ram and used it as backing store for a filesystem. This block
+an area of RAM and used it as backing store for a filesystem. This block
device was of fixed size, so the filesystem mounted on it was of fixed
size. Using a ram disk also required unnecessarily copying memory from the
fake block device into the page cache (and copying changes back out), as well
@@ -46,8 +46,8 @@ unnecessary work for the CPU, and pollutes the CPU caches. (There are tricks
to avoid this copying by playing with the page tables, but they're unpleasantly
complicated and turn out to be about as expensive as the copying anyway.)
More to the point, all the work ramfs is doing has to happen _anyway_,
-since all file access goes through the page and dentry caches. The ram
-disk is simply unnecessary, ramfs is internally much simpler.
+since all file access goes through the page and dentry caches. The RAM
+disk is simply unnecessary; ramfs is internally much simpler.
Another reason ramdisks are semi-obsolete is that the introduction of
loopback devices offered a more flexible and convenient way to create
@@ -103,7 +103,7 @@ All this differs from the old initrd in several ways:
initramfs archive is a gzipped cpio archive (like tar only simpler,
see cpio(1) and Documentation/early-userspace/buffer-format.txt). The
kernel's cpio extraction code is not only extremely small, it's also
- __init data that can be discarded during the boot process.
+ __init text and data that can be discarded during the boot process.
- The program run by the old initrd (which was called /initrd, not /init) did
some setup and then returned to the kernel, while the init program from
@@ -220,7 +220,7 @@ device) but the separate packaging of initrd (which is nice if you have
non-GPL code you'd like to run from initramfs, without conflating it with
the GPL licensed Linux kernel binary).
-It can also be used to supplement the kernel's built-in initamfs image. The
+It can also be used to supplement the kernel's built-in initramfs image. The
files in the external archive will overwrite any conflicting files in
the built-in initramfs archive. Some distributors also prefer to customize
a single kernel image with task-specific initramfs images, without recompiling.
@@ -339,7 +339,7 @@ smooth transition and allowing early boot functionality to gradually move to
The move to early userspace is necessary because finding and mounting the real
root device is complex. Root partitions can span multiple devices (raid or
separate journal). They can be out on the network (requiring dhcp, setting a
-specific mac address, logging into a server, etc). They can live on removable
+specific MAC address, logging into a server, etc). They can live on removable
media, with dynamically allocated major/minor numbers and persistent naming
issues requiring a full udev implementation to sort out. They can be
compressed, encrypted, copy-on-write, loopback mounted, strangely partitioned,
diff --git a/Documentation/filesystems/sysfs.txt b/Documentation/filesystems/sysfs.txt
index 4b5ca26e504..4598ef7b622 100644
--- a/Documentation/filesystems/sysfs.txt
+++ b/Documentation/filesystems/sysfs.txt
@@ -51,7 +51,7 @@ for the attributes, providing a means to read and write kernel
attributes.
Attributes should be ASCII text files, preferably with only one value
-per file. It is noted that it may not be efficient to contain only
+per file. It is noted that it may not be efficient to contain only one
value per file, so it is socially acceptable to express an array of
values of the same type.
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 045f3e055a2..9d019d35728 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -537,6 +537,12 @@ struct address_space_operations {
struct list_head *pages, unsigned nr_pages);
int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
+ int (*write_begin)(struct file *, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata);
+ int (*write_end)(struct file *, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata);
sector_t (*bmap)(struct address_space *, sector_t);
int (*invalidatepage) (struct page *, unsigned long);
int (*releasepage) (struct page *, int);
@@ -615,11 +621,7 @@ struct address_space_operations {
any basic-blocks on storage, then those blocks should be
pre-read (if they haven't been read already) so that the
updated blocks can be written out properly.
- The page will be locked. If prepare_write wants to unlock the
- page it, like readpage, may do so and return
- AOP_TRUNCATED_PAGE.
- In this case the prepare_write will be retried one the lock is
- regained.
+ The page will be locked.
Note: the page _must not_ be marked uptodate in this function
(or anywhere else) unless it actually is uptodate right now. As
@@ -633,6 +635,45 @@ struct address_space_operations {
operations. It should avoid returning an error if possible -
errors should have been handled by prepare_write.
+ write_begin: This is intended as a replacement for prepare_write. The
+ key differences being that:
+ - it returns a locked page (in *pagep) rather than being
+ given a pre-locked page;
+ - it must be able to cope with short writes (where the
+ length passed to write_begin is greater than the number
+ of bytes copied into the page).
+
+ Called by the generic buffered write code to ask the filesystem to
+ prepare to write len bytes at the given offset in the file. The
+ address_space should check that the write will be able to complete,
+ by allocating space if necessary and doing any other internal
+ housekeeping. If the write will update parts of any basic-blocks on
+ storage, then those blocks should be pre-read (if they haven't been
+ read already) so that the updated blocks can be written out properly.
+
+ The filesystem must return the locked pagecache page for the specified
+ offset, in *pagep, for the caller to write into.
+
+ flags is a field for AOP_FLAG_xxx flags, described in
+ include/linux/fs.h.
+
+ A void * may be returned in fsdata, which then gets passed into
+ write_end.
+
+ Returns 0 on success; < 0 on failure (which is the error code), in
+ which case write_end is not called.
+
+ write_end: After a successful write_begin, and data copy, write_end must
+ be called. len is the original len passed to write_begin, and copied
+ is the amount that was able to be copied (copied == len is always true
+ if write_begin was called with the AOP_FLAG_UNINTERRUPTIBLE flag).
+
+ The filesystem must take care of unlocking the page and releasing its
+ refcount, and updating i_size.
+
+ Returns < 0 on failure, otherwise the number of bytes (<= 'copied')
+ that were able to be copied into pagecache.
+
bmap: called by the VFS to map a logical block offset within object to
physical block number. This method is used by the FIBMAP
ioctl and for working with swap-files. To be able to swap to
@@ -665,7 +706,7 @@ struct address_space_operations {
wants to make it a free page. If ->releasepage succeeds, the
page will be removed from the address_space and become free.
- The second case if when a request has been made to invalidate
+ The second case is when a request has been made to invalidate
some or all pages in an address_space. This can happen
through the fadvise(POSIX_FADV_DONTNEED) system call or by the
filesystem explicitly requesting it as nfs and 9fs do (when
diff --git a/Documentation/firmware_class/firmware_sample_firmware_class.c b/Documentation/firmware_class/firmware_sample_firmware_class.c
index fba943aacf9..2de62854f0e 100644
--- a/Documentation/firmware_class/firmware_sample_firmware_class.c
+++ b/Documentation/firmware_class/firmware_sample_firmware_class.c
@@ -109,15 +109,15 @@ static int fw_setup_class_device(struct class_device *class_dev,
const char *fw_name,
struct device *device)
{
- int retval = 0;
- struct firmware_priv *fw_priv = kmalloc(sizeof(struct firmware_priv),
- GFP_KERNEL);
+ int retval;
+ struct firmware_priv *fw_priv;
- if(!fw_priv){
+ fw_priv = kzalloc(sizeof(struct firmware_priv), GFP_KERNEL);
+ if (!fw_priv) {
retval = -ENOMEM;
goto out;
}
- memset(fw_priv, 0, sizeof(*fw_priv));
+
memset(class_dev, 0, sizeof(*class_dev));
strncpy(fw_priv->fw_id, fw_name, FIRMWARE_NAME_MAX);
diff --git a/Documentation/i2c/i2c-protocol b/Documentation/i2c/i2c-protocol
index 579b92d5f3a..10518dd5881 100644
--- a/Documentation/i2c/i2c-protocol
+++ b/Documentation/i2c/i2c-protocol
@@ -68,7 +68,7 @@ We have found some I2C devices that needs the following modifications:
Flags I2C_M_IGNORE_NAK
Normally message is interrupted immediately if there is [NA] from the
- client. Setting this flag treats any [NA] as [A], and all of
+ client. Setting this flag treats any [NA] as [A], and all of
message is sent.
These messages may still fail to SCL lo->hi timeout.
diff --git a/Documentation/i386/boot.txt b/Documentation/i386/boot.txt
index 35985b34d5a..2f75e750e4f 100644
--- a/Documentation/i386/boot.txt
+++ b/Documentation/i386/boot.txt
@@ -168,6 +168,8 @@ Offset Proto Name Meaning
0234/1 2.05+ relocatable_kernel Whether kernel is relocatable or not
0235/3 N/A pad2 Unused
0238/4 2.06+ cmdline_size Maximum size of the kernel command line
+023C/4 2.07+ hardware_subarch Hardware subarchitecture
+0240/8 2.07+ hardware_subarch_data Subarchitecture-specific data
(1) For backwards compatibility, if the setup_sects field contains 0, the
real value is 4.
@@ -204,7 +206,7 @@ boot loaders can ignore those fields.
The byte order of all fields is little-endian (this is x86, after all.)
-Field name: setup_secs
+Field name: setup_sects
Type: read
Offset/size: 0x1f1/1
Protocol: ALL
@@ -356,6 +358,13 @@ Protocol: 2.00+
- If 0, the protected-mode code is loaded at 0x10000.
- If 1, the protected-mode code is loaded at 0x100000.
+ Bit 6 (write): KEEP_SEGMENTS
+ Protocol: 2.07+
+ - if 0, reload the segment registers in the 32bit entry point.
+ - if 1, do not reload the segment registers in the 32bit entry point.
+ Assume that %cs %ds %ss %es are all set to flat segments with
+ a base of 0 (or the equivalent for their environment).
+
Bit 7 (write): CAN_USE_HEAP
Set this bit to 1 to indicate that the value entered in the
heap_end_ptr is valid. If this field is clear, some setup code
@@ -480,6 +489,29 @@ Protocol: 2.06+
cmdline_size characters. With protocol version 2.05 and earlier, the
maximum size was 255.
+Field name: hardware_subarch
+Type: write
+Offset/size: 0x23c/4
+Protocol: 2.07+
+
+ In a paravirtualized environment the hardware low level architectural
+ pieces such as interrupt handling, page table handling, and
+ accessing process control registers need to be done differently.
+
+ This field allows the bootloader to inform the kernel we are in one
+ of those environments.
+
+ 0x00000000 The default x86/PC environment
+ 0x00000001 lguest
+ 0x00000002 Xen
+
+Field name: hardware_subarch_data
+Type: write
+Offset/size: 0x240/8
+Protocol: 2.07+
+
+ A pointer to data that is specific to hardware subarch
+
**** THE KERNEL COMMAND LINE
diff --git a/Documentation/ia64/err_inject.txt b/Documentation/ia64/err_inject.txt
index 6449a7090db..223e4f0582d 100644
--- a/Documentation/ia64/err_inject.txt
+++ b/Documentation/ia64/err_inject.txt
@@ -21,10 +21,10 @@ software test suits to do stressful testing on IPF.
Below is a sample application as part of the whole tool. The sample
can be used as a working test tool. Or it can be expanded to include
-more features. It also can be a integrated into a libary or other user
+more features. It also can be integrated into a library or other user
application to have more thorough test.
-The sample application takes err.conf as error configuation input. Gcc
+The sample application takes err.conf as error configuration input. GCC
compiles the code. After you install err_inject driver, you can run
this sample application to inject errors.
@@ -809,7 +809,7 @@ int err_inj()
}
/* Create semaphore: If one_lock, one semaphore for all processors.
- Otherwise, one sempaphore for each processor. */
+ Otherwise, one semaphore for each processor. */
if (one_lock) {
if (create_sem(0)) {
printf("Can not create semaphore...exit\n");
diff --git a/Documentation/ide.txt b/Documentation/ide.txt
index 3bb9f9c9861..1d50f23a5ca 100644
--- a/Documentation/ide.txt
+++ b/Documentation/ide.txt
@@ -242,6 +242,8 @@ Summary of ide driver parameters for kernel command line
and quite likely to cause trouble with
older/odd IDE drives.
+ "hdx=nodma" : disallow DMA
+
"hdx=swapdata" : when the drive is a disk, byte swap all data
"hdx=bswap" : same as above..........
@@ -278,8 +280,6 @@ Summary of ide driver parameters for kernel command line
"idex=four" : four drives on idex and ide(x^1) share same ports
"idex=reset" : reset interface after probe
-
- "idex=dma" : automatically configure/use DMA if possible.
"idex=ata66" : informs the interface that it has an 80c cable
for chipsets that are ATA-66 capable, but the
@@ -288,8 +288,6 @@ Summary of ide driver parameters for kernel command line
"ide=reverse" : formerly called to pci sub-system, but now local.
- "ide=nodma" : disable DMA globally for the IDE subsystem.
-
The following are valid ONLY on ide0, which usually corresponds
to the first ATA interface found on the particular host, and the defaults for
the base,ctl ports must not be altered.
diff --git a/Documentation/initrd.txt b/Documentation/initrd.txt
index d3dc505104d..74f68b35f7c 100644
--- a/Documentation/initrd.txt
+++ b/Documentation/initrd.txt
@@ -80,8 +80,8 @@ Compressed cpio images
----------------------
Recent kernels have support for populating a ramdisk from a compressed cpio
-archive, on such systems, the creation of a ramdisk image doesn't need to
-involve special block devices or loopbacks, you merely create a directory on
+archive. On such systems, the creation of a ramdisk image doesn't need to
+involve special block devices or loopbacks; you merely create a directory on
disk with the desired initrd content, cd to that directory, and run (as an
example):
@@ -293,7 +293,7 @@ information as small as possible. In this case, a common initrd could be
generated with all the necessary modules. Then, only /sbin/init or a file
read by it would have to be different.
-A third scenario are more convenient recovery disks, because information
+A third scenario is more convenient recovery disks, because information
like the location of the root FS partition doesn't have to be provided at
boot time, but the system loaded from initrd can invoke a user-friendly
dialog and it can also perform some sanity checks (or even some form of
@@ -339,8 +339,8 @@ the new, supported mechanism is called "pivot_root".
Mixed change_root and pivot_root mechanism
------------------------------------------
-In case you did not want to use root=/dev/ram0 to trig the pivot_root mechanism,
-you may create both /linuxrc and /sbin/init in your initrd image.
+In case you did not want to use root=/dev/ram0 to trigger the pivot_root
+mechanism, you may create both /linuxrc and /sbin/init in your initrd image.
/linuxrc would contain only the following:
@@ -350,7 +350,7 @@ echo 0x0100 >/proc/sys/kernel/real-root-dev
umount -n /proc
Once linuxrc exited, the kernel would mount again your initrd as root,
-this time executing /sbin/init. Again, it would be duty of this init
+this time executing /sbin/init. Again, it would be the duty of this init
to build the right environment (maybe using the root= device passed on
the cmdline) before the final execution of the real /sbin/init.
diff --git a/Documentation/input/atarikbd.txt b/Documentation/input/atarikbd.txt
index ab050621e20..f3a3ba8847b 100644
--- a/Documentation/input/atarikbd.txt
+++ b/Documentation/input/atarikbd.txt
@@ -170,7 +170,7 @@ major controller faults (ROM checksum and RAM test) and such things as stuck
keys. Any keys down at power-up are presumed to be stuck, and their BREAK
(sic) code is returned (which without the preceding MAKE code is a flag for a
keyboard error). If the controller self-test completes without error, the code
-0xF0 is returned. (This code will be used to indicate the version/rlease of
+0xF0 is returned. (This code will be used to indicate the version/release of
the ikbd controller. The first release of the ikbd is version 0xF0, should
there be a second release it will be 0xF1, and so on.)
The ikbd defaults to a mouse position reporting with threshold of 1 unit in
@@ -413,7 +413,7 @@ INTERROGATION MODE.
%nnnnmmmm ; where m is JOYSTICK1 state
; and n is JOYSTICK0 state
-Sets the ikbd to do nothing but monitor the serial command lne, maintain the
+Sets the ikbd to do nothing but monitor the serial command line, maintain the
time-of-day clock, and monitor the joystick. The rate sets the interval
between joystick samples.
N.B. The user should not set the rate higher than the serial communications
@@ -446,10 +446,10 @@ The sample interval should be as constant as possible.
; until vertical cursor key is generated before RY
; has elapsed
VX ; length (in tenths of seconds) of joystick closure
- ; until horizontal cursor keystokes are generated
+ ; until horizontal cursor keystrokes are generated
; after RX has elapsed
VY ; length (in tenths of seconds) of joystick closure
- ; until vertical cursor keystokes are generated
+ ; until vertical cursor keystrokes are generated
; after RY has elapsed
In this mode, joystick 0 is scanned in a way that simulates cursor keystrokes.
diff --git a/Documentation/input/ff.txt b/Documentation/input/ff.txt
index 085eb15b45b..ded4d5f5310 100644
--- a/Documentation/input/ff.txt
+++ b/Documentation/input/ff.txt
@@ -1,5 +1,5 @@
Force feedback for Linux.
-By Johann Deneux <deneux@ifrance.com> on 2001/04/22.
+By Johann Deneux <johann.deneux@gmail.com> on 2001/04/22.
Updated by Anssi Hannula <anssi.hannula@gmail.com> on 2006/04/09.
You may redistribute this file. Please remember to include shape.fig and
interactive.fig as well.
diff --git a/Documentation/input/iforce-protocol.txt b/Documentation/input/iforce-protocol.txt
index 8777d2d321e..3ac92413c87 100644
--- a/Documentation/input/iforce-protocol.txt
+++ b/Documentation/input/iforce-protocol.txt
@@ -4,10 +4,10 @@ specify force effects to I-Force 2.0 devices. None of this information comes
from Immerse. That's why you should not trust what is written in this
document. This document is intended to help understanding the protocol.
This is not a reference. Comments and corrections are welcome. To contact me,
-send an email to: deneux@ifrance.com
+send an email to: johann.deneux@gmail.com
** WARNING **
-I may not be held responsible for any dammage or harm caused if you try to
+I shall not be held responsible for any damage or harm caused if you try to
send data to your I-Force device based on what you read in this document.
** Preliminary Notes:
@@ -151,13 +151,13 @@ OP= ff
Query command. Length varies according to the query type.
The general format of this packet is:
ff 01 QUERY [INDEX] CHECKSUM
-reponses are of the same form:
+responses are of the same form:
FF LEN QUERY VALUE_QUERIED CHECKSUM2
where LEN = 1 + length(VALUE_QUERIED)
**** Query ram size ****
QUERY = 42 ('B'uffer size)
-The device should reply with the same packet plus two additionnal bytes
+The device should reply with the same packet plus two additional bytes
containing the size of the memory:
ff 03 42 03 e8 CS would mean that the device has 1000 bytes of ram available.
@@ -234,19 +234,23 @@ is the amount of memory apparently needed for every set of parameters:
** Appendix: How to study the protocol ? **
-1. Generate effects using the force editor provided with the DirectX SDK, or use Immersion Studio (freely available at their web site in the developer section: www.immersion.com)
-2. Start a soft spying RS232 or USB (depending on where you connected your joystick/wheel). I used ComPortSpy from fCoder (alpha version!)
+1. Generate effects using the force editor provided with the DirectX SDK, or
+use Immersion Studio (freely available at their web site in the developer section:
+www.immersion.com)
+2. Start software spying on RS232 or USB (depending on where you connected your
+joystick/wheel). I used ComPortSpy from fCoder (alpha version!)
3. Play the effect, and watch what happens on the spy screen.
A few words about ComPortSpy:
-At first glance, this soft seems, hum, well... buggy. In fact, data appear with a few seconds latency. Personnaly, I restart it every time I play an effect.
+At first glance, this software seems, hum, well... buggy. In fact, data appear with a
+few seconds latency. Personally, I restart it every time I play an effect.
Remember it's free (as in free beer) and alpha!
** URLS **
Check www.immersion.com for Immersion Studio, and www.fcoder.com for ComPortSpy.
** Author of this document **
-Johann Deneux <deneux@ifrance.com>
+Johann Deneux <johann.deneux@gmail.com>
Home page at http://www.esil.univ-mrs.fr/~jdeneux/projects/ff/
Additions by Vojtech Pavlik.
diff --git a/Documentation/input/input-programming.txt b/Documentation/input/input-programming.txt
index d9d523099bb..47fc86830cd 100644
--- a/Documentation/input/input-programming.txt
+++ b/Documentation/input/input-programming.txt
@@ -42,8 +42,8 @@ static int __init button_init(void)
goto err_free_irq;
}
- button_dev->evbit[0] = BIT(EV_KEY);
- button_dev->keybit[LONG(BTN_0)] = BIT(BTN_0);
+ button_dev->evbit[0] = BIT_MASK(EV_KEY);
+ button_dev->keybit[BIT_WORD(BTN_0)] = BIT_MASK(BTN_0);
error = input_register_device(button_dev);
if (error) {
@@ -79,7 +79,7 @@ In the _init function, which is called either upon module load or when
booting the kernel, it grabs the required resources (it should also check
for the presence of the device).
-Then it allocates a new input device structure with input_aloocate_device()
+Then it allocates a new input device structure with input_allocate_device()
and sets up input bitfields. This way the device driver tells the other
parts of the input systems what it is - what events can be generated or
accepted by this input device. Our example device can only generate EV_KEY
@@ -217,14 +217,15 @@ If you don't need absfuzz and absflat, you can set them to zero, which mean
that the thing is precise and always returns to exactly the center position
(if it has any).
-1.4 NBITS(), LONG(), BIT()
+1.4 BITS_TO_LONGS(), BIT_WORD(), BIT_MASK()
~~~~~~~~~~~~~~~~~~~~~~~~~~
-These three macros from input.h help some bitfield computations:
+These three macros from bitops.h help some bitfield computations:
- NBITS(x) - returns the length of a bitfield array in longs for x bits
- LONG(x) - returns the index in the array in longs for bit x
- BIT(x) - returns the index in a long for bit x
+ BITS_TO_LONGS(x) - returns the length of a bitfield array in longs for
+ x bits
+ BIT_WORD(x) - returns the index in the array in longs for bit x
+ BIT_MASK(x) - returns the bit mask in a long for bit x
1.5 The id* and name fields
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/Documentation/isdn/CREDITS b/Documentation/isdn/CREDITS
index 7c17c837064..8cac6c2f23e 100644
--- a/Documentation/isdn/CREDITS
+++ b/Documentation/isdn/CREDITS
@@ -40,7 +40,7 @@ Andreas Kool (akool@Kool.f.EUnet.de)
Pedro Roque Marques (roque@di.fc.ul.pt)
For lot of new ideas and the pcbit driver.
-Eberhard Moenkeberg (emoenke@gwdg.de)
+Eberhard Mönkeberg (emoenke@gwdg.de)
For testing and help to get into kernel.
Thomas Neumann (tn@ruhr.de)
diff --git a/Documentation/isdn/README.concap b/Documentation/isdn/README.concap
index 2f114babe4b..a76d74845a4 100644
--- a/Documentation/isdn/README.concap
+++ b/Documentation/isdn/README.concap
@@ -111,7 +111,7 @@ struct concap_proto_ops{
struct concap_proto * (*proto_new) (void);
/* delete encapsulation protocol instance and free all its resources.
- cprot may no loger be referenced after calling this */
+ cprot may no longer be referenced after calling this */
void (*proto_del)(struct concap_proto *cprot);
/* initialize the protocol's data. To be called at interface startup
diff --git a/Documentation/java.txt b/Documentation/java.txt
index 3cce3fbb664..e6a72328154 100644
--- a/Documentation/java.txt
+++ b/Documentation/java.txt
@@ -37,7 +37,7 @@ other program after you have done the following:
or the following, if you want to be more selective:
':Applet:M::<!--applet::/usr/bin/appletviewer:'
- Of cause you have to fix the path names. Given path/file names in this
+ Of course you have to fix the path names. The path/file names given in this
document match the Debian 2.1 system. (i.e. jdk installed in /usr,
custom wrappers from this document in /usr/local)
diff --git a/Documentation/kbuild/kconfig-language.txt b/Documentation/kbuild/kconfig-language.txt
index fe8b0c4892c..616043a6da9 100644
--- a/Documentation/kbuild/kconfig-language.txt
+++ b/Documentation/kbuild/kconfig-language.txt
@@ -77,7 +77,12 @@ applicable everywhere (see syntax).
Optionally, dependencies only for this default value can be added with
"if".
-- dependencies: "depends on"/"requires" <expr>
+- type definition + default value:
+ "def_bool"/"def_tristate" <expr> ["if" <expr>]
+ This is a shorthand notation for a type definition plus a value.
+ Optionally dependencies for this default value can be added with "if".
+
+- dependencies: "depends on" <expr>
This defines a dependency for this menu entry. If multiple
dependencies are defined, they are connected with '&&'. Dependencies
are applied to all other options within this menu entry (which also
@@ -289,3 +294,10 @@ source:
"source" <prompt>
This reads the specified configuration file. This file is always parsed.
+
+mainmenu:
+
+ "mainmenu" <prompt>
+
+This sets the config program's title bar if the config program chooses
+to use it.
diff --git a/Documentation/kbuild/makefiles.txt b/Documentation/kbuild/makefiles.txt
index e08ef8759a0..7a7753321a2 100644
--- a/Documentation/kbuild/makefiles.txt
+++ b/Documentation/kbuild/makefiles.txt
@@ -276,41 +276,39 @@ more details, with real examples.
--- 3.7 Compilation flags
- EXTRA_CFLAGS, EXTRA_AFLAGS, EXTRA_LDFLAGS, EXTRA_ARFLAGS
+ ccflags-y, asflags-y and ldflags-y
+ The three flags listed above apply only to the kbuild makefile
+ where they are assigned. They are used for all the normal
+ cc, as and ld invocations happening during a recursive build.
+ Note: Flags with the same behaviour were previously named:
+ EXTRA_CFLAGS, EXTRA_AFLAGS and EXTRA_LDFLAGS.
+ They are still supported but their use is deprecated.
- All the EXTRA_ variables apply only to the kbuild makefile
- where they are assigned. The EXTRA_ variables apply to all
- commands executed in the kbuild makefile.
-
- $(EXTRA_CFLAGS) specifies options for compiling C files with
- $(CC).
+ ccflags-y specifies options for compiling C files with $(CC).
Example:
# drivers/sound/emu10k1/Makefile
- EXTRA_CFLAGS += -I$(obj)
- ifdef DEBUG
- EXTRA_CFLAGS += -DEMU10K1_DEBUG
- endif
+ ccflags-y += -I$(obj)
+ ccflags-$(DEBUG) += -DEMU10K1_DEBUG
This variable is necessary because the top Makefile owns the
- variable $(CFLAGS) and uses it for compilation flags for the
+ variable $(KBUILD_CFLAGS) and uses it for compilation flags for the
entire tree.
- $(EXTRA_AFLAGS) is a similar string for per-directory options
+ asflags-y is a similar string for per-directory options
when compiling assembly language source.
Example:
#arch/x86_64/kernel/Makefile
- EXTRA_AFLAGS := -traditional
+ asflags-y := -traditional
- $(EXTRA_LDFLAGS) and $(EXTRA_ARFLAGS) are similar strings for
- per-directory options to $(LD) and $(AR).
+ ldflags-y is a string for per-directory options to $(LD).
Example:
#arch/m68k/fpsp040/Makefile
- EXTRA_LDFLAGS := -x
+ ldflags-y := -x
CFLAGS_$@, AFLAGS_$@
@@ -425,6 +423,7 @@ more details, with real examples.
as-instr checks if the assembler reports a specific instruction
and then outputs either option1 or option2
C escapes are supported in the test instruction
+ Note: as-instr uses KBUILD_AFLAGS for $(AS) options
cc-option
cc-option is used to check if $(CC) supports a given option, and not
@@ -438,6 +437,7 @@ more details, with real examples.
-march=pentium-mmx if supported by $(CC), otherwise -march=i586.
The second argument to cc-option is optional, and if omitted,
cflags-y will be assigned no value if first option is not supported.
+ Note: cc-option uses KBUILD_CFLAGS for $(CC) options
cc-option-yn
cc-option-yn is used to check if gcc supports a given option
@@ -453,6 +453,7 @@ more details, with real examples.
option. When $(biarch) equals 'y', the expanded variables $(aflags-y)
and $(cflags-y) will be assigned the values -a32 and -m32,
respectively.
+ Note: cc-option-yn uses KBUILD_CFLAGS for $(CC) options
cc-option-align
gcc versions >= 3.0 changed the type of options used to specify
@@ -464,10 +465,11 @@ more details, with real examples.
cc-option-align = -falign
Example:
- CFLAGS += $(cc-option-align)-functions=4
+ KBUILD_CFLAGS += $(cc-option-align)-functions=4
In the above example, the option -falign-functions=4 is used for
gcc >= 3.00. For gcc < 3.00, -malign-functions=4 is used.
+ Note: cc-option-align uses KBUILD_CFLAGS for $(CC) options
cc-version
cc-version returns a numerical version of the $(CC) compiler version.
@@ -492,9 +494,9 @@ more details, with real examples.
Example:
#fs/reiserfs/Makefile
- EXTRA_CFLAGS := $(call cc-ifversion, -lt, 0402, -O1)
+ ccflags-y := $(call cc-ifversion, -lt, 0402, -O1)
- In this example, EXTRA_CFLAGS will be assigned the value -O1 if the
+ In this example, ccflags-y will be assigned the value -O1 if the
$(CC) version is less than 4.2.
cc-ifversion takes all the shell operators:
-eq, -ne, -lt, -le, -gt, and -ge
@@ -516,6 +518,28 @@ more details, with real examples.
In this example for a specific GCC version the build will error out explaining
to the user why it stops.
+ cc-cross-prefix
+ cc-cross-prefix is used to check if there exists a $(CC) in path with
+ one of the listed prefixes. The first prefix where there exists a
+ prefix$(CC) in the PATH is returned - and if no prefix$(CC) is found
+ then nothing is returned.
+ Additional prefixes are separated by a single space in the
+ call of cc-cross-prefix.
+ This functionality is useful for architecture Makefiles that try
+ to set CROSS_COMPILE to well-known values but may have several
+ values to select between.
+ It is recommended only to try to set CROSS_COMPILE if it is a cross
+ build (host arch is different from target arch). And if CROSS_COMPILE
+ is already set then leave it with the old value.
+
+ Example:
+ #arch/m68k/Makefile
+ ifneq ($(SUBARCH),$(ARCH))
+ ifeq ($(CROSS_COMPILE),)
+ CROSS_COMPILE := $(call cc-cross-prefix, m68k-linux-gnu-)
+ endif
+ endif
+
=== 4 Host Program support
Kbuild supports building executables on the host for use during the
@@ -780,8 +804,8 @@ When kbuild executes, the following steps are followed (roughly):
Example:
#arch/s390/Makefile
LDFLAGS := -m elf_s390
- Note: EXTRA_LDFLAGS and LDFLAGS_$@ can be used to further customise
- the flags used. See chapter 7.
+ Note: ldflags-y can be used to further customise
+ the flags used. See chapter 3.7.
LDFLAGS_MODULE Options for $(LD) when linking modules
@@ -817,26 +841,26 @@ When kbuild executes, the following steps are followed (roughly):
In this example, the binary $(obj)/image is a binary version of
vmlinux. The usage of $(call if_changed,xxx) will be described later.
- AFLAGS $(AS) assembler flags
+ KBUILD_AFLAGS $(AS) assembler flags
Default value - see top level Makefile
Append or modify as required per architecture.
Example:
#arch/sparc64/Makefile
- AFLAGS += -m64 -mcpu=ultrasparc
+ KBUILD_AFLAGS += -m64 -mcpu=ultrasparc
- CFLAGS $(CC) compiler flags
+ KBUILD_CFLAGS $(CC) compiler flags
Default value - see top level Makefile
Append or modify as required per architecture.
- Often, the CFLAGS variable depends on the configuration.
+ Often, the KBUILD_CFLAGS variable depends on the configuration.
Example:
#arch/i386/Makefile
cflags-$(CONFIG_M386) += -march=i386
- CFLAGS += $(cflags-y)
+ KBUILD_CFLAGS += $(cflags-y)
Many arch Makefiles dynamically run the target C compiler to
probe supported options:
@@ -848,7 +872,7 @@ When kbuild executes, the following steps are followed (roughly):
-march=pentium2,-march=i686)
...
# Disable unit-at-a-time mode ...
- CFLAGS += $(call cc-option,-fno-unit-at-a-time)
+ KBUILD_CFLAGS += $(call cc-option,-fno-unit-at-a-time)
...
@@ -1096,8 +1120,8 @@ When kbuild executes, the following steps are followed (roughly):
specified options when building the target vmlinux.lds.
When building the *.lds target, kbuild uses the variables:
- CPPFLAGS : Set in top-level Makefile
- EXTRA_CPPFLAGS : May be set in the kbuild makefile
+ KBUILD_CPPFLAGS : Set in top-level Makefile
+ cppflags-y : May be set in the kbuild makefile
CPPFLAGS_$(@F) : Target specific flags.
Note that the full filename is used in this
assignment.
diff --git a/Documentation/kdump/kdump.txt b/Documentation/kdump/kdump.txt
index 2fedc081b4c..d0ac72cc19f 100644
--- a/Documentation/kdump/kdump.txt
+++ b/Documentation/kdump/kdump.txt
@@ -13,7 +13,7 @@ dump of the system kernel's memory needs to be taken (for example, when
the system panics). The system kernel's memory image is preserved across
the reboot and is accessible to the dump-capture kernel.
-You can use common Linux commands, such as cp and scp, to copy the
+You can use common commands, such as cp and scp, to copy the
memory image to a dump file on the local disk, or across the network to
a remote system.
@@ -69,7 +69,7 @@ http://www.kernel.org/pub/linux/kernel/people/horms/kexec-tools/kexec-tools-test
This is a symlink to the latest version, which at the time of writing is
20061214, the only release of kexec-tools-testing so far. As other versions
-are made released, the older onese will remain available at
+are released, the older ones will remain available at
http://www.kernel.org/pub/linux/kernel/people/horms/kexec-tools/
Note: Latest kexec-tools-testing git tree is available at
@@ -159,16 +159,17 @@ Dump-capture kernel config options (Arch Independent)
CONFIG_PROC_VMCORE=y
(CONFIG_PROC_VMCORE is set by default when CONFIG_CRASH_DUMP is selected.)
-Dump-capture kernel config options (Arch Dependent, i386)
---------------------------------------------------------
-1) On x86, enable high memory support under "Processor type and
+Dump-capture kernel config options (Arch Dependent, i386 and x86_64)
+--------------------------------------------------------------------
+
+1) On i386, enable high memory support under "Processor type and
features":
CONFIG_HIGHMEM64G=y
or
CONFIG_HIGHMEM4G
-2) On x86 and x86_64, disable symmetric multi-processing support
+2) On i386 and x86_64, disable symmetric multi-processing support
under "Processor type and features":
CONFIG_SMP=n
@@ -203,28 +204,6 @@ Dump-capture kernel config options (Arch Dependent, i386)
5) Make and install the kernel and its modules. DO NOT add this kernel
to the boot loader configuration files.
-Dump-capture kernel config options (Arch Dependent, x86_64)
-----------------------------------------------------------
-1) On x86 and x86_64, disable symmetric multi-processing support
- under "Processor type and features":
-
- CONFIG_SMP=n
-
- (If CONFIG_SMP=y, then specify maxcpus=1 on the kernel command line
- when loading the dump-capture kernel, see section "Load the Dump-capture
- Kernel".)
-
-2) Use a suitable value for "Physical address where the kernel is
- loaded" (under "Processor type and features"). This only appears when
- "kernel crash dumps" is enabled. By default this value is 0x1000000
- (16MB). It should be the same as X in the "crashkernel=Y@X" boot
- parameter.
-
- For x86_64, normally "CONFIG_PHYSICAL_START=0x1000000".
-
-3) Make and install the kernel and its modules. DO NOT add this kernel
- to the boot loader configuration files.
-
Dump-capture kernel config options (Arch Dependent, ppc64)
----------------------------------------------------------
@@ -252,6 +231,32 @@ Dump-capture kernel config options (Arch Dependent, ia64)
any space below the alignment point will be wasted.
+Extended crashkernel syntax
+===========================
+
+While the "crashkernel=size[@offset]" syntax is sufficient for most
+configurations, sometimes it's handy to have the reserved memory dependent
+on the value of System RAM -- that's mostly for distributors that pre-setup
+the kernel command line to avoid an unbootable system after some memory has
+been removed from the machine.
+
+The syntax is:
+
+ crashkernel=<range1>:<size1>[,<range2>:<size2>,...][@offset]
+ range=start-[end]
+
+For example:
+
+ crashkernel=512M-2G:64M,2G-:128M
+
+This would mean:
+
+ 1) if the RAM is smaller than 512M, then don't reserve anything
+ (this is the "rescue" case)
+ 2) if the RAM size is between 512M and 2G, then reserve 64M
+ 3) if the RAM size is larger than 2G, then reserve 128M
+
+
Boot into System Kernel
=======================
@@ -282,11 +287,9 @@ Based on the architecture and type of image (relocatable or not), one
can choose to load the uncompressed vmlinux or compressed bzImage/vmlinuz
of dump-capture kernel. Following is the summary.
-For i386:
+For i386 and x86_64:
- Use vmlinux if kernel is not relocatable.
- Use bzImage/vmlinuz if kernel is relocatable.
-For x86_64:
- - Use vmlinux
For ppc64:
- Use vmlinux
For ia64:
@@ -315,20 +318,22 @@ Following are the arch specific command line options to be used while
loading dump-capture kernel.
For i386, x86_64 and ia64:
- "1 irqpoll maxcpus=1"
+ "1 irqpoll maxcpus=1 reset_devices"
For ppc64:
- "1 maxcpus=1 noirqdistrib"
+ "1 maxcpus=1 noirqdistrib reset_devices"
Notes on loading the dump-capture kernel:
* By default, the ELF headers are stored in ELF64 format to support
- systems with more than 4GB memory. The --elf32-core-headers option can
- be used to force the generation of ELF32 headers. This is necessary
- because GDB currently cannot open vmcore files with ELF64 headers on
- 32-bit systems. ELF32 headers can be used on non-PAE systems (that is,
- less than 4GB of memory).
+ systems with more than 4GB memory. On i386, kexec automatically checks if
+ the physical RAM size exceeds the 4 GB limit and if not, uses ELF32.
+ So, on non-PAE systems, ELF32 is always used.
+
+ The --elf32-core-headers option can be used to force the generation of ELF32
+ headers. This is necessary because GDB currently cannot open vmcore files
+ with ELF64 headers on 32-bit systems.
* The "irqpoll" boot parameter reduces driver initialization failures
due to shared interrupts in the dump-capture kernel.
@@ -360,7 +365,7 @@ If die() is called, and it happens to be a thread with pid 0 or 1, or die()
is called inside interrupt context or die() is called and panic_on_oops is set,
the system will boot into the dump-capture kernel.
-On powererpc systems when a soft-reset is generated, die() is called by all cpus
+On powerpc systems when a soft-reset is generated, die() is called by all cpus
and the system will boot into the dump-capture kernel.
For testing purposes, you can trigger a crash by using "ALT-SysRq-c",
@@ -426,9 +431,3 @@ Contact
Vivek Goyal (vgoyal@in.ibm.com)
Maneesh Soni (maneesh@in.ibm.com)
-
-Trademark
-=========
-
-Linux is a trademark of Linus Torvalds in the United States, other
-countries, or both.
diff --git a/Documentation/kernel-docs.txt b/Documentation/kernel-docs.txt
index d9e3b199929..5a4ef48224a 100644
--- a/Documentation/kernel-docs.txt
+++ b/Documentation/kernel-docs.txt
@@ -76,9 +76,9 @@
* Title: "Conceptual Architecture of the Linux Kernel"
Author: Ivan T. Bowman.
URL: http://plg.uwaterloo.ca/~itbowman/papers/CS746G-a1.html
- Keywords: conceptual software arquitecture, extracted design,
+ Keywords: conceptual software architecture, extracted design,
reverse engineering, system structure.
- Description: Conceptual software arquitecture of the Linux kernel,
+ Description: Conceptual software architecture of the Linux kernel,
automatically extracted from the source code. Very detailed. Good
figures. Gives good overall kernel understanding.
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index c323778270f..b2361667839 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -75,10 +75,12 @@ parameter is applicable:
PPT Parallel port support is enabled.
PS2 Appropriate PS/2 support is enabled.
RAM RAM disk support is enabled.
+ ROOTPLUG The example Root Plug LSM is enabled.
S390 S390 architecture is enabled.
SCSI Appropriate SCSI support is enabled.
A lot of drivers has their options described inside of
Documentation/scsi/.
+ SECURITY Different security models are enabled.
SELINUX SELinux support is enabled.
SERIAL Serial support is enabled.
SH SuperH architecture is enabled.
@@ -220,9 +222,6 @@ and is between 256 and 4096 characters. It is defined in the file
Warning: Many of these options can produce a lot of
output and make your system unusable. Be very careful.
-
- acpi_fake_ecdt [HW,ACPI] Workaround failure due to BIOS lacking ECDT
-
acpi_pm_good [X86-32,X86-64]
Override the pmtimer bug detection: force the kernel
to assume that this machine's pmtimer latches its value
@@ -295,9 +294,6 @@ and is between 256 and 4096 characters. It is defined in the file
apm= [APM] Advanced Power Management
See header of arch/i386/kernel/apm.c.
- applicom= [HW]
- Format: <mem>,<irq>
-
arcrimi= [HW,NET] ARCnet - "RIM I" (entirely mem-mapped) cards
Format: <io>,<irq>,<nodeID>
@@ -343,11 +339,10 @@ and is between 256 and 4096 characters. It is defined in the file
Format: <io>,<irq>,<mode>
See header of drivers/net/hamradio/baycom_ser_hdx.c.
- blkmtd_device= [HW,MTD]
- blkmtd_erasesz=
- blkmtd_ro=
- blkmtd_bs=
- blkmtd_count=
+ boot_delay= Milliseconds to delay each printk during boot.
+ Values larger than 10 seconds (10000) are changed to
+ no delay (0).
+ Format: integer
bttv.card= [HW,V4L] bttv (bt848 + bt878 based grabber cards)
bttv.radio= Most important insmod options are available as
@@ -368,6 +363,12 @@ and is between 256 and 4096 characters. It is defined in the file
possible to determine what the correct size should be.
This option provides an override for these situations.
+ capability.disable=
+ [SECURITY] Disable capabilities. This would normally
+ be used only if an alternative security model is to be
+ configured. Potentially dangerous and should only be
+ used if you are entirely sure of the consequences.
+
chandev= [HW,NET] Generic channel device initialisation
checkreqprot [SELINUX] Set initial checkreqprot flag value.
@@ -418,8 +419,10 @@ and is between 256 and 4096 characters. It is defined in the file
over the 8254 in addition to over the IO-APIC. The
kernel tries to set a sensible default.
- hpet= [X86-32,HPET] option to disable HPET and use PIT.
- Format: disable
+ hpet= [X86-32,HPET] option to control HPET usage
+ Format: { enable (default) | disable | force }
+ disable: disable HPET and use PIT instead
+ force: allow forced enabling of undocumented chips (ICH4, VIA)
com20020= [HW,NET] ARCnet - COM20020 chipset
Format:
@@ -466,6 +469,16 @@ and is between 256 and 4096 characters. It is defined in the file
UART at the specified I/O port or MMIO address.
The options are the same as for ttyS, above.
+ no_console_suspend
+ [HW] Never suspend the console
+ Disable suspending of consoles during suspend and
+ hibernate operations. Once disabled, debugging
+ messages can reach various consoles while the rest
+ of the system is being put to sleep (ie, while
+ debugging driver suspend/resume hooks). This may
+ not work reliably with all consoles, but is known
+ to work with serial and VGA consoles.
+
cpcihp_generic= [HW,PCI] Generic port I/O CompactPCI driver
Format:
<first_slot>,<last_slot>,<port>,<enum_bit>[,<debug>]
@@ -474,6 +487,13 @@ and is between 256 and 4096 characters. It is defined in the file
[KNL] Reserve a chunk of physical memory to
hold a kernel to switch to with kexec on panic.
+ crashkernel=range1:size1[,range2:size2,...][@offset]
+ [KNL] Same as above, but depends on the memory
+ in the running system. The syntax of range is
+ start-[end] where start and end are both
+ a memory unit (amount[KMG]). See also
+ Documentation/kdump/kdump.txt for an example.
+
cs4232= [HW,OSS]
Format: <io>,<irq>,<dma>,<dma2>,<mpuio>,<mpuirq>
@@ -483,8 +503,6 @@ and is between 256 and 4096 characters. It is defined in the file
cs89x0_media= [HW,NET]
Format: { rj45 | aui | bnc }
- cyclades= [HW,SERIAL] Cyclades multi-serial port adapter.
-
dasd= [HW,NET]
See header of drivers/s390/block/dasd_devmap.c.
@@ -542,10 +560,6 @@ and is between 256 and 4096 characters. It is defined in the file
See drivers/char/README.epca and
Documentation/digiepca.txt.
- dmascc= [HW,AX25,SERIAL] AX.25 Z80SCC driver with DMA
- support available.
- Format: <io_dev0>[,<io_dev1>[,..<io_dev32>]]
-
dmasound= [HW,OSS] Sound subsystem buffers
dscc4.setup= [NET]
@@ -576,17 +590,10 @@ and is between 256 and 4096 characters. It is defined in the file
0: polling mode
non-0: interrupt mode (default)
- eda= [HW,PS2]
-
- edb= [HW,PS2]
-
edd= [EDD]
Format: {"of[f]" | "sk[ipmbr]"}
See comment in arch/i386/boot/edd.S
- eicon= [HW,ISDN]
- Format: <id>,<membase>,<irq>
-
eisa_irq_edge= [PARISC,HW]
See header of drivers/parisc/eisa.c.
@@ -765,6 +772,23 @@ and is between 256 and 4096 characters. It is defined in the file
inttest= [IA64]
+ intel_iommu= [DMAR] Intel IOMMU driver (DMAR) option
+ off
+ Disable intel iommu driver.
+ igfx_off [Default Off]
+ By default, gfx is mapped as normal device. If a gfx
+ device has a dedicated DMAR unit, the DMAR unit is
+ bypassed by not enabling DMAR with this option. In
+ this case, gfx device will use physical address for
+ DMA.
+ forcedac [x86_64]
+ With this option iommu will not optimize to look
+ for io virtual address below 32 bit forcing dual
+ address cycle on pci bus for cards supporting greater
+ than 32 bit addressing. The default is to look
+ for translation below 32 bit and if not available
+ then look in the higher range.
+
io7= [HW] IO7 for Marvel based alpha systems
See comment before marvel_specify_io7 in
arch/alpha/kernel/core_marvel.c.
@@ -862,9 +886,6 @@ and is between 256 and 4096 characters. It is defined in the file
lapic_timer_c2_ok [X86-32,x86-64,APIC] trust the local apic timer in
C2 power state.
- lasi= [HW,SCSI] PARISC LASI driver for the 53c700 chip
- Format: addr:<io>,irq:<irq>
-
libata.noacpi [LIBATA] Disables use of ACPI in libata suspend/resume
when set.
Format: <int>
@@ -906,6 +927,11 @@ and is between 256 and 4096 characters. It is defined in the file
n must be a power of two. The default size
is set in the kernel config file.
+ logo.nologo [FB] Disables display of the built-in Linux logo.
+ This may be used to provide more screen space for
+ kernel log messages and is useful when debugging
+ kernel boot problems.
+
lp=0 [LP] Specify parallel ports to use, e.g,
lp=port[,port...] lp=none,parport0 (lp0 not configured, lp1 uses
lp=reset first parallel port). 'lp=0' disables the
@@ -976,6 +1002,8 @@ and is between 256 and 4096 characters. It is defined in the file
mce [X86-32] Machine Check Exception
+ mce=option [X86-64] See Documentation/x86_64/boot-options.txt
+
md= [HW] RAID subsystems devices and level
See Documentation/md.txt.
@@ -1083,6 +1111,13 @@ and is between 256 and 4096 characters. It is defined in the file
[NFS] set the maximum lifetime for idmapper cache
entries.
+ nfs.enable_ino64=
+ [NFS] enable 64-bit inode numbers.
+ If zero, the NFS client will fake up a 32-bit inode
+ number for the readdir() and stat() syscalls instead
+ of returning the full 64-bit number.
+ The default is to return 64-bit inode numbers.
+
nmi_watchdog= [KNL,BUGS=X86-32] Debugging features for SMP kernels
no387 [BUGS=X86-32] Tells the kernel to use the 387 maths
@@ -1098,9 +1133,6 @@ and is between 256 and 4096 characters. It is defined in the file
noapic [SMP,APIC] Tells the kernel to not make use of any
IOAPICs that may be present in the system.
- noasync [HW,M68K] Disables async and sync negotiation for
- all devices.
-
nobats [PPC] Do not use BATs for mapping kernel lowmem
on "Classic" PPC cores.
@@ -1412,6 +1444,7 @@ and is between 256 and 4096 characters. It is defined in the file
Param: <number> - step/bucket size as a power of 2 for
statistical time based profiling.
Param: "sleep" - profile D-state sleeping (millisecs)
+ Param: "kvm" - profile VM exits.
processor.max_cstate= [HW,ACPI]
Limit processor to maximum C-state
@@ -1456,14 +1489,10 @@ and is between 256 and 4096 characters. It is defined in the file
raid= [HW,RAID]
See Documentation/md.txt.
- ramdisk= [RAM] Sizes of RAM disks in kilobytes [deprecated]
- See Documentation/ramdisk.txt.
-
ramdisk_blocksize= [RAM]
See Documentation/ramdisk.txt.
ramdisk_size= [RAM] Sizes of RAM disks in kilobytes
- New name for the ramdisk parameter.
See Documentation/ramdisk.txt.
rcu.blimit= [KNL,BOOT] Set maximum number of finished
@@ -1526,6 +1555,15 @@ and is between 256 and 4096 characters. It is defined in the file
Useful for devices that are detected asynchronously
(e.g. USB and MMC devices).
+ root_plug.vendor_id=
+ [ROOTPLUG] Override the default vendor ID
+
+ root_plug.product_id=
+ [ROOTPLUG] Override the default product ID
+
+ root_plug.debug=
+ [ROOTPLUG] Enable debugging output
+
rw [KNL] Mount root device read-write on boot
S [KNL] Run init in single mode
@@ -1533,9 +1571,6 @@ and is between 256 and 4096 characters. It is defined in the file
sa1100ir [NET]
See drivers/net/irda/sa1100_ir.c.
- sb= [HW,OSS]
- Format: <io>,<irq>,<dma>,<dma2>
-
sbni= [NET] Granch SBNI12 leased line adapter
sc1200wdt= [HW,WDT] SC1200 WDT (watchdog) driver
@@ -1579,8 +1614,6 @@ and is between 256 and 4096 characters. It is defined in the file
serialnumber [BUGS=X86-32]
- sg_def_reserved_size= [SCSI]
-
shapers= [NET]
Maximal number of shapers.
@@ -1883,9 +1916,6 @@ and is between 256 and 4096 characters. It is defined in the file
Format:
<io>,<irq>,<dma>,<dma2>,<sb_io>,<sb_irq>,<sb_dma>,<mpu_io>,<mpu_irq>
- tsdev.xres= [TS] Horizontal screen resolution.
- tsdev.yres= [TS] Vertical screen resolution.
-
turbografx.map[2|3]= [HW,JOY]
TurboGraFX parallel port interface
Format:
@@ -1974,10 +2004,6 @@ and is between 256 and 4096 characters. It is defined in the file
norandmaps Don't use address space randomization
Equivalent to echo 0 > /proc/sys/kernel/randomize_va_space
- unwind_debug=N N > 0 will enable dwarf2 unwinder debugging
- This is useful to get more information why
- you got a "dwarf2 unwinder stuck"
-
______________________________________________________________________
TODO:
diff --git a/Documentation/keys-request-key.txt b/Documentation/keys-request-key.txt
index c1f64fdf84c..266955d23ee 100644
--- a/Documentation/keys-request-key.txt
+++ b/Documentation/keys-request-key.txt
@@ -20,6 +20,19 @@ or:
const char *callout_string,
void *aux);
+or:
+
+ struct key *request_key_async(const struct key_type *type,
+ const char *description,
+ const char *callout_string);
+
+or:
+
+ struct key *request_key_async_with_auxdata(const struct key_type *type,
+ const char *description,
+ const char *callout_string,
+ void *aux);
+
Or by userspace invoking the request_key system call:
key_serial_t request_key(const char *type,
@@ -32,10 +45,14 @@ does not need to link the key to a keyring to prevent it from being immediately
destroyed. The kernel interface returns a pointer directly to the key, and
it's up to the caller to destroy the key.
-The request_key_with_auxdata() call is like the in-kernel request_key() call,
-except that it permits auxiliary data to be passed to the upcaller (the default
-is NULL). This is only useful for those key types that define their own upcall
-mechanism rather than using /sbin/request-key.
+The request_key*_with_auxdata() calls are like the in-kernel request_key*()
+calls, except that they permit auxiliary data to be passed to the upcaller (the
+default is NULL). This is only useful for those key types that define their
+own upcall mechanism rather than using /sbin/request-key.
+
+The two async in-kernel calls may return keys that are still in the process of
+being constructed. The two non-async ones will wait for construction to
+complete first.
The userspace interface links the key to a keyring associated with the process
to prevent the key from going away, and returns the serial number of the key to
diff --git a/Documentation/keys.txt b/Documentation/keys.txt
index 947d57d5345..51652d39e61 100644
--- a/Documentation/keys.txt
+++ b/Documentation/keys.txt
@@ -4,7 +4,7 @@
This service allows cryptographic keys, authentication tokens, cross-domain
user mappings, and similar to be cached in the kernel for the use of
-filesystems other kernel services.
+filesystems and other kernel services.
Keyrings are permitted; these are a special type of key that can hold links to
other keys. Processes each have three standard keyring subscriptions that a
@@ -726,6 +726,15 @@ call, and the key released upon close. How to deal with conflicting keys due to
two different users opening the same file is left to the filesystem author to
solve.
+To access the key manager, the following header must be #included:
+
+ <linux/key.h>
+
+Specific key types should have a header file under include/keys/ that should be
+used to access that type. For keys of type "user", for example, that would be:
+
+ <keys/user-type.h>
+
Note that there are two different types of pointers to keys that may be
encountered:
@@ -791,6 +800,36 @@ payload contents" for more information.
passed to the key_type->request_key() op if it exists.
+(*) A key can be requested asynchronously by calling one of:
+
+ struct key *request_key_async(const struct key_type *type,
+ const char *description,
+ const char *callout_string);
+
+ or:
+
+ struct key *request_key_async_with_auxdata(const struct key_type *type,
+ const char *description,
+ const char *callout_string,
+ void *aux);
+
+ which are asynchronous equivalents of request_key() and
+ request_key_with_auxdata() respectively.
+
+ These two functions return with the key potentially still under
+ construction. To wait for construction completion, the following should be
+ called:
+
+ int wait_for_key_construction(struct key *key, bool intr);
+
+ The function will wait for the key to finish being constructed and then
+ invoke key_validate() to return an appropriate value to indicate the state
+ of the key (0 indicates the key is usable).
+
+ If intr is true, then the wait can be interrupted by a signal, in which
+ case error ERESTARTSYS will be returned.
+
+
(*) When it is no longer required, the key should be released using:
void key_put(struct key *key);
@@ -924,7 +963,11 @@ DEFINING A KEY TYPE
A kernel service may want to define its own key type. For instance, an AFS
filesystem might want to define a Kerberos 5 ticket key type. To do this, it
-author fills in a struct key_type and registers it with the system.
+author fills in a key_type struct and registers it with the system.
+
+Source files that implement key types should include the following header file:
+
+ <linux/key-type.h>
The structure has a number of fields, some of which are mandatory:
@@ -1053,22 +1096,44 @@ The structure has a number of fields, some of which are mandatory:
as might happen when the userspace buffer is accessed.
- (*) int (*request_key)(struct key *key, struct key *authkey, const char *op,
+ (*) int (*request_key)(struct key_construction *cons, const char *op,
void *aux);
- This method is optional. If provided, request_key() and
- request_key_with_auxdata() will invoke this function rather than
- upcalling to /sbin/request-key to operate upon a key of this type.
+ This method is optional. If provided, request_key() and friends will
+ invoke this function rather than upcalling to /sbin/request-key to operate
+ upon a key of this type.
+
+ The aux parameter is as passed to request_key_async_with_auxdata() and
+ similar or is NULL otherwise. Also passed are the construction record for
+ the key to be operated upon and the operation type (currently only
+ "create").
+
+ This method is permitted to return before the upcall is complete, but the
+ following function must be called under all circumstances to complete the
+ instantiation process, whether or not it succeeds, whether or not there's
+ an error:
+
+ void complete_request_key(struct key_construction *cons, int error);
+
+ The error parameter should be 0 on success, -ve on error. The
+ construction record is destroyed by this action and the authorisation key
+ will be revoked. If an error is indicated, the key under construction
+ will be negatively instantiated if it wasn't already instantiated.
+
+ If this method returns an error, that error will be returned to the
+ caller of request_key*(). complete_request_key() must be called prior to
+ returning.
+
+ The key under construction and the authorisation key can be found in the
+ key_construction struct pointed to by cons:
+
+ (*) struct key *key;
+
+ The key under construction.
- The aux parameter is as passed to request_key_with_auxdata() or is NULL
- otherwise. Also passed are the key to be operated upon, the
- authorisation key for this operation and the operation type (currently
- only "create").
+ (*) struct key *authkey;
- This function should return only when the upcall is complete. Upon return
- the authorisation key will be revoked, and the target key will be
- negatively instantiated if it is still uninstantiated. The error will be
- returned to the caller of request_key*().
+ The authorisation key.
============================
diff --git a/Documentation/lguest/Makefile b/Documentation/lguest/Makefile
index c0b7a455639..bac037eb1cd 100644
--- a/Documentation/lguest/Makefile
+++ b/Documentation/lguest/Makefile
@@ -1,28 +1,8 @@
# This creates the demonstration utility "lguest" which runs a Linux guest.
-
-# For those people that have a separate object dir, look there for .config
-KBUILD_OUTPUT := ../..
-ifdef O
- ifeq ("$(origin O)", "command line")
- KBUILD_OUTPUT := $(O)
- endif
-endif
-# We rely on CONFIG_PAGE_OFFSET to know where to put lguest binary.
-include $(KBUILD_OUTPUT)/.config
-LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000)
-
-CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -Wl,-T,lguest.lds
+CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -I../../include
LDLIBS:=-lz
-# Removing this works for some versions of ld.so (eg. Ubuntu Feisty) and
-# not others (eg. FC7).
-LDFLAGS+=-static
-all: lguest.lds lguest
-# The linker script on x86 is so complex the only way of creating one
-# which will link our binary in the right place is to mangle the
-# default one.
-lguest.lds:
- $(LD) --verbose | awk '/^==========/ { PRINT=1; next; } /SIZEOF_HEADERS/ { gsub(/0x[0-9A-F]*/, "$(LGUEST_GUEST_TOP)") } { if (PRINT) print $$0; }' > $@
+all: lguest
clean:
- rm -f lguest.lds lguest
+ rm -f lguest
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index 103e346c8b6..5bdc37f8184 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -1,10 +1,7 @@
/*P:100 This is the Launcher code, a simple program which lays out the
* "physical" memory for the new Guest by mapping the kernel image and the
* virtual devices, then reads repeatedly from /dev/lguest to run the Guest.
- *
- * The only trick: the Makefile links it at a high address so it will be clear
- * of the guest memory region. It means that each Guest cannot have more than
- * about 2.5G of memory on a normally configured Host. :*/
+:*/
#define _LARGEFILE64_SOURCE
#define _GNU_SOURCE
#include <stdio.h>
@@ -15,6 +12,7 @@
#include <stdlib.h>
#include <elf.h>
#include <sys/mman.h>
+#include <sys/param.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/wait.h>
@@ -34,7 +32,9 @@
#include <termios.h>
#include <getopt.h>
#include <zlib.h>
-/*L:110 We can ignore the 28 include files we need for this program, but I do
+#include <assert.h>
+#include <sched.h>
+/*L:110 We can ignore the 30 include files we need for this program, but I do
* want to draw attention to the use of kernel-style types.
*
* As Linus said, "C is a Spartan language, and so should your naming be." I
@@ -45,8 +45,14 @@ typedef unsigned long long u64;
typedef uint32_t u32;
typedef uint16_t u16;
typedef uint8_t u8;
-#include "../../include/linux/lguest_launcher.h"
-#include "../../include/asm-x86/e820_32.h"
+#include "linux/lguest_launcher.h"
+#include "linux/pci_ids.h"
+#include "linux/virtio_config.h"
+#include "linux/virtio_net.h"
+#include "linux/virtio_blk.h"
+#include "linux/virtio_console.h"
+#include "linux/virtio_ring.h"
+#include "asm-x86/bootparam.h"
/*:*/
#define PAGE_PRESENT 0x7 /* Present, RW, Execute */
@@ -55,6 +61,10 @@ typedef uint8_t u8;
#ifndef SIOCBRADDIF
#define SIOCBRADDIF 0x89a2 /* add interface to bridge */
#endif
+/* We can have up to 256 pages for devices. */
+#define DEVICE_PAGES 256
+/* This fits nicely in a single 4096-byte page. */
+#define VIRTQUEUE_NUM 127
/*L:120 verbose is both a global flag and a macro. The C preprocessor allows
* this, and although I wouldn't recommend it, it works quite nicely here. */
@@ -65,8 +75,10 @@ static bool verbose;
/* The pipe to send commands to the waker process */
static int waker_fd;
-/* The top of guest physical memory. */
-static u32 top;
+/* The pointer to the start of guest memory. */
+static void *guest_base;
+/* The maximum guest physical address allowed, and maximum possible. */
+static unsigned long guest_limit, guest_max;
/* This is our list of devices. */
struct device_list
@@ -76,8 +88,17 @@ struct device_list
fd_set infds;
int max_infd;
+ /* Counter to assign interrupt numbers. */
+ unsigned int next_irq;
+
+ /* Counter to print out convenient device numbers. */
+ unsigned int device_num;
+
/* The descriptor page for the devices. */
- struct lguest_device_desc *descs;
+ u8 *descpage;
+
+ /* The tail of the last descriptor. */
+ unsigned int desc_used;
/* A single linked list of devices. */
struct device *dev;
@@ -85,31 +106,111 @@ struct device_list
struct device **lastdev;
};
+/* The list of Guest devices, based on command line arguments. */
+static struct device_list devices;
+
/* The device structure describes a single device. */
struct device
{
/* The linked-list pointer. */
struct device *next;
- /* The descriptor for this device, as mapped into the Guest. */
+
+ /* This device's descriptor, as mapped into the Guest. */
struct lguest_device_desc *desc;
- /* The memory page(s) of this device, if any. Also mapped in Guest. */
- void *mem;
+
+ /* The name of this device, for --verbose. */
+ const char *name;
/* If handle_input is set, it wants to be called when this file
* descriptor is ready. */
int fd;
bool (*handle_input)(int fd, struct device *me);
- /* If handle_output is set, it wants to be called when the Guest sends
- * DMA to this key. */
- unsigned long watch_key;
- u32 (*handle_output)(int fd, const struct iovec *iov,
- unsigned int num, struct device *me);
+ /* Any queues attached to this device */
+ struct virtqueue *vq;
/* Device-specific data. */
void *priv;
};
+/* The virtqueue structure describes a queue attached to a device. */
+struct virtqueue
+{
+ struct virtqueue *next;
+
+ /* Which device owns me. */
+ struct device *dev;
+
+ /* The configuration for this queue. */
+ struct lguest_vqconfig config;
+
+ /* The actual ring of buffers. */
+ struct vring vring;
+
+ /* Last available index we saw. */
+ u16 last_avail_idx;
+
+ /* The routine to call when the Guest pings us. */
+ void (*handle_output)(int fd, struct virtqueue *me);
+};
+
+/* Since guest is UP and we don't run at the same time, we don't need barriers.
+ * But I include them in the code in case others copy it. */
+#define wmb()
+
+/* Convert an iovec element to the given type.
+ *
+ * This is a fairly ugly trick: we need to know the size of the type and
+ * alignment requirement to check the pointer is kosher. It's also nice to
+ * have the name of the type in case we report failure.
+ *
+ * Typing those three things all the time is cumbersome and error prone, so we
+ * have a macro which sets them all up and passes to the real function. */
+#define convert(iov, type) \
+ ((type *)_convert((iov), sizeof(type), __alignof__(type), #type))
+
+static void *_convert(struct iovec *iov, size_t size, size_t align,
+ const char *name)
+{
+ if (iov->iov_len != size)
+ errx(1, "Bad iovec size %zu for %s", iov->iov_len, name);
+ if ((unsigned long)iov->iov_base % align != 0)
+ errx(1, "Bad alignment %p for %s", iov->iov_base, name);
+ return iov->iov_base;
+}
+
+/* The virtio configuration space is defined to be little-endian. x86 is
+ * little-endian too, but it's nice to be explicit so we have these helpers. */
+#define cpu_to_le16(v16) (v16)
+#define cpu_to_le32(v32) (v32)
+#define cpu_to_le64(v64) (v64)
+#define le16_to_cpu(v16) (v16)
+#define le32_to_cpu(v32) (v32)
+#define le64_to_cpu(v64) (v64)
+
+/*L:100 The Launcher code itself takes us out into userspace, that scary place
+ * where pointers run wild and free! Unfortunately, like most userspace
+ * programs, it's quite boring (which is why everyone likes to hack on the
+ * kernel!). Perhaps if you make up an Lguest Drinking Game at this point, it
+ * will get you through this section. Or, maybe not.
+ *
+ * The Launcher sets up a big chunk of memory to be the Guest's "physical"
+ * memory and stores it in "guest_base". In other words, Guest physical ==
+ * Launcher virtual with an offset.
+ *
+ * This can be tough to get your head around, but usually it just means that we
+ * use these trivial conversion functions when the Guest gives us its
+ * "physical" addresses: */
+static void *from_guest_phys(unsigned long addr)
+{
+ return guest_base + addr;
+}
+
+static unsigned long to_guest_phys(const void *addr)
+{
+ return (addr - guest_base);
+}
+
/*L:130
* Loading the Kernel.
*
@@ -123,43 +224,55 @@ static int open_or_die(const char *name, int flags)
return fd;
}
-/* map_zeroed_pages() takes a (page-aligned) address and a number of pages. */
-static void *map_zeroed_pages(unsigned long addr, unsigned int num)
+/* map_zeroed_pages() takes a number of pages. */
+static void *map_zeroed_pages(unsigned int num)
{
- /* We cache the /dev/zero file-descriptor so we only open it once. */
- static int fd = -1;
-
- if (fd == -1)
- fd = open_or_die("/dev/zero", O_RDONLY);
+ int fd = open_or_die("/dev/zero", O_RDONLY);
+ void *addr;
/* We use a private mapping (ie. if we write to the page, it will be
- * copied), and obviously we insist that it be mapped where we ask. */
- if (mmap((void *)addr, getpagesize() * num,
- PROT_READ|PROT_WRITE|PROT_EXEC, MAP_FIXED|MAP_PRIVATE, fd, 0)
- != (void *)addr)
- err(1, "Mmaping %u pages of /dev/zero @%p", num, (void *)addr);
-
- /* Returning the address is just a courtesy: can simplify callers. */
- return (void *)addr;
+ * copied). */
+ addr = mmap(NULL, getpagesize() * num,
+ PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0);
+ if (addr == MAP_FAILED)
+ err(1, "Mmaping %u pages of /dev/zero", num);
+
+ return addr;
}
-/* To find out where to start we look for the magic Guest string, which marks
- * the code we see in lguest_asm.S. This is a hack which we are currently
- * plotting to replace with the normal Linux entry point. */
-static unsigned long entry_point(void *start, void *end,
- unsigned long page_offset)
+/* Get some more pages for a device. */
+static void *get_pages(unsigned int num)
{
- void *p;
+ void *addr = from_guest_phys(guest_limit);
- /* The scan gives us the physical starting address. We want the
- * virtual address in this case, and fortunately, we already figured
- * out the physical-virtual difference and passed it here in
- * "page_offset". */
- for (p = start; p < end; p++)
- if (memcmp(p, "GenuineLguest", strlen("GenuineLguest")) == 0)
- return (long)p + strlen("GenuineLguest") + page_offset;
+ guest_limit += num * getpagesize();
+ if (guest_limit > guest_max)
+ errx(1, "Not enough memory for devices");
+ return addr;
+}
- err(1, "Is this image a genuine lguest?");
+/* This routine is used to load the kernel or initrd. It tries mmap, but if
+ * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries),
+ * it falls back to reading the memory in. */
+static void map_at(int fd, void *addr, unsigned long offset, unsigned long len)
+{
+ ssize_t r;
+
+ /* We map writable even though some segments are marked read-only.
+ * The kernel really wants to be writable: it patches its own
+ * instructions.
+ *
+ * MAP_PRIVATE means that the page won't be copied until a write is
+ * done to it. This allows us to share untouched memory between
+ * Guests. */
+ if (mmap(addr, len, PROT_READ|PROT_WRITE|PROT_EXEC,
+ MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED)
+ return;
+
+ /* pread does a seek and a read in one shot: saves a few lines. */
+ r = pread(fd, addr, len, offset);
+ if (r != len)
+ err(1, "Reading offset %lu len %lu gave %zi", offset, len, r);
}
/* This routine takes an open vmlinux image, which is in ELF, and maps it into
@@ -167,19 +280,14 @@ static unsigned long entry_point(void *start, void *end,
* by all modern binaries on Linux including the kernel.
*
* The ELF headers give *two* addresses: a physical address, and a virtual
- * address. The Guest kernel expects to be placed in memory at the physical
- * address, and the page tables set up so it will correspond to that virtual
- * address. We return the difference between the virtual and physical
- * addresses in the "page_offset" pointer.
+ * address. We use the physical address; the Guest will map itself to the
+ * virtual address.
*
* We return the starting address. */
-static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
- unsigned long *page_offset)
+static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr)
{
- void *addr;
Elf32_Phdr phdr[ehdr->e_phnum];
unsigned int i;
- unsigned long start = -1UL, end = 0;
/* Sanity checks on the main ELF header: an x86 executable with a
* reasonable number of correctly-sized program headers. */
@@ -199,9 +307,6 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
err(1, "Reading program headers");
- /* We don't know page_offset yet. */
- *page_offset = 0;
-
/* Try all the headers: there are usually only three. A read-only one,
* a read-write one, and a "note" section which isn't loadable. */
for (i = 0; i < ehdr->e_phnum; i++) {
@@ -212,158 +317,53 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
verbose("Section %i: size %i addr %p\n",
i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);
- /* We expect a simple linear address space: every segment must
- * have the same difference between virtual (p_vaddr) and
- * physical (p_paddr) address. */
- if (!*page_offset)
- *page_offset = phdr[i].p_vaddr - phdr[i].p_paddr;
- else if (*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr)
- errx(1, "Page offset of section %i different", i);
-
- /* We track the first and last address we mapped, so we can
- * tell entry_point() where to scan. */
- if (phdr[i].p_paddr < start)
- start = phdr[i].p_paddr;
- if (phdr[i].p_paddr + phdr[i].p_filesz > end)
- end = phdr[i].p_paddr + phdr[i].p_filesz;
-
- /* We map this section of the file at its physical address. We
- * map it read & write even if the header says this segment is
- * read-only. The kernel really wants to be writable: it
- * patches its own instructions which would normally be
- * read-only.
- *
- * MAP_PRIVATE means that the page won't be copied until a
- * write is done to it. This allows us to share much of the
- * kernel memory between Guests. */
- addr = mmap((void *)phdr[i].p_paddr,
- phdr[i].p_filesz,
- PROT_READ|PROT_WRITE|PROT_EXEC,
- MAP_FIXED|MAP_PRIVATE,
- elf_fd, phdr[i].p_offset);
- if (addr != (void *)phdr[i].p_paddr)
- err(1, "Mmaping vmlinux seg %i gave %p not %p",
- i, addr, (void *)phdr[i].p_paddr);
+ /* We map this section of the file at its physical address. */
+ map_at(elf_fd, from_guest_phys(phdr[i].p_paddr),
+ phdr[i].p_offset, phdr[i].p_filesz);
}
- return entry_point((void *)start, (void *)end, *page_offset);
+ /* The entry point is given in the ELF header. */
+ return ehdr->e_entry;
}
-/*L:170 Prepare to be SHOCKED and AMAZED. And possibly a trifle nauseated.
- *
- * We know that CONFIG_PAGE_OFFSET sets what virtual address the kernel expects
- * to be. We don't know what that option was, but we can figure it out
- * approximately by looking at the addresses in the code. I chose the common
- * case of reading a memory location into the %eax register:
- *
- * movl <some-address>, %eax
- *
- * This gets encoded as five bytes: "0xA1 <4-byte-address>". For example,
- * "0xA1 0x18 0x60 0x47 0xC0" reads the address 0xC0476018 into %eax.
- *
- * In this example can guess that the kernel was compiled with
- * CONFIG_PAGE_OFFSET set to 0xC0000000 (it's always a round number). If the
- * kernel were larger than 16MB, we might see 0xC1 addresses show up, but our
- * kernel isn't that bloated yet.
- *
- * Unfortunately, x86 has variable-length instructions, so finding this
- * particular instruction properly involves writing a disassembler. Instead,
- * we rely on statistics. We look for "0xA1" and tally the different bytes
- * which occur 4 bytes later (the "0xC0" in our example above). When one of
- * those bytes appears three times, we can be reasonably confident that it
- * forms the start of CONFIG_PAGE_OFFSET.
+/*L:150 A bzImage, unlike an ELF file, is not meant to be loaded. You're
+ * supposed to jump into it and it will unpack itself. We used to have to
+ * perform some hairy magic because the unpacking code scared me.
*
- * This is amazingly reliable. */
-static unsigned long intuit_page_offset(unsigned char *img, unsigned long len)
+ * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote
+ * a small patch to jump over the tricky bits in the Guest, so now we just read
+ * the funky header so we know where in the file to load, and away we go! */
+static unsigned long load_bzimage(int fd)
{
- unsigned int i, possibilities[256] = { 0 };
+ struct boot_params boot;
+ int r;
+ /* Modern bzImages get loaded at 1M. */
+ void *p = from_guest_phys(0x100000);
- for (i = 0; i + 4 < len; i++) {
- /* mov 0xXXXXXXXX,%eax */
- if (img[i] == 0xA1 && ++possibilities[img[i+4]] > 3)
- return (unsigned long)img[i+4] << 24;
- }
- errx(1, "could not determine page offset");
-}
+ /* Go back to the start of the file and read the header. It should be
+ * a Linux boot header (see Documentation/i386/boot.txt) */
+ lseek(fd, 0, SEEK_SET);
+ read(fd, &boot, sizeof(boot));
-/*L:160 Unfortunately the entire ELF image isn't compressed: the segments
- * which need loading are extracted and compressed raw. This denies us the
- * information we need to make a fully-general loader. */
-static unsigned long unpack_bzimage(int fd, unsigned long *page_offset)
-{
- gzFile f;
- int ret, len = 0;
- /* A bzImage always gets loaded at physical address 1M. This is
- * actually configurable as CONFIG_PHYSICAL_START, but as the comment
- * there says, "Don't change this unless you know what you are doing".
- * Indeed. */
- void *img = (void *)0x100000;
-
- /* gzdopen takes our file descriptor (carefully placed at the start of
- * the GZIP header we found) and returns a gzFile. */
- f = gzdopen(fd, "rb");
- /* We read it into memory in 64k chunks until we hit the end. */
- while ((ret = gzread(f, img + len, 65536)) > 0)
- len += ret;
- if (ret < 0)
- err(1, "reading image from bzImage");
-
- verbose("Unpacked size %i addr %p\n", len, img);
-
- /* Without the ELF header, we can't tell virtual-physical gap. This is
- * CONFIG_PAGE_OFFSET, and people do actually change it. Fortunately,
- * I have a clever way of figuring it out from the code itself. */
- *page_offset = intuit_page_offset(img, len);
-
- return entry_point(img, img + len, *page_offset);
-}
+ /* Inside the setup_hdr, we expect the magic "HdrS" */
+ if (memcmp(&boot.hdr.header, "HdrS", 4) != 0)
+ errx(1, "This doesn't look like a bzImage to me");
-/*L:150 A bzImage, unlike an ELF file, is not meant to be loaded. You're
- * supposed to jump into it and it will unpack itself. We can't do that
- * because the Guest can't run the unpacking code, and adding features to
- * lguest kills puppies, so we don't want to.
- *
- * The bzImage is formed by putting the decompressing code in front of the
- * compressed kernel code. So we can simple scan through it looking for the
- * first "gzip" header, and start decompressing from there. */
-static unsigned long load_bzimage(int fd, unsigned long *page_offset)
-{
- unsigned char c;
- int state = 0;
-
- /* GZIP header is 0x1F 0x8B <method> <flags>... <compressed-by>. */
- while (read(fd, &c, 1) == 1) {
- switch (state) {
- case 0:
- if (c == 0x1F)
- state++;
- break;
- case 1:
- if (c == 0x8B)
- state++;
- else
- state = 0;
- break;
- case 2 ... 8:
- state++;
- break;
- case 9:
- /* Seek back to the start of the gzip header. */
- lseek(fd, -10, SEEK_CUR);
- /* One final check: "compressed under UNIX". */
- if (c != 0x03)
- state = -1;
- else
- return unpack_bzimage(fd, page_offset);
- }
- }
- errx(1, "Could not find kernel in bzImage");
+ /* Skip over the extra sectors of the header. */
+ lseek(fd, (boot.hdr.setup_sects+1) * 512, SEEK_SET);
+
+ /* Now read everything into memory, in nice big chunks. */
+ while ((r = read(fd, p, 65536)) > 0)
+ p += r;
+
+ /* Finally, code32_start tells us where to enter the kernel. */
+ return boot.hdr.code32_start;
}
/*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels
* come wrapped up in the self-decompressing "bzImage" format. With some funky
* coding, we can load those, too. */
-static unsigned long load_kernel(int fd, unsigned long *page_offset)
+static unsigned long load_kernel(int fd)
{
Elf32_Ehdr hdr;
@@ -373,10 +373,10 @@ static unsigned long load_kernel(int fd, unsigned long *page_offset)
/* If it's an ELF file, it starts with "\177ELF" */
if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
- return map_elf(fd, &hdr, page_offset);
+ return map_elf(fd, &hdr);
/* Otherwise we assume it's a bzImage, and try to unpack it */
- return load_bzimage(fd, page_offset);
+ return load_bzimage(fd);
}
/* This is a trivial little helper to align pages. Andi Kleen hated it because
@@ -402,59 +402,45 @@ static unsigned long load_initrd(const char *name, unsigned long mem)
int ifd;
struct stat st;
unsigned long len;
- void *iaddr;
ifd = open_or_die(name, O_RDONLY);
/* fstat() is needed to get the file size. */
if (fstat(ifd, &st) < 0)
err(1, "fstat() on initrd '%s'", name);
- /* The length needs to be rounded up to a page size: mmap needs the
- * address to be page aligned. */
+ /* We map the initrd at the top of memory, but mmap wants it to be
+ * page-aligned, so we round the size up for that. */
len = page_align(st.st_size);
- /* We map the initrd at the top of memory. */
- iaddr = mmap((void *)mem - len, st.st_size,
- PROT_READ|PROT_EXEC|PROT_WRITE,
- MAP_FIXED|MAP_PRIVATE, ifd, 0);
- if (iaddr != (void *)mem - len)
- err(1, "Mmaping initrd '%s' returned %p not %p",
- name, iaddr, (void *)mem - len);
+ map_at(ifd, from_guest_phys(mem - len), 0, st.st_size);
/* Once a file is mapped, you can close the file descriptor. It's a
* little odd, but quite useful. */
close(ifd);
- verbose("mapped initrd %s size=%lu @ %p\n", name, st.st_size, iaddr);
+ verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void*)mem-len);
/* We return the initrd size. */
return len;
}
-/* Once we know how much memory we have, and the address the Guest kernel
- * expects, we can construct simple linear page tables which will get the Guest
- * far enough into the boot to create its own.
+/* Once we know how much memory we have, we can construct simple linear page
+ * tables which set virtual == physical which will get the Guest far enough
+ * into the boot to create its own.
*
* We lay them out of the way, just below the initrd (which is why we need to
* know its size). */
static unsigned long setup_pagetables(unsigned long mem,
- unsigned long initrd_size,
- unsigned long page_offset)
+ unsigned long initrd_size)
{
- u32 *pgdir, *linear;
+ unsigned long *pgdir, *linear;
unsigned int mapped_pages, i, linear_pages;
- unsigned int ptes_per_page = getpagesize()/sizeof(u32);
+ unsigned int ptes_per_page = getpagesize()/sizeof(void *);
- /* Ideally we map all physical memory starting at page_offset.
- * However, if page_offset is 0xC0000000 we can only map 1G of physical
- * (0xC0000000 + 1G overflows). */
- if (mem <= -page_offset)
- mapped_pages = mem/getpagesize();
- else
- mapped_pages = -page_offset/getpagesize();
+ mapped_pages = mem/getpagesize();
/* Each PTE page can map ptes_per_page pages: how many do we need? */
linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page;
/* We put the toplevel page directory page at the top of memory. */
- pgdir = (void *)mem - initrd_size - getpagesize();
+ pgdir = from_guest_phys(mem) - initrd_size - getpagesize();
/* Now we use the next linear_pages pages as pte pages */
linear = (void *)pgdir - linear_pages*getpagesize();
@@ -465,20 +451,19 @@ static unsigned long setup_pagetables(unsigned long mem,
for (i = 0; i < mapped_pages; i++)
linear[i] = ((i * getpagesize()) | PAGE_PRESENT);
- /* The top level points to the linear page table pages above. The
- * entry representing page_offset points to the first one, and they
- * continue from there. */
+ /* The top level points to the linear page table pages above. */
for (i = 0; i < mapped_pages; i += ptes_per_page) {
- pgdir[(i + page_offset/getpagesize())/ptes_per_page]
- = (((u32)linear + i*sizeof(u32)) | PAGE_PRESENT);
+ pgdir[i/ptes_per_page]
+ = ((to_guest_phys(linear) + i*sizeof(void *))
+ | PAGE_PRESENT);
}
- verbose("Linear mapping of %u pages in %u pte pages at %p\n",
- mapped_pages, linear_pages, linear);
+ verbose("Linear mapping of %u pages in %u pte pages at %#lx\n",
+ mapped_pages, linear_pages, to_guest_phys(linear));
/* We return the top level (guest-physical) address: the kernel needs
* to know where it is. */
- return (unsigned long)pgdir;
+ return to_guest_phys(pgdir);
}
/* Simple routine to roll all the commandline arguments together with spaces
@@ -498,14 +483,17 @@ static void concat(char *dst, char *args[])
/* This is where we actually tell the kernel to initialize the Guest. We saw
* the arguments it expects when we looked at initialize() in lguest_user.c:
- * the top physical page to allow, the top level pagetable, the entry point and
- * the page_offset constant for the Guest. */
-static int tell_kernel(u32 pgdir, u32 start, u32 page_offset)
+ * the base of guest "physical" memory, the top physical page to allow, the
+ * top level pagetable and the entry point for the Guest. */
+static int tell_kernel(unsigned long pgdir, unsigned long start)
{
- u32 args[] = { LHREQ_INITIALIZE,
- top/getpagesize(), pgdir, start, page_offset };
+ unsigned long args[] = { LHREQ_INITIALIZE,
+ (unsigned long)guest_base,
+ guest_limit / getpagesize(), pgdir, start };
int fd;
+ verbose("Guest: %p - %p (%#lx)\n",
+ guest_base, guest_base + guest_limit, guest_limit);
fd = open_or_die("/dev/lguest", O_RDWR);
if (write(fd, args, sizeof(args)) < 0)
err(1, "Writing to /dev/lguest");
@@ -515,11 +503,11 @@ static int tell_kernel(u32 pgdir, u32 start, u32 page_offset)
}
/*:*/
-static void set_fd(int fd, struct device_list *devices)
+static void add_device_fd(int fd)
{
- FD_SET(fd, &devices->infds);
- if (fd > devices->max_infd)
- devices->max_infd = fd;
+ FD_SET(fd, &devices.infds);
+ if (fd > devices.max_infd)
+ devices.max_infd = fd;
}
/*L:200
@@ -537,36 +525,38 @@ static void set_fd(int fd, struct device_list *devices)
*
* This, of course, is merely a different *kind* of icky.
*/
-static void wake_parent(int pipefd, int lguest_fd, struct device_list *devices)
+static void wake_parent(int pipefd, int lguest_fd)
{
/* Add the pipe from the Launcher to the fdset in the device_list, so
* we watch it, too. */
- set_fd(pipefd, devices);
+ add_device_fd(pipefd);
for (;;) {
- fd_set rfds = devices->infds;
- u32 args[] = { LHREQ_BREAK, 1 };
+ fd_set rfds = devices.infds;
+ unsigned long args[] = { LHREQ_BREAK, 1 };
/* Wait until input is ready from one of the devices. */
- select(devices->max_infd+1, &rfds, NULL, NULL, NULL);
+ select(devices.max_infd+1, &rfds, NULL, NULL, NULL);
/* Is it a message from the Launcher? */
if (FD_ISSET(pipefd, &rfds)) {
- int ignorefd;
+ int fd;
/* If read() returns 0, it means the Launcher has
* exited. We silently follow. */
- if (read(pipefd, &ignorefd, sizeof(ignorefd)) == 0)
+ if (read(pipefd, &fd, sizeof(fd)) == 0)
exit(0);
- /* Otherwise it's telling us there's a problem with one
- * of the devices, and we should ignore that file
- * descriptor from now on. */
- FD_CLR(ignorefd, &devices->infds);
+ /* Otherwise it's telling us to change what file
+ * descriptors we're to listen to. */
+ if (fd >= 0)
+ FD_SET(fd, &devices.infds);
+ else
+ FD_CLR(-fd - 1, &devices.infds);
} else /* Send LHREQ_BREAK command. */
write(lguest_fd, args, sizeof(args));
}
}
/* This routine just sets up a pipe to the Waker process. */
-static int setup_waker(int lguest_fd, struct device_list *device_list)
+static int setup_waker(int lguest_fd)
{
int pipefd[2], child;
@@ -580,7 +570,7 @@ static int setup_waker(int lguest_fd, struct device_list *device_list)
if (child == 0) {
/* Close the "writing" end of our copy of the pipe */
close(pipefd[1]);
- wake_parent(pipefd[0], lguest_fd, device_list);
+ wake_parent(pipefd[0], lguest_fd);
}
/* Close the reading end of our copy of the pipe. */
close(pipefd[0]);
@@ -602,83 +592,128 @@ static void *_check_pointer(unsigned long addr, unsigned int size,
{
/* We have to separately check addr and addr+size, because size could
* be huge and addr + size might wrap around. */
- if (addr >= top || addr + size >= top)
- errx(1, "%s:%i: Invalid address %li", __FILE__, line, addr);
+ if (addr >= guest_limit || addr + size >= guest_limit)
+ errx(1, "%s:%i: Invalid address %#lx", __FILE__, line, addr);
/* We return a pointer for the caller's convenience, now we know it's
* safe to use. */
- return (void *)addr;
+ return from_guest_phys(addr);
}
/* A macro which transparently hands the line number to the real function. */
#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)
-/* The Guest has given us the address of a "struct lguest_dma". We check it's
- * OK and convert it to an iovec (which is a simple array of ptr/size
- * pairs). */
-static u32 *dma2iov(unsigned long dma, struct iovec iov[], unsigned *num)
+/* This function returns the next descriptor in the chain, or vq->vring.num. */
+static unsigned next_desc(struct virtqueue *vq, unsigned int i)
{
- unsigned int i;
- struct lguest_dma *udma;
-
- /* First we make sure that the array memory itself is valid. */
- udma = check_pointer(dma, sizeof(*udma));
- /* Now we check each element */
- for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) {
- /* A zero length ends the array. */
- if (!udma->len[i])
- break;
+ unsigned int next;
- iov[i].iov_base = check_pointer(udma->addr[i], udma->len[i]);
- iov[i].iov_len = udma->len[i];
- }
- *num = i;
+ /* If this descriptor says it doesn't chain, we're done. */
+ if (!(vq->vring.desc[i].flags & VRING_DESC_F_NEXT))
+ return vq->vring.num;
+
+ /* Check they're not leading us off end of descriptors. */
+ next = vq->vring.desc[i].next;
+ /* Make sure compiler knows to grab that: we don't want it changing! */
+ wmb();
- /* We return the pointer to where the caller should write the amount of
- * the buffer used. */
- return &udma->used_len;
+ if (next >= vq->vring.num)
+ errx(1, "Desc next is %u", next);
+
+ return next;
+}
+
+/* This looks in the virtqueue for the first available buffer, and converts
+ * it to an iovec for convenient access. Since descriptors consist of some
+ * number of output then some number of input descriptors, it's actually two
+ * iovecs, but we pack them into one and note how many of each there were.
+ *
+ * This function returns the descriptor number found, or vq->vring.num (which
+ * is never a valid descriptor number) if none was found. */
+static unsigned get_vq_desc(struct virtqueue *vq,
+ struct iovec iov[],
+ unsigned int *out_num, unsigned int *in_num)
+{
+ unsigned int i, head;
+
+ /* Check it isn't doing very strange things with descriptor numbers. */
+ if ((u16)(vq->vring.avail->idx - vq->last_avail_idx) > vq->vring.num)
+ errx(1, "Guest moved used index from %u to %u",
+ vq->last_avail_idx, vq->vring.avail->idx);
+
+ /* If there's nothing new since last we looked, return invalid. */
+ if (vq->vring.avail->idx == vq->last_avail_idx)
+ return vq->vring.num;
+
+ /* Grab the next descriptor number they're advertising, and increment
+ * the index we've seen. */
+ head = vq->vring.avail->ring[vq->last_avail_idx++ % vq->vring.num];
+
+ /* If their number is silly, that's a fatal mistake. */
+ if (head >= vq->vring.num)
+ errx(1, "Guest says index %u is available", head);
+
+ /* When we start there are no input or output descriptors. */
+ *out_num = *in_num = 0;
+
+ i = head;
+ do {
+ /* Grab the first descriptor, and check it's OK. */
+ iov[*out_num + *in_num].iov_len = vq->vring.desc[i].len;
+ iov[*out_num + *in_num].iov_base
+ = check_pointer(vq->vring.desc[i].addr,
+ vq->vring.desc[i].len);
+ /* If this is an input descriptor, increment that count. */
+ if (vq->vring.desc[i].flags & VRING_DESC_F_WRITE)
+ (*in_num)++;
+ else {
+ /* If it's an output descriptor, they're all supposed
+ * to come before any input descriptors. */
+ if (*in_num)
+ errx(1, "Descriptor has out after in");
+ (*out_num)++;
+ }
+
+ /* If we've got too many, that implies a descriptor loop. */
+ if (*out_num + *in_num > vq->vring.num)
+ errx(1, "Looped descriptor");
+ } while ((i = next_desc(vq, i)) != vq->vring.num);
+
+ return head;
}
-/* This routine gets a DMA buffer from the Guest for a given key, and converts
- * it to an iovec array. It returns the interrupt the Guest wants when we're
- * finished, and a pointer to the "used_len" field to fill in. */
-static u32 *get_dma_buffer(int fd, void *key,
- struct iovec iov[], unsigned int *num, u32 *irq)
+/* Once we've used one of their buffers, we tell them about it. We'll then
+ * want to send them an interrupt, using trigger_irq(). */
+static void add_used(struct virtqueue *vq, unsigned int head, int len)
{
- u32 buf[] = { LHREQ_GETDMA, (u32)key };
- unsigned long udma;
- u32 *res;
-
- /* Ask the kernel for a DMA buffer corresponding to this key. */
- udma = write(fd, buf, sizeof(buf));
- /* They haven't registered any, or they're all used? */
- if (udma == (unsigned long)-1)
- return NULL;
-
- /* Convert it into our iovec array */
- res = dma2iov(udma, iov, num);
- /* The kernel stashes irq in ->used_len to get it out to us. */
- *irq = *res;
- /* Return a pointer to ((struct lguest_dma *)udma)->used_len. */
- return res;
+ struct vring_used_elem *used;
+
+ /* Get a pointer to the next entry in the used ring. */
+ used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num];
+ used->id = head;
+ used->len = len;
+ /* Make sure buffer is written before we update index. */
+ wmb();
+ vq->vring.used->idx++;
}
-/* This is a convenient routine to send the Guest an interrupt. */
-static void trigger_irq(int fd, u32 irq)
+/* This actually sends the interrupt for this virtqueue */
+static void trigger_irq(int fd, struct virtqueue *vq)
{
- u32 buf[] = { LHREQ_IRQ, irq };
+ unsigned long buf[] = { LHREQ_IRQ, vq->config.irq };
+
+ if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
+ return;
+
+ /* Send the Guest an interrupt to tell them we used something up. */
if (write(fd, buf, sizeof(buf)) != 0)
- err(1, "Triggering irq %i", irq);
+ err(1, "Triggering irq %i", vq->config.irq);
}
-/* This simply sets up an iovec array where we can put data to be discarded.
- * This happens when the Guest doesn't want or can't handle the input: we have
- * to get rid of it somewhere, and if we bury it in the ceiling space it will
- * start to smell after a week. */
-static void discard_iovec(struct iovec *iov, unsigned int *num)
+/* And here's the combo meal deal. Supersize me! */
+static void add_used_and_trigger(int fd, struct virtqueue *vq,
+ unsigned int head, int len)
{
- static char discard_buf[1024];
- *num = 1;
- iov->iov_base = discard_buf;
- iov->iov_len = sizeof(discard_buf);
+ add_used(vq, head, len);
+ trigger_irq(fd, vq);
}
/* Here is the input terminal setting we save, and the routine to restore them
@@ -701,38 +736,39 @@ struct console_abort
/* This is the routine which handles console input (ie. stdin). */
static bool handle_console_input(int fd, struct device *dev)
{
- u32 irq = 0, *lenp;
int len;
- unsigned int num;
- struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
+ unsigned int head, in_num, out_num;
+ struct iovec iov[dev->vq->vring.num];
struct console_abort *abort = dev->priv;
- /* First we get the console buffer from the Guest. The key is dev->mem
- * which was set to 0 in setup_console(). */
- lenp = get_dma_buffer(fd, dev->mem, iov, &num, &irq);
- if (!lenp) {
- /* If it's not ready for input, warn and set up to discard. */
- warn("console: no dma buffer!");
- discard_iovec(iov, &num);
- }
+ /* First we need a console buffer from the Guest's input virtqueue. */
+ head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
+
+ /* If they're not ready for input, stop listening to this file
+ * descriptor. We'll start again once they add an input buffer. */
+ if (head == dev->vq->vring.num)
+ return false;
+
+ if (out_num)
+ errx(1, "Output buffers in console in queue?");
/* This is why we convert to iovecs: the readv() call uses them, and so
* it reads straight into the Guest's buffer. */
- len = readv(dev->fd, iov, num);
+ len = readv(dev->fd, iov, in_num);
if (len <= 0) {
/* This implies that the console is closed, is /dev/null, or
- * something went terribly wrong. We still go through the rest
- * of the logic, though, especially the exit handling below. */
+ * something went terribly wrong. */
warnx("Failed to get console input, ignoring console.");
- len = 0;
+ /* Put the input terminal back. */
+ restore_term();
+ /* Remove callback from input vq, so it doesn't restart us. */
+ dev->vq->handle_output = NULL;
+ /* Stop listening to this fd: don't call us again. */
+ return false;
}
- /* If we read the data into the Guest, fill in the length and send the
- * interrupt. */
- if (lenp) {
- *lenp = len;
- trigger_irq(fd, irq);
- }
+ /* Tell the Guest about the new input. */
+ add_used_and_trigger(fd, dev->vq, head, len);
/* Three ^C within one second? Exit.
*
@@ -746,7 +782,7 @@ static bool handle_console_input(int fd, struct device *dev)
struct timeval now;
gettimeofday(&now, NULL);
if (now.tv_sec <= abort->start.tv_sec+1) {
- u32 args[] = { LHREQ_BREAK, 0 };
+ unsigned long args[] = { LHREQ_BREAK, 0 };
/* Close the fd so Waker will know it has to
* exit. */
close(waker_fd);
@@ -761,214 +797,163 @@ static bool handle_console_input(int fd, struct device *dev)
/* Any other key resets the abort counter. */
abort->count = 0;
- /* Now, if we didn't read anything, put the input terminal back and
- * return failure (meaning, don't call us again). */
- if (!len) {
- restore_term();
- return false;
- }
/* Everything went OK! */
return true;
}
-/* Handling console output is much simpler than input. */
-static u32 handle_console_output(int fd, const struct iovec *iov,
- unsigned num, struct device*dev)
+/* Handling output for console is simple: we just get all the output buffers
+ * and write them to stdout. */
+static void handle_console_output(int fd, struct virtqueue *vq)
{
- /* Whatever the Guest sends, write it to standard output. Return the
- * number of bytes written. */
- return writev(STDOUT_FILENO, iov, num);
-}
-
-/* Guest->Host network output is also pretty easy. */
-static u32 handle_tun_output(int fd, const struct iovec *iov,
- unsigned num, struct device *dev)
-{
- /* We put a flag in the "priv" pointer of the network device, and set
- * it as soon as we see output. We'll see why in handle_tun_input() */
- *(bool *)dev->priv = true;
- /* Whatever packet the Guest sent us, write it out to the tun
- * device. */
- return writev(dev->fd, iov, num);
+ unsigned int head, out, in;
+ int len;
+ struct iovec iov[vq->vring.num];
+
+ /* Keep getting output buffers from the Guest until we run out. */
+ while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) {
+ if (in)
+ errx(1, "Input buffers in output queue?");
+ len = writev(STDOUT_FILENO, iov, out);
+ add_used_and_trigger(fd, vq, head, len);
+ }
}
-/* This matches the peer_key() in lguest_net.c. The key for any given slot
- * is the address of the network device's page plus 4 * the slot number. */
-static unsigned long peer_offset(unsigned int peernum)
+/* Handling output for network is also simple: we get all the output buffers
+ * and write them (ignoring the first element) to this device's file descriptor
+ * (the tun device). */
+static void handle_net_output(int fd, struct virtqueue *vq)
{
- return 4 * peernum;
+ unsigned int head, out, in;
+ int len;
+ struct iovec iov[vq->vring.num];
+
+ /* Keep getting output buffers from the Guest until we run out. */
+ while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) {
+ if (in)
+ errx(1, "Input buffers in output queue?");
+ /* Check header, but otherwise ignore it (we said we supported
+ * no features). */
+ (void)convert(&iov[0], struct virtio_net_hdr);
+ len = writev(vq->dev->fd, iov+1, out-1);
+ add_used_and_trigger(fd, vq, head, len);
+ }
}
-/* This is where we handle a packet coming in from the tun device */
+/* This is where we handle a packet coming in from the tun device to our
+ * Guest. */
static bool handle_tun_input(int fd, struct device *dev)
{
- u32 irq = 0, *lenp;
+ unsigned int head, in_num, out_num;
int len;
- unsigned num;
- struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
+ struct iovec iov[dev->vq->vring.num];
+ struct virtio_net_hdr *hdr;
- /* First we get a buffer the Guest has bound to its key. */
- lenp = get_dma_buffer(fd, dev->mem+peer_offset(NET_PEERNUM), iov, &num,
- &irq);
- if (!lenp) {
+ /* First we need a network buffer from the Guest's recv virtqueue. */
+ head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
+ if (head == dev->vq->vring.num) {
/* Now, it's expected that if we try to send a packet too
- * early, the Guest won't be ready yet. This is why we set a
- * flag when the Guest sends its first packet. If it's sent a
- * packet we assume it should be ready to receive them.
- *
- * Actually, this is what the status bits in the descriptor are
- * for: we should *use* them. FIXME! */
- if (*(bool *)dev->priv)
+ * early, the Guest won't be ready yet. Wait until the device
+ * status says it's ready. */
+ /* FIXME: Actually want DRIVER_ACTIVE here. */
+ if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK)
warn("network: no dma buffer!");
- discard_iovec(iov, &num);
- }
+ /* We'll turn this back on if input buffers are registered. */
+ return false;
+ } else if (out_num)
+ errx(1, "Output buffers in network recv queue?");
+
+ /* First element is the header: we set it to 0 (no features). */
+ hdr = convert(&iov[0], struct virtio_net_hdr);
+ hdr->flags = 0;
+ hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;
/* Read the packet from the device directly into the Guest's buffer. */
- len = readv(dev->fd, iov, num);
+ len = readv(dev->fd, iov+1, in_num-1);
if (len <= 0)
err(1, "reading network");
- /* Write the used_len, and trigger the interrupt for the Guest */
- if (lenp) {
- *lenp = len;
- trigger_irq(fd, irq);
- }
+ /* Tell the Guest about the new packet. */
+ add_used_and_trigger(fd, dev->vq, head, sizeof(*hdr) + len);
+
verbose("tun input packet len %i [%02x %02x] (%s)\n", len,
- ((u8 *)iov[0].iov_base)[0], ((u8 *)iov[0].iov_base)[1],
- lenp ? "sent" : "discarded");
+ ((u8 *)iov[1].iov_base)[0], ((u8 *)iov[1].iov_base)[1],
+ head != dev->vq->vring.num ? "sent" : "discarded");
+
/* All good. */
return true;
}
-/* The last device handling routine is block output: the Guest has sent a DMA
- * to the block device. It will have placed the command it wants in the
- * "struct lguest_block_page". */
-static u32 handle_block_output(int fd, const struct iovec *iov,
- unsigned num, struct device *dev)
+/* This callback ensures we try again, in case we stopped console or net
+ * delivery because Guest didn't have any buffers. */
+static void enable_fd(int fd, struct virtqueue *vq)
{
- struct lguest_block_page *p = dev->mem;
- u32 irq, *lenp;
- unsigned int len, reply_num;
- struct iovec reply[LGUEST_MAX_DMA_SECTIONS];
- off64_t device_len, off = (off64_t)p->sector * 512;
-
- /* First we extract the device length from the dev->priv pointer. */
- device_len = *(off64_t *)dev->priv;
-
- /* We first check that the read or write is within the length of the
- * block file. */
- if (off >= device_len)
- err(1, "Bad offset %llu vs %llu", off, device_len);
- /* Move to the right location in the block file. This shouldn't fail,
- * but best to check. */
- if (lseek64(dev->fd, off, SEEK_SET) != off)
- err(1, "Bad seek to sector %i", p->sector);
-
- verbose("Block: %s at offset %llu\n", p->type ? "WRITE" : "READ", off);
-
- /* They were supposed to bind a reply buffer at key equal to the start
- * of the block device memory. We need this to tell them when the
- * request is finished. */
- lenp = get_dma_buffer(fd, dev->mem, reply, &reply_num, &irq);
- if (!lenp)
- err(1, "Block request didn't give us a dma buffer");
-
- if (p->type) {
- /* A write request. The DMA they sent contained the data, so
- * write it out. */
- len = writev(dev->fd, iov, num);
- /* Grr... Now we know how long the "struct lguest_dma" they
- * sent was, we make sure they didn't try to write over the end
- * of the block file (possibly extending it). */
- if (off + len > device_len) {
- /* Trim it back to the correct length */
- ftruncate64(dev->fd, device_len);
- /* Die, bad Guest, die. */
- errx(1, "Write past end %llu+%u", off, len);
- }
- /* The reply length is 0: we just send back an empty DMA to
- * interrupt them and tell them the write is finished. */
- *lenp = 0;
- } else {
- /* A read request. They sent an empty DMA to start the
- * request, and we put the read contents into the reply
- * buffer. */
- len = readv(dev->fd, reply, reply_num);
- *lenp = len;
- }
-
- /* The result is 1 (done), 2 if there was an error (short read or
- * write). */
- p->result = 1 + (p->bytes != len);
- /* Now tell them we've used their reply buffer. */
- trigger_irq(fd, irq);
-
- /* We're supposed to return the number of bytes of the output buffer we
- * used. But the block device uses the "result" field instead, so we
- * don't bother. */
- return 0;
+ add_device_fd(vq->dev->fd);
+ /* Tell waker to listen to it again */
+ write(waker_fd, &vq->dev->fd, sizeof(vq->dev->fd));
}
-/* This is the generic routine we call when the Guest sends some DMA out. */
-static void handle_output(int fd, unsigned long dma, unsigned long key,
- struct device_list *devices)
+/* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */
+static void handle_output(int fd, unsigned long addr)
{
struct device *i;
- u32 *lenp;
- struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
- unsigned num = 0;
-
- /* Convert the "struct lguest_dma" they're sending to a "struct
- * iovec". */
- lenp = dma2iov(dma, iov, &num);
-
- /* Check each device: if they expect output to this key, tell them to
- * handle it. */
- for (i = devices->dev; i; i = i->next) {
- if (i->handle_output && key == i->watch_key) {
- /* We write the result straight into the used_len field
- * for them. */
- *lenp = i->handle_output(fd, iov, num, i);
- return;
+ struct virtqueue *vq;
+
+ /* Check each virtqueue. */
+ for (i = devices.dev; i; i = i->next) {
+ for (vq = i->vq; vq; vq = vq->next) {
+ if (vq->config.pfn == addr/getpagesize()
+ && vq->handle_output) {
+ verbose("Output to %s\n", vq->dev->name);
+ vq->handle_output(fd, vq);
+ return;
+ }
}
}
- /* This can happen: the kernel sends any SEND_DMA which doesn't match
- * another Guest to us. It could be that another Guest just left a
- * network, for example. But it's unusual. */
- warnx("Pending dma %p, key %p", (void *)dma, (void *)key);
+ /* Early console write is done using notify on a nul-terminated string
+ * in Guest memory. */
+ if (addr >= guest_limit)
+ errx(1, "Bad NOTIFY %#lx", addr);
+
+ write(STDOUT_FILENO, from_guest_phys(addr),
+ strnlen(from_guest_phys(addr), guest_limit - addr));
}
/* This is called when the waker wakes us up: check for incoming file
* descriptors. */
-static void handle_input(int fd, struct device_list *devices)
+static void handle_input(int fd)
{
/* select() wants a zeroed timeval to mean "don't wait". */
struct timeval poll = { .tv_sec = 0, .tv_usec = 0 };
for (;;) {
struct device *i;
- fd_set fds = devices->infds;
+ fd_set fds = devices.infds;
/* If nothing is ready, we're done. */
- if (select(devices->max_infd+1, &fds, NULL, NULL, &poll) == 0)
+ if (select(devices.max_infd+1, &fds, NULL, NULL, &poll) == 0)
break;
/* Otherwise, call the device(s) which have readable
* file descriptors and a method of handling them. */
- for (i = devices->dev; i; i = i->next) {
+ for (i = devices.dev; i; i = i->next) {
if (i->handle_input && FD_ISSET(i->fd, &fds)) {
+ int dev_fd;
+ if (i->handle_input(fd, i))
+ continue;
+
/* If handle_input() returns false, it means we
- * should no longer service it.
- * handle_console_input() does this. */
- if (!i->handle_input(fd, i)) {
- /* Clear it from the set of input file
- * descriptors kept at the head of the
- * device list. */
- FD_CLR(i->fd, &devices->infds);
- /* Tell waker to ignore it too... */
- write(waker_fd, &i->fd, sizeof(i->fd));
- }
+ * should no longer service it. Networking and
+ * console do this when there's no input
+ * buffers to deliver into. Console also uses
+ * it when it discovers that stdin is
+ * closed. */
+ FD_CLR(i->fd, &devices.infds);
+ /* Tell waker to ignore it too, by sending a
+ * negative fd number (-1, since 0 is a valid
+ * FD number). */
+ dev_fd = -i->fd - 1;
+ write(waker_fd, &dev_fd, sizeof(dev_fd));
}
}
}
@@ -982,43 +967,93 @@ static void handle_input(int fd, struct device_list *devices)
* routines to allocate them.
*
* This routine allocates a new "struct lguest_device_desc" from descriptor
- * table in the devices array just above the Guest's normal memory. */
-static struct lguest_device_desc *
-new_dev_desc(struct lguest_device_desc *descs,
- u16 type, u16 features, u16 num_pages)
+ * table just above the Guest's normal memory. It returns a pointer to that
+ * descriptor. */
+static struct lguest_device_desc *new_dev_desc(u16 type)
{
- unsigned int i;
+ struct lguest_device_desc *d;
- for (i = 0; i < LGUEST_MAX_DEVICES; i++) {
- if (!descs[i].type) {
- descs[i].type = type;
- descs[i].features = features;
- descs[i].num_pages = num_pages;
- /* If they said the device needs memory, we allocate
- * that now, bumping up the top of Guest memory. */
- if (num_pages) {
- map_zeroed_pages(top, num_pages);
- descs[i].pfn = top/getpagesize();
- top += num_pages*getpagesize();
- }
- return &descs[i];
- }
- }
- errx(1, "too many devices");
+ /* We only have one page for all the descriptors. */
+ if (devices.desc_used + sizeof(*d) > getpagesize())
+ errx(1, "Too many devices");
+
+ /* We don't need to set config_len or status: page is 0 already. */
+ d = (void *)devices.descpage + devices.desc_used;
+ d->type = type;
+ devices.desc_used += sizeof(*d);
+
+ return d;
}
-/* This monster routine does all the creation and setup of a new device,
- * including caling new_dev_desc() to allocate the descriptor and device
- * memory. */
-static struct device *new_device(struct device_list *devices,
- u16 type, u16 num_pages, u16 features,
- int fd,
- bool (*handle_input)(int, struct device *),
- unsigned long watch_off,
- u32 (*handle_output)(int,
- const struct iovec *,
- unsigned,
- struct device *))
+/* Each device descriptor is followed by some configuration information.
+ * The first byte is a "status" byte for the Guest to report what's happening.
+ * After that are fields: u8 type, u8 len, [... len bytes...].
+ *
+ * This routine adds a new field to an existing device's descriptor. It only
+ * works for the last device, but that's OK because that's how we use it. */
+static void add_desc_field(struct device *dev, u8 type, u8 len, const void *c)
+{
+ /* This is the last descriptor, right? */
+ assert(devices.descpage + devices.desc_used
+ == (u8 *)(dev->desc + 1) + dev->desc->config_len);
+
+ /* We only have one page of device descriptions. */
+ if (devices.desc_used + 2 + len > getpagesize())
+ errx(1, "Too many devices");
+
+ /* Copy in the new config header: type then length. */
+ devices.descpage[devices.desc_used++] = type;
+ devices.descpage[devices.desc_used++] = len;
+ memcpy(devices.descpage + devices.desc_used, c, len);
+ devices.desc_used += len;
+
+ /* Update the device descriptor length: two byte head then data. */
+ dev->desc->config_len += 2 + len;
+}
+
+/* This routine adds a virtqueue to a device. We specify how many descriptors
+ * the virtqueue is to have. */
+static void add_virtqueue(struct device *dev, unsigned int num_descs,
+ void (*handle_output)(int fd, struct virtqueue *me))
+{
+ unsigned int pages;
+ struct virtqueue **i, *vq = malloc(sizeof(*vq));
+ void *p;
+
+ /* First we need some pages for this virtqueue. */
+ pages = (vring_size(num_descs) + getpagesize() - 1) / getpagesize();
+ p = get_pages(pages);
+
+ /* Initialize the configuration. */
+ vq->config.num = num_descs;
+ vq->config.irq = devices.next_irq++;
+ vq->config.pfn = to_guest_phys(p) / getpagesize();
+
+ /* Initialize the vring. */
+ vring_init(&vq->vring, num_descs, p);
+
+ /* Add the configuration information to this device's descriptor. */
+ add_desc_field(dev, VIRTIO_CONFIG_F_VIRTQUEUE,
+ sizeof(vq->config), &vq->config);
+
+ /* Add to tail of list, so dev->vq is first vq, dev->vq->next is
+ * second. */
+ for (i = &dev->vq; *i; i = &(*i)->next);
+ *i = vq;
+
+ /* Link virtqueue back to device. */
+ vq->dev = dev;
+
+ /* Set up handler. */
+ vq->handle_output = handle_output;
+ if (!handle_output)
+ vq->vring.used->flags = VRING_USED_F_NO_NOTIFY;
+}
+
+/* This routine does all the creation and setup of a new device, including
+ * calling new_dev_desc() to allocate the descriptor and device memory. */
+static struct device *new_device(const char *name, u16 type, int fd,
+ bool (*handle_input)(int, struct device *))
{
struct device *dev = malloc(sizeof(*dev));
@@ -1026,27 +1061,25 @@ static struct device *new_device(struct device_list *devices,
* easier, but the user expects the devices to be arranged on the bus
* in command-line order. The first network device on the command line
* is eth0, the first block device /dev/lgba, etc. */
- *devices->lastdev = dev;
+ *devices.lastdev = dev;
dev->next = NULL;
- devices->lastdev = &dev->next;
+ devices.lastdev = &dev->next;
/* Now we populate the fields one at a time. */
dev->fd = fd;
/* If we have an input handler for this file descriptor, then we add it
* to the device_list's fdset and maxfd. */
if (handle_input)
- set_fd(dev->fd, devices);
- dev->desc = new_dev_desc(devices->descs, type, features, num_pages);
- dev->mem = (void *)(dev->desc->pfn * getpagesize());
+ add_device_fd(dev->fd);
+ dev->desc = new_dev_desc(type);
dev->handle_input = handle_input;
- dev->watch_key = (unsigned long)dev->mem + watch_off;
- dev->handle_output = handle_output;
+ dev->name = name;
return dev;
}
/* Our first setup routine is the console. It's a fairly simple device, but
* UNIX tty handling makes it uglier than it could be. */
-static void setup_console(struct device_list *devices)
+static void setup_console(void)
{
struct device *dev;
@@ -1062,127 +1095,38 @@ static void setup_console(struct device_list *devices)
atexit(restore_term);
}
- /* We don't currently require any memory for the console, so we ask for
- * 0 pages. */
- dev = new_device(devices, LGUEST_DEVICE_T_CONSOLE, 0, 0,
- STDIN_FILENO, handle_console_input,
- LGUEST_CONSOLE_DMA_KEY, handle_console_output);
+ dev = new_device("console", VIRTIO_ID_CONSOLE,
+ STDIN_FILENO, handle_console_input);
/* We store the console state in dev->priv, and initialize it. */
dev->priv = malloc(sizeof(struct console_abort));
((struct console_abort *)dev->priv)->count = 0;
- verbose("device %p: console\n",
- (void *)(dev->desc->pfn * getpagesize()));
-}
-/* Setting up a block file is also fairly straightforward. */
-static void setup_block_file(const char *filename, struct device_list *devices)
-{
- int fd;
- struct device *dev;
- off64_t *device_len;
- struct lguest_block_page *p;
-
- /* We open with O_LARGEFILE because otherwise we get stuck at 2G. We
- * open with O_DIRECT because otherwise our benchmarks go much too
- * fast. */
- fd = open_or_die(filename, O_RDWR|O_LARGEFILE|O_DIRECT);
-
- /* We want one page, and have no input handler (the block file never
- * has anything interesting to say to us). Our timing will be quite
- * random, so it should be a reasonable randomness source. */
- dev = new_device(devices, LGUEST_DEVICE_T_BLOCK, 1,
- LGUEST_DEVICE_F_RANDOMNESS,
- fd, NULL, 0, handle_block_output);
-
- /* We store the device size in the private area */
- device_len = dev->priv = malloc(sizeof(*device_len));
- /* This is the safe way of establishing the size of our device: it
- * might be a normal file or an actual block device like /dev/hdb. */
- *device_len = lseek64(fd, 0, SEEK_END);
-
- /* The device memory is a "struct lguest_block_page". It's zeroed
- * already, we just need to put in the device size. Block devices
- * think in sectors (ie. 512 byte chunks), so we translate here. */
- p = dev->mem;
- p->num_sectors = *device_len/512;
- verbose("device %p: block %i sectors\n",
- (void *)(dev->desc->pfn * getpagesize()), p->num_sectors);
+ /* The console needs two virtqueues: the input then the output. When
+ * they put something in the input queue, we make sure we're listening to
+ * stdin. When they put something in the output queue, we write it to
+ * stdout. */
+ add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);
+ add_virtqueue(dev, VIRTQUEUE_NUM, handle_console_output);
+
+ verbose("device %u: console\n", devices.device_num++);
}
+/*:*/
-/*
- * Network Devices.
+/*M:010 Inter-guest networking is an interesting area. Simplest is to have a
+ * --sharenet=<name> option which opens or creates a named pipe. This can be
+ * used to send packets to another guest in a 1:1 manner.
*
- * Setting up network devices is quite a pain, because we have three types.
- * First, we have the inter-Guest network. This is a file which is mapped into
- * the address space of the Guests who are on the network. Because it is a
- * shared mapping, the same page underlies all the devices, and they can send
- * DMA to each other.
+ * More sophisticated is to use one of the tools developed for projects like
+ * UML to do networking.
*
- * Remember from our network driver, the Guest is told what slot in the page it
- * is to use. We use exclusive fnctl locks to reserve a slot. If another
- * Guest is using a slot, the lock will fail and we try another. Because fnctl
- * locks are cleaned up automatically when we die, this cleverly means that our
- * reservation on the slot will vanish if we crash. */
-static unsigned int find_slot(int netfd, const char *filename)
-{
- struct flock fl;
-
- fl.l_type = F_WRLCK;
- fl.l_whence = SEEK_SET;
- fl.l_len = 1;
- /* Try a 1 byte lock in each possible position number */
- for (fl.l_start = 0;
- fl.l_start < getpagesize()/sizeof(struct lguest_net);
- fl.l_start++) {
- /* If we succeed, return the slot number. */
- if (fcntl(netfd, F_SETLK, &fl) == 0)
- return fl.l_start;
- }
- errx(1, "No free slots in network file %s", filename);
-}
-
-/* This function sets up the network file */
-static void setup_net_file(const char *filename,
- struct device_list *devices)
-{
- int netfd;
- struct device *dev;
-
- /* We don't use open_or_die() here: for friendliness we create the file
- * if it doesn't already exist. */
- netfd = open(filename, O_RDWR, 0);
- if (netfd < 0) {
- if (errno == ENOENT) {
- netfd = open(filename, O_RDWR|O_CREAT, 0600);
- if (netfd >= 0) {
- /* If we succeeded, initialize the file with a
- * blank page. */
- char page[getpagesize()];
- memset(page, 0, sizeof(page));
- write(netfd, page, sizeof(page));
- }
- }
- if (netfd < 0)
- err(1, "cannot open net file '%s'", filename);
- }
-
- /* We need 1 page, and the features indicate the slot to use and that
- * no checksum is needed. We never touch this device again; it's
- * between the Guests on the network, so we don't register input or
- * output handlers. */
- dev = new_device(devices, LGUEST_DEVICE_T_NET, 1,
- find_slot(netfd, filename)|LGUEST_NET_F_NOCSUM,
- -1, NULL, 0, NULL);
-
- /* Map the shared file. */
- if (mmap(dev->mem, getpagesize(), PROT_READ|PROT_WRITE,
- MAP_FIXED|MAP_SHARED, netfd, 0) != dev->mem)
- err(1, "could not mmap '%s'", filename);
- verbose("device %p: shared net %s, peer %i\n",
- (void *)(dev->desc->pfn * getpagesize()), filename,
- dev->desc->features & ~LGUEST_NET_F_NOCSUM);
-}
-/*:*/
+ * Faster is to do virtio bonding in kernel. Doing this 1:1 would be
+ * completely generic ("here's my vring, attach to your vring") and would work
+ * for any traffic. Of course, namespace and permissions issues need to be
+ * dealt with. A more sophisticated "multi-channel" virtio_net.c could hide
+ * multiple inter-guest channels behind one interface, although it would
+ * require some manner of hotplugging new virtio channels.
+ *
+ * Finally, we could implement a virtio network switch in the kernel. :*/
static u32 str2ip(const char *ipaddr)
{
@@ -1217,7 +1161,7 @@ static void add_to_bridge(int fd, const char *if_name, const char *br_name)
/* This sets up the Host end of the network device with an IP address, brings
* it up so packets will flow, the copies the MAC address into the hwaddr
- * pointer (in practice, the Host's slot in the network device's memory). */
+ * pointer. */
static void configure_device(int fd, const char *devname, u32 ipaddr,
unsigned char hwaddr[6])
{
@@ -1243,18 +1187,18 @@ static void configure_device(int fd, const char *devname, u32 ipaddr,
memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, 6);
}
-/*L:195 The other kind of network is a Host<->Guest network. This can either
- * use briding or routing, but the principle is the same: it uses the "tun"
- * device to inject packets into the Host as if they came in from a normal
- * network card. We just shunt packets between the Guest and the tun
- * device. */
-static void setup_tun_net(const char *arg, struct device_list *devices)
+/*L:195 Our network is a Host<->Guest network. This can either use bridging or
+ * routing, but the principle is the same: it uses the "tun" device to inject
+ * packets into the Host as if they came in from a normal network card. We
+ * just shunt packets between the Guest and the tun device. */
+static void setup_tun_net(const char *arg)
{
struct device *dev;
struct ifreq ifr;
int netfd, ipfd;
u32 ip;
const char *br_name = NULL;
+ u8 hwaddr[6];
/* We open the /dev/net/tun device and tell it we want a tap device. A
* tap device is like a tun device, only somehow different. To tell
@@ -1270,21 +1214,13 @@ static void setup_tun_net(const char *arg, struct device_list *devices)
* device: trust us! */
ioctl(netfd, TUNSETNOCSUM, 1);
- /* We create the net device with 1 page, using the features field of
- * the descriptor to tell the Guest it is in slot 1 (NET_PEERNUM), and
- * that the device has fairly random timing. We do *not* specify
- * LGUEST_NET_F_NOCSUM: these packets can reach the real world.
- *
- * We will put our MAC address is slot 0 for the Guest to see, so
- * it will send packets to us using the key "peer_offset(0)": */
- dev = new_device(devices, LGUEST_DEVICE_T_NET, 1,
- NET_PEERNUM|LGUEST_DEVICE_F_RANDOMNESS, netfd,
- handle_tun_input, peer_offset(0), handle_tun_output);
+ /* First we create a new network device. */
+ dev = new_device("net", VIRTIO_ID_NET, netfd, handle_tun_input);
- /* We keep a flag which says whether we've seen packets come out from
- * this network device. */
- dev->priv = malloc(sizeof(bool));
- *(bool *)dev->priv = false;
+ /* Network devices need a receive and a send queue, just like
+ * console. */
+ add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);
+ add_virtqueue(dev, VIRTQUEUE_NUM, handle_net_output);
/* We need a socket to perform the magic network ioctls to bring up the
* tap interface, connect to the bridge etc. Any socket will do! */
@@ -1300,44 +1236,251 @@ static void setup_tun_net(const char *arg, struct device_list *devices)
} else /* It is an IP address to set up the device with */
ip = str2ip(arg);
- /* We are peer 0, ie. first slot, so we hand dev->mem to this routine
- * to write the MAC address at the start of the device memory. */
- configure_device(ipfd, ifr.ifr_name, ip, dev->mem);
+ /* Set up the tun device, and get the mac address for the interface. */
+ configure_device(ipfd, ifr.ifr_name, ip, hwaddr);
- /* Set "promisc" bit: we want every single packet if we're going to
- * bridge to other machines (and otherwise it doesn't matter). */
- *((u8 *)dev->mem) |= 0x1;
+ /* Tell Guest what MAC address to use. */
+ add_desc_field(dev, VIRTIO_CONFIG_NET_MAC_F, sizeof(hwaddr), hwaddr);
+ /* We don't need the socket any more; setup is done. */
close(ipfd);
- verbose("device %p: tun net %u.%u.%u.%u\n",
- (void *)(dev->desc->pfn * getpagesize()),
- (u8)(ip>>24), (u8)(ip>>16), (u8)(ip>>8), (u8)ip);
+ verbose("device %u: tun net %u.%u.%u.%u\n",
+ devices.device_num++,
+ (u8)(ip>>24),(u8)(ip>>16),(u8)(ip>>8),(u8)ip);
if (br_name)
verbose("attached to bridge: %s\n", br_name);
}
+
+
+/*
+ * Block device.
+ *
+ * Serving a block device is really easy: the Guest asks for a block number and
+ * we read or write that position in the file.
+ *
+ * Unfortunately, this is amazingly slow: the Guest waits until the read is
+ * finished before running anything else, even if it could be doing useful
+ * work. We could use async I/O, except it's reputed to suck so hard that
+ * characters actually go missing from your code when you try to use it.
+ *
+ * So we farm the I/O out to a thread, and communicate with it via a pipe. */
+
+/* This hangs off device->priv, with the data.  One is allocated per block
+ * device by setup_block_file(), and it is read both by the Launcher's
+ * handlers and by the I/O thread. */
+struct vblk_info
+{
+ /* The size of the file, in bytes: measured with lseek64() to SEEK_END
+ * when the device is set up.  service_io() dies if a write runs past
+ * it. */
+ off64_t len;
+
+ /* The file descriptor for the file. */
+ int fd;
+
+ /* Work pipe: the Launcher writes a byte to [1] when the Guest submits
+ * I/O, and the IO thread listens on this file descriptor [0]. */
+ int workpipe[2];
+
+ /* IO thread writes to this file descriptor to mark it done, then
+ * Launcher triggers interrupt to Guest. */
+ int done_fd;
+};
+
+/* This is the core of the I/O thread.  It takes one request off the block
+ * device's virtqueue, performs it against the backing file, fills in the
+ * status header and marks the buffer used.
+ *
+ * Returns true if it did something, false if there was no request waiting.
+ * A malformed request or a write past the end of the file is fatal. */
+static bool service_io(struct device *dev)
+{
+	struct vblk_info *vblk = dev->priv;
+	unsigned int head, out_num, in_num, wlen;
+	int ret;
+	struct virtio_blk_inhdr *in;
+	struct virtio_blk_outhdr *out;
+	struct iovec iov[dev->vq->vring.num];
+	off64_t off;
+
+	/* A head equal to the ring size means there was nothing for us. */
+	head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
+	if (head == dev->vq->vring.num)
+		return false;
+
+	/* We need at least the request header (out) and the status (in). */
+	if (out_num == 0 || in_num == 0)
+		errx(1, "Bad virtblk cmd %u out=%u in=%u",
+		     head, out_num, in_num);
+
+	/* The request header is the first element, the status the last. */
+	out = convert(&iov[0], struct virtio_blk_outhdr);
+	in = convert(&iov[out_num+in_num-1], struct virtio_blk_inhdr);
+	off = out->sector * 512;
+
+	/* This is how we implement barriers.  Pretty poor, no? */
+	if (out->type & VIRTIO_BLK_T_BARRIER)
+		fdatasync(vblk->fd);
+
+	/* Note on wlen below: it is the length we report via add_used(), so
+	 * it must count the status header itself, sizeof(*in), and not the
+	 * size of the pointer to it. */
+	if (out->type & VIRTIO_BLK_T_SCSI_CMD) {
+		fprintf(stderr, "Scsi commands unsupported\n");
+		in->status = VIRTIO_BLK_S_UNSUPP;
+		wlen = sizeof(*in);
+	} else if (out->type & VIRTIO_BLK_T_OUT) {
+		/* Write */
+
+		/* Move to the right location in the block file.  This can fail
+		 * if they try to write past end. */
+		if (lseek64(vblk->fd, off, SEEK_SET) != off)
+			err(1, "Bad seek to sector %llu", out->sector);
+
+		/* Everything between the header and the status is data. */
+		ret = writev(vblk->fd, iov+1, out_num-1);
+		verbose("WRITE to sector %llu: %i\n", out->sector, ret);
+
+		/* Grr... Now we know how long the descriptor they sent was, we
+		 * make sure they didn't try to write over the end of the block
+		 * file (possibly extending it). */
+		if (ret > 0 && off + ret > vblk->len) {
+			/* Trim it back to the correct length */
+			ftruncate64(vblk->fd, vblk->len);
+			/* Die, bad Guest, die. */
+			errx(1, "Write past end %llu+%u", off, ret);
+		}
+		wlen = sizeof(*in);
+		in->status = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
+	} else {
+		/* Read */
+
+		/* Move to the right location in the block file.  This can fail
+		 * if they try to read past end. */
+		if (lseek64(vblk->fd, off, SEEK_SET) != off)
+			err(1, "Bad seek to sector %llu", out->sector);
+
+		ret = readv(vblk->fd, iov+1, in_num-1);
+		verbose("READ from sector %llu: %i\n", out->sector, ret);
+		if (ret >= 0) {
+			/* Status header plus however much data we read. */
+			wlen = sizeof(*in) + ret;
+			in->status = VIRTIO_BLK_S_OK;
+		} else {
+			wlen = sizeof(*in);
+			in->status = VIRTIO_BLK_S_IOERR;
+		}
+	}
+
+	/* We can't trigger an IRQ, because we're not the Launcher.  It does
+	 * that when we tell it we're done. */
+	add_used(dev->vq, head, wlen);
+	return true;
+}
+
+/* Body of the I/O thread: sleep until the Launcher pokes the work pipe, then
+ * service requests until none are left, poking the done pipe after each. */
+static int io_thread(void *_dev)
+{
+	struct device *dev = _dev;
+	struct vblk_info *vblk = dev->priv;
+	char token;
+
+	/* We only ever read the work pipe: drop our copy of the write end so
+	 * that read() returns 0 once the main process dies. */
+	close(vblk->workpipe[1]);
+	/* Likewise we only write the done pipe, so drop its read end. */
+	close(dev->fd);
+
+	for (;;) {
+		/* Anything other than one byte means the Launcher died, so we
+		 * follow. */
+		if (read(vblk->workpipe[0], &token, 1) != 1)
+			break;
+
+		/* Acknowledge each request as soon as it completes, rather
+		 * than batching them; whether that actually helps latency has
+		 * not been measured. */
+		while (service_io(dev))
+			write(vblk->done_fd, &token, 1);
+	}
+	return 0;
+}
+
+/* Input handler for the block device: the I/O thread has written to the done
+ * pipe, so consume that byte and interrupt the Guest. */
+static bool handle_io_finish(int fd, struct device *dev)
+{
+	char token;
+	ssize_t got = read(dev->fd, &token, 1);
+
+	/* No byte means the thread is gone; presumably it already printed a
+	 * message saying why. */
+	if (got != 1)
+		exit(1);
+
+	/* Some work was completed, so raise the virtqueue's interrupt. */
+	trigger_irq(fd, dev->vq);
+	return true;
+}
+
+/* Output handler for the block virtqueue: the Guest has queued some I/O, so
+ * poke the I/O thread through the work pipe. */
+static void handle_virtblk_output(int fd, struct virtqueue *vq)
+{
+	struct vblk_info *vblk = vq->dev->priv;
+	char wake = 0;
+
+	/* One byte down the pipe tells the I/O thread to go to work. */
+	if (write(vblk->workpipe[1], &wake, 1) == 1)
+		return;
+
+	/* The thread is gone; presumably it indicated why it died. */
+	exit(1);
+}
+
+/* This creates a virtual block device backed by the file (or block device)
+ * called filename.  It registers the device and its single virtqueue,
+ * publishes the capacity and maximum segment count to the Guest, and starts
+ * the I/O thread which services requests.  Any failure is fatal. */
+static void setup_block_file(const char *filename)
+{
+	int p[2];
+	struct device *dev;
+	struct vblk_info *vblk;
+	void *stack;
+	u64 cap;
+	unsigned int val;
+
+	/* This is the pipe the I/O thread will use to tell us I/O is done. */
+	if (pipe(p) != 0)
+		err(1, "Creating I/O done pipe");
+
+	/* The device responds to return from I/O thread. */
+	dev = new_device("block", VIRTIO_ID_BLOCK, p[0], handle_io_finish);
+
+	/* The device has a virtqueue. */
+	add_virtqueue(dev, VIRTQUEUE_NUM, handle_virtblk_output);
+
+	/* Allocate the room for our own bookkeeping */
+	vblk = dev->priv = malloc(sizeof(*vblk));
+
+	/* First we open the file and store the length. */
+	vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE);
+	vblk->len = lseek64(vblk->fd, 0, SEEK_END);
+
+	/* Tell Guest how many sectors this device has. */
+	cap = cpu_to_le64(vblk->len / 512);
+	add_desc_field(dev, VIRTIO_CONFIG_BLK_F_CAPACITY, sizeof(cap), &cap);
+
+	/* Tell Guest not to put in too many descriptors at once: two are used
+	 * for the in and out elements. */
+	val = cpu_to_le32(VIRTQUEUE_NUM - 2);
+	add_desc_field(dev, VIRTIO_CONFIG_BLK_F_SEG_MAX, sizeof(val), &val);
+
+	/* The I/O thread writes to this end of the pipe when done. */
+	vblk->done_fd = p[1];
+
+	/* This is how we tell the I/O thread about more work. */
+	if (pipe(vblk->workpipe) != 0)
+		err(1, "Creating I/O work pipe");
+
+	/* Create stack for thread and run it: clone() wants the top of the
+	 * stack, hence the offset. */
+	stack = malloc(32768);
+	if (clone(io_thread, stack + 32768, CLONE_VM, dev) == -1)
+		err(1, "Creating clone");
+
+	/* We don't need to keep the I/O thread's end of the pipes open. */
+	close(vblk->done_fd);
+	close(vblk->workpipe[0]);
+
+	/* Number this device like the console and net devices do (bumping the
+	 * counter), and report the sector count in host byte order rather
+	 * than the little-endian value we handed to the Guest. */
+	verbose("device %u: virtblock %llu sectors\n",
+		devices.device_num++,
+		(unsigned long long)(vblk->len / 512));
+}
/* That's the end of device setup. */
/*L:220 Finally we reach the core of the Launcher, which runs the Guest, serves
* its input and output, and finally, lays it to rest. */
-static void __attribute__((noreturn))
-run_guest(int lguest_fd, struct device_list *device_list)
+static void __attribute__((noreturn)) run_guest(int lguest_fd)
{
for (;;) {
- u32 args[] = { LHREQ_BREAK, 0 };
- unsigned long arr[2];
+ unsigned long args[] = { LHREQ_BREAK, 0 };
+ unsigned long notify_addr;
int readval;
/* We read from the /dev/lguest device to run the Guest. */
- readval = read(lguest_fd, arr, sizeof(arr));
-
- /* The read can only really return sizeof(arr) (the Guest did a
- * SEND_DMA to us), or an error. */
+ readval = read(lguest_fd, &notify_addr, sizeof(notify_addr));
- /* For a successful read, arr[0] is the address of the "struct
- * lguest_dma", and arr[1] is the key the Guest sent to. */
- if (readval == sizeof(arr)) {
- handle_output(lguest_fd, arr[0], arr[1], device_list);
+ /* One unsigned long means the Guest did HCALL_NOTIFY */
+ if (readval == sizeof(notify_addr)) {
+ verbose("Notify on address %#lx\n", notify_addr);
+ handle_output(lguest_fd, notify_addr);
continue;
/* ENOENT means the Guest died. Reading tells us why. */
} else if (errno == ENOENT) {
@@ -1351,7 +1494,7 @@ run_guest(int lguest_fd, struct device_list *device_list)
/* Service input, then unset the BREAK which releases
* the Waker. */
- handle_input(lguest_fd, device_list);
+ handle_input(lguest_fd);
if (write(lguest_fd, args, sizeof(args)) < 0)
err(1, "Resetting break");
}
@@ -1365,7 +1508,6 @@ run_guest(int lguest_fd, struct device_list *device_list)
static struct option opts[] = {
{ "verbose", 0, NULL, 'v' },
- { "sharenet", 1, NULL, 's' },
{ "tunnet", 1, NULL, 't' },
{ "block", 1, NULL, 'b' },
{ "initrd", 1, NULL, 'i' },
@@ -1374,37 +1516,21 @@ static struct option opts[] = {
static void usage(void)
{
errx(1, "Usage: lguest [--verbose] "
- "[--sharenet=<filename>|--tunnet=(<ipaddr>|bridge:<bridgename>)\n"
+ "[--tunnet=(<ipaddr>|bridge:<bridgename>)\n"
"|--block=<filename>|--initrd=<filename>]...\n"
"<mem-in-mb> vmlinux [args...]");
}
-/*L:100 The Launcher code itself takes us out into userspace, that scary place
- * where pointers run wild and free! Unfortunately, like most userspace
- * programs, it's quite boring (which is why everyone like to hack on the
- * kernel!). Perhaps if you make up an Lguest Drinking Game at this point, it
- * will get you through this section. Or, maybe not.
- *
- * The Launcher binary sits up high, usually starting at address 0xB8000000.
- * Everything below this is the "physical" memory for the Guest. For example,
- * if the Guest were to write a "1" at physical address 0, we would see a "1"
- * in the Launcher at "(int *)0". Guest physical == Launcher virtual.
- *
- * This can be tough to get your head around, but usually it just means that we
- * don't need to do any conversion when the Guest gives us it's "physical"
- * addresses.
- */
+/*L:105 The main routine is where the real work begins: */
int main(int argc, char *argv[])
{
- /* Memory, top-level pagetable, code startpoint, PAGE_OFFSET and size
- * of the (optional) initrd. */
- unsigned long mem = 0, pgdir, start, page_offset, initrd_size = 0;
+ /* Memory, top-level pagetable, code startpoint and size of the
+ * (optional) initrd. */
+ unsigned long mem = 0, pgdir, start, initrd_size = 0;
/* A temporary and the /dev/lguest file descriptor. */
int i, c, lguest_fd;
- /* The list of Guest devices, based on command line arguments. */
- struct device_list device_list;
- /* The boot information for the Guest: at guest-physical address 0. */
- void *boot = (void *)0;
+ /* The boot information for the Guest. */
+ struct boot_params *boot;
/* If they specify an initrd file to load. */
const char *initrd_name = NULL;
@@ -1412,11 +1538,12 @@ int main(int argc, char *argv[])
* device receive input from a file descriptor, we keep an fdset
* (infds) and the maximum fd number (max_infd) with the head of the
* list. We also keep a pointer to the last device, for easy appending
- * to the list. */
- device_list.max_infd = -1;
- device_list.dev = NULL;
- device_list.lastdev = &device_list.dev;
- FD_ZERO(&device_list.infds);
+ * to the list. Finally, we keep the next interrupt number to hand out
+ * (1: remember that 0 is used by the timer). */
+ FD_ZERO(&devices.infds);
+ devices.max_infd = -1;
+ devices.lastdev = &devices.dev;
+ devices.next_irq = 1;
/* We need to know how much memory so we can set up the device
* descriptor and memory pages for the devices as we parse the command
@@ -1424,9 +1551,16 @@ int main(int argc, char *argv[])
* of memory now. */
for (i = 1; i < argc; i++) {
if (argv[i][0] != '-') {
- mem = top = atoi(argv[i]) * 1024 * 1024;
- device_list.descs = map_zeroed_pages(top, 1);
- top += getpagesize();
+ mem = atoi(argv[i]) * 1024 * 1024;
+ /* We start by mapping anonymous pages over all of
+ * guest-physical memory range. This fills it with 0,
+ * and ensures that the Guest won't be killed when it
+ * tries to access it. */
+ guest_base = map_zeroed_pages(mem / getpagesize()
+ + DEVICE_PAGES);
+ guest_limit = mem;
+ guest_max = mem + DEVICE_PAGES*getpagesize();
+ devices.descpage = get_pages(1);
break;
}
}
@@ -1437,14 +1571,11 @@ int main(int argc, char *argv[])
case 'v':
verbose = true;
break;
- case 's':
- setup_net_file(optarg, &device_list);
- break;
case 't':
- setup_tun_net(optarg, &device_list);
+ setup_tun_net(optarg);
break;
case 'b':
- setup_block_file(optarg, &device_list);
+ setup_block_file(optarg);
break;
case 'i':
initrd_name = optarg;
@@ -1459,56 +1590,60 @@ int main(int argc, char *argv[])
if (optind + 2 > argc)
usage();
- /* We always have a console device */
- setup_console(&device_list);
+ verbose("Guest base is at %p\n", guest_base);
- /* We start by mapping anonymous pages over all of guest-physical
- * memory range. This fills it with 0, and ensures that the Guest
- * won't be killed when it tries to access it. */
- map_zeroed_pages(0, mem / getpagesize());
+ /* We always have a console device */
+ setup_console();
/* Now we load the kernel */
- start = load_kernel(open_or_die(argv[optind+1], O_RDONLY),
- &page_offset);
+ start = load_kernel(open_or_die(argv[optind+1], O_RDONLY));
+
+ /* Boot information is stashed at physical address 0 */
+ boot = from_guest_phys(0);
/* Map the initrd image if requested (at top of physical memory) */
if (initrd_name) {
initrd_size = load_initrd(initrd_name, mem);
/* These are the location in the Linux boot header where the
* start and size of the initrd are expected to be found. */
- *(unsigned long *)(boot+0x218) = mem - initrd_size;
- *(unsigned long *)(boot+0x21c) = initrd_size;
+ boot->hdr.ramdisk_image = mem - initrd_size;
+ boot->hdr.ramdisk_size = initrd_size;
/* The bootloader type 0xFF means "unknown"; that's OK. */
- *(unsigned char *)(boot+0x210) = 0xFF;
+ boot->hdr.type_of_loader = 0xFF;
}
/* Set up the initial linear pagetables, starting below the initrd. */
- pgdir = setup_pagetables(mem, initrd_size, page_offset);
+ pgdir = setup_pagetables(mem, initrd_size);
/* The Linux boot header contains an "E820" memory map: ours is a
* simple, single region. */
- *(char*)(boot+E820NR) = 1;
- *((struct e820entry *)(boot+E820MAP))
- = ((struct e820entry) { 0, mem, E820_RAM });
+ boot->e820_entries = 1;
+ boot->e820_map[0] = ((struct e820entry) { 0, mem, E820_RAM });
/* The boot header contains a command line pointer: we put the command
- * line after the boot header (at address 4096) */
- *(void **)(boot + 0x228) = boot + 4096;
- concat(boot + 4096, argv+optind+2);
+ * line after the boot header. */
+ boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1);
+ concat((char *)(boot + 1), argv+optind+2);
+
+ /* Boot protocol version: 2.07 supports the fields for lguest. */
+ boot->hdr.version = 0x207;
+
+ /* The hardware_subarch value of "1" tells the Guest it's an lguest. */
+ boot->hdr.hardware_subarch = 1;
- /* The guest type value of "1" tells the Guest it's under lguest. */
- *(int *)(boot + 0x23c) = 1;
+ /* Tell the entry path not to try to reload segment registers. */
+ boot->hdr.loadflags |= KEEP_SEGMENTS;
/* We tell the kernel to initialize the Guest: this returns the open
* /dev/lguest file descriptor. */
- lguest_fd = tell_kernel(pgdir, start, page_offset);
+ lguest_fd = tell_kernel(pgdir, start);
/* We fork off a child process, which wakes the Launcher whenever one
* of the input file descriptors needs attention. Otherwise we would
* run the Guest until it tries to output something. */
- waker_fd = setup_waker(lguest_fd, &device_list);
+ waker_fd = setup_waker(lguest_fd);
/* Finally, run the Guest. This doesn't return. */
- run_guest(lguest_fd, &device_list);
+ run_guest(lguest_fd);
}
/*:*/
diff --git a/Documentation/lguest/lguest.txt b/Documentation/lguest/lguest.txt
index 821617bd6c0..7885ab2d5f5 100644
--- a/Documentation/lguest/lguest.txt
+++ b/Documentation/lguest/lguest.txt
@@ -6,7 +6,7 @@ Lguest is designed to be a minimal hypervisor for the Linux kernel, for
Linux developers and users to experiment with virtualization with the
minimum of complexity. Nonetheless, it should have sufficient
features to make it useful for specific tasks, and, of course, you are
-encouraged to fork and enhance it.
+encouraged to fork and enhance it (see drivers/lguest/README).
Features:
@@ -23,19 +23,30 @@ Developer features:
Running Lguest:
-- Lguest runs the same kernel as guest and host. You can configure
- them differently, but usually it's easiest not to.
+- The easiest way to run lguest is to use the same kernel as guest and host.
+ You can configure them differently, but usually it's easiest not to.
You will need to configure your kernel with the following options:
- CONFIG_HIGHMEM64G=n ("High Memory Support" "64GB")[1]
- CONFIG_TUN=y/m ("Universal TUN/TAP device driver support")
- CONFIG_EXPERIMENTAL=y ("Prompt for development and/or incomplete code/drivers")
- CONFIG_PARAVIRT=y ("Paravirtualization support (EXPERIMENTAL)")
- CONFIG_LGUEST=y/m ("Linux hypervisor example code")
-
- and I recommend:
- CONFIG_HZ=100 ("Timer frequency")[2]
+ "General setup":
+ "Prompt for development and/or incomplete code/drivers" = Y
+ (CONFIG_EXPERIMENTAL=y)
+
+ "Processor type and features":
+ "Paravirtualized guest support" = Y
+ "Lguest guest support" = Y
+ "High Memory Support" = off/4GB
+ "Alignment value to which kernel should be aligned" = 0x100000
+ (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and
+ CONFIG_PHYSICAL_ALIGN=0x100000)
+
+ "Device Drivers":
+ "Network device support"
+ "Universal TUN/TAP device driver support" = M/Y
+ (CONFIG_TUN=m)
+ "Virtualization"
+ "Linux hypervisor example code" = M/Y
+ (CONFIG_LGUEST=m)
- A tool called "lguest" is available in this directory: type "make"
to build it. If you didn't build your kernel in-tree, use "make
@@ -51,14 +62,17 @@ Running Lguest:
dd if=/dev/zero of=rootfile bs=1M count=2048
qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d
+ Make sure that you install a getty on /dev/hvc0 if you want to log in on the
+ console!
+
- "modprobe lg" if you built it as a module.
- Run an lguest as root:
- Documentation/lguest/lguest 64m vmlinux --tunnet=192.168.19.1 --block=rootfile root=/dev/lgba
+ Documentation/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 --block=rootfile root=/dev/vda
Explanation:
- 64m: the amount of memory to use.
+ 64: the amount of memory to use, in MB.
vmlinux: the kernel image found in the top of your build directory. You
can also use a standard bzImage.
@@ -66,10 +80,10 @@ Running Lguest:
--tunnet=192.168.19.1: configures a "tap" device for networking with this
IP address.
- --block=rootfile: a file or block device which becomes /dev/lgba
+ --block=rootfile: a file or block device which becomes /dev/vda
inside the guest.
- root=/dev/lgba: this (and anything else on the command line) are
+ root=/dev/vda: this (and anything else on the command line) are
kernel boot parameters.
- Configuring networking. I usually have the host masquerade, using
@@ -99,31 +113,7 @@ Running Lguest:
"--sharenet=<filename>": any two guests using the same file are on
the same network. This file is created if it does not exist.
-Lguest I/O model:
-
-Lguest uses a simplified DMA model plus shared memory for I/O. Guests
-can communicate with each other if they share underlying memory
-(usually by the lguest program mmaping the same file), but they can
-use any non-shared memory to communicate with the lguest process.
-
-Guests can register DMA buffers at any key (must be a valid physical
-address) using the LHCALL_BIND_DMA(key, dmabufs, num<<8|irq)
-hypercall. "dmabufs" is the physical address of an array of "num"
-"struct lguest_dma": each contains a used_len, and an array of
-physical addresses and lengths. When a transfer occurs, the
-"used_len" field of one of the buffers which has used_len 0 will be
-set to the length transferred and the irq will fire.
+There is a helpful mailing list at http://ozlabs.org/mailman/listinfo/lguest
-Using an irq value of 0 unbinds the dma buffers.
-
-To send DMA, the LHCALL_SEND_DMA(key, dma_physaddr) hypercall is used,
-and the bytes used is written to the used_len field. This can be 0 if
-noone else has bound a DMA buffer to that key or some other error.
-DMA buffers bound by the same guest are ignored.
-
-Cheers!
+Good luck!
Rusty Russell rusty@rustcorp.com.au.
-
-[1] These are on various places on the TODO list, waiting for you to
- get annoyed enough at the limitation to fix it.
-[2] Lguest is not yet tickless when idle. See [1].
diff --git a/Documentation/local_ops.txt b/Documentation/local_ops.txt
index b0aca0705d1..4269a1105b3 100644
--- a/Documentation/local_ops.txt
+++ b/Documentation/local_ops.txt
@@ -27,7 +27,7 @@ CPU which owns the data. Therefore, care must taken to make sure that only one
CPU writes to the local_t data. This is done by using per cpu data and making
sure that we modify it from within a preemption safe context. It is however
permitted to read local_t data from any CPU : it will then appear to be written
-out of order wrt other memory writes on the owner CPU.
+out of order wrt other memory writes by the owner CPU.
* Implementation for a given architecture
@@ -45,6 +45,29 @@ long fails. The definition looks like :
typedef struct { atomic_long_t a; } local_t;
+* Rules to follow when using local atomic operations
+
+- Variables touched by local ops must be per cpu variables.
+- _Only_ the CPU owner of these variables must write to them.
+- This CPU can use local ops from any context (process, irq, softirq, nmi, ...)
+ to update its local_t variables.
+- Preemption (or interrupts) must be disabled when using local ops in
+ process context to make sure the process won't be migrated to a
+ different CPU between getting the per-cpu variable and doing the
+ actual local op.
+- When using local ops in interrupt context, no special care must be
+ taken on a mainline kernel, since they will run on the local CPU with
+ preemption already disabled. I suggest, however, to explicitly
+ disable preemption anyway to make sure it will still work correctly on
+ -rt kernels.
+- Reading the local cpu variable will provide the current copy of the
+ variable.
+- Reads of these variables can be done from any CPU, because updates to
+ "long", aligned, variables are always atomic. Since no memory
+ synchronization is done by the writer CPU, an outdated copy of the
+ variable can be read when reading some _other_ cpu's variables.
+
+
* How to use local atomic operations
#include <linux/percpu.h>
diff --git a/Documentation/m68k/kernel-options.txt b/Documentation/m68k/kernel-options.txt
index 59108cebe16..248589e8bcf 100644
--- a/Documentation/m68k/kernel-options.txt
+++ b/Documentation/m68k/kernel-options.txt
@@ -192,10 +192,10 @@ Devices possible for Atari:
seconds.
-2.6) ramdisk=
+2.6) ramdisk_size=
-------------
-Syntax: ramdisk=<size>
+Syntax: ramdisk_size=<size>
This option instructs the kernel to set up a ramdisk of the given
size in KBytes. Do not use this option if the ramdisk contents are
@@ -890,10 +890,7 @@ Syntax: nosync:0
5.5.2) noasync
--------------
-Syntax: noasync:0
-
- Disables async and sync negotiation for all devices. Any value
- after the colon is acceptable (and has the same effect).
+[OBSOLETE, REMOVED]
5.5.3) nodisconnect
-------------------
diff --git a/Documentation/make/headers_install.txt b/Documentation/make/headers_install.txt
new file mode 100644
index 00000000000..f2481cabffc
--- /dev/null
+++ b/Documentation/make/headers_install.txt
@@ -0,0 +1,46 @@
+Exporting kernel headers for use by userspace
+=============================================
+
+The "make headers_install" command exports the kernel's header files in a
+form suitable for use by userspace programs.
+
+The linux kernel's exported header files describe the API for user space
+programs attempting to use kernel services. These kernel header files are
+used by the system's C library (such as glibc or uClibc) to define available
+system calls, as well as constants and structures to be used with these
+system calls. The C library's header files include the kernel header files
+from the "linux" subdirectory. The system's libc headers are usually
+installed at the default location /usr/include and the kernel headers in
+subdirectories under that (most notably /usr/include/linux and
+/usr/include/asm).
+
+Kernel headers are backwards compatible, but not forwards compatible. This
+means that a program built against a C library using older kernel headers
+should run on a newer kernel (although it may not have access to new
+features), but a program built against newer kernel headers may not work on an
+older kernel.
+
+The "make headers_install" command can be run in the top level directory of the
+kernel source code (or using a standard out-of-tree build). It takes two
+optional arguments:
+
+ make headers_install ARCH=i386 INSTALL_HDR_PATH=/usr/include
+
+ARCH indicates which architecture to produce headers for, and defaults to the
+current architecture. The linux/asm directory of the exported kernel headers
+is platform-specific; to see a complete list of supported architectures use
+the command:
+
+ ls -d include/asm-* | sed 's/.*-//'
+
+INSTALL_HDR_PATH indicates where to install the headers. It defaults to
+"./usr/include".
+
+The command "make headers_install_all" exports headers for all architectures
+simultaneously. (This is mostly of interest to distribution maintainers,
+who create an architecture-independent tarball from the resulting include
+directory.) Remember to provide the appropriate linux/asm directory via "mv"
+or "ln -s" before building a C library with headers exported this way.
+
+The kernel header export infrastructure is maintained by David Woodhouse
+<dwmw2@infradead.org>.
diff --git a/Documentation/markers.txt b/Documentation/markers.txt
new file mode 100644
index 00000000000..295a71bc301
--- /dev/null
+++ b/Documentation/markers.txt
@@ -0,0 +1,81 @@
+ Using the Linux Kernel Markers
+
+ Mathieu Desnoyers
+
+
+This document introduces Linux Kernel Markers and their use. It provides
+examples of how to insert markers in the kernel and connect probe functions to
+them and provides some examples of probe functions.
+
+
+* Purpose of markers
+
+A marker placed in code provides a hook to call a function (probe) that you can
+provide at runtime. A marker can be "on" (a probe is connected to it) or "off"
+(no probe is attached). When a marker is "off" it has no effect, except for
+adding a tiny time penalty (checking a condition for a branch) and space
+penalty (adding a few bytes for the function call at the end of the
+instrumented function and adding a data structure in a separate section). When a
+marker is "on", the function you provide is called each time the marker is
+executed, in the execution context of the caller. When the function provided
+ends its execution, it returns to the caller (continuing from the marker site).
+
+You can put markers at important locations in the code. Markers are
+lightweight hooks that can pass an arbitrary number of parameters,
+described in a printk-like format string, to the attached probe function.
+
+They can be used for tracing and performance accounting.
+
+
+* Usage
+
+In order to use the macro trace_mark, you should include linux/marker.h.
+
+#include <linux/marker.h>
+
+And,
+
+trace_mark(subsystem_event, "%d %s", someint, somestring);
+Where :
+- subsystem_event is an identifier unique to your event
+ - subsystem is the name of your subsystem.
+ - event is the name of the event to mark.
+- "%d %s" is the formatted string for the serializer.
+- someint is an integer.
+- somestring is a char pointer.
+
+Connecting a function (probe) to a marker is done by providing a probe (function
+to call) for the specific marker through marker_probe_register() and can be
+activated by calling marker_arm(). Marker deactivation can be done by calling
+marker_disarm() as many times as marker_arm() has been called. Removing a probe
+is done through marker_probe_unregister(); it will disarm the probe and make
+sure there is no caller left using the probe when it returns. Probe removal is
+preempt-safe because preemption is disabled around the probe call. See the
+"Probe example" section below for a sample probe module.
+
+The marker mechanism supports inserting multiple instances of the same marker.
+Markers can be put in inline functions, inlined static functions, and
+unrolled loops as well as regular functions.
+
+The naming scheme "subsystem_event" is suggested here as a convention intended
+to limit collisions. Marker names are global to the kernel: they are considered
+as being the same whether they are in the core kernel image or in modules.
+If markers with the same name have conflicting format strings, the markers
+detected to have a different format string will not be armed, and a printk
+warning which identifies the inconsistency will be output:
+
+"Format mismatch for probe probe_name (format), marker (format)"
+
+
+* Probe / marker example
+
+See the example provided in samples/markers/src
+
+Compile them with your kernel.
+
+Run, as root :
+modprobe marker-example (insmod order is not important)
+modprobe probe-example
+cat /proc/marker-example (returns an expected error)
+rmmod marker-example probe-example
+dmesg
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index 650657c5473..4e17beba237 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -1479,7 +1479,8 @@ kernel.
Any atomic operation that modifies some state in memory and returns information
about the state (old or new) implies an SMP-conditional general memory barrier
-(smp_mb()) on each side of the actual operation. These include:
+(smp_mb()) on each side of the actual operation (with the exception of
+explicit lock operations, described later). These include:
xchg();
cmpxchg();
@@ -1536,10 +1537,19 @@ If they're used for constructing a lock of some description, then they probably
do need memory barriers as a lock primitive generally has to do things in a
specific order.
-
Basically, each usage case has to be carefully considered as to whether memory
barriers are needed or not.
+The following operations are special locking primitives:
+
+ test_and_set_bit_lock();
+ clear_bit_unlock();
+ __clear_bit_unlock();
+
+These implement LOCK-class and UNLOCK-class operations. These should be used in
+preference to other operations when implementing locking primitives, because
+their implementations can be optimised on many architectures.
+
[!] Note that special memory barrier primitives are available for these
situations because on some CPUs the atomic instructions used imply full memory
barriers, and so barrier instructions are superfluous in conjunction with them,
diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt
index 5fbcc22c98e..168117bd6ee 100644
--- a/Documentation/memory-hotplug.txt
+++ b/Documentation/memory-hotplug.txt
@@ -2,7 +2,8 @@
Memory Hotplug
==============
-Last Updated: Jul 28 2007
+Created: Jul 28 2007
+Add description of notifier of memory hotplug Oct 11 2007
This document is about memory hotplug including how-to-use and current status.
Because Memory Hotplug is still under development, contents of this text will
@@ -24,7 +25,8 @@ be changed often.
6.1 Memory offline and ZONE_MOVABLE
6.2. How to offline memory
7. Physical memory remove
-8. Future Work List
+8. Memory hotplug event notifier
+9. Future Work List
Note(1): x86_64's has special implementation for memory hotplug.
This text does not describe it.
@@ -307,8 +309,58 @@ Need more implementation yet....
- Notification completion of remove works by OS to firmware.
- Guard from remove if not yet.
+--------------------------------
+8. Memory hotplug event notifier
+--------------------------------
+Memory hotplug has an event notifier. There are 6 types of notification.
+
+MEMORY_GOING_ONLINE
+ Generated before new memory becomes available in order to be able to
+ prepare subsystems to handle memory. The page allocator is still unable
+ to allocate from the new memory.
+
+MEMORY_CANCEL_ONLINE
+ Generated if MEMORY_GOING_ONLINE fails.
+
+MEMORY_ONLINE
+ Generated when memory has successfully been brought online. The callback may
+ allocate pages from the new memory.
+
+MEMORY_GOING_OFFLINE
+ Generated to begin the process of offlining memory. Allocations are no
+ longer possible from the memory but some of the memory to be offlined
+ is still in use. The callback can be used to free memory known to a
+ subsystem from the indicated memory section.
+
+MEMORY_CANCEL_OFFLINE
+ Generated if MEMORY_GOING_OFFLINE fails. Memory is available again from
+ the section that we attempted to offline.
+
+MEMORY_OFFLINE
+ Generated after offlining memory is complete.
+
+A callback routine can be registered by
+ hotplug_memory_notifier(callback_func, priority)
+
+The second argument of the callback function (action) is one of the event
+types above. The third argument is a pointer to a struct memory_notify.
+
+struct memory_notify {
+ unsigned long start_pfn;
+ unsigned long nr_pages;
+ int status_change_nid;
+}
+
+start_pfn is start_pfn of online/offline memory.
+nr_pages is # of pages of online/offline memory.
+status_change_nid is set to the node id when N_HIGH_MEMORY of the nodemask is
+(will be) set/cleared, i.e. when a new (memoryless) node gets new memory by
+online or a node loses all memory. If this is -1, then nodemask status is not changed.
+If status_change_nid >= 0, callback should create/discard structures for the
+node if necessary.
+
--------------
-8. Future Work
+9. Future Work
--------------
- allowing memory hot-add to ZONE_MOVABLE. maybe we need some switch like
sysctl or new control file.
diff --git a/Documentation/mips/00-INDEX b/Documentation/mips/00-INDEX
new file mode 100644
index 00000000000..3f13bf8043d
--- /dev/null
+++ b/Documentation/mips/00-INDEX
@@ -0,0 +1,6 @@
+00-INDEX
+ - this file.
+AU1xxx_IDE.README
+ - README for MIPS AU1XXX IDE driver.
+GT64120.README
+ - README for dir with info on MIPS boards using GT-64120 or GT-64120A.
diff --git a/Documentation/mips/AU1xxx_IDE.README b/Documentation/mips/AU1xxx_IDE.README
index afb31c141d9..5c8334123f4 100644
--- a/Documentation/mips/AU1xxx_IDE.README
+++ b/Documentation/mips/AU1xxx_IDE.README
@@ -59,7 +59,7 @@ Four configs variables are introduced:
CONFIG_BLK_DEV_IDE_AU1XXX_PIO_DBDMA - enable the PIO+DBDMA mode
CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA - enable the MWDMA mode
CONFIG_BLK_DEV_IDE_AU1XXX_BURSTABLE_ON - set Burstable FIFO in DBDMA
- controler
+ controller
CONFIG_BLK_DEV_IDE_AU1XXX_SEQTS_PER_RQ - maximum transfer size
per descriptor
diff --git a/Documentation/mips/time.README b/Documentation/mips/time.README
deleted file mode 100644
index a4ce603ed3b..00000000000
--- a/Documentation/mips/time.README
+++ /dev/null
@@ -1,173 +0,0 @@
-README for MIPS time services
-
-Jun Sun
-jsun@mvista.com or jsun@junsun.net
-
-
-ABOUT
------
-This file describes the new arch/mips/kernel/time.c, related files and the
-services they provide.
-
-If you are short in patience and just want to know how to use time.c for a
-new board or convert an existing board, go to the last section.
-
-
-FILES, COMPATABILITY AND CONFIGS
----------------------------------
-
-The old arch/mips/kernel/time.c is renamed to old-time.c.
-
-A new time.c is put there, together with include/asm-mips/time.h.
-
-Two configs variables are introduced, CONFIG_OLD_TIME_C and CONFIG_NEW_TIME_C.
-So we allow boards using
-
- 1) old time.c (CONFIG_OLD_TIME_C)
- 2) new time.c (CONFIG_NEW_TIME_C)
- 3) neither (their own private time.c)
-
-However, it is expected every board will move to the new time.c in the near
-future.
-
-
-WHAT THE NEW CODE PROVIDES?
----------------------------
-
-The new time code provide the following services:
-
- a) Implements functions required by Linux common code:
- time_init
-
- b) provides an abstraction of RTC and null RTC implementation as default.
- extern unsigned long (*rtc_get_time)(void);
- extern int (*rtc_set_time)(unsigned long);
-
- c) high-level and low-level timer interrupt routines where the timer
- interrupt source may or may not be the CPU timer. The high-level
- routine is dispatched through do_IRQ() while the low-level is
- dispatched in assemably code (usually int-handler.S)
-
-
-WHAT THE NEW CODE REQUIRES?
----------------------------
-
-For the new code to work properly, each board implementation needs to supply
-the following functions or values:
-
- a) board_time_init - a function pointer. Invoked at the beginnig of
- time_init(). It is optional.
- 1. (optional) set up RTC routines
- 2. (optional) calibrate and set the mips_hpt_frequency
-
- b) plat_timer_setup - a function pointer. Invoked at the end of time_init()
- 1. (optional) over-ride any decisions made in time_init()
- 2. set up the irqaction for timer interrupt.
- 3. enable the timer interrupt
-
- c) (optional) board-specific RTC routines.
-
- d) (optional) mips_hpt_frequency - It must be definied if the board
- is using CPU counter for timer interrupt.
-
-
-PORTING GUIDE
--------------
-
-Step 1: decide how you like to implement the time services.
-
- a) does this board have a RTC? If yes, implement the two RTC funcs.
-
- b) does the CPU have counter/compare registers?
-
- If the answer is no, you need a timer to provide the timer interrupt
- at 100 HZ speed.
-
- c) The following sub steps assume your CPU has counter register.
- Do you plan to use the CPU counter register as the timer interrupt
- or use an exnternal timer?
-
- In order to use CPU counter register as the timer interrupt source, you
- must know the counter speed (mips_hpt_frequency). It is usually the
- same as the CPU speed or an integral divisor of it.
-
- d) decide on whether you want to use high-level or low-level timer
- interrupt routines. The low-level one is presumably faster, but should
- not make too mcuh difference.
-
-
-Step 2: the machine setup() function
-
- If you supply board_time_init(), set the function poointer.
-
-
-Step 3: implement rtc routines, board_time_init() and plat_timer_setup()
- if needed.
-
- board_time_init() -
- a) (optional) set up RTC routines,
- b) (optional) calibrate and set the mips_hpt_frequency
- (only needed if you intended to use cpu counter as timer interrupt
- source)
-
- plat_timer_setup() -
- a) (optional) over-write any choices made above by time_init().
- b) machine specific code should setup the timer irqaction.
- c) enable the timer interrupt
-
-
- If the RTC chip is a common chip, I suggest the routines are put under
- arch/mips/libs. For example, for DS1386 chip, one would create
- rtc-ds1386.c under arch/mips/lib directory. Add the following line to
- the arch/mips/lib/Makefile:
-
- obj-$(CONFIG_DDB5476) += rtc-ds1386.o
-
-Step 4: if you are using low-level timer interrupt, change your interrupt
- dispathcing code to check for timer interrupt and jump to
- ll_timer_interrupt() directly if one is detected.
-
-Step 5: Modify arch/mips/config.in and add CONFIG_NEW_TIME_C to your machine.
- Modify the appropriate defconfig if applicable.
-
-Final notes:
-
-For some tricky cases, you may need to add your own wrapper functions
-for some of the functions in time.c.
-
-For example, you may define your own timer interrupt routine, which does
-some of its own processing and then calls timer_interrupt().
-
-You can also over-ride any of the built-in functions (RTC routines
-and/or timer interrupt routine).
-
-
-PORTING NOTES FOR SMP
-----------------------
-
-If you have a SMP box, things are slightly more complicated.
-
-The time service running every jiffy is logically divided into two parts:
-
- 1) the one for the whole system (defined in timer_interrupt())
- 2) the one that should run for each CPU (defined in local_timer_interrupt())
-
-You need to decide on your timer interrupt sources.
-
- case 1) - whole system has only one timer interrupt delivered to one CPU
-
- In this case, you set up timer interrupt as in UP systems. In addtion,
- you need to set emulate_local_timer_interrupt to 1 so that other
- CPUs get to call local_timer_interrupt().
-
- THIS IS CURRENTLY NOT IMPLEMNETED. However, it is rather easy to write
- one should such a need arise. You simply make a IPI call.
-
- case 2) - each CPU has a separate timer interrupt
-
- In this case, you need to set up IRQ such that each of them will
- call local_timer_interrupt(). In addition, you need to arrange
- one and only one of them to call timer_interrupt().
-
- You can also do the low-level version of those interrupt routines,
- following similar dispatching routes described above.
diff --git a/Documentation/mutex-design.txt b/Documentation/mutex-design.txt
index cbf79881a41..aa60d1f627e 100644
--- a/Documentation/mutex-design.txt
+++ b/Documentation/mutex-design.txt
@@ -90,7 +90,8 @@ of advantages of mutexes:
* - task may not exit with mutex held
* - memory areas where held locks reside must not be freed
* - held mutexes must not be reinitialized
- * - mutexes may not be used in irq contexts
+ * - mutexes may not be used in hardware or software interrupt
+ * contexts such as tasklets and timers
furthermore, there are also convenience features in the debugging
code:
@@ -132,4 +133,6 @@ the APIs of 'struct mutex' have been streamlined:
int mutex_trylock(struct mutex *lock);
void mutex_unlock(struct mutex *lock);
int mutex_is_locked(struct mutex *lock);
-
+ void mutex_lock_nested(struct mutex *lock, unsigned int subclass);
+ int mutex_lock_interruptible_nested(struct mutex *lock,
+ unsigned int subclass);
diff --git a/Documentation/networking/bcm43xx.txt b/Documentation/networking/bcm43xx.txt
index a136721499b..d602c8d6ff3 100644
--- a/Documentation/networking/bcm43xx.txt
+++ b/Documentation/networking/bcm43xx.txt
@@ -37,7 +37,7 @@ all, distributions. There is, however, additional software that is
required. The firmware used by the chip is the intellectual property
of Broadcom and they have not given the bcm43xx team redistribution
rights to this firmware. Since we cannot legally redistribute
-the firwmare we cannot include it with the driver. Furthermore, it
+the firmware we cannot include it with the driver. Furthermore, it
cannot be placed in the downloadable archives of any distributing
organization; therefore, the user is responsible for obtaining the
firmware and placing it in the appropriate location so that the driver
diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt
index 1da56663083..11340625e36 100644
--- a/Documentation/networking/bonding.txt
+++ b/Documentation/networking/bonding.txt
@@ -281,6 +281,39 @@ downdelay
will be rounded down to the nearest multiple. The default
value is 0.
+fail_over_mac
+
+ Specifies whether active-backup mode should set all slaves to
+ the same MAC address (the traditional behavior), or, when
+ enabled, change the bond's MAC address when changing the
+ active interface (i.e., fail over the MAC address itself).
+
+ Fail over MAC is useful for devices that cannot ever alter
+ their MAC address, or for devices that refuse incoming
+ broadcasts with their own source MAC (which interferes with
+ the ARP monitor).
+
+ The down side of fail over MAC is that every device on the
+ network must be updated via gratuitous ARP, vs. just updating
+ a switch or set of switches (which often takes place for any
+ traffic, not just ARP traffic, if the switch snoops incoming
+ traffic to update its tables) for the traditional method. If
+ the gratuitous ARP is lost, communication may be disrupted.
+
+ When fail over MAC is used in conjunction with the mii monitor,
+ devices which assert link up prior to being able to actually
+ transmit and receive are particularly susceptible to loss of
+ the gratuitous ARP, and an appropriate updelay setting may be
+ required.
+
+ A value of 0 disables fail over MAC, and is the default. A
+ value of 1 enables fail over MAC. This option is enabled
+ automatically if the first slave added cannot change its MAC
+ address. This option may be modified via sysfs only when no
+ slaves are present in the bond.
+
+ This option was added in bonding version 3.2.0.
+
lacp_rate
Option specifying the rate in which we'll ask our link partner
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 6ae2feff308..747a5d15d52 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -293,7 +293,7 @@ tcp_no_metrics_save - BOOLEAN
when the connection closes, so that connections established in the
near future can use these to set initial conditions. Usually, this
increases overall performance, but may sometimes cause performance
- degredation. If set, TCP will not cache metrics on closing
+ degradation. If set, TCP will not cache metrics on closing
connections.
tcp_orphan_retries - INTEGER
diff --git a/Documentation/networking/proc_net_tcp.txt b/Documentation/networking/proc_net_tcp.txt
index 5e21f7cb638..4a79209e77a 100644
--- a/Documentation/networking/proc_net_tcp.txt
+++ b/Documentation/networking/proc_net_tcp.txt
@@ -1,8 +1,9 @@
This document describes the interfaces /proc/net/tcp and /proc/net/tcp6.
+Note that these interfaces are deprecated in favor of tcp_diag.
These /proc interfaces provide information about currently active TCP
-connections, and are implemented by tcp_get_info() in net/ipv4/tcp_ipv4.c and
-tcp6_get_info() in net/ipv6/tcp_ipv6.c, respectively.
+connections, and are implemented by tcp4_seq_show() in net/ipv4/tcp_ipv4.c
+and tcp6_seq_show() in net/ipv6/tcp_ipv6.c, respectively.
It will first list all listening TCP sockets, and next list all established
TCP connections. A typical entry of /proc/net/tcp would look like this (split
diff --git a/Documentation/networking/rxrpc.txt b/Documentation/networking/rxrpc.txt
index cae231b1c13..c3669a3fb4a 100644
--- a/Documentation/networking/rxrpc.txt
+++ b/Documentation/networking/rxrpc.txt
@@ -689,7 +689,7 @@ such as the AFS filesystem. This permits such a utility to:
buffers manipulated directly.
To use the RxRPC facility, a kernel utility must still open an AF_RXRPC socket,
-bind an addess as appropriate and listen if it's to be a server socket, but
+bind an address as appropriate and listen if it's to be a server socket, but
then it passes this to the kernel interface functions.
The kernel interface functions are as follows:
@@ -857,3 +857,10 @@ The kernel interface functions are as follows:
This is used to extract the error number from a message indicating either
a local error occurred or a network error occurred.
+
+ (*) Allocate a null key for doing anonymous security.
+
+ struct key *rxrpc_get_null_key(const char *keyname);
+
+ This is used to allocate a null RxRPC key that can be used to indicate
+ anonymous security for a particular domain.
diff --git a/Documentation/networking/udplite.txt b/Documentation/networking/udplite.txt
index 6be09ba24a3..b6409cab075 100644
--- a/Documentation/networking/udplite.txt
+++ b/Documentation/networking/udplite.txt
@@ -12,7 +12,7 @@
For in-depth information, you can consult:
o The UDP-Lite Homepage: http://www.erg.abdn.ac.uk/users/gerrit/udp-lite/
- Fom here you can also download some example application source code.
+ From here you can also download some example application source code.
o The UDP-Lite HOWTO on
http://www.erg.abdn.ac.uk/users/gerrit/udp-lite/files/UDP-Lite-HOWTO.txt
@@ -223,7 +223,7 @@
While it is important that such cases are dealt with correctly, they
are (annoyingly) rare: UDP-Lite is designed for optimising multimedia
performance over wireless (or generally noisy) links and thus smaller
- coverage lenghts are likely to be expected.
+ coverage lengths are likely to be expected.
V) UDP-LITE RUNTIME STATISTICS AND THEIR MEANING
@@ -259,7 +259,7 @@
VI) IPTABLES
There is packet match support for UDP-Lite as well as support for the LOG target.
- If you copy and paste the following line into /etc/protcols,
+ If you copy and paste the following line into /etc/protocols,
udplite 136 UDP-Lite # UDP-Lite [RFC 3828]
diff --git a/Documentation/parport-lowlevel.txt b/Documentation/parport-lowlevel.txt
index 8f2302415ef..265fcdcb8e5 100644
--- a/Documentation/parport-lowlevel.txt
+++ b/Documentation/parport-lowlevel.txt
@@ -25,7 +25,6 @@ Global functions:
parport_open
parport_close
parport_device_id
- parport_device_num
parport_device_coords
parport_find_class
parport_find_device
@@ -735,7 +734,7 @@ NULL is returned.
SEE ALSO
-parport_register_device, parport_device_num
+parport_register_device
parport_close - unregister device for particular device number
-------------
@@ -787,29 +786,7 @@ Many devices have ill-formed IEEE 1284 Device IDs.
SEE ALSO
-parport_find_class, parport_find_device, parport_device_num
-
-parport_device_num - convert device coordinates to device number
-------------------
-
-SYNOPSIS
-
-#include <linux/parport.h>
-
-int parport_device_num (int parport, int mux, int daisy);
-
-DESCRIPTION
-
-Convert between device coordinates (port, multiplexor, daisy chain
-address) and device number (zero-based).
-
-RETURN VALUE
-
-Device number, or -1 if no device at given coordinates.
-
-SEE ALSO
-
-parport_device_coords, parport_open, parport_device_id
+parport_find_class, parport_find_device
parport_device_coords - convert device number to device coordinates
------------------
@@ -833,7 +810,7 @@ Zero on success, in which case the coordinates are (*parport, *mux,
SEE ALSO
-parport_device_num, parport_open, parport_device_id
+parport_open, parport_device_id
parport_find_class - find a device by its class
------------------
diff --git a/Documentation/power/00-INDEX b/Documentation/power/00-INDEX
new file mode 100644
index 00000000000..8db4e41a052
--- /dev/null
+++ b/Documentation/power/00-INDEX
@@ -0,0 +1,34 @@
+00-INDEX
+ - This file
+basic-pm-debugging.txt
+ - Debugging suspend and resume
+devices.txt
+ - How drivers interact with system-wide power management
+drivers-testing.txt
+ - Testing suspend and resume support in device drivers
+freezing-of-tasks.txt
+ - How processes are controlled during suspend
+interface.txt
+ - Power management user interface in /sys/power
+notifiers.txt
+ - Registering suspend notifiers in device drivers
+pci.txt
+ - How the PCI Subsystem Does Power Management
+s2ram.txt
+ - How to get suspend to ram working (and debug it when it isn't)
+states.txt
+ - System power management states
+swsusp-and-swap-files.txt
+ - Using swap files with software suspend (to disk)
+swsusp-dmcrypt.txt
+ - How to use dm-crypt and software suspend (to disk) together
+swsusp.txt
+ - Goals, implementation, and usage of software suspend (ACPI S3)
+tricks.txt
+ - How to trick software suspend (to disk) into working when it isn't
+userland-swsusp.txt
+ - Experimental implementation of software suspend in userspace
+video_extension.txt
+ - ACPI video extensions
+video.txt
+ - Video issues during resume from suspend
diff --git a/Documentation/power/basic-pm-debugging.txt b/Documentation/power/basic-pm-debugging.txt
index 1a85e2b964d..57aef2f6e0d 100644
--- a/Documentation/power/basic-pm-debugging.txt
+++ b/Documentation/power/basic-pm-debugging.txt
@@ -78,8 +78,8 @@ c) Advanced debugging
In case the STD does not work on your system even in the minimal configuration
and compiling more drivers as modules is not practical or some modules cannot
be unloaded, you can use one of the more advanced debugging techniques to find
-the problem. First, if there is a serial port in your box, you can set the
-CONFIG_DISABLE_CONSOLE_SUSPEND kernel configuration option and try to log kernel
+the problem. First, if there is a serial port in your box, you can boot the
+kernel with the 'no_console_suspend' parameter and try to log kernel
messages using the serial console. This may provide you with some information
about the reasons of the suspend (resume) failure. Alternatively, it may be
possible to use a FireWire port for debugging with firescope
diff --git a/Documentation/power/drivers-testing.txt b/Documentation/power/drivers-testing.txt
index 33016c2f18d..e4bdcaee24e 100644
--- a/Documentation/power/drivers-testing.txt
+++ b/Documentation/power/drivers-testing.txt
@@ -14,8 +14,8 @@ the machine's BIOS.
Of course, for this purpose the test system has to be known to suspend and
resume without the driver being tested. Thus, if possible, you should first
resolve all suspend/resume-related problems in the test system before you start
-testing the new driver. Please see Documents/power/basic-pm-debugging.txt for
-more information about the debugging of suspend/resume functionality.
+testing the new driver. Please see Documentation/power/basic-pm-debugging.txt
+for more information about the debugging of suspend/resume functionality.
2. Testing the driver
diff --git a/Documentation/power/freezing-of-tasks.txt b/Documentation/power/freezing-of-tasks.txt
index 04dc1cf9d21..38b57248fd6 100644
--- a/Documentation/power/freezing-of-tasks.txt
+++ b/Documentation/power/freezing-of-tasks.txt
@@ -19,12 +19,13 @@ we only consider hibernation, but the description also applies to suspend).
Namely, as the first step of the hibernation procedure the function
freeze_processes() (defined in kernel/power/process.c) is called. It executes
try_to_freeze_tasks() that sets TIF_FREEZE for all of the freezable tasks and
-sends a fake signal to each of them. A task that receives such a signal and has
-TIF_FREEZE set, should react to it by calling the refrigerator() function
-(defined in kernel/power/process.c), which sets the task's PF_FROZEN flag,
-changes its state to TASK_UNINTERRUPTIBLE and makes it loop until PF_FROZEN is
-cleared for it. Then, we say that the task is 'frozen' and therefore the set of
-functions handling this mechanism is called 'the freezer' (these functions are
+either wakes them up, if they are kernel threads, or sends fake signals to them,
+if they are user space processes. A task that has TIF_FREEZE set, should react
+to it by calling the function called refrigerator() (defined in
+kernel/power/process.c), which sets the task's PF_FROZEN flag, changes its state
+to TASK_UNINTERRUPTIBLE and makes it loop until PF_FROZEN is cleared for it.
+Then, we say that the task is 'frozen' and therefore the set of functions
+handling this mechanism is referred to as 'the freezer' (these functions are
defined in kernel/power/process.c and include/linux/freezer.h). User space
processes are generally frozen before kernel threads.
@@ -35,21 +36,27 @@ task enter refrigerator() if the flag is set.
For user space processes try_to_freeze() is called automatically from the
signal-handling code, but the freezable kernel threads need to call it
-explicitly in suitable places. The code to do this may look like the following:
+explicitly in suitable places or use the wait_event_freezable() or
+wait_event_freezable_timeout() macros (defined in include/linux/freezer.h)
+that combine interruptible sleep with checking if TIF_FREEZE is set and calling
+try_to_freeze(). The main loop of a freezable kernel thread may look like the
+following one:
+ set_freezable();
do {
hub_events();
- wait_event_interruptible(khubd_wait,
- !list_empty(&hub_event_list));
- try_to_freeze();
- } while (!signal_pending(current));
+ wait_event_freezable(khubd_wait,
+ !list_empty(&hub_event_list) ||
+ kthread_should_stop());
+ } while (!kthread_should_stop() || !list_empty(&hub_event_list));
(from drivers/usb/core/hub.c::hub_thread()).
If a freezable kernel thread fails to call try_to_freeze() after the freezer has
set TIF_FREEZE for it, the freezing of tasks will fail and the entire
hibernation operation will be cancelled. For this reason, freezable kernel
-threads must call try_to_freeze() somewhere.
+threads must call try_to_freeze() somewhere or use one of the
+wait_event_freezable() and wait_event_freezable_timeout() macros.
After the system memory state has been restored from a hibernation image and
devices have been reinitialized, the function thaw_processes() is called in
@@ -81,7 +88,16 @@ hibernation image has been created and before the system is finally powered off.
The majority of these are user space processes, but if any of the kernel threads
may cause something like this to happen, they have to be freezable.
-2. The second reason is to prevent user space processes and some kernel threads
+2. Next, to create the hibernation image we need to free a sufficient amount of
+memory (approximately 50% of available RAM) and we need to do that before
+devices are deactivated, because we generally need them for swapping out. Then,
+after the memory for the image has been freed, we don't want tasks to allocate
+additional memory and we prevent them from doing that by freezing them earlier.
+[Of course, this also means that device drivers should not allocate substantial
+amounts of memory from their .suspend() callbacks before hibernation, but this
+is a separate issue.]
+
+3. The third reason is to prevent user space processes and some kernel threads
from interfering with the suspending and resuming of devices. A user space
process running on a second CPU while we are suspending devices may, for
example, be troublesome and without the freezing of tasks we would need some
@@ -111,7 +127,7 @@ frozen before the driver's .suspend() callback is executed and it will be
thawed after the driver's .resume() callback has run, so it won't be accessing
the device while it's suspended.
-3. Another reason for freezing tasks is to prevent user space processes from
+4. Another reason for freezing tasks is to prevent user space processes from
realizing that hibernation (or suspend) operation takes place. Ideally, user
space processes should not notice that such a system-wide operation has occurred
and should continue running without any problems after the restore (or resume
diff --git a/Documentation/power/interface.txt b/Documentation/power/interface.txt
index fd5192a8fa8..e67211fe0ee 100644
--- a/Documentation/power/interface.txt
+++ b/Documentation/power/interface.txt
@@ -20,7 +20,7 @@ states.
/sys/power/disk controls the operating mode of the suspend-to-disk
mechanism. Suspend-to-disk can be handled in several ways. We have a
few options for putting the system to sleep - using the platform driver
-(e.g. ACPI or other pm_ops), powering off the system or rebooting the
+(e.g. ACPI or other suspend_ops), powering off the system or rebooting the
system (for testing).
Additionally, /sys/power/disk can be used to turn on one of the two testing
diff --git a/Documentation/power/swsusp-and-swap-files.txt b/Documentation/power/swsusp-and-swap-files.txt
index 06f911a5f88..f281886de49 100644
--- a/Documentation/power/swsusp-and-swap-files.txt
+++ b/Documentation/power/swsusp-and-swap-files.txt
@@ -39,7 +39,7 @@ resume=<swap_file_partition> resume_offset=<swap_file_offset>
where <swap_file_partition> is the partition on which the swap file is located
and <swap_file_offset> is the offset of the swap header determined by the
application in 2) (of course, this step may be carried out automatically
-by the same application that determies the swap file's header offset using the
+by the same application that determines the swap file's header offset using the
FIBMAP ioctl)
OR
diff --git a/Documentation/powerpc/00-INDEX b/Documentation/powerpc/00-INDEX
index d6d65b9bcfe..94a3c577b08 100644
--- a/Documentation/powerpc/00-INDEX
+++ b/Documentation/powerpc/00-INDEX
@@ -5,6 +5,8 @@ please mail me.
00-INDEX
- this file
+booting-without-of.txt
+ - Booting the Linux/ppc kernel without Open Firmware
cpu_features.txt
- info on how we support a variety of CPUs with minimal compile-time
options.
@@ -14,6 +16,8 @@ hvcs.txt
- IBM "Hypervisor Virtual Console Server" Installation Guide
mpc52xx.txt
- Linux 2.6.x on MPC52xx family
+mpc52xx-device-tree-bindings.txt
+ - MPC5200 Device Tree Bindings
ppc_htab.txt
- info about the Linux/PPC /proc/ppc_htab entry
SBC8260_memory_mapping.txt
diff --git a/Documentation/powerpc/eeh-pci-error-recovery.txt b/Documentation/powerpc/eeh-pci-error-recovery.txt
index 4530d1bf028..df7afe43d46 100644
--- a/Documentation/powerpc/eeh-pci-error-recovery.txt
+++ b/Documentation/powerpc/eeh-pci-error-recovery.txt
@@ -36,8 +36,8 @@ Causes of EEH Errors
EEH was originally designed to guard against hardware failure, such
as PCI cards dying from heat, humidity, dust, vibration and bad
electrical connections. The vast majority of EEH errors seen in
-"real life" are due to eithr poorly seated PCI cards, or,
-unfortunately quite commonly, due device driver bugs, device firmware
+"real life" are due to either poorly seated PCI cards, or,
+unfortunately quite commonly, due to device driver bugs, device firmware
bugs, and sometimes PCI card hardware bugs.
The most common software bug, is one that causes the device to
diff --git a/Documentation/powerpc/mpc52xx-device-tree-bindings.txt b/Documentation/powerpc/mpc52xx-device-tree-bindings.txt
index e59fcbbe338..5e03610e186 100644
--- a/Documentation/powerpc/mpc52xx-device-tree-bindings.txt
+++ b/Documentation/powerpc/mpc52xx-device-tree-bindings.txt
@@ -17,12 +17,12 @@ passed by the boot loader to the kernel at boot time. The device tree
describes what devices are present on the board and how they are
connected. The device tree can either be passed as a binary blob (as
described in Documentation/powerpc/booting-without-of.txt), or passed
-by Open Firmare (IEEE 1275) compatible firmware using an OF compatible
+by Open Firmware (IEEE 1275) compatible firmware using an OF compatible
client interface API.
This document specifies the requirements on the device-tree for mpc5200
based boards. These requirements are above and beyond the details
-specified in either the OpenFirmware spec or booting-without-of.txt
+specified in either the Open Firmware spec or booting-without-of.txt
All new mpc5200-based boards are expected to match this document. In
cases where this document is not sufficient to support a new board port,
@@ -73,8 +73,8 @@ match on the compatible list; the 'most compatible' driver should be
selected.
The split between the MPC5200 and the MPC5200B leaves a bit of a
-connundrum. How should the compatible property be set up to provide
-maximum compatability information; but still acurately describe the
+conundrum. How should the compatible property be set up to provide
+maximum compatibility information; but still accurately describe the
chip? For the MPC5200; the answer is easy. Most of the SoC devices
originally appeared on the MPC5200. Since they didn't exist anywhere
else; the 5200 compatible properties will contain only one item;
@@ -84,7 +84,7 @@ The 5200B is almost the same as the 5200, but not quite. It fixes
silicon bugs and it adds a small number of enhancements. Most of the
devices either provide exactly the same interface as on the 5200. A few
devices have extra functions but still have a backwards compatible mode.
-To express this infomation as completely as possible, 5200B device trees
+To express this information as completely as possible, 5200B device trees
should have two items in the compatible list;
"mpc5200b-<device>\0mpc5200-<device>". It is *strongly* recommended
that 5200B device trees follow this convention (instead of only listing
@@ -185,7 +185,7 @@ bestcomm@<addr> dma-controller mpc5200-bestcomm 5200 pic also requires
Recommended soc5200 child nodes; populate as needed for your board
name device_type compatible Description
---- ----------- ---------- -----------
-gpt@<addr> gpt mpc5200-gpt General purpose timers
+gpt@<addr> gpt fsl,mpc5200-gpt General purpose timers
rtc@<addr> rtc mpc5200-rtc Real time clock
mscan@<addr> mscan mpc5200-mscan CAN bus controller
pci@<addr> pci mpc5200-pci PCI bridge
@@ -199,7 +199,7 @@ ethernet@<addr> network mpc5200-fec MPC5200 ethernet device
ata@<addr> ata mpc5200-ata IDE ATA interface
i2c@<addr> i2c mpc5200-i2c I2C controller
usb@<addr> usb-ohci-be mpc5200-ohci,ohci-be USB controller
-xlb@<addr> xlb mpc5200-xlb XLB arbritrator
+xlb@<addr> xlb mpc5200-xlb XLB arbitrator
Important child node properties
name type description
@@ -213,7 +213,7 @@ cell-index int When multiple devices are present, is the
5) General Purpose Timer nodes (child of soc5200 node)
On the mpc5200 and 5200b, GPT0 has a watchdog timer function. If the board
design supports the internal wdt, then the device node for GPT0 should
-include the empty property 'has-wdt'.
+include the empty property 'fsl,has-wdt'.
6) PSC nodes (child of soc5200 node)
PSC nodes can define the optional 'port-number' property to force assignment
diff --git a/Documentation/ramdisk.txt b/Documentation/ramdisk.txt
index 52f75b7d51c..6c820baa19a 100644
--- a/Documentation/ramdisk.txt
+++ b/Documentation/ramdisk.txt
@@ -22,16 +22,14 @@ The RAM disk dynamically grows as more space is required. It does this by using
RAM from the buffer cache. The driver marks the buffers it is using as dirty
so that the VM subsystem does not try to reclaim them later.
-Also, the RAM disk supports up to 16 RAM disks out of the box, and can
-be reconfigured to support up to 255 RAM disks - change "#define NUM_RAMDISKS"
-in drivers/block/rd.c. To use RAM disk support with your system, run
-'./MAKEDEV ram' from the /dev directory. RAM disks are all major number 1, and
-start with minor number 0 for /dev/ram0, etc. If used, modern kernels use
-/dev/ram0 for an initrd.
-
-The old "ramdisk=<ram_size>" has been changed to "ramdisk_size=<ram_size>" to
-make it clearer. The original "ramdisk=<ram_size>" has been kept around for
-compatibility reasons, but it may be removed in the future.
+The RAM disk supports up to 16 RAM disks by default, and can be reconfigured
+to support an unlimited number of RAM disks (at your own risk). Just change
+the configuration symbol BLK_DEV_RAM_COUNT in the Block drivers config menu
+and (re)build the kernel.
+
+To use RAM disk support with your system, run './MAKEDEV ram' from the /dev
+directory. RAM disks are all major number 1, and start with minor number 0
+for /dev/ram0, etc. If used, modern kernels use /dev/ram0 for an initrd.
The new RAM disk also has the ability to load compressed RAM disk images,
allowing one to squeeze more programs onto an average installation or
diff --git a/Documentation/sched-design-CFS.txt b/Documentation/sched-design-CFS.txt
index 84901e7c050..88bcb876733 100644
--- a/Documentation/sched-design-CFS.txt
+++ b/Documentation/sched-design-CFS.txt
@@ -117,3 +117,70 @@ Some implementation details:
iterators of the scheduling modules are used. The balancing code got
quite a bit simpler as a result.
+
+Group scheduler extension to CFS
+================================
+
+Normally the scheduler operates on individual tasks and strives to provide
+fair CPU time to each task. Sometimes, it may be desirable to group tasks
+and provide fair CPU time to each such task group. For example, it may
+be desirable to first provide fair CPU time to each user on the system
+and then to each task belonging to a user.
+
+CONFIG_FAIR_GROUP_SCHED strives to achieve exactly that. It lets
+SCHED_NORMAL/BATCH tasks be grouped and divides CPU time fairly among such
+groups. At present, there are two (mutually exclusive) mechanisms to group
+tasks for CPU bandwidth control purpose:
+
+ - Based on user id (CONFIG_FAIR_USER_SCHED)
+ In this option, tasks are grouped according to their user id.
+ - Based on "cgroup" pseudo filesystem (CONFIG_FAIR_CGROUP_SCHED)
+ This option lets the administrator create arbitrary groups
+ of tasks, using the "cgroup" pseudo filesystem. See
+ Documentation/cgroups.txt for more information about this
+ filesystem.
+
+Only one of these options to group tasks can be chosen and not both.
+
+Group scheduler tunables:
+
+When CONFIG_FAIR_USER_SCHED is defined, a directory is created in sysfs for
+each new user and a "cpu_share" file is added in that directory.
+
+ # cd /sys/kernel/uids
+ # cat 512/cpu_share # Display user 512's CPU share
+ 1024
+ # echo 2048 > 512/cpu_share # Modify user 512's CPU share
+ # cat 512/cpu_share # Display user 512's CPU share
+ 2048
+ #
+
+CPU bandwidth between two users is divided in the ratio of their CPU shares.
+For example, if you would like user "root" to get twice the bandwidth of user
+"guest", then set the cpu_share for both the users such that "root"'s
+cpu_share is twice "guest"'s cpu_share.
+
+
+When CONFIG_FAIR_CGROUP_SCHED is defined, a "cpu.shares" file is created
+for each group created using the pseudo filesystem. See example steps
+below to create task groups and modify their CPU share using the "cgroups"
+pseudo filesystem
+
+ # mkdir /dev/cpuctl
+ # mount -t cgroup -ocpu none /dev/cpuctl
+ # cd /dev/cpuctl
+
+ # mkdir multimedia # create "multimedia" group of tasks
+ # mkdir browser # create "browser" group of tasks
+
+ # #Configure the multimedia group to receive twice the CPU bandwidth
+ # #of the browser group
+
+ # echo 2048 > multimedia/cpu.shares
+ # echo 1024 > browser/cpu.shares
+
+ # firefox & # Launch firefox and move it to "browser" group
+ # echo <firefox_pid> > browser/tasks
+
+ # #Launch gmplayer (or your favourite movie player)
+ # echo <movie_player_pid> > multimedia/tasks
diff --git a/Documentation/scsi/00-INDEX b/Documentation/scsi/00-INDEX
index 12354830c6b..aa1f7e92783 100644
--- a/Documentation/scsi/00-INDEX
+++ b/Documentation/scsi/00-INDEX
@@ -2,14 +2,20 @@
- this file
53c700.txt
- info on driver for 53c700 based adapters
-AM53C974.txt
- - info on driver for AM53c974 based adapters
BusLogic.txt
- info on driver for adapters with BusLogic chips
-ChangeLog
+ChangeLog.1992-1997
- Changes to scsi files, if not listed elsewhere
+ChangeLog.arcmsr
+ - Changes to driver for ARECA's SATA RAID controller cards
ChangeLog.ips
- IBM ServeRAID driver Changelog
+ChangeLog.lpfc
+ - Changes to lpfc driver
+ChangeLog.megaraid
+ - Changes to LSI megaraid controller.
+ChangeLog.megaraid_sas
+ - Changes to serial attached scsi version of LSI megaraid controller.
ChangeLog.ncr53c8xx
- Changes to ncr53c8xx driver
ChangeLog.sym53c8xx
@@ -20,26 +26,44 @@ FlashPoint.txt
- info on driver for BusLogic FlashPoint adapters
LICENSE.FlashPoint
- Licence of the Flashpoint driver
+LICENSE.qla2xxx
+ - License for QLogic Linux Fibre Channel HBA Driver firmware.
Mylex.txt
- info on driver for Mylex adapters
NinjaSCSI.txt
- info on WorkBiT NinjaSCSI-32/32Bi driver
+aacraid.txt
+ - Driver supporting Adaptec RAID controllers
aha152x.txt
- info on driver for Adaptec AHA152x based adapters
+aic79xx.txt
+ - Adaptec Ultra320 SCSI host adapters
aic7xxx.txt
- info on driver for Adaptec controllers
aic7xxx_old.txt
- info on driver for Adaptec controllers, old generation
+arcmsr_spec.txt
+ - ARECA FIRMWARE SPEC (for IOP331 adapter)
+dc395x.txt
+ - README file for the dc395x SCSI driver
dpti.txt
- info on driver for DPT SmartRAID and Adaptec I2O RAID based adapters
dtc3x80.txt
- info on driver for DTC 2x80 based adapters
g_NCR5380.txt
- info on driver for NCR5380 and NCR53c400 based adapters
+hptiop.txt
+ - HIGHPOINT ROCKETRAID 3xxx RAID DRIVER
ibmmca.txt
- info on driver for IBM adapters with MCA bus
in2000.txt
- info on in2000 driver
+libsas.txt
+ - Serial Attached SCSI management layer.
+lpfc.txt
+ - LPFC driver release notes
+megaraid.txt
+ - Common Management Module, shared code handling ioctls for LSI drivers
ncr53c7xx.txt
- info on driver for NCR53c7xx based adapters
ncr53c8xx.txt
@@ -50,6 +74,8 @@ ppa.txt
- info on driver for IOmega zip drive
qlogicfas.txt
- info on driver for QLogic FASxxx based adapters
+scsi-changer.txt
+ - README for the SCSI media changer driver
scsi-generic.txt
- info on the sg driver for generic (non-disk/CD/tape) SCSI devices.
scsi.txt
@@ -58,6 +84,8 @@ scsi_mid_low_api.txt
- info on API between SCSI layer and low level drivers
scsi_eh.txt
- info on SCSI midlayer error handling infrastructure
+scsi_fc_transport.txt
+ - SCSI Fibre Channel Transport
st.txt
- info on scsi tape driver
sym53c500_cs.txt
diff --git a/Documentation/scsi/ChangeLog.arcmsr b/Documentation/scsi/ChangeLog.arcmsr
index 162c47fdf45..cd8403a33ee 100644
--- a/Documentation/scsi/ChangeLog.arcmsr
+++ b/Documentation/scsi/ChangeLog.arcmsr
@@ -53,4 +53,19 @@
** for linux standard list
** enable usage of pci message signal interrupt
** follow Randy.Danlup kindness suggestion cleanup this code
-************************************************************************** \ No newline at end of file
+** 1.20.00.14 05/02/2007 Erich Chen & Nick Cheng
+** 1.implement PCI-Express error recovery function and AER capability
+** 2.implement the selection of ARCMSR_MAX_XFER_SECTORS_B=4096
+** if firmware version is newer than 1.42
+** 3.modify arcmsr_iop_reset to improve the ability
+** 4.modify the ISR, arcmsr_interrupt routine,to prevent the
+** inconsistency with sg_mod driver if application directly calls
+** the arcmsr driver w/o passing through scsi mid layer
+** specially thanks to Yanmin Zhang's openhanded help about AER
+** 1.20.00.15 08/30/2007 Erich Chen & Nick Cheng
+** 1. support ARC1200/1201/1202 SATA RAID adapter, which is named
+** ACB_ADAPTER_TYPE_B
+** 2. modify the arcmsr_pci_slot_reset function
+** 3. modify the arcmsr_pci_ers_disconnect_forepart function
+** 4. modify the arcmsr_pci_ers_need_reset_forepart function
+**************************************************************************
diff --git a/Documentation/scsi/ChangeLog.ncr53c8xx b/Documentation/scsi/ChangeLog.ncr53c8xx
index 7d03e9d5b5f..a9f721aeb11 100644
--- a/Documentation/scsi/ChangeLog.ncr53c8xx
+++ b/Documentation/scsi/ChangeLog.ncr53c8xx
@@ -195,9 +195,9 @@ Sun Feb 14:00 1999 Gerard Roudier (groudier@club-internet.fr)
Pointed out by Leonard Zubkoff.
- Allow to tune request_irq() flags from the boot command line using
ncr53c8xx=irqm:??, as follows:
- a) If bit 0x10 is set in irqm, SA_SHIRQ flag is not used.
- b) If bit 0x20 is set in irqm, SA_INTERRUPT flag is not used.
- By default the driver uses both SA_SHIRQ and SA_INTERRUPT.
+ a) If bit 0x10 is set in irqm, IRQF_SHARED flag is not used.
+ b) If bit 0x20 is set in irqm, IRQF_DISABLED flag is not used.
+ By default the driver uses both IRQF_SHARED and IRQF_DISABLED.
Option 'ncr53c8xx=irqm:0x20' may be used when an IRQ is shared by
a 53C8XX adapter and a network board.
- Tiny mispelling fixed (ABORT instead of ABRT). Was fortunately
diff --git a/Documentation/scsi/aacraid.txt b/Documentation/scsi/aacraid.txt
index cc12b55d4b3..a8257840695 100644
--- a/Documentation/scsi/aacraid.txt
+++ b/Documentation/scsi/aacraid.txt
@@ -38,10 +38,8 @@ Supported Cards/Chipsets
9005:0286:9005:02ac Adaptec 1800 (Typhoon44)
9005:0285:9005:02b5 Adaptec 5445 (Voodoo44)
9005:0285:15d9:02b5 SMC AOC-USAS-S4i
- 9005:0285:15d9:02c9 SMC AOC-USAS-S4iR
9005:0285:9005:02b6 Adaptec 5805 (Voodoo80)
9005:0285:15d9:02b6 SMC AOC-USAS-S8i
- 9005:0285:15d9:02ca SMC AOC-USAS-S8iR
9005:0285:9005:02b7 Adaptec 5085 (Voodoo08)
9005:0285:9005:02bb Adaptec 3405 (Marauder40LP)
9005:0285:9005:02bc Adaptec 3805 (Marauder80LP)
@@ -50,9 +48,14 @@ Supported Cards/Chipsets
9005:0285:9005:02be Adaptec 31605 (Marauder160)
9005:0285:9005:02c3 Adaptec 51205 (Voodoo120)
9005:0285:9005:02c4 Adaptec 51605 (Voodoo160)
+ 9005:0285:15d9:02c9 SMC AOC-USAS-S4iR
+ 9005:0285:15d9:02ca SMC AOC-USAS-S8iR
9005:0285:9005:02ce Adaptec 51245 (Voodoo124)
9005:0285:9005:02cf Adaptec 51645 (Voodoo164)
9005:0285:9005:02d0 Adaptec 52445 (Voodoo244)
+ 9005:0285:9005:02d1 Adaptec 5405 (Voodoo40)
+ 9005:0285:15d9:02d2 SMC AOC-USAS-S8i-LP
+ 9005:0285:15d9:02d3 SMC AOC-USAS-S8iR-LP
1011:0046:9005:0364 Adaptec 5400S (Mustang)
9005:0287:9005:0800 Adaptec Themisto (Jupiter)
9005:0200:9005:0200 Adaptec Themisto (Jupiter)
@@ -103,6 +106,7 @@ Supported Cards/Chipsets
9005:0285:108e:7aac SUN STK RAID REM (Voodoo44 Coyote)
9005:0285:108e:0286 SUN STK RAID INT (Cougar)
9005:0285:108e:0287 SUN STK RAID EXT (Prometheus)
+ 9005:0285:108e:7aae SUN STK RAID EM (Narvi)
People
-------------------------
diff --git a/Documentation/scsi/advansys.txt b/Documentation/scsi/advansys.txt
new file mode 100644
index 00000000000..4a3db62b742
--- /dev/null
+++ b/Documentation/scsi/advansys.txt
@@ -0,0 +1,243 @@
+AdvanSys (Advanced System Products, Inc.) manufactures the following
+RISC-based, Bus-Mastering, Fast (10 MHz) and Ultra (20 MHz) Narrow
+(8-bit transfer) SCSI Host Adapters for the ISA, EISA, VL, and PCI
+buses and RISC-based, Bus-Mastering, Ultra (20 MHz) Wide (16-bit
+transfer) SCSI Host Adapters for the PCI bus.
+
+The CDB counts below indicate the number of SCSI CDB (Command
+Descriptor Block) requests that can be stored in the RISC chip
+cache and board LRAM. A CDB is a single SCSI command. The driver
+detect routine will display the number of CDBs available for each
+adapter detected. The number of CDBs used by the driver can be
+lowered in the BIOS by changing the 'Host Queue Size' adapter setting.
+
+Laptop Products:
+ ABP-480 - Bus-Master CardBus (16 CDB)
+
+Connectivity Products:
+ ABP510/5150 - Bus-Master ISA (240 CDB)
+ ABP5140 - Bus-Master ISA PnP (16 CDB)
+ ABP5142 - Bus-Master ISA PnP with floppy (16 CDB)
+ ABP902/3902 - Bus-Master PCI (16 CDB)
+ ABP3905 - Bus-Master PCI (16 CDB)
+ ABP915 - Bus-Master PCI (16 CDB)
+ ABP920 - Bus-Master PCI (16 CDB)
+ ABP3922 - Bus-Master PCI (16 CDB)
+ ABP3925 - Bus-Master PCI (16 CDB)
+ ABP930 - Bus-Master PCI (16 CDB)
+ ABP930U - Bus-Master PCI Ultra (16 CDB)
+ ABP930UA - Bus-Master PCI Ultra (16 CDB)
+ ABP960 - Bus-Master PCI MAC/PC (16 CDB)
+ ABP960U - Bus-Master PCI MAC/PC Ultra (16 CDB)
+
+Single Channel Products:
+ ABP542 - Bus-Master ISA with floppy (240 CDB)
+ ABP742 - Bus-Master EISA (240 CDB)
+ ABP842 - Bus-Master VL (240 CDB)
+ ABP940 - Bus-Master PCI (240 CDB)
+ ABP940U - Bus-Master PCI Ultra (240 CDB)
+ ABP940UA/3940UA - Bus-Master PCI Ultra (240 CDB)
+ ABP970 - Bus-Master PCI MAC/PC (240 CDB)
+ ABP970U - Bus-Master PCI MAC/PC Ultra (240 CDB)
+ ABP3960UA - Bus-Master PCI MAC/PC Ultra (240 CDB)
+ ABP940UW/3940UW - Bus-Master PCI Ultra-Wide (253 CDB)
+ ABP970UW - Bus-Master PCI MAC/PC Ultra-Wide (253 CDB)
+ ABP3940U2W - Bus-Master PCI LVD/Ultra2-Wide (253 CDB)
+
+Multi-Channel Products:
+ ABP752 - Dual Channel Bus-Master EISA (240 CDB Per Channel)
+ ABP852 - Dual Channel Bus-Master VL (240 CDB Per Channel)
+ ABP950 - Dual Channel Bus-Master PCI (240 CDB Per Channel)
+ ABP950UW - Dual Channel Bus-Master PCI Ultra-Wide (253 CDB Per Channel)
+ ABP980 - Four Channel Bus-Master PCI (240 CDB Per Channel)
+ ABP980U - Four Channel Bus-Master PCI Ultra (240 CDB Per Channel)
+ ABP980UA/3980UA - Four Channel Bus-Master PCI Ultra (16 CDB Per Chan.)
+ ABP3950U2W - Bus-Master PCI LVD/Ultra2-Wide and Ultra-Wide (253 CDB)
+ ABP3950U3W - Bus-Master PCI Dual LVD2/Ultra3-Wide (253 CDB)
+
+Driver Compile Time Options and Debugging
+
+The following constants can be defined in the source file.
+
+1. ADVANSYS_ASSERT - Enable driver assertions (Def: Enabled)
+
+ Enabling this option adds assertion logic statements to the
+ driver. If an assertion fails a message will be displayed to
+ the console, but the system will continue to operate. Any
+ assertions encountered should be reported to the person
+ responsible for the driver. Assertion statements may proactively
+ detect problems with the driver and facilitate fixing these
+ problems. Enabling assertions will add a small overhead to the
+ execution of the driver.
+
+2. ADVANSYS_DEBUG - Enable driver debugging (Def: Disabled)
+
+ Enabling this option adds tracing functions to the driver and the
+ ability to set a driver tracing level at boot time. This option is
+ very useful for debugging the driver, but it will add to the size
+ of the driver execution image and add overhead to the execution of
+ the driver.
+
+ The amount of debugging output can be controlled with the global
+ variable 'asc_dbglvl'. The higher the number the more output. By
+ default the debug level is 0.
+
+ If the driver is loaded at boot time and the LILO Driver Option
+ is included in the system, the debug level can be changed by
+ specifying a 5th (ASC_NUM_IOPORT_PROBE + 1) I/O Port. The
+ first three hex digits of the pseudo I/O Port must be set to
+ 'deb' and the fourth hex digit specifies the debug level: 0 - F.
+ The following command line will look for an adapter at 0x330
+ and set the debug level to 2.
+
+ linux advansys=0x330,0,0,0,0xdeb2
+
+ If the driver is built as a loadable module this variable can be
+ defined when the driver is loaded. The following insmod command
+ will set the debug level to one.
+
+ insmod advansys.o asc_dbglvl=1
+
+ Debugging Message Levels:
+ 0: Errors Only
+ 1: High-Level Tracing
+ 2-N: Verbose Tracing
+
+ To enable debug output to console, please make sure that:
+
+ a. System and kernel logging is enabled (syslogd, klogd running).
+ b. Kernel messages are routed to console output. Check
+ /etc/syslog.conf for an entry similar to this:
+
+ kern.* /dev/console
+
+ c. klogd is started with the appropriate -c parameter
+ (e.g. klogd -c 8)
+
+ This will cause printk() messages to be displayed on the
+ current console. Refer to the klogd(8) and syslogd(8) man pages
+ for details.
+
+ Alternatively you can enable printk() to console with this
+ program. However, this is not the 'official' way to do this.
+ Debug output is logged in /var/log/messages.
+
+ main()
+ {
+ syscall(103, 7, 0, 0);
+ }
+
+ Increasing LOG_BUF_LEN in kernel/printk.c to something like
+ 40960 allows more debug messages to be buffered in the kernel
+ and written to the console or log file.
+
+3. ADVANSYS_STATS - Enable statistics (Def: Enabled)
+
+ Enabling this option adds statistics collection and display
+ through /proc to the driver. The information is useful for
+ monitoring driver and device performance. It will add to the
+ size of the driver execution image and add minor overhead to
+ the execution of the driver.
+
+ Statistics are maintained on a per adapter basis. Driver entry
+ point call counts and transfer size counts are maintained.
+ Statistics are only available for kernels greater than or equal
+ to v1.3.0 with the CONFIG_PROC_FS (/proc) file system configured.
+
+ AdvanSys SCSI adapter files have the following path name format:
+
+ /proc/scsi/advansys/{0,1,2,3,...}
+
+ This information can be displayed with cat. For example:
+
+ cat /proc/scsi/advansys/0
+
+ When ADVANSYS_STATS is not defined the AdvanSys /proc files only
+ contain adapter and device configuration information.
+
+Driver LILO Option
+
+If init/main.c is modified as described in the 'Directions for Adding
+the AdvanSys Driver to Linux' section (B.4.) above, the driver will
+recognize the 'advansys' LILO command line and /etc/lilo.conf option.
+This option can be used to either disable I/O port scanning or to limit
+scanning to 1 - 4 I/O ports. Regardless of the option setting EISA and
+PCI boards will still be searched for and detected. This option only
+affects searching for ISA and VL boards.
+
+Examples:
+ 1. Eliminate I/O port scanning:
+ boot: linux advansys=
+ or
+ boot: linux advansys=0x0
+ 2. Limit I/O port scanning to one I/O port:
+ boot: linux advansys=0x110
+ 3. Limit I/O port scanning to four I/O ports:
+ boot: linux advansys=0x110,0x210,0x230,0x330
+
+For a loadable module the same effect can be achieved by setting
+the 'asc_iopflag' variable and 'asc_ioport' array when loading
+the driver, e.g.
+
+ insmod advansys.o asc_iopflag=1 asc_ioport=0x110,0x330
+
+If ADVANSYS_DEBUG is defined a 5th (ASC_NUM_IOPORT_PROBE + 1)
+I/O Port may be added to specify the driver debug level. Refer to
+the 'Driver Compile Time Options and Debugging' section above for
+more information.
+
+Credits (Chronological Order)
+
+Bob Frey <bfrey@turbolinux.com.cn> wrote the AdvanSys SCSI driver
+and maintained it up to 3.3F. He continues to answer questions
+and help maintain the driver.
+
+Nathan Hartwell <mage@cdc3.cdc.net> provided the directions and
+basis for the Linux v1.3.X changes which were included in the
+1.2 release.
+
+Thomas E Zerucha <zerucha@shell.portal.com> pointed out a bug
+in advansys_biosparam() which was fixed in the 1.3 release.
+
+Erik Ratcliffe <erik@caldera.com> has done testing of the
+AdvanSys driver in the Caldera releases.
+
+Rik van Riel <H.H.vanRiel@fys.ruu.nl> provided a patch to
+AscWaitTixISRDone() which he found necessary to make the
+driver work with a SCSI-1 disk.
+
+Mark Moran <mmoran@mmoran.com> has helped test Ultra-Wide
+support in the 3.1A driver.
+
+Doug Gilbert <dgilbert@interlog.com> has made changes and
+suggestions to improve the driver and done a lot of testing.
+
+Ken Mort <ken@mort.net> reported a DEBUG compile bug fixed
+in 3.2K.
+
+Tom Rini <trini@kernel.crashing.org> provided the CONFIG_ISA
+patch and helped with PowerPC wide and narrow board support.
+
+Philip Blundell <philb@gnu.org> provided an
+advansys_interrupts_enabled patch.
+
+Dave Jones <dave@denial.force9.co.uk> reported the compiler
+warnings generated when CONFIG_PROC_FS was not defined in
+the 3.2M driver.
+
+Jerry Quinn <jlquinn@us.ibm.com> fixed PowerPC support (endian
+problems) for wide cards.
+
+Bryan Henderson <bryanh@giraffe-data.com> helped debug narrow
+card error handling.
+
+Manuel Veloso <veloso@pobox.com> worked hard on PowerPC narrow
+board support and fixed a bug in AscGetEEPConfig().
+
+Arnaldo Carvalho de Melo <acme@conectiva.com.br> made
+save_flags/restore_flags changes.
+
+Andy Kellner <AKellner@connectcom.net> continued the Advansys SCSI
+driver development for ConnectCom (Version > 3.3F).
+
+Ken Witherow for extensive testing during the development of version 3.4.
diff --git a/Documentation/scsi/aic79xx.txt b/Documentation/scsi/aic79xx.txt
index 6aa9a891f3d..683ccae00ad 100644
--- a/Documentation/scsi/aic79xx.txt
+++ b/Documentation/scsi/aic79xx.txt
@@ -120,7 +120,7 @@ The following information is available in this file:
list size to avoid SCSI malloc pool fragmentation.
- Cleanup channel display in our /proc output.
- Workaround duplicate device entries in the mid-layer
- devlice list during add-single-device.
+ device list during add-single-device.
1.3.6 (March 28th, 2003)
- Correct a double free in the Domain Validation code.
diff --git a/Documentation/scsi/aic7xxx.txt b/Documentation/scsi/aic7xxx.txt
index 5f34d2ba69b..b7e238cbb5a 100644
--- a/Documentation/scsi/aic7xxx.txt
+++ b/Documentation/scsi/aic7xxx.txt
@@ -159,7 +159,7 @@ The following information is available in this file:
- Add support for 2.5.X's scsi_report_device_reset().
6.2.34 (May 5th, 2003)
- - Fix locking regression instroduced in 6.2.29 that
+ - Fix locking regression introduced in 6.2.29 that
could cause a lock order reversal between the io_request_lock
and our per-softc lock. This was only possible on RH9,
SuSE, and kernel.org 2.4.X kernels.
@@ -264,7 +264,7 @@ The following information is available in this file:
Option: tag_info:{{value[,value...]}[,{value[,value...]}...]}
Definition: Set the per-target tagged queue depth on a
per controller basis. Both controllers and targets
- may be ommitted indicating that they should retain
+ may be omitted indicating that they should retain
the default tag depth.
Examples: tag_info:{{16,32,32,64,8,8,,32,32,32,32,32,32,32,32,32}
On Controller 0
@@ -290,7 +290,7 @@ The following information is available in this file:
-----------------------------------------------------------------
Option: dv: {value[,value...]}
Definition: Set Domain Validation Policy on a per-controller basis.
- Controllers may be ommitted indicating that
+ Controllers may be omitted indicating that
they should retain the default read streaming setting.
Example: dv:{-1,0,,1,1,0}
On Controller 0 leave DV at its default setting.
diff --git a/Documentation/scsi/arcmsr_spec.txt b/Documentation/scsi/arcmsr_spec.txt
index 5e0042340fd..45d9482c151 100644
--- a/Documentation/scsi/arcmsr_spec.txt
+++ b/Documentation/scsi/arcmsr_spec.txt
@@ -3,7 +3,7 @@
*******************************************************************************
** Usage of IOP331 adapter
** (All In/Out is in IOP331's view)
-** 1. Message 0 --> InitThread message and retrun code
+** 1. Message 0 --> InitThread message and return code
** 2. Doorbell is used for RS-232 emulation
** inDoorBell : bit0 -- data in ready
** (DRIVER DATA WRITE OK)
diff --git a/Documentation/scsi/ibmmca.txt b/Documentation/scsi/ibmmca.txt
index 9707941704e..a810421f1fb 100644
--- a/Documentation/scsi/ibmmca.txt
+++ b/Documentation/scsi/ibmmca.txt
@@ -21,7 +21,7 @@
versions older than 4.0 do not work with kernels 2.4.0 or later! If you
try to compile your kernel with the wrong driver source, the
compilation is aborted and you get a corresponding error message. This is
- no bug in the driver. It prevents you from using the wrong sourcecode
+ no bug in the driver; it prevents you from using the wrong source code
with the wrong kernel version.
Authors of this Driver
@@ -58,7 +58,7 @@
5 Users' Manual
5.1 Commandline Parameters
5.2 Troubleshooting
- 5.3 Bugreports
+ 5.3 Bug reports
5.4 Support WWW-page
6 References
7 Credits to
@@ -71,13 +71,13 @@
1 Abstract
----------
- This README-file describes the IBM SCSI-subsystem low level driver for
- Linux. The descriptions which were formerly kept in the source-code have
- been taken out to this file to easify the codes' readability. The driver
+ This README-file describes the IBM SCSI-subsystem low level driver for
+ Linux. The descriptions which were formerly kept in the source code have
+ been moved into this file to simplify the code's readability. The driver
description has been updated, as most of the former description was already
- quite outdated. The history of the driver development is also kept inside
- here. Multiple historical developments have been summarized to shorten the
- textsize a bit. At the end of this file you can find a small manual for
+ quite outdated. The history of the driver development is also kept inside
+ here. Multiple historical developments have been summarized to shorten the
+ text size a bit. At the end of this file you can find a small manual for
this driver and hints to get it running on your machine.
2 Driver Description
@@ -186,7 +186,7 @@
between 0 and 7). The IBM SCSI-2 F/W adapter offers this on up to two
busses and provides support for 30 logical devices at the same time, where
in wide-addressing mode you can have 16 puns with 32 luns on each device.
- This section dexribes you the handling of devices on non-F/W adapters.
+ This section describes the handling of devices on non-F/W adapters.
Just imagine, that you can have 16 * 32 = 512 devices on a F/W adapter
which means a lot of possible devices for such a small machine.
@@ -209,10 +209,10 @@
--------------------------------------------------------
One consequence of information hiding is that the real (pun,lun)
numbers are also hidden. The two possibilities to get around this problem
- is to offer fake pun/lun combinations to the operating system or to
+ are to offer fake pun/lun combinations to the operating system or to
delete the whole mapping of the adapter and to reassign the ldns, using
the immediate assign command of the SCSI-subsystem for probing through
- all possible pun/lun combinations. a ldn is a "logical device number"
+ all possible pun/lun combinations. An ldn is a "logical device number"
which is used by IBM SCSI-subsystems to access some valid SCSI-device.
At the beginning of the development of this driver, the following approach
was used:
@@ -251,9 +251,9 @@
lun>0 or to non-existing devices, in order to satisfy the subsystem, if
there are less than 15 SCSI-devices connected. In the case of more than 15
devices, the dynamical mapping goes active. If the get_scsi[][] reports a
- device to be existant, but it has no ldn assigned, it gets a ldn out of 7
- to 14. The numbers are assigned in cyclic order. Therefore it takes 8
- dynamical reassignments on the SCSI-devices, until a certain device
+ device to be existent, but it has no ldn assigned, it gets an ldn out of 7
+ to 14. The numbers are assigned in cyclic order; therefore it takes 8
+ dynamical reassignments on the SCSI-devices until a certain device
loses its ldn again. This assures that dynamical remapping is avoided
during intense I/O between up to 15 SCSI-devices (means pun,lun
combinations). A further advantage of this method is that people who
@@ -551,7 +551,7 @@
than devices are available, they are assigned to non existing pun,lun
combinations to satisfy the adapter. With this, the dynamical mapping
was possible to implement. (For further info see the text in the
- source-code and in the description below. Read the description
+ source code and in the description below. Read the description
below BEFORE installing this driver on your system!)
2) Changed the name IBMMCA_DRIVER_VERSION to IBMMCA_SCSI_DRIVER_VERSION.
3) The LED-display shows on PS/2-95 no longer the ldn, but the SCSI-ID
@@ -762,9 +762,9 @@
- Michael Lang
Apr 23, 2000 (v3.2pre1)
- 1) During a very long time, I collected a huge amount of bugreports from
+ 1) During a very long time, I collected a huge amount of bug reports from
various people, trying really quite different things on their SCSI-
- PS/2s. Today, all these bugreports are taken into account and should be
+ PS/2s. Today, all these bug reports are taken into account and should be
mostly solved. The major topics were:
- Driver crashes during boottime by no obvious reason.
- Driver panics while the midlevel-SCSI-driver is trying to inquire
@@ -819,7 +819,7 @@
- Michael Lang
July 17, 2000 (v3.2pre8)
- A long period of collecting bugreports from all corners of the world
+ A long period of collecting bug reports from all corners of the world
now led to the following corrections to the code:
1) SCSI-2 F/W support crashed with a COMMAND ERROR. The reason for this
was that it is possible to disable Fast-SCSI for the external bus.
@@ -873,7 +873,7 @@
July 26, 2000 (v3.2pre11)
1) I passed a horrible weekend getting mad with NMIs on kernel 2.2.14 and
a model 9595. Asking around in the community, nobody except of me has
- seen such errors. Weired, but I am trying to recompile everything on
+ seen such errors. Weird, but I am trying to recompile everything on
the model 9595. Maybe, as I use a specially modified gcc, that could
cause problems. But, it was not the reason. The true background was,
that the kernel was compiled for i386 and the 9595 has a 486DX-2.
@@ -886,7 +886,7 @@
alive rotator during boottime. This makes sense, when no monitor is
connected to the system. You can get rid of all display activity, if
you do not use any parameter or just ibmmcascsi=activity, for the
- harddrive activity LED, existant on all PS/2, except models 8595-XXX.
+ harddrive activity LED, existent on all PS/2, except models 8595-XXX.
If no monitor is available, please use ibmmcascsi=display, which works
fine together with the linuxinfo utility for the LED-panel.
- Michael Lang
@@ -1115,7 +1115,7 @@
If this really happens, do also send e-mail to the maintainer, as
forced detection should be never necessary. Forced detection is in
principle some flaw of the driver adapter detection and goes into
- bugreports.
+ bug reports.
Q: The driver screws up, if it starts to probe SCSI-devices, is there
some way out of it?
A: Yes, that was some recognition problem of the correct SCSI-adapter
@@ -1172,7 +1172,7 @@
recommended version is 3.2 or later. Here, the F/W support is in
a stable and reliable condition. Wide-addressing is in addition
supported.
- Q: I get a Ooops message and something like "killing interrupt".
+ Q: I get an Oops message and something like "killing interrupt".
A: The reason for this is that the IBM SCSI-subsystem only sends a
termination status back, if some error appeared. In former releases
of the driver, it was not checked, if the termination status block
@@ -1188,7 +1188,7 @@
and 15 get ignored by the driver & adapter!
Q: I have a 9595 and I get a NMI during heavy SCSI I/O e.g. during fsck.
A COMMAND ERROR is reported and characters on the screen are missing.
- Warm reboot is not possible. Things look like quite weired.
+ Warm reboot is not possible. Things look quite weird.
A: Check the processor type of your 9595. If you have an 80486 or 486DX-2
processor complex on your mainboard and you compiled a kernel that
supports 80386 processors, it is possible, that the kernel cannot
@@ -1213,21 +1213,21 @@
problem. Not yet tried, but guessing that it could work. To get this,
set unchecked_isa_dma argument of ibmmca.h from 0 to 1.
- 5.3 Bugreports
+ 5.3 Bug reports
--------------
- If you really find bugs in the sourcecode or the driver will successfully
+ If you really find bugs in the source code or the driver will successfully
refuse to work on your machine, you should send a bug report to me. The
best for this is to follow the instructions on the WWW-page for this
driver. Fill out the bug-report form, placed on the WWW-page and ship it,
so the bugs can be taken into account with maximum efforts. But, please
do not send bug reports about this driver to Linus Torvalds or Leonard
- Zubkoff, as Linus is burried in E-Mail and Leonard is supervising all
+ Zubkoff, as Linus is buried in E-Mail and Leonard is supervising all
SCSI-drivers and won't have the time left to look inside every single
driver to fix a bug and especially DO NOT send modified code to Linus
Torvalds or Alan J. Cox which has not been checked here!!! They are both
- quite burried in E-mail (as me, sometimes, too) and one should first check
+ quite buried in E-mail (as me, sometimes, too) and one should first check
for problems on my local teststand. Recently, I got a lot of
- bugreports for errors in the ibmmca.c code, which I could not imagine, but
+ bug reports for errors in the ibmmca.c code, which I could not imagine, but
a look inside some Linux-distribution showed me quite often some modified
code, which did no longer work on most other machines than the one of the
modifier. Ok, so now that there is maintenance service available for this
@@ -1261,7 +1261,7 @@
some e-mail directly, but at least with the same information as required by
the formular.
- If you have extensive bugreports, including Ooops messages and
+ If you have extensive bug reports, including Oops messages and
screen-shots, please feel free to send it directly to the address
of the maintainer, too. The current address of the maintainer is:
@@ -1318,7 +1318,7 @@
detailed bug reports and ideas for this driver (and his
patience ;-)).
Alan J. Cox
- for his bugreports and his bold activities in cross-checking
+ for his bug reports and his bold activities in cross-checking
the driver-code with his teststand.
7.2 Sponsors & Supporters
diff --git a/Documentation/scsi/ncr53c8xx.txt b/Documentation/scsi/ncr53c8xx.txt
index 39d409a8efe..230e30846ef 100644
--- a/Documentation/scsi/ncr53c8xx.txt
+++ b/Documentation/scsi/ncr53c8xx.txt
@@ -785,8 +785,8 @@ port address 0x1400.
irqm:0 always open drain
irqm:1 same as initial settings (assumed BIOS settings)
irqm:2 always totem pole
- irqm:0x10 driver will not use SA_SHIRQ flag when requesting irq
- irqm:0x20 driver will not use SA_INTERRUPT flag when requesting irq
+ irqm:0x10 driver will not use IRQF_SHARED flag when requesting irq
+ irqm:0x20 driver will not use IRQF_DISABLED flag when requesting irq
(Bits 0x10 and 0x20 can be combined with hardware irq mode option)
@@ -1236,15 +1236,15 @@ when the SCSI DATA IN phase is reentered after a phase mismatch.
When an IRQ is shared by devices that are handled by different drivers, it
may happen that one driver complains about the request of the IRQ having
failed. Under Linux-2.0, this may be due to one driver having requested the
-IRQ using the SA_INTERRUPT flag but some other having requested the same IRQ
+IRQ using the IRQF_DISABLED flag but some other having requested the same IRQ
without this flag. Under both Linux-2.0 and linux-2.2, this may be caused by
-one driver not having requested the IRQ with the SA_SHIRQ flag.
+one driver not having requested the IRQ with the IRQF_SHARED flag.
By default, the ncr53c8xx and sym53c8xx drivers request IRQs with both the
-SA_INTERRUPT and the SA_SHIRQ flag under Linux-2.0 and with only the SA_SHIRQ
+IRQF_DISABLED and the IRQF_SHARED flag under Linux-2.0 and with only the IRQF_SHARED
flag under Linux-2.2.
-Under Linux-2.0, you can disable use of SA_INTERRUPT flag from the boot
+Under Linux-2.0, you can disable use of IRQF_DISABLED flag from the boot
command line by using the following option:
ncr53c8xx=irqm:0x20 (for the generic ncr53c8xx driver)
@@ -1252,7 +1252,7 @@ command line by using the following option:
If this does not fix the problem, then you may want to check how all other
drivers are requesting the IRQ and report the problem. Note that if at least
-a single driver does not request the IRQ with the SA_SHIRQ flag (share IRQ),
+a single driver does not request the IRQ with the IRQF_SHARED flag (share IRQ),
then the request of the IRQ obviously will not succeed for all the drivers.
15. SCSI problem troubleshooting
diff --git a/Documentation/sharedsubtree.txt b/Documentation/sharedsubtree.txt
index ccf1cebe744..736540045dc 100644
--- a/Documentation/sharedsubtree.txt
+++ b/Documentation/sharedsubtree.txt
@@ -153,6 +153,7 @@ replicas continue to be exactly same.
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
+ #include <string.h>
#include <sys/mount.h>
#include <sys/fsuid.h>
diff --git a/Documentation/sound/alsa/ALSA-Configuration.txt b/Documentation/sound/alsa/ALSA-Configuration.txt
index 241e26c4ff9..4b48c2e82c3 100644
--- a/Documentation/sound/alsa/ALSA-Configuration.txt
+++ b/Documentation/sound/alsa/ALSA-Configuration.txt
@@ -365,13 +365,14 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
Module snd-cmipci
-----------------
- Module for C-Media CMI8338 and 8738 PCI sound cards.
+ Module for C-Media CMI8338/8738/8768/8770 PCI sound cards.
- mpu_port - 0x300,0x310,0x320,0x330 = legacy port,
- 1 = integrated PCI port,
+ mpu_port - port address of MIDI interface (8338 only):
+ 0x300,0x310,0x320,0x330 = legacy port,
0 = disable (default)
- fm_port - 0x388 = legacy port,
- 1 = integrated PCI port (default),
+ fm_port - port address of OPL-3 FM synthesizer (8x38 only):
+ 0x388 = legacy port,
+ 1 = integrated PCI port (default on 8738),
0 = disable
soft_ac3 - Software-conversion of raw SPDIF packets (model 033 only)
(default = 1)
@@ -768,6 +769,10 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
single_cmd - Use single immediate commands to communicate with
codecs (for debugging only)
enable_msi - Enable Message Signaled Interrupt (MSI) (default = off)
+ power_save - Automatic power-saving timeout (in seconds, 0 =
+ disable)
+ power_save_controller - Reset HD-audio controller in power-saving mode
+ (default = on)
This module supports one card and autoprobe.
@@ -828,6 +833,8 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
ALC268
3stack 3-stack model
+ toshiba Toshiba A205
+ acer Acer laptops
auto auto-config reading BIOS (default)
ALC662
@@ -842,7 +849,11 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
3stack-dig 3-jack with SPDIF I/O
6stack-dig 6-jack digital with SPDIF I/O
arima Arima W820Di1
+ targa Targa T8, MSI-1049 T8
+ asus-a7j ASUS A7J
+ asus-a7m ASUS A7M
macpro MacPro support
+ mbp3 Macbook Pro rev3
imac24 iMac 24'' with jack detection
w2jc ASUS W2JC
auto auto-config reading BIOS (default)
@@ -854,6 +865,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
3stack-6ch-dig 3-jack 6-channel with SPDIF I/O
6stack-dig-demo 6-jack digital for Intel demo board
acer Acer laptops (Travelmate 3012WTMi, Aspire 5600, etc)
+ acer-aspire Acer Aspire 9810
medion Medion Laptops
medion-md2 Medion MD2
targa-dig Targa/MSI
@@ -862,6 +874,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
lenovo-101e Lenovo 101E
lenovo-nb0763 Lenovo NB0763
lenovo-ms7195-dig Lenovo MS7195
+ haier-w66 Haier W66
6stack-hp HP machines with 6stack (Nettle boards)
3stack-hp HP machines with 3stack (Lucknow, Samba boards)
auto auto-config reading BIOS (default)
@@ -885,6 +898,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
3stack-660-digout 3-jack with SPDIF OUT (for ALC660VD)
lenovo Lenovo 3000 C200
dallas Dallas laptops
+ hp HP TX1000
auto auto-config reading BIOS (default)
CMI9880
@@ -920,6 +934,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
3stack 3-stack, shared surrounds
laptop 2-channel only (FSC V2060, Samsung M50)
laptop-eapd 2-channel with EAPD (Samsung R65, ASUS A6J)
+ laptop-automute 2-channel with EAPD and HP-automute (Lenovo N100)
ultra 2-channel with EAPD (Samsung Ultra tablet PC)
AD1988
@@ -945,14 +960,30 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
can be adjusted. Appearing only when compiled with
$CONFIG_SND_DEBUG=y
- STAC9200/9205/9254
+ STAC9200
ref Reference board
+ dell-d21 Dell (unknown)
+ dell-d22 Dell (unknown)
+ dell-d23 Dell (unknown)
+ dell-m21 Dell Inspiron 630m, Dell Inspiron 640m
+ dell-m22 Dell Latitude D620, Dell Latitude D820
+ dell-m23 Dell XPS M1710, Dell Precision M90
+ dell-m24 Dell Latitude 120L
+ dell-m25 Dell Inspiron E1505n
+ dell-m26 Dell Inspiron 1501
+ dell-m27 Dell Inspiron E1705/9400
+ gateway Gateway laptops with EAPD control
+
+ STAC9205/9254
+ ref Reference board
+ dell-m42 Dell (unknown)
+ dell-m43 Dell Precision
+ dell-m44 Dell Inspiron
STAC9220/9221
ref Reference board
3stack D945 3stack
5stack D945 5stack + SPDIF
- dell Dell XPS M1210
intel-mac-v1 Intel Mac Type 1
intel-mac-v2 Intel Mac Type 2
intel-mac-v3 Intel Mac Type 3
@@ -964,6 +995,10 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
macbook-pro Intel Mac Book Pro 2nd generation (eq. type 3)
imac-intel Intel iMac (eq. type 2)
imac-intel-20 Intel iMac (newer version) (eq. type 3)
+ dell-d81 Dell (unknown)
+ dell-d82 Dell (unknown)
+ dell-m81 Dell (unknown)
+ dell-m82 Dell XPS M1210
STAC9202/9250/9251
ref Reference board, base config
@@ -975,6 +1010,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
ref Reference board
3stack D965 3stack
5stack D965 5stack + SPDIF
+ dell-3stack Dell Dimension E520
STAC9872
vaio Setup for VAIO FE550G/SZ110
@@ -989,6 +1025,9 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
subsystem ID (output of "lspci -nv") to ALSA BTS or alsa-devel
ML (see the section "Links and Addresses").
+ power_save and power_save_controller options are for power-saving
+ mode. See powersave.txt for details.
+
Note 2: If you get click noises on output, try the module option
position_fix=1 or 2. position_fix=1 will use the SD_LPIB
register value without FIFO size correction as the current
@@ -1349,7 +1388,6 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
port - port number or -1 (disable)
irq - IRQ number or -1 (disable)
pnp - PnP detection - 0 = disable, 1 = enable (default)
- uart_enter - Issue UART_ENTER command at open - bool, default = on
This module supports multiple devices and PnP.
@@ -1630,6 +1668,21 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
The power-management is supported.
+ Module snd-sc6000
+ -----------------
+
+ Module for Gallant SC-6000 soundcard.
+
+ port - Port # (0x220 or 0x240)
+ mss_port - MSS Port # (0x530 or 0xe80)
+ irq - IRQ # (5,7,9,10,11)
+ mpu_irq - MPU-401 IRQ # (5,7,9,10), 0 - no MPU-401 irq
+ dma - DMA # (1,3,0)
+
+ This module supports multiple cards.
+
+ This card is also known as Audio Excel DSP 16 or Zoltrix AV302.
+
Module snd-sgalaxy
------------------
@@ -1650,9 +1703,11 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
Module for ENSONIQ SoundScape PnP cards.
port - Port # (PnP setup)
+ wss_port - WSS Port # (PnP setup)
irq - IRQ # (PnP setup)
mpu_irq - MPU-401 IRQ # (PnP setup)
dma - DMA # (PnP setup)
+ dma2 - 2nd DMA # (PnP setup, -1 to disable)
This module supports multiple cards. ISA PnP must be enabled.
You need sscape_ctl tool in alsa-tools package for loading
@@ -1697,8 +1752,52 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
dma2 - DMA2 # for CS4232 PCM interface.
isapnp - ISA PnP detection - 0 = disable, 1 = enable (default)
+ The below are options for wavefront_synth features:
+ wf_raw - Assume that we need to boot the OS (default:no)
+ If yes, then during driver loading, the state of the board is
+ ignored, and we reset the board and load the firmware anyway.
+ fx_raw - Assume that the FX process needs help (default:yes)
+ If false, we'll leave the FX processor in whatever state it is
+ when the driver is loaded. The default is to download the
+ microprogram and associated coefficients to set it up for
+ "default" operation, whatever that means.
+ debug_default - Debug parameters for card initialization
+ wait_usecs - How long to wait without sleeping, usecs
+ (default:150)
+ This magic number seems to give pretty optimal throughput
+ based on my limited experimentation.
+ If you want to play around with it and find a better value, be
+ my guest. Remember, the idea is to get a number that causes us
+ to just busy wait for as many WaveFront commands as possible,
+ without coming up with a number so large that we hog the whole
+ CPU.
+ Specifically, with this number, out of about 134,000 status
+ waits, only about 250 result in a sleep.
+ sleep_interval - How long to sleep when waiting for reply
+ (default: 100)
+ sleep_tries - How many times to try sleeping during a wait
+ (default: 50)
+ ospath - Pathname to processed ICS2115 OS firmware
+ (default:wavefront.os)
+ The path name of the ICS2115 OS firmware. In the recent
+ version, it's handled via firmware loader framework, so it
+ must be installed in the proper path, typically,
+ /lib/firmware.
+ reset_time - How long to wait for a reset to take effect
+ (default:2)
+ ramcheck_time - How many seconds to wait for the RAM test
+ (default:20)
+ osrun_time - How many seconds to wait for the ICS2115 OS
+ (default:10)
+
This module supports multiple cards and ISA PnP.
+ Note: the firmware file "wavefront.os" was located in the earlier
+ version in /etc. Now it's loaded via firmware loader, and
+ must be in the proper firmware path, such as /lib/firmware.
+ Copy (or symlink) the file appropriately if you get an error
+ regarding firmware downloading after upgrading the kernel.
+
Module snd-sonicvibes
---------------------
diff --git a/Documentation/sound/alsa/CMIPCI.txt b/Documentation/sound/alsa/CMIPCI.txt
index 4b2b1538705..16935c8561f 100644
--- a/Documentation/sound/alsa/CMIPCI.txt
+++ b/Documentation/sound/alsa/CMIPCI.txt
@@ -1,5 +1,5 @@
- Brief Notes on C-Media 8738/8338 Driver
- =======================================
+ Brief Notes on C-Media 8338/8738/8768/8770 Driver
+ =================================================
Takashi Iwai <tiwai@suse.de>
@@ -209,10 +209,13 @@ In addition to the standard SB mixer, CM8x38 provides more functions.
MIDI CONTROLLER
---------------
-The MPU401-UART interface is disabled as default. You need to set
-module option "mpu_port" with a valid I/O port address to enable the
-MIDI support. The valid I/O ports are 0x300, 0x310, 0x320 and 0x330.
-Choose the value which doesn't conflict with other cards.
+With CMI8338 chips, the MPU401-UART interface is disabled as default.
+You need to set the module option "mpu_port" to a valid I/O port address
+to enable MIDI support. Valid I/O ports are 0x300, 0x310, 0x320 and
+0x330. Choose a value that doesn't conflict with other cards.
+
+With CMI8738 and newer chips, the MIDI interface is enabled by default
+and the driver automatically chooses a port address.
There is _no_ hardware wavetable function on this chip (except for
OPL3 synth below).
@@ -230,6 +233,8 @@ Set "fm_port" module option for more cards.
The output quality of FM OPL/3 is, however, very weird.
I don't know why..
+CMI8768 and newer chips do not have the FM synth.
+
Joystick and Modem
------------------
diff --git a/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl b/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl
index 74d3a35b59b..2c3fc3cb3b6 100644
--- a/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl
+++ b/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl
@@ -18,8 +18,8 @@
</affiliation>
</author>
- <date>November 17, 2005</date>
- <edition>0.3.6</edition>
+ <date>September 10, 2007</date>
+ <edition>0.3.7</edition>
<abstract>
<para>
@@ -405,8 +405,9 @@
/* definition of the chip-specific record */
struct mychip {
struct snd_card *card;
- // rest of implementation will be in the section
- // "PCI Resource Managements"
+ /* rest of implementation will be in the section
+ * "PCI Resource Managements"
+ */
};
/* chip-specific destructor
@@ -414,7 +415,7 @@
*/
static int snd_mychip_free(struct mychip *chip)
{
- .... // will be implemented later...
+ .... /* will be implemented later... */
}
/* component-destructor
@@ -440,8 +441,9 @@
*rchip = NULL;
- // check PCI availability here
- // (see "PCI Resource Managements")
+ /* check PCI availability here
+ * (see "PCI Resource Managements")
+ */
....
/* allocate a chip-specific data with zero filled */
@@ -451,12 +453,13 @@
chip->card = card;
- // rest of initialization here; will be implemented
- // later, see "PCI Resource Managements"
+ /* rest of initialization here; will be implemented
+ * later, see "PCI Resource Managements"
+ */
....
- if ((err = snd_device_new(card, SNDRV_DEV_LOWLEVEL,
- chip, &ops)) < 0) {
+ err = snd_device_new(card, SNDRV_DEV_LOWLEVEL, chip, &ops);
+ if (err < 0) {
snd_mychip_free(chip);
return err;
}
@@ -490,7 +493,8 @@
return -ENOMEM;
/* (3) */
- if ((err = snd_mychip_create(card, pci, &chip)) < 0) {
+ err = snd_mychip_create(card, pci, &chip);
+ if (err < 0) {
snd_card_free(card);
return err;
}
@@ -502,10 +506,11 @@
card->shortname, chip->ioport, chip->irq);
/* (5) */
- .... // implemented later
+ .... /* implemented later */
/* (6) */
- if ((err = snd_card_register(card)) < 0) {
+ err = snd_card_register(card);
+ if (err < 0) {
snd_card_free(card);
return err;
}
@@ -605,7 +610,8 @@
<![CDATA[
struct mychip *chip;
....
- if ((err = snd_mychip_create(card, pci, &chip)) < 0) {
+ err = snd_mychip_create(card, pci, &chip);
+ if (err < 0) {
snd_card_free(card);
return err;
}
@@ -666,7 +672,8 @@
<informalexample>
<programlisting>
<![CDATA[
- if ((err = snd_card_register(card)) < 0) {
+ err = snd_card_register(card);
+ if (err < 0) {
snd_card_free(card);
return err;
}
@@ -1091,7 +1098,7 @@
static int snd_mychip_free(struct mychip *chip)
{
/* disable hardware here if any */
- .... // (not implemented in this document)
+ .... /* (not implemented in this document) */
/* release the irq */
if (chip->irq >= 0)
@@ -1119,7 +1126,8 @@
*rchip = NULL;
/* initialize the PCI entry */
- if ((err = pci_enable_device(pci)) < 0)
+ err = pci_enable_device(pci);
+ if (err < 0)
return err;
/* check PCI availability (28bit DMA) */
if (pci_set_dma_mask(pci, DMA_28BIT_MASK) < 0 ||
@@ -1141,7 +1149,8 @@
chip->irq = -1;
/* (1) PCI resource allocation */
- if ((err = pci_request_regions(pci, "My Chip")) < 0) {
+ err = pci_request_regions(pci, "My Chip");
+ if (err < 0) {
kfree(chip);
pci_disable_device(pci);
return err;
@@ -1156,10 +1165,10 @@
chip->irq = pci->irq;
/* (2) initialization of the chip hardware */
- .... // (not implemented in this document)
+ .... /* (not implemented in this document) */
- if ((err = snd_device_new(card, SNDRV_DEV_LOWLEVEL,
- chip, &ops)) < 0) {
+ err = snd_device_new(card, SNDRV_DEV_LOWLEVEL, chip, &ops);
+ if (err < 0) {
snd_mychip_free(chip);
return err;
}
@@ -1233,7 +1242,8 @@
<informalexample>
<programlisting>
<![CDATA[
- if ((err = pci_enable_device(pci)) < 0)
+ err = pci_enable_device(pci);
+ if (err < 0)
return err;
if (pci_set_dma_mask(pci, DMA_28BIT_MASK) < 0 ||
pci_set_consistent_dma_mask(pci, DMA_28BIT_MASK) < 0) {
@@ -1294,7 +1304,8 @@
<informalexample>
<programlisting>
<![CDATA[
- if ((err = pci_request_regions(pci, "My Chip")) < 0) {
+ err = pci_request_regions(pci, "My Chip");
+ if (err < 0) {
kfree(chip);
pci_disable_device(pci);
return err;
@@ -1322,7 +1333,7 @@
<programlisting>
<![CDATA[
if (request_irq(pci->irq, snd_mychip_interrupt,
- IRQF_DISABLED|IRQF_SHARED, "My Chip", chip)) {
+ IRQF_SHARED, "My Chip", chip)) {
printk(KERN_ERR "cannot grab irq %d\n", pci->irq);
snd_mychip_free(chip);
return -EBUSY;
@@ -1773,7 +1784,8 @@
struct snd_pcm_runtime *runtime = substream->runtime;
runtime->hw = snd_mychip_playback_hw;
- // more hardware-initialization will be done here
+ /* more hardware-initialization will be done here */
+ ....
return 0;
}
@@ -1781,7 +1793,8 @@
static int snd_mychip_playback_close(struct snd_pcm_substream *substream)
{
struct mychip *chip = snd_pcm_substream_chip(substream);
- // the hardware-specific codes will be here
+ /* the hardware-specific codes will be here */
+ ....
return 0;
}
@@ -1793,7 +1806,8 @@
struct snd_pcm_runtime *runtime = substream->runtime;
runtime->hw = snd_mychip_capture_hw;
- // more hardware-initialization will be done here
+ /* more hardware-initialization will be done here */
+ ....
return 0;
}
@@ -1801,7 +1815,8 @@
static int snd_mychip_capture_close(struct snd_pcm_substream *substream)
{
struct mychip *chip = snd_pcm_substream_chip(substream);
- // the hardware-specific codes will be here
+ /* the hardware-specific codes will be here */
+ ....
return 0;
}
@@ -1844,10 +1859,12 @@
{
switch (cmd) {
case SNDRV_PCM_TRIGGER_START:
- // do something to start the PCM engine
+ /* do something to start the PCM engine */
+ ....
break;
case SNDRV_PCM_TRIGGER_STOP:
- // do something to stop the PCM engine
+ /* do something to stop the PCM engine */
+ ....
break;
default:
return -EINVAL;
@@ -1900,8 +1917,8 @@
struct snd_pcm *pcm;
int err;
- if ((err = snd_pcm_new(chip->card, "My Chip", 0, 1, 1,
- &pcm)) < 0)
+ err = snd_pcm_new(chip->card, "My Chip", 0, 1, 1, &pcm);
+ if (err < 0)
return err;
pcm->private_data = chip;
strcpy(pcm->name, "My Chip");
@@ -1939,8 +1956,8 @@
struct snd_pcm *pcm;
int err;
- if ((err = snd_pcm_new(chip->card, "My Chip", 0, 1, 1,
- &pcm)) < 0)
+ err = snd_pcm_new(chip->card, "My Chip", 0, 1, 1, &pcm);
+ if (err < 0)
return err;
pcm->private_data = chip;
strcpy(pcm->name, "My Chip");
@@ -2097,7 +2114,7 @@
struct mychip *chip = snd_pcm_chip(pcm);
/* free your own data */
kfree(chip->my_private_pcm_data);
- // do what you like else
+ /* do what you like else */
....
}
@@ -2884,10 +2901,10 @@ struct _snd_pcm_runtime {
<![CDATA[
switch (cmd) {
case SNDRV_PCM_TRIGGER_START:
- // do something to start the PCM engine
+ /* do something to start the PCM engine */
break;
case SNDRV_PCM_TRIGGER_STOP:
- // do something to stop the PCM engine
+ /* do something to stop the PCM engine */
break;
default:
return -EINVAL;
@@ -3071,7 +3088,7 @@ struct _snd_pcm_runtime {
spin_unlock(&chip->lock);
snd_pcm_period_elapsed(chip->substream);
spin_lock(&chip->lock);
- // acknowledge the interrupt if necessary
+ /* acknowledge the interrupt if necessary */
}
....
spin_unlock(&chip->lock);
@@ -3134,7 +3151,7 @@ struct _snd_pcm_runtime {
snd_pcm_period_elapsed(substream);
spin_lock(&chip->lock);
}
- // acknowledge the interrupt if necessary
+ /* acknowledge the interrupt if necessary */
}
....
spin_unlock(&chip->lock);
@@ -3456,6 +3473,13 @@ struct _snd_pcm_runtime {
</para>
<para>
+ The <structfield>tlv</structfield> field can be used to provide
+ metadata about the control; see the
+ <link linkend="control-interface-tlv">
+ <citetitle>Metadata</citetitle></link> subsection.
+ </para>
+
+ <para>
The other three are
<link linkend="control-interface-callbacks"><citetitle>
callback functions</citetitle></link>.
@@ -3604,7 +3628,7 @@ struct _snd_pcm_runtime {
<title>Example of info callback</title>
<programlisting>
<![CDATA[
- static int snd_myctl_info(struct snd_kcontrol *kcontrol,
+ static int snd_myctl_mono_info(struct snd_kcontrol *kcontrol,
struct snd_ctl_elem_info *uinfo)
{
uinfo->type = SNDRV_CTL_ELEM_TYPE_BOOLEAN;
@@ -3639,7 +3663,7 @@ struct _snd_pcm_runtime {
<informalexample>
<programlisting>
<![CDATA[
- static int snd_myctl_info(struct snd_kcontrol *kcontrol,
+ static int snd_myctl_enum_info(struct snd_kcontrol *kcontrol,
struct snd_ctl_elem_info *uinfo)
{
static char *texts[4] = {
@@ -3658,6 +3682,16 @@ struct _snd_pcm_runtime {
</programlisting>
</informalexample>
</para>
+
+ <para>
+ Some common info callbacks are prepared for easy use:
+ <function>snd_ctl_boolean_mono_info()</function> and
+ <function>snd_ctl_boolean_stereo_info()</function>.
+ Obviously, the former is an info callback for a mono channel
+ boolean item, just like <function>snd_myctl_mono_info</function>
+ above, and the latter is for a stereo channel boolean item.
+ </para>
+
</section>
<section id="control-interface-callbacks-get">
@@ -3794,7 +3828,8 @@ struct _snd_pcm_runtime {
<informalexample>
<programlisting>
<![CDATA[
- if ((err = snd_ctl_add(card, snd_ctl_new1(&my_control, chip))) < 0)
+ err = snd_ctl_add(card, snd_ctl_new1(&my_control, chip));
+ if (err < 0)
return err;
]]>
</programlisting>
@@ -3843,6 +3878,56 @@ struct _snd_pcm_runtime {
</para>
</section>
+ <section id="control-interface-tlv">
+ <title>Metadata</title>
+ <para>
+ To provide information about the dB values of a mixer control, use
+ one of the <constant>DECLARE_TLV_xxx</constant> macros from
+ <filename>&lt;sound/tlv.h&gt;</filename> to define a variable
+ containing this information, set the <structfield>tlv.p
+ </structfield> field to point to this variable, and include the
+ <constant>SNDRV_CTL_ELEM_ACCESS_TLV_READ</constant> flag in the
+ <structfield>access</structfield> field; like this:
+ <informalexample>
+ <programlisting>
+<![CDATA[
+ static DECLARE_TLV_DB_SCALE(db_scale_my_control, -4050, 150, 0);
+
+ static struct snd_kcontrol_new my_control __devinitdata = {
+ ...
+ .access = SNDRV_CTL_ELEM_ACCESS_READWRITE |
+ SNDRV_CTL_ELEM_ACCESS_TLV_READ,
+ ...
+ .tlv.p = db_scale_my_control,
+ };
+]]>
+ </programlisting>
+ </informalexample>
+ </para>
+
+ <para>
+ The <function>DECLARE_TLV_DB_SCALE</function> macro defines
+ information about a mixer control where each step in the control's
+ value changes the dB value by a constant dB amount.
+ The first parameter is the name of the variable to be defined.
+ The second parameter is the minimum value, in units of 0.01 dB.
+ The third parameter is the step size, in units of 0.01 dB.
+ Set the fourth parameter to 1 if the minimum value actually mutes
+ the control.
+ </para>
+
+ <para>
+ The <function>DECLARE_TLV_DB_LINEAR</function> macro defines
+ information about a mixer control where the control's value affects
+ the output linearly.
+ The first parameter is the name of the variable to be defined.
+ The second parameter is the minimum value, in units of 0.01 dB.
+ The third parameter is the maximum value, in units of 0.01 dB.
+ If the minimum value mutes the control, set the second parameter to
+ <constant>TLV_DB_GAIN_MUTE</constant>.
+ </para>
+ </section>
+
</chapter>
@@ -3880,7 +3965,7 @@ struct _snd_pcm_runtime {
{
struct mychip *chip = ac97->private_data;
....
- // read a register value here from the codec
+ /* read a register value here from the codec */
return the_register_value;
}
@@ -3889,7 +3974,7 @@ struct _snd_pcm_runtime {
{
struct mychip *chip = ac97->private_data;
....
- // write the given register value to the codec
+ /* write the given register value to the codec */
}
static int snd_mychip_ac97(struct mychip *chip)
@@ -3902,7 +3987,8 @@ struct _snd_pcm_runtime {
.read = snd_mychip_ac97_read,
};
- if ((err = snd_ac97_bus(chip->card, 0, &ops, NULL, &bus)) < 0)
+ err = snd_ac97_bus(chip->card, 0, &ops, NULL, &bus);
+ if (err < 0)
return err;
memset(&ac97, 0, sizeof(ac97));
ac97.private_data = chip;
@@ -4447,10 +4533,10 @@ struct _snd_pcm_runtime {
<informalexample>
<programlisting>
<![CDATA[
- struct list_head *list;
struct snd_rawmidi_substream *substream;
- list_for_each(list, &rmidi->streams[SNDRV_RAWMIDI_STREAM_OUTPUT].substreams) {
- substream = list_entry(list, struct snd_rawmidi_substream, list);
+ list_for_each_entry(substream,
+ &rmidi->streams[SNDRV_RAWMIDI_STREAM_OUTPUT].substreams,
+ list) {
sprintf(substream->name, "My MIDI Port %d", substream->number + 1);
}
/* same for SNDRV_RAWMIDI_STREAM_INPUT */
diff --git a/Documentation/sound/alsa/OSS-Emulation.txt b/Documentation/sound/alsa/OSS-Emulation.txt
index bfa0c9aacb4..022aaeb0e9d 100644
--- a/Documentation/sound/alsa/OSS-Emulation.txt
+++ b/Documentation/sound/alsa/OSS-Emulation.txt
@@ -303,10 +303,3 @@ ICE1712 supports only the unconventional format, interleaved
the buffer as the conventional (mono or 2-channels, 8 or 16bit) format
on OSS.
-USB devices
------------
-Some USB devices support only 24bit format packed in 3bytes. This
-format is not supported by OSS and no conversion is provided by kernel
-OSS emulation. You can use the user-space OSS emulation via libaoss
-instead.
-
diff --git a/Documentation/sound/alsa/hda_codec.txt b/Documentation/sound/alsa/hda_codec.txt
index 4eaae2a4553..8e1b0252669 100644
--- a/Documentation/sound/alsa/hda_codec.txt
+++ b/Documentation/sound/alsa/hda_codec.txt
@@ -49,6 +49,9 @@ struct hda_bus_ops {
unsigned int verb, unsigned int parm);
unsigned int (*get_response)(struct hda_codec *codec);
void (*private_free)(struct hda_bus *);
+#ifdef CONFIG_SND_HDA_POWER_SAVE
+ void (*pm_notify)(struct hda_codec *codec);
+#endif
};
The command callback is called when the codec module needs to send a
@@ -56,9 +59,16 @@ VERB to the controller. It's always a single command.
The get_response callback is called when the codec requires the answer
for the last command. These two callbacks are mandatory and have to
be given.
-The last, private_free callback, is optional. It's called in the
+The third, private_free callback, is optional. It's called in the
destructor to release any necessary data in the lowlevel driver.
+The pm_notify callback is available only with
+CONFIG_SND_HDA_POWER_SAVE kconfig. It's called when the codec needs
+to power up or may power down. The controller should check whether
+all codecs belonging to the bus are actually powered off
+(check codec->power_on), and optionally the driver may power down the
+controller side, too.
+
The bus instance is created via snd_hda_bus_new(). You need to pass
the card instance, the template, and the pointer to store the
resultant bus instance.
@@ -86,10 +96,8 @@ resultant codec instance (can be NULL if not needed).
The codec is stored in a linked list of bus instance. You can follow
the codec list like:
- struct list_head *p;
struct hda_codec *codec;
- list_for_each(p, &bus->codec_list) {
- codec = list_entry(p, struct hda_codec, list);
+ list_for_each_entry(codec, &bus->codec_list, list) {
...
}
@@ -100,10 +108,15 @@ initialization sequence is called when the controls are built later.
Codec Access
============
-To access codec, use snd_codec_read() and snd_codec_write().
+To access codec, use snd_hda_codec_read() and snd_hda_codec_write().
snd_hda_param_read() is for reading parameters.
For writing a sequence of verbs, use snd_hda_sequence_write().
+There are variants of cached read/write, snd_hda_codec_write_cache(),
+snd_hda_sequence_write_cache(). These are used for recording the
+register states for the power-management resume. When no PM is needed,
+these are equivalent to the non-cached versions.
+
To retrieve the number of sub nodes connected to the given node, use
snd_hda_get_sub_nodes(). The connection list can be obtained via
snd_hda_get_connections() call.
@@ -239,6 +252,10 @@ set the codec->patch_ops field. This is defined as below:
int (*suspend)(struct hda_codec *codec, pm_message_t state);
int (*resume)(struct hda_codec *codec);
#endif
+ #ifdef CONFIG_SND_HDA_POWER_SAVE
+ int (*check_power_status)(struct hda_codec *codec,
+ hda_nid_t nid);
+ #endif
};
The build_controls callback is called from snd_hda_build_controls().
@@ -251,6 +268,18 @@ The unsol_event callback is called when an unsolicited event is
received.
The suspend and resume callbacks are for power management.
+They can be NULL if no special sequence is required. When the resume
+callback is NULL, the driver calls the init callback and resumes the
+registers from the cache. If other handling is needed, you'd need to
+write your own resume callback. There, the amp values can be resumed
+via
+ void snd_hda_codec_resume_amp(struct hda_codec *codec);
+and the other codec registers via
+ void snd_hda_codec_resume_cache(struct hda_codec *codec);
+
+The check_power_status callback is called when the amp value of the
+given widget NID is changed. The codec code can turn on/off the power
+appropriately from this information.
Each entry can be NULL if not necessary to be called.
@@ -267,8 +296,7 @@ Digital I/O
===========
Call snd_hda_create_spdif_out_ctls() from the patch to create controls
-related with SPDIF out. In the patch resume callback, call
-snd_hda_resume_spdif().
+related with SPDIF out.
Helper Functions
@@ -284,12 +312,7 @@ as a module parameter, and PCI subsystem IDs. If the matching entry
is found, it returns the config field value.
snd_hda_add_new_ctls() can be used to create and add control entries.
-Pass the zero-terminated array of struct snd_kcontrol_new. The same array
-can be passed to snd_hda_resume_ctls() for resume.
-Note that this will call control->put callback of these entries. So,
-put callback should check codec->in_resume and force to restore the
-given value if it's non-zero even if the value is identical with the
-cached value.
+Pass the zero-terminated array of struct snd_kcontrol_new.
Macros HDA_CODEC_VOLUME(), HDA_CODEC_MUTE() and their variables can be
used for the entry of struct snd_kcontrol_new.
diff --git a/Documentation/sound/alsa/powersave.txt b/Documentation/sound/alsa/powersave.txt
new file mode 100644
index 00000000000..9657e809922
--- /dev/null
+++ b/Documentation/sound/alsa/powersave.txt
@@ -0,0 +1,41 @@
+Notes on Power-Saving Mode
+==========================
+
+AC97 and HD-audio drivers have the automatic power-saving mode.
+This feature is enabled via Kconfig CONFIG_SND_AC97_POWER_SAVE
+and CONFIG_SND_HDA_POWER_SAVE options, respectively.
+
+With the automatic power-saving, the driver turns off the codec power
+appropriately when no operation is required. When no applications use
+the device and/or no analog loopback is set, the codec is powered down
+fully or partially. This saves some power, which is good for laptops
+(and even for desktops).
+
+The time-out for automatic power-off can be specified via power_save
+module option of snd-ac97-codec and snd-hda-intel modules. Specify
+the time-out value in seconds. 0 means to disable the automatic
+power-saving. The default value of timeout is given via
+CONFIG_SND_AC97_POWER_SAVE_DEFAULT and
+CONFIG_SND_HDA_POWER_SAVE_DEFAULT Kconfig options. Setting this to 1
+(the minimum value) isn't recommended because many applications try to
+reopen the device frequently. 10 would be a good choice for normal
+operations.
+
+The power_save option is exported as writable. This means you can
+adjust the value via sysfs on the fly. For example, to turn on the
+automatic power-save mode with 10 seconds, write to
+/sys/module/snd_ac97_codec/parameters/power_save (usually as root):
+
+ # echo 10 > /sys/module/snd_ac97_codec/parameters/power_save
+
+
+Note that you might hear click noise/pop when changing the power
+state. Also, it often takes certain time to wake up from the
+power-down to the active state. These are often hard to fix, so
+don't file extra bug reports unless you have a patch that fixes it ;-)
+
+For HD-audio interface, there is another module option,
+power_save_controller. This enables/disables the power-save mode of
+the controller side. Setting this on may reduce a bit more power
+consumption, but might result in longer wake-up time and click noise.
+Try to turn it off when you experience such a thing too often.
diff --git a/Documentation/sound/alsa/soc/DAI.txt b/Documentation/sound/alsa/soc/DAI.txt
index 58cbfd01ea8..3feeb9ecdec 100644
--- a/Documentation/sound/alsa/soc/DAI.txt
+++ b/Documentation/sound/alsa/soc/DAI.txt
@@ -20,12 +20,12 @@ I2S
===
I2S is a common 4 wire DAI used in HiFi, STB and portable devices. The Tx and
-Rx lines are used for audio transmision, whilst the bit clock (BCLK) and
+Rx lines are used for audio transmission, whilst the bit clock (BCLK) and
left/right clock (LRC) synchronise the link. I2S is flexible in that either the
controller or CODEC can drive (master) the BCLK and LRC clock lines. Bit clock
usually varies depending on the sample rate and the master system clock
(SYSCLK). LRCLK is the same as the sample rate. A few devices support separate
-ADC and DAC LRCLK's, this allows for similtanious capture and playback at
+ADC and DAC LRCLK's, this allows for simultaneous capture and playback at
different sample rates.
I2S has several different operating modes:-
@@ -41,12 +41,12 @@ I2S has several different operating modes:-
PCM
===
-PCM is another 4 wire interface, very similar to I2S, that can support a more
+PCM is another 4 wire interface, very similar to I2S, which can support a more
flexible protocol. It has bit clock (BCLK) and sync (SYNC) lines that are used
to synchronise the link whilst the Tx and Rx lines are used to transmit and
receive the audio data. Bit clock usually varies depending on sample rate
whilst sync runs at the sample rate. PCM also supports Time Division
-Multiplexing (TDM) in that several devices can use the bus similtaniuosly (This
+Multiplexing (TDM) in that several devices can use the bus simultaneously (this
is sometimes referred to as network mode).
Common PCM operating modes:-
diff --git a/Documentation/sound/alsa/soc/clocking.txt b/Documentation/sound/alsa/soc/clocking.txt
index e93960d53a1..14930887c25 100644
--- a/Documentation/sound/alsa/soc/clocking.txt
+++ b/Documentation/sound/alsa/soc/clocking.txt
@@ -2,20 +2,20 @@ Audio Clocking
==============
This text describes the audio clocking terms in ASoC and digital audio in
-general. Note: Audio clocking can be complex !
+general. Note: Audio clocking can be complex!
Master Clock
------------
-Every audio subsystem is driven by a master clock (sometimes refered to as MCLK
+Every audio subsystem is driven by a master clock (sometimes referred to as MCLK
or SYSCLK). This audio master clock can be derived from a number of sources
(e.g. crystal, PLL, CPU clock) and is responsible for producing the correct
audio playback and capture sample rates.
-Some master clocks (e.g. PLL's and CPU based clocks) are configuarble in that
+Some master clocks (e.g. PLL's and CPU based clocks) are configurable in that
their speed can be altered by software (depending on the system use and to save
-power). Other master clocks are fixed at at set frequency (i.e. crystals).
+power). Other master clocks are fixed at a set frequency (i.e. crystals).
DAI Clocks
@@ -44,7 +44,7 @@ This relationship depends on the codec or SoC CPU in particular. In general
it's best to configure BCLK to the lowest possible speed (depending on your
rate, number of channels and wordsize) to save on power.
-It's also desireable to use the codec (if possible) to drive (or master) the
+It's also desirable to use the codec (if possible) to drive (or master) the
audio clocks as it's usually gives more accurate sample rates than the CPU.
diff --git a/Documentation/sound/alsa/soc/codec.txt b/Documentation/sound/alsa/soc/codec.txt
index 48983c75aad..1e766ad0ebd 100644
--- a/Documentation/sound/alsa/soc/codec.txt
+++ b/Documentation/sound/alsa/soc/codec.txt
@@ -19,7 +19,7 @@ Optionally, codec drivers can also provide:-
6) DAPM event handler.
7) DAC Digital mute control.
-It's probably best to use this guide in conjuction with the existing codec
+It's probably best to use this guide in conjunction with the existing codec
driver code in sound/soc/codecs/
ASoC Codec driver breakdown
@@ -28,7 +28,7 @@ ASoC Codec driver breakdown
1 - Codec DAI and PCM configuration
-----------------------------------
Each codec driver must have a struct snd_soc_codec_dai to define it's DAI and
-PCM's capablities and operations. This struct is exported so that it can be
+PCM's capabilities and operations. This struct is exported so that it can be
registered with the core by your machine driver.
e.g.
@@ -67,7 +67,7 @@ EXPORT_SYMBOL_GPL(wm8731_dai);
2 - Codec control IO
--------------------
-The codec can ususally be controlled via an I2C or SPI style interface (AC97
+The codec can usually be controlled via an I2C or SPI style interface (AC97
combines control with data in the DAI). The codec drivers will have to provide
functions to read and write the codec registers along with supplying a register
cache:-
diff --git a/Documentation/sound/alsa/soc/dapm.txt b/Documentation/sound/alsa/soc/dapm.txt
index c11877f5b4a..ab0766fd786 100644
--- a/Documentation/sound/alsa/soc/dapm.txt
+++ b/Documentation/sound/alsa/soc/dapm.txt
@@ -11,7 +11,7 @@ other PM systems.
DAPM is also completely transparent to all user space applications as all power
switching is done within the ASoC core. No code changes or recompiling are
-required for user space applications. DAPM makes power switching descisions based
+required for user space applications. DAPM makes power switching decisions based
upon any audio stream (capture/playback) activity and audio mixer settings
within the device.
@@ -38,7 +38,7 @@ There are 4 power domains within DAPM
Enabled and disabled when stream playback/capture is started and
stopped respectively. e.g. aplay, arecord.
-All DAPM power switching descisons are made automatically by consulting an audio
+All DAPM power switching decisions are made automatically by consulting an audio
routing map of the whole machine. This map is specific to each machine and
consists of the interconnections between every audio component (including
internal codec components). All audio components that effect power are called
diff --git a/Documentation/sound/alsa/soc/overview.txt b/Documentation/sound/alsa/soc/overview.txt
index 753c5cc5984..c47ce953067 100644
--- a/Documentation/sound/alsa/soc/overview.txt
+++ b/Documentation/sound/alsa/soc/overview.txt
@@ -2,18 +2,19 @@ ALSA SoC Layer
==============
The overall project goal of the ALSA System on Chip (ASoC) layer is to provide
-better ALSA support for embedded system on chip procesors (e.g. pxa2xx, au1x00,
+better ALSA support for embedded system-on-chip processors (e.g. pxa2xx, au1x00,
iMX, etc) and portable audio codecs. Currently there is some support in the
kernel for SoC audio, however it has some limitations:-
* Currently, codec drivers are often tightly coupled to the underlying SoC
- cpu. This is not ideal and leads to code duplication i.e. Linux now has 4
+ CPU. This is not ideal and leads to code duplication i.e. Linux now has 4
different wm8731 drivers for 4 different SoC platforms.
- * There is no standard method to signal user initiated audio events.
- e.g. Headphone/Mic insertion, Headphone/Mic detection after an insertion
- event. These are quite common events on portable devices and ofter require
- machine specific code to re route audio, enable amps etc after such an event.
+ * There is no standard method to signal user initiated audio events (e.g.
+ Headphone/Mic insertion, Headphone/Mic detection after an insertion
+ event). These are quite common events on portable devices and often require
+ machine specific code to re-route audio, enable amps, etc., after such an
+ event.
* Current drivers tend to power up the entire codec when playing
(or recording) audio. This is fine for a PC, but tends to waste a lot of
@@ -44,7 +45,7 @@ features :-
signals the codec when to change power states.
* Machine specific controls: Allow machines to add controls to the sound card
- e.g. volume control for speaker amp.
+ (e.g. volume control for speaker amp).
To achieve all this, ASoC basically splits an embedded audio system into 3
components :-
@@ -57,7 +58,7 @@ components :-
interface drivers (e.g. I2S, AC97, PCM) for that platform.
* Machine driver: The machine driver handles any machine specific controls and
- audio events. i.e. turing on an amp at start of playback.
+ audio events (e.g. turning on an amp at start of playback).
Documentation
diff --git a/Documentation/sound/alsa/soc/platform.txt b/Documentation/sound/alsa/soc/platform.txt
index e95b16d5a53..d4678b4dc6c 100644
--- a/Documentation/sound/alsa/soc/platform.txt
+++ b/Documentation/sound/alsa/soc/platform.txt
@@ -20,7 +20,7 @@ struct snd_soc_ops {
int (*trigger)(struct snd_pcm_substream *, int);
};
-The platform driver exports it's DMA functionailty via struct snd_soc_platform:-
+The platform driver exports its DMA functionality via struct snd_soc_platform:-
struct snd_soc_platform {
char *name;
diff --git a/Documentation/sound/alsa/soc/pops_clicks.txt b/Documentation/sound/alsa/soc/pops_clicks.txt
index 2cf7ee5b3d7..3371bd9d7cf 100644
--- a/Documentation/sound/alsa/soc/pops_clicks.txt
+++ b/Documentation/sound/alsa/soc/pops_clicks.txt
@@ -2,7 +2,7 @@ Audio Pops and Clicks
=====================
Pops and clicks are unwanted audio artifacts caused by the powering up and down
-of components within the audio subsystem. This is noticable on PC's when an
+of components within the audio subsystem. This is noticeable on PCs when an
audio module is either loaded or unloaded (at module load time the sound card is
powered up and causes a popping noise on the speakers).
@@ -16,7 +16,7 @@ Minimising Playback Pops and Clicks
===================================
Playback pops in portable audio subsystems cannot be completely eliminated atm,
-however future audio codec hardware will have better pop and click supression.
+however future audio codec hardware will have better pop and click suppression.
Pops can be reduced within playback by powering the audio components in a
specific order. This order is different for startup and shutdown and follows
some basic rules:-
@@ -33,7 +33,7 @@ Minimising Capture Pops and Clicks
==================================
Capture artifacts are somewhat easier to get rid as we can delay activating the
-ADC until all the pops have occured. This follows similar power rules to
+ADC until all the pops have occurred. This follows similar power rules to
playback in that components are powered in a sequence depending upon stream
startup or shutdown.
diff --git a/Documentation/sound/oss/es1371 b/Documentation/sound/oss/es1371
deleted file mode 100644
index c3151266771..00000000000
--- a/Documentation/sound/oss/es1371
+++ /dev/null
@@ -1,64 +0,0 @@
-/proc/sound, /dev/sndstat
--------------------------
-
-/proc/sound and /dev/sndstat is not supported by the
-driver. To find out whether the driver succeeded loading,
-check the kernel log (dmesg).
-
-
-ALaw/uLaw sample formats
-------------------------
-
-This driver does not support the ALaw/uLaw sample formats.
-ALaw is the default mode when opening a sound device
-using OSS/Free. The reason for the lack of support is
-that the hardware does not support these formats, and adding
-conversion routines to the kernel would lead to very ugly
-code in the presence of the mmap interface to the driver.
-And since xquake uses mmap, mmap is considered important :-)
-and no sane application uses ALaw/uLaw these days anyway.
-In short, playing a Sun .au file as follows:
-
-cat my_file.au > /dev/dsp
-
-does not work. Instead, you may use the play script from
-Chris Bagwell's sox-12.14 package (available from the URL
-below) to play many different audio file formats.
-The script automatically determines the audio format
-and does do audio conversions if necessary.
-http://home.sprynet.com/sprynet/cbagwell/projects.html
-
-
-Blocking vs. nonblocking IO
----------------------------
-
-Unlike OSS/Free this driver honours the O_NONBLOCK file flag
-not only during open, but also during read and write.
-This is an effort to make the sound driver interface more
-regular. Timidity has problems with this; a patch
-is available from http://www.ife.ee.ethz.ch/~sailer/linux/pciaudio.html.
-(Timidity patched will also run on OSS/Free).
-
-
-MIDI UART
----------
-
-The driver supports a simple MIDI UART interface, with
-no ioctl's supported.
-
-
-MIDI synthesizer
-----------------
-
-This soundcard does not have any hardware MIDI synthesizer;
-MIDI synthesis has to be done in software. To allow this
-the driver/soundcard supports two PCM (/dev/dsp) interfaces.
-
-There is a freely available software package that allows
-MIDI file playback on this soundcard called Timidity.
-See http://www.cgs.fi/~tt/timidity/.
-
-
-
-Thomas Sailer
-t.sailer@alumni.ethz.ch
diff --git a/Documentation/spi/pxa2xx b/Documentation/spi/pxa2xx
index 215e3b8e726..f3853cc37bd 100644
--- a/Documentation/spi/pxa2xx
+++ b/Documentation/spi/pxa2xx
@@ -1,4 +1,4 @@
-PXA2xx SPI on SSP driver HOWTO
+PXA2xx SPI on SSP driver HOWTO
===================================================
This a mini howto on the pxa2xx_spi driver. The driver turns a PXA2xx
synchronous serial port into a SPI master controller
diff --git a/Documentation/spi/spi-summary b/Documentation/spi/spi-summary
index 76ea6c837be..8861e47e5a2 100644
--- a/Documentation/spi/spi-summary
+++ b/Documentation/spi/spi-summary
@@ -156,21 +156,29 @@ using the driver model to connect controller and protocol drivers using
device tables provided by board specific initialization code. SPI
shows up in sysfs in several locations:
+ /sys/devices/.../CTLR ... physical node for a given SPI controller
+
/sys/devices/.../CTLR/spiB.C ... spi_device on bus "B",
chipselect C, accessed through CTLR.
+ /sys/bus/spi/devices/spiB.C ... symlink to that physical
+ .../CTLR/spiB.C device
+
/sys/devices/.../CTLR/spiB.C/modalias ... identifies the driver
that should be used with this device (for hotplug/coldplug)
- /sys/bus/spi/devices/spiB.C ... symlink to the physical
- spiB.C device
-
/sys/bus/spi/drivers/D ... driver for one or more spi*.* devices
- /sys/class/spi_master/spiB ... class device for the controller
- managing bus "B". All the spiB.* devices share the same
+ /sys/class/spi_master/spiB ... symlink (or actual device node) to
+ a logical node which could hold class related state for the
+ controller managing bus "B". All spiB.* devices share one
physical SPI bus segment, with SCLK, MOSI, and MISO.
+Note that the actual location of the controller's class state depends
+on whether you enabled CONFIG_SYSFS_DEPRECATED or not. At this time,
+the only class-specific state is the bus number ("B" in "spiB"), so
+those /sys/class entries are only useful to quickly identify busses.
+
How does board-specific init code declare SPI devices?
------------------------------------------------------
@@ -337,7 +345,8 @@ SPI protocol drivers somewhat resemble platform device drivers:
The driver core will autmatically attempt to bind this driver to any SPI
device whose board_info gave a modalias of "CHIP". Your probe() code
-might look like this unless you're creating a class_device:
+might look like this unless you're creating a device which is managing
+a bus (appearing under /sys/class/spi_master).
static int __devinit CHIP_probe(struct spi_device *spi)
{
@@ -442,7 +451,7 @@ An SPI controller will probably be registered on the platform_bus; write
a driver to bind to the device, whichever bus is involved.
The main task of this type of driver is to provide an "spi_master".
-Use spi_alloc_master() to allocate the master, and class_get_devdata()
+Use spi_alloc_master() to allocate the master, and spi_master_get_devdata()
to get the driver-private data allocated for that device.
struct spi_master *master;
@@ -452,7 +461,7 @@ to get the driver-private data allocated for that device.
if (!master)
return -ENODEV;
- c = class_get_devdata(&master->cdev);
+ c = spi_master_get_devdata(master);
The driver will initialize the fields of that spi_master, including the
bus number (maybe the same as the platform device ID) and three methods
diff --git a/Documentation/spi/spidev_test.c b/Documentation/spi/spidev_test.c
index 218e8621529..cf0e3ce0d52 100644
--- a/Documentation/spi/spidev_test.c
+++ b/Documentation/spi/spidev_test.c
@@ -29,7 +29,7 @@ static void pabort(const char *s)
abort();
}
-static char *device = "/dev/spidev1.1";
+static const char *device = "/dev/spidev1.1";
static uint8_t mode;
static uint8_t bits = 8;
static uint32_t speed = 500000;
@@ -69,7 +69,7 @@ static void transfer(int fd)
puts("");
}
-void print_usage(char *prog)
+void print_usage(const char *prog)
{
printf("Usage: %s [-DsbdlHOLC3]\n", prog);
puts(" -D --device device to use (default /dev/spidev1.1)\n"
@@ -88,7 +88,7 @@ void print_usage(char *prog)
void parse_opts(int argc, char *argv[])
{
while (1) {
- static struct option lopts[] = {
+ static const struct option lopts[] = {
{ "device", 1, 0, 'D' },
{ "speed", 1, 0, 's' },
{ "delay", 1, 0, 'd' },
diff --git a/Documentation/sysctl/00-INDEX b/Documentation/sysctl/00-INDEX
new file mode 100644
index 00000000000..a20a9066dc4
--- /dev/null
+++ b/Documentation/sysctl/00-INDEX
@@ -0,0 +1,16 @@
+00-INDEX
+ - this file.
+README
+ - general information about /proc/sys/ sysctl files.
+abi.txt
+ - documentation for /proc/sys/abi/*.
+ctl_unnumbered.txt
+ - explanation of why one should not add new binary sysctl numbers.
+fs.txt
+ - documentation for /proc/sys/fs/*.
+kernel.txt
+ - documentation for /proc/sys/kernel/*.
+sunrpc.txt
+ - documentation for /proc/sys/sunrpc/*.
+vm.txt
+ - documentation for /proc/sys/vm/*.
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 111fd28727e..8984a539627 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -320,6 +320,14 @@ kernel. This value defaults to SHMMAX.
==============================================================
+softlockup_thresh:
+
+This value can be used to lower the softlockup tolerance
+threshold. The default threshold is 10s. If a cpu is locked up
+for 10s, the kernel complains. Valid values are 1-60s.
+
+==============================================================
+
tainted:
Non-zero if the kernel has been tainted. Numeric values, which
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index a0ccc5b6026..b89570c3043 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -31,6 +31,7 @@ Currently, these files are in /proc/sys/vm:
- min_unmapped_ratio
- min_slab_ratio
- panic_on_oom
+- oom_kill_allocating_task
- mmap_min_addr
- numa_zonelist_order
@@ -111,6 +112,12 @@ of kilobytes free. The VM uses this number to compute a pages_min
value for each lowmem zone in the system. Each lowmem zone gets
a number of reserved free pages based proportionally on its size.
+Some minimal amount of memory is needed to satisfy PF_MEMALLOC
+allocations; if you set this to lower than 1024KB, your system will
+become subtly broken, and prone to deadlock under high loads.
+
+Setting this too high will OOM your machine instantly.
+
==============================================================
percpu_pagelist_fraction
@@ -220,6 +227,27 @@ The default value is 0.
1 and 2 are for failover of clustering. Please select either
according to your policy of failover.
+==============================================================
+
+oom_kill_allocating_task
+
+This enables or disables killing the OOM-triggering task in
+out-of-memory situations.
+
+If this is set to zero, the OOM killer will scan through the entire
+tasklist and select a task based on heuristics to kill. This normally
+selects a rogue memory-hogging task that frees up a large amount of
+memory when killed.
+
+If this is set to non-zero, the OOM killer simply kills the task that
+triggered the out-of-memory condition. This avoids the expensive
+tasklist scan.
+
+If panic_on_oom is selected, it takes precedence over whatever value
+is used in oom_kill_allocating_task.
+
+The default value is 0.
+
==============================================================
mmap_min_addr
diff --git a/Documentation/telephony/00-INDEX b/Documentation/telephony/00-INDEX
new file mode 100644
index 00000000000..4ffe0ed5b6f
--- /dev/null
+++ b/Documentation/telephony/00-INDEX
@@ -0,0 +1,4 @@
+00-INDEX
+ - this file.
+ixj.txt
+ - document describing the Quicknet drivers.
diff --git a/Documentation/thinkpad-acpi.txt b/Documentation/thinkpad-acpi.txt
index 60953d6c919..ec499265dec 100644
--- a/Documentation/thinkpad-acpi.txt
+++ b/Documentation/thinkpad-acpi.txt
@@ -105,10 +105,15 @@ The version of thinkpad-acpi's sysfs interface is exported by the driver
as a driver attribute (see below).
Sysfs driver attributes are on the driver's sysfs attribute space,
-for 2.6.20 this is /sys/bus/platform/drivers/thinkpad_acpi/.
+for 2.6.23 this is /sys/bus/platform/drivers/thinkpad_acpi/ and
+/sys/bus/platform/drivers/thinkpad_hwmon/
-Sysfs device attributes are on the driver's sysfs attribute space,
-for 2.6.20 this is /sys/devices/platform/thinkpad_acpi/.
+Sysfs device attributes are on the thinkpad_acpi device sysfs attribute
+space, for 2.6.23 this is /sys/devices/platform/thinkpad_acpi/.
+
+Sysfs device attributes for the sensors and fan are on the
+thinkpad_hwmon device's sysfs attribute space, but you should locate it
+by looking for a hwmon device with the name attribute of "thinkpad".
Driver version
--------------
@@ -766,7 +771,7 @@ Temperature sensors
-------------------
procfs: /proc/acpi/ibm/thermal
-sysfs device attributes: (hwmon) temp*_input
+sysfs device attributes: (hwmon "thinkpad") temp*_input
Most ThinkPads include six or more separate temperature sensors but only
expose the CPU temperature through the standard ACPI methods. This
@@ -989,7 +994,9 @@ Fan control and monitoring: fan speed, fan enable/disable
---------------------------------------------------------
procfs: /proc/acpi/ibm/fan
-sysfs device attributes: (hwmon) fan_input, pwm1, pwm1_enable
+sysfs device attributes: (hwmon "thinkpad") fan1_input, pwm1,
+ pwm1_enable
+sysfs hwmon driver attributes: fan_watchdog
NOTE NOTE NOTE: fan control operations are disabled by default for
safety reasons. To enable them, the module parameter "fan_control=1"
@@ -1028,7 +1035,7 @@ enable it if necessary to avoid overheating.
An enabled fan in level "auto" may stop spinning if the EC decides the
ThinkPad is cool enough and doesn't need the extra airflow. This is
-normal, and the EC will spin the fan up if the varios thermal readings
+normal, and the EC will spin the fan up if the various thermal readings
rise too much.
On the X40, this seems to depend on the CPU and HDD temperatures.
@@ -1131,7 +1138,7 @@ hwmon device attribute fan1_input:
which can take up to two minutes. May return rubbish on older
ThinkPads.
-driver attribute fan_watchdog:
+hwmon driver attribute fan_watchdog:
Fan safety watchdog timer interval, in seconds. Minimum is
1 second, maximum is 120 seconds. 0 disables the watchdog.
@@ -1196,7 +1203,7 @@ for example:
Enabling debugging output
-------------------------
-The module takes a debug paramater which can be used to selectively
+The module takes a debug parameter which can be used to selectively
enable various classes of debugging output, for example:
modprobe ibm_acpi debug=0xffff
@@ -1233,3 +1240,9 @@ Sysfs interface changelog:
layer, the radio switch generates input event EV_RADIO,
and the driver enables hot key handling by default in
the firmware.
+
+0x020000: ABI fix: added a separate hwmon platform device and
+ driver, which must be located by name (thinkpad)
+ and the hwmon class for libsensors4 (lm-sensors 3)
+ compatibility. Moved all hwmon attributes to this
+ new platform device.
diff --git a/Documentation/usb/usb-serial.txt b/Documentation/usb/usb-serial.txt
index 4e0b62b8566..8b077e43eee 100644
--- a/Documentation/usb/usb-serial.txt
+++ b/Documentation/usb/usb-serial.txt
@@ -338,7 +338,7 @@ MCT USB Single Port Serial Adapter U232
This driver is for the MCT USB-RS232 Converter (25 pin, Model No.
U232-P25) from Magic Control Technology Corp. (there is also a 9 pin
Model No. U232-P9). More information about this device can be found at
- the manufacture's web-site: http://www.mct.com.tw.
+ the manufacturer's web-site: http://www.mct.com.tw.
The driver is generally working, though it still needs some more testing.
It is derived from the Belkin USB Serial Adapter F5U103 driver and its
diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX
new file mode 100644
index 00000000000..2131b00b63f
--- /dev/null
+++ b/Documentation/vm/00-INDEX
@@ -0,0 +1,20 @@
+00-INDEX
+ - this file.
+balance
+ - various information on memory balancing.
+hugetlbpage.txt
+ - a brief summary of hugetlbpage support in the Linux kernel.
+locking
+ - info on how locking and synchronization is done in the Linux vm code.
+numa
+ - information about NUMA specific code in the Linux vm.
+numa_memory_policy.txt
+ - documentation of concepts and APIs of the 2.6 memory policy support.
+overcommit-accounting
+ - description of the Linux kernel's overcommit handling modes.
+page_migration
+ - description of page migration in NUMA systems.
+slabinfo.c
+ - source code for a tool to get reports about slabs.
+slub.txt
+ - a short user's guide for SLUB.
diff --git a/Documentation/vm/numa_memory_policy.txt b/Documentation/vm/numa_memory_policy.txt
index 8242f52d0f2..dd498649799 100644
--- a/Documentation/vm/numa_memory_policy.txt
+++ b/Documentation/vm/numa_memory_policy.txt
@@ -302,31 +302,30 @@ MEMORY POLICIES AND CPUSETS
Memory policies work within cpusets as described above. For memory policies
that require a node or set of nodes, the nodes are restricted to the set of
-nodes whose memories are allowed by the cpuset constraints. If the
-intersection of the set of nodes specified for the policy and the set of nodes
-allowed by the cpuset is the empty set, the policy is considered invalid and
-cannot be installed.
+nodes whose memories are allowed by the cpuset constraints. If the nodemask
+specified for the policy contains nodes that are not allowed by the cpuset, or
+the intersection of the set of nodes specified for the policy and the set of
+nodes with memory is the empty set, the policy is considered invalid
+and cannot be installed.
The interaction of memory policies and cpusets can be problematic for a
couple of reasons:
-1) the memory policy APIs take physical node id's as arguments. However, the
- memory policy APIs do not provide a way to determine what nodes are valid
- in the context where the application is running. An application MAY consult
- the cpuset file system [directly or via an out of tree, and not generally
- available, libcpuset API] to obtain this information, but then the
- application must be aware that it is running in a cpuset and use what are
- intended primarily as administrative APIs.
-
- However, as long as the policy specifies at least one node that is valid
- in the controlling cpuset, the policy can be used.
+1) the memory policy APIs take physical node id's as arguments. As mentioned
+ above, it is illegal to specify nodes that are not allowed in the cpuset.
+ The application must query the allowed nodes using the get_mempolicy()
+ API with the MPOL_F_MEMS_ALLOWED flag to determine the allowed nodes and
+ restrict itself to those nodes. However, the resources available to a
+ cpuset can be changed by the system administrator, or a workload manager
+ application, at any time. So, a task may still get errors attempting to
+ specify policy nodes, and must query the allowed memories again.
2) when tasks in two cpusets share access to a memory region, such as shared
memory segments created by shmget() or mmap() with the MAP_ANONYMOUS and
MAP_SHARED flags, and any of the tasks install shared policy on the region,
only nodes whose memories are allowed in both cpusets may be used in the
- policies. Again, obtaining this information requires "stepping outside"
- the memory policy APIs, as well as knowing in what cpusets other task might
- be attaching to the shared region, to use the cpuset information.
+ policies. Obtaining this information requires "stepping outside" the
+ memory policy APIs to use the cpuset information and requires that one
+ know in what cpusets other tasks might be attaching to the shared region.
Furthermore, if the cpusets' allowed memory sets are disjoint, "local"
allocation is the only valid policy.
diff --git a/Documentation/vm/slabinfo.c b/Documentation/vm/slabinfo.c
index 1af7bd5a218..7047696c47a 100644
--- a/Documentation/vm/slabinfo.c
+++ b/Documentation/vm/slabinfo.c
@@ -11,6 +11,7 @@
#include <stdlib.h>
#include <sys/types.h>
#include <dirent.h>
+#include <strings.h>
#include <string.h>
#include <unistd.h>
#include <stdarg.h>
@@ -84,7 +85,7 @@ void fatal(const char *x, ...)
va_start(ap, x);
vfprintf(stderr, x, ap);
va_end(ap);
- exit(1);
+ exit(EXIT_FAILURE);
}
void usage(void)
@@ -119,14 +120,14 @@ void usage(void)
);
}
-unsigned long read_obj(char *name)
+unsigned long read_obj(const char *name)
{
FILE *f = fopen(name, "r");
if (!f)
buffer[0] = 0;
else {
- if (!fgets(buffer,sizeof(buffer), f))
+ if (!fgets(buffer, sizeof(buffer), f))
buffer[0] = 0;
fclose(f);
if (buffer[strlen(buffer)] == '\n')
@@ -139,7 +140,7 @@ unsigned long read_obj(char *name)
/*
* Get the contents of an attribute
*/
-unsigned long get_obj(char *name)
+unsigned long get_obj(const char *name)
{
if (!read_obj(name))
return 0;
@@ -147,7 +148,7 @@ unsigned long get_obj(char *name)
return atol(buffer);
}
-unsigned long get_obj_and_str(char *name, char **x)
+unsigned long get_obj_and_str(const char *name, char **x)
{
unsigned long result = 0;
char *p;
@@ -166,12 +167,12 @@ unsigned long get_obj_and_str(char *name, char **x)
return result;
}
-void set_obj(struct slabinfo *s, char *name, int n)
+void set_obj(struct slabinfo *s, const char *name, int n)
{
char x[100];
FILE *f;
- sprintf(x, "%s/%s", s->name, name);
+ snprintf(x, 100, "%s/%s", s->name, name);
f = fopen(x, "w");
if (!f)
fatal("Cannot write to %s\n", x);
@@ -180,13 +181,13 @@ void set_obj(struct slabinfo *s, char *name, int n)
fclose(f);
}
-unsigned long read_slab_obj(struct slabinfo *s, char *name)
+unsigned long read_slab_obj(struct slabinfo *s, const char *name)
{
char x[100];
FILE *f;
- int l;
+ size_t l;
- sprintf(x, "%s/%s", s->name, name);
+ snprintf(x, 100, "%s/%s", s->name, name);
f = fopen(x, "r");
if (!f) {
buffer[0] = 0;
@@ -453,7 +454,7 @@ void slabcache(struct slabinfo *s)
return;
store_size(size_str, slab_size(s));
- sprintf(dist_str,"%lu/%lu/%d", s->slabs, s->partial, s->cpu_slabs);
+ snprintf(dist_str, 40, "%lu/%lu/%d", s->slabs, s->partial, s->cpu_slabs);
if (!line++)
first_line();
@@ -1062,6 +1063,7 @@ void read_slab_dir(void)
slab->partial = get_obj("partial");
slab->partial = get_obj_and_str("partial", &t);
decode_numa_list(slab->numa_partial, t);
+ free(t);
slab->poison = get_obj("poison");
slab->reclaim_account = get_obj("reclaim_account");
slab->red_zone = get_obj("red_zone");
@@ -1069,6 +1071,7 @@ void read_slab_dir(void)
slab->slab_size = get_obj("slab_size");
slab->slabs = get_obj_and_str("slabs", &t);
decode_numa_list(slab->numa, t);
+ free(t);
slab->store_user = get_obj("store_user");
slab->trace = get_obj("trace");
chdir("..");
@@ -1148,7 +1151,7 @@ int main(int argc, char *argv[])
while ((c = getopt_long(argc, argv, "ad::efhil1noprstvzTS",
opts, NULL)) != -1)
- switch(c) {
+ switch (c) {
case '1':
show_single_ref = 1;
break;
diff --git a/Documentation/w1/00-INDEX b/Documentation/w1/00-INDEX
new file mode 100644
index 00000000000..5270cf4cb10
--- /dev/null
+++ b/Documentation/w1/00-INDEX
@@ -0,0 +1,8 @@
+00-INDEX
+ - This file
+masters/
+ - Individual chips providing 1-wire busses.
+w1.generic
+ - The 1-wire (w1) bus
+w1.netlink
+ - Userspace communication protocol over connector [1].
diff --git a/Documentation/w1/masters/00-INDEX b/Documentation/w1/masters/00-INDEX
new file mode 100644
index 00000000000..752613c4cea
--- /dev/null
+++ b/Documentation/w1/masters/00-INDEX
@@ -0,0 +1,6 @@
+00-INDEX
+ - This file
+ds2482
+ - The Maxim/Dallas Semiconductor DS2482 provides 1-wire busses.
+ds2490
+ - The Maxim/Dallas Semiconductor DS2490 builds USB <-> W1 bridges.
diff --git a/Documentation/w1/masters/ds2482 b/Documentation/w1/masters/ds2482
index c5d5478d90b..9210d6fa502 100644
--- a/Documentation/w1/masters/ds2482
+++ b/Documentation/w1/masters/ds2482
@@ -15,7 +15,7 @@ Author: Ben Gardner <bgardner@wabtec.com>
Description
-----------
-The Maixm/Dallas Semiconductor DS2482 is a I2C device that provides
+The Maxim/Dallas Semiconductor DS2482 is an I2C device that provides
one (DS2482-100) or eight (DS2482-800) 1-wire busses.
diff --git a/Documentation/w1/masters/ds2490 b/Documentation/w1/masters/ds2490
index 44a4918bd7f..239f9ae0184 100644
--- a/Documentation/w1/masters/ds2490
+++ b/Documentation/w1/masters/ds2490
@@ -10,7 +10,7 @@ Author: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
Description
-----------
-The Maixm/Dallas Semiconductor DS2490 is a chip
+The Maxim/Dallas Semiconductor DS2490 is a chip
which allows to build USB <-> W1 bridges.
DS9490(R) is a USB <-> W1 bus master device
diff --git a/Documentation/x86_64/mm.txt b/Documentation/x86_64/mm.txt
index f42798ed1c5..b89b6d2bebf 100644
--- a/Documentation/x86_64/mm.txt
+++ b/Documentation/x86_64/mm.txt
@@ -9,6 +9,7 @@ ffff800000000000 - ffff80ffffffffff (=40 bits) guard hole
ffff810000000000 - ffffc0ffffffffff (=46 bits) direct mapping of all phys. memory
ffffc10000000000 - ffffc1ffffffffff (=40 bits) hole
ffffc20000000000 - ffffe1ffffffffff (=45 bits) vmalloc/ioremap space
+ffffe20000000000 - ffffe2ffffffffff (=40 bits) virtual memory map (1TB)
... unused hole ...
ffffffff80000000 - ffffffff82800000 (=40 MB) kernel text mapping, from phys 0
... unused hole ...
diff --git a/Documentation/xterm-linux.xpm b/Documentation/xterm-linux.xpm
deleted file mode 100644
index f469c1a18e6..00000000000
--- a/Documentation/xterm-linux.xpm
+++ /dev/null
@@ -1,61 +0,0 @@
-/* XPM */
-/*****************************************************************************/
-/** This pixmap was made by Torsten Poulin - 1996 - torsten@diku.dk **/
-/** It was made by combining xterm-blank.xpm with **/
-/** the wonderfully cute Linux penguin mascot by Larry Ewing. **/
-/** I had to change Larry's penguin a little to make it fit. **/
-/** xterm-blank.xpm contained the following comment: **/
-/** This pixmap is kindly offered by Ion Cionca - 1992 - **/
-/** Swiss Federal Institute of Technology **/
-/** Central Computing Service **/
-/*****************************************************************************/
-static char * image_name [] = {
-/**/
-"64 38 8 1",
-/**/
-" s mask c none",
-". c gray70",
-"X c gray85",
-"o c gray50",
-"O c yellow",
-"+ c darkolivegreen",
-"@ c white",
-"# c black",
-" ###### ",
-" ######## ",
-" ########## ........................... ",
-" ########### .XXXXXXXXXXXXXXXXXXXXXXXXXXX. ",
-" ########### .XXXXXXXXXXXXXXXXXXXXXXXXXXXXXoo ",
-" #@@@#@@@### .XX+++++++++++++++++++++++XXXXoo ",
-" #@#@#@#@### .XX++++++++++++++++++++++++XXXooo ",
-" #@#####@### .XX++@@+@++@+@@@@++@+++++++XXXooo ",
-" ###OOO######.XX++++++++++++++++++++++++XXXoooo ",
-" ##OOOOOO####.XX++@@@@+@@+@@@+++++++++++XXXoooo ",
-" #O#OOO#O####.XX++++++++++++++++++++++++XXXooooo ",
-" ##O###OO####.XX++@@@@@@@@@@+@@@@@++++++XXXooooo ",
-" ###OOOO@#####XX++++++++++++++++++++++++XXXooooo ",
-" ##@###@@@@####XX++@@@+@@@@+@@++@@@++++++XXXooooo ",
-" #@@@@@@@@@@####X++++++++++++++++++++++++XXXooooo ",
-" ##@@@@@@@@@@#####++@+++++++++++++++++++++XXXooooo ",
-" ###@@@@@@@@@@######+++++++++++++++++++++++XXXooooo ",
-" ####@@@@@@@@@@@#####+@@@@+@+@@@+@++++++++++XXXooooo ",
-" ###@@@@@@@@@@@@######++++++++++++++++++++++XXXooooo ",
-" ##@@@@@@@@@@@@@@#####@+@@@@++++++++++++++++XXXooooo ",
-" ###@@@@@@@@@@@@@@######++++++++++++++++++++XXXXoooo ",
-" ###@@@@@@@@@@@@@@######XXXXXXXXXXXXXXXXXXXXXXXXooo ",
-" ###@@@@@@@@@@@@@@@######XXXXXXXXXXXXXXXXXXXXXXXooo ",
-" ###@@@@@@@@@@@@@@@@#####ooooooooooooooooooooooo...oo ",
-" ###@@@@@@@@@@@@@@@######.........................ooo ",
-" #OO##@@@@@@@@@@@@@#######oooooooooooooooooooooooooooo ",
-" #OOO##@@@@@@@@@@@#OO####O#XXXXXXXXXXXXXXXXXXXXXXXoooo.. .. ",
-" ###OOOOO##@@@@@@@@@@#OOO#OOO#XXXXXXXXXXXXXX#######XXoooo . .",
-" #OOOOOOOO###@@@@@@@@@#OOOOOOO#ooooooooooooooooooooXXXooo . ",
-" #OOOOOOOOO###@@@@@@@@@#OOOOOOO##XXXXXXXXXXXXXXXXXooooo . ",
-" #OOOOOOOOO#@@@@@@@@###OOOOOOOOO#XXXXXXXXXXXXXXXoo oooooo ",
-" #OOOOOOOOO#@@@@@@@####OOOOOOOO#@@@@@@@@@@@XXXXXoo ooooo...o ",
-" #OOOOOOOOOOO###########OOOOOO##XXXXXXXXXXXXXXXXoo ooXXXoo..o ",
-" ##OOOOOOOOO###########OOOO##@@@@@@@@@@@@@XXXXoo oXXXXX..o ",
-" ###OOOO### oXX##OOO#XXXXXXXXXXXXXXXXXXoo o.....oo ",
-" #### oooo####oooooooooooooooooooo ooooooo ",
-" ",
-" "};